diff --git a/.circleci/config.yml b/.circleci/config.yml deleted file mode 100644 index 39aea3517a..0000000000 --- a/.circleci/config.yml +++ /dev/null @@ -1,122 +0,0 @@ -docs_deploy: &docs - docker: - - image: node:8.10.0 - working_directory: /tmp/gh-pages - steps: - - run: - name: Check whether this is the original repo - command: | - if [[ "$CIRCLE_PROJECT_USERNAME" != "nipype" ]]; then - echo "Not in nipype/pydra - skipping docs deploy." - circleci step halt - fi - - add_ssh_keys: - fingerprints: - - "21:5b:2f:41:9e:e6:7a:47:e7:90:37:62:b2:ac:e8:a0" - - run: - name: Install gh-pages tool - command: | - npm install -g --silent gh-pages@3.0.0 - - checkout - - run: - name: Set git settings - command: | - git config user.email "nipype@mit.edu" - git config user.name "Nipype Bot" - - attach_workspace: - at: docs/_build - - run: - name: Disable jekyll builds - command: touch docs/_build/html/.nojekyll - - run: - name: Deploy docs to gh-pages branch - command: gh-pages --no-history --dotfiles --message "doc(update) [skip ci]" --dist docs/_build/html - -version: 2.1 -jobs: - - build_docs: - docker: - - image: cimg/python:3.11 - working_directory: /tmp/gh-pages - environment: - - FSLOUTPUTTYPE: NIFTI - - SUBJECTS_DIR: /tmp/subjects - steps: - - checkout - - run: - name: Install deps - command: | - pip install --upgrade pip - pip install --no-cache-dir -r docs/requirements.txt - pip install --no-cache-dir .[doc] - - run: - name: Build only this commit - command: make -C docs SPHINXOPTS="-W" BUILDDIR="_build/no_version_html" SPHINX_APIDOC_OPTIONS="members,undoc-members,show-inheritance,noindex" html - - store_artifacts: - path: ./docs/_build/no_version_html - - run: - name: Stop or generate versioned docs? - command: | - set +e - force_versioned="$( git log --format=oneline -n 1 $CIRCLE_SHA1 | grep -i -E '\[docs?[ _]?versions?\]' )" - set -e - if [[ "x${CIRCLE_TAG}" = "x" && "${CIRCLE_BRANCH}" != "master" && "x${force_versioned}" = "x" ]]; then - echo "Not a tag or master branch - skipping versioned docs." 
- circleci step halt - fi - - restore_cache: - keys: - - docs-v1-{{ .Branch }}-{{ .Revision }} - - docs-v1-{{ .Branch }}- - - docs-v1-master - - docs-v1- - paths: - - ./docs/_build/_html - - run: - name: Generate Versioned Docs - command: make -f ./docs/Makefile versioned CURBRANCH=${CIRCLE_TAG:-$CIRCLE_BRANCH} - - save_cache: - key: docs-v1-{{ .Branch }}-{{ .Revision }} - paths: - - ./docs/_build/_html - - persist_to_workspace: - root: docs/_build - paths: html - - store_artifacts: - path: ./docs/_build/html - - deploy_docs_tag: - <<: *docs - - deploy_docs_master: - <<: *docs - - -workflows: - version: 2 - build_deploy: - jobs: - - build_docs: - filters: - branches: - ignore: - - /tests?\/.*/ - tags: - only: /.*/ - - deploy_docs_master: - requires: - - build_docs - filters: - branches: - only: /master/ - tags: - ignore: /.*/ - - deploy_docs_tag: - requires: - - build_docs - filters: - branches: - ignore: /.*/ - tags: - only: /.*/ diff --git a/.github/workflows/ci-cd.yml b/.github/workflows/ci-cd.yml new file mode 100644 index 0000000000..063a0890f2 --- /dev/null +++ b/.github/workflows/ci-cd.yml @@ -0,0 +1,384 @@ +# This workflows will upload a Python Package using Twine when a release is created +# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries + +name: CI/CD + +on: + release: + types: [published] + push: + branches: + - master + pull_request: + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +permissions: + contents: read + +jobs: + + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + - uses: actions/setup-python@v5 + with: + python-version: 3 + - run: pip install --upgrade build twine + - run: python -m build + - run: twine check dist/* + - uses: actions/upload-artifact@v4 + with: + name: dist + path: dist/ + - name: Build archive + run: | + git clean -fxd + mkdir archive + git archive -o archive/pydra.zip HEAD + - uses: actions/upload-artifact@v4 + with: + name: archive + path: archive/ + + test: + needs: ['build'] + strategy: + matrix: + os: [ubuntu-latest, macos-latest, windows-latest] + python-version: ['3.11', '3.12', '3.13'] + fail-fast: false + runs-on: ${{ matrix.os }} + steps: + - name: Fetch repository + uses: actions/checkout@v4 + - name: Fetch tags + run: git fetch --prune --unshallow + - name: Set up Python ${{ matrix.python-version }} on ${{ matrix.os }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - name: Update pip + run: python -m pip install --upgrade pip + - name: Install Pydra + run: pip install .[test] + - name: Print version + run: python -c "import pydra.engine; print(pydra.utils.__version__)" + - name: Disable etelemetry + run: echo "NO_ET=TRUE" >> $GITHUB_ENV + - name: Pytest + run: | + pytest -vs -n auto pydra --doctest-modules --import-mode=importlib --cov pydra --cov-config .coveragerc --cov-report xml:cov.xml --rootdir pydra + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v2 + with: + fail_ci_if_error: true + token: ${{ secrets.CODECOV_TOKEN }} + + test-singularity: + needs: ['build'] + runs-on: ubuntu-22.04 + strategy: + matrix: + python-version: ['3.11', '3.12', '3.13'] + fail-fast: False + steps: + - name: Set env + run: | + echo "RELEASE_VERSION=v3.7.1" >> $GITHUB_ENV + echo "NO_ET=TRUE" >> $GITHUB_ENV + - name: Setup Singularity + uses: actions/checkout@v4 + with: + repository: 
hpcng/singularity + ref: 'v3.7.1' + path: 'singularity' + - name: Setup GO + uses: actions/setup-go@v5 + with: + go-version: '^1.13' + - name: Install OS deps + run: | + sudo apt-get update + sudo apt-get install flawfinder squashfs-tools uuid-dev libuuid1 libffi-dev libssl-dev libssl1.1 \ + libarchive-dev libgpgme11-dev libseccomp-dev wget gcc make pkg-config -y + - name: Build + run: | + cd singularity + ./mconfig --without-suid -p /usr/local/ + make -C ./builddir + sudo make -C ./builddir install + cd .. + - name: Echo singularity version + run: | + echo ${{ github.ref }} + singularity --version + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - name: Update build tools + run: python -m pip install --upgrade pip + - name: Checkout Pydra repo + uses: actions/checkout@v4 + with: + repository: ${{ github.repository }} + - name: Fetch tags + run: git fetch --prune --unshallow + - name: Install pydra (test) + run: pip install -e ".[test]" + - name: Pytest + run: pytest -vs --import-mode=importlib --cov pydra --cov-config .coveragerc --cov-report xml:cov.xml pydra/environments/tests/test_singularity.py pydra/environments/tests/test_environments.py --rootdir pydra + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v2 + with: + fail_ci_if_error: true + token: ${{ secrets.CODECOV_TOKEN }} + + test-slurm: + needs: ['build'] + strategy: + matrix: + python-version: [3.11.5] + fail-fast: false + runs-on: ubuntu-latest + env: + DOCKER_IMAGE: adi611/docker-centos7-slurm:23.02.1 + steps: + - name: Disable etelemetry + run: echo "NO_ET=TRUE" >> $GITHUB_ENV + - uses: actions/checkout@v4 + - name: Fetch tags + run: git fetch --prune --unshallow + - name: Pull docker image + run: | + docker pull $DOCKER_IMAGE + # Have image running in the background + docker run `bash <(curl -s https://codecov.io/env)` -itd -h slurmctl --cap-add sys_admin -d --name slurm -v `pwd`:/pydra -e NO_ET=$NO_ET $DOCKER_IMAGE + - name: Display previous jobs with sacct + run: | + echo "Allowing ports/daemons time to start" && sleep 10 + docker exec slurm bash -c "sacctmgr -i add account none,test Cluster=linux Description='none' Organization='none'" + docker exec slurm bash -c "sacct && sinfo && squeue" 2&> /dev/null + if [ $? 
-ne 0 ]; then + echo "Slurm docker image error" + exit 1 + fi + - name: Setup Python + run: | + docker exec slurm bash -c "echo $NO_ET" + docker exec slurm bash -c "ls -la && echo list top level dir" + docker exec slurm bash -c "ls -la /pydra && echo list pydra dir" + if [[ "${{ matrix.python-version }}" == "3.11.5" ]]; then + docker exec slurm bash -c "CONFIGURE_OPTS=\"-with-openssl=/opt/openssl\" pyenv install -v 3.11.5" + fi + docker exec slurm bash -c "pyenv global ${{ matrix.python-version }}" + docker exec slurm bash -c "pip install --upgrade pip && pip install -e /pydra[test,psij] && python -c 'import pydra.engine; print(pydra.utils.__version__)'" + - name: Run pytest + run: | + docker exec slurm bash -c "pytest /pydra/pydra/workers/tests/test_worker.py --import-mode=importlib --rootdir /pydra/pydra --only-worker=slurm --color=yes -vs --cov pydra --cov-config /pydra/.coveragerc --cov-report xml:/pydra/cov.xml" + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v2 + with: + fail_ci_if_error: true + token: ${{ secrets.CODECOV_TOKEN }} + files: ./cov.xml + + # test-sge: + # needs: ['build'] + # strategy: + # matrix: + # python-version: [3.11.5] + # fail-fast: false + # runs-on: ubuntu-latest + # env: + # VERSION: 1.8.1 + # steps: + # - name: Disable etelemetry + # run: echo "NO_ET=TRUE" >> $GITHUB_ENV + # - uses: actions/checkout@v4 + # - name: Fetch tags + # run: git fetch --prune --unshallow + # - name: Set up Python ${{ matrix.python-version }} + # uses: actions/setup-python@v5 + # with: + # python-version: ${{ matrix.python-version }} + # - name: Install SGE + # run: | + # mkdir -p /sge-build + # cd /sge-build + # wget -c https://github.com/sge-network/sge/archive/refs/tags/v${{ env.VERSION }}.tar.gz + # tar zxvf v${{ env.VERSION }}.tar.gz + # cd v${{ env.VERSION }} + # mkdir -p /opt/sge + # useradd -r -m -U -d /home/sgeadmin -s /bin/bash -c "Docker SGE Admin" sgeadmin + # usermod -a -G sudo sgeadmin + # sh scripts/bootstrap.sh && ./aimk -no-qmon -no-qtcsh && ./aimk -man + # echo Y | ./scripts/distinst -local -allall -libs -noexit + # cd /opt/sge + # touch bin/lx-amd64/{qmon,qtcsh} + # ln -s /opt/sge/default/common/settings.sh /etc/profile.d/sge_settings.sh + # echo . /etc/profile.d/sge_settings.sh >> /etc/bash.bashrc + # chmod a+x /root/boot-sge.sh + # apt-get clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* + # - name: Boot SGE + # run: | + # cd /opt/sge + # ./inst_sge -m -x -s -auto util/install_modules/inst_template.conf + # . 
/etc/profile.d/sge_settings.sh + # cd $HOME + # qconf -as `hostname` + # qconf -mattr queue shell_start_mode unix_behavior all.q + # update_conf() { + # TMPF=`mktemp` + # cat > $TMPF <> $GITHUB_ENV + echo "DBUS_SESSION_BUS_ADDRESS=$DBUS_SESSION_BUS_ADDRESS" >> $GITHUB_ENV + - name: Start Notification Daemon (for notifications) + run: | + xfce4-notifyd & + sleep 2 # Give it some time to start + - name: Send Notification (test notifications) + run: | + notify-send "GitHub Runner Notification" "This is a test notification from GitHub Actions" + - name: Debug Running Processes (for notifications) + run: | + ps aux | grep notify + ps aux | grep xfce4-notifyd + dbus-monitor --session & + sleep 3 + - uses: actions/checkout@v4 + - name: Fetch tags + run: git fetch --prune --unshallow + - name: Install Minconda + uses: conda-incubator/setup-miniconda@v3 + with: + auto-activate-base: true + activate-environment: "" + - name: Install MRtrix via Conda + run: | + conda install -c mrtrix3 mrtrix3 + mrconvert --version + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.x' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install build twine + - name: Install package + run: pip install .[doc] + - name: Install Python3 kernel + run: python -m ipykernel install --user + - name: Build docs + run: | + cd docs + make html + cd .. + - uses: actions/upload-artifact@v4 + with: + name: docs + path: docs/build/html + + deploy: + needs: [build-docs, test, test-singularity, test-slurm] + runs-on: ubuntu-latest + steps: + - name: Download dist + uses: actions/download-artifact@v4 + with: + name: dist + path: dist + - name: Check for PyPI token on tag + id: deployable + if: github.event_name == 'release' + env: + PYPI_API_TOKEN: "${{ secrets.PYPI_API_TOKEN }}" + run: if [ -n "$PYPI_API_TOKEN" ]; then echo "DEPLOY=true" >> $GITHUB_OUTPUT; fi + - name: Upload to PyPI + if: steps.deployable.outputs.DEPLOY + uses: pypa/gh-action-pypi-publish@release/v1 + with: + user: __token__ + password: ${{ secrets.PYPI_API_TOKEN }} + + deploy-docs: + needs: [build-docs, deploy] + runs-on: ubuntu-latest + steps: + - name: Download docs + uses: actions/download-artifact@v4 + with: + name: docs + path: docs-build + - name: Check for GHPAGES_DEPLOY_KEY token + id: deployable + if: github.event_name == 'release' + env: + GHPAGES_DEPLOY_KEY: "${{ secrets.GHPAGES_DEPLOY_KEY }}" + run: if [ -n "$GHPAGES_DEPLOY_KEY" ]; then echo "DEPLOY=true" >> $GITHUB_OUTPUT; fi + - name: Deploy Docs to GitHub Pages + if: steps.deployable.outputs.DEPLOY + uses: peaceiris/actions-gh-pages@v3 + with: + github_token: ${{ secrets.GHPAGES_DEPLOY_KEY }} + publish_dir: docs-build diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml deleted file mode 100644 index 640a5b0d09..0000000000 --- a/.github/workflows/publish.yml +++ /dev/null @@ -1,35 +0,0 @@ -# This workflows will upload a Python Package using Twine when a release is created -# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries - -name: Upload to PyPI - -on: - release: - types: [published] - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - deploy: - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v4 - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: '3.x' - - name: Install dependencies - run: | - python -m pip install 
--upgrade pip - pip install build twine - - - name: Build and publish - env: - TWINE_USERNAME: __token__ - TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }} - run: | - python -m build - twine upload dist/* diff --git a/.github/workflows/testdask.yml b/.github/workflows/testdask.yml deleted file mode 100644 index 7ca8a29f51..0000000000 --- a/.github/workflows/testdask.yml +++ /dev/null @@ -1,45 +0,0 @@ -name: Dask - -on: - push: - branches: - - master - pull_request: - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -permissions: - contents: read - -jobs: - test: - strategy: - matrix: - os: [ubuntu-latest, macos-latest] - python-version: ['3.9', '3.10', '3.11', '3.12'] - fail-fast: false - runs-on: ${{ matrix.os }} - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - with: - repository: ${{ github.repository }} - - - name: Setup Python version ${{ matrix.python-version }} - uses: actions/setup-python@v5 - with: - python-version: ${{ matrix.python-version }} - - - name: Install dependencies for Dask - run: | - pip install -e ".[test,dask]" - - - name: Run tests for Dask - run: | - pytest -v --dask pydra/engine --cov pydra --cov-config .coveragerc --cov-report xml:cov.xml - - - name: Upload to codecov - run: codecov -f cov.xml -F unittests -e GITHUB_WORKFLOW diff --git a/.github/workflows/testpsijlocal.yml b/.github/workflows/testpsijlocal.yml deleted file mode 100644 index 2e1a752ed2..0000000000 --- a/.github/workflows/testpsijlocal.yml +++ /dev/null @@ -1,45 +0,0 @@ -name: PSI/J-Local - -on: - push: - branches: - - master - pull_request: - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -permissions: - contents: read - -jobs: - test: - strategy: - matrix: - os: [ubuntu-latest, macos-latest] - python-version: ['3.11'] - fail-fast: false - runs-on: ${{ matrix.os }} - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - with: - repository: ${{ github.repository }} - - - name: Setup Python version ${{ matrix.python-version }} - uses: actions/setup-python@v5 - with: - python-version: ${{ matrix.python-version }} - - - name: Install dependencies for PSI/J - run: | - pip install -e ".[test, psij]" - - - name: Run tests for PSI/J - run: | - pytest --color=yes -vs --psij=local -n auto pydra/engine --cov pydra --cov-config .coveragerc --cov-report xml:cov.xml - - - name: Upload to codecov - run: codecov -f cov.xml -F unittests -e GITHUB_WORKFLOW diff --git a/.github/workflows/testpsijslurm.yml b/.github/workflows/testpsijslurm.yml deleted file mode 100644 index 9dc9100800..0000000000 --- a/.github/workflows/testpsijslurm.yml +++ /dev/null @@ -1,58 +0,0 @@ -name: PSI/J-SLURM - -on: - push: - branches: - - master - pull_request: - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - build: - strategy: - matrix: - python-version: [3.11.5] - fail-fast: false - runs-on: ubuntu-latest - env: - DOCKER_IMAGE: adi611/docker-centos7-slurm:23.02.1 - - steps: - - name: Disable etelemetry - run: echo "NO_ET=TRUE" >> $GITHUB_ENV - - uses: actions/checkout@v4 - - name: Pull docker image - run: | - docker pull $DOCKER_IMAGE - # Have image running in the background - docker run `bash <(curl -s https://codecov.io/env)` -itd -h slurmctl --cap-add sys_admin -d --name slurm -v `pwd`:/pydra -e NO_ET=$NO_ET $DOCKER_IMAGE - - name: Display previous jobs with sacct - run: | - echo "Allowing ports/daemons time to start" && sleep 10 - docker exec slurm bash -c 
"sacctmgr -i add account none,test Cluster=linux Description='none' Organization='none'" - docker exec slurm bash -c "sacct && sinfo && squeue" 2&> /dev/null - if [ $? -ne 0 ]; then - echo "Slurm docker image error" - exit 1 - fi - - name: Setup Python - run: | - docker exec slurm bash -c "echo $NO_ET" - docker exec slurm bash -c "ls -la && echo list top level dir" - docker exec slurm bash -c "ls -la /pydra && echo list pydra dir" - if [[ "${{ matrix.python-version }}" == "3.11.5" ]]; then - docker exec slurm bash -c "CONFIGURE_OPTS=\"-with-openssl=/opt/openssl\" pyenv install -v 3.11.5" - fi - docker exec slurm bash -c "pyenv global ${{ matrix.python-version }}" - docker exec slurm bash -c "pip install --upgrade pip && pip install -e /pydra[test,psij] && python -c 'import pydra; print(pydra.__version__)'" - - name: Run pytest - run: | - docker exec slurm bash -c "pytest --color=yes -vs -n auto --psij=slurm --cov pydra --cov-config /pydra/.coveragerc --cov-report xml:/pydra/cov.xml --doctest-modules /pydra/pydra/ -k 'not test_audit_prov and not test_audit_prov_messdir_1 and not test_audit_prov_messdir_2 and not test_audit_prov_wf and not test_audit_all'" - - name: Upload to codecov - run: | - docker exec slurm bash -c "pip install urllib3==1.26.6" - docker exec slurm bash -c "codecov --root /pydra -f /pydra/cov.xml -F unittests" - docker rm -f slurm diff --git a/.github/workflows/testpydra.yml b/.github/workflows/testpydra.yml deleted file mode 100644 index 3ead2e3a6b..0000000000 --- a/.github/workflows/testpydra.yml +++ /dev/null @@ -1,125 +0,0 @@ -name: Pydra - -on: - push: - branches: - - master - pull_request: - -defaults: - run: - shell: bash - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -permissions: - contents: read - - -jobs: - build: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - uses: actions/setup-python@v5 - with: - python-version: 3 - - run: pip install --upgrade build twine - - run: python -m build - - run: twine check dist/* - - uses: actions/upload-artifact@v4 - with: - name: dist - path: dist/ - - name: Build archive - run: | - git clean -fxd - mkdir archive - git archive -o archive/pydra.zip HEAD - - uses: actions/upload-artifact@v4 - with: - name: archive - path: archive/ - - test: - needs: ['build'] - strategy: - matrix: - os: [macos-latest, ubuntu-latest, windows-latest] - python-version: ['3.8', '3.9', '3.10', '3.11', '3.12'] - install: ['wheel'] - include: - - os: 'ubuntu-latest' - python-version: '3.11' - install: 'sdist' - - os: 'ubuntu-latest' - python-version: '3.11' - install: 'repo' - - os: 'ubuntu-latest' - python-version: '3.11' - install: 'archive' - fail-fast: false - runs-on: ${{ matrix.os }} - - - steps: - - name: Fetch sdist/wheel - uses: actions/download-artifact@v4 - if: matrix.install == 'sdist' || matrix.install == 'wheel' - with: - name: dist - path: dist/ - - name: Fetch git archive - uses: actions/download-artifact@v4 - if: matrix.install == 'archive' - with: - name: archive - path: archive/ - - name: Fetch repository - uses: actions/checkout@v4 - if: matrix.install == 'repo' - - - name: Set up Python ${{ matrix.python-version }} on ${{ matrix.os }} - uses: actions/setup-python@v5 - with: - python-version: ${{ matrix.python-version }} - - name: Update pip - run: python -m pip install --upgrade pip - - - name: Determine installation target - run: | - if [[ "$INSTALL" = "sdist" ]]; then - echo "ARCHIVE=$( ls dist/*.tar.gz )" >> $GITHUB_ENV - elif [[ 
"$INSTALL" = "wheel" ]]; then - echo "ARCHIVE=$( ls dist/*.whl )" >> $GITHUB_ENV - elif [[ "$INSTALL" = "archive" ]]; then - echo "ARCHIVE=$( ls archive/*.zip )" >> $GITHUB_ENV - elif [[ "$INSTALL" = "repo" ]]; then - echo "ARCHIVE=." >> $GITHUB_ENV - fi - env: - INSTALL: ${{ matrix.install }} - - - name: Install Pydra - run: pip install $ARCHIVE - - - name: Print version - run: python -c "import pydra; print(pydra.__version__)" - - - name: Install Pydra tests dependencies - run: pip install pydra[test] - - - name: Disable etelemetry - run: echo "NO_ET=TRUE" >> $GITHUB_ENV - - - name: Pytest - run: | - pytest -vs -n auto --doctest-modules --pyargs pydra \ - --cov pydra --cov-config .coveragerc --cov-report xml:cov.xml - - - name: Upload to codecov - run: codecov -f cov.xml -F unittests -e GITHUB_WORKFLOW diff --git a/.github/workflows/testsingularity.yml b/.github/workflows/testsingularity.yml deleted file mode 100644 index 6cb597cdf8..0000000000 --- a/.github/workflows/testsingularity.yml +++ /dev/null @@ -1,74 +0,0 @@ -name: Singularity - -on: - push: - branches: - - master - pull_request: - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - build: - name: Build - runs-on: ubuntu-latest - strategy: - matrix: - python-version: [3.8, 3.9, "3.10", "3.11"] - fail-fast: False - - steps: - - name: Set env - run: | - echo "RELEASE_VERSION=v3.7.1" >> $GITHUB_ENV - echo "NO_ET=TRUE" >> $GITHUB_ENV - - name: Setup Singularity - uses: actions/checkout@v4 - with: - repository: hpcng/singularity - ref: 'v3.7.1' - path: 'singularity' - - name: Setup GO - uses: actions/setup-go@v5 - with: - go-version: '^1.13' - - name: Install OS deps - run: | - sudo apt-get update - sudo apt-get install flawfinder squashfs-tools uuid-dev libuuid1 libffi-dev libssl-dev libssl1.1 \ - libarchive-dev libgpgme11-dev libseccomp-dev wget gcc make pkg-config -y - - name: Build - run: | - cd singularity - ./mconfig --without-suid -p /usr/local/ - make -C ./builddir - sudo make -C ./builddir install - cd .. 
- - name: Echo singularity version - run: | - echo ${{ github.ref }} - singularity --version - - - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 - with: - python-version: ${{ matrix.python-version }} - - name: Update build tools - run: python -m pip install --upgrade pip - - - - name: Checkout Pydra repo - uses: actions/checkout@v4 - with: - repository: ${{ github.repository }} - - name: Install pydra (test) - run: pip install -e ".[test]" - - - - name: Pytest - run: pytest -vs --cov pydra --cov-config .coveragerc --cov-report xml:cov.xml pydra/engine/tests/test_singularity.py - - name: Upload to codecov - run: codecov -f cov.xml -F unittests -e GITHUB_WORKFLOW diff --git a/.github/workflows/testslurm.yml b/.github/workflows/testslurm.yml deleted file mode 100644 index 0e1d17f09b..0000000000 --- a/.github/workflows/testslurm.yml +++ /dev/null @@ -1,58 +0,0 @@ -name: SLURM - -on: - push: - branches: - - master - pull_request: - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - build: - strategy: - matrix: - python-version: [3.8.16, 3.9.16, 3.10.9, 3.11.5] - fail-fast: false - runs-on: ubuntu-latest - env: - DOCKER_IMAGE: adi611/docker-centos7-slurm:23.02.1 - - steps: - - name: Disable etelemetry - run: echo "NO_ET=TRUE" >> $GITHUB_ENV - - uses: actions/checkout@v4 - - name: Pull docker image - run: | - docker pull $DOCKER_IMAGE - # Have image running in the background - docker run `bash <(curl -s https://codecov.io/env)` -itd -h slurmctl --cap-add sys_admin -d --name slurm -v `pwd`:/pydra -e NO_ET=$NO_ET $DOCKER_IMAGE - - name: Display previous jobs with sacct - run: | - echo "Allowing ports/daemons time to start" && sleep 10 - docker exec slurm bash -c "sacctmgr -i add account none,test Cluster=linux Description='none' Organization='none'" - docker exec slurm bash -c "sacct && sinfo && squeue" 2&> /dev/null - if [ $? -ne 0 ]; then - echo "Slurm docker image error" - exit 1 - fi - - name: Setup Python - run: | - docker exec slurm bash -c "echo $NO_ET" - docker exec slurm bash -c "ls -la && echo list top level dir" - docker exec slurm bash -c "ls -la /pydra && echo list pydra dir" - if [[ "${{ matrix.python-version }}" == "3.11.5" ]]; then - docker exec slurm bash -c "CONFIGURE_OPTS=\"-with-openssl=/opt/openssl\" pyenv install -v 3.11.5" - fi - docker exec slurm bash -c "pyenv global ${{ matrix.python-version }}" - docker exec slurm bash -c "pip install --upgrade pip && pip install -e /pydra[test] && python -c 'import pydra; print(pydra.__version__)'" - - name: Run pytest - run: | - docker exec slurm bash -c "pytest --color=yes -vs --cov pydra --cov-config /pydra/.coveragerc --cov-report xml:/pydra/cov.xml --doctest-modules /pydra/pydra/ -k 'not test_audit_prov and not test_audit_prov_messdir_1 and not test_audit_prov_messdir_2 and not test_audit_prov_wf and not test_audit_all'" - - name: Upload to codecov - run: | - docker exec slurm bash -c "pip install urllib3==1.26.6" - docker exec slurm bash -c "codecov --root /pydra -f /pydra/cov.xml -F unittests" - docker rm -f slurm diff --git a/.gitignore b/.gitignore index da16b937b9..08331cdb4a 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,7 @@ dist __pycache__ *.pyc +.python-version .ipynb_checkpoints .vscode/ @@ -18,6 +19,7 @@ cov.xml *.venv .DS_Store +.ipynb_checkpoints # This can be generated in-tree. We never want to commit it. 
-pydra/_version.py +pydra/utils/_version.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f36105398e..2ea004790e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,24 +1,29 @@ # See https://pre-commit.com for more information # See https://pre-commit.com/hooks.html for more hooks repos: -- repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.6.0 - hooks: - - id: trailing-whitespace - - id: end-of-file-fixer - - id: check-yaml - - id: check-added-large-files -- repo: https://github.com/psf/black - rev: 24.4.2 - hooks: - - id: black -- repo: https://github.com/codespell-project/codespell - rev: v2.3.0 - hooks: - - id: codespell - additional_dependencies: - - tomli -- repo: https://github.com/PyCQA/flake8 - rev: 7.0.0 - hooks: - - id: flake8 + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.6.0 + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer + - id: check-yaml + - id: check-added-large-files + - repo: https://github.com/psf/black + rev: 24.4.2 + hooks: + - id: black + - repo: https://github.com/codespell-project/codespell + rev: v2.3.0 + hooks: + - id: codespell + additional_dependencies: + - tomli + - repo: https://github.com/PyCQA/flake8 + rev: 7.0.0 + hooks: + - id: flake8 + - repo: https://github.com/kynan/nbstripout + rev: 0.5.0 + hooks: + - id: nbstripout + files: \.(ipynb)$ diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 0512ede649..48d77d79d3 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -5,20 +5,17 @@ # Required version: 2 -# Build with Python 3.9 on the latest version of Ubuntu. +# Build with Python 3.11 on the latest version of Ubuntu. build: os: ubuntu-22.04 tools: - python: '3.9' + python: '3.11' # Build documentation in the docs/ directory with Sphinx. sphinx: - configuration: docs/conf.py + configuration: empty-docs/conf.py # Install extra requirements for the documentation. python: install: - - method: pip - path: '.' 
- extra_requirements: - - doc + - requirements: empty-docs/requirements.txt diff --git a/benchmark.py b/benchmark.py new file mode 100644 index 0000000000..35f6de914d --- /dev/null +++ b/benchmark.py @@ -0,0 +1,38 @@ +import asyncio +import time + + +def sync_function(x): + return x * 2 + + +async def async_function(x): + return x * 2 + + +def benchmark_sync(): + start_time = time.time() + for _ in range(1000000): + sync_function(10) + end_time = time.time() + return end_time - start_time + + +async def benchmark_async(): + start_time = time.time() + for _ in range(1000000): + await async_function(10) + end_time = time.time() + return end_time - start_time + + +def main(): + sync_time = benchmark_sync() + print(f"Sync function time: {sync_time:.6f} seconds") + + async_time = asyncio.run(benchmark_async()) + print(f"Async function time: {async_time:.6f} seconds") + + +if __name__ == "__main__": + main() diff --git a/docker-scripts-for-tests/Dockerfile b/docker-scripts-for-tests/Dockerfile new file mode 100644 index 0000000000..27d57610e7 --- /dev/null +++ b/docker-scripts-for-tests/Dockerfile @@ -0,0 +1,3 @@ +FROM adi611/docker-centos7-slurm:23.02.1 +RUN CONFIGURE_OPTS="-with-openssl=/opt/openssl" pyenv install -v 3.11.5 +RUN pyenv global 3.11.5 diff --git a/docker-scripts-for-tests/run-docker-slurm-test.sh b/docker-scripts-for-tests/run-docker-slurm-test.sh new file mode 100755 index 0000000000..2445f8553e --- /dev/null +++ b/docker-scripts-for-tests/run-docker-slurm-test.sh @@ -0,0 +1,8 @@ +if [ -z "$1" ]; then + TEST="::$1" +else + TEST="" +fi + + +docker exec pydra-slurm-docker bash -c "pytest -vv --with-psij --only-slurm -s /pydra/pydra/engine/test_submitter.py$TEST --color=yes -vs $2" diff --git a/docker-scripts-for-tests/start-docker-slurm.sh b/docker-scripts-for-tests/start-docker-slurm.sh new file mode 100755 index 0000000000..0592145567 --- /dev/null +++ b/docker-scripts-for-tests/start-docker-slurm.sh @@ -0,0 +1,23 @@ +PKG_DIR=$(realpath $(dirname $0)/..) +NO_ET=TRUE + +# Pull image +docker build -t pydra-slurm-docker $PKG_DIR/docker-scripts-for-tests + +# Start image +docker run --rm -itd -h slurmctl --cap-add sys_admin -d --name pydra-slurm-docker -v $PKG_DIR:/pydra -e NO_ET=$NO_ET pydra-slurm-docker + +# Display previous jobs with sacct +echo "Allowing ports/daemons time to start" && sleep 20 +docker exec pydra-slurm-docker bash -c "sacctmgr -i add account none,test Cluster=linux Description='none' Organization='none'" +docker exec pydra-slurm-docker bash -c "sacct && sinfo && squeue" 2&> /dev/null +if [ $? 
-ne 0 ]; then + echo "Slurm docker image error" + exit 1 +fi + +# Setup Python +docker exec pydra-slurm-docker bash -c "echo $NO_ET" +docker exec pydra-slurm-docker bash -c "ls -la && echo list top level dir" +docker exec pydra-slurm-docker bash -c "ls -la /pydra && echo list pydra dir" +docker exec pydra-slurm-docker bash -c "pip install --upgrade pip && pip install -e /pydra[test,psij] && python -c 'import pydra.engine; print(pydra.engine.__version__)'" diff --git a/docs/.gitignore b/docs/.gitignore deleted file mode 100644 index 7dcd40d957..0000000000 --- a/docs/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -_build/ -api/ diff --git a/docs/Makefile b/docs/Makefile index b167d81714..e6d46dcbcc 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -5,9 +5,7 @@ SPHINXOPTS = SPHINXBUILD = sphinx-build PAPER = -BUILDDIR = _build -CURBRANCH = master -PYTHONPATH = $(PWD) +BUILDDIR = build # User-friendly check for sphinx-build ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) @@ -17,11 +15,11 @@ endif # Internal variables. PAPEROPT_a4 = -D latex_paper_size=a4 PAPEROPT_letter = -D latex_paper_size=letter -ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . +ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source # the i18n builder cannot share the environment and doctrees with the others -I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . +I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source -.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext +.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest coverage gettext help: @echo "Please use \`make ' where is one of" @@ -32,6 +30,7 @@ help: @echo " json to make JSON files" @echo " htmlhelp to make HTML files and a HTML help project" @echo " qthelp to make HTML files and a qthelp project" + @echo " applehelp to make an Apple Help Book" @echo " devhelp to make HTML files and a Devhelp project" @echo " epub to make an epub" @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" @@ -47,16 +46,13 @@ help: @echo " pseudoxml to make pseudoxml-XML files for display purposes" @echo " linkcheck to check all external links for integrity" @echo " doctest to run all doctests embedded in the documentation (if enabled)" - + @echo " coverage to run coverage check of the documentation (if enabled)" clean: rm -rf $(BUILDDIR)/* - rm -rf reference/* - rm -rf docs/api html: - mkdir -p _static _templates - PYTHONPATH=$(PYTHONPATH) $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html + $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html @echo @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." @@ -91,17 +87,25 @@ qthelp: @echo @echo "Build finished; now you can run "qcollectiongenerator" with the" \ ".qhcp project file in $(BUILDDIR)/qthelp, like this:" - @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/pydra.qhcp" + @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/Pype9.qhcp" @echo "To view the help file:" - @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/pydra.qhc" + @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/Pype9.qhc" + +applehelp: + $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp + @echo + @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." + @echo "N.B. 
You won't be able to view it unless you put it in" \ + "~/Library/Documentation/Help or install it in your application" \ + "bundle." devhelp: $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp @echo @echo "Build finished." @echo "To view the help file:" - @echo "# mkdir -p $$HOME/.local/share/devhelp/pydra" - @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/pydra" + @echo "# mkdir -p $$HOME/.local/share/devhelp/Pype9" + @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/Pype9" @echo "# devhelp" epub: @@ -172,6 +176,11 @@ doctest: @echo "Testing of doctests in the sources finished, look at the " \ "results in $(BUILDDIR)/doctest/output.txt." +coverage: + $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage + @echo "Testing of coverage in the sources finished, look at the " \ + "results in $(BUILDDIR)/coverage/python.txt." + xml: $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml @echo @@ -181,6 +190,3 @@ pseudoxml: $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml @echo @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." - -versioned: - PYTHONPATH=$(PYTHONPATH) sphinx-versioning -vv -l ./docs/conf.py build -r $(CURBRANCH) ./docs/ docs/$(BUILDDIR)/html/ diff --git a/docs/api.rst b/docs/api.rst deleted file mode 100644 index fe1e815677..0000000000 --- a/docs/api.rst +++ /dev/null @@ -1,17 +0,0 @@ -Library API (application programmer interface) -============================================== - -.. automodule:: pydra - :members: - :undoc-members: - :show-inheritance: - -Subpackages ------------ - -.. toctree:: - - api/pydra.engine - api/pydra.mark - api/pydra.tasks - api/pydra.utils diff --git a/docs/changes.rst b/docs/changes.rst deleted file mode 100644 index 4e23840e90..0000000000 --- a/docs/changes.rst +++ /dev/null @@ -1,132 +0,0 @@ -Release Notes -============= - -0.8.0 ------ - -* refactoring template formatting for ``input_spec`` -* fixing issues with input fields with extension (and using them in templates) -* adding simple validators to input spec (using ``attr.validator``) -* adding ``create_dotfile`` for workflows, that creates graphs as dotfiles (can convert to other formats if dot available) -* adding a simple user guide with ``input_spec`` description -* expanding docstrings for ``State``, ``audit`` and ``messenger`` -* updating syntax to newer python - -0.7.0 ------ - -* refactoring the error handling by padra: improving raised errors, removing nodes from the workflow graph that can't be run -* refactoring of the ``input_spec``: adapting better to the nipype interfaces -* switching from ``pkg_resources.declare_namespace`` to the stdlib ``pkgutil.extend_path`` -* moving ``readme`` to rst format - - -0.6.2 ------ - -* Use pkgutil to declare ``pydra.tasks`` as a namespace package, ensuring better support for - editable mode. - -0.6.1 ------ - -* Add ``pydra.tasks`` namespace package to enable separate packages of ``Task``\s to be - installed into ``pydra.tasks``. 
-* Raise error when task or workflow name conflicts with names of attributes, methods, or - other tasks already added to workflow -* Mention ``requirements.txt`` in README - -0.6 ---- - -* removing the tutorial to a `separate repo `__ -* adding windows tests to codecov -* accepting ``None`` as a valid output from a ``FunctionTask``, also for function that returns multiple values -* fixing slurm error files -* adding ``wf._connection`` to ``checksum`` -* allowing for updates of ``wf._connections`` -* editing output, so it works with ``numpy.arrays`` -* removing ``to_job`` and pickling task instead (workers read the tasks and set the proper input, so the multiple copies of the input are not kept in the memory) -* adding standalone function ``load_and_run`` that can load and run a task from a pickle file -* removing ``create_pyscript`` and simplifying the slurm worker -* improving error reports in errors flies -* fixing ``make_class`` so the ``Output`` is properly formatted - -0.5 ---- - -* fixing ``hash_dir`` function -* adding ``get_available_cpus`` to get the number of CPUs available to the current process or available on the system -* adding simple implementation for ``BoshTask`` that uses boutiques descriptor -* adding azure to CI -* fixing code for windows -* etelementry updates -* adding more verbose output for task ``result`` - returns values or indices for input fields -* adding an experimental implementation of Dask Worker (limited testing with ci) - -0.4 ---- - -* reorganization of the ``State`` class, fixing small issues with the class -* fixing some paths issues on windows os -* adding osx and window sto the travis runs (right now allowing for failures for windows) -* adding ``PydraStateError`` for exception in the ``State`` class -* small fixes to the hashing functions, adding more tests -* adding ``hash_dir`` to calculate hash for ``Directory`` type - -0.3.1 ------ - -* passing ``wf.cache_locations`` to the task -* using ``rerun`` from submitter to all task -* adding ``test_rerun`` and ``propagate_rerun`` for workflows -* fixing task with a full combiner -* adding ``cont_dim`` to specify dimensionality of the input variables (how much the input is nested) - -0.3 ---- - -* adding sphinx documentation -* moving from ``dataclasses`` to ``attrs`` -* adding ``container`` flag to the ``ShellCommandTask`` -* fixing ``cmdline``, ``command_args`` and ``container_args`` for tasks with states -* adding ``CONTRIBUTING.md`` -* fixing hash calculations for inputs with a list of files -* using ``attr.NOTHING`` for input that is not set - -0.2.2 ------ - -* supporting tuple as a single element of an input - -0.2.1 ------ - -* fixing: nodes with states and input fields (from splitter) that are empty were failing - -0.2 ---- - -* big changes in ``ShellTask``, ``DockerTask`` and ``SingularityTask`` - * customized input specification and output specification for ``Task``\s - * adding singularity checks to Travis CI - * binding all input files to the container -* changes in ``Workflow`` - * passing all outputs to the next node: ``lzout.all_`` - * fixing inner splitter -* allowing for ``splitter`` and ``combiner`` updates -* adding ``etelementry`` support - -0.1 ---- - -* Core dataflow creation and management API -* Distributed workers: - * concurrent futures - * SLURM -* Notebooks for Pydra concepts - -0.0.1 ------ - -Initial Pydra Dataflow Engine release. 
diff --git a/docs/combiner.rst b/docs/combiner.rst deleted file mode 100644 index 78875e1e55..0000000000 --- a/docs/combiner.rst +++ /dev/null @@ -1,66 +0,0 @@ -Grouping Task's Output -======================= - -In addition to the splitting the input, *Pydra* supports grouping -or combining the output resulting from the splits. -In order to achieve this for a *Task*, a user can specify a *combiner*. -This can be set by calling ``combine`` method. -Note, the *combiner* only makes sense when a *splitter* is -set first. When *combiner=x*, all values are combined together within one list, -and each element of the list represents an output of the *Task* for the specific -value of the input *x*. Splitting and combining for this example can be written -as follows: - -.. math:: - - S = x &:& ~x=[x_1, x_2, ..., x_n] \mapsto x=x_1, x=x_2, ..., x=x_n, \\ - C = x &:& ~out(x_1), ...,out(x_n) \mapsto out_{comb}=[out(x_1), ...out(x_n)], - -where `S` represents the *splitter*, *C* represents the *combiner*, :math:`x` is the input field, -:math:`out(x_i)` represents the output of the *Task* for :math:`x_i`, and :math:`out_{comb}` -is the final output after applying the *combiner*. - -In the situation where input has multiple fields and an *outer splitter* is used, -there are various ways of combining the output. -Taking as an example the task from the previous section, -user might want to combine all the outputs for one specific value of :math:`x_i` and -all the values of :math:`y`. -In this situation, the combined output would be a two dimensional list, each -inner list for each value of :math:`x`. This can be written as follow: - -.. math:: - - C = y &:& ~out(x_1, y1), out(x_1, y2), ...out(x_n, y_m) \\ - &\longmapsto& ~[[out(x_1, y_1), ..., out(x_1, y_m)], \\ - && ~..., \\ - && ~[out(x_n, y_1), ..., out(x_n, y_m)]]. - - - - -.. figure:: images/nd_spl_3_comb1.png - :figclass: h! - :scale: 75% - - - -However, for the same task the user might want to combine -all values of :math:`x` for specific values of :math:`y`. -One may also need to combine all the values together. -This can be achieved by providing a list of fields, :math:`[x, y]` to the combiner. -When a full combiner is set, i.e. all the fields from -the splitter are also in the combiner, the output is a one dimensional list: - -.. math:: - - C = [x, y] : out(x_1, y1), ...out(x_n, y_m) \longmapsto [out(x_1, y_1), ..., out(x_n, y_m)]. - - -.. figure:: images/nd_spl_3_comb3.png - :figclass: h! - :scale: 75% - -These are the basic examples of the *Pydra*'s *splitter-combiner* concept. It -is important to note, that *Pydra* allows for mixing *splitters* and *combiners* -on various levels of a dataflow. They can be set on a single *Task* or a *Workflow*. -They can be passed from one *Task* to following *Tasks* within the *Workflow*. diff --git a/docs/components.rst b/docs/components.rst deleted file mode 100644 index d4928e82c6..0000000000 --- a/docs/components.rst +++ /dev/null @@ -1,189 +0,0 @@ -Dataflows Components: Task and Workflow -======================================= -A *Task* is the basic runnable component of *Pydra* and is described by the -class ``TaskBase``. A *Task* has named inputs and outputs, thus allowing -construction of dataflows. It can be hashed and executes in a specific working -directory. Any *Pydra*'s *Task* can be used as a function in a script, thus allowing -dual use in *Pydra*'s *Workflows* and in standalone scripts. 
There are several -classes that inherit from ``TaskBase`` and each has a different application: - - -Function Tasks --------------- - -* ``FunctionTask`` is a *Task* that executes Python functions. Most Python functions - declared in an existing library, package, or interactively in a terminal can - be converted to a ``FunctionTask`` by using *Pydra*'s decorator - ``mark.task``. - - .. code-block:: python - - import numpy as np - from pydra import mark - fft = mark.annotate({'a': np.ndarray, - 'return': float})(np.fft.fft) - fft_task = mark.task(fft)() - result = fft_task(a=np.random.rand(512)) - - - `fft_task` is now a *Pydra* *Task* and result will contain a *Pydra*'s ``Result`` object. - In addition, the user can use Python's function annotation or another *Pydra* - decorator --- ``mark.annotate`` in order to specify the output. In the - following example, we decorate an arbitrary Python function to create named - outputs: - - .. code-block:: python - - @mark.task - @mark.annotate( - {"return": {"mean": float, "std": float}} - ) - def mean_dev(my_data): - import statistics as st - return st.mean(my_data), st.stdev(my_data) - - result = mean_dev(my_data=[...])() - - When the *Task* is executed `result.output` will contain two attributes: `mean` - and `std`. Named attributes facilitate passing different outputs to - different downstream nodes in a dataflow. - - -.. _shell_command_task: - -Shell Command Tasks -------------------- - -* ``ShellCommandTask`` is a *Task* used to run shell commands and executables. - It can be used with a simple command without any arguments, or with specific - set of arguments and flags, e.g.: - - .. code-block:: python - - ShellCommandTask(executable="pwd") - - ShellCommandTask(executable="ls", args="my_dir") - - The *Task* can accommodate more complex shell commands by allowing the user to - customize inputs and outputs of the commands. - One can generate an input - specification to specify names of inputs, positions in the command, types of - the inputs, and other metadata. - As a specific example, FSL's BET command (Brain - Extraction Tool) can be called on the command line as: - - .. code-block:: python - - bet input_file output_file -m - - Each of the command argument can be treated as a named input to the - ``ShellCommandTask``, and can be included in the input specification. - As shown next, even an output is specified by constructing - the *out_file* field form a template: - - .. code-block:: python - - bet_input_spec = SpecInfo( - name="Input", - fields=[ - ( "in_file", File, - { "help_string": "input file ...", - "position": 1, - "mandatory": True } ), - ( "out_file", str, - { "help_string": "name of output ...", - "position": 2, - "output_file_template": - "{in_file}_br" } ), - ( "mask", bool, - { "help_string": "create binary mask", - "argstr": "-m", } ) ], - bases=(ShellSpec,) ) - - ShellCommandTask(executable="bet", - input_spec=bet_input_spec) - - More details are in the :ref:`Input Specification section`. - -Container Tasks ---------------- -* ``ContainerTask`` class is a child class of ``ShellCommandTask`` and serves as - a parent class for ``DockerTask`` and ``SingularityTask``. Both *Container Tasks* - run shell commands or executables within containers with specific user defined - environments using Docker_ and Singularity_ software respectively. - This might be extremely useful for users and projects that require environment - encapsulation and sharing. 
- Using container technologies helps improve scientific - workflows reproducibility, one of the key concept behind *Pydra*. - - These *Container Tasks* can be defined by using - ``DockerTask`` and ``SingularityTask`` classes directly, or can be created - automatically from ``ShellCommandTask``, when an optional argument - ``container_info`` is used when creating a *Shell Task*. The following two - types of syntax are equivalent: - - .. code-block:: python - - DockerTask(executable="pwd", image="busybox") - - ShellCommandTask(executable="ls", - container_info=("docker", "busybox")) - -Workflows ---------- -* ``Workflow`` - is a subclass of *Task* that provides support for creating *Pydra* - dataflows. As a subclass, a *Workflow* acts like a *Task* and has inputs, outputs, - is hashable, and is treated as a single unit. Unlike *Tasks*, workflows embed - a directed acyclic graph. Each node of the graph contains a *Task* of any type, - including another *Workflow*, and can be added to the *Workflow* simply by calling - the ``add`` method. The connections between *Tasks* are defined by using so - called *Lazy Inputs* or *Lazy Outputs*. These are special attributes that allow - assignment of values when a *Workflow* is executed rather than at the point of - assignment. The following example creates a *Workflow* from two *Pydra* *Tasks*. - - .. code-block:: python - - # creating workflow with two input fields - wf = Workflow(input_spec=["x", "y"]) - # adding a task and connecting task's input - # to the workflow input - wf.add(mult(name="mlt", - x=wf.lzin.x, y=wf.lzin.y)) - # adding another task and connecting - # task's input to the "mult" task's output - wf.add(add2(name="add", x=wf.mlt.lzout.out)) - # setting workflow output - wf.set_output([("out", wf.add.lzout.out)]) - - -Task's State ------------- -All Tasks, including Workflows, can have an optional attribute representing an instance of the State class. -This attribute controls the execution of a Task over different input parameter sets. -This class is at the heart of Pydra's powerful Map-Reduce over arbitrary inputs of nested dataflows feature. -The State class formalizes how users can specify arbitrary combinations. -Its functionality is used to create and track different combinations of input parameters, -and optionally allow limited or complete recombinations. -In order to specify how the inputs should be split into parameter sets, and optionally combined after -the Task execution, the user can set splitter and combiner attributes of the State class. - -.. code-block:: python - - task_with_state = - add2().split(x=[1, 5]).combine("x") - -In this example, the ``State`` class is responsible for creating a list of two -separate inputs, *[{x: 1}, {x:5}]*, each run of the *Task* should get one -element from the list. Note that in this case the value for `x` is set in the `split()` -method, not at the task's initialisation. -The `combine()` method, specifies that the results are grouped back when returning the -result from the *Task*. - -While this example illustrates mapping and grouping of results over a single parameter, -*Pydra* extends this to arbitrary combinations of input fields and downstream grouping -over nested dataflows. Details of how splitters and combiners power *Pydra*'s -scalable dataflows are described in the next section. - - - -.. _Docker: https://www.docker.com/ -.. 
_Singularity: https://www.singularity.lbl.gov/ diff --git a/docs/conf.py b/docs/conf.py deleted file mode 100644 index fd0b69ca43..0000000000 --- a/docs/conf.py +++ /dev/null @@ -1,102 +0,0 @@ -# Configuration file for the Sphinx documentation builder. -# -# This file only contains a selection of the most common options. For a full -# list see the documentation: -# https://www.sphinx-doc.org/en/master/usage/configuration.html - -# -- Path setup -------------------------------------------------------------- - -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. -# -import sys -from pathlib import Path -from packaging.version import Version - -sys.path.insert(0, str(Path(__file__).parent.parent.absolute())) -sys.path.insert(1, str(Path(__file__).parent / "sphinxext")) -from pydra import __version__ -from github_link import make_linkcode_resolve - - -# -- Project information ----------------------------------------------------- - -project = "Pydra: A simple dataflow engine with scalable semantics" -copyright = "2019 - 2020, The Nipype Developers team" -author = "The Nipype Developers team" - -# The full version, including alpha/beta/rc tags -release = __version__ -version = Version(release).public - - -# -- General configuration --------------------------------------------------- - -# Add any Sphinx extension module names here, as strings. They can be -# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom -# ones. -extensions = [ - "sphinx.ext.autodoc", - "sphinx.ext.doctest", - "sphinx.ext.intersphinx", - "sphinx.ext.coverage", - "sphinx.ext.mathjax", - "sphinx.ext.ifconfig", - "sphinx.ext.linkcode", - "sphinx.ext.githubpages", - "sphinx.ext.napoleon", - "sphinxcontrib.apidoc", -] - -# Add any paths that contain templates here, relative to this directory. -templates_path = ["_templates"] - -# List of patterns, relative to source directory, that match files and -# directories to ignore when looking for source files. -# This pattern also affects html_static_path and html_extra_path. -exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "api/pydra.rst"] - - -# -- Options for HTML output ------------------------------------------------- - -# The theme to use for HTML and HTML Help pages. See the documentation for -# a list of builtin themes. -# -html_theme = "sphinx_rtd_theme" - -# Add any paths that contain custom static files (such as style sheets) here, -# relative to this directory. They are copied after the builtin static files, -# so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ["_static"] - -# -- Options for extensions --------------------------------------------------- - -# Autodoc -autodoc_mock_imports = ["cloudpickle", "matplotlib", "numpy", "psutil"] -apidoc_module_dir = "../pydra" -apidoc_output_dir = "api" -apidoc_excluded_paths = ["conftest.py", "*/tests/*", "tests/*", "data/*"] -apidoc_separate_modules = True -apidoc_extra_args = ["--module-first", "-d 1", "-T"] - -# Napoleon -# Accept custom section names to be parsed for numpy-style docstrings -# of parameters. -# Requires pinning sphinxcontrib-napoleon to a specific commit while -# https://github.com/sphinx-contrib/napoleon/pull/10 is merged. 
-napoleon_use_param = False -napoleon_custom_sections = [("Inputs", "Parameters"), ("Outputs", "Parameters")] - -# Intersphinx -intersphinx_mapping = {"python": ("https://docs.python.org/3", None)} - -# Linkcode -# The following is used by sphinx.ext.linkcode to provide links to github -linkcode_resolve = make_linkcode_resolve( - "pydra", - "https://github.com/nipype/pydra/blob/{revision}/" "{package}/{path}#L{lineno}", -) - -# Sphinx-versioning -scv_show_banner = True diff --git a/docs/input_spec.rst b/docs/input_spec.rst deleted file mode 100644 index 48d66fd814..0000000000 --- a/docs/input_spec.rst +++ /dev/null @@ -1,181 +0,0 @@ -.. _Input Specification section: - -Input Specification -=================== - -As it was mentioned in :ref:`shell_command_task`, the user can customize the input and output -for the `ShellCommandTask`. -In this section, more examples of the input specification will be provided. - - -Let's start from the previous example: - -.. code-block:: python - - bet_input_spec = SpecInfo( - name="Input", - fields=[ - ( "in_file", File, - { "help_string": "input file ...", - "position": 1, - "mandatory": True } ), - ( "out_file", str, - { "help_string": "name of output ...", - "position": 2, - "output_file_template": - "{in_file}_br" } ), - ( "mask", bool, - { "help_string": "create binary mask", - "argstr": "-m", } ) ], - bases=(ShellSpec,) ) - - ShellCommandTask(executable="bet", - input_spec=bet_input_spec) - - - -In order to create an input specification, a new `SpecInfo` object has to be created. -The field `name` specifies the type of the spec and it should be always "Input" for -the input specification. -The field `bases` specifies the "base specification" you want to use (can think about it as a -`parent class`) and it will usually contains `ShellSpec` only, unless you want to build on top of -your other specification (this will not be cover in this section). -The part that should be always customised is the `fields` part. -Each element of the `fields` is a separate input field that is added to the specification. -In this example, three-elements tuples - with name, type and dictionary with additional -information - are used. -But this is only one of the supported syntax, more options will be described below. - -Adding a New Field to the Spec ------------------------------- - -Pydra uses `attr` classes to represent the input specification, and the full syntax for each field -is: - -.. code-block:: python - - field1 = ("field1_name", attr.ib(type=<'field1_type'>, metadata=<'dictionary with metadata'>) - -However, we allow for shorter syntax, that does not include `attr.ib`: - -- providing only name and the type - -.. code-block:: python - - field1 = ("field1_name", <'field1_type'>) - - -- providing name, type and metadata (as in the example above) - -.. code-block:: python - - field1 = ("field1_name", <'field1_type'>, <'dictionary with metadata'>)) - -- providing name, type and default value - -.. code-block:: python - - field1 = ("field1_name", <'field1_type'>, <'default value'>) - -- providing name, type, default value and metadata - -.. code-block:: python - - field1 = ("field1_name", <'field1_type'>, <'default value', <'dictionary with metadata'>)) - - -Each of the shorter versions will be converted to the `(name, attr.ib(...)`. - - -Types ------ - -Type can be provided as a simple python type (e.g. `str`, `int`, `float`, etc.) -or can be more complex by using `typing.List`, `typing.Dict` and `typing.Union`. 
- -There are also special types provided by Pydra: - -- `File` and `Directory` - should be used in `input_spec` if the field is an existing file - or directory. - Pydra checks if the file or directory exists, and returns an error if it doesn't exist. - - -- `MultiInputObj` - a special type that takes a any value and if the value is not a list it - converts value to a 1-element list (it could be used together with `MultiOutputObj` - in the `output_spec` to reverse the conversion of the output values). - - - -Metadata --------- - -In the example we used multiple keys in the metadata dictionary including `help_string`, -`position`, etc. In this section all allowed key will be described: - -`help_string` (`str`, mandatory): - A short description of the input field. - -`mandatory` (`bool`, default: `False`): - If `True` user has to provide a value for the field. - -`sep` (`str`): - A separator if a list is provided as a value. - -`argstr` (`str`): - A flag or string that is used in the command before the value, e.g. `-v` or `-v {inp_field}`, - but it could be and empty string, `""`. - If `...` are used, e.g. `-v...`, the flag is used before every element if a list is provided - as a value. - If no `argstr` is used the field is not part of the command. - -`position` (`int`): - Position of the field in the command, could be nonnegative or negative integer. - If nothing is provided the field will be inserted between all fields with nonnegative positions - and fields with negative positions. - -`allowed_values` (`list`): - List of allowed values for the field. - -`requires` (`list`): - List of field names that are required together with the field. - -`xor` (`list`): - List of field names that are mutually exclusive with the field. - -`copyfile` (`bool`, default: `False`): - If `True`, a hard link is created for the input file in the output directory. - If hard link not possible, the file is copied to the output directory. - -`container_path` (`bool`, default: `False`, only for `ContainerTask`): - If `True` a path will be consider as a path inside the container (and not as a local path). - -`output_file_template` (`str`): - If provided, the field is treated also as an output field and it is added to the output spec. - The template can use other fields, e.g. `{file1}`. - Used in order to create an output specification. - -`output_field_name` (`str`, used together with `output_file_template`) - If provided the field is added to the output spec with changed name. - Used in order to create an output specification. - -`keep_extension` (`bool`, default: `True`): - A flag that specifies if the file extension should be removed from the field value. - Used in order to create an output specification. - -`readonly` (`bool`, default: `False`): - If `True` the input field can't be provided by the user but it aggregates other input fields - (for example the fields with `argstr: -o {fldA} {fldB}`). - -`formatter` (`function`): - If provided the `argstr` of the field is created using the function. This function can for example - be used to combine several inputs into one command argument. - The function can take `field` (this input field will be passed to the function), - `inputs` (entire `inputs` will be passed) or any input field name - (a specific input field will be sent). - - -Validators ----------- -Pydra allows for using simple validator for types and `allowev_values`. -The validators are disabled by default, but can be enabled by calling -`pydra.set_input_validator(flag=True)`. 
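To tie several of the metadata keys and the validators together, here is an illustrative sketch of two fields that combine `argstr`, `allowed_values`, `position` and `output_file_template`, followed by enabling the optional validators. The command flags, values and template are invented for illustration and do not come from a real interface; the `{in_file}` template assumes an `in_file` field exists in the same spec, as in the `bet` example.

.. code-block:: python

    import pydra

    method_field = (
        "method",
        str,
        {
            "help_string": "thresholding method",
            "argstr": "--method",
            "allowed_values": ["otsu", "fixed"],
            "position": 1,
        },
    )

    out_file_field = (
        "out_file",
        str,
        {
            "help_string": "name of the thresholded output image",
            "argstr": "-o",
            "position": -1,  # placed after all fields with nonnegative positions
            "output_file_template": "{in_file}_thr",  # also added to the output spec
        },
    )

    # enable the optional type / allowed_values checks described above
    pydra.set_input_validator(flag=True)

With the validator enabled, a value such as `method="median"` would then be rejected because it is not listed in `allowed_values`.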
diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000000..523fa3eb58 --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,263 @@ +@ECHO OFF + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set BUILDDIR=build +set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% source +set I18NSPHINXOPTS=%SPHINXOPTS% source +if NOT "%PAPER%" == "" ( + set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% + set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% +) + +if "%1" == "" goto help + +if "%1" == "help" ( + :help + echo.Please use `make ^` where ^ is one of + echo. html to make standalone HTML files + echo. dirhtml to make HTML files named index.html in directories + echo. singlehtml to make a single large HTML file + echo. pickle to make pickle files + echo. json to make JSON files + echo. htmlhelp to make HTML files and a HTML help project + echo. qthelp to make HTML files and a qthelp project + echo. devhelp to make HTML files and a Devhelp project + echo. epub to make an epub + echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter + echo. text to make text files + echo. man to make manual pages + echo. texinfo to make Texinfo files + echo. gettext to make PO message catalogs + echo. changes to make an overview over all changed/added/deprecated items + echo. xml to make Docutils-native XML files + echo. pseudoxml to make pseudoxml-XML files for display purposes + echo. linkcheck to check all external links for integrity + echo. doctest to run all doctests embedded in the documentation if enabled + echo. coverage to run coverage check of the documentation if enabled + goto end +) + +if "%1" == "clean" ( + for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i + del /q /s %BUILDDIR%\* + goto end +) + + +REM Check if sphinx-build is available and fallback to Python version if any +%SPHINXBUILD% 2> nul +if errorlevel 9009 goto sphinx_python +goto sphinx_ok + +:sphinx_python + +set SPHINXBUILD=python -m sphinx.__init__ +%SPHINXBUILD% 2> nul +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +:sphinx_ok + + +if "%1" == "html" ( + %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/html. + goto end +) + +if "%1" == "dirhtml" ( + %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. + goto end +) + +if "%1" == "singlehtml" ( + %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. + goto end +) + +if "%1" == "pickle" ( + %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can process the pickle files. + goto end +) + +if "%1" == "json" ( + %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can process the JSON files. 
+ goto end +) + +if "%1" == "htmlhelp" ( + %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can run HTML Help Workshop with the ^ +.hhp project file in %BUILDDIR%/htmlhelp. + goto end +) + +if "%1" == "qthelp" ( + %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can run "qcollectiongenerator" with the ^ +.qhcp project file in %BUILDDIR%/qthelp, like this: + echo.^> qcollectiongenerator %BUILDDIR%\qthelp\Pype9.qhcp + echo.To view the help file: + echo.^> assistant -collectionFile %BUILDDIR%\qthelp\Pype9.ghc + goto end +) + +if "%1" == "devhelp" ( + %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. + goto end +) + +if "%1" == "epub" ( + %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The epub file is in %BUILDDIR%/epub. + goto end +) + +if "%1" == "latex" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. + goto end +) + +if "%1" == "latexpdf" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + cd %BUILDDIR%/latex + make all-pdf + cd %~dp0 + echo. + echo.Build finished; the PDF files are in %BUILDDIR%/latex. + goto end +) + +if "%1" == "latexpdfja" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + cd %BUILDDIR%/latex + make all-pdf-ja + cd %~dp0 + echo. + echo.Build finished; the PDF files are in %BUILDDIR%/latex. + goto end +) + +if "%1" == "text" ( + %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The text files are in %BUILDDIR%/text. + goto end +) + +if "%1" == "man" ( + %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The manual pages are in %BUILDDIR%/man. + goto end +) + +if "%1" == "texinfo" ( + %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. + goto end +) + +if "%1" == "gettext" ( + %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The message catalogs are in %BUILDDIR%/locale. + goto end +) + +if "%1" == "changes" ( + %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes + if errorlevel 1 exit /b 1 + echo. + echo.The overview file is in %BUILDDIR%/changes. + goto end +) + +if "%1" == "linkcheck" ( + %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck + if errorlevel 1 exit /b 1 + echo. + echo.Link check complete; look for any errors in the above output ^ +or in %BUILDDIR%/linkcheck/output.txt. + goto end +) + +if "%1" == "doctest" ( + %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest + if errorlevel 1 exit /b 1 + echo. + echo.Testing of doctests in the sources finished, look at the ^ +results in %BUILDDIR%/doctest/output.txt. + goto end +) + +if "%1" == "coverage" ( + %SPHINXBUILD% -b coverage %ALLSPHINXOPTS% %BUILDDIR%/coverage + if errorlevel 1 exit /b 1 + echo. + echo.Testing of coverage in the sources finished, look at the ^ +results in %BUILDDIR%/coverage/python.txt. + goto end +) + +if "%1" == "xml" ( + %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. 
The XML files are in %BUILDDIR%/xml. + goto end +) + +if "%1" == "pseudoxml" ( + %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. + goto end +) + +:end diff --git a/docs/output_spec.rst b/docs/output_spec.rst deleted file mode 100644 index 2e0907076b..0000000000 --- a/docs/output_spec.rst +++ /dev/null @@ -1,90 +0,0 @@ -.. _Output Specification section: - -Output Specification -==================== - -As it was mentioned in :ref:`shell_command_task`, the user can customize the input and output -for the `ShellCommandTask`. -In this section, the output specification will be covered. - - -Instead of using field with `output_file_template` in the customized `input_spec` to specify an output field, -a customized `output_spec` can be used, e.g.: - - -.. code-block:: python - - output_spec = SpecInfo( - name="Output", - fields=[ - ( - "out1", - attr.ib( - type=File, - metadata={ - "output_file_template": "{inp1}", - "help_string": "output file", - "requires": ["inp1", "inp2"] - }, - ), - ) - ], - bases=(ShellOutSpec,), - ) - - ShellCommandTask(executable=executable, - output_spec=output_spec) - - - -Similarly as for `input_spec`, in order to create an output specification, -a new `SpecInfo` object has to be created. -The field `name` specifies the type of the spec and it should be always "Output" for -the output specification. -The field `bases` specifies the "base specification" you want to use (can think about it as a -`parent class`) and it will usually contains `ShellOutSpec` only, unless you want to build on top of -your other specification (this will not be cover in this section). -The part that should be always customised is the `fields` part. -Each element of the `fields` is a separate output field that is added to the specification. -In this example, a three-elements tuple - with name, type and dictionary with additional -information - is used. -See :ref:`Input Specification section` for other recognized syntax for specification's fields -and possible types. - - - -Metadata --------- - -The metadata dictionary for `output_spec` can include: - -`help_string` (`str`, mandatory): - A short description of the input field. The same as in `input_spec`. - -`mandatory` (`bool`, default: `False`): - If `True` the output file has to exist, otherwise an error will be raised. - -`output_file_template` (`str`): - If provided the output file name (or list of file names) is created using the template. - The template can use other fields, e.g. `{file1}`. The same as in `input_spec`. - -`output_field_name` (`str`, used together with `output_file_template`) - If provided the field is added to the output spec with changed name. - The same as in `input_spec`. - -`keep_extension` (`bool`, default: `True`): - A flag that specifies if the file extension should be removed from the field value. - The same as in `input_spec`. - -`requires` (`list`): - List of field names that are required to create a specific output. - The fields do not have to be a part of the `output_file_template` and - if any field from the list is not provided in the input, a `NOTHING` is returned for the specific output. - This has a different meaning than the `requires` form the `input_spec`. - -`callable` (`function`): - If provided the output file name (or list of file names) is created using the function. 
- The function can take `field` (the specific output field will be passed to the function), - `output_dir` (task `output_dir` will be used), `stdout`, `stderr` (`stdout` and `stderr` of - the task will be sent) `inputs` (entire `inputs` will be passed) or any input field name - (a specific input field will be sent). diff --git a/docs/requirements.txt b/docs/requirements.txt deleted file mode 100644 index fd5bbc2c82..0000000000 --- a/docs/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -# This requirements file includes unreleased dependencies above and beyond those -# specified in the project.optional-dependencies.doc table of pyproject.toml -git+https://github.com/effigies/sphinxcontrib-versioning.git@master#egg=sphinxcontrib-versioning diff --git a/docs/source/_static/css/custom.css b/docs/source/_static/css/custom.css new file mode 100644 index 0000000000..161e475987 --- /dev/null +++ b/docs/source/_static/css/custom.css @@ -0,0 +1,4 @@ +div.nbinput .prompt, +div.nboutput .prompt { + display: none; +} diff --git a/docs/images/nd_spl_1.png b/docs/source/_static/images/nd_spl_1.png similarity index 100% rename from docs/images/nd_spl_1.png rename to docs/source/_static/images/nd_spl_1.png diff --git a/docs/images/nd_spl_3.png b/docs/source/_static/images/nd_spl_3.png similarity index 100% rename from docs/images/nd_spl_3.png rename to docs/source/_static/images/nd_spl_3.png diff --git a/docs/images/nd_spl_3_comb1.png b/docs/source/_static/images/nd_spl_3_comb1.png similarity index 100% rename from docs/images/nd_spl_3_comb1.png rename to docs/source/_static/images/nd_spl_3_comb1.png diff --git a/docs/images/nd_spl_3_comb3.png b/docs/source/_static/images/nd_spl_3_comb3.png similarity index 100% rename from docs/images/nd_spl_3_comb3.png rename to docs/source/_static/images/nd_spl_3_comb3.png diff --git a/docs/images/nd_spl_4.png b/docs/source/_static/images/nd_spl_4.png similarity index 100% rename from docs/images/nd_spl_4.png rename to docs/source/_static/images/nd_spl_4.png diff --git a/docs/logo/pydra_logo.jpg b/docs/source/_static/logo/pydra_logo.jpg similarity index 100% rename from docs/logo/pydra_logo.jpg rename to docs/source/_static/logo/pydra_logo.jpg diff --git a/docs/logo/pydra_logo.png b/docs/source/_static/logo/pydra_logo.png similarity index 100% rename from docs/logo/pydra_logo.png rename to docs/source/_static/logo/pydra_logo.png diff --git a/docs/logo/pydra_logo.svg b/docs/source/_static/logo/pydra_logo.svg similarity index 100% rename from docs/logo/pydra_logo.svg rename to docs/source/_static/logo/pydra_logo.svg diff --git a/docs/source/conf.py b/docs/source/conf.py new file mode 100644 index 0000000000..3e623d1873 --- /dev/null +++ b/docs/source/conf.py @@ -0,0 +1,401 @@ +# -*- coding: utf-8 -*- +# +# Pype9 documentation build configuration file, created by +# sphinx-quickstart on Thu Mar 30 21:41:02 2017. +# +# This file is execfile()d with the current directory set to its +# containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. +from __future__ import print_function +import typing as ty +import datetime + +from pydra.utils import __version__ # noqa + + +authors = [("Nipype developers", "neuroimaging@python.org")] + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. 
If the directory is relative to the +# documentation root, use op.abspath to make it absolute, like shown here. +# sys.path.insert(0, op.abspath('.')) + +# -- General configuration ------------------------------------------------ + +# If your documentation needs a minimal Sphinx version, state it here. +# needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + "sphinx.ext.autodoc", + "sphinx.ext.doctest", + "sphinx.ext.intersphinx", + "sphinx.ext.todo", + "sphinx.ext.coverage", + "sphinx.ext.mathjax", + "nbsphinx", + "sphinx.ext.viewcode", + "sphinx.ext.napoleon", + "sphinx.ext.autosectionlabel", + "sphinxarg.ext", + "sphinx_click.ext", + "numpydoc", +] + + +nbsphinx_allow_errors = False + +# Add any paths that contain templates here, relative to this directory. +templates_path = ["_templates"] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# source_suffix = ['.rst', '.md'] +source_suffix = ".rst" + +# The encoding of source files. +# source_encoding = 'utf-8-sig' + +# The master toctree document. +master_doc = "index" + +# General information about the project. +project = "Pydra" +author = ", ".join(a for a, _ in authors) +copyright = "{}, {}".format(datetime.datetime.now().year, author) + + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +version = ".".join(__version__.split(".")[:2]) +# The full version, including alpha/beta/rc tags. +release = __version__ + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +# language = None + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +# today = '' +# Else, today_fmt is used as the format for a strftime call. +# today_fmt = '%B %d, %Y' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +exclude_patterns: ty.List[str] = [] + +# The reST default role (used for this markup: `text`) to use for all +# documents. +# default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. +# add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +# add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. +# show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = "lovelace" +pygments_dark_style = "fruity" + +# A list of ignored prefixes for module index sorting. +# modindex_common_prefix = [] + +# If true, keep warnings as "system message" paragraphs in the built documents. +# keep_warnings = False + +# If true, `todo` and `todoList` produce output, else they produce nothing. +todo_include_todos = True + + +# -- Options for HTML output ---------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. 
+html_theme = "furo" + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +html_theme_options = { + "light_css_variables": { + "color-brand-primary": "#69306d", + "color-brand-content": "#69306d", + }, + "dark_css_variables": { + "color-brand-primary": "#ce8dcf", + "color-brand-content": "#ce8dcf", + }, +} + +html_static_path = ["_static"] +html_css_files = ["css/custom.css"] + +# Add any paths that contain custom themes here, relative to this directory. +# html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] + +# The name for this set of Sphinx documents. If None, it defaults to +# " v documentation". +html_title = "Pydra v{}".format(version) + +# A shorter title for the navigation bar. Default is the same as html_title. +# html_short_title = 'Pydra v' + +# The name of an image file (relative to this directory) to place at the top +# of the sidebar. +html_logo = "_static/logo/pydra_logo.png" + +# The name of an image file (within the static path) to use as favicon of the +# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 +# pixels large. +html_favicon = "_static/logo/pydra_logo.png" + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ["_static"] + +# Add any extra paths that contain custom files (such as robots.txt or +# .htaccess) here, relative to this directory. These files are copied +# directly to the root of the documentation. +# html_extra_path = [] + +# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, +# using the given strftime format. +# html_last_updated_fmt = '%b %d, %Y' + +# If true, SmartyPants will be used to convert quotes and dashes to +# typographically correct entities. +# html_use_smartypants = True + +# Custom sidebar templates, maps document names to template names. +# html_sidebars = {} + +# Additional templates that should be rendered to pages, maps page names to +# template names. +# html_additional_pages = {} + +# If false, no module index is generated. +# html_domain_indices = True + +# If false, no index is generated. +# html_use_index = True + +# If true, the index is split into individual pages for each letter. +# html_split_index = False + +# If true, links to the reST sources are added to the pages. +# html_show_sourcelink = True + +# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. +# html_show_sphinx = True + +# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. +# html_show_copyright = True + +# If true, an OpenSearch description file will be output, and all pages will +# contain a tag referring to it. The value of this option must be the +# base URL from which the finished HTML is served. +# html_use_opensearch = '' + +# This is the file name suffix for HTML files (e.g. ".xhtml"). +# html_file_suffix = None + +# Language to be used for generating the HTML full-text search index. +# Sphinx supports the following languages: +# 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja' +# 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr' +# html_search_language = 'en' + +# A dictionary with options for the search language support, empty by default. 
+# Now only 'ja' uses this config value +# html_search_options = {'type': 'default'} + +# The name of a javascript file (relative to the configuration directory) that +# implements a search results scorer. If empty, the default will be used. +# html_search_scorer = 'scorer.js' + +language = "English" + +# Output file base name for HTML help builder. +htmlhelp_basename = "Pydra" + +# -- Options for LaTeX output --------------------------------------------- + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + "papersize": "a4paper", + # The font size ('10pt', '11pt' or '12pt'). + # 'pointsize': '10pt', + # Additional stuff for the LaTeX preamble. + # 'preamble': '', + # Latex figure (float) alignment + # 'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + (master_doc, "pydra.tex", "Pydra Documentation", author, "manual"), +] + +# Autodoc settings +autodoc_default_options = { + "undoc-members": True, + "show-inheritance": True, +} + +# The name of an image file (relative to this directory) to place at the top of +# the title page. +# latex_logo = None + +# For "manual" documents, if this is true, then toplevel headings are parts, +# not chapters. +# latex_use_parts = False + +# If true, show page references after internal links. +# latex_show_pagerefs = False + +# If true, show URL addresses after external links. +# latex_show_urls = False + +# Documents to append as an appendix to all manuals. +# latex_appendices = [] + +# If false, no module index is generated. +# latex_domain_indices = True + +# -- Options for manual page output --------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [(master_doc, "pydra", "Pydra Documentation", [author], 1)] + +# If true, show URL addresses after external links. +# man_show_urls = False + + +# -- Options for Texinfo output ------------------------------------------- + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + ( + master_doc, + "Pydra", + "Pydra Documentation", + author, + "Pydra", + "Archive-centered analysis of neuroimaging data", + "Miscellaneous", + ), +] + +# Documents to append as an appendix to all manuals. +# texinfo_appendices = [] + +# If false, no module index is generated. +# texinfo_domain_indices = True + +# How to display URL addresses: 'footnote', 'no', or 'inline'. +# texinfo_show_urls = 'footnote' + +# If true, do not generate a @detailmenu in the "Top" node's menu. +# texinfo_no_detailmenu = False + + +# -- Options for Epub output ---------------------------------------------- + +# Bibliographic Dublin Core info. +epub_title = project +epub_author = author +epub_publisher = author +epub_copyright = copyright + +# The basename for the epub file. It defaults to the project name. +# epub_basename = project + +# The HTML theme for the epub output. Since the default themes are not optimized +# for small screen space, using the same theme for HTML and epub output is +# usually not wise. This defaults to 'epub', a theme designed to save visual +# space. +# epub_theme = 'epub' + +# The language of the text. It defaults to the language option +# or 'en' if the language is not set. 
+# epub_language = '' + +# The scheme of the identifier. Typical schemes are ISBN or URL. +# epub_scheme = '' + +# The unique identifier of the text. This can be a ISBN number +# or the project homepage. +# epub_identifier = '' + +# A unique identification for the text. +# epub_uid = '' + +# A tuple containing the cover image and cover page html template filenames. +# epub_cover = () + +# A sequence of (type, uri, title) tuples for the guide element of content.opf. +# epub_guide = () + +# HTML files that should be inserted before the pages created by sphinx. +# The format is a list of tuples containing the path and title. +# epub_pre_files = [] + +# HTML files that should be inserted after the pages created by sphinx. +# The format is a list of tuples containing the path and title. +# epub_post_files = [] + +# A list of files that should not be packed into the epub file. +epub_exclude_files = ["search.html"] + +# The depth of the table of contents in toc.ncx. +# epub_tocdepth = 3 + +# Allow duplicate toc entries. +# epub_tocdup = True + +# Choose between 'default' and 'includehidden'. +# epub_tocscope = 'default' + +# Fix unsupported image types using the Pillow. +# epub_fix_images = False + +# Scale large images. +# epub_max_image_width = 0 + +# How to display URL addresses: 'footnote', 'no', or 'inline'. +# epub_show_urls = 'inline' + +# If false, no index is generated. +# epub_use_index = True + + +# Example configuration for intersphinx: refer to the Python standard library. +intersphinx_mapping = {"python": ("https://docs.python.org/", None)} + +numpydoc_show_class_members = False diff --git a/docs/source/examples/glm.ipynb b/docs/source/examples/glm.ipynb new file mode 100644 index 0000000000..aae6497b9a --- /dev/null +++ b/docs/source/examples/glm.ipynb @@ -0,0 +1,715 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "c8149a94", + "metadata": {}, + "source": [ + "# General Linear Model (GLM)" + ] + }, + { + "cell_type": "markdown", + "id": "b54b132a", + "metadata": {}, + "source": [ + "In this tutorial, which is adapted from the Nilearn docs, we will go through a simple workflow of the first level general linear modeling with a BIDS dataset from openneuro. This analysis is only performed on **one** subject.\n", + "\n", + "This tutorial is based on the [Nilearn GLM tutorial](https://nilearn.github.io/stable/auto_examples/04_glm_first_level/plot_bids_features.html#sphx-glr-auto-examples-04-glm-first-level-plot-bids-features-py)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9f514ffe", + "metadata": {}, + "outputs": [], + "source": [ + "import nest_asyncio\n", + "nest_asyncio.apply()" + ] + }, + { + "cell_type": "markdown", + "id": "8313a041", + "metadata": {}, + "source": [ + "## Preparation\n", + "\n", + "Import packages that will be used globally and set up output directory" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "72d1dfdd", + "metadata": {}, + "outputs": [], + "source": [ + "import warnings\n", + "import sys \n", + "if not sys.warnoptions:\n", + " warnings.simplefilter(\"ignore\")\n", + " \n", + "import os\n", + "import typing as ty\n", + "from pathlib import Path\n", + "\n", + "from pydra.compose import python, workflow\n", + "from pydra.engine.submitter import Submitter\n", + "from fileformats.generic import File, Directory\n", + "from fileformats.text import Csv\n", + "import pandas as pd\n", + "from scipy.stats import norm\n", + "\n", + "import nibabel as nib\n", + "# These functions were removed within nilearn, so this notebook needs to be rewritten\n", + "# to use the 'openneuro' module instead\n", + "# from nilearn.datasets import (\n", + "# fetch_openneuro_dataset_index,\n", + "# fetch_openneuro_dataset,\n", + "# select_from_index,\n", + "# )\n", + "from nilearn.interfaces.fsl import get_design_from_fslmat\n", + "from nilearn.glm.first_level import first_level_from_bids\n", + "from nilearn.reporting import get_clusters_table, make_glm_report\n", + "from nilearn.plotting import (\n", + " plot_glass_brain,\n", + " plot_img_comparison,\n", + " plot_contrast_matrix,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5716cb50", + "metadata": {}, + "outputs": [], + "source": [ + "# get current directory\n", + "pydra_tutorial_dir = os.path.dirname(os.getcwd())\n", + "\n", + "# set up output directory\n", + "workflow_dir = Path(pydra_tutorial_dir) / 'outputs'\n", + "workflow_out_dir = workflow_dir / '6_glm'\n", + "\n", + "# create the output directory if not exit\n", + "os.makedirs(workflow_out_dir, exist_ok=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1878928b", + "metadata": {}, + "outputs": [], + "source": [ + "workflow_out_dir" + ] + }, + { + "cell_type": "markdown", + "id": "6cafd6a1", + "metadata": {}, + "source": [ + "## Create tasks\n", + "\n", + "In this section, we converte major steps into tasks.\n", + "Each pydra task can have multiple python functions. We recommend to put those logically more related functions into the same task.\n", + "\n", + "It is very **important** to keep in mind what adjacent tasks of your current task will be.\n", + "1. Your previous task will decide your arguments in the current task\n", + "2. Your next task will be impacted by the returns in the current task" + ] + }, + { + "cell_type": "markdown", + "id": "823780ab", + "metadata": {}, + "source": [ + "### fetch openneuro BIDS dataset\n", + "\n", + "In this task, we do the following:\n", + "1. get openneuro dataset index\n", + "2. specify exclusion patterns and number of subjects\n", + "3. download the data we need\n", + "\n", + "\n", + "**Notes:** Here we still use `n_subjects` as an argument. Given that we will only analyze one subject, you can also remove this argument and specify `n_subjects =1` in `select_from_index`. If you do, do not forget to modify the argument in the workflow later." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e2ab134c", + "metadata": {}, + "outputs": [], + "source": [ + "@python.define(outputs=[\"data_dir\"])\n", + "def GetOpenneuroDataset(exclusion_patterns: list, n_subjects: int) -> str:\n", + " _, urls = fetch_openneuro_dataset_index()\n", + " urls = select_from_index(\n", + " urls, exclusion_filters=exclusion_patterns, n_subjects=n_subjects\n", + " )\n", + " data_dir, _ = fetch_openneuro_dataset(urls=urls)\n", + " return data_dir" + ] + }, + { + "cell_type": "markdown", + "id": "1b4899de", + "metadata": {}, + "source": [ + "### obtain FirstLevelModel objects automatically and fit arguments\n", + "\n", + "To get the first level model(s) we have to specify\n", + "1. the dataset directory\n", + "2. the task_label\n", + "3. the space_label\n", + "4. the folder with the desired derivatives (fMRIPrep)\n", + "\n", + "In our case, we only have one subject so we will only have one first level model.\n", + "Then, for this model, we will obtain\n", + "1. the list of run images\n", + "2. events\n", + "3. confound regressors\n", + "\n", + "Those are inferred from the confounds.tsv files available in the BIDS dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3c2710dc", + "metadata": {}, + "outputs": [], + "source": [ + "@python.define(outputs=[\"model\", \"imgs\", \"subject\"])\n", + "def GetInfoFromBids(\n", + " data_dir: Directory,\n", + " task_label: str,\n", + " space_label: str,\n", + " smoothing_fwhm: float,\n", + " derivatives_folder: Directory,\n", + ") -> ty.Tuple[ty.Any, list, str]:\n", + " (\n", + " models,\n", + " models_run_imgs,\n", + " models_events,\n", + " models_confounds,\n", + " ) = first_level_from_bids(\n", + " dataset_path=data_dir,\n", + " task_label=task_label,\n", + " space_label=space_label,\n", + " smoothing_fwhm=smoothing_fwhm,\n", + " derivatives_folder=derivatives_folder,\n", + " )\n", + " model, imgs, events, confounds = (\n", + " models[0],\n", + " models_run_imgs[0],\n", + " models_events[0],\n", + " models_confounds[0],\n", + " )\n", + " subject = 'sub-' + model.subject_label\n", + " return model, imgs, subject" + ] + }, + { + "cell_type": "markdown", + "id": "e5af99cb", + "metadata": {}, + "source": [ + "### Get design matrix\n", + "\n", + "This task does the following:\n", + "1. read the design matrix in `.mat`\n", + "2. rename the column\n", + "3. save the new design matrix as `.csv`\n", + "\n", + "**Think:** What if we don't save the new design matrix, but `return` it directly? In other words, we `return` a `pandas.DataFrame` instead of a `path`. What will happen? 
Worth a try :)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2bdfcfd9", + "metadata": {}, + "outputs": [], + "source": [ + "@python.define(outputs=[\"dm_path\"])\n", + "def GetDesignMatrix(data_dir: Directory, subject: str) -> Csv:\n", + " fsl_design_matrix_path = data_dir.joinpath(\n", + " 'derivatives',\n", + " 'task',\n", + " subject,\n", + " 'stopsignal.feat',\n", + " 'design.mat',\n", + " )\n", + " design_matrix = get_design_from_fslmat(\n", + " fsl_design_matrix_path, column_names=None\n", + " )\n", + "\n", + " design_columns = [\n", + " 'cond_%02d' % i for i in range(len(design_matrix.columns))\n", + " ]\n", + " design_columns[0] = 'Go'\n", + " design_columns[4] = 'StopSuccess'\n", + " design_matrix.columns = design_columns\n", + " dm_path = Path('designmatrix.csv')\n", + " design_matrix.to_csv(dm_path, index=None)\n", + " return dm_path" + ] + }, + { + "cell_type": "markdown", + "id": "e1cb37d0", + "metadata": {}, + "source": [ + "### Fit the first level model\n", + "\n", + "What we are doing here is:\n", + "1. use the design matrix to fit the first level model\n", + "2. compute the contrast\n", + "3. save the z_map and masker for further use\n", + "4. generate a glm report (HTML file)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "65cec504", + "metadata": {}, + "outputs": [], + "source": [ + "@python.define(outputs=[\"model\", \"z_map_path\", \"masker\", \"glm_report_file\"])\n", + "def ModelFit(model, imgs, dm_path, contrast: str) -> ty.Tuple[ty.Any, str, ty.Any, str]:\n", + " design_matrix = pd.read_csv(dm_path)\n", + " model.fit(imgs, design_matrices=[design_matrix])\n", + " z_map = model.compute_contrast(contrast)\n", + " z_map_path = Path('firstlevel_z_map.nii.gz')\n", + " z_map.to_filename(z_map_path)\n", + " masker_path = Path('firstlevel_masker.nii.gz')\n", + " masker = model.masker_\n", + " glm_report_file = Path('glm_report.html')\n", + " report = make_glm_report(model, contrast)\n", + " report.save_as_html(glm_report_file)\n", + " return model, z_map_path, masker, glm_report_file" + ] + }, + { + "cell_type": "markdown", + "id": "05576ba4", + "metadata": {}, + "source": [ + "### Get cluster table\n", + "\n", + "For publication purposes, we obtain a cluster table." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d4a86a6f", + "metadata": {}, + "outputs": [], + "source": [ + "@python.define(outputs=[\"output_file\"])\n", + "def ClusterTable(z_map_path: File) -> Csv:\n", + " stat_img = nib.load(z_map_path)\n", + " output_file = Path('cluster_table.csv')\n", + " df = get_clusters_table(\n", + " stat_img, stat_threshold=norm.isf(0.001), cluster_threshold=10\n", + " )\n", + " df.to_csv(output_file, index=None)\n", + " return output_file" + ] + }, + { + "cell_type": "markdown", + "id": "c1e8effd", + "metadata": {}, + "source": [ + "### Make plots\n", + "\n", + "Here we want to make some plots to display our results and compare the result from FSL.\n", + "1. plot nilearn z-map\n", + "2. plot fsl z-map\n", + "3. plot nilearn and fsl comparison\n", + "4. plot design matrix contrast\n", + "\n", + "You can also separate this task into multiple sub-tasks. But it makes more sense to put them into one task as they use the same files and function `nilearn.plotting` repeatedly." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c0f78107", + "metadata": {}, + "outputs": [], + "source": [ + "@python.define(outputs=[\"output_file1\", \"output_file2\", \"output_file3\", \"output_file4\"])\n", + "def Plots(\n", + " data_dir: Directory,\n", + " dm_path: File,\n", + " z_map_path: File,\n", + " contrast: str,\n", + " subject: str,\n", + " masker\n", + ") -> ty.Tuple[str, str, str, str]:\n", + " # plot and save nilearn z-map\n", + " z_map = nib.load(z_map_path)\n", + " output_file1 = Path('nilearn_z_map.jpg')\n", + " plot_glass_brain(\n", + " z_map,\n", + " output_file=output_file1,\n", + " colorbar=True,\n", + " threshold=norm.isf(0.001),\n", + " title='Nilearn Z map of \"StopSuccess - Go\" (unc p<0.001)',\n", + " plot_abs=False,\n", + " display_mode='ortho',\n", + " )\n", + "\n", + " # plot and save fsl z-map\n", + " fsl_z_map = nib.load(\n", + " os.path.join(\n", + " data_dir,\n", + " 'derivatives',\n", + " 'task',\n", + " subject,\n", + " 'stopsignal.feat',\n", + " 'stats',\n", + " 'zstat12.nii.gz',\n", + " )\n", + " )\n", + " output_file2 = Path('fsl_z_map.jpg')\n", + " plot_glass_brain(\n", + " fsl_z_map,\n", + " output_file=output_file2,\n", + " colorbar=True,\n", + " threshold=norm.isf(0.001),\n", + " title='FSL Z map of \"StopSuccess - Go\" (unc p<0.001)',\n", + " plot_abs=False,\n", + " display_mode='ortho',\n", + " )\n", + "\n", + " # plot and save nilearn and fsl comparison\n", + " plot_img_comparison(\n", + " [z_map],\n", + " [fsl_z_map],\n", + " masker,\n", + " output_dir=workflow_out_dir,\n", + " ref_label='Nilearn',\n", + " src_label='FSL',\n", + " )\n", + " old = Path('0000.png')\n", + " new = Path('nilearn_fsl_comp.jpg')\n", + " os.rename(old, new)\n", + " output_file3 = new\n", + " print(output_file3)\n", + "\n", + " # plot and save design matrix contrast\n", + " design_matrix = pd.read_csv(dm_path)\n", + " output_file4 = Path('firstlevel_contrast.jpg')\n", + " plot_contrast_matrix(contrast, design_matrix, output_file=output_file4)\n", + " return output_file1, output_file2, output_file3, output_file4" + ] + }, + { + "cell_type": "markdown", + "id": "12a99b96", + "metadata": {}, + "source": [ + "## Make a workflow from tasks\n", + "\n", + "Now we have created all tasks we need for this first level analysis, and there are two choices for our next step.\n", + "1. create one workflow to connect all tasks together\n", + "2. create sub-workflows with some closely related tasks, and connect these workflows along with other tasks into a larger workflow.\n", + "\n", + "We recommend the second approach as it is always a good practice to group tasks, especially when there are a large number of tasks in the analysis.\n", + "\n", + "Our analysis can be divided into three parts: (1) get/read the data, (2) analyze the data, and (3) plot the result, where (1) and (3) only have one task each. So we can put all tasks in (2) into one workflow and name it as `firstlevel` or whatever you prefer." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6e79e9b1", + "metadata": {}, + "outputs": [], + "source": [ + "@workflow.define(outputs=[\"z_map\", \"masker\", \"subject\", \"dm_path\", \"cluster_table\", \"glm_report\"])\n", + "def FirstLevelWorkflow(\n", + " data_dir: Directory,\n", + " contrast: str,\n", + " output_dir: Path,\n", + " task_label: str = 'stopsignal',\n", + " space_label: str = 'MNI152NLin2009cAsym',\n", + " derivatives_folder: str = 'derivatives/fmriprep',\n", + " smoothing_fwhm: float = 5.0,\n", + ") -> ty.Tuple[str, str, str, File, str, str]:\n", + "\n", + " # add task - get_info_from_bids\n", + " get_info_from_bids = workflow.add(\n", + " GetInfoFromBids(\n", + " data_dir=data_dir,\n", + " task_label=task_label,\n", + " space_label=space_label,\n", + " derivatives_folder=derivatives_folder,\n", + " smoothing_fwhm=smoothing_fwhm,\n", + " )\n", + " )\n", + " # add task - get_designmatrix\n", + " get_designmatrix = workflow.add(\n", + " GetDesignMatrix(\n", + " data_dir=data_dir,\n", + " subject=get_info_from_bids.subject,\n", + " )\n", + " )\n", + " l1estimation = workflow.add(\n", + " ModelFit(\n", + " model=get_info_from_bids.model,\n", + " imgs=get_info_from_bids.imgs,\n", + " dm_path=get_designmatrix.dm_path,\n", + " contrast=contrast,\n", + " )\n", + " )\n", + " # add task - cluster_table\n", + " cluster_table = workflow.add(\n", + " ClusterTable(\n", + " z_map_path=l1estimation.z_map_path,\n", + " )\n", + " )\n", + " # specify output\n", + " return (\n", + " l1estimation.z_map_path,\n", + " l1estimation.masker,\n", + " get_info_from_bids.subject,\n", + " get_designmatrix.dm_path,\n", + " cluster_table.output_file,\n", + " l1estimation.glm_report_file,\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "657690ea", + "metadata": {}, + "source": [ + "## The overaching workflow\n", + "\n", + "Connect other tasks and the above workflow into one\n", + "\n", + "Now we need to create the overaching glm workflow that connects the above workflow and other tasks (e.g., `get/read the data` and `plot the result`)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d055c5d0", + "metadata": {}, + "outputs": [], + "source": [ + "@workflow.define(outputs=[\"output1\", \"output2\", \"output3\", \"output4\"])\n", + "def FullWorkflow(\n", + " output_dir: Path,\n", + " n_subjects: int = 1,\n", + " contrast: str = 'StopSuccess - Go',\n", + " exclusion_patterns: list[str] | None = None,\n", + ") -> tuple[ty.Any, ty.Any, ty.Any, ty.Any]:\n", + " if exclusion_patterns is None:\n", + " exclusion_patterns = [\n", + " '*group*',\n", + " '*phenotype*',\n", + " '*mriqc*',\n", + " '*parameter_plots*',\n", + " '*physio_plots*',\n", + " '*space-fsaverage*',\n", + " '*space-T1w*',\n", + " '*dwi*',\n", + " '*beh*',\n", + " '*task-bart*',\n", + " '*task-rest*',\n", + " '*task-scap*',\n", + " '*task-task*',\n", + " ]\n", + "\n", + " get_openneuro_dataset = workflow.add(\n", + " GetOpenneuroDataset(\n", + " exclusion_patterns=exclusion_patterns,\n", + " n_subjects=n_subjects,\n", + " )\n", + " )\n", + "\n", + " wf_firstlevel = workflow.add(\n", + " FirstLevelWorkflow(\n", + " data_dir=get_openneuro_dataset.data_dir,\n", + " contrast=contrast,\n", + " output_dir=output_dir,\n", + " )\n", + " )\n", + "\n", + " plots = workflow.add(\n", + " Plots(\n", + " data_dir=get_openneuro_dataset.data_dir,\n", + " dm_path=wf_firstlevel.dm_path,\n", + " z_map_path=wf_firstlevel.z_map,\n", + " contrast=contrast,\n", + " subject=wf_firstlevel.subject,\n", + " 
masker=wf_firstlevel.masker,\n", + " )\n", + " )\n", + "\n", + " return (\n", + " plots.output_file1,\n", + " plots.output_file2,\n", + " plots.output_file3,\n", + " plots.output_file4,\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "1b2e9a46", + "metadata": {}, + "source": [ + "## Run Workflow Run" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9a90088e", + "metadata": { + "tags": [ + "hide-output" + ] + }, + "outputs": [], + "source": [ + "wf = FullWorkflow(output_dir=workflow_out_dir, n_subjects=1, contrast='StopSuccess - Go')\n", + "\n", + "if __name__ == \"__main__\":\n", + " with Submitter(worker='cf', n_procs=4) as sub:\n", + " results = sub(wf)\n", + "\n", + " print(results)" + ] + }, + { + "cell_type": "markdown", + "id": "f540cdd4", + "metadata": {}, + "source": [ + "## Visualization" + ] + }, + { + "cell_type": "markdown", + "id": "e8def869", + "metadata": {}, + "source": [ + "If you arrive here without any errors, yay, you just made your first pydra workflow for a first-level GLM!" + ] + }, + { + "cell_type": "markdown", + "id": "9b0585e3", + "metadata": {}, + "source": [ + "## Examine folder structure\n", + "\n", + "Let's take a look at what you have got." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "75c1cfc9", + "metadata": { + "tags": [ + "hide-output" + ] + }, + "outputs": [], + "source": [ + "! ls ../outputs/6_glm" + ] + }, + { + "cell_type": "markdown", + "id": "56aeee0c", + "metadata": {}, + "source": [ + "### Plot figures" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1f657571", + "metadata": { + "tags": [ + "hide-input" + ] + }, + "outputs": [], + "source": [ + "from IPython.display import Image\n", + "\n", + "\n", + "if not results.errored:\n", + " # First-level contrast\n", + " Image(filename='../outputs/6_glm/firstlevel_contrast.jpg')\n", + "\n", + " # Nilearn Z map\n", + " Image(filename='../outputs/6_glm/nilearn_z_map.jpg')\n", + "\n", + " # FSL Z map\n", + " Image(filename='../outputs/6_glm/fsl_z_map.jpg')\n", + "\n", + " # Nilearn and FSL comparison\n", + " Image(filename='../outputs/6_glm/nilearn_fsl_comp.jpg')" + ] + }, + { + "cell_type": "markdown", + "id": "081bf13a", + "metadata": {}, + "source": [ + "## Exercise" + ] + }, + { + "cell_type": "markdown", + "id": "a3d55272", + "metadata": {}, + "source": [ + "What if we need to run the first-level GLM on multiple subject? We will need the `splitter`.\n", + "\n", + "So, where should we add `.split`?" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "wf12", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/source/examples/t1w-preprocess.ipynb b/docs/source/examples/t1w-preprocess.ipynb new file mode 100644 index 0000000000..3c1271d26a --- /dev/null +++ b/docs/source/examples/t1w-preprocess.ipynb @@ -0,0 +1,46 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# T1w MRI preprocessing" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This is an real-world example of a workflow to pre-process T1-weighted MRI images for further analysis\n", + "\n", + "Work in progress..." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "wf12", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/source/explanation/conditional-lazy.rst b/docs/source/explanation/conditional-lazy.rst new file mode 100644 index 0000000000..85178a6653 --- /dev/null +++ b/docs/source/explanation/conditional-lazy.rst @@ -0,0 +1,37 @@ +Dynamic construction +==================== + +Pydra workflows are constructed dynamically by workflow "constructor" functions. These +functions can use any valid Python code, allowing rich and complex workflows to be +constructed based on the inputs to the workflow. For example, a workflow constructor +could include conditional branches, loops, or other control flow structures, to tailor +the workflow to the specific inputs provided. + + +Lazy fields +----------- + +Pydra workflows are constructed by the assignment of "lazy field" placeholders from +the outputs of upstream nodes to the inputs of downstream nodes. These placeholders, +which are instances of the :class:`pydra.engine.specs.LazyField` class, are replaced +by the actual values they represent when the workflow is run. + + +Caching of workflow construction +-------------------------------- + +Workflows are constructed just before they are executed to produce a Directed Acyclic Graph +(DAG) of nodes. Tasks are generated from these nodes as upstream inputs become available +and added to the execution stack. If the workflow has been split, either at the top-level, +in an upstream node or at the current node, then a separate task will be generated for +split. + + +Nested workflows and lazy conditionals +-------------------------------------- + +Since lazy fields are only evaluated at runtime, they can't be used in conditional +statements that construct the workflow. However, if there is a section of a workflow +that needs to be conditionally included or excluded based on upstream outputs, that +section can be implemented in a nested workflow and that upstream be connected to the +nested workflow. diff --git a/docs/index.rst b/docs/source/explanation/design-approach.rst similarity index 82% rename from docs/index.rst rename to docs/source/explanation/design-approach.rst index bd30e2e088..07c94226f1 100644 --- a/docs/index.rst +++ b/docs/source/explanation/design-approach.rst @@ -1,14 +1,9 @@ -.. Pydra: A simple dataflow engine with scalable semantics documentation master file, created by - sphinx-quickstart on Fri Jan 3 13:52:41 2020. - You can adapt this file completely to your liking, but it should at least - contain the root `toctree` directive. -Welcome to Pydra: A simple dataflow engine with scalable semantics's documentation! -=================================================================================== +Design philosophy +================= -Pydra is a new lightweight dataflow engine written in Python. -Pydra is developed as an open-source project in the neuroimaging community, -but it is designed as a general-purpose dataflow engine to support any scientific domain. +Rationale +--------- Scientific workflows often require sophisticated analyses that encompass a large collection of algorithms. 
@@ -23,6 +18,9 @@ Consistency, reproducibility and scalability demand scientific workflows to be organized into fully automated pipelines. This was the motivation behind Pydra - a new dataflow engine written in Python. +History +------- + The Pydra package is a part of the second generation of the Nipype_ ecosystem --- an open-source framework that provides a uniform interface to existing neuroimaging software and facilitates interaction between different software components. @@ -36,6 +34,9 @@ and is being developed with reproducibility, ease of use, and scalability in min Pydra itself is a standalone project and is designed as a general-purpose dataflow engine to support any scientific domain. +Goals +----- + The goal of Pydra is to provide a lightweight dataflow engine for computational graph construction, manipulation, and distributed execution, as well as ensuring reproducibility of scientific pipelines. In Pydra, a dataflow is represented as a directed acyclic graph, where each node represents a Python @@ -72,20 +73,3 @@ The combination of several key features makes Pydra a customizable and powerful .. _fMRIPrep: https://fmriprep.org/en/stable/ .. _C-PAC: https://fcp-indi.github.io/docs/latest/index .. _Map-Reduce: https://en.wikipedia.org/wiki/MapReduce - -.. toctree:: - :maxdepth: 2 - :caption: Contents: - - user_guide - changes - api - - - -Indices and tables -================== - -* :ref:`genindex` -* :ref:`modindex` -* :ref:`search` diff --git a/docs/source/explanation/environments.rst b/docs/source/explanation/environments.rst new file mode 100644 index 0000000000..9f3ad88e87 --- /dev/null +++ b/docs/source/explanation/environments.rst @@ -0,0 +1,33 @@ +Software environments +===================== + +Pydra supports running tasks within encapsulated software environments, such as Docker_ +and Singularity_ containers. This can be specified at runtime or during workflow +construction, and allows tasks to be run in environments that are isolated from the +host system, and that have specific software dependencies. + +The environment a task runs within is specified by the ``environment`` argument passed +to the execution call (e.g. ``my_task(worker="cf", environment="docker")``) or in the +``workflow.add()`` call in workflow constructors. + +Specifying at execution +----------------------- + +Work in progress... + + +Specifying at workflow construction +----------------------------------- + +Work in progress... + + + +Implementing new environment types +---------------------------------- + +Work in progress... + + +.. _Docker: https://www.docker.com/ +.. _Singularity: https://sylabs.io/singularity/ diff --git a/docs/source/explanation/hashing-caching.rst b/docs/source/explanation/hashing-caching.rst new file mode 100644 index 0000000000..3edbd434f5 --- /dev/null +++ b/docs/source/explanation/hashing-caching.rst @@ -0,0 +1,61 @@ +Caches and hashes +================= + +In Pydra, each task is run within its own working directory. If a task completes +successfully, their outputs are stored within this working directory. Working directories +are created within a cache directory, which is specified when the task is executed, and +named according to the hash of the task's inputs. This means that if the same task is +executed with the same inputs, the same working directory will be used, and instead of the task +being rerun, the outputs from the previous run will be reused. 
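As a minimal illustration of this reuse, the sketch below runs the same trivial task twice against the same cache location; the exact keyword for the cache location (assumed here to be ``cache_dir``) may differ between Pydra versions, so check the ``Submitter`` signature of the version you are using.

.. code-block:: python

    from pydra.compose import python
    from pydra.engine.submitter import Submitter


    @python.define(outputs=["out"])
    def Double(x: int) -> int:
        return 2 * x


    if __name__ == "__main__":
        # ``cache_dir`` is an assumed argument name used for illustration
        with Submitter(worker="cf", cache_dir="/tmp/pydra-cache") as sub:
            first = sub(Double(x=3))   # executed, outputs written under the cache
            second = sub(Double(x=3))  # same input hash: cached outputs are reused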
+ +In this manner, incomplete workflows can be resumed from where they left off, and completed +workflows can be rerun without having to rerun all of the tasks. This is particularly useful +when working with datasets that are to be analysed in several different ways with +common intermediate steps, or when debugging workflows that have failed part way through. + + +Hash calculations +----------------- + +Hashes are calculated for different types of objects in different ways. For example, the +hash of a string is simply the hash of the string itself, whereas the hash of a dictionary +is the hash of all the file names and contents within the directory. Implementations for +most common types are provided in the :mod:`pydra.utils.hash` module, but custom types +can be hashed by providing a custom ``bytes_repr`` function (see +:ref:`Registering custom bytes_repr functions`). + +A cache dictionary, is passed each ``bytes_repr`` call that maps an objects id (i.e. +as returned by the built-in ``id()`` function) to the hash, to avoid infinite recursions +in the case of circular references. + +The byte representation of each object is hashed using the BlakeB cryptographic algorithm, +and these hashes are then combined to create a hash of the entire inputs object. + + +File hash caching by mtime +-------------------------- + +To avoid having to recalculate the hash of large files between runs, file hashes themselves +are cached in a platform specific user directory. These hashes are stored within small +files named by yet another hash of the file-system path an mtime of the file. This means that +the contents of a file should only need to be hashed once unless it is modified. + +.. note:: + + Due to limitations in mtime resolution on different platforms (e.g. 1 second on Linux, + potentially 2 seconds on Windows), it is conceivable that a file could be modified, + hashed, and then modified again within resolution period, causing the hash to be + invalid. Therefore, cached hashes are only used once the mtime resolution period + has lapsed since it was last modified, and may be recalculated in some rare cases. + + +Registering custom bytes_repr functions +--------------------------------------- + +Work in progress... + + +Cache misses due to unstable hashes +----------------------------------- + +Work in progress... diff --git a/docs/state.rst b/docs/source/explanation/splitting-combining.rst similarity index 94% rename from docs/state.rst rename to docs/source/explanation/splitting-combining.rst index c99dadd00a..906a51443c 100644 --- a/docs/state.rst +++ b/docs/source/explanation/splitting-combining.rst @@ -1,5 +1,5 @@ -State and Nested Loops over Input -================================= +Splitting and combining +======================= One of the main goals of creating Pydra was to support flexible evaluation of a Task or a Workflow over combinations of input parameters. @@ -25,7 +25,8 @@ This is also represented in the diagram, where :math:`x=[1, 2, 3]` as an example nodes represent stateless copies of the original Task after splitting the input, (these are the runnables that are executed). -.. image:: images/nd_spl_1.png +.. figure:: ../_static/images/nd_spl_1.png + :figclass: h! :scale: 50 % Types of Splitter @@ -50,7 +51,7 @@ Python tuples and its operation is therefore represented by a parenthesis, ``()` where `S` represents the *splitter*, `x` and `y` are the input fields. This is also represented as a diagram: -.. figure:: images/nd_spl_4.png +.. 
figure:: ../_static/images/nd_spl_4.png
    :figclass: h!
    :scale: 80%
@@ -72,7 +73,7 @@ brackets, ``[]``:
 The *outer splitter* for a node with two input fields is schematically represented in the diagram:
 
-.. figure:: images/nd_spl_3.png
+.. figure:: ../_static/images/nd_spl_3.png
    :figclass: h!
    :scale: 80%
diff --git a/docs/source/explanation/typing.rst b/docs/source/explanation/typing.rst
new file mode 100644
index 0000000000..ca562bee06
--- /dev/null
+++ b/docs/source/explanation/typing.rst
@@ -0,0 +1,78 @@
+Typing and file-formats
+=======================
+
+Pydra implements strong(-ish) type-checking at workflow construction time so that some errors
+can be caught before workflows are run on potentially expensive computing resources.
+Input and output fields of tasks can be typed using Python annotations.
+Unlike how they are typically used, in Pydra these type annotations are not just for
+documentation and linting purposes, but are used to enforce the types of the inputs
+and outputs of tasks and workflows at workflow construction time and at runtime.
+
+.. note::
+
+   With the exception of fields containing file-system paths, which should be typed with
+   a FileFormats_ class, types don't need to be specified if not desired.
+
+File formats
+------------
+
+The FileFormats_ package provides a way to specify the format of a file, or set of
+files, via an extensible collection of file format classes. These classes can be
+used to specify the format of a file in a task input or output, and can be used
+to validate the format of a file at runtime.
+
+It is important to use a FileFormats_ type instead of a ``str`` or ``pathlib.Path``
+when defining a field that takes paths to file-system objects, because otherwise only
+the file path, not the file contents, will be used in the hash used to locate the cache
+(see :ref:`Caches and hashes`). However, in most cases, it is sufficient to use the
+generic ``fileformats.generic.File``, ``fileformats.generic.Directory``, or the even
+more generic ``fileformats.generic.FsObject`` or ``fileformats.generic.FileSet`` classes.
+
+The only case where it isn't sufficient to use generic classes is when there are
+implicit headers or sidecars assumed to be present adjacent to the primary file (e.g.
+a NIfTI file `my_nifti.nii` with an associated JSON sidecar file `my_nifti.json`).
+This is because the header/sidecar file(s) will not be included in the hash calculation
+by default, and may be omitted if the "file set" is copied into a different working
+directory. In such cases, a specific file format class, such as
+``fileformats.medimage.NiftiGzX``, should be used instead.
+
+Coercion
+--------
+
+Pydra will attempt to coerce inputs to the correct type if they are not already. For example,
+if a tuple is provided to a field that is typed as a list, Pydra will convert the tuple to a list
+before the task is run. By default, coercions will be automatically
+applied between the following types:
+
+* ty.Sequence → ty.Sequence
+* ty.Mapping → ty.Mapping
+* Path → os.PathLike
+* str → os.PathLike
+* os.PathLike → Path
+* os.PathLike → str
+* ty.Any → MultiInputObj
+* int → float
+* field.Integer → float
+* int → field.Decimal
+
+In addition to this, ``fileformats.fields.Singular`` objects (see FileFormats_)
+can be coerced to and from their primitive types, and NumPy ndarrays and primitive types
+can be coerced to and from Python sequences and built-in types, respectively.
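+
+As a short illustrative sketch (the ``Scale`` task below is hypothetical, defined only for
+this example), type annotations are enforced when a task is parameterised: values that can
+be coerced (e.g. ``int`` → ``float``) are accepted, whereas incompatible values are expected
+to raise a ``TypeError`` before anything is run:
+
+.. code-block:: python
+
+   from pydra.compose import python
+
+   @python.define
+   def Scale(value: float, factor: float) -> float:
+       """Multiply a value by a factor"""
+       return value * factor
+
+   scale = Scale(value=2, factor=3)  # OK: ints are coerced to floats
+
+   try:
+       Scale(value="two", factor=3)  # a str cannot be coerced to a float
+   except TypeError as e:
+       print(f"Type error caught: {e}")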
+ +Superclass auto-casting +----------------------- + +Pydra is designed so that strict and specific typing can be used, but is not +unnecessarily strict, if it proves too burdensome. Therefore, upstream fields that are +typed as super classes (or as ``typing.Any`` by default) of the task input they are +connected to will be automatically cast to the subclass when the task is run. +This allows workflows and tasks to be easily connected together +regardless of how specific typing is defined in the task definition. This includes +file format types, so a task that expects a ``fileformats.medimage.NiftiGz`` file can +be connected to a task that outputs a ``fileformats.generic.File`` file. +Therefore, the only cases where a typing error will be raised are when the upstream +field can't be cast or coerced to the downstream field, e.g. a ``fileformats.medimage.DicomSeries`` +cannot be cast to a ``fileformats.medimage.Nifti`` file. + + +.. _FileFormats: https://arcanaframework.github.io/fileformats diff --git a/docs/source/howto/create-task-package.ipynb b/docs/source/howto/create-task-package.ipynb new file mode 100644 index 0000000000..6b454fbae2 --- /dev/null +++ b/docs/source/howto/create-task-package.ipynb @@ -0,0 +1,25 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Create a task package\n", + "\n", + "Work in progress..." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/source/howto/port-from-nipype.ipynb b/docs/source/howto/port-from-nipype.ipynb new file mode 100644 index 0000000000..ba228e387c --- /dev/null +++ b/docs/source/howto/port-from-nipype.ipynb @@ -0,0 +1,25 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Port interfaces from Nipype\n", + "\n", + "Work in progress..." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/source/index.rst b/docs/source/index.rst new file mode 100644 index 0000000000..340f74cc73 --- /dev/null +++ b/docs/source/index.rst @@ -0,0 +1,183 @@ +.. _home: + +Pydra +===== + +Pydra is a lightweight dataflow engine written in Python. Although designed to succeed +Nipype_ in order to address the needs of the neuroimaging community, Pydra can be used +for analytics in any scientific domain. Pydra facilitates the design of reproducible, +scalable and robust workflows that can link diverse processing tasks implemented as +shell commands or Python functions. + +**Key features:** + +* Combine diverse tasks (`Python functions <./tutorial/3-python.html>`__ or `shell commands <./tutorial/4-shell.html>`__) into coherent, robust `workflows <./tutorial/5-workflow.html>`__ +* Dynamic workflow construction using Python code (see :ref:`Dynamic construction`) +* Concurrent execution on `choice of computing platform (e.g. workstation, SLURM, SGE, Dask, etc...) <./tutorial/3-advanced-execution.html#Workers>`__ +* Map-reduce-like semantics (see :ref:`Splitting and combining`) +* Global caching to reduce recomputation (see :ref:`Caches and hashes`) +* Tasks can be executed in separate software environments, e.g. 
containers (see :ref:`Software environments`) +* Strong type-checking, including file types, before execution (see :ref:`Typing and file-formats`) + +See :ref:`Design philosophy` for more details on the rationale behind Pydra's design. + + +Installation +------------ + +Pydra is implemented purely in Python and has a small number of dependencies +It is easy to install via pip for Python >= 3.11 (preferably within a +`virtual environment`_): + +.. code-block:: bash + + $ pip install pydra + +Pre-designed tasks are available under the `pydra.tasks.*` namespace. These tasks +are typically implemented within separate packages that are specific to a given +shell-command toolkit, such as FSL_ (*pydra-fsl*), AFNI_ (*pydra-afni*) or +ANTs_ (*pydra-ants*), or a collection of related tasks/workflows, such as Niworkflows +(*pydra-niworkflows*). Pip can be used to install these extension packages as well: + +.. code-block:: bash + + $ pip install pydra-fsl pydra-ants + +Of course, if you use Pydra to execute commands within non-Python toolkits, you will +need to either have those commands installed on the execution machine, or use containers +to run them (see :ref:`Software environments`). + + +Tutorials and notebooks +----------------------- + +The following tutorials provide a step-by-step guide to using Pydra. They can be +studied in any order, but it is recommended to start with :ref:`Getting started` and +step through the list from there. + +The tutorials are written in Jupyter notebooks, which can be downloaded and run locally +or run online using the |Binder| button within each tutorial. + +If you decide to download the notebooks and run locally, be sure to install the necessary +dependencies (ideally within a `virtual environment`_): + +.. code-block:: bash + + $ pip install -e /path/to/your/pydra[tutorial] + + +Execution +~~~~~~~~~ + +Learn how to execute existing tasks (including workflows) on different systems + +* :ref:`Getting started` +* :ref:`Advanced execution` +* :ref:`Troubleshooting` + +Design +~~~~~~ + +Learn how to design your own tasks, wrapped shell commands or Python functions, or +workflows, + +* :ref:`Python-tasks` +* :ref:`Shell-tasks` +* :ref:`Workflows` +* :ref:`Canonical task form` + +Examples +~~~~~~~~ + +The following comprehensive examples demonstrate how to use Pydra to build and execute +complex workflows + +* :ref:`T1w MRI preprocessing` +* :ref:`General Linear Model (GLM)` + +How-to Guides +------------- + +The following guides provide step-by-step instructions on how to + +* :ref:`Create a task package` +* :ref:`Port interfaces from Nipype` + +Reference +--------- + +See the full reference documentation for Pydra + +* :ref:`API` +* :ref:`genindex` +* :ref:`modindex` +* :ref:`Glossary` + + +.. toctree:: + :maxdepth: 2 + :caption: Tutorials: Execution + :hidden: + + tutorial/1-getting-started + tutorial/2-advanced-execution + tutorial/3-troubleshooting + +.. toctree:: + :maxdepth: 2 + :caption: Tutorials: Design + :hidden: + + tutorial/4-python + tutorial/5-shell + tutorial/6-workflow + tutorial/7-canonical-form + + +.. toctree:: + :maxdepth: 2 + :caption: Examples + :hidden: + + examples/t1w-preprocess + examples/glm + +.. toctree:: + :maxdepth: 2 + :caption: How-to + :hidden: + + howto/create-task-package + howto/port-from-nipype + +.. 
toctree:: + :maxdepth: 2 + :caption: Explanation + :hidden: + + explanation/design-approach + explanation/splitting-combining + explanation/conditional-lazy + explanation/environments + explanation/hashing-caching + explanation/typing + + +.. toctree:: + :maxdepth: 2 + :caption: Reference + :hidden: + + reference/api + genindex + modindex + reference/glossary + +.. _FSL: https://fsl.fmrib.ox.ac.uk/fsl/fslwiki/FSL +.. _ANTs: http://stnava.github.io/ANTs/ +.. _AFNI: https://afni.nimh.nih.gov/ +.. _niworkflows: https://niworkflows.readthedocs.io/en/latest/ +.. _Nipype: https://nipype.readthedocs.io/en/latest/ +.. _virtual environment: https://docs.python.org/3/library/venv.html +.. |Binder| image:: https://mybinder.org/badge_logo.svg + :target: https://mybinder.org/v2/gh/nipype/pydra/develop diff --git a/docs/source/reference/api.rst b/docs/source/reference/api.rst new file mode 100644 index 0000000000..662034e703 --- /dev/null +++ b/docs/source/reference/api.rst @@ -0,0 +1,34 @@ +API +=== + +Python tasks +------------ + +.. automodule:: pydra.compose.python + :members: + :undoc-members: + :show-inheritance: + +Shell tasks +----------- + +.. automodule:: pydra.compose.shell + :members: + :undoc-members: + :show-inheritance: + +Workflows +--------- + +.. automodule:: pydra.compose.workflow + :members: + :undoc-members: + :show-inheritance: + +Specification classes +--------------------- + +.. automodule:: pydra.engine.specs + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/reference/glossary.rst b/docs/source/reference/glossary.rst new file mode 100644 index 0000000000..e94985038f --- /dev/null +++ b/docs/source/reference/glossary.rst @@ -0,0 +1,93 @@ +Glossary +======== + +.. glossary:: + + Cache-root + The directory where cache directories for tasks to be executed are created. + Task cache directories are named within the cache root directory using a hash + of the task's parameters, so that the same task with the same parameters can be + reused. + + Combiner + A combiner is used to combine :ref:`State-array` values created by a split operation + defined by a :ref:`Splitter` on the current node, upstream workflow nodes or + stand-alone tasks. + + Container-ndim + The number of dimensions of the container object to be iterated over when using + a :ref:`Splitter` to split over an iterable value. For example, a list-of-lists + or a 2D array with `container_ndim=2` would be split over the elements of the + inner lists into a single 1-D state array. However, if `container_ndim=1`, + the outer list/2D would be split into a 1-D state array of lists/1D arrays. + + Environment + An environment refers to a specific software encapsulation, such as a Docker + or Singularity image, that is used to run a task. + + Field + A field is a parameter of a task, or a task outputs object, that can be set to + a specific value. Fields are specified to be of any types, including objects + and file-system objects. + + Hook + A hook is a user-defined function that is executed at a specific point in the task + execution process. Hooks can be used to prepare/finalise the task cache directory + or send notifications + + Job + A job is a discrete unit of work, a :ref:`Task`, with all inputs resolved + (i.e. not lazy-values or state-arrays) that has been assigned to a worker. + A task describes "what" is to be done and a submitter object describes + "how" it is to be done, a job combines both objects to describe a concrete unit + of processing. 
+
+   Lazy-fields
+      A lazy-field is a field that is not immediately resolved to a value. Instead,
+      it is a placeholder that will be resolved at runtime, allowing for dynamic
+      parameterisation of tasks.
+
+   Node
+      A single task within the context of a workflow, which is assigned a name and
+      references a state. Note that this task can be a nested workflow task.
+
+   Read-only-caches
+      A read-only cache is a cache root directory that was created by previous
+      Pydra runs, which is checked for matching task caches to be reused if present,
+      but is not written to or modified during the execution of a task.
+
+   State
+      The combination of all upstream splits and combines with any splitters and
+      combiners for a given node. It is used to track how many jobs, and their
+      parameterisations, need to be run for a given workflow node.
+
+   State-array
+      A state array is a collection of parameterised tasks or values that were generated
+      by a split operation either at the current or an upstream node of a workflow. The
+      size of the array is determined by the :ref:`State` of the workflow node.
+
+   Splitter
+      Defines how a task's inputs are to be split into multiple jobs. For example, if
+      a task's input takes an integer, a list of integers can be passed to it and split
+      over to create a :ref:`State-array` of jobs. Splitters can also be combined to
+      split over different combinations of multiple input fields
+      (see :ref:`Splitting and combining`).
+
+   Submitter
+      A submitter object parameterises how a task is to be executed, by defining the
+      worker, environment, cache-root directory and other key execution parameters to
+      be used when executing a task.
+
+   Task
+      A task describes a unit of work to be done (but not how it will be), either
+      standalone or as one step in a larger workflow. Tasks can be of various types,
+      including Python functions, shell commands, and nested workflows. Tasks are
+      parameterised, meaning they can accept inputs and produce outputs.
+
+   Worker
+      Encapsulation of a task execution environment. It is responsible for executing
+      tasks and managing their lifecycle. Workers can be local (e.g., a thread or
+      process) or remote (e.g., a high-performance cluster).
+
+   Workflow
+      A Directed-Acyclic-Graph (DAG) of parameterised tasks, to be executed in order.
+      Note that a Workflow object is created by a :class:`WorkflowTask`'s
+      `construct()` method at runtime and is not directly created by the end user.
diff --git a/docs/source/tutorial/1-getting-started.ipynb b/docs/source/tutorial/1-getting-started.ipynb
new file mode 100644
index 0000000000..d6534a493b
--- /dev/null
+++ b/docs/source/tutorial/1-getting-started.ipynb
@@ -0,0 +1,343 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Getting started\n",
+    "\n",
+    "The basic runnable component of Pydra is a *task*. Tasks are conceptually similar to\n",
+    "functions, in that they take inputs, operate on them and then return results. However,\n",
+    "unlike functions, tasks are parameterised before they are executed in a separate step.\n",
+    "This enables parameterised tasks to be linked together into workflows that are checked for\n",
+    "errors before they are executed, and modular execution workers and environments to be specified\n",
+    "independently of the task being performed.\n",
+    "\n",
+    "Tasks can encapsulate Python functions or shell-commands, or be multi-component workflows,\n",
+    "themselves constructed from task components including nested workflows.\n",
+    "\n",
+    "## Preparation\n",
+    "\n",
+    "Before we get started, let's set up some test data to play with. 
Here we create a sample\n", + "JSON file in a temporary directory" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "from tempfile import mkdtemp\n", + "from pprint import pprint\n", + "import json\n", + "\n", + "JSON_CONTENTS = {\"a\": True, \"b\": \"two\", \"c\": 3, \"d\": [7, 0.55, 6]}\n", + "\n", + "test_dir = Path(mkdtemp())\n", + "json_file = test_dir / \"test.json\"\n", + "with open(json_file, \"w\") as f:\n", + " json.dump(JSON_CONTENTS, f)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next we create a directory containing 10 randomly generated [NIfTI](https://nifti.nimh.nih.gov/) files" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from fileformats.medimage import Nifti1\n", + "\n", + "nifti_dir = test_dir / \"nifti\"\n", + "nifti_dir.mkdir()\n", + "\n", + "for i in range(10):\n", + " Nifti1.sample(nifti_dir, seed=i) # Create a dummy NIfTI file in the dest. directory" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that when you run concurrent processes within a Jupyter notebook the following snippet\n", + "is also required" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import nest_asyncio\n", + "\n", + "nest_asyncio.apply()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "## Running your first task\n", + "\n", + "Pydra allows you to install independent packages with pre-defined tasks (e.g., `pydra-fsl`, `pydra-ants`). The task from the packages are installed under the `pydra.tasks.*`. You always have access to `pydra.tasks.common`, in addition `pydra-mrtrix3.v3_0` was also installed for this tutorial. To use a pre-defined task\n", + "\n", + "* import the class from the `pydra.tasks.*` package it is in\n", + "* instantiate it with appropriate parameters\n", + "* \"call\" resulting object (i.e. `my_task(...)`) to execute it as you would a function \n", + "\n", + "To demonstrate with an example of loading a JSON file with the\n", + "`pydra.tasks.common.LoadJson` task, we first create an example JSON file to test with" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we can load the JSON contents back from the file using the `LoadJson` task\n", + "class" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Import the task\n", + "from pydra.tasks.common import LoadJson\n", + "\n", + "# Instantiate the task, providing the JSON file we want to load\n", + "load_json = LoadJson(file=json_file)\n", + "\n", + "# Run the task to load the JSON file\n", + "outputs = load_json()\n", + "\n", + "# Access the loaded JSON output contents and check they match original\n", + "assert outputs.out == JSON_CONTENTS" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Iterating over inputs\n", + "\n", + "It is straightforward to apply the same operation over a set of inputs using the `split()`\n", + "method. 
For example, if we wanted to re-grid all the NIfTI images stored in a directory,\n", + "such as the sample ones generated by the code below" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Then we can by importing the `MrGrid` shell-command task from the `pydra-mrtrix3` package\n", + "and run it over every NIfTI file in the directory using the `Task.split()` method" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pydra.tasks.mrtrix3.v3_0 import MrGrid\n", + "\n", + "# Instantiate the task, \"splitting\" over all NIfTI files in the test directory\n", + "# by splitting the \"input\" input field over all files in the directory\n", + "mrgrid = MrGrid(operation=\"regrid\", voxel=(0.5, 0.5, 0.5)).split(\n", + " in_file=nifti_dir.iterdir()\n", + ")\n", + "\n", + "# Run the task to resample all NIfTI files\n", + "outputs = mrgrid()\n", + "\n", + "# Print the locations of the output files\n", + "pprint(outputs.out_file)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It is also possible to iterate over inputs in pairs/n-tuples. For example, if you wanted to use\n", + "different voxel sizes for different images, both the list of images and the voxel sizes\n", + "are passed to the `split()` method and their combination is specified by a tuple \"splitter\"\n", + "\n", + "\n", + "Note that it is important to use a tuple not a list for the splitter definition in this\n", + "case, because a list splitter is interpreted as the split over each combination of inputs\n", + "(see [Splitting and combining](../explanation/splitting-combining.html) for more details\n", + "on splitters)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "mrgrid_varying_vox_sizes = MrGrid(operation=\"regrid\").split(\n", + " (\"in_file\", \"voxel\"),\n", + " in_file=nifti_dir.iterdir(),\n", + " # Define a list of voxel sizes to resample the NIfTI files to,\n", + " # the list must be the same length as the list of NIfTI files\n", + " voxel=[\n", + " (1.0, 1.0, 1.0),\n", + " (1.0, 1.0, 1.0),\n", + " (1.0, 1.0, 1.0),\n", + " (0.5, 0.5, 0.5),\n", + " (0.75, 0.75, 0.75),\n", + " (0.5, 0.5, 0.5),\n", + " (0.5, 0.5, 0.5),\n", + " (1.0, 1.0, 1.0),\n", + " (1.25, 1.25, 1.25),\n", + " (1.25, 1.25, 1.25),\n", + " ],\n", + ")\n", + "\n", + "outputs = mrgrid_varying_vox_sizes()\n", + "\n", + "pprint(outputs.out_file)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Executing tasks in parallel\n", + "\n", + "By default, Pydra will use the *debug* worker, which executes each task sequentially.\n", + "This makes it easier to debug tasks and workflows, however, in most cases, once a workflow\n", + "is tested, a concurrent worker is preferable so tasks can be executed in parallel\n", + "(see [Workers](./3-advanced-execution.html#Workers)). To use multiple processes on a\n", + "workstation, select the `cf` worker option when executing the task/workflow. Additional\n", + "keyword arguments, will be passed to the worker initialisation (e.g. `n_procs=4`).\n", + "\n", + "Note that when multiprocessing in Python on Windows and macOS (and good practice on Linux/POSIX\n", + "OSs for compatibility), you need to place a `if __name__ == \"__main__\"` block when\n", + "executing in top-level scripts to allow the script to be imported, but not executed,\n", + "by subprocesses." 
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "from pydra.tasks.mrtrix3.v3_0 import MrGrid\n",
+     "\n",
+     "if (\n",
+     "    __name__ == \"__main__\"\n",
+     "):  # <-- Add this block to allow the script to be imported by subprocesses\n",
+     "    mrgrid = MrGrid(operation=\"regrid\", voxel=(0.5, 0.5, 0.5)).split(\n",
+     "        in_file=nifti_dir.iterdir()\n",
+     "    )\n",
+     "    outputs = mrgrid(worker=\"cf\", n_procs=4)  # <-- Select the \"cf\" worker here\n",
+     "    print(\"\\n\".join(str(p) for p in outputs.out_file))"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "metadata": {},
+    "source": [
+     "## File-system locations\n",
+     "\n",
+     "Output and intermediate files are typically generated during the course of a workflow/task run.\n",
+     "In addition to this, Pydra generates a cache directory for each task, in which\n",
+     "the task, results and any errors are stored in [cloudpickle](https://github.com/cloudpipe/cloudpickle)\n",
+     "files for future reference (see [Troubleshooting](./troubleshooting.html)).\n",
+     "By default, these cache directories are stored in a platform-specific application-cache\n",
+     "directory\n",
+     "\n",
+     "* Windows: `C:\Users\\AppData\Local\pydra\\run-cache`\n",
+     "* Linux: `/home//.cache/pydra//run-cache`\n",
+     "* macOS: `/Users//Library/Caches/pydra//run-cache`\n",
+     "\n",
+     "When a task runs, a unique hash is generated by the combination of all the inputs to the\n",
+     "task and the operation to be performed. This hash is used to name the task cache directory\n",
+     "within the specified cache root. Therefore, if you use the same cache\n",
+     "root and in a subsequent run the same task is executed with the same\n",
+     "inputs, then the path of its cache directory will be the same, and if Pydra finds\n",
+     "existing results at that path, then the outputs generated by the previous run will be\n",
+     "reused.\n",
+     "\n",
+     "This cache will grow as more runs are executed, therefore care needs to be taken to ensure\n",
+     "there is enough space on the target disk. To specify\n",
+     "a different location for this cache, simply provide the `cache_root` keyword argument to the execution call"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "outputs = mrgrid(cache_root=Path(\"~/pydra-cache\").expanduser())\n",
+     "\n",
+     "pprint(outputs)"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "metadata": {},
+    "source": [
+     "To reuse results from alternative cache roots, while storing any newly generated task cache dirs in the \n",
+     "specified cache root, the `readonly_caches` keyword argument can be used"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "from pydra.utils.general import default_run_cache_root\n",
+     "\n",
+     "my_cache_root = Path(\"~/new-pydra-cache\").expanduser()\n",
+     "my_cache_root.mkdir(exist_ok=True)\n",
+     "\n",
+     "outputs = mrgrid(cache_root=my_cache_root, readonly_caches=[default_run_cache_root])\n",
+     "\n",
+     "print(outputs)"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "metadata": {},
+    "source": []
+   }
+  ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "wf13",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.13.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git 
a/docs/source/tutorial/2-advanced-execution.ipynb b/docs/source/tutorial/2-advanced-execution.ipynb new file mode 100644 index 0000000000..3afd9774d8 --- /dev/null +++ b/docs/source/tutorial/2-advanced-execution.ipynb @@ -0,0 +1,437 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Advanced execution\n", + "\n", + "One of the key design features of Pydra is the separation between the parameterisation of\n", + "the task to be executed, and the parameresiation of where and how the task should be\n", + "executed (e.g. on the cloud, on a HPC cluster, ...). This tutorial steps you through\n", + "some of the available options for executing a task.\n", + "\n", + "[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/nipype/pydra-tutorial/develop/notebooks/tutorial/advanced_execution.ipynb)\n", + "\n", + "Remember that before attempting to run multi-process code in Jupyter notebooks, the\n", + "following snippet must be called" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import nest_asyncio\n", + "\n", + "nest_asyncio.apply()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Submitter\n", + "\n", + "If you want to access a richer `Result` object you can use a Submitter object to initiate\n", + "the task execution. For example, using the `TenToThePower` task from the testing package" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pydra.engine.submitter import Submitter\n", + "from pydra.tasks.testing import TenToThePower\n", + "\n", + "\n", + "ten_to_the_power = TenToThePower(p=3)\n", + "\n", + "with Submitter() as submitter:\n", + " result = submitter(ten_to_the_power)\n", + "\n", + "print(result)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `Result` object contains\n", + "\n", + "* `output`: the outputs of the task (if there is only one output it is called `out` by default)\n", + "* `runtime`: information about the peak memory and CPU usage\n", + "* `errored`: the error status of the task\n", + "* `task`: the task object that generated the results\n", + "* `cache_dir`: the output directory the results are stored in" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Workers\n", + "\n", + "Pydra supports several workers with which to execute tasks\n", + "\n", + "- `debug` (default)\n", + "- `cf`\n", + "- `slurm`\n", + "- `sge`\n", + "- `psij`\n", + "- `dask` (experimental)\n", + "\n", + "By default, the *debug* worker is used, which runs tasks serially in a single process\n", + "without use of the `asyncio` module. 
This makes it easier to debug errors in workflows\n", + "and python tasks, however, when using in Pydra in production you will typically want to\n", + "parallelise the execution for efficiency.\n", + "\n", + "If running on a local workstation, then the `cf` (*ConcurrentFutures*) worker is a good\n", + "option because it is able to spread the tasks to be run over multiple processes and\n", + "maximise CPU usage.\n", + "\n", + "If you have access to a high-performance cluster (HPC) then\n", + "the [SLURM](https://slurm.schedmd.com/documentation.html) and\n", + "[SGE](https://www.metagenomics.wiki/tools/hpc-sge) and [PSI/J](https://exaworks.org/psij)\n", + "workers can be used to submit each workflow node as separate jobs to the HPC scheduler.\n", + "There is also an experimental [Dask](https://www.dask.org/) worker, which provides a\n", + "range of execution backends to choose from.\n", + "\n", + "To specify a worker, the abbreviation can be passed either as a string or using the\n", + "class itself. Additional parameters can be passed to the worker initialisation as keyword\n", + "arguments to the execution call. For example, if we wanted to run five tasks using the\n", + "ConcurentFutures worker but only use three CPUs, we can pass `n_procs=3` to the execution\n", + "call.\n", + "\n", + "Remember that when calling multi-process code in a top level script the call must be\n", + "enclosed within a `if __name__ == \"__main__\"` block to allow the worker processes to\n", + "import the module without re-executing it." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import tempfile\n", + "\n", + "cache_root = tempfile.mkdtemp()\n", + "\n", + "if __name__ == \"__main__\":\n", + "\n", + " ten_to_the_power = TenToThePower().split(p=[1, 2, 3, 4, 5])\n", + "\n", + " # Run the 5 tasks in parallel split across 3 processes\n", + " outputs = ten_to_the_power(worker=\"cf\", n_procs=3, cache_root=cache_root)\n", + "\n", + " p1, p2, p3, p4, p5 = outputs.out\n", + "\n", + " print(f\"10^5 = {p5}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Alternatively, the worker object can be initialised in the calling code and passed directly to the execution call" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pydra.workers import cf\n", + "\n", + "ten_to_the_power = TenToThePower().split(p=[6, 7, 8, 9, 10])\n", + "\n", + "# Run the 5 tasks in parallel split across 3 processes\n", + "outputs = ten_to_the_power(worker=cf.Worker(n_procs=3))\n", + "\n", + "p6, p7, p8, p9, p10 = outputs.out\n", + "\n", + "print(f\"10^10 = {p10}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Reusing previously generated results\n", + "\n", + "Pydra caches all task results in the runtime cache (see [File-system locations](./1-getting-started.html##File-system-locations))\n", + "as long as exactly the hashes of the inputs provided to the task are the same. Here we\n", + "go through some of the practicalities of this caching and hashing (see\n", + "[Caches and hashes](../explanation/hashing-caching.html) for more details and issues\n", + "to consider)." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If we attempt to run the same task with the same parameterisation the cache directory\n", + "will point to the same location and the results will be reused" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from copy import copy\n", + "from pathlib import Path\n", + "import tempfile\n", + "from pprint import pprint\n", + "from fileformats.medimage import Nifti1\n", + "from pydra.engine.submitter import Submitter\n", + "from pydra.tasks.mrtrix3.v3_0 import MrGrid\n", + "\n", + "# Make a temporary directory\n", + "test_dir = Path(tempfile.mkdtemp())\n", + "nifti_dir = test_dir / \"nifti\"\n", + "nifti_dir.mkdir()\n", + "\n", + "# Generate some random NIfTI files to work with\n", + "nifti_files = [Nifti1.sample(nifti_dir, seed=i) for i in range(10)]\n", + "\n", + "VOX_SIZES = [\n", + " (0.5, 0.5, 0.5),\n", + " (0.25, 0.25, 0.25),\n", + " (0.1, 0.1, 0.1),\n", + " (0.35, 0.35, 0.35),\n", + " (0.1, 0.1, 0.1),\n", + " (0.5, 0.5, 0.5),\n", + " (0.25, 0.25, 0.25),\n", + " (0.2, 0.2, 0.2),\n", + " (0.35, 0.35, 0.35),\n", + " (0.1, 0.1, 0.1),\n", + "]\n", + "\n", + "mrgrid_varying_vox = MrGrid(operation=\"regrid\").split(\n", + " (\"in_file\", \"voxel\"),\n", + " in_file=nifti_files,\n", + " voxel=VOX_SIZES,\n", + ")\n", + "\n", + "submitter = Submitter(cache_root=test_dir / \"cache\")\n", + "\n", + "\n", + "with submitter:\n", + " result1 = submitter(mrgrid_varying_vox)\n", + "\n", + "\n", + "mrgrid_varying_vox2 = MrGrid(operation=\"regrid\").split(\n", + " (\"in_file\", \"voxel\"),\n", + " in_file=nifti_files,\n", + " voxel=copy(VOX_SIZES),\n", + ")\n", + "\n", + "# Result from previous run is reused as the task and inputs are identical\n", + "with submitter:\n", + " result2 = submitter(mrgrid_varying_vox2)\n", + "\n", + "# Check that the output directory is the same for both runs\n", + "assert result2.cache_dir == result1.cache_dir\n", + "\n", + "# Change the voxel sizes to resample the NIfTI files to for one of the files\n", + "mrgrid_varying_vox2.voxel[2] = [0.25]\n", + "\n", + "# Result from previous run is reused as the task and inputs are identical\n", + "with submitter:\n", + " result3 = submitter(mrgrid_varying_vox2)\n", + "\n", + "# The output directory will be different as the inputs are now different\n", + "assert result3.cache_dir != result1.cache_dir" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that for file objects, the contents of the files are used to calculate the hash\n", + "not their paths. Therefore, when inputting large files there might be some additional\n", + "overhead on the first run (the file hashes themselves are cached by path and mtime so\n", + "shouldn't need to be recalculated unless they are modified). However, this makes the\n", + "hashes invariant to file-system movement. For example, changing the name of one of the\n", + "files in the nifti directory won't invalidate the hash." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Rename a NIfTI file within the test directory\n", + "nifti_files[0] = Nifti1(\n", + " nifti_files[0].fspath.rename(nifti_files[0].fspath.with_name(\"first.nii\"))\n", + ")\n", + "\n", + "mrgrid_varying_vox3 = MrGrid(operation=\"regrid\").split(\n", + " (\"in_file\", \"voxel\"),\n", + " in_file=nifti_files,\n", + " voxel=VOX_SIZES,\n", + ")\n", + "\n", + "# Result from previous run is reused as contents of the files have not changed, despite\n", + "# the file names changing\n", + "with submitter:\n", + " result4 = submitter(mrgrid_varying_vox3)\n", + "\n", + "assert result4.cache_dir == result1.cache_dir\n", + "\n", + "# Replace the first NIfTI file with a new file\n", + "nifti_files[0] = Nifti1.sample(nifti_dir, seed=100)\n", + "\n", + "# Update the in_file input field to include the new file\n", + "mrgrid_varying_vox4 = MrGrid(operation=\"regrid\").split(\n", + " (\"in_file\", \"voxel\"),\n", + " in_file=nifti_files,\n", + " voxel=VOX_SIZES,\n", + ")\n", + "\n", + "# The results from the previous runs are ignored as the files have changed\n", + "with submitter:\n", + " result4 = submitter(mrgrid_varying_vox4)\n", + "\n", + "# The cache directory for the new run is different\n", + "assert result4.cache_dir != result1.cache_dir" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Environments and hooks\n", + "\n", + "For shell tasks, it is possible to specify that the command runs within a specific\n", + "software environment, such as those provided by software containers (e.g. Docker or Singularity/Apptainer).\n", + "This is down by providing the environment to the submitter/execution call," + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import tempfile\n", + "from pydra.tasks.mrtrix3.v3_0 import MrGrid\n", + "from pydra.environments import docker\n", + "\n", + "test_dir = tempfile.mkdtemp()\n", + "\n", + "nifti_file = Nifti1.sample(test_dir, seed=0)\n", + "\n", + "# Instantiate the task, \"splitting\" over all NIfTI files in the test directory\n", + "# by splitting the \"input\" input field over all files in the directory\n", + "mrgrid = MrGrid(in_file=nifti_file, operation=\"regrid\", voxel=(0.5, 0.5, 0.5))\n", + "\n", + "# Run the task to resample all NIfTI files\n", + "outputs = mrgrid(environment=docker.Environment(image=\"mrtrix3/mrtrix3\", tag=\"latest\"))\n", + "\n", + "# Print the locations of the output files\n", + "pprint(outputs.out_file)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Of course for this to work Docker needs to work and be configured for\n", + "[sudo-less execution](https://docs.docker.com/engine/install/linux-postinstall/).\n", + "See [Containers and Environments](../explanation/environments.rst) for more details on\n", + "how to utilise containers and add support for other software environments.\n", + "\n", + "It is also possible to specify functions to run at hooks that are immediately before and after\n", + "the task is executed by passing a `pydra.engine.spec.TaskHooks` object to the `hooks`\n", + "keyword arg. The callable should take the `pydra.engine.core.Job` object as its only\n", + "argument and return None. 
The available hooks to attach functions are:\n", + "\n", + "* pre_run: before the task cache directory is created\n", + "* pre_run_task: after the cache directory has been created and the inputs resolved but before the task is executed\n", + "* post_run_task: after the task has been run and the outputs collected\n", + "* post_run: after the cache directory has been finalised\n", + "\n", + "\n", + "QUESTION: What are these hooks intended for? Should the post_run_task hook be run before the outputs have been\n", + "collected?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pydra.engine.job import Job\n", + "from pydra.engine.hooks import TaskHooks\n", + "from pydra.engine.result import Result\n", + "import os\n", + "import platform\n", + "\n", + "\n", + "def notify_task_completion(task: Job, result: Result):\n", + " # Print a message to the terminal\n", + " print(f\"Job completed! Results are stored in {str(task.cache_dir)!r}\")\n", + "\n", + " # Platform-specific notifications\n", + " if platform.system() == \"Darwin\": # macOS\n", + " os.system(\n", + " 'osascript -e \\'display notification \"Job has completed successfully!\" '\n", + " 'with title \"Job Notification\"\\''\n", + " )\n", + " elif platform.system() == \"Linux\": # Linux\n", + " os.system('notify-send \"Job Notification\" \"Job has completed successfully!\"')\n", + " elif platform.system() == \"Windows\": # Windows\n", + " os.system('msg * \"Job has completed successfully!\"')\n", + "\n", + "\n", + "# Run the task to resample all NIfTI files\n", + "outputs = mrgrid(\n", + " hooks=TaskHooks(post_run=notify_task_completion), cache_root=tempfile.mkdtemp()\n", + ")\n", + "\n", + "# Print the locations of the output files\n", + "pprint(outputs.out_file)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Provenance and auditing\n", + "\n", + "Work in progress..." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "wf13", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/source/tutorial/3-troubleshooting.ipynb b/docs/source/tutorial/3-troubleshooting.ipynb new file mode 100644 index 0000000000..126280e349 --- /dev/null +++ b/docs/source/tutorial/3-troubleshooting.ipynb @@ -0,0 +1,263 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Troubleshooting\n", + "\n", + "This tutorial steps through tecnhiques to identify errors and pipeline failures, as well\n", + "as avoid common pitfalls setting up executing over multiple processes." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "## Things to check if Pydra gets stuck\n", + "\n", + "I There are a number of common gotchas, related to running multi-process code, that can\n", + "cause Pydra workflows to get stuck and not execute correctly. If using the concurrent\n", + "futures worker (e.g. 
`worker=\"cf\"`), check these issues first before filing a bug report\n", + "or reaching out for help.\n", + "\n", + "### Applying `nest_asyncio` when running within a notebook\n", + "\n", + "When using the concurrent futures worker within a Jupyter notebook you need to apply\n", + "`nest_asyncio` with the following lines" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# This is needed to run parallel workflows in Jupyter notebooks\n", + "import nest_asyncio\n", + "\n", + "nest_asyncio.apply()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Enclosing multi-process code within `if __name__ == \"__main__\"`\n", + "\n", + "When running multi-process Python code on macOS or Windows, as is the case when the \n", + "concurrent futures worker is selected (i.e. `worker=\"cf\"`), then scripts that execute\n", + "the forking code need to be enclosed within an `if __name__ == \"__main__\"` block, e.g." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pydra.tasks.testing import UnsafeDivisionWorkflow\n", + "from pydra.engine.submitter import Submitter\n", + "\n", + "# This workflow will fail because we are trying to divide by 0\n", + "wf = UnsafeDivisionWorkflow(a=10, b=5, denominator=2)\n", + "\n", + "if __name__ == \"__main__\":\n", + " with Submitter(worker=\"cf\") as sub:\n", + " result = sub(wf)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This allows the secondary processes to import the script without executing it. Without\n", + "such a block Pydra will lock up and not process the workflow. On Linux this is not an\n", + "issue due to the way that processes are forked, but is good practice in any case for\n", + "code portability." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Removing stray lockfiles\n", + "\n", + "When a Pydra task is executed, a lockfile is generated to signify that the task is running.\n", + "Other processes will wait for this lock to be released before attempting to access the\n", + "tasks results. The lockfiles are automatically deleted after a task completes, either\n", + "successfully or with an error, within a *try/finally* block so should run most of the time.\n", + "However, if a task/workflow is terminated by an interactive\n", + "debugger, the finally block may not be executed, leaving stray lockfiles. This\n", + "can cause the Pydra to hang waiting for the lock to be released. If you suspect this to be\n", + "an issue, and there are no other jobs running, then simply remove all lock files from your\n", + "cache directory (e.g. `rm /*.lock`) and re-submit your job.\n", + "\n", + "If the `clean_stale_locks` flag is set (by default when using the *debug* worker), locks that\n", + "were created before the outer task was submitted are removed before the task is run.\n", + "However, since these locks could be created by separate submission processes, `clean_stale_locks`\n", + "is not switched on by default when using production workers (e.g. `cf`, `slurm`, etc...)." 
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "metadata": {},
+    "source": [
+     "## Inspecting errors\n",
+     "\n",
+     "### Running in *debug* mode\n",
+     "\n",
+     "By default, Pydra will run with the *debug* worker, which executes each task serially\n",
+     "within a single process without use of `async/await` blocks, to allow raised exceptions\n",
+     "to propagate gracefully to the calling code. If you are having trouble with a pipeline,\n",
+     "ensure that `worker=debug` is passed to the submission/execution call (the default).\n",
+     "\n",
+     "### Reading error files\n",
+     "\n",
+     "When a task raises an error, it is captured and saved in a pickle file named `_error.pklz`\n",
+     "within the task's cache directory. For example, when calling the toy `UnsafeDivisionWorkflow`\n",
+     "with `denominator=0`, the task will fail."
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# This workflow will fail because we are trying to divide by 0\n",
+     "wf = UnsafeDivisionWorkflow(a=10, b=5).split(denominator=[3, 2, 0])\n",
+     "\n",
+     "if __name__ == \"__main__\":\n",
+     "    try:\n",
+     "        with Submitter(worker=\"cf\") as sub:\n",
+     "            result = sub(wf)\n",
+     "    except Exception as e:\n",
+     "        print(e)"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "metadata": {},
+    "source": [
+     "The error pickle files can be loaded using the `cloudpickle` library, noting that it is\n",
+     "important to load the files with the same Python version that was used to run the Pydra\n",
+     "workflow"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "from pydra.utils.general import default_run_cache_root\n",
+     "import cloudpickle as cp\n",
+     "from pprint import pprint\n",
+     "from pydra.tasks.testing import Divide\n",
+     "\n",
+     "with open(\n",
+     "    default_run_cache_root / Divide(x=15, y=0)._checksum / \"_error.pklz\", \"rb\"\n",
+     ") as f:\n",
+     "    error = cp.load(f)\n",
+     "\n",
+     "pprint(error)"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "metadata": {},
+    "source": [
+     "## Tracing upstream issues\n",
+     "\n",
+     "Failures are common in scientific analysis, even for well-tested workflows, due to\n",
+     "the novel nature of scientific experiments and the known artefacts that can occur.\n",
+     "Therefore, it is always worth sanity-checking results produced by workflows. When a problem\n",
+     "occurs in a multi-stage workflow it can be difficult to identify at which stage the\n",
+     "issue occurred.\n",
+     "\n",
+     "Currently in Pydra you need to step backwards through the tasks of the workflow, load\n",
+     "the saved task object and inspect its inputs to find the preceding nodes. If any of the\n",
+     "inputs that have been generated by previous nodes are not as expected, then you should check the\n",
+     "tasks that generated them in turn. For file-based inputs, you should be able to find\n",
+     "the path of the preceding task's cache directory from the provided file path. However,\n",
+     "for non-file inputs you may need to exhaustively iterate through all the task dirs\n",
+     "in your cache root to find the issue.\n",
+     "\n",
+     "For example, in the following workflow, if a divide-by-0 occurs within the division\n",
+     "node of the workflow, then a `float('inf')` will be returned, which will then propagate\n",
+     "through the workflow."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pydra.engine.submitter import Submitter\n", + "from pydra.tasks.testing import SafeDivisionWorkflow\n", + "\n", + "wf = SafeDivisionWorkflow(a=10, b=5).split(denominator=[3, 2, 0])\n", + "\n", + "if __name__ == \"__main__\":\n", + " with Submitter(worker=\"cf\") as sub:\n", + " result = sub(wf)\n", + "\n", + "print(f\"Workflow completed successfully, results saved in: {result.cache_dir}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To find the task directory where the issue first surfaced, iterate through every task\n", + "cache dir and check the results for `float(\"inf\")`s" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import cloudpickle as cp\n", + "from pydra.utils.general import user_cache_root\n", + "\n", + "run_cache = user_cache_root / \"run-cache\"\n", + "\n", + "for task_cache_root in run_cache.iterdir():\n", + " with open(task_cache_root / \"_result.pklz\", \"rb\") as f:\n", + " result = cp.load(f)\n", + " if result.outputs is not None:\n", + " for field_name in result.outputs:\n", + " if result.outputs[field_name] == float(\"nan\"):\n", + " print(\n", + " f\"Job {task_cache_root.name!r} produced a NaN value for {field_name!r}\"\n", + " )" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "wf13", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/source/tutorial/4-python.ipynb b/docs/source/tutorial/4-python.ipynb new file mode 100644 index 0000000000..550d78393c --- /dev/null +++ b/docs/source/tutorial/4-python.ipynb @@ -0,0 +1,262 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Python-tasks\n", + "\n", + "Python tasks are Python functions that are parameterised in a separate step before\n", + "they are executed or added to a workflow.\n", + "\n", + "## Define decorator\n", + "\n", + "The simplest way to define a Python task is to decorate a function with `pydra.compose.python.define`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pydra.compose import python\n", + "\n", + "\n", + "# Note that we use PascalCase because the object returned by the decorator is actually a class\n", + "@python.define\n", + "def MyFirstTask(a, b):\n", + " \"\"\"Sample function for testing\"\"\"\n", + " return a + b" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The resulting task-definition class can be then parameterized (instantiated), and\n", + "executed" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Instantiate the task, setting all parameters\n", + "my_first_task = MyFirstTask(a=1, b=2.0)\n", + "\n", + "# Execute the task\n", + "outputs = my_first_task()\n", + "\n", + "print(outputs.out)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "By default, the name of the output field for a function with only one output is `out`. 
To\n", + "name this something else, or in the case where there are multiple output fields, the `outputs`\n", + "argument can be provided to `python.define`\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "@python.define(outputs=[\"c\", \"d\"])\n", + "def NamedOutputTask(a, b):\n", + " \"\"\"Sample function for testing\"\"\"\n", + " return a + b, a - b\n", + "\n", + "\n", + "named_output_task = NamedOutputTask(a=2, b=1)\n", + "\n", + "outputs = named_output_task()\n", + "\n", + "print(outputs)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The input and output field attributes automatically extracted from the function, explicit\n", + "attributes can be augmented" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "@python.define(\n", + " inputs={\"a\": python.arg(allowed_values=[1, 2, 3]), \"b\": python.arg(default=10.0)},\n", + " outputs={\n", + " \"c\": python.out(type=float, help=\"the sum of the inputs\"),\n", + " \"d\": python.out(type=float, help=\"the difference of the inputs\"),\n", + " },\n", + ")\n", + "def AugmentedTask(a, b):\n", + " \"\"\"Sample function for testing\"\"\"\n", + " return a + b, a - b" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Type annotations\n", + "\n", + "If provided, type annotations are included in the task, and are checked at\n", + "the time of parameterisation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pydra.compose import python\n", + "\n", + "\n", + "@python.define\n", + "def MyTypedTask(a: int, b: float) -> float:\n", + " \"\"\"Sample function for testing\"\"\"\n", + " return a + b\n", + "\n", + "\n", + "try:\n", + " # 1.5 is not an integer so this should raise a TypeError\n", + " my_typed_task = MyTypedTask(a=1.5, b=2.0)\n", + "except TypeError as e:\n", + " print(f\"Type error caught: {e}\")\n", + "else:\n", + " assert False, \"Expected a TypeError\"\n", + "\n", + "# While 2 is an integer, it can be implicitly coerced to a float\n", + "my_typed_task = MyTypedTask(a=1, b=2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "## Docstring parsing\n", + "\n", + "Instead of explicitly providing help strings and output names in `inputs` and `outputs`\n", + "arguments, if the function describes the its inputs and/or outputs in the doc string, \n", + "in either reST, Google or NumpyDoc style, then they will be extracted and included in the\n", + "input or output fields\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pydra.utils import print_help\n", + "\n", + "\n", + "@python.define(outputs=[\"c\", \"d\"])\n", + "def DocStrExample(a: int, b: float) -> tuple[float, float]:\n", + " \"\"\"Example python task with help strings pulled from doc-string\n", + "\n", + " Args:\n", + " a: First input\n", + " to be inputted\n", + " b: Second input\n", + "\n", + " Returns:\n", + " c: Sum of a and b\n", + " d: Product of a and b\n", + " \"\"\"\n", + " return a + b, a * b\n", + "\n", + "\n", + "print_help(DocStrExample)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Wrapping external functions\n", + "\n", + "Like all decorators, `python.define` is just a function, so can also be used to convert\n", + "a function that is defined separately into a Python task." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", + "NumpyCorrelate = python.define(np.correlate)\n", + "\n", + "numpy_correlate = NumpyCorrelate(a=[1, 2, 3], v=[0, 1, 0.5])\n", + "\n", + "outputs = numpy_correlate()\n", + "\n", + "print(outputs.out)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Like with decorated functions, input and output fields can be explicitly augmented via\n", + "the `inputs` and `outputs` arguments" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", + "NumpyCorrelate = python.define(np.correlate, outputs=[\"correlation\"])\n", + "\n", + "numpy_correlate = NumpyCorrelate(a=[1, 2, 3], v=[0, 1, 0.5])\n", + "\n", + "outputs = numpy_correlate()\n", + "\n", + "print(outputs.correlation)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "wf13", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/source/tutorial/5-shell.ipynb b/docs/source/tutorial/5-shell.ipynb new file mode 100644 index 0000000000..96c038d58d --- /dev/null +++ b/docs/source/tutorial/5-shell.ipynb @@ -0,0 +1,439 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Shell-tasks" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Command-line templates\n", + "\n", + "Shell task specs can be defined using string templates that resemble the command-line usage examples typically used in in-line help. Therefore, they can be quick and intuitive way to specify a shell task. For example, a simple spec for the copy command `cp` that omits optional flags," + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pydra.compose import shell\n", + "\n", + "Cp = shell.define(\"cp \")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Input and output fields are both specified by placing the name of the field within enclosing `<` and `>`. 
Outputs are differentiated by the `out|` prefix.\n", + "\n", + "This shell task can then be run just as a Python task would be run, first parameterising it, then executing" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "from tempfile import mkdtemp\n", + "\n", + "# Make a test file to copy\n", + "test_dir = Path(mkdtemp())\n", + "test_file = test_dir / \"in.txt\"\n", + "with open(test_file, \"w\") as f:\n", + " f.write(\"Contents to be copied\")\n", + "\n", + "# Parameterise the task\n", + "cp = Cp(in_file=test_file, destination=test_dir / \"out.txt\")\n", + "\n", + "# Print the cmdline to be run to double check\n", + "print(f\"Command-line to be run: {cp.cmdline}\")\n", + "\n", + "# Run the shell-comand task\n", + "outputs = cp()\n", + "\n", + "print(\n", + " f\"Contents of copied file ('{outputs.destination}'): \"\n", + " f\"'{Path(outputs.destination).read_text()}'\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If paths to output files are not provided in the parameterisation, it will default to the name of the field" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cp = Cp(in_file=test_file)\n", + "print(cp.cmdline)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Defining input/output types\n", + "\n", + "By default, shell-command fields are considered to be of `fileformats.generic.FsObject` type. However, more specific file formats or built-in Python types can be specified by appending the type to the field name after a `:`.\n", + "\n", + "File formats are specified by their MIME type or \"MIME-like\" strings (see the [FileFormats docs](https://arcanaframework.github.io/fileformats/mime.html) for details)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from fileformats.image import Png\n", + "\n", + "TrimPng = shell.define(\"trim-png \")\n", + "\n", + "trim_png = TrimPng(in_image=Png.mock(), out_image=\"/path/to/output.png\")\n", + "\n", + "print(trim_png.cmdline)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Flags and options\n", + "\n", + "Command line flags can also be added to the shell template, either the single or double hyphen form.\n", + "The field template name immediately following the flag will be associate with that flag.\n", + "If there is no space between the flag and the field template, then the field is assumed\n", + "to be a boolean, otherwise it is assumed to be of type string unless otherwise specified.\n", + "\n", + "If a field is optional, the field template should end with a `?`. Tuple fields are\n", + "specified by comma separated types. The ellipsis (`...`) can signify tuple types with\n", + "variable number of items. Arguments and options that can be repeated are specified by\n", + "appending a `+` (at least one must be provided) or `*` (defaults to empty list). Note that\n", + "for options, this signifies that the flag itself is printed multiple times. e.g.\n", + "`my-command --multi-opt 1 2 --multi-opt 1 5`." 
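As a small sketch of the repeated-argument and repeated-option cases described above, before the fuller `cp` example below: the `my-command` tool and its fields here are hypothetical, and the exact placement of the `+`/`*` markers is an assumption based on the template syntax introduced in this section.

```python
from pydra.compose import shell
from pydra.utils import print_help

# Hypothetical tool: 'in_files' must be given at least once ('+'), while the
# '--opt' flag takes an integer and may be repeated any number of times ('*'),
# with the flag printed once per value
MyCommand = shell.define("my-command <in_files+> --opt <opt:int*>")

print_help(MyCommand)
```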
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pydra.utils import print_help\n", + "\n", + "Cp = shell.define(\n", + " \"cp \"\n", + " \"-R \"\n", + " \"--text-arg \"\n", + " \"--int-arg \"\n", + " \"--tuple-arg \"\n", + ")\n", + "\n", + "print_help(Cp)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Defaults\n", + "\n", + "Defaults can be specified by appending them to the field template after `=`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pydra.utils import task_fields\n", + "\n", + "Cp = shell.define(\n", + " \"cp \"\n", + " \"-R \"\n", + " \"--text-arg \"\n", + " \"--int-arg \"\n", + " \"--tuple-arg \"\n", + ")\n", + "\n", + "print(f\"'--int-arg' default: {task_fields(Cp).int_arg.default}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Path templates for output files\n", + "\n", + "By default, when an output file argument is defined, a `path_template` attribute will\n", + "be assigned to the field based on its name and extension (if applicable). For example,\n", + "the `zipped` output field in the following Gzip command will be assigned a\n", + "`path_template` of `out_file.gz`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pydra.compose import shell\n", + "from fileformats.generic import File\n", + "\n", + "Gzip = shell.define(\"gzip \")\n", + "gzip = Gzip(in_files=File.mock(\"/a/file.txt\"))\n", + "print(gzip.cmdline)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "However, if this needs to be specified it can be by using the `$` operator, e.g." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "Gzip = shell.define(\"gzip \")\n", + "gzip = Gzip(in_files=File.mock(\"/a/file.txt\"))\n", + "print(gzip.cmdline)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To give the field a path_template of `archive.gz` when it is written on the command line.\n", + "Note that this value can always be overridden when the task is initialised, e.g." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "gzip = Gzip(in_files=File.mock(\"/a/file.txt\"), out_file=\"/path/to/archive.gz\")\n", + "print(gzip.cmdline)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Additional field attributes\n", + "\n", + "Additional attributes of the fields in the template can be specified by providing `shell.arg` or `shell.outarg` fields to the `inputs` and `outputs` keyword arguments to the define" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "Cp = shell.define(\n", + " (\n", + " \"cp \"\n", + " \"-R \"\n", + " \"--text-arg \"\n", + " \"--int-arg \"\n", + " \"--tuple-arg \"\n", + " ),\n", + " inputs={\n", + " \"recursive\": shell.arg(\n", + " help=(\n", + " \"If source_file designates a directory, cp copies the directory and \"\n", + " \"the entire subtree connected at that point.\"\n", + " )\n", + " )\n", + " },\n", + " outputs={\n", + " \"out_dir\": shell.outarg(position=-2),\n", + " \"out_file\": shell.outarg(position=-1),\n", + " },\n", + ")\n", + "\n", + "\n", + "print_help(Cp)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Callable outptus\n", + "\n", + "In addition to outputs that are specified to the tool on the command line, outputs can be derived from the outputs of the tool by providing a Python function that can take the output directory and inputs as arguments and return the output value. Callables can be either specified in the `callable` attribute of the `shell.out` field, or in a dictionary mapping the output name to the callable" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from pydra.compose import shell\n", + "from pathlib import Path\n", + "from fileformats.generic import File\n", + "\n", + "\n", + "# Arguments to the callable function can be one of\n", + "def get_file_size(out_file: Path) -> int:\n", + " \"\"\"Calculate the file size\"\"\"\n", + " result = os.stat(out_file)\n", + " return result.st_size\n", + "\n", + "\n", + "CpWithSize = shell.define(\n", + " \"cp \",\n", + " outputs={\"out_file_size\": get_file_size},\n", + ")\n", + "\n", + "# Parameterise the task\n", + "cp_with_size = CpWithSize(in_file=File.sample())\n", + "\n", + "# Run the command\n", + "outputs = cp_with_size()\n", + "\n", + "\n", + "print(f\"Size of the output file is: {outputs.out_file_size}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The callable can take any combination of the following arguments, which will be passed\n", + "to it when it is called\n", + "\n", + "* field: the `Field` object to be provided a value, useful when writing generic callables\n", + "* cache_dir: a `Path` object referencing the working directory the command was run within\n", + "* inputs: a dictionary containing all the resolved inputs to the task\n", + "* stdout: the standard output stream produced by the command\n", + "* stderr: the standard error stream produced by the command\n", + "* *name of an input*: the name of any of the input arguments to the task, including output args that are part of the command line (i.e. 
output files)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To make workflows that use the interface type-checkable, the canonical form of a shell\n", + "task dataclass should inherit from `shell.Def` parameterized by its nested Outputs class,\n", + "and the `Outputs` nested class should inherit from `shell.Outputs`. Arguments that are\n", + "provided None values are not included in the command line, so optional arguments should\n", + "be typed as one of these equivalent forms `ty.Union[T, None]`, `ty.Optional[T]` or `T | None`\n", + "and have a default of `None`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pydra.utils.typing import MultiInputObj\n", + "from fileformats.generic import FsObject, Directory\n", + "\n", + "\n", + "@shell.define\n", + "class Cp(shell.Task[\"Cp.Outputs\"]):\n", + "\n", + " executable = \"cp\"\n", + "\n", + " in_fs_objects: MultiInputObj[FsObject]\n", + " recursive: bool = shell.arg(argstr=\"-R\", default=False)\n", + " text_arg: str = shell.arg(argstr=\"--text-arg\")\n", + " int_arg: int | None = shell.arg(argstr=\"--int-arg\", default=None)\n", + " tuple_arg: tuple[int, str] | None = shell.arg(argstr=\"--tuple-arg\", default=None)\n", + "\n", + " class Outputs(shell.Outputs):\n", + " out_dir: Directory = shell.outarg(path_template=\"{out_dir}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Dynamic definitions\n", + "\n", + "In some cases, it is required to generate the definition for a task dynamically, which can be done by just providing the executable to `shell.define` and specifying all inputs and outputs explicitly" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from fileformats.generic import File\n", + "from pydra.utils import print_help\n", + "\n", + "ACommand = shell.define(\n", + " \"a-command\",\n", + " inputs={\n", + " \"in_file\": shell.arg(type=File, help=\"output file\", argstr=\"\", position=-2)\n", + " },\n", + " outputs={\n", + " \"out_file\": shell.outarg(type=File, help=\"output file\", argstr=\"\", position=-1),\n", + " \"out_file_size\": {\n", + " \"type\": int,\n", + " \"help\": \"size of the output directory\",\n", + " \"callable\": get_file_size,\n", + " },\n", + " },\n", + ")\n", + "\n", + "print_help(ACommand)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "wf13", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/source/tutorial/6-workflow.ipynb b/docs/source/tutorial/6-workflow.ipynb new file mode 100644 index 0000000000..73e61afac1 --- /dev/null +++ b/docs/source/tutorial/6-workflow.ipynb @@ -0,0 +1,599 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Workflows\n", + "\n", + "In Pydra, workflows are DAG of component tasks to be executed on specified inputs.\n", + "Workflow definitions are dataclasses, which interchangeable with Python and shell tasks\n", + "definitions and executed in the same way." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Constructor functions\n", + "\n", + "Workflows are typically defined using the `pydra.compose.workflow.define` decorator on \n", + "a \"constructor\" function that generates the workflow. For example, given two task\n", + "definitions, `Add` and `Mul`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pydra.compose import workflow, python\n", + "from pydra.utils import show_workflow, print_help\n", + "\n", + "\n", + "# Example python tasks\n", + "@python.define\n", + "def Add(a, b):\n", + " return a + b\n", + "\n", + "\n", + "@python.define\n", + "def Mul(a, b):\n", + " return a * b" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " we can create a simple workflow definition using `workflow.define` to decorate a function that constructs the workflow. Nodes are added to the workflow being constructed by calling `workflow.add` function." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "@workflow.define\n", + "def BasicWorkflow(a, b):\n", + " add = workflow.add(Add(a=a, b=b))\n", + " mul = workflow.add(Mul(a=add.out, b=b))\n", + " return mul.out\n", + "\n", + "\n", + "print_help(BasicWorkflow)\n", + "show_workflow(BasicWorkflow, figsize=(2, 2.5))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`workflow.add` returns an \"outputs\" object corresponding to the definition added to the\n", + "workflow. The fields of the outptus object can be referenced as inputs to downstream\n", + "workflow nodes. Note that these output fields are just placeholders for the values that will\n", + "be returned and can't be used in conditional statements during workflow construction\n", + "(see [Dynamic construction](../explanation/conditional-lazy.html) on how to work around this\n", + "limitation). The fields of the outputs to be returned by the workflow should be returned\n", + "in a tuple." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pydra.compose import shell\n", + "from fileformats import image, video\n", + "\n", + "\n", + "@workflow.define\n", + "def ShellWorkflow(\n", + " input_video: video.Mp4,\n", + " watermark: image.Png,\n", + " watermark_dims: tuple[int, int] = (10, 10),\n", + ") -> video.Mp4:\n", + "\n", + " add_watermark = workflow.add(\n", + " shell.define(\n", + " \"ffmpeg -i -i \"\n", + " \"-filter_complex \"\n", + " )(\n", + " in_video=input_video,\n", + " watermark=watermark,\n", + " filter=\"overlay={}:{}\".format(*watermark_dims),\n", + " )\n", + " )\n", + " output_video = workflow.add(\n", + " shell.define(\n", + " \"HandBrakeCLI -i -o \"\n", + " \"--width --height \",\n", + " )(in_video=add_watermark.out_video, width=1280, height=720)\n", + " ).out_video\n", + "\n", + " return output_video # test implicit detection of output name\n", + "\n", + "\n", + "print_help(ShellWorkflow)\n", + "show_workflow(ShellWorkflow, figsize=(2.5, 3))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Splitting/combining task inputs\n", + "\n", + "Sometimes, you might want to perform the same task over a set of input values/files, and then collect the results into a list to perform further processing. 
This can be achieved by using the `split` and `combine` methods" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "@python.define\n", + "def Sum(x: list[float]) -> float:\n", + " return sum(x)\n", + "\n", + "\n", + "@workflow.define\n", + "def SplitWorkflow(a: list[int], b: list[float]) -> list[float]:\n", + " # Multiply over all combinations of the elements of a and b, then combine the results\n", + " # for each a element into a list over each b element\n", + " mul = workflow.add(Mul().split(a=a, b=b).combine(\"a\"))\n", + " # Sume the multiplications across all all b elements for each a element\n", + " sum = workflow.add(Sum(x=mul.out))\n", + " return sum.out\n", + "\n", + "\n", + "print_help(SplitWorkflow)\n", + "show_workflow(SplitWorkflow, figsize=(2, 2.5))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The combination step doesn't have to be done on the same step as the split, in which case the splits propagate to downstream nodes" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "@workflow.define\n", + "def SplitThenCombineWorkflow(a: list[int], b: list[float], c: float) -> list[float]:\n", + " mul = workflow.add(Mul().split(a=a, b=b))\n", + " add = workflow.add(Add(a=mul.out, b=c).combine(\"Mul.a\"))\n", + " sum = workflow.add(Sum(x=add.out))\n", + " return sum.out\n", + "\n", + "\n", + "print_help(SplitThenCombineWorkflow)\n", + "show_workflow(SplitThenCombineWorkflow, figsize=(3, 3.5))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For more advanced discussion on the intricacies of splitting and combining see [Splitting and combining](../explanation/splitting-combining.html)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Nested and conditional workflows\n", + "\n", + "One of the most powerful features of Pydra is the ability to use inline Python code to conditionally add/omit nodes to workflow, and alter the parameterisation of the nodes, depending on inputs to the workflow " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "@workflow.define\n", + "def ConditionalWorkflow(\n", + " input_video: video.Mp4,\n", + " watermark: image.Png,\n", + " watermark_dims: tuple[int, int] | None = None,\n", + ") -> video.Mp4:\n", + "\n", + " if watermark_dims is not None:\n", + " add_watermark = workflow.add(\n", + " shell.define(\n", + " \"ffmpeg -i -i \"\n", + " \"-filter_complex \"\n", + " )(\n", + " in_video=input_video,\n", + " watermark=watermark,\n", + " filter=\"overlay={}:{}\".format(*watermark_dims),\n", + " )\n", + " )\n", + " handbrake_input = add_watermark.out_video\n", + " else:\n", + " handbrake_input = input_video\n", + "\n", + " output_video = workflow.add(\n", + " shell.define(\n", + " \"HandBrakeCLI -i -o \"\n", + " \"--width --height \",\n", + " )(in_video=handbrake_input, width=1280, height=720)\n", + " ).out_video\n", + "\n", + " return output_video # test implicit detection of output name\n", + "\n", + "\n", + "print_help(ConditionalWorkflow)\n", + "show_workflow(ConditionalWorkflow(watermark_dims=(10, 10)), figsize=(2.5, 3))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that outputs of upstream nodes cannot be used in conditional statements, since these are just placeholders at the time the workflow is being constructed. 
However, you can get around\n", + "this limitation by placing the conditional logic within a nested workflow" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "@python.define\n", + "def Subtract(x: float, y: float) -> float:\n", + " return x - y\n", + "\n", + "\n", + "@workflow.define\n", + "def RecursiveNestedWorkflow(a: float, depth: int) -> float:\n", + " add = workflow.add(Add(a=a, b=1))\n", + " decrement_depth = workflow.add(Subtract(x=depth, y=1))\n", + " if depth > 0:\n", + " out_node = workflow.add(\n", + " RecursiveNestedWorkflow(a=add.out, depth=decrement_depth.out)\n", + " )\n", + " else:\n", + " out_node = add\n", + " return out_node.out\n", + "\n", + "\n", + "print_help(RecursiveNestedWorkflow)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For more detailed discussion of the construction of conditional workflows and \"lazy field\"\n", + "placeholders see [Conditionals and lazy fields](../explanation/conditional-lazy.html)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Type-checking between nodes\n", + "\n", + "Pydra utilizes Python type annotations to implement strong type-checking, which is performed\n", + "when values or upstream outputs are assigned to task inputs.\n", + "\n", + "Job input and output fields do not need to be assigned types, since they will default to `typing.Any`.\n", + "However, if they are assigned a type and a value or output from an upstream node conflicts\n", + "with the type, a `TypeError` will be raised at construction time.\n", + "\n", + "Note that the type-checking \"assumes the best\", and will pass if the upstream field is typed\n", + "by `Any` or a super-class of the field being assigned to. 
For example, an input of\n", + "`fileformats.generic.File` passed to a field expecting a `fileformats.image.Png` file type,\n", + "because `Png` is a subtype of `File`, where as `fileformats.image.Jpeg` input would fail\n", + "since it is clearly not the intended type.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from fileformats import generic\n", + "\n", + "Mp4Handbrake = shell.define(\n", + " \"HandBrakeCLI -i -o \"\n", + " \"--width --height \",\n", + ")\n", + "\n", + "\n", + "QuicktimeHandbrake = shell.define(\n", + " \"HandBrakeCLI -i -o \"\n", + " \"--width --height \",\n", + ")\n", + "\n", + "\n", + "@workflow.define\n", + "def TypeErrorWorkflow(\n", + " input_video: video.Mp4,\n", + " watermark: generic.File,\n", + " watermark_dims: tuple[int, int] = (10, 10),\n", + ") -> video.Mp4:\n", + "\n", + " add_watermark = workflow.add(\n", + " shell.define(\n", + " \"ffmpeg -i -i \"\n", + " \"-filter_complex \"\n", + " )(\n", + " in_video=input_video, # This is OK because in_video is typed Any\n", + " watermark=watermark, # Type is OK because generic.File is superclass of image.Png\n", + " filter=\"overlay={}:{}\".format(*watermark_dims),\n", + " ),\n", + " name=\"add_watermark\",\n", + " )\n", + "\n", + " try:\n", + " handbrake = workflow.add(\n", + " QuicktimeHandbrake(\n", + " in_video=add_watermark.out_video, width=1280, height=720\n", + " ),\n", + " ) # This will raise a TypeError because the input video is an Mp4\n", + " except TypeError:\n", + " handbrake = workflow.add(\n", + " Mp4Handbrake(in_video=add_watermark.out_video, width=1280, height=720),\n", + " ) # The type of the input video is now correct\n", + "\n", + " return handbrake.out_video\n", + "\n", + "\n", + "print_help(TypeErrorWorkflow)\n", + "show_workflow(TypeErrorWorkflow, plot_type=\"detailed\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For more detailed discussion on Pydra's type-checking see [Type Checking](../explanation/typing.html)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Accessing the workflow object\n", + "\n", + "If you need to access the workflow object being constructed from inside the constructor function you can use `workflow.this()`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "@python.define(outputs=[\"divided\"])\n", + "def Divide(x, y):\n", + " return x / y\n", + "\n", + "\n", + "@workflow.define(outputs=[\"out1\", \"out2\"])\n", + "def DirectAccesWorkflow(a: int, b: float) -> tuple[float, float]:\n", + " \"\"\"A test workflow demonstration a few alternative ways to set and connect nodes\n", + "\n", + " Args:\n", + " a: An integer input\n", + " b: A float input\n", + "\n", + " Returns:\n", + " out1: The first output\n", + " out2: The second output\n", + " \"\"\"\n", + "\n", + " wf = workflow.this()\n", + "\n", + " add = wf.add(Add(a=a, b=b), name=\"addition\")\n", + " mul = wf.add(Mul(a=add.out, b=b))\n", + " divide = wf.add(Divide(x=wf[\"addition\"].lzout.out, y=mul.out), name=\"division\")\n", + "\n", + " # Alter one of the inputs to a node after it has been initialised\n", + " wf[\"Mul\"].inputs.b *= 2\n", + "\n", + " return mul.out, divide.divided\n", + "\n", + "\n", + "print_help(DirectAccesWorkflow)\n", + "show_workflow(DirectAccesWorkflow(b=1), plot_type=\"detailed\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Directly access the workflow being constructed also enables you to set the outputs of the workflow directly" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "@workflow.define(outputs={\"out1\": float, \"out2\": float})\n", + "def SetOutputsOfWorkflow(a: int, b: float):\n", + " \"\"\"A test workflow demonstration a few alternative ways to set and connect nodes\n", + "\n", + " Args:\n", + " a: An integer input\n", + " b: A float input\n", + "\n", + " Returns:\n", + " out1: The first output\n", + " out2: The second output\n", + " \"\"\"\n", + "\n", + " wf = workflow.this()\n", + "\n", + " add = wf.add(Add(a=a, b=b), name=\"addition\")\n", + " mul = wf.add(Mul(a=add.out, b=b))\n", + " divide = wf.add(Divide(x=wf[\"addition\"].lzout.out, y=mul.out), name=\"division\")\n", + "\n", + " # Alter one of the inputs to a node after it has been initialised\n", + " wf[\"Mul\"].inputs.b *= 2\n", + "\n", + " # Set the outputs of the workflow directly\n", + " wf.outputs.out1 = mul.out\n", + " wf.outputs.out2 = divide.divided\n", + "\n", + "\n", + "print_help(SetOutputsOfWorkflow)\n", + "show_workflow(SetOutputsOfWorkflow(b=3), plot_type=\"detailed\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setting software environments per node\n", + "\n", + "The [Advanced execution tutorial](./2-advanced-execution.html) showed how the software\n", + "environment (e.g. Docker container) could be specified for shell tasks by passing the\n", + "`environment` variable to the task execution/submission call. For shell tasks\n", + "within workflows, the software environment used for them is specified when adding\n", + "a new workflow node, i.e." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import tempfile\n", + "from pathlib import Path\n", + "import numpy as np\n", + "from fileformats.medimage import Nifti1\n", + "import fileformats.medimage_mrtrix3 as mrtrix3\n", + "from pydra.environments import docker\n", + "from pydra.compose import workflow, python\n", + "from pydra.tasks.mrtrix3.v3_0 import MrConvert, MrThreshold\n", + "\n", + "MRTRIX2NUMPY_DTYPES = {\n", + " \"Int8\": np.dtype(\"i1\"),\n", + " \"UInt8\": np.dtype(\"u1\"),\n", + " \"Int16LE\": np.dtype(\"i2\"),\n", + " \"UInt16LE\": np.dtype(\"u2\"),\n", + " \"Int32LE\": np.dtype(\"i4\"),\n", + " \"UInt32LE\": np.dtype(\"u4\"),\n", + " \"Float32LE\": np.dtype(\"f4\"),\n", + " \"Float64LE\": np.dtype(\"f8\"),\n", + " \"CFloat32LE\": np.dtype(\"c8\"),\n", + " \"CFloat64LE\": np.dtype(\"c16\"),\n", + "}\n", + "\n", + "\n", + "@workflow.define(outputs=[\"out_image\"])\n", + "def ToyMedianThreshold(in_image: Nifti1) -> mrtrix3.ImageFormat:\n", + " \"\"\"A toy example workflow that\n", + "\n", + " * converts a NIfTI image to MRTrix3 image format with a separate header\n", + " * loads the separate data file and selects the median value\n", + " \"\"\"\n", + "\n", + " input_conversion = workflow.add(\n", + " MrConvert(in_file=in_image, out_file=\"out_file.mih\"),\n", + " name=\"input_conversion\",\n", + " environment=docker.Environment(\"mrtrix3/mrtrix3\", tag=\"latest\"),\n", + " )\n", + "\n", + " @python.define\n", + " def Median(mih: mrtrix3.ImageHeader) -> float:\n", + " \"\"\"A bespoke function that reads the separate data file in the MRTrix3 image\n", + " header format (i.e. .mih) and calculates the median value.\n", + "\n", + " NB: We could use a MrStats task here, but this is just an example to show how\n", + " to use a bespoke function in a workflow.\n", + " \"\"\"\n", + " dtype = MRTRIX2NUMPY_DTYPES[mih.metadata[\"datatype\"].strip()]\n", + " data = np.frombuffer(Path.read_bytes(mih.data_file), dtype=dtype)\n", + " return np.median(data)\n", + "\n", + " median = workflow.add(Median(mih=input_conversion.out_file))\n", + "\n", + " threshold = workflow.add(\n", + " MrThreshold(in_file=in_image, out_file=\"binary.mif\", abs=median.out),\n", + " environment=docker.Environment(\"mrtrix3/mrtrix3\", tag=\"latest\"),\n", + " )\n", + "\n", + " output_conversion = workflow.add(\n", + " MrConvert(in_file=threshold.out_file, out_file=\"out_image.mif\"),\n", + " name=\"output_conversion\",\n", + " environment=docker.Environment(\"mrtrix3/mrtrix3\", tag=\"latest\"),\n", + " )\n", + "\n", + " return output_conversion.out_file\n", + "\n", + "\n", + "test_dir = Path(tempfile.mkdtemp())\n", + "\n", + "nifti_file = Nifti1.sample(test_dir, seed=0)\n", + "\n", + "wf = ToyMedianThreshold(in_image=nifti_file)\n", + "\n", + "outputs = wf(cache_root=test_dir / \"cache\")\n", + "\n", + "print(outputs)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "See [Containers and Environments](../explanation/environments.rst) for more details on\n", + "how to utilise containers and add support for other software environments." 
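Distilled to its essentials, the per-node pattern used in the example above looks like the following sketch. The `some-tool` command, its template and the reuse of the `mrtrix3/mrtrix3` image are placeholders for illustration; only the `environment` argument to `workflow.add` is the point being made.

```python
from pydra.compose import shell, workflow
from pydra.environments import docker


@workflow.define
def ContainerisedNodeWorkflow(in_file):
    # Only this node runs inside the Docker container; nodes added without an
    # `environment` argument run in the default (host) environment
    node = workflow.add(
        shell.define("some-tool <in_file> <out|out_file>")(in_file=in_file),
        environment=docker.Environment("mrtrix3/mrtrix3", tag="latest"),
    )
    return node.out_file
```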
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "wf13", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.1" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/docs/source/tutorial/7-canonical-form.ipynb b/docs/source/tutorial/7-canonical-form.ipynb new file mode 100644 index 0000000000..ee6e043b14 --- /dev/null +++ b/docs/source/tutorial/7-canonical-form.ipynb @@ -0,0 +1,263 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Canonical task form\n", + "\n", + "Under the hood, all Python, shell and workflow tasks generated by the\n", + "`pydra.compose.*.define` decorators/functions are translated to\n", + "[dataclass](https://docs.python.org/3/library/dataclasses.html)-like classes by the\n", + "[attrs](https://www.attrs.org/en/stable/) library. While the more compact syntax described\n", + "in the [Python-tasks](./4-python.html), [Shell-tasks](./5-shell.html) and [Workflow](./6-workflow.html)\n", + "tutorials is convenient when designing tasks for specific use cases, it is too magical\n", + "for linters follow. Therefore, when designing tasks to be used by third\n", + "parties (e.g. `pydra-fsl`, `pydra-ants`) it is recommended to favour the, more\n", + "explicit, \"canonical\" dataclass form.\n", + "\n", + "The syntax of the canonical form is close to that used by the\n", + "[Attrs](https://www.attrs.org/en/stable/) package itself, with class type annotations\n", + "used to define the fields of the inputs and outputs of the task. Tasks defined in canonical\n", + "form will be able to be statically type-checked by [MyPy](https://mypy-lang.org/)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Python-tasks\n", + "\n", + "Python tasks in dataclass form are decorated by `pydra.compose.python.define`\n", + "with inputs listed as type annotations. Outputs are similarly defined in a nested class\n", + "called `Outputs`. The function to be executed should be a staticmethod called `function`.\n", + "Default values can also be set directly, as with Attrs classes.\n", + "\n", + "In order to allow static type-checkers to check the type of outputs of tasks added\n", + "to workflows, it is also necessary to explicitly extend from the `pydra.engine.python.Task`\n", + "and `pydra.engine.python.Outputs` classes (they are otherwise set as bases by the\n", + "`define` method implicitly). 
Thus the \"canonical form\" of Python task is as\n", + "follows" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pydra.utils import print_help\n", + "from pydra.compose import python\n", + "\n", + "\n", + "@python.define\n", + "class CanonicalPythonTask(python.Task[\"CanonicalPythonTask.Outputs\"]):\n", + " \"\"\"Canonical Python task class for testing\n", + "\n", + " Args:\n", + " a: First input\n", + " to be inputted\n", + " b: Second input\n", + " \"\"\"\n", + "\n", + " a: int\n", + " b: float = 2.0 # set default value\n", + "\n", + " class Outputs(python.Outputs):\n", + " \"\"\"\n", + " Args:\n", + " c: Sum of a and b\n", + " d: Product of a and b\n", + " \"\"\"\n", + "\n", + " c: float\n", + " d: float\n", + "\n", + " @staticmethod\n", + " def function(a, b):\n", + " return a + b, a / b\n", + "\n", + "\n", + "print_help(CanonicalPythonTask)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To set additional attributes other than the type and default, such as `allowed_values`\n", + "and `validators`, `python.arg` and `python.out` can be used instead." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import attrs.validators\n", + "\n", + "\n", + "@python.define\n", + "class CanonicalPythonTask(python.Task[\"CanonicalPythonTask.Outputs\"]):\n", + " \"\"\"Canonical Python task class for testing\n", + "\n", + " Args:\n", + " a: First input\n", + " to be inputted\n", + " b: Second input\n", + " \"\"\"\n", + "\n", + " a: int = python.arg(allowed_values=[1, 2, 3, 4, 5])\n", + " b: float = python.arg(default=2.0, validator=attrs.validators.not_(0))\n", + "\n", + " class Outputs(python.Outputs):\n", + " \"\"\"\n", + " Args:\n", + " c: Sum of a and b\n", + " d: Product of a and b\n", + " \"\"\"\n", + "\n", + " c: float\n", + " d: float\n", + "\n", + " @staticmethod\n", + " def function(a, b):\n", + " return a + b, a / b\n", + "\n", + "\n", + "print_help(CanonicalPythonTask)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Shell-tasks\n", + "\n", + "The canonical form of shell tasks is the same as for Python tasks, except a string `executable`\n", + "attribute replaces the `function` staticmethod." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from pathlib import Path\n", + "from fileformats import generic\n", + "from pydra.compose import shell\n", + "from pydra.utils.typing import MultiInputObj\n", + "\n", + "\n", + "@shell.define\n", + "class CpWithSize(shell.Task[\"CpWithSize.Outputs\"]):\n", + "\n", + " executable = \"cp\"\n", + "\n", + " in_fs_objects: MultiInputObj[generic.FsObject]\n", + " recursive: bool = shell.arg(argstr=\"-R\")\n", + " text_arg: str = shell.arg(argstr=\"--text-arg\")\n", + " int_arg: int | None = shell.arg(argstr=\"--int-arg\")\n", + " tuple_arg: tuple[int, str] | None = shell.arg(argstr=\"--tuple-arg\")\n", + "\n", + " class Outputs(shell.Outputs):\n", + "\n", + " @staticmethod\n", + " def get_file_size(out_file: Path) -> int:\n", + " \"\"\"Calculate the file size\"\"\"\n", + " result = os.stat(out_file)\n", + " return result.st_size\n", + "\n", + " copied: generic.FsObject = shell.outarg(path_template=\"copied\")\n", + " out_file_size: int = shell.out(callable=get_file_size)\n", + "\n", + "\n", + "print_help(CpWithSize)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Workflow definitions\n", + "\n", + "Workflows can also be defined in canonical form, which is the same as for Python tasks\n", + "but with a staticmethod called `constructor` that constructs the workflow." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import typing as ty\n", + "import re\n", + "from pydra.compose import python, workflow\n", + "from pydra.compose.base import is_set\n", + "from pydra.utils import print_help, show_workflow\n", + "\n", + "\n", + "# Example python tasks\n", + "@python.define\n", + "def Add(a, b):\n", + " return a + b\n", + "\n", + "\n", + "@python.define\n", + "def Mul(a, b):\n", + " return a * b\n", + "\n", + "\n", + "@workflow.define\n", + "class CanonicalWorkflowTask(workflow.Task[\"CanonicalWorkflowTask.Outputs\"]):\n", + "\n", + " @staticmethod\n", + " def str2num(value: ty.Any) -> float | int:\n", + " if isinstance(value, str) and re.match(r\"^\\d+(\\.\\d+)?$\", value):\n", + " return eval(value) # use eval to convert string to number\n", + " return value\n", + "\n", + " a: int\n", + " b: float = workflow.arg(help=\"A float input\", converter=str2num)\n", + "\n", + " @staticmethod\n", + " def constructor(a, b):\n", + " add = workflow.add(Add(a=a, b=b))\n", + " mul = workflow.add(Mul(a=add.out, b=b))\n", + " return mul.out\n", + "\n", + " class Outputs(workflow.Outputs):\n", + " out: float\n", + "\n", + "\n", + "print_help(CanonicalWorkflowTask)\n", + "show_workflow(CanonicalWorkflowTask)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "wf13", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/source/tutorial/tst.py b/docs/source/tutorial/tst.py new file mode 100644 index 0000000000..acc122e47f --- /dev/null +++ b/docs/source/tutorial/tst.py @@ -0,0 +1,43 @@ +from pydra.compose import python, workflow +from pydra.utils import print_help, show_workflow + + +# Example python tasks +@python.define +def Add(a, b): + return a + b + + +@python.define 
+def Mul(a, b): + return a * b + + +@workflow.define +class CanonicalWorkflowTask(workflow.Task["CanonicalWorkflowTask.Outputs"]): + + @staticmethod + def a_converter(value): + try: + return float(value) + except (TypeError, ValueError): + return value + + a: int + b: float = workflow.arg( + help="A float input", + converter=a_converter, + ) + + @staticmethod + def constructor(a, b): + add = workflow.add(Add(a=a, b=b)) + mul = workflow.add(Mul(a=add.out, b=b)) + return mul.out + + class Outputs(workflow.Outputs): + out: float + + +print_help(CanonicalWorkflowTask) +show_workflow(CanonicalWorkflowTask) diff --git a/docs/sphinxext/github_link.py b/docs/sphinxext/github_link.py deleted file mode 100644 index b9a5684c20..0000000000 --- a/docs/sphinxext/github_link.py +++ /dev/null @@ -1,94 +0,0 @@ -""" -This script comes from scikit-learn: -https://github.com/scikit-learn/scikit-learn/blob/master/doc/sphinxext/github_link.py -""" - -import inspect -import os -import subprocess -import sys -from functools import partial -from operator import attrgetter - -REVISION_CMD = "git rev-parse --short HEAD" - - -def _get_git_revision(): - try: - revision = subprocess.check_output(REVISION_CMD.split()).strip() - except (subprocess.CalledProcessError, OSError): - print("Failed to execute git to get revision") - return None - return revision.decode("utf-8") - - -def _linkcode_resolve(domain, info, package, url_fmt, revision): - """Determine a link to online source for a class/method/function - - This is called by sphinx.ext.linkcode - - An example with a long-untouched module that everyone has - >>> _linkcode_resolve('py', {'module': 'tty', - ... 'fullname': 'setraw'}, - ... package='tty', - ... url_fmt='http://hg.python.org/cpython/file/' - ... '{revision}/Lib/{package}/{path}#L{lineno}', - ... revision='xxxx') - 'http://hg.python.org/cpython/file/xxxx/Lib/tty/tty.py#L18' - """ - - if revision is None: - return - if domain not in ("py", "pyx"): - return - if not info.get("module") or not info.get("fullname"): - return - - class_name = info["fullname"].split(".")[0] - module = __import__(info["module"], fromlist=[class_name]) - - # FIXME: Bypass resolving for attrs-defined classes. 
- try: - obj = attrgetter(info["fullname"])(module) - except AttributeError: - return - - # Unwrap the object to get the correct source - # file in case that is wrapped by a decorator - obj = inspect.unwrap(obj) - - try: - fn = inspect.getsourcefile(obj) - except Exception: - fn = None - if not fn: - try: - fn = inspect.getsourcefile(sys.modules[obj.__module__]) - except Exception: - fn = None - if not fn: - return - - fn = os.path.relpath(fn, start=os.path.dirname(__import__(package).__file__)) - try: - lineno = inspect.getsourcelines(obj)[1] - except Exception: - lineno = "" - return url_fmt.format(revision=revision, package=package, path=fn, lineno=lineno) - - -def make_linkcode_resolve(package, url_fmt): - """Returns a linkcode_resolve function for the given URL format - - revision is a git commit reference (hash or name) - - package is the name of the root module of the package - - url_fmt is along the lines of ('https://github.com/USER/PROJECT/' - 'blob/{revision}/{package}/' - '{path}#L{lineno}') - """ - revision = _get_git_revision() - return partial( - _linkcode_resolve, revision=revision, package=package, url_fmt=url_fmt - ) diff --git a/docs/user_guide.rst b/docs/user_guide.rst deleted file mode 100644 index bf48a9a8a9..0000000000 --- a/docs/user_guide.rst +++ /dev/null @@ -1,12 +0,0 @@ -User Guide -========== - - - -.. toctree:: - - components - state - combiner - input_spec - output_spec diff --git a/empty-docs/conf.py b/empty-docs/conf.py new file mode 100644 index 0000000000..978a9c7a79 --- /dev/null +++ b/empty-docs/conf.py @@ -0,0 +1,162 @@ +# -*- coding: utf-8 -*- +# +# Empty Docs configuration file for Sphinx. +# +# This file is execfile()d with the current directory set to its +# containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +import datetime + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# sys.path.insert(0, os.path.abspath('.')) + +# -- General configuration ------------------------------------------------ + +# If your documentation needs a minimal Sphinx version, state it here. +# needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + "sphinx.ext.autodoc", + "sphinx.ext.doctest", + "sphinx.ext.intersphinx", + "sphinx.ext.todo", + "sphinx.ext.coverage", + "sphinx.ext.mathjax", + "sphinx.ext.viewcode", + "sphinx.ext.napoleon", +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ["_templates"] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# source_suffix = [".rst", ".md"] +source_suffix = ".rst" + +# The encoding of source files. +# source_encoding = "utf-8-sig" + +# The master toctree document. +master_doc = "index" + +# General information about the project. +project = "Empty Docs" +author = "Your Name" +copyright = "{}, {}".format(datetime.datetime.now().year, author) + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. 
+version = "0.1" +# The full version, including alpha/beta/rc tags. +release = "0.1.0" + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +# language = None + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +exclude_patterns = [] + +# The reST default role (used for this markup: `text`) to use for all +# documents. +# default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. +# add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +# add_module_names = True + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = "sphinx" + +# A list of ignored prefixes for module index sorting. +# modindex_common_prefix = [] + +# If true, keep warnings as "system message" paragraphs in the built documents. +# keep_warnings = False + +# If true, `todo` and `todoList` produce output, else they produce nothing. +todo_include_todos = True + +# -- Options for HTML output ---------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +html_theme = "alabaster" + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +html_theme_options = {} + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ["_static"] + +# -- Options for LaTeX output --------------------------------------------- + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + # 'papersize': 'letterpaper', + # The font size ('10pt', '11pt' or '12pt'). + # 'pointsize': '10pt', + # Additional stuff for the LaTeX preamble. + # 'preamble': '', + # Latex figure (float) alignment + # 'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + (master_doc, "EmptyDocs.tex", "Empty Docs Documentation", "Your Name", "manual"), +] + +# -- Options for manual page output --------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [(master_doc, "emptydocs", "Empty Docs Documentation", [author], 1)] + +# -- Options for Texinfo output ------------------------------------------- + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + ( + master_doc, + "EmptyDocs", + "Empty Docs Documentation", + author, + "EmptyDocs", + "One line description of project.", + "Miscellaneous", + ), +] + +# Example configuration for intersphinx: refer to the Python standard library. 
+intersphinx_mapping = {"python": ("https://docs.python.org/3", None)} diff --git a/empty-docs/index.rst b/empty-docs/index.rst new file mode 100644 index 0000000000..88bee5a341 --- /dev/null +++ b/empty-docs/index.rst @@ -0,0 +1,5 @@ +.. meta:: + :http-equiv=Refresh: 0; url='https://nipype.github.io/pydra/' + +The docs for Nipype have been moved to GitHub, if you aren't redirected automatically +go to https://nipype.github.io/pydra/. diff --git a/empty-docs/requirements.txt b/empty-docs/requirements.txt new file mode 100644 index 0000000000..6966869c70 --- /dev/null +++ b/empty-docs/requirements.txt @@ -0,0 +1 @@ +sphinx diff --git a/environment.yml b/environment.yml new file mode 100644 index 0000000000..f2712b1ae6 --- /dev/null +++ b/environment.yml @@ -0,0 +1,25 @@ +name: pydra-tutorial +channels: + - conda-forge + - defaults +dependencies: + - datalad + - pip + - pip: + - pydra==0.23.0a0 + - jupyter + - jupyter_contrib_nbextensions + - jupytext + - jupyterlab + - matplotlib + - nbformat + - nbval + - nest_asyncio + - psutil + - sh + - pytest + - numpy + - pandas + - scipy + - nibabel + - nilearn diff --git a/example.py b/example.py new file mode 100644 index 0000000000..dd9dc87a5f --- /dev/null +++ b/example.py @@ -0,0 +1,34 @@ +import asyncio + + +def is_coroutine_function(func): + return asyncio.iscoroutinefunction(func) + + +async def call_function(func, *args, **kwargs): + if is_coroutine_function(func): + return await func(*args, **kwargs) + else: + return func(*args, **kwargs) + + +# Example usage +async def async_function(x): + await asyncio.sleep(1) + return x * 2 + + +def sync_function(x): + return x * 2 + + +async def main(): + result1 = await call_function(async_function, 10) + result2 = await call_function(sync_function, 10) + print(result1) # Output: 20 + print(result2) # Output: 20 + + +# To run the example +if __name__ == "__main__": + asyncio.run(main()) diff --git a/notebooks/examples b/notebooks/examples new file mode 120000 index 0000000000..4c987681e2 --- /dev/null +++ b/notebooks/examples @@ -0,0 +1 @@ +../docs/source/examples \ No newline at end of file diff --git a/notebooks/tutorial b/notebooks/tutorial new file mode 120000 index 0000000000..6ae303bf43 --- /dev/null +++ b/notebooks/tutorial @@ -0,0 +1 @@ +../docs/source/tutorial \ No newline at end of file diff --git a/pydra/__init__.py b/pydra/__init__.py deleted file mode 100644 index f704d670a5..0000000000 --- a/pydra/__init__.py +++ /dev/null @@ -1,49 +0,0 @@ -""" -The Pydra workflow engine. - -Pydra is a rewrite of the Nipype engine with mapping and joining as -first-class operations. It forms the core of the Nipype 2.0 ecosystem. - -""" - -# This call enables pydra.tasks to be used as a namespace package when installed -# in editable mode. In normal installations it has no effect. -__path__ = __import__("pkgutil").extend_path(__path__, __name__) - -import logging - -import __main__ -import attr - -from . 
import mark -from .engine import AuditFlag, ShellCommandTask, Submitter, Workflow, specs - -__all__ = ( - "Submitter", - "Workflow", - "AuditFlag", - "ShellCommandTask", - "specs", - "mark", -) - -try: - from ._version import __version__ -except ImportError: - pass - -logger = logging.getLogger("pydra") - - -def check_latest_version(): - import etelemetry - - return etelemetry.check_available_version("nipype/pydra", __version__, lgr=logger) - - -# Run telemetry on import for interactive sessions, such as IPython, Jupyter notebooks, Python REPL -if not hasattr(__main__, "__file__"): - from .engine.core import TaskBase - - if TaskBase._etelemetry_version_data is None: - TaskBase._etelemetry_version_data = check_latest_version() diff --git a/pydra/compose/base/__init__.py b/pydra/compose/base/__init__.py new file mode 100644 index 0000000000..feac377eb5 --- /dev/null +++ b/pydra/compose/base/__init__.py @@ -0,0 +1,27 @@ +from .field import Field, Arg, Out, NO_DEFAULT +from .helpers import ( + ensure_field_objects, + parse_doc_string, + extract_function_inputs_and_outputs, + check_explicit_fields_are_none, + extract_fields_from_class, + is_set, +) +from .task import Task, Outputs +from .builder import build_task_class + +__all__ = [ + "Field", + "Arg", + "Out", + "NO_DEFAULT", + "ensure_field_objects", + "parse_doc_string", + "extract_function_inputs_and_outputs", + "check_explicit_fields_are_none", + "extract_fields_from_class", + "is_set", + "build_task_class", + "Task", + "Outputs", +] diff --git a/pydra/compose/base/builder.py b/pydra/compose/base/builder.py new file mode 100644 index 0000000000..18c5b96c7e --- /dev/null +++ b/pydra/compose/base/builder.py @@ -0,0 +1,330 @@ +import typing as ty +import types +from pathlib import Path +from copy import copy +import attrs.validators +from pydra.utils.typing import TypeParser, is_optional, is_fileset_or_union +import attrs +from .task import Task, Outputs +from pydra.utils.hash import hash_function +from pydra.utils.general import ( + from_list_if_single, + ensure_list, + PYDRA_ATTR_METADATA, +) +from pydra.utils.typing import ( + MultiInputObj, + MultiInputFile, + MultiOutputObj, + MultiOutputFile, + is_lazy, +) +from .field import Field, Arg, Out + + +def build_task_class( + spec_type: type["Task"], + out_type: type["Outputs"], + inputs: dict[str, Arg], + outputs: dict[str, Out], + klass: type | None = None, + name: str | None = None, + bases: ty.Sequence[type] = (), + outputs_bases: ty.Sequence[type] = (), + xor: ty.Sequence[str | None] | ty.Sequence[ty.Sequence[str | None]] = (), +): + """Create a task class and its outputs class from the + input and output fields provided to the decorator/function. + + Modifies the class so that its attributes are converted from pydra fields to attrs fields + and then calls `attrs.define` to create an attrs class (dataclass-like). 
+ on + + Parameters + ---------- + task_type : type + The type of the task to be created + inputs : dict[str, Arg] + The input fields of the task + outputs : dict[str, Out] + The output fields of the task + klass : type, optional + The class to be decorated, by default None + name : str, optional + The name of the class, by default + bases : ty.Sequence[type], optional + The base classes for the task class, by default () + outputs_bases : ty.Sequence[type], optional + The base classes for the outputs class, by default () + xor: Sequence[str | None] | Sequence[Sequence[str | None]], optional + Names of args that are exclusive mutually exclusive, which must include + the name of the current field. If this list includes None, then none of the + fields need to be set. + + Returns + ------- + klass : type + The class created using the attrs package + """ + + # Convert a single xor set into a set of xor sets + if not xor: + xor = frozenset() + elif all(isinstance(x, str) or x is None for x in xor): + xor = frozenset([frozenset(xor)]) + else: + xor = frozenset(frozenset(x) for x in xor) + + spec_type._check_arg_refs(inputs, outputs, xor) + + # Check that the field attributes are valid after all fields have been set + # (especially the type) + for inpt in inputs.values(): + attrs.validate(inpt) + for outpt in outputs.values(): + attrs.validate(outpt) + + if name is None and klass is not None: + name = klass.__name__ + if reserved_names := [n for n in inputs if n in spec_type.RESERVED_FIELD_NAMES]: + raise ValueError( + f"{reserved_names} are reserved and cannot be used for {spec_type} field names" + ) + outputs_klass = build_outputs_class(out_type, outputs, outputs_bases, name) + if klass is None: + if name is None: + raise ValueError("name must be provided if klass is not") + bases = tuple(bases) + # Ensure that Task is a base class + if not any(issubclass(b, spec_type) for b in bases): + bases = bases + (spec_type,) + # If building from a decorated class (as opposed to dynamically from a function + # or shell-template), add any base classes not already in the bases tuple + if klass is not None: + bases += tuple(c for c in klass.__mro__ if c not in bases + (object,)) + # Create a new class with the Task as a base class + klass = types.new_class( + name=name, + bases=bases, + kwds={}, + exec_body=lambda ns: ns.update({"Outputs": outputs_klass}), + ) + else: + # Ensure that the class has it's own annotations dict so we can modify it without + # messing up other classes + klass.__annotations__ = copy(klass.__annotations__) + klass.Outputs = outputs_klass + # Now that we have saved the attributes in lists to be + for arg in inputs.values(): + # If an outarg input then the field type should be Path not a FileSet + attrs_kwargs = _get_attrs_kwargs(arg) + if isinstance(arg, Out) and is_fileset_or_union(arg.type): + if getattr(arg, "path_template", False): + if is_optional(arg.type): + field_type = Path | bool | None + if arg.mandatory: # provide default if one is not provided + attrs_kwargs["default"] = True if arg.requires else None + del attrs_kwargs["factory"] + else: + field_type = Path | bool + if arg.mandatory: # provide default if one is not provided + attrs_kwargs["default"] = True # use the template by default + del attrs_kwargs["factory"] + elif is_optional(arg.type): + field_type = Path | None + else: + field_type = Path + else: + field_type = arg.type + setattr( + klass, + arg.name, + attrs.field( + converter=make_converter(arg, klass.__name__, field_type), + validator=make_validator(arg, 
klass.__name__), + metadata={PYDRA_ATTR_METADATA: arg}, + on_setattr=attrs.setters.convert, + **attrs_kwargs, + ), + ) + # Store the xor sets for the class + klass._xor = xor + klass.__annotations__[arg.name] = field_type + + # Create class using attrs package, will create attributes for all columns and + # parameters + attrs_klass = attrs.define(auto_attribs=False, kw_only=True, eq=False, repr=False)( + klass + ) + + return attrs_klass + + +def build_outputs_class( + spec_type: type["Outputs"], + outputs: dict[str, Out], + bases: ty.Sequence[type], + spec_name: str, +) -> type["Outputs"]: + """Create an outputs class and its outputs class from the + output fields provided to the decorator/function. + + Creates a new class with attrs fields and then calls `attrs.define` to create an + attrs class (dataclass-like). + + Parameters + ---------- + outputs : dict[str, Out] + The output fields of the task + bases : ty.Sequence[type], optional + The base classes for the outputs class, by default () + spec_name : str + The name of the task class the outputs are for + + Returns + ------- + klass : type + The class created using the attrs package + """ + + if not any(issubclass(b, spec_type) for b in bases): + if out_spec_bases := [b for b in bases if issubclass(b, Outputs)]: + raise ValueError( + f"Cannot make {spec_type} output definition from {out_spec_bases} bases" + ) + outputs_bases = bases + (spec_type,) + if reserved_names := [n for n in outputs if n in spec_type.RESERVED_FIELD_NAMES]: + raise ValueError( + f"{reserved_names} are reserved and cannot be used for {spec_type} field names" + ) + # Add in any fields in base classes that haven't already been converted into attrs + # fields (e.g. stdout, stderr and return_code) + for base in outputs_bases: + base_outputs = { + n: o + for n, o in base.__dict__.items() + if isinstance(o, Out) and n not in outputs + } + for name, field in base_outputs.items(): + field.name = name + field.type = base.__annotations__.get(name, ty.Any) + outputs.update(base_outputs) + assert all(o.name == n for n, o in outputs.items()) + outputs_klass = type( + spec_name + "Outputs", + tuple(outputs_bases), + { + n: attrs.field( + converter=make_converter(o, f"{spec_name}.Outputs"), + metadata={PYDRA_ATTR_METADATA: o}, + **_get_attrs_kwargs(o), + ) + for n, o in outputs.items() + }, + ) + outputs_klass.__annotations__.update((o.name, o.type) for o in outputs.values()) + outputs_klass = attrs.define(auto_attribs=False, kw_only=True, eq=False)( + outputs_klass + ) + return outputs_klass + + +def make_converter( + field: Field, interface_name: str, field_type: ty.Type | None = None +) -> ty.Callable[..., ty.Any]: + """Makes an attrs converter for the field, combining type checking with any explicit + converters + + Parameters + ---------- + field : Field + The field to make the converter for + interface_name : str + The name of the interface the field is part of + field_type : type, optional + The type of the field, by default None + + Returns + ------- + converter : callable + The converter for the field + """ + if field_type is None: + field_type = field.type + checker_label = f"'{field.name}' field of {interface_name} interface" + type_checker = TypeParser[field_type]( + field_type, label=checker_label, superclass_auto_cast=True + ) + converters = [] + if field_type in (MultiInputObj, MultiInputFile): + converters.append(ensure_list) + elif field_type in (MultiOutputObj, MultiOutputFile): + converters.append(from_list_if_single) + if field.converter: + 
converters.append(field.converter) + if converters: + converters.append(type_checker) + converter = attrs.converters.pipe(*converters) + else: + converter = type_checker + return converter + + +def make_validator(field: Field, interface_name: str) -> ty.Callable[..., None] | None: + """Makes an attrs validator for the field, combining allowed values and any explicit + validators + + Parameters + ---------- + field : Field + The field to make the validator for + interface_name : str + The name of the interface the field is part of + + Returns + ------- + validator : callable + The validator for the field + """ + validators = [] + if field.allowed_values: + validators.append(allowed_values_validator) + if isinstance(field.validator, ty.Iterable): + validators.extend(field.validator) + elif field.validator: + validators.append(field.validator) + if len(validators) > 1: + return validators + elif validators: + return validators[0] + return None + + +def allowed_values_validator(_, attribute, value): + """checking if the values is in allowed_values""" + allowed = attribute.metadata[PYDRA_ATTR_METADATA].allowed_values + if value is attrs.NOTHING or is_lazy(value): + pass + elif value is None and is_optional(attribute.type): + pass + elif value not in allowed: + raise ValueError( + f"value of {attribute.name} has to be from {allowed}, but {value} provided" + ) + + +def _get_attrs_kwargs(field: Field) -> dict[str, ty.Any]: + kwargs = {} + if not field.mandatory: + kwargs["default"] = field.default + # elif is_optional(field.type): + # kwargs["default"] = None + else: + kwargs["factory"] = nothing_factory + if field.hash_eq: + kwargs["eq"] = hash_function + return kwargs + + +def nothing_factory(): + return attrs.NOTHING diff --git a/pydra/compose/base/field.py b/pydra/compose/base/field.py new file mode 100644 index 0000000000..1cf435b10a --- /dev/null +++ b/pydra/compose/base/field.py @@ -0,0 +1,366 @@ +import typing as ty +import enum +from typing import Self +import attrs.validators +from attrs.converters import default_if_none +from fileformats.core import to_mime +from fileformats.generic import File, FileSet +from pydra.utils.typing import TypeParser, is_optional, is_type, is_union +from pydra.utils.general import task_fields, wrap_text +import attrs + +if ty.TYPE_CHECKING: + from .task import Task + + +class _Empty(enum.Enum): + + NO_DEFAULT = enum.auto() + + def __repr__(self): + return "NO_DEFAULT" + + def __bool__(self): + return False + + +NO_DEFAULT = _Empty.NO_DEFAULT # To provide a blank placeholder for the default field + + +def convert_default_value(value: ty.Any, self_: "Field") -> ty.Any: + """Ensure the default value has been coerced into the correct type""" + if value is NO_DEFAULT or isinstance(value, attrs.Factory): + return value + if self_.type is ty.Callable and isinstance(value, ty.Callable): + return value + if isinstance(self_, Out) and TypeParser.contains_type(FileSet, self_.type): + return value + return TypeParser[self_.type](self_.type, label=self_.name)(value) + + +def allowed_values_converter(value: ty.Iterable[str] | None) -> list[str] | None: + """Ensure the allowed_values field is a list of strings or None""" + if value is None: + return None + return list(value) + + +@attrs.define +class Requirement: + """Define a requirement for a task input field + + Parameters + ---------- + name : str + The name of the input field that is required + allowed_values : list[str], optional + The allowed values for the input field that is required, if not provided any + value 
is allowed + """ + + name: str + allowed_values: list[str] | None = attrs.field( + default=None, converter=allowed_values_converter + ) + + def satisfied(self, inputs: "Task") -> bool: + """Check if the requirement is satisfied by the inputs""" + value = getattr(inputs, self.name) + field = {f.name: f for f in task_fields(inputs)}[self.name] + if value is None or field.type is bool and value is False: + return False + if self.allowed_values is None: + return True + return value in self.allowed_values + + @classmethod + def parse(cls, value: ty.Any) -> Self: + if isinstance(value, Requirement): + return value + elif isinstance(value, str): + return Requirement(value) + elif isinstance(value, tuple): + name, allowed_values = value + if isinstance(allowed_values, str) or not isinstance( + allowed_values, ty.Collection + ): + raise ValueError( + f"allowed_values must be a collection of strings, not {allowed_values}" + ) + return Requirement(name, allowed_values) + else: + raise ValueError( + f"Requirements must be a input field name, a tuple of an input field " + f"name and allowed values or a Requirement object, not {value!r}" + ) + + def __str__(self): + if not self.allowed_values: + return self.name + return f"{self.name}(" + ",".join(repr(v) for v in self.allowed_values) + ")" + + +def requirements_converter(value: ty.Any) -> list[Requirement]: + """Ensure the requires field is a list of Requirement objects""" + if isinstance(value, Requirement): + return [value] + elif isinstance(value, (str, tuple)): + try: + return [Requirement.parse(value)] + except ValueError as e: + e.add_note( + f"Parsing requirements specification {value!r} as a single requirement" + ) + raise e + try: + return [Requirement.parse(v) for v in value] + except ValueError as e: + e.add_note( + f"Parsing requirements specification {value!r} as a set of concurrent " + "requirements (i.e. logical AND)" + ) + raise e + + +@attrs.define +class RequirementSet: + """Define a set of requirements for a task input field, all of which must be satisfied""" + + requirements: list[Requirement] = attrs.field( + factory=list, + converter=requirements_converter, + ) + + def satisfied(self, inputs: "Task") -> bool: + """Check if all the requirements are satisfied by the inputs""" + return all(req.satisfied(inputs) for req in self.requirements) + + def __str__(self): + if len(self.requirements) == 1: + return str(self.requirements[0]) + return "+".join(str(r) for r in self.requirements) + + def __iter__(self): + return iter(self.requirements) + + def __iadd__(self, other: "RequirementSet | list[Requirement]") -> "RequirementSet": + self.requirements.extend(requirements_converter(other)) + return self + + +def requires_converter( + value: ( + str + | ty.Collection[ + Requirement | str | ty.Collection[str | tuple[str, ty.Collection[ty.Any]]] + ] + ), +) -> list[RequirementSet]: + """Ensure the requires field is a tuple of tuples""" + if isinstance(value, (str, tuple, Requirement)): + try: + return [RequirementSet(value)] + except ValueError as e: + e.add_note( + f"Parsing requirements set specification {value!r} as a single requirement set" + ) + raise e + try: + return [RequirementSet(v) for v in value] + except ValueError as e: + e.add_note( + f"Parsing requirements set specification {value!r} as a set of alternative " + "requirements (i.e. 
logical OR)" + ) + raise e + + +@attrs.define(kw_only=True) +class Field: + """Base class for input and output fields to tasks + + Parameters + ---------- + name: str, optional + The name of the field, used when specifying a list of fields instead of a mapping + type: type, optional + The type of the field, by default it is Any + from name to field, by default it is None + default : Any, optional + the default value for the field, by default it is NO_DEFAULT + help: str, optional + A short description of the input field. + requires: str | list[str | list[str] | Requirement], optional + The input fields that are required to be provided, along with the optional allowed + values, that are required together with the field. Can be provided + as a single name, a collection of names, a collection of collections of names, + or a collection of collection of name/allowed values pairs. + converter: callable, optional + The converter for the field passed through to the attrs.field, by default it is None + validator: callable | iterable[callable], optional + The validator(s) for the field passed through to the attrs.field, by default it is None + hash_eq: bool, optional + Whether to use the hash of the value for equality comparison, by default it is False + """ + + name: str | None = None + type: ty.Type[ty.Any] = attrs.field( + validator=is_type, default=ty.Any, converter=default_if_none(ty.Any) + ) + default: ty.Any = attrs.field( + default=NO_DEFAULT, + converter=attrs.Converter(convert_default_value, takes_self=True), + ) + help: str = "" + requires: list[RequirementSet] = attrs.field( + factory=list, converter=requires_converter + ) + converter: ty.Callable[..., ty.Any] | None = None + validator: ty.Callable[..., bool] | None = None + hash_eq: bool = False + + def requirements_satisfied(self, inputs: "Task") -> bool: + """Check if all the requirements are satisfied by the inputs""" + return any(req.satisfied(inputs) for req in self.requires) + + @property + def mandatory(self): + return self.default is NO_DEFAULT + + @requires.validator + def _requires_validator(self, _, value): + if value and self.type not in (ty.Any, bool) and not is_optional(self.type): + raise ValueError( + f"Fields with requirements must be of optional type (i.e. 
in union " + f"with None) or boolean, not type {self.type} ({self!r})" + ) + + def markdown_listing( + self, line_width: int = 79, help_indent: int = 4, **kwargs + ) -> str: + """Get the listing for the field in markdown-like format + + Parameters + ---------- + line_width: int + The maximum line width for the output, by default it is 79 + help_indent: int + The indentation for the help text, by default it is 4 + + Returns + ------- + str + The listing for the field in markdown-like format + """ + + def type_to_str(type_: ty.Type[ty.Any]) -> str: + if type_ is type(None): + return "None" + if is_union(type_): + return " | ".join( + type_to_str(t) for t in ty.get_args(type_) if t is not None + ) + try: + type_str = to_mime(type_, official=False) + except Exception: + if origin := ty.get_origin(type_): + type_str = f"{origin.__name__}[{', '.join(map(type_to_str, ty.get_args(type_)))}]" + else: + try: + type_str = type_.__name__ + except AttributeError: + type_str = str(type_) + return type_str + + s = f"- {self.name}: {type_to_str(self.type)}" + if isinstance(self.default, attrs.Factory): + s += f"; default-factory = {self.default.factory.__name__}()" + elif callable(self.default): + s += f"; default = {self.default.__name__}()" + elif not self.mandatory: + s += f"; default = {self.default!r}" + if self._additional_descriptors(**kwargs): + s += f" ({', '.join(self._additional_descriptors(**kwargs))})" + if self.help: + s += f"\n{wrap_text(self.help, width=line_width, indent_size=help_indent)}" + return s + + def _additional_descriptors(self, **kwargs) -> list[str]: + """Get additional descriptors for the field""" + return [] + + def __lt__(self, other: "Field") -> bool: + """Compare two fields based on their position""" + return self.name < other.name + + +@attrs.define(kw_only=True) +class Arg(Field): + """Base class for input fields of tasks + + Parameters + ---------- + name: str, optional + The name of the field, used when specifying a list of fields instead of a mapping + from name to field, by default it is None + type: type, optional + The type of the field, by default it is Any + default : Any, optional + the default value for the field, by default it is NO_DEFAULT + help: str + A short description of the input field. + requires: list, optional + Names of the inputs that are required together with the field. + allowed_values: Sequence, optional + List of allowed values for the field. 
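For reference, the shorthand forms accepted by the `requires` parameter are normalised by `requires_converter` into lists of `RequirementSet` objects. A minimal sketch of the equivalences, using invented field names (the assertions follow directly from the converters defined above):

```
from pydra.compose.base.field import Requirement, RequirementSet, requires_converter

# A bare field name becomes a single requirement that the named input is set
assert requires_converter("in_file") == [RequirementSet([Requirement("in_file")])]

# A (name, allowed_values) pair additionally restricts the values the required
# input may take
assert requires_converter(("method", ["linear", "nearest"])) == [
    RequirementSet([Requirement("method", ["linear", "nearest"])])
]

# A collection of collections is read as alternative requirement sets
# (logical OR of logical ANDs): either in_file and mask are both set, or in_json is
either = requires_converter([["in_file", "mask"], ["in_json"]])
assert [str(rs) for rs in either] == ["in_file+mask", "in_json"]
```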
+ copy_mode: File.CopyMode, optional + The mode of copying the file, by default it is File.CopyMode.any + copy_collation: File.CopyCollation, optional + The collation of the file, by default it is File.CopyCollation.any + copy_ext_decomp: File.ExtensionDecomposition, optional + The extension decomposition of the file, by default it is + File.ExtensionDecomposition.single + readonly: bool, optional + If True the input field can’t be provided by the user but it aggregates other + input fields (for example the fields with argstr: -o {fldA} {fldB}), by default + it is False + """ + + allowed_values: frozenset = attrs.field(default=(), converter=frozenset) + copy_mode: File.CopyMode = File.CopyMode.any + copy_collation: File.CopyCollation = File.CopyCollation.any + copy_ext_decomp: File.ExtensionDecomposition = File.ExtensionDecomposition.single + readonly: bool = False + + def _additional_descriptors(self, **kwargs) -> list[str]: + """Get additional descriptors for the field""" + descriptors = super()._additional_descriptors(**kwargs) + if self.allowed_values: + descriptors.append(f"allowed_values={self.allowed_values}") + return descriptors + + +@attrs.define(kw_only=True, slots=False) +class Out(Field): + """Base class for output fields of tasks + + Parameters + ---------- + name: str, optional + The name of the field, used when specifying a list of fields instead of a mapping + from name to field, by default it is None + type: type, optional + The type of the field, by default it is Any + default : Any, optional + the default value for the field, by default it is NO_DEFAULT + help: str, optional + A short description of the input field. + requires: list, optional + Names of the inputs that are required together with the field. + converter: callable, optional + The converter for the field passed through to the attrs.field, by default it is None + validator: callable | iterable[callable], optional + The validator(s) for the field passed through to the attrs.field, by default it is None + """ + + pass diff --git a/pydra/compose/base/helpers.py b/pydra/compose/base/helpers.py new file mode 100644 index 0000000000..ea9f3f0842 --- /dev/null +++ b/pydra/compose/base/helpers.py @@ -0,0 +1,476 @@ +import typing as ty +import inspect +import attrs +import re +from copy import copy +from pydra.utils.typing import is_type, is_optional +from pydra.utils.general import task_fields +from .field import Field, Arg, Out, NO_DEFAULT + + +if ty.TYPE_CHECKING: + from .task import Task, Outputs + + +def is_set(value: ty.Any) -> bool: + """Check if a value has been set.""" + return value not in (attrs.NOTHING, NO_DEFAULT) + + +def ensure_field_objects( + arg_type: type[Arg], + out_type: type[Out], + doc_string: str | None = None, + inputs: dict[str, Arg | type] | None = None, + outputs: dict[str, Out | type] | None = None, + input_helps: dict[str, str] | None = None, + output_helps: dict[str, str] | None = None, +) -> tuple[dict[str, Arg], dict[str, Out]]: + """Converts dicts containing input/output types into input/output, including any + help strings to the appropriate inputs and outputs + + Parameters + ---------- + arg_type : type + The type of the input fields + out_type : type + The type of the output fields + doc_string : str, optional + The docstring of the function or class + inputs : dict[str, Arg | type], optional + The inputs to the function or class + outputs : dict[str, Out | type], optional + The outputs of the function or class + input_helps : dict[str, str], optional + The help strings for the 
inputs + output_helps : dict[str, str], optional + The help strings for the outputs + + Returns + ------- + inputs : dict[str, Arg] + The input fields with help strings added + outputs : dict[str, Out] + The output fields with help strings added + """ + + for input_name, arg in list(inputs.items()): + if isinstance(arg, Arg): + if arg.name is None: + arg.name = input_name + elif arg.name != input_name: + raise ValueError( + "Name of the argument must be the same as the key in the " + f"dictionary. The argument name is {arg.name} and the key " + f"is {input_name}" + ) + else: + arg.name = input_name + if not arg.help: + arg.help = input_helps.get(input_name, "") + elif is_type(arg): + inputs[input_name] = arg_type( + type=arg, + name=input_name, + help=input_helps.get(input_name, ""), + ) + elif isinstance(arg, dict): + arg_kwds = copy(arg) + if "help" not in arg_kwds: + arg_kwds["help"] = input_helps.get(input_name, "") + inputs[input_name] = arg_type( + name=input_name, + **arg_kwds, + ) + else: + raise ValueError( + f"Input {input_name} must be an instance of {Arg}, a type, or a dictionary " + f" of keyword arguments to pass to {Arg}, not {arg}" + ) + + for output_name, out in list(outputs.items()): + if isinstance(out, Out): + if out.name is None: + out.name = output_name + elif out.name != output_name: + raise ValueError( + "Name of the argument must be the same as the key in the " + f"dictionary. The argument name is {out.name} and the key " + f"is {output_name}" + ) + else: + out.name = output_name + if not out.help: + out.help = output_helps.get(output_name, "") + elif is_type(out): + outputs[output_name] = out_type( + type=out, + name=output_name, + help=output_helps.get(output_name, ""), + ) + if is_optional(out): + outputs[output_name].default = None + elif isinstance(out, dict): + out_kwds = copy(out) + if "help" not in out_kwds: + out_kwds["help"] = output_helps.get(output_name, "") + outputs[output_name] = out_type( + name=output_name, + **out_kwds, + ) + elif isinstance(out, ty.Callable) and hasattr(out_type, "callable"): + outputs[output_name] = out_type( + name=output_name, + type=ty.get_type_hints(out).get("return", ty.Any), + callable=out, + help=re.split(r"\n\s*\n", out.__doc__)[0] if out.__doc__ else "", + ) + else: + raise ValueError( + f"Unrecognised value provided to outputs ({arg}), can be either {out_type} " + "type" + (" or callable" if hasattr(out_type, "callable") else "") + ) + + return inputs, outputs + + +def extract_function_inputs_and_outputs( + function: ty.Callable, + arg_type: type[Arg], + inputs: list[str | Arg] | dict[str, Arg | type] | None = None, + outputs: list[str | Out] | dict[str, Out | type] | type | None = None, +) -> tuple[dict[str, type | Arg], dict[str, type | Out]]: + """Extract input output types and output names from the function source if they + aren't explicitly + + Parameters + ---------- + function : callable + The function to extract the inputs and outputs from + arg_type : type + The type of the input fields + out_type : type + The type of the output fields + inputs : list[str | Arg] | dict[str, Arg | type] | None + The inputs to the function + outputs : list[str | Out] | dict[str, Out | type] | type | None + The outputs of the function + + Returns + ------- + inputs : dict[str, Arg] + The input fields extracted from the function + outputs : dict[str, Out] + The output fields extracted from the function + """ + # if undefined_symbols := get_undefined_symbols( + # function, exclude_signature_type_hints=True, ignore_decorator=True + 
# ): + # raise ValueError( + # f"The following symbols are not defined within the scope of the function " + # f"{function!r}, {undefined_symbols}. Ensure that all imports are " + # "defined within the function scope so it is portable" + # ) + sig = inspect.signature(function) + type_hints = ty.get_type_hints(function) + input_types = {} + input_defaults = {} + has_varargs = False + for p in sig.parameters.values(): + if p.kind is p.VAR_POSITIONAL or p.kind is p.VAR_KEYWORD: + has_varargs = True + continue + input_types[p.name] = type_hints.get(p.name, ty.Any) + if p.default is not inspect.Parameter.empty: + input_defaults[p.name] = p.default + if inputs: + if not isinstance(inputs, dict): + raise ValueError( + f"Input names ({inputs}) should not be provided when " + "wrapping/decorating a function as " + ) + if not has_varargs: + if unrecognised := set(inputs) - set(input_types): + raise ValueError( + f"Unrecognised input names ({unrecognised}) not present in the signature " + f"of the function {function!r}" + ) + for inpt_name, type_ in input_types.items(): + try: + inpt = inputs[inpt_name] + except KeyError: + inputs[inpt_name] = type_ + else: + if isinstance(inpt, Arg) and inpt.type is ty.Any: + inpt.type = type_ + else: + inputs = input_types + for inpt_name, default in input_defaults.items(): + inpt = inputs[inpt_name] + if isinstance(inpt, arg_type): + if inpt.mandatory: + inpt.default = default + elif inspect.isclass(inpt) or ty.get_origin(inpt): + inputs[inpt_name] = arg_type(type=inpt, default=default) + else: + raise ValueError( + f"Unrecognised input type ({inpt}) for input {inpt_name} with default " + f"value {default}" + ) + return_type = type_hints.get("return", ty.Any) + if outputs and len(outputs) > 1: + if return_type is not ty.Any: + if ty.get_origin(return_type) is not tuple: + raise ValueError( + f"Multiple outputs specified ({outputs}) but non-tuple " + f"return value {return_type}" + ) + return_types = ty.get_args(return_type) + if len(return_types) != len(outputs): + raise ValueError( + f"Length of the outputs ({outputs}) does not match that " + f"of the return types ({return_types})" + ) + output_types = dict(zip(outputs, return_types)) + else: + output_types = {o: ty.Any for o in outputs} + if isinstance(outputs, dict): + for output_name, output in outputs.items(): + if isinstance(output, Out) and output.type is ty.Any: + output.type = output_types[output_name] + else: + outputs = output_types + + elif outputs: + if isinstance(outputs, dict): + output_name, output = next(iter(outputs.items())) + elif isinstance(outputs, list): + output_name = outputs[0] + output = ty.Any + if isinstance(output, Out): + if output.type is ty.Any: + output.type = return_type + elif output is ty.Any: + output = return_type + outputs = {output_name: output} + else: + outputs = {"out": return_type} + return inputs, outputs + + +def parse_doc_string(doc_str: str) -> tuple[dict[str, str], dict[str, str] | list[str]]: + """Parse the docstring to pull out the description of the parameters/args and returns + + Parameters + ----------- + doc_string + the doc string to parse + + Returns + ------- + input_helps + the documentation for each of the parameter/args of the class/function + output_helps + the documentation for each of the return values of the class function, if no + names are provided then the help strings are returned as a list + """ + input_helps = {} + output_helps = {} + if doc_str is None: + return input_helps, output_helps + for param, param_help in re.findall(r":param (\w+): 
(.*)", doc_str): + input_helps[param] = param_help + for return_val, return_help in re.findall(r":return (\w+): (.*)", doc_str): + output_helps[return_val] = return_help + google_args_match = re.match( + r"(?:.*\n)?\s*Args:\n(.*)", doc_str, flags=re.DOTALL | re.MULTILINE + ) + google_returns_match = re.match( + r"(?:.*\n)?\s*Returns:\n(.*)", doc_str, flags=re.DOTALL | re.MULTILINE + ) + if google_args_match: + args_str = google_args_match.group(1) + for arg_str in split_block(args_str): + arg_name, arg_help = arg_str.split(":", maxsplit=1) + arg_name = arg_name.strip() + arg_help = white_space_re.sub(" ", arg_help).strip() + input_helps[arg_name] = arg_help + if google_returns_match: + returns_str = google_returns_match.group(1) + for return_str in split_block(returns_str): + return_name, return_help = return_str.split(":", maxsplit=1) + return_name = return_name.strip() + return_help = white_space_re.sub(" ", return_help).strip() + output_helps[return_name] = return_help + numpy_args_match = re.match( + r"(?:.*\n)?\s+Parameters\n\s*----------\s*\n(.*)", + doc_str, + flags=re.DOTALL | re.MULTILINE, + ) + numpy_returns_match = re.match( + r"(?:.*\n)?\s+Returns\n\s*-------\s*\n(.*)", + doc_str, + flags=re.DOTALL | re.MULTILINE, + ) + if numpy_args_match: + args_str = numpy_args_match.group(1) + for arg_str in split_block(args_str): + arg_decl, arg_help = arg_str.split("\n", maxsplit=1) + arg_name = arg_decl.split(":")[0].strip() + arg_help = white_space_re.sub(" ", arg_help).strip() + input_helps[arg_name] = arg_help + if numpy_returns_match: + returns_str = numpy_returns_match.group(1) + for return_str in split_block(returns_str): + return_decl, return_help = return_str.split("\n", maxsplit=1) + return_name = return_decl.split(":")[0].strip() + return_help = white_space_re.sub(" ", return_help).strip() + output_helps[return_name] = return_help + return input_helps, output_helps + + +def split_block(string: str) -> ty.Generator[str, None, None]: + """Split a block of text into groups lines""" + indent_re = re.compile(r"^\s*") + leading_indent = indent_re.match(string).group() + leading_indent_len = len(leading_indent) + block = "" + for line in string.split("\n"): + if not line.strip(): + break + indent_len = len(indent_re.match(line).group()) + if block and indent_len == leading_indent_len: + yield block.strip() + block = "" + block += line + "\n" + if indent_len < leading_indent_len: + raise ValueError( + f"Indentation block is not consistent in docstring:\n{string}" + ) + if block: + yield block.strip() + + +def check_explicit_fields_are_none(klass, inputs, outputs): + if inputs is not None: + raise ValueError( + f"inputs should not be provided to `python.task` ({inputs}) " + f"explicitly when decorated a class ({klass})" + ) + if outputs is not None: + raise ValueError( + f"outputs should not be provided to `python.task` ({outputs}) " + f"explicitly when decorated a class ({klass})" + ) + + +def extract_fields_from_class( + spec_type: type["Task"], + outputs_type: type["Outputs"], + klass: type, + arg_type: type[Arg], + out_type: type[Out], + auto_attribs: bool, + skip_fields: ty.Iterable[str] = (), +) -> tuple[dict[str, Arg], dict[str, Out]]: + """Extract the input and output fields from an existing class + + Parameters + ---------- + klass : type + The class to extract the fields from + arg_type : type + The type of the input fields + out_type : type + The type of the output fields + auto_attribs : bool + Whether to assume that all attribute annotations should be interpreted as + 
fields or not + skip_fields : Iterable[str], optional + The names of attributes to skip when extracting the fields, by default () + + Returns + ------- + inputs : dict[str, Arg] + The input fields extracted from the class + outputs : dict[str, Out] + The output fields extracted from the class + """ + + input_helps, _ = parse_doc_string(klass.__doc__) + + def get_fields(klass, field_type, auto_attribs, helps) -> dict[str, Field]: + """Get the fields from a class""" + fields_dict = {} + # Get fields defined in base classes if present + for field in task_fields(klass): + if field.name not in skip_fields: + fields_dict[field.name] = field + type_hints = ty.get_type_hints(klass) + for atr_name in dir(klass): + if ( + atr_name == "Outputs" + or atr_name in skip_fields + or atr_name.startswith("__") + ): + continue + try: + atr = getattr(klass, atr_name) + except Exception: + continue + if isinstance(atr, Field): + atr.name = atr_name + fields_dict[atr_name] = atr + if atr_name in type_hints: + atr.type = type_hints[atr_name] + if not atr.help: + atr.help = helps.get(atr_name, "") + elif atr_name in type_hints: + if atr_name.startswith("_"): + continue + if atr_name in fields_dict: + fields_dict[atr_name].type = type_hints[atr_name] + elif auto_attribs: + fields_dict[atr_name] = field_type( + name=atr_name, + type=type_hints[atr_name], + default=atr, + help=helps.get(atr_name, ""), + ) + if auto_attribs: + for atr_name, type_ in type_hints.items(): + if atr_name.startswith("_") or atr_name in skip_fields: + continue + if atr_name not in list(fields_dict) + ["Outputs"]: + fields_dict[atr_name] = field_type( + name=atr_name, type=type_, help=helps.get(atr_name, "") + ) + return fields_dict + + if not issubclass(klass, spec_type): + raise ValueError( + f"When using the canonical form for {spec_type.__module__.split('.')[-1]} " + f"tasks, {klass} must inherit from {spec_type}" + ) + + inputs = get_fields(klass, arg_type, auto_attribs, input_helps) + + try: + outputs_klass = klass.Outputs + except AttributeError: + raise AttributeError( + f"Nested Outputs class not found in {klass.__name__}" + ) from None + if not issubclass(outputs_klass, outputs_type): + raise ValueError( + f"When using the canonical form for {outputs_type.__module__.split('.')[-1]} " + f"task outputs {outputs_klass}, you must inherit from {outputs_type}" + ) + + output_helps, _ = parse_doc_string(outputs_klass.__doc__) + outputs = get_fields(outputs_klass, out_type, auto_attribs, output_helps) + + return inputs, outputs + + +white_space_re = re.compile(r"\s+") diff --git a/pydra/compose/base/task.py b/pydra/compose/base/task.py new file mode 100644 index 0000000000..29de92cb6e --- /dev/null +++ b/pydra/compose/base/task.py @@ -0,0 +1,620 @@ +import typing as ty +import re +from pathlib import Path +from copy import copy +from typing import Self +import attrs.validators +from pydra.utils.typing import is_optional, is_fileset_or_union +from pydra.utils.general import task_fields +from pydra.utils.typing import StateArray, is_lazy +from pydra.utils.hash import hash_function +import os +import itertools +from collections import Counter +import attrs +import cloudpickle as cp +from pydra.utils.messenger import AuditFlag, Messenger +from pydra.utils.general import ( + attrs_fields, + attrs_values, +) +from pydra.utils.hash import Cache, hash_single, register_serializer +from .field import Field, Arg, Out + + +if ty.TYPE_CHECKING: + from pydra.engine.job import Job + from pydra.environments.base import Environment + from 
pydra.workers.base import Worker + from pydra.engine.result import Result + from pydra.engine.hooks import TaskHooks + +TaskType = ty.TypeVar("TaskType", bound="Task") + + +@attrs.define(kw_only=True, auto_attribs=False, eq=False, repr=False) +class Outputs: + """Base class for all output definitions""" + + RESERVED_FIELD_NAMES = ("inputs",) + + _cache_dir: Path = attrs.field(default=None, init=False, repr=False) + + @property + def inputs(self): + """The inputs object associated with a lazy-outputs object""" + return self._get_node().inputs + + @classmethod + def _from_task(cls, job: "Job[TaskType]") -> Self: + """Collect the outputs of a job. This is just an abstract base method that + should be used by derived classes to set default values for the outputs. + + Parameters + ---------- + job : Job[TaskType] + The job whose outputs are being collected. + + Returns + ------- + outputs : Outputs + The outputs of the job + """ + defaults = {} + for output in task_fields(cls): + if output.mandatory: + default = attrs.NOTHING + elif isinstance(output.default, attrs.Factory): + default = output.default.factory() + else: + default = output.default + defaults[output.name] = default + outputs = cls(**defaults) + outputs._cache_dir = job.cache_dir + return outputs + + @property + def _results(self) -> "Result[Self]": + results_path = self._cache_dir / "_job.pklz" + if not results_path.exists(): + raise FileNotFoundError(f"Job results file {results_path} not found") + with open(results_path, "rb") as f: + return cp.load(f) + + def _get_node(self): + try: + return self._node + except AttributeError: + raise AttributeError( + f"{self} outputs object is not a lazy output of a workflow node" + ) from None + + def __iter__(self) -> ty.Generator[str, None, None]: + """The names of the fields in the output object""" + return iter(sorted(f.name for f in attrs_fields(self))) + + def __getitem__(self, name_or_index: str | int) -> ty.Any: + """Return the value for the given attribute + + Parameters + ---------- + name : str + the name of the attribute to return + + Returns + ------- + Any + the value of the attribute + """ + if isinstance(name_or_index, int): + return list(self)[name_or_index] + try: + return getattr(self, name_or_index) + except AttributeError: + raise KeyError( + f"{self} doesn't have an attribute {name_or_index}" + ) from None + + def __eq__(self, other: ty.Any) -> bool: + """Check if two tasks are equal""" + values = attrs.asdict(self) + fields = task_fields(self) + try: + other_values = attrs.asdict(other) + except AttributeError: + return False + try: + other_fields = task_fields(other) + except AttributeError: + return False + if fields != other_fields: + return False + for field in task_fields(self): + if field.hash_eq: + values[field.name] = hash_function(values[field.name]) + other_values[field.name] = hash_function(other_values[field.name]) + return values == other_values + + def __repr__(self) -> str: + """A string representation of the task""" + fields_str = ", ".join( + f"{f.name}={getattr(self, f.name)!r}" + for f in task_fields(self) + if getattr(self, f.name) != f.default + ) + return f"{self.__class__.__name__}({fields_str})" + + +OutputsType = ty.TypeVar("OutputType", bound=Outputs) + + +@attrs.define(kw_only=True, auto_attribs=False, eq=False, repr=False) +class Task(ty.Generic[OutputsType]): + """Base class for all tasks""" + + # Class attributes + _xor: frozenset[frozenset[str | None]] = ( + frozenset() + ) # overwritten in derived classes + + # The following fields are 
used to store split/combine state information + _splitter = attrs.field(default=None, init=False, repr=False) + _combiner = attrs.field(default=None, init=False, repr=False) + _container_ndim = attrs.field(default=None, init=False, repr=False) + _hashes = attrs.field(default=None, init=False, eq=False, repr=False) + + RESERVED_FIELD_NAMES = ("split", "combine") + + def __call__( + self, + /, + cache_root: os.PathLike | None = None, + worker: "str | ty.Type[Worker] | Worker" = "debug", + environment: "Environment | None" = None, + rerun: bool = False, + readonly_caches: ty.Iterable[os.PathLike] | None = None, + audit_flags: AuditFlag = AuditFlag.NONE, + messengers: ty.Iterable[Messenger] | None = None, + messenger_args: dict[str, ty.Any] | None = None, + hooks: "TaskHooks | None" = None, + **kwargs: ty.Any, + ) -> OutputsType: + """Create a job from this task and execute it to produce a result. + + Parameters + ---------- + cache_root : os.PathLike, optional + Cache directory where the working directory/results for the job will be + stored, by default None + worker : str or Worker, optional + The worker to use, by default "cf" + environment: Environment, optional + The execution environment to use, by default None + rerun : bool, optional + Whether to force the re-computation of the job results even if existing + results are found, by default False + readonly_caches : list[os.PathLike], optional + Alternate cache locations to check for pre-computed results, by default None + audit_flags : AuditFlag, optional + Auditing configuration, by default AuditFlag.NONE + messengers : list, optional + Messengers, by default None + messenger_args : dict, optional + Messenger arguments, by default None + **kwargs : dict + Keyword arguments to pass on to the worker initialisation + + Returns + ------- + OutputsType or list[OutputsType] + The output interface of the job, or in the case of split tasks, a list of + output interfaces + """ + from pydra.engine.submitter import ( # noqa: F811 + Submitter, + WORKER_KWARG_FAIL_NOTE, + ) + + try: + with Submitter( + audit_flags=audit_flags, + cache_root=cache_root, + readonly_caches=readonly_caches, + messenger_args=messenger_args, + messengers=messengers, + environment=environment, + worker=worker, + **kwargs, + ) as sub: + result = sub( + self, + hooks=hooks, + rerun=rerun, + ) + except TypeError as e: + # Catch any inadvertent passing of task parameters to the + # execution call + if hasattr(e, "__notes__") and WORKER_KWARG_FAIL_NOTE in e.__notes__: + if match := re.match( + r".*got an unexpected keyword argument '(\w+)'", str(e) + ): + if match.group(1) in self: + e.add_note( + f"Note that the unrecognised argument, {match.group(1)!r}, is " + f"an input of the task {self!r} that has already been " + f"parameterised (it is being called to execute it)" + ) + raise + if result.errored: + if result.errors: + time_of_crash = result.errors["time of crash"] + error_message = "\n".join(result.errors["error message"]) + else: + time_of_crash = "UNKNOWN-TIME" + error_message = "NOT RETRIEVED" + raise RuntimeError( + f"Job {self} failed @ {time_of_crash} with the " + f"following errors:\n{error_message}\n" + "To inspect, please load the pickled job object from here: " + f"{result.cache_dir}/_job.pklz" + ) + return result.outputs + + def split( + self, + splitter: ty.Union[str, ty.List[str], ty.Tuple[str, ...], None] = None, + /, + overwrite: bool = False, + container_ndim: ty.Optional[dict] = None, + **inputs, + ) -> Self: + """ + Run this job parametrically over lists 
of split inputs. + + Parameters + ---------- + splitter : str or list[str] or tuple[str] or None + the fields which to split over. If splitting over multiple fields, lists of + fields are interpreted as outer-products and tuples inner-products. If None, + then the fields to split are taken from the keyword-arg names. + overwrite : bool, optional + whether to overwrite an existing split on the node, by default False + container_ndim : dict, optional + Container dimensions for specific inputs, used in the splitter. + If input name is not in container_ndim, it is assumed that the input values has + a container dimension of 1, so only the most outer dim will be used for splitting. + **inputs + fields to split over, will be automatically wrapped in a StateArray object + and passed to the node inputs + + Returns + ------- + self : TaskBase + a reference to the job + """ + from pydra.engine.state import unwrap_splitter + from pydra.engine import lazy + + if self._splitter and not overwrite: + raise ValueError( + f"Cannot overwrite existing splitter {self._splitter} on {self}, " + "set 'overwrite=True' to do so" + ) + if splitter: + unwraped_split = list(unwrap_splitter(splitter)) + if duplicated := [f for f, c in Counter(unwraped_split).items() if c > 1]: + raise ValueError(f"Splitter fields {duplicated} are duplicated") + split_names = set( + s for s in unwraped_split if not s.startswith("_") and "." not in s + ) + input_names = set(inputs) + if missing_inputs := list(split_names - input_names): + raise ValueError( + f"Splitter fields {missing_inputs} need to be provided as a keyword " + f"arguments to the split method (provided {list(inputs)})" + ) + if unrecognised_inputs := list(input_names - split_names): + raise ValueError( + f"Provided inputs {unrecognised_inputs} are not present in the " + f"splitter {splitter}" + ) + else: + # If no splitter is provided, use the names of the inputs as combinatorial splitter + split_names = splitter = list(inputs) + for field_name in container_ndim or []: + if field_name not in split_names: + raise ValueError( + f"Container dimension for {field_name} is provided but the field " + f"is not present in the inputs" + ) + split_inputs = {} + for name, value in inputs.items(): + if isinstance(value, lazy.LazyField): + split_val = value.split() + elif isinstance(value, ty.Iterable) and not isinstance( + value, (ty.Mapping, str) + ): + split_val = StateArray(value) + else: + raise TypeError( + f"Could not split {value!r} as it is not a sequence type" + ) + split_inputs[name] = split_val + split_def = attrs.evolve(self, **split_inputs) + split_def._splitter = splitter + split_def._container_ndim = container_ndim + return split_def + + def combine( + self, + combiner: ty.Union[ty.List[str], str], + overwrite: bool = False, + ) -> Self: + """ + Combine inputs parameterized by one or more previous tasks. + + Parameters + ---------- + combiner : list[str] or str + the field or list of inputs to be combined (i.e. 
not left split) after the + job has been run + overwrite : bool + whether to overwrite an existing combiner on the node + **kwargs : dict[str, Any] + values for the job that will be "combined" before they are provided to the + node + + Returns + ------- + self : Self + a reference to the outputs object + """ + if self._combiner and not overwrite: + raise ValueError( + f"Attempting to overwrite existing combiner {self._combiner} on {self}, " + "set 'overwrite=True' to do so" + ) + if isinstance(combiner, str): + combiner = [combiner] + local_names = set(c for c in combiner if "." not in c and not c.startswith("_")) + if unrecognised := local_names - set(self): + raise ValueError( + f"Combiner fields {unrecognised} are not present in the task" + ) + combined_def = copy(self) + combined_def._combiner = combiner + return combined_def + + def __repr__(self) -> str: + """A string representation of the task""" + fields_str = ", ".join( + f"{f.name}={getattr(self, f.name)!r}" + for f in task_fields(self) + if getattr(self, f.name) != f.default + ) + return f"{self.__class__.__name__}({fields_str})" + + def __iter__(self) -> ty.Generator[str, None, None]: + """Iterate through all the names in the task""" + return ( + f.name + for f in task_fields(self) + if not (f.name.startswith("_") or f.name in self.RESERVED_FIELD_NAMES) + ) + + def __eq__(self, other: ty.Any) -> bool: + """Check if two tasks are equal""" + values = attrs.asdict(self, recurse=False) + try: + other_values = attrs.asdict(other, recurse=False) + except AttributeError: + return False + if set(values) != set(other_values): + return False # Return if attribute keys don't match + for field in task_fields(self): + if field.hash_eq: + values[field.name] = hash_function(values[field.name]) + other_values[field.name] = hash_function(other_values[field.name]) + if values != other_values: + return False + hash_cache = Cache() + if hash_function(type(self), cache=hash_cache) != hash_function( + type(other), cache=hash_cache + ): + return False + try: + other_outputs = other.Outputs + except AttributeError: + return False + return hash_function(self.Outputs, cache=hash_cache) == hash_function( + other_outputs, cache=hash_cache + ) + + def __getitem__(self, name: str) -> ty.Any: + """Return the value for the given attribute, resolving any templates + + Parameters + ---------- + name : str + the name of the attribute to return + + Returns + ------- + Any + the value of the attribute + """ + try: + return getattr(self, name) + except AttributeError: + raise KeyError(f"{self} doesn't have an attribute {name}") from None + + @property + def _hash(self): + hsh, self._hashes = self._compute_hashes() + return hsh + + @property + def _checksum(self): + return f"{self._task_type}-{self._hash}" + + def _hash_changes(self): + """Detects any changes in the hashed values between the current inputs and the + previously calculated values""" + _, new_hashes = self._compute_hashes() + return [k for k, v in new_hashes.items() if v != self._hashes[k]] + + def _compute_hashes(self) -> ty.Tuple[bytes, ty.Dict[str, bytes]]: + """Compute a basic hash for any given set of fields.""" + inp_dict = {} + for field in task_fields(self): + if isinstance(field, Out): + continue # Skip output fields + # removing values that are not set from hash calculation + if getattr(self, field.name) is attrs.NOTHING: + continue + if getattr(field, "container_path", False): + continue + inp_dict[field.name] = getattr(self, field.name) + # Include the outputs class, just in case any names 
or types have changed + inp_dict["Outputs"] = self.Outputs + hash_cache = Cache() + field_hashes = { + k: hash_function(v, cache=hash_cache) for k, v in inp_dict.items() + } + return hash_function(sorted(field_hashes.items())), field_hashes + + def _rule_violations(self) -> list[str]: + """Check rules and returns a list of errors.""" + + field: Arg + errors = [] + for field in task_fields(self): + value = self[field.name] + + if is_lazy(value): + continue + + if ( + value is attrs.NOTHING + and not getattr(field, "path_template", False) + and not field.readonly + ): + errors.append(f"Mandatory field {field.name!r} is not set") + + # Raise error if any required field is unset. + if ( + not ( + value is None + or value is False + or ( + is_optional(field.type) + and is_fileset_or_union(field.type) + and value is True + ) + ) + and field.requires + and not any(rs.satisfied(self) for rs in field.requires) + ): + if len(field.requires) > 1: + qualification = ( + " at least one of the following requirements to be satisfied: " + ) + else: + qualification = "" + errors.append( + f"{field.name!r} requires{qualification} {[str(r) for r in field.requires]}" + ) + # Collect alternative fields associated with this field. + for xor_set in self._xor: + mutually_exclusive = {name: self[name] for name in xor_set if name} + are_set = [f"{n}={v!r}" for n, v in mutually_exclusive.items() if v] + if len(are_set) > 1: + errors.append( + f"Mutually exclusive fields ({', '.join(sorted(are_set))}) are set " + "together" + ) + elif not are_set and None not in xor_set: + errors.append( + "At least one of the mutually exclusive fields should be set: " + + ", ".join(f"{n}={v!r}" for n, v in mutually_exclusive.items()) + ) + return errors + + def _check_rules(self): + """Check if all rules are satisfied.""" + + attrs.validate(self) + + if errors := self._rule_violations(): + raise ValueError( + f"Found the following errors in job {self} task:\n" + "\n".join(errors) + ) + + @classmethod + def _check_arg_refs( + cls, + inputs: list[Arg], + outputs: list[Out], + xor: frozenset[frozenset[str | None]], + ) -> None: + """ + Checks if all fields referenced in requirements and xor are present in the inputs + are valid field names + """ + field: Field + input_names = set(inputs) + for field in itertools.chain(inputs.values(), outputs.values()): + if unrecognised := ( + set([r.name for s in field.requires for r in s]) - input_names + ): + raise ValueError( + "'Unrecognised' field names in referenced in the requirements " + f"of {field} " + str(list(unrecognised)) + ) + + for xor_set in xor: + if unrecognised := xor_set - (input_names | {None}): + raise ValueError( + f"'Unrecognised' field names in referenced in the xor {xor_set} " + + str(list(unrecognised)) + ) + for field_name in xor_set: + if field_name is None: # i.e. 
none of the fields being set is valid + continue + type_ = inputs[field_name].type + if type_ not in (ty.Any, bool) and not is_optional(type_): + raise ValueError( + f"Fields included in a 'xor' ({field_name!r}) must be of boolean " + f"or optional types, not type {type_}" + ) + + def _check_resolved(self): + """Checks that all the fields in the task have been resolved""" + if lazy_values := [n for n, v in attrs_values(self).items() if is_lazy(v)]: + raise ValueError( + f"Cannot execute {self} because the following fields " + f"still have lazy values {lazy_values}" + ) + + +# def set_none_default_if_optional(field: Field) -> None: +# if is_optional(field.type) and field.mandatory: +# field.default = None + + +@register_serializer +def bytes_repr_task(obj: Task, cache: Cache) -> ty.Iterator[bytes]: + yield f"task[{obj._task_type}]:(".encode() + for field in task_fields(obj): + yield f"{field.name}=".encode() + yield hash_single(getattr(obj, field.name), cache) + yield b"," + yield b"_splitter=" + yield hash_single(obj._splitter, cache) + yield b",_combiner=" + yield hash_single(obj._combiner, cache) + yield b",_container_ndim=" + yield hash_single(obj._container_ndim, cache) + yield b",_xor=" + yield hash_single(obj._xor, cache) + yield b")" diff --git a/pydra/compose/python.py b/pydra/compose/python.py new file mode 100644 index 0000000000..4ee374b386 --- /dev/null +++ b/pydra/compose/python.py @@ -0,0 +1,259 @@ +import typing as ty +import inspect +from typing import dataclass_transform +import attrs +from pydra.utils.general import task_fields, attrs_values +from pydra.compose import base +from pydra.compose.base import ( + ensure_field_objects, + build_task_class, + parse_doc_string, + extract_function_inputs_and_outputs, + check_explicit_fields_are_none, + extract_fields_from_class, +) + +if ty.TYPE_CHECKING: + from pydra.engine.job import Job + +__all__ = ["arg", "out", "define", "Task", "Outputs"] + + +@attrs.define +class arg(base.Arg): + """Argument of a Python task + + Parameters + ---------- + help: str + A short description of the input field. + default : Any, optional + the default value for the argument + allowed_values: list, optional + List of allowed values for the field. + requires: list, optional + Names of the inputs that are required together with the field. + copy_mode: File.CopyMode, optional + The mode of copying the file, by default it is File.CopyMode.any + copy_collation: File.CopyCollation, optional + The collation of the file, by default it is File.CopyCollation.any + copy_ext_decomp: File.ExtensionDecomposition, optional + The extension decomposition of the file, by default it is + File.ExtensionDecomposition.single + readonly: bool, optional + If True the input field can’t be provided by the user but it aggregates other + input fields (for example the fields with argstr: -o {fldA} {fldB}), by default + it is False + type: type, optional + The type of the field, by default it is Any + name: str, optional + The name of the field, used when specifying a list of fields instead of a mapping + from name to field, by default it is None + """ + + pass + + +@attrs.define +class out(base.Out): + """Output of a Python task + + Parameters + ---------- + name: str, optional + The name of the field, used when specifying a list of fields instead of a mapping + from name to field, by default it is None + type: type, optional + The type of the field, by default it is Any + help: str, optional + A short description of the input field. 
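To make the mutual-exclusion checks in `_check_arg_refs` and `_rule_violations` concrete, here is a hedged sketch of declaring an xor group of boolean flags through the `python.define` decorator defined below; the task and field names are invented, and including `None` in the group means it is also valid to leave every member unset:

```
from pydra.compose import python


@python.define(xor=["mean", "median", None])  # trailing None: neither flag is required
def Summarise(x: list[float], mean: bool = False, median: bool = False) -> float:
    """Return a summary statistic of a list of numbers"""
    import statistics

    if mean:
        return statistics.mean(x)
    if median:
        return statistics.median(x)
    return float("nan")


Summarise(x=[1.0, 2.0], mean=True)  # a valid parameterisation
Summarise(x=[1.0, 2.0])             # also valid, because None is in the xor group
# Setting both mean=True and median=True would be reported by _rule_violations
# when the task is executed
```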
+ requires: list, optional + Names of the inputs that are required together with the field. + converter: callable, optional + The converter for the field passed through to the attrs.field, by default it is None + validator: callable | iterable[callable], optional + The validator(s) for the field passed through to the attrs.field, by default it is None + position : int + The position of the output in the output list, allows for tuple unpacking of + outputs + """ + + pass + + +@dataclass_transform( + kw_only_default=True, + field_specifiers=(out,), +) +def outputs(wrapped): + """Decorator to specify the output fields of a shell command is a dataclass-style type""" + return wrapped + + +@dataclass_transform( + kw_only_default=True, + field_specifiers=(arg,), +) +def define( + wrapped: type | ty.Callable | None = None, + /, + inputs: list[str | arg] | dict[str, arg | type] | None = None, + outputs: list[str | out] | dict[str, out | type] | type | None = None, + bases: ty.Sequence[type] = (), + outputs_bases: ty.Sequence[type] = (), + auto_attribs: bool = True, + xor: ty.Sequence[str | None] | ty.Sequence[ty.Sequence[str | None]] = (), +) -> "Task": + """ + Create an interface for a function or a class. + + Parameters + ---------- + wrapped : type | callable | None + The function or class to create an interface for. + inputs : list[str | Arg] | dict[str, Arg | type] | None + The inputs to the function or class. + outputs : list[str | base.Out] | dict[str, base.Out | type] | type | None + The outputs of the function or class. + auto_attribs : bool + Whether to use auto_attribs mode when creating the class. + xor: Sequence[str | None] | Sequence[Sequence[str | None]], optional + Names of args that are exclusive mutually exclusive, which must include + the name of the current field. If this list includes None, then none of the + fields need to be set. 
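As a usage sketch of the inference described above (the function and field names are invented): the input help strings are taken from the Google-style docstring and the two output names are zipped against the elements of the returned tuple:

```
from pydra.compose import python


@python.define(outputs=["quotient", "remainder"])
def DivMod(a: int, b: int) -> tuple[int, int]:
    """Integer division with remainder

    Args:
        a: the dividend
        b: the divisor
    """
    return a // b, a % b


divmod_task = DivMod(a=7, b=3)
outputs = divmod_task()                     # execute with the default worker
print(outputs.quotient, outputs.remainder)  # -> 2 1
```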
+ + Returns + ------- + Task + The task class for the Python function + """ + + def make(wrapped: ty.Callable | type) -> Task: + if inspect.isclass(wrapped): + klass = wrapped + function = klass.function + name = klass.__name__ + check_explicit_fields_are_none(klass, inputs, outputs) + parsed_inputs, parsed_outputs = extract_fields_from_class( + Task, + Outputs, + klass, + arg, + out, + auto_attribs, + skip_fields=["function"], + ) + else: + if not isinstance(wrapped, ty.Callable): + raise ValueError( + f"wrapped must be a class or a function, not {wrapped!r}" + ) + klass = None + function = wrapped + input_helps, output_helps = parse_doc_string(function.__doc__) + inferred_inputs, inferred_outputs = extract_function_inputs_and_outputs( + function, arg, inputs, outputs + ) + name = function.__name__ + + parsed_inputs, parsed_outputs = ensure_field_objects( + arg_type=arg, + out_type=out, + inputs=inferred_inputs, + outputs=inferred_outputs, + input_helps=input_helps, + output_helps=output_helps, + ) + if "function" in parsed_inputs: + raise ValueError( + "The argument 'function' is reserved for a field to hold the function " + "to be wrapped" + ) + + parsed_inputs["function"] = arg( + name="function", type=ty.Callable, default=function, hash_eq=True + ) + + defn = build_task_class( + Task, + Outputs, + parsed_inputs, + parsed_outputs, + name=name, + klass=klass, + bases=bases, + outputs_bases=outputs_bases, + xor=xor, + ) + + return defn + + if wrapped is not None: + if not isinstance(wrapped, (ty.Callable, type)): + raise ValueError(f"wrapped must be a class or a callable, not {wrapped!r}") + return make(wrapped) + return make + + +@attrs.define(kw_only=True, auto_attribs=False, eq=False, repr=False) +class PythonOutputs(base.Outputs): + + @classmethod + def _from_task(cls, job: "Job[PythonTask]") -> ty.Self: + """Collect the outputs of a job from a combination of the provided inputs, + the objects in the output directory, and the stdout and stderr of the process. + + Parameters + ---------- + job : Job[Task] + The job whose outputs are being collected. 
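The `split` method defined on the base `Task` class applies equally to Python tasks. A brief, hedged sketch of mapping a task over a list input (the values are illustrative); a subsequent `.combine()` would collapse the split dimension again, which is mainly useful for feeding downstream nodes in a workflow:

```
from pydra.compose import python


@python.define
def Double(x: int) -> int:
    """Double a number"""
    return x * 2


# Map the task over three values of ``x``; each element runs as a separate job
doubled = Double().split(x=[1, 2, 3])
outputs = doubled()  # collected outputs, one entry per split element
```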
+ outputs_dict : dict[str, ty.Any] + The outputs of the job, as a dictionary + + Returns + ------- + outputs : Outputs + The outputs of the job in dataclass + """ + outputs = super()._from_task(job) + for name, val in job.return_values.items(): + setattr(outputs, name, val) + return outputs + + +PythonOutputsType = ty.TypeVar("OutputType", bound=PythonOutputs) + + +@attrs.define(kw_only=True, auto_attribs=False, eq=False, repr=False) +class PythonTask(base.Task[PythonOutputsType]): + + _task_type = "python" + + def _run(self, job: "Job[PythonTask]", rerun: bool = True) -> None: + # Prepare the inputs to the function + inputs = attrs_values(self) + del inputs["function"] + # Run the actual function + returned = self.function(**inputs) + # Collect the outputs and save them into the job.return_values dictionary + return_names = [f.name for f in task_fields(self.Outputs)] + if returned is None: + job.return_values = {nm: None for nm in return_names} + elif len(return_names) == 1: + # if only one element in the fields, everything should be returned together + job.return_values[return_names[0]] = returned + elif isinstance(returned, tuple) and len(return_names) == len(returned): + job.return_values.update(zip(return_names, returned)) + elif isinstance(returned, dict): + job.return_values.update( + {key: returned[key] for key in return_names if key in returned} + ) + else: + raise RuntimeError( + f"expected {len(return_names)} elements, but {returned} were returned" + ) + + +# Alias ShellTask to Task so we can refer to it by shell.Task +Task = PythonTask +Outputs = PythonOutputs diff --git a/pydra/compose/shell/__init__.py b/pydra/compose/shell/__init__.py new file mode 100644 index 0000000000..26d7c089b4 --- /dev/null +++ b/pydra/compose/shell/__init__.py @@ -0,0 +1,5 @@ +from .field import arg, out, outarg +from .builder import define +from .task import Task, Outputs + +__all__ = ["arg", "out", "outarg", "define", "Task", "Outputs"] diff --git a/pydra/compose/shell/builder.py b/pydra/compose/shell/builder.py new file mode 100644 index 0000000000..82ffeb3ca9 --- /dev/null +++ b/pydra/compose/shell/builder.py @@ -0,0 +1,589 @@ +"""Decorators and helper functions to create ShellTasks used in Pydra workflows""" + +from __future__ import annotations +import typing as ty +import re +import glob +from collections import defaultdict +import inspect +from copy import copy +import attrs +import builtins +from typing import dataclass_transform +from fileformats.core import from_mime +from fileformats import generic +from fileformats.core.exceptions import FormatRecognitionError +from pydra.utils.general import attrs_values +from pydra.compose.base import ( + Arg, + Out, + check_explicit_fields_are_none, + extract_fields_from_class, + ensure_field_objects, + build_task_class, + NO_DEFAULT, +) +from pydra.utils.typing import ( + is_fileset_or_union, + MultiInputObj, + TypeParser, + is_optional, +) +from . 
import field +from .task import Task, Outputs + + +@dataclass_transform( + kw_only_default=True, + field_specifiers=(field.out, field.outarg), +) +def outputs(wrapped): + """Decorator to specify the output fields of a shell command is a dataclass-style type""" + return wrapped + + +@dataclass_transform( + kw_only_default=True, + field_specifiers=(field.arg,), +) +def define( + wrapped: type | str | None = None, + /, + inputs: list[str | Arg] | dict[str, Arg | type] | None = None, + outputs: list[str | Out] | dict[str, Out | type] | type | None = None, + bases: ty.Sequence[type] = (), + outputs_bases: ty.Sequence[type] = (), + auto_attribs: bool = True, + name: str | None = None, + xor: ty.Sequence[str | None] | ty.Sequence[ty.Sequence[str | None]] = (), +) -> "Task": + """Create a task for a shell command. Can be used either as a decorator on + the "canonical" dataclass-form of a task or as a function that takes a + "shell-command template string" of the form + + ``` + shell.define("command --output ") + ``` + + Fields are inferred from the template if not provided. In the template, inputs are + specified with `` and outputs with ``. + + ``` + my_command + ``` + + The types of the fields can be specified using their MIME like (see fileformats.core.from_mime), e.g. + + ``` + my_command + ``` + + The template can also specify options with `-` or `--` followed by the option name + and arguments with ``. The type is optional and will default to + `generic/fs-object` if not provided for arguments and `field/text` for + options. The file-formats namespace can be dropped for generic and field formats, e.g. + + ``` + another-command --output + ``` + + Parameters + ---------- + wrapped : type | str | None + The class or command line template to create an interface for + inputs : list[str | Arg] | dict[str, Arg | type] | None + The input fields of the shell command + outputs : list[str | Out] | dict[str, Out | type] | type | None + The output fields of the shell command + auto_attribs : bool + Whether to use auto_attribs mode when creating the class + args_last : bool + Whether to put the executable argument last in the command line instead of first + as they appear in the template + name: str | None + The name of the returned class + xor: Sequence[str | None] | Sequence[Sequence[str | None]], optional + Names of args that are exclusive mutually exclusive, which must include + the name of the current field. If this list includes None, then none of the + fields need to be set. 
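As a usage sketch of the template grammar (the command and field names are invented, and the exact angle-bracket placeholder tokens are an assumption): positional inputs are written as `<name>`, outputs as `<out|name>`, and option arguments may carry a MIME-like type such as `field/integer`:

```
from pydra.compose import shell
from pydra.utils.general import task_fields

TrimFile = shell.define(
    "trim-file <in_file> --n-lines <n_lines:field/integer> <out|trimmed>"
)

print(sorted(f.name for f in task_fields(TrimFile)))
# expected to include 'executable', 'in_file', 'n_lines' and 'trimmed', alongside
# fields inherited from the base shell Task class
```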
+ + Returns + ------- + Task + The interface for the shell command + """ + + def make( + wrapped: ty.Callable | type | None = None, + ) -> Task: + + if inspect.isclass(wrapped): + klass = wrapped + executable: str + try: + executable = attrs.fields(klass).executable.default + except (AttributeError, attrs.exceptions.NotAnAttrsClassError): + try: + executable = klass.executable + except AttributeError: + raise AttributeError( + f"Shell task class {wrapped} must have an `executable` " + "attribute that specifies the command to run" + ) from None + if not isinstance(executable, str) and not ( + isinstance(executable, ty.Sequence) + and all(isinstance(e, str) for e in executable) + ): + raise ValueError( + "executable must be a string or a sequence of strings" + f", not {executable!r}" + ) + class_name = klass.__name__ + check_explicit_fields_are_none(klass, inputs, outputs) + parsed_inputs, parsed_outputs = extract_fields_from_class( + Task, + Outputs, + klass, + field.arg, + field.out, + auto_attribs, + skip_fields=["executable"], + ) + else: + if not isinstance(wrapped, (str, list)): + raise ValueError( + f"wrapped must be a class or a string, not {wrapped!r}" + ) + klass = None + input_helps, output_helps = {}, {} + + executable, inferred_inputs, inferred_outputs = parse_command_line_template( + wrapped, + inputs=inputs, + outputs=outputs, + ) + + parsed_inputs, parsed_outputs = ensure_field_objects( + arg_type=field.arg, + out_type=field.out, + inputs=inferred_inputs, + outputs=inferred_outputs, + input_helps=input_helps, + output_helps=output_helps, + ) + + if name: + class_name = name + else: + class_name = ( + "_".join(executable) if isinstance(executable, list) else executable + ) + class_name = re.sub(r"[^\w]", "_", class_name) + if class_name[0].isdigit(): + class_name = f"_{class_name}" + + # Add in fields from base classes + parsed_inputs.update({n: getattr(Task, n) for n in Task.BASE_NAMES}) + parsed_outputs.update({n: getattr(Outputs, n) for n in Outputs.BASE_NAMES}) + + if "executable" in parsed_inputs: + raise ValueError( + "The argument 'executable' is reserved for a field to hold the command " + "to be run" + ) + + # Update the inputs (overriding inputs from base classes) with the executable + # and the output argument fields + parsed_inputs.update( + {o.name: o for o in parsed_outputs.values() if isinstance(o, field.arg)} + ) + parsed_inputs["executable"] = field.arg( + name="executable", + type=str | ty.Sequence[str], + argstr="", + position=0, + default=executable, + validator=attrs.validators.min_len(1), + help=Task.EXECUTABLE_HELP, + ) + + # Set positions for the remaining inputs that don't have an explicit position + position_stack = remaining_positions(list(parsed_inputs.values())) + for inpt in parsed_inputs.values(): + if inpt.name == "append_args": + continue + if inpt.position is None: + inpt.position = position_stack.pop(0) + + # Convert string default values to callables that glob the files in the cwd + for outpt in parsed_outputs.values(): + if ( + isinstance(outpt, field.out) + and isinstance(outpt.default, str) + and TypeParser.contains_type(generic.FileSet, outpt.type) + ): + outpt.callable = GlobCallable(outpt.default) + outpt.default = NO_DEFAULT + + defn = build_task_class( + Task, + Outputs, + parsed_inputs, + parsed_outputs, + name=class_name, + klass=klass, + bases=bases, + outputs_bases=outputs_bases, + xor=xor, + ) + return defn + + # If a name is provided (and hence not being used as a decorator), check to see if + # we are extending from a class 
that already defines an executable + if wrapped is None and name is not None: + for base in bases: + try: + wrapped = attrs.fields(base).executable.default + except (AttributeError, attrs.exceptions.NotAnAttrsClassError): + try: + wrapped = base.executable + except AttributeError: + pass + if wrapped: + break + if wrapped is None: + raise ValueError( + f"name ({name!r}) can only be provided when creating a class " + "dynamically, i.e. not using it as a decorator. Check to see " + "whether you have forgotten to provide the command line template" + ) + # If wrapped is provided (i.e. this is not being used as a decorator), return the + # interface class + if wrapped is not None: + if not isinstance(wrapped, (type, str, list)): + raise ValueError( + f"wrapped must be a class, a string or a list, not {wrapped!r}" + ) + return make(wrapped) + return make + + +def parse_command_line_template( + template: str, + inputs: list[str | Arg] | dict[str, Arg | type] | None = None, + outputs: list[str | Out] | dict[str, Out | type] | None = None, +) -> ty.Tuple[str, dict[str, Arg | type], dict[str, Out | type]]: + """Parses a command line template into a name and input and output fields. Fields + are inferred from the template if not explicitly provided. + + In the template, inputs are specified with `` and outputs with ``. + The types of the fields can be specified using their MIME like (see fileformats.core.from_mime), e.g. + + ``` + my_command + ``` + + The template can also specify options with `-` or `--` + followed by the option name and arguments with ``. The type is optional and + will default to `generic/fs-object` if not provided for arguments and `field/text` for + options. The file-formats namespace can be dropped for generic and field formats, e.g. + + ``` + another-command --output + ``` + + Parameters + ---------- + template : str + The command line template + inputs : list[str | Arg] | dict[str, Arg | type] | None + The input fields of the shell command + outputs : list[str | Out] | dict[str, Out | type] | type | None + The output fields of the shell command + + Returns + ------- + executable : str + The name of the command line template + inputs : dict[str, Arg | type] + The input fields of the command line template + outputs : dict[str, Out | type] + The output fields of the command line template + + Raises + ------ + ValueError + If an unknown token is found in the command line template + TypeError + If an unknown type is found in the command line template + """ + if isinstance(inputs, list): + inputs = {arg.name: arg for arg in inputs} + elif isinstance(inputs, dict): + inputs = copy(inputs) # We don't want to modify the original + else: + assert inputs is None + inputs = {} + if isinstance(outputs, list): + outputs = {o.name: o for o in outputs} + elif isinstance(outputs, dict): + outputs = copy(outputs) # We don't want to modify the original + else: + assert outputs is None + outputs = {} + if isinstance(template, list): + tokens = template + else: + tokens = template.split() + executable = [] + start_args_index = 0 + for part in tokens: + if part.startswith("<") or part.startswith("-"): + break + executable.append(part) + start_args_index += 1 + if not executable: + raise ValueError(f"Found no executable in command line template: {template}") + if len(executable) == 1: + executable = executable[0] + tokens = tokens[start_args_index:] + if not tokens: + return executable, inputs, outputs + arg_pattern = r"<([:a-zA-Z0-9_,\|\-\.\/\+\*]+(?:\?|(?:=|\$)[^>]+)?)>" + opt_pattern = 
r"--?[a-zA-Z0-9_]+" + arg_re = re.compile(arg_pattern) + opt_re = re.compile(opt_pattern) + bool_arg_re = re.compile(f"({opt_pattern}){arg_pattern}") + + arguments = [] + option = None + + def add_arg(name, field_type, kwds): + """Merge the typing information with an existing field if it exists""" + if issubclass(field_type, Out): + dct = outputs + else: + dct = inputs + try: + fld = dct.pop(name) + except KeyError: + fld = field_type(name=name, **kwds) + else: + if isinstance(fld, dict): + fld = field_type(**fld) + elif isinstance(fld, type) or ty.get_origin(fld): + kwds["type"] = fld + fld = field_type(name=name, **kwds) + elif not isinstance(fld, field_type): # If fld type is outarg not out + fld = field_type(**attrs_values(fld)) + fld.name = name + type_ = kwds.pop("type", fld.type) + if fld.type is ty.Any: + fld.type = type_ + for k, v in kwds.items(): + setattr(fld, k, v) + dct[name] = fld + if issubclass(field_type, Arg): + arguments.append(fld) + + def from_type_str(type_str) -> type: + types = [] + for tp in type_str.split(","): + if "/" in tp: + type_ = from_mime(tp) + elif tp == "...": + type_ = "..." + else: + if tp in ("int", "float", "str", "bool"): + type_ = getattr(builtins, tp) + else: + try: + type_ = from_mime(f"generic/{tp}") + except FormatRecognitionError: + raise TypeError( + f"Found unknown type, {tp!r}, in command template: {template!r}" + ) from None + types.append(type_) + if len(types) == 2 and types[1] == "...": + type_ = tuple[types[0], ...] + elif len(types) > 1: + type_ = tuple[*types] + else: + type_ = types[0] + return type_ + + for token in tokens: + if match := arg_re.match(token): + name = match.group(1) + modify = False + if name.startswith("out|"): + name = name[4:] + field_type = field.outarg + elif name.startswith("modify|"): + name = name[7:] + field_type = field.arg + modify = True + else: + field_type = field.arg + # Identify type after ':' symbols + kwds = {} + is_multi = False + optional = False + if name.endswith("?"): + assert "=" not in name + name = name[:-1] + optional = True + kwds["default"] = None + elif name.endswith("+"): + is_multi = True + name = name[:-1] + elif name.endswith("*"): + is_multi = True + name = name[:-1] + kwds["default"] = attrs.Factory(list) + elif "=" in name: + name, default = name.split("=") + kwds["default"] = ( + default[1:-1] if re.match(r"('|\").*\1", default) else eval(default) + ) + elif "$" in name: + name, path_template = name.split("$") + kwds["path_template"] = path_template + if field_type is not field.outarg: + raise ValueError( + f"Path templates can only be used with output fields, not {token}" + ) + if ":" in name: + name, type_str = name.split(":") + type_ = from_type_str(type_str) + if ty.get_origin(type_) is tuple: + kwds["sep"] = " " + else: + type_ = generic.FsObject if option is None else str + if is_multi: + type_ = MultiInputObj[type_] + if optional: + type_ |= None # Make the arguments optional + kwds["type"] = type_ + if modify: + kwds["copy_mode"] = generic.File.CopyMode.copy + # Add field to outputs with the same name as the input + add_arg( + name, + field.out, + {"type": type_, "callable": _InputPassThrough(name)}, + ) + # If name contains a '.', treat it as a file template and strip it from the name + if field_type is field.outarg and "path_template" not in kwds: + path_template = name + if is_fileset_or_union(type_): + if ty.get_origin(type_): + ext_type = next(a for a in ty.get_args(type_) if a is not None) + else: + ext_type = type_ + if ext_type.ext is not None: + path_template = 
name + ext_type.ext + kwds["path_template"] = path_template + # Set the default value to None if the field is optional and no default is + # provided + if is_optional(type_) and "default" not in kwds: + kwds["default"] = None + if option is None: + add_arg(name, field_type, kwds) + else: + kwds["argstr"] = option + add_arg(name, field_type, kwds) + option = None + + elif match := bool_arg_re.match(token): + argstr, var = match.groups() + if "=" in var: + var, default = var.split("=") + default = eval(default) + else: + default = False + add_arg( + var, field.arg, {"type": bool, "argstr": argstr, "default": default} + ) + elif match := opt_re.match(token): + option = token + else: + raise ValueError( + f"Found unknown token {token!r} in command line template: {template}" + ) + + remaining_pos = remaining_positions(arguments, len(arguments) + 1, 1) + + for argument in arguments: + if argument.position is None: + argument.position = remaining_pos.pop(0) + + return executable, inputs, outputs + + +def remaining_positions( + args: list[Arg], num_args: int | None = None, start: int = 0 +) -> ty.List[int]: + """Get the remaining positions for input fields + + Parameters + ---------- + args : list[Arg] + The list of input fields + num_args : int, optional + The number of arguments, by default it is the length of the args + + Returns + ------- + list[int] + The list of remaining positions + + Raises + ------ + ValueError + If multiple fields have the same position + """ + if num_args is None: + num_args = len(args) - 1 # Subtract 1 for the 'append_args' field + # Check for multiple positions + positions = defaultdict(list) + for arg in args: + if arg.name == "append_args": + continue + if arg.position is not None: + if arg.position >= 0: + positions[arg.position].append(arg) + else: + positions[num_args + arg.position].append(arg) + if multiple_positions := { + k: [f"{a.name}({a.position})" for a in v] + for k, v in positions.items() + if len(v) > 1 + }: + raise ValueError( + f"Multiple fields have the overlapping positions: {multiple_positions}" + ) + return [i for i in range(start, num_args) if i not in positions] + + +@attrs.define +class _InputPassThrough: + """A class that can be used to pass through an input to the output""" + + name: str + + def __call__(self, inputs: Task) -> ty.Any: + return getattr(inputs, self.name) + + +class GlobCallable: + """Callable that can be used to glob files""" + + def __init__(self, pattern: str): + self.pattern = pattern + + def __call__(self) -> generic.FileSet: + matches = glob.glob(self.pattern) + if not matches: + raise FileNotFoundError(f"No files found matching pattern: {self.pattern}") + return matches diff --git a/pydra/compose/shell/field.py b/pydra/compose/shell/field.py new file mode 100644 index 0000000000..b9bedb1819 --- /dev/null +++ b/pydra/compose/shell/field.py @@ -0,0 +1,269 @@ +from __future__ import annotations +import typing as ty +import attrs +from pydra.compose.base import ( + Arg, + Out, + NO_DEFAULT, +) +from pydra.utils.typing import is_optional +from pydra.utils.general import wrap_text + + +@attrs.define(kw_only=True) +class arg(Arg): + """An input field that specifies a command line argument + + Parameters + ---------- + help: str + A short description of the input field. + default : Any, optional + the default value for the argument + mandatory: bool, optional + If True user has to provide a value for the field, by default it is False + allowed_values: list, optional + List of allowed values for the field. 
+ requires: list, optional + List of field names that are required together with the field. + copy_mode: File.CopyMode, optional + The mode of copying the file, by default it is File.CopyMode.any + copy_collation: File.CopyCollation, optional + The collation of the file, by default it is File.CopyCollation.any + copy_ext_decomp: File.ExtensionDecomposition, optional + The extension decomposition of the file, by default it is + File.ExtensionDecomposition.single + readonly: bool, optional + If True the input field can’t be provided by the user but it aggregates other + input fields (for example the fields with argstr: -o {fldA} {fldB}), by default + it is False + type: type, optional + The type of the field, by default it is Any + name: str, optional + The name of the field, used when specifying a list of fields instead of a mapping + from name to field, by default it is None + argstr: str, optional + A flag or string that is used in the command before the value, e.g. -v or + -v {inp_field}, but it could be and empty string, “”, in which case the value is + just printed to the command line. If … are used, e.g. -v…, + the flag is used before every element if a list is provided as a value. If the + argstr is None, the field is not part of the command. + position: int, optional + Position of the field in the command, could be nonnegative or negative integer. + If nothing is provided the field will be inserted between all fields with + nonnegative positions and fields with negative positions. + sep: str, optional + A separator if a sequence type is provided as a value, by default " ". + container_path: bool, optional + If True a path will be consider as a path inside the container (and not as a + local path, by default it is False + formatter: function, optional + If provided the argstr of the field is created using the function. This function + can for example be used to combine several inputs into one command argument. The + function can take field (this input field will be passed to the function), + inputs (entire inputs will be passed) or any input field name (a specific input + field will be sent). + """ + + argstr: str | None = "" + position: int | None = None + sep: str = " " + allowed_values: list | None = None + container_path: bool = False # IS THIS STILL USED?? + formatter: ty.Callable | None = None + + def _additional_descriptors(self, as_input: bool = False, **kwargs) -> str: + if not self.argstr or not as_input: + return super()._additional_descriptors(as_input=as_input, **kwargs) + descriptors = [f"{self.argstr!r}"] + descriptors.extend(super()._additional_descriptors(as_input=as_input, **kwargs)) + return descriptors + + def __lt__(self, other: "arg") -> bool: + """Compare two fields based on their position""" + if self.position is None and other.position is None: + return super().__lt__(other) + elif self.position is None: + return False + elif other.position is None: + return True + else: + assert self.position != other.position, "positions should be unique" + if self.position < 0 and other.position < 0: + return self.position > other.position + elif self.position < 0: + return False + elif other.position < 0: + return True + return self.position < other.position + + +@attrs.define(kw_only=True) +class out(Out): + """An output field that specifies a command line argument + + Parameters + ---------- + callable : Callable, optional + If provided the output file name (or list of file names) is created using the + function. 
The function can take field (the specific output field will be passed + to the function), cache_dir (task cache_dir will be used), stdout, stderr + (stdout and stderr of the task will be sent) inputs (entire inputs will be + passed) or any input field name (a specific input field will be sent). + """ + + callable: ty.Callable | None = attrs.field(default=None) + + def __attrs_post_init__(self): + # Set type from return annotation of callable if not set + if self.type is ty.Any and self.callable: + self.type = ty.get_type_hints(self.callable).get("return", ty.Any) + + @callable.validator + def _callable_validator(self, _, value): + + if value: + if not callable(value): + raise ValueError(f"callable must be a function, not {value!r}") + elif ( + self.mandatory + and not getattr(self, "path_template", None) + and self.name + not in [ + "return_code", + "stdout", + "stderr", + ] + ): # shell.Outputs.BASE_NAMES + raise ValueError( + "A shell output field must have either a callable or a path_template" + ) + + +@attrs.define(kw_only=True) +class outarg(arg, Out): + """An input field that specifies where to save the output file + + Parameters + ---------- + help: str + A short description of the input field. + default : Any, optional + the default value for the argument + mandatory: bool, optional + If True user has to provide a value for the field, by default it is False + allowed_values: list, optional + List of allowed values for the field. + requires: list, optional + List of field names that are required together with the field. + copy_mode: File.CopyMode, optional + The mode of copying the file, by default it is File.CopyMode.any + copy_collation: File.CopyCollation, optional + The collation of the file, by default it is File.CopyCollation.any + copy_ext_decomp: File.ExtensionDecomposition, optional + The extension decomposition of the file, by default it is + File.ExtensionDecomposition.single + readonly: bool, optional + If True the input field can’t be provided by the user but it aggregates other + input fields (for example the fields with argstr: -o {fldA} {fldB}), by default + it is False + type: type, optional + The type of the field, by default it is Any + name: str, optional + The name of the field, used when specifying a list of fields instead of a mapping + from name to field, by default it is None + argstr: str, optional + A flag or string that is used in the command before the value, e.g. -v or + -v {inp_field}, but it could be and empty string, “”. If … are used, e.g. -v…, + the flag is used before every element if a list is provided as a value. If no + argstr is used the field is not part of the command. + position: int, optional + Position of the field in the command line, could be nonnegative or negative integer. + If nothing is provided the field will be inserted between all fields with + nonnegative positions and fields with negative positions. + sep: str, optional + A separator if a list is provided as a value. + container_path: bool, optional + If True a path will be consider as a path inside the container (and not as a + local path, by default it is False + formatter: function, optional + If provided the argstr of the field is created using the function. This function + can for example be used to combine several inputs into one command argument. The + function can take field (this input field will be passed to the function), + inputs (entire inputs will be passed) or any input field name (a specific input + field will be sent). 
+ path_template: str, optional + The template used to specify where the output file will be written to can use + other fields, e.g. {file1}. Used in order to create an output definition. + """ + + path_template: str | None = attrs.field(default=None) + keep_extension: bool = attrs.field(default=True) + + @path_template.validator + def _validate_path_template(self, attribute, value): + if value: + if self.default not in (NO_DEFAULT, True, None): + raise ValueError( + f"path_template ({value!r}) can only be provided when there is no " + f"default value provided ({self.default!r})" + ) + + def markdown_listing( + self, + line_width: int = 79, + help_indent: int = 4, + as_input: bool = False, + **kwargs, + ): + """Get the listing for the field in markdown-like format + + Parameters + ---------- + line_width: int + The maximum line width for the output, by default it is 79 + help_indent: int + The indentation for the help text, by default it is 4 + as_input: bool + Whether to format the field as an input or output if it can be both, by default + it is False + **kwargs: Any + Additional arguments to allow it to be duck-typed with extension classes + + Returns + ------- + str + The listing for the field in markdown-like format + """ + if not as_input: + return super().markdown_listing( + width=line_width, help_indent=help_indent, **kwargs + ) + + type_str = "Path | bool" + if is_optional(self.type): + type_str += " | None" + default = "None" + help_text = wrap_text( + self.OPTIONAL_PATH_TEMPLATE_HELP, + width=line_width, + indent_size=help_indent, + ) + else: + default = True + help_text = wrap_text( + self.PATH_TEMPLATE_HELP, width=line_width, indent_size=help_indent + ) + s = f"- {self.name}: {type_str}; default = {default}" + if self._additional_descriptors(as_input=as_input): + s += f" ({', '.join(self._additional_descriptors(as_input=as_input))})" + s += "\n" + help_text + return s + + PATH_TEMPLATE_HELP = ( + "The path specified for the output file, if True, the default " + "'path template' will be used." + ) + OPTIONAL_PATH_TEMPLATE_HELP = PATH_TEMPLATE_HELP + ( + "If False or None, the output file will not be saved." + ) diff --git a/pydra/compose/shell/task.py b/pydra/compose/shell/task.py new file mode 100644 index 0000000000..7e82267a93 --- /dev/null +++ b/pydra/compose/shell/task.py @@ -0,0 +1,511 @@ +from __future__ import annotations +import typing as ty +import re +import glob +import inspect +import shlex +import platform +from pathlib import Path +from copy import copy, deepcopy +import attrs +from fileformats.generic import FileSet, File +from pydra.utils.general import ( + attrs_values, + task_fields, + ensure_list, + position_sort, +) +from pydra.utils.typing import ( + is_fileset_or_union, + state_array_support, + is_optional, + optional_type, + is_multi_input, + MultiOutputObj, + MultiOutputFile, +) +from pydra.compose import base +from pydra.compose.base.field import RequirementSet +from pydra.compose.base.helpers import is_set +from . 
import field +from .templating import ( + template_update, + template_update_single, + argstr_formatting, + fields_in_formatter, + parse_format_string, +) + +if ty.TYPE_CHECKING: + from pydra.engine.job import Job + +TaskType = ty.TypeVar("TaskType", bound="Task") + + +@attrs.define(kw_only=True, auto_attribs=False, eq=False, repr=False) +class ShellOutputs(base.Outputs): + """Output task of a generic shell process.""" + + BASE_NAMES = ["return_code", "stdout", "stderr"] + RETURN_CODE_HELP = """The process' exit code.""" + STDOUT_HELP = """The standard output stream produced by the command.""" + STDERR_HELP = """The standard error stream produced by the command.""" + + return_code: int = field.out(name="return_code", type=int, help=RETURN_CODE_HELP) + stdout: str = field.out(name="stdout", type=str, help=STDOUT_HELP) + stderr: str = field.out(name="stderr", type=str, help=STDERR_HELP) + + @classmethod + def _from_task(cls, job: "Job[Task]") -> ty.Self: + """Collect the outputs of a shell process from a combination of the provided inputs, + the objects in the output directory, and the stdout and stderr of the process. + + Parameters + ---------- + inputs : Task + The input task of the shell process. + cache_dir : Path + The directory where the process was run. + stdout : str + The standard output of the process. + stderr : str + The standard error of the process. + return_code : int + The exit code of the process. + + Returns + ------- + outputs : Outputs + The outputs of the shell process + """ + outputs = super()._from_task(job) + fld: field.out + for fld in task_fields(cls): + if fld.name in ["return_code", "stdout", "stderr"]: + resolved_value = job.return_values[fld.name] + # Get the corresponding value from the inputs if it exists, which will be + # passed through to the outputs, to permit manual overrides + elif isinstance(fld, field.outarg) and isinstance( + job.inputs[fld.name], Path + ): + resolved_value = job.inputs[fld.name] + elif is_set(fld.default): + resolved_value = cls._resolve_default_value(fld, job.cache_dir) + else: + resolved_value = cls._resolve_value(fld, job) + # Set the resolved value + try: + setattr(outputs, fld.name, resolved_value) + except FileNotFoundError: + if is_optional(fld.type): + setattr(outputs, fld.name, None) + else: + raise ValueError( + f"file system path(s) provided to mandatory field {fld.name!r}, " + f"'{resolved_value}', does not exist, this is likely due to an " + f"error in the {job.name!r} job" + ) + return outputs + + @classmethod + def _resolve_default_value(cls, fld: field.out, cache_dir: Path) -> ty.Any: + """Resolve path and glob expr default values relative to the output dir""" + default = fld.default + if fld.type is Path: + assert isinstance(default, Path) + if not default.is_absolute(): + default = cache_dir.joinpath(default) + if "*" not in str(default): + if default.exists(): + return default + else: + raise FileNotFoundError(f"file {default} does not exist") + else: + all_files = [Path(el) for el in glob(default.expanduser())] + if len(all_files) > 1: + return all_files + elif len(all_files) == 1: + return all_files[0] + else: + raise FileNotFoundError(f"no file matches {default.name}") + return default + + @classmethod + def _required_fields_satisfied(cls, fld: field.out, inputs: "Task") -> bool: + """checking if all fields from the requires and template are set in the input + if requires is a list of list, checking if at least one list has all elements set + """ + + if not fld.requires: + return True + + requirements: 
list[RequirementSet] + if fld.requires: + requirements = deepcopy(fld.requires) + else: + requirements = [RequirementSet()] + + # if the output has output_file_template field, add in all input fields from + # the template to requires + if isinstance(fld, field.outarg) and fld.path_template: + # if a template is a function it has to be run first with the inputs as the only arg + if callable(fld.path_template): + template = fld.path_template(inputs) + else: + template = fld.path_template + inp_fields = re.findall(r"{(\w+)(?:\:[^\}]+)?}", template) + for req in requirements: + req += inp_fields + + # Check to see if any of the requirement sets are satisfied + return any(rs.satisfied(inputs) for rs in requirements) + + @classmethod + def _resolve_value( + cls, + fld: "field.out", + job: "Job[TaskType]", + ) -> ty.Any: + """Collect output file if metadata specified.""" + + if not cls._required_fields_satisfied(fld, job.task): + return None + if isinstance(fld, field.outarg) and fld.path_template: + return template_update_single( + fld, + task=job.task, + cache_dir=job.cache_dir, + spec_type="output", + ) + assert fld.callable, ( + f"Output field '{fld.name}', does not not contain any of the required fields " + f'("callable", "output_file_template" or "value"): {fld}.' + ) + callable_ = fld.callable + if isinstance(fld.callable, staticmethod): + # In case callable is defined as a static method, + # retrieve the function wrapped in the descriptor. + callable_ = fld.callable.__func__ + call_args = inspect.getfullargspec(callable_) + call_args_val = {} + for argnm in call_args.args: + if argnm == "field": + call_args_val[argnm] = fld + elif argnm == "cache_dir": + call_args_val[argnm] = job.cache_dir + elif argnm == "executable": + call_args_val[argnm] = job.task.executable + elif argnm == "inputs": + call_args_val[argnm] = job.inputs + elif argnm == "stdout": + call_args_val[argnm] = job.return_values["stdout"] + elif argnm == "stderr": + call_args_val[argnm] = job.return_values["stderr"] + elif argnm == "self": + pass # If the callable is a class + else: + try: + call_args_val[argnm] = job.inputs[argnm] + except KeyError as e: + e.add_note( + f"arguments of the callable function from {fld.name!r} " + f"has to be in inputs or be field or cache_dir, " + f"but {argnm!r} is used" + ) + raise + return callable_(**call_args_val) + + +ShellOutputsType = ty.TypeVar("OutputType", bound=ShellOutputs) + + +@state_array_support +def append_args_converter(value: ty.Any) -> list[str]: + """Convert additional arguments to a list of strings.""" + if isinstance(value, str): + return shlex.split(value) + if not isinstance(value, ty.Sequence): + return [value] + return list(value) + + +@attrs.define(kw_only=True, auto_attribs=False, eq=False, repr=False) +class ShellTask(base.Task[ShellOutputsType]): + + _task_type = "shell" + + BASE_NAMES = ["append_args"] + + EXECUTABLE_HELP = ( + "the first part of the command, can be a string, " + "e.g. 'ls', or a list, e.g. 
['ls', '-l', 'dirname']" + ) + + append_args: list[str | File] = field.arg( + name="append_args", + default=attrs.Factory(list), + converter=append_args_converter, + type=list[str | File], + sep=" ", + help="Additional free-form arguments to append to the end of the command.", + ) + + RESERVED_FIELD_NAMES = base.Task.RESERVED_FIELD_NAMES + ("cmdline",) + + def _run(self, job: "Job[ShellTask]", rerun: bool = True) -> None: + """Run the shell command.""" + job.return_values = job.environment.execute(job) + + @property + def cmdline(self) -> str: + """The equivalent command line that would be submitted if the job were run on + the current working directory.""" + # Skip the executable, which can be a multi-part command, e.g. 'docker run'. + values = attrs_values(self) + values.update(template_update(self, cache_dir=Path.cwd())) + cmd_args = self._command_args(values=values) + cmdline = cmd_args[0] + for arg in cmd_args[1:]: + # If there are spaces in the arg, and it is not enclosed by matching + # quotes, add quotes to escape the space. Not sure if this should + # be expanded to include other special characters apart from spaces + if " " in arg: + cmdline += " '" + arg + "'" + else: + cmdline += " " + arg + return cmdline + + def _command_args(self, values: dict[str, ty.Any]) -> list[str]: + """Get command line arguments""" + self._check_resolved() + self._check_rules() + # Drop none/empty values and optional path fields that are set to false + values = copy(values) # Create a copy so we can drop items from the dictionary + for fld in task_fields(self): + fld_value = values[fld.name] + if fld_value is None or (is_multi_input(fld.type) and fld_value == []): + del values[fld.name] + if is_fileset_or_union(fld.type) and type(fld_value) is bool: + del values[fld.name] + # Drop special fields that are added separately + del values["executable"] + del values["append_args"] + # Add executable + pos_args = [ + self._command_shelltask_executable(fld, self.executable), + ] # list for (position, command arg) + positions_provided = [0] + fields = {f.name: f for f in task_fields(self)} + for field_name in values: + pos_val = self._command_pos_args( + fld=fields[field_name], + values=values, + positions_provided=positions_provided, + ) + if pos_val: + pos_args.append(pos_val) + # Sort command and arguments by position + cmd_args = position_sort(pos_args) + # pos_args values are each a list of arguments, so concatenate lists after sorting + command_args = sum(cmd_args, []) + # Append additional arguments to the end of the command + command_args += self.append_args + return command_args + + def _command_shelltask_executable( + self, fld: field.arg, value: ty.Any + ) -> tuple[int, ty.Any]: + """Returning position and value for executable Task input""" + pos = 0 # executable should be the first el. of the command + assert value + return pos, ensure_list(value, tuple2list=True) + + def _command_shelltask_args( + self, fld: field.arg, value: ty.Any + ) -> tuple[int, ty.Any]: + """Returning position and value for args Task input""" + pos = -1 # assuming that args is the last el. of the command + if value is None: + return None + else: + return pos, ensure_list(value, tuple2list=True) + + def _command_pos_args( + self, + fld: field.arg, + values: dict[str, ty.Any], + positions_provided: list[str], + ) -> tuple[int, ty.Any]: + """ + Checking all additional input fields, setting pos to None, if position not set. + Creating a list with additional parts of the command that comes from + the specific field. 
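To make the `formatter` hook concrete (a hedged sketch: `mycmd`, `scale` and `scale_fmt` are invented names), the formatter is called with whichever of its parameter names match input fields, plus the whole `inputs` dict if requested, and its return string is used in place of the field's `argstr`:

```
from pydra.compose import shell

def scale_fmt(scale, inputs):
    # "scale" is looked up from the task inputs; "inputs" receives the full value dict
    return f"--scale {scale:.2f}"

@shell.define
class MyCmd(shell.Task["MyCmd.Outputs"]):
    executable = "mycmd"
    scale: float = shell.arg(help="scaling factor", argstr="", formatter=scale_fmt)

    class Outputs(shell.Outputs):
        pass

print(MyCmd(scale=0.5).cmdline)  # expected: "mycmd --scale 0.50"
```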
+ + Parameters + ---------- + """ + if fld.argstr is None and fld.formatter is None: + # assuming that input that has no argstr is not used in the command, + # or a formatter is not provided too. + return None + if fld.position is not None: + if not isinstance(fld.position, int): + raise Exception( + f"position should be an integer, but {fld.position} given" + ) + # checking if the position is not already used + if fld.position in positions_provided: + raise Exception( + f"{fld.name} can't have provided position, {fld.position} is already used" + ) + + positions_provided.append(fld.position) + + value = values[fld.name] + + if fld.readonly and type(value) is not bool and value is not attrs.NOTHING: + raise Exception(f"{fld.name} is read only, the value can't be provided") + elif value is None and not fld.readonly and fld.formatter is None: + return None + + cmd_add = [] + # formatter that creates a custom command argument + # it can take the value of the field, all inputs, or the value of other fields. + tp = optional_type(fld.type) if is_optional(fld.type) else fld.type + if fld.formatter: + call_args = inspect.getfullargspec(fld.formatter) + call_args_val = {} + for argnm in call_args.args: + if argnm == "field": + call_args_val[argnm] = fld + elif argnm == "inputs": + call_args_val[argnm] = values + else: + if argnm in values: + call_args_val[argnm] = values[argnm] + else: + raise AttributeError( + f"arguments of the formatter function from {fld.name} " + f"has to be in inputs or be field, but {argnm} is used" + ) + cmd_el_str = fld.formatter(**call_args_val) + cmd_el_str = cmd_el_str.strip().replace(" ", " ") + if cmd_el_str != "": + cmd_add += split_cmd(cmd_el_str) + elif tp is bool and "{" not in fld.argstr: + # if value is simply True the original argstr is used, + # if False, nothing is added to the command. + if value is True: + cmd_add.append(fld.argstr) + elif is_multi_input(tp) or tp is MultiOutputObj or tp is MultiOutputFile: + # if the field is MultiInputObj, it is used to create a list of arguments + for val in value or []: + split_values = copy(values) + split_values[fld.name] = val + cmd_add += self._format_arg(fld, split_values) + else: + cmd_add += self._format_arg(fld, values) + return fld.position, cmd_add + + def _format_arg(self, fld: field.arg, values: dict[str, ty.Any]) -> list[str]: + """Returning arguments used to specify the command args for a single inputs""" + value = values[fld.name] + if ( + fld.argstr.endswith("...") + and isinstance(value, ty.Iterable) + and not isinstance(value, (str, bytes)) + ): + argstr = fld.argstr.replace("...", "") + # if argstr has a more complex form, with "{input_field}" + if "{" in argstr and "}" in argstr: + argstr_formatted_l = [] + for val in value: + split_values = copy(values) + split_values[fld.name] = val + argstr_f = argstr_formatting(argstr, split_values) + argstr_formatted_l.append(f" {argstr_f}") + cmd_el_str = fld.sep.join(argstr_formatted_l) + else: # argstr has a simple form, e.g. "-f", or "--f" + cmd_el_str = fld.sep.join([f" {argstr} {val}" for val in value]) + else: + # in case there are ... 
when input is not a list + argstr = fld.argstr.replace("...", "") + if isinstance(value, ty.Iterable) and not isinstance(value, (str, bytes)): + cmd_el_str = fld.sep.join([str(val) for val in value]) + value = cmd_el_str + # if argstr has a more complex form, with "{input_field}" + if "{" in argstr and "}" in argstr: + cmd_el_str = argstr.replace(f"{{{fld.name}}}", str(value)) + cmd_el_str = argstr_formatting(cmd_el_str, values) + else: # argstr has a simple form, e.g. "-f", or "--f" + if value: + cmd_el_str = f"{argstr} {value}" + else: + cmd_el_str = "" + return split_cmd(cmd_el_str) + + def _rule_violations(self) -> list[str]: + + errors = super()._rule_violations() + # if there is a value that has to be updated (e.g. single value from a list) + # getting all fields that should be formatted, i.e. {field_name}, ... + fields = task_fields(self) + available_template_names = [f.name for f in fields] + ["field", "inputs"] + for fld in fields: + if fld.argstr: + if unrecognised := [ + f + for f in parse_format_string(fld.argstr) + if f not in available_template_names + ]: + errors.append( + f"Unrecognised field names in the argstr of {fld.name} " + f"({fld.argstr}): {unrecognised}" + ) + if getattr(fld, "path_template", None): + if unrecognised := [ + f + for f in fields_in_formatter(fld.path_template) + if f not in available_template_names + ]: + errors.append( + f"Unrecognised field names in the path_template of {fld.name} " + f"({fld.path_template}): {unrecognised}" + ) + + return errors + + DEFAULT_COPY_COLLATION = FileSet.CopyCollation.adjacent + + +def split_cmd(cmd: str | None): + """Splits a shell command line into separate arguments respecting quotes + + Parameters + ---------- + cmd : str + Command line string or part thereof + + Returns + ------- + str + the command line string split into process args + """ + if cmd is None: + return [] + # Check whether running on posix or Windows system + on_posix = platform.system() != "Windows" + args = shlex.split(cmd, posix=on_posix) + cmd_args = [] + for arg in args: + match = re.match("(['\"])(.*)\\1$", arg) + if match: + cmd_args.append(match.group(2)) + else: + cmd_args.append(arg) + return cmd_args + + +# Alias ShellTask to Task so we can refer to it by shell.Task +Task = ShellTask +Outputs = ShellOutputs diff --git a/pydra/compose/shell/templating.py b/pydra/compose/shell/templating.py new file mode 100644 index 0000000000..3bea8ca369 --- /dev/null +++ b/pydra/compose/shell/templating.py @@ -0,0 +1,328 @@ +import typing as ty +import re +import os +import inspect +from copy import copy +from pathlib import Path +from fileformats.generic import FileSet +from pydra.utils.general import attrs_values, task_fields +from pydra.utils.typing import is_lazy +from . import field + +if ty.TYPE_CHECKING: + from . import Task + + +def template_update( + task, + cache_dir: Path | None = None, + map_copyfiles: dict[str, Path] | None = None, +): + """ + Update all templates that are present in the input task. + + Should be run when all inputs used in the templates are already set. + + """ + + values = attrs_values(task) + if map_copyfiles is not None: + values.update(map_copyfiles) + + # Collect templated inputs for which all requirements are satisfied. 
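As a sketch of how these output templates behave in practice (the `Smooth` task and its field names are hypothetical; the behaviour follows `_template_formatting`/`_element_formatting` further down):

```
from fileformats.generic import File
from pydra.compose import shell

@shell.define
class Smooth(shell.Task["Smooth.Outputs"]):
    executable = "smooth"
    in_file: File = shell.arg(argstr="", help="input image")

    class Outputs(shell.Outputs):
        out_file: File = shell.outarg(
            argstr="",
            path_template="{in_file}_smoothed",
            help="smoothed copy of in_file",
        )

# With in_file pointing at "sub01.nii" the template resolves to "sub01_smoothed.nii"
# (relocated into the cache dir): the template has no extension of its own, so with
# keep_extension=True (the default) the input's ".nii" suffix is re-appended;
# keep_extension=False would leave it as "sub01_smoothed".
```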
+ fields_templ = [ + fld + for fld in task_fields(task) + if isinstance(fld, field.outarg) + and fld.path_template + and getattr(task, fld.name) + and all(req.satisfied(task) for req in fld.requires) + ] + + dict_mod = {} + for fld in fields_templ: + dict_mod[fld.name] = template_update_single( + fld=fld, + task=task, + values=values, + cache_dir=cache_dir, + ) + # adding elements from map_copyfiles to fields with templates + if map_copyfiles: + dict_mod.update(map_copyfiles) + return dict_mod + + +def template_update_single( + fld: "field.outarg", + task: "Task", + values: dict[str, ty.Any] = None, + cache_dir: Path | None = None, + spec_type: str = "input", +) -> Path | list[Path | None] | None: + """Update a single template from the input_spec or output_spec + based on the value from inputs_dict + (checking the types of the fields, that have "output_file_template)" + """ + # if input_dict_st with state specific value is not available, + # the dictionary will be created from inputs object + from pydra.utils.typing import TypeParser, OUTPUT_TEMPLATE_TYPES # noqa + + if values is None: + values = attrs_values(task) + + if spec_type == "input": + field_value = values[fld.name] + if isinstance(field_value, bool) and fld.type in (Path, str): + raise TypeError( + f"type of '{fld.name}' is Path, consider using Union[Path, bool]" + ) + if field_value is not None and not is_lazy(field_value): + field_value = TypeParser(ty.Union[OUTPUT_TEMPLATE_TYPES])(field_value) + elif spec_type == "output": + if not TypeParser.contains_type(FileSet, fld.type): + raise TypeError( + f"output {fld.name} should be file-system object, but {fld.type} " + "set as the type" + ) + else: + raise TypeError(f"spec_type can be input or output, but {spec_type} provided") + # for inputs that the value is set (so the template is ignored) + if spec_type == "input": + if isinstance(field_value, (Path, list)): + return field_value + if field_value is False: + # if input fld is set to False, the fld shouldn't be used (setting NOTHING) + return None + # inputs_dict[fld.name] is True or spec_type is output + value = _template_formatting(fld, task, values) + if cache_dir and value is not None: + # changing path so it is in the cache_dir + # should be converted to str, it is also used for input fields that should be str + if type(value) is list: + value = [cache_dir / val.name for val in value] + else: + value = cache_dir / value.name + return value + + +def _template_formatting( + fld: "field.arg", task: "Task", values: dict[str, ty.Any] +) -> Path | list[Path] | None: + """Formatting the fld template based on the values from inputs. + Taking into account that the fld with a template can be a MultiOutputFile + and the fld values needed in the template can be a list - + returning a list of formatted templates in that case. + Allowing for multiple input values used in the template as longs as + there is no more than one file (i.e. 
File, PathLike or string with extensions) + + Parameters + ---------- + fld : pydra.utils.general.Field + field with a template + task : pydra.compose.shell.Task + the task + values : dict + dictionary with values from inputs object + + Returns + ------- + formatted : Path or list[Path | None] or None + formatted template + """ + # if a template is a function it has to be run first with the inputs as the only arg + template = fld.path_template + if callable(template): + template = template(task) + + # as default, we assume that keep_extension is True + if isinstance(template, (tuple, list)): + formatted = [_single_template_formatting(fld, t, values) for t in template] + if any([val is None for val in formatted]): + return None + else: + assert isinstance(template, str) + formatted = _single_template_formatting(fld, template, values) + return formatted + + +def _single_template_formatting( + fld: "field.outarg", + template: str, + values: dict[str, ty.Any], +) -> Path | None: + from pydra.utils.typing import MultiInputObj, MultiOutputFile + + inp_fields = re.findall(r"{\w+}", template) + inp_fields_fl = re.findall(r"{\w+:[0-9.]+f}", template) + inp_fields += [re.sub(":[0-9.]+f", "", el) for el in inp_fields_fl] + + # FIXME: This would be a better solution, and would allow you to explicitly specify + # whether you want to use the extension of the input file or not, by referencing + # the "ext" attribute of the input file. However, this would require a change in the + # way the element formatting is done + # + # inp_fields = set(re.findall(r"{(\w+)(?:\.\w+)?(?::[0-9.]+f)?}", template)) + + if len(inp_fields) == 0: + return Path(template) + + val_dict = {} + file_template = None + + for inp_fld in inp_fields: + fld_name = inp_fld[1:-1] # extracting the name form {field_name} + if fld_name not in values: + raise AttributeError(f"{fld_name} is not provided in the input") + fld_value = values[fld_name] + if fld_value is None: + # if value is NOTHING, nothing should be added to the command + return None + # checking for fields that can be treated as a file: + # have type File, or value that is path like (including str with extensions) + if isinstance(fld_value, os.PathLike): + if file_template: + raise Exception( + f"can't have multiple paths in {fld.name} template," + f" but {template} provided" + ) + else: + file_template = (fld_name, fld_value) + else: + val_dict[fld_name] = fld_value + + # if field is MultiOutputFile and some elements from val_dict are lists, + # each element of the list should be used separately in the template + # and return a list with formatted values + if fld.type is MultiOutputFile and any( + [isinstance(el, (list, MultiInputObj)) for el in val_dict.values()] + ): + # all fields that are lists + keys_list = [ + k for k, el in val_dict.items() if isinstance(el, (list, MultiInputObj)) + ] + if any( + [len(val_dict[key]) != len(val_dict[keys_list[0]]) for key in keys_list[1:]] + ): + raise Exception( + f"all fields used in {fld.name} template have to have the same length" + f" or be a single value" + ) + formatted_value = [] + for ii in range(len(val_dict[keys_list[0]])): + val_dict_el = copy(val_dict) + # updating values to a single element from the list + for key in keys_list: + val_dict_el[key] = val_dict[key][ii] + + formatted_value.append( + _element_formatting( + template, + val_dict_el, + file_template, + keep_extension=fld.keep_extension, + ) + ) + else: + formatted_value = _element_formatting( + template, val_dict, file_template, keep_extension=fld.keep_extension + 
) + if isinstance(formatted_value, list): + return [Path(val) for val in formatted_value] + elif isinstance(formatted_value, str): + return Path(formatted_value) + return None + + +def _element_formatting( + template: str, + values_template_dict: dict[str, ty.Any], + file_template: str, + keep_extension: bool, +): + """Formatting a single template for a single element (if a list). + Taking into account that a file used in the template (file_template) + and the template itself could have file extensions + (assuming that if template has extension, the field value extension is removed, + if field has extension, and no template extension, than it is moved to the end). + For values_template_dict the simple formatting can be used (no file values inside) + """ + if file_template: + fld_name_file, fld_value_file = file_template + # splitting the filename for name and extension, + # the final value used for formatting depends on the template and keep_extension flag + name, *ext = Path(fld_value_file).name.split(".", maxsplit=1) + filename = str(Path(fld_value_file).parent / name) + # updating values_template_dic with the name of file + values_template_dict[fld_name_file] = filename + # if keep_extension is False, the extensions are removed + if keep_extension is False: + ext = [] + else: + ext = [] + + # if file_template is at the end of the template, the simplest formatting should work + if file_template and template.endswith(f"{{{fld_name_file}}}"): + # recreating fld_value with the updated extension + values_template_dict[fld_name_file] = ".".join([filename] + ext) + formatted_value = template.format(**values_template_dict) + # file_template provided, but the template doesn't have its own extension + elif file_template and "." not in template: + # if the fld_value_file has extension, it will be moved to the end + formatted_value = ".".join([template.format(**values_template_dict)] + ext) + # template has its own extension or no file_template provided + # the simplest formatting, if file_template is provided it's used without the extension + else: + formatted_value = template.format(**values_template_dict) + return formatted_value + + +def parse_format_string(fmtstr: str) -> set[str]: + """Parse a argstr format string and return all keywords used in it.""" + identifier = r"[a-zA-Z_]\w*" + attribute = rf"\.{identifier}" + item = r"\[\w+\]" + # Example: var.attrs[key][0].attr2 (capture "var") + field_with_lookups = ( + f"({identifier})(?:{attribute}|{item})*" # Capture only the keyword + ) + conversion = "(?:!r|!s)" + nobrace = "[^{}]*" + # Example: 0{pads[hex]}x (capture "pads") + fmtspec = f"{nobrace}(?:{{({identifier}){nobrace}}}{nobrace})?" 
# Capture keywords in definition + full_field = f"{{{field_with_lookups}{conversion}?(?::{fmtspec})?}}" + + all_keywords = re.findall(full_field, fmtstr) + return set().union(*all_keywords) - {""} + + +def fields_in_formatter(formatter: str | ty.Callable[..., str]) -> set[str]: + """Extract all field names from a formatter string or function.""" + if isinstance(formatter, str): + return parse_format_string(formatter) + elif isinstance(formatter, ty.Sequence): + return set().union(*[fields_in_formatter(f) for f in formatter]) + elif isinstance(formatter, ty.Callable): + return set(inspect.signature(formatter).parameters.keys()) + else: + raise ValueError(f"Unsupported formatter type: {type(formatter)} ({formatter})") + + +def argstr_formatting(argstr: str, values: dict[str, ty.Any]): + """formatting argstr that have form {field_name}, + using values from inputs and updating with value_update if provided + """ + # if there is a value that has to be updated (e.g. single value from a list) + # getting all fields that should be formatted, i.e. {field_name}, ... + inp_fields = parse_format_string(argstr) + # formatting string based on the val_dict + argstr_formatted = argstr.format(**{n: values.get(n, "") for n in inp_fields}) + # removing extra commas and spaces after removing the field that have NOTHING + argstr_formatted = ( + argstr_formatted.replace("[ ", "[") + .replace(" ]", "]") + .replace("[,", "[") + .replace(",]", "]") + .strip() + ) + return argstr_formatted diff --git a/pydra/engine/tests/conftest.py b/pydra/compose/shell/tests/conftest.py similarity index 56% rename from pydra/engine/tests/conftest.py rename to pydra/compose/shell/tests/conftest.py index b7ecfbb8e9..642944cf5c 100644 --- a/pydra/engine/tests/conftest.py +++ b/pydra/compose/shell/tests/conftest.py @@ -1,3 +1,4 @@ +from pathlib import Path import pytest @@ -8,9 +9,9 @@ @pytest.fixture(scope="package") -def data_tests_dir(): - test_nii = importlib_resources.files("pydra").joinpath( +def data_tests_dir() -> Path: + data_dir = importlib_resources.files("pydra").joinpath( "engine", "tests", "data_tests" ) - with importlib_resources.as_file(test_nii) as path: + with importlib_resources.as_file(data_dir) as path: yield path diff --git a/pydra/compose/shell/tests/test_shell_cmdline.py b/pydra/compose/shell/tests/test_shell_cmdline.py new file mode 100644 index 0000000000..aea5d6ae0d --- /dev/null +++ b/pydra/compose/shell/tests/test_shell_cmdline.py @@ -0,0 +1,1525 @@ +import typing as ty +from pathlib import Path +import attrs +import pytest +from fileformats.generic import File +from pydra.compose import shell +from pydra.utils.typing import MultiInputObj +from pydra.engine.tests.utils import get_output_names + + +def test_shell_cmd_execargs_1(): + # separate command into exec + args + Shelly = shell.define(["executable", "arg"]) + shelly = Shelly() + assert shelly.cmdline == "executable arg" + + +def test_shell_cmd_execargs_2(): + # separate command into exec + args + Shelly = shell.define(["cmd_1", "cmd_2", "arg"]) + shelly = Shelly() + assert shelly.cmdline == "cmd_1 cmd_2 arg" + + +def test_shell_cmd_inputs_1(): + """additional input with provided position""" + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + class Outputs(shell.Outputs): + pass + + executable = "executable" + inpA: str = shell.arg(position=1, help="inp1", argstr="") + + shelly = Shelly( + append_args=["arg"], + inpA="inp1", + ) + assert shelly.cmdline == "executable inp1 arg" + + +def test_shell_cmd_inputs_1a(): + """additional input 
without provided position""" + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + class Outputs(shell.Outputs): + pass + + executable = "executable" + inpA: str = shell.arg(help="inpA", argstr="") + + shelly = Shelly( + append_args=["arg"], + inpA="inpNone1", + ) + # inp1 should be the first one after executable + assert shelly.cmdline == "executable inpNone1 arg" + + +def test_shell_cmd_inputs_1b(): + """additional input with negative position""" + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + class Outputs(shell.Outputs): + pass + + executable = "executable" + inpA: str = shell.arg(position=-1, help="inpA", argstr="") + + # separate command into exec + args + shelly = Shelly( + append_args=["arg"], + inpA="inp-1", + ) + # inp1 should be last before arg + assert shelly.cmdline == "executable inp-1 arg" + + +def test_shell_cmd_inputs_2(): + """additional inputs with provided positions""" + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + class Outputs(shell.Outputs): + pass + + executable = "executable" + + inpA: str = shell.arg(position=2, help="inpA", argstr="") + inpB: str = shell.arg(position=1, help="inpN", argstr="") + + # separate command into exec + args + shelly = Shelly( + inpB="inp1", + inpA="inp2", + ) + assert shelly.cmdline == "executable inp1 inp2" + + +def test_shell_cmd_inputs_2a(): + """additional inputs without provided positions""" + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + class Outputs(shell.Outputs): + pass + + executable = "executable" + + inpA: str = shell.arg(help="inpA", argstr="") + inpB: str = shell.arg(help="inpB", argstr="") + + # separate command into exec + args + shelly = Shelly( + inpA="inpNone1", + inpB="inpNone2", + ) + # position taken from the order in input definition + assert shelly.cmdline == "executable inpNone1 inpNone2" + + +def test_shell_cmd_inputs_2_err(): + """additional inputs with provided positions (exception due to the duplication)""" + + with pytest.raises(Exception) as e: + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + class Outputs(shell.Outputs): + pass + + executable = "executable" + + inpA: str = shell.arg(position=1, help="inpA", argstr="") + inpB: str = shell.arg(position=1, help="inpB", argstr="") + + assert "Multiple fields have the overlapping positions" in str(e.value) + + +def test_shell_cmd_inputs_3(): + """additional inputs: positive pos, negative pos and no pos""" + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + class Outputs(shell.Outputs): + pass + + executable = "executable" + + inpA: str = shell.arg(position=1, help="inpA", argstr="") + inpB: str = shell.arg(position=-1, help="inpB", argstr="") + inpC: str = shell.arg(help="inpC", argstr="") + + # separate command into exec + args + shelly = Shelly( + inpA="inp1", + inpB="inp-1", + inpC="inpNone", + ) + # input without position should be between positive an negative positions + assert shelly.cmdline == "executable inp1 inpNone inp-1" + + +def test_shell_cmd_inputs_argstr_1(): + """additional string inputs with argstr""" + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + class Outputs(shell.Outputs): + pass + + executable = "executable" + + inpA: str = shell.arg(position=1, help="inpA", argstr="-v") + + shelly = Shelly(inpA="inp1") + # flag used before inp1 + assert shelly.cmdline == "executable -v inp1" + + +def test_shell_cmd_inputs_argstr_2(): + """additional bool inputs with argstr""" + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): 
+ class Outputs(shell.Outputs): + pass + + executable = "executable" + + inpA: bool = shell.arg(position=1, help="inpA", argstr="-v") + + # separate command into exec + args + shelly = Shelly(append_args=["arg"], inpA=True) + # a flag is used without any additional argument + assert shelly.cmdline == "executable -v arg" + + +def test_shell_cmd_inputs_list_1(): + """providing list as an additional input, no sep, no argstr""" + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + class Outputs(shell.Outputs): + pass + + executable = "executable" + + inpA: ty.List[str] = shell.arg(position=2, help="inpA", argstr="", sep=" ") + + shelly = Shelly(inpA=["el_1", "el_2", "el_3"]) + # multiple elements + assert shelly.cmdline == "executable el_1 el_2 el_3" + + +def test_shell_cmd_inputs_list_2(): + """providing list as an additional input, no sep, but argstr""" + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + class Outputs(shell.Outputs): + pass + + executable = "executable" + + inpA: ty.List[str] = shell.arg(position=2, help="inpA", argstr="-v", sep=" ") + + shelly = Shelly(inpA=["el_1", "el_2", "el_3"]) + assert shelly.cmdline == "executable -v el_1 el_2 el_3" + + +def test_shell_cmd_inputs_list_3(): + """providing list as an additional input, no sep, argstr with ...""" + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + class Outputs(shell.Outputs): + pass + + executable = "executable" + + inpA: ty.List[str] = shell.arg(position=2, help="inpA", argstr="-v...", sep=" ") + + shelly = Shelly(inpA=["el_1", "el_2", "el_3"]) + # a flag is repeated + assert shelly.cmdline == "executable -v el_1 -v el_2 -v el_3" + + +def test_shell_cmd_inputs_list_sep_1(): + """providing list as an additional input:, sep, no argstr""" + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + class Outputs(shell.Outputs): + pass + + executable = "executable" + + inpA: list[str] = shell.arg( + position=1, + help="inpA", + sep=",", + argstr="", + ) + + shelly = Shelly(inpA=["aaa", "bbb", "ccc"]) + # separated by commas + assert shelly.cmdline == "executable aaa,bbb,ccc" + + +def test_shell_cmd_inputs_list_sep_2(): + """providing list as an additional input:, sep, and argstr""" + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + class Outputs(shell.Outputs): + pass + + executable = "executable" + + inpA: list[str] = shell.arg( + position=1, + help="inpA", + sep=",", + argstr="-v", + ) + + shelly = Shelly(inpA=["aaa", "bbb", "ccc"]) + # a flag is used once + assert shelly.cmdline == "executable -v aaa,bbb,ccc" + + +def test_shell_cmd_inputs_list_sep_2a(): + """providing list as an additional input:, sep, and argstr with f-string""" + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + class Outputs(shell.Outputs): + pass + + executable = "executable" + + inpA: list[str] = shell.arg( + position=1, + help="inpA", + sep=",", + argstr="-v {inpA}", + ) + + shelly = Shelly(inpA=["aaa", "bbb", "ccc"]) + # a flag is used once + assert shelly.cmdline == "executable -v aaa,bbb,ccc" + + +def test_shell_cmd_inputs_list_sep_3(): + """providing list as an additional input:, sep, argstr with ...""" + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + class Outputs(shell.Outputs): + pass + + executable = "executable" + + inpA: list[str] = shell.arg( + position=1, + help="inpA", + sep=",", + argstr="-v...", + ) + + shelly = Shelly(inpA=["aaa", "bbb", "ccc"]) + # a flag is repeated + assert shelly.cmdline == "executable -v aaa, -v bbb, -v ccc" + + +def 
test_shell_cmd_inputs_list_sep_3a(): + """providing list as an additional input:, sep, argstr with ... and f-string""" + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + class Outputs(shell.Outputs): + pass + + executable = "executable" + + inpA: list[str] = shell.arg( + position=1, + help="inpA", + sep=",", + argstr="-v {inpA}...", + ) + + shelly = Shelly(inpA=["aaa", "bbb", "ccc"]) + # a flag is repeated + assert shelly.cmdline == "executable -v aaa, -v bbb, -v ccc" + + +def test_shell_cmd_inputs_sep_4(): + """providing 1-el list as an additional input:, sep, argstr with ...,""" + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + class Outputs(shell.Outputs): + pass + + executable = "executable" + + inpA: MultiInputObj[str] = shell.arg( + position=1, + help="inpA", + argstr="-v...", + ) + + shelly = Shelly(inpA=["aaa"]) + assert shelly.cmdline == "executable -v aaa" + + +def test_shell_cmd_inputs_sep_4a(): + """providing str instead of list as an additional input:, sep, argstr with ...""" + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + class Outputs(shell.Outputs): + pass + + executable = "executable" + + inpA: str = shell.arg( + position=1, + help="inpA", + argstr="-v...", + ) + + shelly = Shelly(inpA="aaa") + assert shelly.cmdline == "executable -v aaa" + + +def test_shell_cmd_inputs_format_1(): + """additional inputs with argstr that has string formatting""" + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + class Outputs(shell.Outputs): + pass + + executable = "executable" + + inpA: str = shell.arg( + position=1, + help="inpA", + argstr="-v {inpA}", + ) + + shelly = Shelly(inpA="aaa") + assert shelly.cmdline == "executable -v aaa" + + +def test_shell_cmd_inputs_format_2(): + """additional inputs with argstr that has string formatting and ...""" + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + class Outputs(shell.Outputs): + pass + + executable = "executable" + + inpA: MultiInputObj[str] = shell.arg( + position=1, + help="inpA", + argstr="-v {inpA}...", + ) + + shelly = Shelly(inpA=["el_1", "el_2"]) + assert shelly.cmdline == "executable -v el_1 -v el_2" + + +def test_shell_cmd_inputs_format_3(): + """adding float formatting for argstr with input field""" + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + class Outputs(shell.Outputs): + pass + + executable = "executable" + + inpA: float = shell.arg( + position=1, + help="inpA", + argstr="-v {inpA:.5f}", + ) + + shelly = Shelly(inpA=0.007) + assert shelly.cmdline == "executable -v 0.00700" + + +def test_shell_cmd_inputs_mandatory_1(): + """additional inputs with mandatory=True""" + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + class Outputs(shell.Outputs): + pass + + executable = "executable" + + inpA: str = shell.arg( + position=1, + help="inpA", + argstr="", + ) + + shelly = Shelly() + with pytest.raises(Exception) as e: + shelly.cmdline + assert "mandatory" in str(e.value).lower() + + +def test_shell_cmd_inputs_not_given_1(): + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + class Outputs(shell.Outputs): + pass + + executable = "executable" + + arg1: MultiInputObj = shell.arg( + argstr="--arg1", + default=attrs.Factory(list), + help="Command line argument 1", + ) + arg2: MultiInputObj = shell.arg( + argstr="--arg2", + help="Command line argument 2", + ) + arg3: File | None = shell.arg( + argstr="--arg3", + default=None, + help="Command line argument 3", + ) + + shelly = Shelly() + + shelly.arg2 = "argument2" 
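+    # arg1 keeps its empty-list default and arg3 its None default, so only --arg2 appears in the command line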
+ + assert shelly.cmdline == "executable --arg2 argument2" + + +def test_shell_cmd_inputs_template_1(): + """additional inputs, one uses path_template (and argstr)""" + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + class Outputs(shell.Outputs): + outA: File = shell.outarg( + position=2, + help="outA", + argstr="-o", + path_template="{inpA}_out", + ) + + executable = "executable" + + inpA: str = shell.arg( + position=1, + help="inpA", + argstr="", + ) + + shelly = Shelly(inpA="inpA") + # outA has argstr in the metadata fields, so it's a part of the command line + # the full path will be use din the command line + assert shelly.cmdline == f"executable inpA -o {Path.cwd() / 'inpA_out'}" + # checking if outA in the output fields + assert get_output_names(shelly) == ["outA", "return_code", "stderr", "stdout"] + + +# TODO: after deciding how we use requires/templates +def test_shell_cmd_inputs_template_2(): + """additional inputs, one uses path_template (and argstr, but input not provided)""" + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + class Outputs(shell.Outputs): + outB: File | None = shell.outarg( + position=2, + help="outB", + argstr="-o", + path_template="{inpB}_out", + ) + + executable = "executable" + + inpB: File | None = shell.arg(position=1, help="inpB", argstr="", default=None) + + shelly = Shelly() + # inpB not in the inputs, so no outB in the command line + assert shelly.cmdline == "executable" + # checking if outB in the output fields + assert get_output_names(shelly) == ["outB", "return_code", "stderr", "stdout"] + + +def test_shell_cmd_inputs_template_3(tmp_path): + """additional inputs with path_template and an additional + read-only fields that combine two outputs together in the command line + """ + inpA = tmp_path / "inpA" + inpB = tmp_path / "inpB" + Path.touch(inpA) + Path.touch(inpB) + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + class Outputs(shell.Outputs): + + outA: File = shell.outarg( + help="outA", + argstr=None, + path_template="{inpA}_out", + ) + outB: File = shell.outarg( + help="outB", + argstr=None, + path_template="{inpB}_out", + ) + + executable = "executable" + + inpA: str = shell.arg( + position=1, + help="inpA", + argstr="", + ) + inpB: str = shell.arg( + position=2, + help="inpB", + argstr="", + ) + outAB: str = shell.arg( + position=-1, + help="outAB", + argstr="-o {outA} {outB}", + readonly=True, + ) + + shelly = Shelly(inpA=inpA, inpB=inpB) + # using syntax from the outAB field + assert ( + shelly.cmdline + == f"executable {tmp_path / 'inpA'} {tmp_path / 'inpB'} -o {Path.cwd() / 'inpA_out'} {str(Path.cwd() / 'inpB_out')}" + ) + # checking if outA and outB in the output fields (outAB should not be) + assert get_output_names(shelly) == [ + "outA", + "outB", + "return_code", + "stderr", + "stdout", + ] + + +def test_shell_cmd_inputs_template_3a(): + """additional inputs with path_template and an additional + read-only fields that combine two outputs together in the command line + testing a different order within the input definition + """ + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + class Outputs(shell.Outputs): + + outA: File = shell.outarg( + argstr=None, + help="outA", + path_template="{inpA}_out", + ) + outB: File = shell.outarg( + argstr=None, + help="outB", + path_template="{inpB}_out", + ) + + executable = "executable" + + inpA: str = shell.arg( + position=1, + help="inpA", + argstr="", + ) + inpB: str = shell.arg( + position=2, + help="inpB", + argstr="", + ) + outAB: 
str = shell.arg( + position=-1, + help="outAB", + argstr="-o {outA} {outB}", + readonly=True, + ) + + shelly = Shelly(inpA="inpA", inpB="inpB") + # using syntax from the outAB field + assert ( + shelly.cmdline + == f"executable inpA inpB -o {Path.cwd() / 'inpA_out'} {str(Path.cwd() / 'inpB_out')}" + ) + # checking if outA and outB in the output fields (outAB should not be) + assert get_output_names(shelly) == [ + "outA", + "outB", + "return_code", + "stderr", + "stdout", + ] + + +# TODO: after deciding how we use requires/templates +def test_shell_cmd_inputs_template_4(): + """additional inputs with path_template and an additional + read-only fields that combine two outputs together in the command line + one path_template can't be resolved - no inpB is provided + """ + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + class Outputs(shell.Outputs): + outA: File = shell.outarg( + argstr=None, + help="outA", + path_template="{inpA}_out", + ) + outB: File | None = shell.outarg( + argstr=None, + help="outB", + path_template="{inpB}_out", + ) + + executable = "executable" + + inpA: str = shell.arg( + position=1, + help="inpA", + argstr="", + ) + inpB: str | None = shell.arg(position=2, help="inpB", argstr="", default=None) + outAB: str = shell.arg( + position=-1, + help="outAB", + argstr="-o {outA} {outB}", + readonly=True, + ) + + shelly = Shelly(inpA="inpA") + # inpB is not provided so outB not in the command line + assert shelly.cmdline == f"executable inpA -o {Path.cwd() / 'inpA_out'}" + assert get_output_names(shelly) == [ + "outA", + "outB", + "return_code", + "stderr", + "stdout", + ] + + +def test_shell_cmd_inputs_template_5_ex(): + """checking if the exception is raised for read-only fields when input is set""" + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + class Outputs(shell.Outputs): + pass + + executable = "executable" + + outAB: str = shell.arg( + position=-1, + help="outAB", + argstr="-o", + readonly=True, + ) + + shelly = Shelly(outAB="outAB") + with pytest.raises(Exception) as e: + shelly.cmdline + assert "read only" in str(e.value) + + +def test_shell_cmd_inputs_template_6(): + """additional inputs with path_template that has type ty.Union[str, bool] + no default is set, so if nothing is provided as an input, the output is used + whenever the template can be formatted + (the same way as for templates that has type=str) + """ + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + class Outputs(shell.Outputs): + outA: File = shell.outarg( + position=2, + help="outA", + argstr="-o", + path_template="{inpA}_out", + ) + + executable = "executable" + + inpA: File = shell.arg( + position=1, + help="inpA", + argstr="", + ) + + # no input for outA (and no default value), so the output is created whenever the + # template can be formatted (the same way as for templates that has type=str) + inpA = File.mock("inpA") + shelly = Shelly(inpA=inpA) + + inpA_path = Path.cwd() / "inpA" + outA_path = Path.cwd() / "inpA_out" + assert shelly.cmdline == f"executable {inpA_path} -o {outA_path}" + + # a string is provided for outA, so this should be used as the outA value + shelly = Shelly(inpA=inpA, outA="outA") + assert shelly.cmdline == f"executable {inpA_path} -o outA" + + # True is provided for outA, so the formatted template should be used as outA value + shelly = Shelly(inpA=inpA, outA=True) + assert shelly.cmdline == f"executable {inpA_path} -o {outA_path}" + + # False is provided for outA, so the outA shouldn't be used + shelly = Shelly(inpA=inpA, 
outA=False) + assert shelly.cmdline == f"executable {inpA_path}" + + +def test_shell_cmd_inputs_template_6a(): + """additional inputs with path_template that has type ty.Union[str, bool] + and default is set to False, + so if nothing is provided as an input, the output is not used + """ + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + class Outputs(shell.Outputs): + outA: File | None = shell.outarg( + position=2, + help="outA", + argstr="-o", + path_template="{inpA}_out", + ) + + executable = "executable" + + inpA: str = shell.arg( + position=1, + help="inpA", + argstr="", + ) + + # no input for outA, but default is False, so the outA shouldn't be used + shelly = Shelly(inpA="inpA") + assert shelly.cmdline == "executable inpA" + + # a string is provided for outA, so this should be used as the outA value + shelly = Shelly(inpA="inpA", outA="outA") + assert shelly.cmdline == "executable inpA -o outA" + + # True is provided for outA, so the formatted template should be used as outA value + shelly = Shelly(inpA="inpA", outA=True) + assert shelly.cmdline == f"executable inpA -o {Path.cwd() / 'inpA_out'}" + + # False is provided for outA, so the outA shouldn't be used + shelly = Shelly(inpA="inpA", outA=False) + assert shelly.cmdline == "executable inpA" + + +def test_shell_cmd_inputs_template_7(tmp_path: Path): + """additional inputs uses path_template with a suffix (no extension) + no keep_extension is used + """ + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + class Outputs(shell.Outputs): + outA: File = shell.outarg( + position=2, + help="outA", + argstr="", + path_template="{inpA}_out", + ) + + executable = "executable" + + inpA: File = shell.arg( + position=1, + help="inpA", + argstr="", + ) + + inpA_file = tmp_path / "a_file.txt" + inpA_file.write_text("content") + shelly = Shelly(inpA=inpA_file) + + # outA should be formatted in a way that that .txt goes to the end + assert ( + shelly.cmdline + == f"executable {tmp_path / 'a_file.txt'} {Path.cwd() / 'a_file_out.txt'}" + ) + + +def test_shell_cmd_inputs_template_7a(tmp_path: Path): + """additional inputs uses path_template with a suffix (no extension) + keep_extension is True (as default) + """ + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + class Outputs(shell.Outputs): + outA: File = shell.outarg( + position=2, + help="outA", + argstr="", + keep_extension=True, + path_template="{inpA}_out", + ) + + executable = "executable" + + inpA: File = shell.arg( + position=1, + help="inpA", + argstr="", + ) + + inpA_file = tmp_path / "a_file.txt" + inpA_file.write_text("content") + shelly = Shelly(inpA=inpA_file) + + # outA should be formatted in a way that that .txt goes to the end + assert ( + shelly.cmdline + == f"executable {tmp_path / 'a_file.txt'} {Path.cwd() / 'a_file_out.txt'}" + ) + + +def test_shell_cmd_inputs_template_7b(tmp_path: Path): + """additional inputs uses path_template with a suffix (no extension) + keep extension is False (so the extension is removed when creating the output) + """ + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + class Outputs(shell.Outputs): + outA: File = shell.outarg( + position=2, + help="outA", + argstr="", + keep_extension=False, + path_template="{inpA}_out", + ) + + executable = "executable" + + inpA: File = shell.arg( + position=1, + help="inpA", + argstr="", + ) + + inpA_file = tmp_path / "a_file.txt" + inpA_file.write_text("content") + shelly = Shelly(inpA=inpA_file) + + # outA should be formatted in a way that that .txt goes to 
the end + assert ( + shelly.cmdline + == f"executable {tmp_path / 'a_file.txt'} {Path.cwd() / 'a_file_out'}" + ) + + +def test_shell_cmd_inputs_template_8(tmp_path: Path): + """additional inputs uses path_template with a suffix and an extension""" + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + class Outputs(shell.Outputs): + outA: File = shell.outarg( + position=2, + help="outA", + argstr="", + path_template="{inpA}_out.txt", + ) + + executable = "executable" + + inpA: File = shell.arg( + position=1, + help="inpA", + argstr="", + ) + + inpA_file = tmp_path / "a_file.t" + inpA_file.write_text("content") + shelly = Shelly(inpA=inpA_file) + + # outA should be formatted in a way that inpA extension is removed and the template extension is used + assert ( + shelly.cmdline + == f"executable {tmp_path / 'a_file.t'} {Path.cwd() / 'a_file_out.txt'}" + ) + + +def test_shell_cmd_inputs_template_9(tmp_path: Path): + """additional inputs, one uses path_template with two fields: + one File and one ints - the output should be recreated from the template + """ + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + class Outputs(shell.Outputs): + outA: File = shell.outarg( + position=3, + help="outA", + argstr="-o", + path_template="{inpA}_{inpInt}_out.txt", + ) + + executable = "executable" + + inpA: File = shell.arg( + position=1, + help="inpA", + argstr="", + ) + inpInt: int = shell.arg( + position=2, + help="inp int", + argstr="-i", + ) + + inpA_file = tmp_path / "inpA.t" + inpA_file.write_text("content") + + shelly = Shelly(inpA=inpA_file, inpInt=3) + + assert ( + shelly.cmdline + == f"executable {tmp_path / 'inpA.t'} -i 3 -o {Path.cwd() / 'inpA_3_out.txt'}" + ) + # checking if outA in the output fields + assert get_output_names(shelly) == ["outA", "return_code", "stderr", "stdout"] + + +def test_shell_cmd_inputs_template_9a(tmp_path: Path): + """additional inputs, one uses path_template with two fields: + one file and one string without extension - should be fine + """ + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + class Outputs(shell.Outputs): + outA: File = shell.outarg( + position=3, + help="outA", + argstr="-o", + path_template="{inpA}_{inpStr}_out.txt", + ) + + executable = "executable" + + inpA: File = shell.arg( + position=1, + help="inpA", + argstr="", + ) + inpStr: str = shell.arg( + position=2, + help="inp str", + argstr="-i", + ) + + inpA_file = tmp_path / "inpA.t" + inpA_file.write_text("content") + + shelly = Shelly(inpA=inpA_file, inpStr="hola") + + assert ( + shelly.cmdline + == f"executable {tmp_path / 'inpA.t'} -i hola -o {Path.cwd() / 'inpA_hola_out.txt'}" + ) + # checking if outA in the output fields + assert get_output_names(shelly) == ["outA", "return_code", "stderr", "stdout"] + + +def test_shell_cmd_inputs_template_9b_err(tmp_path: Path): + """path_template with two fields that are both Files, + an exception should be raised + """ + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + class Outputs(shell.Outputs): + + outA: File = shell.outarg( + position=3, + help="outA", + argstr="-o", + path_template="{inpA}_{inpFile}_out.txt", + ) + + executable = "executable" + + inpA: File = shell.arg( + position=1, + help="inpA", + argstr="", + ) + inpFile: File = shell.arg( + position=2, + help="inp file", + argstr="-i", + ) + + inpA_file = tmp_path / "inpA.t" + inpA_file.write_text("content") + + inpFile_file = tmp_path / "inpFile.t" + inpFile_file.write_text("content") + + shelly = Shelly( + inpA=inpA_file, + 
inpFile=inpFile_file, + ) + # the template has two files so the exception should be raised + with pytest.raises(Exception, match="can't have multiple paths"): + shelly.cmdline + + +def test_shell_cmd_inputs_template_9c_err(tmp_path: Path): + """path_template with two fields: a file and a string with extension, + that should be used as an additional file and the exception should be raised + """ + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + class Outputs(shell.Outputs): + + outA: File = shell.outarg( + position=3, + help="outA", + argstr="-o", + path_template="{inpA}_{inpStr}_out.txt", + ) + + executable = "executable" + + inpA: File = shell.arg( + position=1, + help="inpA", + argstr="", + ) + inpStr: Path = shell.arg( + position=2, + help="inp str with extension", + argstr="-i", + ) + + inpA_file = tmp_path / "inpA.t" + inpA_file.write_text("content") + + shelly = Shelly( + inpA=inpA_file, + inpStr="hola.txt", + ) + # inptStr has an extension so should be treated as a second file in the template formatting + # and the exception should be raised + with pytest.raises(Exception, match="can't have multiple paths"): + shelly.cmdline + + +def test_shell_cmd_inputs_template_10(): + """path_template uses a float field with formatting""" + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + class Outputs(shell.Outputs): + + outA: File = shell.outarg( + position=2, + help="outA", + argstr="-o", + path_template="file_{inpA:.1f}_out", + ) + + executable = "executable" + + inpA: float = shell.arg( + position=1, + help="inpA", + argstr="{inpA:.1f}", + ) + + shelly = Shelly(inpA=3.3456) + # outA has argstr in the metadata fields, so it's a part of the command line + # the full path will be use din the command line + assert shelly.cmdline == f"executable 3.3 -o {Path.cwd() / 'file_3.3_out'}" + # checking if outA in the output fields + assert get_output_names(shelly) == ["outA", "return_code", "stderr", "stdout"] + + +def test_shell_cmd_inputs_template_requires_1(): + """Given an input definition with a templated output file subject to required fields, + ensure the field is set only when all requirements are met.""" + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + class Outputs(shell.Outputs): + + out_file: File | None = shell.outarg( + help="output file", + argstr="--tpl", + path_template="tpl.{in_file}", + requires={"with_tpl"}, + ) + + executable = "executable" + + in_file: str = shell.arg( + help="input file", + argstr="", + ) + with_tpl: bool = shell.arg(help="enable template", default=False) + + # When requirements are not met. + shelly = Shelly(executable="cmd", in_file="in.file") + assert "--tpl" not in shelly.cmdline + + # When requirements are met. 
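+    # setting with_tpl satisfies the requires={"with_tpl"} constraint, so the templated out_file (and its --tpl flag) is emitted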
+ shelly.with_tpl = True + assert "tpl.in.file" in shelly.cmdline + + +def test_shell_cmd_inputs_template_function_1(): + """one input field uses path_template that is a simple function + this can be easily done by simple template as in test_shell_cmd_inputs_template_1 + """ + + # a function that return an output template + def template_fun(inputs): + return "{inpA}_out" + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + class Outputs(shell.Outputs): + + outA: File = shell.outarg( + position=2, + help="outA", + argstr="-o", + path_template=template_fun, + ) + + executable = "executable" + + inpA: str = shell.arg( + position=1, + help="inpA", + argstr="", + ) + + shelly = Shelly(inpA="inpA") + + assert shelly.cmdline == f"executable inpA -o {Path.cwd() / 'inpA_out'}" + + +def test_shell_cmd_inputs_template_function_2(): + """one input field uses path_template that is a function, + depending on a value of an input it returns different template + """ + + # a function that return an output template that depends on value of the input + def template_fun(inputs): + if inputs.inpB % 2 == 0: + return "{inpA}_even" + else: + return "{inpA}_odd" + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + class Outputs(shell.Outputs): + + outA: File = shell.outarg( + position=2, + help="outA", + argstr="-o", + path_template=template_fun, + ) + + executable = "executable" + + inpA: str = shell.arg( + position=1, + help="inpA", + argstr="", + ) + inpB: int = shell.arg( + help="inpB", + argstr=None, + ) + + shelly = Shelly( + inpA="inpA", + inpB=1, + ) + + assert shelly.cmdline == f"executable inpA -o {Path.cwd() / 'inpA_odd'}" + + +# TODO: after deciding how we use requires/templates +def test_shell_cmd_inputs_denoise_image( + tmp_path, +): + """example from #279""" + + @shell.define + class DenoiseImage(shell.Task["DenoiseImage.Outputs"]): + class Outputs(shell.Outputs): + + correctedImage: File = shell.outarg( + help=""" + The output consists of the noise corrected version of the input image. + Optionally, one can also output the estimated noise image. """, + path_template="{inputImageFilename}_out", + argstr=None, + ) + noiseImage: File | None = shell.outarg( + help=""" + The output consists of the noise corrected version of the input image. + Optionally, one can also output the estimated noise image. """, + path_template="{inputImageFilename}_noise", + argstr=None, + ) + + executable = "executable" + + image_dimensionality: int | None = shell.arg( + help=""" + 2/3/4 + This option forces the image to be treated as a specified-dimensional image. + If not specified, the program tries to infer the dimensionality from + the input image. + """, + allowed_values=[2, 3, 4, None], + default=None, + argstr="-d", + position=1, + ) + inputImageFilename: File = shell.arg( + help="A scalar image is expected as input for noise correction.", + argstr="-i", + position=2, + ) + noise_model: str | None = shell.arg( + default=None, + help=""" Rician/(Gaussian) Employ a Rician or Gaussian noise model. """, + allowed_values=["Rician", "Gaussian"], + argstr="-n", + ) + maskImageFilename: str | None = shell.arg( + default=None, + help="If a mask image is specified, denoising is only performed in the mask region.", + argstr="-x", + ) + shrink_factor: int = shell.arg( + default=1, + help=""" + (1)/2/3/... + Running noise correction on large images can be time consuming. + To lessen computation time, the input image can be resampled. 
+ The shrink factor, specified as a single integer, describes this + resampling. Shrink factor = 1 is the default. """, + argstr="-s", + position=3, + ) + patch_radius: int = shell.arg( + default=1, help="Patch radius. Default = 1x1x1", argstr="-p", position=4 + ) + search_radius: int = shell.arg( + default=2, help="Search radius. Default = 2x2x2.", argstr="-r", position=5 + ) + output: str = shell.arg( + help="Combined output", + argstr="-o [{correctedImage}, {noiseImage}]", + position=-1, + readonly=True, + ) + version: bool = shell.arg( + default=False, + help="Get Version Information.", + argstr="--version", + ) + verbose: int = shell.arg(default=0, help="(0)/1. Verbose output. ", argstr="-v") + help_short: bool = shell.arg( + default=False, + help="Print the help menu (short version)", + argstr="-h", + ) + help: int | None = shell.arg( + default=None, + help="Print the help menu.", + argstr="--help", + ) + + my_input_file = tmp_path / "a_file.ext" + my_input_file.write_text("content") + + # no input provided + denoise_image = DenoiseImage( + executable="DenoiseImage", + ) + with pytest.raises(Exception) as e: + denoise_image.cmdline + assert "mandatory" in str(e.value).lower() + + # input file name, noiseImage is not set, so using default value False + denoise_image = DenoiseImage( + executable="DenoiseImage", + inputImageFilename=my_input_file, + ) + assert ( + denoise_image.cmdline + == f"DenoiseImage -i {tmp_path / 'a_file.ext'} -s 1 -p 1 -r 2 -o [{Path.cwd() / 'a_file_out.ext'}]" + ) + + # input file name, noiseImage is set to True, so template is used in the output + denoise_image = DenoiseImage( + executable="DenoiseImage", + inputImageFilename=my_input_file, + noiseImage=True, + ) + assert ( + denoise_image.cmdline + == f"DenoiseImage -i {tmp_path / 'a_file.ext'} -s 1 -p 1 -r 2 " + f"-o [{Path.cwd() / 'a_file_out.ext'}, {str(Path.cwd() / 'a_file_noise.ext')}]" + ) + + # input file name and help_short + denoise_image = DenoiseImage( + executable="DenoiseImage", + inputImageFilename=my_input_file, + help_short=True, + ) + assert ( + denoise_image.cmdline + == f"DenoiseImage -i {tmp_path / 'a_file.ext'} -s 1 -p 1 -r 2 -h -o [{Path.cwd() / 'a_file_out.ext'}]" + ) + + assert get_output_names(denoise_image) == [ + "correctedImage", + "noiseImage", + "return_code", + "stderr", + "stdout", + ] + + # adding image_dimensionality that has allowed_values [2, 3, 4] + denoise_image = DenoiseImage( + executable="DenoiseImage", + inputImageFilename=my_input_file, + image_dimensionality=2, + ) + assert ( + denoise_image.cmdline + == f"DenoiseImage -d 2 -i {tmp_path / 'a_file.ext'} -s 1 -p 1 -r 2 -o [{Path.cwd() / 'a_file_out.ext'}]" + ) + + # adding image_dimensionality that has allowed_values [2, 3, 4] and providing 5 - exception should be raised + with pytest.raises(ValueError) as excinfo: + denoise_image = DenoiseImage( + executable="DenoiseImage", + inputImageFilename=my_input_file, + image_dimensionality=5, + ) + assert "value of image_dimensionality" in str(excinfo.value) + + +# tests with XOR in input metadata + + +@shell.define(xor=("input_1", "input_2", "input_3")) +class SimpleXor(shell.Task["SimpleTaskXor.Outputs"]): + + input_1: str | None = shell.arg( + default=None, + help="help", + ) + input_2: bool | None = shell.arg( + default=None, + help="help", + argstr="--i2", + ) + input_3: bool | None = shell.arg( + default=None, + help="help", + ) + + class Outputs(shell.Outputs): + pass + + executable = "cmd" + + +def test_task_inputs_mandatory_with_xOR_one_mandatory_is_OK(): + """input 
definition with mandatory inputs""" + simple_xor = SimpleXor() + simple_xor.input_1 = "Input1" + simple_xor._check_rules() + + +def test_task_inputs_mandatory_with_xOR_one_mandatory_out_3_is_OK(): + """input definition with mandatory inputs""" + simple_xor = SimpleXor() + simple_xor.input_3 = True + simple_xor._check_rules() + + +def test_task_inputs_mandatory_with_xOR_zero_mandatory_raises_error(): + """input definition with mandatory inputs""" + simple_xor = SimpleXor() + simple_xor.input_2 = False + with pytest.raises( + ValueError, match="At least one of the mutually exclusive fields should be set:" + ): + simple_xor._check_rules() + + +def test_task_inputs_mandatory_with_xOR_two_mandatories_raises_error(): + """input definition with mandatory inputs""" + simple_xor = SimpleXor() + simple_xor.input_1 = "Input1" + simple_xor.input_2 = True + + with pytest.raises( + ValueError, match="Mutually exclusive fields .* are set together" + ): + simple_xor._check_rules() + + +def test_task_inputs_mandatory_with_xOR_3_mandatories_raises_error(): + """input definition with mandatory inputs""" + simple_xor = SimpleXor() + simple_xor.input_1 = "Input1" + simple_xor.input_2 = True + simple_xor.input_3 = False + + with pytest.raises( + ValueError, + match=r".*Mutually exclusive fields \(input_1='Input1', input_2=True\) are set together", + ): + simple_xor._check_rules() diff --git a/pydra/compose/shell/tests/test_shell_fields.py b/pydra/compose/shell/tests/test_shell_fields.py new file mode 100644 index 0000000000..79459e0f72 --- /dev/null +++ b/pydra/compose/shell/tests/test_shell_fields.py @@ -0,0 +1,1051 @@ +import os +import typing as ty +from pathlib import Path +import attrs +import pytest +import cloudpickle as cp +from pydra.compose import shell +from pydra.utils.general import task_fields, task_help, wrap_text +from pydra.compose.shell.builder import _InputPassThrough +from fileformats.generic import File, Directory, FsObject +from fileformats import text, image +from pydra.utils.typing import MultiInputObj + + +def test_interface_template(): + + Cp = shell.define("cp ") + + assert issubclass(Cp, shell.Task) + output = shell.outarg( + name="out_path", + path_template="out_path", + type=FsObject, + position=2, + ) + assert sorted_fields(Cp) == [ + shell.arg( + name="executable", + validator=attrs.validators.min_len(1), + default="cp", + type=str | ty.Sequence[str], + position=0, + help=shell.Task.EXECUTABLE_HELP, + ), + shell.arg(name="in_path", type=FsObject, position=1), + output, + shell.Task.append_args, + ] + assert sorted_fields(Cp.Outputs) == [ + output, + shell.out( + name="return_code", + type=int, + help=shell.Outputs.RETURN_CODE_HELP, + ), + shell.out( + name="stderr", + type=str, + help=shell.Outputs.STDERR_HELP, + ), + shell.out( + name="stdout", + type=str, + help=shell.Outputs.STDOUT_HELP, + ), + ] + intf = Cp(in_path=File.mock("in-path.txt")) + assert intf.executable == "cp" + Cp(in_path=File.mock("in-path.txt"), out_path=Path("./out-path.txt")) + Cp.Outputs(out_path=File.mock("in-path.txt")) + + +def test_executable_arg_fail(): + + with pytest.raises(ValueError, match="The argument 'executable' is reserved"): + + shell.define("my-cmd ") + + +def test_interface_template_w_types_and_path_template_ext(): + + TrimPng = shell.define("trim-png ") + + assert issubclass(TrimPng, shell.Task) + output = shell.outarg( + name="out_image", + path_template="out_image.png", + type=image.Png, + position=2, + ) + assert sorted_fields(TrimPng) == [ + shell.arg( + name="executable", + 
validator=attrs.validators.min_len(1), + default="trim-png", + type=str | ty.Sequence[str], + position=0, + help=shell.Task.EXECUTABLE_HELP, + ), + shell.arg(name="in_image", type=image.Png, position=1), + output, + shell.Task.append_args, + ] + assert sorted_fields(TrimPng.Outputs) == [ + output, + shell.out( + name="return_code", + type=int, + help=shell.Outputs.RETURN_CODE_HELP, + ), + shell.out( + name="stderr", + type=str, + help=shell.Outputs.STDERR_HELP, + ), + shell.out( + name="stdout", + type=str, + help=shell.Outputs.STDOUT_HELP, + ), + ] + TrimPng(in_image=image.Png.mock()) + TrimPng(in_image=image.Png.mock(), out_image=Path("./new_image.png")) + TrimPng.Outputs(out_image=image.Png.mock()) + + +def test_interface_template_w_modify(): + + TrimPng = shell.define("trim-png ") + + assert issubclass(TrimPng, shell.Task) + assert sorted_fields(TrimPng) == [ + shell.arg( + name="executable", + validator=attrs.validators.min_len(1), + default="trim-png", + type=str | ty.Sequence[str], + position=0, + help=shell.Task.EXECUTABLE_HELP, + ), + shell.arg( + name="image", type=image.Png, position=1, copy_mode=File.CopyMode.copy + ), + shell.Task.append_args, + ] + assert sorted_fields(TrimPng.Outputs) == [ + shell.out( + name="image", + type=image.Png, + callable=_InputPassThrough("image"), + ), + shell.out( + name="return_code", + type=int, + help=shell.Outputs.RETURN_CODE_HELP, + ), + shell.out( + name="stderr", + type=str, + help=shell.Outputs.STDERR_HELP, + ), + shell.out( + name="stdout", + type=str, + help=shell.Outputs.STDOUT_HELP, + ), + ] + TrimPng(image=image.Png.mock()) + TrimPng.Outputs(image=image.Png.mock()) + + +def test_interface_template_more_complex(): + + Cp = shell.define( + ( + "cp " + "-R " + "--text-arg " + "--int-arg " + "--tuple-arg " + ), + ) + + assert issubclass(Cp, shell.Task) + output = shell.outarg( + name="out_dir", + type=Directory, + path_template="out_dir", + position=2, + ) + assert sorted_fields(Cp) == [ + shell.arg( + name="executable", + validator=attrs.validators.min_len(1), + default="cp", + type=str | ty.Sequence[str], + position=0, + help=shell.Task.EXECUTABLE_HELP, + ), + shell.arg( + name="in_fs_objects", + type=MultiInputObj[FsObject], + position=1, + ), + output, + shell.arg(name="recursive", argstr="-R", type=bool, default=False, position=3), + shell.arg( + name="text_arg", + argstr="--text-arg", + type=str | None, + default=None, + position=4, + ), + shell.arg( + name="int_arg", + argstr="--int-arg", + type=int | None, + default=None, + position=5, + ), + shell.arg( + name="tuple_arg", + argstr="--tuple-arg", + type=tuple[int, str] | None, + sep=" ", + default=None, + position=6, + ), + shell.Task.append_args, + ] + assert sorted_fields(Cp.Outputs) == [ + output, + shell.out( + name="return_code", + type=int, + help=shell.Outputs.RETURN_CODE_HELP, + ), + shell.out( + name="stderr", + type=str, + help=shell.Outputs.STDERR_HELP, + ), + shell.out( + name="stdout", + type=str, + help=shell.Outputs.STDOUT_HELP, + ), + ] + Cp(in_fs_objects=[File.sample(), File.sample(seed=1)]) + Cp.Outputs(out_dir=Directory.sample()) + + +def test_interface_template_with_overrides_and_optionals(): + + RECURSIVE_HELP = ( + "If source_file designates a directory, cp copies the directory and the entire " + "subtree connected at that point." 
+ ) + + Cp = shell.define( + ( + "cp " + "-R " + "--text-arg " + "--int-arg " + "--tuple-arg " + ), + inputs={"recursive": shell.arg(help=RECURSIVE_HELP)}, + outputs={ + "out_dir": shell.outarg(position=-2), + "out_file": shell.outarg(position=-1), + }, + ) + + assert issubclass(Cp, shell.Task) + outargs = [ + shell.outarg( + name="out_dir", + type=Directory, + path_template="out_dir", + position=-2, + ), + shell.outarg( + name="out_file", + type=File | None, + default=None, + path_template="out_file", + position=-1, + ), + ] + assert sorted_fields(Cp) == [ + shell.arg( + name="executable", + validator=attrs.validators.min_len(1), + default="cp", + type=str | ty.Sequence[str], + position=0, + help=shell.Task.EXECUTABLE_HELP, + ), + shell.arg(name="in_fs_objects", type=MultiInputObj[FsObject], position=1), + shell.arg( + name="recursive", + argstr="-R", + type=bool, + default=False, + help=RECURSIVE_HELP, + position=2, + ), + shell.arg(name="text_arg", argstr="--text-arg", type=str, position=3), + shell.arg( + name="int_arg", + argstr="--int-arg", + type=int | None, + default=None, + position=4, + ), + shell.arg( + name="tuple_arg", + argstr="--tuple-arg", + type=tuple[int, str], + sep=" ", + position=5, + ), + ] + outargs + [shell.Task.append_args] + assert sorted_fields(Cp.Outputs) == outargs + [ + shell.out( + name="return_code", + type=int, + help=shell.Outputs.RETURN_CODE_HELP, + ), + shell.out( + name="stderr", + type=str, + help=shell.Outputs.STDERR_HELP, + ), + shell.out( + name="stdout", + type=str, + help=shell.Outputs.STDOUT_HELP, + ), + ] + + +def test_interface_template_with_defaults(): + + Cp = shell.define( + ( + "cp " + "-R " + "--text-arg " + "--int-arg " + "--tuple-arg " + ), + ) + + assert issubclass(Cp, shell.Task) + output = shell.outarg( + name="out_dir", + type=Directory, + path_template="out_dir", + position=2, + ) + assert sorted_fields(Cp) == [ + shell.arg( + name="executable", + validator=attrs.validators.min_len(1), + default="cp", + type=str | ty.Sequence[str], + position=0, + help=shell.Task.EXECUTABLE_HELP, + ), + shell.arg(name="in_fs_objects", type=MultiInputObj[FsObject], position=1), + output, + shell.arg(name="recursive", argstr="-R", type=bool, default=True, position=3), + shell.arg( + name="text_arg", argstr="--text-arg", type=str, position=4, default="foo" + ), + shell.arg(name="int_arg", argstr="--int-arg", type=int, position=5, default=99), + shell.arg( + name="tuple_arg", + argstr="--tuple-arg", + type=tuple[int, str], + default=(1, "bar"), + position=6, + sep=" ", + ), + shell.Task.append_args, + ] + assert sorted_fields(Cp.Outputs) == [ + output, + shell.out( + name="return_code", + type=int, + help=shell.Outputs.RETURN_CODE_HELP, + ), + shell.out( + name="stderr", + type=str, + help=shell.Outputs.STDERR_HELP, + ), + shell.out( + name="stdout", + type=str, + help=shell.Outputs.STDOUT_HELP, + ), + ] + Cp(in_fs_objects=[File.sample(), File.sample(seed=1)]) + Cp.Outputs(out_dir=Directory.sample()) + + +def test_interface_template_with_type_overrides(): + + Cp = shell.define( + ( + "cp " + "-R " + "--text-arg " + "--int-arg " + "--tuple-arg " + ), + inputs={"text_arg": str, "int_arg": int | None}, + ) + + assert issubclass(Cp, shell.Task) + output = shell.outarg( + name="out_dir", + type=Directory, + path_template="out_dir", + position=2, + ) + assert sorted_fields(Cp) == [ + shell.arg( + name="executable", + validator=attrs.validators.min_len(1), + default="cp", + type=str | ty.Sequence[str], + position=0, + help=shell.Task.EXECUTABLE_HELP, + ), + 
shell.arg(name="in_fs_objects", type=MultiInputObj[FsObject], position=1), + output, + shell.arg(name="recursive", argstr="-R", type=bool, default=False, position=3), + shell.arg(name="text_arg", argstr="--text-arg", type=str, position=4), + shell.arg( + name="int_arg", + argstr="--int-arg", + type=int | None, + position=5, + ), + shell.arg( + name="tuple_arg", + argstr="--tuple-arg", + type=tuple[int, str], + position=6, + sep=" ", + ), + shell.Task.append_args, + ] + assert sorted_fields(Cp.Outputs) == [ + output, + shell.out( + name="return_code", + type=int, + help=shell.Outputs.RETURN_CODE_HELP, + ), + shell.out( + name="stderr", + type=str, + help=shell.Outputs.STDERR_HELP, + ), + shell.out( + name="stdout", + type=str, + help=shell.Outputs.STDOUT_HELP, + ), + ] + + +@pytest.fixture(params=["static", "dynamic"]) +def Ls(request): + if request.param == "static": + + @shell.define(xor=["complete_date", "date_format_str", None]) + class Ls(shell.Task["Ls.Outputs"]): + executable = "ls" + + directory: Directory = shell.arg( + help="the directory to list the contents of", + argstr="", + position=-1, + ) + hidden: bool = shell.arg( + help=("display hidden FS objects"), + argstr="-a", + default=False, + ) + long_format: bool = shell.arg( + help=( + "display properties of FS object, such as permissions, size and " + "timestamps " + ), + default=False, + argstr="-l", + ) + human_readable: bool = shell.arg( + help="display file sizes in human readable form", + argstr="-h", + default=False, + requires=["long_format"], + ) + complete_date: bool = shell.arg( + help="Show complete date in long format", + argstr="-T", + default=False, + requires=["long_format"], + ) + date_format_str: str | None = shell.arg( + help="format string for ", + argstr="-D", + default=None, + requires=["long_format"], + ) + + class Outputs(shell.Outputs): + entries: list = shell.out( + help="list of entries returned by ls command", + callable=list_entries, + ) + + elif request.param == "dynamic": + Ls = shell.define( + "ls", + inputs={ + "directory": shell.arg( + type=Directory, + help="the directory to list the contents of", + argstr="", + position=-1, + ), + "hidden": shell.arg( + type=bool, + help="display hidden FS objects", + default=False, + argstr="-a", + ), + "long_format": { # Mix it up with a full dictionary based definition + "type": bool, + "default": False, + "help": ( + "display properties of FS object, such as permissions, size and " + "timestamps " + ), + "argstr": "-l", + }, + "human_readable": shell.arg( + type=bool, + help="display file sizes in human readable form", + default=False, + argstr="-h", + requires=["long_format"], + ), + "complete_date": shell.arg( + type=bool, + help="Show complete date in long format", + argstr="-T", + default=False, + requires=["long_format"], + ), + "date_format_str": shell.arg( + type=str | None, + help="format string for ", + default=None, + argstr="-D", + requires=["long_format"], + ), + }, + outputs={ + "entries": shell.out( + type=list, + help="list of entries returned by ls command", + callable=list_entries, + ) + }, + xor=["complete_date", "date_format_str", None], + name="Ls", + ) + + else: + assert False + + return Ls + + +def test_shell_fields(Ls): + assert sorted([a.name for a in sorted_fields(Ls)]) == sorted( + [ + "append_args", + "executable", + "directory", + "hidden", + "long_format", + "human_readable", + "complete_date", + "date_format_str", + ] + ) + + assert [a.name for a in sorted_fields(Ls.Outputs)] == sorted( + [ + "entries", + "stdout", + 
"stderr", + "return_code", + ] + ) + + +def test_shell_pickle_roundtrip(Ls, tmp_path): + pkl_file = tmp_path / "ls.pkl" + with open(pkl_file, "wb") as f: + cp.dump(Ls, f) + + with open(pkl_file, "rb") as f: + RereadLs = cp.load(f) + + assert RereadLs is Ls + + +# @pytest.mark.xfail(reason="Still need to update tasks to use new shell interface") +def test_shell_run(Ls, tmp_path): + Path.touch(tmp_path / "a") + Path.touch(tmp_path / "b") + Path.touch(tmp_path / "c") + + ls = Ls(directory=tmp_path, long_format=True) + + # Test cmdline + assert ls.directory == Directory(tmp_path) + assert not ls.hidden + assert ls.long_format + assert ls.cmdline == f"ls -l {tmp_path}" + + # Drop Long format flag to make output simpler + ls = Ls(directory=tmp_path) + outputs = ls() + + assert sorted(outputs.entries) == ["a", "b", "c"] + + +@pytest.fixture(params=["static", "dynamic"]) +def A(request): + if request.param == "static": + + @shell.define + class A(shell.Task["A.Outputs"]): + """An example shell interface described in a class + + Parameters + ---------- + x : File + an input file + """ + + executable = "cp" + + x: File = shell.arg(argstr="", position=1) + + class Outputs(shell.Outputs): + """The outputs of the example shell interface + + Parameters + ---------- + y : File + path of output file""" + + y: File = shell.outarg(path_template="{x}_out", position=-1) + + elif request.param == "dynamic": + A = shell.define( + "cp", + inputs={ + "x": shell.arg( + type=File, + help="an input file", + argstr="", + position=1, + ), + }, + outputs={ + "y": shell.outarg( + type=File, + help="path of output file", + argstr="", + path_template="{x}_out", + ), + }, + name="A", + ) + else: + assert False + + return A + + +def test_shell_output_path_template(A): + assert "y" in [a.name for a in attrs.fields(A.Outputs)] + + +def test_shell_output_field_name_static(): + @shell.define + class A(shell.Task["A.Outputs"]): + """Copy a file""" + + executable = "cp" + + x: File = shell.arg(help="an input file", argstr="", position=1) + + class Outputs(shell.Outputs): + y: File = shell.outarg( + help="the output file", + path_template="{x}_out", + argstr="", + position=-1, + ) + + assert sorted([a.name for a in attrs.fields(A) if not a.name.startswith("_")]) == [ + "append_args", + "executable", + "x", + "y", + ] + assert sorted( + a.name for a in attrs.fields(A.Outputs) if not a.name.startswith("_") + ) == [ + "return_code", + "stderr", + "stdout", + "y", + ] + output = shell.outarg( + name="y", + type=File, + help="the output file", + path_template="{x}_out", + argstr="", + position=-1, + ) + assert sorted_fields(A) == [ + shell.arg( + name="executable", + validator=attrs.validators.min_len(1), + default="cp", + type=str | ty.Sequence[str], + argstr="", + position=0, + help=shell.Task.EXECUTABLE_HELP, + ), + shell.arg( + name="x", + type=File, + help="an input file", + argstr="", + position=1, + ), + output, + shell.Task.append_args, + ] + assert sorted_fields(A.Outputs) == [ + output, + shell.out( + name="return_code", + type=int, + help=shell.Outputs.RETURN_CODE_HELP, + ), + shell.out( + name="stderr", + type=str, + help=shell.Outputs.STDERR_HELP, + ), + shell.out( + name="stdout", + type=str, + help=shell.Outputs.STDOUT_HELP, + ), + ] + + +def test_shell_output_field_name_dynamic(): + A = shell.define( + "cp", + name="A", + inputs={ + "x": shell.arg( + type=File, + help="an input file", + argstr="", + position=1, + ), + }, + outputs={ + "y": shell.outarg( + type=File, + help="path of output file", + argstr="", + 
path_template="{x}_out", + ), + }, + ) + + assert "y" in [a.name for a in attrs.fields(A.Outputs)] + + +def get_file_size(y: Path): + result = os.stat(y) + return result.st_size + + +def test_shell_bases_dynamic(A, tmp_path): + B = shell.define( + name="B", + inputs={"y": shell.arg(type=File, help="output file", argstr="", position=-1)}, + outputs={ + "out_file_size": { + "type": int, + "help": "size of the output directory", + "callable": get_file_size, + } + }, + bases=[A], + ) + + xpath = tmp_path / "x.txt" + ypath = tmp_path / "y.txt" + Path.touch(xpath) + Path.touch(ypath) + + b = B(x=xpath, y=ypath) + + assert b.x == File(xpath) + assert b.y == File(ypath) + + # outputs = b() + # assert outputs.y == str(ypath) + + +def test_shell_bases_static(A, tmp_path): + @shell.define + class B(A): + + y: text.Plain = shell.arg() # Override the output arg in A + + class Outputs(shell.Outputs): + """ + Args: + out_file_size: size of the output directory + """ + + out_file_size: int = shell.out(callable=get_file_size) + + xpath = tmp_path / "x.txt" + ypath = tmp_path / "y.txt" + Path.touch(xpath) + ypath.write_text("Hello, World!") + + a = A(x=xpath, y=ypath) + assert a.x == File(xpath) + assert a.y == ypath + + b = B(x=xpath, y=str(ypath)) + assert b.x == File(xpath) + # We have overridden the type of y from an output arg with a path_template so it + # gets coerced to a text.Plain object + assert b.y == text.Plain(ypath) + + # outputs = b() + # assert outputs.y == str(ypath) + + +def test_shell_inputs_outputs_bases_dynamic(tmp_path): + A = shell.define( + "ls", + name="A", + inputs={ + "directory": shell.arg( + type=Directory, + help="input directory", + argstr="", + position=-1, + ) + }, + outputs={ + "entries": shell.out( + type=list, + help="list of entries returned by ls command", + callable=list_entries, + ) + }, + ) + B = shell.define( + "ls", + name="B", + inputs={ + "hidden": shell.arg( + type=bool, + argstr="-a", + help="show hidden files", + default=False, + ) + }, + bases=[A], + ) + + b = B(directory=tmp_path, hidden=True) + + assert b.directory == Directory(tmp_path) + assert b.hidden + + # File.sample(tmp_path, stem=".hidden-file") + # outputs = b() + # assert result.runner.cmdline == f"ls -a {tmp_path}" + # assert outputs.entries == [".", "..", ".hidden-file"] + + +def test_shell_inputs_outputs_bases_static(tmp_path): + @shell.define + class A(shell.Task["A.Outputs"]): + executable = "ls" + + directory: Directory = shell.arg(help="input directory", argstr="", position=-1) + + class Outputs(shell.Outputs): + entries: list = shell.out( + help="list of entries returned by ls command", + callable=list_entries, + ) + + @shell.define + class B(A): + hidden: bool = shell.arg( + help="show hidden files", + argstr="-a", + default=False, + ) + + Path.touch(tmp_path / ".hidden") + + b = B(directory=tmp_path, hidden=True) + + assert b.directory == Directory(tmp_path) + assert b.hidden + + # outputs = b() + # assert outputs.entries == [".", "..", ".hidden"] + + +def test_shell_missing_executable_static(): + with pytest.raises(AttributeError, match="must have an `executable` attribute"): + + @shell.define + class A: + directory: Directory = shell.arg( + help="input directory", argstr="", position=-1 + ) + + class Outputs: + entries: list = shell.out( + help="list of entries returned by ls command", + callable=list_entries, + ) + + +def test_shell_missing_executable_dynamic(): + with pytest.raises( + ValueError, + match=r"name \('A'\) can only be provided when creating a class dynamically", + ): + 
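+        # no executable is supplied here, so this dynamic definition is expected to fail with the error matched above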
shell.define( + name="A", + inputs={ + "directory": shell.arg( + type=Directory, + help="input directory", + argstr="", + position=-1, + ), + }, + outputs={ + "entries": shell.out( + type=list, + help="list of entries returned by ls command", + callable=list_entries, + ) + }, + ) + + +def test_shell_help1(): + + Shelly = shell.define( + "shelly --arg1 " + "--arg2 --opt-out " + ) + + assert task_help(Shelly) == [ + "----------------------------", + "Help for Shell task 'shelly'", + "----------------------------", + "", + "Inputs:", + "- executable: str | Sequence[str]; default = 'shelly'", + " the first part of the command, can be a string, e.g. 'ls', or a list, e.g.", + " ['ls', '-l', 'dirname']", + "- in_file: generic/file", + "- out_file: Path | bool; default = True", + ] + wrap_text(shell.outarg.PATH_TEMPLATE_HELP).split("\n") + [ + "- arg1: int ('--arg1')", + "- arg2: float | None; default = None ('--arg2')", + "- opt_out: Path | bool | None; default = None ('--opt-out')", + ] + wrap_text( + shell.outarg.OPTIONAL_PATH_TEMPLATE_HELP + ).split( + "\n" + ) + [ + "- append_args: list[str | generic/file]; default-factory = list()", + " Additional free-form arguments to append to the end of the command.", + "", + "Outputs:", + "- out_file: generic/file", + "- opt_out: generic/file | None; default = None", + "- return_code: int", + " " + shell.Outputs.RETURN_CODE_HELP, + "- stdout: str", + " " + shell.Outputs.STDOUT_HELP, + "- stderr: str", + " " + shell.Outputs.STDERR_HELP, + "", + ] + + +def list_entries(stdout): + return stdout.split("\n")[:-1] + + +def sorted_fields(interface): + fields = task_fields(interface) + length = len(fields) - 1 + + def pos_key(out: shell.out) -> int: + if out.name == "append_args": + return (length + 1, out.name) + try: + pos = out.position + except AttributeError: + return (length, out.name) + if pos < 0: + key = length + pos + else: + key = pos + return (key, out.name) + + return sorted(fields, key=pos_key) diff --git a/pydra/compose/shell/tests/test_shell_run.py b/pydra/compose/shell/tests/test_shell_run.py new file mode 100644 index 0000000000..39942be029 --- /dev/null +++ b/pydra/compose/shell/tests/test_shell_run.py @@ -0,0 +1,3594 @@ +import typing as ty +import os +import sys +import pytest +from pathlib import Path +import re +import stat +import attrs +from pydra.engine.submitter import Submitter +from pydra.compose import shell, workflow, python +from fileformats.generic import ( + File, + Directory, +) +from pydra.utils.typing import ( + MultiOutputFile, + MultiInputObj, + StateArray, +) +from pydra.engine.tests.utils import ( + run_no_submitter, + run_submitter, + no_win, + get_output_names, +) + +if sys.platform.startswith("win"): + pytest.skip("SLURM not available in windows", allow_module_level=True) + + +@pytest.mark.flaky(reruns=2) # when dask +@pytest.mark.parametrize("results_function", [run_no_submitter, run_submitter]) +def test_shell_cmd_1(worker, results_function, tmp_path): + """simple command, no arguments""" + cmd = ["pwd"] + shelly = shell.define(cmd)() + assert shelly.cmdline == " ".join(cmd) + + outputs = results_function(shelly, worker=worker, cache_root=tmp_path) + assert Path(outputs.stdout.rstrip()).parent == tmp_path + assert outputs.return_code == 0 + assert outputs.stderr == "" + + +@pytest.mark.parametrize("results_function", [run_no_submitter, run_submitter]) +def test_shell_cmd_1_strip(worker, results_function, tmp_path): + """simple command, no arguments + strip option to remove \n at the end os stdout + """ + cmd = 
["pwd"] + shelly = shell.define(cmd)() + + assert shelly.cmdline == " ".join(cmd) + + outputs = results_function(shelly, worker=worker, cache_root=tmp_path) + assert Path(outputs.stdout).parent == tmp_path + assert outputs.return_code == 0 + assert outputs.stderr == "" + + +@pytest.mark.parametrize("results_function", [run_no_submitter, run_submitter]) +def test_shell_cmd_2(worker, results_function, tmp_path): + """a command with arguments, cmd and args given as executable""" + cmd = ["echo", "hail", "pydra"] + shelly = shell.define(cmd)() + + assert shelly.cmdline == " ".join(cmd) + + outputs = results_function(shelly, worker=worker, cache_root=tmp_path) + assert outputs.stdout.strip() == " ".join(cmd[1:]) + assert outputs.return_code == 0 + assert outputs.stderr == "" + + +@pytest.mark.parametrize("results_function", [run_no_submitter, run_submitter]) +def test_shell_cmd_2a(worker, results_function, tmp_path): + """a command with arguments, using executable and args""" + cmd_exec = "echo" + cmd_args = ["hail", "pydra"] + # separate command into exec + args + shelly = shell.define(cmd_exec)(append_args=cmd_args) + + assert shelly.executable == "echo" + assert shelly.cmdline == "echo " + " ".join(cmd_args) + + outputs = results_function(shelly, worker=worker, cache_root=tmp_path) + assert outputs.stdout.strip() == " ".join(cmd_args) + assert outputs.return_code == 0 + assert outputs.stderr == "" + + +@pytest.mark.parametrize("results_function", [run_no_submitter, run_submitter]) +def test_shell_cmd_2b(worker, results_function, tmp_path): + """a command with arguments, using strings executable and args""" + cmd_exec = "echo" + cmd_args = ["pydra"] + # separate command into exec + args + shelly = shell.define(cmd_exec)(append_args=cmd_args) + + assert shelly.executable == "echo" + assert shelly.cmdline == "echo pydra" + + outputs = results_function(shelly, worker=worker, cache_root=tmp_path) + assert outputs.stdout == "pydra\n" + assert outputs.return_code == 0 + assert outputs.stderr == "" + + +# tests with State + + +@pytest.mark.flaky(reruns=2) +def test_shell_cmd_3(worker, tmp_path): + """commands without arguments + splitter = executable + """ + cmd = ["pwd", "whoami"] + + # all args given as executable + shelly = shell.define("shelly")().split(executable=cmd) + + # assert shelly.cmdline == ["pwd", "whoami"] + outputs = shelly(worker=worker, cache_root=tmp_path) + assert Path(outputs.stdout[0].rstrip()).parent == tmp_path + + if "USER" in os.environ: + assert outputs.stdout[1] == f"{os.environ['USER']}\n" + else: + assert outputs.stdout[1] + assert outputs.return_code[0] == outputs.return_code[1] == 0 + assert outputs.stderr[0] == outputs.stderr[1] == "" + + +def test_shell_cmd_4(worker, tmp_path): + """a command with arguments, using executable and args + splitter=args + """ + cmd_exec = "echo" + cmd_args = [["nipype"], ["pydra"]] + # separate command into exec + args + shelly = shell.define(cmd_exec)().split(append_args=cmd_args) + + assert shelly.executable == "echo" + assert shelly.append_args == StateArray([["nipype"], ["pydra"]]) + # assert shelly.cmdline == ["echo nipype", "echo pydra"] + outputs = shelly(worker=worker) + + assert outputs.stdout[0] == "nipype\n" + assert outputs.stdout[1] == "pydra\n" + + assert outputs.return_code[0] == outputs.return_code[1] == 0 + assert outputs.stderr[0] == outputs.stderr[1] == "" + + +def test_shell_cmd_5(worker, tmp_path): + """a command with arguments + using splitter and combiner for args + """ + cmd_exec = "echo" + cmd_args = 
[["nipype"], ["pydra"]] + # separate command into exec + args + shelly = shell.define(cmd_exec)().split(append_args=cmd_args).combine("append_args") + + assert shelly.executable == "echo" + assert shelly.append_args == StateArray([["nipype"], ["pydra"]]) + # assert shelly.cmdline == ["echo nipype", "echo pydra"] + outputs = shelly(worker=worker) + + assert outputs.stdout[0] == "nipype\n" + assert outputs.stdout[1] == "pydra\n" + + +def test_shell_cmd_6(worker, tmp_path): + """a command with arguments, + outer splitter for executable and args + """ + cmd_exec = ["echo", ["echo", "-n"]] + cmd_args = [["nipype"], ["pydra"]] + # separate command into exec + args + shelly = shell.define("shelly")().split( + ["executable", "append_args"], executable=cmd_exec, append_args=cmd_args + ) + + assert shelly.executable == ["echo", ["echo", "-n"]] + assert shelly.append_args == StateArray([["nipype"], ["pydra"]]) + outputs = shelly(cache_root=tmp_path, worker=worker) + + assert outputs.stdout == ["nipype\n", "pydra\n", "nipype", "pydra"] + + assert ( + outputs.return_code[0] + == outputs.return_code[1] + == outputs.return_code[2] + == outputs.return_code[3] + == 0 + ) + assert ( + outputs.stderr[0] + == outputs.stderr[1] + == outputs.stderr[2] + == outputs.stderr[3] + == "" + ) + + +def test_shell_cmd_7(worker, tmp_path): + """a command with arguments, + outer splitter for executable and args, and combiner=args + """ + cmd_exec = ["echo", ["echo", "-n"]] + cmd_args = [["nipype"], ["pydra"]] + # separate command into exec + args + shelly = ( + shell.define("shelly")() + .split( + ["executable", "append_args"], + executable=cmd_exec, + append_args=cmd_args, + ) + .combine("append_args") + ) + + assert shelly.executable == ["echo", ["echo", "-n"]] + assert shelly.append_args == StateArray([["nipype"], ["pydra"]]) + + outputs = shelly(worker=worker) + + assert outputs.stdout == [["nipype\n", "pydra\n"], ["nipype", "pydra"]] + + +# tests with workflows + + +def test_wf_shell_cmd_1(worker, tmp_path): + """a workflow with two connected commands""" + + @workflow.define + def Workflow(cmd1, cmd2): + shelly_pwd = workflow.add(shell.define(cmd1)()) + + @python.define + def StripAndListify(x: str) -> list[str]: + return [x.strip()] + + listify = workflow.add(StripAndListify(x=shelly_pwd.stdout)) + shelly_ls = workflow.add(shell.define(cmd2)(append_args=listify.out)) + return shelly_ls.stdout + + wf = Workflow(cmd1="pwd", cmd2="ls") + + with Submitter(worker=worker, cache_root=tmp_path) as sub: + res = sub(wf) + + assert "_result.pklz" in res.outputs.out + assert "_job.pklz" in res.outputs.out + + +# customised input task + + +@pytest.mark.parametrize("results_function", [run_no_submitter, run_submitter]) +def test_shell_cmd_inputspec_1(worker, results_function, tmp_path): + """a command with executable, args and one command opt, + using a customized input_spec to add the opt to the command + in the right place that is specified in metadata["cmd_pos"] + """ + cmd_exec = "echo" + cmd_opt = True + cmd_args = ["hello from pydra"] + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + executable = cmd_exec + opt_n: bool = shell.arg( + position=1, + argstr="-n", + help="option", + ) + + class Outputs(shell.Outputs): + pass + + # separate command into exec + args + shelly = Shelly(append_args=cmd_args, opt_n=cmd_opt) + assert shelly.executable == cmd_exec + assert shelly.append_args == cmd_args + assert shelly.cmdline == "echo -n 'hello from pydra'" + + outputs = results_function(shelly, worker=worker, 
cache_root=tmp_path) + assert outputs.stdout == "hello from pydra" + + +@pytest.mark.parametrize("results_function", [run_no_submitter, run_submitter]) +def test_shell_cmd_inputspec_2(worker, results_function, tmp_path): + """a command with executable, args and two command options, + using a customized input_spec to add the opt to the command + in the right place that is specified in metadata["cmd_pos"] + """ + cmd_exec = "echo" + cmd_opt = True + cmd_opt_hello = "HELLO" + cmd_args = ["from pydra"] + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + executable = cmd_exec + opt_hello: str = shell.arg( + position=3, + help="todo", + argstr="", + ) + opt_n: bool = shell.arg( + position=1, + help="todo", + argstr="-n", + ) + + class Outputs(shell.Outputs): + pass + + # separate command into exec + args + shelly = Shelly(append_args=cmd_args, opt_n=cmd_opt, opt_hello=cmd_opt_hello) + assert shelly.executable == cmd_exec + assert shelly.append_args == cmd_args + assert shelly.cmdline == "echo -n HELLO 'from pydra'" + outputs = results_function(shelly, worker=worker, cache_root=tmp_path) + assert outputs.stdout == "HELLO from pydra" + + +@pytest.mark.parametrize("results_function", [run_no_submitter, run_submitter]) +def test_shell_cmd_inputspec_3(worker, results_function, tmp_path): + """mandatory field added to fields, value provided""" + cmd_exec = "echo" + hello = "HELLO" + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + executable = cmd_exec + text: str = shell.arg( + position=1, + help="text", + argstr="", + ) + + class Outputs(shell.Outputs): + pass + + # separate command into exec + args + shelly = Shelly(text=hello) + assert shelly.executable == cmd_exec + assert shelly.cmdline == "echo HELLO" + outputs = results_function(shelly, worker=worker, cache_root=tmp_path) + assert outputs.stdout == "HELLO\n" + + +@pytest.mark.parametrize("results_function", [run_no_submitter, run_submitter]) +def test_shell_cmd_inputspec_3a(worker, results_function, tmp_path): + """mandatory field added to fields, value provided + using shorter syntax for input (no attr.ib) + """ + cmd_exec = "echo" + hello = "HELLO" + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + executable = cmd_exec + text: str = shell.arg(position=1, help="text", argstr="") + + class Outputs(shell.Outputs): + pass + + # separate command into exec + args + shelly = Shelly(text=hello) + assert shelly.executable == cmd_exec + assert shelly.cmdline == "echo HELLO" + outputs = results_function(shelly, worker=worker, cache_root=tmp_path) + assert outputs.stdout == "HELLO\n" + + +@pytest.mark.parametrize("results_function", [run_no_submitter, run_submitter]) +def test_shell_cmd_inputspec_3b(worker, results_function, tmp_path): + """mandatory field added to fields, value provided after init""" + cmd_exec = "echo" + hello = "HELLO" + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + executable = cmd_exec + text: str = shell.arg( + position=1, + help="text", + argstr="", + ) + + class Outputs(shell.Outputs): + pass + + # separate command into exec + args + shelly = Shelly(executable=cmd_exec) + shelly.text = hello + + assert shelly.executable == cmd_exec + assert shelly.cmdline == "echo HELLO" + outputs = results_function(shelly, worker=worker, cache_root=tmp_path) + assert outputs.stdout == "HELLO\n" + + +def test_shell_cmd_inputspec_3c_exception(worker, tmp_path): + """mandatory field added to fields, value is not provided, so exception is raised""" + cmd_exec = "echo" + + @shell.define 
+ class Shelly(shell.Task["Shelly.Outputs"]): + executable = cmd_exec + text: str = shell.arg( + position=1, + help="text", + argstr="", + ) + + class Outputs(shell.Outputs): + pass + + shelly = Shelly(executable=cmd_exec) + + with pytest.raises(ValueError, match="Mandatory field 'text' is not set"): + shelly(cache_root=tmp_path) + + +@pytest.mark.parametrize("results_function", [run_no_submitter, run_submitter]) +def test_shell_cmd_inputspec_3c(worker, results_function, tmp_path): + """mandatory=False, so tasks runs fine even without the value""" + cmd_exec = "echo" + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + executable = cmd_exec + text: ty.Optional[str] = shell.arg( + default=None, + position=1, + help="text", + argstr="", + ) + + class Outputs(shell.Outputs): + pass + + # separate command into exec + args + shelly = Shelly(executable=cmd_exec) + + assert shelly.executable == cmd_exec + assert shelly.cmdline == "echo" + outputs = results_function(shelly, worker=worker, cache_root=tmp_path) + assert outputs.stdout == "\n" + + +@pytest.mark.parametrize("results_function", [run_no_submitter, run_submitter]) +def test_shell_cmd_inputspec_4(worker, results_function, tmp_path): + """mandatory field added to fields, value provided""" + cmd_exec = "echo" + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + executable = cmd_exec + text: str = shell.arg( + default="Hello", + position=1, + help="text", + argstr="", + ) + + class Outputs(shell.Outputs): + pass + + # separate command into exec + args + shelly = Shelly(executable=cmd_exec) + + assert shelly.executable == cmd_exec + assert shelly.cmdline == "echo Hello" + + outputs = results_function(shelly, worker=worker, cache_root=tmp_path) + assert outputs.stdout == "Hello\n" + + +@pytest.mark.parametrize("results_function", [run_no_submitter, run_submitter]) +def test_shell_cmd_inputspec_4a(worker, results_function, tmp_path): + """mandatory field added to fields, value provided + using shorter syntax for input (no attr.ib) + """ + cmd_exec = "echo" + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + executable = cmd_exec + text: str = shell.arg(default="Hello", position=1, help="text", argstr="") + + class Outputs(shell.Outputs): + pass + + # separate command into exec + args + shelly = Shelly(executable=cmd_exec) + + assert shelly.executable == cmd_exec + assert shelly.cmdline == "echo Hello" + + outputs = results_function(shelly, worker=worker, cache_root=tmp_path) + assert outputs.stdout == "Hello\n" + + +@pytest.mark.parametrize("results_function", [run_no_submitter, run_submitter]) +def test_shell_cmd_inputspec_4b(worker, results_function, tmp_path): + """mandatory field added to fields, value provided""" + cmd_exec = "echo" + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + executable = cmd_exec + text: str = shell.arg( + default="Hi", + position=1, + help="text", + argstr="", + ) + + class Outputs(shell.Outputs): + pass + + # separate command into exec + args + shelly = Shelly(executable=cmd_exec) + + assert shelly.executable == cmd_exec + assert shelly.cmdline == "echo Hi" + + outputs = results_function(shelly, worker=worker, cache_root=tmp_path) + assert outputs.stdout == "Hi\n" + + +def test_shell_cmd_inputspec_4d_exception(worker): + """mandatory field added to fields, value provided""" + cmd_exec = "echo" + + # separate command into exec + args + with pytest.raises( + ValueError, + match=r"path_template \('exception'\) can only be provided when there is no default", + ): 
+ + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + executable = cmd_exec + text: File = shell.outarg( + default="Hello", + position=1, + help="text", + path_template="exception", + argstr="", + ) + + class Outputs(shell.Outputs): + pass + + +@pytest.mark.parametrize("results_function", [run_no_submitter, run_submitter]) +def test_shell_cmd_inputspec_5_nosubm(worker, results_function, tmp_path): + """checking xor in metadata: task should work fine, since only one option is True""" + cmd_exec = "ls" + cmd_t = True + + @shell.define(xor=["opt_S", "opt_t"]) + class Shelly(shell.Task["Shelly.Outputs"]): + executable = cmd_exec + opt_t: bool = shell.arg( + position=1, + help="opt t", + argstr="-t", + ) + opt_S: bool = shell.arg( + default=False, + position=2, + help="opt S", + argstr="-S", + ) + + class Outputs(shell.Outputs): + pass + + # separate command into exec + args + shelly = Shelly(opt_t=cmd_t) + assert shelly.executable == cmd_exec + assert shelly.cmdline == "ls -t" + results_function(shelly, worker=worker, cache_root=tmp_path) + + +def test_shell_cmd_inputspec_5a_exception(worker, tmp_path): + """checking xor in metadata: both options are True, so the task raises exception""" + cmd_exec = "ls" + cmd_t = True + cmd_S = True + + @shell.define(xor=["opt_S", "opt_t"]) + class Shelly(shell.Task["Shelly.Outputs"]): + executable = cmd_exec + opt_t: bool = shell.arg( + position=1, + help="opt t", + argstr="-t", + ) + opt_S: bool = shell.arg( + position=2, + help="opt S", + argstr="-S", + ) + + class Outputs(shell.Outputs): + pass + + shelly = Shelly(opt_t=cmd_t, opt_S=cmd_S) + with pytest.raises(ValueError, match="Mutually exclusive fields"): + shelly(cache_root=tmp_path) + + +@pytest.mark.parametrize("results_function", [run_no_submitter, run_submitter]) +def test_shell_cmd_inputspec_6(worker, results_function, tmp_path): + """checking requires in metadata: + the required field is set in the init, so the task works fine + """ + cmd_exec = "ls" + cmd_l = True + cmd_t = True + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + executable = cmd_exec + opt_t: bool = shell.arg( + position=2, + help="opt t", + argstr="-t", + requires=["opt_l"], + ) + opt_l: bool = shell.arg( + position=1, + help="opt l", + argstr="-l", + ) + + class Outputs(shell.Outputs): + pass + + # separate command into exec + args + shelly = Shelly(opt_t=cmd_t, opt_l=cmd_l) + assert shelly.executable == cmd_exec + assert shelly.cmdline == "ls -l -t" + results_function(shelly, worker=worker, cache_root=tmp_path) + + +def test_shell_cmd_inputspec_6a_exception(worker, tmp_path): + """checking requires in metadata: + the required field is None, so the task works raises exception + """ + cmd_exec = "ls" + cmd_t = True + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + executable = cmd_exec + opt_t: bool = shell.arg( + position=2, + help="opt t", + argstr="-t", + requires=["opt_l"], + ) + opt_l: bool = shell.arg( + default=False, + position=1, + help="opt l", + argstr="-l", + ) + + class Outputs(shell.Outputs): + pass + + shelly = Shelly(executable=cmd_exec, opt_t=cmd_t) + + with pytest.raises(ValueError, match=r"'opt_t' requires \['opt_l'\]"): + shelly() + + +@pytest.mark.parametrize("results_function", [run_no_submitter, run_submitter]) +def test_shell_cmd_inputspec_6b(worker, results_function, tmp_path): + """checking requires in metadata: + the required field set after the init + """ + cmd_exec = "ls" + cmd_l = True + cmd_t = True + + @shell.define + class 
Shelly(shell.Task["Shelly.Outputs"]): + executable = cmd_exec + opt_t: bool = shell.arg( + position=2, + help="opt t", + argstr="-t", + requires=["opt_l"], + ) + opt_l: bool = shell.arg( + position=1, + help="opt l", + argstr="-l", + ) + + class Outputs(shell.Outputs): + pass + + # separate command into exec + args + shelly = Shelly( + opt_t=cmd_t + # opt_l=cmd_l, + ) + shelly.opt_l = cmd_l + assert shelly.executable == cmd_exec + assert shelly.cmdline == "ls -l -t" + results_function(shelly, worker=worker, cache_root=tmp_path) + + +@pytest.mark.parametrize("results_function", [run_no_submitter, run_submitter]) +def test_shell_cmd_inputspec_7(worker, results_function, tmp_path): + """ + providing output name using input_spec, + using name_tamplate in metadata + """ + cmd = "touch" + arg = tmp_path / "newfile_tmp.txt" + cache_root = tmp_path / "cache" + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + executable = cmd + + arg: str = shell.arg(argstr=None) + + class Outputs(shell.Outputs): + out1: File = shell.outarg( + path_template="{arg}", + help="output file", + ) + + shelly = Shelly(executable=cmd, arg=arg) + + outputs = results_function(shelly, worker=worker, cache_root=cache_root) + assert outputs.stdout == "" + out1 = outputs.out1.fspath + assert out1.exists() + # checking if the file is created in a good place + assert out1.parent.parent == cache_root + assert out1.name == "newfile_tmp.txt" + + +@pytest.mark.parametrize("results_function", [run_no_submitter, run_submitter]) +def test_shell_cmd_inputspec_7b(worker, results_function, tmp_path): + """ + providing new file and output name using input_spec, + using name_template in metadata + """ + cmd = "touch" + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + executable = cmd + newfile: str = shell.arg( + position=1, + help="new file", + argstr="", + ) + + class Outputs(shell.Outputs): + out1: File = shell.outarg( + path_template="{newfile}", + help="output file", + ) + + shelly = Shelly(executable=cmd, newfile=File.mock("newfile_tmp.txt")) + + outputs = results_function(shelly, worker=worker, cache_root=tmp_path) + assert outputs.stdout == "" + assert outputs.out1.fspath.exists() + + +@pytest.mark.parametrize("results_function", [run_no_submitter, run_submitter]) +def test_shell_cmd_inputspec_7c(worker, results_function, tmp_path): + """ + providing output name using input_spec, + using name_tamplate with txt extension (extension from args should be removed + """ + cmd = "touch" + arg = File.mock("newfile_tmp.txt") + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + executable = cmd + + arg = shell.arg(argstr=None) + + class Outputs(shell.Outputs): + out1: File = shell.outarg( + path_template="{arg}.txt", + help="output file", + ) + + shelly = Shelly(executable=cmd, arg=arg) + + outputs = results_function(shelly, worker=worker, cache_root=tmp_path) + assert outputs.stdout == "" + # checking if the file is created in a good place + assert outputs.out1.fspath.parent.parent == tmp_path + assert outputs.out1.fspath.name == "newfile_tmp.txt" + + +@pytest.mark.parametrize("results_function", [run_no_submitter, run_submitter]) +def test_shell_cmd_inputspec_8(worker, results_function, tmp_path): + """ + providing new file and output name using input_spec, + adding additional string input field with argstr + """ + cmd = "touch" + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + executable = cmd + newfile: str = shell.arg( + position=2, + help="new file", + argstr="", + ) + time: str = 
shell.arg( + position=1, + argstr="-t", + help="time of modif.", + ) + + class Outputs(shell.Outputs): + out1: File = shell.outarg( + path_template="{newfile}", + help="output file", + ) + + shelly = Shelly( + executable=cmd, + newfile=tmp_path / "newfile_tmp.txt", + time="02121010", + ) + + outputs = results_function(shelly, worker=worker, cache_root=tmp_path) + assert outputs.stdout == "" + assert outputs.out1.fspath.exists() + + +@pytest.mark.parametrize("results_function", [run_no_submitter, run_submitter]) +def test_shell_cmd_inputspec_8a(worker, results_function, tmp_path): + """ + providing new file and output name using input_spec, + adding additional string input field with argstr (argstr uses string formatting) + """ + cmd = "touch" + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + executable = cmd + newfile: str = shell.arg( + position=2, + help="new file", + argstr="", + ) + time: str = shell.arg( + position=1, + argstr="-t {time}", + help="time of modif.", + ) + + class Outputs(shell.Outputs): + out1: File = shell.outarg( + path_template="{newfile}", + help="output file", + ) + + shelly = Shelly( + executable=cmd, + newfile=tmp_path / "newfile_tmp.txt", + time="02121010", + ) + + outputs = results_function(shelly, worker=worker, cache_root=tmp_path) + assert outputs.stdout == "" + assert outputs.out1.fspath.exists() + + +@pytest.mark.parametrize("results_function", [run_no_submitter, run_submitter]) +def test_shell_cmd_inputspec_9(tmp_path, worker, results_function): + """ + providing output name using input_spec (path_template in metadata), + the template has a suffix, the extension of the file will be moved to the end + """ + cmd = "cp" + ddir = tmp_path / "data_inp" + ddir.mkdir() + file = ddir / ("file.txt") + file.write_text("content\n") + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + executable = cmd + file_orig: File = shell.arg( + position=1, + help="new file", + argstr="", + ) + + class Outputs(shell.Outputs): + file_copy: File = shell.outarg( + path_template="{file_orig}_copy", + help="output file", + argstr="", + ) + + shelly = Shelly( + executable=cmd, + file_orig=file, + ) + + outputs = results_function(shelly, worker=worker, cache_root=tmp_path) + assert outputs.stdout == "" + assert outputs.file_copy.fspath.exists() + assert outputs.file_copy.fspath.name == "file_copy.txt" + # checking if it's created in a good place + assert outputs.file_copy.fspath.parent.parent == tmp_path + + +@pytest.mark.parametrize("results_function", [run_no_submitter]) +def test_shell_cmd_inputspec_9a(tmp_path, worker, results_function): + """ + providing output name using input_spec (path_template in metadata), + the template has a suffix, the extension of the file will be moved to the end + the change: input file has directory with a dot + """ + cmd = "cp" + file = tmp_path / "data.inp" / "file.txt" + file.parent.mkdir() + file.write_text("content\n") + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + executable = cmd + file_orig: File = shell.arg( + position=1, + help="new file", + argstr="", + ) + + class Outputs(shell.Outputs): + file_copy: File = shell.outarg( + path_template="{file_orig}_copy", + help="output file", + argstr="", + ) + + shelly = Shelly(executable=cmd, file_orig=file) + + outputs = results_function(shelly, worker=worker, cache_root=tmp_path) + assert outputs.stdout == "" + assert outputs.file_copy.fspath.exists() + assert outputs.file_copy.fspath.name == "file_copy.txt" + # checking if it's created in a good place + 
assert outputs.file_copy.fspath.parent.parent == tmp_path + + +@pytest.mark.parametrize("results_function", [run_no_submitter, run_submitter]) +def test_shell_cmd_inputspec_9b(tmp_path, worker, results_function): + """ + providing output name using input_spec (path_template in metadata) + and the keep_extension is set to False, so the extension is removed completely. + """ + cmd = "cp" + file = tmp_path / "file.txt" + file.write_text("content\n") + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + executable = cmd + file_orig: File = shell.arg( + position=1, + help="new file", + argstr="", + ) + + class Outputs(shell.Outputs): + file_copy: File = shell.outarg( + path_template="{file_orig}_copy", + keep_extension=False, + help="output file", + argstr="", + ) + + shelly = Shelly( + executable=cmd, + file_orig=file, + ) + + outputs = results_function(shelly, worker=worker, cache_root=tmp_path) + assert outputs.stdout == "" + assert outputs.file_copy.fspath.exists() + assert outputs.file_copy.fspath.name == "file_copy" + + +@pytest.mark.parametrize("results_function", [run_no_submitter, run_submitter]) +def test_shell_cmd_inputspec_9c(tmp_path, worker, results_function): + """ + providing output name using input_spec (path_template in metadata) + and the keep_extension is set to False, so the extension is removed completely, + no suffix in the template. + """ + cmd = "cp" + file = tmp_path / "file.txt" + file.write_text("content\n") + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + executable = cmd + file_orig: File = shell.arg( + position=1, + help="new file", + argstr="", + ) + + class Outputs(shell.Outputs): + file_copy: File = shell.outarg( + path_template="{file_orig}", + keep_extension=False, + help="output file", + argstr="", + ) + + shelly = Shelly(executable=cmd, file_orig=file) + + outputs = results_function(shelly, worker=worker, cache_root=tmp_path) + assert outputs.stdout == "" + assert outputs.file_copy.fspath.exists() + assert outputs.file_copy.fspath.name == "file" + assert outputs.file_copy.fspath.parent.parent == tmp_path + + +@pytest.mark.parametrize("results_function", [run_no_submitter, run_submitter]) +def test_shell_cmd_inputspec_9d(tmp_path, worker, results_function): + """ + providing output name explicitly by manually setting value in input_spec + (instead of using default provided bypath_template in metadata) + """ + cmd = "cp" + ddir = tmp_path / "data_inp" + ddir.mkdir() + file = ddir / ("file.txt") + file.write_text("content\n") + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + executable = cmd + file_orig: File = shell.arg( + position=1, + help="new file", + argstr="", + ) + + class Outputs(shell.Outputs): + file_copy: File = shell.outarg( + path_template="{file_orig}_copy", + help="output file", + argstr="", + ) + + shelly = Shelly( + executable=cmd, + file_orig=file, + file_copy="my_file_copy.txt", + ) + + outputs = results_function(shelly, worker=worker, cache_root=tmp_path) + assert outputs.stdout == "" + assert outputs.file_copy.fspath.exists() + assert outputs.file_copy.fspath.name == "my_file_copy.txt" + # checking if it's created in a good place + assert outputs.file_copy.fspath.parent.parent == tmp_path + + +@pytest.mark.parametrize("results_function", [run_no_submitter, run_submitter]) +def test_shell_cmd_inputspec_10(worker, results_function, tmp_path): + """using input_spec, providing list of files as an input""" + + file_1 = tmp_path / "file_1.txt" + file_2 = tmp_path / "file_2.txt" + with open(file_1, 
"w") as f: + f.write("hello ") + with open(file_2, "w") as f: + f.write("from boston") + + cmd_exec = "cat" + files_list = [file_1, file_2] + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + executable = cmd_exec + files: ty.List[File] = shell.arg( + position=1, + argstr="...", + sep=" ", + help="list of files", + ) + + class Outputs(shell.Outputs): + pass + + shelly = Shelly( + files=files_list, + ) + + assert shelly.executable == cmd_exec + outputs = results_function(shelly, worker=worker, cache_root=tmp_path) + assert outputs.stdout == "hello from boston" + + +def test_shell_cmd_inputspec_10_err(tmp_path): + """checking if the proper error is raised when broken symlink is provided + as a input field with File as a type + """ + + file_1 = tmp_path / "file_1.txt" + with open(file_1, "w") as f: + f.write("hello") + file_2 = tmp_path / "file_2.txt" + + # creating symlink and removing the original file + os.symlink(file_1, file_2) + os.remove(file_1) + + cmd_exec = "cat" + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + executable = cmd_exec + files: File = shell.arg( + position=1, + argstr="", + help="a file", + ) + + class Outputs(shell.Outputs): + pass + + with pytest.raises(FileNotFoundError): + Shelly(executable=cmd_exec, files=file_2) + + +def test_shell_cmd_inputspec_11(tmp_path): + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + inputFiles: MultiInputObj[str] = shell.arg( + argstr="...", + help="The list of input image files to be segmented.", + ) + + executable = "touch" + + class Outputs(shell.Outputs): + outputFiles: MultiOutputFile = shell.outarg( + help="""Corrected Output Images: should specify the same number of + images as inputVolume, if only one element is given, then it is used as + a file pattern where %s is replaced by the imageVolumeType, + and %d by the index list location.""", + path_template="{inputFiles}", + ) + + @workflow.define + def Workflow(inputFiles): + + echoMultiple = workflow.add(Shelly(inputFiles=inputFiles)) + return echoMultiple.outputFiles + + wf = Workflow( + inputFiles=[File.mock(tmp_path / "test1"), File.mock(tmp_path / "test2")] + ) + + # XXX: Figure out why this fails with "cf". Occurs in CI when using Ubuntu + Python >= 3.10 + # (but not when using macOS + Python >= 3.10). 
Same error occurs in test_shell_cmd_outputspec_7a + # see https://github.com/nipype/pydra/issues/671 + with Submitter(worker="debug") as sub: + result = sub(wf) + + for out_file in result.outputs.out: + assert out_file.fspath.name == "test1" or out_file.fspath.name == "test2" + + +@pytest.mark.parametrize("results_function", [run_no_submitter, run_submitter]) +def test_shell_cmd_inputspec_12(tmp_path: Path, worker, results_function): + """ + providing output name using input_spec + path_template is provided as a function that returns + various templates depending on the values of inputs fields + """ + cmd = "cp" + ddir = tmp_path / "data_inp" + ddir.mkdir() + file = ddir / "file.txt" + file.write_text("content\n") + + def template_function(inputs): + if inputs.number % 2 == 0: + return "{file_orig}_even" + else: + return "{file_orig}_odd" + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + executable = cmd + file_orig: File = shell.arg( + position=2, + help="new file", + argstr="", + ) + number: int = shell.arg( + help="a number", + argstr=None, + ) + + class Outputs(shell.Outputs): + file_copy: File = shell.outarg( + path_template=template_function, + help="output file", + argstr="", + ) + + shelly = Shelly( + executable=cmd, + file_orig=file, + number=2, + ) + + outputs = results_function(shelly, worker=worker, cache_root=tmp_path) + assert outputs.stdout == "" + fspath = outputs.file_copy.fspath + assert fspath.exists() + assert fspath.name == "file_even.txt" + # checking if it's created in a good place + assert fspath.parent.parent == tmp_path + + +def test_shell_cmd_inputspec_with_iterable(): + """Test formatting of argstr with different iterable types.""" + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + executable = "test" + iterable_1: list[int] = shell.arg( + help="iterable input 1", + argstr="--in1", + sep=" ", + ) + iterable_2: set[str] = shell.arg( + help="iterable input 2", + argstr="--in2", + sep=" ", + ) + iterable_3: tuple[float, ...] 
= shell.arg( + help="iterable input 3", + argstr="--in3...", + ) + + class Outputs(shell.Outputs): + pass + + task = Shelly() + + for iterable_type in (list, tuple): + task.iterable_1 = iterable_type(range(3)) + task.iterable_2 = iterable_type(["foo"]) + task.iterable_3 = iterable_type([1, 0]) + assert task.cmdline == "test --in1 0 1 2 --in2 foo --in3 1.0 --in3 0.0" + + +@pytest.mark.parametrize("results_function", [run_no_submitter, run_submitter]) +def test_shell_cmd_inputspec_copyfile_1(worker, results_function, tmp_path): + """shelltask changes a file in place, + adding copy_mode="copy" to the file-input from input_spec + hardlink or copy in the cache_dir should be created + """ + file = tmp_path / "file_pydra.txt" + with open(file, "w") as f: + f.write("hello from pydra\n") + + cmd = ["sed", "-is", "s/hello/hi/"] + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + executable = cmd + orig_file: File = shell.arg( + help="orig file", + copy_mode="copy", + ) + + class Outputs(shell.Outputs): + out_file: File = shell.out( + help="output file", + callable=lambda orig_file: orig_file, + ) + + shelly = Shelly(executable=cmd, orig_file=str(file)) + + outputs = results_function(shelly, worker=worker, cache_root=tmp_path) + assert outputs.stdout == "" + assert outputs.out_file.fspath.exists() + # the file is copied, and than it is changed in place + assert outputs.out_file.fspath.parent.parent == tmp_path + with open(outputs.out_file) as f: + assert "hi from pydra\n" == f.read() + # the original file is unchanged + with open(file) as f: + assert "hello from pydra\n" == f.read() + + +@pytest.mark.parametrize("results_function", [run_no_submitter, run_submitter]) +def test_shell_cmd_inputspec_copyfile_1a(worker, results_function, tmp_path): + """shelltask changes a file in place, + adding copyfile=False to the File-input from input_spec + hardlink or softlink in the cache_dir is created + """ + file = tmp_path / "file_pydra.txt" + with open(file, "w") as f: + f.write("hello from pydra\n") + + cmd = ["sed", "-is", "s/hello/hi/"] + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + executable = cmd + orig_file: File = shell.arg( + position=1, + argstr="", + help="orig file", + copy_mode="hardlink", + ) + + class Outputs(shell.Outputs): + out_file: File = shell.out( + callable=lambda orig_file: orig_file, + help="output file", + ) + + shelly = Shelly(executable=cmd, orig_file=str(file)) + + outputs = results_function(shelly, worker=worker, cache_root=tmp_path) + assert outputs.stdout == "" + assert outputs.out_file.fspath.exists() + # the file is uses a soft link, but it creates and an extra copy before modifying + assert outputs.out_file.fspath.parent.parent == tmp_path + + assert outputs.out_file.fspath.parent.joinpath( + outputs.out_file.fspath.name + "s" + ).exists() + with open(outputs.out_file) as f: + assert "hi from pydra\n" == f.read() + # the file is uses a soft link, but it creates and an extra copy + # it might depend on the OS + linked_file_copy = outputs.out_file.fspath.parent.joinpath( + outputs.out_file.fspath.name + "s" + ) + if linked_file_copy.exists(): + with open(linked_file_copy) as f: + assert "hello from pydra\n" == f.read() + + # the original file is unchanged + with open(file) as f: + assert "hello from pydra\n" == f.read() + + +@pytest.mark.parametrize("results_function", [run_no_submitter, run_submitter]) +def test_shell_cmd_inputspec_state_1(worker, results_function, tmp_path): + """adding state to the input from input_spec""" + cmd_exec = 
"echo" + hello = ["HELLO", "hi"] + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + executable = cmd_exec + text: str = shell.arg( + position=1, + help="text", + argstr="", + ) + + class Outputs(shell.Outputs): + pass + + # separate command into exec + args + shelly = Shelly().split("text", text=hello) + assert shelly.executable == cmd_exec + # todo: this doesn't work when state + # assert shelly.cmdline == "echo HELLO" + outputs = results_function(shelly, worker=worker, cache_root=tmp_path) + assert outputs.stdout[0] == "HELLO\n" + assert outputs.stdout[1] == "hi\n" + + +def test_shell_cmd_inputspec_typeval_1(tmp_path): + """customized input_spec with a type that doesn't match the value + - raise an exception + """ + cmd_exec = "echo" + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + executable = cmd_exec + text: int = shell.arg( + position=1, + argstr="", + help="text", + ) + + class Outputs(shell.Outputs): + pass + + with pytest.raises(TypeError): + Shelly(text="hello") + + +def test_shell_cmd_inputspec_typeval_2(tmp_path): + """customized input_spec (shorter syntax) with a type that doesn't match the value + - raise an exception + """ + cmd_exec = "echo" + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + executable = cmd_exec + + text: int = shell.arg(position=1, argstr="", help="text") + + class Outputs(shell.Outputs): + pass + + with pytest.raises(TypeError): + Shelly(text="hello") + + +@pytest.mark.parametrize("results_function", [run_no_submitter, run_submitter]) +def test_shell_cmd_inputspec_state_1a(worker, results_function, tmp_path): + """adding state to the input from input_spec + using shorter syntax for input_spec (without default) + """ + cmd_exec = "echo" + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + executable = cmd_exec + text: str = shell.arg( + position=1, + help="text", + argstr="", + ) + + class Outputs(shell.Outputs): + pass + + # separate command into exec + args + shelly = Shelly().split(text=["HELLO", "hi"]) + assert shelly.executable == cmd_exec + + outputs = results_function(shelly, worker=worker, cache_root=tmp_path) + assert outputs.stdout[0] == "HELLO\n" + assert outputs.stdout[1] == "hi\n" + + +@pytest.mark.parametrize("results_function", [run_no_submitter, run_submitter]) +def test_shell_cmd_inputspec_state_2(worker, results_function, tmp_path): + """ + adding splitter to input that is used in the output_file_tamplate + """ + cmd = "touch" + args = ["newfile_1.txt", "newfile_2.txt"] + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + executable = cmd + + arg: str = shell.arg(argstr=None) + + class Outputs(shell.Outputs): + out1: File = shell.outarg( + path_template="{arg}", + help="output file", + ) + + shelly = Shelly(executable=cmd).split(arg=args) + + outputs = results_function(shelly, worker=worker, cache_root=tmp_path) + for i in range(len(args)): + assert outputs.stdout[i] == "" + assert outputs.out1[i].fspath.exists() + assert outputs.out1[i].fspath.parent.parent == tmp_path + + +@pytest.mark.parametrize("results_function", [run_no_submitter, run_submitter]) +def test_shell_cmd_inputspec_state_3(worker, results_function, tmp_path): + """adding state to the File-input from input_spec""" + + file_1 = tmp_path / "file_pydra.txt" + file_2 = tmp_path / "file_nice.txt" + with open(file_1, "w") as f: + f.write("hello from pydra") + with open(file_2, "w") as f: + f.write("have a nice one") + + cmd_exec = "cat" + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + 
executable = cmd_exec + file: File = shell.arg( + position=1, + help="files", + argstr="", + ) + + class Outputs(shell.Outputs): + pass + + shelly = Shelly().split(file=[file_1, file_2]) + + assert shelly.executable == cmd_exec + # todo: this doesn't work when state + # assert shelly.cmdline == "echo HELLO" + outputs = results_function(shelly, worker=worker, cache_root=tmp_path) + assert outputs.stdout == ["hello from pydra", "have a nice one"] + + +@pytest.mark.parametrize("results_function", [run_no_submitter, run_submitter]) +def test_shell_cmd_inputspec_copyfile_state_1(worker, results_function, tmp_path): + """adding state to the File-input from input_spec""" + + file1 = tmp_path / "file1.txt" + with open(file1, "w") as f: + f.write("hello from pydra\n") + + file2 = tmp_path / "file2.txt" + with open(file2, "w") as f: + f.write("hello world\n") + + files = [str(file1), str(file2)] + cmd = ["sed", "-is", "s/hello/hi/"] + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + executable = cmd + orig_file: File = shell.arg( + position=1, + argstr="", + help="orig file", + copy_mode="copy", + ) + + class Outputs(shell.Outputs): + out_file: File = shell.out( + callable=lambda orig_file: orig_file, + help="output file", + ) + + shelly = Shelly( + executable=cmd, + ).split("orig_file", orig_file=files) + + txt_l = ["from pydra", "world"] + outputs = results_function(shelly, worker=worker, cache_root=tmp_path) + for i in range(len(files)): + assert outputs.stdout[i] == "" + assert outputs.out_file[i].fspath.exists() + # the file is copied, and than it is changed in place + assert outputs.out_file[i].fspath.parent.parent == tmp_path + with open(outputs.out_file[i]) as f: + assert f"hi {txt_l[i]}\n" == f.read() + # the original file is unchanged + with open(files[i]) as f: + assert f"hello {txt_l[i]}\n" == f.read() + + +# customised input_spec in Workflow + + +@pytest.mark.flaky(reruns=2) # when dask +def test_wf_shell_cmd_2(worker, tmp_path): + """a workflow with input with defined path_template (str) + that requires wf.lzin + """ + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + executable = "touch" + + arg: str = shell.arg() + + class Outputs(shell.Outputs): + out1: File = shell.outarg( + path_template="{arg}", + help="output file", + ) + + @workflow.define(outputs=["out_f", "stdout"]) + def Workflow(cmd, arg): + + shelly = workflow.add( + Shelly( + executable=cmd, + arg=arg, + ) + ) + + return shelly.out1, shelly.stdout + + wf = Workflow(cmd="touch", arg="newfile.txt") + + with Submitter(worker=worker, cache_root=tmp_path) as sub: + res = sub(wf) + + assert res.outputs.stdout == "" + assert res.outputs.out_f.fspath.exists() + assert res.outputs.out_f.fspath.parent.parent == tmp_path + + +def test_wf_shell_cmd_2a(worker, tmp_path): + """a workflow with input with defined path_template (tuple) + that requires wf.lzin + """ + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + executable = "shelly" + + arg: str = shell.arg() + + class Outputs(shell.Outputs): + out1: File = shell.outarg( + path_template="{arg}", + help="output file", + ) + + @workflow.define(outputs=["out_f", "out"]) + def Workflow(cmd, arg): + + shelly = workflow.add( + Shelly( + executable=cmd, + arg=arg, + ) + ) + + return shelly.out1, shelly.stdout + + wf = Workflow(cmd="touch", arg="newfile.txt") + + with Submitter(worker=worker) as sub: + res = sub(wf) + + assert res.outputs.out == "" + assert res.outputs.out_f.fspath.exists() + + +def test_wf_shell_cmd_3(worker, tmp_path): + """a 
workflow with 2 tasks, + first one has input with path_template (str, uses wf.lzin), + that is passed to the second task + """ + + @shell.define + class Shelly1(shell.Task["Shelly1.Outputs"]): + executable = "shelly" + + arg: str = shell.arg(argstr=None) + + class Outputs(shell.Outputs): + file: File = shell.outarg( + path_template="{arg}", + help="output file", + ) + + @shell.define + class Shelly2(shell.Task["Shelly2.Outputs"]): + + executable = "shelly2" + + orig_file: File = shell.arg( + position=1, + help="output file", + argstr="", + ) + + class Outputs(shell.Outputs): + out_file: File = shell.outarg( + position=2, + argstr="", + path_template="{orig_file}_copy", + help="output file", + ) + + @workflow.define(outputs=["touch_file", "out1", "cp_file", "out2"]) + def Workflow(cmd1, cmd2, arg): + + shelly1 = workflow.add( + Shelly1( + executable=cmd1, + arg=arg, + ) + ) + shelly2 = workflow.add( + Shelly2( + executable=cmd2, + orig_file=shelly1.file, + ) + ) + + return shelly1.file, shelly1.stdout, shelly2.out_file, shelly2.stdout + + wf = Workflow(cmd1="touch", cmd2="cp", arg="newfile.txt") + + with Submitter(worker=worker, cache_root=tmp_path) as sub: + res = sub(wf) + + assert res.outputs.out1 == "" + assert res.outputs.touch_file.fspath.exists() + assert res.outputs.touch_file.fspath.parent.parent == tmp_path + assert res.outputs.out2 == "" + assert res.outputs.cp_file.fspath.exists() + assert res.outputs.cp_file.fspath.parent.parent == tmp_path + + +def test_wf_shell_cmd_3a(worker, tmp_path): + """a workflow with 2 tasks, + first one has input with path_template (str, uses wf.lzin), + that is passed to the second task + """ + + @shell.define + class Shelly1(shell.Task["Shelly1.Outputs"]): + executable = "shelly" + arg: str = shell.outarg(argstr=None) + + class Outputs(shell.Outputs): + + file: File = shell.outarg( + path_template="{arg}", + help="output file", + ) + + @shell.define + class Shelly2(shell.Task["Shelly2.Outputs"]): + executable = "shelly2" + orig_file: str = shell.arg( + position=1, + help="output file", + argstr="", + ) + + class Outputs(shell.Outputs): + out_file: File = shell.outarg( + position=2, + argstr="", + path_template="{orig_file}_cp", + help="output file", + ) + + @workflow.define(outputs=["touch_file", "out1", "cp_file", "out2"]) + def Workflow(cmd1, cmd2, arg): + + shelly1 = workflow.add( + Shelly1( + executable=cmd1, + arg=arg, + ) + ) + shelly2 = workflow.add( + Shelly2( + executable=cmd2, + orig_file=shelly1.file, + ) + ) + + return shelly1.file, shelly1.stdout, shelly2.out_file, shelly2.stdout + + wf = Workflow(cmd1="touch", cmd2="cp", arg="newfile.txt") + + with Submitter(worker=worker) as sub: + res = sub(wf) + + assert res.outputs.out1 == "" + assert res.outputs.touch_file.fspath.exists() + assert res.outputs.out2 == "" + assert res.outputs.cp_file.fspath.exists() + + +def test_wf_shell_cmd_state_1(worker, tmp_path): + """a workflow with 2 tasks and splitter on the wf level, + first one has input with path_template (str, uses wf.lzin), + that is passed to the second task + """ + + @shell.define + class Shelly1(shell.Task["Shelly1.Outputs"]): + executable = "shelly1" + + arg: str = shell.arg(argstr=None) + + class Outputs(shell.Outputs): + file: File = shell.outarg( + path_template="{arg}", + help="output file", + ) + + @shell.define + class Shelly2(shell.Task["Shelly2.Outputs"]): + executable = "shelly2" + orig_file: str = shell.arg( + position=1, + help="output file", + argstr="", + ) + + class Outputs(shell.Outputs): + out_file: File = 
shell.outarg( + position=2, + argstr="", + path_template="{orig_file}_copy", + help="output file", + ) + + @workflow.define(outputs=["touch_file", "out1", "cp_file", "out2"]) + def Workflow(cmd1, cmd2, arg): + + shelly1 = workflow.add( + Shelly1( + executable=cmd1, + arg=arg, + ) + ) + shelly2 = workflow.add( + Shelly2( + executable=cmd2, + orig_file=shelly1.file, + ) + ) + + return shelly1.file, shelly1.stdout, shelly2.out_file, shelly2.stdout + + wf = Workflow(cmd1="touch", cmd2="cp").split(arg=["newfile_1.txt", "newfile_2.txt"]) + + with Submitter(worker=worker, cache_root=tmp_path) as sub: + res = sub(wf) + + for i in range(2): + assert res.outputs.out1[i] == "" + assert res.outputs.touch_file[i].fspath.exists() + assert res.outputs.touch_file[i].fspath.parent.parent == tmp_path + assert res.outputs.out2[i] == "" + assert res.outputs.cp_file[i].fspath.exists() + assert res.outputs.cp_file[i].fspath.parent.parent == tmp_path + + +def test_wf_shell_cmd_ndst_1(worker, tmp_path): + """a workflow with 2 tasks and a splitter on the node level, + first one has input with path_template (str, uses wf.lzin), + that is passed to the second task + """ + + @shell.define + class Shelly1(shell.Task["Shelly1.Outputs"]): + executable = "shelly" + + arg: str = shell.arg(argstr=None) + + class Outputs(shell.Outputs): + file: File = shell.outarg( + path_template="{arg}", + help="output file", + ) + + @shell.define + class Shelly2(shell.Task["Shelly2.Outputs"]): + executable = "shelly2" + + orig_file: str = shell.arg( + position=1, + help="output file", + argstr="", + ) + + class Outputs(shell.Outputs): + out_file: File = shell.outarg( + position=2, + argstr="", + path_template="{orig_file}_copy", + help="output file", + ) + + @workflow.define(outputs=["touch_file", "out1", "cp_file", "out2"]) + def Workflow(cmd1, cmd2, args): + + shelly1 = workflow.add( + Shelly1( + executable=cmd1, + ).split("arg", arg=args) + ) + shelly2 = workflow.add( + Shelly2( + executable=cmd2, + orig_file=shelly1.file, + ) + ) + + return shelly1.file, shelly1.stdout, shelly2.out_file, shelly2.stdout + + wf = Workflow( + cmd1="touch", + cmd2="cp", + args=["newfile_1.txt", "newfile_2.txt"], + ) + + with Submitter(worker=worker, cache_root=tmp_path) as sub: + res = sub(wf) + + assert res.outputs.out1 == ["", ""] + assert all([file.fspath.exists() for file in res.outputs.touch_file]) + assert res.outputs.out2 == ["", ""] + assert all([file.fspath.exists() for file in res.outputs.cp_file]) + + +# customised output task + + +@pytest.mark.parametrize("results_function", [run_no_submitter, run_submitter]) +def test_shell_cmd_outputspec_1(worker, results_function, tmp_path): + """ + customised output_spec, adding files to the output, providing specific pathname + """ + cmd = ["touch", "newfile_tmp.txt"] + Shelly = shell.define( + cmd, + outputs=[ + shell.out(name="newfile", type=File, callable=lambda: "newfile_tmp.txt") + ], + ) + shelly = Shelly() + + outputs = results_function(shelly, worker=worker, cache_root=tmp_path) + assert outputs.stdout == "" + assert outputs.newfile.fspath.exists() + + +@pytest.mark.parametrize("results_function", [run_no_submitter, run_submitter]) +def test_shell_cmd_outputspec_1a(worker, results_function, tmp_path): + """ + customised output_spec, adding files to the output, providing specific pathname + """ + cmd = ["touch", "newfile_tmp.txt"] + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + + executable = cmd + + class Outputs(shell.Outputs): + newfile: File = 
shell.outarg(path_template="newfile_tmp.txt") + + shelly = Shelly() + + outputs = results_function(shelly, worker=worker, cache_root=tmp_path) + assert outputs.stdout == "" + assert outputs.newfile.fspath.exists() + + +def test_shell_cmd_outputspec_1b_exception(worker, tmp_path): + """ + customised output_spec, adding files to the output, providing specific pathname + """ + cmd = ["touch", "newfile_tmp.txt"] + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + + executable = cmd + + class Outputs(shell.Outputs): + newfile: File = shell.out(callable=lambda: "newfile_tmp_.txt") + + shelly = Shelly() + + with pytest.raises(Exception) as exinfo: + shelly(worker=worker) + assert "does not exist" in str(exinfo.value) + + +@pytest.mark.parametrize("results_function", [run_no_submitter, run_submitter]) +def test_shell_cmd_outputspec_2(worker, results_function, tmp_path): + """ + customised output_spec, adding files to the output, + using a wildcard in default + """ + cmd = ["touch", "newfile_tmp.txt"] + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + + executable = cmd + + class Outputs(shell.Outputs): + newfile: File = shell.outarg(path_template="newfile_*.txt") + + shelly = Shelly() + + outputs = results_function(shelly, worker=worker, cache_root=tmp_path) + assert outputs.stdout == "" + assert outputs.newfile.fspath.exists() + + +def test_shell_cmd_outputspec_2a_exception(worker, tmp_path): + """ + customised output_spec, adding files to the output, + using a wildcard in default + """ + cmd = ["touch", "newfile_tmp.txt"] + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + + executable = cmd + + class Outputs(shell.Outputs): + newfile: File = shell.out(default="newfile_*K.txt") + + shelly = Shelly() + + with pytest.raises(FileNotFoundError): + shelly(cache_root=tmp_path, worker="debug") + + +@pytest.mark.parametrize("results_function", [run_no_submitter, run_submitter]) +def test_shell_cmd_outputspec_3(worker, results_function, tmp_path): + """ + customised output_spec, adding files to the output, + using a wildcard in default, should collect two files + """ + cmd = ["touch", "newfile_tmp1.txt", "newfile_tmp2.txt"] + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + + executable = cmd + + class Outputs(shell.Outputs): + newfile: MultiOutputFile = "newfile_*.txt" + + shelly = Shelly() + + outputs = results_function(shelly, worker=worker, cache_root=tmp_path) + assert outputs.stdout == "" + # newfile is a list + assert len(outputs.newfile) == 2 + assert all([file.fspath.exists() for file in outputs.newfile]) + + +@pytest.mark.parametrize("results_function", [run_no_submitter, run_submitter]) +def test_shell_cmd_outputspec_5(worker, results_function, tmp_path): + """ + customised output_spec, adding files to the output, + using a function to collect output, the function is saved in the field metadata + and uses cache_dir and the glob function + """ + cmd = ["touch", "newfile_tmp1.txt", "newfile_tmp2.txt"] + + def gather_output(field, cache_dir): + if field.name == "newfile": + return list(Path(cache_dir).expanduser().glob("newfile*.txt")) + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + + executable = cmd + + class Outputs(shell.Outputs): + newfile: MultiOutputFile = shell.out(callable=gather_output) + + shelly = Shelly() + + outputs = results_function(shelly, worker=worker, cache_root=tmp_path) + assert outputs.stdout == "" + # newfile is a list + assert len(outputs.newfile) == 2 + assert all([file.fspath.exists() for file 
in outputs.newfile]) + assert get_output_names(shelly) == ["newfile", "return_code", "stderr", "stdout"] + + +@pytest.mark.parametrize("results_function", [run_no_submitter, run_submitter]) +def test_shell_cmd_outputspec_5a(worker, results_function, tmp_path): + """ + customised output_spec, adding files to the output, + using a function to collect output, the function is saved in the field metadata + and uses cache_dir and inputs element + """ + cmd = ["touch", "newfile_tmp1.txt", "newfile_tmp2.txt"] + + def gather_output(executable, cache_dir): + files = executable[1:] + return [Path(cache_dir) / file for file in files] + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + + executable = cmd + + class Outputs(shell.Outputs): + + newfile: MultiOutputFile = shell.out(callable=gather_output) + + shelly = Shelly() + + outputs = results_function(shelly, worker=worker, cache_root=tmp_path) + assert outputs.stdout == "" + # newfile is a list + assert len(outputs.newfile) == 2 + assert all([file.fspath.exists() for file in outputs.newfile]) + + +def test_shell_cmd_outputspec_5b_error(tmp_path): + """ + customised output_spec, adding files to the output, + using a function to collect output, the function is saved in the field metadata + with an argument that is not part of the inputs - error is raised + """ + cmd = ["touch", "newfile_tmp1.txt", "newfile_tmp2.txt"] + + def gather_output(executable, cache_dir, ble): + files = executable[1:] + return [Path(cache_dir) / file for file in files] + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + + executable = cmd + + class Outputs(shell.Outputs): + newfile: File = shell.out(callable=gather_output) + + shelly = Shelly() + with pytest.raises(KeyError, match="ble"): + shelly(cache_root=tmp_path) + + +@pytest.mark.parametrize("results_function", [run_no_submitter, run_submitter]) +def test_shell_cmd_outputspec_5c(worker, results_function, tmp_path): + """ + Customised output defined as a class, + using a static function to collect output files. 
+ """ + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + + executable = ["touch", "newfile_tmp1.txt", "newfile_tmp2.txt"] + + class Outputs(shell.Outputs): + + @staticmethod + def gather_output(executable, cache_dir): + files = executable[1:] + return [Path(cache_dir) / file for file in files] + + newfile: MultiOutputFile = shell.out(callable=gather_output) + + shelly = Shelly() + + outputs = results_function(shelly, worker=worker, cache_root=tmp_path) + assert outputs.stdout == "" + # newfile is a list + assert len(outputs.newfile) == 2 + assert all([file.exists() for file in outputs.newfile]) + + +@pytest.mark.parametrize("results_function", [run_no_submitter, run_submitter]) +def test_shell_cmd_outputspec_6(worker, results_function, tmp_path): + """ + providing output name by providing path_template + (similar to the previous example, but not touching input_spec) + """ + cmd = "touch" + arg = "newfile_tmp.txt" + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + + executable = cmd + + arg: str = shell.arg() + + class Outputs(shell.Outputs): + + out1: File = shell.outarg( + path_template="{arg}", + help="output file", + ) + + shelly = Shelly( + executable=cmd, + arg=arg, + ) + + outputs = results_function(shelly, worker=worker, cache_root=tmp_path) + assert outputs.stdout == "" + assert outputs.out1.fspath.exists() + + +def test_shell_cmd_outputspec_6a(tmp_path): + """ + providing output name by providing path_template + (using shorter syntax) + """ + cmd = "touch" + arg = "newfile_tmp.txt" + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + + executable = cmd + + arg: str = shell.arg(argstr=None) + + class Outputs(shell.Outputs): + + out1: File = shell.outarg( + path_template="{arg}", + help="output file", + ) + + shelly = Shelly(arg=arg) + + outputs = shelly(cache_root=tmp_path) + assert outputs.stdout == "" + assert outputs.out1.fspath.exists() + + +@pytest.mark.xfail( + sys.version_info >= (3, 11), + reason=( + "Fails on Python 3.11 in some cases (presumably a typing thing with that specific " + "version of Python)" + ), +) +@pytest.mark.parametrize("results_function", [run_no_submitter, run_submitter]) +def test_shell_cmd_outputspec_7(tmp_path, worker, results_function): + """ + providing output with output_file_name and using MultiOutputFile as a type. + the input field used in the template is a MultiInputObj, so it can be and is a list + """ + file = tmp_path / "script.sh" + file.write_text('for var in "$@"; do touch file"$var".txt; done') + + cmd = "bash" + new_files_id = ["1", "2", "3"] + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + executable = cmd + script: File = shell.arg( + help="script file", + position=1, + argstr="", + ) + files_id: MultiInputObj = shell.arg( + position=2, + argstr="...", + sep=" ", + help="list of name indices", + ) + + class Outputs(shell.Outputs): + + new_files: MultiOutputFile = shell.outarg( + argstr=None, + path_template="file{files_id}.txt", + help="output file", + ) + + shelly = Shelly( + script=file, + files_id=new_files_id, + ) + + outputs = results_function(shelly, worker=worker, cache_root=tmp_path) + assert outputs.stdout == "" + for file in outputs.new_files: + assert file.fspath.exists() + + +@pytest.mark.parametrize("results_function", [run_no_submitter, run_submitter]) +def test_shell_cmd_outputspec_7a(tmp_path, worker, results_function): + """ + providing output with output_file_name and using MultiOutputFile as a type. 
+ the input field used in the template is a MultiInputObj, but a single element is used + """ + file = tmp_path / "script.sh" + file.write_text('for var in "$@"; do touch file"$var".txt; done') + + cmd = "bash" + new_files_id = "1" + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + executable = cmd + script: File = shell.arg( + help="script file", + position=1, + argstr="", + ) + files_id: MultiInputObj = shell.arg( + position=2, + argstr="...", + sep=" ", + help="list of name indices", + ) + + class Outputs(shell.Outputs): + + new_files: MultiOutputFile = shell.outarg( + argstr=None, + path_template="file{files_id}.txt", + help="output file", + ) + + shelly = Shelly( + script=file, + files_id=new_files_id, + ) + + # XXX: Figure out why this fails with "cf". Occurs in CI when using Ubuntu + Python >= 3.10 + # (but not when using macOS + Python >= 3.10). Same error occurs in test_shell_cmd_inputspec_11 + # see https://github.com/nipype/pydra/issues/671 + outputs = results_function(shelly, cache_root=tmp_path) + assert outputs.stdout == "" + assert outputs.new_files.fspath.exists() + + +@pytest.mark.parametrize("results_function", [run_no_submitter, run_submitter]) +def test_shell_cmd_outputspec_8a(tmp_path, worker, results_function): + """ + customised output_spec, adding int and str to the output, + requiring two callables with parameters stdout and stderr + """ + cmd = "echo" + args = [["newfile_1.txt"], ["newfile_2.txt"]] + + def get_file_index(stdout): + stdout = re.sub(r".*_", "", stdout) + stdout = re.sub(r".txt", "", stdout) + print(stdout) + return int(stdout) + + def get_stderr(stderr): + return f"stderr: {stderr}" + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + + executable = cmd + + class Outputs(shell.Outputs): + + out_file_index: int = shell.out( + help="output file", + callable=get_file_index, + ) + stderr_field: str = shell.out( + help="The standard error output", + callable=get_stderr, + ) + + shelly = Shelly().split(append_args=args) + + outputs = results_function(shelly, worker=worker, cache_root=tmp_path) + for index in range(2): + assert outputs.out_file_index[index] == index + 1 + assert outputs.stderr_field[index] == f"stderr: {outputs.stderr[index]}" + + +def test_shell_cmd_outputspec_8b_error(tmp_path): + """ + customised output_spec, adding Int to the output, + requiring a function to collect output + """ + + with pytest.raises( + ValueError, + match="A shell output field must have either a callable or a path_template", + ): + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + executable = "echo" + + class Outputs(shell.Outputs): + out: int = shell.out(help="output file") + + +@pytest.mark.parametrize("results_function", [run_no_submitter, run_submitter]) +def test_shell_cmd_outputspec_8c(tmp_path, worker, results_function): + """ + customised output_spec, adding Directory to the output named by args + """ + + def get_lowest_directory(directory_path): + return str(directory_path).replace(str(Path(directory_path).parents[0]), "") + + cmd = "mkdir" + args = [f"{tmp_path}/dir1", f"{tmp_path}/dir2"] + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + + executable = cmd + + arg: str = shell.arg() + + class Outputs(shell.Outputs): + + resultsDir: Directory = shell.outarg( + path_template="{arg}", + help="output file", + ) + + shelly = Shelly(resultsDir="outdir").split(arg=args) + + results_function(shelly, worker=worker, cache_root=tmp_path) + for index, arg_dir in enumerate(args): + assert Path(Path(tmp_path) / 
Path(arg_dir)).exists() + assert get_lowest_directory(arg_dir) == f"/dir{index+1}" + + +@pytest.mark.parametrize("results_function", [run_no_submitter, run_submitter]) +def test_shell_cmd_outputspec_8d(tmp_path, worker, results_function): + """ + customised output_spec, adding Directory to the output named by input task + """ + + # For /tmp/some_dict/test this function returns "/test" + def get_lowest_directory(directory_path): + return str(directory_path).replace(str(Path(directory_path).parents[0]), "") + + cmd = "mkdir" + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + executable = cmd + resultsDir: str = shell.arg( + position=1, + help="new directory", + argstr="", + ) + + class Outputs(shell.Outputs): + + resultsDir: Directory = shell.outarg( + path_template="{resultsDir}", + help="output file", + ) + + shelly = Shelly(resultsDir="test") + assert get_output_names(shelly) == ["resultsDir", "return_code", "stderr", "stdout"] + cache_root = tmp_path / "cache" + outputs = results_function(shelly, worker=worker, cache_root=cache_root) + cache_dir = next(p for p in cache_root.iterdir() if p.name.startswith("shell-")) + assert (cache_dir / Path("test")).exists() + assert get_lowest_directory(outputs.resultsDir) == get_lowest_directory( + cache_dir / Path("test") + ) + + +@pytest.mark.parametrize("results_function", [run_no_submitter, run_submitter]) +def test_shell_cmd_state_outputspec_1(worker, results_function, tmp_path): + """ + providing output name by providing path_template + splitter for a field that is used in the template + """ + cmd = "touch" + args = ["newfile_1.txt", "newfile_2.txt"] + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + + executable = cmd + + arg: str = shell.arg() + + class Outputs(shell.Outputs): + + out1: File = shell.outarg( + path_template="{arg}", + help="output file", + ) + + shelly = Shelly(executable=cmd).split(arg=args) + + outputs = results_function(shelly, worker=worker, cache_root=tmp_path) + for i in range(len(args)): + assert outputs.stdout[i] == "" + assert outputs.out1[i].fspath.exists() + + +# customised output_spec for tasks in workflows + + +def test_shell_cmd_outputspec_wf_1(worker, tmp_path): + """ + customised output_spec for tasks within a Workflow, + adding files to the output, providing specific pathname + """ + + cmd = ["touch", "newfile_tmp.txt"] + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + + executable = "shelly" + + class Outputs(shell.Outputs): + newfile: File = shell.outarg(path_template="newfile_tmp.txt") + + @workflow.define(outputs=["stdout", "newfile"]) + def Workflow(cmd): + shelly = workflow.add(Shelly(executable=cmd)) + return shelly.stdout, shelly.newfile + + wf = Workflow(cmd=cmd) + + with Submitter(worker=worker, cache_root=tmp_path) as sub: + res = sub(wf) + + assert res.outputs.stdout == "" + assert res.outputs.newfile.fspath.exists() + # checking if the file was copied to the wf dir + assert res.outputs.newfile.fspath.parent.parent == tmp_path + + +def test_shell_cmd_inputspec_outputspec_1(tmp_path): + """ + customised input_spec and output_spec, output_spec uses input_spec fields in templates + """ + cmd = ["touch", "newfile_tmp.txt"] + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + file1: File = shell.arg(help="1st creadted file", argstr="", position=1) + file2: File = shell.arg(help="2nd creadted file", argstr="", position=2) + + class Outputs(shell.Outputs): + newfile1: File = shell.outarg(path_template="{file1}", help="newfile 1") + newfile2: File = 
shell.outarg(path_template="{file2}", help="newfile 2") + + executable = cmd + + shelly = Shelly( + file1=File.mock(tmp_path / "new_file_1.txt"), + file2=File.mock(tmp_path / "new_file_2.txt"), + ) + + outputs = shelly(cache_root=tmp_path) + assert outputs.stdout == "" + assert outputs.newfile1.fspath.exists() + assert outputs.newfile2.fspath.exists() + + +def test_shell_cmd_inputspec_outputspec_1a(tmp_path): + """ + customised input_spec and output_spec, output_spec uses input_spec fields in templates, + file2 is used in a template for newfile2, but it is not provided, so newfile2 is set to NOTHING + """ + cmd = ["touch", "newfile_tmp.txt"] + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + executable = cmd + file1: str = shell.arg(help="1st creadted file", argstr="", position=1) + file2: str | None = shell.arg( + default=None, help="2nd creadted file", argstr="", position=2 + ) + + class Outputs(shell.Outputs): + + newfile1: File = shell.out(callable=lambda file1: file1, help="newfile 1") + newfile2: File | None = shell.out( + callable=lambda file2: file2, help="newfile 2" + ) + + shelly = Shelly( + executable=cmd, + ) + shelly.file1 = File.mock(tmp_path / "new_file_1.txt") + + outputs = shelly(cache_root=tmp_path) + assert outputs.stdout == "" + assert outputs.newfile1.fspath.exists() + # newfile2 is not created, since file2 is not provided + assert outputs.newfile2 is None + + +def test_shell_cmd_inputspec_outputspec_2(tmp_path): + """ + customised input_spec and output_spec, output_spec uses input_spec fields in the requires filed + """ + cmd = ["touch", "newfile_tmp.txt"] + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + executable = cmd + file1: str = shell.arg(help="1st creadted file", argstr="", position=1) + file2: str = shell.arg(help="2nd creadted file", argstr="", position=2) + + class Outputs(shell.Outputs): + + newfile1: File | None = shell.out( + callable=lambda file1: file1, + help="newfile 1", + requires=["file1"], + ) + newfile2: File | None = shell.out( + callable=lambda file2: file2, + help="newfile 1", + requires=["file1", "file2"], + ) + + shelly = Shelly(file1="new_file_1.txt", file2="new_file_2.txt") + assert get_output_names(shelly) == [ + "newfile1", + "newfile2", + "return_code", + "stderr", + "stdout", + ] + + outputs = shelly(cache_root=tmp_path) + assert outputs.stdout == "" + assert outputs.newfile1.fspath.exists() + assert outputs.newfile2.fspath.exists() + + +def test_shell_cmd_inputspec_outputspec_2a(tmp_path): + """ + customised input_spec and output_spec, output_spec uses input_spec fields in the requires filed + """ + cmd = ["touch", "newfile_tmp.txt"] + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + executable = cmd + file1: str = shell.arg(help="1st creadted file", argstr="", position=1) + file2: str | None = shell.arg( + default=None, help="2nd creadted file", argstr="", position=2 + ) + + class Outputs(shell.Outputs): + + newfile1: File | None = shell.out( + callable=lambda file1: file1, + help="newfile 1", + requires=["file1"], + ) + newfile2: File | None = shell.out( + callable=lambda file2: file2, + help="newfile 1", + requires=["file1", "file2"], + ) + + shelly = Shelly( + executable=cmd, + ) + shelly.file1 = tmp_path / "new_file_1.txt" + assert get_output_names(shelly) == [ + "newfile1", + "newfile2", + "return_code", + "stderr", + "stdout", + ] + + outputs = shelly(cache_root=tmp_path) + + assert outputs.stdout == "" + assert outputs.newfile1.fspath.exists() + assert outputs.newfile2 is None + + 
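+# The `requires` metadata on `shell.out` fields, exercised by the tests above and
+# below, means an output is only collected when every listed input has been set;
+# otherwise the output is coalesced to None, and a list of lists is treated as an
+# OR of requirement sets. A minimal sketch of the pattern, assuming the same
+# `shell.out` signature used throughout this file (the field names `maybe_out` and
+# `extra` are illustrative only, not part of this suite):
+#
+#     class Outputs(shell.Outputs):
+#         maybe_out: File | None = shell.out(
+#             callable=lambda file1: file1,   # value collected from the `file1` input
+#             requires=["file1", "extra"],    # only set when both inputs are provided
+#         )
+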
+def test_shell_cmd_inputspec_outputspec_3(tmp_path): + """ + customised input_spec and output_spec, output_spec uses input_spec fields in the requires filed + adding one additional input that is not in the template, but in the requires field, + """ + cmd = ["touch", "newfile_tmp.txt"] + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + executable = cmd + file1: str = shell.arg(help="1st creadted file", argstr="", position=1) + file2: str = shell.arg(help="2nd creadted file", argstr="", position=2) + additional_inp: int = shell.arg(help="additional inp") + + class Outputs(shell.Outputs): + + newfile1: File = shell.out(callable=lambda file1: file1, help="newfile 1") + newfile2: File | None = shell.out( + callable=lambda file2: file2, + help="newfile 1", + requires=["file1", "additional_inp"], + ) + + shelly = Shelly(executable=cmd) + shelly.file1 = tmp_path / "new_file_1.txt" + shelly.file2 = tmp_path / "new_file_2.txt" + shelly.additional_inp = 2 + + outputs = shelly(cache_root=tmp_path / "cache") + assert outputs.stdout == "" + assert outputs.newfile1.fspath.exists() + assert outputs.newfile2.fspath.exists() + + +def test_shell_cmd_inputspec_outputspec_3a(tmp_path): + """ + customised input_spec and output_spec, output_spec uses input_spec fields in the requires filed + adding one additional input that is not in the template, but in the requires field, + the additional input not provided, so the output is NOTHING + """ + cmd = ["touch", "newfile_tmp.txt"] + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + executable = cmd + file1: str = shell.arg(help="1st creadted file", argstr="", position=1) + file2: str | None = shell.arg(help="2nd creadted file", argstr="", position=2) + additional_inp: str | None = shell.arg(default=None, help="additional inp") + + class Outputs(shell.Outputs): + + newfile1: File = shell.out(callable=lambda file1: file1, help="newfile 1") + newfile2: File | None = shell.out( + callable=lambda file2: file2, + help="newfile 1", + requires=["file1", "additional_inp"], + ) + + shelly = Shelly( + executable=cmd, + ) + shelly.file1 = "new_file_1.txt" + shelly.file2 = "new_file_2.txt" + assert get_output_names(shelly) == [ + "newfile1", + "newfile2", + "return_code", + "stderr", + "stdout", + ] + + shelly.file2 = None + outputs = shelly(cache_root=tmp_path) + + assert outputs.stdout == "" + assert outputs.newfile1.fspath.exists() + # additional input not provided so no newfile2 set (even if the file was created) + assert outputs.newfile2 is None + + +def test_shell_cmd_inputspec_outputspec_4(tmp_path): + """ + customised input_spec and output_spec, output_spec uses input_spec fields in the requires filed + adding one additional input to the requires together with a list of the allowed values, + """ + cmd = ["touch", "newfile_tmp.txt"] + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + executable = cmd + file1: str = shell.arg(help="1st creadted file", argstr="", position=1) + additional_inp: int | None = shell.arg(help="additional inp", default=None) + + class Outputs(shell.Outputs): + + newfile1: File | None = shell.out( + callable=lambda file1: file1, + help="newfile 1", + requires=["file1", ("additional_inp", [2, 3])], + ) + + shelly = Shelly( + executable=cmd, + ) + shelly.file1 = File.mock("new_file_1.txt") + shelly.additional_inp = 2 + + outputs = shelly(cache_root=tmp_path) + assert get_output_names(shelly) == [ + "newfile1", + "return_code", + "stderr", + "stdout", + ] + + assert outputs.stdout == "" + assert 
outputs.newfile1.fspath.exists() + + +def test_shell_cmd_inputspec_outputspec_4a(tmp_path): + """ + customised input_spec and output_spec, output_spec uses input_spec fields in the requires filed + adding one additional input to the requires together with a list of the allowed values, + the input is set to a value that is not in the list, so output is None + """ + cmd = ["touch", "newfile_tmp.txt"] + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + executable = cmd + file1: str = shell.arg(help="1st creadted file", argstr="", position=1) + additional_inp: int | None = shell.arg(help="additional inp", default=None) + + class Outputs(shell.Outputs): + + newfile1: File | None = shell.out( + callable=lambda file1: file1, + help="newfile 1", + requires=("file1", ("additional_inp", [2, 3])), + ) + + shelly = Shelly(executable=cmd) + shelly.file1 = File.mock("new_file_1.txt") + # the value is not in the list from requires + shelly.additional_inp = 1 + + outputs = shelly(cache_root=tmp_path) + assert outputs.stdout == "" + assert outputs.newfile1 is None + + +def test_shell_cmd_inputspec_outputspec_5(tmp_path): + """ + customised input_spec and output_spec, output_spec uses input_spec fields in the requires + requires is a list of list so it is treated as OR list (i.e. el[0] OR el[1] OR...) + the firs element of the requires list has all the fields set + """ + cmd = ["touch", "newfile_tmp.txt"] + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + executable = cmd + file1: str = shell.arg(help="1st creadted file", argstr="", position=1) + additional_inp_A: int | None = shell.arg(help="additional inp A", default=None) + additional_inp_B: str | None = shell.arg(help="additional inp B", default=None) + + class Outputs(shell.Outputs): + + newfile1: File | None = shell.out( + callable=lambda file1: file1, + help="newfile 1", + # requires is a list of list so it's treated as el[0] OR el[1] OR... + requires=[ + ["file1", "additional_inp_A"], + ["file1", "additional_inp_B"], + ], + ) + + shelly = Shelly( + executable=cmd, + ) + shelly.file1 = File.mock("new_file_1.txt") + shelly.additional_inp_A = 2 + + outputs = shelly(cache_root=tmp_path) + assert outputs.stdout == "" + assert outputs.newfile1.fspath.exists() + + +def test_shell_cmd_inputspec_outputspec_5a(tmp_path): + """ + customised input_spec and output_spec, output_spec uses input_spec fields in the requires + requires is a list of list so it is treated as OR list (i.e. el[0] OR el[1] OR...) + the second element of the requires list (i.e. additional_inp_B) has all the fields set + """ + cmd = ["touch", "newfile_tmp.txt"] + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + executable = cmd + file1: str = shell.arg(help="1st creadted file", argstr="", position=1) + additional_inp_A: str | None = shell.arg(help="additional inp A", default=None) + additional_inp_B: int | None = shell.arg(help="additional inp B", default=None) + + class Outputs(shell.Outputs): + + newfile1: File | None = shell.out( + callable=lambda file1: file1, + help="newfile 1", + # requires is a list of list so it's treated as el[0] OR el[1] OR... 
+ requires=[ + ["file1", "additional_inp_A"], + ["file1", "additional_inp_B"], + ], + ) + + shelly = Shelly( + executable=cmd, + ) + shelly.file1 = File.mock("new_file_1.txt") + shelly.additional_inp_B = 2 + + outputs = shelly(cache_root=tmp_path) + assert outputs.stdout == "" + assert outputs.newfile1.fspath.exists() + + +def test_shell_cmd_inputspec_outputspec_5b(tmp_path): + """ + customised input_spec and output_spec, output_spec uses input_spec fields in the requires + requires is a list of list so it is treated as OR list (i.e. el[0] OR el[1] OR...) + neither of the list from requirements has all the fields set, so the output is NOTHING + """ + cmd = ["touch", "newfile_tmp.txt"] + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + executable = cmd + file1: str = shell.arg(help="1st creadted file", argstr="", position=1) + additional_inp_A: str | None = shell.arg(help="additional inp A", default=None) + additional_inp_B: str | None = shell.arg(help="additional inp B", default=None) + + class Outputs(shell.Outputs): + + newfile1: File | None = shell.out( + callable=lambda file1: file1, + help="newfile 1", + # requires is a list of list so it's treated as el[0] OR el[1] OR... + requires=[ + ["file1", "additional_inp_A"], + ["file1", "additional_inp_B"], + ], + ) + + shelly = Shelly(executable=cmd) + shelly.file1 = tmp_path / "new_file_1.txt" + + outputs = shelly(cache_root=tmp_path / "cache") + assert outputs.stdout == "" + # neither additional_inp_A nor additional_inp_B is set, so newfile1 is None + assert outputs.newfile1 is None + + +@pytest.mark.xfail( + reason="I'm not sure why this requirements specification should fail" +) +def test_shell_cmd_inputspec_outputspec_6_except(tmp_path): + """ + customised input_spec and output_spec, output_spec uses input_spec fields in the requires + requires has invalid syntax - exception is raised + """ + + with pytest.raises(Exception, match="requires field can be"): + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + executable = "touch" + file1: str = shell.arg(help="1st creadted file", argstr="", position=1) + additional_inp_A: str | None = shell.arg( + default=None, help="additional inp A" + ) + + class Outputs(shell.Outputs): + newfile1: File | None = shell.out( + callable=lambda file1: file1, + help="newfile 1", + # requires has invalid syntax + requires=[["file1", "additional_inp_A"], "file1"], + ) + + +def no_fsl(): + if "FSLDIR" not in os.environ: + return True + + +@pytest.mark.skipif(no_fsl(), reason="fsl is not installed") +def test_fsl(data_tests_dir, tmp_path): + """mandatory field added to fields, value provided""" + + def change_name(file): + name, ext = os.path.splitext(file) + return f"{name}_brain.{ext}" + + @shell.define( + xor=[ + "functional", + "reduce_bias", + "robust", + "padding", + "remove_eyes", + "surfaces", + "t2_guided", + None, + ] + ) + class Bet(shell.Task["Bet.Outputs"]): + executable = "bet" + in_file: File = shell.arg( + help="input file to skull strip", + position=1, + argstr="", + ) + + outline: bool = shell.arg( + default=False, + help="create surface outline image", + argstr="-o", + ) + mask: bool = shell.arg( + default=False, + help="create binary mask image", + argstr="-m", + ) + skull: bool = shell.arg( + default=False, + help="create skull image", + argstr="-s", + ) + no_output: bool = shell.arg( + default=False, + help="Don't generate segmented output", + argstr="-n", + ) + frac: float | None = shell.arg( + default=None, + help="fractional intensity threshold", + 
argstr="-f", + ) + vertical_gradient: float | None = shell.arg( + default=None, + help="vertical gradient in fractional intensity threshold (-1, 1)", + argstr="-g", + allowed_values={"min_val": -1, "max_val": 1}, + ) + radius: int | None = shell.arg(default=None, argstr="-r", help="head radius") + center: ty.List[int] | None = shell.arg( + default=None, + help="center of gravity in voxels", + argstr="-c", + allowed_values={"min_value": 0, "max_value": 3}, + ) + threshold: bool = shell.arg( + default=False, + argstr="-t", + help="apply thresholding to segmented brain image and mask", + ) + mesh: bool = shell.arg( + default=False, + argstr="-e", + help="generate a vtk mesh brain surface", + ) + robust: bool = shell.arg( + default=False, + help="robust brain centre estimation (iterates BET several times)", + argstr="-R", + ) + padding: bool = shell.arg( + default=False, + help="improve BET if FOV is very small in Z (by temporarily padding end slices", + argstr="-Z", + ) + remove_eyes: bool = shell.arg( + default=False, + help="eye & optic nerve cleanup (can be useful in SIENA)", + argstr="-S", + ) + surfaces: bool = shell.arg( + default=False, + help="run bet2 and then betsurf to get additional skull and scalp surfaces (includes registrations)", + argstr="-A", + ) + t2_guided: File | str | None = shell.arg( + default=None, + help="as with creating surfaces, when also feeding in non-brain-extracted T2 (includes registrations)", + argstr="-A2", + ) + functional: bool = shell.arg( + default=False, + argstr="-F", + help="apply to 4D fMRI data", + ) + reduce_bias: bool = shell.arg( + default=False, + argstr="-B", + help="bias field and neck cleanup", + ) + + class Outputs(shell.Outputs): + out_file: File = shell.outarg( + help="name of output skull stripped image", + position=2, + argstr="", + path_template="{in_file}_brain", + ) + + # ("number_classes", int, attr.ib(metadata={help='number of tissue-type classes', argstr='-n', + # allowed_values={"min_val": 1, max_val=10}})), + # ("output_biasfield", bool, + # attr.ib(metadata={help='output estimated bias field', argstr='-b'})), + # ("output_biascorrected", bool, + # attr.ib(metadata={help='output restored image (bias-corrected image)', argstr='-B'})), + + # TODO: not sure why this has to be string + in_file = data_tests_dir / "test.nii.gz" + + # separate command into exec + args + shelly = Bet(in_file=File.mock("/path/to/nifti.nii.gz")) + assert shelly.executable == "bet" + try: + orig_dir = os.getcwd() + os.chdir(tmp_path) + assert ( + shelly.cmdline == f"bet /path/to/nifti.nii.gz {tmp_path}/nifti_brain.nii.gz" + ) + finally: + os.chdir(orig_dir) + shelly = Bet(in_file=in_file) + outputs = shelly(cache_root=tmp_path) + assert outputs.out_file.name == "test_brain.nii.gz" + + +def test_shell_cmd_optional_output_file1(tmp_path): + """ + Test to see that 'unused' doesn't complain about not having an output passed to it + """ + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + input: File = shell.arg(argstr="", help="input file") + + executable = "cp" + + class Outputs(shell.Outputs): + output: File = shell.outarg( + argstr="", + path_template="out.txt", + help="output file", + ) + unused: File | None = shell.outarg( + default=None, + argstr="--not-used", + path_template="out.txt", + help="dummy output", + ) + + file1 = tmp_path / "file1.txt" + file1.write_text("foo") + my_cp = Shelly(input=file1, unused=False) + outputs = my_cp(cache_root=tmp_path) + assert outputs.output.fspath.read_text() == "foo" + + +def 
test_shell_cmd_optional_output_file2(tmp_path): + """ + Test to see that 'unused' doesn't complain about not having an output passed to it + """ + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + executable = "cp" + + input: File = shell.arg(argstr="", help="input file") + + class Outputs(shell.Outputs): + output: File | None = shell.outarg( + argstr="", + path_template="out.txt", + help="dummy output", + ) + + file1 = tmp_path / "file1.txt" + file1.write_text("foo") + my_cp = Shelly(input=file1, output=True) + outputs = my_cp(cache_root=tmp_path) + assert outputs.output.fspath.read_text() == "foo" + + file2 = tmp_path / "file2.txt" + file2.write_text("bar") + my_cp2 = Shelly(input=file2, output=False) + with pytest.raises(RuntimeError): + my_cp2() + + +def test_shell_cmd_non_existing_outputs_1(tmp_path): + """Checking that non existing output files do not return a phantom path, + but return None instead""" + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + executable = "echo" + out_name: str = shell.arg( + help=""" + base name of the pretend outputs. + """, + ) + + class Outputs(shell.Outputs): + out_1: File | None = shell.out( + help="fictional output #1", + callable=lambda: "out_1.nii", + ) + out_2: File | None = shell.out( + help="fictional output #2", + callable=lambda: "out_2.nii", + ) + + shelly = Shelly(out_name="test") + outputs = shelly(cache_root=tmp_path) + assert outputs.out_1 is None + assert outputs.out_2 is None + + +def test_shell_cmd_non_existing_outputs_2(tmp_path): + """Checking that non existing output files do not return a phantom path, + but return None instead. This test has one existing and one non existing output file. + """ + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + executable = "touch" + out_name: str = shell.arg( + help=""" + base name of the pretend outputs. + """, + argstr="{out_name}_1.nii", + ) + + class Outputs(shell.Outputs): + out_1: File = shell.outarg( + help="fictional output #1", + path_template="{out_name}_1.nii", + ) + out_2: File | None = shell.outarg( + help="fictional output #2", + path_template="{out_name}_2.nii", + ) + + shelly = Shelly(out_name="test") + cache_root = tmp_path / "cache" + outputs = shelly(cache_root=cache_root) + # the first output file is created + assert ( + outputs.out_1.fspath + == next(p for p in cache_root.iterdir() if p.name.startswith("shell-")) + / "test_1.nii" + ) + assert outputs.out_1.fspath.exists() + # the second output file is not created + assert outputs.out_2 is None + + +def test_shell_cmd_non_existing_outputs_3(tmp_path): + """Checking that non existing output files do not return a phantom path, + but return None instead. This test has an existing mandatory output and another + non existing output file. + """ + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + executable = "touch" + out_name: str = shell.arg( + help=""" + base name of the pretend outputs. 
+ """, + argstr=None, + ) + + class Outputs(shell.Outputs): + out_1: File = shell.outarg( + help="real output #1", + path_template="{out_name}_1.nii", + ) + out_2: File | None = shell.out( + help="fictional output #2", + callable=lambda out_name: f"{out_name}_2.nii", + ) + + shelly = Shelly(out_name="test") + + cache_root = tmp_path / "cache" + outputs = shelly(cache_root=cache_root) + # the first output file is created + assert ( + outputs.out_1.fspath + == next(p for p in cache_root.iterdir() if p.name.startswith("shell-")) + / "test_1.nii" + ) + assert outputs.out_1.fspath.exists() + # the second output file is not created + assert outputs.out_2 is None + + +def test_shell_cmd_non_existing_outputs_4(tmp_path): + """Checking that non existing output files do not return a phantom path, + but return None instead. This test has an existing mandatory output and another non existing + mandatory output file.""" + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + executable = "touch" + out_name: str = shell.arg( + help="""base name of the pretend outputs.""", + argstr="{out_name}_1.nii", + ) + + class Outputs(shell.Outputs): + out_1: File = shell.out( + help="real output #1", + callable=lambda out_name: f"{out_name}_1.nii", + ) + out_2: File = shell.out( + help="fictional output #2", + callable=lambda out_name: f"{out_name}_2.nii", + ) + + shelly = Shelly(out_name="test") + # An exception should be raised because the second mandatory output does not exist + cache_root = tmp_path / "cache" + with pytest.raises( + ValueError, + match=r"file system path\(s\) provided to mandatory field .* does not exist", + ): + shelly(cache_root=cache_root) + # checking if the first output was created + assert ( + next(p for p in cache_root.iterdir() if p.name.startswith("shell-")) + / "test_1.nii" + ).exists() + + +def test_shell_cmd_non_existing_outputs_multi_1(tmp_path): + """This test looks if non existing files of an multiOuputFile are also set to NOTHING""" + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + executable = "echo" + out_name: MultiInputObj = shell.arg( + help=""" + base name of the pretend outputs. + """, + argstr="...", + ) + + class Outputs(shell.Outputs): + out_list: MultiOutputFile | None = shell.out( + help="fictional output #1", + callable=lambda out_name: out_name, + ) + + shelly = Shelly(out_name=["test_1.nii", "test_2.nii"]) + + # with pytest.raises(ValueError): + outputs = shelly(cache_root=tmp_path) + assert outputs.out_list == None + + +def test_shell_cmd_non_existing_outputs_multi_2(tmp_path): + """This test looks if non existing files of an multiOutputFile are also set to NOTHING. 
+ It checks that it also works if one file of the multiOutputFile actually exists.""" + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + executable = "touch" + out_name: MultiInputObj = shell.arg( + help="""base name of the pretend outputs.""", + sep=" test_1_real.nii", # hacky way of creating an extra file with that name + argstr="...", + ) + + class Outputs(shell.Outputs): + out_list: MultiOutputFile | None = shell.out( + help="fictional output #1", + callable=lambda out_name: f"{out_name}_real.nii", + ) + + shelly = Shelly(out_name=["test_1", "test_2"]) + + outputs = shelly(cache_root=tmp_path) + # checking if the outputs is None + assert outputs.out_list is None + + +def test_shellspec_formatter_1(tmp_path): + """test the input callable 'formatter'.""" + + def formatter_1(inputs): + print("FORMATTER:", inputs) + return f"-t [{inputs['in1']}, {inputs['in2']}]" + + def make_shelly(formatter): + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + executable = "exec" + in1: str = shell.arg( + argstr=None, + help="""just a dummy name""", + ) + in2: str = shell.arg( + argstr=None, + help="""just a dummy name""", + ) + + together: ty.List = shell.arg( + default=attrs.Factory(list), + help="""combines in1 and in2 into a list""", + # When providing a formatter all other metadata options are discarded. + formatter=formatter, + ) + + class Outputs(shell.Outputs): + pass + + return Shelly + + Shelly = make_shelly(formatter=formatter_1) + shelly = Shelly(in1="i1", in2="i2") + assert shelly.cmdline == "exec -t [i1, i2]" + + # testing that the formatter can overwrite a provided value for together. + shelly = Shelly(in1="i1", in2="i2", together=[1]) + assert shelly.cmdline == "exec -t [i1, i2]" + + # asking for specific inputs + def formatter_2(in1, in2): + print("FORMATTER:", in1, in2) + return f"-t [{in1}, {in2}]" + + Shelly = make_shelly(formatter_2) + + shelly = Shelly(in1="i1", in2="i2") + assert shelly.cmdline == "exec -t [i1, i2]" + + def formatter_3(in1, in3): + print("FORMATTER:", in1, in3) + return f"-t [{in1}, {in3}]" + + Shelly = make_shelly(formatter_3) + + shelly = Shelly(in1="i1", in2="i2") + with pytest.raises(Exception) as excinfo: + shelly.cmdline + assert ( + "arguments of the formatter function from together has to be in inputs or be field, but in3 is used" + == str(excinfo.value) + ) + + # checking if field value is accessible when None + def formatter_4(field): + assert isinstance(field, shell.arg) + # formatter must return a string + return "-t test" + + Shelly = make_shelly(formatter_4) + + shelly = Shelly( + in1="i1", + in2="i2", + # together="-t test", + ) + assert shelly.cmdline == "exec -t test" + + +def test_shellspec_formatter_splitter_2(tmp_path): + """test the input callable 'formatter' when a splitter is used on an argument of the formatter.""" + + # asking for specific inputs + def formatter_1(in1, in2): + return f"-t [{in1} {in2}]" + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + executable = "executable" + in1: str = shell.arg(help="in1") + in2: str = shell.arg(help="in2") + together: ty.List = shell.arg( + help=""" + uses in1 + """, + # When providing a formatter all other metadata options are discarded. 
+ formatter=formatter_1, + sep=" ", + ) + + class Outputs(shell.Outputs): + pass + + in1 = ["in11", "in12"] + shelly = Shelly(in2="in2").split("in1", in1=in1) + assert shelly is not None + + # results = shelly.cmdline + # assert len(results) == 2 + # com_results = ["executable -t [in11 in2]", "executable -t [in12 in2]"] + # for i, cr in enumerate(com_results): + # assert results[i] == cr + + +@no_win +def test_shellcommand_error_msg(tmp_path): + script_path = Path(tmp_path) / "script.sh" + + with open(script_path, "w") as f: + f.write( + """#!/bin/bash + echo "first line is ok, it prints '$1'" + /command-that-doesnt-exist""" + ) + + os.chmod( + script_path, + mode=( + stat.S_IRUSR + | stat.S_IWUSR + | stat.S_IXUSR + | stat.S_IRGRP + | stat.S_IWGRP + | stat.S_IROTH + ), + ) + + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + + executable = str(script_path) + + in1: str = shell.arg(help="a dummy string", argstr="") + + class Outputs(shell.Outputs): + pass + + shelly = Shelly(in1="hello") + + with pytest.raises(RuntimeError) as excinfo: + shelly(cache_root=tmp_path) + + path_str = str(script_path) + + assert ( + str(excinfo.value) + == f"""Error running 'main' job with ['{path_str}', 'hello']: + +stderr: +{path_str}: line 3: /command-that-doesnt-exist: No such file or directory + + +stdout: +first line is ok, it prints 'hello' +""" + ) + + +@no_win +def test_shell_cmd(tmpdir): + cmd = ["echo", "hail", "pydra"] + + # all args given as executable + Shelly = shell.define(" ".join(cmd)) + shelly = Shelly() + assert shelly.cmdline == " ".join(cmd) + outputs = shelly() + assert outputs.stdout == " ".join(cmd[1:]) + "\n" + + # separate command into exec + args + Shelly = shell.define( + cmd[0], inputs=[shell.arg(name=a, default=a) for a in cmd[1:]] + ) + shelly = Shelly() + assert shelly.executable == "echo" + assert shelly.cmdline == " ".join(cmd) + outputs = shelly() + assert outputs.return_code == 0 + assert outputs.stdout == " ".join(cmd[1:]) + "\n" diff --git a/pydra/compose/shell/tests/test_shell_templating.py b/pydra/compose/shell/tests/test_shell_templating.py new file mode 100644 index 0000000000..98acdae7fe --- /dev/null +++ b/pydra/compose/shell/tests/test_shell_templating.py @@ -0,0 +1,109 @@ +import typing as ty +from pydra.compose import shell +from pydra.compose.shell.templating import argstr_formatting +from pathlib import Path +from unittest.mock import Mock +from pydra.compose.shell.templating import template_update_single +import os +import shutil +from pathlib import Path +import random +import platform +import typing as ty +import pytest +import cloudpickle as cp +from pydra.engine.submitter import Submitter +from pydra.engine.job import Job +from pydra.compose import workflow +from fileformats.generic import Directory, File +from pydra.engine.tests.utils import Multiply, RaiseXeq1 +from pydra.utils.general import position_sort +from pydra.compose.shell.templating import parse_format_string +from pydra.engine.job import save, load_and_run +from pydra.workers.cf import get_available_cpus +from pydra.utils.hash import hash_function + + +@pytest.mark.parametrize( + "pos_args", + [ + [(2, "b"), (1, "a"), (3, "c")], + [(-2, "b"), (1, "a"), (-1, "c")], + [(None, "b"), (1, "a"), (-1, "c")], + [(-3, "b"), (None, "a"), (-1, "c")], + [(None, "b"), (1, "a"), (None, "c")], + ], +) +def test_position_sort(pos_args): + final_args = position_sort(pos_args) + assert final_args == ["a", "b", "c"] + + +def test_parse_format_string1(): + assert parse_format_string("{a}") == {"a"} + + 
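+# `parse_format_string` (imported above from `pydra.compose.shell.templating`)
+# returns the set of field names referenced in a template string, including names
+# nested inside format specs, e.g. "{a:{b}}" -> {"a", "b"} as covered below. A short
+# usage sketch, with illustrative field names only:
+#
+#     assert parse_format_string("{in_file}_brain{ext}") == {"in_file", "ext"}
+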
+def test_parse_format_string2(): + assert parse_format_string("{abc}") == {"abc"} + + +def test_parse_format_string3(): + assert parse_format_string("{a:{b}}") == {"a", "b"} + + +def test_parse_format_string4(): + assert parse_format_string("{a:{b[2]}}") == {"a", "b"} + + +def test_parse_format_string5(): + assert parse_format_string("{a.xyz[somekey].abc:{b[a][b].d[0]}}") == {"a", "b"} + + +def test_parse_format_string6(): + assert parse_format_string("{a:05{b[a 2][b].e}}") == {"a", "b"} + + +def test_parse_format_string7(): + assert parse_format_string( + "{a1_field} {b2_field:02f} -test {c3_field[c]} -me {d4_field[0]}" + ) == {"a1_field", "b2_field", "c3_field", "d4_field"} + + +def test_argstr_formatting(): + @shell.define + class Shelly(shell.Task["Shelly.Outputs"]): + a1_field: str + b2_field: float + c3_field: ty.Dict[str, str] + d4_field: ty.List[str] = shell.arg(sep=" ") + executable = "dummy" + + class Outputs(shell.Outputs): + pass + + values = dict(a1_field="1", b2_field=2.0, c3_field={"c": "3"}, d4_field=["4"]) + assert ( + argstr_formatting( + "{a1_field} {b2_field:02f} -test {c3_field[c]} -me {d4_field[0]}", + values, + ) + == "1 2.000000 -test 3 -me 4" + ) + + +def test_template_formatting(tmp_path: Path): + field = Mock() + field.name = "grad" + field.argstr = "--grad" + field.path_template = ("{in_file}.bvec", "{in_file}.bval") + field.keep_extension = False + task = Mock() + values = {"in_file": Path("/a/b/c/file.txt"), "grad": True} + + assert template_update_single( + field, + task, + values=values, + cache_dir=tmp_path, + spec_type="input", + ) == [tmp_path / "file.bvec", tmp_path / "file.bval"] diff --git a/pydra/compose/tests/test_python_equivalence.py b/pydra/compose/tests/test_python_equivalence.py new file mode 100644 index 0000000000..a91e150d93 --- /dev/null +++ b/pydra/compose/tests/test_python_equivalence.py @@ -0,0 +1,239 @@ +import pytest +import random +import typing as ty +from pydra.compose.base import Field +from pydra.compose import python +from pydra.utils.general import task_fields, attrs_values + + +def non_func_fields(defn: python.Task) -> list[Field]: + return [f for f in task_fields(defn) if f.name != "function"] + + +def non_func_values(defn: python.Task) -> dict: + return {n: v for n, v in attrs_values(defn).items() if n != "function"} + + +def hashes(defn: python.Task) -> dict[str, str]: + return defn._compute_hashes()[1] + + +def test_task_equivalence(): + """testing equivalence of tasks created in different ways""" + + def add_two(a: int) -> int: + return a + 2 + + @python.define + class Canonical(python.Task["Canonical.Outputs"]): + + a: ty.Any + + class Outputs(python.Outputs): + out: int + + @staticmethod + def function(a: int) -> int: + return a + 2 + + canonical = Canonical(a=3) + + decorated1 = python.define(add_two)(a=3) + + @python.define + def addtwo(a: int) -> int: + return a + 2 + + decorated2 = addtwo(a=3) + + assert canonical._compute_hashes()[1] == decorated1._compute_hashes()[1] + assert canonical._compute_hashes()[1] == decorated2._compute_hashes()[1] + + c_outputs = canonical() + d1_outputs = decorated1() + d2_outputs = decorated2() + + assert ( + non_func_values(c_outputs) + == non_func_values(d1_outputs) + == non_func_values(d2_outputs) + ) + + +def test_annotation_equivalence_1(): + """testing various ways of annotation: one output, only types provided""" + + def direct(a: int) -> int: + return a + 2 + + Direct = python.define(direct) + + @python.define(outputs={"out": int}) + def Partial(a: int): + return a + 2 + + 
@python.define(inputs={"a": int}, outputs={"out": int}) + def Indirect(a): + return a + 2 + + assert non_func_fields(Direct) == non_func_fields(Partial) + assert non_func_fields(Direct) == non_func_fields(Indirect) + + assert task_fields(Direct.Outputs) == task_fields(Partial.Outputs) + assert task_fields(Direct.Outputs) == task_fields(Indirect.Outputs) + + # Run functions to ensure behavior is unaffected + a = random.randint(0, (1 << 32) - 3) + assert non_func_values(Direct(a=a)) == non_func_values(Partial(a=a)) + assert non_func_values(Direct(a=a)) == non_func_values(Indirect(a=a)) + + # checking if the annotation is properly converted to output_spec if used in task + assert task_fields(Direct.Outputs).out == python.out(name="out", type=int) + + +def test_annotation_equivalence_2(): + """testing various ways of annotation: multiple outputs, using a tuple for output annot.""" + + def direct(a: int) -> tuple[int, float]: + return a + 2, a + 2.0 + + Direct = python.define(direct, outputs=["out1", "out2"]) + + @python.define(outputs={"out1": int, "out2": float}) + def Partial(a: int): + return a + 2, a + 2.0 + + @python.define(inputs={"a": int}, outputs=["out1", "out2"]) + def Indirect(a) -> tuple[int, float]: + return a + 2, a + 2.0 + + # checking if the annotations are equivalent + assert ( + non_func_fields(Direct) == non_func_fields(Partial) == non_func_fields(Indirect) + ) + + # Run functions to ensure behavior is unaffected + a = random.randint(0, (1 << 32) - 3) + assert hashes(Direct(a=a)) == hashes(Partial(a=a)) == hashes(Indirect(a=a)) + + # checking if the annotation is properly converted to output_spec if used in task + assert list(task_fields(Direct.Outputs)) == [ + python.out(name="out1", type=int), + python.out(name="out2", type=float), + ] + + +def test_annotation_equivalence_3(): + """testing various ways of annotation: using dictionary for output annot.""" + + def direct(a: int) -> int: + return a + 2 + + Direct = python.define(direct, outputs=["out1"]) + + @python.define(outputs={"out1": int}) + def Partial(a: int): + return a + 2 + + @python.define(inputs={"a": int}, outputs={"out1": int}) + def Indirect(a): + return a + 2 + + # checking if the annotations are equivalent + assert ( + non_func_fields(Direct) == non_func_fields(Partial) == non_func_fields(Indirect) + ) + + # Run functions to ensure behavior is unaffected + a = random.randint(0, (1 << 32) - 3) + assert hashes(Direct(a=a)) == hashes(Partial(a=a)) == hashes(Indirect(a=a)) + + # checking if the annotation is properly converted to output_spec if used in task + assert task_fields(Direct.Outputs).out1 == python.out(name="out1", type=int) + + +def test_annotation_equivalence_4(): + """testing various ways of annotation: using ty.NamedTuple for the output""" + + @python.define(outputs=["sum", "sub"]) + def Direct(a: int) -> tuple[int, int]: + return a + 2, a - 2 + + @python.define(outputs={"sum": int, "sub": int}) + def Partial(a: int): + return a + 2, a - 2 + + @python.define(inputs={"a": int}, outputs={"sum": int, "sub": int}) + def Indirect(a): + return a + 2, a - 2 + + # checking if the annotations are equivalent + assert ( + task_fields(Direct.Outputs) + == task_fields(Partial.Outputs) + == task_fields(Indirect.Outputs) + ) + assert ( + task_fields(Direct.Outputs) + == task_fields(Partial.Outputs) + == task_fields(Indirect.Outputs) + ) + + # Run functions to ensure behavior is unaffected + a = random.randint(0, (1 << 32) - 3) + assert hashes(Direct(a=a)) == hashes(Partial(a=a)) == hashes(Indirect(a=a)) + + # 
checking if the annotation is properly converted to output_spec if used in task + assert list(task_fields(Direct.Outputs)) == [ + python.out(name="sum", type=int), + python.out(name="sub", type=int), + ] + + +def test_invalid_annotation(): + with pytest.raises(ValueError, match="Unrecognised input names"): + + @python.define(inputs={"b": int}) + def addtwo(a): + return a + 2 + + +def test_annotated_task(): + + @python.define + def Square(in_val: float): + return in_val**2 + + outputs = Square(in_val=2.0)() + assert outputs.out == 4.0 + + +def test_return_annotated_task(): + + @python.define(inputs={"in_val": float}, outputs={"squared": float}) + def Square(in_val): + return in_val**2 + + outputs = Square(in_val=2.0)() + assert outputs.squared == 4.0 + + +def test_return_halfannotated_annotated_task(): + + @python.define(inputs={"in_val": float}, outputs={"out": float}) + def Square(in_val): + return in_val**2 + + outputs = Square(in_val=2.0)() + assert outputs.out == 4.0 + + +def test_return_annotated_task_multiple_output(): + + @python.define(inputs={"in_val": float}, outputs={"squared": float, "cubed": float}) + def Square(in_val): + return in_val**2, in_val**3 + + outputs = Square(in_val=2.0)() + assert outputs.squared == 4.0 + assert outputs.cubed == 8.0 diff --git a/pydra/compose/tests/test_python_fields.py b/pydra/compose/tests/test_python_fields.py new file mode 100644 index 0000000000..459f21f250 --- /dev/null +++ b/pydra/compose/tests/test_python_fields.py @@ -0,0 +1,426 @@ +from operator import attrgetter +import typing as ty +from decimal import Decimal +import attrs +import pytest +from pydra.utils.general import task_fields +from pydra.compose import python + + +sort_key = attrgetter("name") + + +def test_interface_wrap_function(tmp_path): + def func(a: int) -> float: + """Sample function with inputs and outputs""" + return a * 2 + + SampleDef = python.define(func) + + assert issubclass(SampleDef, python.Task) + inputs = sorted(task_fields(SampleDef), key=sort_key) + outputs = sorted(task_fields(SampleDef.Outputs), key=sort_key) + assert inputs == [ + python.arg(name="a", type=int), + python.arg(name="function", type=ty.Callable, hash_eq=True, default=func), + ] + assert outputs == [python.out(name="out", type=float)] + task = SampleDef(a=1) + outputs = task(cache_root=tmp_path) + assert outputs.out == 2.0 + with pytest.raises(TypeError): + SampleDef(a=1.5) + + +def test_function_arg_fail(): + + with pytest.raises(ValueError, match="The argument 'function' is reserved"): + + @python.define + def func(function: ty.Callable) -> ty.Callable: + return function + + +def test_interface_wrap_function_with_default(): + def func(a: int, k: float = 2.0) -> float: + """Sample function with inputs and outputs""" + return a * k + + SampleDef = python.define(func) + + assert issubclass(SampleDef, python.Task) + inputs = sorted(task_fields(SampleDef), key=sort_key) + outputs = sorted(task_fields(SampleDef.Outputs), key=sort_key) + assert inputs == [ + python.arg(name="a", type=int), + python.arg(name="function", type=ty.Callable, hash_eq=True, default=func), + python.arg(name="k", type=float, default=2.0), + ] + assert outputs == [python.out(name="out", type=float)] + assert SampleDef(a=1)().out == 2.0 + assert SampleDef(a=10, k=3.0)().out == 30.0 + + +def test_interface_wrap_function_overrides(): + def func(a: int) -> float: + """Sample function with inputs and outputs""" + return a * 2 + + SampleDef = python.define( + func, + inputs={"a": python.arg(help="The argument to be doubled")}, + 
outputs={"b": python.out(help="the doubled output", type=Decimal)}, + ) + + assert issubclass(SampleDef, python.Task) + inputs = sorted(task_fields(SampleDef), key=sort_key) + outputs = sorted(task_fields(SampleDef.Outputs), key=sort_key) + assert inputs == [ + python.arg(name="a", type=int, help="The argument to be doubled"), + python.arg(name="function", type=ty.Callable, hash_eq=True, default=func), + ] + assert outputs == [ + python.out(name="b", type=Decimal, help="the doubled output"), + ] + outputs = SampleDef.Outputs(b=Decimal(2.0)) + assert isinstance(outputs.b, Decimal) + + +def test_interface_wrap_function_types(): + def func(a: int) -> int: + """Sample function with inputs and outputs""" + return a * 2 + + SampleDef = python.define( + func, + inputs={"a": float}, + outputs={"b": float}, + ) + + assert issubclass(SampleDef, python.Task) + inputs = sorted(task_fields(SampleDef), key=sort_key) + outputs = sorted(task_fields(SampleDef.Outputs), key=sort_key) + assert inputs == [ + python.arg(name="a", type=float), + python.arg(name="function", type=ty.Callable, hash_eq=True, default=func), + ] + assert outputs == [python.out(name="b", type=float)] + intf = SampleDef(a=1) + assert isinstance(intf.a, float) + outputs = SampleDef.Outputs(b=2.0) + assert isinstance(outputs.b, float) + + +def test_decorated_function_interface(): + @python.define(outputs=["c", "d"]) + def SampleDef(a: int, b: float) -> tuple[float, float]: + """Sample function for testing""" + return a + b, a * b + + assert issubclass(SampleDef, python.Task) + inputs = sorted(task_fields(SampleDef), key=sort_key) + outputs = sorted(task_fields(SampleDef.Outputs), key=sort_key) + assert inputs == [ + python.arg(name="a", type=int), + python.arg(name="b", type=float), + python.arg( + name="function", + type=ty.Callable, + hash_eq=True, + default=attrs.fields(SampleDef).function.default, + ), + ] + assert outputs == [ + python.out(name="c", type=float), + python.out(name="d", type=float), + ] + assert attrs.fields(SampleDef).function.default.__name__ == "SampleDef" + SampleDef.Outputs(c=1.0, d=2.0) + + +def test_interface_with_function_docstr(): + @python.define(outputs=["c", "d"]) + def SampleDef(a: int, b: float) -> tuple[float, float]: + """Sample function for testing + + :param a: First input to be inputted + :param b: Second input + :return c: Sum of a and b + :return d: product of a and b + """ + return a + b, a * b + + inputs = sorted(task_fields(SampleDef), key=sort_key) + outputs = sorted(task_fields(SampleDef.Outputs), key=sort_key) + assert inputs == [ + python.arg(name="a", type=int, help="First input to be inputted"), + python.arg(name="b", type=float, help="Second input"), + python.arg( + name="function", + type=ty.Callable, + hash_eq=True, + default=attrs.fields(SampleDef).function.default, + ), + ] + assert outputs == [ + python.out(name="c", type=float, help="Sum of a and b"), + python.out(name="d", type=float, help="product of a and b"), + ] + assert attrs.fields(SampleDef).function.default.__name__ == "SampleDef" + + +def test_interface_with_function_google_docstr(): + @python.define(outputs=["c", "d"]) + def SampleDef(a: int, b: float) -> tuple[float, float]: + """Sample function for testing + + Args: + a: First input + to be inputted + b: Second input + + Returns: + c: Sum of a and b + d: Product of a and b + """ + return a + b, a * b + + inputs = sorted(task_fields(SampleDef), key=sort_key) + outputs = sorted(task_fields(SampleDef.Outputs), key=sort_key) + assert inputs == [ + python.arg(name="a", 
type=int, help="First input to be inputted"), + python.arg(name="b", type=float, help="Second input"), + python.arg( + name="function", + type=ty.Callable, + hash_eq=True, + default=attrs.fields(SampleDef).function.default, + ), + ] + assert outputs == [ + python.out(name="c", type=float, help="Sum of a and b"), + python.out(name="d", type=float, help="Product of a and b"), + ] + assert attrs.fields(SampleDef).function.default.__name__ == "SampleDef" + + +def test_interface_with_function_numpy_docstr(): + @python.define( + outputs=["c", "d"] + ) # Could potentiall read output names from doc-string instead + def SampleDef(a: int, b: float) -> tuple[float, float]: + """Sample function for testing + + Parameters + ---------- + a: int + First input + to be inputted + b: float + Second input + + Returns + ------- + c : int + Sum of a and b + d : float + Product of a and b + """ + return a + b, a * b + + inputs = sorted(task_fields(SampleDef), key=sort_key) + outputs = sorted(task_fields(SampleDef.Outputs), key=sort_key) + assert inputs == [ + python.arg(name="a", type=int, help="First input to be inputted"), + python.arg(name="b", type=float, help="Second input"), + python.arg( + name="function", + type=ty.Callable, + hash_eq=True, + default=attrs.fields(SampleDef).function.default, + ), + ] + assert outputs == [ + python.out(name="c", type=float, help="Sum of a and b"), + python.out(name="d", type=float, help="Product of a and b"), + ] + assert attrs.fields(SampleDef).function.default.__name__ == "SampleDef" + + +def test_interface_with_class(): + @python.define + class SampleDef(python.Task["SampleDef.Outputs"]): + """Sample class for testing + + Args: + a: First input + to be inputted + b: Second input + """ + + a: int + b: float = 2.0 + + class Outputs(python.Outputs): + """ + Args: + c: Sum of a and b + d: Product of a and b + """ + + c: float + d: float + + @staticmethod + def function(a, b): + return a + b, a * b + + assert issubclass(SampleDef, python.Task) + inputs = sorted(task_fields(SampleDef), key=sort_key) + outputs = sorted(task_fields(SampleDef.Outputs), key=sort_key) + assert inputs == [ + python.arg(name="a", type=int, help="First input to be inputted"), + python.arg(name="b", type=float, default=2.0, help="Second input"), + python.arg( + name="function", + type=ty.Callable, + hash_eq=True, + default=attrs.fields(SampleDef).function.default, + ), + ] + assert outputs == [ + python.out(name="c", type=float, help="Sum of a and b"), + python.out(name="d", type=float, help="Product of a and b"), + ] + assert SampleDef.function.__name__ == "function" + SampleDef(a=1) + SampleDef(a=1, b=2.0) + SampleDef.Outputs(c=1.0, d=2.0) + + +def test_interface_with_inheritance(): + @python.define + class SampleDef(python.Task["SampleDef.Outputs"]): + """Sample class for testing + + Args: + a: First input + to be inputted + b: Second input + """ + + a: int + b: float + + class Outputs(python.Outputs): + """ + Args: + c: Sum of a and b + d: Product of a and b + """ + + c: float + d: float + + @staticmethod + def function(a, b): + return a + b, a * b + + assert issubclass(SampleDef, python.Task) + + +def test_interface_with_class_no_auto_attribs(): + @python.define(auto_attribs=False) + class SampleDef(python.Task["SampleDef.Outputs"]): + a: int = python.arg(help="First input to be inputted") + b: float = python.arg(help="Second input") + + x: int + + class Outputs(python.Outputs): + c: float = python.out(help="Sum of a and b") + d: float = python.out(help="Product of a and b") + + y: str + + 
@staticmethod + def function(a, b): + return a + b, a * b + + inputs = sorted(task_fields(SampleDef), key=sort_key) + outputs = sorted(task_fields(SampleDef.Outputs), key=sort_key) + assert inputs == [ + python.arg(name="a", type=int, help="First input to be inputted"), + python.arg(name="b", type=float, help="Second input"), + python.arg( + name="function", + type=ty.Callable, + hash_eq=True, + default=attrs.fields(SampleDef).function.default, + ), + ] + assert outputs == [ + python.out(name="c", type=float, help="Sum of a and b"), + python.out(name="d", type=float, help="Product of a and b"), + ] + assert SampleDef.function.__name__ == "function" + SampleDef(a=1, b=2.0) + SampleDef.Outputs(c=1.0, d=2.0) + with pytest.raises(TypeError): + SampleDef(a=1, b=2.0, x=3) + with pytest.raises(TypeError): + SampleDef.Outputs(c=1.0, d=2.0, y="hello") + + +def test_interface_invalid_wrapped1(): + with pytest.raises(ValueError): + + @python.define(inputs={"a": python.arg()}) + class SampleDef(python.Task["SampleDef.Outputs"]): + a: int + + class Outputs: + b: float + + @staticmethod + def function(a): + return a + 1 + + +def test_interface_invalid_wrapped2(): + with pytest.raises(ValueError): + + @python.define(outputs={"b": python.out()}) + class SampleDef(python.Task["SampleDef.Outputs"]): + a: int + + class Outputs: + b: float + + @staticmethod + def function(a): + return a + 1 + + +def test_task_repr(): + @python.define(outputs=["x", "y", "z"]) + def IdentityN3(x: int, y: int = 1, z: int = 2) -> tuple[int, int, int]: + return x, y, z + + assert repr(IdentityN3(x=1, y=2)) == "IdentityN3(x=1, y=2)" + + +@attrs.define(auto_attribs=True) +class A: + x: int + + +def test_object_input(): + """Test function tasks with object inputs""" + + @python.define + def TestFunc(a: A): + return a.x + + outputs = TestFunc(a=A(x=7))() + assert outputs.out == 7 diff --git a/pydra/compose/tests/test_python_numpy.py b/pydra/compose/tests/test_python_numpy.py new file mode 100644 index 0000000000..302b431c88 --- /dev/null +++ b/pydra/compose/tests/test_python_numpy.py @@ -0,0 +1,108 @@ +import typing as ty +import importlib +from pathlib import Path +import pickle as pk +import numpy as np +import pytest + + +from pydra.engine.submitter import Submitter +from pydra.compose import python, workflow +from pydra.engine.tests.utils import Identity +from pydra.utils.hash import hash_function + +if importlib.util.find_spec("numpy") is None: + pytest.skip("can't find numpy library", allow_module_level=True) + + +@python.define(outputs=["b"]) +def ArrayOut(val): + return np.array([val, val]) + + +def test_multiout(tmpdir): + """testing a simple function that returns a numpy array""" + + @workflow.define(outputs=["array"]) + def Workflow(val): + mo = workflow.add(ArrayOut(val=val)) + return mo.b + + wf = Workflow(val=2) + + with Submitter(worker="cf", cache_root=tmpdir, n_procs=2) as sub: + results = sub(wf) + + assert np.array_equal(results.outputs.array, np.array([2, 2])) + + +def test_multiout_st(tmpdir): + """testing a simple function that returns a numpy array, adding splitter""" + + @workflow.define(outputs=["array"]) + def Workflow(values): + mo = workflow.add(ArrayOut().split(val=values).combine("val")) + return mo.b + + wf = Workflow(values=[0, 1, 2]) + + with Submitter(worker="cf", cache_root=tmpdir, n_procs=2) as sub: + results = sub(wf) + + for el in range(3): + assert np.array_equal(results.outputs.array[el], np.array([el, el])) + + +def test_numpy_hash_1(): + """hashing check for numeric numpy array""" + A = 
np.array([1, 2]) + A_pk = pk.loads(pk.dumps(A)) + assert (A == A_pk).all() + assert hash_function(A) == hash_function(A_pk) + + +def test_numpy_hash_2(): + """hashing check for numpy array of type object""" + A = np.array([["NDAR"]], dtype=object) + A_pk = pk.loads(pk.dumps(A)) + assert (A == A_pk).all() + assert hash_function(A) == hash_function(A_pk) + + +def test_numpy_hash_3(): + """hashing check for numeric numpy array""" + A = np.array([1, 2]) + B = np.array([3, 4]) + assert hash_function(A) != hash_function(B) + + +def test_task_numpyinput_1(tmp_path: Path): + """task with numeric numpy array as an input""" + nn = Identity().split(x=[np.array([1, 2]), np.array([3, 4])]) + # checking the results + outputs = nn(cache_root=tmp_path) + assert (np.array(outputs.out) == np.array([[1, 2], [3, 4]])).all() + + +def test_task_numpyinput_2(tmp_path: Path): + """task with numpy array of type object as an input""" + nn = Identity().split( + x=[np.array(["VAL1"], dtype=object), np.array(["VAL2"], dtype=object)] + ) + # checking the results + outputs = nn(cache_root=tmp_path) + assert outputs.out[0] == np.array(["VAL1"], dtype=object) + assert outputs.out[1] == np.array(["VAL2"], dtype=object) + + +def test_numpy_fft(): + """checking if mark.task works for numpy functions""" + np = pytest.importorskip("numpy") + FFT = python.define(inputs={"a": np.ndarray}, outputs={"out": np.ndarray})( + np.fft.fft + ) + + arr = np.array([[1, 10], [2, 20]]) + fft = FFT(a=arr) + outputs = fft() + assert np.allclose(np.fft.fft(arr), outputs.out) diff --git a/pydra/compose/tests/test_python_run.py b/pydra/compose/tests/test_python_run.py new file mode 100644 index 0000000000..db6e651515 --- /dev/null +++ b/pydra/compose/tests/test_python_run.py @@ -0,0 +1,831 @@ +import typing as ty +import os +import pytest +from pathlib import Path +import glob as glob +from pydra.compose import python +from pydra.utils.general import task_fields, task_help +from pydra.utils.general import default_run_cache_root +from pydra.utils.typing import ( + MultiInputObj, + MultiOutputObj, +) + + +@python.define +def FunAddTwo(a): + return a + 2 + + +def test_output(): + nn = FunAddTwo(a=3) + outputs = nn() + assert outputs.out == 5 + + +def test_python_output(): + @python.define(outputs=["output"]) + def TestFunc(a: int, b: float = 0.1) -> float: + """ + Parameters + ---------- + a : int + first input + b : float + second input + + Returns + ------- + output : float + sum of a and b + """ + return a + b + + funky = TestFunc(a=1) + assert hasattr(funky, "a") + assert hasattr(funky, "b") + assert hasattr(funky, "function") + assert getattr(funky, "a") == 1 + assert getattr(funky, "b") == 0.1 + assert getattr(funky, "function") is not None + assert set(f.name for f in task_fields(funky.Outputs)) == {"output"} + + outputs = funky() + assert hasattr(outputs, "output") + assert outputs.output == 1.1 + + assert os.path.exists( + default_run_cache_root / f"python-{funky._hash}" / "_result.pklz" + ) + funky() # should not recompute + funky.a = 2 + outputs = funky() + assert outputs.output == 2.1 + + help = task_help(funky) + assert help == [ + "-------------------------------", + "Help for Python task 'TestFunc'", + "-------------------------------", + "", + "Inputs:", + "- a: int", + " first input", + "- b: float; default = 0.1", + " second input", + "- function: Callable[]; default = TestFunc()", + "", + "Outputs:", + "- output: float", + " sum of a and b", + "", + ] + + +def test_python_output_dictreturn(tmp_path: Path): + """Test mapping from 
returned dictionary to output definition.""" + + @python.define(outputs={"sum": int, "mul": int | None}) + def TestFunc(a: int, b: int): + return dict(sum=a + b, diff=a - b) + + task = TestFunc(a=2, b=3) + outputs = task(cache_root=tmp_path) + + # Part of the annotation and returned, should be exposed to output. + assert outputs.sum == 5 + + # Part of the annotation but not returned, should be coalesced to None + assert outputs.mul is None + + # Not part of the annotation, should be discarded. + assert not hasattr(outputs, "diff") + + +def test_python_output_multreturn(): + """the function has two elements in the return statement""" + + @python.define(outputs={"fractional": float, "integer": int}) + def TestFunc( + a: float, + ): + import math + + return math.modf(a)[0], int(math.modf(a)[1]) + + funky = TestFunc(a=3.5) + assert hasattr(funky, "a") + assert hasattr(funky, "function") + assert getattr(funky, "a") == 3.5 + assert getattr(funky, "function") is not None + assert set(f.name for f in task_fields(funky.Outputs)) == {"fractional", "integer"} + + outputs = funky() + assert os.path.exists( + default_run_cache_root / f"python-{funky._hash}" / "_result.pklz" + ) + assert hasattr(outputs, "fractional") + assert outputs.fractional == 0.5 + assert hasattr(outputs, "integer") + assert outputs.integer == 3 + + help = task_help(funky) + assert help == [ + "-------------------------------", + "Help for Python task 'TestFunc'", + "-------------------------------", + "", + "Inputs:", + "- a: float", + "- function: Callable[]; default = TestFunc()", + "", + "Outputs:", + "- fractional: float", + "- integer: int", + "", + ] + + +def test_python_func_1(): + """the function with annotated input (float)""" + + @python.define + def TestFunc(a: float): + return a + + funky = TestFunc(a=3.5) + assert getattr(funky, "a") == 3.5 + + +def test_python_func_2(): + """the function with annotated input (int, but float provided)""" + + @python.define + def TestFunc(a: int): + return a + + with pytest.raises(TypeError): + TestFunc(a=3.5) + + +def test_python_func_2a(): + """the function with annotated input (int, but float provided)""" + + @python.define + def TestFunc(a: int): + return a + + funky = TestFunc() + with pytest.raises(TypeError): + funky.a = 3.5 + + +def test_python_func_3(): + """the function with annotated input (list)""" + + @python.define + def TestFunc(a: list): + return sum(a) + + funky = TestFunc(a=[1, 3.5]) + assert getattr(funky, "a") == [1, 3.5] + + +def test_python_func_3a(): + """the function with annotated input (list of floats)""" + + @python.define + def TestFunc(a: ty.List[float]): + return sum(a) + + funky = TestFunc(a=[1.0, 3.5]) + assert getattr(funky, "a") == [1.0, 3.5] + + +def test_python_func_3b(): + """the function with annotated input + (list of floats - int and float provided, should be fine) + """ + + @python.define + def TestFunc(a: ty.List[float]): + return sum(a) + + funky = TestFunc(a=[1, 3.5]) + assert getattr(funky, "a") == [1, 3.5] + + +def test_python_func_3c_excep(): + """the function with annotated input + (list of ints - int and float provided, should raise an error) + """ + + @python.define + def TestFunc(a: ty.List[int]): + return sum(a) + + with pytest.raises(TypeError): + TestFunc(a=[1, 3.5]) + + +def test_python_func_4(): + """the function with annotated input (dictionary)""" + + @python.define + def TestFunc(a: dict): + return sum(a.values()) + + funky = TestFunc(a={"el1": 1, "el2": 3.5}) + assert getattr(funky, "a") == {"el1": 1, "el2": 3.5} + + +def 
test_python_func_4a(): + """the function with annotated input (dictionary of floats)""" + + @python.define + def TestFunc(a: ty.Dict[str, float]): + return sum(a.values()) + + funky = TestFunc(a={"el1": 1, "el2": 3.5}) + assert getattr(funky, "a") == {"el1": 1, "el2": 3.5} + + +def test_python_func_4b_excep(): + """the function with annotated input (dictionary of ints, but float provided)""" + + @python.define + def TestFunc(a: ty.Dict[str, int]): + return sum(a.values()) + + with pytest.raises(TypeError): + TestFunc(a={"el1": 1, "el2": 3.5}) + + +def test_python_func_5(): + """the function with annotated more complex input type (ty.List in ty.Dict) + the validator should simply check if values of dict are lists + so no error for 3.5 + """ + + @python.define + def TestFunc(a: ty.Dict[str, ty.List]): + return sum(a["el1"]) + + funky = TestFunc(a={"el1": [1, 3.5]}) + assert getattr(funky, "a") == {"el1": [1, 3.5]} + + +def test_python_func_5a_except(): + """the function with annotated more complex input type (ty.Dict in ty.Dict) + list is provided as a dict value (instead a dict), so error is raised + """ + + @python.define + def TestFunc(a: ty.Dict[str, ty.Dict[str, float]]): + return sum(a["el1"]) + + with pytest.raises(TypeError): + TestFunc(a={"el1": [1, 3.5]}) + + +def test_python_func_6(): + """the function with annotated more complex input type (ty.Union in ty.Dict) + the validator should unpack values from the Union + """ + + @python.define + def TestFunc(a: ty.Dict[str, ty.Union[float, int]]): + return sum(a["el1"]) + + funky = TestFunc(a={"el1": 1, "el2": 3.5}) + assert getattr(funky, "a") == {"el1": 1, "el2": 3.5} + + +def test_python_func_6a_excep(): + """the function with annotated more complex input type (ty.Union in ty.Dict) + the validator should unpack values from the Union and raise an error for 3.5 + """ + + @python.define + def TestFunc(a: ty.Dict[str, ty.Union[str, int]]): + return sum(a["el1"]) + + with pytest.raises(TypeError): + TestFunc(a={"el1": 1, "el2": 3.5}) + + +def test_python_func_7(): + """the function with annotated input (float) + the task has a splitter, so list of float is provided + it should work, the validator tries to guess if this is a field with a splitter + """ + + @python.define + def TestFunc(a: float): + return a + + funky = TestFunc().split("a", a=[3.5, 2.1]) + assert getattr(funky, "a") == [3.5, 2.1] + + +def test_python_func_7a_excep(): + """the function with annotated input (int) and splitter + list of float provided - should raise an error (list of int would be fine) + """ + + @python.define + def TestFunc(a: int): + return a + + with pytest.raises(TypeError): + TestFunc(a=[3.5, 2.1]).split("a") + + +def test_python_func_8(): + """the function with annotated input as MultiInputObj + a single value is provided and should be converted to a list + """ + + @python.define + def TestFunc(a: MultiInputObj): + return len(a) + + funky = TestFunc(a=3.5) + assert getattr(funky, "a") == [3.5] + outputs = funky() + assert outputs.out == 1 + + +def test_python_func_8a(): + """the function with annotated input as MultiInputObj + a 1-el list is provided so shouldn't be changed + """ + + @python.define + def TestFunc(a: MultiInputObj): + return len(a) + + funky = TestFunc(a=[3.5]) + assert getattr(funky, "a") == [3.5] + outputs = funky() + assert outputs.out == 1 + + +def test_python_func_8b(): + """the function with annotated input as MultiInputObj + a single value is provided after initial. 
the task + (input should still be converted to a list) + """ + + @python.define + def TestFunc(a: MultiInputObj): + return len(a) + + funky = TestFunc() + # setting a after init + funky.a = 3.5 + assert getattr(funky, "a") == [3.5] + outputs = funky() + assert outputs.out == 1 + + +def test_python_output_multreturn_exception(): + """function has two elements in the return statement, + but three element provided in the task - should raise an error + """ + + @python.define(outputs={"fractional": float, "integer": int, "who_knows": int}) + def TestFunc( + a: float, + ): + import math + + return math.modf(a) + + funky = TestFunc(a=3.5) + with pytest.raises(Exception) as excinfo: + funky() + assert "expected 3 elements" in str(excinfo.value) + + +def test_halfpython_output(tmp_path): + + cache_root = tmp_path / "cache" + cache_root.mkdir() + + @python.define + def TestFunc(a, b) -> int: + return a + b + + funky = TestFunc(a=10, b=20) + assert hasattr(funky, "a") + assert hasattr(funky, "b") + assert hasattr(funky, "function") + assert getattr(funky, "a") == 10 + assert getattr(funky, "b") == 20 + assert getattr(funky, "function") is not None + assert set(f.name for f in task_fields(funky.Outputs)) == {"out"} + + outputs = funky(cache_root=cache_root) + assert hasattr(outputs, "out") + assert outputs.out == 30 + + assert Path(cache_root / f"python-{funky._hash}" / "_result.pklz").exists() + + funky(cache_root=cache_root) # should not recompute + funky.a = 11 + assert not Path(cache_root / f"python-{funky._hash}").exists() + outputs = funky(cache_root=cache_root) + assert outputs.out == 31 + help = task_help(funky) + + assert help == [ + "-------------------------------", + "Help for Python task 'TestFunc'", + "-------------------------------", + "", + "Inputs:", + "- a: Any", + "- b: Any", + "- function: Callable[]; default = TestFunc()", + "", + "Outputs:", + "- out: int", + "", + ] + + +def test_halfpython_output_multreturn(tmp_path): + + cache_root = tmp_path / "cache" + cache_root.mkdir() + + @python.define(outputs=["out1", "out2"]) + def TestFunc(a, b) -> tuple[int, int]: + return a + 1, b + 1 + + funky = TestFunc(a=10, b=20) + assert hasattr(funky, "a") + assert hasattr(funky, "b") + assert hasattr(funky, "function") + assert getattr(funky, "a") == 10 + assert getattr(funky, "b") == 20 + assert getattr(funky, "function") is not None + assert set(f.name for f in task_fields(funky.Outputs)) == {"out1", "out2"} + + outputs = funky(cache_root=cache_root) + assert hasattr(outputs, "out1") + assert outputs.out1 == 11 + + assert Path(cache_root / f"python-{funky._hash}" / "_result.pklz").exists() + + funky(cache_root=cache_root) # should not recompute + funky.a = 11 + assert not Path(cache_root / f"python-{funky._hash}" / "_result.pklz").exists() + outputs = funky(cache_root=cache_root) + assert outputs.out1 == 12 + help = task_help(funky) + + assert help == [ + "-------------------------------", + "Help for Python task 'TestFunc'", + "-------------------------------", + "", + "Inputs:", + "- a: Any", + "- b: Any", + "- function: Callable[]; default = TestFunc()", + "", + "Outputs:", + "- out1: int", + "- out2: int", + "", + ] + + +def test_notpython_output(): + @python.define + def NoAnnots(c, d): + return c + d + + no_annots = NoAnnots(c=17, d=3.2) + assert hasattr(no_annots, "c") + assert hasattr(no_annots, "d") + assert hasattr(no_annots, "function") + + outputs = no_annots() + assert hasattr(outputs, "out") + assert outputs.out == 20.2 + + +def test_notpython_output_returnlist(): + 
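+    # a sketch of the expected behaviour: with no return annotation, a single returned list is assigned as-is to the default 'out' output (see the assert on outputs.out below)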
@python.define + def NoAnnots(c, d): + return [c, d] + + no_annots = NoAnnots(c=17, d=3.2) + outputs = no_annots() + assert hasattr(outputs, "out") + assert outputs.out == [17, 3.2] + + +def test_halfpython_output_multrun_returnlist(): + @python.define(outputs=["out1", "out2"]) + def NoAnnots(c, d) -> tuple[list, float]: + return [c, d], c + d + + no_annots = NoAnnots(c=17, d=3.2) + outputs = no_annots() + + assert hasattr(outputs, "out1") + assert hasattr(outputs, "out2") + assert outputs.out1 == [17, 3.2] + assert outputs.out2 == 20.2 + + +def test_notpython_output_multreturn(): + """no annotation and multiple values are returned + all elements should be returned as a tuple and set to "out" + """ + + @python.define + def NoAnnots(c, d): + return c + d, c - d + + no_annots = NoAnnots(c=17, d=3.2) + assert hasattr(no_annots, "c") + assert hasattr(no_annots, "d") + assert hasattr(no_annots, "function") + + outputs = no_annots() + assert hasattr(outputs, "out") + assert outputs.out == (20.2, 13.8) + + +def test_input_spec_func_1(): + """the function w/o annotated, but input_spec is used""" + + @python.define(inputs={"a": python.arg(type=float, help="input a")}) + def TestFunc(a): + return a + + funky = TestFunc(a=3.5) + assert funky.a == 3.5 + + +def test_input_spec_func_1a_except(): + """the function w/o annotated, but input_spec is used + a TypeError is raised (float is provided instead of int) + """ + + @python.define(inputs={"a": python.arg(type=int, help="input a")}) + def TestFunc(a): + return a + + with pytest.raises(TypeError): + TestFunc(a=3.5) + + +def test_input_spec_func_1b_except(): + """the function w/o annotated, but input_spec is used + metadata checks raise an error + """ + + with pytest.raises( + TypeError, match="got an unexpected keyword argument 'position'" + ): + + @python.define(inputs={"a": python.arg(type=float, position=1, help="input a")}) + def TestFunc(a): + return a + + +def test_input_spec_func_1d_except(): + """the function w/o annotated, but input_spec is used + input_spec doesn't contain 'a' input, an error is raised + """ + + @python.define + def TestFunc(a): + return a + + funky = TestFunc() + with pytest.raises(ValueError, match="Mandatory field 'a' is not set"): + funky() + + +def test_input_spec_func_2(): + """the function with annotation, and the task has input_spec, + input_spec changes the type of the input (so error is not raised) + """ + + @python.define(inputs={"a": python.arg(type=float, help="input a")}) + def TestFunc(a: int): + return a + + funky = TestFunc(a=3.5) + assert funky.a == 3.5 + + +def test_input_spec_func_2a(): + """the function with annotation, and the task has input_spec, + input_spec changes the type of the input (so error is not raised) + using the shorter syntax + """ + + @python.define(inputs={"a": python.arg(type=float, help="input a")}) + def TestFunc(a: int): + return a + + funky = TestFunc(a=3.5) + assert funky.a == 3.5 + + +def test_input_spec_func_3(): + """the function w/o annotated, but input_spec is used + additional keys (allowed_values) are used in metadata + """ + + @python.define( + inputs={ + "a": python.arg( + type=int, + help="input a", + allowed_values=[0, 1, 2], + ) + } + ) + def TestFunc(a): + return a + + funky = TestFunc(a=2) + assert funky.a == 2 + + +def test_input_spec_func_3a_except(): + """the function w/o annotated, but input_spec is used + allowed_values is used in metadata and the ValueError is raised + """ + + @python.define( + inputs={ + "a": python.arg( + type=int, + help="input a", + 
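+                # values outside allowed_values are rejected with a ValueError as soon as the input is set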
allowed_values=[0, 1, 2], + ) + } + ) + def TestFunc(a): + return a + + with pytest.raises(ValueError, match="value of a has to be"): + TestFunc(a=3) + + +def test_input_spec_func_4(): + """the function with a default value for b + but b is set as mandatory in the input_spec, so error is raised if not provided + """ + + @python.define( + inputs={ + "a": python.arg(type=int, help="input a"), + "b": python.arg(type=int, help="input b"), + } + ) + def TestFunc(a, b): + return a + b + + funky = TestFunc(a=2) + with pytest.raises(Exception, match="Mandatory field 'b' is not set"): + funky() + + +def test_input_spec_func_4a(): + """the function with a default value for b and metadata in the input_spec + has a different default value, so value from the function is overwritten + """ + + @python.define( + inputs={ + "a": python.arg(type=int, help="input a"), + "b": python.arg(type=int, help="input b", default=10), + } + ) + def TestFunc(a, b=1): + return a + b + + funky = TestFunc(a=2) + outputs = funky() + assert outputs.out == 12 + + +def test_input_spec_func_5(): + """the python.Task with input_spec, a input has MultiInputObj type + a single value is provided and should be converted to a list + """ + + @python.define(inputs={"a": python.arg(type=MultiInputObj, help="input a")}) + def TestFunc(a): + return len(a) + + funky = TestFunc(a=3.5) + assert funky.a == MultiInputObj([3.5]) + outputs = funky() + assert outputs.out == 1 + + +def test_output_spec_func_1(): + """the function w/o annotated, but output_spec is used""" + + @python.define(outputs={"out1": python.out(type=float, help="output")}) + def TestFunc(a): + return a + + funky = TestFunc(a=3.5) + outputs = funky() + assert outputs.out1 == 3.5 + + +def test_output_spec_func_1a_except(): + """the function w/o annotated, but output_spec is used + float returned instead of int - TypeError + """ + + @python.define(outputs={"out1": python.out(type=int, help="output")}) + def TestFunc(a): + return a + + funky = TestFunc(a=3.5) + with pytest.raises(TypeError): + funky() + + +def test_output_spec_func_2(): + """the function w/o annotated, but output_spec is used + output_spec changes the type of the output (so error is not raised) + """ + + @python.define(outputs={"out1": python.out(type=float, help="output")}) + def TestFunc(a) -> int: + return a + + funky = TestFunc(a=3.5) + outputs = funky() + assert outputs.out1 == 3.5 + + +def test_output_spec_func_2a(): + """the function w/o annotated, but output_spec is used + output_spec changes the type of the output (so error is not raised) + using a shorter syntax + """ + + @python.define(outputs={"out1": python.out(type=float, help="output")}) + def TestFunc(a) -> int: + return a + + funky = TestFunc(a=3.5) + outputs = funky() + assert outputs.out1 == 3.5 + + +def test_output_spec_func_3(): + """the function w/o annotated, but output_spec is used + MultiOutputObj is used, output is a 2-el list, so converter doesn't do anything + """ + + @python.define(outputs={"out_list": python.out(type=MultiOutputObj, help="output")}) + def TestFunc(a, b): + return [a, b] + + funky = TestFunc(a=3.5, b=1) + outputs = funky() + assert outputs.out_list == [3.5, 1] + + +def test_output_spec_func_4(): + """the function w/o annotated, but output_spec is used + MultiOutputObj is used, output is a 1el list, so converter return the element + """ + + @python.define(outputs={"out_list": python.out(type=MultiOutputObj, help="output")}) + def TestFunc(a): + return [a] + + funky = TestFunc(a=3.5) + outputs = funky() + assert 
outputs.out_list == 3.5 + + +def test_functask_callable(tmpdir): + # no submitter or worker + foo = FunAddTwo(a=1) + outputs = foo() + assert outputs.out == 3 + + # worker + bar = FunAddTwo(a=2) + outputs = bar(worker="cf", cache_root=tmpdir) + assert outputs.out == 4 diff --git a/pydra/compose/tests/test_workflow_fields.py b/pydra/compose/tests/test_workflow_fields.py new file mode 100644 index 0000000000..368a639f04 --- /dev/null +++ b/pydra/compose/tests/test_workflow_fields.py @@ -0,0 +1,502 @@ +from operator import attrgetter +from copy import copy +from unittest.mock import Mock +import pytest +import attrs +from pydra.engine.lazy import LazyInField, LazyOutField +import typing as ty +from pydra.compose import shell, python, workflow +from pydra.utils.general import task_fields +from pydra.engine.workflow import Workflow +from pydra.utils.hash import hash_function +from fileformats import video, image + +# NB: We use PascalCase for interfaces and workflow functions as it is translated into a class + + +@python.define +def Add(a, b): + return a + b + + +@python.define +def Mul(a, b): + return a * b + + +@python.define(outputs=["divided"]) +def Divide(x, y): + return x / y + + +@python.define +def Sum(x: list[float]) -> float: + return sum(x) + + +def a_converter(value): + if value is attrs.NOTHING: + return value + return float(value) + + +def test_workflow(): + + @workflow.define + def MyTestWorkflow(a, b): + add = workflow.add(Add(a=a, b=b)) + mul = workflow.add(Mul(a=add.out, b=b)) + return mul.out + + constructor = MyTestWorkflow().constructor + assert constructor.__name__ == "MyTestWorkflow" + + # The constructor function is included as a part of the task so it is + # included in the hash by default and can be overridden if needed. Not 100% sure + # if this is a good idea or not + assert list(task_fields(MyTestWorkflow)) == [ + workflow.arg(name="a"), + workflow.arg(name="b"), + workflow.arg( + name="constructor", type=ty.Callable, hash_eq=True, default=constructor + ), + ] + assert list(task_fields(MyTestWorkflow.Outputs)) == [ + workflow.out(name="out"), + ] + workflow_spec = MyTestWorkflow(a=1, b=2.0) + wf = Workflow.construct(workflow_spec) + assert wf.inputs.a == 1 + assert wf.inputs.b == 2.0 + assert wf.outputs.out == LazyOutField(node=wf["Mul"], field="out", type=ty.Any) + + # Nodes are named after the specs by default + assert list(wf.node_names) == ["Add", "Mul"] + + +def test_constructor_arg_fail(): + + with pytest.raises(ValueError, match="The argument 'constructor' is reserved"): + + @workflow.define + def MyTestWorkflow(constructor: ty.Callable) -> ty.Callable: + return constructor + + +def test_shell_workflow(): + + @workflow.define(outputs=["output_video"]) + def MyTestShellWorkflow( + input_video: video.Mp4, + watermark: image.Png, + watermark_dims: tuple[int, int] = (10, 10), + ) -> video.Mp4: + + add_watermark = workflow.add( + shell.define( + "ffmpeg -i -i " + "-filter_complex " + )( + in_video=input_video, + watermark=watermark, + filter="overlay={}:{}".format(*watermark_dims), + ), + name="add_watermark", + ) + output_video = workflow.add( + shell.define( + "HandBrakeCLI -i -o " + "--width --height ", + )(in_video=add_watermark.out_video, width=1280, height=720), + name="resize", + ).out_video + + return output_video + + constructor = MyTestShellWorkflow().constructor + assert constructor.__name__ == "MyTestShellWorkflow" + assert list(task_fields(MyTestShellWorkflow)) == [ + workflow.arg(name="input_video", type=video.Mp4), + workflow.arg(name="watermark", 
type=image.Png), + workflow.arg(name="watermark_dims", type=tuple[int, int], default=(10, 10)), + workflow.arg( + name="constructor", type=ty.Callable, hash_eq=True, default=constructor + ), + ] + assert list(task_fields(MyTestShellWorkflow.Outputs)) == [ + workflow.out(name="output_video", type=video.Mp4), + ] + input_video = video.Mp4.mock("input.mp4") + watermark = image.Png.mock("watermark.png") + workflow_spec = MyTestShellWorkflow( + input_video=input_video, + watermark=watermark, + ) + wf = Workflow.construct(workflow_spec) + assert wf.inputs.input_video == input_video + assert wf.inputs.watermark == watermark + assert wf.outputs.output_video == LazyOutField( + node=wf["resize"], field="out_video", type=video.Mp4, type_checked=True + ) + assert list(wf.node_names) == ["add_watermark", "resize"] + + +def test_workflow_canonical(): + """Test class-based workflow task""" + + # NB: We use PascalCase (i.e. class names) as it is translated into a class + + @workflow.define + class MyTestWorkflow(workflow.Task["MyTestWorkflow.Outputs"]): + + a: int + b: float = workflow.arg( + help="A float input", + converter=a_converter, + ) + + @staticmethod + def constructor(a, b): + add = workflow.add(Add(a=a, b=b)) + mul = workflow.add(Mul(a=add.out, b=b)) + return mul.out + + @workflow.outputs + class Outputs(workflow.Outputs): + out: float + + constructor = MyTestWorkflow().constructor + assert constructor.__name__ == "constructor" + + # The constructor function is included as a part of the task so it is + # included in the hash by default and can be overridden if needed. Not 100% sure + # if this is a good idea or not + assert sorted(task_fields(MyTestWorkflow), key=attrgetter("name")) == [ + workflow.arg(name="a", type=int), + workflow.arg(name="b", type=float, help="A float input", converter=a_converter), + workflow.arg( + name="constructor", type=ty.Callable, hash_eq=True, default=constructor + ), + ] + assert list(task_fields(MyTestWorkflow.Outputs)) == [ + workflow.out(name="out", type=float), + ] + workflow_spec = MyTestWorkflow(a=1, b=2.0) + wf = Workflow.construct(workflow_spec) + assert wf.inputs.a == 1 + assert wf.inputs.b == 2.0 + assert wf.outputs.out == LazyOutField(node=wf["Mul"], field="out", type=ty.Any) + + # Nodes are named after the specs by default + assert list(wf.node_names) == ["Add", "Mul"] + + +def test_workflow_lazy(): + + @workflow.define + def MyTestShellWorkflow( + input_video: video.Mp4, + watermark: image.Png, + watermark_dims: tuple[int, int] = (10, 10), + ) -> video.Mp4: + + add_watermark = workflow.add( + shell.define( + "ffmpeg -i -i " + "-filter_complex " + )( + in_video=input_video, + watermark=watermark, + filter="overlay={}:{}".format(*watermark_dims), + ), + name="add_watermark", + ) + output_video = workflow.add( + shell.define( + "HandBrakeCLI -i -o " + "--width --height ", + # By default any input/output specified with a flag (e.g. -i ) + # is considered optional, i.e. of type `FsObject | None`, and therefore + # won't be used by default. By overriding this with non-optional types, + # the fields are specified as being required. 
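+                # here `in_video` and `out_video` are promoted from optional (`... | None`) flag fields to required Mp4 inputs/outputs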
+ inputs={"in_video": video.Mp4}, + outputs={"out_video": video.Mp4}, + )(in_video=add_watermark.out_video, width=1280, height=720), + name="resize", + ).out_video + + return output_video # test implicit detection of output name + + # input_video = video.Mp4.mock("input.mp4") + # watermark = image.Png.mock("watermark.png") + mock_node = Mock() + mock_node.name = "mock_node" + workflow_spec = MyTestShellWorkflow( + input_video=LazyOutField(node=mock_node, field="a_video", type=video.Mp4), + watermark=LazyOutField(node=mock_node, field="a_watermark", type=image.Png), + ) + Workflow.clear_cache(task=MyTestShellWorkflow) + wf = Workflow.construct(workflow_spec) + assert wf["add_watermark"].inputs.in_video == LazyInField( + workflow=wf, field="input_video", type=video.Mp4, type_checked=True + ) + assert wf["add_watermark"].inputs.watermark == LazyInField( + workflow=wf, field="watermark", type=image.Png, type_checked=True + ) + + # Check to see that the cache is populated with the new workflow + workflow_cache = Workflow._constructed_cache[hash_function(MyTestShellWorkflow)] + # The non-lazy keys used to construct the workflow + key_set = frozenset(["watermark_dims", "constructor"]) + assert list(workflow_cache) == [key_set] + assert len(workflow_cache[key_set]) == 1 + + # check to see that the cache is not used if we change the value of one of the + # non lazy fields + workflow_spec.watermark_dims = (20, 20) + wf2 = Workflow.construct(workflow_spec) + assert wf2 is not wf + assert list(workflow_cache) == [key_set] + assert len(workflow_cache[key_set]) == 2 + + # check to see that the cache is used if we provide a concrete value for one of the + # lazy fields + workflow_spec.input_video = video.Mp4.mock("input.mp4") + wf3 = Workflow.construct(workflow_spec) + assert wf3 is wf2 + assert list(workflow_cache) == [key_set] + assert len(workflow_cache[key_set]) == 2 + + +def test_direct_access_of_workflow_object(): + + @python.define(inputs={"x": float}, outputs={"z": float}) + def Add(x, y): + return x + y + + def Mul(x, y): + return x * y + + @workflow.define(outputs=["out1", "out2"]) + def MyTestWorkflow(a: int, b: float) -> tuple[float, float]: + """A test workflow demonstration a few alternative ways to set and connect nodes + + Args: + a: An integer input + b: A float input + + Returns: + out1: The first output + out2: The second output + """ + + wf = workflow.this() + + add = wf.add(Add(x=a, y=b), name="addition") + mul = wf.add(python.define(Mul, outputs={"out": float})(x=add.z, y=b)) + divide = wf.add(Divide(x=wf["addition"].lzout.z, y=mul.out), name="division") + + # Alter one of the inputs to a node after it has been initialised + wf["Mul"].inputs.y *= 2 + + return mul.out, divide.divided + + assert list(task_fields(MyTestWorkflow)) == [ + workflow.arg(name="a", type=int, help="An integer input"), + workflow.arg(name="b", type=float, help="A float input"), + workflow.arg( + name="constructor", + type=ty.Callable, + hash_eq=True, + default=MyTestWorkflow().constructor, + ), + ] + assert list(task_fields(MyTestWorkflow.Outputs)) == [ + workflow.out(name="out1", type=float, help="The first output"), + workflow.out(name="out2", type=float, help="The second output"), + ] + workflow_spec = MyTestWorkflow(a=1, b=2.0) + wf = Workflow.construct(workflow_spec) + assert wf.inputs.a == 1 + assert wf.inputs.b == 2.0 + assert wf.outputs.out1 == LazyOutField( + node=wf["Mul"], field="out", type=float, type_checked=True + ) + assert wf.outputs.out2 == LazyOutField( + node=wf["division"], field="divided", 
type=ty.Any + ) + assert list(wf.node_names) == ["addition", "Mul", "division"] + + +def test_workflow_set_outputs_directly(): + + @workflow.define(outputs={"out1": float, "out2": float}) + def MyTestWorkflow(a: int, b: float): + + wf = workflow.this() + add = wf.add(Add(a=a, b=b)) + wf.add(Mul(a=add.out, b=b)) + + # Set the outputs of the workflow directly instead of returning them them in + # a tuple + wf.outputs.out2 = add.out # Using the returned lzout outputs + wf.outputs.out1 = wf["Mul"].lzout.out # accessing the lzout outputs via getitem + + # no return is used when the outputs are set directly + + assert list(task_fields(MyTestWorkflow)) == [ + workflow.arg(name="a", type=int), + workflow.arg(name="b", type=float), + workflow.arg( + name="constructor", + type=ty.Callable, + hash_eq=True, + default=MyTestWorkflow().constructor, + ), + ] + assert list(task_fields(MyTestWorkflow.Outputs)) == [ + workflow.out(name="out1", type=float), + workflow.out(name="out2", type=float), + ] + workflow_spec = MyTestWorkflow(a=1, b=2.0) + wf = Workflow.construct(workflow_spec) + assert wf.inputs.a == 1 + assert wf.inputs.b == 2.0 + assert wf.outputs.out1 == LazyOutField(node=wf["Mul"], field="out", type=ty.Any) + assert wf.outputs.out2 == LazyOutField(node=wf["Add"], field="out", type=ty.Any) + assert list(wf.node_names) == ["Add", "Mul"] + + +def test_workflow_split_combine1(): + + @python.define + def Mul(x: float, y: float) -> float: + return x * y + + @python.define + def Sum(x: list[float]) -> float: + return sum(x) + + @workflow.define + def MyTestWorkflow(a: list[int], b: list[float]) -> list[float]: + mul = workflow.add(Mul().split(x=a, y=b).combine("x")) + sum = workflow.add(Sum(x=mul.out)) + return sum.out + + wf = Workflow.construct(MyTestWorkflow(a=[1, 2, 3], b=[1.0, 10.0, 100.0])) + assert wf["Mul"].splitter == ["Mul.x", "Mul.y"] + assert wf["Mul"].combiner == ["Mul.x"] + assert wf.outputs.out == LazyOutField( + node=wf["Sum"], field="out", type=list[float], type_checked=True + ) + + +def test_workflow_split_combine2(): + + @python.define + def Mul(x: float, y: float) -> float: + return x * y + + @python.define + def Add(x: float, y: float) -> float: + return x + y + + @workflow.define + def MyTestWorkflow(a: list[int], b: list[float], c: float) -> list[float]: + mul = workflow.add(Mul().split(x=a, y=b)) + add = workflow.add(Add(x=mul.out, y=c).combine("Mul.x")) + sum = workflow.add(Sum(x=add.out)) + return sum.out + + wf = Workflow.construct(MyTestWorkflow(a=[1, 2, 3], b=[1.0, 10.0, 100.0], c=2.0)) + assert wf["Mul"].splitter == ["Mul.x", "Mul.y"] + assert wf["Mul"].combiner == [] + assert wf["Add"].splitter == "_Mul" + assert wf["Add"].combiner == ["Mul.x"] + assert wf.outputs.out == LazyOutField( + node=wf["Sum"], field="out", type=list[float], type_checked=True + ) + + +def test_nested_workflow(): + """Simple test of a nested workflow""" + + @python.define + def Add(x: float, y: float) -> float: + return x + y + + @python.define + def Mul(x: float, y: float) -> float: + return x * y + + @python.define + def Divide(x: float, y: float) -> float: + return x / y + + @python.define + def Power(x: float, y: float) -> float: + return x**y + + @workflow.define + def NestedWorkflow(a: float, b: float, c: float) -> float: + pow = workflow.add(Power(x=a, y=c)) + add = workflow.add(Add(x=pow.out, y=b)) + return add.out + + @workflow.define + def MyTestWorkflow(a: int, b: float, c: float) -> float: + div = workflow.add(Divide(x=a, y=b)) + nested = workflow.add(NestedWorkflow(a=div.out, b=b, 
c=c)) + return nested.out + + wf = Workflow.construct(MyTestWorkflow(a=1, b=10.0, c=2.0)) + assert wf.inputs.a == 1 + assert wf.inputs.b == 10.0 + assert wf.inputs.c == 2.0 + assert wf.outputs.out == LazyOutField( + node=wf["NestedWorkflow"], field="out", type=float, type_checked=True + ) + assert list(wf.node_names) == ["Divide", "NestedWorkflow"] + nwf_spec = copy(wf["NestedWorkflow"]._task) + nwf_spec.a = 100.0 + nwf = Workflow.construct(nwf_spec) + nwf.inputs.a == 100.0 + nwf.inputs.b == 10.0 + nwf.inputs.c == 2.0 + nwf.outputs.out == LazyOutField(node=nwf["Add"], field="out", type=float) + assert list(nwf.node_names) == ["Power", "Add"] + + +def test_recursively_nested_conditional_workflow(): + """More complex nested workflow example demonstrating conditional branching at run + time""" + + @python.define + def Add(x: float, y: float) -> float: + return x + y + + @python.define + def Subtract(x: float, y: float) -> float: + return x - y + + @workflow.define + def RecursiveNestedWorkflow(a: float, depth: int) -> float: + add = workflow.add(Add(x=a, y=1)) + decrement_depth = workflow.add(Subtract(x=depth, y=1)) + if depth > 0: + out_node = workflow.add( + RecursiveNestedWorkflow(a=add.out, depth=decrement_depth.out) + ) + else: + out_node = add + return out_node.out + + wf = Workflow.construct(RecursiveNestedWorkflow(a=1, depth=3)) + assert wf.inputs.a == 1 + assert wf.inputs.depth == 3 + assert wf.outputs.out == LazyOutField( + node=wf["RecursiveNestedWorkflow"], + field="out", + type=float, + type_checked=True, + ) diff --git a/pydra/compose/tests/test_workflow_run.py b/pydra/compose/tests/test_workflow_run.py new file mode 100644 index 0000000000..9e314dda3a --- /dev/null +++ b/pydra/compose/tests/test_workflow_run.py @@ -0,0 +1,4608 @@ +import pytest +import shutil +import os +import sys +import time +import typing as ty +import attr +from pathlib import Path +from pydra.engine.tests.utils import ( + Add2, + Add2Wait, + Multiply, + Divide, + # MultiplyList, + # MultiplyMixed, + Power, + Ten, + Identity, + Identity2Flds, + ListOutput, + FunAddSubVar, + FunAddVar3, + FunAddVar, + FunAddTwo, + FunAddVarNone, + FunAddVarDefault, + FunAddVarDefaultNoType, + FunAddVarNoType, + FunAddTwoNoType, + FunWriteFile, + FunWriteFileList, + FunWriteFileList2Dict, + ListSum, + ListMultSum, + DOT_FLAG, +) +from pydra.engine.submitter import Submitter +from pydra.compose import python, workflow +from pydra.engine.workflow import Workflow +from pydra.utils.general import plot_workflow + + +def test_wf_no_output(worker: str, tmp_path: Path): + """Raise error when output isn't set with set_output""" + + @workflow.define + def Worky(x): + workflow.add(Add2(x=x)) + + with pytest.raises(ValueError, match="returned None"): + Workflow.construct(Worky(x=2)) + + +def test_wf_1(worker: str, tmp_path: Path): + """workflow with one task and no splitter""" + + @workflow.define + def Worky(x): + add2 = workflow.add(Add2(x=x), name="add2") + return add2.out + + worky = Worky(x=2) + + checksum_before = worky._hash + outputs = worky(worker=worker, cache_root=tmp_path) + + Workflow.construct(worky) + assert worky._hash == checksum_before + + assert 4 == outputs.out + + +def test_wf_1a_outpastuple(worker: str, tmp_path: Path): + """workflow with one task and no splitter + set_output takes a tuple + """ + + @workflow.define + def Worky(x): + add2 = workflow.add(Add2(x=x), name="add2") + return add2.out + + worky = Worky(x=2) + + outputs = worky(worker=worker, cache_root=tmp_path) + + assert 4 == outputs.out + + +def 
test_wf_1_call_subm(worker: str, tmp_path: Path): + """using wf["__call_"] with submitter""" + + @workflow.define + def Worky(x): + add2 = workflow.add(Add2(x=x), name="add2") + return add2.out + + worky = Worky(x=2) + + outputs = worky(worker=worker, cache_root=tmp_path) + + assert 4 == outputs.out + + +def test_wf_1_call_plug(worker: str, tmp_path: Path): + """using wf["__call_"] with worker""" + + @workflow.define + def Worky(x): + add2 = workflow.add(Add2(x=x), name="add2") + return add2.out + + worky = Worky(x=2) + + outputs = worky(worker=worker) + + assert 4 == outputs.out + + +def test_wf_1_call_noplug_nosubm(worker: str, tmp_path: Path): + """using wf["__call_"] without worker or submitter""" + + @workflow.define + def Worky(x): + add2 = workflow.add(Add2(x=x), name="add2") + return add2.out + + worky = Worky(x=2) + + outputs = worky() + + assert 4 == outputs.out + + +def test_wf_1_upd_in_run(tmp_path, worker): + """Updating input in __call__""" + + @workflow.define + def Worky(x): + add2 = workflow.add(Add2(x=x), name="add2") + return add2.out + + worky = Worky(x=1) + worky.x = 2 + outputs = worky(cache_root=tmp_path, worker=worker) + assert 4 == outputs.out + + +def test_wf_2(worker: str, tmp_path: Path): + """workflow with 2 tasks, no splitter""" + + @workflow.define + def Worky(x, y): + mult = workflow.add(Multiply(x=x, y=y), name="mult") + add2 = workflow.add(Add2(x=mult.out), name="add2") + return add2.out + + worky = Worky(x=2, y=3) + + outputs = worky(worker=worker, cache_root=tmp_path) + + assert 8 == outputs.out + + +def test_wf_2a(worker: str, tmp_path: Path): + """workflow with 2 tasks, no splitter + creating add2_task first (before calling add method), + """ + + @workflow.define + def Worky(x, y): + mult = workflow.add(Multiply(x=x, y=y), name="mult") + add2 = workflow.add(Add2(x=mult.out), name="add2") + return add2.out + + worky = Worky(x=2, y=3) + + outputs = worky(worker=worker, cache_root=tmp_path) + + assert 8 == outputs.out + + +def test_wf_2b(worker: str, tmp_path: Path): + """workflow with 2 tasks, no splitter + creating add2_task first (before calling add method), + adding inputs.x after add method + """ + + @workflow.define + def Worky(x, y): + mult = workflow.add(Multiply(x=x, y=y), name="mult") + add2 = workflow.add(Add2(x=mult.out), name="add2") + return add2.out + + worky = Worky(x=2, y=3) + + outputs = worky(worker=worker, cache_root=tmp_path) + + assert 8 == outputs.out + + +def test_wf_2c_multoutp(worker: str, tmp_path: Path): + """workflow with 2 tasks, no splitter + setting multiple outputs for the workflow + """ + + @workflow.define(outputs=["out_add2", "out_mult"]) + def Worky(x, y): + mult = workflow.add(Multiply(x=x, y=y), name="mult") + add2 = workflow.add(Add2(x=mult.out), name="add2") + return add2.out, mult.out + + worky = Worky(x=2, y=3) + + outputs = worky(worker=worker, cache_root=tmp_path) + + # checking outputs from both nodes + assert 6 == outputs.out_mult + assert 8 == outputs.out_add2 + + +def test_wf_2d_outpasdict(worker: str, tmp_path: Path): + """workflow with 2 tasks, no splitter + setting multiple outputs using a dictionary + """ + + @workflow.define(outputs=["out_add2", "out_mult"]) + def Worky(x, y): + mult = workflow.add(Multiply(x=x, y=y), name="mult") + add2 = workflow.add(Add2(x=mult.out), name="add2") + return add2.out, mult.out + + worky = Worky(x=2, y=3) + + outputs = worky(worker=worker, cache_root=tmp_path) + + # checking outputs from both nodes + assert 6 == outputs.out_mult + assert 8 == outputs.out_add2 + + 
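+# workflows with None or unset inputs and function default values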
+@pytest.mark.flaky(reruns=3) # when dask +def test_wf_3(worker, tmp_path: Path): + """testing None value for an input""" + + @workflow.define + def Worky(x, y): + addvar = workflow.add(FunAddVarNone(a=x, b=y)) + add2 = workflow.add(Add2(x=addvar.out), name="add2") + return add2.out + + worky = Worky(x=2, y=None) + + outputs = worky(worker=worker, cache_root=tmp_path) + + assert 4 == outputs.out + + +@pytest.mark.xfail(reason="the task error doesn't propagate") +def test_wf_3a_exception(worker: str, tmp_path: Path): + """testinh worky without set input, attr.NOTHING should be set + and the function should raise an exception + """ + + @workflow.define + def Worky(x, y): + addvar = workflow.add(FunAddVarNone(a=x, b=y)) + add2 = workflow.add(Add2(x=addvar.out), name="add2") + return add2.out + + worky = Worky(x=2, y=attr.NOTHING) + + with pytest.raises(TypeError, match="unsupported"): + worky(worker=worker, cache_root=tmp_path) + + +def test_wf_4(worker: str, tmp_path: Path): + """worky with a task that doesn't set one input and use the function default value""" + + @workflow.define + def Worky(x, y=None): + addvar = workflow.add(FunAddVarDefault(a=x)) + add2 = workflow.add(Add2(x=addvar.out), name="add2") + return add2.out + + worky = Worky(x=2) + + outputs = worky(worker=worker, cache_root=tmp_path) + + assert 5 == outputs.out + + +def test_wf_4a(worker: str, tmp_path: Path): + """worky with a task that doesn't set one input, + the unset input is send to the task input, + so the task should use the function default value + """ + + @workflow.define + def Worky(x): + addvar = workflow.add(FunAddVarDefault(a=x)) + add2 = workflow.add(Add2(x=addvar.out), name="add2") + return add2.out + + worky = Worky(x=2) + + outputs = worky(worker=worker, cache_root=tmp_path) + + assert 5 == outputs.out + + +def test_wf_5(worker: str, tmp_path: Path): + """worky with two outputs connected to the task outputs + one set_output + """ + + @workflow.define(outputs=["out_sum", "out_sub"]) + def Worky(x, y): + addsub = workflow.add(FunAddSubVar(a=x, b=y)) + return addsub.sum, addsub.sub + + worky = Worky(x=3, y=2) + + outputs = worky(worker=worker, cache_root=tmp_path) + + assert 5 == outputs.out_sum + assert 1 == outputs.out_sub + + +def test_wf_5a(worker: str, tmp_path: Path): + """worky with two outputs connected to the task outputs, + set_output set twice + """ + + @workflow.define(outputs=["out_sum", "out_sub"]) + def Worky(x, y): + addsub = workflow.add(FunAddSubVar(a=x, b=y)) + return addsub.sum, addsub.sub + + worky = Worky(x=3, y=2) + outputs = worky(worker=worker, cache_root=tmp_path) + + assert 5 == outputs.out_sum + assert 1 == outputs.out_sub + + +def test_wf_6(worker: str, tmp_path: Path): + """worky with two tasks and two outputs connected to both tasks, + one set_output + """ + + @workflow.define(outputs=["out1", "out2"]) + def Worky(x, y): + mult = workflow.add(Multiply(x=x, y=y), name="mult") + add2 = workflow.add(Add2(x=mult.out), name="add2") + return mult.out, add2.out # + + worky = Worky(x=2, y=3) + + outputs = worky(worker=worker, cache_root=tmp_path) + + assert 6 == outputs.out1 + assert 8 == outputs.out2 + + +def test_wf_6a(worker: str, tmp_path: Path): + """worky with two tasks and two outputs connected to both tasks, + set_output used twice + """ + + @workflow.define(outputs=["out1", "out2"]) + def Worky(x, y): + mult = workflow.add(Multiply(x=x, y=y), name="mult") + add2 = workflow.add(Add2(x=mult.out), name="add2") + return mult.out, add2.out + + worky = Worky(x=2, y=3) + + outputs = 
worky(worker=worker, cache_root=tmp_path) + + assert 6 == outputs.out1 + assert 8 == outputs.out2 + + +def test_wf_st_1(worker: str, tmp_path: Path): + """Worky with one task, a splitter for the workflow""" + + @workflow.define + def Worky(x): + add2 = workflow.add(Add2().split("x", x=x), name="add2") + + return add2.out + + worky = Worky(x=[1, 2]) + + checksum_before = worky._hash + outputs = worky(cache_root=tmp_path, worker=worker) + + Workflow.construct(worky) + assert worky._hash == checksum_before + + # expected: [({"test7.x": 1}, 3), ({"test7.x": 2}, 4)] + assert outputs.out[0] == 3 + assert outputs.out[1] == 4 + + +def test_wf_st_1_call_subm(worker: str, tmp_path: Path): + """Worky with one task, a splitter for the workflow""" + + @workflow.define + def Worky(x): + add2 = workflow.add(Add2().split("x", x=x), name="add2") + + return add2.out + + worky = Worky(x=[1, 2]) + + outputs = worky(worker=worker, cache_root=tmp_path) + + # expected: [({"test7.x": 1}, 3), ({"test7.x": 2}, 4)] + assert outputs.out[0] == 3 + assert outputs.out[1] == 4 + + +def test_wf_st_1_call_plug(worker: str, tmp_path: Path): + """Worky with one task, a splitter for the workflow + using Worky.__call__(worker) + """ + + @workflow.define + def Worky(x): + add2 = workflow.add(Add2().split("x", x=x), name="add2") + + return add2.out + + worky = Worky(x=[1, 2]) + + outputs = worky(worker=worker) + + # expected: [({"test7.x": 1}, 3), ({"test7.x": 2}, 4)] + assert outputs.out[0] == 3 + assert outputs.out[1] == 4 + + +def test_wf_st_1_call_selfplug(worker: str, tmp_path: Path): + """Worky with one task, a splitter for the workflow + using Worky.__call__() and using self.worker + """ + + @workflow.define + def Worky(x): + add2 = workflow.add(Add2().split("x", x=x), name="add2") + return add2.out + + worky = Worky(x=[1, 2]) + + outputs = worky() + + # expected: [({"test7.x": 1}, 3), ({"test7.x": 2}, 4)] + assert outputs.out[0] == 3 + assert outputs.out[1] == 4 + + +def test_wf_st_1_call_noplug_nosubm(worker: str, tmp_path: Path): + """Worky with one task, a splitter for the workflow + using Worky.__call__() without worker and submitter + (a submitter should be created within the __call__ function) + """ + + @workflow.define + def Worky(x): + add2 = workflow.add(Add2().split("x", x=x), name="add2") + return add2.out + + worky = Worky(x=[1, 2]) + + outputs = worky() + + # expected: [({"test7.x": 1}, 3), ({"test7.x": 2}, 4)] + assert outputs.out[0] == 3 + assert outputs.out[1] == 4 + + +def test_wf_st_1_inp_in_call(tmp_path, worker): + """Defining input in __call__""" + + @workflow.define + def Worky(x): + add2 = workflow.add(Add2(x=x), name="add2") + return add2.out + + worky = Worky().split("x", x=[1, 2]) + outputs = worky(cache_root=tmp_path, worker=worker) # + assert outputs.out[0] == 3 + assert outputs.out[1] == 4 + + +def test_wf_st_1_upd_inp_call(tmp_path, worker): + """Updating input in __call___""" + + @workflow.define + def Worky(x): + add2 = workflow.add(Add2(x=x), name="add2") + return add2.out + + worky = Worky().split("x", x=[1, 2]) + outputs = worky(cache_root=tmp_path, worker=worker) + assert outputs.out == [3, 4] + + +def test_wf_st_noinput_1(worker: str, tmp_path: Path): + """Worky with one task, a splitter for the workflow""" + + @workflow.define + def Worky(x): + add2 = workflow.add(Add2().split("x", x=x), name="add2") + return add2.out + + worky = Worky(x=[]) + + checksum_before = worky._hash + outputs = worky(worker=worker, cache_root=tmp_path) + + assert worky._hash == checksum_before + + assert 
outputs.out == [] + + +def test_wf_ndst_1(worker: str, tmp_path: Path): + """workflow with one task, a splitter on the task level""" + + @workflow.define + def Worky(x): + add2 = workflow.add(Add2().split("x", x=x), name="add2") + return add2.out + + worky = Worky(x=[1, 2]) + + checksum_before = worky._hash + outputs = worky(worker=worker, cache_root=tmp_path) + + wf = Workflow.construct(worky) + assert worky._hash == checksum_before + + # expected: [({"test7.x": 1}, 3), ({"test7.x": 2}, 4)] + assert outputs.out == [3, 4] + + +def test_wf_ndst_updatespl_1(worker: str, tmp_path: Path): + """workflow with one task, + a splitter on the task level is added *after* calling add + """ + + @workflow.define + def Worky(x): + add2 = workflow.add(Add2().split("x", x=x), name="add2") + return add2.out + + worky = Worky(x=[1, 2]) + + outputs = worky(worker=worker, cache_root=tmp_path) + + # expected: [({"test7.x": 1}, 3), ({"test7.x": 2}, 4)] + assert outputs.out == [3, 4] + + +def test_wf_ndst_updatespl_1a(worker: str, tmp_path: Path): + """workflow with one task (initialize before calling add), + a splitter on the task level is added *after* calling add + """ + + @workflow.define + def Worky(x): + add2 = workflow.add(Add2().split("x", x=x), name="add2") + return add2.out + + worky = Worky(x=[1, 2]) + + outputs = worky(worker=worker, cache_root=tmp_path) + + # expected: [({"test7.x": 1}, 3), ({"test7.x": 2}, 4)] + assert outputs.out == [3, 4] + + +def test_wf_ndst_updateinp_1(worker: str, tmp_path: Path): + """workflow with one task, + a splitter on the task level, + updating input of the task after calling add + """ + + @workflow.define + def Worky(x, y): + add2 = workflow.add(Add2().split("x", x=y), name="add2") + return add2.out + + worky = Worky(x=[1, 2], y=[11, 12]) + + outputs = worky(worker=worker, cache_root=tmp_path) + + assert outputs.out == [13, 14] + + +def test_wf_ndst_noinput_1(worker: str, tmp_path: Path): + """workflow with one task, a splitter on the task level""" + + @workflow.define + def Worky(x): + add2 = workflow.add(Add2().split("x", x=x), name="add2") + return add2.out + + worky = Worky(x=[]) + + checksum_before = worky._hash + outputs = worky(worker=worker, cache_root=tmp_path) + + wf = Workflow.construct(worky) + assert worky._hash == checksum_before + + assert outputs.out == [] + + +def test_wf_st_2(worker: str, tmp_path: Path): + """workflow with one task, splitters and combiner for workflow""" + + @workflow.define + def Worky(x): + add2 = workflow.add(Add2(x=x), name="add2") + + return add2.out + + worky = Worky().split("x", x=[1, 2]).combine(combiner="x") + + outputs = worky(worker=worker, cache_root=tmp_path) + + # expected: [({"test7.x": 1}, 3), ({"test7.x": 2}, 4)] + assert outputs.out[0] == 3 + assert outputs.out[1] == 4 + + +def test_wf_ndst_2(worker: str, tmp_path: Path): + """workflow with one task, splitters and combiner on the task level""" + + @workflow.define + def Worky(x): + add2 = workflow.add(Add2().split("x", x=x).combine(combiner="x"), name="add2") + return add2.out + + worky = Worky(x=[1, 2]) + + outputs = worky(worker=worker, cache_root=tmp_path) + + # expected: [({"test7.x": 1}, 3), ({"test7.x": 2}, 4)] + assert outputs.out == [3, 4] + + +# workflows with structures A -> B + + +def test_wf_st_3(worker: str, tmp_path: Path): + """workflow with 2 tasks, splitter on worky level""" + + @workflow.define + def Worky(x, y): + mult = workflow.add(Multiply(x=x, y=y), name="mult") + add2 = workflow.add(Add2(x=mult.out), name="add2") + + return add2.out + + worky 
= Worky().split(("x", "y"), x=[1, 2], y=[11, 12]) + + outputs = worky(worker=worker, cache_root=tmp_path) + + expected = [ + ({"wfst_3.x": 1, "wfst_3.y": 11}, 13), + ({"wfst_3.x": 2, "wfst_3.y": 12}, 26), + ] + expected_ind = [ + ({"wfst_3.x": 0, "wfst_3.y": 0}, 13), + ({"wfst_3.x": 1, "wfst_3.y": 1}, 26), + ] + + for i, res in enumerate(expected): + assert outputs.out[i] == res[1] + + +def test_wf_ndst_3(worker: str, tmp_path: Path): + """Test workflow with 2 tasks, splitter on a task level""" + + @workflow.define + def Worky(x, y): + mult = workflow.add(Multiply().split(("x", "y"), x=x, y=y), name="mult") + add2 = workflow.add(Add2(x=mult.out), name="add2") + return add2.out + + worky = Worky(x=[1, 2], y=[11, 12]) + + outputs = worky(worker=worker, cache_root=tmp_path) + + # expected: [({"test7.x": 1, "test7.y": 11}, 13), ({"test7.x": 2, "test.y": 12}, 26)] + assert outputs.out == [13, 26] + + +def test_wf_st_4(worker: str, tmp_path: Path): + """workflow with two tasks, scalar splitter and combiner for the workflow""" + + @workflow.define + def Worky(x, y): + mult = workflow.add(Multiply(x=x, y=y), name="mult") + add2 = workflow.add(Add2(x=mult.out), name="add2") + + return add2.out + + worky = Worky().split(("x", "y"), x=[1, 2], y=[11, 12]).combine("x") + outputs = worky(worker=worker, cache_root=tmp_path) + + # expected: [ + # ({"test7.x": 1, "test7.y": 11}, 13), ({"test7.x": 2, "test.y": 12}, 26) + # ] + assert outputs.out[0] == 13 + assert outputs.out[1] == 26 + + +def test_wf_ndst_4(worker: str, tmp_path: Path): + """workflow with two tasks, scalar splitter and combiner on tasks level""" + + @workflow.define + def Worky(a, b): + mult = workflow.add(Multiply().split(("x", "y"), x=a, y=b), name="mult") + add2 = workflow.add(Add2(x=mult.out).combine("mult.x"), name="add2") + + return add2.out + + worky = Worky(a=[1, 2], b=[11, 12]) + + outputs = worky(worker=worker, cache_root=tmp_path) + + # expected: [ + # ({"test7.x": 1, "test7.y": 11}, 13), ({"test7.x": 2, "test.y": 12}, 26) + # ] + assert outputs.out == [13, 26] + + +def test_wf_st_5(worker: str, tmp_path: Path): + """workflow with two tasks, outer splitter and no combiner""" + + @workflow.define + def Worky(x, y): + mult = workflow.add(Multiply(x=x, y=y), name="mult") + add2 = workflow.add(Add2(x=mult.out), name="add2") + + return add2.out + + worky = Worky().split(x=[1, 2], y=[11, 12]) + + outputs = worky(worker=worker, cache_root=tmp_path) + + assert outputs.out == [13, 14, 24, 26] + + +def test_wf_ndst_5(worker: str, tmp_path: Path): + """workflow with two tasks, outer splitter on tasks level and no combiner""" + + @workflow.define + def Worky(x, y): + mult = workflow.add(Multiply().split(["x", "y"], x=x, y=y), name="mult") + add2 = workflow.add(Add2(x=mult.out), name="add2") + return add2.out + + worky = Worky(x=[1, 2], y=[11, 12]) + + outputs = worky(worker=worker, cache_root=tmp_path) + + assert outputs.out[0] == 13 + assert outputs.out[1] == 14 + assert outputs.out[2] == 24 + assert outputs.out[3] == 26 + + +def test_wf_st_6(worker: str, tmp_path: Path): + """workflow with two tasks, outer splitter and combiner for the workflow""" + + @workflow.define + def Worky(x, y): + mult = workflow.add(Multiply(x=x, y=y), name="mult") + add2 = workflow.add(Add2(x=mult.out), name="add2") + + return add2.out + + worky = Worky().split(["x", "y"], x=[1, 2, 3], y=[11, 12]).combine("x") + + outputs = worky(worker=worker, cache_root=tmp_path) + + assert outputs.out[0][0] == 13 + assert outputs.out[0][1] == 24 + assert outputs.out[0][2] == 
35 + assert outputs.out[1][0] == 14 + assert outputs.out[1][1] == 26 + assert outputs.out[1][2] == 38 + + +def test_wf_ndst_6(worker: str, tmp_path: Path): + """workflow with two tasks, outer splitter and combiner on tasks level""" + + @workflow.define + def Worky(x, y): + mult = workflow.add(Multiply().split(["x", "y"], x=x, y=y), name="mult") + add2 = workflow.add(Add2(x=mult.out).combine("mult.x"), name="add2") + return add2.out + + worky = Worky(x=[1, 2, 3], y=[11, 12]) + + outputs = worky(worker=worker, cache_root=tmp_path) + + assert outputs.out == [[13, 24, 35], [14, 26, 38]] + + +def test_wf_ndst_7(worker: str, tmp_path: Path): + """workflow with two tasks, outer splitter and (full) combiner for first node only""" + + @workflow.define + def Worky(x, y): + mult = workflow.add(Multiply(y=y).split(x=x).combine("x"), name="mult") + iden = workflow.add(Identity(x=mult.out)) + return iden.out + + worky = Worky(x=[1, 2, 3], y=11) + + outputs = worky(worker=worker, cache_root=tmp_path) + + assert outputs.out == [11, 22, 33] + + +def test_wf_ndst_8(worker: str, tmp_path: Path): + """workflow with two tasks, outer splitter and (partial) combiner for first task only""" + + @workflow.define + def Worky(x, y): + mult = workflow.add( + Multiply().split(["x", "y"], x=x, y=y).combine("x"), name="mult" + ) + iden = workflow.add(Identity(x=mult.out)) + return iden.out + + worky = Worky(x=[1, 2, 3], y=[11, 12]) + + outputs = worky(worker=worker, cache_root=tmp_path) + + assert outputs.out == [[11, 22, 33], [12, 24, 36]] + + +def test_wf_ndst_9(worker: str, tmp_path: Path): + """workflow with two tasks, outer splitter and (full) combiner for first task only""" + + @workflow.define + def Worky(x, y): + mult = workflow.add( + Multiply().split(["x", "y"], x=x, y=y).combine(["x", "y"]), name="mult" + ) + iden = workflow.add(Identity(x=mult.out)) + return iden.out + + worky = Worky(x=[1, 2, 3], y=[11, 12]) + + outputs = worky(worker=worker, cache_root=tmp_path) + + assert outputs.out == [11, 12, 22, 24, 33, 36] + + +# workflows with structures A -> B -> C + + +def test_wf_3sernd_ndst_1(worker: str, tmp_path: Path): + """workflow with three "serial" tasks, checking if the splitter is propagating""" + + @workflow.define + def Worky(x, y): + mult = workflow.add(Multiply().split(["x", "y"], x=x, y=y), name="mult") + add2_1st = workflow.add(Add2(x=mult.out), name="add2_1st") + add2_2nd = workflow.add(Add2(x=add2_1st.out), name="add2_2nd") + return add2_2nd.out + + worky = Worky(x=[1, 2], y=[11, 12]) + + outputs = worky(worker=worker, cache_root=tmp_path) + + # splitter from the first task should propagate to all tasks, + # splitter_rpn should be the same in all tasks + wf = Workflow.construct(worky) + assert wf["mult"].state.splitter == ["mult.x", "mult.y"] + assert wf["add2_1st"].state.splitter == "_mult" + assert wf["add2_2nd"].state.splitter == "_add2_1st" + assert ( + ["mult.x", "mult.y", "*"] + == wf["mult"].state.splitter_rpn + == wf["add2_1st"].state.splitter_rpn + == wf["add2_2nd"].state.splitter_rpn + ) + + assert outputs.out == [15, 16, 26, 28] + + +def test_wf_3sernd_ndst_1a(worker: str, tmp_path: Path): + """ + workflow with three "serial" tasks, checking if the splitter is propagating + first task has a splitter that propagates to the 2nd task, + and the 2nd task is adding one more input to the splitter + """ + + @workflow.define + def Worky(x, y): + add2_1st = workflow.add(Add2().split("x", x=x), name="add2_1st") + mult = workflow.add(Multiply(x=add2_1st.out).split("y", y=y), name="mult") + 
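+        # add2_2nd only follows the upstream state: its splitter becomes "_mult", with splitter_rpn ["add2_1st.x", "mult.y", "*"] (asserted below)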
add2_2nd = workflow.add(Add2(x=mult.out), name="add2_2nd") + return add2_2nd.out + + worky = Worky(x=[1, 2], y=[11, 12]) + + outputs = worky(worker=worker, cache_root=tmp_path) + + # splitter from the 1st task should propagate and the 2nd task should add one more + # splitter_rpn for the 2nd and the 3rd task should be the same + wf = Workflow.construct(worky) + assert wf["add2_1st"].state.splitter == "add2_1st.x" + assert wf["mult"].state.splitter == ["_add2_1st", "mult.y"] + assert wf["add2_2nd"].state.splitter == "_mult" + assert ( + ["add2_1st.x", "mult.y", "*"] + == wf["mult"].state.splitter_rpn + == wf["add2_2nd"].state.splitter_rpn + ) + + assert outputs.out == [35, 38, 46, 50] + + +# workflows with structures A -> C, B -> C + + +@pytest.mark.flaky(reruns=3) # when dask +def test_wf_3nd_st_1(worker, tmp_path: Path): + """workflow with three tasks, third one connected to two previous tasks, + splitter on the workflow level + """ + + @workflow.define + def Worky(x, y): + add2x = workflow.add(Add2(x=x), name="add2x") + add2y = workflow.add(Add2(x=y), name="add2y") + mult = workflow.add(Multiply(x=add2x.out, y=add2y.out), name="mult") + + return mult.out + + worky = Worky().split(["x", "y"], x=[1, 2, 3], y=[11, 12]) + + outputs = worky(worker=worker, cache_root=tmp_path) + + assert outputs.out[0] == 39 + assert outputs.out[1] == 42 + assert outputs.out[5] == 70 + + +@pytest.mark.flaky(reruns=3) # when dask +def test_wf_3nd_ndst_1(worker, tmp_path: Path): + """workflow with three tasks, third one connected to two previous tasks, + splitter on the tasks levels + """ + + @workflow.define + def Worky(x, y): + add2x = workflow.add(Add2().split("x", x=x), name="add2x") + add2y = workflow.add(Add2().split("x", x=y), name="add2y") + mult = workflow.add(Multiply(x=add2x.out, y=add2y.out), name="mult") + return mult.out + + worky = Worky(x=[1, 2, 3], y=[11, 12]) + + outputs = worky(worker=worker, cache_root=tmp_path) + + assert len(outputs.out) == 6 + assert outputs.out == [39, 42, 52, 56, 65, 70] + + +def test_wf_3nd_st_2(worker: str, tmp_path: Path): + """workflow with three tasks, third one connected to two previous tasks, + splitter and partial combiner on the workflow level + """ + + @workflow.define + def Worky(x, y): + add2x = workflow.add(Add2(x=x), name="add2x") + add2y = workflow.add(Add2(x=y), name="add2y") + mult = workflow.add(Multiply(x=add2x.out, y=add2y.out), name="mult") + return mult.out + + worky = Worky().split(["x", "y"], x=[1, 2, 3], y=[11, 12]).combine("x") + + outputs = worky(worker=worker, cache_root=tmp_path) + + assert outputs.out[0][0] == 39 + assert outputs.out[0][1] == 52 + assert outputs.out[0][2] == 65 + assert outputs.out[1][0] == 42 + assert outputs.out[1][1] == 56 + assert outputs.out[1][2] == 70 + + +def test_wf_3nd_ndst_2(worker: str, tmp_path: Path): + """workflow with three tasks, third one connected to two previous tasks, + splitter and partial combiner on the tasks levels + """ + + @workflow.define + def Worky(x, y): + add2x = workflow.add(Add2().split("x", x=x), name="add2x") + add2y = workflow.add(Add2().split("x", x=y), name="add2y") + mult = workflow.add( + Multiply(x=add2x.out, y=add2y.out).combine("add2x.x"), name="mult" + ) + return mult.out + + worky = Worky(x=[1, 2, 3], y=[11, 12]) + + outputs = worky(worker=worker, cache_root=tmp_path) + + assert len(outputs.out) == 2 + assert outputs.out == [[39, 52, 65], [42, 56, 70]] + + +def test_wf_3nd_st_3(worker: str, tmp_path: Path): + """workflow with three tasks, third one connected to two previous 
tasks, + splitter and partial combiner (from the second task) on the workflow level + """ + + @workflow.define + def Worky(x, y): + add2x = workflow.add(Add2(x=x), name="add2x") + add2y = workflow.add(Add2(x=y), name="add2y") + mult = workflow.add(Multiply(x=add2x.out, y=add2y.out), name="mult") + return mult.out + + worky = Worky().split(["x", "y"], x=[1, 2, 3], y=[11, 12]).combine("y") + + outputs = worky(worker=worker, cache_root=tmp_path) + + assert outputs.out[0][0] == 39 + assert outputs.out[0][1] == 42 + assert outputs.out[1][0] == 52 + assert outputs.out[1][1] == 56 + assert outputs.out[2][0] == 65 + assert outputs.out[2][1] == 70 + + +def test_wf_3nd_ndst_3(worker: str, tmp_path: Path): + """workflow with three tasks, third one connected to two previous tasks, + splitter and partial combiner (from the second task) on the tasks levels + """ + + @workflow.define + def Worky(x, y): + add2x = workflow.add(Add2().split("x", x=x), name="add2x") + add2y = workflow.add(Add2().split("x", x=y), name="add2y") + mult = workflow.add( + Multiply(x=add2x.out, y=add2y.out).combine("add2y.x"), name="mult" + ) + return mult.out + + worky = Worky(x=[1, 2, 3], y=[11, 12]) + + outputs = worky(worker=worker, cache_root=tmp_path) + + assert len(outputs.out) == 3 + assert outputs.out[0] == [39, 42] + assert outputs.out[1] == [52, 56] + assert outputs.out[2] == [65, 70] + + +def test_wf_3nd_st_4(worker: str, tmp_path: Path): + """workflow with three tasks, third one connected to two previous tasks, + splitter and full combiner on the workflow level + """ + + @workflow.define + def Worky(x, y): + add2x = workflow.add(Add2(x=x), name="add2x") + add2y = workflow.add(Add2(x=y), name="add2y") + mult = workflow.add(Multiply(x=add2x.out, y=add2y.out), name="mult") + return mult.out + + worky = Worky().split(["x", "y"], x=[1, 2, 3], y=[11, 12]).combine(["x", "y"]) + + outputs = worky(worker=worker, cache_root=tmp_path) + + assert outputs.out[0] == 39 + assert outputs.out[1] == 42 + assert outputs.out[2] == 52 + assert outputs.out[3] == 56 + assert outputs.out[4] == 65 + assert outputs.out[5] == 70 + + +def test_wf_3nd_ndst_4(worker: str, tmp_path: Path): + """workflow with three tasks, third one connected to two previous tasks, + splitter and full combiner on the tasks levels + """ + + @workflow.define + def Worky(x, y): + add2x = workflow.add(Add2().split("x", x=x), name="add2x") + add2y = workflow.add(Add2().split("x", x=y), name="add2y") + mult = workflow.add( + Multiply(x=add2x.out, y=add2y.out).combine(["add2x.x", "add2y.x"]) + ) + return mult.out + + worky = Worky(x=[1, 2, 3], y=[11, 12]) + + outputs = worky(worker=worker, cache_root=tmp_path) + + # assert outputs._cache_dir.exists() + + assert len(outputs.out) == 6 + assert outputs.out == [39, 42, 52, 56, 65, 70] + + +def test_wf_3nd_st_5(worker: str, tmp_path: Path): + """workflow with three tasks (A->C, B->C) and three fields in the splitter, + splitter and partial combiner (from the second task) on the workflow level + """ + + @workflow.define + def Worky(x, y, z): + add2x = workflow.add(Add2(x=x), name="add2x") + add2y = workflow.add(Add2(x=y), name="add2y") + addvar = workflow.add(FunAddVar3(a=add2x.out, b=add2y.out, c=z)) + return addvar.out + + worky = ( + Worky().split(["x", "y", "z"], x=[2, 3], y=[11, 12], z=[10, 100]).combine("y") + ) + + outputs = worky(worker=worker, cache_root=tmp_path) + + assert outputs.out[0][0] == 27 + assert outputs.out[0][1] == 28 + assert outputs.out[1][0] == 117 + assert outputs.out[1][1] == 118 + assert 
outputs.out[2][0] == 28 + assert outputs.out[2][1] == 29 + assert outputs.out[3][0] == 118 + assert outputs.out[3][1] == 119 + + +def test_wf_3nd_ndst_5(worker: str, tmp_path: Path): + """workflow with three tasks (A->C, B->C) and three fields in the splitter, + all tasks have splitters and the last one has a partial combiner (from the 2nd) + """ + + @workflow.define + def Worky(x, y, z): + add2x = workflow.add(Add2().split("x", x=x), name="add2x") + add2y = workflow.add(Add2().split("x", x=y), name="add2y") + addvar = workflow.add( + FunAddVar3(a=add2x.out, b=add2y.out).split("c", c=z).combine("add2x.x") + ) + + return addvar.out + + worky = Worky(x=[2, 3], y=[11, 12], z=[10, 100]) + + outputs = worky(worker=worker, cache_root=tmp_path) + + assert len(outputs.out) == 4 + assert outputs.out[0] == [27, 28] + assert outputs.out[1] == [117, 118] + assert outputs.out[2] == [28, 29] + assert outputs.out[3] == [118, 119] + + # checking all directories + + +def test_wf_3nd_ndst_6(worker: str, tmp_path: Path): + """workflow with three tasks, third one connected to two previous tasks, + the third one uses scalar splitter from the previous ones and a combiner + """ + + @workflow.define + def Worky(x, y): + add2x = workflow.add(Add2().split("x", x=x), name="add2x") + add2y = workflow.add(Add2().split("x", x=y), name="add2y") + mult = workflow.add( + Multiply(x=add2x.out, y=add2y.out) + .split(("_add2x", "_add2y")) + .combine("add2y.x") + ) + return mult.out + + worky = Worky(x=[1, 2], y=[11, 12]) + + outputs = worky(worker=worker, cache_root=tmp_path) + + assert outputs.out == [39, 56] + + +def test_wf_3nd_ndst_7(worker: str, tmp_path: Path): + """workflow with three tasks, third one connected to two previous tasks, + the third one uses scalar splitter from the previous ones + """ + + @workflow.define + def Worky(x): + add2x = workflow.add(Add2().split("x", x=x), name="add2x") + add2y = workflow.add(Add2().split("x", x=x), name="add2y") + mult = workflow.add( + Multiply(x=add2x.out, y=add2y.out).split(("_add2x", "_add2y")) + ) + return mult.out + + worky = Worky(x=[1, 2]) + + outputs = worky(worker=worker, cache_root=tmp_path) + + assert outputs.out == [9, 16] + + +# workflows with structures A -> B -> C with multiple connections + + +def test_wf_3nd_8(tmp_path: Path): + """workflow with three tasks A->B->C vs two tasks A->C with multiple connections""" + + @workflow.define(outputs=["out1", "out2", "out1a", "out2a"]) + def Worky(zip): + + iden2flds_1 = workflow.add( + Identity2Flds(x2="Hoi").split("x1", x1=zip), name="iden2flds_1" + ) + + identity = workflow.add(Identity(x=iden2flds_1.out1)) + + iden2flds_2 = workflow.add( + Identity2Flds(x1=identity.out, x2=iden2flds_1.out2), name="iden2flds_2" + ) + + iden2flds_2a = workflow.add( + Identity2Flds( + x1=iden2flds_1.out1, + x2=iden2flds_1.out2, + ) + ) + + return iden2flds_2.out1, iden2flds_2.out2, iden2flds_2a.out1, iden2flds_2a.out2 + + worky = Worky(zip=[["test1", "test3", "test5"], ["test2", "test4", "test6"]]) + + with Submitter(worker="cf") as sub: + res = sub(worky) + + assert ( + res.outputs.out1 + == res.outputs.out1a + == [["test1", "test3", "test5"], ["test2", "test4", "test6"]] + ) + assert res.outputs.out2 == res.outputs.out2a == ["Hoi", "Hoi"] + + +# workflows with Left and Right part in splitters A -> B (L&R parts of the splitter) + + +def test_wf_ndstLR_1(worker: str, tmp_path: Path): + """Test workflow with 2 tasks, splitters on tasks levels + The second task has its own simple splitter + and the Left part from the first task should 
be added + """ + + @workflow.define + def Worky(x, y): + add2 = workflow.add(Add2().split("x", x=x), name="add2") + mult = workflow.add(Multiply(x=add2.out).split("y", y=y), name="mult") + return mult.out + + worky = Worky(x=[1, 2], y=[11, 12]) + + outputs = worky(worker=worker, cache_root=tmp_path) + + # checking if the splitter is created properly + wf = Workflow.construct(worky) + assert wf["mult"].state.splitter == ["_add2", "mult.y"] + assert wf["mult"].state.splitter_rpn == ["add2.x", "mult.y", "*"] + + # expected: [({"add2.x": 1, "mult.y": 11}, 33), ({"add2.x": 1, "mult.y": 12}, 36), + # ({"add2.x": 2, "mult.y": 11}, 44), ({"add2.x": 2, "mult.y": 12}, 48)] + assert outputs.out == [33, 36, 44, 48] + + +def test_wf_ndstLR_1a(worker: str, tmp_path: Path): + """Test workflow with 2 tasks, splitters on tasks levels + The second task has splitter that has Left part (from previous state) + and the Right part (it's own splitter) + """ + + @workflow.define + def Worky(x, y): + add2 = workflow.add(Add2().split("x", x=x), name="add2") + mult = workflow.add( + Multiply(x=add2.out).split(["_add2", "y"], y=y), name="mult" + ) + return mult.out + + worky = Worky(x=[1, 2], y=[11, 12]) + + # checking if the splitter is created properly + wf = Workflow.construct(worky) + assert wf["mult"].state.splitter == ["_add2", "mult.y"] + assert wf["mult"].state.splitter_rpn == ["add2.x", "mult.y", "*"] + + # expected: [({"add2.x": 1, "mult.y": 11}, 33), ({"add2.x": 1, "mult.y": 12}, 36), + # ({"add2.x": 2, "mult.y": 11}, 44), ({"add2.x": 2, "mult.y": 12}, 48)] + + outputs = worky(worker=worker, cache_root=tmp_path) + assert outputs.out == [33, 36, 44, 48] + + +def test_wf_ndstLR_2(worker: str, tmp_path: Path): + """Test workflow with 2 tasks, splitters on tasks levels + The second task has its own outer splitter + and the Left part from the first task should be added + """ + + @workflow.define + def Worky(x, y, z): + add2 = workflow.add(Add2().split("x", x=x), name="add2") + addvar = workflow.add( + FunAddVar3(a=add2.out).split(["b", "c"], b=y, c=z), name="addvar" + ) + return addvar.out + + worky = Worky(x=[1, 2, 3], y=[10, 20], z=[100, 200]) + + outputs = worky(worker=worker, cache_root=tmp_path) + + # checking if the splitter is created properly + wf = Workflow.construct(worky) + assert wf["addvar"].state.splitter == ["_add2", ["addvar.b", "addvar.c"]] + assert wf["addvar"].state.splitter_rpn == [ + "add2.x", + "addvar.b", + "addvar.c", + "*", + "*", + ] + + # expected: [({"add2.x": 1, "mult.b": 10, "mult.c": 100}, 113), + # ({"add2.x": 1, "mult.b": 10, "mult.c": 200}, 213), + # ({"add2.x": 1, "mult.b": 20, "mult.c": 100}, 123), + # ({"add2.x": 1, "mult.b": 20, "mult.c": 200}, 223), + # ...] 
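+ # 3 values of add2.x (a = 3, 4, 5) crossed with 2 values of b and 2 of c give
+ # 3 * 2 * 2 = 12 results, ordered with "a" varying slowest, then b, then c
+ # (e.g. a=3: 113, 213, 123, 223; a=4: 114, 214, ...)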
+ assert outputs.out == [ + 113, + 213, + 123, + 223, + 114, + 214, + 124, + 224, + 115, + 215, + 125, + 225, + ] + + +def test_wf_ndstLR_2a(worker: str, tmp_path: Path): + """Test workflow with 2 tasks, splitters on tasks levels + The second task has splitter that has Left part (from previous state) + and the Right part (it's own outer splitter) + """ + + @workflow.define + def Worky(x, y, z): + add2 = workflow.add(Add2().split("x", x=x), name="add2") + addvar = workflow.add( + FunAddVar3(a=add2.out).split(["_add2", ["b", "c"]], b=y, c=z), name="addvar" + ) + + return addvar.out + + worky = Worky(x=[1, 2, 3], y=[10, 20], z=[100, 200]) + + outputs = worky(worker=worker, cache_root=tmp_path) + + # checking if the splitter is created properly + wf = Workflow.construct(worky) + assert wf["addvar"].state.splitter == ["_add2", ["addvar.b", "addvar.c"]] + assert wf["addvar"].state.splitter_rpn == [ + "add2.x", + "addvar.b", + "addvar.c", + "*", + "*", + ] + + # expected: [({"add2.x": 1, "mult.b": 10, "mult.c": 100}, 113), + # ({"add2.x": 1, "mult.b": 10, "mult.c": 200}, 213), + # ({"add2.x": 1, "mult.b": 20, "mult.c": 100}, 123), + # ({"add2.x": 1, "mult.b": 20, "mult.c": 200}, 223), + # ...] + assert outputs.out == [ + 113, + 213, + 123, + 223, + 114, + 214, + 124, + 224, + 115, + 215, + 125, + 225, + ] + + +# workflows with inner splitters A -> B (inner spl) + + +def test_wf_ndstinner_1(worker: str, tmp_path: Path): + """workflow with 2 tasks, + the second task has inner splitter + """ + + @workflow.define(outputs=["out_list", "out"]) + def Worky(x: int): + list = workflow.add(ListOutput(x=x)) + add2 = workflow.add(Add2().split("x", x=list.out), name="add2") + return list.out, add2.out + + worky = Worky(x=1) + + wf = Workflow.construct(worky) + assert wf["add2"].state.splitter == "add2.x" + assert wf["add2"].state.splitter_rpn == ["add2.x"] + + outputs = worky(worker=worker, cache_root=tmp_path) + + assert outputs.out_list == [1, 2, 3] + assert outputs.out == [3, 4, 5] + + +def test_wf_ndstinner_2(worker: str, tmp_path: Path): + """workflow with 2 tasks, + the second task has two inputs and inner splitter from one of the input + """ + + @workflow.define(outputs=["out_list", "out"]) + def Worky(x, y): + list = workflow.add(ListOutput(x=x)) + mult = workflow.add(Multiply(y=y).split("x", x=list.out), name="mult") + return list.out, mult.out + + worky = Worky(x=1, y=10) # + + outputs = worky(worker=worker, cache_root=tmp_path) + + wf = Workflow.construct(worky) + assert wf["mult"].state.splitter == "mult.x" + assert wf["mult"].state.splitter_rpn == ["mult.x"] + + assert outputs.out_list == [1, 2, 3] + assert outputs.out == [10, 20, 30] + + +def test_wf_ndstinner_3(worker: str, tmp_path: Path): + """workflow with 2 tasks, + the second task has two inputs and outer splitter that includes an inner field + """ + + @workflow.define(outputs=["out_list", "out"]) + def Worky(x, y): + list = workflow.add(ListOutput(x=x)) + mult = workflow.add(Multiply().split(["x", "y"], x=list.out, y=y), name="mult") + return list.out, mult.out + + worky = Worky(x=1, y=[10, 100]) + + outputs = worky(worker=worker, cache_root=tmp_path) + + wf = Workflow.construct(worky) + assert wf["mult"].state.splitter == ["mult.x", "mult.y"] + assert wf["mult"].state.splitter_rpn == ["mult.x", "mult.y", "*"] + + assert outputs.out_list == [1, 2, 3] + assert outputs.out == [10, 100, 20, 200, 30, 300] + + +def test_wf_ndstinner_4(worker: str, tmp_path: Path): + """workflow with 3 tasks, + the second task has two inputs and inner splitter 
from one of the input, + the third task has no its own splitter + """ + + @workflow.define(outputs=["out_list", "out"]) + def Worky(x, y): + list = workflow.add(ListOutput(x=x)) + mult = workflow.add(Multiply(y=y).split("x", x=list.out), name="mult") + add2 = workflow.add(Add2(x=mult.out), name="add2") + return list.out, add2.out + + worky = Worky(x=1, y=10) + + outputs = worky(worker=worker, cache_root=tmp_path) + + wf = Workflow.construct(worky) + assert wf["mult"].state.splitter == "mult.x" + assert wf["mult"].state.splitter_rpn == ["mult.x"] + assert wf["add2"].state.splitter == "_mult" + assert wf["add2"].state.splitter_rpn == ["mult.x"] + + assert outputs.out_list == [1, 2, 3] + assert outputs.out == [12, 22, 32] + + +@pytest.mark.flaky(reruns=3) +def test_wf_ndstinner_5(worker: str, tmp_path: Path): + """workflow with 3 tasks, + the second task has two inputs and inner splitter from one of the input, + (inner input come from the first task that has its own splitter, + there is a inner_container_ndim) + the third task has no new splitter + """ + + @workflow.define(outputs=["out_list", "out_mult", "out_add"]) + def Worky(x, y, b): + list = workflow.add(ListOutput().split("x", x=x), name="list") + mult = workflow.add(Multiply().split(["y", "x"], x=list.out, y=y), name="mult") + addvar = workflow.add(FunAddVar(a=mult.out).split("b", b=b), name="addvar") + return list.out, mult.out, addvar.out + + worky = Worky(x=[1, 2], y=[10, 100], b=[3, 5]) + + outputs = worky(worker=worker, cache_root=tmp_path) + + wf = Workflow.construct(worky) + assert wf["mult"].state.splitter == ["_list", ["mult.y", "mult.x"]] + assert wf["mult"].state.splitter_rpn == ["list.x", "mult.y", "mult.x", "*", "*"] + assert wf["addvar"].state.splitter == ["_mult", "addvar.b"] + assert wf["addvar"].state.splitter_rpn == [ + "list.x", + "mult.y", + "mult.x", + "*", + "*", + "addvar.b", + "*", + ] + + assert outputs.out_list == [[1, 2, 3], [2, 4, 6]] + assert outputs.out_mult == [ + 10, + 20, + 30, + 20, + 40, + 60, + 100, + 200, + 300, + 200, + 400, + 600, + ] + assert outputs.out_add == [ + 13, + 15, + 23, + 25, + 33, + 35, + 23, + 25, + 43, + 45, + 63, + 65, + 103, + 105, + 203, + 205, + 303, + 305, + 203, + 205, + 403, + 405, + 603, + 605, + ] + + +# workflow that have some single values as the input + + +def test_wf_st_singl_1(worker: str, tmp_path: Path): + """workflow with two tasks, only one input is in the splitter and combiner""" + + @workflow.define + def Worky(x, y): + mult = workflow.add(Multiply(x=x, y=y), name="mult") + add2 = workflow.add(Add2(x=mult.out), name="add2") + + return add2.out + + worky = Worky(y=11).split("x", x=[1, 2]).combine("x") + + outputs = worky(worker=worker, cache_root=tmp_path) + + assert outputs.out == [13, 24] + + +def test_wf_ndst_singl_1(worker: str, tmp_path: Path): + """workflow with two tasks, outer splitter and combiner on tasks level; + only one input is part of the splitter, the other is a single value + """ + + @workflow.define + def Worky(x, y): + mult = workflow.add(Multiply(y=y).split("x", x=x), name="mult") + add2 = workflow.add(Add2(x=mult.out).combine("mult.x"), name="add2") + return add2.out + + worky = Worky(x=[1, 2], y=11) + + outputs = worky(worker=worker, cache_root=tmp_path) + + assert outputs.out == [13, 24] + + +def test_wf_st_singl_2(worker: str, tmp_path: Path): + """workflow with three tasks, third one connected to two previous tasks, + splitter on the workflow level + only one input is part of the splitter, the other is a single value + """ + + 
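+ # with the split applied at the workflow level over "x" only, the single value
+ # y=11 is held constant across the split, so each result is (x + 2) * (11 + 2):
+ # 39, 52 and 65 for x = 1, 2, 3 (checked by the assertion below)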
@workflow.define + def Worky(x, y): + add2x = workflow.add(Add2(x=x), name="add2x") + add2y = workflow.add(Add2(x=y), name="add2y") + mult = workflow.add(Multiply(x=add2x.out, y=add2y.out), name="mult") + return mult.out + + worky = Worky(y=11).split("x", x=[1, 2, 3]) + + outputs = worky(worker=worker, cache_root=tmp_path) + + assert outputs.out == [39, 52, 65] + + +def test_wf_ndst_singl_2(worker: str, tmp_path: Path): + """workflow with three tasks, third one connected to two previous tasks, + splitter on the tasks levels + only one input is part of the splitter, the other is a single value + """ + + @workflow.define + def Worky(x, y): + add2x = workflow.add(Add2().split("x", x=x), name="add2x") + add2y = workflow.add(Add2(x=y), name="add2y") + mult = workflow.add(Multiply(x=add2x.out, y=add2y.out), name="mult") + return mult.out + + worky = Worky(x=[1, 2, 3], y=11) + + outputs = worky(worker=worker, cache_root=tmp_path) + + assert len(outputs.out) == 3 + assert outputs.out == [39, 52, 65] + + +# workflows with structures worky(A) + + +def test_wfasnd_1(worker: str, tmp_path: Path): + """workflow as a node + workflow-node with one task and no splitter + """ + + @workflow.define + def Wfnd1(x): + add2 = workflow.add(Add2(x=x), name="add2") + return add2.out + + @workflow.define + def Worky(x): + wfnd = workflow.add(Wfnd1(x=x)) + return wfnd.out + + worky = Worky(x=2) + + outputs = worky(worker=worker, cache_root=tmp_path) + + assert outputs.out == 4 + + +def test_wfasnd_wfinp_1(worker: str, tmp_path: Path): + """workflow as a node + workflow-node with one task and no splitter + input set for the main workflow + """ + + @workflow.define + def Wfnd1A(x): + add2 = workflow.add(Add2(x=x), name="add2") + return add2.out + + @workflow.define + def Worky(x): + wfnd = workflow.add(Wfnd1A(x=x)) + return wfnd.out + + worky = Worky(x=2) + + checksum_before = worky._hash + outputs = worky(worker=worker, cache_root=tmp_path) + + wf = Workflow.construct(worky) + assert worky._hash == checksum_before + + assert outputs.out == 4 + + +def test_wfasnd_wfndupdate(worker: str, tmp_path: Path): + """workflow as a node + workflow-node with one task and no splitter + wfasnode input is updated to use the main workflow input + """ + + @workflow.define + def Wfnd(x): + add2 = workflow.add(Add2(x=x), name="add2") + return add2.out + + @workflow.define + def Worky(x): + wfnd = workflow.add(Wfnd(x=x)) + return wfnd.out + + worky = Worky(x=3) + + outputs = worky(worker=worker, cache_root=tmp_path) + + assert outputs.out == 5 + + +def test_wfasnd_wfndupdate_rerun(worker: str, tmp_path: Path): + """workflow as a node + workflow-node with one task and no splitter + wfasnode is run first and later is + updated to use the main workflow input + """ + + @workflow.define + def Wfnd(x): + add2 = workflow.add(Add2(x=x), name="add2") + return add2.out + + wfnd = Wfnd(x=2) + + with Submitter(worker=worker, cache_root=tmp_path) as sub: + sub(wfnd) + + @workflow.define + def Worky(x): + wfnd = workflow.add(Wfnd(x=x)) + return wfnd.out + + worky = Worky(x=3) + + outputs = worky(worker=worker, cache_root=tmp_path) + + assert outputs.out == 5 + + # adding another layer of workflow + @workflow.define + def WorkyO(x): + worky = workflow.add(Worky(x=x)) + return worky.out + + wf_o = WorkyO(x=4) + + outputs = wf_o(worker=worker, cache_root=tmp_path) + + assert outputs.out == 6 + + +def test_wfasnd_st_1(worker: str, tmp_path: Path): + """workflow as a node + workflow-node with one task, + splitter for wfnd + """ + + @workflow.define + def 
Wfnd(x): + add2 = workflow.add(Add2(x=x), name="add2") + return add2.out + + @workflow.define + def Worky(x): + wfnd = workflow.add(Wfnd(x=x).split(x=x)) + return wfnd.out + + worky = Worky(x=[2, 4]) + + checksum_before = worky._hash + outputs = worky(worker=worker, cache_root=tmp_path) + + wf = Workflow.construct(worky) + assert worky._hash == checksum_before + + assert outputs.out == [4, 6] + + +def test_wfasnd_st_updatespl_1(worker: str, tmp_path: Path): + """workflow as a node + workflow-node with one task, + splitter for wfnd is set after add + """ + + @workflow.define + def Wfnd(x): + add2 = workflow.add(Add2(x=x), name="add2") + return add2.out + + @workflow.define + def Worky(x): + wfnd = workflow.add(Wfnd(x=x).split(x=x)) + return wfnd.out + + worky = Worky(x=[2, 4]) + + outputs = worky(worker=worker, cache_root=tmp_path) + + assert outputs.out == [4, 6] + + +def test_wfasnd_ndst_1(worker: str, tmp_path: Path): + """workflow as a node + workflow-node with one task, + splitter for node + """ + + @workflow.define + def Wfnd(x): + add2 = workflow.add(Add2().split("x", x=x), name="add2") + return add2.out + + @workflow.define + def Worky(x): + wfnd = workflow.add(Wfnd(x=x)) + return wfnd.out + + worky = Worky(x=[2, 4]) + + outputs = worky(worker=worker, cache_root=tmp_path) + + assert outputs.out == [4, 6] + + +def test_wfasnd_ndst_updatespl_1(worker: str, tmp_path: Path): + """workflow as a node + workflow-node with one task, + splitter for node added after add + """ + + @workflow.define + def Wfnd(x): + add2 = workflow.add(Add2().split("x", x=x), name="add2") + return add2.out + + @workflow.define + def Worky(x): + wfnd = workflow.add(Wfnd(x=x)) + return wfnd.out + + worky = Worky(x=[2, 4]) + + outputs = worky(worker=worker, cache_root=tmp_path) + + assert outputs.out == [4, 6] + + +def test_wfasnd_wfst_1(worker: str, tmp_path: Path): + """workflow as a node + workflow-node with one task, + splitter for the main workflow + """ + + @workflow.define + def Wfnd1B(x): + add2 = workflow.add(Add2(x=x), name="add2") + return add2.out + + @workflow.define + def Worky(x): + wfnd = workflow.add(Wfnd1B(x=x)) + return wfnd.out + + worky = Worky().split("x", x=[2, 4]) + + outputs = worky(worker=worker, cache_root=tmp_path) + + # assert outputs._cache_dir.exists() + + assert outputs.out[0] == 4 + assert outputs.out[1] == 6 + + +# workflows with structures worky(A) -> B + + +def test_wfasnd_st_2(worker: str, tmp_path: Path): + """workflow as a node, + the main workflow has two tasks, + splitter for wfnd + """ + + @workflow.define + def Wfnd(x, y): + mult = workflow.add(Multiply().split(("x", "y"), x=x, y=y), name="mult") + return mult.out + + @workflow.define + def Worky(x, y): + wfnd = workflow.add(Wfnd(x=x, y=y)) + add2 = workflow.add(Add2().split(x=wfnd.out), name="add2") + return add2.out + + worky = Worky(x=[2, 4], y=[1, 10]) + + outputs = worky(worker=worker, cache_root=tmp_path) + + # assert outputs._cache_dir.exists() + + assert outputs.out == [4, 42] + + +def test_wfasnd_wfst_2(worker: str, tmp_path: Path): + """workflow as a node, + the main workflow has two tasks, + splitter for the main workflow + """ + + @workflow.define + def Wfnd(x, y): + mult = workflow.add(Multiply(x=x, y=y), name="mult") + return mult.out + + @workflow.define + def Worky(x, y): + wfnd = workflow.add(Wfnd(x=x, y=y)) + add2 = workflow.add(Add2(x=wfnd.out), name="add2") + return add2.out + + worky = Worky().split(("x", "y"), x=[2, 4], y=[1, 10]) + + outputs = worky(worker=worker, cache_root=tmp_path) + + # assert 
outputs._cache_dir.exists() + + assert outputs.out[0] == 4 + assert outputs.out[1] == 42 + + +# workflows with structures A -> worky(B) + + +def test_wfasnd_ndst_3(worker: str, tmp_path: Path): + """workflow as the second node, + the main workflow has two tasks, + splitter for the first task + """ + + @workflow.define + def Wfnd(x): + add2 = workflow.add(Add2(x=x), name="add2") + return add2.out + + @workflow.define + def Worky(x, y): + mult = workflow.add(Multiply().split(("x", "y"), x=x, y=y), name="mult") + wfnd = workflow.add(Wfnd(x=mult.out)) + return wfnd.out + + worky = Worky(x=[2, 4], y=[1, 10]) + + outputs = worky(cache_root=tmp_path, worker=worker) + + # assert outputs._cache_dir.exists() + + assert outputs.out == [4, 42] + + +def test_wfasnd_wfst_3(worker: str, tmp_path: Path): + """workflow as the second node, + the main workflow has two tasks, + splitter for the main workflow + """ + + @workflow.define + def Wfnd(x): + add2 = workflow.add(Add2(x=x), name="add2") + return add2.out + + @workflow.define + def Worky(x, y): + mult = workflow.add(Multiply(x=x, y=y), name="mult") + + wfnd = workflow.add(Wfnd(x=mult.out)) + + return wfnd.out + + worky = Worky().split(("x", "y"), x=[2, 4], y=[1, 10]) + + outputs = worky(worker=worker, cache_root=tmp_path) + + # assert outputs._cache_dir.exists() + + assert outputs.out[0] == 4 + assert outputs.out[1] == 42 + + +# workflows with structures wfns(A->B) + + +def test_wfasnd_4(worker: str, tmp_path: Path): + """workflow as a node + workflow-node with two tasks and no splitter + """ + + @workflow.define + def Wfnd(x): + add2_1st = workflow.add(Add2(x=x), name="add2_1st") + add2_2nd = workflow.add(Add2(x=add2_1st.out), name="add2_2nd") + return add2_2nd.out + + @workflow.define + def Worky(x): + wfnd = workflow.add(Wfnd(x=2)) + return wfnd.out + + worky = Worky(x=2) + + outputs = worky(worker=worker, cache_root=tmp_path) + + assert outputs.out == 6 + + +def test_wfasnd_ndst_4(worker: str, tmp_path: Path): + """workflow as a node + workflow-node with two tasks, + splitter for node + """ + + @workflow.define + def Wfnd4(x): + add2_1st = workflow.add(Add2().split(x=x), name="add2_1st") + add2_2nd = workflow.add(Add2(x=add2_1st.out), name="add2_2nd") + return add2_2nd.out + + @workflow.define + def Worky(x): + wfnd = workflow.add(Wfnd4(x=x)) + return wfnd.out + + worky = Worky(x=[2, 4]) + + outputs = worky(worker=worker, cache_root=tmp_path) + + assert outputs.out == [6, 8] + + +def test_wfasnd_wfst_4(worker: str, tmp_path: Path): + """workflow as a node + workflow-node with two tasks, + splitter for the main workflow + """ + + @workflow.define + def Wfnd4A(x): + add2_1st = workflow.add(Add2(x=x), name="add2_1st") + add2_2nd = workflow.add(Add2(x=add2_1st.out), name="add2_2nd") + return add2_2nd.out + + @workflow.define + def Worky(x): + wfnd = workflow.add(Wfnd4A(x=x)) + return wfnd.out + + worky = Worky().split("x", x=[2, 4]) + + outputs = worky(worker=worker, cache_root=tmp_path) + + # assert outputs._cache_dir.exists() + + assert outputs.out == [6, 8] + + +# Testing caching + + +@pytest.mark.flaky(reruns=3) +def test_wf_nostate_cachedir(worker: str, tmp_path: Path): + """worky with provided cache_root using pytest tmp_path""" + cache_root = tmp_path / "test_wf_cache_1" + cache_root.mkdir() + + @workflow.define + def Worky(x, y): + mult = workflow.add(Multiply(x=x, y=y), name="mult") + add2 = workflow.add(Add2(x=mult.out), name="add2") + return add2.out + + worky = Worky(x=2, y=3) + + outputs = worky(worker=worker, cache_root=tmp_path) + + 
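+ # Multiply(2, 3) gives 6 and Add2 adds 2, so the cached result should be 8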
assert 8 == outputs.out + + shutil.rmtree(cache_root) + + +@pytest.mark.flaky(reruns=3) +def test_wf_nostate_cachedir_relativepath(tmp_path, worker): + """worky with provided cache_root as relative path""" + cache_root = tmp_path / "test_wf_cache_2" + cache_root.mkdir() + + @workflow.define + def Worky(x, y): + mult = workflow.add(Multiply(x=x, y=y), name="mult") + add2 = workflow.add(Add2(x=mult.out), name="add2") + return add2.out + + worky = Worky(x=2, y=3) + + outputs = worky(worker=worker, cache_root=tmp_path) + + assert 8 == outputs.out + + shutil.rmtree(cache_root) + + +@pytest.mark.flaky(reruns=3) +def test_wf_nostate_cachelocations(worker: str, tmp_path: Path): + """ + Two identical wfs with provided cache_root; + the second worky has readonly_caches and should not recompute the results + """ + cache_root1 = tmp_path / "test_wf_cache3" + cache_root1.mkdir() + cache_root2 = tmp_path / "test_wf_cache4" + cache_root2.mkdir() + + @workflow.define + def Worky1(x, y): + mult = workflow.add(Multiply(x=x, y=y), name="mult") + add2 = workflow.add(Add2Wait(x=mult.out), name="add2") + return add2.out + + worky1 = Worky1(x=2, y=3) + + t0 = time.time() + with Submitter(worker=worker, cache_root=cache_root1) as sub: + results1 = sub(worky1) + + assert not results1.errored, "\n".join(results1.errors["error message"]) + t1 = time.time() - t0 + + assert 8 == results1.outputs.out + + @workflow.define + def Worky2(x, y): + + mult = workflow.add(Multiply(x=x, y=y), name="mult") + add2 = workflow.add(Add2Wait(x=mult.out), name="add2") + return add2.out + + worky2 = Worky2(x=2, y=3) + + t0 = time.time() + with Submitter( + worker=worker, cache_root=cache_root2, readonly_caches=cache_root1 + ) as sub: + results2 = sub(worky2) + + assert not results2.errored, "\n".join(results2.errors["error message"]) + t2 = time.time() - t0 + + assert 8 == results2.outputs.out + + # checking execution time (for unix and cf) + # for win and dask/slurm the time for dir creation etc. 
might take much longer + if not sys.platform.startswith("win") and worker == "cf": + assert t1 > 2 + assert t2 < max(1, t1 - 1) + + # checking if the second worky didn't run again + + +@pytest.mark.flaky(reruns=3) +def test_wf_nostate_cachelocations_a(worker: str, tmp_path: Path): + """ + the same as previous test, but workflows differ + """ + cache_root1 = tmp_path / "test_wf_cache3" + cache_root1.mkdir() + cache_root2 = tmp_path / "test_wf_cache4" + cache_root2.mkdir() + + @workflow.define + def Worky1(x, y): + mult = workflow.add(Multiply(x=x, y=y), name="mult") + add2 = workflow.add(Add2Wait(x=mult.out), name="add2") + return add2.out + + worky1 = Worky1(x=2, y=3) + + t0 = time.time() + with Submitter(worker=worker, cache_root=cache_root1) as sub: + results1 = sub(worky1) + + assert not results1.errored, "\n".join(results1.errors["error message"]) + t1 = time.time() - t0 + + assert 8 == results1.outputs.out + + @workflow.define + def Worky2(x, y): + mult = workflow.add(Divide(x=x, y=y), name="mult") + add2 = workflow.add(Add2Wait(x=mult.out), name="add2") + return add2.out + + worky2 = Worky2(x=2, y=3) + + t0 = time.time() + with Submitter( + worker=worker, cache_root=cache_root2, readonly_caches=cache_root1 + ) as sub: + results2 = sub(worky2) + + assert not results2.errored, "\n".join(results2.errors["error message"]) + + assert 2 == results2.outputs.out + + # checking if both cache_dirs are created + assert results1.cache_dir != results2.cache_dir + + +@pytest.mark.flaky(reruns=3) +def test_wf_nostate_cachelocations_b(worker: str, tmp_path: Path): + """ + the same as previous test, but the 2nd workflows has two outputs + (connected to the same task output); + the task should not be run and it should be fast, + but the worky itself is triggered and the new output dir is created + """ + cache_root1 = tmp_path / "test_wf_cache3" + cache_root1.mkdir() + cache_root2 = tmp_path / "test_wf_cache4" + cache_root2.mkdir() + + @workflow.define + def Worky1(x, y): + mult = workflow.add(Multiply(x=x, y=y), name="mult") + add2 = workflow.add(Add2Wait(x=mult.out), name="add2") + return add2.out + + worky1 = Worky1(x=2, y=3) + + t0 = time.time() + with Submitter(worker=worker, cache_root=cache_root1) as sub: + results1 = sub(worky1) + + assert not results1.errored, "\n".join(results1.errors["error message"]) + t1 = time.time() - t0 + + assert 8 == results1.outputs.out + + @workflow.define(outputs=["out", "out_pr"]) + def Worky2(x, y): + + mult = workflow.add(Multiply(x=x, y=y), name="mult") + add2 = workflow.add(Add2Wait(x=mult.out), name="add2") + return add2.out, add2.out + + worky2 = Worky2(x=2, y=3) + + t0 = time.time() + with Submitter( + worker=worker, cache_root=cache_root2, readonly_caches=cache_root1 + ) as sub: + results2 = sub(worky2) + + assert not results2.errored, "\n".join(results2.errors["error message"]) + t2 = time.time() - t0 + + assert 8 == results2.outputs.out == results2.outputs.out_pr + + # for win and dask/slurm the time for dir creation etc. 
might take much longer + if not sys.platform.startswith("win") and worker == "cf": + # execution time for second run should be much shorter + assert t1 > 2 + assert t2 < max(1, t1 - 1) + + # checking if the second worky didn't run again + assert results1.cache_dir != results2.cache_dir + + +@pytest.mark.flaky(reruns=3) +def test_wf_nostate_cachelocations_setoutputchange(worker: str, tmp_path: Path): + """ + the same as previous test, but worky output names differ, + the tasks should not be run and it should be fast, + but the worky itself is triggered and the new output dir is created + (the second worky has updated name in its Output) + """ + cache_root1 = tmp_path / "test_wf_cache3" + cache_root1.mkdir() + cache_root2 = tmp_path / "test_wf_cache4" + cache_root2.mkdir() + + @workflow.define(outputs=["out1"]) + def Worky1(x, y): + mult = workflow.add(Multiply(x=x, y=y), name="mult") + add2 = workflow.add(Add2Wait(x=mult.out), name="add2") + return add2.out # out1 + + worky1 = Worky1(x=2, y=3) + + t0 = time.time() + with Submitter(worker=worker, cache_root=cache_root1) as sub: + results1 = sub(worky1) + + assert not results1.errored, "\n".join(results1.errors["error message"]) + t1 = time.time() - t0 + + assert 8 == results1.outputs.out1 + + @workflow.define(outputs=["out2"]) + def Worky2(x, y): + mult = workflow.add(Multiply(x=x, y=y), name="mult") + add2 = workflow.add(Add2Wait(x=mult.out), name="add2") + return add2.out # out2 + + worky2 = Worky2(x=2, y=3) + + t0 = time.time() + with Submitter( + worker=worker, cache_root=cache_root2, readonly_caches=cache_root1 + ) as sub: + results2 = sub(worky2) + + assert not results2.errored, "\n".join(results2.errors["error message"]) + t2 = time.time() - t0 + + assert 8 == results2.outputs.out2 + + # for win and dask/slurm the time for dir creation etc. 
might take much longer + if not sys.platform.startswith("win") and worker == "cf": + # checking execution time (the second worky should be fast, nodes do not have to rerun) + assert t1 > 2 + # testing relative values (windows or slurm takes much longer to create worky itself) + assert t2 < max(1, t1 - 1) + + # both worky cache_dirs should be created + assert results1.cache_dir != results2.cache_dir + + +@pytest.mark.flaky(reruns=3) +def test_wf_nostate_cachelocations_setoutputchange_a(worker: str, tmp_path: Path): + """ + the same as previous test, but worky names and output names differ, + """ + cache_root1 = tmp_path / "test_wf_cache3" + cache_root1.mkdir() + cache_root2 = tmp_path / "test_wf_cache4" + cache_root2.mkdir() + + @workflow.define(outputs=["out1"]) + def Worky1(x, y): + mult = workflow.add(Multiply(x=x, y=y), name="mult") + add2 = workflow.add(Add2Wait(x=mult.out), name="add2") + return add2.out # out1 + + worky1 = Worky1(x=2, y=3) + + t0 = time.time() + with Submitter(worker=worker, cache_root=cache_root1) as sub: + results1 = sub(worky1) + + assert not results1.errored, "\n".join(results1.errors["error message"]) + t1 = time.time() - t0 + + assert 8 == results1.outputs.out1 + + @workflow.define(outputs=["out2"]) + def Worky2(x, y): + mult = workflow.add(Multiply(x=x, y=y), name="mult") + add2 = workflow.add(Add2Wait(x=mult.out), name="add2") + return add2.out + + worky2 = Worky2(x=2, y=3) + + t0 = time.time() + with Submitter( + worker=worker, cache_root=cache_root2, readonly_caches=cache_root1 + ) as sub: + results2 = sub(worky2) + + assert not results2.errored, "\n".join(results2.errors["error message"]) + t2 = time.time() - t0 + + assert 8 == results2.outputs.out2 + + # for win and dask/slurm the time for dir creation etc. might take much longer + if not sys.platform.startswith("win") and worker == "cf": + assert t1 > 2 + # testing relative values (windows or slurm takes much longer to create worky itself) + assert t2 < max(1, t1 - 1) + + # both worky cache_dirs should be created + assert results1.cache_dir != results2.cache_dir + + +@pytest.mark.flaky(reruns=3) +def test_wf_nostate_cachelocations_forcererun(worker: str, tmp_path: Path): + """ + Two identical wfs with provided cache_root; + the second worky has readonly_caches, + but submitter is called with rerun=True, so should recompute + """ + cache_root1 = tmp_path / "test_wf_cache3" + cache_root1.mkdir() + cache_root2 = tmp_path / "test_wf_cache4" + cache_root2.mkdir() + + @workflow.define + def Worky1(x, y): + mult = workflow.add(Multiply(x=x, y=y), name="mult") + add2 = workflow.add(Add2Wait(x=mult.out), name="add2") + return add2.out + + worky1 = Worky1(x=2, y=3) + + t0 = time.time() + with Submitter(worker=worker, cache_root=cache_root1) as sub: + results1 = sub(worky1) + + assert not results1.errored, "\n".join(results1.errors["error message"]) + t1 = time.time() - t0 + + assert 8 == results1.outputs.out + + @workflow.define + def Worky2(x, y): + + mult = workflow.add(Multiply(x=x, y=y), name="mult") + add2 = workflow.add(Add2Wait(x=mult.out), name="add2") + return add2.out + + worky2 = Worky2(x=2, y=3) + + t0 = time.time() + with Submitter(worker=worker, cache_root=cache_root2) as sub: + results2 = sub(worky2, rerun=True) + + assert not results2.errored, "\n".join(results2.errors["error message"]) + t2 = time.time() - t0 + + assert 8 == results2.outputs.out + + # for win and dask/slurm the time for dir creation etc. 
might take much longer + if not sys.platform.startswith("win") and worker == "cf": + # checking execution time + assert t1 > 2 + assert t2 > 2 + + # checking if the second worky didn't run again + assert results1.cache_dir != results2.cache_dir + + +@pytest.mark.flaky(reruns=3) +def test_wf_nostate_cachelocations_wftaskrerun_propagateTrue( + worker: str, tmp_path: Path +): + """ + Two identical wfs with provided cache_root and readonly_caches for the second one; + submitter doesn't have rerun, but the second worky has rerun=True, + propagate_rerun is True as default, so everything should be rerun + """ + cache_root1 = tmp_path / "test_wf_cache1" + cache_root1.mkdir() + cache_root2 = tmp_path / "test_wf_cache2" + cache_root2.mkdir() + + @workflow.define + def Worky1(x, y): + mult = workflow.add(Multiply(x=x, y=y), name="mult") + add2 = workflow.add(Add2Wait(x=mult.out), name="add2") + return add2.out + + worky1 = Worky1(x=2, y=3) + + t0 = time.time() + with Submitter(worker=worker, cache_root=cache_root1) as sub: + results1 = sub(worky1) + + assert not results1.errored, "\n".join(results1.errors["error message"]) + t1 = time.time() - t0 + + assert 8 == results1.outputs.out + + @workflow.define + def Worky2(x, y): + mult = workflow.add(Multiply(x=x, y=y), name="mult") + add2 = workflow.add(Add2Wait(x=mult.out), name="add2") + return add2.out + + worky2 = Worky2(x=2, y=3) + + t0 = time.time() + with Submitter( + worker=worker, cache_root=cache_root2, readonly_caches=cache_root1 + ) as sub: + results2 = sub(worky2, rerun=True) + + assert not results2.errored, "\n".join(results2.errors["error message"]) + t2 = time.time() - t0 + + assert 8 == results2.outputs.out + + # checking if the second worky runs again + assert results1.cache_dir != results2.cache_dir + + # everything has to be recomputed + assert len(list(Path(cache_root1).glob("python-*"))) == 2 + assert len(list(Path(cache_root2).glob("python-*"))) == 2 + + # for win and dask/slurm the time for dir creation etc. 
might take much longer + if not sys.platform.startswith("win") and worker == "cf": + # runtime for recomputed workflows should be about the same + assert abs(t1 - t2) < t1 / 2 + + +@pytest.mark.flaky(reruns=3) +def test_wf_nostate_cachelocations_wftaskrerun_propagateFalse( + worker: str, tmp_path: Path +): + """ + Two identical wfs with provided cache_root and readonly_caches for the second one; + submitter doesn't have rerun, but the second worky has rerun=True, + propagate_rerun is set to False, so worky will be triggered, + but tasks will not have rerun, so will use the previous results + """ + cache_root1 = tmp_path / "test_wf_cache3" + cache_root1.mkdir() + cache_root2 = tmp_path / "test_wf_cache4" + cache_root2.mkdir() + + @workflow.define + def Worky1(x, y): + mult = workflow.add(Multiply(x=x, y=y), name="mult") + add2 = workflow.add(Add2Wait(x=mult.out), name="add2") + return add2.out + + worky1 = Worky1(x=2, y=3) + + t0 = time.time() + with Submitter(worker=worker, cache_root=cache_root1) as sub: + results1 = sub(worky1) + + assert not results1.errored, "\n".join(results1.errors["error message"]) + t1 = time.time() - t0 + + assert 8 == results1.outputs.out + + @workflow.define + def Worky2(x, y): + mult = workflow.add(Multiply(x=x, y=y), name="mult") + add2 = workflow.add(Add2Wait(x=mult.out), name="add2") + return add2.out + + worky2 = Worky2(x=2, y=3) + + t0 = time.time() + with Submitter( + worker=worker, + cache_root=cache_root2, + readonly_caches=cache_root1, + propagate_rerun=False, + ) as sub: + results2 = sub(worky2, rerun=True) + + assert not results2.errored, "\n".join(results2.errors["error message"]) + t2 = time.time() - t0 + + assert 8 == results2.outputs.out + + # checking if the second worky runs again + assert results1.cache_dir != results2.cache_dir + + # for win and dask/slurm the time for dir creation etc. 
might take much longer + if not sys.platform.startswith("win") and worker == "cf": + # checking the time + assert t1 > 2 + assert t2 < max(1, t1 - 1) + + # tasks should not be recomputed + assert len(list(Path(cache_root1).glob("python-*"))) == 2 + assert len(list(Path(cache_root2).glob("python-*"))) == 0 + + +@pytest.mark.xfail( + reason=( + "Cannot specify tasks within a workflow to be rerun, maybe rerun could take a " + "list of task names instead" + ) +) +@pytest.mark.flaky(reruns=3) +def test_wf_nostate_cachelocations_taskrerun_wfrerun_propagateFalse( + worker: str, tmp_path: Path +): + """ + Two identical wfs with provided cache_root, and readonly_caches for the second worky; + submitter doesn't have rerun, but worky has rerun=True, + since propagate_rerun=False, only tasks that have rerun=True will be rerun + """ + cache_root1 = tmp_path / "cache1" + cache_root1.mkdir() + cache_root2 = tmp_path / "cache2" + cache_root2.mkdir() + + @workflow.define + def Worky1(x, y): + mult = workflow.add(Multiply(x=x, y=y), name="mult") + add2 = workflow.add(Add2Wait(x=mult.out), name="add2") + return add2.out + + worky1 = Worky1(x=2, y=3) + + t0 = time.time() + with Submitter(worker=worker, cache_root=cache_root1) as sub: + results1 = sub(worky1) + + assert not results1.errored, "\n".join(results1.errors["error message"]) + t1 = time.time() - t0 + + assert 8 == results1.outputs.out + + @workflow.define + def Worky2(x, y): + mult = workflow.add(Multiply(x=x, y=y), name="mult") + # rerun on the task level needed (wf["propagate_rerun"] is False) + add2 = workflow.add(Add2Wait(x=mult.out), name="add2") + return add2.out + + worky2 = Worky2(x=2, y=3) + + t0 = time.time() + with Submitter( + worker=worker, + cache_root=cache_root2, + readonly_caches=cache_root1, + propagate_rerun=False, + ) as sub: + results2 = sub(worky2, rerun=True) # rerun will not be propagated to each task) + t2 = time.time() - t0 + + assert 8 == results2.outputs.out + + assert results1.cache_dir != results2.cache_dir + # the second task should be recomputed + assert len(list(Path(cache_root1).glob("python-*"))) == 2 + assert len(list(Path(cache_root2).glob("python-*"))) == 1 + + # for win and dask/slurm the time for dir creation etc. 
might take much longer + if not sys.platform.startswith("win") and worker == "cf": + # checking the execution time + assert t1 > 2 + assert t2 > 2 + + +@pytest.mark.flaky(reruns=3) +def test_wf_nostate_nodecachelocations(worker: str, tmp_path: Path): + """ + Two wfs with different input, but the second node has the same input; + the second worky has readonly_caches and should recompute the worky, + but without recomputing the second node + """ + cache_root1 = tmp_path / "test_wf_cache3" + cache_root1.mkdir() + cache_root2 = tmp_path / "test_wf_cache4" + cache_root2.mkdir() + + @workflow.define + def Worky1(x): + ten = workflow.add(Ten(x=x)) + add2 = workflow.add(Add2(x=ten.out), name="add2") + return add2.out + + worky1 = Worky1(x=3) + + with Submitter(worker=worker, cache_root=cache_root1) as sub: + results1 = sub(worky1) + + assert not results1.errored, "\n".join(results1.errors["error message"]) + + assert 12 == results1.outputs.out + + @workflow.define + def Worky2(x, y=None): + + ten = workflow.add(Ten(x=x)) + add2 = workflow.add(Add2(x=ten.out), name="add2") + return add2.out + + worky2 = Worky2(x=2) + + with Submitter( + worker=worker, cache_root=cache_root2, readonly_caches=cache_root1 + ) as sub: + results2 = sub(worky2) + + assert not results2.errored, "\n".join(results2.errors["error message"]) + + assert 12 == results2.outputs.out + + # checking if the second worky runs again, but runs only one task + assert results1.cache_dir != results2.cache_dir + # the second worky should rerun one task + assert len(list(Path(cache_root1).glob("python-*"))) == 2 + assert len(list(Path(cache_root2).glob("python-*"))) == 1 + + +@pytest.mark.flaky(reruns=3) +def test_wf_nostate_nodecachelocations_upd(worker: str, tmp_path: Path): + """ + Two wfs with different input, but the second node has the same input; + the second worky has readonly_caches (set after adding tasks) and should recompute, + but without recomputing the second node + """ + cache_root1 = tmp_path / "test_wf_cache3" + cache_root1.mkdir() + cache_root2 = tmp_path / "test_wf_cache4" + cache_root2.mkdir() + + @workflow.define + def Worky1(x): + ten = workflow.add(Ten(x=x)) + add2 = workflow.add(Add2(x=ten.out), name="add2") + return add2.out + + worky1 = Worky1(x=3) + + with Submitter(worker=worker, cache_root=cache_root1) as sub: + results1 = sub(worky1) + + assert not results1.errored, "\n".join(results1.errors["error message"]) + + assert 12 == results1.outputs.out + + @workflow.define + def Worky2(x, y=None): + ten = workflow.add(Ten(x=x)) + add2 = workflow.add(Add2(x=ten.out), name="add2") + return add2.out + + worky2 = Worky2(x=2) + + with Submitter( + worker=worker, cache_root=cache_root2, readonly_caches=cache_root1 + ) as sub: + results2 = sub(worky2) + + assert not results2.errored, "\n".join(results2.errors["error message"]) + + assert 12 == results2.outputs.out + + # checking if the second worky runs again, but runs only one task + assert results1.cache_dir != results2.cache_dir + # the second worky should have only one task run + assert len(list(Path(cache_root1).glob("python-*"))) == 2 + assert len(list(Path(cache_root2).glob("python-*"))) == 1 + + +@pytest.mark.flaky(reruns=3) +def test_wf_state_cachelocations(worker: str, tmp_path: Path): + """ + Two identical wfs (with states) with provided cache_root; + the second worky has readonly_caches and should not recompute the results + """ + cache_root1 = tmp_path / "test_wf_cache3" + cache_root1.mkdir() + cache_root2 = tmp_path / "test_wf_cache4" + cache_root2.mkdir() + 
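+ # the scalar splitter ("x", "y") zips x=[2, 20] with y=[3, 4], giving
+ # (2 * 3) + 2 = 8 and (20 * 4) + 2 = 82; the second, identical workflow points
+ # readonly_caches at cache_root1, so it should reuse those results instead of
+ # recomputing them, which the timing assertions below are meant to detect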
+ @workflow.define + def Worky1(x, y): + mult = workflow.add(Multiply(x=x, y=y), name="mult") + add2 = workflow.add(Add2Wait(x=mult.out), name="add2") + return add2.out + + worky1 = Worky1().split(("x", "y"), x=[2, 20], y=[3, 4]) + + t0 = time.time() + with Submitter(worker=worker, cache_root=cache_root1) as sub: + results1 = sub(worky1) + + assert not results1.errored, "\n".join(results1.errors["error message"]) + t1 = time.time() - t0 + + assert results1.outputs.out[0] == 8 + assert results1.outputs.out[1] == 82 + + @workflow.define + def Worky2(x, y): + + mult = workflow.add(Multiply(x=x, y=y), name="mult") + add2 = workflow.add(Add2Wait(x=mult.out), name="add2") + return add2.out + + worky2 = Worky2().split(("x", "y"), x=[2, 20], y=[3, 4]) + + t0 = time.time() + with Submitter( + worker=worker, cache_root=cache_root2, readonly_caches=cache_root1 + ) as sub: + results2 = sub(worky2) + + assert not results2.errored, "\n".join(results2.errors["error message"]) + t2 = time.time() - t0 + + assert results2.outputs.out == [8, 82] + + # for win and dask/slurm the time for dir creation etc. might take much longer + if not sys.platform.startswith("win") and worker == "cf": + # checking the execution time + assert t1 > 2 + assert t2 < max(1, t1 - 1) + + # checking all directories + + # checking if the second worky didn't run again + # checking all directories + + assert results1.cache_dir == results2.cache_dir + + +@pytest.mark.flaky(reruns=3) +def test_wf_state_cachelocations_forcererun(worker: str, tmp_path: Path): + """ + Two identical wfs (with states) with provided cache_root; + the second worky has readonly_caches, + but submitter is called with rerun=True, so should recompute + """ + cache_root1 = tmp_path / "test_wf_cache3" + cache_root1.mkdir() + cache_root2 = tmp_path / "test_wf_cache4" + cache_root2.mkdir() + + @workflow.define + def Worky1(x, y): + mult = workflow.add(Multiply(x=x, y=y), name="mult") + add2 = workflow.add(Add2Wait(x=mult.out), name="add2") + return add2.out + + worky1 = Worky1().split(("x", "y"), x=[2, 20], y=[3, 4]) + + t0 = time.time() + with Submitter(worker=worker, cache_root=cache_root1) as sub: + results1 = sub(worky1) + + assert not results1.errored, "\n".join(results1.errors["error message"]) + t1 = time.time() - t0 + + assert results1.outputs.out[0] == 8 + assert results1.outputs.out[1] == 82 + + @workflow.define + def Worky2(x, y): + + mult = workflow.add(Multiply(x=x, y=y), name="mult") + add2 = workflow.add(Add2Wait(x=mult.out), name="add2") + return add2.out + + worky2 = Worky2().split(("x", "y"), x=[2, 20], y=[3, 4]) + + t0 = time.time() + with Submitter(worker=worker, cache_root=cache_root2) as sub: + results2 = sub(worky2, rerun=True) + + assert not results2.errored, "\n".join(results2.errors["error message"]) + t2 = time.time() - t0 + + assert results2.outputs.out[0] == 8 + assert results2.outputs.out[1] == 82 + + # for win and dask/slurm the time for dir creation etc. might take much longer + if not sys.platform.startswith("win") and worker == "cf": + # checking the execution time + assert t1 > 2 + assert t2 > 2 + + # checking all directories + + # checking if the second worky run again + # checking all directories + + +@pytest.mark.flaky(reruns=3) +def test_wf_state_cachelocations_updateinp(worker: str, tmp_path: Path): + """ + Two identical wfs (with states) with provided cache_root; + the second worky has readonly_caches and should not recompute the results + (the lazy input of the node is updated to the correct one, + i.e. 
the same as in worky1, after adding the node to the worky) + """ + cache_root1 = tmp_path / "test_wf_cache3" + cache_root1.mkdir() + cache_root2 = tmp_path / "test_wf_cache4" + cache_root2.mkdir() + + @workflow.define + def Worky1(x, y): + mult = workflow.add(Multiply(x=x, y=y), name="mult") + add2 = workflow.add(Add2Wait(x=mult.out), name="add2") + return add2.out + + worky1 = Worky1().split(("x", "y"), x=[2, 20], y=[3, 4]) + + t0 = time.time() + with Submitter(worker=worker, cache_root=cache_root1) as sub: + results1 = sub(worky1) + + assert not results1.errored, "\n".join(results1.errors["error message"]) + t1 = time.time() - t0 + + assert results1.outputs.out[0] == 8 + assert results1.outputs.out[1] == 82 + + @workflow.define + def Worky2(x, y): + mult = workflow.add(Multiply(x=x, y=y), name="mult") + add2 = workflow.add(Add2Wait(x=mult.out), name="add2") + return add2.out + + worky2 = Worky2().split(("x", "y"), x=[2, 20], y=[3, 4]) + + t0 = time.time() + with Submitter( + worker=worker, cache_root=cache_root2, readonly_caches=cache_root1 + ) as sub: + results2 = sub(worky2) + + assert not results2.errored, "\n".join(results2.errors["error message"]) + t2 = time.time() - t0 + + assert results2.outputs.out[0] == 8 + assert results2.outputs.out[1] == 82 + + # for win and dask/slurm the time for dir creation etc. might take much longer + if not sys.platform.startswith("win") and worker == "cf": + # checking the execution time + assert t1 > 2 + assert t2 < max(1, t1 - 1) + + # checking all directories + + # checking if the second worky didn't run again + # checking all directories + + +@pytest.mark.flaky(reruns=3) +def test_wf_state_n_nostate_cachelocations(worker: str, tmp_path: Path): + """ + Two wfs with provided cache_root, the first one has no state, the second has; + the second worky has readonly_caches and should not recompute only one element + """ + cache_root1 = tmp_path / "test_wf_cache3" + cache_root1.mkdir() + cache_root2 = tmp_path / "test_wf_cache4" + cache_root2.mkdir() + + @workflow.define + def Worky1(x, y): + mult = workflow.add(Multiply(x=x, y=y), name="mult") + add2 = workflow.add(Add2Wait(x=mult.out), name="add2") + return add2.out + + worky1 = Worky1(x=2, y=3) + + with Submitter(worker=worker, cache_root=cache_root1) as sub: + results1 = sub(worky1) + + assert not results1.errored, "\n".join(results1.errors["error message"]) + + assert results1.outputs.out == 8 + + @workflow.define + def Worky2(x, y): + + mult = workflow.add(Multiply(x=x, y=y), name="mult") + add2 = workflow.add(Add2Wait(x=mult.out), name="add2") + return add2.out + + worky2 = Worky2().split(("x", "y"), x=[2, 20], y=[3, 4]) + + with Submitter( + worker=worker, cache_root=cache_root2, readonly_caches=cache_root1 + ) as sub: + results2 = sub(worky2) + + assert not results2.errored, "\n".join(results2.errors["error message"]) + + assert results2.outputs.out[0] == 8 + assert results2.outputs.out[1] == 82 + + +def test_wf_nostate_cachelocations_updated(worker: str, tmp_path: Path): + """ + Two identical wfs with provided cache_root; + the second worky has readonly_caches in init, + that is later overwritten in Submitter.__call__; + the readonly_caches from call doesn't exist so the second task should run again + """ + cache_root1 = tmp_path / "test_wf_cache3" + cache_root1.mkdir() + cache_root1_empty = tmp_path / "test_wf_cache3_empty" + cache_root1_empty.mkdir() + cache_root2 = tmp_path / "test_wf_cache4" + cache_root2.mkdir() + + @workflow.define + def Worky1(x, y): + mult = workflow.add(Multiply(x=x, 
y=y), name="mult") + add2 = workflow.add(Add2Wait(x=mult.out), name="add2") + return add2.out + + worky1 = Worky1(x=2, y=3) + + t0 = time.time() + with Submitter(worker=worker, cache_root=cache_root1) as sub: + results1 = sub(worky1) + + assert not results1.errored, "\n".join(results1.errors["error message"]) + t1 = time.time() - t0 + + assert 8 == results1.outputs.out + + @workflow.define + def Worky2(x, y): + + mult = workflow.add(Multiply(x=x, y=y), name="mult") + add2 = workflow.add(Add2Wait(x=mult.out), name="add2") + return add2.out + + worky2 = Worky2(x=2, y=3) + + t0 = time.time() + # changing readonly_caches to non-existing dir + with Submitter( + worker=worker, cache_root=cache_root2, readonly_caches=cache_root1_empty + ) as sub: + results2 = sub(worky2) + + assert not results2.errored, "\n".join(results2.errors["error message"]) + t2 = time.time() - t0 + + assert 8 == results2.outputs.out + + # for win and dask/slurm the time for dir creation etc. might take much longer + if not sys.platform.startswith("win") and worker == "cf": + # checking the execution time + assert t1 > 2 + assert t2 > 2 + + # checking if both worky run + assert results1.cache_dir != results2.cache_dir + + +@pytest.mark.flaky(reruns=3) +def test_wf_nostate_cachelocations_recompute(worker: str, tmp_path: Path): + """ + Two wfs with the same inputs but slightly different graph; + the second worky should recompute the results, + but the second node should use the results from the first worky (has the same input) + """ + cache_root1 = tmp_path / "test_wf_cache3" + cache_root1.mkdir() + cache_root2 = tmp_path / "test_wf_cache4" + cache_root2.mkdir() + + @workflow.define + def Worky1(x, y): + mult = workflow.add(Multiply(x=x, y=y), name="mult") + add2 = workflow.add(Add2(x=mult.out), name="add2") + return add2.out + + worky1 = Worky1(x=2, y=3) + + with Submitter(worker=worker, cache_root=cache_root1) as sub: + results1 = sub(worky1) + + assert not results1.errored, "\n".join(results1.errors["error message"]) + + assert 8 == results1.outputs.out + + @workflow.define + def Worky2(x, y): + + # different argument assignment + mult = workflow.add(Multiply(x=y, y=x), name="mult") + add2 = workflow.add(Add2(x=mult.out), name="add2") + return add2.out + + worky2 = Worky2(x=2, y=3) + + with Submitter( + worker=worker, cache_root=cache_root2, readonly_caches=cache_root1 + ) as sub: + results2 = sub(worky2) + + assert not results2.errored, "\n".join(results2.errors["error message"]) + + assert 8 == results2.outputs.out + + # checking if both dir exists + assert results1.cache_dir != results2.cache_dir + + # the second worky should have only one task run + assert len(list(Path(cache_root1).glob("python-*"))) == 2 + assert len(list(Path(cache_root2).glob("python-*"))) == 1 + + +@pytest.mark.flaky(reruns=3) +def test_wf_ndstate_cachelocations(worker: str, tmp_path: Path): + """ + Two wfs with identical inputs and node states; + the second worky has readonly_caches and should not recompute the results + """ + cache_root1 = tmp_path / "test_wf_cache3" + cache_root1.mkdir() + cache_root2 = tmp_path / "test_wf_cache4" + cache_root2.mkdir() + + @workflow.define + def Worky1(x, y): + mult = workflow.add(Multiply().split(("x", "y"), x=x, y=y), name="mult") + add2 = workflow.add(Add2Wait(x=mult.out), name="add2") + return add2.out + + worky1 = Worky1(x=[2, 20], y=[3, 4]) + + t0 = time.time() + with Submitter(worker=worker, cache_root=cache_root1) as sub: + results1 = sub(worky1) + + assert not results1.errored, 
"\n".join(results1.errors["error message"]) + t1 = time.time() - t0 + + assert results1.outputs.out == [8, 82] + + @workflow.define + def Worky2(x, y): + + mult = workflow.add(Multiply().split(("x", "y"), x=x, y=y), name="mult") + add2 = workflow.add(Add2Wait(x=mult.out), name="add2") + return add2.out + + worky2 = Worky2(x=[2, 20], y=[3, 4]) + + t0 = time.time() + with Submitter( + worker=worker, cache_root=cache_root2, readonly_caches=cache_root1 + ) as sub: + results2 = sub(worky2) + + assert not results2.errored, "\n".join(results2.errors["error message"]) + t2 = time.time() - t0 + + assert results2.outputs.out == [8, 82] + + # for win and dask/slurm the time for dir creation etc. might take much longer + if not sys.platform.startswith("win") and worker == "cf": + # checking the execution time + assert t1 > 2 + assert t2 < max(1, t1 - 1) + + +@pytest.mark.flaky(reruns=3) +def test_wf_ndstate_cachelocations_forcererun(worker: str, tmp_path: Path): + """ + Two wfs with identical inputs and node states; + the second worky has readonly_caches, + but submitter is called with rerun=True, so should recompute + """ + cache_root1 = tmp_path / "test_wf_cache3" + cache_root1.mkdir() + cache_root2 = tmp_path / "test_wf_cache4" + cache_root2.mkdir() + + @workflow.define + def Worky1(x, y): + mult = workflow.add(Multiply().split(("x", "y"), x=x, y=y), name="mult") + add2 = workflow.add(Add2Wait(x=mult.out), name="add2") + return add2.out + + worky1 = Worky1(x=[2, 20], y=[3, 4]) + + t0 = time.time() + with Submitter(worker=worker, cache_root=cache_root1) as sub: + results1 = sub(worky1) + + assert not results1.errored, "\n".join(results1.errors["error message"]) + t1 = time.time() - t0 + + assert results1.outputs.out == [8, 82] + + @workflow.define + def Worky2(x, y): + + mult = workflow.add(Multiply().split(("x", "y"), x=x, y=y), name="mult") + add2 = workflow.add(Add2Wait(x=mult.out), name="add2") + return add2.out + + worky2 = Worky2(x=[2, 20], y=[3, 4]) + + t0 = time.time() + with Submitter(worker=worker, cache_root=cache_root2) as sub: + results2 = sub(worky2, rerun=True) + + assert not results2.errored, "\n".join(results2.errors["error message"]) + t2 = time.time() - t0 + + assert results2.outputs.out == [8, 82] + + # for win and dask/slurm the time for dir creation etc. 
might take much longer + if not sys.platform.startswith("win") and worker == "cf": + # checking the execution time + assert t1 > 2 + assert t2 > 2 + + # checking all directories + + # checking if the second worky run again + + +@pytest.mark.flaky(reruns=3) +def test_wf_ndstate_cachelocations_updatespl(worker: str, tmp_path: Path): + """ + Two wfs with identical inputs and node state (that is set after adding the node!); + the second worky has readonly_caches and should not recompute the results + """ + cache_root1 = tmp_path / "test_wf_cache3" + cache_root1.mkdir() + cache_root2 = tmp_path / "test_wf_cache4" + cache_root2.mkdir() + + @workflow.define + def Worky1(x, y): + mult = workflow.add(Multiply().split(("x", "y"), x=x, y=y), name="mult") + add2 = workflow.add(Add2Wait(x=mult.out), name="add2") + return add2.out + + worky1 = Worky1(x=[2, 20], y=[3, 4]) + + t0 = time.time() + with Submitter(worker=worker, cache_root=cache_root1) as sub: + results1 = sub(worky1) + + assert not results1.errored, "\n".join(results1.errors["error message"]) + t1 = time.time() - t0 + + assert results1.outputs.out == [8, 82] + + @workflow.define + def Worky2(x, y): + + mult = workflow.add(Multiply().split(("x", "y"), x=x, y=y), name="mult") + add2 = workflow.add(Add2Wait(x=mult.out), name="add2") + return add2.out + + worky2 = Worky2(x=[2, 20], y=[3, 4]) + + t0 = time.time() + with Submitter( + worker=worker, cache_root=cache_root2, readonly_caches=cache_root1 + ) as sub: + results2 = sub(worky2) + + assert not results2.errored, "\n".join(results2.errors["error message"]) + t2 = time.time() - t0 + + assert results2.outputs.out == [8, 82] + + # for win and dask/slurm the time for dir creation etc. might take much longer + if not sys.platform.startswith("win") and worker == "cf": + # checking the execution time + assert t1 > 2 + assert t2 < max(1, t1 - 1) + + # checking all directories + + # checking if the second worky didn't run again + # checking all directories + + +@pytest.mark.flaky(reruns=3) +def test_wf_ndstate_cachelocations_recompute(worker: str, tmp_path: Path): + """ + Two wfs (with nodes with states) with provided cache_root; + the second worky has readonly_caches and should not recompute the results + """ + cache_root1 = tmp_path / "test_wf_cache3" + cache_root1.mkdir() + cache_root2 = tmp_path / "test_wf_cache4" + cache_root2.mkdir() + + @workflow.define + def Worky1(x, y): + mult = workflow.add(Multiply().split(("x", "y"), x=x, y=y), name="mult") + add2 = workflow.add(Add2Wait(x=mult.out), name="add2") + return add2.out + + worky1 = Worky1(x=[2, 20], y=[3, 4]) + + t0 = time.time() + with Submitter(worker=worker, cache_root=cache_root1) as sub: + results1 = sub(worky1) + + assert not results1.errored, "\n".join(results1.errors["error message"]) + t1 = time.time() - t0 + + assert results1.outputs.out == [8, 82] + + @workflow.define + def Worky2(x, y): + + mult = workflow.add(Multiply().split(["x", "y"], x=x, y=y), name="mult") + add2 = workflow.add(Add2Wait(x=mult.out), name="add2") + return add2.out + + worky2 = Worky2(x=[2, 20], y=[3, 4]) + + t0 = time.time() + with Submitter( + worker=worker, cache_root=cache_root2, readonly_caches=cache_root1 + ) as sub: + results2 = sub(worky2) + + assert not results2.errored, "\n".join(results2.errors["error message"]) + t2 = time.time() - t0 + + assert results2.outputs.out == [8, 10, 62, 82] + + # for win and dask/slurm the time for dir creation etc. 
might take much longer + if not sys.platform.startswith("win") and worker == "cf": + # checking the execution time + assert t1 > 2 + assert t2 > 2 + + # checking all directories + + # checking if the second worky didn't run again + # checking all directories + + +@pytest.mark.flaky(reruns=3) +def test_wf_nostate_runtwice_usecache(worker: str, tmp_path: Path): + """ + running workflow (without state) twice, + the second run should use the results from the first one + """ + cache_root1 = tmp_path / "test_wf_cache3" + cache_root1.mkdir() + + @workflow.define + def Worky1(x, y): + mult = workflow.add(Multiply(x=x, y=y), name="mult") + add2 = workflow.add(Add2Wait(x=mult.out), name="add2") + return add2.out + + worky1 = Worky1(x=2, y=3) + + t0 = time.time() + with Submitter(worker=worker, cache_root=cache_root1) as sub: + results1 = sub(worky1) + + assert not results1.errored, "\n".join(results1.errors["error message"]) + t1 = time.time() - t0 + + assert 8 == results1.outputs.out + # checkoing cache_dir after the first run + + # saving the content of the cache dit after the first run + cache_root_content = os.listdir(cache_root1) + + # running workflow the second time + t0 = time.time() + with Submitter(worker=worker, cache_root=cache_root1) as sub: + results1 = sub(worky1) + + assert not results1.errored, "\n".join(results1.errors["error message"]) + t2 = time.time() - t0 + + assert 8 == results1.outputs.out + # checking if no new directory is created + assert cache_root_content == os.listdir(cache_root1) + + # for win and dask/slurm the time for dir creation etc. might take much longer + if not sys.platform.startswith("win") and worker == "cf": + # checking the execution time + assert t1 > 2 + assert t2 < max(1, t1 - 1) + + +def test_wf_state_runtwice_usecache(worker: str, tmp_path: Path): + """ + running workflow with a state twice, + the second run should use the results from the first one + """ + cache_root1 = tmp_path / "test_wf_cache3" + cache_root1.mkdir() + + @workflow.define + def Worky1(x, y): + mult = workflow.add(Multiply(x=x, y=y), name="mult") + add2 = workflow.add(Add2Wait(x=mult.out), name="add2") + return add2.out + + worky1 = Worky1().split(("x", "y"), x=[2, 20], y=[3, 30]) + + t0 = time.time() + with Submitter(worker=worker, cache_root=cache_root1) as sub: + results1 = sub(worky1) + + assert not results1.errored, "\n".join(results1.errors["error message"]) + t1 = time.time() - t0 + + assert 8 == results1.outputs.out[0] + assert 602 == results1.outputs.out[1] + + # checkoing cache_dir after the first run + assert results1.cache_dir.exists() + + # saving the content of the cache dit after the first run + cache_root_content = os.listdir(results1.job.cache_root) + + # running workflow the second time + t0 = time.time() + with Submitter(worker=worker, cache_root=cache_root1) as sub: + results1 = sub(worky1) + + assert not results1.errored, "\n".join(results1.errors["error message"]) + t2 = time.time() - t0 + + assert 8 == results1.outputs.out[0] + assert 602 == results1.outputs.out[1] + # checking if no new directory is created + assert cache_root_content == os.listdir(results1.job.cache_root) + # for win and dask/slurm the time for dir creation etc. 
might take much longer + if not sys.platform.startswith("win") and worker == "cf": + # checking the execution time + assert t1 > 2 + assert t2 < max(1, t1 - 1) + + +@pytest.fixture +def create_tasks(): + @workflow.define + def Worky(x): + t1 = workflow.add(Add2(x=x), name="t1") + t2 = workflow.add(Multiply(x=t1.out, y=2), name="t2") + return t2.out + + worky = Worky(x=1) + workflow_obj = Workflow.construct(worky) + t1 = workflow_obj["t1"] + t2 = workflow_obj["t2"] + return worky, t1, t2 + + +def test_workflow_combine1(tmp_path: Path): + @workflow.define(outputs=["out_pow", "out_iden1", "out_iden2"]) + def Worky1(a, b): + power = workflow.add(Power().split(["a", "b"], a=a, b=b), name="power") + identity1 = workflow.add( + Identity(x=power.out).combine("power.a"), name="identity1" + ) + identity2 = workflow.add( + Identity(x=identity1.out).combine("power.b"), name="identity2" + ) + return power.out, identity1.out, identity2.out + + worky1 = Worky1(a=[1, 2], b=[2, 3]) + outputs = worky1() + + assert outputs.out_pow == [1, 1, 4, 8] + assert outputs.out_iden1 == [[1, 4], [1, 8]] + assert outputs.out_iden2 == [[1, 4], [1, 8]] + + +def test_workflow_combine2(tmp_path: Path): + @workflow.define(outputs=["out_pow", "out_iden"]) + def Worky1(a, b): + power = workflow.add( + Power().split(["a", "b"], a=a, b=b).combine("a"), name="power" + ) + identity = workflow.add(Identity(x=power.out).combine("power.b")) + return power.out, identity.out + + worky1 = Worky1(a=[1, 2], b=[2, 3]) + outputs = worky1(cache_root=tmp_path) + + assert outputs.out_pow == [[1, 4], [1, 8]] + assert outputs.out_iden == [[1, 4], [1, 8]] + + +def test_wf_resultfile_1(worker: str, tmp_path: Path): + """workflow with a file in the result, file should be copied to the worky dir""" + + @workflow.define(outputs=["wf_out"]) + def Worky(x): + writefile = workflow.add(FunWriteFile(filename=x)) + + return writefile.out # + + worky = Worky(x="file_1.txt") + outputs = worky(worker=worker, cache_root=tmp_path) + + # checking if the file exists and if it is in the Worky directory + wf_out = outputs.wf_out.fspath + assert wf_out.exists() + assert wf_out == outputs._cache_dir / "file_1.txt" + + +def test_wf_resultfile_2(worker: str, tmp_path: Path): + """workflow with a list of files in the worky result, + all files should be copied to the worky dir + """ + + @workflow.define(outputs=["wf_out"]) + def Worky(x): + writefile = workflow.add(FunWriteFileList(filename_list=x)) + + return writefile.out # + + file_list = ["file_1.txt", "file_2.txt", "file_3.txt"] + worky = Worky(x=file_list) + outputs = worky(worker=worker, cache_root=tmp_path) + + # checking if the file exists and if it is in the Worky directory + for ii, file in enumerate(outputs.wf_out): + assert file.fspath.exists() + assert file.fspath == outputs._cache_dir / file_list[ii] + + +def test_wf_resultfile_3(worker: str, tmp_path: Path): + """workflow with a dictionaries of files in the worky result, + all files should be copied to the worky dir + """ + + @workflow.define(outputs=["wf_out"]) + def Worky(x): + writefile = workflow.add(FunWriteFileList2Dict(filename_list=x)) + + return writefile.out # + + file_list = ["file_1.txt", "file_2.txt", "file_3.txt"] + worky = Worky(x=file_list) + outputs = worky(worker=worker, cache_root=tmp_path) + + # checking if the file exists and if it is in the Worky directory + for key, val in outputs.wf_out.items(): + if key == "random_int": + assert val == 20 + else: + assert val.fspath.exists() + ii = int(key.split("_")[1]) + assert val.fspath == 
outputs._cache_dir / file_list[ii] + + +def test_wf_upstream_error1(tmp_path: Path): + """workflow with two tasks, task2 dependent on an task1 which raised an error""" + + @workflow.define + def Worky(x): + addvar1 = workflow.add(FunAddVarDefaultNoType(a=x), name="addvar1") + addvar2 = workflow.add(FunAddVarDefaultNoType(a=addvar1.out), name="addvar2") + return addvar2.out + + worky = Worky(x="hi") # TypeError for adding str and int + + with pytest.raises(RuntimeError) as excinfo: + worky(worker="cf", cache_root=tmp_path) + assert "addvar1" in str(excinfo.value) + assert "failed with errors" in str(excinfo.value) + + +def test_wf_upstream_error2(tmp_path: Path): + """task2 dependent on task1, task1 errors, workflow-level split on task 1 + goal - workflow finish running, one output errors but the other doesn't + """ + + @workflow.define + def Worky(x): + addvar1 = workflow.add(FunAddVarDefaultNoType(a=x), name="addvar1") + + addvar2 = workflow.add(FunAddVarDefaultNoType(a=addvar1.out), name="addvar2") + return addvar2.out + + worky = Worky().split( + "x", x=[1, "hi"] + ) # workflow-level split TypeError for adding str and int + + with pytest.raises(Exception) as excinfo: + worky(worker="cf", cache_root=tmp_path) + assert "addvar1" in str(excinfo.value) + assert "failed with errors" in str(excinfo.value) + + +@pytest.mark.flaky(reruns=2) # when slurm +def test_wf_upstream_error3(tmp_path: Path): + """task2 dependent on task1, task1 errors, task-level split on task 1 + goal - workflow finish running, one output errors but the other doesn't + """ + + @workflow.define + def Worky(x): + addvar1 = workflow.add(FunAddVarDefaultNoType().split("a", a=x), name="addvar1") + + addvar2 = workflow.add(FunAddVarDefaultNoType(a=addvar1.out), name="addvar2") + return addvar2.out + + worky = Worky(x=[1, "hi"]) # TypeError for adding str and int + with pytest.raises(RuntimeError) as excinfo: + worky(worker="cf", cache_root=tmp_path) + assert "addvar1" in str(excinfo.value) + assert "failed with errors" in str(excinfo.value) + + +def test_wf_upstream_error4(tmp_path: Path): + """workflow with one task, which raises an error""" + + @workflow.define + def Worky(x): + addvar1 = workflow.add(FunAddVarDefaultNoType(a=x), name="addvar1") + + return addvar1.out + + worky = Worky(x="hi") # TypeError for adding str and int + with pytest.raises(Exception) as excinfo: + worky(worker="cf", cache_root=tmp_path) + assert "failed with errors" in str(excinfo.value) + assert "addvar1" in str(excinfo.value) + + +def test_wf_upstream_error5(tmp_path: Path): + """nested workflow with one task, which raises an error""" + + @workflow.define + def Worky(x): + addvar1 = workflow.add(FunAddVarDefaultNoType(a=x), name="addvar1") + return addvar1.out # wf_out + + @workflow.define + def WfMain(x): + worky = workflow.add(Worky(x=x)) + return worky.out + + wf_main = WfMain(x="hi") # TypeError for adding str and int + + with pytest.raises(Exception) as excinfo: + wf_main(worker="cf", cache_root=tmp_path) + + assert "addvar1" in str(excinfo.value) + assert "failed with errors" in str(excinfo.value) + + +def test_wf_upstream_error6(tmp_path: Path): + """nested workflow with two tasks, the first one raises an error""" + + @workflow.define(outputs=["wf_out"]) + def Worky(x): + addvar1 = workflow.add(FunAddVarDefaultNoType(a=x), name="addvar1") + addvar2 = workflow.add(FunAddVarDefaultNoType(a=addvar1.out), name="addvar2") + + return addvar2.out # + + @workflow.define + def WfMain(x): + worky = workflow.add(Worky(x=x)) + return worky.wf_out + + 
wf_main = WfMain(x="hi") # TypeError for adding str and int + + with pytest.raises(RuntimeError) as excinfo: + wf_main(worker="cf", cache_root=tmp_path) + + assert "addvar1" in str(excinfo.value) + assert "failed with errors" in str(excinfo.value) + + +def test_wf_upstream_error7(tmp_path: Path): + """ + workflow with three sequential tasks, the first task raises an error + the last task is set as the workflow output + """ + + @workflow.define + def Worky(x): + addvar1 = workflow.add(FunAddVarDefaultNoType(a=x), name="addvar1") + + addvar2 = workflow.add(FunAddVarDefaultNoType(a=addvar1.out), name="addvar2") + addvar3 = workflow.add(FunAddVarDefaultNoType(a=addvar2.out), name="addvar3") + return addvar3.out + + worky = Worky(x="hi") # TypeError for adding str and int + + with Submitter(worker="cf", cache_root=tmp_path) as sub: + results = sub(worky) + error_message = "".join(results.errors["error message"]) + assert "addvar1" in error_message + assert "failed with errors" in error_message + + graph = results.job.return_values["exec_graph"] + assert graph["addvar1"].has_errored is True + assert list(graph["addvar2"].unrunnable.values()) == [[graph["addvar1"]]] + assert list(graph["addvar3"].unrunnable.values()) == [[graph["addvar2"]]] + + +def test_wf_upstream_error7a(tmp_path: Path): + """ + workflow with three sequential tasks, the first task raises an error + the second task is set as the workflow output + """ + + @workflow.define + def Worky(x): + addvar1 = workflow.add(FunAddVarDefaultNoType(a=x), name="addvar1") + + addvar2 = workflow.add(FunAddVarDefaultNoType(a=addvar1.out), name="addvar2") + addvar3 = workflow.add(FunAddVarDefaultNoType(a=addvar2.out), name="addvar3") + return addvar3.out + + worky = Worky(x="hi") # TypeError for adding str and int + with Submitter(worker="cf", cache_root=tmp_path) as sub: + results = sub(worky) + error_message = "".join(results.errors["error message"]) + assert "addvar1" in error_message + assert "failed with errors" in error_message + + graph = results.job.return_values["exec_graph"] + assert graph["addvar1"].has_errored is True + assert list(graph["addvar2"].unrunnable.values()) == [[graph["addvar1"]]] + assert list(graph["addvar3"].unrunnable.values()) == [[graph["addvar2"]]] + + +def test_wf_upstream_error7b(tmp_path: Path): + """ + workflow with three sequential tasks, the first task raises an error + the second and the third tasks are set as the workflow output + """ + + @workflow.define(outputs=["out1", "out2"]) + def Worky(x): + addvar1 = workflow.add(FunAddVarDefaultNoType(a=x), name="addvar1") + + addvar2 = workflow.add(FunAddVarDefaultNoType(a=addvar1.out), name="addvar2") + addvar3 = workflow.add(FunAddVarDefaultNoType(a=addvar2.out), name="addvar3") + return addvar2.out, addvar3.out # + + worky = Worky(x="hi") # TypeError for adding str and int + with Submitter(worker="cf", cache_root=tmp_path) as sub: + results = sub(worky) + error_message = "".join(results.errors["error message"]) + assert "addvar1" in error_message + assert "failed with errors" in error_message + + graph = results.job.return_values["exec_graph"] + assert graph["addvar1"].has_errored is True + assert list(graph["addvar2"].unrunnable.values()) == [[graph["addvar1"]]] + assert list(graph["addvar3"].unrunnable.values()) == [[graph["addvar2"]]] + + +def test_wf_upstream_error8(tmp_path: Path): + """workflow with three tasks, the first one raises an error, so 2 others are removed""" + + @workflow.define(outputs=["out1", "out2"]) + def Worky(x): + addvar1 = 
workflow.add(FunAddVarDefaultNoType(a=x), name="addvar1") + + addvar2 = workflow.add(FunAddVarDefaultNoType(a=addvar1.out), name="addvar2") + addtwo = workflow.add(FunAddTwo(a=addvar1.out), name="addtwo") + return addvar2.out, addtwo.out # + + worky = Worky(x="hi") # TypeError for adding str and int + with Submitter(worker="cf", cache_root=tmp_path) as sub: + results = sub(worky) + error_message = "".join(results.errors["error message"]) + assert "addvar1" in error_message + assert "failed with errors" in error_message + + graph = results.job.return_values["exec_graph"] + assert graph["addvar1"].has_errored is True + + assert list(graph["addvar2"].unrunnable.values()) == [[graph["addvar1"]]] + assert list(graph["addtwo"].unrunnable.values()) == [[graph["addvar1"]]] + + +def test_wf_upstream_error9(tmp_path: Path): + """ + workflow with five tasks with two "branches", + one branch has an error, the second is fine + the errored branch is connected to the workflow output + """ + + @workflow.define + def Worky(x): + addvar1 = workflow.add(FunAddVarDefaultNoType(a=x), name="addvar1") + err = workflow.add(FunAddVarNoType(a=addvar1.out, b="hi"), name="err") + follow_err = workflow.add(FunAddVarDefaultNoType(a=err.out), name="follow_err") + addtwo = workflow.add(FunAddTwoNoType(a=addvar1.out), name="addtwo") + workflow.add(FunAddVarDefaultNoType(a=addtwo.out)) + return follow_err.out # out1 + + worky = Worky(x=2) + with Submitter(worker="cf", cache_root=tmp_path) as sub: + results = sub(worky) + error_message = "".join(results.errors["error message"]) + assert "err" in error_message + assert "failed with errors" in error_message + + graph = results.job.return_values["exec_graph"] + assert graph["err"].has_errored is True + assert list(graph["follow_err"].unrunnable.values()) == [[graph["err"]]] + + +def test_wf_upstream_error9a(tmp_path: Path): + """ + workflow with five tasks with two "branches", + one branch has an error, the second is fine + the branch without error is connected to the workflow output + so the workflow finished clean + """ + + @workflow.define(outputs=["out1"]) + def Worky(x): + addvar1 = workflow.add(FunAddVarDefault(a=x), name="addvar1") + err = workflow.add(FunAddVarNoType(a=addvar1.out, b="hi"), name="err") + workflow.add(FunAddVarDefault(a=err.out), name="follow_err") + addtwo = workflow.add(FunAddTwoNoType(a=addvar1.out), name="addtwo") + addvar2 = workflow.add(FunAddVarDefault(a=addtwo.out), name="addvar2") + return addvar2.out + + worky = Worky(x=2) + + with Submitter(worker="cf", cache_root=tmp_path) as sub: + results = sub(worky) + error_message = "".join(results.errors["error message"]) + assert "err" in error_message + assert "failed with errors" in error_message + + graph = results.job.return_values["exec_graph"] + assert graph["err"].has_errored is True + assert list(graph["follow_err"].unrunnable.values()) == [[graph["err"]]] + + +def test_wf_upstream_error9b(tmp_path: Path): + """ + workflow with five tasks with two "branches", + one branch has an error, the second is fine + both branches are connected to the workflow output + """ + + @workflow.define(outputs=["out1", "out2"]) + def Worky(x): + addvar1 = workflow.add(FunAddVarDefaultNoType(a=x), name="addvar1") + err = workflow.add(FunAddVarNoType(a=addvar1.out, b="hi"), name="err") + follow_err = workflow.add(FunAddVarDefaultNoType(a=err.out), name="follow_err") + addtwo = workflow.add(FunAddTwoNoType(a=addvar1.out), name="addtwo") + addvar2 = workflow.add(FunAddVarDefaultNoType(a=addtwo.out), name="addvar2") + 
return follow_err.out, addvar2.out + + worky = Worky(x=2) + + with Submitter(worker="cf", cache_root=tmp_path) as sub: + results = sub(worky) + error_message = "".join(results.errors["error message"]) + assert "err" in error_message + assert "failed with errors" in error_message + + graph = results.job.return_values["exec_graph"] + assert graph["err"].has_errored is True + assert list(graph["follow_err"].unrunnable.values()) == [[graph["err"]]] + + +def exporting_graphs(worky, name, out_dir): + """helper function to run dot to create png/pdf files from dotfiles""" + # exporting the simple graph + dotfile_pr, formatted_dot = plot_workflow(worky, out_dir, export=True, name=name) + assert len(formatted_dot) == 1 + assert formatted_dot[0] == dotfile_pr.with_suffix(".png") + assert formatted_dot[0].exists() + print("\n png of a simple graph in: ", formatted_dot[0]) + # exporting nested graph + dotfile_pr, formatted_dot = plot_workflow( + worky, out_dir, plot_type="nested", export=["pdf", "png"], name=f"{name}_nest" + ) + assert len(formatted_dot) == 2 + assert formatted_dot[0] == dotfile_pr.with_suffix(".pdf") + assert formatted_dot[0].exists() + print("\n pdf of the nested graph in: ", formatted_dot[0]) + # detailed graph + dotfile_pr, formatted_dot = plot_workflow( + worky, out_dir, plot_type="detailed", export="pdf", name=f"{name}_det" + ) + assert len(formatted_dot) == 1 + assert formatted_dot[0] == dotfile_pr.with_suffix(".pdf") + assert formatted_dot[0].exists() + print("\n pdf of the detailed graph in: ", formatted_dot[0]) + + +@pytest.mark.parametrize("splitter", [None, "x"]) +def test_graph_simple(tmp_path, splitter): + """creating a set of graphs, worky with two nodes""" + + @workflow.define + def Worky(x=1, y=2): + mult_1 = workflow.add(Multiply(x=x, y=y), name="mult_1") + workflow.add(Multiply(x=x, y=x), name="mult_2") + add2 = workflow.add(Add2(x=mult_1.out), name="add2") + return add2.out + + worky = Worky().split(splitter, x=[1, 2]) + + # simple graph + dotfile_s = plot_workflow(worky, tmp_path, name="simple") + dotstr_s_lines = dotfile_s.read_text().split("\n") + assert "mult_1" in dotstr_s_lines + assert "mult_2" in dotstr_s_lines + assert "add2" in dotstr_s_lines + assert "mult_1 -> add2" in dotstr_s_lines + + +@pytest.mark.parametrize("splitter", [None, "x"]) +def test_graph_nested(tmp_path, splitter): + """creating a set of graphs, worky with two nodes""" + + @workflow.define + def Worky(x=1, y=2): + mult_1 = workflow.add(Multiply(x=x, y=y), name="mult_1") + workflow.add(Multiply(x=x, y=x), name="mult_2") + add2 = workflow.add(Add2(x=mult_1.out), name="add2") + return add2.out + + worky = Worky().split(splitter, x=[1, 2]) + + # nested graph (should have the same elements) + dotfile_n = plot_workflow( + worky, tmp_path, lazy=["x", "y"], plot_type="nested", name="nested" + ) + dotstr_n_lines = dotfile_n.read_text().split("\n") + assert "mult_1" in dotstr_n_lines + assert "mult_2" in dotstr_n_lines + assert "add2" in dotstr_n_lines + assert "mult_1 -> add2" in dotstr_n_lines + + +@pytest.mark.parametrize("splitter", [None, "x"]) +def test_graph_detailed(tmp_path, splitter): + """creating a set of graphs, worky with two nodes""" + + @workflow.define + def Worky(x=1, y=2): + mult_1 = workflow.add(Multiply(x=x, y=y), name="mult_1") + workflow.add(Multiply(x=x, y=x), name="mult_2") + add2 = workflow.add(Add2(x=mult_1.out), name="add2") + return add2.out + + worky = Worky().split(splitter, x=[1, 2]) + + # detailed graph + dotfile_d = plot_workflow( + worky, tmp_path, 
plot_type="detailed", lazy=["x", "y"], name="detailed" + ) + dotstr_d_lines = dotfile_d.read_text().split("\n") + assert ( + 'struct_Worky [color=red, label="{WORKFLOW INPUT: | { x | y}}"];' + in dotstr_d_lines + ) + assert "struct_mult_1:out -> struct_add2:x;" in dotstr_d_lines + + +@pytest.mark.skipif(not DOT_FLAG, reason="dot not available") +@pytest.mark.parametrize("splitter", [None, "x"]) +def test_graph_export_dot(tmp_path, splitter): + """creating a set of graphs, worky with two nodes""" + + @workflow.define + def Worky(x=1, y=2): + mult_1 = workflow.add(Multiply(x=x, y=y), name="mult_1") + workflow.add(Multiply(x=x, y=x), name="mult_2") + add2 = workflow.add(Add2(x=mult_1.out), name="add2") + return add2.out + + worky = Worky().split(splitter, x=[1, 2]) + + name = f"graph_{sys._getframe().f_code.co_name}" + exporting_graphs(worky=worky, name=name, out_dir=tmp_path) + + +def test_graph_1st(tmp_path: Path): + """creating a set of graphs, worky with two nodes + some nodes have splitters, should be marked with blue color + """ + + @workflow.define + def Worky(x, y): + mult_1 = workflow.add(Multiply(y=y).split("x", x=x), name="mult_1") + workflow.add(Multiply(x=y, y=y), name="mult_2") + add2 = workflow.add(Add2(x=mult_1.out), name="add2") + return add2.out + + worky = Worky(x=[1, 2], y=2) + + # simple graph + dotfile_s = plot_workflow(worky, out_dir=tmp_path) + dotstr_s_lines = dotfile_s.read_text().split("\n") + assert "mult_1 [color=blue]" in dotstr_s_lines + assert "mult_2" in dotstr_s_lines + assert "add2 [color=blue]" in dotstr_s_lines + assert "mult_1 -> add2 [color=blue]" in dotstr_s_lines + + # nested graph + dotfile_n = plot_workflow(worky, out_dir=tmp_path, plot_type="nested") + dotstr_n_lines = dotfile_n.read_text().split("\n") + assert "mult_1 [color=blue]" in dotstr_n_lines + assert "mult_2" in dotstr_n_lines + assert "add2 [color=blue]" in dotstr_n_lines + assert "mult_1 -> add2 [color=blue]" in dotstr_n_lines + + # detailed graph + dotfile_d = plot_workflow( + worky, out_dir=tmp_path, lazy=["x", "y"], plot_type="detailed" + ) + dotstr_d_lines = dotfile_d.read_text().split("\n") + assert ( + 'struct_Worky [color=red, label="{WORKFLOW INPUT: | { x | y}}"];' + in dotstr_d_lines + ) + assert "struct_mult_1:out -> struct_add2:x;" in dotstr_d_lines + + if DOT_FLAG: + name = f"graph_{sys._getframe().f_code.co_name}" + exporting_graphs(worky=worky, name=name, out_dir=tmp_path) + + +def test_graph_1st_cmb(tmp_path: Path): + """creating a set of graphs, worky with three nodes + the first one has a splitter, the second has a combiner, so the third one is stateless + first two nodes should be blue and the arrow between them should be blue + """ + + @workflow.define + def Worky(x, y): + mult = workflow.add(Multiply(y=y).split("x", x=x), name="mult") + add2 = workflow.add(Add2(x=mult.out).combine("mult.x"), name="add2") + sum = workflow.add(ListSum(x=add2.out), name="sum") + return sum.out + + worky = Worky(x=[1, 2], y=2) + # simple graph + dotfile_s = plot_workflow(worky, out_dir=tmp_path) + dotstr_s_lines = dotfile_s.read_text().split("\n") + assert "mult [color=blue]" in dotstr_s_lines + assert "add2 [color=blue]" in dotstr_s_lines + assert "sum" in dotstr_s_lines + assert "mult -> add2 [color=blue]" in dotstr_s_lines + assert "add2 -> sum" in dotstr_s_lines + + # nested graph + dotfile_n = plot_workflow(worky, out_dir=tmp_path, plot_type="nested") + dotstr_n_lines = dotfile_n.read_text().split("\n") + assert "mult [color=blue]" in dotstr_n_lines + assert "add2 [color=blue]" in 
dotstr_n_lines + assert "sum" in dotstr_n_lines + assert "mult -> add2 [color=blue]" in dotstr_n_lines + assert "add2 -> sum" in dotstr_n_lines + + # detailed graph + dotfile_d = plot_workflow( + worky, out_dir=tmp_path, lazy=["x", "y"], plot_type="detailed" + ) + dotstr_d_lines = dotfile_d.read_text().split("\n") + assert ( + 'struct_Worky [color=red, label="{WORKFLOW INPUT: | { x | y}}"];' + in dotstr_d_lines + ) + assert "struct_mult:out -> struct_add2:x;" in dotstr_d_lines + + if DOT_FLAG: + name = f"graph_{sys._getframe().f_code.co_name}" + exporting_graphs(worky=worky, name=name, out_dir=tmp_path) + + +def test_graph_2(tmp_path: Path): + """creating a graph, worky with one workflow as a node""" + + @workflow.define + def Wfnd(x): + add2 = workflow.add(Add2(x=x), name="add2") + return add2.out + + @workflow.define + def Worky(x): + wfnd = workflow.add(Wfnd(x=x), name="wfnd") + return wfnd.out + + worky = Worky(x=2) + + # simple graph + dotfile_s = plot_workflow(worky, out_dir=tmp_path) + dotstr_s_lines = dotfile_s.read_text().split("\n") + assert "wfnd [shape=box]" in dotstr_s_lines + + # nested graph + dotfile = plot_workflow(worky, out_dir=tmp_path, plot_type="nested") + dotstr_lines = dotfile.read_text().split("\n") + assert "subgraph cluster_wfnd {" in dotstr_lines + assert "add2" in dotstr_lines + + # detailed graph + dotfile_d = plot_workflow(worky, out_dir=tmp_path, lazy=["x"], plot_type="detailed") + dotstr_d_lines = dotfile_d.read_text().split("\n") + assert ( + 'struct_Worky [color=red, label="{WORKFLOW INPUT: | { x}}"];' + in dotstr_d_lines + ) + + if DOT_FLAG: + name = f"graph_{sys._getframe().f_code.co_name}" + exporting_graphs(worky=worky, name=name, out_dir=tmp_path) + + +def test_graph_2st(tmp_path: Path): + """creating a set of graphs, worky with one workflow as a node + the inner workflow has a state, so should be blue + """ + + @workflow.define + def Wfnd(x): + add2 = workflow.add(Add2(x=x), name="add2") + return add2.out + + @workflow.define + def Worky(x): + wfnd = workflow.add(Wfnd(x=x).split("x", x=x), name="wfnd") + return wfnd.out + + worky = Worky(x=[1, 2]) + + # simple graph + dotfile_s = plot_workflow(worky, out_dir=tmp_path) + dotstr_s_lines = dotfile_s.read_text().split("\n") + assert "wfnd [shape=box, color=blue]" in dotstr_s_lines + + # nested graph + dotfile_s = plot_workflow(worky, out_dir=tmp_path, plot_type="nested") + dotstr_s_lines = dotfile_s.read_text().split("\n") + assert "subgraph cluster_wfnd {" in dotstr_s_lines + assert "color=blue" in dotstr_s_lines + assert "add2" in dotstr_s_lines + + # detailed graph + dotfile_d = plot_workflow(worky, out_dir=tmp_path, lazy=["x"], plot_type="detailed") + dotstr_d_lines = dotfile_d.read_text().split("\n") + assert ( + 'struct_Worky [color=red, label="{WORKFLOW INPUT: | { x}}"];' + in dotstr_d_lines + ) + assert "struct_wfnd:out -> struct_Worky_out:out;" in dotstr_d_lines + + if DOT_FLAG: + name = f"graph_{sys._getframe().f_code.co_name}" + exporting_graphs(worky=worky, name=name, out_dir=tmp_path) + + +def test_graph_3(tmp_path: Path): + """creating a set of graphs, worky with two nodes (one node is a workflow)""" + + @workflow.define + def Wfnd(x): + add2 = workflow.add(Add2(x=x), name="add2") + return add2.out + + @workflow.define + def Worky(x, y=1): + mult = workflow.add(Multiply(x=x, y=y), name="mult") + wfnd = workflow.add(Wfnd(x=mult.out), name="wfnd") + return wfnd.out + + worky = Worky(x=2) + + # simple graph + dotfile_s = plot_workflow(worky, out_dir=tmp_path) + dotstr_s_lines = 
dotfile_s.read_text().split("\n") + assert "mult" in dotstr_s_lines + assert "wfnd [shape=box]" in dotstr_s_lines + assert "mult -> wfnd" in dotstr_s_lines + + # nested graph + dotfile_n = plot_workflow(worky, out_dir=tmp_path, plot_type="nested") + dotstr_n_lines = dotfile_n.read_text().split("\n") + assert "mult" in dotstr_n_lines + assert "subgraph cluster_wfnd {" in dotstr_n_lines + assert "add2" in dotstr_n_lines + + # detailed graph + dotfile_d = plot_workflow( + worky, out_dir=tmp_path, lazy=["x", "y"], plot_type="detailed" + ) + dotstr_d_lines = dotfile_d.read_text().split("\n") + assert ( + 'struct_Worky [color=red, label="{WORKFLOW INPUT: | { x | y}}"];' + in dotstr_d_lines + ) + assert "struct_mult:out -> struct_wfnd:x;" in dotstr_d_lines + + if DOT_FLAG: + name = f"graph_{sys._getframe().f_code.co_name}" + exporting_graphs(worky=worky, name=name, out_dir=tmp_path) + + +def test_graph_3st(tmp_path: Path): + """creating a set of graphs, worky with two nodes (one node is a workflow) + the first node has a state and it should be passed to the second node + (blue node and a wfasnd, and blue arrow from the node to the wfasnd) + """ + + @workflow.define + def Wfnd(x): + add2 = workflow.add(Add2(x=x), name="add2") + return add2.out + + @workflow.define + def Worky(x, y): + mult = workflow.add(Multiply(y=y).split("x", x=x), name="mult") + wfnd = workflow.add(Wfnd(x=mult.out), name="wfnd") + return wfnd.out + + worky = Worky(x=[1, 2], y=2) + + # simple graph + dotfile_s = plot_workflow(worky, out_dir=tmp_path) + dotstr_s_lines = dotfile_s.read_text().split("\n") + assert "mult [color=blue]" in dotstr_s_lines + assert "wfnd [shape=box, color=blue]" in dotstr_s_lines + assert "mult -> wfnd [color=blue]" in dotstr_s_lines + + # nested graph + dotfile_n = plot_workflow(worky, out_dir=tmp_path, plot_type="nested") + dotstr_n_lines = dotfile_n.read_text().split("\n") + assert "mult [color=blue]" in dotstr_n_lines + assert "subgraph cluster_wfnd {" in dotstr_n_lines + assert "add2" in dotstr_n_lines + + # detailed graph + dotfile_d = plot_workflow( + worky, out_dir=tmp_path, lazy=["x", "y"], plot_type="detailed" + ) + dotstr_d_lines = dotfile_d.read_text().split("\n") + assert ( + 'struct_Worky [color=red, label="{WORKFLOW INPUT: | { x | y}}"];' + in dotstr_d_lines + ) + assert "struct_mult:out -> struct_wfnd:x;" in dotstr_d_lines + + if DOT_FLAG: + name = f"graph_{sys._getframe().f_code.co_name}" + exporting_graphs(worky=worky, name=name, out_dir=tmp_path) + + +def test_graph_4(tmp_path: Path): + """creating a set of graphs, worky with two nodes (one node is a workflow with two nodes + inside). Connection from the node to the inner workflow. 
+ """ + + @workflow.define + def Wfnd(x): + add2_a = workflow.add(Add2(x=x), name="add2_a") + add2_b = workflow.add(Add2(x=add2_a.out), name="add2_b") + return add2_b.out + + @workflow.define + def Worky(x, y): + mult = workflow.add(Multiply(x=x, y=y), name="mult") + wfnd = workflow.add(Wfnd(x=mult.out), name="wfnd") + return wfnd.out + + worky = Worky(x=2, y=3) + + # simple graph + dotfile_s = plot_workflow(worky, out_dir=tmp_path) + dotstr_s_lines = dotfile_s.read_text().split("\n") + assert "mult" in dotstr_s_lines + assert "wfnd [shape=box]" in dotstr_s_lines + assert "mult -> wfnd" in dotstr_s_lines + + # nested graph + dotfile_n = plot_workflow(worky, out_dir=tmp_path, plot_type="nested") + dotstr_n_lines = dotfile_n.read_text().split("\n") + for el in ["mult", "add2_a", "add2_b"]: + assert el in dotstr_n_lines + assert "subgraph cluster_wfnd {" in dotstr_n_lines + assert "add2_a -> add2_b" in dotstr_n_lines + assert "mult -> add2_a [lhead=cluster_wfnd]" + + # detailed graph + dotfile_d = plot_workflow( + worky, out_dir=tmp_path, lazy=["x", "y"], plot_type="detailed" + ) + dotstr_d_lines = dotfile_d.read_text().split("\n") + assert ( + 'struct_Worky [color=red, label="{WORKFLOW INPUT: | { x | y}}"];' + in dotstr_d_lines + ) + assert "struct_Worky:y -> struct_mult:y;" in dotstr_d_lines + + if DOT_FLAG: + name = f"graph_{sys._getframe().f_code.co_name}" + exporting_graphs(worky=worky, name=name, out_dir=tmp_path) + + +def test_graph_5(tmp_path: Path): + """creating a set of graphs, worky with two nodes (one node is a workflow with two nodes + inside). Connection from the inner workflow to the node. + """ + + @workflow.define + def Wfnd(x): + add2_a = workflow.add(Add2(x=x), name="add2_a") + add2_b = workflow.add(Add2(x=add2_a.out), name="add2_b") + return add2_b.out + + @workflow.define + def Worky(x, y): + wfnd = workflow.add(Wfnd(x=x), name="wfnd") + mult = workflow.add(Multiply(x=wfnd.out, y=y), name="mult") + return mult.out + + worky = Worky(x=2, y=3) + + # simple graph + dotfile_s = plot_workflow(worky, out_dir=tmp_path) + dotstr_s_lines = dotfile_s.read_text().split("\n") + assert "mult" in dotstr_s_lines + assert "wfnd [shape=box]" in dotstr_s_lines + assert "wfnd -> mult" in dotstr_s_lines + + # nested graph + dotfile_n = plot_workflow(worky, out_dir=tmp_path, plot_type="nested") + dotstr_n_lines = dotfile_n.read_text().split("\n") + for el in ["mult", "add2_a", "add2_b"]: + assert el in dotstr_n_lines + assert "subgraph cluster_wfnd {" in dotstr_n_lines + assert "add2_a -> add2_b" in dotstr_n_lines + assert "add2_b -> mult [ltail=cluster_wfnd]" + + # detailed graph + dotfile_d = plot_workflow( + worky, out_dir=tmp_path, lazy=["x", "y"], plot_type="detailed" + ) + dotstr_d_lines = dotfile_d.read_text().split("\n") + assert ( + 'struct_Worky [color=red, label="{WORKFLOW INPUT: | { x | y}}"];' + in dotstr_d_lines + ) + assert "struct_Worky:x -> struct_wfnd:x;" in dotstr_d_lines + + if DOT_FLAG: + name = f"graph_{sys._getframe().f_code.co_name}" + exporting_graphs(worky=worky, name=name, out_dir=tmp_path) + + +@pytest.mark.timeout(20) +def test_duplicate_input_on_split_wf(tmp_path: Path): + """checking if the workflow gets stuck if it has to run two tasks with equal checksum; + This can occur when splitting on a list containing duplicate values. 
+ """ + text = ["test"] * 2 + + @python.define + def printer(a): + return a + + @workflow.define(outputs=["out1"]) + def Worky(text): + printer1 = workflow.add(printer(a=text)) + return printer1.out # + + worky = Worky().split(text=text) + + outputs = worky(worker="cf") + + assert outputs.out1[0] == "test" and outputs.out1[0] == "test" + + +@pytest.mark.timeout(40) +def test_inner_outer_wf_duplicate(tmp_path: Path): + """checking if the execution gets stuck if there is an inner and outer workflows + that run two nodes with the exact same inputs. + """ + task_list = ["First", "Second"] + start_list = [3, 4] + + @python.define + def OneArg(start_number): + for k in range(10): + start_number += 1 + return start_number + + @python.define + def OneArgInner(start_number): + for k in range(10): + start_number += 1 + return start_number + + # Inner Worky + @workflow.define(outputs=["res"]) + def InnerWf(start_number1): + inner_level1 = workflow.add(OneArgInner(start_number=start_number1)) + return inner_level1.out + + # Outer workflow has two nodes plus the inner workflow + + # Outer workflow + @workflow.define(outputs=["res2"]) + def OuterWf(start_number, task_name, dummy): + level1 = workflow.add(OneArg(start_number=start_number)) + inner = workflow.add(InnerWf(start_number1=level1.out)) + return inner.res + + test_outer = OuterWf(dummy=1).split( + ["start_number", "task_name"], start_number=start_list, task_name=task_list + ) + + with Submitter(worker="cf") as sub: + res = sub(test_outer) + + assert res.outputs.res2[0] == 23 and res.outputs.res2[1] == 23 + + +@pytest.mark.flaky(reruns=3) +def test_rerun_errored(tmp_path, capfd): + """Test rerunning a workflow containing errors. + Only the errored tasks and workflow should be rerun""" + + @python.define + def PassOdds(x): + if x % 2 == 0: + print(f"x={x}, running x%2 = {x % 2} (even error)\n") + raise ValueError("even error") + else: + print(f"x={x}, running x%2 = {x % 2}\n") + return x + + @workflow.define + def WorkyPassOdds(x): + pass_odds = workflow.add(PassOdds().split("x", x=x)) + return pass_odds.out + + worky = WorkyPassOdds(x=[1, 2, 3, 4, 5]) + + print("Starting run 1") + with pytest.raises(RuntimeError): + # Must be cf to get the error from all tasks, otherwise will only get the first error + worky(worker="cf", cache_root=tmp_path) + + print("Starting run 2") + with pytest.raises(RuntimeError): + worky(worker="cf", cache_root=tmp_path) + + out, err = capfd.readouterr() + stdout_lines = out.splitlines() + + tasks_run = 0 + errors_found = 0 + + for line in stdout_lines: + if "running x%2" in line: + tasks_run += 1 + if "(even error)" in line: + errors_found += 1 + + # There should have been 5 messages of the form "x%2 = XXX" after calling task() the first time + # and another 2 messagers after calling the second time + assert tasks_run == 7 + assert errors_found == 4 + + +def test_wf_state_arrays(tmp_path, worker): + @workflow.define(outputs={"alpha": int, "beta": ty.List[int]}) + def Worky(x: ty.List[int], y: int): + + A = workflow.add( # Split over workflow input "x" on "scalar" input + ListMultSum( + in_list=x, + ).split(scalar=x), + name="A", + ) + + B = workflow.add( # Worky is still split over "x", combined over "x" on out + ListMultSum( + scalar=A.sum, + in_list=A.products, + ).combine("A.scalar"), + name="B", + ) + + C = workflow.add( # Worky " + ListMultSum( + scalar=y, + in_list=B.sum, + ), + name="C", + ) + + D = workflow.add( # Worky is split again, this time over C.products + ListMultSum( + in_list=x, + ) + 
.split(scalar=C.products) + .combine("scalar"), + name="D", + ) + + E = workflow.add( # Worky is finally combined again into a single node + ListMultSum(scalar=y, in_list=D.sum), + name="E", + ) + + return E.sum, E.products + + worky = Worky(x=[1, 2, 3, 4], y=10) + + outputs = worky(cache_root=tmp_path, worker=worker) + assert outputs.alpha == 3000000 + assert outputs.beta == [100000, 400000, 900000, 1600000] + + +def test_wf_input_typing_fail(): + + @workflow.define(outputs={"alpha": int, "beta": ty.List[int]}) + def MismatchInputWf(x: int, y: int): + ListMultSum( + scalar=y, + in_list=y, + name="A", + ) + + with pytest.raises(TypeError, match="Incorrect type for field in 'y'"): + MismatchInputWf(x=1, y=[1, 2, 3]) + + +def test_wf_output_typing_fail(tmp_path: Path): + + @workflow.define(outputs={"alpha": int, "beta": ty.List[int]}) + def MismatchOutputWf(x: int, y: ty.List[int]): + A = workflow.add( # Split over workflow input "x" on "scalar" input + ListMultSum( + scalar=x, + in_list=y, + ) + ) + return A.products, A.products + + worky = MismatchOutputWf(x=1, y=[1, 2, 3]) + + with pytest.raises( + TypeError, + match="Incorrect type for lazy field in 'alpha' field of MismatchOutputWf.Outputs interface", + ): + worky(cache_root=tmp_path) + + +def test_wf_input_output_typing(tmp_path: Path): + @workflow.define(outputs={"sum": int, "products": ty.List[int]}) + def Worky(x: int, y: ty.List[int]): + A = workflow.add( # Split over workflow input "x" on "scalar" input + ListMultSum( + scalar=x, + in_list=y, + ) + ) + return A.sum, A.products + + outputs = Worky(x=10, y=[1, 2, 3, 4])(cache_root=tmp_path) + assert outputs.sum == 100 + assert outputs.products == [10, 20, 30, 40] diff --git a/pydra/compose/workflow.py b/pydra/compose/workflow.py new file mode 100644 index 0000000000..5ba004f799 --- /dev/null +++ b/pydra/compose/workflow.py @@ -0,0 +1,376 @@ +import typing as ty +import inspect +from typing import dataclass_transform +import attrs +from pydra.compose import base +from pydra.compose.base import ( + ensure_field_objects, + build_task_class, + parse_doc_string, + extract_function_inputs_and_outputs, + check_explicit_fields_are_none, + extract_fields_from_class, +) +from pydra.utils.general import attrs_values +from pydra.utils.typing import StateArray + +if ty.TYPE_CHECKING: + from pydra.engine.workflow import Workflow + from pydra.engine.job import Job + from pydra.engine.lazy import LazyOutField + from pydra.engine.graph import DiGraph + from pydra.engine.submitter import NodeExecution + from pydra.environments.base import Environment + from pydra.engine.hooks import TaskHooks + + +__all__ = ["define", "add", "this", "arg", "out", "Task", "Outputs", "cast"] + + +@attrs.define +class arg(base.Arg): + """Argument of a workflow task + + Parameters + ---------- + help: str + A short description of the input field. + default : Any, optional + the default value for the argument + allowed_values: list, optional + List of allowed values for the field. + requires: list, optional + Names of the inputs that are required together with the field. 
+ copy_mode: File.CopyMode, optional + The mode of copying the file, by default it is File.CopyMode.any + copy_collation: File.CopyCollation, optional + The collation of the file, by default it is File.CopyCollation.any + copy_ext_decomp: File.ExtensionDecomposition, optional + The extension decomposition of the file, by default it is + File.ExtensionDecomposition.single + readonly: bool, optional + If True the input field can’t be provided by the user but it aggregates other + input fields (for example the fields with argstr: -o {fldA} {fldB}), by default + it is False + type: type, optional + The type of the field, by default it is Any + name: str, optional + The name of the field, used when specifying a list of fields instead of a mapping + from name to field, by default it is None + lazy: bool, optional + If True the input field is not required at construction time but is passed straight + through to the tasks, by default it is False + """ + + pass + + +@attrs.define +class out(base.Out): + """Output of a workflow task + + Parameters + ---------- + name: str, optional + The name of the field, used when specifying a list of fields instead of a mapping + from name to field, by default it is None + type: type, optional + The type of the field, by default it is Any + help: str, optional + A short description of the input field. + requires: list, optional + Names of the inputs that are required together with the field. + converter: callable, optional + The converter for the field passed through to the attrs.field, by default it is None + validator: callable | iterable[callable], optional + The validator(s) for the field passed through to the attrs.field, by default it is None + """ + + pass + + +@dataclass_transform( + kw_only_default=True, + field_specifiers=(out,), +) +def outputs(wrapped): + """Decorator to specify the output fields of a shell command is a dataclass-style type""" + return wrapped + + +@dataclass_transform( + kw_only_default=True, + field_specifiers=(arg,), +) +def define( + wrapped: type | ty.Callable | None = None, + /, + inputs: list[str | arg] | dict[str, arg | type] | None = None, + outputs: list[str | out] | dict[str, out | type] | type | None = None, + bases: ty.Sequence[type] = (), + outputs_bases: ty.Sequence[type] = (), + lazy: list[str] | None = None, + auto_attribs: bool = True, + xor: ty.Sequence[str | None] | ty.Sequence[ty.Sequence[str | None]] = (), +) -> "Task": + """ + Create an interface for a function or a class. Can be used either as a decorator on + a constructor function or the "canonical" dataclass-form of a task. + + Parameters + ---------- + wrapped : type | callable | None + The function or class to create an interface for. + inputs : list[str | Arg] | dict[str, Arg | type] | None + The inputs to the function or class. + outputs : list[str | Out] | dict[str, Out | type] | type | None + The outputs of the function or class. + auto_attribs : bool + Whether to use auto_attribs mode when creating the class. + xor: Sequence[str | None] | Sequence[Sequence[str | None]], optional + Names of args that are exclusive mutually exclusive, which must include + the name of the current field. If this list includes None, then none of the + fields need to be set. + + Returns + ------- + Task + The interface for the function or class. 
+ """ + + if lazy is None: + lazy = [] + + def make(wrapped: ty.Callable | type) -> Task: + if inspect.isclass(wrapped): + klass = wrapped + constructor = klass.constructor + name = klass.__name__ + check_explicit_fields_are_none(klass, inputs, outputs) + parsed_inputs, parsed_outputs = extract_fields_from_class( + Task, + Outputs, + klass, + arg, + out, + auto_attribs, + skip_fields=["constructor"], + ) + else: + if not inspect.isfunction(wrapped): + raise ValueError( + f"wrapped must be a class or a function, not {wrapped!r}" + ) + klass = None + constructor = wrapped + input_helps, output_helps = parse_doc_string(constructor.__doc__) + inferred_inputs, inferred_outputs = extract_function_inputs_and_outputs( + constructor, arg, inputs, outputs + ) + name = constructor.__name__ + + parsed_inputs, parsed_outputs = ensure_field_objects( + arg_type=arg, + out_type=out, + inputs=inferred_inputs, + outputs=inferred_outputs, + input_helps=input_helps, + output_helps=output_helps, + ) + + if "constructor" in parsed_inputs: + raise ValueError( + "The argument 'constructor' is reserved and cannot be used as an " + "argument name" + ) + + parsed_inputs["constructor"] = arg( + name="constructor", type=ty.Callable, hash_eq=True, default=constructor + ) + for inpt_name in lazy: + parsed_inputs[inpt_name].lazy = True + + defn = build_task_class( + Task, + Outputs, + parsed_inputs, + parsed_outputs, + name=name, + klass=klass, + bases=bases, + outputs_bases=outputs_bases, + xor=xor, + ) + + return defn + + if wrapped is not None: + if not isinstance(wrapped, (ty.Callable, type)): + raise ValueError(f"wrapped must be a class or a callable, not {wrapped!r}") + return make(wrapped) + return make + + +def this() -> "Workflow": + """Get the workflow currently being constructed. + + Returns + ------- + Workflow + The workflow currently being constructed. + """ + from pydra.engine.workflow import Workflow + + return Workflow.under_construction() + + +OutputsType = ty.TypeVar("OutputsType", bound="Outputs") + + +def add( + task: "Task[OutputsType]", + name: str | None = None, + environment: "Environment | None" = None, + hooks: "TaskHooks | None" = None, +) -> OutputsType: + """Add a node to the workflow currently being constructed + + Parameters + ---------- + task : Task + The definition of the task to add to the workflow as a node + name : str, optional + The name of the node, by default it will be the name of the task + class + environment : Environment, optional + The environment to run the task in, such as the Docker or Singularity container, + by default it will be the "native" + hooks : TaskHooks, optional + The hooks to run before or after the task, by default no hooks will be run + + Returns + ------- + Outputs + The outputs of the node + """ + return this().add(task, name=name, environment=environment, hooks=hooks) + + +U = ty.TypeVar("U") + + +def cast(field: ty.Any, new_type: type[U]) -> U: + """Cast a lazy field to a new type. Note that the typing in the signature is a white + lie, as the return field is actually a LazyField as placeholder for the object of + type U. 
+ + Parameters + ---------- + field : LazyField[T] + The field to cast + new_type : type[U] + The new type to cast the field to + + Returns + ------- + LazyField[U] + A copy of the lazy field with the new type + """ + return attrs.evolve( + field, + type=new_type, + cast_from=field._cast_from if field._cast_from else field._type, + ) + + +@attrs.define(kw_only=True, auto_attribs=False, eq=False, repr=False) +class WorkflowOutputs(base.Outputs): + + @classmethod + def _from_task(cls, job: "Job[WorkflowTask]") -> ty.Self: + """Collect the outputs of a workflow job from the outputs of the nodes in the + + Parameters + ---------- + job : Job[WorfklowDef] + The job whose outputs are being collected. + + Returns + ------- + outputs : Outputs + The outputs of the job + """ + + workflow: "Workflow" = job.return_values["workflow"] + exec_graph: "DiGraph[NodeExecution]" = job.return_values["exec_graph"] + + # Check for errors in any of the workflow nodes + if errored := [n for n in exec_graph.nodes if n.errored]: + errors = [] + for node in errored: + for node_task in node.errored.values(): + result = node_task.result() + if result.errors: + time_of_crash = result.errors["time of crash"] + error_message = "\n".join(result.errors["error message"]) + else: + time_of_crash = "UNKNOWN-TIME" + error_message = "NOT RETRIEVED" + errors.append( + f"Job {node.name!r} failed @ {time_of_crash} running " + f"{node._task} with the following errors:\n{error_message}" + "\nTo inspect, please load the pickled job object from here: " + f"{result.cache_dir}/_job.pklz" + ) + raise RuntimeError( + f"Workflow {job!r} failed with errors:\n\n" + "\n\n".join(errors) + ) + + # Retrieve values from the output fields + values = {} + lazy_field: LazyOutField + for name, lazy_field in attrs_values(workflow.outputs).items(): + val_out = lazy_field._get_value(workflow=workflow, graph=exec_graph) + if isinstance(val_out, StateArray): + val_out = list(val_out) # implicitly combine state arrays + values[name] = val_out + + # Set the values in the outputs object + outputs = super()._from_task(job) + outputs = attrs.evolve(outputs, **values) + outputs._cache_dir = job.cache_dir + return outputs + + +WorkflowOutputsType = ty.TypeVar("OutputType", bound=WorkflowOutputs) + + +@attrs.define(kw_only=True, auto_attribs=False, eq=False, repr=False) +class WorkflowTask(base.Task[WorkflowOutputsType]): + + _task_type = "workflow" + + RESERVED_FIELD_NAMES = base.Task.RESERVED_FIELD_NAMES + ("construct",) + + _constructed = attrs.field(default=None, init=False, repr=False, eq=False) + + def _run(self, job: "Job[WorkflowTask]", rerun: bool) -> None: + """Run the workflow.""" + job.submitter.expand_workflow(job, rerun) + + async def _run_async(self, job: "Job[WorkflowTask]", rerun: bool) -> None: + """Run the workflow asynchronously.""" + await job.submitter.expand_workflow_async(job, rerun) + + def construct(self) -> "Workflow": + from pydra.engine.workflow import Workflow + + if self._constructed is not None: + return self._constructed + self._constructed = Workflow.construct(self) + return self._constructed + + +# Alias WorkflowTask to Task so we can refer to it as workflow.Task +Task = WorkflowTask +Outputs = WorkflowOutputs diff --git a/pydra/conftest.py b/pydra/conftest.py index 66a1d200fc..0658bb8b39 100644 --- a/pydra/conftest.py +++ b/pydra/conftest.py @@ -6,62 +6,39 @@ def pytest_addoption(parser): - parser.addoption("--dask", action="store_true", help="run all combinations") + parser.addoption("--with-dask", action="store_true", 
help="run all combinations") parser.addoption( - "--psij", - action="store", - help="run with psij subtype plugin", - choices=["local", "slurm"], + "--only-worker", + help="only run tests with provided worker", ) +@pytest.fixture(scope="session", params=["debug", "cf"]) +def worker(request): + return request.param + + def pytest_generate_tests(metafunc): - if "plugin_dask_opt" in metafunc.fixturenames: - if bool(shutil.which("sbatch")): - Plugins = ["slurm"] - else: - Plugins = ["cf"] - try: - if metafunc.config.getoption("dask"): - Plugins.append("dask") - except ValueError: - # Called as --pyargs, so --dask isn't available - pass + if "any_worker" in metafunc.fixturenames: try: - if metafunc.config.getoption("psij"): - Plugins.append("psij-" + metafunc.config.getoption("psij")) - if ( - bool(shutil.which("sbatch")) - and metafunc.config.getoption("psij") == "slurm" - ): - Plugins.remove("slurm") + with_dask = metafunc.config.getoption("with_dask") except ValueError: - pass - metafunc.parametrize("plugin_dask_opt", Plugins) - - if "plugin" in metafunc.fixturenames: - use_dask = False + with_dask = False try: - use_dask = metafunc.config.getoption("dask") + only_worker = metafunc.config.getoption("only_worker") except ValueError: - pass - if use_dask: - Plugins = [] - elif bool(shutil.which("sbatch")): - Plugins = ["slurm"] + only_worker = None + if only_worker is None: + available_workers = ["debug", "cf"] + if with_dask: + available_workers.append("dask") + if bool(shutil.which("sbatch")): + available_workers.append("slurm") else: - Plugins = ["cf"] - try: - if metafunc.config.getoption("psij"): - Plugins.append("psij-" + metafunc.config.getoption("psij")) - if ( - bool(shutil.which("sbatch")) - and metafunc.config.getoption("psij") == "slurm" - ): - Plugins.remove("slurm") - except ValueError: - pass - metafunc.parametrize("plugin", Plugins) + available_workers = [only_worker] + # Set the available workers as a parameter to the + # test function + metafunc.parametrize("any_worker", available_workers) # For debugging in IDE's don't catch raised exceptions and let the IDE diff --git a/pydra/engine/__init__.py b/pydra/engine/__init__.py index 2eca36ba28..a3b26726f5 100644 --- a/pydra/engine/__init__.py +++ b/pydra/engine/__init__.py @@ -1,14 +1,6 @@ """The core of the workflow engine.""" from .submitter import Submitter -from .core import Workflow -from .task import AuditFlag, ShellCommandTask -from . 
import specs -__all__ = [ - "AuditFlag", - "ShellCommandTask", - "Submitter", - "Workflow", - "specs", -] + +__all__ = ["Submitter"] diff --git a/pydra/engine/audit.py b/pydra/engine/audit.py index 7397fad6e6..7648843c33 100644 --- a/pydra/engine/audit.py +++ b/pydra/engine/audit.py @@ -1,19 +1,21 @@ """Module to keep track of provenance information.""" import os +import typing as ty import json -import attr -from ..utils.messenger import send_message, make_message, gen_uuid, now, AuditFlag -from ..utils.hash import hash_function -from .helpers import ensure_list, gather_runtime_info -from .specs import attr_fields +from pydra.utils.messenger import send_message, make_message, gen_uuid, now, AuditFlag +from pydra.utils.general import attrs_values from fileformats.core import FileSet +from pydra.utils.hash import hash_function try: import importlib_resources except ImportError: import importlib.resources as importlib_resources # type: ignore +if ty.TYPE_CHECKING: + from pydra.engine.job import Job + class Audit: """Handle provenance tracking and resource utilization.""" @@ -28,7 +30,7 @@ def __init__(self, audit_flags, messengers, messenger_args, develop=None): Base configuration of auditing. messengers : :obj:`pydra.util.messenger.Messenger` or list of :class:`pydra.util.messenger.Messenger`, optional - Specify types of messenger used by Audit to send a message. + Specify types of messenger used by Audit to send a message. Could be `PrintMessenger`, `FileMessenger`, or `RemoteRESTMessenger`. messenger_args : :obj:`dict`, optional Optional arguments for the `Messenger.send` method. @@ -36,6 +38,8 @@ def __init__(self, audit_flags, messengers, messenger_args, develop=None): If True, the local context.jsonld file is used, otherwise the one from github is used. """ + from pydra.utils.general import ensure_list + self.audit_flags = audit_flags self.messengers = ensure_list(messengers) self.messenger_args = messenger_args @@ -46,7 +50,7 @@ def start_audit(self, odir): Start recording provenance. Monitored information is not sent until directory is created, - in case message directory is inside task output directory. + in case message directory is inside job output directory.
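For orientation, the `Audit` class touched above is driven by a set of `AuditFlag` values plus one or more messengers. A minimal, hypothetical sketch of enabling provenance auditing (the class, flag and messenger names come from the imports and docstring above; the messenger choice and output directory are purely illustrative):

from pathlib import Path
from pydra.engine.audit import Audit
from pydra.utils.messenger import AuditFlag, PrintMessenger

# Record provenance (PROV) events and print them; FileMessenger or
# RemoteRESTMessenger could be substituted, as the docstring above notes.
audit = Audit(
    audit_flags=AuditFlag.PROV,
    messengers=PrintMessenger(),  # a single messenger is wrapped by ensure_list()
    messenger_args=None,
)
audit.start_audit(odir=Path("/tmp/audit-messages"))  # messages are held until this directory exists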
Parameters ---------- @@ -61,7 +65,7 @@ def start_audit(self, odir): user_id = f"uid:{gen_uuid()}" start_message = { "@id": self.aid, - "@type": "task", + "@type": "job", "startedAtTime": now(), "executedBy": user_id, } @@ -70,7 +74,7 @@ def start_audit(self, odir): if self.audit_check(AuditFlag.PROV): self.audit_message(start_message, AuditFlag.PROV) if self.audit_check(AuditFlag.RESOURCE): - from ..utils.profiler import ResourceMonitor + from pydra.utils.profiler import ResourceMonitor self.resource_monitor = ResourceMonitor(os.getpid(), logdir=self.odir) @@ -93,6 +97,8 @@ def monitor(self): def finalize_audit(self, result): """End auditing.""" if self.audit_check(AuditFlag.RESOURCE): + from pydra.engine.result import gather_runtime_info + self.resource_monitor.stop() result.runtime = gather_runtime_info(self.resource_monitor.fname) if self.audit_check(AuditFlag.PROV): @@ -102,7 +108,7 @@ def finalize_audit(self, result): ) # audit resources/runtime information self.eid = f"uid:{gen_uuid()}" - entity = attr.asdict(result.runtime, recurse=False) + entity = attrs_values(result.runtime) entity.update( **{ "@id": self.eid, @@ -176,16 +182,17 @@ def audit_check(self, flag): """ return self.audit_flags & flag - def audit_task(self, task): + def audit_task(self, job: "Job"): import subprocess as sp + from pydra.utils.general import task_fields - label = task.name + label = job.name - command = task.cmdline if hasattr(task.inputs, "executable") else None - attr_list = attr_fields(task.inputs) + command = job.task.cmdline if hasattr(job.task, "executable") else None + attr_list = task_fields(job.task) for attrs in attr_list: input_name = attrs.name - value = getattr(task.inputs, input_name) + value = job.inputs[input_name] if isinstance(value, FileSet): input_path = os.path.abspath(value) file_hash = hash_function(value) @@ -220,7 +227,7 @@ def audit_task(self, task): start_message = { "@id": self.aid, - "@type": "task", + "@type": "job", "Label": label, "Command": command, "StartedAtTime": now(), diff --git a/pydra/engine/boutiques.py b/pydra/engine/boutiques.py deleted file mode 100644 index 0f3cf110e1..0000000000 --- a/pydra/engine/boutiques.py +++ /dev/null @@ -1,213 +0,0 @@ -import typing as ty -import json -import attr -from urllib.request import urlretrieve -from pathlib import Path -from functools import reduce - -from ..utils.messenger import AuditFlag -from ..engine import ShellCommandTask -from ..engine.specs import SpecInfo, ShellSpec, ShellOutSpec, File, attr_fields -from .helpers_file import is_local_file - - -class BoshTask(ShellCommandTask): - """Shell Command Task based on the Boutiques descriptor""" - - def __init__( - self, - zenodo_id=None, - bosh_file=None, - audit_flags: AuditFlag = AuditFlag.NONE, - cache_dir=None, - input_spec_names: ty.Optional[ty.List] = None, - messenger_args=None, - messengers=None, - name=None, - output_spec_names: ty.Optional[ty.List] = None, - rerun=False, - strip=False, - **kwargs, - ): - """ - Initialize this task. - - Parameters - ---------- - zenodo_id: :obj: str - Zenodo ID - bosh_file : : str - json file with the boutiques descriptors - audit_flags : :obj:`pydra.utils.messenger.AuditFlag` - Auditing configuration - cache_dir : :obj:`os.pathlike` - Cache directory - input_spec_names : :obj: list - Input names for input_spec. - messenger_args : - TODO - messengers : - TODO - name : :obj:`str` - Name of this task. - output_spec_names : :obj: list - Output names for output_spec. 
- strip : :obj:`bool` - TODO - - """ - self.cache_dir = cache_dir - if (bosh_file and zenodo_id) or not (bosh_file or zenodo_id): - raise Exception("either bosh or zenodo_id has to be specified") - elif zenodo_id: - self.bosh_file = self._download_spec(zenodo_id) - else: # bosh_file - self.bosh_file = bosh_file - - with self.bosh_file.open() as f: - self.bosh_spec = json.load(f) - - self.input_spec = self._prepare_input_spec(names_subset=input_spec_names) - self.output_spec = self._prepare_output_spec(names_subset=output_spec_names) - self.bindings = ["-v", f"{self.bosh_file.parent}:{self.bosh_file.parent}:ro"] - - super().__init__( - name=name, - input_spec=self.input_spec, - output_spec=self.output_spec, - executable=["bosh", "exec", "launch"], - args=["-s"], - audit_flags=audit_flags, - messengers=messengers, - messenger_args=messenger_args, - cache_dir=self.cache_dir, - strip=strip, - rerun=rerun, - **kwargs, - ) - self.strip = strip - - def _download_spec(self, zenodo_id): - """ - using boutiques Searcher to find url of zenodo file for a specific id, - and download the file to self.cache_dir - """ - from boutiques.searcher import Searcher - - searcher = Searcher(zenodo_id, exact_match=True) - hits = searcher.zenodo_search().json()["hits"]["hits"] - if len(hits) == 0: - raise Exception(f"can't find zenodo spec for {zenodo_id}") - elif len(hits) > 1: - raise Exception(f"too many hits for {zenodo_id}") - else: - zenodo_url = hits[0]["files"][0]["links"]["self"] - zenodo_file = self.cache_dir / f"zenodo.{zenodo_id}.json" - urlretrieve(zenodo_url, zenodo_file) - return zenodo_file - - def _prepare_input_spec(self, names_subset=None): - """creating input spec from the zenodo file - if name_subset provided, only names from the subset will be used in the spec - """ - binputs = self.bosh_spec["inputs"] - self._input_spec_keys = {} - fields = [] - for input in binputs: - name = input["id"] - if names_subset is None: - pass - elif name not in names_subset: - continue - else: - names_subset.remove(name) - if input["type"] == "File": - tp = File - elif input["type"] == "String": - tp = str - elif input["type"] == "Number": - tp = float - elif input["type"] == "Flag": - tp = bool - else: - tp = None - # adding list - if tp and "list" in input and input["list"]: - tp = ty.List[tp] - - mdata = { - "help_string": input.get("description", None) or input["name"], - "mandatory": not input["optional"], - "argstr": input.get("command-line-flag", None), - } - fields.append((name, tp, mdata)) - self._input_spec_keys[input["value-key"]] = "{" + f"{name}" + "}" - if names_subset: - raise RuntimeError(f"{names_subset} are not in the zenodo input spec") - spec = SpecInfo(name="Inputs", fields=fields, bases=(ShellSpec,)) - return spec - - def _prepare_output_spec(self, names_subset=None): - """creating output spec from the zenodo file - if name_subset provided, only names from the subset will be used in the spec - """ - boutputs = self.bosh_spec["output-files"] - fields = [] - for output in boutputs: - name = output["id"] - if names_subset is None: - pass - elif name not in names_subset: - continue - else: - names_subset.remove(name) - path_template = reduce( - lambda s, r: s.replace(*r), - self._input_spec_keys.items(), - output["path-template"], - ) - mdata = { - "help_string": output.get("description", None) or output["name"], - "mandatory": not output["optional"], - "output_file_template": path_template, - } - fields.append((name, attr.ib(type=File, metadata=mdata))) - - if names_subset: - raise 
RuntimeError(f"{names_subset} are not in the zenodo output spec") - spec = SpecInfo(name="Outputs", fields=fields, bases=(ShellOutSpec,)) - return spec - - def _command_args_single(self, state_ind=None, index=None): - """Get command line arguments for a single state""" - input_filepath = self._bosh_invocation_file(state_ind=state_ind, index=index) - cmd_list = ( - self.inputs.executable - + [str(self.bosh_file), input_filepath] - + self.inputs.args - + self.bindings - ) - return cmd_list - - def _bosh_invocation_file(self, state_ind=None, index=None): - """creating bosh invocation file - json file with inputs values""" - input_json = {} - for f in attr_fields(self.inputs, exclude_names=("executable", "args")): - if self.state and f"{self.name}.{f.name}" in state_ind: - value = getattr(self.inputs, f.name)[state_ind[f"{self.name}.{f.name}"]] - else: - value = getattr(self.inputs, f.name) - # adding to the json file if specified by the user - if value is not attr.NOTHING and value != "NOTHING": - if is_local_file(f): - value = Path(value) - self.bindings.extend(["-v", f"{value.parent}:{value.parent}:ro"]) - value = str(value) - - input_json[f.name] = value - - filename = self.cache_dir / f"{self.name}-{index}.json" - with open(filename, "w") as jsonfile: - json.dump(input_json, jsonfile) - - return str(filename) diff --git a/pydra/engine/core.py b/pydra/engine/core.py deleted file mode 100644 index d0081e3ace..0000000000 --- a/pydra/engine/core.py +++ /dev/null @@ -1,1547 +0,0 @@ -"""Basic processing graph elements.""" - -import abc -import json -import logging -import itertools -from functools import cached_property -import os -import sys -from pathlib import Path -import typing as ty -from copy import deepcopy, copy -from uuid import uuid4 -from filelock import SoftFileLock -import shutil -from tempfile import mkdtemp -from traceback import format_exception -import attr -import cloudpickle as cp -from . import state -from . import helpers_state as hlpst -from .specs import ( - File, - BaseSpec, - RuntimeSpec, - Result, - SpecInfo, - LazyIn, - LazyOut, - LazyField, - TaskHook, - attr_fields, - StateArray, -) -from .helpers import ( - make_klass, - create_checksum, - print_help, - load_result, - save, - ensure_list, - record_error, - PydraFileLock, - parse_copyfile, -) -from ..utils.hash import hash_function -from .helpers_file import copy_nested_files, template_update -from .graph import DiGraph -from .audit import Audit -from ..utils.messenger import AuditFlag -from ..utils.typing import TypeParser -from fileformats.core import FileSet - -logger = logging.getLogger("pydra") - -develop = False - - -class TaskBase: - """ - A base structure for the nodes in the processing graph. - - Tasks are a generic compute step from which both elementary tasks and - :class:`Workflow` instances inherit. 
- - """ - - _api_version: str = "0.0.1" # Should generally not be touched by subclasses - _etelemetry_version_data = None # class variable to store etelemetry information - _version: str # Version of tool being wrapped - _task_version: ty.Optional[str] = None - # Task writers encouraged to define and increment when implementation changes sufficiently - _input_sets = None # Dictionaries of predefined input settings - - audit_flags: AuditFlag = AuditFlag.NONE - """What to audit -- available flags: :class:`~pydra.utils.messenger.AuditFlag`.""" - - _can_resume = False # Does the task allow resuming from previous state - _redirect_x = False # Whether an X session should be created/directed - - _runtime_requirements = RuntimeSpec() - _runtime_hints = None - - _cache_dir = None # Working directory in which to operate - _references = None # List of references for a task - - def __init__( - self, - name: str, - audit_flags: AuditFlag = AuditFlag.NONE, - cache_dir=None, - cache_locations=None, - inputs: ty.Optional[ty.Union[ty.Text, File, ty.Dict]] = None, - cont_dim=None, - messenger_args=None, - messengers=None, - rerun=False, - ): - """ - Initialize a task. - - Tasks allow for caching (retrieving a previous result of the same - task definition and inputs), and concurrent execution. - Running tasks follows a decision flow: - - 1. Check whether prior cache exists -- - if ``True``, return cached result - 2. Check whether other process is running this task -- - wait if ``True``: - a. Finishes (with or without exception) -> return result - b. Gets killed -> restart - 3. No cache or other process -> start - 4. Two or more concurrent new processes get to start - - Parameters - ---------- - name : :obj:`str` - Unique name of this node - audit_flags : :class:`AuditFlag`, optional - Configure provenance tracking. Default is no provenance tracking. - See available flags at :class:`~pydra.utils.messenger.AuditFlag`. - cache_dir : :obj:`os.pathlike` - Set a custom directory of previously computed nodes. - cache_locations : - TODO - inputs : :obj:`typing.Text`, or :class:`File`, or :obj:`dict`, or `None`. - Set particular inputs to this node. - cont_dim : :obj:`dict`, or `None` - Container dimensions for input fields, - if any of the container should be treated as a container - messenger_args : - TODO - messengers : - TODO - """ - from .. 
import check_latest_version - - if TaskBase._etelemetry_version_data is None: - TaskBase._etelemetry_version_data = check_latest_version() - - # raise error if name is same as of attributes - if name in dir(self): - raise ValueError("Cannot use names of attributes or methods as task name") - self.name = name - if not self.input_spec: - raise Exception("No input_spec in class: %s" % self.__class__.__name__) - klass = make_klass(self.input_spec) - - self.inputs = klass( - **{ - # in attrs names that starts with "_" could be set when name provided w/o "_" - (f.name[1:] if f.name.startswith("_") else f.name): f.default - for f in attr.fields(klass) - } - ) - - self.input_names = [ - field.name - for field in attr.fields(klass) - if field.name not in ["_func", "_graph_checksums"] - ] - - if inputs: - if isinstance(inputs, dict): - # selecting items that are in input_names (ignoring fields that are not in input_spec) - inputs = {k: v for k, v in inputs.items() if k in self.input_names} - # TODO: this needs to finished and tested after #305 - elif Path(inputs).is_file(): - inputs = json.loads(Path(inputs).read_text()) - # TODO: this needs to finished and tested after #305 - elif isinstance(inputs, str): - if self._input_sets is None or inputs not in self._input_sets: - raise ValueError(f"Unknown input set {inputs!r}") - inputs = self._input_sets[inputs] - - self.inputs = attr.evolve(self.inputs, **inputs) - - # checking if metadata is set properly - self.inputs.check_metadata() - # dictionary to save the connections with lazy fields - self.inp_lf = {} - self.state = None - # container dimensions provided by the user - self.cont_dim = cont_dim - # container dimension for inner input if needed (e.g. for inner splitter) - self._inner_cont_dim = {} - self._output = {} - self._result = {} - # flag that says if node finished all jobs - self._done = False - if self._input_sets is None: - self._input_sets = {} - - self.audit = Audit( - audit_flags=audit_flags, - messengers=messengers, - messenger_args=messenger_args, - develop=develop, - ) - self.cache_dir = cache_dir - self.cache_locations = cache_locations - self.allow_cache_override = True - self._checksum = None - self._uid = uuid4().hex - # if True the results are not checked (does not propagate to nodes) - self.task_rerun = rerun - - self.plugin = None - self.hooks = TaskHook() - self._errored = False - self._lzout = None - - def __str__(self): - return self.name - - def __getstate__(self): - state = self.__dict__.copy() - state["input_spec"] = cp.dumps(state["input_spec"]) - state["output_spec"] = cp.dumps(state["output_spec"]) - inputs = {} - for k, v in attr.asdict(state["inputs"], recurse=False).items(): - if k.startswith("_"): - k = k[1:] - inputs[k] = v - state["inputs"] = inputs - return state - - def __setstate__(self, state): - state["input_spec"] = cp.loads(state["input_spec"]) - state["output_spec"] = cp.loads(state["output_spec"]) - state["inputs"] = make_klass(state["input_spec"])(**state["inputs"]) - self.__dict__.update(state) - - @cached_property - def lzout(self): - return LazyOut(self) - - def help(self, returnhelp=False): - """Print class help.""" - help_obj = print_help(self) - if returnhelp: - return help_obj - - @property - def version(self): - """Get version of this task structure.""" - return self._version - - @property - def errored(self): - """Check if the task has raised an error""" - return self._errored - - @property - def checksum(self): - """Calculates the unique checksum of the task. 
- Used to create specific directory name for task that are run; - and to create nodes checksums needed for graph checksums - (before the tasks have inputs etc.) - """ - input_hash = self.inputs.hash - if self.state is None: - self._checksum = create_checksum(self.__class__.__name__, input_hash) - else: - splitter_hash = hash_function(self.state.splitter) - self._checksum = create_checksum( - self.__class__.__name__, hash_function([input_hash, splitter_hash]) - ) - return self._checksum - - def checksum_states(self, state_index=None): - """ - Calculate a checksum for the specific state or all of the states of the task. - Replaces lists in the inputs fields with a specific values for states. - Used to recreate names of the task directories, - - Parameters - ---------- - state_index : - TODO - - """ - if is_workflow(self) and self.inputs._graph_checksums is attr.NOTHING: - self.inputs._graph_checksums = { - nd.name: nd.checksum for nd in self.graph_sorted - } - - if state_index is not None: - inputs_copy = copy(self.inputs) - for key, ind in self.state.inputs_ind[state_index].items(): - val = self._extract_input_el( - inputs=self.inputs, inp_nm=key.split(".")[1], ind=ind - ) - setattr(inputs_copy, key.split(".")[1], val) - # setting files_hash again in case it was cleaned by setting specific element - # that might be important for outer splitter of input variable with big files - # the file can be changed with every single index even if there are only two files - input_hash = inputs_copy.hash - if is_workflow(self): - con_hash = hash_function(self._connections) - # TODO: hash list is not used - hash_list = [input_hash, con_hash] # noqa: F841 - checksum_ind = create_checksum( - self.__class__.__name__, self._checksum_wf(input_hash) - ) - else: - checksum_ind = create_checksum(self.__class__.__name__, input_hash) - return checksum_ind - else: - checksum_list = [] - if not hasattr(self.state, "inputs_ind"): - self.state.prepare_states(self.inputs, cont_dim=self.cont_dim) - self.state.prepare_inputs() - for ind in range(len(self.state.inputs_ind)): - checksum_list.append(self.checksum_states(state_index=ind)) - return checksum_list - - @property - def uid(self): - """the unique id number for the task - It will be used to create unique names for slurm scripts etc. - without a need to run checksum - """ - return self._uid - - def set_state(self, splitter, combiner=None): - """ - Set a particular state on this task. - - Parameters - ---------- - splitter : - TODO - combiner : - TODO - - """ - if splitter is not None: - self.state = state.State( - name=self.name, splitter=splitter, combiner=combiner - ) - else: - self.state = None - return self.state - - @property - def output_names(self): - """Get the names of the outputs from the task's output_spec - (not everything has to be generated, see generated_output_names). - """ - return [f.name for f in attr.fields(make_klass(self.output_spec))] - - @property - def generated_output_names(self): - """Get the names of the outputs generated by the task. - If the spec doesn't have generated_output_names method, - it uses output_names. 
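The checksum machinery removed above reduces to hashing the task's inputs (folding in the splitter hash when the task has a state) and prefixing the class name; the resulting string names the task's cache directory. A loose illustration of that idea using a hypothetical helper, not the actual create_checksum implementation:

from pydra.utils.hash import hash_function

def illustrative_checksum(class_name: str, inputs, splitter=None) -> str:
    # Hash the inputs; fold in the splitter hash when a state is set.
    input_hash = hash_function(inputs)
    if splitter is not None:
        input_hash = hash_function([input_hash, hash_function(splitter)])
    # The resulting string is used as the task's cache-directory name.
    return f"{class_name}_{input_hash}"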
- The results depends on the input provided to the task - """ - output_klass = make_klass(self.output_spec) - if hasattr(output_klass, "generated_output_names"): - output = output_klass( - **{f.name: attr.NOTHING for f in attr.fields(output_klass)} - ) - # using updated input (after filing the templates) - _inputs = deepcopy(self.inputs) - modified_inputs = template_update(_inputs, self.output_dir) - if modified_inputs: - _inputs = attr.evolve(_inputs, **modified_inputs) - - return output.generated_output_names( - inputs=_inputs, output_dir=self.output_dir - ) - else: - return self.output_names - - @property - def can_resume(self): - """Whether the task accepts checkpoint-restart.""" - return self._can_resume - - @abc.abstractmethod - def _run_task(self, environment=None): - pass - - @property - def cache_dir(self): - """Get the location of the cache directory.""" - return self._cache_dir - - @cache_dir.setter - def cache_dir(self, location): - if location is not None: - self._cache_dir = Path(location).resolve() - self._cache_dir.mkdir(parents=False, exist_ok=True) - else: - self._cache_dir = mkdtemp() - self._cache_dir = Path(self._cache_dir).resolve() - - @property - def cache_locations(self): - """Get the list of cache sources.""" - return self._cache_locations + ensure_list(self._cache_dir) - - @cache_locations.setter - def cache_locations(self, locations): - if locations is not None: - self._cache_locations = [Path(loc) for loc in ensure_list(locations)] - else: - self._cache_locations = [] - - @property - def output_dir(self): - """Get the filesystem path where outputs will be written.""" - if self.state: - return [self._cache_dir / checksum for checksum in self.checksum_states()] - return self._cache_dir / self.checksum - - @property - def cont_dim(self): - # adding inner_cont_dim to the general container_dimension provided by the users - cont_dim_all = deepcopy(self._cont_dim) - for k, v in self._inner_cont_dim.items(): - cont_dim_all[k] = cont_dim_all.get(k, 1) + v - return cont_dim_all - - @cont_dim.setter - def cont_dim(self, cont_dim): - if cont_dim is None: - self._cont_dim = {} - else: - self._cont_dim = cont_dim - - def __call__( - self, - submitter=None, - plugin=None, - plugin_kwargs=None, - rerun=False, - environment=None, - **kwargs, - ): - """Make tasks callable themselves.""" - from .submitter import Submitter - - if submitter and plugin: - raise Exception("Specify submitter OR plugin, not both") - elif submitter: - pass - # if there is plugin provided or the task is a Workflow or has a state, - # the submitter will be created using provided plugin, self.plugin or "cf" - elif plugin or self.state or is_workflow(self): - plugin = plugin or self.plugin or "cf" - if plugin_kwargs is None: - plugin_kwargs = {} - submitter = Submitter(plugin=plugin, **plugin_kwargs) - - if submitter: - with submitter as sub: - self.inputs = attr.evolve(self.inputs, **kwargs) - res = sub(self, environment=environment) - else: # tasks without state could be run without a submitter - res = self._run(rerun=rerun, environment=environment, **kwargs) - return res - - def _modify_inputs(self): - """This method modifies the inputs of the task ahead of its execution: - - links/copies upstream files and directories into the destination tasks - working directory as required select state array values corresponding to - state index (it will try to leave them where they are unless specified or - they are on different file systems) - - resolve template values (e.g. 
output_file_template) - - deepcopy all inputs to guard against in-place changes during the task's - execution (they will be replaced after the task's execution with the - original inputs to ensure the tasks checksums are consistent) - """ - orig_inputs = { - k: v - for k, v in attr.asdict(self.inputs, recurse=False).items() - if not k.startswith("_") - } - map_copyfiles = {} - input_fields = attr.fields(type(self.inputs)) - for name, value in orig_inputs.items(): - fld = getattr(input_fields, name) - copy_mode, copy_collation = parse_copyfile( - fld, default_collation=self.DEFAULT_COPY_COLLATION - ) - if value is not attr.NOTHING and TypeParser.contains_type( - FileSet, fld.type - ): - copied_value = copy_nested_files( - value=value, - dest_dir=self.output_dir, - mode=copy_mode, - collation=copy_collation, - supported_modes=self.SUPPORTED_COPY_MODES, - ) - if value is not copied_value: - map_copyfiles[name] = copied_value - modified_inputs = template_update( - self.inputs, self.output_dir, map_copyfiles=map_copyfiles - ) - assert all(m in orig_inputs for m in modified_inputs), ( - "Modified inputs contain fields not present in original inputs. " - "This is likely a bug." - ) - for name, orig_value in orig_inputs.items(): - try: - value = modified_inputs[name] - except KeyError: - # Ensure we pass a copy not the original just in case inner - # attributes are modified during execution - value = deepcopy(orig_value) - setattr(self.inputs, name, value) - return orig_inputs - - def _populate_filesystem(self, checksum, output_dir): - """ - Invoked immediately after the lockfile is generated, this function: - - Creates the cache file - - Clears existing outputs if `can_resume` is False - - Generates a fresh output directory - - Created as an attempt to simplify overlapping `Task`|`Workflow` behaviors. 
- """ - # adding info file with the checksum in case the task was cancelled - # and the lockfile has to be removed - with open(self.cache_dir / f"{self.uid}_info.json", "w") as jsonfile: - json.dump({"checksum": checksum}, jsonfile) - if not self.can_resume and output_dir.exists(): - shutil.rmtree(output_dir) - output_dir.mkdir(parents=False, exist_ok=self.can_resume) - - def _run(self, rerun=False, environment=None, **kwargs): - self.inputs = attr.evolve(self.inputs, **kwargs) - self.inputs.check_fields_input_spec() - - checksum = self.checksum - output_dir = self.output_dir - lockfile = self.cache_dir / (checksum + ".lock") - # Eagerly retrieve cached - see scenarios in __init__() - self.hooks.pre_run(self) - logger.debug("'%s' is attempting to acquire lock on %s", self.name, lockfile) - with SoftFileLock(lockfile): - if not (rerun or self.task_rerun): - result = self.result() - if result is not None and not result.errored: - return result - cwd = os.getcwd() - self._populate_filesystem(checksum, output_dir) - os.chdir(output_dir) - orig_inputs = self._modify_inputs() - result = Result(output=None, runtime=None, errored=False) - self.hooks.pre_run_task(self) - self.audit.start_audit(odir=output_dir) - if self.audit.audit_check(AuditFlag.PROV): - self.audit.audit_task(task=self) - try: - self.audit.monitor() - self._run_task(environment=environment) - result.output = self._collect_outputs(output_dir=output_dir) - except Exception: - etype, eval, etr = sys.exc_info() - traceback = format_exception(etype, eval, etr) - record_error(output_dir, error=traceback) - result.errored = True - raise - finally: - self.hooks.post_run_task(self, result) - self.audit.finalize_audit(result) - save(output_dir, result=result, task=self) - # removing the additional file with the checksum - (self.cache_dir / f"{self.uid}_info.json").unlink() - # Restore original values to inputs - for field_name, field_value in orig_inputs.items(): - setattr(self.inputs, field_name, field_value) - os.chdir(cwd) - self.hooks.post_run(self, result) - # Check for any changes to the input hashes that have occurred during the execution - # of the task - self._check_for_hash_changes() - return result - - def _collect_outputs(self, output_dir): - output_klass = make_klass(self.output_spec) - output = output_klass( - **{f.name: attr.NOTHING for f in attr.fields(output_klass)} - ) - other_output = output.collect_additional_outputs( - self.inputs, output_dir, self.output_ - ) - return attr.evolve(output, **self.output_, **other_output) - - def split( - self, - splitter: ty.Union[str, ty.List[str], ty.Tuple[str, ...], None] = None, - overwrite: bool = False, - cont_dim: ty.Optional[dict] = None, - **inputs, - ): - """ - Run this task parametrically over lists of split inputs. - - Parameters - ---------- - splitter : str or list[str] or tuple[str] or None - the fields which to split over. If splitting over multiple fields, lists of - fields are interpreted as outer-products and tuples inner-products. If None, - then the fields to split are taken from the keyword-arg names. - overwrite : bool, optional - whether to overwrite an existing split on the node, by default False - cont_dim : dict, optional - Container dimensions for specific inputs, used in the splitter. - If input name is not in cont_dim, it is assumed that the input values has - a container dimension of 1, so only the most outer dim will be used for splitting. 
- **split_inputs - fields to split over, will automatically be wrapped in a StateArray object - and passed to the node inputs - - Returns - ------- - self : TaskBase - a reference to the task - """ - if self._lzout: - raise RuntimeError( - f"Cannot split {self} as its output interface has already been accessed" - ) - if splitter is None and inputs: - splitter = list(inputs) - elif splitter: - missing = set(hlpst.unwrap_splitter(splitter)) - set(inputs) - missing = [m for m in missing if not m.startswith("_")] - if missing: - raise ValueError( - f"Split is missing values for the following fields {list(missing)}" - ) - splitter = hlpst.add_name_splitter(splitter, self.name) - # if user want to update the splitter, overwrite has to be True - if self.state and not overwrite and self.state.splitter != splitter: - raise Exception( - "splitter has been already set, " - "if you want to overwrite it - use overwrite=True" - ) - if cont_dim: - for key, vel in cont_dim.items(): - self._cont_dim[f"{self.name}.{key}"] = vel - if inputs: - new_inputs = {} - split_inputs = set( - f"{self.name}.{n}" if "." not in n else n - for n in hlpst.unwrap_splitter(splitter) - if not n.startswith("_") - ) - for inpt_name, inpt_val in inputs.items(): - new_val: ty.Any - if f"{self.name}.{inpt_name}" in split_inputs: # type: ignore - if isinstance(inpt_val, LazyField): - new_val = inpt_val.split(splitter) - elif isinstance(inpt_val, ty.Iterable) and not isinstance( - inpt_val, (ty.Mapping, str) - ): - new_val = StateArray(inpt_val) - else: - raise TypeError( - f"Could not split {inpt_val} as it is not a sequence type" - ) - else: - new_val = inpt_val - new_inputs[inpt_name] = new_val - self.inputs = attr.evolve(self.inputs, **new_inputs) - if not self.state or splitter != self.state.splitter: - self.set_state(splitter) - return self - - def combine( - self, - combiner: ty.Union[ty.List[str], str], - overwrite: bool = False, # **kwargs - ): - """ - Combine inputs parameterized by one or more previous tasks. - - Parameters - ---------- - combiner : list[str] or str - the - overwrite : bool - whether to overwrite an existing combiner on the node - **kwargs : dict[str, Any] - values for the task that will be "combined" before they are provided to the - node - - Returns - ------- - self : TaskBase - a reference to the task - """ - if self._lzout: - raise RuntimeError( - f"Cannot combine {self} as its output interface has already been " - "accessed" - ) - if not isinstance(combiner, (str, list)): - raise Exception("combiner has to be a string or a list") - combiner = hlpst.add_name_combiner(ensure_list(combiner), self.name) - if ( - self.state - and self.state.combiner - and combiner != self.state.combiner - and not overwrite - ): - raise Exception( - "combiner has been already set, " - "if you want to overwrite it - use overwrite=True" - ) - if not self.state: - self.split(splitter=None) - # a task can have a combiner without a splitter - # if is connected to one with a splitter; - # self.fut_combiner will be used later as a combiner - self.fut_combiner = combiner - else: # self.state and not self.state.combiner - self.combiner = combiner - self.set_state(splitter=self.state.splitter, combiner=self.combiner) - return self - - def _extract_input_el(self, inputs, inp_nm, ind): - """ - Extracting element of the inputs taking into account - container dimension of the specific element that can be set in self.cont_dim. 
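To make the splitter semantics described in the removed docstrings concrete: lists of field names split as outer products, tuples as inner products, and combine() collapses a previously split dimension. A hedged sketch of the legacy API being deleted here (the `Threshold` task and its fields are made-up placeholders):

task = Threshold(name="thr")        # hypothetical task with `in_file` and `level` inputs
task.split(
    splitter=["in_file", "level"],  # outer product over the two fields
    in_file=["a.nii", "b.nii"],
    level=[0.1, 0.5],
)
task.combine("level")               # collapse results over the `level` dimension
results = task(plugin="cf")         # run the four states with the concurrent-futures plugin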
- If input name is not in cont_dim, it is assumed that the input values has - a container dimension of 1, so only the most outer dim will be used for splitting. - If - """ - if f"{self.name}.{inp_nm}" in self.cont_dim: - return list( - hlpst.flatten( - ensure_list(getattr(inputs, inp_nm)), - max_depth=self.cont_dim[f"{self.name}.{inp_nm}"], - ) - )[ind] - else: - return getattr(inputs, inp_nm)[ind] - - def get_input_el(self, ind): - """Collect all inputs required to run the node (for specific state element).""" - # TODO: doesn't work properly for more cmplicated wf (check if still an issue) - input_ind = self.state.inputs_ind[ind] - inputs_dict = {} - for inp in set(self.input_names): - if f"{self.name}.{inp}" in input_ind: - inputs_dict[inp] = self._extract_input_el( - inputs=self.inputs, - inp_nm=inp, - ind=input_ind[f"{self.name}.{inp}"], - ) - return inputs_dict - # else: - # # todo it never gets here - # breakpoint() - # inputs_dict = {inp: getattr(self.inputs, inp) for inp in self.input_names} - # return None, inputs_dict - - def pickle_task(self): - """Pickling the tasks with full inputs""" - pkl_files = self.cache_dir / "pkl_files" - pkl_files.mkdir(exist_ok=True, parents=True) - task_main_path = pkl_files / f"{self.name}_{self.uid}_task.pklz" - save(task_path=pkl_files, task=self, name_prefix=f"{self.name}_{self.uid}") - return task_main_path - - @property - def done(self): - """Check whether the tasks has been finalized and all outputs are stored.""" - # if any of the field is lazy, there is no need to check results - if is_lazy(self.inputs): - return False - _result = self.result() - if self.state: - # TODO: only check for needed state result - if _result and all(_result): - if self.state.combiner and isinstance(_result[0], list): - for res_l in _result: - if any([res.errored for res in res_l]): - raise ValueError(f"Task {self.name} raised an error") - return True - else: - if any([res.errored for res in _result]): - raise ValueError(f"Task {self.name} raised an error") - return True - # checking if self.result() is not an empty list only because - # the states_ind is an empty list (input field might be an empty list) - elif ( - _result == [] - and hasattr(self.state, "states_ind") - and self.state.states_ind == [] - ): - return True - else: - if _result: - if _result.errored: - self._errored = True - raise ValueError(f"Task {self.name} raised an error") - else: - return True - return False - - def _combined_output(self, return_inputs=False): - combined_results = [] - for gr, ind_l in self.state.final_combined_ind_mapping.items(): - combined_results_gr = [] - for ind in ind_l: - result = load_result(self.checksum_states(ind), self.cache_locations) - if result is None: - return None - if return_inputs is True or return_inputs == "val": - result = (self.state.states_val[ind], result) - elif return_inputs == "ind": - result = (self.state.states_ind[ind], result) - combined_results_gr.append(result) - combined_results.append(combined_results_gr) - if len(combined_results) == 1 and self.state.splitter_rpn_final == []: - # in case it's full combiner, removing the nested structure - return combined_results[0] - else: - return combined_results - - def result(self, state_index=None, return_inputs=False): - """ - Retrieve the outcomes of this particular task. 
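The result() method whose parameters are documented just below retrieved cached outcomes either for all states or for a single state index, optionally paired with the inputs that produced them. A brief, hedged usage sketch, assuming `task` is a previously run task with a splitter:

results = task.result()                                               # one Result per state
inputs_val, first = task.result(state_index=0, return_inputs="val")   # pair a result with its input values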
- - Parameters - ---------- - state_index : :obj: `int` - index of the element for task with splitter and multiple states - return_inputs : :obj: `bool`, :obj:`str` - if True or "val" result is returned together with values of the input fields, - if "ind" result is returned together with indices of the input fields - - Returns - ------- - result : Result - the result of the task - """ - # TODO: check if result is available in load_result and - # return a future if not - if self.errored: - return Result(output=None, runtime=None, errored=True) - if self.state: - if state_index is None: - # if state_index=None, collecting all results - if self.state.combiner: - return self._combined_output(return_inputs=return_inputs) - else: - results = [] - for ind in range(len(self.state.inputs_ind)): - checksum = self.checksum_states(state_index=ind) - result = load_result(checksum, self.cache_locations) - if result is None: - return None - results.append(result) - if return_inputs is True or return_inputs == "val": - return list(zip(self.state.states_val, results)) - elif return_inputs == "ind": - return list(zip(self.state.states_ind, results)) - else: - return results - else: # state_index is not None - if self.state.combiner: - return self._combined_output(return_inputs=return_inputs)[ - state_index - ] - result = load_result( - self.checksum_states(state_index), self.cache_locations - ) - if return_inputs is True or return_inputs == "val": - return (self.state.states_val[state_index], result) - elif return_inputs == "ind": - return (self.state.states_ind[state_index], result) - else: - return result - else: - if state_index is not None: - raise ValueError("Task does not have a state") - checksum = self.checksum - result = load_result(checksum, self.cache_locations) - if result and result.errored: - self._errored = True - if return_inputs is True or return_inputs == "val": - inputs_val = { - f"{self.name}.{inp}": getattr(self.inputs, inp) - for inp in self.input_names - } - return (inputs_val, result) - elif return_inputs == "ind": - inputs_ind = {f"{self.name}.{inp}": None for inp in self.input_names} - return (inputs_ind, result) - else: - return result - - def _reset(self): - """Reset the connections between inputs and LazyFields.""" - for field in attr_fields(self.inputs): - if field.name in self.inp_lf: - setattr(self.inputs, field.name, self.inp_lf[field.name]) - if is_workflow(self): - for task in self.graph.nodes: - task._reset() - - def _check_for_hash_changes(self): - hash_changes = self.inputs.hash_changes() - details = "" - for changed in hash_changes: - field = getattr(attr.fields(type(self.inputs)), changed) - val = getattr(self.inputs, changed) - field_type = type(val) - if issubclass(field.type, FileSet): - details += ( - f"- {changed}: value passed to the {field.type} field is of type " - f"{field_type} ('{val}'). If it is intended to contain output data " - "then the type of the field in the interface class should be changed " - "to `pathlib.Path`. Otherwise, if the field is intended to be an " - "input field but it gets altered by the task in some way, then the " - "'copyfile' flag should be set to 'copy' in the field metadata of " - "the task interface class so copies of the files/directories in it " - "are passed to the task instead.\n" - ) - else: - details += ( - f"- {changed}: the {field_type} object passed to the {field.type}" - f"field appears to have an unstable hash. 
This could be due to " - "a stochastic/non-thread-safe attribute(s) of the object\n\n" - f"The {field.type}.__bytes_repr__() method can be implemented to " - "bespoke hashing methods based only on the stable attributes for " - f"the `{field_type.__module__}.{field_type.__name__}` type. " - f"See pydra/utils/hash.py for examples. Value: {val}\n" - ) - if hash_changes: - raise RuntimeError( - f"Input field hashes have changed during the execution of the " - f"'{self.name}' {type(self).__name__}.\n\n{details}" - ) - logger.debug( - "Input values and hashes for '%s' %s node:\n%s\n%s", - self.name, - type(self).__name__, - self.inputs, - self.inputs._hashes, - ) - - SUPPORTED_COPY_MODES = FileSet.CopyMode.any - DEFAULT_COPY_COLLATION = FileSet.CopyCollation.any - - -def _sanitize_spec( - spec: ty.Union[ - SpecInfo, ty.List[str], ty.Dict[str, ty.Type[ty.Any]], BaseSpec, None - ], - wf_name: str, - spec_name: str, - allow_empty: bool = False, -) -> SpecInfo: - """Makes sure the provided input specifications are valid. - - If the input specification is a list of strings, this will - build a proper SpecInfo object out of it. - - Parameters - ---------- - spec : SpecInfo or List[str] or Dict[str, type] - Specification to be sanitized. - wf_name : str - The name of the workflow for which the input specifications - spec_name : str - name given to generated SpecInfo object - - Returns - ------- - spec : SpecInfo - Sanitized specification. - - Raises - ------ - ValueError - If provided `spec` is None. - """ - graph_checksum_input = ("_graph_checksums", ty.Any) - if spec: - if isinstance(spec, SpecInfo): - if BaseSpec not in spec.bases: - raise ValueError("Provided SpecInfo must have BaseSpec as its base.") - if "_graph_checksums" not in {f[0] for f in spec.fields}: - spec.fields.insert(0, graph_checksum_input) - return spec - else: - base = BaseSpec - if isinstance(spec, list): - typed_spec = zip(spec, itertools.repeat(ty.Any)) - elif isinstance(spec, dict): - typed_spec = spec.items() # type: ignore - elif isinstance(spec, BaseSpec): - base = spec - typed_spec = [] - else: - raise TypeError( - f"Unrecognised spec type, {spec}, should be SpecInfo, list or dict" - ) - return SpecInfo( - name=spec_name, - fields=[graph_checksum_input] - + [ - ( - nm, - attr.ib( - type=tp, - metadata={ - "help_string": f"{nm} input from {wf_name} workflow" - }, - ), - ) - for nm, tp in typed_spec - ], - bases=(base,), - ) - elif allow_empty: - return None - else: - raise ValueError(f'Empty "{spec_name}" spec provided to Workflow {wf_name}.') - - -class Workflow(TaskBase): - """A composite task with structure of computational graph.""" - - def __init__( - self, - name, - audit_flags: AuditFlag = AuditFlag.NONE, - cache_dir=None, - cache_locations=None, - input_spec: ty.Optional[ - ty.Union[ty.List[ty.Text], ty.Dict[ty.Text, ty.Type[ty.Any]], SpecInfo] - ] = None, - cont_dim=None, - messenger_args=None, - messengers=None, - output_spec: ty.Optional[ - ty.Union[ty.List[str], ty.Dict[str, type], SpecInfo, BaseSpec] - ] = None, - rerun=False, - propagate_rerun=True, - **kwargs, - ): - """ - Initialize a workflow. - - Parameters - ---------- - name : :obj:`str` - Unique name of this node - audit_flags : :class:`AuditFlag`, optional - Configure provenance tracking. Default is no provenance tracking. - See available flags at :class:`~pydra.utils.messenger.AuditFlag`. - cache_dir : :obj:`os.pathlike` - Set a custom directory of previously computed nodes. 
- cache_locations : - TODO - inputs : :obj:`typing.Text`, or :class:`File`, or :obj:`dict`, or `None`. - Set particular inputs to this node. - cont_dim : :obj:`dict`, or `None` - Container dimensions for input fields, - if any of the container should be treated as a container - messenger_args : - TODO - messengers : - TODO - output_spec : - TODO - - """ - self.input_spec = _sanitize_spec(input_spec, name, "Inputs") - self.output_spec = _sanitize_spec( - output_spec, name, "Outputs", allow_empty=True - ) - - if name in dir(self): - raise ValueError( - "Cannot use names of attributes or methods as workflow name" - ) - self.name = name - - super().__init__( - name=name, - inputs=kwargs, - cont_dim=cont_dim, - cache_dir=cache_dir, - cache_locations=cache_locations, - audit_flags=audit_flags, - messengers=messengers, - messenger_args=messenger_args, - rerun=rerun, - ) - - self.graph = DiGraph(name=name) - self.name2obj = {} - self._lzin = None - self._pre_split = ( - False # To signify if the workflow has been split on task load or not - ) - - # store output connections - self._connections = None - # propagating rerun if task_rerun=True - self.propagate_rerun = propagate_rerun - - @cached_property - def lzin(self): - return LazyIn(self) - - def __getattr__(self, name): - if name in self.name2obj: - return self.name2obj[name] - return self.__getattribute__(name) - - @property - def nodes(self): - """Get the list of node names.""" - return self.name2obj.values() - - @property - def graph_sorted(self): - """Get a sorted graph representation of the workflow.""" - return self.graph.sorted_nodes - - @property - def checksum(self): - """Calculates the unique checksum of the task. - Used to create specific directory name for task that are run; - and to create nodes checksums needed for graph checksums - (before the tasks have inputs etc.) - """ - # if checksum is called before run the _graph_checksums is not ready - if is_workflow(self) and self.inputs._graph_checksums is attr.NOTHING: - self.inputs._graph_checksums = { - nd.name: nd.checksum for nd in self.graph_sorted - } - - input_hash = self.inputs.hash - if not self.state: - self._checksum = create_checksum( - self.__class__.__name__, self._checksum_wf(input_hash) - ) - else: - self._checksum = create_checksum( - self.__class__.__name__, - self._checksum_wf(input_hash, with_splitter=True), - ) - return self._checksum - - def _checksum_wf(self, input_hash, with_splitter=False): - """creating hash value for workflows - includes connections and splitter if with_splitter is True - """ - connection_hash = hash_function(self._connections) - hash_list = [input_hash, connection_hash] - if with_splitter and self.state: - # including splitter in the hash - splitter_hash = hash_function(self.state.splitter) - hash_list.append(splitter_hash) - return hash_function(hash_list) - - def add(self, task): - """ - Add a task to the workflow. - - Parameters - ---------- - task : :class:`TaskBase` - The task to be added. 
- - """ - if task.name in dir(self): - raise ValueError( - "Cannot use names of workflow attributes or methods as task name" - ) - if task.name in self.name2obj: - raise ValueError( - "Another task named {} is already added to the workflow".format( - task.name - ) - ) - self.name2obj[task.name] = task - - if not is_task(task): - raise ValueError(f"Unknown workflow element: {task!r}") - self.graph.add_nodes(task) - self._last_added = task - logger.debug(f"Added {task}") - return self - - def create_connections(self, task, detailed=False): - """ - Add and connect a particular task to existing nodes in the workflow. - - Parameters - ---------- - task : :class:`TaskBase` - The task to be added. - detailed : :obj:`bool` - If True, `add_edges_description` is run for self.graph to add - a detailed descriptions of the connections (input/output fields names) - """ - # TODO: create connection is run twice - other_states = {} - for field in attr_fields(task.inputs): - val = getattr(task.inputs, field.name) - if isinstance(val, LazyField): - # saving all connections with LazyFields - task.inp_lf[field.name] = val - # adding an edge to the graph if task id expecting output from a different task - if val.name != self.name: - # checking if the connection is already in the graph - if (getattr(self, val.name), task) not in self.graph.edges: - self.graph.add_edges((getattr(self, val.name), task)) - if detailed: - self.graph.add_edges_description( - (task.name, field.name, val.name, val.field) - ) - logger.debug("Connecting %s to %s", val.name, task.name) - # adding a state from the previous task to other_states - if ( - getattr(self, val.name).state - and getattr(self, val.name).state.splitter_rpn_final - ): - # variables that are part of inner splitters should be treated as a containers - if ( - task.state - and f"{task.name}.{field.name}" in task.state.splitter - ): - task._inner_cont_dim[f"{task.name}.{field.name}"] = 1 - # adding task_name: (task.state, [a field from the connection] - if val.name not in other_states: - other_states[val.name] = ( - getattr(self, val.name).state, - [field.name], - ) - else: - # if the task already exist in other_state, - # additional field name should be added to the list of fields - other_states[val.name][1].append(field.name) - else: # LazyField with the wf input - # connections with wf input should be added to the detailed graph description - if detailed: - self.graph.add_edges_description( - (task.name, field.name, val.name, val.field) - ) - - # if task has connections state has to be recalculated - if other_states: - if hasattr(task, "fut_combiner"): - combiner = task.fut_combiner - else: - combiner = None - - if task.state: - task.state.update_connections( - new_other_states=other_states, new_combiner=combiner - ) - else: - task.state = state.State( - task.name, - splitter=None, - other_states=other_states, - combiner=combiner, - ) - - async def _run(self, submitter=None, rerun=False, **kwargs): - # output_spec needs to be set using set_output or at workflow initialization - if self.output_spec is None: - raise ValueError( - "Workflow output cannot be None, use set_output to define output(s)" - ) - # creating connections that were defined after adding tasks to the wf - self._connect_and_propagate_to_tasks( - propagate_rerun=self.task_rerun and self.propagate_rerun - ) - - checksum = self.checksum - output_dir = self.output_dir - lockfile = self.cache_dir / (checksum + ".lock") - self.hooks.pre_run(self) - logger.debug( - "'%s' is attempting to acquire lock on %s 
with Pydra lock", - self.name, - lockfile, - ) - async with PydraFileLock(lockfile): - if not (rerun or self.task_rerun): - result = self.result() - if result is not None and not result.errored: - return result - cwd = os.getcwd() - self._populate_filesystem(checksum, output_dir) - result = Result(output=None, runtime=None, errored=False) - self.hooks.pre_run_task(self) - self.audit.start_audit(odir=output_dir) - try: - self.audit.monitor() - await self._run_task(submitter, rerun=rerun) - result.output = self._collect_outputs() - except Exception: - etype, eval, etr = sys.exc_info() - traceback = format_exception(etype, eval, etr) - record_error(output_dir, error=traceback) - result.errored = True - self._errored = True - raise - finally: - self.hooks.post_run_task(self, result) - self.audit.finalize_audit(result=result) - save(output_dir, result=result, task=self) - # removing the additional file with the checksum - (self.cache_dir / f"{self.uid}_info.json").unlink() - os.chdir(cwd) - self.hooks.post_run(self, result) - # Check for any changes to the input hashes that have occurred during the execution - # of the task - self._check_for_hash_changes() - return result - - async def _run_task(self, submitter, rerun=False, environment=None): - if not submitter: - raise Exception("Submitter should already be set.") - for nd in self.graph.nodes: - if nd.allow_cache_override: - nd.cache_dir = self.cache_dir - # at this point Workflow is stateless so this should be fine - await submitter.expand_workflow(self, rerun=rerun) - - def set_output( - self, - connections: ty.Union[ - ty.Tuple[str, LazyField], ty.List[ty.Tuple[str, LazyField]] - ], - ): - """ - Set outputs of the workflow by linking them with lazy outputs of tasks - - Parameters - ---------- - connections : tuple[str, LazyField] or list[tuple[str, LazyField]] or None - single or list of tuples linking the name of the output to a lazy output - of a task in the workflow. 
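Pulling the removed Workflow API together: nodes were attached with add(), wired through lzin/lzout lazy fields, and exposed via set_output(). A rough sketch of the legacy usage (add_one and double stand in for real task instances):

from pydra.engine import Workflow  # export removed in the __init__ change above

wf = Workflow(name="wf", input_spec=["x"], x=2)
wf.add(add_one(name="a", x=wf.lzin.x))        # consume the workflow input lazily
wf.add(double(name="b", x=wf.a.lzout.out))    # connect to the upstream node's output
wf.set_output([("doubled", wf.b.lzout.out)])  # expose a node output as a workflow output
result = wf(plugin="cf")
print(result.output.doubled)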
- """ - from ..utils.typing import TypeParser - - if self._connections is None: - self._connections = [] - if isinstance(connections, tuple) and len(connections) == 2: - new_connections = [connections] - elif isinstance(connections, list) and all( - [len(el) == 2 for el in connections] - ): - new_connections = connections - elif isinstance(connections, dict): - new_connections = list(connections.items()) - else: - raise TypeError( - "Connections can be a 2-elements tuple, a list of these tuples, or dictionary" - ) - # checking if a new output name is already in the connections - connection_names = [name for name, _ in self._connections] - if self.output_spec: - output_types = { - a.name: a.type for a in attr.fields(make_klass(self.output_spec)) - } - else: - output_types = {} - # Check for type matches with explicitly defined outputs - conflicting = [] - type_mismatches = [] - for conn_name, lazy_field in new_connections: - if conn_name in connection_names: - conflicting.append(conn_name) - try: - output_type = output_types[conn_name] - except KeyError: - pass - else: - if not TypeParser.matches_type(lazy_field.type, output_type): - type_mismatches.append((conn_name, output_type, lazy_field.type)) - if conflicting: - raise ValueError(f"the output names {conflicting} are already set") - if type_mismatches: - raise TypeError( - f"the types of the following outputs of {self} don't match their declared types: " - + ", ".join( - f"{n} (expected: {ex}, provided: {p})" - for n, ex, p in type_mismatches - ) - ) - self._connections += new_connections - fields = [] - for con in self._connections: - wf_out_nm, lf = con - task_nm, task_out_nm = lf.name, lf.field - if task_out_nm == "all_": - help_string = f"all outputs from {task_nm}" - fields.append((wf_out_nm, dict, {"help_string": help_string})) - else: - from ..utils.typing import TypeParser - - # getting information about the output field from the task output_spec - # providing proper type and some help string - task_output_spec = getattr(self, task_nm).output_spec - out_fld = attr.fields_dict(make_klass(task_output_spec))[task_out_nm] - help_string = ( - f"{out_fld.metadata.get('help_string', '')} (from {task_nm})" - ) - if TypeParser.get_origin(lf.type) is StateArray: - type_ = TypeParser.get_item_type(lf.type) - else: - type_ = lf.type - fields.append((wf_out_nm, type_, {"help_string": help_string})) - self.output_spec = SpecInfo(name="Output", fields=fields, bases=(BaseSpec,)) - logger.info("Added %s to %s", self.output_spec, self) - - def _collect_outputs(self): - output_klass = make_klass(self.output_spec) - output = output_klass( - **{f.name: attr.NOTHING for f in attr.fields(output_klass)} - ) - # collecting outputs from tasks - output_wf = {} - for name, val in self._connections: - if not isinstance(val, LazyField): - raise ValueError("all connections must be lazy") - try: - val_out = val.get_value(self) - output_wf[name] = val_out - except (ValueError, AttributeError) as e: - output_wf[name] = None - # checking if the tasks has predecessors that raises error - if isinstance(getattr(self, val.name)._errored, list): - raise ValueError( - f"Tasks {getattr(self, val.name)._errored} raised an error" - ) - else: - if isinstance(getattr(self, val.name).output_dir, list): - err_file = [ - el / "_error.pklz" - for el in getattr(self, val.name).output_dir - ] - if not all(e.exists() for e in err_file): - raise e - else: - err_file = getattr(self, val.name).output_dir / "_error.pklz" - if not Path(err_file).exists(): - raise e - raise ValueError( 
- f"Task {val.name} raised an error, full crash report is here: " - f"{err_file}" - ) - return attr.evolve(output, **output_wf) - - def create_dotfile(self, type="simple", export=None, name=None, output_dir=None): - """creating a graph - dotfile and optionally exporting to other formats""" - outdir = output_dir if output_dir is not None else self.cache_dir - if not name: - name = f"graph_{self.name}" - if type == "simple": - for task in self.graph.nodes: - self.create_connections(task) - dotfile = self.graph.create_dotfile_simple(outdir=outdir, name=name) - elif type == "nested": - for task in self.graph.nodes: - self.create_connections(task) - dotfile = self.graph.create_dotfile_nested(outdir=outdir, name=name) - elif type == "detailed": - # create connections with detailed=True - for task in self.graph.nodes: - self.create_connections(task, detailed=True) - # adding wf outputs - for wf_out, lf in self._connections: - self.graph.add_edges_description((self.name, wf_out, lf.name, lf.field)) - dotfile = self.graph.create_dotfile_detailed(outdir=outdir, name=name) - else: - raise Exception( - f"type of the graph can be simple, detailed or nested, " - f"but {type} provided" - ) - if not export: - return dotfile - else: - if export is True: - export = ["png"] - elif isinstance(export, str): - export = [export] - formatted_dot = [] - for ext in export: - formatted_dot.append(self.graph.export_graph(dotfile=dotfile, ext=ext)) - return dotfile, formatted_dot - - def _connect_and_propagate_to_tasks( - self, - *, - propagate_rerun=False, - override_task_caches=False, - ): - """ - Visit each node in the graph and create the connections. - Additionally checks if all tasks should be rerun. - """ - for task in self.graph.nodes: - self.create_connections(task) - # if workflow has task_rerun=True and propagate_rerun=True, - # it should be passed to the tasks - if propagate_rerun: - task.task_rerun = True - # if the task is a wf, than the propagate_rerun should be also set - if is_workflow(task): - task.propagate_rerun = True - - # ported from Submitter.__call__ - # TODO: no prepare state ? - if override_task_caches and task.allow_cache_override: - task.cache_dir = self.cache_dir - task.cache_locations = task._cache_locations + self.cache_locations - - -def is_task(obj): - """Check whether an object looks like a task.""" - return hasattr(obj, "_run_task") - - -def is_workflow(obj): - """Check whether an object is a :class:`Workflow` instance.""" - return isinstance(obj, Workflow) - - -def is_lazy(obj): - """Check whether an object has any field that is a Lazy Field""" - for f in attr_fields(obj): - if isinstance(getattr(obj, f.name), LazyField): - return True - return False diff --git a/pydra/engine/environments.py b/pydra/engine/environments.py deleted file mode 100644 index 0c57008058..0000000000 --- a/pydra/engine/environments.py +++ /dev/null @@ -1,157 +0,0 @@ -from .helpers import execute - -from pathlib import Path - - -class Environment: - """ - Base class for environments that are used to execute tasks. - Right now it is assumed that the environment, including container images, - are available and are not removed at the end - TODO: add setup and teardown methods - """ - - def setup(self): - pass - - def execute(self, task): - """ - Execute the task in the environment. - - Parameters - ---------- - task : TaskBase - the task to execute - - Returns - ------- - output - Output of the task. 
- """ - raise NotImplementedError - - def teardown(self): - pass - - -class Native(Environment): - """ - Native environment, i.e. the tasks are executed in the current python environment. - """ - - def execute(self, task): - keys = ["return_code", "stdout", "stderr"] - values = execute(task.command_args(), strip=task.strip) - output = dict(zip(keys, values)) - if output["return_code"]: - msg = f"Error running '{task.name}' task with {task.command_args()}:" - if output["stderr"]: - msg += "\n\nstderr:\n" + output["stderr"] - if output["stdout"]: - msg += "\n\nstdout:\n" + output["stdout"] - raise RuntimeError(msg) - return output - - -class Container(Environment): - """ - Base class for container environments used by Docker and Singularity. - - Parameters - ---------- - image : str - Name of the container image - tag : str - Tag of the container image - root : str - Base path for mounting host directories into the container - xargs : Union[str, List[str]] - Extra arguments to be passed to the container - """ - - def __init__(self, image, tag="latest", root="/mnt/pydra", xargs=None): - self.image = image - self.tag = tag - if xargs is None: - xargs = [] - elif isinstance(xargs, str): - xargs = xargs.split() - self.xargs = xargs - self.root = root - - def bind(self, loc, mode="ro"): - loc_abs = Path(loc).absolute() - return f"{loc_abs}:{self.root}{loc_abs}:{mode}" - - -class Docker(Container): - """Docker environment.""" - - def execute(self, task): - docker_img = f"{self.image}:{self.tag}" - # mounting all input locations - mounts = task.get_bindings(root=self.root) - - docker_args = [ - "docker", - "run", - "-v", - self.bind(task.cache_dir, "rw"), - *self.xargs, - ] - docker_args.extend( - " ".join( - [f"-v {key}:{val[0]}:{val[1]}" for (key, val) in mounts.items()] - ).split() - ) - docker_args.extend(["-w", f"{self.root}{task.output_dir}"]) - keys = ["return_code", "stdout", "stderr"] - - values = execute( - docker_args + [docker_img] + task.command_args(root=self.root), - strip=task.strip, - ) - output = dict(zip(keys, values)) - if output["return_code"]: - if output["stderr"]: - raise RuntimeError(output["stderr"]) - else: - raise RuntimeError(output["stdout"]) - return output - - -class Singularity(Container): - """Singularity environment.""" - - def execute(self, task): - singularity_img = f"{self.image}:{self.tag}" - # mounting all input locations - mounts = task.get_bindings(root=self.root) - - # todo adding xargsy etc - singularity_args = [ - "singularity", - "exec", - "-B", - self.bind(task.cache_dir, "rw"), - *self.xargs, - ] - singularity_args.extend( - " ".join( - [f"-B {key}:{val[0]}:{val[1]}" for (key, val) in mounts.items()] - ).split() - ) - singularity_args.extend(["--pwd", f"{self.root}{task.output_dir}"]) - keys = ["return_code", "stdout", "stderr"] - - values = execute( - singularity_args + [singularity_img] + task.command_args(root=self.root), - strip=task.strip, - ) - output = dict(zip(keys, values)) - if output["return_code"]: - if output["stderr"]: - raise RuntimeError(output["stderr"]) - else: - raise RuntimeError(output["stdout"]) - return output diff --git a/pydra/engine/graph.py b/pydra/engine/graph.py index bfa62e0764..9c867ea606 100644 --- a/pydra/engine/graph.py +++ b/pydra/engine/graph.py @@ -1,16 +1,32 @@ -"""Data structure to support :class:`~pydra.engine.core.Workflow` tasks.""" +"""Data structure to support :class:`~pydra.engine.workflow.Workflow` tasks.""" from copy import copy from pathlib import Path +import typing as ty +from collections import 
Counter import subprocess as sp +from pydra.utils.general import ensure_list, is_workflow -from .helpers import ensure_list +NodeType = ty.TypeVar("NodeType") -class DiGraph: +INPUTS_NODE_NAME = "__INPUTS__" +OUTPUTS_NODE_NAME = "__OUTPUTS__" + + +class DiGraph(ty.Generic[NodeType]): """A simple Directed Graph object.""" - def __init__(self, name=None, nodes=None, edges=None): + name: str + nodes: list[NodeType] + edges: list[tuple[NodeType, NodeType]] + + def __init__( + self, + name: str | None = None, + nodes: ty.Iterable[NodeType] | None = None, + edges: ty.Iterable[tuple[NodeType, NodeType]] | None = None, + ): """ Initialize a directed graph. @@ -32,6 +48,7 @@ def __init__(self, name=None, nodes=None, edges=None): self._sorted_nodes = None self._node_wip = [] self._nodes_details = {} + self._node_lookup = {} def copy(self): """ @@ -59,20 +76,40 @@ def copy(self): return new_graph @property - def nodes(self): + def nodes(self) -> list[NodeType]: """Get a list of the nodes currently contained in the graph.""" return self._nodes @nodes.setter - def nodes(self, nodes): + def nodes(self, nodes: ty.Iterable[NodeType]) -> None: if nodes: - nodes = ensure_list(nodes) - if len(set(nodes)) != len(nodes): - raise Exception("nodes have repeated elements") + if duplicate_names := [ + n + for n, c in Counter(nd.name for nd in ensure_list(nodes)).items() + if c > 1 + ]: + raise ValueError( + f"Duplicate node names found in graph: {duplicate_names}" + ) self._nodes = nodes + def node(self, name: str) -> NodeType: + """Get a node by its name, caching the lookup directory""" + try: + return self._node_lookup[name] + except KeyError: + self._node_lookup = self.nodes_names_map + try: + return self._node_lookup[name] + except KeyError: + raise KeyError(f"Node {name!r} not found in graph") from None + + def __getitem__(self, key): + """Get a node by its name.""" + return self.node(key) + @property - def nodes_names_map(self): + def nodes_names_map(self) -> dict[str, NodeType]: """Get a map of node names to nodes.""" return {nd.name: nd for nd in self.nodes} @@ -257,6 +294,8 @@ def remove_nodes(self, nodes, check_ready=True): self._sorted_nodes.remove(nd) # starting from the previous sorted list, so is faster self.sorting(presorted=self.sorted_nodes) + # Reset the node lookup + self._node_lookup = {} def remove_nodes_connections(self, nodes): """ @@ -278,6 +317,8 @@ def remove_nodes_connections(self, nodes): self.successors.pop(nd.name) self.predecessors.pop(nd.name) self._node_wip.remove(nd) + # Reset the node lookup + self._node_lookup = {} def remove_previous_connections(self, nodes): """ @@ -300,6 +341,8 @@ def remove_previous_connections(self, nodes): self.successors.pop(nd.name) self.predecessors.pop(nd.name) self._node_wip.remove(nd) + # Reset the node lookup + self._node_lookup = {} def _checking_successors_nodes(self, node, remove=True): if self.successors[node.name]: @@ -309,6 +352,12 @@ def _checking_successors_nodes(self, node, remove=True): else: return True + def successors_nodes(self, node): + """Get all the nodes that follow the node""" + self._successors_all = [] + self._checking_successors_nodes(node=node, remove=False) + return set(self._successors_all) + def remove_successors_nodes(self, node): """Removing all the nodes that follow the node""" self._successors_all = [] @@ -354,11 +403,10 @@ def calculate_max_paths(self): def create_dotfile_simple(self, outdir, name="graph"): """creates a simple dotfile (no nested structure)""" - from .core import is_workflow dotstr = "digraph G {\n" for 
nd in self.nodes: - if is_workflow(nd): + if is_workflow(getattr(nd, "_task", None)): if nd.state: # adding color for wf with a state dotstr += f"{nd.name} [shape=box, color=blue]\n" @@ -393,27 +441,29 @@ def create_dotfile_detailed(self, outdir, name="graph_det"): if not self._nodes_details: raise Exception("node_details is empty, detailed dotfile can't be created") for nd_nm, nd_det in self.nodes_details.items(): - if nd_nm == self.name: # the main workflow itself + if nd_nm == INPUTS_NODE_NAME: # the main workflow itself # wf inputs wf_inputs_str = f'{{<{nd_det["outputs"][0]}> {nd_det["outputs"][0]}' for el in nd_det["outputs"][1:]: wf_inputs_str += f" | <{el}> {el}" wf_inputs_str += "}" - dotstr += f'struct_{nd_nm} [color=red, label="{{WORKFLOW INPUT: | {wf_inputs_str}}}"];\n' + dotstr += ( + f"struct_{self.name} [color=red, " + f'label="{{WORKFLOW INPUT: | {wf_inputs_str}}}"];\n' + ) + elif nd_nm == OUTPUTS_NODE_NAME: # wf outputs wf_outputs_str = f'{{<{nd_det["inputs"][0]}> {nd_det["inputs"][0]}' for el in nd_det["inputs"][1:]: wf_outputs_str += f" | <{el}> {el}" wf_outputs_str += "}" dotstr += ( - f"struct_{nd_nm}_out " + f"struct_{self.name}_out " f'[color=red, label="{{WORKFLOW OUTPUT: | {wf_outputs_str}}}"];\n' ) # connections to the wf outputs for con in nd_det["connections"]: - dotstr += ( - f"struct_{con[1]}:{con[2]} -> struct_{nd_nm}_out:{con[0]};\n" - ) + dotstr += f"struct_{con[1]}:{con[2]} -> struct_{self.name}_out:{con[0]};\n" else: # elements of the main workflow inputs_str = "{INPUT:" for inp in nd_det["inputs"]: @@ -429,7 +479,11 @@ def create_dotfile_detailed(self, outdir, name="graph_det"): ) # connections between elements for con in nd_det["connections"]: - dotstr += f"struct_{con[1]}:{con[2]} -> struct_{nd_nm}:{con[0]};\n" + in_conn = self.name if con[1] == INPUTS_NODE_NAME else con[1] + out_conn = self.name if con[0] == OUTPUTS_NODE_NAME else con[0] + dotstr += ( + f"struct_{in_conn}:{con[2]} -> struct_{nd_nm}:{out_conn};\n" + ) dotstr += "}" Path(outdir).mkdir(parents=True, exist_ok=True) dotfile = Path(outdir) / f"{name}.dot" @@ -447,18 +501,18 @@ def create_dotfile_nested(self, outdir, name="graph"): return dotfile def _create_dotfile_single_graph(self, nodes, edges): - from .core import is_workflow - wf_asnd = [] + wf_asnd = {} dotstr = "" for nd in nodes: - if is_workflow(nd): - wf_asnd.append(nd.name) - for task in nd.graph.nodes: - nd.create_connections(task) + if is_workflow(getattr(nd, "_task", None)): + nd_graph = nd._task.construct().graph() + wf_asnd[nd.name] = nd_graph + # for job in nd_graph.nodes: + # nd.create_connections(job) dotstr += f"subgraph cluster_{nd.name} {{\n" f"label = {nd.name} \n" dotstr += self._create_dotfile_single_graph( - nodes=nd.graph.nodes, edges=nd.graph.edges + nodes=nd_graph.nodes, edges=nd_graph.edges ) if nd.state: dotstr += "color=blue\n" @@ -480,12 +534,14 @@ def _create_dotfile_single_graph(self, nodes, edges): f"lhead=cluster_{ed[1].name}]\n" ) elif ed[0].name in wf_asnd: - tail_nd = list(ed[0].nodes)[-1].name + nd_nodes = wf_asnd[ed[0].name].nodes + tail_nd = list(nd_nodes)[-1].name dotstr_edg += ( f"{tail_nd} -> {ed[1].name} [ltail=cluster_{ed[0].name}]\n" ) elif ed[1].name in wf_asnd: - head_nd = list(ed[1].nodes)[0].name + nd_nodes = wf_asnd[ed[1].name].nodes + head_nd = list(nd_nodes)[0].name dotstr_edg += ( f"{ed[0].name} -> {head_nd} [lhead=cluster_{ed[1].name}]\n" ) diff --git a/pydra/engine/helpers.py b/pydra/engine/helpers.py deleted file mode 100644 index e6eaa012ef..0000000000 --- a/pydra/engine/helpers.py 
+++ /dev/null @@ -1,756 +0,0 @@ -"""Administrative support for the engine framework.""" - -import asyncio -import asyncio.subprocess as asp -from pathlib import Path -import os -import sys -from uuid import uuid4 -import getpass -import typing as ty -import subprocess as sp -import re -from time import strftime -from traceback import format_exception -import attr -import attrs # New defaults -from filelock import SoftFileLock, Timeout -import cloudpickle as cp -from .specs import ( - Runtime, - attr_fields, - Result, - LazyField, - File, -) -from .helpers_file import copy_nested_files -from ..utils.typing import TypeParser -from fileformats.core import FileSet -from .specs import MultiInputFile, MultiInputObj, MultiOutputObj, MultiOutputFile - - -def ensure_list(obj, tuple2list=False): - """ - Return a list whatever the input object is. - - Examples - -------- - >>> ensure_list(list("abc")) - ['a', 'b', 'c'] - >>> ensure_list("abc") - ['abc'] - >>> ensure_list(tuple("abc")) - [('a', 'b', 'c')] - >>> ensure_list(tuple("abc"), tuple2list=True) - ['a', 'b', 'c'] - >>> ensure_list(None) - [] - >>> ensure_list(5.0) - [5.0] - - """ - if obj is attr.NOTHING: - return attr.NOTHING - if obj is None: - return [] - # list or numpy.array (this might need some extra flag in case an array has to be converted) - elif isinstance(obj, list) or hasattr(obj, "__array__"): - return obj - elif tuple2list and isinstance(obj, tuple): - return list(obj) - elif isinstance(obj, LazyField): - return obj - return [obj] - - -def from_list_if_single(obj): - """Converts a list to a single item if it is of length == 1""" - if obj is attr.NOTHING: - return obj - if isinstance(obj, LazyField): - return obj - obj = list(obj) - if len(obj) == 1: - return obj[0] - return obj - - -def print_help(obj): - """Visit a task object and print its input/output interface.""" - lines = [f"Help for {obj.__class__.__name__}"] - input_klass = make_klass(obj.input_spec) - if attr.fields(input_klass): - lines += ["Input Parameters:"] - for f in attr.fields(input_klass): - default = "" - if f.default != attr.NOTHING and not f.name.startswith("_"): - default = f" (default: {f.default})" - try: - name = f.type.__name__ - except AttributeError: - name = str(f.type) - lines += [f"- {f.name}: {name}{default}"] - output_klass = make_klass(obj.output_spec) - if attr.fields(output_klass): - lines += ["Output Parameters:"] - for f in attr.fields(output_klass): - try: - name = f.type.__name__ - except AttributeError: - name = str(f.type) - lines += [f"- {f.name}: {name}"] - print("\n".join(lines)) - return lines - - -def load_result(checksum, cache_locations): - """ - Restore a result from the cache. - - Parameters - ---------- - checksum : :obj:`str` - Unique identifier of the task to be loaded. - cache_locations : :obj:`list` of :obj:`os.pathlike` - List of cache directories, in order of priority, where - the checksum will be looked for. - - """ - if not cache_locations: - return None - # TODO: if there are issues with loading, we might need to - # TODO: sleep and repeat loads (after checking that there are no lock files!) - for location in cache_locations: - if (location / checksum).exists(): - result_file = location / checksum / "_result.pklz" - if result_file.exists() and result_file.stat().st_size > 0: - return cp.loads(result_file.read_bytes()) - return None - return None - - -def save(task_path: Path, result=None, task=None, name_prefix=None): - """ - Save a :class:`~pydra.engine.core.TaskBase` object and/or results. 
- - Parameters - ---------- - task_path : :obj:`Path` - Write directory - result : :obj:`Result` - Result to pickle and write - task : :class:`~pydra.engine.core.TaskBase` - Task to pickle and write - """ - - if task is None and result is None: - raise ValueError("Nothing to be saved") - - if not isinstance(task_path, Path): - task_path = Path(task_path) - task_path.mkdir(parents=True, exist_ok=True) - if name_prefix is None: - name_prefix = "" - - lockfile = task_path.parent / (task_path.name + "_save.lock") - with SoftFileLock(lockfile): - if result: - if task_path.name.startswith("Workflow") and result.output is not None: - # copy files to the workflow directory - result = copyfile_workflow(wf_path=task_path, result=result) - with (task_path / f"{name_prefix}_result.pklz").open("wb") as fp: - cp.dump(result, fp) - if task: - with (task_path / f"{name_prefix}_task.pklz").open("wb") as fp: - cp.dump(task, fp) - - -def copyfile_workflow(wf_path: os.PathLike, result): - """if file in the wf results, the file will be copied to the workflow directory""" - for field in attr_fields(result.output): - value = getattr(result.output, field.name) - # if the field is a path or it can contain a path _copyfile_single_value is run - # to move all files and directories to the workflow directory - new_value = copy_nested_files(value, wf_path, mode=FileSet.CopyMode.hardlink) - setattr(result.output, field.name, new_value) - return result - - -def gather_runtime_info(fname): - """ - Extract runtime information from a file. - - Parameters - ---------- - fname : :obj:`os.pathlike` - The file containing runtime information - - Returns - ------- - runtime : :obj:`Runtime` - A runtime object containing the collected information. - - """ - runtime = Runtime(rss_peak_gb=None, vms_peak_gb=None, cpu_peak_percent=None) - - # Read .prof file in and set runtime values - data = [ - [float(el) for el in line.strip().split(",")] - for line in Path(fname).read_text().splitlines() - ] - if data: - runtime.rss_peak_gb = max([val[2] for val in data]) / 1024 - runtime.vms_peak_gb = max([val[3] for val in data]) / 1024 - runtime.cpu_peak_percent = max([val[1] for val in data]) - - """ - runtime.prof_dict = { - 'time': vals[:, 0].tolist(), - 'cpus': vals[:, 1].tolist(), - 'rss_GiB': (vals[:, 2] / 1024).tolist(), - 'vms_GiB': (vals[:, 3] / 1024).tolist(), - } - """ - return runtime - - -def make_klass(spec): - """ - Create a data class given a spec. 
- - Parameters - ---------- - spec : - TODO - - """ - if spec is None: - return None - fields = spec.fields - if fields: - newfields = {} - for item in fields: - if len(item) == 2: - name = item[0] - if isinstance(item[1], attr._make._CountingAttr): - newfield = item[1] - else: - newfield = attr.ib(type=item[1]) - else: - if ( - any([isinstance(ii, attr._make._CountingAttr) for ii in item]) - or len(item) > 4 - ): - raise ValueError( - "syntax not valid, you can use (name, attr), " - "(name, type, default), (name, type, default, metadata)" - "or (name, type, metadata)" - ) - kwargs = {} - if len(item) == 3: - name, tp = item[:2] - if isinstance(item[-1], dict) and "help_string" in item[-1]: - mdata = item[-1] - kwargs["metadata"] = mdata - else: - kwargs["default"] = item[-1] - elif len(item) == 4: - name, tp, dflt, mdata = item - kwargs["default"] = dflt - kwargs["metadata"] = mdata - newfield = attr.ib( - type=tp, - **kwargs, - ) - checker_label = f"'{name}' field of {spec.name}" - type_checker = TypeParser[newfield.type]( - newfield.type, label=checker_label, superclass_auto_cast=True - ) - if newfield.type in (MultiInputObj, MultiInputFile): - converter = attr.converters.pipe(ensure_list, type_checker) - elif newfield.type in (MultiOutputObj, MultiOutputFile): - converter = attr.converters.pipe(from_list_if_single, type_checker) - else: - converter = type_checker - newfield.converter = converter - newfield.on_setattr = attr.setters.convert - if "allowed_values" in newfield.metadata: - if newfield._validator is None: - newfield._validator = allowed_values_validator - elif isinstance(newfield._validator, ty.Iterable): - if allowed_values_validator not in newfield._validator: - newfield._validator.append(allowed_values_validator) - elif newfield._validator is not allowed_values_validator: - newfield._validator = [ - newfield._validator, - allowed_values_validator, - ] - newfields[name] = newfield - fields = newfields - return attrs.make_class( - spec.name, fields, bases=spec.bases, kw_only=True, on_setattr=None - ) - - -def allowed_values_validator(_, attribute, value): - """checking if the values is in allowed_values""" - allowed = attribute.metadata["allowed_values"] - if value is attr.NOTHING or isinstance(value, LazyField): - pass - elif value not in allowed: - raise ValueError( - f"value of {attribute.name} has to be from {allowed}, but {value} provided" - ) - - -async def read_stream_and_display(stream, display): - """ - Read from stream line by line until EOF, display, and capture the lines. - - See Also - -------- - This `discussion on StackOverflow - `__. - - """ - output = [] - while True: - line = await stream.readline() - if not line: - break - output.append(line) - if display is not None: - display(line) # assume it doesn't block - return b"".join(output).decode() - - -async def read_and_display_async(*cmd, hide_display=False, strip=False): - """ - Capture standard input and output of a process, displaying them as they arrive. - - Works line-by-line. 
- - """ - # start process - process = await asyncio.create_subprocess_exec( - *cmd, stdout=asp.PIPE, stderr=asp.PIPE - ) - - stdout_display = sys.stdout.buffer.write if not hide_display else None - stderr_display = sys.stderr.buffer.write if not hide_display else None - # read child's stdout/stderr concurrently (capture and display) - try: - stdout, stderr = await asyncio.gather( - read_stream_and_display(process.stdout, stdout_display), - read_stream_and_display(process.stderr, stderr_display), - ) - except Exception: - process.kill() - raise - finally: - # wait for the process to exit - rc = await process.wait() - if strip: - return rc, stdout.strip(), stderr - else: - return rc, stdout, stderr - - -def read_and_display(*cmd, strip=False, hide_display=False): - """Capture a process' standard output.""" - try: - process = sp.run(cmd, stdout=sp.PIPE, stderr=sp.PIPE) - except Exception: - # TODO editing some tracing? - raise - - if strip: - return ( - process.returncode, - process.stdout.decode("utf-8").strip(), - process.stderr.decode("utf-8"), - ) - else: - return ( - process.returncode, - process.stdout.decode("utf-8"), - process.stderr.decode("utf-8"), - ) - - -def execute(cmd, strip=False): - """ - Run the event loop with coroutine. - - Uses :func:`read_and_display_async` unless a loop is - already running, in which case :func:`read_and_display` - is used. - - Parameters - ---------- - cmd : :obj:`list` or :obj:`tuple` - The command line to be executed. - strip : :obj:`bool` - TODO - - """ - rc, stdout, stderr = read_and_display(*cmd, strip=strip) - """ - loop = get_open_loop() - if loop.is_running(): - rc, stdout, stderr = read_and_display(*cmd, strip=strip) - else: - rc, stdout, stderr = loop.run_until_complete( - read_and_display_async(*cmd, strip=strip) - ) - """ - return rc, stdout, stderr - - -def create_checksum(name, inputs): - """ - Generate a checksum name for a given combination of task name and inputs. - - Parameters - ---------- - name : :obj:`str` - Task name. - inputs : :obj:`str` - String of inputs. - - """ - return "_".join((name, inputs)) - - -def record_error(error_path, error): - """Write an error file.""" - - error_message = str(error) - - resultfile = error_path / "_result.pklz" - if not resultfile.exists(): - error_message += """\n - When creating this error file, the results file corresponding - to the task could not be found.""" - - name_checksum = str(error_path.name) - timeofcrash = strftime("%Y%m%d-%H%M%S") - try: - login_name = getpass.getuser() - except KeyError: - login_name = f"UID{os.getuid():d}" - - full_error = { - "time of crash": timeofcrash, - "login name": login_name, - "name with checksum": name_checksum, - "error message": error, - } - - with (error_path / "_error.pklz").open("wb") as fp: - cp.dump(full_error, fp) - - return error_path / "_error.pklz" - - -def get_open_loop(): - """ - Get current event loop. - - If the loop is closed, a new - loop is created and set as the current event loop. 
- - Returns - ------- - loop : :obj:`asyncio.EventLoop` - The current event loop - - """ - if os.name == "nt": - loop = asyncio.ProactorEventLoop() # for subprocess' pipes on Windows - else: - try: - loop = asyncio.get_event_loop() - # in case RuntimeError: There is no current event loop in thread 'MainThread' - except RuntimeError: - loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) - else: - if loop.is_closed(): - loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) - return loop - - -def output_from_inputfields(output_spec, input_spec): - """ - Collect values from output from input fields. - If names_only is False, the output_spec is updated, - if names_only is True only the names are returned - - Parameters - ---------- - output_spec : - TODO - input_spec : - TODO - - """ - current_output_spec_names = [f.name for f in attr.fields(make_klass(output_spec))] - new_fields = [] - for fld in attr.fields(make_klass(input_spec)): - if "output_file_template" in fld.metadata: - if "output_field_name" in fld.metadata: - field_name = fld.metadata["output_field_name"] - else: - field_name = fld.name - # not adding if the field already in the output_spec - if field_name not in current_output_spec_names: - # TODO: should probably remove some of the keys - new_fields.append( - (field_name, attr.ib(type=File, metadata=fld.metadata)) - ) - output_spec.fields += new_fields - return output_spec - - -def get_available_cpus(): - """ - Return the number of CPUs available to the current process or, if that is not - available, the total number of CPUs on the system. - - Returns - ------- - n_proc : :obj:`int` - The number of available CPUs. - """ - # Will not work on some systems or if psutil is not installed. - # See https://psutil.readthedocs.io/en/latest/#psutil.Process.cpu_affinity - try: - import psutil - - return len(psutil.Process().cpu_affinity()) - except (AttributeError, ImportError, NotImplementedError): - pass - - # Not available on all systems, including macOS. 
- # See https://docs.python.org/3/library/os.html#os.sched_getaffinity - if hasattr(os, "sched_getaffinity"): - return len(os.sched_getaffinity(0)) - - # Last resort - return os.cpu_count() - - -def load_and_run( - task_pkl, ind=None, rerun=False, submitter=None, plugin=None, **kwargs -): - """ - loading a task from a pickle file, settings proper input - and running the task - """ - try: - task = load_task(task_pkl=task_pkl, ind=ind) - except Exception: - if task_pkl.parent.exists(): - etype, eval, etr = sys.exc_info() - traceback = format_exception(etype, eval, etr) - errorfile = record_error(task_pkl.parent, error=traceback) - result = Result(output=None, runtime=None, errored=True) - save(task_pkl.parent, result=result) - raise - - resultfile = task.output_dir / "_result.pklz" - try: - task(rerun=rerun, plugin=plugin, submitter=submitter, **kwargs) - except Exception as excinfo: - # creating result and error files if missing - errorfile = task.output_dir / "_error.pklz" - if not errorfile.exists(): # not sure if this is needed - etype, eval, etr = sys.exc_info() - traceback = format_exception(etype, eval, etr) - errorfile = record_error(task.output_dir, error=traceback) - if not resultfile.exists(): # not sure if this is needed - result = Result(output=None, runtime=None, errored=True) - save(task.output_dir, result=result) - raise type(excinfo)( - str(excinfo.with_traceback(None)), - f" full crash report is here: {errorfile}", - ) - return resultfile - - -async def load_and_run_async(task_pkl, ind=None, submitter=None, rerun=False, **kwargs): - """ - loading a task from a pickle file, settings proper input - and running the workflow - """ - task = load_task(task_pkl=task_pkl, ind=ind) - await task._run(submitter=submitter, rerun=rerun, **kwargs) - - -def load_task(task_pkl, ind=None): - """loading a task from a pickle file, settings proper input for the specific ind""" - if isinstance(task_pkl, str): - task_pkl = Path(task_pkl) - task = cp.loads(task_pkl.read_bytes()) - if ind is not None: - ind_inputs = task.get_input_el(ind) - task.inputs = attr.evolve(task.inputs, **ind_inputs) - task._pre_split = True - task.state = None - # resetting uid for task - task._uid = uuid4().hex - return task - - -def position_sort(args): - """ - Sort objects by position, following Python indexing conventions. - - Ordering is positive positions, lowest to highest, followed by unspecified - positions (``None``) and negative positions, lowest to highest. - - >>> position_sort([(None, "d"), (-3, "e"), (2, "b"), (-2, "f"), (5, "c"), (1, "a")]) - ['a', 'b', 'c', 'd', 'e', 'f'] - - Parameters - ---------- - args : list of (int/None, object) tuples - - Returns - ------- - list of objects - """ - import bisect - - pos, none, neg = [], [], [] - for entry in args: - position = entry[0] - if position is None: - # Take existing order - none.append(entry[1]) - elif position < 0: - # Sort negatives while collecting - bisect.insort(neg, entry) - else: - # Sort positives while collecting - bisect.insort(pos, entry) - - return [arg for _, arg in pos] + none + [arg for _, arg in neg] - - -def argstr_formatting(argstr, inputs, value_updates=None): - """formatting argstr that have form {field_name}, - using values from inputs and updating with value_update if provided - """ - inputs_dict = attr.asdict(inputs, recurse=False) - # if there is a value that has to be updated (e.g. single value from a list) - if value_updates: - inputs_dict.update(value_updates) - # getting all fields that should be formatted, i.e. 
{field_name}, ... - inp_fields = parse_format_string(argstr) - val_dict = {} - for fld_name in inp_fields: - fld_value = inputs_dict[fld_name] - fld_attr = getattr(attrs.fields(type(inputs)), fld_name) - if fld_value is attr.NOTHING or ( - fld_value is False - and TypeParser.matches_type(fld_attr.type, ty.Union[Path, bool]) - ): - # if value is NOTHING, nothing should be added to the command - val_dict[fld_name] = "" - else: - val_dict[fld_name] = fld_value - - # formatting string based on the val_dict - argstr_formatted = argstr.format(**val_dict) - # removing extra commas and spaces after removing the field that have NOTHING - argstr_formatted = ( - argstr_formatted.replace("[ ", "[") - .replace(" ]", "]") - .replace("[,", "[") - .replace(",]", "]") - .strip() - ) - return argstr_formatted - - -class PydraFileLock: - """Wrapper for filelock's SoftFileLock that makes it work with asyncio.""" - - def __init__(self, lockfile): - self.lockfile = lockfile - self.timeout = 0.1 - - async def __aenter__(self): - lock = SoftFileLock(self.lockfile) - acquired_lock = False - while not acquired_lock: - try: - lock.acquire(timeout=0) - acquired_lock = True - except Timeout: - await asyncio.sleep(self.timeout) - if self.timeout <= 2: - self.timeout = self.timeout * 2 - self.lock = lock - return self - - async def __aexit__(self, exc_type, exc_value, traceback): - self.lock.release() - return None - - -def parse_copyfile(fld: attr.Attribute, default_collation=FileSet.CopyCollation.any): - """Gets the copy mode from the 'copyfile' value from a field attribute""" - copyfile = fld.metadata.get("copyfile", FileSet.CopyMode.any) - if isinstance(copyfile, tuple): - mode, collation = copyfile - elif isinstance(copyfile, str): - try: - mode, collation = copyfile.split(",") - except ValueError: - mode = copyfile - collation = default_collation - else: - collation = FileSet.CopyCollation[collation] - mode = FileSet.CopyMode[mode] - else: - if copyfile is True: - mode = FileSet.CopyMode.copy - elif copyfile is False: - mode = FileSet.CopyMode.link - elif copyfile is None: - mode = FileSet.CopyMode.any - else: - mode = copyfile - collation = default_collation - if not isinstance(mode, FileSet.CopyMode): - raise TypeError( - f"Unrecognised type for mode copyfile metadata of {fld}, {mode}" - ) - if not isinstance(collation, FileSet.CopyCollation): - raise TypeError( - f"Unrecognised type for collation copyfile metadata of {fld}, {collation}" - ) - return mode, collation - - -def parse_format_string(fmtstr): - """Parse a argstr format string and return all keywords used in it.""" - identifier = r"[a-zA-Z_]\w*" - attribute = rf"\.{identifier}" - item = r"\[\w+\]" - # Example: var.attr[key][0].attr2 (capture "var") - field_with_lookups = ( - f"({identifier})(?:{attribute}|{item})*" # Capture only the keyword - ) - conversion = "(?:!r|!s)" - nobrace = "[^{}]*" - # Example: 0{pads[hex]}x (capture "pads") - fmtspec = f"{nobrace}(?:{{({identifier}){nobrace}}}{nobrace})?" 
# Capture keywords in spec - full_field = f"{{{field_with_lookups}{conversion}?(?::{fmtspec})?}}" - - all_keywords = re.findall(full_field, fmtstr) - return set().union(*all_keywords) - {""} diff --git a/pydra/engine/helpers_file.py b/pydra/engine/helpers_file.py deleted file mode 100644 index f194533ac7..0000000000 --- a/pydra/engine/helpers_file.py +++ /dev/null @@ -1,472 +0,0 @@ -"""Functions ported from Nipype 1, after removing parts that were related to py2.""" - -import os -import re -import logging -from pathlib import Path -import typing as ty -from copy import copy -import subprocess as sp -from contextlib import contextmanager -import attr -from fileformats.core import FileSet - - -logger = logging.getLogger("pydra") - - -# dj: copied from misc -def is_container(item): - """ - Check if item is a container (list, tuple, dict, set). - - Parameters - ---------- - item : :obj:`object` - Input object to check. - - Returns - ------- - output : :obj:`bool` - ``True`` if container ``False`` otherwise. - - """ - if isinstance(item, str): - return False - elif hasattr(item, "__iter__"): - return True - - return False - - -def ensure_list(filename): - """Return a list given either a string or a list.""" - if isinstance(filename, (str, bytes)): - return [filename] - elif isinstance(filename, list): - return filename - elif is_container(filename): - return [x for x in filename] - - return None - - -def copy_nested_files( - value: ty.Any, - dest_dir: os.PathLike, - supported_modes: FileSet.CopyMode = FileSet.CopyMode.any, - **kwargs, -) -> ty.Any: - """Copies all "file-sets" found within the nested value (e.g. dict, list,...) into the - destination directory. If no nested file-sets are found then the original value is - returned. Note that multiple nested file-sets (e.g. a list) will to have unique names - names (i.e. not differentiated by parent directories) otherwise there will be a path - clash in the destination directory. - - Parameters - ---------- - value : Any - the value to copy files from (if required) - dest_dir : os.PathLike - the destination directory to copy the files to - **kwargs - passed directly onto FileSet.copy() - """ - from ..utils.typing import TypeParser # noqa - - cache: ty.Dict[FileSet, FileSet] = {} - - def copy_fileset(fileset: FileSet): - try: - return cache[fileset] - except KeyError: - pass - supported = supported_modes - if any(MountIndentifier.on_cifs(p) for p in fileset.fspaths): - supported -= FileSet.CopyMode.symlink - if not all( - MountIndentifier.on_same_mount(p, dest_dir) for p in fileset.fspaths - ): - supported -= FileSet.CopyMode.hardlink - copied = fileset.copy(dest_dir=dest_dir, supported_modes=supported, **kwargs) - cache[fileset] = copied - return copied - - return TypeParser.apply_to_instances(FileSet, copy_fileset, value) - - -# not sure if this might be useful for Function Task -def template_update(inputs, output_dir, state_ind=None, map_copyfiles=None): - """ - Update all templates that are present in the input spec. - - Should be run when all inputs used in the templates are already set. - - """ - - inputs_dict_st = attr.asdict(inputs, recurse=False) - if map_copyfiles is not None: - inputs_dict_st.update(map_copyfiles) - - if state_ind is not None: - for k, v in state_ind.items(): - k = k.split(".")[1] - inputs_dict_st[k] = inputs_dict_st[k][v] - - from .specs import attr_fields - - # Collect templated inputs for which all requirements are satisfied. 
- fields_templ = [ - field - for field in attr_fields(inputs) - if field.metadata.get("output_file_template") - and getattr(inputs, field.name) is not False - and all( - getattr(inputs, required_field) is not attr.NOTHING - for required_field in field.metadata.get("requires", ()) - ) - ] - - dict_mod = {} - for fld in fields_templ: - dict_mod[fld.name] = template_update_single( - field=fld, - inputs=inputs, - inputs_dict_st=inputs_dict_st, - output_dir=output_dir, - ) - # adding elements from map_copyfiles to fields with templates - if map_copyfiles: - dict_mod.update(map_copyfiles) - return dict_mod - - -def template_update_single( - field, inputs, inputs_dict_st=None, output_dir=None, spec_type="input" -): - """Update a single template from the input_spec or output_spec - based on the value from inputs_dict - (checking the types of the fields, that have "output_file_template)" - """ - # if input_dict_st with state specific value is not available, - # the dictionary will be created from inputs object - from ..utils.typing import TypeParser # noqa - from pydra.engine.specs import LazyField, OUTPUT_TEMPLATE_TYPES - - if inputs_dict_st is None: - inputs_dict_st = attr.asdict(inputs, recurse=False) - - if spec_type == "input": - inp_val_set = inputs_dict_st[field.name] - if isinstance(inp_val_set, bool) and field.type in (Path, str): - raise TypeError( - f"type of '{field.name}' is Path, consider using Union[Path, bool]" - ) - if inp_val_set is not attr.NOTHING and not isinstance(inp_val_set, LazyField): - inp_val_set = TypeParser(ty.Union[OUTPUT_TEMPLATE_TYPES])(inp_val_set) - elif spec_type == "output": - if not TypeParser.contains_type(FileSet, field.type): - raise TypeError( - f"output {field.name} should be file-system object, but {field.type} " - "set as the type" - ) - else: - raise TypeError(f"spec_type can be input or output, but {spec_type} provided") - # for inputs that the value is set (so the template is ignored) - if spec_type == "input": - if isinstance(inp_val_set, (Path, list)): - return inp_val_set - if inp_val_set is False: - # if input fld is set to False, the fld shouldn't be used (setting NOTHING) - return attr.NOTHING - # inputs_dict[field.name] is True or spec_type is output - value = _template_formatting(field, inputs, inputs_dict_st) - # changing path so it is in the output_dir - if output_dir and value is not attr.NOTHING: - # should be converted to str, it is also used for input fields that should be str - if type(value) is list: - return [str(output_dir / Path(val).name) for val in value] - else: - return str(output_dir / Path(value).name) - else: - return attr.NOTHING - - -def _template_formatting(field, inputs, inputs_dict_st): - """Formatting the field template based on the values from inputs. - Taking into account that the field with a template can be a MultiOutputFile - and the field values needed in the template can be a list - - returning a list of formatted templates in that case. - Allowing for multiple input values used in the template as longs as - there is no more than one file (i.e. 
File, PathLike or string with extensions) - """ - # if a template is a function it has to be run first with the inputs as the only arg - template = field.metadata["output_file_template"] - if callable(template): - template = template(inputs) - - # as default, we assume that keep_extension is True - if isinstance(template, (tuple, list)): - formatted = [ - _string_template_formatting(field, t, inputs, inputs_dict_st) - for t in template - ] - else: - assert isinstance(template, str) - formatted = _string_template_formatting(field, template, inputs, inputs_dict_st) - return formatted - - -def _string_template_formatting(field, template, inputs, inputs_dict_st): - from .specs import MultiInputObj, MultiOutputFile - - keep_extension = field.metadata.get("keep_extension", True) - inp_fields = re.findall(r"{\w+}", template) - inp_fields_fl = re.findall(r"{\w+:[0-9.]+f}", template) - inp_fields += [re.sub(":[0-9.]+f", "", el) for el in inp_fields_fl] - if len(inp_fields) == 0: - return template - - val_dict = {} - file_template = None - - for fld in inp_fields: - fld_name = fld[1:-1] # extracting the name form {field_name} - if fld_name not in inputs_dict_st: - raise AttributeError(f"{fld_name} is not provided in the input") - fld_value = inputs_dict_st[fld_name] - if fld_value is attr.NOTHING: - # if value is NOTHING, nothing should be added to the command - return attr.NOTHING - else: - # checking for fields that can be treated as a file: - # have type File, or value that is path like (including str with extensions) - if isinstance(fld_value, os.PathLike) or ( - isinstance(fld_value, str) and "." in fld_value - ): - if file_template: - raise Exception( - f"can't have multiple paths in {field.name} template," - f" but {template} provided" - ) - else: - file_template = (fld_name, fld_value) - else: - val_dict[fld_name] = fld_value - - # if field is MultiOutputFile and some elements from val_dict are lists, - # each element of the list should be used separately in the template - # and return a list with formatted values - if field.type is MultiOutputFile and any( - [isinstance(el, (list, MultiInputObj)) for el in val_dict.values()] - ): - # all fields that are lists - keys_list = [ - k for k, el in val_dict.items() if isinstance(el, (list, MultiInputObj)) - ] - if any( - [len(val_dict[key]) != len(val_dict[keys_list[0]]) for key in keys_list[1:]] - ): - raise Exception( - f"all fields used in {field.name} template have to have the same length" - f" or be a single value" - ) - formatted_value = [] - for ii in range(len(val_dict[keys_list[0]])): - val_dict_el = copy(val_dict) - # updating values to a single element from the list - for key in keys_list: - val_dict_el[key] = val_dict[key][ii] - - formatted_value.append( - _element_formatting( - template, val_dict_el, file_template, keep_extension=keep_extension - ) - ) - else: - formatted_value = _element_formatting( - template, val_dict, file_template, keep_extension=keep_extension - ) - return formatted_value - - -def _element_formatting(template, values_template_dict, file_template, keep_extension): - """Formatting a single template for a single element (if a list). - Taking into account that a file used in the template (file_template) - and the template itself could have file extensions - (assuming that if template has extension, the field value extension is removed, - if field has extension, and no template extension, than it is moved to the end). 
- For values_template_dict the simple formatting can be used (no file values inside) - """ - if file_template: - fld_name_file, fld_value_file = file_template - # splitting the filename for name and extension, - # the final value used for formatting depends on the template and keep_extension flag - name, *ext = Path(fld_value_file).name.split(".", maxsplit=1) - filename = str(Path(fld_value_file).parent / name) - # updating values_template_dic with the name of file - values_template_dict[fld_name_file] = filename - # if keep_extension is False, the extensions are removed - if keep_extension is False: - ext = [] - else: - ext = [] - - # if file_template is at the end of the template, the simplest formatting should work - if file_template and template.endswith(f"{{{fld_name_file}}}"): - # recreating fld_value with the updated extension - values_template_dict[fld_name_file] = ".".join([filename] + ext) - formatted_value = template.format(**values_template_dict) - # file_template provided, but the template doesn't have its own extension - elif file_template and "." not in template: - # if the fld_value_file has extension, it will be moved to the end - formatted_value = ".".join([template.format(**values_template_dict)] + ext) - # template has its own extension or no file_template provided - # the simplest formatting, if file_template is provided it's used without the extension - else: - formatted_value = template.format(**values_template_dict) - return formatted_value - - -def is_local_file(f): - from ..utils.typing import TypeParser - - return "container_path" not in f.metadata and TypeParser.contains_type( - FileSet, f.type - ) - - -class MountIndentifier: - """Used to check the mount type that given file paths reside on in order to determine - features that can be used (e.g. symlinks)""" - - @classmethod - def on_cifs(cls, path: os.PathLike) -> bool: - """ - Check whether a file path is on a CIFS filesystem mounted in a POSIX host. - - POSIX hosts are assumed to have the ``mount`` command. - - On Windows, Docker mounts host directories into containers through CIFS - shares, which has support for Minshall+French symlinks, or text files that - the CIFS driver exposes to the OS as symlinks. - We have found that under concurrent access to the filesystem, this feature - can result in failures to create or read recently-created symlinks, - leading to inconsistent behavior and ``FileNotFoundError`` errors. - - This check is written to support disabling symlinks on CIFS shares. - - NB: This function and sub-functions are copied from the nipype.utils.filemanip module - - - NB: Adapted from https://github.com/nipy/nipype - """ - return cls.get_mount(path)[1] == "cifs" - - @classmethod - def on_same_mount(cls, path1: os.PathLike, path2: os.PathLike) -> bool: - """Checks whether two or paths are on the same logical file system""" - return cls.get_mount(path1)[0] == cls.get_mount(path2)[0] - - @classmethod - def get_mount(cls, path: os.PathLike) -> ty.Tuple[Path, str]: - """Get the mount point for a given file-system path - - Parameters - ---------- - path: os.PathLike - the file-system path to identify the mount of - - Returns - ------- - mount_point: os.PathLike - the root of the mount the path sits on - fstype : str - the type of the file-system (e.g. 
ext4 or cifs)""" - try: - # Only the first match (most recent parent) counts, mount table sorted longest - # to shortest - return next( - (Path(p), t) - for p, t in cls.get_mount_table() - if str(path).startswith(p) - ) - except StopIteration: - return (Path("/"), "ext4") - - @classmethod - def generate_cifs_table(cls) -> ty.List[ty.Tuple[str, str]]: - """ - Construct a reverse-length-ordered list of mount points that fall under a CIFS mount. - - This precomputation allows efficient checking for whether a given path - would be on a CIFS filesystem. - On systems without a ``mount`` command, or with no CIFS mounts, returns an - empty list. - - """ - exit_code, output = sp.getstatusoutput("mount") - return cls.parse_mount_table(exit_code, output) - - @classmethod - def parse_mount_table( - cls, exit_code: int, output: str - ) -> ty.List[ty.Tuple[str, str]]: - """ - Parse the output of ``mount`` to produce (path, fs_type) pairs. - - Separated from _generate_cifs_table to enable testing logic with real - outputs - - """ - # Not POSIX - if exit_code != 0: - return [] - - # Linux mount example: sysfs on /sys type sysfs (rw,nosuid,nodev,noexec) - # ^^^^ ^^^^^ - # OSX mount example: /dev/disk2 on / (hfs, local, journaled) - # ^ ^^^ - pattern = re.compile(r".*? on (/.*?) (?:type |\()([^\s,\)]+)") - - # Keep line and match for error reporting (match == None on failure) - # Ignore empty lines - matches = [(ll, pattern.match(ll)) for ll in output.strip().splitlines() if ll] - - # (path, fstype) tuples, sorted by path length (longest first) - mount_info = sorted( - (match.groups() for _, match in matches if match is not None), - key=lambda x: len(x[0]), - reverse=True, - ) - cifs_paths = [path for path, fstype in mount_info if fstype.lower() == "cifs"] - - # Report failures as warnings - for line, match in matches: - if match is None: - logger.debug("Cannot parse mount line: '%s'", line) - - return [ - mount - for mount in mount_info - if any(mount[0].startswith(path) for path in cifs_paths) - ] - - @classmethod - def get_mount_table(cls) -> ty.List[ty.Tuple[str, str]]: - if cls._mount_table is None: - cls._mount_table = cls.generate_cifs_table() - return cls._mount_table - - @classmethod - @contextmanager - def patch_table(cls, mount_table: ty.List[ty.Tuple[str, str]]): - """Patch the mount table with new values. Used in test routines""" - orig_table = cls._mount_table - cls._mount_table = list(mount_table) - try: - yield - finally: - cls._mount_table = orig_table - - _mount_table: ty.Optional[ty.List[ty.Tuple[str, str]]] = None diff --git a/pydra/engine/helpers_state.py b/pydra/engine/helpers_state.py deleted file mode 100644 index 866d408a46..0000000000 --- a/pydra/engine/helpers_state.py +++ /dev/null @@ -1,653 +0,0 @@ -"""Additional functions used mostly by the State class.""" - -import attr -import itertools -from copy import deepcopy -import logging -import typing as ty -from .helpers import ensure_list - -logger = logging.getLogger("pydra") - - -class PydraStateError(Exception): - """Custom error for Pydra State""" - - def __init__(self, value): - self.value = value - - def __str__(self): - return str(self.value) - - -def splitter2rpn(splitter, other_states=None, state_fields=True): - """ - Translate user-provided splitter into *reverse polish notation*. - - The reverse polish notation is imposed by :class:`~pydra.engine.state.State`. 
- - Parameters - ---------- - splitter : - splitter (standard form) - other_states : - other states that are connected to the state - state_fields : :obj:`bool` - if False the splitter from the previous states are unwrapped - - """ - if not splitter: - return [] - output_splitter = [] - _ordering( - deepcopy(splitter), - i=0, - output_splitter=output_splitter, - other_states=deepcopy(other_states), - state_fields=state_fields, - ) - return output_splitter - - -def _ordering( - el, i, output_splitter, current_sign=None, other_states=None, state_fields=True -): - """Get a proper order of fields and signs (used by splitter2rpn).""" - if type(el) is tuple: - # checking if the splitter dont contain splitter from previous nodes - # i.e. has str "_NA", etc. - if len(el) == 1: - # treats .split(("x",)) like .split("x") - el = el[0] - _ordering(el, i, output_splitter, current_sign, other_states, state_fields) - else: - if type(el[0]) is str and el[0].startswith("_"): - node_nm = el[0][1:] - if node_nm not in other_states and state_fields: - raise PydraStateError( - "can't ask for splitter from {}, other nodes that are connected: {}".format( - node_nm, other_states.keys() - ) - ) - elif state_fields: - splitter_mod = add_name_splitter( - splitter=other_states[node_nm][0].splitter_final, name=node_nm - ) - el = (splitter_mod, el[1]) - if other_states[node_nm][0].other_states: - other_states.update(other_states[node_nm][0].other_states) - if type(el[1]) is str and el[1].startswith("_"): - node_nm = el[1][1:] - if node_nm not in other_states and state_fields: - raise PydraStateError( - "can't ask for splitter from {}, other nodes that are connected: {}".format( - node_nm, other_states.keys() - ) - ) - elif state_fields: - splitter_mod = add_name_splitter( - splitter=other_states[node_nm][0].splitter_final, name=node_nm - ) - el = (el[0], splitter_mod) - if other_states[node_nm][0].other_states: - other_states.update(other_states[node_nm][0].other_states) - _iterate_list( - el, - ".", - other_states, - output_splitter=output_splitter, - state_fields=state_fields, - ) - elif type(el) is list: - if len(el) == 1: - # treats .split(["x"]) like .split("x") - el = el[0] - _ordering(el, i, output_splitter, current_sign, other_states, state_fields) - else: - if type(el[0]) is str and el[0].startswith("_"): - node_nm = el[0][1:] - if node_nm not in other_states and state_fields: - raise PydraStateError( - "can't ask for splitter from {}, other nodes that are connected: {}".format( - node_nm, other_states.keys() - ) - ) - elif state_fields: - splitter_mod = add_name_splitter( - splitter=other_states[node_nm][0].splitter_final, name=node_nm - ) - el[0] = splitter_mod - if other_states[node_nm][0].other_states: - other_states.update(other_states[node_nm][0].other_states) - if type(el[1]) is str and el[1].startswith("_"): - node_nm = el[1][1:] - if node_nm not in other_states and state_fields: - raise PydraStateError( - "can't ask for splitter from {}, other nodes that are connected: {}".format( - node_nm, other_states.keys() - ) - ) - elif state_fields: - splitter_mod = add_name_splitter( - splitter=other_states[node_nm][0].splitter_final, name=node_nm - ) - el[1] = splitter_mod - if other_states[node_nm][0].other_states: - other_states.update(other_states[node_nm][0].other_states) - _iterate_list( - el, - "*", - other_states, - output_splitter=output_splitter, - state_fields=state_fields, - ) - elif type(el) is str: - if el.startswith("_"): - node_nm = el[1:] - if node_nm not in other_states and state_fields: - 
raise PydraStateError( - "can't ask for splitter from {}, other nodes that are connected: {}".format( - node_nm, other_states.keys() - ) - ) - elif state_fields: - splitter_mod = add_name_splitter( - splitter=other_states[node_nm][0].splitter_final, name=node_nm - ) - el = splitter_mod - if other_states[node_nm][0].other_states: - other_states.update(other_states[node_nm][0].other_states) - if type(el) is str: - output_splitter.append(el) - elif type(el) is tuple: - _iterate_list( - el, - ".", - other_states, - output_splitter=output_splitter, - state_fields=state_fields, - ) - elif type(el) is list: - _iterate_list( - el, - "*", - other_states, - output_splitter=output_splitter, - state_fields=state_fields, - ) - else: - raise PydraStateError("splitter has to be a string, a tuple or a list") - if i > 0: - output_splitter.append(current_sign) - - -def _iterate_list(element, sign, other_states, output_splitter, state_fields=True): - """Iterate over list (used in the splitter2rpn to get recursion).""" - for i, el in enumerate(element): - _ordering( - deepcopy(el), - i, - current_sign=sign, - other_states=other_states, - output_splitter=output_splitter, - state_fields=state_fields, - ) - - -def converter_groups_to_input(group_for_inputs): - """ - Return fields for each axis and number of all groups. - - Requires having axes for all the input fields. - - Parameters - ---------- - group_for_inputs : - specified axes (groups) for each input - - """ - input_for_axis = {} - ngr = 0 - for inp, grs in group_for_inputs.items(): - for gr in ensure_list(grs): - if gr in input_for_axis.keys(): - input_for_axis[gr].append(inp) - else: - ngr += 1 - input_for_axis[gr] = [inp] - return input_for_axis, ngr - - -def remove_inp_from_splitter_rpn(splitter_rpn, inputs_to_remove): - """ - Remove inputs due to combining. - - Mutates a splitter. - - Parameters - ---------- - splitter_rpn : - The splitter in reverse polish notation - inputs_to_remove : - input names that should be removed from the splitter - - """ - splitter_rpn_copy = splitter_rpn.copy() - # reverting order - splitter_rpn_copy.reverse() - stack_inp = [] - stack_sgn = [] - from_last_sign = [] - for ii, el in enumerate(splitter_rpn_copy): - # element is a sign - if el == "." or el == "*": - stack_sgn.append((ii, el)) - from_last_sign.append(0) - # it's an input but not to remove - elif el not in inputs_to_remove: - if from_last_sign: - from_last_sign[-1] += 1 - stack_inp.append((ii, el)) - # it'a an input that should be removed - else: - if not from_last_sign: - pass - elif from_last_sign[-1] <= 1: - stack_sgn.pop() - from_last_sign.pop() - else: - stack_sgn.pop(-1 * from_last_sign.pop()) - - # creating the final splitter_rpn after combining - remaining_elements = stack_sgn + stack_inp - remaining_elements.sort(reverse=True) - splitter_rpn_combined = [el for (i, el) in remaining_elements] - return splitter_rpn_combined - - -def rpn2splitter(splitter_rpn): - """ - Convert from splitter_rpn to splitter. - - Recurrent algorithm to perform the conversion. - Every time combines pairs of input in one input, - ends when the length is one. 
- - Parameters - ---------- - splitter_rpn : - splitter in reverse polish notation - - Returns - ------- - splitter : - splitter in the standard/original form - - """ - if splitter_rpn == []: - return None - if len(splitter_rpn) == 1: - return splitter_rpn[0] - - splitter_rpn_copy = splitter_rpn.copy() - signs = [".", "*"] - splitter_modified = [] - - while splitter_rpn_copy: - el = splitter_rpn_copy.pop() - # element is a sign - if el in signs: - if ( - splitter_rpn_copy[-1] not in signs - and splitter_rpn_copy[-2] not in signs - ): - right, left = splitter_rpn_copy.pop(), splitter_rpn_copy.pop() - if el == ".": - splitter_modified.append((left, right)) - elif el == "*": - splitter_modified.append([left, right]) - else: - splitter_modified.append(el) - else: - splitter_modified.append(el) - - # reversing the list and combining more - splitter_modified.reverse() - return rpn2splitter(splitter_modified) - - -def add_name_combiner(combiner, name): - """adding a node's name to each field from the combiner""" - combiner_changed = [] - for comb in combiner: - if "." not in comb: - combiner_changed.append(f"{name}.{comb}") - else: - combiner_changed.append(comb) - return combiner_changed - - -def add_name_splitter( - splitter: ty.Union[str, ty.List[str], ty.Tuple[str, ...], None], name: str -) -> ty.Optional[ty.List[str]]: - """adding a node's name to each field from the splitter""" - if isinstance(splitter, str): - return _add_name([splitter], name)[0] - elif isinstance(splitter, list): - return _add_name(list(splitter), name) - elif isinstance(splitter, tuple): - return tuple(_add_name(list(splitter), name)) - else: - return None - - -def _add_name(mlist, name): - """adding anem to each element from the list""" - for i, elem in enumerate(mlist): - if isinstance(elem, str): - if "." in elem or elem.startswith("_"): - pass - else: - mlist[i] = f"{name}.{mlist[i]}" - elif isinstance(elem, list): - mlist[i] = _add_name(elem, name) - elif isinstance(elem, tuple): - mlist[i] = list(elem) - mlist[i] = _add_name(mlist[i], name) - mlist[i] = tuple(mlist[i]) - return mlist - - -def flatten(vals, cur_depth=0, max_depth=None): - """Flatten a list of values.""" - if max_depth is None: - max_depth = len(list(input_shape(vals))) - values = [] - if cur_depth >= max_depth: - values.append([vals]) - else: - for val in vals: - if isinstance(val, (list, tuple)): - values.append(flatten(val, cur_depth + 1, max_depth)) - else: - values.append([val]) - return itertools.chain.from_iterable(values) - - -def iter_splits(iterable, keys): - """Generate splits.""" - for iter in list(iterable): - yield dict(zip(keys, list(flatten(iter, max_depth=1000)))) - - -def input_shape(inp, cont_dim=1): - """Get input shape, depends on the container dimension, if not specify it is assumed to be 1""" - # TODO: have to be changed for inner splitter (sometimes different length) - cont_dim -= 1 - shape = [len(inp)] - last_shape = None - for value in inp: - if isinstance(value, list) and cont_dim > 0: - cur_shape = input_shape(value, cont_dim) - if last_shape is None: - last_shape = cur_shape - elif last_shape != cur_shape: - last_shape = None - break - else: - last_shape = None - break - if last_shape is not None: - shape.extend(last_shape) - return tuple(shape) - - -def splits_groups(splitter_rpn, combiner=None, inner_inputs=None): - """splits inputs to groups (axes) and creates stacks for these groups - This is used to specify which input can be combined. 
- """ - if not splitter_rpn: - return [], {}, [], [] - stack = [] - keys = [] - groups = {} - group_count = None - if not combiner: - combiner = [] - if inner_inputs: - previous_states_ind = { - f"_{v.name}": v.keys_final for v in inner_inputs.values() - } - inner_inputs = {k: v for k, v in inner_inputs.items() if k in splitter_rpn} - else: - previous_states_ind = {} - inner_inputs = {} - - # when splitter is a single element (no operators) - if len(splitter_rpn) == 1: - op_single = splitter_rpn[0] - return _single_op_splits_groups(op_single, combiner, inner_inputs, groups) - - # len(splitter_rpn) > 1 - # iterating splitter_rpn - for token in splitter_rpn: - if token in [".", "*"]: - terms = {} - terms["R"] = stack.pop() - terms["L"] = stack.pop() - - # checking if opL/R are strings - trm_str = {"L": False, "R": False} - oldgroups = {} - - for lr in ["L", "R"]: - if isinstance(terms[lr], str): - trm_str[lr] = True - else: - oldgroups[lr] = terms[lr] - - if token == ".": - if all(trm_str.values()): - if group_count is None: - group_count = 0 - else: - group_count += 1 - oldgroup = groups[terms["L"]] = groups[terms["R"]] = group_count - elif trm_str["R"]: - groups[terms["R"]] = oldgroups["L"] - oldgroup = oldgroups["L"] - elif trm_str["L"]: - groups[terms["L"]] = oldgroups["R"] - oldgroup = oldgroups["R"] - else: - if len(ensure_list(oldgroups["L"])) != len( - ensure_list(oldgroups["R"]) - ): - raise ValueError( - "Operands do not have same shape " - "(left one is {}d and right one is {}d.".format( - len(ensure_list(oldgroups["L"])), - len(ensure_list(oldgroups["R"])), - ) - ) - oldgroup = oldgroups["L"] - # dj: changing axes for Right part of the scalar op. - for k, v in groups.items(): - if v in ensure_list(oldgroups["R"]): - groups[k] = ensure_list(oldgroups["L"])[ - ensure_list(oldgroups["R"]).index(v) - ] - else: # if token == "*": - if all(trm_str.values()): - if group_count is None: - group_count = 0 - else: - group_count += 1 - groups[terms["L"]] = group_count - group_count += 1 - groups[terms["R"]] = group_count - oldgroup = [groups[terms["L"]], groups[terms["R"]]] - elif trm_str["R"]: - group_count += 1 - groups[terms["R"]] = group_count - oldgroup = ensure_list(oldgroups["L"]) + [groups[terms["R"]]] - elif trm_str["L"]: - group_count += 1 - groups[terms["L"]] = group_count - oldgroup = [groups[terms["L"]]] + ensure_list(oldgroups["R"]) - else: - oldgroup = ensure_list(oldgroups["L"]) + ensure_list(oldgroups["R"]) - - # creating list of keys - if trm_str["L"]: - if terms["L"].startswith("_"): - keys = previous_states_ind[terms["L"]] + keys - else: - keys.insert(0, terms["L"]) - if trm_str["R"]: - if terms["R"].startswith("_"): - keys += previous_states_ind[terms["R"]] - else: - keys.append(terms["R"]) - - pushgroup = oldgroup - stack.append(pushgroup) - - else: # name of one of the inputs - stack.append(token) - - groups_stack = stack.pop() - if isinstance(groups_stack, int): - groups_stack = [groups_stack] - if inner_inputs: - groups_stack = [[], groups_stack] - else: - groups_stack = [groups_stack] - - if combiner: - ( - keys_final, - groups_final, - groups_stack_final, - combiner_all, - ) = combine_final_groups(combiner, groups, groups_stack, keys) - return keys_final, groups_final, groups_stack_final, combiner_all - else: - return keys, groups, groups_stack, [] - - -def _single_op_splits_groups(op_single, combiner, inner_inputs, groups): - """splits_groups function if splitter is a singleton""" - if op_single in inner_inputs: - # TODO: have to be changed if differ length - # 
TODO: i think I don't want to add here from left part - # keys = inner_inputs[op_single].keys_final + [op_single] - keys = [op_single] - groups[op_single], groups_stack = 0, [[], [0]] - else: - keys = [op_single] - groups[op_single], groups_stack = 0, [[0]] - if combiner: - if combiner == [op_single]: - return [], {}, [], combiner - else: - # TODO: probably not needed, should be already check by st.combiner_validation - raise PydraStateError( - f"all fields from the combiner have to be in splitter_rpn: {[op_single]}, " - f"but combiner: {combiner} is set" - ) - else: - return keys, groups, groups_stack, [] - - -def combine_final_groups(combiner, groups, groups_stack, keys): - """Combine the final groups.""" - input_for_groups, _ = converter_groups_to_input(groups) - combiner_all = [] - for comb in combiner: - for gr in ensure_list(groups[comb]): - combiner_all += input_for_groups[gr] - combiner_all = list(set(combiner_all)) - combiner_all.sort() - - # groups that were removed (so not trying to remove twice) - grs_removed = [] - groups_stack_final = deepcopy(groups_stack) - for comb in combiner: - grs = groups[comb] - for gr in ensure_list(grs): - if gr in groups_stack_final[-1]: - grs_removed.append(gr) - groups_stack_final[-1].remove(gr) - elif gr in grs_removed: - pass - else: - raise PydraStateError( - "input {} not ready to combine, you have to combine {} " - "first".format(comb, groups_stack[-1]) - ) - groups_final = {inp: gr for (inp, gr) in groups.items() if inp not in combiner_all} - gr_final = set() - for el in groups_final.values(): - gr_final.update(ensure_list(el)) - gr_final = list(gr_final) - map_gr_nr = {nr: i for (i, nr) in enumerate(sorted(gr_final))} - groups_final_map = {} - for inp, gr in groups_final.items(): - if isinstance(gr, int): - groups_final_map[inp] = map_gr_nr[gr] - elif isinstance(gr, list): - groups_final_map[inp] = [map_gr_nr[el] for el in gr] - else: - raise Exception("gr should be an int or a list, something wrong") - for i, groups_l in enumerate(groups_stack_final): - groups_stack_final[i] = [map_gr_nr[gr] for gr in groups_l] - - keys_final = [key for key in keys if key not in combiner_all] - # TODO: not sure if I have to calculate and return keys, groups, groups_stack - return keys_final, groups_final_map, groups_stack_final, combiner_all - - -def map_splits(split_iter, inputs, cont_dim=None): - """generate a dictionary of inputs prescribed by the splitter.""" - if cont_dim is None: - cont_dim = {} - for split in split_iter: - yield { - k: list(flatten(ensure_list(inputs[k]), max_depth=cont_dim.get(k, None)))[v] - for k, v in split.items() - } - - -def inputs_types_to_dict(name, inputs): - """Convert type.Inputs to dictionary.""" - # dj: any better option? - input_names = [ - field for field in attr.asdict(inputs, recurse=False) if field != "_func" - ] - inputs_dict = {} - for field in input_names: - inputs_dict[f"{name}.{field}"] = getattr(inputs, field) - return inputs_dict - - -def unwrap_splitter( - splitter: ty.Union[str, ty.List[str], ty.Tuple[str, ...]] -) -> ty.Iterable[str]: - """Unwraps a splitter into a flat list of fields that are split over, i.e. - [("a", "b"), "c"] -> ["a", "b", "c"] - - Parameters - ---------- - splitter: str or list[str] or tuple[str, ...] 
- the splitter spec to unwrap - - Returns - ------- - unwrapped : ty.Iterable[str] - the field names listed in the splitter - """ - if isinstance(splitter, str): - return [splitter] - else: - return itertools.chain(*(unwrap_splitter(s) for s in splitter)) diff --git a/pydra/engine/hooks.py b/pydra/engine/hooks.py new file mode 100644 index 0000000000..885079fdd4 --- /dev/null +++ b/pydra/engine/hooks.py @@ -0,0 +1,29 @@ +import typing as ty +import attrs +from attrs.converters import default_if_none + + +def donothing(*args: ty.Any, **kwargs: ty.Any) -> None: + return None + + +@attrs.define(kw_only=True) +class TaskHooks: + """Callable job hooks.""" + + pre_run_task: ty.Callable = attrs.field( + default=donothing, converter=default_if_none(donothing) + ) + post_run_task: ty.Callable = attrs.field( + default=donothing, converter=default_if_none(donothing) + ) + pre_run: ty.Callable = attrs.field( + default=donothing, converter=default_if_none(donothing) + ) + post_run: ty.Callable = attrs.field( + default=donothing, converter=default_if_none(donothing) + ) + + def reset(self): + for val in ["pre_run_task", "post_run_task", "pre_run", "post_run"]: + setattr(self, val, donothing) diff --git a/pydra/engine/job.py b/pydra/engine/job.py new file mode 100644 index 0000000000..9317b4ff1f --- /dev/null +++ b/pydra/engine/job.py @@ -0,0 +1,642 @@ +"""Basic processing graph elements.""" + +import json +import logging +import os +import inspect +import sys +import asyncio +from pathlib import Path +import typing as ty +from uuid import uuid4 +import shutil +from traceback import format_exception +import attr +import cloudpickle as cp +from pydra.compose.base import Task +from pydra.utils.hash import hash_function +from filelock import SoftFileLock, Timeout +from datetime import datetime +from fileformats.core import FileSet +from pydra.engine.hooks import TaskHooks +from pydra.engine.result import ( + RuntimeSpec, + Result, + record_error, +) +from pydra.utils.general import ( + attrs_values, + attrs_fields, + task_fields, + ensure_list, + is_workflow, +) +from pydra.utils.typing import is_lazy +from pydra.engine.result import load_result, save +from pydra.utils.typing import copy_nested_files +from pydra.compose.shell.templating import template_update +from pydra.utils.messenger import AuditFlag +from pydra.environments.base import Environment + +logger = logging.getLogger("pydra") + +develop = False + +if ty.TYPE_CHECKING: + from pydra.engine.submitter import Submitter + from pydra.compose.base import Arg + +TaskType = ty.TypeVar("TaskType", bound=Task) + + +class Job(ty.Generic[TaskType]): + """ + A base structure for the nodes in the processing graph. + + Tasks are a generic compute step from which both elementary tasks and + :class:`Workflow` instances inherit. 
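+
+    A ``Job`` couples a resolved task with the submitter, environment and cache
+    settings needed to execute it, and handles locking, result caching and
+    auditing around the actual run.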
+ + """ + + _api_version: str = "0.0.1" # Should generally not be touched by subclasses + _etelemetry_version_data = None # class variable to store etelemetry information + _version: str # Version of tool being wrapped + _task_version: ty.Optional[str] = None + # Job writers encouraged to define and increment when implementation changes sufficiently + _input_sets = None # Dictionaries of predefined input settings + + audit_flags: AuditFlag = AuditFlag.NONE + """What to audit -- available flags: :class:`~pydra.utils.messenger.AuditFlag`.""" + + _can_resume = False # Does the job allow resuming from previous state + _redirect_x = False # Whether an X session should be created/directed + + _runtime_requirements = RuntimeSpec() + _runtime_hints = None + + _cache_root = None # Working directory in which to operate + _references = None # List of references for a job + + name: str + task: TaskType + submitter: "Submitter | None" + environment: "Environment | None" + state_index: int + bindings: dict[str, ty.Any] | None = None # Bindings for the job environment + + _inputs: dict[str, ty.Any] | None = None + _run_start_time: datetime | None + + def __init__( + self, + task: TaskType, + submitter: "Submitter", + name: str, + environment: "Environment | None" = None, + state_index: int | None = None, + hooks: TaskHooks | None = None, + ): + """ + Initialize a job. + + Jobs allow for caching (retrieving a previous result of the same + task and inputs), and concurrent execution. + Running tasks follows a decision flow: + + 1. Check whether prior cache exists -- + if ``True``, return cached result + 2. Check whether other process is running this job -- + wait if ``True``: + a. Finishes (with or without exception) -> return result + b. Gets killed -> restart + 3. No cache or other process -> start + 4. 
Two or more concurrent new processes get to start + """ + + if not isinstance(task, Task): + raise ValueError(f"Job task ({task!r}) must be a Task, not {type(task)}") + # Check that the task is fully resolved and ready to run + task._check_resolved() + task._check_rules() + self.task = task + # We save the submitter is the task is a workflow otherwise we don't + # so the job can be pickled + self.submitter = submitter + self.environment = ( + environment if environment is not None else submitter.environment + ) + self.name = name + self.state_index = state_index + + self.return_values = {} + self._result = {} + # flag that says if node finished all jobs + self._done = False + if self._input_sets is None: + self._input_sets = {} + + self.allow_cache_override = True + self._checksum = None + self._uid = uuid4().hex + self.hooks = hooks if hooks is not None else TaskHooks() + self._errored = False + self._lzout = None + + # Save the submitter attributes needed to run the job later + self.audit = submitter.audit + self.cache_root = submitter.cache_root + self.all_caches = submitter.readonly_caches + self._run_start_time = None + + @property + def cache_root(self): + return self._cache_root + + @property + def is_async(self) -> bool: + """Check to see if the job should be run asynchronously.""" + return self.submitter.worker.is_async and is_workflow(self.task) + + @cache_root.setter + def cache_root(self, path: os.PathLike): + self._cache_root = Path(path) + + @property + def all_caches(self): + """Get the list of cache sources.""" + return ensure_list(self.cache_root) + self._readonly_caches + + @all_caches.setter + def all_caches(self, locations): + if locations is not None: + self._readonly_caches = [Path(loc) for loc in ensure_list(locations)] + else: + self._readonly_caches = [] + + def __str__(self): + return self.name + + def __getstate__(self): + state = self.__dict__.copy() + state["task"] = cp.dumps(state["task"]) + return state + + def __setstate__(self, state): + state["task"] = cp.loads(state["task"]) + self.__dict__.update(state) + + @property + def errored(self): + """Check if the job has raised an error""" + return self._errored + + @property + def checksum(self): + """Calculates the unique checksum of the job. + Used to create specific directory name for job that are run; + and to create nodes checksums needed for graph checksums + (before the tasks have inputs etc.) + """ + if self._checksum is not None: + return self._checksum + self._checksum = self.task._checksum + return self._checksum + + @property + def lockfile(self): + return self.cache_dir.with_suffix(".lock") + + @property + def uid(self): + """the unique id number for the job + It will be used to create unique names for slurm scripts etc. 
+ without a need to run checksum + """ + return self._uid + + @property + def output_names(self): + """Get the names of the outputs from the job's output_spec""" + return [f.name for f in attr.fields(self.task.Outputs)] + + @property + def can_resume(self): + """Whether the job accepts checkpoint-restart.""" + return self._can_resume + + @property + def cache_dir(self): + """Get the filesystem path where outputs will be written.""" + return self.cache_root / self.checksum + + @property + def inputs(self) -> dict[str, ty.Any]: + """Resolve any template inputs of the job ahead of its execution: + + - links/copies upstream files and directories into the destination tasks + working directory as required select state array values corresponding to + state index (it will try to leave them where they are unless specified or + they are on different file systems) + - resolve template values (e.g. output_file_template) + - deepcopy all inputs to guard against in-place changes during the job's + execution (they will be replaced after the job's execution with the + original inputs to ensure the tasks checksums are consistent) + """ + if self._inputs is not None: + return self._inputs + + from pydra.utils.typing import TypeParser + + self._inputs = { + k: v for k, v in attrs_values(self.task).items() if not k.startswith("_") + } + map_copyfiles = {} + fld: "Arg" + for fld in task_fields(self.task): + name = fld.name + value = self._inputs[name] + if value and TypeParser.contains_type(FileSet, fld.type): + copied_value = copy_nested_files( + value=value, + dest_dir=self.cache_dir, + mode=fld.copy_mode, + collation=fld.copy_collation, + supported_modes=self.SUPPORTED_COPY_MODES, + ) + if value is not copied_value: + map_copyfiles[name] = copied_value + self._inputs.update( + template_update( + self.task, cache_dir=self.cache_dir, map_copyfiles=map_copyfiles + ) + ) + return self._inputs + + def _populate_filesystem(self): + """ + Invoked immediately after the lockfile is generated, this function: + - Creates the cache file + - Clears existing outputs if `can_resume` is False + - Generates a fresh output directory + + Created as an attempt to simplify overlapping `Job`|`Workflow` behaviors. + """ + # adding info file with the checksum in case the job was cancelled + # and the lockfile has to be removed + with open(self.cache_root / f"{self.uid}_info.json", "w") as jsonfile: + json.dump({"checksum": self.checksum}, jsonfile) + if not self.can_resume and self.cache_dir.exists(): + shutil.rmtree(self.cache_dir) + self.cache_dir.mkdir(parents=False, exist_ok=self.can_resume) + # Save job pkl into the output directory for future reference + save(self.cache_dir, job=self) + + def run(self, rerun: bool = False): + """Prepare the job working directory, execute the task, and save the + results. + + Parameters + ---------- + rerun : bool + If True, the job will be re-run even if a result already exists. Will + propagated to all tasks within workflow tasks. 
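+
+        Notes
+        -----
+        The job directory's lockfile is held via ``SoftFileLock`` for the duration
+        of the run; if ``rerun`` is False and a previous, non-errored result is
+        found in the cache, that result is returned without re-executing the task.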
+ """ + # TODO: After these changes have been merged, will refactor this function and + # run_async to use common helper methods for pre/post run tasks + + # checking if the task is fully resolved and ready to run + self.hooks.pre_run(self) + logger.debug( + "'%s' is attempting to acquire lock on %s", self.name, self.lockfile + ) + with SoftFileLock(self.lockfile): + if not (rerun): + result = self.result() + if result is not None and not result.errored: + return result + cwd = os.getcwd() + self._populate_filesystem() + os.chdir(self.cache_dir) + result = Result( + outputs=None, + runtime=None, + errored=False, + cache_dir=self.cache_dir, + task=self.task, + ) + self.hooks.pre_run_task(self) + self.audit.start_audit(odir=self.cache_dir) + if self.audit.audit_check(AuditFlag.PROV): + self.audit.audit_task(job=self) + try: + self.audit.monitor() + self.task._run(self, rerun) + result.outputs = self.task.Outputs._from_task(self) + except Exception: + etype, eval, etr = sys.exc_info() + traceback = format_exception(etype, eval, etr) + record_error(self.cache_dir, error=traceback) + result.errored = True + raise + finally: + self.hooks.post_run_task(self, result) + self.audit.finalize_audit(result=result) + save(self.cache_dir, result=result, job=self) + # removing the additional file with the checksum + (self.cache_root / f"{self.uid}_info.json").unlink() + os.chdir(cwd) + self.hooks.post_run(self, result) + # Check for any changes to the input hashes that have occurred during the execution + # of the job + self._check_for_hash_changes() + return result + + async def run_async(self, rerun: bool = False) -> Result: + """Prepare the job working directory, execute the task asynchronously, + and save the results. NB: only workflows are run asynchronously at the moment. + + Parameters + ---------- + rerun : bool + If True, the job will be re-run even if a result already exists. Will + propagated to all tasks within workflow tasks. 
+ """ + # checking if the task is fully resolved and ready to run + self.hooks.pre_run(self) + logger.debug( + "'%s' is attempting to acquire lock on %s", self.name, self.lockfile + ) + async with PydraFileLock(self.lockfile): + if not rerun: + result = self.result() + if result is not None and not result.errored: + return result + cwd = os.getcwd() + self._populate_filesystem() + result = Result( + outputs=None, + runtime=None, + errored=False, + cache_dir=self.cache_dir, + task=self.task, + ) + self.hooks.pre_run_task(self) + self.audit.start_audit(odir=self.cache_dir) + try: + self.audit.monitor() + await self.task._run_async(self, rerun) + result.outputs = self.task.Outputs._from_task(self) + except Exception: + etype, eval, etr = sys.exc_info() + traceback = format_exception(etype, eval, etr) + record_error(self.cache_dir, error=traceback) + result.errored = True + self._errored = True + raise + finally: + self.hooks.post_run_task(self, result) + self.audit.finalize_audit(result=result) + save(self.cache_dir, result=result, job=self) + # removing the additional file with the checksum + (self.cache_root / f"{self.uid}_info.json").unlink() + os.chdir(cwd) + self.hooks.post_run(self, result) + # Check for any changes to the input hashes that have occurred during the execution + # of the job + self._check_for_hash_changes() + return result + + def pickle_task(self): + """Pickling the tasks with full inputs""" + pkl_files = self.cache_root / "pkl_files" + pkl_files.mkdir(exist_ok=True, parents=True) + task_main_path = pkl_files / f"{self.name}_{self.uid}_job.pklz" + save(task_path=pkl_files, job=self, name_prefix=f"{self.name}_{self.uid}") + return task_main_path + + @property + def done(self): + """Check whether the tasks has been finalized and all outputs are stored.""" + # if any of the field is lazy, there is no need to check results + if has_lazy(self.task): + return False + _result = self.result() + if _result: + if _result.errored: + self._errored = True + raise ValueError(f"Job {self.name!r} failed") + else: + return True + return False + + @property + def run_start_time(self) -> datetime | None: + """Check whether the job is currently running.""" + if self._run_start_time is not None: + return self._run_start_time + try: + stat = self.lockfile.stat() + except FileNotFoundError: + return None + self._run_start_time = datetime.fromtimestamp(stat.st_ctime) + return self._run_start_time + + def _combined_output(self, return_inputs=False): + combined_results = [] + for gr, ind_l in self.state.final_combined_ind_mapping.items(): + combined_results_gr = [] + for ind in ind_l: + result = load_result(self.checksum_states(ind), self.all_caches) + if result is None: + return None + if return_inputs is True or return_inputs == "val": + result = (self.state.states_val[ind], result) + elif return_inputs is True or return_inputs == "ind": + result = (self.state.states_ind[ind], result) + combined_results_gr.append(result) + combined_results.append(combined_results_gr) + if len(combined_results) == 1 and self.state.splitter_rpn_final == []: + # in case it's full combiner, removing the nested structure + return combined_results[0] + else: + return combined_results + + def result(self, return_inputs=False): + """ + Retrieve the outcomes of this particular job. 
+ + Parameters + ---------- + state_index : :obj: `int` + index of the element for job with splitter and multiple states + return_inputs : :obj: `bool`, :obj:`str` + if True or "val" result is returned together with values of the input fields, + if "ind" result is returned together with indices of the input fields + + Returns + ------- + result : Result + the result of the job + """ + if self.errored: + return Result( + outputs=None, + runtime=None, + errored=True, + cache_dir=self.cache_dir, + task=self.task, + ) + + checksum = self.checksum + result = load_result(checksum, self.all_caches) + if result and result.errored: + self._errored = True + if return_inputs is True or return_inputs == "val": + inputs_val = { + f"{self.name}.{inp}": getattr(self.task, inp) + for inp in self.input_names + } + return (inputs_val, result) + elif return_inputs == "ind": + inputs_ind = {f"{self.name}.{inp}": None for inp in self.input_names} + return (inputs_ind, result) + else: + return result + + def _check_for_hash_changes(self): + hash_changes = self.task._hash_changes() + details = "" + for changed in hash_changes: + field = getattr(attr.fields(type(self.task)), changed) + hash_function(getattr(self.task, changed)) + val = getattr(self.task, changed) + field_type = type(val) + if inspect.isclass(field.type) and issubclass(field.type, FileSet): + details += ( + f"- {changed}: value passed to the {field.type} field is of type " + f"{field_type} ('{val}'). If it is intended to contain output data " + "then the type of the field in the interface class should be changed " + "to `pathlib.Path`. Otherwise, if the field is intended to be an " + "input field but it gets altered by the job in some way, then the " + "'copyfile' flag should be set to 'copy' in the field metadata of " + "the job interface class so copies of the files/directories in it " + "are passed to the job instead.\n" + ) + else: + details += ( + f"- {changed}: the {field_type} object passed to the {field.type}" + f"field appears to have an unstable hash. This could be due to " + "a stochastic/non-thread-safe attribute(s) of the object\n\n" + f'A "bytes_repr" method for {field.type!r} can be implemented to ' + "bespoke hashing methods based only on the stable attributes for " + f"the `{field_type.__module__}.{field_type.__name__}` type. " + f"See pydra/utils/hash.py for examples. 
Value: {val}\n" + ) + if hash_changes: + raise RuntimeError( + f"Input field hashes have changed during the execution of the " + f"'{self.name}' job of {type(self)} type.\n\n{details}" + ) + logger.debug( + "Input values and hashes for '%s' %s node:\n%s\n%s", + self.name, + type(self).__name__, + self.task, + self.task._hashes, + ) + + def _write_notebook(self): + """Writes a notebook into the""" + raise NotImplementedError + + SUPPORTED_COPY_MODES = FileSet.CopyMode.any + DEFAULT_COPY_COLLATION = FileSet.CopyCollation.any + + +def has_lazy(obj): + """Check whether an object has lazy fields.""" + for f in attrs_fields(obj): + if is_lazy(getattr(obj, f.name)): + return True + return False + + +class PydraFileLock: + """Wrapper for filelock's SoftFileLock that makes it work with asyncio.""" + + def __init__(self, lockfile): + self.lockfile = lockfile + self.timeout = 0.1 + + async def __aenter__(self): + lock = SoftFileLock(self.lockfile) + acquired_lock = False + while not acquired_lock: + try: + lock.acquire(timeout=0) + acquired_lock = True + except Timeout: + await asyncio.sleep(self.timeout) + if self.timeout <= 2: + self.timeout = self.timeout * 2 + self.lock = lock + return self + + async def __aexit__(self, exc_type, exc_value, traceback): + self.lock.release() + return None + + +def load_and_run(job_pkl: Path, rerun: bool = False) -> Path: + """ + loading a job from a pickle file, settings proper input + and running the job + + Parameters + ---------- + job_pkl : :obj:`Path` + The path to pickled job file + + Returns + ------- + resultfile : :obj:`Path` + The path to the pickled result file + """ + + try: + job: Job[TaskType] = load_job(job_pkl=job_pkl) + except Exception: + if job_pkl.parent.exists(): + etype, eval, etr = sys.exc_info() + traceback = format_exception(etype, eval, etr) + errorfile = record_error(job_pkl.parent, error=traceback) + result = Result(output=None, runtime=None, errored=True, task=None) + save(job_pkl.parent, result=result) + raise + + resultfile = job.cache_dir / "_result.pklz" + try: + if job.is_async: + job.submitter.submit(job, rerun=rerun) + else: + job.run(rerun=rerun) + except Exception as e: + # creating result and error files if missing + errorfile = job.cache_dir / "_error.pklz" + if not errorfile.exists(): # not sure if this is needed + etype, eval, etr = sys.exc_info() + traceback = format_exception(etype, eval, etr) + errorfile = record_error(job.cache_dir, error=traceback) + if not resultfile.exists(): # not sure if this is needed + result = Result(output=None, runtime=None, errored=True, task=None) + save(job.cache_dir, result=result) + e.add_note(f" full crash report is here: {errorfile}") + raise + return resultfile + + +def load_job(job_pkl: os.PathLike) -> "Job[TaskType]": + """loading a job from a pickle file, settings proper input for the specific ind""" + with open(job_pkl, "rb") as fp: + job = cp.load(fp) + return job diff --git a/pydra/engine/lazy.py b/pydra/engine/lazy.py new file mode 100644 index 0000000000..bb8780a94b --- /dev/null +++ b/pydra/engine/lazy.py @@ -0,0 +1,252 @@ +import typing as ty +import abc +import attrs +from typing import Self +from pydra.utils.typing import StateArray +from pydra.utils.hash import hash_single +from pydra.engine import node + +if ty.TYPE_CHECKING: + from pydra.engine.submitter import DiGraph, NodeExecution + from pydra.engine.job import Job + from pydra.engine.workflow import Workflow + from pydra.compose.base import Task + + +T = ty.TypeVar("T") +TaskType = ty.TypeVar("TaskType", 
bound="Task") + +TypeOrAny = ty.Union[type, ty.Any] + + +@attrs.define(kw_only=True) +class LazyField(ty.Generic[T], metaclass=abc.ABCMeta): + """Lazy fields implement promises.""" + + _field: str + _type: TypeOrAny + _cast_from: ty.Optional[ty.Type[ty.Any]] = None + _type_checked: bool = False + + def __bytes_repr__(self, cache): + yield type(self).__name__.encode() + b"(" + yield from bytes(hash_single(self.source, cache)) + yield b"field=" + self._field.encode() + yield b"type=" + bytes(hash_single(self._type, cache)) + yield b"cast_from=" + bytes(hash_single(self._cast_from, cache)) + yield b")" + + def _apply_cast(self, value): + """\"Casts\" the value from the retrieved type if a cast has been applied to + the lazy-field""" + from pydra.utils.typing import TypeParser + + if self._cast_from: + assert TypeParser.matches(value, self._cast_from) + value = self._type(value) + return value + + def _get_value( + self, + workflow: "Workflow", + graph: "DiGraph[NodeExecution]", + state_index: int | None = None, + ) -> ty.Any: + """Return the value of a lazy field. + + Parameters + ---------- + workflow: Workflow + the workflow object + graph: DiGraph[NodeExecution] + the graph representing the execution state of the workflow + state_index : int, optional + the state index of the field to access + + Returns + ------- + value : Any + the resolved value of the lazy-field + """ + raise NotImplementedError("LazyField is an abstract class") + + def split(self) -> Self: + """ "Splits" the lazy field over an array of nodes by replacing the sequence type + of the lazy field with StateArray to signify that it will be "split" across + """ + from pydra.utils.typing import ( + TypeParser, + ) # pylint: disable=import-outside-toplevel + + # Modify the type of the lazy field to include the split across a state-array + inner_type, prev_split_depth = TypeParser.strip_splits(self._type) + assert prev_split_depth <= 1 + if inner_type is ty.Any: + type_ = StateArray[ty.Any] + elif TypeParser.matches_type(inner_type, list): + item_type = TypeParser.get_item_type(inner_type) + type_ = StateArray[item_type] + else: + raise TypeError( + f"Cannot split non-sequence field {self} of type {inner_type}" + ) + if prev_split_depth: + type_ = StateArray[ + type_ + ] # FIXME: This nesting of StateArray is probably unnecessary + return attrs.evolve(self, type=type_) + + +@attrs.define(kw_only=True) +class LazyInField(LazyField[T]): + + _workflow: "Workflow" = attrs.field() + + _attr_type = "input" + + def __eq__(self, other): + return ( + isinstance(other, LazyInField) + and self._field == other._field + and self._type == other._type + ) + + def __repr__(self): + return f"{type(self).__name__}(field={self._field!r}, type={self._type})" + + @property + def _source(self): + return self._workflow + + def _get_value( + self, + workflow: "Workflow", + graph: "DiGraph[NodeExecution]", + state_index: int | None = None, + ) -> ty.Any: + """Return the value of a lazy field. 
+ + Parameters + ---------- + workflow: Workflow + the workflow object + graph: DiGraph[NodeExecution] + the graph representing the execution state of the workflow + state_index : int, optional + the state index of the field to access + + Returns + ------- + value : Any + the resolved value of the lazy-field + """ + value = workflow.inputs[self._field] + value = self._apply_cast(value) + return value + + +@attrs.define(kw_only=True) +class LazyOutField(LazyField[T]): + + _node: node.Node + _attr_type = "output" + + def __repr__(self): + return ( + f"{type(self).__name__}(node={self._node.name!r}, " + f"field={self._field!r}, type={self._type})" + ) + + def _get_value( + self, + workflow: "Workflow", + graph: "DiGraph[NodeExecution]", + state_index: int | None = None, + ) -> ty.Any: + """Return the value of a lazy field. + + Parameters + ---------- + workflow: Workflow + the workflow object + graph: DiGraph[NodeExecution] + the graph representing the execution state of the workflow + state_index : int, optional + the state index of the field to access + + Returns + ------- + value : Any + the resolved value of the lazy-field + """ + + def retrieve_from_job(job: "Job[TaskType]") -> ty.Any: + if job.errored: + raise ValueError( + f"Cannot retrieve value for {self._field!r} from {self._node.name} as " + "the node errored" + ) + res = job.result() + if res is None: + raise RuntimeError( + f"Could not find results of '{job.name}' node in a sub-directory " + f"named '{{{job.checksum}}}' in any of the cache locations.\n" + + "\n".join(str(p) for p in set(job.readonly_caches)) + + f"\n\nThis is likely due to hash changes in '{job.name}' node inputs. " + f"Current values and hashes: {job.inputs}, " + f"{job.task._hash}\n\n" + "Set loglevel to 'debug' in order to track hash changes " + "throughout the execution of the workflow.\n\n " + "These issues may have been caused by `bytes_repr()` methods " + "that don't return stable hash values for specific object " + "types across multiple processes (see bytes_repr() " + '"singledispatch "function in pydra/utils/hash.py).' + "You may need to write specific `bytes_repr()` " + "implementations (see `pydra.utils.hash.register_serializer`) or a " + "`__bytes_repr__()` dunder methods to handle one or more types in " + "your interface inputs." 
+ ) + val = res.get_output_field(self._field) + val = self._apply_cast(val) + return val + + # Get the execution node that the value is coming from + upstream_node = graph.node(self._node.name) + + if not upstream_node._tasks: # No jobs, return empty state array + return StateArray() + if not upstream_node.state: # Return the singular job + value = retrieve_from_job(upstream_node._tasks[None]) + if state_index is not None: + return value[state_index] + return value + if upstream_node.state.combiner: + + # No state remains after the combination, return all values in a list + if not upstream_node.state.ind_l_final: + return [retrieve_from_job(j) for j in upstream_node.tasks] + + # Group the values of the tasks into list before returning + def group_values(index: int) -> list: + # Get a slice of the tasks that match the given index of the state array of the + # combined values + final_index = set(upstream_node.state.states_ind_final[index].items()) + return [ + retrieve_from_job(upstream_node._tasks[i]) + for i, ind in enumerate(upstream_node.state.states_ind) + if set(ind.items()).issuperset(final_index) + ] + + if state_index is None: # return all groups if no index is given + return StateArray( + group_values(i) for i in range(len(upstream_node.state.ind_l_final)) + ) + return group_values(state_index) # select the group that matches the index + if state_index is None: # return all jobs in a state array + return StateArray(retrieve_from_job(j) for j in upstream_node.tasks) + # Select the job that matches the index + return retrieve_from_job(upstream_node._tasks[state_index]) + + @property + def _source(self): + return self._node diff --git a/pydra/engine/node.py b/pydra/engine/node.py new file mode 100644 index 0000000000..0568481422 --- /dev/null +++ b/pydra/engine/node.py @@ -0,0 +1,235 @@ +import typing as ty +from copy import deepcopy +from enum import Enum +import attrs +from pydra.engine import lazy +from pydra.utils.general import attrs_values +from pydra.utils.typing import is_lazy +from pydra.engine.state import State, add_name_splitter, add_name_combiner + +if ty.TYPE_CHECKING: + from pydra.engine.workflow import Workflow + from pydra.environments.base import Environment + from pydra.compose import base + from pydra.engine.hooks import TaskHooks + + +OutputType = ty.TypeVar("OutputType", bound="base.Outputs") +Splitter = ty.Union[str, ty.Tuple[str, ...]] + +_not_set = Enum("_not_set", "NOT_SET") + +NOT_SET = _not_set.NOT_SET + + +@attrs.define +class Node(ty.Generic[OutputType]): + """A node in a workflow + + Parameters + ---------- + name : str + The name of the node + inputs : Task + The task of the node + """ + + name: str + _task: "base.Task[OutputType]" + _environment: "Environment | None" = None + _hooks: "TaskHooks | None" = None + _workflow: "Workflow" = attrs.field(default=None, eq=False, hash=False, repr=False) + _lzout: OutputType | None = attrs.field( + init=False, default=None, eq=False, hash=False, repr=False + ) + _state: State | None = attrs.field(init=False, default=NOT_SET) + + def __attrs_post_init__(self): + self._set_state() + + class Inputs: + """A class to wrap the inputs of a node and control access to them so lazy fields + that will change the downstream state (i.e. with new splits) aren't set after + the node has been split, combined or its outputs accessed. 
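+
+        ``__setattr__`` below enforces this: when a lazy value that changes the
+        upstream states is assigned, an error is raised if any outputs have
+        already been accessed; otherwise the node's state is re-derived.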
+ """ + + _node: "Node" + + def __init__(self, node: "Node") -> None: + super().__setattr__("_node", node) + + def __getattr__(self, name: str) -> ty.Any: + return getattr(self._node._task, name) + + def __getstate__(self) -> ty.Dict[str, ty.Any]: + return {"_node": self._node} + + def __setstate__(self, state: ty.Dict[str, ty.Any]) -> None: + super().__setattr__("_node", state["_node"]) + + def __setattr__(self, name: str, value: ty.Any) -> None: + setattr(self._node._task, name, value) + if is_lazy(value): + upstream_states = self._node._get_upstream_states() + if ( + not self._node._state + or self._node._state.other_states != upstream_states + ): + self._node._check_if_outputs_have_been_used( + f"cannot set {name!r} input to {value} because it changes the " + f"state" + ) + self._set_state() + + @property + def inputs(self) -> Inputs: + return self.Inputs(self) + + @property + def input_names(self) -> list[str]: + return list(attrs_values(self._task).keys()) + + @property + def state(self): + """Initialise the state of the node just after it has been created (i.e. before + it has been split or combined) based on the upstream connections + """ + return self._state + + @property + def input_values(self) -> tuple[tuple[str, ty.Any]]: + return tuple(attrs_values(self._task).items()) + + @property + def state_values(self) -> dict[str, ty.Any]: + """Get the values of the task, scoped by the name of the node to be + used in the state + + Returns + ------- + dict[str, Any] + The values of the task + """ + return {f"{self.name}.{n}": v for n, v in attrs_values(self._task).items()} + + @property + def lzout(self) -> OutputType: + from pydra.utils.general import task_fields + + """The output task of the node populated with lazy fields""" + if self._lzout is not None: + return self._lzout + lazy_fields = {} + for field in task_fields(self.inputs.Outputs): + lazy_fields[field.name] = lazy.LazyOutField( + node=self, + field=field.name, + type=field.type, + ) + outputs = self.inputs.Outputs(**lazy_fields) + + outpt: lazy.LazyOutField + for outpt in attrs_values(outputs).values(): + # Assign the current node to the lazy fields so they can access the state + outpt._node = self + # If the node has a non-empty state, wrap the type of the lazy field in + # a combination of an optional list and a number of nested StateArrays + # types based on the number of states the node is split over and whether + # it has a combiner + if self._state: + outpt._type = self._state.nest_output_type(outpt._type) + # Flag the output lazy fields as being not typed checked (i.e. assigned to + # another node's inputs) yet. This is used to prevent the user from changing + # the type of the output after it has been accessed by connecting it to an + # output of an upstream node with additional state variables. 
+ outpt._type_checked = False + self._lzout = outputs + return outputs + + @property + def splitter(self): + if not self._state: + return () + return self._state.splitter + + @property + def combiner(self): + if not self._state: + return () + return self._state.combiner + + def _check_if_outputs_have_been_used(self, msg): + used = [] + if self._lzout: + for outpt_name, outpt_val in attrs.asdict( + self._lzout, recurse=False + ).items(): + if outpt_val.type_checked: + used.append(outpt_name) + if used: + raise RuntimeError( + f"Outputs {used} of {self} have already been accessed and therefore " + + msg + ) + + def _set_state(self) -> None: + # Add node name to state's splitter, combiner and container_ndim loaded from the def + splitter = deepcopy(self._task._splitter) # these can be modified in state + combiner = deepcopy(self._task._combiner) # these can be modified in state + container_ndim = {} + if splitter: + splitter = add_name_splitter(splitter, self.name) + if combiner: + combiner = add_name_combiner(combiner, self.name) + if self._task._container_ndim: + for key, val in self._task._container_ndim.items(): + container_ndim[f"{self.name}.{key}"] = val + other_states = self._get_upstream_states() + if splitter or combiner or other_states: + self._state = State( + self.name, + splitter=splitter, + other_states=other_states, + combiner=combiner, + container_ndim=container_ndim, + ) + if combiner: + if not_split := [ + c + for c in combiner + if not any(c in s for s in self.state.splitter_rpn) and "." not in c + ]: + raise ValueError( + f"Combiner fields {not_split} for Node {self.name!r} are not in the " + f"splitter {self.state.splitter_rpn}" + ) + else: + self._state = None + + def _get_upstream_states(self) -> dict[str, tuple["State", list[str]]]: + """Get the states of the upstream nodes that are connected to this node""" + upstream_states = {} + for inpt_name, val in self.input_values: + if ( + isinstance(val, lazy.LazyOutField) + and val._node.state + and val._node.state.depth() + ): + node: Node = val._node + # variables that are part of inner splitters should be treated as a containers + if node.state and f"{node.name}.{val._field}" in node.state.splitter: + node.state._inner_container_ndim[f"{node.name}.{val._field}"] = 1 + # adding task_name: (task.state, [a field from the connection] + if node.name not in upstream_states: + upstream_states[node.name] = (node.state, [val._field]) + else: + # if the task already exist in other_state, + # additional field name should be added to the list of fields + upstream_states[node.name][1].append(val._field) + return upstream_states + + # else: + # # todo it never gets here + # breakpoint() + # inputs_dict = {inp: getattr(self.inputs, inp) for inp in self.input_names} + # return None, inputs_dict diff --git a/pydra/engine/result.py b/pydra/engine/result.py new file mode 100644 index 0000000000..e6b58a6989 --- /dev/null +++ b/pydra/engine/result.py @@ -0,0 +1,327 @@ +"""Job I/O definitions.""" + +from pathlib import Path +import typing as ty +import attrs +import pickle +import time +import os +import cloudpickle as cp +import getpass +from time import strftime +from filelock import SoftFileLock +from fileformats.generic import FileSet +from pydra.utils.general import ( + attrs_values, + attrs_fields, + is_workflow, +) +from pydra.utils.typing import copy_nested_files +from pydra.compose import workflow, base + +if ty.TYPE_CHECKING: + from pydra.engine.job import Job + + +TaskType = ty.TypeVar("TaskType", bound=base.Task) +OutputsType = 
ty.TypeVar("OutputsType", bound=base.Outputs) + + +@attrs.define(kw_only=True) +class Runtime: + """Represent run time metadata.""" + + rss_peak_gb: ty.Optional[float] = None + """Peak in consumption of physical RAM.""" + vms_peak_gb: ty.Optional[float] = None + """Peak in consumption of virtual memory.""" + cpu_peak_percent: ty.Optional[float] = None + """Peak in cpu consumption.""" + + +@attrs.define(kw_only=True) +class Result(ty.Generic[OutputsType]): + """Metadata regarding the outputs of processing.""" + + cache_dir: Path + outputs: OutputsType | None = None + runtime: Runtime | None = None + errored: bool = False + task: base.Task[OutputsType] | None = None + + CLOUD_PICKLE_ATTRS = ("outputs", "task") + + def __getstate__(self): + state = attrs_values(self) + for attr in self.CLOUD_PICKLE_ATTRS: + if state[attr] is not None: + state[attr] = cp.dumps(state[attr]) + return state + + def __setstate__(self, state): + for attr in self.CLOUD_PICKLE_ATTRS: + if state[attr] is not None: + state[attr] = cp.loads(state[attr]) + for name, val in state.items(): + setattr(self, name, val) + + def get_output_field(self, field_name): + """Used in get_values in Workflow + + Parameters + ---------- + field_name : `str` + Name of field in LazyField object + """ + if field_name == "all_": + return attrs_values(self.outputs) + else: + return getattr(self.outputs, field_name) + + @property + def errors(self): + if self.errored: + error_file = self.cache_dir / "_error.pklz" + if error_file.exists(): + with open(error_file, "rb") as f: + return cp.load(f) + return None + + @property + def job(self): + job_pkl = self.cache_dir / "_job.pklz" + if not job_pkl.exists(): + return None + with open(job_pkl, "rb") as f: + return cp.load(f) + + @property + def return_values(self): + return_values_pkl = self.cache_dir / "_return_values.pklz" + if not return_values_pkl.exists(): + return None + with open(return_values_pkl, "rb") as f: + return cp.load(f) + + +@attrs.define(kw_only=True) +class RuntimeSpec: + """ + Specification for a job. + + From CWL:: + + InlineJavascriptRequirement + SchemaDefRequirement + DockerRequirement + SoftwareRequirement + InitialWorkDirRequirement + EnvVarRequirement + ShellCommandRequirement + ResourceRequirement + + InlineScriptRequirement + + """ + + outdir: ty.Optional[str] = None + container: ty.Optional[str] = "shell" + network: bool = False + + +def load_result( + checksum: str, + readonly_caches: list[Path], + retries: int = 10, + polling_interval: float = 0.1, +) -> Result | None: + """ + Restore a result from the cache. + + Parameters + ---------- + checksum : :obj:`str` + Unique identifier of the job to be loaded. + readonly_caches : :obj:`list` of :obj:`os.pathlike` + List of cache directories, in order of priority, where + the checksum will be looked for. + retries : :obj:`int` + Number of times to retry loading the result if the file is not + completely written. + polling_interval : :obj:`float` + Time to wait between retries. + + Returns + ------- + result : :obj:`Result` | None + The result object if found, otherwise None. + + """ + if not readonly_caches: + return None + # TODO: if there are issues with loading, we might need to + # TODO: sleep and repeat loads (after checking that there are no lock files!) 
+ for location in readonly_caches: + if (location / checksum).exists(): + result_file = location / checksum / "_result.pklz" + if result_file.exists() and result_file.stat().st_size > 0: + # Load the result file, retrying if necessary while waiting for the file + # to be written completely. + for _ in range(retries): + try: + with open(result_file, "rb") as fp: + return cp.load(fp) + except (pickle.UnpicklingError, EOFError): + # if the file is not finished writing + # wait and retry + time.sleep(polling_interval) + return None + return None + + +def save( + task_path: Path, + result: "Result | None" = None, + job: "Job[TaskType] | None" = None, + return_values: dict[str, ty.Any] | None = None, + name_prefix: str = None, +) -> None: + """ + Save a :class:`~pydra.compose.base.Task` object and/or results. + + Parameters + ---------- + task_path : :obj:`Path` + Write directory + result : :obj:`Result` + Result to pickle and write + job : :class:`~pydra.compose.base.Task` + Job to pickle and write + return_values : :obj:`dict` + Return values to pickle and write + """ + + if job is None and result is None: + raise ValueError("Nothing to be saved") + + if not isinstance(task_path, Path): + task_path = Path(task_path) + task_path.mkdir(parents=True, exist_ok=True) + if name_prefix is None: + name_prefix = "" + + lockfile = task_path.parent / (task_path.name + "_save.lock") + with SoftFileLock(lockfile): + if result: + if result.task and is_workflow(result.task) and result.outputs is not None: + # copy files to the workflow directory + result.outputs = copyfile_workflow( + wf_path=task_path, outputs=result.outputs + ) + with (task_path / f"{name_prefix}_result.pklz").open("wb") as fp: + cp.dump(result, fp) + if job: + with (task_path / f"{name_prefix}_job.pklz").open("wb") as fp: + cp.dump(job, fp) + if return_values: + with (task_path / f"{name_prefix}_return_values.pklz").open("wb") as fp: + cp.dump(job, fp) + + +def copyfile_workflow( + wf_path: os.PathLike, outputs: workflow.Outputs +) -> workflow.Outputs: + """if file in the wf results, the file will be copied to the workflow directory""" + + for field in attrs_fields(outputs): + value = getattr(outputs, field.name) + # if the field is a path or it can contain a path _copyfile_single_value is run + # to move all files and directories to the workflow directory + new_value = copy_nested_files(value, wf_path, mode=FileSet.CopyMode.hardlink) + setattr(outputs, field.name, new_value) + return outputs + + +def gather_runtime_info(fname): + """ + Extract runtime information from a file. + + Parameters + ---------- + fname : :obj:`os.pathlike` + The file containing runtime information + + Returns + ------- + runtime : :obj:`Runtime` + A runtime object containing the collected information. + + """ + + runtime = Runtime(rss_peak_gb=None, vms_peak_gb=None, cpu_peak_percent=None) + + # Read .prof file in and set runtime values + data = [ + [float(el) for el in line.strip().split(",")] + for line in Path(fname).read_text().splitlines() + ] + if data: + runtime.rss_peak_gb = max([val[2] for val in data]) / 1024 + runtime.vms_peak_gb = max([val[3] for val in data]) / 1024 + runtime.cpu_peak_percent = max([val[1] for val in data]) + + """ + runtime.prof_dict = { + 'time': vals[:, 0].tolist(), + 'cpus': vals[:, 1].tolist(), + 'rss_GiB': (vals[:, 2] / 1024).tolist(), + 'vms_GiB': (vals[:, 3] / 1024).tolist(), + } + """ + return runtime + + +def create_checksum(name, inputs): + """ + Generate a checksum name for a given combination of job name and inputs. 
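+
+    The name is simply the job name and the inputs string joined with a hyphen,
+    e.g. (with made-up values) ``create_checksum("taskA", "8f3c")`` would return
+    ``"taskA-8f3c"``.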
+ + Parameters + ---------- + name : :obj:`str` + Job name. + inputs : :obj:`str` + String of inputs. + + """ + return "-".join((name, inputs)) + + +def record_error(error_path, error): + """Write an error file.""" + + error_message = str(error) + + resultfile = error_path / "_result.pklz" + if not resultfile.exists(): + error_message += """\n + When creating this error file, the results file corresponding + to the job could not be found.""" + + name_checksum = str(error_path.name) + timeofcrash = strftime("%Y%m%d-%H%M%S") + try: + login_name = getpass.getuser() + except KeyError: + login_name = f"UID{os.getuid():d}" + + full_error = { + "time of crash": timeofcrash, + "login name": login_name, + "name with checksum": name_checksum, + "error message": error, + } + + with (error_path / "_error.pklz").open("wb") as fp: + cp.dump(full_error, fp) + + return error_path / "_error.pklz" diff --git a/pydra/engine/specs.py b/pydra/engine/specs.py deleted file mode 100644 index a2e3651779..0000000000 --- a/pydra/engine/specs.py +++ /dev/null @@ -1,1077 +0,0 @@ -"""Task I/O specifications.""" - -from pathlib import Path -import typing as ty -import inspect -import re -import os -from copy import copy -from glob import glob -import attr -from fileformats.core import FileSet -from fileformats.generic import ( - File, - Directory, -) -import pydra -from .helpers_file import template_update_single -from ..utils.hash import hash_function, Cache - -# from ..utils.misc import add_exc_note - - -T = ty.TypeVar("T") - - -def attr_fields(spec, exclude_names=()): - return [field for field in spec.__attrs_attrs__ if field.name not in exclude_names] - - -# These are special types that are checked for in the construction of input/output specs -# and special converters inserted into the attrs fields. - - -class MultiInputObj(list, ty.Generic[T]): - pass - - -MultiInputFile = MultiInputObj[File] - - -# Since we can't create a NewType from a type union, we add a dummy type to the union -# so we can detect the MultiOutput in the input/output spec creation -class MultiOutputType: - pass - - -MultiOutputObj = ty.Union[list, object, MultiOutputType] -MultiOutputFile = ty.Union[File, ty.List[File], MultiOutputType] - -OUTPUT_TEMPLATE_TYPES = ( - Path, - ty.List[Path], - ty.Union[Path, bool], - ty.Union[ty.List[Path], bool], - ty.List[ty.List[Path]], -) - - -@attr.s(auto_attribs=True, kw_only=True) -class SpecInfo: - """Base data structure for metadata of specifications.""" - - name: str - """A name for the specification.""" - fields: ty.List[ty.Tuple] = attr.ib(factory=list) - """List of names of fields (can be inputs or outputs).""" - bases: ty.Sequence[ty.Type["BaseSpec"]] = attr.ib(factory=tuple) - """Keeps track of specification inheritance. 
- Should be a tuple containing at least one BaseSpec """ - - -@attr.s(auto_attribs=True, kw_only=True) -class BaseSpec: - """The base dataclass specs for all inputs and outputs.""" - - def collect_additional_outputs(self, inputs, output_dir, outputs): - """Get additional outputs.""" - return {} - - @property - def hash(self): - hsh, self._hashes = self._compute_hashes() - return hsh - - def hash_changes(self): - """Detects any changes in the hashed values between the current inputs and the - previously calculated values""" - _, new_hashes = self._compute_hashes() - return [k for k, v in new_hashes.items() if v != self._hashes[k]] - - def _compute_hashes(self) -> ty.Tuple[bytes, ty.Dict[str, bytes]]: - """Compute a basic hash for any given set of fields.""" - inp_dict = {} - for field in attr_fields( - self, exclude_names=("_graph_checksums", "bindings", "files_hash") - ): - if field.metadata.get("output_file_template"): - continue - # removing values that are not set from hash calculation - if getattr(self, field.name) is attr.NOTHING: - continue - if "container_path" in field.metadata: - continue - inp_dict[field.name] = getattr(self, field.name) - hash_cache = Cache() - field_hashes = { - k: hash_function(v, cache=hash_cache) for k, v in inp_dict.items() - } - if hasattr(self, "_graph_checksums"): - field_hashes["_graph_checksums"] = self._graph_checksums - return hash_function(sorted(field_hashes.items())), field_hashes - - def retrieve_values(self, wf, state_index: ty.Optional[int] = None): - """Get values contained by this spec.""" - retrieved_values = {} - for field in attr_fields(self): - value = getattr(self, field.name) - if isinstance(value, LazyField): - retrieved_values[field.name] = value.get_value( - wf, state_index=state_index - ) - for field, val in retrieved_values.items(): - setattr(self, field, val) - - def check_fields_input_spec(self): - """ - Check fields from input spec based on the medatada. - - e.g., if xor, requires are fulfilled, if value provided when mandatory. - - """ - fields = attr_fields(self) - - for field in fields: - field_is_mandatory = bool(field.metadata.get("mandatory")) - field_is_unset = getattr(self, field.name) is attr.NOTHING - - if field_is_unset and not field_is_mandatory: - continue - - # Collect alternative fields associated with this field. - alternative_fields = { - name: getattr(self, name) is not attr.NOTHING - for name in field.metadata.get("xor", []) - if name != field.name - } - alternatives_are_set = any(alternative_fields.values()) - - # Raise error if no field in mandatory alternative group is set. - if field_is_unset: - if alternatives_are_set: - continue - message = f"{field.name} is mandatory and unset." - if alternative_fields: - raise AttributeError( - message[:-1] - + f", but no alternative provided by {list(alternative_fields)}." - ) - else: - raise AttributeError(message) - - # Raise error if multiple alternatives are set. - elif alternatives_are_set: - set_alternative_fields = [ - name for name, is_set in alternative_fields.items() if is_set - ] - raise AttributeError( - f"{field.name} is mutually exclusive with {set_alternative_fields}" - ) - - # Collect required fields associated with this field. - required_fields = { - name: getattr(self, name) is not attr.NOTHING - for name in field.metadata.get("requires", []) - if name != field.name - } - - # Raise error if any required field is unset. 
- if not all(required_fields.values()): - unset_required_fields = [ - name for name, is_set in required_fields.items() if not is_set - ] - raise AttributeError(f"{field.name} requires {unset_required_fields}") - - def check_metadata(self): - """Check contained metadata.""" - - def template_update(self): - """Update template.""" - - def copyfile_input(self, output_dir): - """Copy the file pointed by a :class:`File` input.""" - - -@attr.s(auto_attribs=True, kw_only=True) -class Runtime: - """Represent run time metadata.""" - - rss_peak_gb: ty.Optional[float] = None - """Peak in consumption of physical RAM.""" - vms_peak_gb: ty.Optional[float] = None - """Peak in consumption of virtual memory.""" - cpu_peak_percent: ty.Optional[float] = None - """Peak in cpu consumption.""" - - -@attr.s(auto_attribs=True, kw_only=True) -class Result: - """Metadata regarding the outputs of processing.""" - - output: ty.Optional[ty.Any] = None - runtime: ty.Optional[Runtime] = None - errored: bool = False - - def __getstate__(self): - state = self.__dict__.copy() - if state["output"] is not None: - fields = tuple((el.name, el.type) for el in attr_fields(state["output"])) - state["output_spec"] = (state["output"].__class__.__name__, fields) - state["output"] = attr.asdict(state["output"], recurse=False) - return state - - def __setstate__(self, state): - if "output_spec" in state: - spec = list(state["output_spec"]) - del state["output_spec"] - klass = attr.make_class( - spec[0], {k: attr.ib(type=v) for k, v in list(spec[1])} - ) - state["output"] = klass(**state["output"]) - self.__dict__.update(state) - - def get_output_field(self, field_name): - """Used in get_values in Workflow - - Parameters - ---------- - field_name : `str` - Name of field in LazyField object - """ - if field_name == "all_": - return attr.asdict(self.output, recurse=False) - else: - return getattr(self.output, field_name) - - -@attr.s(auto_attribs=True, kw_only=True) -class RuntimeSpec: - """ - Specification for a task. - - From CWL:: - - InlineJavascriptRequirement - SchemaDefRequirement - DockerRequirement - SoftwareRequirement - InitialWorkDirRequirement - EnvVarRequirement - ShellCommandRequirement - ResourceRequirement - - InlineScriptRequirement - - """ - - outdir: ty.Optional[str] = None - container: ty.Optional[str] = "shell" - network: bool = False - - -@attr.s(auto_attribs=True, kw_only=True) -class FunctionSpec(BaseSpec): - """Specification for a process invoked from a shell.""" - - def check_metadata(self): - """ - Check the metadata for fields in input_spec and fields. - - Also sets the default values when available and needed. 
- - """ - supported_keys = { - "allowed_values", - "copyfile", - "help_string", - "mandatory", - # "readonly", #likely not needed - # "output_field_name", #likely not needed - # "output_file_template", #likely not needed - "requires", - "keep_extension", - "xor", - "sep", - } - for fld in attr_fields(self, exclude_names=("_func", "_graph_checksums")): - mdata = fld.metadata - # checking keys from metadata - if set(mdata.keys()) - supported_keys: - raise AttributeError( - f"only these keys are supported {supported_keys}, but " - f"{set(mdata.keys()) - supported_keys} provided" - ) - # checking if the help string is provided (required field) - if "help_string" not in mdata: - raise AttributeError(f"{fld.name} doesn't have help_string field") - # not allowing for default if the field is mandatory - if not fld.default == attr.NOTHING and mdata.get("mandatory"): - raise AttributeError( - f"default value ({fld.default!r}) should not be set when the field " - f"('{fld.name}') in {self}) is mandatory" - ) - # setting default if value not provided and default is available - if getattr(self, fld.name) is None: - if not fld.default == attr.NOTHING: - setattr(self, fld.name, fld.default) - - -@attr.s(auto_attribs=True, kw_only=True) -class ShellSpec(BaseSpec): - """Specification for a process invoked from a shell.""" - - executable: ty.Union[str, ty.List[str]] = attr.ib( - metadata={ - "help_string": "the first part of the command, can be a string, " - "e.g. 'ls', or a list, e.g. ['ls', '-l', 'dirname']" - } - ) - args: ty.Union[str, ty.List[str], None] = attr.ib( - None, - metadata={ - "help_string": "the last part of the command, can be a string, " - "e.g. , or a list" - }, - ) - - def retrieve_values(self, wf, state_index=None): - """Parse output results.""" - temp_values = {} - for field in attr_fields(self): - # retrieving values that do not have templates - if not field.metadata.get("output_file_template"): - value = getattr(self, field.name) - if isinstance(value, LazyField): - temp_values[field.name] = value.get_value( - wf, state_index=state_index - ) - for field, val in temp_values.items(): - value = path_to_string(value) - setattr(self, field, val) - - def check_metadata(self): - """ - Check the metadata for fields in input_spec and fields. - - Also sets the default values when available and needed. 
- - """ - from ..utils.typing import TypeParser - - supported_keys = { - "allowed_values", - "argstr", - "container_path", - "copyfile", - "help_string", - "mandatory", - "readonly", - "output_field_name", - "output_file_template", - "position", - "requires", - "keep_extension", - "xor", - "sep", - "formatter", - "_output_type", - } - - for fld in attr_fields(self, exclude_names=("_func", "_graph_checksums")): - mdata = fld.metadata - # checking keys from metadata - if set(mdata.keys()) - supported_keys: - raise AttributeError( - f"only these keys are supported {supported_keys}, but " - f"{set(mdata.keys()) - supported_keys} provided for '{fld.name}' " - f"field in {self}" - ) - # checking if the help string is provided (required field) - if "help_string" not in mdata: - raise AttributeError( - f"{fld.name} doesn't have help_string field in {self}" - ) - # assuming that fields with output_file_template shouldn't have default - if mdata.get("output_file_template"): - if not any( - TypeParser.matches_type(fld.type, t) for t in OUTPUT_TEMPLATE_TYPES - ): - raise TypeError( - f"Type of '{fld.name}' should be one of {OUTPUT_TEMPLATE_TYPES} " - f"(not {fld.type}) because it has a value for output_file_template " - f"({mdata['output_file_template']!r})" - ) - if fld.default not in [attr.NOTHING, True, False]: - raise AttributeError( - f"default value ({fld.default!r}) should not be set together with " - f"output_file_template ({mdata['output_file_template']!r}) for " - f"'{fld.name}' field in {self}" - ) - # not allowing for default if the field is mandatory - if not fld.default == attr.NOTHING and mdata.get("mandatory"): - raise AttributeError( - f"default value ({fld.default!r}) should not be set when the field " - f"('{fld.name}') in {self}) is mandatory" - ) - # setting default if value not provided and default is available - if getattr(self, fld.name) is None: - if not fld.default == attr.NOTHING: - setattr(self, fld.name, fld.default) - - -@attr.s(auto_attribs=True, kw_only=True) -class ShellOutSpec: - """Output specification of a generic shell process.""" - - return_code: int - """The process' exit code.""" - stdout: str - """The process' standard output.""" - stderr: str - """The process' standard input.""" - - def collect_additional_outputs(self, inputs, output_dir, outputs): - from ..utils.typing import TypeParser - - """Collect additional outputs from shelltask output_spec.""" - additional_out = {} - for fld in attr_fields(self, exclude_names=("return_code", "stdout", "stderr")): - if not TypeParser.is_subclass( - fld.type, - ( - os.PathLike, - MultiOutputObj, - int, - float, - bool, - str, - list, - ), - ): - raise TypeError( - f"Support for {fld.type} type, required for '{fld.name}' in {self}, " - "has not been implemented in collect_additional_output" - ) - # assuming that field should have either default or metadata, but not both - input_value = getattr(inputs, fld.name, attr.NOTHING) - if input_value is not attr.NOTHING: - if TypeParser.contains_type(FileSet, fld.type): - if input_value is not False: - label = f"output field '{fld.name}' of {self}" - input_value = TypeParser(fld.type, label=label).coerce( - input_value - ) - additional_out[fld.name] = input_value - elif ( - fld.default is None or fld.default == attr.NOTHING - ) and not fld.metadata: # TODO: is it right? 
- raise AttributeError("File has to have default value or metadata") - elif fld.default != attr.NOTHING: - additional_out[fld.name] = self._field_defaultvalue(fld, output_dir) - elif fld.metadata: - if ( - fld.type in [int, float, bool, str, list] - and "callable" not in fld.metadata - ): - raise AttributeError( - f"{fld.type} has to have a callable in metadata" - ) - additional_out[fld.name] = self._field_metadata( - fld, inputs, output_dir, outputs - ) - return additional_out - - def generated_output_names(self, inputs, output_dir): - """Returns a list of all outputs that will be generated by the task. - Takes into account the task input and the requires list for the output fields. - TODO: should be in all Output specs? - """ - # checking the input (if all mandatory fields are provided, etc.) - inputs.check_fields_input_spec() - output_names = ["return_code", "stdout", "stderr"] - for fld in attr_fields(self, exclude_names=("return_code", "stdout", "stderr")): - if fld.type not in [File, MultiOutputFile, Directory]: - raise Exception("not implemented (collect_additional_output)") - # assuming that field should have either default or metadata, but not both - if ( - fld.default in (None, attr.NOTHING) and not fld.metadata - ): # TODO: is it right? - raise AttributeError("File has to have default value or metadata") - elif fld.default != attr.NOTHING: - output_names.append(fld.name) - elif ( - fld.metadata - and self._field_metadata( - fld, inputs, output_dir, outputs=None, check_existance=False - ) - != attr.NOTHING - ): - output_names.append(fld.name) - return output_names - - def _field_defaultvalue(self, fld, output_dir): - """Collect output file if the default value specified.""" - if not isinstance(fld.default, (str, Path)): - raise AttributeError( - f"{fld.name} is a File, so default value " - f"should be a string or a Path, " - f"{fld.default} provided" - ) - default = fld.default - if isinstance(default, str): - default = Path(default) - - default = output_dir / default - if "*" not in str(default): - if default.exists(): - return default - else: - raise AttributeError(f"file {default} does not exist") - else: - all_files = [Path(el) for el in glob(str(default.expanduser()))] - if len(all_files) > 1: - return all_files - elif len(all_files) == 1: - return all_files[0] - else: - raise AttributeError(f"no file matches {default.name}") - - def _field_metadata( - self, fld, inputs, output_dir, outputs=None, check_existance=True - ): - """Collect output file if metadata specified.""" - if self._check_requires(fld, inputs) is False: - return attr.NOTHING - - if "value" in fld.metadata: - return output_dir / fld.metadata["value"] - # this block is only run if "output_file_template" is provided in output_spec - # if the field is set in input_spec with output_file_template, - # than the field already should have value - elif "output_file_template" in fld.metadata: - value = template_update_single( - fld, inputs=inputs, output_dir=output_dir, spec_type="output" - ) - - if fld.type is MultiOutputFile and type(value) is list: - # TODO: how to deal with mandatory list outputs - ret = [] - for val in value: - val = Path(val) - if check_existance and not val.exists(): - ret.append(attr.NOTHING) - else: - ret.append(val) - return ret - else: - val = Path(value) - # checking if the file exists - if check_existance and not val.exists(): - # if mandatory raise exception - if "mandatory" in fld.metadata: - if fld.metadata["mandatory"]: - raise Exception( - f"mandatory output for variable {fld.name} 
does not exist" - ) - return attr.NOTHING - return val - elif "callable" in fld.metadata: - callable_ = fld.metadata["callable"] - if isinstance(callable_, staticmethod): - # In case callable is defined as a static method, - # retrieve the function wrapped in the descriptor. - callable_ = callable_.__func__ - call_args = inspect.getfullargspec(callable_) - call_args_val = {} - for argnm in call_args.args: - if argnm == "field": - call_args_val[argnm] = fld - elif argnm == "output_dir": - call_args_val[argnm] = output_dir - elif argnm == "inputs": - call_args_val[argnm] = inputs - elif argnm == "stdout": - call_args_val[argnm] = outputs["stdout"] - elif argnm == "stderr": - call_args_val[argnm] = outputs["stderr"] - else: - try: - call_args_val[argnm] = getattr(inputs, argnm) - except AttributeError: - raise AttributeError( - f"arguments of the callable function from {fld.name} " - f"has to be in inputs or be field or output_dir, " - f"but {argnm} is used" - ) - return callable_(**call_args_val) - else: - raise Exception( - f"Metadata for '{fld.name}', does not not contain any of the required fields " - f'("callable", "output_file_template" or "value"): {fld.metadata}.' - ) - - def _check_requires(self, fld, inputs): - """checking if all fields from the requires and template are set in the input - if requires is a list of list, checking if at least one list has all elements set - """ - from .helpers import ensure_list - - if "requires" in fld.metadata: - # if requires is a list of list it is treated as el[0] OR el[1] OR... - required_fields = ensure_list(fld.metadata["requires"]) - if all([isinstance(el, list) for el in required_fields]): - field_required_OR = required_fields - # if requires is a list of tuples/strings - I'm creating a 1-el nested list - elif all([isinstance(el, (str, tuple)) for el in required_fields]): - field_required_OR = [required_fields] - else: - raise Exception( - f"requires field can be a list of list, or a list " - f"of strings/tuples, but {fld.metadata['requires']} " - f"provided for {fld.name}" - ) - else: - field_required_OR = [[]] - - for field_required in field_required_OR: - # if the output has output_file_template field, - # adding all input fields from the template to requires - if "output_file_template" in fld.metadata: - template = fld.metadata["output_file_template"] - # if a template is a function it has to be run first with the inputs as the only arg - if callable(template): - template = template(inputs) - inp_fields = re.findall(r"{\w+}", template) - field_required += [ - el[1:-1] for el in inp_fields if el[1:-1] not in field_required - ] - - # it's a flag, of the field from the list is not in input it will be changed to False - required_found = True - for field_required in field_required_OR: - required_found = True - # checking if the input fields from requires have set values - for inp in field_required: - if isinstance(inp, str): # name of the input field - if not hasattr(inputs, inp): - raise Exception( - f"{inp} is not a valid input field, can't be used in requires" - ) - elif getattr(inputs, inp) in [attr.NOTHING, None]: - required_found = False - break - elif isinstance(inp, tuple): # (name, allowed values) - inp, allowed_val = inp[0], ensure_list(inp[1]) - if not hasattr(inputs, inp): - raise Exception( - f"{inp} is not a valid input field, can't be used in requires" - ) - elif getattr(inputs, inp) not in allowed_val: - required_found = False - break - else: - raise Exception( - f"each element of the requires element should be a string or a 
tuple, " - f"but {inp} is found in {field_required}" - ) - # if the specific list from field_required_OR has all elements set, no need to check more - if required_found: - break - - if required_found: - return True - else: - return False - - -@attr.s -class LazyInterface: - _task: "core.TaskBase" = attr.ib() - _attr_type: str - - def __getattr__(self, name): - if name in ("_task", "_attr_type", "_field_names"): - raise AttributeError(f"{name} hasn't been set yet") - if name not in self._field_names: - raise AttributeError( - f"Task '{self._task.name}' has no {self._attr_type} attribute '{name}', " - "available: '" + "', '".join(self._field_names) + "'" - ) - type_ = self._get_type(name) - splits = self._get_task_splits() - combines = self._get_task_combines() - if combines and self._attr_type == "output": - # Add in any scalar splits referencing upstream splits, i.e. "_myupstreamtask", - # "_myarbitrarytask" - combined_upstreams = set() - if self._task.state: - for scalar in LazyField.sanitize_splitter( - self._task.state.splitter, strip_previous=False - ): - for field in scalar: - if field.startswith("_"): - node_name = field[1:] - if any(c.split(".")[0] == node_name for c in combines): - combines.update( - f for f in scalar if not f.startswith("_") - ) - combined_upstreams.update( - f[1:] for f in scalar if f.startswith("_") - ) - if combines: - # Wrap type in list which holds the combined items - type_ = ty.List[type_] - # Iterate through splits to remove any splits which are removed by the - # combiner - for splitter in copy(splits): - remaining = tuple( - s - for s in splitter - if not any( - (x in combines or x.split(".")[0] in combined_upstreams) - for x in s - ) - ) - if remaining != splitter: - splits.remove(splitter) - if remaining: - splits.add(remaining) - # Wrap the type in a nested StateArray type - if splits: - type_ = StateArray[type_] - lf_klass = LazyInField if self._attr_type == "input" else LazyOutField - return lf_klass[type_]( - name=self._task.name, - field=name, - type=type_, - splits=splits, - ) - - def _get_task_splits(self) -> ty.Set[ty.Tuple[ty.Tuple[str, ...], ...]]: - """Returns the states over which the inputs of the task are split""" - splitter = self._task.state.splitter if self._task.state else None - splits = set() - if splitter: - # Ensure that splits is of tuple[tuple[str, ...], ...] 
form - splitter = LazyField.sanitize_splitter(splitter) - if splitter: - splits.add(splitter) - for inpt in attr.asdict(self._task.inputs, recurse=False).values(): - if isinstance(inpt, LazyField): - splits.update(inpt.splits) - return splits - - def _get_task_combines(self) -> ty.Set[ty.Union[str, ty.Tuple[str, ...]]]: - """Returns the states over which the outputs of the task are combined""" - combiner = ( - self._task.state.combiner - if self._task.state is not None - else getattr(self._task, "fut_combiner", None) - ) - return set(combiner) if combiner else set() - - -class LazyIn(LazyInterface): - _attr_type = "input" - - def _get_type(self, name): - attr = next(t for n, t in self._task.input_spec.fields if n == name) - if attr is None: - return ty.Any - elif inspect.isclass(attr): - return attr - else: - return attr.type - - @property - def _field_names(self): - return [field[0] for field in self._task.input_spec.fields] - - -class LazyOut(LazyInterface): - _attr_type = "output" - - def _get_type(self, name): - try: - type_ = next(f[1] for f in self._task.output_spec.fields if f[0] == name) - except StopIteration: - type_ = ty.Any - else: - if not inspect.isclass(type_): - try: - type_ = type_.type # attrs _CountingAttribute - except AttributeError: - pass # typing._SpecialForm - return type_ - - @property - def _field_names(self): - return self._task.output_names + ["all_"] - - -TypeOrAny = ty.Union[ty.Type[T], ty.Any] -Splitter = ty.Union[str, ty.Tuple[str, ...]] - - -@attr.s(auto_attribs=True, kw_only=True) -class LazyField(ty.Generic[T]): - """Lazy fields implement promises.""" - - name: str - field: str - type: TypeOrAny - # Set of splitters that have been applied to the lazy field. Note that the splitter - # specifications are transformed to a tuple[tuple[str, ...], ...] form where the - # outer tuple is the outer product, the inner tuple are inner products (where either - # product can be of length==1) - splits: ty.FrozenSet[ty.Tuple[ty.Tuple[str, ...], ...]] = attr.field( - factory=frozenset, converter=frozenset - ) - cast_from: ty.Optional[ty.Type[ty.Any]] = None - - def __bytes_repr__(self, cache): - yield type(self).__name__.encode() - yield self.name.encode() - yield self.field.encode() - - def cast(self, new_type: TypeOrAny) -> "LazyField": - """ "casts" the lazy field to a new type - - Parameters - ---------- - new_type : type - the type to cast the lazy-field to - - Returns - ------- - cast_field : LazyField - a copy of the lazy field with the new type - """ - return type(self)[new_type]( - name=self.name, - field=self.field, - type=new_type, - splits=self.splits, - cast_from=self.cast_from if self.cast_from else self.type, - ) - - def split(self, splitter: Splitter) -> "LazyField": - """ "Splits" the lazy field over an array of nodes by replacing the sequence type - of the lazy field with StateArray to signify that it will be "split" across - - Parameters - ---------- - splitter : str or ty.Tuple[str, ...] 
or ty.List[str] - the splitter to append to the list of splitters - """ - from ..utils.typing import TypeParser # pylint: disable=import-outside-toplevel - - splits = self.splits | set([LazyField.sanitize_splitter(splitter)]) - # Check to see whether the field has already been split over the given splitter - if splits == self.splits: - return self - - # Modify the type of the lazy field to include the split across a state-array - inner_type, prev_split_depth = TypeParser.strip_splits(self.type) - assert prev_split_depth <= 1 - if inner_type is ty.Any: - type_ = StateArray[ty.Any] - elif TypeParser.matches_type(inner_type, list): - item_type = TypeParser.get_item_type(inner_type) - type_ = StateArray[item_type] - else: - raise TypeError( - f"Cannot split non-sequence field {self} of type {inner_type}" - ) - if prev_split_depth: - type_ = StateArray[type_] - return type(self)[type_]( - name=self.name, - field=self.field, - type=type_, - splits=splits, - ) - - @classmethod - def sanitize_splitter( - cls, splitter: Splitter, strip_previous: bool = True - ) -> ty.Tuple[ty.Tuple[str, ...], ...]: - """Converts the splitter spec into a consistent tuple[tuple[str, ...], ...] form - used in LazyFields""" - if isinstance(splitter, str): - splitter = (splitter,) - if isinstance(splitter, tuple): - splitter = (splitter,) # type: ignore - else: - assert isinstance(splitter, list) - # convert to frozenset to differentiate from tuple, yet still be hashable - # (NB: order of fields in list splitters aren't relevant) - splitter = tuple((s,) if isinstance(s, str) else s for s in splitter) - # Strip out fields starting with "_" designating splits in upstream nodes - if strip_previous: - stripped = tuple( - tuple(f for f in i if not f.startswith("_")) for i in splitter - ) - splitter = tuple(s for s in stripped if s) # type: ignore - return splitter # type: ignore - - def _apply_cast(self, value): - """\"Casts\" the value from the retrieved type if a cast has been applied to - the lazy-field""" - from pydra.utils.typing import TypeParser - - if self.cast_from: - assert TypeParser.matches(value, self.cast_from) - value = self.type(value) - return value - - -class LazyInField(LazyField[T]): - attr_type = "input" - - def get_value( - self, wf: "pydra.Workflow", state_index: ty.Optional[int] = None - ) -> ty.Any: - """Return the value of a lazy field. - - Parameters - ---------- - wf : Workflow - the workflow the lazy field references - state_index : int, optional - the state index of the field to access - - Returns - ------- - value : Any - the resolved value of the lazy-field - """ - from ..utils.typing import TypeParser # pylint: disable=import-outside-toplevel - - value = getattr(wf.inputs, self.field) - if TypeParser.is_subclass(self.type, StateArray) and not wf._pre_split: - _, split_depth = TypeParser.strip_splits(self.type) - - def apply_splits(obj, depth): - if depth < 1: - return obj - return StateArray[self.type](apply_splits(i, depth - 1) for i in obj) - - value = apply_splits(value, split_depth) - value = self._apply_cast(value) - return value - - -class LazyOutField(LazyField[T]): - attr_type = "output" - - def get_value( - self, wf: "pydra.Workflow", state_index: ty.Optional[int] = None - ) -> ty.Any: - """Return the value of a lazy field. 
- - Parameters - ---------- - wf : Workflow - the workflow the lazy field references - state_index : int, optional - the state index of the field to access - - Returns - ------- - value : Any - the resolved value of the lazy-field - """ - from ..utils.typing import TypeParser # pylint: disable=import-outside-toplevel - - node = getattr(wf, self.name) - result = node.result(state_index=state_index) - if result is None: - raise RuntimeError( - f"Could not find results of '{node.name}' node in a sub-directory " - f"named '{node.checksum}' in any of the cache locations.\n" - + "\n".join(str(p) for p in set(node.cache_locations)) - + f"\n\nThis is likely due to hash changes in '{self.name}' node inputs. " - f"Current values and hashes: {node.inputs}, " - f"{node.inputs._hashes}\n\n" - "Set loglevel to 'debug' in order to track hash changes " - "throughout the execution of the workflow.\n\n " - "These issues may have been caused by `bytes_repr()` methods " - "that don't return stable hash values for specific object " - "types across multiple processes (see bytes_repr() " - '"singledispatch "function in pydra/utils/hash.py).' - "You may need to write specific `bytes_repr()` " - "implementations (see `pydra.utils.hash.register_serializer`) or a " - "`__bytes_repr__()` dunder methods to handle one or more types in " - "your interface inputs." - ) - _, split_depth = TypeParser.strip_splits(self.type) - - def get_nested_results(res, depth: int): - if isinstance(res, list): - if not depth: - val = [r.get_output_field(self.field) for r in res] - else: - val = StateArray[self.type]( - get_nested_results(res=r, depth=depth - 1) for r in res - ) - else: - if res.errored: - raise ValueError( - f"Cannot retrieve value for {self.field} from {self.name} as " - "the node errored" - ) - val = res.get_output_field(self.field) - if depth and not wf._pre_split: - assert isinstance(val, ty.Sequence) and not isinstance(val, str) - val = StateArray[self.type](val) - return val - - value = get_nested_results(result, depth=split_depth) - value = self._apply_cast(value) - return value - - -class StateArray(ty.List[T]): - """an array of values from, or to be split over in an array of nodes (see TaskBase.split()), - multiple nodes of the same task. Used in type-checking to differentiate between list - types and values for multiple nodes - """ - - def __repr__(self): - return f"{type(self).__name__}(" + ", ".join(repr(i) for i in self) + ")" - - -def donothing(*args, **kwargs): - return None - - -@attr.s(auto_attribs=True, kw_only=True) -class TaskHook: - """Callable task hooks.""" - - pre_run_task: ty.Callable = donothing - post_run_task: ty.Callable = donothing - pre_run: ty.Callable = donothing - post_run: ty.Callable = donothing - - def __setattr__(self, attr, val): - if attr not in ["pre_run_task", "post_run_task", "pre_run", "post_run"]: - raise AttributeError("Cannot set unknown hook") - super().__setattr__(attr, val) - - def reset(self): - for val in ["pre_run_task", "post_run_task", "pre_run", "post_run"]: - setattr(self, val, donothing) - - -def path_to_string(value): - """Convert paths to strings.""" - if isinstance(value, Path): - value = str(value) - elif isinstance(value, list) and len(value) and isinstance(value[0], Path): - value = [str(val) for val in value] - return value - - -from . 
import core # noqa diff --git a/pydra/engine/state.py b/pydra/engine/state.py index befbf86b9d..7aa5701d71 100644 --- a/pydra/engine/state.py +++ b/pydra/engine/state.py @@ -3,22 +3,25 @@ from copy import deepcopy import itertools from functools import reduce +import logging +import typing as ty +from pydra.utils.typing import StateArray, TypeParser +from pydra.utils.general import ensure_list, attrs_values -from . import helpers_state as hlpst -from .helpers import ensure_list -from .specs import BaseSpec -# TODO: move to State -op = {".": zip, "*": itertools.product} +logger = logging.getLogger("pydra") + + +OutputsType = ty.TypeVar("OutputsType") class State: """ A class that specifies a State of all tasks. - * It's only used when a task have a splitter. + * It's only used when a job have a splitter. * It contains all information about splitter, combiner, final splitter, - and input values for specific task states + and input values for specific job states (specified by the splitter and the input). * It also contains information about the final groups and the final splitter if combiner is available. @@ -26,7 +29,7 @@ class State: Attributes ---------- name : :obj:`str` - name of the state that is the same as a name of the task + name of the state that is the same as a name of the job splitter : :obj:`str`, :obj:`tuple`, :obj:`list` can be a str (name of a single input), tuple for scalar splitter, or list for outer splitter @@ -60,8 +63,8 @@ class State: values for all state inputs (i.e. inputs that are part of the splitter) inputs_ind : :obj:`list` of :obj:`dict` dictionary for every state that contains - indices for all task inputs (i.e. inputs that are relevant - for current task, can be outputs from previous nodes) + indices for all job inputs (i.e. inputs that are relevant + for current job, can be outputs from previous nodes) group_for_inputs : :obj:`dict` specifying groups (axes) for each input field (depends on the splitter) @@ -77,16 +80,23 @@ class State: """ - def __init__(self, name, splitter=None, combiner=None, other_states=None): + def __init__( + self, + name, + splitter=None, + combiner=None, + container_ndim=None, + other_states=None, + ): """ Initialize a state. 
Parameters ---------- name : :obj:`str` - name (should be the same as the task's name) + name (should be the same as the job's name) splitter : :obj:`str`, or :obj:`tuple`, or :obj:`list` - splitter of a task + splitter of a job combiner : :obj:`str`, or :obj:`list`) field/fields used to combine results other_states :obj:`dict`: @@ -99,6 +109,9 @@ def __init__(self, name, splitter=None, combiner=None, other_states=None): self.splitter = splitter # temporary combiner self.combiner = combiner + self.container_ndim = container_ndim or {} + self._inner_container_ndim = {} + self._inputs_ind = None # if other_states, the connections have to be updated if self.other_states: self.update_connections() @@ -110,6 +123,101 @@ def __str__(self): f"and combiner: {self.combiner}" ) + @property + def names(self): + """Return the names of the states.""" + previous_states_keys = { + f"_{v.name}": v.keys_final for v in self.inner_inputs.values() + } + names = [] + # iterating splitter_rpn + for token in self.splitter_rpn: + if token in [".", "*"]: # token is one of the input var + continue + # adding variable to the stack + if token.startswith("_"): + new_keys = previous_states_keys[token] + names += new_keys + else: + names.append(token) + return names + + def depth(self, before_combine: bool = False) -> int: + """Return the number of splits of the state, i.e. the number nested + state arrays to wrap around the type of lazy out fields + + Parameters + ---------- + before_combine : :obj:`bool` + if True, the depth is after combining the fields, otherwise it is before + any combinations + + Returns + ------- + int + number of splits in the state (i.e. linked splits only add 1) + """ + + # replace field names with 1 or 0 (1 if the field is included in the state) + include_rpn = [ + ( + s + if s in [".", "*"] + else (1 if before_combine else int(s not in self.combiner)) + ) + for s in self.splitter_rpn + ] + + stack = [] + for opr in include_rpn: + if opr == ".": + assert len(stack) >= 2 + opr1 = stack.pop() + opr2 = stack.pop() + stack.append(opr1 and opr2) + elif opr == "*": + assert len(stack) >= 2 + stack.append(stack.pop() + stack.pop()) + else: + stack.append(opr) + assert len(stack) == 1 + return stack[0] + + def nest_output_type(self, type_: type) -> type: + """Nests a type of an output field in a combination of lists and state-arrays + based on the state's splitter and combiner + + Parameters + ---------- + type_ : type + the type of the output field + + Returns + ------- + type + the nested type of the output field + """ + + state_array_depth = self.depth() + + # If there is a combination, it will get flattened into a single list + if self.depth(before_combine=True) > state_array_depth: + type_ = list[type_] + + # Nest the uncombined state arrays around the type + for _ in range(state_array_depth): + type_ = StateArray[type_] + return type_ + + @classmethod + def combine_state_arrays(cls, type_: type) -> type: + """Collapses (potentially nested) state array(s) into a single list""" + if TypeParser.get_origin(type_) is StateArray: + # Implicitly combine any remaining uncombined states into a single + # list + type_ = list[TypeParser.strip_splits(type_)[0]] + return type_ + @property def splitter(self): """Get the splitter of the state.""" @@ -118,11 +226,9 @@ def splitter(self): @splitter.setter def splitter(self, splitter): if splitter and not isinstance(splitter, (str, tuple, list)): - raise hlpst.PydraStateError( - "splitter has to be a string, a tuple or a list" - ) + raise 
PydraStateError("splitter has to be a string, a tuple or a list") if splitter: - self._splitter = hlpst.add_name_splitter(splitter, self.name) + self._splitter = add_name_splitter(splitter, self.name) else: self._splitter = None # updating splitter_rpn @@ -131,15 +237,15 @@ def splitter(self, splitter): def _splitter_rpn_updates(self): """updating splitter_rpn and splitter_rpn_compact""" try: - self._splitter_rpn = hlpst.splitter2rpn( + self._splitter_rpn = splitter2rpn( self.splitter, other_states=self.other_states ) # other_state might not be ready yet - except hlpst.PydraStateError: + except PydraStateError: self._splitter_rpn = None if self.other_states or self._splitter_rpn is None: - self._splitter_rpn_compact = hlpst.splitter2rpn( + self._splitter_rpn_compact = splitter2rpn( self.splitter, other_states=self.other_states, state_fields=False ) else: @@ -150,7 +256,7 @@ def splitter_rpn(self): """splitter in :abbr:`RPN (Reverse Polish Notation)`""" # if splitter_rpn was not calculated within splitter.setter if self._splitter_rpn is None: - self._splitter_rpn = hlpst.splitter2rpn( + self._splitter_rpn = splitter2rpn( self.splitter, other_states=self.other_states ) return self._splitter_rpn @@ -166,12 +272,12 @@ def splitter_rpn_compact(self): @property def splitter_final(self): """the final splitter, after removing the combined fields""" - return hlpst.rpn2splitter(self.splitter_rpn_final) + return rpn2splitter(self.splitter_rpn_final) @property def splitter_rpn_final(self): if self.combiner: - _splitter_rpn_final = hlpst.remove_inp_from_splitter_rpn( + _splitter_rpn_final = remove_inp_from_splitter_rpn( deepcopy(self.splitter_rpn), self.current_combiner_all + self.prev_state_combiner_all, ) @@ -182,7 +288,7 @@ def splitter_rpn_final(self): @property def current_splitter(self): """the current part of the splitter, - i.e. the part that is related to the current task's state only + i.e. the part that is related to the current job's state only (doesn't include fields propagated from the previous tasks) """ if hasattr(self, "_current_splitter"): @@ -190,6 +296,17 @@ def current_splitter(self): else: return self.splitter + @property + def inputs_ind(self): + """dictionary for every state that contains indices for all job inputs + (i.e. 
inputs that are relevant for current job, can be outputs from previous nodes) + """ + if self._inputs_ind is None: + raise RuntimeError( + "inputs_ind is not set, please run prepare_states() on the state first" + ) + return self._inputs_ind + @current_splitter.setter def current_splitter(self, current_splitter): self._current_splitter = current_splitter @@ -199,7 +316,7 @@ def current_splitter(self, current_splitter): def _current_splitter_rpn_updates(self): """updating current_splitter_rpn""" if self._current_splitter: - self._current_splitter_rpn = hlpst.splitter2rpn( + self._current_splitter_rpn = splitter2rpn( self.current_splitter, other_states=self.other_states ) else: @@ -232,14 +349,14 @@ def prev_state_splitter(self, prev_state_splitter): def _prev_state_splitter_rpn_updates(self): """updating prev_state_splitter_rpn/_rpn_compact""" if self._prev_state_splitter: - self._prev_state_splitter_rpn = hlpst.splitter2rpn( + self._prev_state_splitter_rpn = splitter2rpn( self.prev_state_splitter, other_states=self.other_states ) else: self._prev_state_splitter_rpn = [] if self.other_states: - self._prev_state_splitter_rpn_compact = hlpst.splitter2rpn( + self._prev_state_splitter_rpn_compact = splitter2rpn( self.prev_state_splitter, other_states=self.other_states, state_fields=False, @@ -259,6 +376,14 @@ def prev_state_splitter_rpn_compact(self): """ return self._prev_state_splitter_rpn_compact + @property + def container_ndim_all(self): + # adding inner_container_ndim to the general container_dimension provided by the users + container_ndim_all = deepcopy(self.container_ndim) + for k, v in self._inner_container_ndim.items(): + container_ndim_all[k] = container_ndim_all.get(k, 1) + v + return container_ndim_all + @property def combiner(self): """the combiner associated to the state.""" @@ -268,15 +393,15 @@ def combiner(self): def combiner(self, combiner): if combiner: if not isinstance(combiner, (str, list)): - raise hlpst.PydraStateError("combiner has to be a string or a list") - self._combiner = hlpst.add_name_combiner(ensure_list(combiner), self.name) + raise PydraStateError("combiner has to be a string or a list") + self._combiner = add_name_combiner(ensure_list(combiner), self.name) else: self._combiner = [] @property def current_combiner(self): """the current part of the combiner, - i.e. the part that is related to the current task's state only + i.e. 
the part that is related to the current job's state only (doesn't include fields propagated from the previous tasks) """ return [comb for comb in self.combiner if self.name in comb] @@ -326,13 +451,11 @@ def other_states(self): def other_states(self, other_states): if other_states: if not isinstance(other_states, dict): - raise hlpst.PydraStateError("other states has to be a dictionary") + raise PydraStateError("other states has to be a dictionary") else: for key, val in other_states.items(): if not val: - raise hlpst.PydraStateError( - f"connection from node {key} is empty" - ) + raise PydraStateError(f"connection from node {key} is empty") # ensuring that the connected fields are set as a list self._other_states = { nm: (st, ensure_list(flds)) for nm, (st, flds) in other_states.items() @@ -430,7 +553,7 @@ def _complete_prev_state(self, prev_state=None): the prev-state part of the splitter, that has to be completed """ if prev_state: - rpn_prev_state = hlpst.splitter2rpn( + rpn_prev_state = splitter2rpn( prev_state, other_states=self.other_states, state_fields=False ) for name, (st, inp) in list(self.other_states.items())[::-1]: @@ -450,7 +573,7 @@ def _remove_repeated(self, previous_splitters): """removing states from previous tasks that are repeated""" for el in previous_splitters: if el[1:] not in self.other_states: - raise hlpst.PydraStateError( + raise PydraStateError( f"can't ask for splitter from {el[1:]}, other nodes that are connected: " f"{self.other_states}" ) @@ -557,7 +680,7 @@ def _prevst_current_check(self, splitter_part, check_nested=True): If the splitter_part is mixed exception is raised. """ - rpn_part = hlpst.splitter2rpn( + rpn_part = splitter2rpn( splitter_part, other_states=self.other_states, state_fields=False ) inputs_in_splitter = [i for i in rpn_part if i not in ["*", "."]] @@ -579,7 +702,7 @@ def _prevst_current_check(self, splitter_part, check_nested=True): # the prev-state and the current parts separated in outer scalar return "[prev-state, current]" else: - raise hlpst.PydraStateError( + raise PydraStateError( "prev-state and current splitters are mixed - splitter invalid" ) @@ -591,7 +714,7 @@ def set_input_groups(self, state_fields=True): state_fields : :obj:`bool` if False the splitter from the previous states are unwrapped """ - current_splitter_rpn = hlpst.splitter2rpn( + current_splitter_rpn = splitter2rpn( self.current_splitter, other_states=self.other_states, state_fields=state_fields, @@ -599,7 +722,7 @@ def set_input_groups(self, state_fields=True): # merging groups from previous nodes if any input come from previous states if self.other_states: self._merge_previous_groups() - keys_f, group_for_inputs_f, groups_stack_f, combiner_all = hlpst.splits_groups( + keys_f, group_for_inputs_f, groups_stack_f, combiner_all = splits_groups( current_splitter_rpn, combiner=self.current_combiner, inner_inputs=self.inner_inputs, @@ -628,7 +751,7 @@ def _merge_previous_groups(self): self.group_for_inputs_final = {} self.keys_final = [] if self.prev_state_combiner: - _, _, _, self._prev_state_combiner_all = hlpst.splits_groups( + _, _, _, self._prev_state_combiner_all = splits_groups( self.prev_state_splitter_rpn, combiner=self.prev_state_combiner ) for i, prev_nm in enumerate(self.prev_state_splitter_rpn_compact): @@ -640,7 +763,7 @@ def _merge_previous_groups(self): ): last_gr = last_gr - 1 if prev_nm[1:] not in self.other_states: - raise hlpst.PydraStateError( + raise PydraStateError( f"can't ask for splitter from {prev_nm[1:]}, " f"other nodes that are 
connected: {self.other_states}" ) @@ -661,14 +784,14 @@ def _merge_previous_groups(self): group_for_inputs_f_st, groups_stack_f_st, combiner_all_st, - ) = hlpst.splits_groups( + ) = splits_groups( st.splitter_rpn_final, combiner=st_combiner, inner_inputs=st.inner_inputs, ) self.keys_final += keys_f_st # st.keys_final if not hasattr(st, "group_for_inputs_final"): - raise hlpst.PydraStateError("previous state has to run first") + raise PydraStateError("previous state has to run first") group_for_inputs = group_for_inputs_f_st groups_stack = groups_stack_f_st self._prev_state_combiner_all += combiner_all_st @@ -721,7 +844,7 @@ def splitter_validation(self): or (spl.startswith("_") and spl[1:] in self.other_states) or spl.split(".")[0] == self.name ): - raise hlpst.PydraStateError( + raise PydraStateError( "can't include {} in the splitter, consider using _{}".format( spl, spl.split(".")[0] ) @@ -729,15 +852,25 @@ def splitter_validation(self): def combiner_validation(self): """validating if the combiner is correct (after all states are connected)""" - if self.combiner: + if local_names := set( + c for c in self.combiner if c.startswith(self.name + ".") + ): if not self.splitter: - raise hlpst.PydraStateError( - "splitter has to be set before setting combiner" + raise PydraStateError( + "splitter has to be set before setting combiner with field names " + f"in the current node {list(local_names)}" + ) + if missing := local_names - set(self.splitter_rpn): + raise PydraStateError( + "The following field names from the current node referenced in the " + f"combiner, {list(missing)} are not in the splitter" ) - if set(self._combiner) - set(self.splitter_rpn): - raise hlpst.PydraStateError("all combiners have to be in the splitter") - def prepare_states(self, inputs, cont_dim=None): + def prepare_states( + self, + inputs: dict[str, ty.Any], + container_ndim: dict[str, int] | None = None, + ): """ Prepare a full list of state indices and state values. @@ -746,34 +879,19 @@ def prepare_states(self, inputs, cont_dim=None): State Values specific elements from inputs that can be used running interfaces - - Parameters - ---------- - inputs : :obj:`dict` - inputs of the task - cont_dim : :obj:`dict` or `None` - container's dimensions for a specific input's fields """ # checking if splitter and combiner have valid forms self.splitter_validation() self.combiner_validation() self.set_input_groups() - # container dimension for each input, specifies how nested the input is - if cont_dim: - self.cont_dim = cont_dim - else: - self.cont_dim = {} - if isinstance(inputs, BaseSpec): - self.inputs = hlpst.inputs_types_to_dict(self.name, inputs) - else: - self.inputs = inputs + self.inputs = inputs + if container_ndim is not None: + self.container_ndim = container_ndim if self.other_states: + st: State for nm, (st, _) in self.other_states.items(): - # I think now this if is never used - if not hasattr(st, "states_ind"): - st.prepare_states(self.inputs, cont_dim=cont_dim) self.inputs.update(st.inputs) - self.cont_dim.update(st.cont_dim) + self.container_ndim.update(st.container_ndim_all) self.prepare_states_ind() self.prepare_states_val() @@ -782,11 +900,11 @@ def prepare_states_ind(self): """ Calculate a list of dictionaries with state indices. - Uses hlpst.splits. + Uses splits. 
""" # removing elements that are connected to inner splitter - # (they will be taken into account in hlpst.splits anyway) + # (they will be taken into account in splits anyway) # _comb part will be used in prepare_states_combined_ind # TODO: need tests in test_Workflow.py elements_to_remove = [] @@ -801,7 +919,7 @@ def prepare_states_ind(self): if f"{self.name}.{inp}" not in self.combiner: elements_to_remove_comb.append(f"_{name}") - partial_rpn = hlpst.remove_inp_from_splitter_rpn( + partial_rpn = remove_inp_from_splitter_rpn( deepcopy(self.splitter_rpn_compact), elements_to_remove ) values_out_pr, keys_out_pr = self.splits( @@ -811,7 +929,7 @@ def prepare_states_ind(self): self.ind_l = values_pr self.keys = keys_out_pr - self.states_ind = list(hlpst.iter_splits(values_pr, self.keys)) + self.states_ind = list(iter_splits(values_pr, self.keys)) self.keys_final = self.keys if self.combiner: self.prepare_states_combined_ind(elements_to_remove_comb) @@ -832,14 +950,14 @@ def prepare_states_combined_ind(self, elements_to_remove_comb): elements_to_remove_comb : :obj:`list` elements of the splitter that should be removed due to the combining """ - partial_rpn_compact = hlpst.remove_inp_from_splitter_rpn( + partial_rpn_compact = remove_inp_from_splitter_rpn( deepcopy(self.splitter_rpn_compact), elements_to_remove_comb ) # combiner can have parts from the prev-state splitter, so have to have rpn with states - partial_rpn = hlpst.splitter2rpn( - hlpst.rpn2splitter(partial_rpn_compact), other_states=self.other_states + partial_rpn = splitter2rpn( + rpn2splitter(partial_rpn_compact), other_states=self.other_states ) - combined_rpn = hlpst.remove_inp_from_splitter_rpn( + combined_rpn = remove_inp_from_splitter_rpn( deepcopy(partial_rpn), self.current_combiner_all + self.prev_state_combiner_all, ) @@ -858,7 +976,7 @@ def prepare_states_combined_ind(self, elements_to_remove_comb): self.keys_final = keys_out # groups after combiner ind_map = { - tuple(hlpst.flatten(tup, max_depth=10)): ind + tuple(flatten(tup, max_depth=10)): ind for ind, tup in enumerate(self.ind_l_final) } self.final_combined_ind_mapping = { @@ -872,14 +990,14 @@ def prepare_states_combined_ind(self, elements_to_remove_comb): self.keys_final = keys_out # should be 0 or None? 
self.final_combined_ind_mapping = {0: list(range(len(self.states_ind)))} - self.states_ind_final = list( - hlpst.iter_splits(self.ind_l_final, self.keys_final) - ) + self.states_ind_final = list(iter_splits(self.ind_l_final, self.keys_final)) def prepare_states_val(self): """Evaluate states values having states indices.""" self.states_val = list( - hlpst.map_splits(self.states_ind, self.inputs, cont_dim=self.cont_dim) + map_splits( + self.states_ind, self.inputs, container_ndim=self.container_ndim_all + ) ) return self.states_val @@ -892,7 +1010,7 @@ def prepare_inputs(self): """ if not self.other_states: - self.inputs_ind = self.states_ind + self._inputs_ind = self.states_ind else: # elements from the current node (the current part of the splitter) if self.current_splitter_rpn: @@ -943,11 +1061,11 @@ def prepare_inputs(self): inputs_ind = [] # iter_splits using inputs from current state/node - self.inputs_ind = list(hlpst.iter_splits(inputs_ind, keys_inp)) + self._inputs_ind = list(iter_splits(inputs_ind, keys_inp)) # removing elements that are connected to inner splitter # TODO - add tests to test_workflow.py (not sure if we want to remove it) for el in connected_to_inner: - [dict.pop(el) for dict in self.inputs_ind] + [dict.pop(el) for dict in self._inputs_ind] def splits(self, splitter_rpn): """ @@ -1049,8 +1167,8 @@ def _processing_terms(self, term, previous_states_ind): var_ind, new_keys = previous_states_ind[term] shape = (len(var_ind),) else: - cont_dim = self.cont_dim.get(term, 1) - shape = hlpst.input_shape(self.inputs[term], cont_dim=cont_dim) + container_ndim = self.container_ndim_all.get(term, 1) + shape = input_shape(self.inputs[term], container_ndim=container_ndim) var_ind = range(reduce(lambda x, y: x * y, shape)) new_keys = [term] # checking if the term is in inner_inputs @@ -1069,8 +1187,9 @@ def _processing_terms(self, term, previous_states_ind): def _single_op_splits(self, op_single): """splits function if splitter is a singleton""" - shape = hlpst.input_shape( - self.inputs[op_single], cont_dim=self.cont_dim.get(op_single, 1) + shape = input_shape( + self.inputs[op_single], + container_ndim=self.container_ndim_all.get(op_single, 1), ) val_ind = range(reduce(lambda x, y: x * y, shape)) if op_single in self.inner_inputs: @@ -1091,3 +1210,680 @@ def _single_op_splits(self, op_single): val = op["*"](val_ind) keys = [op_single] return val, keys + + def _get_element(self, value: ty.Any, field_name: str, ind: int) -> ty.Any: + """ + Extracting element of the inputs taking into account + container dimension of the specific element that can be set in self.state.container_ndim. + If input name is not in container_ndim, it is assumed that the input values has + a container dimension of 1, so only the most outer dim will be used for splitting. + + Parameters + ---------- + value : Any + inputs of the job + field_name : str + name of the input field + ind : int + index of the element + + Returns + ------- + Any + specific element of the input field + """ + if f"{self.name}.{field_name}" in self.container_ndim_all: + return list( + flatten( + ensure_list(value), + max_depth=self.container_ndim_all[f"{self.name}.{field_name}"], + ) + )[ind] + else: + return value[ind] + + +def splitter2rpn(splitter, other_states=None, state_fields=True): + """ + Translate user-provided splitter into *reverse polish notation*. + + The reverse polish notation is imposed by :class:`~pydra.engine.state.State`. 
+ + Parameters + ---------- + splitter : + splitter (standard form) + other_states : + other states that are connected to the state + state_fields : :obj:`bool` + if False the splitter from the previous states are unwrapped + + """ + if not splitter: + return [] + output_splitter = [] + _ordering( + deepcopy(splitter), + i=0, + output_splitter=output_splitter, + other_states=deepcopy(other_states), + state_fields=state_fields, + ) + return output_splitter + + +def _ordering( + el, i, output_splitter, current_sign=None, other_states=None, state_fields=True +): + """Get a proper order of fields and signs (used by splitter2rpn).""" + if type(el) is tuple: + # checking if the splitter dont contain splitter from previous nodes + # i.e. has str "_NA", etc. + if len(el) == 1: + # treats .split(("x",)) like .split("x") + el = el[0] + _ordering(el, i, output_splitter, current_sign, other_states, state_fields) + else: + if type(el[0]) is str and el[0].startswith("_"): + node_nm = el[0][1:] + if node_nm not in other_states and state_fields: + raise PydraStateError( + "can't ask for splitter from {}, other nodes that are connected: {}".format( + node_nm, other_states.keys() + ) + ) + elif state_fields: + splitter_mod = add_name_splitter( + splitter=other_states[node_nm][0].splitter_final, name=node_nm + ) + el = (splitter_mod, el[1]) + if other_states[node_nm][0].other_states: + other_states.update(other_states[node_nm][0].other_states) + if type(el[1]) is str and el[1].startswith("_"): + node_nm = el[1][1:] + if node_nm not in other_states and state_fields: + raise PydraStateError( + "can't ask for splitter from {}, other nodes that are connected: {}".format( + node_nm, other_states.keys() + ) + ) + elif state_fields: + splitter_mod = add_name_splitter( + splitter=other_states[node_nm][0].splitter_final, name=node_nm + ) + el = (el[0], splitter_mod) + if other_states[node_nm][0].other_states: + other_states.update(other_states[node_nm][0].other_states) + _iterate_list( + el, + ".", + other_states, + output_splitter=output_splitter, + state_fields=state_fields, + ) + elif type(el) is list: + if len(el) == 1: + # treats .split(["x"]) like .split("x") + el = el[0] + _ordering(el, i, output_splitter, current_sign, other_states, state_fields) + else: + if type(el[0]) is str and el[0].startswith("_"): + node_nm = el[0][1:] + if node_nm not in other_states and state_fields: + raise PydraStateError( + "can't ask for splitter from {}, other nodes that are connected: {}".format( + node_nm, other_states.keys() + ) + ) + elif state_fields: + splitter_mod = add_name_splitter( + splitter=other_states[node_nm][0].splitter_final, name=node_nm + ) + el[0] = splitter_mod + if other_states[node_nm][0].other_states: + other_states.update(other_states[node_nm][0].other_states) + if type(el[1]) is str and el[1].startswith("_"): + node_nm = el[1][1:] + if node_nm not in other_states and state_fields: + raise PydraStateError( + "can't ask for splitter from {}, other nodes that are connected: {}".format( + node_nm, other_states.keys() + ) + ) + elif state_fields: + splitter_mod = add_name_splitter( + splitter=other_states[node_nm][0].splitter_final, name=node_nm + ) + el[1] = splitter_mod + if other_states[node_nm][0].other_states: + other_states.update(other_states[node_nm][0].other_states) + _iterate_list( + el, + "*", + other_states, + output_splitter=output_splitter, + state_fields=state_fields, + ) + elif type(el) is str: + if el.startswith("_"): + node_nm = el[1:] + if node_nm not in other_states and state_fields: + 
raise PydraStateError( + "can't ask for splitter from {}, other nodes that are connected: {}".format( + node_nm, other_states.keys() + ) + ) + elif state_fields: + splitter_mod = add_name_splitter( + splitter=other_states[node_nm][0].splitter_final, name=node_nm + ) + el = splitter_mod + if other_states[node_nm][0].other_states: + other_states.update(other_states[node_nm][0].other_states) + if type(el) is str: + output_splitter.append(el) + elif type(el) is tuple: + _iterate_list( + el, + ".", + other_states, + output_splitter=output_splitter, + state_fields=state_fields, + ) + elif type(el) is list: + _iterate_list( + el, + "*", + other_states, + output_splitter=output_splitter, + state_fields=state_fields, + ) + else: + raise PydraStateError("splitter has to be a string, a tuple or a list") + if i > 0: + output_splitter.append(current_sign) + + +def _iterate_list(element, sign, other_states, output_splitter, state_fields=True): + """Iterate over list (used in the splitter2rpn to get recursion).""" + for i, el in enumerate(element): + _ordering( + deepcopy(el), + i, + current_sign=sign, + other_states=other_states, + output_splitter=output_splitter, + state_fields=state_fields, + ) + + +def converter_groups_to_input(group_for_inputs): + """ + Return fields for each axis and number of all groups. + + Requires having axes for all the input fields. + + Parameters + ---------- + group_for_inputs : + specified axes (groups) for each input + + """ + input_for_axis = {} + ngr = 0 + for inp, grs in group_for_inputs.items(): + for gr in ensure_list(grs): + if gr in input_for_axis.keys(): + input_for_axis[gr].append(inp) + else: + ngr += 1 + input_for_axis[gr] = [inp] + return input_for_axis, ngr + + +def remove_inp_from_splitter_rpn(splitter_rpn, inputs_to_remove): + """ + Remove inputs due to combining. + + Mutates a splitter. + + Parameters + ---------- + splitter_rpn : + The splitter in reverse polish notation + inputs_to_remove : + input names that should be removed from the splitter + + """ + splitter_rpn_copy = splitter_rpn.copy() + # reverting order + splitter_rpn_copy.reverse() + stack_inp = [] + stack_sgn = [] + from_last_sign = [] + for ii, el in enumerate(splitter_rpn_copy): + # element is a sign + if el == "." or el == "*": + stack_sgn.append((ii, el)) + from_last_sign.append(0) + # it's an input but not to remove + elif el not in inputs_to_remove: + if from_last_sign: + from_last_sign[-1] += 1 + stack_inp.append((ii, el)) + # it'a an input that should be removed + else: + if not from_last_sign: + pass + elif from_last_sign[-1] <= 1: + stack_sgn.pop() + from_last_sign.pop() + else: + stack_sgn.pop(-1 * from_last_sign.pop()) + + # creating the final splitter_rpn after combining + remaining_elements = stack_sgn + stack_inp + remaining_elements.sort(reverse=True) + splitter_rpn_combined = [el for (i, el) in remaining_elements] + return splitter_rpn_combined + + +def rpn2splitter(splitter_rpn): + """ + Convert from splitter_rpn to splitter. + + Recurrent algorithm to perform the conversion. + Every time combines pairs of input in one input, + ends when the length is one. 
+ + Parameters + ---------- + splitter_rpn : + splitter in reverse polish notation + + Returns + ------- + splitter : + splitter in the standard/original form + + """ + if splitter_rpn == []: + return None + if len(splitter_rpn) == 1: + return splitter_rpn[0] + + splitter_rpn_copy = splitter_rpn.copy() + signs = [".", "*"] + splitter_modified = [] + + while splitter_rpn_copy: + el = splitter_rpn_copy.pop() + # element is a sign + if el in signs: + if ( + splitter_rpn_copy[-1] not in signs + and splitter_rpn_copy[-2] not in signs + ): + right, left = splitter_rpn_copy.pop(), splitter_rpn_copy.pop() + if el == ".": + splitter_modified.append((left, right)) + elif el == "*": + splitter_modified.append([left, right]) + else: + splitter_modified.append(el) + else: + splitter_modified.append(el) + + # reversing the list and combining more + splitter_modified.reverse() + return rpn2splitter(splitter_modified) + + +def add_name_combiner(combiner, name): + """adding a node's name to each field from the combiner""" + combiner_changed = [] + for comb in combiner: + if "." not in comb: + combiner_changed.append(f"{name}.{comb}") + else: + combiner_changed.append(comb) + return combiner_changed + + +def add_name_splitter( + splitter: ty.Union[str, ty.List[str], ty.Tuple[str, ...], None], name: str +) -> ty.Optional[ty.List[str]]: + """adding a node's name to each field from the splitter""" + if isinstance(splitter, str): + return _add_name([splitter], name)[0] + elif isinstance(splitter, list): + return _add_name(list(splitter), name) + elif isinstance(splitter, tuple): + return tuple(_add_name(list(splitter), name)) + else: + return None + + +def _add_name(mlist, name): + """adding anem to each element from the list""" + for i, elem in enumerate(mlist): + if isinstance(elem, str): + if "." in elem or elem.startswith("_"): + pass + else: + mlist[i] = f"{name}.{mlist[i]}" + elif isinstance(elem, list): + mlist[i] = _add_name(elem, name) + elif isinstance(elem, tuple): + mlist[i] = list(elem) + mlist[i] = _add_name(mlist[i], name) + mlist[i] = tuple(mlist[i]) + return mlist + + +def flatten(vals, cur_depth=0, max_depth=None): + """Flatten a list of values.""" + if max_depth is None: + max_depth = len(list(input_shape(vals))) + values = [] + if cur_depth >= max_depth: + values.append([vals]) + else: + for val in vals: + if isinstance(val, (list, tuple)): + values.append(flatten(val, cur_depth + 1, max_depth)) + else: + values.append([val]) + return itertools.chain.from_iterable(values) + + +def iter_splits(iterable, keys): + """Generate splits.""" + for iter in list(iterable): + yield dict(zip(keys, list(flatten(iter, max_depth=1000)))) + + +def input_shape(inp, container_ndim=1): + """Get input shape, depends on the container dimension, if not specify it is assumed to be 1""" + # TODO: have to be changed for inner splitter (sometimes different length) + container_ndim -= 1 + shape = [len(inp)] + last_shape = None + for value in inp: + if isinstance(value, list) and container_ndim > 0: + cur_shape = input_shape(value, container_ndim) + if last_shape is None: + last_shape = cur_shape + elif last_shape != cur_shape: + last_shape = None + break + else: + last_shape = None + break + if last_shape is not None: + shape.extend(last_shape) + return tuple(shape) + + +def splits_groups(splitter_rpn, combiner=None, inner_inputs=None): + """splits inputs to groups (axes) and creates stacks for these groups + This is used to specify which input can be combined. 
+ """ + if not splitter_rpn: + return [], {}, [], [] + stack = [] + keys = [] + groups = {} + group_count = None + if not combiner: + combiner = [] + if inner_inputs: + previous_states_ind = { + f"_{v.name}": v.keys_final for v in inner_inputs.values() + } + inner_inputs = {k: v for k, v in inner_inputs.items() if k in splitter_rpn} + else: + previous_states_ind = {} + inner_inputs = {} + + # when splitter is a single element (no operators) + if len(splitter_rpn) == 1: + op_single = splitter_rpn[0] + return _single_op_splits_groups(op_single, combiner, inner_inputs, groups) + + # len(splitter_rpn) > 1 + # iterating splitter_rpn + for token in splitter_rpn: + if token in [".", "*"]: + terms = {} + terms["R"] = stack.pop() + terms["L"] = stack.pop() + + # checking if opL/R are strings + trm_str = {"L": False, "R": False} + oldgroups = {} + + for lr in ["L", "R"]: + if isinstance(terms[lr], str): + trm_str[lr] = True + else: + oldgroups[lr] = terms[lr] + + if token == ".": + if all(trm_str.values()): + if group_count is None: + group_count = 0 + else: + group_count += 1 + oldgroup = groups[terms["L"]] = groups[terms["R"]] = group_count + elif trm_str["R"]: + groups[terms["R"]] = oldgroups["L"] + oldgroup = oldgroups["L"] + elif trm_str["L"]: + groups[terms["L"]] = oldgroups["R"] + oldgroup = oldgroups["R"] + else: + if len(ensure_list(oldgroups["L"])) != len( + ensure_list(oldgroups["R"]) + ): + raise ValueError( + "Operands do not have same shape " + "(left one is {}d and right one is {}d.".format( + len(ensure_list(oldgroups["L"])), + len(ensure_list(oldgroups["R"])), + ) + ) + oldgroup = oldgroups["L"] + # dj: changing axes for Right part of the scalar op. + for k, v in groups.items(): + if v in ensure_list(oldgroups["R"]): + groups[k] = ensure_list(oldgroups["L"])[ + ensure_list(oldgroups["R"]).index(v) + ] + else: # if token == "*": + if all(trm_str.values()): + if group_count is None: + group_count = 0 + else: + group_count += 1 + groups[terms["L"]] = group_count + group_count += 1 + groups[terms["R"]] = group_count + oldgroup = [groups[terms["L"]], groups[terms["R"]]] + elif trm_str["R"]: + group_count += 1 + groups[terms["R"]] = group_count + oldgroup = ensure_list(oldgroups["L"]) + [groups[terms["R"]]] + elif trm_str["L"]: + group_count += 1 + groups[terms["L"]] = group_count + oldgroup = [groups[terms["L"]]] + ensure_list(oldgroups["R"]) + else: + oldgroup = ensure_list(oldgroups["L"]) + ensure_list(oldgroups["R"]) + + # creating list of keys + if trm_str["L"]: + if terms["L"].startswith("_"): + keys = previous_states_ind[terms["L"]] + keys + else: + keys.insert(0, terms["L"]) + if trm_str["R"]: + if terms["R"].startswith("_"): + keys += previous_states_ind[terms["R"]] + else: + keys.append(terms["R"]) + + pushgroup = oldgroup + stack.append(pushgroup) + + else: # name of one of the inputs + stack.append(token) + + groups_stack = stack.pop() + if isinstance(groups_stack, int): + groups_stack = [groups_stack] + if inner_inputs: + groups_stack = [[], groups_stack] + else: + groups_stack = [groups_stack] + + if combiner: + ( + keys_final, + groups_final, + groups_stack_final, + combiner_all, + ) = combine_final_groups(combiner, groups, groups_stack, keys) + return keys_final, groups_final, groups_stack_final, combiner_all + else: + return keys, groups, groups_stack, [] + + +def _single_op_splits_groups(op_single, combiner, inner_inputs, groups): + """splits_groups function if splitter is a singleton""" + if op_single in inner_inputs: + # TODO: have to be changed if differ length + # 
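# Illustrative sketch (not part of the patch): the group axes splits_groups is
# expected to assign for a scalar (".") versus an outer ("*") split, with and
# without a combiner. Field names use a hypothetical "NA." node prefix.
def _splits_groups_sketch():
    # scalar split: both fields share axis 0
    # -> (["NA.a", "NA.b"], {"NA.a": 0, "NA.b": 0}, [[0]], [])
    scalar = splits_groups(["NA.a", "NA.b", "."])

    # outer split: each field gets its own axis
    # -> (["NA.a", "NA.b"], {"NA.a": 0, "NA.b": 1}, [[0, 1]], [])
    outer = splits_groups(["NA.a", "NA.b", "*"])

    # combining one field of an outer split removes its axis
    # -> (["NA.b"], {"NA.b": 0}, [[0]], ["NA.a"])
    outer_combined = splits_groups(["NA.a", "NA.b", "*"], combiner=["NA.a"])

    # combining one field of a scalar split drags the other field with it
    # -> ([], {}, [[]], ["NA.a", "NA.b"])
    scalar_combined = splits_groups(["NA.a", "NA.b", "."], combiner=["NA.a"])
    return scalar, outer, outer_combined, scalar_combined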
TODO: i think I don't want to add here from left part + # keys = inner_inputs[op_single].keys_final + [op_single] + keys = [op_single] + groups[op_single], groups_stack = 0, [[], [0]] + else: + keys = [op_single] + groups[op_single], groups_stack = 0, [[0]] + if combiner: + if combiner == [op_single]: + return [], {}, [], combiner + else: + # TODO: probably not needed, should be already check by st.combiner_validation + raise PydraStateError( + f"all fields from the combiner have to be in splitter_rpn: {[op_single]}, " + f"but combiner: {combiner} is set" + ) + else: + return keys, groups, groups_stack, [] + + +def combine_final_groups(combiner, groups, groups_stack, keys): + """Combine the final groups.""" + input_for_groups, _ = converter_groups_to_input(groups) + combiner_all = [] + for comb in combiner: + for gr in ensure_list(groups[comb]): + combiner_all += input_for_groups[gr] + combiner_all = list(set(combiner_all)) + combiner_all.sort() + + # groups that were removed (so not trying to remove twice) + grs_removed = [] + groups_stack_final = deepcopy(groups_stack) + for comb in combiner: + grs = groups[comb] + for gr in ensure_list(grs): + if gr in groups_stack_final[-1]: + grs_removed.append(gr) + groups_stack_final[-1].remove(gr) + elif gr in grs_removed: + pass + else: + raise PydraStateError( + "input {} not ready to combine, you have to combine {} " + "first".format(comb, groups_stack[-1]) + ) + groups_final = {inp: gr for (inp, gr) in groups.items() if inp not in combiner_all} + gr_final = set() + for el in groups_final.values(): + gr_final.update(ensure_list(el)) + gr_final = list(gr_final) + map_gr_nr = {nr: i for (i, nr) in enumerate(sorted(gr_final))} + groups_final_map = {} + for inp, gr in groups_final.items(): + if isinstance(gr, int): + groups_final_map[inp] = map_gr_nr[gr] + elif isinstance(gr, list): + groups_final_map[inp] = [map_gr_nr[el] for el in gr] + else: + raise Exception("gr should be an int or a list, something wrong") + for i, groups_l in enumerate(groups_stack_final): + groups_stack_final[i] = [map_gr_nr[gr] for gr in groups_l] + + keys_final = [key for key in keys if key not in combiner_all] + # TODO: not sure if I have to calculate and return keys, groups, groups_stack + return keys_final, groups_final_map, groups_stack_final, combiner_all + + +def map_splits(split_iter, inputs, container_ndim=None): + """generate a dictionary of inputs prescribed by the splitter.""" + if container_ndim is None: + container_ndim = {} + for split in split_iter: + yield { + k: list( + flatten(ensure_list(inputs[k]), max_depth=container_ndim.get(k, None)) + )[v] + for k, v in split.items() + } + + +def inputs_types_to_dict(name, inputs): + """Convert type.Inputs to dictionary.""" + # dj: any better option? + input_names = [field for field in attrs_values(inputs) if field != "_func"] + inputs_dict = {} + for field in input_names: + inputs_dict[f"{name}.{field}"] = getattr(inputs, field) + return inputs_dict + + +def unwrap_splitter( + splitter: ty.Union[str, ty.List[str], ty.Tuple[str, ...]], +) -> ty.Iterable[str]: + """Unwraps a splitter into a flat list of fields that are split over, i.e. + [("a", "b"), "c"] -> ["a", "b", "c"] + + Parameters + ---------- + splitter: str or list[str] or tuple[str, ...] 
+ the splitter task to unwrap + + Returns + ------- + unwrapped : ty.Iterable[str] + the field names listed in the splitter + """ + if isinstance(splitter, str): + return [splitter] + else: + return itertools.chain(*(unwrap_splitter(s) for s in splitter)) + + +class PydraStateError(Exception): + """Custom error for Pydra State""" + + def __init__(self, value): + self.value = value + + def __str__(self): + return str(self.value) + + +op = {".": zip, "*": itertools.product} diff --git a/pydra/engine/submitter.py b/pydra/engine/submitter.py index fe3e598c21..4d779171b5 100644 --- a/pydra/engine/submitter.py +++ b/pydra/engine/submitter.py @@ -2,261 +2,505 @@ import asyncio import typing as ty -import pickle -from uuid import uuid4 -from .workers import Worker, WORKERS -from .core import is_workflow -from .helpers import get_open_loop, load_and_run_async -from ..utils.hash import PersistentCache - +import re +import os +from pathlib import Path +from traceback import format_exc +from tempfile import mkdtemp +from copy import copy, deepcopy +from datetime import datetime +from collections import defaultdict +import attrs import logging +from pydra.engine.graph import DiGraph +from pydra.utils.general import ( + task_fields, + attrs_values, +) +from pydra.utils.hash import PersistentCache +from pydra.engine.lazy import LazyField +from pydra.engine.audit import Audit +from pydra.engine.job import Job +from pydra.utils.messenger import AuditFlag, Messenger +from pydra.utils.general import default_run_cache_root +from pydra.compose import workflow +from pydra.engine.state import State +from pydra.workers.base import Worker +from pydra.compose.base import Task, Outputs logger = logging.getLogger("pydra.submitter") +if ty.TYPE_CHECKING: + from pydra.engine.node import Node + from pydra.engine.result import Result + from pydra.engine.hooks import TaskHooks + from pydra.engine.workflow import Workflow + from pydra.environments.base import Environment -# TODO: runnable in init or run -class Submitter: - """Send a task to the execution backend.""" - def __init__(self, plugin: ty.Union[str, ty.Type[Worker]] = "cf", **kwargs): - """ - Initialize task submission. +TaskType = ty.TypeVar("TaskType", bound="Task") +OutputType = ty.TypeVar("OutputType", bound="Outputs") - Parameters - ---------- - plugin : :obj:`str` or :obj:`ty.Type[pydra.engine.core.Worker]` - Either the identifier of the execution backend or the worker class itself. - Default is ``cf`` (Concurrent Futures). - **kwargs - Additional keyword arguments to pass to the worker. +# Used to flag development mode of Audit +develop = False - """ +WORKER_KWARG_FAIL_NOTE = "Attempting to instantiate worker submitter" + + +class Submitter: + """Send a job to the execution backend. 
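# Illustrative sketch (not part of the patch), referring back to the state
# helpers defined above before the submitter changes continue below:
# unwrap_splitter flattens a nested splitter into its field names, and the
# module-level ``op`` mapping ties "." to zip and "*" to itertools.product.
def _unwrap_and_op_sketch():
    import itertools

    fields = list(unwrap_splitter([("a", "b"), "c"]))  # ["a", "b", "c"]

    paired = list(op["."]([1, 2], [3, 4]))   # [(1, 3), (2, 4)]
    crossed = list(op["*"]([1, 2], [3, 4]))  # [(1, 3), (1, 4), (2, 3), (2, 4)]
    assert op["*"] is itertools.product
    return fields, paired, crossed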
+ + Parameters + ---------- + cache_root : os.PathLike, optional + Cache directory where the working directory/results for the job will be + stored, by default None + worker : str or Worker, optional + The worker to use, by default "cf" + environment: Environment, optional + The execution environment to use, by default None + readonly_caches : list[os.PathLike], optional + Alternate cache locations to check for pre-computed results, by default None + max_concurrent: int | float, optional + Maximum number of concurrent tasks to run, by default float("inf") (unlimited) + audit_flags : AuditFlag, optional + Auditing configuration, by default AuditFlag.NONE + messengers : list, optional + Messengers, by default None + messenger_args : dict, optional + Messenger arguments, by default None + clean_stale_locks : bool, optional + Whether to clean stale lock files, i.e. lock files that were created before the + start of the current run. Don't set if using a global cache where there are + potentially multiple workflows that are running concurrently. By default (None), + lock files will be cleaned if the *debug* worker is used + **kwargs : dict + Keyword arguments to pass on to the worker initialisation + """ + + cache_root: os.PathLike + worker: Worker + environment: "Environment | None" + readonly_caches: list[os.PathLike] + audit_flags: AuditFlag + messengers: ty.Iterable[Messenger] + messenger_args: dict[str, ty.Any] + max_concurrent: int | float + clean_stale_locks: bool + run_start_time: datetime | None + propagate_rerun: bool + + def __init__( + self, + /, + cache_root: os.PathLike | None = None, + worker: str | ty.Type[Worker] | Worker | None = "debug", + environment: "Environment | None" = None, + readonly_caches: list[os.PathLike] | None = None, + audit_flags: AuditFlag = AuditFlag.NONE, + messengers: ty.Iterable[Messenger] | None = None, + messenger_args: dict[str, ty.Any] | None = None, + max_concurrent: int | float = float("inf"), + propagate_rerun: bool = True, + clean_stale_locks: bool | None = None, + **kwargs, + ): + + from pydra.environments import native + + if worker is None: + worker = "debug" + + from pydra.utils.etelemetry import check_latest_version + + if Job._etelemetry_version_data is None: + Job._etelemetry_version_data = check_latest_version() + + self.audit = Audit( + audit_flags=audit_flags, + messengers=messengers, + messenger_args=messenger_args, + develop=develop, + ) + if cache_root is None: + cache_root = default_run_cache_root + cache_root = Path(cache_root).resolve() + cache_root.mkdir(parents=True, exist_ok=True) + + self.cache_root = cache_root + self.readonly_caches = readonly_caches + self.propagate_rerun = propagate_rerun + if max_concurrent < 1 or ( + isinstance(max_concurrent, float) and max_concurrent != float("inf") + ): + raise ValueError( + "'max_concurrent' arg must be a positive integer or float('inf'), " + f"not {max_concurrent}" + ) + self.max_concurrent = max_concurrent + self.environment = ( + environment if environment is not None else native.Environment() + ) self.loop = get_open_loop() self._own_loop = not self.loop.is_running() - if isinstance(plugin, str): - self.plugin = plugin - try: - worker_cls = WORKERS[self.plugin] - except KeyError: - raise NotImplementedError(f"No worker for '{self.plugin}' plugin") - else: + if not isinstance(worker, Worker): + if isinstance(worker, str): + worker_cls = Worker.plugin(worker) + elif issubclass(worker, Worker): + worker_cls = worker + else: + raise TypeError( + "Worker must be a Worker object, name of 
a worker or a Worker " + f"class, not {worker}" + ) try: - self.plugin = plugin.plugin_name - except AttributeError: - raise ValueError("Worker class must have a 'plugin_name' str attribute") - worker_cls = plugin - self.worker = worker_cls(**kwargs) + worker = worker_cls(**kwargs) + except TypeError as e: + e.add_note(WORKER_KWARG_FAIL_NOTE) + raise + self.worker = worker + self.run_start_time = None + self.clean_stale_locks = ( + clean_stale_locks + if clean_stale_locks is not None + else (self.worker.plugin_name() == "debug") + ) + self.worker_kwargs = kwargs self.worker.loop = self.loop - def __call__(self, runnable, cache_locations=None, rerun=False, environment=None): - """Submitter run function.""" - if cache_locations is not None: - runnable.cache_locations = cache_locations - self.loop.run_until_complete( - self.submit_from_call(runnable, rerun, environment) - ) - PersistentCache().clean_up() - return runnable.result() + def __call__( + self, + task: "Task[OutputType]", + hooks: "TaskHooks | None" = None, + raise_errors: bool | None = None, + rerun: bool = False, + ) -> "Result[OutputType]": + """Submitter run function. + + Parameters + ---------- + task : :obj:`~pydra.compose.base.Task` + The task to run + hooks : :obj:`~pydra.engine.hooks.TaskHooks`, optional + Job hooks, callable functions called as the job is setup and torn down, + by default no functions are called at the hooks + raise_errors : bool, optional + Whether to raise errors, by default True if the 'debug' worker is used, + otherwise False + rerun : bool, optional + Whether to force the re-computation of the job results even if existing + results are found, by default False + propagate_rerun : bool, optional + Whether to propagate the rerun flag to all tasks in the workflow, by default True - async def submit_from_call(self, runnable, rerun, environment): + Returns + ------- + result : Any + The result of the job """ - This coroutine should only be called once per Submitter call, - and serves as the bridge between sync/async lands. + from pydra.environments.base import Environment + + if raise_errors is None: + raise_errors = self.worker.plugin_name() == "debug" + if not isinstance(raise_errors, bool): + raise TypeError( + f"'raise_errors' must be a boolean or None, not {type(raise_errors)}" + ) - There are 4 potential paths based on the type of runnable: - 0) Workflow has a different plugin than a submitter - 1) Workflow without State - 2) Task without State - 3) (Workflow or Task) with State + task._check_rules() + # If the outer job is split, create an implicit workflow to hold the split nodes + if task._splitter: - Once Python 3.10 is the minimum, this should probably be refactored into using - structural pattern matching. 
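# Illustrative sketch (not part of the patch): minimal end-to-end use of the
# Submitter defined here. Import paths follow the ones used elsewhere in this
# patch; accessing the outputs via ``result.outputs`` is an assumption about
# the Result class, which is not shown in this hunk.
def _submitter_usage_sketch():
    from pydra.compose import python
    from pydra.engine.submitter import Submitter

    @python.define(outputs={"out": float})
    def AddTwo(a: int, b: float = 0.1):
        return a + b

    # the default "debug" worker runs everything serially in-process
    with Submitter(worker="debug") as sub:
        result = sub(AddTwo(a=1))
    return result.outputs.out  # expected ~1.1 if the assumption above holds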
- """ - if is_workflow(runnable): # TODO: env to wf - # connect and calculate the checksum of the graph before running - runnable._connect_and_propagate_to_tasks(override_task_caches=True) - # 0 - if runnable.plugin and runnable.plugin != self.plugin: - # if workflow has a different plugin it's treated as a single element - await self.worker.run_el(runnable, rerun=rerun) - # 1 - if runnable.state is None: - await runnable._run(self, rerun=rerun) - # 3 - else: - await self.expand_runnable(runnable, wait=True, rerun=rerun) - runnable._reset() + state = State( + name="outer_split", + splitter=deepcopy(task._splitter), + combiner=deepcopy(task._combiner), + container_ndim=deepcopy(task._container_ndim), + ) + + def wrap_type(tp): + tp = state.nest_output_type(tp) + tp = state.combine_state_arrays(tp) + return tp + + output_types = { + o.name: wrap_type(o.type) for o in task_fields(task.Outputs) + } + + @workflow.define(outputs=output_types) + def Split(defn: Task, output_types: dict, environment: Environment | None): + node = workflow.add(defn, environment=environment, hooks=hooks) + return tuple(getattr(node, o) for o in output_types) + + task = Split( + defn=task, output_types=output_types, environment=self.environment + ) + + environment = None + elif task._combiner: + raise ValueError( + f"Job {self} is marked for combining, but not splitting. " + "Use the `split` method to split the job before combining." + ) else: - # 2 - if runnable.state is None: - # run_el should always return a coroutine - await self.worker.run_el(runnable, rerun=rerun, environment=environment) - # 3 + environment = self.environment + + job = Job( + task, + submitter=self, + name="main", + environment=environment, + hooks=hooks, + ) + try: + self.run_start_time = datetime.now() + self.submit(job, rerun=rerun) + except Exception as exc: + error_msg = ( + f"Full crash report for {type(task).__name__!r} job is here: " + + str(job.cache_dir / "_error.pklz") + ) + exc.add_note(error_msg) + if raise_errors or not job.result(): + raise exc else: - await self.expand_runnable(runnable, wait=True, rerun=rerun) # TODO - return True + logger.error("\nTask execution failed\n%s", error_msg) + finally: + self.run_start_time = None + PersistentCache().clean_up() + result = job.result() + if result is None: + if job.lockfile.exists(): + raise RuntimeError( + f"Job {job} has a lockfile, but no result was found. " + "This may be due to another submission that is currently running, or the hard " + "interrupt (e.g. a debugging abortion) interrupting a previous run. " + f"In the case of an interrupted run, please remove {str(job.lockfile)!r} " + "and resubmit." + ) + raise RuntimeError(f"Job {job} has no result in {str(job.cache_dir)!r}") + return result + + def submit(self, job: "Job[TaskType]", rerun: bool = False) -> None: + """Submit a job to the worker. - async def expand_runnable(self, runnable, wait=False, rerun=False): + Parameters + ---------- + job : :obj:`~pydra.engine.job.Job` + The job to submit + rerun : bool, optional + Whether to force the re-computation of the job results even if existing + results are found, by default False """ - This coroutine handles state expansion. 
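# Illustrative sketch (not part of the patch): submitting a split task. The
# ``split`` call follows the task API referenced in the error message above;
# its exact signature is an assumption, as it is not shown in this hunk.
def _split_submission_sketch():
    from pydra.compose import python
    from pydra.engine.submitter import Submitter

    @python.define(outputs={"out": float})
    def AddTwo(a: int, b: float = 0.1):
        return a + b

    task = AddTwo(b=0.5).split(a=[1, 2, 3])  # sets the task's splitter
    with Submitter(worker="debug") as sub:
        # because the task itself is split, __call__ wraps it in the implicit
        # one-node "Split" workflow constructed above before submitting it
        result = sub(task)
    return result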
+ if self.worker.is_async: # Only workflow tasks can be async + self.loop.run_until_complete(self.worker.submit(job, rerun=rerun)) + else: + self.worker.run(job, rerun=rerun) + + def __getstate__(self): + state = self.__dict__.copy() + # Remove the unpicklable entries or those that should not be pickled + # When unpickled (in another process) the submitter can't be called + state["loop"] = None + return state + + def __setstate__(self, state): + self.__dict__.update(state) + # Restore the loop and worker + self.loop = get_open_loop() + self.worker.loop = self.loop - Removes any states from `runnable`. If `wait` is - set to False (default), aggregates all worker - execution coroutines and returns them. If `wait` is - True, waits for all coroutines to complete / error - and returns None. + def expand_workflow(self, workflow_task: "Job[workflow.Task]", rerun: bool) -> None: + """Expands and executes a workflow job synchronously. Typically only used during + debugging and testing, as the asynchronous version is more efficient. Parameters ---------- - runnable : pydra Task - Task instance (`Task`, `Workflow`) - wait : bool (False) - Await all futures before completing - - Returns - ------- - futures : set or None - Coroutines for :class:`~pydra.engine.core.TaskBase` execution. + job : :obj:`~pydra.engine.job.Job[workflow.Task]` + Workflow Job object """ - if runnable.plugin and runnable.plugin != self.plugin: - raise NotImplementedError() - - futures = set() - if runnable.state is None: - raise Exception("Only runnables with state should reach here") - - task_pkl = await prepare_runnable_with_state(runnable) + # Construct the workflow + wf = workflow_task.task.construct() + # Generate the execution graph + exec_graph = wf.execution_graph(submitter=self) + workflow_task.return_values = {"workflow": wf, "exec_graph": exec_graph} + tasks = self.get_runnable_tasks(exec_graph) + while tasks or any(not n.done for n in exec_graph.nodes): + for job in tasks: + self.worker.run(job, rerun=rerun and self.propagate_rerun) + tasks = self.get_runnable_tasks(exec_graph) + + async def expand_workflow_async( + self, workflow_task: "Job[workflow.Task]", rerun: bool + ) -> None: + """ + Expand and execute a workflow job asynchronously. - for sidx in range(len(runnable.state.states_val)): - if is_workflow(runnable): - # job has no state anymore - futures.add( - # This unpickles and runs workflow - why are we pickling? - asyncio.create_task(load_and_run_async(task_pkl, sidx, self, rerun)) + Parameters + ---------- + job : :obj:`~pydra.engine.job.Job[workflow.Task]` + Workflow Job object + """ + wf = workflow_task.task.construct() + # Generate the execution graph + exec_graph = wf.execution_graph(submitter=self) + workflow_task.return_values = {"workflow": wf, "exec_graph": exec_graph} + # keep track of pending futures + task_futures = set() + futured: dict[str, Job[TaskType]] = {} + tasks = self.get_runnable_tasks(exec_graph) + errors = [] + try: + while tasks or task_futures or any(not n.done for n in exec_graph.nodes): + if not tasks and not task_futures: + # it's possible that task_futures is empty, but not able to get any + # tasks from graph_copy (using get_runnable_tasks) + # this might be related to some delays saving the files + # so try to get_runnable_tasks for another minute + ii = 0 + while not tasks and any(not n.done for n in exec_graph.nodes): + tasks = self.get_runnable_tasks(exec_graph) + ii += 1 + # don't block the event loop! 
+ await asyncio.sleep(1) + if ii > 10: + not_done = "\n".join( + ( + f"{n.name}: started={bool(n.started)}, " + f"blocked={list(n.blocked)}, queued={list(n.queued)}" + ) + for n in exec_graph.nodes + if not n.done + ) + msg = ( + "Something has gone wrong when retrieving the predecessor " + f"results. Not able to get any more tasks but he following " + f"nodes of the {wf.name!r} workflow are not done:" + f"\n{not_done}\n\n" + ) + not_done = [n for n in exec_graph.nodes if not n.done] + msg += "\n" + ", ".join( + f"{t.name}: {t.done}" + for t in not_done[0].queued.values() + ) + # Get blocked tasks and the predecessors they are blocked on + outstanding: dict[Job[TaskType], list[Job[TaskType]]] = { + t: [ + p + for p in exec_graph.predecessors[t.name] + if not p.done + ] + for t in exec_graph.sorted_nodes + } + + hashes_have_changed = False + for job, blocked_on in outstanding.items(): + if not blocked_on: + continue + msg += f"- '{job.name}' node blocked due to\n" + for pred in blocked_on: + if ( + pred.checksum + != wf.inputs._graph_checksums[pred.name] + ): + msg += ( + f" - hash changes in '{pred.name}' node " + f"inputs. Current values and hashes: " + f"{pred.inputs}, {pred.inputs._hashes}\n" + ) + hashes_have_changed = True + elif pred not in outstanding: + msg += ( + f" - undiagnosed issues in '{pred.name}' " + "node, potentially related to file-system " + "access issues " + ) + msg += "\n" + if hashes_have_changed: + msg += ( + "Set loglevel to 'debug' in order to track hash " + "changes throughout the execution of the workflow." + "\n\n These issues may have been caused by " + "`bytes_repr()` methods that don't return stable " + "hash values for specific object types across " + "multiple processes (see bytes_repr() " + '"singledispatch "function in pydra/utils/hash.py).' + "You may need to write specific `bytes_repr()` " + "implementations (see `pydra.utils.hash.register_serializer`) " + "or `__bytes_repr__()` dunder methods to handle one " + "or more types in your interface inputs." + ) + raise RuntimeError(msg) + for job in tasks: + if job.is_async: # Only workflows at this stage + await self.worker.submit( + job, rerun=rerun and self.propagate_rerun + ) + elif job.checksum not in futured: + asyncio_task = asyncio.Task( + self.worker.run(job, rerun=rerun and self.propagate_rerun), + name=job.checksum, + ) + task_futures.add(asyncio_task) + futured[job.checksum] = job + task_futures, completed = await self.fetch_finished(task_futures) + for task_future in completed: + try: + task_future.result() + except Exception: + error_msg = format_exc() + if match := re.match( + r'.*"""(.*)""".*', + error_msg, + flags=re.DOTALL | re.MULTILINE, + ): + error_msg = match.group(1) + job = futured[task_future.get_name()] + task_name = job.name + if job.state_index is not None: + task_name += f"({job.state_index})" + errors.append( + f"Job {task_name!r}, {job.task!r}, errored:{error_msg}" + ) + tasks = self.get_runnable_tasks(exec_graph) + finally: + if errors: + all_errors = "\n\n".join(errors) + raise RuntimeError( + f"Workflow job {workflow_task} failed with errors" + f":\n\n{all_errors}\n\nSee output directory for details: {workflow_task.cache_dir}" ) - else: - futures.add(self.worker.run_el((sidx, task_pkl, runnable), rerun=rerun)) - if wait and futures: - # if wait is True, we are at the end of the graph / state expansion. 
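# Standalone sketch of the asyncio pattern that fetch_finished (defined below)
# relies on: wait until at least one pending task finishes, harvest its
# result, and keep waiting on the rest. It only uses the ``asyncio`` module
# already imported at the top of this file.
async def _first_completed_sketch():
    async def work(delay: float) -> float:
        await asyncio.sleep(delay)
        return delay

    pending = {asyncio.create_task(work(d)) for d in (0.2, 0.1, 0.3)}
    finished = []
    while pending:
        done, pending = await asyncio.wait(
            pending, return_when=asyncio.FIRST_COMPLETED
        )
        finished.extend(t.result() for t in done)
    return finished  # results arrive in completion order, e.g. [0.1, 0.2, 0.3]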
- # Once the remaining jobs end, we will exit `submit_from_call` - await asyncio.gather(*futures) - return - # pass along futures to be awaited independently - return futures - - async def expand_workflow(self, wf, rerun=False): + async def fetch_finished( + self, futures + ) -> tuple[set[asyncio.Task], set[asyncio.Task]]: """ - Expand and execute a stateless :class:`~pydra.engine.core.Workflow`. - This method is only reached by `Workflow._run_task`. + Awaits asyncio's :class:`asyncio.Task` until one is finished. Parameters ---------- - wf : :obj:`~pydra.engine.core.Workflow` - Workflow Task object + futures : set of asyncio awaitables + Job execution coroutines or asyncio :class:`asyncio.Task` Returns ------- - wf : :obj:`pydra.engine.core.Workflow` - The computed workflow + pending : set + Pending asyncio :class:`asyncio.Task`. + done: set + Completed asyncio :class:`asyncio.Task` """ - # creating a copy of the graph that will be modified - # the copy contains new lists with original runnable objects - graph_copy = wf.graph.copy() - # resetting uid for nodes in the copied workflows - for nd in graph_copy.nodes: - nd._uid = uuid4().hex - # keep track of pending futures - task_futures = set() - tasks, tasks_follow_errored = get_runnable_tasks(graph_copy) - while tasks or task_futures or graph_copy.nodes: - if not tasks and not task_futures: - # it's possible that task_futures is empty, but not able to get any - # tasks from graph_copy (using get_runnable_tasks) - # this might be related to some delays saving the files - # so try to get_runnable_tasks for another minut - ii = 0 - while not tasks and graph_copy.nodes: - tasks, follow_err = get_runnable_tasks(graph_copy) - ii += 1 - # don't block the event loop! - await asyncio.sleep(1) - if ii > 60: - msg = ( - f"Graph of '{wf}' workflow is not empty, but not able to get " - "more tasks - something has gone wrong when retrieving the " - "results predecessors:\n\n" - ) - # Get blocked tasks and the predecessors they are waiting on - outstanding = { - t: [ - p for p in graph_copy.predecessors[t.name] if not p.done - ] - for t in graph_copy.sorted_nodes - } - - hashes_have_changed = False - for task, waiting_on in outstanding.items(): - if not waiting_on: - continue - msg += f"- '{task.name}' node blocked due to\n" - for pred in waiting_on: - if ( - pred.checksum - != wf.inputs._graph_checksums[pred.name] - ): - msg += ( - f" - hash changes in '{pred.name}' node inputs. " - f"Current values and hashes: {pred.inputs}, " - f"{pred.inputs._hashes}\n" - ) - hashes_have_changed = True - elif pred not in outstanding: - msg += ( - f" - undiagnosed issues in '{pred.name}' node, " - "potentially related to file-system access issues " - ) - msg += "\n" - if hashes_have_changed: - msg += ( - "Set loglevel to 'debug' in order to track hash changes " - "throughout the execution of the workflow.\n\n " - "These issues may have been caused by `bytes_repr()` methods " - "that don't return stable hash values for specific object " - "types across multiple processes (see bytes_repr() " - '"singledispatch "function in pydra/utils/hash.py).' - "You may need to write specific `bytes_repr()` " - "implementations (see `pydra.utils.hash.register_serializer`) " - "or `__bytes_repr__()` dunder methods to handle one " - "or more types in your interface inputs." 
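# Sketch of the remedy suggested in the error message above: register a custom
# byte representation so input hashes stay stable across processes. Kept as a
# comment because the exact serializer protocol (a generator over byte chunks,
# dispatched on the annotated type of the first argument, plus a Cache
# argument) is assumed from pydra/utils/hash.py and should be checked there
# before use.
#
#   import typing as ty
#   from pydra.utils.hash import register_serializer, Cache
#
#   class MyHandle:
#       """An object whose default hashing is not stable across processes."""
#       def __init__(self, path: str):
#           self.path = path
#
#   @register_serializer
#   def bytes_repr_myhandle(obj: MyHandle, cache: Cache) -> ty.Iterator[bytes]:
#       yield b"MyHandle:"
#       yield obj.path.encode()
#
# Alternatively, give the class a ``__bytes_repr__(self, cache)`` generator
# method, as mentioned in the message above.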
- ) - raise RuntimeError(msg) - for task in tasks: - # grab inputs if needed - logger.debug(f"Retrieving inputs for {task}") - # TODO: add state idx to retrieve values to reduce waiting - task.inputs.retrieve_values(wf) - if task.state: - for fut in await self.expand_runnable(task, rerun=rerun): - task_futures.add(fut) - # expand that workflow - elif is_workflow(task): - await task._run(self, rerun=rerun) - # single task - else: - task_futures.add(self.worker.run_el(task, rerun=rerun)) - task_futures = await self.worker.fetch_finished(task_futures) - tasks, follow_err = get_runnable_tasks(graph_copy) - # updating tasks_errored - for key, val in follow_err.items(): - tasks_follow_errored.setdefault(key, []) - tasks_follow_errored[key] += val - - for key, val in tasks_follow_errored.items(): - setattr(getattr(wf, key), "_errored", val) - return wf + done = set() + try: + done, pending = await asyncio.wait( + futures, return_when=asyncio.FIRST_COMPLETED + ) + except ValueError: + # nothing pending! + pending = set() + logger.debug(f"Tasks finished: {len(done)}") + return pending, done def __enter__(self): return self @@ -268,108 +512,455 @@ def close(self): """ Close submitter. - Do not close previously running loop. + Do not close previously queued loop. """ self.worker.close() if self._own_loop: self.loop.close() + def _check_locks(self, tasks: list[Job]) -> None: + """Check for stale lock files and remove them.""" + if self.clean_stale_locks: + for job in tasks: + start_time = job.run_start_time + if start_time and start_time < self.run_start_time: + job.lockfile.unlink() -def get_runnable_tasks(graph): - """Parse a graph and return all runnable tasks.""" - tasks = [] - to_remove = [] - # tasks that follow task that raises an error - following_err = dict() - for tsk in graph.sorted_nodes: - if tsk not in graph.sorted_nodes: - continue - # since the list is sorted (breadth-first) we can stop - # when we find a task that depends on any task that is already in tasks - if set(graph.predecessors[tsk.name]).intersection(set(tasks)): - break - _is_runnable = is_runnable(graph, tsk) - if _is_runnable is True: - tasks.append(tsk) - to_remove.append(tsk) - elif _is_runnable is False: - continue - else: # a previous task had an error - errored_task = _is_runnable - # removing all successors of the errored task - for task_err in errored_task: - task_to_remove = graph.remove_successors_nodes(task_err) - for tsk in task_to_remove: - # adding tasks that were removed from the graph - # due to the error in the errored_task - following_err.setdefault(tsk, []) - following_err[tsk].append(task_err.name) - - # removing tasks that are ready to run from the graph - for nd in to_remove: - graph.remove_nodes(nd) - return tasks, following_err - - -def is_runnable(graph, obj): - """Check if a task within a graph is runnable.""" - connections_to_remove = [] - pred_errored = [] - is_done = None - for pred in graph.predecessors[obj.name]: - try: - is_done = pred.done - except ValueError: - pred_errored.append(pred) + def get_runnable_tasks(self, graph: DiGraph) -> list["Job[TaskType]"]: + """Parse a graph and return all runnable tasks. 
- if is_done is True: - connections_to_remove.append(pred) - elif is_done is False: + Parameters + ---------- + graph : :obj:`~pydra.engine.graph.DiGraph` + Graph object + + Returns + ------- + tasks : list of :obj:`~pydra.engine.job.Job` + List of runnable tasks + following_err : dict[NodeToExecute, list[str]] + Dictionary of tasks that are blocked by errored tasks + """ + tasks = [] + not_started = set() + node: NodeExecution + for node in graph.sorted_nodes: + if node.done: + continue + # since the list is sorted (breadth-first) we can stop + # when we find a job that depends on any job that is already in tasks + preds = set(graph.predecessors[node.name]) + if preds.intersection(not_started): + break + # Record if the node has not been started + if not node.started: + not_started.add(node) + tasks.extend(node.get_runnable_tasks(graph)) + self._check_locks(tasks) + if len(tasks) > self.max_concurrent: + logger.info( + "Reducing number of tasks to run concurrently from %d to %d", + len(tasks), + self.max_concurrent, + ) + tasks = tasks[: self.max_concurrent] + return tasks + + @property + def cache_root(self): + """Get the location of the cache directory.""" + return self._cache_root + + @cache_root.setter + def cache_root(self, location): + if location is not None: + self._cache_root = Path(location).resolve() + self._cache_root.mkdir(parents=False, exist_ok=True) + else: + self._cache_root = mkdtemp() + self._cache_root = Path(self._cache_root).resolve() + + +class NodeExecution(ty.Generic[TaskType]): + """A wrapper around a workflow node containing the execution state of the tasks that + are generated from it""" + + name: str + node: "Node" + submitter: Submitter + + # List of tasks that were completed successfully + successful: dict[int, list["Job[TaskType]"]] + # List of tasks that failed + errored: dict[int, "Job[TaskType]"] + # List of tasks that couldn't be run due to upstream errors + unrunnable: dict[int, list["Job[TaskType]"]] + # List of tasks that are queued + queued: dict[int, "Job[TaskType]"] + # List of tasks that are queued + running: dict[int, tuple["Job[TaskType]", datetime]] + # List of tasks that are blocked on other tasks to complete before they can be run + blocked: dict[int, "Job[TaskType]"] | None + + _tasks: dict[int | None, "Job[TaskType]"] | None + + workflow: "Workflow" + + graph: DiGraph["NodeExecution"] | None + + def __init__( + self, + node: "Node", + submitter: Submitter, + workflow: "Workflow", + ): + self.name = node.name + self.node = node + self.submitter = submitter + # Initialize the state dictionaries + self._tasks = None + self.blocked = None + self.successful = {} + self.errored = {} + self.queued = {} + self.running = {} # Not used in logic, but may be useful for progress tracking + self.unrunnable = defaultdict(list) + self.workflow = workflow + self.graph = None + + @property + def state(self): + return self.node.state + + def __repr__(self): + return ( + f"NodeExecution(name={self.name!r}, blocked={list(self.blocked)}, " + f"queued={list(self.queued)}, running={list(self.running)}, " + f"successful={list(self.successful)}, errored={list(self.errored)}, " + f"unrunnable={list(self.unrunnable)})" + ) + + @property + def inputs(self) -> "Node.Inputs": + return self.node.inputs + + @property + def _task(self) -> "Node": + return self.node._task + + @property + def tasks(self) -> ty.Generator["Job[TaskType]", None, None]: + if self._tasks is None: + raise RuntimeError("Tasks have not been generated") + return self._tasks.values() + + def 
start(self) -> None: + """Prepare the execution node so that it can be processed""" + self._tasks = {} + if self.state: + values = {} + for name, value in self.node.state_values.items(): + if name in self.node.state.current_splitter_rpn: + if name in ("*", "."): + continue + if isinstance(value, LazyField): + values[name] = value._get_value( + workflow=self.workflow, graph=self.graph + ) + else: + values[name] = value + self.state.prepare_states(values) + self.state.prepare_inputs() + # Generate the tasks + for index, split_defn in enumerate(self._split_task()): + self._tasks[index] = Job( + task=split_defn, + submitter=self.submitter, + environment=self.node._environment, + name=self.node.name, + hooks=self.node._hooks, + state_index=index, + ) + else: + self._tasks[None] = Job( + task=self._resolve_lazy_inputs(task=self.node._task), + submitter=self.submitter, + environment=self.node._environment, + hooks=self.node._hooks, + name=self.node.name, + ) + self.blocked = copy(self._tasks) + + @property + def started(self) -> bool: + return ( + self.successful + or self.errored + or self.unrunnable + or self.queued + or self.blocked is not None + ) + + @property + def done(self) -> bool: + self.update_status() + if not self.started: return False + # Check to see if any previously queued tasks have completed + return not (self.queued or self.blocked or self.running) + + @property + def has_errored(self) -> bool: + self.update_status() + return bool(self.errored) + + def update_status(self) -> None: + """Updates the status of the tasks in the node.""" + if not self.started: + return + # Check to see if any previously queued tasks have completed + for index, job in list(self.queued.items()): + try: + is_done = job.done + except ValueError: + errored = True + is_done = False + else: + errored = False + if is_done: + self.successful[job.state_index] = self.queued.pop(index) + elif job.errored or errored: + self.errored[job.state_index] = self.queued.pop(index) + elif job.run_start_time: + self.running[job.state_index] = ( + self.queued.pop(index), + job.run_start_time, + ) + # Check to see if any previously running tasks have completed + for index, (job, _) in list(self.running.items()): + if job.done: + self.successful[job.state_index] = self.running.pop(index)[0] + elif job.errored: + self.errored[job.state_index] = self.running.pop(index)[0] + + @property + def all_failed(self) -> bool: + return (self.unrunnable or self.errored) and not ( + self.successful or self.blocked or self.queued + ) - if pred_errored: - return pred_errored + def _resolve_lazy_inputs( + self, + task: "Task", + state_index: int | None = None, + ) -> "Task": + """Resolves lazy fields in the task by replacing them with their + actual values calculated by upstream jobs. 
- # removing nodes that are done from connections - for nd in connections_to_remove: - graph.remove_nodes_connections(nd) + Parameters + ---------- + task : Task + The task to resolve the lazy fields of + state_index : int, optional + The state index for the workflow, by default None - return True + Returns + ------- + Task + The task with all lazy fields resolved + """ + resolved = {} + for name, value in attrs_values(task).items(): + if isinstance(value, LazyField): + resolved[name] = value._get_value( + workflow=self.workflow, graph=self.graph, state_index=state_index + ) + return attrs.evolve(task, **resolved) + def _split_task(self) -> dict[int, "Task[OutputType]"]: + """Split the task into the different states it will be run over -async def prepare_runnable_with_state(runnable): - runnable.state.prepare_states(runnable.inputs, cont_dim=runnable.cont_dim) - runnable.state.prepare_inputs() - logger.debug(f"Expanding {runnable} into {len(runnable.state.states_val)} states") - return runnable.pickle_task() + Parameters + ---------- + values : dict[str, Any] + The values to use for the split + """ + # TODO: doesn't work properly for more cmplicated wf (check if still an issue) + if not self.node.state: + return {None: self.node._task} + split_defs = [] + for index, vals in zip(self.node.state.inputs_ind, self.node.state.states_val): + resolved = {} + for inpt_name in set(self.node.input_names): + value = getattr(self._task, inpt_name) + state_key = f"{self.node.name}.{inpt_name}" + try: + resolved[inpt_name] = vals[state_key] + except KeyError: + if isinstance(value, LazyField): + resolved[inpt_name] = value._get_value( + workflow=self.workflow, + graph=self.graph, + state_index=index.get(state_key), + ) + split_defs.append(attrs.evolve(self.node._task, **resolved)) + return split_defs + def get_runnable_tasks(self, graph: DiGraph) -> list["Job[TaskType]"]: + """For a given node, check to see which tasks have been successfully run, are ready + to run, can't be run due to upstream errors, or are blocked on other tasks to complete. 
-def _list_blocked_tasks(graph): - """Generates a list of tasks that can't be run and predecessors that are blocking - them to help debugging of broken workflows""" - blocked = [] - for tsk in graph.sorted_nodes: - blocking = [] - for pred in graph.predecessors[tsk.name]: - if not pred.done: - matching_name = [] - for cache_loc in tsk.cache_locations: - for tsk_work_dir in cache_loc.iterdir(): - if (tsk_work_dir / "_task.pklz").exists(): - with open(tsk_work_dir / "_task.pklz", "rb") as f: - saved_tsk = pickle.load(f) - if saved_tsk.name == pred.name: - matching_name.append( - f"{saved_tsk.name} ({tsk_work_dir.name})" - ) - blocking.append((pred, ", ".join(matching_name))) - if blocking: - blocked.append( - f"\n{tsk.name} ({tsk.checksum}) is blocked by " - + "; ".join( - f"{pred.name} ({pred.checksum}), which matches names of [{matching}]" - for pred, matching in blocking + Parameters + ---------- + node : :obj:`~pydra.engine.node.Node` + The node object to get the tasks for + graph : :obj:`~pydra.engine.graph.DiGraph` + Graph object + + + Returns + ------- + runnable : list[NodeExecution] + List of tasks that are ready to run + """ + runnable: list["Job[TaskType]"] = [] + predecessors: list["Job[TaskType]"] = graph.predecessors[self.node.name] + + # If there is a split, we need to wait for all predecessor nodes to finish + # In theory, if the current splitter splits an already split state we should + # only need to wait for the direct predecessor jobs to finish, however, this + # would require a deep refactor of the State class as we need the whole state + # in order to assign consistent state indices across the new split + + # FIXME: The branch for handling partially completed/errored/unrunnable + # predecessor nodes can't be used until the State class can be partially + # initialised with lazy-fields. 
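# Standalone illustration (hypothetical names, no pydra imports) of the
# bookkeeping pattern NodeExecution uses: jobs start out blocked, move to
# queued once their predecessors succeed, then to running, and finish as
# successful or errored; a node is "done" when nothing remains blocked,
# queued or running.
def _node_lifecycle_sketch():
    blocked = {0: "job-0", 1: "job-1"}
    queued, running, successful, errored = {}, {}, {}, {}

    # predecessors finished -> everything that was blocked becomes runnable
    queued.update(blocked)
    blocked.clear()

    # the worker picks jobs up and reports back
    running[0] = queued.pop(0)
    successful[0] = running.pop(0)
    errored[1] = queued.pop(1)

    done = not (blocked or queued or running)
    return done, successful, errored  # (True, {0: "job-0"}, {1: "job-1"})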
+ if True: # self.node.splitter: + if unrunnable := [p for p in predecessors if p.errored or p.unrunnable]: + self.unrunnable = {None: unrunnable} + self.blocked = {} + assert self.done + else: + if all(p.done for p in predecessors): + if not self.started: + self.start() + if self.node.state is None: + inds = [None] + else: + inds = list(range(len(self.node.state.states_ind))) + if self.blocked: + for i in inds: + runnable.append(self.blocked.pop(i)) + else: + if not self.started: + self.start() + + # Check to see if any blocked tasks are now runnable/unrunnable + for index, job in list(self.blocked.items()): + pred: NodeExecution + is_runnable = True + states_ind = ( + list(self.node.state.states_ind[index].items()) + if self.node.state + else [] ) - ) - return blocked + for pred in predecessors: + if pred.node.state: + pred_states_ind = { + (k, i) + for k, i in states_ind + if k.startswith(pred.name + ".") + } + pred_inds = [ + i + for i, ind in enumerate(pred.node.state.states_ind) + if set(ind.items()).issuperset(pred_states_ind) + ] + else: + pred_inds = [None] + if not all(i in pred.successful for i in pred_inds): + is_runnable = False + blocked = True + if pred_errored := [ + pred.errored[i] for i in pred_inds if i in pred.errored + ]: + self.unrunnable[index].extend(pred_errored) + blocked = False + if pred_unrunnable := [ + pred.unrunnable[i] + for i in pred_inds + if i in pred.unrunnable + ]: + self.unrunnable[index].extend(pred_unrunnable) + blocked = False + if not blocked: + del self.blocked[index] + break + if is_runnable: + runnable.append(self.blocked.pop(index)) + self.queued.update({t.state_index: t for t in runnable}) + return list(self.queued.values()) + + +async def prepare_runnable(runnable): + return runnable.pickle_task() + + +# def _list_blocked_tasks(graph): +# """Generates a list of tasks that can't be run and predecessors that are blocking +# them to help debugging of broken workflows""" +# blocked = [] +# for tsk in graph.sorted_nodes: +# blocking = [] +# for pred in graph.predecessors[tsk.name]: +# if not pred.done: +# matching_name = [] +# for cache_loc in tsk.readonly_caches: +# for tsk_work_dir in cache_loc.iterdir(): +# if (tsk_work_dir / "_job.pklz").exists(): +# with open(tsk_work_dir / "_job.pklz", "rb") as f: +# saved_tsk = pickle.load(f) +# if saved_tsk.name == pred.name: +# matching_name.append( +# f"{saved_tsk.name} ({tsk_work_dir.name})" +# ) +# blocking.append((pred, ", ".join(matching_name))) +# if blocking: +# blocked.append( +# f"\n{tsk.name} ({tsk.checksum}) is blocked by " +# + "; ".join( +# f"{pred.name} ({pred.checksum}), which matches names of [{matching}]" +# for pred, matching in blocking +# ) +# ) +# return blocked + + +def get_open_loop(): + """ + Get current event loop. + + If the loop is closed, a new + loop is created and set as the current event loop. 
+ + Returns + ------- + loop : :obj:`asyncio.EventLoop` + The current event loop + + """ + if os.name == "nt": + loop = asyncio.ProactorEventLoop() # for subprocess' pipes on Windows + else: + try: + loop = asyncio.get_event_loop() + # in case RuntimeError: There is no current event loop in thread 'MainThread' + except RuntimeError: + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + else: + if loop.is_closed(): + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + return loop diff --git a/pydra/engine/task.py b/pydra/engine/task.py deleted file mode 100644 index cb55d9e390..0000000000 --- a/pydra/engine/task.py +++ /dev/null @@ -1,592 +0,0 @@ -""" -Implement processing nodes. - -.. admonition :: Notes: - - * Environment specs - - 1. neurodocker json - 2. singularity file+hash - 3. docker hash - 4. conda env - 5. niceman config - 6. environment variables - - * Monitors/Audit - - 1. internal monitor - 2. external monitor - 3. callbacks - - * Resuming - - 1. internal tracking - 2. external tracking (DMTCP) - - * Provenance - - 1. Local fragments - 2. Remote server - - * Isolation - - 1. Working directory - 2. File (copy to local on write) - 3. read only file system - - * `Original implementation - `__ - -""" - -from __future__ import annotations - -import platform -import re -import attr -import inspect -import typing as ty -import shlex -from pathlib import Path -import warnings -import cloudpickle as cp -from fileformats.core import FileSet, DataType -from .core import TaskBase, is_lazy -from ..utils.messenger import AuditFlag -from .specs import ( - BaseSpec, - SpecInfo, - ShellSpec, - ShellOutSpec, - attr_fields, -) -from .helpers import ( - ensure_list, - position_sort, - argstr_formatting, - output_from_inputfields, - parse_copyfile, -) -from .helpers_file import template_update -from ..utils.typing import TypeParser -from .environments import Native - - -class FunctionTask(TaskBase): - """Wrap a Python callable as a task element.""" - - def __init__( - self, - func: ty.Callable, - audit_flags: AuditFlag = AuditFlag.NONE, - cache_dir=None, - cache_locations=None, - input_spec: ty.Optional[ty.Union[SpecInfo, BaseSpec]] = None, - cont_dim=None, - messenger_args=None, - messengers=None, - name=None, - output_spec: ty.Optional[ty.Union[SpecInfo, BaseSpec]] = None, - rerun=False, - **kwargs, - ): - """ - Initialize this task. - - Parameters - ---------- - func : :obj:`callable` - A Python executable function. - audit_flags : :obj:`pydra.utils.messenger.AuditFlag` - Auditing configuration - cache_dir : :obj:`os.pathlike` - Cache directory - cache_locations : :obj:`list` of :obj:`os.pathlike` - List of alternative cache locations. - input_spec : :obj:`pydra.engine.specs.SpecInfo` - Specification of inputs. - cont_dim : :obj:`dict`, or `None` - Container dimensions for input fields, - if any of the container should be treated as a container - messenger_args : - TODO - messengers : - TODO - name : :obj:`str` - Name of this task. - output_spec : :obj:`pydra.engine.specs.BaseSpec` - Specification of inputs. 
- - """ - if input_spec is None: - fields = [] - for val in inspect.signature(func).parameters.values(): - if val.default is not inspect.Signature.empty: - val_dflt = val.default - else: - val_dflt = attr.NOTHING - if isinstance(val.annotation, ty.TypeVar): - raise NotImplementedError( - "Template types are not currently supported in task signatures " - f"(found in '{val.name}' field of '{name}' task), " - "see https://github.com/nipype/pydra/issues/672" - ) - fields.append( - ( - val.name, - attr.ib( - default=val_dflt, - type=val.annotation, - metadata={ - "help_string": f"{val.name} parameter from {func.__name__}" - }, - ), - ) - ) - fields.append(("_func", attr.ib(default=cp.dumps(func), type=bytes))) - input_spec = SpecInfo(name="Inputs", fields=fields, bases=(BaseSpec,)) - else: - input_spec.fields.append( - ("_func", attr.ib(default=cp.dumps(func), type=bytes)) - ) - self.input_spec = input_spec - if name is None: - name = func.__name__ - super().__init__( - name, - inputs=kwargs, - cont_dim=cont_dim, - audit_flags=audit_flags, - messengers=messengers, - messenger_args=messenger_args, - cache_dir=cache_dir, - cache_locations=cache_locations, - rerun=rerun, - ) - if output_spec is None: - name = "Output" - fields = [("out", ty.Any)] - if "return" in func.__annotations__: - return_info = func.__annotations__["return"] - # # e.g. python annotation: fun() -> ty.NamedTuple("Output", [("out", float)]) - # # or pydra decorator: @pydra.mark.annotate({"return": ty.NamedTuple(...)}) - # - - if ( - hasattr(return_info, "__name__") - and getattr(return_info, "__annotations__", None) - and not issubclass(return_info, DataType) - ): - name = return_info.__name__ - fields = list(return_info.__annotations__.items()) - # e.g. python annotation: fun() -> {"out": int} - # or pydra decorator: @pydra.mark.annotate({"return": {"out": int}}) - elif isinstance(return_info, dict): - fields = list(return_info.items()) - # e.g. python annotation: fun() -> (int, int) - # or pydra decorator: @pydra.mark.annotate({"return": (int, int)}) - elif isinstance(return_info, tuple): - fields = [(f"out{i}", t) for i, t in enumerate(return_info, 1)] - # e.g. 
python annotation: fun() -> int - # or pydra decorator: @pydra.mark.annotate({"return": int}) - else: - fields = [("out", return_info)] - output_spec = SpecInfo(name=name, fields=fields, bases=(BaseSpec,)) - - self.output_spec = output_spec - - def _run_task(self, environment=None): - inputs = attr.asdict(self.inputs, recurse=False) - del inputs["_func"] - self.output_ = None - output = cp.loads(self.inputs._func)(**inputs) - output_names = [el[0] for el in self.output_spec.fields] - if output is None: - self.output_ = {nm: None for nm in output_names} - elif len(output_names) == 1: - # if only one element in the fields, everything should be returned together - self.output_ = {output_names[0]: output} - elif isinstance(output, tuple) and len(output_names) == len(output): - self.output_ = dict(zip(output_names, output)) - elif isinstance(output, dict): - self.output_ = {key: output.get(key, None) for key in output_names} - else: - raise RuntimeError( - f"expected {len(self.output_spec.fields)} elements, " - f"but {output} were returned" - ) - - -class ShellCommandTask(TaskBase): - """Wrap a shell command as a task element.""" - - input_spec = None - output_spec = None - - def __init__( - self, - audit_flags: AuditFlag = AuditFlag.NONE, - cache_dir=None, - input_spec: ty.Optional[SpecInfo] = None, - cont_dim=None, - messenger_args=None, - messengers=None, - name=None, - output_spec: ty.Optional[SpecInfo] = None, - rerun=False, - strip=False, - environment=Native(), - **kwargs, - ): - """ - Initialize this task. - - Parameters - ---------- - audit_flags : :obj:`pydra.utils.messenger.AuditFlag` - Auditing configuration - cache_dir : :obj:`os.pathlike` - Cache directory - input_spec : :obj:`pydra.engine.specs.SpecInfo` - Specification of inputs. - cont_dim : :obj:`dict`, or `None` - Container dimensions for input fields, - if any of the container should be treated as a container - messenger_args : - TODO - messengers : - TODO - name : :obj:`str` - Name of this task. - output_spec : :obj:`pydra.engine.specs.BaseSpec` - Specification of inputs. - strip : :obj:`bool` - TODO - - """ - - # using default name for task if no name provided - if name is None: - name = "ShellTask_noname" - - # using provided spec, class attribute or setting the default SpecInfo - self.input_spec = ( - input_spec - or self.input_spec - or SpecInfo(name="Inputs", fields=[], bases=(ShellSpec,)) - ) - self.output_spec = ( - output_spec - or self.output_spec - or SpecInfo(name="Output", fields=[], bases=(ShellOutSpec,)) - ) - self.output_spec = output_from_inputfields(self.output_spec, self.input_spec) - - for special_inp in ["executable", "args"]: - if hasattr(self, special_inp): - if special_inp not in kwargs: - kwargs[special_inp] = getattr(self, special_inp) - elif kwargs[special_inp] != getattr(self, special_inp): - warnings.warn( - f"you are changing the executable from {getattr(self, special_inp)} " - f"to {kwargs[special_inp]}" - ) - - super().__init__( - name=name, - inputs=kwargs, - cont_dim=cont_dim, - audit_flags=audit_flags, - messengers=messengers, - messenger_args=messenger_args, - cache_dir=cache_dir, - rerun=rerun, - ) - self.strip = strip - self.environment = environment - self.bindings = {} - self.inputs_mod_root = {} - - def get_bindings(self, root: str | None = None) -> dict[str, tuple[str, str]]: - """Return bindings necessary to run task in an alternative root. - - This is primarily intended for contexts when a task is going - to be run in a container with mounted volumes. 
- - Arguments - --------- - root: str - - Returns - ------- - bindings: dict - Mapping from paths in the host environment to the target environment - """ - - if root is None: - return {} - else: - self._prepare_bindings(root=root) - return self.bindings - - def command_args(self, root=None): - """Get command line arguments""" - if is_lazy(self.inputs): - raise Exception("can't return cmdline, self.inputs has LazyFields") - if self.state: - raise NotImplementedError - - modified_inputs = template_update(self.inputs, output_dir=self.output_dir) - for field_name, field_value in modified_inputs.items(): - setattr(self.inputs, field_name, field_value) - - pos_args = [] # list for (position, command arg) - self._positions_provided = [] - for field in attr_fields(self.inputs): - name, meta = field.name, field.metadata - if ( - getattr(self.inputs, name) is attr.NOTHING - and not meta.get("readonly") - and not meta.get("formatter") - ): - continue - if name == "executable": - pos_args.append(self._command_shelltask_executable(field)) - elif name == "args": - pos_val = self._command_shelltask_args(field) - if pos_val: - pos_args.append(pos_val) - else: - if name in modified_inputs: - pos_val = self._command_pos_args(field, root=root) - else: - pos_val = self._command_pos_args(field) - if pos_val: - pos_args.append(pos_val) - - # Sort command and arguments by position - cmd_args = position_sort(pos_args) - # pos_args values are each a list of arguments, so concatenate lists after sorting - return sum(cmd_args, []) - - def _field_value(self, field, check_file=False): - """ - Checking value of the specific field, if value is not set, None is returned. - check_file has no effect, but subclasses can use it to validate or modify - filenames. - """ - value = getattr(self.inputs, field.name) - if value == attr.NOTHING: - value = None - return value - - def _command_shelltask_executable(self, field): - """Returning position and value for executable ShellTask input""" - pos = 0 # executable should be the first el. of the command - value = self._field_value(field) - if value is None: - raise ValueError("executable has to be set") - return pos, ensure_list(value, tuple2list=True) - - def _command_shelltask_args(self, field): - """Returning position and value for args ShellTask input""" - pos = -1 # assuming that args is the last el. of the command - value = self._field_value(field, check_file=True) - if value is None: - return None - else: - return pos, ensure_list(value, tuple2list=True) - - def _command_pos_args(self, field, root=None): - """ - Checking all additional input fields, setting pos to None, if position not set. - Creating a list with additional parts of the command that comes from - the specific field. - """ - argstr = field.metadata.get("argstr", None) - formatter = field.metadata.get("formatter", None) - if argstr is None and formatter is None: - # assuming that input that has no argstr is not used in the command, - # or a formatter is not provided too. 
- return None - pos = field.metadata.get("position", None) - if pos is not None: - if not isinstance(pos, int): - raise Exception(f"position should be an integer, but {pos} given") - # checking if the position is not already used - if pos in self._positions_provided: - raise Exception( - f"{field.name} can't have provided position, {pos} is already used" - ) - - self._positions_provided.append(pos) - - # Shift non-negatives up to allow executable to be 0 - # Shift negatives down to allow args to be -1 - pos += 1 if pos >= 0 else -1 - - value = self._field_value(field, check_file=True) - - if value: - if field.name in self.inputs_mod_root: - value = self.inputs_mod_root[field.name] - elif root: # values from templates - value = value.replace(str(self.output_dir), f"{root}{self.output_dir}") - - if field.metadata.get("readonly", False) and value is not None: - raise Exception(f"{field.name} is read only, the value can't be provided") - elif ( - value is None - and not field.metadata.get("readonly", False) - and formatter is None - ): - return None - - inputs_dict = attr.asdict(self.inputs, recurse=False) - - cmd_add = [] - # formatter that creates a custom command argument - # it can take the value of the field, all inputs, or the value of other fields. - if "formatter" in field.metadata: - call_args = inspect.getfullargspec(field.metadata["formatter"]) - call_args_val = {} - for argnm in call_args.args: - if argnm == "field": - call_args_val[argnm] = value - elif argnm == "inputs": - call_args_val[argnm] = inputs_dict - else: - if argnm in inputs_dict: - call_args_val[argnm] = inputs_dict[argnm] - else: - raise AttributeError( - f"arguments of the formatter function from {field.name} " - f"has to be in inputs or be field or output_dir, " - f"but {argnm} is used" - ) - cmd_el_str = field.metadata["formatter"](**call_args_val) - cmd_el_str = cmd_el_str.strip().replace(" ", " ") - if cmd_el_str != "": - cmd_add += split_cmd(cmd_el_str) - elif field.type is bool: - # if value is simply True the original argstr is used, - # if False, nothing is added to the command. - if value is True: - cmd_add.append(argstr) - else: - sep = field.metadata.get("sep", " ") - if ( - argstr.endswith("...") - and isinstance(value, ty.Iterable) - and not isinstance(value, (str, bytes)) - ): - argstr = argstr.replace("...", "") - # if argstr has a more complex form, with "{input_field}" - if "{" in argstr and "}" in argstr: - argstr_formatted_l = [] - for val in value: - argstr_f = argstr_formatting( - argstr, self.inputs, value_updates={field.name: val} - ) - argstr_formatted_l.append(f" {argstr_f}") - cmd_el_str = sep.join(argstr_formatted_l) - else: # argstr has a simple form, e.g. "-f", or "--f" - cmd_el_str = sep.join([f" {argstr} {val}" for val in value]) - else: - # in case there are ... when input is not a list - argstr = argstr.replace("...", "") - if isinstance(value, ty.Iterable) and not isinstance( - value, (str, bytes) - ): - cmd_el_str = sep.join([str(val) for val in value]) - value = cmd_el_str - # if argstr has a more complex form, with "{input_field}" - if "{" in argstr and "}" in argstr: - cmd_el_str = argstr.replace(f"{{{field.name}}}", str(value)) - cmd_el_str = argstr_formatting(cmd_el_str, self.inputs) - else: # argstr has a simple form, e.g. 
"-f", or "--f" - if value: - cmd_el_str = f"{argstr} {value}" - else: - cmd_el_str = "" - if cmd_el_str: - cmd_add += split_cmd(cmd_el_str) - return pos, cmd_add - - @property - def cmdline(self): - """Get the actual command line that will be submitted - Returns a list if the task has a state. - """ - if is_lazy(self.inputs): - raise Exception("can't return cmdline, self.inputs has LazyFields") - # checking the inputs fields before returning the command line - self.inputs.check_fields_input_spec() - if self.state: - raise NotImplementedError - # Skip the executable, which can be a multi-part command, e.g. 'docker run'. - cmdline = self.command_args()[0] - for arg in self.command_args()[1:]: - # If there are spaces in the arg, and it is not enclosed by matching - # quotes, add quotes to escape the space. Not sure if this should - # be expanded to include other special characters apart from spaces - if " " in arg: - cmdline += " '" + arg + "'" - else: - cmdline += " " + arg - return cmdline - - def _run_task(self, environment=None): - if environment is None: - environment = self.environment - self.output_ = environment.execute(self) - - def _prepare_bindings(self, root: str): - """Prepare input files to be passed to the task - - This updates the ``bindings`` attribute of the current task to make files available - in an ``Environment``-defined ``root``. - """ - for fld in attr_fields(self.inputs): - if TypeParser.contains_type(FileSet, fld.type): - fileset = getattr(self.inputs, fld.name) - copy = parse_copyfile(fld)[0] == FileSet.CopyMode.copy - - host_path, env_path = fileset.parent, Path(f"{root}{fileset.parent}") - - # Default to mounting paths as read-only, but respect existing modes - old_mode = self.bindings.get(host_path, ("", "ro"))[1] - self.bindings[host_path] = (env_path, "rw" if copy else old_mode) - - # Provide in-container paths without type-checking - self.inputs_mod_root[fld.name] = tuple( - env_path / rel for rel in fileset.relative_fspaths - ) - - DEFAULT_COPY_COLLATION = FileSet.CopyCollation.adjacent - - -def split_cmd(cmd: str): - """Splits a shell command line into separate arguments respecting quotes - - Parameters - ---------- - cmd : str - Command line string or part thereof - - Returns - ------- - str - the command line string split into process args - """ - # Check whether running on posix or Windows system - on_posix = platform.system() != "Windows" - args = shlex.split(cmd, posix=on_posix) - cmd_args = [] - for arg in args: - match = re.match("(['\"])(.*)\\1$", arg) - if match: - cmd_args.append(match.group(2)) - else: - cmd_args.append(arg) - return cmd_args diff --git a/pydra/engine/tests/test_audit.py b/pydra/engine/tests/test_audit.py new file mode 100644 index 0000000000..a4ac3c7ac4 --- /dev/null +++ b/pydra/engine/tests/test_audit.py @@ -0,0 +1,304 @@ +import json +import glob as glob +from pydra.compose import python, shell, workflow +from pydra.utils.messenger import FileMessenger, PrintMessenger, collect_messages +from pydra.engine.audit import AuditFlag +from pydra.engine.submitter import Submitter +from pydra.engine.job import Job +from fileformats.generic import File +from pydra.utils.hash import hash_function + + +def test_audit_prov( + tmpdir, +): + @python.define(outputs={"out": float}) + def TestFunc(a: int, b: float = 0.1): + return a + b + + # printing the audit message + funky = TestFunc(a=1) + funky(cache_root=tmpdir, audit_flags=AuditFlag.PROV, messengers=PrintMessenger()) + + # saving the audit message into the file + funky = TestFunc(a=2) + 
funky(cache_root=tmpdir, audit_flags=AuditFlag.PROV, messengers=FileMessenger()) + # this should be the default loctaion + message_path = tmpdir / funky._checksum / "messages" + assert (tmpdir / funky._checksum / "messages").exists() + + collect_messages(tmpdir / funky._checksum, message_path, ld_op="compact") + assert (tmpdir / funky._checksum / "messages.jsonld").exists() + + +def test_audit_task(tmpdir): + @python.define(outputs={"out": float}) + def TestFunc(a: int, b: float = 0.1): + return a + b + + from glob import glob + + funky = TestFunc(a=2) + funky( + cache_root=tmpdir, + audit_flags=AuditFlag.PROV, + messengers=FileMessenger(), + ) + message_path = tmpdir / funky._checksum / "messages" + + for file in glob(str(message_path) + "/*.jsonld"): + with open(file) as f: + data = json.load(f) + if "@type" in data: + if "AssociatedWith" in data: + assert "main" in data["Label"] + + if "@type" in data: + if data["@type"] == "input": + assert None is data["Label"] + if "AssociatedWith" in data: + assert None is data["AssociatedWith"] + + # assert any(json_content) + + +def test_audit_shellcommandtask(tmpdir): + Shelly = shell.define("ls -l") + + from glob import glob + + shelly = Shelly() + + shelly( + cache_root=tmpdir, + audit_flags=AuditFlag.PROV, + messengers=FileMessenger(), + ) + message_path = tmpdir / shelly._checksum / "messages" + # go through each jsonld file in message_path and check if the label field exists + + command_content = [] + + for file in glob(str(message_path) + "/*.jsonld"): + with open(file) as f: + data = json.load(f) + + if "@type" in data: + if "AssociatedWith" in data: + assert "main" == data["Label"] + + if "@type" in data: + if data["@type"] == "input": + assert data["Label"] is None + + if "Command" in data: + command_content.append(True) + assert "ls -l" == data["Command"] + + assert any(command_content) + + +def test_audit_shellcommandtask_file(tmp_path): + # sourcery skip: use-fstring-for-concatenation + # create test.txt file with "This is a test" in it in the tmpdir + # create txt file in cwd + test1_file = tmp_path / "test.txt" + test2_file = tmp_path / "test2.txt" + with open(test1_file, "w") as f: + f.write("This is a test") + + with open(test2_file, "w") as f: + f.write("This is a test") + + cmd = "cat" + file_in = File(test1_file) + file_in_2 = File(test2_file) + test_file_hash = hash_function(file_in) + test_file_hash_2 = hash_function(file_in_2) + Shelly = shell.define( + cmd, + inputs={ + "in_file": shell.arg( + type=File, + position=1, + argstr="", + help="text", + ), + "in_file_2": shell.arg( + type=File, + position=2, + argstr="", + help="text", + ), + }, + ) + shelly = Shelly( + in_file=file_in, + in_file_2=file_in_2, + ) + shelly( + cache_root=tmp_path, + audit_flags=AuditFlag.PROV, + messengers=FileMessenger(), + ) + message_path = tmp_path / shelly._hash / "messages" + for file in glob.glob(str(message_path) + "/*.jsonld"): + with open(file) as x: + data = json.load(x) + if "@type" in data: + if data["@type"] == "input": + if data["Label"] == "in_file": + assert data["AtLocation"] == str(file_in) + assert data["digest"] == test_file_hash + if data["Label"] == "in_file_2": + assert data["AtLocation"] == str(file_in_2) + assert data["digest"] == test_file_hash_2 + + +def test_audit_shellcommandtask_version(tmpdir): + import subprocess as sp + + version_cmd = sp.run("less --version", shell=True, stdout=sp.PIPE).stdout.decode( + "utf-8" + ) + version_cmd = version_cmd.splitlines()[0] + cmd = "less test_task.py" + Shelly = shell.define(cmd) 
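The audit tests in this new file all exercise the same provenance flow: run a task with AuditFlag.PROV and a FileMessenger, then compact the per-event JSON-LD messages into a single document. A condensed sketch of that flow follows, with an illustrative task and cache location (both assumptions, not taken from the tests verbatim).

    from pathlib import Path
    from pydra.compose import python
    from pydra.engine.audit import AuditFlag
    from pydra.utils.messenger import FileMessenger, collect_messages

    @python.define(outputs={"out": float})
    def Add(a: int, b: float = 0.1):
        return a + b

    cache = Path("/tmp/pydra-audit-cache")  # illustrative cache location
    task = Add(a=1)
    task(cache_root=cache, audit_flags=AuditFlag.PROV, messengers=FileMessenger())
    # per-event *.jsonld messages are written under <cache>/<checksum>/messages
    message_dir = cache / task._checksum / "messages"
    collect_messages(cache / task._checksum, message_dir, ld_op="compact")
    # -> a compacted <cache>/<checksum>/messages.jsonld document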
+ shelly = Shelly() + + import glob + + shelly( + cache_root=tmpdir, + audit_flags=AuditFlag.PROV, + messengers=FileMessenger(), + ) + message_path = tmpdir / shelly._checksum / "messages" + # go through each jsonld file in message_path and check if the label field exists + version_content = [] + for file in glob.glob(str(message_path) + "/*.jsonld"): + with open(file) as f: + data = json.load(f) + if "AssociatedWith" in data: + if version_cmd in data["AssociatedWith"]: + version_content.append(True) + + assert any(version_content) + + +def test_audit_prov_messdir_1( + tmpdir, +): + """customized messenger dir""" + + @python.define(outputs={"out": float}) + def TestFunc(a: int, b: float = 0.1): + return a + b + + # printing the audit message + funky = TestFunc(a=1) + funky(cache_root=tmpdir, audit_flags=AuditFlag.PROV, messengers=PrintMessenger()) + + # saving the audit message into the file + funky = TestFunc(a=2) + # user defined path + message_path = tmpdir / funky._checksum / "my_messages" + # providing messenger_dir for audit + funky_task = Job( + task=funky, + submitter=Submitter( + cache_root=tmpdir, audit_flags=AuditFlag.PROV, messengers=FileMessenger() + ), + name="funky", + ) + funky_task.audit.messenger_args = dict(message_dir=message_path) + funky_task.run() + assert (tmpdir / funky._checksum / "my_messages").exists() + + collect_messages(tmpdir / funky._checksum, message_path, ld_op="compact") + assert (tmpdir / funky._checksum / "messages.jsonld").exists() + + +def test_audit_prov_messdir_2( + tmpdir, +): + """customized messenger dir in init""" + + @python.define(outputs={"out": float}) + def TestFunc(a: int, b: float = 0.1): + return a + b + + # printing the audit message + funky = TestFunc(a=1) + funky(cache_root=tmpdir, audit_flags=AuditFlag.PROV, messengers=PrintMessenger()) + + # user defined path (doesn't depend on checksum, can be defined before init) + message_path = tmpdir / "my_messages" + # saving the audit message into the file + funky = TestFunc(a=2) + # providing messenger_dir for audit + funky( + cache_root=tmpdir, + audit_flags=AuditFlag.PROV, + messengers=FileMessenger(), + messenger_args=dict(message_dir=message_path), + ) + assert (tmpdir / "my_messages").exists() + + collect_messages(tmpdir, message_path, ld_op="compact") + assert (tmpdir / "messages.jsonld").exists() + + +def test_audit_prov_wf( + tmpdir, +): + """FileMessenger for wf""" + + @python.define(outputs={"out": float}) + def TestFunc(a: int, b: float = 0.1): + return a + b + + @workflow.define + def Workflow(x: int): + test_func = workflow.add(TestFunc(a=x)) + return test_func.out + + wf = Workflow(x=2) + + wf( + cache_root=tmpdir, + audit_flags=AuditFlag.PROV, + messengers=FileMessenger(), + ) + # default path + message_path = tmpdir / wf._checksum / "messages" + assert message_path.exists() + + collect_messages(tmpdir / wf._checksum, message_path, ld_op="compact") + assert (tmpdir / wf._checksum / "messages.jsonld").exists() + + +def test_audit_all( + tmpdir, +): + @python.define(outputs={"out": float}) + def TestFunc(a: int, b: float = 0.1): + return a + b + + funky = TestFunc(a=2) + message_path = tmpdir / funky._checksum / "messages" + + funky( + cache_root=tmpdir, + audit_flags=AuditFlag.ALL, + messengers=FileMessenger(), + messenger_args=dict(message_dir=message_path), + ) + from glob import glob + + assert len(glob(str(tmpdir / funky._checksum / "proc*.log"))) == 1 + assert len(glob(str(message_path / "*.jsonld"))) == 7 + + # commented out to speed up testing + collect_messages(tmpdir 
/ funky._checksum, message_path, ld_op="compact") + assert (tmpdir / funky._checksum / "messages.jsonld").exists() diff --git a/pydra/engine/tests/test_boutiques.py b/pydra/engine/tests/test_boutiques.py deleted file mode 100644 index 48f484b687..0000000000 --- a/pydra/engine/tests/test_boutiques.py +++ /dev/null @@ -1,175 +0,0 @@ -import shutil -import subprocess as sp -import attr -import pytest - -from ..core import Workflow -from ..task import ShellCommandTask -from ..submitter import Submitter -from ..boutiques import BoshTask -from .utils import result_no_submitter, result_submitter, no_win - -need_bosh_docker = pytest.mark.skipif( - shutil.which("docker") is None - or sp.call(["docker", "info"]) - or sp.call(["which", "bosh"]), - reason="requires docker and bosh", -) - -pytestmark = pytest.mark.skip() - - -@no_win -@need_bosh_docker -@pytest.mark.flaky(reruns=3) # need for travis -@pytest.mark.parametrize( - "maskfile", ["test_brain.nii.gz", "test_brain", "test_brain.nii"] -) -@pytest.mark.parametrize("results_function", [result_no_submitter, result_submitter]) -def test_boutiques_1(maskfile, plugin, results_function, tmpdir, data_tests_dir): - """simple task to run fsl.bet using BoshTask""" - btask = BoshTask(name="NA", zenodo_id="1482743") - btask.inputs.infile = data_tests_dir / "test.nii.gz" - btask.inputs.maskfile = maskfile - btask.cache_dir = tmpdir - res = results_function(btask, plugin) - - assert res.output.return_code == 0 - - # checking if the outfile exists and if it has a proper name - assert res.output.outfile.name == "test_brain.nii.gz" - assert res.output.outfile.exists() - # files that do not exist were set to NOTHING - assert res.output.out_outskin_off == attr.NOTHING - - -@no_win -@need_bosh_docker -@pytest.mark.flaky(reruns=3) -def test_boutiques_spec_1(data_tests_dir): - """testing spec: providing input/output fields names""" - btask = BoshTask( - name="NA", - zenodo_id="1482743", - infile=data_tests_dir / "test.nii.gz", - maskfile="test_brain.nii.gz", - input_spec_names=["infile", "maskfile"], - output_spec_names=["outfile", "out_outskin_off"], - ) - - assert len(btask.input_spec.fields) == 2 - assert btask.input_spec.fields[0][0] == "infile" - assert btask.input_spec.fields[1][0] == "maskfile" - assert hasattr(btask.inputs, "infile") - assert hasattr(btask.inputs, "maskfile") - - assert len(btask.output_spec.fields) == 2 - assert btask.output_spec.fields[0][0] == "outfile" - assert btask.output_spec.fields[1][0] == "out_outskin_off" - - -@no_win -@need_bosh_docker -@pytest.mark.flaky(reruns=3) -def test_boutiques_spec_2(data_tests_dir): - """testing spec: providing partial input/output fields names""" - btask = BoshTask( - name="NA", - zenodo_id="1482743", - infile=data_tests_dir / "test.nii.gz", - maskfile="test_brain.nii.gz", - input_spec_names=["infile"], - output_spec_names=[], - ) - - assert len(btask.input_spec.fields) == 1 - assert btask.input_spec.fields[0][0] == "infile" - assert hasattr(btask.inputs, "infile") - # input doesn't see maskfile - assert not hasattr(btask.inputs, "maskfile") - - assert len(btask.output_spec.fields) == 0 - - -@no_win -@need_bosh_docker -@pytest.mark.flaky(reruns=3) -@pytest.mark.parametrize( - "maskfile", ["test_brain.nii.gz", "test_brain", "test_brain.nii"] -) -def test_boutiques_wf_1(maskfile, plugin, tmpdir, infile): - """wf with one task that runs fsl.bet using BoshTask""" - wf = Workflow(name="wf", input_spec=["maskfile", "infile"]) - wf.inputs.maskfile = maskfile - wf.inputs.infile = infile - wf.cache_dir = tmpdir - 
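The Boutiques tests being deleted here (and the docker and environment tests deleted below) are written against the pre-refactor lazy-field Workflow API, where wf.lzin.<input> is a lazy reference to a workflow input and <node>.lzout.<output> to a node output, both resolved only at run time. A compact sketch of that wiring, using the same BoshTask and zenodo_id as the test above, illustrative input values, and the old import paths that this changeset removes:

    from pydra.engine.core import Workflow
    from pydra.engine.submitter import Submitter
    from pydra.engine.boutiques import BoshTask

    wf = Workflow(name="wf", input_spec=["infile", "maskfile"])
    wf.inputs.infile = "test.nii.gz"            # illustrative inputs
    wf.inputs.maskfile = "test_brain.nii.gz"
    wf.add(
        BoshTask(
            name="bet",
            zenodo_id="1482743",
            infile=wf.lzin.infile,              # lazy workflow inputs
            maskfile=wf.lzin.maskfile,
        )
    )
    wf.set_output([("outfile", wf.bet.lzout.outfile)])  # lazy node output
    with Submitter(plugin="cf") as sub:
        wf(submitter=sub)
    res = wf.result()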
- wf.add( - BoshTask( - name="bet", - zenodo_id="1482743", - infile=wf.lzin.infile, - maskfile=wf.lzin.maskfile, - ) - ) - - wf.set_output([("outfile", wf.bet.lzout.outfile)]) - - with Submitter(plugin=plugin) as sub: - wf(submitter=sub) - - res = wf.result() - assert res.output.outfile.name == "test_brain.nii.gz" - assert res.output.outfile.exists() - - -@no_win -@need_bosh_docker -@pytest.mark.flaky(reruns=3) -@pytest.mark.xfail(reason="issues with bosh for 4472771") -@pytest.mark.parametrize( - "maskfile", ["test_brain.nii.gz", "test_brain", "test_brain.nii"] -) -def test_boutiques_wf_2(maskfile, plugin, tmpdir, infile): - """wf with two BoshTasks (fsl.bet and fsl.stats) and one ShellTask""" - wf = Workflow(name="wf", input_spec=["maskfile", "infile"]) - wf.inputs.maskfile = maskfile - wf.inputs.infile = infile - wf.cache_dir = tmpdir - - wf.add( - BoshTask( - name="bet", - zenodo_id="1482743", - infile=wf.lzin.infile, - maskfile=wf.lzin.maskfile, - ) - ) - # used to be "3240521", but can't access anymore - wf.add( - BoshTask( - name="stat", zenodo_id="4472771", input_file=wf.bet.lzout.outfile, v=True - ) - ) - wf.add(ShellCommandTask(name="cat", executable="cat", args=wf.stat.lzout.output)) - - wf.set_output( - [ - ("outfile_bet", wf.bet.lzout.outfile), - ("out_stat", wf.stat.lzout.output), - ("out", wf.cat.lzout.stdout), - ] - ) - - with Submitter(plugin=plugin) as sub: - wf(submitter=sub) - - res = wf.result() - assert res.output.outfile_bet.name == "test_brain.nii.gz" - assert res.output.outfile_bet.exists() - - assert res.output.out_stat.name == "output.txt" - assert res.output.out_stat.exists() - - assert int(res.output.out.rstrip().split()[0]) == 11534336 - assert float(res.output.out.rstrip().split()[1]) == 11534336.0 diff --git a/pydra/engine/tests/test_cache_dirs.py b/pydra/engine/tests/test_cache_dirs.py new file mode 100644 index 0000000000..da863852e5 --- /dev/null +++ b/pydra/engine/tests/test_cache_dirs.py @@ -0,0 +1,196 @@ +import typing as ty +import time +from pathlib import Path +from fileformats.generic import File +from pydra.compose import python +from pydra.engine.tests.utils import FunAddTwo, FunFile +from pydra.engine.submitter import Submitter +from pydra.engine.tests.utils import num_python_cache_roots + + +def test_task_state_cachelocations(worker, tmp_path): + """ + Two identical tasks with a state and cache_root; + the second task has readonly_caches and should not recompute the results + """ + cache_root = tmp_path / "test_task_nostate" + cache_root.mkdir() + cache_root2 = tmp_path / "test_task_nostate2" + cache_root2.mkdir() + + nn = FunAddTwo(a=3).split("a", a=[3, 5]) + with Submitter(worker=worker, cache_root=cache_root) as sub: + sub(nn) + + nn2 = FunAddTwo(a=3).split("a", a=[3, 5]) + with Submitter( + worker=worker, cache_root=cache_root2, readonly_caches=cache_root + ) as sub: + results2 = sub(nn2) + assert not results2.errored, "\n".join(results2.errors["error message"]) + + # checking the results + expected = [({"NA.a": 3}, 5), ({"NA.a": 5}, 7)] + for i, res in enumerate(expected): + assert results2.outputs.out[i] == res[1] + + # Would ideally check for all nodes of the workflows + assert num_python_cache_roots(cache_root) == 2 + assert not num_python_cache_roots(cache_root2) + + +def test_task_state_cachelocations_forcererun(worker, tmp_path): + """ + Two identical tasks with a state and cache_root; + the second task has readonly_caches, + but submitter is called with rerun=True, so should recompute + """ + cache_root = tmp_path / 
"test_task_nostate" + cache_root.mkdir() + cache_root2 = tmp_path / "test_task_nostate2" + cache_root2.mkdir() + + nn = FunAddTwo(a=3).split("a", a=[3, 5]) + with Submitter(worker=worker, cache_root=cache_root) as sub: + sub(nn) + + nn2 = FunAddTwo(a=3).split("a", a=[3, 5]) + with Submitter( + worker=worker, cache_root=cache_root2, readonly_caches=cache_root + ) as sub: + results2 = sub(nn2, rerun=True) + + # checking the results + + expected = [({"NA.a": 3}, 5), ({"NA.a": 5}, 7)] + for i, res in enumerate(expected): + assert results2.outputs.out[i] == res[1] + + # both workflows should be run + assert num_python_cache_roots(cache_root) == 2 + assert num_python_cache_roots(cache_root2) == 2 + + +def test_task_state_cachelocations_updated(worker, tmp_path): + """ + Two identical tasks with states and cache_root; + the second task has readonly_caches in init, + that is later overwritten in Submitter.__call__; + the readonly_caches from call doesn't exist so the second task should run again + """ + cache_root = tmp_path / "test_task_nostate" + cache_root.mkdir() + cache_root1 = tmp_path / "test_task_nostate1" + cache_root1.mkdir() + cache_root2 = tmp_path / "test_task_nostate2" + cache_root2.mkdir() + + nn = FunAddTwo().split("a", a=[3, 5]) + with Submitter(worker=worker, cache_root=cache_root) as sub: + sub(nn) + + nn2 = FunAddTwo().split("a", a=[3, 5]) + with Submitter( + worker=worker, cache_root=cache_root2, readonly_caches=cache_root1 + ) as sub: + results2 = sub(nn2) + assert not results2.errored, "\n".join(results2.errors["error message"]) + + # checking the results + + expected = [({"NA.a": 3}, 5), ({"NA.a": 5}, 7)] + for i, res in enumerate(expected): + assert results2.outputs.out[i] == res[1] + + # both workflows should be run + assert num_python_cache_roots(cache_root) == 2 + assert num_python_cache_roots(cache_root2) == 2 + + +def test_task_files_cachelocations(worker, tmp_path): + """ + Two identical tasks with provided cache_root that use file as an input; + the second task has readonly_caches and should not recompute the results + """ + cache_root = tmp_path / "test_task_nostate" + cache_root.mkdir() + cache_root2 = tmp_path / "test_task_nostate2" + cache_root2.mkdir() + input_dir = tmp_path / "input" + input_dir.mkdir() + + input1 = input_dir / "input1.txt" + input1.write_text("test") + input2 = input_dir / "input2.txt" + input2.write_text("test") + + nn = FunFile(filename=input1) + with Submitter(worker=worker, cache_root=cache_root) as sub: + results = sub(nn) + assert not results.errored, "\n".join(results.errors["error message"]) + + nn2 = FunFile(filename=input2) + with Submitter( + worker=worker, cache_root=cache_root2, readonly_caches=cache_root + ) as sub: + results2 = sub(nn2) + assert not results2.errored, "\n".join(results.errors["error message"]) + + # checking the results + + assert results2.outputs.out == "test" + + # checking if the second task didn't run the interface again + assert results.cache_dir == results2.cache_dir + + +class OverriddenContentsFile(File): + """A class for testing purposes, to that enables you to override the contents + of the file to allow you to check whether the persistent cache is used.""" + + def __init__( + self, + fspaths: ty.Iterator[Path], + contents: ty.Optional[bytes] = None, + metadata: ty.Dict[str, ty.Any] = None, + ): + super().__init__(fspaths, metadata=metadata) + self._contents = contents + + def byte_chunks(self, **kwargs) -> ty.Generator[ty.Tuple[str, bytes], None, None]: + if self._contents is not None: + yield 
(str(self.fspath), iter([self._contents])) + else: + yield from super().byte_chunks(**kwargs) + + @property + def raw_contents(self): + if self._contents is not None: + return self._contents + return super().raw_contents + + +def test_task_files_persistentcache(tmp_path): + """ + Two identical tasks with provided cache_root that use file as an input; + the second task has readonly_caches and should not recompute the results + """ + test_file_path = tmp_path / "test_file.txt" + test_file_path.write_bytes(b"foo") + cache_root = tmp_path / "cache-dir" + cache_root.mkdir() + test_file = OverriddenContentsFile(test_file_path) + + @python.define + def read_contents(x: OverriddenContentsFile) -> bytes: + return x.raw_contents + + assert read_contents(x=test_file)(cache_root=cache_root).out == b"foo" + test_file._contents = b"bar" + # should return result from the first run using the persistent cache + assert read_contents(x=test_file)(cache_root=cache_root).out == b"foo" + time.sleep(2) # Windows has a 2-second resolution for mtime + test_file_path.touch() # update the mtime to invalidate the persistent cache value + assert ( + read_contents(x=test_file)(cache_root=cache_root).out == b"bar" + ) # returns the overridden value diff --git a/pydra/engine/tests/test_dockertask.py b/pydra/engine/tests/test_dockertask.py deleted file mode 100644 index 5ccf37e292..0000000000 --- a/pydra/engine/tests/test_dockertask.py +++ /dev/null @@ -1,771 +0,0 @@ -import typing as ty -import pytest -import attr - -from ..task import ShellCommandTask -from ..submitter import Submitter -from ..core import Workflow -from ..specs import ShellOutSpec, SpecInfo, File, ShellSpec -from ..environments import Docker -from .utils import no_win, need_docker, result_submitter, result_no_submitter - - -@no_win -@need_docker -def test_docker_1_nosubm(): - """simple command in a container, a default bindings and working directory is added - no submitter - """ - cmd = "whoami" - docky = ShellCommandTask( - name="docky", executable=cmd, environment=Docker(image="busybox") - ) - assert docky.environment.image == "busybox" - assert docky.environment.tag == "latest" - assert isinstance(docky.environment, Docker) - assert docky.cmdline == cmd - - res = docky() - assert res.output.stdout == "root\n" - assert res.output.return_code == 0 - - -@no_win -@need_docker -def test_docker_1(plugin): - """simple command in a container, a default bindings and working directory is added - using submitter - """ - cmd = "whoami" - docky = ShellCommandTask( - name="docky", executable=cmd, environment=Docker(image="busybox") - ) - - with Submitter(plugin=plugin) as sub: - docky(submitter=sub) - - res = docky.result() - assert res.output.stdout == "root\n" - assert res.output.return_code == 0 - - -@no_win -@need_docker -@pytest.mark.parametrize("results_function", [result_no_submitter, result_submitter]) -def test_docker_2(results_function, plugin): - """a command with arguments, cmd and args given as executable - with and without submitter - """ - cmd = ["echo", "hail", "pydra"] - docky = ShellCommandTask( - name="docky", executable=cmd, environment=Docker(image="busybox") - ) - # cmdline doesn't know anything about docker - assert docky.cmdline == " ".join(cmd) - res = results_function(docky, plugin) - assert res.output.stdout.strip() == " ".join(cmd[1:]) - assert res.output.return_code == 0 - - -@no_win -@need_docker -@pytest.mark.parametrize("results_function", [result_no_submitter, result_submitter]) -def test_docker_2a(results_function, plugin): - """a 
command with arguments, using executable and args - using submitter - """ - cmd_exec = "echo" - cmd_args = ["hail", "pydra"] - # separate command into exec + args - docky = ShellCommandTask( - name="docky", - executable=cmd_exec, - args=cmd_args, - environment=Docker(image="busybox"), - ) - assert docky.inputs.executable == "echo" - assert docky.cmdline == f"{cmd_exec} {' '.join(cmd_args)}" - - res = results_function(docky, plugin) - assert res.output.stdout.strip() == " ".join(cmd_args) - assert res.output.return_code == 0 - - -# tests with State - - -@no_win -@need_docker -@pytest.mark.parametrize("results_function", [result_no_submitter, result_submitter]) -def test_docker_st_1(results_function, plugin): - """commands without arguments in container - splitter = executable - """ - cmd = ["pwd", "whoami"] - docky = ShellCommandTask(name="docky", environment=Docker(image="busybox")).split( - "executable", executable=cmd - ) - assert docky.state.splitter == "docky.executable" - - res = results_function(docky, plugin) - assert res[0].output.stdout == f"/mnt/pydra{docky.output_dir[0]}\n" - assert res[1].output.stdout == "root\n" - assert res[0].output.return_code == res[1].output.return_code == 0 - - -# tests with customized output_spec - - -@no_win -@need_docker -def test_docker_outputspec_1(plugin, tmp_path): - """ - customised output_spec, adding files to the output, providing specific pathname - output_path is automatically added to the bindings - """ - cmd = ["touch", "newfile_tmp.txt"] - my_output_spec = SpecInfo( - name="Output", - fields=[("newfile", File, "newfile_tmp.txt")], - bases=(ShellOutSpec,), - ) - docky = ShellCommandTask( - name="docky", - environment=Docker(image="ubuntu"), - executable=cmd, - output_spec=my_output_spec, - ) - - with Submitter(plugin=plugin) as sub: - docky(submitter=sub) - - res = docky.result() - assert res.output.stdout == "" - - -# tests with customised input_spec - - -@no_win -@need_docker -def test_docker_inputspec_1(tmp_path): - """a simple customized input spec for docker task""" - filename = str(tmp_path / "file_pydra.txt") - with open(filename, "w") as f: - f.write("hello from pydra") - - cmd = "cat" - - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "file", - attr.ib( - type=File, - metadata={ - "mandatory": True, - "position": 1, - "argstr": "", - "help_string": "input file", - }, - ), - ) - ], - bases=(ShellSpec,), - ) - - docky = ShellCommandTask( - name="docky", - environment=Docker(image="busybox"), - executable=cmd, - file=filename, - input_spec=my_input_spec, - strip=True, - ) - - res = docky() - assert res.output.stdout == "hello from pydra" - - -@no_win -@need_docker -def test_docker_inputspec_1a(tmp_path): - """a simple customized input spec for docker task - a default value is used - """ - filename = str(tmp_path / "file_pydra.txt") - with open(filename, "w") as f: - f.write("hello from pydra") - - cmd = "cat" - - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "file", - attr.ib( - type=File, - default=filename, - metadata={"position": 1, "argstr": "", "help_string": "input file"}, - ), - ) - ], - bases=(ShellSpec,), - ) - - docky = ShellCommandTask( - name="docky", - environment=Docker(image="busybox"), - executable=cmd, - input_spec=my_input_spec, - strip=True, - ) - - res = docky() - assert res.output.stdout == "hello from pydra" - - -@no_win -@need_docker -def test_docker_inputspec_2(plugin, tmp_path): - """a customized input spec with two fields for docker task""" - filename_1 = tmp_path / "file_pydra.txt" 
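What makes the file-input docker tests below work is the binding logic from _prepare_bindings/get_bindings earlier in this diff: the parent directory of every File-typed input is mounted into the container under a root such as /mnt/pydra (read-only unless the field requests a copy), and the command is rewritten to use the in-container path. A conceptual sketch of that mapping, with illustrative host paths; the docker command in the comment only approximates what the Docker environment assembles.

    host_file = "/home/user/data/file_pydra.txt"        # illustrative host path
    host_dir = "/home/user/data"                         # its parent directory
    root = "/mnt/pydra"                                  # assumed container mount root
    bindings = {host_dir: (f"{root}{host_dir}", "ro")}   # parent dir mounted read-only
    container_path = f"{root}{host_file}"                # path substituted into the command
    # roughly: docker run -v /home/user/data:/mnt/pydra/home/user/data:ro busybox \
    #              cat /mnt/pydra/home/user/data/file_pydra.txt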
- with open(filename_1, "w") as f: - f.write("hello from pydra\n") - - filename_2 = tmp_path / "file_nice.txt" - with open(filename_2, "w") as f: - f.write("have a nice one") - - cmd = "cat" - - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "file1", - attr.ib( - type=File, - metadata={ - "position": 1, - "argstr": "", - "help_string": "input file 1", - }, - ), - ), - ( - "file2", - attr.ib( - type=File, - default=filename_2, - metadata={ - "position": 2, - "argstr": "", - "help_string": "input file 2", - }, - ), - ), - ], - bases=(ShellSpec,), - ) - - docky = ShellCommandTask( - name="docky", - environment=Docker(image="busybox"), - executable=cmd, - file1=filename_1, - input_spec=my_input_spec, - strip=True, - ) - - res = docky() - assert res.output.stdout == "hello from pydra\nhave a nice one" - - -@no_win -@need_docker -def test_docker_inputspec_2a_except(plugin, tmp_path): - """a customized input spec with two fields - first one uses a default, and second doesn't - raises a dataclass exception - """ - filename_1 = tmp_path / "file_pydra.txt" - with open(filename_1, "w") as f: - f.write("hello from pydra\n") - filename_2 = tmp_path / "file_nice.txt" - with open(filename_2, "w") as f: - f.write("have a nice one") - - cmd = "cat" - - # the field with default value can't be before value without default - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "file1", - attr.ib( - type=File, - default=filename_1, - metadata={ - "position": 1, - "argstr": "", - "help_string": "input file 1", - }, - ), - ), - ( - "file2", - attr.ib( - type=File, - metadata={ - "position": 2, - "argstr": "", - "help_string": "input file 2", - }, - ), - ), - ], - bases=(ShellSpec,), - ) - - docky = ShellCommandTask( - name="docky", - environment=Docker(image="busybox"), - executable=cmd, - file2=filename_2, - input_spec=my_input_spec, - strip=True, - ) - assert docky.inputs.file2.fspath == filename_2 - - res = docky() - assert res.output.stdout == "hello from pydra\nhave a nice one" - - -@no_win -@need_docker -def test_docker_inputspec_2a(plugin, tmp_path): - """a customized input spec with two fields - first one uses a default value - this is fine even if the second field is not using any defaults - """ - filename_1 = tmp_path / "file_pydra.txt" - with open(filename_1, "w") as f: - f.write("hello from pydra\n") - filename_2 = tmp_path / "file_nice.txt" - with open(filename_2, "w") as f: - f.write("have a nice one") - - cmd = "cat" - - # if you want set default in the first field you can use default value - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "file1", - attr.ib( - type=File, - default=filename_1, - metadata={ - "position": 1, - "argstr": "", - "help_string": "input file 1", - }, - ), - ), - ( - "file2", - attr.ib( - type=File, - metadata={ - "position": 2, - "argstr": "", - "help_string": "input file 2", - }, - ), - ), - ], - bases=(ShellSpec,), - ) - - docky = ShellCommandTask( - name="docky", - environment=Docker(image="busybox"), - executable=cmd, - file2=filename_2, - input_spec=my_input_spec, - strip=True, - ) - - res = docky() - assert res.output.stdout == "hello from pydra\nhave a nice one" - - -@no_win -@need_docker -@pytest.mark.xfail(reason="'docker' not in /proc/1/cgroup on ubuntu; TODO") -def test_docker_inputspec_3(plugin, tmp_path): - """input file is in the container, so metadata["container_path"]: True, - the input will be treated as a str""" - filename = "/proc/1/cgroup" - - cmd = "cat" - - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - 
"file", - attr.ib( - type=File, - metadata={ - "mandatory": True, - "position": 1, - "argstr": "", - "help_string": "input file", - "container_path": True, - }, - ), - ) - ], - bases=(ShellSpec,), - ) - - docky = ShellCommandTask( - name="docky", - environment=Docker(image="busybox"), - executable=cmd, - file=filename, - input_spec=my_input_spec, - strip=True, - ) - - cmdline = docky.cmdline - res = docky() - assert "docker" in res.output.stdout - assert cmdline == docky.cmdline - - -@no_win -@need_docker -def test_docker_cmd_inputspec_copyfile_1(plugin, tmp_path): - """shelltask changes a file in place, - adding copyfile=True to the file-input from input_spec - hardlink or copy in the output_dir should be created - """ - file = tmp_path / "file_pydra.txt" - with open(file, "w") as f: - f.write("hello from pydra\n") - - cmd = ["sed", "-is", "s/hello/hi/"] - - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "orig_file", - attr.ib( - type=File, - metadata={ - "position": 1, - "argstr": "", - "help_string": "orig file", - "mandatory": True, - "copyfile": "copy", - }, - ), - ), - ( - "out_file", - attr.ib( - type=str, - metadata={ - "output_file_template": "{orig_file}", - "help_string": "output file", - }, - ), - ), - ], - bases=(ShellSpec,), - ) - - docky = ShellCommandTask( - name="docky", - environment=Docker(image="busybox"), - executable=cmd, - input_spec=my_input_spec, - orig_file=str(file), - ) - - res = docky() - assert res.output.stdout == "" - out_file = res.output.out_file.fspath - assert out_file.exists() - # the file is copied, and than it is changed in place - assert out_file.parent == docky.output_dir - with open(out_file) as f: - assert "hi from pydra\n" == f.read() - # the original file is unchanged - with open(file) as f: - assert "hello from pydra\n" == f.read() - - -@no_win -@need_docker -def test_docker_inputspec_state_1(plugin, tmp_path): - """a customised input spec for a docker file with a splitter, - splitter is on files - """ - filename_1 = tmp_path / "file_pydra.txt" - with open(filename_1, "w") as f: - f.write("hello from pydra\n") - filename_2 = tmp_path / "file_nice.txt" - with open(filename_2, "w") as f: - f.write("have a nice one") - - cmd = "cat" - - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "file", - attr.ib( - type=File, - metadata={ - "mandatory": True, - "position": 1, - "argstr": "", - "help_string": "input file", - }, - ), - ) - ], - bases=(ShellSpec,), - ) - - docky = ShellCommandTask( - name="docky", - environment=Docker(image="busybox"), - executable=cmd, - input_spec=my_input_spec, - strip=True, - ).split("file", file=[str(filename_1), str(filename_2)]) - - res = docky() - assert res[0].output.stdout == "hello from pydra" - assert res[1].output.stdout == "have a nice one" - - -@no_win -@need_docker -def test_docker_inputspec_state_1b(plugin, tmp_path): - """a customised input spec for a docker file with a splitter, - files from the input spec have the same path in the local os and the container, - so hash is calculated and the test works fine - """ - file_1 = tmp_path / "file_pydra.txt" - file_2 = tmp_path / "file_nice.txt" - with open(file_1, "w") as f: - f.write("hello from pydra") - with open(file_2, "w") as f: - f.write("have a nice one") - - cmd = "cat" - filename = [] - - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "file", - attr.ib( - type=File, - metadata={ - "mandatory": True, - "position": 1, - "argstr": "", - "help_string": "input file", - }, - ), - ) - ], - bases=(ShellSpec,), - ) - - docky = 
ShellCommandTask( - name="docky", - environment=Docker(image="busybox"), - executable=cmd, - input_spec=my_input_spec, - strip=True, - ).split("file", file=[str(file_1), str(file_2)]) - - res = docky() - assert res[0].output.stdout == "hello from pydra" - assert res[1].output.stdout == "have a nice one" - - -@no_win -@need_docker -def test_docker_wf_inputspec_1(plugin, tmp_path): - """a customized input spec for workflow with docker tasks""" - filename = tmp_path / "file_pydra.txt" - with open(filename, "w") as f: - f.write("hello from pydra") - - cmd = "cat" - - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "file", - attr.ib( - type=File, - metadata={ - "mandatory": True, - "position": 1, - "argstr": "", - "help_string": "input file", - }, - ), - ) - ], - bases=(ShellSpec,), - ) - - wf = Workflow(name="wf", input_spec=["cmd", "file"]) - wf.inputs.cmd = cmd - wf.inputs.file = filename - - docky = ShellCommandTask( - name="docky", - environment=Docker(image="busybox"), - executable=wf.lzin.cmd, - file=wf.lzin.file, - input_spec=my_input_spec, - strip=True, - ) - wf.add(docky) - - wf.set_output([("out", wf.docky.lzout.stdout)]) - - with Submitter(plugin=plugin) as sub: - wf(submitter=sub) - - res = wf.result() - assert res.output.out == "hello from pydra" - - -@no_win -@need_docker -def test_docker_wf_state_inputspec_1(plugin, tmp_path): - """a customized input spec for workflow with docker tasks that has a state""" - file_1 = tmp_path / "file_pydra.txt" - file_2 = tmp_path / "file_nice.txt" - with open(file_1, "w") as f: - f.write("hello from pydra") - with open(file_2, "w") as f: - f.write("have a nice one") - - cmd = "cat" - - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "file", - attr.ib( - type=File, - metadata={ - "mandatory": True, - "position": 1, - "argstr": "", - "help_string": "input file", - }, - ), - ) - ], - bases=(ShellSpec,), - ) - - wf = Workflow(name="wf", input_spec=["cmd", "file"]) - wf.split(file=[str(file_1), str(file_2)]) - wf.inputs.cmd = cmd - - docky = ShellCommandTask( - name="docky", - environment=Docker(image="busybox"), - executable=wf.lzin.cmd, - file=wf.lzin.file, - input_spec=my_input_spec, - strip=True, - ) - wf.add(docky) - - wf.set_output([("out", wf.docky.lzout.stdout)]) - - with Submitter(plugin=plugin) as sub: - wf(submitter=sub) - - res = wf.result() - assert res[0].output.out == "hello from pydra" - assert res[1].output.out == "have a nice one" - - -@no_win -@need_docker -def test_docker_wf_ndst_inputspec_1(plugin, tmp_path): - """a customized input spec for workflow with docker tasks with states""" - file_1 = tmp_path / "file_pydra.txt" - file_2 = tmp_path / "file_nice.txt" - with open(file_1, "w") as f: - f.write("hello from pydra") - with open(file_2, "w") as f: - f.write("have a nice one") - - cmd = "cat" - - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "file", - attr.ib( - type=File, - metadata={ - "mandatory": True, - "position": 1, - "argstr": "", - "help_string": "input file", - }, - ), - ) - ], - bases=(ShellSpec,), - ) - - wf = Workflow(name="wf", input_spec=["cmd", "file"]) - wf.inputs.cmd = cmd - - docky = ShellCommandTask( - name="docky", - environment=Docker(image="busybox"), - executable=wf.lzin.cmd, - file=wf.lzin.file, - input_spec=my_input_spec, - strip=True, - ).split("file", file=[str(file_1), str(file_2)]) - wf.add(docky) - - wf.set_output([("out", wf.docky.lzout.stdout)]) - - with Submitter(plugin=plugin) as sub: - wf(submitter=sub) - - res = wf.result() - assert res.output.out == ["hello 
from pydra", "have a nice one"] diff --git a/pydra/engine/tests/test_environments.py b/pydra/engine/tests/test_environments.py deleted file mode 100644 index bd05d9daed..0000000000 --- a/pydra/engine/tests/test_environments.py +++ /dev/null @@ -1,539 +0,0 @@ -from pathlib import Path - -from ..environments import Native, Docker, Singularity -from ..task import ShellCommandTask -from ..submitter import Submitter -from ..specs import ( - ShellSpec, - SpecInfo, - File, -) -from .utils import no_win, need_docker, need_singularity - -import attr -import pytest - - -def makedir(path, name): - newdir = path / name - newdir.mkdir() - return newdir - - -def test_native_1(tmp_path): - """simple command, no arguments""" - newcache = lambda x: makedir(tmp_path, x) - - cmd = ["whoami"] - shelly = ShellCommandTask( - name="shelly", executable=cmd, cache_dir=newcache("shelly") - ) - assert shelly.cmdline == " ".join(cmd) - - env_res = Native().execute(shelly) - shelly() - assert env_res == shelly.output_ - - shelly_call = ShellCommandTask( - name="shelly_call", executable=cmd, cache_dir=newcache("shelly_call") - ) - shelly_call(environment=Native()) - assert env_res == shelly_call.output_ - - shelly_subm = ShellCommandTask( - name="shelly_subm", executable=cmd, cache_dir=newcache("shelly_subm") - ) - with Submitter(plugin="cf") as sub: - shelly_subm(submitter=sub, environment=Native()) - assert env_res == shelly_subm.result().output.__dict__ - - -@no_win -@need_docker -def test_docker_1(tmp_path): - """docker env: simple command, no arguments""" - newcache = lambda x: makedir(tmp_path, x) - - cmd = ["whoami"] - docker = Docker(image="busybox") - shelly = ShellCommandTask( - name="shelly", executable=cmd, cache_dir=newcache("shelly") - ) - assert shelly.cmdline == " ".join(cmd) - env_res = docker.execute(shelly) - - shelly_env = ShellCommandTask( - name="shelly", - executable=cmd, - cache_dir=newcache("shelly_env"), - environment=docker, - ) - shelly_env() - assert env_res == shelly_env.output_ == shelly_env.result().output.__dict__ - - shelly_call = ShellCommandTask( - name="shelly", executable=cmd, cache_dir=newcache("shelly_call") - ) - shelly_call(environment=docker) - assert env_res == shelly_call.output_ == shelly_call.result().output.__dict__ - - -@no_win -@need_docker -@pytest.mark.parametrize( - "docker", - [ - Docker(image="busybox"), - Docker(image="busybox", tag="latest", xargs="--rm"), - Docker(image="busybox", xargs=["--rm"]), - ], -) -def test_docker_1_subm(tmp_path, docker): - """docker env with submitter: simple command, no arguments""" - newcache = lambda x: makedir(tmp_path, x) - - cmd = ["whoami"] - docker = Docker(image="busybox") - shelly = ShellCommandTask( - name="shelly", executable=cmd, cache_dir=newcache("shelly") - ) - assert shelly.cmdline == " ".join(cmd) - env_res = docker.execute(shelly) - - shelly_env = ShellCommandTask( - name="shelly", - executable=cmd, - cache_dir=newcache("shelly_env"), - environment=docker, - ) - with Submitter(plugin="cf") as sub: - shelly_env(submitter=sub) - assert env_res == shelly_env.result().output.__dict__ - - shelly_call = ShellCommandTask( - name="shelly", executable=cmd, cache_dir=newcache("shelly_call") - ) - with Submitter(plugin="cf") as sub: - shelly_call(submitter=sub, environment=docker) - assert env_res == shelly_call.result().output.__dict__ - - -@no_win -@need_singularity -def test_singularity_1(tmp_path): - """singularity env: simple command, no arguments""" - newcache = lambda x: makedir(tmp_path, x) - - cmd = ["whoami"] - sing = 
Singularity(image="docker://alpine") - shelly = ShellCommandTask( - name="shelly", executable=cmd, cache_dir=newcache("shelly") - ) - assert shelly.cmdline == " ".join(cmd) - env_res = sing.execute(shelly) - - shelly_env = ShellCommandTask( - name="shelly", - executable=cmd, - cache_dir=newcache("shelly_env"), - environment=sing, - ) - shelly_env() - assert env_res == shelly_env.output_ == shelly_env.result().output.__dict__ - - shelly_call = ShellCommandTask( - name="shelly", executable=cmd, cache_dir=newcache("shelly_call") - ) - shelly_call(environment=sing) - assert env_res == shelly_call.output_ == shelly_call.result().output.__dict__ - - -@no_win -@need_singularity -def test_singularity_1_subm(tmp_path, plugin): - """docker env with submitter: simple command, no arguments""" - newcache = lambda x: makedir(tmp_path, x) - - cmd = ["whoami"] - sing = Singularity(image="docker://alpine") - shelly = ShellCommandTask( - name="shelly", executable=cmd, cache_dir=newcache("shelly") - ) - assert shelly.cmdline == " ".join(cmd) - env_res = sing.execute(shelly) - - shelly_env = ShellCommandTask( - name="shelly", - executable=cmd, - cache_dir=newcache("shelly_env"), - environment=sing, - ) - with Submitter(plugin=plugin) as sub: - shelly_env(submitter=sub) - assert env_res == shelly_env.result().output.__dict__ - - shelly_call = ShellCommandTask( - name="shelly", executable=cmd, cache_dir=newcache("shelly_call") - ) - with Submitter(plugin=plugin) as sub: - shelly_call(submitter=sub, environment=sing) - for key in [ - "stdout", - "return_code", - ]: # singularity gives info about cashed image in stderr - assert env_res[key] == shelly_call.result().output.__dict__[key] - - -def create_shelly_inputfile(tempdir, filename, name, executable): - """creating a task with a simple input_spec""" - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "file", - attr.ib( - type=File, - metadata={ - "position": 1, - "help_string": "files", - "mandatory": True, - "argstr": "", - }, - ), - ) - ], - bases=(ShellSpec,), - ) - - kwargs = {} if filename is None else {"file": filename} - shelly = ShellCommandTask( - name=name, - executable=executable, - cache_dir=makedir(tempdir, name), - input_spec=my_input_spec, - **kwargs, - ) - return shelly - - -def test_shell_fileinp(tmp_path): - """task with a file in the command/input""" - input_dir = makedir(tmp_path, "inputs") - filename = input_dir / "file.txt" - with open(filename, "w") as f: - f.write("hello ") - - shelly = create_shelly_inputfile( - tempdir=tmp_path, filename=filename, name="shelly", executable=["cat"] - ) - env_res = Native().execute(shelly) - - shelly_env = create_shelly_inputfile( - tempdir=tmp_path, filename=filename, name="shelly_env", executable=["cat"] - ) - shelly_env.environment = Native() - shelly_env() - assert env_res == shelly_env.output_ == shelly_env.result().output.__dict__ - - shelly_call = create_shelly_inputfile( - tempdir=tmp_path, filename=filename, name="shelly_call", executable=["cat"] - ) - shelly_call(environment=Native()) - assert env_res == shelly_call.output_ == shelly_call.result().output.__dict__ - - -def test_shell_fileinp_st(tmp_path): - """task (with a splitter) with a file in the command/input""" - input_dir = makedir(tmp_path, "inputs") - filename_1 = input_dir / "file_1.txt" - with open(filename_1, "w") as f: - f.write("hello ") - - filename_2 = input_dir / "file_2.txt" - with open(filename_2, "w") as f: - f.write("hi ") - - filename = [filename_1, filename_2] - - shelly_env = create_shelly_inputfile( - 
tempdir=tmp_path, filename=None, name="shelly_env", executable=["cat"] - ) - shelly_env.environment = Native() - shelly_env.split(file=filename) - shelly_env() - assert shelly_env.result()[0].output.stdout.strip() == "hello" - assert shelly_env.result()[1].output.stdout.strip() == "hi" - - shelly_call = create_shelly_inputfile( - tempdir=tmp_path, filename=None, name="shelly_call", executable=["cat"] - ) - shelly_call.split(file=filename) - shelly_call(environment=Native()) - assert shelly_call.result()[0].output.stdout.strip() == "hello" - assert shelly_call.result()[1].output.stdout.strip() == "hi" - - -@no_win -@need_docker -def test_docker_fileinp(tmp_path): - """docker env: task with a file in the command/input""" - docker = Docker(image="busybox") - - input_dir = makedir(tmp_path, "inputs") - filename = input_dir / "file.txt" - with open(filename, "w") as f: - f.write("hello ") - - shelly = create_shelly_inputfile( - tempdir=tmp_path, filename=filename, name="shelly", executable=["cat"] - ) - env_res = docker.execute(shelly) - - shelly_env = create_shelly_inputfile( - tempdir=tmp_path, filename=filename, name="shelly_env", executable=["cat"] - ) - shelly_env.environment = docker - shelly_env() - - assert env_res == shelly_env.output_ == shelly_env.result().output.__dict__ - - shelly_call = create_shelly_inputfile( - tempdir=tmp_path, filename=filename, name="shelly_call", executable=["cat"] - ) - shelly_call(environment=docker) - assert env_res == shelly_call.output_ == shelly_call.result().output.__dict__ - - -@no_win -@need_docker -def test_docker_fileinp_subm(tmp_path, plugin): - """docker env with a submitter: task with a file in the command/input""" - docker = Docker(image="busybox") - - input_dir = makedir(tmp_path, "inputs") - filename = input_dir / "file.txt" - with open(filename, "w") as f: - f.write("hello ") - - shelly = create_shelly_inputfile( - tempdir=tmp_path, filename=filename, name="shelly", executable=["cat"] - ) - env_res = docker.execute(shelly) - - shelly_env = create_shelly_inputfile( - tempdir=tmp_path, filename=filename, name="shelly_env", executable=["cat"] - ) - shelly_env.environment = docker - with Submitter(plugin=plugin) as sub: - shelly_env(submitter=sub) - assert env_res == shelly_env.result().output.__dict__ - - shelly_call = create_shelly_inputfile( - tempdir=tmp_path, filename=filename, name="shelly_call", executable=["cat"] - ) - with Submitter(plugin=plugin) as sub: - shelly_call(submitter=sub, environment=docker) - assert env_res == shelly_call.result().output.__dict__ - - -@no_win -@need_docker -def test_docker_fileinp_st(tmp_path): - """docker env: task (with a splitter) with a file in the command/input""" - docker = Docker(image="busybox") - - input_dir = makedir(tmp_path, "inputs") - filename_1 = input_dir / "file_1.txt" - with open(filename_1, "w") as f: - f.write("hello ") - - filename_2 = input_dir / "file_2.txt" - with open(filename_2, "w") as f: - f.write("hi ") - - filename = [filename_1, filename_2] - - shelly_env = create_shelly_inputfile( - tempdir=tmp_path, filename=None, name="shelly_env", executable=["cat"] - ) - shelly_env.environment = docker - shelly_env.split(file=filename) - shelly_env() - assert shelly_env.result()[0].output.stdout.strip() == "hello" - assert shelly_env.result()[1].output.stdout.strip() == "hi" - - shelly_call = create_shelly_inputfile( - tempdir=tmp_path, filename=None, name="shelly_call", executable=["cat"] - ) - shelly_call.split(file=filename) - shelly_call(environment=docker) - assert 
shelly_call.result()[0].output.stdout.strip() == "hello" - assert shelly_call.result()[1].output.stdout.strip() == "hi" - - -def create_shelly_outputfile(tempdir, filename, name, executable="cp"): - """creating a task with an input_spec that contains a template""" - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "file_orig", - attr.ib( - type=File, - metadata={"position": 2, "help_string": "new file", "argstr": ""}, - ), - ), - ( - "file_copy", - attr.ib( - type=str, - metadata={ - "output_file_template": "{file_orig}_copy", - "help_string": "output file", - "argstr": "", - }, - ), - ), - ], - bases=(ShellSpec,), - ) - - kwargs = {} if filename is None else {"file_orig": filename} - shelly = ShellCommandTask( - name=name, - executable=executable, - cache_dir=makedir(tempdir, name), - input_spec=my_input_spec, - **kwargs, - ) - return shelly - - -def test_shell_fileout(tmp_path): - """task with a file in the output""" - input_dir = makedir(tmp_path, "inputs") - filename = input_dir / "file.txt" - with open(filename, "w") as f: - f.write("hello ") - - # execute does not create the cashedir, so this part will fail, - # but I guess we don't want to use it this way anyway - # shelly = create_shelly_outputfile(tempdir=tmp_path, filename=filename, name="shelly") - # env_res = Native().execute(shelly) - - shelly_env = create_shelly_outputfile( - tempdir=tmp_path, filename=filename, name="shelly_env" - ) - shelly_env.environment = Native() - shelly_env() - assert ( - Path(shelly_env.result().output.file_copy) - == shelly_env.output_dir / "file_copy.txt" - ) - - shelly_call = create_shelly_outputfile( - tempdir=tmp_path, filename=filename, name="shelly_call" - ) - shelly_call(environment=Native()) - assert ( - Path(shelly_call.result().output.file_copy) - == shelly_call.output_dir / "file_copy.txt" - ) - - -def test_shell_fileout_st(tmp_path): - """task (with a splitter) with a file in the output""" - input_dir = makedir(tmp_path, "inputs") - filename_1 = input_dir / "file_1.txt" - with open(filename_1, "w") as f: - f.write("hello ") - - filename_2 = input_dir / "file_2.txt" - with open(filename_2, "w") as f: - f.write("hi ") - - filename = [filename_1, filename_2] - - shelly_env = create_shelly_outputfile( - tempdir=tmp_path, filename=None, name="shelly_env" - ) - shelly_env.environment = Native() - shelly_env.split(file_orig=filename) - shelly_env() - assert ( - Path(shelly_env.result()[0].output.file_copy) - == shelly_env.output_dir[0] / "file_1_copy.txt" - ) - assert ( - Path(shelly_env.result()[1].output.file_copy) - == shelly_env.output_dir[1] / "file_2_copy.txt" - ) - - shelly_call = create_shelly_outputfile( - tempdir=tmp_path, filename=None, name="shelly_call" - ) - shelly_call.split(file_orig=filename) - shelly_call(environment=Native()) - assert ( - Path(shelly_call.result()[0].output.file_copy) - == shelly_call.output_dir[0] / "file_1_copy.txt" - ) - assert ( - Path(shelly_call.result()[1].output.file_copy) - == shelly_call.output_dir[1] / "file_2_copy.txt" - ) - - -@no_win -@need_docker -def test_docker_fileout(tmp_path): - """docker env: task with a file in the output""" - docker_env = Docker(image="busybox") - - input_dir = makedir(tmp_path, "inputs") - filename = input_dir / "file.txt" - with open(filename, "w") as f: - f.write("hello ") - - shelly_env = create_shelly_outputfile( - tempdir=tmp_path, filename=filename, name="shelly_env" - ) - shelly_env.environment = docker_env - shelly_env() - assert ( - Path(shelly_env.result().output.file_copy) - == 
shelly_env.output_dir / "file_copy.txt" - ) - - -@no_win -@need_docker -def test_docker_fileout_st(tmp_path): - """docker env: task (with a splitter) with a file in the output""" - docker_env = Docker(image="busybox") - - input_dir = makedir(tmp_path, "inputs") - filename_1 = input_dir / "file_1.txt" - with open(filename_1, "w") as f: - f.write("hello ") - - filename_2 = input_dir / "file_2.txt" - with open(filename_2, "w") as f: - f.write("hi ") - - filename = [filename_1, filename_2] - - shelly_env = create_shelly_outputfile( - tempdir=tmp_path, filename=None, name="shelly_env" - ) - shelly_env.environment = docker_env - shelly_env.split(file_orig=filename) - shelly_env() - assert ( - Path(shelly_env.result()[0].output.file_copy) - == shelly_env.output_dir[0] / "file_1_copy.txt" - ) - assert ( - Path(shelly_env.result()[1].output.file_copy) - == shelly_env.output_dir[1] / "file_2_copy.txt" - ) diff --git a/pydra/engine/tests/test_error_handling.py b/pydra/engine/tests/test_error_handling.py new file mode 100644 index 0000000000..a46e8acce3 --- /dev/null +++ b/pydra/engine/tests/test_error_handling.py @@ -0,0 +1,146 @@ +import typing as ty +import sys +import pytest +import cloudpickle as cp +from pathlib import Path +import glob as glob +from pydra.compose import python, workflow +from pydra.engine.submitter import Submitter + + +no_win = pytest.mark.skipif( + sys.platform.startswith("win"), + reason="docker/singularity command not adjusted for windows", +) + + +def test_exception_func(): + @python.define + def raise_exception(c, d): + raise Exception() + + bad_funk = raise_exception(c=17, d=3.2) + assert pytest.raises(Exception, bad_funk) + + +def test_result_none_1(): + """checking if None is properly returned as the result""" + + @python.define + def FunNone(x): + return None + + task = FunNone(x=3) + outputs = task() + assert outputs.out is None + + +def test_result_none_2(): + """checking if None is properly set for all outputs""" + + @python.define(outputs=["out1", "out2"]) + def FunNone(x) -> tuple[ty.Any, ty.Any]: + return None # Do we actually want this behaviour? 
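For contrast with the None case above, a task declared with several outputs normally returns a tuple that is mapped onto the declared output names in order (the same mapping the removed _run_task implemented with zip(output_names, output)). A small sketch with an illustrative task:

    from pydra.compose import python

    @python.define(outputs=["quotient", "remainder"])
    def DivMod(x: int, y: int) -> tuple[int, int]:
        return x // y, x % y

    outputs = DivMod(x=7, y=3)()
    assert outputs.quotient == 2
    assert outputs.remainder == 1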
+ + task = FunNone(x=3) + outputs = task() + assert outputs.out1 is None + assert outputs.out2 is None + + +def test_traceback(tmpdir): + """checking if the error raised in a function is properly returned; + checking if there is an error filename in the error message that contains + full traceback including the line in the python function + """ + + @python.define + def FunError(x): + raise Exception("Error from the function") + + with pytest.raises(Exception, match="Error from the function") as exinfo: + with Submitter(worker="cf", cache_root=tmpdir) as sub: + sub(FunError(x=3), raise_errors=True) + + # getting error file from the error message + error_file_match = ( + str(exinfo.value.__notes__[0]).split("here: ")[-1].split("_error.pklz")[0] + ) + error_file = Path(error_file_match) / "_error.pklz" + # checking if the file exists + assert error_file.exists() + # reading error message from the pickle file + error_tb = cp.loads(error_file.read_bytes())["error message"] + # the error traceback should be a list and should point to a specific line in the function + assert isinstance(error_tb, list) + assert "in FunError" in error_tb[-2] + + +def test_traceback_wf(tmp_path: Path): + """checking if the error raised in a function is properly returned by a workflow; + checking if there is an error filename in the error message that contains + full traceback including the line in the python function + """ + + @python.define + def FunError(x): + raise Exception("Error from the function") + + @workflow.define + def Workflow(x_list): + fun_error = workflow.add(FunError().split(x=x_list), name="fun_error") + return fun_error.out + + wf = Workflow(x_list=[3, 4]) + with pytest.raises(RuntimeError, match="Job 'fun_error.*, errored") as exinfo: + with Submitter(worker="cf", cache_root=tmp_path) as sub: + sub(wf, raise_errors=True) + + # getting error file from the error message + cache_dir_match = Path( + str(exinfo.value).split("See output directory for details: ")[-1].strip() + ) + assert cache_dir_match.exists() + error_file = cache_dir_match / "_error.pklz" + # checking if the file exists + assert error_file.exists() + assert "in FunError" in str(exinfo.value) + + +@pytest.mark.flaky(reruns=3) +def test_rerun_errored(tmp_path, capfd): + """Test rerunning a task containing errors. 
+ Only the errored tasks should be rerun""" + + @python.define + def PassOdds(x): + if x % 2 == 0: + print(f"x={x} -> x%2 = {bool(x % 2)} (even error)\n") + raise Exception("even error") + else: + print(f"x={x} -> x%2 = {bool(x % 2)}\n") + return x + + pass_odds = PassOdds().split("x", x=[1, 2, 3, 4, 5]) + + with pytest.raises(Exception): + pass_odds(cache_root=tmp_path, worker="cf", n_procs=3) + with pytest.raises(Exception): + pass_odds(cache_root=tmp_path, worker="cf", n_procs=3) + + out, err = capfd.readouterr() + stdout_lines = out.splitlines() + + tasks_run = 0 + errors_found = 0 + + for line in stdout_lines: + if "-> x%2" in line: + tasks_run += 1 + if "(even error)" in line: + errors_found += 1 + + # There should have been 5 messages of the form "x%2 = ..." after the first call, + # and only another 2 messages after the second call, since only the errored tasks are rerun + assert tasks_run == 7 + assert errors_found == 4 diff --git a/pydra/engine/tests/test_graph.py b/pydra/engine/tests/test_graph.py index 403b9e6ef9..43f3b2aa3c 100644 --- a/pydra/engine/tests/test_graph.py +++ b/pydra/engine/tests/test_graph.py @@ -1,5 +1,5 @@ -from ..graph import DiGraph -from .utils import DOT_FLAG +from pydra.engine.graph import DiGraph +from pydra.engine.tests.utils import DOT_FLAG import pytest @@ -64,15 +64,13 @@ def test_edges_3(): def test_edges_ecxeption_1(): - with pytest.raises(Exception) as excinfo: + with pytest.raises(Exception, match="Duplicate node names"): DiGraph(nodes=[A, B, A], edges=[(A, B)]) - assert "repeated elements" in str(excinfo.value) def test_edges_ecxeption_2(): - with pytest.raises(Exception) as excinfo: + with pytest.raises(Exception, match="can't be added"): DiGraph(nodes=[A, B], edges=[(A, C)]) - assert "can't be added" in str(excinfo.value) def test_sort_1(): diff --git a/pydra/engine/tests/test_helpers.py b/pydra/engine/tests/test_helpers.py deleted file mode 100644 index 48fd6e3120..0000000000 --- a/pydra/engine/tests/test_helpers.py +++ /dev/null @@ -1,364 +0,0 @@ -import os -import shutil -from pathlib import Path -import random -import platform -import typing as ty -import pytest -import attrs -import cloudpickle as cp -from unittest.mock import Mock -from fileformats.generic import Directory, File -from fileformats.core import FileSet -from .utils import multiply, raise_xeq1 -from ..helpers import ( - get_available_cpus, - save, - load_and_run, - position_sort, - parse_copyfile, - argstr_formatting, - parse_format_string, -) -from ...utils.hash import hash_function -from ..core import Workflow - - -def test_save(tmpdir): - outdir = Path(tmpdir) - with pytest.raises(ValueError): - save(tmpdir) - foo = multiply(name="mult", x=1, y=2) - # save task - save(outdir, task=foo) - del foo - # load saved task - task_pkl = outdir / "_task.pklz" - foo = cp.loads(task_pkl.read_bytes()) - assert foo.name == "mult" - assert foo.inputs.x == 1 and foo.inputs.y == 2 - # execute task and save result - res = foo() - assert res.output.out == 2 - save(outdir, result=res) - del res - # load saved result - res_pkl = outdir / "_result.pklz" - res = cp.loads(res_pkl.read_bytes()) - assert res.output.out == 2 - - -def test_hash_file(tmpdir): - outdir = Path(tmpdir) - with open(outdir / "test.file", "w") as fp: - fp.write("test") - assert ( - hash_function(File(outdir / "test.file")) == "f32ab20c4a86616e32bf2504e1ac5a22" - ) - - -def test_hashfun_float(): - import math - - pi_50 = 3.14159265358979323846264338327950288419716939937510 - pi_15 = 3.141592653589793 - pi_10 = 3.1415926536 - # comparing for x
that have the same x.as_integer_ratio() - assert ( - math.pi.as_integer_ratio() - == pi_50.as_integer_ratio() - == pi_15.as_integer_ratio() - ) - assert hash_function(math.pi) == hash_function(pi_15) == hash_function(pi_50) - # comparing for x that have different x.as_integer_ratio() - assert math.pi.as_integer_ratio() != pi_10.as_integer_ratio() - assert hash_function(math.pi) != hash_function(pi_10) - - -def test_hash_function_dict(): - dict1 = {"a": 10, "b": 5} - dict2 = {"b": 5, "a": 10} - assert hash_function(dict1) == hash_function(dict2) - - -def test_hash_function_list_tpl(): - lst = [2, 5.6, "ala"] - tpl = (2, 5.6, "ala") - assert hash_function(lst) != hash_function(tpl) - - -def test_hash_function_list_dict(): - lst = [2, {"a": "ala", "b": 1}] - hash_function(lst) - - -def test_hash_function_files(tmp_path: Path): - file_1 = tmp_path / "file_1.txt" - file_2 = tmp_path / "file_2.txt" - file_1.write_text("hello") - file_2.write_text("hello") - - assert hash_function(File(file_1)) == hash_function(File(file_2)) - - -def test_hash_function_dir_and_files_list(tmp_path: Path): - dir1 = tmp_path / "foo" - dir2 = tmp_path / "bar" - for d in (dir1, dir2): - d.mkdir() - for i in range(3): - f = d / f"{i}.txt" - f.write_text(str(i)) - - assert hash_function(Directory(dir1)) == hash_function(Directory(dir2)) - file_list1: ty.List[File] = [File(f) for f in dir1.iterdir()] - file_list2: ty.List[File] = [File(f) for f in dir2.iterdir()] - assert hash_function(file_list1) == hash_function(file_list2) - - -def test_hash_function_files_mismatch(tmp_path: Path): - file_1 = tmp_path / "file_1.txt" - file_2 = tmp_path / "file_2.txt" - file_1.write_text("hello") - file_2.write_text("hi") - - assert hash_function(File(file_1)) != hash_function(File(file_2)) - - -def test_hash_function_nested(tmp_path: Path): - dpath = tmp_path / "dir" - dpath.mkdir() - hidden = dpath / ".hidden" - nested = dpath / "nested" - hidden.mkdir() - nested.mkdir() - file_1 = dpath / "file_1.txt" - file_2 = hidden / "file_2.txt" - file_3 = nested / ".file_3.txt" - file_4 = nested / "file_4.txt" - - for fx in [file_1, file_2, file_3, file_4]: - fx.write_text(str(random.randint(0, 1000))) - - nested_dir = Directory(dpath) - - orig_hash = nested_dir.hash() - - nohidden_hash = nested_dir.hash(ignore_hidden_dirs=True, ignore_hidden_files=True) - nohiddendirs_hash = nested_dir.hash(ignore_hidden_dirs=True) - nohiddenfiles_hash = nested_dir.hash(ignore_hidden_files=True) - - assert orig_hash != nohidden_hash - assert orig_hash != nohiddendirs_hash - assert orig_hash != nohiddenfiles_hash - - os.remove(file_3) - assert nested_dir.hash() == nohiddenfiles_hash - shutil.rmtree(hidden) - assert nested_dir.hash() == nohidden_hash - - -def test_get_available_cpus(): - assert get_available_cpus() > 0 - try: - import psutil - - has_psutil = True - except ImportError: - has_psutil = False - - if hasattr(os, "sched_getaffinity"): - assert get_available_cpus() == len(os.sched_getaffinity(0)) - - if has_psutil and platform.system().lower() != "darwin": - assert get_available_cpus() == len(psutil.Process().cpu_affinity()) - - if platform.system().lower() == "darwin": - assert get_available_cpus() == os.cpu_count() - - -def test_load_and_run(tmpdir): - """testing load_and_run for pickled task""" - task_pkl = Path(tmpdir.join("task_main.pkl")) - - task = multiply(name="mult", y=10).split(x=[1, 2]) - task.state.prepare_states(inputs=task.inputs) - task.state.prepare_inputs() - with task_pkl.open("wb") as fp: - cp.dump(task, fp) - - resultfile_0 = 
load_and_run(task_pkl=task_pkl, ind=0) - resultfile_1 = load_and_run(task_pkl=task_pkl, ind=1) - # checking the result files - result_0 = cp.loads(resultfile_0.read_bytes()) - result_1 = cp.loads(resultfile_1.read_bytes()) - assert result_0.output.out == 10 - assert result_1.output.out == 20 - - -def test_load_and_run_exception_load(tmpdir): - """testing raising exception and saving info in crashfile when when load_and_run""" - task_pkl = Path(tmpdir.join("task_main.pkl")) - raise_xeq1(name="raise").split("x", x=[1, 2]) - with pytest.raises(FileNotFoundError): - load_and_run(task_pkl=task_pkl, ind=0) - - -def test_load_and_run_exception_run(tmpdir): - """testing raising exception and saving info in crashfile when when load_and_run""" - task_pkl = Path(tmpdir.join("task_main.pkl")) - - task = raise_xeq1(name="raise").split("x", x=[1, 2]) - task.state.prepare_states(inputs=task.inputs) - task.state.prepare_inputs() - - with task_pkl.open("wb") as fp: - cp.dump(task, fp) - - with pytest.raises(Exception) as excinfo: - load_and_run(task_pkl=task_pkl, ind=0) - assert "i'm raising an exception!" in str(excinfo.value) - # checking if the crashfile has been created - assert "crash" in str(excinfo.value) - errorfile = Path(str(excinfo.value).split("here: ")[1][:-2]) - assert errorfile.exists() - - resultfile = errorfile.parent / "_result.pklz" - assert resultfile.exists() - # checking the content - result_exception = cp.loads(resultfile.read_bytes()) - assert result_exception.errored is True - - # the second task should be fine - resultfile = load_and_run(task_pkl=task_pkl, ind=1) - result_1 = cp.loads(resultfile.read_bytes()) - assert result_1.output.out == 2 - - -def test_load_and_run_wf(tmpdir): - """testing load_and_run for pickled task""" - wf_pkl = Path(tmpdir.join("wf_main.pkl")) - - wf = Workflow(name="wf", input_spec=["x", "y"], y=10) - wf.add(multiply(name="mult", x=wf.lzin.x, y=wf.lzin.y)) - wf.split("x", x=[1, 2]) - - wf.set_output([("out", wf.mult.lzout.out)]) - - # task = multiply(name="mult", x=[1, 2], y=10).split("x") - wf.state.prepare_states(inputs=wf.inputs) - wf.state.prepare_inputs() - wf.plugin = "cf" - - with wf_pkl.open("wb") as fp: - cp.dump(wf, fp) - - resultfile_0 = load_and_run(ind=0, task_pkl=wf_pkl) - resultfile_1 = load_and_run(ind=1, task_pkl=wf_pkl) - # checking the result files - result_0 = cp.loads(resultfile_0.read_bytes()) - result_1 = cp.loads(resultfile_1.read_bytes()) - assert result_0.output.out == 10 - assert result_1.output.out == 20 - - -@pytest.mark.parametrize( - "pos_args", - [ - [(2, "b"), (1, "a"), (3, "c")], - [(-2, "b"), (1, "a"), (-1, "c")], - [(None, "b"), (1, "a"), (-1, "c")], - [(-3, "b"), (None, "a"), (-1, "c")], - [(None, "b"), (1, "a"), (None, "c")], - ], -) -def test_position_sort(pos_args): - final_args = position_sort(pos_args) - assert final_args == ["a", "b", "c"] - - -def test_parse_copyfile(): - Mode = FileSet.CopyMode - Collation = FileSet.CopyCollation - - def mock_field(copyfile): - mock = Mock(["metadata"]) - mock.metadata = {"copyfile": copyfile} - return mock - - assert parse_copyfile(mock_field((Mode.any, Collation.any))) == ( - Mode.any, - Collation.any, - ) - assert parse_copyfile(mock_field("copy"), default_collation=Collation.siblings) == ( - Mode.copy, - Collation.siblings, - ) - assert parse_copyfile(mock_field("link,adjacent")) == ( - Mode.link, - Collation.adjacent, - ) - assert parse_copyfile(mock_field(True)) == ( - Mode.copy, - Collation.any, - ) - assert parse_copyfile(mock_field(False)) == ( - Mode.link, - 
Collation.any, - ) - assert parse_copyfile(mock_field(None)) == ( - Mode.any, - Collation.any, - ) - with pytest.raises(TypeError, match="Unrecognised type for mode copyfile"): - parse_copyfile(mock_field((1, 2))) - with pytest.raises(TypeError, match="Unrecognised type for collation copyfile"): - parse_copyfile(mock_field((Mode.copy, 2))) - - -def test_argstr_formatting(): - @attrs.define - class Inputs: - a1_field: str - b2_field: float - c3_field: ty.Dict[str, str] - d4_field: ty.List[str] - - inputs = Inputs("1", 2.0, {"c": "3"}, ["4"]) - assert ( - argstr_formatting( - "{a1_field} {b2_field:02f} -test {c3_field[c]} -me {d4_field[0]}", - inputs, - ) - == "1 2.000000 -test 3 -me 4" - ) - - -def test_parse_format_string1(): - assert parse_format_string("{a}") == {"a"} - - -def test_parse_format_string2(): - assert parse_format_string("{abc}") == {"abc"} - - -def test_parse_format_string3(): - assert parse_format_string("{a:{b}}") == {"a", "b"} - - -def test_parse_format_string4(): - assert parse_format_string("{a:{b[2]}}") == {"a", "b"} - - -def test_parse_format_string5(): - assert parse_format_string("{a.xyz[somekey].abc:{b[a][b].d[0]}}") == {"a", "b"} - - -def test_parse_format_string6(): - assert parse_format_string("{a:05{b[a 2][b].e}}") == {"a", "b"} - - -def test_parse_format_string7(): - assert parse_format_string( - "{a1_field} {b2_field:02f} -test {c3_field[c]} -me {d4_field[0]}" - ) == {"a1_field", "b2_field", "c3_field", "d4_field"} diff --git a/pydra/engine/tests/test_helpers_state.py b/pydra/engine/tests/test_helpers_state.py deleted file mode 100644 index 45fcb68641..0000000000 --- a/pydra/engine/tests/test_helpers_state.py +++ /dev/null @@ -1,253 +0,0 @@ -from .. import helpers_state as hlpst - -import pytest - - -# TODO: feature? 
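# --- Editor's aside (not part of the patch): the deleted parametrisations below
# encode pydra's splitter notation, where a tuple such as ("a", "b") is a scalar
# ("dot") split that zips the inputs element-wise, a list such as ["a", "b"] is an
# outer ("star") split over their Cartesian product, and splitters are serialised
# to reverse Polish notation (e.g. ["a", "b", "."] vs ["a", "b", "*"]). The sketch
# below is a plain-Python illustration of those semantics, not pydra code; the
# input values match the node-task tests later in this diff.
from itertools import product

a_vals, b_vals = [3, 5], [10, 20]

# ("a", "b") -> RPN ["a", "b", "."]: inputs are zipped element-wise
dot_states = [{"a": a, "b": b} for a, b in zip(a_vals, b_vals)]
assert dot_states == [{"a": 3, "b": 10}, {"a": 5, "b": 20}]

# ["a", "b"] -> RPN ["a", "b", "*"]: Cartesian product of the inputs
star_states = [{"a": a, "b": b} for a, b in product(a_vals, b_vals)]
assert len(star_states) == 4  # {3, 5} x {10, 20}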
-class other_states_to_tests: - def __init__( - self, - splitter, - splitter_final=None, - keys_final=None, - ind_l=None, - ind_l_final=None, - ): - self.splitter = splitter - if splitter_final: - self.splitter_final = splitter_final - else: - self.splitter_final = splitter - self.other_states = {} - self.keys_final = keys_final - self.name = "NA" - self.ind_l = ind_l - if ind_l_final: - self.ind_l_final = ind_l_final - else: - self.ind_l_final = ind_l - - -@pytest.mark.parametrize( - "splitter, keys_exp, groups_exp, grstack_exp", - [ - ("a", ["a"], {"a": 0}, [[0]]), - (["a"], ["a"], {"a": 0}, [[0]]), - (("a",), ["a"], {"a": 0}, [[0]]), - (("a", "b"), ["a", "b"], {"a": 0, "b": 0}, [[0]]), - (["a", "b"], ["a", "b"], {"a": 0, "b": 1}, [[0, 1]]), - ([["a", "b"]], ["a", "b"], {"a": 0, "b": 1}, [[0, 1]]), - ((["a", "b"],), ["a", "b"], {"a": 0, "b": 1}, [[0, 1]]), - ((["a", "b"], "c"), ["a", "b", "c"], {"a": 0, "b": 1, "c": [0, 1]}, [[0, 1]]), - ([("a", "b"), "c"], ["a", "b", "c"], {"a": 0, "b": 0, "c": 1}, [[0, 1]]), - ([["a", "b"], "c"], ["a", "b", "c"], {"a": 0, "b": 1, "c": 2}, [[0, 1, 2]]), - ( - (["a", "b"], ["c", "d"]), - ["a", "b", "c", "d"], - {"a": 0, "b": 1, "c": 0, "d": 1}, - [[0, 1]], - ), - ], -) -def test_splits_groups(splitter, keys_exp, groups_exp, grstack_exp): - splitter_rpn = hlpst.splitter2rpn(splitter) - keys_f, groups_f, grstack_f, _ = hlpst.splits_groups(splitter_rpn) - - assert set(keys_f) == set(keys_exp) - assert groups_f == groups_exp - assert grstack_f == grstack_exp - - -@pytest.mark.parametrize( - "splitter, combiner, combiner_all_exp," - "keys_final_exp, groups_final_exp, grstack_final_exp", - [ - ("a", ["a"], ["a"], [], {}, []), - (["a"], ["a"], ["a"], [], {}, []), - (("a",), ["a"], ["a"], [], {}, []), - (("a", "b"), ["a"], ["a", "b"], [], {}, [[]]), - (("a", "b"), ["b"], ["a", "b"], [], {}, [[]]), - (["a", "b"], ["b"], ["b"], ["a"], {"a": 0}, [[0]]), - (["a", "b"], ["a"], ["a"], ["b"], {"b": 0}, [[0]]), - ((["a", "b"], "c"), ["a"], ["a", "c"], ["b"], {"b": 0}, [[0]]), - ((["a", "b"], "c"), ["b"], ["b", "c"], ["a"], {"a": 0}, [[0]]), - ((["a", "b"], "c"), ["a"], ["a", "c"], ["b"], {"b": 0}, [[0]]), - ((["a", "b"], "c"), ["c"], ["a", "b", "c"], [], {}, [[]]), - ([("a", "b"), "c"], ["a"], ["a", "b"], ["c"], {"c": 0}, [[0]]), - ([("a", "b"), "c"], ["b"], ["a", "b"], ["c"], {"c": 0}, [[0]]), - ([("a", "b"), "c"], ["c"], ["c"], ["a", "b"], {"a": 0, "b": 0}, [[0]]), - ([[("a", "b"), "c"]], ["c"], ["c"], ["a", "b"], {"a": 0, "b": 0}, [[0]]), - (([("a", "b"), "c"],), ["c"], ["c"], ["a", "b"], {"a": 0, "b": 0}, [[0]]), - ], -) -def test_splits_groups_comb( - splitter, - combiner, - keys_final_exp, - groups_final_exp, - grstack_final_exp, - combiner_all_exp, -): - splitter_rpn = hlpst.splitter2rpn(splitter) - keys_final, groups_final, grstack_final, combiner_all = hlpst.splits_groups( - splitter_rpn, combiner - ) - assert keys_final == keys_final_exp - assert groups_final == groups_final_exp - assert grstack_final == grstack_final_exp - - assert combiner_all == combiner_all_exp - - -@pytest.mark.parametrize( - "splitter, rpn", - [ - ("a", ["a"]), - (("a", "b"), ["a", "b", "."]), - (["a", "b"], ["a", "b", "*"]), - (["a", ("b", "c")], ["a", "b", "c", ".", "*"]), - ([("a", "b"), "c"], ["a", "b", ".", "c", "*"]), - (["a", ["b", ["c", "d"]]], ["a", "b", "c", "d", "*", "*", "*"]), - (["a", ("b", ["c", "d"])], ["a", "b", "c", "d", "*", ".", "*"]), - ((["a", "b"], "c"), ["a", "b", "*", "c", "."]), - ((["a", "b"], ["c", "d"]), ["a", "b", "*", "c", "d", "*", "."]), - (([["a", "b"]], 
["c", "d"]), ["a", "b", "*", "c", "d", "*", "."]), - (((["a", "b"],), ["c", "d"]), ["a", "b", "*", "c", "d", "*", "."]), - ([("a", "b"), ("c", "d")], ["a", "b", ".", "c", "d", ".", "*"]), - ], -) -def test_splitter2rpn(splitter, rpn): - assert hlpst.splitter2rpn(splitter) == rpn - - -@pytest.mark.parametrize( - "splitter, rpn", - [ - ((("a", "b"), "c"), ["a", "b", ".", "c", "."]), - (("a", "b", "c"), ["a", "b", ".", "c", "."]), - ([["a", "b"], "c"], ["a", "b", "*", "c", "*"]), - (["a", "b", "c"], ["a", "b", "*", "c", "*"]), - ], -) -def test_splitter2rpn_2(splitter, rpn): - assert hlpst.splitter2rpn(splitter) == rpn - - -@pytest.mark.parametrize( - "splitter, rpn", - [ - ("a", ["a"]), - (("a", "b"), ["a", "b", "."]), - (["a", "b"], ["a", "b", "*"]), - (["a", ("b", "c")], ["a", "b", "c", ".", "*"]), - ([("a", "b"), "c"], ["a", "b", ".", "c", "*"]), - (["a", ["b", ["c", "d"]]], ["a", "b", "c", "d", "*", "*", "*"]), - (["a", ("b", ["c", "d"])], ["a", "b", "c", "d", "*", ".", "*"]), - ((["a", "b"], "c"), ["a", "b", "*", "c", "."]), - ((["a", "b"], ["c", "d"]), ["a", "b", "*", "c", "d", "*", "."]), - ([("a", "b"), ("c", "d")], ["a", "b", ".", "c", "d", ".", "*"]), - ], -) -def test_rpn2splitter(splitter, rpn): - assert hlpst.rpn2splitter(rpn) == splitter - - -@pytest.mark.parametrize( - "splitter, other_states, rpn", - [ - ( - ["a", "_NA"], - {"NA": (other_states_to_tests(("b", "c")), "d")}, - ["a", "NA.b", "NA.c", ".", "*"], - ), - ( - ["_NA", "c"], - {"NA": (other_states_to_tests(("a", "b")), "d")}, - ["NA.a", "NA.b", ".", "c", "*"], - ), - ( - ["a", ("b", "_NA")], - {"NA": (other_states_to_tests(["c", "d"]), "d")}, - ["a", "b", "NA.c", "NA.d", "*", ".", "*"], - ), - ], -) -def test_splitter2rpn_wf_splitter_1(splitter, other_states, rpn): - assert hlpst.splitter2rpn(splitter, other_states=other_states) == rpn - - -@pytest.mark.parametrize( - "splitter, other_states, rpn", - [ - ( - ["a", "_NA"], - {"NA": (other_states_to_tests(("b", "c")), "d")}, - ["a", "_NA", "*"], - ), - ( - ["_NA", "c"], - {"NA": (other_states_to_tests(("a", "b")), "d")}, - ["_NA", "c", "*"], - ), - ( - ["a", ("b", "_NA")], - {"NA": (other_states_to_tests(["c", "d"]), "d")}, - ["a", "b", "_NA", ".", "*"], - ), - ], -) -def test_splitter2rpn_wf_splitter_3(splitter, other_states, rpn): - assert ( - hlpst.splitter2rpn(splitter, other_states=other_states, state_fields=False) - == rpn - ) - - -@pytest.mark.parametrize( - "splitter, splitter_changed", - [ - ("a", "Node.a"), - (["a", ("b", "c")], ["Node.a", ("Node.b", "Node.c")]), - (("a", ["b", "c"]), ("Node.a", ["Node.b", "Node.c"])), - ], -) -def test_addname_splitter(splitter, splitter_changed): - assert hlpst.add_name_splitter(splitter, "Node") == splitter_changed - - -@pytest.mark.parametrize( - "splitter_rpn, input_to_remove, final_splitter_rpn", - [ - (["a", "b", "."], ["b", "a"], []), - (["a", "b", "*"], ["b"], ["a"]), - (["a", "b", "c", ".", "*"], ["b", "c"], ["a"]), - (["a", "b", "c", ".", "*"], ["a"], ["b", "c", "."]), - (["a", "b", ".", "c", "*"], ["a", "b"], ["c"]), - (["a", "b", "c", "d", "*", "*", "*"], ["c"], ["a", "b", "d", "*", "*"]), - (["a", "b", "c", "d", "*", "*", "*"], ["a"], ["b", "c", "d", "*", "*"]), - (["a", "b", "c", "d", "*", ".", "*"], ["a"], ["b", "c", "d", "*", "."]), - (["a", "b", "*", "c", "."], ["a", "c"], ["b"]), - (["a", "b", "*", "c", "d", "*", "."], ["a", "c"], ["b", "d", "."]), - (["a", "b", ".", "c", "d", ".", "*"], ["a", "b"], ["c", "d", "."]), - ], -) -def test_remove_inp_from_splitter_rpn( - splitter_rpn, input_to_remove, 
final_splitter_rpn -): - assert ( - hlpst.remove_inp_from_splitter_rpn(splitter_rpn, input_to_remove) - == final_splitter_rpn - ) - - -@pytest.mark.parametrize( - "group_for_inputs, input_for_groups, ndim", - [ - ({"a": 0, "b": 0}, {0: ["a", "b"]}, 1), - ({"a": 0, "b": 1}, {0: ["a"], 1: ["b"]}, 2), - ], -) -def test_groups_to_input(group_for_inputs, input_for_groups, ndim): - res = hlpst.converter_groups_to_input(group_for_inputs) - assert res[0] == input_for_groups - assert res[1] == ndim diff --git a/pydra/engine/tests/test_hooks.py b/pydra/engine/tests/test_hooks.py new file mode 100644 index 0000000000..07daf232f2 --- /dev/null +++ b/pydra/engine/tests/test_hooks.py @@ -0,0 +1,145 @@ +import sys +import shutil +import pytest +from pathlib import Path +import glob as glob +from pydra.compose import python +from pydra.engine.hooks import TaskHooks +from pydra.engine.submitter import Submitter +from pydra.engine.job import Job + + +no_win = pytest.mark.skipif( + sys.platform.startswith("win"), + reason="docker/singularity command not adjusted for windows", +) + + +@python.define +def FunAddTwo(a): + return a + 2 + + +def test_taskhooks_1(tmpdir: Path, capsys): + cache_root = tmpdir / "cache" + cache_root.mkdir() + + foo = Job(task=FunAddTwo(a=1), submitter=Submitter(cache_root=tmpdir), name="foo") + assert foo.hooks + # ensure all hooks are defined + for attr in ("pre_run", "post_run", "pre_run_task", "post_run_task"): + hook = getattr(foo.hooks, attr) + assert hook() is None + + def myhook(task, *args): + print("I was called") + + FunAddTwo(a=1)(cache_root=cache_root, hooks=TaskHooks(pre_run=myhook)) + captured = capsys.readouterr() + assert "I was called\n" in captured.out + del captured + + # setting unknown hook should not be allowed + with pytest.raises(AttributeError): + foo.hooks.mid_run = myhook + + # reset all hooks + foo.hooks.reset() + for attr in ("pre_run", "post_run", "pre_run_task", "post_run_task"): + hook = getattr(foo.hooks, attr) + assert hook() is None + + # clear cache + shutil.rmtree(cache_root) + cache_root.mkdir() + + # set all hooks + FunAddTwo(a=1)( + cache_root=cache_root, + hooks=TaskHooks( + pre_run=myhook, + post_run=myhook, + pre_run_task=myhook, + post_run_task=myhook, + ), + ) + captured = capsys.readouterr() + assert captured.out.count("I was called\n") == 4 + del captured + + +def test_taskhooks_2(tmpdir, capsys): + """checking order of the hooks; using task's attributes""" + + def myhook_prerun(task, *args): + print(f"i. prerun hook was called from {task.name}") + + def myhook_prerun_task(task, *args): + print(f"ii. prerun task hook was called {task.name}") + + def myhook_postrun_task(task, *args): + print(f"iii. postrun task hook was called {task.name}") + + def myhook_postrun(task, *args): + print(f"iv. postrun hook was called {task.name}") + + FunAddTwo(a=1)( + cache_root=tmpdir, + hooks=TaskHooks( + pre_run=myhook_prerun, + post_run=myhook_postrun, + pre_run_task=myhook_prerun_task, + post_run_task=myhook_postrun_task, + ), + ) + + captured = capsys.readouterr() + hook_messages = captured.out.strip().split("\n") + # checking the order of the hooks + assert "i. prerun hook" in hook_messages[0] + assert "ii. prerun task hook" in hook_messages[1] + assert "iii. postrun task hook" in hook_messages[2] + assert "iv. 
postrun hook" in hook_messages[3] + + +def test_taskhooks_3(tmpdir, capsys): + """checking results in the post run hooks""" + foo = Job(task=FunAddTwo(a=1), name="foo", submitter=Submitter(cache_root=tmpdir)) + + def myhook_postrun_task(task, result, *args): + print(f"postrun task hook, the result is {result.outputs.out}") + + def myhook_postrun(task, result, *args): + print(f"postrun hook, the result is {result.outputs.out}") + + foo.hooks.post_run = myhook_postrun + foo.hooks.post_run_task = myhook_postrun_task + foo.run() + + captured = capsys.readouterr() + hook_messages = captured.out.strip().split("\n") + # checking that the postrun hooks have access to results + assert "postrun task hook, the result is 3" in hook_messages[0] + assert "postrun hook, the result is 3" in hook_messages[1] + + +def test_taskhooks_4(tmpdir, capsys): + """task raises an error: postrun task should be called, postrun shouldn't be called""" + + def myhook_postrun_task(task, result, *args): + print(f"postrun task hook was called, result object is {result}") + + def myhook_postrun(task, result, *args): + print("postrun hook should not be called") + + with pytest.raises(Exception): + FunAddTwo(a="one")( + cache_root=tmpdir, + hooks=TaskHooks(post_run=myhook_postrun, post_run_task=myhook_postrun_task), + ) + + captured = capsys.readouterr() + hook_messages = captured.out.strip().split("\n") + # only post run task hook should be called + assert len(hook_messages) == 1 + assert "postrun task hook was called" in hook_messages[0] diff --git a/pydra/engine/tests/test_job.py b/pydra/engine/tests/test_job.py new file mode 100644 index 0000000000..c1824a7f75 --- /dev/null +++ b/pydra/engine/tests/test_job.py @@ -0,0 +1,112 @@ +from pathlib import Path +import pytest +import cloudpickle as cp +from pydra.engine.submitter import Submitter +from pydra.engine.job import Job +from pydra.engine.result import Result +from pydra.compose import workflow +from pydra.engine.tests.utils import Multiply, RaiseXeq1 +from pydra.engine.job import save, load_and_run + + +def test_save(tmpdir): + outdir = Path(tmpdir) + with pytest.raises(ValueError): + save(tmpdir) + foo = Job(name="mult", task=Multiply(x=1, y=2), submitter=Submitter()) + # save job + save(outdir, job=foo) + del foo + # load saved job + job_pkl = outdir / "_job.pklz" + foo: Job = cp.loads(job_pkl.read_bytes()) + assert foo.name == "mult" + assert foo.inputs["x"] == 1 and foo.inputs["y"] == 2 + # execute job and save result + res: Result = foo.run() + assert res.outputs.out == 2 + save(outdir, result=res) + del res + # load saved result + res_pkl = outdir / "_result.pklz" + res: Result = cp.loads(res_pkl.read_bytes()) + assert res.outputs.out == 2 + + +def test_load_and_run(tmpdir): + """testing load_and_run for pickled job""" + job_pkl = Path(tmpdir.join("task_main.pkl")) + # Note that tasks now don't have state arrays and indices, just a single resolved + # set of parameters that are ready to run + job = Job(name="mult", task=Multiply(x=2, y=10), submitter=Submitter()) + with job_pkl.open("wb") as fp: + cp.dump(job, fp) + resultfile = load_and_run(job_pkl=job_pkl) + # checking the result files + result = cp.loads(resultfile.read_bytes()) + assert result.outputs.out == 20 + + +def test_load_and_run_exception_run(tmpdir): + """testing raising exception and saving info in crashfile when when load_and_run""" + job_pkl = Path(tmpdir.join("task_main.pkl")) + cache_root = Path(tmpdir.join("cache")) + cache_root.mkdir() + + job = Job( + task=RaiseXeq1(x=1), + name="raise", + 
submitter=Submitter(worker="cf", cache_root=cache_root), + ) + + with job_pkl.open("wb") as fp: + cp.dump(job, fp) + + with pytest.raises(Exception) as excinfo: + load_and_run(job_pkl=job_pkl) + exc_msg = excinfo.value.args[0] + assert "i'm raising an exception!" in exc_msg + # checking if the crashfile has been created + assert "crash" in excinfo.value.__notes__[0] + errorfile = Path(excinfo.value.__notes__[0].split("here: ")[1]) + assert errorfile.exists() + + resultfile = errorfile.parent / "_result.pklz" + assert resultfile.exists() + # checking the content + result_exception = cp.loads(resultfile.read_bytes()) + assert result_exception.errored is True + + job = Job(task=RaiseXeq1(x=2), name="wont_raise", submitter=Submitter()) + + with job_pkl.open("wb") as fp: + cp.dump(job, fp) + + # the second job should be fine + resultfile = load_and_run(job_pkl=job_pkl) + result_1 = cp.loads(resultfile.read_bytes()) + assert result_1.outputs.out == 2 + + +def test_load_and_run_wf(tmpdir, worker): + """testing load_and_run for pickled job""" + wf_pkl = Path(tmpdir.join("wf_main.pkl")) + + @workflow.define + def Workflow(x, y=10): + multiply = workflow.add(Multiply(x=x, y=y)) + return multiply.out + + job = Job( + name="mult", + task=Workflow(x=2), + submitter=Submitter(cache_root=tmpdir, worker=worker), + ) + + with wf_pkl.open("wb") as fp: + cp.dump(job, fp) + + resultfile = load_and_run(job_pkl=wf_pkl) + # checking the result files + result = cp.loads(resultfile.read_bytes()) + assert result.outputs.out == 20 diff --git a/pydra/engine/tests/test_lazy.py b/pydra/engine/tests/test_lazy.py new file mode 100644 index 0000000000..159489b602 --- /dev/null +++ b/pydra/engine/tests/test_lazy.py @@ -0,0 +1,99 @@ +import pytest +from pydra.engine.lazy import ( + LazyInField, + LazyOutField, +) +from pydra.engine.workflow import Workflow +from pydra.engine.node import Node +from pydra.engine.submitter import Submitter, NodeExecution, DiGraph +from pydra.compose import python, workflow +from pydra.engine.tests.utils import ( + Foo, + FunAddTwo, + FunAddVar, + ListSum, +) + + +@workflow.define +def ATestWorkflow(x: int, y: list[int]) -> int: + node_a = workflow.add(FunAddTwo(a=x), name="A") + node_b = workflow.add(FunAddVar(a=node_a.out).split(b=y).combine("b"), name="B") + node_c = workflow.add(ListSum(x=node_b.out), name="C") + return node_c.out + + +@pytest.fixture +def workflow_task(submitter: Submitter) -> workflow.Task: + wf = ATestWorkflow(x=1, y=[1, 2, 3]) + with submitter: + submitter(wf) + return wf + + +@pytest.fixture +def wf(workflow_task: workflow.Task) -> Workflow: + wf = Workflow.construct(workflow_task) + return wf + + +@pytest.fixture +def submitter(tmp_path) -> Submitter: + return Submitter(tmp_path) + + +@pytest.fixture +def graph(wf: Workflow, submitter: Submitter) -> DiGraph[NodeExecution]: + graph = wf.execution_graph(submitter=submitter) + for node in graph.nodes: + if node.state: + node.state.prepare_states(inputs=node.node.state_values) + node.state.prepare_inputs() + node.start() + return graph + + +@pytest.fixture +def node_a(wf) -> Node: + return wf["A"] # we can pick any node to retrieve the values to + + +def test_lazy_inp(wf: Workflow, graph: DiGraph[NodeExecution]): + lf = LazyInField(field="x", type=int, workflow=wf) + assert lf._get_value(workflow=wf, graph=graph) == 1 + + lf = LazyInField(field="y", type=str, workflow=wf) + assert lf._get_value(workflow=wf, graph=graph) == [1, 2, 3] + + +def test_lazy_out(node_a, wf, graph): + lf = LazyOutField(field="out", type=int, 
node=node_a) + assert lf._get_value(wf, graph) == 3 + + +def test_lazy_field_cast(wf: Workflow): + lzout = wf.add(Foo(a="a", b=1, c=2.0), name="foo") + + assert lzout.y._type is int + assert workflow.cast(lzout.y, float)._type is float + + +def test_wf_lzin_split(tmp_path): + @python.define + def identity(x: int) -> int: + return x + + @workflow.define + def Inner(x): + ident = workflow.add(identity(x=x)) + return ident.out + + @workflow.define + def Outer(xs): + inner = workflow.add(Inner().split(x=xs)) + return inner.out + + outer = Outer(xs=[1, 2, 3]) + + outputs = outer(cache_root=tmp_path) + assert outputs.out == [1, 2, 3] diff --git a/pydra/engine/tests/test_nipype1_convert.py b/pydra/engine/tests/test_nipype1_convert.py deleted file mode 100644 index 8408fddb6c..0000000000 --- a/pydra/engine/tests/test_nipype1_convert.py +++ /dev/null @@ -1,122 +0,0 @@ -import typing as ty -import pytest - - -from ..task import ShellCommandTask -from ..specs import ShellOutSpec, ShellSpec, SpecInfo, File - -interf_input_spec = SpecInfo( - name="Input", fields=[("test", ty.Any, {"help_string": "test"})], bases=(ShellSpec,) -) -interf_output_spec = SpecInfo( - name="Output", fields=[("test_out", File, "*.txt")], bases=(ShellOutSpec,) -) - - -class Interf_1(ShellCommandTask): - """class with customized input/output specs""" - - input_spec = interf_input_spec - output_spec = interf_output_spec - - -class Interf_2(ShellCommandTask): - """class with customized input/output specs and executables""" - - input_spec = interf_input_spec - output_spec = interf_output_spec - executable = "testing command" - - -class Interf_3(ShellCommandTask): - """class with customized input and executables""" - - input_spec = SpecInfo( - name="Input", - fields=[ - ( - "in_file", - str, - {"help_string": "in_file", "argstr": "'{in_file}'"}, - ) - ], - bases=(ShellSpec,), - ) - executable = "testing command" - - -class TouchInterf(ShellCommandTask): - """class with customized input and executables""" - - input_spec = SpecInfo( - name="Input", - fields=[ - ( - "new_file", - str, - { - "help_string": "new_file", - "argstr": "", - "output_file_template": "{new_file}", - }, - ) - ], - bases=(ShellSpec,), - ) - executable = "touch" - - -def test_interface_specs_1(): - """testing if class input/output spec are set properly""" - task = Interf_1(executable="ls") - assert task.input_spec == interf_input_spec - assert task.output_spec == interf_output_spec - - -def test_interface_specs_2(): - """testing if class input/output spec are overwritten properly by the user's specs""" - my_input_spec = SpecInfo( - name="Input", - fields=[("my_inp", ty.Any, {"help_string": "my inp"})], - bases=(ShellSpec,), - ) - my_output_spec = SpecInfo( - name="Output", fields=[("my_out", File, "*.txt")], bases=(ShellOutSpec,) - ) - task = Interf_1(input_spec=my_input_spec, output_spec=my_output_spec) - assert task.input_spec == my_input_spec - assert task.output_spec == my_output_spec - - -def test_interface_executable_1(): - """testing if the class executable is properly set and used in the command line""" - task = Interf_2() - assert task.executable == "testing command" - assert task.inputs.executable == "testing command" - assert task.cmdline == "testing command" - - -def test_interface_executable_2(): - """testing if the class executable is overwritten by the user's input (and if the warning is raised)""" - # warning that the user changes the executable from the one that is set as a class attribute - with pytest.warns(UserWarning, match="changing the 
executable"): - task = Interf_2(executable="i want a different command") - assert task.executable == "testing command" - # task.executable stays the same, but input.executable is changed, so the cmd is changed - assert task.inputs.executable == "i want a different command" - assert task.cmdline == "i want a different command" - - -def test_interface_cmdline_with_spaces(): - task = Interf_3(in_file="/path/to/file/with spaces") - assert task.executable == "testing command" - assert task.inputs.executable == "testing command" - assert task.cmdline == "testing command '/path/to/file/with spaces'" - - -def test_interface_run_1(): - """testing execution of a simple interf with customized input and executable""" - task = TouchInterf(new_file="hello.txt") - assert task.cmdline == "touch hello.txt" - res = task() - assert res.output.new_file.fspath.exists() diff --git a/pydra/engine/tests/test_node_task.py b/pydra/engine/tests/test_node_task.py deleted file mode 100644 index bceaf97402..0000000000 --- a/pydra/engine/tests/test_node_task.py +++ /dev/null @@ -1,1665 +0,0 @@ -import os -import shutil -import attr -import typing as ty -import numpy as np -import time -from unittest import mock -from pathlib import Path -import pytest -import time -from fileformats.generic import File -import pydra.mark - -from .utils import ( - fun_addtwo, - fun_addvar, - fun_addvar_none, - fun_addvar_default, - moment, - fun_div, - fun_dict, - fun_file, - fun_file_list, - op_4var, -) - -from ..core import TaskBase -from ..specs import StateArray -from ..submitter import Submitter - - -@pytest.fixture(scope="module") -def change_dir(request): - orig_dir = os.getcwd() - test_dir = os.path.join(orig_dir, "test_outputs") - os.makedirs(test_dir, exist_ok=True) - os.chdir(test_dir) - - def move2orig(): - os.chdir(orig_dir) - - request.addfinalizer(move2orig) - - -# Tests for tasks initializations -def test_task_init_1(): - """task with mandatory arguments only""" - nn = fun_addtwo() - assert isinstance(nn, TaskBase) - assert nn.name == "fun_addtwo" - assert hasattr(nn, "__call__") - - -def test_task_init_1a(): - with pytest.raises(TypeError): - fun_addtwo("NA") - - -def test_task_init_2(): - """task with a name and inputs""" - nn = fun_addtwo(name="NA", a=3) - # adding NA to the name of the variable - assert getattr(nn.inputs, "a") == 3 - assert nn.state is None - - -@pytest.mark.parametrize( - "splitter, state_splitter, state_rpn, states_ind, states_val", - [("a", "NA.a", ["NA.a"], [{"NA.a": 0}, {"NA.a": 1}], [{"NA.a": 3}, {"NA.a": 5}])], -) -@pytest.mark.parametrize("input_type", ["list", "array"]) -def test_task_init_3( - splitter, state_splitter, state_rpn, states_ind, states_val, input_type -): - """task with inputs and splitter""" - a_in = [3, 5] - if input_type == "array": - a_in = np.array(a_in) - - nn = fun_addtwo(name="NA").split(splitter=splitter, a=a_in) - - assert np.allclose(nn.inputs.a, [3, 5]) - assert nn.state.splitter == state_splitter - assert nn.state.splitter_rpn == state_rpn - - nn.state.prepare_states(nn.inputs) - assert nn.state.states_ind == states_ind - assert nn.state.states_val == states_val - - -@pytest.mark.parametrize( - "splitter, state_splitter, state_rpn, states_ind, states_val", - [ - ( - ("a", "b"), - ("NA.a", "NA.b"), - ["NA.a", "NA.b", "."], - [{"NA.a": 0, "NA.b": 0}, {"NA.a": 1, "NA.b": 1}], - [{"NA.a": 3, "NA.b": 10}, {"NA.a": 5, "NA.b": 20}], - ), - ( - ["a", "b"], - ["NA.a", "NA.b"], - ["NA.a", "NA.b", "*"], - [ - {"NA.a": 0, "NA.b": 0}, - {"NA.a": 0, "NA.b": 1}, - {"NA.a": 1, 
"NA.b": 0}, - {"NA.a": 1, "NA.b": 1}, - ], - [ - {"NA.a": 3, "NA.b": 10}, - {"NA.a": 3, "NA.b": 20}, - {"NA.a": 5, "NA.b": 10}, - {"NA.a": 5, "NA.b": 20}, - ], - ), - ], -) -@pytest.mark.parametrize("input_type", ["list", "array", "mixed"]) -def test_task_init_3a( - splitter, state_splitter, state_rpn, states_ind, states_val, input_type -): - """task with inputs and splitter""" - a_in, b_in = [3, 5], [10, 20] - if input_type == "array": - a_in, b_in = np.array(a_in), np.array(b_in) - elif input_type == "mixed": - a_in = np.array(a_in) - nn = fun_addvar(name="NA").split(splitter=splitter, a=a_in, b=b_in) - - assert np.allclose(nn.inputs.a, [3, 5]) - assert np.allclose(nn.inputs.b, [10, 20]) - assert nn.state.splitter == state_splitter - assert nn.state.splitter_rpn == state_rpn - - nn.state.prepare_states(nn.inputs) - assert nn.state.states_ind == states_ind - assert nn.state.states_val == states_val - - -def test_task_init_4(): - """task with interface splitter and inputs set in the split method""" - nn = fun_addtwo(name="NA") - nn.split(splitter="a", a=[3, 5]) - assert np.allclose(nn.inputs.a, [3, 5]) - - assert nn.state.splitter == "NA.a" - assert nn.state.splitter_rpn == ["NA.a"] - - nn.state.prepare_states(nn.inputs) - assert nn.state.states_ind == [{"NA.a": 0}, {"NA.a": 1}] - assert nn.state.states_val == [{"NA.a": 3}, {"NA.a": 5}] - - -def test_task_init_4b(): - """updating splitter using overwrite=True""" - nn = fun_addtwo(name="NA") - nn.split(splitter="a", a=[1, 2]) - nn.split(splitter="a", a=[3, 5], overwrite=True) - assert np.allclose(nn.inputs.a, [3, 5]) - - assert nn.state.splitter == "NA.a" - assert nn.state.splitter_rpn == ["NA.a"] - - nn.state.prepare_states(nn.inputs) - assert nn.state.states_ind == [{"NA.a": 0}, {"NA.a": 1}] - assert nn.state.states_val == [{"NA.a": 3}, {"NA.a": 5}] - - -def test_task_init_4c(): - """trying to set splitter twice without using overwrite""" - nn = fun_addvar(name="NA").split(splitter="b", b=[1, 2]) - with pytest.raises(Exception) as excinfo: - nn.split(splitter="a", a=[3, 5]) - assert "splitter has been already set" in str(excinfo.value) - - assert nn.state.splitter == "NA.b" - - -def test_task_init_4d(): - """trying to set the same splitter twice without using overwrite - if the splitter is the same, the exception shouldn't be raised - """ - nn = fun_addtwo(name="NA").split(splitter="a", a=[3, 5]) - nn.split(splitter="a", a=[3, 5]) - assert nn.state.splitter == "NA.a" - - -def test_task_init_5(): - """task with inputs, splitter and combiner""" - nn = ( - fun_addvar(name="NA") - .split(splitter=["a", "b"], a=[3, 5], b=[1, 2]) - .combine("b") - ) - - assert nn.state.splitter == ["NA.a", "NA.b"] - assert nn.state.splitter_rpn == ["NA.a", "NA.b", "*"] - assert nn.state.combiner == ["NA.b"] - - assert nn.state.splitter_final == "NA.a" - assert nn.state.splitter_rpn_final == ["NA.a"] - - nn.state.prepare_states(nn.inputs) - assert nn.state.states_ind == [ - {"NA.a": 0, "NA.b": 0}, - {"NA.a": 0, "NA.b": 1}, - {"NA.a": 1, "NA.b": 0}, - {"NA.a": 1, "NA.b": 1}, - ] - assert nn.state.states_val == [ - {"NA.a": 3, "NA.b": 1}, - {"NA.a": 3, "NA.b": 2}, - {"NA.a": 5, "NA.b": 1}, - {"NA.a": 5, "NA.b": 2}, - ] - - assert nn.state.final_combined_ind_mapping == {0: [0, 1], 1: [2, 3]} - - -def test_task_init_5a(): - """updating combiner using overwrite=True""" - nn = ( - fun_addvar(name="NA") - .split(splitter=["a", "b"], a=[3, 5], b=[1, 2]) - .combine("b") - ) - nn.combine("a", overwrite=True) - - assert nn.state.splitter == ["NA.a", "NA.b"] - assert 
nn.state.splitter_rpn == ["NA.a", "NA.b", "*"] - assert nn.state.combiner == ["NA.a"] - - assert nn.state.splitter_final == "NA.b" - assert nn.state.splitter_rpn_final == ["NA.b"] - - nn.state.prepare_states(nn.inputs) - assert nn.state.states_ind == [ - {"NA.a": 0, "NA.b": 0}, - {"NA.a": 0, "NA.b": 1}, - {"NA.a": 1, "NA.b": 0}, - {"NA.a": 1, "NA.b": 1}, - ] - assert nn.state.states_val == [ - {"NA.a": 3, "NA.b": 1}, - {"NA.a": 3, "NA.b": 2}, - {"NA.a": 5, "NA.b": 1}, - {"NA.a": 5, "NA.b": 2}, - ] - - assert nn.state.final_combined_ind_mapping == {0: [0, 2], 1: [1, 3]} - - -def test_task_init_5b(): - """updating combiner without using overwrite""" - nn = ( - fun_addvar(name="NA") - .split(splitter=["a", "b"], a=[3, 5], b=[1, 2]) - .combine("b") - ) - with pytest.raises(Exception) as excinfo: - nn.combine("a") - assert "combiner has been already set" in str(excinfo.value) - - assert nn.state.combiner == ["NA.b"] - - -def test_task_init_5c(): - """trying to set the same combiner twice without using overwrite - if the combiner is the same, the exception shouldn't be raised - """ - nn = ( - fun_addvar(name="NA") - .split(splitter=["a", "b"], a=[3, 5], b=[1, 2]) - .combine("b") - ) - nn.combine("b") - - assert nn.state.splitter == ["NA.a", "NA.b"] - assert nn.state.splitter_rpn == ["NA.a", "NA.b", "*"] - assert nn.state.combiner == ["NA.b"] - - assert nn.state.splitter_final == "NA.a" - assert nn.state.splitter_rpn_final == ["NA.a"] - - -def test_task_init_6(): - """task with splitter, but the input is an empty list""" - nn = fun_addtwo(name="NA") - nn.split(splitter="a", a=[]) - assert nn.inputs.a == [] - - assert nn.state.splitter == "NA.a" - assert nn.state.splitter_rpn == ["NA.a"] - - nn.state.prepare_states(nn.inputs) - assert nn.state.states_ind == [] - assert nn.state.states_val == [] - - -def test_task_init_7(tmp_path): - """task with a dictionary of files as an input, checking checksum""" - file1 = tmp_path / "file1.txt" - with open(file1, "w") as f: - f.write("hello") - - file2 = tmp_path / "file2.txt" - with open(file2, "w") as f: - f.write("from pydra\n") - - nn1 = fun_file_list(name="NA", filename_list=[file1, file2]) - output_dir1 = nn1.output_dir - - # changing the content of the file - time.sleep(2) # need the mtime to be different - file2 = tmp_path / "file2.txt" - with open(file2, "w") as f: - f.write("from pydra") - - nn2 = fun_file_list(name="NA", filename_list=[file1, file2]) - output_dir2 = nn2.output_dir - - # the checksum should be different - content of file2 is different - assert output_dir1.name != output_dir2.name - - -def test_task_init_8(): - """task without setting the input, the value should be set to attr.NOTHING""" - nn = fun_addtwo(name="NA") - assert nn.inputs.a is attr.NOTHING - - -def test_task_init_9(): - """task without setting the input, but using the default avlue from function""" - nn1 = fun_addvar_default(name="NA", a=2) - assert nn1.inputs.b == 1 - - nn2 = fun_addvar_default(name="NA", a=2, b=1) - assert nn2.inputs.b == 1 - # both tasks should have the same checksum - assert nn1.checksum == nn2.checksum - - -def test_task_error(): - func = fun_div(name="div", a=1, b=0) - with pytest.raises(ZeroDivisionError): - func() - assert (func.output_dir / "_error.pklz").exists() - - -def test_odir_init(): - """checking if output_dir is available for a task without init - before running the task - """ - nn = fun_addtwo(name="NA", a=3) - assert nn.output_dir - - -# Tests for tasks without state (i.e. 
no splitter) - - -@pytest.mark.flaky(reruns=2) # when dask -def test_task_nostate_1(plugin_dask_opt, tmp_path): - """task without splitter""" - nn = fun_addtwo(name="NA", a=3) - nn.cache_dir = tmp_path - assert np.allclose(nn.inputs.a, [3]) - assert nn.state is None - - with Submitter(plugin=plugin_dask_opt) as sub: - sub(nn) - - # checking the results - results = nn.result() - assert results.output.out == 5 - # checking the return_inputs option, either is return_inputs is True, or "val", - # it should give values of inputs that corresponds to the specific element - results_verb = nn.result(return_inputs=True) - results_verb_val = nn.result(return_inputs="val") - assert results_verb[0] == results_verb_val[0] == {"NA.a": 3} - assert results_verb[1].output.out == results_verb_val[1].output.out == 5 - # checking the return_inputs option return_inputs="ind" - # it should give indices of inputs (instead of values) for each element - results_verb_ind = nn.result(return_inputs="ind") - assert results_verb_ind[0] == {"NA.a": None} - assert results_verb_ind[1].output.out == 5 - - # checking the output_dir - assert nn.output_dir.exists() - - -def test_task_nostate_1_call(): - """task without splitter""" - nn = fun_addtwo(name="NA", a=3) - nn() - # checking the results - results = nn.result() - assert results.output.out == 5 - # checking the output_dir - assert nn.output_dir.exists() - - -@pytest.mark.flaky(reruns=2) # when dask -def test_task_nostate_1_call_subm(plugin_dask_opt, tmp_path): - """task without splitter""" - nn = fun_addtwo(name="NA", a=3) - nn.cache_dir = tmp_path - assert np.allclose(nn.inputs.a, [3]) - assert nn.state is None - - with Submitter(plugin=plugin_dask_opt) as sub: - nn(submitter=sub) - - # checking the results - results = nn.result() - assert results.output.out == 5 - # checking the output_dir - assert nn.output_dir.exists() - - -@pytest.mark.flaky(reruns=2) # when dask -def test_task_nostate_1_call_plug(plugin_dask_opt, tmp_path): - """task without splitter""" - nn = fun_addtwo(name="NA", a=3) - nn.cache_dir = tmp_path - assert np.allclose(nn.inputs.a, [3]) - assert nn.state is None - - nn(plugin=plugin_dask_opt) - - # checking the results - results = nn.result() - assert results.output.out == 5 - # checking the output_dir - assert nn.output_dir.exists() - - -def test_task_nostate_1_call_updateinp(): - """task without splitter""" - nn = fun_addtwo(name="NA", a=30) - # updating input when calling the node - nn(a=3) - - # checking the results - results = nn.result() - assert results.output.out == 5 - # checking the output_dir - assert nn.output_dir.exists() - - -def test_task_nostate_2(plugin, tmp_path): - """task with a list as an input, but no splitter""" - nn = moment(name="NA", n=3, lst=[2, 3, 4]) - nn.cache_dir = tmp_path - assert np.allclose(nn.inputs.n, [3]) - assert np.allclose(nn.inputs.lst, [2, 3, 4]) - assert nn.state is None - - with Submitter(plugin=plugin) as sub: - sub(nn) - - # checking the results - results = nn.result() - assert results.output.out == 33 - # checking the output_dir - assert nn.output_dir.exists() - - -def test_task_nostate_3(plugin, tmp_path): - """task with a dictionary as an input""" - nn = fun_dict(name="NA", d={"a": "ala", "b": "bala"}) - nn.cache_dir = tmp_path - assert nn.inputs.d == {"a": "ala", "b": "bala"} - - with Submitter(plugin=plugin) as sub: - sub(nn) - - # checking the results - results = nn.result() - assert results.output.out == "a:ala_b:bala" - # checking the output_dir - assert nn.output_dir.exists() - - -def 
test_task_nostate_4(plugin, tmp_path): - """task with a dictionary as an input""" - file1 = tmp_path / "file.txt" - with open(file1, "w") as f: - f.write("hello from pydra\n") - - nn = fun_file(name="NA", filename=file1) - nn.cache_dir = tmp_path - - with Submitter(plugin) as sub: - sub(nn) - - # checking the results - results = nn.result() - assert results.output.out == "hello from pydra\n" - # checking the output_dir - assert nn.output_dir.exists() - - -def test_task_nostate_5(tmp_path): - """task with a dictionary of files as an input""" - file1 = tmp_path / "file1.txt" - with open(file1, "w") as f: - f.write("hello") - - file2 = tmp_path / "file2.txt" - with open(file2, "w") as f: - f.write("from pydra\n") - - nn = fun_file_list(name="NA", filename_list=[file1, file2]) - - nn() - - # checking the results - results = nn.result() - assert results.output.out == "hello from pydra\n" - # checking the output_dir - assert nn.output_dir.exists() - - -def test_task_nostate_6(): - """checking if the function gets the None value""" - nn = fun_addvar_none(name="NA", a=2, b=None) - assert nn.inputs.b is None - nn() - assert nn.result().output.out == 2 - - -def test_task_nostate_6a_exception(): - """checking if the function gets the attr.Nothing value""" - nn = fun_addvar_none(name="NA", a=2) - assert nn.inputs.b is attr.NOTHING - with pytest.raises(TypeError) as excinfo: - nn() - assert "unsupported" in str(excinfo.value) - - -def test_task_nostate_7(): - """using the default value from the function for b input""" - nn = fun_addvar_default(name="NA", a=2) - assert nn.inputs.b == 1 - nn() - assert nn.result().output.out == 3 - - -# Testing caching for tasks without states - - -@pytest.mark.flaky(reruns=2) # when dask -def test_task_nostate_cachedir(plugin_dask_opt, tmp_path): - """task with provided cache_dir using pytest tmp_path""" - cache_dir = tmp_path / "test_task_nostate" - cache_dir.mkdir() - nn = fun_addtwo(name="NA", a=3, cache_dir=cache_dir) - assert np.allclose(nn.inputs.a, [3]) - assert nn.state is None - - with Submitter(plugin=plugin_dask_opt) as sub: - sub(nn) - - # checking the results - results = nn.result() - assert results.output.out == 5 - - -@pytest.mark.flaky(reruns=2) # when dask -def test_task_nostate_cachedir_relativepath(tmp_path, plugin_dask_opt): - """task with provided cache_dir as relative path""" - os.chdir(tmp_path) - cache_dir = "test_task_nostate" - (tmp_path / cache_dir).mkdir() - - nn = fun_addtwo(name="NA", a=3, cache_dir=cache_dir) - assert np.allclose(nn.inputs.a, [3]) - assert nn.state is None - - with Submitter(plugin=plugin_dask_opt) as sub: - sub(nn) - - # checking the results - results = nn.result() - assert results.output.out == 5 - - shutil.rmtree(cache_dir) - - -@pytest.mark.flaky(reruns=2) # when dask -def test_task_nostate_cachelocations(plugin_dask_opt, tmp_path): - """ - Two identical tasks with provided cache_dir; - the second task has cache_locations and should not recompute the results - """ - cache_dir = tmp_path / "test_task_nostate" - cache_dir.mkdir() - cache_dir2 = tmp_path / "test_task_nostate2" - cache_dir2.mkdir() - - nn = fun_addtwo(name="NA", a=3, cache_dir=cache_dir) - with Submitter(plugin=plugin_dask_opt) as sub: - sub(nn) - - nn2 = fun_addtwo(name="NA", a=3, cache_dir=cache_dir2, cache_locations=cache_dir) - with Submitter(plugin=plugin_dask_opt) as sub: - sub(nn2) - - # checking the results - results2 = nn2.result() - assert results2.output.out == 5 - - # checking if the second task didn't run the interface again - assert 
nn.output_dir.exists() - assert not nn2.output_dir.exists() - - -def test_task_nostate_cachelocations_forcererun(plugin, tmp_path): - """ - Two identical tasks with provided cache_dir; - the second task has cache_locations, - but submitter is called with rerun=True, so should recompute - """ - cache_dir = tmp_path / "test_task_nostate" - cache_dir.mkdir() - cache_dir2 = tmp_path / "test_task_nostate2" - cache_dir2.mkdir() - - nn = fun_addtwo(name="NA", a=3, cache_dir=cache_dir) - with Submitter(plugin=plugin) as sub: - sub(nn) - - nn2 = fun_addtwo(name="NA", a=3, cache_dir=cache_dir2, cache_locations=cache_dir) - with Submitter(plugin=plugin) as sub: - sub(nn2, rerun=True) - - # checking the results - results2 = nn2.result() - assert results2.output.out == 5 - - # checking if the second task rerun the interface - assert nn.output_dir.exists() - assert nn2.output_dir.exists() - - -def test_task_nostate_cachelocations_nosubmitter(tmp_path): - """ - Two identical tasks (that are run without submitter!) with provided cache_dir; - the second task has cache_locations and should not recompute the results - """ - cache_dir = tmp_path / "test_task_nostate" - cache_dir.mkdir() - cache_dir2 = tmp_path / "test_task_nostate2" - cache_dir2.mkdir() - - nn = fun_addtwo(name="NA", a=3, cache_dir=cache_dir) - nn() - - nn2 = fun_addtwo(name="NA", a=3, cache_dir=cache_dir2, cache_locations=cache_dir) - nn2() - - # checking the results - results2 = nn2.result() - assert results2.output.out == 5 - - # checking if the second task didn't run the interface again - assert nn.output_dir.exists() - assert not nn2.output_dir.exists() - - -def test_task_nostate_cachelocations_nosubmitter_forcererun(tmp_path): - """ - Two identical tasks (that are run without submitter!) with provided cache_dir; - the second task has cache_locations, - but submitter is called with rerun=True, so should recompute - """ - cache_dir = tmp_path / "test_task_nostate" - cache_dir.mkdir() - cache_dir2 = tmp_path / "test_task_nostate2" - cache_dir2.mkdir() - - nn = fun_addtwo(name="NA", a=3, cache_dir=cache_dir) - nn() - - nn2 = fun_addtwo(name="NA", a=3, cache_dir=cache_dir2, cache_locations=cache_dir) - nn2(rerun=True) - - # checking the results - results2 = nn2.result() - assert results2.output.out == 5 - - # checking if the second task run the interface again - assert nn.output_dir.exists() - assert nn2.output_dir.exists() - - -def test_task_nostate_cachelocations_updated(plugin, tmp_path): - """ - Two identical tasks with provided cache_dir; - the second task has cache_locations in init, - that is later overwritten in Submitter.__call__; - the cache_locations passed to call doesn't exist so the second task should run again - """ - cache_dir = tmp_path / "test_task_nostate" - cache_dir.mkdir() - cache_dir1 = tmp_path / "test_task_nostate1" - cache_dir1.mkdir() - cache_dir2 = tmp_path / "test_task_nostate2" - cache_dir2.mkdir() - - nn = fun_addtwo(name="NA", a=3, cache_dir=cache_dir) - with Submitter(plugin=plugin) as sub: - sub(nn) - - nn2 = fun_addtwo(name="NA", a=3, cache_dir=cache_dir2, cache_locations=cache_dir) - # updating cache location to non-existing dir - with Submitter(plugin=plugin) as sub: - sub(nn2, cache_locations=cache_dir1) - - # checking the results - results2 = nn2.result() - assert results2.output.out == 5 - - # checking if both tasks run interface - assert nn.output_dir.exists() - assert nn2.output_dir.exists() - - -# Tests for tasks with states (i.e. 
with splitter) - - -@pytest.mark.flaky(reruns=2) # when dask -@pytest.mark.parametrize("input_type", ["list", "array"]) -def test_task_state_1(plugin_dask_opt, input_type, tmp_path): - """task with the simplest splitter""" - a_in = [3, 5] - if input_type == "array": - a_in = np.array(a_in) - - nn = fun_addtwo(name="NA").split(splitter="a", a=a_in) - nn.cache_dir = tmp_path - - assert nn.state.splitter == "NA.a" - assert nn.state.splitter_rpn == ["NA.a"] - assert (nn.inputs.a == np.array([3, 5])).all() - - with Submitter(plugin=plugin_dask_opt) as sub: - sub(nn) - - # checking the results - results = nn.result() - expected = [({"NA.a": 3}, 5), ({"NA.a": 5}, 7)] - for i, res in enumerate(expected): - assert results[i].output.out == res[1] - - # checking the return_inputs option, either return_inputs is True or "val", - # it should give values of inputs that corresponds to the specific element - results_verb = nn.result(return_inputs=True) - results_verb_val = nn.result(return_inputs="val") - for i, res in enumerate(expected): - assert (results_verb[i][0], results_verb[i][1].output.out) == res - assert (results_verb_val[i][0], results_verb_val[i][1].output.out) == res - - # checking the return_inputs option return_inputs="ind" - # it should give indices of inputs (instead of values) for each element - results_verb_ind = nn.result(return_inputs="ind") - expected_ind = [({"NA.a": 0}, 5), ({"NA.a": 1}, 7)] - for i, res in enumerate(expected_ind): - assert (results_verb_ind[i][0], results_verb_ind[i][1].output.out) == res - - # checking the output_dir - assert nn.output_dir - for odir in nn.output_dir: - assert odir.exists() - - -def test_task_state_1a(plugin, tmp_path): - """task with the simplest splitter (inputs set separately)""" - nn = fun_addtwo(name="NA") - nn.split(splitter="a", a=[1, 2]) - nn.inputs.a = StateArray([3, 5]) - nn.cache_dir = tmp_path - - assert nn.state.splitter == "NA.a" - assert nn.state.splitter_rpn == ["NA.a"] - assert (nn.inputs.a == np.array([3, 5])).all() - - with Submitter(plugin=plugin) as sub: - sub(nn) - - # checking the results - results = nn.result() - expected = [({"NA.a": 3}, 5), ({"NA.a": 5}, 7)] - for i, res in enumerate(expected): - assert results[i].output.out == res[1] - - -def test_task_state_singl_1(plugin, tmp_path): - """Tasks with two inputs and a splitter (no combiner) - one input is a single value, the other is in the splitter and combiner - """ - nn = fun_addvar(name="NA").split(splitter="a", a=[3, 5], b=10) - nn.cache_dir = tmp_path - - assert nn.inputs.a == [3, 5] - assert nn.inputs.b == 10 - assert nn.state.splitter == "NA.a" - assert nn.state.splitter_rpn == ["NA.a"] - assert nn.state.splitter_final == "NA.a" - assert nn.state.splitter_rpn_final == ["NA.a"] - - with Submitter(plugin=plugin) as sub: - sub(nn) - - # checking the results - expected = [({"NA.a": 3, "NA.b": 10}, 13), ({"NA.a": 5, "NA.b": 10}, 15)] - results = nn.result() - for i, res in enumerate(expected): - assert results[i].output.out == res[1] - # checking the output_dir - assert nn.output_dir - for odir in nn.output_dir: - assert odir.exists() - - -@pytest.mark.parametrize( - "splitter, state_splitter, state_rpn, expected, expected_ind", - [ - ( - ("a", "b"), - ("NA.a", "NA.b"), - ["NA.a", "NA.b", "."], - [({"NA.a": 3, "NA.b": 10}, 13), ({"NA.a": 5, "NA.b": 20}, 25)], - [({"NA.a": 0, "NA.b": 0}, 13), ({"NA.a": 1, "NA.b": 1}, 25)], - ), - ( - ["a", "b"], - ["NA.a", "NA.b"], - ["NA.a", "NA.b", "*"], - [ - ({"NA.a": 3, "NA.b": 10}, 13), - ({"NA.a": 3, "NA.b": 20}, 23), - 
({"NA.a": 5, "NA.b": 10}, 15), - ({"NA.a": 5, "NA.b": 20}, 25), - ], - [ - ({"NA.a": 0, "NA.b": 0}, 13), - ({"NA.a": 0, "NA.b": 1}, 23), - ({"NA.a": 1, "NA.b": 0}, 15), - ({"NA.a": 1, "NA.b": 1}, 25), - ], - ), - ], -) -@pytest.mark.parametrize("input_type", ["list", "array", "mixed"]) -def test_task_state_2( - plugin, - splitter, - state_splitter, - state_rpn, - expected, - expected_ind, - input_type, - tmp_path, -): - """Tasks with two inputs and a splitter (no combiner)""" - a_in, b_in = [3, 5], [10, 20] - if input_type == "array": - a_in, b_in = np.array(a_in), np.array(b_in) - elif input_type == "mixed": - a_in = np.array(a_in) - nn = fun_addvar(name="NA").split(splitter=splitter, a=a_in, b=b_in) - nn.cache_dir = tmp_path - - assert (nn.inputs.a == np.array([3, 5])).all() - assert (nn.inputs.b == np.array([10, 20])).all() - assert nn.state.splitter == state_splitter - assert nn.state.splitter_rpn == state_rpn - assert nn.state.splitter_final == state_splitter - assert nn.state.splitter_rpn_final == state_rpn - - with Submitter(plugin=plugin) as sub: - sub(nn) - - # checking the results - results = nn.result() - for i, res in enumerate(expected): - assert results[i].output.out == res[1] - - # checking the return_inputs option, either return_inputs is True or "val", - # it should give values of inputs that corresponds to the specific element - results_verb = nn.result(return_inputs=True) - results_verb_val = nn.result(return_inputs="val") - for i, res in enumerate(expected): - assert (results_verb[i][0], results_verb[i][1].output.out) == res - assert (results_verb_val[i][0], results_verb_val[i][1].output.out) == res - - # checking the return_inputs option return_inputs="ind" - # it should give indices of inputs (instead of values) for each element - results_verb_ind = nn.result(return_inputs="ind") - for i, res in enumerate(expected_ind): - assert (results_verb_ind[i][0], results_verb_ind[i][1].output.out) == res - - # checking the output_dir - assert nn.output_dir - for odir in nn.output_dir: - assert odir.exists() - - -def test_task_state_3(plugin, tmp_path): - """task with the simplest splitter, the input is an empty list""" - nn = fun_addtwo(name="NA").split(splitter="a", a=[]) - nn.cache_dir = tmp_path - - assert nn.state.splitter == "NA.a" - assert nn.state.splitter_rpn == ["NA.a"] - assert nn.inputs.a == [] - - with Submitter(plugin=plugin) as sub: - sub(nn) - - # checking the results - results = nn.result() - expected = [] - for i, res in enumerate(expected): - assert results[i].output.out == res[1] - # checking the output_dir - assert nn.output_dir == [] - - -@pytest.mark.parametrize("input_type", ["list", "array"]) -def test_task_state_4(plugin, input_type, tmp_path): - """task with a list as an input, and a simple splitter""" - lst_in = [[2, 3, 4], [1, 2, 3]] - if input_type == "array": - lst_in = np.array(lst_in, dtype=int) - nn = moment(name="NA", n=3).split(splitter="lst", lst=lst_in) - nn.cache_dir = tmp_path - - assert np.allclose(nn.inputs.n, 3) - assert np.allclose(nn.inputs.lst, [[2, 3, 4], [1, 2, 3]]) - assert nn.state.splitter == "NA.lst" - - with Submitter(plugin=plugin) as sub: - sub(nn) - - # checking that split is done across dim 0 - el_0 = nn.state.states_val[0]["NA.lst"] - if input_type == "list": - assert el_0 == [2, 3, 4] - elif input_type == "array": - assert el_0 == [2, 3, 4] - - # checking the results - results = nn.result() - for i, expected in enumerate([33, 12]): - assert results[i].output.out == expected - # checking the output_dir - assert 
nn.output_dir - for odir in nn.output_dir: - assert odir.exists() - - -def test_task_state_4a(plugin, tmp_path): - """task with a tuple as an input, and a simple splitter""" - nn = moment(name="NA", n=3).split(splitter="lst", lst=[(2, 3, 4), (1, 2, 3)]) - nn.cache_dir = tmp_path - - assert np.allclose(nn.inputs.n, 3) - assert np.allclose(nn.inputs.lst, [[2, 3, 4], [1, 2, 3]]) - assert nn.state.splitter == "NA.lst" - - with Submitter(plugin=plugin) as sub: - sub(nn) - - # checking the results - results = nn.result() - for i, expected in enumerate([33, 12]): - assert results[i].output.out == expected - # checking the output_dir - assert nn.output_dir - for odir in nn.output_dir: - assert odir.exists() - - -def test_task_state_5(plugin, tmp_path): - """task with a list as an input, and the variable is part of the scalar splitter""" - nn = moment(name="NA").split( - splitter=("n", "lst"), n=[1, 3], lst=[[2, 3, 4], [1, 2, 3]] - ) - nn.cache_dir = tmp_path - - assert np.allclose(nn.inputs.n, [1, 3]) - assert np.allclose(nn.inputs.lst, [[2, 3, 4], [1, 2, 3]]) - assert nn.state.splitter == ("NA.n", "NA.lst") - - with Submitter(plugin=plugin) as sub: - sub(nn) - - # checking the results - results = nn.result() - for i, expected in enumerate([3, 12]): - assert results[i].output.out == expected - # checking the output_dir - assert nn.output_dir - for odir in nn.output_dir: - assert odir.exists() - - -def test_task_state_5_exception(plugin, tmp_path): - """task with a list as an input, and the variable is part of the scalar splitter - the shapes are not matching, so exception should be raised - """ - nn = moment(name="NA").split( - splitter=("n", "lst"), n=[1, 3, 3], lst=[[2, 3, 4], [1, 2, 3]] - ) - nn.cache_dir = tmp_path - - assert np.allclose(nn.inputs.n, [1, 3, 3]) - assert np.allclose(nn.inputs.lst, [[2, 3, 4], [1, 2, 3]]) - assert nn.state.splitter == ("NA.n", "NA.lst") - - with pytest.raises(Exception) as excinfo: - with Submitter(plugin=plugin) as sub: - sub(nn) - assert "shape" in str(excinfo.value) - - -def test_task_state_6(plugin, tmp_path): - """ask with a list as an input, and the variable is part of the outer splitter""" - nn = moment(name="NA").split( - splitter=["n", "lst"], n=[1, 3], lst=[[2, 3, 4], [1, 2, 3]] - ) - nn.cache_dir = tmp_path - - assert np.allclose(nn.inputs.n, [1, 3]) - assert np.allclose(nn.inputs.lst, [[2, 3, 4], [1, 2, 3]]) - assert nn.state.splitter == ["NA.n", "NA.lst"] - - with Submitter(plugin=plugin) as sub: - sub(nn) - - # checking the results - results = nn.result() - for i, expected in enumerate([3, 2, 33, 12]): - assert results[i].output.out == expected - # checking the output_dir - assert nn.output_dir - for odir in nn.output_dir: - assert odir.exists() - - -def test_task_state_6a(plugin, tmp_path): - """ask with a tuple as an input, and the variable is part of the outer splitter""" - nn = moment(name="NA").split( - splitter=["n", "lst"], n=[1, 3], lst=[(2, 3, 4), (1, 2, 3)] - ) - nn.cache_dir = tmp_path - - assert np.allclose(nn.inputs.n, [1, 3]) - assert np.allclose(nn.inputs.lst, [[2, 3, 4], [1, 2, 3]]) - assert nn.state.splitter == ["NA.n", "NA.lst"] - - with Submitter(plugin=plugin) as sub: - sub(nn) - - # checking the results - results = nn.result() - for i, expected in enumerate([3, 2, 33, 12]): - assert results[i].output.out == expected - # checking the output_dir - assert nn.output_dir - for odir in nn.output_dir: - assert odir.exists() - - -@pytest.mark.flaky(reruns=2) # when dask -def test_task_state_comb_1(plugin_dask_opt, tmp_path): - """task 
with the simplest splitter and combiner""" - nn = fun_addtwo(name="NA").split(a=[3, 5], splitter="a").combine(combiner="a") - nn.cache_dir = tmp_path - - assert (nn.inputs.a == np.array([3, 5])).all() - - assert nn.state.splitter == "NA.a" - assert nn.state.splitter_rpn == ["NA.a"] - assert nn.state.combiner == ["NA.a"] - assert nn.state.splitter_final is None - assert nn.state.splitter_rpn_final == [] - - with Submitter(plugin=plugin_dask_opt) as sub: - sub(nn) - - assert nn.state.states_ind == [{"NA.a": 0}, {"NA.a": 1}] - assert nn.state.states_val == [{"NA.a": 3}, {"NA.a": 5}] - - # checking the results - results = nn.result() - # fully combined (no nested list) - combined_results = [res.output.out for res in results] - assert combined_results == [5, 7] - - expected = [({"NA.a": 3}, 5), ({"NA.a": 5}, 7)] - expected_ind = [({"NA.a": 0}, 5), ({"NA.a": 1}, 7)] - # checking the return_inputs option, either return_inputs is True or "val", - # it should give values of inputs that corresponds to the specific element - results_verb = nn.result(return_inputs=True) - results_verb_val = nn.result(return_inputs="val") - for i, res in enumerate(expected): - assert (results_verb[i][0], results_verb[i][1].output.out) == res - assert (results_verb_val[i][0], results_verb_val[i][1].output.out) == res - # checking the return_inputs option return_inputs="ind" - # it should give indices of inputs (instead of values) for each element - results_verb_ind = nn.result(return_inputs="ind") - for i, res in enumerate(expected_ind): - assert (results_verb_ind[i][0], results_verb_ind[i][1].output.out) == res - - # checking the output_dir - assert nn.output_dir - for odir in nn.output_dir: - assert odir.exists() - - -@pytest.mark.parametrize( - "splitter, combiner, state_splitter, state_rpn, state_combiner, state_combiner_all, " - "state_splitter_final, state_rpn_final, expected, expected_val", - [ - ( - ("a", "b"), - "a", - ("NA.a", "NA.b"), - ["NA.a", "NA.b", "."], - ["NA.a"], - ["NA.a", "NA.b"], - None, - [], - [13, 25], - [({"NA.a": 3, "NA.b": 10}, 13), ({"NA.a": 5, "NA.b": 20}, 25)], - ), - ( - ("a", "b"), - "b", - ("NA.a", "NA.b"), - ["NA.a", "NA.b", "."], - ["NA.b"], - ["NA.a", "NA.b"], - None, - [], - [13, 25], - [({"NA.a": 3, "NA.b": 10}, 13), ({"NA.a": 5, "NA.b": 20}, 25)], - ), - ( - ["a", "b"], - "a", - ["NA.a", "NA.b"], - ["NA.a", "NA.b", "*"], - ["NA.a"], - ["NA.a"], - "NA.b", - ["NA.b"], - [[13, 15], [23, 25]], - [ - [({"NA.a": 3, "NA.b": 10}, 13), ({"NA.a": 5, "NA.b": 10}, 15)], - [({"NA.a": 3, "NA.b": 20}, 23), ({"NA.a": 5, "NA.b": 20}, 25)], - ], - ), - ( - ["a", "b"], - "b", - ["NA.a", "NA.b"], - ["NA.a", "NA.b", "*"], - ["NA.b"], - ["NA.b"], - "NA.a", - ["NA.a"], - [[13, 23], [15, 25]], - [ - [({"NA.a": 3, "NA.b": 10}, 13), ({"NA.a": 3, "NA.b": 20}, 23)], - [({"NA.a": 5, "NA.b": 10}, 15), ({"NA.a": 5, "NA.b": 20}, 25)], - ], - ), - ( - ["a", "b"], - ["a", "b"], - ["NA.a", "NA.b"], - ["NA.a", "NA.b", "*"], - ["NA.a", "NA.b"], - ["NA.a", "NA.b"], - None, - [], - [13, 23, 15, 25], - [ - ({"NA.a": 3, "NA.b": 10}, 13), - ({"NA.a": 3, "NA.b": 20}, 23), - ({"NA.a": 5, "NA.b": 10}, 15), - ({"NA.a": 5, "NA.b": 20}, 25), - ], - ), - ], -) -def test_task_state_comb_2( - plugin, - splitter, - combiner, - state_splitter, - state_rpn, - state_combiner, - state_combiner_all, - state_splitter_final, - state_rpn_final, - expected, - expected_val, - tmp_path, -): - """Tasks with scalar and outer splitters and partial or full combiners""" - nn = ( - fun_addvar(name="NA") - .split(a=[3, 5], b=[10, 20], 
splitter=splitter) - .combine(combiner=combiner) - ) - nn.cache_dir = tmp_path - - assert (nn.inputs.a == np.array([3, 5])).all() - - assert nn.state.splitter == state_splitter - assert nn.state.splitter_rpn == state_rpn - assert nn.state.combiner == state_combiner - - with Submitter(plugin=plugin) as sub: - sub(nn) - - assert nn.state.splitter_final == state_splitter_final - assert nn.state.splitter_rpn_final == state_rpn_final - assert set(nn.state.current_combiner_all) == set(state_combiner_all) - - # checking the results - results = nn.result() - # checking the return_inputs option, either return_inputs is True or "val", - # it should give values of inputs that corresponds to the specific element - results_verb = nn.result(return_inputs=True) - - if nn.state.splitter_rpn_final: - for i, res in enumerate(expected): - assert [res.output.out for res in results[i]] == res - # results_verb - for i, res_l in enumerate(expected_val): - for j, res in enumerate(res_l): - assert (results_verb[i][j][0], results_verb[i][j][1].output.out) == res - # if the combiner is full expected is "a flat list" - else: - assert [res.output.out for res in results] == expected - for i, res in enumerate(expected_val): - assert (results_verb[i][0], results_verb[i][1].output.out) == res - - # checking the output_dir - assert nn.output_dir - for odir in nn.output_dir: - assert odir.exists() - - -def test_task_state_comb_singl_1(plugin, tmp_path): - """Tasks with two inputs; - one input is a single value, the other is in the splitter and combiner - """ - nn = fun_addvar(name="NA").split(splitter="a", a=[3, 5], b=10).combine(combiner="a") - nn.cache_dir = tmp_path - - assert nn.inputs.a == [3, 5] - assert nn.inputs.b == 10 - assert nn.state.splitter == "NA.a" - assert nn.state.splitter_rpn == ["NA.a"] - assert nn.state.combiner == ["NA.a"] - assert nn.state.splitter_final is None - assert nn.state.splitter_rpn_final == [] - - with Submitter(plugin=plugin) as sub: - sub(nn) - - # checking the results - expected = ({}, [13, 15]) - results = nn.result() - # full combiner, no nested list - combined_results = [res.output.out for res in results] - assert combined_results == expected[1] - # checking the output_dir - assert nn.output_dir - for odir in nn.output_dir: - assert odir.exists() - - -def test_task_state_comb_3(plugin, tmp_path): - """task with the simplest splitter, the input is an empty list""" - nn = fun_addtwo(name="NA").split(splitter="a", a=[]).combine(combiner=["a"]) - nn.cache_dir = tmp_path - - assert nn.state.splitter == "NA.a" - assert nn.state.splitter_rpn == ["NA.a"] - assert nn.inputs.a == [] - - with Submitter(plugin=plugin) as sub: - sub(nn) - - # checking the results - results = nn.result() - expected = [] - for i, res in enumerate(expected): - assert results[i].output.out == res[1] - # checking the output_dir - assert nn.output_dir == [] - - -def test_task_state_comb_order(): - """tasks with an outer splitter and various combiner; - showing the order of results - """ - - # single combiner "a" - will create two lists, first one for b=3, second for b=5 - nn_a = ( - fun_addvar(name="NA") - .split(splitter=["a", "b"], a=[10, 20], b=[3, 5]) - .combine(combiner="a") - ) - assert nn_a.state.combiner == ["NA.a"] - - results_a = nn_a() - combined_results_a = [[res.output.out for res in res_l] for res_l in results_a] - assert combined_results_a == [[13, 23], [15, 25]] - - # single combiner "b" - will create two lists, first one for a=10, second for a=20 - nn_b = ( - fun_addvar(name="NA") - 
.split(splitter=["a", "b"], a=[10, 20], b=[3, 5]) - .combine(combiner="b") - ) - assert nn_b.state.combiner == ["NA.b"] - - results_b = nn_b() - combined_results_b = [[res.output.out for res in res_l] for res_l in results_b] - assert combined_results_b == [[13, 15], [23, 25]] - - # combiner with both fields ["a", "b"] - will create one list - nn_ab = ( - fun_addvar(name="NA") - .split(splitter=["a", "b"], a=[10, 20], b=[3, 5]) - .combine(combiner=["a", "b"]) - ) - assert nn_ab.state.combiner == ["NA.a", "NA.b"] - - results_ab = nn_ab() - # full combiner, no nested list - combined_results_ab = [res.output.out for res in results_ab] - assert combined_results_ab == [13, 15, 23, 25] - - # combiner with both fields ["b", "a"] - will create the same list as nn_ab - # no difference in the order for setting combiner - nn_ba = ( - fun_addvar(name="NA") - .split(splitter=["a", "b"], a=[10, 20], b=[3, 5]) - .combine(combiner=["b", "a"]) - ) - assert nn_ba.state.combiner == ["NA.b", "NA.a"] - - results_ba = nn_ba() - combined_results_ba = [res.output.out for res in results_ba] - assert combined_results_ba == [13, 15, 23, 25] - - -# Testing with container dimensions for the input - - -def test_task_state_contdim_1(tmp_path): - """task with a spliter and container dimension for one of the value""" - task_4var = op_4var( - name="op_4var", - a="a1", - cache_dir=tmp_path, - ) - task_4var.split( - ("b", ["c", "d"]), - b=[["b1", "b2"], ["b3", "b4"]], - c=["c1", "c2"], - d=["d1", "d2"], - cont_dim={"b": 2}, - ) - task_4var() - res = task_4var.result() - assert len(res) == 4 - assert res[3].output.out == "a1 b4 c2 d2" - - -def test_task_state_contdim_2(tmp_path): - """task with a splitter and container dimension for one of the value""" - task_4var = op_4var( - name="op_4var", - cache_dir=tmp_path, - ) - task_4var.split( - ["a", ("b", ["c", "d"])], - cont_dim={"b": 2}, - a=["a1", "a2"], - b=[["b1", "b2"], ["b3", "b4"]], - c=["c1", "c2"], - d=["d1", "d2"], - ) - task_4var() - res = task_4var.result() - assert len(res) == 8 - assert res[7].output.out == "a2 b4 c2 d2" - - -def test_task_state_comb_contdim_1(tmp_path): - """task with a splitter-combiner, and container dimension for one of the value""" - task_4var = op_4var( - name="op_4var", - a="a1", - cache_dir=tmp_path, - ) - task_4var.split( - ("b", ["c", "d"]), - cont_dim={"b": 2}, - b=[["b1", "b2"], ["b3", "b4"]], - c=["c1", "c2"], - d=["d1", "d2"], - ).combine("b") - task_4var() - res = task_4var.result() - assert len(res) == 4 - assert res[3].output.out == "a1 b4 c2 d2" - - -def test_task_state_comb_contdim_2(tmp_path): - """task with a splitter-combiner, and container dimension for one of the value""" - task_4var = op_4var( - name="op_4var", - cache_dir=tmp_path, - ) - task_4var.split( - ["a", ("b", ["c", "d"])], - a=["a1", "a2"], - b=[["b1", "b2"], ["b3", "b4"]], - c=["c1", "c2"], - d=["d1", "d2"], - cont_dim={"b": 2}, - ).combine("a") - task_4var() - res = task_4var.result() - assert len(res) == 4 - assert res[3][1].output.out == "a2 b4 c2 d2" - - -# Testing caching for tasks with states - - -@pytest.mark.flaky(reruns=2) # when dask -def test_task_state_cachedir(plugin_dask_opt, tmp_path): - """task with a state and provided cache_dir using pytest tmp_path""" - cache_dir = tmp_path / "test_task_nostate" - cache_dir.mkdir() - nn = fun_addtwo(name="NA", cache_dir=cache_dir).split(splitter="a", a=[3, 5]) - - assert nn.state.splitter == "NA.a" - assert (nn.inputs.a == np.array([3, 5])).all() - - with Submitter(plugin=plugin_dask_opt) as sub: - sub(nn) - - # 
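# A minimal sketch of the split/combine round trip that the state tests above exercise:
# splitting over "a" fans the task out into one run per element, and combining over "a"
# collapses the results back into a flat list. `add_two` is a hypothetical helper
# standing in for fun_addtwo; the Submitter import path and the "cf" plugin name are
# assumed from the same pydra layout these tests target.
import pydra
from pydra.engine.submitter import Submitter


@pydra.mark.task
def add_two(a):
    return a + 2


nn = add_two(name="NA").split(splitter="a", a=[3, 5]).combine(combiner="a")
with Submitter(plugin="cf") as sub:
    sub(nn)

# fully combined, so the results come back as a flat list
assert [res.output.out for res in nn.result()] == [5, 7]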
checking the results - results = nn.result() - expected = [({"NA.a": 3}, 5), ({"NA.a": 5}, 7)] - for i, res in enumerate(expected): - assert results[i].output.out == res[1] - - -def test_task_state_cachelocations(plugin, tmp_path): - """ - Two identical tasks with a state and cache_dir; - the second task has cache_locations and should not recompute the results - """ - cache_dir = tmp_path / "test_task_nostate" - cache_dir.mkdir() - cache_dir2 = tmp_path / "test_task_nostate2" - cache_dir2.mkdir() - - nn = fun_addtwo(name="NA", a=3, cache_dir=cache_dir).split(splitter="a", a=[3, 5]) - with Submitter(plugin=plugin) as sub: - sub(nn) - - nn2 = fun_addtwo( - name="NA", a=3, cache_dir=cache_dir2, cache_locations=cache_dir - ).split(splitter="a", a=[3, 5]) - with Submitter(plugin=plugin) as sub: - sub(nn2) - - # checking the results - results2 = nn2.result() - expected = [({"NA.a": 3}, 5), ({"NA.a": 5}, 7)] - for i, res in enumerate(expected): - assert results2[i].output.out == res[1] - - assert all([dir.exists() for dir in nn.output_dir]) - assert not any([dir.exists() for dir in nn2.output_dir]) - - -def test_task_state_cachelocations_forcererun(plugin, tmp_path): - """ - Two identical tasks with a state and cache_dir; - the second task has cache_locations, - but submitter is called with rerun=True, so should recompute - """ - cache_dir = tmp_path / "test_task_nostate" - cache_dir.mkdir() - cache_dir2 = tmp_path / "test_task_nostate2" - cache_dir2.mkdir() - - nn = fun_addtwo(name="NA", a=3, cache_dir=cache_dir).split(splitter="a", a=[3, 5]) - with Submitter(plugin=plugin) as sub: - sub(nn) - - nn2 = fun_addtwo( - name="NA", a=3, cache_dir=cache_dir2, cache_locations=cache_dir - ).split(splitter="a", a=[3, 5]) - with Submitter(plugin=plugin) as sub: - sub(nn2, rerun=True) - - # checking the results - results2 = nn2.result() - expected = [({"NA.a": 3}, 5), ({"NA.a": 5}, 7)] - for i, res in enumerate(expected): - assert results2[i].output.out == res[1] - - # both workflows should be run - assert all([dir.exists() for dir in nn.output_dir]) - assert all([dir.exists() for dir in nn2.output_dir]) - - -def test_task_state_cachelocations_updated(plugin, tmp_path): - """ - Two identical tasks with states and cache_dir; - the second task has cache_locations in init, - that is later overwritten in Submitter.__call__; - the cache_locations from call doesn't exist so the second task should run again - """ - cache_dir = tmp_path / "test_task_nostate" - cache_dir.mkdir() - cache_dir1 = tmp_path / "test_task_nostate1" - cache_dir1.mkdir() - cache_dir2 = tmp_path / "test_task_nostate2" - cache_dir2.mkdir() - - nn = fun_addtwo(name="NA", cache_dir=cache_dir).split(splitter="a", a=[3, 5]) - with Submitter(plugin=plugin) as sub: - sub(nn) - - nn2 = fun_addtwo(name="NA", cache_dir=cache_dir2, cache_locations=cache_dir).split( - splitter="a", a=[3, 5] - ) - with Submitter(plugin=plugin) as sub: - sub(nn2, cache_locations=cache_dir1) - - # checking the results - results2 = nn2.result() - expected = [({"NA.a": 3}, 5), ({"NA.a": 5}, 7)] - for i, res in enumerate(expected): - assert results2[i].output.out == res[1] - - # both workflows should be run - assert all([dir.exists() for dir in nn.output_dir]) - assert all([dir.exists() for dir in nn2.output_dir]) - - -def test_task_files_cachelocations(plugin_dask_opt, tmp_path): - """ - Two identical tasks with provided cache_dir that use file as an input; - the second task has cache_locations and should not recompute the results - """ - cache_dir = tmp_path / 
"test_task_nostate" - cache_dir.mkdir() - cache_dir2 = tmp_path / "test_task_nostate2" - cache_dir2.mkdir() - input_dir = tmp_path / "input" - input_dir.mkdir() - - input1 = input_dir / "input1.txt" - input1.write_text("test") - input2 = input_dir / "input2.txt" - input2.write_text("test") - - nn = fun_file(name="NA", filename=input1, cache_dir=cache_dir) - with Submitter(plugin=plugin_dask_opt) as sub: - sub(nn) - - nn2 = fun_file( - name="NA", filename=input2, cache_dir=cache_dir2, cache_locations=cache_dir - ) - with Submitter(plugin=plugin_dask_opt) as sub: - sub(nn2) - - # checking the results - results2 = nn2.result() - assert results2.output.out == "test" - - # checking if the second task didn't run the interface again - assert nn.output_dir.exists() - assert not nn2.output_dir.exists() - - -class OverriddenContentsFile(File): - """A class for testing purposes, to that enables you to override the contents - of the file to allow you to check whether the persistent cache is used.""" - - def __init__( - self, - fspaths: ty.Iterator[Path], - contents: ty.Optional[bytes] = None, - metadata: ty.Dict[str, ty.Any] = None, - ): - super().__init__(fspaths, metadata=metadata) - self._contents = contents - - def byte_chunks(self, **kwargs) -> ty.Generator[ty.Tuple[str, bytes], None, None]: - if self._contents is not None: - yield (str(self.fspath), iter([self._contents])) - else: - yield from super().byte_chunks(**kwargs) - - @property - def contents(self): - if self._contents is not None: - return self._contents - return super().contents - - -def test_task_files_persistentcache(tmp_path): - """ - Two identical tasks with provided cache_dir that use file as an input; - the second task has cache_locations and should not recompute the results - """ - test_file_path = tmp_path / "test_file.txt" - test_file_path.write_bytes(b"foo") - cache_dir = tmp_path / "cache-dir" - cache_dir.mkdir() - test_file = OverriddenContentsFile(test_file_path) - - @pydra.mark.task - def read_contents(x: OverriddenContentsFile) -> bytes: - return x.contents - - assert ( - read_contents(x=test_file, cache_dir=cache_dir)(plugin="serial").output.out - == b"foo" - ) - test_file._contents = b"bar" - # should return result from the first run using the persistent cache - assert ( - read_contents(x=test_file, cache_dir=cache_dir)(plugin="serial").output.out - == b"foo" - ) - time.sleep(2) # Windows has a 2-second resolution for mtime - test_file_path.touch() # update the mtime to invalidate the persistent cache value - assert ( - read_contents(x=test_file, cache_dir=cache_dir)(plugin="serial").output.out - == b"bar" - ) # returns the overridden value diff --git a/pydra/engine/tests/test_numpy_examples.py b/pydra/engine/tests/test_numpy_examples.py deleted file mode 100644 index defdad7a2b..0000000000 --- a/pydra/engine/tests/test_numpy_examples.py +++ /dev/null @@ -1,102 +0,0 @@ -import typing as ty -import importlib -from pathlib import Path -import pickle as pk -import numpy as np -import pytest - - -from ..submitter import Submitter -from ..core import Workflow -from ...mark import task, annotate -from .utils import identity -from ...utils.hash import hash_function, Cache - -if importlib.util.find_spec("numpy") is None: - pytest.skip("can't find numpy library", allow_module_level=True) - - -@task -@annotate({"return": {"b": ty.Any}}) -def arrayout(val): - return np.array([val, val]) - - -def test_multiout(tmpdir): - """testing a simple function that returns a numpy array""" - wf = Workflow("wf", input_spec=["val"], val=2) - 
wf.add(arrayout(name="mo", val=wf.lzin.val)) - - wf.set_output([("array", wf.mo.lzout.b)]) - wf.cache_dir = tmpdir - - with Submitter(plugin="cf", n_procs=2) as sub: - sub(runnable=wf) - - results = wf.result(return_inputs=True) - - assert results[0] == {"wf.val": 2} - assert np.array_equal(results[1].output.array, np.array([2, 2])) - - -def test_multiout_st(tmpdir): - """testing a simple function that returns a numpy array, adding splitter""" - wf = Workflow("wf", input_spec=["val"], val=[0, 1, 2]) - wf.add(arrayout(name="mo")) - wf.mo.split("val", val=wf.lzin.val).combine("val") - - wf.set_output([("array", wf.mo.lzout.b)]) - wf.cache_dir = tmpdir - - with Submitter(plugin="cf", n_procs=2) as sub: - sub(runnable=wf) - - results = wf.result(return_inputs=True) - - assert results[0] == {"wf.val": [0, 1, 2]} - for el in range(3): - assert np.array_equal(results[1].output.array[el], np.array([el, el])) - - -def test_numpy_hash_1(): - """hashing check for numeric numpy array""" - A = np.array([1, 2]) - A_pk = pk.loads(pk.dumps(A)) - assert (A == A_pk).all() - assert hash_function(A) == hash_function(A_pk) - - -def test_numpy_hash_2(): - """hashing check for numpy array of type object""" - A = np.array([["NDAR"]], dtype=object) - A_pk = pk.loads(pk.dumps(A)) - assert (A == A_pk).all() - assert hash_function(A) == hash_function(A_pk) - - -def test_numpy_hash_3(): - """hashing check for numeric numpy array""" - A = np.array([1, 2]) - B = np.array([3, 4]) - assert hash_function(A) != hash_function(B) - - -def test_task_numpyinput_1(tmp_path: Path): - """task with numeric numpy array as an input""" - nn = identity(name="NA") - nn.cache_dir = tmp_path - nn.split(x=[np.array([1, 2]), np.array([3, 4])]) - # checking the results - results = nn() - assert (results[0].output.out == np.array([1, 2])).all() - assert (results[1].output.out == np.array([3, 4])).all() - - -def test_task_numpyinput_2(tmp_path: Path): - """task with numpy array of type object as an input""" - nn = identity(name="NA") - nn.cache_dir = tmp_path - nn.split(x=[np.array(["VAL1"], dtype=object), np.array(["VAL2"], dtype=object)]) - # checking the results - results = nn() - assert (results[0].output.out == np.array(["VAL1"], dtype=object)).all() diff --git a/pydra/engine/tests/test_profiles.py b/pydra/engine/tests/test_profiles.py index f84f8d19f4..ddae2fe725 100644 --- a/pydra/engine/tests/test_profiles.py +++ b/pydra/engine/tests/test_profiles.py @@ -1,29 +1,25 @@ -from ..core import Workflow -from ..helpers import load_task -from ... 
import mark - +from pydra.compose import python, workflow import numpy as np from pympler import asizeof from pytest import approx -def generate_list(l): - return np.arange(l).tolist() +def generate_list(n): + return np.arange(n).tolist() -@mark.task -def show_var(a): +@python.define +def ShowVar(a): return a def create_wf(size): - wf = Workflow(name="wf", input_spec=["x"]) - wf.split("x", x=generate_list(size)) - wf.add(show_var(name="show", a=wf.lzin.x)) - wf.set_output([("out", wf.show.lzout.out)]) - wf.state.prepare_states(wf.inputs) - wf.state.prepare_inputs() - return wf + @workflow.define + def Workflow(x): + show = workflow.add(ShowVar(a=x)) + return show.out + + return Workflow().split(x=generate_list(size)) def test_wf_memory(): @@ -31,35 +27,15 @@ def test_wf_memory(): testings if the size of workflow grows linearly """ - wf_1000 = create_wf(size=1000) - wf_1000_mem = asizeof.asizeof(wf_1000) + wf_10000 = create_wf(size=10000) + wf_10000_mem = asizeof.asizeof(wf_10000) - wf_2000 = create_wf(size=2000) - wf_2000_mem = asizeof.asizeof(wf_2000) + wf_20000 = create_wf(size=20000) + wf_20000_mem = asizeof.asizeof(wf_20000) - wf_4000 = create_wf(size=4000) - wf_4000_mem = asizeof.asizeof(wf_4000) + wf_40000 = create_wf(size=40000) + wf_40000_mem = asizeof.asizeof(wf_40000) # checking if it's linear with the size of the splitter # check print(asizeof.asized(wf_4000, detail=2).format()) in case of problems - assert wf_4000_mem / wf_2000_mem == approx(2, 0.05) - assert wf_2000_mem / wf_1000_mem == approx(2, 0.05) - - -def test_load_task_memory(): - """creating two workflow with relatively big splitter: 1000 and 4000 elements - testings if load_task for a single element returns tasks of a similar size - """ - - wf_1000 = create_wf(size=1000) - wf_1000_pkl = wf_1000.pickle_task() - wf_1000_loaded = load_task(task_pkl=wf_1000_pkl, ind=1) - wf_1000_single_mem = asizeof.asizeof(wf_1000_loaded) - - wf_4000 = create_wf(size=4000) - wf_4000_pkl = wf_4000.pickle_task() - wf_4000_loaded = load_task(task_pkl=wf_4000_pkl, ind=1) - wf_4000_single_mem = asizeof.asizeof(wf_4000_loaded) - - # checking if it doesn't change with size of the splitter - # check print(asizeof.asized(wf_4000_loaded, detail=2).format()) in case of problems - assert wf_1000_single_mem / wf_4000_single_mem == approx(1, 0.05) + assert wf_40000_mem / wf_20000_mem == approx(2, 0.05) + assert wf_20000_mem / wf_10000_mem == approx(2, 0.05) diff --git a/pydra/engine/tests/test_result.py b/pydra/engine/tests/test_result.py new file mode 100644 index 0000000000..a05db33cdb --- /dev/null +++ b/pydra/engine/tests/test_result.py @@ -0,0 +1,16 @@ +from pydra.engine.result import Result, Runtime + + +def test_runtime(): + runtime = Runtime() + assert hasattr(runtime, "rss_peak_gb") + assert hasattr(runtime, "vms_peak_gb") + assert hasattr(runtime, "cpu_peak_percent") + + +def test_result(tmp_path): + result = Result(cache_dir=tmp_path) + assert hasattr(result, "runtime") + assert hasattr(result, "outputs") + assert hasattr(result, "errored") + assert getattr(result, "errored") is False diff --git a/pydra/engine/tests/test_shelltask.py b/pydra/engine/tests/test_shelltask.py deleted file mode 100644 index 4857db094f..0000000000 --- a/pydra/engine/tests/test_shelltask.py +++ /dev/null @@ -1,5060 +0,0 @@ -import attr -import typing as ty -import os, sys -import subprocess as sp -import pytest -from pathlib import Path -import re -import stat - -from ..task import ShellCommandTask -from ..submitter import Submitter -from ..core import Workflow 
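# A minimal sketch of the newer pydra.compose API that the rewritten test_profiles.py
# relies on: python.define turns a plain function into a task definition,
# workflow.define builds a workflow from added nodes, and split() on the workflow
# definition replaces the old Workflow.split call. AddTwo and MyWorkflow are
# hypothetical names used only for this sketch.
from pydra.compose import python, workflow


@python.define
def AddTwo(a):
    return a + 2


@workflow.define
def MyWorkflow(x):
    node = workflow.add(AddTwo(a=x))
    return node.out


wf = MyWorkflow().split(x=[1, 2, 3])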
-from ..specs import ( - ShellOutSpec, - ShellSpec, - SpecInfo, - File, - Directory, - MultiInputFile, - MultiOutputFile, - MultiInputObj, -) -from .utils import result_no_submitter, result_submitter, no_win - -if sys.platform.startswith("win"): - pytest.skip("SLURM not available in windows", allow_module_level=True) - - -@pytest.mark.flaky(reruns=2) # when dask -@pytest.mark.parametrize("results_function", [result_no_submitter, result_submitter]) -def test_shell_cmd_1(plugin_dask_opt, results_function, tmp_path): - """simple command, no arguments""" - cmd = ["pwd"] - shelly = ShellCommandTask(name="shelly", executable=cmd, cache_dir=tmp_path) - assert shelly.cmdline == " ".join(cmd) - - res = results_function(shelly, plugin=plugin_dask_opt) - assert Path(res.output.stdout.rstrip()) == shelly.output_dir - assert res.output.return_code == 0 - assert res.output.stderr == "" - - -@pytest.mark.parametrize("results_function", [result_no_submitter, result_submitter]) -def test_shell_cmd_1_strip(plugin, results_function, tmp_path): - """simple command, no arguments - strip option to remove \n at the end os stdout - """ - cmd = ["pwd"] - shelly = ShellCommandTask(name="shelly", executable=cmd, strip=True) - shelly.cache_dir = tmp_path - assert shelly.cmdline == " ".join(cmd) - - res = results_function(shelly, plugin) - assert Path(res.output.stdout) == Path(shelly.output_dir) - assert res.output.return_code == 0 - assert res.output.stderr == "" - - -@pytest.mark.parametrize("results_function", [result_no_submitter, result_submitter]) -def test_shell_cmd_2(plugin, results_function, tmp_path): - """a command with arguments, cmd and args given as executable""" - cmd = ["echo", "hail", "pydra"] - shelly = ShellCommandTask(name="shelly", executable=cmd) - shelly.cache_dir = tmp_path - assert shelly.cmdline == " ".join(cmd) - - res = results_function(shelly, plugin) - assert res.output.stdout.strip() == " ".join(cmd[1:]) - assert res.output.return_code == 0 - assert res.output.stderr == "" - - -@pytest.mark.parametrize("results_function", [result_no_submitter, result_submitter]) -def test_shell_cmd_2a(plugin, results_function, tmp_path): - """a command with arguments, using executable and args""" - cmd_exec = "echo" - cmd_args = ["hail", "pydra"] - # separate command into exec + args - shelly = ShellCommandTask(name="shelly", executable=cmd_exec, args=cmd_args) - shelly.cache_dir = tmp_path - assert shelly.inputs.executable == "echo" - assert shelly.cmdline == "echo " + " ".join(cmd_args) - - res = results_function(shelly, plugin) - assert res.output.stdout.strip() == " ".join(cmd_args) - assert res.output.return_code == 0 - assert res.output.stderr == "" - - -@pytest.mark.parametrize("results_function", [result_no_submitter, result_submitter]) -def test_shell_cmd_2b(plugin, results_function, tmp_path): - """a command with arguments, using strings executable and args""" - cmd_exec = "echo" - cmd_args = "pydra" - # separate command into exec + args - shelly = ShellCommandTask(name="shelly", executable=cmd_exec, args=cmd_args) - shelly.cache_dir = tmp_path - assert shelly.inputs.executable == "echo" - assert shelly.cmdline == "echo pydra" - - res = results_function(shelly, plugin) - assert res.output.stdout == "pydra\n" - assert res.output.return_code == 0 - assert res.output.stderr == "" - - -# tests with State - - -@pytest.mark.flaky(reruns=2) -def test_shell_cmd_3(plugin_dask_opt, tmp_path): - """commands without arguments - splitter = executable - """ - cmd = ["pwd", "whoami"] - - # all args given as 
executable - shelly = ShellCommandTask(name="shelly").split("executable", executable=cmd) - shelly.cache_dir = tmp_path - - # assert shelly.cmdline == ["pwd", "whoami"] - res = shelly(plugin=plugin_dask_opt) - assert Path(res[0].output.stdout.rstrip()) == shelly.output_dir[0] - - if "USER" in os.environ: - assert res[1].output.stdout == f"{os.environ['USER']}\n" - else: - assert res[1].output.stdout - assert res[0].output.return_code == res[1].output.return_code == 0 - assert res[0].output.stderr == res[1].output.stderr == "" - - -def test_shell_cmd_4(plugin, tmp_path): - """a command with arguments, using executable and args - splitter=args - """ - cmd_exec = "echo" - cmd_args = ["nipype", "pydra"] - # separate command into exec + args - shelly = ShellCommandTask(name="shelly", executable=cmd_exec).split( - splitter="args", args=cmd_args - ) - shelly.cache_dir = tmp_path - - assert shelly.inputs.executable == "echo" - assert shelly.inputs.args == ["nipype", "pydra"] - # assert shelly.cmdline == ["echo nipype", "echo pydra"] - res = shelly(plugin=plugin) - - assert res[0].output.stdout == "nipype\n" - assert res[1].output.stdout == "pydra\n" - - assert res[0].output.return_code == res[1].output.return_code == 0 - assert res[0].output.stderr == res[1].output.stderr == "" - - -def test_shell_cmd_5(plugin, tmp_path): - """a command with arguments - using splitter and combiner for args - """ - cmd_exec = "echo" - cmd_args = ["nipype", "pydra"] - # separate command into exec + args - shelly = ( - ShellCommandTask(name="shelly", executable=cmd_exec) - .split(splitter="args", args=cmd_args) - .combine("args") - ) - shelly.cache_dir = tmp_path - - assert shelly.inputs.executable == "echo" - assert shelly.inputs.args == ["nipype", "pydra"] - # assert shelly.cmdline == ["echo nipype", "echo pydra"] - res = shelly(plugin=plugin) - - assert res[0].output.stdout == "nipype\n" - assert res[1].output.stdout == "pydra\n" - - -def test_shell_cmd_6(plugin, tmp_path): - """a command with arguments, - outer splitter for executable and args - """ - cmd_exec = ["echo", ["echo", "-n"]] - cmd_args = ["nipype", "pydra"] - # separate command into exec + args - shelly = ShellCommandTask(name="shelly").split( - splitter=["executable", "args"], executable=cmd_exec, args=cmd_args - ) - shelly.cache_dir = tmp_path - - assert shelly.inputs.executable == ["echo", ["echo", "-n"]] - assert shelly.inputs.args == ["nipype", "pydra"] - # assert shelly.cmdline == [ - # "echo nipype", - # "echo pydra", - # "echo -n nipype", - # "echo -n pydra", - # ] - res = shelly(plugin=plugin) - - assert res[0].output.stdout == "nipype\n" - assert res[1].output.stdout == "pydra\n" - assert res[2].output.stdout == "nipype" - assert res[3].output.stdout == "pydra" - - assert ( - res[0].output.return_code - == res[1].output.return_code - == res[2].output.return_code - == res[3].output.return_code - == 0 - ) - assert ( - res[0].output.stderr - == res[1].output.stderr - == res[2].output.stderr - == res[3].output.stderr - == "" - ) - - -def test_shell_cmd_7(plugin, tmp_path): - """a command with arguments, - outer splitter for executable and args, and combiner=args - """ - cmd_exec = ["echo", ["echo", "-n"]] - cmd_args = ["nipype", "pydra"] - # separate command into exec + args - shelly = ( - ShellCommandTask(name="shelly") - .split(splitter=["executable", "args"], executable=cmd_exec, args=cmd_args) - .combine("args") - ) - shelly.cache_dir = tmp_path - - assert shelly.inputs.executable == ["echo", ["echo", "-n"]] - assert shelly.inputs.args == 
["nipype", "pydra"] - - res = shelly(plugin=plugin) - - assert res[0][0].output.stdout == "nipype\n" - assert res[0][1].output.stdout == "pydra\n" - - assert res[1][0].output.stdout == "nipype" - assert res[1][1].output.stdout == "pydra" - - -# tests with workflows - - -def test_wf_shell_cmd_1(plugin, tmp_path): - """a workflow with two connected commands""" - wf = Workflow(name="wf", input_spec=["cmd1", "cmd2"]) - wf.inputs.cmd1 = "pwd" - wf.inputs.cmd2 = "ls" - wf.add(ShellCommandTask(name="shelly_pwd", executable=wf.lzin.cmd1, strip=True)) - wf.add( - ShellCommandTask( - name="shelly_ls", executable=wf.lzin.cmd2, args=wf.shelly_pwd.lzout.stdout - ) - ) - - wf.set_output([("out", wf.shelly_ls.lzout.stdout)]) - wf.cache_dir = tmp_path - - with Submitter(plugin=plugin) as sub: - wf(submitter=sub) - - res = wf.result() - assert "_result.pklz" in res.output.out - assert "_task.pklz" in res.output.out - - -# customised input spec - - -@pytest.mark.parametrize("results_function", [result_no_submitter, result_submitter]) -def test_shell_cmd_inputspec_1(plugin, results_function, tmp_path): - """a command with executable, args and one command opt, - using a customized input_spec to add the opt to the command - in the right place that is specified in metadata["cmd_pos"] - """ - cmd_exec = "echo" - cmd_opt = True - cmd_args = "hello from pydra" - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "opt_n", - attr.ib( - type=bool, - metadata={"position": 1, "argstr": "-n", "help_string": "option"}, - ), - ) - ], - bases=(ShellSpec,), - ) - - # separate command into exec + args - shelly = ShellCommandTask( - name="shelly", - executable=cmd_exec, - args=cmd_args, - opt_n=cmd_opt, - input_spec=my_input_spec, - cache_dir=tmp_path, - ) - assert shelly.inputs.executable == cmd_exec - assert shelly.inputs.args == cmd_args - assert shelly.cmdline == "echo -n 'hello from pydra'" - - res = results_function(shelly, plugin) - assert res.output.stdout == "hello from pydra" - - -@pytest.mark.parametrize("results_function", [result_no_submitter, result_submitter]) -def test_shell_cmd_inputspec_2(plugin, results_function, tmp_path): - """a command with executable, args and two command options, - using a customized input_spec to add the opt to the command - in the right place that is specified in metadata["cmd_pos"] - """ - cmd_exec = "echo" - cmd_opt = True - cmd_opt_hello = "HELLO" - cmd_args = "from pydra" - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "opt_hello", - attr.ib( - type=str, - metadata={"position": 3, "help_string": "todo", "argstr": ""}, - ), - ), - ( - "opt_n", - attr.ib( - type=bool, - metadata={"position": 1, "help_string": "todo", "argstr": "-n"}, - ), - ), - ], - bases=(ShellSpec,), - ) - - # separate command into exec + args - shelly = ShellCommandTask( - name="shelly", - executable=cmd_exec, - args=cmd_args, - opt_n=cmd_opt, - opt_hello=cmd_opt_hello, - input_spec=my_input_spec, - cache_dir=tmp_path, - ) - assert shelly.inputs.executable == cmd_exec - assert shelly.inputs.args == cmd_args - assert shelly.cmdline == "echo -n HELLO 'from pydra'" - res = results_function(shelly, plugin) - assert res.output.stdout == "HELLO from pydra" - - -@pytest.mark.parametrize("results_function", [result_no_submitter, result_submitter]) -def test_shell_cmd_inputspec_3(plugin, results_function, tmp_path): - """mandatory field added to fields, value provided""" - cmd_exec = "echo" - hello = "HELLO" - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "text", - attr.ib( - type=str, 
- metadata={ - "position": 1, - "help_string": "text", - "mandatory": True, - "argstr": "", - }, - ), - ) - ], - bases=(ShellSpec,), - ) - - # separate command into exec + args - shelly = ShellCommandTask( - name="shelly", - executable=cmd_exec, - text=hello, - input_spec=my_input_spec, - cache_dir=tmp_path, - ) - assert shelly.inputs.executable == cmd_exec - assert shelly.cmdline == "echo HELLO" - res = results_function(shelly, plugin) - assert res.output.stdout == "HELLO\n" - - -@pytest.mark.parametrize("results_function", [result_no_submitter, result_submitter]) -def test_shell_cmd_inputspec_3a(plugin, results_function, tmp_path): - """mandatory field added to fields, value provided - using shorter syntax for input spec (no attr.ib) - """ - cmd_exec = "echo" - hello = "HELLO" - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "text", - str, - {"position": 1, "help_string": "text", "mandatory": True, "argstr": ""}, - ) - ], - bases=(ShellSpec,), - ) - - # separate command into exec + args - shelly = ShellCommandTask( - name="shelly", - executable=cmd_exec, - text=hello, - input_spec=my_input_spec, - cache_dir=tmp_path, - ) - assert shelly.inputs.executable == cmd_exec - assert shelly.cmdline == "echo HELLO" - res = results_function(shelly, plugin) - assert res.output.stdout == "HELLO\n" - - -@pytest.mark.parametrize("results_function", [result_no_submitter, result_submitter]) -def test_shell_cmd_inputspec_3b(plugin, results_function, tmp_path): - """mandatory field added to fields, value provided after init""" - cmd_exec = "echo" - hello = "HELLO" - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "text", - attr.ib( - type=str, - metadata={ - "position": 1, - "help_string": "text", - "mandatory": True, - "argstr": "", - }, - ), - ) - ], - bases=(ShellSpec,), - ) - - # separate command into exec + args - shelly = ShellCommandTask( - name="shelly", executable=cmd_exec, input_spec=my_input_spec, cache_dir=tmp_path - ) - shelly.inputs.text = hello - - assert shelly.inputs.executable == cmd_exec - assert shelly.cmdline == "echo HELLO" - res = results_function(shelly, plugin) - assert res.output.stdout == "HELLO\n" - - -def test_shell_cmd_inputspec_3c_exception(plugin, tmp_path): - """mandatory field added to fields, value is not provided, so exception is raised""" - cmd_exec = "echo" - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "text", - attr.ib( - type=str, - metadata={ - "position": 1, - "help_string": "text", - "mandatory": True, - "argstr": "", - }, - ), - ) - ], - bases=(ShellSpec,), - ) - - shelly = ShellCommandTask( - name="shelly", executable=cmd_exec, input_spec=my_input_spec, cache_dir=tmp_path - ) - - with pytest.raises(Exception) as excinfo: - shelly() - assert "mandatory" in str(excinfo.value) - - -@pytest.mark.parametrize("results_function", [result_no_submitter, result_submitter]) -def test_shell_cmd_inputspec_3c(plugin, results_function, tmp_path): - """mandatory=False, so tasks runs fine even without the value""" - cmd_exec = "echo" - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "text", - attr.ib( - type=ty.Optional[str], - default=None, - metadata={ - "position": 1, - "help_string": "text", - "mandatory": False, - "argstr": "", - }, - ), - ) - ], - bases=(ShellSpec,), - ) - - # separate command into exec + args - shelly = ShellCommandTask( - name="shelly", executable=cmd_exec, input_spec=my_input_spec, cache_dir=tmp_path - ) - - assert shelly.inputs.executable == cmd_exec - assert shelly.cmdline == "echo" - res = 
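# A minimal sketch of the customised input_spec pattern the surrounding shell-task
# tests build repeatedly: a SpecInfo whose attr.ib fields carry metadata (position,
# argstr, help_string, mandatory) that controls how the command line is assembled.
# Import paths assume the same pydra layout as these tests; values mirror the
# "echo HELLO" case shown above.
import attr
from pydra.engine.specs import ShellSpec, SpecInfo
from pydra.engine.task import ShellCommandTask

my_input_spec = SpecInfo(
    name="Input",
    fields=[
        (
            "text",
            attr.ib(
                type=str,
                metadata={
                    "position": 1,
                    "argstr": "",
                    "help_string": "text",
                    "mandatory": True,
                },
            ),
        )
    ],
    bases=(ShellSpec,),
)

shelly = ShellCommandTask(
    name="shelly", executable="echo", text="HELLO", input_spec=my_input_spec
)
assert shelly.cmdline == "echo HELLO"  # the field is injected at position 1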
results_function(shelly, plugin) - assert res.output.stdout == "\n" - - -@pytest.mark.parametrize("results_function", [result_no_submitter, result_submitter]) -def test_shell_cmd_inputspec_4(plugin, results_function, tmp_path): - """mandatory field added to fields, value provided""" - cmd_exec = "echo" - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "text", - attr.ib( - type=str, - default="Hello", - metadata={"position": 1, "help_string": "text", "argstr": ""}, - ), - ) - ], - bases=(ShellSpec,), - ) - - # separate command into exec + args - shelly = ShellCommandTask( - name="shelly", executable=cmd_exec, input_spec=my_input_spec, cache_dir=tmp_path - ) - - assert shelly.inputs.executable == cmd_exec - assert shelly.cmdline == "echo Hello" - - res = results_function(shelly, plugin) - assert res.output.stdout == "Hello\n" - - -@pytest.mark.parametrize("results_function", [result_no_submitter, result_submitter]) -def test_shell_cmd_inputspec_4a(plugin, results_function, tmp_path): - """mandatory field added to fields, value provided - using shorter syntax for input spec (no attr.ib) - """ - cmd_exec = "echo" - my_input_spec = SpecInfo( - name="Input", - fields=[ - ("text", str, "Hello", {"position": 1, "help_string": "text", "argstr": ""}) - ], - bases=(ShellSpec,), - ) - - # separate command into exec + args - shelly = ShellCommandTask( - name="shelly", executable=cmd_exec, input_spec=my_input_spec, cache_dir=tmp_path - ) - - assert shelly.inputs.executable == cmd_exec - assert shelly.cmdline == "echo Hello" - - res = results_function(shelly, plugin) - assert res.output.stdout == "Hello\n" - - -@pytest.mark.parametrize("results_function", [result_no_submitter, result_submitter]) -def test_shell_cmd_inputspec_4b(plugin, results_function, tmp_path): - """mandatory field added to fields, value provided""" - cmd_exec = "echo" - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "text", - attr.ib( - type=str, - default="Hi", - metadata={"position": 1, "help_string": "text", "argstr": ""}, - ), - ) - ], - bases=(ShellSpec,), - ) - - # separate command into exec + args - shelly = ShellCommandTask( - name="shelly", executable=cmd_exec, input_spec=my_input_spec, cache_dir=tmp_path - ) - - assert shelly.inputs.executable == cmd_exec - assert shelly.cmdline == "echo Hi" - - res = results_function(shelly, plugin) - assert res.output.stdout == "Hi\n" - - -def test_shell_cmd_inputspec_4c_exception(plugin): - """mandatory field added to fields, value provided""" - cmd_exec = "echo" - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "text", - attr.ib( - type=str, - default="Hello", - metadata={ - "position": 1, - "help_string": "text", - "mandatory": True, - "argstr": "", - }, - ), - ) - ], - bases=(ShellSpec,), - ) - - # separate command into exec + args - with pytest.raises( - Exception, match=r"default value \('Hello'\) should not be set when the field" - ): - ShellCommandTask(name="shelly", executable=cmd_exec, input_spec=my_input_spec) - - -def test_shell_cmd_inputspec_4d_exception(plugin): - """mandatory field added to fields, value provided""" - cmd_exec = "echo" - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "text", - attr.ib( - type=str, - default="Hello", - metadata={ - "position": 1, - "help_string": "text", - "output_file_template": "exception", - "argstr": "", - }, - ), - ) - ], - bases=(ShellSpec,), - ) - - # separate command into exec + args - with pytest.raises( - Exception, match=r"default value \('Hello'\) should not be set together" - ) as 
excinfo: - ShellCommandTask(name="shelly", executable=cmd_exec, input_spec=my_input_spec) - - -@pytest.mark.parametrize("results_function", [result_no_submitter, result_submitter]) -def test_shell_cmd_inputspec_5_nosubm(plugin, results_function, tmp_path): - """checking xor in metadata: task should work fine, since only one option is True""" - cmd_exec = "ls" - cmd_t = True - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "opt_t", - attr.ib( - type=bool, - metadata={ - "position": 1, - "help_string": "opt t", - "argstr": "-t", - "xor": ["opt_S"], - }, - ), - ), - ( - "opt_S", - attr.ib( - type=bool, - metadata={ - "position": 2, - "help_string": "opt S", - "argstr": "-S", - "xor": ["opt_t"], - }, - ), - ), - ], - bases=(ShellSpec,), - ) - - # separate command into exec + args - shelly = ShellCommandTask( - name="shelly", - executable=cmd_exec, - opt_t=cmd_t, - input_spec=my_input_spec, - cache_dir=tmp_path, - ) - assert shelly.inputs.executable == cmd_exec - assert shelly.cmdline == "ls -t" - results_function(shelly, plugin) - - -def test_shell_cmd_inputspec_5a_exception(plugin, tmp_path): - """checking xor in metadata: both options are True, so the task raises exception""" - cmd_exec = "ls" - cmd_t = True - cmd_S = True - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "opt_t", - attr.ib( - type=bool, - metadata={ - "position": 1, - "help_string": "opt t", - "argstr": "-t", - "xor": ["opt_S"], - }, - ), - ), - ( - "opt_S", - attr.ib( - type=bool, - metadata={ - "position": 2, - "help_string": "opt S", - "argstr": "-S", - "xor": ["opt_t"], - }, - ), - ), - ], - bases=(ShellSpec,), - ) - - shelly = ShellCommandTask( - name="shelly", - executable=cmd_exec, - opt_t=cmd_t, - opt_S=cmd_S, - input_spec=my_input_spec, - cache_dir=tmp_path, - ) - with pytest.raises(Exception) as excinfo: - shelly() - assert "is mutually exclusive" in str(excinfo.value) - - -@pytest.mark.parametrize("results_function", [result_no_submitter, result_submitter]) -def test_shell_cmd_inputspec_6(plugin, results_function, tmp_path): - """checking requires in metadata: - the required field is set in the init, so the task works fine - """ - cmd_exec = "ls" - cmd_l = True - cmd_t = True - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "opt_t", - attr.ib( - type=bool, - metadata={ - "position": 2, - "help_string": "opt t", - "argstr": "-t", - "requires": ["opt_l"], - }, - ), - ), - ( - "opt_l", - attr.ib( - type=bool, - metadata={"position": 1, "help_string": "opt l", "argstr": "-l"}, - ), - ), - ], - bases=(ShellSpec,), - ) - - # separate command into exec + args - shelly = ShellCommandTask( - name="shelly", - executable=cmd_exec, - opt_t=cmd_t, - opt_l=cmd_l, - input_spec=my_input_spec, - cache_dir=tmp_path, - ) - assert shelly.inputs.executable == cmd_exec - assert shelly.cmdline == "ls -l -t" - results_function(shelly, plugin) - - -def test_shell_cmd_inputspec_6a_exception(plugin): - """checking requires in metadata: - the required field is None, so the task works raises exception - """ - cmd_exec = "ls" - cmd_t = True - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "opt_t", - attr.ib( - type=bool, - metadata={ - "position": 2, - "help_string": "opt t", - "argstr": "-t", - "requires": ["opt_l"], - }, - ), - ), - ( - "opt_l", - attr.ib( - type=bool, - metadata={"position": 1, "help_string": "opt l", "argstr": "-l"}, - ), - ), - ], - bases=(ShellSpec,), - ) - - shelly = ShellCommandTask( - name="shelly", executable=cmd_exec, opt_t=cmd_t, input_spec=my_input_spec - ) - with 
pytest.raises(Exception) as excinfo: - shelly() - assert "requires" in str(excinfo.value) - - -@pytest.mark.parametrize("results_function", [result_no_submitter, result_submitter]) -def test_shell_cmd_inputspec_6b(plugin, results_function, tmp_path): - """checking requires in metadata: - the required field set after the init - """ - cmd_exec = "ls" - cmd_l = True - cmd_t = True - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "opt_t", - attr.ib( - type=bool, - metadata={ - "position": 2, - "help_string": "opt t", - "argstr": "-t", - "requires": ["opt_l"], - }, - ), - ), - ( - "opt_l", - attr.ib( - type=bool, - metadata={"position": 1, "help_string": "opt l", "argstr": "-l"}, - ), - ), - ], - bases=(ShellSpec,), - ) - - # separate command into exec + args - shelly = ShellCommandTask( - name="shelly", - executable=cmd_exec, - opt_t=cmd_t, - # opt_l=cmd_l, - input_spec=my_input_spec, - cache_dir=tmp_path, - ) - shelly.inputs.opt_l = cmd_l - assert shelly.inputs.executable == cmd_exec - assert shelly.cmdline == "ls -l -t" - results_function(shelly, plugin) - - -@pytest.mark.parametrize("results_function", [result_no_submitter, result_submitter]) -def test_shell_cmd_inputspec_7(plugin, results_function, tmp_path): - """ - providing output name using input_spec, - using name_tamplate in metadata - """ - cmd = "touch" - args = "newfile_tmp.txt" - - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "out1", - attr.ib( - type=str, - metadata={ - "output_file_template": "{args}", - "help_string": "output file", - }, - ), - ) - ], - bases=(ShellSpec,), - ) - - shelly = ShellCommandTask( - name="shelly", - executable=cmd, - args=args, - input_spec=my_input_spec, - cache_dir=tmp_path, - ) - - res = results_function(shelly, plugin) - assert res.output.stdout == "" - out1 = res.output.out1.fspath - assert out1.exists() - # checking if the file is created in a good place - assert shelly.output_dir == out1.parent - assert out1.name == "newfile_tmp.txt" - - -@pytest.mark.parametrize("results_function", [result_no_submitter, result_submitter]) -def test_shell_cmd_inputspec_7a(plugin, results_function, tmp_path): - """ - providing output name using input_spec, - using name_tamplate in metadata - and changing the output name for output_spec using output_field_name - """ - cmd = "touch" - args = "newfile_tmp.txt" - - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "out1", - attr.ib( - type=str, - metadata={ - "output_file_template": "{args}", - "output_field_name": "out1_changed", - "help_string": "output file", - }, - ), - ) - ], - bases=(ShellSpec,), - ) - - shelly = ShellCommandTask( - name="shelly", - executable=cmd, - args=args, - input_spec=my_input_spec, - cache_dir=tmp_path, - ) - - res = results_function(shelly, plugin) - assert res.output.stdout == "" - # checking if the file is created in a good place - assert shelly.output_dir == res.output.out1_changed.fspath.parent - assert res.output.out1_changed.fspath.name == "newfile_tmp.txt" - - -@pytest.mark.parametrize("results_function", [result_no_submitter, result_submitter]) -def test_shell_cmd_inputspec_7b(plugin, results_function, tmp_path): - """ - providing new file and output name using input_spec, - using name_template in metadata - """ - cmd = "touch" - - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "newfile", - attr.ib( - type=str, - metadata={"position": 1, "help_string": "new file", "argstr": ""}, - ), - ), - ( - "out1", - attr.ib( - type=str, - metadata={ - "output_file_template": "{newfile}", - 
"help_string": "output file", - }, - ), - ), - ], - bases=(ShellSpec,), - ) - - shelly = ShellCommandTask( - name="shelly", - executable=cmd, - newfile="newfile_tmp.txt", - input_spec=my_input_spec, - cache_dir=tmp_path, - ) - - res = results_function(shelly, plugin) - assert res.output.stdout == "" - assert res.output.out1.fspath.exists() - - -@pytest.mark.parametrize("results_function", [result_no_submitter, result_submitter]) -def test_shell_cmd_inputspec_7c(plugin, results_function, tmp_path): - """ - providing output name using input_spec, - using name_tamplate with txt extension (extension from args should be removed - """ - cmd = "touch" - args = "newfile_tmp.txt" - - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "out1", - attr.ib( - type=str, - metadata={ - "output_file_template": "{args}.txt", - "help_string": "output file", - }, - ), - ) - ], - bases=(ShellSpec,), - ) - - shelly = ShellCommandTask( - name="shelly", - executable=cmd, - args=args, - input_spec=my_input_spec, - cache_dir=tmp_path, - ) - - res = results_function(shelly, plugin) - assert res.output.stdout == "" - # checking if the file is created in a good place - assert shelly.output_dir == res.output.out1.fspath.parent - assert res.output.out1.fspath.name == "newfile_tmp.txt" - - -@pytest.mark.parametrize("results_function", [result_no_submitter, result_submitter]) -def test_shell_cmd_inputspec_8(plugin, results_function, tmp_path): - """ - providing new file and output name using input_spec, - adding additional string input field with argstr - """ - cmd = "touch" - - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "newfile", - attr.ib( - type=str, - metadata={"position": 2, "help_string": "new file", "argstr": ""}, - ), - ), - ( - "time", - attr.ib( - type=str, - metadata={ - "position": 1, - "argstr": "-t", - "help_string": "time of modif.", - }, - ), - ), - ( - "out1", - attr.ib( - type=str, - metadata={ - "output_file_template": "{newfile}", - "help_string": "output file", - }, - ), - ), - ], - bases=(ShellSpec,), - ) - - shelly = ShellCommandTask( - name="shelly", - executable=cmd, - newfile="newfile_tmp.txt", - time="02121010", - input_spec=my_input_spec, - cache_dir=tmp_path, - ) - - res = results_function(shelly, plugin) - assert res.output.stdout == "" - assert res.output.out1.fspath.exists() - - -@pytest.mark.parametrize("results_function", [result_no_submitter, result_submitter]) -def test_shell_cmd_inputspec_8a(plugin, results_function, tmp_path): - """ - providing new file and output name using input_spec, - adding additional string input field with argstr (argstr uses string formatting) - """ - cmd = "touch" - - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "newfile", - attr.ib( - type=str, - metadata={"position": 2, "help_string": "new file", "argstr": ""}, - ), - ), - ( - "time", - attr.ib( - type=str, - metadata={ - "position": 1, - "argstr": "-t {time}", - "help_string": "time of modif.", - }, - ), - ), - ( - "out1", - attr.ib( - type=str, - metadata={ - "output_file_template": "{newfile}", - "help_string": "output file", - }, - ), - ), - ], - bases=(ShellSpec,), - ) - - shelly = ShellCommandTask( - name="shelly", - executable=cmd, - newfile="newfile_tmp.txt", - time="02121010", - input_spec=my_input_spec, - cache_dir=tmp_path, - ) - - res = results_function(shelly, plugin) - assert res.output.stdout == "" - assert res.output.out1.fspath.exists() - - -@pytest.mark.parametrize("results_function", [result_no_submitter, result_submitter]) -def 
test_shell_cmd_inputspec_9(tmp_path, plugin, results_function): - """ - providing output name using input_spec (output_file_template in metadata), - the template has a suffix, the extension of the file will be moved to the end - """ - cmd = "cp" - ddir = tmp_path / "data_inp" - ddir.mkdir() - file = ddir / ("file.txt") - file.write_text("content\n") - - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "file_orig", - attr.ib( - type=File, - metadata={"position": 2, "help_string": "new file", "argstr": ""}, - ), - ), - ( - "file_copy", - attr.ib( - type=str, - metadata={ - "output_file_template": "{file_orig}_copy", - "help_string": "output file", - "argstr": "", - }, - ), - ), - ], - bases=(ShellSpec,), - ) - - shelly = ShellCommandTask( - name="shelly", - executable=cmd, - input_spec=my_input_spec, - file_orig=file, - cache_dir=tmp_path, - ) - - res = results_function(shelly, plugin) - assert res.output.stdout == "" - assert res.output.file_copy.fspath.exists() - assert res.output.file_copy.fspath.name == "file_copy.txt" - # checking if it's created in a good place - assert shelly.output_dir == res.output.file_copy.fspath.parent - - -@pytest.mark.parametrize("results_function", [result_no_submitter]) -def test_shell_cmd_inputspec_9a(tmp_path, plugin, results_function): - """ - providing output name using input_spec (output_file_template in metadata), - the template has a suffix, the extension of the file will be moved to the end - the change: input file has directory with a dot - """ - cmd = "cp" - file = tmp_path / "data.inp" / "file.txt" - file.parent.mkdir() - file.write_text("content\n") - - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "file_orig", - attr.ib( - type=File, - metadata={"position": 2, "help_string": "new file", "argstr": ""}, - ), - ), - ( - "file_copy", - attr.ib( - type=str, - metadata={ - "output_file_template": "{file_orig}_copy", - "help_string": "output file", - "argstr": "", - }, - ), - ), - ], - bases=(ShellSpec,), - ) - - shelly = ShellCommandTask( - name="shelly", executable=cmd, input_spec=my_input_spec, file_orig=file - ) - - res = results_function(shelly, plugin) - assert res.output.stdout == "" - assert res.output.file_copy.fspath.exists() - assert res.output.file_copy.fspath.name == "file_copy.txt" - # checking if it's created in a good place - assert shelly.output_dir == res.output.file_copy.fspath.parent - - -@pytest.mark.parametrize("results_function", [result_no_submitter, result_submitter]) -def test_shell_cmd_inputspec_9b(tmp_path, plugin, results_function): - """ - providing output name using input_spec (output_file_template in metadata) - and the keep_extension is set to False, so the extension is removed completely. 
- """ - cmd = "cp" - file = tmp_path / "file.txt" - file.write_text("content\n") - - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "file_orig", - attr.ib( - type=File, - metadata={"position": 2, "help_string": "new file", "argstr": ""}, - ), - ), - ( - "file_copy", - attr.ib( - type=str, - metadata={ - "output_file_template": "{file_orig}_copy", - "keep_extension": False, - "help_string": "output file", - "argstr": "", - }, - ), - ), - ], - bases=(ShellSpec,), - ) - - shelly = ShellCommandTask( - name="shelly", - executable=cmd, - input_spec=my_input_spec, - file_orig=file, - cache_dir=tmp_path, - ) - - res = results_function(shelly, plugin) - assert res.output.stdout == "" - assert res.output.file_copy.fspath.exists() - assert res.output.file_copy.fspath.name == "file_copy" - - -@pytest.mark.parametrize("results_function", [result_no_submitter, result_submitter]) -def test_shell_cmd_inputspec_9c(tmp_path, plugin, results_function): - """ - providing output name using input_spec (output_file_template in metadata) - and the keep_extension is set to False, so the extension is removed completely, - no suffix in the template. - """ - cmd = "cp" - file = tmp_path / "file.txt" - file.write_text("content\n") - - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "file_orig", - attr.ib( - type=File, - metadata={"position": 2, "help_string": "new file", "argstr": ""}, - ), - ), - ( - "file_copy", - attr.ib( - type=str, - metadata={ - "output_file_template": "{file_orig}", - "keep_extension": False, - "help_string": "output file", - "argstr": "", - }, - ), - ), - ], - bases=(ShellSpec,), - ) - - shelly = ShellCommandTask( - name="shelly", - executable=cmd, - input_spec=my_input_spec, - file_orig=file, - cache_dir=tmp_path, - ) - - res = results_function(shelly, plugin) - assert res.output.stdout == "" - assert res.output.file_copy.fspath.exists() - assert res.output.file_copy.fspath.name == "file" - assert res.output.file_copy.fspath.parent == shelly.output_dir - - -@pytest.mark.parametrize("results_function", [result_no_submitter, result_submitter]) -def test_shell_cmd_inputspec_9d(tmp_path, plugin, results_function): - """ - providing output name explicitly by manually setting value in input_spec - (instead of using default provided byoutput_file_template in metadata) - """ - cmd = "cp" - ddir = tmp_path / "data_inp" - ddir.mkdir() - file = ddir / ("file.txt") - file.write_text("content\n") - - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "file_orig", - attr.ib( - type=File, - metadata={"position": 2, "help_string": "new file", "argstr": ""}, - ), - ), - ( - "file_copy", - attr.ib( - type=str, - metadata={ - "output_file_template": "{file_orig}_copy", - "help_string": "output file", - "argstr": "", - }, - ), - ), - ], - bases=(ShellSpec,), - ) - - shelly = ShellCommandTask( - name="shelly", - executable=cmd, - input_spec=my_input_spec, - file_orig=file, - file_copy="my_file_copy.txt", - cache_dir=tmp_path, - ) - - res = results_function(shelly, plugin) - assert res.output.stdout == "" - assert res.output.file_copy.fspath.exists() - assert res.output.file_copy.fspath.name == "my_file_copy.txt" - # checking if it's created in a good place - assert shelly.output_dir == res.output.file_copy.fspath.parent - - -@pytest.mark.parametrize("results_function", [result_no_submitter, result_submitter]) -def test_shell_cmd_inputspec_10(plugin, results_function, tmp_path): - """using input_spec, providing list of files as an input""" - - file_1 = tmp_path / "file_1.txt" - 
file_2 = tmp_path / "file_2.txt" - with open(file_1, "w") as f: - f.write("hello ") - with open(file_2, "w") as f: - f.write("from boston") - - cmd_exec = "cat" - files_list = [file_1, file_2] - - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "files", - attr.ib( - type=ty.List[File], - metadata={ - "position": 1, - "argstr": "...", - "sep": " ", - "help_string": "list of files", - "mandatory": True, - }, - ), - ) - ], - bases=(ShellSpec,), - ) - - shelly = ShellCommandTask( - name="shelly", - executable=cmd_exec, - files=files_list, - input_spec=my_input_spec, - cache_dir=tmp_path, - ) - - assert shelly.inputs.executable == cmd_exec - res = results_function(shelly, plugin) - assert res.output.stdout == "hello from boston" - - -def test_shell_cmd_inputspec_10_err(tmp_path): - """checking if the proper error is raised when broken symlink is provided - as a input field with File as a type - """ - - file_1 = tmp_path / "file_1.txt" - with open(file_1, "w") as f: - f.write("hello") - file_2 = tmp_path / "file_2.txt" - - # creating symlink and removing the original file - os.symlink(file_1, file_2) - os.remove(file_1) - - cmd_exec = "cat" - - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "files", - attr.ib( - type=File, - metadata={ - "position": 1, - "argstr": "", - "help_string": "a file", - "mandatory": True, - }, - ), - ) - ], - bases=(ShellSpec,), - ) - - with pytest.raises(FileNotFoundError): - shelly = ShellCommandTask( - name="shelly", executable=cmd_exec, files=file_2, input_spec=my_input_spec - ) - - -def test_shell_cmd_inputspec_11(tmp_path): - input_fields = [ - ( - "inputFiles", - attr.ib( - type=MultiInputObj[str], - metadata={ - "argstr": "...", - "help_string": "The list of input image files to be segmented.", - }, - ), - ) - ] - - output_fields = [ - ( - "outputFiles", - attr.ib( - type=MultiOutputFile, - metadata={ - "help_string": "Corrected Output Images: should specify the same number of images as inputVolume, if only one element is given, then it is used as a file pattern where %s is replaced by the imageVolumeType, and %d by the index list location.", - "output_file_template": "{inputFiles}", - }, - ), - ) - ] - - input_spec = SpecInfo(name="Input", fields=input_fields, bases=(ShellSpec,)) - output_spec = SpecInfo(name="Output", fields=output_fields, bases=(ShellOutSpec,)) - - task = ShellCommandTask( - name="echoMultiple", - executable="touch", - input_spec=input_spec, - output_spec=output_spec, - ) - - wf = Workflow(name="wf", input_spec=["inputFiles"], inputFiles=["test1", "test2"]) - - task.inputs.inputFiles = wf.lzin.inputFiles - - wf.add(task) - wf.set_output([("out", wf.echoMultiple.lzout.outputFiles)]) - - # XXX: Figure out why this fails with "cf". Occurs in CI when using Ubuntu + Python >= 3.10 - # (but not when using macOS + Python >= 3.10). 
Same error occurs in test_shell_cmd_outputspec_7a - # see https://github.com/nipype/pydra/issues/671 - with Submitter(plugin="serial") as sub: - sub(wf) - result = wf.result() - - for out_file in result.output.out: - assert out_file.fspath.name == "test1" or out_file.fspath.name == "test2" - - -@pytest.mark.parametrize("results_function", [result_no_submitter, result_submitter]) -def test_shell_cmd_inputspec_12(tmp_path: Path, plugin, results_function): - """ - providing output name using input_spec - output_file_template is provided as a function that returns - various templates depending on the values of inputs fields - """ - cmd = "cp" - ddir = tmp_path / "data_inp" - ddir.mkdir() - file = ddir / "file.txt" - file.write_text("content\n") - - def template_function(inputs): - if inputs.number % 2 == 0: - return "{file_orig}_even" - else: - return "{file_orig}_odd" - - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "file_orig", - attr.ib( - type=File, - metadata={"position": 2, "help_string": "new file", "argstr": ""}, - ), - ), - ( - "number", - attr.ib( - type=int, - metadata={"help_string": "a number", "mandatory": True}, - ), - ), - ( - "file_copy", - attr.ib( - type=str, - metadata={ - "output_file_template": template_function, - "help_string": "output file", - "argstr": "", - }, - ), - ), - ], - bases=(ShellSpec,), - ) - - shelly = ShellCommandTask( - name="shelly", - executable=cmd, - input_spec=my_input_spec, - file_orig=file, - number=2, - cache_dir=tmp_path, - ) - - res = results_function(shelly, plugin) - assert res.output.stdout == "" - fspath = res.output.file_copy.fspath - assert fspath.exists() - assert fspath.name == "file_even.txt" - # checking if it's created in a good place - assert shelly.output_dir == fspath.parent - - -def test_shell_cmd_inputspec_with_iterable(): - """Test formatting of argstr with different iterable types.""" - - input_spec = SpecInfo( - name="Input", - fields=[ - ( - "iterable_1", - ty.Iterable[int], - { - "help_string": "iterable input 1", - "argstr": "--in1", - }, - ), - ( - "iterable_2", - ty.Iterable[str], - { - "help_string": "iterable input 2", - "argstr": "--in2...", - }, - ), - ], - bases=(ShellSpec,), - ) - - task = ShellCommandTask(name="test", input_spec=input_spec, executable="test") - - for iterable_type in (list, tuple): - task.inputs.iterable_1 = iterable_type(range(3)) - task.inputs.iterable_2 = iterable_type(["bar", "foo"]) - assert task.cmdline == "test --in1 0 1 2 --in2 bar --in2 foo" - - -@pytest.mark.parametrize("results_function", [result_no_submitter, result_submitter]) -def test_shell_cmd_inputspec_copyfile_1(plugin, results_function, tmp_path): - """shelltask changes a file in place, - adding copyfile=True to the file-input from input_spec - hardlink or copy in the output_dir should be created - """ - file = tmp_path / "file_pydra.txt" - with open(file, "w") as f: - f.write("hello from pydra\n") - - cmd = ["sed", "-is", "s/hello/hi/"] - - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "orig_file", - attr.ib( - type=File, - metadata={ - "position": 1, - "argstr": "", - "help_string": "orig file", - "mandatory": True, - "copyfile": True, - }, - ), - ), - ( - "out_file", - attr.ib( - type=str, - metadata={ - "output_file_template": "{orig_file}", - "help_string": "output file", - }, - ), - ), - ], - bases=(ShellSpec,), - ) - - shelly = ShellCommandTask( - name="shelly", - executable=cmd, - input_spec=my_input_spec, - orig_file=str(file), - cache_dir=tmp_path, - ) - - res = results_function(shelly, 
plugin) - assert res.output.stdout == "" - assert res.output.out_file.fspath.exists() - # the file is copied, and than it is changed in place - assert res.output.out_file.fspath.parent == shelly.output_dir - with open(res.output.out_file) as f: - assert "hi from pydra\n" == f.read() - # the original file is unchanged - with open(file) as f: - assert "hello from pydra\n" == f.read() - - -@pytest.mark.parametrize("results_function", [result_no_submitter, result_submitter]) -def test_shell_cmd_inputspec_copyfile_1a(plugin, results_function, tmp_path): - """shelltask changes a file in place, - adding copyfile=False to the File-input from input_spec - hardlink or softlink in the output_dir is created - """ - file = tmp_path / "file_pydra.txt" - with open(file, "w") as f: - f.write("hello from pydra\n") - - cmd = ["sed", "-is", "s/hello/hi/"] - - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "orig_file", - attr.ib( - type=File, - metadata={ - "position": 1, - "argstr": "", - "help_string": "orig file", - "mandatory": True, - "copyfile": "hardlink", - }, - ), - ), - ( - "out_file", - attr.ib( - type=str, - metadata={ - "output_file_template": "{orig_file}", - "help_string": "output file", - }, - ), - ), - ], - bases=(ShellSpec,), - ) - - shelly = ShellCommandTask( - name="shelly", - executable=cmd, - input_spec=my_input_spec, - orig_file=str(file), - cache_dir=tmp_path, - ) - - res = results_function(shelly, plugin) - assert res.output.stdout == "" - assert res.output.out_file.fspath.exists() - # the file is uses a soft link, but it creates and an extra copy before modifying - assert res.output.out_file.fspath.parent == shelly.output_dir - - assert res.output.out_file.fspath.parent.joinpath( - res.output.out_file.fspath.name + "s" - ).exists() - with open(res.output.out_file) as f: - assert "hi from pydra\n" == f.read() - # the file is uses a soft link, but it creates and an extra copy - # it might depend on the OS - linked_file_copy = res.output.out_file.fspath.parent.joinpath( - res.output.out_file.fspath.name + "s" - ) - if linked_file_copy.exists(): - with open(linked_file_copy) as f: - assert "hello from pydra\n" == f.read() - - # the original file is unchanged - with open(file) as f: - assert "hello from pydra\n" == f.read() - - -@pytest.mark.xfail( - reason="not sure if we want to support input overwrite," - "if we allow for this orig_file is changing, so does checksum," - " and the results can't be found" -) -@pytest.mark.parametrize("results_function", [result_no_submitter, result_submitter]) -def test_shell_cmd_inputspec_copyfile_1b(plugin, results_function, tmp_path): - """shelltask changes a file in place, - copyfile is None for the file-input, so original filed is changed - """ - file = tmp_path / "file_pydra.txt" - with open(file, "w") as f: - f.write("hello from pydra\n") - - cmd = ["sed", "-is", "s/hello/hi/"] - - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "orig_file", - attr.ib( - type=File, - metadata={ - "position": 1, - "argstr": "", - "help_string": "orig file", - "mandatory": True, - }, - ), - ), - ( - "out_file", - attr.ib( - type=str, - metadata={ - "output_file_template": "{orig_file}", - "help_string": "output file", - }, - ), - ), - ], - bases=(ShellSpec,), - ) - - shelly = ShellCommandTask( - name="shelly", - executable=cmd, - input_spec=my_input_spec, - orig_file=str(file), - cache_dir=tmp_path, - ) - - res = results_function(shelly, plugin) - assert res.output.stdout == "" - assert res.output.out_file.fspath.exists() - # the file is not 
copied, it is changed in place - assert res.output.out_file == file - with open(res.output.out_file) as f: - assert "hi from pydra\n" == f.read() - - -@pytest.mark.parametrize("results_function", [result_no_submitter, result_submitter]) -def test_shell_cmd_inputspec_state_1(plugin, results_function, tmp_path): - """adding state to the input from input_spec""" - cmd_exec = "echo" - hello = ["HELLO", "hi"] - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "text", - attr.ib( - type=str, - metadata={ - "position": 1, - "help_string": "text", - "mandatory": True, - "argstr": "", - }, - ), - ) - ], - bases=(ShellSpec,), - ) - - # separate command into exec + args - shelly = ShellCommandTask( - name="shelly", - executable=cmd_exec, - input_spec=my_input_spec, - cache_dir=tmp_path, - ).split("text", text=hello) - assert shelly.inputs.executable == cmd_exec - # todo: this doesn't work when state - # assert shelly.cmdline == "echo HELLO" - res = results_function(shelly, plugin) - assert res[0].output.stdout == "HELLO\n" - assert res[1].output.stdout == "hi\n" - - -def test_shell_cmd_inputspec_typeval_1(): - """customized input_spec with a type that doesn't match the value - - raise an exception - """ - cmd_exec = "echo" - - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "text", - attr.ib( - type=int, - metadata={"position": 1, "argstr": "", "help_string": "text"}, - ), - ) - ], - bases=(ShellSpec,), - ) - - with pytest.raises(TypeError): - ShellCommandTask(executable=cmd_exec, text="hello", input_spec=my_input_spec) - - -def test_shell_cmd_inputspec_typeval_2(): - """customized input_spec (shorter syntax) with a type that doesn't match the value - - raise an exception - """ - cmd_exec = "echo" - - my_input_spec = SpecInfo( - name="Input", - fields=[("text", int, {"position": 1, "argstr": "", "help_string": "text"})], - bases=(ShellSpec,), - ) - - with pytest.raises(TypeError): - ShellCommandTask(executable=cmd_exec, text="hello", input_spec=my_input_spec) - - -@pytest.mark.parametrize("results_function", [result_no_submitter, result_submitter]) -def test_shell_cmd_inputspec_state_1a(plugin, results_function, tmp_path): - """adding state to the input from input_spec - using shorter syntax for input_spec (without default) - """ - cmd_exec = "echo" - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "text", - str, - {"position": 1, "help_string": "text", "mandatory": True, "argstr": ""}, - ) - ], - bases=(ShellSpec,), - ) - - # separate command into exec + args - shelly = ShellCommandTask( - name="shelly", - executable=cmd_exec, - input_spec=my_input_spec, - cache_dir=tmp_path, - ).split(text=["HELLO", "hi"]) - assert shelly.inputs.executable == cmd_exec - - res = results_function(shelly, plugin) - assert res[0].output.stdout == "HELLO\n" - assert res[1].output.stdout == "hi\n" - - -@pytest.mark.parametrize("results_function", [result_no_submitter, result_submitter]) -def test_shell_cmd_inputspec_state_2(plugin, results_function, tmp_path): - """ - adding splitter to input that is used in the output_file_tamplate - """ - cmd = "touch" - args = ["newfile_1.txt", "newfile_2.txt"] - - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "out1", - attr.ib( - type=str, - metadata={ - "output_file_template": "{args}", - "help_string": "output file", - }, - ), - ) - ], - bases=(ShellSpec,), - ) - - shelly = ShellCommandTask( - name="shelly", - executable=cmd, - input_spec=my_input_spec, - cache_dir=tmp_path, - ).split(args=args) - - res = results_function(shelly, plugin) 
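# NOTE (editor sketch): the *_state_* and *_typeval_* tests above revolve around two
# behaviours of the 0.x API -- .split() fans a task out over an input (one invocation
# and one output_dir per value), and a value that does not match the declared field
# type raises TypeError at construction time. Minimal illustration mirroring
# test_shell_cmd_inputspec_state_1/1a; `sketch_echo` is an assumed name:
from pydra.engine.specs import ShellSpec, SpecInfo
from pydra.engine.task import ShellCommandTask

sketch_echo = ShellCommandTask(
    name="sketch_echo",
    executable="echo",
    input_spec=SpecInfo(
        name="Input",
        fields=[
            (
                "text",
                str,
                {"position": 1, "argstr": "", "mandatory": True, "help_string": "text"},
            )
        ],
        bases=(ShellSpec,),
    ),
).split("text", text=["HELLO", "hi"])
# res = sketch_echo()  # res[0].output.stdout == "HELLO\n", res[1].output.stdout == "hi\n"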
- for i in range(len(args)): - assert res[i].output.stdout == "" - assert res[i].output.out1.fspath.exists() - assert res[i].output.out1.fspath.parent == shelly.output_dir[i] - - -@pytest.mark.parametrize("results_function", [result_no_submitter, result_submitter]) -def test_shell_cmd_inputspec_state_3(plugin, results_function, tmp_path): - """adding state to the File-input from input_spec""" - - file_1 = tmp_path / "file_pydra.txt" - file_2 = tmp_path / "file_nice.txt" - with open(file_1, "w") as f: - f.write("hello from pydra") - with open(file_2, "w") as f: - f.write("have a nice one") - - cmd_exec = "cat" - - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "file", - attr.ib( - type=File, - metadata={ - "position": 1, - "help_string": "files", - "mandatory": True, - "argstr": "", - }, - ), - ) - ], - bases=(ShellSpec,), - ) - - shelly = ShellCommandTask( - name="shelly", - executable=cmd_exec, - input_spec=my_input_spec, - cache_dir=tmp_path, - ).split(file=[file_1, file_2]) - - assert shelly.inputs.executable == cmd_exec - # todo: this doesn't work when state - # assert shelly.cmdline == "echo HELLO" - res = results_function(shelly, plugin) - assert res[0].output.stdout == "hello from pydra" - assert res[1].output.stdout == "have a nice one" - - -@pytest.mark.parametrize("results_function", [result_no_submitter, result_submitter]) -def test_shell_cmd_inputspec_copyfile_state_1(plugin, results_function, tmp_path): - """adding state to the File-input from input_spec""" - - file1 = tmp_path / "file1.txt" - with open(file1, "w") as f: - f.write("hello from pydra\n") - - file2 = tmp_path / "file2.txt" - with open(file2, "w") as f: - f.write("hello world\n") - - files = [str(file1), str(file2)] - cmd = ["sed", "-is", "s/hello/hi/"] - - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "orig_file", - attr.ib( - type=File, - metadata={ - "position": 1, - "argstr": "", - "help_string": "orig file", - "mandatory": True, - "copyfile": "copy", - }, - ), - ), - ( - "out_file", - attr.ib( - type=str, - metadata={ - "output_file_template": "{orig_file}", - "help_string": "output file", - }, - ), - ), - ], - bases=(ShellSpec,), - ) - - shelly = ShellCommandTask( - name="shelly", - executable=cmd, - input_spec=my_input_spec, - cache_dir=tmp_path, - ).split("orig_file", orig_file=files) - - txt_l = ["from pydra", "world"] - res_l = results_function(shelly, plugin) - for i, res in enumerate(res_l): - assert res.output.stdout == "" - assert res.output.out_file.fspath.exists() - # the file is copied, and than it is changed in place - assert res.output.out_file.fspath.parent == shelly.output_dir[i] - with open(res.output.out_file) as f: - assert f"hi {txt_l[i]}\n" == f.read() - # the original file is unchanged - with open(files[i]) as f: - assert f"hello {txt_l[i]}\n" == f.read() - - -# customised input_spec in Workflow - - -@pytest.mark.flaky(reruns=2) # when dask -def test_wf_shell_cmd_2(plugin_dask_opt, tmp_path): - """a workflow with input with defined output_file_template (str) - that requires wf.lzin - """ - wf = Workflow(name="wf", input_spec=["cmd", "args"]) - - wf.inputs.cmd = "touch" - wf.inputs.args = "newfile.txt" - wf.cache_dir = tmp_path - - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "out1", - attr.ib( - type=str, - metadata={ - "output_file_template": "{args}", - "help_string": "output file", - }, - ), - ) - ], - bases=(ShellSpec,), - ) - - wf.add( - ShellCommandTask( - name="shelly", - input_spec=my_input_spec, - executable=wf.lzin.cmd, - 
args=wf.lzin.args, - ) - ) - - wf.set_output([("out_f", wf.shelly.lzout.out1), ("out", wf.shelly.lzout.stdout)]) - - with Submitter(plugin=plugin_dask_opt) as sub: - wf(submitter=sub) - - res = wf.result() - assert res.output.out == "" - assert res.output.out_f.fspath.exists() - assert res.output.out_f.fspath.parent == wf.output_dir - - -def test_wf_shell_cmd_2a(plugin, tmp_path): - """a workflow with input with defined output_file_template (tuple) - that requires wf.lzin - """ - wf = Workflow(name="wf", input_spec=["cmd", "args"]) - - wf.inputs.cmd = "touch" - wf.inputs.args = "newfile.txt" - wf.cache_dir = tmp_path - - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "out1", - attr.ib( - type=str, - metadata={ - "output_file_template": "{args}", - "help_string": "output file", - }, - ), - ) - ], - bases=(ShellSpec,), - ) - - wf.add( - ShellCommandTask( - name="shelly", - input_spec=my_input_spec, - executable=wf.lzin.cmd, - args=wf.lzin.args, - ) - ) - - wf.set_output([("out_f", wf.shelly.lzout.out1), ("out", wf.shelly.lzout.stdout)]) - - with Submitter(plugin=plugin) as sub: - wf(submitter=sub) - - res = wf.result() - assert res.output.out == "" - assert res.output.out_f.fspath.exists() - - -def test_wf_shell_cmd_3(plugin, tmp_path): - """a workflow with 2 tasks, - first one has input with output_file_template (str, uses wf.lzin), - that is passed to the second task - """ - wf = Workflow(name="wf", input_spec=["cmd1", "cmd2", "args"]) - - wf.inputs.cmd1 = "touch" - wf.inputs.cmd2 = "cp" - wf.inputs.args = "newfile.txt" - wf.cache_dir = tmp_path - - my_input_spec1 = SpecInfo( - name="Input", - fields=[ - ( - "file", - attr.ib( - type=str, - metadata={ - "output_file_template": "{args}", - "help_string": "output file", - }, - ), - ) - ], - bases=(ShellSpec,), - ) - - my_input_spec2 = SpecInfo( - name="Input", - fields=[ - ( - "orig_file", - attr.ib( - type=File, - metadata={ - "position": 1, - "help_string": "output file", - "argstr": "", - }, - ), - ), - ( - "out_file", - attr.ib( - type=str, - metadata={ - "position": 2, - "argstr": "", - "output_file_template": "{orig_file}_copy", - "help_string": "output file", - }, - ), - ), - ], - bases=(ShellSpec,), - ) - - wf.add( - ShellCommandTask( - name="shelly1", - input_spec=my_input_spec1, - executable=wf.lzin.cmd1, - args=wf.lzin.args, - ) - ) - wf.add( - ShellCommandTask( - name="shelly2", - input_spec=my_input_spec2, - executable=wf.lzin.cmd2, - orig_file=wf.shelly1.lzout.file, - ) - ) - - wf.set_output( - [ - ("touch_file", wf.shelly1.lzout.file), - ("out1", wf.shelly1.lzout.stdout), - ("cp_file", wf.shelly2.lzout.out_file), - ("out2", wf.shelly2.lzout.stdout), - ] - ) - - with Submitter(plugin=plugin) as sub: - wf(submitter=sub) - - res = wf.result() - assert res.output.out1 == "" - assert res.output.touch_file.fspath.exists() - assert res.output.touch_file.fspath.parent == wf.output_dir - assert res.output.out2 == "" - assert res.output.cp_file.fspath.exists() - assert res.output.cp_file.fspath.parent == wf.output_dir - - -def test_wf_shell_cmd_3a(plugin, tmp_path): - """a workflow with 2 tasks, - first one has input with output_file_template (str, uses wf.lzin), - that is passed to the second task - """ - wf = Workflow(name="wf", input_spec=["cmd1", "cmd2", "args"]) - - wf.inputs.cmd1 = "touch" - wf.inputs.cmd2 = "cp" - wf.inputs.args = "newfile.txt" - wf.cache_dir = tmp_path - - my_input_spec1 = SpecInfo( - name="Input", - fields=[ - ( - "file", - attr.ib( - type=str, - metadata={ - "output_file_template": "{args}", - 
"help_string": "output file", - }, - ), - ) - ], - bases=(ShellSpec,), - ) - - my_input_spec2 = SpecInfo( - name="Input", - fields=[ - ( - "orig_file", - attr.ib( - type=str, - metadata={ - "position": 1, - "help_string": "output file", - "argstr": "", - }, - ), - ), - ( - "out_file", - attr.ib( - type=str, - metadata={ - "position": 2, - "argstr": "", - "output_file_template": "{orig_file}_cp", - "help_string": "output file", - }, - ), - ), - ], - bases=(ShellSpec,), - ) - - wf.add( - ShellCommandTask( - name="shelly1", - input_spec=my_input_spec1, - executable=wf.lzin.cmd1, - args=wf.lzin.args, - ) - ) - wf.add( - ShellCommandTask( - name="shelly2", - input_spec=my_input_spec2, - executable=wf.lzin.cmd2, - orig_file=wf.shelly1.lzout.file, - ) - ) - - wf.set_output( - [ - ("touch_file", wf.shelly1.lzout.file), - ("out1", wf.shelly1.lzout.stdout), - ("cp_file", wf.shelly2.lzout.out_file), - ("out2", wf.shelly2.lzout.stdout), - ] - ) - - with Submitter(plugin=plugin) as sub: - wf(submitter=sub) - - res = wf.result() - assert res.output.out1 == "" - assert res.output.touch_file.fspath.exists() - assert res.output.out2 == "" - assert res.output.cp_file.fspath.exists() - - -def test_wf_shell_cmd_state_1(plugin, tmp_path): - """a workflow with 2 tasks and splitter on the wf level, - first one has input with output_file_template (str, uses wf.lzin), - that is passed to the second task - """ - wf = Workflow( - name="wf", input_spec=["cmd1", "cmd2", "args"], cache_dir=tmp_path - ).split("args", args=["newfile_1.txt", "newfile_2.txt"]) - - wf.inputs.cmd1 = "touch" - wf.inputs.cmd2 = "cp" - - my_input_spec1 = SpecInfo( - name="Input", - fields=[ - ( - "file", - attr.ib( - type=str, - metadata={ - "output_file_template": "{args}", - "help_string": "output file", - }, - ), - ) - ], - bases=(ShellSpec,), - ) - - my_input_spec2 = SpecInfo( - name="Input", - fields=[ - ( - "orig_file", - attr.ib( - type=str, - metadata={ - "position": 1, - "help_string": "output file", - "argstr": "", - }, - ), - ), - ( - "out_file", - attr.ib( - type=str, - metadata={ - "position": 2, - "argstr": "", - "output_file_template": "{orig_file}_copy", - "help_string": "output file", - }, - ), - ), - ], - bases=(ShellSpec,), - ) - - wf.add( - ShellCommandTask( - name="shelly1", - input_spec=my_input_spec1, - executable=wf.lzin.cmd1, - args=wf.lzin.args, - ) - ) - wf.add( - ShellCommandTask( - name="shelly2", - input_spec=my_input_spec2, - executable=wf.lzin.cmd2, - orig_file=wf.shelly1.lzout.file, - ) - ) - - wf.set_output( - [ - ("touch_file", wf.shelly1.lzout.file), - ("out1", wf.shelly1.lzout.stdout), - ("cp_file", wf.shelly2.lzout.out_file), - ("out2", wf.shelly2.lzout.stdout), - ] - ) - - with Submitter(plugin=plugin) as sub: - wf(submitter=sub) - - res_l = wf.result() - for i, res in enumerate(res_l): - assert res.output.out1 == "" - assert res.output.touch_file.fspath.exists() - assert res.output.touch_file.fspath.parent == wf.output_dir[i] - assert res.output.out2 == "" - assert res.output.cp_file.fspath.exists() - assert res.output.cp_file.fspath.parent == wf.output_dir[i] - - -def test_wf_shell_cmd_ndst_1(plugin, tmp_path): - """a workflow with 2 tasks and a splitter on the node level, - first one has input with output_file_template (str, uses wf.lzin), - that is passed to the second task - """ - wf = Workflow(name="wf", input_spec=["cmd1", "cmd2", "args"]) - - wf.inputs.cmd1 = "touch" - wf.inputs.cmd2 = "cp" - wf.inputs.args = ["newfile_1.txt", "newfile_2.txt"] - wf.cache_dir = tmp_path - - my_input_spec1 = 
SpecInfo( - name="Input", - fields=[ - ( - "file", - attr.ib( - type=str, - metadata={ - "output_file_template": "{args}", - "help_string": "output file", - }, - ), - ) - ], - bases=(ShellSpec,), - ) - - my_input_spec2 = SpecInfo( - name="Input", - fields=[ - ( - "orig_file", - attr.ib( - type=str, - metadata={ - "position": 1, - "help_string": "output file", - "argstr": "", - }, - ), - ), - ( - "out_file", - attr.ib( - type=str, - metadata={ - "position": 2, - "argstr": "", - "output_file_template": "{orig_file}_copy", - "help_string": "output file", - }, - ), - ), - ], - bases=(ShellSpec,), - ) - - wf.add( - ShellCommandTask( - name="shelly1", - input_spec=my_input_spec1, - executable=wf.lzin.cmd1, - ).split("args", args=wf.lzin.args) - ) - wf.add( - ShellCommandTask( - name="shelly2", - input_spec=my_input_spec2, - executable=wf.lzin.cmd2, - orig_file=wf.shelly1.lzout.file, - ) - ) - - wf.set_output( - [ - ("touch_file", wf.shelly1.lzout.file), - ("out1", wf.shelly1.lzout.stdout), - ("cp_file", wf.shelly2.lzout.out_file), - ("out2", wf.shelly2.lzout.stdout), - ] - ) - - with Submitter(plugin=plugin) as sub: - wf(submitter=sub) - - res = wf.result() - assert res.output.out1 == ["", ""] - assert all([file.fspath.exists() for file in res.output.touch_file]) - assert res.output.out2 == ["", ""] - assert all([file.fspath.exists() for file in res.output.cp_file]) - - -# customised output spec - - -@pytest.mark.parametrize("results_function", [result_no_submitter, result_submitter]) -def test_shell_cmd_outputspec_1(plugin, results_function, tmp_path): - """ - customised output_spec, adding files to the output, providing specific pathname - """ - cmd = ["touch", "newfile_tmp.txt"] - my_output_spec = SpecInfo( - name="Output", - fields=[("newfile", File, "newfile_tmp.txt")], - bases=(ShellOutSpec,), - ) - shelly = ShellCommandTask( - name="shelly", executable=cmd, output_spec=my_output_spec, cache_dir=tmp_path - ) - - res = results_function(shelly, plugin) - assert res.output.stdout == "" - assert res.output.newfile.fspath.exists() - - -@pytest.mark.parametrize("results_function", [result_no_submitter, result_submitter]) -def test_shell_cmd_outputspec_1a(plugin, results_function, tmp_path): - """ - customised output_spec, adding files to the output, providing specific pathname - """ - cmd = ["touch", "newfile_tmp.txt"] - my_output_spec = SpecInfo( - name="Output", - fields=[("newfile", attr.ib(type=File, default="newfile_tmp.txt"))], - bases=(ShellOutSpec,), - ) - shelly = ShellCommandTask( - name="shelly", executable=cmd, output_spec=my_output_spec, cache_dir=tmp_path - ) - - res = results_function(shelly, plugin) - assert res.output.stdout == "" - assert res.output.newfile.fspath.exists() - - -def test_shell_cmd_outputspec_1b_exception(plugin, tmp_path): - """ - customised output_spec, adding files to the output, providing specific pathname - """ - cmd = ["touch", "newfile_tmp.txt"] - my_output_spec = SpecInfo( - name="Output", - fields=[("newfile", File, "newfile_tmp_.txt")], - bases=(ShellOutSpec,), - ) - shelly = ShellCommandTask( - name="shelly", executable=cmd, output_spec=my_output_spec, cache_dir=tmp_path - ) - - with pytest.raises(Exception) as exinfo: - with Submitter(plugin=plugin) as sub: - shelly(submitter=sub) - assert "does not exist" in str(exinfo.value) - - -@pytest.mark.parametrize("results_function", [result_no_submitter, result_submitter]) -def test_shell_cmd_outputspec_2(plugin, results_function, tmp_path): - """ - customised output_spec, adding files to the output, - using 
a wildcard in default - """ - cmd = ["touch", "newfile_tmp.txt"] - my_output_spec = SpecInfo( - name="Output", - fields=[("newfile", File, "newfile_*.txt")], - bases=(ShellOutSpec,), - ) - shelly = ShellCommandTask( - name="shelly", executable=cmd, output_spec=my_output_spec, cache_dir=tmp_path - ) - - res = results_function(shelly, plugin) - assert res.output.stdout == "" - assert res.output.newfile.fspath.exists() - - -def test_shell_cmd_outputspec_2a_exception(plugin, tmp_path): - """ - customised output_spec, adding files to the output, - using a wildcard in default - """ - cmd = ["touch", "newfile_tmp.txt"] - my_output_spec = SpecInfo( - name="Output", - fields=[("newfile", File, "newfile_*K.txt")], - bases=(ShellOutSpec,), - ) - shelly = ShellCommandTask( - name="shelly", executable=cmd, output_spec=my_output_spec, cache_dir=tmp_path - ) - - with pytest.raises(Exception) as excinfo: - with Submitter(plugin=plugin) as sub: - shelly(submitter=sub) - assert "no file matches" in str(excinfo.value) - - -@pytest.mark.parametrize("results_function", [result_no_submitter, result_submitter]) -def test_shell_cmd_outputspec_3(plugin, results_function, tmp_path): - """ - customised output_spec, adding files to the output, - using a wildcard in default, should collect two files - """ - cmd = ["touch", "newfile_tmp1.txt", "newfile_tmp2.txt"] - my_output_spec = SpecInfo( - name="Output", - fields=[("newfile", MultiOutputFile, "newfile_*.txt")], - bases=(ShellOutSpec,), - ) - shelly = ShellCommandTask( - name="shelly", executable=cmd, output_spec=my_output_spec, cache_dir=tmp_path - ) - - res = results_function(shelly, plugin) - assert res.output.stdout == "" - # newfile is a list - assert len(res.output.newfile) == 2 - assert all([file.fspath.exists() for file in res.output.newfile]) - - -@pytest.mark.parametrize("results_function", [result_no_submitter, result_submitter]) -def test_shell_cmd_outputspec_5(plugin, results_function, tmp_path): - """ - customised output_spec, adding files to the output, - using a function to collect output, the function is saved in the field metadata - and uses output_dir and the glob function - """ - cmd = ["touch", "newfile_tmp1.txt", "newfile_tmp2.txt"] - - def gather_output(field, output_dir): - if field.name == "newfile": - return list(Path(output_dir).expanduser().glob("newfile*.txt")) - - my_output_spec = SpecInfo( - name="Output", - fields=[ - ( - "newfile", - attr.ib(type=MultiOutputFile, metadata={"callable": gather_output}), - ) - ], - bases=(ShellOutSpec,), - ) - shelly = ShellCommandTask( - name="shelly", executable=cmd, output_spec=my_output_spec, cache_dir=tmp_path - ) - - res = results_function(shelly, plugin) - assert res.output.stdout == "" - # newfile is a list - assert len(res.output.newfile) == 2 - assert all([file.fspath.exists() for file in res.output.newfile]) - assert ( - shelly.output_names - == shelly.generated_output_names - == ["return_code", "stdout", "stderr", "newfile"] - ) - - -@pytest.mark.parametrize("results_function", [result_no_submitter, result_submitter]) -def test_shell_cmd_outputspec_5a(plugin, results_function, tmp_path): - """ - customised output_spec, adding files to the output, - using a function to collect output, the function is saved in the field metadata - and uses output_dir and inputs element - """ - cmd = ["touch", "newfile_tmp1.txt", "newfile_tmp2.txt"] - - def gather_output(executable, output_dir): - files = executable[1:] - return [Path(output_dir) / file for file in files] - - my_output_spec = SpecInfo( - 
name="Output", - fields=[ - ( - "newfile", - attr.ib(type=MultiOutputFile, metadata={"callable": gather_output}), - ) - ], - bases=(ShellOutSpec,), - ) - shelly = ShellCommandTask( - name="shelly", executable=cmd, output_spec=my_output_spec, cache_dir=tmp_path - ) - - res = results_function(shelly, plugin) - assert res.output.stdout == "" - # newfile is a list - assert len(res.output.newfile) == 2 - assert all([file.fspath.exists() for file in res.output.newfile]) - - -def test_shell_cmd_outputspec_5b_error(): - """ - customised output_spec, adding files to the output, - using a function to collect output, the function is saved in the field metadata - with an argument that is not part of the inputs - error is raised - """ - cmd = ["touch", "newfile_tmp1.txt", "newfile_tmp2.txt"] - - def gather_output(executable, output_dir, ble): - files = executable[1:] - return [Path(output_dir) / file for file in files] - - my_output_spec = SpecInfo( - name="Output", - fields=[("newfile", attr.ib(type=File, metadata={"callable": gather_output}))], - bases=(ShellOutSpec,), - ) - shelly = ShellCommandTask(name="shelly", executable=cmd, output_spec=my_output_spec) - with pytest.raises(AttributeError, match="ble"): - shelly() - - -@pytest.mark.parametrize("results_function", [result_no_submitter, result_submitter]) -def test_shell_cmd_outputspec_5c(plugin, results_function, tmp_path): - """ - Customised output spec defined as a class, - using a static function to collect output files. - """ - - @attr.s(kw_only=True) - class MyOutputSpec(ShellOutSpec): - @staticmethod - def gather_output(executable, output_dir): - files = executable[1:] - return [Path(output_dir) / file for file in files] - - newfile: MultiOutputFile = attr.ib(metadata={"callable": gather_output}) - - shelly = ShellCommandTask( - name="shelly", - executable=["touch", "newfile_tmp1.txt", "newfile_tmp2.txt"], - output_spec=SpecInfo(name="Output", bases=(MyOutputSpec,)), - cache_dir=tmp_path, - ) - - res = results_function(shelly, plugin) - assert res.output.stdout == "" - # newfile is a list - assert len(res.output.newfile) == 2 - assert all([file.exists() for file in res.output.newfile]) - - -@pytest.mark.parametrize("results_function", [result_no_submitter, result_submitter]) -def test_shell_cmd_outputspec_6(plugin, results_function, tmp_path): - """ - providing output name by providing output_file_template - (similar to the previous example, but not touching input_spec) - """ - cmd = "touch" - args = "newfile_tmp.txt" - - my_output_spec = SpecInfo( - name="Output", - fields=[ - ( - "out1", - attr.ib( - type=File, - metadata={ - "output_file_template": "{args}", - "help_string": "output file", - }, - ), - ) - ], - bases=(ShellOutSpec,), - ) - - shelly = ShellCommandTask( - name="shelly", - executable=cmd, - args=args, - output_spec=my_output_spec, - cache_dir=tmp_path, - ) - - res = results_function(shelly, plugin) - assert res.output.stdout == "" - assert res.output.out1.fspath.exists() - - -def test_shell_cmd_outputspec_6a(): - """ - providing output name by providing output_file_template - (using shorter syntax) - """ - cmd = "touch" - args = "newfile_tmp.txt" - - my_output_spec = SpecInfo( - name="Output", - fields=[ - ( - "out1", - File, - {"output_file_template": "{args}", "help_string": "output file"}, - ) - ], - bases=(ShellOutSpec,), - ) - - shelly = ShellCommandTask( - name="shelly", executable=cmd, args=args, output_spec=my_output_spec - ) - - res = shelly() - assert res.output.stdout == "" - assert res.output.out1.fspath.exists() 
- - -@pytest.mark.parametrize("results_function", [result_no_submitter, result_submitter]) -def test_shell_cmd_outputspec_7(tmp_path, plugin, results_function): - """ - providing output with output_file_name and using MultiOutputFile as a type. - the input field used in the template is a MultiInputObj, so it can be and is a list - """ - file = tmp_path / "script.sh" - file.write_text('for var in "$@"; do touch file"$var".txt; done') - - cmd = "bash" - new_files_id = ["1", "2", "3"] - - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "script", - attr.ib( - type=File, - metadata={ - "help_string": "script file", - "mandatory": True, - "position": 1, - "argstr": "", - }, - ), - ), - ( - "files_id", - attr.ib( - type=MultiInputObj, - metadata={ - "position": 2, - "argstr": "...", - "sep": " ", - "help_string": "list of name indices", - "mandatory": True, - }, - ), - ), - ], - bases=(ShellSpec,), - ) - - my_output_spec = SpecInfo( - name="Output", - fields=[ - ( - "new_files", - attr.ib( - type=MultiOutputFile, - metadata={ - "output_file_template": "file{files_id}.txt", - "help_string": "output file", - }, - ), - ) - ], - bases=(ShellOutSpec,), - ) - - shelly = ShellCommandTask( - name="shelly", - executable=cmd, - input_spec=my_input_spec, - output_spec=my_output_spec, - script=file, - files_id=new_files_id, - ) - - res = results_function(shelly, "serial") - assert res.output.stdout == "" - for file in res.output.new_files: - assert file.fspath.exists() - - -@pytest.mark.parametrize("results_function", [result_no_submitter, result_submitter]) -def test_shell_cmd_outputspec_7a(tmp_path, plugin, results_function): - """ - providing output with output_file_name and using MultiOutputFile as a type. - the input field used in the template is a MultiInputObj, but a single element is used - """ - file = tmp_path / "script.sh" - file.write_text('for var in "$@"; do touch file"$var".txt; done') - - cmd = "bash" - new_files_id = "1" - - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "script", - attr.ib( - type=File, - metadata={ - "help_string": "script file", - "mandatory": True, - "position": 1, - "argstr": "", - }, - ), - ), - ( - "files_id", - attr.ib( - type=MultiInputObj, - metadata={ - "position": 2, - "argstr": "...", - "sep": " ", - "help_string": "list of name indices", - "mandatory": True, - }, - ), - ), - ], - bases=(ShellSpec,), - ) - - my_output_spec = SpecInfo( - name="Output", - fields=[ - ( - "new_files", - attr.ib( - type=MultiOutputFile, - metadata={ - "output_file_template": "file{files_id}.txt", - "help_string": "output file", - }, - ), - ) - ], - bases=(ShellOutSpec,), - ) - - shelly = ShellCommandTask( - name="shelly", - executable=cmd, - input_spec=my_input_spec, - output_spec=my_output_spec, - script=file, - files_id=new_files_id, - ) - - # XXX: Figure out why this fails with "cf". Occurs in CI when using Ubuntu + Python >= 3.10 - # (but not when using macOS + Python >= 3.10). 
Same error occurs in test_shell_cmd_inputspec_11 - # see https://github.com/nipype/pydra/issues/671 - res = results_function(shelly, "serial") - assert res.output.stdout == "" - assert res.output.new_files.fspath.exists() - - -@pytest.mark.parametrize("results_function", [result_no_submitter, result_submitter]) -def test_shell_cmd_outputspec_8a(tmp_path, plugin, results_function): - """ - customised output_spec, adding int and str to the output, - requiring two callables with parameters stdout and stderr - """ - cmd = "echo" - args = ["newfile_1.txt", "newfile_2.txt"] - - def get_file_index(stdout): - stdout = re.sub(r".*_", "", stdout) - stdout = re.sub(r".txt", "", stdout) - print(stdout) - return int(stdout) - - def get_stderr(stderr): - return f"stderr: {stderr}" - - my_output_spec = SpecInfo( - name="Output", - fields=[ - ( - "out1", - attr.ib( - type=File, - metadata={ - "output_file_template": "{args}", - "help_string": "output file", - }, - ), - ), - ( - "out_file_index", - attr.ib( - type=int, - metadata={"help_string": "output file", "callable": get_file_index}, - ), - ), - ( - "stderr_field", - attr.ib( - type=str, - metadata={ - "help_string": "The standard error output", - "callable": get_stderr, - }, - ), - ), - ], - bases=(ShellOutSpec,), - ) - - shelly = ShellCommandTask( - name="shelly", executable=cmd, output_spec=my_output_spec, cache_dir=tmp_path - ).split("args", args=args) - - results = results_function(shelly, plugin) - for index, res in enumerate(results): - assert res.output.out_file_index == index + 1 - assert res.output.stderr_field == f"stderr: {res.output.stderr}" - - -def test_shell_cmd_outputspec_8b_error(): - """ - customised output_spec, adding Int to the output, - requiring a function to collect output - """ - cmd = "echo" - args = ["newfile_1.txt", "newfile_2.txt"] - - my_output_spec = SpecInfo( - name="Output", - fields=[ - ( - "out", - attr.ib( - type=int, metadata={"help_string": "output file", "value": "val"} - ), - ) - ], - bases=(ShellOutSpec,), - ) - shelly = ShellCommandTask( - name="shelly", executable=cmd, output_spec=my_output_spec - ).split("args", args=args) - with pytest.raises(Exception) as e: - shelly() - assert "has to have a callable" in str(e.value) - - -@pytest.mark.parametrize("results_function", [result_no_submitter, result_submitter]) -def test_shell_cmd_outputspec_8c(tmp_path, plugin, results_function): - """ - customised output_spec, adding Directory to the output named by args - """ - - def get_lowest_directory(directory_path): - return str(directory_path).replace(str(Path(directory_path).parents[0]), "") - - cmd = "mkdir" - args = [f"{tmp_path}/dir1", f"{tmp_path}/dir2"] - - my_output_spec = SpecInfo( - name="Output", - fields=[ - ( - "resultsDir", - attr.ib( - type=Directory, - metadata={ - "output_file_template": "{args}", - "help_string": "output file", - }, - ), - ) - ], - bases=(ShellOutSpec,), - ) - - shelly = ShellCommandTask( - name="shelly", - executable=cmd, - output_spec=my_output_spec, - resultsDir="outdir", - cache_dir=tmp_path, - ).split("args", args=args) - - results_function(shelly, plugin) - for index, arg_dir in enumerate(args): - assert Path(Path(tmp_path) / Path(arg_dir)).exists() - assert get_lowest_directory(arg_dir) == f"/dir{index+1}" - - -@pytest.mark.parametrize("results_function", [result_no_submitter, result_submitter]) -def test_shell_cmd_outputspec_8d(tmp_path, plugin, results_function): - """ - customised output_spec, adding Directory to the output named by input spec - """ - - # For 
/tmp/some_dict/test this function returns "/test" - def get_lowest_directory(directory_path): - return str(directory_path).replace(str(Path(directory_path).parents[0]), "") - - cmd = "mkdir" - - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "resultsDir", - attr.ib( - type=str, - metadata={ - "position": 1, - "help_string": "new directory", - "argstr": "", - }, - ), - ) - ], - bases=(ShellSpec,), - ) - - my_output_spec = SpecInfo( - name="Output", - fields=[ - ( - "resultsDir", - attr.ib( - type=Directory, - metadata={ - "output_file_template": "{resultsDir}", - "help_string": "output file", - }, - ), - ) - ], - bases=(ShellOutSpec,), - ) - - shelly = ShellCommandTask( - name=cmd, - executable=cmd, - input_spec=my_input_spec, - output_spec=my_output_spec, - cache_dir=tmp_path, - resultsDir="test", # Path(tmp_path) / "test" TODO: Not working without absolute path support - ) - assert ( - shelly.output_names - == shelly.generated_output_names - == ["return_code", "stdout", "stderr", "resultsDir"] - ) - res = results_function(shelly, plugin) - print("Cache_dirr:", shelly.cache_dir) - assert (shelly.output_dir / Path("test")).exists() - assert get_lowest_directory(res.output.resultsDir) == get_lowest_directory( - shelly.output_dir / Path("test") - ) - - -@pytest.mark.parametrize("results_function", [result_no_submitter, result_submitter]) -def test_shell_cmd_state_outputspec_1(plugin, results_function, tmp_path): - """ - providing output name by providing output_file_template - splitter for a field that is used in the template - """ - cmd = "touch" - args = ["newfile_1.txt", "newfile_2.txt"] - - my_output_spec = SpecInfo( - name="Output", - fields=[ - ( - "out1", - attr.ib( - type=File, - metadata={ - "output_file_template": "{args}", - "help_string": "output file", - }, - ), - ) - ], - bases=(ShellOutSpec,), - ) - - shelly = ShellCommandTask( - name="shelly", - executable=cmd, - output_spec=my_output_spec, - cache_dir=tmp_path, - ).split("args", args=args) - - res = results_function(shelly, plugin) - for i in range(len(args)): - assert res[i].output.stdout == "" - assert res[i].output.out1.fspath.exists() - - -# customised output_spec for tasks in workflows - - -def test_shell_cmd_outputspec_wf_1(plugin, tmp_path): - """ - customised output_spec for tasks within a Workflow, - adding files to the output, providing specific pathname - """ - - cmd = ["touch", "newfile_tmp.txt"] - wf = Workflow(name="wf", input_spec=["cmd"]) - wf.inputs.cmd = cmd - wf.cache_dir = tmp_path - - my_output_spec = SpecInfo( - name="Output", - fields=[("newfile", File, "newfile_tmp.txt")], - bases=(ShellOutSpec,), - ) - wf.add( - ShellCommandTask( - name="shelly", executable=wf.lzin.cmd, output_spec=my_output_spec - ) - ) - wf.set_output( - [("stdout", wf.shelly.lzout.stdout), ("newfile", wf.shelly.lzout.newfile)] - ) - - with Submitter(plugin=plugin) as sub: - wf(submitter=sub) - - res = wf.result() - assert res.output.stdout == "" - assert res.output.newfile.fspath.exists() - # checking if the file was copied to the wf dir - assert res.output.newfile.fspath.parent == wf.output_dir - - -def test_shell_cmd_inputspec_outputspec_1(): - """ - customised input_spec and output_spec, output_spec uses input_spec fields in templates - """ - cmd = ["touch", "newfile_tmp.txt"] - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "file1", - str, - {"help_string": "1st creadted file", "argstr": "", "position": 1}, - ), - ( - "file2", - str, - {"help_string": "2nd creadted file", "argstr": "", "position": 2}, - 
), - ], - bases=(ShellSpec,), - ) - - my_output_spec = SpecInfo( - name="Output", - fields=[ - ( - "newfile1", - File, - {"output_file_template": "{file1}", "help_string": "newfile 1"}, - ), - ( - "newfile2", - File, - {"output_file_template": "{file2}", "help_string": "newfile 2"}, - ), - ], - bases=(ShellOutSpec,), - ) - shelly = ShellCommandTask( - name="shelly", - executable=cmd, - input_spec=my_input_spec, - output_spec=my_output_spec, - ) - shelly.inputs.file1 = "new_file_1.txt" - shelly.inputs.file2 = "new_file_2.txt" - - res = shelly() - assert res.output.stdout == "" - assert res.output.newfile1.fspath.exists() - assert res.output.newfile2.fspath.exists() - - -def test_shell_cmd_inputspec_outputspec_1a(): - """ - customised input_spec and output_spec, output_spec uses input_spec fields in templates, - file2 is used in a template for newfile2, but it is not provided, so newfile2 is set to NOTHING - """ - cmd = ["touch", "newfile_tmp.txt"] - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "file1", - str, - {"help_string": "1st creadted file", "argstr": "", "position": 1}, - ), - ( - "file2", - str, - {"help_string": "2nd creadted file", "argstr": "", "position": 2}, - ), - ], - bases=(ShellSpec,), - ) - - my_output_spec = SpecInfo( - name="Output", - fields=[ - ( - "newfile1", - File, - {"output_file_template": "{file1}", "help_string": "newfile 1"}, - ), - ( - "newfile2", - File, - {"output_file_template": "{file2}", "help_string": "newfile 2"}, - ), - ], - bases=(ShellOutSpec,), - ) - shelly = ShellCommandTask( - name="shelly", - executable=cmd, - input_spec=my_input_spec, - output_spec=my_output_spec, - ) - shelly.inputs.file1 = "new_file_1.txt" - - res = shelly() - assert res.output.stdout == "" - assert res.output.newfile1.fspath.exists() - # newfile2 is not created, since file2 is not provided - assert res.output.newfile2 is attr.NOTHING - - -def test_shell_cmd_inputspec_outputspec_2(): - """ - customised input_spec and output_spec, output_spec uses input_spec fields in the requires filed - """ - cmd = ["touch", "newfile_tmp.txt"] - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "file1", - str, - {"help_string": "1st creadted file", "argstr": "", "position": 1}, - ), - ( - "file2", - str, - {"help_string": "2nd creadted file", "argstr": "", "position": 2}, - ), - ], - bases=(ShellSpec,), - ) - - my_output_spec = SpecInfo( - name="Output", - fields=[ - ( - "newfile1", - File, - { - "output_file_template": "{file1}", - "help_string": "newfile 1", - "requires": ["file1"], - }, - ), - ( - "newfile2", - File, - { - "output_file_template": "{file2}", - "help_string": "newfile 1", - "requires": ["file1", "file2"], - }, - ), - ], - bases=(ShellOutSpec,), - ) - shelly = ShellCommandTask( - name="shelly", - executable=cmd, - input_spec=my_input_spec, - output_spec=my_output_spec, - ) - shelly.inputs.file1 = "new_file_1.txt" - shelly.inputs.file2 = "new_file_2.txt" - # all fields from output_spec should be in output_names and generated_output_names - assert ( - shelly.output_names - == shelly.generated_output_names - == ["return_code", "stdout", "stderr", "newfile1", "newfile2"] - ) - - res = shelly() - assert res.output.stdout == "" - assert res.output.newfile1.fspath.exists() - assert res.output.newfile2.fspath.exists() - - -def test_shell_cmd_inputspec_outputspec_2a(): - """ - customised input_spec and output_spec, output_spec uses input_spec fields in the requires filed - """ - cmd = ["touch", "newfile_tmp.txt"] - my_input_spec = SpecInfo( - name="Input", - 
fields=[ - ( - "file1", - str, - {"help_string": "1st creadted file", "argstr": "", "position": 1}, - ), - ( - "file2", - str, - {"help_string": "2nd creadted file", "argstr": "", "position": 2}, - ), - ], - bases=(ShellSpec,), - ) - - my_output_spec = SpecInfo( - name="Output", - fields=[ - ( - "newfile1", - File, - { - "output_file_template": "{file1}", - "help_string": "newfile 1", - "requires": ["file1"], - }, - ), - ( - "newfile2", - File, - { - "output_file_template": "{file2}", - "help_string": "newfile 1", - "requires": ["file1", "file2"], - }, - ), - ], - bases=(ShellOutSpec,), - ) - shelly = ShellCommandTask( - name="shelly", - executable=cmd, - input_spec=my_input_spec, - output_spec=my_output_spec, - ) - shelly.inputs.file1 = "new_file_1.txt" - # generated_output_names should know that newfile2 will not be generated - assert shelly.output_names == [ - "return_code", - "stdout", - "stderr", - "newfile1", - "newfile2", - ] - assert shelly.generated_output_names == [ - "return_code", - "stdout", - "stderr", - "newfile1", - ] - - res = shelly() - assert res.output.stdout == "" - assert res.output.newfile1.fspath.exists() - assert res.output.newfile2 is attr.NOTHING - - -def test_shell_cmd_inputspec_outputspec_3(): - """ - customised input_spec and output_spec, output_spec uses input_spec fields in the requires filed - adding one additional input that is not in the template, but in the requires field, - """ - cmd = ["touch", "newfile_tmp.txt"] - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "file1", - str, - {"help_string": "1st creadted file", "argstr": "", "position": 1}, - ), - ( - "file2", - str, - {"help_string": "2nd creadted file", "argstr": "", "position": 2}, - ), - ("additional_inp", int, {"help_string": "additional inp"}), - ], - bases=(ShellSpec,), - ) - - my_output_spec = SpecInfo( - name="Output", - fields=[ - ( - "newfile1", - File, - {"output_file_template": "{file1}", "help_string": "newfile 1"}, - ), - ( - "newfile2", - File, - { - "output_file_template": "{file2}", - "help_string": "newfile 1", - "requires": ["file1", "additional_inp"], - }, - ), - ], - bases=(ShellOutSpec,), - ) - shelly = ShellCommandTask( - name="shelly", - executable=cmd, - input_spec=my_input_spec, - output_spec=my_output_spec, - ) - shelly.inputs.file1 = "new_file_1.txt" - shelly.inputs.file2 = "new_file_2.txt" - shelly.inputs.additional_inp = 2 - - res = shelly() - assert res.output.stdout == "" - assert res.output.newfile1.fspath.exists() - assert res.output.newfile2.fspath.exists() - - -def test_shell_cmd_inputspec_outputspec_3a(): - """ - customised input_spec and output_spec, output_spec uses input_spec fields in the requires filed - adding one additional input that is not in the template, but in the requires field, - the additional input not provided, so the output is NOTHING - """ - cmd = ["touch", "newfile_tmp.txt"] - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "file1", - str, - {"help_string": "1st creadted file", "argstr": "", "position": 1}, - ), - ( - "file2", - str, - {"help_string": "2nd creadted file", "argstr": "", "position": 2}, - ), - ("additional_inp", str, {"help_string": "additional inp"}), - ], - bases=(ShellSpec,), - ) - - my_output_spec = SpecInfo( - name="Output", - fields=[ - ( - "newfile1", - File, - {"output_file_template": "{file1}", "help_string": "newfile 1"}, - ), - ( - "newfile2", - File, - { - "output_file_template": "{file2}", - "help_string": "newfile 1", - "requires": ["file1", "additional_inp"], - }, - ), - ], - 
bases=(ShellOutSpec,), - ) - shelly = ShellCommandTask( - name="shelly", - executable=cmd, - input_spec=my_input_spec, - output_spec=my_output_spec, - ) - shelly.inputs.file1 = "new_file_1.txt" - shelly.inputs.file2 = "new_file_2.txt" - # generated_output_names should know that newfile2 will not be generated - assert shelly.output_names == [ - "return_code", - "stdout", - "stderr", - "newfile1", - "newfile2", - ] - assert shelly.generated_output_names == [ - "return_code", - "stdout", - "stderr", - "newfile1", - ] - - res = shelly() - assert res.output.stdout == "" - assert res.output.newfile1.fspath.exists() - # additional input not provided so no newfile2 set (even if the file was created) - assert res.output.newfile2 is attr.NOTHING - - -def test_shell_cmd_inputspec_outputspec_4(): - """ - customised input_spec and output_spec, output_spec uses input_spec fields in the requires filed - adding one additional input to the requires together with a list of the allowed values, - """ - cmd = ["touch", "newfile_tmp.txt"] - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "file1", - str, - {"help_string": "1st creadted file", "argstr": "", "position": 1}, - ), - ("additional_inp", int, {"help_string": "additional inp"}), - ], - bases=(ShellSpec,), - ) - - my_output_spec = SpecInfo( - name="Output", - fields=[ - ( - "newfile1", - File, - { - "output_file_template": "{file1}", - "help_string": "newfile 1", - "requires": ["file1", ("additional_inp", [2, 3])], - }, - ) - ], - bases=(ShellOutSpec,), - ) - shelly = ShellCommandTask( - name="shelly", - executable=cmd, - input_spec=my_input_spec, - output_spec=my_output_spec, - ) - shelly.inputs.file1 = "new_file_1.txt" - shelly.inputs.additional_inp = 2 - # generated_output_names should be the same as output_names - assert ( - shelly.output_names - == shelly.generated_output_names - == ["return_code", "stdout", "stderr", "newfile1"] - ) - - res = shelly() - assert res.output.stdout == "" - assert res.output.newfile1.fspath.exists() - - -def test_shell_cmd_inputspec_outputspec_4a(): - """ - customised input_spec and output_spec, output_spec uses input_spec fields in the requires filed - adding one additional input to the requires together with a list of the allowed values, - the input is set to a value that is not in the list, so output is NOTHING - """ - cmd = ["touch", "newfile_tmp.txt"] - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "file1", - str, - {"help_string": "1st creadted file", "argstr": "", "position": 1}, - ), - ("additional_inp", int, {"help_string": "additional inp"}), - ], - bases=(ShellSpec,), - ) - - my_output_spec = SpecInfo( - name="Output", - fields=[ - ( - "newfile1", - File, - { - "output_file_template": "{file1}", - "help_string": "newfile 1", - "requires": ["file1", ("additional_inp", [2, 3])], - }, - ) - ], - bases=(ShellOutSpec,), - ) - shelly = ShellCommandTask( - name="shelly", - executable=cmd, - input_spec=my_input_spec, - output_spec=my_output_spec, - ) - shelly.inputs.file1 = "new_file_1.txt" - # the value is not in the list from requires - shelly.inputs.additional_inp = 1 - - res = shelly() - assert res.output.stdout == "" - assert res.output.newfile1 is attr.NOTHING - - -def test_shell_cmd_inputspec_outputspec_5(): - """ - customised input_spec and output_spec, output_spec uses input_spec fields in the requires - requires is a list of list so it is treated as OR list (i.e. el[0] OR el[1] OR...) 
- the firs element of the requires list has all the fields set - """ - cmd = ["touch", "newfile_tmp.txt"] - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "file1", - str, - {"help_string": "1st creadted file", "argstr": "", "position": 1}, - ), - ("additional_inp_A", int, {"help_string": "additional inp A"}), - ("additional_inp_B", str, {"help_string": "additional inp B"}), - ], - bases=(ShellSpec,), - ) - - my_output_spec = SpecInfo( - name="Output", - fields=[ - ( - "newfile1", - File, - { - "output_file_template": "{file1}", - "help_string": "newfile 1", - # requires is a list of list so it's treated as el[0] OR el[1] OR... - "requires": [ - ["file1", "additional_inp_A"], - ["file1", "additional_inp_B"], - ], - }, - ) - ], - bases=(ShellOutSpec,), - ) - shelly = ShellCommandTask( - name="shelly", - executable=cmd, - input_spec=my_input_spec, - output_spec=my_output_spec, - ) - shelly.inputs.file1 = "new_file_1.txt" - shelly.inputs.additional_inp_A = 2 - - res = shelly() - assert res.output.stdout == "" - assert res.output.newfile1.fspath.exists() - - -def test_shell_cmd_inputspec_outputspec_5a(): - """ - customised input_spec and output_spec, output_spec uses input_spec fields in the requires - requires is a list of list so it is treated as OR list (i.e. el[0] OR el[1] OR...) - the second element of the requires list (i.e. additional_inp_B) has all the fields set - """ - cmd = ["touch", "newfile_tmp.txt"] - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "file1", - str, - {"help_string": "1st creadted file", "argstr": "", "position": 1}, - ), - ("additional_inp_A", str, {"help_string": "additional inp A"}), - ("additional_inp_B", int, {"help_string": "additional inp B"}), - ], - bases=(ShellSpec,), - ) - - my_output_spec = SpecInfo( - name="Output", - fields=[ - ( - "newfile1", - File, - { - "output_file_template": "{file1}", - "help_string": "newfile 1", - # requires is a list of list so it's treated as el[0] OR el[1] OR... - "requires": [ - ["file1", "additional_inp_A"], - ["file1", "additional_inp_B"], - ], - }, - ) - ], - bases=(ShellOutSpec,), - ) - shelly = ShellCommandTask( - name="shelly", - executable=cmd, - input_spec=my_input_spec, - output_spec=my_output_spec, - ) - shelly.inputs.file1 = "new_file_1.txt" - shelly.inputs.additional_inp_B = 2 - - res = shelly() - assert res.output.stdout == "" - assert res.output.newfile1.fspath.exists() - - -def test_shell_cmd_inputspec_outputspec_5b(): - """ - customised input_spec and output_spec, output_spec uses input_spec fields in the requires - requires is a list of list so it is treated as OR list (i.e. el[0] OR el[1] OR...) - neither of the list from requirements has all the fields set, so the output is NOTHING - """ - cmd = ["touch", "newfile_tmp.txt"] - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "file1", - str, - {"help_string": "1st creadted file", "argstr": "", "position": 1}, - ), - ("additional_inp_A", str, {"help_string": "additional inp A"}), - ("additional_inp_B", str, {"help_string": "additional inp B"}), - ], - bases=(ShellSpec,), - ) - - my_output_spec = SpecInfo( - name="Output", - fields=[ - ( - "newfile1", - File, - { - "output_file_template": "{file1}", - "help_string": "newfile 1", - # requires is a list of list so it's treated as el[0] OR el[1] OR... 
- "requires": [ - ["file1", "additional_inp_A"], - ["file1", "additional_inp_B"], - ], - }, - ) - ], - bases=(ShellOutSpec,), - ) - shelly = ShellCommandTask( - name="shelly", - executable=cmd, - input_spec=my_input_spec, - output_spec=my_output_spec, - ) - shelly.inputs.file1 = "new_file_1.txt" - - res = shelly() - assert res.output.stdout == "" - # neither additional_inp_A nor additional_inp_B is set, so newfile1 is NOTHING - assert res.output.newfile1 is attr.NOTHING - - -def test_shell_cmd_inputspec_outputspec_6_except(): - """ - customised input_spec and output_spec, output_spec uses input_spec fields in the requires - requires has invalid syntax - exception is raised - """ - cmd = ["touch", "newfile_tmp.txt"] - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "file1", - str, - {"help_string": "1st creadted file", "argstr": "", "position": 1}, - ), - ("additional_inp_A", str, {"help_string": "additional inp A"}), - ], - bases=(ShellSpec,), - ) - - my_output_spec = SpecInfo( - name="Output", - fields=[ - ( - "newfile1", - File, - { - "output_file_template": "{file1}", - "help_string": "newfile 1", - # requires has invalid syntax - "requires": [["file1", "additional_inp_A"], "file1"], - }, - ) - ], - bases=(ShellOutSpec,), - ) - shelly = ShellCommandTask( - name="shelly", - executable=cmd, - input_spec=my_input_spec, - output_spec=my_output_spec, - ) - shelly.inputs.file1 = "new_file_1.txt" - - with pytest.raises(Exception, match="requires field can be"): - shelly() - - -def no_fsl(): - if "FSLDIR" not in os.environ: - return True - - -@pytest.mark.skipif(no_fsl(), reason="fsl is not installed") -def test_fsl(data_tests_dir): - """mandatory field added to fields, value provided""" - - _xor_inputs = [ - "functional", - "reduce_bias", - "robust", - "padding", - "remove_eyes", - "surfaces", - "t2_guided", - ] - - def change_name(file): - name, ext = os.path.splitext(file) - return f"{name}_brain.{ext}" - - bet_input_spec = SpecInfo( - name="Input", - # TODO: change the position?? 
- fields=[ - ( - "in_file", - attr.ib( - type=File, - metadata={ - "help_string": "input file to skull strip", - "position": 1, - "mandatory": True, - "argstr": "", - }, - ), - ), - ( - "out_file", - attr.ib( - type=str, - metadata={ - "help_string": "name of output skull stripped image", - "position": 2, - "argstr": "", - "output_file_template": "{in_file}_brain", - }, - ), - ), - ( - "outline", - attr.ib( - type=bool, - metadata={ - "help_string": "create surface outline image", - "argstr": "-o", - }, - ), - ), - ( - "mask", - attr.ib( - type=bool, - metadata={ - "help_string": "create binary mask image", - "argstr": "-m", - }, - ), - ), - ( - "skull", - attr.ib( - type=bool, - metadata={"help_string": "create skull image", "argstr": "-s"}, - ), - ), - ( - "no_output", - attr.ib( - type=bool, - metadata={ - "help_string": "Don't generate segmented output", - "argstr": "-n", - }, - ), - ), - ( - "frac", - attr.ib( - type=float, - metadata={ - "help_string": "fractional intensity threshold", - "argstr": "-f", - }, - ), - ), - ( - "vertical_gradient", - attr.ib( - type=float, - metadata={ - "help_string": "vertical gradient in fractional intensity threshold (-1, 1)", - "argstr": "-g", - "allowed_values": {"min_val": -1, "max_val": 1}, - }, - ), - ), - ( - "radius", - attr.ib( - type=int, metadata={"argstr": "-r", "help_string": "head radius"} - ), - ), - ( - "center", - attr.ib( - type=ty.List[int], - metadata={ - "help_string": "center of gravity in voxels", - "argstr": "-c", - "allowed_values": {"min_value": 0, "max_value": 3}, - }, - ), - ), - ( - "threshold", - attr.ib( - type=bool, - metadata={ - "argstr": "-t", - "help_string": "apply thresholding to segmented brain image and mask", - }, - ), - ), - ( - "mesh", - attr.ib( - type=bool, - metadata={ - "argstr": "-e", - "help_string": "generate a vtk mesh brain surface", - }, - ), - ), - ( - "robust", - attr.ib( - type=bool, - metadata={ - "help_string": "robust brain centre estimation (iterates BET several times)", - "argstr": "-R", - "xor": _xor_inputs, - }, - ), - ), - ( - "padding", - attr.ib( - type=bool, - metadata={ - "help_string": "improve BET if FOV is very small in Z (by temporarily padding end slices", - "argstr": "-Z", - "xor": _xor_inputs, - }, - ), - ), - ( - "remove_eyes", - attr.ib( - type=bool, - metadata={ - "help_string": "eye & optic nerve cleanup (can be useful in SIENA)", - "argstr": "-S", - "xor": _xor_inputs, - }, - ), - ), - ( - "surfaces", - attr.ib( - type=bool, - metadata={ - "help_string": "run bet2 and then betsurf to get additional skull and scalp surfaces (includes registrations)", - "argstr": "-A", - "xor": _xor_inputs, - }, - ), - ), - ( - "t2_guided", - attr.ib( - type=ty.Union[File, str], - metadata={ - "help_string": "as with creating surfaces, when also feeding in non-brain-extracted T2 (includes registrations)", - "argstr": "-A2", - "xor": _xor_inputs, - }, - ), - ), - ( - "functional", - attr.ib( - type=bool, - metadata={ - "argstr": "-F", - "xor": _xor_inputs, - "help_string": "apply to 4D fMRI data", - }, - ), - ), - ( - "reduce_bias", - attr.ib( - type=bool, - metadata={ - "argstr": "-B", - "xor": _xor_inputs, - "help_string": "bias field and neck cleanup", - }, - ), - ), - # ("number_classes", int, attr.ib(metadata={"help_string": 'number of tissue-type classes', "argstr": '-n', - # "allowed_values": {"min_val": 1, "max_val": 10}})), - # ("output_biasfield", bool, - # attr.ib(metadata={"help_string": 'output estimated bias field', "argstr": '-b'})), - # ("output_biascorrected", bool, - # 
attr.ib(metadata={"help_string": 'output restored image (bias-corrected image)', "argstr": '-B'})), - ], - bases=(ShellSpec,), - ) - - # TODO: not sure why this has to be string - in_file = data_tests_dir / "test.nii.gz" - - # separate command into exec + args - shelly = ShellCommandTask( - name="bet_task", executable="bet", in_file=in_file, input_spec=bet_input_spec - ) - out_file = shelly.output_dir / "test_brain.nii.gz" - assert shelly.inputs.executable == "bet" - assert shelly.cmdline == f"bet {in_file} {out_file}" - # res = shelly(plugin="cf") - - -def test_shell_cmd_optional_output_file1(tmp_path): - """ - Test to see that 'unused' doesn't complain about not having an output passed to it - """ - my_cp_spec = SpecInfo( - name="Input", - fields=[ - ( - "input", - attr.ib( - type=File, metadata={"argstr": "", "help_string": "input file"} - ), - ), - ( - "output", - attr.ib( - type=Path, - metadata={ - "argstr": "", - "output_file_template": "out.txt", - "help_string": "output file", - }, - ), - ), - ( - "unused", - attr.ib( - type=ty.Union[Path, bool], - default=False, - metadata={ - "argstr": "--not-used", - "output_file_template": "out.txt", - "help_string": "dummy output", - }, - ), - ), - ], - bases=(ShellSpec,), - ) - - my_cp = ShellCommandTask( - name="my_cp", - executable="cp", - input_spec=my_cp_spec, - ) - file1 = tmp_path / "file1.txt" - file1.write_text("foo") - result = my_cp(input=file1, unused=False) - assert result.output.output.fspath.read_text() == "foo" - - -def test_shell_cmd_optional_output_file2(tmp_path): - """ - Test to see that 'unused' doesn't complain about not having an output passed to it - """ - my_cp_spec = SpecInfo( - name="Input", - fields=[ - ( - "input", - attr.ib( - type=File, metadata={"argstr": "", "help_string": "input file"} - ), - ), - ( - "output", - attr.ib( - type=ty.Union[Path, bool], - default=False, - metadata={ - "argstr": "", - "output_file_template": "out.txt", - "help_string": "dummy output", - }, - ), - ), - ], - bases=(ShellSpec,), - ) - - my_cp = ShellCommandTask( - name="my_cp", - executable="cp", - input_spec=my_cp_spec, - ) - file1 = tmp_path / "file1.txt" - file1.write_text("foo") - result = my_cp(input=file1, output=True) - assert result.output.output.fspath.read_text() == "foo" - - file2 = tmp_path / "file2.txt" - file2.write_text("bar") - with pytest.raises(RuntimeError): - my_cp(input=file2, output=False) - - -def test_shell_cmd_non_existing_outputs_1(tmp_path): - """Checking that non existing output files do not return a phantom path, - but return NOTHING instead""" - input_spec = SpecInfo( - name="Input", - fields=[ - ( - "out_name", - attr.ib( - type=str, - metadata={ - "help_string": """ - base name of the pretend outputs. 
- """, - "mandatory": True, - }, - ), - ) - ], - bases=(ShellSpec,), - ) - out_spec = SpecInfo( - name="Output", - fields=[ - ( - "out_1", - attr.ib( - type=File, - metadata={ - "help_string": "fictional output #1", - "output_file_template": "{out_name}_1.nii", - }, - ), - ), - ( - "out_2", - attr.ib( - type=File, - metadata={ - "help_string": "fictional output #2", - "output_file_template": "{out_name}_2.nii", - }, - ), - ), - ], - bases=(ShellOutSpec,), - ) - - shelly = ShellCommandTask( - cache_dir=tmp_path, - executable="echo", - input_spec=input_spec, - output_spec=out_spec, - out_name="test", - ) - shelly() - res = shelly.result() - assert res.output.out_1 == attr.NOTHING and res.output.out_2 == attr.NOTHING - - -def test_shell_cmd_non_existing_outputs_2(tmp_path): - """Checking that non existing output files do not return a phantom path, - but return NOTHING instead. This test has one existing and one non existing output file. - """ - input_spec = SpecInfo( - name="Input", - fields=[ - ( - "out_name", - attr.ib( - type=str, - metadata={ - "help_string": """ - base name of the pretend outputs. - """, - "mandatory": True, - "argstr": "{out_name}_1.nii", - }, - ), - ) - ], - bases=(ShellSpec,), - ) - out_spec = SpecInfo( - name="Output", - fields=[ - ( - "out_1", - attr.ib( - type=File, - metadata={ - "help_string": "fictional output #1", - "output_file_template": "{out_name}_1.nii", - }, - ), - ), - ( - "out_2", - attr.ib( - type=File, - metadata={ - "help_string": "fictional output #2", - "output_file_template": "{out_name}_2.nii", - }, - ), - ), - ], - bases=(ShellOutSpec,), - ) - - shelly = ShellCommandTask( - cache_dir=tmp_path, - executable="touch", - input_spec=input_spec, - output_spec=out_spec, - out_name="test", - ) - shelly() - res = shelly.result() - # the first output file is created - assert res.output.out_1.fspath == Path(shelly.output_dir) / Path("test_1.nii") - assert res.output.out_1.fspath.exists() - # the second output file is not created - assert res.output.out_2 == attr.NOTHING - - -def test_shell_cmd_non_existing_outputs_3(tmp_path): - """Checking that non existing output files do not return a phantom path, - but return NOTHING instead. This test has an existing mandatory output and another non existing output file. - """ - input_spec = SpecInfo( - name="Input", - fields=[ - ( - "out_name", - attr.ib( - type=str, - metadata={ - "help_string": """ - base name of the pretend outputs. 
- """, - "mandatory": True, - "argstr": "{out_name}_1.nii", - }, - ), - ) - ], - bases=(ShellSpec,), - ) - out_spec = SpecInfo( - name="Output", - fields=[ - ( - "out_1", - attr.ib( - type=File, - metadata={ - "help_string": "fictional output #1", - "output_file_template": "{out_name}_1.nii", - "mandatory": True, - }, - ), - ), - ( - "out_2", - attr.ib( - type=File, - metadata={ - "help_string": "fictional output #2", - "output_file_template": "{out_name}_2.nii", - }, - ), - ), - ], - bases=(ShellOutSpec,), - ) - - shelly = ShellCommandTask( - cache_dir=tmp_path, - executable="touch", - input_spec=input_spec, - output_spec=out_spec, - out_name="test", - ) - shelly() - res = shelly.result() - # the first output file is created - assert res.output.out_1.fspath == Path(shelly.output_dir) / Path("test_1.nii") - assert res.output.out_1.fspath.exists() - # the second output file is not created - assert res.output.out_2 == attr.NOTHING - - -def test_shell_cmd_non_existing_outputs_4(tmp_path): - """Checking that non existing output files do not return a phantom path, - but return NOTHING instead. This test has an existing mandatory output and another non existing - mandatory output file.""" - input_spec = SpecInfo( - name="Input", - fields=[ - ( - "out_name", - attr.ib( - type=str, - metadata={ - "help_string": """ - base name of the pretend outputs. - """, - "mandatory": True, - "argstr": "{out_name}_1.nii", - }, - ), - ) - ], - bases=(ShellSpec,), - ) - out_spec = SpecInfo( - name="Output", - fields=[ - ( - "out_1", - attr.ib( - type=File, - metadata={ - "help_string": "fictional output #1", - "output_file_template": "{out_name}_1.nii", - "mandatory": True, - }, - ), - ), - ( - "out_2", - attr.ib( - type=File, - metadata={ - "help_string": "fictional output #2", - "output_file_template": "{out_name}_2.nii", - "mandatory": True, - }, - ), - ), - ], - bases=(ShellOutSpec,), - ) - - shelly = ShellCommandTask( - cache_dir=tmp_path, - executable="touch", - input_spec=input_spec, - output_spec=out_spec, - out_name="test", - ) - # An exception should be raised because the second mandatory output does not exist - with pytest.raises(Exception) as excinfo: - shelly() - assert "mandatory output for variable out_2 does not exist" == str(excinfo.value) - # checking if the first output was created - assert (Path(shelly.output_dir) / Path("test_1.nii")).exists() - - -def test_shell_cmd_non_existing_outputs_multi_1(tmp_path): - """This test looks if non existing files of an multiOuputFile are also set to NOTHING""" - input_spec = SpecInfo( - name="Input", - fields=[ - ( - "out_name", - attr.ib( - type=MultiInputObj, - metadata={ - "help_string": """ - base name of the pretend outputs. - """, - "mandatory": True, - "argstr": "...", - }, - ), - ) - ], - bases=(ShellSpec,), - ) - out_spec = SpecInfo( - name="Output", - fields=[ - ( - "out_list", - attr.ib( - type=MultiOutputFile, - metadata={ - "help_string": "fictional output #1", - "output_file_template": "{out_name}", - }, - ), - ), - ], - bases=(ShellOutSpec,), - ) - - shelly = ShellCommandTask( - cache_dir=tmp_path, - executable="echo", - input_spec=input_spec, - output_spec=out_spec, - out_name=["test_1.nii", "test_2.nii"], - ) - shelly() - res = shelly.result() - # checking if the outputs are Nothing - assert res.output.out_list[0] == attr.NOTHING - assert res.output.out_list[1] == attr.NOTHING - - -def test_shell_cmd_non_existing_outputs_multi_2(tmp_path): - """This test looks if non existing files of an multiOutputFile are also set to NOTHING. 
- It checks that it also works if one file of the multiOutputFile actually exists.""" - input_spec = SpecInfo( - name="Input", - fields=[ - ( - "out_name", - attr.ib( - type=MultiInputObj, - metadata={ - "help_string": """ - base name of the pretend outputs. - """, - "sep": " test_1_real.nii", # hacky way of creating an extra file with that name - "mandatory": True, - "argstr": "...", - }, - ), - ) - ], - bases=(ShellSpec,), - ) - out_spec = SpecInfo( - name="Output", - fields=[ - ( - "out_list", - attr.ib( - type=MultiOutputFile, - metadata={ - "help_string": "fictional output #1", - "output_file_template": "{out_name}_real.nii", - }, - ), - ), - ], - bases=(ShellOutSpec,), - ) - - shelly = ShellCommandTask( - cache_dir=tmp_path, - executable="touch", - input_spec=input_spec, - output_spec=out_spec, - out_name=["test_1", "test_2"], - ) - shelly() - res = shelly.result() - # checking if the outputs are Nothing - assert res.output.out_list[0] == File(Path(shelly.output_dir) / "test_1_real.nii") - assert res.output.out_list[1] == attr.NOTHING - - -@pytest.mark.xfail( - reason=( - "Not sure what the desired behaviour for formatter 5 is. Field is declared as a list " - "but a string containing the formatted arg is passed instead." - ) -) -def test_shellspec_formatter_1(tmp_path): - """test the input callable 'formatter'.""" - - def spec_info(formatter): - return SpecInfo( - name="Input", - fields=[ - ( - "in1", - attr.ib( - type=str, - metadata={ - "help_string": """ - just a dummy name - """, - "mandatory": True, - }, - ), - ), - ( - "in2", - attr.ib( - type=str, - metadata={ - "help_string": """ - just a dummy name - """, - "mandatory": True, - }, - ), - ), - ( - "together", - attr.ib( - type=ty.List, - metadata={ - "help_string": """ - combines in1 and in2 into a list - """, - # When providing a formatter all other metadata options are discarded. - "formatter": formatter, - }, - ), - ), - ], - bases=(ShellSpec,), - ) - - def formatter_1(inputs): - print("FORMATTER:", inputs) - return f"-t [{inputs['in1']}, {inputs['in2']}]" - - input_spec = spec_info(formatter_1) - shelly = ShellCommandTask( - executable="exec", input_spec=input_spec, in1="i1", in2="i2" - ) - assert shelly.cmdline == "exec -t [i1, i2]" - - # testing that the formatter can overwrite a provided value for together. 
- shelly = ShellCommandTask( - executable="exec", - input_spec=input_spec, - in1="i1", - in2="i2", - together=[1], - ) - assert shelly.cmdline == "exec -t [i1, i2]" - - # asking for specific inputs - def formatter_2(in1, in2): - print("FORMATTER:", in1, in2) - return f"-t [{in1}, {in2}]" - - input_spec = spec_info(formatter_2) - - shelly = ShellCommandTask( - executable="exec", input_spec=input_spec, in1="i1", in2="i2" - ) - assert shelly.cmdline == "exec -t [i1, i2]" - - def formatter_3(in1, in3): - print("FORMATTER:", in1, in3) - return f"-t [{in1}, {in3}]" - - input_spec = spec_info(formatter_3) - - shelly = ShellCommandTask( - executable="exec", input_spec=input_spec, in1="i1", in2="i2" - ) - with pytest.raises(Exception) as excinfo: - shelly.cmdline - assert ( - "arguments of the formatter function from together has to be in inputs or be field or output_dir, but in3 is used" - == str(excinfo.value) - ) - - # checking if field value is accessible when None - def formatter_5(field): - assert field == "-t test" - # formatter must return a string - return field - - input_spec = spec_info(formatter_5) - - shelly = ShellCommandTask( - executable="exec", - input_spec=input_spec, - in1="i1", - in2="i2", - # together="-t test", - ) - assert shelly.cmdline == "exec -t test" - - # checking if field value is accessible when None - def formatter_4(field): - assert field is None - # formatter must return a string - return "" - - input_spec = spec_info(formatter_4) - - shelly = ShellCommandTask( - executable="exec", input_spec=input_spec, in1="i1", in2="i2" - ) - assert shelly.cmdline == "exec" - - -def test_shellspec_formatter_splitter_2(tmp_path): - """test the input callable 'formatter' when a splitter is used on an argument of the formatter.""" - - def spec_info(formatter): - return SpecInfo( - name="Input", - fields=[ - ( - "in1", - attr.ib( - type=str, - metadata={ - "help_string": "in1", - }, - ), - ), - ( - "in2", - attr.ib( - type=str, - metadata={ - "help_string": "in2", - }, - ), - ), - ( - "together", - attr.ib( - type=ty.List, - metadata={ - "help_string": """ - uses in1 - """, - # When providing a formatter all other metadata options are discarded. 
- "formatter": formatter, - }, - ), - ), - ], - bases=(ShellSpec,), - ) - - # asking for specific inputs - def formatter_1(in1, in2): - return f"-t [{in1} {in2}]" - - input_spec = spec_info(formatter_1) - in1 = ["in11", "in12"] - shelly = ShellCommandTask( - name="f", executable="executable", input_spec=input_spec, in2="in2" - ).split("in1", in1=in1) - assert shelly is not None - - # results = shelly.cmdline - # assert len(results) == 2 - # com_results = ["executable -t [in11 in2]", "executable -t [in12 in2]"] - # for i, cr in enumerate(com_results): - # assert results[i] == cr - - -@no_win -def test_shellcommand_error_msg(tmp_path): - script_path = Path(tmp_path) / "script.sh" - - with open(script_path, "w") as f: - f.write( - """#!/bin/bash - echo "first line is ok, it prints '$1'" - /command-that-doesnt-exist""" - ) - - os.chmod( - script_path, - mode=( - stat.S_IRUSR - | stat.S_IWUSR - | stat.S_IXUSR - | stat.S_IRGRP - | stat.S_IWGRP - | stat.S_IROTH - ), - ) - - input_spec = SpecInfo( - name="Input", - fields=[ - ( - "in1", - str, - {"help_string": "a dummy string", "argstr": "", "mandatory": True}, - ), - ], - bases=(ShellSpec,), - ) - - shelly = ShellCommandTask( - name="err_msg", executable=str(script_path), input_spec=input_spec, in1="hello" - ) - - with pytest.raises(RuntimeError) as excinfo: - shelly() - - path_str = str(script_path) - - assert ( - str(excinfo.value) - == f"""Error running 'err_msg' task with ['{path_str}', 'hello']: - -stderr: -{path_str}: line 3: /command-that-doesnt-exist: No such file or directory - - -stdout: -first line is ok, it prints 'hello' -""" - ) diff --git a/pydra/engine/tests/test_shelltask_inputspec.py b/pydra/engine/tests/test_shelltask_inputspec.py deleted file mode 100644 index 9bc7f7a232..0000000000 --- a/pydra/engine/tests/test_shelltask_inputspec.py +++ /dev/null @@ -1,2297 +0,0 @@ -import typing as ty -from pathlib import Path -import attr -import pytest - -from ..task import ShellCommandTask -from ..specs import ( - ShellOutSpec, - ShellSpec, - SpecInfo, - File, - MultiInputObj, -) - - -def test_shell_cmd_execargs_1(): - # separate command into exec + args - shelly = ShellCommandTask(executable="executable", args="arg") - assert shelly.cmdline == "executable arg" - assert shelly.name == "ShellTask_noname" - - -def test_shell_cmd_execargs_2(): - # separate command into exec + args - shelly = ShellCommandTask(executable=["cmd_1", "cmd_2"], args="arg") - assert shelly.cmdline == "cmd_1 cmd_2 arg" - - -def test_shell_cmd_inputs_1(): - """additional input with provided position""" - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "inpA", - attr.ib( - type=str, - metadata={"position": 1, "help_string": "inp1", "argstr": ""}, - ), - ) - ], - bases=(ShellSpec,), - ) - - shelly = ShellCommandTask( - executable="executable", args="arg", inpA="inp1", input_spec=my_input_spec - ) - assert shelly.cmdline == "executable inp1 arg" - - -def test_shell_cmd_inputs_1a(): - """additional input without provided position""" - my_input_spec = SpecInfo( - name="Input", - fields=[ - ("inpA", attr.ib(type=str, metadata={"help_string": "inpA", "argstr": ""})) - ], - bases=(ShellSpec,), - ) - - shelly = ShellCommandTask( - executable="executable", args="arg", inpA="inpNone1", input_spec=my_input_spec - ) - # inp1 should be the first one after executable - assert shelly.cmdline == "executable inpNone1 arg" - - -def test_shell_cmd_inputs_1b(): - """additional input with negative position""" - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - 
"inpA", - attr.ib( - type=str, - metadata={"position": -1, "help_string": "inpA", "argstr": ""}, - ), - ) - ], - bases=(ShellSpec,), - ) - - # separate command into exec + args - shelly = ShellCommandTask( - executable="executable", args="arg", inpA="inp-1", input_spec=my_input_spec - ) - # inp1 should be last before arg - assert shelly.cmdline == "executable inp-1 arg" - - -def test_shell_cmd_inputs_1_st(): - """additional input with provided position, checking cmdline when splitter""" - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "inpA", - attr.ib( - type=str, - metadata={"position": 1, "help_string": "inp1", "argstr": ""}, - ), - ) - ], - bases=(ShellSpec,), - ) - - ShellCommandTask( - name="shelly", - executable="executable", - args="arg", - input_spec=my_input_spec, - ).split("inpA", inpA=["inp1", "inp2"]) - # cmdline should be a list - # assert shelly.cmdline[0] == "executable inp1 arg" - # assert shelly.cmdline[1] == "executable inp2 arg" - - -def test_shell_cmd_inputs_2(): - """additional inputs with provided positions""" - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "inpA", - attr.ib( - type=str, - metadata={"position": 2, "help_string": "inpA", "argstr": ""}, - ), - ), - ( - "inpB", - attr.ib( - type=str, - metadata={"position": 1, "help_string": "inpN", "argstr": ""}, - ), - ), - ], - bases=(ShellSpec,), - ) - - # separate command into exec + args - shelly = ShellCommandTask( - executable="executable", inpB="inp1", inpA="inp2", input_spec=my_input_spec - ) - assert shelly.cmdline == "executable inp1 inp2" - - -def test_shell_cmd_inputs_2a(): - """additional inputs without provided positions""" - my_input_spec = SpecInfo( - name="Input", - fields=[ - ("inpA", attr.ib(type=str, metadata={"help_string": "inpA", "argstr": ""})), - ("inpB", attr.ib(type=str, metadata={"help_string": "inpB", "argstr": ""})), - ], - bases=(ShellSpec,), - ) - - # separate command into exec + args - shelly = ShellCommandTask( - executable="executable", - inpA="inpNone1", - inpB="inpNone2", - input_spec=my_input_spec, - ) - # position taken from the order in input spec - assert shelly.cmdline == "executable inpNone1 inpNone2" - - -def test_shell_cmd_inputs_2_err(): - """additional inputs with provided positions (exception due to the duplication)""" - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "inpA", - attr.ib( - type=str, - metadata={"position": 1, "help_string": "inpA", "argstr": ""}, - ), - ), - ( - "inpB", - attr.ib( - type=str, - metadata={"position": 1, "help_string": "inpB", "argstr": ""}, - ), - ), - ], - bases=(ShellSpec,), - ) - - shelly = ShellCommandTask( - executable="executable", inpA="inp1", inpB="inp2", input_spec=my_input_spec - ) - with pytest.raises(Exception) as e: - shelly.cmdline - assert "1 is already used" in str(e.value) - - -def test_shell_cmd_inputs_2_noerr(): - """additional inputs with provided positions - (duplication of the position doesn't lead to error, since only one field has value) - """ - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "inpA", - attr.ib( - type=str, - metadata={"position": 1, "help_string": "inpA", "argstr": ""}, - ), - ), - ( - "inpB", - attr.ib( - type=str, - metadata={"position": 1, "help_string": "inpB", "argstr": ""}, - ), - ), - ], - bases=(ShellSpec,), - ) - - shelly = ShellCommandTask( - executable="executable", inpA="inp1", input_spec=my_input_spec - ) - shelly.cmdline - - -def test_shell_cmd_inputs_3(): - """additional inputs: positive pos, negative pos and no pos""" - my_input_spec = 
SpecInfo( - name="Input", - fields=[ - ( - "inpA", - attr.ib( - type=str, - metadata={"position": 1, "help_string": "inpA", "argstr": ""}, - ), - ), - ( - "inpB", - attr.ib( - type=str, - metadata={"position": -1, "help_string": "inpB", "argstr": ""}, - ), - ), - ("inpC", attr.ib(type=str, metadata={"help_string": "inpC", "argstr": ""})), - ], - bases=(ShellSpec,), - ) - - # separate command into exec + args - shelly = ShellCommandTask( - executable="executable", - inpA="inp1", - inpB="inp-1", - inpC="inpNone", - input_spec=my_input_spec, - ) - # input without position should be between positive an negative positions - assert shelly.cmdline == "executable inp1 inpNone inp-1" - - -def test_shell_cmd_inputs_argstr_1(): - """additional string inputs with argstr""" - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "inpA", - attr.ib( - type=str, - metadata={"position": 1, "help_string": "inpA", "argstr": "-v"}, - ), - ) - ], - bases=(ShellSpec,), - ) - - shelly = ShellCommandTask( - executable="executable", inpA="inp1", input_spec=my_input_spec - ) - # flag used before inp1 - assert shelly.cmdline == "executable -v inp1" - - -def test_shell_cmd_inputs_argstr_2(): - """additional bool inputs with argstr""" - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "inpA", - attr.ib( - type=bool, - metadata={"position": 1, "help_string": "inpA", "argstr": "-v"}, - ), - ) - ], - bases=(ShellSpec,), - ) - - # separate command into exec + args - shelly = ShellCommandTask( - executable="executable", args="arg", inpA=True, input_spec=my_input_spec - ) - # a flag is used without any additional argument - assert shelly.cmdline == "executable -v arg" - - -def test_shell_cmd_inputs_list_1(): - """providing list as an additional input, no sep, no argstr""" - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "inpA", - attr.ib( - type=ty.List[str], - metadata={"position": 2, "help_string": "inpA", "argstr": ""}, - ), - ) - ], - bases=(ShellSpec,), - ) - - shelly = ShellCommandTask( - executable="executable", inpA=["el_1", "el_2", "el_3"], input_spec=my_input_spec - ) - # multiple elements - assert shelly.cmdline == "executable el_1 el_2 el_3" - - -def test_shell_cmd_inputs_list_2(): - """providing list as an additional input, no sep, but argstr""" - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "inpA", - attr.ib( - type=ty.List[str], - metadata={"position": 2, "help_string": "inpA", "argstr": "-v"}, - ), - ) - ], - bases=(ShellSpec,), - ) - - shelly = ShellCommandTask( - executable="executable", inpA=["el_1", "el_2", "el_3"], input_spec=my_input_spec - ) - assert shelly.cmdline == "executable -v el_1 el_2 el_3" - - -def test_shell_cmd_inputs_list_3(): - """providing list as an additional input, no sep, argstr with ...""" - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "inpA", - attr.ib( - type=ty.List[str], - metadata={"position": 2, "help_string": "inpA", "argstr": "-v..."}, - ), - ) - ], - bases=(ShellSpec,), - ) - - shelly = ShellCommandTask( - executable="executable", inpA=["el_1", "el_2", "el_3"], input_spec=my_input_spec - ) - # a flag is repeated - assert shelly.cmdline == "executable -v el_1 -v el_2 -v el_3" - - -def test_shell_cmd_inputs_list_sep_1(): - """providing list as an additional input:, sep, no argstr""" - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "inpA", - attr.ib( - type=MultiInputObj[str], - metadata={ - "position": 1, - "help_string": "inpA", - "sep": ",", - "argstr": "", - }, - ), - ) - ], - bases=(ShellSpec,), - ) - - 
shelly = ShellCommandTask( - executable="executable", - inpA=["aaa", "bbb", "ccc"], - input_spec=my_input_spec, - ) - # separated by commas - assert shelly.cmdline == "executable aaa,bbb,ccc" - - -def test_shell_cmd_inputs_list_sep_2(): - """providing list as an additional input:, sep, and argstr""" - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "inpA", - attr.ib( - type=MultiInputObj[str], - metadata={ - "position": 1, - "help_string": "inpA", - "sep": ",", - "argstr": "-v", - }, - ), - ) - ], - bases=(ShellSpec,), - ) - - shelly = ShellCommandTask( - executable="executable", - inpA=["aaa", "bbb", "ccc"], - input_spec=my_input_spec, - ) - # a flag is used once - assert shelly.cmdline == "executable -v aaa,bbb,ccc" - - -def test_shell_cmd_inputs_list_sep_2a(): - """providing list as an additional input:, sep, and argstr with f-string""" - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "inpA", - attr.ib( - type=MultiInputObj[str], - metadata={ - "position": 1, - "help_string": "inpA", - "sep": ",", - "argstr": "-v {inpA}", - }, - ), - ) - ], - bases=(ShellSpec,), - ) - - shelly = ShellCommandTask( - executable="executable", - inpA=["aaa", "bbb", "ccc"], - input_spec=my_input_spec, - ) - # a flag is used once - assert shelly.cmdline == "executable -v aaa,bbb,ccc" - - -def test_shell_cmd_inputs_list_sep_3(): - """providing list as an additional input:, sep, argstr with ...""" - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "inpA", - attr.ib( - type=MultiInputObj[str], - metadata={ - "position": 1, - "help_string": "inpA", - "sep": ",", - "argstr": "-v...", - }, - ), - ) - ], - bases=(ShellSpec,), - ) - - shelly = ShellCommandTask( - executable="executable", - inpA=["aaa", "bbb", "ccc"], - input_spec=my_input_spec, - ) - # a flag is repeated - assert shelly.cmdline == "executable -v aaa, -v bbb, -v ccc" - - -def test_shell_cmd_inputs_list_sep_3a(): - """providing list as an additional input:, sep, argstr with ... 
and f-string""" - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "inpA", - attr.ib( - type=MultiInputObj[str], - metadata={ - "position": 1, - "help_string": "inpA", - "sep": ",", - "argstr": "-v {inpA}...", - }, - ), - ) - ], - bases=(ShellSpec,), - ) - - shelly = ShellCommandTask( - executable="executable", - inpA=["aaa", "bbb", "ccc"], - input_spec=my_input_spec, - ) - # a flag is repeated - assert shelly.cmdline == "executable -v aaa, -v bbb, -v ccc" - - -def test_shell_cmd_inputs_sep_4(): - """providing 1-el list as an additional input:, sep, argstr with ...,""" - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "inpA", - attr.ib( - type=MultiInputObj[str], - metadata={ - "position": 1, - "help_string": "inpA", - "sep": ",", - "argstr": "-v...", - }, - ), - ) - ], - bases=(ShellSpec,), - ) - - shelly = ShellCommandTask( - executable="executable", inpA=["aaa"], input_spec=my_input_spec - ) - assert shelly.cmdline == "executable -v aaa" - - -def test_shell_cmd_inputs_sep_4a(): - """providing str instead of list as an additional input:, sep, argstr with ...""" - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "inpA", - attr.ib( - type=str, - metadata={ - "position": 1, - "help_string": "inpA", - "sep": ",", - "argstr": "-v...", - }, - ), - ) - ], - bases=(ShellSpec,), - ) - - shelly = ShellCommandTask( - executable="executable", inpA="aaa", input_spec=my_input_spec - ) - assert shelly.cmdline == "executable -v aaa" - - -def test_shell_cmd_inputs_format_1(): - """additional inputs with argstr that has string formatting""" - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "inpA", - attr.ib( - type=str, - metadata={ - "position": 1, - "help_string": "inpA", - "argstr": "-v {inpA}", - }, - ), - ) - ], - bases=(ShellSpec,), - ) - - shelly = ShellCommandTask( - executable="executable", inpA="aaa", input_spec=my_input_spec - ) - assert shelly.cmdline == "executable -v aaa" - - -def test_shell_cmd_inputs_format_2(): - """additional inputs with argstr that has string formatting and ...""" - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "inpA", - attr.ib( - type=MultiInputObj[str], - metadata={ - "position": 1, - "help_string": "inpA", - "argstr": "-v {inpA}...", - }, - ), - ) - ], - bases=(ShellSpec,), - ) - - shelly = ShellCommandTask( - executable="executable", - inpA=["el_1", "el_2"], - input_spec=my_input_spec, - ) - assert shelly.cmdline == "executable -v el_1 -v el_2" - - -def test_shell_cmd_inputs_format_3(): - """adding float formatting for argstr with input field""" - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "inpA", - attr.ib( - type=float, - metadata={ - "position": 1, - "help_string": "inpA", - "argstr": "-v {inpA:.5f}", - }, - ), - ) - ], - bases=(ShellSpec,), - ) - - shelly = ShellCommandTask( - executable="executable", inpA=0.007, input_spec=my_input_spec - ) - assert shelly.cmdline == "executable -v 0.00700" - - -def test_shell_cmd_inputs_mandatory_1(): - """additional inputs with mandatory=True""" - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "inpA", - attr.ib( - type=str, - metadata={ - "position": 1, - "help_string": "inpA", - "argstr": "", - "mandatory": True, - }, - ), - ) - ], - bases=(ShellSpec,), - ) - - shelly = ShellCommandTask(executable="executable", input_spec=my_input_spec) - with pytest.raises(Exception) as e: - shelly.cmdline - assert "mandatory" in str(e.value) - - -def test_shell_cmd_inputs_not_given_1(): - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "arg1", - 
attr.ib( - type=MultiInputObj, - metadata={ - "argstr": "--arg1", - "help_string": "Command line argument 1", - }, - ), - ), - ( - "arg2", - attr.ib( - type=MultiInputObj, - metadata={ - "argstr": "--arg2", - "help_string": "Command line argument 2", - }, - ), - ), - ( - "arg3", - attr.ib( - type=File, - metadata={ - "argstr": "--arg3", - "help_string": "Command line argument 3", - }, - ), - ), - ], - bases=(ShellSpec,), - ) - shelly = ShellCommandTask( - name="shelly", executable="executable", input_spec=my_input_spec - ) - - shelly.inputs.arg2 = "argument2" - - assert shelly.cmdline == "executable --arg2 argument2" - - -def test_shell_cmd_inputs_template_1(): - """additional inputs, one uses output_file_template (and argstr)""" - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "inpA", - attr.ib( - type=str, - metadata={ - "position": 1, - "help_string": "inpA", - "argstr": "", - "mandatory": True, - }, - ), - ), - ( - "outA", - attr.ib( - type=str, - metadata={ - "position": 2, - "help_string": "outA", - "argstr": "-o", - "output_file_template": "{inpA}_out", - }, - ), - ), - ], - bases=(ShellSpec,), - ) - - shelly = ShellCommandTask( - executable="executable", input_spec=my_input_spec, inpA="inpA" - ) - # outA has argstr in the metadata fields, so it's a part of the command line - # the full path will be use din the command line - assert shelly.cmdline == f"executable inpA -o {shelly.output_dir / 'inpA_out'}" - # checking if outA in the output fields - assert shelly.output_names == ["return_code", "stdout", "stderr", "outA"] - - -def test_shell_cmd_inputs_template_1a(): - """additional inputs, one uses output_file_template (without argstr)""" - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "inpA", - attr.ib( - type=str, - metadata={ - "position": 1, - "help_string": "inpA", - "argstr": "", - "mandatory": True, - }, - ), - ), - ( - "outA", - attr.ib( - type=str, - metadata={ - "help_string": "outA", - "output_file_template": "{inpA}_out", - }, - ), - ), - ], - bases=(ShellSpec,), - ) - - shelly = ShellCommandTask( - executable="executable", input_spec=my_input_spec, inpA="inpA" - ) - # outA has no argstr in metadata, so it's not a part of the command line - assert shelly.cmdline == "executable inpA" - - -# TODO: after deciding how we use requires/templates -def test_shell_cmd_inputs_template_2(): - """additional inputs, one uses output_file_template (and argstr, but input not provided)""" - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "inpB", - attr.ib( - type=str, - metadata={"position": 1, "help_string": "inpB", "argstr": ""}, - ), - ), - ( - "outB", - attr.ib( - type=str, - metadata={ - "position": 2, - "help_string": "outB", - "argstr": "-o", - "output_file_template": "{inpB}_out", - }, - ), - ), - ], - bases=(ShellSpec,), - ) - - shelly = ShellCommandTask(executable="executable", input_spec=my_input_spec) - # inpB not in the inputs, so no outB in the command line - assert shelly.cmdline == "executable" - # checking if outB in the output fields - assert shelly.output_names == ["return_code", "stdout", "stderr", "outB"] - - -def test_shell_cmd_inputs_template_3(tmp_path): - """additional inputs with output_file_template and an additional - read-only fields that combine two outputs together in the command line - """ - inpA = tmp_path / "inpA" - inpB = tmp_path / "inpB" - Path.touch(inpA) - Path.touch(inpB) - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "inpA", - attr.ib( - type=str, - metadata={ - "position": 1, - "help_string": 
"inpA", - "argstr": "", - "mandatory": True, - }, - ), - ), - ( - "inpB", - attr.ib( - type=str, - metadata={ - "position": 2, - "help_string": "inpB", - "argstr": "", - "mandatory": True, - }, - ), - ), - ( - "outA", - attr.ib( - type=str, - metadata={ - "help_string": "outA", - "output_file_template": "{inpA}_out", - }, - ), - ), - ( - "outB", - attr.ib( - type=str, - metadata={ - "help_string": "outB", - "output_file_template": "{inpB}_out", - }, - ), - ), - ( - "outAB", - attr.ib( - type=str, - metadata={ - "position": -1, - "help_string": "outAB", - "argstr": "-o {outA} {outB}", - "readonly": True, - }, - ), - ), - ], - bases=(ShellSpec,), - ) - - shelly = ShellCommandTask( - executable="executable", input_spec=my_input_spec, inpA=inpA, inpB=inpB - ) - # using syntax from the outAB field - assert ( - shelly.cmdline - == f"executable {tmp_path / 'inpA'} {tmp_path / 'inpB'} -o {shelly.output_dir / 'inpA_out'} {str(shelly.output_dir / 'inpB_out')}" - ) - # checking if outA and outB in the output fields (outAB should not be) - assert shelly.output_names == ["return_code", "stdout", "stderr", "outA", "outB"] - - -def test_shell_cmd_inputs_template_3a(): - """additional inputs with output_file_template and an additional - read-only fields that combine two outputs together in the command line - testing a different order within the input spec - """ - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "inpA", - attr.ib( - type=str, - metadata={ - "position": 1, - "help_string": "inpA", - "argstr": "", - "mandatory": True, - }, - ), - ), - ( - "inpB", - attr.ib( - type=str, - metadata={ - "position": 2, - "help_string": "inpB", - "argstr": "", - "mandatory": True, - }, - ), - ), - ( - "outAB", - attr.ib( - type=str, - metadata={ - "position": -1, - "help_string": "outAB", - "argstr": "-o {outA} {outB}", - "readonly": True, - }, - ), - ), - ( - "outA", - attr.ib( - type=str, - metadata={ - "help_string": "outA", - "output_file_template": "{inpA}_out", - }, - ), - ), - ( - "outB", - attr.ib( - type=str, - metadata={ - "help_string": "outB", - "output_file_template": "{inpB}_out", - }, - ), - ), - ], - bases=(ShellSpec,), - ) - - shelly = ShellCommandTask( - executable="executable", input_spec=my_input_spec, inpA="inpA", inpB="inpB" - ) - # using syntax from the outAB field - assert ( - shelly.cmdline - == f"executable inpA inpB -o {shelly.output_dir / 'inpA_out'} {str(shelly.output_dir / 'inpB_out')}" - ) - # checking if outA and outB in the output fields (outAB should not be) - assert shelly.output_names == ["return_code", "stdout", "stderr", "outA", "outB"] - - -# TODO: after deciding how we use requires/templates -def test_shell_cmd_inputs_template_4(): - """additional inputs with output_file_template and an additional - read-only fields that combine two outputs together in the command line - one output_file_template can't be resolved - no inpB is provided - """ - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "inpA", - attr.ib( - type=str, - metadata={ - "position": 1, - "help_string": "inpA", - "argstr": "", - "mandatory": True, - }, - ), - ), - ( - "inpB", - attr.ib( - type=str, - metadata={"position": 2, "help_string": "inpB", "argstr": ""}, - ), - ), - ( - "outAB", - attr.ib( - type=str, - metadata={ - "position": -1, - "help_string": "outAB", - "argstr": "-o {outA} {outB}", - "readonly": True, - }, - ), - ), - ( - "outA", - attr.ib( - type=str, - metadata={ - "help_string": "outA", - "output_file_template": "{inpA}_out", - }, - ), - ), - ( - "outB", - attr.ib( - 
type=str, - metadata={ - "help_string": "outB", - "output_file_template": "{inpB}_out", - }, - ), - ), - ], - bases=(ShellSpec,), - ) - - shelly = ShellCommandTask( - executable="executable", input_spec=my_input_spec, inpA="inpA" - ) - # inpB is not provided so outB not in the command line - assert shelly.cmdline == f"executable inpA -o {shelly.output_dir / 'inpA_out'}" - assert shelly.output_names == ["return_code", "stdout", "stderr", "outA", "outB"] - - -def test_shell_cmd_inputs_template_5_ex(): - """checking if the exception is raised for read-only fields when input is set""" - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "outAB", - attr.ib( - type=str, - metadata={ - "position": -1, - "help_string": "outAB", - "argstr": "-o", - "readonly": True, - }, - ), - ) - ], - bases=(ShellSpec,), - ) - - shelly = ShellCommandTask( - executable="executable", input_spec=my_input_spec, outAB="outAB" - ) - with pytest.raises(Exception) as e: - shelly.cmdline - assert "read only" in str(e.value) - - -def test_shell_cmd_inputs_template_6(): - """additional inputs with output_file_template that has type ty.Union[str, bool] - no default is set, so if nothing is provided as an input, the output is used - whenever the template can be formatted - (the same way as for templates that has type=str) - """ - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "inpA", - attr.ib( - type=str, - metadata={ - "position": 1, - "help_string": "inpA", - "argstr": "", - "mandatory": True, - }, - ), - ), - ( - "outA", - attr.ib( - type=ty.Union[str, bool], - metadata={ - "position": 2, - "help_string": "outA", - "argstr": "-o", - "output_file_template": "{inpA}_out", - }, - ), - ), - ], - bases=(ShellSpec,), - ) - - # no input for outA (and no default value), so the output is created whenever the - # template can be formatted (the same way as for templates that has type=str) - shelly = ShellCommandTask( - executable="executable", input_spec=my_input_spec, inpA="inpA" - ) - assert shelly.cmdline == f"executable inpA -o {shelly.output_dir / 'inpA_out'}" - - # a string is provided for outA, so this should be used as the outA value - shelly = ShellCommandTask( - executable="executable", input_spec=my_input_spec, inpA="inpA", outA="outA" - ) - assert shelly.cmdline == "executable inpA -o outA" - - # True is provided for outA, so the formatted template should be used as outA value - shelly = ShellCommandTask( - executable="executable", input_spec=my_input_spec, inpA="inpA", outA=True - ) - assert shelly.cmdline == f"executable inpA -o {shelly.output_dir / 'inpA_out'}" - - # False is provided for outA, so the outA shouldn't be used - shelly = ShellCommandTask( - executable="executable", input_spec=my_input_spec, inpA="inpA", outA=False - ) - assert shelly.cmdline == "executable inpA" - - -def test_shell_cmd_inputs_template_6a(): - """additional inputs with output_file_template that has type ty.Union[str, bool] - and default is set to False, - so if nothing is provided as an input, the output is not used - """ - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "inpA", - attr.ib( - type=str, - metadata={ - "position": 1, - "help_string": "inpA", - "argstr": "", - "mandatory": True, - }, - ), - ), - ( - "outA", - attr.ib( - type=ty.Union[str, bool], - default=False, - metadata={ - "position": 2, - "help_string": "outA", - "argstr": "-o", - "output_file_template": "{inpA}_out", - }, - ), - ), - ], - bases=(ShellSpec,), - ) - - # no input for outA, but default is False, so the outA shouldn't be used - 
shelly = ShellCommandTask( - executable="executable", input_spec=my_input_spec, inpA="inpA" - ) - assert shelly.cmdline == "executable inpA" - - # a string is provided for outA, so this should be used as the outA value - shelly = ShellCommandTask( - executable="executable", input_spec=my_input_spec, inpA="inpA", outA="outA" - ) - assert shelly.cmdline == "executable inpA -o outA" - - # True is provided for outA, so the formatted template should be used as outA value - shelly = ShellCommandTask( - executable="executable", input_spec=my_input_spec, inpA="inpA", outA=True - ) - assert shelly.cmdline == f"executable inpA -o {shelly.output_dir / 'inpA_out'}" - - # False is provided for outA, so the outA shouldn't be used - shelly = ShellCommandTask( - executable="executable", input_spec=my_input_spec, inpA="inpA", outA=False - ) - assert shelly.cmdline == "executable inpA" - - -def test_shell_cmd_inputs_template_7(tmp_path: Path): - """additional inputs uses output_file_template with a suffix (no extension) - no keep_extension is used - """ - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "inpA", - attr.ib( - type=File, - metadata={ - "position": 1, - "help_string": "inpA", - "argstr": "", - "mandatory": True, - }, - ), - ), - ( - "outA", - attr.ib( - type=str, - metadata={ - "position": 2, - "help_string": "outA", - "argstr": "", - "output_file_template": "{inpA}_out", - }, - ), - ), - ], - bases=(ShellSpec,), - ) - - inpA_file = tmp_path / "a_file.txt" - inpA_file.write_text("content") - shelly = ShellCommandTask( - executable="executable", input_spec=my_input_spec, inpA=inpA_file - ) - - # outA should be formatted in a way that that .txt goes to the end - assert ( - shelly.cmdline - == f"executable {tmp_path / 'a_file.txt'} {shelly.output_dir / 'a_file_out.txt'}" - ) - - -def test_shell_cmd_inputs_template_7a(tmp_path: Path): - """additional inputs uses output_file_template with a suffix (no extension) - keep_extension is True (as default) - """ - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "inpA", - attr.ib( - type=File, - metadata={ - "position": 1, - "help_string": "inpA", - "argstr": "", - "mandatory": True, - }, - ), - ), - ( - "outA", - attr.ib( - type=str, - metadata={ - "position": 2, - "help_string": "outA", - "argstr": "", - "keep_extension": True, - "output_file_template": "{inpA}_out", - }, - ), - ), - ], - bases=(ShellSpec,), - ) - - inpA_file = tmp_path / "a_file.txt" - inpA_file.write_text("content") - shelly = ShellCommandTask( - executable="executable", input_spec=my_input_spec, inpA=inpA_file - ) - - # outA should be formatted in a way that that .txt goes to the end - assert ( - shelly.cmdline - == f"executable {tmp_path / 'a_file.txt'} {shelly.output_dir / 'a_file_out.txt'}" - ) - - -def test_shell_cmd_inputs_template_7b(tmp_path: Path): - """additional inputs uses output_file_template with a suffix (no extension) - keep extension is False (so the extension is removed when creating the output) - """ - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "inpA", - attr.ib( - type=File, - metadata={ - "position": 1, - "help_string": "inpA", - "argstr": "", - "mandatory": True, - }, - ), - ), - ( - "outA", - attr.ib( - type=str, - metadata={ - "position": 2, - "help_string": "outA", - "argstr": "", - "keep_extension": False, - "output_file_template": "{inpA}_out", - }, - ), - ), - ], - bases=(ShellSpec,), - ) - - inpA_file = tmp_path / "a_file.txt" - inpA_file.write_text("content") - shelly = ShellCommandTask( - executable="executable", 
input_spec=my_input_spec, inpA=inpA_file - ) - - # outA should be formatted in a way that that .txt goes to the end - assert ( - shelly.cmdline - == f"executable {tmp_path / 'a_file.txt'} {shelly.output_dir / 'a_file_out'}" - ) - - -def test_shell_cmd_inputs_template_8(tmp_path: Path): - """additional inputs uses output_file_template with a suffix and an extension""" - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "inpA", - attr.ib( - type=File, - metadata={ - "position": 1, - "help_string": "inpA", - "argstr": "", - "mandatory": True, - }, - ), - ), - ( - "outA", - attr.ib( - type=str, - metadata={ - "position": 2, - "help_string": "outA", - "argstr": "", - "output_file_template": "{inpA}_out.txt", - }, - ), - ), - ], - bases=(ShellSpec,), - ) - - inpA_file = tmp_path / "a_file.t" - inpA_file.write_text("content") - shelly = ShellCommandTask( - executable="executable", input_spec=my_input_spec, inpA=inpA_file - ) - - # outA should be formatted in a way that inpA extension is removed and the template extension is used - assert ( - shelly.cmdline - == f"executable {tmp_path / 'a_file.t'} {shelly.output_dir / 'a_file_out.txt'}" - ) - - -def test_shell_cmd_inputs_template_9(tmp_path: Path): - """additional inputs, one uses output_file_template with two fields: - one File and one ints - the output should be recreated from the template - """ - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "inpA", - attr.ib( - type=File, - metadata={ - "position": 1, - "help_string": "inpA", - "argstr": "", - "mandatory": True, - }, - ), - ), - ( - "inpInt", - attr.ib( - type=int, - metadata={ - "position": 2, - "help_string": "inp int", - "argstr": "-i", - "mandatory": True, - }, - ), - ), - ( - "outA", - attr.ib( - type=str, - metadata={ - "position": 3, - "help_string": "outA", - "argstr": "-o", - "output_file_template": "{inpA}_{inpInt}_out.txt", - }, - ), - ), - ], - bases=(ShellSpec,), - ) - - inpA_file = tmp_path / "inpA.t" - inpA_file.write_text("content") - - shelly = ShellCommandTask( - executable="executable", input_spec=my_input_spec, inpA=inpA_file, inpInt=3 - ) - - assert ( - shelly.cmdline - == f"executable {tmp_path / 'inpA.t'} -i 3 -o {shelly.output_dir / 'inpA_3_out.txt'}" - ) - # checking if outA in the output fields - assert shelly.output_names == ["return_code", "stdout", "stderr", "outA"] - - -def test_shell_cmd_inputs_template_9a(tmp_path: Path): - """additional inputs, one uses output_file_template with two fields: - one file and one string without extension - should be fine - """ - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "inpA", - attr.ib( - type=File, - metadata={ - "position": 1, - "help_string": "inpA", - "argstr": "", - "mandatory": True, - }, - ), - ), - ( - "inpStr", - attr.ib( - type=str, - metadata={ - "position": 2, - "help_string": "inp str", - "argstr": "-i", - "mandatory": True, - }, - ), - ), - ( - "outA", - attr.ib( - type=str, - metadata={ - "position": 3, - "help_string": "outA", - "argstr": "-o", - "output_file_template": "{inpA}_{inpStr}_out.txt", - }, - ), - ), - ], - bases=(ShellSpec,), - ) - - inpA_file = tmp_path / "inpA.t" - inpA_file.write_text("content") - - shelly = ShellCommandTask( - executable="executable", input_spec=my_input_spec, inpA=inpA_file, inpStr="hola" - ) - - assert ( - shelly.cmdline - == f"executable {tmp_path / 'inpA.t'} -i hola -o {shelly.output_dir / 'inpA_hola_out.txt'}" - ) - # checking if outA in the output fields - assert shelly.output_names == ["return_code", "stdout", "stderr", "outA"] - - 
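The removed template tests above and below all exercise the same legacy pattern: an input field whose "output_file_template" metadata builds an output path from other inputs and splices it into the task's cmdline. The following is a minimal, self-contained sketch of that pattern, assuming the legacy SpecInfo/ShellSpec API shown in these deleted modules; the absolute import paths are inferred from their relative imports (from ..task / from ..specs) and the command name "executable" is a placeholder, as in the tests themselves.

import attr
from pydra.engine.task import ShellCommandTask
from pydra.engine.specs import ShellSpec, SpecInfo

my_input_spec = SpecInfo(
    name="Input",
    fields=[
        (
            "inpA",
            attr.ib(
                type=str,
                metadata={
                    "position": 1,
                    "help_string": "inpA",
                    "argstr": "",
                    "mandatory": True,
                },
            ),
        ),
        (
            "outA",
            attr.ib(
                type=str,
                metadata={
                    "position": 2,
                    "help_string": "outA",
                    "argstr": "-o",
                    # when outA is not set explicitly, this template is filled in from inpA
                    # and resolved against the task's output_dir
                    "output_file_template": "{inpA}_out",
                },
            ),
        ),
    ],
    bases=(ShellSpec,),
)

shelly = ShellCommandTask(executable="executable", input_spec=my_input_spec, inpA="inpA")
# the templated output path appears after the -o flag in the generated command line
assert shelly.cmdline == f"executable inpA -o {shelly.output_dir / 'inpA_out'}"

Passing an explicit string for outA replaces the templated path, and for ty.Union[str, bool] template fields a value of True forces the template while False suppresses the output entirely, which is the behaviour the surrounding variants check.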
-def test_shell_cmd_inputs_template_9b_err(tmp_path: Path): - """output_file_template with two fields that are both Files, - so an exception should be raised - """ - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "inpA", - attr.ib( - type=File, - metadata={ - "position": 1, - "help_string": "inpA", - "argstr": "", - "mandatory": True, - }, - ), - ), - ( - "inpFile", - attr.ib( - type=File, - metadata={ - "position": 2, - "help_string": "inp file", - "argstr": "-i", - "mandatory": True, - }, - ), - ), - ( - "outA", - attr.ib( - type=str, - metadata={ - "position": 3, - "help_string": "outA", - "argstr": "-o", - "output_file_template": "{inpA}_{inpFile}_out.txt", - }, - ), - ), - ], - bases=(ShellSpec,), - ) - - inpA_file = tmp_path / "inpA.t" - inpA_file.write_text("content") - - inpFile_file = tmp_path / "inpFile.t" - inpFile_file.write_text("content") - - shelly = ShellCommandTask( - executable="executable", - input_spec=my_input_spec, - inpA=inpA_file, - inpFile=inpFile_file, - ) - # the template has two files so the exception should be raised - with pytest.raises(Exception, match="can't have multiple paths"): - shelly.cmdline - - -def test_shell_cmd_inputs_template_9c_err(tmp_path: Path): - """output_file_template with two fields: a file and a string with an extension, - which is treated as an additional file, so an exception should be raised - """ - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "inpA", - attr.ib( - type=File, - metadata={ - "position": 1, - "help_string": "inpA", - "argstr": "", - "mandatory": True, - }, - ), - ), - ( - "inpStr", - attr.ib( - type=str, - metadata={ - "position": 2, - "help_string": "inp str with extension", - "argstr": "-i", - "mandatory": True, - }, - ), - ), - ( - "outA", - attr.ib( - type=str, - metadata={ - "position": 3, - "help_string": "outA", - "argstr": "-o", - "output_file_template": "{inpA}_{inpStr}_out.txt", - }, - ), - ), - ], - bases=(ShellSpec,), - ) - - inpA_file = tmp_path / "inpA.t" - inpA_file.write_text("content") - - shelly = ShellCommandTask( - executable="executable", - input_spec=my_input_spec, - inpA=inpA_file, - inpStr="hola.txt", - ) - # inpStr has an extension so it should be treated as a second file in the template formatting - # and the exception should be raised - with pytest.raises(Exception, match="can't have multiple paths"): - shelly.cmdline - - -def test_shell_cmd_inputs_template_10(): - """output_file_template uses a float field with formatting""" - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "inpA", - attr.ib( - type=float, - metadata={ - "position": 1, - "help_string": "inpA", - "argstr": "{inpA:.1f}", - "mandatory": True, - }, - ), - ), - ( - "outA", - attr.ib( - type=str, - metadata={ - "position": 2, - "help_string": "outA", - "argstr": "-o", - "output_file_template": "file_{inpA:.1f}_out", - }, - ), - ), - ], - bases=(ShellSpec,), - ) - - shelly = ShellCommandTask( - executable="executable", input_spec=my_input_spec, inpA=3.3456 - ) - # outA has argstr in the metadata fields, so it's a part of the command line - # the full path will be used in the command line - assert shelly.cmdline == f"executable 3.3 -o {shelly.output_dir / 'file_3.3_out'}" - # checking that outA is in the output fields - assert shelly.output_names == ["return_code", "stdout", "stderr", "outA"] - - -def test_shell_cmd_inputs_template_requires_1(): - """Given an input specification with a templated output file subject to required fields, - ensure the field is set only when all requirements are met.""" - -
my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "in_file", - attr.ib( - type=str, - metadata={ - "help_string": "input file", - "mandatory": True, - "argstr": "", - }, - ), - ), - ( - "with_tpl", - attr.ib( - type=bool, - metadata={"help_string": "enable template"}, - ), - ), - ( - "out_file", - attr.ib( - type=str, - metadata={ - "help_string": "output file", - "argstr": "--tpl", - "output_file_template": "tpl.{in_file}", - "requires": {"with_tpl"}, - }, - ), - ), - ], - bases=(ShellSpec,), - ) - - # When requirements are not met. - shelly = ShellCommandTask( - executable="cmd", input_spec=my_input_spec, in_file="in.file" - ) - assert "--tpl" not in shelly.cmdline - - # When requirements are met. - shelly.inputs.with_tpl = True - assert "tpl.in.file" in shelly.cmdline - - -def test_shell_cmd_inputs_template_function_1(): - """one input field uses output_file_template that is a simple function - this can be easily done by simple template as in test_shell_cmd_inputs_template_1 - """ - - # a function that return an output template - def template_fun(inputs): - return "{inpA}_out" - - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "inpA", - attr.ib( - type=str, - metadata={ - "position": 1, - "help_string": "inpA", - "argstr": "", - "mandatory": True, - }, - ), - ), - ( - "outA", - attr.ib( - type=str, - metadata={ - "position": 2, - "help_string": "outA", - "argstr": "-o", - "output_file_template": template_fun, - }, - ), - ), - ], - bases=(ShellSpec,), - ) - - shelly = ShellCommandTask( - executable="executable", input_spec=my_input_spec, inpA="inpA" - ) - - assert shelly.cmdline == f"executable inpA -o {shelly.output_dir / 'inpA_out'}" - - -def test_shell_cmd_inputs_template_function_2(): - """one input field uses output_file_template that is a function, - depending on a value of an input it returns different template - """ - - # a function that return an output template that depends on value of the input - def template_fun(inputs): - if inputs.inpB % 2 == 0: - return "{inpA}_even" - else: - return "{inpA}_odd" - - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "inpA", - attr.ib( - type=str, - metadata={ - "position": 1, - "help_string": "inpA", - "argstr": "", - "mandatory": True, - }, - ), - ), - ( - "inpB", - attr.ib( - type=int, - metadata={ - "help_string": "inpB", - "mandatory": True, - }, - ), - ), - ( - "outA", - attr.ib( - type=str, - metadata={ - "position": 2, - "help_string": "outA", - "argstr": "-o", - "output_file_template": template_fun, - }, - ), - ), - ], - bases=(ShellSpec,), - ) - - shelly = ShellCommandTask( - executable="executable", - input_spec=my_input_spec, - inpA="inpA", - inpB=1, - ) - - assert shelly.cmdline == f"executable inpA -o {shelly.output_dir / 'inpA_odd'}" - - -def test_shell_cmd_inputs_template_1_st(): - """additional inputs, one uses output_file_template (and argstr) - testing cmdline when splitter defined - """ - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "inpA", - attr.ib( - type=str, - metadata={ - "position": 1, - "help_string": "inpA", - "argstr": "", - "mandatory": True, - }, - ), - ), - ( - "outA", - attr.ib( - type=str, - metadata={ - "position": 2, - "help_string": "outA", - "argstr": "-o", - "output_file_template": "{inpA}_out", - }, - ), - ), - ], - bases=(ShellSpec,), - ) - - inpA = ["inpA_1", "inpA_2"] - ShellCommandTask( - name="f", - executable="executable", - input_spec=my_input_spec, - ).split("inpA", inpA=inpA) - - # cmdline_list = shelly.cmdline - # assert len(cmdline_list) == 2 - # 
for i in range(2): - # path_out = Path(shelly.output_dir[i]) / f"{inpA[i]}_out" - # assert cmdline_list[i] == f"executable {inpA[i]} -o {path_out}" - - -# TODO: after deciding how we use requires/templates -def test_shell_cmd_inputs_denoise_image( - tmp_path, -): - """example from #279""" - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "image_dimensionality", - attr.ib( - type=int, - metadata={ - "help_string": """ - 2/3/4 - This option forces the image to be treated as a specified-dimensional image. - If not specified, the program tries to infer the dimensionality from - the input image. - """, - "allowed_values": [2, 3, 4], - "argstr": "-d", - }, - ), - ), - ( - "inputImageFilename", - attr.ib( - type=File, - metadata={ - "help_string": "A scalar image is expected as input for noise correction.", - "argstr": "-i", - "mandatory": True, - }, - ), - ), - ( - "noise_model", - attr.ib( - type=str, - metadata={ - "help_string": """ - Rician/(Gaussian) - Employ a Rician or Gaussian noise model. - """, - "allowed_values": ["Rician", "Gaussian"], - "argstr": "-n", - }, - ), - ), - ( - "maskImageFilename", - attr.ib( - type=str, - metadata={ - "help_string": "If a mask image is specified, denoising is only performed in the mask region.", - "argstr": "-x", - }, - ), - ), - ( - "shrink_factor", - attr.ib( - type=int, - default=1, - metadata={ - "help_string": """ - (1)/2/3/... - Running noise correction on large images can be time consuming. - To lessen computation time, the input image can be resampled. - The shrink factor, specified as a single integer, describes this - resampling. Shrink factor = 1 is the default. - """, - "argstr": "-s", - }, - ), - ), - ( - "patch_radius", - attr.ib( - type=int, - default=1, - metadata={ - "help_string": "Patch radius. Default = 1x1x1", - "argstr": "-p", - }, - ), - ), - ( - "search_radius", - attr.ib( - type=int, - default=2, - metadata={ - "help_string": "Search radius. Default = 2x2x2.", - "argstr": "-r", - }, - ), - ), - ( - "correctedImage", - attr.ib( - type=str, - metadata={ - "help_string": """ - The output consists of the noise corrected version of the input image. - Optionally, one can also output the estimated noise image. - """, - "output_file_template": "{inputImageFilename}_out", - }, - ), - ), - ( - "noiseImage", - attr.ib( - type=ty.Union[str, bool], - default=False, - metadata={ - "help_string": """ - The output consists of the noise corrected version of the input image. - Optionally, one can also output the estimated noise image. - """, - "output_file_template": "{inputImageFilename}_noise", - }, - ), - ), - ( - "output", - attr.ib( - type=str, - metadata={ - "help_string": "Combined output", - "argstr": "-o [{correctedImage}, {noiseImage}]", - "position": -1, - "readonly": True, - }, - ), - ), - ( - "version", - attr.ib( - type=bool, - default=False, - metadata={ - "help_string": "Get Version Information.", - "argstr": "--version", - }, - ), - ), - ( - "verbose", - attr.ib( - type=int, - default=0, - metadata={"help_string": "(0)/1. Verbose output. 
", "argstr": "-v"}, - ), - ), - ( - "help_short", - attr.ib( - type=bool, - default=False, - metadata={ - "help_string": "Print the help menu (short version)", - "argstr": "-h", - }, - ), - ), - ( - "help", - attr.ib( - type=int, - metadata={ - "help_string": "Print the help menu.", - "argstr": "--help", - }, - ), - ), - ], - bases=(ShellSpec,), - ) - - my_input_file = tmp_path / "a_file.ext" - my_input_file.write_text("content") - - # no input provided - shelly = ShellCommandTask(executable="DenoiseImage", input_spec=my_input_spec) - with pytest.raises(Exception) as e: - shelly.cmdline - assert "mandatory" in str(e.value) - - # input file name, noiseImage is not set, so using default value False - shelly = ShellCommandTask( - executable="DenoiseImage", - inputImageFilename=my_input_file, - input_spec=my_input_spec, - ) - assert ( - shelly.cmdline - == f"DenoiseImage -i {tmp_path / 'a_file.ext'} -s 1 -p 1 -r 2 -o [{shelly.output_dir / 'a_file_out.ext'}]" - ) - - # input file name, noiseImage is set to True, so template is used in the output - shelly = ShellCommandTask( - executable="DenoiseImage", - inputImageFilename=my_input_file, - input_spec=my_input_spec, - noiseImage=True, - ) - assert ( - shelly.cmdline == f"DenoiseImage -i {tmp_path / 'a_file.ext'} -s 1 -p 1 -r 2 " - f"-o [{shelly.output_dir / 'a_file_out.ext'}, {str(shelly.output_dir / 'a_file_noise.ext')}]" - ) - - # input file name and help_short - shelly = ShellCommandTask( - executable="DenoiseImage", - inputImageFilename=my_input_file, - help_short=True, - input_spec=my_input_spec, - ) - assert ( - shelly.cmdline - == f"DenoiseImage -i {tmp_path / 'a_file.ext'} -s 1 -p 1 -r 2 -h -o [{shelly.output_dir / 'a_file_out.ext'}]" - ) - - assert shelly.output_names == [ - "return_code", - "stdout", - "stderr", - "correctedImage", - "noiseImage", - ] - - # adding image_dimensionality that has allowed_values [2, 3, 4] - shelly = ShellCommandTask( - executable="DenoiseImage", - inputImageFilename=my_input_file, - input_spec=my_input_spec, - image_dimensionality=2, - ) - assert ( - shelly.cmdline - == f"DenoiseImage -d 2 -i {tmp_path / 'a_file.ext'} -s 1 -p 1 -r 2 -o [{shelly.output_dir / 'a_file_out.ext'}]" - ) - - # adding image_dimensionality that has allowed_values [2, 3, 4] and providing 5 - exception should be raised - with pytest.raises(ValueError) as excinfo: - shelly = ShellCommandTask( - executable="DenoiseImage", - inputImageFilename=my_input_file, - input_spec=my_input_spec, - image_dimensionality=5, - ) - assert "value of image_dimensionality" in str(excinfo.value) - - -# tests with XOR in input metadata - - -class SimpleTaskXor(ShellCommandTask): - input_fields = [ - ( - "input_1", - str, - { - "help_string": "help", - "mandatory": True, - "xor": ("input_1", "input_2", "input_3"), - }, - ), - ( - "input_2", - bool, - { - "help_string": "help", - "mandatory": True, - "argstr": "--i2", - "xor": ("input_1", "input_2", "input_3"), - }, - ), - ( - "input_3", - bool, - { - "help_string": "help", - "mandatory": True, - "xor": ("input_1", "input_2", "input_3"), - }, - ), - ] - task_input_spec = SpecInfo(name="Input", fields=input_fields, bases=(ShellSpec,)) - task_output_fields = [] - task_output_spec = SpecInfo( - name="Output", fields=task_output_fields, bases=(ShellOutSpec,) - ) - - input_spec = task_input_spec - output_spec = task_output_spec - executable = "cmd" - - -def test_task_inputs_mandatory_with_xOR_one_mandatory_is_OK(): - """input spec with mandatory inputs""" - task = SimpleTaskXor() - task.inputs.input_1 = "Input1" 
- task.inputs.input_2 = attr.NOTHING - task.inputs.check_fields_input_spec() - - -def test_task_inputs_mandatory_with_xOR_one_mandatory_out_3_is_OK(): - """input spec with mandatory inputs""" - task = SimpleTaskXor() - task.inputs.input_1 = attr.NOTHING - task.inputs.input_2 = attr.NOTHING - task.inputs.input_3 = True - task.inputs.check_fields_input_spec() - - -def test_task_inputs_mandatory_with_xOR_zero_mandatory_raises_error(): - """input spec with mandatory inputs""" - task = SimpleTaskXor() - task.inputs.input_1 = attr.NOTHING - task.inputs.input_2 = attr.NOTHING - with pytest.raises(Exception) as excinfo: - task.inputs.check_fields_input_spec() - assert "input_1 is mandatory" in str(excinfo.value) - assert "no alternative provided by ['input_2', 'input_3']" in str(excinfo.value) - assert excinfo.type is AttributeError - - -def test_task_inputs_mandatory_with_xOR_two_mandatories_raises_error(): - """input spec with mandatory inputs""" - task = SimpleTaskXor() - task.inputs.input_1 = "Input1" - task.inputs.input_2 = True - - with pytest.raises(Exception) as excinfo: - task.inputs.check_fields_input_spec() - assert "input_1 is mutually exclusive with ['input_2']" in str(excinfo.value) - assert excinfo.type is AttributeError - - -def test_task_inputs_mandatory_with_xOR_3_mandatories_raises_error(): - """input spec with mandatory inputs""" - task = SimpleTaskXor() - task.inputs.input_1 = "Input1" - task.inputs.input_2 = True - task.inputs.input_3 = False - - with pytest.raises(Exception) as excinfo: - task.inputs.check_fields_input_spec() - assert "input_1 is mutually exclusive with ['input_2', 'input_3']" in str( - excinfo.value - ) - assert excinfo.type is AttributeError diff --git a/pydra/engine/tests/test_singularity.py b/pydra/engine/tests/test_singularity.py deleted file mode 100644 index 791575adc1..0000000000 --- a/pydra/engine/tests/test_singularity.py +++ /dev/null @@ -1,782 +0,0 @@ -import shutil -import subprocess as sp -import pytest -import attr - -from ..task import ShellCommandTask -from ..submitter import Submitter -from ..core import Workflow -from ..specs import ShellOutSpec, SpecInfo, File, ShellSpec -from ..environments import Singularity - - -need_docker = pytest.mark.skipif( - shutil.which("docker") is None or sp.call(["docker", "info"]), - reason="no docker available", -) -need_singularity = pytest.mark.skipif( - shutil.which("singularity") is None, reason="no singularity available" -) - -need_slurm = pytest.mark.skipif( - not bool(shutil.which("sbatch")), reason="no singularity available" -) - - -@need_singularity -def test_singularity_1_nosubm(tmp_path): - """simple command in a container, a default bindings and working directory is added - no submitter - """ - cmd = "pwd" - image = "docker://alpine" - singu = ShellCommandTask( - name="singu", - executable=cmd, - environment=Singularity(image=image), - cache_dir=tmp_path, - ) - assert singu.environment.image == "docker://alpine" - assert isinstance(singu.environment, Singularity) - assert singu.cmdline == cmd - - res = singu() - assert "/mnt/pydra" in res.output.stdout - assert res.output.return_code == 0 - - -@need_singularity -def test_singularity_2_nosubm(tmp_path): - """a command with arguments, cmd and args given as executable - no submitter - """ - cmd = ["echo", "hail", "pydra"] - image = "docker://alpine" - singu = ShellCommandTask( - name="singu", - executable=cmd, - environment=Singularity(image=image), - cache_dir=tmp_path, - ) - assert singu.cmdline == " ".join(cmd) - - res = singu() - assert 
res.output.stdout.strip() == " ".join(cmd[1:]) - assert res.output.return_code == 0 - - -@need_singularity -def test_singularity_2(plugin, tmp_path): - """a command with arguments, cmd and args given as executable - using submitter - """ - cmd = ["echo", "hail", "pydra"] - image = "docker://alpine" - - singu = ShellCommandTask( - name="singu", - executable=cmd, - environment=Singularity(image=image), - cache_dir=tmp_path, - ) - assert singu.cmdline == " ".join(cmd) - - with Submitter(plugin=plugin) as sub: - singu(submitter=sub) - res = singu.result() - assert res.output.stdout.strip() == " ".join(cmd[1:]) - assert res.output.return_code == 0 - - -@need_singularity -def test_singularity_2a(plugin, tmp_path): - """a command with arguments, using executable and args - using submitter - """ - cmd_exec = "echo" - cmd_args = ["hail", "pydra"] - # separate command into exec + args - image = "docker://alpine" - singu = ShellCommandTask( - name="singu", - executable=cmd_exec, - args=cmd_args, - environment=Singularity(image=image), - cache_dir=tmp_path, - ) - assert singu.cmdline == f"{cmd_exec} {' '.join(cmd_args)}" - - with Submitter(plugin=plugin) as sub: - singu(submitter=sub) - res = singu.result() - assert res.output.stdout.strip() == " ".join(cmd_args) - assert res.output.return_code == 0 - - -# tests with State - - -@need_singularity -def test_singularity_st_1(plugin, tmp_path): - """commands without arguments in container - splitter = executable - """ - cmd = ["pwd", "ls"] - image = "docker://alpine" - singu = ShellCommandTask( - name="singu", environment=Singularity(image=image), cache_dir=tmp_path - ).split("executable", executable=cmd) - assert singu.state.splitter == "singu.executable" - - res = singu(plugin=plugin) - assert "/mnt/pydra" in res[0].output.stdout - assert res[1].output.stdout == "" - assert res[0].output.return_code == res[1].output.return_code == 0 - - -@need_singularity -@need_slurm -@pytest.mark.skip(reason="TODO, xfail incorrect") -@pytest.mark.xfail( - reason="slurm can complain if the number of submitted jobs exceeds the limit" -) -@pytest.mark.parametrize("n", [10, 50, 100]) -def test_singularity_st_2(tmp_path, n): - """splitter over args (checking bigger splitters if slurm available)""" - args_n = list(range(n)) - image = "docker://alpine" - singu = ShellCommandTask( - name="singu", - executable="echo", - environment=Singularity(image=image), - cache_dir=tmp_path, - ).split("args", args=args_n) - assert singu.state.splitter == "singu.args" - res = singu(plugin="slurm") - assert "1" in res[1].output.stdout - assert str(n - 1) in res[-1].output.stdout - assert res[0].output.return_code == res[1].output.return_code == 0 - - -# tests with customized output_spec - - -@need_singularity -def test_singularity_outputspec_1(plugin, tmp_path): - """ - customised output_spec, adding files to the output, providing specific pathname - output_path is automatically added to the bindings - """ - cmd = ["touch", "newfile_tmp.txt"] - image = "docker://alpine" - - my_output_spec = SpecInfo( - name="Output", - fields=[("newfile", File, "newfile_tmp.txt")], - bases=(ShellOutSpec,), - ) - singu = ShellCommandTask( - name="singu", - environment=Singularity(image=image), - executable=cmd, - output_spec=my_output_spec, - cache_dir=tmp_path, - ) - - with Submitter(plugin=plugin) as sub: - singu(submitter=sub) - - res = singu.result() - assert res.output.stdout == "" - assert res.output.newfile.fspath.exists() - - -# tests with customised input_spec - - -@need_singularity -def 
test_singularity_inputspec_1(plugin, tmp_path): - """a simple customized input spec for singularity task""" - filename = str((tmp_path / "file_pydra.txt")) - with open(filename, "w") as f: - f.write("hello from pydra") - - cmd = "cat" - image = "docker://alpine" - - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "file", - attr.ib( - type=File, - metadata={ - "mandatory": True, - "position": 1, - "argstr": "", - "help_string": "input file", - }, - ), - ) - ], - bases=(ShellSpec,), - ) - - singu = ShellCommandTask( - name="singu", - environment=Singularity(image=image), - executable=cmd, - file=filename, - input_spec=my_input_spec, - strip=True, - cache_dir=tmp_path, - ) - - res = singu() - assert res.output.stdout == "hello from pydra" - - -@need_singularity -def test_singularity_inputspec_1a(plugin, tmp_path): - """a simple customized input spec for singularity task - a default value is used - """ - filename = str((tmp_path / "file_pydra.txt")) - with open(filename, "w") as f: - f.write("hello from pydra") - - cmd = "cat" - image = "docker://alpine" - - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "file", - attr.ib( - type=File, - default=filename, - metadata={"position": 1, "argstr": "", "help_string": "input file"}, - ), - ) - ], - bases=(ShellSpec,), - ) - - singu = ShellCommandTask( - name="singu", - environment=Singularity(image=image), - executable=cmd, - input_spec=my_input_spec, - strip=True, - cache_dir=tmp_path, - ) - - res = singu() - assert res.output.stdout == "hello from pydra" - - -@need_singularity -def test_singularity_inputspec_2(plugin, tmp_path): - """a customized input spec with two fields for singularity task""" - filename_1 = tmp_path / "file_pydra.txt" - with open(filename_1, "w") as f: - f.write("hello from pydra\n") - - filename_2 = tmp_path / "file_nice.txt" - with open(filename_2, "w") as f: - f.write("have a nice one") - - cmd = "cat" - image = "docker://alpine" - - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "file1", - attr.ib( - type=File, - metadata={ - "position": 1, - "argstr": "", - "help_string": "input file 1", - }, - ), - ), - ( - "file2", - attr.ib( - type=File, - default=filename_2, - metadata={ - "position": 2, - "argstr": "", - "help_string": "input file 2", - }, - ), - ), - ], - bases=(ShellSpec,), - ) - - singu = ShellCommandTask( - name="singu", - environment=Singularity(image=image), - executable=cmd, - file1=filename_1, - input_spec=my_input_spec, - strip=True, - cache_dir=tmp_path, - ) - - res = singu() - assert res.output.stdout == "hello from pydra\nhave a nice one" - - -@need_singularity -def test_singularity_inputspec_2a_except(plugin, tmp_path): - """a customized input spec with two fields - first one uses a default, and second doesn't - raises a dataclass exception - """ - filename_1 = tmp_path / "file_pydra.txt" - with open(filename_1, "w") as f: - f.write("hello from pydra\n") - filename_2 = tmp_path / "file_nice.txt" - with open(filename_2, "w") as f: - f.write("have a nice one") - - cmd = "cat" - image = "docker://alpine" - - # the field with default value can't be before value without default - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "file1", - attr.ib( - type=File, - default=filename_1, - metadata={ - "position": 1, - "argstr": "", - "help_string": "input file 1", - }, - ), - ), - ( - "file2", - attr.ib( - type=File, - metadata={ - "position": 2, - "argstr": "", - "help_string": "input file 2", - }, - ), - ), - ], - bases=(ShellSpec,), - ) - - singu = ShellCommandTask( - 
name="singu", - environment=Singularity(image=image), - executable=cmd, - file2=filename_2, - input_spec=my_input_spec, - strip=True, - cache_dir=tmp_path, - ) - res = singu() - assert res.output.stdout == "hello from pydra\nhave a nice one" - - -@need_singularity -def test_singularity_inputspec_2a(plugin, tmp_path): - """a customized input spec with two fields - first one uses a default value, - this is fine even if the second field is not using any defaults - """ - filename_1 = tmp_path / "file_pydra.txt" - with open(filename_1, "w") as f: - f.write("hello from pydra\n") - filename_2 = tmp_path / "file_nice.txt" - with open(filename_2, "w") as f: - f.write("have a nice one") - - cmd = "cat" - image = "docker://alpine" - - # if you want set default in the first field you can use default_value in metadata - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "file1", - attr.ib( - type=File, - default=filename_1, - metadata={ - "position": 1, - "argstr": "", - "help_string": "input file 1", - }, - ), - ), - ( - "file2", - attr.ib( - type=File, - metadata={ - "position": 2, - "argstr": "", - "help_string": "input file 2", - }, - ), - ), - ], - bases=(ShellSpec,), - ) - - singu = ShellCommandTask( - name="singu", - environment=Singularity(image=image), - executable=cmd, - file2=filename_2, - input_spec=my_input_spec, - strip=True, - cache_dir=tmp_path, - ) - - res = singu() - assert res.output.stdout == "hello from pydra\nhave a nice one" - - -@need_singularity -def test_singularity_cmd_inputspec_copyfile_1(plugin, tmp_path): - """shelltask changes a file in place, - adding copyfile=True to the file-input from input_spec - hardlink or copy in the output_dir should be created - """ - file = tmp_path / "file_pydra.txt" - with open(file, "w") as f: - f.write("hello from pydra\n") - - cmd = ["sed", "-is", "s/hello/hi/"] - image = "docker://alpine" - - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "orig_file", - attr.ib( - type=File, - metadata={ - "position": 1, - "argstr": "", - "help_string": "orig file", - "mandatory": True, - "copyfile": True, - }, - ), - ), - ( - "out_file", - attr.ib( - type=str, - metadata={ - "output_file_template": "{orig_file}", - "help_string": "output file", - }, - ), - ), - ], - bases=(ShellSpec,), - ) - - singu = ShellCommandTask( - name="singu", - environment=Singularity(image=image), - executable=cmd, - input_spec=my_input_spec, - orig_file=str(file), - cache_dir=tmp_path, - ) - - res = singu() - assert res.output.stdout == "" - assert res.output.out_file.fspath.exists() - # the file is copied, and than it is changed in place - assert res.output.out_file.fspath.parent == singu.output_dir - with open(res.output.out_file) as f: - assert "hi from pydra\n" == f.read() - # the original file is unchanged - with open(file) as f: - assert "hello from pydra\n" == f.read() - - -@need_singularity -def test_singularity_inputspec_state_1(tmp_path): - """a customised input spec for a singularity file with a splitter, - splitter is on files - """ - filename_1 = tmp_path / "file_pydra.txt" - with open(filename_1, "w") as f: - f.write("hello from pydra\n") - filename_2 = tmp_path / "file_nice.txt" - with open(filename_2, "w") as f: - f.write("have a nice one") - - cmd = "cat" - filename = [str(filename_1), str(filename_2)] - image = "docker://alpine" - - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "file", - attr.ib( - type=File, - metadata={ - "mandatory": True, - "position": 1, - "argstr": "", - "help_string": "input file", - }, - ), - ) - ], - 
bases=(ShellSpec,), - ) - - singu = ShellCommandTask( - name="singu", - environment=Singularity(image=image), - executable=cmd, - input_spec=my_input_spec, - strip=True, - cache_dir=tmp_path, - ).split("file", file=filename) - - res = singu() - assert res[0].output.stdout == "hello from pydra" - assert res[1].output.stdout == "have a nice one" - - -@need_singularity -def test_singularity_inputspec_state_1b(plugin, tmp_path): - """a customised input spec for a singularity file with a splitter, - files from the input spec have the same path in the local os and the container, - so hash is calculated and the test works fine - """ - file_1 = tmp_path / "file_pydra.txt" - file_2 = tmp_path / "file_nice.txt" - with open(file_1, "w") as f: - f.write("hello from pydra") - with open(file_2, "w") as f: - f.write("have a nice one") - - cmd = "cat" - filename = [str(file_1), str(file_2)] - image = "docker://alpine" - - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "file", - attr.ib( - type=File, - metadata={ - "mandatory": True, - "position": 1, - "argstr": "", - "help_string": "input file", - }, - ), - ) - ], - bases=(ShellSpec,), - ) - - singu = ShellCommandTask( - name="singu", - environment=Singularity(image=image), - executable=cmd, - input_spec=my_input_spec, - strip=True, - cache_dir=tmp_path, - ).split("file", file=filename) - - res = singu() - assert res[0].output.stdout == "hello from pydra" - assert res[1].output.stdout == "have a nice one" - - -@need_singularity -def test_singularity_wf_inputspec_1(plugin, tmp_path): - """a customized input spec for workflow with singularity tasks""" - filename = tmp_path / "file_pydra.txt" - with open(filename, "w") as f: - f.write("hello from pydra") - - cmd = "cat" - image = "docker://alpine" - - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "file", - attr.ib( - type=File, - metadata={ - "mandatory": True, - "position": 1, - "argstr": "", - "help_string": "input file", - }, - ), - ) - ], - bases=(ShellSpec,), - ) - - wf = Workflow(name="wf", input_spec=["cmd", "file"], cache_dir=tmp_path) - wf.inputs.cmd = cmd - wf.inputs.file = filename - - singu = ShellCommandTask( - name="singu", - environment=Singularity(image=image), - executable=wf.lzin.cmd, - file=wf.lzin.file, - input_spec=my_input_spec, - strip=True, - ) - wf.add(singu) - - wf.set_output([("out", wf.singu.lzout.stdout)]) - - with Submitter(plugin="serial") as sub: - wf(submitter=sub) - - res = wf.result() - assert res.output.out == "hello from pydra" - - -@need_singularity -def test_singularity_wf_state_inputspec_1(plugin, tmp_path): - """a customized input spec for workflow with singularity tasks that has a state""" - file_1 = tmp_path / "file_pydra.txt" - file_2 = tmp_path / "file_nice.txt" - with open(file_1, "w") as f: - f.write("hello from pydra") - with open(file_2, "w") as f: - f.write("have a nice one") - - cmd = "cat" - filename = [str(file_1), str(file_2)] - image = "docker://alpine" - - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "file", - attr.ib( - type=File, - metadata={ - "mandatory": True, - "position": 1, - "argstr": "", - "help_string": "input file", - }, - ), - ) - ], - bases=(ShellSpec,), - ) - - wf = Workflow(name="wf", input_spec=["cmd", "file"], cache_dir=tmp_path) - wf.inputs.cmd = cmd - - singu = ShellCommandTask( - name="singu", - environment=Singularity(image=image), - executable=wf.lzin.cmd, - file=wf.lzin.file, - input_spec=my_input_spec, - strip=True, - ) - wf.add(singu) - wf.split("file", file=filename) - - 
wf.set_output([("out", wf.singu.lzout.stdout)]) - - with Submitter(plugin=plugin) as sub: - wf(submitter=sub) - - res = wf.result() - assert res[0].output.out == "hello from pydra" - assert res[1].output.out == "have a nice one" - - -@need_singularity -def test_singularity_wf_ndst_inputspec_1(plugin, tmp_path): - """a customized input spec for workflow with singularity tasks with states""" - file_1 = tmp_path / "file_pydra.txt" - file_2 = tmp_path / "file_nice.txt" - with open(file_1, "w") as f: - f.write("hello from pydra") - with open(file_2, "w") as f: - f.write("have a nice one") - - cmd = "cat" - filename = [str(file_1), str(file_2)] - image = "docker://alpine" - - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "file", - attr.ib( - type=File, - metadata={ - "mandatory": True, - "position": 1, - "argstr": "", - "help_string": "input file", - }, - ), - ) - ], - bases=(ShellSpec,), - ) - - wf = Workflow(name="wf", input_spec=["cmd", "file"], cache_dir=tmp_path) - wf.inputs.cmd = cmd - wf.inputs.file = filename - - singu = ShellCommandTask( - name="singu", - environment=Singularity(image=image), - executable=wf.lzin.cmd, - input_spec=my_input_spec, - strip=True, - ).split("file", file=wf.lzin.file) - wf.add(singu) - - wf.set_output([("out", wf.singu.lzout.stdout)]) - - with Submitter(plugin=plugin) as sub: - wf(submitter=sub) - - res = wf.result() - assert res.output.out == ["hello from pydra", "have a nice one"] diff --git a/pydra/engine/tests/test_specs.py b/pydra/engine/tests/test_specs.py deleted file mode 100644 index 8221751d01..0000000000 --- a/pydra/engine/tests/test_specs.py +++ /dev/null @@ -1,402 +0,0 @@ -from pathlib import Path -import typing as ty -import os -import attrs -from copy import deepcopy -import time - -from ..specs import ( - BaseSpec, - SpecInfo, - File, - Runtime, - Result, - ShellSpec, - # ContainerSpec, - LazyIn, - LazyOut, - LazyField, - StateArray, -) -from ..helpers import make_klass -from .utils import foo -from pydra import mark, Workflow -import pytest - - -def test_basespec(): - spec = BaseSpec() - assert spec.hash == "0b1d98df22ecd1733562711c205abca2" - - -def test_runtime(): - runtime = Runtime() - assert hasattr(runtime, "rss_peak_gb") - assert hasattr(runtime, "vms_peak_gb") - assert hasattr(runtime, "cpu_peak_percent") - - -def test_result(): - result = Result() - assert hasattr(result, "runtime") - assert hasattr(result, "output") - assert hasattr(result, "errored") - assert getattr(result, "errored") is False - - -def test_shellspec(): - with pytest.raises(TypeError): - spec = ShellSpec() - spec = ShellSpec(executable="ls") # (executable, args) - assert hasattr(spec, "executable") - assert hasattr(spec, "args") - - -class NodeTesting: - @attrs.define() - class Input: - inp_a: str = "A" - inp_b: str = "B" - - def __init__(self): - class InpSpec: - def __init__(self): - self.fields = [("inp_a", int), ("inp_b", int)] - - class OutSpec: - def __init__(self): - self.fields = [("out_a", int)] - - self.name = "tn" - self.inputs = self.Input() - self.input_spec = InpSpec() - self.output_spec = OutSpec() - self.output_names = ["out_a"] - self.state = None - - def result(self, state_index=None): - class Output: - def __init__(self): - self.out_a = "OUT_A" - - class Result: - def __init__(self): - self.output = Output() - self.errored = False - - def get_output_field(self, field): - return getattr(self.output, field) - - return Result() - - -class WorkflowTesting: - def __init__(self): - class Input: - def __init__(self): - self.inp_a = "A" - 
self.inp_b = "B" - - self.inputs = Input() - self.tn = NodeTesting() - - -def test_lazy_inp(): - tn = NodeTesting() - lzin = LazyIn(task=tn) - - lf = lzin.inp_a - assert lf.get_value(wf=WorkflowTesting()) == "A" - - lf = lzin.inp_b - assert lf.get_value(wf=WorkflowTesting()) == "B" - - -def test_lazy_out(): - tn = NodeTesting() - lzout = LazyOut(task=tn) - lf = lzout.out_a - assert lf.get_value(wf=WorkflowTesting()) == "OUT_A" - - -def test_lazy_getvale(): - tn = NodeTesting() - lf = LazyIn(task=tn) - with pytest.raises(Exception) as excinfo: - lf.inp_c - assert ( - str(excinfo.value) - == "Task 'tn' has no input attribute 'inp_c', available: 'inp_a', 'inp_b'" - ) - - -def test_input_file_hash_1(tmp_path): - os.chdir(tmp_path) - outfile = "test.file" - fields = [("in_file", ty.Any)] - input_spec = SpecInfo(name="Inputs", fields=fields, bases=(BaseSpec,)) - inputs = make_klass(input_spec) - assert inputs(in_file=outfile).hash == "9a106eb2830850834d9b5bf098d5fa85" - - with open(outfile, "w") as fp: - fp.write("test") - fields = [("in_file", File)] - input_spec = SpecInfo(name="Inputs", fields=fields, bases=(BaseSpec,)) - inputs = make_klass(input_spec) - assert inputs(in_file=outfile).hash == "02fa5f6f1bbde7f25349f54335e1adaf" - - -def test_input_file_hash_2(tmp_path): - """input spec with File types, checking when the checksum changes""" - file = tmp_path / "in_file_1.txt" - with open(file, "w") as f: - f.write("hello") - - input_spec = SpecInfo(name="Inputs", fields=[("in_file", File)], bases=(BaseSpec,)) - inputs = make_klass(input_spec) - - # checking specific hash value - hash1 = inputs(in_file=file).hash - assert hash1 == "aaa50d60ed33d3a316d58edc882a34c3" - - # checking if different name doesn't affect the hash - file_diffname = tmp_path / "in_file_2.txt" - with open(file_diffname, "w") as f: - f.write("hello") - hash2 = inputs(in_file=file_diffname).hash - assert hash1 == hash2 - - # checking if different content (the same name) affects the hash - time.sleep(2) # ensure mtime is different - file_diffcontent = tmp_path / "in_file_1.txt" - with open(file_diffcontent, "w") as f: - f.write("hi") - hash3 = inputs(in_file=file_diffcontent).hash - assert hash1 != hash3 - - -def test_input_file_hash_2a(tmp_path): - """input spec with ty.Union[File, ...] 
type, checking when the checksum changes""" - file = tmp_path / "in_file_1.txt" - with open(file, "w") as f: - f.write("hello") - - input_spec = SpecInfo( - name="Inputs", fields=[("in_file", ty.Union[File, int])], bases=(BaseSpec,) - ) - inputs = make_klass(input_spec) - - # checking specific hash value - hash1 = inputs(in_file=file).hash - assert hash1 == "aaa50d60ed33d3a316d58edc882a34c3" - - # checking if different name doesn't affect the hash - file_diffname = tmp_path / "in_file_2.txt" - with open(file_diffname, "w") as f: - f.write("hello") - hash2 = inputs(in_file=file_diffname).hash - assert hash1 == hash2 - - # checking if different content (the same name) affects the hash - time.sleep(2) # ensure mtime is different - file_diffcontent = tmp_path / "in_file_1.txt" - with open(file_diffcontent, "w") as f: - f.write("hi") - hash3 = inputs(in_file=file_diffcontent).hash - assert hash1 != hash3 - - # checking if string is also accepted - hash4 = inputs(in_file=str(file)).hash - assert hash4 == "800af2b5b334c9e3e5c40c0e49b7ffb5" - - -def test_input_file_hash_3(tmp_path): - """input spec with File types, checking when the hash and file_hash change""" - file = tmp_path / "in_file_1.txt" - with open(file, "w") as f: - f.write("hello") - - input_spec = SpecInfo( - name="Inputs", fields=[("in_file", File), ("in_int", int)], bases=(BaseSpec,) - ) - inputs = make_klass(input_spec) - - my_inp = inputs(in_file=file, in_int=3) - # original hash and files_hash (dictionary contains info about files) - hash1 = my_inp.hash - # files_hash1 = deepcopy(my_inp.files_hash) - # file name should be in files_hash1[in_file] - filename = str(Path(file)) - # assert filename in files_hash1["in_file"] - - # changing int input - my_inp.in_int = 5 - hash2 = my_inp.hash - # files_hash2 = deepcopy(my_inp.files_hash) - # hash should be different - assert hash1 != hash2 - # files_hash should be the same, and the tuple for filename shouldn't be recomputed - # assert files_hash1 == files_hash2 - # assert id(files_hash1["in_file"][filename]) == id(files_hash2["in_file"][filename]) - - # recreating the file - time.sleep(2) # ensure mtime is different - with open(file, "w") as f: - f.write("hello") - - hash3 = my_inp.hash - # files_hash3 = deepcopy(my_inp.files_hash) - # hash should be the same, - # but the entry for in_file in files_hash should be different (modification time) - assert hash3 == hash2 - # assert files_hash3["in_file"][filename] != files_hash2["in_file"][filename] - # different timestamp - # assert files_hash3["in_file"][filename][0] != files_hash2["in_file"][filename][0] - # the same content hash - # assert files_hash3["in_file"][filename][1] == files_hash2["in_file"][filename][1] - - # setting the in_file again - my_inp.in_file = file - # filename should be removed from files_hash - # assert my_inp.files_hash["in_file"] == {} - # will be saved again when hash is calculated - assert my_inp.hash == hash3 - # assert filename in my_inp.files_hash["in_file"] - - -def test_input_file_hash_4(tmp_path): - """input spec with nested list, that contain ints and Files, - checking changes in checksums - """ - file = tmp_path / "in_file_1.txt" - with open(file, "w") as f: - f.write("hello") - - input_spec = SpecInfo( - name="Inputs", - fields=[("in_file", ty.List[ty.List[ty.Union[int, File]]])], - bases=(BaseSpec,), - ) - inputs = make_klass(input_spec) - - # checking specific hash value - hash1 = inputs(in_file=[[file, 3]]).hash - assert hash1 == "0693adbfac9f675af87e900065b1de00" - - # the same file, but int field 
changes - hash1a = inputs(in_file=[[file, 5]]).hash - assert hash1 != hash1a - - # checking if different name doesn't affect the hash - file_diffname = tmp_path / "in_file_2.txt" - with open(file_diffname, "w") as f: - f.write("hello") - hash2 = inputs(in_file=[[file_diffname, 3]]).hash - assert hash1 == hash2 - - # checking if different content (the same name) affects the hash - time.sleep(2) # need the mtime to be different - file_diffcontent = tmp_path / "in_file_1.txt" - with open(file_diffcontent, "w") as f: - f.write("hi") - hash3 = inputs(in_file=[[file_diffcontent, 3]]).hash - assert hash1 != hash3 - - -def test_input_file_hash_5(tmp_path): - """input spec with File in nested containers, checking changes in checksums""" - file = tmp_path / "in_file_1.txt" - with open(file, "w") as f: - f.write("hello") - - input_spec = SpecInfo( - name="Inputs", - fields=[("in_file", ty.List[ty.Dict[ty.Any, ty.Union[File, int]]])], - bases=(BaseSpec,), - ) - inputs = make_klass(input_spec) - - # checking specific hash value - hash1 = inputs(in_file=[{"file": file, "int": 3}]).hash - assert hash1 == "56e6e2c9f3bdf0cd5bd3060046dea480" - - # the same file, but int field changes - hash1a = inputs(in_file=[{"file": file, "int": 5}]).hash - assert hash1 != hash1a - - # checking if different name doesn't affect the hash - file_diffname = tmp_path / "in_file_2.txt" - with open(file_diffname, "w") as f: - f.write("hello") - hash2 = inputs(in_file=[{"file": file_diffname, "int": 3}]).hash - assert hash1 == hash2 - - # checking if different content (the same name) affects the hash - time.sleep(2) # ensure mtime is different - file_diffcontent = tmp_path / "in_file_1.txt" - with open(file_diffcontent, "w") as f: - f.write("hi") - hash3 = inputs(in_file=[{"file": file_diffcontent, "int": 3}]).hash - assert hash1 != hash3 - - -def test_lazy_field_cast(): - task = foo(a="a", b=1, c=2.0, name="foo") - - assert task.lzout.y.type == int - assert task.lzout.y.cast(float).type == float - - -def test_lazy_field_multi_same_split(): - @mark.task - def f(x: ty.List[int]) -> ty.List[int]: - return x - - task = f(x=[1, 2, 3], name="foo") - - lf = task.lzout.out.split("foo.x") - - assert lf.type == StateArray[int] - assert lf.splits == set([(("foo.x",),)]) - - lf2 = lf.split("foo.x") - assert lf2.type == StateArray[int] - assert lf2.splits == set([(("foo.x",),)]) - - -def test_lazy_field_multi_diff_split(): - @mark.task - def f(x: ty.Any, y: ty.Any) -> ty.Any: - return x - - task = f(x=[1, 2, 3], name="foo") - - lf = task.lzout.out.split("foo.x") - - assert lf.type == StateArray[ty.Any] - assert lf.splits == set([(("foo.x",),)]) - - lf2 = lf.split("foo.x") - assert lf2.type == StateArray[ty.Any] - assert lf2.splits == set([(("foo.x",),)]) - - lf3 = lf.split("foo.y") - assert lf3.type == StateArray[StateArray[ty.Any]] - assert lf3.splits == set([(("foo.x",),), (("foo.y",),)]) - - -def test_wf_lzin_split(): - @mark.task - def identity(x: int) -> int: - return x - - inner = Workflow(name="inner", input_spec=["x"]) - inner.add(identity(x=inner.lzin.x, name="f")) - inner.set_output(("out", inner.f.lzout.out)) - - outer = Workflow(name="outer", input_spec=["x"]) - outer.add(inner.split(x=outer.lzin.x)) - outer.set_output(("out", outer.inner.lzout.out)) - - result = outer(x=[1, 2, 3]) - assert result.output.out == StateArray([1, 2, 3]) diff --git a/pydra/engine/tests/test_state.py b/pydra/engine/tests/test_state.py index c8ef0941ca..e56a28c808 100644 --- a/pydra/engine/tests/test_state.py +++ b/pydra/engine/tests/test_state.py @@ 
-1,7 +1,262 @@ import pytest +from pydra.engine.state import State +from pydra.compose import python +from pydra.engine.state import ( + PydraStateError, + splitter2rpn, + splits_groups, + rpn2splitter, + add_name_splitter, + remove_inp_from_splitter_rpn, + converter_groups_to_input, +) + + +# TODO: feature? +class other_states_to_tests: + def __init__( + self, + splitter, + splitter_final=None, + keys_final=None, + ind_l=None, + ind_l_final=None, + ): + self.splitter = splitter + if splitter_final: + self.splitter_final = splitter_final + else: + self.splitter_final = splitter + self.other_states = {} + self.keys_final = keys_final + self.name = "NA" + self.ind_l = ind_l + if ind_l_final: + self.ind_l_final = ind_l_final + else: + self.ind_l_final = ind_l + + +@pytest.mark.parametrize( + "splitter, keys_exp, groups_exp, grstack_exp", + [ + ("a", ["a"], {"a": 0}, [[0]]), + (["a"], ["a"], {"a": 0}, [[0]]), + (("a",), ["a"], {"a": 0}, [[0]]), + (("a", "b"), ["a", "b"], {"a": 0, "b": 0}, [[0]]), + (["a", "b"], ["a", "b"], {"a": 0, "b": 1}, [[0, 1]]), + ([["a", "b"]], ["a", "b"], {"a": 0, "b": 1}, [[0, 1]]), + ((["a", "b"],), ["a", "b"], {"a": 0, "b": 1}, [[0, 1]]), + ((["a", "b"], "c"), ["a", "b", "c"], {"a": 0, "b": 1, "c": [0, 1]}, [[0, 1]]), + ([("a", "b"), "c"], ["a", "b", "c"], {"a": 0, "b": 0, "c": 1}, [[0, 1]]), + ([["a", "b"], "c"], ["a", "b", "c"], {"a": 0, "b": 1, "c": 2}, [[0, 1, 2]]), + ( + (["a", "b"], ["c", "d"]), + ["a", "b", "c", "d"], + {"a": 0, "b": 1, "c": 0, "d": 1}, + [[0, 1]], + ), + ], +) +def test_splits_groups(splitter, keys_exp, groups_exp, grstack_exp): + splitter_rpn = splitter2rpn(splitter) + keys_f, groups_f, grstack_f, _ = splits_groups(splitter_rpn) + + assert set(keys_f) == set(keys_exp) + assert groups_f == groups_exp + assert grstack_f == grstack_exp + + +@pytest.mark.parametrize( + "splitter, combiner, combiner_all_exp," + "keys_final_exp, groups_final_exp, grstack_final_exp", + [ + ("a", ["a"], ["a"], [], {}, []), + (["a"], ["a"], ["a"], [], {}, []), + (("a",), ["a"], ["a"], [], {}, []), + (("a", "b"), ["a"], ["a", "b"], [], {}, [[]]), + (("a", "b"), ["b"], ["a", "b"], [], {}, [[]]), + (["a", "b"], ["b"], ["b"], ["a"], {"a": 0}, [[0]]), + (["a", "b"], ["a"], ["a"], ["b"], {"b": 0}, [[0]]), + ((["a", "b"], "c"), ["a"], ["a", "c"], ["b"], {"b": 0}, [[0]]), + ((["a", "b"], "c"), ["b"], ["b", "c"], ["a"], {"a": 0}, [[0]]), + ((["a", "b"], "c"), ["a"], ["a", "c"], ["b"], {"b": 0}, [[0]]), + ((["a", "b"], "c"), ["c"], ["a", "b", "c"], [], {}, [[]]), + ([("a", "b"), "c"], ["a"], ["a", "b"], ["c"], {"c": 0}, [[0]]), + ([("a", "b"), "c"], ["b"], ["a", "b"], ["c"], {"c": 0}, [[0]]), + ([("a", "b"), "c"], ["c"], ["c"], ["a", "b"], {"a": 0, "b": 0}, [[0]]), + ([[("a", "b"), "c"]], ["c"], ["c"], ["a", "b"], {"a": 0, "b": 0}, [[0]]), + (([("a", "b"), "c"],), ["c"], ["c"], ["a", "b"], {"a": 0, "b": 0}, [[0]]), + ], +) +def test_splits_groups_comb( + splitter, + combiner, + keys_final_exp, + groups_final_exp, + grstack_final_exp, + combiner_all_exp, +): + splitter_rpn = splitter2rpn(splitter) + keys_final, groups_final, grstack_final, combiner_all = splits_groups( + splitter_rpn, combiner + ) + assert keys_final == keys_final_exp + assert groups_final == groups_final_exp + assert grstack_final == grstack_final_exp + + assert combiner_all == combiner_all_exp + + +@pytest.mark.parametrize( + "splitter, rpn", + [ + ("a", ["a"]), + (("a", "b"), ["a", "b", "."]), + (["a", "b"], ["a", "b", "*"]), + (["a", ("b", "c")], ["a", "b", "c", ".", "*"]), + ([("a", "b"), "c"], ["a", "b", 
".", "c", "*"]), + (["a", ["b", ["c", "d"]]], ["a", "b", "c", "d", "*", "*", "*"]), + (["a", ("b", ["c", "d"])], ["a", "b", "c", "d", "*", ".", "*"]), + ((["a", "b"], "c"), ["a", "b", "*", "c", "."]), + ((["a", "b"], ["c", "d"]), ["a", "b", "*", "c", "d", "*", "."]), + (([["a", "b"]], ["c", "d"]), ["a", "b", "*", "c", "d", "*", "."]), + (((["a", "b"],), ["c", "d"]), ["a", "b", "*", "c", "d", "*", "."]), + ([("a", "b"), ("c", "d")], ["a", "b", ".", "c", "d", ".", "*"]), + ], +) +def test_splitter2rpn(splitter, rpn): + assert splitter2rpn(splitter) == rpn + + +@pytest.mark.parametrize( + "splitter, rpn", + [ + ((("a", "b"), "c"), ["a", "b", ".", "c", "."]), + (("a", "b", "c"), ["a", "b", ".", "c", "."]), + ([["a", "b"], "c"], ["a", "b", "*", "c", "*"]), + (["a", "b", "c"], ["a", "b", "*", "c", "*"]), + ], +) +def test_splitter2rpn_2(splitter, rpn): + assert splitter2rpn(splitter) == rpn + + +@pytest.mark.parametrize( + "splitter, rpn", + [ + ("a", ["a"]), + (("a", "b"), ["a", "b", "."]), + (["a", "b"], ["a", "b", "*"]), + (["a", ("b", "c")], ["a", "b", "c", ".", "*"]), + ([("a", "b"), "c"], ["a", "b", ".", "c", "*"]), + (["a", ["b", ["c", "d"]]], ["a", "b", "c", "d", "*", "*", "*"]), + (["a", ("b", ["c", "d"])], ["a", "b", "c", "d", "*", ".", "*"]), + ((["a", "b"], "c"), ["a", "b", "*", "c", "."]), + ((["a", "b"], ["c", "d"]), ["a", "b", "*", "c", "d", "*", "."]), + ([("a", "b"), ("c", "d")], ["a", "b", ".", "c", "d", ".", "*"]), + ], +) +def test_rpn2splitter(splitter, rpn): + assert rpn2splitter(rpn) == splitter + + +@pytest.mark.parametrize( + "splitter, other_states, rpn", + [ + ( + ["a", "_NA"], + {"NA": (other_states_to_tests(("b", "c")), "d")}, + ["a", "NA.b", "NA.c", ".", "*"], + ), + ( + ["_NA", "c"], + {"NA": (other_states_to_tests(("a", "b")), "d")}, + ["NA.a", "NA.b", ".", "c", "*"], + ), + ( + ["a", ("b", "_NA")], + {"NA": (other_states_to_tests(["c", "d"]), "d")}, + ["a", "b", "NA.c", "NA.d", "*", ".", "*"], + ), + ], +) +def test_splitter2rpn_wf_splitter_1(splitter, other_states, rpn): + assert splitter2rpn(splitter, other_states=other_states) == rpn -from ..state import State -from ..helpers_state import PydraStateError, add_name_splitter + +@pytest.mark.parametrize( + "splitter, other_states, rpn", + [ + ( + ["a", "_NA"], + {"NA": (other_states_to_tests(("b", "c")), "d")}, + ["a", "_NA", "*"], + ), + ( + ["_NA", "c"], + {"NA": (other_states_to_tests(("a", "b")), "d")}, + ["_NA", "c", "*"], + ), + ( + ["a", ("b", "_NA")], + {"NA": (other_states_to_tests(["c", "d"]), "d")}, + ["a", "b", "_NA", ".", "*"], + ), + ], +) +def test_splitter2rpn_wf_splitter_3(splitter, other_states, rpn): + assert splitter2rpn(splitter, other_states=other_states, state_fields=False) == rpn + + +@pytest.mark.parametrize( + "splitter, splitter_changed", + [ + ("a", "Node.a"), + (["a", ("b", "c")], ["Node.a", ("Node.b", "Node.c")]), + (("a", ["b", "c"]), ("Node.a", ["Node.b", "Node.c"])), + ], +) +def test_addname_splitter(splitter, splitter_changed): + assert add_name_splitter(splitter, "Node") == splitter_changed + + +@pytest.mark.parametrize( + "splitter_rpn, input_to_remove, final_splitter_rpn", + [ + (["a", "b", "."], ["b", "a"], []), + (["a", "b", "*"], ["b"], ["a"]), + (["a", "b", "c", ".", "*"], ["b", "c"], ["a"]), + (["a", "b", "c", ".", "*"], ["a"], ["b", "c", "."]), + (["a", "b", ".", "c", "*"], ["a", "b"], ["c"]), + (["a", "b", "c", "d", "*", "*", "*"], ["c"], ["a", "b", "d", "*", "*"]), + (["a", "b", "c", "d", "*", "*", "*"], ["a"], ["b", "c", "d", "*", "*"]), + (["a", "b", "c", "d", "*", 
".", "*"], ["a"], ["b", "c", "d", "*", "."]), + (["a", "b", "*", "c", "."], ["a", "c"], ["b"]), + (["a", "b", "*", "c", "d", "*", "."], ["a", "c"], ["b", "d", "."]), + (["a", "b", ".", "c", "d", ".", "*"], ["a", "b"], ["c", "d", "."]), + ], +) +def test_remove_inp_from_splitter_rpn( + splitter_rpn, input_to_remove, final_splitter_rpn +): + assert ( + remove_inp_from_splitter_rpn(splitter_rpn, input_to_remove) + == final_splitter_rpn + ) + + +@pytest.mark.parametrize( + "group_for_inputs, input_for_groups, ndim", + [ + ({"a": 0, "b": 0}, {0: ["a", "b"]}, 1), + ({"a": 0, "b": 1}, {0: ["a"], 1: ["b"]}, 2), + ], +) +def test_groups_to_input(group_for_inputs, input_for_groups, ndim): + res = converter_groups_to_input(group_for_inputs) + assert res[0] == input_for_groups + assert res[1] == ndim @pytest.mark.parametrize( @@ -97,25 +352,23 @@ def test_state_1( def test_state_2_err(): with pytest.raises(PydraStateError) as exinfo: - State("NA", splitter={"a"}) + State(name="NA", splitter={"a"}) assert "splitter has to be a string, a tuple or a list" == str(exinfo.value) def test_state_3_err(): - with pytest.raises(PydraStateError) as exinfo: - State("NA", splitter=["a", "b"], combiner=("a", "b")) - assert "combiner has to be a string or a list" == str(exinfo.value) + with pytest.raises(PydraStateError, match="combiner has to be a string or a list"): + State(name="NA", splitter=["a", "b"], combiner=("a", "b")) def test_state_4_err(): - st = State("NA", splitter="a", combiner=["a", "b"]) - with pytest.raises(PydraStateError) as exinfo: + st = State(name="NA", splitter="a", combiner=["a", "b"]) + with pytest.raises(PydraStateError, match="are not in the splitter") as exinfo: st.combiner_validation() - assert "all combiners have to be in the splitter" in str(exinfo.value) def test_state_5_err(): - st = State("NA", combiner="a") + st = State(name="NA", combiner="a") with pytest.raises(PydraStateError) as exinfo: st.combiner_validation() assert "splitter has to be set before" in str(exinfo.value) @@ -126,7 +379,7 @@ def test_state_5_err(): @pytest.mark.parametrize( - "splitter, cont_dim, values, keys, splits", + "splitter, container_ndim, values, keys, splits", [ ("a", None, [(0,), (1,)], ["a"], [{"a": 1}, {"a": 2}]), (["a"], None, [(0,), (1,)], ["a"], [{"a": 1}, {"a": 2}]), @@ -316,7 +569,7 @@ def test_state_5_err(): ), ], ) -def test_state_6(splitter, cont_dim, values, keys, splits): +def test_state_6(splitter, container_ndim, values, keys, splits): """checking split method and prepare_state""" inputs = { "S.a": [1, 2], @@ -328,13 +581,13 @@ def test_state_6(splitter, cont_dim, values, keys, splits): # adding st.name to the inputs variables splitter = add_name_splitter(splitter, name="S") - if cont_dim: - cont_dim = {f"S.{k}": v for k, v in cont_dim.items()} + if container_ndim: + container_ndim = {f"S.{k}": v for k, v in container_ndim.items()} keys = [f"S.{k}" for k in keys] splits = [{f"S.{k}": v for k, v in el.items()} for el in splits] st = State(splitter=splitter, name="S") - st.prepare_states(inputs=inputs, cont_dim=cont_dim) + st.prepare_states(inputs=inputs, container_ndim=container_ndim) # checking keys and splits assert st.keys_final == keys @@ -343,7 +596,7 @@ def test_state_6(splitter, cont_dim, values, keys, splits): @pytest.mark.parametrize( - "splitter, cont_dim, inputs, mismatch", + "splitter, container_ndim, inputs, mismatch", [ ((["a", "v"], "c"), None, {"a": [1, 2], "v": ["a", "b"], "c": [3, 4]}, True), ( @@ -360,26 +613,26 @@ def test_state_6(splitter, cont_dim, values, keys, 
splits): ), ], ) -def test_state_7(splitter, cont_dim, inputs, mismatch): +def test_state_7(splitter, container_ndim, inputs, mismatch): """checking if the split methods returns errors if shapes doesn't match""" # adding st.name to the inputs variables splitter = add_name_splitter(splitter, name="S") - if cont_dim: - cont_dim = {f"S.{k}": v for k, v in cont_dim.items()} + if container_ndim: + container_ndim = {f"S.{k}": v for k, v in container_ndim.items()} inputs = {f"S.{k}": v for k, v in inputs.items()} st = State(splitter=splitter, name="S") if mismatch: with pytest.raises(ValueError): - st.prepare_states(inputs=inputs, cont_dim=cont_dim) + st.prepare_states(inputs=inputs, container_ndim=container_ndim) else: - st.prepare_states(inputs=inputs, cont_dim=cont_dim) + st.prepare_states(inputs=inputs, container_ndim=container_ndim) @pytest.mark.parametrize( - "splitter, cont_dim, values, keys, shapes, splits", + "splitter, container_ndim, values, keys, shapes, splits", [ ( (["a", "v"], "c"), @@ -409,18 +662,18 @@ def test_state_7(splitter, cont_dim, inputs, mismatch): ), ], ) -def test_state_8(splitter, cont_dim, values, keys, shapes, splits): +def test_state_8(splitter, container_ndim, values, keys, shapes, splits): inputs = {"S.a": [1, 2], "S.v": ["a", "b"], "S.c": [[3, 4], [5, 6]]} # adding st.name to the inputs variables splitter = add_name_splitter(splitter, name="S") - if cont_dim: - cont_dim = {f"S.{k}": v for k, v in cont_dim.items()} + if container_ndim: + container_ndim = {f"S.{k}": v for k, v in container_ndim.items()} keys = [f"S.{k}" for k in keys] splits = [{f"S.{k}": v for k, v in el.items()} for el in splits] st = State(splitter=splitter, name="S") - st.prepare_states(inputs=inputs, cont_dim=cont_dim) + st.prepare_states(inputs=inputs, container_ndim=container_ndim) # checking keys and splits assert st.keys_final == keys @@ -477,6 +730,7 @@ def test_state_connect_1(): no explicit splitter for the second state """ st1 = State(name="NA", splitter="a") + st1.prepare_states(inputs={"NA.a": [3, 5]}) st2 = State(name="NB", other_states={"NA": (st1, "b")}) assert st2.splitter == "_NA" assert st2.splitter_rpn == ["NA.a"] @@ -500,7 +754,12 @@ def test_state_connect_1a(): the second state has explicit splitter from the first one (the prev-state part) """ st1 = State(name="NA", splitter="a") - st2 = State(name="NB", splitter="_NA", other_states={"NA": (st1, "b")}) + st1.prepare_states(inputs={"NA.a": [3, 5]}) + st2 = State( + name="NB", + splitter="_NA", + other_states={"NA": (st1, "b")}, + ) assert st2.splitter == "_NA" assert st2.splitter_rpn == ["NA.a"] @@ -527,7 +786,11 @@ def test_state_connect_1b_exception(): def test_state_connect_1c_exception(splitter2, other_states2): """can't ask for splitter from node that is not connected""" with pytest.raises(PydraStateError): - st2 = State(name="NB", splitter=splitter2, other_states=other_states2) + st2 = State( + name="NB", + splitter=splitter2, + other_states=other_states2, + ) st2.splitter_validation() @@ -537,7 +800,12 @@ def test_state_connect_2(): splitter from the first node and a new field (the prev-state and current part) """ st1 = State(name="NA", splitter="a") - st2 = State(name="NB", splitter=["_NA", "a"], other_states={"NA": (st1, "b")}) + st1.prepare_states(inputs={"NA.a": [3, 5]}) + st2 = State( + name="NB", + splitter=["_NA", "a"], + other_states={"NA": (st1, "b")}, + ) assert st2.splitter == ["_NA", "NB.a"] assert st2.splitter_rpn == ["NA.a", "NB.a", "*"] @@ -581,7 +849,12 @@ def test_state_connect_2a(): adding an 
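The splitter2rpn/rpn2splitter cases earlier in this hunk encode the splitter grammar in reverse Polish notation: a tuple is a scalar ("zip") combination marked ".", a list is an outer (cross-product) combination marked "*", and nested specs fold left to right. The following is a minimal illustrative re-implementation that reproduces the tabled expectations; to_rpn is a hypothetical name, not the pydra helper, and it ignores the other_states/"_NA" handling exercised below.

from functools import reduce

def to_rpn(splitter):
    # Tuples denote scalar ("zip") combinations marked ".", lists denote
    # outer (cross-product) combinations marked "*"; nested specs fold
    # left to right, matching the parametrised expectations above.
    if isinstance(splitter, str):
        return [splitter]
    op = "." if isinstance(splitter, tuple) else "*"
    parts = [to_rpn(el) for el in splitter]
    return reduce(lambda acc, nxt: acc + nxt + [op], parts[1:], parts[0])

assert to_rpn(("a", "b")) == ["a", "b", "."]
assert to_rpn(["a", ("b", "c")]) == ["a", "b", "c", ".", "*"]
assert to_rpn(["a", ["b", ["c", "d"]]]) == ["a", "b", "c", "d", "*", "*", "*"]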
additional scalar field that is not part of the splitter """ st1 = State(name="NA", splitter="a") - st2 = State(name="NB", splitter=["_NA", "a"], other_states={"NA": (st1, "b")}) + st1.prepare_states(inputs={"NA.a": [3, 5]}) + st2 = State( + name="NB", + splitter=["_NA", "a"], + other_states={"NA": (st1, "b")}, + ) assert st2.splitter == ["_NA", "NB.a"] assert st2.splitter_rpn == ["NA.a", "NB.a", "*"] @@ -619,6 +892,7 @@ def test_state_connect_2b(): splitter from the first node (the prev-state part) has to be added """ st1 = State(name="NA", splitter="a") + st1.prepare_states(inputs={"NA.a": [3, 5]}) st2 = State(name="NB", splitter="a", other_states={"NA": (st1, "b")}) assert st2.splitter == ["_NA", "NB.a"] @@ -626,7 +900,7 @@ def test_state_connect_2b(): assert st2.current_splitter == "NB.a" assert st2.prev_state_splitter == "_NA" - st2.prepare_states(inputs={"NA.a": [3, 5], "NB.a": [1, 2]}) + st2.prepare_states(inputs={"NB.a": [1, 2]}) assert st2.group_for_inputs_final == {"NA.a": 0, "NB.a": 1} assert st2.groups_stack_final == [[0, 1]] assert st2.states_ind == [ @@ -657,8 +931,13 @@ def test_state_connect_3(): splitter from the previous states (the prev-state part) has to be added """ st1 = State(name="NA", splitter="a") + st1.prepare_states(inputs={"NA.a": [3, 5]}) st2 = State(name="NB", splitter="a") - st3 = State(name="NC", other_states={"NA": (st1, "b"), "NB": (st2, "c")}) + st2.prepare_states(inputs={"NB.a": [30, 50]}) + st3 = State( + name="NC", + other_states={"NA": (st1, "b"), "NB": (st2, "c")}, + ) assert st3.splitter == ["_NA", "_NB"] assert st3.splitter_rpn == ["NA.a", "NB.a", "*"] @@ -699,7 +978,9 @@ def test_state_connect_3a(): the third state has explicit splitter that contains splitters from previous states """ st1 = State(name="NA", splitter="a") + st1.prepare_states(inputs={"NA.a": [3, 5], "NB.a": [30, 50]}) st2 = State(name="NB", splitter="a") + st2.prepare_states(inputs={"NA.a": [3, 5], "NB.a": [30, 50]}) st3 = State( name="NC", splitter=["_NA", "_NB"], @@ -741,9 +1022,13 @@ def test_state_connect_3b(): splitter from the second state has to be added (partial prev-state part) """ st1 = State(name="NA", splitter="a") + st1.prepare_states(inputs={"NA.a": [3, 5], "NB.a": [30, 50]}) st2 = State(name="NB", splitter="a") + st2.prepare_states(inputs={"NA.a": [3, 5], "NB.a": [30, 50]}) st3 = State( - name="NC", splitter="_NB", other_states={"NA": (st1, "b"), "NB": (st2, "c")} + name="NC", + splitter="_NB", + other_states={"NA": (st1, "b"), "NB": (st2, "c")}, ) assert st3.splitter == ["_NA", "_NB"] @@ -780,7 +1065,9 @@ def test_state_connect_4(): the third state has explicit scalar(!) 
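test_state_6 and test_state_7 above pin down what the two splitter kinds mean once prepare_states runs: a scalar (tuple) splitter pairs its input lists element-wise and a length mismatch raises ValueError, while an outer (list) splitter enumerates the full cross product. A rough sketch of that behaviour under those assumptions; split_states is a hypothetical helper, not the State API.

from itertools import product

def split_states(splitter, inputs):
    # Hypothetical helper, not pydra API: a tuple splitter zips the input
    # lists element-wise (lengths must match, as in test_state_7), while a
    # list splitter takes their cross product.
    if isinstance(splitter, tuple):
        if len({len(inputs[f]) for f in splitter}) > 1:
            raise ValueError("scalar splitter requires inputs of equal length")
        rows = zip(*(inputs[f] for f in splitter))
    else:
        rows = product(*(inputs[f] for f in splitter))
    return [dict(zip(splitter, row)) for row in rows]

assert split_states(("a", "b"), {"a": [1, 2], "b": [10, 20]}) == [
    {"a": 1, "b": 10},
    {"a": 2, "b": 20},
]
assert split_states(["a", "b"], {"a": [1, 2], "b": [10, 20]})[1] == {"a": 1, "b": 20}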
splitter that contains two previous states """ st1 = State(name="NA", splitter="a") + st1.prepare_states(inputs={"NA.a": [3, 5], "NB.a": [30, 50]}) st2 = State(name="NB", splitter="a") + st2.prepare_states(inputs={"NA.a": [3, 5], "NB.a": [30, 50]}) st3 = State( name="NC", splitter=("_NA", "_NB"), @@ -811,6 +1098,7 @@ def test_state_connect_5(): the second state has no explicit splitter """ st1 = State(name="NA", splitter=["a", "b"]) + st1.prepare_states(inputs={"NA.a": [3, 5], "NA.b": [10, 20]}) st2 = State(name="NB", other_states={"NA": (st1, "a")}) assert st2.splitter == "_NA" assert st2.splitter_rpn == ["NA.a", "NA.b", "*"] @@ -841,7 +1129,9 @@ def test_state_connect_6(): the third state has explicit splitter with splitters from previous states """ st1 = State(name="NA", splitter=["a", "b"]) + st1.prepare_states(inputs={"NA.a": [3, 5], "NA.b": [10, 20], "NB.a": [600, 700]}) st2 = State(name="NB", splitter="a") + st2.prepare_states(inputs={"NA.a": [3, 5], "NA.b": [10, 20], "NB.a": [600, 700]}) st3 = State( name="NC", splitter=["_NA", "_NB"], @@ -894,8 +1184,13 @@ def test_state_connect_6a(): the third state has no explicit splitter """ st1 = State(name="NA", splitter=["a", "b"]) + st1.prepare_states(inputs={"NA.a": [3, 5], "NA.b": [10, 20], "NB.a": [600, 700]}) st2 = State(name="NB", splitter="a") - st3 = State(name="NC", other_states={"NA": (st1, "a"), "NB": (st2, "b")}) + st2.prepare_states(inputs={"NA.a": [3, 5], "NA.b": [10, 20], "NB.a": [600, 700]}) + st3 = State( + name="NC", + other_states={"NA": (st1, "a"), "NB": (st2, "b")}, + ) assert st3.splitter == ["_NA", "_NB"] assert st3.splitter_rpn == ["NA.a", "NA.b", "*", "NB.a", "*"] @@ -941,6 +1236,7 @@ def test_state_connect_7(): no explicit splitter for the second state """ st1 = State(name="NA", splitter="a") + st1.prepare_states(inputs={"NA.a": [3, 5]}) st2 = State(name="NB", other_states={"NA": (st1, ["x", "y"])}) # should take into account that x, y come from the same task assert st2.splitter == "_NA" @@ -967,8 +1263,13 @@ def test_state_connect_8(): and it should give the same as the previous test """ st1 = State(name="NA", splitter="a") + st1.prepare_states(inputs={"NA.a": [3, 5]}) st2 = State(name="NB", other_states={"NA": (st1, "b")}) - st3 = State(name="NC", other_states={"NA": (st1, "x"), "NB": (st2, "y")}) + st2.prepare_states(inputs={"NA.a": [3, 5]}) + st3 = State( + name="NC", + other_states={"NA": (st1, "x"), "NB": (st2, "y")}, + ) # x comes from NA and y comes from NB, but NB has only NA's splitter, # so it should be treated as both inputs are from NA state assert st3.splitter == "_NA" @@ -998,9 +1299,18 @@ def test_state_connect_9(): """ st1 = State(name="NA_1", splitter="a") + st1.prepare_states(inputs={"NA_1.a": [3, 5], "NA_2.a": [11, 12]}) st1a = State(name="NA_2", splitter="a") - st2 = State(name="NB", other_states={"NA_1": (st1, "b"), "NA_2": (st1a, "c")}) - st3 = State(name="NC", other_states={"NA_1": (st1, "x"), "NB": (st2, "y")}) + st1a.prepare_states(inputs={"NA_1.a": [3, 5], "NA_2.a": [11, 12]}) + st2 = State( + name="NB", + other_states={"NA_1": (st1, "b"), "NA_2": (st1a, "c")}, + ) + st2.prepare_states(inputs={"NA_1.a": [3, 5], "NA_2.a": [11, 12]}) + st3 = State( + name="NC", + other_states={"NA_1": (st1, "x"), "NB": (st2, "y")}, + ) # x comes from NA_1 and y comes from NB, but NB has only NA_1/2's splitters, assert st3.splitter == ["_NA_1", "_NA_2"] assert st3.splitter_rpn == ["NA_1.a", "NA_2.a", "*"] @@ -1033,7 +1343,12 @@ def test_state_connect_innerspl_1(): the second state has an inner splitter, full 
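The connect tests above lean on the prev-state convention: "_NA" in a splitter stands for the splitter inherited from the upstream NA state, and when no explicit splitter is given the prev-state part defaults to one such entry per connected state, combined as an outer splitter (test_state_connect_1 and _3). A hedged sketch of that default; default_prev_state_splitter is an illustrative name only, and it ignores the de-duplication of repeated upstream splitters exercised in test_state_connect_8 and _9.

def default_prev_state_splitter(other_states):
    # Illustrative sketch of the default seen in test_state_connect_1/_3:
    # with no explicit splitter, each connected upstream state contributes
    # "_<name>", and multiple upstreams are combined as an outer (list)
    # splitter.
    names = [f"_{name}" for name in other_states]
    return names[0] if len(names) == 1 else names

assert default_prev_state_splitter({"NA": None}) == "_NA"
assert default_prev_state_splitter({"NA": None, "NB": None}) == ["_NA", "_NB"]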
splitter provided """ st1 = State(name="NA", splitter="a") - st2 = State(name="NB", splitter=["_NA", "b"], other_states={"NA": (st1, "b")}) + st1.prepare_states(inputs={"NA.a": [3, 5]}) + st2 = State( + name="NB", + splitter=["_NA", "b"], + other_states={"NA": (st1, "b")}, + ) assert st2.splitter == ["_NA", "NB.b"] assert st2.splitter_rpn == ["NA.a", "NB.b", "*"] @@ -1045,7 +1360,7 @@ def test_state_connect_innerspl_1(): st2.prepare_states( inputs={"NA.a": [3, 5], "NB.b": [[1, 10, 100], [2, 20, 200]]}, - cont_dim={"NB.b": 2}, # will be treated as 2d container + container_ndim={"NB.b": 2}, # will be treated as 2d container ) assert st2.other_states["NA"][1] == ["b"] assert st2.group_for_inputs_final == {"NA.a": 0, "NB.b": 1} @@ -1085,6 +1400,9 @@ def test_state_connect_innerspl_1a(): splitter from the first state (the prev-state part) has to be added """ st1 = State(name="NA", splitter="a") + st1.prepare_states( + inputs={"NA.a": [3, 5]}, + ) st2 = State(name="NB", splitter="b", other_states={"NA": (st1, "b")}) assert st2.splitter == ["_NA", "NB.b"] @@ -1099,7 +1417,7 @@ def test_state_connect_innerspl_1a(): st2.prepare_states( inputs={"NA.a": [3, 5], "NB.b": [[1, 10, 100], [2, 20, 200]]}, - cont_dim={"NB.b": 2}, # will be treated as 2d container + container_ndim={"NB.b": 2}, # will be treated as 2d container ) assert st2.group_for_inputs_final == {"NA.a": 0, "NB.b": 1} assert st2.groups_stack_final == [[0], [1]] @@ -1136,7 +1454,11 @@ def test_state_connect_innerspl_1b(): """incorrect splitter - the current & prev-state parts in scalar splitter""" with pytest.raises(PydraStateError): st1 = State(name="NA", splitter="a") - State(name="NB", splitter=("_NA", "b"), other_states={"NA": (st1, "b")}) + State( + name="NB", + splitter=("_NA", "b"), + other_states={"NA": (st1, "b")}, + ) def test_state_connect_innerspl_2(): @@ -1145,7 +1467,15 @@ def test_state_connect_innerspl_2(): only the current part of the splitter provided (the prev-state has to be added) """ st1 = State(name="NA", splitter="a") - st2 = State(name="NB", splitter=["c", "b"], other_states={"NA": (st1, "b")}) + st1.prepare_states( + inputs={"NA.a": [3, 5], "NB.b": [[1, 10, 100], [2, 20, 200]], "NB.c": [13, 17]}, + container_ndim={"NB.b": 2}, # will be treated as 2d container + ) + st2 = State( + name="NB", + splitter=["c", "b"], + other_states={"NA": (st1, "b")}, + ) assert st2.splitter == ["_NA", ["NB.c", "NB.b"]] assert st2.splitter_rpn == ["NA.a", "NB.c", "NB.b", "*", "*"] @@ -1157,7 +1487,7 @@ def test_state_connect_innerspl_2(): st2.prepare_states( inputs={"NA.a": [3, 5], "NB.b": [[1, 10, 100], [2, 20, 200]], "NB.c": [13, 17]}, - cont_dim={"NB.b": 2}, # will be treated as 2d container + container_ndim={"NB.b": 2}, # will be treated as 2d container ) assert st2.other_states["NA"][1] == ["b"] assert st2.group_for_inputs_final == {"NA.a": 0, "NB.c": 1, "NB.b": 2} @@ -1216,7 +1546,15 @@ def test_state_connect_innerspl_2a(): """ st1 = State(name="NA", splitter="a") - st2 = State(name="NB", splitter=["b", "c"], other_states={"NA": (st1, "b")}) + st1.prepare_states( + inputs={"NA.a": [3, 5], "NB.b": [[1, 10, 100], [2, 20, 200]], "NB.c": [13, 17]}, + container_ndim={"NB.b": 2}, # will be treated as 2d container + ) + st2 = State( + name="NB", + splitter=["b", "c"], + other_states={"NA": (st1, "b")}, + ) assert st2.splitter == ["_NA", ["NB.b", "NB.c"]] assert st2.splitter_rpn == ["NA.a", "NB.b", "NB.c", "*", "*"] @@ -1224,7 +1562,7 @@ def test_state_connect_innerspl_2a(): st2.prepare_states( inputs={"NA.a": [3, 5], "NB.b": [[1, 10, 
100], [2, 20, 200]], "NB.c": [13, 17]}, - cont_dim={"NB.b": 2}, # will be treated as 2d container + container_ndim={"NB.b": 2}, # will be treated as 2d container ) assert st2.group_for_inputs_final == {"NA.a": 0, "NB.c": 2, "NB.b": 1} assert st2.groups_stack_final == [[0], [1, 2]] @@ -1283,7 +1621,19 @@ def test_state_connect_innerspl_3(): """ st1 = State(name="NA", splitter="a") - st2 = State(name="NB", splitter=["c", "b"], other_states={"NA": (st1, "b")}) + st1.prepare_states(inputs={"NA.a": [3, 5]}) + st2 = State( + name="NB", + splitter=["c", "b"], + other_states={"NA": (st1, "b")}, + ) + st2.prepare_states( + inputs={ + "NB.b": [[1, 10, 100], [2, 20, 200]], + "NB.c": [13, 17], + }, + container_ndim={"NB.b": 2}, # will be treated as 2d container + ) st3 = State(name="NC", splitter="d", other_states={"NB": (st2, "a")}) assert st3.splitter == ["_NB", "NC.d"] @@ -1301,7 +1651,7 @@ def test_state_connect_innerspl_3(): "NB.c": [13, 17], "NC.d": [33, 77], }, - cont_dim={"NB.b": 2}, # will be treated as 2d container + container_ndim={"NB.b": 2}, # will be treated as 2d container ) assert st3.group_for_inputs_final == {"NA.a": 0, "NB.c": 1, "NB.b": 2, "NC.d": 3} assert st3.groups_stack_final == [[0], [1, 2, 3]] @@ -1422,9 +1772,23 @@ def test_state_connect_innerspl_4(): the third one connected to two previous, only the current part of splitter provided """ st1 = State(name="NA", splitter="a") + st1.prepare_states( + inputs={ + "NA.a": [3, 5], + } + ) st2 = State(name="NB", splitter=["b", "c"]) + st2.prepare_states( + inputs={ + "NA.a": [3, 5], + "NB.b": [10, 20], + "NB.c": [13, 17], + } + ) st3 = State( - name="NC", splitter="d", other_states={"NA": (st1, "e"), "NB": (st2, "f")} + name="NC", + splitter="d", + other_states={"NA": (st1, "e"), "NB": (st2, "f")}, ) assert st3.splitter == [["_NA", "_NB"], "NC.d"] @@ -1441,7 +1805,7 @@ def test_state_connect_innerspl_4(): "NC.f": [[23, 27], [33, 37]], "NC.d": [1, 2], }, - cont_dim={"NC.f": 2}, # will be treated as 2d container + container_ndim={"NC.f": 2}, # will be treated as 2d container ) assert st3.group_for_inputs_final == {"NA.a": 0, "NB.c": 2, "NB.b": 1, "NC.d": 3} assert st3.groups_stack_final == [[0, 1, 2, 3]] @@ -1527,6 +1891,7 @@ def test_state_combine_1(): def test_state_connect_combine_1(): """two connected states; outer splitter and combiner in the first one""" st1 = State(name="NA", splitter=["a", "b"], combiner="a") + st1.prepare_states(inputs={"NA.a": [3, 5], "NA.b": [10, 20]}) st2 = State(name="NB", other_states={"NA": (st1, "c")}) assert st1.splitter == ["NA.a", "NA.b"] @@ -1572,6 +1937,9 @@ def test_state_connect_combine_2(): additional splitter in the second node """ st1 = State(name="NA", splitter=["a", "b"], combiner="a") + st1.prepare_states( + inputs={"NA.a": [3, 5], "NA.b": [10, 20], "NB.c": [90, 150], "NB.d": [0, 1]} + ) st2 = State(name="NB", splitter="d", other_states={"NA": (st1, "c")}) assert st1.splitter == ["NA.a", "NA.b"] @@ -1634,7 +2002,13 @@ def test_state_connect_combine_3(): additional splitter in the second node """ st1 = State(name="NA", splitter=["a", "b"], combiner="a") - st2 = State(name="NB", splitter="d", combiner="d", other_states={"NA": (st1, "c")}) + st1.prepare_states(inputs={"NA.a": [3, 5], "NA.b": [10, 20]}) + st2 = State( + name="NB", + splitter="d", + combiner="d", + other_states={"NA": (st1, "c")}, + ) assert st1.splitter == ["NA.a", "NA.b"] assert st1.splitter_rpn == ["NA.a", "NA.b", "*"] @@ -1699,8 +2073,12 @@ def test_state_connect_innerspl_combine_1(): """one previous node and one inner 
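container_ndim (the new name for cont_dim) tells prepare_states how many levels of nesting belong to the container being split over: with a hint of 2 the doubly nested NB.b above is treated as supplying its individual elements rather than two list-valued entries. A conceptual illustration only; container_elements is a made-up helper, not part of pydra.

def container_elements(value, ndim):
    # Conceptual illustration of the container_ndim hint: it declares how
    # many nesting levels form the container whose elements get split over
    # (the real handling lives in prepare_states).
    if ndim == 1:
        return list(value)
    return [el for sub in value for el in container_elements(sub, ndim - 1)]

b = [[1, 10, 100], [2, 20, 200]]
assert container_elements(b, 1) == [[1, 10, 100], [2, 20, 200]]  # 2 split values
assert container_elements(b, 2) == [1, 10, 100, 2, 20, 200]      # 6 split values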
splitter (and inner splitter combiner); only current part provided - the prev-state part had to be added""" st1 = State(name="NA", splitter="a") + st1.prepare_states(inputs={"NA.a": [3, 5]}) st2 = State( - name="NB", splitter=["c", "b"], combiner=["b"], other_states={"NA": (st1, "b")} + name="NB", + splitter=["c", "b"], + combiner=["b"], + other_states={"NA": (st1, "b")}, ) assert st2.splitter == ["_NA", ["NB.c", "NB.b"]] @@ -1715,7 +2093,7 @@ def test_state_connect_innerspl_combine_1(): st2.prepare_states( inputs={"NA.a": [3, 5], "NB.b": [[1, 10, 100], [2, 20, 200]], "NB.c": [13, 17]}, - cont_dim={"NB.b": 2}, # will be treated as 2d container + container_ndim={"NB.b": 2}, # will be treated as 2d container ) assert st2.group_for_inputs_final == {"NA.a": 0, "NB.c": 1} assert st2.groups_stack_final == [[0], [1]] @@ -1780,8 +2158,12 @@ def test_state_connect_innerspl_combine_2(): the prev-state part has to be added """ st1 = State(name="NA", splitter="a") + st1.prepare_states(inputs={"NA.a": [3, 5]}) st2 = State( - name="NB", splitter=["c", "b"], combiner=["c"], other_states={"NA": (st1, "b")} + name="NB", + splitter=["c", "b"], + combiner=["c"], + other_states={"NA": (st1, "b")}, ) assert st2.splitter == ["_NA", ["NB.c", "NB.b"]] @@ -1791,7 +2173,7 @@ def test_state_connect_innerspl_combine_2(): st2.prepare_states( inputs={"NA.a": [3, 5], "NB.b": [[1, 10, 100], [2, 20, 200]], "NB.c": [13, 17]}, - cont_dim={"NB.b": 2}, # will be treated as 2d container + container_ndim={"NB.b": 2}, # will be treated as 2d container ) assert st2.group_for_inputs_final == {"NA.a": 0, "NB.b": 1} assert st2.groups_stack_final == [[0], [1]] @@ -1856,7 +2238,12 @@ def test_state_connect_combine_prevst_1(): (i.e. from the prev-state part of the splitter), """ st1 = State(name="NA", splitter="a") - st2 = State(name="NB", other_states={"NA": (st1, "b")}, combiner="NA.a") + st1.prepare_states(inputs={"NA.a": [3, 5]}) + st2 = State( + name="NB", + other_states={"NA": (st1, "b")}, + combiner="NA.a", + ) assert st2.splitter == "_NA" assert st2.splitter_rpn == ["NA.a"] assert ( @@ -1886,7 +2273,12 @@ def test_state_connect_combine_prevst_2(): (i.e. from the prev-state part of the splitter), """ st1 = State(name="NA", splitter=["a", "b"]) - st2 = State(name="NB", other_states={"NA": (st1, "b")}, combiner="NA.a") + st1.prepare_states(inputs={"NA.a": [3, 5], "NA.b": [10, 20]}) + st2 = State( + name="NB", + other_states={"NA": (st1, "b")}, + combiner="NA.a", + ) assert st2.splitter == "_NA" assert st2.splitter_rpn == ["NA.a", "NA.b", "*"] assert st2.combiner == ["NA.a"] @@ -1894,7 +2286,7 @@ def test_state_connect_combine_prevst_2(): assert st2.current_combiner_all == st2.current_combiner == [] assert st2.splitter_rpn_final == ["NA.b"] - st2.prepare_states(inputs={"NA.a": [3, 5], "NA.b": [10, 20]}) + st2.prepare_states(inputs={}) assert st2.group_for_inputs_final == {"NA.b": 0} assert st2.groups_stack_final == [[0]] assert st2.states_ind == [ @@ -1922,14 +2314,20 @@ def test_state_connect_combine_prevst_3(): (i.e. 
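The combine tests above show the effect of a combiner: the combined field drops out of splitter_rpn_final and the per-state results are regrouped over the remaining fields (combining over NA.a leaves ["NA.b"] in test_state_connect_combine_prevst_2). A small, purely illustrative grouping sketch; combine here is a hypothetical helper, not State's combiner machinery.

from itertools import product

states = [dict(zip(("NA.a", "NA.b"), v)) for v in product([3, 5], [10, 20])]

def combine(states, field):
    # Hypothetical grouping helper: after combining over `field`, results
    # are grouped by the remaining splitter fields (here NA.b), so the
    # final splitter no longer contains the combined field.
    remaining = [k for k in states[0] if k != field]
    grouped = {}
    for st in states:
        grouped.setdefault(tuple(st[k] for k in remaining), []).append(st)
    return grouped

grouped = combine(states, "NA.a")
assert list(grouped) == [(10,), (20,)]
assert [st["NA.a"] for st in grouped[(10,)]] == [3, 5]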
from the prev-state part of the splitter), """ st1 = State(name="NA", splitter=["a", "b"]) + st1.prepare_states(inputs={"NA.a": [3, 5], "NA.b": [10, 20]}) st2 = State(name="NB", other_states={"NA": (st1, "b")}) - st3 = State(name="NC", other_states={"NB": (st2, "c")}, combiner="NA.a") + st2.prepare_states(inputs={}) + st3 = State( + name="NC", + other_states={"NB": (st2, "c")}, + combiner="NA.a", + ) assert st3.splitter == "_NB" assert st3.splitter_rpn == ["NA.a", "NA.b", "*"] assert st3.combiner == ["NA.a"] assert st3.splitter_rpn_final == ["NA.b"] - st3.prepare_states(inputs={"NA.a": [3, 5], "NA.b": [10, 20]}) + st3.prepare_states(inputs={}) assert st3.group_for_inputs_final == {"NA.b": 0} assert st3.groups_stack_final == [[0]] @@ -1958,7 +2356,9 @@ def test_state_connect_combine_prevst_4(): the third state has also combiner from the prev-state part """ st1 = State(name="NA", splitter="a") + st1.prepare_states(inputs={"NA.a": [3, 5]}) st2 = State(name="NB", splitter="a") + st2.prepare_states(inputs={"NB.a": [600, 700]}) st3 = State( name="NC", splitter=["_NA", "_NB"], @@ -2010,7 +2410,9 @@ def test_state_connect_combine_prevst_5(): the third state has also combiner from the prev-state part """ st1 = State(name="NA", splitter="a") + st1.prepare_states(inputs={"NA.a": [3, 5]}) st2 = State(name="NB", splitter="a") + st2.prepare_states(inputs={"NB.a": [600, 700]}) st3 = State( name="NC", splitter=("_NA", "_NB"), @@ -2044,8 +2446,12 @@ def test_state_connect_combine_prevst_6(): (i.e. from the prev-state part of the splitter), """ st1 = State(name="NA", splitter=["a", "b"]) + st1.prepare_states(inputs={"NA.a": [3, 5], "NA.b": [10, 20]}) st2 = State( - name="NB", splitter="c", other_states={"NA": (st1, "b")}, combiner="NA.a" + name="NB", + splitter="c", + other_states={"NA": (st1, "b")}, + combiner="NA.a", ) assert st2.splitter == ["_NA", "NB.c"] assert st2.splitter_rpn == ["NA.a", "NA.b", "*", "NB.c", "*"] @@ -2098,10 +2504,32 @@ def test_state_connect_combine_prevst_6(): ] +@python.define +class ExampleDef(python.Task["ExampleDef.Outputs"]): + + a: int + b: int + + class Outputs(python.Outputs): + c: int + + def function(self): + return self.Outputs(c=self.inputs.a + self.inputs.b) + + +example_def = ExampleDef(a=1, b=2) + + @pytest.mark.parametrize( "splitter, other_states, expected_splitter, expected_prevst, expected_current", [ - (None, {"NA": (State(name="NA", splitter="a"), "b")}, "_NA", "_NA", None), + ( + None, + {"NA": (State(name="NA", splitter="a"), "b")}, + "_NA", + "_NA", + None, + ), ( "b", {"NA": (State(name="NA", splitter="a"), "b")}, @@ -2161,8 +2589,14 @@ def test_connect_splitters( @pytest.mark.parametrize( "splitter, other_states", [ - (("_NA", "b"), {"NA": (State(name="NA", splitter="a"), "b")}), - (["b", "_NA"], {"NA": (State(name="NA", splitter="a"), "b")}), + ( + ("_NA", "b"), + {"NA": (State(name="NA", splitter="a"), "b")}, + ), + ( + ["b", "_NA"], + {"NA": (State(name="NA", splitter="a"), "b")}, + ), ( ["_NB", ["_NA", "b"]], { @@ -2174,7 +2608,11 @@ def test_connect_splitters( ) def test_connect_splitters_exception_1(splitter, other_states): with pytest.raises(PydraStateError) as excinfo: - State(name="CN", splitter=splitter, other_states=other_states) + State( + name="CN", + splitter=splitter, + other_states=other_states, + ) assert "prev-state and current splitters are mixed" in str(excinfo.value) @@ -2194,6 +2632,9 @@ def test_connect_splitters_exception_3(): State( name="CN", splitter="_NB", - other_states=["NA", (State(name="NA", splitter="a"), "b")], + 
other_states=[ + "NA", + (State(name="NA", splitter="a"), "b"), + ], ) assert "other states has to be a dictionary" == str(excinfo.value) diff --git a/pydra/engine/tests/test_submitter.py b/pydra/engine/tests/test_submitter.py deleted file mode 100644 index 298e7e74b4..0000000000 --- a/pydra/engine/tests/test_submitter.py +++ /dev/null @@ -1,733 +0,0 @@ -from dateutil import parser -import secrets -import re -import subprocess as sp -import time -import attrs -import typing as ty -from random import randint -import os -from unittest.mock import patch -import pytest -from fileformats.generic import Directory -from .utils import ( - need_sge, - need_slurm, - gen_basic_wf, - gen_basic_wf_with_threadcount, - gen_basic_wf_with_threadcount_concurrent, -) -from ..core import Workflow, TaskBase -from ..submitter import Submitter -from ..workers import SerialWorker -from ... import mark -from pathlib import Path -from datetime import datetime - - -@mark.task -def sleep_add_one(x): - time.sleep(1) - return x + 1 - - -def test_callable_wf(plugin, tmpdir): - wf = gen_basic_wf() - res = wf() - assert res.output.out == 9 - del wf, res - - # providing plugin - wf = gen_basic_wf() - res = wf(plugin="cf") - assert res.output.out == 9 - del wf, res - - # providing plugin_kwargs - wf = gen_basic_wf() - res = wf(plugin="cf", plugin_kwargs={"n_procs": 2}) - assert res.output.out == 9 - del wf, res - - # providing wrong plugin_kwargs - wf = gen_basic_wf() - with pytest.raises(TypeError, match="an unexpected keyword argument"): - wf(plugin="cf", plugin_kwargs={"sbatch_args": "-N2"}) - - # providing submitter - wf = gen_basic_wf() - wf.cache_dir = tmpdir - sub = Submitter(plugin) - res = wf(submitter=sub) - assert res.output.out == 9 - - -def test_concurrent_wf(plugin, tmpdir): - # concurrent workflow - # A --> C - # B --> D - wf = Workflow("new_wf", input_spec=["x", "y"]) - wf.inputs.x = 5 - wf.inputs.y = 10 - wf.add(sleep_add_one(name="taska", x=wf.lzin.x)) - wf.add(sleep_add_one(name="taskb", x=wf.lzin.y)) - wf.add(sleep_add_one(name="taskc", x=wf.taska.lzout.out)) - wf.add(sleep_add_one(name="taskd", x=wf.taskb.lzout.out)) - wf.set_output([("out1", wf.taskc.lzout.out), ("out2", wf.taskd.lzout.out)]) - wf.cache_dir = tmpdir - - with Submitter(plugin) as sub: - sub(wf) - - res = wf.result() - assert res.output.out1 == 7 - assert res.output.out2 == 12 - - -def test_concurrent_wf_nprocs(tmpdir): - # concurrent workflow - # setting n_procs in Submitter that is passed to the worker - # A --> C - # B --> D - wf = Workflow("new_wf", input_spec=["x", "y"]) - wf.inputs.x = 5 - wf.inputs.y = 10 - wf.add(sleep_add_one(name="taska", x=wf.lzin.x)) - wf.add(sleep_add_one(name="taskb", x=wf.lzin.y)) - wf.add(sleep_add_one(name="taskc", x=wf.taska.lzout.out)) - wf.add(sleep_add_one(name="taskd", x=wf.taskb.lzout.out)) - wf.set_output([("out1", wf.taskc.lzout.out), ("out2", wf.taskd.lzout.out)]) - wf.cache_dir = tmpdir - - with Submitter("cf", n_procs=2) as sub: - sub(wf) - - res = wf.result() - assert res.output.out1 == 7 - assert res.output.out2 == 12 - - -def test_wf_in_wf(plugin, tmpdir): - """WF(A --> SUBWF(A --> B) --> B)""" - wf = Workflow(name="wf_in_wf", input_spec=["x"]) - wf.inputs.x = 3 - wf.add(sleep_add_one(name="wf_a", x=wf.lzin.x)) - - # workflow task - subwf = Workflow(name="sub_wf", input_spec=["x"]) - subwf.add(sleep_add_one(name="sub_a", x=subwf.lzin.x)) - subwf.add(sleep_add_one(name="sub_b", x=subwf.sub_a.lzout.out)) - subwf.set_output([("out", subwf.sub_b.lzout.out)]) - # connect, then add - 
subwf.inputs.x = wf.wf_a.lzout.out - subwf.cache_dir = tmpdir - - wf.add(subwf) - wf.add(sleep_add_one(name="wf_b", x=wf.sub_wf.lzout.out)) - wf.set_output([("out", wf.wf_b.lzout.out)]) - wf.cache_dir = tmpdir - - with Submitter(plugin) as sub: - sub(wf) - - res = wf.result() - assert res.output.out == 7 - - -@pytest.mark.flaky(reruns=2) # when dask -def test_wf2(plugin_dask_opt, tmpdir): - """workflow as a node - workflow-node with one task and no splitter - """ - wfnd = Workflow(name="wfnd", input_spec=["x"]) - wfnd.add(sleep_add_one(name="add2", x=wfnd.lzin.x)) - wfnd.set_output([("out", wfnd.add2.lzout.out)]) - wfnd.inputs.x = 2 - - wf = Workflow(name="wf", input_spec=["x"]) - wf.add(wfnd) - wf.set_output([("out", wf.wfnd.lzout.out)]) - wf.cache_dir = tmpdir - - with Submitter(plugin=plugin_dask_opt) as sub: - sub(wf) - - res = wf.result() - assert res.output.out == 3 - - -@pytest.mark.flaky(reruns=2) # when dask -def test_wf_with_state(plugin_dask_opt, tmpdir): - wf = Workflow(name="wf_with_state", input_spec=["x"]) - wf.add(sleep_add_one(name="taska", x=wf.lzin.x)) - wf.add(sleep_add_one(name="taskb", x=wf.taska.lzout.out)) - - wf.split("x", x=[1, 2, 3]) - wf.set_output([("out", wf.taskb.lzout.out)]) - wf.cache_dir = tmpdir - - with Submitter(plugin=plugin_dask_opt) as sub: - sub(wf) - - res = wf.result() - - assert res[0].output.out == 3 - assert res[1].output.out == 4 - assert res[2].output.out == 5 - - -def test_serial_wf(): - # Use serial plugin to execute workflow instead of CF - wf = gen_basic_wf() - res = wf(plugin="serial") - assert res.output.out == 9 - - -@need_slurm -def test_slurm_wf(tmpdir): - wf = gen_basic_wf() - wf.cache_dir = tmpdir - # submit workflow and every task as slurm job - with Submitter("slurm") as sub: - sub(wf) - - res = wf.result() - assert res.output.out == 9 - script_dir = tmpdir / "SlurmWorker_scripts" - assert script_dir.exists() - # ensure each task was executed with slurm - assert len([sd for sd in script_dir.listdir() if sd.isdir()]) == 2 - - -@need_slurm -def test_slurm_wf_cf(tmpdir): - # submit entire workflow as single job executing with cf worker - wf = gen_basic_wf() - wf.cache_dir = tmpdir - wf.plugin = "cf" - with Submitter("slurm") as sub: - sub(wf) - res = wf.result() - assert res.output.out == 9 - script_dir = tmpdir / "SlurmWorker_scripts" - assert script_dir.exists() - # ensure only workflow was executed with slurm - sdirs = [sd for sd in script_dir.listdir() if sd.isdir()] - assert len(sdirs) == 1 - # slurm scripts should be in the dirs that are using uid in the name - assert sdirs[0].basename == wf.uid - - -@need_slurm -def test_slurm_wf_state(tmpdir): - wf = gen_basic_wf() - wf.split("x", x=[5, 6]) - wf.cache_dir = tmpdir - with Submitter("slurm") as sub: - sub(wf) - res = wf.result() - assert res[0].output.out == 9 - assert res[1].output.out == 10 - script_dir = tmpdir / "SlurmWorker_scripts" - assert script_dir.exists() - sdirs = [sd for sd in script_dir.listdir() if sd.isdir()] - assert len(sdirs) == 2 * len(wf.inputs.x) - - -@need_slurm -@pytest.mark.flaky(reruns=3) -def test_slurm_max_jobs(tmpdir): - wf = Workflow("new_wf", input_spec=["x", "y"], cache_dir=tmpdir) - wf.inputs.x = 5 - wf.inputs.y = 10 - wf.add(sleep_add_one(name="taska", x=wf.lzin.x)) - wf.add(sleep_add_one(name="taskb", x=wf.lzin.y)) - wf.add(sleep_add_one(name="taskc", x=wf.taska.lzout.out)) - wf.add(sleep_add_one(name="taskd", x=wf.taskb.lzout.out)) - wf.set_output([("out1", wf.taskc.lzout.out), ("out2", wf.taskd.lzout.out)]) - with Submitter("slurm", 
max_jobs=1) as sub: - sub(wf) - - jobids = [] - time.sleep(0.5) # allow time for sacct to collect itself - for fl in (tmpdir / "SlurmWorker_scripts").visit("slurm-*.out"): - jid = re.search(r"(?<=slurm-)\d+", fl.strpath) - assert jid.group() - jobids.append(jid.group()) - time.sleep(0.2) - del jid - - # query sacct for job eligibility timings - queued = [] - for jid in sorted(jobids): - out = sp.run(["sacct", "-Xnj", jid, "-o", "Eligible"], capture_output=True) - et = out.stdout.decode().strip() - queued.append(parser.parse(et)) - del out, et - - # compare timing between queued jobs - prev = None - for et in sorted(queued, reverse=True): - if prev is None: - prev = et - continue - assert (prev - et).seconds >= 2 - - -@need_slurm -def test_slurm_args_1(tmpdir): - """testing sbatch_args provided to the submitter""" - task = sleep_add_one(x=1) - task.cache_dir = tmpdir - # submit workflow and every task as slurm job - with Submitter("slurm", sbatch_args="-N1") as sub: - sub(task) - - res = task.result() - assert res.output.out == 2 - script_dir = tmpdir / "SlurmWorker_scripts" - assert script_dir.exists() - - -@need_slurm -def test_slurm_args_2(tmpdir): - """testing sbatch_args provided to the submitter - exception should be raised for invalid options - """ - task = sleep_add_one(x=1) - task.cache_dir = tmpdir - # submit workflow and every task as slurm job - with pytest.raises(RuntimeError, match="Error returned from sbatch:"): - with Submitter("slurm", sbatch_args="-N1 --invalid") as sub: - sub(task) - - -@mark.task -def sleep(x, job_name_part): - time.sleep(x) - import subprocess as sp - - # getting the job_id of the first job that sleeps - job_id = 999 - while job_id != "": - time.sleep(3) - id_p1 = sp.Popen(["squeue"], stdout=sp.PIPE) - id_p2 = sp.Popen(["grep", job_name_part], stdin=id_p1.stdout, stdout=sp.PIPE) - id_p3 = sp.Popen(["awk", "{print $1}"], stdin=id_p2.stdout, stdout=sp.PIPE) - job_id = id_p3.communicate()[0].decode("utf-8").strip() - - return x - - -@mark.task -def cancel(job_name_part): - import subprocess as sp - - # getting the job_id of the first job that sleeps - job_id = "" - while job_id == "": - time.sleep(1) - id_p1 = sp.Popen(["squeue"], stdout=sp.PIPE) - id_p2 = sp.Popen(["grep", job_name_part], stdin=id_p1.stdout, stdout=sp.PIPE) - id_p3 = sp.Popen(["awk", "{print $1}"], stdin=id_p2.stdout, stdout=sp.PIPE) - job_id = id_p3.communicate()[0].decode("utf-8").strip() - - # # canceling the job - proc = sp.run(["scancel", job_id, "--verbose"], stdout=sp.PIPE, stderr=sp.PIPE) - # cancelling the job returns message in the sterr - return proc.stderr.decode("utf-8").strip() - - -@pytest.mark.flaky(reruns=1) -@need_slurm -def test_slurm_cancel_rerun_1(tmpdir): - """testing that tasks run with slurm is re-queue - Running wf with 2 tasks, one sleeps and the other trying to get - job_id of the first task and cancel it. - The first job should be re-queue and finish without problem. 
- (possibly has to be improved, in theory cancel job might finish before cancel) - """ - wf = Workflow( - name="wf", - input_spec=["x", "job_name_cancel", "job_name_resqueue"], - cache_dir=tmpdir, - ) - wf.add(sleep(name="sleep1", x=wf.lzin.x, job_name_part=wf.lzin.job_name_cancel)) - wf.add(cancel(name="cancel1", job_name_part=wf.lzin.job_name_resqueue)) - wf.inputs.x = 10 - wf.inputs.job_name_resqueue = "sleep1" - wf.inputs.job_name_cancel = "cancel1" - - wf.set_output([("out", wf.sleep1.lzout.out), ("canc_out", wf.cancel1.lzout.out)]) - with Submitter("slurm") as sub: - sub(wf) - - res = wf.result() - assert res.output.out == 10 - # checking if indeed the sleep-task job was cancelled by cancel-task - assert "Terminating" in res.output.canc_out - assert "Invalid" not in res.output.canc_out - script_dir = tmpdir / "SlurmWorker_scripts" - assert script_dir.exists() - - -@pytest.mark.flaky(reruns=1) -@need_slurm -def test_slurm_cancel_rerun_2(tmpdir): - """testing that tasks run with slurm that has --no-requeue - Running wf with 2 tasks, one sleeps and the other gets - job_id of the first task and cancel it. - The first job is not able t be rescheduled and the error is returned. - """ - wf = Workflow(name="wf", input_spec=["x", "job_name"], cache_dir=tmpdir) - wf.add(sleep(name="sleep2", x=wf.lzin.x)) - wf.add(cancel(name="cancel2", job_name_part=wf.lzin.job_name)) - - wf.inputs.x = 10 - wf.inputs.job_name = "sleep2" - - wf.set_output([("out", wf.sleep2.lzout.out), ("canc_out", wf.cancel2.lzout.out)]) - with pytest.raises(Exception): - with Submitter("slurm", sbatch_args="--no-requeue") as sub: - sub(wf) - - -@need_sge -def test_sge_wf(tmpdir): - """testing that a basic workflow can be run with the SGEWorker""" - wf = gen_basic_wf() - wf.cache_dir = tmpdir - # submit workflow and every task as sge job - with Submitter( - "sge", - ) as sub: - sub(wf) - - res = wf.result() - assert res.output.out == 9 - script_dir = tmpdir / "SGEWorker_scripts" - assert script_dir.exists() - # ensure each task was executed with sge - assert len([sd for sd in script_dir.listdir() if sd.isdir()]) == 2 - - -@need_sge -def test_sge_wf_cf(tmpdir): - """testing the SGEWorker can submit SGE tasks while the workflow - uses the concurrent futures plugin""" - # submit entire workflow as single job executing with cf worker - wf = gen_basic_wf() - wf.cache_dir = tmpdir - wf.plugin = "cf" - with Submitter("sge") as sub: - sub(wf) - res = wf.result() - assert res.output.out == 9 - script_dir = tmpdir / "SGEWorker_scripts" - assert script_dir.exists() - # ensure only workflow was executed with slurm - sdirs = [sd for sd in script_dir.listdir() if sd.isdir()] - assert len(sdirs) == 1 - # sge scripts should be in the dirs that are using uid in the name - assert Path(sdirs[0]).name == wf.uid - - -@need_sge -def test_sge_wf_state(tmpdir): - """testing the SGEWorker can be used with a workflow with state""" - wf = gen_basic_wf() - wf.split("x") - wf.inputs.x = [5, 6] - wf.cache_dir = tmpdir - with Submitter("sge") as sub: - sub(wf) - res = wf.result() - assert res[0].output.out == 9 - assert res[1].output.out == 10 - script_dir = tmpdir / "SGEWorker_scripts" - assert script_dir.exists() - sdirs = [sd for sd in script_dir.listdir() if sd.isdir()] - assert len(sdirs) == 2 * len(wf.inputs.x) - - -def qacct_output_to_dict(qacct_output): - stdout_dict = {} - for line in qacct_output.splitlines(): - key_value = line.split(None, 1) - if key_value[0] not in stdout_dict: - stdout_dict[key_value[0]] = [] - if len(key_value) > 1: - 
stdout_dict[key_value[0]].append(key_value[1]) - else: - stdout_dict[key_value[0]].append(None) - - print(stdout_dict) - return stdout_dict - - -@need_sge -def test_sge_set_threadcount(tmpdir): - """testing the number of threads for an SGEWorker task can be set - using the input_spec variable sgeThreads""" - wf = gen_basic_wf_with_threadcount() - wf.inputs.x = 5 - wf.cache_dir = tmpdir - - jobids = [] - with Submitter("sge") as sub: - sub(wf) - jobids = list(sub.worker.jobid_by_task_uid.values()) - jobids.sort() - - print(f"jobids: {jobids}") - - out_job0 = ( - sp.run(["qacct", "-j", jobids[0]], capture_output=True).stdout.decode().strip() - ) - out_job1 = ( - sp.run(["qacct", "-j", jobids[1]], capture_output=True).stdout.decode().strip() - ) - - out_job0_dict = qacct_output_to_dict(out_job0) - out_job1_dict = qacct_output_to_dict(out_job1) - - assert int(out_job0_dict["slots"][0]) == 4 - assert int(out_job1_dict["slots"][0]) == 1 - - -@need_sge -def test_sge_limit_maxthreads(tmpdir): - """testing the ability to limit the number of threads used by the SGE - at one time with the max_threads argument to SGEWorker""" - wf = gen_basic_wf_with_threadcount_concurrent() - wf.inputs.x = [5, 6] - wf.split("x") - wf.cache_dir = tmpdir - - jobids = [] - with Submitter("sge", max_threads=8) as sub: - sub(wf) - jobids = list(sub.worker.jobid_by_task_uid.values()) - jobids.sort() - - out_job0 = ( - sp.run(["qacct", "-j", jobids[0]], capture_output=True).stdout.decode().strip() - ) - out_job1 = ( - sp.run(["qacct", "-j", jobids[1]], capture_output=True).stdout.decode().strip() - ) - out_job2 = ( - sp.run(["qacct", "-j", jobids[2]], capture_output=True).stdout.decode().strip() - ) - out_job3 = ( - sp.run(["qacct", "-j", jobids[3]], capture_output=True).stdout.decode().strip() - ) - - qacct_output_to_dict(out_job0) - out_job1_dict = qacct_output_to_dict(out_job1) - out_job2_dict = qacct_output_to_dict(out_job2) - qacct_output_to_dict(out_job3) - - job_1_endtime = datetime.strptime( - out_job1_dict["end_time"][0], "%a %b %d %H:%M:%S %Y" - ) - # Running both task_1_1 and task_1_2 at once would exceed max_threads, - # so task_1_2 waits for task_1_1 to complete - job_2_starttime = datetime.strptime( - out_job2_dict["start_time"][0], "%a %b %d %H:%M:%S %Y" - ) - assert job_1_endtime < job_2_starttime - - -@need_sge -def test_sge_no_limit_maxthreads(tmpdir): - """testing unlimited threads can be used at once by SGE - when max_threads is not set""" - wf = gen_basic_wf_with_threadcount_concurrent() - wf.inputs.x = [5, 6] - wf.split("x") - wf.cache_dir = tmpdir - - jobids = [] - with Submitter("sge", max_threads=None) as sub: - sub(wf) - jobids = list(sub.worker.jobid_by_task_uid.values()) - jobids.sort() - - out_job0 = ( - sp.run(["qacct", "-j", jobids[0]], capture_output=True).stdout.decode().strip() - ) - out_job1 = ( - sp.run(["qacct", "-j", jobids[1]], capture_output=True).stdout.decode().strip() - ) - out_job2 = ( - sp.run(["qacct", "-j", jobids[2]], capture_output=True).stdout.decode().strip() - ) - - qacct_output_to_dict(out_job0) - out_job1_dict = qacct_output_to_dict(out_job1) - out_job2_dict = qacct_output_to_dict(out_job2) - - job_1_endtime = datetime.strptime( - out_job1_dict["end_time"][0], "%a %b %d %H:%M:%S %Y" - ) - # Running both task_1_1 and task_1_2 at once would not exceed max_threads, - # so task_1_2 does not wait for task_1_1 to complete - job_2_starttime = datetime.strptime( - out_job2_dict["start_time"][0], "%a %b %d %H:%M:%S %Y" - ) - assert job_1_endtime > job_2_starttime - - -def 
test_hash_changes_in_task_inputs_file(tmp_path): - @mark.task - def output_dir_as_input(out_dir: Directory) -> Directory: - (out_dir.fspath / "new-file.txt").touch() - return out_dir - - task = output_dir_as_input(out_dir=tmp_path) - with pytest.raises(RuntimeError, match="Input field hashes have changed"): - task() - - -def test_hash_changes_in_task_inputs_unstable(tmp_path): - @attrs.define - class Unstable: - value: int # type: ignore - - def __bytes_repr__(self, cache) -> ty.Iterator[bytes]: - """Random 128-bit bytestring""" - yield secrets.token_bytes(16) - - @mark.task - def unstable_input(unstable: Unstable) -> int: - return unstable.value - - task = unstable_input(unstable=Unstable(1)) - with pytest.raises(RuntimeError, match="Input field hashes have changed"): - task() - - -def test_hash_changes_in_workflow_inputs(tmp_path): - @mark.task - def output_dir_as_output(out_dir: Path) -> Directory: - (out_dir / "new-file.txt").touch() - return out_dir - - wf = Workflow( - name="test_hash_change", input_spec={"in_dir": Directory}, in_dir=tmp_path - ) - wf.add(output_dir_as_output(out_dir=wf.lzin.in_dir, name="task")) - wf.set_output(("out_dir", wf.task.lzout.out)) - with pytest.raises(RuntimeError, match="Input field hashes have changed.*Workflow"): - wf() - - -def test_hash_changes_in_workflow_graph(tmpdir): - class X: - """Dummy class with unstable hash (i.e. which isn't altered in a node in which - it is an input)""" - - value = 1 - - def __bytes_repr__(self, cache): - """Bytes representation from class attribute, which will be changed be - 'alter_x" node. - - NB: this is a contrived example where the bytes_repr implementation returns - a bytes representation of a class attribute in order to trigger the exception, - hopefully cases like this will be very rare""" - yield bytes(self.value) - - @mark.task - @mark.annotate({"return": {"x": X, "y": int}}) - def identity(x: X) -> ty.Tuple[X, int]: - return x, 99 - - @mark.task - def alter_x(y): - X.value = 2 - return y - - @mark.task - def to_tuple(x, y): - return (x, y) - - wf = Workflow(name="wf_with_blocked_tasks", input_spec=["x", "y"]) - wf.add(identity(name="taska", x=wf.lzin.x)) - wf.add(alter_x(name="taskb", y=wf.taska.lzout.y)) - wf.add(to_tuple(name="taskc", x=wf.taska.lzout.x, y=wf.taskb.lzout.out)) - wf.set_output([("out", wf.taskc.lzout.out)]) - - wf.inputs.x = X() - - wf.cache_dir = tmpdir - - with pytest.raises( - RuntimeError, match="Graph of 'wf_with_blocked_tasks' workflow is not empty" - ): - with Submitter("cf") as sub: - result = sub(wf) - - -@mark.task -def to_tuple(x, y): - return (x, y) - - -class BYOAddVarWorker(SerialWorker): - """A dummy worker that adds 1 to the output of the task""" - - plugin_name = "byo_add_env_var" - - def __init__(self, add_var, **kwargs): - super().__init__(**kwargs) - self.add_var = add_var - - async def exec_serial(self, runnable, rerun=False, environment=None): - if isinstance(runnable, TaskBase): - with patch.dict(os.environ, {"BYO_ADD_VAR": str(self.add_var)}): - result = runnable._run(rerun, environment=environment) - return result - else: # it could be tuple that includes pickle files with tasks and inputs - return super().exec_serial(runnable, rerun, environment) - - -@mark.task -def add_env_var_task(x: int) -> int: - return x + int(os.environ.get("BYO_ADD_VAR", 0)) - - -def test_byo_worker(): - - task1 = add_env_var_task(x=1) - - with Submitter(plugin=BYOAddVarWorker, add_var=10) as sub: - assert sub.plugin == "byo_add_env_var" - result = task1(submitter=sub) - - assert 
result.output.out == 11 - - task2 = add_env_var_task(x=2) - - with Submitter(plugin="serial") as sub: - result = task2(submitter=sub) - - assert result.output.out == 2 - - -def test_bad_builtin_worker(): - - with pytest.raises(NotImplementedError, match="No worker for 'bad-worker' plugin"): - Submitter(plugin="bad-worker") - - -def test_bad_byo_worker(): - - class BadWorker: - pass - - with pytest.raises( - ValueError, match="Worker class must have a 'plugin_name' str attribute" - ): - Submitter(plugin=BadWorker) diff --git a/pydra/engine/tests/test_task.py b/pydra/engine/tests/test_task.py deleted file mode 100644 index 0d666574e3..0000000000 --- a/pydra/engine/tests/test_task.py +++ /dev/null @@ -1,1584 +0,0 @@ -import typing as ty -import os, sys -import attr -import pytest -import cloudpickle as cp -from pathlib import Path -import json -import glob as glob -from ... import mark -from ..core import Workflow -from ..task import AuditFlag, ShellCommandTask -from ...utils.messenger import FileMessenger, PrintMessenger, collect_messages -from .utils import gen_basic_wf -from ..specs import ( - MultiInputObj, - MultiOutputObj, - SpecInfo, - FunctionSpec, - BaseSpec, - ShellSpec, - File, -) -from ...utils.hash import hash_function - - -no_win = pytest.mark.skipif( - sys.platform.startswith("win"), - reason="docker/singularity command not adjusted for windows", -) - - -@mark.task -def funaddtwo(a): - return a + 2 - - -def test_output(): - nn = funaddtwo(a=3) - res = nn._run() - assert res.output.out == 5 - - -def test_name_conflict(): - """raise error if task name conflicts with a class attribute or method""" - with pytest.raises(ValueError) as excinfo1: - funaddtwo(name="split", a=3) - assert "Cannot use names of attributes or methods" in str(excinfo1.value) - with pytest.raises(ValueError) as excinfo2: - funaddtwo(name="checksum", a=3) - assert "Cannot use names of attributes or methods" in str(excinfo2.value) - - -def test_numpy(): - """checking if mark.task works for numpy functions""" - np = pytest.importorskip("numpy") - fft = mark.annotate({"a": np.ndarray, "return": np.ndarray})(np.fft.fft) - fft = mark.task(fft)() - arr = np.array([[1, 10], [2, 20]]) - fft.inputs.a = arr - res = fft() - assert np.allclose(np.fft.fft(arr), res.output.out) - - -@pytest.mark.xfail(reason="cp.dumps(func) depends on the system/setup, TODO!!") -def test_checksum(): - nn = funaddtwo(a=3) - assert ( - nn.checksum - == "FunctionTask_abb4e7cc03b13d0e73884b87d142ed5deae6a312275187a9d8df54407317d7d3" - ) - - -def test_annotated_func(): - @mark.task - def testfunc( - a: int, b: float = 0.1 - ) -> ty.NamedTuple("Output", [("out_out", float)]): - return a + b - - funky = testfunc(a=1) - assert hasattr(funky.inputs, "a") - assert hasattr(funky.inputs, "b") - assert hasattr(funky.inputs, "_func") - assert getattr(funky.inputs, "a") == 1 - assert getattr(funky.inputs, "b") == 0.1 - assert getattr(funky.inputs, "_func") is not None - assert set(funky.output_names) == {"out_out"} - # assert funky.inputs.hash == '17772c3aec9540a8dd3e187eecd2301a09c9a25c6e371ddd86e31e3a1ecfeefa' - assert funky.__class__.__name__ + "_" + funky.inputs.hash == funky.checksum - - result = funky() - assert hasattr(result, "output") - assert hasattr(result.output, "out_out") - assert result.output.out_out == 1.1 - - assert os.path.exists(funky.cache_dir / funky.checksum / "_result.pklz") - funky.result() # should not recompute - funky.inputs.a = 2 - # assert funky.checksum == '537d25885fd2ea5662b7701ba02c132c52a9078a3a2d56aa903a777ea90e5536' - 
assert funky.result() is None - funky() - result = funky.result() - assert result.output.out_out == 2.1 - - help = funky.help(returnhelp=True) - assert help == [ - "Help for FunctionTask", - "Input Parameters:", - "- a: int", - "- b: float (default: 0.1)", - "- _func: bytes", - "Output Parameters:", - "- out_out: float", - ] - - -def test_annotated_func_dictreturn(): - """Test mapping from returned dictionary to output spec.""" - - @mark.task - @mark.annotate({"return": {"sum": int, "mul": ty.Optional[int]}}) - def testfunc(a: int, b: int): - return dict(sum=a + b, diff=a - b) - - task = testfunc(a=2, b=3) - result = task() - - # Part of the annotation and returned, should be exposed to output. - assert result.output.sum == 5 - - # Part of the annotation but not returned, should be coalesced to None - assert result.output.mul is None - - # Not part of the annotation, should be discarded. - assert not hasattr(result.output, "diff") - - -def test_annotated_func_multreturn(): - """the function has two elements in the return statement""" - - @mark.task - def testfunc( - a: float, - ) -> ty.NamedTuple("Output", [("fractional", float), ("integer", int)]): - import math - - return math.modf(a)[0], int(math.modf(a)[1]) - - funky = testfunc(a=3.5) - assert hasattr(funky.inputs, "a") - assert hasattr(funky.inputs, "_func") - assert getattr(funky.inputs, "a") == 3.5 - assert getattr(funky.inputs, "_func") is not None - assert set(funky.output_names) == {"fractional", "integer"} - assert funky.__class__.__name__ + "_" + funky.inputs.hash == funky.checksum - - result = funky() - assert os.path.exists(funky.cache_dir / funky.checksum / "_result.pklz") - assert hasattr(result, "output") - assert hasattr(result.output, "fractional") - assert result.output.fractional == 0.5 - assert hasattr(result.output, "integer") - assert result.output.integer == 3 - - help = funky.help(returnhelp=True) - assert help == [ - "Help for FunctionTask", - "Input Parameters:", - "- a: float", - "- _func: bytes", - "Output Parameters:", - "- fractional: float", - "- integer: int", - ] - - -def test_annotated_input_func_1(): - """the function with annotated input (float)""" - - @mark.task - def testfunc(a: float): - return a - - funky = testfunc(a=3.5) - assert getattr(funky.inputs, "a") == 3.5 - - -def test_annotated_input_func_2(): - """the function with annotated input (int, but float provided)""" - - @mark.task - def testfunc(a: int): - return a - - with pytest.raises(TypeError): - testfunc(a=3.5) - - -def test_annotated_input_func_2a(): - """the function with annotated input (int, but float provided)""" - - @mark.task - def testfunc(a: int): - return a - - funky = testfunc() - with pytest.raises(TypeError): - funky.inputs.a = 3.5 - - -def test_annotated_input_func_3(): - """the function with annotated input (list)""" - - @mark.task - def testfunc(a: list): - return sum(a) - - funky = testfunc(a=[1, 3.5]) - assert getattr(funky.inputs, "a") == [1, 3.5] - - -def test_annotated_input_func_3a(): - """the function with annotated input (list of floats)""" - - @mark.task - def testfunc(a: ty.List[float]): - return sum(a) - - funky = testfunc(a=[1.0, 3.5]) - assert getattr(funky.inputs, "a") == [1.0, 3.5] - - -def test_annotated_input_func_3b(): - """the function with annotated input - (list of floats - int and float provided, should be fine) - """ - - @mark.task - def testfunc(a: ty.List[float]): - return sum(a) - - funky = testfunc(a=[1, 3.5]) - assert getattr(funky.inputs, "a") == [1, 3.5] - - -def 
test_annotated_input_func_3c_excep(): - """the function with annotated input - (list of ints - int and float provided, should raise an error) - """ - - @mark.task - def testfunc(a: ty.List[int]): - return sum(a) - - with pytest.raises(TypeError): - testfunc(a=[1, 3.5]) - - -def test_annotated_input_func_4(): - """the function with annotated input (dictionary)""" - - @mark.task - def testfunc(a: dict): - return sum(a.values()) - - funky = testfunc(a={"el1": 1, "el2": 3.5}) - assert getattr(funky.inputs, "a") == {"el1": 1, "el2": 3.5} - - -def test_annotated_input_func_4a(): - """the function with annotated input (dictionary of floats)""" - - @mark.task - def testfunc(a: ty.Dict[str, float]): - return sum(a.values()) - - funky = testfunc(a={"el1": 1, "el2": 3.5}) - assert getattr(funky.inputs, "a") == {"el1": 1, "el2": 3.5} - - -def test_annotated_input_func_4b_excep(): - """the function with annotated input (dictionary of ints, but float provided)""" - - @mark.task - def testfunc(a: ty.Dict[str, int]): - return sum(a.values()) - - with pytest.raises(TypeError): - testfunc(a={"el1": 1, "el2": 3.5}) - - -def test_annotated_input_func_5(): - """the function with annotated more complex input type (ty.List in ty.Dict) - the validator should simply check if values of dict are lists - so no error for 3.5 - """ - - @mark.task - def testfunc(a: ty.Dict[str, ty.List]): - return sum(a["el1"]) - - funky = testfunc(a={"el1": [1, 3.5]}) - assert getattr(funky.inputs, "a") == {"el1": [1, 3.5]} - - -def test_annotated_input_func_5a_except(): - """the function with annotated more complex input type (ty.Dict in ty.Dict) - list is provided as a dict value (instead a dict), so error is raised - """ - - @mark.task - def testfunc(a: ty.Dict[str, ty.Dict[str, float]]): - return sum(a["el1"]) - - with pytest.raises(TypeError): - testfunc(a={"el1": [1, 3.5]}) - - -def test_annotated_input_func_6(): - """the function with annotated more complex input type (ty.Union in ty.Dict) - the validator should unpack values from the Union - """ - - @mark.task - def testfunc(a: ty.Dict[str, ty.Union[float, int]]): - return sum(a["el1"]) - - funky = testfunc(a={"el1": 1, "el2": 3.5}) - assert getattr(funky.inputs, "a") == {"el1": 1, "el2": 3.5} - - -def test_annotated_input_func_6a_excep(): - """the function with annotated more complex input type (ty.Union in ty.Dict) - the validator should unpack values from the Union and raise an error for 3.5 - """ - - @mark.task - def testfunc(a: ty.Dict[str, ty.Union[str, int]]): - return sum(a["el1"]) - - with pytest.raises(TypeError): - testfunc(a={"el1": 1, "el2": 3.5}) - - -def test_annotated_input_func_7(): - """the function with annotated input (float) - the task has a splitter, so list of float is provided - it should work, the validator tries to guess if this is a field with a splitter - """ - - @mark.task - def testfunc(a: float): - return a - - funky = testfunc().split("a", a=[3.5, 2.1]) - assert getattr(funky.inputs, "a") == [3.5, 2.1] - - -def test_annotated_input_func_7a_excep(): - """the function with annotated input (int) and splitter - list of float provided - should raise an error (list of int would be fine) - """ - - @mark.task - def testfunc(a: int): - return a - - with pytest.raises(TypeError): - testfunc(a=[3.5, 2.1]).split("a") - - -def test_annotated_input_func_8(): - """the function with annotated input as MultiInputObj - a single value is provided and should be converted to a list - """ - - @mark.task - def testfunc(a: MultiInputObj): - return len(a) - - funky = 
testfunc(a=3.5) - assert getattr(funky.inputs, "a") == [3.5] - res = funky() - assert res.output.out == 1 - - -def test_annotated_input_func_8a(): - """the function with annotated input as MultiInputObj - a 1-el list is provided so shouldn't be changed - """ - - @mark.task - def testfunc(a: MultiInputObj): - return len(a) - - funky = testfunc(a=[3.5]) - assert getattr(funky.inputs, "a") == [3.5] - res = funky() - assert res.output.out == 1 - - -def test_annotated_input_func_8b(): - """the function with annotated input as MultiInputObj - a single value is provided after initial. the task - (input should still be converted to a list) - """ - - @mark.task - def testfunc(a: MultiInputObj): - return len(a) - - funky = testfunc() - # setting a after init - funky.inputs.a = 3.5 - assert getattr(funky.inputs, "a") == [3.5] - res = funky() - assert res.output.out == 1 - - -def test_annotated_func_multreturn_exception(): - """function has two elements in the return statement, - but three element provided in the spec - should raise an error - """ - - @mark.task - def testfunc( - a: float, - ) -> ty.NamedTuple( - "Output", [("fractional", float), ("integer", int), ("who_knows", int)] - ): - import math - - return math.modf(a) - - funky = testfunc(a=3.5) - with pytest.raises(Exception) as excinfo: - funky() - assert "expected 3 elements" in str(excinfo.value) - - -def test_halfannotated_func(): - @mark.task - def testfunc(a, b) -> int: - return a + b - - funky = testfunc(a=10, b=20) - assert hasattr(funky.inputs, "a") - assert hasattr(funky.inputs, "b") - assert hasattr(funky.inputs, "_func") - assert getattr(funky.inputs, "a") == 10 - assert getattr(funky.inputs, "b") == 20 - assert getattr(funky.inputs, "_func") is not None - assert set(funky.output_names) == {"out"} - assert funky.__class__.__name__ + "_" + funky.inputs.hash == funky.checksum - - result = funky() - assert hasattr(result, "output") - assert hasattr(result.output, "out") - assert result.output.out == 30 - - assert os.path.exists(funky.cache_dir / funky.checksum / "_result.pklz") - - funky.result() # should not recompute - funky.inputs.a = 11 - assert funky.result() is None - funky() - result = funky.result() - assert result.output.out == 31 - help = funky.help(returnhelp=True) - - assert help == [ - "Help for FunctionTask", - "Input Parameters:", - "- a: _empty", - "- b: _empty", - "- _func: bytes", - "Output Parameters:", - "- out: int", - ] - - -def test_halfannotated_func_multreturn(): - @mark.task - def testfunc(a, b) -> (int, int): - return a + 1, b + 1 - - funky = testfunc(a=10, b=20) - assert hasattr(funky.inputs, "a") - assert hasattr(funky.inputs, "b") - assert hasattr(funky.inputs, "_func") - assert getattr(funky.inputs, "a") == 10 - assert getattr(funky.inputs, "b") == 20 - assert getattr(funky.inputs, "_func") is not None - assert set(funky.output_names) == {"out1", "out2"} - assert funky.__class__.__name__ + "_" + funky.inputs.hash == funky.checksum - - result = funky() - assert hasattr(result, "output") - assert hasattr(result.output, "out1") - assert result.output.out1 == 11 - - assert os.path.exists(funky.cache_dir / funky.checksum / "_result.pklz") - - funky.result() # should not recompute - funky.inputs.a = 11 - assert funky.result() is None - funky() - result = funky.result() - assert result.output.out1 == 12 - help = funky.help(returnhelp=True) - - assert help == [ - "Help for FunctionTask", - "Input Parameters:", - "- a: _empty", - "- b: _empty", - "- _func: bytes", - "Output Parameters:", - "- out1: int", - "- 
out2: int", - ] - - -def test_notannotated_func(): - @mark.task - def no_annots(c, d): - return c + d - - natask = no_annots(c=17, d=3.2) - assert hasattr(natask.inputs, "c") - assert hasattr(natask.inputs, "d") - assert hasattr(natask.inputs, "_func") - - result = natask._run() - assert hasattr(result, "output") - assert hasattr(result.output, "out") - assert result.output.out == 20.2 - - -def test_notannotated_func_returnlist(): - @mark.task - def no_annots(c, d): - return [c, d] - - natask = no_annots(c=17, d=3.2) - result = natask._run() - assert hasattr(result.output, "out") - assert result.output.out == [17, 3.2] - - -def test_halfannotated_func_multrun_returnlist(): - @mark.task - def no_annots(c, d) -> (list, float): - return [c, d], c + d - - natask = no_annots(c=17, d=3.2) - result = natask._run() - - assert hasattr(result.output, "out1") - assert hasattr(result.output, "out2") - assert result.output.out1 == [17, 3.2] - assert result.output.out2 == 20.2 - - -def test_notannotated_func_multreturn(): - """no annotation and multiple values are returned - all elements should be returned as a tuple and set to "out" - """ - - @mark.task - def no_annots(c, d): - return c + d, c - d - - natask = no_annots(c=17, d=3.2) - assert hasattr(natask.inputs, "c") - assert hasattr(natask.inputs, "d") - assert hasattr(natask.inputs, "_func") - - result = natask._run() - assert hasattr(result, "output") - assert hasattr(result.output, "out") - assert result.output.out == (20.2, 13.8) - - -def test_input_spec_func_1(): - """the function w/o annotated, but input_spec is used""" - - @mark.task - def testfunc(a): - return a - - my_input_spec = SpecInfo( - name="Input", - fields=[("a", attr.ib(type=float, metadata={"help_string": "input a"}))], - bases=(FunctionSpec,), - ) - - funky = testfunc(a=3.5, input_spec=my_input_spec) - assert getattr(funky.inputs, "a") == 3.5 - - -def test_input_spec_func_1a_except(): - """the function w/o annotated, but input_spec is used - a TypeError is raised (float is provided instead of int) - """ - - @mark.task - def testfunc(a): - return a - - my_input_spec = SpecInfo( - name="Input", - fields=[("a", attr.ib(type=int, metadata={"help_string": "input a"}))], - bases=(FunctionSpec,), - ) - with pytest.raises(TypeError): - testfunc(a=3.5, input_spec=my_input_spec) - - -def test_input_spec_func_1b_except(): - """the function w/o annotated, but input_spec is used - metadata checks raise an error - """ - - @mark.task - def testfunc(a): - return a - - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "a", - attr.ib(type=float, metadata={"position": 1, "help_string": "input a"}), - ) - ], - bases=(FunctionSpec,), - ) - with pytest.raises(AttributeError, match="only these keys are supported"): - testfunc(a=3.5, input_spec=my_input_spec) - - -def test_input_spec_func_1d_except(): - """the function w/o annotated, but input_spec is used - input_spec doesn't contain 'a' input, an error is raised - """ - - @mark.task - def testfunc(a): - return a - - my_input_spec = SpecInfo(name="Input", fields=[], bases=(FunctionSpec,)) - funky = testfunc(a=3.5, input_spec=my_input_spec) - with pytest.raises(TypeError, match="missing 1 required positional argument"): - funky() - - -def test_input_spec_func_2(): - """the function with annotation, and the task has input_spec, - input_spec changes the type of the input (so error is not raised) - """ - - @mark.task - def testfunc(a: int): - return a - - my_input_spec = SpecInfo( - name="Input", - fields=[("a", attr.ib(type=float, 
metadata={"help_string": "input a"}))], - bases=(FunctionSpec,), - ) - - funky = testfunc(a=3.5, input_spec=my_input_spec) - assert getattr(funky.inputs, "a") == 3.5 - - -def test_input_spec_func_2a(): - """the function with annotation, and the task has input_spec, - input_spec changes the type of the input (so error is not raised) - using the shorter syntax - """ - - @mark.task - def testfunc(a: int): - return a - - my_input_spec = SpecInfo( - name="Input", - fields=[("a", float, {"help_string": "input a"})], - bases=(FunctionSpec,), - ) - - funky = testfunc(a=3.5, input_spec=my_input_spec) - assert getattr(funky.inputs, "a") == 3.5 - - -def test_input_spec_func_3(): - """the function w/o annotated, but input_spec is used - additional keys (allowed_values) are used in metadata - """ - - @mark.task - def testfunc(a): - return a - - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "a", - attr.ib( - type=int, - metadata={"help_string": "input a", "allowed_values": [0, 1, 2]}, - ), - ) - ], - bases=(FunctionSpec,), - ) - - funky = testfunc(a=2, input_spec=my_input_spec) - assert getattr(funky.inputs, "a") == 2 - - -def test_input_spec_func_3a_except(): - """the function w/o annotated, but input_spec is used - allowed_values is used in metadata and the ValueError is raised - """ - - @mark.task - def testfunc(a): - return a - - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "a", - attr.ib( - type=int, - metadata={"help_string": "input a", "allowed_values": [0, 1, 2]}, - ), - ) - ], - bases=(FunctionSpec,), - ) - - with pytest.raises(ValueError, match="value of a has to be"): - testfunc(a=3, input_spec=my_input_spec) - - -def test_input_spec_func_4(): - """the function with a default value for b - but b is set as mandatory in the input_spec, so error is raised if not provided - """ - - @mark.task - def testfunc(a, b=1): - return a + b - - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "a", - attr.ib( - type=int, metadata={"help_string": "input a", "mandatory": True} - ), - ), - ( - "b", - attr.ib( - type=int, metadata={"help_string": "input b", "mandatory": True} - ), - ), - ], - bases=(FunctionSpec,), - ) - - funky = testfunc(a=2, input_spec=my_input_spec) - with pytest.raises(Exception, match="b is mandatory"): - funky() - - -def test_input_spec_func_4a(): - """the function with a default value for b and metadata in the input_spec - has a different default value, so value from the function is overwritten - """ - - @mark.task - def testfunc(a, b=1): - return a + b - - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "a", - attr.ib( - type=int, metadata={"help_string": "input a", "mandatory": True} - ), - ), - ("b", attr.ib(type=int, default=10, metadata={"help_string": "input b"})), - ], - bases=(FunctionSpec,), - ) - - funky = testfunc(a=2, input_spec=my_input_spec) - res = funky() - assert res.output.out == 12 - - -def test_input_spec_func_5(): - """the FunctionTask with input_spec, a input has MultiInputObj type - a single value is provided and should be converted to a list - """ - - @mark.task - def testfunc(a): - return len(a) - - my_input_spec = SpecInfo( - name="Input", - fields=[ - ("a", attr.ib(type=MultiInputObj, metadata={"help_string": "input a"})) - ], - bases=(FunctionSpec,), - ) - - funky = testfunc(a=3.5, input_spec=my_input_spec) - assert getattr(funky.inputs, "a") == MultiInputObj([3.5]) - res = funky() - assert res.output.out == 1 - - -def test_output_spec_func_1(): - """the function w/o annotated, but output_spec is used""" - - 
@mark.task - def testfunc(a): - return a - - my_output_spec = SpecInfo( - name="Output", - fields=[("out1", attr.ib(type=float, metadata={"help_string": "output"}))], - bases=(BaseSpec,), - ) - - funky = testfunc(a=3.5, output_spec=my_output_spec) - res = funky() - assert res.output.out1 == 3.5 - - -def test_output_spec_func_1a_except(): - """the function w/o annotated, but output_spec is used - float returned instead of int - TypeError - """ - - @mark.task - def testfunc(a): - return a - - my_output_spec = SpecInfo( - name="Output", - fields=[("out1", attr.ib(type=int, metadata={"help_string": "output"}))], - bases=(BaseSpec,), - ) - - funky = testfunc(a=3.5, output_spec=my_output_spec) - with pytest.raises(TypeError): - funky() - - -def test_output_spec_func_2(): - """the function w/o annotated, but output_spec is used - output_spec changes the type of the output (so error is not raised) - """ - - @mark.task - def testfunc(a) -> int: - return a - - my_output_spec = SpecInfo( - name="Output", - fields=[("out1", attr.ib(type=float, metadata={"help_string": "output"}))], - bases=(BaseSpec,), - ) - - funky = testfunc(a=3.5, output_spec=my_output_spec) - res = funky() - assert res.output.out1 == 3.5 - - -def test_output_spec_func_2a(): - """the function w/o annotated, but output_spec is used - output_spec changes the type of the output (so error is not raised) - using a shorter syntax - """ - - @mark.task - def testfunc(a) -> int: - return a - - my_output_spec = SpecInfo( - name="Output", - fields=[("out1", float, {"help_string": "output"})], - bases=(BaseSpec,), - ) - - funky = testfunc(a=3.5, output_spec=my_output_spec) - res = funky() - assert res.output.out1 == 3.5 - - -def test_output_spec_func_3(): - """the function w/o annotated, but output_spec is used - MultiOutputObj is used, output is a 2-el list, so converter doesn't do anything - """ - - @mark.task - def testfunc(a, b): - return [a, b] - - my_output_spec = SpecInfo( - name="Output", - fields=[ - ( - "out_list", - attr.ib(type=MultiOutputObj, metadata={"help_string": "output"}), - ) - ], - bases=(BaseSpec,), - ) - - funky = testfunc(a=3.5, b=1, output_spec=my_output_spec) - res = funky() - assert res.output.out_list == [3.5, 1] - - -def test_output_spec_func_4(): - """the function w/o annotated, but output_spec is used - MultiOutputObj is used, output is a 1el list, so converter return the element - """ - - @mark.task - def testfunc(a): - return [a] - - my_output_spec = SpecInfo( - name="Output", - fields=[ - ( - "out_1el", - attr.ib(type=MultiOutputObj, metadata={"help_string": "output"}), - ) - ], - bases=(BaseSpec,), - ) - - funky = testfunc(a=3.5, output_spec=my_output_spec) - res = funky() - assert res.output.out_1el == 3.5 - - -def test_exception_func(): - @mark.task - def raise_exception(c, d): - raise Exception() - - bad_funk = raise_exception(c=17, d=3.2) - assert pytest.raises(Exception, bad_funk) - - -def test_result_none_1(): - """checking if None is properly returned as the result""" - - @mark.task - def fun_none(x): - return None - - task = fun_none(name="none", x=3) - res = task() - assert res.output.out is None - - -def test_result_none_2(): - """checking if None is properly set for all outputs""" - - @mark.task - def fun_none(x) -> (ty.Any, ty.Any): - return None - - task = fun_none(name="none", x=3) - res = task() - assert res.output.out1 is None - assert res.output.out2 is None - - -def test_audit_prov( - tmpdir, -): - @mark.task - def testfunc(a: int, b: float = 0.1) -> ty.NamedTuple("Output", [("out", 
float)]): - return a + b - - # printing the audit message - funky = testfunc(a=1, audit_flags=AuditFlag.PROV, messengers=PrintMessenger()) - funky.cache_dir = tmpdir - funky() - - # saving the audit message into the file - funky = testfunc(a=2, audit_flags=AuditFlag.PROV, messengers=FileMessenger()) - funky.cache_dir = tmpdir - funky() - # this should be the default loctaion - message_path = tmpdir / funky.checksum / "messages" - assert (tmpdir / funky.checksum / "messages").exists() - - collect_messages(tmpdir / funky.checksum, message_path, ld_op="compact") - assert (tmpdir / funky.checksum / "messages.jsonld").exists() - - -def test_audit_task(tmpdir): - @mark.task - def testfunc(a: int, b: float = 0.1) -> ty.NamedTuple("Output", [("out", float)]): - return a + b - - from glob import glob - - funky = testfunc(a=2, audit_flags=AuditFlag.PROV, messengers=FileMessenger()) - funky.cache_dir = tmpdir - funky() - message_path = tmpdir / funky.checksum / "messages" - - for file in glob(str(message_path) + "/*.jsonld"): - with open(file) as f: - data = json.load(f) - if "@type" in data: - if "AssociatedWith" in data: - assert "testfunc" in data["Label"] - - if "@type" in data: - if data["@type"] == "input": - assert None is data["Label"] - if "AssociatedWith" in data: - assert None is data["AssociatedWith"] - - # assert any(json_content) - - -def test_audit_shellcommandtask(tmpdir): - args = "-l" - shelly = ShellCommandTask( - name="shelly", - executable="ls", - args=args, - audit_flags=AuditFlag.PROV, - messengers=FileMessenger(), - ) - - from glob import glob - - shelly.cache_dir = tmpdir - shelly() - message_path = tmpdir / shelly.checksum / "messages" - # go through each jsonld file in message_path and check if the label field exists - - command_content = [] - - for file in glob(str(message_path) + "/*.jsonld"): - with open(file) as f: - data = json.load(f) - - if "@type" in data: - if "AssociatedWith" in data: - assert "shelly" in data["Label"] - - if "@type" in data: - if data["@type"] == "input": - assert data["Label"] is None - - if "Command" in data: - command_content.append(True) - assert "ls -l" == data["Command"] - - assert any(command_content) - - -def test_audit_shellcommandtask_file(tmp_path): - # sourcery skip: use-fstring-for-concatenation - import glob - import shutil - - # create test.txt file with "This is a test" in it in the tmpdir - # create txt file in cwd - with open("test.txt", "w") as f: - f.write("This is a test") - - with open("test2.txt", "w") as f: - f.write("This is a test") - - # copy the test.txt file to the tmpdir - shutil.copy("test.txt", tmp_path) - shutil.copy("test2.txt", tmp_path) - - cmd = "cat" - file_in = File(tmp_path / "test.txt") - file_in_2 = File(tmp_path / "test2.txt") - test_file_hash = hash_function(file_in) - test_file_hash_2 = hash_function(file_in_2) - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "in_file", - attr.ib( - type=File, - metadata={ - "position": 1, - "argstr": "", - "help_string": "text", - "mandatory": True, - }, - ), - ), - ( - "in_file_2", - attr.ib( - type=File, - metadata={ - "position": 2, - "argstr": "", - "help_string": "text", - "mandatory": True, - }, - ), - ), - ], - bases=(ShellSpec,), - ) - shelly = ShellCommandTask( - name="shelly", - in_file=file_in, - in_file_2=file_in_2, - input_spec=my_input_spec, - executable=cmd, - audit_flags=AuditFlag.PROV, - messengers=FileMessenger(), - ) - shelly.cache_dir = tmp_path - results = shelly() - message_path = tmp_path / shelly.checksum / "messages" - for file in 
glob.glob(str(message_path) + "/*.jsonld"): - with open(file) as x: - data = json.load(x) - if "@type" in data: - if data["@type"] == "input": - if data["Label"] == "in_file": - assert data["AtLocation"] == str(file_in) - assert data["digest"] == test_file_hash - if data["Label"] == "in_file_2": - assert data["AtLocation"] == str(file_in_2) - assert data["digest"] == test_file_hash_2 - - -def test_audit_shellcommandtask_version(tmpdir): - import subprocess as sp - - version_cmd = sp.run("less --version", shell=True, stdout=sp.PIPE).stdout.decode( - "utf-8" - ) - version_cmd = version_cmd.splitlines()[0] - cmd = "less" - shelly = ShellCommandTask( - name="shelly", - executable=cmd, - args="test_task.py", - audit_flags=AuditFlag.PROV, - messengers=FileMessenger(), - ) - - import glob - - shelly.cache_dir = tmpdir - shelly() - message_path = tmpdir / shelly.checksum / "messages" - # go through each jsonld file in message_path and check if the label field exists - version_content = [] - for file in glob.glob(str(message_path) + "/*.jsonld"): - with open(file) as f: - data = json.load(f) - if "AssociatedWith" in data: - if version_cmd in data["AssociatedWith"]: - version_content.append(True) - - assert any(version_content) - - -def test_audit_prov_messdir_1( - tmpdir, -): - """customized messenger dir""" - - @mark.task - def testfunc(a: int, b: float = 0.1) -> ty.NamedTuple("Output", [("out", float)]): - return a + b - - # printing the audit message - funky = testfunc(a=1, audit_flags=AuditFlag.PROV, messengers=PrintMessenger()) - funky.cache_dir = tmpdir - funky() - - # saving the audit message into the file - funky = testfunc(a=2, audit_flags=AuditFlag.PROV, messengers=FileMessenger()) - # user defined path - message_path = tmpdir / funky.checksum / "my_messages" - funky.cache_dir = tmpdir - # providing messenger_dir for audit - funky.audit.messenger_args = dict(message_dir=message_path) - funky() - assert (tmpdir / funky.checksum / "my_messages").exists() - - collect_messages(tmpdir / funky.checksum, message_path, ld_op="compact") - assert (tmpdir / funky.checksum / "messages.jsonld").exists() - - -def test_audit_prov_messdir_2( - tmpdir, -): - """customized messenger dir in init""" - - @mark.task - def testfunc(a: int, b: float = 0.1) -> ty.NamedTuple("Output", [("out", float)]): - return a + b - - # printing the audit message - funky = testfunc(a=1, audit_flags=AuditFlag.PROV, messengers=PrintMessenger()) - funky.cache_dir = tmpdir - funky() - - # user defined path (doesn't depend on checksum, can be defined before init) - message_path = tmpdir / "my_messages" - # saving the audit message into the file - funky = testfunc( - a=2, - audit_flags=AuditFlag.PROV, - messengers=FileMessenger(), - messenger_args=dict(message_dir=message_path), - ) - funky.cache_dir = tmpdir - # providing messenger_dir for audit - funky() - assert (tmpdir / "my_messages").exists() - - collect_messages(tmpdir, message_path, ld_op="compact") - assert (tmpdir / "messages.jsonld").exists() - - -def test_audit_prov_wf( - tmpdir, -): - """FileMessenger for wf""" - - @mark.task - def testfunc(a: int, b: float = 0.1) -> ty.NamedTuple("Output", [("out", float)]): - return a + b - - wf = Workflow( - name="wf", - input_spec=["x"], - cache_dir=tmpdir, - audit_flags=AuditFlag.PROV, - messengers=FileMessenger(), - ) - wf.add(testfunc(name="testfunc", a=wf.lzin.x)) - wf.set_output([("out", wf.testfunc.lzout.out)]) - wf.inputs.x = 2 - - wf(plugin="cf") - # default path - message_path = tmpdir / wf.checksum / "messages" - assert 
message_path.exists() - - collect_messages(tmpdir / wf.checksum, message_path, ld_op="compact") - assert (tmpdir / wf.checksum / "messages.jsonld").exists() - - -def test_audit_all( - tmpdir, -): - @mark.task - def testfunc(a: int, b: float = 0.1) -> ty.NamedTuple("Output", [("out", float)]): - return a + b - - funky = testfunc(a=2, audit_flags=AuditFlag.ALL, messengers=FileMessenger()) - message_path = tmpdir / funky.checksum / "messages" - funky.cache_dir = tmpdir - funky.audit.messenger_args = dict(message_dir=message_path) - funky() - from glob import glob - - assert len(glob(str(tmpdir / funky.checksum / "proc*.log"))) == 1 - assert len(glob(str(message_path / "*.jsonld"))) == 7 - - # commented out to speed up testing - collect_messages(tmpdir / funky.checksum, message_path, ld_op="compact") - assert (tmpdir / funky.checksum / "messages.jsonld").exists() - - -@no_win -def test_shell_cmd(tmpdir): - cmd = ["echo", "hail", "pydra"] - - # all args given as executable - shelly = ShellCommandTask(name="shelly", executable=cmd) - assert shelly.cmdline == " ".join(cmd) - res = shelly._run() - assert res.output.stdout == " ".join(cmd[1:]) + "\n" - - # separate command into exec + args - shelly = ShellCommandTask(executable=cmd[0], args=cmd[1:]) - assert shelly.inputs.executable == "echo" - assert shelly.cmdline == " ".join(cmd) - res = shelly._run() - assert res.output.return_code == 0 - assert res.output.stdout == " ".join(cmd[1:]) + "\n" - - -def test_functask_callable(tmpdir): - # no submitter or plugin - foo = funaddtwo(a=1) - res = foo() - assert res.output.out == 3 - assert foo.plugin is None - - # plugin - bar = funaddtwo(a=2) - res = bar(plugin="cf") - assert res.output.out == 4 - assert bar.plugin is None - - foo2 = funaddtwo(a=3) - foo2.plugin = "cf" - res = foo2() - assert res.output.out == 5 - assert foo2.plugin == "cf" - - -def test_taskhooks_1(tmpdir, capsys): - foo = funaddtwo(name="foo", a=1, cache_dir=tmpdir) - assert foo.hooks - # ensure all hooks are defined - for attr in ("pre_run", "post_run", "pre_run_task", "post_run_task"): - hook = getattr(foo.hooks, attr) - assert hook() is None - - def myhook(task, *args): - print("I was called") - - foo.hooks.pre_run = myhook - foo() - captured = capsys.readouterr() - assert "I was called\n" in captured.out - del captured - - # setting unknown hook should not be allowed - with pytest.raises(AttributeError): - foo.hooks.mid_run = myhook - - # set all hooks - foo.hooks.post_run = myhook - foo.hooks.pre_run_task = myhook - foo.hooks.post_run_task = myhook - foo.inputs.a = 2 # ensure not pre-cached - foo() - captured = capsys.readouterr() - assert captured.out.count("I was called\n") == 4 - del captured - - # hooks are independent across tasks by default - bar = funaddtwo(name="bar", a=3, cache_dir=tmpdir) - assert bar.hooks is not foo.hooks - # but can be shared across tasks - bar.hooks = foo.hooks - # and workflows - wf = gen_basic_wf() - wf.tmpdir = tmpdir - wf.hooks = bar.hooks - assert foo.hooks == bar.hooks == wf.hooks - - wf(plugin="cf") - captured = capsys.readouterr() - assert captured.out.count("I was called\n") == 4 - del captured - - # reset all hooks - foo.hooks.reset() - for attr in ("pre_run", "post_run", "pre_run_task", "post_run_task"): - hook = getattr(foo.hooks, attr) - assert hook() is None - - -def test_taskhooks_2(tmpdir, capsys): - """checking order of the hooks; using task's attributes""" - foo = funaddtwo(name="foo", a=1, cache_dir=tmpdir) - - def myhook_prerun(task, *args): - print(f"i. 
prerun hook was called from {task.name}") - - def myhook_prerun_task(task, *args): - print(f"ii. prerun task hook was called {task.name}") - - def myhook_postrun_task(task, *args): - print(f"iii. postrun task hook was called {task.name}") - - def myhook_postrun(task, *args): - print(f"iv. postrun hook was called {task.name}") - - foo.hooks.pre_run = myhook_prerun - foo.hooks.post_run = myhook_postrun - foo.hooks.pre_run_task = myhook_prerun_task - foo.hooks.post_run_task = myhook_postrun_task - foo() - - captured = capsys.readouterr() - hook_messages = captured.out.strip().split("\n") - # checking the order of the hooks - assert "i. prerun hook" in hook_messages[0] - assert "ii. prerun task hook" in hook_messages[1] - assert "iii. postrun task hook" in hook_messages[2] - assert "iv. postrun hook" in hook_messages[3] - - -def test_taskhooks_3(tmpdir, capsys): - """checking results in the post run hooks""" - foo = funaddtwo(name="foo", a=1, cache_dir=tmpdir) - - def myhook_postrun_task(task, result, *args): - print(f"postrun task hook, the result is {result.output.out}") - - def myhook_postrun(task, result, *args): - print(f"postrun hook, the result is {result.output.out}") - - foo.hooks.post_run = myhook_postrun - foo.hooks.post_run_task = myhook_postrun_task - foo() - - captured = capsys.readouterr() - hook_messages = captured.out.strip().split("\n") - # checking that the postrun hooks have access to results - assert "postrun task hook, the result is 3" in hook_messages[0] - assert "postrun hook, the result is 3" in hook_messages[1] - - -def test_taskhooks_4(tmpdir, capsys): - """task raises an error: postrun task should be called, postrun shouldn't be called""" - foo = funaddtwo(name="foo", a="one", cache_dir=tmpdir) - - def myhook_postrun_task(task, result, *args): - print(f"postrun task hook was called, result object is {result}") - - def myhook_postrun(task, result, *args): - print("postrun hook should not be called") - - foo.hooks.post_run = myhook_postrun - foo.hooks.post_run_task = myhook_postrun_task - - with pytest.raises(Exception): - foo() - - captured = capsys.readouterr() - hook_messages = captured.out.strip().split("\n") - # only post run task hook should be called - assert len(hook_messages) == 1 - assert "postrun task hook was called" in hook_messages[0] - - -def test_traceback(tmpdir): - """checking if the error raised in a function is properly returned; - checking if there is an error filename in the error message that contains - full traceback including the line in the python function - """ - - @mark.task - def fun_error(x): - raise Exception("Error from the function") - - task = fun_error(name="error", cache_dir=tmpdir).split("x", x=[3, 4]) - - with pytest.raises(Exception, match="from the function") as exinfo: - task() - - # getting error file from the error message - error_file_match = str(exinfo.value).split("here: ")[-1].split("_error.pklz")[0] - error_file = Path(error_file_match) / "_error.pklz" - # checking if the file exists - assert error_file.exists() - # reading error message from the pickle file - error_tb = cp.loads(error_file.read_bytes())["error message"] - # the error traceback should be a list and should point to a specific line in the function - assert isinstance(error_tb, list) - assert "in fun_error" in error_tb[-2] - - -def test_traceback_wf(tmpdir): - """checking if the error raised in a function is properly returned by a workflow; - checking if there is an error filename in the error message that contains - full traceback including the line in the 
python function - """ - - @mark.task - def fun_error(x): - raise Exception("Error from the function") - - wf = Workflow(name="wf", input_spec=["x"], cache_dir=tmpdir).split("x", x=[3, 4]) - wf.add(fun_error(name="error", x=wf.lzin.x)) - wf.set_output([("out", wf.error.lzout.out)]) - - with pytest.raises(Exception, match="Task error raised an error") as exinfo: - wf() - - # getting error file from the error message - error_file_match = str(exinfo.value).split("here: ")[-1].split("_error.pklz")[0] - error_file = Path(error_file_match) / "_error.pklz" - # checking if the file exists - assert error_file.exists() - # reading error message from the pickle file - error_tb = cp.loads(error_file.read_bytes())["error message"] - # the error traceback should be a list and should point to a specific line in the function - assert isinstance(error_tb, list) - assert "in fun_error" in error_tb[-2] - - -def test_rerun_errored(tmpdir, capfd): - """Test rerunning a task containing errors. - Only the errored tasks should be rerun""" - - @mark.task - def pass_odds(x): - if x % 2 == 0: - print(f"x%2 = {x % 2} (error)\n") - raise Exception("even error") - else: - print(f"x%2 = {x % 2}\n") - return x - - task = pass_odds(name="pass_odds", cache_dir=tmpdir).split("x", x=[1, 2, 3, 4, 5]) - - with pytest.raises(Exception, match="even error"): - task() - with pytest.raises(Exception, match="even error"): - task() - - out, err = capfd.readouterr() - stdout_lines = out.splitlines() - - tasks_run = 0 - errors_found = 0 - - for line in stdout_lines: - if "x%2" in line: - tasks_run += 1 - if "(error)" in line: - errors_found += 1 - - # There should have been 5 messages of the form "x%2 = XXX" after calling task() the first time - # and another 2 messagers after calling the second time - assert tasks_run == 7 - assert errors_found == 4 - - -@attr.s(auto_attribs=True) -class A: - x: int - - -def test_object_input(): - """Test function tasks with object inputs""" - - @mark.task - def testfunc(a: A): - return a.x - - result = testfunc(a=A(x=7))() - assert result.output.out == 7 diff --git a/pydra/engine/tests/test_task_file.py b/pydra/engine/tests/test_task_file.py new file mode 100644 index 0000000000..d6667864e5 --- /dev/null +++ b/pydra/engine/tests/test_task_file.py @@ -0,0 +1,372 @@ +import os +from pathlib import Path +import numpy as np +import pytest +from pydra.engine.submitter import Submitter +from pydra.compose import python, workflow +from fileformats.generic import File, Directory +import time +from pydra.engine.tests.utils import ( + FileOrIntIdentity, + FileAndIntIdentity, + ListOfListOfFileOrIntIdentity, + ListOfDictOfFileOrIntIdentity, +) + + +@pytest.mark.flaky(reruns=5) +def test_input_file_hash_1(tmp_path): + """input definition with File types, checking when the checksum changes""" + file = tmp_path / "in_file_1.txt" + with open(file, "w") as f: + f.write("hello") + + # checking specific hash value + hash1 = FileOrIntIdentity(in_file=file)._hash + # assert hash1 == "eba2fafb8df4bae94a7aa42bb159b778" + + # checking if different name doesn't affect the hash + file_diffname = tmp_path / "in_file_2.txt" + with open(file_diffname, "w") as f: + f.write("hello") + hash2 = FileOrIntIdentity(in_file=file_diffname)._hash + assert hash1 == hash2 + + # checking if different content (the same name) affects the hash + file_diffcontent = tmp_path / "in_file_1.txt" + with open(file_diffcontent, "w") as f: + f.write("hi") + hash3 = FileOrIntIdentity(in_file=file_diffcontent)._hash + assert hash1 != hash3 + + 
+@pytest.mark.flaky(reruns=5) +def test_input_file_hash_2(tmp_path): + """input definition with ty.Union[File, ...] type, checking when the checksum changes""" + file = tmp_path / "in_file_1.txt" + with open(file, "w") as f: + f.write("hello") + + # checking specific hash value + hash1 = FileOrIntIdentity(in_file=file)._hash + # assert hash1 == "eba2fafb8df4bae94a7aa42bb159b778" + + # checking if different name doesn't affect the hash + file_diffname = tmp_path / "in_file_2.txt" + with open(file_diffname, "w") as f: + f.write("hello") + hash2 = FileOrIntIdentity(in_file=file_diffname)._hash + assert hash1 == hash2 + + # checking if string is also accepted + hash3 = FileOrIntIdentity(in_file=str(file))._hash + assert hash3 == hash1 + + # checking if different content (the same name) affects the hash + file_diffcontent = tmp_path / "in_file_1.txt" + with open(file_diffcontent, "w") as f: + f.write("hi") + hash4 = FileOrIntIdentity(in_file=file_diffcontent)._hash + assert hash1 != hash4 + + +@pytest.mark.flaky(reruns=5) +def test_input_file_hash_3(tmp_path): + """input definition with File types, checking when the hash and file_hash change""" + file = tmp_path / "in_file_1.txt" + with open(file, "w") as f: + f.write("hello") + + a = FileAndIntIdentity(in_file=file, in_int=3) + # original hash and files_hash (dictionary contains info about files) + hash1 = a._hash + # files_hash1 = deepcopy(my_inp.files_hash) + # file name should be in files_hash1[in_file] + # filename = str(Path(file)) + # assert filename in files_hash1["in_file"] + + # changing int input + a.in_int = 5 + hash2 = a._hash + # files_hash2 = deepcopy(my_inp.files_hash) + # hash should be different + assert hash1 != hash2 + # files_hash should be the same, and the tuple for filename shouldn't be recomputed + # assert files_hash1 == files_hash2 + # assert id(files_hash1["in_file"][filename]) == id(files_hash2["in_file"][filename]) + + # recreating the file + time.sleep(2) # ensure mtime is different + with open(file, "w") as f: + f.write("hello") + + hash3 = a._hash + # files_hash3 = deepcopy(my_inp.files_hash) + # hash should be the same, + # but the entry for in_file in files_hash should be different (modification time) + assert hash3 == hash2 + # assert files_hash3["in_file"][filename] != files_hash2["in_file"][filename] + # different timestamp + # assert files_hash3["in_file"][filename][0] != files_hash2["in_file"][filename][0] + # the same content hash + # assert files_hash3["in_file"][filename][1] == files_hash2["in_file"][filename][1] + + # setting the in_file again + a.in_file = file + # filename should be removed from files_hash + # assert my_inp.files_hash["in_file"] == {} + # will be saved again when hash is calculated + assert a._hash == hash3 + # assert filename in my_inp.files_hash["in_file"] + + +@pytest.mark.flaky(reruns=5) +def test_input_file_hash_4(tmp_path): + """input definition with nested list, that contain ints and Files, + checking changes in checksums + """ + file = tmp_path / "in_file_1.txt" + with open(file, "w") as f: + f.write("hello") + + # checking specific hash value + hash1 = ListOfListOfFileOrIntIdentity(in_file=[[file, 3]])._hash + # assert hash1 == "2c35c94089b00a7a399d3d4faf208fee" + + # the same file, but int field changes + hash1a = ListOfListOfFileOrIntIdentity(in_file=[[file, 5]])._hash + assert hash1 != hash1a + + # checking if different name doesn't affect the hash + file_diffname = tmp_path / "in_file_2.txt" + with open(file_diffname, "w") as f: + f.write("hello") + hash2 = 
ListOfListOfFileOrIntIdentity(in_file=[[file_diffname, 3]])._hash + assert hash1 == hash2 + + # checking if different content (the same name) affects the hash + time.sleep(2) # need the mtime to be different + file_diffcontent = tmp_path / "in_file_1.txt" + with open(file_diffcontent, "w") as f: + f.write("hi") + hash3 = ListOfListOfFileOrIntIdentity(in_file=[[file_diffcontent, 3]])._hash + assert hash1 != hash3 + + +@pytest.mark.flaky(reruns=5) +def test_input_file_hash_5(tmp_path): + """input definition with File in nested containers, checking changes in checksums""" + file = tmp_path / "in_file_1.txt" + with open(file, "w") as f: + f.write("hello") + + # checking specific hash value + hash1 = ListOfDictOfFileOrIntIdentity(in_file=[{"file": file, "int": 3}])._hash + # assert hash1 == "7692ffe0b3323c13ecbd642b494f1f53" + + # the same file, but int field changes + hash1a = ListOfDictOfFileOrIntIdentity(in_file=[{"file": file, "int": 5}])._hash + assert hash1 != hash1a + + # checking if different name doesn't affect the hash + file_diffname = tmp_path / "in_file_2.txt" + with open(file_diffname, "w") as f: + f.write("hello") + hash2 = ListOfDictOfFileOrIntIdentity( + in_file=[{"file": file_diffname, "int": 3}] + )._hash + assert hash1 == hash2 + + # checking if different content (the same name) affects the hash + time.sleep(2) # ensure mtime is different + file_diffcontent = tmp_path / "in_file_1.txt" + with open(file_diffcontent, "w") as f: + f.write("hi") + hash3 = ListOfDictOfFileOrIntIdentity( + in_file=[{"file": file_diffcontent, "int": 3}] + )._hash + assert hash1 != hash3 + + +@python.define +def DirCountFile(dirpath: Directory) -> int: + return len(os.listdir(dirpath)) + + +@python.define +def DirCountFileAnnot(dirpath: Directory) -> int: + return len(os.listdir(dirpath)) + + +@python.define +def FileAdd2(file: File) -> File: + array_inp = np.load(file) + array_out = array_inp + 2 + cwd = os.getcwd() + # providing a full path + file_out = os.path.join(cwd, "arr_out.npy") + np.save(file_out, array_out) + return file_out + + +@python.define +def FileMult(file: File) -> File: + array_inp = np.load(file) + array_out = 10 * array_inp + cwd = os.getcwd() + file_out = os.path.join(cwd, "arr_out.npy") + np.save(file_out, array_out) + return file_out + + +@python.define +def FileAdd2Annot(file: File) -> File: + array_inp = np.load(file) + array_out = array_inp + 2 + cwd = os.getcwd() + # providing a full path + file_out = os.path.join(cwd, "arr_out.npy") + np.save(file_out, array_out) + return file_out + + +@python.define +def FileMultAnnot(file: File) -> File: + array_inp = np.load(file) + array_out = 10 * array_inp + cwd = os.getcwd() + file_out = os.path.join(cwd, "arr_out.npy") + np.save(file_out, array_out) + return file_out + + +def test_task_1(tmpdir): + """task that takes file as an input""" + os.chdir(tmpdir) + arr = np.array([2]) + # creating abs path + file = os.path.join(os.getcwd(), "arr1.npy") + np.save(file, arr) + nn = FileAdd2(file=file) + + with Submitter(worker="cf") as sub: + res = sub(nn) + + # checking the results + + result = np.load(res.outputs.out) + assert result == np.array([4]) + + +def test_wf_1(tmpdir): + """workflow with 2 tasks that take a file as an input and produce a file as an output""" + + @workflow.define + def Workflow(file_orig: File): + add2 = workflow.add(FileAdd2(file=file_orig)) + mult = workflow.add(FileMult(file=add2.out)) + return mult.out + + os.chdir(tmpdir) + arr = np.array([2, 3]) + # creating abs path + file_orig = os.path.join(os.getcwd(), 
"arr_orig.npy") + np.save(file_orig, arr) + wf = Workflow(file_orig=file_orig) + + with Submitter(worker="cf") as sub: + res = sub(wf) + + assert res.cache_dir.exists() + file_output = res.outputs.out + assert Path(file_output).exists() + # loading results + array_out = np.load(file_output) + assert np.array_equal(array_out, [40, 50]) + + +def test_file_annotation_1(tmpdir): + """task that takes file as an input""" + os.chdir(tmpdir) + arr = np.array([2]) + # creating abs path + file = os.path.join(os.getcwd(), "arr1.npy") + np.save(file, arr) + nn = FileAdd2Annot(file=file) + + with Submitter(worker="cf") as sub: + res = sub(nn) + + # checking the results + assert res.errored is False, " ".join(res.errors["error message"]) + arr = np.load(res.outputs.out) + assert arr == np.array([4]) + + +def test_broken_file(tmpdir): + """task that takes file as an input""" + os.chdir(tmpdir) + file = os.path.join(os.getcwd(), "non_existent.npy") + + with pytest.raises(FileNotFoundError): + with Submitter(worker="cf") as sub: + sub(FileAdd2(file=file)) + + with pytest.raises(FileNotFoundError, match="do not exist"): + FileAdd2Annot(file=file) + + +def test_broken_file_link(tmpdir): + """ + Test how broken symlinks are handled during hashing + """ + os.chdir(tmpdir) + file = os.path.join(os.getcwd(), "arr.npy") + arr = np.array([2]) + np.save(file, arr) + + file_link = os.path.join(os.getcwd(), "link_to_arr.npy") + os.symlink(file, file_link) + os.remove(file) + + # raises error inside task + # unless variable is defined as a File pydra will treat it as a string + with pytest.raises(FileNotFoundError): + with Submitter(worker="cf") as sub: + sub(FileAdd2(file=file_link)) + + with pytest.raises(FileNotFoundError, match="do not exist"): + FileAdd2Annot(file=file_link) + + +def test_broken_dir(): + """Test how broken directories are handled during hashing""" + + # unless variable is defined as a File pydra will treat it as a string + with pytest.raises(FileNotFoundError): + with Submitter(worker="cf") as sub: + sub(DirCountFile(dirpath="/broken_dir_path/")) + + # raises error before task is run + with pytest.raises(FileNotFoundError): + DirCountFileAnnot(dirpath="/broken_dir_path/") + + +def test_broken_dir_link1(tmpdir): + """ + Test how broken symlinks are hashed in hash_dir + """ + # broken symlink to dir path + dir1 = tmpdir.join("dir1") + os.mkdir(dir1) + dir1_link = tmpdir.join("dir1_link") + os.symlink(dir1, dir1_link) + os.rmdir(dir1) + + # raises error while running task + with pytest.raises(FileNotFoundError): + with Submitter(worker="cf") as sub: + sub(DirCountFile(dirpath=Path(dir1))) + + with pytest.raises(FileNotFoundError): + DirCountFileAnnot(dirpath=Path(dir1)) diff --git a/pydra/engine/tests/test_task_state.py b/pydra/engine/tests/test_task_state.py new file mode 100644 index 0000000000..8e02d77206 --- /dev/null +++ b/pydra/engine/tests/test_task_state.py @@ -0,0 +1,1331 @@ +import os +import shutil +import attrs +import numpy as np +import time +import pytest +from pydra.compose import python, workflow +from pydra.engine.tests.utils import ( + FunAddTwo, + FunAddVar, + FunAddVarNone, + FunAddVarDefault, + Moment, + FunDiv, + FunDict, + FunFile, + FunFileList, + Op4Var, +) +from pydra.compose.base import Task +from pydra.engine.state import State +from pydra.utils.typing import StateArray +from pydra.engine.submitter import Submitter +from pydra.engine.workflow import Workflow +from pydra.engine.tests.utils import num_python_cache_roots + + +@workflow.define +def IdentityWorkflow(a: int) 
-> int: + + @python.define + def Identity(a): + return a + + a = workflow.add(Identity(a=a)) + return a.out + + +def get_state(task: Task, name="NA") -> State: + """helper function to get the state of the task once it has been added to workflow""" + identity_workflow = IdentityWorkflow(a=1) + wf = Workflow.construct(identity_workflow, dont_cache=True) + wf.add(task, name=name) + node = wf[name] + if node.state: + node.state.prepare_states(node.state_values) + node.state.prepare_inputs() + return node.state + + +@pytest.fixture(scope="module") +def change_dir(request): + orig_dir = os.getcwd() + test_dir = os.path.join(orig_dir, "test_outputs") + os.makedirs(test_dir, exist_ok=True) + os.chdir(test_dir) + + def move2orig(): + os.chdir(orig_dir) + + request.addfinalizer(move2orig) + + +@pytest.mark.flaky(reruns=2) # when dask +def test_task_state_cachedir(worker, tmp_path): + """task with a state and provided cache_root using pytest tmp_path""" + cache_root = tmp_path / "test_task_nostate" + cache_root.mkdir() + nn = FunAddTwo().split("a", a=[3, 5]) + state = get_state(nn) + + assert state.splitter == "NA.a" + assert (nn.a == np.array([3, 5])).all() + + with Submitter(worker=worker, cache_root=cache_root) as sub: + results = sub(nn) + assert not results.errored, "\n".join(results.errors["error message"]) + + # checking the results + + expected = [({"NA.a": 3}, 5), ({"NA.a": 5}, 7)] + for i, res in enumerate(expected): + assert results.outputs.out[i] == res[1] + + +def test_task_init_1a(): + with pytest.raises(TypeError): + FunAddTwo("NA") + + +def test_task_init_2(): + """task with a name and inputs""" + nn = FunAddTwo(a=3) + # adding NA to the name of the variable + assert nn.a == 3 + state = get_state(nn) + assert state is None + + +@pytest.mark.parametrize( + "splitter, state_splitter, state_rpn, states_ind, states_val", + [("a", "NA.a", ["NA.a"], [{"NA.a": 0}, {"NA.a": 1}], [{"NA.a": 3}, {"NA.a": 5}])], +) +@pytest.mark.parametrize("input_type", ["list", "array"]) +def test_task_init_3( + splitter, state_splitter, state_rpn, states_ind, states_val, input_type +): + """task with inputs and splitter""" + a_in = [3, 5] + if input_type == "array": + a_in = np.array(a_in) + + nn = FunAddTwo().split(splitter, a=a_in) + + assert np.allclose(nn.a, [3, 5]) + state = get_state(nn) + assert state.splitter == state_splitter + assert state.splitter_rpn == state_rpn + + assert state.states_ind == states_ind + assert state.states_val == states_val + + +@pytest.mark.parametrize( + "splitter, state_splitter, state_rpn, states_ind, states_val", + [ + ( + ("a", "b"), + ("NA.a", "NA.b"), + ["NA.a", "NA.b", "."], + [{"NA.a": 0, "NA.b": 0}, {"NA.a": 1, "NA.b": 1}], + [{"NA.a": 3, "NA.b": 10}, {"NA.a": 5, "NA.b": 20}], + ), + ( + ["a", "b"], + ["NA.a", "NA.b"], + ["NA.a", "NA.b", "*"], + [ + {"NA.a": 0, "NA.b": 0}, + {"NA.a": 0, "NA.b": 1}, + {"NA.a": 1, "NA.b": 0}, + {"NA.a": 1, "NA.b": 1}, + ], + [ + {"NA.a": 3, "NA.b": 10}, + {"NA.a": 3, "NA.b": 20}, + {"NA.a": 5, "NA.b": 10}, + {"NA.a": 5, "NA.b": 20}, + ], + ), + ], +) +@pytest.mark.parametrize("input_type", ["list", "array", "mixed"]) +def test_task_init_3a( + splitter, state_splitter, state_rpn, states_ind, states_val, input_type +): + """task with inputs and splitter""" + a_in, b_in = [3, 5], [10, 20] + if input_type == "array": + a_in, b_in = np.array(a_in), np.array(b_in) + elif input_type == "mixed": + a_in = np.array(a_in) + nn = FunAddVar().split(splitter, a=a_in, b=b_in) + state = get_state(nn) + + assert np.allclose(nn.a, [3, 5]) + assert 
np.allclose(nn.b, [10, 20]) + assert state.splitter == state_splitter + assert state.splitter_rpn == state_rpn + + assert state.states_ind == states_ind + assert state.states_val == states_val + + +def test_task_init_4(): + """task with interface splitter and inputs set in the split method""" + nn = FunAddTwo() + nn = nn.split("a", a=[3, 5]) + state = get_state(nn) + assert np.allclose(nn.a, [3, 5]) + + assert state.splitter == "NA.a" + assert state.splitter_rpn == ["NA.a"] + + assert state.states_ind == [{"NA.a": 0}, {"NA.a": 1}] + assert state.states_val == [{"NA.a": 3}, {"NA.a": 5}] + + +def test_task_init_4b(): + """updating splitter using overwrite=True""" + nn = FunAddTwo() + nn = nn.split("a", a=[1, 2]) + nn = nn.split("a", a=[3, 5], overwrite=True) + state = get_state(nn) + assert np.allclose(nn.a, [3, 5]) + + assert state.splitter == "NA.a" + assert state.splitter_rpn == ["NA.a"] + + assert state.states_ind == [{"NA.a": 0}, {"NA.a": 1}] + assert state.states_val == [{"NA.a": 3}, {"NA.a": 5}] + + +def test_task_init_4c(): + """trying to set splitter twice without using overwrite""" + nn = FunAddVar().split("b", b=[1, 2]) + state = get_state(nn) + with pytest.raises(Exception) as excinfo: + nn.split("a", a=[3, 5]) + assert "Cannot overwrite existing splitter" in str(excinfo.value) + + assert state.splitter == "NA.b" + + +def test_task_init_4d(): + """trying to set the same splitter twice without using overwrite + if the splitter is the same, the exception shouldn't be raised + """ + nn = FunAddTwo().split("a", a=[3, 5]) + nn = nn.split("a", a=[3, 5], overwrite=True) + state = get_state(nn) + assert state.splitter == "NA.a" + + +def test_task_init_5(): + """task with inputs, splitter and combiner""" + nn = FunAddVar().split(["a", "b"], a=[3, 5], b=[1, 2]).combine("b") + state = get_state(nn) + + assert state.splitter == ["NA.a", "NA.b"] + assert state.splitter_rpn == ["NA.a", "NA.b", "*"] + assert state.combiner == ["NA.b"] + + assert state.splitter_final == "NA.a" + assert state.splitter_rpn_final == ["NA.a"] + + assert state.states_ind == [ + {"NA.a": 0, "NA.b": 0}, + {"NA.a": 0, "NA.b": 1}, + {"NA.a": 1, "NA.b": 0}, + {"NA.a": 1, "NA.b": 1}, + ] + assert state.states_val == [ + {"NA.a": 3, "NA.b": 1}, + {"NA.a": 3, "NA.b": 2}, + {"NA.a": 5, "NA.b": 1}, + {"NA.a": 5, "NA.b": 2}, + ] + + assert state.final_combined_ind_mapping == {0: [0, 1], 1: [2, 3]} + + +def test_task_init_5a(): + """updating combiner using overwrite=True""" + nn = FunAddVar().split(["a", "b"], a=[3, 5], b=[1, 2]).combine("b") + nn = nn.combine("a", overwrite=True) + state = get_state(nn) + + assert state.splitter == ["NA.a", "NA.b"] + assert state.splitter_rpn == ["NA.a", "NA.b", "*"] + assert state.combiner == ["NA.a"] + + assert state.splitter_final == "NA.b" + assert state.splitter_rpn_final == ["NA.b"] + + assert state.states_ind == [ + {"NA.a": 0, "NA.b": 0}, + {"NA.a": 0, "NA.b": 1}, + {"NA.a": 1, "NA.b": 0}, + {"NA.a": 1, "NA.b": 1}, + ] + assert state.states_val == [ + {"NA.a": 3, "NA.b": 1}, + {"NA.a": 3, "NA.b": 2}, + {"NA.a": 5, "NA.b": 1}, + {"NA.a": 5, "NA.b": 2}, + ] + + assert state.final_combined_ind_mapping == {0: [0, 2], 1: [1, 3]} + + +def test_task_init_5b(): + """updating combiner without using overwrite""" + nn = FunAddVar().split(["a", "b"], a=[3, 5], b=[1, 2]).combine("b") + state = get_state(nn) + with pytest.raises(Exception) as excinfo: + nn.combine("a") + assert "Attempting to overwrite existing combiner" in str(excinfo.value) + + assert state.combiner == ["NA.b"] + + +def 
test_task_init_5c(): + """trying to set the same combiner twice without using overwrite + if the combiner is the same, the exception shouldn't be raised + """ + nn = FunAddVar().split(["a", "b"], a=[3, 5], b=[1, 2]).combine("b") + state = get_state(nn) + nn = nn.combine("b", overwrite=True) + + assert state.splitter == ["NA.a", "NA.b"] + assert state.splitter_rpn == ["NA.a", "NA.b", "*"] + assert state.combiner == ["NA.b"] + + assert state.splitter_final == "NA.a" + assert state.splitter_rpn_final == ["NA.a"] + + +def test_task_init_6(): + """task with splitter, but the input is an empty list""" + nn = FunAddTwo() + nn = nn.split("a", a=[]) + state = get_state(nn) + assert nn.a == [] + + assert state.splitter == "NA.a" + assert state.splitter_rpn == ["NA.a"] + + assert state.states_ind == [] + assert state.states_val == [] + + +def test_task_init_7(tmp_path): + """task with a dictionary of files as an input, checking checksum""" + file1 = tmp_path / "file1.txt" + with open(file1, "w") as f: + f.write("hello") + + file2 = tmp_path / "file2.txt" + with open(file2, "w") as f: + f.write("from pydra\n") + + nn1 = FunFileList(filename_list=[file1, file2]) + hash1 = nn1._hash + + # changing the content of the file + time.sleep(2) # need the mtime to be different + file2 = tmp_path / "file2.txt" + with open(file2, "w") as f: + f.write("from pydra") + + nn2 = FunFileList(filename_list=[file1, file2]) + hash2 = nn2._hash + + # the checksum should be different - content of file2 is different + assert hash1 != hash2 + + +def test_task_init_8(): + """task without setting the input, the value should be set to attrs.NOTHING""" + nn = FunAddTwo() + assert nn.a is attrs.NOTHING + + +def test_task_init_9(): + """task without setting the b input, so the default value from the function is used""" + nn1 = FunAddVarDefault(a=2) + assert nn1.b == 1 + + nn2 = FunAddVarDefault(a=2, b=1) + assert nn2.b == 1 + # both tasks should have the same checksum + assert nn1._hash == nn2._hash + + +def test_task_error(tmp_path): + func = FunDiv(a=1, b=0) + with pytest.raises(ZeroDivisionError): + func(cache_root=tmp_path) + assert (next(tmp_path.iterdir()) / "_error.pklz").exists() + + +# Tests for tasks without state (i.e. 
no splitter) + + +@pytest.mark.flaky(reruns=2) # when dask +def test_task_nostate_1(worker, tmp_path): + """task without splitter""" + nn = FunAddTwo(a=3) + + assert np.allclose(nn.a, [3]) + state = get_state(nn) + assert state is None + + with Submitter(worker=worker, cache_root=tmp_path) as sub: + results = sub(nn) + assert not results.errored, "\n".join(results.errors["error message"]) + + # checking the results + assert results.outputs.out == 5 + + # checking the cache_dir + assert results.cache_dir.exists() + + +def test_task_nostate_1_call(tmp_path): + """task without splitter""" + nn = FunAddTwo(a=3) + with Submitter(cache_root=tmp_path) as sub: + results = sub(nn) + assert not results.errored, "\n".join(results.errors["error message"]) + # checking the results + + assert results.outputs.out == 5 + # checking the cache_dir + assert results.cache_dir.exists() + + +@pytest.mark.flaky(reruns=2) # when dask +def test_task_nostate_1_call_subm(worker, tmp_path): + """task without splitter""" + nn = FunAddTwo(a=3) + + assert np.allclose(nn.a, [3]) + state = get_state(nn) + assert state is None + + with Submitter(worker=worker, cache_root=tmp_path) as sub: + results = sub(nn) + assert not results.errored, "\n".join(results.errors["error message"]) + + # checking the results + + assert results.outputs.out == 5 + # checking the cache_dir + assert results.cache_dir.exists() + + +@pytest.mark.flaky(reruns=2) # when dask +def test_task_nostate_1_call_plug(worker, tmp_path): + """task without splitter""" + nn = FunAddTwo(a=3) + + assert np.allclose(nn.a, [3]) + state = get_state(nn) + assert state is None + + with Submitter(cache_root=tmp_path, worker=worker) as sub: + results = sub(nn) + assert not results.errored, "\n".join(results.errors["error message"]) + + # checking the results + + assert results.outputs.out == 5 + # checking the cache_dir + assert results.cache_dir.exists() + + +def test_task_nostate_2(worker, tmp_path): + """task with a list as an input, but no splitter""" + nn = Moment(n=3, lst=[2, 3, 4]) + + assert np.allclose(nn.n, [3]) + assert np.allclose(nn.lst, [2, 3, 4]) + state = get_state(nn) + assert state is None + + with Submitter(worker=worker, cache_root=tmp_path) as sub: + results = sub(nn) + assert not results.errored, "\n".join(results.errors["error message"]) + + # checking the results + + assert results.outputs.out == 33 + # checking the cache_dir + assert results.cache_dir.exists() + + +def test_task_nostate_3(worker, tmp_path): + """task with a dictionary as an input""" + nn = FunDict(d={"a": "ala", "b": "bala"}) + + assert nn.d == {"a": "ala", "b": "bala"} + + with Submitter(worker=worker, cache_root=tmp_path) as sub: + results = sub(nn) + assert not results.errored, "\n".join(results.errors["error message"]) + + # checking the results + + assert results.outputs.out == "a:ala_b:bala" + # checking the cache_dir + assert results.cache_dir.exists() + + +def test_task_nostate_4(worker, tmp_path): + """task with a dictionary as an input""" + file1 = tmp_path / "file.txt" + with open(file1, "w") as f: + f.write("hello from pydra\n") + + nn = FunFile(filename=file1) + + with Submitter(worker=worker, cache_root=tmp_path) as sub: + results = sub(nn) + assert not results.errored, "\n".join(results.errors["error message"]) + + # checking the results + + assert results.outputs.out == "hello from pydra\n" + # checking the cache_dir + assert results.cache_dir.exists() + + +def test_task_nostate_5(tmp_path): + """task with a dictionary of files as an input""" + file1 = tmp_path / 
"file1.txt" + with open(file1, "w") as f: + f.write("hello") + + file2 = tmp_path / "file2.txt" + with open(file2, "w") as f: + f.write("from pydra\n") + + nn = FunFileList(filename_list=[file1, file2]) + + outputs = nn() + + # checking the results + + assert outputs.out == "hello from pydra\n" + + +def test_task_nostate_6(): + """checking if the function gets the None value""" + nn = FunAddVarNone(a=2, b=None) + assert nn.b is None + outputs = nn() + assert outputs.out == 2 + + +def test_task_nostate_6a_exception(): + """checking if the function gets the attrs.Nothing value""" + nn = FunAddVarNone(a=2) + assert nn.b is attrs.NOTHING + with pytest.raises(ValueError) as excinfo: + nn() + assert "Mandatory field 'b' is not set" in str(excinfo.value) + + +def test_task_nostate_7(): + """using the default value from the function for b input""" + nn = FunAddVarDefault(a=2) + assert nn.b == 1 + outputs = nn() + assert outputs.out == 3 + + +# Testing caching for tasks without states + + +@pytest.mark.flaky(reruns=2) # when dask +def test_task_nostate_cachedir(worker, tmp_path): + """task with provided cache_root using pytest tmp_path""" + cache_root = tmp_path / "test_task_nostate" + cache_root.mkdir() + nn = FunAddTwo(a=3) + state = get_state(nn) + assert np.allclose(nn.a, [3]) + assert state is None + + with Submitter(worker=worker, cache_root=cache_root) as sub: + results = sub(nn) + assert not results.errored, "\n".join(results.errors["error message"]) + + # checking the results + + assert results.outputs.out == 5 + + +@pytest.mark.flaky(reruns=2) # when dask +def test_task_nostate_cachedir_relativepath(tmp_path, worker): + """task with provided cache_root as relative path""" + os.chdir(tmp_path) + cache_root = "test_task_nostate" + (tmp_path / cache_root).mkdir() + + nn = FunAddTwo(a=3) + assert np.allclose(nn.a, [3]) + state = get_state(nn) + assert state is None + + with Submitter(worker=worker, cache_root=cache_root) as sub: + results = sub(nn) + assert not results.errored, "\n".join(results.errors["error message"]) + + # checking the results + + assert results.outputs.out == 5 + + shutil.rmtree(cache_root) + + +@pytest.mark.flaky(reruns=2) # when dask +def test_task_nostate_cachelocations(worker, tmp_path): + """ + Two identical tasks with provided cache_root; + the second task has readonly_caches and should not recompute the results + """ + cache_root = tmp_path / "test_task_nostate" + cache_root.mkdir() + cache_root2 = tmp_path / "test_task_nostate2" + cache_root2.mkdir() + + nn = FunAddTwo(a=3) + with Submitter(worker=worker, cache_root=cache_root) as sub: + results = sub(nn) + assert not results.errored, "\n".join(results.errors["error message"]) + + nn2 = FunAddTwo(a=3) + with Submitter( + worker=worker, cache_root=cache_root2, readonly_caches=cache_root + ) as sub: + results2 = sub(nn2) + assert not results2.errored, "\n".join(results.errors["error message"]) + + # checking the results + + assert results2.outputs.out == 5 + + # checking if the second task didn't run the interface again + assert results.cache_dir == results2.cache_dir + + +def test_task_nostate_cachelocations_forcererun(worker, tmp_path): + """ + Two identical tasks with provided cache_root; + the second task has readonly_caches, + but submitter is called with rerun=True, so should recompute + """ + cache_root = tmp_path / "test_task_nostate" + cache_root.mkdir() + cache_root2 = tmp_path / "test_task_nostate2" + cache_root2.mkdir() + + nn = FunAddTwo(a=3) + with Submitter(worker=worker, cache_root=cache_root) as 
sub: + results = sub(nn) + assert not results.errored, "\n".join(results.errors["error message"]) + + nn2 = FunAddTwo(a=3) + with Submitter( + worker=worker, cache_root=cache_root2, readonly_caches=cache_root + ) as sub: + results2 = sub(nn2, rerun=True) + + # checking the results + + assert results2.outputs.out == 5 + + # checking if the second task rerun the interface + assert results.cache_dir.exists() + assert results2.cache_dir.exists() + + +def test_task_nostate_cachelocations_nosubmitter(tmp_path): + """ + Two identical tasks (that are run without submitter!) with provided cache_root; + the second task has readonly_caches and should not recompute the results + """ + cache_root = tmp_path / "test_task_nostate" + cache_root.mkdir() + cache_root2 = tmp_path / "test_task_nostate2" + cache_root2.mkdir() + + nn = FunAddTwo(a=3) + nn(cache_root=cache_root) + + nn2 = FunAddTwo(a=3) + outputs2 = nn2(cache_root=cache_root2, readonly_caches=cache_root) + + # checking the results + + assert outputs2.out == 5 + + # checking if the second task didn't run the interface again + assert num_python_cache_roots(cache_root) == 1 + assert not num_python_cache_roots(cache_root2) + + +def test_task_nostate_cachelocations_nosubmitter_forcererun(tmp_path): + """ + Two identical tasks (that are run without submitter!) with provided cache_root; + the second task has readonly_caches, + but submitter is called with rerun=True, so should recompute + """ + cache_root = tmp_path / "test_task_nostate" + cache_root.mkdir() + cache_root2 = tmp_path / "test_task_nostate2" + cache_root2.mkdir() + + nn = FunAddTwo(a=3) + nn(cache_root=cache_root) + + nn2 = FunAddTwo(a=3) + outputs2 = nn2(rerun=True, cache_root=cache_root2, readonly_caches=cache_root) + + # checking the results + + assert outputs2.out == 5 + + # checking if the second task run the interface again + assert num_python_cache_roots(cache_root) == 1 + assert num_python_cache_roots(cache_root2) + + +def test_task_nostate_cachelocations_updated(worker, tmp_path): + """ + Two identical tasks with provided cache_root; + the second task has readonly_caches in init, + that is later overwritten in Submitter.__call__; + the readonly_caches passed to call doesn't exist so the second task should run again + """ + cache_root = tmp_path / "test_task_nostate" + cache_root.mkdir() + cache_root1 = tmp_path / "test_task_nostate1" + cache_root1.mkdir() + cache_root2 = tmp_path / "test_task_nostate2" + cache_root2.mkdir() + + nn = FunAddTwo(a=3) + with Submitter(worker=worker, cache_root=cache_root) as sub: + results = sub(nn) + assert not results.errored, "\n".join(results.errors["error message"]) + + nn2 = FunAddTwo(a=3) + with Submitter( + worker=worker, cache_root=cache_root2, readonly_caches=cache_root + ) as sub: + results1 = sub(nn2) + assert not results1.errored, "\n".join(results.errors["error message"]) + + # updating cache location to non-existing dir + with Submitter( + worker=worker, readonly_caches=cache_root1, cache_root=tmp_path + ) as sub: + results2 = sub(nn2) + assert not results2.errored, "\n".join(results.errors["error message"]) + + # checking the results + + assert results2.outputs.out == 5 + + # checking if both tasks run interface + assert results.cache_dir == results1.cache_dir + assert results.cache_dir != results2.cache_dir + + +# Tests for tasks with states (i.e. 
with splitter) + + +@pytest.mark.flaky(reruns=2) # when dask +@pytest.mark.parametrize("input_type", ["list", "array"]) +def test_task_state_1(worker, input_type, tmp_path): + """task with the simplest splitter""" + a_in = [3, 5] + if input_type == "array": + a_in = np.array(a_in) + + nn = FunAddTwo().split("a", a=a_in) + state = get_state(nn) + + assert state.splitter == "NA.a" + assert state.splitter_rpn == ["NA.a"] + assert (nn.a == np.array([3, 5])).all() + + with Submitter(worker=worker, cache_root=tmp_path) as sub: + results = sub(nn) + assert not results.errored, "\n".join(results.errors["error message"]) + + # checking the results + + expected = [({"NA.a": 3}, 5), ({"NA.a": 5}, 7)] + for i, res in enumerate(expected): + assert results.outputs.out[i] == res[1] + + +def test_task_state_1a(worker, tmp_path): + """task with the simplest splitter (inputs set separately)""" + nn = FunAddTwo() + nn = nn.split("a", a=[1, 2]) + nn.a = StateArray([3, 5]) + + state = get_state(nn) + + assert state.splitter == "NA.a" + assert state.splitter_rpn == ["NA.a"] + assert (nn.a == np.array([3, 5])).all() + + with Submitter(worker=worker, cache_root=tmp_path) as sub: + results = sub(nn) + assert not results.errored, "\n".join(results.errors["error message"]) + + # checking the results + + expected = [({"NA.a": 3}, 5), ({"NA.a": 5}, 7)] + for i, res in enumerate(expected): + assert results.outputs.out[i] == res[1] + + +def test_task_state_singl_1(worker, tmp_path): + """Tasks with two inputs and a splitter (no combiner) + one input is a single value, the other is in the splitter and combiner + """ + nn = FunAddVar(b=10).split("a", a=[3, 5]) + state = get_state(nn) + + assert nn.a == [3, 5] + assert nn.b == 10 + assert state.splitter == "NA.a" + assert state.splitter_rpn == ["NA.a"] + assert state.splitter_final == "NA.a" + assert state.splitter_rpn_final == ["NA.a"] + + with Submitter(worker=worker, cache_root=tmp_path) as sub: + results = sub(nn) + assert not results.errored, "\n".join(results.errors["error message"]) + + # checking the results + expected = [({"NA.a": 3, "NA.b": 10}, 13), ({"NA.a": 5, "NA.b": 10}, 15)] + + for i, res in enumerate(expected): + assert results.outputs.out[i] == res[1] + # checking the cache_dir + assert results.cache_dir.exists() + + +@pytest.mark.parametrize( + "splitter, state_splitter, state_rpn, expected, expected_ind", + [ + ( + ("a", "b"), + ("NA.a", "NA.b"), + ["NA.a", "NA.b", "."], + [({"NA.a": 3, "NA.b": 10}, 13), ({"NA.a": 5, "NA.b": 20}, 25)], + [({"NA.a": 0, "NA.b": 0}, 13), ({"NA.a": 1, "NA.b": 1}, 25)], + ), + ( + ["a", "b"], + ["NA.a", "NA.b"], + ["NA.a", "NA.b", "*"], + [ + ({"NA.a": 3, "NA.b": 10}, 13), + ({"NA.a": 3, "NA.b": 20}, 23), + ({"NA.a": 5, "NA.b": 10}, 15), + ({"NA.a": 5, "NA.b": 20}, 25), + ], + [ + ({"NA.a": 0, "NA.b": 0}, 13), + ({"NA.a": 0, "NA.b": 1}, 23), + ({"NA.a": 1, "NA.b": 0}, 15), + ({"NA.a": 1, "NA.b": 1}, 25), + ], + ), + ], +) +@pytest.mark.parametrize("input_type", ["list", "array", "mixed"]) +def test_task_state_2( + worker, + splitter, + state_splitter, + state_rpn, + expected, + expected_ind, + input_type, + tmp_path, +): + """Tasks with two inputs and a splitter (no combiner)""" + a_in, b_in = [3, 5], [10, 20] + if input_type == "array": + a_in, b_in = np.array(a_in), np.array(b_in) + elif input_type == "mixed": + a_in = np.array(a_in) + nn = FunAddVar().split(splitter, a=a_in, b=b_in) + state = get_state(nn) + + assert (nn.a == np.array([3, 5])).all() + assert (nn.b == np.array([10, 20])).all() + assert state.splitter == 
state_splitter + assert state.splitter_rpn == state_rpn + assert state.splitter_final == state_splitter + assert state.splitter_rpn_final == state_rpn + + with Submitter(worker=worker, cache_root=tmp_path) as sub: + results = sub(nn) + assert not results.errored, "\n".join(results.errors["error message"]) + + # checking the results + + for i, res in enumerate(expected): + assert results.outputs.out[i] == res[1] + + +def test_task_state_3(worker, tmp_path): + """task with the simplest splitter, the input is an empty list""" + nn = FunAddTwo().split("a", a=[]) + state = get_state(nn) + + assert state.splitter == "NA.a" + assert state.splitter_rpn == ["NA.a"] + assert nn.a == [] + + with Submitter(worker="debug", cache_root=tmp_path) as sub: + results = sub(nn) + assert not results.errored, "\n".join(results.errors["error message"]) + + # checking the results + + expected = [] + for i, res in enumerate(expected): + assert results.outputs.out[i] == res[1] + + +@pytest.mark.parametrize("input_type", ["list", "array"]) +def test_task_state_4(worker, input_type, tmp_path): + """task with a list as an input, and a simple splitter""" + lst_in = [[2, 3, 4], [1, 2, 3]] + if input_type == "array": + lst_in = np.array(lst_in, dtype=int) + nn = Moment(n=3).split("lst", lst=lst_in) + state = get_state(nn) + + assert np.allclose(nn.n, 3) + assert np.allclose(nn.lst, [[2, 3, 4], [1, 2, 3]]) + assert state.splitter == "NA.lst" + + with Submitter(worker=worker, cache_root=tmp_path) as sub: + results = sub(nn) + assert not results.errored, "\n".join(results.errors["error message"]) + + # checking that split is done across dim 0 + el_0 = state.states_val[0]["NA.lst"] + if input_type == "list": + assert el_0 == [2, 3, 4] + elif input_type == "array": + assert el_0 == [2, 3, 4] + + # checking the results + + for i, expected in enumerate([33, 12]): + assert results.outputs.out[i] == expected + + +def test_task_state_4a(worker, tmp_path): + """task with a tuple as an input, and a simple splitter""" + nn = Moment(n=3).split("lst", lst=[(2, 3, 4), (1, 2, 3)]) + state = get_state(nn) + + assert np.allclose(nn.n, 3) + assert np.allclose(nn.lst, [[2, 3, 4], [1, 2, 3]]) + assert state.splitter == "NA.lst" + + with Submitter(worker=worker, cache_root=tmp_path) as sub: + results = sub(nn) + assert not results.errored, "\n".join(results.errors["error message"]) + + # checking the results + + for i, expected in enumerate([33, 12]): + assert results.outputs.out[i] == expected + + +def test_task_state_5(worker, tmp_path): + """task with a list as an input, and the variable is part of the scalar splitter""" + nn = Moment().split(("n", "lst"), n=[1, 3], lst=[[2, 3, 4], [1, 2, 3]]) + state = get_state(nn) + + assert np.allclose(nn.n, [1, 3]) + assert np.allclose(nn.lst, [[2, 3, 4], [1, 2, 3]]) + assert state.splitter == ("NA.n", "NA.lst") + + with Submitter(worker=worker, cache_root=tmp_path) as sub: + results = sub(nn) + assert not results.errored, "\n".join(results.errors["error message"]) + + # checking the results + + for i, expected in enumerate([3, 12]): + assert results.outputs.out[i] == expected + + +def test_task_state_5_exception(worker, tmp_path): + """task with a list as an input, and the variable is part of the scalar splitter + the shapes are not matching, so exception should be raised + """ + nn = Moment().split(("n", "lst"), n=[1, 3, 3], lst=[[2, 3, 4], [1, 2, 3]]) + + assert np.allclose(nn.n, [1, 3, 3]) + assert np.allclose(nn.lst, [[2, 3, 4], [1, 2, 3]]) + + with pytest.raises(Exception) as excinfo: + 
get_state(nn) + + assert "shape" in str(excinfo.value) + + +def test_task_state_6(worker, tmp_path): + """task with a list as an input, and the variable is part of the outer splitter""" + nn = Moment().split(["n", "lst"], n=[1, 3], lst=[[2, 3, 4], [1, 2, 3]]) + state = get_state(nn) + + assert np.allclose(nn.n, [1, 3]) + assert np.allclose(nn.lst, [[2, 3, 4], [1, 2, 3]]) + assert state.splitter == ["NA.n", "NA.lst"] + + with Submitter(worker="debug", cache_root=tmp_path) as sub: + results = sub(nn) + assert not results.errored, "\n".join(results.errors["error message"]) + + # checking the results + assert results.outputs.out == [3.0, 2.0, 33.0, 12.0] + + +def test_task_state_6a(worker, tmp_path): + """task with a tuple as an input, and the variable is part of the outer splitter""" + nn = Moment().split(["n", "lst"], n=[1, 3], lst=[(2, 3, 4), (1, 2, 3)]) + state = get_state(nn) + + assert np.allclose(nn.n, [1, 3]) + assert np.allclose(nn.lst, [[2, 3, 4], [1, 2, 3]]) + assert state.splitter == ["NA.n", "NA.lst"] + + with Submitter(worker=worker, cache_root=tmp_path) as sub: + results = sub(nn) + assert not results.errored, "\n".join(results.errors["error message"]) + + # checking the results + assert results.outputs.out == [3.0, 2.0, 33.0, 12.0] + + +@pytest.mark.flaky(reruns=2) # when dask +def test_task_state_comb_1(worker, tmp_path): + """task with the simplest splitter and combiner""" + nn = FunAddTwo().split(a=[3, 5]).combine(combiner="a") + state = get_state(nn) + + assert (nn.a == np.array([3, 5])).all() + + assert state.splitter == ["NA.a"] + assert state.splitter_rpn == ["NA.a"] + assert state.combiner == ["NA.a"] + assert state.splitter_final is None + assert state.splitter_rpn_final == [] + + with Submitter(worker="debug", cache_root=tmp_path) as sub: + results = sub(nn) + assert not results.errored, "\n".join(results.errors["error message"]) + + assert state.states_ind == [{"NA.a": 0}, {"NA.a": 1}] + assert state.states_val == [{"NA.a": 3}, {"NA.a": 5}] + + # checking the results + + # fully combined (no nested list) + assert results.outputs.out == [5, 7] + + +@pytest.mark.parametrize( + "splitter, combiner, state_splitter, state_rpn, state_combiner, state_combiner_all, " + "state_splitter_final, state_rpn_final, expected", # , expected_val", + [ + ( + ("a", "b"), + "a", + ("NA.a", "NA.b"), + ["NA.a", "NA.b", "."], + ["NA.a"], + ["NA.a", "NA.b"], + None, + [], + [13, 25], + # [({"NA.a": 3, "NA.b": 10}, 13), ({"NA.a": 5, "NA.b": 20}, 25)], + ), + ( + ("a", "b"), + "b", + ("NA.a", "NA.b"), + ["NA.a", "NA.b", "."], + ["NA.b"], + ["NA.a", "NA.b"], + None, + [], + [13, 25], + # [({"NA.a": 3, "NA.b": 10}, 13), ({"NA.a": 5, "NA.b": 20}, 25)], + ), + ( + ["a", "b"], + "a", + ["NA.a", "NA.b"], + ["NA.a", "NA.b", "*"], + ["NA.a"], + ["NA.a"], + "NA.b", + ["NA.b"], + [[13, 15], [23, 25]], + # [ + # [({"NA.a": 3, "NA.b": 10}, 13), ({"NA.a": 5, "NA.b": 10}, 15)], + # [({"NA.a": 3, "NA.b": 20}, 23), ({"NA.a": 5, "NA.b": 20}, 25)], + # ], + ), + ( + ["a", "b"], + "b", + ["NA.a", "NA.b"], + ["NA.a", "NA.b", "*"], + ["NA.b"], + ["NA.b"], + "NA.a", + ["NA.a"], + [[13, 23], [15, 25]], + # [ + # [({"NA.a": 3, "NA.b": 10}, 13), ({"NA.a": 3, "NA.b": 20}, 23)], + # [({"NA.a": 5, "NA.b": 10}, 15), ({"NA.a": 5, "NA.b": 20}, 25)], + # ], + ), + ( + ["a", "b"], + ["a", "b"], + ["NA.a", "NA.b"], + ["NA.a", "NA.b", "*"], + ["NA.a", "NA.b"], + ["NA.a", "NA.b"], + None, + [], + [13, 23, 15, 25], + # [ + # ({"NA.a": 3, "NA.b": 10}, 13), + # ({"NA.a": 3, "NA.b": 20}, 23), + # ({"NA.a": 5, "NA.b": 10}, 15), + # 
({"NA.a": 5, "NA.b": 20}, 25), + # ], + ), + ], +) +def test_task_state_comb_2( + worker, + splitter, + combiner, + state_splitter, + state_rpn, + state_combiner, + state_combiner_all, + state_splitter_final, + state_rpn_final, + expected, + # expected_val, + tmp_path, +): + """Tasks with scalar and outer splitters and partial or full combiners""" + nn = FunAddVar().split(splitter, a=[3, 5], b=[10, 20]).combine(combiner=combiner) + state = get_state(nn) + + assert (nn.a == np.array([3, 5])).all() + + assert state.splitter == state_splitter + assert state.splitter_rpn == state_rpn + assert state.combiner == state_combiner + + with Submitter(worker="debug", cache_root=tmp_path) as sub: + results = sub(nn) + assert not results.errored, "\n".join(results.errors["error message"]) + + assert state.splitter_final == state_splitter_final + assert state.splitter_rpn_final == state_rpn_final + assert set(state.current_combiner_all) == set(state_combiner_all) + + # checking the results + + # checking the return_inputs option, either return_inputs is True or "val", + # it should give values of inputs that corresponds to the specific element + # results_verb = nn.result(return_inputs=True) + + assert results.outputs.out == expected + + +def test_task_state_comb_singl_1(worker, tmp_path): + """Tasks with two inputs; + one input is a single value, the other is in the splitter and combiner + """ + nn = FunAddVar(b=10).split("a", a=[3, 5]).combine(combiner="a") + state = get_state(nn) + + assert nn.a == [3, 5] + assert nn.b == 10 + assert state.splitter == "NA.a" + assert state.splitter_rpn == ["NA.a"] + assert state.combiner == ["NA.a"] + assert state.splitter_final is None + assert state.splitter_rpn_final == [] + + with Submitter(worker=worker, cache_root=tmp_path) as sub: + results = sub(nn) + assert not results.errored, "\n".join(results.errors["error message"]) + + assert results.outputs.out == [13, 15] + + +def test_task_state_comb_3(worker, tmp_path): + """task with the simplest splitter, the input is an empty list""" + nn = FunAddTwo().split("a", a=[]).combine(combiner=["a"]) + state = get_state(nn) + + assert state.splitter == "NA.a" + assert state.splitter_rpn == ["NA.a"] + assert nn.a == [] + + with Submitter(worker=worker, cache_root=tmp_path) as sub: + results = sub(nn) + assert not results.errored, "\n".join(results.errors["error message"]) + + # checking the results + + expected = [] + for i, res in enumerate(expected): + assert results.outputs.out[i] == res[1] + + +def test_task_state_comb_order(tmp_path): + """tasks with an outer splitter and various combiner; + showing the order of results + """ + + # single combiner "a" - will create two lists, first one for b=3, second for b=5 + nn_a = FunAddVar().split(["a", "b"], a=[10, 20], b=[3, 5]).combine(combiner="a") + state_a = get_state(nn_a) + assert state_a.combiner == ["NA.a"] + + outputs = nn_a(cache_root=tmp_path / "cache") + # combined_results_a = [[res.output.out for res in res_l] for res_l in results_a] + assert outputs.out == [[13, 23], [15, 25]] + + # single combiner "b" - will create two lists, first one for a=10, second for a=20 + nn_b = FunAddVar().split(["a", "b"], a=[10, 20], b=[3, 5]).combine(combiner="b") + state_b = get_state(nn_b) + assert state_b.combiner == ["NA.b"] + + outputs_b = nn_b(cache_root=tmp_path / "cache_b") + # combined_results_b = [[res.output.out for res in res_l] for res_l in results_b] + assert outputs_b.out == [[13, 15], [23, 25]] + + # combiner with both fields ["a", "b"] - will create one list + nn_ab = 
( + FunAddVar().split(["a", "b"], a=[10, 20], b=[3, 5]).combine(combiner=["a", "b"]) + ) + state_ab = get_state(nn_ab) + assert state_ab.combiner == ["NA.a", "NA.b"] + + outputs_ab = nn_ab(cache_root=tmp_path / "cache_ab") + assert outputs_ab.out == [13, 15, 23, 25] + + # combiner with both fields ["b", "a"] - will create the same list as nn_ab + # no difference in the order for setting combiner + nn_ba = ( + FunAddVar().split(["a", "b"], a=[10, 20], b=[3, 5]).combine(combiner=["b", "a"]) + ) + state_ba = get_state(nn_ba) + assert state_ba.combiner == ["NA.b", "NA.a"] + + outputs_ba = nn_ba(cache_root=tmp_path / "cache_ba") + assert outputs_ba.out == [13, 15, 23, 25] + + +# Testing with container dimensions for the input + + +def test_task_state_contdim_1(tmp_path): + """task with a spliter and container dimension for one of the value""" + task_4var = Op4Var( + a="a1", + ).split( + ("b", ["c", "d"]), + b=[["b1", "b2"], ["b3", "b4"]], + c=["c1", "c2"], + d=["d1", "d2"], + container_ndim={"b": 2}, + ) + outputs = task_4var(cache_root=tmp_path) + assert len(outputs.out) == 4 + assert outputs.out[3] == "a1 b4 c2 d2" + + +def test_task_state_contdim_2(tmp_path): + """task with a splitter and container dimension for one of the value""" + task_4var = Op4Var().split( + ["a", ("b", ["c", "d"])], + container_ndim={"b": 2}, + a=["a1", "a2"], + b=[["b1", "b2"], ["b3", "b4"]], + c=["c1", "c2"], + d=["d1", "d2"], + ) + outputs = task_4var(cache_root=tmp_path) + assert len(outputs.out) == 8 + assert outputs.out[7] == "a2 b4 c2 d2" + + +def test_task_state_comb_contdim_1(tmp_path): + """task with a splitter-combiner, and container dimension for one of the value""" + task_4var = ( + Op4Var(a="a1") + .split( + ("b", ["c", "d"]), + container_ndim={"b": 2}, + b=[["b1", "b2"], ["b3", "b4"]], + c=["c1", "c2"], + d=["d1", "d2"], + ) + .combine("b") + ) + outputs = task_4var(cache_root=tmp_path) + assert len(outputs.out) == 4 + assert outputs.out[3] == "a1 b4 c2 d2" + + +def test_task_state_comb_contdim_2(tmp_path): + """task with a splitter-combiner, and container dimension for one of the value""" + task_4var = ( + Op4Var() + .split( + ["a", ("b", ["c", "d"])], + a=["a1", "a2"], + b=[["b1", "b2"], ["b3", "b4"]], + c=["c1", "c2"], + d=["d1", "d2"], + container_ndim={"b": 2}, + ) + .combine("a") + ) + outputs = task_4var(cache_root=tmp_path) + assert len(outputs.out) == 4 + assert outputs.out[3][1] == "a2 b4 c2 d2" diff --git a/pydra/engine/tests/test_tasks_files.py b/pydra/engine/tests/test_tasks_files.py deleted file mode 100644 index a1849e221b..0000000000 --- a/pydra/engine/tests/test_tasks_files.py +++ /dev/null @@ -1,221 +0,0 @@ -import os -from pathlib import Path -import numpy as np -import pytest -import typing as ty - -from ..submitter import Submitter -from ..core import Workflow -from ... 
import mark -from ..specs import File, Directory - - -@mark.task -def dir_count_file(dirpath): - return len(os.listdir(dirpath)) - - -@mark.task -def dir_count_file_annot(dirpath: Directory): - return len(os.listdir(dirpath)) - - -@mark.task -def file_add2(file): - array_inp = np.load(file) - array_out = array_inp + 2 - cwd = os.getcwd() - # providing a full path - file_out = os.path.join(cwd, "arr_out.npy") - np.save(file_out, array_out) - return file_out - - -@mark.task -def file_mult(file): - array_inp = np.load(file) - array_out = 10 * array_inp - cwd = os.getcwd() - file_out = os.path.join(cwd, "arr_out.npy") - np.save(file_out, array_out) - return file_out - - -@mark.task -def file_add2_annot(file: File) -> ty.NamedTuple("Output", [("out", File)]): - array_inp = np.load(file) - array_out = array_inp + 2 - cwd = os.getcwd() - # providing a full path - file_out = os.path.join(cwd, "arr_out.npy") - np.save(file_out, array_out) - return file_out - - -@mark.task -def file_mult_annot(file: File) -> ty.NamedTuple("Output", [("out", File)]): - array_inp = np.load(file) - array_out = 10 * array_inp - cwd = os.getcwd() - file_out = os.path.join(cwd, "arr_out.npy") - np.save(file_out, array_out) - return file_out - - -def test_task_1(tmpdir): - """task that takes file as an input""" - os.chdir(tmpdir) - arr = np.array([2]) - # creating abs path - file = os.path.join(os.getcwd(), "arr1.npy") - np.save(file, arr) - nn = file_add2(name="add2", file=file) - - with Submitter(plugin="cf") as sub: - sub(nn) - - # checking the results - results = nn.result() - res = np.load(results.output.out) - assert res == np.array([4]) - - -def test_wf_1(tmpdir): - """workflow with 2 tasks that take file as an input and give file as an aoutput""" - wf = Workflow(name="wf_1", input_spec=["file_orig"]) - wf.add(file_add2(name="add2", file=wf.lzin.file_orig)) - wf.add(file_mult(name="mult", file=wf.add2.lzout.out)) - wf.set_output([("out", wf.mult.lzout.out)]) - - os.chdir(tmpdir) - arr = np.array([2, 3]) - # creating abs path - file_orig = os.path.join(os.getcwd(), "arr_orig.npy") - np.save(file_orig, arr) - wf.inputs.file_orig = file_orig - - with Submitter(plugin="cf") as sub: - sub(wf) - - assert wf.output_dir.exists() - file_output = wf.result().output.out - assert Path(file_output).exists() - # loading results - array_out = np.load(file_output) - assert np.array_equal(array_out, [40, 50]) - - -def test_file_annotation_1(tmpdir): - """task that takes file as an input""" - os.chdir(tmpdir) - arr = np.array([2]) - # creating abs path - file = os.path.join(os.getcwd(), "arr1.npy") - np.save(file, arr) - nn = file_add2_annot(name="add2", file=file) - - with Submitter(plugin="cf") as sub: - sub(nn) - - # checking the results - results = nn.result() - res = np.load(results.output.out) - assert res == np.array([4]) - - -def test_broken_file(tmpdir): - """task that takes file as an input""" - os.chdir(tmpdir) - file = os.path.join(os.getcwd(), "non_existent.npy") - - nn = file_add2(name="add2", file=file) - with pytest.raises(FileNotFoundError): - with Submitter(plugin="cf") as sub: - sub(nn) - - with pytest.raises(FileNotFoundError, match="do not exist"): - file_add2_annot(name="add2_annot", file=file) - - -def test_broken_file_link(tmpdir): - """ - Test how broken symlinks are handled during hashing - """ - os.chdir(tmpdir) - file = os.path.join(os.getcwd(), "arr.npy") - arr = np.array([2]) - np.save(file, arr) - - file_link = os.path.join(os.getcwd(), "link_to_arr.npy") - os.symlink(file, file_link) - os.remove(file) 
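# Illustrative sketch (not part of the original patch): the deleted test above builds a
# dangling symlink by removing the target after the link is created, and then relies on
# plain Python semantics: an existence check that follows the link fails, while the link
# entry itself is still visible. A minimal standalone demonstration, assuming only the
# standard library; the names `tmp`, `target` and `link` below are made up for the example.
import os
import tempfile
from pathlib import Path

tmp = Path(tempfile.mkdtemp())
target = tmp / "arr.npy"
target.touch()
link = tmp / "link_to_arr.npy"
os.symlink(target, link)
target.unlink()  # the symlink now points at nothing

assert not link.exists()      # exists() follows the link, so a broken link reports False
assert os.path.lexists(link)  # the link entry itself is still present on disk
assert link.is_symlink()      # and it is still recognised as a symlink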
- - nn = file_add2(name="add2", file=file_link) - # raises error inside task - # unless variable is defined as a File pydra will treat it as a string - with pytest.raises(FileNotFoundError): - with Submitter(plugin="cf") as sub: - sub(nn) - - with pytest.raises(FileNotFoundError, match="do not exist"): - file_add2_annot(name="add2_annot", file=file_link) - - -def test_broken_dir(): - """Test how broken directories are handled during hashing""" - - # dirpath doesn't exist - nn = dir_count_file(name="listdir", dirpath="/broken_dir_path/") - # raises error inside task - # unless variable is defined as a File pydra will treat it as a string - with pytest.raises(FileNotFoundError): - with Submitter(plugin="cf") as sub: - sub(nn) - - # raises error before task is run - with pytest.raises(FileNotFoundError): - dir_count_file_annot(name="listdir", dirpath="/broken_dir_path/") - - -def test_broken_dir_link1(tmpdir): - """ - Test how broken symlinks are hashed in hash_dir - """ - # broken symlink to dir path - dir1 = tmpdir.join("dir1") - os.mkdir(dir1) - dir1_link = tmpdir.join("dir1_link") - os.symlink(dir1, dir1_link) - os.rmdir(dir1) - - nn = dir_count_file(name="listdir", dirpath=Path(dir1)) - # raises error while running task - with pytest.raises(FileNotFoundError): - with Submitter(plugin="cf") as sub: - sub(nn) - - with pytest.raises(FileNotFoundError): - dir_count_file_annot(name="listdir", dirpath=Path(dir1)) - - -def test_broken_dir_link2(tmpdir): - # valid dirs with broken symlink(s) are hashed - dir2 = tmpdir.join("dir2") - os.mkdir(dir2) - file1 = dir2.join("file1") - file2 = dir2.join("file2") - file1.open("w+").close() - file2.open("w+").close() - - file1_link = dir2.join("file1_link") - os.symlink(file1, file1_link) - os.remove(file1) # file1_link is broken - - nn = dir_count_file(name="listdir", dirpath=dir2) - # does not raises error because pydra treats dirpath as a string - with Submitter(plugin="cf") as sub: - sub(nn) - - nn2 = dir_count_file_annot(name="listdir", dirpath=str(dir2)) - with Submitter(plugin="cf") as sub: - sub(nn2) diff --git a/pydra/engine/tests/test_workflow.py b/pydra/engine/tests/test_workflow.py deleted file mode 100644 index c6aab6544f..0000000000 --- a/pydra/engine/tests/test_workflow.py +++ /dev/null @@ -1,5031 +0,0 @@ -import pytest -import shutil, os, sys -import time -import typing as ty -import attr -from pathlib import Path -from .utils import ( - add2, - add2_wait, - multiply, - multiply_list, - multiply_mixed, - power, - ten, - identity, - identity_2flds, - list_output, - fun_addsubvar, - fun_addvar3, - fun_addvar, - fun_addtwo, - add2_sub2_res, - add2_sub2_res_list, - fun_addvar_none, - fun_addvar_default, - fun_addvar_default_notype, - fun_addvar_notype, - fun_addtwo_notype, - fun_write_file, - fun_write_file_list, - fun_write_file_list2dict, - list_sum, - list_mult_sum, - DOT_FLAG, -) -from ..submitter import Submitter -from ..core import Workflow -from ... 
import mark -from ..specs import SpecInfo, BaseSpec, ShellSpec -from pydra.utils import exc_info_matches - - -def test_wf_no_input_spec(): - with pytest.raises(ValueError, match='Empty "Inputs" spec'): - Workflow(name="workflow") - - -def test_wf_specinfo_input_spec(): - input_spec = SpecInfo( - name="Input", - fields=[ - ("a", str, "", {"mandatory": True}), - ("b", dict, {"foo": 1, "bar": False}, {"mandatory": False}), - ], - bases=(BaseSpec,), - ) - wf = Workflow( - name="workflow", - input_spec=input_spec, - ) - for x in ["a", "b", "_graph_checksums"]: - assert hasattr(wf.inputs, x) - assert wf.inputs.a == "" - assert wf.inputs.b == {"foo": 1, "bar": False} - bad_input_spec = SpecInfo( - name="Input", - fields=[ - ("a", str, {"mandatory": True}), - ], - bases=(ShellSpec,), - ) - with pytest.raises( - ValueError, match="Provided SpecInfo must have BaseSpec as its base." - ): - Workflow(name="workflow", input_spec=bad_input_spec) - - -def test_wf_dict_input_and_output_spec(): - spec = { - "a": str, - "b": ty.Dict[str, ty.Union[int, bool]], - } - wf = Workflow( - name="workflow", - input_spec=spec, - output_spec=spec, - ) - wf.add( - identity_2flds( - name="identity", - x1=wf.lzin.a, - x2=wf.lzin.b, - ) - ) - wf.set_output( - [ - ("a", wf.identity.lzout.out1), - ("b", wf.identity.lzout.out2), - ] - ) - for x in ["a", "b", "_graph_checksums"]: - assert hasattr(wf.inputs, x) - wf.inputs.a = "any-string" - wf.inputs.b = {"foo": 1, "bar": False} - - with pytest.raises(TypeError) as exc_info: - wf.inputs.a = 1.0 - assert exc_info_matches(exc_info, "Cannot coerce 1.0 into ") - - with pytest.raises(TypeError) as exc_info: - wf.inputs.b = {"foo": 1, "bar": "bad-value"} - assert exc_info_matches( - exc_info, "Could not coerce object, 'bad-value', to any of the union types" - ) - - result = wf() - assert result.output.a == "any-string" - assert result.output.b == {"foo": 1, "bar": False} - - -def test_wf_name_conflict1(): - """raise error when workflow name conflicts with a class attribute or method""" - with pytest.raises(ValueError) as excinfo1: - Workflow(name="result", input_spec=["x"]) - assert "Cannot use names of attributes or methods" in str(excinfo1.value) - with pytest.raises(ValueError) as excinfo2: - Workflow(name="done", input_spec=["x"]) - assert "Cannot use names of attributes or methods" in str(excinfo2.value) - - -def test_wf_name_conflict2(): - """raise error when a task with the same name is already added to workflow""" - wf = Workflow(name="wf_1", input_spec=["x"]) - wf.add(add2(name="task_name", x=wf.lzin.x)) - with pytest.raises(ValueError) as excinfo: - wf.add(identity(name="task_name", x=3)) - assert "Another task named task_name is already added" in str(excinfo.value) - - -def test_wf_no_output(plugin, tmpdir): - """Raise error when output isn't set with set_output""" - wf = Workflow(name="wf_1", input_spec=["x"], cache_dir=tmpdir) - wf.add(add2(name="add2", x=wf.lzin.x)) - wf.inputs.x = 2 - - with pytest.raises(ValueError) as excinfo: - with Submitter(plugin=plugin) as sub: - sub(wf) - assert "Workflow output cannot be None" in str(excinfo.value) - - -def test_wf_1(plugin, tmpdir): - """workflow with one task and no splitter""" - wf = Workflow(name="wf_1", input_spec=["x"]) - wf.add(add2(name="add2", x=wf.lzin.x)) - wf.set_output([("out", wf.add2.lzout.out)]) - wf.inputs.x = 2 - wf.cache_dir = tmpdir - - checksum_before = wf.checksum - with Submitter(plugin=plugin) as sub: - sub(wf) - - assert wf.checksum == checksum_before - results = wf.result() - assert 4 == 
results.output.out - assert wf.output_dir.exists() - - -def test_wf_1a_outpastuple(plugin, tmpdir): - """workflow with one task and no splitter - set_output takes a tuple - """ - wf = Workflow(name="wf_1", input_spec=["x"]) - wf.add(add2(name="add2", x=wf.lzin.x)) - wf.set_output(("out", wf.add2.lzout.out)) - wf.inputs.x = 2 - wf.plugin = plugin - wf.cache_dir = tmpdir - - with Submitter(plugin=plugin) as sub: - sub(wf) - - results = wf.result() - assert 4 == results.output.out - assert wf.output_dir.exists() - - -def test_wf_1_call_subm(plugin, tmpdir): - """using wf.__call_ with submitter""" - wf = Workflow(name="wf_1", input_spec=["x"]) - wf.add(add2(name="add2", x=wf.lzin.x)) - wf.set_output([("out", wf.add2.lzout.out)]) - wf.inputs.x = 2 - wf.cache_dir = tmpdir - - with Submitter(plugin=plugin) as sub: - wf(submitter=sub) - - results = wf.result() - assert 4 == results.output.out - assert wf.output_dir.exists() - - -def test_wf_1_call_plug(plugin, tmpdir): - """using wf.__call_ with plugin""" - wf = Workflow(name="wf_1", input_spec=["x"]) - wf.add(add2(name="add2", x=wf.lzin.x)) - wf.set_output([("out", wf.add2.lzout.out)]) - wf.inputs.x = 2 - wf.plugin = plugin - wf.cache_dir = tmpdir - - wf(plugin=plugin) - - results = wf.result() - assert 4 == results.output.out - assert wf.output_dir.exists() - - -def test_wf_1_call_noplug_nosubm(plugin, tmpdir): - """using wf.__call_ without plugin or submitter""" - wf = Workflow(name="wf_1", input_spec=["x"]) - wf.add(add2(name="add2", x=wf.lzin.x)) - wf.set_output([("out", wf.add2.lzout.out)]) - wf.inputs.x = 2 - wf.cache_dir = tmpdir - - wf() - results = wf.result() - assert 4 == results.output.out - assert wf.output_dir.exists() - - -def test_wf_1_call_exception(plugin, tmpdir): - """using wf.__call_ with plugin and submitter - should raise an exception""" - wf = Workflow(name="wf_1", input_spec=["x"]) - wf.add(add2(name="add2", x=wf.lzin.x)) - wf.set_output([("out", wf.add2.lzout.out)]) - wf.inputs.x = 2 - wf.plugin = plugin - wf.cache_dir = tmpdir - - with Submitter(plugin=plugin) as sub: - with pytest.raises(Exception) as e: - wf(submitter=sub, plugin=plugin) - assert "Specify submitter OR plugin" in str(e.value) - - -def test_wf_1_inp_in_call(tmpdir): - """Defining input in __call__""" - wf = Workflow(name="wf_1", input_spec=["x"], cache_dir=tmpdir) - wf.add(add2(name="add2", x=wf.lzin.x)) - wf.set_output([("out", wf.add2.lzout.out)]) - wf.inputs.x = 1 - results = wf(x=2) - assert 4 == results.output.out - - -def test_wf_1_upd_in_run(tmpdir): - """Updating input in __call__""" - wf = Workflow(name="wf_1", input_spec=["x"], cache_dir=tmpdir) - wf.add(add2(name="add2", x=wf.lzin.x)) - wf.set_output([("out", wf.add2.lzout.out)]) - wf.inputs.x = 1 - results = wf(x=2) - assert 4 == results.output.out - - -def test_wf_2(plugin, tmpdir): - """workflow with 2 tasks, no splitter""" - wf = Workflow(name="wf_2", input_spec=["x", "y"]) - wf.add(multiply(name="mult", x=wf.lzin.x, y=wf.lzin.y)) - wf.add(add2(name="add2", x=wf.mult.lzout.out)) - wf.set_output([("out", wf.add2.lzout.out)]) - wf.inputs.x = 2 - wf.inputs.y = 3 - wf.cache_dir = tmpdir - - with Submitter(plugin=plugin) as sub: - sub(wf) - - assert wf.output_dir.exists() - results = wf.result() - assert 8 == results.output.out - - -def test_wf_2a(plugin, tmpdir): - """workflow with 2 tasks, no splitter - creating add2_task first (before calling add method), - """ - wf = Workflow(name="wf_2", input_spec=["x", "y"]) - wf.add(multiply(name="mult", x=wf.lzin.x, y=wf.lzin.y)) - add2_task = 
add2(name="add2") - add2_task.inputs.x = wf.mult.lzout.out - wf.add(add2_task) - wf.set_output([("out", wf.add2.lzout.out)]) - wf.inputs.x = 2 - wf.inputs.y = 3 - wf.cache_dir = tmpdir - - with Submitter(plugin=plugin) as sub: - sub(wf) - - results = wf.result() - assert 8 == results.output.out - assert wf.output_dir.exists() - - -def test_wf_2b(plugin, tmpdir): - """workflow with 2 tasks, no splitter - creating add2_task first (before calling add method), - adding inputs.x after add method - """ - wf = Workflow(name="wf_2", input_spec=["x", "y"]) - wf.add(multiply(name="mult", x=wf.lzin.x, y=wf.lzin.y)) - add2_task = add2(name="add2") - wf.add(add2_task) - add2_task.inputs.x = wf.mult.lzout.out - wf.set_output([("out", wf.add2.lzout.out)]) - wf.inputs.x = 2 - wf.inputs.y = 3 - wf.cache_dir = tmpdir - - with Submitter(plugin=plugin) as sub: - sub(wf) - - results = wf.result() - assert 8 == results.output.out - - assert wf.output_dir.exists() - - -def test_wf_2c_multoutp(plugin, tmpdir): - """workflow with 2 tasks, no splitter - setting multiple outputs for the workflow - """ - wf = Workflow(name="wf_2", input_spec=["x", "y"]) - wf.add(multiply(name="mult", x=wf.lzin.x, y=wf.lzin.y)) - add2_task = add2(name="add2") - add2_task.inputs.x = wf.mult.lzout.out - wf.add(add2_task) - # setting multiple output (from both nodes) - wf.set_output([("out_add2", wf.add2.lzout.out), ("out_mult", wf.mult.lzout.out)]) - wf.inputs.x = 2 - wf.inputs.y = 3 - wf.cache_dir = tmpdir - - with Submitter(plugin=plugin) as sub: - sub(wf) - - results = wf.result() - # checking outputs from both nodes - assert 6 == results.output.out_mult - assert 8 == results.output.out_add2 - assert wf.output_dir.exists() - - -def test_wf_2d_outpasdict(plugin, tmpdir): - """workflow with 2 tasks, no splitter - setting multiple outputs using a dictionary - """ - wf = Workflow(name="wf_2", input_spec=["x", "y"]) - wf.add(multiply(name="mult", x=wf.lzin.x, y=wf.lzin.y)) - add2_task = add2(name="add2") - add2_task.inputs.x = wf.mult.lzout.out - wf.add(add2_task) - # setting multiple output (from both nodes) - wf.set_output({"out_add2": wf.add2.lzout.out, "out_mult": wf.mult.lzout.out}) - wf.inputs.x = 2 - wf.inputs.y = 3 - wf.cache_dir = tmpdir - - with Submitter(plugin=plugin) as sub: - sub(wf) - - results = wf.result() - # checking outputs from both nodes - assert 6 == results.output.out_mult - assert 8 == results.output.out_add2 - assert wf.output_dir.exists() - - -@pytest.mark.flaky(reruns=3) # when dask -def test_wf_3(plugin_dask_opt, tmpdir): - """testing None value for an input""" - wf = Workflow(name="wf_3", input_spec=["x", "y"]) - wf.add(fun_addvar_none(name="addvar", a=wf.lzin.x, b=wf.lzin.y)) - wf.add(add2(name="add2", x=wf.addvar.lzout.out)) - wf.set_output([("out", wf.add2.lzout.out)]) - wf.inputs.x = 2 - wf.inputs.y = None - wf.cache_dir = tmpdir - - with Submitter(plugin=plugin_dask_opt) as sub: - sub(wf) - - assert wf.output_dir.exists() - results = wf.result() - assert 4 == results.output.out - - -@pytest.mark.xfail(reason="the task error doesn't propagate") -def test_wf_3a_exception(plugin, tmpdir): - """testinh wf without set input, attr.NOTHING should be set - and the function should raise an exception - """ - wf = Workflow(name="wf_3", input_spec=["x", "y"]) - wf.add(fun_addvar_none(name="addvar", a=wf.lzin.x, b=wf.lzin.y)) - wf.add(add2(name="add2", x=wf.addvar.lzout.out)) - wf.set_output([("out", wf.add2.lzout.out)]) - wf.inputs.x = 2 - wf.inputs.y = attr.NOTHING - wf.plugin = plugin - wf.cache_dir = tmpdir - - 
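# Illustrative sketch (not part of the original patch): the workflow tests below repeatedly
# capture an exception with `pytest.raises(...) as excinfo` and then inspect
# `str(excinfo.value)`; pytest's `match=` argument performs the same message check (as a
# regex search) in one step. Only pytest itself is assumed; `boom` and `test_boom` are
# made-up names for the example.
import pytest

def boom():
    raise TypeError("unsupported operand type(s) for +: 'int' and 'NoneType'")

def test_boom():
    # capture the exception object and inspect the message afterwards ...
    with pytest.raises(TypeError) as excinfo:
        boom()
    assert "unsupported" in str(excinfo.value)

    # ... or let pytest.raises match the message directly
    with pytest.raises(TypeError, match="unsupported"):
        boom()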
with pytest.raises(TypeError) as excinfo: - with Submitter(plugin=plugin) as sub: - sub(wf) - assert "unsupported" in str(excinfo.value) - - -def test_wf_4(plugin, tmpdir): - """wf with a task that doesn't set one input and use the function default value""" - wf = Workflow(name="wf_4", input_spec=["x", "y"]) - wf.add(fun_addvar_default(name="addvar", a=wf.lzin.x)) - wf.add(add2(name="add2", x=wf.addvar.lzout.out)) - wf.set_output([("out", wf.add2.lzout.out)]) - wf.inputs.x = 2 - wf.cache_dir = tmpdir - - with Submitter(plugin=plugin) as sub: - sub(wf) - - assert wf.output_dir.exists() - results = wf.result() - assert 5 == results.output.out - - -def test_wf_4a(plugin, tmpdir): - """wf with a task that doesn't set one input, - the unset input is send to the task input, - so the task should use the function default value - """ - wf = Workflow(name="wf_4a", input_spec=["x", "y"]) - wf.add(fun_addvar_default(name="addvar", a=wf.lzin.x, y=wf.lzin.y)) - wf.add(add2(name="add2", x=wf.addvar.lzout.out)) - wf.set_output([("out", wf.add2.lzout.out)]) - wf.inputs.x = 2 - wf.cache_dir = tmpdir - - with Submitter(plugin=plugin) as sub: - sub(wf) - - assert wf.output_dir.exists() - results = wf.result() - assert 5 == results.output.out - - -def test_wf_5(plugin, tmpdir): - """wf with two outputs connected to the task outputs - one set_output - """ - wf = Workflow(name="wf_5", input_spec=["x", "y"], x=3, y=2) - wf.add(fun_addsubvar(name="addsub", a=wf.lzin.x, b=wf.lzin.y)) - wf.set_output([("out_sum", wf.addsub.lzout.sum), ("out_sub", wf.addsub.lzout.sub)]) - wf.cache_dir = tmpdir - - with Submitter(plugin=plugin) as sub: - sub(wf) - - results = wf.result() - assert 5 == results.output.out_sum - assert 1 == results.output.out_sub - - -def test_wf_5a(plugin, tmpdir): - """wf with two outputs connected to the task outputs, - set_output set twice - """ - wf = Workflow(name="wf_5", input_spec=["x", "y"], x=3, y=2) - wf.add(fun_addsubvar(name="addsub", a=wf.lzin.x, b=wf.lzin.y)) - wf.set_output([("out_sum", wf.addsub.lzout.sum)]) - wf.set_output([("out_sub", wf.addsub.lzout.sub)]) - wf.cache_dir = tmpdir - - with Submitter(plugin=plugin) as sub: - sub(wf) - - results = wf.result() - assert 5 == results.output.out_sum - assert 1 == results.output.out_sub - - -def test_wf_5b_exception(tmpdir): - """set_output used twice with the same name - exception should be raised""" - wf = Workflow(name="wf_5", input_spec=["x", "y"], x=3, y=2) - wf.add(fun_addsubvar(name="addsub", a=wf.lzin.x, b=wf.lzin.y)) - wf.set_output([("out", wf.addsub.lzout.sum)]) - wf.cache_dir = tmpdir - - with pytest.raises(Exception, match="are already set"): - wf.set_output([("out", wf.addsub.lzout.sub)]) - - -def test_wf_6(plugin, tmpdir): - """wf with two tasks and two outputs connected to both tasks, - one set_output - """ - wf = Workflow(name="wf_6", input_spec=["x", "y"], x=2, y=3) - wf.add(multiply(name="mult", x=wf.lzin.x, y=wf.lzin.y)) - wf.add(add2(name="add2", x=wf.mult.lzout.out)) - wf.set_output([("out1", wf.mult.lzout.out), ("out2", wf.add2.lzout.out)]) - wf.cache_dir = tmpdir - - with Submitter(plugin=plugin) as sub: - sub(wf) - - assert wf.output_dir.exists() - results = wf.result() - assert 6 == results.output.out1 - assert 8 == results.output.out2 - - -def test_wf_6a(plugin, tmpdir): - """wf with two tasks and two outputs connected to both tasks, - set_output used twice - """ - wf = Workflow(name="wf_6", input_spec=["x", "y"], x=2, y=3) - wf.add(multiply(name="mult", x=wf.lzin.x, y=wf.lzin.y)) - wf.add(add2(name="add2", 
x=wf.mult.lzout.out)) - wf.set_output([("out1", wf.mult.lzout.out)]) - wf.set_output([("out2", wf.add2.lzout.out)]) - wf.cache_dir = tmpdir - - with Submitter(plugin=plugin) as sub: - sub(wf) - - assert wf.output_dir.exists() - results = wf.result() - assert 6 == results.output.out1 - assert 8 == results.output.out2 - - -def test_wf_st_1(plugin, tmpdir): - """Workflow with one task, a splitter for the workflow""" - wf = Workflow(name="wf_spl_1", input_spec=["x"]) - wf.add(add2(name="add2", x=wf.lzin.x)) - - wf.split("x", x=[1, 2]) - wf.set_output([("out", wf.add2.lzout.out)]) - wf.cache_dir = tmpdir - - checksum_before = wf.checksum - with Submitter(plugin="serial") as sub: - sub(wf) - - assert wf.checksum == checksum_before - results = wf.result() - # expected: [({"test7.x": 1}, 3), ({"test7.x": 2}, 4)] - assert results[0].output.out == 3 - assert results[1].output.out == 4 - # checking all directories - assert wf.output_dir - for odir in wf.output_dir: - assert odir.exists() - - -def test_wf_st_1_call_subm(plugin, tmpdir): - """Workflow with one task, a splitter for the workflow""" - wf = Workflow(name="wf_spl_1", input_spec=["x"]) - wf.add(add2(name="add2", x=wf.lzin.x)) - - wf.split("x", x=[1, 2]) - wf.set_output([("out", wf.add2.lzout.out)]) - wf.cache_dir = tmpdir - - with Submitter(plugin=plugin) as sub: - wf(submitter=sub) - - results = wf.result() - # expected: [({"test7.x": 1}, 3), ({"test7.x": 2}, 4)] - assert results[0].output.out == 3 - assert results[1].output.out == 4 - # checking all directories - assert wf.output_dir - for odir in wf.output_dir: - assert odir.exists() - - -def test_wf_st_1_call_plug(plugin, tmpdir): - """Workflow with one task, a splitter for the workflow - using Workflow.__call__(plugin) - """ - wf = Workflow(name="wf_spl_1", input_spec=["x"]) - wf.add(add2(name="add2", x=wf.lzin.x)) - - wf.split("x", x=[1, 2]) - wf.set_output([("out", wf.add2.lzout.out)]) - wf.cache_dir = tmpdir - - wf(plugin=plugin) - - results = wf.result() - # expected: [({"test7.x": 1}, 3), ({"test7.x": 2}, 4)] - assert results[0].output.out == 3 - assert results[1].output.out == 4 - # checking all directories - assert wf.output_dir - for odir in wf.output_dir: - assert odir.exists() - - -def test_wf_st_1_call_selfplug(plugin, tmpdir): - """Workflow with one task, a splitter for the workflow - using Workflow.__call__() and using self.plugin - """ - wf = Workflow(name="wf_spl_1", input_spec=["x"]) - wf.add(add2(name="add2", x=wf.lzin.x)) - - wf.split("x", x=[1, 2]) - wf.set_output([("out", wf.add2.lzout.out)]) - wf.plugin = plugin - wf.cache_dir = tmpdir - - wf() - results = wf.result() - # expected: [({"test7.x": 1}, 3), ({"test7.x": 2}, 4)] - assert results[0].output.out == 3 - assert results[1].output.out == 4 - # checking all directories - assert wf.output_dir - for odir in wf.output_dir: - assert odir.exists() - - -def test_wf_st_1_call_noplug_nosubm(plugin, tmpdir): - """Workflow with one task, a splitter for the workflow - using Workflow.__call__() without plugin and submitter - (a submitter should be created within the __call__ function) - """ - wf = Workflow(name="wf_spl_1", input_spec=["x"]) - wf.add(add2(name="add2", x=wf.lzin.x)) - - wf.split("x", x=[1, 2]) - wf.set_output([("out", wf.add2.lzout.out)]) - wf.cache_dir = tmpdir - - wf() - results = wf.result() - # expected: [({"test7.x": 1}, 3), ({"test7.x": 2}, 4)] - assert results[0].output.out == 3 - assert results[1].output.out == 4 - # checking all directories - assert wf.output_dir - for odir in wf.output_dir: - assert 
odir.exists() - - -def test_wf_st_1_inp_in_call(tmpdir): - """Defining input in __call__""" - wf = Workflow(name="wf_spl_1", input_spec=["x"], cache_dir=tmpdir).split( - "x", x=[1, 2] - ) - wf.add(add2(name="add2", x=wf.lzin.x)) - wf.set_output([("out", wf.add2.lzout.out)]) - results = wf() - assert results[0].output.out == 3 - assert results[1].output.out == 4 - - -def test_wf_st_1_upd_inp_call(tmpdir): - """Updating input in __call___""" - wf = Workflow(name="wf_spl_1", input_spec=["x"], cache_dir=tmpdir).split( - "x", x=[11, 22] - ) - wf.add(add2(name="add2", x=wf.lzin.x)) - wf.set_output([("out", wf.add2.lzout.out)]) - results = wf(x=[1, 2]) - assert results[0].output.out == 3 - assert results[1].output.out == 4 - - -def test_wf_st_noinput_1(plugin, tmpdir): - """Workflow with one task, a splitter for the workflow""" - wf = Workflow(name="wf_spl_1", input_spec=["x"]) - wf.add(add2(name="add2", x=wf.lzin.x)) - - wf.split("x", x=[]) - wf.set_output([("out", wf.add2.lzout.out)]) - wf.plugin = plugin - wf.cache_dir = tmpdir - - checksum_before = wf.checksum - with Submitter(plugin=plugin) as sub: - sub(wf) - - assert wf.checksum == checksum_before - results = wf.result() - assert results == [] - # checking all directories - assert wf.output_dir == [] - - -def test_wf_ndst_1(plugin, tmpdir): - """workflow with one task, a splitter on the task level""" - wf = Workflow(name="wf_spl_1", input_spec=["x"]) - wf.add(add2(name="add2").split("x", x=wf.lzin.x)) - wf.inputs.x = [1, 2] - wf.set_output([("out", wf.add2.lzout.out)]) - wf.cache_dir = tmpdir - - checksum_before = wf.checksum - with Submitter(plugin=plugin) as sub: - sub(wf) - - assert wf.checksum == checksum_before - results = wf.result() - # expected: [({"test7.x": 1}, 3), ({"test7.x": 2}, 4)] - assert results.output.out == [3, 4] - assert wf.output_dir.exists() - - -def test_wf_ndst_updatespl_1(plugin, tmpdir): - """workflow with one task, - a splitter on the task level is added *after* calling add - """ - wf = Workflow(name="wf_spl_1", input_spec=["x"]) - wf.add(add2(name="add2")) - wf.inputs.x = [1, 2] - wf.add2.split("x", x=wf.lzin.x) - wf.set_output([("out", wf.add2.lzout.out)]) - wf.cache_dir = tmpdir - - with Submitter(plugin=plugin) as sub: - sub(wf) - - results = wf.result() - # expected: [({"test7.x": 1}, 3), ({"test7.x": 2}, 4)] - assert results.output.out == [3, 4] - assert wf.output_dir.exists() - - assert wf.output_dir.exists() - - -def test_wf_ndst_updatespl_1a(plugin, tmpdir): - """workflow with one task (initialize before calling add), - a splitter on the task level is added *after* calling add - """ - wf = Workflow(name="wf_spl_1", input_spec=["x"]) - task_add2 = add2(name="add2", x=wf.lzin.x) - wf.add(task_add2) - task_add2.split("x", x=[1, 2]) - wf.set_output([("out", wf.add2.lzout.out)]) - wf.cache_dir = tmpdir - - with Submitter(plugin=plugin) as sub: - sub(wf) - - results = wf.result() - # expected: [({"test7.x": 1}, 3), ({"test7.x": 2}, 4)] - assert results.output.out == [3, 4] - assert wf.output_dir.exists() - - assert wf.output_dir.exists() - - -def test_wf_ndst_updateinp_1(plugin, tmpdir): - """workflow with one task, - a splitter on the task level, - updating input of the task after calling add - """ - wf = Workflow(name="wf_spl_1", input_spec=["x", "y"]) - wf.add(add2(name="add2", x=wf.lzin.x)) - wf.inputs.x = [1, 2] - wf.inputs.y = [11, 12] - wf.add2.split("x", x=wf.lzin.y) - wf.set_output([("out", wf.add2.lzout.out)]) - wf.cache_dir = tmpdir - - with Submitter(plugin=plugin) as sub: - sub(wf) - - results = 
wf.result() - assert results.output.out == [13, 14] - assert wf.output_dir.exists() - - assert wf.output_dir.exists() - - -def test_wf_ndst_noinput_1(plugin, tmpdir): - """workflow with one task, a splitter on the task level""" - wf = Workflow(name="wf_spl_1", input_spec=["x"]) - wf.add(add2(name="add2").split("x", x=wf.lzin.x)) - wf.inputs.x = [] - wf.set_output([("out", wf.add2.lzout.out)]) - wf.cache_dir = tmpdir - - checksum_before = wf.checksum - with Submitter(plugin=plugin) as sub: - sub(wf) - - assert wf.checksum == checksum_before - results = wf.result() - - assert results.output.out == [] - assert wf.output_dir.exists() - - -def test_wf_st_2(plugin, tmpdir): - """workflow with one task, splitters and combiner for workflow""" - wf = Workflow(name="wf_st_2", input_spec=["x"]) - wf.add(add2(name="add2", x=wf.lzin.x)) - - wf.split("x", x=[1, 2]).combine(combiner="x") - wf.set_output([("out", wf.add2.lzout.out)]) - wf.cache_dir = tmpdir - - with Submitter(plugin=plugin) as sub: - sub(wf) - - results = wf.result() - # expected: [({"test7.x": 1}, 3), ({"test7.x": 2}, 4)] - assert results[0].output.out == 3 - assert results[1].output.out == 4 - # checking all directories - assert wf.output_dir - for odir in wf.output_dir: - assert odir.exists() - - -def test_wf_ndst_2(plugin, tmpdir): - """workflow with one task, splitters and combiner on the task level""" - wf = Workflow(name="wf_ndst_2", input_spec=["x"]) - wf.add(add2(name="add2").split("x", x=wf.lzin.x).combine(combiner="x")) - wf.inputs.x = [1, 2] - wf.set_output([("out", wf.add2.lzout.out)]) - wf.cache_dir = tmpdir - - with Submitter(plugin=plugin) as sub: - sub(wf) - - results = wf.result() - # expected: [({"test7.x": 1}, 3), ({"test7.x": 2}, 4)] - assert results.output.out == [3, 4] - assert wf.output_dir.exists() - - -# workflows with structures A -> B - - -def test_wf_st_3(plugin, tmpdir): - """workflow with 2 tasks, splitter on wf level""" - wf = Workflow(name="wfst_3", input_spec=["x", "y"]) - wf.add(multiply(name="mult", x=wf.lzin.x, y=wf.lzin.y)) - wf.add(add2(name="add2", x=wf.mult.lzout.out)) - wf.split(("x", "y"), x=[1, 2], y=[11, 12]) - wf.set_output([("out", wf.add2.lzout.out)]) - wf.cache_dir = tmpdir - - with Submitter(plugin=plugin) as sub: - sub(wf) - - expected = [ - ({"wfst_3.x": 1, "wfst_3.y": 11}, 13), - ({"wfst_3.x": 2, "wfst_3.y": 12}, 26), - ] - expected_ind = [ - ({"wfst_3.x": 0, "wfst_3.y": 0}, 13), - ({"wfst_3.x": 1, "wfst_3.y": 1}, 26), - ] - - results = wf.result() - for i, res in enumerate(expected): - assert results[i].output.out == res[1] - - # checking the return_inputs option, either return_inputs is True or "val", - # it should give values of inputs that corresponds to the specific element - results_verb = wf.result(return_inputs=True) - results_verb_val = wf.result(return_inputs="val") - for i, res in enumerate(expected): - assert (results_verb[i][0], results_verb[i][1].output.out) == res - assert (results_verb_val[i][0], results_verb_val[i][1].output.out) == res - - # checking the return_inputs option return_inputs="ind" - # it should give indices of inputs (instead of values) for each element - results_verb_ind = wf.result(return_inputs="ind") - for i, res in enumerate(expected_ind): - assert (results_verb_ind[i][0], results_verb_ind[i][1].output.out) == res - - # checking all directories - assert wf.output_dir - for odir in wf.output_dir: - assert odir.exists() - - -def test_wf_ndst_3(plugin, tmpdir): - """Test workflow with 2 tasks, splitter on a task level""" - wf = 
Workflow(name="wf_ndst_3", input_spec=["x", "y"]) - wf.add(multiply(name="mult").split(("x", "y"), x=wf.lzin.x, y=wf.lzin.y)) - wf.add(add2(name="add2", x=wf.mult.lzout.out)) - wf.inputs.x = [1, 2] - wf.inputs.y = [11, 12] - wf.set_output([("out", wf.add2.lzout.out)]) - wf.cache_dir = tmpdir - - with Submitter(plugin=plugin) as sub: - sub(wf) - - results = wf.result() - # expected: [({"test7.x": 1, "test7.y": 11}, 13), ({"test7.x": 2, "test.y": 12}, 26)] - assert results.output.out == [13, 26] - # checking the output directory - assert wf.output_dir.exists() - - -def test_wf_st_4(plugin, tmpdir): - """workflow with two tasks, scalar splitter and combiner for the workflow""" - wf = Workflow(name="wf_st_4", input_spec=["x", "y"]) - wf.add(multiply(name="mult", x=wf.lzin.x, y=wf.lzin.y)) - wf.add(add2(name="add2", x=wf.mult.lzout.out)) - - wf.split(("x", "y"), x=[1, 2], y=[11, 12]) - wf.combine("x") - wf.set_output([("out", wf.add2.lzout.out)]) - wf.cache_dir = tmpdir - - with Submitter(plugin=plugin) as sub: - sub(wf) - - results = wf.result() - # expected: [ - # ({"test7.x": 1, "test7.y": 11}, 13), ({"test7.x": 2, "test.y": 12}, 26) - # ] - assert results[0].output.out == 13 - assert results[1].output.out == 26 - # checking all directories - assert wf.output_dir - for odir in wf.output_dir: - assert odir.exists() - - -def test_wf_ndst_4(plugin, tmpdir): - """workflow with two tasks, scalar splitter and combiner on tasks level""" - wf = Workflow(name="wf_ndst_4", input_spec=["a", "b"]) - wf.add(multiply(name="mult").split(("x", "y"), x=wf.lzin.a, y=wf.lzin.b)) - wf.add(add2(name="add2", x=wf.mult.lzout.out).combine("mult.x")) - - wf.set_output([("out", wf.add2.lzout.out)]) - wf.cache_dir = tmpdir - wf.inputs.a = [1, 2] - wf.inputs.b = [11, 12] - - with Submitter(plugin=plugin) as sub: - sub(wf) - - results = wf.result() - # expected: [ - # ({"test7.x": 1, "test7.y": 11}, 13), ({"test7.x": 2, "test.y": 12}, 26) - # ] - assert results.output.out == [13, 26] - # checking the output directory - assert wf.output_dir.exists() - - -def test_wf_st_5(plugin, tmpdir): - """workflow with two tasks, outer splitter and no combiner""" - wf = Workflow(name="wf_st_5", input_spec=["x", "y"]) - wf.add(multiply(name="mult", x=wf.lzin.x, y=wf.lzin.y)) - wf.add(add2(name="add2", x=wf.mult.lzout.out)) - - wf.split(["x", "y"], x=[1, 2], y=[11, 12]) - wf.set_output([("out", wf.add2.lzout.out)]) - wf.cache_dir = tmpdir - - with Submitter(plugin=plugin) as sub: - sub(wf) - - results = wf.result() - assert results[0].output.out == 13 - assert results[1].output.out == 14 - assert results[2].output.out == 24 - assert results[3].output.out == 26 - # checking all directories - assert wf.output_dir - for odir in wf.output_dir: - assert odir.exists() - - -def test_wf_ndst_5(plugin, tmpdir): - """workflow with two tasks, outer splitter on tasks level and no combiner""" - wf = Workflow(name="wf_ndst_5", input_spec=["x", "y"]) - wf.add(multiply(name="mult").split(["x", "y"], x=wf.lzin.x, y=wf.lzin.y)) - wf.add(add2(name="add2", x=wf.mult.lzout.out)) - wf.inputs.x = [1, 2] - wf.inputs.y = [11, 12] - wf.set_output([("out", wf.add2.lzout.out)]) - wf.cache_dir = tmpdir - - with Submitter(plugin=plugin) as sub: - sub(wf) - - results = wf.result() - assert results.output.out[0] == 13 - assert results.output.out[1] == 14 - assert results.output.out[2] == 24 - assert results.output.out[3] == 26 - # checking the output directory - assert wf.output_dir.exists() - - -def test_wf_st_6(plugin, tmpdir): - """workflow with two tasks, outer 
splitter and combiner for the workflow""" - wf = Workflow(name="wf_st_6", input_spec=["x", "y"]) - wf.add(multiply(name="mult", x=wf.lzin.x, y=wf.lzin.y)) - wf.add(add2(name="add2", x=wf.mult.lzout.out)) - - wf.split(["x", "y"], x=[1, 2, 3], y=[11, 12]) - wf.combine("x") - wf.set_output([("out", wf.add2.lzout.out)]) - wf.cache_dir = tmpdir - - with Submitter(plugin=plugin) as sub: - sub(wf) - - results = wf.result() - assert results[0][0].output.out == 13 - assert results[0][1].output.out == 24 - assert results[0][2].output.out == 35 - assert results[1][0].output.out == 14 - assert results[1][1].output.out == 26 - assert results[1][2].output.out == 38 - # checking all directories - assert wf.output_dir - for odir in wf.output_dir: - assert odir.exists() - - -def test_wf_ndst_6(plugin, tmpdir): - """workflow with two tasks, outer splitter and combiner on tasks level""" - wf = Workflow(name="wf_ndst_6", input_spec=["x", "y"]) - wf.add(multiply(name="mult").split(["x", "y"], x=wf.lzin.x, y=wf.lzin.y)) - wf.add(add2(name="add2", x=wf.mult.lzout.out).combine("mult.x")) - wf.inputs.x = [1, 2, 3] - wf.inputs.y = [11, 12] - wf.set_output([("out", wf.add2.lzout.out)]) - wf.cache_dir = tmpdir - - with Submitter(plugin=plugin) as sub: - sub(wf) - - results = wf.result() - assert results.output.out[0] == [13, 24, 35] - assert results.output.out[1] == [14, 26, 38] - - # checking the output directory - assert wf.output_dir.exists() - - -def test_wf_ndst_7(plugin, tmpdir): - """workflow with two tasks, outer splitter and (full) combiner for first node only""" - wf = Workflow(name="wf_ndst_6", input_spec=["x", "y"]) - wf.add(multiply(name="mult").split("x", x=wf.lzin.x, y=wf.lzin.y).combine("x")) - wf.add(identity(name="iden", x=wf.mult.lzout.out)) - wf.inputs.x = [1, 2, 3] - wf.inputs.y = 11 - wf.set_output([("out", wf.iden.lzout.out)]) - wf.cache_dir = tmpdir - - with Submitter(plugin=plugin) as sub: - sub(wf) - - results = wf.result() - assert results.output.out == [11, 22, 33] - - # checking the output directory - assert wf.output_dir.exists() - - -def test_wf_ndst_8(plugin, tmpdir): - """workflow with two tasks, outer splitter and (partial) combiner for first task only""" - wf = Workflow(name="wf_ndst_6", input_spec=["x", "y"]) - wf.add( - multiply(name="mult").split(["x", "y"], x=wf.lzin.x, y=wf.lzin.y).combine("x") - ) - wf.add(identity(name="iden", x=wf.mult.lzout.out)) - wf.inputs.x = [1, 2, 3] - wf.inputs.y = [11, 12] - wf.set_output([("out", wf.iden.lzout.out)]) - wf.cache_dir = tmpdir - - with Submitter(plugin=plugin) as sub: - sub(wf) - - results = wf.result() - assert results.output.out[0] == [11, 22, 33] - assert results.output.out[1] == [12, 24, 36] - - # checking the output directory - assert wf.output_dir.exists() - - -def test_wf_ndst_9(plugin, tmpdir): - """workflow with two tasks, outer splitter and (full) combiner for first task only""" - wf = Workflow(name="wf_ndst_6", input_spec=["x", "y"]) - wf.add( - multiply(name="mult") - .split(["x", "y"], x=wf.lzin.x, y=wf.lzin.y) - .combine(["x", "y"]) - ) - wf.add(identity(name="iden", x=wf.mult.lzout.out)) - wf.inputs.x = [1, 2, 3] - wf.inputs.y = [11, 12] - wf.set_output([("out", wf.iden.lzout.out)]) - wf.cache_dir = tmpdir - - with Submitter(plugin=plugin) as sub: - sub(wf) - - results = wf.result() - assert results.output.out == [11, 12, 22, 24, 33, 36] - - # checking the output directory - assert wf.output_dir.exists() - - -# workflows with structures A -> B -> C - - -def test_wf_3sernd_ndst_1(plugin, tmpdir): - """workflow with three 
"serial" tasks, checking if the splitter is propagating""" - wf = Workflow(name="wf_3sernd_ndst_1", input_spec=["x", "y"]) - wf.add(multiply(name="mult").split(["x", "y"], x=wf.lzin.x, y=wf.lzin.y)) - wf.add(add2(name="add2_1st", x=wf.mult.lzout.out)) - wf.add(add2(name="add2_2nd", x=wf.add2_1st.lzout.out)) - wf.inputs.x = [1, 2] - wf.inputs.y = [11, 12] - wf.set_output([("out", wf.add2_2nd.lzout.out)]) - wf.cache_dir = tmpdir - - with Submitter(plugin=plugin) as sub: - sub(wf) - - # splitter from the first task should propagate to all tasks, - # splitter_rpn should be the same in all tasks - assert wf.mult.state.splitter == ["mult.x", "mult.y"] - assert wf.add2_1st.state.splitter == "_mult" - assert wf.add2_2nd.state.splitter == "_add2_1st" - assert ( - ["mult.x", "mult.y", "*"] - == wf.mult.state.splitter_rpn - == wf.add2_1st.state.splitter_rpn - == wf.add2_2nd.state.splitter_rpn - ) - - results = wf.result() - assert results.output.out[0] == 15 - assert results.output.out[1] == 16 - assert results.output.out[2] == 26 - assert results.output.out[3] == 28 - # checking the output directory - assert wf.output_dir.exists() - - -def test_wf_3sernd_ndst_1a(plugin, tmpdir): - """ - workflow with three "serial" tasks, checking if the splitter is propagating - first task has a splitter that propagates to the 2nd task, - and the 2nd task is adding one more input to the splitter - """ - wf = Workflow(name="wf_3sernd_ndst_1", input_spec=["x", "y"]) - wf.add(add2(name="add2_1st").split("x", x=wf.lzin.x)) - wf.add(multiply(name="mult", x=wf.add2_1st.lzout.out).split("y", y=wf.lzin.y)) - wf.add(add2(name="add2_2nd", x=wf.mult.lzout.out)) - wf.inputs.x = [1, 2] - wf.inputs.y = [11, 12] - wf.set_output([("out", wf.add2_2nd.lzout.out)]) - wf.cache_dir = tmpdir - - with Submitter(plugin=plugin) as sub: - sub(wf) - - # splitter from the 1st task should propagate and the 2nd task should add one more - # splitter_rpn for the 2nd and the 3rd task should be the same - assert wf.add2_1st.state.splitter == "add2_1st.x" - assert wf.mult.state.splitter == ["_add2_1st", "mult.y"] - assert wf.add2_2nd.state.splitter == "_mult" - assert ( - ["add2_1st.x", "mult.y", "*"] - == wf.mult.state.splitter_rpn - == wf.add2_2nd.state.splitter_rpn - ) - - results = wf.result() - assert results.output.out[0] == 35 - assert results.output.out[1] == 38 - assert results.output.out[2] == 46 - assert results.output.out[3] == 50 - # checking the output directory - assert wf.output_dir.exists() - - -# workflows with structures A -> C, B -> C - - -@pytest.mark.flaky(reruns=3) # when dask -def test_wf_3nd_st_1(plugin_dask_opt, tmpdir): - """workflow with three tasks, third one connected to two previous tasks, - splitter on the workflow level - """ - wf = Workflow(name="wf_st_7", input_spec=["x", "y"]) - wf.add(add2(name="add2x", x=wf.lzin.x)) - wf.add(add2(name="add2y", x=wf.lzin.y)) - wf.add(multiply(name="mult", x=wf.add2x.lzout.out, y=wf.add2y.lzout.out)) - wf.split(["x", "y"], x=[1, 2, 3], y=[11, 12]) - - wf.set_output([("out", wf.mult.lzout.out)]) - wf.cache_dir = tmpdir - - with Submitter(plugin=plugin_dask_opt) as sub: - sub(wf) - - results = wf.result() - assert len(results) == 6 - assert results[0].output.out == 39 - assert results[1].output.out == 42 - assert results[5].output.out == 70 - # checking all directories - assert wf.output_dir - for odir in wf.output_dir: - assert odir.exists() - - -@pytest.mark.flaky(reruns=3) # when dask -def test_wf_3nd_ndst_1(plugin_dask_opt, tmpdir): - """workflow with three tasks, third one 
connected to two previous tasks, - splitter on the tasks levels - """ - wf = Workflow(name="wf_ndst_7", input_spec=["x", "y"]) - wf.add(add2(name="add2x").split("x", x=wf.lzin.x)) - wf.add(add2(name="add2y").split("x", x=wf.lzin.y)) - wf.add(multiply(name="mult", x=wf.add2x.lzout.out, y=wf.add2y.lzout.out)) - wf.inputs.x = [1, 2, 3] - wf.inputs.y = [11, 12] - wf.set_output([("out", wf.mult.lzout.out)]) - wf.cache_dir = tmpdir - - with Submitter(plugin=plugin_dask_opt) as sub: - sub(wf) - - results = wf.result() - assert len(results.output.out) == 6 - assert results.output.out == [39, 42, 52, 56, 65, 70] - # checking the output directory - assert wf.output_dir.exists() - - -def test_wf_3nd_st_2(plugin, tmpdir): - """workflow with three tasks, third one connected to two previous tasks, - splitter and partial combiner on the workflow level - """ - wf = Workflow(name="wf_st_8", input_spec=["x", "y"]) - wf.add(add2(name="add2x", x=wf.lzin.x)) - wf.add(add2(name="add2y", x=wf.lzin.y)) - wf.add(multiply(name="mult", x=wf.add2x.lzout.out, y=wf.add2y.lzout.out)) - wf.split(["x", "y"], x=[1, 2, 3], y=[11, 12]).combine("x") - - wf.set_output([("out", wf.mult.lzout.out)]) - wf.cache_dir = tmpdir - - with Submitter(plugin=plugin) as sub: - sub(wf) - - results = wf.result() - assert len(results) == 2 - assert results[0][0].output.out == 39 - assert results[0][1].output.out == 52 - assert results[0][2].output.out == 65 - assert results[1][0].output.out == 42 - assert results[1][1].output.out == 56 - assert results[1][2].output.out == 70 - # checking all directories - assert wf.output_dir - for odir in wf.output_dir: - assert odir.exists() - - -def test_wf_3nd_ndst_2(plugin, tmpdir): - """workflow with three tasks, third one connected to two previous tasks, - splitter and partial combiner on the tasks levels - """ - wf = Workflow(name="wf_ndst_8", input_spec=["x", "y"]) - wf.add(add2(name="add2x").split("x", x=wf.lzin.x)) - wf.add(add2(name="add2y").split("x", x=wf.lzin.y)) - wf.add( - multiply(name="mult", x=wf.add2x.lzout.out, y=wf.add2y.lzout.out).combine( - "add2x.x" - ) - ) - wf.inputs.x = [1, 2, 3] - wf.inputs.y = [11, 12] - wf.set_output([("out", wf.mult.lzout.out)]) - wf.cache_dir = tmpdir - - with Submitter(plugin="serial") as sub: - sub(wf) - - results = wf.result() - assert len(results.output.out) == 2 - assert results.output.out[0] == [39, 52, 65] - assert results.output.out[1] == [42, 56, 70] - # checking the output directory - assert wf.output_dir.exists() - - -def test_wf_3nd_st_3(plugin, tmpdir): - """workflow with three tasks, third one connected to two previous tasks, - splitter and partial combiner (from the second task) on the workflow level - """ - wf = Workflow(name="wf_st_9", input_spec=["x", "y"]) - wf.add(add2(name="add2x", x=wf.lzin.x)) - wf.add(add2(name="add2y", x=wf.lzin.y)) - wf.add(multiply(name="mult", x=wf.add2x.lzout.out, y=wf.add2y.lzout.out)) - wf.split(["x", "y"], x=[1, 2, 3], y=[11, 12]).combine("y") - - wf.set_output([("out", wf.mult.lzout.out)]) - wf.cache_dir = tmpdir - - with Submitter(plugin=plugin) as sub: - sub(wf) - - results = wf.result() - assert len(results) == 3 - assert results[0][0].output.out == 39 - assert results[0][1].output.out == 42 - assert results[1][0].output.out == 52 - assert results[1][1].output.out == 56 - assert results[2][0].output.out == 65 - assert results[2][1].output.out == 70 - # checking all directories - assert wf.output_dir - for odir in wf.output_dir: - assert odir.exists() - - -def test_wf_3nd_ndst_3(plugin, tmpdir): - 
"""workflow with three tasks, third one connected to two previous tasks, - splitter and partial combiner (from the second task) on the tasks levels - """ - wf = Workflow(name="wf_ndst_9", input_spec=["x", "y"]) - wf.add(add2(name="add2x").split("x", x=wf.lzin.x)) - wf.add(add2(name="add2y").split("x", x=wf.lzin.y)) - wf.add( - multiply(name="mult", x=wf.add2x.lzout.out, y=wf.add2y.lzout.out).combine( - "add2y.x" - ) - ) - wf.inputs.x = [1, 2, 3] - wf.inputs.y = [11, 12] - wf.set_output([("out", wf.mult.lzout.out)]) - wf.cache_dir = tmpdir - - with Submitter(plugin=plugin) as sub: - sub(wf) - - results = wf.result() - assert len(results.output.out) == 3 - assert results.output.out[0] == [39, 42] - assert results.output.out[1] == [52, 56] - assert results.output.out[2] == [65, 70] - # checking the output directory - assert wf.output_dir.exists() - - -def test_wf_3nd_st_4(plugin, tmpdir): - """workflow with three tasks, third one connected to two previous tasks, - splitter and full combiner on the workflow level - """ - wf = Workflow(name="wf_st_10", input_spec=["x", "y"]) - wf.add(add2(name="add2x", x=wf.lzin.x)) - wf.add(add2(name="add2y", x=wf.lzin.y)) - wf.add(multiply(name="mult", x=wf.add2x.lzout.out, y=wf.add2y.lzout.out)) - wf.split(["x", "y"], x=[1, 2, 3], y=[11, 12]).combine(["x", "y"]) - wf.set_output([("out", wf.mult.lzout.out)]) - wf.plugin = plugin - wf.cache_dir = tmpdir - - with Submitter(plugin=plugin) as sub: - sub(wf) - - results = wf.result() - assert len(results) == 6 - assert results[0].output.out == 39 - assert results[1].output.out == 42 - assert results[2].output.out == 52 - assert results[3].output.out == 56 - assert results[4].output.out == 65 - assert results[5].output.out == 70 - # checking all directories - assert wf.output_dir - for odir in wf.output_dir: - assert odir.exists() - - -def test_wf_3nd_ndst_4(plugin, tmpdir): - """workflow with three tasks, third one connected to two previous tasks, - splitter and full combiner on the tasks levels - """ - wf = Workflow(name="wf_ndst_10", input_spec=["x", "y"]) - wf.add(add2(name="add2x").split("x", x=wf.lzin.x)) - wf.add(add2(name="add2y").split("x", x=wf.lzin.y)) - wf.add( - multiply(name="mult", x=wf.add2x.lzout.out, y=wf.add2y.lzout.out).combine( - ["add2x.x", "add2y.x"] - ) - ) - wf.inputs.x = [1, 2, 3] - wf.inputs.y = [11, 12] - wf.set_output([("out", wf.mult.lzout.out)]) - wf.cache_dir = tmpdir - - with Submitter(plugin=plugin) as sub: - sub(wf) - # assert wf.output_dir.exists() - results = wf.result() - - assert len(results.output.out) == 6 - assert results.output.out == [39, 42, 52, 56, 65, 70] - # checking the output directory - assert wf.output_dir.exists() - - -def test_wf_3nd_st_5(plugin, tmpdir): - """workflow with three tasks (A->C, B->C) and three fields in the splitter, - splitter and partial combiner (from the second task) on the workflow level - """ - wf = Workflow(name="wf_st_9", input_spec=["x", "y", "z"]) - wf.add(add2(name="add2x", x=wf.lzin.x)) - wf.add(add2(name="add2y", x=wf.lzin.y)) - wf.add( - fun_addvar3( - name="addvar", a=wf.add2x.lzout.out, b=wf.add2y.lzout.out, c=wf.lzin.z - ) - ) - wf.split(["x", "y", "z"], x=[2, 3], y=[11, 12], z=[10, 100]).combine("y") - - wf.set_output([("out", wf.addvar.lzout.out)]) - wf.plugin = plugin - wf.cache_dir = tmpdir - - with Submitter(plugin=plugin) as sub: - sub(wf) - - results = wf.result() - assert len(results) == 4 - assert results[0][0].output.out == 27 - assert results[0][1].output.out == 28 - assert results[1][0].output.out == 117 - assert 
results[1][1].output.out == 118 - assert results[2][0].output.out == 28 - assert results[2][1].output.out == 29 - assert results[3][0].output.out == 118 - assert results[3][1].output.out == 119 - - # checking all directories - assert wf.output_dir - for odir in wf.output_dir: - assert odir.exists() - - -def test_wf_3nd_ndst_5(plugin, tmpdir): - """workflow with three tasks (A->C, B->C) and three fields in the splitter, - all tasks have splitters and the last one has a partial combiner (from the 2nd) - """ - wf = Workflow(name="wf_st_9", input_spec=["x", "y", "z"]) - wf.add(add2(name="add2x").split("x", x=wf.lzin.x)) - wf.add(add2(name="add2y").split("x", x=wf.lzin.y)) - wf.add( - fun_addvar3(name="addvar", a=wf.add2x.lzout.out, b=wf.add2y.lzout.out) - .split("c", c=wf.lzin.z) - .combine("add2x.x") - ) - wf.inputs.x = [2, 3] - wf.inputs.y = [11, 12] - wf.inputs.z = [10, 100] - - wf.set_output([("out", wf.addvar.lzout.out)]) - wf.cache_dir = tmpdir - - with Submitter(plugin=plugin) as sub: - sub(wf) - - results = wf.result() - assert len(results.output.out) == 4 - assert results.output.out[0] == [27, 28] - assert results.output.out[1] == [117, 118] - assert results.output.out[2] == [28, 29] - assert results.output.out[3] == [118, 119] - - # checking all directories - assert wf.output_dir.exists() - - -def test_wf_3nd_ndst_6(plugin, tmpdir): - """workflow with three tasks, third one connected to two previous tasks, - the third one uses scalar splitter from the previous ones and a combiner - """ - wf = Workflow(name="wf_ndst_9", input_spec=["x", "y"]) - wf.add(add2(name="add2x").split("x", x=wf.lzin.x)) - wf.add(add2(name="add2y").split("x", x=wf.lzin.y)) - wf.add( - multiply(name="mult", x=wf.add2x.lzout.out, y=wf.add2y.lzout.out) - .split(("_add2x", "_add2y")) - .combine("add2y.x") - ) - wf.inputs.x = [1, 2] - wf.inputs.y = [11, 12] - wf.set_output([("out", wf.mult.lzout.out)]) - wf.cache_dir = tmpdir - - with Submitter(plugin=plugin) as sub: - sub(wf) - - results = wf.result() - assert results.output.out == [39, 56] - # checking the output directory - assert wf.output_dir.exists() - - -def test_wf_3nd_ndst_7(plugin, tmpdir): - """workflow with three tasks, third one connected to two previous tasks, - the third one uses scalar splitter from the previous ones - """ - wf = Workflow(name="wf_ndst_9", input_spec=["x"]) - wf.add(add2(name="add2x").split("x", x=wf.lzin.x)) - wf.add(add2(name="add2y").split("x", x=wf.lzin.x)) - wf.add( - multiply(name="mult", x=wf.add2x.lzout.out, y=wf.add2y.lzout.out).split( - ("_add2x", "_add2y") - ) - ) - wf.inputs.x = [1, 2] - wf.set_output([("out", wf.mult.lzout.out)]) - wf.cache_dir = tmpdir - - with Submitter(plugin=plugin) as sub: - sub(wf) - - results = wf.result() - assert results.output.out == [9, 16] - # checking the output directory - assert wf.output_dir.exists() - - -# workflows with structures A -> B -> C with multiple connections - - -def test_wf_3nd_8(tmpdir): - """workflow with three tasks A->B->C vs two tasks A->C with multiple connections""" - wf = Workflow(name="wf", input_spec=["zip"], cache_dir=tmpdir) - wf.inputs.zip = [["test1", "test3", "test5"], ["test2", "test4", "test6"]] - - wf.add(identity_2flds(name="iden2flds_1", x2="Hoi").split("x1", x1=wf.lzin.zip)) - - wf.add(identity(name="identity", x=wf.iden2flds_1.lzout.out1)) - - wf.add( - identity_2flds( - name="iden2flds_2", x1=wf.identity.lzout.out, x2=wf.iden2flds_1.lzout.out2 - ) - ) - - wf.add( - identity_2flds( - name="iden2flds_2a", - x1=wf.iden2flds_1.lzout.out1, - 
x2=wf.iden2flds_1.lzout.out2, - ) - ) - - wf.set_output( - [ - ("out1", wf.iden2flds_2.lzout.out1), - ("out2", wf.iden2flds_2.lzout.out2), - ("out1a", wf.iden2flds_2a.lzout.out1), - ("out2a", wf.iden2flds_2a.lzout.out2), - ] - ) - - with Submitter(plugin="cf") as sub: - sub(wf) - - res = wf.result() - - assert ( - res.output.out1 - == res.output.out1a - == [["test1", "test3", "test5"], ["test2", "test4", "test6"]] - ) - assert res.output.out2 == res.output.out2a == ["Hoi", "Hoi"] - - -# workflows with Left and Right part in splitters A -> B (L&R parts of the splitter) - - -def test_wf_ndstLR_1(plugin, tmpdir): - """Test workflow with 2 tasks, splitters on tasks levels - The second task has its own simple splitter - and the Left part from the first task should be added - """ - wf = Workflow(name="wf_ndst_3", input_spec=["x", "y"]) - wf.add(add2(name="add2").split("x", x=wf.lzin.x)) - wf.add(multiply(name="mult", x=wf.add2.lzout.out).split("y", y=wf.lzin.y)) - wf.inputs.x = [1, 2] - wf.inputs.y = [11, 12] - wf.set_output([("out", wf.mult.lzout.out)]) - wf.cache_dir = tmpdir - - with Submitter(plugin=plugin) as sub: - sub(wf) - - # checking if the splitter is created properly - assert wf.mult.state.splitter == ["_add2", "mult.y"] - assert wf.mult.state.splitter_rpn == ["add2.x", "mult.y", "*"] - - results = wf.result() - # expected: [({"add2.x": 1, "mult.y": 11}, 33), ({"add2.x": 1, "mult.y": 12}, 36), - # ({"add2.x": 2, "mult.y": 11}, 44), ({"add2.x": 2, "mult.y": 12}, 48)] - assert results.output.out == [33, 36, 44, 48] - # checking the output directory - assert wf.output_dir.exists() - - -def test_wf_ndstLR_1a(plugin, tmpdir): - """Test workflow with 2 tasks, splitters on tasks levels - The second task has splitter that has Left part (from previous state) - and the Right part (it's own splitter) - """ - wf = Workflow(name="wf_ndst_3", input_spec=["x", "y"]) - wf.add(add2(name="add2").split("x", x=wf.lzin.x)) - wf.add( - multiply(name="mult").split(["_add2", "y"], x=wf.add2.lzout.out, y=wf.lzin.y) - ) - wf.inputs.x = [1, 2] - wf.inputs.y = [11, 12] - wf.set_output([("out", wf.mult.lzout.out)]) - wf.cache_dir = tmpdir - - with Submitter(plugin=plugin) as sub: - sub(wf) - - # checking if the splitter is created properly - assert wf.mult.state.splitter == ["_add2", "mult.y"] - assert wf.mult.state.splitter_rpn == ["add2.x", "mult.y", "*"] - - results = wf.result() - # expected: [({"add2.x": 1, "mult.y": 11}, 33), ({"add2.x": 1, "mult.y": 12}, 36), - # ({"add2.x": 2, "mult.y": 11}, 44), ({"add2.x": 2, "mult.y": 12}, 48)] - assert results.output.out == [33, 36, 44, 48] - # checking the output directory - assert wf.output_dir.exists() - - -def test_wf_ndstLR_2(plugin, tmpdir): - """Test workflow with 2 tasks, splitters on tasks levels - The second task has its own outer splitter - and the Left part from the first task should be added - """ - wf = Workflow(name="wf_ndst_3", input_spec=["x", "y", "z"]) - wf.add(add2(name="add2").split("x", x=wf.lzin.x)) - wf.add( - fun_addvar3(name="addvar", a=wf.add2.lzout.out).split( - ["b", "c"], b=wf.lzin.y, c=wf.lzin.z - ) - ) - wf.inputs.x = [1, 2, 3] - wf.inputs.y = [10, 20] - wf.inputs.z = [100, 200] - wf.set_output([("out", wf.addvar.lzout.out)]) - wf.cache_dir = tmpdir - - with Submitter(plugin=plugin) as sub: - sub(wf) - - # checking if the splitter is created properly - assert wf.addvar.state.splitter == ["_add2", ["addvar.b", "addvar.c"]] - assert wf.addvar.state.splitter_rpn == ["add2.x", "addvar.b", "addvar.c", "*", "*"] - - results = wf.result() - # 
expected: [({"add2.x": 1, "mult.b": 10, "mult.c": 100}, 113), - # ({"add2.x": 1, "mult.b": 10, "mult.c": 200}, 213), - # ({"add2.x": 1, "mult.b": 20, "mult.c": 100}, 123), - # ({"add2.x": 1, "mult.b": 20, "mult.c": 200}, 223), - # ...] - assert results.output.out == [ - 113, - 213, - 123, - 223, - 114, - 214, - 124, - 224, - 115, - 215, - 125, - 225, - ] - # checking the output directory - assert wf.output_dir.exists() - - -def test_wf_ndstLR_2a(plugin, tmpdir): - """Test workflow with 2 tasks, splitters on tasks levels - The second task has splitter that has Left part (from previous state) - and the Right part (it's own outer splitter) - """ - wf = Workflow(name="wf_ndst_3", input_spec=["x", "y", "z"]) - wf.add(add2(name="add2").split("x", x=wf.lzin.x)) - wf.add( - fun_addvar3(name="addvar", a=wf.add2.lzout.out).split( - ["_add2", ["b", "c"]], b=wf.lzin.y, c=wf.lzin.z - ) - ) - wf.inputs.x = [1, 2, 3] - wf.inputs.y = [10, 20] - wf.inputs.z = [100, 200] - wf.set_output([("out", wf.addvar.lzout.out)]) - wf.cache_dir = tmpdir - - with Submitter(plugin=plugin) as sub: - sub(wf) - - # checking if the splitter is created properly - assert wf.addvar.state.splitter == ["_add2", ["addvar.b", "addvar.c"]] - assert wf.addvar.state.splitter_rpn == ["add2.x", "addvar.b", "addvar.c", "*", "*"] - - results = wf.result() - # expected: [({"add2.x": 1, "mult.b": 10, "mult.c": 100}, 113), - # ({"add2.x": 1, "mult.b": 10, "mult.c": 200}, 213), - # ({"add2.x": 1, "mult.b": 20, "mult.c": 100}, 123), - # ({"add2.x": 1, "mult.b": 20, "mult.c": 200}, 223), - # ...] - assert results.output.out == [ - 113, - 213, - 123, - 223, - 114, - 214, - 124, - 224, - 115, - 215, - 125, - 225, - ] - # checking the output directory - assert wf.output_dir.exists() - - -# workflows with inner splitters A -> B (inner spl) - - -def test_wf_ndstinner_1(plugin, tmpdir): - """workflow with 2 tasks, - the second task has inner splitter - """ - wf = Workflow(name="wf_st_3", input_spec={"x": int}) - wf.add(list_output(name="list", x=wf.lzin.x)) - wf.add(add2(name="add2").split("x", x=wf.list.lzout.out)) - wf.inputs.x = 1 - wf.set_output([("out_list", wf.list.lzout.out), ("out", wf.add2.lzout.out)]) - wf.cache_dir = tmpdir - - with Submitter(plugin=plugin) as sub: - sub(wf) - - assert wf.add2.state.splitter == "add2.x" - assert wf.add2.state.splitter_rpn == ["add2.x"] - - results = wf.result() - assert results.output.out_list == [1, 2, 3] - assert results.output.out == [3, 4, 5] - - assert wf.output_dir.exists() - - -def test_wf_ndstinner_2(plugin, tmpdir): - """workflow with 2 tasks, - the second task has two inputs and inner splitter from one of the input - """ - wf = Workflow(name="wf_st_3", input_spec=["x", "y"]) - wf.add(list_output(name="list", x=wf.lzin.x)) - wf.add(multiply(name="mult", y=wf.lzin.y).split("x", x=wf.list.lzout.out)) - wf.inputs.x = 1 - wf.inputs.y = 10 - wf.set_output([("out_list", wf.list.lzout.out), ("out", wf.mult.lzout.out)]) - wf.cache_dir = tmpdir - - with Submitter(plugin=plugin) as sub: - sub(wf) - - assert wf.mult.state.splitter == "mult.x" - assert wf.mult.state.splitter_rpn == ["mult.x"] - - results = wf.result() - assert results.output.out_list == [1, 2, 3] - assert results.output.out == [10, 20, 30] - - assert wf.output_dir.exists() - - -def test_wf_ndstinner_3(plugin, tmpdir): - """workflow with 2 tasks, - the second task has two inputs and outer splitter that includes an inner field - """ - wf = Workflow(name="wf_st_3", input_spec=["x", "y"]) - wf.add(list_output(name="list", x=wf.lzin.x)) - 
wf.add(multiply(name="mult").split(["x", "y"], x=wf.list.lzout.out, y=wf.lzin.y)) - wf.inputs.x = 1 - wf.inputs.y = [10, 100] - wf.set_output([("out_list", wf.list.lzout.out), ("out", wf.mult.lzout.out)]) - wf.cache_dir = tmpdir - - with Submitter(plugin=plugin) as sub: - sub(wf) - - assert wf.mult.state.splitter == ["mult.x", "mult.y"] - assert wf.mult.state.splitter_rpn == ["mult.x", "mult.y", "*"] - - results = wf.result() - assert results.output.out_list == [1, 2, 3] - assert results.output.out == [10, 100, 20, 200, 30, 300] - - assert wf.output_dir.exists() - - -def test_wf_ndstinner_4(plugin, tmpdir): - """workflow with 3 tasks, - the second task has two inputs and inner splitter from one of the input, - the third task has no its own splitter - """ - wf = Workflow(name="wf_st_3", input_spec=["x", "y"]) - wf.add(list_output(name="list", x=wf.lzin.x)) - wf.add(multiply(name="mult", y=wf.lzin.y).split("x", x=wf.list.lzout.out)) - wf.add(add2(name="add2", x=wf.mult.lzout.out)) - wf.inputs.x = 1 - wf.inputs.y = 10 - wf.set_output([("out_list", wf.list.lzout.out), ("out", wf.add2.lzout.out)]) - wf.cache_dir = tmpdir - - with Submitter(plugin=plugin) as sub: - sub(wf) - - assert wf.mult.state.splitter == "mult.x" - assert wf.mult.state.splitter_rpn == ["mult.x"] - assert wf.add2.state.splitter == "_mult" - assert wf.add2.state.splitter_rpn == ["mult.x"] - - results = wf.result() - assert results.output.out_list == [1, 2, 3] - assert results.output.out == [12, 22, 32] - - assert wf.output_dir.exists() - - -def test_wf_ndstinner_5(plugin, tmpdir): - """workflow with 3 tasks, - the second task has two inputs and inner splitter from one of the input, - (inner input come from the first task that has its own splitter, - there is a inner_cont_dim) - the third task has no new splitter - """ - wf = Workflow(name="wf_5", input_spec=["x", "y", "b"]) - wf.add(list_output(name="list").split("x", x=wf.lzin.x)) - wf.add(multiply(name="mult").split(["y", "x"], x=wf.list.lzout.out, y=wf.lzin.y)) - wf.add(fun_addvar(name="addvar", a=wf.mult.lzout.out).split("b", b=wf.lzin.b)) - wf.inputs.x = [1, 2] - wf.inputs.y = [10, 100] - wf.inputs.b = [3, 5] - - wf.set_output( - [ - ("out_list", wf.list.lzout.out), - ("out_mult", wf.mult.lzout.out), - ("out_add", wf.addvar.lzout.out), - ] - ) - wf.cache_dir = tmpdir - - with Submitter(plugin=plugin) as sub: - sub(wf) - - assert wf.mult.state.splitter == ["_list", ["mult.y", "mult.x"]] - assert wf.mult.state.splitter_rpn == ["list.x", "mult.y", "mult.x", "*", "*"] - assert wf.addvar.state.splitter == ["_mult", "addvar.b"] - assert wf.addvar.state.splitter_rpn == [ - "list.x", - "mult.y", - "mult.x", - "*", - "*", - "addvar.b", - "*", - ] - - results = wf.result() - assert results.output.out_list == [[1, 2, 3], [2, 4, 6]] - assert results.output.out_mult == [ - 10, - 20, - 30, - 20, - 40, - 60, - 100, - 200, - 300, - 200, - 400, - 600, - ] - assert results.output.out_add == [ - 13, - 15, - 23, - 25, - 33, - 35, - 23, - 25, - 43, - 45, - 63, - 65, - 103, - 105, - 203, - 205, - 303, - 305, - 203, - 205, - 403, - 405, - 603, - 605, - ] - - assert wf.output_dir.exists() - - -# workflow that have some single values as the input - - -def test_wf_st_singl_1(plugin, tmpdir): - """workflow with two tasks, only one input is in the splitter and combiner""" - wf = Workflow(name="wf_st_5", input_spec=["x", "y"]) - wf.add(multiply(name="mult", x=wf.lzin.x, y=wf.lzin.y)) - wf.add(add2(name="add2", x=wf.mult.lzout.out)) - - wf.split("x", x=[1, 2], y=11) - wf.combine("x") - 
wf.set_output([("out", wf.add2.lzout.out)]) - wf.cache_dir = tmpdir - - with Submitter(plugin=plugin) as sub: - sub(wf) - - results = wf.result() - assert results[0].output.out == 13 - assert results[1].output.out == 24 - # checking all directories - assert wf.output_dir - for odir in wf.output_dir: - assert odir.exists() - - -def test_wf_ndst_singl_1(plugin, tmpdir): - """workflow with two tasks, outer splitter and combiner on tasks level; - only one input is part of the splitter, the other is a single value - """ - wf = Workflow(name="wf_ndst_5", input_spec=["x", "y"]) - wf.add(multiply(name="mult", y=wf.lzin.y).split("x", x=wf.lzin.x)) - wf.add(add2(name="add2", x=wf.mult.lzout.out).combine("mult.x")) - wf.inputs.x = [1, 2] - wf.inputs.y = 11 - wf.set_output([("out", wf.add2.lzout.out)]) - wf.cache_dir = tmpdir - - with Submitter(plugin=plugin) as sub: - sub(wf) - - results = wf.result() - assert results.output.out == [13, 24] - # checking the output directory - assert wf.output_dir.exists() - - -def test_wf_st_singl_2(plugin, tmpdir): - """workflow with three tasks, third one connected to two previous tasks, - splitter on the workflow level - only one input is part of the splitter, the other is a single value - """ - wf = Workflow(name="wf_st_6", input_spec=["x", "y"]) - wf.add(add2(name="add2x", x=wf.lzin.x)) - wf.add(add2(name="add2y", x=wf.lzin.y)) - wf.add(multiply(name="mult", x=wf.add2x.lzout.out, y=wf.add2y.lzout.out)) - wf.split("x", x=[1, 2, 3], y=11) - - wf.set_output([("out", wf.mult.lzout.out)]) - wf.cache_dir = tmpdir - - with Submitter(plugin=plugin) as sub: - sub(wf) - - results = wf.result() - assert len(results) == 3 - assert results[0].output.out == 39 - assert results[1].output.out == 52 - assert results[2].output.out == 65 - # checking all directories - assert wf.output_dir - for odir in wf.output_dir: - assert odir.exists() - - -def test_wf_ndst_singl_2(plugin, tmpdir): - """workflow with three tasks, third one connected to two previous tasks, - splitter on the tasks levels - only one input is part of the splitter, the other is a single value - """ - wf = Workflow(name="wf_ndst_6", input_spec=["x", "y"]) - wf.add(add2(name="add2x").split("x", x=wf.lzin.x)) - wf.add(add2(name="add2y", x=wf.lzin.y)) - wf.add(multiply(name="mult", x=wf.add2x.lzout.out, y=wf.add2y.lzout.out)) - wf.inputs.x = [1, 2, 3] - wf.inputs.y = 11 - wf.set_output([("out", wf.mult.lzout.out)]) - wf.cache_dir = tmpdir - - with Submitter(plugin=plugin) as sub: - sub(wf) - - results = wf.result() - assert len(results.output.out) == 3 - assert results.output.out == [39, 52, 65] - # checking the output directory - assert wf.output_dir.exists() - - -# workflows with structures wf(A) - - -def test_wfasnd_1(plugin, tmpdir): - """workflow as a node - workflow-node with one task and no splitter - """ - wfnd = Workflow(name="wfnd", input_spec=["x"]) - wfnd.add(add2(name="add2", x=wfnd.lzin.x)) - wfnd.set_output([("out", wfnd.add2.lzout.out)]) - wfnd.inputs.x = 2 - - wf = Workflow(name="wf", input_spec=["x"]) - wf.add(wfnd) - wf.set_output([("out", wf.wfnd.lzout.out)]) - wf.cache_dir = tmpdir - - with Submitter(plugin=plugin) as sub: - sub(wf) - - results = wf.result() - assert results.output.out == 4 - # checking the output directory - assert wf.output_dir.exists() - - -def test_wfasnd_wfinp_1(plugin, tmpdir): - """workflow as a node - workflow-node with one task and no splitter - input set for the main workflow - """ - wf = Workflow(name="wf", input_spec=["x"]) - wfnd = Workflow(name="wfnd", 
input_spec=["x"], x=wf.lzin.x) - wfnd.add(add2(name="add2", x=wfnd.lzin.x)) - wfnd.set_output([("out", wfnd.add2.lzout.out)]) - - wf.add(wfnd) - wf.inputs.x = 2 - wf.set_output([("out", wf.wfnd.lzout.out)]) - wf.cache_dir = tmpdir - - checksum_before = wf.checksum - with Submitter(plugin=plugin) as sub: - sub(wf) - - assert wf.checksum == checksum_before - results = wf.result() - assert results.output.out == 4 - # checking the output directory - assert wf.output_dir.exists() - - -def test_wfasnd_wfndupdate(plugin, tmpdir): - """workflow as a node - workflow-node with one task and no splitter - wfasnode input is updated to use the main workflow input - """ - - wfnd = Workflow(name="wfnd", input_spec=["x"], x=2) - wfnd.add(add2(name="add2", x=wfnd.lzin.x)) - wfnd.set_output([("out", wfnd.add2.lzout.out)]) - - wf = Workflow(name="wf", input_spec=["x"], x=3) - wfnd.inputs.x = wf.lzin.x - wf.add(wfnd) - wf.set_output([("out", wf.wfnd.lzout.out)]) - wf.cache_dir = tmpdir - - with Submitter(plugin=plugin) as sub: - sub(wf) - - results = wf.result() - assert results.output.out == 5 - assert wf.output_dir.exists() - - -def test_wfasnd_wfndupdate_rerun(plugin, tmpdir): - """workflow as a node - workflow-node with one task and no splitter - wfasnode is run first and later is - updated to use the main workflow input - """ - - wfnd = Workflow(name="wfnd", input_spec=["x"], x=2) - wfnd.add(add2(name="add2", x=wfnd.lzin.x)) - wfnd.set_output([("out", wfnd.add2.lzout.out)]) - wfnd.cache_dir = tmpdir - with Submitter(plugin=plugin) as sub: - sub(wfnd) - - wf = Workflow(name="wf", input_spec=["x"], x=3) - # trying to set before - wfnd.inputs.x = wf.lzin.x - wf.add(wfnd) - # trying to set after add... - wf.wfnd.inputs.x = wf.lzin.x - wf.set_output([("out", wf.wfnd.lzout.out)]) - wf.cache_dir = tmpdir - - with Submitter(plugin=plugin) as sub: - sub(wf) - - results = wf.result() - assert results.output.out == 5 - assert wf.output_dir.exists() - - # adding another layer of workflow - wf_o = Workflow(name="wf_o", input_spec=["x"], x=4) - wf.inputs.x = wf_o.lzin.x - wf_o.add(wf) - wf_o.set_output([("out", wf_o.wf.lzout.out)]) - wf_o.cache_dir = tmpdir - - with Submitter(plugin=plugin) as sub: - sub(wf_o) - - results = wf_o.result() - assert results.output.out == 6 - assert wf_o.output_dir.exists() - - -def test_wfasnd_st_1(plugin, tmpdir): - """workflow as a node - workflow-node with one task, - splitter for wfnd - """ - wfnd = Workflow(name="wfnd", input_spec=["x"]) - wfnd.add(add2(name="add2", x=wfnd.lzin.x)) - wfnd.set_output([("out", wfnd.add2.lzout.out)]) - wfnd.split("x", x=[2, 4]) - - wf = Workflow(name="wf", input_spec=["x"]) - wf.add(wfnd) - wf.set_output([("out", wf.wfnd.lzout.out)]) - wf.cache_dir = tmpdir - - checksum_before = wf.checksum - with Submitter(plugin=plugin) as sub: - sub(wf) - - assert wf.checksum == checksum_before - results = wf.result() - assert results.output.out == [4, 6] - # checking the output directory - assert wf.output_dir.exists() - - -def test_wfasnd_st_updatespl_1(plugin, tmpdir): - """workflow as a node - workflow-node with one task, - splitter for wfnd is set after add - """ - wfnd = Workflow(name="wfnd", input_spec=["x"]) - wfnd.add(add2(name="add2", x=wfnd.lzin.x)) - wfnd.set_output([("out", wfnd.add2.lzout.out)]) - - wf = Workflow(name="wf", input_spec=["x"]) - wf.add(wfnd) - wfnd.split("x", x=[2, 4]) - wf.set_output([("out", wf.wfnd.lzout.out)]) - wf.cache_dir = tmpdir - - with Submitter(plugin=plugin) as sub: - sub(wf) - - results = wf.result() - assert 
results.output.out == [4, 6] - # checking the output directory - assert wf.output_dir.exists() - - -def test_wfasnd_ndst_1(plugin, tmpdir): - """workflow as a node - workflow-node with one task, - splitter for node - """ - wfnd = Workflow(name="wfnd", input_spec=["x"]) - wfnd.add(add2(name="add2").split("x", x=wfnd.lzin.x)) - wfnd.set_output([("out", wfnd.add2.lzout.out)]) - # TODO: without this the test is failing - wfnd.plugin = plugin - wfnd.inputs.x = [2, 4] - - wf = Workflow(name="wf", input_spec=["x"]) - wf.add(wfnd) - wf.set_output([("out", wf.wfnd.lzout.out)]) - wf.cache_dir = tmpdir - - with Submitter(plugin=plugin) as sub: - sub(wf) - - results = wf.result() - assert results.output.out == [4, 6] - # checking the output directory - assert wf.output_dir.exists() - - -def test_wfasnd_ndst_updatespl_1(plugin, tmpdir): - """workflow as a node - workflow-node with one task, - splitter for node added after add - """ - wfnd = Workflow(name="wfnd", input_spec=["x"]) - wfnd.add(add2(name="add2", x=wfnd.lzin.x)) - wfnd.add2.split("x", x=[2, 4]) - wfnd.set_output([("out", wfnd.add2.lzout.out)]) - - wf = Workflow(name="wf", input_spec=["x"]) - wf.add(wfnd) - wf.set_output([("out", wf.wfnd.lzout.out)]) - wf.cache_dir = tmpdir - - with Submitter(plugin=plugin) as sub: - sub(wf) - - results = wf.result() - assert results.output.out == [4, 6] - # checking the output directory - assert wf.output_dir.exists() - - -def test_wfasnd_wfst_1(plugin, tmpdir): - """workflow as a node - workflow-node with one task, - splitter for the main workflow - """ - wf = Workflow(name="wf", input_spec=["x"], cache_dir=tmpdir) - wfnd = Workflow(name="wfnd", input_spec=["x"], x=wf.lzin.x) - wfnd.add(add2(name="add2", x=wfnd.lzin.x)) - wfnd.set_output([("out", wfnd.add2.lzout.out)]) - - wf.add(wfnd) - wf.split("x", x=[2, 4]) - wf.set_output([("out", wf.wfnd.lzout.out)]) - - with Submitter(plugin=plugin) as sub: - sub(wf) - # assert wf.output_dir.exists() - results = wf.result() - assert results[0].output.out == 4 - assert results[1].output.out == 6 - # checking all directories - assert wf.output_dir - for odir in wf.output_dir: - assert odir.exists() - - -# workflows with structures wf(A) -> B - - -def test_wfasnd_st_2(plugin, tmpdir): - """workflow as a node, - the main workflow has two tasks, - splitter for wfnd - """ - wfnd = Workflow(name="wfnd", input_spec=["x", "y"]) - wfnd.add(multiply(name="mult", x=wfnd.lzin.x, y=wfnd.lzin.y)) - wfnd.set_output([("out", wfnd.mult.lzout.out)]) - wfnd.split(("x", "y"), x=[2, 4], y=[1, 10]) - - wf = Workflow(name="wf_st_3", input_spec=["x", "y"]) - wf.add(wfnd) - wf.add(add2(name="add2", x=wf.wfnd.lzout.out)) - wf.set_output([("out", wf.add2.lzout.out)]) - wf.cache_dir = tmpdir - - with Submitter(plugin=plugin) as sub: - sub(wf) - # assert wf.output_dir.exists() - results = wf.result() - assert results.output.out == [4, 42] - # checking the output directory - assert wf.output_dir.exists() - - -def test_wfasnd_wfst_2(plugin, tmpdir): - """workflow as a node, - the main workflow has two tasks, - splitter for the main workflow - """ - wf = Workflow(name="wf_st_3", input_spec=["x", "y"]) - wfnd = Workflow(name="wfnd", input_spec=["x", "y"], x=wf.lzin.x, y=wf.lzin.y) - wfnd.add(multiply(name="mult", x=wfnd.lzin.x, y=wfnd.lzin.y)) - wfnd.set_output([("out", wfnd.mult.lzout.out)]) - - wf.add(wfnd) - wf.add(add2(name="add2", x=wf.wfnd.lzout.out)) - wf.split(("x", "y"), x=[2, 4], y=[1, 10]) - wf.set_output([("out", wf.add2.lzout.out)]) - wf.cache_dir = tmpdir - - with 
Submitter(plugin=plugin) as sub: - sub(wf) - # assert wf.output_dir.exists() - results = wf.result() - assert results[0].output.out == 4 - assert results[1].output.out == 42 - # checking all directories - assert wf.output_dir - for odir in wf.output_dir: - assert odir.exists() - - -# workflows with structures A -> wf(B) - - -def test_wfasnd_ndst_3(plugin, tmpdir): - """workflow as the second node, - the main workflow has two tasks, - splitter for the first task - """ - wf = Workflow(name="wf_st_3", input_spec=["x", "y"]) - wf.add(multiply(name="mult").split(("x", "y"), x=wf.lzin.x, y=wf.lzin.y)) - wf.inputs.x = [2, 4] - wf.inputs.y = [1, 10] - - wfnd = Workflow(name="wfnd", input_spec=["x"], x=wf.mult.lzout.out) - wfnd.add(add2(name="add2", x=wfnd.lzin.x)) - wfnd.set_output([("out", wfnd.add2.lzout.out)]) - wf.add(wfnd) - - wf.set_output([("out", wf.wfnd.lzout.out)]) - wf.cache_dir = tmpdir - - with Submitter(plugin="serial") as sub: - sub(wf) - # assert wf.output_dir.exists() - results = wf.result() - assert results.output.out == [4, 42] - # checking the output directory - assert wf.output_dir.exists() - - -def test_wfasnd_wfst_3(plugin, tmpdir): - """workflow as the second node, - the main workflow has two tasks, - splitter for the main workflow - """ - wf = Workflow(name="wf_st_3", input_spec=["x", "y"]) - wf.add(multiply(name="mult", x=wf.lzin.x, y=wf.lzin.y)) - wf.split(("x", "y"), x=[2, 4], y=[1, 10]) - - wfnd = Workflow(name="wfnd", input_spec=["x"], x=wf.mult.lzout.out) - wfnd.add(add2(name="add2", x=wfnd.lzin.x)) - wfnd.set_output([("out", wfnd.add2.lzout.out)]) - wf.add(wfnd) - - wf.set_output([("out", wf.wfnd.lzout.out)]) - wf.plugin = plugin - wf.cache_dir = tmpdir - - with Submitter(plugin=plugin) as sub: - sub(wf) - # assert wf.output_dir.exists() - results = wf.result() - assert results[0].output.out == 4 - assert results[1].output.out == 42 - # checking all directories - assert wf.output_dir - for odir in wf.output_dir: - assert odir.exists() - - -# workflows with structures wfns(A->B) - - -def test_wfasnd_4(plugin, tmpdir): - """workflow as a node - workflow-node with two tasks and no splitter - """ - wfnd = Workflow(name="wfnd", input_spec=["x"]) - wfnd.add(add2(name="add2_1st", x=wfnd.lzin.x)) - wfnd.add(add2(name="add2_2nd", x=wfnd.add2_1st.lzout.out)) - wfnd.set_output([("out", wfnd.add2_2nd.lzout.out)]) - wfnd.inputs.x = 2 - - wf = Workflow(name="wf", input_spec=["x"]) - wf.add(wfnd) - wf.set_output([("out", wf.wfnd.lzout.out)]) - wf.cache_dir = tmpdir - - with Submitter(plugin=plugin) as sub: - sub(wf) - - results = wf.result() - assert results.output.out == 6 - # checking the output directory - assert wf.output_dir.exists() - - -def test_wfasnd_ndst_4(plugin, tmpdir): - """workflow as a node - workflow-node with two tasks, - splitter for node - """ - wfnd = Workflow(name="wfnd", input_spec=["x"]) - wfnd.add(add2(name="add2_1st").split("x", x=wfnd.lzin.x)) - wfnd.add(add2(name="add2_2nd", x=wfnd.add2_1st.lzout.out)) - wfnd.set_output([("out", wfnd.add2_2nd.lzout.out)]) - wfnd.inputs.x = [2, 4] - - wf = Workflow(name="wf", input_spec=["x"]) - wf.add(wfnd) - wf.set_output([("out", wf.wfnd.lzout.out)]) - wf.cache_dir = tmpdir - - with Submitter(plugin=plugin) as sub: - sub(wf) - - results = wf.result() - assert results.output.out == [6, 8] - # checking the output directory - assert wf.output_dir.exists() - - -def test_wfasnd_wfst_4(plugin, tmpdir): - """workflow as a node - workflow-node with two tasks, - splitter for the main workflow - """ - wf = Workflow(name="wf", 
input_spec=["x"], cache_dir=tmpdir) - wfnd = Workflow(name="wfnd", input_spec=["x"], x=wf.lzin.x) - wfnd.add(add2(name="add2_1st", x=wfnd.lzin.x)) - wfnd.add(add2(name="add2_2nd", x=wfnd.add2_1st.lzout.out)) - wfnd.set_output([("out", wfnd.add2_2nd.lzout.out)]) - - wf.add(wfnd) - wf.split("x", x=[2, 4]) - wf.set_output([("out", wf.wfnd.lzout.out)]) - - with Submitter(plugin=plugin) as sub: - sub(wf) - # assert wf.output_dir.exists() - results = wf.result() - assert results[0].output.out == 6 - assert results[1].output.out == 8 - # checking all directories - assert wf.output_dir - for odir in wf.output_dir: - assert odir.exists() - - -# Testing caching - - -@pytest.mark.flaky(reruns=3) -def test_wf_nostate_cachedir(plugin, tmpdir): - """wf with provided cache_dir using pytest tmpdir""" - cache_dir = tmpdir.mkdir("test_wf_cache_1") - - wf = Workflow(name="wf_2", input_spec=["x", "y"], cache_dir=cache_dir) - wf.add(multiply(name="mult", x=wf.lzin.x, y=wf.lzin.y)) - wf.add(add2(name="add2", x=wf.mult.lzout.out)) - wf.set_output([("out", wf.add2.lzout.out)]) - wf.inputs.x = 2 - wf.inputs.y = 3 - - with Submitter(plugin=plugin) as sub: - sub(wf) - - assert wf.output_dir.exists() - results = wf.result() - assert 8 == results.output.out - - shutil.rmtree(cache_dir) - - -@pytest.mark.flaky(reruns=3) -def test_wf_nostate_cachedir_relativepath(tmpdir, plugin): - """wf with provided cache_dir as relative path""" - tmpdir.chdir() - cache_dir = "test_wf_cache_2" - tmpdir.mkdir(cache_dir) - - wf = Workflow(name="wf_2", input_spec=["x", "y"], cache_dir=cache_dir) - wf.add(multiply(name="mult", x=wf.lzin.x, y=wf.lzin.y)) - wf.add(add2(name="add2", x=wf.mult.lzout.out)) - wf.set_output([("out", wf.add2.lzout.out)]) - wf.inputs.x = 2 - wf.inputs.y = 3 - - with Submitter(plugin=plugin) as sub: - sub(wf) - - assert wf.output_dir.exists() - results = wf.result() - assert 8 == results.output.out - - shutil.rmtree(cache_dir) - - -@pytest.mark.flaky(reruns=3) -def test_wf_nostate_cachelocations(plugin, tmpdir): - """ - Two identical wfs with provided cache_dir; - the second wf has cache_locations and should not recompute the results - """ - cache_dir1 = tmpdir.mkdir("test_wf_cache3") - cache_dir2 = tmpdir.mkdir("test_wf_cache4") - - wf1 = Workflow(name="wf", input_spec=["x", "y"], cache_dir=cache_dir1) - wf1.add(multiply(name="mult", x=wf1.lzin.x, y=wf1.lzin.y)) - wf1.add(add2_wait(name="add2", x=wf1.mult.lzout.out)) - wf1.set_output([("out", wf1.add2.lzout.out)]) - wf1.inputs.x = 2 - wf1.inputs.y = 3 - - t0 = time.time() - with Submitter(plugin=plugin) as sub: - sub(wf1) - t1 = time.time() - t0 - - results1 = wf1.result() - assert 8 == results1.output.out - - wf2 = Workflow( - name="wf", - input_spec=["x", "y"], - cache_dir=cache_dir2, - cache_locations=cache_dir1, - ) - wf2.add(multiply(name="mult", x=wf2.lzin.x, y=wf2.lzin.y)) - wf2.add(add2_wait(name="add2", x=wf2.mult.lzout.out)) - wf2.set_output([("out", wf2.add2.lzout.out)]) - wf2.inputs.x = 2 - wf2.inputs.y = 3 - - t0 = time.time() - with Submitter(plugin=plugin) as sub: - sub(wf2) - t2 = time.time() - t0 - - results2 = wf2.result() - assert 8 == results2.output.out - - # checking execution time (for unix and cf) - # for win and dask/slurm the time for dir creation etc. 
might take much longer - if not sys.platform.startswith("win") and plugin == "cf": - assert t1 > 2 - assert t2 < max(1, t1 - 1) - - # checking if the second wf didn't run again - assert wf1.output_dir.exists() - assert not wf2.output_dir.exists() - - -@pytest.mark.flaky(reruns=3) -def test_wf_nostate_cachelocations_a(plugin, tmpdir): - """ - the same as previous test, but workflows names differ; - the task should not be run and it should be fast, - but the wf itself is triggered and the new output dir is created - """ - cache_dir1 = tmpdir.mkdir("test_wf_cache3") - cache_dir2 = tmpdir.mkdir("test_wf_cache4") - - wf1 = Workflow(name="wf1", input_spec=["x", "y"], cache_dir=cache_dir1) - wf1.add(multiply(name="mult", x=wf1.lzin.x, y=wf1.lzin.y)) - wf1.add(add2_wait(name="add2", x=wf1.mult.lzout.out)) - wf1.set_output([("out", wf1.add2.lzout.out)]) - wf1.inputs.x = 2 - wf1.inputs.y = 3 - wf1.plugin = plugin - - t0 = time.time() - with Submitter(plugin=plugin) as sub: - sub(wf1) - t1 = time.time() - t0 - - results1 = wf1.result() - assert 8 == results1.output.out - - wf2 = Workflow( - name="wf2", - input_spec=["x", "y"], - cache_dir=cache_dir2, - cache_locations=cache_dir1, - ) - wf2.add(multiply(name="mult", x=wf2.lzin.x, y=wf2.lzin.y)) - wf2.add(add2_wait(name="add2", x=wf2.mult.lzout.out)) - wf2.set_output([("out", wf2.add2.lzout.out)]) - wf2.inputs.x = 2 - wf2.inputs.y = 3 - wf2.plugin = plugin - - t0 = time.time() - with Submitter(plugin=plugin) as sub: - sub(wf2) - t2 = time.time() - t0 - - results2 = wf2.result() - assert 8 == results2.output.out - - # for win and dask/slurm the time for dir creation etc. might take much longer - if not sys.platform.startswith("win") and plugin == "cf": - # checking execution time (second one should be quick) - assert t1 > 2 - # testing relative values (windows or slurm takes much longer to create wf itself) - assert t2 < max(1, t1 - 1) - - # checking if both wf.output_dir are created - assert wf1.output_dir.exists() - assert wf2.output_dir.exists() - - -@pytest.mark.flaky(reruns=3) -def test_wf_nostate_cachelocations_b(plugin, tmpdir): - """ - the same as previous test, but the 2nd workflows has two outputs - (connected to the same task output); - the task should not be run and it should be fast, - but the wf itself is triggered and the new output dir is created - """ - cache_dir1 = tmpdir.mkdir("test_wf_cache3") - cache_dir2 = tmpdir.mkdir("test_wf_cache4") - - wf1 = Workflow(name="wf", input_spec=["x", "y"], cache_dir=cache_dir1) - wf1.add(multiply(name="mult", x=wf1.lzin.x, y=wf1.lzin.y)) - wf1.add(add2_wait(name="add2", x=wf1.mult.lzout.out)) - wf1.set_output([("out", wf1.add2.lzout.out)]) - wf1.inputs.x = 2 - wf1.inputs.y = 3 - wf1.plugin = plugin - - t0 = time.time() - with Submitter(plugin=plugin) as sub: - sub(wf1) - t1 = time.time() - t0 - - results1 = wf1.result() - assert 8 == results1.output.out - - wf2 = Workflow( - name="wf", - input_spec=["x", "y"], - cache_dir=cache_dir2, - cache_locations=cache_dir1, - ) - wf2.add(multiply(name="mult", x=wf2.lzin.x, y=wf2.lzin.y)) - wf2.add(add2_wait(name="add2", x=wf2.mult.lzout.out)) - wf2.set_output([("out", wf2.add2.lzout.out)]) - # additional output - wf2.set_output([("out_pr", wf2.add2.lzout.out)]) - wf2.inputs.x = 2 - wf2.inputs.y = 3 - wf2.plugin = plugin - - t0 = time.time() - with Submitter(plugin=plugin) as sub: - sub(wf2) - t2 = time.time() - t0 - - results2 = wf2.result() - assert 8 == results2.output.out == results2.output.out_pr - - # for win and dask/slurm the time for dir creation etc. 
might take much longer - if not sys.platform.startswith("win") and plugin == "cf": - # execution time for second run should be much shorter - assert t1 > 2 - assert t2 < max(1, t1 - 1) - - # checking if the second wf didn't run again - assert wf1.output_dir.exists() - assert wf2.output_dir.exists() - - -@pytest.mark.flaky(reruns=3) -def test_wf_nostate_cachelocations_setoutputchange(plugin, tmpdir): - """ - the same as previous test, but wf output names differ, - the tasks should not be run and it should be fast, - but the wf itself is triggered and the new output dir is created - (the second wf has updated name in its Output) - """ - cache_dir1 = tmpdir.mkdir("test_wf_cache3") - cache_dir2 = tmpdir.mkdir("test_wf_cache4") - - wf1 = Workflow(name="wf", input_spec=["x", "y"], cache_dir=cache_dir1) - wf1.add(multiply(name="mult", x=wf1.lzin.x, y=wf1.lzin.y)) - wf1.add(add2_wait(name="add2", x=wf1.mult.lzout.out)) - wf1.set_output([("out1", wf1.add2.lzout.out)]) - wf1.inputs.x = 2 - wf1.inputs.y = 3 - wf1.plugin = plugin - - t0 = time.time() - with Submitter(plugin=plugin) as sub: - sub(wf1) - t1 = time.time() - t0 - - results1 = wf1.result() - assert 8 == results1.output.out1 - - wf2 = Workflow( - name="wf", - input_spec=["x", "y"], - cache_dir=cache_dir2, - cache_locations=cache_dir1, - ) - wf2.add(multiply(name="mult", x=wf2.lzin.x, y=wf2.lzin.y)) - wf2.add(add2_wait(name="add2", x=wf2.mult.lzout.out)) - wf2.set_output([("out2", wf2.add2.lzout.out)]) - wf2.inputs.x = 2 - wf2.inputs.y = 3 - wf2.plugin = plugin - - t0 = time.time() - with Submitter(plugin=plugin) as sub: - sub(wf2) - t2 = time.time() - t0 - - results2 = wf2.result() - assert 8 == results2.output.out2 - - # for win and dask/slurm the time for dir creation etc. might take much longer - if not sys.platform.startswith("win") and plugin == "cf": - # checking execution time (the second wf should be fast, nodes do not have to rerun) - assert t1 > 2 - # testing relative values (windows or slurm takes much longer to create wf itself) - assert t2 < max(1, t1 - 1) - - # both wf output_dirs should be created - assert wf1.output_dir.exists() - assert wf2.output_dir.exists() - - -@pytest.mark.flaky(reruns=3) -def test_wf_nostate_cachelocations_setoutputchange_a(plugin, tmpdir): - """ - the same as previous test, but wf names and output names differ, - """ - cache_dir1 = tmpdir.mkdir("test_wf_cache3") - cache_dir2 = tmpdir.mkdir("test_wf_cache4") - - wf1 = Workflow(name="wf1", input_spec=["x", "y"], cache_dir=cache_dir1) - wf1.add(multiply(name="mult", x=wf1.lzin.x, y=wf1.lzin.y)) - wf1.add(add2_wait(name="add2", x=wf1.mult.lzout.out)) - wf1.set_output([("out1", wf1.add2.lzout.out)]) - wf1.inputs.x = 2 - wf1.inputs.y = 3 - wf1.plugin = plugin - - t0 = time.time() - with Submitter(plugin=plugin) as sub: - sub(wf1) - t1 = time.time() - t0 - - results1 = wf1.result() - assert 8 == results1.output.out1 - - wf2 = Workflow( - name="wf2", - input_spec=["x", "y"], - cache_dir=cache_dir2, - cache_locations=cache_dir1, - ) - wf2.add(multiply(name="mult", x=wf2.lzin.x, y=wf2.lzin.y)) - wf2.add(add2_wait(name="add2", x=wf2.mult.lzout.out)) - wf2.set_output([("out2", wf2.add2.lzout.out)]) - wf2.inputs.x = 2 - wf2.inputs.y = 3 - wf2.plugin = plugin - - t0 = time.time() - with Submitter(plugin=plugin) as sub: - sub(wf2) - t2 = time.time() - t0 - - results2 = wf2.result() - assert 8 == results2.output.out2 - - # for win and dask/slurm the time for dir creation etc. 
might take much longer - if not sys.platform.startswith("win") and plugin == "cf": - assert t1 > 2 - # testing relative values (windows or slurm takes much longer to create wf itself) - assert t2 < max(1, t1 - 1) - - # both wf output_dirs should be created - assert wf1.output_dir.exists() - assert wf2.output_dir.exists() - - -@pytest.mark.flaky(reruns=3) -def test_wf_nostate_cachelocations_forcererun(plugin, tmpdir): - """ - Two identical wfs with provided cache_dir; - the second wf has cache_locations, - but submitter is called with rerun=True, so should recompute - """ - cache_dir1 = tmpdir.mkdir("test_wf_cache3") - cache_dir2 = tmpdir.mkdir("test_wf_cache4") - - wf1 = Workflow(name="wf", input_spec=["x", "y"], cache_dir=cache_dir1) - wf1.add(multiply(name="mult", x=wf1.lzin.x, y=wf1.lzin.y)) - wf1.add(add2_wait(name="add2", x=wf1.mult.lzout.out)) - wf1.set_output([("out", wf1.add2.lzout.out)]) - wf1.inputs.x = 2 - wf1.inputs.y = 3 - wf1.plugin = plugin - - t0 = time.time() - with Submitter(plugin=plugin) as sub: - sub(wf1) - t1 = time.time() - t0 - - results1 = wf1.result() - assert 8 == results1.output.out - - wf2 = Workflow( - name="wf", - input_spec=["x", "y"], - cache_dir=cache_dir2, - cache_locations=cache_dir1, - ) - wf2.add(multiply(name="mult", x=wf2.lzin.x, y=wf2.lzin.y)) - wf2.add(add2_wait(name="add2", x=wf2.mult.lzout.out)) - wf2.set_output([("out", wf2.add2.lzout.out)]) - wf2.inputs.x = 2 - wf2.inputs.y = 3 - wf2.plugin = plugin - - t0 = time.time() - with Submitter(plugin=plugin) as sub: - sub(wf2, rerun=True) - t2 = time.time() - t0 - - results2 = wf2.result() - assert 8 == results2.output.out - - # for win and dask/slurm the time for dir creation etc. might take much longer - if not sys.platform.startswith("win") and plugin == "cf": - # checking execution time - assert t1 > 2 - assert t2 > 2 - - # checking if the second wf didn't run again - assert wf1.output_dir.exists() - assert wf2.output_dir.exists() - - -@pytest.mark.flaky(reruns=3) -def test_wf_nostate_cachelocations_wftaskrerun_propagateTrue(plugin, tmpdir): - """ - Two identical wfs with provided cache_dir and cache_locations for the second one; - submitter doesn't have rerun, but the second wf has rerun=True, - propagate_rerun is True as default, so everything should be rerun - """ - cache_dir1 = tmpdir.mkdir("test_wf_cache3") - cache_dir2 = tmpdir.mkdir("test_wf_cache4") - - wf1 = Workflow(name="wf", input_spec=["x", "y"], cache_dir=cache_dir1) - wf1.add(multiply(name="mult", x=wf1.lzin.x, y=wf1.lzin.y)) - wf1.add(add2_wait(name="add2", x=wf1.mult.lzout.out)) - wf1.set_output([("out", wf1.add2.lzout.out)]) - wf1.inputs.x = 2 - wf1.inputs.y = 3 - wf1.plugin = plugin - - t0 = time.time() - with Submitter(plugin=plugin) as sub: - sub(wf1) - t1 = time.time() - t0 - - results1 = wf1.result() - assert 8 == results1.output.out - - wf2 = Workflow( - name="wf", - input_spec=["x", "y"], - cache_dir=cache_dir2, - cache_locations=cache_dir1, - rerun=True, # wh has to be rerun (default for propagate_rerun is True) - ) - wf2.add(multiply(name="mult", x=wf2.lzin.x, y=wf2.lzin.y)) - wf2.add(add2_wait(name="add2", x=wf2.mult.lzout.out)) - wf2.set_output([("out", wf2.add2.lzout.out)]) - wf2.inputs.x = 2 - wf2.inputs.y = 3 - wf2.plugin = plugin - - t0 = time.time() - with Submitter(plugin=plugin) as sub: - sub(wf2) - t2 = time.time() - t0 - - results2 = wf2.result() - assert 8 == results2.output.out - - # checking if the second wf runs again - assert wf1.output_dir.exists() - assert wf2.output_dir.exists() - - # everything has to 
be recomputed - assert len(list(Path(cache_dir1).glob("F*"))) == 2 - assert len(list(Path(cache_dir2).glob("F*"))) == 2 - - # for win and dask/slurm the time for dir creation etc. might take much longer - if not sys.platform.startswith("win") and plugin == "cf": - # runtime for recomputed workflows should be about the same - assert abs(t1 - t2) < t1 / 2 - - -@pytest.mark.flaky(reruns=3) -def test_wf_nostate_cachelocations_wftaskrerun_propagateFalse(plugin, tmpdir): - """ - Two identical wfs with provided cache_dir and cache_locations for the second one; - submitter doesn't have rerun, but the second wf has rerun=True, - propagate_rerun is set to False, so wf will be triggered, - but tasks will not have rerun, so will use the previous results - """ - cache_dir1 = tmpdir.mkdir("test_wf_cache3") - cache_dir2 = tmpdir.mkdir("test_wf_cache4") - - wf1 = Workflow(name="wf", input_spec=["x", "y"], cache_dir=cache_dir1) - wf1.add(multiply(name="mult", x=wf1.lzin.x, y=wf1.lzin.y)) - wf1.add(add2_wait(name="add2", x=wf1.mult.lzout.out)) - wf1.set_output([("out", wf1.add2.lzout.out)]) - wf1.inputs.x = 2 - wf1.inputs.y = 3 - wf1.plugin = plugin - - t0 = time.time() - with Submitter(plugin=plugin) as sub: - sub(wf1) - t1 = time.time() - t0 - - results1 = wf1.result() - assert 8 == results1.output.out - - wf2 = Workflow( - name="wf", - input_spec=["x", "y"], - cache_dir=cache_dir2, - cache_locations=cache_dir1, - rerun=True, # wh has to be rerun - propagate_rerun=False, # but rerun doesn't propagate to the tasks - ) - wf2.add(multiply(name="mult", x=wf2.lzin.x, y=wf2.lzin.y)) - wf2.add(add2_wait(name="add2", x=wf2.mult.lzout.out)) - wf2.set_output([("out", wf2.add2.lzout.out)]) - wf2.inputs.x = 2 - wf2.inputs.y = 3 - wf2.plugin = plugin - - t0 = time.time() - with Submitter(plugin=plugin) as sub: - sub(wf2) - t2 = time.time() - t0 - - results2 = wf2.result() - assert 8 == results2.output.out - - # checking if the second wf runs again - assert wf1.output_dir.exists() - assert wf2.output_dir.exists() - - # for win and dask/slurm the time for dir creation etc. 
might take much longer - if not sys.platform.startswith("win") and plugin == "cf": - # checking the time - assert t1 > 2 - assert t2 < max(1, t1 - 1) - - # tasks should not be recomputed - assert len(list(Path(cache_dir1).glob("F*"))) == 2 - assert len(list(Path(cache_dir2).glob("F*"))) == 0 - - -@pytest.mark.flaky(reruns=3) -def test_wf_nostate_cachelocations_taskrerun_wfrerun_propagateFalse(plugin, tmpdir): - """ - Two identical wfs with provided cache_dir, and cache_locations for the second wf; - submitter doesn't have rerun, but wf has rerun=True, - since propagate_rerun=False, only tasks that have rerun=True will be rerun - """ - cache_dir1 = tmpdir.mkdir("test_wf_cache3") - cache_dir2 = tmpdir.mkdir("test_wf_cache4") - - wf1 = Workflow(name="wf", input_spec=["x", "y"], cache_dir=cache_dir1) - wf1.add(multiply(name="mult", x=wf1.lzin.x, y=wf1.lzin.y)) - wf1.add(add2_wait(name="add2", x=wf1.mult.lzout.out)) - wf1.set_output([("out", wf1.add2.lzout.out)]) - wf1.inputs.x = 2 - wf1.inputs.y = 3 - wf1.plugin = plugin - - t0 = time.time() - with Submitter(plugin=plugin) as sub: - sub(wf1) - t1 = time.time() - t0 - - results1 = wf1.result() - assert 8 == results1.output.out - - wf2 = Workflow( - name="wf", - input_spec=["x", "y"], - cache_dir=cache_dir2, - cache_locations=cache_dir1, - rerun=True, - propagate_rerun=False, # rerun will not be propagated to each task - ) - wf2.add(multiply(name="mult", x=wf2.lzin.x, y=wf2.lzin.y)) - # rerun on the task level needed (wf.propagate_rerun is False) - wf2.add(add2_wait(name="add2", x=wf2.mult.lzout.out, rerun=True)) - wf2.set_output([("out", wf2.add2.lzout.out)]) - wf2.inputs.x = 2 - wf2.inputs.y = 3 - wf2.plugin = plugin - - t0 = time.time() - with Submitter(plugin=plugin) as sub: - sub(wf2) - t2 = time.time() - t0 - - results2 = wf2.result() - assert 8 == results2.output.out - - assert wf1.output_dir.exists() - assert wf2.output_dir.exists() - # the second task should be recomputed - assert len(list(Path(cache_dir1).glob("F*"))) == 2 - assert len(list(Path(cache_dir2).glob("F*"))) == 1 - - # for win and dask/slurm the time for dir creation etc. 
might take much longer - if not sys.platform.startswith("win") and plugin == "cf": - # checking the execution time - assert t1 > 2 - assert t2 > 2 - - -@pytest.mark.flaky(reruns=3) -def test_wf_nostate_nodecachelocations(plugin, tmpdir): - """ - Two wfs with different input, but the second node has the same input; - the second wf has cache_locations and should recompute the wf, - but without recomputing the second node - """ - cache_dir1 = tmpdir.mkdir("test_wf_cache3") - cache_dir2 = tmpdir.mkdir("test_wf_cache4") - - wf1 = Workflow(name="wf", input_spec=["x"], cache_dir=cache_dir1) - wf1.add(ten(name="ten", x=wf1.lzin.x)) - wf1.add(add2(name="add2", x=wf1.ten.lzout.out)) - wf1.set_output([("out", wf1.add2.lzout.out)]) - wf1.inputs.x = 3 - wf1.plugin = plugin - - with Submitter(plugin=plugin) as sub: - sub(wf1) - - results1 = wf1.result() - assert 12 == results1.output.out - - wf2 = Workflow( - name="wf", - input_spec=["x", "y"], - cache_dir=cache_dir2, - cache_locations=cache_dir1, - ) - wf2.add(ten(name="ten", x=wf2.lzin.x)) - wf2.add(add2(name="add2", x=wf2.ten.lzout.out)) - wf2.set_output([("out", wf2.add2.lzout.out)]) - wf2.inputs.x = 2 - wf2.plugin = plugin - - with Submitter(plugin=plugin) as sub: - sub(wf2) - - results2 = wf2.result() - assert 12 == results2.output.out - - # checking if the second wf runs again, but runs only one task - assert wf1.output_dir.exists() - assert wf2.output_dir.exists() - # the second wf should rerun one task - assert len(list(Path(cache_dir1).glob("F*"))) == 2 - assert len(list(Path(cache_dir2).glob("F*"))) == 1 - - -@pytest.mark.flaky(reruns=3) -def test_wf_nostate_nodecachelocations_upd(plugin, tmpdir): - """ - Two wfs with different input, but the second node has the same input; - the second wf has cache_locations (set after adding tasks) and should recompute, - but without recomputing the second node - """ - cache_dir1 = tmpdir.mkdir("test_wf_cache3") - cache_dir2 = tmpdir.mkdir("test_wf_cache4") - - wf1 = Workflow(name="wf", input_spec=["x"], cache_dir=cache_dir1) - wf1.add(ten(name="ten", x=wf1.lzin.x)) - wf1.add(add2(name="add2", x=wf1.ten.lzout.out)) - wf1.set_output([("out", wf1.add2.lzout.out)]) - wf1.inputs.x = 3 - wf1.plugin = plugin - - with Submitter(plugin=plugin) as sub: - sub(wf1) - - results1 = wf1.result() - assert 12 == results1.output.out - - wf2 = Workflow(name="wf", input_spec=["x", "y"], cache_dir=cache_dir2) - wf2.add(ten(name="ten", x=wf2.lzin.x)) - wf2.add(add2(name="add2", x=wf2.ten.lzout.out)) - wf2.set_output([("out", wf2.add2.lzout.out)]) - wf2.inputs.x = 2 - wf2.plugin = plugin - # updating cache_locations after adding the tasks - wf2.cache_locations = cache_dir1 - - with Submitter(plugin=plugin) as sub: - sub(wf2) - - results2 = wf2.result() - assert 12 == results2.output.out - - # checking if the second wf runs again, but runs only one task - assert wf1.output_dir.exists() - assert wf2.output_dir.exists() - # the second wf should have only one task run - assert len(list(Path(cache_dir1).glob("F*"))) == 2 - assert len(list(Path(cache_dir2).glob("F*"))) == 1 - - -@pytest.mark.flaky(reruns=3) -def test_wf_state_cachelocations(plugin, tmpdir): - """ - Two identical wfs (with states) with provided cache_dir; - the second wf has cache_locations and should not recompute the results - """ - cache_dir1 = tmpdir.mkdir("test_wf_cache3") - cache_dir2 = tmpdir.mkdir("test_wf_cache4") - - wf1 = Workflow(name="wf", input_spec=["x", "y"], cache_dir=cache_dir1) - wf1.add(multiply(name="mult", x=wf1.lzin.x, y=wf1.lzin.y)) - 
wf1.add(add2_wait(name="add2", x=wf1.mult.lzout.out)) - wf1.set_output([("out", wf1.add2.lzout.out)]) - wf1.split(splitter=("x", "y"), x=[2, 20], y=[3, 4]) - wf1.plugin = plugin - - t0 = time.time() - with Submitter(plugin=plugin) as sub: - sub(wf1) - t1 = time.time() - t0 - - results1 = wf1.result() - assert results1[0].output.out == 8 - assert results1[1].output.out == 82 - - wf2 = Workflow( - name="wf", - input_spec=["x", "y"], - cache_dir=cache_dir2, - cache_locations=cache_dir1, - ) - wf2.add(multiply(name="mult", x=wf2.lzin.x, y=wf2.lzin.y)) - wf2.add(add2_wait(name="add2", x=wf2.mult.lzout.out)) - wf2.set_output([("out", wf2.add2.lzout.out)]) - wf2.split(splitter=("x", "y"), x=[2, 20], y=[3, 4]) - wf2.plugin = plugin - - t0 = time.time() - with Submitter(plugin=plugin) as sub: - sub(wf2) - t2 = time.time() - t0 - - results2 = wf2.result() - assert results2[0].output.out == 8 - assert results2[1].output.out == 82 - - # for win and dask/slurm the time for dir creation etc. might take much longer - if not sys.platform.startswith("win") and plugin == "cf": - # checking the execution time - assert t1 > 2 - assert t2 < max(1, t1 - 1) - - # checking all directories - assert wf1.output_dir - for odir in wf1.output_dir: - assert odir.exists() - # checking if the second wf didn't run again - # checking all directories - assert wf2.output_dir - for odir in wf2.output_dir: - assert not odir.exists() - - -@pytest.mark.flaky(reruns=3) -def test_wf_state_cachelocations_forcererun(plugin, tmpdir): - """ - Two identical wfs (with states) with provided cache_dir; - the second wf has cache_locations, - but submitter is called with rerun=True, so should recompute - """ - cache_dir1 = tmpdir.mkdir("test_wf_cache3") - cache_dir2 = tmpdir.mkdir("test_wf_cache4") - - wf1 = Workflow(name="wf", input_spec=["x", "y"], cache_dir=cache_dir1) - wf1.add(multiply(name="mult", x=wf1.lzin.x, y=wf1.lzin.y)) - wf1.add(add2_wait(name="add2", x=wf1.mult.lzout.out)) - wf1.set_output([("out", wf1.add2.lzout.out)]) - wf1.split(splitter=("x", "y"), x=[2, 20], y=[3, 4]) - wf1.plugin = plugin - - t0 = time.time() - with Submitter(plugin=plugin) as sub: - sub(wf1) - t1 = time.time() - t0 - - results1 = wf1.result() - assert results1[0].output.out == 8 - assert results1[1].output.out == 82 - - wf2 = Workflow( - name="wf", - input_spec=["x", "y"], - cache_dir=cache_dir2, - cache_locations=cache_dir1, - ) - wf2.add(multiply(name="mult", x=wf2.lzin.x, y=wf2.lzin.y)) - wf2.add(add2_wait(name="add2", x=wf2.mult.lzout.out)) - wf2.set_output([("out", wf2.add2.lzout.out)]) - wf2.split(splitter=("x", "y"), x=[2, 20], y=[3, 4]) - wf2.plugin = plugin - - t0 = time.time() - with Submitter(plugin=plugin) as sub: - sub(wf2, rerun=True) - t2 = time.time() - t0 - - results2 = wf2.result() - assert results2[0].output.out == 8 - assert results2[1].output.out == 82 - - # for win and dask/slurm the time for dir creation etc. 
might take much longer - if not sys.platform.startswith("win") and plugin == "cf": - # checking the execution time - assert t1 > 2 - assert t2 > 2 - - # checking all directories - assert wf1.output_dir - for odir in wf1.output_dir: - assert odir.exists() - # checking if the second wf run again - # checking all directories - assert wf2.output_dir - for odir in wf2.output_dir: - assert odir.exists() - - -@pytest.mark.flaky(reruns=3) -def test_wf_state_cachelocations_updateinp(plugin, tmpdir): - """ - Two identical wfs (with states) with provided cache_dir; - the second wf has cache_locations and should not recompute the results - (the lazy input of the node is updated to the correct one, - i.e. the same as in wf1, after adding the node to the wf) - """ - cache_dir1 = tmpdir.mkdir("test_wf_cache3") - cache_dir2 = tmpdir.mkdir("test_wf_cache4") - - wf1 = Workflow(name="wf", input_spec=["x", "y"], cache_dir=cache_dir1) - wf1.add(multiply(name="mult", x=wf1.lzin.x, y=wf1.lzin.y)) - wf1.add(add2_wait(name="add2", x=wf1.mult.lzout.out)) - wf1.set_output([("out", wf1.add2.lzout.out)]) - wf1.split(splitter=("x", "y"), x=[2, 20], y=[3, 4]) - wf1.plugin = plugin - - t0 = time.time() - with Submitter(plugin=plugin) as sub: - sub(wf1) - t1 = time.time() - t0 - - results1 = wf1.result() - assert results1[0].output.out == 8 - assert results1[1].output.out == 82 - - wf2 = Workflow( - name="wf", - input_spec=["x", "y"], - cache_dir=cache_dir2, - cache_locations=cache_dir1, - ) - wf2.add(multiply(name="mult", x=wf2.lzin.x, y=wf2.lzin.y)) - wf2.add(add2_wait(name="add2", x=wf2.mult.lzout.out)) - wf2.set_output([("out", wf2.add2.lzout.out)]) - wf2.split(splitter=("x", "y"), x=[2, 20], y=[3, 4]) - wf2.plugin = plugin - wf2.mult.inputs.y = wf2.lzin.y - - t0 = time.time() - with Submitter(plugin=plugin) as sub: - sub(wf2) - t2 = time.time() - t0 - - results2 = wf2.result() - assert results2[0].output.out == 8 - assert results2[1].output.out == 82 - - # for win and dask/slurm the time for dir creation etc. 
might take much longer - if not sys.platform.startswith("win") and plugin == "cf": - # checking the execution time - assert t1 > 2 - assert t2 < max(1, t1 - 1) - - # checking all directories - assert wf1.output_dir - for odir in wf1.output_dir: - assert odir.exists() - # checking if the second wf didn't run again - # checking all directories - assert wf2.output_dir - for odir in wf2.output_dir: - assert not odir.exists() - - -@pytest.mark.flaky(reruns=3) -def test_wf_state_n_nostate_cachelocations(plugin, tmpdir): - """ - Two wfs with provided cache_dir, the first one has no state, the second has; - the second wf has cache_locations and should not recompute only one element - """ - cache_dir1 = tmpdir.mkdir("test_wf_cache3") - cache_dir2 = tmpdir.mkdir("test_wf_cache4") - - wf1 = Workflow(name="wf", input_spec=["x", "y"], cache_dir=cache_dir1) - wf1.add(multiply(name="mult", x=wf1.lzin.x, y=wf1.lzin.y)) - wf1.add(add2_wait(name="add2", x=wf1.mult.lzout.out)) - wf1.set_output([("out", wf1.add2.lzout.out)]) - wf1.inputs.x = 2 - wf1.inputs.y = 3 - wf1.plugin = plugin - - with Submitter(plugin=plugin) as sub: - sub(wf1) - - results1 = wf1.result() - assert results1.output.out == 8 - - wf2 = Workflow( - name="wf", - input_spec=["x", "y"], - cache_dir=cache_dir2, - cache_locations=cache_dir1, - ) - wf2.add(multiply(name="mult", x=wf2.lzin.x, y=wf2.lzin.y)) - wf2.add(add2_wait(name="add2", x=wf2.mult.lzout.out)) - wf2.set_output([("out", wf2.add2.lzout.out)]) - wf2.split(splitter=("x", "y"), x=[2, 20], y=[3, 4]) - wf2.plugin = plugin - - with Submitter(plugin=plugin) as sub: - sub(wf2) - - results2 = wf2.result() - assert results2[0].output.out == 8 - assert results2[1].output.out == 82 - - # checking the directory from the first wf - assert wf1.output_dir.exists() - # checking directories from the second wf, only second element should be recomputed - assert not wf2.output_dir[0].exists() - assert wf2.output_dir[1].exists() - - -def test_wf_nostate_cachelocations_updated(plugin, tmpdir): - """ - Two identical wfs with provided cache_dir; - the second wf has cache_locations in init, - that is later overwritten in Submitter.__call__; - the cache_locations from call doesn't exist so the second task should run again - """ - cache_dir1 = tmpdir.mkdir("test_wf_cache3") - cache_dir1_empty = tmpdir.mkdir("test_wf_cache3_empty") - cache_dir2 = tmpdir.mkdir("test_wf_cache4") - - wf1 = Workflow(name="wf", input_spec=["x", "y"], cache_dir=cache_dir1) - wf1.add(multiply(name="mult", x=wf1.lzin.x, y=wf1.lzin.y)) - wf1.add(add2_wait(name="add2", x=wf1.mult.lzout.out)) - wf1.set_output([("out", wf1.add2.lzout.out)]) - wf1.inputs.x = 2 - wf1.inputs.y = 3 - wf1.plugin = plugin - - t0 = time.time() - with Submitter(plugin=plugin) as sub: - sub(wf1) - t1 = time.time() - t0 - - results1 = wf1.result() - assert 8 == results1.output.out - - wf2 = Workflow( - name="wf", - input_spec=["x", "y"], - cache_dir=cache_dir2, - cache_locations=cache_dir1, - ) - wf2.add(multiply(name="mult", x=wf2.lzin.x, y=wf2.lzin.y)) - wf2.add(add2_wait(name="add2", x=wf2.mult.lzout.out)) - wf2.set_output([("out", wf2.add2.lzout.out)]) - wf2.inputs.x = 2 - wf2.inputs.y = 3 - wf2.plugin = plugin - - t0 = time.time() - # changing cache_locations to non-existing dir - with Submitter(plugin=plugin) as sub: - sub(wf2, cache_locations=cache_dir1_empty) - t2 = time.time() - t0 - - results2 = wf2.result() - assert 8 == results2.output.out - - # for win and dask/slurm the time for dir creation etc. 
might take much longer - if not sys.platform.startswith("win") and plugin == "cf": - # checking the execution time - assert t1 > 2 - assert t2 > 2 - - # checking if both wf run - assert wf1.output_dir.exists() - assert wf2.output_dir.exists() - - -@pytest.mark.flaky(reruns=3) -def test_wf_nostate_cachelocations_recompute(plugin, tmpdir): - """ - Two wfs with the same inputs but slightly different graph; - the second wf should recompute the results, - but the second node should use the results from the first wf (has the same input) - """ - cache_dir1 = tmpdir.mkdir("test_wf_cache3") - cache_dir2 = tmpdir.mkdir("test_wf_cache4") - - wf1 = Workflow(name="wf", input_spec=["x", "y"], cache_dir=cache_dir1) - wf1.add(multiply(name="mult", x=wf1.lzin.x, y=wf1.lzin.y)) - wf1.add(add2(name="add2", x=wf1.mult.lzout.out)) - wf1.set_output([("out", wf1.add2.lzout.out)]) - wf1.inputs.x = 2 - wf1.inputs.y = 3 - wf1.plugin = plugin - - with Submitter(plugin=plugin) as sub: - sub(wf1) - - results1 = wf1.result() - assert 8 == results1.output.out - - wf2 = Workflow( - name="wf", - input_spec=["x", "y"], - cache_dir=cache_dir2, - cache_locations=cache_dir1, - ) - # different argument assignment - wf2.add(multiply(name="mult", x=wf2.lzin.y, y=wf2.lzin.x)) - wf2.add(add2(name="add2", x=wf2.mult.lzout.out)) - wf2.set_output([("out", wf2.add2.lzout.out)]) - wf2.inputs.x = 2 - wf2.inputs.y = 3 - wf2.plugin = plugin - - with Submitter(plugin=plugin) as sub: - sub(wf2) - - results2 = wf2.result() - assert 8 == results2.output.out - - # checking if both dir exists - assert wf1.output_dir.exists() - assert wf2.output_dir.exists() - - # the second wf should have only one task run - assert len(list(Path(cache_dir1).glob("F*"))) == 2 - assert len(list(Path(cache_dir2).glob("F*"))) == 1 - - -@pytest.mark.flaky(reruns=3) -def test_wf_ndstate_cachelocations(plugin, tmpdir): - """ - Two wfs with identical inputs and node states; - the second wf has cache_locations and should not recompute the results - """ - cache_dir1 = tmpdir.mkdir("test_wf_cache3") - cache_dir2 = tmpdir.mkdir("test_wf_cache4") - - wf1 = Workflow(name="wf", input_spec=["x", "y"], cache_dir=cache_dir1) - wf1.add( - multiply(name="mult").split(splitter=("x", "y"), x=wf1.lzin.x, y=wf1.lzin.y) - ) - wf1.add(add2_wait(name="add2", x=wf1.mult.lzout.out)) - wf1.set_output([("out", wf1.add2.lzout.out)]) - wf1.inputs.x = [2, 20] - wf1.inputs.y = [3, 4] - wf1.plugin = plugin - - t0 = time.time() - with Submitter(plugin=plugin) as sub: - sub(wf1) - t1 = time.time() - t0 - - results1 = wf1.result() - assert results1.output.out == [8, 82] - - wf2 = Workflow( - name="wf", - input_spec=["x", "y"], - cache_dir=cache_dir2, - cache_locations=cache_dir1, - ) - wf2.add( - multiply(name="mult").split(splitter=("x", "y"), x=wf2.lzin.x, y=wf2.lzin.y) - ) - wf2.add(add2_wait(name="add2", x=wf2.mult.lzout.out)) - wf2.set_output([("out", wf2.add2.lzout.out)]) - wf2.inputs.x = [2, 20] - wf2.inputs.y = [3, 4] - wf2.plugin = plugin - - t0 = time.time() - with Submitter(plugin=plugin) as sub: - sub(wf2) - t2 = time.time() - t0 - - results2 = wf2.result() - assert results2.output.out == [8, 82] - - # for win and dask/slurm the time for dir creation etc. 
might take much longer - if not sys.platform.startswith("win") and plugin == "cf": - # checking the execution time - assert t1 > 2 - assert t2 < max(1, t1 - 1) - - # checking all directories - assert wf1.output_dir.exists() - - # checking if the second wf didn't run again - # checking all directories - assert not wf2.output_dir.exists() - - -@pytest.mark.flaky(reruns=3) -def test_wf_ndstate_cachelocations_forcererun(plugin, tmpdir): - """ - Two wfs with identical inputs and node states; - the second wf has cache_locations, - but submitter is called with rerun=True, so should recompute - """ - cache_dir1 = tmpdir.mkdir("test_wf_cache3") - cache_dir2 = tmpdir.mkdir("test_wf_cache4") - - wf1 = Workflow(name="wf", input_spec=["x", "y"], cache_dir=cache_dir1) - wf1.add( - multiply(name="mult").split(splitter=("x", "y"), x=wf1.lzin.x, y=wf1.lzin.y) - ) - wf1.add(add2_wait(name="add2", x=wf1.mult.lzout.out)) - wf1.set_output([("out", wf1.add2.lzout.out)]) - wf1.inputs.x = [2, 20] - wf1.inputs.y = [3, 4] - wf1.plugin = plugin - - t0 = time.time() - with Submitter(plugin=plugin) as sub: - sub(wf1) - t1 = time.time() - t0 - - results1 = wf1.result() - assert results1.output.out == [8, 82] - - wf2 = Workflow( - name="wf", - input_spec=["x", "y"], - cache_dir=cache_dir2, - cache_locations=cache_dir1, - ) - wf2.add( - multiply(name="mult").split(splitter=("x", "y"), x=wf2.lzin.x, y=wf2.lzin.y) - ) - wf2.add(add2_wait(name="add2", x=wf2.mult.lzout.out)) - wf2.set_output([("out", wf2.add2.lzout.out)]) - wf2.inputs.x = [2, 20] - wf2.inputs.y = [3, 4] - wf2.plugin = plugin - - t0 = time.time() - with Submitter(plugin=plugin) as sub: - sub(wf2, rerun=True) - t2 = time.time() - t0 - - results2 = wf2.result() - assert results2.output.out == [8, 82] - - # for win and dask/slurm the time for dir creation etc. 
might take much longer - if not sys.platform.startswith("win") and plugin == "cf": - # checking the execution time - assert t1 > 2 - assert t2 > 2 - - # checking all directories - assert wf1.output_dir.exists() - - # checking if the second wf run again - assert wf2.output_dir.exists() - - -@pytest.mark.flaky(reruns=3) -def test_wf_ndstate_cachelocations_updatespl(plugin, tmpdir): - """ - Two wfs with identical inputs and node state (that is set after adding the node!); - the second wf has cache_locations and should not recompute the results - """ - cache_dir1 = tmpdir.mkdir("test_wf_cache3") - cache_dir2 = tmpdir.mkdir("test_wf_cache4") - - wf1 = Workflow(name="wf", input_spec=["x", "y"], cache_dir=cache_dir1) - wf1.add( - multiply(name="mult").split(splitter=("x", "y"), x=wf1.lzin.x, y=wf1.lzin.y) - ) - wf1.add(add2_wait(name="add2", x=wf1.mult.lzout.out)) - wf1.set_output([("out", wf1.add2.lzout.out)]) - wf1.inputs.x = [2, 20] - wf1.inputs.y = [3, 4] - wf1.plugin = plugin - - t0 = time.time() - with Submitter(plugin=plugin) as sub: - sub(wf1) - t1 = time.time() - t0 - - results1 = wf1.result() - assert results1.output.out == [8, 82] - - wf2 = Workflow( - name="wf", - input_spec=["x", "y"], - cache_dir=cache_dir2, - cache_locations=cache_dir1, - ) - wf2.add(multiply(name="mult")) - wf2.mult.split(splitter=("x", "y"), x=wf2.lzin.x, y=wf2.lzin.y) - wf2.add(add2_wait(name="add2", x=wf2.mult.lzout.out)) - wf2.set_output([("out", wf2.add2.lzout.out)]) - wf2.inputs.x = [2, 20] - wf2.inputs.y = [3, 4] - wf2.plugin = plugin - - t0 = time.time() - with Submitter(plugin=plugin) as sub: - sub(wf2) - t2 = time.time() - t0 - - results2 = wf2.result() - assert results2.output.out == [8, 82] - - # for win and dask/slurm the time for dir creation etc. might take much longer - if not sys.platform.startswith("win") and plugin == "cf": - # checking the execution time - assert t1 > 2 - assert t2 < max(1, t1 - 1) - - # checking all directories - assert wf1.output_dir.exists() - - # checking if the second wf didn't run again - # checking all directories - assert not wf2.output_dir.exists() - - -@pytest.mark.flaky(reruns=3) -def test_wf_ndstate_cachelocations_recompute(plugin, tmpdir): - """ - Two wfs (with nodes with states) with provided cache_dir; - the second wf has cache_locations and should not recompute the results - """ - cache_dir1 = tmpdir.mkdir("test_wf_cache3") - cache_dir2 = tmpdir.mkdir("test_wf_cache4") - - wf1 = Workflow(name="wf", input_spec=["x", "y"], cache_dir=cache_dir1) - wf1.add( - multiply(name="mult").split(splitter=("x", "y"), x=wf1.lzin.x, y=wf1.lzin.y) - ) - wf1.add(add2_wait(name="add2", x=wf1.mult.lzout.out)) - wf1.set_output([("out", wf1.add2.lzout.out)]) - wf1.inputs.x = [2, 20] - wf1.inputs.y = [3, 4] - wf1.plugin = plugin - - t0 = time.time() - with Submitter(plugin=plugin) as sub: - sub(wf1) - t1 = time.time() - t0 - - results1 = wf1.result() - assert results1.output.out == [8, 82] - - wf2 = Workflow( - name="wf", - input_spec=["x", "y"], - cache_dir=cache_dir2, - cache_locations=cache_dir1, - ) - wf2.add( - multiply(name="mult").split(splitter=["x", "y"], x=wf2.lzin.x, y=wf2.lzin.y) - ) - wf2.add(add2_wait(name="add2", x=wf2.mult.lzout.out)) - wf2.set_output([("out", wf2.add2.lzout.out)]) - wf2.inputs.x = [2, 20] - wf2.inputs.y = [3, 4] - wf2.plugin = plugin - - t0 = time.time() - with Submitter(plugin=plugin) as sub: - sub(wf2) - t2 = time.time() - t0 - - results2 = wf2.result() - assert results2.output.out == [8, 10, 62, 82] - - # for win and dask/slurm the time for dir 
creation etc. might take much longer - if not sys.platform.startswith("win") and plugin == "cf": - # checking the execution time - assert t1 > 2 - assert t2 > 2 - - # checking all directories - assert wf1.output_dir.exists() - - # checking if the second wf didn't run again - # checking all directories - assert wf2.output_dir.exists() - - -@pytest.mark.flaky(reruns=3) -def test_wf_nostate_runtwice_usecache(plugin, tmpdir): - """ - running workflow (without state) twice, - the second run should use the results from the first one - """ - cache_dir1 = tmpdir.mkdir("test_wf_cache3") - - wf1 = Workflow(name="wf", input_spec=["x", "y"], cache_dir=cache_dir1) - wf1.add(multiply(name="mult", x=wf1.lzin.x, y=wf1.lzin.y)) - wf1.add(add2_wait(name="add2", x=wf1.mult.lzout.out)) - wf1.set_output([("out", wf1.add2.lzout.out)]) - wf1.inputs.x = 2 - wf1.inputs.y = 3 - wf1.plugin = plugin - - t0 = time.time() - with Submitter(plugin=plugin) as sub: - sub(wf1) - t1 = time.time() - t0 - - results1 = wf1.result() - assert 8 == results1.output.out - # checkoing output_dir after the first run - assert wf1.output_dir.exists() - - # saving the content of the cache dit after the first run - cache_dir_content = os.listdir(wf1.cache_dir) - - # running workflow the second time - t0 = time.time() - with Submitter(plugin=plugin) as sub: - sub(wf1) - t2 = time.time() - t0 - - results1 = wf1.result() - assert 8 == results1.output.out - # checking if no new directory is created - assert cache_dir_content == os.listdir(wf1.cache_dir) - - # for win and dask/slurm the time for dir creation etc. might take much longer - if not sys.platform.startswith("win") and plugin == "cf": - # checking the execution time - assert t1 > 2 - assert t2 < max(1, t1 - 1) - - -def test_wf_state_runtwice_usecache(plugin, tmpdir): - """ - running workflow with a state twice, - the second run should use the results from the first one - """ - cache_dir1 = tmpdir.mkdir("test_wf_cache3") - - wf1 = Workflow(name="wf", input_spec=["x", "y"], cache_dir=cache_dir1) - wf1.add(multiply(name="mult", x=wf1.lzin.x, y=wf1.lzin.y)) - wf1.add(add2_wait(name="add2", x=wf1.mult.lzout.out)) - wf1.set_output([("out", wf1.add2.lzout.out)]) - wf1.split(splitter=("x", "y"), x=[2, 20], y=[3, 30]) - wf1.plugin = plugin - - t0 = time.time() - with Submitter(plugin=plugin) as sub: - sub(wf1) - t1 = time.time() - t0 - - results1 = wf1.result() - assert 8 == results1[0].output.out - assert 602 == results1[1].output.out - - # checkoing output_dir after the first run - assert [odir.exists() for odir in wf1.output_dir] - - # saving the content of the cache dit after the first run - cache_dir_content = os.listdir(wf1.cache_dir) - - # running workflow the second time - t0 = time.time() - with Submitter(plugin=plugin) as sub: - sub(wf1) - t2 = time.time() - t0 - - results1 = wf1.result() - assert 8 == results1[0].output.out - assert 602 == results1[1].output.out - # checking if no new directory is created - assert cache_dir_content == os.listdir(wf1.cache_dir) - # for win and dask/slurm the time for dir creation etc. 
might take much longer - if not sys.platform.startswith("win") and plugin == "cf": - # checking the execution time - assert t1 > 2 - assert t2 < max(1, t1 - 1) - - -@pytest.fixture -def create_tasks(): - wf = Workflow(name="wf", input_spec=["x"]) - wf.inputs.x = 1 - wf.add(add2(name="t1", x=wf.lzin.x)) - wf.add(multiply(name="t2", x=wf.t1.lzout.out, y=2)) - wf.set_output([("out", wf.t2.lzout.out)]) - t1 = wf.name2obj["t1"] - t2 = wf.name2obj["t2"] - return wf, t1, t2 - - -def test_cache_propagation1(tmpdir, create_tasks): - """No cache set, all independent""" - wf, t1, t2 = create_tasks - wf(plugin="cf") - assert wf.cache_dir == t1.cache_dir == t2.cache_dir - wf.cache_dir = (tmpdir / "shared").strpath - wf(plugin="cf") - assert wf.cache_dir == t1.cache_dir == t2.cache_dir - - -def test_cache_propagation2(tmpdir, create_tasks): - """Task explicitly states no inheriting""" - wf, t1, t2 = create_tasks - wf.cache_dir = (tmpdir / "shared").strpath - t2.allow_cache_override = False - wf(plugin="cf") - assert wf.cache_dir == t1.cache_dir != t2.cache_dir - - -def test_cache_propagation3(tmpdir, create_tasks): - """Shared cache_dir with state""" - wf, t1, t2 = create_tasks - wf.split("x", x=[1, 2]) - wf.cache_dir = (tmpdir / "shared").strpath - wf(plugin="cf") - assert wf.cache_dir == t1.cache_dir == t2.cache_dir - - -def test_workflow_combine1(tmpdir): - wf1 = Workflow(name="wf1", input_spec=["a", "b"], a=[1, 2], b=[2, 3]) - wf1.add(power(name="power").split(["a", "b"], a=wf1.lzin.a, b=wf1.lzin.b)) - wf1.add(identity(name="identity1", x=wf1.power.lzout.out).combine("power.a")) - wf1.add(identity(name="identity2", x=wf1.identity1.lzout.out).combine("power.b")) - wf1.set_output( - { - "out_pow": wf1.power.lzout.out, - "out_iden1": wf1.identity1.lzout.out, - "out_iden2": wf1.identity2.lzout.out, - } - ) - wf1.cache_dir = tmpdir - result = wf1() - - assert result.output.out_pow == [1, 1, 4, 8] - assert result.output.out_iden1 == [[1, 4], [1, 8]] - assert result.output.out_iden2 == [[1, 4], [1, 8]] - - -def test_workflow_combine2(tmpdir): - wf1 = Workflow(name="wf1", input_spec=["a", "b"], a=[1, 2], b=[2, 3]) - wf1.add( - power(name="power").split(["a", "b"], a=wf1.lzin.a, b=wf1.lzin.b).combine("a") - ) - wf1.add(identity(name="identity", x=wf1.power.lzout.out).combine("power.b")) - wf1.set_output({"out_pow": wf1.power.lzout.out, "out_iden": wf1.identity.lzout.out}) - wf1.cache_dir = tmpdir - result = wf1() - - assert result.output.out_pow == [[1, 4], [1, 8]] - assert result.output.out_iden == [[1, 4], [1, 8]] - - -# testing lzout.all to collect all of the results and let FunctionTask deal with it - - -def test_wf_lzoutall_1(plugin, tmpdir): - """workflow with 2 tasks, no splitter - passing entire result object to add2_sub2_res function - by using lzout.all syntax - """ - wf = Workflow(name="wf_2", input_spec=["x", "y"]) - wf.add(multiply(name="mult", x=wf.lzin.x, y=wf.lzin.y)) - wf.add(add2_sub2_res(name="add_sub", res=wf.mult.lzout.all_)) - wf.set_output([("out", wf.add_sub.lzout.out_add)]) - wf.inputs.x = 2 - wf.inputs.y = 3 - wf.cache_dir = tmpdir - - with Submitter(plugin=plugin) as sub: - sub(wf) - - assert wf.output_dir.exists() - results = wf.result() - assert 8 == results.output.out - - -def test_wf_lzoutall_1a(plugin, tmpdir): - """workflow with 2 tasks, no splitter - passing entire result object to add2_res function - by using lzout.all syntax in the node connections and for wf output - """ - wf = Workflow(name="wf_2", input_spec=["x", "y"]) - wf.add(multiply(name="mult", x=wf.lzin.x, 
y=wf.lzin.y)) - wf.add(add2_sub2_res(name="add_sub", res=wf.mult.lzout.all_)) - wf.set_output([("out_all", wf.add_sub.lzout.all_)]) - wf.inputs.x = 2 - wf.inputs.y = 3 - wf.cache_dir = tmpdir - - with Submitter(plugin=plugin) as sub: - sub(wf) - - assert wf.output_dir.exists() - results = wf.result() - assert results.output.out_all == {"out_add": 8, "out_sub": 4} - - -def test_wf_lzoutall_st_1(plugin, tmpdir): - """workflow with 2 tasks, no splitter - passing entire result object to add2_res function - by using lzout.all syntax - """ - wf = Workflow(name="wf_2", input_spec=["x", "y"]) - wf.add(multiply(name="mult").split(["x", "y"], x=wf.lzin.x, y=wf.lzin.y)) - wf.add(add2_sub2_res(name="add_sub", res=wf.mult.lzout.all_)) - wf.set_output([("out_add", wf.add_sub.lzout.out_add)]) - wf.inputs.x = [2, 20] - wf.inputs.y = [3, 30] - wf.plugin = plugin - wf.cache_dir = tmpdir - - with Submitter(plugin=plugin) as sub: - sub(wf) - - assert wf.output_dir.exists() - results = wf.result() - assert results.output.out_add == [8, 62, 62, 602] - - -def test_wf_lzoutall_st_1a(plugin, tmpdir): - """workflow with 2 tasks, no splitter - passing entire result object to add2_res function - by using lzout.all syntax - """ - wf = Workflow(name="wf_2", input_spec=["x", "y"]) - wf.add(multiply(name="mult").split(["x", "y"], x=wf.lzin.x, y=wf.lzin.y)) - wf.add(add2_sub2_res(name="add_sub", res=wf.mult.lzout.all_)) - wf.set_output([("out_all", wf.add_sub.lzout.all_)]) - wf.inputs.x = [2, 20] - wf.inputs.y = [3, 30] - wf.plugin = plugin - wf.cache_dir = tmpdir - - with Submitter(plugin=plugin) as sub: - sub(wf) - - assert wf.output_dir.exists() - results = wf.result() - assert results.output.out_all == [ - {"out_add": 8, "out_sub": 4}, - {"out_add": 62, "out_sub": 58}, - {"out_add": 62, "out_sub": 58}, - {"out_add": 602, "out_sub": 598}, - ] - - -def test_wf_lzoutall_st_2(plugin, tmpdir): - """workflow with 2 tasks, no splitter - passing entire result object to add2_res function - by using lzout.all syntax - """ - wf = Workflow(name="wf_2", input_spec=["x", "y"]) - wf.add( - multiply(name="mult").split(["x", "y"], x=wf.lzin.x, y=wf.lzin.y).combine("x") - ) - wf.add(add2_sub2_res_list(name="add_sub", res=wf.mult.lzout.all_)) - wf.set_output([("out_add", wf.add_sub.lzout.out_add)]) - wf.inputs.x = [2, 20] - wf.inputs.y = [3, 30] - wf.plugin = plugin - wf.cache_dir = tmpdir - - with Submitter(plugin=plugin) as sub: - sub(wf) - - assert wf.output_dir.exists() - results = wf.result() - assert results.output.out_add[0] == [8, 62] - assert results.output.out_add[1] == [62, 602] - - -@pytest.mark.xfail( - condition=bool(shutil.which("sbatch")), # using SLURM - reason=( - "Not passing on SLURM image for some reason, hoping upgrade of image/Python " - "version fixes it" - ), -) -def test_wf_lzoutall_st_2a(plugin, tmpdir): - """workflow with 2 tasks, no splitter - passing entire result object to add2_res function - by using lzout.all syntax - """ - wf = Workflow(name="wf_2", input_spec=["x", "y"]) - wf.add( - multiply(name="mult").split(["x", "y"], x=wf.lzin.x, y=wf.lzin.y).combine("x") - ) - wf.add(add2_sub2_res_list(name="add_sub", res=wf.mult.lzout.all_)) - wf.set_output([("out_all", wf.add_sub.lzout.all_)]) - wf.inputs.x = [2, 20] - wf.inputs.y = [3, 30] - wf.plugin = plugin - wf.cache_dir = tmpdir - - with Submitter(plugin=plugin) as sub: - sub(wf) - - assert wf.output_dir.exists() - results = wf.result() - assert results.output.out_all == [ - {"out_add": [8, 62], "out_sub": [4, 58]}, - {"out_add": [62, 602], "out_sub": 
[58, 598]}, - ] - - -# workflows that have files in the result, the files should be copied to the wf dir - - -def test_wf_resultfile_1(plugin, tmpdir): - """workflow with a file in the result, file should be copied to the wf dir""" - wf = Workflow(name="wf_file_1", input_spec=["x"], cache_dir=tmpdir) - wf.add(fun_write_file(name="writefile", filename=wf.lzin.x)) - wf.inputs.x = "file_1.txt" - wf.plugin = plugin - wf.set_output([("wf_out", wf.writefile.lzout.out)]) - - with Submitter(plugin=plugin) as sub: - sub(wf) - - results = wf.result() - # checking if the file exists and if it is in the Workflow directory - wf_out = results.output.wf_out.fspath - wf_out.exists() - assert wf_out == wf.output_dir / "file_1.txt" - - -def test_wf_resultfile_2(plugin, tmpdir): - """workflow with a list of files in the wf result, - all files should be copied to the wf dir - """ - wf = Workflow(name="wf_file_1", input_spec=["x"], cache_dir=tmpdir) - wf.add(fun_write_file_list(name="writefile", filename_list=wf.lzin.x)) - file_list = ["file_1.txt", "file_2.txt", "file_3.txt"] - wf.inputs.x = file_list - wf.plugin = plugin - wf.set_output([("wf_out", wf.writefile.lzout.out)]) - - with Submitter(plugin=plugin) as sub: - sub(wf) - - results = wf.result() - # checking if the file exists and if it is in the Workflow directory - for ii, file in enumerate(results.output.wf_out): - assert file.fspath.exists() - assert file.fspath == wf.output_dir / file_list[ii] - - -def test_wf_resultfile_3(plugin, tmpdir): - """workflow with a dictionaries of files in the wf result, - all files should be copied to the wf dir - """ - wf = Workflow(name="wf_file_1", input_spec=["x"], cache_dir=tmpdir) - wf.add(fun_write_file_list2dict(name="writefile", filename_list=wf.lzin.x)) - file_list = ["file_1.txt", "file_2.txt", "file_3.txt"] - wf.inputs.x = file_list - wf.plugin = plugin - wf.set_output([("wf_out", wf.writefile.lzout.out)]) - - with Submitter(plugin=plugin) as sub: - sub(wf) - - results = wf.result() - # checking if the file exists and if it is in the Workflow directory - for key, val in results.output.wf_out.items(): - if key == "random_int": - assert val == 20 - else: - assert val.fspath.exists() - ii = int(key.split("_")[1]) - assert val.fspath == wf.output_dir / file_list[ii] - - -def test_wf_upstream_error1(plugin, tmpdir): - """workflow with two tasks, task2 dependent on an task1 which raised an error""" - wf = Workflow(name="wf", input_spec=["x"], cache_dir=tmpdir) - wf.add(fun_addvar_default_notype(name="addvar1", a=wf.lzin.x)) - wf.inputs.x = "hi" # TypeError for adding str and int - wf.plugin = plugin - wf.add(fun_addvar_default_notype(name="addvar2", a=wf.addvar1.lzout.out)) - wf.set_output([("out", wf.addvar2.lzout.out)]) - - with pytest.raises(ValueError) as excinfo: - with Submitter(plugin=plugin) as sub: - sub(wf) - assert "addvar1" in str(excinfo.value) - assert "raised an error" in str(excinfo.value) - - -def test_wf_upstream_error2(plugin, tmpdir): - """task2 dependent on task1, task1 errors, workflow-level split on task 1 - goal - workflow finish running, one output errors but the other doesn't - """ - wf = Workflow(name="wf", input_spec=["x"], cache_dir=tmpdir) - wf.add(fun_addvar_default_notype(name="addvar1", a=wf.lzin.x)) - wf.split("x", x=[1, "hi"]) # workflow-level split TypeError for adding str and int - wf.plugin = plugin - wf.add(fun_addvar_default_notype(name="addvar2", a=wf.addvar1.lzout.out)) - wf.set_output([("out", wf.addvar2.lzout.out)]) - - with pytest.raises(Exception) as excinfo: - with 
Submitter(plugin=plugin) as sub: - sub(wf) - assert "addvar1" in str(excinfo.value) - assert "raised an error" in str(excinfo.value) - - -@pytest.mark.flaky(reruns=2) # when slurm -def test_wf_upstream_error3(plugin, tmpdir): - """task2 dependent on task1, task1 errors, task-level split on task 1 - goal - workflow finish running, one output errors but the other doesn't - """ - wf = Workflow(name="wf", input_spec=["x"], cache_dir=tmpdir) - wf.add(fun_addvar_default_notype(name="addvar1")) - wf.inputs.x = [1, "hi"] # TypeError for adding str and int - wf.addvar1.split("a", a=wf.lzin.x) # task-level split - wf.plugin = plugin - wf.add(fun_addvar_default_notype(name="addvar2", a=wf.addvar1.lzout.out)) - wf.set_output([("out", wf.addvar2.lzout.out)]) - - with pytest.raises(Exception) as excinfo: - with Submitter(plugin=plugin) as sub: - sub(wf) - assert "addvar1" in str(excinfo.value) - assert "raised an error" in str(excinfo.value) - - -def test_wf_upstream_error4(plugin, tmpdir): - """workflow with one task, which raises an error""" - wf = Workflow(name="wf", input_spec=["x"], cache_dir=tmpdir) - wf.add(fun_addvar_default_notype(name="addvar1", a=wf.lzin.x)) - wf.inputs.x = "hi" # TypeError for adding str and int - wf.plugin = plugin - wf.set_output([("out", wf.addvar1.lzout.out)]) - - with pytest.raises(Exception) as excinfo: - with Submitter(plugin=plugin) as sub: - sub(wf) - assert "raised an error" in str(excinfo.value) - assert "addvar1" in str(excinfo.value) - - -def test_wf_upstream_error5(plugin, tmpdir): - """nested workflow with one task, which raises an error""" - wf_main = Workflow(name="wf_main", input_spec=["x"], cache_dir=tmpdir) - wf = Workflow(name="wf", input_spec=["x"], x=wf_main.lzin.x) - wf.add(fun_addvar_default_notype(name="addvar1", a=wf.lzin.x)) - wf.plugin = plugin - wf.set_output([("wf_out", wf.addvar1.lzout.out)]) - - wf_main.add(wf) - wf_main.inputs.x = "hi" # TypeError for adding str and int - wf_main.set_output([("out", wf_main.wf.lzout.wf_out)]) - - with pytest.raises(Exception) as excinfo: - with Submitter(plugin=plugin) as sub: - sub(wf_main) - - assert "addvar1" in str(excinfo.value) - assert "raised an error" in str(excinfo.value) - - -def test_wf_upstream_error6(plugin, tmpdir): - """nested workflow with two tasks, the first one raises an error""" - wf_main = Workflow(name="wf_main", input_spec=["x"], cache_dir=tmpdir) - wf = Workflow(name="wf", input_spec=["x"], x=wf_main.lzin.x) - wf.add(fun_addvar_default_notype(name="addvar1", a=wf.lzin.x)) - wf.add(fun_addvar_default_notype(name="addvar2", a=wf.addvar1.lzout.out)) - wf.plugin = plugin - wf.set_output([("wf_out", wf.addvar2.lzout.out)]) - - wf_main.add(wf) - wf_main.inputs.x = "hi" # TypeError for adding str and int - wf_main.set_output([("out", wf_main.wf.lzout.wf_out)]) - - with pytest.raises(Exception) as excinfo: - with Submitter(plugin=plugin) as sub: - sub(wf_main) - - assert "addvar1" in str(excinfo.value) - assert "raised an error" in str(excinfo.value) - - -def test_wf_upstream_error7(plugin, tmpdir): - """ - workflow with three sequential tasks, the first task raises an error - the last task is set as the workflow output - """ - wf = Workflow(name="wf", input_spec=["x"], cache_dir=tmpdir) - wf.add(fun_addvar_default_notype(name="addvar1", a=wf.lzin.x)) - wf.inputs.x = "hi" # TypeError for adding str and int - wf.plugin = plugin - wf.add(fun_addvar_default_notype(name="addvar2", a=wf.addvar1.lzout.out)) - wf.add(fun_addvar_default_notype(name="addvar3", a=wf.addvar2.lzout.out)) - 
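The upstream-error tests in this stretch all assert the same propagation behaviour: when a task raises, the submitter surfaces a ValueError that names the failed task and says it "raised an error", the failed task's _errored flag becomes True, and each downstream task records the name of the errored predecessor instead of running. A condensed sketch under the same pre-rewrite API; the always-failing `boom` task and `plus_one` are hypothetical:

import pytest
from pydra import mark
from pydra.engine.core import Workflow
from pydra.engine.submitter import Submitter

@mark.task
def boom(a: int) -> int:
    raise ValueError("boom")          # hypothetical task that always fails

@mark.task
def plus_one(a: int) -> int:
    return a + 1

wf = Workflow(name="wf", input_spec=["x"], x=1)
wf.add(boom(name="boom", a=wf.lzin.x))
wf.add(plus_one(name="plus_one", a=wf.boom.lzout.out))
wf.set_output([("out", wf.plus_one.lzout.out)])

with pytest.raises(ValueError) as excinfo:
    with Submitter(plugin="cf") as sub:
        sub(wf)
assert "boom" in str(excinfo.value) and "raised an error" in str(excinfo.value)
assert wf.boom._errored is True
assert wf.plus_one._errored == ["boom"]   # downstream task names its errored predecessor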
wf.set_output([("out", wf.addvar3.lzout.out)]) - - with pytest.raises(ValueError) as excinfo: - with Submitter(plugin=plugin) as sub: - sub(wf) - assert "addvar1" in str(excinfo.value) - assert "raised an error" in str(excinfo.value) - assert wf.addvar1._errored is True - assert wf.addvar2._errored == wf.addvar3._errored == ["addvar1"] - - -def test_wf_upstream_error7a(plugin, tmpdir): - """ - workflow with three sequential tasks, the first task raises an error - the second task is set as the workflow output - """ - wf = Workflow(name="wf", input_spec=["x"], cache_dir=tmpdir) - wf.add(fun_addvar_default_notype(name="addvar1", a=wf.lzin.x)) - wf.inputs.x = "hi" # TypeError for adding str and int - wf.plugin = plugin - wf.add(fun_addvar_default_notype(name="addvar2", a=wf.addvar1.lzout.out)) - wf.add(fun_addvar_default_notype(name="addvar3", a=wf.addvar2.lzout.out)) - wf.set_output([("out", wf.addvar2.lzout.out)]) - - with pytest.raises(ValueError) as excinfo: - with Submitter(plugin=plugin) as sub: - sub(wf) - assert "addvar1" in str(excinfo.value) - assert "raised an error" in str(excinfo.value) - assert wf.addvar1._errored is True - assert wf.addvar2._errored == wf.addvar3._errored == ["addvar1"] - - -def test_wf_upstream_error7b(plugin, tmpdir): - """ - workflow with three sequential tasks, the first task raises an error - the second and the third tasks are set as the workflow output - """ - wf = Workflow(name="wf", input_spec=["x"], cache_dir=tmpdir) - wf.add(fun_addvar_default_notype(name="addvar1", a=wf.lzin.x)) - wf.inputs.x = "hi" # TypeError for adding str and int - wf.plugin = plugin - wf.add(fun_addvar_default_notype(name="addvar2", a=wf.addvar1.lzout.out)) - wf.add(fun_addvar_default_notype(name="addvar3", a=wf.addvar2.lzout.out)) - wf.set_output([("out1", wf.addvar2.lzout.out), ("out2", wf.addvar3.lzout.out)]) - - with pytest.raises(ValueError) as excinfo: - with Submitter(plugin=plugin) as sub: - sub(wf) - assert "addvar1" in str(excinfo.value) - assert "raised an error" in str(excinfo.value) - assert wf.addvar1._errored is True - assert wf.addvar2._errored == wf.addvar3._errored == ["addvar1"] - - -def test_wf_upstream_error8(plugin, tmpdir): - """workflow with three tasks, the first one raises an error, so 2 others are removed""" - wf = Workflow(name="wf", input_spec=["x"], cache_dir=tmpdir) - wf.add(fun_addvar_default_notype(name="addvar1", a=wf.lzin.x)) - wf.inputs.x = "hi" # TypeError for adding str and int - wf.plugin = plugin - wf.add(fun_addvar_default_notype(name="addvar2", a=wf.addvar1.lzout.out)) - wf.add(fun_addtwo(name="addtwo", a=wf.addvar1.lzout.out)) - wf.set_output([("out1", wf.addvar2.lzout.out), ("out2", wf.addtwo.lzout.out)]) - - with pytest.raises(ValueError) as excinfo: - with Submitter(plugin=plugin) as sub: - sub(wf) - - assert "addvar1" in str(excinfo.value) - assert "raised an error" in str(excinfo.value) - assert wf.addvar1._errored is True - assert wf.addvar2._errored == wf.addtwo._errored == ["addvar1"] - - -def test_wf_upstream_error9(plugin, tmpdir): - """ - workflow with five tasks with two "branches", - one branch has an error, the second is fine - the errored branch is connected to the workflow output - """ - wf = Workflow(name="wf", input_spec=["x"], cache_dir=tmpdir) - wf.add(fun_addvar_default_notype(name="addvar1", a=wf.lzin.x)) - wf.inputs.x = 2 - wf.add(fun_addvar_notype(name="err", a=wf.addvar1.lzout.out, b="hi")) - wf.add(fun_addvar_default_notype(name="follow_err", a=wf.err.lzout.out)) - - wf.add(fun_addtwo_notype(name="addtwo", 
a=wf.addvar1.lzout.out)) - wf.add(fun_addvar_default_notype(name="addvar2", a=wf.addtwo.lzout.out)) - wf.set_output([("out1", wf.follow_err.lzout.out)]) - - wf.plugin = plugin - with pytest.raises(ValueError) as excinfo: - with Submitter(plugin=plugin) as sub: - sub(wf) - assert "err" in str(excinfo.value) - assert "raised an error" in str(excinfo.value) - assert wf.err._errored is True - assert wf.follow_err._errored == ["err"] - - -def test_wf_upstream_error9a(plugin, tmpdir): - """ - workflow with five tasks with two "branches", - one branch has an error, the second is fine - the branch without error is connected to the workflow output - so the workflow finished clean - """ - wf = Workflow(name="wf", input_spec=["x"], cache_dir=tmpdir) - wf.add(fun_addvar_default(name="addvar1", a=wf.lzin.x)) - wf.inputs.x = 2 - wf.add(fun_addvar_notype(name="err", a=wf.addvar1.lzout.out, b="hi")) - wf.add(fun_addvar_default(name="follow_err", a=wf.err.lzout.out)) - - wf.add(fun_addtwo_notype(name="addtwo", a=wf.addvar1.lzout.out)) - wf.add(fun_addvar_default(name="addvar2", a=wf.addtwo.lzout.out)) - wf.set_output([("out1", wf.addvar2.lzout.out)]) # , ("out2", wf.addtwo.lzout.out)]) - - wf.plugin = plugin - with Submitter(plugin=plugin) as sub: - sub(wf) - assert wf.err._errored is True - assert wf.follow_err._errored == ["err"] - - -def test_wf_upstream_error9b(plugin, tmpdir): - """ - workflow with five tasks with two "branches", - one branch has an error, the second is fine - both branches are connected to the workflow output - """ - wf = Workflow(name="wf", input_spec=["x"], cache_dir=tmpdir) - wf.add(fun_addvar_default_notype(name="addvar1", a=wf.lzin.x)) - wf.inputs.x = 2 - wf.add(fun_addvar_notype(name="err", a=wf.addvar1.lzout.out, b="hi")) - wf.add(fun_addvar_default_notype(name="follow_err", a=wf.err.lzout.out)) - - wf.add(fun_addtwo_notype(name="addtwo", a=wf.addvar1.lzout.out)) - wf.add(fun_addvar_default_notype(name="addvar2", a=wf.addtwo.lzout.out)) - wf.set_output([("out1", wf.follow_err.lzout.out), ("out2", wf.addtwo.lzout.out)]) - - wf.plugin = plugin - with pytest.raises(ValueError) as excinfo: - with Submitter(plugin=plugin) as sub: - sub(wf) - assert "err" in str(excinfo.value) - assert "raised an error" in str(excinfo.value) - assert wf.err._errored is True - assert wf.follow_err._errored == ["err"] - - -def exporting_graphs(wf, name): - """helper function to run dot to create png/pdf files from dotfiles""" - # exporting the simple graph - dotfile_pr, formatted_dot = wf.create_dotfile(export=True, name=name) - assert len(formatted_dot) == 1 - assert formatted_dot[0] == dotfile_pr.with_suffix(".png") - assert formatted_dot[0].exists() - print("\n png of a simple graph in: ", formatted_dot[0]) - # exporting nested graph - dotfile_pr, formatted_dot = wf.create_dotfile( - type="nested", export=["pdf", "png"], name=f"{name}_nest" - ) - assert len(formatted_dot) == 2 - assert formatted_dot[0] == dotfile_pr.with_suffix(".pdf") - assert formatted_dot[0].exists() - print("\n pdf of the nested graph in: ", formatted_dot[0]) - # detailed graph - dotfile_pr, formatted_dot = wf.create_dotfile( - type="detailed", export="pdf", name=f"{name}_det" - ) - assert len(formatted_dot) == 1 - assert formatted_dot[0] == dotfile_pr.with_suffix(".pdf") - assert formatted_dot[0].exists() - print("\n pdf of the detailed graph in: ", formatted_dot[0]) - - -@pytest.mark.parametrize("splitter", [None, "x"]) -def test_graph_1(tmpdir, splitter): - """creating a set of graphs, wf with two nodes""" - wf = 
Workflow(name="wf", input_spec=["x", "y"], cache_dir=tmpdir) - wf.add(multiply(name="mult_1", x=wf.lzin.x, y=wf.lzin.y)) - wf.add(multiply(name="mult_2", x=wf.lzin.x, y=wf.lzin.x)) - wf.add(add2(name="add2", x=wf.mult_1.lzout.out)) - wf.set_output([("out", wf.add2.lzout.out)]) - wf.split(splitter, x=[1, 2]) - - # simple graph - dotfile_s = wf.create_dotfile() - dotstr_s_lines = dotfile_s.read_text().split("\n") - assert "mult_1" in dotstr_s_lines - assert "mult_2" in dotstr_s_lines - assert "add2" in dotstr_s_lines - assert "mult_1 -> add2" in dotstr_s_lines - - # nested graph (should have the same elements) - dotfile_n = wf.create_dotfile(type="nested") - dotstr_n_lines = dotfile_n.read_text().split("\n") - assert "mult_1" in dotstr_n_lines - assert "mult_2" in dotstr_n_lines - assert "add2" in dotstr_n_lines - assert "mult_1 -> add2" in dotstr_n_lines - - # detailed graph - dotfile_d = wf.create_dotfile(type="detailed") - dotstr_d_lines = dotfile_d.read_text().split("\n") - assert ( - 'struct_wf [color=red, label="{WORKFLOW INPUT: | { x | y}}"];' - in dotstr_d_lines - ) - assert "struct_mult_1:out -> struct_add2:x;" in dotstr_d_lines - - # exporting graphs if dot available - if DOT_FLAG: - name = f"graph_{sys._getframe().f_code.co_name}" - exporting_graphs(wf=wf, name=name) - - -def test_graph_1st(tmpdir): - """creating a set of graphs, wf with two nodes - some nodes have splitters, should be marked with blue color - """ - wf = Workflow(name="wf", input_spec=["x", "y"], cache_dir=tmpdir) - wf.add(multiply(name="mult_1", y=wf.lzin.y).split("x", x=wf.lzin.x)) - wf.add(multiply(name="mult_2", x=wf.lzin.x, y=wf.lzin.x)) - wf.add(add2(name="add2", x=wf.mult_1.lzout.out)) - wf.set_output([("out", wf.add2.lzout.out)]) - - # simple graph - dotfile_s = wf.create_dotfile() - dotstr_s_lines = dotfile_s.read_text().split("\n") - assert "mult_1 [color=blue]" in dotstr_s_lines - assert "mult_2" in dotstr_s_lines - assert "add2 [color=blue]" in dotstr_s_lines - assert "mult_1 -> add2 [color=blue]" in dotstr_s_lines - - # nested graph - dotfile_n = wf.create_dotfile(type="nested") - dotstr_n_lines = dotfile_n.read_text().split("\n") - assert "mult_1 [color=blue]" in dotstr_n_lines - assert "mult_2" in dotstr_n_lines - assert "add2 [color=blue]" in dotstr_n_lines - assert "mult_1 -> add2 [color=blue]" in dotstr_n_lines - - # detailed graph - dotfile_d = wf.create_dotfile(type="detailed") - dotstr_d_lines = dotfile_d.read_text().split("\n") - assert ( - 'struct_wf [color=red, label="{WORKFLOW INPUT: | { x | y}}"];' - in dotstr_d_lines - ) - assert "struct_mult_1:out -> struct_add2:x;" in dotstr_d_lines - - if DOT_FLAG: - name = f"graph_{sys._getframe().f_code.co_name}" - exporting_graphs(wf=wf, name=name) - - -def test_graph_1st_cmb(tmpdir): - """creating a set of graphs, wf with three nodes - the first one has a splitter, the second has a combiner, so the third one is stateless - first two nodes should be blue and the arrow between them should be blue - """ - wf = Workflow(name="wf", input_spec=["x", "y"], cache_dir=tmpdir) - wf.add(multiply(name="mult", y=wf.lzin.y).split("x", x=wf.lzin.x)) - wf.add(add2(name="add2", x=wf.mult.lzout.out).combine("mult.x")) - wf.add(list_sum(name="sum", x=wf.add2.lzout.out)) - wf.set_output([("out", wf.sum.lzout.out)]) - # simple graph - dotfile_s = wf.create_dotfile() - dotstr_s_lines = dotfile_s.read_text().split("\n") - assert "mult [color=blue]" in dotstr_s_lines - assert "add2 [color=blue]" in dotstr_s_lines - assert "sum" in dotstr_s_lines - assert "mult -> add2 
[color=blue]" in dotstr_s_lines - assert "add2 -> sum" in dotstr_s_lines - - # nested graph - dotfile_n = wf.create_dotfile(type="nested") - dotstr_n_lines = dotfile_n.read_text().split("\n") - assert "mult [color=blue]" in dotstr_n_lines - assert "add2 [color=blue]" in dotstr_n_lines - assert "sum" in dotstr_n_lines - assert "mult -> add2 [color=blue]" in dotstr_n_lines - assert "add2 -> sum" in dotstr_n_lines - - # detailed graph - dotfile_d = wf.create_dotfile(type="detailed") - dotstr_d_lines = dotfile_d.read_text().split("\n") - assert ( - 'struct_wf [color=red, label="{WORKFLOW INPUT: | { x | y}}"];' - in dotstr_d_lines - ) - assert "struct_mult:out -> struct_add2:x;" in dotstr_d_lines - - if DOT_FLAG: - name = f"graph_{sys._getframe().f_code.co_name}" - exporting_graphs(wf=wf, name=name) - - -def test_graph_2(tmpdir): - """creating a graph, wf with one workflow as a node""" - wf = Workflow(name="wf", input_spec=["x"], cache_dir=tmpdir) - wfnd = Workflow(name="wfnd", input_spec=["x"], x=wf.lzin.x) - wfnd.add(add2(name="add2", x=wfnd.lzin.x)) - wfnd.set_output([("out", wfnd.add2.lzout.out)]) - wf.add(wfnd) - wf.set_output([("out", wf.wfnd.lzout.out)]) - - # simple graph - dotfile_s = wf.create_dotfile() - dotstr_s_lines = dotfile_s.read_text().split("\n") - assert "wfnd [shape=box]" in dotstr_s_lines - - # nested graph - dotfile = wf.create_dotfile(type="nested") - dotstr_lines = dotfile.read_text().split("\n") - assert "subgraph cluster_wfnd {" in dotstr_lines - assert "add2" in dotstr_lines - - # detailed graph - dotfile_d = wf.create_dotfile(type="detailed") - dotstr_d_lines = dotfile_d.read_text().split("\n") - assert ( - 'struct_wf [color=red, label="{WORKFLOW INPUT: | { x}}"];' in dotstr_d_lines - ) - - if DOT_FLAG: - name = f"graph_{sys._getframe().f_code.co_name}" - exporting_graphs(wf=wf, name=name) - - -def test_graph_2st(tmpdir): - """creating a set of graphs, wf with one workflow as a node - the inner workflow has a state, so should be blue - """ - wf = Workflow(name="wf", input_spec=["x"], cache_dir=tmpdir) - wfnd = Workflow(name="wfnd", input_spec=["x"]).split("x", x=wf.lzin.x) - wfnd.add(add2(name="add2", x=wfnd.lzin.x)) - wfnd.set_output([("out", wfnd.add2.lzout.out)]) - wf.add(wfnd) - wf.set_output([("out", wf.wfnd.lzout.out)]) - - # simple graph - dotfile_s = wf.create_dotfile() - dotstr_s_lines = dotfile_s.read_text().split("\n") - assert "wfnd [shape=box, color=blue]" in dotstr_s_lines - - # nested graph - dotfile_s = wf.create_dotfile(type="nested") - dotstr_s_lines = dotfile_s.read_text().split("\n") - assert "subgraph cluster_wfnd {" in dotstr_s_lines - assert "color=blue" in dotstr_s_lines - assert "add2" in dotstr_s_lines - - # detailed graph - dotfile_d = wf.create_dotfile(type="detailed") - dotstr_d_lines = dotfile_d.read_text().split("\n") - assert ( - 'struct_wf [color=red, label="{WORKFLOW INPUT: | { x}}"];' in dotstr_d_lines - ) - assert "struct_wfnd:out -> struct_wf_out:out;" in dotstr_d_lines - - if DOT_FLAG: - name = f"graph_{sys._getframe().f_code.co_name}" - exporting_graphs(wf=wf, name=name) - - -def test_graph_3(tmpdir): - """creating a set of graphs, wf with two nodes (one node is a workflow)""" - wf = Workflow(name="wf", input_spec=["x", "y"], cache_dir=tmpdir) - wf.add(multiply(name="mult", x=wf.lzin.x, y=wf.lzin.y)) - - wfnd = Workflow(name="wfnd", input_spec=["x"], x=wf.mult.lzout.out) - wfnd.add(add2(name="add2", x=wfnd.lzin.x)) - wfnd.set_output([("out", wfnd.add2.lzout.out)]) - wf.add(wfnd) - wf.set_output([("out", wf.wfnd.lzout.out)]) - - 
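The graph tests that follow exercise three flavours of Workflow.create_dotfile(); a condensed sketch of those calls on a throwaway workflow, assuming the same pre-rewrite API (the `ident` task and the relative cache path are hypothetical, and rendering via export requires graphviz's `dot` on PATH, which is what the DOT_FLAG guard checks):

from pydra import mark
from pydra.engine.core import Workflow

@mark.task
def ident(x: int) -> int:
    return x

wf = Workflow(name="wf", input_spec=["x"], x=1, cache_dir="./dot_cache")
wf.add(ident(name="ident", x=wf.lzin.x))
wf.set_output([("out", wf.ident.lzout.out)])

dot_simple = wf.create_dotfile()                  # flat graph, one node per task
dot_nested = wf.create_dotfile(type="nested")     # nested workflows become subgraph clusters
dot_detail = wf.create_dotfile(type="detailed")   # record nodes exposing input/output ports
print(dot_simple.read_text())
# with `dot` available, export also renders the graph and returns the output paths:
# dotfile, rendered = wf.create_dotfile(type="detailed", export="pdf", name="wf_det")
# assert rendered[0] == dotfile.with_suffix(".pdf") and rendered[0].exists()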
# simple graph - dotfile_s = wf.create_dotfile() - dotstr_s_lines = dotfile_s.read_text().split("\n") - assert "mult" in dotstr_s_lines - assert "wfnd [shape=box]" in dotstr_s_lines - assert "mult -> wfnd" in dotstr_s_lines - - # nested graph - dotfile_n = wf.create_dotfile(type="nested") - dotstr_n_lines = dotfile_n.read_text().split("\n") - assert "mult" in dotstr_n_lines - assert "subgraph cluster_wfnd {" in dotstr_n_lines - assert "add2" in dotstr_n_lines - - # detailed graph - dotfile_d = wf.create_dotfile(type="detailed") - dotstr_d_lines = dotfile_d.read_text().split("\n") - assert ( - 'struct_wf [color=red, label="{WORKFLOW INPUT: | { x | y}}"];' - in dotstr_d_lines - ) - assert "struct_mult:out -> struct_wfnd:x;" in dotstr_d_lines - - if DOT_FLAG: - name = f"graph_{sys._getframe().f_code.co_name}" - exporting_graphs(wf=wf, name=name) - - -def test_graph_3st(tmpdir): - """creating a set of graphs, wf with two nodes (one node is a workflow) - the first node has a state and it should be passed to the second node - (blue node and a wfasnd, and blue arrow from the node to the wfasnd) - """ - wf = Workflow(name="wf", input_spec=["x", "y"], cache_dir=tmpdir) - wf.add(multiply(name="mult", y=wf.lzin.y).split("x", x=wf.lzin.x)) - - wfnd = Workflow(name="wfnd", input_spec=["x"], x=wf.mult.lzout.out) - wfnd.add(add2(name="add2", x=wfnd.lzin.x)) - wfnd.set_output([("out", wfnd.add2.lzout.out)]) - wf.add(wfnd) - wf.set_output([("out", wf.wfnd.lzout.out)]) - - # simple graph - dotfile_s = wf.create_dotfile() - dotstr_s_lines = dotfile_s.read_text().split("\n") - assert "mult [color=blue]" in dotstr_s_lines - assert "wfnd [shape=box, color=blue]" in dotstr_s_lines - assert "mult -> wfnd [color=blue]" in dotstr_s_lines - - # nested graph - dotfile_n = wf.create_dotfile(type="nested") - dotstr_n_lines = dotfile_n.read_text().split("\n") - assert "mult [color=blue]" in dotstr_n_lines - assert "subgraph cluster_wfnd {" in dotstr_n_lines - assert "add2" in dotstr_n_lines - - # detailed graph - dotfile_d = wf.create_dotfile(type="detailed") - dotstr_d_lines = dotfile_d.read_text().split("\n") - assert ( - 'struct_wf [color=red, label="{WORKFLOW INPUT: | { x | y}}"];' - in dotstr_d_lines - ) - assert "struct_mult:out -> struct_wfnd:x;" in dotstr_d_lines - - if DOT_FLAG: - name = f"graph_{sys._getframe().f_code.co_name}" - exporting_graphs(wf=wf, name=name) - - -def test_graph_4(tmpdir): - """creating a set of graphs, wf with two nodes (one node is a workflow with two nodes - inside). Connection from the node to the inner workflow. 
- """ - wf = Workflow(name="wf", input_spec=["x", "y"], cache_dir=tmpdir) - wf.add(multiply(name="mult", x=wf.lzin.x, y=wf.lzin.y)) - wfnd = Workflow(name="wfnd", input_spec=["x"], x=wf.mult.lzout.out) - wfnd.add(add2(name="add2_a", x=wfnd.lzin.x)) - wfnd.add(add2(name="add2_b", x=wfnd.add2_a.lzout.out)) - wfnd.set_output([("out", wfnd.add2_b.lzout.out)]) - wf.add(wfnd) - wf.set_output([("out", wf.wfnd.lzout.out)]) - - # simple graph - dotfile_s = wf.create_dotfile() - dotstr_s_lines = dotfile_s.read_text().split("\n") - assert "mult" in dotstr_s_lines - assert "wfnd [shape=box]" in dotstr_s_lines - assert "mult -> wfnd" in dotstr_s_lines - - # nested graph - dotfile_n = wf.create_dotfile(type="nested") - dotstr_n_lines = dotfile_n.read_text().split("\n") - for el in ["mult", "add2_a", "add2_b"]: - assert el in dotstr_n_lines - assert "subgraph cluster_wfnd {" in dotstr_n_lines - assert "add2_a -> add2_b" in dotstr_n_lines - assert "mult -> add2_a [lhead=cluster_wfnd]" - - # detailed graph - dotfile_d = wf.create_dotfile(type="detailed") - dotstr_d_lines = dotfile_d.read_text().split("\n") - assert ( - 'struct_wf [color=red, label="{WORKFLOW INPUT: | { x | y}}"];' - in dotstr_d_lines - ) - assert "struct_wf:y -> struct_mult:y;" in dotstr_d_lines - - if DOT_FLAG: - name = f"graph_{sys._getframe().f_code.co_name}" - exporting_graphs(wf=wf, name=name) - - -def test_graph_5(tmpdir): - """creating a set of graphs, wf with two nodes (one node is a workflow with two nodes - inside). Connection from the inner workflow to the node. - """ - wf = Workflow(name="wf", input_spec=["x", "y"], cache_dir=tmpdir) - wfnd = Workflow(name="wfnd", input_spec=["x"], x=wf.lzin.x) - wfnd.add(add2(name="add2_a", x=wfnd.lzin.x)) - wfnd.add(add2(name="add2_b", x=wfnd.add2_a.lzout.out)) - wfnd.set_output([("out", wfnd.add2_b.lzout.out)]) - wf.add(wfnd) - wf.add(multiply(name="mult", x=wf.wfnd.lzout.out, y=wf.lzin.y)) - wf.set_output([("out", wf.mult.lzout.out)]) - - # simple graph - dotfile_s = wf.create_dotfile() - dotstr_s_lines = dotfile_s.read_text().split("\n") - assert "mult" in dotstr_s_lines - assert "wfnd [shape=box]" in dotstr_s_lines - assert "wfnd -> mult" in dotstr_s_lines - - # nested graph - dotfile_n = wf.create_dotfile(type="nested") - dotstr_n_lines = dotfile_n.read_text().split("\n") - for el in ["mult", "add2_a", "add2_b"]: - assert el in dotstr_n_lines - assert "subgraph cluster_wfnd {" in dotstr_n_lines - assert "add2_a -> add2_b" in dotstr_n_lines - assert "add2_b -> mult [ltail=cluster_wfnd]" - - # detailed graph - dotfile_d = wf.create_dotfile(type="detailed") - dotstr_d_lines = dotfile_d.read_text().split("\n") - assert ( - 'struct_wf [color=red, label="{WORKFLOW INPUT: | { x | y}}"];' - in dotstr_d_lines - ) - assert "struct_wf:x -> struct_wfnd:x;" in dotstr_d_lines - - if DOT_FLAG: - name = f"graph_{sys._getframe().f_code.co_name}" - exporting_graphs(wf=wf, name=name) - - -@pytest.mark.timeout(20) -def test_duplicate_input_on_split_wf(tmpdir): - """checking if the workflow gets stuck if it has to run two tasks with equal checksum; - This can occur when splitting on a list containing duplicate values. 
- """ - text = ["test"] * 2 - - @mark.task - def printer(a): - return a - - wf = Workflow(name="wf", input_spec=["text"], cache_dir=tmpdir) - wf.split(("text"), text=text) - - wf.add(printer(name="printer1", a=wf.lzin.text)) - - wf.set_output([("out1", wf.printer1.lzout.out)]) - - with Submitter(plugin="cf", n_procs=6) as sub: - sub(wf) - - res = wf.result() - - assert res[0].output.out1 == "test" and res[1].output.out1 == "test" - - -@pytest.mark.timeout(40) -def test_inner_outer_wf_duplicate(tmpdir): - """checking if the execution gets stuck if there is an inner and outer workflows - that run two nodes with the exact same inputs. - """ - task_list = ["First", "Second"] - start_list = [3, 4] - - @mark.task - def one_arg(start_number): - for k in range(10): - start_number += 1 - return start_number - - @mark.task - def one_arg_inner(start_number): - for k in range(10): - start_number += 1 - return start_number - - # Outer workflow - test_outer = Workflow( - name="test_outer", - input_spec=["start_number", "task_name", "dummy"], - cache_dir=tmpdir, - dummy=1, - ) - # Splitting on both arguments - test_outer.split( - ["start_number", "task_name"], start_number=start_list, task_name=task_list - ) - - # Inner Workflow - test_inner = Workflow(name="test_inner", input_spec=["start_number1"]) - test_inner.add( - one_arg_inner(name="Ilevel1", start_number=test_inner.lzin.start_number1) - ) - test_inner.set_output([("res", test_inner.Ilevel1.lzout.out)]) - - # Outer workflow has two nodes plus the inner workflow - test_outer.add(one_arg(name="level1", start_number=test_outer.lzin.start_number)) - test_outer.add(test_inner) - test_inner.inputs.start_number1 = test_outer.level1.lzout.out - - test_outer.set_output([("res2", test_outer.test_inner.lzout.res)]) - - with Submitter(plugin="cf") as sub: - sub(test_outer) - - res = test_outer.result() - assert res[0].output.res2 == 23 and res[1].output.res2 == 23 - - -def test_rerun_errored(tmpdir, capfd): - """Test rerunning a workflow containing errors. 
- Only the errored tasks and workflow should be rerun""" - - @mark.task - def pass_odds(x): - if x % 2 == 0: - print(f"x%2 = {x % 2} (error)\n") - raise Exception("even error") - else: - print(f"x%2 = {x % 2}\n") - return x - - wf = Workflow(name="wf", input_spec=["x"], cache_dir=tmpdir) - wf.add(pass_odds(name="pass_odds").split("x", x=[1, 2, 3, 4, 5])) - wf.set_output([("out", wf.pass_odds.lzout.out)]) - - with pytest.raises(Exception): - wf() - with pytest.raises(Exception): - wf() - - out, err = capfd.readouterr() - stdout_lines = out.splitlines() - - tasks_run = 0 - errors_found = 0 - - for line in stdout_lines: - if "x%2" in line: - tasks_run += 1 - if "(error)" in line: - errors_found += 1 - - # There should have been 5 messages of the form "x%2 = XXX" after calling task() the first time - # and another 2 messagers after calling the second time - assert tasks_run == 7 - assert errors_found == 4 - - -def test_wf_state_arrays(): - wf = Workflow( - name="test", - input_spec={"x": ty.List[int], "y": int}, - output_spec={"alpha": int, "beta": ty.List[int]}, - ) - - wf.add( # Split over workflow input "x" on "scalar" input - list_mult_sum( - in_list=wf.lzin.x, - name="A", - ).split(scalar=wf.lzin.x) - ) - - wf.add( # Workflow is still split over "x", combined over "x" on out - list_mult_sum( - name="B", - scalar=wf.A.lzout.sum, - in_list=wf.A.lzout.products, - ).combine("A.scalar") - ) - - wf.add( # Workflow " - list_mult_sum( - name="C", - scalar=wf.lzin.y, - in_list=wf.B.lzout.sum, - ) - ) - - wf.add( # Workflow is split again, this time over C.products - list_mult_sum( - name="D", - in_list=wf.lzin.x, - ) - .split(scalar=wf.C.lzout.products) - .combine("scalar") - ) - - wf.add( # Workflow is finally combined again into a single node - list_mult_sum(name="E", scalar=wf.lzin.y, in_list=wf.D.lzout.sum) - ) - - wf.set_output([("alpha", wf.E.lzout.sum), ("beta", wf.E.lzout.products)]) - - results = wf(x=[1, 2, 3, 4], y=10) - assert results.output.alpha == 3000000 - assert results.output.beta == [100000, 400000, 900000, 1600000] - - -def test_wf_input_output_typing(): - wf = Workflow( - name="test", - input_spec={"x": int, "y": ty.List[int]}, - output_spec={"alpha": int, "beta": ty.List[int]}, - ) - - with pytest.raises(TypeError) as exc_info: - list_mult_sum( - scalar=wf.lzin.y, - in_list=wf.lzin.y, - name="A", - ) - exc_info_matches(exc_info, "Cannot coerce into ") - - wf.add( # Split over workflow input "x" on "scalar" input - list_mult_sum( - scalar=wf.lzin.x, - in_list=wf.lzin.y, - name="A", - ) - ) - - with pytest.raises(TypeError, match="don't match their declared types"): - wf.set_output( - [ - ("alpha", wf.A.lzout.products), - ] - ) - - wf.set_output([("alpha", wf.A.lzout.sum), ("beta", wf.A.lzout.products)]) diff --git a/pydra/engine/tests/utils.py b/pydra/engine/tests/utils.py index 5b0858866c..36ca5c28d2 100644 --- a/pydra/engine/tests/utils.py +++ b/pydra/engine/tests/utils.py @@ -1,6 +1,7 @@ # Tasks for testing import time -import sys, shutil +import sys +import shutil import typing as ty from pathlib import Path import functools @@ -8,10 +9,12 @@ import subprocess as sp import pytest from fileformats.generic import File +from pydra.utils.general import task_fields +from pydra.engine.submitter import Submitter +from pydra.compose import workflow, python, shell -from ..core import Workflow -from ..submitter import Submitter -from ... 
import mark +if ty.TYPE_CHECKING: + from pydra.environments.base import Environment need_docker = pytest.mark.skipif( @@ -35,18 +38,40 @@ ) -def result_no_submitter(shell_task, plugin=None): +def num_python_cache_roots(cache_path: Path) -> int: + return len(list(cache_path.glob("python-*"))) + + +def get_output_names(task): + return sorted(f.name for f in task_fields(task.Outputs)) + + +def run_no_submitter( + shell_def: shell.Task, + cache_root: Path | None = None, + worker: str | None = None, + environment: "Environment | None" = None, +): """helper function to return result when running without submitter""" - return shell_task() + return shell_def(worker=worker, cache_root=cache_root, environment=environment) -def result_submitter(shell_task, plugin): +def run_submitter( + shell_def: shell.Task, + cache_root: Path | None = None, + worker: str | None = None, + environment: "Environment | None" = None, +): """helper function to return result when running with submitter - with specific plugin + with specific worker """ - with Submitter(plugin=plugin) as sub: - shell_task(submitter=sub) - return shell_task.result() + with Submitter( + worker=worker, cache_root=cache_root, environment=environment + ) as sub: + results = sub(shell_def) + if results.errored: + raise RuntimeError(f"task {shell_def} failed:\n" + "\n".join(results.errors)) + return results.outputs dot_check = sp.run(["which", "dot"], stdout=sp.PIPE, stderr=sp.PIPE) @@ -56,13 +81,13 @@ def result_submitter(shell_task, plugin): DOT_FLAG = False -@mark.task -def op_4var(a, b, c, d) -> str: +@python.define +def Op4Var(a, b, c, d) -> str: return f"{a} {b} {c} {d}" -@mark.task -def fun_addtwo(a: int) -> int: +@python.define +def FunAddTwo(a: int) -> int: import time time.sleep(1) @@ -71,8 +96,8 @@ def fun_addtwo(a: int) -> int: return a + 2 -@mark.task -def fun_addtwo_notype(a): +@python.define +def FunAddTwoNoType(a): import time time.sleep(1) @@ -81,8 +106,8 @@ def fun_addtwo_notype(a): return a + 2 -@mark.task -def fun_addtwo_with_threadcount(a: int, sgeThreads: int = 1) -> int: +@python.define +def FunAddTwoWithThreadCount(a: int, sgeThreads: int = 1) -> int: import time time.sleep(1) @@ -91,158 +116,180 @@ def fun_addtwo_with_threadcount(a: int, sgeThreads: int = 1) -> int: return a + 2 -@mark.task -def fun_addvar( - a: ty.Union[int, float], b: ty.Union[int, float] -) -> ty.Union[int, float]: +@python.define +def FunAddVar(a: ty.Union[int, float], b: ty.Union[int, float]) -> ty.Union[int, float]: return a + b -@mark.task -def fun_addvar_notype(a, b): +@python.define +def FunAddVarNoType(a, b): return a + b -@mark.task -@mark.annotate({"return": {"sum": float, "sub": float}}) -def fun_addsubvar(a: float, b: float): +@python.define(outputs={"sum": float, "sub": float}) +def FunAddSubVar(a: float, b: float): return a + b, a - b -@mark.task -def fun_addvar_none(a: int, b: ty.Optional[int]) -> int: +@python.define +def FunAddVarNone(a: int, b: ty.Optional[int]) -> int: if b is None: return a else: return a + b -@mark.task -def fun_addvar_default(a: int, b: int = 1) -> int: +@python.define +def FunAddVarDefault(a: int, b: int = 1) -> int: return a + b -@mark.task -def fun_addvar_default_notype(a, b=1): +@python.define +def FunAddVarDefaultNoType(a, b=1): return a + b -@mark.task -def fun_addvar3(a: int, b: int, c: int) -> int: +@python.define +def FunAddVar3(a: int, b: int, c: int) -> int: return a + b + c -@mark.task -def fun_addvar4(a: int, b: int, c: int, d: int) -> int: +@python.define +def FunAddVar4(a: int, b: int, c: int, d: int) 
-> int: return a + b + c + d -@mark.task -def moment(lst: ty.List[float], n: float) -> float: +@python.define +def Moment(lst: ty.List[float], n: float) -> float: return sum([i**n for i in lst]) / len(lst) -@mark.task -def fun_div(a: ty.Union[int, float], b: ty.Union[int, float]) -> float: +@python.define +def FunDiv(a: ty.Union[int, float], b: ty.Union[int, float]) -> float: return a / b -@mark.task -def multiply(x: int, y: int) -> int: +@python.define +def Multiply(x: int, y: int) -> int: return x * y -@mark.task -def multiply_list(x: list, y: int) -> list: +@python.define +def Divide(x: int, y: int) -> int: + return x // y + + +@python.define +def MultiplyList(x: list, y: int) -> list: return x * y -@mark.task -def multiply_mixed(x: list, y: int) -> list: +@python.define +def MultiplyMixed(x: list, y: int) -> list: return x * y -@mark.task -def add2(x: int) -> int: +@python.define +def Add2(x: int) -> int: if x == 1 or x == 12: time.sleep(1) return x + 2 -@mark.task -def raise_xeq1(x: int) -> int: +@python.define +def FileOrIntIdentity(in_file: ty.Union[File, int]) -> File: + return in_file + + +@python.define +def FileAndIntIdentity(in_file: File, in_int: int) -> File: + return in_file, in_int + + +@python.define +def ListOfListOfFileOrIntIdentity( + in_file: ty.List[ty.List[ty.Union[int, File]]], +) -> ty.List[ty.List[ty.Union[int, File]]]: + return in_file + + +@python.define +def ListOfDictOfFileOrIntIdentity( + in_file: ty.List[ty.Dict[ty.Any, ty.Union[File, int]]], +) -> ty.List[ty.Dict[ty.Any, ty.Union[File, int]]]: + return in_file + + +@python.define +def RaiseXeq1(x: int) -> int: if x == 1: raise Exception("x is 1, so i'm raising an exception!") return x -@mark.task -@mark.annotate({"return": {"out_add": float, "out_sub": float}}) -def add2_sub2_res(res): +@python.define(outputs={"out_add": float, "out_sub": float}) +def Add2Sub2Res(res): """function that takes entire output as an input""" return res["out"] + 2, res["out"] - 2 -@mark.task -@mark.annotate({"return": {"out_add": ty.List[float], "out_sub": ty.List[float]}}) -def add2_sub2_res_list(res): +@python.define(outputs={"out_add": ty.List[float], "out_sub": ty.List[float]}) +def Add2Sub2ResList(res): """function that takes entire output as an input""" return [r["out"] + 2 for r in res], [r["out"] - 2 for r in res] -@mark.task -def power(a: int, b: int) -> int: +@python.define +def Power(a: int, b: int) -> int: return a**b -@mark.task -def identity(x): +@python.define +def Identity(x): return x -@mark.task -def identity_2flds( - x1, x2 -) -> ty.NamedTuple("Output", [("out1", ty.Any), ("out2", ty.Any)]): +@python.define(outputs={"out1": ty.Any, "out2": ty.Any}) +def Identity2Flds(x1, x2): return x1, x2 -@mark.task -def ten(x) -> int: +@python.define +def Ten(x) -> int: return 10 -@mark.task -def add2_wait(x: int) -> int: +@python.define +def Add2Wait(x: int) -> int: time.sleep(2) return x + 2 -@mark.task -def list_output(x: int) -> ty.List[int]: +@python.define +def ListOutput(x: int) -> ty.List[int]: return [x, 2 * x, 3 * x] -@mark.task -def list_sum(x: ty.Sequence[ty.Union[int, float]]) -> ty.Union[int, float]: +@python.define +def ListSum(x: ty.Sequence[ty.Union[int, float]]) -> ty.Union[int, float]: return sum(x) -@mark.task -def fun_dict(d: dict) -> str: +@python.define +def FunDict(d: dict) -> str: kv_list = [f"{k}:{v}" for (k, v) in d.items()] return "_".join(kv_list) -@mark.task -def fun_write_file(filename: Path, text="hello") -> File: +@python.define +def FunWriteFile(filename: Path, text="hello") -> File: with 
open(filename, "w") as f: f.write(text) return File(filename) -@mark.task -def fun_write_file_list( +@python.define +def FunWriteFileList( filename_list: ty.List[ty.Union[str, File, Path]], text="hi" ) -> ty.List[File]: for ii, filename in enumerate(filename_list): @@ -252,8 +299,8 @@ def fun_write_file_list( return filename_list -@mark.task -def fun_write_file_list2dict( +@python.define +def FunWriteFileList2Dict( filename_list: ty.List[ty.Union[str, File, Path]], text="hi" ) -> ty.Dict[str, ty.Union[File, int]]: filename_dict = {} @@ -266,15 +313,15 @@ def fun_write_file_list2dict( return filename_dict -@mark.task -def fun_file(filename: File): +@python.define +def FunFile(filename: File): with open(filename) as f: txt = f.read() return txt -@mark.task -def fun_file_list(filename_list: ty.List[File]): +@python.define +def FunFileList(filename_list: ty.List[File]): txt_list = [] for filename in filename_list: with open(filename) as f: @@ -282,75 +329,36 @@ def fun_file_list(filename_list: ty.List[File]): return " ".join(txt_list) -def gen_basic_wf(name="basic-wf"): - """ - Generates `Workflow` of two tasks +@workflow.define(outputs=["out"]) +def BasicWorkflow(x): + task1 = workflow.add(FunAddTwo(a=x), name="A") + task2 = workflow.add(FunAddVar(a=task1.out, b=2), name="B") + return task2.out - Task Input - ---------- - x : int (5) - Task Output - ----------- - out : int (9) - """ - wf = Workflow(name=name, input_spec=["x"]) - wf.inputs.x = 5 - wf.add(fun_addtwo(name="task1", a=wf.lzin.x, b=0)) - wf.add(fun_addvar(name="task2", a=wf.task1.lzout.out, b=2)) - wf.set_output([("out", wf.task2.lzout.out)]) - return wf +@workflow.define(outputs=["out"]) +def BasicWorkflowWithThreadCount(x): + task1 = workflow.add(FunAddTwoWithThreadCount(a=x, sgeThreads=4)) + task2 = workflow.add(FunAddVar(a=task1.out, b=2)) + return task2.out -def gen_basic_wf_with_threadcount(name="basic-wf-with-threadcount"): - """ - Generates `Workflow` of two tasks +@workflow.define(outputs=["out1", "out2"]) +def BasicWorkflowWithThreadCountConcurrent(x): + task1_1 = workflow.add(FunAddTwoWithThreadCount(a=x, sgeThreads=4)) + task1_2 = workflow.add(FunAddTwoWithThreadCount(a=x, sgeThreads=2)) + task2 = workflow.add(FunAddVar(a=task1_1.out, b=2)) + return task2.out, task1_2.out - Task Input - ---------- - x : int (5) + # return Workflow(x=5) - Task Output - ----------- - out : int (9) - """ - wf = Workflow(name=name, input_spec=["x"]) - wf.inputs.x = 5 - wf.add(fun_addtwo_with_threadcount(name="task1", a=wf.lzin.x, sgeThreads=4)) - wf.add(fun_addvar(name="task2", a=wf.task1.lzout.out, b=2)) - wf.set_output([("out", wf.task2.lzout.out)]) - return wf - -def gen_basic_wf_with_threadcount_concurrent(name="basic-wf-with-threadcount"): - """ - Generates `Workflow` of two tasks - - Task Input - ---------- - x : int (5) - - Task Output - ----------- - out : int (9) - """ - wf = Workflow(name=name, input_spec=["x"]) - wf.inputs.x = 5 - wf.add(fun_addtwo_with_threadcount(name="task1_1", a=wf.lzin.x, sgeThreads=4)) - wf.add(fun_addtwo_with_threadcount(name="task1_2", a=wf.lzin.x, sgeThreads=2)) - wf.add(fun_addvar(name="task2", a=wf.task1_1.lzout.out, b=2)) - wf.set_output([("out1", wf.task2.lzout.out), ("out2", wf.task1_2.lzout.out)]) - return wf - - -@mark.task -@mark.annotate({"return": {"sum": int, "products": ty.List[int]}}) -def list_mult_sum(scalar: int, in_list: ty.List[int]) -> ty.Tuple[int, ty.List[int]]: +@python.define(outputs={"sum": int, "products": ty.List[int]}) +def ListMultSum(scalar: int, in_list: ty.List[int]) -> 
ty.Tuple[int, ty.List[int]]: products = [scalar * x for x in in_list] return functools.reduce(operator.add, products, 0), products -@mark.task -@mark.annotate({"return": {"x": str, "y": int, "z": float}}) -def foo(a: str, b: int, c: float) -> ty.Tuple[str, int, float]: +@python.define(outputs={"x": str, "y": int, "z": float}) +def Foo(a: str, b: int, c: float) -> ty.Tuple[str, int, float]: return a, b, c diff --git a/pydra/engine/workers.py b/pydra/engine/workers.py deleted file mode 100644 index eaa40beb0a..0000000000 --- a/pydra/engine/workers.py +++ /dev/null @@ -1,1067 +0,0 @@ -"""Execution workers.""" - -import asyncio -import sys -import json -import re -from tempfile import gettempdir -from pathlib import Path -from shutil import copyfile, which - -import concurrent.futures as cf - -from .core import TaskBase -from .helpers import ( - get_available_cpus, - read_and_display_async, - save, - load_and_run, - load_task, -) - -import logging - -import random - -logger = logging.getLogger("pydra.worker") - - -class Worker: - """A base class for execution of tasks.""" - - def __init__(self, loop=None): - """Initialize the worker.""" - logger.debug(f"Initializing {self.__class__.__name__}") - self.loop = loop - - def run_el(self, interface, **kwargs): - """Return coroutine for task execution.""" - raise NotImplementedError - - def close(self): - """Close this worker.""" - - async def fetch_finished(self, futures): - """ - Awaits asyncio's :class:`asyncio.Task` until one is finished. - - Parameters - ---------- - futures : set of asyncio awaitables - Task execution coroutines or asyncio :class:`asyncio.Task` - - Returns - ------- - pending : set - Pending asyncio :class:`asyncio.Task`. - - """ - done = set() - try: - done, pending = await asyncio.wait( - [ - asyncio.create_task(f) if not isinstance(f, asyncio.Task) else f - for f in futures - ], - return_when=asyncio.FIRST_COMPLETED, - ) - except ValueError: - # nothing pending! - pending = set() - logger.debug(f"Tasks finished: {len(done)}") - return pending - - -class DistributedWorker(Worker): - """Base Worker for distributed execution.""" - - def __init__(self, loop=None, max_jobs=None): - """Initialize the worker.""" - super().__init__(loop=loop) - self.max_jobs = max_jobs - """Maximum number of concurrently running jobs.""" - self._jobs = 0 - - async def fetch_finished(self, futures): - """ - Awaits asyncio's :class:`asyncio.Task` until one is finished. - - Limits number of submissions based on - py:attr:`DistributedWorker.max_jobs`. - - Parameters - ---------- - futures : set of asyncio awaitables - Task execution coroutines or asyncio :class:`asyncio.Task` - - Returns - ------- - pending : set - Pending asyncio :class:`asyncio.Task`. - - """ - done, unqueued = set(), set() - job_slots = self.max_jobs - self._jobs if self.max_jobs else float("inf") - if len(futures) > job_slots: - # convert to list to simplify indexing - logger.warning(f"Reducing queued jobs due to max jobs ({self.max_jobs})") - futures = list(futures) - futures, unqueued = set(futures[:job_slots]), set(futures[job_slots:]) - try: - self._jobs += len(futures) - done, pending = await asyncio.wait( - [ - asyncio.create_task(f) if not isinstance(f, asyncio.Task) else f - for f in futures - ], - return_when=asyncio.FIRST_COMPLETED, - ) - except ValueError: - # nothing pending! 
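The removed `Worker.fetch_finished` / `DistributedWorker.fetch_finished` coroutines above both rely on the same asyncio idiom: wrap plain awaitables in `asyncio.Task` objects and block until the first one completes, handing the still-pending set back to the caller. A standalone sketch of that pattern (not pydra code, names are illustrative):

```python
# Minimal illustration of the FIRST_COMPLETED waiting pattern used by the
# removed workers: plain coroutines are wrapped in asyncio Tasks, and the
# still-pending set is returned so the caller can keep draining it.
import asyncio


async def fetch_finished(futures):
    tasks = [f if isinstance(f, asyncio.Task) else asyncio.create_task(f) for f in futures]
    done, pending = await asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED)
    print(f"finished: {len(done)}, still pending: {len(pending)}")
    return pending


async def sleeper(delay):
    await asyncio.sleep(delay)
    return delay


async def main():
    pending = {sleeper(0.1), sleeper(0.5), sleeper(1.0)}
    while pending:  # drain the queue one completion at a time
        pending = await fetch_finished(pending)


asyncio.run(main())
```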
- pending = set() - self._jobs -= len(done) - logger.debug(f"Tasks finished: {len(done)}") - # ensure pending + unqueued tasks persist - return pending.union(unqueued) - - -class SerialWorker(Worker): - """A worker to execute linearly.""" - - plugin_name = "serial" - - def __init__(self, **kwargs): - """Initialize worker.""" - logger.debug("Initialize SerialWorker") - - def run_el(self, interface, rerun=False, environment=None, **kwargs): - """Run a task.""" - return self.exec_serial(interface, rerun=rerun, environment=environment) - - def close(self): - """Return whether the task is finished.""" - - async def exec_serial(self, runnable, rerun=False, environment=None): - if isinstance(runnable, TaskBase): - return runnable._run(rerun, environment=environment) - else: # it could be tuple that includes pickle files with tasks and inputs - ind, task_main_pkl, _ = runnable - return load_and_run(task_main_pkl, ind, rerun, environment=environment) - - async def fetch_finished(self, futures): - await asyncio.gather(*futures) - return set() - - # async def fetch_finished(self, futures): - # return await asyncio.wait(futures) - - -class ConcurrentFuturesWorker(Worker): - """A worker to execute in parallel using Python's concurrent futures.""" - - plugin_name = "cf" - - def __init__(self, n_procs=None): - """Initialize Worker.""" - super().__init__() - self.n_procs = get_available_cpus() if n_procs is None else n_procs - # added cpu_count to verify, remove once confident and let PPE handle - self.pool = cf.ProcessPoolExecutor(self.n_procs) - # self.loop = asyncio.get_event_loop() - logger.debug("Initialize ConcurrentFuture") - - def run_el(self, runnable, rerun=False, environment=None, **kwargs): - """Run a task.""" - assert self.loop, "No event loop available to submit tasks" - return self.exec_as_coro(runnable, rerun=rerun, environment=environment) - - async def exec_as_coro(self, runnable, rerun=False, environment=None): - """Run a task (coroutine wrapper).""" - if isinstance(runnable, TaskBase): - res = await self.loop.run_in_executor( - self.pool, runnable._run, rerun, environment - ) - else: # it could be tuple that includes pickle files with tasks and inputs - ind, task_main_pkl, task_orig = runnable - res = await self.loop.run_in_executor( - self.pool, load_and_run, task_main_pkl, ind, rerun, environment - ) - return res - - def close(self): - """Finalize the internal pool of tasks.""" - self.pool.shutdown() - - -class SlurmWorker(DistributedWorker): - """A worker to execute tasks on SLURM systems.""" - - plugin_name = "slurm" - _cmd = "sbatch" - _sacct_re = re.compile( - "(?P\\d*) +(?P\\w*)\\+? +" "(?P\\d+):\\d+" - ) - - def __init__(self, loop=None, max_jobs=None, poll_delay=1, sbatch_args=None): - """ - Initialize SLURM Worker. 
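The `ConcurrentFuturesWorker` being deleted above boils down to one mechanism: an event loop offloads the (picklable) task callable to a `ProcessPoolExecutor` via `loop.run_in_executor` and awaits the result. A self-contained sketch of just that mechanism, with an illustrative stand-in for the pickled task:

```python
# Standalone sketch (not pydra code) of the mechanism behind the removed
# ConcurrentFuturesWorker: a coroutine hands CPU-bound callables to a
# ProcessPoolExecutor via loop.run_in_executor and awaits the results.
import asyncio
import concurrent.futures as cf
import os


def add_two(a: int) -> int:  # stand-in for a pickled pydra task
    return a + 2


async def run_all(values, n_procs=None):
    n_procs = n_procs or os.cpu_count()
    loop = asyncio.get_running_loop()
    with cf.ProcessPoolExecutor(n_procs) as pool:
        coros = [loop.run_in_executor(pool, add_two, v) for v in values]
        return await asyncio.gather(*coros)


if __name__ == "__main__":  # guard needed because the pool may spawn fresh interpreters
    print(asyncio.run(run_all([1, 2, 3])))  # -> [3, 4, 5]
```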
- - Parameters - ---------- - poll_delay : seconds - Delay between polls to slurmd - sbatch_args : str - Additional sbatch arguments - max_jobs : int - Maximum number of submitted jobs - - """ - super().__init__(loop=loop, max_jobs=max_jobs) - if not poll_delay or poll_delay < 0: - poll_delay = 0 - self.poll_delay = poll_delay - self.sbatch_args = sbatch_args or "" - self.error = {} - - def run_el(self, runnable, rerun=False, environment=None): - """Worker submission API.""" - script_dir, batch_script = self._prepare_runscripts(runnable, rerun=rerun) - if (script_dir / script_dir.parts[1]) == gettempdir(): - logger.warning("Temporary directories may not be shared across computers") - if isinstance(runnable, TaskBase): - cache_dir = runnable.cache_dir - name = runnable.name - uid = runnable.uid - else: # runnable is a tuple (ind, pkl file, task) - cache_dir = runnable[-1].cache_dir - name = runnable[-1].name - uid = f"{runnable[-1].uid}_{runnable[0]}" - - return self._submit_job(batch_script, name=name, uid=uid, cache_dir=cache_dir) - - def _prepare_runscripts(self, task, interpreter="/bin/sh", rerun=False): - if isinstance(task, TaskBase): - cache_dir = task.cache_dir - ind = None - uid = task.uid - else: - ind = task[0] - cache_dir = task[-1].cache_dir - uid = f"{task[-1].uid}_{ind}" - - script_dir = cache_dir / f"{self.__class__.__name__}_scripts" / uid - script_dir.mkdir(parents=True, exist_ok=True) - if ind is None: - if not (script_dir / "_task.pkl").exists(): - save(script_dir, task=task) - else: - copyfile(task[1], script_dir / "_task.pklz") - - task_pkl = script_dir / "_task.pklz" - if not task_pkl.exists() or not task_pkl.stat().st_size: - raise Exception("Missing or empty task!") - - batchscript = script_dir / f"batchscript_{uid}.sh" - python_string = ( - f"""'from pydra.engine.helpers import load_and_run; """ - f"""load_and_run(task_pkl="{task_pkl}", ind={ind}, rerun={rerun}) '""" - ) - bcmd = "\n".join( - ( - f"#!{interpreter}", - f"#SBATCH --output={script_dir / 'slurm-%j.out'}", - f"{sys.executable} -c " + python_string, - ) - ) - with batchscript.open("wt") as fp: - fp.writelines(bcmd) - return script_dir, batchscript - - async def _submit_job(self, batchscript, name, uid, cache_dir): - """Coroutine that submits task runscript and polls job until completion or error.""" - script_dir = cache_dir / f"{self.__class__.__name__}_scripts" / uid - sargs = self.sbatch_args.split() - jobname = re.search(r"(?<=-J )\S+|(?<=--job-name=)\S+", self.sbatch_args) - if not jobname: - jobname = ".".join((name, uid)) - sargs.append(f"--job-name={jobname}") - output = re.search(r"(?<=-o )\S+|(?<=--output=)\S+", self.sbatch_args) - if not output: - output_file = str(script_dir / "slurm-%j.out") - sargs.append(f"--output={output_file}") - error = re.search(r"(?<=-e )\S+|(?<=--error=)\S+", self.sbatch_args) - if not error: - error_file = str(script_dir / "slurm-%j.err") - sargs.append(f"--error={error_file}") - else: - error_file = None - sargs.append(str(batchscript)) - # TO CONSIDER: add random sleep to avoid overloading calls - rc, stdout, stderr = await read_and_display_async( - "sbatch", *sargs, hide_display=True - ) - jobid = re.search(r"\d+", stdout) - if rc: - raise RuntimeError(f"Error returned from sbatch: {stderr}") - elif not jobid: - raise RuntimeError("Could not extract job ID") - jobid = jobid.group() - if error_file: - error_file = error_file.replace("%j", jobid) - self.error[jobid] = error_file.replace("%j", jobid) - # intermittent polling - while True: - # 3 possibilities - # 
False: job is still pending/working - # True: job is complete - # Exception: Polling / job failure - done = await self._poll_job(jobid) - if done: - if ( - done in ["CANCELLED", "TIMEOUT", "PREEMPTED"] - and "--no-requeue" not in self.sbatch_args - ): - # loading info about task with a specific uid - info_file = cache_dir / f"{uid}_info.json" - if info_file.exists(): - checksum = json.loads(info_file.read_text())["checksum"] - if (cache_dir / f"{checksum}.lock").exists(): - # for pyt3.8 we could you missing_ok=True - (cache_dir / f"{checksum}.lock").unlink() - cmd_re = ("scontrol", "requeue", jobid) - await read_and_display_async(*cmd_re, hide_display=True) - else: - return True - await asyncio.sleep(self.poll_delay) - - async def _poll_job(self, jobid): - cmd = ("squeue", "-h", "-j", jobid) - logger.debug(f"Polling job {jobid}") - rc, stdout, stderr = await read_and_display_async(*cmd, hide_display=True) - if not stdout or "slurm_load_jobs error" in stderr: - # job is no longer running - check exit code - status = await self._verify_exit_code(jobid) - return status - return False - - async def _verify_exit_code(self, jobid): - cmd = ("sacct", "-n", "-X", "-j", jobid, "-o", "JobID,State,ExitCode") - _, stdout, _ = await read_and_display_async(*cmd, hide_display=True) - if not stdout: - raise RuntimeError("Job information not found") - m = self._sacct_re.search(stdout) - error_file = self.error[jobid] - if int(m.group("exit_code")) != 0 or m.group("status") != "COMPLETED": - if m.group("status") in ["CANCELLED", "TIMEOUT", "PREEMPTED"]: - return m.group("status") - elif m.group("status") in ["RUNNING", "PENDING"]: - return False - # TODO: potential for requeuing - # parsing the error message - error_line = Path(error_file).read_text().split("\n")[-2] - if "Exception" in error_line: - error_message = error_line.replace("Exception: ", "") - elif "Error" in error_line: - error_message = error_line.replace("Exception: ", "") - else: - error_message = "Job failed (unknown reason - TODO)" - raise Exception(error_message) - return True - - -class SGEWorker(DistributedWorker): - """A worker to execute tasks on SLURM systems.""" - - plugin_name = "sge" - - _cmd = "qsub" - _sacct_re = re.compile( - "(?P\\d*) +(?P\\w*)\\+? +" "(?P\\d+):\\d+" - ) - - def __init__( - self, - loop=None, - max_jobs=None, - poll_delay=1, - qsub_args=None, - write_output_files=True, - max_job_array_length=50, - indirect_submit_host=None, - max_threads=None, - poll_for_result_file=True, - default_threads_per_task=1, - polls_before_checking_evicted=60, - collect_jobs_delay=30, - default_qsub_args="", - max_mem_free=None, - ): - """ - Initialize SGE Worker. 
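The SLURM exit-code check above parses `sacct` output with the `_sacct_re` pattern. Note that the named groups in that regex appear to have been stripped in this rendering of the diff; the sketch below restores plausible names inferred from the later `m.group("status")` / `m.group("exit_code")` calls, and shows the parsing in isolation:

```python
# Standalone sketch of the sacct parsing used by the removed SlurmWorker.
# Group names are inferred (they are referenced as "status" and "exit_code"
# in _verify_exit_code above); "jobid" is an assumption for the first group.
import re

SACCT_RE = re.compile(r"(?P<jobid>\d*) +(?P<status>\w*)\+? +(?P<exit_code>\d+):\d+")

# Example line from `sacct -n -X -j <id> -o JobID,State,ExitCode`
line = "1234567      COMPLETED      0:0"

m = SACCT_RE.search(line)
assert m is not None
if int(m.group("exit_code")) != 0 or m.group("status") != "COMPLETED":
    raise RuntimeError(f"job {m.group('jobid')} failed with status {m.group('status')}")
print("job", m.group("jobid"), "completed cleanly")
```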
- - Parameters - ---------- - poll_delay : seconds - Delay between polls to slurmd - qsub_args : str - Additional qsub arguments - max_jobs : int - Maximum number of submitted jobs - write_output_files : bool - Turns on/off writing to output files for individual tasks - max_job_array_length : int - Number of jobs an SGE job array can hold - indirect_submit_host : str - Name of a submit node in the SGE cluster through which to run SGE qsub commands - max_threads : int - Maximum number of threads that will be scheduled for SGE submission at once - poll_for_result_file : bool - If true, a task is complete when its _result.pklz file exists - If false, a task is complete when its job array is indicated complete by qstat/qacct polling - default_threads_per_task : int - Sets the number of slots SGE should request for a task if sgeThreads - is not a field in the task input_spec - polls_before_checking_evicted : int - Number of poll_delays before running qacct to check if a task has been evicted by SGE - collect_jobs_delay : int - Number of seconds to wait for the list of jobs for a job array to fill - - """ - super().__init__(loop=loop, max_jobs=max_jobs) - if not poll_delay or poll_delay < 0: - poll_delay = 0 - self.poll_delay = poll_delay - self.qsub_args = qsub_args or "" - self.error = {} - self.write_output_files = ( - write_output_files # set to False to avoid OSError: Too many open files - ) - self.tasks_to_run_by_threads_requested = {} - self.output_by_jobid = {} - self.jobid_by_task_uid = {} - self.max_job_array_length = max_job_array_length - self.threads_used = 0 - self.job_completed_by_jobid = {} - self.indirect_submit_host = indirect_submit_host - self.max_threads = max_threads - self.default_threads_per_task = default_threads_per_task - self.poll_for_result_file = poll_for_result_file - self.polls_before_checking_evicted = polls_before_checking_evicted - self.result_files_by_jobid = {} - self.collect_jobs_delay = collect_jobs_delay - self.task_pkls_rerun = {} - self.default_qsub_args = default_qsub_args - self.max_mem_free = max_mem_free - - def run_el(self, runnable, rerun=False): # TODO: add env - """Worker submission API.""" - ( - script_dir, - batch_script, - task_pkl, - ind, - output_dir, - task_qsub_args, - ) = self._prepare_runscripts(runnable, rerun=rerun) - if (script_dir / script_dir.parts[1]) == gettempdir(): - logger.warning("Temporary directories may not be shared across computers") - if isinstance(runnable, TaskBase): - cache_dir = runnable.cache_dir - name = runnable.name - uid = runnable.uid - else: # runnable is a tuple (ind, pkl file, task) - cache_dir = runnable[-1].cache_dir - name = runnable[-1].name - uid = f"{runnable[-1].uid}_{runnable[0]}" - - return self._submit_job( - batch_script, - name=name, - uid=uid, - cache_dir=cache_dir, - task_pkl=task_pkl, - ind=ind, - output_dir=output_dir, - task_qsub_args=task_qsub_args, - ) - - def _prepare_runscripts(self, task, interpreter="/bin/sh", rerun=False): - if isinstance(task, TaskBase): - cache_dir = task.cache_dir - ind = None - uid = task.uid - try: - task_qsub_args = task.qsub_args - except Exception: - task_qsub_args = self.default_qsub_args - else: - ind = task[0] - cache_dir = task[-1].cache_dir - uid = f"{task[-1].uid}_{ind}" - try: - task_qsub_args = task[-1].qsub_args - except Exception: - task_qsub_args = self.default_qsub_args - - script_dir = cache_dir / f"{self.__class__.__name__}_scripts" / uid - script_dir.mkdir(parents=True, exist_ok=True) - if ind is None: - if not (script_dir / 
"_task.pkl").exists(): - save(script_dir, task=task) - else: - copyfile(task[1], script_dir / "_task.pklz") - - task_pkl = script_dir / "_task.pklz" - if not task_pkl.exists() or not task_pkl.stat().st_size: - raise Exception("Missing or empty task!") - - batchscript = script_dir / f"batchscript_{uid}.job" - - if task_qsub_args not in self.tasks_to_run_by_threads_requested: - self.tasks_to_run_by_threads_requested[task_qsub_args] = [] - self.tasks_to_run_by_threads_requested[task_qsub_args].append( - (str(task_pkl), ind, rerun) - ) - - return ( - script_dir, - batchscript, - task_pkl, - ind, - task.output_dir, - task_qsub_args, - ) - - async def get_tasks_to_run(self, task_qsub_args, mem_free): - # Extract the first N tasks to run - if mem_free is not None and self.max_mem_free is not None: - max_job_array_length = min( - self.max_job_array_length, int(self.max_mem_free / mem_free) - ) - else: - max_job_array_length = self.max_job_array_length - tasks_to_run_copy, self.tasks_to_run_by_threads_requested[task_qsub_args] = ( - self.tasks_to_run_by_threads_requested[task_qsub_args][ - :max_job_array_length - ], - self.tasks_to_run_by_threads_requested[task_qsub_args][ - max_job_array_length: - ], - ) - return tasks_to_run_copy - - async def check_for_results_files(self, jobid, threads_requested): - for task in list(self.result_files_by_jobid[jobid]): - if self.result_files_by_jobid[jobid][task].exists(): - del self.result_files_by_jobid[jobid][task] - self.threads_used -= threads_requested - - async def _submit_jobs( - self, - batchscript, - name, - uid, - cache_dir, - output_dir, - task_qsub_args, - interpreter="/bin/sh", - ): - # Get the number of slots requested for this task - threads_requested = self.default_threads_per_task - if "smp" in task_qsub_args: - smp_index = task_qsub_args.split().index("smp") - if ( - smp_index + 1 < len(task_qsub_args.split()) - and task_qsub_args.split()[smp_index + 1].isdigit() - ): - threads_requested = int(task_qsub_args.split()[smp_index + 1]) - # Get the amount of mem_free requested for the job - mem_free = None - if "mem_free" in task_qsub_args: - mem_free_cmd = [ - word for word in task_qsub_args.split() if word.startswith("mem_free") - ][0] - if len(re.findall(r"\d+", mem_free_cmd)) > 0: - mem_free = int(re.findall(r"\d+", mem_free_cmd)[0]) - - if ( - len(self.tasks_to_run_by_threads_requested.get(task_qsub_args)) - <= self.max_job_array_length - ): - await asyncio.sleep(self.collect_jobs_delay) - tasks_to_run = await self.get_tasks_to_run(task_qsub_args, mem_free) - - if mem_free is not None: - summed_mem_free_cmd = re.sub( - str(mem_free), str(len(tasks_to_run) * mem_free), mem_free_cmd - ) - task_qsub_args = re.sub(mem_free_cmd, summed_mem_free_cmd, task_qsub_args) - - if len(tasks_to_run) > 0: - if self.max_threads is not None: - while self.threads_used > self.max_threads - threads_requested * len( - tasks_to_run - ): - await asyncio.sleep(self.poll_delay) - self.threads_used += threads_requested * len(tasks_to_run) - - python_string = f"""import sys; from pydra.engine.helpers import load_and_run; \ - task_pkls={[task_tuple for task_tuple in tasks_to_run]}; \ - task_index=int(sys.argv[1])-1; \ - load_and_run(task_pkl=task_pkls[task_index][0], \ - ind=task_pkls[task_index][1], rerun=task_pkls[task_index][2])""" - bcmd_job = "\n".join( - ( - f"#!{interpreter}", - f"{sys.executable} {Path(batchscript).with_suffix('.py')}" - + " $SGE_TASK_ID", - ) - ) - - bcmd_py = python_string - - # Better runtime when the python contents are written to file - # 
rather than given by cmdline arg -c - with Path(batchscript).with_suffix(".py").open("wt") as fp: - fp.write(bcmd_py) - - with batchscript.open("wt") as fp: - fp.writelines(bcmd_job) - - script_dir = cache_dir / f"{self.__class__.__name__}_scripts" / uid - script_dir.mkdir(parents=True, exist_ok=True) - sargs = ["-t"] - sargs.append(f"1-{len(tasks_to_run)}") - sargs = sargs + task_qsub_args.split() - - jobname = re.search(r"(?<=-N )\S+", task_qsub_args) - - if not jobname: - jobname = ".".join((name, uid)) - sargs.append("-N") - sargs.append(jobname) - output = re.search(r"(?<=-o )\S+", self.qsub_args) - - if not output: - output_file = str(script_dir / "sge-%j.out") - if self.write_output_files: - sargs.append("-o") - sargs.append(output_file) - error = re.search(r"(?<=-e )\S+", self.qsub_args) - if not error: - error_file = str(script_dir / "sge-%j.out") - if self.write_output_files: - sargs.append("-e") - sargs.append(error_file) - else: - error_file = None - sargs.append(str(batchscript)) - - await asyncio.sleep(random.uniform(0, 5)) - - jobid = await self.submit_array_job(sargs, tasks_to_run, error_file) - - if self.poll_for_result_file: - self.result_files_by_jobid[jobid] = {} - for task_pkl, ind, rerun in tasks_to_run: - task = load_task(task_pkl=task_pkl, ind=ind) - self.result_files_by_jobid[jobid][task] = ( - task.output_dir / "_result.pklz" - ) - - poll_counter = 0 - while True: - # 3 possibilities - # False: job is still pending/working - # True: job is complete - # Exception: Polling / job failure - # done = await self._poll_job(jobid) - if self.poll_for_result_file: - if len(self.result_files_by_jobid[jobid]) > 0: - for task in list(self.result_files_by_jobid[jobid]): - if self.result_files_by_jobid[jobid][task].exists(): - del self.result_files_by_jobid[jobid][task] - self.threads_used -= threads_requested - - else: - exit_status = await self._verify_exit_code(jobid) - if exit_status == "ERRORED": - jobid = await self._rerun_job_array( - cache_dir, uid, sargs, tasks_to_run, error_file, jobid - ) - else: - for task_pkl, ind, rerun in tasks_to_run: - if task_pkl in self.task_pkls_rerun: - del self.task_pkls_rerun[task_pkl] - return True - - if poll_counter >= self.polls_before_checking_evicted: - # Checking for evicted for jobid - exit_status = await self._verify_exit_code(jobid) - if exit_status == "ERRORED": - jobid = await self._rerun_job_array( - cache_dir, uid, sargs, tasks_to_run, error_file, jobid - ) - poll_counter = 0 - poll_counter += 1 - await asyncio.sleep(self.poll_delay) - else: - done = await self._poll_job(jobid, cache_dir) - if done: - if done == "ERRORED": # If the SGE job was evicted, rerun it - jobid = await self._rerun_job_array( - cache_dir, uid, sargs, tasks_to_run, error_file, jobid - ) - else: - self.job_completed_by_jobid[jobid] = True - self.threads_used -= threads_requested * len(tasks_to_run) - return True - # Don't poll exactly on the same interval to avoid overloading SGE - await asyncio.sleep( - random.uniform(max(0, self.poll_delay - 2), self.poll_delay + 2) - ) - - async def _rerun_job_array( - self, cache_dir, uid, sargs, tasks_to_run, error_file, evicted_jobid - ): - for task_pkl, ind, rerun in tasks_to_run: - sge_task = load_task(task_pkl=task_pkl, ind=ind) - application_task_pkl = sge_task.output_dir / "_task.pklz" - if ( - not application_task_pkl.exists() - or load_task(task_pkl=application_task_pkl).result() is None - or load_task(task_pkl=application_task_pkl).result().errored - ): - self.task_pkls_rerun[task_pkl] = None - info_file = 
cache_dir / f"{sge_task.uid}_info.json" - if info_file.exists(): - checksum = json.loads(info_file.read_text())["checksum"] - if (cache_dir / f"{checksum}.lock").exists(): - # for pyt3.8 we could use missing_ok=True - (cache_dir / f"{checksum}.lock").unlink() - # Maybe wait a little to check if _error.pklz exists - not getting found immediately - - # If the previous job array failed, run the array's script again and get the new jobid - jobid = await self.submit_array_job(sargs, tasks_to_run, error_file) - self.result_files_by_jobid[jobid] = self.result_files_by_jobid[evicted_jobid] - return jobid - - async def submit_array_job(self, sargs, tasks_to_run, error_file): - if self.indirect_submit_host is not None: - indirect_submit_host_prefix = [] - indirect_submit_host_prefix.append("ssh") - indirect_submit_host_prefix.append(self.indirect_submit_host) - indirect_submit_host_prefix.append('""export SGE_ROOT=/opt/sge;') - rc, stdout, stderr = await read_and_display_async( - *indirect_submit_host_prefix, - str(Path(which("qsub")).parent / "qsub"), - *sargs, - '""', - hide_display=True, - ) - else: - rc, stdout, stderr = await read_and_display_async( - "qsub", *sargs, hide_display=True - ) - jobid = re.search(r"\d+", stdout) - if rc: - raise RuntimeError(f"Error returned from qsub: {stderr}") - elif not jobid: - raise RuntimeError("Could not extract job ID") - jobid = jobid.group() - self.output_by_jobid[jobid] = (rc, stdout, stderr) - - for task_pkl, ind, rerun in tasks_to_run: - self.jobid_by_task_uid[Path(task_pkl).parent.name] = jobid - - if error_file: - error_file = str(error_file).replace("%j", jobid) - self.error[jobid] = str(error_file).replace("%j", jobid) - return jobid - - async def get_output_by_task_pkl(self, task_pkl): - jobid = self.jobid_by_task_uid.get(task_pkl.parent.name) - while jobid is None: - jobid = self.jobid_by_task_uid.get(task_pkl.parent.name) - await asyncio.sleep(1) - job_output = self.output_by_jobid.get(jobid) - while job_output is None: - job_output = self.output_by_jobid.get(jobid) - await asyncio.sleep(1) - return job_output - - async def _submit_job( - self, - batchscript, - name, - uid, - cache_dir, - task_pkl, - ind, - output_dir, - task_qsub_args, - ): - """Coroutine that submits task runscript and polls job until completion or error.""" - await self._submit_jobs( - batchscript, - name, - uid, - cache_dir, - output_dir, - task_qsub_args, - ) - if self.poll_for_result_file: - while True: - result_file = output_dir / "_result.pklz" - if result_file.exists() and str(task_pkl) not in self.task_pkls_rerun: - return True - await asyncio.sleep(self.poll_delay) - else: - rc, stdout, stderr = await self.get_output_by_task_pkl(task_pkl) - while True: - jobid = self.jobid_by_task_uid.get(task_pkl.parent.name) - if self.job_completed_by_jobid.get(jobid): - return True - else: - await asyncio.sleep(self.poll_delay) - - async def _poll_job(self, jobid, cache_dir): - cmd = ("qstat", "-j", jobid) - logger.debug(f"Polling job {jobid}") - rc, stdout, stderr = await read_and_display_async(*cmd, hide_display=True) - - if not stdout: - # job is no longer running - check exit code - status = await self._verify_exit_code(jobid) - return status - return False - - async def _verify_exit_code(self, jobid): - cmd = ("qacct", "-j", jobid) - rc, stdout, stderr = await read_and_display_async(*cmd, hide_display=True) - if not stdout: - await asyncio.sleep(10) - rc, stdout, stderr = await read_and_display_async(*cmd, hide_display=True) - - # job is still pending/working - if 
re.match(r"error: job id .* not found", stderr): - return False - - if not stdout: - return "ERRORED" - - # Read the qacct stdout into dictionary stdout_dict - for line in stdout.splitlines(): - line_split = line.split() - if len(line_split) > 1: - if line_split[0] == "failed": - if not line_split[1].isdigit(): - return "ERRORED" - elif not int(line_split[1]) == 0: - return "ERRORED" - return True - - -class DaskWorker(Worker): - """A worker to execute in parallel using Dask.distributed. - This is an experimental implementation with limited testing. - """ - - plugin_name = "dask" - - def __init__(self, **kwargs): - """Initialize Worker.""" - super().__init__() - try: - from dask.distributed import Client # noqa: F401 - except ImportError: - logger.critical("Please instiall Dask distributed.") - raise - self.client = None - self.client_args = kwargs - logger.debug("Initialize Dask") - - def run_el(self, runnable, rerun=False, **kwargs): - """Run a task.""" - return self.exec_dask(runnable, rerun=rerun) - - async def exec_dask(self, runnable, rerun=False): - """Run a task (coroutine wrapper).""" - from dask.distributed import Client - - async with Client(**self.client_args, asynchronous=True) as client: - if isinstance(runnable, TaskBase): - future = client.submit(runnable._run, rerun) - result = await future - else: # it could be tuple that includes pickle files with tasks and inputs - ind, task_main_pkl, task_orig = runnable - future = client.submit(load_and_run, task_main_pkl, ind, rerun) - result = await future - return result - - def close(self): - """Finalize the internal pool of tasks.""" - pass - - -class PsijWorker(Worker): - """A worker to execute tasks using PSI/J.""" - - def __init__(self, **kwargs): - """ - Initialize PsijWorker. - - Parameters - ---------- - subtype : str - Scheduler for PSI/J. - """ - try: - import psij - except ImportError: - logger.critical("Please install psij.") - raise - logger.debug("Initialize PsijWorker") - self.psij = psij - - def run_el(self, interface, rerun=False, **kwargs): - """Run a task.""" - return self.exec_psij(interface, rerun=rerun) - - def make_spec(self, cmd=None, arg=None): - """ - Create a PSI/J job specification. - - Parameters - ---------- - cmd : str, optional - Executable command. Defaults to None. - arg : list, optional - List of arguments. Defaults to None. - - Returns - ------- - psij.JobSpec - PSI/J job specification. - """ - spec = self.psij.JobSpec() - spec.executable = cmd - spec.arguments = arg - - return spec - - def make_job(self, spec, attributes): - """ - Create a PSI/J job. - - Parameters - ---------- - spec : psij.JobSpec - PSI/J job specification. - attributes : any - Job attributes. - - Returns - ------- - psij.Job - PSI/J job. - """ - job = self.psij.Job() - job.spec = spec - return job - - async def exec_psij(self, runnable, rerun=False): - """ - Run a task (coroutine wrapper). - - Raises - ------ - Exception - If stderr is not empty. 
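The PSI/J worker being removed here drives submission entirely through `psij.JobSpec`, `psij.Job` and `JobExecutor.get_instance`. A minimal sketch of that flow, using only the calls that appear in the deleted code (requires the `psij-python` package; the command and file names are illustrative):

```python
# Minimal PSI/J submission flow mirroring the removed PsijWorker:
# build a JobSpec, attach it to a Job, submit via a local executor, and
# treat a non-empty stderr file as failure (as exec_psij does above).
from pathlib import Path
import psij

jex = psij.JobExecutor.get_instance("local")  # "slurm" for the SLURM-backed variant

spec = psij.JobSpec()
spec.executable = "python"
spec.arguments = ["-c", "print('hello from a PSI/J job')"]
spec.stdout_path = Path.cwd() / "demo.stdout"
spec.stderr_path = Path.cwd() / "demo.stderr"

job = psij.Job()
job.spec = spec

jex.submit(job)
job.wait()

if spec.stderr_path.stat().st_size > 0:
    raise RuntimeError(spec.stderr_path.read_text())
print(spec.stdout_path.read_text())
```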
- - Returns - ------- - None - """ - import pickle - from pathlib import Path - - jex = self.psij.JobExecutor.get_instance(self.subtype) - absolute_path = Path(__file__).parent - - if isinstance(runnable, TaskBase): - cache_dir = runnable.cache_dir - file_path = cache_dir / "runnable_function.pkl" - with open(file_path, "wb") as file: - pickle.dump(runnable._run, file) - func_path = absolute_path / "run_pickled.py" - spec = self.make_spec("python", [func_path, file_path]) - else: # it could be tuple that includes pickle files with tasks and inputs - cache_dir = runnable[-1].cache_dir - file_path_1 = cache_dir / "taskmain.pkl" - file_path_2 = cache_dir / "ind.pkl" - ind, task_main_pkl, task_orig = runnable - with open(file_path_1, "wb") as file: - pickle.dump(task_main_pkl, file) - with open(file_path_2, "wb") as file: - pickle.dump(ind, file) - func_path = absolute_path / "run_pickled.py" - spec = self.make_spec( - "python", - [ - func_path, - file_path_1, - file_path_2, - ], - ) - - if rerun: - spec.arguments.append("--rerun") - - spec.stdout_path = cache_dir / "demo.stdout" - spec.stderr_path = cache_dir / "demo.stderr" - - job = self.make_job(spec, None) - jex.submit(job) - job.wait() - - if spec.stderr_path.stat().st_size > 0: - with open(spec.stderr_path, "r") as stderr_file: - stderr_contents = stderr_file.read() - raise Exception( - f"stderr_path '{spec.stderr_path}' is not empty. Contents:\n{stderr_contents}" - ) - - return - - def close(self): - """Finalize the internal pool of tasks.""" - pass - - -class PsijLocalWorker(PsijWorker): - """A worker to execute tasks using PSI/J on the local machine.""" - - subtype = "local" - plugin_name = f"psij-{subtype}" - - -class PsijSlurmWorker(PsijWorker): - """A worker to execute tasks using PSI/J using SLURM.""" - - subtype = "slurm" - plugin_name = f"psij-{subtype}" - - -WORKERS = { - w.plugin_name: w - for w in ( - SerialWorker, - ConcurrentFuturesWorker, - SlurmWorker, - DaskWorker, - SGEWorker, - PsijLocalWorker, - PsijSlurmWorker, - ) -} diff --git a/pydra/engine/workflow.py b/pydra/engine/workflow.py new file mode 100644 index 0000000000..915aea9840 --- /dev/null +++ b/pydra/engine/workflow.py @@ -0,0 +1,397 @@ +import logging +import inspect +import typing as ty +from copy import copy +from collections import defaultdict +from typing import Self +import attrs +from pydra.compose import workflow +from pydra.compose.base import Task, Outputs +from pydra.engine.graph import DiGraph, INPUTS_NODE_NAME, OUTPUTS_NODE_NAME +from pydra.engine import state +from pydra.engine.lazy import LazyInField, LazyOutField +from pydra.utils.hash import hash_function, Cache +from pydra.engine.state import State +from pydra.engine.node import Node +from pydra.engine.hooks import ( + TaskHooks, +) +from pydra.engine.submitter import Submitter, NodeExecution +from pydra.utils.general import ( + attrs_values, + task_dict, + task_fields, +) +from pydra.utils.typing import is_lazy +from pydra.environments.base import Environment + +logger = logging.getLogger("pydra") + +OutputsType = ty.TypeVar("OutputType", bound=Outputs) +WorkflowOutputsType = ty.TypeVar("OutputType", bound=workflow.Outputs) + + +@attrs.define(auto_attribs=False) +class Workflow(ty.Generic[WorkflowOutputsType]): + """A workflow, constructed from a workflow task + + Parameters + ---------- + name : str + The name of the workflow + inputs : Task + The input task of the workflow + outputs : Task + The output task of the workflow + """ + + name: str = attrs.field() + inputs: 
workflow.Task[WorkflowOutputsType] = attrs.field() + outputs: WorkflowOutputsType = attrs.field() + _nodes: dict[str, Node] = attrs.field(factory=dict) + + def __repr__(self): + return f"Workflow(name={self.name!r}, defn={self.inputs!r})" + + @classmethod + def clear_cache( + cls, task: workflow.Task[WorkflowOutputsType] | None = None + ) -> None: + """Clear the cache of constructed workflows""" + if task is None: + cls._constructed_cache = defaultdict(lambda: defaultdict(dict)) + else: + cls._constructed_cache[hash_function(task)] = defaultdict(dict) + + @classmethod + def construct( + cls, + task: workflow.Task[WorkflowOutputsType], + dont_cache: bool = False, + lazy: ty.Sequence[str] = (), + ) -> Self: + """Construct a workflow from a task, caching the constructed worklow + + Parameters + ---------- + task : workflow.Task + The task of the workflow to construct + dont_cache : bool, optional + Whether to cache the constructed workflow, by default False + lazy : Sequence[str], optional + The names of the inputs to the workflow to be considered lazy even if they + have values in the given task, by default () + """ + + # Check the previously constructed workflows to see if a workflow has been + # constructed for the given set of inputs, or a less-specific set (i.e. with a + # super-set of lazy inputs), and use that if it exists + + non_lazy_vals = { + n: v + for n, v in attrs_values(task).items() + if not is_lazy(v) and n not in lazy + } + non_lazy_keys = frozenset(non_lazy_vals) + hash_cache = Cache() # share the hash cache to avoid recalculations + non_lazy_hash = hash_function(non_lazy_vals, cache=hash_cache) + defn_hash = hash_function(type(task), cache=hash_cache) + # Check for same non-lazy inputs + try: + defn_cache = cls._constructed_cache[defn_hash] + except KeyError: + pass + else: + if ( + non_lazy_keys in defn_cache + and non_lazy_hash in defn_cache[non_lazy_keys] + ): + return defn_cache[non_lazy_keys][non_lazy_hash] + # Check for supersets of lazy inputs + for key_set, key_set_cache in defn_cache.items(): + if key_set.issubset(non_lazy_keys): + subset_vals = { + k: v for k, v in non_lazy_vals.items() if k in key_set + } + subset_hash = hash_function(subset_vals, cache=hash_cache) + if subset_hash in key_set_cache: + return key_set_cache[subset_hash] + + # Initialise the outputs of the workflow + outputs = task.Outputs( + **{f.name: attrs.NOTHING for f in task_fields(task.Outputs)} + ) + + # Initialise the lzin fields + lazy_spec = copy(task) + workflow = Workflow( + name=type(task).__name__, + inputs=lazy_spec, + outputs=outputs, + ) + # Set lazy inputs to the workflow, need to do it after the workflow is initialised + # so a back ref to the workflow can be set in the lazy field + for field in task_fields(task): + if field.name not in non_lazy_keys: + setattr( + lazy_spec, + field.name, + LazyInField( + workflow=workflow, + field=field.name, + type=field.type, + ), + ) + + input_values = attrs_values(lazy_spec) + constructor = input_values.pop("constructor") + # Call the user defined constructor to set the outputs + output_lazy_fields = constructor(**input_values) + if all(v is attrs.NOTHING for v in task_dict(outputs).values()): + if output_lazy_fields is None: + raise ValueError( + f"Constructor function for {task} returned None, must a lazy field " + "or a tuple of lazy fields" + ) + else: # Outputs are set explicitly in the outputs object + if output_lazy_fields is not None: + raise ValueError( + f"Constructor function for {task} must not return anything " + "if any of the 
outputs are already set explicitly" + ) + if unset_outputs := [ + n for n, v in task_dict(outputs).items() if v is attrs.NOTHING + ]: + raise ValueError( + f"Mandatory outputs {unset_outputs} are not set by the " + f"constructor of {workflow!r}" + ) + # Check to see whether any mandatory inputs are not set + for node in workflow.nodes: + node._task._check_rules() + # Check that the outputs are set correctly, either directly by the constructor + # or via returned values that can be zipped with the output names + if output_lazy_fields: + if not isinstance(output_lazy_fields, (list, tuple)): + output_lazy_fields = [output_lazy_fields] + output_fields = task_fields(task.Outputs) + if len(output_lazy_fields) != len(output_fields): + raise ValueError( + f"Expected {len(output_fields)} outputs, got " + f"{len(output_lazy_fields)} ({output_lazy_fields})" + ) + for outpt, outpt_lf in zip(output_fields, output_lazy_fields): + # Automatically combine any uncombined state arrays into a single lists + outpt_lf._type = State.combine_state_arrays(outpt_lf._type) + setattr(outputs, outpt.name, outpt_lf) + else: + if unset_outputs := [ + a for a, v in attrs_values(outputs).items() if v is attrs.NOTHING + ]: + raise ValueError( + f"Expected outputs {unset_outputs} to be set by the " + f"constructor of {workflow!r}" + ) + if not dont_cache: + cls._constructed_cache[defn_hash][non_lazy_keys][non_lazy_hash] = workflow + + return workflow + + @classmethod + def under_construction(cls) -> "Workflow[ty.Any]": + """Access the under_construction variable by iterating up through the call stack.""" + frame = inspect.currentframe() + while frame: + # Find the frame where the construct method was called + if ( + frame.f_code.co_name == "construct" + and frame.f_locals.get("cls") is cls + and "workflow" in frame.f_locals + ): + return frame.f_locals["workflow"] # local var "workflow" in construct + frame = frame.f_back + raise RuntimeError( + "No workflow is currently under construction (i.e. 
did not find a " + "`Workflow.construct` in the current call stack" + ) + + def add( + self, + task: Task[OutputsType], + name: str | None = None, + environment: Environment | None = None, + hooks: TaskHooks | None = None, + ) -> OutputsType: + """Add a node to the workflow + + Parameters + ---------- + task_spec : Task + The task of the job to add to the workflow as a node + name : str, optional + The name of the node, by default it will be the name of the task + class + environment : Environment, optional + The environment to run the job in, such as the Docker or Singularity container, + by default it will be the "native" + hooks : TaskHooks, optional + The hooks to run before or after the job, by default no hooks will be run + + Returns + ------- + OutputType + The outputs of the node + """ + from pydra.environments import native + + if name is None: + name = type(task).__name__ + if name in self._nodes: + raise ValueError(f"Node with name {name!r} already exists in the workflow") + if ( + environment + and not isinstance(environment, native.Environment) + and task._task_type != "shell" + ): + raise ValueError( + "Environments can only be used with 'shell' tasks not " + f"{task._task_type!r} tasks ({task!r})" + ) + node = Node[OutputsType]( + name=name, + task=task, + workflow=self, + environment=environment, + hooks=hooks, + ) + self._nodes[name] = node + return node.lzout + + def __getitem__(self, key: str) -> Node: + return self._nodes[key] + + @property + def nodes(self) -> ty.Iterable[Node]: + return self._nodes.values() + + @property + def node_names(self) -> list[str]: + return list(self._nodes) + + # Used to cache the constructed workflows by their hashed input values + _constructed_cache: dict[ + str, dict[frozenset[str], dict[str, "Workflow[ty.Any]"]] + ] = defaultdict(lambda: defaultdict(dict)) + + def execution_graph(self, submitter: "Submitter") -> DiGraph: + from pydra.engine.submitter import NodeExecution + + exec_nodes = [NodeExecution(n, submitter, workflow=self) for n in self.nodes] + graph = self._create_graph(exec_nodes) + # Set the graph attribute of the nodes so lazy fields can be resolved as tasks + # are created + for node in exec_nodes: + node.graph = graph + return graph + + def graph(self, detailed: bool = False) -> DiGraph: + return self._create_graph(self.nodes, detailed=detailed) + + def _create_graph( + self, nodes: "list[Node | NodeExecution]", detailed: bool = False + ) -> DiGraph: + """ + Connects a particular job to existing nodes in the workflow. + + Parameters + ---------- + detailed : bool + If True, `add_edges_description` is run a detailed descriptions of the + connections (input/output fields names) + node_klass : type, optional + The class to use for the nodes in the workflow. 
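The `construct`/`add` machinery in this new `Workflow` class is normally driven from a `@workflow.define` constructor function, as in the `BasicWorkflow` helper earlier in this diff. A hedged sketch of that usage, included here to show how the node registry built by `add` can be inspected; the task and node names are illustrative:

```python
# Sketch of driving Workflow.construct via a @workflow.define constructor.
# Mirrors BasicWorkflow in pydra/engine/tests/utils.py earlier in this diff.
from pydra.compose import python, workflow
from pydra.engine.workflow import Workflow


@python.define
def AddTwo(a: int) -> int:
    return a + 2


@python.define
def Mul(x: int, y: int) -> int:
    return x * y


@workflow.define(outputs=["out"])
def SmallWorkflow(x: int):
    a = workflow.add(AddTwo(a=x), name="A")   # add() returns the node's lazy outputs
    b = workflow.add(Mul(x=a.out, y=2), name="B")
    return b.out


wf = Workflow.construct(SmallWorkflow(x=3))   # builds (and caches) the node graph
print(wf.node_names)                          # -> ['A', 'B']
print(wf["A"])                                # nodes are retrievable by name
```

Because `construct` caches by the hash of the non-lazy inputs, repeated construction with the same values returns the already-built graph rather than re-running the constructor.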
If provided the node is + wrapped by an instance of the class, if None the node is added as is, + by default None + + Returns + ------- + DiGraph + The graph of the workflow + """ + graph: DiGraph = DiGraph(name=self.name) + for node in nodes: + graph.add_nodes(node) + # TODO: create connection is run twice + for node in nodes: + other_states = {} + for field in task_fields(node._task): + lf = node._task[field.name] + if isinstance(lf, LazyOutField): + # adding an edge to the graph if job id expecting output from a different job + + # checking if the connection is already in the graph + if (graph.node(lf._node.name), node) not in graph.edges: + graph.add_edges((graph.node(lf._node.name), node)) + if detailed: + graph.add_edges_description( + (node.name, field.name, lf._node.name, lf._field) + ) + logger.debug("Connecting %s to %s", lf._node.name, node.name) + # adding a state from the previous job to other_states + if ( + graph.node(lf._node.name).state + and graph.node(lf._node.name).state.splitter_rpn_final + ): + # variables that are part of inner splitters should be + # treated as a containers + if ( + node.state + and f"{node.name}.{field.name}" + in node.state._current_splitter_rpn + ): + node.state._inner_container_ndim[ + f"{node.name}.{field.name}" + ] = 1 + # adding task_name: (job.state, [a field from the connection] + if lf._node.name not in other_states: + other_states[lf._node.name] = ( + graph.node(lf._node.name).state, + [field.name], + ) + else: + # if the job already exist in other_state, + # additional field name should be added to the list of fields + other_states[lf._node.name][1].append(field.name) + elif ( + isinstance(lf, LazyInField) and detailed + ): # LazyField with the wf input + # connections with wf input should be added to the detailed graph description + graph.add_edges_description( + (node.name, field.name, INPUTS_NODE_NAME, lf._field) + ) + + # if job has connections state has to be recalculated + if other_states: + if hasattr(node, "fut_combiner"): + combiner = node.fut_combiner + else: + combiner = None + + if node.state: + node.state.update_connections( + new_other_states=other_states, new_combiner=combiner + ) + else: + node.state = state.State( + node.name, + splitter=None, + other_states=other_states, + combiner=combiner, + ) + if detailed: + lf: LazyOutField + for outpt_name, lf in attrs_values(self.outputs).items(): + graph.add_edges_description( + (OUTPUTS_NODE_NAME, outpt_name, lf._node.name, lf._field) + ) + return graph diff --git a/pydra/environments/base.py b/pydra/environments/base.py new file mode 100644 index 0000000000..29de902de3 --- /dev/null +++ b/pydra/environments/base.py @@ -0,0 +1,251 @@ +import typing as ty +import os +from copy import copy +import attrs +import subprocess as sp +from pathlib import Path +import logging +from fileformats.generic import FileSet +from pydra.compose import shell +from pydra.utils.general import task_fields, get_plugin_classes +from pydra.utils.typing import TypeParser +import pydra.environments + +logger = logging.getLogger("pydra") + +if ty.TYPE_CHECKING: + from pydra.engine.job import Job + + +@attrs.define +class Environment: + """ + Base class for environments that are used to execute tasks. + Right now it is assumed that the environment, including container images, + are available and are not removed at the end + TODO: add setup and teardown methods + """ + + def setup(self): + pass + + def execute(self, job: "Job[shell.Task]") -> dict[str, ty.Any]: + """ + Execute the job in the environment. 
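To make the plugin contract concrete, here is a hedged sketch of a custom `Environment` subclass modelled on the Docker and Native implementations later in this diff. The `Echoing` name is hypothetical, and using `job.inputs` as the values mapping passed to `job.task._command_args` is an assumption based on how `Container.get_bindings` below treats `job.inputs`:

```python
# Hypothetical Environment plugin sketch. The return_code/stdout/stderr keys
# and the use of job.task._command_args mirror the Docker environment below;
# _plugin_name is only needed when the class lives outside pydra.environments.
import typing as ty
from pydra.environments import base

if ty.TYPE_CHECKING:
    from pydra.engine.job import Job


class Echoing(base.Environment):
    """Run shell tasks in the current environment, echoing the command first."""

    _plugin_name = "echoing"

    def execute(self, job: "Job") -> dict[str, ty.Any]:
        # Assumption: job.inputs holds the resolved input values, as in
        # Container.get_bindings further down this file.
        cmd = job.task._command_args(values=job.inputs)
        print("about to run:", " ".join(str(c) for c in cmd))
        keys = ["return_code", "stdout", "stderr"]
        return dict(zip(keys, base.execute(cmd)))
```

Such a class could then be passed as the `environment` argument that is threaded through the submission helpers earlier in this diff.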
+ + Parameters + ---------- + job : TaskBase + the job to execute + + Returns + ------- + output: dict[str, Any] + Output of the job. + """ + raise NotImplementedError + + def teardown(self): + pass + + @classmethod + def available_plugins(cls) -> ty.Dict[str, ty.Type["Environment"]]: + """Return all installed worker types""" + return get_plugin_classes(pydra.environments, "Environment") + + @classmethod + def plugin(cls, plugin_name: str) -> ty.Type["Environment"]: + """Return a worker class by name.""" + try: + return cls.available_plugins()[plugin_name.replace("-", "_")] + except KeyError: + raise ValueError( + f"No environment matches {plugin_name!r}, check if there is a " + f"plugin package called 'pydra-environments-{plugin_name}' that needs to be " + "installed." + ) + + @classmethod + def plugin_name(cls) -> str: + """Return the name of the plugin.""" + try: + plugin_name = cls._plugin_name + except AttributeError: + parts = cls.__module__.split(".") + if parts[:-1] != ["pydra", "environments"]: + raise ValueError( + f"Cannot infer plugin name of Environment (({cls}) from module path, " + f"as it isn't installed within `pydra.environments` ({cls.__module__}). " + "Please set the `_plugin_name` attribute on the class explicitly." + ) + plugin_name = parts[-1] + return plugin_name.replace("_", "-") + + +def split_if_str(s) -> list[str]: + if isinstance(s, str): + return s.split() + elif not isinstance(s, list): + return list(s) + return s + + +@attrs.define +class Container(Environment): + """ + Base class for container environments used by Docker and Singularity. + + Parameters + ---------- + image : str + Name of the container image + tag : str + Tag of the container image + root : str + Base path for mounting host directories into the container + xargs : Union[str, List[str]] + Extra arguments to be passed to the container + """ + + image: str + tag: str = "latest" + root: str = "/mnt/pydra" + xargs: list[str] = attrs.field(factory=list, converter=split_if_str) + + def bind(self, loc, mode="ro"): + loc_abs = Path(loc).absolute() + return f"{loc_abs}:{self.root}{loc_abs}:{mode}" + + def get_bindings( + self, job: "Job", root: str | None = None + ) -> tuple[dict[str, tuple[str, str]], dict[str, tuple[Path, ...]]]: + """Return bindings necessary to run job in an alternative root. + + This is primarily intended for contexts when a job is going + to be run in a container with mounted volumes. + + Arguments + --------- + root: str, optional + + + Returns + ------- + bindings: dict + Mapping from paths in the host environment to the target environment + """ + + bindings: dict[str, tuple[str, str]] = {} + value_updates: dict[str, tuple[Path, ...]] = {} + if root is None: + return bindings + fld: shell.arg + for fld in task_fields(job.task): + if TypeParser.contains_type(FileSet, fld.type): + value: FileSet | None = job.inputs[fld.name] + if not value: + continue + + copy_file = fld.copy_mode == FileSet.CopyMode.copy + + def map_path(fileset: os.PathLike | FileSet) -> Path: + host_path, env_path = fileset.parent, Path( + f"{root}{fileset.parent}" + ) + + # Default to mounting paths as read-only, but respect existing modes + bindings[host_path] = ( + env_path, + "rw" if copy_file or isinstance(fld, shell.outarg) else "ro", + ) + return ( + env_path / fileset.name + if isinstance(fileset, os.PathLike) + else tuple(env_path / rel for rel in fileset.relative_fspaths) + ) + + # Provide updated in-container paths to the command to be run. 
If a + # fs-object, which resolves to a single path, just pass in the name of + # that path relative to the location in the mount point in the container. + # If it is a more complex file-set with multiple paths, then it is converted + # into a tuple of paths relative to the base of the fileset. + if TypeParser.matches(value, os.PathLike | FileSet): + value_updates[fld.name] = map_path(value) + elif TypeParser.matches(value, ty.Sequence[FileSet | os.PathLike]): + mapped_value = [] + for val in value: + mapped_val = map_path(val) + if isinstance(mapped_val, tuple): + mapped_value.extend(mapped_val) + else: + mapped_value.append(mapped_val) + value_updates[fld.name] = mapped_value + else: + logger.debug( + "No support for generating bindings for %s types " "(%s)", + type(value), + value, + ) + + # Add the cache directory to the list of mounts + bindings[job.cache_root] = ( + f"{self.root.rstrip('/')}{job.cache_root.absolute()}", + "rw", + ) + + # Update values with the new paths + values = copy(job.inputs) + values.update(value_updates) + + return bindings, values + + +def execute(cmd, strip=False): + """ + Run the event loop with coroutine. + + Uses :func:`read_and_display_async` unless a loop is + already running, in which case :func:`read_and_display` + is used. + + Parameters + ---------- + cmd : :obj:`list` or :obj:`tuple` + The command line to be executed. + strip : :obj:`bool` + TODO + + """ + rc, stdout, stderr = read_and_display(*cmd, strip=strip) + """ + loop = get_open_loop() + if loop.is_running(): + rc, stdout, stderr = read_and_display(*cmd, strip=strip) + else: + rc, stdout, stderr = loop.run_until_complete( + read_and_display_async(*cmd, strip=strip) + ) + """ + return rc, stdout, stderr + + +def read_and_display(*cmd, strip=False, hide_display=False): + """Capture a process' standard output.""" + try: + process = sp.run(cmd, stdout=sp.PIPE, stderr=sp.PIPE) + except Exception: + # TODO editing some tracing? 
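+        # for now the exception raised by subprocess.run is simply re-raised
+        # unchanged to the caller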
+ raise + + if strip: + return ( + process.returncode, + process.stdout.decode("utf-8").strip(), + process.stderr.decode("utf-8"), + ) + else: + return ( + process.returncode, + process.stdout.decode("utf-8"), + process.stderr.decode("utf-8"), + ) diff --git a/pydra/environments/docker.py b/pydra/environments/docker.py new file mode 100644 index 0000000000..03179f0da6 --- /dev/null +++ b/pydra/environments/docker.py @@ -0,0 +1,47 @@ +import typing as ty +import logging +from pydra.compose import shell +from pydra.environments import base + +logger = logging.getLogger("pydra") + +if ty.TYPE_CHECKING: + from pydra.engine.job import Job + + +class Docker(base.Container): + """Docker environment.""" + + def execute(self, job: "Job[shell.Task]") -> dict[str, ty.Any]: + docker_img = f"{self.image}:{self.tag}" + # mounting all input locations + mounts, values = self.get_bindings(job=job, root=self.root) + + docker_args = [ + "docker", + "run", + *self.xargs, + ] + docker_args.extend( + " ".join( + [f"-v {key}:{val[0]}:{val[1]}" for (key, val) in mounts.items()] + ).split() + ) + docker_args.extend(["-w", f"{self.root}{job.cache_dir}"]) + keys = ["return_code", "stdout", "stderr"] + + job.cache_dir.mkdir(exist_ok=True) + values = base.execute( + docker_args + [docker_img] + job.task._command_args(values=values), + ) + output = dict(zip(keys, values)) + if output["return_code"]: + if output["stderr"]: + raise RuntimeError(output["stderr"]) + else: + raise RuntimeError(output["stdout"]) + return output + + +# Alias so it can be referred to as docker.Environment +Environment = Docker diff --git a/pydra/environments/native.py b/pydra/environments/native.py new file mode 100644 index 0000000000..fa51108d0f --- /dev/null +++ b/pydra/environments/native.py @@ -0,0 +1,33 @@ +import typing as ty +import logging +from pydra.compose import shell +from pydra.environments import base + +logger = logging.getLogger("pydra") + +if ty.TYPE_CHECKING: + from pydra.engine.job import Job + + +class Native(base.Environment): + """ + Native environment, i.e. the tasks are executed in the current shell environment. 
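+    Commands are run directly on the host via ``subprocess``; no path re-mapping
+    or bind mounts are applied.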
+ """ + + def execute(self, job: "Job[shell.Task]") -> dict[str, ty.Any]: + keys = ["return_code", "stdout", "stderr"] + cmd_args = job.task._command_args(values=job.inputs) + values = base.execute(cmd_args) + output = dict(zip(keys, values)) + if output["return_code"]: + msg = f"Error running '{job.name}' job with {cmd_args}:" + if output["stderr"]: + msg += "\n\nstderr:\n" + output["stderr"] + if output["stdout"]: + msg += "\n\nstdout:\n" + output["stdout"] + raise RuntimeError(msg) + return output + + +# Alias so it can be referred to as native.Environment +Environment = Native diff --git a/pydra/environments/singularity.py b/pydra/environments/singularity.py new file mode 100644 index 0000000000..a7fc0efffa --- /dev/null +++ b/pydra/environments/singularity.py @@ -0,0 +1,52 @@ +import typing as ty +import logging +from pydra.compose import shell +from pydra.environments import base + +logger = logging.getLogger("pydra") + +if ty.TYPE_CHECKING: + from pydra.engine.job import Job + + +class Singularity(base.Container): + """Singularity environment.""" + + def execute(self, job: "Job[shell.Task]") -> dict[str, ty.Any]: + singularity_img = f"{self.image}:{self.tag}" + # mounting all input locations + mounts, values = self.get_bindings(job=job, root=self.root) + + # todo adding xargsy etc + singularity_args = [ + "singularity", + "exec", + *self.xargs, + ] + singularity_args.extend( + " ".join( + [f"-B {key}:{val[0]}:{val[1]}" for (key, val) in mounts.items()] + ).split() + ) + singularity_args.extend( + ["--pwd", f"{self.root.rstrip('/')}{job.cache_dir.absolute()}"] + ) + keys = ["return_code", "stdout", "stderr"] + + job.cache_dir.mkdir(exist_ok=True) + values = base.execute( + singularity_args + + [singularity_img] + + job.task._command_args(values=values), + ) + output = dict(zip(keys, values)) + if output["return_code"]: + if output["stderr"]: + raise RuntimeError(output["stderr"]) + else: + raise RuntimeError(output["stdout"]) + return output + + +# Alias so it can be referred to as singularity.Environment +Environment = Singularity diff --git a/pydra/environments/tests/test_docker.py b/pydra/environments/tests/test_docker.py new file mode 100644 index 0000000000..2f83ffb64d --- /dev/null +++ b/pydra/environments/tests/test_docker.py @@ -0,0 +1,630 @@ +import attrs +import pytest +from pydra.engine.submitter import Submitter +from fileformats.generic import File +from pydra.environments import docker +from pydra.compose import shell, workflow +from pydra.engine.job import Job +from pydra.engine.tests.utils import ( + no_win, + need_docker, + run_submitter, + run_no_submitter, +) + + +@no_win +@need_docker +def test_docker_1_nosubm(tmp_path): + """simple command in a container, a default bindings and working directory is added + no submitter + """ + cmd = "whoami" + Docky = shell.define(cmd) + docky = Docky() + docky_task = Job( + task=docky, + name="docky", + submitter=Submitter( + environment=docker.Environment(image="busybox"), cache_root=tmp_path + ), + ) + assert docky_task.environment.image == "busybox" + assert docky_task.environment.tag == "latest" + assert isinstance(docky_task.environment, docker.Environment) + assert docky.cmdline == cmd + + res = docky_task.run() + assert res.outputs.stdout == "root\n" + assert res.outputs.return_code == 0 + + +@no_win +@need_docker +def test_docker_1(worker, tmp_path): + """simple command in a container, a default bindings and working directory is added + using submitter + """ + cmd = "whoami" + Docky = shell.define(cmd) + docky = Docky() 
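+    # submit via a Submitter configured with a Docker environment; the default
+    # bindings and working directory are added by the environment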
+ + with Submitter( + cache_root=tmp_path, environment=docker.Environment(image="busybox") + ) as sub: + res = sub(docky) + + assert res.outputs.stdout == "root\n" + assert res.outputs.return_code == 0 + + +@no_win +@need_docker +@pytest.mark.parametrize("run_function", [run_no_submitter, run_submitter]) +def test_docker_2(run_function, worker, tmp_path): + """a command with arguments, cmd and args given as executable + with and without submitter + """ + cmdline = "echo hail pydra" + Docky = shell.define(cmdline) + docky = Docky() + # cmdline doesn't know anything about docker + assert docky.cmdline == cmdline + outputs = run_function( + docky, tmp_path, worker, environment=docker.Environment(image="busybox") + ) + assert outputs.stdout.strip() == " ".join(cmdline.split()[1:]) + assert outputs.return_code == 0 + + +@no_win +@need_docker +@pytest.mark.parametrize("run_function", [run_no_submitter, run_submitter]) +def test_docker_2a(run_function, worker, tmp_path): + """a command with arguments, using executable and args + using submitter + """ + cmd = ["echo", "hail", "pydra"] + # separate command into exec + args + Docky = shell.define(cmd) + docky = Docky() + assert docky.executable == cmd + assert docky.cmdline == " ".join(cmd) + + outputs = run_function( + docky, tmp_path, worker, environment=docker.Environment(image="busybox") + ) + assert outputs.stdout.strip() == " ".join(cmd[1:]) + assert outputs.return_code == 0 + + +# tests with State + + +@no_win +@need_docker +@pytest.mark.parametrize("run_function", [run_no_submitter, run_submitter]) +def test_docker_st_1(run_function, worker, tmp_path): + """commands without arguments in container + splitter = executable + """ + cmd = ["pwd", "whoami"] + Docky = shell.define("docky") # cmd is just a placeholder + docky = Docky().split(executable=cmd) + + outputs = run_function( + docky, tmp_path, worker, environment=docker.Environment(image="busybox") + ) + assert ( + outputs.stdout[0] + == f"/mnt/pydra{tmp_path}/{attrs.evolve(docky, executable=cmd[0])._checksum}\n" + ) + assert outputs.stdout[1] == "root\n" + assert outputs.return_code[0] == outputs.return_code[1] == 0 + + +# tests with customized output_spec + + +@no_win +@need_docker +def test_docker_outputspec_1(worker, tmp_path): + """ + customised output_spec, adding files to the output, providing specific pathname + output_path is automatically added to the bindings + """ + Docky = shell.define("touch ") + docky = Docky() + + outputs = docky(worker=worker, environment=docker.Environment(image="ubuntu")) + assert outputs.stdout == "" + + +# tests with customised input_spec + + +@no_win +@need_docker +def test_docker_inputspec_1(tmp_path, worker): + """a simple customized input task for docker task""" + filename = str(tmp_path / "file_pydra.txt") + with open(filename, "w") as f: + f.write("hello from pydra") + + cmd = "cat" + + Docky = shell.define( + cmd, + inputs=[ + shell.arg( + name="file", + type=File, + position=1, + argstr="", + help="input file", + ) + ], + ) + + docky = Docky(file=filename) + + outputs = docky( + cache_root=tmp_path, + worker=worker, + environment=docker.Environment(image="busybox"), + ) + assert outputs.stdout.strip() == "hello from pydra" + + +@no_win +@need_docker +def test_docker_inputspec_1a(tmp_path): + """a simple customized input task for docker task + a default value is used + """ + filename = str(tmp_path / "file_pydra.txt") + with open(filename, "w") as f: + f.write("hello from pydra") + + cmd = "cat" + + Docky = shell.define( + cmd, + inputs=[ + 
shell.arg( + name="file", + type=File, + default=filename, + position=1, + argstr="", + help="input file", + ) + ], + ) + + docky = Docky() + + outputs = docky( + cache_root=tmp_path, environment=docker.Environment(image="busybox") + ) + assert outputs.stdout.strip() == "hello from pydra" + + +@no_win +@need_docker +def test_docker_inputspec_2(worker, tmp_path): + """a customized input task with two fields for docker task""" + filename_1 = tmp_path / "file_pydra.txt" + with open(filename_1, "w") as f: + f.write("hello from pydra\n") + + filename_2 = tmp_path / "file_nice.txt" + with open(filename_2, "w") as f: + f.write("have a nice one") + + cmd = "cat" + + Docky = shell.define( + cmd, + inputs=[ + shell.arg( + name="file1", + type=File, + position=1, + argstr="", + help="input file 1", + ), + shell.arg( + name="file2", + type=File, + default=filename_2, + position=2, + argstr="", + help="input file 2", + ), + ], + ) + docky = Docky( + file1=filename_1, + ) + + outputs = docky(environment=docker.Environment(image="busybox")) + assert outputs.stdout.strip() == "hello from pydra\nhave a nice one" + + +@no_win +@need_docker +def test_docker_inputspec_2a_except(worker, tmp_path): + """a customized input task with two fields + first one uses a default, and second doesn't - raises a dataclass exception + """ + filename_1 = tmp_path / "file_pydra.txt" + with open(filename_1, "w") as f: + f.write("hello from pydra\n") + filename_2 = tmp_path / "file_nice.txt" + with open(filename_2, "w") as f: + f.write("have a nice one") + + cmd = "cat" + + Docky = shell.define( + cmd, + inputs=[ + shell.arg( + name="file1", + type=File, + default=filename_1, + position=1, + argstr="", + help="input file 1", + ), + shell.arg( + name="file2", + type=File, + position=2, + argstr="", + help="input file 2", + ), + ], + ) + + docky = Docky( + file2=filename_2, + ) + assert docky.file2.fspath == filename_2 + + outputs = docky( + cache_root=tmp_path, + worker=worker, + environment=docker.Environment(image="busybox"), + ) + assert outputs.stdout.strip() == "hello from pydra\nhave a nice one" + + +@no_win +@need_docker +def test_docker_inputspec_2a(worker, tmp_path): + """a customized input task with two fields + first one uses a default value + this is fine even if the second field is not using any defaults + """ + filename_1 = tmp_path / "file_pydra.txt" + with open(filename_1, "w") as f: + f.write("hello from pydra\n") + filename_2 = tmp_path / "file_nice.txt" + with open(filename_2, "w") as f: + f.write("have a nice one") + + cmd = "cat" + + Docky = shell.define( + cmd, + inputs=[ + shell.arg( + name="file1", + type=File, + default=filename_1, + position=1, + argstr="", + help="input file 1", + ), + shell.arg( + name="file2", + type=File, + position=2, + argstr="", + help="input file 2", + ), + ], + ) + + docky = Docky(file2=filename_2) + + outputs = docky( + cache_root=tmp_path, + worker=worker, + environment=docker.Environment(image="busybox"), + ) + assert outputs.stdout.strip() == "hello from pydra\nhave a nice one" + + +@no_win +@need_docker +@pytest.mark.xfail(reason="'docker' not in /proc/1/cgroup on ubuntu; TODO") +def test_docker_inputspec_3(worker, tmp_path): + """input file is in the container, so metadata["container_path"]: True, + the input will be treated as a str""" + filename = "/proc/1/cgroup" + + cmd = "cat" + + inputs = [ + shell.arg( + name="file", + type=File, + position=1, + argstr="", + help="input file", + container_path=True, + ) + ] + + docky = shell.define(cmd, inputs=inputs)( + 
environment=docker.Environment(image="busybox"), + file=filename, + strip=True, + ) + + cmdline = docky.cmdline + outputs = docky() + assert "docker" in outputs.stdout + assert cmdline == docky.cmdline + + +@no_win +@need_docker +def test_docker_cmd_inputspec_copyfile_1(worker, tmp_path): + """shelltask changes a file in place, + adding copyfile=True to the file-input from input_spec + hardlink or copy in the cache_dir should be created + """ + file = tmp_path / "file_pydra.txt" + with open(file, "w") as f: + f.write("hello from pydra\n") + + @shell.define + class Docky(shell.Task["Docky.Outputs"]): + executable = ["sed", "-is", "s/hello/hi/"] + orig_file: File = shell.arg( + position=1, + argstr="", + help="orig file", + copy_mode="copy", + ) + + class Outputs(shell.Outputs): + out_file: File = shell.outarg( + path_template="{orig_file}.txt", + help="output file", + ) + + docky = Docky(orig_file=str(file)) + + outputs = docky( + cache_root=tmp_path, + worker=worker, + environment=docker.Environment(image="busybox"), + ) + assert outputs.stdout == "" + out_file = outputs.out_file.fspath + assert out_file.exists() + # the file is copied, and then it is changed in place + assert out_file.parent.parent == tmp_path + with open(out_file) as f: + assert "hi from pydra\n" == f.read() + # the original file is unchanged + with open(file) as f: + assert "hello from pydra\n" == f.read() + + +@no_win +@need_docker +def test_docker_inputspec_state_1(worker, tmp_path): + """a customised input task for a docker file with a splitter, + splitter is on files + """ + filename_1 = tmp_path / "file_pydra.txt" + with open(filename_1, "w") as f: + f.write("hello from pydra\n") + filename_2 = tmp_path / "file_nice.txt" + with open(filename_2, "w") as f: + f.write("have a nice one") + + cmd = "cat" + + Docky = shell.define( + cmd, + inputs=[ + shell.arg( + name="file", + type=File, + position=1, + argstr="", + help="input file", + ) + ], + ) + + docky = Docky().split(file=[str(filename_1), str(filename_2)]) + + outputs = docky( + worker=worker, + cache_root=tmp_path, + environment=docker.Environment(image="busybox"), + ) + assert outputs.stdout[0].strip() == "hello from pydra" + assert outputs.stdout[1].strip() == "have a nice one" + + +@no_win +@need_docker +def test_docker_inputspec_state_1b(worker, tmp_path): + """a customised input task for a docker file with a splitter, + files from the input task have the same path in the local os and the container, + so hash is calculated and the test works fine + """ + file_1 = tmp_path / "file_pydra.txt" + file_2 = tmp_path / "file_nice.txt" + with open(file_1, "w") as f: + f.write("hello from pydra") + with open(file_2, "w") as f: + f.write("have a nice one") + + cmd = "cat" + + Docky = shell.define( + cmd, + inputs=[ + shell.arg( + name="file", + type=File, + position=1, + argstr="", + help="input file", + ) + ], + ) + docky = Docky().split(file=[str(file_1), str(file_2)]) + + outputs = docky( + cache_root=tmp_path, + worker=worker, + environment=docker.Environment(image="busybox"), + ) + assert outputs.stdout[0].strip() == "hello from pydra" + assert outputs.stdout[1].strip() == "have a nice one" + + +@no_win +@need_docker +def test_docker_wf_inputspec_1(worker, tmp_path): + """a customized input task for workflow with docker tasks""" + filename = tmp_path / "file_pydra.txt" + with open(filename, "w") as f: + f.write("hello from pydra") + + cmd = "cat" + + Docky = shell.define( + cmd, + inputs=[ + shell.arg( + name="file", + type=File, + position=1, + argstr="", + 
help="input file", + ) + ], + ) + + @workflow.define + def Workflow(file): + + docky = workflow.add( + Docky(file=file), + environment=docker.Environment(image="busybox"), + ) + + return docky.stdout + + wf = Workflow(file=filename) + + outputs = wf(cache_root=tmp_path) + assert outputs.out.strip() == "hello from pydra" + + +@no_win +@need_docker +def test_docker_wf_state_inputspec_1(worker, tmp_path): + """a customized input task for workflow with docker tasks that has a state""" + file_1 = tmp_path / "file_pydra.txt" + file_2 = tmp_path / "file_nice.txt" + with open(file_1, "w") as f: + f.write("hello from pydra") + with open(file_2, "w") as f: + f.write("have a nice one") + + cmd = "cat" + + Docky = shell.define( + cmd, + inputs=[ + shell.arg( + name="file", + type=File, + position=1, + argstr="", + help="input file", + ) + ], + ) + + @workflow.define + def Workflow(file): + + docky = workflow.add( + Docky(file=file), + environment=docker.Environment(image="busybox"), + ) + + return docky.stdout + + wf = Workflow().split(file=[file_1, file_2]) + + outputs = wf(cache_root=tmp_path) + + assert outputs.out[0].strip() == "hello from pydra" + assert outputs.out[1].strip() == "have a nice one" + + +@no_win +@need_docker +def test_docker_wf_ndst_inputspec_1(worker, tmp_path): + """a customized input task for workflow with docker tasks with states""" + file_1 = tmp_path / "file_pydra.txt" + file_2 = tmp_path / "file_nice.txt" + with open(file_1, "w") as f: + f.write("hello from pydra") + with open(file_2, "w") as f: + f.write("have a nice one") + + cmd = "cat" + + Docky = shell.define( + cmd, + inputs=[ + shell.arg( + name="file", + type=File, + position=1, + argstr="", + help="input file", + ) + ], + ) + + @workflow.define + def Workflow(file): + + docky = workflow.add( + Docky(file=file), + environment=docker.Environment(image="busybox"), + ) + + return docky.stdout + + wf = Workflow().split(file=[str(file_1), str(file_2)]) + + outputs = wf(cache_root=tmp_path) + assert outputs.out == ["hello from pydra", "have a nice one"] diff --git a/pydra/environments/tests/test_environments.py b/pydra/environments/tests/test_environments.py new file mode 100644 index 0000000000..b39ed8ee8c --- /dev/null +++ b/pydra/environments/tests/test_environments.py @@ -0,0 +1,505 @@ +from pathlib import Path +import typing as ty +from pydra.environments import native, docker, singularity +from pydra.engine.submitter import Submitter +from fileformats.generic import File +from pydra.compose import shell +from pydra.engine.job import Job +from pydra.utils.general import attrs_values +from pydra.engine.tests.utils import no_win, need_docker, need_singularity +import pytest + + +def makedir(path: Path, name: str) -> Path: + newdir = path / name + newdir.mkdir() + return newdir + + +def drop_stderr(dct: dict[str, ty.Any]): + return {k: v for k, v in dct.items() if k != "stderr"} + + +def test_native_1(tmp_path): + """simple command, no arguments""" + + def newcache(x): + return makedir(tmp_path, x) + + cmd = "whoami" + Shelly = shell.define(cmd) + shelly = Shelly() + assert shelly.cmdline == cmd + + shelly_job = Job( + task=shelly, + submitter=Submitter(cache_root=newcache("native-task")), + name="native", + ) + env_outputs = native.Environment().execute(shelly_job) + + outputs = shelly(cache_root=newcache("native-exec")) + assert drop_stderr(env_outputs) == drop_stderr(attrs_values(outputs)) + + outputs = shelly( + environment=native.Environment(), cache_root=newcache("native-call") + ) + assert 
drop_stderr(env_outputs) == drop_stderr(attrs_values(outputs)) + + with Submitter( + cache_root=newcache("native-submitter"), environment=native.Environment() + ) as sub: + result = sub(shelly) + assert drop_stderr(env_outputs) == drop_stderr(attrs_values(result.outputs)) + + +@no_win +@need_docker +def test_docker_1(tmp_path): + """docker env: simple command, no arguments""" + + def newcache(x): + return makedir(tmp_path, x) + + cmd = "whoami" + dock = docker.Environment(image="busybox") + Shelly = shell.define(cmd) + shelly = Shelly() + assert shelly.cmdline == cmd + + shelly_job = Job( + task=shelly, + submitter=Submitter(cache_root=newcache("docker")), + name="docker", + ) + outputs_dict = dock.execute(shelly_job) + + with Submitter(cache_root=newcache("docker_sub"), environment=dock) as sub: + result = sub(shelly) + + outputs = shelly(environment=dock, cache_root=newcache("docker_call")) + # If busybox isn't found locally, then the stderr will have the download progress from + # the Docker auto-pull in it + for key in ["stdout", "return_code"]: + assert ( + outputs_dict[key] + == attrs_values(outputs)[key] + == attrs_values(result.outputs)[key] + ) + + +@no_win +@need_docker +@pytest.mark.parametrize( + "dock", + [ + docker.Environment(image="busybox"), + docker.Environment(image="busybox", tag="latest", xargs="--rm"), + docker.Environment(image="busybox", xargs=["--rm"]), + ], +) +def test_docker_1_subm(tmp_path, dock): + """docker env with submitter: simple command, no arguments""" + + def newcache(x): + return makedir(tmp_path, x) + + cmd = "whoami" + shelly = shell.define(cmd)() + shelly_job = Job( + task=shelly, + submitter=Submitter(cache_root=newcache("docker")), + name="docker", + ) + assert shelly.cmdline == cmd + outputs_dict = dock.execute(shelly_job) + + with Submitter( + worker="cf", cache_root=newcache("docker_sub"), environment=dock + ) as sub: + result = sub(shelly) + assert drop_stderr(outputs_dict) == drop_stderr(attrs_values(result.outputs)) + + outputs = shelly(cache_root=newcache("docker_call"), environment=dock) + assert drop_stderr(outputs_dict) == drop_stderr(attrs_values(outputs)) + + +@no_win +@need_singularity +def test_singularity_1(tmp_path): + """singularity env: simple command, no arguments""" + + def newcache(x): + return makedir(tmp_path, x) + + cmd = "whoami" + sing = singularity.Environment(image="docker://alpine", xargs=["--fakeroot"]) + Shelly = shell.define(cmd) + shelly = Shelly() + shelly_job = Job( + task=shelly, + submitter=Submitter(cache_root=newcache("singu")), + name="singu", + ) + assert shelly.cmdline == cmd + outputs_dict = sing.execute(shelly_job) + + with Submitter(cache_root=newcache("singu_sub"), environment=sing) as sub: + results = sub(shelly) + assert drop_stderr(outputs_dict) == drop_stderr(attrs_values(results.outputs)) + + outputs = shelly(environment=sing, cache_root=newcache("singu_call")) + assert drop_stderr(outputs_dict) == drop_stderr(attrs_values(outputs)) + + +@no_win +@need_singularity +def test_singularity_1_subm(tmp_path, worker): + """docker env with submitter: simple command, no arguments""" + + def newcache(x: str) -> Path: + return makedir(tmp_path, x) + + cmd = "whoami" + sing = singularity.Environment(image="docker://alpine", xargs=["--fakeroot"]) + Shelly = shell.define(cmd) + shelly = Shelly() + shelly_job = Job( + task=shelly, + submitter=Submitter(cache_root=newcache("singu")), + name="singu", + ) + assert shelly.cmdline == cmd + outputs_dict = sing.execute(shelly_job) + + with Submitter( + worker=worker, 
environment=sing, cache_root=newcache("singu_sub") + ) as sub: + results = sub(shelly) + assert drop_stderr(outputs_dict) == drop_stderr(attrs_values(results.outputs)) + + outputs = shelly(environment=sing, cache_root=newcache("singu_call")) + # singularity gives info about cashed image in stderr + assert drop_stderr(outputs_dict) == drop_stderr(attrs_values(outputs)) + + +def shelly_with_input_factory(filename, executable) -> shell.Task: + """creating a task with a simple input_spec""" + Shelly = shell.define( + executable, + inputs=[ + shell.arg( + name="file", + type=File, + position=1, + help="files", + argstr="", + ) + ], + ) + return Shelly(**({} if filename is None else {"file": filename})) + + +def make_job(task: shell.Task, tempdir: Path, name: str): + return Job( + task=task, + submitter=Submitter(cache_root=makedir(tempdir, name)), + name=name, + ) + + +def test_shell_fileinp(tmp_path): + """task with a file in the command/input""" + + def newcache(x): + return makedir(tmp_path, x) + + input_dir = makedir(tmp_path, "inputs") + filename = input_dir / "file.txt" + with open(filename, "w") as f: + f.write("hello ") + + shelly = shelly_with_input_factory(filename=filename, executable="cat") + shelly_job = make_job(shelly, tmp_path, "native") + outputs_dict = native.Environment().execute(shelly_job) + + with Submitter( + environment=native.Environment(), cache_root=newcache("native_sub") + ) as sub: + results = sub(shelly) + assert drop_stderr(outputs_dict) == drop_stderr(attrs_values(results.outputs)) + + outputs = shelly( + environment=native.Environment(), cache_root=newcache("native_call") + ) + assert drop_stderr(outputs_dict) == drop_stderr(attrs_values(outputs)) + + +def test_shell_fileinp_st(tmp_path): + """task (with a splitter) with a file in the command/input""" + + def newcache(x): + return makedir(tmp_path, x) + + input_dir = makedir(tmp_path, "inputs") + filename_1 = input_dir / "file_1.txt" + with open(filename_1, "w") as f: + f.write("hello ") + + filename_2 = input_dir / "file_2.txt" + with open(filename_2, "w") as f: + f.write("hi ") + + filename = [filename_1, filename_2] + + shelly = shelly_with_input_factory(filename=None, executable="cat") + with Submitter( + environment=native.Environment(), cache_root=newcache("native") + ) as sub: + results = sub(shelly.split(file=filename)) + assert [s.strip() for s in results.outputs.stdout] == ["hello", "hi"] + + outputs = shelly.split(file=filename)( + environment=native.Environment(), cache_root=newcache("native_call") + ) + assert [s.strip() for s in outputs.stdout] == ["hello", "hi"] + + +@no_win +@need_docker +def test_docker_fileinp(tmp_path): + """docker env: task with a file in the command/input""" + + def newcache(x): + return makedir(tmp_path, x) + + dock = docker.Environment(image="busybox") + + input_dir = makedir(tmp_path, "inputs") + filename = input_dir / "file.txt" + with open(filename, "w") as f: + f.write("hello ") + + shelly = shelly_with_input_factory(filename=filename, executable="cat") + outputs_dict = dock.execute(make_job(shelly, tmp_path, "docker")) + + with Submitter(environment=dock, cache_root=newcache("shell_sub")) as sub: + results = sub(shelly) + + assert drop_stderr(outputs_dict) == drop_stderr(attrs_values(results.outputs)) + + outputs = shelly(environment=dock, cache_root=newcache("docker_call")) + assert drop_stderr(outputs_dict) == drop_stderr(attrs_values(outputs)) + + +@no_win +@need_docker +def test_docker_fileinp_subm(tmp_path, worker): + """docker env with a submitter: task with a 
file in the command/input""" + + def newcache(x): + return makedir(tmp_path, x) + + dock = docker.Environment(image="busybox") + + input_dir = makedir(tmp_path, "inputs") + filename = input_dir / "file.txt" + with open(filename, "w") as f: + f.write("hello ") + + shelly = shelly_with_input_factory(filename=filename, executable="cat") + shelly_job = make_job(shelly, tmp_path, "docker_job") + outputs_dict = dock.execute(shelly_job) + + with Submitter( + environment=dock, cache_root=newcache("docker_sub"), worker=worker + ) as sub: + results = sub(shelly) + with Submitter(worker=worker) as sub: + results = sub(shelly) + assert drop_stderr(outputs_dict) == drop_stderr(attrs_values(results.outputs)) + + outputs = shelly(environment=dock, cache_root=newcache("docker_call")) + assert drop_stderr(outputs_dict) == drop_stderr(attrs_values(outputs)) + + +@no_win +@need_docker +def test_docker_fileinp_st(tmp_path): + """docker env: task (with a splitter) with a file in the command/input""" + + def newcache(x): + return makedir(tmp_path, x) + + dock = docker.Environment(image="busybox") + + input_dir = makedir(tmp_path, "inputs") + filename_1 = input_dir / "file_1.txt" + with open(filename_1, "w") as f: + f.write("hello ") + + filename_2 = input_dir / "file_2.txt" + with open(filename_2, "w") as f: + f.write("hi ") + + filename = [filename_1, filename_2] + + shelly = shelly_with_input_factory(filename=None, executable="cat") + + with Submitter(environment=dock, cache_root=newcache("docker_sub")) as sub: + results = sub(shelly.split(file=filename)) + + assert [s.strip() for s in results.outputs.stdout] == ["hello", "hi"] + + outputs = shelly.split(file=filename)( + environment=dock, cache_root=newcache("docker_call") + ) + assert [s.strip() for s in outputs.stdout] == ["hello", "hi"] + + +def shelly_outputfile_factory(filename, executable="cp"): + """creating a task with an input_spec that contains a template""" + Shelly = shell.define( + executable, + inputs=[ + shell.arg( + name="file_orig", + type=File, + position=1, + help="new file", + argstr="", + ), + ], + outputs=[ + shell.outarg( + name="file_copy", + type=File, + path_template="{file_orig}_copy", + help="output file", + argstr="", + position=2, + keep_extension=True, + ), + ], + ) + + return Shelly(**({} if filename is None else {"file_orig": filename})) + + +def test_shell_fileout(tmp_path): + """task with a file in the output""" + + def newcache(x): + return Path(makedir(tmp_path, x)) + + input_dir = makedir(tmp_path, "inputs") + filename = input_dir / "file.txt" + with open(filename, "w") as f: + f.write("hello ") + + # execute does not create the cashedir, so this part will fail, + # but I guess we don't want to use it this way anyway + # shelly = create_shelly_outputfile(tempdir=tmp_path, filename=filename, name="native") + # outputs_dict = native.Environment().execute(shelly) + + shelly = shelly_outputfile_factory(filename=filename) + + with Submitter( + environment=native.Environment(), cache_root=newcache("native_sub") + ) as sub: + result = sub(shelly) + assert Path(result.outputs.file_copy) == result.cache_dir / "file_copy.txt" + + call_cache = newcache("native_call") + + outputs = shelly(environment=native.Environment(), cache_root=call_cache) + assert Path(outputs.file_copy) == call_cache / shelly._checksum / "file_copy.txt" + + +def test_shell_fileout_st(tmp_path): + """task (with a splitter) with a file in the output""" + + def newcache(x): + return Path(makedir(tmp_path, x)) + + input_dir = makedir(tmp_path, "inputs") + 
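+    # create two input files so the splitter runs the command once per file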
filename_1 = input_dir / "file_1.txt" + with open(filename_1, "w") as f: + f.write("hello ") + + filename_2 = input_dir / "file_2.txt" + with open(filename_2, "w") as f: + f.write("hi ") + + filename = [filename_1, filename_2] + + shelly = shelly_outputfile_factory(filename=None) + with Submitter( + environment=native.Environment(), cache_root=newcache("native") + ) as sub: + results = sub(shelly.split(file_orig=filename)) + + assert [f.name for f in results.outputs.file_copy] == [ + "file_1_copy.txt", + "file_2_copy.txt", + ] + + call_cache = newcache("native_call") + + outputs = shelly.split(file_orig=filename)( + environment=native.Environment(), cache_root=call_cache + ) + assert [f.name for f in outputs.file_copy] == ["file_1_copy.txt", "file_2_copy.txt"] + + +@no_win +@need_docker +def test_docker_fileout(tmp_path): + """docker env: task with a file in the output""" + + def newcache(x): + return Path(makedir(tmp_path, x)) + + dock = docker.Environment(image="busybox") + + input_dir = makedir(tmp_path, "inputs") + filename = input_dir / "file.txt" + with open(filename, "w") as f: + f.write("hello ") + + shelly = shelly_outputfile_factory(filename=filename) + + with Submitter(environment=dock, cache_root=newcache("docker")) as sub: + results = sub(shelly) + assert results.outputs.file_copy == File(results.cache_dir / "file_copy.txt") + + +@no_win +@need_docker +def test_docker_fileout_st(tmp_path): + """docker env: task (with a splitter) with a file in the output""" + + def newcache(x): + return Path(makedir(tmp_path, x)) + + dock = docker.Environment(image="busybox") + + input_dir = makedir(tmp_path, "inputs") + filename_1 = input_dir / "file_1.txt" + with open(filename_1, "w") as f: + f.write("hello ") + + filename_2 = input_dir / "file_2.txt" + with open(filename_2, "w") as f: + f.write("hi ") + + filename = [filename_1, filename_2] + + shelly = shelly_outputfile_factory(filename=None) + + with Submitter(environment=dock, cache_root=newcache("docker_sub")) as sub: + results = sub(shelly.split(file_orig=filename)) + assert [f.name for f in results.outputs.file_copy] == [ + "file_1_copy.txt", + "file_2_copy.txt", + ] diff --git a/pydra/environments/tests/test_singularity.py b/pydra/environments/tests/test_singularity.py new file mode 100644 index 0000000000..0f8e686318 --- /dev/null +++ b/pydra/environments/tests/test_singularity.py @@ -0,0 +1,594 @@ +from pydra.engine.submitter import Submitter +from pydra.compose import shell, workflow +from fileformats.generic import File +from pydra.environments import singularity +from pydra.engine.tests.utils import need_singularity + + +@need_singularity +def test_singularity_1_nosubm(tmp_path): + """simple command in a container, a default bindings and working directory is added + no submitter + """ + cmd = "pwd" + image = "docker://alpine" + Singu = shell.define(cmd) + singu = Singu() + outputs = singu( + environment=singularity.Environment(image=image), cache_root=tmp_path + ) + assert "/mnt/pydra" in outputs.stdout + assert outputs.return_code == 0 + + +@need_singularity +def test_singularity_2_nosubm(tmp_path): + """a command with arguments, cmd and args given as executable + no submitter + """ + cmd = ["echo", "hail", "pydra"] + image = "docker://alpine" + Singu = shell.define(" ".join(cmd)) + singu = Singu() + assert singu.cmdline == " ".join(cmd) + + outputs = singu( + environment=singularity.Environment(image=image), + cache_root=tmp_path, + ) + assert outputs.stdout.strip() == " ".join(cmd[1:]) + assert outputs.return_code == 0 + + 
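For orientation, the pattern exercised throughout these environment tests can be condensed into a short sketch. This is an editorial illustration rather than part of the diff; the "docker://alpine" image and the temporary cache directories are placeholders, and the container run assumes Singularity/Apptainer is installed. The same shell task runs natively or inside a container simply by switching the environment argument:

    import tempfile
    from pathlib import Path

    from pydra.compose import shell
    from pydra.environments import native, singularity

    cache = Path(tempfile.mkdtemp())
    (cache / "native").mkdir()
    (cache / "singu").mkdir()

    Shelly = shell.define("whoami")
    shelly = Shelly()

    # run directly on the host
    outputs = shelly(environment=native.Environment(), cache_root=cache / "native")

    # run the identical task inside a container; file inputs and the cache
    # directory are bind-mounted under the container root ("/mnt/pydra" by default)
    outputs = shelly(
        environment=singularity.Environment(image="docker://alpine"),
        cache_root=cache / "singu",
    )
    print(outputs.stdout)
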
+@need_singularity +def test_singularity_2(worker, tmp_path): + """a command with arguments, cmd and args given as executable + using submitter + """ + cmd = ["echo", "hail", "pydra"] + image = "docker://alpine" + Singu = shell.define(" ".join(cmd)) + singu = Singu() + + assert singu.cmdline == " ".join(cmd) + + with Submitter( + worker=worker, + environment=singularity.Environment(image=image), + cache_root=tmp_path, + ) as sub: + res = sub(singu) + assert not res.errored, "\n".join(res.errors["error message"]) + assert res.outputs.stdout.strip() == " ".join(cmd[1:]) + assert res.outputs.return_code == 0 + + +@need_singularity +def test_singularity_2a(worker, tmp_path): + """a command with arguments, using executable and args + using submitter + """ + cmd_exec = "echo" + cmd_args = ["hail", "pydra"] + # separate command into exec + args + image = "docker://alpine" + Singu = shell.define(cmd_exec) + singu = Singu(append_args=cmd_args) + assert singu.cmdline == f"{cmd_exec} {' '.join(cmd_args)}" + + with Submitter( + worker="debug", + environment=singularity.Environment(image=image), + cache_root=tmp_path, + ) as sub: + res = sub(singu) + + assert not res.errored, "\n".join(res.errors["error message"]) + assert res.outputs.stdout.strip() == " ".join(cmd_args) + assert res.outputs.return_code == 0 + + +# tests with State + + +@need_singularity +def test_singularity_st_1(worker, tmp_path): + """commands without arguments in container + splitter = executable + """ + cmd = ["whoami", "pwd", "ls"] + image = "docker://alpine" + Singu = shell.define("dummy") + singu = Singu().split("executable", executable=cmd) + + outputs = singu( + worker=worker, + environment=singularity.Environment(image=image, xargs=["--fakeroot"]), + cache_root=tmp_path, + ) + assert outputs.stdout[0].strip() == "root" + assert "/mnt/pydra" in outputs.stdout[1] + assert outputs.stdout[2].strip() == "_job.pklz" + assert outputs.return_code == [0, 0, 0] + + +# tests with customized output_spec + + +@need_singularity +def test_singularity_outputspec_1(worker, tmp_path): + """ + customised output_spec, adding files to the output, providing specific pathname + output_path is automatically added to the bindings + """ + cmd = ["touch", "newfile_tmp.txt"] + image = "docker://alpine" + + Singu = shell.define( + " ".join(cmd), + outputs=[ + shell.outarg(name="newfile", type=File, path_template="newfile_tmp.txt") + ], + ) + singu = Singu() + + with Submitter( + environment=singularity.Environment(image=image), cache_root=tmp_path + ) as sub: + res = sub(singu) + + assert not res.errored, "\n".join(res.errors["error message"]) + assert res.outputs.stdout == "" + assert res.outputs.newfile.fspath.exists() + + +# tests with customised input_spec + + +@need_singularity +def test_singularity_inputspec_1(worker, tmp_path): + """a simple customized input task for singularity task""" + filename = str((tmp_path / "file_pydra.txt")) + with open(filename, "w") as f: + f.write("hello from pydra") + + cmd = "cat" + image = "docker://alpine" + + Singu = shell.define( + cmd, + inputs=[ + shell.arg( + name="file", + type=File, + position=1, + argstr="", + help="input file", + ) + ], + ) + + singu = Singu(file=filename) + + outputs = singu( + environment=singularity.Environment(image=image), cache_root=tmp_path + ) + assert outputs.stdout.strip() == "hello from pydra" + + +@need_singularity +def test_singularity_inputspec_1a(worker, tmp_path): + """a simple customized input task for singularity task + a default value is used + """ + filename = 
str((tmp_path / "file_pydra.txt")) + with open(filename, "w") as f: + f.write("hello from pydra") + + cmd = "cat" + image = "docker://alpine" + + Singu = shell.define( + cmd, + inputs=[ + shell.arg( + name="file", + type=File, + default=filename, + position=1, + argstr="", + help="input file", + ) + ], + ) + singu = Singu(file=filename) + + outputs = singu( + environment=singularity.Environment(image=image), cache_root=tmp_path + ) + assert outputs.stdout.strip() == "hello from pydra" + + +@need_singularity +def test_singularity_inputspec_2(worker, tmp_path): + """a customized input task with two fields for singularity task""" + filename_1 = tmp_path / "file_pydra.txt" + with open(filename_1, "w") as f: + f.write("hello from pydra\n") + + filename_2 = tmp_path / "file_nice.txt" + with open(filename_2, "w") as f: + f.write("have a nice one") + + cmd = "cat" + image = "docker://alpine" + + Singu = shell.define( + cmd, + inputs=[ + shell.arg( + name="file1", + type=File, + position=1, + argstr="", + help="input file 1", + ), + shell.arg( + name="file2", + type=File, + default=filename_2, + position=2, + argstr="", + help="input file 2", + ), + ], + ) + + singu = Singu(file1=filename_1) + + outputs = singu( + environment=singularity.Environment(image=image), cache_root=tmp_path + ) + assert outputs.stdout == "hello from pydra\nhave a nice one" + + +@need_singularity +def test_singularity_inputspec_2a_except(worker, tmp_path): + """a customized input task with two fields + first one uses a default, and second doesn't - raises a dataclass exception + """ + filename_1 = tmp_path / "file_pydra.txt" + with open(filename_1, "w") as f: + f.write("hello from pydra\n") + filename_2 = tmp_path / "file_nice.txt" + with open(filename_2, "w") as f: + f.write("have a nice one") + + cmd = "cat" + image = "docker://alpine" + + # the field with default value can't be before value without default + Singu = shell.define( + cmd, + inputs=[ + shell.arg( + name="file1", + type=File, + default=filename_1, + position=1, + argstr="", + help="input file 1", + ), + shell.arg( + name="file2", + type=File, + position=2, + argstr="", + help="input file 2", + ), + ], + ) + + singu = Singu(file2=filename_2) + outputs = singu( + environment=singularity.Environment(image=image), cache_root=tmp_path + ) + assert outputs.stdout == "hello from pydra\nhave a nice one" + + +@need_singularity +def test_singularity_inputspec_2a(worker, tmp_path): + """a customized input task with two fields + first one uses a default value, + this is fine even if the second field is not using any defaults + """ + filename_1 = tmp_path / "file_pydra.txt" + with open(filename_1, "w") as f: + f.write("hello from pydra\n") + filename_2 = tmp_path / "file_nice.txt" + with open(filename_2, "w") as f: + f.write("have a nice one") + + cmd = "cat" + image = "docker://alpine" + + # if you want set default in the first field you can use default_value in metadata + Singu = shell.define( + cmd, + inputs=[ + shell.arg( + name="file1", + type=File, + default=filename_1, + position=1, + argstr="", + help="input file 1", + ), + shell.arg( + name="file2", + type=File, + position=2, + argstr="", + help="input file 2", + ), + ], + ) + + singu = Singu(file2=filename_2) + + outputs = singu( + environment=singularity.Environment(image=image), cache_root=tmp_path + ) + assert outputs.stdout == "hello from pydra\nhave a nice one" + + +@need_singularity +def test_singularity_cmd_inputspec_copyfile_1(worker, tmp_path): + """shelltask changes a file in place, + adding 
copyfile=True to the file-input from input_spec + hardlink or copy in the cache_dir should be created + """ + file = tmp_path / "file_pydra.txt" + with open(file, "w") as f: + f.write("hello from pydra\n") + + image = "docker://alpine" + + @shell.define + class Singu(shell.Task["Singu.Outputs"]): + + executable = ["sed", "-is", "s/hello/hi/"] + + orig_file: File = shell.arg( + position=1, + argstr="", + help="orig file", + copy_mode=File.CopyMode.copy, + ) + + class Outputs(shell.Outputs): + out_file: File = shell.outarg( + path_template="{orig_file}.txt", # FIXME: Shouldn't have to specify the extension + help="output file", + ) + + singu = Singu(orig_file=file) + + outputs = singu( + environment=singularity.Environment(image=image), cache_root=tmp_path + ) + assert outputs.stdout == "" + assert outputs.out_file.fspath.exists() + # the file is copied, and than it is changed in place + assert outputs.out_file.fspath.parent.parent == tmp_path + with open(outputs.out_file) as f: + assert "hi from pydra\n" == f.read() + # the original file is unchanged + with open(file) as f: + assert "hello from pydra\n" == f.read() + + +@need_singularity +def test_singularity_inputspec_state_1(tmp_path): + """a customised input task for a singularity file with a splitter, + splitter is on files + """ + filename_1 = tmp_path / "file_pydra.txt" + with open(filename_1, "w") as f: + f.write("hello from pydra\n") + filename_2 = tmp_path / "file_nice.txt" + with open(filename_2, "w") as f: + f.write("have a nice one") + + cmd = "cat" + filename = [str(filename_1), str(filename_2)] + image = "docker://alpine" + + Singu = shell.define( + cmd, + inputs=[ + shell.arg( + name="file", + type=File, + position=1, + argstr="", + help="input file", + ) + ], + ) + + singu = Singu().split("file", file=filename) + + outputs = singu( + environment=singularity.Environment(image=image), cache_root=tmp_path + ) + assert outputs.stdout[0].strip() == "hello from pydra" + assert outputs.stdout[1].strip() == "have a nice one" + + +@need_singularity +def test_singularity_inputspec_state_1b(worker, tmp_path): + """a customised input task for a singularity file with a splitter, + files from the input task have the same path in the local os and the container, + so hash is calculated and the test works fine + """ + file_1 = tmp_path / "file_pydra.txt" + file_2 = tmp_path / "file_nice.txt" + with open(file_1, "w") as f: + f.write("hello from pydra") + with open(file_2, "w") as f: + f.write("have a nice one") + + cmd = "cat" + filename = [str(file_1), str(file_2)] + image = "docker://alpine" + + Singu = shell.define( + cmd, + inputs=[ + shell.arg( + name="file", + type=File, + position=1, + argstr="", + help="input file", + ) + ], + ) + + singu = Singu().split("file", file=filename) + + outputs = singu( + environment=singularity.Environment(image=image), cache_root=tmp_path + ) + assert outputs.stdout[0].strip() == "hello from pydra" + assert outputs.stdout[1].strip() == "have a nice one" + + +@need_singularity +def test_singularity_wf_inputspec_1(worker, tmp_path): + """a customized input task for workflow with singularity tasks""" + filename = tmp_path / "file_pydra.txt" + with open(filename, "w") as f: + f.write("hello from pydra") + + cmd = "cat" + image = "docker://alpine" + + Singu = shell.define( + cmd, + inputs=[ + shell.arg( + name="file", + type=File, + position=1, + argstr="", + help="input file", + ) + ], + ) + + @workflow.define + def Workflow(cmd: str, file: File) -> str: + singu = workflow.add( + Singu(executable=cmd, 
file=file), + environment=singularity.Environment(image=image), + ) + return singu.stdout + + with Submitter(cache_root=tmp_path) as sub: + res = sub(Workflow(cmd=cmd, file=filename)) + + assert res.outputs.out.strip() == "hello from pydra" + + +@need_singularity +def test_singularity_wf_state_inputspec_1(worker, tmp_path): + """a customized input task for workflow with singularity tasks that has a state""" + file_1 = tmp_path / "file_pydra.txt" + file_2 = tmp_path / "file_nice.txt" + with open(file_1, "w") as f: + f.write("hello from pydra") + with open(file_2, "w") as f: + f.write("have a nice one") + + cmd = "cat" + filename = [str(file_1), str(file_2)] + image = "docker://alpine" + + Singu = shell.define( + cmd, + inputs=[ + shell.arg( + name="file", + type=File, + position=1, + argstr="", + help="input file", + ) + ], + ) + + @workflow.define + def Workflow(cmd: str, file: File) -> str: + singu = workflow.add( + Singu(executable=cmd, file=file), + environment=singularity.Environment(image=image), + ) + return singu.stdout + + wf = Workflow(cmd=cmd).split("file", file=filename) + + with Submitter(worker=worker, cache_root=tmp_path) as sub: + res = sub(wf) + + assert [o.strip() for o in res.outputs.out] == [ + "hello from pydra", + "have a nice one", + ] + + +@need_singularity +def test_singularity_wf_ndst_inputspec_1(worker, tmp_path): + """a customized input task for workflow with singularity tasks with states""" + file_1 = tmp_path / "file_pydra.txt" + file_2 = tmp_path / "file_nice.txt" + with open(file_1, "w") as f: + f.write("hello from pydra") + with open(file_2, "w") as f: + f.write("have a nice one") + + cmd = "cat" + filename = [str(file_1), str(file_2)] + image = "docker://alpine" + + Singu = shell.define( + cmd, + inputs=[ + shell.arg( + name="file", + type=File, + position=1, + argstr="", + help="input file", + ) + ], + ) + + @workflow.define + def Workflow(cmd: str, files: list[File]) -> list[str]: + singu = workflow.add( + Singu(executable=cmd).split(file=files), + environment=singularity.Environment(image=image), + ) + return singu.stdout + + wf = Workflow(cmd=cmd, files=filename) + + with Submitter(worker=worker, cache_root=tmp_path) as sub: + res = sub(wf) + + assert [o.strip() for o in res.outputs.out] == [ + "hello from pydra", + "have a nice one", + ] diff --git a/pydra/mark/__init__.py b/pydra/mark/__init__.py deleted file mode 100644 index 31e4cf832e..0000000000 --- a/pydra/mark/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .functions import annotate, task - -__all__ = ("annotate", "task") diff --git a/pydra/mark/functions.py b/pydra/mark/functions.py deleted file mode 100644 index e191a61809..0000000000 --- a/pydra/mark/functions.py +++ /dev/null @@ -1,49 +0,0 @@ -""" Decorators to apply to functions used in Pydra workflows """ - -from functools import wraps - - -def annotate(annotation): - """ - Update the annotation of a function. - - Example - ------- - >>> import pydra - >>> @pydra.mark.annotate({'a': int, 'return': float}) - ... def square(a): - ... return a ** 2.0 - - """ - import inspect - - def decorate(func): - sig = inspect.signature(func) - unknown = set(annotation) - set(sig.parameters) - {"return"} - if unknown: - raise TypeError(f"Cannot annotate unknown parameters: {tuple(unknown)}") - func.__annotations__.update(annotation) - return func - - return decorate - - -def task(func): - """ - Promote a function to a :class:`~pydra.engine.task.FunctionTask`. - - Example - ------- - >>> import pydra - >>> @pydra.mark.task - ... 
def square(a: int) -> float: - ... return a ** 2.0 - - """ - from ..engine.task import FunctionTask - - @wraps(func) - def decorate(**kwargs): - return FunctionTask(func=func, **kwargs) - - return decorate diff --git a/pydra/mark/tests/__init__.py b/pydra/mark/tests/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/pydra/mark/tests/test_functions.py b/pydra/mark/tests/test_functions.py deleted file mode 100644 index 4be0343f1f..0000000000 --- a/pydra/mark/tests/test_functions.py +++ /dev/null @@ -1,219 +0,0 @@ -import pytest -import random -import typing as ty - -from ..functions import task, annotate -from ...engine.task import FunctionTask - - -def test_task_equivalence(): - def add_two(a): - return a + 2 - - canonical = FunctionTask(add_two, a=3) - - decorated1 = task(add_two)(a=3) - - @task - def addtwo(a): - return a + 2 - - decorated2 = addtwo(a=3) - - assert canonical.checksum == decorated1.checksum - - c_res = canonical._run() - d1_res = decorated1._run() - d2_res = decorated2._run() - - assert c_res.output.hash == d1_res.output.hash - assert c_res.output.hash == d2_res.output.hash - - -def test_annotation_equivalence_1(): - """testing various ways of annotation: one output, only types provided""" - - def direct(a: int) -> int: - return a + 2 - - @annotate({"return": int}) - def partial(a: int): - return a + 2 - - @annotate({"a": int, "return": int}) - def indirect(a): - return a + 2 - - # checking if the annotations are equivalent - assert direct.__annotations__ == partial.__annotations__ - assert direct.__annotations__ == indirect.__annotations__ - - # Run functions to ensure behavior is unaffected - a = random.randint(0, (1 << 32) - 3) - assert direct(a) == partial(a) - assert direct(a) == indirect(a) - - # checking if the annotation is properly converted to output_spec if used in task - task_direct = task(direct)() - assert task_direct.output_spec.fields[0] == ("out", int) - - -def test_annotation_equivalence_2(): - """testing various ways of annotation: multiple outputs, using a tuple for output annot.""" - - def direct(a: int) -> (int, float): - return a + 2, a + 2.0 - - @annotate({"return": (int, float)}) - def partial(a: int): - return a + 2, a + 2.0 - - @annotate({"a": int, "return": (int, float)}) - def indirect(a): - return a + 2, a + 2.0 - - # checking if the annotations are equivalent - assert direct.__annotations__ == partial.__annotations__ - assert direct.__annotations__ == indirect.__annotations__ - - # Run functions to ensure behavior is unaffected - a = random.randint(0, (1 << 32) - 3) - assert direct(a) == partial(a) - assert direct(a) == indirect(a) - - # checking if the annotation is properly converted to output_spec if used in task - task_direct = task(direct)() - assert task_direct.output_spec.fields == [("out1", int), ("out2", float)] - - -def test_annotation_equivalence_3(): - """testing various ways of annotation: using dictionary for output annot.""" - - def direct(a: int) -> {"out1": int}: - return a + 2 - - @annotate({"return": {"out1": int}}) - def partial(a: int): - return a + 2 - - @annotate({"a": int, "return": {"out1": int}}) - def indirect(a): - return a + 2 - - # checking if the annotations are equivalent - assert direct.__annotations__ == partial.__annotations__ - assert direct.__annotations__ == indirect.__annotations__ - - # Run functions to ensure behavior is unaffected - a = random.randint(0, (1 << 32) - 3) - assert direct(a) == partial(a) - assert direct(a) == indirect(a) - - # checking if the annotation is 
properly converted to output_spec if used in task - task_direct = task(direct)() - assert task_direct.output_spec.fields[0] == ("out1", int) - - -def test_annotation_equivalence_4(): - """testing various ways of annotation: using ty.NamedTuple for the output""" - - def direct(a: int) -> ty.NamedTuple("Output", [("sum", int), ("sub", int)]): - return a + 2, a - 2 - - @annotate({"return": ty.NamedTuple("Output", [("sum", int), ("sub", int)])}) - def partial(a: int): - return a + 2, a - 2 - - @annotate( - {"a": int, "return": ty.NamedTuple("Output", [("sum", int), ("sub", int)])} - ) - def indirect(a): - return a + 2, a - 2 - - # checking if the annotations are equivalent - assert ( - direct.__annotations__["return"].__annotations__ - == partial.__annotations__["return"].__annotations__ - == indirect.__annotations__["return"].__annotations__ - ) - assert ( - direct.__annotations__["return"].__name__ - == partial.__annotations__["return"].__name__ - == indirect.__annotations__["return"].__name__ - ) - - # Run functions to ensure behavior is unaffected - a = random.randint(0, (1 << 32) - 3) - assert direct(a) == partial(a) - assert direct(a) == indirect(a) - - # checking if the annotation is properly converted to output_spec if used in task - task_direct = task(direct)() - assert task_direct.output_spec.fields == [("sum", int), ("sub", int)] - - -def test_annotation_override(): - @annotate({"a": float, "return": float}) - def annotated(a: int) -> int: - return a + 2 - - assert annotated.__annotations__ == {"a": float, "return": float} - - -def test_invalid_annotation(): - with pytest.raises(TypeError): - - @annotate({"b": int}) - def addtwo(a): - return a + 2 - - -def test_annotated_task(): - @task - def square(in_val: float): - return in_val**2 - - res = square(in_val=2.0)() - assert res.output.out == 4.0 - - -def test_return_annotated_task(): - @task - @annotate({"in_val": float, "return": {"squared": float}}) - def square(in_val): - return in_val**2 - - res = square(in_val=2.0)() - assert res.output.squared == 4.0 - - -def test_return_halfannotated_annotated_task(): - @task - @annotate({"in_val": float, "return": float}) - def square(in_val): - return in_val**2 - - res = square(in_val=2.0)() - assert res.output.out == 4.0 - - -def test_return_annotated_task_multiple_output(): - @task - @annotate({"in_val": float, "return": {"squared": float, "cubed": float}}) - def square(in_val): - return in_val**2, in_val**3 - - res = square(in_val=2.0)() - assert res.output.squared == 4.0 - assert res.output.cubed == 8.0 - - -def test_return_halfannotated_task_multiple_output(): - @task - @annotate({"in_val": float, "return": (float, float)}) - def square(in_val): - return in_val**2, in_val**3 - - res = square(in_val=2.0)() - assert res.output.out1 == 4.0 - assert res.output.out2 == 8.0 diff --git a/pydra/scripts/__init__.py b/pydra/scripts/__init__.py new file mode 100644 index 0000000000..2d33c7c8b6 --- /dev/null +++ b/pydra/scripts/__init__.py @@ -0,0 +1,3 @@ +# Although the Python files in this directory are meant to be run separately, +# we add a __init__.py file to make it a package that can easily be discovered +# within the installation directory diff --git a/pydra/engine/run_pickled.py b/pydra/scripts/run_pickled.py similarity index 66% rename from pydra/engine/run_pickled.py rename to pydra/scripts/run_pickled.py index 902b243242..513aa38cf6 100644 --- a/pydra/engine/run_pickled.py +++ b/pydra/scripts/run_pickled.py @@ -1,6 +1,11 @@ -import pickle +import cloudpickle as cp import sys -from 
pydra.engine.helpers import load_and_run +from pathlib import Path +from pydra.engine.job import load_and_run + +# To avoid issues when running pytest, where the namespace package "pydra" is dropped in +# the pickling process due to it being run from inside the source tree +sys.path.append(str(Path(__file__).parent.parent)) def run_pickled(*file_paths, rerun=False): @@ -8,7 +13,7 @@ def run_pickled(*file_paths, rerun=False): for file_path in file_paths: with open(file_path, "rb") as file: - loaded_objects.append(pickle.load(file)) + loaded_objects.append(cp.load(file)) if len(loaded_objects) == 1: result = loaded_objects[0](rerun=rerun) diff --git a/pydra/tasks/__init__.py b/pydra/tasks/__init__.py deleted file mode 100644 index fae53c2d92..0000000000 --- a/pydra/tasks/__init__.py +++ /dev/null @@ -1,11 +0,0 @@ -""" Pydra tasks - -The ``pydra.tasks`` namespace is reserved for collections of Tasks, to be managed and -packaged separately. -To create a task package, please fork the `pydra-tasks-template -`__. -""" - -# This call enables pydra.tasks to be used as a namespace package when installed -# in editable mode. In normal installations it has no effect. -__path__ = __import__("pkgutil").extend_path(__path__, __name__) diff --git a/pydra/tasks/common/__init__.py b/pydra/tasks/common/__init__.py new file mode 100644 index 0000000000..f27d2cd259 --- /dev/null +++ b/pydra/tasks/common/__init__.py @@ -0,0 +1,9 @@ +import json +from fileformats.application import Json +from pydra.compose import python + + +@python.define +def LoadJson(file: Json) -> dict | list: + with open(file) as f: + return json.load(f) diff --git a/pydra/tasks/common/tests/test_common_tasks.py b/pydra/tasks/common/tests/test_common_tasks.py new file mode 100644 index 0000000000..13688d5a5e --- /dev/null +++ b/pydra/tasks/common/tests/test_common_tasks.py @@ -0,0 +1,21 @@ +from pathlib import Path +import json +from pydra.tasks.common import LoadJson + + +def test_load_json(tmp_path: Path): + JSON_CONTENTS = {"a": True, "b": "two", "c": 3, "d": [7, 0.55, 6]} + + # Create a JSON file with some contents + json_file = tmp_path / "test.json" + with open(json_file, "w") as f: + json.dump(JSON_CONTENTS, f) + + # Instantiate the task, providing the JSON file we want to load + load_json = LoadJson(file=json_file) + + # Run the task to load the JSON file + outputs = load_json() + + # Access the loaded JSON output contents and check they match original + assert outputs.out == JSON_CONTENTS diff --git a/pydra/tasks/testing/__init__.py b/pydra/tasks/testing/__init__.py new file mode 100644 index 0000000000..3787ded24b --- /dev/null +++ b/pydra/tasks/testing/__init__.py @@ -0,0 +1,82 @@ +from pydra.compose import python, workflow + + +@python.define +def Add(x: float, y: float) -> float: + return x + y + + +@python.define +def Divide(x: float, y: float) -> float: + return x / y + + +@python.define +def SafeDivide(x: float, y: float) -> float: + if y == 0: + return float("nan") + return x / y + + +@python.define +def Subtract(x: float, y: float) -> float: + return x - y + + +@workflow.define +def UnsafeDivisionWorkflow(a: float, b: float, denominator: float) -> float: + """Adds 'a' and 'b' together, divides by 'denominator', and then subtracts 'b' from + the output. Division by 0 is not guarded against so the workflow will fail if + the value passed to the 'denominator' parameter is 0. + + Parameters + ---------- + a : float + The first number to add. + b : float + The second number to add. 
+ denominator : float + The number to divide the sum of 'a' and 'b' by. + + Returns + ------- + out : float + The result of subtracting 'b' from the result of dividing the sum of 'a' and + 'b' by 'denominator'. + """ + add = workflow.add(Add(x=a, y=b)) + divide = workflow.add(Divide(x=add.out, y=denominator)) + subtract = workflow.add(Subtract(x=divide.out, y=b)) + return subtract.out + + +@workflow.define +def SafeDivisionWorkflow(a: float, b: float, denominator: float) -> float: + """Adds 'a' and 'b' together, divides by 'denominator', and then subtracts 'b' from + the output. Division by 0 is not guarded against so the workflow will fail if + the value passed to the 'denominator' parameter is 0. + + Parameters + ---------- + a : float + The first number to add. + b : float + The second number to add. + denominator : float + The number to divide the sum of 'a' and 'b' by. + + Returns + ------- + out : float + The result of subtracting 'b' from the result of dividing the sum of 'a' and + 'b' by 'denominator'. + """ + add = workflow.add(Add(x=a, y=b)) + divide = workflow.add(SafeDivide(x=add.out, y=denominator)) + subtract = workflow.add(Subtract(x=divide.out, y=b)) + return subtract.out + + +@python.define +def TenToThePower(p: int) -> int: + return 10**p diff --git a/pydra/tasks/testing/tests/test_testing_tasks.py b/pydra/tasks/testing/tests/test_testing_tasks.py new file mode 100644 index 0000000000..4c54a70563 --- /dev/null +++ b/pydra/tasks/testing/tests/test_testing_tasks.py @@ -0,0 +1,18 @@ +import pytest +from pydra.engine.submitter import Submitter +from pydra.tasks.testing import SafeDivisionWorkflow, UnsafeDivisionWorkflow + + +def test_safe_division_workflow(): + wf = SafeDivisionWorkflow(a=10, b=5).split(denominator=[3, 2, 0]) + with Submitter(worker="cf") as sub: + result = sub(wf) + + assert not result.errored, "\n".join(result.errors["error message"]) + + +def test_unsafe_division_workflow(): + wf = UnsafeDivisionWorkflow(a=10, b=5).split(denominator=[3, 2, 0]) + + with pytest.raises(ZeroDivisionError): + wf(worker="debug") diff --git a/pydra/utils/__init__.py b/pydra/utils/__init__.py index cfde94dbf8..69394bdf3e 100644 --- a/pydra/utils/__init__.py +++ b/pydra/utils/__init__.py @@ -1 +1,17 @@ -from .misc import user_cache_dir, add_exc_note, exc_info_matches # noqa: F401 +from .general import ( + task_fields, + plot_workflow, + show_workflow, + task_help, + print_help, +) +from ._version import __version__ + +__all__ = [ + "__version__", + "task_fields", + "plot_workflow", + "show_workflow", + "task_help", + "print_help", +] diff --git a/pydra/utils/etelemetry.py b/pydra/utils/etelemetry.py new file mode 100644 index 0000000000..3610b9b30a --- /dev/null +++ b/pydra/utils/etelemetry.py @@ -0,0 +1,20 @@ +import logging +from ._version import __version__ + +# import __main__ + +logger = logging.getLogger("pydra") + + +def check_latest_version(): + import etelemetry + + return etelemetry.check_available_version("nipype/pydra", __version__, lgr=logger) + + +# Run telemetry on import for interactive sessions, such as IPython, Jupyter notebooks, Python REPL +# if not hasattr(__main__, "__file__"): +# from pydra.compose.base import Task + +# if Task._etelemetry_version_data is None: +# Task._etelemetry_version_data = check_latest_version() diff --git a/pydra/utils/general.py b/pydra/utils/general.py new file mode 100644 index 0000000000..68241ac8c2 --- /dev/null +++ b/pydra/utils/general.py @@ -0,0 +1,586 @@ +"""Administrative support for the engine framework.""" + +from pathlib 
import Path +import inspect +import sys +import typing as ty +import re +import attrs +import ast +import tempfile +import importlib +import types +import sysconfig +import platformdirs +import builtins +import pkgutil +import logging +import pydra.scripts +from ._version import __version__ + + +logger = logging.getLogger("pydra") +if ty.TYPE_CHECKING: + from pydra.compose.base import Task + from pydra.compose import workflow + + +SCRIPTS_DIR = Path(pydra.scripts.__file__).parent + +PYDRA_ATTR_METADATA = "__PYDRA_METADATA__" + +TaskType = ty.TypeVar("TaskType", bound="Task") + + +user_cache_root = Path( + platformdirs.user_cache_dir( + appname="pydra", + appauthor="nipype", + version=__version__, + ) +) + +default_run_cache_root = user_cache_root / "run-cache" + + +def add_exc_note(e: Exception, note: str) -> Exception: + """Adds a note to an exception in a Python <3.11 compatible way + + Parameters + ---------- + e : Exception + the exception to add the note to + note : str + the note to add + + Returns + ------- + Exception + returns the exception again + """ + if hasattr(e, "add_note"): + e.add_note(note) + else: + e.args = (e.args[0] + "\n" + note,) + return e + + +def exc_info_matches(exc_info, match, regex=False): + if exc_info.value.__cause__ is not None: + msg = str(exc_info.value.__cause__) + else: + msg = str(exc_info.value) + if regex: + return re.match(".*" + match, msg) + else: + return match in msg + + +def get_undefined_symbols( + func, exclude_signature_type_hints: bool = False, ignore_decorator: bool = False +): + """ + Check the source code of a function and detect any symbols that aren't defined in its scope. + + Parameters + ---------- + func : callable + The function to analyze. + + Returns + ------- + set + A set of undefined symbols. + """ + # Get the source code of the function + source = inspect.getsource(func) + + # De-indent the source code if required + indent = re.match(r"^\s*", source).group() + source = ("\n" + source).replace("\n" + indent, "\n") + + if ignore_decorator: + # Remove the decorator from the source code, i.e. everything before the first + # unindented 'def ' keyword. 
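+        # (A newline is prepended to ``source`` below defensively: the pattern requires
+        # a newline immediately before 'def ', so this guarantees one is present even
+        # if the de-indented source does not already start with one.)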
+ source = re.match( + r"(.*\n)(def .*)", "\n" + source, flags=re.MULTILINE | re.DOTALL + ).group(2) + + # Parse the source code into an AST + tree = ast.parse(source) + + # Define a visitor class to traverse the AST + class SymbolVisitor(ast.NodeVisitor): + + def __init__(self): + # Initialize sets to track defined and used symbols + self.defined_symbols = set() + self.used_symbols = set() + + def visit_FunctionDef(self, node): + # Add function arguments to defined symbols + for arg in node.args.args: + self.defined_symbols.add(arg.arg) + if exclude_signature_type_hints: + # Exclude type hints from the defined symbols + type_hints_visitor = SymbolVisitor() + if node.returns: + type_hints_visitor.visit(node.returns) + for arg in node.args.args: + if arg.annotation: + type_hints_visitor.visit(arg.annotation) + type_hint_symbols = type_hints_visitor.used_symbols - self.used_symbols + self.generic_visit(node) + if exclude_signature_type_hints: + # Remove type hints from the used symbols + self.used_symbols -= type_hint_symbols + + def visit_Assign(self, node): + # Add assigned variables to defined symbols + for target in node.targets: + if isinstance(target, ast.Name): + self.defined_symbols.add(target.id) + self.generic_visit(node) + + def visit_Name(self, node): + # Add all variable names to used symbols + if isinstance(node.ctx, ast.Load): + self.used_symbols.add(node.id) + self.generic_visit(node) + + @property + def undefined_symbols(self): + return self.used_symbols - self.defined_symbols - get_builtin_type_names() + + # Create a visitor instance and visit the AST + visitor = SymbolVisitor() + visitor.visit(tree) + + return visitor.undefined_symbols + + +def get_builtin_type_names(): + """ + Get a list of built-in object type names in Python. + + Returns + ------- + set + A set of built-in object type names. 
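+
+    Examples
+    --------
+    A quick, illustrative check (the exact contents depend on the Python version,
+    but built-in types such as ``int`` are always included, while built-in
+    functions such as ``print`` are not):
+
+    >>> "int" in get_builtin_type_names()
+    True
+    >>> "print" in get_builtin_type_names()
+    False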
+ """ + return set(name for name, obj in vars(builtins).items() if isinstance(obj, type)) + + +def in_stdlib(obj: types.FunctionType | type) -> str | bool: + """Check if a type is in the standard library and return the name of the module if + so.""" + module = inspect.getmodule(obj) + if module is None: + return False + if module.__name__.startswith("builtins"): + return "builtins" + if module.__name__ == "types" and obj.__name__ not in dir(types): + return False + toplevel = module.__name__.split(".")[0] + if toplevel in STDLIB_MODULES: + return toplevel + return False + + +def _stdlib_modules() -> frozenset[str]: + """List all standard library modules.""" + std_lib_modules = set(sys.builtin_module_names) + std_lib_path = sysconfig.get_path("stdlib") + std_lib_modules.update(m[1] for m in pkgutil.iter_modules([std_lib_path])) + return frozenset(std_lib_modules) + + +STDLIB_MODULES: frozenset[str] = _stdlib_modules() + +# Example usage: +# print(list_standard_library_modules()) + + +def plot_workflow( + workflow_task: "workflow.Task", + out_dir: Path, + plot_type: str = "simple", + export: ty.Sequence[str] | None = None, + name: str | None = None, + lazy: ty.Sequence[str] | ty.Set[str] | None = None, +) -> Path | tuple[Path, list[Path]]: + """creating a graph - dotfile and optionally exporting to other formats""" + from pydra.engine.workflow import Workflow + + if inspect.isclass(workflow_task): + workflow_task = workflow_task() + + # Create output directory + out_dir.mkdir(parents=True, exist_ok=True) + + if lazy is None: + lazy = [n for n, v in attrs_values(workflow_task).items() if v is attrs.NOTHING] + + # Construct the workflow object with all of the fields lazy + wf = Workflow.construct(workflow_task, lazy=lazy) + + if not name: + name = f"graph_{type(workflow_task).__name__}" + if plot_type == "simple": + graph = wf.graph() + dotfile = graph.create_dotfile_simple(outdir=out_dir, name=name) + elif plot_type == "nested": + graph = wf.graph() + dotfile = graph.create_dotfile_nested(outdir=out_dir, name=name) + elif plot_type == "detailed": + graph = wf.graph(detailed=True) + dotfile = graph.create_dotfile_detailed(outdir=out_dir, name=name) + else: + raise Exception( + f"type of the graph can be simple, detailed or nested, " + f"but {plot_type} provided" + ) + if not export: + return dotfile + else: + if export is True: + export = ["png"] + elif isinstance(export, str): + export = [export] + formatted_dot = [] + for ext in export: + formatted_dot.append(graph.export_graph(dotfile=dotfile, ext=ext)) + return dotfile, formatted_dot + + +def show_workflow( + workflow_task: "workflow.Task", + plot_type: str = "simple", + lazy: ty.Sequence[str] | ty.Set[str] | None = None, + use_lib: str | None = None, + figsize: tuple[int, int] | None = None, + **kwargs, +) -> None: + """creating a graph and showing it""" + out_dir = Path(tempfile.mkdtemp()) + png_graph = plot_workflow( + workflow_task, + out_dir=out_dir, + plot_type=plot_type, + export="png", + lazy=lazy, + )[1][0] + + if use_lib in ("matplotlib", None): + try: + import matplotlib.pyplot as plt + from matplotlib.image import imread + except ImportError: + if use_lib == "matplotlib": + raise ImportError( + "Please install either matplotlib to display the workflow image." 
+ ) + else: + use_lib = "matplotlib" + + # Read the image + img = imread(png_graph) + + if figsize is not None: + plt.figure(figsize=figsize) + # Display the image + plt.imshow(img, **kwargs) + plt.axis("off") + plt.show() + + if use_lib in ("PIL", None): + try: + from PIL import Image + except ImportError: + msg = " or matplotlib" if use_lib is None else "" + raise ImportError( + f"Please install either Pillow{msg} to display the workflow image." + ) + # Open the PNG image + img = Image.open(png_graph) + + # Display the image + img.show(**kwargs) + + +def attrs_fields(task, exclude_names=()) -> list[attrs.Attribute]: + """Get the fields of a task, excluding some names.""" + return [field for field in task.__attrs_attrs__ if field.name not in exclude_names] + + +def attrs_values(obj, **kwargs) -> dict[str, ty.Any]: + """Get the values of an attrs object.""" + return { + n: v + for n, v in attrs.asdict(obj, recurse=False, **kwargs).items() + if not n.startswith("_") + } + + +class _TaskFieldsList(dict[str, "Field"]): + """A list of task fields. Acts like list in that you can iterate over the values + but also access them like a dict or by attribute.""" + + def __iter__(self): + return iter(self.values()) + + def __getattr__(self, name): + return self[name] + + def __dir__(self): + return sorted(self.keys()) + + +def task_fields(task: "type[Task] | Task") -> _TaskFieldsList: + """List the fields of a task""" + if not inspect.isclass(task): + task = type(task) + if not attrs.has(task): + return _TaskFieldsList() + return _TaskFieldsList( + **{ + f.name: f.metadata[PYDRA_ATTR_METADATA] + for f in attrs.fields(task) + if PYDRA_ATTR_METADATA in f.metadata + } + ) + + +def task_dict(obj, **kwargs) -> dict[str, ty.Any]: + """Get the values of an attrs object.""" + return {f.name: getattr(obj, f.name) for f in task_fields(obj)} + + +def from_list_if_single(obj: ty.Any) -> ty.Any: + """Converts a list to a single item if it is of length == 1""" + from pydra.utils.typing import is_lazy + + if obj is attrs.NOTHING: + return obj + if is_lazy(obj): + return obj + if isinstance(obj, ty.Sequence) and not isinstance(obj, str): + obj = list(obj) + if len(obj) == 1: + return obj[0] + return obj + + +def wrap_text(text: str, width: int = 79, indent_size=4) -> str: + """Wraps text to a given width, respecting word boundaries.""" + indent = " " * indent_size + if len(text) <= width: + return indent + text + lines = [] + for line in text.splitlines(): + if len(line) > width: + words = line.split() + split_line = indent + for word in words: + if len(split_line) + len(word) + 1 > width: + lines.append(split_line.rstrip()) + split_line = indent + split_line += word + " " + lines.append(split_line.rstrip()) + else: + lines.append(indent + line) + return "\n".join(lines).rstrip() + + +def task_help( + task_type: "type[Task] | Task", line_width: int = 79, help_indent: int = 4 +) -> list[str] | None: + """Visit a job object and print its input/output interface.""" + import pydra.compose.base + + if isinstance(task_type, pydra.compose.base.Task): + task_type = type(task_type) + + plugin_name = next( + n + for n, t in get_plugin_classes(pydra.compose, "Task").items() + if issubclass(task_type, t) + ).capitalize() + + header = f"Help for {plugin_name} task '{task_type.__name__}'" + hyphen_line = "-" * len(header) + lines = [hyphen_line, header, hyphen_line] + inputs = task_fields(task_type) + if inputs: + lines.extend(["", "Inputs:"]) + if any(hasattr(i, "position") for i in inputs): + inputs = sorted(inputs) + for inpt 
in inputs: + lines.extend( + inpt.markdown_listing( + line_width, help_indent=help_indent, as_input=True + ).split("\n") + ) + outputs = task_fields(task_type.Outputs) + if outputs: + lines.extend(["", "Outputs:"]) + for output in outputs: + lines.extend( + output.markdown_listing(line_width, help_indent=help_indent).split("\n") + ) + lines.append("") + return lines + + +def print_help(task: "Task[TaskType]") -> None: + """Print help for a task.""" + lines = task_help(task) + print("\n".join(lines)) + + +def position_sort(args): + """ + Sort objects by position, following Python indexing conventions. + + Ordering is positive positions, lowest to highest, followed by unspecified + positions (``None``) and negative positions, lowest to highest. + + >>> position_sort([(None, "d"), (-3, "e"), (2, "b"), (-2, "f"), (5, "c"), (1, "a")]) + ['a', 'b', 'c', 'd', 'e', 'f'] + + Parameters + ---------- + args : list of (int/None, object) tuples + + Returns + ------- + list of objects + """ + import bisect + + pos, none, neg = [], [], [] + for entry in args: + position = entry[0] + if position is None: + # Take existing order + none.append(entry[1]) + elif position < 0: + # Sort negatives while collecting + bisect.insort(neg, entry) + else: + # Sort positives while collecting + bisect.insort(pos, entry) + + return [arg for _, arg in pos] + none + [arg for _, arg in neg] + + +def ensure_list(obj, tuple2list=False): + """ + Return a list whatever the input object is. + + Examples + -------- + >>> ensure_list(list("abc")) + ['a', 'b', 'c'] + >>> ensure_list("abc") + ['abc'] + >>> ensure_list(tuple("abc")) + [('a', 'b', 'c')] + >>> ensure_list(tuple("abc"), tuple2list=True) + ['a', 'b', 'c'] + >>> ensure_list(None) + [] + >>> ensure_list(5.0) + [5.0] + + """ + from pydra.utils.typing import is_lazy + + if obj is attrs.NOTHING: + return attrs.NOTHING + if obj is None: + return [] + # list or numpy.array (this might need some extra flag in case an array has to be converted) + elif isinstance(obj, list) or hasattr(obj, "__array__"): + return obj + elif tuple2list and isinstance(obj, tuple): + return list(obj) + elif is_lazy(obj): + return obj + # elif is_container(obj): + # raise NotImplementedError("just checking for now") + return [obj] + + +def ensure_file_list(filename): + """Return a list given either a string or a list.""" + if isinstance(filename, (str, bytes)): + return [filename] + elif isinstance(filename, list): + return filename + elif is_container(filename): + return [x for x in filename] + + return None + + +# dj: copied from misc +def is_container(item): + """ + Check if item is a container (list, tuple, dict, set). + + Parameters + ---------- + item : :obj:`object` + Input object to check. + + Returns + ------- + output : :obj:`bool` + ``True`` if container ``False`` otherwise. + + """ + if isinstance(item, str): + return False + elif hasattr(item, "__iter__"): + return True + + return False + + +def is_workflow(obj): + """Check whether an object is a :class:`Workflow` instance.""" + from pydra.compose.workflow import Task + from pydra.engine.workflow import Workflow + + return isinstance(obj, (Task, Workflow)) + + +def get_plugin_classes(namespace: types.ModuleType, class_name: str) -> dict[str, type]: + """ + Get all classes within sub-packages of namespace package with a given name, e.g. + "Worker" within "pydra.workers.*" sub-packages. + + Parameters + ---------- + namespace : :obj:`str` + The namespace to search for subclasses. 
+ base_class : :obj:`type` + The base class to search for subclasses of. + + Returns + ------- + :obj:`dict[str, type]` + A dictionary mapping the sub-package name to classes of 'class_name' within + the namespace package + """ + sub_packages = [ + importlib.import_module(f"{namespace.__name__}.{m.name}") + for m in pkgutil.iter_modules(namespace.__path__) + if not m.name.startswith("base") + ] + return { + pkg.__name__.split(".")[-1]: getattr(pkg, class_name) + for pkg in sub_packages + if hasattr(pkg, class_name) + } diff --git a/pydra/utils/hash.py b/pydra/utils/hash.py index 3ba3e97b44..6835f65b8d 100644 --- a/pydra/utils/hash.py +++ b/pydra/utils/hash.py @@ -2,7 +2,10 @@ import sys import os +import re +import ast import struct +import inspect from datetime import datetime import typing as ty import types @@ -21,19 +24,23 @@ from filelock import SoftFileLock import attrs.exceptions from fileformats.core.fileset import FileSet, MockMixin -from . import user_cache_dir, add_exc_note +from fileformats.generic import FsObject +import fileformats.core.exceptions +from pydra.utils.general import in_stdlib, user_cache_root, add_exc_note logger = logging.getLogger("pydra") +FUNCTION_SRC_CHUNK_LEN_DEFAULT = 8192 + try: from typing import Protocol except ImportError: - from typing_extensions import Protocol # type: ignore + from typing import Protocol # type: ignore try: from typing import runtime_checkable except ImportError: - from typing_extensions import runtime_checkable # type: ignore + from typing import runtime_checkable # type: ignore try: @@ -104,7 +111,7 @@ def location_default(cls): try: location = os.environ[cls.LOCATION_ENV_VAR] except KeyError: - location = user_cache_dir / "hashes" + location = user_cache_root / "hashes" return location # the default needs to be an instance method @@ -322,10 +329,19 @@ def bytes_repr(obj: object, cache: Cache) -> Iterator[bytes]: if attrs.has(type(obj)): # Drop any attributes that aren't used in comparisons by default dct = attrs.asdict(obj, recurse=False, filter=lambda a, _: bool(a.eq)) - elif hasattr(obj, "__slots__"): + elif hasattr(obj, "__slots__") and obj.__slots__ is not None: dct = {attr: getattr(obj, attr) for attr in obj.__slots__} else: - dct = obj.__dict__ + + def is_special_or_method(n: str): + return (n.startswith("__") and n.endswith("__")) or inspect.ismethod( + getattr(obj, n) + ) + + try: + dct = {n: v for n, v in obj.__dict__.items() if not is_special_or_method(n)} + except AttributeError: + dct = {n: getattr(obj, n) for n in dir(obj) if not is_special_or_method(n)} yield from bytes_repr_mapping_contents(dct, cache) yield b"}" @@ -439,33 +455,88 @@ def bytes_repr_dict(obj: dict, cache: Cache) -> Iterator[bytes]: yield b"}" +@register_serializer +def bytes_repr_module(obj: types.ModuleType, cache: Cache) -> Iterator[bytes]: + yield b"module:(" + yield hash_single(FsObject(obj.__file__), cache=cache) + yield b")" + + @register_serializer(ty._GenericAlias) @register_serializer(ty._SpecialForm) @register_serializer(type) def bytes_repr_type(klass: type, cache: Cache) -> Iterator[bytes]: - def type_name(tp): + from pydra.utils.general import task_fields + + def type_location(tp: type) -> bytes: + """Return the module and name of the type in a ASCII byte string""" try: - name = tp.__name__ + type_name = tp.__name__ except AttributeError: - name = tp._name - return name + type_name = tp._name + mod_path = ".".join( + p for p in klass.__module__.split(".") if not p.startswith("_") + ) + return f"{mod_path}.{type_name}".encode() yield 
b"type:(" origin = ty.get_origin(klass) - if origin: - yield f"{origin.__module__}.{type_name(origin)}[".encode() - for arg in ty.get_args(klass): + args = ty.get_args(klass) + if origin and args: + yield b"origin:(" + yield from bytes_repr_type(origin, cache) + yield b"),args:(" + for arg in args: if isinstance( arg, list ): # sometimes (e.g. Callable) the args of a type is a list - yield b"[" + yield b"list:(" yield from (b for t in arg for b in bytes_repr_type(t, cache)) - yield b"]" + yield b")" else: yield from bytes_repr_type(arg, cache) - yield b"]" + yield b")" else: - yield f"{klass.__module__}.{type_name(klass)}".encode() + if inspect.isclass(klass) and issubclass(klass, FileSet): + try: + yield b"mime-like:(" + klass.mime_like.encode() + b")" + except fileformats.core.exceptions.FormatDefinitionError: + yield type_location(klass) + elif fields := task_fields(klass): + yield b"fields:(" + yield from bytes_repr_sequence_contents(fields, cache) + yield b")" + if hasattr(klass, "Outputs"): + yield b",outputs:(" + yield from bytes_repr_type(klass.Outputs, cache) + yield b")" + elif in_stdlib(klass): + yield type_location(klass) + else: + try: + dct = { + n: v for n, v in klass.__dict__.items() if not n.startswith("__") + } + except AttributeError: + yield type_location(klass) + else: + yield b"__dict__:(" + yield from bytes_repr_mapping_contents(dct, cache) + yield b")" + # Include annotations + try: + annotations = klass.__annotations__ + except AttributeError: + pass + else: + yield b",annotations:(" + yield from bytes_repr_mapping_contents(annotations, cache) + yield b")" + yield b",mro:(" + yield from ( + b for t in klass.mro()[1:-1] for b in bytes_repr_type(t, cache) + ) + yield b")" yield b")" @@ -519,6 +590,77 @@ def bytes_repr_set(obj: Set, cache: Cache) -> Iterator[bytes]: yield b"}" +@register_serializer +def bytes_repr_code(obj: types.CodeType, cache: Cache) -> Iterator[bytes]: + yield b"code:(" + yield from bytes_repr_sequence_contents( + ( + obj.co_argcount, + obj.co_posonlyargcount, + obj.co_kwonlyargcount, + obj.co_nlocals, + obj.co_flags, + obj.co_code, + obj.co_consts, + obj.co_names, + obj.co_varnames, + obj.co_freevars, + obj.co_name, + obj.co_cellvars, + ), + cache, + ) + yield b")" + + +@register_serializer +def bytes_repr_function(obj: types.FunctionType, cache: Cache) -> Iterator[bytes]: + """Serialize a function, attempting to use the AST of the source code if available + otherwise falling back to the byte-code of the function.""" + yield b"function:(" + if in_stdlib(obj): + yield f"{obj.__module__}.{obj.__name__}".encode() + else: + try: + src = inspect.getsource(obj) + except OSError: + # Fallback to using the bytes representation of the code object + yield from bytes_repr(obj.__code__, cache) + else: + + def dump_ast(node: ast.AST) -> bytes: + return ast.dump( + node, annotate_fields=False, include_attributes=False + ).encode() + + def strip_annotations(node: ast.AST): + """Remove annotations from function arguments.""" + if hasattr(node, "args"): + for arg in node.args.args: + arg.annotation = None + for arg in node.args.kwonlyargs: + arg.annotation = None + if node.args.vararg: + node.args.vararg.annotation = None + if node.args.kwarg: + node.args.kwarg.annotation = None + + indent = re.match(r"(\s*)", src).group(1) + if indent: + src = re.sub(f"^{indent}", "", src, flags=re.MULTILINE) + try: + func_ast = ast.parse(src).body[0] + strip_annotations(func_ast) + if hasattr(func_ast, "args"): + yield dump_ast(func_ast.args) + if hasattr(func_ast, "body"): + 
for stmt in func_ast.body: + yield dump_ast(stmt) + except SyntaxError: + yield src.encode() + yield b")" + + def bytes_repr_mapping_contents(mapping: Mapping, cache: Cache) -> Iterator[bytes]: """Serialize the contents of a mapping @@ -535,6 +677,7 @@ def bytes_repr_mapping_contents(mapping: Mapping, cache: Cache) -> Iterator[byte yield from bytes_repr(key, cache) yield b"=" yield bytes(hash_single(mapping[key], cache)) + yield b"," def bytes_repr_sequence_contents(seq: Sequence, cache: Cache) -> Iterator[bytes]: diff --git a/pydra/utils/misc.py b/pydra/utils/misc.py deleted file mode 100644 index 45b6a5c3ba..0000000000 --- a/pydra/utils/misc.py +++ /dev/null @@ -1,45 +0,0 @@ -from pathlib import Path -import re -import platformdirs -from pydra._version import __version__ - -user_cache_dir = Path( - platformdirs.user_cache_dir( - appname="pydra", - appauthor="nipype", - version=__version__, - ) -) - - -def add_exc_note(e: Exception, note: str) -> Exception: - """Adds a note to an exception in a Python <3.11 compatible way - - Parameters - ---------- - e : Exception - the exception to add the note to - note : str - the note to add - - Returns - ------- - Exception - returns the exception again - """ - if hasattr(e, "add_note"): - e.add_note(note) - else: - e.args = (e.args[0] + "\n" + note,) - return e - - -def exc_info_matches(exc_info, match, regex=False): - if exc_info.value.__cause__ is not None: - msg = str(exc_info.value.__cause__) - else: - msg = str(exc_info.value) - if regex: - return re.match(".*" + match, msg) - else: - return match in msg diff --git a/pydra/utils/mount_identifier.py b/pydra/utils/mount_identifier.py new file mode 100644 index 0000000000..e65a79990c --- /dev/null +++ b/pydra/utils/mount_identifier.py @@ -0,0 +1,147 @@ +"""Functions ported from Nipype 1, after removing parts that were related to py2.""" + +import os +import re +import logging +from pathlib import Path +import typing as ty +import subprocess as sp +from contextlib import contextmanager + +logger = logging.getLogger("pydra") + + +class MountIndentifier: + """Used to check the mount type that given file paths reside on in order to determine + features that can be used (e.g. symlinks)""" + + @classmethod + def on_cifs(cls, path: os.PathLike) -> bool: + """ + Check whether a file path is on a CIFS filesystem mounted in a POSIX host. + + POSIX hosts are assumed to have the ``mount`` command. + + On Windows, Docker mounts host directories into containers through CIFS + shares, which has support for Minshall+French symlinks, or text files that + the CIFS driver exposes to the OS as symlinks. + We have found that under concurrent access to the filesystem, this feature + can result in failures to create or read recently-created symlinks, + leading to inconsistent behavior and ``FileNotFoundError`` errors. + + This check is written to support disabling symlinks on CIFS shares. 
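+
+        Examples
+        --------
+        The result depends on the host's mount table, so this sketch pins it with
+        the ``patch_table`` helper defined further down; the share path used here
+        is hypothetical:
+
+        >>> with MountIndentifier.patch_table([("/mnt/cifs-share", "cifs")]):
+        ...     MountIndentifier.on_cifs("/mnt/cifs-share/data.txt")
+        True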
+ + NB: This function and sub-functions are copied from the nipype.utils.filemanip module + + + NB: Adapted from https://github.com/nipy/nipype + """ + return cls.get_mount(path)[1] == "cifs" + + @classmethod + def on_same_mount(cls, path1: os.PathLike, path2: os.PathLike) -> bool: + """Checks whether two or paths are on the same logical file system""" + return cls.get_mount(path1)[0] == cls.get_mount(path2)[0] + + @classmethod + def get_mount(cls, path: os.PathLike) -> ty.Tuple[Path, str]: + """Get the mount point for a given file-system path + + Parameters + ---------- + path: os.PathLike + the file-system path to identify the mount of + + Returns + ------- + mount_point: os.PathLike + the root of the mount the path sits on + fstype : str + the type of the file-system (e.g. ext4 or cifs)""" + try: + # Only the first match (most recent parent) counts, mount table sorted longest + # to shortest + return next( + (Path(p), t) + for p, t in cls.get_mount_table() + if str(path).startswith(p) + ) + except StopIteration: + return (Path("/"), "ext4") + + @classmethod + def generate_cifs_table(cls) -> ty.List[ty.Tuple[str, str]]: + """ + Construct a reverse-length-ordered list of mount points that fall under a CIFS mount. + + This precomputation allows efficient checking for whether a given path + would be on a CIFS filesystem. + On systems without a ``mount`` command, or with no CIFS mounts, returns an + empty list. + + """ + exit_code, output = sp.getstatusoutput("mount") + return cls.parse_mount_table(exit_code, output) + + @classmethod + def parse_mount_table( + cls, exit_code: int, output: str + ) -> ty.List[ty.Tuple[str, str]]: + """ + Parse the output of ``mount`` to produce (path, fs_type) pairs. + + Separated from _generate_cifs_table to enable testing logic with real + outputs + + """ + # Not POSIX + if exit_code != 0: + return [] + + # Linux mount example: sysfs on /sys type sysfs (rw,nosuid,nodev,noexec) + # ^^^^ ^^^^^ + # OSX mount example: /dev/disk2 on / (hfs, local, journaled) + # ^ ^^^ + pattern = re.compile(r".*? on (/.*?) (?:type |\()([^\s,\)]+)") + + # Keep line and match for error reporting (match == None on failure) + # Ignore empty lines + matches = [(ll, pattern.match(ll)) for ll in output.strip().splitlines() if ll] + + # (path, fstype) tuples, sorted by path length (longest first) + mount_info = sorted( + (match.groups() for _, match in matches if match is not None), + key=lambda x: len(x[0]), + reverse=True, + ) + cifs_paths = [path for path, fstype in mount_info if fstype.lower() == "cifs"] + + # Report failures as warnings + for line, match in matches: + if match is None: + logger.debug("Cannot parse mount line: '%s'", line) + + return [ + mount + for mount in mount_info + if any(mount[0].startswith(path) for path in cifs_paths) + ] + + @classmethod + def get_mount_table(cls) -> ty.List[ty.Tuple[str, str]]: + if cls._mount_table is None: + cls._mount_table = cls.generate_cifs_table() + return cls._mount_table + + @classmethod + @contextmanager + def patch_table(cls, mount_table: ty.List[ty.Tuple[str, str]]): + """Patch the mount table with new values. 
Used in test routines""" + orig_table = cls._mount_table + cls._mount_table = list(mount_table) + try: + yield + finally: + cls._mount_table = orig_table + + _mount_table: ty.Optional[ty.List[ty.Tuple[str, str]]] = None diff --git a/pydra/engine/tests/test_helpers_file.py b/pydra/utils/tests/test_file_handling.py similarity index 86% rename from pydra/engine/tests/test_helpers_file.py rename to pydra/utils/tests/test_file_handling.py index ea5dd2afdc..95fa6c8705 100644 --- a/pydra/engine/tests/test_helpers_file.py +++ b/pydra/utils/tests/test_file_handling.py @@ -1,18 +1,15 @@ import typing as ty import sys +import os from pathlib import Path -import attr from unittest.mock import Mock import pytest from fileformats.generic import File -from ..specs import SpecInfo, ShellSpec -from ..task import ShellCommandTask -from ..helpers_file import ( - ensure_list, - MountIndentifier, - copy_nested_files, - template_update_single, -) +from pydra.compose import shell +from pydra.utils.general import ensure_file_list +from pydra.utils.mount_identifier import MountIndentifier +from pydra.utils.typing import copy_nested_files +from pydra.compose.shell.templating import template_update_single def _ignore_atime(stat): @@ -48,8 +45,8 @@ def _temp_analyze_files_prime(tmpdir): (12.34, None), ], ) -def test_ensure_list(filename, expected): - x = ensure_list(filename) +def test_ensure_file_list(filename, expected): + x = ensure_file_list(filename) assert x == expected @@ -354,66 +351,34 @@ def test_output_template(tmp_path): filename = str(tmp_path / "file.txt") with open(filename, "w") as f: f.write("hello from pydra") - in_file = File(filename) - - my_input_spec = SpecInfo( - name="Input", - fields=[ - ( - "in_file", - attr.ib( - type=File, - metadata={ - "mandatory": True, - "position": 1, - "argstr": "", - "help_string": "input file", - }, - ), - ), - ( - "optional", - attr.ib( - type=ty.Union[Path, bool], - default=False, - metadata={ - "position": 2, - "argstr": "--opt", - "output_file_template": "{in_file}.out", - "help_string": "optional file output", - }, - ), - ), - ], - bases=(ShellSpec,), - ) - class MyCommand(ShellCommandTask): + @shell.define + class MyCommand(shell.Task["MyCommand.Outputs"]): + in_file: File = shell.arg( + position=1, + argstr="", + help="input file", + ) + optional: File | None = shell.outarg( + position=2, + argstr="--opt", + path_template="{in_file}.out", + help="optional file output", + ) + + class Outputs(shell.Outputs): + pass + executable = "my" - input_spec = my_input_spec - - task = MyCommand(in_file=filename) - assert task.cmdline == f"my {filename}" - task.inputs.optional = True - assert task.cmdline == f"my {filename} --opt {task.output_dir / 'file.out'}" - task.inputs.optional = False - assert task.cmdline == f"my {filename}" - task.inputs.optional = "custom-file-out.txt" - assert task.cmdline == f"my {filename} --opt custom-file-out.txt" - - -def test_template_formatting(tmp_path): - field = Mock() - field.name = "grad" - field.argstr = "--grad" - field.metadata = {"output_file_template": ("{in_file}.bvec", "{in_file}.bval")} - inputs = Mock() - inputs_dict = {"in_file": "/a/b/c/file.txt", "grad": True} - - assert template_update_single( - field, - inputs, - inputs_dict_st=inputs_dict, - output_dir=tmp_path, - spec_type="input", - ) == [str(tmp_path / "file.bvec"), str(tmp_path / "file.bval")] + + defn = MyCommand(in_file=filename) + assert defn.cmdline == f"my {filename}" + defn.optional = True + file_out_path = os.path.join(os.getcwd(), "file.out") + if " " in 
file_out_path: + file_out_path = f"'{file_out_path}'" + assert defn.cmdline == f"my {filename} --opt {file_out_path}" + defn.optional = False + assert defn.cmdline == f"my {filename}" + defn.optional = "custom-file-out.txt" + assert defn.cmdline == f"my {filename} --opt custom-file-out.txt" diff --git a/pydra/utils/tests/test_hash.py b/pydra/utils/tests/test_hash.py index de065a03de..6fd7d4ba47 100644 --- a/pydra/utils/tests/test_hash.py +++ b/pydra/utils/tests/test_hash.py @@ -10,13 +10,127 @@ import typing as ty from fileformats.application import Zip, Json from fileformats.text import TextFile -from ..hash import ( +from pydra.utils.hash import ( Cache, bytes_repr, hash_object, register_serializer, PersistentCache, ) +import shutil +import random +from fileformats.generic import Directory, File +from pydra.utils.hash import hash_function + + +def test_hash_file(tmpdir): + outdir = Path(tmpdir) + with open(outdir / "test.file", "w") as fp: + fp.write("test") + assert ( + hash_function(File(outdir / "test.file")) == "f32ab20c4a86616e32bf2504e1ac5a22" + ) + + +def test_hashfun_float(): + import math + + pi_50 = 3.14159265358979323846264338327950288419716939937510 + pi_15 = 3.141592653589793 + pi_10 = 3.1415926536 + # comparing for x that have the same x.as_integer_ratio() + assert ( + math.pi.as_integer_ratio() + == pi_50.as_integer_ratio() + == pi_15.as_integer_ratio() + ) + assert hash_function(math.pi) == hash_function(pi_15) == hash_function(pi_50) + # comparing for x that have different x.as_integer_ratio() + assert math.pi.as_integer_ratio() != pi_10.as_integer_ratio() + assert hash_function(math.pi) != hash_function(pi_10) + + +def test_hash_function_dict(): + dict1 = {"a": 10, "b": 5} + dict2 = {"b": 5, "a": 10} + assert hash_function(dict1) == hash_function(dict2) + + +def test_hash_function_list_tpl(): + lst = [2, 5.6, "ala"] + tpl = (2, 5.6, "ala") + assert hash_function(lst) != hash_function(tpl) + + +def test_hash_function_list_dict(): + lst = [2, {"a": "ala", "b": 1}] + hash_function(lst) + + +def test_hash_function_files(tmp_path: Path): + file_1 = tmp_path / "file_1.txt" + file_2 = tmp_path / "file_2.txt" + file_1.write_text("hello") + file_2.write_text("hello") + + assert hash_function(File(file_1)) == hash_function(File(file_2)) + + +def test_hash_function_dir_and_files_list(tmp_path: Path): + dir1 = tmp_path / "foo" + dir2 = tmp_path / "bar" + for d in (dir1, dir2): + d.mkdir() + for i in range(3): + f = d / f"{i}.txt" + f.write_text(str(i)) + + assert hash_function(Directory(dir1)) == hash_function(Directory(dir2)) + file_list1: ty.List[File] = [File(f) for f in dir1.iterdir()] + file_list2: ty.List[File] = [File(f) for f in dir2.iterdir()] + assert hash_function(file_list1) == hash_function(file_list2) + + +def test_hash_function_files_mismatch(tmp_path: Path): + file_1 = tmp_path / "file_1.txt" + file_2 = tmp_path / "file_2.txt" + file_1.write_text("hello") + file_2.write_text("hi") + + assert hash_function(File(file_1)) != hash_function(File(file_2)) + + +def test_hash_function_nested(tmp_path: Path): + dpath = tmp_path / "dir" + dpath.mkdir() + hidden = dpath / ".hidden" + nested = dpath / "nested" + hidden.mkdir() + nested.mkdir() + file_1 = dpath / "file_1.txt" + file_2 = hidden / "file_2.txt" + file_3 = nested / ".file_3.txt" + file_4 = nested / "file_4.txt" + + for fx in [file_1, file_2, file_3, file_4]: + fx.write_text(str(random.randint(0, 1000))) + + nested_dir = Directory(dpath) + + orig_hash = nested_dir.hash() + + nohidden_hash = 
nested_dir.hash(ignore_hidden_dirs=True, ignore_hidden_files=True) + nohiddendirs_hash = nested_dir.hash(ignore_hidden_dirs=True) + nohiddenfiles_hash = nested_dir.hash(ignore_hidden_files=True) + + assert orig_hash != nohidden_hash + assert orig_hash != nohiddendirs_hash + assert orig_hash != nohiddenfiles_hash + + os.remove(file_3) + assert nested_dir.hash() == nohiddenfiles_hash + shutil.rmtree(hidden) + assert nested_dir.hash() == nohidden_hash @pytest.fixture @@ -50,7 +164,7 @@ def test_bytes_repr_builtins(): assert complex_repr == b"complex:" + bytes(16) # Dicts are sorted by key, and values are hashed dict_repr = join_bytes_repr({"b": "c", "a": 0}) - assert re.match(rb"dict:{str:1:a=.{16}str:1:b=.{16}}$", dict_repr) + assert re.match(rb"dict:{str:1:a=.{16},str:1:b=.{16},}$", dict_repr) # Lists and tuples concatenate hashes of their contents list_repr = join_bytes_repr([1, 2, 3]) assert re.match(rb"list:\(.{48}\)$", list_repr) @@ -75,7 +189,7 @@ def test_bytes_repr_builtins(): (1, "6dc1db8d4dcdd8def573476cbb90cce0"), (12345678901234567890, "2b5ba668c1e8ea4902361b8d81e53074"), (1.0, "29492927b2e505840235e15a5be9f79a"), - ({"b": "c", "a": 0}, "2405cd36f4e4b6318c033f32db289f7d"), + ({"b": "c", "a": 0}, "04e5c65ec2269775d3b9ccecaf10da38"), ([1, 2, 3], "2f8902ff90f63d517bd6f6e6111e15b8"), ((1, 2, 3), "054a7b31c29e7875a6f83ff1dcb4841b"), ], @@ -142,7 +256,7 @@ def __init__(self, x): self.x = x obj_repr = join_bytes_repr(MyClass(1)) - assert re.match(rb".*\.MyClass:{str:1:x=.{16}}", obj_repr) + assert re.match(rb".*\.MyClass:{str:1:x=.{16},}", obj_repr) def test_bytes_repr_slots_obj(): @@ -153,7 +267,7 @@ def __init__(self, x): self.x = x obj_repr = join_bytes_repr(MyClass(1)) - assert re.match(rb".*\.MyClass:{str:1:x=.{16}}", obj_repr) + assert re.match(rb".*\.MyClass:{str:1:x=.{16},}", obj_repr) def test_bytes_repr_attrs_slots(): @@ -162,7 +276,7 @@ class MyClass: x: int obj_repr = join_bytes_repr(MyClass(1)) - assert re.match(rb".*\.MyClass:{str:1:x=.{16}}", obj_repr) + assert re.match(rb".*\.MyClass:{str:1:x=.{16},}", obj_repr) def test_bytes_repr_attrs_no_slots(): @@ -171,7 +285,7 @@ class MyClass: x: int obj_repr = join_bytes_repr(MyClass(1)) - assert re.match(rb".*\.MyClass:{str:1:x=.{16}}", obj_repr) + assert re.match(rb".*\.MyClass:{str:1:x=.{16},}", obj_repr) def test_bytes_repr_type1(): @@ -181,31 +295,44 @@ def test_bytes_repr_type1(): def test_bytes_repr_type1a(): obj_repr = join_bytes_repr(Zip[Json]) - assert obj_repr == rb"type:(fileformats.application.archive.Json__Zip)" + assert obj_repr == rb"type:(mime-like:(application/json+zip))" def test_bytes_repr_type2(): T = ty.TypeVar("T") class MyClass(ty.Generic[T]): - pass + + a: int + b: str + + def method(self, f: float) -> float: + return f + 1 obj_repr = join_bytes_repr(MyClass[int]) - assert ( - obj_repr == b"type:(pydra.utils.tests.test_hash.MyClass[type:(builtins.int)])" + assert re.match( + ( + rb"type:\(origin:\(type:\(__dict__:\(str:6:method=.{16},\),annotations:\(str:1:a=.{16}," + rb"str:1:b=.{16},\),mro:\(type:\(typing.Generic\)\)\)\),args:\(type:\(builtins.int\)\)\)" + ), + obj_repr, ) def test_bytes_special_form1(): obj_repr = join_bytes_repr(ty.Union[int, float]) - assert obj_repr == b"type:(typing.Union[type:(builtins.int)type:(builtins.float)])" + assert obj_repr == ( + b"type:(origin:(type:(typing.Union)),args:(type:(builtins.int)" + b"type:(builtins.float)))" + ) @pytest.mark.skipif(condition=sys.version_info < (3, 10), reason="requires python3.10") def test_bytes_special_form1a(): obj_repr = 
join_bytes_repr(int | float) - assert ( - obj_repr == b"type:(types.UnionType[type:(builtins.int)type:(builtins.float)])" + assert obj_repr == ( + b"type:(origin:(type:(types.UnionType)),args:(type:(builtins.int)" + b"type:(builtins.float)))" ) @@ -216,30 +343,34 @@ def test_bytes_special_form2(): def test_bytes_special_form3(): obj_repr = join_bytes_repr(ty.Optional[Path]) - assert ( - obj_repr == b"type:(typing.Union[type:(pathlib.Path)type:(builtins.NoneType)])" + assert obj_repr == ( + b"type:(origin:(type:(typing.Union)),args:(type:(pathlib.Path)" + b"type:(builtins.NoneType)))" ) @pytest.mark.skipif(condition=sys.version_info < (3, 10), reason="requires python3.10") def test_bytes_special_form3a(): obj_repr = join_bytes_repr(Path | None) - assert ( - obj_repr - == b"type:(types.UnionType[type:(pathlib.Path)type:(builtins.NoneType)])" + assert obj_repr == ( + b"type:(origin:(type:(types.UnionType)),args:(type:(pathlib.Path)" + b"type:(builtins.NoneType)))" ) def test_bytes_special_form4(): obj_repr = join_bytes_repr(ty.Type[Path]) - assert obj_repr == b"type:(builtins.type[type:(pathlib.Path)])" + assert ( + obj_repr == b"type:(origin:(type:(builtins.type)),args:(type:(pathlib.Path)))" + ) def test_bytes_special_form5(): obj_repr = join_bytes_repr(ty.Callable[[Path, int], ty.Tuple[float, str]]) assert obj_repr == ( - b"type:(collections.abc.Callable[[type:(pathlib.Path)type:(builtins.int)]" - b"type:(builtins.tuple[type:(builtins.float)type:(builtins.str)])])" + b"type:(origin:(type:(collections.abc.Callable)),args:(list:(type:(pathlib.Path)" + b"type:(builtins.int))type:(origin:(type:(builtins.tuple))," + b"args:(type:(builtins.float)type:(builtins.str)))))" ) @@ -423,8 +554,7 @@ def __repr__(self): with pytest.raises( TypeError, match=( - "unhashable\nand therefore cannot hash `A\(\)` of type " - "`pydra.utils.tests.test_hash.A`" + r"unhashable\nand therefore cannot hash `A\(\)` of type `.*\.test_hash\.A`" ), ): hash_object(A()) diff --git a/pydra/utils/tests/test_messenger.py b/pydra/utils/tests/test_messenger.py index 5abbf85924..786ab231a2 100644 --- a/pydra/utils/tests/test_messenger.py +++ b/pydra/utils/tests/test_messenger.py @@ -1,7 +1,12 @@ from contextlib import redirect_stdout import io import pytest -from ..messenger import PrintMessenger, FileMessenger, collect_messages, make_message +from pydra.utils.messenger import ( + PrintMessenger, + FileMessenger, + collect_messages, + make_message, +) def test_print_messenger(): diff --git a/pydra/utils/tests/test_typing.py b/pydra/utils/tests/test_typing.py index f83eedbd8c..de3f0d962b 100644 --- a/pydra/utils/tests/test_typing.py +++ b/pydra/utils/tests/test_typing.py @@ -4,29 +4,39 @@ import typing as ty from pathlib import Path import tempfile +import traceback +from unittest.mock import Mock import pytest -from pydra import mark -from ...engine.specs import File, LazyOutField, MultiInputObj -from ..typing import TypeParser -from pydra import Workflow +from pydra.compose import python +from fileformats.generic import File +from pydra.engine.lazy import LazyOutField +from pydra.compose import workflow +from pydra.utils.typing import TypeParser, MultiInputObj from fileformats.application import Json, Yaml, Xml from .utils import ( - generic_func_task, + GenericFuncTask, GenericShellTask, - specific_func_task, + SpecificFuncTask, SpecificShellTask, - other_specific_func_task, + OtherSpecificFuncTask, OtherSpecificShellTask, MyFormatX, MyOtherFormatX, MyHeader, ) -from pydra.utils import exc_info_matches +from pydra.utils.general 
import exc_info_matches + + +def exc_to_str(exc_info): + return "".join( + traceback.format_exception(exc_info.type, exc_info.value, exc_info.tb) + ) def lz(tp: ty.Type): """convenience method for creating a LazyField of type 'tp'""" - return LazyOutField(name="foo", field="boo", type=tp) + node = Mock() + return LazyOutField(node=node, field="boo", type=tp) PathTypes = ty.Union[str, os.PathLike] @@ -485,29 +495,27 @@ def test_type_coercion_fail2a(): def test_type_coercion_fail3(): - with pytest.raises(TypeError) as exc_info: + with pytest.raises(TypeError, match="Incorrect type for field"): TypeParser(ty.Sequence, coercible=[(ty.Sequence, ty.Sequence)])( {"a": 1, "b": 2} ) - assert exc_info_matches(exc_info, "doesn't match any of the explicit inclusion") def test_type_coercion_fail4(): - with pytest.raises(TypeError) as exc_info: + with pytest.raises(TypeError, match="Incorrect type for field"): TypeParser(ty.Sequence, coercible=[(ty.Any, ty.Any)])({"a": 1}) - assert exc_info_matches(exc_info, "Cannot coerce {'a': 1} into") def test_type_coercion_fail5(): - with pytest.raises(TypeError) as exc_info: + with pytest.raises(TypeError) as excinfo: TypeParser(ty.List[int], coercible=[(ty.Any, ty.Any)])(1) - assert exc_info_matches(exc_info, "as 1 is not iterable") + assert "as 1 is not iterable" in exc_to_str(excinfo) def test_type_coercion_fail6(): - with pytest.raises(TypeError) as exc_info: + with pytest.raises(TypeError) as excinfo: TypeParser(ty.List[ty.Dict[str, str]], coercible=[(ty.Any, ty.Any)])((1, 2, 3)) - assert exc_info_matches(exc_info, "is not a mapping type") + assert "is not a mapping type" in exc_to_str(excinfo) def test_type_coercion_realistic(): @@ -520,26 +528,22 @@ def test_type_coercion_realistic(): Path.touch(yet_another_file) file_list = [File(p) for p in (a_file, another_file, yet_another_file)] - @mark.task - @mark.annotate({"return": {"a": ty.List[File], "b": ty.List[str]}}) + @python.define(outputs={"a": ty.List[File], "b": ty.List[str]}) def f(x: ty.List[File], y: ty.Dict[str, ty.List[File]]): return list(itertools.chain(x, *y.values())), list(y.keys()) - task = f(x=file_list, y={"a": file_list[1:]}) + defn = f(x=file_list, y={"a": file_list[1:]}) + outputs = defn() - TypeParser(ty.List[str])(task.lzout.a) # pylint: disable=no-member + TypeParser(ty.List[str])(outputs.a) # pylint: disable=no-member with pytest.raises( TypeError, + match=r"Incorrect type for field:", ) as exc_info: - TypeParser(ty.List[int])(task.lzout.a) # pylint: disable=no-member - assert exc_info_matches( - exc_info, - match=r"Cannot coerce into ", - regex=True, - ) + TypeParser(ty.List[int])(outputs.a) # pylint: disable=no-member with pytest.raises(TypeError) as exc_info: - task.inputs.x = "bad-value" + defn.x = "bad-value" assert exc_info_matches( exc_info, match="Cannot coerce 'bad-value' into " ) @@ -682,9 +686,9 @@ def test_type_matches(): @pytest.fixture(params=["func", "shell"]) -def generic_task(request): +def GenericTask(request): if request.param == "func": - return generic_func_task + return GenericFuncTask elif request.param == "shell": return GenericShellTask else: @@ -692,9 +696,9 @@ def generic_task(request): @pytest.fixture(params=["func", "shell"]) -def specific_task(request): +def SpecificTask(request): if request.param == "func": - return specific_func_task + return SpecificFuncTask elif request.param == "shell": return SpecificShellTask else: @@ -702,125 +706,72 @@ def specific_task(request): @pytest.fixture(params=["func", "shell"]) -def other_specific_task(request): +def 
OtherSpecificTask(request): if request.param == "func": - return other_specific_func_task + return OtherSpecificFuncTask elif request.param == "shell": return OtherSpecificShellTask else: assert False -def test_typing_implicit_cast_from_super(tmp_path, generic_task, specific_task): +def test_typing_implicit_cast_from_super(tmp_path, GenericTask, SpecificTask): """Check the casting of lazy fields and whether specific file-sets can be recovered from generic `File` classes""" - wf = Workflow( - name="test", - input_spec={"in_file": MyFormatX}, - output_spec={"out_file": MyFormatX}, - ) - - wf.add( - specific_task( - in_file=wf.lzin.in_file, - name="specific1", - ) - ) - - wf.add( # Generic task - generic_task( - in_file=wf.specific1.lzout.out, - name="generic", - ) - ) - - wf.add( - specific_task( - in_file=wf.generic.lzout.out, - name="specific2", - ) - ) - - wf.set_output( - [ - ("out_file", wf.specific2.lzout.out), - ] - ) + @workflow.define(outputs=["out_file"]) + def Workflow(in_file: MyFormatX) -> MyFormatX: + specific1 = workflow.add(SpecificTask(in_file=in_file)) + generic = workflow.add(GenericTask(in_file=specific1.out)) # Generic task + specific2 = workflow.add(SpecificTask(in_file=generic.out), name="specific2") + return specific2.out in_file = MyFormatX.sample() - result = wf(in_file=in_file, plugin="serial") + outputs = Workflow(in_file=in_file)() - out_file: MyFormatX = result.output.out_file + out_file: MyFormatX = outputs.out_file assert type(out_file) is MyFormatX assert out_file.parent != in_file.parent assert type(out_file.header) is MyHeader assert out_file.header.parent != in_file.header.parent -def test_typing_cast(tmp_path, specific_task, other_specific_task): +@pytest.mark.flaky(reruns=5) +def test_typing_cast(tmp_path, SpecificTask, OtherSpecificTask): """Check the casting of lazy fields and whether specific file-sets can be recovered from generic `File` classes""" - wf = Workflow( - name="test", - input_spec={"in_file": MyFormatX}, - output_spec={"out_file": MyFormatX}, - ) + @workflow.define(outputs=["out_file"]) + def Workflow(in_file: MyFormatX) -> MyFormatX: + entry = workflow.add(SpecificTask(in_file=in_file)) - wf.add( - specific_task( - in_file=wf.lzin.in_file, - name="entry", - ) - ) + with pytest.raises(TypeError) as exc_info: + # No cast of generic task output to MyFormatX + workflow.add(OtherSpecificTask(in_file=entry.out)) # Generic task + assert exc_info_matches(exc_info, "Cannot coerce") - with pytest.raises(TypeError) as exc_info: - # No cast of generic task output to MyFormatX - wf.add( # Generic task - other_specific_task( - in_file=wf.entry.lzout.out, - name="inner", - ) + inner = workflow.add( # Generic task + OtherSpecificTask(in_file=workflow.cast(entry.out, MyOtherFormatX)) ) - assert exc_info_matches(exc_info, "Cannot coerce") - wf.add( # Generic task - other_specific_task( - in_file=wf.entry.lzout.out.cast(MyOtherFormatX), - name="inner", - ) - ) + with pytest.raises(TypeError) as exc_info: + # No cast of generic task output to MyFormatX + workflow.add(SpecificTask(in_file=inner.out)) - with pytest.raises(TypeError) as exc_info: - # No cast of generic task output to MyFormatX - wf.add( - specific_task( - in_file=wf.inner.lzout.out, - name="exit", - ) - ) - assert exc_info_matches(exc_info, "Cannot coerce") + assert exc_info_matches(exc_info, "Cannot coerce") - wf.add( - specific_task( - in_file=wf.inner.lzout.out.cast(MyFormatX), - name="exit", + exit = workflow.add( + SpecificTask(in_file=workflow.cast(inner.out, MyFormatX)), name="exit" ) - 
) - wf.set_output( - [ - ("out_file", wf.exit.lzout.out), - ] - ) + return exit.out in_file = MyFormatX.sample() - result = wf(in_file=in_file, plugin="serial") + outputs = Workflow(in_file=in_file)() - out_file: MyFormatX = result.output.out_file + out_file: MyFormatX = outputs.out_file assert type(out_file) is MyFormatX assert out_file.parent != in_file.parent assert type(out_file.header) is MyHeader diff --git a/pydra/utils/tests/utils.py b/pydra/utils/tests/utils.py index 3582fa9eda..1d0695c563 100644 --- a/pydra/utils/tests/utils.py +++ b/pydra/utils/tests/utils.py @@ -1,11 +1,9 @@ -from fileformats.generic import File +from fileformats.generic import File, BinaryFile from fileformats.core.mixin import WithSeparateHeader, WithMagicNumber -from pydra import mark -from pydra.engine.task import ShellCommandTask -from pydra.engine import specs +from pydra.compose import shell, python -class MyFormat(WithMagicNumber, File): +class MyFormat(WithMagicNumber, BinaryFile): ext = ".my" magic_number = b"MYFORMAT" @@ -18,164 +16,81 @@ class MyFormatX(WithSeparateHeader, MyFormat): header_type = MyHeader -class MyOtherFormatX(WithMagicNumber, WithSeparateHeader, File): +class MyOtherFormatX(WithMagicNumber, WithSeparateHeader, BinaryFile): magic_number = b"MYFORMAT" ext = ".my" header_type = MyHeader -@mark.task -def generic_func_task(in_file: File) -> File: +@python.define +def GenericFuncTask(in_file: File) -> File: return in_file -generic_shell_input_fields = [ - ( - "in_file", - File, - { - "help_string": "the input file", - "argstr": "", - "copyfile": "copy", - }, - ), - ( - "out", - str, - { - "help_string": "output file name", - "argstr": "", - "position": -1, - "output_file_template": "{in_file}", - }, - ), -] - -generic_shell_input_spec = specs.SpecInfo( - name="Input", fields=generic_shell_input_fields, bases=(specs.ShellSpec,) -) - -generic_shell_output_fields = [ - ( - "out", - File, - { - "help_string": "output file", - }, - ), -] -generic_shelloutput_spec = specs.SpecInfo( - name="Output", fields=generic_shell_output_fields, bases=(specs.ShellOutSpec,) -) - - -class GenericShellTask(ShellCommandTask): - input_spec = generic_shell_input_spec - output_spec = generic_shelloutput_spec +@shell.define +class GenericShellTask(shell.Task["GenericShellTask.Outputs"]): + """class with customized input and executables""" + + in_file: File = shell.arg( + help="the input file", + argstr="", + copy_mode="copy", + ) + + class Outputs(shell.Outputs): + out: File = shell.outarg( + help="output file name", + argstr="", + position=-1, + path_template="{in_file}", + ) + executable = "echo" -@mark.task -def specific_func_task(in_file: MyFormatX) -> MyFormatX: +@python.define +def SpecificFuncTask(in_file: MyFormatX) -> MyFormatX: return in_file -specific_shell_input_fields = [ - ( - "in_file", - MyFormatX, - { - "help_string": "the input file", - "argstr": "", - "copyfile": "copy", - "sep": " ", - }, - ), - ( - "out", - str, - { - "help_string": "output file name", - "argstr": "", - "position": -1, - "output_file_template": "{in_file}", # Pass through un-altered - }, - ), -] - -specific_shell_input_spec = specs.SpecInfo( - name="Input", fields=specific_shell_input_fields, bases=(specs.ShellSpec,) -) - -specific_shell_output_fields = [ - ( - "out", - MyFormatX, - { - "help_string": "output file", - }, - ), -] -specific_shelloutput_spec = specs.SpecInfo( - name="Output", fields=specific_shell_output_fields, bases=(specs.ShellOutSpec,) -) - - -class SpecificShellTask(ShellCommandTask): - input_spec = 
specific_shell_input_spec - output_spec = specific_shelloutput_spec +@shell.define +class SpecificShellTask(shell.Task["SpecificShellTask.Outputs"]): executable = "echo" + in_file: MyFormatX = shell.arg( + help="the input file", + argstr="", + copy_mode="copy", + ) + + class Outputs(shell.Outputs): + out: MyFormatX = shell.outarg( + help="output file name", + argstr="", + position=-1, + path_template="{in_file}", # Pass through un-altered + ) -@mark.task -def other_specific_func_task(in_file: MyOtherFormatX) -> MyOtherFormatX: + +@python.define +def OtherSpecificFuncTask(in_file: MyOtherFormatX) -> MyOtherFormatX: return in_file -other_specific_shell_input_fields = [ - ( - "in_file", - MyOtherFormatX, - { - "help_string": "the input file", - "argstr": "", - "copyfile": "copy", - "sep": " ", - }, - ), - ( - "out", - str, - { - "help_string": "output file name", - "argstr": "", - "position": -1, - "output_file_template": "{in_file}", # Pass through un-altered - }, - ), -] - -other_specific_shell_input_spec = specs.SpecInfo( - name="Input", fields=other_specific_shell_input_fields, bases=(specs.ShellSpec,) -) - -other_specific_shell_output_fields = [ - ( - "out", - MyOtherFormatX, - { - "help_string": "output file", - }, - ), -] -other_specific_shelloutput_spec = specs.SpecInfo( - name="Output", - fields=other_specific_shell_output_fields, - bases=(specs.ShellOutSpec,), -) - - -class OtherSpecificShellTask(ShellCommandTask): - input_spec = other_specific_shell_input_spec - output_spec = other_specific_shelloutput_spec +class OtherSpecificShellTask(shell.Task): + + in_file: MyOtherFormatX = shell.arg( + help="the input file", + argstr="", + copy_mode="copy", + ) + + class Outputs(shell.Outputs): + out: MyOtherFormatX = shell.outarg( + help="output file name", + argstr="", + position=-1, + path_template="{in_file}", # Pass through un-altered + ) + executable = "echo" diff --git a/pydra/utils/typing.py b/pydra/utils/typing.py index e40f928047..1aa6f762c8 100644 --- a/pydra/utils/typing.py +++ b/pydra/utils/typing.py @@ -1,33 +1,32 @@ import itertools import inspect from pathlib import Path +import collections.abc import os from copy import copy import sys import types import typing as ty import logging -import attr -from ..engine.specs import ( - LazyField, - StateArray, - MultiInputObj, - MultiOutputObj, -) -from ..utils import add_exc_note -from fileformats import field +import attrs +from fileformats import field, core, generic +from pydra.utils.general import add_exc_note +from pydra.utils.mount_identifier import MountIndentifier try: from typing import get_origin, get_args except ImportError: # Python < 3.8 - from typing_extensions import get_origin, get_args # type: ignore + from typing import get_origin, get_args # type: ignore if sys.version_info >= (3, 10): UNION_TYPES = (ty.Union, types.UnionType) else: UNION_TYPES = (ty.Union,) +if ty.TYPE_CHECKING: + from pydra.engine.lazy import LazyField + logger = logging.getLogger("pydra") NO_GENERIC_ISSUBCLASS = sys.version_info.major == 3 and sys.version_info.minor < 10 @@ -46,6 +45,45 @@ TypeOrAny = ty.Union[type, ty.Any] +# These are special types that are checked for in the construction of input/output specs +# and special converters inserted into the attrs fields. 
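The class-based `shell.define` tasks above replace the old `SpecInfo`/`ShellCommandTask` boilerplate. As a usage sketch (the `CopyFile` task, its field names and its output template are hypothetical, but each building block mirrors the definitions above):

import tempfile
from pathlib import Path

from fileformats.generic import File
from pydra.compose import shell


@shell.define
class CopyFile(shell.Task["CopyFile.Outputs"]):
    """Toy task that copies its input file to a templated output path."""

    executable = "cp"

    in_file: File = shell.arg(help="file to copy", argstr="")

    class Outputs(shell.Outputs):
        out_file: File = shell.outarg(
            help="destination path",
            argstr="",
            position=-1,
            path_template="{in_file}_copy",
        )


src = Path(tempfile.mkdtemp()) / "example.txt"
src.write_text("hello")
outputs = CopyFile(in_file=src)()  # runs `cp <in_file> <templated output path>`
assert outputs.out_file.fspath.read_text() == "hello"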
+ + +class MultiInputObj(list, ty.Generic[T]): + pass + + +MultiInputFile = MultiInputObj[generic.File] + + +# Since we can't create a NewType from a type union, we add a dummy type to the union +# so we can detect the MultiOutput in the input/output task creation +class MultiOutputType: + pass + + +MultiOutputObj = ty.Union[list, object, MultiOutputType] +MultiOutputFile = ty.Union[generic.File, ty.List[generic.File], MultiOutputType] + +OUTPUT_TEMPLATE_TYPES = ( + Path, + ty.List[Path], + ty.Union[Path, bool], + ty.Union[ty.List[Path], bool], + ty.List[ty.List[Path]], +) + + +class StateArray(ty.List[T]): + """an array of values from, or to be split over in an array of nodes (see TaskBase.split()), + multiple nodes of the same task. Used in type-checking to differentiate between list + types and values for multiple nodes + """ + + def __repr__(self): + return f"{type(self).__name__}(" + ", ".join(repr(i) for i in self) + ")" + + class TypeParser(ty.Generic[T]): """A callable which can be used as a converter for attrs.fields to check whether an object or LazyField matches the specified field type, or can be @@ -86,6 +124,8 @@ class TypeParser(ty.Generic[T]): COERCIBLE_DEFAULT: ty.Tuple[ty.Tuple[type, type], ...] = ( ( (ty.Sequence, ty.Sequence), + (ty.Sequence, collections.abc.Set), + (collections.abc.Set, ty.Sequence), (ty.Mapping, ty.Mapping), (Path, os.PathLike), (str, os.PathLike), @@ -159,7 +199,7 @@ def expand_pattern(t): self.superclass_auto_cast = superclass_auto_cast self.match_any_of_union = match_any_of_union - def __call__(self, obj: ty.Any) -> ty.Union[T, LazyField[T]]: + def __call__(self, obj: ty.Any) -> T: """Attempts to coerce the object to the specified type, unless the value is a LazyField where the type of the field is just checked instead or an attrs.NOTHING where it is simply returned. @@ -180,24 +220,25 @@ def __call__(self, obj: ty.Any) -> ty.Union[T, LazyField[T]]: if the coercion is not possible, or not specified by the `coercible`/`not_coercible` parameters, then a TypeError is raised """ + coerced: T - if obj is attr.NOTHING: - coerced = attr.NOTHING # type: ignore[assignment] - elif isinstance(obj, LazyField): + if obj is attrs.NOTHING: + coerced = attrs.NOTHING # type: ignore[assignment] + elif is_lazy(obj): try: - self.check_type(obj.type) + self.check_type(obj._type) except TypeError as e: if self.superclass_auto_cast: try: # Check whether the type of the lazy field isn't a superclass of # the type to check against, and if so, allow it due to permissive # typing rules. 
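An illustrative sketch (not part of the patch) of how the `MultiInputObj` type defined above is expected to behave once it reaches the `coerce_multi_input` handling added further down: a bare value is wrapped in a single-element list, while an existing sequence is coerced element-wise:

from pydra.utils.typing import MultiInputObj, TypeParser

parser = TypeParser(MultiInputObj[int])
assert parser(3) == [3]                # lone value wrapped in a list
assert parser([1, 2, 3]) == [1, 2, 3]  # sequence coerced element-wise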
- TypeParser(obj.type, match_any_of_union=True).check_type( + TypeParser(obj._type, match_any_of_union=True).check_type( self.tp ) except TypeError: raise TypeError( - f"Incorrect type for lazy field{self.label_str}: {obj.type!r} " + f"Incorrect type for lazy field{self.label_str}: {obj._type!r} " f"is not a subclass or superclass of {self.tp} (and will not " "be able to be coerced to one that is)" ) from e @@ -211,17 +252,25 @@ def __call__(self, obj: ty.Any) -> ty.Union[T, LazyField[T]]: ) else: raise TypeError( - f"Incorrect type for lazy field{self.label_str}: {obj.type!r} " + f"Incorrect type for lazy field{self.label_str}: {obj._type!r} " f"is not a subclass of {self.tp} (and will not be able to be " "coerced to one that is)" ) from e coerced = obj # type: ignore + if obj._type is not ty.Any: + # Used to check whether the type of the field can be changed + obj._type_checked = True elif isinstance(obj, StateArray): coerced = StateArray(self(o) for o in obj) # type: ignore[assignment] else: try: coerced = self.coerce(obj) except TypeError as e: + if obj is None: + raise TypeError( + f"Mandatory field{self.label_str} of type {self.tp} was not " + "provided a value (i.e. a value that wasn't None) " + ) from None raise TypeError( f"Incorrect type for field{self.label_str}: {obj!r} is not of type " f"{self.tp} (and cannot be coerced to it)" @@ -235,11 +284,13 @@ def coerce(self, object_: ty.Any) -> T: def expand_and_coerce(obj, pattern: ty.Union[type, tuple]): """Attempt to expand the object along the lines of the coercion pattern""" - if obj is attr.NOTHING: - return attr.NOTHING + if obj is attrs.NOTHING: + return attrs.NOTHING if not isinstance(pattern, tuple): return coerce_basic(obj, pattern) origin, pattern_args = pattern + if origin == MultiInputObj: + return coerce_multi_input(obj, pattern_args) if origin in UNION_TYPES: return coerce_union(obj, pattern_args) if origin is type: @@ -292,6 +343,21 @@ def coerce_union(obj, pattern_args): + "\n\n".join(f"{a} -> {e}" for a, e in zip(pattern_args, reasons)) ) + def coerce_multi_input(obj, pattern_args): + # Attempt to coerce the object into arg type of the MultiInputObj first, + # and if that fails, try to coerce it into a list of the arg type + try: + return coerce_sequence(list, obj, pattern_args) + except TypeError as e1: + try: + return [expand_and_coerce(obj, pattern_args[0])] + except TypeError as e2: + raise TypeError( + f"Could not coerce object ({obj!r}) to MultiInputObj[{pattern_args[0]}] " + f"either as sequence of {pattern_args[0]} ({e1}) or a single {pattern_args[0]} " + f"object to be wrapped in a list {e2}" + ) from e2 + def coerce_mapping( obj: ty.Mapping, type_: ty.Type[ty.Mapping], pattern_args: list ): @@ -368,26 +434,7 @@ def coerce_obj(obj, type_): f"Cannot coerce {obj!r} into {type_}{msg}{self.label_str}" ) from e - try: - return expand_and_coerce(object_, self.pattern) - except TypeError as e: - # Special handling for MultiInputObjects (which are annoying) - if isinstance(self.pattern, tuple) and self.pattern[0] == MultiInputObj: - # Attempt to coerce the object into arg type of the MultiInputObj first, - # and if that fails, try to coerce it into a list of the arg type - inner_type_parser = copy(self) - inner_type_parser.pattern = self.pattern[1][0] - try: - return [inner_type_parser.coerce(object_)] - except TypeError: - add_exc_note( - e, - "Also failed to coerce to the arg-type of the MultiInputObj " - f"({self.pattern[1][0]})", - ) - raise e - else: - raise e + return expand_and_coerce(object_, self.pattern) 
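For reference, a couple of the stock coercions driven by the `COERCIBLE_DEFAULT` pairs listed further up (a quick sketch, not part of the patch):

import typing as ty
from pathlib import Path

from pydra.utils.typing import TypeParser

# str -> Path is permitted by the (str, os.PathLike) pair
assert TypeParser(Path)("/tmp/data.txt") == Path("/tmp/data.txt")

# sequence contents are coerced element-wise
assert TypeParser(ty.List[Path])(["/a.txt", "/b.txt"]) == [Path("/a.txt"), Path("/b.txt")]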
def check_type(self, type_: ty.Type[ty.Any]): """Checks the given type to see whether it matches or is a subtype of the @@ -584,6 +631,12 @@ def check_coercible(self, source: ty.Any, target: ty.Union[type, ty.Any]): If the object cannot be coerced into the target type depending on the explicit inclusions and exclusions set in the `coercible` and `not_coercible` member attrs """ + if ( + isinstance(source, ty.Sequence) + and issubclass(target, core.FileSet) + and all(isinstance(p, os.PathLike) for p in source) + ): + return True self.check_type_coercible(type(source), target, source_repr=repr(source)) def check_type_coercible( @@ -959,8 +1012,6 @@ def strip_splits(cls, type_: ty.Type[ty.Any]) -> ty.Tuple[ty.Type, int]: ---------- type_ : ty.Type[ty.Any] the type to list the nested sequences of - only_splits : bool, optional - whether to only return nested splits, not all sequence types Returns ------- @@ -988,3 +1039,174 @@ def label_str(self): get_origin = staticmethod(get_origin) get_args = staticmethod(get_args) + + +def is_union(type_: type, args: list[type] = None) -> bool: + """Checks whether a type is a Union, in either ty.Union[T, U] or T | U form + + Parameters + ---------- + type_ : type + the type to check + args : list[type], optional + required arguments of the union to check, by default (None) any args will match + + Returns + ------- + is_union : bool + whether the type is a Union type + """ + if ty.get_origin(type_) in UNION_TYPES: + if args is not None: + return ty.get_args(type_) == args + return True + return False + + +def is_optional(type_: type) -> bool: + """Check if the type is Optional, i.e. a union containing None""" + if is_union(type_): + return any(a is type(None) or is_optional(a) for a in ty.get_args(type_)) + return False + + +def optional_type(type_: type) -> type: + """Gets the non-None args of an optional type (i.e. a union with a None arg)""" + if is_optional(type_): + non_none = [a for a in ty.get_args(type_) if a is not type(None)] + if len(non_none) == 1: + return non_none[0] + return ty.Union[tuple(non_none)] + return type_ + + +def is_multi_input(type_: type) -> bool: + """Check if the type is a MultiInputObj""" + type_ = optional_type(type_) + return MultiInputObj in (type_, ty.get_origin(type_)) + + +def is_fileset_or_union(type_: type, allow_none: bool | None = None) -> bool: + """Check if the type is a FileSet or a Union containing a FileSet + + Parameters + ---------- + type_ : type + the type to check + allow_none : bool, optional + whether to allow None as a valid type, by default None. 
If None, then None + is not allowed at the outer layer, but is allowed within a Union + + Returns + ------- + is_fileset : bool + whether the type is a FileSet or a Union containing a FileSet + """ + if type_ is None and allow_none: + return True + if is_union(type_): + return any( + is_fileset_or_union(t, allow_none=allow_none or allow_none is None) + for t in ty.get_args(type_) + ) + elif not inspect.isclass(type_): + return False + return issubclass(type_, core.FileSet) + + +def is_type(*args: ty.Any) -> bool: + """check that the value is a type or generic""" + if len(args) == 3: # attrs validator + val = args[2] + elif len(args) != 1: + raise TypeError(f"is_type() takes 1 or 3 arguments, not {args}") + else: + val = args[0] + return inspect.isclass(val) or ty.get_origin(val) + + +T = ty.TypeVar("T") +U = ty.TypeVar("U") + + +def state_array_support( + function: ty.Callable[T, U], +) -> ty.Callable[T | StateArray[T], U | StateArray[U]]: + """ + Decorator to convert a allow a function to accept and return StateArray objects, + where the function is applied to each element of the StateArray. + """ + + def state_array_wrapper( + value: "T | StateArray[T] | LazyField[T]", + ) -> "U | StateArray[U] | LazyField[U]": + if is_lazy(value): + return value + if isinstance(value, StateArray): + return StateArray(function(v) for v in value) + return function(value) + + return state_array_wrapper + + +def is_lazy(obj): + """Check whether an object is a lazy field or has any attribute that is a Lazy Field""" + from pydra.engine.lazy import LazyField + + return isinstance(obj, LazyField) + + +def copy_nested_files( + value: ty.Any, + dest_dir: os.PathLike, + supported_modes: generic.FileSet.CopyMode = generic.FileSet.CopyMode.any, + **kwargs, +) -> ty.Any: + """Copies all "file-sets" found within the nested value (e.g. dict, list,...) into the + destination directory. If no nested file-sets are found then the original value is + returned. Note that multiple nested file-sets (e.g. a list) will to have unique names + names (i.e. not differentiated by parent directories) otherwise there will be a path + clash in the destination directory. 
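The union helpers defined above behave as follows (illustrative assertions, not part of the patch):

import typing as ty
from pathlib import Path

from pydra.utils.typing import is_optional, is_union, optional_type

assert is_union(ty.Union[int, str])
assert is_optional(ty.Optional[Path])            # a union that includes None
assert not is_optional(ty.Union[int, str])
assert optional_type(ty.Optional[Path]) is Path  # strips the None member
assert optional_type(int) is int                 # non-optional types pass through unchanged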
+ + Parameters + ---------- + value : Any + the value to copy files from (if required) + dest_dir : os.PathLike + the destination directory to copy the files to + **kwargs + passed directly onto FileSet.copy() + """ + from pydra.utils.typing import TypeParser # noqa + + cache: ty.Dict[generic.FileSet, generic.FileSet] = {} + + # Set to keep track of file paths that have already been copied + # to allow FileSet.copy to avoid name clashes + clashes_to_avoid = set() + + def copy_fileset(fileset: generic.FileSet): + try: + return cache[fileset] + except KeyError: + pass + supported = supported_modes + if any(MountIndentifier.on_cifs(p) for p in fileset.fspaths): + supported -= generic.FileSet.CopyMode.symlink + if not all( + MountIndentifier.on_same_mount(p, dest_dir) for p in fileset.fspaths + ): + supported -= generic.FileSet.CopyMode.hardlink + cp_kwargs = {} + + cp_kwargs.update(kwargs) + copied = fileset.copy( + dest_dir=dest_dir, + supported_modes=supported, + avoid_clashes=clashes_to_avoid, # this prevents fname clashes between filesets + **kwargs, + ) + cache[fileset] = copied + return copied + + return TypeParser.apply_to_instances(generic.FileSet, copy_fileset, value) diff --git a/pydra/workers/base.py b/pydra/workers/base.py new file mode 100644 index 0000000000..feeb26f014 --- /dev/null +++ b/pydra/workers/base.py @@ -0,0 +1,154 @@ +"""Execution workers.""" + +import asyncio +import attrs +import sys +import abc +import inspect +import asyncio.subprocess as asp +import typing as ty +import logging +from pydra.engine.job import Job +from pydra.utils.general import get_plugin_classes +import pydra.workers + +logger = logging.getLogger("pydra.worker") + +if ty.TYPE_CHECKING: + from pydra.engine.result import Result + from pydra.compose import base + +TaskType = ty.TypeVar("TaskType", bound="base.Task") + + +@attrs.define +class Worker(metaclass=abc.ABCMeta): + """A base class for execution of tasks.""" + + loop: asyncio.AbstractEventLoop = None + + def __getstate__(self) -> dict[str, ty.Any]: + """Return state for pickling.""" + state = attrs.asdict(self, recurse=False) + state["loop"] = None + return state + + def __setstate__(self, state: dict[str, ty.Any]) -> None: + for key, value in state.items(): + setattr(self, key, value) + self.loop = None + # Loop will be restored by submitter __setstate__ + + @abc.abstractmethod + def run(self, job: "Job[TaskType]", rerun: bool = False) -> "Result": + """Return coroutine for job execution.""" + pass + + async def submit(self, job: "Job[TaskType]", rerun: bool = False) -> "Result": + assert self.is_async, "Worker is not asynchronous, job should just be `run()`" + if job.is_async: # only for workflows at this stage and the foreseeable + # These jobs are run in the primary process but potentially farm out + # workflow jobs to other processes/job-schedulers + return await job.run_async(rerun=rerun) + else: + return await self.run(job=job, rerun=rerun) + + def close(self): + """Close this worker.""" + + @property + def is_async(self) -> bool: + """Return whether the worker is asynchronous.""" + return inspect.iscoroutinefunction(self.run) + + @classmethod + def available_plugins(cls) -> ty.Dict[str, ty.Type["Worker"]]: + """Return all installed worker types""" + return get_plugin_classes(pydra.workers, "Worker") + + @classmethod + def plugin(cls, plugin_name: str) -> ty.Type["Worker"]: + """Return a worker class by name.""" + try: + return cls.available_plugins()[plugin_name.replace("-", "_")] + except KeyError: + raise ValueError( + f"No worker 
matches {plugin_name!r}, check if there is a " + f"plugin package called 'pydra-workers-{plugin_name}' that needs to be " + "installed." + ) + + @classmethod + def plugin_name(cls) -> str: + """Return the name of the plugin.""" + try: + plugin_name = cls._plugin_name + except AttributeError: + parts = cls.__module__.split(".") + if parts[:-1] != ["pydra", "workers"]: + raise ValueError( + f"Cannot infer plugin name of Worker ({cls}) from module path, as it " + f"isn't installed within `pydra.workers` ({cls.__module__}). " + "Please set the `_plugin_name` attribute on the class explicitly." + ) + plugin_name = parts[-1].replace("_", "-") + return plugin_name + + +async def read_and_display_async(*cmd, hide_display=False, strip=False): + """ + Capture standard input and output of a process, displaying them as they arrive. + + Works line-by-line. + + """ + # start process + process = await asyncio.create_subprocess_exec( + *cmd, stdout=asp.PIPE, stderr=asp.PIPE + ) + + stdout_display = sys.stdout.buffer.write if not hide_display else None + stderr_display = sys.stderr.buffer.write if not hide_display else None + # read child's stdout/stderr concurrently (capture and display) + try: + stdout, stderr = await asyncio.gather( + read_stream_and_display(process.stdout, stdout_display), + read_stream_and_display(process.stderr, stderr_display), + ) + except Exception: + process.kill() + raise + finally: + # wait for the process to exit + rc = await process.wait() + if strip: + return rc, stdout.strip(), stderr + else: + return rc, stdout, stderr + + +async def read_stream_and_display(stream, display): + """ + Read from stream line by line until EOF, display, and capture the lines. + + See Also + -------- + This `discussion on StackOverflow + `__. + + """ + output = [] + while True: + line = await stream.readline() + if not line: + break + output.append(line) + if display is not None: + display(line) # assume it doesn't block + return b"".join(output).decode() + + +def ensure_non_negative(value: int) -> int: + if not value or value < 0: + return 0 + return value diff --git a/pydra/workers/cf.py b/pydra/workers/cf.py new file mode 100644 index 0000000000..bd82018248 --- /dev/null +++ b/pydra/workers/cf.py @@ -0,0 +1,92 @@ +import os +import attrs +import typing as ty +import cloudpickle as cp +import concurrent.futures as cf +import logging +from pydra.engine.job import Job +from pydra.workers import base + +logger = logging.getLogger("pydra.worker") + +if ty.TYPE_CHECKING: + from pydra.engine.result import Result + + +def get_available_cpus(): + """ + Return the number of CPUs available to the current process or, if that is not + available, the total number of CPUs on the system. + + Returns + ------- + n_proc : :obj:`int` + The number of available CPUs. + """ + # Will not work on some systems or if psutil is not installed. + # See https://psutil.readthedocs.io/en/latest/#psutil.Process.cpu_affinity + try: + import psutil + + return len(psutil.Process().cpu_affinity()) + except (AttributeError, ImportError, NotImplementedError): + pass + + # Not available on all systems, including macOS. 
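`ConcurrentFuturesWorker` sizes its process pool with `get_available_cpus()` by default. Workers are normally selected by name through the `Submitter`, which resolves the string via `Worker.available_plugins()` (see base.py above) and forwards any extra keyword arguments, such as `n_procs`, to the worker class. A sketch mirroring `test_concurrent_wf_nprocs` further down in this diff:

import tempfile
from pathlib import Path

from pydra.engine.submitter import Submitter
from pydra.engine.tests.utils import BasicWorkflow

cache = Path(tempfile.mkdtemp())
with Submitter(worker="cf", n_procs=2, cache_root=cache) as sub:
    res = sub(BasicWorkflow(x=5))

assert res.outputs.out == 9  # same result as with the default pool size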
+ # See https://docs.python.org/3/library/os.html#os.sched_getaffinity + if hasattr(os, "sched_getaffinity"): + return len(os.sched_getaffinity(0)) + + # Last resort + return os.cpu_count() + + +@attrs.define +class ConcurrentFuturesWorker(base.Worker): + """A worker to execute in parallel using Python's concurrent futures.""" + + n_procs: int = attrs.field(factory=get_available_cpus) + pool: cf.ProcessPoolExecutor = attrs.field( + eq=False, init=False, hash=False, repr=False + ) + + @pool.default + def _pool_default(self) -> cf.ProcessPoolExecutor: + return cf.ProcessPoolExecutor(self.n_procs) + + def __getstate__(self) -> dict[str, ty.Any]: + """Return state for pickling.""" + state = super().__getstate__() + del state["pool"] + return state + + def __setstate__(self, state: dict[str, ty.Any]) -> None: + """Set state from pickling.""" + super().__setstate__(state) + self.pool = cf.ProcessPoolExecutor(self.n_procs) + + async def run( + self, + job: Job[base.TaskType], + rerun: bool = False, + ) -> "Result": + """Run a job.""" + assert self.loop, "No event loop available to submit tasks" + job_pkl = cp.dumps(job) + return await self.loop.run_in_executor( + self.pool, self.uncloudpickle_and_run, job_pkl, rerun + ) + + @classmethod + def uncloudpickle_and_run(cls, job_pkl: bytes, rerun: bool) -> "Result": + """Unpickle and run a job.""" + job: Job[base.TaskType] = cp.loads(job_pkl) + return job.run(rerun=rerun) + + def close(self): + """Finalize the internal pool of tasks.""" + self.pool.shutdown() + + +# Alias so it can be referred to as cf.Worker +Worker = ConcurrentFuturesWorker diff --git a/pydra/workers/debug.py b/pydra/workers/debug.py new file mode 100644 index 0000000000..f5a39017e6 --- /dev/null +++ b/pydra/workers/debug.py @@ -0,0 +1,29 @@ +import typing as ty +import logging +from pydra.engine.job import Job +from pydra.workers import base + +if ty.TYPE_CHECKING: + from pydra.engine.result import Result + + +logger = logging.getLogger("pydra.worker") + + +class DebugWorker(base.Worker): + """A worker to execute linearly.""" + + def run( + self, + job: "Job[base.TaskType]", + rerun: bool = False, + ) -> "Result": + """Run a job.""" + return job.run(rerun=rerun) + + def close(self): + """Return whether the job is finished.""" + + +# Alias so it can be referred to as debug.Worker +Worker = DebugWorker diff --git a/pydra/workers/sge.py b/pydra/workers/sge.py new file mode 100644 index 0000000000..ef9097aea5 --- /dev/null +++ b/pydra/workers/sge.py @@ -0,0 +1,501 @@ +import asyncio +import sys +import json +import re +import attrs +import typing as ty +from tempfile import gettempdir +from pathlib import Path +from shutil import copyfile, which +import random +import logging +from pydra.engine.job import Job, save, load_job +from pydra.workers import base + +logger = logging.getLogger("pydra.worker") + +if ty.TYPE_CHECKING: + from pydra.engine.result import Result + + +@attrs.define +class SgeWorker(base.Worker): + """A worker to execute tasks on SLURM systems. Initialize SGE Worker. 
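The same plugin mechanism applies here: keyword arguments passed to the `Submitter` become `SgeWorker` fields, so scheduling behaviour such as `max_threads` can be tuned per run. A sketch based on `test_sge_limit_maxthreads` further down (the cache path is hypothetical and a host with SGE is required):

from pydra.engine.submitter import Submitter
from pydra.engine.tests.utils import BasicWorkflow

with Submitter(worker="sge", max_threads=8, cache_root="/shared/pydra-cache") as sub:
    res = sub(BasicWorkflow(x=1))

assert res.outputs.out == 5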
+ + Parameters + ---------- + poll_delay : seconds + Delay between polls to slurmd + qsub_args : str + Additional qsub arguments + max_jobs : int + Maximum number of submitted jobs + write_output_files : bool + Turns on/off writing to output files for individual tasks + max_job_array_length : int + Number of jobs an SGE job array can hold + indirect_submit_host : str + Name of a submit node in the SGE cluster through which to run SGE qsub commands + max_threads : int + Maximum number of threads that will be scheduled for SGE submission at once + poll_for_result_file : bool + If true, a job is complete when its _result.pklz file exists + If false, a job is complete when its job array is indicated complete by qstat/qacct polling + default_threads_per_task : int + Sets the number of slots SGE should request for a job if sgeThreads + is not a field in the job input_spec + polls_before_checking_evicted : int + Number of poll_delays before running qacct to check if a job has been evicted by SGE + collect_jobs_delay : int + Number of seconds to wait for the list of jobs for a job array to fill + """ + + _cmd = "qsub" + _sacct_re = re.compile( + "(?P\\d*) +(?P\\w*)\\+? +" "(?P\\d+):\\d+" + ) + + poll_delay: int = attrs.field(default=1, converter=base.ensure_non_negative) + qsub_args: str = "" + write_output_files: bool = True + max_job_array_length: int = 50 + indirect_submit_host: str | None = None + max_threads: int | None = None + poll_for_result_file: bool = True + default_threads_per_task: int = 1 + polls_before_checking_evicted: int = 60 + collect_jobs_delay: int = 30 + default_qsub_args: str = "" + max_mem_free: int | None = None + error: dict[str, ty.Any] = attrs.field(factory=dict, init=False) + tasks_to_run_by_threads_requested: dict[str, ty.Any] = attrs.field( + factory=dict, init=False + ) + output_by_jobid: dict[str, ty.Any] = attrs.field(factory=dict, init=False) + jobid_by_task_uid: dict[str, ty.Any] = attrs.field(factory=dict, init=False) + threads_used: dict[str, ty.Any] = attrs.field(factory=dict, init=False) + job_completed_by_jobid: dict[str, ty.Any] = attrs.field(factory=dict, init=False) + result_files_by_jobid: dict[str, ty.Any] = attrs.field(factory=dict, init=False) + job_pkls_rerun: dict[str, ty.Any] = attrs.field(factory=dict, init=False) + + _INTERNAL_DICT_ATTRS = [ + "error", + "tasks_to_run_by_threads_requested", + "output_by_jobid", + "jobid_by_task_uid", + "threads_used", + "job_completed_by_jobid", + "result_files_by_jobid", + "job_pkls_rerun", + ] + + def __getstate__(self) -> dict[str, ty.Any]: + """Return state for pickling.""" + state = super().__getstate__() + for atr in self._INTERNAL_DICT_ATTRS: + del state[atr] + return state + + def __setstate__(self, state: dict[str, ty.Any]): + """Set state for unpickling.""" + super().__setstate__(state) + for atr in self._INTERNAL_DICT_ATTRS: + setattr(self, atr, {}) + + def _prepare_runscripts(self, job, interpreter="/bin/sh", rerun=False): + if isinstance(job, Job): + cache_root = job.cache_root + ind = None + uid = job.uid + try: + task_qsub_args = job.qsub_args + except Exception: + task_qsub_args = self.default_qsub_args + else: + ind = job[0] + cache_root = job[-1].cache_root + uid = f"{job[-1].uid}_{ind}" + try: + task_qsub_args = job[-1].qsub_args + except Exception: + task_qsub_args = self.default_qsub_args + + script_dir = cache_root / f"{self.plugin_name()}_scripts" / uid + script_dir.mkdir(parents=True, exist_ok=True) + if ind is None: + if not (script_dir / "_job.pklz").exists(): + save(script_dir, 
job=job) + else: + copyfile(job[1], script_dir / "_job.pklz") + + job_pkl = script_dir / "_job.pklz" + if not job_pkl.exists() or not job_pkl.stat().st_size: + raise Exception("Missing or empty job!") + + batchscript = script_dir / f"batchscript_{uid}.job" + + if task_qsub_args not in self.tasks_to_run_by_threads_requested: + self.tasks_to_run_by_threads_requested[task_qsub_args] = [] + self.tasks_to_run_by_threads_requested[task_qsub_args].append( + (str(job_pkl), ind, rerun) + ) + + return ( + script_dir, + batchscript, + job_pkl, + ind, + job.cache_dir, + task_qsub_args, + ) + + async def get_tasks_to_run(self, task_qsub_args, mem_free): + # Extract the first N tasks to run + if mem_free is not None and self.max_mem_free is not None: + max_job_array_length = min( + self.max_job_array_length, int(self.max_mem_free / mem_free) + ) + else: + max_job_array_length = self.max_job_array_length + tasks_to_run_copy, self.tasks_to_run_by_threads_requested[task_qsub_args] = ( + self.tasks_to_run_by_threads_requested[task_qsub_args][ + :max_job_array_length + ], + self.tasks_to_run_by_threads_requested[task_qsub_args][ + max_job_array_length: + ], + ) + return tasks_to_run_copy + + async def check_for_results_files(self, jobid, threads_requested): + for job in list(self.result_files_by_jobid[jobid]): + if self.result_files_by_jobid[jobid][job].exists(): + del self.result_files_by_jobid[jobid][job] + self.threads_used -= threads_requested + + async def run(self, job: Job[base.TaskType], rerun: bool = False) -> "Result": + """Worker submission API.""" + ( + script_dir, + batch_script, + job_pkl, + ind, + cache_dir, + task_qsub_args, + ) = self._prepare_runscripts(job, rerun=rerun) + if (script_dir / script_dir.parts[1]) == gettempdir(): + logger.warning("Temporary directories may not be shared across computers") + interpreter = "/bin/sh" + threads_requested = self.default_threads_per_task + if "smp" in task_qsub_args: + smp_index = task_qsub_args.split().index("smp") + if ( + smp_index + 1 < len(task_qsub_args.split()) + and task_qsub_args.split()[smp_index + 1].isdigit() + ): + threads_requested = int(task_qsub_args.split()[smp_index + 1]) + # Get the amount of mem_free requested for the job + mem_free = None + if "mem_free" in task_qsub_args: + mem_free_cmd = [ + word for word in task_qsub_args.split() if word.startswith("mem_free") + ][0] + if len(re.findall(r"\d+", mem_free_cmd)) > 0: + mem_free = int(re.findall(r"\d+", mem_free_cmd)[0]) + + if ( + len(self.tasks_to_run_by_threads_requested.get(task_qsub_args)) + <= self.max_job_array_length + ): + await asyncio.sleep(self.collect_jobs_delay) + tasks_to_run = await self.get_tasks_to_run(task_qsub_args, mem_free) + + if mem_free is not None: + summed_mem_free_cmd = re.sub( + str(mem_free), str(len(tasks_to_run) * mem_free), mem_free_cmd + ) + task_qsub_args = re.sub(mem_free_cmd, summed_mem_free_cmd, task_qsub_args) + + if len(tasks_to_run) > 0: + if self.max_threads is not None: + while self.threads_used > self.max_threads - threads_requested * len( + tasks_to_run + ): + await asyncio.sleep(self.poll_delay) + self.threads_used += threads_requested * len(tasks_to_run) + + python_string = f"""import sys; from pydra.engine.job import load_and_run; \ + job_pkls={[task_tuple for task_tuple in tasks_to_run]}; \ + task_index=int(sys.argv[1])-1; \ + load_and_run(job_pkls[task_index][0], rerun=job_pkls[task_index][1])""" + bcmd_job = "\n".join( + ( + f"#!{interpreter}", + f"{sys.executable} {Path(batch_script).with_suffix('.py')}" + + " $SGE_TASK_ID", + ) 
+ ) + + bcmd_py = python_string + + # Better runtime when the python contents are written to file + # rather than given by cmdline arg -c + with Path(batch_script).with_suffix(".py").open("wt") as fp: + fp.write(bcmd_py) + + with batch_script.open("wt") as fp: + fp.writelines(bcmd_job) + + script_dir = job.cache_root / f"{self.plugin_name()}_scripts" / job.uid + script_dir.mkdir(parents=True, exist_ok=True) + sargs = ["-t"] + sargs.append(f"1-{len(tasks_to_run)}") + sargs = sargs + task_qsub_args.split() + + jobname = re.search(r"(?<=-N )\S+", task_qsub_args) + + if not jobname: + jobname = ".".join((job.name, job.uid)) + sargs.append("-N") + sargs.append(jobname) + output = re.search(r"(?<=-o )\S+", self.qsub_args) + + if not output: + output_file = str(script_dir / "sge-%j.out") + if self.write_output_files: + sargs.append("-o") + sargs.append(output_file) + error = re.search(r"(?<=-e )\S+", self.qsub_args) + if not error: + error_file = str(script_dir / "sge-%j.out") + if self.write_output_files: + sargs.append("-e") + sargs.append(error_file) + else: + error_file = None + sargs.append(str(batch_script)) + + await asyncio.sleep(random.uniform(0, 5)) + + jobid = await self.submit_array_job(sargs, tasks_to_run, error_file) + + if self.poll_for_result_file: + self.result_files_by_jobid[jobid] = {} + for job_pkl, ind, rerun in tasks_to_run: + job = load_job(job_pkl=job_pkl, ind=ind) + self.result_files_by_jobid[jobid][job] = ( + job.cache_dir / "_result.pklz" + ) + + poll_counter = 0 + while True: + # 3 possibilities + # False: job is still pending/working + # True: job is complete + # Exception: Polling / job failure + # done = await self._poll_job(jobid) + if self.poll_for_result_file: + if len(self.result_files_by_jobid[jobid]) > 0: + for job in list(self.result_files_by_jobid[jobid]): + if self.result_files_by_jobid[jobid][job].exists(): + del self.result_files_by_jobid[jobid][job] + self.threads_used -= threads_requested + + else: + exit_status = await self._verify_exit_code(jobid) + if exit_status == "ERRORED": + jobid = await self._rerun_job_array( + job.cache_root, + job.uid, + sargs, + tasks_to_run, + error_file, + jobid, + ) + else: + for job_pkl, ind, rerun in tasks_to_run: + if job_pkl in self.job_pkls_rerun: + del self.job_pkls_rerun[job_pkl] + return True + + if poll_counter >= self.polls_before_checking_evicted: + # Checking for evicted for jobid + exit_status = await self._verify_exit_code(jobid) + if exit_status == "ERRORED": + jobid = await self._rerun_job_array( + job.cache_root, + job.uid, + sargs, + tasks_to_run, + error_file, + jobid, + ) + poll_counter = 0 + poll_counter += 1 + await asyncio.sleep(self.poll_delay) + else: + done = await self._poll_job(jobid, job.cache_root) + if done: + if done == "ERRORED": # If the SGE job was evicted, rerun it + jobid = await self._rerun_job_array( + job.cache_root, + job.uid, + sargs, + tasks_to_run, + error_file, + jobid, + ) + else: + self.job_completed_by_jobid[jobid] = True + self.threads_used -= threads_requested * len(tasks_to_run) + return True + # Don't poll exactly on the same interval to avoid overloading SGE + await asyncio.sleep( + random.uniform(max(0, self.poll_delay - 2), self.poll_delay + 2) + ) + + async def _rerun_job_array( + self, cache_root, uid, sargs, tasks_to_run, error_file, evicted_jobid + ): + for job_pkl, ind, rerun in tasks_to_run: + sge_task = load_job(job_pkl=job_pkl, ind=ind) + application_job_pkl = sge_task.cache_dir / "_job.pklz" + if ( + not application_job_pkl.exists() + or 
load_job(job_pkl=application_job_pkl).result() is None + or load_job(job_pkl=application_job_pkl).result().errored + ): + self.job_pkls_rerun[job_pkl] = None + info_file = cache_root / f"{sge_task.uid}_info.json" + if info_file.exists(): + checksum = json.loads(info_file.read_text())["checksum"] + if (cache_root / f"{checksum}.lock").exists(): + # for pyt3.8 we could use missing_ok=True + (cache_root / f"{checksum}.lock").unlink() + # Maybe wait a little to check if _error.pklz exists - not getting found immediately + + # If the previous job array failed, run the array's script again and get the new jobid + jobid = await self.submit_array_job(sargs, tasks_to_run, error_file) + self.result_files_by_jobid[jobid] = self.result_files_by_jobid[evicted_jobid] + return jobid + + async def submit_array_job(self, sargs, tasks_to_run, error_file): + if self.indirect_submit_host is not None: + indirect_submit_host_prefix = [] + indirect_submit_host_prefix.append("ssh") + indirect_submit_host_prefix.append(self.indirect_submit_host) + indirect_submit_host_prefix.append('""export SGE_ROOT=/opt/sge;') + rc, stdout, stderr = await base.read_and_display_async( + *indirect_submit_host_prefix, + str(Path(which("qsub")).parent / "qsub"), + *sargs, + '""', + hide_display=True, + ) + else: + rc, stdout, stderr = await base.read_and_display_async( + "qsub", *sargs, hide_display=True + ) + jobid = re.search(r"\d+", stdout) + if rc: + raise RuntimeError(f"Error returned from qsub: {stderr}") + elif not jobid: + raise RuntimeError("Could not extract job ID") + jobid = jobid.group() + self.output_by_jobid[jobid] = (rc, stdout, stderr) + + for job_pkl, ind, rerun in tasks_to_run: + self.jobid_by_task_uid[Path(job_pkl).parent.name] = jobid + + if error_file: + error_file = str(error_file).replace("%j", jobid) + self.error[jobid] = str(error_file).replace("%j", jobid) + return jobid + + async def get_output_by_job_pkl(self, job_pkl): + jobid = self.jobid_by_task_uid.get(job_pkl.parent.name) + while jobid is None: + jobid = self.jobid_by_task_uid.get(job_pkl.parent.name) + await asyncio.sleep(1) + job_output = self.output_by_jobid.get(jobid) + while job_output is None: + job_output = self.output_by_jobid.get(jobid) + await asyncio.sleep(1) + return job_output + + async def _submit_job( + self, + batchscript, + name, + uid, + cache_root, + job_pkl, + ind, + cache_dir, + task_qsub_args, + ): + """Coroutine that submits job runscript and polls job until completion or error.""" + await self._submit_jobs( + batchscript, + name, + uid, + cache_root, + cache_dir, + task_qsub_args, + ) + if self.poll_for_result_file: + while True: + result_file = cache_dir / "_result.pklz" + if result_file.exists() and str(job_pkl) not in self.job_pkls_rerun: + return True + await asyncio.sleep(self.poll_delay) + else: + rc, stdout, stderr = await self.get_output_by_job_pkl(job_pkl) + while True: + jobid = self.jobid_by_task_uid.get(job_pkl.parent.name) + if self.job_completed_by_jobid.get(jobid): + return True + else: + await asyncio.sleep(self.poll_delay) + + async def _poll_job(self, jobid, cache_root): + cmd = ("qstat", "-j", jobid) + logger.debug(f"Polling job {jobid}") + rc, stdout, stderr = await base.read_and_display_async(*cmd, hide_display=True) + + if not stdout: + # job is no longer running - check exit code + status = await self._verify_exit_code(jobid) + return status + return False + + async def _verify_exit_code(self, jobid): + cmd = ("qacct", "-j", jobid) + rc, stdout, stderr = await base.read_and_display_async(*cmd, 
hide_display=True) + if not stdout: + await asyncio.sleep(10) + rc, stdout, stderr = await base.read_and_display_async( + *cmd, hide_display=True + ) + + # job is still pending/working + if re.match(r"error: job id .* not found", stderr): + return False + + if not stdout: + return "ERRORED" + + # Read the qacct stdout into dictionary stdout_dict + for line in stdout.splitlines(): + line_split = line.split() + if len(line_split) > 1: + if line_split[0] == "failed": + if not line_split[1].isdigit(): + return "ERRORED" + elif not int(line_split[1]) == 0: + return "ERRORED" + return True + + +# Alias so it can be referred to as sge.Worker +Worker = SgeWorker diff --git a/pydra/workers/slurm.py b/pydra/workers/slurm.py new file mode 100644 index 0000000000..e7ec3ceaf7 --- /dev/null +++ b/pydra/workers/slurm.py @@ -0,0 +1,181 @@ +import asyncio +import sys +import json +import re +import typing as ty +from tempfile import gettempdir +from pathlib import Path +from shutil import copyfile +import logging +import attrs +from pydra.engine.job import Job, save +from pydra.workers import base + + +logger = logging.getLogger("pydra.worker") + +if ty.TYPE_CHECKING: + from pydra.engine.result import Result + + +@attrs.define +class SlurmWorker(base.Worker): + """A worker to execute tasks on SLURM systems.""" + + _cmd = "sbatch" + _sacct_re = re.compile( + "(?P\\d*) +(?P\\w*)\\+? +" "(?P\\d+):\\d+" + ) + + poll_delay: int = attrs.field(default=1, converter=base.ensure_non_negative) + sbatch_args: str = "" + error: dict[str, ty.Any] = attrs.field(factory=dict) + + def __getstate__(self) -> dict[str, ty.Any]: + """Return state for pickling.""" + state = super().__getstate__() + del state["error"] + return state + + def __setstate__(self, state: dict[str, ty.Any]): + """Set state for unpickling.""" + state["error"] = {} + super().__setstate__(state) + + def _prepare_runscripts(self, job, interpreter="/bin/sh", rerun=False): + if isinstance(job, Job): + cache_root = job.cache_root + ind = None + uid = job.uid + else: + assert isinstance(job, tuple), f"Expecting a job or a tuple, not {job!r}" + assert len(job) == 2, f"Expecting a tuple of length 2, not {job!r}" + ind = job[0] + cache_root = job[-1].cache_root + uid = f"{job[-1].uid}_{ind}" + + script_dir = cache_root / f"{self.plugin_name()}_scripts" / uid + script_dir.mkdir(parents=True, exist_ok=True) + if ind is None: + if not (script_dir / "_job.pklz").exists(): + save(script_dir, job=job) + else: + copyfile(job[1], script_dir / "_job.pklz") + + job_pkl = script_dir / "_job.pklz" + if not job_pkl.exists() or not job_pkl.stat().st_size: + raise Exception("Missing or empty job!") + + batchscript = script_dir / f"batchscript_{uid}.sh" + python_string = ( + f"""'from pydra.engine.job import load_and_run; """ + f"""load_and_run("{job_pkl}", rerun={rerun}) '""" + ) + bcmd = "\n".join( + ( + f"#!{interpreter}", + f"#SBATCH --output={script_dir / 'slurm-%j.out'}", + f"{sys.executable} -c " + python_string, + ) + ) + with batchscript.open("wt") as fp: + fp.writelines(bcmd) + return script_dir, batchscript + + async def run(self, job: "Job[base.TaskType]", rerun: bool = False) -> "Result": + """Worker submission API.""" + script_dir, batch_script = self._prepare_runscripts(job, rerun=rerun) + if (script_dir / script_dir.parts[1]) == gettempdir(): + logger.warning("Temporary directories may not be shared across computers") + script_dir = job.cache_root / f"{self.plugin_name()}_scripts" / job.uid + sargs = self.sbatch_args.split() + jobname = re.search(r"(?<=-J 
)\S+|(?<=--job-name=)\S+", self.sbatch_args) + if not jobname: + jobname = ".".join((job.name, job.uid)) + sargs.append(f"--job-name={jobname}") + output = re.search(r"(?<=-o )\S+|(?<=--output=)\S+", self.sbatch_args) + if not output: + output_file = str(script_dir / "slurm-%j.out") + sargs.append(f"--output={output_file}") + error = re.search(r"(?<=-e )\S+|(?<=--error=)\S+", self.sbatch_args) + if not error: + error_file = str(script_dir / "slurm-%j.err") + sargs.append(f"--error={error_file}") + else: + error_file = None + sargs.append(str(batch_script)) + # TO CONSIDER: add random sleep to avoid overloading calls + rc, stdout, stderr = await base.read_and_display_async( + "sbatch", *sargs, hide_display=True + ) + jobid = re.search(r"\d+", stdout) + if rc: + raise RuntimeError(f"Error returned from sbatch: {stderr}") + elif not jobid: + raise RuntimeError("Could not extract job ID") + jobid = jobid.group() + if error_file: + error_file = error_file.replace("%j", jobid) + self.error[jobid] = error_file.replace("%j", jobid) + # intermittent polling + while True: + # 3 possibilities + # False: job is still pending/working + # True: job is complete + # Exception: Polling / job failure + done = await self._poll_job(jobid) + if done: + if ( + done in ["CANCELLED", "TIMEOUT", "PREEMPTED"] + and "--no-requeue" not in self.sbatch_args + ): + # loading info about job with a specific uid + info_file = job.cache_root / f"{job.uid}_info.json" + if info_file.exists(): + checksum = json.loads(info_file.read_text())["checksum"] + if (job.cache_root / f"{checksum}.lock").exists(): + # for pyt3.8 we could you missing_ok=True + (job.cache_root / f"{checksum}.lock").unlink() + cmd_re = ("scontrol", "requeue", jobid) + await base.read_and_display_async(*cmd_re, hide_display=True) + else: + return True + await asyncio.sleep(self.poll_delay) + + async def _poll_job(self, jobid): + cmd = ("squeue", "-h", "-j", jobid) + logger.debug(f"Polling job {jobid}") + rc, stdout, stderr = await base.read_and_display_async(*cmd, hide_display=True) + if not stdout or "slurm_load_jobs error" in stderr: + # job is no longer running - check exit code + status = await self._verify_exit_code(jobid) + return status + return False + + async def _verify_exit_code(self, jobid): + cmd = ("sacct", "-n", "-X", "-j", jobid, "-o", "JobID,State,ExitCode") + _, stdout, _ = await base.read_and_display_async(*cmd, hide_display=True) + if not stdout: + raise RuntimeError("Job information not found") + m = self._sacct_re.search(stdout) + error_file = self.error[jobid] + if int(m.group("exit_code")) != 0 or m.group("status") != "COMPLETED": + if m.group("status") in ["CANCELLED", "TIMEOUT", "PREEMPTED"]: + return m.group("status") + elif m.group("status") in ["RUNNING", "PENDING"]: + return False + # TODO: potential for requeuing + # parsing the error message + error_line = Path(error_file).read_text().split("\n")[-2] + if "Exception" in error_line: + error_message = error_line.replace("Exception: ", "") + elif "Error" in error_line: + error_message = error_line.replace("Exception: ", "") + else: + error_message = "Job failed (unknown reason - TODO)" + raise Exception(error_message) + return True + + +# Alias so it can be referred to as slurm.Worker +Worker = SlurmWorker diff --git a/pydra/workers/tests/test_pickle.py b/pydra/workers/tests/test_pickle.py new file mode 100644 index 0000000000..15aa8e7bc5 --- /dev/null +++ b/pydra/workers/tests/test_pickle.py @@ -0,0 +1,43 @@ +import pickle as pkl + +from pydra.workers import debug, cf, sge, 
slurm + + +def test_pickle_debug_worker(): + """ + Test pickling of debug.Worker + """ + worker = debug.Worker() + worker2 = pkl.loads(pkl.dumps(worker)) + assert worker2.loop is None + assert worker == worker2 + + +def test_pickle_cf_worker(): + """ + Test pickling of cf.Worker + """ + worker = cf.Worker() + worker2 = pkl.loads(pkl.dumps(worker)) + assert worker2.loop is None + assert worker == worker2 + + +def test_pickle_sge_worker(): + """ + Test pickling of sge.Worker + """ + worker = sge.Worker() + worker2 = pkl.loads(pkl.dumps(worker)) + assert worker2.loop is None + assert worker == worker2 + + +def test_pickle_slurm_worker(): + """ + Test pickling of DebugWorker + """ + worker = slurm.Worker() + worker2 = pkl.loads(pkl.dumps(worker)) + assert worker2.loop is None + assert worker == worker2 diff --git a/pydra/workers/tests/test_worker.py b/pydra/workers/tests/test_worker.py new file mode 100644 index 0000000000..56c202a9b6 --- /dev/null +++ b/pydra/workers/tests/test_worker.py @@ -0,0 +1,731 @@ +from dateutil import parser +import secrets +import re +import subprocess as sp +import time +import attrs +import typing as ty +import os +from unittest.mock import patch +import pytest +from pydra.compose import workflow, shell +from fileformats.generic import Directory +from pydra.engine.job import Job +from pydra.engine.submitter import Submitter +from pydra.workers import debug +from pydra.environments import singularity +from pydra.compose import python +from pathlib import Path +from datetime import datetime +from pydra.engine.result import Result +from pydra.engine.tests.utils import ( + need_sge, + need_slurm, + need_singularity, + BasicWorkflow, + BasicWorkflowWithThreadCount, + BasicWorkflowWithThreadCountConcurrent, +) +import logging + +logger = logging.getLogger("pydra.worker") + + +@python.define +def SleepAddOne(x): + time.sleep(1) + return x + 1 + + +def test_callable_wf(any_worker, tmpdir): + wf = BasicWorkflow(x=5) + outputs = wf(cache_root=tmpdir) + assert outputs.out == 9 + del wf, outputs + + # providing any_worker + wf = BasicWorkflow(x=5) + outputs = wf(worker="cf") + assert outputs.out == 9 + del wf, outputs + + # providing plugin_kwargs + wf = BasicWorkflow(x=5) + outputs = wf(worker="cf", n_procs=2) + assert outputs.out == 9 + del wf, outputs + + # providing wrong plugin_kwargs + wf = BasicWorkflow(x=5) + with pytest.raises(TypeError, match="an unexpected keyword argument"): + wf(worker="cf", sbatch_args="-N2") + + # providing submitter + wf = BasicWorkflow(x=5) + + with Submitter(worker=any_worker, cache_root=tmpdir) as sub: + res = sub(wf) + assert res.outputs.out == 9 + + +def test_concurrent_wf(any_worker, tmpdir): + # concurrent workflow + # A --> C + # B --> D + @workflow.define(outputs=["out1", "out2"]) + def Workflow(x, y): + taska = workflow.add(SleepAddOne(x=x), name="taska") + taskb = workflow.add(SleepAddOne(x=y), name="taskb") + taskc = workflow.add(SleepAddOne(x=taska.out), name="taskc") + taskd = workflow.add(SleepAddOne(x=taskb.out), name="taskd") + return taskc.out, taskd.out + + wf = Workflow(x=5, y=10) + + with Submitter(worker=any_worker, cache_root=tmpdir) as sub: + results = sub(wf) + + assert not results.errored, " ".join(results.errors["error message"]) + outputs = results.outputs + assert outputs.out1 == 7 + assert outputs.out2 == 12 + + +def test_concurrent_wf_nprocs(tmpdir): + # concurrent workflow + # setting n_procs in Submitter that is passed to the worker + # A --> C + # B --> D + @workflow.define(outputs=["out1", "out2"]) + def 
Workflow(x, y): + taska = workflow.add(SleepAddOne(x=x), name="taska") + taskb = workflow.add(SleepAddOne(x=y), name="taskb") + taskc = workflow.add(SleepAddOne(x=taska.out), name="taskc") + taskd = workflow.add(SleepAddOne(x=taskb.out), name="taskd") + return taskc.out, taskd.out + + wf = Workflow(x=5, y=10) + with Submitter(worker="cf", n_procs=2, cache_root=tmpdir) as sub: + res = sub(wf) + + assert not res.errored, " ".join(res.errors["error message"]) + outputs = res.outputs + assert outputs.out1 == 7 + assert outputs.out2 == 12 + + +def test_wf_in_wf(any_worker, tmpdir): + """WF(A --> SUBWF(A --> B) --> B)""" + + # workflow task + @workflow.define + def SubWf(x): + sub_a = workflow.add(SleepAddOne(x=x), name="sub_a") + sub_b = workflow.add(SleepAddOne(x=sub_a.out), name="sub_b") + return sub_b.out + + @workflow.define + def WfInWf(x): + a = workflow.add(SleepAddOne(x=x), name="a") + subwf = workflow.add(SubWf(x=a.out), name="subwf") + b = workflow.add(SleepAddOne(x=subwf.out), name="b") + return b.out + + wf = WfInWf(x=3) + + with Submitter(worker=any_worker, cache_root=tmpdir) as sub: + results = sub(wf) + + assert not results.errored, " ".join(results.errors["error message"]) + outputs = results.outputs + assert outputs.out == 7 + + +@pytest.mark.flaky(reruns=2) # when dask +def test_wf2(any_worker, tmpdir): + """workflow as a node + workflow-node with one task and no splitter + """ + + @workflow.define + def Wfnd(x): + add2 = workflow.add(SleepAddOne(x=x)) + return add2.out + + @workflow.define + def Workflow(x): + wfnd = workflow.add(Wfnd(x=x)) + return wfnd.out + + wf = Workflow(x=2) + + with Submitter(worker=any_worker, cache_root=tmpdir) as sub: + res = sub(wf) + + assert res.outputs.out == 3 + + +@pytest.mark.flaky(reruns=2) # when dask +def test_wf_with_state(any_worker, tmpdir): + @workflow.define + def Workflow(x): + taska = workflow.add(SleepAddOne(x=x), name="taska") + taskb = workflow.add(SleepAddOne(x=taska.out), name="taskb") + return taskb.out + + wf = Workflow().split(x=[1, 2, 3]) + + with Submitter(cache_root=tmpdir, worker=any_worker) as sub: + res = sub(wf) + + assert res.outputs.out[0] == 3 + assert res.outputs.out[1] == 4 + assert res.outputs.out[2] == 5 + + +def test_debug_wf(): + # Use serial any_worker to execute workflow instead of CF + wf = BasicWorkflow(x=5) + outputs = wf(worker="debug") + assert outputs.out == 9 + + +@need_slurm +def test_slurm_wf(tmpdir): + wf = BasicWorkflow(x=1) + # submit workflow and every task as slurm job + with Submitter(worker="slurm", cache_root=tmpdir) as sub: + res = sub(wf) + + outputs = res.outputs + assert outputs.out == 5 + script_dir = tmpdir / "slurm_scripts" + assert script_dir.exists() + # ensure each task was executed with slurm + assert len([sd for sd in script_dir.listdir() if sd.isdir()]) == 2 + + +@pytest.mark.skip( + reason=( + "There currently isn't a way to specify a worker to run a whole workflow within " + "a single SLURM job" + ) +) +@need_slurm +def test_slurm_wf_cf(tmpdir): + # submit entire workflow as single job executing with cf worker + wf = BasicWorkflow(x=1) + with Submitter(worker="slurm", cache_root=tmpdir) as sub: + res = sub(wf) + outputs = res.outputs + assert outputs.out == 5 + script_dir = tmpdir / "slurm_scripts" + assert script_dir.exists() + # ensure only workflow was executed with slurm + sdirs = [sd for sd in script_dir.listdir() if sd.isdir()] + assert len(sdirs) == 1 + # slurm scripts should be in the dirs that are using uid in the name + assert sdirs[0].basename == wf.uid + + 
+@need_slurm +def test_slurm_wf_state(tmpdir): + wf = BasicWorkflow().split(x=[5, 6]) + with Submitter(worker="slurm", cache_root=tmpdir) as sub: + res = sub(wf) + + assert res.outputs.out == [9, 10] + script_dir = tmpdir / "slurm_scripts" + assert script_dir.exists() + sdirs = [sd for sd in script_dir.listdir() if sd.isdir()] + assert len(sdirs) == 2 * len(wf.x) + + +@need_slurm +@pytest.mark.flaky(reruns=3) +def test_slurm_max_jobs(tmp_path): + @workflow.define(outputs=["out1", "out2"]) + def Workflow(x, y): + taska = workflow.add(SleepAddOne(x=x), name="taska") + taskb = workflow.add(SleepAddOne(x=y), name="taskb") + taskc = workflow.add(SleepAddOne(x=taska.out), name="taskc") + taskd = workflow.add(SleepAddOne(x=taskb.out), name="taskd") + return taskc.out, taskd.out + + wf = Workflow(x=5, y=10) + + with Submitter(worker="slurm", cache_root=tmp_path, max_concurrent=1) as sub: + res = sub(wf) + + assert not res.errored, " ".join(res.errors["error message"]) + + jobids = [] + time.sleep(0.5) # allow time for sacct to collect itself + for fl in (tmp_path / "slurm_scripts").glob("**/slurm-*.out"): + jid = re.search(r"(?<=slurm-)\d+", str(fl)) + assert jid.group() + jobids.append(jid.group()) + time.sleep(0.2) + del jid + with open(fl, "r") as f: + print(f.read()) + + assert jobids + + # query sacct for job eligibility timings + queued = [] + for jid in sorted(jobids): + out = sp.run(["sacct", "-Xnj", jid, "-o", "Eligible"], capture_output=True) + et = out.stdout.decode().strip() + queued.append(parser.parse(et)) + del out, et + + # compare timing between queued jobs + prev = None + for et in sorted(queued, reverse=True): + if prev is None: + prev = et + continue + assert (prev - et).seconds >= 2 + + +@need_slurm +def test_slurm_args_1(tmpdir): + """testing sbatch_args provided to the submitter""" + task = SleepAddOne(x=1) + # submit workflow and every task as slurm job + with Submitter(worker="slurm", cache_root=tmpdir, sbatch_args="-N1") as sub: + res = sub(task) + + assert res.outputs.out == 2 + script_dir = tmpdir / "slurm_scripts" + assert script_dir.exists() + + +@need_slurm +def test_slurm_args_2(tmpdir): + """testing sbatch_args provided to the submitter + exception should be raised for invalid options + """ + task = SleepAddOne(x=1) + # submit workflow and every task as slurm job + with pytest.raises(RuntimeError, match="Error returned from sbatch:"): + with Submitter( + worker="slurm", cache_root=tmpdir, sbatch_args="-N1 --invalid" + ) as sub: + sub(task) + + +@need_singularity +@need_slurm +@pytest.mark.skip(reason="TODO, xfail incorrect") +@pytest.mark.xfail( + reason="slurm can complain if the number of submitted jobs exceeds the limit" +) +@pytest.mark.parametrize("n", [10, 50, 100]) +def test_singularity_st_2(tmp_path, n): + """splitter over args (checking bigger splitters if slurm available)""" + args_n = list(range(n)) + image = "docker://alpine" + Singu = shell.define("echo") + singu = Singu().split("args", args=args_n) + with Submitter( + worker="slurm", + environment=singularity.Environment(image=image), + cache_root=tmp_path, + ) as sub: + res = sub(singu) + + assert "1" in res.outputs.stdout[1] + assert str(n - 1) in res.outputs.stdout[-1] + assert res.outputs.return_code[0] == res.outputs.return_code[1] == 0 + + +@python.define +def Sleep(x, job_name_part): + time.sleep(x) + import subprocess as sp + + # getting the job_id of the first job that sleeps + job_id = 999 + while job_id != "": + time.sleep(3) + id_p1 = sp.Popen(["squeue"], stdout=sp.PIPE) + id_p2 = 
sp.Popen(["grep", job_name_part], stdin=id_p1.stdout, stdout=sp.PIPE) + id_p3 = sp.Popen(["awk", "{print $1}"], stdin=id_p2.stdout, stdout=sp.PIPE) + job_id = id_p3.communicate()[0].decode("utf-8").strip() + + return x + + +@python.define +def Cancel(job_name_part): + import subprocess as sp + + # getting the job_id of the first job that sleeps + job_id = "" + while job_id == "": + time.sleep(1) + id_p1 = sp.Popen(["squeue"], stdout=sp.PIPE) + id_p2 = sp.Popen(["grep", job_name_part], stdin=id_p1.stdout, stdout=sp.PIPE) + id_p3 = sp.Popen(["awk", "{print $1}"], stdin=id_p2.stdout, stdout=sp.PIPE) + job_id = id_p3.communicate()[0].decode("utf-8").strip() + + # # canceling the job + proc = sp.run(["scancel", job_id, "--verbose"], stdout=sp.PIPE, stderr=sp.PIPE) + # cancelling the job returns message in the sterr + return proc.stderr.decode("utf-8").strip() + + +@pytest.mark.skip(reason="this test is hanging, need to work out why") +@pytest.mark.flaky(reruns=1) +@need_slurm +def test_slurm_cancel_rerun_1(tmpdir): + """testing that tasks run with slurm is re-queue + Running wf with 2 tasks, one sleeps and the other trying to get + job_id of the first task and cancel it. + The first job should be re-queue and finish without problem. + (possibly has to be improved, in theory cancel job might finish before cancel) + """ + + @workflow.define(outputs=["out", "canc_out"]) + def Workflow(x, job_name_cancel, job_name_resqueue): + sleep1 = workflow.add(Sleep(x=x, job_name_part=job_name_cancel), name="sleep1") + cancel1 = workflow.add(Cancel(job_name_part=job_name_resqueue), name="cancel1") + return sleep1.out, cancel1.out + + wf = Workflow(x=10, job_name_resqueue="sleep1", job_name_cancel="cancel1") + + with Submitter(worker="slurm", cache_root=tmpdir) as sub: + res = sub(wf) + + outputs = res.outputs + assert outputs.out == 10 + # checking if indeed the sleep-task job was cancelled by cancel-task + assert "Terminating" in outputs.canc_out + assert "Invalid" not in outputs.canc_out + script_dir = tmpdir / "slurm_scripts" + assert script_dir.exists() + + +@pytest.mark.flaky(reruns=1) +@need_slurm +def test_slurm_cancel_rerun_2(tmpdir): + """testing that tasks run with slurm that has --no-requeue + Running wf with 2 tasks, one sleeps and the other gets + job_id of the first task and cancel it. + The first job is not able t be rescheduled and the error is returned. 
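Requeueing can also be disabled from the caller's side by forwarding `--no-requeue` through `sbatch_args`, as the test below does. A minimal sketch (hypothetical cache path, requires a SLURM cluster):

from pydra.engine.submitter import Submitter
from pydra.engine.tests.utils import BasicWorkflow

with Submitter(
    worker="slurm", cache_root="/shared/pydra-cache", sbatch_args="--no-requeue"
) as sub:
    res = sub(BasicWorkflow(x=1))  # completes normally unless the job is preempted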
+    """
+
+    @workflow.define(outputs=["out", "canc_out"])
+    def Workflow(x, job_name):
+        sleep2 = workflow.add(Sleep(x=x, job_name_part=job_name), name="sleep2")
+        cancel2 = workflow.add(Cancel(job_name_part=job_name), name="cancel2")
+        return sleep2.out, cancel2.out
+
+    wf = Workflow(x=10, job_name="sleep2")
+
+    with pytest.raises(Exception):
+        with Submitter(
+            worker="slurm", cache_root=tmpdir, sbatch_args="--no-requeue"
+        ) as sub:
+            sub(wf, raise_errors=True)
+
+
+@need_sge
+def test_sge_wf(tmpdir):
+    """testing that a basic workflow can be run with the SGEWorker"""
+    wf = BasicWorkflow(x=1)
+    # submit workflow and every task as sge job
+    with Submitter(worker="sge", cache_root=tmpdir) as sub:
+        res = sub(wf)
+
+    outputs = res.outputs
+    assert outputs.out == 9
+    script_dir = tmpdir / "sge_scripts"
+    assert script_dir.exists()
+    # ensure each task was executed with sge
+    assert len([sd for sd in script_dir.listdir() if sd.isdir()]) == 2
+
+
+@need_sge
+def test_sge_wf_cf(tmp_path):
+    """testing the SGEWorker can submit SGE tasks while the workflow
+    uses the concurrent futures any_worker"""
+    # submit entire workflow as single job executing with cf worker
+    wf = BasicWorkflow(x=1)
+    with Submitter(worker="sge", cache_root=tmp_path) as sub:
+        res = sub(wf)
+    outputs = res.outputs
+    assert outputs.out == 9
+    script_dir = tmp_path / "sge_scripts"
+    assert script_dir.exists()
+    # ensure only the workflow itself was submitted as an sge job
+    sdirs = [sd for sd in script_dir.listdir() if sd.isdir()]
+    assert len(sdirs) == 1
+    # the sge script should be in a directory named after the workflow uid
+    assert Path(sdirs[0]).name == wf.uid
+
+
+@need_sge
+def test_sge_wf_state(tmpdir):
+    """testing the SGEWorker can be used with a workflow with state"""
+    wf = BasicWorkflow().split(x=[5, 6])
+    with Submitter(worker="sge", cache_root=tmpdir) as sub:
+        res = sub(wf)
+    assert res.outputs.out[0] == 9
+    assert res.outputs.out[1] == 10
+    script_dir = tmpdir / "sge_scripts"
+    assert script_dir.exists()
+    sdirs = [sd for sd in script_dir.listdir() if sd.isdir()]
+    assert len(sdirs) == 2 * len(wf.x)
+
+
+def qacct_output_to_dict(qacct_output):
+    stdout_dict = {}
+    for line in qacct_output.splitlines():
+        key_value = line.split(None, 1)
+        if key_value[0] not in stdout_dict:
+            stdout_dict[key_value[0]] = []
+        if len(key_value) > 1:
+            stdout_dict[key_value[0]].append(key_value[1])
+        else:
+            stdout_dict[key_value[0]].append(None)
+
+    print(stdout_dict)
+    return stdout_dict
+
+
+@need_sge
+def test_sge_set_threadcount(tmpdir):
+    """testing the number of threads for an SGEWorker task can be set
+    using the input_spec variable sgeThreads"""
+    wf = BasicWorkflowWithThreadCount(x=5)
+
+    jobids = []
+    with Submitter(worker="sge", cache_root=tmpdir) as sub:
+        sub(wf)
+        jobids = list(sub.worker.jobid_by_task_uid.values())
+        jobids.sort()
+
+    print(f"jobids: {jobids}")
+
+    out_job0 = (
+        sp.run(["qacct", "-j", jobids[0]], capture_output=True).stdout.decode().strip()
+    )
+    out_job1 = (
+        sp.run(["qacct", "-j", jobids[1]], capture_output=True).stdout.decode().strip()
+    )
+
+    out_job0_dict = qacct_output_to_dict(out_job0)
+    out_job1_dict = qacct_output_to_dict(out_job1)
+
+    assert int(out_job0_dict["slots"][0]) == 4
+    assert int(out_job1_dict["slots"][0]) == 1
+
+
+@need_sge
+def test_sge_limit_maxthreads(tmpdir):
+    """testing the ability to limit the number of threads used by SGE
+    at one time with the max_threads argument to SGEWorker"""
+    wf = BasicWorkflowWithThreadCountConcurrent().split(x=[5, 6])
+
+    jobids = []
+    with 
Submitter(worker="sge", max_threads=8, cache_root=tmpdir) as sub: + sub(wf) + jobids = list(sub.worker.jobid_by_task_uid.values()) + jobids.sort() + + out_job0 = ( + sp.run(["qacct", "-j", jobids[0]], capture_output=True).stdout.decode().strip() + ) + out_job1 = ( + sp.run(["qacct", "-j", jobids[1]], capture_output=True).stdout.decode().strip() + ) + out_job2 = ( + sp.run(["qacct", "-j", jobids[2]], capture_output=True).stdout.decode().strip() + ) + out_job3 = ( + sp.run(["qacct", "-j", jobids[3]], capture_output=True).stdout.decode().strip() + ) + + qacct_output_to_dict(out_job0) + out_job1_dict = qacct_output_to_dict(out_job1) + out_job2_dict = qacct_output_to_dict(out_job2) + qacct_output_to_dict(out_job3) + + job_1_endtime = datetime.strptime( + out_job1_dict["end_time"][0], "%a %b %d %H:%M:%S %Y" + ) + # Running both task_1_1 and task_1_2 at once would exceed max_threads, + # so task_1_2 waits for task_1_1 to complete + job_2_starttime = datetime.strptime( + out_job2_dict["start_time"][0], "%a %b %d %H:%M:%S %Y" + ) + assert job_1_endtime < job_2_starttime + + +@need_sge +def test_sge_no_limit_maxthreads(tmpdir): + """testing unlimited threads can be used at once by SGE + when max_threads is not set""" + wf = BasicWorkflowWithThreadCountConcurrent().split(x=[5, 6]) + + jobids = [] + with Submitter(worker="sge", max_threads=None, cache_root=tmpdir) as sub: + sub(wf) + jobids = list(sub.worker.jobid_by_task_uid.values()) + jobids.sort() + + out_job0 = ( + sp.run(["qacct", "-j", jobids[0]], capture_output=True).stdout.decode().strip() + ) + out_job1 = ( + sp.run(["qacct", "-j", jobids[1]], capture_output=True).stdout.decode().strip() + ) + out_job2 = ( + sp.run(["qacct", "-j", jobids[2]], capture_output=True).stdout.decode().strip() + ) + + qacct_output_to_dict(out_job0) + out_job1_dict = qacct_output_to_dict(out_job1) + out_job2_dict = qacct_output_to_dict(out_job2) + + job_1_endtime = datetime.strptime( + out_job1_dict["end_time"][0], "%a %b %d %H:%M:%S %Y" + ) + # Running both task_1_1 and task_1_2 at once would not exceed max_threads, + # so task_1_2 does not wait for task_1_1 to complete + job_2_starttime = datetime.strptime( + out_job2_dict["start_time"][0], "%a %b %d %H:%M:%S %Y" + ) + assert job_1_endtime > job_2_starttime + + +def test_hash_changes_in_task_inputs_file(tmp_path): + @python.define + def cache_dir_as_input(out_dir: Directory) -> Directory: + (out_dir.fspath / "new-file.txt").touch() + return out_dir + + task = cache_dir_as_input(out_dir=tmp_path) + with pytest.raises(RuntimeError, match="Input field hashes have changed"): + task(cache_root=tmp_path) + + +def test_hash_changes_in_task_inputs_unstable(tmp_path): + @attrs.define + class Unstable: + value: int # type: ignore + + def __bytes_repr__(self, cache) -> ty.Iterator[bytes]: + """Random 128-bit bytestring""" + yield secrets.token_bytes(16) + + @python.define + def unstable_input(unstable: Unstable) -> int: + return unstable.value + + task = unstable_input(unstable=Unstable(1)) + with pytest.raises(RuntimeError, match="Input field hashes have changed"): + task(cache_root=tmp_path) + + +def test_hash_changes_in_workflow_inputs(tmp_path): + @python.define + def OutputDirAsOutput(out_dir: Path) -> Directory: + (out_dir / "new-file.txt").touch() + return out_dir + + @workflow.define(outputs=["out_dir"]) + def Workflow(in_dir: Directory): + task = workflow.add(OutputDirAsOutput(out_dir=in_dir), name="task") + return task.out + + in_dir = tmp_path / "in_dir" + in_dir.mkdir() + cache_root = tmp_path / "cache_root" + 
cache_root.mkdir() + + wf = Workflow(in_dir=in_dir) + with pytest.raises(RuntimeError, match="Input field hashes have changed.*"): + wf(cache_root=cache_root) + + +@python.define +def to_tuple(x, y): + return (x, y) + + +class BYOAddVarWorker(debug.Worker): + """A dummy worker that adds 1 to the output of the task""" + + _plugin_name = "byo_add_env_var" + + def __init__(self, add_var, **kwargs): + super().__init__(**kwargs) + self.add_var = add_var + + def run( + self, + task: "Job", + rerun: bool = False, + ) -> "Result": + with patch.dict(os.environ, {"BYO_ADD_VAR": str(self.add_var)}): + return super().run(task, rerun) + + +@python.define +def AddEnvVarTask(x: int) -> int: + return x + int(os.environ.get("BYO_ADD_VAR", 0)) + + +def test_byo_worker(tmp_path): + + task1 = AddEnvVarTask(x=1) + + with Submitter(worker=BYOAddVarWorker, add_var=10, cache_root=tmp_path) as sub: + assert sub.worker.plugin_name() == "byo_add_env_var" + result = sub(task1) + + assert result.outputs.out == 11 + + task2 = AddEnvVarTask(x=2) + + new_cache_root = tmp_path / "new" + + with Submitter(worker="debug", cache_root=new_cache_root) as sub: + result = sub(task2) + + assert result.outputs.out == 2 + + +def test_bad_builtin_worker(): + + with pytest.raises(ValueError, match="No worker matches 'bad-worker'"): + Submitter(worker="bad-worker") + + +def test_bad_byo_worker1(): + + import pydra.workers.base as base + + class BadWorker(base.Worker): + + def run(self, task: Job, rerun: bool = False) -> Result: + pass + + with pytest.raises(ValueError, match="Cannot infer plugin name of Worker "): + Submitter(worker=BadWorker) + + +def test_bad_byo_worker2(): + + class BadWorker: + pass + + with pytest.raises( + TypeError, + match="Worker must be a Worker object, name of a worker or a Worker class", + ): + Submitter(worker=BadWorker) diff --git a/pydra/workers/tests/test_worker_cf.py b/pydra/workers/tests/test_worker_cf.py new file mode 100644 index 0000000000..f7e1d905bf --- /dev/null +++ b/pydra/workers/tests/test_worker_cf.py @@ -0,0 +1,37 @@ +import os +import shutil +from pathlib import Path +import random +import platform +import typing as ty +import pytest +import cloudpickle as cp +from pydra.engine.submitter import Submitter +from pydra.engine.job import Job +from pydra.compose import workflow +from fileformats.generic import Directory, File +from pydra.engine.tests.utils import Multiply, RaiseXeq1 +from pydra.utils.general import position_sort +from pydra.compose.shell.templating import parse_format_string +from pydra.engine.job import save, load_and_run +from pydra.workers.cf import get_available_cpus +from pydra.utils.hash import hash_function + + +def test_get_available_cpus(): + assert get_available_cpus() > 0 + try: + import psutil + + has_psutil = True + except ImportError: + has_psutil = False + + if hasattr(os, "sched_getaffinity"): + assert get_available_cpus() == len(os.sched_getaffinity(0)) + + if has_psutil and platform.system().lower() != "darwin": + assert get_available_cpus() == len(psutil.Process().cpu_affinity()) + + if platform.system().lower() == "darwin": + assert get_available_cpus() == os.cpu_count() diff --git a/pyproject.toml b/pyproject.toml index ba862339cd..1315b1558c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,36 +1,26 @@ [build-system] -requires = ["flit_scm"] -build-backend = "flit_scm:buildapi" +requires = ["hatchling", "hatch-vcs"] +build-backend = "hatchling.build" [project] name = "pydra" description = "Pydra dataflow engine" readme = "README.rst" -requires-python 
= ">=3.8" +requires-python = ">=3.11" dependencies = [ - "attrs >=19.1.0", + "attrs >=24.2.0", "cloudpickle >=2.0.0", "etelemetry >=0.2.2", "filelock >=3.0.0", - "fileformats >=0.8", - "importlib_resources >=5.7; python_version < '3.11'", + "fileformats >=0.15a4", "platformdirs >=2", - "typing_extensions >=4.6.3; python_version < '3.10'", - "typing_utils >=0.1.0; python_version < '3.10'", -] -license = {file = "LICENSE"} -authors = [ - {name = "Nipype developers", email = "neuroimaging@python.org"}, ] +license = { file = "LICENSE" } +authors = [{ name = "Nipype developers", email = "neuroimaging@python.org" }] maintainers = [ - {name = "Nipype developers", email = "neuroimaging@python.org"}, -] -keywords = [ - "brainweb", - "dataflow", - "neuroimaging", - "pydra", + { name = "Nipype developers", email = "neuroimaging@python.org" }, ] +keywords = ["brainweb", "dataflow", "neuroimaging", "pydra"] classifiers = [ "Development Status :: 3 - Alpha", "Environment :: Console", @@ -39,31 +29,39 @@ classifiers = [ "Operating System :: MacOS :: MacOS X", "Operating System :: Microsoft :: Windows", "Operating System :: POSIX :: Linux", - "Programming Language :: Python :: 3.7", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", "Topic :: Scientific/Engineering", ] dynamic = ["version"] [project.optional-dependencies] -psij = [ - "psij-python", -] -dask = [ - "dask", - "distributed", -] -dev = [ - "black", - "pre-commit", - "pydra[test]", -] +dev = ["black", "pre-commit", "pydra[test]", "matplotlib"] doc = [ + "fileformats-extras >= v0.15.0a6", + "fileformats-medimage >= v0.10.0a2", + "fileformats-medimage-extras >= v0.10.0a2", + "furo>=2022.2.14.1", + "ipython", + "ipykernel", + "ipywidgets", + "matplotlib", + "nbsphinx", + "nest_asyncio", + "nibabel", + "nilearn", + "numpy", + "numpydoc>=0.6.0", + "openneuro-py", "packaging", - "sphinx ==6.2.1", + "pandas", + "pandoc", + "pydra-mrtrix3 >=3.0.4a17", + "scipy", + "sphinx", + "sphinx-argparse", + "sphinx-click", "sphinx_rtd_theme", "sphinxcontrib-apidoc ~=0.3.0", "sphinxcontrib-versioning", @@ -75,17 +73,37 @@ test = [ "pytest-xdist <2.0", "pytest-rerunfailures", "pytest-timeout", + "pympler", "codecov", + "fileformats-extras >=0.15.0a6", "numpy", "pyld", "psutil", "python-dateutil", "tornado", - "boutiques", "pympler", ] -jupyter = [ - "nest_asyncio" +tutorial = [ + "fileformats-extras >= v0.15.0a6", + "fileformats-medimage >= v0.10.0a2", + "fileformats-medimage-extras >= v0.10.0a2", + "jupyter", + "jupyter_contrib_nbextensions", + "jupytext", + "jupyterlab", + "matplotlib", + "nbformat", + "nbval", + "nest_asyncio", + "nibabel", + "nilearn", + "numpy", + "openneuro-py", + "pandas", + "psutil", + "pydra-mrtrix3 >=3.0.4a17", + "scipy", + "sh", ] # Aliases tests = ["pydra[test]"] @@ -97,18 +115,23 @@ documentation = "https://nipype.github.io/pydra/" homepage = "https://nipype.github.io/pydra/" repository = "https://github.com/nipype/pydra.git" -[tool.flit.module] -name = "pydra" +[tool.hatch.build] +packages = ["pydra"] +exclude = ["tests"] +include = ["./pydra"] + +[tool.hatch.version] +source = "vcs" -[tool.flit.sdist] -exclude = [".gitignore"] +[tool.hatch.build.hooks.vcs] +version-file = "pydra/utils/_version.py" -[tool.setuptools_scm] -write_to = "pydra/_version.py" +[tool.hatch.metadata] +allow-direct-references = true [tool.black] 
-target-version = ['py37', 'py38'] -exclude = "pydra/_version.py" +target-version = ['py38'] +exclude = "pydra/utils/_version.py" [tool.codespell] -ignore-words-list = "nd,afile,inpt" +ignore-words-list = "nd,afile,inpt,fpr" diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000000..1fd101fd6a --- /dev/null +++ b/pytest.ini @@ -0,0 +1,2 @@ +[pytest] +addopts = --import-mode=importlib -vv diff --git a/tutorial/README.md b/tutorial/README.md deleted file mode 100644 index 4df55ac5a0..0000000000 --- a/tutorial/README.md +++ /dev/null @@ -1,5 +0,0 @@ -# Pydra Tutorial - -Python Tutorial has been moved to a separate [GitHub repository](https://github.com/nipype/pydra-tutorial). - -The interactive tutorial is available at [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/nipype/pydra-tutorial/master?filepath=notebooks)