From 07fcb818cb0d94a37e943b63cf79cc7426576916 Mon Sep 17 00:00:00 2001 From: Alex Sherstinsky Date: Fri, 8 Mar 2024 12:19:48 -0800 Subject: [PATCH 01/36] Troubleshooting test failures. --- .github/workflows/pytest.yml | 559 +++++++++++---------- .pre-commit-config.yaml | 18 +- setup.cfg | 8 +- tests/conftest.py | 3 + tests/integration_tests/tests_f_control.py | 210 ++++++++ 5 files changed, 510 insertions(+), 288 deletions(-) create mode 100644 tests/integration_tests/tests_f_control.py diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index ad487232b46..33ba9b0a67c 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -16,186 +16,188 @@ concurrency: cancel-in-progress: true jobs: - pytest: - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - os: [ubuntu-latest] - python-version: ["3.8", "3.9", "3.10"] - test-markers: ["not distributed", "distributed"] - include: - - python-version: "3.8" - pytorch-version: 2.0.0 - torchscript-version: 1.10.2 - ray-version: 2.3.1 - - python-version: "3.9" - pytorch-version: 2.1.1 - torchscript-version: 1.10.2 - ray-version: 2.3.1 - - python-version: "3.10" - # pytorch-version: nightly - pytorch-version: 2.2.1 - torchscript-version: 1.10.2 - ray-version: 2.3.1 - env: - PYTORCH: ${{ matrix.pytorch-version }} - MARKERS: ${{ matrix.test-markers }} - NEUROPOD_BASE_DIR: "/usr/local/lib/neuropod" - NEUROPOD_VERISON: "0.3.0-rc6" - TORCHSCRIPT_VERSION: ${{ matrix.torchscript-version }} - RAY_VERSION: ${{ matrix.ray-version }} - AWS_ACCESS_KEY_ID: ${{ secrets.LUDWIG_TESTS_AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.LUDWIG_TESTS_AWS_SECRET_ACCESS_KEY }} - KAGGLE_USERNAME: ${{ secrets.KAGGLE_USERNAME }} - KAGGLE_KEY: ${{ secrets.KAGGLE_KEY }} - IS_NOT_FORK: ${{ !(github.event.pull_request.base.repo.full_name == 'ludwig-ai/ludwig' && github.event.pull_request.head.repo.fork) }} + # TODO: ALEX + # pytest: + # runs-on: ${{ matrix.os }} + # strategy: + # fail-fast: false + # matrix: + # os: [ubuntu-latest] + # python-version: ["3.8", "3.9", "3.10"] + # test-markers: ["not distributed", "distributed"] + # include: + # - python-version: "3.8" + # pytorch-version: 2.0.0 + # torchscript-version: 1.10.2 + # ray-version: 2.3.1 + # - python-version: "3.9" + # pytorch-version: 2.1.1 + # torchscript-version: 1.10.2 + # ray-version: 2.3.1 + # - python-version: "3.10" + # # pytorch-version: nightly + # pytorch-version: 2.2.1 + # torchscript-version: 1.10.2 + # ray-version: 2.3.1 + # env: + # PYTORCH: ${{ matrix.pytorch-version }} + # MARKERS: ${{ matrix.test-markers }} + # NEUROPOD_BASE_DIR: "/usr/local/lib/neuropod" + # NEUROPOD_VERISON: "0.3.0-rc6" + # TORCHSCRIPT_VERSION: ${{ matrix.torchscript-version }} + # RAY_VERSION: ${{ matrix.ray-version }} + # AWS_ACCESS_KEY_ID: ${{ secrets.LUDWIG_TESTS_AWS_ACCESS_KEY_ID }} + # AWS_SECRET_ACCESS_KEY: ${{ secrets.LUDWIG_TESTS_AWS_SECRET_ACCESS_KEY }} + # KAGGLE_USERNAME: ${{ secrets.KAGGLE_USERNAME }} + # KAGGLE_KEY: ${{ secrets.KAGGLE_KEY }} + # IS_NOT_FORK: ${{ !(github.event.pull_request.base.repo.full_name == 'ludwig-ai/ludwig' && github.event.pull_request.head.repo.fork) }} + + # name: py${{ matrix.python-version }}, torch-${{ matrix.pytorch-version }}, ${{ matrix.test-markers }}, ${{ matrix.os }}, ray ${{ matrix.ray-version }} + # services: + # minio: + # image: fclairamb/minio-github-actions + # env: + # MINIO_ACCESS_KEY: minio + # MINIO_SECRET_KEY: minio123 + # ports: + # - 9000:9000 - name: py${{ matrix.python-version }}, torch-${{ matrix.pytorch-version }}, ${{ matrix.test-markers }}, ${{ matrix.os }}, ray ${{ matrix.ray-version }} - services: - minio: - image: fclairamb/minio-github-actions - env: - MINIO_ACCESS_KEY: minio - MINIO_SECRET_KEY: minio123 - ports: - - 9000:9000 + # timeout-minutes: 150 + # steps: + # - name: Setup ludwigai/ludwig-ray container for local testing with act. + # if: ${{ env.ACT }} + # run: | + # curl -fsSL https://deb.nodesource.com/setup_16.x | sudo -E bash - + # sudo apt-get install -y nodejs + # sudo mkdir -p /opt/hostedtoolcache/ + # sudo chmod 777 -R /opt/hostedtoolcache/ + # - uses: actions/checkout@v2 + # - name: Set up Python ${{ matrix.python-version }} + # uses: actions/setup-python@v2 + # with: + # python-version: ${{ matrix.python-version }} - timeout-minutes: 150 - steps: - - name: Setup ludwigai/ludwig-ray container for local testing with act. - if: ${{ env.ACT }} - run: | - curl -fsSL https://deb.nodesource.com/setup_16.x | sudo -E bash - - sudo apt-get install -y nodejs - sudo mkdir -p /opt/hostedtoolcache/ - sudo chmod 777 -R /opt/hostedtoolcache/ - - uses: actions/checkout@v2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} + # - name: Setup Linux + # if: runner.os == 'linux' + # run: | + # sudo apt-get update && sudo apt-get install -y cmake libsndfile1 wget libsox-dev - - name: Setup Linux - if: runner.os == 'linux' - run: | - sudo apt-get update && sudo apt-get install -y cmake libsndfile1 wget libsox-dev + # - name: Setup macOS + # if: runner.os == 'macOS' + # run: | + # brew install libuv - - name: Setup macOS - if: runner.os == 'macOS' - run: | - brew install libuv + # - name: pip cache + # if: ${{ !env.ACT }} + # uses: actions/cache@v2 + # with: + # path: ~/.cache/pip + # key: ${{ runner.os }}-pip-py${{ matrix.python-version }}-torch${{ matrix.pytorch-version }}-${{ matrix.test-markers }}-${{ hashFiles('requirements*.txt', '.github/workflows/pytest.yml') }} - - name: pip cache - if: ${{ !env.ACT }} - uses: actions/cache@v2 - with: - path: ~/.cache/pip - key: ${{ runner.os }}-pip-py${{ matrix.python-version }}-torch${{ matrix.pytorch-version }}-${{ matrix.test-markers }}-${{ hashFiles('requirements*.txt', '.github/workflows/pytest.yml') }} + # - name: Debug out of space + # run: | + # du -h -d 1 ~ + # df -h - - name: Debug out of space - run: | - du -h -d 1 ~ - df -h + # - name: Install dependencies + # run: | + # python --version + # pip --version + # python -m pip install -U pip + # cmake --version - - name: Install dependencies - run: | - python --version - pip --version - python -m pip install -U pip - cmake --version + # # remove torch and ray from the dependencies so we can add them depending on the matrix args for the job. + # cat requirements.txt | sed '/^torch[>=<\b]/d' | sed '/^torchtext/d' | sed '/^torchvision/d' | sed '/^torchaudio/d' > requirements-temp && mv requirements-temp requirements.txt + # cat requirements_distributed.txt | sed '/^ray[\[]/d' - # remove torch and ray from the dependencies so we can add them depending on the matrix args for the job. - cat requirements.txt | sed '/^torch[>=<\b]/d' | sed '/^torchtext/d' | sed '/^torchvision/d' | sed '/^torchaudio/d' > requirements-temp && mv requirements-temp requirements.txt - cat requirements_distributed.txt | sed '/^ray[\[]/d' + # if [ "$MARKERS" != "distributed" ]; then + # # Skip distributed and hyperopt requirements to test optional imports + # echo > requirements-temp && mv requirements-temp requirements_distributed.txt + # echo > requirements-temp && mv requirements-temp requirements_hyperopt.txt - if [ "$MARKERS" != "distributed" ]; then - # Skip distributed and hyperopt requirements to test optional imports - echo > requirements-temp && mv requirements-temp requirements_distributed.txt - echo > requirements-temp && mv requirements-temp requirements_hyperopt.txt - - # Skip distributed tree requirement (lightgbm-ray) - cat requirements_tree.txt | sed '/^lightgbm-ray/d' > requirements-temp && mv requirements-temp requirements_tree.txt - else - if [ "$RAY_VERSION" == "nightly" ]; then - # NOTE: hardcoded for python 3.10 on Linux - echo "ray[default,data,serve,tune] @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp310-cp310-manylinux2014_x86_64.whl" >> requirements_distributed.txt - else - echo "ray[default,data,serve,tune]==$RAY_VERSION" >> requirements_distributed.txt - fi - fi - - if [ "$PYTORCH" == "nightly" ]; then - extra_index_url=https://download.pytorch.org/whl/nightly/cpu - pip install --pre torch torchtext torchvision torchaudio --extra-index-url $extra_index_url - - else - extra_index_url=https://download.pytorch.org/whl/cpu - pip install torch==$PYTORCH torchtext torchvision torchaudio --extra-index-url $extra_index_url - fi - - pip install '.[test]' --extra-index-url $extra_index_url - pip list + # # Skip distributed tree requirement (lightgbm-ray) + # cat requirements_tree.txt | sed '/^lightgbm-ray/d' > requirements-temp && mv requirements-temp requirements_tree.txt + # else + # if [ "$RAY_VERSION" == "nightly" ]; then + # # NOTE: hardcoded for python 3.10 on Linux + # echo "ray[default,data,serve,tune] @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp310-cp310-manylinux2014_x86_64.whl" >> requirements_distributed.txt + # else + # echo "ray[default,data,serve,tune]==$RAY_VERSION" >> requirements_distributed.txt + # fi + # fi - if [ "$PYTORCH" == "nightly" ]; then - python -c "from packaging import version; import torch; assert version.parse(torch.__version__).release >= version.parse(\"2.0.0\").release, f\"torch {version.parse(torch.__version__).release} < version.parse(\'2.0.0\').release\"" - else - python -c "from packaging import version; import torch; assert version.parse(torch.__version__).release == version.parse(\"$PYTORCH\").release, f\"torch {version.parse(torch.__version__).release} != version.parse(\'$PYTORCH\').release\"" - fi - - if [ "$MARKERS" == "distributed" ]; then - python -c "from packaging import version; import ray; assert version.parse(ray.__version__).release == version.parse(\"$RAY_VERSION\").release, f\"ray {version.parse(ray.__version__).release} != version.parse(\'$RAY_VERSION\').release\"" - else - python -c "import importlib.util; assert importlib.util.find_spec('ray') is None, \"found ray but expected it to not be installed\"" - fi - shell: bash + # if [ "$PYTORCH" == "nightly" ]; then + # extra_index_url=https://download.pytorch.org/whl/nightly/cpu + # pip install --pre torch torchtext torchvision torchaudio --extra-index-url $extra_index_url - - name: Install Neuropod backend - run: | - sudo mkdir -p "$NEUROPOD_BASE_DIR" - curl -L https://github.com/uber/neuropod/releases/download/v${{ env.NEUROPOD_VERISON }}/libneuropod-cpu-linux-v${{ env.NEUROPOD_VERISON }}-torchscript-${{ env.TORCHSCRIPT_VERSION }}-backend.tar.gz | sudo tar -xz -C "$NEUROPOD_BASE_DIR" - shell: bash + # else + # extra_index_url=https://download.pytorch.org/whl/cpu + # pip install torch==$PYTORCH torchtext torchvision torchaudio --extra-index-url $extra_index_url + # fi - - name: Unit Tests - run: | - RUN_PRIVATE=$IS_NOT_FORK LUDWIG_TEST_SUITE_TIMEOUT_S=5400 pytest -v --timeout 300 --durations 100 -m "$MARKERS and not slow and not combinatorial and not horovod and not llm" --junitxml pytest.xml tests/ludwig + # pip install '.[test]' --extra-index-url $extra_index_url + # pip list - - name: Regression Tests - run: | - RUN_PRIVATE=$IS_NOT_FORK LUDWIG_TEST_SUITE_TIMEOUT_S=5400 pytest -v --timeout 300 --durations 100 -m "$MARKERS and not slow and not combinatorial and not horovod or benchmark and not llm" --junitxml pytest.xml tests/regression_tests - - # Skip Horovod and replace with DDP. - # https://github.com/ludwig-ai/ludwig/issues/3468 - # - name: Install Horovod if necessary - # if: matrix.test-markers == 'distributed' && matrix.pytorch-version != 'nightly' - # env: - # HOROVOD_WITH_PYTORCH: 1 - # HOROVOD_WITHOUT_MPI: 1 - # HOROVOD_WITHOUT_TENSORFLOW: 1 - # HOROVOD_WITHOUT_MXNET: 1 - # run: | - # pip install -r requirements_extra.txt - # HOROVOD_BUILT=$(python -c "import horovod.torch; horovod.torch.nccl_built(); print('SUCCESS')" || true) - # if [[ $HOROVOD_BUILT != "SUCCESS" ]]; then - # pip uninstall -y horovod - # pip install --no-cache-dir git+https://github.com/horovod/horovod.git@master - # fi - # horovodrun --check-build - # shell: bash - - # Skip Horovod tests and replace with DDP. - # https://github.com/ludwig-ai/ludwig/issues/3468 - # - name: Horovod Tests - # if: matrix.test-markers == 'distributed' && matrix.pytorch-version != 'nightly' - # run: | - # RUN_PRIVATE=$IS_NOT_FORK LUDWIG_TEST_SUITE_TIMEOUT_S=5400 pytest -v --timeout 300 --durations 100 -m "$MARKERS and horovod and not slow and not combinatorial and not llm" --junitxml pytest.xml tests/ - - - name: Upload Unit Test Results - if: ${{ always() && !env.ACT }} - uses: actions/upload-artifact@v2 - with: - name: Unit Test Results (Python ${{ matrix.python-version }} ${{ matrix.test-markers }}) - path: pytest.xml + # if [ "$PYTORCH" == "nightly" ]; then + # python -c "from packaging import version; import torch; assert version.parse(torch.__version__).release >= version.parse(\"2.0.0\").release, f\"torch {version.parse(torch.__version__).release} < version.parse(\'2.0.0\').release\"" + # else + # python -c "from packaging import version; import torch; assert version.parse(torch.__version__).release == version.parse(\"$PYTORCH\").release, f\"torch {version.parse(torch.__version__).release} != version.parse(\'$PYTORCH\').release\"" + # fi + + # if [ "$MARKERS" == "distributed" ]; then + # python -c "from packaging import version; import ray; assert version.parse(ray.__version__).release == version.parse(\"$RAY_VERSION\").release, f\"ray {version.parse(ray.__version__).release} != version.parse(\'$RAY_VERSION\').release\"" + # else + # python -c "import importlib.util; assert importlib.util.find_spec('ray') is None, \"found ray but expected it to not be installed\"" + # fi + # shell: bash + + # - name: Install Neuropod backend + # run: | + # sudo mkdir -p "$NEUROPOD_BASE_DIR" + # curl -L https://github.com/uber/neuropod/releases/download/v${{ env.NEUROPOD_VERISON }}/libneuropod-cpu-linux-v${{ env.NEUROPOD_VERISON }}-torchscript-${{ env.TORCHSCRIPT_VERSION }}-backend.tar.gz | sudo tar -xz -C "$NEUROPOD_BASE_DIR" + # shell: bash + + # - name: Unit Tests + # run: | + # RUN_PRIVATE=$IS_NOT_FORK LUDWIG_TEST_SUITE_TIMEOUT_S=5400 pytest -v --timeout 300 --durations 100 -m "$MARKERS and not slow and not combinatorial and not horovod and not llm" --junitxml pytest.xml tests/ludwig + + # - name: Regression Tests + # run: | + # RUN_PRIVATE=$IS_NOT_FORK LUDWIG_TEST_SUITE_TIMEOUT_S=5400 pytest -v --timeout 300 --durations 100 -m "$MARKERS and not slow and not combinatorial and not horovod or benchmark and not llm" --junitxml pytest.xml tests/regression_tests + + # # Skip Horovod and replace with DDP. + # # https://github.com/ludwig-ai/ludwig/issues/3468 + # # - name: Install Horovod if necessary + # # if: matrix.test-markers == 'distributed' && matrix.pytorch-version != 'nightly' + # # env: + # # HOROVOD_WITH_PYTORCH: 1 + # # HOROVOD_WITHOUT_MPI: 1 + # # HOROVOD_WITHOUT_TENSORFLOW: 1 + # # HOROVOD_WITHOUT_MXNET: 1 + # # run: | + # # pip install -r requirements_extra.txt + # # HOROVOD_BUILT=$(python -c "import horovod.torch; horovod.torch.nccl_built(); print('SUCCESS')" || true) + # # if [[ $HOROVOD_BUILT != "SUCCESS" ]]; then + # # pip uninstall -y horovod + # # pip install --no-cache-dir git+https://github.com/horovod/horovod.git@master + # # fi + # # horovodrun --check-build + # # shell: bash + + # # Skip Horovod tests and replace with DDP. + # # https://github.com/ludwig-ai/ludwig/issues/3468 + # # - name: Horovod Tests + # # if: matrix.test-markers == 'distributed' && matrix.pytorch-version != 'nightly' + # # run: | + # # RUN_PRIVATE=$IS_NOT_FORK LUDWIG_TEST_SUITE_TIMEOUT_S=5400 pytest -v --timeout 300 --durations 100 -m "$MARKERS and horovod and not slow and not combinatorial and not llm" --junitxml pytest.xml tests/ + + # - name: Upload Unit Test Results + # if: ${{ always() && !env.ACT }} + # uses: actions/upload-artifact@v2 + # with: + # name: Unit Test Results (Python ${{ matrix.python-version }} ${{ matrix.test-markers }}) + # path: pytest.xml + # TODO: ALEX integration-tests: name: ${{ matrix.test-markers }} @@ -204,12 +206,17 @@ jobs: fail-fast: false matrix: test-markers: - - "integration_tests_a" - - "integration_tests_b" - - "integration_tests_c" - - "integration_tests_d" - - "integration_tests_e" - - "integration_tests_f" + # TODO: ALEX + # - "integration_tests_a" + # - "integration_tests_b" + # - "integration_tests_c" + # - "integration_tests_d" + # - "integration_tests_e" + # - "integration_tests_f" + # TODO: ALEX + # TODO: ALEX + - "integration_tests_x" + # TODO: ALEX env: AWS_ACCESS_KEY_ID: ${{ secrets.LUDWIG_TESTS_AWS_ACCESS_KEY_ID }} @@ -265,127 +272,129 @@ jobs: run: | RUN_PRIVATE=$IS_NOT_FORK LUDWIG_TEST_SUITE_TIMEOUT_S=7200 pytest -v --timeout 300 --durations 100 -m "not slow and not combinatorial and not horovod and not llm and $MARKERS" --junitxml pytest.xml tests/integration_tests - llm-tests: - name: LLM Tests - runs-on: ubuntu-latest + # TODO: ALEX + # llm-tests: + # name: LLM Tests + # runs-on: ubuntu-latest - timeout-minutes: 60 - steps: - - uses: actions/checkout@v2 - - name: Set up Python 3.9 - uses: actions/setup-python@v2 - with: - python-version: 3.9 + # timeout-minutes: 60 + # steps: + # - uses: actions/checkout@v2 + # - name: Set up Python 3.9 + # uses: actions/setup-python@v2 + # with: + # python-version: 3.9 - - name: Setup Linux - if: runner.os == 'linux' - run: | - sudo apt-get update && sudo apt-get install -y cmake libsndfile1 + # - name: Setup Linux + # if: runner.os == 'linux' + # run: | + # sudo apt-get update && sudo apt-get install -y cmake libsndfile1 - - name: Setup macOS - if: runner.os == 'macOS' - run: | - brew install libuv + # - name: Setup macOS + # if: runner.os == 'macOS' + # run: | + # brew install libuv - - name: Install dependencies - run: | - python --version - pip --version - python -m pip install -U pip + # - name: Install dependencies + # run: | + # python --version + # pip --version + # python -m pip install -U pip - # remove torch and ray from the dependencies so we can add them depending on the matrix args for the job. - cat requirements.txt | sed '/^torch[>=<\b]/d' | sed '/^torchtext/d' | sed '/^torchvision/d' | sed '/^torchaudio/d' > requirements-temp && mv requirements-temp requirements.txt - cat requirements_distributed.txt | sed '/^ray[\[]/d' - pip install torch==2.0.0 torchtext torchvision torchaudio - pip install ray==2.3.0 - pip install '.[test]' - pip list - shell: bash + # # remove torch and ray from the dependencies so we can add them depending on the matrix args for the job. + # cat requirements.txt | sed '/^torch[>=<\b]/d' | sed '/^torchtext/d' | sed '/^torchvision/d' | sed '/^torchaudio/d' > requirements-temp && mv requirements-temp requirements.txt + # cat requirements_distributed.txt | sed '/^ray[\[]/d' + # pip install torch==2.0.0 torchtext torchvision torchaudio + # pip install ray==2.3.0 + # pip install '.[test]' + # pip list + # shell: bash - - name: LLM Tests - run: | - pytest -vs --durations 100 -m "llm" --junitxml pytest.xml tests + # - name: LLM Tests + # run: | + # pytest -vs --durations 100 -m "llm" --junitxml pytest.xml tests - combinatorial-tests: - name: Combinatorial Tests - runs-on: ubuntu-latest + # combinatorial-tests: + # name: Combinatorial Tests + # runs-on: ubuntu-latest - timeout-minutes: 60 - steps: - - uses: actions/checkout@v2 - - name: Set up Python 3.8 - uses: actions/setup-python@v2 - with: - python-version: 3.8 + # timeout-minutes: 60 + # steps: + # - uses: actions/checkout@v2 + # - name: Set up Python 3.8 + # uses: actions/setup-python@v2 + # with: + # python-version: 3.8 - - name: Setup Linux - if: runner.os == 'linux' - run: | - sudo apt-get update && sudo apt-get install -y cmake libsndfile1 + # - name: Setup Linux + # if: runner.os == 'linux' + # run: | + # sudo apt-get update && sudo apt-get install -y cmake libsndfile1 - - name: Setup macOS - if: runner.os == 'macOS' - run: | - brew install libuv + # - name: Setup macOS + # if: runner.os == 'macOS' + # run: | + # brew install libuv - - name: Install dependencies - run: | - python --version - pip --version - python -m pip install -U pip - pip install '.[test]' - pip list - shell: bash + # - name: Install dependencies + # run: | + # python --version + # pip --version + # python -m pip install -U pip + # pip install '.[test]' + # pip list + # shell: bash - - name: Testing combinatorial config generation code - run: | - pytest -vs --durations 100 -m "combinatorial" --junitxml pytest.xml tests/ludwig/config_sampling + # - name: Testing combinatorial config generation code + # run: | + # pytest -vs --durations 100 -m "combinatorial" --junitxml pytest.xml tests/ludwig/config_sampling - - name: Combinatorial Tests - run: | - pytest -rx --durations 100 -m "combinatorial" --junitxml pytest.xml tests/training_success + # - name: Combinatorial Tests + # run: | + # pytest -rx --durations 100 -m "combinatorial" --junitxml pytest.xml tests/training_success - test-minimal-install: - name: Test Minimal Install - runs-on: ubuntu-latest + # test-minimal-install: + # name: Test Minimal Install + # runs-on: ubuntu-latest - timeout-minutes: 15 - steps: - - uses: actions/checkout@v2 - - name: Set up Python 3.8 - uses: actions/setup-python@v2 - with: - python-version: 3.8 + # timeout-minutes: 15 + # steps: + # - uses: actions/checkout@v2 + # - name: Set up Python 3.8 + # uses: actions/setup-python@v2 + # with: + # python-version: 3.8 - - name: Setup Linux - if: runner.os == 'linux' - run: | - sudo apt-get update && sudo apt-get install -y cmake libsndfile1 + # - name: Setup Linux + # if: runner.os == 'linux' + # run: | + # sudo apt-get update && sudo apt-get install -y cmake libsndfile1 - - name: Setup macOS - if: runner.os == 'macOS' - run: | - brew install libuv + # - name: Setup macOS + # if: runner.os == 'macOS' + # run: | + # brew install libuv - - name: Install dependencies - run: | - python --version - pip --version - python -m pip install -U pip - pip install torch==2.0.0 torchtext - pip install ray==2.3.0 - pip install '.' - pip list - shell: bash - - name: Check Install - run: | - ludwig check_install - shell: bash + # - name: Install dependencies + # run: | + # python --version + # pip --version + # python -m pip install -U pip + # pip install torch==2.0.0 torchtext + # pip install ray==2.3.0 + # pip install '.' + # pip list + # shell: bash + # - name: Check Install + # run: | + # ludwig check_install + # shell: bash - - name: Test Getting Started - run: | - cd examples/getting_started && sh ./run.sh - shell: bash + # - name: Test Getting Started + # run: | + # cd examples/getting_started && sh ./run.sh + # shell: bash + # TODO: ALEX # start-runner: # name: Start self-hosted EC2 runner diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1c6390db514..27fd6a51fde 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -38,15 +38,15 @@ repos: hooks: - id: docformatter args: [--in-place, --wrap-summaries=115, --wrap-descriptions=120] - - repo: https://github.com/PyCQA/isort - rev: 5.12.0 - hooks: - - id: isort - name: Format imports - - repo: https://github.com/pycqa/flake8 - rev: 6.0.0 - hooks: - - id: flake8 + #- repo: https://github.com/PyCQA/isort + # rev: 5.12.0 + # hooks: + # - id: isort + # name: Format imports + #- repo: https://github.com/pycqa/flake8 + # rev: 6.0.0 + # hooks: + # - id: flake8 - repo: https://github.com/psf/black rev: 23.3.0 hooks: diff --git a/setup.cfg b/setup.cfg index 421f3a791ee..1090ea3197b 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,4 +1,4 @@ -[flake8] +#[flake8] max-line-length = 120 exclude = .tox, @@ -7,9 +7,9 @@ exclude = build, temp -select = E,W,F -doctests = True -verbose = 2 +#select = E,W,F +#doctests = True +#verbose = 2 # https://pep8.readthedocs.io/en/latest/intro.html#error-codes format = pylint ignore = diff --git a/tests/conftest.py b/tests/conftest.py index 9dae92e2e65..3b2abe99622 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -44,6 +44,9 @@ "integration_tests_c", "integration_tests_d", "integration_tests_e", + # TODO: ALEX + "integration_tests_x", + # TODO: ALEX } diff --git a/tests/integration_tests/tests_f_control.py b/tests/integration_tests/tests_f_control.py new file mode 100644 index 00000000000..6fb059bd136 --- /dev/null +++ b/tests/integration_tests/tests_f_control.py @@ -0,0 +1,210 @@ +import asyncio +import contextlib +import copy +import logging +import os +import platform +import random +import string +from typing import List, Union +from unittest import mock + +import numpy as np +import pandas as pd +import pytest +import torch +from PIL import Image +from transformers import AutoTokenizer + +import ludwig +from ludwig.api import LudwigModel +from ludwig.backend import initialize_backend +from ludwig.callbacks import Callback +from ludwig.constants import ( + BASE_MODEL, + BATCH_SIZE, + COLUMN, + DECODER, + EPOCHS, + FULL, + INPUT_FEATURES, + MODEL_ECD, + MODEL_LLM, + MODEL_TYPE, + NAME, + OUTPUT_FEATURES, + PREDICTIONS, + PREPROCESSING, + PROC_COLUMN, + PROMPT, + SPLIT, + TRAINER, + TYPE, +) +from ludwig.data.concatenate_datasets import concatenate_df +from ludwig.data.preprocessing import handle_features_with_prompt_config, preprocess_for_prediction +from ludwig.schema.llms.prompt import PromptConfig +from ludwig.schema.model_types.base import ModelConfig +from ludwig.utils.carton_utils import export_carton +from tests.integration_tests.utils import ( + assert_preprocessed_dataset_shape_and_dtype_for_feature, + audio_feature, + binary_feature, + category_feature, + generate_data, + generate_data_as_dataframe, + image_feature, + LocalTestBackend, + number_feature, + sequence_feature, + text_feature, +) + +NUM_EXAMPLES = 20 + +# TODO: ALEX +# pytestmark = pytest.mark.integration_tests_x +# TODO: ALEX + + +# TODO: ALEX +@pytest.mark.integration_tests_x +# TODO: ALEX +@pytest.mark.skipif(platform.system() == "Windows", reason="Carton is not supported on Windows") +def test_carton_torchscript(csv_filename, tmpdir): + data_csv_path = os.path.join(tmpdir, csv_filename) + + # Configure features to be tested: + bin_str_feature = binary_feature() + input_features = [ + bin_str_feature, + # binary_feature(), + number_feature(), + category_feature(encoder={"vocab_size": 3}), + # TODO: future support + # sequence_feature(vocab_size=3), + # text_feature(vocab_size=3), + # vector_feature(), + # image_feature(image_dest_folder), + # audio_feature(audio_dest_folder), + # timeseries_feature(), + # date_feature(), + # h3_feature(), + # set_feature(vocab_size=3), + # bag_feature(vocab_size=3), + ] + output_features = [ + bin_str_feature, + # binary_feature(), + number_feature(), + category_feature(decoder={"vocab_size": 3}, output_feature=True), + # TODO: future support + # sequence_feature(vocab_size=3), + # text_feature(vocab_size=3), + # set_feature(vocab_size=3), + # vector_feature() + ] + backend = LocalTestBackend() + config = { + "input_features": input_features, + "output_features": output_features, + TRAINER: {"epochs": 2, BATCH_SIZE: 128}, + } + + # Generate training data + training_data_csv_path = generate_data(input_features, output_features, data_csv_path) + + # Convert bool values to strings, e.g., {'Yes', 'No'} + df = pd.read_csv(training_data_csv_path) + false_value, true_value = "No", "Yes" + df[bin_str_feature[NAME]] = df[bin_str_feature[NAME]].map(lambda x: true_value if x else false_value) + df.to_csv(training_data_csv_path) + + # Train Ludwig (Pythonic) model: + ludwig_model = LudwigModel(config, backend=backend) + ludwig_model.train( + dataset=training_data_csv_path, + skip_save_training_description=True, + skip_save_training_statistics=True, + skip_save_model=True, + skip_save_progress=True, + skip_save_log=True, + skip_save_processed_input=True, + ) + + # Obtain predictions from Python model + preds_dict, _ = ludwig_model.predict(dataset=training_data_csv_path, return_type=dict) + + # Create graph inference model (Torchscript) from trained Ludwig model. + carton_path = os.path.join(tmpdir, "carton") + export_carton(ludwig_model, carton_path) + + import cartonml as carton + + # Load the carton model + # See https://pyo3.rs/v0.20.0/ecosystem/async-await#a-note-about-asynciorun for why we wrap it + # in another function + async def load(): + return await carton.load(carton_path) + + loop = asyncio.get_event_loop() + carton_model = loop.run_until_complete(load()) + + def to_input(s: pd.Series) -> Union[List[str], torch.Tensor]: + if s.dtype == "object": + return np.array(s.to_list()) + return s.to_numpy().astype(np.float32) + + df = pd.read_csv(training_data_csv_path) + inputs = {name: to_input(df[feature.column]) for name, feature in ludwig_model.model.input_features.items()} + + # See https://pyo3.rs/v0.20.0/ecosystem/async-await#a-note-about-asynciorun for why we wrap it + # in another function + async def infer(inputs): + return await carton_model.infer(inputs) + + outputs = loop.run_until_complete(infer(inputs)) + + # Compare results from Python trained model against Carton + assert len(preds_dict) == len(outputs) + for feature_name, feature_outputs_expected in preds_dict.items(): + assert feature_name in outputs + + output_values_expected = feature_outputs_expected[PREDICTIONS] + output_values = outputs[feature_name] + if output_values.dtype.type in {np.string_, np.str_}: + # Strings should match exactly + assert np.all(output_values == output_values_expected), f"feature: {feature_name}, output: predictions" + else: + assert np.allclose(output_values, output_values_expected), f"feature: {feature_name}, output: predictions" + + +# TODO: ALEX +@pytest.mark.integration_tests_x +# TODO: ALEX +@pytest.mark.parametrize("use_pretrained", [False, True], ids=["false", "true"]) +def test_vit_encoder_different_dimension_image(tmpdir, csv_filename, use_pretrained: bool): + input_features = [ + image_feature( + os.path.join(tmpdir, "generated_output"), + preprocessing={"in_memory": True, "height": 224, "width": 206, "num_channels": 3}, + encoder={TYPE: "_vit_legacy", "use_pretrained": use_pretrained}, + ) + ] + output_features = [category_feature(decoder={"vocab_size": 5}, reduce_input="sum")] + + data_csv = generate_data( + input_features, output_features, os.path.join(tmpdir, csv_filename), num_examples=NUM_EXAMPLES + ) + + config = { + INPUT_FEATURES: input_features, + OUTPUT_FEATURES: output_features, + TRAINER: {"train_steps": 1}, + } + + model = LudwigModel(config) + + # Failure happens post preprocessing but before training during the ECD model creation phase + # so make sure the model can be created properly and training can proceed + model.train(dataset=data_csv) From e62581e8bf4b42f4a11f402e41a7fa5864916ff8 Mon Sep 17 00:00:00 2001 From: Alex Sherstinsky Date: Fri, 8 Mar 2024 12:26:19 -0800 Subject: [PATCH 02/36] Troubleshooting test failures. --- setup.cfg | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/setup.cfg b/setup.cfg index 1090ea3197b..421f3a791ee 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,4 +1,4 @@ -#[flake8] +[flake8] max-line-length = 120 exclude = .tox, @@ -7,9 +7,9 @@ exclude = build, temp -#select = E,W,F -#doctests = True -#verbose = 2 +select = E,W,F +doctests = True +verbose = 2 # https://pep8.readthedocs.io/en/latest/intro.html#error-codes format = pylint ignore = From 9017c3dfdf58bc9daa6bbe92a6e5619ce7f0240d Mon Sep 17 00:00:00 2001 From: Alex Sherstinsky Date: Fri, 8 Mar 2024 12:36:19 -0800 Subject: [PATCH 03/36] Troubleshooting test failures. --- .github/workflows/pytest.yml | 334 +++++++++++++++++------------------ 1 file changed, 167 insertions(+), 167 deletions(-) diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index 33ba9b0a67c..ca9d70f5dd3 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -17,186 +17,186 @@ concurrency: jobs: # TODO: ALEX - # pytest: - # runs-on: ${{ matrix.os }} - # strategy: - # fail-fast: false - # matrix: - # os: [ubuntu-latest] - # python-version: ["3.8", "3.9", "3.10"] - # test-markers: ["not distributed", "distributed"] - # include: - # - python-version: "3.8" - # pytorch-version: 2.0.0 - # torchscript-version: 1.10.2 - # ray-version: 2.3.1 - # - python-version: "3.9" - # pytorch-version: 2.1.1 - # torchscript-version: 1.10.2 - # ray-version: 2.3.1 - # - python-version: "3.10" - # # pytorch-version: nightly - # pytorch-version: 2.2.1 - # torchscript-version: 1.10.2 - # ray-version: 2.3.1 - # env: - # PYTORCH: ${{ matrix.pytorch-version }} - # MARKERS: ${{ matrix.test-markers }} - # NEUROPOD_BASE_DIR: "/usr/local/lib/neuropod" - # NEUROPOD_VERISON: "0.3.0-rc6" - # TORCHSCRIPT_VERSION: ${{ matrix.torchscript-version }} - # RAY_VERSION: ${{ matrix.ray-version }} - # AWS_ACCESS_KEY_ID: ${{ secrets.LUDWIG_TESTS_AWS_ACCESS_KEY_ID }} - # AWS_SECRET_ACCESS_KEY: ${{ secrets.LUDWIG_TESTS_AWS_SECRET_ACCESS_KEY }} - # KAGGLE_USERNAME: ${{ secrets.KAGGLE_USERNAME }} - # KAGGLE_KEY: ${{ secrets.KAGGLE_KEY }} - # IS_NOT_FORK: ${{ !(github.event.pull_request.base.repo.full_name == 'ludwig-ai/ludwig' && github.event.pull_request.head.repo.fork) }} - - # name: py${{ matrix.python-version }}, torch-${{ matrix.pytorch-version }}, ${{ matrix.test-markers }}, ${{ matrix.os }}, ray ${{ matrix.ray-version }} - # services: - # minio: - # image: fclairamb/minio-github-actions - # env: - # MINIO_ACCESS_KEY: minio - # MINIO_SECRET_KEY: minio123 - # ports: - # - 9000:9000 - - # timeout-minutes: 150 - # steps: - # - name: Setup ludwigai/ludwig-ray container for local testing with act. - # if: ${{ env.ACT }} - # run: | - # curl -fsSL https://deb.nodesource.com/setup_16.x | sudo -E bash - - # sudo apt-get install -y nodejs - # sudo mkdir -p /opt/hostedtoolcache/ - # sudo chmod 777 -R /opt/hostedtoolcache/ - # - uses: actions/checkout@v2 - # - name: Set up Python ${{ matrix.python-version }} - # uses: actions/setup-python@v2 - # with: - # python-version: ${{ matrix.python-version }} - - # - name: Setup Linux - # if: runner.os == 'linux' - # run: | - # sudo apt-get update && sudo apt-get install -y cmake libsndfile1 wget libsox-dev - - # - name: Setup macOS - # if: runner.os == 'macOS' - # run: | - # brew install libuv - - # - name: pip cache - # if: ${{ !env.ACT }} - # uses: actions/cache@v2 - # with: - # path: ~/.cache/pip - # key: ${{ runner.os }}-pip-py${{ matrix.python-version }}-torch${{ matrix.pytorch-version }}-${{ matrix.test-markers }}-${{ hashFiles('requirements*.txt', '.github/workflows/pytest.yml') }} - - # - name: Debug out of space - # run: | - # du -h -d 1 ~ - # df -h + pytest: + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest] + python-version: ["3.8", "3.9", "3.10"] + test-markers: ["not distributed", "distributed"] + include: + - python-version: "3.8" + pytorch-version: 2.0.0 + torchscript-version: 1.10.2 + ray-version: 2.3.1 + - python-version: "3.9" + pytorch-version: 2.1.1 + torchscript-version: 1.10.2 + ray-version: 2.3.1 + - python-version: "3.10" + # pytorch-version: nightly + pytorch-version: 2.2.1 + torchscript-version: 1.10.2 + ray-version: 2.3.1 + env: + PYTORCH: ${{ matrix.pytorch-version }} + MARKERS: ${{ matrix.test-markers }} + NEUROPOD_BASE_DIR: "/usr/local/lib/neuropod" + NEUROPOD_VERISON: "0.3.0-rc6" + TORCHSCRIPT_VERSION: ${{ matrix.torchscript-version }} + RAY_VERSION: ${{ matrix.ray-version }} + AWS_ACCESS_KEY_ID: ${{ secrets.LUDWIG_TESTS_AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.LUDWIG_TESTS_AWS_SECRET_ACCESS_KEY }} + KAGGLE_USERNAME: ${{ secrets.KAGGLE_USERNAME }} + KAGGLE_KEY: ${{ secrets.KAGGLE_KEY }} + IS_NOT_FORK: ${{ !(github.event.pull_request.base.repo.full_name == 'ludwig-ai/ludwig' && github.event.pull_request.head.repo.fork) }} - # - name: Install dependencies - # run: | - # python --version - # pip --version - # python -m pip install -U pip - # cmake --version + name: py${{ matrix.python-version }}, torch-${{ matrix.pytorch-version }}, ${{ matrix.test-markers }}, ${{ matrix.os }}, ray ${{ matrix.ray-version }} + services: + minio: + image: fclairamb/minio-github-actions + env: + MINIO_ACCESS_KEY: minio + MINIO_SECRET_KEY: minio123 + ports: + - 9000:9000 - # # remove torch and ray from the dependencies so we can add them depending on the matrix args for the job. - # cat requirements.txt | sed '/^torch[>=<\b]/d' | sed '/^torchtext/d' | sed '/^torchvision/d' | sed '/^torchaudio/d' > requirements-temp && mv requirements-temp requirements.txt - # cat requirements_distributed.txt | sed '/^ray[\[]/d' + timeout-minutes: 150 + steps: + - name: Setup ludwigai/ludwig-ray container for local testing with act. + if: ${{ env.ACT }} + run: | + curl -fsSL https://deb.nodesource.com/setup_16.x | sudo -E bash - + sudo apt-get install -y nodejs + sudo mkdir -p /opt/hostedtoolcache/ + sudo chmod 777 -R /opt/hostedtoolcache/ + - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} - # if [ "$MARKERS" != "distributed" ]; then - # # Skip distributed and hyperopt requirements to test optional imports - # echo > requirements-temp && mv requirements-temp requirements_distributed.txt - # echo > requirements-temp && mv requirements-temp requirements_hyperopt.txt + - name: Setup Linux + if: runner.os == 'linux' + run: | + sudo apt-get update && sudo apt-get install -y cmake libsndfile1 wget libsox-dev - # # Skip distributed tree requirement (lightgbm-ray) - # cat requirements_tree.txt | sed '/^lightgbm-ray/d' > requirements-temp && mv requirements-temp requirements_tree.txt - # else - # if [ "$RAY_VERSION" == "nightly" ]; then - # # NOTE: hardcoded for python 3.10 on Linux - # echo "ray[default,data,serve,tune] @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp310-cp310-manylinux2014_x86_64.whl" >> requirements_distributed.txt - # else - # echo "ray[default,data,serve,tune]==$RAY_VERSION" >> requirements_distributed.txt - # fi - # fi + - name: Setup macOS + if: runner.os == 'macOS' + run: | + brew install libuv - # if [ "$PYTORCH" == "nightly" ]; then - # extra_index_url=https://download.pytorch.org/whl/nightly/cpu - # pip install --pre torch torchtext torchvision torchaudio --extra-index-url $extra_index_url + - name: pip cache + if: ${{ !env.ACT }} + uses: actions/cache@v2 + with: + path: ~/.cache/pip + key: ${{ runner.os }}-pip-py${{ matrix.python-version }}-torch${{ matrix.pytorch-version }}-${{ matrix.test-markers }}-${{ hashFiles('requirements*.txt', '.github/workflows/pytest.yml') }} - # else - # extra_index_url=https://download.pytorch.org/whl/cpu - # pip install torch==$PYTORCH torchtext torchvision torchaudio --extra-index-url $extra_index_url - # fi + - name: Debug out of space + run: | + du -h -d 1 ~ + df -h - # pip install '.[test]' --extra-index-url $extra_index_url - # pip list + - name: Install dependencies + run: | + python --version + pip --version + python -m pip install -U pip + cmake --version - # if [ "$PYTORCH" == "nightly" ]; then - # python -c "from packaging import version; import torch; assert version.parse(torch.__version__).release >= version.parse(\"2.0.0\").release, f\"torch {version.parse(torch.__version__).release} < version.parse(\'2.0.0\').release\"" - # else - # python -c "from packaging import version; import torch; assert version.parse(torch.__version__).release == version.parse(\"$PYTORCH\").release, f\"torch {version.parse(torch.__version__).release} != version.parse(\'$PYTORCH\').release\"" - # fi + # remove torch and ray from the dependencies so we can add them depending on the matrix args for the job. + cat requirements.txt | sed '/^torch[>=<\b]/d' | sed '/^torchtext/d' | sed '/^torchvision/d' | sed '/^torchaudio/d' > requirements-temp && mv requirements-temp requirements.txt + cat requirements_distributed.txt | sed '/^ray[\[]/d' - # if [ "$MARKERS" == "distributed" ]; then - # python -c "from packaging import version; import ray; assert version.parse(ray.__version__).release == version.parse(\"$RAY_VERSION\").release, f\"ray {version.parse(ray.__version__).release} != version.parse(\'$RAY_VERSION\').release\"" - # else - # python -c "import importlib.util; assert importlib.util.find_spec('ray') is None, \"found ray but expected it to not be installed\"" - # fi - # shell: bash + if [ "$MARKERS" != "distributed" ]; then + # Skip distributed and hyperopt requirements to test optional imports + echo > requirements-temp && mv requirements-temp requirements_distributed.txt + echo > requirements-temp && mv requirements-temp requirements_hyperopt.txt + + # Skip distributed tree requirement (lightgbm-ray) + cat requirements_tree.txt | sed '/^lightgbm-ray/d' > requirements-temp && mv requirements-temp requirements_tree.txt + else + if [ "$RAY_VERSION" == "nightly" ]; then + # NOTE: hardcoded for python 3.10 on Linux + echo "ray[default,data,serve,tune] @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp310-cp310-manylinux2014_x86_64.whl" >> requirements_distributed.txt + else + echo "ray[default,data,serve,tune]==$RAY_VERSION" >> requirements_distributed.txt + fi + fi + + if [ "$PYTORCH" == "nightly" ]; then + extra_index_url=https://download.pytorch.org/whl/nightly/cpu + pip install --pre torch torchtext torchvision torchaudio --extra-index-url $extra_index_url + + else + extra_index_url=https://download.pytorch.org/whl/cpu + pip install torch==$PYTORCH torchtext torchvision torchaudio --extra-index-url $extra_index_url + fi + + pip install '.[test]' --extra-index-url $extra_index_url + pip list - # - name: Install Neuropod backend - # run: | - # sudo mkdir -p "$NEUROPOD_BASE_DIR" - # curl -L https://github.com/uber/neuropod/releases/download/v${{ env.NEUROPOD_VERISON }}/libneuropod-cpu-linux-v${{ env.NEUROPOD_VERISON }}-torchscript-${{ env.TORCHSCRIPT_VERSION }}-backend.tar.gz | sudo tar -xz -C "$NEUROPOD_BASE_DIR" - # shell: bash + if [ "$PYTORCH" == "nightly" ]; then + python -c "from packaging import version; import torch; assert version.parse(torch.__version__).release >= version.parse(\"2.0.0\").release, f\"torch {version.parse(torch.__version__).release} < version.parse(\'2.0.0\').release\"" + else + python -c "from packaging import version; import torch; assert version.parse(torch.__version__).release == version.parse(\"$PYTORCH\").release, f\"torch {version.parse(torch.__version__).release} != version.parse(\'$PYTORCH\').release\"" + fi + + if [ "$MARKERS" == "distributed" ]; then + python -c "from packaging import version; import ray; assert version.parse(ray.__version__).release == version.parse(\"$RAY_VERSION\").release, f\"ray {version.parse(ray.__version__).release} != version.parse(\'$RAY_VERSION\').release\"" + else + python -c "import importlib.util; assert importlib.util.find_spec('ray') is None, \"found ray but expected it to not be installed\"" + fi + shell: bash - # - name: Unit Tests - # run: | - # RUN_PRIVATE=$IS_NOT_FORK LUDWIG_TEST_SUITE_TIMEOUT_S=5400 pytest -v --timeout 300 --durations 100 -m "$MARKERS and not slow and not combinatorial and not horovod and not llm" --junitxml pytest.xml tests/ludwig + - name: Install Neuropod backend + run: | + sudo mkdir -p "$NEUROPOD_BASE_DIR" + curl -L https://github.com/uber/neuropod/releases/download/v${{ env.NEUROPOD_VERISON }}/libneuropod-cpu-linux-v${{ env.NEUROPOD_VERISON }}-torchscript-${{ env.TORCHSCRIPT_VERSION }}-backend.tar.gz | sudo tar -xz -C "$NEUROPOD_BASE_DIR" + shell: bash - # - name: Regression Tests - # run: | - # RUN_PRIVATE=$IS_NOT_FORK LUDWIG_TEST_SUITE_TIMEOUT_S=5400 pytest -v --timeout 300 --durations 100 -m "$MARKERS and not slow and not combinatorial and not horovod or benchmark and not llm" --junitxml pytest.xml tests/regression_tests - - # # Skip Horovod and replace with DDP. - # # https://github.com/ludwig-ai/ludwig/issues/3468 - # # - name: Install Horovod if necessary - # # if: matrix.test-markers == 'distributed' && matrix.pytorch-version != 'nightly' - # # env: - # # HOROVOD_WITH_PYTORCH: 1 - # # HOROVOD_WITHOUT_MPI: 1 - # # HOROVOD_WITHOUT_TENSORFLOW: 1 - # # HOROVOD_WITHOUT_MXNET: 1 - # # run: | - # # pip install -r requirements_extra.txt - # # HOROVOD_BUILT=$(python -c "import horovod.torch; horovod.torch.nccl_built(); print('SUCCESS')" || true) - # # if [[ $HOROVOD_BUILT != "SUCCESS" ]]; then - # # pip uninstall -y horovod - # # pip install --no-cache-dir git+https://github.com/horovod/horovod.git@master - # # fi - # # horovodrun --check-build - # # shell: bash - - # # Skip Horovod tests and replace with DDP. - # # https://github.com/ludwig-ai/ludwig/issues/3468 - # # - name: Horovod Tests - # # if: matrix.test-markers == 'distributed' && matrix.pytorch-version != 'nightly' - # # run: | - # # RUN_PRIVATE=$IS_NOT_FORK LUDWIG_TEST_SUITE_TIMEOUT_S=5400 pytest -v --timeout 300 --durations 100 -m "$MARKERS and horovod and not slow and not combinatorial and not llm" --junitxml pytest.xml tests/ + - name: Unit Tests + run: | + RUN_PRIVATE=$IS_NOT_FORK LUDWIG_TEST_SUITE_TIMEOUT_S=5400 pytest -v --timeout 300 --durations 100 -m "$MARKERS and not slow and not combinatorial and not horovod and not llm" --junitxml pytest.xml tests/ludwig - # - name: Upload Unit Test Results - # if: ${{ always() && !env.ACT }} - # uses: actions/upload-artifact@v2 - # with: - # name: Unit Test Results (Python ${{ matrix.python-version }} ${{ matrix.test-markers }}) - # path: pytest.xml + - name: Regression Tests + run: | + RUN_PRIVATE=$IS_NOT_FORK LUDWIG_TEST_SUITE_TIMEOUT_S=5400 pytest -v --timeout 300 --durations 100 -m "$MARKERS and not slow and not combinatorial and not horovod or benchmark and not llm" --junitxml pytest.xml tests/regression_tests + + # Skip Horovod and replace with DDP. + # https://github.com/ludwig-ai/ludwig/issues/3468 + # - name: Install Horovod if necessary + # if: matrix.test-markers == 'distributed' && matrix.pytorch-version != 'nightly' + # env: + # HOROVOD_WITH_PYTORCH: 1 + # HOROVOD_WITHOUT_MPI: 1 + # HOROVOD_WITHOUT_TENSORFLOW: 1 + # HOROVOD_WITHOUT_MXNET: 1 + # run: | + # pip install -r requirements_extra.txt + # HOROVOD_BUILT=$(python -c "import horovod.torch; horovod.torch.nccl_built(); print('SUCCESS')" || true) + # if [[ $HOROVOD_BUILT != "SUCCESS" ]]; then + # pip uninstall -y horovod + # pip install --no-cache-dir git+https://github.com/horovod/horovod.git@master + # fi + # horovodrun --check-build + # shell: bash + + # Skip Horovod tests and replace with DDP. + # https://github.com/ludwig-ai/ludwig/issues/3468 + # - name: Horovod Tests + # if: matrix.test-markers == 'distributed' && matrix.pytorch-version != 'nightly' + # run: | + # RUN_PRIVATE=$IS_NOT_FORK LUDWIG_TEST_SUITE_TIMEOUT_S=5400 pytest -v --timeout 300 --durations 100 -m "$MARKERS and horovod and not slow and not combinatorial and not llm" --junitxml pytest.xml tests/ + + - name: Upload Unit Test Results + if: ${{ always() && !env.ACT }} + uses: actions/upload-artifact@v2 + with: + name: Unit Test Results (Python ${{ matrix.python-version }} ${{ matrix.test-markers }}) + path: pytest.xml # TODO: ALEX integration-tests: From 1c99e27edddb4d0cf11a0361edab8e6ee19cff75 Mon Sep 17 00:00:00 2001 From: Alex Sherstinsky Date: Fri, 8 Mar 2024 12:38:28 -0800 Subject: [PATCH 04/36] Troubleshooting test failures. --- .github/workflows/pytest.yml | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index ca9d70f5dd3..24f882f5cde 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -24,21 +24,30 @@ jobs: matrix: os: [ubuntu-latest] python-version: ["3.8", "3.9", "3.10"] - test-markers: ["not distributed", "distributed"] + # TODO: ALEX + # test-markers: ["not distributed", "distributed"] + # TODO: ALEX + # TODO: ALEX + test-markers: ["not distributed"] + # TODO: ALEX include: - - python-version: "3.8" - pytorch-version: 2.0.0 - torchscript-version: 1.10.2 - ray-version: 2.3.1 - - python-version: "3.9" - pytorch-version: 2.1.1 - torchscript-version: 1.10.2 - ray-version: 2.3.1 + # TODO: ALEX + # - python-version: "3.8" + # pytorch-version: 2.0.0 + # torchscript-version: 1.10.2 + # ray-version: 2.3.1 + # - python-version: "3.9" + # pytorch-version: 2.1.1 + # torchscript-version: 1.10.2 + # ray-version: 2.3.1 + # TODO: ALEX + # TODO: ALEX - python-version: "3.10" # pytorch-version: nightly pytorch-version: 2.2.1 torchscript-version: 1.10.2 ray-version: 2.3.1 + # TODO: ALEX env: PYTORCH: ${{ matrix.pytorch-version }} MARKERS: ${{ matrix.test-markers }} From 9e79c4b44c422e68224559589679c764c38782ea Mon Sep 17 00:00:00 2001 From: Alex Sherstinsky Date: Fri, 8 Mar 2024 12:40:45 -0800 Subject: [PATCH 05/36] Troubleshooting test failures. --- .github/workflows/pytest.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index 24f882f5cde..a6253349590 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -23,7 +23,11 @@ jobs: fail-fast: false matrix: os: [ubuntu-latest] - python-version: ["3.8", "3.9", "3.10"] + # TODO: ALEX + # python-version: ["3.8", "3.9", "3.10"] + # TODO: ALEX + # TODO: ALEX + python-version: [3.10"] # TODO: ALEX # test-markers: ["not distributed", "distributed"] # TODO: ALEX From 56828f259fcaaff3cf2068c53acbfab9df987710 Mon Sep 17 00:00:00 2001 From: Alex Sherstinsky Date: Fri, 8 Mar 2024 12:52:27 -0800 Subject: [PATCH 06/36] Troubleshooting test failures. --- .github/workflows/pytest.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index a6253349590..ef59e8e7079 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -27,7 +27,7 @@ jobs: # python-version: ["3.8", "3.9", "3.10"] # TODO: ALEX # TODO: ALEX - python-version: [3.10"] + python-version: ["3.10"] # TODO: ALEX # test-markers: ["not distributed", "distributed"] # TODO: ALEX From e44afc825927e8af7d4a9d2a3273f5e797b30796 Mon Sep 17 00:00:00 2001 From: Alex Sherstinsky Date: Fri, 8 Mar 2024 13:09:32 -0800 Subject: [PATCH 07/36] Troubleshooting test failures. --- .github/workflows/pytest.yml | 506 ----------------------------------- 1 file changed, 506 deletions(-) diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index ef59e8e7079..af187eb9806 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -16,202 +16,6 @@ concurrency: cancel-in-progress: true jobs: - # TODO: ALEX - pytest: - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - os: [ubuntu-latest] - # TODO: ALEX - # python-version: ["3.8", "3.9", "3.10"] - # TODO: ALEX - # TODO: ALEX - python-version: ["3.10"] - # TODO: ALEX - # test-markers: ["not distributed", "distributed"] - # TODO: ALEX - # TODO: ALEX - test-markers: ["not distributed"] - # TODO: ALEX - include: - # TODO: ALEX - # - python-version: "3.8" - # pytorch-version: 2.0.0 - # torchscript-version: 1.10.2 - # ray-version: 2.3.1 - # - python-version: "3.9" - # pytorch-version: 2.1.1 - # torchscript-version: 1.10.2 - # ray-version: 2.3.1 - # TODO: ALEX - # TODO: ALEX - - python-version: "3.10" - # pytorch-version: nightly - pytorch-version: 2.2.1 - torchscript-version: 1.10.2 - ray-version: 2.3.1 - # TODO: ALEX - env: - PYTORCH: ${{ matrix.pytorch-version }} - MARKERS: ${{ matrix.test-markers }} - NEUROPOD_BASE_DIR: "/usr/local/lib/neuropod" - NEUROPOD_VERISON: "0.3.0-rc6" - TORCHSCRIPT_VERSION: ${{ matrix.torchscript-version }} - RAY_VERSION: ${{ matrix.ray-version }} - AWS_ACCESS_KEY_ID: ${{ secrets.LUDWIG_TESTS_AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.LUDWIG_TESTS_AWS_SECRET_ACCESS_KEY }} - KAGGLE_USERNAME: ${{ secrets.KAGGLE_USERNAME }} - KAGGLE_KEY: ${{ secrets.KAGGLE_KEY }} - IS_NOT_FORK: ${{ !(github.event.pull_request.base.repo.full_name == 'ludwig-ai/ludwig' && github.event.pull_request.head.repo.fork) }} - - name: py${{ matrix.python-version }}, torch-${{ matrix.pytorch-version }}, ${{ matrix.test-markers }}, ${{ matrix.os }}, ray ${{ matrix.ray-version }} - services: - minio: - image: fclairamb/minio-github-actions - env: - MINIO_ACCESS_KEY: minio - MINIO_SECRET_KEY: minio123 - ports: - - 9000:9000 - - timeout-minutes: 150 - steps: - - name: Setup ludwigai/ludwig-ray container for local testing with act. - if: ${{ env.ACT }} - run: | - curl -fsSL https://deb.nodesource.com/setup_16.x | sudo -E bash - - sudo apt-get install -y nodejs - sudo mkdir -p /opt/hostedtoolcache/ - sudo chmod 777 -R /opt/hostedtoolcache/ - - uses: actions/checkout@v2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - - name: Setup Linux - if: runner.os == 'linux' - run: | - sudo apt-get update && sudo apt-get install -y cmake libsndfile1 wget libsox-dev - - - name: Setup macOS - if: runner.os == 'macOS' - run: | - brew install libuv - - - name: pip cache - if: ${{ !env.ACT }} - uses: actions/cache@v2 - with: - path: ~/.cache/pip - key: ${{ runner.os }}-pip-py${{ matrix.python-version }}-torch${{ matrix.pytorch-version }}-${{ matrix.test-markers }}-${{ hashFiles('requirements*.txt', '.github/workflows/pytest.yml') }} - - - name: Debug out of space - run: | - du -h -d 1 ~ - df -h - - - name: Install dependencies - run: | - python --version - pip --version - python -m pip install -U pip - cmake --version - - # remove torch and ray from the dependencies so we can add them depending on the matrix args for the job. - cat requirements.txt | sed '/^torch[>=<\b]/d' | sed '/^torchtext/d' | sed '/^torchvision/d' | sed '/^torchaudio/d' > requirements-temp && mv requirements-temp requirements.txt - cat requirements_distributed.txt | sed '/^ray[\[]/d' - - if [ "$MARKERS" != "distributed" ]; then - # Skip distributed and hyperopt requirements to test optional imports - echo > requirements-temp && mv requirements-temp requirements_distributed.txt - echo > requirements-temp && mv requirements-temp requirements_hyperopt.txt - - # Skip distributed tree requirement (lightgbm-ray) - cat requirements_tree.txt | sed '/^lightgbm-ray/d' > requirements-temp && mv requirements-temp requirements_tree.txt - else - if [ "$RAY_VERSION" == "nightly" ]; then - # NOTE: hardcoded for python 3.10 on Linux - echo "ray[default,data,serve,tune] @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp310-cp310-manylinux2014_x86_64.whl" >> requirements_distributed.txt - else - echo "ray[default,data,serve,tune]==$RAY_VERSION" >> requirements_distributed.txt - fi - fi - - if [ "$PYTORCH" == "nightly" ]; then - extra_index_url=https://download.pytorch.org/whl/nightly/cpu - pip install --pre torch torchtext torchvision torchaudio --extra-index-url $extra_index_url - - else - extra_index_url=https://download.pytorch.org/whl/cpu - pip install torch==$PYTORCH torchtext torchvision torchaudio --extra-index-url $extra_index_url - fi - - pip install '.[test]' --extra-index-url $extra_index_url - pip list - - if [ "$PYTORCH" == "nightly" ]; then - python -c "from packaging import version; import torch; assert version.parse(torch.__version__).release >= version.parse(\"2.0.0\").release, f\"torch {version.parse(torch.__version__).release} < version.parse(\'2.0.0\').release\"" - else - python -c "from packaging import version; import torch; assert version.parse(torch.__version__).release == version.parse(\"$PYTORCH\").release, f\"torch {version.parse(torch.__version__).release} != version.parse(\'$PYTORCH\').release\"" - fi - - if [ "$MARKERS" == "distributed" ]; then - python -c "from packaging import version; import ray; assert version.parse(ray.__version__).release == version.parse(\"$RAY_VERSION\").release, f\"ray {version.parse(ray.__version__).release} != version.parse(\'$RAY_VERSION\').release\"" - else - python -c "import importlib.util; assert importlib.util.find_spec('ray') is None, \"found ray but expected it to not be installed\"" - fi - shell: bash - - - name: Install Neuropod backend - run: | - sudo mkdir -p "$NEUROPOD_BASE_DIR" - curl -L https://github.com/uber/neuropod/releases/download/v${{ env.NEUROPOD_VERISON }}/libneuropod-cpu-linux-v${{ env.NEUROPOD_VERISON }}-torchscript-${{ env.TORCHSCRIPT_VERSION }}-backend.tar.gz | sudo tar -xz -C "$NEUROPOD_BASE_DIR" - shell: bash - - - name: Unit Tests - run: | - RUN_PRIVATE=$IS_NOT_FORK LUDWIG_TEST_SUITE_TIMEOUT_S=5400 pytest -v --timeout 300 --durations 100 -m "$MARKERS and not slow and not combinatorial and not horovod and not llm" --junitxml pytest.xml tests/ludwig - - - name: Regression Tests - run: | - RUN_PRIVATE=$IS_NOT_FORK LUDWIG_TEST_SUITE_TIMEOUT_S=5400 pytest -v --timeout 300 --durations 100 -m "$MARKERS and not slow and not combinatorial and not horovod or benchmark and not llm" --junitxml pytest.xml tests/regression_tests - - # Skip Horovod and replace with DDP. - # https://github.com/ludwig-ai/ludwig/issues/3468 - # - name: Install Horovod if necessary - # if: matrix.test-markers == 'distributed' && matrix.pytorch-version != 'nightly' - # env: - # HOROVOD_WITH_PYTORCH: 1 - # HOROVOD_WITHOUT_MPI: 1 - # HOROVOD_WITHOUT_TENSORFLOW: 1 - # HOROVOD_WITHOUT_MXNET: 1 - # run: | - # pip install -r requirements_extra.txt - # HOROVOD_BUILT=$(python -c "import horovod.torch; horovod.torch.nccl_built(); print('SUCCESS')" || true) - # if [[ $HOROVOD_BUILT != "SUCCESS" ]]; then - # pip uninstall -y horovod - # pip install --no-cache-dir git+https://github.com/horovod/horovod.git@master - # fi - # horovodrun --check-build - # shell: bash - - # Skip Horovod tests and replace with DDP. - # https://github.com/ludwig-ai/ludwig/issues/3468 - # - name: Horovod Tests - # if: matrix.test-markers == 'distributed' && matrix.pytorch-version != 'nightly' - # run: | - # RUN_PRIVATE=$IS_NOT_FORK LUDWIG_TEST_SUITE_TIMEOUT_S=5400 pytest -v --timeout 300 --durations 100 -m "$MARKERS and horovod and not slow and not combinatorial and not llm" --junitxml pytest.xml tests/ - - - name: Upload Unit Test Results - if: ${{ always() && !env.ACT }} - uses: actions/upload-artifact@v2 - with: - name: Unit Test Results (Python ${{ matrix.python-version }} ${{ matrix.test-markers }}) - path: pytest.xml - # TODO: ALEX - integration-tests: name: ${{ matrix.test-markers }} runs-on: ubuntu-latest @@ -219,17 +23,7 @@ jobs: fail-fast: false matrix: test-markers: - # TODO: ALEX - # - "integration_tests_a" - # - "integration_tests_b" - # - "integration_tests_c" - # - "integration_tests_d" - # - "integration_tests_e" - # - "integration_tests_f" - # TODO: ALEX - # TODO: ALEX - "integration_tests_x" - # TODO: ALEX env: AWS_ACCESS_KEY_ID: ${{ secrets.LUDWIG_TESTS_AWS_ACCESS_KEY_ID }} @@ -285,280 +79,6 @@ jobs: run: | RUN_PRIVATE=$IS_NOT_FORK LUDWIG_TEST_SUITE_TIMEOUT_S=7200 pytest -v --timeout 300 --durations 100 -m "not slow and not combinatorial and not horovod and not llm and $MARKERS" --junitxml pytest.xml tests/integration_tests - # TODO: ALEX - # llm-tests: - # name: LLM Tests - # runs-on: ubuntu-latest - - # timeout-minutes: 60 - # steps: - # - uses: actions/checkout@v2 - # - name: Set up Python 3.9 - # uses: actions/setup-python@v2 - # with: - # python-version: 3.9 - - # - name: Setup Linux - # if: runner.os == 'linux' - # run: | - # sudo apt-get update && sudo apt-get install -y cmake libsndfile1 - - # - name: Setup macOS - # if: runner.os == 'macOS' - # run: | - # brew install libuv - - # - name: Install dependencies - # run: | - # python --version - # pip --version - # python -m pip install -U pip - - # # remove torch and ray from the dependencies so we can add them depending on the matrix args for the job. - # cat requirements.txt | sed '/^torch[>=<\b]/d' | sed '/^torchtext/d' | sed '/^torchvision/d' | sed '/^torchaudio/d' > requirements-temp && mv requirements-temp requirements.txt - # cat requirements_distributed.txt | sed '/^ray[\[]/d' - # pip install torch==2.0.0 torchtext torchvision torchaudio - # pip install ray==2.3.0 - # pip install '.[test]' - # pip list - # shell: bash - - # - name: LLM Tests - # run: | - # pytest -vs --durations 100 -m "llm" --junitxml pytest.xml tests - - # combinatorial-tests: - # name: Combinatorial Tests - # runs-on: ubuntu-latest - - # timeout-minutes: 60 - # steps: - # - uses: actions/checkout@v2 - # - name: Set up Python 3.8 - # uses: actions/setup-python@v2 - # with: - # python-version: 3.8 - - # - name: Setup Linux - # if: runner.os == 'linux' - # run: | - # sudo apt-get update && sudo apt-get install -y cmake libsndfile1 - - # - name: Setup macOS - # if: runner.os == 'macOS' - # run: | - # brew install libuv - - # - name: Install dependencies - # run: | - # python --version - # pip --version - # python -m pip install -U pip - # pip install '.[test]' - # pip list - # shell: bash - - # - name: Testing combinatorial config generation code - # run: | - # pytest -vs --durations 100 -m "combinatorial" --junitxml pytest.xml tests/ludwig/config_sampling - - # - name: Combinatorial Tests - # run: | - # pytest -rx --durations 100 -m "combinatorial" --junitxml pytest.xml tests/training_success - - # test-minimal-install: - # name: Test Minimal Install - # runs-on: ubuntu-latest - - # timeout-minutes: 15 - # steps: - # - uses: actions/checkout@v2 - # - name: Set up Python 3.8 - # uses: actions/setup-python@v2 - # with: - # python-version: 3.8 - - # - name: Setup Linux - # if: runner.os == 'linux' - # run: | - # sudo apt-get update && sudo apt-get install -y cmake libsndfile1 - - # - name: Setup macOS - # if: runner.os == 'macOS' - # run: | - # brew install libuv - - # - name: Install dependencies - # run: | - # python --version - # pip --version - # python -m pip install -U pip - # pip install torch==2.0.0 torchtext - # pip install ray==2.3.0 - # pip install '.' - # pip list - # shell: bash - # - name: Check Install - # run: | - # ludwig check_install - # shell: bash - - # - name: Test Getting Started - # run: | - # cd examples/getting_started && sh ./run.sh - # shell: bash - # TODO: ALEX - - # start-runner: - # name: Start self-hosted EC2 runner - # if: > - # always() && needs.pytest.result != 'failure' && ( - # github.event_name == 'schedule' && github.repository == 'ludwig-ai/ludwig' || - # github.event_name == 'push' && github.repository == 'ludwig-ai/ludwig' || - # github.event_name == 'pull_request' && github.event.pull_request.base.repo.full_name == 'ludwig-ai/ludwig' && !github.event.pull_request.head.repo.fork) - # needs: pytest - # runs-on: ubuntu-latest - # outputs: - # label: ${{ steps.start-ec2-runner.outputs.label }} - # ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }} - - # steps: - # - name: Configure AWS credentials - # uses: aws-actions/configure-aws-credentials@v1 - # with: - # aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} - # aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - # aws-region: ${{ secrets.AWS_REGION }} - - # - name: Start EC2 runner - # id: start-ec2-runner - # uses: machulav/ec2-github-runner@v2.3.2 - # with: - # mode: start - # github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} - # ec2-image-id: ami-0759580dedc953d1f - # ec2-instance-type: g4dn.xlarge - # subnet-id: subnet-0983be43 - # security-group-id: sg-4cba0d08 - # aws-resource-tags: > - # [ - # {"Key": "Name", "Value": "ludwig-github-${{ github.head_ref || github.sha }}"}, - # {"Key": "GitHubRepository", "Value": "${{ github.repository }}"}, - # {"Key": "GitHubHeadRef", "Value": "${{ github.head_ref }}"}, - # {"Key": "GitHubSHA", "Value": "${{ github.sha }}"} - # ] - - # pytest-gpu: - # if: needs.start-runner.result != 'skipped' - # needs: start-runner # required to start the main job when the runner is ready - # runs-on: ${{ needs.start-runner.outputs.label }} # run the job on the newly created runners - # strategy: - # fail-fast: false - # matrix: - # python-version: [3.7] - # include: - # - python-version: 3.7 - # pytorch-version: 1.10.0 - # torchscript-version: 1.10.2 - # env: - # PYTORCH: ${{ matrix.pytorch-version }} - # NEUROPOD_BASE_DIR: "/usr/local/lib/neuropod" - # NEUROPOD_VERISON: "0.3.0-rc6" - # TORCHSCRIPT_VERSION: ${{ matrix.torchscript-version }} - - # name: py${{ matrix.python-version }}, torch-${{ matrix.pytorch-version }}, gpu - - # timeout-minutes: 70 - # steps: - # - uses: actions/checkout@v2 - # - name: Set up Python ${{ matrix.python-version }} - # uses: actions/setup-python@v2 - # with: - # python-version: ${{ matrix.python-version }} - - # - name: Setup Linux - # if: runner.os == 'linux' - # run: | - # sudo apt-get update && sudo apt-get install -y libsndfile1 cmake ccache build-essential g++-8 gcc-8 - # cmake --version - - # - name: Install CUDA drivers - # run: | - # wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004.pin - # sudo mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600 - # wget https://developer.download.nvidia.com/compute/cuda/11.5.1/local_installers/cuda-repo-ubuntu2004-11-5-local_11.5.1-495.29.05-1_amd64.deb - # sudo dpkg -i cuda-repo-ubuntu2004-11-5-local_11.5.1-495.29.05-1_amd64.deb - # sudo apt-key add /var/cuda-repo-ubuntu2004-11-5-local/7fa2af80.pub - # sudo apt-get update - # sudo apt-get -y install cuda - # shell: bash - - # - name: pip cache - # uses: actions/cache@v2 - # with: - # path: ~/.cache/pip - # key: ${{ runner.os }}-pip-py${{ matrix.python-version }}-torch${{ matrix.pytorch-version }}-${{ hashFiles('requirements*.txt') }} - # restore-keys: | - # ${{ runner.os }}-pip-py${{ matrix.python-version }}-torch${{ matrix.pytorch-version }}- - - # - name: Install dependencies - # env: - # HOROVOD_WITH_PYTORCH: 1 - # HOROVOD_WITHOUT_MPI: 1 - # HOROVOD_WITHOUT_TENSORFLOW: 1 - # HOROVOD_WITHOUT_MXNET: 1 - # run: | - # python --version - # pip --version - # python -m pip install -U pip - # if [ $PYTORCH == "nightly" ]; then - # cat requirements.txt | sed '/^torch[>=<]/d' > requirements-temp && mv requirements-temp requirements.txt - # pip install --pre torch torchvision -f https://download.pytorch.org/whl/torch_stable.html - # else - # pip install torch==${PYTORCH}+cu111 -f https://download.pytorch.org/whl/torch_stable.html - # fi - # # pip install --no-cache-dir git+https://github.com/horovod/horovod.git@master - # pip install dulwich==0.20.26 # workaround for `/usr/bin/ld: cannot find -lpython3.7m` - # pip install '.[test]' - # pip list - # shell: bash - - # - name: Install Neuropod backend - # run: | - # sudo mkdir -p "$NEUROPOD_BASE_DIR" - # curl -L https://github.com/uber/neuropod/releases/download/v${{ env.NEUROPOD_VERISON }}/libneuropod-cpu-linux-v${{ env.NEUROPOD_VERISON }}-torchscript-${{ env.TORCHSCRIPT_VERSION }}-backend.tar.gz | sudo tar -xz -C "$NEUROPOD_BASE_DIR" - # shell: bash - - # - name: Reinstall Horovod if necessary - # env: - # HOROVOD_WITH_PYTORCH: 1 - # HOROVOD_WITHOUT_MPI: 1 - # HOROVOD_WITHOUT_TENSORFLOW: 1 - # HOROVOD_WITHOUT_MXNET: 1 - # run: | - # HOROVOD_BUILT=$(python -c "import horovod.torch; horovod.torch.nccl_built(); print('SUCCESS')" || true) - # if [[ $HOROVOD_BUILT != "SUCCESS" ]]; then - # pip uninstall -y horovod - # pip install --no-cache-dir git+https://github.com/horovod/horovod.git@master - # fi - # horovodrun --check-build - # shell: bash - - # - name: Check CUDA is available - # run: | - # python -c "import torch; assert torch.cuda.is_available()" - - # - name: Tests - # run: | - # pytest -v --timeout 300 --durations 10 --junitxml pytest.xml tests - - # - name: Upload Unit Test Results - # if: always() - # uses: actions/upload-artifact@v2 - # with: - # name: Unit Test Results (Python ${{ matrix.python-version }} gpu - # path: pytest.xml - event_file: name: "Event File" runs-on: ubuntu-latest @@ -570,29 +90,3 @@ jobs: with: name: Event File path: ${{ github.event_path }} - - # stop-runner: - # name: Stop self-hosted EC2 runner - - # # required to stop the runner even if the error happened in the previous job - # if: always() && needs.start-runner.result != 'skipped' - # needs: - # - start-runner # required to get output from the start-runner job - # - pytest-gpu # required to wait when the main job is done - # runs-on: ubuntu-latest - - # steps: - # - name: Configure AWS credentials - # uses: aws-actions/configure-aws-credentials@v1 - # with: - # aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} - # aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - # aws-region: ${{ secrets.AWS_REGION }} - - # - name: Stop EC2 runner - # uses: machulav/ec2-github-runner@v2.3.1 - # with: - # mode: stop - # github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} - # label: ${{ needs.start-runner.outputs.label }} - # ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }} From 4f1c50b02f2276406203102f1c9e17e5339a0500 Mon Sep 17 00:00:00 2001 From: Alex Sherstinsky Date: Fri, 8 Mar 2024 13:29:29 -0800 Subject: [PATCH 08/36] Troubleshooting test failures. --- pytest.ini | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/pytest.ini b/pytest.ini index 539a53b1674..61478d9705e 100644 --- a/pytest.ini +++ b/pytest.ini @@ -7,11 +7,6 @@ markers = combinatorial: mark a test as combinatorial. horovod: mark a test as a Horovod test. llm: mark a test as an LLM test. - integration_tests_a: mark a test to be run as part of integration tests, group A. - integration_tests_b: mark a test to be run as part of integration tests, group B. - integration_tests_c: mark a test to be run as part of integration tests, group C. - integration_tests_d: mark a test to be run as part of integration tests, group D. - integration_tests_e: mark a test to be run as part of integration tests, group E. - integration_tests_f: mark a test to be run as part of integration tests, group F. + integration_tests_x: mark a test to be run as part of integration tests, group X. filterwarnings = ignore::DeprecationWarning From 5848459515af9f6bc612afe5145d77aba9d7885c Mon Sep 17 00:00:00 2001 From: Alex Sherstinsky Date: Fri, 8 Mar 2024 20:56:46 -0800 Subject: [PATCH 09/36] Troubleshooting test failures. --- pytest.ini | 7 ------- 1 file changed, 7 deletions(-) diff --git a/pytest.ini b/pytest.ini index 61478d9705e..a82421e060c 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,12 +1,5 @@ [pytest] markers = - benchmark: mark a test as a benchmarking test. - distributed: mark a test as a distributed test. - filesystem: mark to test operating system systems. - slow: mark test as slow. - combinatorial: mark a test as combinatorial. - horovod: mark a test as a Horovod test. - llm: mark a test as an LLM test. integration_tests_x: mark a test to be run as part of integration tests, group X. filterwarnings = ignore::DeprecationWarning From 2aefef1671f3d3a8e797f03841bee3ff731dbc26 Mon Sep 17 00:00:00 2001 From: Alex Sherstinsky Date: Fri, 8 Mar 2024 22:39:28 -0800 Subject: [PATCH 10/36] Troubleshooting test failures. --- tests/integration_tests/{tests_f_control.py => test_f_control.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/integration_tests/{tests_f_control.py => test_f_control.py} (100%) diff --git a/tests/integration_tests/tests_f_control.py b/tests/integration_tests/test_f_control.py similarity index 100% rename from tests/integration_tests/tests_f_control.py rename to tests/integration_tests/test_f_control.py From f0608a1e8cca108c95505890ce1a63771540ee96 Mon Sep 17 00:00:00 2001 From: Alex Sherstinsky Date: Sat, 9 Mar 2024 09:36:24 -0800 Subject: [PATCH 11/36] Troubleshooting test failures. --- ludwig/utils/carton_utils.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/ludwig/utils/carton_utils.py b/ludwig/utils/carton_utils.py index f19ad84dae2..b969f949ddc 100644 --- a/ludwig/utils/carton_utils.py +++ b/ludwig/utils/carton_utils.py @@ -114,6 +114,7 @@ def export_carton(model: LudwigModel, carton_path: str, carton_model_name="ludwi # carton.pack is an async function so we run it and wait until it's complete # See https://pyo3.rs/v0.20.0/ecosystem/async-await#a-note-about-asynciorun for why we wrap it # in another function + # TODO: ALEX async def pack(): return await carton.pack( input_model_path, @@ -126,8 +127,26 @@ async def pack(): outputs=_get_output_spec(model), ) + # TODO: ALEX + loop = asyncio.get_event_loop() - tmp_out_path = loop.run_until_complete(pack()) + # TODO: ALEX + # tmp_out_path = loop.run_until_complete(pack()) + # TODO: ALEX + # TODO: ALEX + import sys + import traceback + + try: + tmp_out_path = loop.run_until_complete(pack()) + except Exception as e: + exception_message: str = "A Sub-Process call Exception occurred.\n" + exception_traceback: str = traceback.format_exc() + exception_message += f'{type(e).__name__}: "{str(e)}". Traceback: "{exception_traceback}".' + sys.stderr.write(exception_message) + sys.stderr.flush() + raise SystemExit(exception_message) from e # Make sure error is fatal. + # TODO: ALEX # Move it to the output path shutil.move(tmp_out_path, carton_path) From 5a61f973ec25fb1ef1e8f471e206a0c34e21199e Mon Sep 17 00:00:00 2001 From: Alex Sherstinsky Date: Sat, 9 Mar 2024 11:34:50 -0800 Subject: [PATCH 12/36] Troubleshooting test failures. --- ludwig/utils/carton_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ludwig/utils/carton_utils.py b/ludwig/utils/carton_utils.py index b969f949ddc..18af6dd40be 100644 --- a/ludwig/utils/carton_utils.py +++ b/ludwig/utils/carton_utils.py @@ -140,7 +140,7 @@ async def pack(): try: tmp_out_path = loop.run_until_complete(pack()) except Exception as e: - exception_message: str = "A Sub-Process call Exception occurred.\n" + exception_message: str = "A general Exception occurred.\n" exception_traceback: str = traceback.format_exc() exception_message += f'{type(e).__name__}: "{str(e)}". Traceback: "{exception_traceback}".' sys.stderr.write(exception_message) From f11edff49c4411f925496f0c433194771c355fd7 Mon Sep 17 00:00:00 2001 From: Alex Sherstinsky Date: Sat, 9 Mar 2024 11:37:26 -0800 Subject: [PATCH 13/36] Troubleshooting test failures. --- .github/workflows/pytest.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index af187eb9806..570271f950e 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -45,10 +45,10 @@ jobs: timeout-minutes: 90 steps: - uses: actions/checkout@v2 - - name: Set up Python 3.9 + - name: Set up Python 3.10 uses: actions/setup-python@v2 with: - python-version: 3.9 + python-version: 3.10 - name: Setup Linux if: runner.os == 'linux' From b385ceb5ba975bc3ce1817a72a47d6f12246f146 Mon Sep 17 00:00:00 2001 From: Alex Sherstinsky Date: Sat, 9 Mar 2024 11:48:58 -0800 Subject: [PATCH 14/36] Troubleshooting test failures. --- .github/workflows/pytest.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index 570271f950e..3397cb29dd0 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -48,7 +48,7 @@ jobs: - name: Set up Python 3.10 uses: actions/setup-python@v2 with: - python-version: 3.10 + python-version: "3.10" - name: Setup Linux if: runner.os == 'linux' From c28ac8cc7f5af1f08533b324fddc28391fedaad1 Mon Sep 17 00:00:00 2001 From: Alex Sherstinsky Date: Sat, 9 Mar 2024 15:13:15 -0800 Subject: [PATCH 15/36] Troubleshooting test failures. --- ludwig/utils/carton_utils.py | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/ludwig/utils/carton_utils.py b/ludwig/utils/carton_utils.py index 18af6dd40be..7f4b1852251 100644 --- a/ludwig/utils/carton_utils.py +++ b/ludwig/utils/carton_utils.py @@ -115,7 +115,21 @@ def export_carton(model: LudwigModel, carton_path: str, carton_model_name="ludwi # See https://pyo3.rs/v0.20.0/ecosystem/async-await#a-note-about-asynciorun for why we wrap it # in another function # TODO: ALEX - async def pack(): + # async def pack(): + # return await carton.pack( + # input_model_path, + # runner_name="torchscript", + # # Any 2.x.x version is okay + # # TODO: improve this + # required_framework_version="=2", + # model_name=carton_model_name, + # inputs=_get_input_spec(model), + # outputs=_get_output_spec(model), + # ) + + # TODO: ALEX + # TODO: ALEX + async def packster(): return await carton.pack( input_model_path, runner_name="torchscript", @@ -129,16 +143,23 @@ async def pack(): # TODO: ALEX - loop = asyncio.get_event_loop() # TODO: ALEX + loop = asyncio.get_event_loop() # tmp_out_path = loop.run_until_complete(pack()) # TODO: ALEX # TODO: ALEX + import time import sys import traceback try: - tmp_out_path = loop.run_until_complete(pack()) + # TODO: ALEX + # tmp_out_path = loop.run_until_complete(pack()) + # TODO: ALEX + # TODO: ALEX + time.sleep(1) + tmp_out_path = loop.run_until_complete(packster()) + # TODO: ALEX except Exception as e: exception_message: str = "A general Exception occurred.\n" exception_traceback: str = traceback.format_exc() @@ -149,4 +170,5 @@ async def pack(): # TODO: ALEX # Move it to the output path + time.sleep(1) shutil.move(tmp_out_path, carton_path) From 3b0e6c1c58ae7fabc5401469a1513e6f3848a4b5 Mon Sep 17 00:00:00 2001 From: Alex Sherstinsky Date: Sat, 9 Mar 2024 15:18:20 -0800 Subject: [PATCH 16/36] Troubleshooting test failures. --- ludwig/utils/carton_utils.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/ludwig/utils/carton_utils.py b/ludwig/utils/carton_utils.py index 7f4b1852251..e5bc928aa01 100644 --- a/ludwig/utils/carton_utils.py +++ b/ludwig/utils/carton_utils.py @@ -105,10 +105,11 @@ def export_carton(model: LudwigModel, carton_path: str, carton_model_name="ludwi # Generate a torchscript model model_ts = generate_carton_torchscript(model) + print(f"\n[ALEX_TEST] [WOUTPUT] MODEL_TORCH_SCRIPT:\n{model_ts} ; TYPE: {str(type(model_ts))}") with tempfile.TemporaryDirectory() as tmpdir: # Save the model to a temp dir - input_model_path = os.path.join(tmpdir, "model.pt") + input_model_path: str = os.path.join(tmpdir, "model.pt") torch.jit.save(model_ts, input_model_path) # carton.pack is an async function so we run it and wait until it's complete @@ -129,8 +130,9 @@ def export_carton(model: LudwigModel, carton_path: str, carton_model_name="ludwi # TODO: ALEX # TODO: ALEX - async def packster(): - return await carton.pack( + async def packster() -> str: + time.sleep(1) + a: str = await carton.pack( input_model_path, runner_name="torchscript", # Any 2.x.x version is okay @@ -140,11 +142,16 @@ async def packster(): inputs=_get_input_spec(model), outputs=_get_output_spec(model), ) + time.sleep(1) + print(f"\n[ALEX_TEST] [WOUTPUT] WOUTPUT:\n{a} ; TYPE: {str(type(a))}") + time.sleep(1) + return a # TODO: ALEX # TODO: ALEX loop = asyncio.get_event_loop() + print(f"\n[ALEX_TEST] [WOUTPUT] LOOP:\n{loop} ; TYPE: {str(type(loop))}") # tmp_out_path = loop.run_until_complete(pack()) # TODO: ALEX # TODO: ALEX @@ -158,7 +165,7 @@ async def packster(): # TODO: ALEX # TODO: ALEX time.sleep(1) - tmp_out_path = loop.run_until_complete(packster()) + tmp_out_path: str = loop.run_until_complete(packster()) # TODO: ALEX except Exception as e: exception_message: str = "A general Exception occurred.\n" From c6ea057951011bf816da9e0f0b63fedde722a260 Mon Sep 17 00:00:00 2001 From: Alex Sherstinsky Date: Sat, 9 Mar 2024 15:29:48 -0800 Subject: [PATCH 17/36] Troubleshooting test failures. --- ludwig/utils/carton_utils.py | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/ludwig/utils/carton_utils.py b/ludwig/utils/carton_utils.py index e5bc928aa01..4d1735d47fc 100644 --- a/ludwig/utils/carton_utils.py +++ b/ludwig/utils/carton_utils.py @@ -132,20 +132,28 @@ def export_carton(model: LudwigModel, carton_path: str, carton_model_name="ludwi # TODO: ALEX async def packster() -> str: time.sleep(1) - a: str = await carton.pack( - input_model_path, - runner_name="torchscript", - # Any 2.x.x version is okay - # TODO: improve this - required_framework_version="=2", - model_name=carton_model_name, - inputs=_get_input_spec(model), - outputs=_get_output_spec(model), - ) - time.sleep(1) - print(f"\n[ALEX_TEST] [WOUTPUT] WOUTPUT:\n{a} ; TYPE: {str(type(a))}") - time.sleep(1) - return a + try: + a: str = await carton.pack( + input_model_path, + runner_name="torchscript", + # Any 2.x.x version is okay + # TODO: improve this + required_framework_version="=2", + model_name=carton_model_name, + inputs=_get_input_spec(model), + outputs=_get_output_spec(model), + ) + time.sleep(1) + print(f"\n[ALEX_TEST] [WOUTPUT] WOUTPUT:\n{a} ; TYPE: {str(type(a))}") + time.sleep(1) + return a + except Exception as ie: + exception_message: str = "A Packster-Inside Exception occurred.\n" + exception_traceback: str = traceback.format_exc() + exception_message += f'{type(ie).__name__}: "{str(ie)}". Traceback: "{exception_traceback}".' + sys.stderr.write(exception_message) + sys.stderr.flush() + raise ValueError(exception_message) from ie # TODO: ALEX From 61e778ae2dc1026f56baf0533ce677e39668601d Mon Sep 17 00:00:00 2001 From: Alex Sherstinsky Date: Sat, 9 Mar 2024 15:43:30 -0800 Subject: [PATCH 18/36] Troubleshooting test failures. --- ludwig/utils/carton_utils.py | 73 ++++++++++++++++------- tests/integration_tests/test_f_control.py | 56 ++++++++--------- 2 files changed, 80 insertions(+), 49 deletions(-) diff --git a/ludwig/utils/carton_utils.py b/ludwig/utils/carton_utils.py index 4d1735d47fc..17b1f2b3cbc 100644 --- a/ludwig/utils/carton_utils.py +++ b/ludwig/utils/carton_utils.py @@ -132,28 +132,57 @@ def export_carton(model: LudwigModel, carton_path: str, carton_model_name="ludwi # TODO: ALEX async def packster() -> str: time.sleep(1) - try: - a: str = await carton.pack( - input_model_path, - runner_name="torchscript", - # Any 2.x.x version is okay - # TODO: improve this - required_framework_version="=2", - model_name=carton_model_name, - inputs=_get_input_spec(model), - outputs=_get_output_spec(model), - ) - time.sleep(1) - print(f"\n[ALEX_TEST] [WOUTPUT] WOUTPUT:\n{a} ; TYPE: {str(type(a))}") - time.sleep(1) - return a - except Exception as ie: - exception_message: str = "A Packster-Inside Exception occurred.\n" - exception_traceback: str = traceback.format_exc() - exception_message += f'{type(ie).__name__}: "{str(ie)}". Traceback: "{exception_traceback}".' - sys.stderr.write(exception_message) - sys.stderr.flush() - raise ValueError(exception_message) from ie + # TODO: ALEX + # try: + # a: str = await carton.pack( + # input_model_path, + # runner_name="torchscript", + # # Any 2.x.x version is okay + # # TODO: improve this + # required_framework_version="=2", + # model_name=carton_model_name, + # inputs=_get_input_spec(model), + # outputs=_get_output_spec(model), + # ) + # time.sleep(1) + # print(f"\n[ALEX_TEST] [WOUTPUT] WOUTPUT:\n{a} ; TYPE: {str(type(a))}") + # time.sleep(1) + # return a + # except Exception as ie: + # exception_message: str = "A Packster-Inside Exception occurred.\n" + # exception_traceback: str = traceback.format_exc() + # exception_message += f'{type(ie).__name__}: "{str(ie)}". Traceback: "{exception_traceback}".' + # sys.stderr.write(exception_message) + # sys.stderr.flush() + # raise ValueError(exception_message) from ie + # TODO: ALEX + # TODO: ALEX + idx: int + for idx in range(5): + print(f"\n[ALEX_TEST] [WOUTPUT] TRYING_IDX:\n{idx} ; TYPE: {str(type(idx))}") + try: + a: str = await carton.pack( + input_model_path, + runner_name="torchscript", + # Any 2.x.x version is okay + # TODO: improve this + required_framework_version="=2", + model_name=carton_model_name, + inputs=_get_input_spec(model), + outputs=_get_output_spec(model), + ) + time.sleep(1) + print(f"\n[ALEX_TEST] [WOUTPUT] WOUTPUT:\n{a} ; TYPE: {str(type(a))}") + time.sleep(1) + return a + except Exception as ie: + exception_message: str = "A Packster-Inside Exception occurred.\n" + exception_traceback: str = traceback.format_exc() + exception_message += f'{type(ie).__name__}: "{str(ie)}". Traceback: "{exception_traceback}".' + sys.stderr.write(exception_message) + sys.stderr.flush() + raise ValueError(exception_message) from ie + # TODO: ALEX # TODO: ALEX diff --git a/tests/integration_tests/test_f_control.py b/tests/integration_tests/test_f_control.py index 6fb059bd136..8724732fc8f 100644 --- a/tests/integration_tests/test_f_control.py +++ b/tests/integration_tests/test_f_control.py @@ -180,31 +180,33 @@ async def infer(inputs): # TODO: ALEX -@pytest.mark.integration_tests_x +# # TODO: ALEX +# @pytest.mark.integration_tests_x +# # TODO: ALEX +# @pytest.mark.parametrize("use_pretrained", [False, True], ids=["false", "true"]) +# def test_vit_encoder_different_dimension_image(tmpdir, csv_filename, use_pretrained: bool): +# input_features = [ +# image_feature( +# os.path.join(tmpdir, "generated_output"), +# preprocessing={"in_memory": True, "height": 224, "width": 206, "num_channels": 3}, +# encoder={TYPE: "_vit_legacy", "use_pretrained": use_pretrained}, +# ) +# ] +# output_features = [category_feature(decoder={"vocab_size": 5}, reduce_input="sum")] + +# data_csv = generate_data( +# input_features, output_features, os.path.join(tmpdir, csv_filename), num_examples=NUM_EXAMPLES +# ) + +# config = { +# INPUT_FEATURES: input_features, +# OUTPUT_FEATURES: output_features, +# TRAINER: {"train_steps": 1}, +# } + +# model = LudwigModel(config) + +# # Failure happens post preprocessing but before training during the ECD model creation phase +# # so make sure the model can be created properly and training can proceed +# model.train(dataset=data_csv) # TODO: ALEX -@pytest.mark.parametrize("use_pretrained", [False, True], ids=["false", "true"]) -def test_vit_encoder_different_dimension_image(tmpdir, csv_filename, use_pretrained: bool): - input_features = [ - image_feature( - os.path.join(tmpdir, "generated_output"), - preprocessing={"in_memory": True, "height": 224, "width": 206, "num_channels": 3}, - encoder={TYPE: "_vit_legacy", "use_pretrained": use_pretrained}, - ) - ] - output_features = [category_feature(decoder={"vocab_size": 5}, reduce_input="sum")] - - data_csv = generate_data( - input_features, output_features, os.path.join(tmpdir, csv_filename), num_examples=NUM_EXAMPLES - ) - - config = { - INPUT_FEATURES: input_features, - OUTPUT_FEATURES: output_features, - TRAINER: {"train_steps": 1}, - } - - model = LudwigModel(config) - - # Failure happens post preprocessing but before training during the ECD model creation phase - # so make sure the model can be created properly and training can proceed - model.train(dataset=data_csv) From b20b64a2518aa85abf8e977c850de384adbb96ed Mon Sep 17 00:00:00 2001 From: Alex Sherstinsky Date: Sat, 9 Mar 2024 15:54:16 -0800 Subject: [PATCH 19/36] Troubleshooting test failures. --- ludwig/utils/carton_utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ludwig/utils/carton_utils.py b/ludwig/utils/carton_utils.py index 17b1f2b3cbc..7dea3d7c8d5 100644 --- a/ludwig/utils/carton_utils.py +++ b/ludwig/utils/carton_utils.py @@ -157,8 +157,9 @@ async def packster() -> str: # raise ValueError(exception_message) from ie # TODO: ALEX # TODO: ALEX + max_tries: int = 5 idx: int - for idx in range(5): + for idx in range(max_tries): print(f"\n[ALEX_TEST] [WOUTPUT] TRYING_IDX:\n{idx} ; TYPE: {str(type(idx))}") try: a: str = await carton.pack( @@ -181,6 +182,8 @@ async def packster() -> str: exception_message += f'{type(ie).__name__}: "{str(ie)}". Traceback: "{exception_traceback}".' sys.stderr.write(exception_message) sys.stderr.flush() + # raise ValueError(exception_message) from ie + if idx >= max_tries - 1: raise ValueError(exception_message) from ie # TODO: ALEX From 4b03b61292a0a7673859f53fa2807ac37228b06c Mon Sep 17 00:00:00 2001 From: Alex Sherstinsky Date: Sat, 9 Mar 2024 16:02:28 -0800 Subject: [PATCH 20/36] Troubleshooting test failures. --- ludwig/utils/carton_utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ludwig/utils/carton_utils.py b/ludwig/utils/carton_utils.py index 7dea3d7c8d5..194e0b942b7 100644 --- a/ludwig/utils/carton_utils.py +++ b/ludwig/utils/carton_utils.py @@ -161,6 +161,7 @@ async def packster() -> str: idx: int for idx in range(max_tries): print(f"\n[ALEX_TEST] [WOUTPUT] TRYING_IDX:\n{idx} ; TYPE: {str(type(idx))}") + time.sleep(1) try: a: str = await carton.pack( input_model_path, @@ -185,6 +186,7 @@ async def packster() -> str: # raise ValueError(exception_message) from ie if idx >= max_tries - 1: raise ValueError(exception_message) from ie + time.sleep(1) # TODO: ALEX # TODO: ALEX From b7b40f39a361fb9138aae0c2abc7b850de3b6a35 Mon Sep 17 00:00:00 2001 From: Alex Sherstinsky Date: Sat, 9 Mar 2024 16:03:50 -0800 Subject: [PATCH 21/36] Troubleshooting test failures. --- ludwig/utils/carton_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ludwig/utils/carton_utils.py b/ludwig/utils/carton_utils.py index 194e0b942b7..38cfb768bad 100644 --- a/ludwig/utils/carton_utils.py +++ b/ludwig/utils/carton_utils.py @@ -185,7 +185,7 @@ async def packster() -> str: sys.stderr.flush() # raise ValueError(exception_message) from ie if idx >= max_tries - 1: - raise ValueError(exception_message) from ie + raise ValueError("THINGS ENDED VERY BADLY!!!!!!!!!!!!!") time.sleep(1) # TODO: ALEX From 13d96e1c59288c50be7b919ea6c8bad1e3adef18 Mon Sep 17 00:00:00 2001 From: Alex Sherstinsky Date: Sun, 10 Mar 2024 08:36:53 -0700 Subject: [PATCH 22/36] Troubleshooting test failures. --- ludwig/utils/carton_utils.py | 150 ++++++++++++++++++----------------- 1 file changed, 77 insertions(+), 73 deletions(-) diff --git a/ludwig/utils/carton_utils.py b/ludwig/utils/carton_utils.py index 38cfb768bad..cb9daeb1185 100644 --- a/ludwig/utils/carton_utils.py +++ b/ludwig/utils/carton_utils.py @@ -98,6 +98,7 @@ def _get_output_spec(model: LudwigModel) -> List[Dict[str, Any]]: @DeveloperAPI def export_carton(model: LudwigModel, carton_path: str, carton_model_name="ludwig_model"): + print(f"\n[ALEX_TEST] [WOUTPUT] CARTON_PATH:\n{carton_path} ; TYPE: {str(type(carton_path))}") try: import cartonml as carton except ImportError: @@ -108,86 +109,88 @@ def export_carton(model: LudwigModel, carton_path: str, carton_model_name="ludwi print(f"\n[ALEX_TEST] [WOUTPUT] MODEL_TORCH_SCRIPT:\n{model_ts} ; TYPE: {str(type(model_ts))}") with tempfile.TemporaryDirectory() as tmpdir: + print(f"\n[ALEX_TEST] [WOUTPUT] TMPDIR:\n{tmpdir} ; TYPE: {str(type(tmpdir))}") # Save the model to a temp dir input_model_path: str = os.path.join(tmpdir, "model.pt") torch.jit.save(model_ts, input_model_path) + print(f"\n[ALEX_TEST] [WOUTPUT] INPUT_MODEL_PATH:\n{input_model_path} ; TYPE: {str(type(input_model_path))}") # carton.pack is an async function so we run it and wait until it's complete # See https://pyo3.rs/v0.20.0/ecosystem/async-await#a-note-about-asynciorun for why we wrap it # in another function # TODO: ALEX - # async def pack(): - # return await carton.pack( - # input_model_path, - # runner_name="torchscript", - # # Any 2.x.x version is okay - # # TODO: improve this - # required_framework_version="=2", - # model_name=carton_model_name, - # inputs=_get_input_spec(model), - # outputs=_get_output_spec(model), - # ) + async def pack(): + return await carton.pack( + path=input_model_path, + runner_name="torchscript", + # Any 2.x.x version is okay + # TODO: improve this + required_framework_version=">=2", + model_name=carton_model_name, + inputs=_get_input_spec(model), + outputs=_get_output_spec(model), + ) # TODO: ALEX # TODO: ALEX - async def packster() -> str: - time.sleep(1) - # TODO: ALEX - # try: - # a: str = await carton.pack( - # input_model_path, - # runner_name="torchscript", - # # Any 2.x.x version is okay - # # TODO: improve this - # required_framework_version="=2", - # model_name=carton_model_name, - # inputs=_get_input_spec(model), - # outputs=_get_output_spec(model), - # ) - # time.sleep(1) - # print(f"\n[ALEX_TEST] [WOUTPUT] WOUTPUT:\n{a} ; TYPE: {str(type(a))}") - # time.sleep(1) - # return a - # except Exception as ie: - # exception_message: str = "A Packster-Inside Exception occurred.\n" - # exception_traceback: str = traceback.format_exc() - # exception_message += f'{type(ie).__name__}: "{str(ie)}". Traceback: "{exception_traceback}".' - # sys.stderr.write(exception_message) - # sys.stderr.flush() - # raise ValueError(exception_message) from ie - # TODO: ALEX - # TODO: ALEX - max_tries: int = 5 - idx: int - for idx in range(max_tries): - print(f"\n[ALEX_TEST] [WOUTPUT] TRYING_IDX:\n{idx} ; TYPE: {str(type(idx))}") - time.sleep(1) - try: - a: str = await carton.pack( - input_model_path, - runner_name="torchscript", - # Any 2.x.x version is okay - # TODO: improve this - required_framework_version="=2", - model_name=carton_model_name, - inputs=_get_input_spec(model), - outputs=_get_output_spec(model), - ) - time.sleep(1) - print(f"\n[ALEX_TEST] [WOUTPUT] WOUTPUT:\n{a} ; TYPE: {str(type(a))}") - time.sleep(1) - return a - except Exception as ie: - exception_message: str = "A Packster-Inside Exception occurred.\n" - exception_traceback: str = traceback.format_exc() - exception_message += f'{type(ie).__name__}: "{str(ie)}". Traceback: "{exception_traceback}".' - sys.stderr.write(exception_message) - sys.stderr.flush() - # raise ValueError(exception_message) from ie - if idx >= max_tries - 1: - raise ValueError("THINGS ENDED VERY BADLY!!!!!!!!!!!!!") - time.sleep(1) - # TODO: ALEX + # async def packster() -> str: + # time.sleep(1) + # # TODO: ALEX + # # try: + # # a: str = await carton.pack( + # # input_model_path, + # # runner_name="torchscript", + # # # Any 2.x.x version is okay + # # # TODO: improve this + # # required_framework_version="=2", + # # model_name=carton_model_name, + # # inputs=_get_input_spec(model), + # # outputs=_get_output_spec(model), + # # ) + # # time.sleep(1) + # # print(f"\n[ALEX_TEST] [WOUTPUT] WOUTPUT:\n{a} ; TYPE: {str(type(a))}") + # # time.sleep(1) + # # return a + # # except Exception as ie: + # # exception_message: str = "A Packster-Inside Exception occurred.\n" + # # exception_traceback: str = traceback.format_exc() + # # exception_message += f'{type(ie).__name__}: "{str(ie)}". Traceback: "{exception_traceback}".' + # # sys.stderr.write(exception_message) + # # sys.stderr.flush() + # # raise ValueError(exception_message) from ie + # # TODO: ALEX + # # TODO: ALEX + # max_tries: int = 5 + # idx: int + # for idx in range(max_tries): + # print(f"\n[ALEX_TEST] [WOUTPUT] TRYING_IDX:\n{idx} ; TYPE: {str(type(idx))}") + # time.sleep(1) + # try: + # a: str = await carton.pack( + # input_model_path, + # runner_name="torchscript", + # # Any 2.x.x version is okay + # # TODO: improve this + # required_framework_version="=2", + # model_name=carton_model_name, + # inputs=_get_input_spec(model), + # outputs=_get_output_spec(model), + # ) + # time.sleep(1) + # print(f"\n[ALEX_TEST] [WOUTPUT] WOUTPUT:\n{a} ; TYPE: {str(type(a))}") + # time.sleep(1) + # return a + # except Exception as ie: + # exception_message: str = "A Packster-Inside Exception occurred.\n" + # exception_traceback: str = traceback.format_exc() + # exception_message += f'{type(ie).__name__}: "{str(ie)}". Traceback: "{exception_traceback}".' + # sys.stderr.write(exception_message) + # sys.stderr.flush() + # # raise ValueError(exception_message) from ie + # if idx >= max_tries - 1: + # raise ValueError("THINGS ENDED VERY BADLY!!!!!!!!!!!!!") + # time.sleep(1) + # TODO: ALEX # TODO: ALEX @@ -201,13 +204,14 @@ async def packster() -> str: import sys import traceback + tmp_out_path: str = None try: # TODO: ALEX - # tmp_out_path = loop.run_until_complete(pack()) + tmp_out_path = loop.run_until_complete(pack()) # TODO: ALEX # TODO: ALEX - time.sleep(1) - tmp_out_path: str = loop.run_until_complete(packster()) + # time.sleep(1) + # tmp_out_path: str = loop.run_until_complete(packster()) # TODO: ALEX except Exception as e: exception_message: str = "A general Exception occurred.\n" @@ -219,5 +223,5 @@ async def packster() -> str: # TODO: ALEX # Move it to the output path - time.sleep(1) + # time.sleep(1) shutil.move(tmp_out_path, carton_path) From 958abde9c18ecd17695f5aa5aaead202a2382292 Mon Sep 17 00:00:00 2001 From: Alex Sherstinsky Date: Sun, 10 Mar 2024 08:50:12 -0700 Subject: [PATCH 23/36] Troubleshooting test failures. --- ludwig/utils/carton_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ludwig/utils/carton_utils.py b/ludwig/utils/carton_utils.py index cb9daeb1185..fefd9f00c90 100644 --- a/ludwig/utils/carton_utils.py +++ b/ludwig/utils/carton_utils.py @@ -125,7 +125,7 @@ async def pack(): runner_name="torchscript", # Any 2.x.x version is okay # TODO: improve this - required_framework_version=">=2", + required_framework_version="=2.0", model_name=carton_model_name, inputs=_get_input_spec(model), outputs=_get_output_spec(model), From f0c286804bbbd76f8d1c1490d842ea955ce015a3 Mon Sep 17 00:00:00 2001 From: Alex Sherstinsky Date: Sun, 10 Mar 2024 09:04:50 -0700 Subject: [PATCH 24/36] Troubleshooting test failures. --- .github/workflows/pytest.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index 3397cb29dd0..f543293ad5b 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -70,7 +70,7 @@ jobs: cat requirements.txt | sed '/^torch[>=<\b]/d' | sed '/^torchtext/d' | sed '/^torchvision/d' | sed '/^torchaudio/d' > requirements-temp && mv requirements-temp requirements.txt cat requirements_distributed.txt | sed '/^ray[\[]/d' pip install torch==2.0.0 torchtext torchvision torchaudio - pip install ray==2.3.0 + pip install ray==2.3.1 pip install '.[test]' pip list shell: bash From ffbc78c3eea87f46cb2e7dd610e89d4595f0c7ee Mon Sep 17 00:00:00 2001 From: Alex Sherstinsky Date: Sun, 10 Mar 2024 22:37:08 -0700 Subject: [PATCH 25/36] Troubleshooting test failures. --- ludwig/utils/carton_utils.py | 141 ++++++++++++++++++----------------- 1 file changed, 73 insertions(+), 68 deletions(-) diff --git a/ludwig/utils/carton_utils.py b/ludwig/utils/carton_utils.py index fefd9f00c90..2daebb55a67 100644 --- a/ludwig/utils/carton_utils.py +++ b/ludwig/utils/carton_utils.py @@ -119,77 +119,82 @@ def export_carton(model: LudwigModel, carton_path: str, carton_model_name="ludwi # See https://pyo3.rs/v0.20.0/ecosystem/async-await#a-note-about-asynciorun for why we wrap it # in another function # TODO: ALEX - async def pack(): - return await carton.pack( - path=input_model_path, - runner_name="torchscript", - # Any 2.x.x version is okay - # TODO: improve this - required_framework_version="=2.0", - model_name=carton_model_name, - inputs=_get_input_spec(model), - outputs=_get_output_spec(model), - ) + # async def pack(): + # return await carton.pack( + # path=input_model_path, + # runner_name="torchscript", + # # Any 2.x.x version is okay + # # TODO: improve this + # required_framework_version="=2.0", + # model_name=carton_model_name, + # inputs=_get_input_spec(model), + # outputs=_get_output_spec(model), + # ) # TODO: ALEX # TODO: ALEX - # async def packster() -> str: - # time.sleep(1) - # # TODO: ALEX - # # try: - # # a: str = await carton.pack( - # # input_model_path, - # # runner_name="torchscript", - # # # Any 2.x.x version is okay - # # # TODO: improve this - # # required_framework_version="=2", - # # model_name=carton_model_name, - # # inputs=_get_input_spec(model), - # # outputs=_get_output_spec(model), - # # ) - # # time.sleep(1) - # # print(f"\n[ALEX_TEST] [WOUTPUT] WOUTPUT:\n{a} ; TYPE: {str(type(a))}") - # # time.sleep(1) - # # return a - # # except Exception as ie: - # # exception_message: str = "A Packster-Inside Exception occurred.\n" - # # exception_traceback: str = traceback.format_exc() - # # exception_message += f'{type(ie).__name__}: "{str(ie)}". Traceback: "{exception_traceback}".' - # # sys.stderr.write(exception_message) - # # sys.stderr.flush() - # # raise ValueError(exception_message) from ie - # # TODO: ALEX - # # TODO: ALEX - # max_tries: int = 5 - # idx: int - # for idx in range(max_tries): - # print(f"\n[ALEX_TEST] [WOUTPUT] TRYING_IDX:\n{idx} ; TYPE: {str(type(idx))}") - # time.sleep(1) - # try: - # a: str = await carton.pack( - # input_model_path, - # runner_name="torchscript", - # # Any 2.x.x version is okay - # # TODO: improve this - # required_framework_version="=2", - # model_name=carton_model_name, - # inputs=_get_input_spec(model), - # outputs=_get_output_spec(model), - # ) - # time.sleep(1) - # print(f"\n[ALEX_TEST] [WOUTPUT] WOUTPUT:\n{a} ; TYPE: {str(type(a))}") - # time.sleep(1) - # return a - # except Exception as ie: - # exception_message: str = "A Packster-Inside Exception occurred.\n" - # exception_traceback: str = traceback.format_exc() - # exception_message += f'{type(ie).__name__}: "{str(ie)}". Traceback: "{exception_traceback}".' - # sys.stderr.write(exception_message) - # sys.stderr.flush() - # # raise ValueError(exception_message) from ie - # if idx >= max_tries - 1: - # raise ValueError("THINGS ENDED VERY BADLY!!!!!!!!!!!!!") - # time.sleep(1) + async def packster() -> str: + # time.sleep(1) + # TODO: ALEX + # try: + # a: str = await carton.pack( + # input_model_path, + # runner_name="torchscript", + # # Any 2.x.x version is okay + # # TODO: improve this + # required_framework_version="=2", + # model_name=carton_model_name, + # inputs=_get_input_spec(model), + # outputs=_get_output_spec(model), + # ) + # time.sleep(1) + # print(f"\n[ALEX_TEST] [WOUTPUT] WOUTPUT:\n{a} ; TYPE: {str(type(a))}") + # time.sleep(1) + # return a + # except Exception as ie: + # exception_message: str = "A Packster-Inside Exception occurred.\n" + # exception_traceback: str = traceback.format_exc() + # exception_message += f'{type(ie).__name__}: "{str(ie)}". Traceback: "{exception_traceback}".' + # sys.stderr.write(exception_message) + # sys.stderr.flush() + # raise ValueError(exception_message) from ie + # TODO: ALEX + # TODO: ALEX + # max_tries: int = 5 + max_tries: int = 1 + idx: int + em: str = "" + for idx in range(max_tries): + print(f"\n[ALEX_TEST] [WOUTPUT] TRYING_IDX:\n{idx} ; TYPE: {str(type(idx))}") + time.sleep(1) + try: + a: str = await carton.pack( + input_model_path, + runner_name="torchscript", + # Any 2.x.x version is okay + # TODO: improve this + required_framework_version="=2", + model_name=carton_model_name, + inputs=_get_input_spec(model), + outputs=_get_output_spec(model), + ) + # time.sleep(1) + print(f"\n[ALEX_TEST] [WOUTPUT] WOUTPUT:\n{a} ; TYPE: {str(type(a))}") + # time.sleep(1) + return a + except Exception as ie: + exception_message: str = "A Packster-Inside Exception occurred.\n" + exception_traceback: str = traceback.format_exc() + exception_message += f'{type(ie).__name__}: "{str(ie)}". Traceback: "{exception_traceback}".' + sys.stderr.write(exception_message) + sys.stderr.flush() + em = exception_message + # raise ValueError(exception_message) from ie + if idx >= max_tries - 1: + # raise ValueError("THINGS ENDED VERY BADLY!!!!!!!!!!!!!") + raise ValueError(em) + # time.sleep(1) + # TODO: ALEX # TODO: ALEX From aa6f879181f3ab88d66266ebf54c823cc1f842cf Mon Sep 17 00:00:00 2001 From: Alex Sherstinsky Date: Sun, 10 Mar 2024 22:38:30 -0700 Subject: [PATCH 26/36] Troubleshooting test failures. --- ludwig/utils/carton_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ludwig/utils/carton_utils.py b/ludwig/utils/carton_utils.py index 2daebb55a67..21471af6ca1 100644 --- a/ludwig/utils/carton_utils.py +++ b/ludwig/utils/carton_utils.py @@ -133,7 +133,7 @@ def export_carton(model: LudwigModel, carton_path: str, carton_model_name="ludwi # TODO: ALEX # TODO: ALEX - async def packster() -> str: + async def pack() -> str: # time.sleep(1) # TODO: ALEX # try: From 57bd646822512981365a7b2323a4abc2020e2421 Mon Sep 17 00:00:00 2001 From: Alex Sherstinsky Date: Mon, 11 Mar 2024 17:29:05 -0700 Subject: [PATCH 27/36] Troubleshooting test failures. --- .github/workflows/pytest.yml | 5 +++++ ludwig/utils/carton_utils.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index f543293ad5b..5be57f91db4 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -60,6 +60,11 @@ jobs: run: | brew install libuv + - name: Debug out of space + run: | + du -h -d 1 ~ + df -h + - name: Install dependencies run: | python --version diff --git a/ludwig/utils/carton_utils.py b/ludwig/utils/carton_utils.py index 21471af6ca1..ed5e787bfca 100644 --- a/ludwig/utils/carton_utils.py +++ b/ludwig/utils/carton_utils.py @@ -169,7 +169,7 @@ async def pack() -> str: time.sleep(1) try: a: str = await carton.pack( - input_model_path, + path=input_model_path, runner_name="torchscript", # Any 2.x.x version is okay # TODO: improve this From 71ad3f9ad91698df1c40b0ecf57ff91c9db91b14 Mon Sep 17 00:00:00 2001 From: Alex Sherstinsky Date: Mon, 11 Mar 2024 17:47:40 -0700 Subject: [PATCH 28/36] Troubleshooting test failures. --- .github/workflows/pytest.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index 5be57f91db4..f93d1f20aa2 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -60,11 +60,6 @@ jobs: run: | brew install libuv - - name: Debug out of space - run: | - du -h -d 1 ~ - df -h - - name: Install dependencies run: | python --version @@ -80,6 +75,11 @@ jobs: pip list shell: bash + - name: Debug out of space + run: | + du -h -d 1 ~ + df -h + - name: Integration Tests run: | RUN_PRIVATE=$IS_NOT_FORK LUDWIG_TEST_SUITE_TIMEOUT_S=7200 pytest -v --timeout 300 --durations 100 -m "not slow and not combinatorial and not horovod and not llm and $MARKERS" --junitxml pytest.xml tests/integration_tests From 23cacf69f3c443cd8f0c1b9b50d04b4a3fa73ef5 Mon Sep 17 00:00:00 2001 From: Alex Sherstinsky Date: Mon, 11 Mar 2024 17:57:26 -0700 Subject: [PATCH 29/36] Troubleshooting test failures. --- ludwig/utils/carton_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ludwig/utils/carton_utils.py b/ludwig/utils/carton_utils.py index ed5e787bfca..3f16f8feea4 100644 --- a/ludwig/utils/carton_utils.py +++ b/ludwig/utils/carton_utils.py @@ -192,7 +192,7 @@ async def pack() -> str: # raise ValueError(exception_message) from ie if idx >= max_tries - 1: # raise ValueError("THINGS ENDED VERY BADLY!!!!!!!!!!!!!") - raise ValueError(em) + raise ValueError(em) from ie # time.sleep(1) # TODO: ALEX From fea546fe10b6fe1375cf9928497e7bf7a8d2e3f1 Mon Sep 17 00:00:00 2001 From: Alex Sherstinsky Date: Mon, 11 Mar 2024 18:27:45 -0700 Subject: [PATCH 30/36] Troubleshooting test failures. --- ludwig/utils/carton_utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ludwig/utils/carton_utils.py b/ludwig/utils/carton_utils.py index 3f16f8feea4..3586b0bd5a0 100644 --- a/ludwig/utils/carton_utils.py +++ b/ludwig/utils/carton_utils.py @@ -164,6 +164,7 @@ async def pack() -> str: max_tries: int = 1 idx: int em: str = "" + error: Exception | None = None for idx in range(max_tries): print(f"\n[ALEX_TEST] [WOUTPUT] TRYING_IDX:\n{idx} ; TYPE: {str(type(idx))}") time.sleep(1) @@ -189,10 +190,11 @@ async def pack() -> str: sys.stderr.write(exception_message) sys.stderr.flush() em = exception_message + error = ie # raise ValueError(exception_message) from ie if idx >= max_tries - 1: # raise ValueError("THINGS ENDED VERY BADLY!!!!!!!!!!!!!") - raise ValueError(em) from ie + raise ValueError(em) from error # time.sleep(1) # TODO: ALEX From a04219c4b87a4f258361e615f5e9c0944554a281 Mon Sep 17 00:00:00 2001 From: Alex Sherstinsky Date: Mon, 11 Mar 2024 18:40:33 -0700 Subject: [PATCH 31/36] Troubleshooting test failures. --- .github/workflows/pytest.yml | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index f93d1f20aa2..4001de00cc5 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -75,15 +75,22 @@ jobs: pip list shell: bash - - name: Debug out of space + - name: Debug out of space -- A run: | du -h -d 1 ~ df -h + du -s /tmp - name: Integration Tests run: | RUN_PRIVATE=$IS_NOT_FORK LUDWIG_TEST_SUITE_TIMEOUT_S=7200 pytest -v --timeout 300 --durations 100 -m "not slow and not combinatorial and not horovod and not llm and $MARKERS" --junitxml pytest.xml tests/integration_tests + - name: Debug out of space -- B + run: | + du -h -d 1 ~ + df -h + du -s /tmp + event_file: name: "Event File" runs-on: ubuntu-latest From 1c6553217e480e142d17df0a87948b7ee36d3708 Mon Sep 17 00:00:00 2001 From: Alex Sherstinsky Date: Tue, 12 Mar 2024 00:14:32 -0700 Subject: [PATCH 32/36] Troubleshooting test failures. --- .github/workflows/pytest.yml | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index 4001de00cc5..f222d8098eb 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -79,7 +79,17 @@ jobs: run: | du -h -d 1 ~ df -h - du -s /tmp + # du -s /tmp + + - name: Free Disk Space (Ubuntu) + uses: jlumbroso/free-disk-space@main + with: + tool-cache: false + android: true + dotnet: true + haskell: true + large-packages: false + swap-storage: true - name: Integration Tests run: | @@ -89,7 +99,7 @@ jobs: run: | du -h -d 1 ~ df -h - du -s /tmp + # du -s /tmp event_file: name: "Event File" From 265a9a06eb276d62dc4c2b7287cc462ed360be94 Mon Sep 17 00:00:00 2001 From: Alex Sherstinsky Date: Tue, 12 Mar 2024 00:30:05 -0700 Subject: [PATCH 33/36] Troubleshooting test failures. --- .github/workflows/pytest.yml | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index f222d8098eb..d83af12cf26 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -75,11 +75,11 @@ jobs: pip list shell: bash - - name: Debug out of space -- A - run: | - du -h -d 1 ~ - df -h - # du -s /tmp + # - name: Debug out of space -- A + # run: | + # du -h -d 1 ~ + # df -h + # # du -s /tmp - name: Free Disk Space (Ubuntu) uses: jlumbroso/free-disk-space@main @@ -89,17 +89,22 @@ jobs: dotnet: true haskell: true large-packages: false + docker-images: true swap-storage: true - - name: Integration Tests + - name: Clean out /tmp directory run: | - RUN_PRIVATE=$IS_NOT_FORK LUDWIG_TEST_SUITE_TIMEOUT_S=7200 pytest -v --timeout 300 --durations 100 -m "not slow and not combinatorial and not horovod and not llm and $MARKERS" --junitxml pytest.xml tests/integration_tests + rm -rf /tmp/* - - name: Debug out of space -- B + - name: #Integration Tests run: | - du -h -d 1 ~ - df -h - # du -s /tmp + RUN_PRIVATE=$IS_NOT_FORK LUDWIG_TEST_SUITE_TIMEOUT_S=7200 pytest -v --timeout 300 --durations 100 -m "not slow and not combinatorial and not horovod and not llm and $MARKERS" --junitxml pytest.xml tests/integration_tests + + # - name: Debug out of space -- B + # run: | + # du -h -d 1 ~ + # df -h + # # du -s /tmp event_file: name: "Event File" From c9e9d3292829a7ccd1149d602159419f6b0f9a80 Mon Sep 17 00:00:00 2001 From: Alex Sherstinsky Date: Tue, 12 Mar 2024 00:39:04 -0700 Subject: [PATCH 34/36] Troubleshooting test failures. --- .github/workflows/pytest.yml | 2 +- tests/integration_tests/test_f_control.py | 60 ++++++++++++----------- 2 files changed, 32 insertions(+), 30 deletions(-) diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index d83af12cf26..491ce2a9fd1 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -94,7 +94,7 @@ jobs: - name: Clean out /tmp directory run: | - rm -rf /tmp/* + sudo rm -rf /tmp/* - name: #Integration Tests run: | diff --git a/tests/integration_tests/test_f_control.py b/tests/integration_tests/test_f_control.py index 8724732fc8f..d8cf5e2deb4 100644 --- a/tests/integration_tests/test_f_control.py +++ b/tests/integration_tests/test_f_control.py @@ -180,33 +180,35 @@ async def infer(inputs): # TODO: ALEX -# # TODO: ALEX -# @pytest.mark.integration_tests_x -# # TODO: ALEX -# @pytest.mark.parametrize("use_pretrained", [False, True], ids=["false", "true"]) -# def test_vit_encoder_different_dimension_image(tmpdir, csv_filename, use_pretrained: bool): -# input_features = [ -# image_feature( -# os.path.join(tmpdir, "generated_output"), -# preprocessing={"in_memory": True, "height": 224, "width": 206, "num_channels": 3}, -# encoder={TYPE: "_vit_legacy", "use_pretrained": use_pretrained}, -# ) -# ] -# output_features = [category_feature(decoder={"vocab_size": 5}, reduce_input="sum")] - -# data_csv = generate_data( -# input_features, output_features, os.path.join(tmpdir, csv_filename), num_examples=NUM_EXAMPLES -# ) - -# config = { -# INPUT_FEATURES: input_features, -# OUTPUT_FEATURES: output_features, -# TRAINER: {"train_steps": 1}, -# } - -# model = LudwigModel(config) - -# # Failure happens post preprocessing but before training during the ECD model creation phase -# # so make sure the model can be created properly and training can proceed -# model.train(dataset=data_csv) +# TODO: ALEX +@pytest.mark.integration_tests_x +# TODO: ALEX +@pytest.mark.parametrize("use_pretrained", [False, True], ids=["false", "true"]) +def test_vit_encoder_different_dimension_image(tmpdir, csv_filename, use_pretrained: bool): + input_features = [ + image_feature( + os.path.join(tmpdir, "generated_output"), + preprocessing={"in_memory": True, "height": 224, "width": 206, "num_channels": 3}, + encoder={TYPE: "_vit_legacy", "use_pretrained": use_pretrained}, + ) + ] + output_features = [category_feature(decoder={"vocab_size": 5}, reduce_input="sum")] + + data_csv = generate_data( + input_features, output_features, os.path.join(tmpdir, csv_filename), num_examples=NUM_EXAMPLES + ) + + config = { + INPUT_FEATURES: input_features, + OUTPUT_FEATURES: output_features, + TRAINER: {"train_steps": 1}, + } + + model = LudwigModel(config) + + # Failure happens post preprocessing but before training during the ECD model creation phase + # so make sure the model can be created properly and training can proceed + model.train(dataset=data_csv) + + # TODO: ALEX From d1f7592655ed82c635888059f7d4152e8f8b8101 Mon Sep 17 00:00:00 2001 From: Alex Sherstinsky Date: Tue, 12 Mar 2024 08:38:06 -0700 Subject: [PATCH 35/36] Making carton utils more robust. --- .pre-commit-config.yaml | 18 ++--- ludwig/utils/carton_utils.py | 135 +++++++---------------------------- pytest.ini | 14 +++- tests/conftest.py | 3 - 4 files changed, 47 insertions(+), 123 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 27fd6a51fde..1c6390db514 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -38,15 +38,15 @@ repos: hooks: - id: docformatter args: [--in-place, --wrap-summaries=115, --wrap-descriptions=120] - #- repo: https://github.com/PyCQA/isort - # rev: 5.12.0 - # hooks: - # - id: isort - # name: Format imports - #- repo: https://github.com/pycqa/flake8 - # rev: 6.0.0 - # hooks: - # - id: flake8 + - repo: https://github.com/PyCQA/isort + rev: 5.12.0 + hooks: + - id: isort + name: Format imports + - repo: https://github.com/pycqa/flake8 + rev: 6.0.0 + hooks: + - id: flake8 - repo: https://github.com/psf/black rev: 23.3.0 hooks: diff --git a/ludwig/utils/carton_utils.py b/ludwig/utils/carton_utils.py index 3586b0bd5a0..d03d0a4cd92 100644 --- a/ludwig/utils/carton_utils.py +++ b/ludwig/utils/carton_utils.py @@ -3,7 +3,9 @@ import logging import os import shutil +import sys import tempfile +import traceback from typing import Any, Dict, List import torch @@ -98,7 +100,6 @@ def _get_output_spec(model: LudwigModel) -> List[Dict[str, Any]]: @DeveloperAPI def export_carton(model: LudwigModel, carton_path: str, carton_model_name="ludwig_model"): - print(f"\n[ALEX_TEST] [WOUTPUT] CARTON_PATH:\n{carton_path} ; TYPE: {str(type(carton_path))}") try: import cartonml as carton except ImportError: @@ -106,129 +107,43 @@ def export_carton(model: LudwigModel, carton_path: str, carton_model_name="ludwi # Generate a torchscript model model_ts = generate_carton_torchscript(model) - print(f"\n[ALEX_TEST] [WOUTPUT] MODEL_TORCH_SCRIPT:\n{model_ts} ; TYPE: {str(type(model_ts))}") with tempfile.TemporaryDirectory() as tmpdir: - print(f"\n[ALEX_TEST] [WOUTPUT] TMPDIR:\n{tmpdir} ; TYPE: {str(type(tmpdir))}") # Save the model to a temp dir input_model_path: str = os.path.join(tmpdir, "model.pt") torch.jit.save(model_ts, input_model_path) - print(f"\n[ALEX_TEST] [WOUTPUT] INPUT_MODEL_PATH:\n{input_model_path} ; TYPE: {str(type(input_model_path))}") # carton.pack is an async function so we run it and wait until it's complete # See https://pyo3.rs/v0.20.0/ecosystem/async-await#a-note-about-asynciorun for why we wrap it # in another function - # TODO: ALEX - # async def pack(): - # return await carton.pack( - # path=input_model_path, - # runner_name="torchscript", - # # Any 2.x.x version is okay - # # TODO: improve this - # required_framework_version="=2.0", - # model_name=carton_model_name, - # inputs=_get_input_spec(model), - # outputs=_get_output_spec(model), - # ) - - # TODO: ALEX - # TODO: ALEX async def pack() -> str: - # time.sleep(1) - # TODO: ALEX - # try: - # a: str = await carton.pack( - # input_model_path, - # runner_name="torchscript", - # # Any 2.x.x version is okay - # # TODO: improve this - # required_framework_version="=2", - # model_name=carton_model_name, - # inputs=_get_input_spec(model), - # outputs=_get_output_spec(model), - # ) - # time.sleep(1) - # print(f"\n[ALEX_TEST] [WOUTPUT] WOUTPUT:\n{a} ; TYPE: {str(type(a))}") - # time.sleep(1) - # return a - # except Exception as ie: - # exception_message: str = "A Packster-Inside Exception occurred.\n" - # exception_traceback: str = traceback.format_exc() - # exception_message += f'{type(ie).__name__}: "{str(ie)}". Traceback: "{exception_traceback}".' - # sys.stderr.write(exception_message) - # sys.stderr.flush() - # raise ValueError(exception_message) from ie - # TODO: ALEX - # TODO: ALEX - # max_tries: int = 5 - max_tries: int = 1 - idx: int - em: str = "" - error: Exception | None = None - for idx in range(max_tries): - print(f"\n[ALEX_TEST] [WOUTPUT] TRYING_IDX:\n{idx} ; TYPE: {str(type(idx))}") - time.sleep(1) - try: - a: str = await carton.pack( - path=input_model_path, - runner_name="torchscript", - # Any 2.x.x version is okay - # TODO: improve this - required_framework_version="=2", - model_name=carton_model_name, - inputs=_get_input_spec(model), - outputs=_get_output_spec(model), - ) - # time.sleep(1) - print(f"\n[ALEX_TEST] [WOUTPUT] WOUTPUT:\n{a} ; TYPE: {str(type(a))}") - # time.sleep(1) - return a - except Exception as ie: - exception_message: str = "A Packster-Inside Exception occurred.\n" - exception_traceback: str = traceback.format_exc() - exception_message += f'{type(ie).__name__}: "{str(ie)}". Traceback: "{exception_traceback}".' - sys.stderr.write(exception_message) - sys.stderr.flush() - em = exception_message - error = ie - # raise ValueError(exception_message) from ie - if idx >= max_tries - 1: - # raise ValueError("THINGS ENDED VERY BADLY!!!!!!!!!!!!!") - raise ValueError(em) from error - # time.sleep(1) - - # TODO: ALEX - - # TODO: ALEX - - # TODO: ALEX - loop = asyncio.get_event_loop() - print(f"\n[ALEX_TEST] [WOUTPUT] LOOP:\n{loop} ; TYPE: {str(type(loop))}") - # tmp_out_path = loop.run_until_complete(pack()) - # TODO: ALEX - # TODO: ALEX - import time - import sys - import traceback - - tmp_out_path: str = None + try: + return await carton.pack( + path=input_model_path, + runner_name="torchscript", + # Any 2.x.x version is okay + # TODO: improve this + required_framework_version="=2", + model_name=carton_model_name, + inputs=_get_input_spec(model), + outputs=_get_output_spec(model), + ) + except Exception as e: + exception_message: str = 'An Exception inside "pack()" occurred.\n' + exception_traceback: str = traceback.format_exc() + exception_message += f'{type(e).__name__}: "{str(e)}". Traceback: "{exception_traceback}".' + sys.stderr.write(exception_message) + sys.stderr.flush() + raise ValueError(exception_message) from e # Re-raise error for calling function to handle. + try: - # TODO: ALEX - tmp_out_path = loop.run_until_complete(pack()) - # TODO: ALEX - # TODO: ALEX - # time.sleep(1) - # tmp_out_path: str = loop.run_until_complete(packster()) - # TODO: ALEX + tmp_out_path: str = asyncio.get_event_loop().run_until_complete(pack()) + # Move it to the output path + shutil.move(tmp_out_path, carton_path) except Exception as e: - exception_message: str = "A general Exception occurred.\n" + exception_message: str = 'An Exception inside "export_carton()" occurred.\n' exception_traceback: str = traceback.format_exc() exception_message += f'{type(e).__name__}: "{str(e)}". Traceback: "{exception_traceback}".' sys.stderr.write(exception_message) sys.stderr.flush() raise SystemExit(exception_message) from e # Make sure error is fatal. - # TODO: ALEX - - # Move it to the output path - # time.sleep(1) - shutil.move(tmp_out_path, carton_path) diff --git a/pytest.ini b/pytest.ini index a82421e060c..539a53b1674 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,5 +1,17 @@ [pytest] markers = - integration_tests_x: mark a test to be run as part of integration tests, group X. + benchmark: mark a test as a benchmarking test. + distributed: mark a test as a distributed test. + filesystem: mark to test operating system systems. + slow: mark test as slow. + combinatorial: mark a test as combinatorial. + horovod: mark a test as a Horovod test. + llm: mark a test as an LLM test. + integration_tests_a: mark a test to be run as part of integration tests, group A. + integration_tests_b: mark a test to be run as part of integration tests, group B. + integration_tests_c: mark a test to be run as part of integration tests, group C. + integration_tests_d: mark a test to be run as part of integration tests, group D. + integration_tests_e: mark a test to be run as part of integration tests, group E. + integration_tests_f: mark a test to be run as part of integration tests, group F. filterwarnings = ignore::DeprecationWarning diff --git a/tests/conftest.py b/tests/conftest.py index 3b2abe99622..9dae92e2e65 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -44,9 +44,6 @@ "integration_tests_c", "integration_tests_d", "integration_tests_e", - # TODO: ALEX - "integration_tests_x", - # TODO: ALEX } From 9e5b5cdd39d207588422e16acb23abb5207d3985 Mon Sep 17 00:00:00 2001 From: Alex Sherstinsky Date: Tue, 12 Mar 2024 08:45:53 -0700 Subject: [PATCH 36/36] Adding storage cleaning to GitHub Actions for integration tests. --- .github/workflows/pytest.yml | 500 +++++++++++++++++++++- tests/integration_tests/test_f_control.py | 214 --------- 2 files changed, 486 insertions(+), 228 deletions(-) delete mode 100644 tests/integration_tests/test_f_control.py diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index 491ce2a9fd1..4f90d4b138c 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -16,6 +16,187 @@ concurrency: cancel-in-progress: true jobs: + pytest: + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest] + python-version: ["3.8", "3.9", "3.10"] + test-markers: ["not distributed", "distributed"] + include: + - python-version: "3.8" + pytorch-version: 2.0.0 + torchscript-version: 1.10.2 + ray-version: 2.3.1 + - python-version: "3.9" + pytorch-version: 2.1.1 + torchscript-version: 1.10.2 + ray-version: 2.3.1 + - python-version: "3.10" + # pytorch-version: nightly + pytorch-version: 2.2.1 + torchscript-version: 1.10.2 + ray-version: 2.3.1 + env: + PYTORCH: ${{ matrix.pytorch-version }} + MARKERS: ${{ matrix.test-markers }} + NEUROPOD_BASE_DIR: "/usr/local/lib/neuropod" + NEUROPOD_VERISON: "0.3.0-rc6" + TORCHSCRIPT_VERSION: ${{ matrix.torchscript-version }} + RAY_VERSION: ${{ matrix.ray-version }} + AWS_ACCESS_KEY_ID: ${{ secrets.LUDWIG_TESTS_AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.LUDWIG_TESTS_AWS_SECRET_ACCESS_KEY }} + KAGGLE_USERNAME: ${{ secrets.KAGGLE_USERNAME }} + KAGGLE_KEY: ${{ secrets.KAGGLE_KEY }} + IS_NOT_FORK: ${{ !(github.event.pull_request.base.repo.full_name == 'ludwig-ai/ludwig' && github.event.pull_request.head.repo.fork) }} + + name: py${{ matrix.python-version }}, torch-${{ matrix.pytorch-version }}, ${{ matrix.test-markers }}, ${{ matrix.os }}, ray ${{ matrix.ray-version }} + services: + minio: + image: fclairamb/minio-github-actions + env: + MINIO_ACCESS_KEY: minio + MINIO_SECRET_KEY: minio123 + ports: + - 9000:9000 + + timeout-minutes: 150 + steps: + - name: Setup ludwigai/ludwig-ray container for local testing with act. + if: ${{ env.ACT }} + run: | + curl -fsSL https://deb.nodesource.com/setup_16.x | sudo -E bash - + sudo apt-get install -y nodejs + sudo mkdir -p /opt/hostedtoolcache/ + sudo chmod 777 -R /opt/hostedtoolcache/ + - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + + - name: Setup Linux + if: runner.os == 'linux' + run: | + sudo apt-get update && sudo apt-get install -y cmake libsndfile1 wget libsox-dev + + - name: Setup macOS + if: runner.os == 'macOS' + run: | + brew install libuv + + - name: pip cache + if: ${{ !env.ACT }} + uses: actions/cache@v2 + with: + path: ~/.cache/pip + key: ${{ runner.os }}-pip-py${{ matrix.python-version }}-torch${{ matrix.pytorch-version }}-${{ matrix.test-markers }}-${{ hashFiles('requirements*.txt', '.github/workflows/pytest.yml') }} + + - name: Debug out of space + run: | + du -h -d 1 ~ + df -h + + - name: Install dependencies + run: | + python --version + pip --version + python -m pip install -U pip + cmake --version + + # remove torch and ray from the dependencies so we can add them depending on the matrix args for the job. + cat requirements.txt | sed '/^torch[>=<\b]/d' | sed '/^torchtext/d' | sed '/^torchvision/d' | sed '/^torchaudio/d' > requirements-temp && mv requirements-temp requirements.txt + cat requirements_distributed.txt | sed '/^ray[\[]/d' + + if [ "$MARKERS" != "distributed" ]; then + # Skip distributed and hyperopt requirements to test optional imports + echo > requirements-temp && mv requirements-temp requirements_distributed.txt + echo > requirements-temp && mv requirements-temp requirements_hyperopt.txt + + # Skip distributed tree requirement (lightgbm-ray) + cat requirements_tree.txt | sed '/^lightgbm-ray/d' > requirements-temp && mv requirements-temp requirements_tree.txt + else + if [ "$RAY_VERSION" == "nightly" ]; then + # NOTE: hardcoded for python 3.10 on Linux + echo "ray[default,data,serve,tune] @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp310-cp310-manylinux2014_x86_64.whl" >> requirements_distributed.txt + else + echo "ray[default,data,serve,tune]==$RAY_VERSION" >> requirements_distributed.txt + fi + fi + + if [ "$PYTORCH" == "nightly" ]; then + extra_index_url=https://download.pytorch.org/whl/nightly/cpu + pip install --pre torch torchtext torchvision torchaudio --extra-index-url $extra_index_url + + else + extra_index_url=https://download.pytorch.org/whl/cpu + pip install torch==$PYTORCH torchtext torchvision torchaudio --extra-index-url $extra_index_url + fi + + pip install '.[test]' --extra-index-url $extra_index_url + pip list + + if [ "$PYTORCH" == "nightly" ]; then + python -c "from packaging import version; import torch; assert version.parse(torch.__version__).release >= version.parse(\"2.0.0\").release, f\"torch {version.parse(torch.__version__).release} < version.parse(\'2.0.0\').release\"" + else + python -c "from packaging import version; import torch; assert version.parse(torch.__version__).release == version.parse(\"$PYTORCH\").release, f\"torch {version.parse(torch.__version__).release} != version.parse(\'$PYTORCH\').release\"" + fi + + if [ "$MARKERS" == "distributed" ]; then + python -c "from packaging import version; import ray; assert version.parse(ray.__version__).release == version.parse(\"$RAY_VERSION\").release, f\"ray {version.parse(ray.__version__).release} != version.parse(\'$RAY_VERSION\').release\"" + else + python -c "import importlib.util; assert importlib.util.find_spec('ray') is None, \"found ray but expected it to not be installed\"" + fi + shell: bash + + - name: Install Neuropod backend + run: | + sudo mkdir -p "$NEUROPOD_BASE_DIR" + curl -L https://github.com/uber/neuropod/releases/download/v${{ env.NEUROPOD_VERISON }}/libneuropod-cpu-linux-v${{ env.NEUROPOD_VERISON }}-torchscript-${{ env.TORCHSCRIPT_VERSION }}-backend.tar.gz | sudo tar -xz -C "$NEUROPOD_BASE_DIR" + shell: bash + + - name: Unit Tests + run: | + RUN_PRIVATE=$IS_NOT_FORK LUDWIG_TEST_SUITE_TIMEOUT_S=5400 pytest -v --timeout 300 --durations 100 -m "$MARKERS and not slow and not combinatorial and not horovod and not llm" --junitxml pytest.xml tests/ludwig + + - name: Regression Tests + run: | + RUN_PRIVATE=$IS_NOT_FORK LUDWIG_TEST_SUITE_TIMEOUT_S=5400 pytest -v --timeout 300 --durations 100 -m "$MARKERS and not slow and not combinatorial and not horovod or benchmark and not llm" --junitxml pytest.xml tests/regression_tests + + # Skip Horovod and replace with DDP. + # https://github.com/ludwig-ai/ludwig/issues/3468 + # - name: Install Horovod if necessary + # if: matrix.test-markers == 'distributed' && matrix.pytorch-version != 'nightly' + # env: + # HOROVOD_WITH_PYTORCH: 1 + # HOROVOD_WITHOUT_MPI: 1 + # HOROVOD_WITHOUT_TENSORFLOW: 1 + # HOROVOD_WITHOUT_MXNET: 1 + # run: | + # pip install -r requirements_extra.txt + # HOROVOD_BUILT=$(python -c "import horovod.torch; horovod.torch.nccl_built(); print('SUCCESS')" || true) + # if [[ $HOROVOD_BUILT != "SUCCESS" ]]; then + # pip uninstall -y horovod + # pip install --no-cache-dir git+https://github.com/horovod/horovod.git@master + # fi + # horovodrun --check-build + # shell: bash + + # Skip Horovod tests and replace with DDP. + # https://github.com/ludwig-ai/ludwig/issues/3468 + # - name: Horovod Tests + # if: matrix.test-markers == 'distributed' && matrix.pytorch-version != 'nightly' + # run: | + # RUN_PRIVATE=$IS_NOT_FORK LUDWIG_TEST_SUITE_TIMEOUT_S=5400 pytest -v --timeout 300 --durations 100 -m "$MARKERS and horovod and not slow and not combinatorial and not llm" --junitxml pytest.xml tests/ + + - name: Upload Unit Test Results + if: ${{ always() && !env.ACT }} + uses: actions/upload-artifact@v2 + with: + name: Unit Test Results (Python ${{ matrix.python-version }} ${{ matrix.test-markers }}) + path: pytest.xml + integration-tests: name: ${{ matrix.test-markers }} runs-on: ubuntu-latest @@ -23,7 +204,12 @@ jobs: fail-fast: false matrix: test-markers: - - "integration_tests_x" + - "integration_tests_a" + - "integration_tests_b" + - "integration_tests_c" + - "integration_tests_d" + - "integration_tests_e" + - "integration_tests_f" env: AWS_ACCESS_KEY_ID: ${{ secrets.LUDWIG_TESTS_AWS_ACCESS_KEY_ID }} @@ -70,17 +256,11 @@ jobs: cat requirements.txt | sed '/^torch[>=<\b]/d' | sed '/^torchtext/d' | sed '/^torchvision/d' | sed '/^torchaudio/d' > requirements-temp && mv requirements-temp requirements.txt cat requirements_distributed.txt | sed '/^ray[\[]/d' pip install torch==2.0.0 torchtext torchvision torchaudio - pip install ray==2.3.1 + pip install ray==2.3.0 pip install '.[test]' pip list shell: bash - # - name: Debug out of space -- A - # run: | - # du -h -d 1 ~ - # df -h - # # du -s /tmp - - name: Free Disk Space (Ubuntu) uses: jlumbroso/free-disk-space@main with: @@ -96,15 +276,281 @@ jobs: run: | sudo rm -rf /tmp/* - - name: #Integration Tests + - name: Integration Tests run: | RUN_PRIVATE=$IS_NOT_FORK LUDWIG_TEST_SUITE_TIMEOUT_S=7200 pytest -v --timeout 300 --durations 100 -m "not slow and not combinatorial and not horovod and not llm and $MARKERS" --junitxml pytest.xml tests/integration_tests - # - name: Debug out of space -- B - # run: | - # du -h -d 1 ~ - # df -h - # # du -s /tmp + llm-tests: + name: LLM Tests + runs-on: ubuntu-latest + + timeout-minutes: 60 + steps: + - uses: actions/checkout@v2 + - name: Set up Python 3.9 + uses: actions/setup-python@v2 + with: + python-version: 3.9 + + - name: Setup Linux + if: runner.os == 'linux' + run: | + sudo apt-get update && sudo apt-get install -y cmake libsndfile1 + + - name: Setup macOS + if: runner.os == 'macOS' + run: | + brew install libuv + + - name: Install dependencies + run: | + python --version + pip --version + python -m pip install -U pip + + # remove torch and ray from the dependencies so we can add them depending on the matrix args for the job. + cat requirements.txt | sed '/^torch[>=<\b]/d' | sed '/^torchtext/d' | sed '/^torchvision/d' | sed '/^torchaudio/d' > requirements-temp && mv requirements-temp requirements.txt + cat requirements_distributed.txt | sed '/^ray[\[]/d' + pip install torch==2.0.0 torchtext torchvision torchaudio + pip install ray==2.3.0 + pip install '.[test]' + pip list + shell: bash + + - name: LLM Tests + run: | + pytest -vs --durations 100 -m "llm" --junitxml pytest.xml tests + + combinatorial-tests: + name: Combinatorial Tests + runs-on: ubuntu-latest + + timeout-minutes: 60 + steps: + - uses: actions/checkout@v2 + - name: Set up Python 3.8 + uses: actions/setup-python@v2 + with: + python-version: 3.8 + + - name: Setup Linux + if: runner.os == 'linux' + run: | + sudo apt-get update && sudo apt-get install -y cmake libsndfile1 + + - name: Setup macOS + if: runner.os == 'macOS' + run: | + brew install libuv + + - name: Install dependencies + run: | + python --version + pip --version + python -m pip install -U pip + pip install '.[test]' + pip list + shell: bash + + - name: Testing combinatorial config generation code + run: | + pytest -vs --durations 100 -m "combinatorial" --junitxml pytest.xml tests/ludwig/config_sampling + + - name: Combinatorial Tests + run: | + pytest -rx --durations 100 -m "combinatorial" --junitxml pytest.xml tests/training_success + + test-minimal-install: + name: Test Minimal Install + runs-on: ubuntu-latest + + timeout-minutes: 15 + steps: + - uses: actions/checkout@v2 + - name: Set up Python 3.8 + uses: actions/setup-python@v2 + with: + python-version: 3.8 + + - name: Setup Linux + if: runner.os == 'linux' + run: | + sudo apt-get update && sudo apt-get install -y cmake libsndfile1 + + - name: Setup macOS + if: runner.os == 'macOS' + run: | + brew install libuv + + - name: Install dependencies + run: | + python --version + pip --version + python -m pip install -U pip + pip install torch==2.0.0 torchtext + pip install ray==2.3.0 + pip install '.' + pip list + shell: bash + - name: Check Install + run: | + ludwig check_install + shell: bash + + - name: Test Getting Started + run: | + cd examples/getting_started && sh ./run.sh + shell: bash + + # start-runner: + # name: Start self-hosted EC2 runner + # if: > + # always() && needs.pytest.result != 'failure' && ( + # github.event_name == 'schedule' && github.repository == 'ludwig-ai/ludwig' || + # github.event_name == 'push' && github.repository == 'ludwig-ai/ludwig' || + # github.event_name == 'pull_request' && github.event.pull_request.base.repo.full_name == 'ludwig-ai/ludwig' && !github.event.pull_request.head.repo.fork) + # needs: pytest + # runs-on: ubuntu-latest + # outputs: + # label: ${{ steps.start-ec2-runner.outputs.label }} + # ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }} + + # steps: + # - name: Configure AWS credentials + # uses: aws-actions/configure-aws-credentials@v1 + # with: + # aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + # aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + # aws-region: ${{ secrets.AWS_REGION }} + + # - name: Start EC2 runner + # id: start-ec2-runner + # uses: machulav/ec2-github-runner@v2.3.2 + # with: + # mode: start + # github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} + # ec2-image-id: ami-0759580dedc953d1f + # ec2-instance-type: g4dn.xlarge + # subnet-id: subnet-0983be43 + # security-group-id: sg-4cba0d08 + # aws-resource-tags: > + # [ + # {"Key": "Name", "Value": "ludwig-github-${{ github.head_ref || github.sha }}"}, + # {"Key": "GitHubRepository", "Value": "${{ github.repository }}"}, + # {"Key": "GitHubHeadRef", "Value": "${{ github.head_ref }}"}, + # {"Key": "GitHubSHA", "Value": "${{ github.sha }}"} + # ] + + # pytest-gpu: + # if: needs.start-runner.result != 'skipped' + # needs: start-runner # required to start the main job when the runner is ready + # runs-on: ${{ needs.start-runner.outputs.label }} # run the job on the newly created runners + # strategy: + # fail-fast: false + # matrix: + # python-version: [3.7] + # include: + # - python-version: 3.7 + # pytorch-version: 1.10.0 + # torchscript-version: 1.10.2 + # env: + # PYTORCH: ${{ matrix.pytorch-version }} + # NEUROPOD_BASE_DIR: "/usr/local/lib/neuropod" + # NEUROPOD_VERISON: "0.3.0-rc6" + # TORCHSCRIPT_VERSION: ${{ matrix.torchscript-version }} + + # name: py${{ matrix.python-version }}, torch-${{ matrix.pytorch-version }}, gpu + + # timeout-minutes: 70 + # steps: + # - uses: actions/checkout@v2 + # - name: Set up Python ${{ matrix.python-version }} + # uses: actions/setup-python@v2 + # with: + # python-version: ${{ matrix.python-version }} + + # - name: Setup Linux + # if: runner.os == 'linux' + # run: | + # sudo apt-get update && sudo apt-get install -y libsndfile1 cmake ccache build-essential g++-8 gcc-8 + # cmake --version + + # - name: Install CUDA drivers + # run: | + # wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004.pin + # sudo mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600 + # wget https://developer.download.nvidia.com/compute/cuda/11.5.1/local_installers/cuda-repo-ubuntu2004-11-5-local_11.5.1-495.29.05-1_amd64.deb + # sudo dpkg -i cuda-repo-ubuntu2004-11-5-local_11.5.1-495.29.05-1_amd64.deb + # sudo apt-key add /var/cuda-repo-ubuntu2004-11-5-local/7fa2af80.pub + # sudo apt-get update + # sudo apt-get -y install cuda + # shell: bash + + # - name: pip cache + # uses: actions/cache@v2 + # with: + # path: ~/.cache/pip + # key: ${{ runner.os }}-pip-py${{ matrix.python-version }}-torch${{ matrix.pytorch-version }}-${{ hashFiles('requirements*.txt') }} + # restore-keys: | + # ${{ runner.os }}-pip-py${{ matrix.python-version }}-torch${{ matrix.pytorch-version }}- + + # - name: Install dependencies + # env: + # HOROVOD_WITH_PYTORCH: 1 + # HOROVOD_WITHOUT_MPI: 1 + # HOROVOD_WITHOUT_TENSORFLOW: 1 + # HOROVOD_WITHOUT_MXNET: 1 + # run: | + # python --version + # pip --version + # python -m pip install -U pip + # if [ $PYTORCH == "nightly" ]; then + # cat requirements.txt | sed '/^torch[>=<]/d' > requirements-temp && mv requirements-temp requirements.txt + # pip install --pre torch torchvision -f https://download.pytorch.org/whl/torch_stable.html + # else + # pip install torch==${PYTORCH}+cu111 -f https://download.pytorch.org/whl/torch_stable.html + # fi + # # pip install --no-cache-dir git+https://github.com/horovod/horovod.git@master + # pip install dulwich==0.20.26 # workaround for `/usr/bin/ld: cannot find -lpython3.7m` + # pip install '.[test]' + # pip list + # shell: bash + + # - name: Install Neuropod backend + # run: | + # sudo mkdir -p "$NEUROPOD_BASE_DIR" + # curl -L https://github.com/uber/neuropod/releases/download/v${{ env.NEUROPOD_VERISON }}/libneuropod-cpu-linux-v${{ env.NEUROPOD_VERISON }}-torchscript-${{ env.TORCHSCRIPT_VERSION }}-backend.tar.gz | sudo tar -xz -C "$NEUROPOD_BASE_DIR" + # shell: bash + + # - name: Reinstall Horovod if necessary + # env: + # HOROVOD_WITH_PYTORCH: 1 + # HOROVOD_WITHOUT_MPI: 1 + # HOROVOD_WITHOUT_TENSORFLOW: 1 + # HOROVOD_WITHOUT_MXNET: 1 + # run: | + # HOROVOD_BUILT=$(python -c "import horovod.torch; horovod.torch.nccl_built(); print('SUCCESS')" || true) + # if [[ $HOROVOD_BUILT != "SUCCESS" ]]; then + # pip uninstall -y horovod + # pip install --no-cache-dir git+https://github.com/horovod/horovod.git@master + # fi + # horovodrun --check-build + # shell: bash + + # - name: Check CUDA is available + # run: | + # python -c "import torch; assert torch.cuda.is_available()" + + # - name: Tests + # run: | + # pytest -v --timeout 300 --durations 10 --junitxml pytest.xml tests + + # - name: Upload Unit Test Results + # if: always() + # uses: actions/upload-artifact@v2 + # with: + # name: Unit Test Results (Python ${{ matrix.python-version }} gpu + # path: pytest.xml event_file: name: "Event File" @@ -117,3 +563,29 @@ jobs: with: name: Event File path: ${{ github.event_path }} + + # stop-runner: + # name: Stop self-hosted EC2 runner + + # # required to stop the runner even if the error happened in the previous job + # if: always() && needs.start-runner.result != 'skipped' + # needs: + # - start-runner # required to get output from the start-runner job + # - pytest-gpu # required to wait when the main job is done + # runs-on: ubuntu-latest + + # steps: + # - name: Configure AWS credentials + # uses: aws-actions/configure-aws-credentials@v1 + # with: + # aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + # aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + # aws-region: ${{ secrets.AWS_REGION }} + + # - name: Stop EC2 runner + # uses: machulav/ec2-github-runner@v2.3.1 + # with: + # mode: stop + # github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} + # label: ${{ needs.start-runner.outputs.label }} + # ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }} diff --git a/tests/integration_tests/test_f_control.py b/tests/integration_tests/test_f_control.py deleted file mode 100644 index d8cf5e2deb4..00000000000 --- a/tests/integration_tests/test_f_control.py +++ /dev/null @@ -1,214 +0,0 @@ -import asyncio -import contextlib -import copy -import logging -import os -import platform -import random -import string -from typing import List, Union -from unittest import mock - -import numpy as np -import pandas as pd -import pytest -import torch -from PIL import Image -from transformers import AutoTokenizer - -import ludwig -from ludwig.api import LudwigModel -from ludwig.backend import initialize_backend -from ludwig.callbacks import Callback -from ludwig.constants import ( - BASE_MODEL, - BATCH_SIZE, - COLUMN, - DECODER, - EPOCHS, - FULL, - INPUT_FEATURES, - MODEL_ECD, - MODEL_LLM, - MODEL_TYPE, - NAME, - OUTPUT_FEATURES, - PREDICTIONS, - PREPROCESSING, - PROC_COLUMN, - PROMPT, - SPLIT, - TRAINER, - TYPE, -) -from ludwig.data.concatenate_datasets import concatenate_df -from ludwig.data.preprocessing import handle_features_with_prompt_config, preprocess_for_prediction -from ludwig.schema.llms.prompt import PromptConfig -from ludwig.schema.model_types.base import ModelConfig -from ludwig.utils.carton_utils import export_carton -from tests.integration_tests.utils import ( - assert_preprocessed_dataset_shape_and_dtype_for_feature, - audio_feature, - binary_feature, - category_feature, - generate_data, - generate_data_as_dataframe, - image_feature, - LocalTestBackend, - number_feature, - sequence_feature, - text_feature, -) - -NUM_EXAMPLES = 20 - -# TODO: ALEX -# pytestmark = pytest.mark.integration_tests_x -# TODO: ALEX - - -# TODO: ALEX -@pytest.mark.integration_tests_x -# TODO: ALEX -@pytest.mark.skipif(platform.system() == "Windows", reason="Carton is not supported on Windows") -def test_carton_torchscript(csv_filename, tmpdir): - data_csv_path = os.path.join(tmpdir, csv_filename) - - # Configure features to be tested: - bin_str_feature = binary_feature() - input_features = [ - bin_str_feature, - # binary_feature(), - number_feature(), - category_feature(encoder={"vocab_size": 3}), - # TODO: future support - # sequence_feature(vocab_size=3), - # text_feature(vocab_size=3), - # vector_feature(), - # image_feature(image_dest_folder), - # audio_feature(audio_dest_folder), - # timeseries_feature(), - # date_feature(), - # h3_feature(), - # set_feature(vocab_size=3), - # bag_feature(vocab_size=3), - ] - output_features = [ - bin_str_feature, - # binary_feature(), - number_feature(), - category_feature(decoder={"vocab_size": 3}, output_feature=True), - # TODO: future support - # sequence_feature(vocab_size=3), - # text_feature(vocab_size=3), - # set_feature(vocab_size=3), - # vector_feature() - ] - backend = LocalTestBackend() - config = { - "input_features": input_features, - "output_features": output_features, - TRAINER: {"epochs": 2, BATCH_SIZE: 128}, - } - - # Generate training data - training_data_csv_path = generate_data(input_features, output_features, data_csv_path) - - # Convert bool values to strings, e.g., {'Yes', 'No'} - df = pd.read_csv(training_data_csv_path) - false_value, true_value = "No", "Yes" - df[bin_str_feature[NAME]] = df[bin_str_feature[NAME]].map(lambda x: true_value if x else false_value) - df.to_csv(training_data_csv_path) - - # Train Ludwig (Pythonic) model: - ludwig_model = LudwigModel(config, backend=backend) - ludwig_model.train( - dataset=training_data_csv_path, - skip_save_training_description=True, - skip_save_training_statistics=True, - skip_save_model=True, - skip_save_progress=True, - skip_save_log=True, - skip_save_processed_input=True, - ) - - # Obtain predictions from Python model - preds_dict, _ = ludwig_model.predict(dataset=training_data_csv_path, return_type=dict) - - # Create graph inference model (Torchscript) from trained Ludwig model. - carton_path = os.path.join(tmpdir, "carton") - export_carton(ludwig_model, carton_path) - - import cartonml as carton - - # Load the carton model - # See https://pyo3.rs/v0.20.0/ecosystem/async-await#a-note-about-asynciorun for why we wrap it - # in another function - async def load(): - return await carton.load(carton_path) - - loop = asyncio.get_event_loop() - carton_model = loop.run_until_complete(load()) - - def to_input(s: pd.Series) -> Union[List[str], torch.Tensor]: - if s.dtype == "object": - return np.array(s.to_list()) - return s.to_numpy().astype(np.float32) - - df = pd.read_csv(training_data_csv_path) - inputs = {name: to_input(df[feature.column]) for name, feature in ludwig_model.model.input_features.items()} - - # See https://pyo3.rs/v0.20.0/ecosystem/async-await#a-note-about-asynciorun for why we wrap it - # in another function - async def infer(inputs): - return await carton_model.infer(inputs) - - outputs = loop.run_until_complete(infer(inputs)) - - # Compare results from Python trained model against Carton - assert len(preds_dict) == len(outputs) - for feature_name, feature_outputs_expected in preds_dict.items(): - assert feature_name in outputs - - output_values_expected = feature_outputs_expected[PREDICTIONS] - output_values = outputs[feature_name] - if output_values.dtype.type in {np.string_, np.str_}: - # Strings should match exactly - assert np.all(output_values == output_values_expected), f"feature: {feature_name}, output: predictions" - else: - assert np.allclose(output_values, output_values_expected), f"feature: {feature_name}, output: predictions" - - -# TODO: ALEX -# TODO: ALEX -@pytest.mark.integration_tests_x -# TODO: ALEX -@pytest.mark.parametrize("use_pretrained", [False, True], ids=["false", "true"]) -def test_vit_encoder_different_dimension_image(tmpdir, csv_filename, use_pretrained: bool): - input_features = [ - image_feature( - os.path.join(tmpdir, "generated_output"), - preprocessing={"in_memory": True, "height": 224, "width": 206, "num_channels": 3}, - encoder={TYPE: "_vit_legacy", "use_pretrained": use_pretrained}, - ) - ] - output_features = [category_feature(decoder={"vocab_size": 5}, reduce_input="sum")] - - data_csv = generate_data( - input_features, output_features, os.path.join(tmpdir, csv_filename), num_examples=NUM_EXAMPLES - ) - - config = { - INPUT_FEATURES: input_features, - OUTPUT_FEATURES: output_features, - TRAINER: {"train_steps": 1}, - } - - model = LudwigModel(config) - - # Failure happens post preprocessing but before training during the ECD model creation phase - # so make sure the model can be created properly and training can proceed - model.train(dataset=data_csv) - - -# TODO: ALEX