From f2ee3eedfbe0fbd986b85d07a1cb4ee1db0cae22 Mon Sep 17 00:00:00 2001
From: Bas Zalmstra <zalmstra.bas@gmail.com>
Date: Thu, 4 Apr 2024 17:21:52 +0200
Subject: [PATCH] Add Windows builds

Co-authored-by: Bas Zalmstra <zalmstra.bas@gmail.com>
---
 .github/workflows/conda-build.yml            | 118 +++++++++++
 .scripts/run_win_build.bat                   | 130 +++++++++++++
 conda-forge.yml                              |   4 +
 recipe/bld.bat                               | 195 ++++++++++++++++---
 recipe/build.sh                              |  15 +-
 recipe/build_pytorch.bat                     |  18 ++
 recipe/conda_build_config.yaml               |   6 +
 recipe/meta.yaml                             | 116 ++++++++---
 recipe/patches/0003-fix-FindLAPACK.patch     |  12 ++
 recipe/patches/0004-Fix-mkl-dependency.patch |  12 ++
 recipe/patches/0005-Fix-FindOpenBLAS.patch   |  36 ++++
 11 files changed, 604 insertions(+), 58 deletions(-)
 create mode 100644 .github/workflows/conda-build.yml
 create mode 100755 .scripts/run_win_build.bat
 create mode 100644 recipe/patches/0003-fix-FindLAPACK.patch
 create mode 100644 recipe/patches/0004-Fix-mkl-dependency.patch
 create mode 100644 recipe/patches/0005-Fix-FindOpenBLAS.patch

diff --git a/.github/workflows/conda-build.yml b/.github/workflows/conda-build.yml
new file mode 100644
index 000000000..fc235682d
--- /dev/null
+++ b/.github/workflows/conda-build.yml
@@ -0,0 +1,118 @@
+# This file was generated automatically from conda-smithy. To update this configuration,
+# update the conda-forge.yml and/or the recipe/meta.yaml.
+# -*- mode: yaml -*-
+
+name: Build conda package
+on:
+  push:
+
+  pull_request:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}
+  cancel-in-progress: true
+
+jobs:
+  build:
+    name: ${{ matrix.CONFIG }}
+    runs-on: ${{ matrix.runs_on }}
+    timeout-minutes: 720
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - CONFIG: win_64_blas_implgenericcuda_compilerNonecuda_compiler_versionNone
+            UPLOAD_PACKAGES: True
+            os: windows
+            runs_on: ['cirun-azure-windows-2xlarge--${{ github.run_id }}-win_64_blas_implgenericcuda_compile_h4a9b946200', 'windows', 'x64', 'self-hosted']
+          - CONFIG: win_64_blas_implgenericcuda_compilercuda-nvcccuda_compiler_version12.0
+            UPLOAD_PACKAGES: True
+            os: windows
+            runs_on: ['cirun-azure-windows-2xlarge--${{ github.run_id }}-win_64_blas_implgenericcuda_compile_hd23f0db2d7', 'windows', 'x64', 'self-hosted']
+          - CONFIG: win_64_blas_implmklcuda_compilerNonecuda_compiler_versionNone
+            UPLOAD_PACKAGES: True
+            os: windows
+            runs_on: ['cirun-azure-windows-2xlarge--${{ github.run_id }}-win_64_blas_implmklcuda_compilerNon_ha94fa4b9ab', 'windows', 'x64', 'self-hosted']
+          - CONFIG: win_64_blas_implmklcuda_compilercuda-nvcccuda_compiler_version12.0
+            UPLOAD_PACKAGES: True
+            os: windows
+            runs_on: ['cirun-azure-windows-2xlarge--${{ github.run_id }}-win_64_blas_implmklcuda_compilercud_h0e61f86427', 'windows', 'x64', 'self-hosted']
+    steps:
+
+    - name: Checkout code
+      uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4.1.4
+
+    - name: Build on Linux
+      if: matrix.os == 'ubuntu'
+      env:
+        CONFIG: ${{ matrix.CONFIG }}
+        UPLOAD_PACKAGES: ${{ matrix.UPLOAD_PACKAGES }}
+        DOCKER_IMAGE: ${{ matrix.DOCKER_IMAGE }}
+        CI: github_actions
+        CONDA_FORGE_DOCKER_RUN_ARGS: "${{ matrix.CONDA_FORGE_DOCKER_RUN_ARGS }}"
+        BINSTAR_TOKEN: ${{ secrets.BINSTAR_TOKEN }}
+        FEEDSTOCK_TOKEN: ${{ secrets.FEEDSTOCK_TOKEN }}
+        STAGING_BINSTAR_TOKEN: ${{ secrets.STAGING_BINSTAR_TOKEN }}
+      shell: bash
+      run: |
+        echo "::group::Configure binfmt_misc"
+        docker run --rm --privileged multiarch/qemu-user-static:register --reset --credential yes
+        export flow_run_id="github_$GITHUB_RUN_ID"
+        export remote_url="https://github.com/$GITHUB_REPOSITORY"
+        export sha="$GITHUB_SHA"
+        export FEEDSTOCK_NAME="$(basename $GITHUB_REPOSITORY)"
+        export GIT_BRANCH="$(basename $GITHUB_REF)"
+        if [[ "${GITHUB_EVENT_NAME}" == "pull_request" ]]; then
+          export IS_PR_BUILD="True"
+        else
+          export IS_PR_BUILD="False"
+        fi
+        echo "::endgroup::"
+        ./.scripts/run_docker_build.sh
+
+    - name: Build on macOS
+      if: matrix.os == 'macos'
+      env:
+        CONFIG: ${{ matrix.CONFIG }}
+        UPLOAD_PACKAGES: ${{ matrix.UPLOAD_PACKAGES }}
+        CI: github_actions
+        BINSTAR_TOKEN: ${{ secrets.BINSTAR_TOKEN }}
+        FEEDSTOCK_TOKEN: ${{ secrets.FEEDSTOCK_TOKEN }}
+        STAGING_BINSTAR_TOKEN: ${{ secrets.STAGING_BINSTAR_TOKEN }}
+      shell: bash
+      run: |
+        export flow_run_id="github_$GITHUB_RUN_ID"
+        export remote_url="https://github.com/$GITHUB_REPOSITORY"
+        export sha="$GITHUB_SHA"
+        export FEEDSTOCK_NAME="$(basename $GITHUB_REPOSITORY)"
+        export GIT_BRANCH="$(basename $GITHUB_REF)"
+        if [[ "${GITHUB_EVENT_NAME}" == "pull_request" ]]; then
+          export IS_PR_BUILD="True"
+        else
+          export IS_PR_BUILD="False"
+        fi
+        ./.scripts/run_osx_build.sh
+
+    - name: Install Miniconda for windows
+      uses: conda-incubator/setup-miniconda@a4260408e20b96e80095f42ff7f1a15b27dd94ca # v3.0.4
+      with:
+        miniforge-version: latest
+        miniforge-variant: Mambaforge
+      if: matrix.os == 'windows'
+
+    - name: Build on windows
+      shell: cmd
+      run: |
+        set "flow_run_id=github_%GITHUB_RUN_ID%"
+        set "remote_url=https://github.com/%GITHUB_REPOSITORY%"
+        set "sha=%GITHUB_SHA%"
+        call ".scripts\run_win_build.bat"
+      env:
+        PYTHONUNBUFFERED: 1
+        CONFIG: ${{ matrix.CONFIG }}
+        CI: github_actions
+        UPLOAD_PACKAGES: ${{ matrix.UPLOAD_PACKAGES }}
+        BINSTAR_TOKEN: ${{ secrets.BINSTAR_TOKEN }}
+        FEEDSTOCK_TOKEN: ${{ secrets.FEEDSTOCK_TOKEN }}
+        STAGING_BINSTAR_TOKEN: ${{ secrets.STAGING_BINSTAR_TOKEN }}
+      if: matrix.os == 'windows'
\ No newline at end of file
diff --git a/.scripts/run_win_build.bat b/.scripts/run_win_build.bat
new file mode 100755
index 000000000..685844e28
--- /dev/null
+++ b/.scripts/run_win_build.bat
@@ -0,0 +1,130 @@
+:: PLEASE NOTE: This script has been automatically generated by conda-smithy. Any changes here
+:: will be lost next time ``conda smithy rerender`` is run. If you would like to make permanent
+:: changes to this script, consider a proposal to conda-smithy so that other feedstocks can also
+:: benefit from the improvement.
+
+:: Note: we assume a Miniforge installation is available
+
+:: INPUTS (required environment variables)
+:: CONFIG: name of the .ci_support/*.yaml file for this job
+:: CI: azure, github_actions, or unset
+:: UPLOAD_PACKAGES: true or false
+:: UPLOAD_ON_BRANCH: true or false
+
+setlocal enableextensions enabledelayedexpansion
+
+call :start_group "Configuring conda"
+
+:: Activate the base conda environment
+call activate base
+:: Configure the solver
+set "CONDA_SOLVER=libmamba"
+if !errorlevel! neq 0 exit /b !errorlevel!
+set "CONDA_LIBMAMBA_SOLVER_NO_CHANNELS_FROM_INSTALLED=1"
+
+:: Provision the necessary dependencies to build the recipe later
+echo Installing dependencies
+mamba.exe install "python=3.10" pip mamba conda-build py-lief=0.12.3 conda-forge-ci-setup=3 -c conda-forge --strict-channel-priority --yes
+if !errorlevel! neq 0 exit /b !errorlevel!
+
+:: Set basic configuration
+echo Setting up configuration
+setup_conda_rc .\ ".\recipe" .\.ci_support\%CONFIG%.yaml
+if !errorlevel! neq 0 exit /b !errorlevel!
+echo Running build setup
+CALL run_conda_forge_build_setup
+
+
+if !errorlevel! neq 0 exit /b !errorlevel!
+
+if EXIST LICENSE.txt (
+    echo Copying feedstock license
+    copy LICENSE.txt "recipe\\recipe-scripts-license.txt"
+)
+if NOT [%HOST_PLATFORM%] == [%BUILD_PLATFORM%] (
+    if [%CROSSCOMPILING_EMULATOR%] == [] (
+        set "EXTRA_CB_OPTIONS=%EXTRA_CB_OPTIONS% --no-test"
+    )
+)
+
+if NOT [%flow_run_id%] == [] (
+    set "EXTRA_CB_OPTIONS=%EXTRA_CB_OPTIONS% --extra-meta flow_run_id=%flow_run_id% remote_url=%remote_url% sha=%sha%"
+)
+
+call :end_group
+
+:: Build the recipe
+echo Building recipe
+conda-build.exe "recipe" -m .ci_support\%CONFIG%.yaml --suppress-variables %EXTRA_CB_OPTIONS%
+if !errorlevel! neq 0 exit /b !errorlevel!
+
+call :start_group "Inspecting artifacts"
+:: inspect_artifacts was only added in conda-forge-ci-setup 4.6.0
+WHERE inspect_artifacts >nul 2>nul && inspect_artifacts || echo "inspect_artifacts needs conda-forge-ci-setup >=4.6.0"
+call :end_group
+
+:: Prepare some environment variables for the upload step
+if /i "%CI%" == "github_actions" (
+    set "FEEDSTOCK_NAME=%GITHUB_REPOSITORY:*/=%"
+    set "GIT_BRANCH=%GITHUB_REF:refs/heads/=%"
+    if /i "%GITHUB_EVENT_NAME%" == "pull_request" (
+        set "IS_PR_BUILD=True"
+    ) else (
+        set "IS_PR_BUILD=False"
+    )
+    set "TEMP=%RUNNER_TEMP%"
+)
+if /i "%CI%" == "azure" (
+    set "FEEDSTOCK_NAME=%BUILD_REPOSITORY_NAME:*/=%"
+    set "GIT_BRANCH=%BUILD_SOURCEBRANCHNAME%"
+    if /i "%BUILD_REASON%" == "PullRequest" (
+        set "IS_PR_BUILD=True"
+    ) else (
+        set "IS_PR_BUILD=False"
+    )
+    set "TEMP=%UPLOAD_TEMP%"
+)
+
+:: Validate
+call :start_group "Validating outputs"
+validate_recipe_outputs "%FEEDSTOCK_NAME%"
+if !errorlevel! neq 0 exit /b !errorlevel!
+call :end_group
+
+if /i "%UPLOAD_PACKAGES%" == "true" (
+    if /i "%IS_PR_BUILD%" == "false" (
+        call :start_group "Uploading packages"
+        if not exist "%TEMP%\" md "%TEMP%"
+        set "TMP=%TEMP%"
+        upload_package --validate --feedstock-name="%FEEDSTOCK_NAME%" .\ ".\recipe" .ci_support\%CONFIG%.yaml
+        if !errorlevel! neq 0 exit /b !errorlevel!
+        call :end_group
+    )
+)
+
+exit
+
+:: Logging subroutines
+
+:start_group
+if /i "%CI%" == "github_actions" (
+    echo ::group::%~1
+    exit /b
+)
+if /i "%CI%" == "azure" (
+    echo ##[group]%~1
+    exit /b
+)
+echo %~1
+exit /b
+
+:end_group
+if /i "%CI%" == "github_actions" (
+    echo ::endgroup::
+    exit /b
+)
+if /i "%CI%" == "azure" (
+    echo ##[endgroup]
+    exit /b
+)
+exit /b
\ No newline at end of file
diff --git a/conda-forge.yml b/conda-forge.yml
index d21ee21a2..c259c5f5d 100644
--- a/conda-forge.yml
+++ b/conda-forge.yml
@@ -2,6 +2,9 @@ azure:
   free_disk_space: true
   settings_linux:
     timeoutInMinutes: 1
+  settings_win:
+    variables:
+      CONDA_BLD_PATH: C:\\bld\\
 build_platform:
   linux_aarch64: linux_64
   osx_arm64: osx_64
@@ -21,5 +24,6 @@ os_version:
   linux_64: cos7
 provider:
   linux_64: azure
+  win_64: github_actions
   linux_aarch64: azure
 test: native_and_emulated
diff --git a/recipe/bld.bat b/recipe/bld.bat
index c1c2b2e61..bc40b6b7f 100644
--- a/recipe/bld.bat
+++ b/recipe/bld.bat
@@ -1,50 +1,195 @@
 @echo On
+setlocal enabledelayedexpansion
+
+REM remove pyproject.toml to avoid installing deps from pip
+if EXIST pyproject.toml DEL pyproject.toml
 
 set TH_BINARY_BUILD=1
 set PYTORCH_BUILD_VERSION=%PKG_VERSION%
 set PYTORCH_BUILD_NUMBER=%PKG_BUILDNUM%
 
-if "%pytorch_variant%" == "gpu" (
-    set build_with_cuda=1
-    set desired_cuda=%CUDA_VERSION:~0,-1%.%CUDA_VERSION:~-1,1%
+REM I don't know where this folder comes from, but it's interfering with the build in osx-64
+if EXIST %PREFIX%\git RD /S /Q %PREFIX%\git
+
+@REM Setup BLAS
+if "%blas_impl%" == "generic" (
+    REM Fake openblas
+    SET BLAS=OpenBLAS
+    SET OpenBLAS_HOME=%LIBRARY_PREFIX%
 ) else (
-    set build_with_cuda=
-    set USE_CUDA=0
+    SET BLAS=MKL
 )
 
-if "%build_with_cuda%" == "" goto cuda_flags_end
+@REM TODO(baszalmstra): Figure out if we need these flags
+SET "USE_NUMA=0"
+SET "USE_ITT=0"
 
-set CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%desired_cuda%
-set CUDA_BIN_PATH=%CUDA_PATH%\bin
-set TORCH_CUDA_ARCH_LIST=3.5;5.0+PTX
-if "%desired_cuda%" == "9.0" set TORCH_CUDA_ARCH_LIST=%TORCH_CUDA_ARCH_LIST%;6.0;7.0
-if "%desired_cuda%" == "9.2" set TORCH_CUDA_ARCH_LIST=%TORCH_CUDA_ARCH_LIST%;6.0;6.1;7.0
-if "%desired_cuda%" == "10.0" set TORCH_CUDA_ARCH_LIST=%TORCH_CUDA_ARCH_LIST%;6.0;6.1;7.0;7.5
-set TORCH_NVCC_FLAGS=-Xfatbin -compress-all
+@REM KINETO seems to require CUPTI and will look quite hard for it.
+@REM CUPTI seems to cause trouble when users install a version of
+@REM cudatoolkit different than the one specified at compile time.
+@REM https://github.com/conda-forge/pytorch-cpu-feedstock/issues/135
+set "USE_KINETO=OFF"
 
-:cuda_flags_end
+if "%PKG_NAME%" == "pytorch" (
+  set "PIP_ACTION=install"
+  @REM libtorch is built once, against python 3.12. When that version changes,
+  @REM update the patterns below. The dot is escaped so that only the literal
+  @REM version string 3.12 is rewritten, not any text such as 3x12.
+  sed "s/3\.12/%PY_VER%/g" build/CMakeCache.txt.orig > build/CMakeCache.txt
+  sed -i "s/312/%CONDA_PY%/g" build/CMakeCache.txt
 
-set DISTUTILS_USE_SDK=1
+  @REM We use a fan-out build to avoid the long rebuild of libtorch.
+  @REM However, the location of the numpy headers changes between python 3.8
+  @REM and 3.9+ since numpy 2.0 only exists for 3.9+
+  if "%PY_VER%" == "3.8" (
+    sed -i.bak "s#numpy\\\\_core\\\\include#numpy\\\\core\\\\include#g" build/CMakeCache.txt
+  ) else (
+    sed -i.bak "s#numpy\\\\core\\\\include#numpy\\\\_core\\\\include#g" build/CMakeCache.txt
+  )
 
-set CMAKE_INCLUDE_PATH=%LIBRARY_PREFIX%\include
-set LIB=%LIBRARY_PREFIX%\lib;%LIB%
+) else (
+  @REM For the main script we just build a wheel so that the C++/CUDA
+  @REM parts are built. Then they are reused in each python version.
+  set "PIP_ACTION=wheel"
+)
+
+if not "%cuda_compiler_version%" == "None" (
+    set USE_CUDA=1
+
+    REM set CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%desired_cuda%
+    REM set CUDA_BIN_PATH=%CUDA_PATH%\bin
 
-IF "%build_with_cuda%" == "" goto cuda_end
+    REM Select the GPU architectures and toolkit root per supported CUDA version.
+    if "%cuda_compiler_version%" == "11.8" (
+        set TORCH_CUDA_ARCH_LIST=3.5;5.0;6.0;6.1;7.0;7.5;8.0;8.6;8.9+PTX
+        set CUDA_TOOLKIT_ROOT_DIR=%CUDA_HOME%
+    ) else if "%cuda_compiler_version%" == "12.0" (
+        set TORCH_CUDA_ARCH_LIST=5.0;6.0;6.1;7.0;7.5;8.0;8.6;8.9;9.0+PTX
+        set CUDA_TOOLKIT_ROOT_DIR=%PREFIX%
+    ) else (
+        echo "unsupported cuda version. edit bld.bat"
+        exit /b 1
+    )
 
-set MAGMA_HOME=%LIBRARY_PREFIX%
+    set TORCH_NVCC_FLAGS=-Xfatbin -compress-all
 
-set "PATH=%CUDA_BIN_PATH%;%PATH%"
+    REM set USE_SYSTEM_NCCL=1
+    set USE_STATIC_NCCL=0
+    set USE_STATIC_CUDNN=0
+    set MAGMA_HOME=%PREFIX%
 
-set CUDNN_INCLUDE_DIR=%LIBRARY_PREFIX%\include
+    REM NCCL is not available on windows
+    set USE_NCCL=0
+
+    set MAGMA_HOME=%LIBRARY_PREFIX%
+
+    set "PATH=%CUDA_BIN_PATH%;%PATH%"
+
+    set CUDNN_INCLUDE_DIR=%LIBRARY_PREFIX%\include
+
+) else (
+    set USE_CUDA=0
+    
+    @REM MKLDNN is an Apache-2.0 licensed library for DNNs and is used
+    @REM for CPU builds. Not to be confused with MKL.
+    set "USE_MKLDNN=1"
+)
+
+set DISTUTILS_USE_SDK=1
 
-:cuda_end
+set CMAKE_INCLUDE_PATH=%LIBRARY_PREFIX%\include
+set LIB=%LIBRARY_PREFIX%\lib;%LIB%
 
+@REM CMake configuration
 set CMAKE_GENERATOR=Ninja
+set "CMAKE_GENERATOR_TOOLSET="
 set "CMAKE_GENERATOR_PLATFORM="
 set "CMAKE_PREFIX_PATH=%LIBRARY_PREFIX%"
+set "CMAKE_INCLUDE_PATH=%LIBRARY_INC%"
+set "CMAKE_LIBRARY_PATH=%LIBRARY_LIB%"
+set "CMAKE_BUILD_TYPE=Release"
+
+set "INSTALL_TEST=0"
+set "BUILD_TEST=0"
+
 set "libuv_ROOT=%LIBRARY_PREFIX%"
-set "USE_SYSTEM_SLEEF=OFF"
-set "BUILD_CUSTOM_PROTOBUF=OFF"
+set "USE_SYSTEM_SLEEF=ON"
+
+@REM uncomment to debug cmake build
+@REM set "CMAKE_VERBOSE_MAKEFILE=1"
+
+@REM TODO(baszalmstra): There are link errors because of conflicting symbols with caffe2_protos.lib
+set "BUILD_CUSTOM_PROTOBUF=ON"
+
+@REM TODO(baszalmstra): There are linker errors because of mixing Intel OpenMP (iomp) and Microsoft OpenMP (vcomp)
+set "USE_OPENMP=0"
+
+@REM The activation script for cuda-nvcc doesn't add the CUDA_CFLAGS on Windows.
+@REM Therefore we do this manually here. See:
+@REM https://github.com/conda-forge/cuda-nvcc-feedstock/issues/47
+echo "CUDA_CFLAGS=%CUDA_CFLAGS%"
+set "CUDA_CFLAGS=-I%PREFIX%/Library/include -I%BUILD_PREFIX%/Library/include"
+set "CFLAGS=%CFLAGS% %CUDA_CFLAGS%"
+set "CPPFLAGS=%CPPFLAGS% %CUDA_CFLAGS%"
+set "CXXFLAGS=%CXXFLAGS% %CUDA_CFLAGS%"
+echo "CUDA_CFLAGS=%CUDA_CFLAGS%"
+echo "CXXFLAGS=%CXXFLAGS%"
+
+@REM Configure sccache
+set "CMAKE_C_COMPILER_LAUNCHER=sccache"
+set "CMAKE_CXX_COMPILER_LAUNCHER=sccache"
+set "CMAKE_CUDA_COMPILER_LAUNCHER=sccache"
+
+sccache --stop-server
+sccache --start-server
+sccache --zero-stats
 
-%PYTHON% -m pip install . --no-deps -vv
+@REM Clear the build from any remaining artifacts. We use sccache to avoid recompiling similar code.
+cmake --build build --target clean
+
+%PYTHON% -m pip %PIP_ACTION% . --no-deps -vvv --no-clean
 if errorlevel 1 exit /b 1
+
+@REM Here we split the build into two parts.
+@REM 
+@REM Both the packages libtorch and pytorch use this same build script.
+@REM - The output of the libtorch package should just contain the binaries that are 
+@REM   not related to Python.
+@REM - The output of the pytorch package contains everything except for the 
+@REM   non-python specific binaries.
+@REM
+@REM This ensures that a user can quickly switch between python versions without the
+@REM need to redownload all the large CUDA binaries.
+
+if "%PKG_NAME%" == "libtorch" (
+    @REM Extract the compiled wheel into a temporary directory
+    if not exist "%SRC_DIR%/dist" mkdir %SRC_DIR%/dist
+    pushd %SRC_DIR%/dist
+    for %%f in (../torch-*.whl) do (
+        wheel unpack %%f
+    )
+
+    @REM Navigate into the unpacked wheel
+    pushd torch-*
+
+    @REM Move the binaries into the packages site-package directory
+    robocopy /NP /NFL /NDL /NJH /E torch\bin %SP_DIR%\torch\bin\
+    robocopy /NP /NFL /NDL /NJH /E torch\lib %SP_DIR%\torch\lib\
+    robocopy /NP /NFL /NDL /NJH /E torch\share %SP_DIR%\torch\share\
+    for %%f in (ATen caffe2 torch c10) do (
+        robocopy /NP /NFL /NDL /NJH /E torch\include\%%f %SP_DIR%\torch\include\%%f\
+    )
+
+    @REM Remove the Python-specific binary; it is placed in the site-packages
+    @REM directory by the Python-version-specific pytorch package instead.
+    del %SP_DIR%\torch\lib\torch_python.*
+    
+    popd
+    popd
+
+    @REM Keep the original backed up to sed later
+    copy build\CMakeCache.txt build\CMakeCache.txt.orig
+)
+
+@REM Show the sccache stats.
+sccache --show-stats
diff --git a/recipe/build.sh b/recipe/build.sh
index f36cd2bbd..f0702772a 100644
--- a/recipe/build.sh
+++ b/recipe/build.sh
@@ -100,7 +100,7 @@ fi
 if [[ "$blas_impl" == "generic" ]]; then
     # Fake openblas
     export BLAS=OpenBLAS
-    sed -i.bak "s#FIND_LIBRARY.*#set(OpenBLAS_LIB ${PREFIX}/lib/liblapack${SHLIB_EXT} ${PREFIX}/lib/libcblas${SHLIB_EXT} ${PREFIX}/lib/libblas${SHLIB_EXT})#g" cmake/Modules/FindOpenBLAS.cmake
+    export OpenBLAS_HOME="${PREFIX}"
 else
     export BLAS=MKL
 fi
@@ -115,6 +115,8 @@ else
   # For the main script we just build a wheel for so that the C++/CUDA
   # parts are built. Then they are reused in each python version.
   PIP_ACTION=wheel
+
+  export BUILD_PYTHON=OFF
 fi
 
 # MacOS build is simple, and will not be for CUDA
@@ -208,12 +210,23 @@ else
     export CMAKE_TOOLCHAIN_FILE="${RECIPE_DIR}/cross-linux.cmake"
 fi
 
+# Configure sccache
+export CMAKE_C_COMPILER_LAUNCHER=sccache
+export CMAKE_CXX_COMPILER_LAUNCHER=sccache
+export CMAKE_CUDA_COMPILER_LAUNCHER=sccache
+# Restart the server with fresh stats; stopping may fail when none is running, so ignore that.
+sccache --stop-server || true
+sccache --start-server
+sccache --zero-stats
+
+# Execute the build
 echo '${CXX}'=${CXX}
 echo '${PREFIX}'=${PREFIX}
 $PREFIX/bin/python -m pip $PIP_ACTION . --no-deps -vvv --no-clean \
     | sed "s,${CXX},\$\{CXX\},g" \
     | sed "s,${PREFIX},\$\{PREFIX\},g"
 
+# Create split packages
 if [[ "$PKG_NAME" == "libtorch" ]]; then
   mkdir -p $SRC_DIR/dist
   pushd $SRC_DIR/dist
diff --git a/recipe/build_pytorch.bat b/recipe/build_pytorch.bat
index 88ba1301b..9dcbb8276 100644
--- a/recipe/build_pytorch.bat
+++ b/recipe/build_pytorch.bat
@@ -1 +1,19 @@
+@echo On
+setlocal enabledelayedexpansion
+
 call %RECIPE_DIR%\bld.bat
+if errorlevel 1 exit /b 1
+@REM Remove the files that are already shipped by the libtorch package.
+rmdir /s /q "%SP_DIR%\torch\bin"
+rmdir /s /q "%SP_DIR%\torch\share"
+for %%f in (ATen caffe2 torch c10) do (
+    rmdir /s /q "%SP_DIR%\torch\include\%%f"
+)
+
+@REM Delete all files from the lib directory that do not start with torch_python
+for %%f in ("%SP_DIR%\torch\lib\*") do (
+    set "FILENAME=%%~nf"
+    if "!FILENAME:~0,12!" neq "torch_python" (
+        del "%%~f"
+    )
+)
diff --git a/recipe/conda_build_config.yaml b/recipe/conda_build_config.yaml
index bbdadf2c0..35268a366 100644
--- a/recipe/conda_build_config.yaml
+++ b/recipe/conda_build_config.yaml
@@ -99,7 +99,13 @@ blas_impl:
 # https://github.com/conda-forge/.cirun
 github_actions_labels:          # [linux]
 - cirun-openstack-gpu-2xlarge   # [linux]
+- cirun-azure-windows-2xlarge   # [win]
 
 megabuild:
 - true      # [linux]
 - false     # [osx]
+- true      # [win]
+
+# zip_keys:
+#   - - github_actions_labels
+#     - blas_impl
diff --git a/recipe/meta.yaml b/recipe/meta.yaml
index 4b2b655db..bb1983c63 100644
--- a/recipe/meta.yaml
+++ b/recipe/meta.yaml
@@ -1,5 +1,5 @@
 {% set version = "2.5.1" %}
-{% set build = 3 %}
+{% set build = 4 %}
 
 {% if cuda_compiler_version != "None" %}
 {% set build = build + 200 %}
@@ -37,6 +37,17 @@ source:
     # https://github.com/pytorch/pytorch/pull/137331
     - patches/137331.patch
 
+    # Remove when https://github.com/pytorch/pytorch/pull/126165 is merged
+    - patches/0003-fix-FindLAPACK.patch         # [win and blas_impl == "generic"]
+    # TODO(baszalmstra): Remove once https://github.com/pytorch/pytorch/issues/126279 is resolved
+    # Recent versions of pytorch have a python dependency:
+    # 'mkl>=2021.1.1,<=2021.4.0; platform_system == "Windows"'
+    # However, we might be building with a completely different version of MKL and we might not 
+    # even use MKL in the first place. This patch simply removes the python dependency until a
+    # better solution is found or the problem is fixed upstream.
+    - patches/0004-Fix-mkl-dependency.patch     # [win]
+    - patches/0005-Fix-FindOpenBLAS.patch       # [blas_impl == "generic"]
+
 build:
   number: {{ build }}
   string: cuda{{ cuda_compiler_version | replace('.', '') }}_h{{ PKG_HASH }}_{{ PKG_BUILDNUM }}  # [cuda_compiler_version != "None"]
@@ -51,17 +62,20 @@ build:
   ignore_run_exports:
     - python *                               # [megabuild]
     - numpy *                                # [megabuild]
-  skip: true  # [win]
+  
+  # TODO(baszalmstra): Remove this before merging
+  skip: true  # [not win or not (cuda_compiler_version == "12.0" or cuda_compiler == "None")]
+
   skip: true  # [cuda_compiler_version != "None" and linux64 and blas_impl != "mkl"]
   # 2024/04 This build seems to fail due to disk space on the CIs
   # We can potentially re-enable it at a later time
-  skip: true  # [cuda_compiler_version == "11.8" and aarch64]
+  # skip: true  # [cuda_compiler_version == "11.8" and aarch64]
 
 requirements:
   # Keep this list synchronized (except for python*, numpy*) in outputs
   # We use python to build libtorch as well because it is easier
   build:
-    # When you change 3.12 here, change it in build.sh as well
+    # When you change 3.12 here, change it in build.sh/bld.bat as well
     - python 3.12                            # [megabuild and build_platform != target_platform]
     - python                                 # [not megabuild and build_platform != target_platform]
     - cross-python_{{ target_platform }}     # [build_platform != target_platform]
@@ -85,29 +99,29 @@ requirements:
     - libcusolver-dev                        # [build_platform != target_platform]
     - libcusparse-dev                        # [build_platform != target_platform]
     {% endif %}
-    # Dec 2020: it seems that git is broken on windows, so we use m2-git
-    - m2-patch  # [win]
-    - m2-git    # [win]
     - patch     # [not win]
-    - git       # [not win]
-    - libgomp        # [linux]
+    - m2-patch  # [win]
+    - m2-sed    # [win]
+    - libgomp   # [linux]
     - llvm-openmp    # [osx]
+    - libuv     # [win]
     - cmake
     - ninja
     # Keep libprotobuf here so that a compatibile version
     # of protobuf is installed between build and host
-    - libprotobuf
-    - protobuf
+    - libprotobuf  # [not win]
+    - protobuf     # [not win]
     - make      # [linux]
+    - sccache
   host:
     # GPU requirements
     - cudnn                           # [cuda_compiler_version != "None"]
-    - nccl                            # [cuda_compiler_version != "None"]
+    - nccl                            # [cuda_compiler_version != "None" and linux]
     - magma                           # [cuda_compiler_version != "None"]
     - cuda-version {{ cuda_compiler_version }}  # [cuda_compiler_version != "None"]
     - nvtx-c                          # [cuda_compiler_version != "None"]
     {% if cuda_major >= 12 %}
-    - cuda-driver-dev
+    - cuda-driver-dev                 # [linux]
     - cuda-cudart-dev
     - cuda-nvrtc-dev
     - cuda-nvtx-dev
@@ -132,12 +146,12 @@ requirements:
     - six
     - mkl-devel {{ mkl }}   # [blas_impl == "mkl"]
     - libcblas * *_mkl      # [blas_impl == "mkl"]
+    - libblas               # [blas_impl != "mkl"]
     - libcblas              # [blas_impl != "mkl"]
     - liblapack             # [blas_impl != "mkl"]
     - libgomp   # [linux]
-    - llvm-openmp    # [osx]
-    - libabseil
-    - libprotobuf
+    - llvm-openmp  # [osx]
+    - libprotobuf  # [not win]
     - sleef
     - typing
     - libuv
@@ -164,12 +178,33 @@ requirements:
 test:
   commands:
     # libraries
-    {% for each_lib in [ 'libc10', 'libshm', 'libtorch', 'libtorch_cpu', 'libtorch_global_deps'] %}
-    - test -f $PREFIX/lib/{{ each_lib }}.so     # [linux]
-    - test -f $PREFIX/lib/{{ each_lib }}.dylib  # [osx]
+    {% for each_lib in [ 'c10', 'shm', 'torch', 'torch_cpu', 'torch_global_deps'] %}
+    - test -f $PREFIX/lib/lib{{ each_lib }}.so                    # [linux]
+    - test -f $PREFIX/lib/lib{{ each_lib }}.dylib                 # [osx]
+    - if not exist %SP_DIR%\torch\lib\{{ each_lib }}.dll exit 1   # [win]
+    {% endfor %}
+    
+    # Windows specific .LIB files
+    {% for each_lib in [ 'c10', 'shm', 'torch', 'torch_cpu'] %}
+    - if not exist %SP_DIR%\torch\lib\{{ each_lib }}.lib exit 1   # [win]
+    {% endfor %}
+
+    # Windows specific libraries
+    {% for each_lib in [ 'asmjit', 'fbgemm'] %}
+    - if not exist %SP_DIR%\torch\lib\{{ each_lib }}.dll exit 1   # [win]
+    - if not exist %SP_DIR%\torch\lib\{{ each_lib }}.lib exit 1   # [win]
     {% endfor %}
-    {% for each_lib in ['libc10_cuda', 'libcaffe2_nvrtc', 'libtorch_cuda', 'libtorch_cuda_linalg'] %}
-    - test -f $PREFIX/lib/{{ each_lib }}.so     # [linux and cuda_compiler_version != "None"]
+    
+    # Cuda only libraries
+    {% for each_lib in ['c10_cuda', 'caffe2_nvrtc', 'torch_cuda'] %}
+    - test -f $PREFIX/lib/lib{{ each_lib }}.so                    # [linux and cuda_compiler_version != "None"]
+    - if not exist %SP_DIR%\torch\lib\{{ each_lib }}.dll exit 1   # [win and cuda_compiler_version != "None"]
+    - if not exist %SP_DIR%\torch\lib\{{ each_lib }}.lib exit 1   # [win and cuda_compiler_version != "None"]
+    {% endfor %}
+
+    # Linux specific cuda libraries
+    {% for each_lib in [ 'torch_cuda_linalg'] %}
+    - test -f $PREFIX/lib/lib{{ each_lib }}.so                    # [linux and cuda_compiler_version != "None"]
     {% endfor %}
 
 outputs:
@@ -182,10 +217,8 @@ outputs:
       run_exports:
         - {{ pin_subpackage('pytorch', max_pin='x.x') }}
         - {{ pin_subpackage('libtorch', max_pin='x.x') }}
-      skip: true  # [win]
       skip: true  # [cuda_compiler_version != "None" and linux64 and blas_impl != "mkl"]
-
-    script: build_pytorch.sh   # [unix]
+    script: build_pytorch.sh  # [not win]
     script: build_pytorch.bat  # [win]
     requirements:
       build:
@@ -198,7 +231,7 @@ outputs:
         - {{ compiler('cuda') }}                 # [cuda_compiler_version != "None"]
         - nvtx-c                                 # [cuda_compiler_version != "None" and build_platform != target_platform]
         {% if cuda_major >= 12 %}
-        - cuda-driver-dev                        # [build_platform != target_platform]
+        - cuda-driver-dev                        # [build_platform != target_platform and linux]
         - cuda-cudart-dev                        # [build_platform != target_platform]
         - cuda-nvrtc-dev                         # [build_platform != target_platform]
         - cuda-nvtx-dev                          # [build_platform != target_platform]
@@ -221,18 +254,20 @@ outputs:
         - ninja
         # Keep libprotobuf here so that a compatibile version
         # of protobuf is installed between build and host
-        - libprotobuf
-        - protobuf
+        # TODO(baszalmstra): There are linker errors on Windows when using protobuf from conda-forge
+        - libprotobuf  # [not win]
+        - protobuf     # [not win]
         - make      # [linux]
+        - sccache
       host:
         # GPU requirements
         - cudnn                           # [cuda_compiler_version != "None"]
-        - nccl                            # [cuda_compiler_version != "None"]
+        - nccl                            # [cuda_compiler_version != "None" and linux]
         - magma                           # [cuda_compiler_version != "None"]
         - cuda-version {{ cuda_compiler_version }}  # [cuda_compiler_version != "None"]
         - nvtx-c                          # [cuda_compiler_version != "None"]
         {% if cuda_major >= 12 %}
-        - cuda-driver-dev
+        - cuda-driver-dev                 # [linux]
         - cuda-cudart-dev
         - cuda-nvrtc-dev
         - cuda-nvtx-dev
@@ -259,8 +294,8 @@ outputs:
         - liblapack             # [blas_impl != "mkl"]
         - libgomp   # [linux]
         - llvm-openmp    # [osx]
-        - libabseil
-        - libprotobuf
+        # TODO(baszalmstra): There are linker errors on Windows when using protobuf from conda-forge
+        - libprotobuf   # [not win]
         - sleef
         - typing
         - libuv
@@ -301,6 +336,7 @@ outputs:
         - {{ compiler('c') }}
         - {{ compiler('cxx') }}
         - boto3
+        - expecttest
         - hypothesis
         - pytest
         - tabulate
@@ -309,16 +345,26 @@ outputs:
         - pip
         - expecttest
         - xmlrunner
+        # Required by run_test.py
+        - pytest-rerunfailures
+        - pytest-shard
+        - pytest-flakefinder
+        - pytest-xdist
       imports:
         - torch  # [not (aarch64 and cuda_compiler_version != "None")]
       source_files:
+        # Only include the source_files if we are actually going to run the tests.
         - test
         # tools/ is needed to optimise test run
         # as of pytorch=2.0.0, there is a bug when trying to run tests without the tools
         - tools
       commands:
         - OMP_NUM_THREADS=4 python ./test/run_test.py || true  # [not win and not (aarch64 and cuda_compiler_version != "None")]
-        - python ./test/run_test.py  # [win]
+        
+        # TODO(baszalmstra): Getting all sorts of test failures, disabling for now, should take a look later.
+        # Using `cmd.exe /c` to not fail if this test fails (similar to `|| true`).
+        - cmd.exe /c "python ./test/run_test.py" || ver > nul 2> nul    # [win and cuda_compiler_version == "None"]
+
         # Run pip check so as to ensure that all pytorch packages are installed
         # https://github.com/conda-forge/pytorch-cpu-feedstock/issues/24
         - pip check
@@ -346,11 +392,15 @@ outputs:
       string: cpu_{{ blas_impl }}_py{{ CONDA_PY }}h{{ PKG_HASH }}_{{ PKG_BUILDNUM }}                                # [not megabuild and cuda_compiler_version == "None"]
       detect_binary_files_with_prefix: false
       skip: true  # [cuda_compiler_version != "None" and linux64 and blas_impl != "mkl"]
+      skip: true  # [win and py >= 312]
       # weigh down cpu implementation and give cuda preference
       track_features:
         - pytorch-cpu                                      # [cuda_compiler_version == "None"]
     requirements:
+      host:
+        - python
       run:
+        - python
         - pytorch {{ version }}=cuda*{{ PKG_BUILDNUM }}                   # [megabuild and cuda_compiler_version != "None"]
         - pytorch {{ version }}=cpu_{{ blas_impl }}*{{ PKG_BUILDNUM }}    # [megabuild and cuda_compiler_version == "None"]
         - {{ pin_subpackage("pytorch", exact=True) }}                     # [not megabuild]
@@ -377,4 +427,5 @@ extra:
     - benjaminrwilson
     - Tobias-Fischer
     - beckermr
+    - baszalmstra
   feedstock-name: pytorch-cpu
diff --git a/recipe/patches/0003-fix-FindLAPACK.patch b/recipe/patches/0003-fix-FindLAPACK.patch
new file mode 100644
index 000000000..0ef99b7ec
--- /dev/null
+++ b/recipe/patches/0003-fix-FindLAPACK.patch
@@ -0,0 +1,12 @@
+diff --git a/cmake/Modules/FindLAPACK.cmake b/cmake/Modules/FindLAPACK.cmake
+index 02367ff..cd85f99 100644
+--- a/cmake/Modules/FindLAPACK.cmake	
++++ b/cmake/Modules/FindLAPACK.cmake
+@@ -26,6 +26,7 @@ ENDIF(LAPACK_FIND_QUIETLY OR NOT LAPACK_FIND_REQUIRED)
+ 
+ # Old search lapack script
+ include(CheckFortranFunctionExists)
++include(CheckFunctionExists)
+ 
+ macro(Check_Lapack_Libraries LIBRARIES _prefix _name _flags _list _blas)
+   # This macro checks for the existence of the combination of fortran libraries
diff --git a/recipe/patches/0004-Fix-mkl-dependency.patch b/recipe/patches/0004-Fix-mkl-dependency.patch
new file mode 100644
index 000000000..197b4add2
--- /dev/null
+++ b/recipe/patches/0004-Fix-mkl-dependency.patch
@@ -0,0 +1,12 @@
+diff --git a/setup.py b/setup.py
+index 6b0860a..a510f12 100644
+--- a/setup.py
++++ b/setup.py	
+@@ -1111,7 +1111,6 @@ def main():
+         "networkx",
+         "jinja2",
+         "fsspec",
+-        'mkl>=2021.1.1,<=2021.4.0; platform_system == "Windows"',
+     ]
+ 
+     # Parse the command line and check the arguments before we proceed with
diff --git a/recipe/patches/0005-Fix-FindOpenBLAS.patch b/recipe/patches/0005-Fix-FindOpenBLAS.patch
new file mode 100644
index 000000000..2971a7fb5
--- /dev/null
+++ b/recipe/patches/0005-Fix-FindOpenBLAS.patch
@@ -0,0 +1,36 @@
+diff --git a/cmake/Modules/FindOpenBLAS.cmake b/cmake/Modules/FindOpenBLAS.cmake
+index 69d8227..04cedeb 100644
+--- a/cmake/Modules/FindOpenBLAS.cmake
++++ b/cmake/Modules/FindOpenBLAS.cmake	
+@@ -31,22 +31,25 @@ SET(Open_BLAS_LIB_SEARCH_PATHS
+         $ENV{OpenBLAS_HOME}/lib
+  )
+ 
+-FIND_PATH(OpenBLAS_INCLUDE_DIR NAMES cblas.h PATHS ${Open_BLAS_INCLUDE_SEARCH_PATHS})
+-FIND_LIBRARY(OpenBLAS_LIB NAMES openblas PATHS ${Open_BLAS_LIB_SEARCH_PATHS})
+-
+ SET(OpenBLAS_FOUND ON)
+ 
+ #    Check include files
++FIND_PATH(OpenBLAS_INCLUDE_DIR NAMES cblas.h PATHS ${Open_BLAS_INCLUDE_SEARCH_PATHS})
+ IF(NOT OpenBLAS_INCLUDE_DIR)
+     SET(OpenBLAS_FOUND OFF)
+     MESSAGE(STATUS "Could not find OpenBLAS include. Turning OpenBLAS_FOUND off")
+ ENDIF()
+ 
+ #    Check libraries
+-IF(NOT OpenBLAS_LIB)
++FOREACH(LIB lapack cblas blas)
++  FIND_LIBRARY(FOUND_LIB_${LIB} NAMES ${LIB} PATHS ${Open_BLAS_LIB_SEARCH_PATHS})  
++  IF(NOT FOUND_LIB_${LIB})
+     SET(OpenBLAS_FOUND OFF)
+-    MESSAGE(STATUS "Could not find OpenBLAS lib. Turning OpenBLAS_FOUND off")
+-ENDIF()
++    MESSAGE(STATUS "Could not find OpenBLAS lib ${LIB}. Turning OpenBLAS_FOUND off")
++  ELSE()
++    LIST(APPEND OpenBLAS_LIB ${FOUND_LIB_${LIB}})
++  ENDIF()
++ENDFOREACH()
+ 
+ IF (OpenBLAS_FOUND)
+   IF (NOT OpenBLAS_FIND_QUIETLY)