Skip to content

MSVC fixes #1

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 3 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 54 additions & 14 deletions torch/_inductor/codecache.py
Original file line number Diff line number Diff line change
Expand Up @@ -342,7 +342,7 @@ def get_path(
subdir = os.path.join(cache_dir(), specified_dir)
else:
subdir = os.path.join(cache_dir(), basename[1:3])
path = os.path.join(subdir, f"{basename}.{extension}")
path = os.path.join(subdir, f"{basename}.{extension}").replace(os.sep, "/")
return basename, subdir, path


Expand Down Expand Up @@ -384,7 +384,7 @@ def write_atomic(path: str, content: Union[str, bytes]) -> None:
write_mode = "w" if isinstance(content, str) else "wb"
with tmp_path.open(write_mode) as f:
f.write(content)
tmp_path.rename(path)
tmp_path.replace(path)


@dataclasses.dataclass
Expand Down Expand Up @@ -921,7 +921,10 @@ def cpp_compiler_search(search: str) -> str:
)
with lock:
cxx = install_gcc_via_conda()
subprocess.check_output([cxx, "--version"])
if cxx == "cl":
subprocess.check_output([cxx])
else:
subprocess.check_output([cxx, "--version"])
return cxx
except (subprocess.SubprocessError, FileNotFoundError, ImportError):
continue
Expand Down Expand Up @@ -998,7 +1001,12 @@ class VecISA:

__attribute__((aligned(64))) float in_out_ptr0[16] = {0.0};

extern "C" void __avx_chk_kernel() {
#ifdef _MSC_VER
#define DLLEXPORT __declspec(dllexport)
#else
#define DLLEXPORT
#endif
extern "C" DLLEXPORT void __avx_chk_kernel() {
auto tmp0 = at::vec::Vectorized<float>(1);
auto tmp1 = tmp0.exp();
tmp1.store(in_out_ptr0);
Expand Down Expand Up @@ -1040,7 +1048,7 @@ def __bool__(self) -> bool:
lock_dir = get_lock_dir()
lock = FileLock(os.path.join(lock_dir, key + ".lock"), timeout=LOCK_TIMEOUT)
with lock:
output_path = input_path[:-3] + "so"
output_path = input_path[:-3] + ("so" if sys.platform != "win32" else "dll")
build_cmd = shlex.split(
cpp_compile_command(
input_path, output_path, warning_all=False, vec_isa=self
Expand Down Expand Up @@ -1167,6 +1175,10 @@ def get_compile_only(compile_only: bool = True) -> str:


def get_shared(shared: bool = True) -> str:
if sys.platform == "win32":
if cpp_compiler() in ["cl", "clang", "clang-cl"]:
return ""
return "-shared" if shared else ""
return "-shared -fPIC" if shared else ""


Expand All @@ -1180,6 +1192,8 @@ def get_glibcxx_abi_build_flags() -> str:

def cpp_flags() -> str:
flags = ["-std=c++17", "-Wno-unused-variable", "-Wno-unknown-pragmas"]
if cpp_compiler() in ["cl", "clang-cl"]:
return "/std:c++17"
if is_clang():
flags.append("-Werror=ignored-optimization-argument")
return " ".join(flags)
Expand All @@ -1192,6 +1206,8 @@ def cpp_wrapper_flags() -> str:
def optimization_flags() -> str:
base_flags = "-O0 -g" if config.aot_inductor.debug_compile else "-O3 -DNDEBUG"
base_flags += " -ffast-math -fno-finite-math-only"
if cpp_compiler() in ["cl", "clang-cl"]:
base_flags = "/nologo /O2 /fp:fast"
if not config.cpp.enable_unsafe_math_opt_flag:
base_flags += " -fno-unsafe-math-optimizations"

Expand All @@ -1205,6 +1221,8 @@ def optimization_flags() -> str:
# Per https://mac.r-project.org/openmp/ right way to pass `openmp` flags to MacOS is via `-Xclang`
# Also, `-march=native` is unrecognized option on M1
base_flags += " -Xclang"
elif sys.platform == "win32":
pass
else:
if platform.machine() == "ppc64le":
base_flags += " -mcpu=native"
Expand All @@ -1213,12 +1231,15 @@ def optimization_flags() -> str:

# Internal cannot find libgomp.so
if not config.is_fbcode():
base_flags += " -fopenmp"
if cpp_compiler() in ["cl", "clang-cl"]:
base_flags += " /openmp"
else:
base_flags += " -fopenmp"
return base_flags


def use_custom_generated_macros() -> str:
return "-D C10_USING_CUSTOM_GENERATED_MACROS"
return "-DC10_USING_CUSTOM_GENERATED_MACROS"


def use_fb_internal_macros() -> str:
Expand Down Expand Up @@ -1406,6 +1427,8 @@ def get_include_and_linking_paths(
# and raise error together with instructions at compilation error later
else:
libs = ["omp"] if config.is_fbcode() else ["gomp"]
if sys.platform == "win32" and "gomp" in libs:
libs.pop(libs.index("gomp"))

# Unconditionally import c10 for non-abi-compatible mode to use TORCH_CHECK - See PyTorch #108690
if not config.aot_inductor.abi_compatible:
Expand Down Expand Up @@ -1434,6 +1457,11 @@ def get_include_and_linking_paths(

lpaths_str = " ".join(["-L" + p for p in lpaths])
libs_str = " ".join(static_link_libs + ["-l" + p for p in libs])
if sys.platform == "win32":
ipaths = [p.replace(os.sep, "/") for p in ipaths]
lpaths_str = lpaths_str.replace(os.sep, "/")
libs_str = libs_str.replace(os.sep, "/")

return ipaths, lpaths_str, libs_str, macros, build_arch_flags


Expand All @@ -1454,7 +1482,7 @@ def cpp_compile_command(
)
if isinstance(input, str):
input = [input]
ipaths_str = " ".join(["-I" + p for p in ipaths])
ipaths_str = " ".join([f'-I"{p}"' for p in ipaths])
clang_flags = ""
if config.is_fbcode():
if aot_mode and not use_absolute_path:
Expand All @@ -1475,6 +1503,10 @@ def cpp_compile_command(
out_name = output
linker_paths = "" # let the compiler pick
inp_name_str = " ".join(inp_name)

out_dir = ""
if cpp_compiler() in ["cl", "clang-cl"]:
out_dir = "/Fe:" + os.path.dirname(out_name) + "/"
return re.sub(
r"[ \n]+",
" ",
Expand All @@ -1489,7 +1521,8 @@ def cpp_compile_command(
{use_fb_internal_macros()}
{use_standard_sys_dir_headers()}
{get_compile_only(compile_only)}
-o {out_name}
{out_dir}
{"-o " if "cl" not in cpp_compiler() else "/LDd /OUT:"}"{out_name}"
""",
).strip()

Expand Down Expand Up @@ -1600,7 +1633,7 @@ def compile(
output_so = (
config.aot_inductor.output_path
if specified_so_name
else os.path.splitext(input_path)[0] + ".so"
else os.path.splitext(input_path)[0] + (".so" if sys.platform != "win32" else ".dll")
)

if not os.path.exists(output_so):
Expand Down Expand Up @@ -1741,9 +1774,16 @@ def cpp_prefix() -> str:
# everything that we compile into a folder for remote compilation.
return f'#include "{os.path.basename(filename)}"'
else:
filename = filename.replace(os.sep, "/")
return f'#include "{filename}"'


@functools.lru_cache(None)
def output_encoding():
import locale
return locale.getpreferredencoding()


# Given a path to an input cpp file and an output path,
# Attempts to compile the file, storing the output in "output_path"
def compile_file(
Expand Down Expand Up @@ -1781,7 +1821,7 @@ def compile_file(
else:
subprocess.check_output(cmd, stderr=subprocess.STDOUT)
except subprocess.CalledProcessError as e:
output = e.output.decode("utf-8")
output = e.output.decode(output_encoding())
openmp_problem = "'omp.h' file not found" in output or "libomp" in output
if openmp_problem and sys.platform == "darwin":
instruction = (
Expand Down Expand Up @@ -1834,7 +1874,7 @@ def load(cls, source_code: str) -> CDLL:
lock_dir = get_lock_dir()
lock = FileLock(os.path.join(lock_dir, key + ".lock"), timeout=LOCK_TIMEOUT)
with lock:
output_path = input_path[:-3] + "so"
output_path = input_path[:-3] + ("so" if sys.platform != "win32" else "dll")
if not os.path.exists(output_path):
cmd = shlex.split(
cpp_compile_command(
Expand Down Expand Up @@ -1881,7 +1921,7 @@ def load_by_key_path(
if key not in cls.cache:
with open(path) as f:
try:
code = compile(f.read(), path, "exec")
code = compile(f.read(), path.replace(os.sep, "/"), "exec")
except Exception as e:
raise RuntimeError(
f"Failed to import {path}\n{type(e).__name__}: {e}"
Expand Down Expand Up @@ -1941,7 +1981,7 @@ def load(cls, source_code: str, func_name: str, key: str, cuda: bool) -> CDLL:
cpp_wrapper_dir = cpp_wrapper_cache_dir(name)
os.makedirs(cpp_wrapper_dir, exist_ok=True)

ext = "so"
ext = "so" if sys.platform != "win32" else "dll"
filepath = os.path.join(cpp_wrapper_dir, f"{name}.{ext}")
log.debug("Cpp wrapper code path %s", filepath)

Expand Down
7 changes: 6 additions & 1 deletion torch/_inductor/codegen/cpp.py
Original file line number Diff line number Diff line change
Expand Up @@ -3282,7 +3282,12 @@ def codegen_define_and_call(self, wrapper):
kernel_decl_name = kernel_name if V.graph.cpp_wrapper else "kernel"
code.writeline(codecache.cpp_prefix())

code.writeline(f'extern "C" void {kernel_decl_name}({arg_defs})')
code.writeline('#ifdef _MSC_VER')
code.writeline(' #define DLLEXPORT __declspec(dllexport)')
code.writeline('#else')
code.writeline(' #define DLLEXPORT')
code.writeline('#endif')
code.writeline(f'extern "C" DLLEXPORT void {kernel_decl_name}({arg_defs})')
with code.indent():
if enable_kernel_profile:
graph_id = V.graph.graph_id
Expand Down
3 changes: 3 additions & 0 deletions torch/_inductor/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -426,6 +426,9 @@ class cpp:
os.environ.get("CXX", "g++"),
# "g++.par",
)
if os.name == "nt":
cxx += ("cl",)

# Allow kernel performance profiling via PyTorch profiler
enable_kernel_profile = False

Expand Down