[mlir][gpu] Improving Cubin Serialization with ptxas Compiler

This work improves how we compile the generated PTX code by using the `ptxas` compiler. Currently, we rely on the driver's JIT API to compile the PTX code. However, this approach has some limitations. It doesn't always produce the same binary output as the ptxas compiler, leading to potential inconsistencies in the generated cubin files. This work introduces a significant improvement by directly utilizing the ptxas compiler for PTX compilation. By doing so, we can achieve more consistent and reliable results in generating cubin files. Key Benefits: - Using the ptxas compiler directly ensures that the cubin files generated during the build process remain consistent with CUDA compilation using `nvcc` or `clang`. - Another advantage of this work is that it allows developers to experiment with different ptxas compilers without the need to change the compiler. Performance varies among ptxas compiler versions, so one can easily try different ptxas compilers. Reviewed By: nicolasvasilache Differential Revision: https://reviews.llvm.org/D155563
kokkos · Jul 24, 2023 · 585cbe3 · 585cbe3
1 parent 106bde9
commit 585cbe3
Show file tree

Hide file tree

Showing 4 changed files with 201 additions and 28 deletions.
diff --git a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
@@ -70,6 +70,32 @@ inline void populateGpuRewritePatterns(RewritePatternSet &patterns) {
 }
 
 namespace gpu {
+
+/// Options for the GPU-kernel-to-CUBIN serialization pass.
+struct SerializationToCubinOptions {
+  /// LLVM target triple.
+  std::string triple;
+
+  /// SM architecture of the GPU.
+  std::string chip;
+
+  /// Target features string; carries the PTX version to produce.
+  std::string features;
+
+  /// Optimization level.
+  int optLevel = 2;
+
+  /// Dump the generated PTX to stderr for debugging purposes.
+  bool dumpPtx = false;
+
+  /// Compile the generated PTX with the ptxas compiler. When false, the
+  /// generated PTX is compiled by the driver's JIT compiler.
+  bool usePtxas = true;
+
+  /// Parameters to pass to the ptxas compiler. Ignored by the JIT compiler.
+  std::string ptxasParams;
+};
+
 /// Base pass class to serialize kernel functions through LLVM into
 /// user-specified IR and add the resulting blob as module attribute.
 class SerializeToBlobPass : public OperationPass<gpu::GPUModuleOp> {
@@ -117,9 +143,18 @@ class SerializeToBlobPass : public OperationPass<gpu::GPUModuleOp> {
       *this, "gpu-binary-annotation",
       llvm::cl::desc("Annotation attribute string for GPU binary"),
       llvm::cl::init(getDefaultGpuBinaryAnnotation())};
+
   Option<bool> dumpPtx{*this, "dump-ptx",
                        ::llvm::cl::desc("Dump generated PTX"),
                        llvm::cl::init(false)};
+
+  Option<bool> usePtxas{
+      *this, "use-ptxas",
+      ::llvm::cl::desc("Compile generated PTX by ptxas compiler"),
+      llvm::cl::init(true)};
+  Option<std::string> ptxasParams{
+      *this, "ptxas-params",
+      ::llvm::cl::desc("Parameters to pass ptxas compiler")};
 };
 } // namespace gpu
 
@@ -137,11 +172,8 @@ void registerGpuSerializeToHsacoPass();
 
 /// Create an instance of the GPU kernel function to CUBIN binary serialization
 /// pass with optLevel (default level 2).
-std::unique_ptr<Pass> createGpuSerializeToCubinPass(StringRef triple,
-                                                    StringRef chip,
-                                                    StringRef features,
-                                                    int optLevel = 2,
-                                                    bool dumpPtx = false);
+std::unique_ptr<Pass>
+createGpuSerializeToCubinPass(const gpu::SerializationToCubinOptions &options);
 
 /// Create an instance of the GPU kernel function to HSAco binary serialization
 /// pass.

diff --git a/mlir/lib/Dialect/GPU/Transforms/SerializeToCubin.cpp b/mlir/lib/Dialect/GPU/Transforms/SerializeToCubin.cpp
@@ -12,7 +12,14 @@
 //===----------------------------------------------------------------------===//
 
 #include "mlir/Dialect/GPU/Transforms/Passes.h"
-#include "llvm/Support/Debug.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/FileUtilities.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Process.h"
+#include "llvm/Support/Program.h"
+#include "llvm/Support/WithColor.h"
+#include "llvm/Support/raw_ostream.h"
 
 #if MLIR_GPU_TO_CUBIN_PASS_ENABLE
 #include "mlir/Pass/Pass.h"
@@ -36,6 +43,106 @@ static void emitCudaError(const llvm::Twine &expr, const char *buffer,
                      .concat("]"));
 }
 
+static constexpr char kPtxasCompilerName[] = "ptxas";
+
+/// Compiles `ptxSource` with the ptxas found on PATH and returns the cubin.
+static FailureOr<std::string>
+compileWithPtxas(StringRef smCapability, StringRef ptxasParams,
+                 StringRef ptxSource, bool dumpPtx, std::string *message) {
+  // Step 0. Find ptxas on PATH (fails without setting `message` if absent).
+  std::optional<std::string> ptxasCompiler =
+      llvm::sys::Process::FindInEnvPath("PATH", kPtxasCompilerName);
+  if (!ptxasCompiler.has_value())
+    return failure();
+
+  // Step 1. Create temp files for the PTX and stdio; derive the cubin path.
+  llvm::SmallString<64> ptxSourceFile, stdinFile, stdoutFile, stderrFile;
+  llvm::sys::fs::createTemporaryFile("mlir-ptx", "", ptxSourceFile);
+  llvm::sys::fs::createTemporaryFile("ptxas-stdin", "", stdinFile);
+  llvm::sys::fs::createTemporaryFile("ptxas-stdout", "", stdoutFile);
+  llvm::sys::fs::createTemporaryFile("ptxas-stderr", "", stderrFile);
+  std::string cubinFile = std::string(ptxSourceFile) + ".cubin";
+  llvm::FileRemover stdinRemover(stdinFile.c_str());
+  llvm::FileRemover stdoutRemover(stdoutFile.c_str());
+  llvm::FileRemover stderrRemover(stderrFile.c_str());
+  llvm::FileRemover binRemover(cubinFile.c_str());
+  llvm::FileRemover srcRemover(ptxSourceFile.c_str());
+
+  // Step 2. Write the generated PTX into a file, so we can pass it to the
+  // ptxas compiler.
+  std::error_code ec;
+  llvm::raw_fd_ostream fPtxSource(ptxSourceFile, ec);
+  fPtxSource << ptxSource;
+  fPtxSource.close();
+  if (fPtxSource.has_error()) {
+    *message = std::string(
+        "Could not write the generated ptx into a temporary file\n");
+    return failure();
+  }
+
+  // Step 3. Build the ptxas command line.
+  std::vector<StringRef> argVector{StringRef("ptxas"), StringRef("-arch"),
+                                   smCapability,       StringRef(ptxSourceFile),
+                                   StringRef("-o"),    StringRef(cubinFile)};
+#ifdef _WIN32
+  auto tokenize = llvm::cl::TokenizeWindowsCommandLine;
+#else
+  auto tokenize = llvm::cl::TokenizeGNUCommandLine;
+#endif // _WIN32
+  llvm::BumpPtrAllocator scratchAllocator;
+  llvm::StringSaver stringSaver(scratchAllocator);
+  SmallVector<const char *> rawArgs;
+  tokenize(ptxasParams, stringSaver, rawArgs, /*MarkEOLs=*/false);
+  for (const auto *rawArg : rawArgs)
+    argVector.emplace_back(rawArg);
+
+  std::optional<StringRef> redirects[] = {
+      stdinFile.str(),
+      stdoutFile.str(),
+      stderrFile.str(),
+  };
+
+  // Step 4. Invoke ptxas; a nonzero result is treated as failure.
+  if (llvm::sys::ExecuteAndWait(ptxasCompiler.value(),
+                                llvm::ArrayRef<llvm::StringRef>(argVector),
+                                /*Env=*/std::nullopt,
+                                /*Redirects=*/redirects,
+                                /*SecondsToWait=*/0,
+                                /*MemoryLimit=*/0,
+                                /*ErrMsg=*/message)) {
+    if (message->empty()) {
+      llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> maybeErrorlog =
+          llvm::MemoryBuffer::getFile(stderrFile);
+      *message = std::string("Invoking ptxas is failed, see the file: ");
+      if (maybeErrorlog)
+        *message += maybeErrorlog->get()->getBuffer().str();
+    }
+    stderrRemover.releaseFile();
+    return failure();
+  }
+
+  // Step 5. Print the ptxas log (its stderr) when `dumpPtx` is set. This is
+  // useful because it can show local memory usage, register usage, etc.
+  if (dumpPtx) {
+    llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> maybeFlog =
+        llvm::MemoryBuffer::getFile(stderrFile);
+    if (maybeFlog) {
+      llvm::WithColor::note() << maybeFlog->get()->getBuffer().str();
+    }
+  }
+
+  // Step 6. Read the cubin file and return its contents. It will eventually
+  // be written into the executable.
+  llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> maybeFcubin =
+      llvm::MemoryBuffer::getFile(cubinFile);
+  if (!maybeFcubin) {
+    *message = std::string("Could not read cubin file \n");
+    return failure();
+  }
+
+  return std::string(maybeFcubin->get()->getBuffer());
+}
+
 #define RETURN_ON_CUDA_ERROR(expr)                                             \
   do {                                                                         \
     if (auto status = (expr)) {                                                \
@@ -54,11 +161,13 @@ class SerializeToCubinPass
 
   SerializeToCubinPass(StringRef triple = "nvptx64-nvidia-cuda",
                        StringRef chip = "sm_35", StringRef features = "+ptx60",
-                       int optLevel = 2, bool dumpPtx = false);
+                       int optLevel = 2, bool dumpPtx = false,
+                       bool usePtxas = true, StringRef ptxasParams = {});
 
   StringRef getArgument() const override { return "gpu-to-cubin"; }
   StringRef getDescription() const override {
-    return "Lower GPU kernel function to CUBIN binary annotations";
+    return "Lower GPU kernel function to CUBIN binary "
+           "annotations";
   }
 
 private:
@@ -80,9 +189,10 @@ llvm::once_flag SerializeToCubinPass::initializeBackendOnce;
 
 SerializeToCubinPass::SerializeToCubinPass(StringRef triple, StringRef chip,
                                            StringRef features, int optLevel,
-                                           bool dumpPtx) {
-  // No matter how this pass is constructed, ensure that the NVPTX backend
-  // is initialized exactly once.
+                                           bool dumpPtx, bool usePtxas,
+                                           StringRef ptxasParams) {
+  // No matter how this pass is constructed, ensure that
+  // the NVPTX backend is initialized exactly once.
   llvm::call_once(initializeBackendOnce, []() {
     // Initialize LLVM NVPTX backend.
     LLVMInitializeNVPTXTarget();
@@ -94,7 +204,9 @@ SerializeToCubinPass::SerializeToCubinPass(StringRef triple, StringRef chip,
   maybeSetOption(this->triple, triple);
   maybeSetOption(this->chip, chip);
   maybeSetOption(this->features, features);
+  maybeSetOption(this->ptxasParams, ptxasParams);
   this->dumpPtx = dumpPtx;
+  this->usePtxas = usePtxas;
   if (this->optLevel.getNumOccurrences() == 0)
     this->optLevel.setValue(optLevel);
 }
@@ -112,7 +224,8 @@ SerializeToCubinPass::serializeISA(const std::string &isa) {
 
   RETURN_ON_CUDA_ERROR(cuInit(0));
 
-  // Linking requires a device context.
+  // Linking requires a device
+  // context.
   CUdevice device;
   RETURN_ON_CUDA_ERROR(cuDeviceGet(&device, 0));
   CUcontext context;
@@ -131,9 +244,24 @@ SerializeToCubinPass::serializeISA(const std::string &isa) {
 
   auto kernelName = getOperation().getName().str();
   if (dumpPtx) {
-    llvm::dbgs() << " Kernel Name : [" << kernelName << "]\n";
-    llvm::dbgs() << isa << "\n";
+    llvm::errs() << "// Kernel Name : [" << kernelName << "]\n";
+    llvm::errs() << isa << "\n";
   }
+
+  if (usePtxas) {
+    // Try to compile it with ptxas first.
+    std::string message;
+    FailureOr<std::string> maybeCubinImage =
+        compileWithPtxas(this->chip, ptxasParams, isa, dumpPtx, &message);
+    if (succeeded(maybeCubinImage)) {
+      return std::make_unique<std::vector<char>>(
+          maybeCubinImage.value().begin(), maybeCubinImage.value().end());
+    }
+    emitError(loc) << message;
+    return {};
+  }
+
+  // Compile with the driver's JIT when ptxas is not requested.
   RETURN_ON_CUDA_ERROR(cuLinkAddData(
       linkState, CUjitInputType::CU_JIT_INPUT_PTX,
       const_cast<void *>(static_cast<const void *>(isa.c_str())), isa.length(),
@@ -150,7 +278,7 @@ SerializeToCubinPass::serializeISA(const std::string &isa) {
   auto result =
       std::make_unique<std::vector<char>>(cubinAsChar, cubinAsChar + cubinSize);
 
-  // This will also destroy the cubin data.
+  // This will also destroy the cubin  data.
   RETURN_ON_CUDA_ERROR(cuLinkDestroy(linkState));
   RETURN_ON_CUDA_ERROR(cuCtxDestroy(context));
 
@@ -159,17 +287,22 @@ SerializeToCubinPass::serializeISA(const std::string &isa) {
 
 // Register pass to serialize GPU kernel functions to a CUBIN binary annotation.
 void mlir::registerGpuSerializeToCubinPass() {
-  PassRegistration<SerializeToCubinPass> registerSerializeToCubin(
-      [] { return std::make_unique<SerializeToCubinPass>(); });
+  PassRegistration<SerializeToCubinPass> registerSerializeToCubin([] {
+    // Initialize LLVM NVPTX backend.
+    LLVMInitializeNVPTXTarget();
+    LLVMInitializeNVPTXTargetInfo();
+    LLVMInitializeNVPTXTargetMC();
+    LLVMInitializeNVPTXAsmPrinter();
+
+    return std::make_unique<SerializeToCubinPass>();
+  });
 }
 
-std::unique_ptr<Pass> mlir::createGpuSerializeToCubinPass(StringRef triple,
-                                                          StringRef arch,
-                                                          StringRef features,
-                                                          int optLevel,
-                                                          bool dumpPtx) {
-  return std::make_unique<SerializeToCubinPass>(triple, arch, features,
-                                                optLevel, dumpPtx);
+std::unique_ptr<Pass> mlir::createGpuSerializeToCubinPass(
+    const gpu::SerializationToCubinOptions &options) {
+  return std::make_unique<SerializeToCubinPass>(
+      options.triple, options.chip, options.features, options.optLevel,
+      options.dumpPtx, options.usePtxas, options.ptxasParams);
 }
 
 #else  // MLIR_GPU_TO_CUBIN_PASS_ENABLE

diff --git a/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp b/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp
@@ -102,8 +102,12 @@ void mlir::sparse_tensor::buildSparseCompiler(
   // Finalize GPU code generation.
   if (gpuCodegen) {
 #if MLIR_GPU_TO_CUBIN_PASS_ENABLE
-    pm.addNestedPass<gpu::GPUModuleOp>(createGpuSerializeToCubinPass(
-        options.gpuTriple, options.gpuChip, options.gpuFeatures));
+    gpu::SerializationToCubinOptions cubinOptions;
+    cubinOptions.triple = options.gpuTriple;
+    cubinOptions.chip = options.gpuChip;
+    cubinOptions.features = options.gpuFeatures;
+    pm.addNestedPass<gpu::GPUModuleOp>(
+        createGpuSerializeToCubinPass(cubinOptions));
 #endif
     pm.addPass(createGpuToLLVMConversionPass());
   }

diff --git a/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp b/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp
@@ -172,8 +172,12 @@ void buildGpuPassPipeline(OpPassManager &pm,
   pm.addNestedPass<gpu::GPUModuleOp>(createReconcileUnrealizedCastsPass());
 
 #if MLIR_GPU_TO_CUBIN_PASS_ENABLE
-  pm.addNestedPass<gpu::GPUModuleOp>(createGpuSerializeToCubinPass(
-      options.cubinTriple, options.cubinChip, options.cubinFeatures));
+  gpu::SerializationToCubinOptions cubinOptions;
+  cubinOptions.triple = options.cubinTriple;
+  cubinOptions.chip = options.cubinChip;
+  cubinOptions.features = options.cubinFeatures;
+  pm.addNestedPass<gpu::GPUModuleOp>(
+      createGpuSerializeToCubinPass(cubinOptions));
 #endif // MLIR_GPU_TO_CUBIN_PASS_ENABLE
 }