diff --git a/src/compiler/mod.rs b/src/compiler/mod.rs
index 051ccc2c8..0b434c5a0 100644
--- a/src/compiler/mod.rs
+++ b/src/compiler/mod.rs
@@ -32,5 +32,6 @@ mod tasking_vx;
 #[macro_use]
 mod counted_array;
 
+pub use crate::compiler::c::CCompilerKind;
 pub use crate::compiler::compiler::*;
 pub use crate::compiler::preprocessor_cache::PreprocessorCacheEntry;
diff --git a/src/compiler/nvcc.rs b/src/compiler/nvcc.rs
index 5da0a53a4..bc24d0e59 100644
--- a/src/compiler/nvcc.rs
+++ b/src/compiler/nvcc.rs
@@ -461,7 +461,20 @@ pub fn generate_compile_commands(
         output_file_name: output.file_name().unwrap().to_owned(),
     };
 
-    Ok((command, None, Cacheable::Yes))
+    Ok((
+        command,
+        None,
+        // Never assume the outer `nvcc` call is cacheable. We must decompose the nvcc call into
+        // its constituent subcommands with `--dryrun` and only cache the final build product.
+        //
+        // Always decomposing the call via `nvcc --dryrun` is the only way to ensure that caching
+        // nvcc invocations is fully sound, because the `nvcc -E` preprocessor output is not
+        // sufficient to detect all source code changes.
+        //
+        // Specifically, `nvcc -E` always defines __CUDA_ARCH__, which means changes to host-only
+        // code guarded by an `#ifndef __CUDA_ARCH__` will _not_ be captured in `nvcc -E` output.
+        Cacheable::No,
+    ))
 }
 
 #[derive(Clone, Debug)]
@@ -811,19 +824,28 @@ where
                         )
                     }
                 } else {
-                    // Returns Cacheable::Yes to indicate we _do_ want to run this host
-                    // compiler call through sccache (because it may be distributed),
-                    // but we _do not_ want to cache its output. The output file will
-                    // be cached as the result of the outer `nvcc` command. Caching
-                    // here would store the same object twice under two different hashes,
-                    // unnecessarily bloating the cache size.
+                    // Cache the host compiler calls, since we've marked the outer `nvcc` call
+                    // as non-cacheable. This ensures `sccache nvcc ...` _always_ decomposes the
+                    // nvcc call into its constituent subcommands with `--dryrun`, but only caches
+                    // the final build product once.
+                    //
+                    // Always decomposing the call via `nvcc --dryrun` is the only way to ensure
+                    // that caching nvcc invocations is fully sound, because the `nvcc -E`
+                    // preprocessor output is not sufficient to detect all source code changes.
+                    //
+                    // Specifically, `nvcc -E` always defines __CUDA_ARCH__, which means changes to host-only
+                    // code guarded by an `#ifndef __CUDA_ARCH__` will _not_ be captured in `nvcc -E` output.
                     (
                         env_vars
                             .iter()
                             .chain(
                                 [
-                                    // Do not cache host compiler calls
-                                    ("SCCACHE_NO_CACHE".into(), "true".into()),
+                                    // HACK: This compilation will look like a C/C++ compilation,
+                                    // but we want to report it in the stats as a CUDA compilation.
+                                    // The SccacheService API doesn't have a great way to specify this
+                                    // case, so we set a special envvar here that it can read when the
+                                    // compilation is finished.
+                                    ("__SCCACHE_THIS_IS_A_CUDA_COMPILATION__".into(), "".into()),
                                 ]
                                 .iter(),
                             )
diff --git a/src/server.rs b/src/server.rs
index 14207b48c..0620e61b9 100644
--- a/src/server.rs
+++ b/src/server.rs
@@ -1305,8 +1305,22 @@ where
 
         let out_pretty = hasher.output_pretty().into_owned();
         let color_mode = hasher.color_mode();
-        let kind = compiler.kind();
-        let lang = hasher.language();
+
+        let (kind, lang) = {
+            // HACK: See note in src/compiler/nvcc.rs
+            if env_vars
+                .iter()
+                .any(|(k, _)| k == "__SCCACHE_THIS_IS_A_CUDA_COMPILATION__")
+            {
+                (
+                    CompilerKind::C(crate::compiler::CCompilerKind::Nvcc),
+                    Language::Cuda,
+                )
+            } else {
+                (compiler.kind(), hasher.language())
+            }
+        };
+
         let me = self.clone();
 
         self.rt
diff --git a/tests/system.rs b/tests/system.rs
index 75fa004a6..de480dfa9 100644
--- a/tests/system.rs
+++ b/tests/system.rs
@@ -703,12 +703,12 @@ fn test_nvcc_cuda_compiles(compiler: &Compiler, tempdir: &Path) {
     trace!("compile A request stats");
     get_stats(|info| {
         assert_eq!(2, info.stats.compile_requests);
-        assert_eq!(5, info.stats.requests_executed);
-        assert_eq!(1, info.stats.cache_hits.all());
+        assert_eq!(8, info.stats.requests_executed);
+        assert_eq!(3, info.stats.cache_hits.all());
         assert_eq!(3, info.stats.cache_misses.all());
         assert_eq!(&1, info.stats.cache_hits.get("CUDA").unwrap());
-        assert!(info.stats.cache_hits.get("PTX").is_none());
-        assert!(info.stats.cache_hits.get("CUBIN").is_none());
+        assert_eq!(&1, info.stats.cache_hits.get("PTX").unwrap());
+        assert_eq!(&1, info.stats.cache_hits.get("CUBIN").unwrap());
         assert_eq!(&1, info.stats.cache_misses.get("CUDA").unwrap());
         assert_eq!(&1, info.stats.cache_misses.get("PTX").unwrap());
         assert_eq!(&1, info.stats.cache_misses.get("CUBIN").unwrap());
@@ -717,8 +717,8 @@ fn test_nvcc_cuda_compiles(compiler: &Compiler, tempdir: &Path) {
         let adv_ptx_key = adv_key_kind("ptx", compiler.name);
         let adv_cubin_key = adv_key_kind("cubin", compiler.name);
         assert_eq!(&1, info.stats.cache_hits.get_adv(&adv_cuda_key).unwrap());
-        assert!(info.stats.cache_hits.get_adv(&adv_ptx_key).is_none());
-        assert!(info.stats.cache_hits.get_adv(&adv_cubin_key).is_none());
+        assert_eq!(&1, info.stats.cache_hits.get_adv(&adv_ptx_key).unwrap());
+        assert_eq!(&1, info.stats.cache_hits.get_adv(&adv_cubin_key).unwrap());
         assert_eq!(&1, info.stats.cache_misses.get_adv(&adv_cuda_key).unwrap());
         assert_eq!(&1, info.stats.cache_misses.get_adv(&adv_ptx_key).unwrap());
         assert_eq!(&1, info.stats.cache_misses.get_adv(&adv_cubin_key).unwrap());
@@ -747,12 +747,12 @@ fn test_nvcc_cuda_compiles(compiler: &Compiler, tempdir: &Path) {
     trace!("compile B request stats");
     get_stats(|info| {
         assert_eq!(3, info.stats.compile_requests);
-        assert_eq!(9, info.stats.requests_executed);
-        assert_eq!(2, info.stats.cache_hits.all());
+        assert_eq!(12, info.stats.requests_executed);
+        assert_eq!(4, info.stats.cache_hits.all());
         assert_eq!(5, info.stats.cache_misses.all());
         assert_eq!(&1, info.stats.cache_hits.get("CUDA").unwrap());
-        assert!(info.stats.cache_hits.get("PTX").is_none());
-        assert_eq!(&1, info.stats.cache_hits.get("CUBIN").unwrap());
+        assert_eq!(&1, info.stats.cache_hits.get("PTX").unwrap());
+        assert_eq!(&2, info.stats.cache_hits.get("CUBIN").unwrap());
         assert_eq!(&2, info.stats.cache_misses.get("CUDA").unwrap());
         assert_eq!(&2, info.stats.cache_misses.get("PTX").unwrap());
         assert_eq!(&1, info.stats.cache_misses.get("CUBIN").unwrap());
@@ -761,8 +761,8 @@ fn test_nvcc_cuda_compiles(compiler: &Compiler, tempdir: &Path) {
         let adv_ptx_key = adv_key_kind("ptx", compiler.name);
         let adv_cubin_key = adv_key_kind("cubin", compiler.name);
         assert_eq!(&1, info.stats.cache_hits.get_adv(&adv_cuda_key).unwrap());
-        assert!(info.stats.cache_hits.get_adv(&adv_ptx_key).is_none());
-        assert_eq!(&1, info.stats.cache_hits.get_adv(&adv_cubin_key).unwrap());
+        assert_eq!(&1, info.stats.cache_hits.get_adv(&adv_ptx_key).unwrap());
+        assert_eq!(&2, info.stats.cache_hits.get_adv(&adv_cubin_key).unwrap());
         assert_eq!(&2, info.stats.cache_misses.get_adv(&adv_cuda_key).unwrap());
         assert_eq!(&2, info.stats.cache_misses.get_adv(&adv_ptx_key).unwrap());
         assert_eq!(&1, info.stats.cache_misses.get_adv(&adv_cubin_key).unwrap());
@@ -789,13 +789,13 @@ fn test_nvcc_cuda_compiles(compiler: &Compiler, tempdir: &Path) {
     trace!("compile ptx request stats");
     get_stats(|info| {
         assert_eq!(4, info.stats.compile_requests);
-        assert_eq!(11, info.stats.requests_executed);
-        assert_eq!(3, info.stats.cache_hits.all());
-        assert_eq!(6, info.stats.cache_misses.all());
+        assert_eq!(14, info.stats.requests_executed);
+        assert_eq!(5, info.stats.cache_hits.all());
+        assert_eq!(5, info.stats.cache_misses.all());
         assert_eq!(&1, info.stats.cache_hits.get("CUDA").unwrap());
-        assert_eq!(&1, info.stats.cache_hits.get("PTX").unwrap());
-        assert_eq!(&1, info.stats.cache_hits.get("CUBIN").unwrap());
-        assert_eq!(&3, info.stats.cache_misses.get("CUDA").unwrap());
+        assert_eq!(&2, info.stats.cache_hits.get("PTX").unwrap());
+        assert_eq!(&2, info.stats.cache_hits.get("CUBIN").unwrap());
+        assert_eq!(&2, info.stats.cache_misses.get("CUDA").unwrap());
         assert_eq!(&2, info.stats.cache_misses.get("PTX").unwrap());
         assert_eq!(&1, info.stats.cache_misses.get("CUBIN").unwrap());
         assert!(info.stats.cache_misses.get("C/C++").is_none());
@@ -803,9 +803,9 @@ fn test_nvcc_cuda_compiles(compiler: &Compiler, tempdir: &Path) {
         let adv_ptx_key = adv_key_kind("ptx", compiler.name);
         let adv_cubin_key = adv_key_kind("cubin", compiler.name);
         assert_eq!(&1, info.stats.cache_hits.get_adv(&adv_cuda_key).unwrap());
-        assert_eq!(&1, info.stats.cache_hits.get_adv(&adv_ptx_key).unwrap());
-        assert_eq!(&1, info.stats.cache_hits.get_adv(&adv_cubin_key).unwrap());
-        assert_eq!(&3, info.stats.cache_misses.get_adv(&adv_cuda_key).unwrap());
+        assert_eq!(&2, info.stats.cache_hits.get_adv(&adv_ptx_key).unwrap());
+        assert_eq!(&2, info.stats.cache_hits.get_adv(&adv_cubin_key).unwrap());
+        assert_eq!(&2, info.stats.cache_misses.get_adv(&adv_cuda_key).unwrap());
         assert_eq!(&2, info.stats.cache_misses.get_adv(&adv_ptx_key).unwrap());
         assert_eq!(&1, info.stats.cache_misses.get_adv(&adv_cubin_key).unwrap());
     });
@@ -831,13 +831,13 @@ fn test_nvcc_cuda_compiles(compiler: &Compiler, tempdir: &Path) {
     trace!("compile cubin request stats");
     get_stats(|info| {
         assert_eq!(5, info.stats.compile_requests);
-        assert_eq!(14, info.stats.requests_executed);
-        assert_eq!(5, info.stats.cache_hits.all());
-        assert_eq!(7, info.stats.cache_misses.all());
+        assert_eq!(17, info.stats.requests_executed);
+        assert_eq!(7, info.stats.cache_hits.all());
+        assert_eq!(5, info.stats.cache_misses.all());
         assert_eq!(&1, info.stats.cache_hits.get("CUDA").unwrap());
-        assert_eq!(&2, info.stats.cache_hits.get("PTX").unwrap());
-        assert_eq!(&2, info.stats.cache_hits.get("CUBIN").unwrap());
-        assert_eq!(&4, info.stats.cache_misses.get("CUDA").unwrap());
+        assert_eq!(&3, info.stats.cache_hits.get("PTX").unwrap());
+        assert_eq!(&3, info.stats.cache_hits.get("CUBIN").unwrap());
+        assert_eq!(&2, info.stats.cache_misses.get("CUDA").unwrap());
         assert_eq!(&2, info.stats.cache_misses.get("PTX").unwrap());
         assert_eq!(&1, info.stats.cache_misses.get("CUBIN").unwrap());
         assert!(info.stats.cache_misses.get("C/C++").is_none());
@@ -845,9 +845,9 @@ fn test_nvcc_cuda_compiles(compiler: &Compiler, tempdir: &Path) {
         let adv_ptx_key = adv_key_kind("ptx", compiler.name);
         let adv_cubin_key = adv_key_kind("cubin", compiler.name);
         assert_eq!(&1, info.stats.cache_hits.get_adv(&adv_cuda_key).unwrap());
-        assert_eq!(&2, info.stats.cache_hits.get_adv(&adv_ptx_key).unwrap());
-        assert_eq!(&2, info.stats.cache_hits.get_adv(&adv_cubin_key).unwrap());
-        assert_eq!(&4, info.stats.cache_misses.get_adv(&adv_cuda_key).unwrap());
+        assert_eq!(&3, info.stats.cache_hits.get_adv(&adv_ptx_key).unwrap());
+        assert_eq!(&3, info.stats.cache_hits.get_adv(&adv_cubin_key).unwrap());
+        assert_eq!(&2, info.stats.cache_misses.get_adv(&adv_cuda_key).unwrap());
         assert_eq!(&2, info.stats.cache_misses.get_adv(&adv_ptx_key).unwrap());
         assert_eq!(&1, info.stats.cache_misses.get_adv(&adv_cubin_key).unwrap());
     });
@@ -914,14 +914,14 @@ fn test_nvcc_proper_lang_stat_tracking(compiler: Compiler, tempdir: &Path) {
     trace!("request stats");
     get_stats(|info| {
         assert_eq!(4, info.stats.compile_requests);
-        assert_eq!(8, info.stats.requests_executed);
-        assert_eq!(3, info.stats.cache_hits.all());
+        assert_eq!(12, info.stats.requests_executed);
+        assert_eq!(5, info.stats.cache_hits.all());
         assert_eq!(3, info.stats.cache_misses.all());
-        assert_eq!(&1, info.stats.cache_hits.get("C/C++").unwrap());
-        assert_eq!(&1, info.stats.cache_hits.get("CUDA").unwrap());
-        assert_eq!(&1, info.stats.cache_hits.get("CUBIN").unwrap());
-        assert_eq!(&1, info.stats.cache_misses.get("C/C++").unwrap());
-        assert_eq!(&1, info.stats.cache_misses.get("CUDA").unwrap());
+        assert!(info.stats.cache_hits.get("C/C++").is_none());
+        assert_eq!(&2, info.stats.cache_hits.get("CUDA").unwrap());
+        assert_eq!(&2, info.stats.cache_hits.get("CUBIN").unwrap());
+        assert!(info.stats.cache_misses.get("C/C++").is_none());
+        assert_eq!(&2, info.stats.cache_misses.get("CUDA").unwrap());
         assert_eq!(&1, info.stats.cache_misses.get("PTX").unwrap());
     });
 }