diff --git a/src/librustc_codegen_llvm/base.rs b/src/librustc_codegen_llvm/base.rs
index cb44a56d07516..d3b524c1a1e70 100644
--- a/src/librustc_codegen_llvm/base.rs
+++ b/src/librustc_codegen_llvm/base.rs
@@ -13,7 +13,7 @@
 //!   but one `llvm::Type` corresponds to many `Ty`s; for instance, `tup(int, int,
 //!   int)` and `rec(x=int, y=int, z=int)` will have the same `llvm::Type`.
 
-use super::{LlvmCodegenBackend, ModuleLlvm};
+use super::ModuleLlvm;
 
 use crate::builder::Builder;
 use crate::common;
@@ -29,7 +29,6 @@ use rustc::middle::exported_symbols;
 use rustc::mir::mono::{Linkage, Visibility};
 use rustc::session::config::DebugInfo;
 use rustc::ty::TyCtxt;
-use rustc_codegen_ssa::back::write::submit_codegened_module_to_llvm;
 use rustc_codegen_ssa::base::maybe_create_entry_wrapper;
 use rustc_codegen_ssa::mono_item::MonoItemExt;
 use rustc_codegen_ssa::traits::*;
@@ -100,8 +99,7 @@ pub fn iter_globals(llmod: &'ll llvm::Module) -> ValueIter<'ll> {
 pub fn compile_codegen_unit(
     tcx: TyCtxt<'tcx>,
     cgu_name: Symbol,
-    tx_to_llvm_workers: &std::sync::mpsc::Sender<Box<dyn std::any::Any + Send>>,
-) {
+) -> (ModuleCodegen<ModuleLlvm>, u64) {
     let prof_timer = tcx.prof.generic_activity("codegen_module");
     let start_time = Instant::now();
 
@@ -115,8 +113,6 @@ pub fn compile_codegen_unit(
     // the time we needed for codegenning it.
     let cost = time_to_codegen.as_secs() * 1_000_000_000 + time_to_codegen.subsec_nanos() as u64;
 
-    submit_codegened_module_to_llvm(&LlvmCodegenBackend(()), tx_to_llvm_workers, module, cost);
-
     fn module_codegen(tcx: TyCtxt<'_>, cgu_name: Symbol) -> ModuleCodegen<ModuleLlvm> {
         let cgu = tcx.codegen_unit(cgu_name);
         // Instantiate monomorphizations without filling out definitions yet...
@@ -164,6 +160,8 @@ pub fn compile_codegen_unit(
             kind: ModuleKind::Regular,
         }
     }
+
+    (module, cost)
 }
 
 pub fn set_link_section(llval: &Value, attrs: &CodegenFnAttrs) {
diff --git a/src/librustc_codegen_llvm/lib.rs b/src/librustc_codegen_llvm/lib.rs
index 35c71a6675683..a6168128c4d44 100644
--- a/src/librustc_codegen_llvm/lib.rs
+++ b/src/librustc_codegen_llvm/lib.rs
@@ -19,6 +19,7 @@
 #![feature(link_args)]
 #![feature(static_nobundle)]
 #![feature(trusted_len)]
+#![recursion_limit = "256"]
 
 use back::write::{create_informational_target_machine, create_target_machine};
 use rustc_span::symbol::Symbol;
@@ -108,9 +109,8 @@ impl ExtraBackendMethods for LlvmCodegenBackend {
         &self,
         tcx: TyCtxt<'_>,
         cgu_name: Symbol,
-        tx: &std::sync::mpsc::Sender<Box<dyn Any + Send>>,
-    ) {
-        base::compile_codegen_unit(tcx, cgu_name, tx);
+    ) -> (ModuleCodegen<ModuleLlvm>, u64) {
+        base::compile_codegen_unit(tcx, cgu_name)
     }
     fn target_machine_factory(
         &self,
diff --git a/src/librustc_codegen_ssa/base.rs b/src/librustc_codegen_ssa/base.rs
index c838109072775..3e39fa7c8f36d 100644
--- a/src/librustc_codegen_ssa/base.rs
+++ b/src/librustc_codegen_ssa/base.rs
@@ -14,8 +14,8 @@
 //!   int)` and `rec(x=int, y=int, z=int)` will have the same `llvm::Type`.
 
 use crate::back::write::{
-    start_async_codegen, submit_post_lto_module_to_llvm, submit_pre_lto_module_to_llvm,
-    OngoingCodegen,
+    start_async_codegen, submit_codegened_module_to_llvm, submit_post_lto_module_to_llvm,
+    submit_pre_lto_module_to_llvm, OngoingCodegen,
 };
 use crate::common::{IntPredicate, RealPredicate, TypeKind};
 use crate::meth;
@@ -40,6 +40,7 @@ use rustc::ty::{self, Instance, Ty, TyCtxt};
 use rustc_codegen_utils::{check_for_rustc_errors_attr, symbol_names_test};
 use rustc_data_structures::fx::FxHashMap;
 use rustc_data_structures::profiling::print_time_passes_entry;
+use rustc_data_structures::sync::{par_iter, Lock, ParallelIterator};
 use rustc_hir as hir;
 use rustc_hir::def_id::{DefId, LOCAL_CRATE};
 use rustc_index::vec::Idx;
@@ -606,20 +607,83 @@ pub fn codegen_crate<B: ExtraBackendMethods>(
         codegen_units
     };
 
-    let mut total_codegen_time = Duration::new(0, 0);
+    let total_codegen_time = Lock::new(Duration::new(0, 0));
 
-    for cgu in codegen_units.into_iter() {
+    // The non-parallel compiler can only translate codegen units to LLVM IR
+    // on a single thread, leading to a staircase effect where the N LLVM
+    // threads have to wait on the single codegen threads to generate work
+    // for them. The parallel compiler does not have this restriction, so
+    // we can pre-load the LLVM queue in parallel before handing off
+    // coordination to the OnGoingCodegen scheduler.
+    //
+    // This likely is a temporary measure. Once we don't have to support the
+    // non-parallel compiler anymore, we can compile CGUs end-to-end in
+    // parallel and get rid of the complicated scheduling logic.
+    let pre_compile_cgus = |cgu_reuse: &[CguReuse]| {
+        if cfg!(parallel_compiler) {
+            tcx.sess.time("compile_first_CGU_batch", || {
+                // Try to find one CGU to compile per thread.
+                let cgus: Vec<_> = cgu_reuse
+                    .iter()
+                    .enumerate()
+                    .filter(|&(_, reuse)| reuse == &CguReuse::No)
+                    .take(tcx.sess.threads())
+                    .collect();
+
+                // Compile the found CGUs in parallel.
+                par_iter(cgus)
+                    .map(|(i, _)| {
+                        let start_time = Instant::now();
+                        let module = backend.compile_codegen_unit(tcx, codegen_units[i].name());
+                        let mut time = total_codegen_time.lock();
+                        *time += start_time.elapsed();
+                        (i, module)
+                    })
+                    .collect()
+            })
+        } else {
+            FxHashMap::default()
+        }
+    };
+
+    let mut cgu_reuse = Vec::new();
+    let mut pre_compiled_cgus: Option<FxHashMap<usize, _>> = None;
+
+    for (i, cgu) in codegen_units.iter().enumerate() {
         ongoing_codegen.wait_for_signal_to_codegen_item();
         ongoing_codegen.check_for_errors(tcx.sess);
 
-        let cgu_reuse = determine_cgu_reuse(tcx, &cgu);
+        // Do some setup work in the first iteration
+        if pre_compiled_cgus.is_none() {
+            // Calculate the CGU reuse
+            cgu_reuse = tcx.sess.time("find_cgu_reuse", || {
+                codegen_units.iter().map(|cgu| determine_cgu_reuse(tcx, &cgu)).collect()
+            });
+            // Pre compile some CGUs
+            pre_compiled_cgus = Some(pre_compile_cgus(&cgu_reuse));
+        }
+
+        let cgu_reuse = cgu_reuse[i];
         tcx.sess.cgu_reuse_tracker.set_actual_reuse(&cgu.name().as_str(), cgu_reuse);
 
         match cgu_reuse {
             CguReuse::No => {
-                let start_time = Instant::now();
-                backend.compile_codegen_unit(tcx, cgu.name(), &ongoing_codegen.coordinator_send);
-                total_codegen_time += start_time.elapsed();
+                let (module, cost) =
+                    if let Some(cgu) = pre_compiled_cgus.as_mut().unwrap().remove(&i) {
+                        cgu
+                    } else {
+                        let start_time = Instant::now();
+                        let module = backend.compile_codegen_unit(tcx, cgu.name());
+                        let mut time = total_codegen_time.lock();
+                        *time += start_time.elapsed();
+                        module
+                    };
+                submit_codegened_module_to_llvm(
+                    &backend,
+                    &ongoing_codegen.coordinator_send,
+                    module,
+                    cost,
+                );
                 false
             }
             CguReuse::PreLto => {
@@ -652,7 +716,11 @@ pub fn codegen_crate<B: ExtraBackendMethods>(
 
     // Since the main thread is sometimes blocked during codegen, we keep track
     // -Ztime-passes output manually.
-    print_time_passes_entry(tcx.sess.time_passes(), "codegen_to_LLVM_IR", total_codegen_time);
+    print_time_passes_entry(
+        tcx.sess.time_passes(),
+        "codegen_to_LLVM_IR",
+        total_codegen_time.into_inner(),
+    );
 
     ::rustc_incremental::assert_module_sources::assert_module_sources(tcx);
 
diff --git a/src/librustc_codegen_ssa/traits/backend.rs b/src/librustc_codegen_ssa/traits/backend.rs
index e0d0a2f32f331..bc3a75250bf70 100644
--- a/src/librustc_codegen_ssa/traits/backend.rs
+++ b/src/librustc_codegen_ssa/traits/backend.rs
@@ -1,5 +1,6 @@
 use super::write::WriteBackendMethods;
 use super::CodegenObject;
+use crate::ModuleCodegen;
 
 use rustc::middle::cstore::EncodedMetadata;
 use rustc::session::{config, Session};
@@ -10,7 +11,6 @@ use rustc_codegen_utils::codegen_backend::CodegenBackend;
 use rustc_span::symbol::Symbol;
 use syntax::expand::allocator::AllocatorKind;
 
-use std::sync::mpsc;
 use std::sync::Arc;
 
 pub trait BackendTypes {
@@ -34,7 +34,7 @@ impl<'tcx, T> Backend<'tcx> for T where
 {
 }
 
-pub trait ExtraBackendMethods: CodegenBackend + WriteBackendMethods + Sized + Send {
+pub trait ExtraBackendMethods: CodegenBackend + WriteBackendMethods + Sized + Send + Sync {
     fn new_metadata(&self, sess: TyCtxt<'_>, mod_name: &str) -> Self::Module;
     fn write_compressed_metadata<'tcx>(
         &self,
@@ -48,12 +48,13 @@ pub trait ExtraBackendMethods: CodegenBackend + WriteBackendMethods + Sized + Se
         mods: &mut Self::Module,
         kind: AllocatorKind,
     );
+    /// This generates the codegen unit and returns it along with
+    /// a `u64` giving an estimate of the unit's processing cost.
     fn compile_codegen_unit(
         &self,
         tcx: TyCtxt<'_>,
         cgu_name: Symbol,
-        tx_to_llvm_workers: &mpsc::Sender<Box<dyn std::any::Any + Send>>,
-    );
+    ) -> (ModuleCodegen<Self::Module>, u64);
     // If find_features is true this won't access `sess.crate_types` by assuming
     // that `is_pie_binary` is false. When we discover LLVM target features
     // `sess.crate_types` is uninitialized so we cannot access it.