@@ -429,54 +429,54 @@ void PassManagerBuilder::addFunctionSimplificationPasses(
   MPM.add(createCFGSimplificationPass()); // Merge & remove BBs
   MPM.add(createReassociatePass());       // Reassociate expressions
 
-  // Begin the loop pass pipeline.
-  if (EnableSimpleLoopUnswitch) {
-    // The simple loop unswitch pass relies on separate cleanup passes. Schedule
-    // them first so when we re-process a loop they run before other loop
-    // passes.
-    MPM.add(createLoopInstSimplifyPass());
-    MPM.add(createLoopSimplifyCFGPass());
-  }
-  // Try to remove as much code from the loop header as possible,
-  // to reduce the amount of IR that will have to be duplicated.
-  // TODO: Investigate promotion cap for O1.
-  MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
-  // Rotate Loop - disable header duplication at -Oz
-  MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1, PrepareForLTO));
-  // TODO: Investigate promotion cap for O1.
-  MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
-  if (EnableSimpleLoopUnswitch)
-    MPM.add(createSimpleLoopUnswitchLegacyPass());
-  else
-    MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3, DivergentTarget));
-  // FIXME: We break the loop pass pipeline here in order to do full
-  // simplify-cfg. Eventually loop-simplifycfg should be enhanced to replace the
-  // need for this.
-  MPM.add(createCFGSimplificationPass());
-  MPM.add(createInstructionCombiningPass());
-  // We resume loop passes creating a second loop pipeline here.
-  if (EnableLoopFlatten) {
-    MPM.add(createLoopFlattenPass()); // Flatten loops
-    MPM.add(createLoopSimplifyCFGPass());
+  // Do not run the loop pass pipeline in "SYCL Optimization Mode". Loop
+  // optimizations rely on TTI, which is not accurate for the SPIR target.
+  if (!SYCLOptimizationMode) {
+    // Begin the loop pass pipeline.
+    if (EnableSimpleLoopUnswitch) {
+      // The simple loop unswitch pass relies on separate cleanup passes.
+      // Schedule them first so when we re-process a loop they run before
+      // other loop passes.
+      MPM.add(createLoopInstSimplifyPass());
+      MPM.add(createLoopSimplifyCFGPass());
+    }
+    // Try to remove as much code from the loop header as possible,
+    // to reduce the amount of IR that will have to be duplicated.
+    // TODO: Investigate promotion cap for O1.
+    MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
+    // Rotate Loop - disable header duplication at -Oz
+    MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1, PrepareForLTO));
+    // TODO: Investigate promotion cap for O1.
+    MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
+    if (EnableSimpleLoopUnswitch)
+      MPM.add(createSimpleLoopUnswitchLegacyPass());
+    else
+      MPM.add(
+          createLoopUnswitchPass(SizeLevel || OptLevel < 3, DivergentTarget));
+    // FIXME: We break the loop pass pipeline here in order to do full
+    // simplify-cfg. Eventually loop-simplifycfg should be enhanced to replace
+    // the need for this.
+    MPM.add(createCFGSimplificationPass());
+    MPM.add(createInstructionCombiningPass());
+    // We resume loop passes creating a second loop pipeline here.
+    if (EnableLoopFlatten) {
+      MPM.add(createLoopFlattenPass()); // Flatten loops
+      MPM.add(createLoopSimplifyCFGPass());
+    }
+    MPM.add(createLoopIdiomPass());      // Recognize idioms like memset.
+    MPM.add(createIndVarSimplifyPass()); // Canonicalize indvars
+    addExtensionsToPM(EP_LateLoopOptimizations, MPM);
+    MPM.add(createLoopDeletionPass()); // Delete dead loops
+
+    if (EnableLoopInterchange)
+      MPM.add(createLoopInterchangePass()); // Interchange loops
+
+    // Unroll small loops and perform peeling.
+    MPM.add(createSimpleLoopUnrollPass(OptLevel, DisableUnrollLoops,
+                                       ForgetAllSCEVInLoopUnroll));
+    addExtensionsToPM(EP_LoopOptimizerEnd, MPM);
+    // This ends the loop pass pipelines.
   }
-  MPM.add(createLoopIdiomPass()); // Recognize idioms like memset.
-  // TODO: this pass hurts performance due to promotions of induction variables
-  // from 32-bit values to 64-bit values. I assume it's because SPIR is a
-  // virtual target with an unlimited # of registers and the pass doesn't take
-  // into account that on real HW this promotion is not beneficial.
-  if (!SYCLOptimizationMode)
-    MPM.add(createIndVarSimplifyPass()); // Canonicalize indvars
-  addExtensionsToPM(EP_LateLoopOptimizations, MPM);
-  MPM.add(createLoopDeletionPass()); // Delete dead loops
-
-  if (EnableLoopInterchange)
-    MPM.add(createLoopInterchangePass()); // Interchange loops
-
-  // Unroll small loops and perform peeling.
-  MPM.add(createSimpleLoopUnrollPass(OptLevel, DisableUnrollLoops,
-                                     ForgetAllSCEVInLoopUnroll));
-  addExtensionsToPM(EP_LoopOptimizerEnd, MPM);
-  // This ends the loop pass pipelines.
 
   // Break up allocas that may now be splittable after loop unrolling.
   MPM.add(createSROAPass());
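
For context, the guard added above references SYCLOptimizationMode, whose declaration lies outside this hunk. Below is a minimal sketch of how a caller might drive the new mode, assuming the flag is a public bool member of PassManagerBuilder that defaults to false; the member name comes from this diff, but the default and the setup code here are assumptions, not the actual patch.

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"

// Sketch only: build an -O2 module pipeline with the TTI-dependent loop
// passes disabled, as a SYCL device compilation for SPIR would want.
// Assumption: SYCLOptimizationMode is a public bool on PassManagerBuilder
// defaulting to false, so non-SYCL pipelines are unchanged.
static void populateSYCLPipeline(llvm::legacy::PassManager &MPM) {
  llvm::PassManagerBuilder Builder;
  Builder.OptLevel = 2;                // -O2
  Builder.SizeLevel = 0;
  Builder.SYCLOptimizationMode = true; // skip the loop pipeline (hunk above)
  // populateModulePassManager internally calls
  // addFunctionSimplificationPasses, where the guard takes effect.
  Builder.populateModulePassManager(MPM);
}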
@@ -788,68 +788,74 @@ void PassManagerBuilder::populateModulePassManager(
 
   addExtensionsToPM(EP_VectorizerStart, MPM);
 
-  // Re-rotate loops in all our loop nests. These may have fallen out of
-  // rotated form due to GVN or other transformations, and the vectorizer relies
-  // on the rotated form. Disable header duplication at -Oz.
-  MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1, PrepareForLTO));
-
-  // Distribute loops to allow partial vectorization, i.e. isolate dependences
-  // into a separate loop that would otherwise inhibit vectorization. This is
-  // currently only performed for loops marked with the metadata
-  // llvm.loop.distribute=true or when -enable-loop-distribute is specified.
-  MPM.add(createLoopDistributePass());
-
-  MPM.add(createLoopVectorizePass(!LoopsInterleaved, !LoopVectorize));
-
-  // Eliminate loads by forwarding stores from the previous iteration to loads
-  // of the current iteration.
-  MPM.add(createLoopLoadEliminationPass());
-
-  // FIXME: Because of #pragma vectorize enable, the passes below are always
-  // inserted in the pipeline, even when the vectorizer doesn't run (ex. when
-  // on -O1 and no #pragma is found). Would be good to have these two passes
-  // as function calls, so that we can only pass them when the vectorizer
-  // changed the code.
-  MPM.add(createInstructionCombiningPass());
-  if (OptLevel > 1 && ExtraVectorizerPasses) {
-    // At higher optimization levels, try to clean up any runtime overlap and
-    // alignment checks inserted by the vectorizer. We want to track correlated
-    // runtime checks for two inner loops in the same outer loop, fold any
-    // common computations, hoist loop-invariant aspects out of any outer loop,
-    // and unswitch the runtime checks if possible. Once hoisted, we may have
-    // dead (or speculatable) control flows or more combining opportunities.
-    MPM.add(createEarlyCSEPass());
-    MPM.add(createCorrelatedValuePropagationPass());
-    MPM.add(createInstructionCombiningPass());
-    MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
-    MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3, DivergentTarget));
-    MPM.add(createCFGSimplificationPass());
+  if (!SYCLOptimizationMode) {
+    // Re-rotate loops in all our loop nests. These may have fallen out of
+    // rotated form due to GVN or other transformations, and the vectorizer
+    // relies on the rotated form. Disable header duplication at -Oz.
+    MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1, PrepareForLTO));
+
+    // Distribute loops to allow partial vectorization, i.e. isolate
+    // dependences into a separate loop that would otherwise inhibit
+    // vectorization. This is currently only performed for loops marked with
+    // the metadata llvm.loop.distribute=true or when -enable-loop-distribute
+    // is specified.
+    MPM.add(createLoopDistributePass());
+
+    MPM.add(createLoopVectorizePass(!LoopsInterleaved, !LoopVectorize));
+
+    // Eliminate loads by forwarding stores from the previous iteration to
+    // loads of the current iteration.
+    MPM.add(createLoopLoadEliminationPass());
+
+    // FIXME: Because of #pragma vectorize enable, the passes below are always
+    // inserted in the pipeline, even when the vectorizer doesn't run (ex. when
+    // on -O1 and no #pragma is found). Would be good to have these two passes
+    // as function calls, so that we can only pass them when the vectorizer
+    // changed the code.
     MPM.add(createInstructionCombiningPass());
-  }
-
-  // Cleanup after loop vectorization, etc. Simplification passes like CVP and
-  // GVN, loop transforms, and others have already run, so it's now better to
-  // convert to more optimized IR using more aggressive simplify CFG options.
-  // The extra sinking transform can create larger basic blocks, so do this
-  // before SLP vectorization.
-  // FIXME: study whether hoisting and/or sinking of common instructions
-  // should be delayed until after the SLP vectorizer.
-  MPM.add(createCFGSimplificationPass(SimplifyCFGOptions()
-                                          .forwardSwitchCondToPhi(true)
-                                          .convertSwitchToLookupTable(true)
-                                          .needCanonicalLoops(false)
-                                          .hoistCommonInsts(true)
-                                          .sinkCommonInsts(true)));
-
-  if (SLPVectorize) {
-    MPM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains.
     if (OptLevel > 1 && ExtraVectorizerPasses) {
+      // At higher optimization levels, try to clean up any runtime overlap and
+      // alignment checks inserted by the vectorizer. We want to track
+      // correlated runtime checks for two inner loops in the same outer loop,
+      // fold any common computations, hoist loop-invariant aspects out of any
+      // outer loop, and unswitch the runtime checks if possible. Once hoisted,
+      // we may have dead (or speculatable) control flows or more combining
+      // opportunities.
       MPM.add(createEarlyCSEPass());
+      MPM.add(createCorrelatedValuePropagationPass());
+      MPM.add(createInstructionCombiningPass());
+      MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
+      MPM.add(
+          createLoopUnswitchPass(SizeLevel || OptLevel < 3, DivergentTarget));
+      MPM.add(createCFGSimplificationPass());
+      MPM.add(createInstructionCombiningPass());
     }
-  }
 
-  // Enhance/cleanup vector code.
-  MPM.add(createVectorCombinePass());
+    // Cleanup after loop vectorization, etc. Simplification passes like CVP
+    // and GVN, loop transforms, and others have already run, so it's now
+    // better to convert to more optimized IR using more aggressive simplify
+    // CFG options. The extra sinking transform can create larger basic
+    // blocks, so do this before SLP vectorization.
+    // FIXME: study whether hoisting and/or sinking of common instructions
+    // should be delayed until after the SLP vectorizer.
+    MPM.add(createCFGSimplificationPass(SimplifyCFGOptions()
+                                            .forwardSwitchCondToPhi(true)
+                                            .convertSwitchToLookupTable(true)
+                                            .needCanonicalLoops(false)
+                                            .hoistCommonInsts(true)
+                                            .sinkCommonInsts(true)));
+
+    if (SLPVectorize) {
+      MPM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains.
+      if (OptLevel > 1 && ExtraVectorizerPasses) {
+        MPM.add(createEarlyCSEPass());
+      }
+    }
+
+    // Enhance/cleanup vector code.
+    MPM.add(createVectorCombinePass());
+  }
 
   addExtensionsToPM(EP_Peephole, MPM);
   MPM.add(createInstructionCombiningPass());
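
The aggressive CFG simplification configured in this hunk uses LLVM's SimplifyCFGOptions fluent builder: each setter mutates the options object and returns it by reference, so the whole configuration chains into a single expression. A standalone sketch, with include paths assumed to match upstream LLVM of this era:

#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/SimplifyCFGOptions.h"

// Each setter returns SimplifyCFGOptions&, so the configuration reads as one
// expression. The resulting pass performs switch-to-lookup-table conversion
// and cross-block hoisting/sinking that the default-configured pass skips.
static llvm::FunctionPass *createAggressiveCFGSimplify() {
  return llvm::createCFGSimplificationPass(llvm::SimplifyCFGOptions()
                                               .forwardSwitchCondToPhi(true)
                                               .convertSwitchToLookupTable(true)
                                               .needCanonicalLoops(false)
                                               .hoistCommonInsts(true)
                                               .sinkCommonInsts(true));
}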
@@ -861,22 +867,24 @@ void PassManagerBuilder::populateModulePassManager(
     MPM.add(createLoopUnrollAndJamPass(OptLevel));
   }
 
-  // Unroll small loops
-  MPM.add(createLoopUnrollPass(OptLevel, DisableUnrollLoops,
-                               ForgetAllSCEVInLoopUnroll));
+  if (!SYCLOptimizationMode) {
+    // Unroll small loops
+    MPM.add(createLoopUnrollPass(OptLevel, DisableUnrollLoops,
+                                 ForgetAllSCEVInLoopUnroll));
 
-  if (!DisableUnrollLoops) {
-    // LoopUnroll may generate some redundancy to clean up.
-    MPM.add(createInstructionCombiningPass());
+    if (!DisableUnrollLoops) {
+      // LoopUnroll may generate some redundancy to clean up.
+      MPM.add(createInstructionCombiningPass());
 
-    // Runtime unrolling will introduce a runtime check in the loop prologue.
-    // If the unrolled loop is an inner loop, then the prologue will be inside
-    // the outer loop. The LICM pass can help promote the runtime check out of
-    // the loop if the checked value is loop invariant.
-    MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
-  }
+      // Runtime unrolling will introduce a runtime check in the loop
+      // prologue. If the unrolled loop is an inner loop, then the prologue
+      // will be inside the outer loop. The LICM pass can help promote the
+      // runtime check out of the loop if the checked value is loop invariant.
+      MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
+    }
 
-  MPM.add(createWarnMissedTransformationsPass());
+    MPM.add(createWarnMissedTransformationsPass());
+  }
 
   // After vectorization and unrolling, assume intrinsics may tell us more
   // about pointer alignments.
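
One design observation on the patch as a whole: the same if (!SYCLOptimizationMode) guard now brackets three separate regions across two functions. A small helper could centralize the gating as more target-specific modes appear. This is purely a hypothetical refactoring sketch, not part of the patch; addUnlessSYCLOptMode is an invented name.

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/Pass.h"

// Hypothetical helper (invented for illustration): create and add a pass
// only when the loop pipeline is enabled, so a disabled configuration
// allocates nothing and each call site reads as intent, not a flag check.
template <typename CreateFn>
static void addUnlessSYCLOptMode(llvm::legacy::PassManagerBase &MPM,
                                 bool SYCLOptimizationMode, CreateFn Create) {
  if (!SYCLOptimizationMode)
    MPM.add(Create());
}

// Possible use inside addFunctionSimplificationPasses:
//   addUnlessSYCLOptMode(MPM, SYCLOptimizationMode,
//                        [] { return llvm::createLoopIdiomPass(); });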