@@ -66,11 +66,30 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
6666 // If not DWORD aligned or size is more than the threshold, call the library.
6767 // The libc version is likely to be faster for these cases. It can use the
6868 // address value and run time information about the CPU.
69- if (Alignment < Align (4 ) || !ConstantSize ||
70- ConstantSize->getZExtValue () > Subtarget.getMaxInlineSizeThreshold ())
69+ if (!ConstantSize ||
70+ (!AlwaysInline &&
71+ (Alignment < Align (4 ) ||
72+ ConstantSize->getZExtValue () > Subtarget.getMaxInlineSizeThreshold ())))
7173 return SDValue ();
7274
75+ // When optimizing for minimum size, ignore the destination alignment:
76+ // x86 allows unaligned stores, so alignment is not a correctness concern.
77+ // Instead, derive the store width from the size so that a single REP STOS
78+ // covers the whole length without always having to resort to stosb.
79+ //
80+ // Because this is a feature specific to x86, we must handle it here.
7381 uint64_t SizeVal = ConstantSize->getZExtValue ();
82+ if (DAG.getMachineFunction ().getFunction ().hasMinSize ()) {
83+ if ((SizeVal & 7 ) == 0 && Subtarget.is64Bit ())
84+ Alignment = Align (8 );
85+ else if ((SizeVal & 3 ) == 0 )
86+ Alignment = Align (4 );
87+ else if ((SizeVal & 1 ) == 0 )
88+ Alignment = Align (2 );
89+ else
90+ Alignment = Align (1 );
91+ }
92+
7493 SDValue InGlue;
7594 EVT AVT;
7695 SDValue Count;
@@ -86,7 +105,7 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
86105 ValReg = X86::EAX;
87106 Val = (Val << 8 ) | Val;
88107 Val = (Val << 16 ) | Val;
89- if (Subtarget.is64Bit () && Alignment > Align (8 )) { // QWORD aligned
108+ if (Subtarget.is64Bit () && Alignment > Align (4 )) { // QWORD aligned
90109 AVT = MVT::i64 ;
91110 ValReg = X86::RAX;
92111 Val = (Val << 32 ) | Val;
@@ -103,12 +122,10 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
103122 Count = DAG.getIntPtrConstant (SizeVal, dl);
104123 }
105124
106- if (AVT.bitsGT (MVT::i8 )) {
107- unsigned UBytes = AVT.getSizeInBits () / 8 ;
108- Count = DAG.getIntPtrConstant (SizeVal / UBytes, dl);
109- BytesLeft = SizeVal % UBytes;
110- }
111-
125+ const uint64_t BlockBytes = AVT.getSizeInBits () / 8 ;
126+ const uint64_t BlockCount = SizeVal / BlockBytes;
127+ Count = DAG.getIntPtrConstant (BlockCount, dl);
128+ BytesLeft = SizeVal % BlockBytes;
112129 Chain = DAG.getCopyToReg (Chain, dl, ValReg, DAG.getConstant (Val, dl, AVT),
113130 InGlue);
114131 InGlue = Chain.getValue (1 );
@@ -120,34 +137,41 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
120137 }
121138
122139 bool Use64BitRegs = Subtarget.isTarget64BitLP64 ();
123- Chain = DAG.getCopyToReg (Chain, dl, Use64BitRegs ? X86::RCX : X86::ECX,
124- Count, InGlue);
140+ Chain = DAG.getCopyToReg (Chain, dl, Use64BitRegs ? X86::RCX : X86::ECX, Count,
141+ InGlue);
125142 InGlue = Chain.getValue (1 );
126- Chain = DAG.getCopyToReg (Chain, dl, Use64BitRegs ? X86::RDI : X86::EDI,
127- Dst, InGlue);
143+ Chain = DAG.getCopyToReg (Chain, dl, Use64BitRegs ? X86::RDI : X86::EDI, Dst,
144+ InGlue);
128145 InGlue = Chain.getValue (1 );
129146
130147 SDVTList Tys = DAG.getVTList (MVT::Other, MVT::Glue);
131- SDValue Ops[] = { Chain, DAG.getValueType (AVT), InGlue };
132- Chain = DAG.getNode (X86ISD::REP_STOS, dl, Tys, Ops);
133-
134- if (BytesLeft) {
135- // Handle the last 1 - 7 bytes.
136- unsigned Offset = SizeVal - BytesLeft;
137- EVT AddrVT = Dst.getValueType ();
138- EVT SizeVT = Size.getValueType ();
139-
140- Chain =
141- DAG.getMemset (Chain, dl,
142- DAG.getNode (ISD::ADD, dl, AddrVT, Dst,
143- DAG.getConstant (Offset, dl, AddrVT)),
144- Val, DAG.getConstant (BytesLeft, dl, SizeVT), Alignment,
145- isVolatile, AlwaysInline,
146- /* isTailCall */ false , DstPtrInfo.getWithOffset (Offset));
147- }
148+ SDValue Ops[] = {Chain, DAG.getValueType (AVT), InGlue};
149+ SDValue RepStos = DAG.getNode (X86ISD::REP_STOS, dl, Tys, Ops);
150+
151+ // RepStos can process the whole length.
152+ //
153+ // Under the minsize attribute, the alignment was rederived from the size
154+ // earlier in the function, so BytesLeft is guaranteed to be 0 when we get
155+ // here.
156+ if (BytesLeft == 0 )
157+ return RepStos;
148158
149- // TODO: Use a Tokenfactor, as in memcpy, instead of a single chain.
150- return Chain;
159+ // Handle the last 1 - 7 bytes.
160+ SmallVector<SDValue, 4 > Results;
161+ Results.push_back (RepStos);
162+ unsigned Offset = SizeVal - BytesLeft;
163+ EVT AddrVT = Dst.getValueType ();
164+ EVT SizeVT = Size.getValueType ();
165+
166+ Results.push_back (
167+ DAG.getMemset (Chain, dl,
168+ DAG.getNode (ISD::ADD, dl, AddrVT, Dst,
169+ DAG.getConstant (Offset, dl, AddrVT)),
170+ Val, DAG.getConstant (BytesLeft, dl, SizeVT), Alignment,
171+ isVolatile, /* AlwaysInline */ true ,
172+ /* isTailCall */ false , DstPtrInfo.getWithOffset (Offset)));
173+
174+ return DAG.getNode (ISD::TokenFactor, dl, MVT::Other, Results);
151175}
152176
153177// / Emit a single REP MOVS{B,W,D,Q} instruction.
@@ -220,13 +244,42 @@ static SDValue emitConstantSizeRepmov(
220244 assert (!Subtarget.hasERMSB () && " No efficient RepMovs" );
221245 // / We assume runtime memcpy will do a better job for unaligned copies when
222246 // / ERMS is not present.
223- if (!AlwaysInline && (Alignment. value () & 3 ) != 0 )
247+ if (!AlwaysInline && (Alignment < Align ( 4 )) )
224248 return SDValue ();
225249
250+ // When optimizing for minimum size, ignore the alignment of the operands:
251+ // x86 allows unaligned accesses, so alignment is not a correctness concern.
252+ // Instead, derive the block width from the size so that a single REP MOVS
253+ // covers the whole length without always having to resort to movsb.
254+ //
255+ // Because this is a feature specific to x86, we must handle it here.
256+
257+ if (DAG.getMachineFunction ().getFunction ().hasMinSize ()) {
258+ if ((Size & 15 ) == 0 && Subtarget.is64Bit ())
259+ Alignment = Align (16 );
260+ else if ((Size & 7 ) == 0 )
261+ Alignment = Align (8 );
262+ else if ((Size & 3 ) == 0 )
263+ Alignment = Align (4 );
264+ else if ((Size & 1 ) == 0 )
265+ Alignment = Align (2 );
266+ else
267+ Alignment = Align (1 );
268+ }
269+
226270 const MVT BlockType = getOptimalRepmovsType (Subtarget, Alignment);
227271 const uint64_t BlockBytes = BlockType.getSizeInBits () / 8 ;
228272 const uint64_t BlockCount = Size / BlockBytes;
229273 const uint64_t BytesLeft = Size % BlockBytes;
274+
275+ if (DAG.getMachineFunction ().getFunction ().hasMinSize ()) {
276+ // Use the one instruction determined. Because we changed the alignment
277+ // earlier in the function to work on size when we have the minsize
278+ // attribute, it is guaranteed to process the entire length.
279+ return emitRepmovs (Subtarget, DAG, dl, Chain, Dst, Src,
280+ DAG.getIntPtrConstant (BlockCount, dl), BlockType);
281+ }
282+
230283 SDValue RepMovs =
231284 emitRepmovs (Subtarget, DAG, dl, Chain, Dst, Src,
232285 DAG.getIntPtrConstant (BlockCount, dl), BlockType);
@@ -237,11 +290,6 @@ static SDValue emitConstantSizeRepmov(
237290
238291 assert (BytesLeft && " We have leftover at this point" );
239292
240- // / In case we optimize for size we use repmovsb even if it's less efficient
241- // / so we can save the loads/stores of the leftover.
242- if (DAG.getMachineFunction ().getFunction ().hasMinSize ())
243- return emitRepmovsB (Subtarget, DAG, dl, Chain, Dst, Src, Size);
244-
245293 // Handle the last 1 - 7 bytes.
246294 SmallVector<SDValue, 4 > Results;
247295 Results.push_back (RepMovs);
0 commit comments