@@ -367,6 +367,166 @@ entry:
367367 ret <4 x i64 > %partial.reduce
368368}
369369
; udot_no_bin_op: partial-reduce add of a zero-extended <16 x i8> into a
; <4 x i32> accumulator, with no multiply between the extend and the reduction.
; The CHECK-DOT lines expect a single udot against an all-ones byte vector
; (movi #1); the CHECK-NODOT lines expect widening shifts and widening adds
; with no dot instruction.
; NOTE(review): the RUN lines are outside this view -- presumably the DOT/NODOT
; prefixes correspond to runs with and without the dot-product feature; confirm.
370+ define <4 x i32 > @udot_no_bin_op (<4 x i32 > %acc , <16 x i8 > %a ){
371+ ; CHECK-DOT-LABEL: udot_no_bin_op:
372+ ; CHECK-DOT: // %bb.0:
373+ ; CHECK-DOT-NEXT: movi v2.16b, #1
374+ ; CHECK-DOT-NEXT: udot v0.4s, v1.16b, v2.16b
375+ ; CHECK-DOT-NEXT: ret
376+ ;
377+ ; CHECK-NODOT-LABEL: udot_no_bin_op:
378+ ; CHECK-NODOT: // %bb.0:
379+ ; CHECK-NODOT-NEXT: ushll v2.8h, v1.8b, #0
380+ ; CHECK-NODOT-NEXT: ushll2 v1.8h, v1.16b, #0
381+ ; CHECK-NODOT-NEXT: ushll v3.4s, v1.4h, #0
382+ ; CHECK-NODOT-NEXT: uaddw v0.4s, v0.4s, v2.4h
383+ ; CHECK-NODOT-NEXT: uaddw2 v2.4s, v3.4s, v2.8h
384+ ; CHECK-NODOT-NEXT: uaddw2 v0.4s, v0.4s, v1.8h
385+ ; CHECK-NODOT-NEXT: add v0.4s, v2.4s, v0.4s
386+ ; CHECK-NODOT-NEXT: ret
; The reduction operand is the bare zext result; there is no mul in this function.
387+ %a.wide = zext <16 x i8 > %a to <16 x i32 >
388+ %partial.reduce = tail call <4 x i32 > @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32 (<4 x i32 > %acc , <16 x i32 > %a.wide )
389+ ret <4 x i32 > %partial.reduce
390+ }
391+
; sdot_no_bin_op: partial-reduce add of a sign-extended <16 x i8> into a
; <4 x i32> accumulator, with no multiply between the extend and the reduction.
; The CHECK-DOT lines expect a single sdot against an all-ones byte vector
; (movi #1); the CHECK-NODOT lines expect signed widening shifts and widening
; adds with no dot instruction.
; NOTE(review): the RUN lines are outside this view -- presumably the DOT/NODOT
; prefixes correspond to runs with and without the dot-product feature; confirm.
392+ define <4 x i32 > @sdot_no_bin_op (<4 x i32 > %acc , <16 x i8 > %a ){
393+ ; CHECK-DOT-LABEL: sdot_no_bin_op:
394+ ; CHECK-DOT: // %bb.0:
395+ ; CHECK-DOT-NEXT: movi v2.16b, #1
396+ ; CHECK-DOT-NEXT: sdot v0.4s, v1.16b, v2.16b
397+ ; CHECK-DOT-NEXT: ret
398+ ;
399+ ; CHECK-NODOT-LABEL: sdot_no_bin_op:
400+ ; CHECK-NODOT: // %bb.0:
401+ ; CHECK-NODOT-NEXT: sshll v2.8h, v1.8b, #0
402+ ; CHECK-NODOT-NEXT: sshll2 v1.8h, v1.16b, #0
403+ ; CHECK-NODOT-NEXT: sshll v3.4s, v1.4h, #0
404+ ; CHECK-NODOT-NEXT: saddw v0.4s, v0.4s, v2.4h
405+ ; CHECK-NODOT-NEXT: saddw2 v2.4s, v3.4s, v2.8h
406+ ; CHECK-NODOT-NEXT: saddw2 v0.4s, v0.4s, v1.8h
407+ ; CHECK-NODOT-NEXT: add v0.4s, v2.4s, v0.4s
408+ ; CHECK-NODOT-NEXT: ret
; The reduction operand is the bare sext result; there is no mul in this function.
409+ %a.wide = sext <16 x i8 > %a to <16 x i32 >
410+ %partial.reduce = tail call <4 x i32 > @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32 (<4 x i32 > %acc , <16 x i32 > %a.wide )
411+ ret <4 x i32 > %partial.reduce
412+ }
413+
; udot_no_bin_op_narrow: 64-bit-vector form of the no-multiply unsigned case.
; A zero-extended <8 x i8> is partial-reduced into a <2 x i32> accumulator.
; The CHECK-DOT lines expect the .2s/.8b form of udot against an all-ones
; vector (movi #1); the CHECK-NODOT lines expect an expansion built from
; ushll/ext/uaddw/add with no dot instruction.
; NOTE(review): the RUN lines are outside this view -- presumably the DOT/NODOT
; prefixes correspond to runs with and without the dot-product feature; confirm.
414+ define <2 x i32 > @udot_no_bin_op_narrow (<2 x i32 > %acc , <8 x i8 > %a ){
415+ ; CHECK-DOT-LABEL: udot_no_bin_op_narrow:
416+ ; CHECK-DOT: // %bb.0:
417+ ; CHECK-DOT-NEXT: movi v2.8b, #1
418+ ; CHECK-DOT-NEXT: udot v0.2s, v1.8b, v2.8b
419+ ; CHECK-DOT-NEXT: ret
420+ ;
421+ ; CHECK-NODOT-LABEL: udot_no_bin_op_narrow:
422+ ; CHECK-NODOT: // %bb.0:
423+ ; CHECK-NODOT-NEXT: ushll v1.8h, v1.8b, #0
424+ ; CHECK-NODOT-NEXT: // kill: def $d0 killed $d0 def $q0
425+ ; CHECK-NODOT-NEXT: ushll v2.4s, v1.4h, #0
426+ ; CHECK-NODOT-NEXT: ushll2 v3.4s, v1.8h, #0
427+ ; CHECK-NODOT-NEXT: ext v4.16b, v1.16b, v1.16b, #8
428+ ; CHECK-NODOT-NEXT: uaddw v0.4s, v0.4s, v1.4h
429+ ; CHECK-NODOT-NEXT: ext v3.16b, v3.16b, v3.16b, #8
430+ ; CHECK-NODOT-NEXT: ext v2.16b, v2.16b, v2.16b, #8
431+ ; CHECK-NODOT-NEXT: add v0.2s, v3.2s, v0.2s
432+ ; CHECK-NODOT-NEXT: uaddw v1.4s, v2.4s, v4.4h
433+ ; CHECK-NODOT-NEXT: add v0.2s, v1.2s, v0.2s
434+ ; CHECK-NODOT-NEXT: ret
; The reduction operand is the bare zext result; there is no mul in this function.
435+ %a.wide = zext <8 x i8 > %a to <8 x i32 >
436+ %partial.reduce = tail call <2 x i32 > @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32 (<2 x i32 > %acc , <8 x i32 > %a.wide )
437+ ret <2 x i32 > %partial.reduce
438+ }
439+
; sdot_no_bin_op_narrow: 64-bit-vector form of the no-multiply signed case.
; A sign-extended <8 x i8> is partial-reduced into a <2 x i32> accumulator.
; The CHECK-DOT lines expect the .2s/.8b form of sdot against an all-ones
; vector (movi #1); the CHECK-NODOT lines expect an expansion built from
; sshll/ext/saddw/add with no dot instruction.
; NOTE(review): the RUN lines are outside this view -- presumably the DOT/NODOT
; prefixes correspond to runs with and without the dot-product feature; confirm.
440+ define <2 x i32 > @sdot_no_bin_op_narrow (<2 x i32 > %acc , <8 x i8 > %a ){
441+ ; CHECK-DOT-LABEL: sdot_no_bin_op_narrow:
442+ ; CHECK-DOT: // %bb.0:
443+ ; CHECK-DOT-NEXT: movi v2.8b, #1
444+ ; CHECK-DOT-NEXT: sdot v0.2s, v1.8b, v2.8b
445+ ; CHECK-DOT-NEXT: ret
446+ ;
447+ ; CHECK-NODOT-LABEL: sdot_no_bin_op_narrow:
448+ ; CHECK-NODOT: // %bb.0:
449+ ; CHECK-NODOT-NEXT: sshll v1.8h, v1.8b, #0
450+ ; CHECK-NODOT-NEXT: // kill: def $d0 killed $d0 def $q0
451+ ; CHECK-NODOT-NEXT: sshll v2.4s, v1.4h, #0
452+ ; CHECK-NODOT-NEXT: sshll2 v3.4s, v1.8h, #0
453+ ; CHECK-NODOT-NEXT: ext v4.16b, v1.16b, v1.16b, #8
454+ ; CHECK-NODOT-NEXT: saddw v0.4s, v0.4s, v1.4h
455+ ; CHECK-NODOT-NEXT: ext v3.16b, v3.16b, v3.16b, #8
456+ ; CHECK-NODOT-NEXT: ext v2.16b, v2.16b, v2.16b, #8
457+ ; CHECK-NODOT-NEXT: add v0.2s, v3.2s, v0.2s
458+ ; CHECK-NODOT-NEXT: saddw v1.4s, v2.4s, v4.4h
459+ ; CHECK-NODOT-NEXT: add v0.2s, v1.2s, v0.2s
460+ ; CHECK-NODOT-NEXT: ret
; The reduction operand is the bare sext result; there is no mul in this function.
461+ %a.wide = sext <8 x i8 > %a to <8 x i32 >
462+ %partial.reduce = tail call <2 x i32 > @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32 (<2 x i32 > %acc , <8 x i32 > %a.wide )
463+ ret <2 x i32 > %partial.reduce
464+ }
465+
; udot_no_bin_op_8to64: no-multiply unsigned case with an i8 -> i64 extend.
; A zero-extended <16 x i8> is partial-reduced into a <4 x i64> accumulator,
; which the expected code holds in two registers (v0 and v1).
; The CHECK-DOT lines expect a udot into a zeroed 32-bit-lane temporary (v4),
; followed by widening adds of that temporary into v0/v1. The CHECK-NODOT lines
; expect a full ushll/uaddw/uaddl expansion with no dot instruction.
; NOTE(review): the DOT expectation widens with the signed saddw/saddw2 forms
; even though the extend here is unsigned. The temporary starts at zero and
; only accumulates u8 * 1 products, so it should be non-negative and the signed
; and unsigned forms should agree -- confirm this is the intended lowering.
; NOTE(review): the RUN lines are outside this view -- presumably the DOT/NODOT
; prefixes correspond to runs with and without the dot-product feature; confirm.
466+ define <4 x i64 > @udot_no_bin_op_8to64 (<4 x i64 > %acc , <16 x i8 > %a ){
467+ ; CHECK-DOT-LABEL: udot_no_bin_op_8to64:
468+ ; CHECK-DOT: // %bb.0:
469+ ; CHECK-DOT-NEXT: movi v3.16b, #1
470+ ; CHECK-DOT-NEXT: movi v4.2d, #0000000000000000
471+ ; CHECK-DOT-NEXT: udot v4.4s, v2.16b, v3.16b
472+ ; CHECK-DOT-NEXT: saddw2 v1.2d, v1.2d, v4.4s
473+ ; CHECK-DOT-NEXT: saddw v0.2d, v0.2d, v4.2s
474+ ; CHECK-DOT-NEXT: ret
475+ ;
476+ ; CHECK-NODOT-LABEL: udot_no_bin_op_8to64:
477+ ; CHECK-NODOT: // %bb.0:
478+ ; CHECK-NODOT-NEXT: ushll v3.8h, v2.8b, #0
479+ ; CHECK-NODOT-NEXT: ushll2 v2.8h, v2.16b, #0
480+ ; CHECK-NODOT-NEXT: ushll v4.4s, v3.4h, #0
481+ ; CHECK-NODOT-NEXT: ushll v5.4s, v2.4h, #0
482+ ; CHECK-NODOT-NEXT: ushll2 v3.4s, v3.8h, #0
483+ ; CHECK-NODOT-NEXT: ushll2 v2.4s, v2.8h, #0
484+ ; CHECK-NODOT-NEXT: uaddw2 v1.2d, v1.2d, v4.4s
485+ ; CHECK-NODOT-NEXT: uaddw v0.2d, v0.2d, v4.2s
486+ ; CHECK-NODOT-NEXT: uaddl2 v4.2d, v3.4s, v5.4s
487+ ; CHECK-NODOT-NEXT: uaddl v3.2d, v3.2s, v5.2s
488+ ; CHECK-NODOT-NEXT: uaddw2 v1.2d, v1.2d, v2.4s
489+ ; CHECK-NODOT-NEXT: uaddw v0.2d, v0.2d, v2.2s
490+ ; CHECK-NODOT-NEXT: add v1.2d, v4.2d, v1.2d
491+ ; CHECK-NODOT-NEXT: add v0.2d, v3.2d, v0.2d
492+ ; CHECK-NODOT-NEXT: ret
; The reduction operand is the bare zext-to-i64 result; there is no mul in this function.
493+ %a.wide = zext <16 x i8 > %a to <16 x i64 >
494+ %partial.reduce = tail call <4 x i64 > @llvm.experimental.vector.partial.reduce.add.v4i64.v16i64 (<4 x i64 > %acc , <16 x i64 > %a.wide )
495+ ret <4 x i64 > %partial.reduce
496+ }
497+
; sdot_no_bin_op_8to64: no-multiply signed case with an i8 -> i64 extend.
; A sign-extended <16 x i8> is partial-reduced into a <4 x i64> accumulator,
; which the expected code holds in two registers (v0 and v1).
; The CHECK-DOT lines expect an sdot into a zeroed 32-bit-lane temporary (v4),
; followed by signed widening adds (saddw/saddw2) of that temporary into v0/v1.
; The CHECK-NODOT lines expect a full sshll/saddw/saddl expansion with no dot
; instruction.
; NOTE(review): the RUN lines are outside this view -- presumably the DOT/NODOT
; prefixes correspond to runs with and without the dot-product feature; confirm.
498+ define <4 x i64 > @sdot_no_bin_op_8to64 (<4 x i64 > %acc , <16 x i8 > %a ){
499+ ; CHECK-DOT-LABEL: sdot_no_bin_op_8to64:
500+ ; CHECK-DOT: // %bb.0:
501+ ; CHECK-DOT-NEXT: movi v3.16b, #1
502+ ; CHECK-DOT-NEXT: movi v4.2d, #0000000000000000
503+ ; CHECK-DOT-NEXT: sdot v4.4s, v2.16b, v3.16b
504+ ; CHECK-DOT-NEXT: saddw2 v1.2d, v1.2d, v4.4s
505+ ; CHECK-DOT-NEXT: saddw v0.2d, v0.2d, v4.2s
506+ ; CHECK-DOT-NEXT: ret
507+ ;
508+ ; CHECK-NODOT-LABEL: sdot_no_bin_op_8to64:
509+ ; CHECK-NODOT: // %bb.0:
510+ ; CHECK-NODOT-NEXT: sshll v3.8h, v2.8b, #0
511+ ; CHECK-NODOT-NEXT: sshll2 v2.8h, v2.16b, #0
512+ ; CHECK-NODOT-NEXT: sshll v4.4s, v3.4h, #0
513+ ; CHECK-NODOT-NEXT: sshll v5.4s, v2.4h, #0
514+ ; CHECK-NODOT-NEXT: sshll2 v3.4s, v3.8h, #0
515+ ; CHECK-NODOT-NEXT: sshll2 v2.4s, v2.8h, #0
516+ ; CHECK-NODOT-NEXT: saddw2 v1.2d, v1.2d, v4.4s
517+ ; CHECK-NODOT-NEXT: saddw v0.2d, v0.2d, v4.2s
518+ ; CHECK-NODOT-NEXT: saddl2 v4.2d, v3.4s, v5.4s
519+ ; CHECK-NODOT-NEXT: saddl v3.2d, v3.2s, v5.2s
520+ ; CHECK-NODOT-NEXT: saddw2 v1.2d, v1.2d, v2.4s
521+ ; CHECK-NODOT-NEXT: saddw v0.2d, v0.2d, v2.2s
522+ ; CHECK-NODOT-NEXT: add v1.2d, v4.2d, v1.2d
523+ ; CHECK-NODOT-NEXT: add v0.2d, v3.2d, v0.2d
524+ ; CHECK-NODOT-NEXT: ret
; The reduction operand is the bare sext-to-i64 result; there is no mul in this function.
525+ %a.wide = sext <16 x i8 > %a to <16 x i64 >
526+ %partial.reduce = tail call <4 x i64 > @llvm.experimental.vector.partial.reduce.add.v4i64.v16i64 (<4 x i64 > %acc , <16 x i64 > %a.wide )
527+ ret <4 x i64 > %partial.reduce
528+ }
529+
370530define <4 x i32 > @not_udot (<4 x i32 > %acc , <8 x i8 > %u , <8 x i8 > %s ) #0 {
371531; CHECK-LABEL: not_udot:
372532; CHECK: // %bb.0:
@@ -398,3 +558,91 @@ define <2 x i32> @not_udot_narrow(<2 x i32> %acc, <4 x i8> %u, <4 x i8> %s) {
398558 %partial.reduce = tail call <2 x i32 > @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32 (<2 x i32 > %acc , <4 x i32 > %mult )
399559 ret <2 x i32 > %partial.reduce
400560}
561+
; udot_different_types: multiply operands with different source element widths.
; %a is <8 x i16> and %b is <8 x i8>; both are zero-extended to <8 x i64>,
; multiplied, and partial-reduced into a <2 x i64> accumulator.
; The expectations use the single common CHECK prefix and contain no dot
; instruction: the operands are widened with ushll/ushll2 and combined with
; umull/umlal/umlal2 plus a final add.
562+ define <2 x i64 > @udot_different_types (<2 x i64 > %acc , <8 x i16 > %a , <8 x i8 > %b ){
563+ ; CHECK-LABEL: udot_different_types:
564+ ; CHECK: // %bb.0: // %entry
565+ ; CHECK-NEXT: ushll v2.8h, v2.8b, #0
566+ ; CHECK-NEXT: ushll v3.4s, v1.4h, #0
567+ ; CHECK-NEXT: ushll2 v1.4s, v1.8h, #0
568+ ; CHECK-NEXT: ushll v4.4s, v2.4h, #0
569+ ; CHECK-NEXT: ushll2 v2.4s, v2.8h, #0
570+ ; CHECK-NEXT: umull v5.2d, v1.2s, v2.2s
571+ ; CHECK-NEXT: umlal v0.2d, v3.2s, v4.2s
572+ ; CHECK-NEXT: umlal2 v0.2d, v1.4s, v2.4s
573+ ; CHECK-NEXT: umlal2 v5.2d, v3.4s, v4.4s
574+ ; CHECK-NEXT: add v0.2d, v5.2d, v0.2d
575+ ; CHECK-NEXT: ret
576+ entry:
; Both operands are zero-extended, but from different widths (i16 vs i8).
577+ %a.wide = zext <8 x i16 > %a to <8 x i64 >
578+ %b.wide = zext <8 x i8 > %b to <8 x i64 >
579+ %mult = mul nuw nsw <8 x i64 > %a.wide , %b.wide
580+ %partial.reduce = tail call <2 x i64 > @llvm.experimental.vector.partial.reduce.add.v2i64.v8i64 (<2 x i64 > %acc , <8 x i64 > %mult )
581+ ret <2 x i64 > %partial.reduce
582+ }
583+
; sdot_different_types: multiply operands with different source element widths.
; %a is <8 x i16> and %b is <8 x i8>; both are sign-extended to <8 x i64>,
; multiplied, and partial-reduced into a <2 x i64> accumulator.
; The expectations use the single common CHECK prefix and contain no dot
; instruction: the operands are widened with sshll/sshll2 and combined with
; smull/smlal/smlal2 plus a final add.
584+ define <2 x i64 > @sdot_different_types (<2 x i64 > %acc , <8 x i16 > %a , <8 x i8 > %b ){
585+ ; CHECK-LABEL: sdot_different_types:
586+ ; CHECK: // %bb.0: // %entry
587+ ; CHECK-NEXT: sshll v2.8h, v2.8b, #0
588+ ; CHECK-NEXT: sshll v3.4s, v1.4h, #0
589+ ; CHECK-NEXT: sshll2 v1.4s, v1.8h, #0
590+ ; CHECK-NEXT: sshll v4.4s, v2.4h, #0
591+ ; CHECK-NEXT: sshll2 v2.4s, v2.8h, #0
592+ ; CHECK-NEXT: smull v5.2d, v1.2s, v2.2s
593+ ; CHECK-NEXT: smlal v0.2d, v3.2s, v4.2s
594+ ; CHECK-NEXT: smlal2 v0.2d, v1.4s, v2.4s
595+ ; CHECK-NEXT: smlal2 v5.2d, v3.4s, v4.4s
596+ ; CHECK-NEXT: add v0.2d, v5.2d, v0.2d
597+ ; CHECK-NEXT: ret
598+ entry:
; Both operands are sign-extended, but from different widths (i16 vs i8).
599+ %a.wide = sext <8 x i16 > %a to <8 x i64 >
600+ %b.wide = sext <8 x i8 > %b to <8 x i64 >
601+ %mult = mul nuw nsw <8 x i64 > %a.wide , %b.wide
602+ %partial.reduce = tail call <2 x i64 > @llvm.experimental.vector.partial.reduce.add.v2i64.v8i64 (<2 x i64 > %acc , <8 x i64 > %mult )
603+ ret <2 x i64 > %partial.reduce
604+ }
605+
; usdot_different_types: mixed-signedness multiply with different source widths.
; %a (<8 x i16>) is zero-extended and %b (<8 x i8>) is sign-extended to
; <8 x i64>; the product is partial-reduced into a <2 x i64> accumulator.
; The expectations use the single common CHECK prefix and contain no dot
; instruction: %a is widened with ushll/ushll2, %b with sshll/sshll2, and the
; products are formed with smull/smlal/smlal2 plus a final add.
606+ define <2 x i64 > @usdot_different_types (<2 x i64 > %acc , <8 x i16 > %a , <8 x i8 > %b ){
607+ ; CHECK-LABEL: usdot_different_types:
608+ ; CHECK: // %bb.0: // %entry
609+ ; CHECK-NEXT: sshll v2.8h, v2.8b, #0
610+ ; CHECK-NEXT: ushll v3.4s, v1.4h, #0
611+ ; CHECK-NEXT: ushll2 v1.4s, v1.8h, #0
612+ ; CHECK-NEXT: sshll v4.4s, v2.4h, #0
613+ ; CHECK-NEXT: sshll2 v2.4s, v2.8h, #0
614+ ; CHECK-NEXT: smull v5.2d, v1.2s, v2.2s
615+ ; CHECK-NEXT: smlal v0.2d, v3.2s, v4.2s
616+ ; CHECK-NEXT: smlal2 v0.2d, v1.4s, v2.4s
617+ ; CHECK-NEXT: smlal2 v5.2d, v3.4s, v4.4s
618+ ; CHECK-NEXT: add v0.2d, v5.2d, v0.2d
619+ ; CHECK-NEXT: ret
620+ entry:
; Unsigned i16 operand times signed i8 operand.
621+ %a.wide = zext <8 x i16 > %a to <8 x i64 >
622+ %b.wide = sext <8 x i8 > %b to <8 x i64 >
623+ %mult = mul nuw nsw <8 x i64 > %a.wide , %b.wide
624+ %partial.reduce = tail call <2 x i64 > @llvm.experimental.vector.partial.reduce.add.v2i64.v8i64 (<2 x i64 > %acc , <8 x i64 > %mult )
625+ ret <2 x i64 > %partial.reduce
626+ }
627+
; sudot_different_types: mixed-signedness multiply with different source widths.
; %a (<8 x i16>) is sign-extended and %b (<8 x i8>) is zero-extended to
; <8 x i64>; the product is partial-reduced into a <2 x i64> accumulator.
; The expectations use the single common CHECK prefix and contain no dot
; instruction: %a is widened with sshll/sshll2, %b with ushll/ushll2, and the
; products are formed with smull/smlal/smlal2 plus a final add.
628+ define <2 x i64 > @sudot_different_types (<2 x i64 > %acc , <8 x i16 > %a , <8 x i8 > %b ){
629+ ; CHECK-LABEL: sudot_different_types:
630+ ; CHECK: // %bb.0: // %entry
631+ ; CHECK-NEXT: ushll v2.8h, v2.8b, #0
632+ ; CHECK-NEXT: sshll v3.4s, v1.4h, #0
633+ ; CHECK-NEXT: sshll2 v1.4s, v1.8h, #0
634+ ; CHECK-NEXT: ushll v4.4s, v2.4h, #0
635+ ; CHECK-NEXT: ushll2 v2.4s, v2.8h, #0
636+ ; CHECK-NEXT: smull v5.2d, v1.2s, v2.2s
637+ ; CHECK-NEXT: smlal v0.2d, v3.2s, v4.2s
638+ ; CHECK-NEXT: smlal2 v0.2d, v1.4s, v2.4s
639+ ; CHECK-NEXT: smlal2 v5.2d, v3.4s, v4.4s
640+ ; CHECK-NEXT: add v0.2d, v5.2d, v0.2d
641+ ; CHECK-NEXT: ret
642+ entry:
; Signed i16 operand times unsigned i8 operand.
643+ %a.wide = sext <8 x i16 > %a to <8 x i64 >
644+ %b.wide = zext <8 x i8 > %b to <8 x i64 >
645+ %mult = mul nuw nsw <8 x i64 > %a.wide , %b.wide
646+ %partial.reduce = tail call <2 x i64 > @llvm.experimental.vector.partial.reduce.add.v2i64.v8i64 (<2 x i64 > %acc , <8 x i64 > %mult )
647+ ret <2 x i64 > %partial.reduce
648+ }
0 commit comments