As noted in the code comment, transforming this in the other direction might require a separate transform here in CGP given the block-at-a-time DAG constraint.

Besides that theoretical motivation, there are 2 practical motivations for the subtract-of-cmps form:

1. The codegen for both x86 and PPC is better for this IR (though PPC could be better still). There is discussion about canonicalizing IR to the select form (http://lists.llvm.org/pipermail/llvm-dev/2017-July/114885.html), so we probably need to add DAG transforms for those patterns anyway, but this improves the memcmp output without waiting for that step.

2. If we allow vector-sized chunks for the load and compare, x86 is better prepared to convert that to optimal code when using subtract-of-cmps, so another prerequisite patch is avoided if we choose to enable that.

Differential Revision: https://reviews.llvm.org/D34904

llvm-svn: 309597
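For reference, a minimal sketch of the two forms in standalone LLVM IR (the value names and the i64 element type are illustrative assumptions, not taken from the patch):

  ; select form: nested selects on the unsigned compares
  %lt1 = icmp ult i64 %a, %b
  %gt1 = icmp ugt i64 %a, %b
  %neg = select i1 %lt1, i32 -1, i32 0
  %sel = select i1 %gt1, i32 1, i32 %neg

  ; subtract-of-cmps form: sub (zext (icmp ugt)), (zext (icmp ult))
  %lt2 = icmp ult i64 %a, %b
  %gt2 = icmp ugt i64 %a, %b
  %zgt = zext i1 %gt2 to i32
  %zlt = zext i1 %lt2 to i32
  %sub = sub i32 %zgt, %zlt

Both yield -1, 0, or 1 for an unsigned three-way comparison of %a and %b; the 8-byte test below shows the latter form lowered on PPC through cmpld/isel/subf.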
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple=powerpc64le-unknown-gnu-linux < %s | FileCheck %s -check-prefix=CHECK
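; 8 bytes: expanded inline to two byte-reversed loads (ldbrx), an unsigned
; compare, and a subtract of the isel'd compare results; no call to memcmp.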
define signext i32 @memcmp8(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) {
; CHECK-LABEL: memcmp8:
; CHECK: # BB#0:
; CHECK-NEXT: ldbrx 3, 0, 3
; CHECK-NEXT: ldbrx 4, 0, 4
; CHECK-NEXT: li 5, 0
; CHECK-NEXT: cmpld 3, 4
; CHECK-NEXT: li 3, 1
; CHECK-NEXT: isel 4, 3, 5, 1
; CHECK-NEXT: isel 3, 3, 5, 0
; CHECK-NEXT: subf 3, 3, 4
; CHECK-NEXT: extsw 3, 3
; CHECK-NEXT: blr
%t0 = bitcast i32* %buffer1 to i8*
%t1 = bitcast i32* %buffer2 to i8*
%call = tail call signext i32 @memcmp(i8* %t0, i8* %t1, i64 8)
ret i32 %call
}
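; 4 bytes: lwbrx loads; the ult/ugt results are taken from the sign bits of
; the two 64-bit subtractions (extracted with rldicl), then subtracted.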
define signext i32 @memcmp4(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) {
; CHECK-LABEL: memcmp4:
; CHECK: # BB#0:
; CHECK-NEXT: lwbrx 3, 0, 3
; CHECK-NEXT: lwbrx 4, 0, 4
; CHECK-NEXT: sub 5, 4, 3
; CHECK-NEXT: sub 3, 3, 4
; CHECK-NEXT: rldicl 4, 5, 1, 63
; CHECK-NEXT: rldicl 3, 3, 1, 63
; CHECK-NEXT: subf 3, 3, 4
; CHECK-NEXT: extsw 3, 3
; CHECK-NEXT: blr
%t0 = bitcast i32* %buffer1 to i8*
%t1 = bitcast i32* %buffer2 to i8*
%call = tail call signext i32 @memcmp(i8* %t0, i8* %t1, i64 4)
ret i32 %call
}
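; 2 bytes: the zero-extended halfword loads cannot overflow i32, so a single
; subtraction gives the result directly.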
define signext i32 @memcmp2(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) {
; CHECK-LABEL: memcmp2:
; CHECK: # BB#0:
; CHECK-NEXT: lhbrx 3, 0, 3
; CHECK-NEXT: lhbrx 4, 0, 4
; CHECK-NEXT: subf 3, 4, 3
; CHECK-NEXT: extsw 3, 3
; CHECK-NEXT: blr
%t0 = bitcast i32* %buffer1 to i8*
%t1 = bitcast i32* %buffer2 to i8*
%call = tail call signext i32 @memcmp(i8* %t0, i8* %t1, i64 2)
ret i32 %call
}
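; 1 byte: plain byte loads (no byte reversal needed) and a single subtraction.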
define signext i32 @memcmp1(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) {
; CHECK-LABEL: memcmp1:
; CHECK: # BB#0:
; CHECK-NEXT: lbz 3, 0(3)
; CHECK-NEXT: lbz 4, 0(4)
; CHECK-NEXT: subf 3, 4, 3
; CHECK-NEXT: extsw 3, 3
; CHECK-NEXT: blr
%t0 = bitcast i32* %buffer1 to i8*
%t1 = bitcast i32* %buffer2 to i8*
%call = tail call signext i32 @memcmp(i8* %t0, i8* %t1, i64 1)
ret i32 %call
}
declare signext i32 @memcmp(i8*, i8*, i64)