gcc - GNU Compiler Collection

author		Jan Hubicka <hubicka@ucw.cz>	2025-05-03 00:26:29 +0200
committer	Jan Hubicka <hubicka@ucw.cz>	2025-05-03 10:34:09 +0200
commit		20d184e3f84d859e7e9f44a8d91772a02b658872
tree		5181d6ae8374f93c67ac380f6b2acb51541b8deb
parent		Daily bump.

ix86_rtx_costs costs VEC_MERGE by special casing AVX512 mask operations and
otherwise returning cost->sse_op, completely ignoring the costs of the
operands.  Since VEC_MERGE is also used to represent the scalar variant of an
SSE/AVX operation, this means that many instructions (such as SSE conversions)
are often costed as sse_op instead of their real cost.

This patch adds pattern matching for the VEC_MERGE pattern, which also forced
me to add special cases for masked versions and vcmp, since otherwise combine
is confused by the default cost compared to the cost of the recognized version
of the instruction.  Since the important cases should now be handled, I also
added recursion to the remaining cases so that substituting constants and
memory is adequately costed.

gcc/ChangeLog:

	* config/i386/i386.cc (unspec_pcmp_p): New function.
	(ix86_rtx_costs): Cost VEC_MERGE more realistically.
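The idea behind the recursion can be illustrated outside of GCC.  The toy C
sketch below (every type, constant, and function name in it is made up for
illustration and does not exist in the GCC tree) models an expression tree in
which the scalar-SSE shape, a vec_merge of a vec_duplicate selecting lane 0,
is treated as a free wrapper: its cost is the sum of the recursively computed
costs of the real operands rather than a flat per-instruction value.

/* Toy model -- not GCC code; all names here are hypothetical.  */

#include <stdio.h>

enum toy_code { TOY_REG, TOY_MEM, TOY_OP, TOY_VEC_DUPLICATE, TOY_VEC_MERGE };

struct toy_rtx
{
  enum toy_code code;
  const struct toy_rtx *op0, *op1;   /* Sub-expressions, may be NULL.  */
  int selector;                      /* Lane mask of TOY_VEC_MERGE.  */
};

/* Recursively cost X: a real operation costs one unit ("sse_op"),
   a memory operand adds one more unit, registers are free.  */
static int
toy_rtx_cost (const struct toy_rtx *x)
{
  switch (x->code)
    {
    case TOY_REG:
      return 0;
    case TOY_MEM:
      return 1;
    case TOY_OP:
      return 1 + (x->op0 ? toy_rtx_cost (x->op0) : 0)
               + (x->op1 ? toy_rtx_cost (x->op1) : 0);
    case TOY_VEC_DUPLICATE:
      /* Broadcasting the scalar result is considered free here.  */
      return toy_rtx_cost (x->op0);
    case TOY_VEC_MERGE:
      /* Scalar-SSE shape: merging a duplicated result into lane 0.
         The merge itself is free; recurse into both operands instead
         of returning a flat per-instruction cost.  */
      if (x->selector == 1 && x->op0->code == TOY_VEC_DUPLICATE)
        return toy_rtx_cost (x->op0->op0) + toy_rtx_cost (x->op1);
      return 1 + toy_rtx_cost (x->op0) + toy_rtx_cost (x->op1);
    }
  return 0;
}

int
main (void)
{
  struct toy_rtx reg   = { TOY_REG, NULL, NULL, 0 };
  struct toy_rtx mem   = { TOY_MEM, NULL, NULL, 0 };
  struct toy_rtx conv  = { TOY_OP, &mem, NULL, 0 };  /* e.g. a conversion from memory */
  struct toy_rtx dup   = { TOY_VEC_DUPLICATE, &conv, NULL, 0 };
  struct toy_rtx merge = { TOY_VEC_MERGE, &dup, &reg, 1 };

  printf ("cost = %d\n", toy_rtx_cost (&merge));     /* prints "cost = 2" */
  return 0;
}

This prints cost = 2: a flat costing would have reported 1 and hidden the
memory operand of the conversion, which is the kind of difference RTL passes
such as combine need to see when comparing alternatives.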

-rw-r--r-- gcc/config/i386/i386.cc 82

1 file changed, 77 insertions(+), 5 deletions(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 0c808c22b4f0..5ad47e194348 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -22025,6 +22025,15 @@ vec_fp_conversion_cost (const struct processor_costs *cost, int size)
     return cost->vcvtps2pd512;
 }
 
+/* Return true if X is UNSPEC with UNSPEC_PCMP or UNSPEC_UNSIGNED_PCMP.  */
+
+static bool
+unspec_pcmp_p (rtx x)
+{
+  return GET_CODE (x) == UNSPEC
+         && (XINT (x, 1) == UNSPEC_PCMP || XINT (x, 1) == UNSPEC_UNSIGNED_PCMP);
+}
+
 /* Compute a (partial) cost for rtx X.  Return true if the complete
    cost has been computed, and false if subexpressions should be
    scanned.  In either case, *TOTAL contains the cost result.  */
@@ -22807,14 +22816,77 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
 
     case VEC_MERGE:
       mask = XEXP (x, 2);
+      /* Scalar versions of SSE instructions may be represented as:
+
+         (vec_merge (vec_duplicate (operation ....))
+                    (register or memory)
+                    (const_int 1))
+
+         In this case vec_merge and vec_duplicate are for free.
+         Just recurse into the operation and the second operand.  */
+      if (mask == const1_rtx
+          && GET_CODE (XEXP (x, 0)) == VEC_DUPLICATE)
+        {
+          *total = rtx_cost (XEXP (XEXP (x, 0), 0), mode,
+                             outer_code, opno, speed)
+                   + rtx_cost (XEXP (x, 1), mode, outer_code, opno, speed);
+          return true;
+        }
       /* This is a masked instruction, assume the same cost
          as the nonmasked variant.  */
-      if (TARGET_AVX512F && register_operand (mask, GET_MODE (mask)))
-        *total = rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed);
+      else if (TARGET_AVX512F && register_operand (mask, GET_MODE (mask)))
+        {
+          *total = rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed)
+                   + rtx_cost (XEXP (x, 1), mode, outer_code, opno, speed);
+          return true;
+        }
+      /* Combination of the two above:
+
+         (vec_merge (vec_merge (vec_duplicate (operation ...))
+                               (register or memory)
+                               (reg:QI mask))
+                    (register or memory)
+                    (const_int 1))
+
+         i.e. avx512fp16_vcvtss2sh_mask.  */
+      else if (TARGET_AVX512F
+               && mask == const1_rtx
+               && GET_CODE (XEXP (x, 0)) == VEC_MERGE
+               && GET_CODE (XEXP (XEXP (x, 0), 0)) == VEC_DUPLICATE
+               && register_operand (XEXP (XEXP (x, 0), 2),
+                                    GET_MODE (XEXP (XEXP (x, 0), 2))))
+        {
+          *total = rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0),
+                             mode, outer_code, opno, speed)
+                   + rtx_cost (XEXP (XEXP (x, 0), 1),
+                               mode, outer_code, opno, speed)
+                   + rtx_cost (XEXP (x, 1), mode, outer_code, opno, speed);
+          return true;
+        }
+      /* vcmp.  */
+      else if (unspec_pcmp_p (mask)
+               || (GET_CODE (mask) == NOT
+                   && unspec_pcmp_p (XEXP (mask, 0))))
+        {
+          rtx uns = GET_CODE (mask) == NOT ? XEXP (mask, 0) : mask;
+          rtx unsop0 = XVECEXP (uns, 0, 0);
+          /* Make (subreg:V4SI (not:V16QI (reg:V16QI ..)) 0)
+             cost the same as a register.
+             This is used by avx_cmp3_ltint_not.  */
+          if (GET_CODE (unsop0) == SUBREG)
+            unsop0 = XEXP (unsop0, 0);
+          if (GET_CODE (unsop0) == NOT)
+            unsop0 = XEXP (unsop0, 0);
+          *total = rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed)
+                   + rtx_cost (XEXP (x, 1), mode, outer_code, opno, speed)
+                   + rtx_cost (unsop0, mode, UNSPEC, opno, speed)
+                   + rtx_cost (XVECEXP (uns, 0, 1), mode, UNSPEC, opno, speed)
+                   + cost->sse_op;
+          return true;
+        }
       else
-        /* ??? We should still recurse when computing cost.  */
         *total = cost->sse_op;
-      return true;
+      return false;
 
     case MEM:
       /* CONST_VECTOR_DUPLICATE_P in constant_pool is just broadcast.
@@ -22831,7 +22903,7 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
         }
 
       /* An insn that accesses memory is slightly more expensive
          than one that does not.  */
       if (speed)
         {
           *total += 1;