Implement `_mm256_permute2f128_ps` and `_mm256_permute2f128_pd` intrinsics · rust-lang/rust@6a53ace (original) (raw)
`@@ -172,8 +172,12 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
`
172
172
`}
`
173
173
`}
`
174
174
`}
`
175
``
`-
"llvm.x86.avx2.vperm2i128" => {
`
``
175
`+
"llvm.x86.avx2.vperm2i128"
`
``
176
`+
| "llvm.x86.avx.vperm2f128.ps.256"
`
``
177
`+
| "llvm.x86.avx.vperm2f128.pd.256" => {
`
176
178
`
``
179
`+
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_ps
`
``
180
`+
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_pd
`
177
181
`let (a, b, imm8) = match args {
`
178
182
`[a, b, imm8] => (a, b, imm8),
`
179
183
` _ => bug!("wrong number of args for intrinsic {intrinsic}"),
`
`@@ -182,19 +186,11 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
`
182
186
`let b = codegen_operand(fx, b);
`
183
187
`let imm8 = codegen_operand(fx, imm8).load_scalar(fx);
`
184
188
``
185
``
`-
let a_0 = a.value_lane(fx, 0).load_scalar(fx);
`
186
``
`-
let a_1 = a.value_lane(fx, 1).load_scalar(fx);
`
187
``
`-
let a_low = fx.bcx.ins().iconcat(a_0, a_1);
`
188
``
`-
let a_2 = a.value_lane(fx, 2).load_scalar(fx);
`
189
``
`-
let a_3 = a.value_lane(fx, 3).load_scalar(fx);
`
190
``
`-
let a_high = fx.bcx.ins().iconcat(a_2, a_3);
`
``
189
`+
let a_low = a.value_typed_lane(fx, fx.tcx.types.u128, 0).load_scalar(fx);
`
``
190
`+
let a_high = a.value_typed_lane(fx, fx.tcx.types.u128, 1).load_scalar(fx);
`
191
191
``
192
``
`-
let b_0 = b.value_lane(fx, 0).load_scalar(fx);
`
193
``
`-
let b_1 = b.value_lane(fx, 1).load_scalar(fx);
`
194
``
`-
let b_low = fx.bcx.ins().iconcat(b_0, b_1);
`
195
``
`-
let b_2 = b.value_lane(fx, 2).load_scalar(fx);
`
196
``
`-
let b_3 = b.value_lane(fx, 3).load_scalar(fx);
`
197
``
`-
let b_high = fx.bcx.ins().iconcat(b_2, b_3);
`
``
192
`+
let b_low = b.value_typed_lane(fx, fx.tcx.types.u128, 0).load_scalar(fx);
`
``
193
`+
let b_high = b.value_typed_lane(fx, fx.tcx.types.u128, 1).load_scalar(fx);
`
198
194
``
199
195
`fn select4(
`
200
196
`fx: &mut FunctionCx<'_, '_, '_>,
`
`@@ -219,16 +215,20 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
`
219
215
``
220
216
`let control0 = imm8;
`
221
217
`let res_low = select4(fx, a_high, a_low, b_high, b_low, control0);
`
222
``
`-
let (res_0, res_1) = fx.bcx.ins().isplit(res_low);
`
223
218
``
224
219
`let control1 = fx.bcx.ins().ushr_imm(imm8, 4);
`
225
220
`let res_high = select4(fx, a_high, a_low, b_high, b_low, control1);
`
226
``
`-
let (res_2, res_3) = fx.bcx.ins().isplit(res_high);
`
227
221
``
228
``
`-
ret.place_lane(fx, 0).to_ptr().store(fx, res_0, MemFlags::trusted());
`
229
``
`-
ret.place_lane(fx, 1).to_ptr().store(fx, res_1, MemFlags::trusted());
`
230
``
`-
ret.place_lane(fx, 2).to_ptr().store(fx, res_2, MemFlags::trusted());
`
231
``
`-
ret.place_lane(fx, 3).to_ptr().store(fx, res_3, MemFlags::trusted());
`
``
222
`+
ret.place_typed_lane(fx, fx.tcx.types.u128, 0).to_ptr().store(
`
``
223
`+
fx,
`
``
224
`+
res_low,
`
``
225
`+
MemFlags::trusted(),
`
``
226
`+
);
`
``
227
`+
ret.place_typed_lane(fx, fx.tcx.types.u128, 1).to_ptr().store(
`
``
228
`+
fx,
`
``
229
`+
res_high,
`
``
230
`+
MemFlags::trusted(),
`
``
231
`+
);
`
232
232
`}
`
233
233
`"llvm.x86.ssse3.pabs.b.128" | "llvm.x86.ssse3.pabs.w.128" | "llvm.x86.ssse3.pabs.d.128" => {
`
234
234
`let a = match args {
`