Implement _mm256_permute2f128_ps and _mm256_permute2f128_pd intrinsics · rust-lang/rust@6a53ace

`@@ -172,8 +172,12 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(

`

172

172

`}

`

173

173

`}

`

174

174

`}

`

175

``

`-

"llvm.x86.avx2.vperm2i128" => {

`

``

175

`+

"llvm.x86.avx2.vperm2i128"

`

``

176

`+

| "llvm.x86.avx.vperm2f128.ps.256"

`

``

177

`+

| "llvm.x86.avx.vperm2f128.pd.256" => {

`

176

178

`// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2x128_si256

`

``

179

`+

// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_ps

`

``

180

`+

// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_pd

`

177

181

`let (a, b, imm8) = match args {

`

178

182

`[a, b, imm8] => (a, b, imm8),

`

179

183

` _ => bug!("wrong number of args for intrinsic {intrinsic}"),

`

`@@ -182,19 +186,11 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(

`

182

186

`let b = codegen_operand(fx, b);

`

183

187

`let imm8 = codegen_operand(fx, imm8).load_scalar(fx);

`

184

188

``

185

``

`-

let a_0 = a.value_lane(fx, 0).load_scalar(fx);

`

186

``

`-

let a_1 = a.value_lane(fx, 1).load_scalar(fx);

`

187

``

`-

let a_low = fx.bcx.ins().iconcat(a_0, a_1);

`

188

``

`-

let a_2 = a.value_lane(fx, 2).load_scalar(fx);

`

189

``

`-

let a_3 = a.value_lane(fx, 3).load_scalar(fx);

`

190

``

`-

let a_high = fx.bcx.ins().iconcat(a_2, a_3);

`

``

189

`+

let a_low = a.value_typed_lane(fx, fx.tcx.types.u128, 0).load_scalar(fx);

`

``

190

`+

let a_high = a.value_typed_lane(fx, fx.tcx.types.u128, 1).load_scalar(fx);

`

191

191

``

192

``

`-

let b_0 = b.value_lane(fx, 0).load_scalar(fx);

`

193

``

`-

let b_1 = b.value_lane(fx, 1).load_scalar(fx);

`

194

``

`-

let b_low = fx.bcx.ins().iconcat(b_0, b_1);

`

195

``

`-

let b_2 = b.value_lane(fx, 2).load_scalar(fx);

`

196

``

`-

let b_3 = b.value_lane(fx, 3).load_scalar(fx);

`

197

``

`-

let b_high = fx.bcx.ins().iconcat(b_2, b_3);

`

``

192

`+

let b_low = b.value_typed_lane(fx, fx.tcx.types.u128, 0).load_scalar(fx);

`

``

193

`+

let b_high = b.value_typed_lane(fx, fx.tcx.types.u128, 1).load_scalar(fx);

`

198

194

``

199

195

`fn select4(

`

200

196

`fx: &mut FunctionCx<'_, '_, '_>,

`

`@@ -219,16 +215,20 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(

`

219

215

``

220

216

`let control0 = imm8;

`

221

217

`let res_low = select4(fx, a_high, a_low, b_high, b_low, control0);

`

222

``

`-

let (res_0, res_1) = fx.bcx.ins().isplit(res_low);

`

223

218

``

224

219

`let control1 = fx.bcx.ins().ushr_imm(imm8, 4);

`

225

220

`let res_high = select4(fx, a_high, a_low, b_high, b_low, control1);

`

226

``

`-

let (res_2, res_3) = fx.bcx.ins().isplit(res_high);

`

227

221

``

228

``

`-

ret.place_lane(fx, 0).to_ptr().store(fx, res_0, MemFlags::trusted());

`

229

``

`-

ret.place_lane(fx, 1).to_ptr().store(fx, res_1, MemFlags::trusted());

`

230

``

`-

ret.place_lane(fx, 2).to_ptr().store(fx, res_2, MemFlags::trusted());

`

231

``

`-

ret.place_lane(fx, 3).to_ptr().store(fx, res_3, MemFlags::trusted());

`

``

222

`+

ret.place_typed_lane(fx, fx.tcx.types.u128, 0).to_ptr().store(

`

``

223

`+

fx,

`

``

224

`+

res_low,

`

``

225

`+

MemFlags::trusted(),

`

``

226

`+

);

`

``

227

`+

ret.place_typed_lane(fx, fx.tcx.types.u128, 1).to_ptr().store(

`

``

228

`+

fx,

`

``

229

`+

res_high,

`

``

230

`+

MemFlags::trusted(),

`

``

231

`+

);

`

232

232

`}

`

233

233

`"llvm.x86.ssse3.pabs.b.128" | "llvm.x86.ssse3.pabs.w.128" | "llvm.x86.ssse3.pabs.d.128" => {

`

234

234

`let a = match args {

`