Implement all SSE intrinsics used by the jpeg-decoder crate · rust-lang/rust@61e38ce

```diff
@@ -413,6 +413,77 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
                 ret.place_lane(fx, out_lane_idx).write_cvalue(fx, res_lane);
             }
         }
+
+        "llvm.x86.ssse3.pmul.hr.sw.128" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_epi16&ig_expand=4782
+            intrinsic_args!(fx, args => (a, b); intrinsic);
+
+            assert_eq!(a.layout(), b.layout());
+            let layout = a.layout();
+
+            let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
+            let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
+            assert_eq!(lane_ty, fx.tcx.types.i16);
+            assert_eq!(ret_lane_ty, fx.tcx.types.i16);
+            assert_eq!(lane_count, ret_lane_count);
+
+            let ret_lane_layout = fx.layout_of(fx.tcx.types.i16);
+            for out_lane_idx in 0..lane_count {
+                let a_lane = a.value_lane(fx, out_lane_idx).load_scalar(fx);
+                let a_lane = fx.bcx.ins().sextend(types::I32, a_lane);
+                let b_lane = b.value_lane(fx, out_lane_idx).load_scalar(fx);
+                let b_lane = fx.bcx.ins().sextend(types::I32, b_lane);
+
+                let mul: Value = fx.bcx.ins().imul(a_lane, b_lane);
+                let shifted = fx.bcx.ins().ushr_imm(mul, 14);
+                let incremented = fx.bcx.ins().iadd_imm(shifted, 1);
+                let shifted_again = fx.bcx.ins().ushr_imm(incremented, 1);
+
+                let res_lane = fx.bcx.ins().ireduce(types::I16, shifted_again);
+                let res_lane = CValue::by_val(res_lane, ret_lane_layout);
+
+                ret.place_lane(fx, out_lane_idx).write_cvalue(fx, res_lane);
+            }
+        }
```
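Not part of the commit, but as a sanity check on the lane arithmetic above: a scalar model (hypothetical `mulhrs_lane` helper) of what the generated CLIF computes per lane, following the Intel pseudocode for `_mm_mulhrs_epi16`. The logical shifts are fine here because only the low 16 bits of the final value survive the `ireduce`.

```rust
// Hypothetical scalar model of one lane of _mm_mulhrs_epi16; not part of the commit.
fn mulhrs_lane(a: i16, b: i16) -> i16 {
    let mul = (a as i32) * (b as i32); // sextend + imul
    let shifted = (mul as u32) >> 14;  // ushr_imm(mul, 14)
    let incremented = shifted + 1;     // iadd_imm(shifted, 1)
    let shifted_again = incremented >> 1; // ushr_imm(incremented, 1)
    shifted_again as u16 as i16        // ireduce to I16 keeps the low 16 bits
}

fn main() {
    // 0x4000 is 0.5 in Q15 fixed point, so 0.5 * 0.5 rounds to 0.25 (0x2000)
    // and -0.5 * 0.5 rounds to -0.25.
    assert_eq!(mulhrs_lane(0x4000, 0x4000), 0x2000);
    assert_eq!(mulhrs_lane(-0x4000, 0x4000), -0x2000);
    println!("ok");
}
```

The diff continues with the `llvm.x86.sse2.packuswb.128` arm: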

```diff
+
+        "llvm.x86.sse2.packuswb.128" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi16&ig_expand=4903
+            intrinsic_args!(fx, args => (a, b); intrinsic);
+
+            assert_eq!(a.layout(), b.layout());
+            let layout = a.layout();
+
+            let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
+            let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
+            assert_eq!(lane_ty, fx.tcx.types.i16);
+            assert_eq!(ret_lane_ty, fx.tcx.types.u8);
+            assert_eq!(lane_count * 2, ret_lane_count);
+
+            let zero = fx.bcx.ins().iconst(types::I16, 0);
+            let max_u8 = fx.bcx.ins().iconst(types::I16, 255);
+            let ret_lane_layout = fx.layout_of(fx.tcx.types.u8);
+
+            for idx in 0..lane_count {
+                let lane = a.value_lane(fx, idx).load_scalar(fx);
+                let sat = fx.bcx.ins().smax(lane, zero);
+                let sat = fx.bcx.ins().umin(sat, max_u8);
+                let res = fx.bcx.ins().ireduce(types::I8, sat);
+
+                let res_lane = CValue::by_val(res, ret_lane_layout);
+                ret.place_lane(fx, idx).write_cvalue(fx, res_lane);
+            }
+
+            for idx in 0..lane_count {
+                let lane = b.value_lane(fx, idx).load_scalar(fx);
+                let sat = fx.bcx.ins().smax(lane, zero);
+                let sat = fx.bcx.ins().umin(sat, max_u8);
+                let res = fx.bcx.ins().ireduce(types::I8, sat);
+
+                let res_lane = CValue::by_val(res, ret_lane_layout);
+                ret.place_lane(fx, lane_count + idx).write_cvalue(fx, res_lane);
+            }
+        }
+
         _ => {
             fx.tcx
                 .sess
```