Implement all vendor intrinsics used by the fimg crate · rust-lang/rust@ecf79a3 (original) (raw)

`@@ -494,6 +494,160 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(

`

494

494

`}

`

495

495

`}

`

496

496

``

``

497

`+

"llvm.x86.avx2.packuswb" => {

`

``

498

`+

// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi16&ig_expand=4906

`

``

499

`+

intrinsic_args!(fx, args => (a, b); intrinsic);

`

``

500

+

``

501

`+

assert_eq!(a.layout(), b.layout());

`

``

502

`+

let layout = a.layout();

`

``

503

+

``

504

`+

let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);

`

``

505

`+

let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);

`

``

506

`+

assert_eq!(lane_ty, fx.tcx.types.i16);

`

``

507

`+

assert_eq!(ret_lane_ty, fx.tcx.types.u8);

`

``

508

`+

assert_eq!(lane_count * 2, ret_lane_count);

`

``

509

+

``

510

`+

let zero = fx.bcx.ins().iconst(types::I16, 0);

`

``

511

`+

let max_u8 = fx.bcx.ins().iconst(types::I16, 255);

`

``

512

`+

let ret_lane_layout = fx.layout_of(fx.tcx.types.u8);

`

``

513

+

``

514

`+

for idx in 0..lane_count / 2 {

`

``

515

`+

let lane = a.value_lane(fx, idx).load_scalar(fx);

`

``

516

`+

let sat = fx.bcx.ins().smax(lane, zero);

`

``

517

`+

let sat = fx.bcx.ins().umin(sat, max_u8);

`

``

518

`+

let res = fx.bcx.ins().ireduce(types::I8, sat);

`

``

519

+

``

520

`+

let res_lane = CValue::by_val(res, ret_lane_layout);

`

``

521

`+

ret.place_lane(fx, idx).write_cvalue(fx, res_lane);

`

``

522

`+

}

`

``

523

+

``

524

`+

for idx in 0..lane_count / 2 {

`

``

525

`+

let lane = b.value_lane(fx, idx).load_scalar(fx);

`

``

526

`+

let sat = fx.bcx.ins().smax(lane, zero);

`

``

527

`+

let sat = fx.bcx.ins().umin(sat, max_u8);

`

``

528

`+

let res = fx.bcx.ins().ireduce(types::I8, sat);

`

``

529

+

``

530

`+

let res_lane = CValue::by_val(res, ret_lane_layout);

`

``

531

`+

ret.place_lane(fx, lane_count / 2 + idx).write_cvalue(fx, res_lane);

`

``

532

`+

}

`

``

533

+

``

534

`+

for idx in 0..lane_count / 2 {

`

``

535

`+

let lane = a.value_lane(fx, idx).load_scalar(fx);

`

``

536

`+

let sat = fx.bcx.ins().smax(lane, zero);

`

``

537

`+

let sat = fx.bcx.ins().umin(sat, max_u8);

`

``

538

`+

let res = fx.bcx.ins().ireduce(types::I8, sat);

`

``

539

+

``

540

`+

let res_lane = CValue::by_val(res, ret_lane_layout);

`

``

541

`+

ret.place_lane(fx, lane_count / 2 * 2 + idx).write_cvalue(fx, res_lane);

`

``

542

`+

}

`

``

543

+

``

544

`+

for idx in 0..lane_count / 2 {

`

``

545

`+

let lane = b.value_lane(fx, idx).load_scalar(fx);

`

``

546

`+

let sat = fx.bcx.ins().smax(lane, zero);

`

``

547

`+

let sat = fx.bcx.ins().umin(sat, max_u8);

`

``

548

`+

let res = fx.bcx.ins().ireduce(types::I8, sat);

`

``

549

+

``

550

`+

let res_lane = CValue::by_val(res, ret_lane_layout);

`

``

551

`+

ret.place_lane(fx, lane_count / 2 * 3 + idx).write_cvalue(fx, res_lane);

`

``

552

`+

}

`

``

553

`+

}

`

``

554

+

``

555

`+

"llvm.x86.sse2.packssdw.128" => {

`

``

556

`+

// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi32&ig_expand=4889

`

``

557

`+

intrinsic_args!(fx, args => (a, b); intrinsic);

`

``

558

+

``

559

`+

assert_eq!(a.layout(), b.layout());

`

``

560

`+

let layout = a.layout();

`

``

561

+

``

562

`+

let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);

`

``

563

`+

let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);

`

``

564

`+

assert_eq!(lane_ty, fx.tcx.types.i32);

`

``

565

`+

assert_eq!(ret_lane_ty, fx.tcx.types.i16);

`

``

566

`+

assert_eq!(lane_count * 2, ret_lane_count);

`

``

567

+

``

568

`+

let min_i16 = fx.bcx.ins().iconst(types::I32, i64::from(i16::MIN as u16));

`

``

569

`+

let max_i16 = fx.bcx.ins().iconst(types::I32, i64::from(i16::MAX as u16));

`

``

570

`+

let ret_lane_layout = fx.layout_of(fx.tcx.types.i16);

`

``

571

+

``

572

`+

for idx in 0..lane_count {

`

``

573

`+

let lane = a.value_lane(fx, idx).load_scalar(fx);

`

``

574

`+

let sat = fx.bcx.ins().smax(lane, min_i16);

`

``

575

`+

let sat = fx.bcx.ins().umin(sat, max_i16);

`

``

576

`+

let res = fx.bcx.ins().ireduce(types::I16, sat);

`

``

577

+

``

578

`+

let res_lane = CValue::by_val(res, ret_lane_layout);

`

``

579

`+

ret.place_lane(fx, idx).write_cvalue(fx, res_lane);

`

``

580

`+

}

`

``

581

+

``

582

`+

for idx in 0..lane_count {

`

``

583

`+

let lane = b.value_lane(fx, idx).load_scalar(fx);

`

``

584

`+

let sat = fx.bcx.ins().smax(lane, min_i16);

`

``

585

`+

let sat = fx.bcx.ins().umin(sat, max_i16);

`

``

586

`+

let res = fx.bcx.ins().ireduce(types::I16, sat);

`

``

587

+

``

588

`+

let res_lane = CValue::by_val(res, ret_lane_layout);

`

``

589

`+

ret.place_lane(fx, lane_count + idx).write_cvalue(fx, res_lane);

`

``

590

`+

}

`

``

591

`+

}

`

``

592

+

``

593

`+

"llvm.x86.avx2.packssdw" => {

`

``

594

`+

// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi32&ig_expand=4892

`

``

595

`+

intrinsic_args!(fx, args => (a, b); intrinsic);

`

``

596

+

``

597

`+

assert_eq!(a.layout(), b.layout());

`

``

598

`+

let layout = a.layout();

`

``

599

+

``

600

`+

let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);

`

``

601

`+

let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);

`

``

602

`+

assert_eq!(lane_ty, fx.tcx.types.i32);

`

``

603

`+

assert_eq!(ret_lane_ty, fx.tcx.types.i16);

`

``

604

`+

assert_eq!(lane_count * 2, ret_lane_count);

`

``

605

+

``

606

`+

let min_i16 = fx.bcx.ins().iconst(types::I32, i64::from(i16::MIN as u16));

`

``

607

`+

let max_i16 = fx.bcx.ins().iconst(types::I32, i64::from(i16::MAX as u16));

`

``

608

`+

let ret_lane_layout = fx.layout_of(fx.tcx.types.i16);

`

``

609

+

``

610

`+

for idx in 0..lane_count / 2 {

`

``

611

`+

let lane = a.value_lane(fx, idx).load_scalar(fx);

`

``

612

`+

let sat = fx.bcx.ins().smax(lane, min_i16);

`

``

613

`+

let sat = fx.bcx.ins().umin(sat, max_i16);

`

``

614

`+

let res = fx.bcx.ins().ireduce(types::I16, sat);

`

``

615

+

``

616

`+

let res_lane = CValue::by_val(res, ret_lane_layout);

`

``

617

`+

ret.place_lane(fx, idx).write_cvalue(fx, res_lane);

`

``

618

`+

}

`

``

619

+

``

620

`+

for idx in 0..lane_count / 2 {

`

``

621

`+

let lane = b.value_lane(fx, idx).load_scalar(fx);

`

``

622

`+

let sat = fx.bcx.ins().smax(lane, min_i16);

`

``

623

`+

let sat = fx.bcx.ins().umin(sat, max_i16);

`

``

624

`+

let res = fx.bcx.ins().ireduce(types::I16, sat);

`

``

625

+

``

626

`+

let res_lane = CValue::by_val(res, ret_lane_layout);

`

``

627

`+

ret.place_lane(fx, lane_count / 2 + idx).write_cvalue(fx, res_lane);

`

``

628

`+

}

`

``

629

+

``

630

`+

for idx in 0..lane_count / 2 {

`

``

631

`+

let lane = a.value_lane(fx, idx).load_scalar(fx);

`

``

632

`+

let sat = fx.bcx.ins().smax(lane, min_i16);

`

``

633

`+

let sat = fx.bcx.ins().umin(sat, max_i16);

`

``

634

`+

let res = fx.bcx.ins().ireduce(types::I16, sat);

`

``

635

+

``

636

`+

let res_lane = CValue::by_val(res, ret_lane_layout);

`

``

637

`+

ret.place_lane(fx, lane_count / 2 * 2 + idx).write_cvalue(fx, res_lane);

`

``

638

`+

}

`

``

639

+

``

640

`+

for idx in 0..lane_count / 2 {

`

``

641

`+

let lane = b.value_lane(fx, idx).load_scalar(fx);

`

``

642

`+

let sat = fx.bcx.ins().smax(lane, min_i16);

`

``

643

`+

let sat = fx.bcx.ins().umin(sat, max_i16);

`

``

644

`+

let res = fx.bcx.ins().ireduce(types::I16, sat);

`

``

645

+

``

646

`+

let res_lane = CValue::by_val(res, ret_lane_layout);

`

``

647

`+

ret.place_lane(fx, lane_count / 2 * 3 + idx).write_cvalue(fx, res_lane);

`

``

648

`+

}

`

``

649

`+

}

`

``

650

+

497

651

` _ => {

`

498

652

` fx.tcx

`

499

653

`.sess

`