Implement all vendor intrinsics used by the simd-json crate · rust-lang/rust@8649731

@@ -590,6 +590,44 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
             }
         }

+        "llvm.x86.sse41.packusdw" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi32&ig_expand=4912
+            intrinsic_args!(fx, args => (a, b); intrinsic);
+
+            assert_eq!(a.layout(), b.layout());
+            let layout = a.layout();
+
+            let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
+            let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
+            assert_eq!(lane_ty, fx.tcx.types.i32);
+            assert_eq!(ret_lane_ty, fx.tcx.types.u16);
+            assert_eq!(lane_count * 2, ret_lane_count);
+
+            let min_u16 = fx.bcx.ins().iconst(types::I32, i64::from(u16::MIN));
+            let max_u16 = fx.bcx.ins().iconst(types::I32, i64::from(u16::MAX));
+            let ret_lane_layout = fx.layout_of(fx.tcx.types.u16);
+
+            for idx in 0..lane_count {
+                let lane = a.value_lane(fx, idx).load_scalar(fx);
+                let sat = fx.bcx.ins().umax(lane, min_u16);
+                let sat = fx.bcx.ins().umin(sat, max_u16);
+                let res = fx.bcx.ins().ireduce(types::I16, sat);
+
+                let res_lane = CValue::by_val(res, ret_lane_layout);
+                ret.place_lane(fx, idx).write_cvalue(fx, res_lane);
+            }
+
+            for idx in 0..lane_count {
+                let lane = b.value_lane(fx, idx).load_scalar(fx);
+                let sat = fx.bcx.ins().umax(lane, min_u16);
+                let sat = fx.bcx.ins().umin(sat, max_u16);
+                let res = fx.bcx.ins().ireduce(types::I16, sat);
+
+                let res_lane = CValue::by_val(res, ret_lane_layout);
+                ret.place_lane(fx, lane_count + idx).write_cvalue(fx, res_lane);
+            }
+        }
+
         "llvm.x86.avx2.packssdw" => {
             // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi32&ig_expand=4892
             intrinsic_args!(fx, args => (a, b); intrinsic);
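For reference (not part of the commit): a plain-Rust scalar sketch of what `_mm_packus_epi32` computes, per the Intel guide linked in the hunk above. Each `i32` lane is saturated to the `u16` range and narrowed, with `a`'s lanes landing in the low half of the result and `b`'s in the high half, matching the `idx` / `lane_count + idx` writes via `place_lane`. The function name is hypothetical.

```rust
/// Scalar model of _mm_packus_epi32 (llvm.x86.sse41.packusdw).
/// Per the Intel guide, saturation is signed-to-unsigned: negative
/// inputs clamp to 0 and values above u16::MAX clamp to u16::MAX.
fn packus_epi32_ref(a: [i32; 4], b: [i32; 4]) -> [u16; 8] {
    fn sat(x: i32) -> u16 {
        x.clamp(0, u16::MAX as i32) as u16
    }
    let mut out = [0u16; 8];
    for i in 0..4 {
        out[i] = sat(a[i]); // a fills lanes 0..4 (low half)
        out[4 + i] = sat(b[i]); // b fills lanes 4..8 (high half)
    }
    out
}

fn main() {
    assert_eq!(
        packus_epi32_ref([-1, 0, 70_000, 42], [1, 2, 3, 4]),
        [0, 0, u16::MAX, 42, 1, 2, 3, 4]
    );
}
```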

@@ -648,6 +686,106 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
             }
         }

+        "llvm.x86.pclmulqdq" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clmulepi64_si128&ig_expand=772
+            intrinsic_args!(fx, args => (a, b, imm8); intrinsic);
+
+            assert_eq!(a.layout(), b.layout());
+            let layout = a.layout();
+
+            let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
+            let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
+            assert_eq!(lane_ty, fx.tcx.types.i64);
+            assert_eq!(ret_lane_ty, fx.tcx.types.i64);
+            assert_eq!(lane_count, 2);
+            assert_eq!(ret_lane_count, 2);
+
+            let imm8 = imm8.load_scalar(fx);
+
+            let control0 = fx.bcx.ins().band_imm(imm8, 0b0000_0001);
+            let a_lane0 = a.value_lane(fx, 0).load_scalar(fx);
+            let a_lane1 = a.value_lane(fx, 1).load_scalar(fx);
+            let temp1 = fx.bcx.ins().select(control0, a_lane1, a_lane0);
+
+            let control4 = fx.bcx.ins().band_imm(imm8, 0b0001_0000);
+            let b_lane0 = b.value_lane(fx, 0).load_scalar(fx);
+            let b_lane1 = b.value_lane(fx, 1).load_scalar(fx);
+            let temp2 = fx.bcx.ins().select(control4, b_lane1, b_lane0);
+
+            fn extract_bit(fx: &mut FunctionCx<'_, '_, '_>, val: Value, bit: i64) -> Value {
+                let tmp = fx.bcx.ins().ushr_imm(val, bit);
+                fx.bcx.ins().band_imm(tmp, 1)
+            }
+
+            let mut res1 = fx.bcx.ins().iconst(types::I64, 0);
+            for i in 0..=63 {
+                let x = extract_bit(fx, temp1, 0);
+                let y = extract_bit(fx, temp2, i);
+                let mut temp = fx.bcx.ins().band(x, y);
+                for j in 1..=i {
+                    let x = extract_bit(fx, temp1, j);
+                    let y = extract_bit(fx, temp2, i - j);
+                    let z = fx.bcx.ins().band(x, y);
+                    temp = fx.bcx.ins().bxor(temp, z);
+                }
+                let temp = fx.bcx.ins().ishl_imm(temp, i);
+                res1 = fx.bcx.ins().bor(res1, temp);
+            }
+            ret.place_lane(fx, 0).to_ptr().store(fx, res1, MemFlags::trusted());
+
+            let mut res2 = fx.bcx.ins().iconst(types::I64, 0);
+            for i in 64..=127 {
+                let mut temp = fx.bcx.ins().iconst(types::I64, 0);
+                for j in i - 63..=63 {
+                    let x = extract_bit(fx, temp1, j);
+                    let y = extract_bit(fx, temp2, i - j);
+                    let z = fx.bcx.ins().band(x, y);
+                    temp = fx.bcx.ins().bxor(temp, z);
+                }
+                let temp = fx.bcx.ins().ishl_imm(temp, i);
+                res2 = fx.bcx.ins().bor(res2, temp);
+            }
+            ret.place_lane(fx, 1).to_ptr().store(fx, res2, MemFlags::trusted());
+        }
+
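For reference (not part of the commit): the nested loops above compute a carry-less multiply bit by bit — bit `i` of the 128-bit product is the XOR of all `temp1[j] & temp2[i - j]`. The `ishl_imm(temp, i)` with `i` in `64..=127` appears to rely on Cranelift masking shift amounts to the operand width, so an `I64` shift by `i` effectively shifts by `i - 64`, placing the bit inside the high result lane. Below is a plain-Rust sketch of `_mm_clmulepi64_si128`'s documented behavior that instead accumulates XOR-combined shifted copies of one operand in a `u128`; the function name is hypothetical.

```rust
/// Scalar model of _mm_clmulepi64_si128 (llvm.x86.pclmulqdq):
/// a 64x64 -> 128-bit carry-less multiply of one lane of `a` and one
/// lane of `b`, selected by bits 0 and 4 of `imm8`.
fn clmulepi64_ref(a: [u64; 2], b: [u64; 2], imm8: u8) -> [u64; 2] {
    let x = a[(imm8 & 0b0000_0001) as usize]; // bit 0 picks a's lane
    let y = b[((imm8 >> 4) & 0b0000_0001) as usize]; // bit 4 picks b's lane
    let mut res: u128 = 0;
    for i in 0..64 {
        // Each set bit of x contributes a shifted copy of y; contributions
        // are combined with XOR instead of addition, so no carries occur.
        if (x >> i) & 1 == 1 {
            res ^= (y as u128) << i;
        }
    }
    [res as u64, (res >> 64) as u64] // low lane, high lane
}

fn main() {
    // Over GF(2): (x + 1) * (x + 1) = x^2 + 1, i.e. 0b11 clmul 0b11 = 0b101.
    assert_eq!(clmulepi64_ref([0b11, 0], [0b11, 0], 0x00), [0b101, 0]);
}
```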

+        "llvm.x86.avx.ptestz.256" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_si256&ig_expand=6945
+            intrinsic_args!(fx, args => (a, b); intrinsic);
+
+            assert_eq!(a.layout(), b.layout());
+            let layout = a.layout();
+
+            let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
+            assert_eq!(lane_ty, fx.tcx.types.i64);
+            assert_eq!(ret.layout().ty, fx.tcx.types.i32);
+            assert_eq!(lane_count, 4);
+
+            let a_lane0 = a.value_lane(fx, 0).load_scalar(fx);
+            let a_lane1 = a.value_lane(fx, 1).load_scalar(fx);
+            let a_lane2 = a.value_lane(fx, 2).load_scalar(fx);
+            let a_lane3 = a.value_lane(fx, 3).load_scalar(fx);
+            let b_lane0 = b.value_lane(fx, 0).load_scalar(fx);
+            let b_lane1 = b.value_lane(fx, 1).load_scalar(fx);
+            let b_lane2 = b.value_lane(fx, 2).load_scalar(fx);
+            let b_lane3 = b.value_lane(fx, 3).load_scalar(fx);
+
+            let zero0 = fx.bcx.ins().band(a_lane0, b_lane0);
+            let zero1 = fx.bcx.ins().band(a_lane1, b_lane1);
+            let zero2 = fx.bcx.ins().band(a_lane2, b_lane2);
+            let zero3 = fx.bcx.ins().band(a_lane3, b_lane3);
+
+            let all_zero0 = fx.bcx.ins().bor(zero0, zero1);
+            let all_zero1 = fx.bcx.ins().bor(zero2, zero3);
+            let all_zero = fx.bcx.ins().bor(all_zero0, all_zero1);
+
+            let res = fx.bcx.ins().icmp_imm(IntCC::Equal, all_zero, 0);
+            let res = CValue::by_val(
+                fx.bcx.ins().uextend(types::I32, res),
+                fx.layout_of(fx.tcx.types.i32),
+            );
+            ret.write_cvalue(fx, res);
+        }
+
         _ => {
             fx.tcx
                 .sess
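For reference (not part of the commit): `_mm256_testz_si256` computes the ZF result of VPTEST — 1 if `a & b` is zero across all 256 bits, else 0 — which is what the lane-wise `band`/`bor` reduction and the final `icmp_imm` against 0 implement above. A plain-Rust sketch with a hypothetical name:

```rust
/// Scalar model of _mm256_testz_si256 (llvm.x86.avx.ptestz.256):
/// returns 1 if the 256-bit AND of `a` and `b` is all zeros, else 0.
fn testz_si256_ref(a: [u64; 4], b: [u64; 4]) -> i32 {
    let any_bit_set = a.iter().zip(b.iter()).any(|(x, y)| x & y != 0);
    (!any_bit_set) as i32
}

fn main() {
    assert_eq!(testz_si256_ref([0b1010; 4], [0b0101; 4]), 1); // disjoint bits
    assert_eq!(testz_si256_ref([0b1010; 4], [0b0010; 4]), 0); // overlap in bit 1
}
```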