Implement all SSE intrinsics used by the jpeg-decoder crate · rust-lang/rust@61e38ce
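The hunk below extends the intrinsic match in `codegen_x86_llvm_intrinsic_call` with lane-by-lane lowerings for two LLVM vendor intrinsics reached from `jpeg-decoder`'s SIMD paths: `llvm.x86.ssse3.pmul.hr.sw.128` (behind `_mm_mulhrs_epi16`) and `llvm.x86.sse2.packuswb.128` (behind `_mm_packus_epi16`).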
```diff
@@ -413,6 +413,77 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
                 ret.place_lane(fx, out_lane_idx).write_cvalue(fx, res_lane);
             }
         }
+
+        "llvm.x86.ssse3.pmul.hr.sw.128" => {
+            // https://www.felixcloutier.com/x86/pmulhrsw
+            intrinsic_args!(fx, args => (a, b); intrinsic);
+
+            assert_eq!(a.layout(), b.layout());
+            let layout = a.layout();
+
+            let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
+            let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
+            assert_eq!(lane_ty, fx.tcx.types.i16);
+            assert_eq!(ret_lane_ty, fx.tcx.types.i16);
+            assert_eq!(lane_count, ret_lane_count);
+
+            let ret_lane_layout = fx.layout_of(fx.tcx.types.i16);
+            for out_lane_idx in 0..lane_count {
+                let a_lane = a.value_lane(fx, out_lane_idx).load_scalar(fx);
+                let a_lane = fx.bcx.ins().sextend(types::I32, a_lane);
+                let b_lane = b.value_lane(fx, out_lane_idx).load_scalar(fx);
+                let b_lane = fx.bcx.ins().sextend(types::I32, b_lane);
+
+                let mul: Value = fx.bcx.ins().imul(a_lane, b_lane);
+                let shifted = fx.bcx.ins().ushr_imm(mul, 14);
+                let incremented = fx.bcx.ins().iadd_imm(shifted, 1);
+                let shifted_again = fx.bcx.ins().ushr_imm(incremented, 1);
+
+                let res_lane = fx.bcx.ins().ireduce(types::I16, shifted_again);
+                let res_lane = CValue::by_val(res_lane, ret_lane_layout);
+
+                ret.place_lane(fx, out_lane_idx).write_cvalue(fx, res_lane);
+            }
+        }
```
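Per lane, the lowering sign-extends both `i16` inputs to 32 bits, multiplies, shifts right by 14, rounds by adding 1 and shifting right once more, then truncates back to 16 bits. Using `ushr_imm` (a logical shift) on a signed product is fine here: the final `ireduce` keeps only the low 16 bits, and those are identical whether the shifts fill the high bits with zeros or with sign bits. A minimal scalar model of one lane (the `pmulhrsw_lane` name is illustrative, not part of the patch):

```rust
/// Scalar model of one lane of `llvm.x86.ssse3.pmul.hr.sw.128`
/// (SSSE3 `pmulhrsw`), following the same steps as the lowering.
fn pmulhrsw_lane(a: i16, b: i16) -> i16 {
    let mul = i32::from(a) * i32::from(b); // sextend + imul
    let shifted = mul >> 14;               // ushr_imm 14 (low bits agree)
    let incremented = shifted + 1;         // iadd_imm 1: the rounding bit
    let shifted_again = incremented >> 1;  // ushr_imm 1
    shifted_again as i16                   // ireduce: keep the low 16 bits
}

fn main() {
    // Q15 fixed point: 0.5 * 0.5 rounds to 0.25.
    assert_eq!(pmulhrsw_lane(16384, 16384), 8192);
    // The well-known edge case: 0x8000 * 0x8000 wraps back to 0x8000.
    assert_eq!(pmulhrsw_lane(i16::MIN, i16::MIN), i16::MIN);
}
```

The hunk continues with the `packuswb` arm: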
```diff
+
+        "llvm.x86.sse2.packuswb.128" => {
+            // https://www.felixcloutier.com/x86/packuswb
+            intrinsic_args!(fx, args => (a, b); intrinsic);
+
+            assert_eq!(a.layout(), b.layout());
+            let layout = a.layout();
+
+            let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
+            let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
+            assert_eq!(lane_ty, fx.tcx.types.i16);
+            assert_eq!(ret_lane_ty, fx.tcx.types.u8);
+            assert_eq!(lane_count * 2, ret_lane_count);
+
+            let zero = fx.bcx.ins().iconst(types::I16, 0);
+            let max_u8 = fx.bcx.ins().iconst(types::I16, 255);
+            let ret_lane_layout = fx.layout_of(fx.tcx.types.u8);
+
+            for idx in 0..lane_count {
+                let lane = a.value_lane(fx, idx).load_scalar(fx);
+                let sat = fx.bcx.ins().smax(lane, zero);
+                let sat = fx.bcx.ins().umin(sat, max_u8);
+                let res = fx.bcx.ins().ireduce(types::I8, sat);
+
+                let res_lane = CValue::by_val(res, ret_lane_layout);
+                ret.place_lane(fx, idx).write_cvalue(fx, res_lane);
+            }
+
+            for idx in 0..lane_count {
+                let lane = b.value_lane(fx, idx).load_scalar(fx);
+                let sat = fx.bcx.ins().smax(lane, zero);
+                let sat = fx.bcx.ins().umin(sat, max_u8);
+                let res = fx.bcx.ins().ireduce(types::I8, sat);
+
+                let res_lane = CValue::by_val(res, ret_lane_layout);
+                ret.place_lane(fx, lane_count + idx).write_cvalue(fx, res_lane);
+            }
+        }
+
         _ => {
             fx.tcx
                 .sess
```
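`packuswb` narrows the eight `i16` lanes of each input to `u8` with unsigned saturation, `a` filling the low half of the result and `b` the high half. Ordering matters in the clamp: the `smax` against 0 runs first, so the value is already non-negative by the time the unsigned `umin` caps it at 255. A scalar sketch of the same semantics for the 128-bit case (the standalone `packuswb` function is illustrative, not part of the patch):

```rust
/// Scalar model of `llvm.x86.sse2.packuswb.128` (SSE2 `packuswb`)
/// for the 128-bit shape the lowering's asserts pin down:
/// 8 + 8 `i16` lanes in, 16 `u8` lanes out.
fn packuswb(a: [i16; 8], b: [i16; 8]) -> [u8; 16] {
    // smax(lane, 0) then umin(lane, 255), i.e. clamp into the u8
    // range, followed by ireduce to 8 bits.
    fn sat_u8(lane: i16) -> u8 {
        lane.clamp(0, 255) as u8
    }
    let mut out = [0u8; 16];
    for (i, &lane) in a.iter().enumerate() {
        out[i] = sat_u8(lane); // `a` fills lanes 0..8
    }
    for (i, &lane) in b.iter().enumerate() {
        out[8 + i] = sat_u8(lane); // `b` fills lanes 8..16
    }
    out
}

fn main() {
    let a = [-5, 0, 127, 255, 256, 300, i16::MAX, i16::MIN];
    let packed = packuswb(a, [1; 8]);
    assert_eq!(&packed[..8], &[0, 0, 127, 255, 255, 255, 255, 0]);
    assert_eq!(&packed[8..], &[1; 8]);
}
```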