bpo-43475: Fix worst case collision behavior for NaN instances (GH-2… · python/cpython@a07da09 (original) (raw)

10 files changed

lines changed

Original file line number Diff line number Diff line change
@@ -692,10 +692,9 @@ Here are the rules in detail:
692 692 as ``-hash(-x)``. If the resulting hash is ``-1``, replace it with
693 693 ``-2``.
694 694
695 -- The particular values ``sys.hash_info.inf``, ``-sys.hash_info.inf``
696 - and ``sys.hash_info.nan`` are used as hash values for positive
697 - infinity, negative infinity, or nans (respectively). (All hashable
698 - nans have the same hash value.)
695 +- The particular values ``sys.hash_info.inf`` and ``-sys.hash_info.inf``
696 + are used as hash values for positive
697 + infinity or negative infinity (respectively).
699 698
700 699 - For a :class:`complex` number ``z``, the hash values of the real
701 700 and imaginary parts are combined by computing ``hash(z.real) +
@@ -740,7 +739,7 @@ number, :class:`float`, or :class:`complex`::
740 739 """Compute the hash of a float x."""
741 740
742 741 if math.isnan(x):
743 - return sys.hash_info.nan
742 + return super().__hash__()
744 743 elif math.isinf(x):
745 744 return sys.hash_info.inf if x > 0 else -sys.hash_info.inf
746 745 else:

Original file line number Diff line number Diff line change
@@ -855,7 +855,7 @@ always available.
855 855 +---------------------+--------------------------------------------------+
856 856 | :const:`inf`        | hash value returned for a positive infinity      |
857 857 +---------------------+--------------------------------------------------+
858 -| :const:`nan`        | hash value returned for a nan                    |
858 +| :const:`nan`        | (this attribute is no longer used)               |
859 859 +---------------------+--------------------------------------------------+
860 860 | :const:`imag`       | multiplier used for the imaginary part of a      |
861 861 |                     | complex number                                   |

Original file line number Diff line number Diff line change
@@ -7,7 +7,7 @@ extern "C" {
7 7
8 8 /* Helpers for hash functions */
9 9 #ifndef Py_LIMITED_API
10 -PyAPI_FUNC(Py_hash_t) _Py_HashDouble(double);
10 +PyAPI_FUNC(Py_hash_t) _Py_HashDouble(PyObject *, double);
11 11 PyAPI_FUNC(Py_hash_t) _Py_HashPointer(const void*);
12 12 // Similar to _Py_HashPointer(), but don't replace -1 with -2
13 13 PyAPI_FUNC(Py_hash_t) _Py_HashPointerRaw(const void*);
@@ -29,7 +29,6 @@ PyAPI_FUNC(Py_hash_t) _Py_HashBytes(const void*, Py_ssize_t);
29 29
30 30 #define _PyHASH_MODULUS (((size_t)1 << _PyHASH_BITS) - 1)
31 31 #define _PyHASH_INF 314159
32 -#define _PyHASH_NAN 0
33 32 #define _PyHASH_IMAG _PyHASH_MULTIPLIER
34 33
35 34
Original file line number Diff line number Diff line change
@@ -951,7 +951,7 @@ def __hash__(self):
951 951 if self.is_snan():
952 952 raise TypeError('Cannot hash a signaling NaN value.')
953 953 elif self.is_nan():
954 -return _PyHASH_NAN
954 +return super().__hash__()
955 955 else:
956 956 if self._sign:
957 957 return -_PyHASH_INF
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
1 +Hashes of NaN values now depend on object identity. Formerly, they always
2 +hashed to 0 even though NaN values are not equal to one another. Having the
3 +same hash for unequal values caused pile-ups in hash tables.
Original file line number Diff line number Diff line change
@@ -4536,7 +4536,6 @@ _dec_hash(PyDecObject *v)
4536 4536 #error "No valid combination of CONFIG_64, CONFIG_32 and _PyHASH_BITS"
4537 4537 #endif
4538 4538 const Py_hash_t py_hash_inf = 314159;
4539 -const Py_hash_t py_hash_nan = 0;
4540 4539 mpd_uint_t ten_data[1] = {10};
4541 4540 mpd_t ten = {MPD_POS|MPD_STATIC
4542 4541 0, 2, 1, 1, ten_data};
@@ -4555,7 +4554,7 @@ _dec_hash(PyDecObject *v)
4555 4554 return -1;
4556 4555 }
4557 4556 else if (mpd_isnan(MPD(v))) {
4558 -return py_hash_nan;
4557 +return _Py_HashPointer(v);
4559 4558 }
4560 4559 else {
4561 4560 return py_hash_inf * mpd_arith_sign(MPD(v));
@@ -5939,5 +5938,3 @@ PyInit__decimal(void)
5939 5938
5940 5939 return NULL; /* GCOV_NOT_REACHED */
5941 5940 }
5942 -
5943 -
Original file line number Diff line number Diff line change
@@ -412,10 +412,10 @@ static Py_hash_t
412 412 complex_hash(PyComplexObject *v)
413 413 {
414 414 Py_uhash_t hashreal, hashimag, combined;
415 -hashreal = (Py_uhash_t)_Py_HashDouble(v->cval.real);
415 +hashreal = (Py_uhash_t)_Py_HashDouble((PyObject *) v, v->cval.real);
416 416 if (hashreal == (Py_uhash_t)-1)
417 417 return -1;
418 -hashimag = (Py_uhash_t)_Py_HashDouble(v->cval.imag);
418 +hashimag = (Py_uhash_t)_Py_HashDouble((PyObject *)v, v->cval.imag);
419 419 if (hashimag == (Py_uhash_t)-1)
420 420 return -1;
421 421 /* Note: if the imaginary part is 0, hashimag is 0 now,
Original file line number Diff line number Diff line change
@@ -556,7 +556,7 @@ float_richcompare(PyObject *v, PyObject *w, int op)
556 556 static Py_hash_t
557 557 float_hash(PyFloatObject *v)
558 558 {
559 -return _Py_HashDouble(v->ob_fval);
559 +return _Py_HashDouble((PyObject *)v, v->ob_fval);
560 560 }
561 561
562 562 static PyObject *
Original file line number Diff line number Diff line change
@@ -56,8 +56,12 @@ static Py_ssize_t hashstats[Py_HASH_STATS_MAX + 1] = {0};
56 56 If the result of the reduction is infinity (this is impossible for
57 57 integers, floats and Decimals) then use the predefined hash value
58 58 _PyHASH_INF for x >= 0, or -_PyHASH_INF for x < 0, instead.
59 - _PyHASH_INF, -_PyHASH_INF and _PyHASH_NAN are also used for the
60 - hashes of float and Decimal infinities and nans.
59 + _PyHASH_INF and -_PyHASH_INF are also used for the
60 + hashes of float and Decimal infinities.
61 +
62 + NaNs hash with a pointer hash. Having distinct hash values prevents
63 + catastrophic pileups from distinct NaN instances which used to always
64 + have the same hash value but would compare unequal.
61 65
62 66 A selling point for the above strategy is that it makes it possible
63 67 to compute hashes of decimal and binary floating-point numbers
@@ -82,8 +86,10 @@ static Py_ssize_t hashstats[Py_HASH_STATS_MAX + 1] = {0};
82 86
83 87 */
84 88
89 +Py_hash_t _Py_HashPointer(const void *);
90 +
85 91 Py_hash_t
86 -_Py_HashDouble(double v)
92 +_Py_HashDouble(PyObject *inst, double v)
87 93 {
88 94 int e, sign;
89 95 double m;
@@ -93,7 +99,7 @@ _Py_HashDouble(double v)
93 99 if (Py_IS_INFINITY(v))
94 100 return v > 0 ? _PyHASH_INF : -_PyHASH_INF;
95 101 else
96 -return _PyHASH_NAN;
102 +return _Py_HashPointer(inst);
97 103 }
98 104
99 105 m = frexp(v, &e);
Original file line number Diff line number Diff line change
@@ -1405,7 +1405,7 @@ get_hash_info(PyThreadState *tstate)
1405 1405 PyStructSequence_SET_ITEM(hash_info, field++,
1406 1406 PyLong_FromLong(_PyHASH_INF));
1407 1407 PyStructSequence_SET_ITEM(hash_info, field++,
1408 -PyLong_FromLong(_PyHASH_NAN));
1408 +PyLong_FromLong(0)); // This is no longer used
1409 1409 PyStructSequence_SET_ITEM(hash_info, field++,
1410 1410 PyLong_FromLong(_PyHASH_IMAG));
1411 1411 PyStructSequence_SET_ITEM(hash_info, field++,