bpo-43475: Fix worst case collision behavior for NaN instances (GH-2… · python/cpython@a07da09 (original) (raw)

10 files changed

lines changed

Original file line number	Diff line number	Diff line change
@@ -692,10 +692,9 @@ Here are the rules in detail:
692	692	as ``-hash(-x)``. If the resulting hash is ``-1``, replace it with
693	693	``-2``.
694	694
695		-- The particular values ``sys.hash_info.inf``, ``-sys.hash_info.inf``
696		- and ``sys.hash_info.nan`` are used as hash values for positive
697		- infinity, negative infinity, or nans (respectively). (All hashable
698		- nans have the same hash value.)
	695	+- The particular values ``sys.hash_info.inf`` and ``-sys.hash_info.inf``
	696	+ are used as hash values for positive
	697	+ infinity or negative infinity (respectively).
699	698
700	699	- For a :class:`complex` number ``z``, the hash values of the real
701	700	and imaginary parts are combined by computing ``hash(z.real) +
@@ -740,7 +739,7 @@ number, :class:`float`, or :class:`complex`::
740	739	"""Compute the hash of a float x."""
741	740
742	741	if math.isnan(x):
743		- return sys.hash_info.nan
	742	+ return super().__hash__()
744	743	elif math.isinf(x):
745	744	return sys.hash_info.inf if x > 0 else -sys.hash_info.inf
746	745	else:

Original file line number	Diff line number	Diff line change
@@ -855,7 +855,7 @@ always available.
855	855	+---------------------+--------------------------------------------------+
856	856	| :const:`inf` | hash value returned for a positive infinity |
857	857	+---------------------+--------------------------------------------------+
858		- | :const:`nan` | hash value returned for a nan |
	858	+ | :const:`nan` | (this attribute is no longer used) |
859	859	+---------------------+--------------------------------------------------+
860	860	| :const:`imag` | multiplier used for the imaginary part of a |
861	861	| | complex number |

Original file line number	Diff line number	Diff line change
@@ -7,7 +7,7 @@ extern "C" {
7	7
8	8	/* Helpers for hash functions */
9	9	#ifndef Py_LIMITED_API
10		-PyAPI_FUNC(Py_hash_t) _Py_HashDouble(double);
	10	+PyAPI_FUNC(Py_hash_t) _Py_HashDouble(PyObject *, double);
11	11	PyAPI_FUNC(Py_hash_t) _Py_HashPointer(const void*);
12	12	// Similar to _Py_HashPointer(), but don't replace -1 with -2
13	13	PyAPI_FUNC(Py_hash_t) _Py_HashPointerRaw(const void*);
@@ -29,7 +29,6 @@ PyAPI_FUNC(Py_hash_t) _Py_HashBytes(const void*, Py_ssize_t);
29	29
30	30	#define _PyHASH_MODULUS (((size_t)1 << _PyHASH_BITS) - 1)
31	31	#define _PyHASH_INF 314159
32		-#define _PyHASH_NAN 0
33	32	#define _PyHASH_IMAG _PyHASH_MULTIPLIER
34	33
35	34

Original file line number	Diff line number	Diff line change
@@ -951,7 +951,7 @@ def __hash__(self):
951	951	if self.is_snan():
952	952	raise TypeError('Cannot hash a signaling NaN value.')
953	953	elif self.is_nan():
954		-return _PyHASH_NAN
	954	+return super().__hash__()
955	955	else:
956	956	if self._sign:
957	957	return -_PyHASH_INF

Original file line number	Diff line number	Diff line change
@@ -0,0 +1,3 @@
	1	+Hashes of NaN values now depend on object identity. Formerly, they always
	2	+hashed to 0 even though NaN values are not equal to one another. Having the
	3	+same hash for unequal values caused pile-ups in hash tables.

Original file line number	Diff line number	Diff line change
@@ -4536,7 +4536,6 @@ _dec_hash(PyDecObject *v)
4536	4536	#error "No valid combination of CONFIG_64, CONFIG_32 and _PyHASH_BITS"
4537	4537	#endif
4538	4538	const Py_hash_t py_hash_inf = 314159;
4539		-const Py_hash_t py_hash_nan = 0;
4540	4539	mpd_uint_t ten_data[1] = {10};
4541	4540	mpd_t ten = {MPD_POS|MPD_STATIC|MPD_CONST_DATA,
4542	4541	0, 2, 1, 1, ten_data};
@@ -4555,7 +4554,7 @@ _dec_hash(PyDecObject *v)
4555	4554	return -1;
4556	4555	}
4557	4556	else if (mpd_isnan(MPD(v))) {
4558		-return py_hash_nan;
	4557	+return _Py_HashPointer(v);
4559	4558	}
4560	4559	else {
4561	4560	return py_hash_inf * mpd_arith_sign(MPD(v));
@@ -5939,5 +5938,3 @@ PyInit__decimal(void)
5939	5938
5940	5939	return NULL; /* GCOV_NOT_REACHED */
5941	5940	}
5942		-
5943		-

Original file line number	Diff line number	Diff line change
@@ -412,10 +412,10 @@ static Py_hash_t
412	412	complex_hash(PyComplexObject *v)
413	413	{
414	414	Py_uhash_t hashreal, hashimag, combined;
415		-hashreal = (Py_uhash_t)_Py_HashDouble(v->cval.real);
	415	+hashreal = (Py_uhash_t)_Py_HashDouble((PyObject *) v, v->cval.real);
416	416	if (hashreal == (Py_uhash_t)-1)
417	417	return -1;
418		-hashimag = (Py_uhash_t)_Py_HashDouble(v->cval.imag);
	418	+hashimag = (Py_uhash_t)_Py_HashDouble((PyObject *)v, v->cval.imag);
419	419	if (hashimag == (Py_uhash_t)-1)
420	420	return -1;
421	421	/* Note: if the imaginary part is 0, hashimag is 0 now,

Original file line number	Diff line number	Diff line change
@@ -556,7 +556,7 @@ float_richcompare(PyObject *v, PyObject *w, int op)
556	556	static Py_hash_t
557	557	float_hash(PyFloatObject *v)
558	558	{
559		-return _Py_HashDouble(v->ob_fval);
	559	+return _Py_HashDouble((PyObject *)v, v->ob_fval);
560	560	}
561	561
562	562	static PyObject *

Original file line number	Diff line number	Diff line change
@@ -56,8 +56,12 @@ static Py_ssize_t hashstats[Py_HASH_STATS_MAX + 1] = {0};
56	56	If the result of the reduction is infinity (this is impossible for
57	57	integers, floats and Decimals) then use the predefined hash value
58	58	_PyHASH_INF for x >= 0, or -_PyHASH_INF for x < 0, instead.
59		- _PyHASH_INF, -_PyHASH_INF and _PyHASH_NAN are also used for the
60		- hashes of float and Decimal infinities and nans.
	59	+ _PyHASH_INF and -_PyHASH_INF are also used for the
	60	+ hashes of float and Decimal infinities.
	61	+
	62	+ NaNs hash with a pointer hash. Having distinct hash values prevents
	63	+ catastrophic pileups from distinct NaN instances which used to always
	64	+ have the same hash value but would compare unequal.
61	65
62	66	A selling point for the above strategy is that it makes it possible
63	67	to compute hashes of decimal and binary floating-point numbers
@@ -82,8 +86,10 @@ static Py_ssize_t hashstats[Py_HASH_STATS_MAX + 1] = {0};
82	86
83	87	*/
84	88
	89	+Py_hash_t _Py_HashPointer(const void *);
	90	+
85	91	Py_hash_t
86		-_Py_HashDouble(double v)
	92	+_Py_HashDouble(PyObject *inst, double v)
87	93	{
88	94	int e, sign;
89	95	double m;
@@ -93,7 +99,7 @@ _Py_HashDouble(double v)
93	99	if (Py_IS_INFINITY(v))
94	100	return v > 0 ? _PyHASH_INF : -_PyHASH_INF;
95	101	else
96		-return _PyHASH_NAN;
	102	+return _Py_HashPointer(inst);
97	103	}
98	104
99	105	m = frexp(v, &e);

Original file line number	Diff line number	Diff line change
@@ -1405,7 +1405,7 @@ get_hash_info(PyThreadState *tstate)
1405	1405	PyStructSequence_SET_ITEM(hash_info, field++,
1406	1406	PyLong_FromLong(_PyHASH_INF));
1407	1407	PyStructSequence_SET_ITEM(hash_info, field++,
1408		-PyLong_FromLong(_PyHASH_NAN));
	1408	+PyLong_FromLong(0)); // This is no longer used
1409	1409	PyStructSequence_SET_ITEM(hash_info, field++,
1410	1410	PyLong_FromLong(_PyHASH_IMAG));
1411	1411	PyStructSequence_SET_ITEM(hash_info, field++,