hash_pandas_object fails on tuple · Issue #28969 · pandas-dev/pandas (original) (raw)
Code Sample
import pandas as pd
pd.util.hash_pandas_object(pd.DataFrame({'data': [tuple('1'), tuple('2')]}))  # fails
pd.util.hash_pandas_object(pd.DataFrame({'data': [tuple([1]), tuple([2])]}))  # fails
Traceback
TypeError Traceback (most recent call last) ~/miniconda3/envs/povmap/lib/python3.7/site-packages/pandas/core/util/hashing.py in hash_array(vals, encoding, hash_key, categorize) 307 try: --> 308 vals = hashing.hash_object_array(vals, hash_key, encoding) 309 except TypeError:
pandas/_libs/hashing.pyx in pandas._libs.hashing.hash_object_array()
TypeError: ('1',) of type <class 'tuple'> is not a valid type for hashing, must be string or null
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last) in 1 import pandas as pd 2 ----> 3 pd.util.hash_pandas_object(pd.DataFrame({'data': [tuple('1'), tuple('2')]}))
~/miniconda3/envs/povmap/lib/python3.7/site-packages/pandas/core/util/hashing.py in hash_pandas_object(obj, index, encoding, hash_key, categorize) 129 num_items += 1 130 hashes = itertools.chain(hashes, index_hash_generator) --> 131 h = _combine_hash_arrays(hashes, num_items) 132 133 h = Series(h, index=obj.index, dtype="uint64", copy=False)
~/miniconda3/envs/povmap/lib/python3.7/site-packages/pandas/core/util/hashing.py in _combine_hash_arrays(arrays, num_items) 37 """ 38 try: ---> 39 first = next(arrays) 40 except StopIteration: 41 return np.array([], dtype=np.uint64)
~/miniconda3/envs/povmap/lib/python3.7/site-packages/pandas/core/util/hashing.py in <genexpr>(.0) 114 115 elif isinstance(obj, ABCDataFrame): --> 116 hashes = (hash_array(series.values) for _, series in obj.items()) 117 num_items = len(obj.columns) 118 if index:
~/miniconda3/envs/povmap/lib/python3.7/site-packages/pandas/core/util/hashing.py in hash_array(vals, encoding, hash_key, categorize) 303 codes, categories = factorize(vals, sort=False) 304 cat = Categorical(codes, Index(categories), ordered=False, fastpath=True) --> 305 return _hash_categorical(cat, encoding, hash_key) 306 307 try:
~/miniconda3/envs/povmap/lib/python3.7/site-packages/pandas/core/util/hashing.py in _hash_categorical(c, encoding, hash_key) 221 # Convert ExtensionArrays to ndarrays 222 values = np.asarray(c.categories.values) --> 223 hashed = hash_array(values, encoding, hash_key, categorize=False) 224 225 # we have uint64, as we don't directly support missing values
~/miniconda3/envs/povmap/lib/python3.7/site-packages/pandas/core/util/hashing.py in hash_array(vals, encoding, hash_key, categorize) 310 # we have mixed types 311 vals = hashing.hash_object_array( --> 312 vals.astype(str).astype(object), hash_key, encoding 313 ) 314
ValueError: setting an array element with a sequence
Problem description
Tuples are immutable, so hashing them should work, correct?
Expected Output
Hashed dataframe elements.
Output of pd.show_versions()
INSTALLED VERSIONS
commit : None
python : 3.7.3.final.0
python-bits : 64
OS : Linux
OS-release : 5.0.0-1018-azure
machine : x86_64
processor : x86_64
byteorder : little
LC_ALL : None
LANG : C.UTF-8
LOCALE : en_US.UTF-8
pandas : 0.25.1
numpy : 1.17.2
pytz : 2019.3
dateutil : 2.8.0
pip : 19.2.3
setuptools : 41.4.0
Cython : None
pytest : 5.2.1
hypothesis : None
sphinx : None
blosc : None
feather : None
xlsxwriter : None
lxml.etree : None
html5lib : None
pymysql : None
psycopg2 : None
jinja2 : 2.10.3
IPython : 7.8.0
pandas_datareader: None
bs4 : None
bottleneck : None
fastparquet : None
gcsfs : None
lxml.etree : None
matplotlib : 3.1.1
numexpr : None
odfpy : None
openpyxl : 3.0.0
pandas_gbq : None
pyarrow : 0.15.0
pytables : None
s3fs : None
scipy : 1.3.1
sqlalchemy : 1.3.10
tables : None
xarray : None
xlrd : 1.2.0
xlwt : None
xlsxwriter : None