hash_pandas_object fails on tuple · Issue #28969 · pandas-dev/pandas (original) (raw)

Code Sample

import pandas as pd

pd.util.hash_pandas_object(pd.DataFrame({'data': [tuple('1'), tuple('2')]}))  # fails
pd.util.hash_pandas_object(pd.DataFrame({'data': [tuple([1]), tuple([2])]}))  # fails

Traceback

TypeError                                 Traceback (most recent call last)
~/miniconda3/envs/povmap/lib/python3.7/site-packages/pandas/core/util/hashing.py in hash_array(vals, encoding, hash_key, categorize)
    307     try:
--> 308         vals = hashing.hash_object_array(vals, hash_key, encoding)
    309     except TypeError:

pandas/_libs/hashing.pyx in pandas._libs.hashing.hash_object_array()

TypeError: ('1',) of type <class 'tuple'> is not a valid type for hashing, must be string or null

During handling of the above exception, another exception occurred:

ValueError                                Traceback (most recent call last)
in <module>
      1 import pandas as pd
      2 
----> 3 pd.util.hash_pandas_object(pd.DataFrame({'data': [tuple('1'), tuple('2')]}))

~/miniconda3/envs/povmap/lib/python3.7/site-packages/pandas/core/util/hashing.py in hash_pandas_object(obj, index, encoding, hash_key, categorize)
    129             num_items += 1
    130             hashes = itertools.chain(hashes, index_hash_generator)
--> 131         h = _combine_hash_arrays(hashes, num_items)
    132 
    133         h = Series(h, index=obj.index, dtype="uint64", copy=False)

~/miniconda3/envs/povmap/lib/python3.7/site-packages/pandas/core/util/hashing.py in _combine_hash_arrays(arrays, num_items)
     37     """
     38     try:
---> 39         first = next(arrays)
     40     except StopIteration:
     41         return np.array([], dtype=np.uint64)

~/miniconda3/envs/povmap/lib/python3.7/site-packages/pandas/core/util/hashing.py in <genexpr>(.0)
    114 
    115     elif isinstance(obj, ABCDataFrame):
--> 116         hashes = (hash_array(series.values) for _, series in obj.items())
    117         num_items = len(obj.columns)
    118         if index:

~/miniconda3/envs/povmap/lib/python3.7/site-packages/pandas/core/util/hashing.py in hash_array(vals, encoding, hash_key, categorize)
    303         codes, categories = factorize(vals, sort=False)
    304         cat = Categorical(codes, Index(categories), ordered=False, fastpath=True)
--> 305         return _hash_categorical(cat, encoding, hash_key)
    306 
    307     try:

~/miniconda3/envs/povmap/lib/python3.7/site-packages/pandas/core/util/hashing.py in _hash_categorical(c, encoding, hash_key)
    221     # Convert ExtensionArrays to ndarrays
    222     values = np.asarray(c.categories.values)
--> 223     hashed = hash_array(values, encoding, hash_key, categorize=False)
    224 
    225     # we have uint64, as we don't directly support missing values

~/miniconda3/envs/povmap/lib/python3.7/site-packages/pandas/core/util/hashing.py in hash_array(vals, encoding, hash_key, categorize)
    310         # we have mixed types
    311         vals = hashing.hash_object_array(
--> 312             vals.astype(str).astype(object), hash_key, encoding
    313         )
    314 

ValueError: setting an array element with a sequence

Problem description

Tuples are immutable, so hashing them should work, correct?

Expected Output

Hashed dataframe elements.

Output of pd.show_versions()

INSTALLED VERSIONS

commit : None
python : 3.7.3.final.0
python-bits : 64
OS : Linux
OS-release : 5.0.0-1018-azure
machine : x86_64
processor : x86_64
byteorder : little
LC_ALL : None
LANG : C.UTF-8
LOCALE : en_US.UTF-8

pandas : 0.25.1
numpy : 1.17.2
pytz : 2019.3
dateutil : 2.8.0
pip : 19.2.3
setuptools : 41.4.0
Cython : None
pytest : 5.2.1
hypothesis : None
sphinx : None
blosc : None
feather : None
xlsxwriter : None
lxml.etree : None
html5lib : None
pymysql : None
psycopg2 : None
jinja2 : 2.10.3
IPython : 7.8.0
pandas_datareader: None
bs4 : None
bottleneck : None
fastparquet : None
gcsfs : None
lxml.etree : None
matplotlib : 3.1.1
numexpr : None
odfpy : None
openpyxl : 3.0.0
pandas_gbq : None
pyarrow : 0.15.0
pytables : None
s3fs : None
scipy : 1.3.1
sqlalchemy : 1.3.10
tables : None
xarray : None
xlrd : 1.2.0
xlwt : None
xlsxwriter : None