df.reset_index() fails if df.index.name is a tuple · Issue #16164 · pandas-dev/pandas (original) (raw)

Code Sample, a copy-pastable example if possible

In [2]: df = pd.DataFrame([[1, 2]], columns=pd.MultiIndex.from_product([['A'], ['a', 'b']]))

In [3]: reind = df.set_index([('A', 'a')])

In [4]: reind.reset_index()

TypeError Traceback (most recent call last)
/home/pietro/nobackup/repo/pandas/pandas/util/hashing.py in hash_array(vals, encoding, hash_key, categorize)
    265 try:
--> 266 vals = _hash.hash_object_array(vals, hash_key, encoding)
    267 except TypeError:

/home/pietro/nobackup/repo/pandas/pandas/util/hashing.pyx in pandas.util.libhashing.hash_object_array (pandas/util/hashing.c:2372)()

TypeError: ('A', 'a') of type <class 'tuple'> is not a valid type for hashing, must be string or null

During handling of the above exception, another exception occurred:

ValueError Traceback (most recent call last)
<ipython-input-4-...> in <module>()
----> 1 reind.reset_index()

/home/pietro/nobackup/repo/pandas/pandas/core/frame.py in reset_index(self, level, drop, inplace, col_level, col_fill) 3057 name = tuple(name_lst) 3058 values = _maybe_casted_values(self.index) -> 3059 new_obj.insert(0, name, values) 3060 3061 new_obj.index = new_index

/home/pietro/nobackup/repo/pandas/pandas/core/frame.py in insert(self, loc, column, value, allow_duplicates) 2518 value = self._sanitize_column(column, value, broadcast=False) 2519 self._data.insert(loc, column, value, -> 2520 allow_duplicates=allow_duplicates) 2521 2522 def assign(self, **kwargs):

/home/pietro/nobackup/repo/pandas/pandas/core/internals.py in insert(self, loc, item, value, allow_duplicates) 3806 3807 """ -> 3808 if not allow_duplicates and item in self.items: 3809 # Should this be a different kind of error?? 3810 raise ValueError('cannot insert {}, already exists'.format(item))

/home/pietro/nobackup/repo/pandas/pandas/core/indexes/multi.py in __contains__(self, key)
   1326 hash(key)
   1327 try:
-> 1328 self.get_loc(key)
   1329 return True
   1330 except LookupError:

/home/pietro/nobackup/repo/pandas/pandas/core/indexes/multi.py in get_loc(self, key, method) 1984 key = _values_from_object(key) 1985 key = tuple(map(_maybe_str_to_time_stamp, key, self.levels)) -> 1986 return self._engine.get_loc(key) 1987 1988 # -- partial selection or non-unique index

/home/pietro/nobackup/repo/pandas/pandas/_libs/index.pyx in pandas._libs.index.MultiIndexEngine.get_loc (pandas/_libs/index.c:13171)()

/home/pietro/nobackup/repo/pandas/pandas/_libs/index.pyx in pandas._libs.index.MultiIndexEngine.get_loc (pandas/_libs/index.c:13018)()

/home/pietro/nobackup/repo/pandas/pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.MultiIndexHashTable.get_item (pandas/_libs/hashtable.c:23625)()

/home/pietro/nobackup/repo/pandas/pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.MultiIndexHashTable.get_item (pandas/_libs/hashtable.c:23374)()

/home/pietro/nobackup/repo/pandas/pandas/core/indexes/multi.py in _hashed_indexing_key(self, key) 755 key = tuple([f(k, stringify) 756 for k, stringify in zip(key, self._have_mixed_levels)]) --> 757 return hash_tuples(key) 758 759 @Appender(base._shared_docs['duplicated'] % _index_doc_kwargs)

/home/pietro/nobackup/repo/pandas/pandas/util/hashing.py in hash_tuples(vals, encoding, hash_key) 159 hash_key=hash_key) 160 for cat in vals) --> 161 h = _combine_hash_arrays(hashes, len(vals)) 162 if is_tuple: 163 h = h[0]

/home/pietro/nobackup/repo/pandas/pandas/util/hashing.py in _combine_hash_arrays(arrays, num_items) 31 """ 32 try: ---> 33 first = next(arrays) 34 except StopIteration: 35 return np.array([], dtype=np.uint64)

/home/pietro/nobackup/repo/pandas/pandas/util/hashing.py in <genexpr>(.0)
    158 encoding=encoding,
    159 hash_key=hash_key)
--> 160 for cat in vals)
    161 h = _combine_hash_arrays(hashes, len(vals))
    162 if is_tuple:

/home/pietro/nobackup/repo/pandas/pandas/util/hashing.py in _hash_categorical(c, encoding, hash_key) 182 """ 183 hashed = hash_array(c.categories.values, encoding, hash_key, --> 184 categorize=False) 185 186 # we have uint64, as we don't directly support missing values

/home/pietro/nobackup/repo/pandas/pandas/util/hashing.py in hash_array(vals, encoding, hash_key, categorize)
    267 except TypeError:
    268 # we have mixed types
--> 269 vals = _hash.hash_object_array(vals.astype(str).astype(object),
    270 hash_key, encoding)
    271

ValueError: setting an array element with a sequence

Problem description

In general, one expects df.set_index([df.columns[0]]).reset_index() to return (a copy of) df. Instead, when df.columns is a MultiIndex, reset_index() raises the ValueError shown in the traceback above.

@jreback I know you were not really convinced this should work, but I'm uploading a PR in a couple of minutes, and as you will see the fix is extremely simple.

Expected Output

df

Output of `pd.show_versions()`

INSTALLED VERSIONS

commit: None
python: 3.5.3.final.0
python-bits: 64
OS: Linux
OS-release: 4.7.0-1-amd64
machine: x86_64
processor:
byteorder: little
LC_ALL: None
LANG: it_IT.utf8
LOCALE: it_IT.UTF-8

pandas: 0.20.0rc1+29.g075eca1fa
pytest: 3.0.6
pip: 9.0.1
setuptools: 33.1.1
Cython: 0.25.2
numpy: 1.12.0
scipy: 0.18.1
xarray: 0.9.1
IPython: 5.1.0.dev
sphinx: 1.4.9
patsy: 0.3.0-dev
dateutil: 2.5.3
pytz: 2016.7
blosc: None
bottleneck: 1.2.0
tables: 3.3.0
numexpr: 2.6.1
feather: 0.3.1
matplotlib: 2.0.0
openpyxl: 2.3.0
xlrd: 1.0.0
xlwt: 1.1.2
xlsxwriter: 0.9.6
lxml: 3.7.1
bs4: 4.5.3
html5lib: 0.999999999
sqlalchemy: 1.0.15
pymysql: None
psycopg2: None
jinja2: 2.8
s3fs: None
pandas_gbq: None
pandas_datareader: 0.2.1