API: Consistency between DataFrame, MultiIndex.to_frame and reset_index · Issue #45245 · pandas-dev/pandas

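DataFrame, MultiIndex.to_frame, Series.reset_index and DataFrame.reset_index all create DataFrames. In most cases the results are the same, but in the following cases (demonstrated by the script and its output below) they behave differently:

- Boolean level values: DataFrame and reset_index give a bool column, whereas to_frame gives an object column.
- A level name of None: DataFrame labels the column NaN, to_frame labels it with the level number (1), and reset_index labels it level_1.
- Duplicate level names: DataFrame keeps both columns, to_frame silently keeps only one of them, and reset_index raises a ValueError.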

I would also suggest that both to_frame and reset_index have an additional parameter allow_duplicates, default False, but this seems controversial (#44755)...

import pandas as pd
# Each case pairs the row tuples with the column/level names applied to them:
# plain ints, an int/bool mix, a missing (None) name, and a duplicated name.
cases = [
    ([(1, 2), (3, 4)], ["a", "b"]),
    ([(1, True), (3, False)], ["a", "b"]),
    ([(1, 2), (3, 4)], ["a", None]),
    ([(1, 2), (3, 4)], ["a", "a"]),
]


def _report(frame):
    """Print a frame followed by the dtype of each of its columns."""
    print(f"{frame}\nDtypes:\n{frame.dtypes}")


for rows, labels in cases:
    print(f"\nvalues: {rows}, names: {labels}")

    # 1. Build the frame directly from the tuples.
    print("\nDataFrame")
    _report(pd.DataFrame(rows, columns=labels))

    # 2. Build a MultiIndex from the same tuples and convert it to a frame.
    print("\nto_frame")
    levels = pd.MultiIndex.from_tuples(rows, names=labels)
    _report(levels.to_frame(index=False))

    # 3. Use that MultiIndex as the index of an empty frame and reset it.
    #    No error handling here on purpose: any exception raised by
    #    reset_index is part of what this script is meant to show.
    print("\nreset_index")
    empty_with_index = pd.DataFrame(index=levels)
    _report(empty_with_index.reset_index())


values: [(1, 2), (3, 4)], names: ['a', 'b']

DataFrame
   a  b
0  1  2
1  3  4
Dtypes:
a    int64
b    int64
dtype: object

to_frame
   a  b
0  1  2
1  3  4
Dtypes:
a    int64
b    int64
dtype: object

reset_index
   a  b
0  1  2
1  3  4
Dtypes:
a    int64
b    int64
dtype: object

values: [(1, True), (3, False)], names: ['a', 'b']

DataFrame
   a      b
0  1   True
1  3  False
Dtypes:
a    int64
b     bool
dtype: object

to_frame
   a      b
0  1   True
1  3  False
Dtypes:
a     int64
b    object
dtype: object

reset_index
   a      b
0  1   True
1  3  False
Dtypes:
a    int64
b     bool
dtype: object

values: [(1, 2), (3, 4)], names: ['a', None]

DataFrame
   a  NaN
0  1    2
1  3    4
Dtypes:
a      int64
NaN    int64
dtype: object

to_frame
   a  1
0  1  2
1  3  4
Dtypes:
a    int64
1    int64
dtype: object

reset_index
   a  level_1
0  1        2
1  3        4
Dtypes:
a          int64
level_1    int64
dtype: object

values: [(1, 2), (3, 4)], names: ['a', 'a']

DataFrame
   a  a
0  1  2
1  3  4
Dtypes:
a    int64
a    int64
dtype: object

to_frame
   a
0  2
1  4
Dtypes:
a    int64
dtype: object

reset_index

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_21224/810260565.py in <module>
     14     print(f"{df}\nDtypes:\n{df.dtypes}")
     15     print("\nreset_index")
---> 16     df = pd.DataFrame(index=index).reset_index()
     17     print(f"{df}\nDtypes:\n{df.dtypes}")

c:\users\john\onedrive\documents\github\pandas_johnzangwill\pandas\util\_decorators.py in wrapper(*args, **kwargs)
    309                     stacklevel=stacklevel,
    310                 )
--> 311             return func(*args, **kwargs)
    312 
    313         return wrapper

c:\users\john\onedrive\documents\github\pandas_johnzangwill\pandas\core\frame.py in reset_index(self, level, drop, inplace, col_level, col_fill)
   5832                     )
   5833 
-> 5834                 new_obj.insert(0, name, level_values)
   5835 
   5836         new_obj.index = new_index

c:\users\john\onedrive\documents\github\pandas_johnzangwill\pandas\core\frame.py in insert(self, loc, column, value, allow_duplicates)
   4433         if not allow_duplicates and column in self.columns:
   4434             # Should this be a different kind of error??
-> 4435             raise ValueError(f"cannot insert {column}, already exists")
   4436         if not isinstance(loc, int):
   4437             raise TypeError("loc must be int")

ValueError: cannot insert a, already exists