ENH: support EA types in read_csv · Issue #23228 · pandas-dev/pandas (original) (raw)

In [3]: df = pd.DataFrame({'Int': pd.Series([1, 2, 3], dtype='Int64'), 'A': [1, 2, 1]})
   ...: df
   ...: 
Out[3]: 
  Int  A
0   1  1
1   2  2
2   3  1

In [4]: data = df.to_csv(index=False)

In [5]: data
Out[5]: 'Int,A\n1,1\n2,2\n3,1\n'

In [6]: from io import StringIO

In [7]: pd.read_csv(StringIO(data), dtype={'Int': 'Int64'})

~/pandas/pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader._read_rows()
    968 
    969         self._start_clock()
--> 970         columns = self._convert_column_data(rows=rows,
    971                                             footer=footer,
    972                                             upcast_na=True)

~/pandas/pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader._convert_column_data()
   1096 
   1097             # Should return as the desired dtype (inferred or specified)
-> 1098             col_res, na_count = self._convert_tokens(
   1099                 i, start, end, name, na_filter, na_hashset,
   1100                 na_flist, col_dtype)

~/pandas/pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader._convert_tokens()
   1121 
   1122         if col_dtype is not None:
-> 1123             col_res, na_count = self._convert_with_dtype(
   1124                 col_dtype, i, start, end, na_filter,
   1125                 1, na_hashset, na_flist)

~/pandas/pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader._convert_with_dtype()
   1249                             "using parse_dates instead".format(dtype=dtype))
   1250         else:
-> 1251             raise TypeError("the dtype {dtype} is not "
   1252                             "supported for parsing".format(dtype=dtype))
   1253 

TypeError: the dtype Int64 is not supported for parsing

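We already support Categorical in `read_csv`; it would be nice to have a general interface so that other extension array (EA) dtypes can be parsed as well.

Passing the dtype class instead of the string alias does not raise, but the column silently comes back as `object` rather than `Int64`: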

In [8]: from pandas.core.arrays.integer import Int64Dtype

In [9]: pd.read_csv(StringIO(data), dtype={'Int': Int64Dtype})
Out[9]: 
  Int  A
0   1  1
1   2  2
2   3  1

In [10]: pd.read_csv(StringIO(data), dtype={'Int': Int64Dtype}).dtypes
Out[10]: 
Int    object
A       int64
dtype: object