"to_hdf()" with "format='table'" ignores encoder "errors" argument. · Issue #20835 · pandas-dev/pandas (original) (raw)
The default encoder options for `to_hdf()` with `format='table'` seem to specify `errors='strict'` as the default for the encoding. The problem is that while one can specify the encoding with, e.g., `encoding='utf-8'`, the encoder `errors` argument cannot be specified.
srs.to_hdf( 'srs.hdf', key = 'srs', mode = 'w', format = 'table' )
UnicodeEncodeError Traceback (most recent call last)
in ()
----> 1 srs.to_hdf( 'srs.hdf', key = 'srs', mode = 'w', format = 'table' )
/usr/lib/python3.6/site-packages/pandas/core/generic.py in to_hdf(self, path_or_buf, key, **kwargs)
1469
1470 from pandas.io import pytables
-> 1471 return pytables.to_hdf(path_or_buf, key, self, **kwargs)
1472
1473 def to_msgpack(self, path_or_buf=None, encoding='utf-8', **kwargs):
/usr/lib/python3.6/site-packages/pandas/io/pytables.py in to_hdf(path_or_buf, key, value, mode, complevel, complib, append, **kwargs)
279 with HDFStore(path_or_buf, mode=mode, complevel=complevel,
280 complib=complib) as store:
--> 281 f(store)
282 else:
283 f(path_or_buf)
/usr/lib/python3.6/site-packages/pandas/io/pytables.py in <lambda>(store)
273 f = lambda store: store.append(key, value, **kwargs)
274 else:
--> 275 f = lambda store: store.put(key, value, **kwargs)
276
277 path_or_buf = _stringify_path(path_or_buf)
/usr/lib/python3.6/site-packages/pandas/io/pytables.py in put(self, key, value, format, append, **kwargs)
864 format = get_option("io.hdf.default_format") or 'fixed'
865 kwargs = self._validate_format(format, kwargs)
--> 866 self._write_to_group(key, value, append=append, **kwargs)
867
868 def remove(self, key, where=None, start=None, stop=None):
/usr/lib/python3.6/site-packages/pandas/io/pytables.py in _write_to_group(self, key, value, format, index, append, complib, encoding, **kwargs)
1339
1340 # write the object
-> 1341 s.write(obj=value, append=append, complib=complib, **kwargs)
1342
1343 if s.is_table and index:
/usr/lib/python3.6/site-packages/pandas/io/pytables.py in write(self, obj, data_columns, **kwargs)
4208 obj.columns = [name]
4209 return super(AppendableSeriesTable, self).write(
-> 4210 obj=obj, data_columns=obj.columns.tolist(), **kwargs)
4211
4212 def read(self, columns=None, **kwargs):
/usr/lib/python3.6/site-packages/pandas/io/pytables.py in write(self, obj, axes, append, complib, complevel, fletcher32, min_itemsize, chunksize, expectedrows, dropna, **kwargs)
3905 self.create_axes(axes=axes, obj=obj, validate=append,
3906 min_itemsize=min_itemsize,
-> 3907 **kwargs)
3908
3909 for a in self.axes:
/usr/lib/python3.6/site-packages/pandas/io/pytables.py in create_axes(self, axes, obj, validate, nan_rep, data_columns, min_itemsize, **kwargs)
3577 self.values_axes.append(col)
3578 except (NotImplementedError, ValueError, TypeError) as e:
-> 3579 raise e
3580 except Exception as detail:
3581 raise Exception(
/usr/lib/python3.6/site-packages/pandas/io/pytables.py in create_axes(self, axes, obj, validate, nan_rep, data_columns, min_itemsize, **kwargs)
3572 encoding=self.encoding,
3573 info=self.info,
-> 3574 **kwargs)
3575 col.set_pos(j)
3576
/usr/lib/python3.6/site-packages/pandas/io/pytables.py in set_atom(self, block, block_items, existing_col, min_itemsize, nan_rep, info, encoding, **kwargs)
1923 min_itemsize,
1924 nan_rep,
-> 1925 encoding)
1926
1927 # set as a data block
/usr/lib/python3.6/site-packages/pandas/io/pytables.py in set_atom_string(self, block, block_items, existing_col, min_itemsize, nan_rep, encoding)
1959
1960 # itemsize is the maximum length of a string (along any dimension)
-> 1961 data_converted = _convert_string_array(data, encoding)
1962 itemsize = data_converted.itemsize
1963
/usr/lib/python3.6/site-packages/pandas/io/pytables.py in _convert_string_array(data, encoding, itemsize)
4569 if encoding is not None and len(data):
4570 data = Series(data.ravel()).str.encode(
-> 4571 encoding).values.reshape(data.shape)
4572
4573 # create the sized dtype
/usr/lib/python3.6/site-packages/pandas/core/strings.py in encode(self, encoding, errors)
1655 @copy(str_encode)
1656 def encode(self, encoding, errors="strict"):
-> 1657 result = str_encode(self._data, encoding, errors)
1658 return self._wrap_result(result)
1659
/usr/lib/python3.6/site-packages/pandas/core/strings.py in str_encode(arr, encoding, errors)
1309 encoder = codecs.getencoder(encoding)
1310 f = lambda x: encoder(x, errors)[0]
-> 1311 return _na_map(f, arr)
1312
1313
/usr/lib/python3.6/site-packages/pandas/core/strings.py in _na_map(f, arr, na_result, dtype)
154 def _na_map(f, arr, na_result=np.nan, dtype=object):
155 # should really check for NA
--> 156 return _map(f, arr, na_mask=True, na_value=na_result, dtype=dtype)
157
158
/usr/lib/python3.6/site-packages/pandas/core/strings.py in _map(f, arr, na_mask, na_value, dtype)
169 try:
170 convert = not all(mask)
--> 171 result = lib.map_infer_mask(arr, f, mask.view(np.uint8), convert)
172 except (TypeError, AttributeError) as e:
173 # Reraise the exception if callable `f` got wrong number of args.
pandas/_libs/src/inference.pyx in pandas._libs.lib.map_infer_mask()
/usr/lib/python3.6/site-packages/pandas/core/strings.py in <lambda>(x)
1308 else:
1309 encoder = codecs.getencoder(encoding)
-> 1310 f = lambda x: encoder(x, errors)[0]
1311 return _na_map(f, arr)
1312
UnicodeEncodeError: 'utf-8' codec can't encode character '\ud800' in position 0: surrogates not allowed
That is to say, I'd like to be able to specify the `errors` argument for the encoder (and, of course, for the decoder also).
Perhaps better yet would be to also change the default errors (at least in this case) to `errors='surrogatepass'` for both the encoder and decoder, so as to preserve the strings as they are without errors or surprises.