DataFrame.query raises ValueError when comparing columns with nullable dtypes · Issue #31913 · pandas-dev/pandas
Code Sample
In [2]: df1 = pd.DataFrame({'A': [1, 1, 2], 'B': [1, 2, 2]})
In [3]: df1.dtypes
Out[3]:
A    int64
B    int64
dtype: object
In [4]: df2 = pd.DataFrame({'A': [1, 1, 2], 'B': [1, 2, 2]}, dtype='Int64')
In [5]: df2.dtypes
Out[5]:
A    Int64
B    Int64
dtype: object
In [6]: df1.query('A == B')
Out[6]:
   A  B
0  1  1
2  2  2
In [7]: df2.query('A == B')
ValueError                                Traceback (most recent call last)
<ipython-input-7-...> in <module>
----> 1 df2.query('A == B')
~/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in query(self, expr, inplace, **kwargs) 3229 kwargs["level"] = kwargs.pop("level", 0) + 1 3230 kwargs["target"] = None -> 3231 res = self.eval(expr, **kwargs) 3232 3233 try:
~/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in eval(self, expr, inplace, **kwargs) 3344 kwargs["resolvers"] = kwargs.get("resolvers", ()) + tuple(resolvers) 3345 -> 3346 return _eval(expr, inplace=inplace, **kwargs) 3347 3348 def select_dtypes(self, include=None, exclude=None) -> "DataFrame":
~/anaconda3/lib/python3.6/site-packages/pandas/core/computation/eval.py in eval(expr, parser, engine, truediv, local_dict, global_dict, resolvers, level, target, inplace) 335 eng = _engines[engine] 336 eng_inst = eng(parsed_expr) --> 337 ret = eng_inst.evaluate() 338 339 if parsed_expr.assigner is None:
~/anaconda3/lib/python3.6/site-packages/pandas/core/computation/engines.py in evaluate(self) 71 72 # make sure no names in resolvers and locals/globals clash ---> 73 res = self._evaluate() 74 return reconstruct_object( 75 self.result_type, res, self.aligned_axes, self.expr.terms.return_type
~/anaconda3/lib/python3.6/site-packages/pandas/core/computation/engines.py in _evaluate(self) 112 scope = env.full_scope 113 _check_ne_builtin_clash(self.expr) --> 114 return ne.evaluate(s, local_dict=scope) 115 116
~/anaconda3/lib/python3.6/site-packages/numexpr/necompiler.py in evaluate(ex, local_dict, global_dict, out, order, casting, **kwargs) 820 # Create a signature 821 signature = [(name, getType(arg)) for (name, arg) in --> 822 zip(names, arguments)] 823 824 # Look up numexpr if possible.
~/anaconda3/lib/python3.6/site-packages/numexpr/necompiler.py in <listcomp>(.0) 819 820 # Create a signature --> 821 signature = [(name, getType(arg)) for (name, arg) in 822 zip(names, arguments)] 823
~/anaconda3/lib/python3.6/site-packages/numexpr/necompiler.py in getType(a) 701 if kind == 'S': 702 return bytes --> 703 raise ValueError("unknown type %s" % a.dtype.name) 704 705
ValueError: unknown type object
Problem description
`DataFrame.query` raises `ValueError: unknown type object` for boolean comparisons when the dtype is one of the new nullable types. (I have tested this for both the `Int64` and `string` dtypes.)
Output of pd.show_versions()
INSTALLED VERSIONS
------------------
commit : None
python : 3.6.8.final.0
python-bits : 64
OS : Darwin
OS-release : 17.7.0
machine : x86_64
processor : i386
byteorder : little
LC_ALL : None
LANG : en_AU.UTF-8
LOCALE : en_AU.UTF-8
pandas : 1.0.1
numpy : 1.18.1
pytz : 2019.3
dateutil : 2.8.1
pip : 20.0.2
setuptools : 45.2.0.post20200210
Cython : 0.29.15
pytest : 5.3.5
hypothesis : 5.4.1
sphinx : 2.4.0
blosc : None
feather : None
xlsxwriter : 1.2.7
lxml.etree : 4.5.0
html5lib : 1.0.1
pymysql : None
psycopg2 : None
jinja2 : 2.11.1
IPython : 7.12.0
pandas_datareader: None
bs4 : 4.8.2
bottleneck : 1.3.1
fastparquet : None
gcsfs : None
lxml.etree : 4.5.0
matplotlib : 3.1.3
numexpr : 2.7.1
odfpy : None
openpyxl : 3.0.3
pandas_gbq : None
pyarrow : None
pytables : None
pytest : 5.3.5
pyxlsb : None
s3fs : None
scipy : 1.4.1
sqlalchemy : 1.3.13
tables : 3.6.1
tabulate : None
xarray : None
xlrd : 1.2.0
xlwt : 1.2.0
xlsxwriter : 1.2.7
numba : 0.48.0