DataFrame.query raises ValueError when comparing columns with nullable dtypes · Issue #31913 · pandas-dev/pandas (original) (raw)

Code Sample

In [2]: df1 = pd.DataFrame({'A': [1, 1, 2], 'B': [1, 2, 2]})

In [3]: df1.dtypes
Out[3]:
A    int64
B    int64
dtype: object

In [4]: df2 = pd.DataFrame({'A': [1, 1, 2], 'B': [1, 2, 2]}, dtype='Int64')

In [5]: df2.dtypes
Out[5]:
A    Int64
B    Int64
dtype: object

In [6]: df1.query('A == B')
Out[6]:
   A  B
0  1  1
2  2  2

In [7]: df2.query('A == B')

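---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-7-...> in <module>
----> 1 df2.query('A == B')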

~/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in query(self, expr, inplace, **kwargs)
   3229         kwargs["level"] = kwargs.pop("level", 0) + 1
   3230         kwargs["target"] = None
-> 3231         res = self.eval(expr, **kwargs)
   3232
   3233         try:

~/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in eval(self, expr, inplace, **kwargs)
   3344         kwargs["resolvers"] = kwargs.get("resolvers", ()) + tuple(resolvers)
   3345
-> 3346         return _eval(expr, inplace=inplace, **kwargs)
   3347
   3348     def select_dtypes(self, include=None, exclude=None) -> "DataFrame":

~/anaconda3/lib/python3.6/site-packages/pandas/core/computation/eval.py in eval(expr, parser, engine, truediv, local_dict, global_dict, resolvers, level, target, inplace)
    335     eng = _engines[engine]
    336     eng_inst = eng(parsed_expr)
--> 337     ret = eng_inst.evaluate()
    338
    339     if parsed_expr.assigner is None:

~/anaconda3/lib/python3.6/site-packages/pandas/core/computation/engines.py in evaluate(self)
     71
     72         # make sure no names in resolvers and locals/globals clash
---> 73         res = self._evaluate()
     74         return reconstruct_object(
     75             self.result_type, res, self.aligned_axes, self.expr.terms.return_type

~/anaconda3/lib/python3.6/site-packages/pandas/core/computation/engines.py in _evaluate(self)
    112         scope = env.full_scope
    113         _check_ne_builtin_clash(self.expr)
--> 114         return ne.evaluate(s, local_dict=scope)
    115
    116

~/anaconda3/lib/python3.6/site-packages/numexpr/necompiler.py in evaluate(ex, local_dict, global_dict, out, order, casting, **kwargs)
    820     # Create a signature
    821     signature = [(name, getType(arg)) for (name, arg) in
--> 822                  zip(names, arguments)]
    823
    824     # Look up numexpr if possible.

~/anaconda3/lib/python3.6/site-packages/numexpr/necompiler.py in <listcomp>(.0)
    819
    820     # Create a signature
--> 821     signature = [(name, getType(arg)) for (name, arg) in
    822                  zip(names, arguments)]
    823

~/anaconda3/lib/python3.6/site-packages/numexpr/necompiler.py in getType(a)
    701     if kind == 'S':
    702         return bytes
--> 703     raise ValueError("unknown type %s" % a.dtype.name)
    704
    705

ValueError: unknown type object

Problem description

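`DataFrame.query` raises `ValueError: unknown type object` for boolean comparisons between columns when their dtype is one of the new nullable extension types. (I have tested this with both the `Int64` and `string` dtypes.) The same query on the equivalent `int64` frame (`df1` above) works as expected.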

Output of pd.show_versions()

INSTALLED VERSIONS
------------------
commit           : None
python           : 3.6.8.final.0
python-bits      : 64
OS               : Darwin
OS-release       : 17.7.0
machine          : x86_64
processor        : i386
byteorder        : little
LC_ALL           : None
LANG             : en_AU.UTF-8
LOCALE           : en_AU.UTF-8

pandas           : 1.0.1
numpy            : 1.18.1
pytz             : 2019.3
dateutil         : 2.8.1
pip              : 20.0.2
setuptools       : 45.2.0.post20200210
Cython           : 0.29.15
pytest           : 5.3.5
hypothesis       : 5.4.1
sphinx           : 2.4.0
blosc            : None
feather          : None
xlsxwriter       : 1.2.7
lxml.etree       : 4.5.0
html5lib         : 1.0.1
pymysql          : None
psycopg2         : None
jinja2           : 2.11.1
IPython          : 7.12.0
pandas_datareader: None
bs4              : 4.8.2
bottleneck       : 1.3.1
fastparquet      : None
gcsfs            : None
lxml.etree       : 4.5.0
matplotlib       : 3.1.3
numexpr          : 2.7.1
odfpy            : None
openpyxl         : 3.0.3
pandas_gbq       : None
pyarrow          : None
pytables         : None
pytest           : 5.3.5
pyxlsb           : None
s3fs             : None
scipy            : 1.4.1
sqlalchemy       : 1.3.13
tables           : 3.6.1
tabulate         : None
xarray           : None
xlrd             : 1.2.0
xlwt             : 1.2.0
xlsxwriter       : 1.2.7
numba            : 0.48.0