BUG: 1.3.0 column assignment via single column np.matrix
behaviour change · Issue #42376 · pandas-dev/pandas (original) (raw)
- I have checked that this issue has not already been reported.
- I have confirmed this bug exists on the latest version of pandas.
- (optional) I have confirmed this bug exists on the master branch of pandas.
Code Sample, a copy-pastable example
import pandas as pd
import numpy as np
from scipy import sparse

X = sparse.random(100, 100, density=0.2, format="csr")
df = pd.DataFrame({"a": np.arange(100)})
df["X_sum"] = X.sum(axis=1)
df
Before 1.3.0, this worked fine. As of 1.3.0, displaying df fails with:
traceback
ValueError Traceback (most recent call last) /usr/local/lib/python3.8/site-packages/IPython/core/formatters.py in call(self, obj) 700 type_pprinters=self.type_printers, 701 deferred_pprinters=self.deferred_printers) --> 702 printer.pretty(obj) 703 printer.flush() 704 return stream.getvalue()
/usr/local/lib/python3.8/site-packages/IPython/lib/pretty.py in pretty(self, obj)
392 if cls is not object
393 and callable(cls.__dict__.get('__repr__')):
--> 394 return _repr_pprint(obj, self, cycle)
395
396 return _default_pprint(obj, self, cycle)
/usr/local/lib/python3.8/site-packages/IPython/lib/pretty.py in repr_pprint(obj, p, cycle) 698 """A pprint that just redirects to the normal repr function.""" 699 # Find newlines and replace them with p.break() --> 700 output = repr(obj) 701 lines = output.splitlines() 702 with p.group():
/usr/local/lib/python3.8/site-packages/pandas/core/frame.py in __repr__(self) 993 else: 994 width = None --> 995 self.to_string( 996 buf=buf, 997 max_rows=max_rows,
/usr/local/lib/python3.8/site-packages/pandas/core/frame.py in to_string(self, buf, columns, col_space, header, index, na_rep, formatters, float_format, sparsify, index_names, justify, max_rows, min_rows, max_cols, show_dimensions, decimal, line_width, max_colwidth, encoding) 1129 decimal=decimal, 1130 ) -> 1131 return fmt.DataFrameRenderer(formatter).to_string( 1132 buf=buf, 1133 encoding=encoding,
/usr/local/lib/python3.8/site-packages/pandas/io/formats/format.py in to_string(self, buf, encoding, line_width) 1051 1052 string_formatter = StringFormatter(self.fmt, line_width=line_width) -> 1053 string = string_formatter.to_string() 1054 return save_to_buffer(string, buf=buf, encoding=encoding) 1055
/usr/local/lib/python3.8/site-packages/pandas/io/formats/string.py in to_string(self) 23 24 def to_string(self) -> str: ---> 25 text = self._get_string_representation() 26 if self.fmt.should_show_dimensions: 27 text = "".join([text, self.fmt.dimensions_info])
/usr/local/lib/python3.8/site-packages/pandas/io/formats/string.py in _get_string_representation(self) 38 return self._empty_info_line 39 ---> 40 strcols = self._get_strcols() 41 42 if self.line_width is None:
/usr/local/lib/python3.8/site-packages/pandas/io/formats/string.py in _get_strcols(self) 29 30 def _get_strcols(self) -> list[list[str]]: ---> 31 strcols = self.fmt.get_strcols() 32 if self.fmt.is_truncated: 33 strcols = self._insert_dot_separators(strcols)
/usr/local/lib/python3.8/site-packages/pandas/io/formats/format.py in get_strcols(self) 538 Render a DataFrame to a list of columns (as lists of strings). 539 """ --> 540 strcols = self._get_strcols_without_index() 541 542 if self.index:
/usr/local/lib/python3.8/site-packages/pandas/io/formats/format.py in _get_strcols_without_index(self) 802 int(self.col_space.get(c, 0)), *(self.adj.len(x) for x in cheader) 803 ) --> 804 fmt_values = self.format_col(i) 805 fmt_values = _make_fixed_width( 806 fmt_values, self.justify, minimum=header_colwidth, adj=self.adj
/usr/local/lib/python3.8/site-packages/pandas/io/formats/format.py in format_col(self, i) 816 frame = self.tr_frame 817 formatter = self._get_formatter(i) --> 818 return format_array( 819 frame.iloc[:, i]._values, 820 formatter,
/usr/local/lib/python3.8/site-packages/pandas/io/formats/format.py in format_array(values, formatter, float_format, na_rep, digits, space, justify, decimal, leading_space, quoting) 1238 ) 1239 -> 1240 return fmt_obj.get_result() 1241 1242
/usr/local/lib/python3.8/site-packages/pandas/io/formats/format.py in get_result(self) 1269 1270 def get_result(self) -> list[str]: -> 1271 fmt_values = self._format_strings() 1272 return _make_fixed_width(fmt_values, self.justify) 1273
/usr/local/lib/python3.8/site-packages/pandas/io/formats/format.py in _format_strings(self) 1516 1517 def _format_strings(self) -> list[str]: -> 1518 return list(self.get_result_as_array()) 1519 1520
/usr/local/lib/python3.8/site-packages/pandas/io/formats/format.py in get_result_as_array(self) 1480 float_format = lambda value: self.float_format % value 1481 -> 1482 formatted_values = format_values_with(float_format) 1483 1484 if not self.fixed_width:
/usr/local/lib/python3.8/site-packages/pandas/io/formats/format.py in format_values_with(float_format) 1454 values = self.values 1455 is_complex = is_complex_dtype(values) -> 1456 values = format_with_na_rep(values, formatter, na_rep) 1457 1458 if self.fixed_width:
/usr/local/lib/python3.8/site-packages/pandas/io/formats/format.py in format_with_na_rep(values, formatter, na_rep) 1425 mask = isna(values) 1426 formatted = np.array( -> 1427 [ 1428 formatter(val) if not m else na_rep 1429 for val, m in zip(values.ravel(), mask.ravel())
/usr/local/lib/python3.8/site-packages/pandas/io/formats/format.py in (.0) 1426 formatted = np.array( 1427 [ -> 1428 formatter(val) if not m else na_rep 1429 for val, m in zip(values.ravel(), mask.ravel()) 1430 ]
ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
ValueError Traceback (most recent call last) /usr/local/lib/python3.8/site-packages/IPython/core/formatters.py in call(self, obj) 343 method = get_real_method(obj, self.print_method) 344 if method is not None: --> 345 return method() 346 return None 347 else:
/usr/local/lib/python3.8/site-packages/pandas/core/frame.py in _repr_html_(self) 1045 decimal=".", 1046 ) -> 1047 return fmt.DataFrameRenderer(formatter).to_html(notebook=True) 1048 else: 1049 return None
/usr/local/lib/python3.8/site-packages/pandas/io/formats/format.py in to_html(self, buf, encoding, classes, notebook, border, table_id, render_links) 1027 render_links=render_links, 1028 ) -> 1029 string = html_formatter.to_string() 1030 return save_to_buffer(string, buf=buf, encoding=encoding) 1031
/usr/local/lib/python3.8/site-packages/pandas/io/formats/html.py in to_string(self) 70 71 def to_string(self) -> str: ---> 72 lines = self.render() 73 if any(isinstance(x, str) for x in lines): 74 lines = [str(x) for x in lines]
/usr/local/lib/python3.8/site-packages/pandas/io/formats/html.py in render(self) 619 self.write("
/usr/local/lib/python3.8/site-packages/pandas/io/formats/html.py in render(self) 76 77 def render(self) -> list[str]: ---> 78 self._write_table() 79 80 if self.should_show_dimensions:
/usr/local/lib/python3.8/site-packages/pandas/io/formats/html.py in _write_table(self, indent) 246 self._write_header(indent + self.indent_delta) 247 --> 248 self._write_body(indent + self.indent_delta) 249 250 self.write("", indent)
/usr/local/lib/python3.8/site-packages/pandas/io/formats/html.py in _write_body(self, indent) 393 def _write_body(self, indent: int) -> None: 394 self.write("<tbody>", indent) --> 395 fmt_values = self._get_formatted_values() 396 397 # write values
/usr/local/lib/python3.8/site-packages/pandas/io/formats/html.py in _get_formatted_values(self) 583 584 def _get_formatted_values(self) -> dict[int, list[str]]: --> 585 return {i: self.fmt.format_col(i) for i in range(self.ncols)} 586 587 def _get_columns_formatted_values(self) -> list[str]:
/usr/local/lib/python3.8/site-packages/pandas/io/formats/html.py in (.0) 583 584 def _get_formatted_values(self) -> dict[int, list[str]]: --> 585 return {i: self.fmt.format_col(i) for i in range(self.ncols)} 586 587 def _get_columns_formatted_values(self) -> list[str]:
/usr/local/lib/python3.8/site-packages/pandas/io/formats/format.py in format_col(self, i) 816 frame = self.tr_frame 817 formatter = self._get_formatter(i) --> 818 return format_array( 819 frame.iloc[:, i]._values, 820 formatter,
/usr/local/lib/python3.8/site-packages/pandas/io/formats/format.py in format_array(values, formatter, float_format, na_rep, digits, space, justify, decimal, leading_space, quoting) 1238 ) 1239 -> 1240 return fmt_obj.get_result() 1241 1242
/usr/local/lib/python3.8/site-packages/pandas/io/formats/format.py in get_result(self) 1269 1270 def get_result(self) -> list[str]: -> 1271 fmt_values = self._format_strings() 1272 return _make_fixed_width(fmt_values, self.justify) 1273
/usr/local/lib/python3.8/site-packages/pandas/io/formats/format.py in _format_strings(self) 1516 1517 def _format_strings(self) -> list[str]: -> 1518 return list(self.get_result_as_array()) 1519 1520
/usr/local/lib/python3.8/site-packages/pandas/io/formats/format.py in get_result_as_array(self) 1480 float_format = lambda value: self.float_format % value 1481 -> 1482 formatted_values = format_values_with(float_format) 1483 1484 if not self.fixed_width:
/usr/local/lib/python3.8/site-packages/pandas/io/formats/format.py in format_values_with(float_format) 1454 values = self.values 1455 is_complex = is_complex_dtype(values) -> 1456 values = format_with_na_rep(values, formatter, na_rep) 1457 1458 if self.fixed_width:
/usr/local/lib/python3.8/site-packages/pandas/io/formats/format.py in format_with_na_rep(values, formatter, na_rep) 1425 mask = isna(values) 1426 formatted = np.array( -> 1427 [ 1428 formatter(val) if not m else na_rep 1429 for val, m in zip(values.ravel(), mask.ravel())
/usr/local/lib/python3.8/site-packages/pandas/io/formats/format.py in (.0) 1426 formatted = np.array( 1427 [ -> 1428 formatter(val) if not m else na_rep 1429 for val, m in zip(values.ravel(), mask.ravel()) 1430 ]
ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
I discovered this new behaviour due to our tests starting to fail. What was causing that was:
df["sum"] = X.sum(axis=1) df["log1p_sum"] = np.log1p(df["sum"])
failing with:
traceback
ValueError Traceback (most recent call last) in 1 df["sum"] = X.sum(axis=1) ----> 2 df["log1p_sum"] = np.log1p(df["sum"])
/usr/local/lib/python3.8/site-packages/pandas/core/frame.py in __setitem__(self, key, value) 3605 else: 3606 # set column -> 3607 self._set_item(key, value) 3608 3609 def _setitem_slice(self, key: slice, value):
/usr/local/lib/python3.8/site-packages/pandas/core/frame.py in _set_item(self, key, value) 3777 ensure homogeneity. 3778 """ -> 3779 value = self._sanitize_column(value) 3780 3781 if (
/usr/local/lib/python3.8/site-packages/pandas/core/frame.py in _sanitize_column(self, value) 4502 4503 if is_list_like(value): -> 4504 com.require_length_match(value, self.index) 4505 return sanitize_array(value, self.index, copy=True, allow_2d=True) 4506
/usr/local/lib/python3.8/site-packages/pandas/core/common.py in require_length_match(data, index) 525 """ 526 if len(data) != len(index): --> 527 raise ValueError( 528 "Length of values " 529 f"({len(data)}) "
ValueError: Length of values (1) does not match length of index (100)
Problem description
This problem is triggered because the result of X.sum(axis=1), when X is a scipy sparse matrix, is not a 1-D numpy ndarray but an np.matrix with one column (shape (100, 1)). pandas used to handle this, but as of 1.3.0 it no longer does.
This is a problem because it's a behaviour change that breaks existing code. As far as I can tell from the release notes, this was not an intentional behaviour change. It does look like some things around column assignment did change, and I imagine that assigning with deprecated numpy types was not considered.
Expected Output
I would expect this to not error, and for this to pass: np.testing.assert_array_equal(df["X_sum"], np.ravel(X.sum(axis=1)))
Output of pd.show_versions()
INSTALLED VERSIONS
------------------
commit : f00ed8f47020034e752baf0250483053340971b0
python : 3.8.10.final.0
python-bits : 64
OS : Darwin
OS-release : 20.5.0
Version : Darwin Kernel Version 20.5.0: Sat May 8 05:10:33 PDT 2021; root:xnu-7195.121.3~9/RELEASE_X86_64
machine : x86_64
processor : i386
byteorder : little
LC_ALL : None
LANG : en_US.UTF-8
LOCALE : en_US.UTF-8
pandas : 1.3.0
numpy : 1.21.0
pytz : 2020.1
dateutil : 2.8.1
pip : 21.1.3
setuptools : 56.0.0
Cython : 0.29.23
pytest : 6.2.4
hypothesis : None
sphinx : 4.0.2
blosc : None
feather : None
xlsxwriter : None
lxml.etree : 4.6.3
html5lib : None
pymysql : None
psycopg2 : None
jinja2 : 2.11.2
IPython : 7.23.1
pandas_datareader: None
bs4 : 4.9.3
bottleneck : None
fsspec : 2021.06.0
fastparquet : 0.4.1
gcsfs : None
matplotlib : 3.4.2
numexpr : 2.7.2
odfpy : None
openpyxl : None
pandas_gbq : None
pyarrow : 4.0.1
pyxlsb : None
s3fs : 0.4.2
scipy : 1.7.0
sqlalchemy : 1.3.18
tables : 3.6.1
tabulate : 0.8.7
xarray : 0.18.2
xlrd : 1.2.0
xlwt : None
numba : 0.53.1