BUG: 1.3.0 column assignment via single column `np.matrix` behaviour change · Issue #42376 · pandas-dev/pandas (original) (raw)

I have checked that this issue has not already been reported.
I have confirmed this bug exists on the latest version of pandas.
(optional) I have confirmed this bug exists on the master branch of pandas.

Code Sample, a copy-pastable example

import pandas as pd
import numpy as np
from scipy import sparse

X = sparse.random(100, 100, density=0.2, format="csr")
df = pd.DataFrame({"a": np.arange(100)})
df["X_sum"] = X.sum(axis=1)
df

Before 1.3.0, this worked fine. As of 1.3.0, displaying df fails with:

traceback

ValueError Traceback (most recent call last) /usr/local/lib/python3.8/site-packages/IPython/core/formatters.py in __call__(self, obj) 700 type_pprinters=self.type_printers, 701 deferred_pprinters=self.deferred_printers) --> 702 printer.pretty(obj) 703 printer.flush() 704 return stream.getvalue()

/usr/local/lib/python3.8/site-packages/IPython/lib/pretty.py in pretty(self, obj) 392 if cls is not object \
393 and callable(cls.__dict__.get('__repr__')): --> 394 return _repr_pprint(obj, self, cycle) 395 396 return _default_pprint(obj, self, cycle)

/usr/local/lib/python3.8/site-packages/IPython/lib/pretty.py in _repr_pprint(obj, p, cycle) 698 """A pprint that just redirects to the normal repr function.""" 699 # Find newlines and replace them with p.break() --> 700 output = repr(obj) 701 lines = output.splitlines() 702 with p.group():

/usr/local/lib/python3.8/site-packages/pandas/core/frame.py in __repr__(self) 993 else: 994 width = None --> 995 self.to_string( 996 buf=buf, 997 max_rows=max_rows,

/usr/local/lib/python3.8/site-packages/pandas/core/frame.py in to_string(self, buf, columns, col_space, header, index, na_rep, formatters, float_format, sparsify, index_names, justify, max_rows, min_rows, max_cols, show_dimensions, decimal, line_width, max_colwidth, encoding) 1129 decimal=decimal, 1130 ) -> 1131 return fmt.DataFrameRenderer(formatter).to_string( 1132 buf=buf, 1133 encoding=encoding,

/usr/local/lib/python3.8/site-packages/pandas/io/formats/format.py in to_string(self, buf, encoding, line_width) 1051 1052 string_formatter = StringFormatter(self.fmt, line_width=line_width) -> 1053 string = string_formatter.to_string() 1054 return save_to_buffer(string, buf=buf, encoding=encoding) 1055

/usr/local/lib/python3.8/site-packages/pandas/io/formats/string.py in to_string(self) 23 24 def to_string(self) -> str: ---> 25 text = self._get_string_representation() 26 if self.fmt.should_show_dimensions: 27 text = "".join([text, self.fmt.dimensions_info])

/usr/local/lib/python3.8/site-packages/pandas/io/formats/string.py in _get_string_representation(self) 38 return self._empty_info_line 39 ---> 40 strcols = self._get_strcols() 41 42 if self.line_width is None:

/usr/local/lib/python3.8/site-packages/pandas/io/formats/string.py in _get_strcols(self) 29 30 def _get_strcols(self) -> list[list[str]]: ---> 31 strcols = self.fmt.get_strcols() 32 if self.fmt.is_truncated: 33 strcols = self._insert_dot_separators(strcols)

/usr/local/lib/python3.8/site-packages/pandas/io/formats/format.py in get_strcols(self) 538 Render a DataFrame to a list of columns (as lists of strings). 539 """ --> 540 strcols = self._get_strcols_without_index() 541 542 if self.index:

/usr/local/lib/python3.8/site-packages/pandas/io/formats/format.py in _get_strcols_without_index(self) 802 int(self.col_space.get(c, 0)), *(self.adj.len(x) for x in cheader) 803 ) --> 804 fmt_values = self.format_col(i) 805 fmt_values = _make_fixed_width( 806 fmt_values, self.justify, minimum=header_colwidth, adj=self.adj

/usr/local/lib/python3.8/site-packages/pandas/io/formats/format.py in format_col(self, i) 816 frame = self.tr_frame 817 formatter = self._get_formatter(i) --> 818 return format_array( 819 frame.iloc[:, i]._values, 820 formatter,

/usr/local/lib/python3.8/site-packages/pandas/io/formats/format.py in format_array(values, formatter, float_format, na_rep, digits, space, justify, decimal, leading_space, quoting) 1238 ) 1239 -> 1240 return fmt_obj.get_result() 1241 1242

/usr/local/lib/python3.8/site-packages/pandas/io/formats/format.py in get_result(self) 1269 1270 def get_result(self) -> list[str]: -> 1271 fmt_values = self._format_strings() 1272 return _make_fixed_width(fmt_values, self.justify) 1273

/usr/local/lib/python3.8/site-packages/pandas/io/formats/format.py in _format_strings(self) 1516 1517 def _format_strings(self) -> list[str]: -> 1518 return list(self.get_result_as_array()) 1519 1520

/usr/local/lib/python3.8/site-packages/pandas/io/formats/format.py in get_result_as_array(self) 1480 float_format = lambda value: self.float_format % value 1481 -> 1482 formatted_values = format_values_with(float_format) 1483 1484 if not self.fixed_width:

/usr/local/lib/python3.8/site-packages/pandas/io/formats/format.py in format_values_with(float_format) 1454 values = self.values 1455 is_complex = is_complex_dtype(values) -> 1456 values = format_with_na_rep(values, formatter, na_rep) 1457 1458 if self.fixed_width:

/usr/local/lib/python3.8/site-packages/pandas/io/formats/format.py in format_with_na_rep(values, formatter, na_rep) 1425 mask = isna(values) 1426 formatted = np.array( -> 1427 [ 1428 formatter(val) if not m else na_rep 1429 for val, m in zip(values.ravel(), mask.ravel())

/usr/local/lib/python3.8/site-packages/pandas/io/formats/format.py in <listcomp>(.0) 1426 formatted = np.array( 1427 [ -> 1428 formatter(val) if not m else na_rep 1429 for val, m in zip(values.ravel(), mask.ravel()) 1430 ]

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

ValueError Traceback (most recent call last) /usr/local/lib/python3.8/site-packages/IPython/core/formatters.py in __call__(self, obj) 343 method = get_real_method(obj, self.print_method) 344 if method is not None: --> 345 return method() 346 return None 347 else:

/usr/local/lib/python3.8/site-packages/pandas/core/frame.py in _repr_html_(self) 1045 decimal=".", 1046 ) -> 1047 return fmt.DataFrameRenderer(formatter).to_html(notebook=True) 1048 else: 1049 return None

/usr/local/lib/python3.8/site-packages/pandas/io/formats/format.py in to_html(self, buf, encoding, classes, notebook, border, table_id, render_links) 1027 render_links=render_links, 1028 ) -> 1029 string = html_formatter.to_string() 1030 return save_to_buffer(string, buf=buf, encoding=encoding) 1031

/usr/local/lib/python3.8/site-packages/pandas/io/formats/html.py in to_string(self) 70 71 def to_string(self) -> str: ---> 72 lines = self.render() 73 if any(isinstance(x, str) for x in lines): 74 lines = [str(x) for x in lines]

/usr/local/lib/python3.8/site-packages/pandas/io/formats/html.py in render(self) 619 self.write("<div>") 620 self.write_style() --> 621 super().render() 622 self.write("</div>") 623 return self.elements

/usr/local/lib/python3.8/site-packages/pandas/io/formats/html.py in render(self) 76 77 def render(self) -> list[str]: ---> 78 self._write_table() 79 80 if self.should_show_dimensions:

/usr/local/lib/python3.8/site-packages/pandas/io/formats/html.py in _write_table(self, indent) 246 self._write_header(indent + self.indent_delta) 247 --> 248 self._write_body(indent + self.indent_delta) 249 250 self.write("</table>", indent)

/usr/local/lib/python3.8/site-packages/pandas/io/formats/html.py in _write_body(self, indent) 393 def _write_body(self, indent: int) -> None: 394 self.write("<tbody>", indent) --> 395 fmt_values = self._get_formatted_values() 396 397 # write values

/usr/local/lib/python3.8/site-packages/pandas/io/formats/html.py in _get_formatted_values(self) 583 584 def _get_formatted_values(self) -> dict[int, list[str]]: --> 585 return {i: self.fmt.format_col(i) for i in range(self.ncols)} 586 587 def _get_columns_formatted_values(self) -> list[str]:

/usr/local/lib/python3.8/site-packages/pandas/io/formats/html.py in <dictcomp>(.0) 583 584 def _get_formatted_values(self) -> dict[int, list[str]]: --> 585 return {i: self.fmt.format_col(i) for i in range(self.ncols)} 586 587 def _get_columns_formatted_values(self) -> list[str]:

/usr/local/lib/python3.8/site-packages/pandas/io/formats/format.py in _format_strings(self) 1516 1517 def _format_strings(self) -> list[str]: -> 1518 return list(self.get_result_as_array()) 1519 1520

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

I discovered this new behaviour because our tests started failing. The failing code was:

df["sum"] = X.sum(axis=1)
df["log1p_sum"] = np.log1p(df["sum"])

failing with:

traceback

ValueError Traceback (most recent call last) in 1 df["sum"] = X.sum(axis=1) ----> 2 df["log1p_sum"] = np.log1p(df["sum"])

/usr/local/lib/python3.8/site-packages/pandas/core/frame.py in __setitem__(self, key, value) 3605 else: 3606 # set column -> 3607 self._set_item(key, value) 3608 3609 def _setitem_slice(self, key: slice, value):

/usr/local/lib/python3.8/site-packages/pandas/core/frame.py in _set_item(self, key, value) 3777 ensure homogeneity. 3778 """ -> 3779 value = self._sanitize_column(value) 3780 3781 if (

/usr/local/lib/python3.8/site-packages/pandas/core/frame.py in _sanitize_column(self, value) 4502 4503 if is_list_like(value): -> 4504 com.require_length_match(value, self.index) 4505 return sanitize_array(value, self.index, copy=True, allow_2d=True) 4506

/usr/local/lib/python3.8/site-packages/pandas/core/common.py in require_length_match(data, index) 525 """ 526 if len(data) != len(index): --> 527 raise ValueError( 528 "Length of values " 529 f"({len(data)}) "

ValueError: Length of values (1) does not match length of index (100)

Problem description

This problem is being triggered because the result of X.sum(axis=1) when X is a scipy sparse matrix is not a 1d numpy ndarray, but a np.matrix with one column. This used to be handled by pandas, but now isn't.

This is a problem because it's a behaviour change that breaks existing code. As far as I can tell from the release notes, this was not an intentional behaviour change. It does look like some things around column assignment did change, and I imagine that assigning with deprecated numpy types was not considered.

Expected Output

I would expect this to not error, and for this to pass: np.testing.assert_array_equal(df["X_sum"], np.ravel(X.sum(axis=1)))

Output of `pd.show_versions()`

INSTALLED VERSIONS
------------------
commit           : f00ed8f47020034e752baf0250483053340971b0
python           : 3.8.10.final.0
python-bits      : 64
OS               : Darwin
OS-release       : 20.5.0
Version          : Darwin Kernel Version 20.5.0: Sat May  8 05:10:33 PDT 2021; root:xnu-7195.121.3~9/RELEASE_X86_64
machine          : x86_64
processor        : i386
byteorder        : little
LC_ALL           : None
LANG             : en_US.UTF-8
LOCALE           : en_US.UTF-8

pandas           : 1.3.0
numpy            : 1.21.0
pytz             : 2020.1
dateutil         : 2.8.1
pip              : 21.1.3
setuptools       : 56.0.0
Cython           : 0.29.23
pytest           : 6.2.4
hypothesis       : None
sphinx           : 4.0.2
blosc            : None
feather          : None
xlsxwriter       : None
lxml.etree       : 4.6.3
html5lib         : None
pymysql          : None
psycopg2         : None
jinja2           : 2.11.2
IPython          : 7.23.1
pandas_datareader: None
bs4              : 4.9.3
bottleneck       : None
fsspec           : 2021.06.0
fastparquet      : 0.4.1
gcsfs            : None
matplotlib       : 3.4.2
numexpr          : 2.7.2
odfpy            : None
openpyxl         : None
pandas_gbq       : None
pyarrow          : 4.0.1
pyxlsb           : None
s3fs             : 0.4.2
scipy            : 1.7.0
sqlalchemy       : 1.3.18
tables           : 3.6.1
tabulate         : 0.8.7
xarray           : 0.18.2
xlrd             : 1.2.0
xlwt             : None
numba            : 0.53.1

BUG: 1.3.0 column assignment via single column `np.matrix` behaviour change · Issue #42376 · pandas-dev/pandas (original) (raw)

Code Sample, a copy-pastable example

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

Problem description

Expected Output

Output of pd.show_versions()

BUG: 1.3.0 column assignment via single column `np.matrix` behaviour change · Issue #42376 · pandas-dev/pandas (original) (raw)

Output of `pd.show_versions()`