PERF: faster corrwith method for pearson and spearman correlation when other is a Series and axis = 0 (column-wise) by fractionalhare · Pull Request #46174 · pandas-dev/pandas (original) (raw)
@jreback In addition to the changes you asked for and the perf tests I've shown with timeit, here is a demonstration that the method is 1) still robust to nulls and 2) functionally equivalent to the existing logic. Given an unmodified build of pandas:
import timeit
import numpy as np
import pandas as pd
from typing import Union
# test modified method against df.corrwith in unmodified Pandas build by making standalone function
def corr_with(
    df: pd.DataFrame,
    other: Union[pd.Series, pd.DataFrame],
    method: str,
    axis: int,
) -> pd.Series:
    """Correlate every numeric column of ``df`` with ``other``.

    A NumPy fast path is taken when ``axis == 0``, ``method`` is
    ``"pearson"`` or ``"spearman"`` and ``other`` is a ``pd.Series``.
    Every other combination is delegated to ``df.corrwith``.

    On the fast path:

    * ``df`` and ``other`` are aligned on their index labels (inner join)
      before any values are compared, so the result does not depend on
      the row order of ``other``.
    * For each column, rows where either the column or ``other`` is NaN
      are dropped pairwise.
    * Spearman is computed as the Pearson correlation of average ranks,
      so tied values share a rank.
    * A column with fewer than two valid paired observations yields NaN.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose numeric columns are correlated with ``other``.
    other : pd.Series or pd.DataFrame
        Object to correlate against.
    method : str
        Correlation method; ``"pearson"`` and ``"spearman"`` use the
        fast path.
    axis : int
        ``0`` selects the column-wise fast path.

    Returns
    -------
    pd.Series
        On the fast path, one float per numeric column of ``df``,
        indexed by column label; otherwise whatever ``df.corrwith``
        returns.
    """
    use_fast_path = (
        axis == 0
        and method in ("pearson", "spearman")
        and isinstance(other, pd.Series)
    )
    if not use_fast_path:
        return df.corrwith(other, method=method, axis=axis)

    # Align on index labels first: comparing raw positional values would
    # pair up the wrong rows whenever the two indexes differ.
    numeric, aligned = df.select_dtypes(include=np.number).align(
        other, join="inner", axis=0
    )
    # Cast to float so missing values become NaN and np.isnan is usable
    # even for nullable / extension numeric dtypes.
    column_values = numeric.astype(float).to_numpy().T
    other_values = aligned.astype(float).to_numpy()
    other_valid = ~np.isnan(other_values)  # loop-invariant, computed once

    results = []
    for column in column_values:
        valid = ~np.isnan(column) & other_valid
        x = column[valid]
        y = other_values[valid]
        if x.size < 2:
            # Correlation is undefined for fewer than two observations.
            results.append(np.nan)
            continue
        if method == "spearman":
            # Average ranks handle ties; argsort().argsort() would hand
            # tied values arbitrary distinct ranks.
            x = pd.Series(x).rank().to_numpy()
            y = pd.Series(y).rank().to_numpy()
        # A constant input has zero variance; the result is NaN, so the
        # floating-point warning adds nothing.
        with np.errstate(divide="ignore", invalid="ignore"):
            results.append(np.corrcoef(x, y)[0, 1])
    return pd.Series(results, index=numeric.columns, dtype=float)
# create a sample dataframe and series, ideally with nans
df = pd.DataFrame({'a': [1.4, 3.5, np.nan, 5.6, np.nan, 7.8], 'b': [5.7, 3.6, 2.3, 1.5, 8.7, 3.2]})
s = pd.Series([1.4, 6.5, 7.4, 2.6, np.nan, 3.4])
# a: result from the built-in df.corrwith; b: result from the standalone function
a = df.corrwith(s, method='pearson', axis=0)
b = corr_with(df, s, 'pearson', 0)
# element-wise comparison of the two results, tolerant of float round-off
np.isclose(a, b)
which will output
Out[14]: array([ True, True])
Likewise, if we use Spearman instead of Pearson:
# Repeat the comparison with rank-based (Spearman) correlation:
# a is the built-in result, b is the standalone function's result.
a = df.corrwith(s, axis=0, method='spearman')
b = corr_with(df, s, method='spearman', axis=0)
np.isclose(a, b)
we have
Out[17]: array([ True, True])