PERF: faster corrwith method for pearson and spearman correlation when other is a Series and axis = 0 (column-wise) by fractionalhare · Pull Request #46174 · pandas-dev/pandas (original) (raw)

@jreback In addition to the changes you asked for and the perf tests I've shown with timeit, here is a demonstration that the method is 1) still robust to nulls and 2) functionally equivalent to the existing logic. Given an unmodified build of pandas:

import timeit
import numpy as np
import pandas as pd
from typing import Union

# test modified method against df.corrwith in unmodified Pandas build by making standalone function
def corr_with(df: pd.DataFrame, other: Union[pd.Series, pd.DataFrame], method: str, axis: int) -> pd.Series:
    """Correlate each numeric column of ``df`` with ``other``.

    A NumPy fast path is used when ``axis == 0``, ``method`` is ``"pearson"``
    or ``"spearman"``, ``other`` is a Series, and ``other`` shares ``df``'s
    index. Every other case is delegated to ``DataFrame.corrwith``.

    Parameters
    ----------
    df : DataFrame whose numeric columns are correlated against ``other``.
    other : Series (fast path) or DataFrame (always delegated).
    method : correlation method name, e.g. ``"pearson"`` or ``"spearman"``.
    axis : 0 for column-wise correlation; any other value is delegated.

    Returns
    -------
    Series indexed by the numeric column labels of ``df``. A column with
    fewer than two complete (non-NaN) pairs yields NaN.
    """
    fast_path = (
        axis == 0
        and method in ("pearson", "spearman")
        and isinstance(other, pd.Series)
        # The loop below pairs values purely by position, which is only
        # correct when both objects already share the same index. Otherwise
        # defer to pandas, which aligns on labels.
        and other.index.equals(df.index)
    )
    if not fast_path:
        return df.corrwith(other, method=method, axis=axis)

    numeric_cols = df.select_dtypes(include=np.number).columns
    ndf = df[numeric_cols].values.transpose()
    k = other.values
    k_null = np.isnan(k)  # loop-invariant, so computed once

    corrs = {}
    for col, r in zip(numeric_cols, ndf):
        # Pairwise deletion: keep rows where both sides are present.
        mask = ~(np.isnan(r) | k_null)
        if mask.sum() < 2:
            # Correlation is undefined; skip np.corrcoef to avoid its
            # runtime warnings on empty / single-element input.
            corrs[col] = np.nan
            continue
        x, y = r[mask], k[mask]
        if method == "spearman":
            # Spearman is Pearson on ranks. Ties must receive their average
            # rank; argsort().argsort() would assign them arbitrary distinct
            # ranks and give a different coefficient.
            x = pd.Series(x).rank().to_numpy()
            y = pd.Series(y).rank().to_numpy()
        corrs[col] = np.corrcoef(x, y)[0, 1]
    # Explicit dtype keeps an empty result float rather than object.
    return pd.Series(corrs, dtype="float64")

# create a sample dataframe and series, ideally with nans
df = pd.DataFrame({'a': [1.4, 3.5, np.nan, 5.6, np.nan, 7.8], 'b': [5.7, 3.6, 2.3, 1.5, 8.7, 3.2]})

s = pd.Series([1.4, 6.5, 7.4, 2.6, np.nan, 3.4])

# a: result from the unmodified pandas build; b: result from the standalone
# function under test. Each is computed once and compared element-wise.
a = df.corrwith(s, method='pearson', axis=0)
b = corr_with(df, s, 'pearson', 0)
np.isclose(a, b)

which will output

Out[14]: array([ True,  True])

Likewise, if we repeat the comparison with Spearman instead of Pearson:

# Same element-wise comparison as above, using Spearman rank correlation:
# pandas' own result first, the standalone function's result second.
expected = df.corrwith(s, method="spearman", axis=0)
actual = corr_with(df, s, "spearman", 0)
np.isclose(expected, actual)

we have

Out[17]: array([ True,  True])