Chapter 19 - Metric Predicted Variable with One Nominal Predictor¶
- 19.3 - Hierarchical Bayesian Approach
- 19.4 - Adding a Metric Predictor
- 19.5 - Heterogeneous Variances and Robustness against Outliers
In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pymc3 as pm
import theano.tensor as tt
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

from scipy.stats import norm
from IPython.display import Image
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

%matplotlib inline
plt.style.use('seaborn-white')
color = '#87ceeb'
In [2]:
%load_ext watermark
%watermark -p pandas,numpy,pymc3,theano,matplotlib,seaborn,scipy

pandas     0.23.4
numpy      1.15.1
pymc3      3.5
theano     1.0.2
matplotlib 2.2.3
seaborn    0.9.0
scipy      1.1.0
In [3]:
def gammaShRaFromModeSD(mode, sd):
    """Calculate Gamma shape and rate from mode and sd."""
    rate = (mode + np.sqrt(mode**2 + 4 * sd**2)) / (2 * sd**2)
    shape = 1 + mode * rate
    return(shape, rate)
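As a quick sanity check (not part of the original notebook), the returned shape and rate can be verified against the standard Gamma identities mode = (shape - 1)/rate and sd = sqrt(shape)/rate; the example mode/sd values below are arbitrary.

# Hypothetical sanity check of gammaShRaFromModeSD.
shape, rate = gammaShRaFromModeSD(mode=10, sd=20)
print((shape - 1) / rate)       # recovers the mode: 10.0
print(np.sqrt(shape) / rate)    # recovers the sd: 20.0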
In [4]:
def plot_mustache(var, sd, j, axis, width=.75):
    for i in np.arange(start=0, stop=len(var), step=int(len(var)*.1)):
        rv = norm(loc=var[i], scale=sd[i])
        yrange = np.linspace(rv.ppf(0.01), rv.ppf(0.99), 100)
        xrange = rv.pdf(yrange)

        # When the SD of a group is large compared to the others, the top of its mustache is
        # relatively low and does not plot well together with low-SD groups.
        # Scale the xrange so that the 'height' of all the mustaches is 0.75.
        xrange_scaled = xrange*(width/xrange.max())

        # Use the negative value to flip the mustache in the right direction.
        axis.plot(-xrange_scaled+j, yrange, color=color, alpha=.6)
In [5]:
def plot_cred_lines(b0, bj, bcov, x, ax):
    """Plot credible posterior prediction lines for the model in section 19.4."""
    B = pd.DataFrame(np.c_[b0, bj, bcov], columns=['beta0', 'betaj', 'betacov'])

    # Credible posterior prediction lines
    hpd_interval = pm.hpd(B.values, alpha=0.05)
    B_hpd = B[B.beta0.between(*hpd_interval[0,:]) &
              B.betaj.between(*hpd_interval[1,:]) &
              B.betacov.between(*hpd_interval[2,:])]
    xrange = np.linspace(x.min()*.95, x.max()*1.05)

    for i in np.random.randint(0, len(B_hpd), 10):
        ax.plot(xrange, B_hpd.iloc[i,0]+B_hpd.iloc[i,1]+B_hpd.iloc[i,2]*xrange,
                c=color, alpha=.6, zorder=0)
19.3 - Hierarchical Bayesian Approach¶
In [6]:
df = pd.read_csv('data/FruitflyDataReduced.csv', dtype={'CompanionNumber':'category'})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125 entries, 0 to 124
Data columns (total 3 columns):
Longevity          125 non-null int64
CompanionNumber    125 non-null category
Thorax             125 non-null float64
dtypes: category(1), float64(1), int64(1)
memory usage: 2.3 KB
In [7]:
df.groupby('CompanionNumber').head(2)
Out[7]:
|     | Longevity | CompanionNumber | Thorax |
|-----|-----------|-----------------|--------|
| 0   | 35        | Pregnant8       | 0.64   |
| 1   | 37        | Pregnant8       | 0.68   |
| 25  | 40        | None0           | 0.64   |
| 26  | 37        | None0           | 0.70   |
| 50  | 46        | Pregnant1       | 0.64   |
| 51  | 42        | Pregnant1       | 0.68   |
| 75  | 21        | Virgin1         | 0.68   |
| 76  | 40        | Virgin1         | 0.68   |
| 100 | 16        | Virgin8         | 0.64   |
| 101 | 19        | Virgin8         | 0.64   |
In [8]:
# Count the number of records per nominal group
df.CompanionNumber.value_counts()
Out[8]:
Virgin8      25
Virgin1      25
Pregnant8    25
Pregnant1    25
None0        25
Name: CompanionNumber, dtype: int64
Model (Kruschke, 2015)¶
In [9]:
Image('images/fig19_2.png')
Out[9]:
In [10]:
x = df.CompanionNumber.cat.codes.values
y = df.Longevity
yMean = y.mean()
ySD = y.std()
NxLvl = len(df.CompanionNumber.cat.categories)

agammaShRa = gammaShRaFromModeSD(ySD/2, 2*ySD)
with pm.Model() as model1:
    aSigma = pm.Gamma('aSigma', agammaShRa[0], agammaShRa[1])
    a0 = pm.Normal('a0', yMean, tau=1/(ySD*5)**2)
    a = pm.Normal('a', 0.0, tau=1/aSigma**2, shape=NxLvl)
    ySigma = pm.Uniform('ySigma', ySD/100, ySD*10)

    y = pm.Normal('y', a0 + a[x], tau=1/ySigma**2, observed=y)

    # Convert a0, a to sum-to-zero b0, b
    m = pm.Deterministic('m', a0 + a)
    b0 = pm.Deterministic('b0', tt.mean(m))
    b = pm.Deterministic('b', m - b0)

pm.model_to_graphviz(model1)
Out[10]:
[model_to_graphviz output: plate diagram of model1 — aSigma ~ Gamma, a0 ~ Normal, ySigma ~ Uniform; a ~ Normal in a plate of 5; deterministics m, b0, b; likelihood y ~ Normal in a plate of 125]
In [11]:
with model1:
    trace1 = pm.sample(3000, cores=4, nuts_kwargs={'target_accept': 0.95})

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [ySigma, a, a0, aSigma]
Sampling 4 chains: 100%|██████████| 14000/14000 [00:14<00:00, 983.22draws/s]
The number of effective samples is smaller than 25% for some parameters.
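Because b is defined as the deflection of each group mean from b0, the posterior draws of b should sum to (numerically) zero across groups. A minimal check along these lines, not part of the original notebook:

# Hypothetical check: the sum-to-zero constraint should hold for every posterior draw.
print(np.allclose(trace1['b'].sum(axis=1), 0))   # expected: True (up to floating point)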
Figure 19.3 (top)¶
In [13]:
# Here we plot the metric predicted variable for each group. Then we superimpose the
# posterior predictive distribution.
None0 = trace1['m'][:,0]
Pregnant1 = trace1['m'][:,1]
Pregnant8 = trace1['m'][:,2]
Virgin1 = trace1['m'][:,3]
Virgin8 = trace1['m'][:,4]
scale = trace1['ySigma'][:]

fig, ax = plt.subplots(1,1, figsize=(8,5))
ax.set_title('Data with Posterior Predictive Distribution')

sns.swarmplot('CompanionNumber', 'Longevity', data=df, ax=ax)
ax.set_xlim(xmin=-1)

for i, grp in enumerate([None0, Pregnant1, Pregnant8, Virgin1, Virgin8]):
    plot_mustache(grp, scale, i, ax)
Contrasts¶
In [14]:
fig, axes = plt.subplots(2,4, figsize=(15,6))

contrasts = [np.mean([Pregnant1, Pregnant8], axis=0)-None0,
             np.mean([Pregnant1, Pregnant8, None0], axis=0)-Virgin1,
             Virgin1-Virgin8,
             np.mean([Pregnant1, Pregnant8, None0], axis=0)-np.mean([Virgin1, Virgin8], axis=0)]

contrast_titles = ['Pregnant1.Pregnant8 \n vs \n None0',
                   'Pregnant1.Pregnant8.None0 \n vs \n Virgin1',
                   'Virgin1 \n vs \n Virgin8',
                   'Pregnant1.Pregnant8.None0 \n vs \n Virgin1.Virgin8']

for contr, ctitle, ax_top, ax_bottom in zip(contrasts, contrast_titles, fig.axes[:4], fig.axes[4:]):
    pm.plot_posterior(contr, ref_val=0, color=color, ax=ax_top)
    pm.plot_posterior(contr/scale, ref_val=0, color=color, ax=ax_bottom)
    ax_top.set_title(ctitle)
    ax_bottom.set_title(ctitle)
    ax_top.set_xlabel('Difference')
    ax_bottom.set_xlabel('Effect Size')
fig.tight_layout()
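Beyond the plotted HDIs, the posterior probability that each contrast is positive can be read directly off the samples. A minimal sketch (not in the original notebook), reusing the contrasts and contrast_titles defined above:

# Hypothetical summary: posterior probability that each contrast is greater than zero.
for contr, ctitle in zip(contrasts, contrast_titles):
    print(ctitle.replace('\n', ''), round((contr > 0).mean(), 3))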
19.4 - Adding a Metric Predictor¶
Model (Kruschke, 2015)¶
In [15]:
Image('images/fig19_4.png')
Out[15]:
In [16]:
y = df.Longevity
yMean = y.mean()
ySD = y.std()
xNom = df.CompanionNumber.cat.categories
xMet = df.Thorax
xMetMean = df.Thorax.mean()
xMetSD = df.Thorax.std()
NxNomLvl = len(df.CompanionNumber.cat.categories)

X = pd.concat([df.Thorax, pd.get_dummies(df.CompanionNumber, drop_first=True)], axis=1)
lmInfo = LinearRegression().fit(X, y)
residSD = np.sqrt(mean_squared_error(y, lmInfo.predict(X)))

agammaShRa = gammaShRaFromModeSD(ySD/2, 2*ySD)
with pm.Model() as model2:
    aSigma = pm.Gamma('aSigma', agammaShRa[0], agammaShRa[1])
    a0 = pm.Normal('a0', yMean, tau=1/(ySD*5)**2)
    a = pm.Normal('a', 0.0, tau=1/aSigma**2, shape=NxNomLvl)
    aMet = pm.Normal('aMet', 0, tau=1/(2*ySD/xMetSD)**2)
    ySigma = pm.Uniform('ySigma', residSD/100, ySD*10)

    mu = a0 + a[x] + aMet*(xMet - xMetMean)
    y = pm.Normal('y', mu, tau=1/ySigma**2, observed=y)

    # Convert a0, a to sum-to-zero b0, b
    b0 = pm.Deterministic('b0', a0 + tt.mean(a) + aMet*(-xMetMean))
    b = pm.Deterministic('b', a - tt.mean(a))

pm.model_to_graphviz(model2)
Out[16]:
[model_to_graphviz output: plate diagram of model2 — aSigma ~ Gamma, a0 ~ Normal, aMet ~ Normal, ySigma ~ Uniform; a ~ Normal in a plate of 5; deterministics b0, b; likelihood y ~ Normal in a plate of 125]
In [17]:
with model2:
    trace2 = pm.sample(3000, cores=4, nuts_kwargs={'target_accept': 0.95})

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [ySigma, aMet, a, a0, aSigma]
Sampling 4 chains: 100%|██████████| 14000/14000 [00:18<00:00, 761.72draws/s]
The number of effective samples is smaller than 25% for some parameters.
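To gauge whether adding Thorax as a metric predictor improves predictive fit, one option (not in the original notebook) is to compare the two models' WAIC. A minimal sketch, assuming trace1 and model1 from section 19.3 are still in memory and using the PyMC3 3.5 API:

# Hypothetical model comparison via WAIC (on the deviance scale, lower is better).
waic1 = pm.waic(trace1, model1)
waic2 = pm.waic(trace2, model2)
print('model1 (nominal only):    ', waic1.WAIC)
print('model2 (nominal + Thorax):', waic2.WAIC)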
Figure 19.5¶
In [19]:
# Here we plot, for every group, the predicted variable and the metric predictor.
# Superimposed are the posterior predictive distributions.
fg = sns.FacetGrid(df, col='CompanionNumber', despine=False)
fg.map(plt.scatter, 'Thorax', 'Longevity', facecolor='none', edgecolor='r')

plt.suptitle('Data with Posterior Predictive Distribution', y=1.10, fontsize=15)
for i, ax in enumerate(fg.axes.flatten()):
    plot_cred_lines(trace2['b0'], trace2['b'][:,i], trace2['aMet'][:], xMet, ax)
    ax.set_xticks(np.arange(.6, 1.1, .1))
Contrasts¶
In [20]:
None0 = trace2['b'][:,0]
Pregnant1 = trace2['b'][:,1]
Pregnant8 = trace2['b'][:,2]
Virgin1 = trace2['b'][:,3]
Virgin8 = trace2['b'][:,4]
scale = trace2['ySigma']

fig, axes = plt.subplots(2,4, figsize=(15,6))

contrasts = [np.mean([Pregnant1, Pregnant8], axis=0)-None0,
             np.mean([Pregnant1, Pregnant8, None0], axis=0)-Virgin1,
             Virgin1-Virgin8,
             np.mean([Pregnant1, Pregnant8, None0], axis=0)-np.mean([Virgin1, Virgin8], axis=0)]

for contr, ctitle, ax_top, ax_bottom in zip(contrasts, contrast_titles, fig.axes[:4], fig.axes[4:]):
    pm.plot_posterior(contr, ref_val=0, color=color, ax=ax_top)
    pm.plot_posterior(contr/scale, ref_val=0, color=color, ax=ax_bottom)
    ax_top.set_title(ctitle)
    ax_bottom.set_title(ctitle)
    ax_top.set_xlabel('Difference')
    ax_bottom.set_xlabel('Effect Size')
fig.tight_layout()
19.5 - Heterogeneous Variances and Robustness against Outliers¶
In [21]:
df2 = pd.read_csv('data/NonhomogVarData.csv', dtype={'Group':'category'})
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96 entries, 0 to 95
Data columns (total 2 columns):
Group    96 non-null category
Y        96 non-null float64
dtypes: category(1), float64(1)
memory usage: 976.0 bytes
In [22]:
df2.groupby('Group').head(3)
Out[22]:
|    | Group | Y          |
|----|-------|------------|
| 0  | A     | 97.770214  |
| 1  | A     | 99.919872  |
| 2  | A     | 92.372917  |
| 24 | B     | 98.246778  |
| 25 | B     | 98.736006  |
| 26 | B     | 98.722708  |
| 48 | C     | 102.432580 |
| 49 | C     | 102.198665 |
| 50 | C     | 103.052658 |
| 72 | D     | 97.561346  |
| 73 | D     | 92.912256  |
| 74 | D     | 96.500329  |
Model (Kruschke, 2015)¶
In [23]:
Image('images/fig19_6.png')
Out[23]:
In [24]:
y = df2.Y
x = df2.Group.cat.codes.values
xlevels = df2.Group.cat.categories
NxLvl = len(xlevels)
yMean = y.mean()
ySD = y.std()

aGammaShRa = gammaShRaFromModeSD(ySD/2, 2*ySD)
medianCellSD = df2.groupby('Group').std().dropna().median()
with pm.Model() as model3:
    aSigma = pm.Gamma('aSigma', aGammaShRa[0], aGammaShRa[1])
    a0 = pm.Normal('a0', yMean, tau=1/(ySD*10)**2)
    a = pm.Normal('a', 0.0, tau=1/aSigma**2, shape=NxLvl)

    # Per-group noise SDs: convert the Gamma mode/SD hyperparameters to shape/rate,
    # following the same formulas as gammaShRaFromModeSD above (tt.sqrt keeps it symbolic).
    ySigmaSD = pm.Gamma('ySigmaSD', aGammaShRa[0], aGammaShRa[1])
    ySigmaMode = pm.Gamma('ySigmaMode', aGammaShRa[0], aGammaShRa[1])
    ySigmaRa = (ySigmaMode + tt.sqrt(ySigmaMode**2 + 4*ySigmaSD**2)) / (2*ySigmaSD**2)
    ySigmaSh = 1 + ySigmaMode*ySigmaRa
    sigma = pm.Gamma('sigma', ySigmaSh, ySigmaRa, shape=NxLvl)
    ySigma = pm.Deterministic('ySigma', tt.maximum(sigma, medianCellSD/1000))

    nu_minus1 = pm.Exponential('nu_minus1', 1/29.)
    nu = pm.Deterministic('nu', nu_minus1+1)

    like = pm.StudentT('y', nu=nu, mu=a0 + a[x], sd=ySigma[x], observed=y)

    # Convert a0, a to sum-to-zero b0, b
    m = pm.Deterministic('m', a0 + a)
    b0 = pm.Deterministic('b0', tt.mean(m))
    b = pm.Deterministic('b', m - b0)

pm.model_to_graphviz(model3)
Out[24]:
[model_to_graphviz output: plate diagram of model3 — aSigma, ySigmaSD, ySigmaMode ~ Gamma, a0 ~ Normal, nu_minus1 ~ Exponential; a ~ Normal and sigma ~ Gamma in a plate of 4; deterministics nu, ySigma, m, b0, b; likelihood y ~ StudentT in a plate of 96]
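A side note on the normality parameter (not in the original notebook): with nu_minus1 ~ Exponential(1/29), the prior mean of nu is 30, balancing heavy-tailed and nearly normal behavior (Kruschke, 2015). The implied prior probabilities are easy to verify:

# Hypothetical check of the prior on nu = nu_minus1 + 1, with nu_minus1 ~ Exponential(rate=1/29).
from scipy.stats import expon
nu_prior = expon(scale=29)       # scipy parameterizes by scale = 1/rate
print(1 + nu_prior.mean())       # prior mean of nu: 30.0
print(nu_prior.cdf(30 - 1))      # P(nu < 30) ≈ 0.63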
In [30]:
with model3:
    # Initializing NUTS with advi since jitter seems to create a problem in this model.
    # https://github.com/pymc-devs/pymc3/issues/2897
    trace3 = pm.sample(3000, init='advi+adapt_diag', cores=4,
                       nuts_kwargs={'target_accept': 0.95})
Auto-assigning NUTS sampler...
Initializing NUTS using advi+adapt_diag...
Average Loss = 268.21: 14%|█▍ | 28661/200000 [00:10<01:03, 2707.41it/s]
Convergence achieved at 28900
Interrupted at 28,899 [14%]: Average Loss = 30,887
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [nu_minus1, sigma, ySigmaMode, ySigmaSD, a, a0, aSigma]
Sampling 4 chains: 100%|██████████| 14000/14000 [01:16<00:00, 183.03draws/s]
The number of effective samples is smaller than 25% for some parameters.
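Before interpreting the per-group SDs it is worth confirming that the chains mixed well. A minimal convergence check (not in the original notebook) using the standard PyMC3 3.5 summary:

# Hypothetical convergence check: Rhat (Gelman-Rubin) and effective sample sizes.
summary3 = pm.summary(trace3, varnames=['b0', 'b', 'ySigma', 'nu'])
print(summary3[['mean', 'n_eff', 'Rhat']])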
Model that assumes equal variances¶
In [32]:
y = df2.Y
x = df2.Group.cat.codes.values
xlevels = df2.Group.cat.categories
NxLvl = len(xlevels)
yMean = y.mean()
ySD = y.std()

aGammaShRa = gammaShRaFromModeSD(ySD/2, 2*ySD)

with pm.Model() as model3b:
    aSigma = pm.Gamma('aSigma', aGammaShRa[0], aGammaShRa[1])
    a0 = pm.Normal('a0', yMean, tau=1/(ySD*5)**2)
    a = pm.Normal('a', 0.0, tau=1/aSigma**2, shape=NxLvl)
    ySigma = pm.Uniform('ySigma', ySD/100, ySD*10)

    y = pm.Normal('y', a0 + a[x], tau=1/ySigma**2, observed=y)

    # Convert a0, a to sum-to-zero b0, b
    m = pm.Deterministic('m', a0 + a)
    b0 = pm.Deterministic('b0', tt.mean(m))
    b = pm.Deterministic('b', m - b0)

pm.model_to_graphviz(model3b)
Out[32]:
[model_to_graphviz output: plate diagram of model3b — aSigma ~ Gamma, a0 ~ Normal, ySigma ~ Uniform; a ~ Normal in a plate of 4; deterministics m, b0, b; likelihood y ~ Normal in a plate of 96]
In [33]:
with model3b:
    trace3b = pm.sample(3000, cores=4, nuts_kwargs={'target_accept': 0.95})

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [ySigma, a, a0, aSigma]
Sampling 4 chains: 100%|██████████| 14000/14000 [00:19<00:00, 723.93draws/s]
The number of effective samples is smaller than 25% for some parameters.
Figure 19.7¶
In [35]:
group_a = trace3b['m'][:,0]
group_b = trace3b['m'][:,1]
group_c = trace3b['m'][:,2]
group_d = trace3b['m'][:,3]
scale = trace3b['ySigma']

fig, ax = plt.subplots(1,1, figsize=(8,6))
ax.set_title('Data with Posterior Predictive Distribution\n(Homogeneous variance)')

sns.swarmplot('Group', 'Y', data=df2, size=5, ax=ax)
ax.set_xlim(xmin=-1)

for i, grp in enumerate([group_a, group_b, group_c, group_d]):
    plot_mustache(grp, scale, i, ax)
In [36]:
fig, axes = plt.subplots(2,2, figsize=(8,6))
contrasts = [group_d-group_a, group_c-group_b]
contrast_titles = ['D vs A', 'C vs B']
for contr, ctitle, ax_top, ax_bottom in zip(contrasts, contrast_titles, fig.axes[:2], fig.axes[2:]):
    pm.plot_posterior(contr, ref_val=0, color=color, ax=ax_top)
    pm.plot_posterior(contr/scale, ref_val=0, color=color, ax=ax_bottom)
    ax_top.set_title(ctitle)
    ax_bottom.set_title(ctitle)
    ax_top.set_xlabel('Difference')
    ax_bottom.set_xlabel('Effect Size')
fig.tight_layout()
Figure 19.8¶
In [37]:
group_a = trace3['m'][:,0]
group_b = trace3['m'][:,1]
group_c = trace3['m'][:,2]
group_d = trace3['m'][:,3]
scale_a = trace3['ySigma'][:,0]
scale_b = trace3['ySigma'][:,1]
scale_c = trace3['ySigma'][:,2]
scale_d = trace3['ySigma'][:,3]

fig, ax = plt.subplots(1,1, figsize=(8,6))
ax.set_title('Data with Posterior Predictive Distribution\n(Heterogeneous variances)')

sns.swarmplot('Group', 'Y', data=df2, size=5, ax=ax)
ax.set_xlim(xmin=-1)

for i, (grp, scale) in enumerate(zip([group_a, group_b, group_c, group_d],
                                     [scale_a, scale_b, scale_c, scale_d])):
    plot_mustache(grp, scale, i, ax)
Contrasts¶
In [38]:
fig, axes = plt.subplots(2,2, figsize=(8,6))
contrasts = [group_d-group_a, group_c-group_b]
scales = [scale_d**2 + scale_a**2, scale_c**2 + scale_b**2]
contrast_titles = ['D vs A', 'C vs B']
for contr, scale, ctitle, ax_top, ax_bottom in zip(contrasts, scales, contrast_titles, fig.axes[:2], fig.axes[2:]):
    pm.plot_posterior(contr, ref_val=0, color=color, ax=ax_top)
    pm.plot_posterior(contr/(np.sqrt(scale/2)), ref_val=0, color=color, ax=ax_bottom)
    ax_top.set_title(ctitle)
    ax_bottom.set_title(ctitle)
    ax_top.set_xlabel('Difference')
    ax_bottom.set_xlabel('Effect Size')
fig.tight_layout()