Dustin Lennon
Applied Scientist
dlennon.org

statsmodels reproducible regression

Reproducing Results: ARM, Chapter 4

This notebook reproduces tables and figures in ARM, Chapter 4 [GH07].


Dustin Lennon
March 2021
https://dlennon.org/20210303_armch04


 
ARM04: Linear regression: before and after fitting the model

Section 4.1: Linear transformations

# Standard imports
import numpy as np
import pandas as pd
import scipy.stats

import statsmodels.api as sm
import statsmodels.formula.api as smf
import patsy

import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
# Custom imports
import common
import datasets
import arm
import summary

import pydlennon.extensions.matplotlib
from pydlennon.contexts.seeded_rng import SeededRng
# Set up the notebook session
from common import session
session._update(usetex=False, fontscale=1.0, seed=1111)

# Set up the summary factory
summarize = summary.Factory()
# Load the earnings dataset
dsf = datasets.Factory()
df = dsf('armch04a.earnings')

# Preview the table
df.head()
earnings height male
record_id
3 50000.0 74.0 1
4 60000.0 66.0 0
5 30000.0 64.0 0
9 51000.0 63.0 0
10 9000.0 64.0 0
Equation 4.1
# Fit a linear model
y,X = patsy.dmatrices("earnings ~ height", data=df, return_type='dataframe')
model = sm.OLS(y,X).fit()
summarize(model)
coef std err t P>|t| [0.025 0.975]
Intercept -6.052e+04 1.04e+04 -5.809 0.000 -8.1e+04 -4.01e+04
height 1255.9797 155.063 8.100 0.000 951.713 1560.247
Statistics
n 1059
k 2
R-squared 0.058
residual sd 19307.173
Figure 4.1
# Plot points
x = df.height
with SeededRng(100) as rng:
    jx = x + rng.uniform(-0.2, 0.2, size=len(x))

# Set up figure, axes
fig = plt.figure(figsize=(10,4))
ax1 = fig.add_subplot(1,2,1)
ax2 = fig.add_subplot(1,2,2)

for ax in [ax1,ax2]:
    ax.scatter(jx,y,s=3)
    ax.abline(*model.params)
    ax.set_xlabel('height')
    ax.set_ylabel('earnings')

ax1.xaxis.set_major_locator( mticker.MultipleLocator(5) )
    
ax2.set_xlim((0,78))
ax2.set_ylim((-100000, 210000));

# OLS parametric bootstrap
with SeededRng(101) as rng:
    beta_sim = arm.sim(model, 50, rng)
    
for beta in beta_sim:
    intercept,slope = beta
    ax1.abline(intercept, slope, color='gray', alpha=0.25, zorder=1)
    ax2.abline(intercept, slope, color='gray', alpha=0.25, zorder=1)

The panel on the right, which extends the x-axis to zero, shows why the intercept estimate is difficult to interpret: it is the predicted earnings for a person of zero height, far outside the range of the data.
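
As a quick illustration (a sketch reusing the model and df objects above), the intercept is the prediction at height zero, while a prediction at the average height falls in a plausible range:

# Sketch: evaluate the fitted line at height = 0 and at the sample mean height
b0, b1 = model.params
print(b0)                          # ~ -6.05e4: "predicted earnings" at zero height
print(b0 + b1 * df.height.mean())  # prediction at the average height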

Section 4.2: Centering and standardizing, especially for models with interactions

df = dsf("armch04.childiq", refresh=True)
df.head()
kid_score mom_hs mom_iq mom_age mom_work
0 65 1 121.117529 27 4
1 98 1 89.361882 25 4
2 85 1 115.443165 27 4
3 83 1 99.449639 25 3
4 115 1 92.745710 27 4
y,X = patsy.dmatrices("kid_score ~ mom_hs + mom_iq + mom_hs:mom_iq", data=df, return_type='dataframe')
model = sm.OLS(y,X).fit()
summarize(model)
coef std err t P>|t| [0.025 0.975]
Intercept -11.4820 13.758 -0.835 0.404 -38.523 15.559
mom_hs 51.2682 15.338 3.343 0.001 21.122 81.414
mom_iq 0.9689 0.148 6.531 0.000 0.677 1.260
mom_hs:mom_iq -0.4843 0.162 -2.985 0.003 -0.803 -0.165
Statistics
n 434
k 4
R-squared 0.230
residual sd 17.971
Centering by subtracting the mean of the data

With the interaction term, the mom_hs coefficient can only be interpreted at mom_iq = 0 (and the mom_iq coefficient at mom_hs = 0), a scenario that isn't meaningful. Interpretation is far simpler after centering the inputs.
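
As a quick check (a sketch reusing the uncentered fit above), evaluating the mom_hs comparison at the average mom_iq recovers the much smaller coefficient reported by the centered fits that follow:

# Sketch: the mom_hs comparison from the uncentered fit, evaluated at the average mom_iq
b = model.params
print(b['mom_hs'] + b['mom_hs:mom_iq'] * df.mom_iq.mean())   # ~ 2.84, matching the centered fits below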

# more interpretable; using stateful transforms
formula = "kid_score ~ center(mom_hs) + center(mom_iq) + center(mom_hs):center(mom_iq)"
y,X0 = patsy.dmatrices(formula, data=df)
model = sm.OLS(y,X0).fit()
summarize(model)
coef std err t P>|t| [0.025 0.975]
Intercept 87.6389 0.908 96.565 0.000 85.855 89.423
center(mom_hs) 2.8408 2.427 1.171 0.242 -1.929 7.610
center(mom_iq) 0.5884 0.061 9.712 0.000 0.469 0.707
center(mom_hs):center(mom_iq) -0.4843 0.162 -2.985 0.003 -0.803 -0.165
Statistics
n 434
k 4
R-squared 0.230
residual sd 17.971
formula = "kid_score ~ I(mom_hs-0.5) + I(mom_iq - 100) + I(mom_hs-0.5)*I(mom_iq-100)"
y,X1 = patsy.dmatrices(formula, data=df)
model = sm.OLS(y, X1).fit()
summarize(model)
coef std err t P>|t| [0.025 0.975]
Intercept 86.8273 1.213 71.561 0.000 84.442 89.212
I(mom_hs - 0.5) 2.8408 2.427 1.171 0.242 -1.929 7.610
I(mom_iq - 100) 0.7268 0.081 8.960 0.000 0.567 0.886
I(mom_hs - 0.5):I(mom_iq - 100) -0.4843 0.162 -2.985 0.003 -0.803 -0.165
Statistics
n 434
k 4
R-squared 0.230
residual sd 17.971
# perhaps even easier to interpret; in particular, compare to above
formula = "kid_score ~ mom_hs + center(mom_iq) + mom_hs:center(mom_iq)"
y,X1 = patsy.dmatrices(formula, data=df)
model = sm.OLS(y, X1).fit()
summarize(model)
coef std err t P>|t| [0.025 0.975]
Intercept 85.4069 2.218 38.502 0.000 81.047 89.767
mom_hs 2.8408 2.427 1.171 0.242 -1.929 7.610
center(mom_iq) 0.9689 0.148 6.531 0.000 0.677 1.260
mom_hs:center(mom_iq) -0.4843 0.162 -2.985 0.003 -0.803 -0.165
Statistics
n 434
k 4
R-squared 0.230
residual sd 17.971
Stateful Transforms (patsy)

The patsy package uses stateful transforms to maintain a separation between training and test data: transform parameters, such as the mean used by center, are learned from the training data and reapplied to new data via the build_design_matrices function. The statsmodels formula interface simplifies this further, since the fitted model stores the design information and applies the stored transforms automatically at prediction time; a further check follows the predictions below.

# Using the statsmodels formula interface
model_smf = smf.ols(formula, data=df).fit()
all(model_smf.params == model.params)
True
# Transform some data
z = df.iloc[0:5]
xz = patsy.build_design_matrices([X1.design_info], z)[0]
# Predictions
np.c_[
    model.predict(xz, transform=False),
    model_smf.predict(z)
]
array([[98.48151832, 98.48151832],
       [83.09226994, 83.09226994],
       [95.73163912, 95.73163912],
       [87.98094406, 87.98094406],
       [84.73212235, 84.73212235]])
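
To make the statefulness explicit, here is a small check (a sketch; new_rows is simply a slice of df standing in for unseen data): the center(mom_iq) column built from the stored design info subtracts the mean learned from the original data, not a mean recomputed from the new rows.

# Sketch: stateful transforms reapply the *training* mean to new data
new_rows = df.iloc[5:8]
dm = patsy.build_design_matrices([X1.design_info], new_rows)[0]
j = X1.design_info.column_names.index('center(mom_iq)')
print(np.allclose(np.asarray(dm)[:, j], new_rows.mom_iq - df.mom_iq.mean()))   # True
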
Standardizing by subtracting the mean and dividing by 2 standard deviations

ARM recommends a standardization that divides by two standard deviations. Implementing this in patsy looks like the following:

class ArmStandardize(patsy.state.Standardize):
    def transform(self, x, center=True, rescale=True, ddof=0):
        x = patsy.state.asarray_or_pandas(x, copy=True, dtype=float)
        x_2d = patsy.state.atleast_2d_column_default(x, preserve_pandas=True)

        if center:
            x_2d -= self.current_mean
        if rescale:
            # as in patsy's Standardize, but divide by *two* standard deviations
            x_2d /= 2*np.sqrt(self.current_M2 / (self.current_n - ddof))
            
        return patsy.state.pandas_friendly_reshape(x_2d, x.shape)
        
arm_standardize = patsy.stateful_transform(ArmStandardize)
formula = "kid_score ~ arm_standardize(mom_hs) + arm_standardize(mom_iq) + arm_standardize(mom_hs):arm_standardize(mom_iq)"
model = smf.ols(formula, data=df).fit()
summarize(model)
coef std err t P>|t| [0.025 0.975]
Intercept 87.6389 0.908 96.565 0.000 85.855 89.423
arm_standardize(mom_hs) 2.3313 1.991 1.171 0.242 -1.583 6.245
arm_standardize(mom_iq) 17.6313 1.815 9.712 0.000 14.063 21.199
arm_standardize(mom_hs):arm_standardize(mom_iq) -11.9089 3.989 -2.985 0.003 -19.749 -4.068
Statistics
n 434
k 4
R-squared 0.230
residual sd 17.971

The coefficients of this model are now on a roughly common scale. Because we divide by two standard deviations, the standardized mom_hs takes values near -0.5 and 0.5, so its coefficient still corresponds to a (nearly) one-unit change in mom_hs; that is, we retain an estimate that captures the full range of mom_hs.
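
As a sanity check (a sketch using df and the fit above), undoing the two-standard-deviation rescaling on the mom_iq coefficient recovers the per-IQ-point slope from the centered model:

# Sketch: divide the 2-sd-standardized coefficient by 2*sd(mom_iq)
scale = 2 * df.mom_iq.std(ddof=0)
print(model.params['arm_standardize(mom_iq)'] / scale)   # ~ 0.59, the center(mom_iq) slope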

Section 4.3: Correlation and “regression to the mean”

Figure 4.2
# Simulate correlated MVN data
mu = np.array([2,1], dtype=float)
Omega = np.array([[1, 0.5], [0.5, 1]])
with SeededRng(102) as rng:
    xy = rng.multivariate_normal(mu, Omega, size=2500)
xydf = pd.DataFrame(xy, columns=['y', 'x'])
X = xydf.values
# regression line
model = smf.ols("y ~ x", data=xydf).fit()
intercept,slope = model.params

# principal component lines
mu = np.mean(X, axis=0)
w,V = np.linalg.eig( np.cov(X.T) )
pc_slopes = -V[1,:] / V[0,:]
pc_intercepts = mu[0] - mu[1:] * pc_slopes
x = xydf.x
y = xydf.y

# Set up the figure
fig = plt.figure(figsize=(10,4))
ax1 = fig.add_subplot(1,2,1)
ax2 = fig.add_subplot(1,2,2)

for ax in [ax1,ax2]:
    ax.scatter(x,y,s=3)
    ax.set_xlabel('x')
    ax.set_ylabel('y')
    ax.set_aspect(1)
    
ax1.set_title('principal component lines')    
ax1.abline(pc_intercepts[0], pc_slopes[0], color=common.colors[1])
ax1.abline(pc_intercepts[1], pc_slopes[1], color=common.colors[1], ls=":")

ax2.set_title('regression line of y on x')
ax2.abline(intercept, slope, color=common.colors[1]);

Section 4.4: Logarithmic transformation

Height and earnings example
df = dsf('armch04b.earnings')
formula = "np.log(earnings) ~ height"
y,X = patsy.dmatrices(formula, data=df, return_type="dataframe")
model = sm.OLS(y,X).fit()
summarize(model)
coef std err t P>|t| [0.025 0.975]
Intercept 5.7785 0.451 12.815 0.000 4.894 6.663
height 0.0588 0.007 8.743 0.000 0.046 0.072
Statistics
n 1192
k 2
R-squared 0.060
residual sd 0.893
Figure 4.3

x = X.height

with SeededRng(103) as rng:
    jx = x + rng.uniform(-0.2,0.2,size=len(x))

# helper function to plot a transformed curve
def xplt(ax, beta, **kw):
    xlim = ax.get_xlim()
    x = np.linspace(*xlim, num=100)
    y = np.exp(beta[0] + x * beta[1])
    ax.plot(x,y, **kw)
    ax.set_xlim(xlim)

# Set up the figure
fig = plt.figure(figsize=(10,4))
ax1 = fig.add_subplot(1,2,1)
ax2 = fig.add_subplot(1,2,2)

for ax in [ax1,ax2]:
    ax.set_xlabel('height')

ax1.scatter(jx,y,s=3)
ax1.abline(*model.params)
ax1.set_ylabel("log(earnings)")
ax1.set_title("Log regression on log scale")

ax2.scatter(jx,np.exp(y),s=3)
xplt(ax2, model.params)
ax2.set_ylabel("earnings")
ax2.set_title("Log regression plotted on original scale")

# OLS parametric bootstrap
with SeededRng(104) as rng:
    beta_sim = arm.sim(model, 50, rng)
    
for beta in beta_sim:
    intercept,slope = beta
    ax1.abline(intercept, slope, color='gray', alpha=0.25, zorder=1)
    xplt(ax2, beta, color='gray', alpha=0.25, zorder=1)

Figure 4.4

Omitted, expository.

Building a regression model on the log scale
formula = "np.log(earnings) ~ height + male"
model = smf.ols(formula, data=df).fit()
summarize(model)
coef std err t P>|t| [0.025 0.975]
Intercept 8.1527 0.603 13.530 0.000 6.970 9.335
height 0.0207 0.009 2.218 0.027 0.002 0.039
male 0.4232 0.072 5.841 0.000 0.281 0.565
Statistics
n 1192
k 3
R-squared 0.087
residual sd 0.881
Linear transformation to make coefficients more interpretable
formula = "np.log(earnings) ~ standardize(height) + male + standardize(height):male"
model = smf.ols(formula, data=df).fit()
summarize(model)
coef std err t P>|t| [0.025 0.975]
Intercept 9.5266 0.045 210.878 0.000 9.438 9.615
standardize(height) 0.0654 0.050 1.304 0.193 -0.033 0.164
male 0.4197 0.073 5.748 0.000 0.276 0.563
standardize(height):male 0.0286 0.072 0.400 0.690 -0.112 0.169
Statistics
n 1192
k 4
R-squared 0.087
residual sd 0.881
Section 4.5: Other transformations

Figure 4.5

Omitted, unknown data source.

# A model using an ordered categorical variable, mother's work history after childbirth
df = dsf('armch04.childiq')
df['mom_work'] = df.mom_work.values.codes  # extract integer codes from the ordered categorical (equivalent to df.mom_work.cat.codes)
model = smf.ols("kid_score ~ C(mom_work)", data=df).fit()
summarize(model)
coef std err t P>|t| [0.025 0.975]
Intercept 82.0000 2.305 35.568 0.000 77.469 86.531
C(mom_work)[T.1] 3.8542 3.095 1.245 0.214 -2.229 9.937
C(mom_work)[T.2] 11.5000 3.553 3.237 0.001 4.517 18.483
C(mom_work)[T.3] 5.2098 2.704 1.927 0.055 -0.105 10.524
Statistics
n 434
k 4
R-squared 0.024
residual sd 20.230
Section 4.6: Building regression models for prediction

Example: predicting the yields of mesquite bushes
df = dsf('armch04.mesquite', refresh=True)
df.head()
group diam_1 diam_2 total_height canopy_height density weight
record_id
1 MCD 1.8 1.15 1.30 1.00 1 401.3
2 MCD 1.7 1.35 1.35 1.33 1 513.7
3 MCD 2.8 2.55 2.16 0.60 1 1179.2
4 MCD 1.3 0.85 1.80 1.20 1 308.0
5 MCD 3.3 1.90 1.55 1.05 1 855.2
# Initial model
formula = "weight ~ diam_1 + diam_2 + canopy_height + total_height + density + group"
model = smf.ols(formula, data=df).fit()
summarize(model)
coef std err t P>|t| [0.025 0.975]
Intercept -728.5929 147.159 -4.951 0.000 -1026.250 -430.935
group[T.ALS] -363.2951 100.184 -3.626 0.001 -565.936 -160.654
diam_1 189.6690 112.760 1.682 0.101 -38.410 417.748
diam_2 371.4621 124.378 2.987 0.005 119.883 623.041
canopy_height 355.6653 209.843 1.695 0.098 -68.782 780.113
total_height -101.7325 185.574 -0.548 0.587 -477.091 273.626
density 131.2542 34.355 3.820 0.000 61.764 200.744
Statistics
n 46
k 7
R-squared 0.848
residual sd 268.960
# Descriptive statistics
s = df.describe().T
s['IQR'] = s.loc[:,"75%"] - s.loc[:,"25%"]
s.loc[:,['min','25%','50%','75%','max', 'IQR']]
min 25% 50% 75% max IQR
diam_1 0.80 1.4000 1.950 2.475 5.2 1.0750
diam_2 0.40 1.0000 1.525 1.900 4.0 0.9000
total_height 0.65 1.2000 1.500 1.700 3.0 0.5000
canopy_height 0.50 0.8625 1.100 1.300 2.5 0.4375
density 1.00 1.0000 1.000 2.000 9.0 1.0000
weight 60.20 219.6250 361.850 688.725 4052.0 469.1000
# Multiplicative model
formula =   "np.log(weight) ~ np.log(diam_1) + " \
            "np.log(diam_2) + np.log(canopy_height) + " \
            "np.log(total_height) + np.log(density) + group"
model = smf.ols(formula, data=df).fit()
summarize(model)
coef std err t P>|t| [0.025 0.975]
Intercept 5.3515 0.170 31.391 0.000 5.007 5.696
group[T.ALS] -0.5834 0.129 -4.534 0.000 -0.844 -0.323
np.log(diam_1) 0.3938 0.282 1.397 0.170 -0.177 0.964
np.log(diam_2) 1.1512 0.210 5.477 0.000 0.726 1.576
np.log(canopy_height) 0.3732 0.281 1.330 0.191 -0.194 0.941
np.log(total_height) 0.3943 0.313 1.260 0.215 -0.239 1.027
np.log(density) 0.1093 0.122 0.896 0.376 -0.137 0.356
Statistics
n 46
k 7
R-squared 0.887
residual sd 0.329
# Minimalist model
df['canopy_volume'] = df.diam_1 * df.diam_2 * df.canopy_height
formula = "np.log(weight) ~ np.log(canopy_volume)"
model = smf.ols(formula, data=df).fit()
summarize(model)
coef std err t P>|t| [0.025 0.975]
Intercept 5.1697 0.083 62.066 0.000 5.002 5.338
np.log(canopy_volume) 0.7224 0.055 13.234 0.000 0.612 0.832
Statistics
n 46
k 2
R-squared 0.799
residual sd 0.414
# Adding semantic, interpretable predictors
df['canopy_area'] = df.diam_1 * df.diam_2
df['canopy_shape'] = df.diam_1 / df.diam_2
formula = "np.log(weight) ~ np.log(canopy_volume) + np.log(canopy_area) +" \
    "np.log(canopy_shape) + np.log(total_height) + np.log(density) + group"
model = smf.ols(formula, data=df).fit()
summarize(model)
coef std err t P>|t| [0.025 0.975]
Intercept 5.3515 0.170 31.391 0.000 5.007 5.696
group[T.ALS] -0.5834 0.129 -4.534 0.000 -0.844 -0.323
np.log(canopy_volume) 0.3732 0.281 1.330 0.191 -0.194 0.941
np.log(canopy_area) 0.3993 0.294 1.357 0.183 -0.196 0.994
np.log(canopy_shape) -0.3787 0.231 -1.642 0.109 -0.845 0.088
np.log(total_height) 0.3943 0.313 1.260 0.215 -0.239 1.027
np.log(density) 0.1093 0.122 0.896 0.376 -0.137 0.356
Statistics
n 46
k 7
R-squared 0.887
residual sd 0.329
# Reduced model, post fit assessment
formula = "np.log(weight) ~ np.log(canopy_volume) + np.log(canopy_area) + group"
model = smf.ols(formula, data=df).fit()
summarize(model)
coef std err t P>|t| [0.025 0.975]
Intercept 5.2244 0.088 59.197 0.000 5.046 5.403
group[T.ALS] -0.5269 0.116 -4.558 0.000 -0.760 -0.294
np.log(canopy_volume) 0.6148 0.191 3.215 0.003 0.229 1.001
np.log(canopy_area) 0.2889 0.238 1.215 0.231 -0.191 0.768
Statistics
n 46
k 4
R-squared 0.873
residual sd 0.337
Bibliography

[GH07] Gelman, A., and Hill, J. (2007). Data Analysis Using Regression and Multilevel/Hierarchical Models. Cambridge University Press. Chapter 4.