Python code: “Old Faithful geyser dataset rebooted with Python”

# %load ../standard_import.txt
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import axes3d
import seaborn as sns

from sklearn.preprocessing import scale
import sklearn.linear_model as skl_lm
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm
import statsmodels.formula.api as smf

%matplotlib inline
plt.style.use('seaborn-white')
# Read the geyser CSV into the DataFrame used by every later step.
of = pd.read_csv(filepath_or_buffer='/.../oldfaith.csv')

# Notebook-style inspection: column/dtype summary, then the first rows.
of.info()
of.head()
# Estimator reused for each least-squares fit in this script.
regr = skl_lm.LinearRegression()

# Linear fit of duration_sec on wait_time_min. scikit-learn expects a 2-D
# design matrix, so select the single predictor with a list to get an
# (n, 1) array.
X = of[['wait_time_min']].values
y = of['duration_sec']
regr.fit(X, y)

# Store fitted values and residuals next to the observations.
of['pred1'] = regr.predict(X)
of['resid1'] = y - of['pred1']

# Quadratic fit: duration_sec ~ wait_time_min + wait_time_min**2.
# The design matrix needs the predictor AND its square; selecting the same
# column twice only produces two identical (collinear) columns, which gives
# the linear fit again. Building the array with NumPy also avoids
# DataFrame.as_matrix(), which was removed in pandas 1.0.
wait_time = of['wait_time_min']
X2 = np.column_stack((wait_time, wait_time ** 2))
regr.fit(X2, y)

# Store fitted values and residuals of the quadratic model.
of['pred2'] = regr.predict(X2)
of['resid2'] = of['duration_sec'] - of['pred2']
# Side-by-side residual-vs-fitted plots for the linear and quadratic models.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# (axes, fitted-value column, residual column, panel title)
panels = (
    (ax1, 'pred1', 'resid1', 'Residual Plot for Linear Fit'),       # left
    (ax2, 'pred2', 'resid2', 'Residual Plot for Quadratic Fit'),    # right
)

for ax, fitted_col, resid_col, title in panels:
    # x/y are passed by keyword: current seaborn releases no longer accept
    # them positionally, while the keyword form works on old releases too.
    sns.regplot(x=of[fitted_col], y=of[resid_col], lowess=True, ax=ax,
                line_kws={'color':'r', 'lw':1},
                scatter_kws={'facecolors':'None', 'edgecolors':'k', 'alpha':0.5})
    # Dotted zero-residual reference line spanning the plotted data range.
    x_lo, x_hi = ax.xaxis.get_data_interval()
    ax.hlines(0, xmin=x_lo, xmax=x_hi, linestyles='dotted')
    ax.set_title(title)
    # Set after regplot so these labels replace the ones it derives from
    # the Series names.
    ax.set_xlabel('Fitted values')
    ax.set_ylabel('Residuals')
# Ordinary least squares through the statsmodels formula interface.
# NOTE(review): this regresses wait_time_min on duration_sec, the reverse of
# the scikit-learn fits in this script -- confirm the direction is intended.
est = smf.ols(
    formula='wait_time_min ~ duration_sec',
    data=of,
).fit()
# Second table of the summary (index 1); displayed when run as a notebook cell.
est.summary().tables[1]
# Joint distribution of waiting time and eruption duration, four ways.
# The pasted REPL "..." continuation prompts and a stray leading space made
# the original statements a syntax error; they are removed here. x/y are
# passed by keyword because current seaborn no longer accepts them
# positionally.

# 1) Scatter with a fitted regression line and marginal distributions.
sns.jointplot(x='wait_time_min', y='duration_sec', data=of, kind='reg')

# 2) Bivariate kernel density estimate, no gap between joint and marginals.
g = sns.jointplot(x="wait_time_min", y="duration_sec", data=of,
                  kind="kde", space=0, color="g")

# 3) Scatter with KDE contours drawn underneath the points (zorder=0).
g = (sns.jointplot(x="wait_time_min", y="duration_sec",
                   data=of, color="k")
        .plot_joint(sns.kdeplot, zorder=0, n_levels=6))

# 4) Styled scatter with 15-bin marginal histograms plus a rug.
# Current seaborn draws the marginals with histplot, which has no `rug`
# option, and jointplot no longer takes `annot_kws`; the rug is therefore
# added explicitly and the annotation keyword is dropped.
g = sns.jointplot(x="wait_time_min", y="duration_sec", data=of,
                  marginal_kws=dict(bins=15),
                  s=40, edgecolor="w", linewidth=1)
g.plot_marginals(sns.rugplot)

Leave a Reply

This site uses Akismet to reduce spam. Learn how your comment data is processed.