Old Faithful geyser dataset rebooted with Python

by Dane Miller – 4/9/18

Here is a popular dataset on Old Faithful geyser eruptions in Yellowstone, WY. The dataset comes from Weisberg's (2005) Applied Linear Regression. This type of dataset can be extremely useful to National Park Service rangers for predicting eruptions for visiting tourists. I would highly recommend visiting Yellowstone and seeing Old Faithful in person; it is truly amazing!

Source of the data: http://www.stat.cmu.edu/~larry/all-of-statistics/=data/faithful.dat

Weisberg, S. (2005). Applied Linear Regression, 3rd edition. New York: Wiley, Problem 1.4.

Yellowstone NPS: https://www.nps.gov/yell/planyourvisit/exploreoldfaithful.htm

seaborn.jointplot: https://seaborn.pydata.org/generated/seaborn.jointplot.html

This dataset contains only two variables: the duration of the current eruption and the wait time between eruptions.
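To follow along in Python, here is a minimal loading sketch. The faithful.dat file at the link above starts with a text preamble before the numbers, so the skiprows value and the column names below are assumptions; adjust them to match your copy of the file.

import pandas as pd

# skiprows and the column names ('duration_sec' for eruption duration,
# 'waiting' for wait time) are assumptions -- tweak to your file
df = pd.read_csv('faithful.dat', sep=r'\s+', skiprows=25,
                 names=['index', 'duration_sec', 'waiting'])
df = df.drop(columns='index')
df.head()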

Let's look at a theoretical model: μi = β0 + β1·Xi

μi: expected wait time          Xi: eruption duration

Empirical model: ŷi = b0 + b1·xi

ŷi: predicted wait time          xi: observed duration

              coef     std err   t        P>|t|   [0.025    0.975]
Intercept     35.0774  1.184     29.630   0.000   32.748    37.407
duration_sec  10.7499  0.325     33.111   0.000   10.111    11.389

Wait time = 35.0774 + 10.7499 × Duration
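For reference, a short statsmodels sketch that produces a table like the one above, assuming the df loaded earlier with columns duration_sec and waiting:

import statsmodels.formula.api as smf

# regress wait time on eruption duration; the coefficient table above
# comes from this summary output
model = smf.ols('waiting ~ duration_sec', data=df).fit()
print(model.summary())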

I was initially introduced to this dataset during a stats course in graduate school. My focus then was to complete the problems as quickly as possible so that I could get back to my graduate research. However, I missed some important subtleties in this simple dataset.

Rushing through a dataset in graduate school with Microsoft Excel. Looks pretty crappy! What was I thinking!!!


Plotting the residuals:

The data separates into two groups.
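Here is a quick sketch of that residual plot, using the model fit above:

import matplotlib.pyplot as plt

# residuals vs. fitted values; the two clusters match the short- and
# long-eruption groups in the data
plt.scatter(model.fittedvalues, model.resid, s=9)
plt.axhline(0, color='gray', linestyle='--')
plt.xlabel('fitted wait time')
plt.ylabel('residual')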


Here is the same Old Faithful dataset, now plotted with seaborn.jointplot in Python.
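The one-liner behind these plots, again assuming the column names used above:

import seaborn as sns

# scatter plot with marginal histograms plus a fitted regression line
sns.jointplot(x='duration_sec', y='waiting', data=df, kind='reg')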


Focus your efforts on learning Python or R; it will drastically improve your work. And there you have it: a rebooted Old Faithful dataset plotted with seaborn.jointplot in Python.

San Francisco Police Department traffic stops data (2017) – Python code

http://sanfranciscopolice.org/data#trafficstops

(See file) Stops by Race and Ethnicity – data (2017)

# %load ../standard_import.txt
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import axes3d
import seaborn as sns

from sklearn.preprocessing import scale
import sklearn.linear_model as skl_lm
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm
import statsmodels.formula.api as smf

%matplotlib inline
plt.style.use('seaborn-white')


df = pd.read_csv('/.../sfpd2017.csv')
df.head()
# I renamed the file so that it was easier to load

df.info()
sns.heatmap(df.isnull(),yticklabels=False,cbar=False,cmap='viridis')
# to find missing data in the data set
fig = plt.figure(figsize=(15,9))
fig.suptitle('SFPD demographic chart', fontsize=20)

sns.set_style('whitegrid')
# run each of the following plots in its own notebook cell so they
# don't draw over one another
sns.countplot(x='Race_description',hue='Sex',data=df,palette='RdBu_r')
sns.distplot(df['Age'].dropna(),kde=False,color='darkred',bins=50)
sns.countplot(x='Race_description',data=df)
sns.countplot(x='Sex',data=df)
plt.figure(figsize=(12, 7))
sns.boxplot(x='Race_description',y='Age',data=df,palette='winter')
# bar chart of stop counts by race (Race_description is text, so a plain
# histogram would fail; value_counts gives the counts to plot)
df['Race_description'].value_counts().plot(kind='bar',color='green',figsize=(8,4))
sns.lmplot(x='Time_hour',y='Age',data=df,col='Race_description',hue='Sex',palette='coolwarm',
          aspect=0.6,size=8)
# hexbin demo on synthetic data -- note this overwrites the SFPD
# DataFrame, so reload the csv before reusing df
df = pd.DataFrame(np.random.randn(1000, 2), columns=['Race_description', 'Age'])
df.plot.hexbin(x='Race_description',y='Age',gridsize=25,cmap='Oranges')
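One more view that complements the countplots: an actual table of stop counts by race and sex (a sketch, assuming df holds the reloaded SFPD data and the same column names as above):

# counts of stops broken out by race description and sex
pd.crosstab(df['Race_description'], df['Sex'])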

Trends in emissions data – Python code

# %load ../standard_import.txt
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import axes3d
import seaborn as sns

from sklearn.preprocessing import scale
import sklearn.linear_model as skl_lm
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm
import statsmodels.formula.api as smf

%matplotlib inline
plt.style.use('seaborn-white')
# https://catalog.data.gov/dataset/greenhouse-gas-emissions-from-fuel-combustion-million-metric-tons-beginning-1990
df = pd.read_csv('/.../GreenhouseEmissions.csv') # add your location for your file in ...

df.head()
sns.regplot(df.Year, df.Commercial, order=1, ci=None, scatter_kws={'color':'r', 's':9})
plt.xlim(1990, 2016)
plt.ylim(15,40);
sns.jointplot(x='Year',y='Transportation',data=df,kind='reg')
sns.pairplot(df)
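To put a number on the trend rather than eyeballing the regplot, a quick OLS fit (a sketch, using the Year and Transportation columns from above):

# slope = average change in transportation emissions
# (million metric tons) per year
trend = smf.ols('Transportation ~ Year', data=df).fit()
print(trend.params)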


NBA data analysis 1913-1997 – Python code

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import axes3d
import seaborn as sns

from sklearn.preprocessing import scale
import sklearn.linear_model as skl_lm
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm
import statsmodels.formula.api as smf

%matplotlib inline
plt.style.use('seaborn-white')
a = pd.read_csv('/.../1913-1933nba.csv') # add your location for your file in ...
b = pd.read_csv('/.../1934-1959nba.csv')
c = pd.read_csv('/.../1960-1979 nba.csv')
d = pd.read_csv('/.../1980-1997 nba.csv')

a.head()
# The data is broken into 4 csv files: a - 1913-1933; b - 1934-1959; c - 1960-1979; d - 1980-1997
sns.regplot(a.weight_lbs, a.height_ft, order=1, ci=None, scatter_kws={'color':'g', 's':12})
sns.regplot(b.weight_lbs, b.height_ft, order=1, ci=None, scatter_kws={'color':'r', 's':12})
sns.regplot(c.weight_lbs, c.height_ft, order=1, ci=None, scatter_kws={'color':'b', 's':12})
sns.regplot(d.weight_lbs, d.height_ft, order=1, ci=None, scatter_kws={'color':'y', 's':12})
plt.xlim(140,325)
plt.ylim(ymin=5.5);

# multiple regression lines, with the colors set by single-letter codes
regr = skl_lm.LinearRegression()

X = a.weight_lbs.values.reshape(-1,1)
y = a.height_ft

regr.fit(X,y)
print(regr.intercept_)
print(regr.coef_)

# you can run the regression fit for each of the four datasets
sns.regplot(a.height_ft, a.born, order=1, ci=None, scatter_kws={'color':'g', 's':9})
sns.regplot(b.height_ft, b.born, order=1, ci=None, scatter_kws={'color':'r', 's':9})
sns.regplot(c.height_ft, c.born, order=1, ci=None, scatter_kws={'color':'b', 's':9})
sns.regplot(d.height_ft, d.born, order=1, ci=None, scatter_kws={'color':'y', 's':9})
plt.xlim(5.5, 7.5)
plt.ylim(1913, 1997);

# green data points and blue line indicate 1913-1933; red data points and orange line indicate 1934-1959
# blue data points and green line indicate 1960-1979; yellow data points and red line indicate 1980-1997
a[['weight_lbs', 'height_ft']].describe()
# Run the descriptive statistics for all data sets
# Refit with two predictors (weight and height -> birth year) so the
# surface below has two coefficients to work with
regr.fit(a[['weight_lbs', 'height_ft']], a.born)

# Create a coordinate grid spanning realistic weights and heights
weight_lbs = np.arange(140, 350)
height_ft = np.linspace(5.5, 8, 50)

B1, B2 = np.meshgrid(weight_lbs, height_ft, indexing='xy')
Z = regr.intercept_ + B1*regr.coef_[0] + B2*regr.coef_[1]
# Create plot
fig = plt.figure(figsize=(12,8))
fig.suptitle('NBA players born between 1910 - 1997', fontsize=20)

ax = fig.add_subplot(111, projection='3d')

ax.plot_surface(B1, B2, Z, rstride=10, cstride=5, alpha=0.4)
ax.scatter3D(a.weight_lbs, a.height_ft, a.born, c='g')
ax.scatter3D(b.weight_lbs, b.height_ft, b.born, c='r')
ax.scatter3D(c.weight_lbs, c.height_ft, c.born, c='b')
ax.scatter3D(d.weight_lbs, d.height_ft, d.born, c='y')

ax.set_xlabel('weight_lbs')
ax.set_xlim(350,150)
ax.set_ylabel('height_ft')
ax.set_ylim(5.5,8)
ax.set_zlabel('born')
ax.set_zlim(1910,1997);
sns.pairplot(a[['height_ft','weight_lbs']]);
sns.pairplot(b[['height_ft','weight_lbs']]);
sns.pairplot(c[['height_ft','weight_lbs']]);
sns.pairplot(d[['height_ft','weight_lbs']]);
sns.jointplot(x='weight_lbs',y='height_ft',data=a,kind='hex') 
# interchange the data sets into this code
# hexbin demo on synthetic data -- note this overwrites the a DataFrame,
# so reload the csv before reusing a
a = pd.DataFrame(np.random.randn(1000, 2), columns=['height_ft', 'weight_lbs'])
a.plot.hexbin(x='height_ft',y='weight_lbs',gridsize=25,cmap='Oranges')
# interchange the other data sets into this code

NBA metric height and weight (Python code)

Python 3.6 using Jupyter Notebook

# %load ../standard_import.txt
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import axes3d
import seaborn as sns

from sklearn.preprocessing import scale
import sklearn.linear_model as skl_lm
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm
import statsmodels.formula.api as smf

%matplotlib inline
plt.style.use('seaborn-white')

a = pd.read_csv('/.../1913-1933nba.csv') # add your location for your file in ...
a.head()

b = pd.read_csv('/.../1934-1959nba.csv') # add your location for your file in ...
b.head()

c = pd.read_csv('/.../1960-1979 nba.csv') # add your location for your file in ...
c.head()

d = pd.read_csv('/.../1980-1997 nba.csv') # add your location for your file in ...
d.head()

sns.regplot(a.weight_lbs, a.height_ft, order=1, ci=None, scatter_kws={'color':'g', 's':12})
sns.regplot(b.weight_lbs, b.height_ft, order=1, ci=None, scatter_kws={'color':'r', 's':12})
sns.regplot(c.weight_lbs, c.height_ft, order=1, ci=None, scatter_kws={'color':'b', 's':12})
sns.regplot(d.weight_lbs, d.height_ft, order=1, ci=None, scatter_kws={'color':'y', 's':12})
plt.xlim(140,325)
plt.ylim(ymin=5.5);

# multiple regression lines, with the colors set by single-letter codes

regr = skl_lm.LinearRegression()

X = a.weight_lbs.values.reshape(-1,1)
y = a.height_ft

regr.fit(X,y)
print(regr.intercept_)
print(regr.coef_)

# regression output for 1913-1933 (intercept ≈ 4.24 ft)

a[['weight_lbs', 'height_ft']].describe()
# 1913-1933 note 6.3 ft mean and 192.7 lbs mean

d[['weight_lbs', 'height_ft']].describe()
# 1980-1997 note 6.59 ft mean and 219.9 lbs mean
# increase of +0.29 ft in mean height and +27.2 lbs in mean weight

# Refit with two predictors (weight and height -> birth year) so the
# surface has two coefficients to work with
regr.fit(a[['weight_lbs', 'height_ft']], a.born)

# Create a coordinate grid spanning realistic weights and heights
weight_lbs = np.arange(140, 350)
height_ft = np.linspace(5.5, 8, 50)

B1, B2 = np.meshgrid(weight_lbs, height_ft, indexing='xy')
Z = regr.intercept_ + B1*regr.coef_[0] + B2*regr.coef_[1]

# Create plot
fig = plt.figure(figsize=(12,8))
fig.suptitle('NBA players born between 1910 - 1997', fontsize=20)

ax = fig.add_subplot(111, projection='3d')

ax.plot_surface(B1, B2, Z, rstride=10, cstride=5, alpha=0.4)
ax.scatter3D(a.weight_lbs, a.height_ft, a.born, c='g')
ax.scatter3D(b.weight_lbs, b.height_ft, b.born, c='r')
ax.scatter3D(c.weight_lbs, c.height_ft, c.born, c='b')
ax.scatter3D(d.weight_lbs, d.height_ft, d.born, c='y')

ax.set_xlabel('weight_lbs')
ax.set_xlim(350,150)
ax.set_ylabel('height_ft')
ax.set_ylim(5.5,8)
ax.set_zlabel('born')
ax.set_zlim(1910,1997);

sns.jointplot(x='weight_lbs',y='height_ft',data=a,kind='hex') # 1913-1933, where the densest hexes sit near the mean height and weight
# note: the Pearson r measures the strength of the linear relationship between the two variables (0.79 here)
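Newer seaborn releases no longer print the Pearson r on the jointplot, so here is a quick way to compute it directly (a sketch, assuming the a DataFrame loaded above):

from scipy import stats

# Pearson correlation between weight and height for 1913-1933
r, p = stats.pearsonr(a.weight_lbs, a.height_ft)
print(r, p)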