NBA data analysis, 1913-1997: Python code

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import axes3d
import seaborn as sns

from sklearn.preprocessing import scale
import sklearn.linear_model as skl_lm
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm
import statsmodels.formula.api as smf

%matplotlib inline
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and the
# old names were later removed, so use whichever spelling this install offers.
_style = 'seaborn-v0_8-white'
if _style not in plt.style.available:
    _style = 'seaborn-white'
plt.style.use(_style)

# The data is split into four CSV files by year range:
#   a: 1913-1933, b: 1934-1959, c: 1960-1979, d: 1980-1997
# Replace '/...' with the folder that holds your files. The b/c/d file names
# below follow the pattern of the first one -- adjust them if yours differ.
a = pd.read_csv('/.../1913-1933nba.csv')
b = pd.read_csv('/.../1934-1959nba.csv')
c = pd.read_csv('/.../1960-1979nba.csv')
d = pd.read_csv('/.../1980-1997nba.csv')

# Quick look at the first rows of the first dataset.
a.head()
# Height against weight: one scatter plus straight-line fit per dataset, all
# drawn on the same axes. Each dataset's points get their own single-letter
# matplotlib colour code; the fitted lines take the default colour cycle.
# NOTE: assumes b, c and d have been loaded the same way as a.
for frame, colour in ((a, 'g'), (b, 'r'), (c, 'b'), (d, 'y')):
    # x and y must be passed by keyword: seaborn 0.12+ rejects positional data.
    sns.regplot(x=frame.weight_lbs, y=frame.height_ft, order=1, ci=None,
                scatter_kws={'color': colour, 's': 12})
plt.xlim(140, 325)
# `bottom` is the canonical name for the lower y limit (`ymin` is an alias).
plt.ylim(bottom=5.5);
# Simple linear regression of height on weight for the first dataset.
# scikit-learn expects a 2-D feature matrix, so the weight column is turned
# into an (n, 1) array.
X = a['weight_lbs'].to_numpy()[:, np.newaxis]
y = a['height_ft']

regr = skl_lm.LinearRegression().fit(X, y)
print(regr.intercept_, regr.coef_, sep='\n')

# Swap a for b, c or d to get the coefficients of the other datasets.
# Birth year against height: one scatter plus straight-line fit per dataset on
# shared axes.
# NOTE: assumes b, c and d have been loaded the same way as a.
for frame, colour in ((a, 'g'), (b, 'r'), (c, 'b'), (d, 'y')):
    # x and y must be passed by keyword: seaborn 0.12+ rejects positional data.
    sns.regplot(x=frame.height_ft, y=frame.born, order=1, ci=None,
                scatter_kws={'color': colour, 's': 9})
plt.xlim(5.5, 7.5)
plt.ylim(1913, 1997);

# Colour key (points are coloured explicitly, lines use the default cycle):
#   1913-1933: green points,  blue line
#   1934-1959: red points,    orange line
#   1960-1979: blue points,   green line
#   1980-1997: yellow points, red line
# Summary statistics (count, mean, std, quartiles, ...) for weight and height
# in the first dataset. Repeat with b, c and d for the other datasets.
summary_columns = ['weight_lbs', 'height_ft']
a[summary_columns].describe()
# Fit a plane  born ~ weight_lbs + height_ft  on the first dataset. The surface
# needs two slopes, and the one-feature `regr` fitted earlier only has
# coef_[0], so a separate two-feature model is fitted here under its own name.
fit_rows = a[['weight_lbs', 'height_ft', 'born']].dropna()
plane = skl_lm.LinearRegression().fit(
    fit_rows[['weight_lbs', 'height_ft']].to_numpy(), fit_rows['born'])

# Create a coordinate grid that spans the axis limits set below, so the
# fitted plane is actually inside the visible region.
weight_lbs = np.linspace(150, 350, 50)
height_ft = np.linspace(5.5, 8, 50)

B1, B2 = np.meshgrid(weight_lbs, height_ft, indexing='xy')
# Predicted birth year at every grid point (vectorised over the whole grid).
Z = plane.intercept_ + plane.coef_[0] * B1 + plane.coef_[1] * B2

# Create plot
fig = plt.figure(figsize=(12, 8))
fig.suptitle('NBA players born between 1910 - 1997', fontsize=20)

# Ask the figure for the 3-D axes: constructing Axes3D(fig) directly no longer
# attaches the axes to the figure in current matplotlib, leaving a blank plot.
ax = fig.add_subplot(projection='3d')

ax.plot_surface(B1, B2, Z, rstride=10, cstride=5, alpha=0.4)
# NOTE: assumes b, c and d have been loaded the same way as a.
for frame, colour in ((a, 'g'), (b, 'r'), (c, 'b'), (d, 'y')):
    ax.scatter3D(frame.weight_lbs, frame.height_ft, frame.born, c=colour)

ax.set_xlabel('weight_lbs')
ax.set_xlim(350, 150)  # reversed limits: the weight axis runs high -> low
ax.set_ylabel('height_ft')
ax.set_ylim(5.5, 8)
ax.set_zlabel('born')
ax.set_zlim(1910, 1997);
# One height/weight pair plot per dataset, in the order a, b, c, d.
pair_columns = ['height_ft', 'weight_lbs']
for frame in (a, b, c, d):
    sns.pairplot(frame[pair_columns])

# Hexagonal-bin joint plot of height against weight for the first dataset;
# pass b, c or d as `data` to draw the others.
sns.jointplot(data=a, x='weight_lbs', y='height_ft', kind='hex')
# Hexbin density plot of weight against height for the first dataset.
# The data loaded earlier is plotted directly: replacing `a` with
# np.random.randn noise would chart random numbers under the height/weight
# labels and throw the real data away.
a.plot.hexbin(x='height_ft', y='weight_lbs', gridsize=25, cmap='Oranges')
# Swap a for b, c or d to draw the other datasets.

Leave a Reply

This site uses Akismet to reduce spam. Learn how your comment data is processed.