Data scraping and cleaning

import requests
import pandas as pd
from bs4 import Comment
from bs4 import BeautifulSoup

# Scrape basketball-reference.com season by season and build three frames:
#   df  - "miscellaneous" team stats (one row per team per season)
#   df3 - per-game team stats        (one row per team per season)
#   df2 - schedule rows              (one row per game: teams + 0/1 outcome)
from io import StringIO

# Per-season frames are collected in lists and concatenated once after the
# loop. DataFrame.append was removed in pandas 2.0, and concatenating once is
# also cheaper than growing a frame on every iteration.
misc_frames = []
per_game_frames = []
schedule_frames = []

# Seasons 2000 through 2019 (range() excludes the stop value 2020)
for year in range(2000, 2020):
    # Get the seasonal stats for each team
    url = 'https://www.basketball-reference.com/leagues/NBA_{}.html#all_team-stats-base'.format(str(year))
    page = requests.get(url)
    # Fail loudly on HTTP errors instead of hitting an IndexError on
    # tables[7] further down when an error page was returned.
    page.raise_for_status()

    soup = BeautifulSoup(page.text, 'html.parser')
    # The stats tables are read out of HTML comment nodes, not the live DOM.
    comments = soup.find_all(string=lambda text: isinstance(text, Comment))

    tables = []
    for each in comments:
        if 'table' in each:
            try:
                # StringIO: passing literal HTML to read_html is deprecated.
                tables.append(pd.read_html(StringIO(str(each)))[0])
            except Exception:
                # Best effort: a comment that mentions 'table' but holds no
                # parseable table is skipped. Unlike a bare except, this
                # still lets KeyboardInterrupt/SystemExit through.
                continue

    # Miscellaneous stats, including technical analysis like usage rate and offensive rating
    # NOTE(review): position 7 depends on the page layout - confirm it is
    # stable across all scraped seasons.
    misc = tables[7]
    misc.columns = misc.columns.droplevel()
    misc = misc.drop(misc[misc['Team'] == 'League Average'].index)
    misc.insert(1, 'yearID', str(year))
    misc = misc.drop(columns=['Rk', 'Arena'])
    misc_frames.append(misc)

    # Per Game Stats, including points, assist, block, turnover, 3 Pointer
    # NOTE(review): position 1 depends on the page layout as well.
    per_game = tables[1]
    per_game = per_game.drop(per_game[per_game['Team'] == 'League Average'].index)
    per_game.insert(0, 'yearID', str(year))
    per_game_frames.append(per_game)

    # Get each game's data: the two team names and their scores
    # NOTE(review): this URL may return only part of the season's schedule
    # (the site appears to paginate by month) - verify the number of games.
    url = 'https://www.basketball-reference.com/leagues/NBA_{}_games.html'.format(str(year))
    page = requests.get(url)
    page.raise_for_status()
    soup = BeautifulSoup(page.text, 'html.parser')

    table = soup.find('div', class_='overthrow table_container')
    rows = table.find_all('tr')

    # data[0] / data[2] hold team names, data[1] / data[3] the matching
    # integer scores.
    data = [[], [], [], []]

    # For seasons up to 2000 the cell positions are shifted by one, so the
    # team/score cells start at position 0 instead of 1.
    offset = 1 if year <= 2000 else 0

    for r in rows:
        for idx, cell in enumerate(r.find_all('td')):
            col = idx + offset
            if col in (1, 3):
                data[col - 1].append(cell.text)
            elif col in (2, 4):
                data[col - 1].append(int(cell.text))

    # Use the score for each team and convert that into a single categorical attribute:
    # 1 when the first-listed team scored at least as many points as the
    # second-listed team, otherwise 0.
    home_win = [0 if first < second else 1 for first, second in zip(data[1], data[3])]

    # NOTE(review): the first-listed team is labelled 'home_team' here.
    # basketball-reference schedule tables usually list the visitor first -
    # confirm that the home/away labels are not swapped.
    d = {'home_team': data[0], 'away_team': data[2], 'home_win': home_win}
    schedule = pd.DataFrame(d)
    schedule.insert(0, 'yearID', str(year))
    schedule_frames.append(schedule)

# Row labels are kept as-is (not reset), matching the previous append-based
# behaviour.
df = pd.concat(misc_frames)
df2 = pd.concat(schedule_frames)
df3 = pd.concat(per_game_frames)

# Combine the miscellaneous and per-game tables into one row per
# (Team, yearID). Both suffixes are falsy, so no column is renamed by the merge.
df = pd.merge(df, df3, on=['Team', 'yearID'], suffixes=(False, False))

# Remove the '*' marker from team names (presumably the site's playoff flag -
# verify) so the names match the ones in the schedule frame.
team_names = pd.DataFrame(
    {'Team': [team.replace('*', '') if '*' in team else team for team in df['Team']]}
)

# update() aligns on the row index and overwrites Team in place.
df.update(team_names)
df

# For every game, look up both teams' season stats in df and attach them to
# df2 as alternating h_<stat> / a_<stat> columns.
# NOTE(review): the 30:50 slice selects the stat columns by position only -
# confirm it still matches if the layout of df changes.
title = df.columns.tolist()[30:50]
h_stats = [[] for _ in title]
a_stats = [[] for _ in title]

yr = df2['yearID'].tolist()
hn = df2['home_team'].tolist()
an = df2['away_team'].tolist()

for season, home, away in zip(yr, hn, an):
    # Rows of df for this game's season; each team should match exactly one.
    in_season = df.yearID == season
    home_row = df[(df.Team == home) & in_season]
    away_row = df[(df.Team == away) & in_season]
    for h_col, a_col, stat in zip(h_stats, a_stats, title):
        # [0] takes the first match and raises IndexError when the team
        # name is not found for that season.
        h_col.append(home_row[stat].tolist()[0])
        a_col.append(away_row[stat].tolist()[0])

# Write the collected values back, home column first, then away column.
for stat, h_col, a_col in zip(title, h_stats, a_stats):
    df2['h_' + stat] = h_col
    df2['a_' + stat] = a_col

# yearID was inserted as a string while scraping; make it numeric so it can
# be compared against integer years later on.
df2['yearID'] = df2['yearID'].astype('int64')
df2

# Persist both frames as CSV without the row index.
# NOTE(review): the paths are raw strings that also contain escaped
# backslashes, so the resulting path holds doubled/tripled separators, and
# they point at one specific user's machine - confirm before re-running.
for frame, csv_path in (
    (df, r'C:\\Users\\TomTu\\OneDrive - University of Maryland\\2020 Spring\\CMSC 320\\\nba_data.csv'),
    (df2, r'C:\\Users\\TomTu\\OneDrive - University of Maryland\\2020 Spring\\CMSC 320\\\game_data.csv'),
):
    frame.to_csv(csv_path, index=False)

Model training & result interpretation

import numpy as np
import sklearn.metrics
import seaborn as sns
import matplotlib.pylab as plt
import statsmodels.formula.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, KFold

# Use a large default figure size (15 x 12) for every plot below.
plot_defaults = {'figure.figsize': (15, 12)}
sns.set(rc=plot_defaults)

We tried training with data from before 2000, but because the rules have kept changing, older data is no longer a good fit for the model. The best timeframe I found is roughly 2000 to 2019, so that is what I went with. Here we feed in all the attributes for each team as the training features, and we set y to the categorical column df2.home_win, which is all 0s and 1s indicating whether the home team won the game or not.

# Feature columns: everything from position 4 onward (selected by position),
# i.e. the columns after yearID, home_team, away_team and home_win.
feature_cols = df2.columns.tolist()[4:50]

# Chronological split: seasons before 2019 train the model, 2019 is held out.
train = df2[df2['yearID'] < 2019]
test = df2[df2['yearID'] == 2019]

x_train, y_train = train[feature_cols], train['home_win']
x_test, y_test = test[feature_cols], test['home_win']

# Logistic regression with default settings apart from n_jobs.
model = LogisticRegression(n_jobs=8)
model.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=8, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

# Class predictions (0/1) for both splits.
y_train_pred = model.predict(x_train)
y_test_pred = model.predict(x_test)

# With 0/1 labels the mean squared error is the share of wrong predictions,
# so each mse_* value is the complement of the matching score_* accuracy.
mse_train, mse_test = (
    mean_squared_error(y_train, y_train_pred),
    mean_squared_error(y_test, y_test_pred),
)
score_trian, score_test = model.score(x_train, y_train), model.score(x_test, y_test)

report = "mse_train: {}\nmse_test: {}\nscore_trian: {}\nscore_test: {}\n"
print(report.format(mse_train, mse_test, score_trian, score_test))

mse_train: 0.3070796460176991
mse_test: 0.2636363636363636
score_trian: 0.6929203539823009
score_test: 0.7363636363636363

Since the data is limited by rule changes and there are only around 80 games each season, the best accuracy I could achieve is around 73.6%, which is not bad and is much better than a random guess (50%).

# Split the fitted coefficients into home and away weights. The slicing takes
# even positions as home and odd positions as away, which relies on the
# feature columns alternating h_<stat>, a_<stat>.
coefs = model.coef_[0]
output = pd.DataFrame({
    'attribute': title,
    'h_weight': coefs[::2],
    # The model predicts a home win, so away coefficients are negated to
    # read as the away team's own contribution to winning.
    'a_weight': [-1 * w for w in coefs[1::2]],
})

output

# One bar chart per side, attributes ordered from largest to smallest weight.
for column, heading in (
    ('h_weight', '2000 to 2019 -- Home team attribute vs. weight'),
    ('a_weight', '2000 to 2019 -- Away team attribute vs. weight'),
):
    ranked = output.sort_values(by=[column], ascending=False)
    sns.barplot(x="attribute", y=column, data=ranked)
    plt.title(heading)
    plt.show()

From the graph, we can see that over the past 20 years as a whole, the most important factors that contribute to winning a game are field goals made, total rebounds, blocks, two-point field goals made, steals, and three-point field goals made, while turnovers hurt a team the most.

For away teams, surprisingly, steals contribute to road wins the most; after that it is the same story when it comes to field goals made, total rebounds, two-point field goals made, free throws made and three-point field goals made. It's not surprising that turnovers also hurt a team the most.

One more thing¶

Before I finish the study, I think I should look at the period of the Golden State Warriors' rise. So I re-trained the model with data only from 2014 to 2018, to see how good it is at predicting the 2018-2019 season.

# Same pipeline as before, but trained only on the 2014-2018 seasons and
# evaluated on 2019.
feature_cols = df2.columns.tolist()[4:50]

recent = (df2['yearID'] >= 2014) & (df2['yearID'] < 2019)
train = df2[recent]
test = df2[df2['yearID'] == 2019]

x_train, y_train = train[feature_cols], train['home_win']
x_test, y_test = test[feature_cols], test['home_win']

model = LogisticRegression(n_jobs=8)
model.fit(x_train, y_train)

# Class predictions (0/1) for both splits.
y_train_pred = model.predict(x_train)
y_test_pred = model.predict(x_test)

# With 0/1 labels the mean squared error is the share of wrong predictions,
# so each mse_* value is the complement of the matching score_* accuracy.
mse_train, mse_test = (
    mean_squared_error(y_train, y_train_pred),
    mean_squared_error(y_test, y_test_pred),
)
score_trian, score_test = model.score(x_train, y_train), model.score(x_test, y_test)

report = "mse_train: {}\nmse_test: {}\nscore_trian: {}\nscore_test: {}\n"
print(report.format(mse_train, mse_test, score_trian, score_test))

mse_train: 0.2974137931034483
mse_test: 0.2909090909090909
score_trian: 0.7025862068965517
score_test: 0.7090909090909091

So I got a 70.9% accuracy for this reduced model. It's reasonable to see a drop in accuracy, as there is less data. But what I care about most is finding which factors were the most important from 2014 to 2018.

# Split the fitted coefficients into home and away weights. The slicing takes
# even positions as home and odd positions as away, which relies on the
# feature columns alternating h_<stat>, a_<stat>.
coefs = model.coef_[0]
output = pd.DataFrame({
    'attribute': title,
    'h_weight': coefs[::2],
    # The model predicts a home win, so away coefficients are negated to
    # read as the away team's own contribution to winning.
    'a_weight': [-1 * w for w in coefs[1::2]],
})

# One bar chart per side, attributes ordered from largest to smallest weight.
for column, heading in (
    ('h_weight', '2014 to 2018 -- Home team attribute vs. weight'),
    ('a_weight', '2014 to 2018 -- Away team attribute vs. weight'),
):
    ranked = output.sort_values(by=[column], ascending=False)
    sns.barplot(x="attribute", y=column, data=ranked)
    plt.title(heading)
    plt.show()

From the graph above, I got the result I predicted: three-pointers are the most important contributor to a home-team win. In the previous model, which used the past 20 years for the prediction, 3P ranked 7th as a contributor for both sides. Besides the home team, we can also see that 3P's importance has risen on the away side, where it now ranks 4th when it comes to predicting the win.

Additional Information¶

In this study, since our output data is categorical, we decided to use a logistic regression classifier, as it does well when we are trying to separate two classes. For more information on the logistic classifier, here are some helpful links:

Wiki - logistic regression

Scikit-Learn API

In the process of this study, we also took some inspiration from several YouTube videos that show how to do data analysis with Python and game data. Here are the links for anyone who wants to explore this topic further.

Predicting NFL games

model details

	yearID	Team	Age	W	L	PW	PL	MOV	SOS	SRS	...	FT%	ORB	DRB	TRB	AST	STL	BLK	TOV	PF	PTS
0	2000	Los Angeles Lakers	29.2	67.0	15.0	64	18	8.55	-0.14	8.41	...	0.696	13.6	33.4	47.0	23.4	7.5	6.5	13.9	22.5	100.8
1	2000	Portland Trail Blazers	29.6	59.0	23.0	59	23	6.40	-0.04	6.36	...	0.760	11.8	31.2	43.0	23.5	7.7	4.8	15.2	22.7	97.5
2	2000	San Antonio Spurs	30.9	53.0	29.0	58	24	5.94	-0.02	5.92	...	0.746	11.3	32.5	43.8	22.2	7.5	6.7	15.0	20.9	96.2
3	2000	Phoenix Suns	28.6	53.0	29.0	56	26	5.22	0.02	5.24	...	0.759	12.5	31.2	43.7	25.6	9.1	5.3	16.7	24.1	98.9
4	2000	Utah Jazz	31.5	55.0	27.0	54	28	4.46	0.05	4.52	...	0.773	11.4	29.6	41.0	24.9	7.7	5.4	14.9	24.5	96.5
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
590	2019	Atlanta Hawks	25.1	29.0	53.0	27	55	-6.02	-0.04	-6.06	...	0.752	11.6	34.5	46.1	25.8	8.2	5.1	17.0	23.6	113.3
591	2019	Chicago Bulls	24.0	22.0	60.0	21	61	-8.41	0.10	-8.32	...	0.783	8.8	34.1	42.9	21.9	7.4	4.3	14.1	20.3	104.9
592	2019	Phoenix Suns	24.0	19.0	63.0	19	63	-9.34	0.73	-8.61	...	0.779	9.1	31.3	40.4	23.9	9.0	5.1	15.6	23.6	107.5
593	2019	New York Knicks	23.4	17.0	65.0	19	63	-9.21	0.28	-8.93	...	0.759	10.5	34.3	44.7	20.1	6.8	5.1	14.0	20.9	104.6
594	2019	Cleveland Cavaliers	25.2	19.0	63.0	19	63	-9.61	0.22	-9.39	...	0.792	10.7	31.9	42.7	20.7	6.5	2.4	13.5	20.0	104.5

	yearID	home_team	away_team	home_win	h_FG	a_FG	h_FGA	a_FGA	h_FG%	a_FG%	...	h_AST	a_AST	h_STL	a_STL	h_BLK	a_BLK	h_TOV	a_TOV	h_PF	a_PF
0	2000	Orlando Magic	Charlotte Hornets	0	38.6	35.8	85.5	79.7	0.452	0.449	...	20.8	24.7	9.1	8.9	5.7	5.9	17.6	14.7	24.0	20.4
1	2000	Golden State Warriors	Dallas Mavericks	0	36.5	39.0	87.1	85.9	0.420	0.453	...	22.6	22.1	8.9	7.2	4.3	5.1	15.9	13.7	24.9	21.6
2	2000	Phoenix Suns	Denver Nuggets	0	37.7	37.3	82.6	84.3	0.457	0.442	...	25.6	23.3	9.1	6.8	5.3	7.5	16.7	15.6	24.1	23.9
3	2000	Milwaukee Bucks	Houston Rockets	1	38.7	36.6	83.3	81.3	0.465	0.450	...	22.6	21.6	8.2	7.5	4.6	5.3	15.0	17.4	24.6	20.3
4	2000	Seattle SuperSonics	Los Angeles Clippers	1	37.9	35.1	84.7	82.4	0.447	0.426	...	22.9	18.0	8.0	7.0	4.2	6.0	14.0	16.2	21.7	22.2
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
105	2019	Utah Jazz	Minnesota Timberwolves	0	40.4	41.6	86.4	91.3	0.468	0.456	...	26.0	24.6	8.1	8.3	5.9	5.0	15.1	13.1	21.1	20.3
106	2019	Indiana Pacers	New York Knicks	1	41.3	38.2	87.0	88.3	0.475	0.433	...	26.0	20.1	8.7	6.8	4.9	5.1	13.7	14.0	19.4	20.9
107	2019	New Orleans Pelicans	Golden State Warriors	0	43.7	44.0	92.2	89.8	0.473	0.491	...	27.0	29.4	7.4	7.6	5.4	6.4	14.8	14.3	21.1	21.4
108	2019	Dallas Mavericks	Los Angeles Lakers	0	38.8	42.6	86.9	90.5	0.447	0.470	...	23.4	25.6	6.5	7.5	4.3	5.4	14.2	15.7	20.1	20.7
109	2019	San Antonio Spurs	Phoenix Suns	1	42.3	40.1	88.4	87.4	0.478	0.459	...	24.5	23.9	6.1	9.0	4.7	5.1	12.1	15.6	18.1	23.6

	attribute	h_weight	a_weight
0	FG	0.226895	0.194331
1	FGA	-0.186685	-0.153776
2	FG%	0.003272	0.003851
3	3P	0.086181	0.070963
4	3PA	-0.041852	-0.076499
5	3P%	0.003980	0.007332
6	2P	0.153479	0.110015
7	2PA	-0.149417	-0.090220
8	2P%	0.002929	0.002201
9	FT	0.068581	0.088320
10	FTA	-0.069059	-0.074324
11	FT%	0.003845	0.005520
12	ORB	0.185919	0.059421
13	DRB	0.029739	0.093776
14	TRB	0.199245	0.147955
15	AST	-0.002619	0.011860
16	STL	0.142090	0.207814
17	BLK	0.190082	0.000109
18	TOV	-0.331940	-0.277124
19	PF	0.058421	-0.031567

Using Data Science and Machine Learning for Predicting NBA Games

Hongyu Tu/Zhaoyi Zhuang/Yujian Zhao

5/13/2020

Github Repository

Table of Contents:

1. Introduction

2. Prepare Data

3. Data Analysis

4. Machine Learning with Python

5. Conclusion

6. Additional Information

Introduction

1.1 Background

1.2 Project Motivation

1.3 Glossary

Prepare data

Data scraping and cleaning

Library

Original Data

Organize Dataset

Data Analysis

3.1 Pace vs. Winning percentage over year

3.2 relationship between Offensive rating and WP over time

3.3 How does Three-points ball affect the game over time

3.3.0

3.3.1

3.3.2

3.3.3

3.3.4

3.3.5

3.3.6

3.3.7

3.4

3.4.1

3.4.2

3.4.3

Machine Learning with Python

Model training & result interpretation¶

One more thing¶

Conclusion

Additional Information¶