Data Science and Clash Royale¶

CMSC320 Tutorial by Rohan Mathur

Introduction¶

Clash Royale is a real-time strategy mobile game created by Supercell. It combines a collectible card game, a tower defense game, and a multiplayer online battle arena. The goal is to destroy your opponent's towers while protecting your own. To do this, you build a deck of 8 cards, which you unlock through chests. Whoever takes the most towers wins, and taking the opponent's king tower wins the game immediately. On Kaggle, I found a dataset of many Clash Royale matches with detailed data recorded about each one. After looking through the data, I thought it would be interesting to analyze it and see if I could find any hidden Clash Royale trends. In this tutorial, I will walk you through my data science pipeline, including data collection/management, data exploration, hypothesis testing, and some introductory machine learning methods.

The Data¶

The data I obtained from Kaggle came as 3 CSV files. The link to the page is https://www.kaggle.com/datasets/bwandowando/clash-royale-season-18-dec-0320-dataset. The CSV with the match data ("BattlesStaging_01012021_WL_tagged.csv") is the one I will use for the majority of the analysis. The relevant columns are:
winner.card1.id / loser.card1.id -> the id of each card in the deck (these columns go up to card8)
winner.card1.level / loser.card1.level -> the level of each card (also up to card8)
winner.startingTrophies / loser.startingTrophies -> the number of trophies each player has before the match
winner.totalcard.level / loser.totalcard.level -> the combined level of every card in the deck
winner.legendary.count / loser.legendary.count -> the number of legendary cards in the deck
winner.elixir.average / loser.elixir.average -> the average elixir cost of the deck
winner.cards.list / loser.cards.list -> a list of every card in the deck
There are more columns with interesting information as well, but the ones above are the ones used for the analysis.
The wincons dataset and the CardMasterList dataset are supplementary data used to translate some of the information in the main dataset's columns. Cards are given by id number rather than name, so I use the CardMasterList to decipher them. The wincons dataset is used for the data exploration related to Clash Royale win conditions.
Below you can see the wincons dataset, a list of columns in the main dataset, and part of the main dataset.

In [ ]:
import pandas as pd
df = pd.read_csv("BattlesStaging_01012021_WL_tagged.csv", nrows = 10000)
wincons = pd.read_csv("Wincons.csv")
indexes = []
# getting rid of outdated win condition information
indexes.append(wincons[wincons['card_name'] == 'Baby Dragon'].index[0])
indexes.append(wincons[wincons['card_name'] == 'Prince'].index[0])
indexes.append(wincons[wincons['card_name'] == 'Giant Skeleton'].index[0])
indexes.append(wincons[wincons['card_name'] == 'Mega Knight'].index[0])
wincons.drop(indexes, inplace=True)
cards = pd.read_csv("CardMasterListSeason18_12082020.csv")
display(wincons.head())
# drop the columns that we don't need 
to_drop = ['battleTime' , 'winner.tag', 'winner.clan.tag', 'winner.clan.badgeId', 'loser.tag', 
'loser.clan.tag', 'loser.clan.badgeId', 'tournamentTag']
df.drop(to_drop, axis=1, inplace=True)
print(df.columns)
display(df)
id card_id card_name
0 1 26000056 Skeleton Barrel
1 2 27000002 Mortar
2 3 26000024 Royal Giant
3 4 26000067 Elixir Golem
4 5 26000021 Hog Rider
Index(['Unnamed: 0', 'arena.id', 'gameMode.id', 'average.startingTrophies',
       'winner.startingTrophies', 'winner.trophyChange', 'winner.crowns',
       'winner.kingTowerHitPoints', 'winner.princessTowersHitPoints',
       'loser.startingTrophies', 'loser.trophyChange', 'loser.crowns',
       'loser.kingTowerHitPoints', 'loser.princessTowersHitPoints',
       'winner.card1.id', 'winner.card1.level', 'winner.card2.id',
       'winner.card2.level', 'winner.card3.id', 'winner.card3.level',
       'winner.card4.id', 'winner.card4.level', 'winner.card5.id',
       'winner.card5.level', 'winner.card6.id', 'winner.card6.level',
       'winner.card7.id', 'winner.card7.level', 'winner.card8.id',
       'winner.card8.level', 'winner.cards.list', 'winner.totalcard.level',
       'winner.troop.count', 'winner.structure.count', 'winner.spell.count',
       'winner.common.count', 'winner.rare.count', 'winner.epic.count',
       'winner.legendary.count', 'winner.elixir.average', 'loser.card1.id',
       'loser.card1.level', 'loser.card2.id', 'loser.card2.level',
       'loser.card3.id', 'loser.card3.level', 'loser.card4.id',
       'loser.card4.level', 'loser.card5.id', 'loser.card5.level',
       'loser.card6.id', 'loser.card6.level', 'loser.card7.id',
       'loser.card7.level', 'loser.card8.id', 'loser.card8.level',
       'loser.cards.list', 'loser.totalcard.level', 'loser.troop.count',
       'loser.structure.count', 'loser.spell.count', 'loser.common.count',
       'loser.rare.count', 'loser.epic.count', 'loser.legendary.count',
       'loser.elixir.average'],
      dtype='object')
Unnamed: 0 arena.id gameMode.id average.startingTrophies winner.startingTrophies winner.trophyChange winner.crowns winner.kingTowerHitPoints winner.princessTowersHitPoints loser.startingTrophies ... loser.cards.list loser.totalcard.level loser.troop.count loser.structure.count loser.spell.count loser.common.count loser.rare.count loser.epic.count loser.legendary.count loser.elixir.average
0 0 54000050.0 72000006.0 5363.0 5372.0 28.0 2.0 4145.0 [1484] 5354.0 ... [26000000, 26000015, 26000023, 27000004, 28000... 104 3 1 4 1 1 4 2 3.500
1 1 54000050.0 72000006.0 5407.0 5409.0 29.0 1.0 5304.0 [579, 3082] 5405.0 ... [26000023, 26000027, 26000037, 26000046, 26000... 104 6 1 1 0 1 2 5 4.250
2 2 54000050.0 72000006.0 5741.0 5749.0 28.0 2.0 5762.0 [2080, 2099] 5733.0 ... [26000022, 26000027, 26000028, 26000041, 26000... 104 7 0 1 4 2 1 1 4.125
3 3 54000050.0 72000006.0 4307.0 4316.0 28.0 2.0 4392.0 [1322] 4298.0 ... [26000012, 26000027, 26000031, 26000033, 26000... 80 6 1 1 2 1 2 3 3.750
4 4 54000050.0 72000006.0 5776.5 5783.0 28.0 3.0 5832.0 [3668, 3668] 5770.0 ... [26000010, 26000011, 26000021, 26000037, 26000... 104 5 1 2 2 4 0 2 3.250
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
9995 9995 54000050.0 72000006.0 4936.0 4915.0 33.0 1.0 5832.0 [3242, 1063] 4957.0 ... [26000017, 26000024, 26000025, 26000055, 26000... 103 5 1 2 4 1 2 1 4.500
9996 9996 54000050.0 72000006.0 4540.0 4543.0 29.0 2.0 3436.0 [2014] 4537.0 ... [26000000, 26000001, 26000016, 26000021, 26000... 93 5 0 3 4 2 1 1 3.125
9997 9997 54000050.0 72000006.0 4441.0 4442.0 29.0 3.0 4824.0 [3052, 2734] 4440.0 ... [26000005, 26000011, 26000015, 26000017, 26000... 91 7 0 1 2 4 1 1 3.750
9998 9998 54000050.0 72000006.0 5663.0 5684.0 26.0 1.0 5832.0 [711, 2363] 5642.0 ... [26000004, 26000012, 26000015, 26000036, 26000... 104 6 0 2 2 2 4 0 3.875
9999 9999 54000050.0 72000006.0 5290.0 5304.0 27.0 1.0 5832.0 [1131, 2632] 5276.0 ... [26000027, 26000042, 26000049, 26000054, 26000... 102 5 1 2 2 2 2 2 4.000

10000 rows × 66 columns

Data Exploration¶

With the main dataset, there are a couple of things I want to look into. First, I want to examine the distribution of cards among both the winners and the losers. With that, we can see whether certain cards are used more often than others, and whether winners use some cards more often than losers do. I also want to look into the frequency of win conditions. A win condition in Clash Royale is the centerpiece of your deck: the card that is your means of taking your opponent's towers and winning the game. I want to see which win conditions are the most common and how their frequency differs between winners and losers. Below this, you can see the distribution of cards on the winners' side.

In [ ]:
import matplotlib.pyplot as plt
# make a list containing all of the cards in all of the winners' decks
winnerCards = []
for i in range(1, 9):
    winnerCards.extend(df[f'winner.card{i}.id'].tolist())
print(len(winnerCards))
winnerCardsSeries = pd.Series(winnerCards)
print(winnerCardsSeries.unique())
print(winnerCardsSeries.value_counts())
# plot a frequency distribution of the cards
winnerCardsSeries.value_counts().plot.bar(figsize=(20,8), color='cornflowerblue')
plt.title('Card Distribution among Winners', fontsize=20)
plt.xlabel('Cards', fontsize=14)
plt.ylabel('Number of Decks which Include a Card', fontsize=14)
plt.show()
80000
[26000008 26000056 26000044 28000004 28000011 26000041 26000061 26000055
 26000004 26000023 28000015 26000009 26000015 26000047 27000003 26000032
 26000017 26000037 26000034 28000001 26000005 26000018 26000049 26000059
 26000001 26000010 27000006 28000002 26000028 26000029 27000002 26000000
 26000040 28000010 26000024 28000003 26000054 28000008 28000007 26000011
 26000080 26000085 28000012 26000035 26000003 26000019 26000043 28000009
 26000046 26000042 26000026 26000012 26000006 26000014 26000027 26000063
 26000020 27000010 26000064 26000030 26000051 26000062 28000018 26000053
 28000000 26000021 26000038 26000045 26000060 26000068 28000017 28000006
 26000048 27000004 26000039 26000067 26000016 26000007 27000008 26000052
 26000084 26000033 26000022 26000083 28000016 28000005 28000014 26000025
 27000001 26000036 26000057 26000058 26000002 26000013 27000012 26000031
 26000050 28000013 27000000 27000007 27000005 27000009]
28000008    3133
28000011    2998
28000000    2502
28000001    2153
26000000    2038
            ... 
28000018     129
26000002     105
27000005      93
26000085      83
26000028      80
Length: 102, dtype: int64

From the value counts command and the graph above, we can see the 5 most common cards on the winners' side are 28000008 (zap), 28000011 (the log), 28000000 (fireball), 28000001 (arrows), and 26000000 (knight). The top 2 are quite close, the third is a little further down, and the 4th and 5th are close to each other but further down still. This is interesting because the top 4 most common cards are all spell cards. Perhaps spells are more common because they are versatile and can fit into many decks. The fifth most common card is the knight, a very generic troop that can be used in many situations, both to attack and to defend. The trend among these cards seems to be that versatility is the most important quality.
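Raw id labels make charts like this hard to read. A small id-to-name lookup built from the card master list can relabel a value_counts result directly; below is a minimal sketch with a hand-made slice of the mapping (the card_id/card_name column names mirror the wincons preview above and are an assumption for the real CardMasterList file):

```python
import pandas as pd

# Hypothetical slice of the card master list; real column names may differ.
cards = pd.DataFrame({
    "card_id": [28000008, 28000011, 28000000, 28000001, 26000000],
    "card_name": ["Zap", "The Log", "Fireball", "Arrows", "Knight"],
})

# Build an id -> name lookup so value_counts index labels become readable.
id_to_name = dict(zip(cards["card_id"], cards["card_name"]))

# Relabel a small toy frequency count with card names.
counts = pd.Series([28000008, 28000011, 28000008]).value_counts()
readable = counts.rename(index=id_to_name)
print(readable)
```

With the real CardMasterList dataframe, the same `dict(zip(...))` line would produce a full lookup in one pass.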

Below this, we will look at the winners' win conditions.

In [ ]:
import numpy as np
# Make a list of every win condition card in the winner decks
wincon_ids = set(wincons['card_id'])
winner_wincons = [card for card in winnerCards if card in wincon_ids]

print("Number of wincons present in the winner decks: " + str(len(winner_wincons)))
winner_wincons_series = pd.Series(winner_wincons)
# plot the frequency distribution of the win conditions
winner_wincons_series.value_counts().plot.bar(figsize=(20,8), color='cornflowerblue')
plt.title('Win Condition Card Distribution among Winners', fontsize=20)
plt.xlabel('Win Condition Cards', fontsize=14)
plt.ylabel('Number of Decks which Include a Card', fontsize=14)
plt.show()
Number of wincons present in the winner decks: 12261

From the graph above, we can see that the most common win conditions on the winners' side are 26000021 (Hog Rider), 26000006 (Balloon), 28000004 (Goblin Barrel), 26000032 (Miner), and 26000009 (Golem). Hog rider and balloon have similar frequencies, and after that the distribution trails off. These win conditions do not have much in common beyond some basic similarities. Hog rider is a rush attacker: it is fast and strikes towers directly. Balloon is an air troop that also goes straight for towers and does high damage, but it is slower. Goblin barrel and miner are both chip-damage cards; they can be sent anywhere on the map, so they slowly but surely wear down the enemy towers. Golem is a high-health tank that goes directly for the tower but is very slow.

In [ ]:
loserCards = []
# make a list containing all of the cards in all of the losers' decks
for i in range(1, 9):
    loserCards.extend(df[f'loser.card{i}.id'].tolist())
print(len(loserCards))
loserCardsSeries = pd.Series(loserCards)
print(loserCardsSeries.unique())
print(loserCardsSeries.value_counts())
# plot the distribution of cards
loserCardsSeries.value_counts().plot.bar(figsize=(20,8), color='maroon')
plt.title('Card Distribution among Losers', fontsize=20)
plt.xlabel('Cards', fontsize=14)
plt.ylabel('Number of Decks which Include a Card', fontsize=14)
plt.show()
80000
[27000004 26000037 26000046 26000027 26000021 26000009 26000000 26000004
 26000042 28000011 26000052 26000012 28000009 28000000 26000049 28000003
 26000030 26000036 26000026 27000003 26000055 26000017 26000007 26000010
 26000032 26000059 26000018 26000056 28000001 26000033 26000011 26000041
 26000013 26000022 28000014 26000016 28000008 26000031 26000020 27000010
 28000002 26000051 26000006 26000005 26000043 26000029 28000004 26000014
 26000083 26000058 26000044 28000012 26000003 26000035 26000008 26000024
 26000047 26000039 28000015 26000064 26000054 28000007 26000001 28000010
 26000048 26000057 27000012 26000019 26000060 26000053 26000015 26000040
 27000009 27000008 26000038 28000017 28000013 27000002 26000023 26000084
 28000005 26000067 26000085 26000061 26000062 26000063 28000006 26000034
 28000016 27000007 26000045 26000028 27000005 26000002 26000025 27000006
 26000080 27000001 26000050 27000000 26000068 28000018]
28000008    2955
28000011    2882
28000000    2534
26000011    2360
28000001    2190
            ... 
26000053     106
26000060     105
28000018     102
26000085      87
26000028      82
Length: 102, dtype: int64

From the above graph and value counts, we can see the five most common cards on the losers' side are 28000008 (zap), 28000011 (the log), 28000000 (fireball), 26000011 (valkyrie), and 28000001 (arrows). This top 5 is extremely similar to the winners' top 5; in fact, the graphs themselves have very similar distributions, and the two sides share 4 of the 5 cards. The differences are that the losers have valkyrie in 4th place, which pushes arrows down to 5th and knocks knight out of their top 5 entirely. Valkyrie is a very strong card because it has a lot of health and does area-of-effect damage, so it can hit multiple things at once. From this data alone it is difficult to explain the association between losing and valkyrie, and more analysis would need to be done. I would guess the correlation has less to do with the valkyrie itself than with how a player can become dependent on certain cards.
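One way to push this winner-versus-loser comparison further is to look at each card's share of all card slots on each side and rank the gaps. The sketch below uses three toy counts loosely based on the printed value counts above (the third id stands in for valkyrie); the real analysis would feed in the full `value_counts()` results:

```python
import pandas as pd

# Toy counts standing in for winnerCardsSeries/loserCardsSeries.value_counts().
winner_counts = pd.Series({28000008: 3133, 26000000: 2038, 26000011: 1900})
loser_counts = pd.Series({28000008: 2955, 26000000: 1800, 26000011: 2360})

# Fraction of each side's card slots that each card occupies.
winner_share = winner_counts / winner_counts.sum()
loser_share = loser_counts / loser_counts.sum()

# Positive gap -> relatively more common among winners; negative -> among losers.
gap = (winner_share - loser_share).sort_values(ascending=False)
print(gap)
```

Ranking the full gap series would show at a glance which cards lean toward the winning or the losing side.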

In [ ]:
loser_wincons = []
# Make a list of every win condition card in the loser decks
for i in range(0, len(loserCards)):
    if loserCards[i] in wincons['card_id'].to_list():
        loser_wincons.append(loserCards[i])

print("Number of wincons present in the loser decks: " + str(len(loser_wincons)))
loser_wincons_series = pd.Series(loser_wincons) 
# plot the frequency distribution of the win conditions
loser_wincons_series.value_counts().plot.bar(figsize=(20,8), color='maroon')
plt.title('Win Condition Card Distribution among Losers', fontsize=20)
plt.xlabel('Win Condition Cards', fontsize=14)
plt.ylabel('Number of Decks which Include a Card', fontsize=14)
plt.show
Number of wincons present in the loser decks: 11924

From the above graph, we can see that the five most common win conditions on the losers' side are 26000021 (hog rider), 26000006 (balloon), 28000004 (goblin barrel), 26000032 (miner), and 26000009 (golem). Once again, the losers' distribution looks extremely similar to the winners' distribution; in fact, the top 5 cards are exactly the same. With so little difference in the win condition distribution between winners and losers, it could mean that the cards in a deck matter less than a player's skill. It could also mean that certain cards are so oppressive that everyone is forced to play them just to have a chance at winning. Either way, the graph data shows that there is a "meta" in Clash Royale. A "meta" in a gaming sense means the most effective tactics available, and players often have to conform to the meta to have a chance at winning. Based on the graphs, you could conclude that a hog rider deck is the meta, as it is popular among both winners and losers. However, the number of players using balloon is also very high, so no single strategy dominates. That is good, because having multiple viable strategies leads to more creativity and more fun.

In [ ]:
# get the top 10 most common decks for winners
top_decks_winner = df['winner.cards.list'].value_counts()[0:10]
print(top_decks_winner)
# Plot the decks
ax = top_decks_winner.plot.bar(figsize=(20,8), color='cornflowerblue')
ax.set_xticklabels(['Deck 1', 'Deck 2', 'Deck 3', 'Deck 4', 'Deck 5', 'Deck 6', 'Deck 7', 'Deck 8', 'Deck 9', 'Deck 10'])
plt.title('Top 10 Decks among Winners', fontsize=20)
plt.xlabel('Decks', fontsize=14)
plt.ylabel('Frequency of a Deck', fontsize=14)
plt.show()
[26000000, 26000026, 26000030, 26000041, 27000003, 28000003, 28000004, 28000011]    175
[26000000, 26000001, 26000010, 26000030, 27000006, 27000008, 28000000, 28000011]    129
[26000004, 26000036, 26000042, 26000046, 26000050, 26000062, 28000008, 28000009]    129
[26000010, 26000014, 26000021, 26000030, 26000038, 27000000, 28000000, 28000011]    129
[26000006, 26000008, 26000029, 26000032, 26000037, 26000080, 28000001, 28000008]    107
[26000000, 26000010, 26000023, 27000006, 27000008, 28000003, 28000011, 28000012]     82
[26000000, 26000019, 26000032, 26000049, 26000058, 27000004, 28000000, 28000011]     63
[26000000, 26000015, 26000023, 27000004, 28000009, 28000010, 28000012, 28000015]     60
[26000009, 26000015, 26000023, 26000027, 26000048, 28000007, 28000012, 28000015]     59
[26000003, 26000016, 26000027, 26000032, 26000039, 26000042, 28000000, 28000008]     58
Name: winner.cards.list, dtype: int64

In the above graph, we can see the frequency distribution of the top 10 most common winner decks. I will briefly discuss the top two. The first is 26000000 (knight), 26000026 (princess), 26000030 (ice spirit), 26000041 (goblin gang), 27000003 (inferno tower), 28000003 (rocket), 28000004 (goblin barrel), and 28000011 (the log). This appears to be a version of the "log bait" deck. A log bait deck's win condition is the goblin barrel, one of the top five win conditions from above, and it also runs the log and the knight, two of the top five most common cards. A player using this strategy consistently uses their cards to chip away at the enemy tower's health while defending enemy pushes. Perhaps it is the most common because it is easy to learn and very spam-based. The next few decks are tied, so I arbitrarily chose this one to look at: 26000000 (knight), 26000001 (archers), 26000010 (skeletons), 26000030 (ice spirit), 27000006 (tesla), 27000008 (X-bow), 28000000 (fireball), 28000011 (the log). This deck has three of our top five most common cards (knight, the log, and fireball), and its win condition is X-bow, which is not in our top five win conditions. X-bow is a card that can attack the enemy tower directly while staying out of range of its automatic defenses; if the enemy does not distract it and deal with it quickly, it can do a lot of damage. The rest of the cards have very low elixir costs, so you can play them quickly and "cycle" back to the X-bow to play it again. It also seems easy to use and relatively spam-based. It makes sense that the popular decks are ones that are easy to use and can easily overwhelm an opponent.
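Since value_counts works here, the cards.list column must store each deck as a string like "[26000000, ...]", so inspecting a deck by name takes two steps: parse the string back into a list of ids, then translate the ids. A hedged sketch, with a hypothetical id-to-name mapping hard-coded from the discussion above:

```python
import ast

# A deck as stored in the cards.list column: a stringified list of ids.
deck_str = "[26000000, 26000026, 26000030, 26000041]"
# literal_eval safely recovers the actual Python list from the string.
deck_ids = ast.literal_eval(deck_str)

# Hypothetical id -> name mapping (names taken from the deck discussed above);
# the real mapping would come from the CardMasterList file.
id_to_name = {26000000: "Knight", 26000026: "Princess",
              26000030: "Ice Spirit", 26000041: "Goblin Gang"}
deck_names = [id_to_name.get(i, str(i)) for i in deck_ids]
print(deck_names)
```

Mapping this over the whole column would let the top-deck tables above be printed with card names instead of ids.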

In [ ]:
# get the top 10 most common decks for losers 
top_decks_loser = df['loser.cards.list'].value_counts()[0:10]
print(top_decks_loser)
# Plot the decks
ax = top_decks_loser.plot.bar(figsize=(20,8), color='maroon')
ax.set_xticklabels(['Deck 1', 'Deck 2', 'Deck 3', 'Deck 4', 'Deck 5', 'Deck 6', 'Deck 7', 'Deck 8', 'Deck 9', 'Deck 10'])
plt.title('Top 10 Decks among Losers', fontsize=20)
plt.xlabel('Decks', fontsize=14)
plt.ylabel('Frequency of a Deck', fontsize=14)
plt.show()
[26000000, 26000026, 26000030, 26000041, 27000003, 28000003, 28000004, 28000011]    159
[26000010, 26000014, 26000021, 26000030, 26000038, 27000000, 28000000, 28000011]    135
[26000000, 26000001, 26000010, 26000030, 27000006, 27000008, 28000000, 28000011]     78
[26000000, 26000010, 26000023, 27000006, 27000008, 28000003, 28000011, 28000012]     66
[26000004, 26000036, 26000042, 26000046, 26000050, 26000062, 28000008, 28000009]     62
[26000006, 26000008, 26000029, 26000032, 26000037, 26000080, 28000001, 28000008]     55
[26000000, 26000019, 26000032, 26000049, 26000058, 27000004, 28000000, 28000011]     52
[26000000, 26000015, 26000023, 27000004, 28000009, 28000010, 28000012, 28000015]     51
[26000000, 26000026, 26000030, 26000041, 27000006, 28000003, 28000004, 28000011]     49
[26000009, 26000015, 26000035, 26000039, 26000048, 28000007, 28000012, 28000015]     49
Name: loser.cards.list, dtype: int64

In the above graph, we can see the frequency distribution of the top 10 most common loser decks. I will briefly discuss the top two. The first is 26000000 (knight), 26000026 (princess), 26000030 (ice spirit), 26000041 (goblin gang), 27000003 (inferno tower), 28000003 (rocket), 28000004 (goblin barrel), and 28000011 (the log). Interestingly enough, this is the exact same log bait deck that we saw on the winners' side, likely because of the deck's sheer prevalence: if a deck is very common, it will naturally accumulate many wins and many losses. If it had many wins but few losses, that would indicate it is probably the most effective strategy in the game; with many of both, it looks strong and popular but not overpowered. The second deck is 26000010 (skeletons), 26000014 (musketeer), 26000021 (hog rider), 26000030 (ice spirit), 26000038 (ice golem), 27000000 (cannon), 28000000 (fireball), 28000011 (the log). This deck is also tied for second on the winners' side, though it was not the one I discussed there. It has two of our top 5 loser cards (the log and fireball) and the most popular loser win condition, hog rider. The deck is called hog rider cycle and is based on overwhelming your opponent by sending the hog rider as often as possible to continually attack. It is also relatively easy to learn and spam-based, which seems to be a pattern among the top decks. Overall, looking at the winners' and losers' top decks makes it easy to see which decks are the most popular. Since the same decks appear on both sides, they seem balanced rather than overpowered or too hard to deal with. While it is inevitable for certain strategies to be the most popular, it is good for the game not to have unbeatable ones.
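The "balanced, not overpowered" reading could be checked numerically: aligning the winner and loser deck counts gives a rough per-deck win rate. A sketch with toy deck labels and counts taken from the two tables above (a rate near 0.5 supports the balance argument):

```python
import pandas as pd

# Toy counts standing in for the winner and loser value_counts() results.
wins = pd.Series({"log bait": 175, "hog cycle": 129, "xbow cycle": 129})
losses = pd.Series({"log bait": 159, "hog cycle": 135, "xbow cycle": 78})

# Align the two series on deck label and estimate a per-deck win rate.
games = wins.add(losses, fill_value=0)
win_rate = (wins / games).sort_values(ascending=False)
print(win_rate.round(3))
```

With the full value_counts results (keyed by the cards.list strings) the same three lines would rank every common deck by its observed win rate, though rates from only 10,000 sampled matches should be read cautiously.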

Hypothesis Testing¶

There are a couple of statistics I want to check for differences between winners and losers. To do this, I am going to perform several two-sample t-tests on columns of our dataframe. This can help us see if, and how, certain factors contribute to wins and losses.
Learn about t-tests:
https://www.geeksforgeeks.org/how-to-conduct-a-two-sample-t-test-in-python/
https://towardsdatascience.com/t-test-and-hypothesis-testing-explained-simply-1cff6358633e

In [ ]:
import scipy.stats as stats
# 2 sample t tests
# Elixir Average
print("Elixir Average:")
print(df['winner.elixir.average'].mean())
print(df['loser.elixir.average'].mean())
print(stats.ttest_ind(df['winner.elixir.average'], df['loser.elixir.average']))
# Spell Counts
print("Spell Counts:")
print(df['winner.spell.count'].mean())
print(df['loser.spell.count'].mean())
print(stats.ttest_ind(df['winner.spell.count'], df['loser.spell.count']))
# Legendary Counts
print("Legendary Counts:")
print(df['winner.legendary.count'].mean())
print(df['loser.legendary.count'].mean())
print(stats.ttest_ind(df['winner.legendary.count'], df['loser.legendary.count']))
# Total Card Levels
print("Total Card Levels:")
print(df['winner.totalcard.level'].mean())
print(df['loser.totalcard.level'].mean())
print(stats.ttest_ind(df['winner.totalcard.level'], df['loser.totalcard.level']))
# Starting Trophies
print("Starting Trophies")
print(df['winner.startingTrophies'].mean())
print(df['loser.startingTrophies'].mean())
print(stats.ttest_ind(df['winner.startingTrophies'], df['loser.startingTrophies']))
Elixir Average:
3.761382142857143
3.796941071428571
Ttest_indResult(statistic=-4.949226778999882, pvalue=7.511465939739547e-07)
Spell Counts:
2.1064
2.0718
Ttest_indResult(statistic=2.9472044651245857, pvalue=0.0032103129941737107)
Legendary Counts:
1.6639
1.584
Ttest_indResult(statistic=4.994397046861575, pvalue=5.951686925696776e-07)
Total Card Levels:
99.4396
98.7528
Ttest_indResult(statistic=4.562144040400342, pvalue=5.093467257694242e-06)
Starting Trophies
5221.8336
5220.9394
Ttest_indResult(statistic=0.08536764645481916, pvalue=0.9319699682156413)
Results¶

The elixir averages, spell counts, legendary card counts, and total card levels all have p-values below 0.05, so in each case we can reject the null hypothesis and conclude that the winner and loser population means differ. It seems that winner decks typically have a lower average elixir cost, more spell cards, more legendary cards, and higher total card levels. Higher card levels and more legendaries make sense, since both directly help a player win. The lower elixir average and extra spell cards are more interesting, as their link to winning is less obvious. Perhaps spells are more versatile than other cards, and a lower elixir average gives a player more options and a faster cycle.
The starting-trophies t-test has a p-value above 0.05, so we fail to reject the null hypothesis and cannot conclude that the winner and loser population means differ with respect to starting trophies. This makes sense because the game tries to match players of equal skill by pairing those with similar trophy counts, which could be taken as evidence that the matchmaking system is fair. On the other hand, we saw that total card levels differ between winners and losers, so despite similar trophy counts, one player may have much higher card levels than the other, which could make a match unfair. One could argue that a skill difference can overcome this gap; more analysis of the matchmaking system would be needed before concluding whether it is fair.
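One caveat worth adding: with 10,000 rows per group, even tiny mean differences produce small p-values, so a standardized effect size helps judge whether the significant gaps above actually matter. A minimal Cohen's d sketch (cohens_d is a hand-rolled helper, and the samples are simulated stand-ins, not the real columns):

```python
import numpy as np

def cohens_d(a, b):
    """Standardized mean difference using a pooled standard deviation."""
    a, b = np.asarray(a, dtype=float), np.asarray(b, dtype=float)
    na, nb = len(a), len(b)
    pooled = np.sqrt(((na - 1) * a.var(ddof=1) + (nb - 1) * b.var(ddof=1))
                     / (na + nb - 2))
    return (a.mean() - b.mean()) / pooled

# Simulated groups with a small mean shift: large n makes the t-test
# "significant" while the effect size stays small.
rng = np.random.default_rng(0)
winners = rng.normal(3.76, 0.5, 10000)
losers = rng.normal(3.80, 0.5, 10000)
print(round(cohens_d(winners, losers), 3))
```

Applied to the real winner/loser columns, a |d| well under 0.2 would suggest the differences, while real, are practically small.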

Classification¶

I thought it would be an interesting task to see if I could use a machine learning classifier to predict the number of crowns the winner takes in a game, based on the cards and card levels in both players' decks. You gain a crown for every enemy tower you destroy: 1, 2, or 3 per game. The classifiers I tried were K Nearest Neighbors and Random Forest. I used grid search to find good hyperparameters for each model and 10-fold cross-validation to estimate accuracy.
Learn about Random Forests: https://www.datacamp.com/tutorial/random-forests-classifier-python
Learn about K Nearest Neighbors: https://www.datacamp.com/tutorial/k-nearest-neighbor-classification-scikit-learn

In [ ]:
# Classification Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score,KFold
from sklearn import metrics
from numpy import mean
from numpy import absolute
import pandas as pd
import scipy.stats as stats

# Features
x_set = df[['winner.card1.id', 'winner.card1.level', 'winner.card2.id', 'winner.card2.level',
'winner.card3.id', 'winner.card3.level', 'winner.card4.id', 'winner.card4.level',
'winner.card5.id', 'winner.card5.level', 'winner.card6.id', 'winner.card6.level',
'winner.card7.id', 'winner.card7.level', 'winner.card8.id', 'winner.card8.level',
'loser.card1.id', 'loser.card1.level', 'loser.card2.id', 'loser.card2.level',
'loser.card3.id', 'loser.card3.level', 'loser.card4.id', 'loser.card4.level',
'loser.card5.id', 'loser.card5.level', 'loser.card6.id', 'loser.card6.level',
'loser.card7.id', 'loser.card7.level', 'loser.card8.id', 'loser.card8.level',]]
# Labels
y_set = df['winner.crowns']
X_train, X_test, y_train, y_test = train_test_split(x_set, y_set, test_size=0.2, random_state=1)

# K Nearest Neighbors

knn = KNeighborsClassifier()
params = {
    'n_neighbors': [3,5,7,9,11,13,15,17,21,23,25]
}
# gridsearch
clf2 = GridSearchCV(
    estimator=knn,
    param_grid=params,
    cv=5,
    n_jobs=5,
    verbose=1
)
# fitting
clf2.fit(X_train, y_train)
print(clf2.best_params_)
neighbors = clf2.best_params_["n_neighbors"]
knn2 = KNeighborsClassifier(n_neighbors=neighbors)
knn2.fit(X_train, y_train)
y_pred = knn2.predict(X_test)
# Holdout predictions
print("KNN Holdout Accuracy:",metrics.accuracy_score(y_test, y_pred))
# Cross Validation
kf=KFold(n_splits=10)
score2=cross_val_score(knn2, x_set, y_set,cv=kf)
error_scores2 = cross_val_score(knn2, x_set, y_set, scoring='neg_mean_absolute_error',cv=kf)
print("KNN Cross Validation Scores are {}".format(score2))
print("KNN Average Cross Validation score: {}".format(score2.mean()))
print("KNN Mean Absolute Error: {}".format(mean(absolute(error_scores2))))

# Random Forest

forest = RandomForestClassifier()
params = {'n_estimators': [5, 10, 25, 50, 100, 150, 200], 'max_depth': [5, 10, 20, 40, 80]}

#grid search
clf = GridSearchCV(
    estimator=forest,
    param_grid=params,
    cv=5,
    n_jobs=5,
    verbose=1
)
# fitting
clf.fit(X_train, y_train)
print(clf.best_params_)
n_val = clf.best_params_["n_estimators"]
depth = clf.best_params_["max_depth"]
forest2 = RandomForestClassifier(n_estimators=n_val, max_depth = depth)
forest2.fit(X_train, y_train)
y_pred=forest2.predict(X_test)
# Holdout predictions
print("RF Holdout Accuracy:",metrics.accuracy_score(y_test, y_pred))
# Cross Validation
kf=KFold(n_splits=10)
score=cross_val_score(forest2,x_set, y_set,cv=kf)
error_scores = cross_val_score(forest2,x_set, y_set, scoring='neg_mean_absolute_error',cv=kf)
print("RF Cross Validation Scores are {}".format(score))
print("RF Average Cross Validation score: {}".format(score.mean()))
print("RF Mean Absolute Error: {}".format(mean(absolute(error_scores))))

# 2 Sample t test
stats.ttest_ind(score2, score)
Fitting 5 folds for each of 11 candidates, totalling 55 fits
{'n_neighbors': 25}
KNN Holdout Accuracy: 0.547
KNN Cross Validation Scores are [0.54  0.556 0.539 0.542 0.509 0.521 0.517 0.486 0.506 0.511]
KNN Average Cross Validation score: 0.5227000000000002
KNN Mean Absolute Error: 0.6943
Fitting 5 folds for each of 35 candidates, totalling 175 fits
{'max_depth': 20, 'n_estimators': 200}
RF Holdout Accuracy: 0.5675
RF Cross Validation Scores are [0.573 0.601 0.57  0.564 0.549 0.569 0.548 0.543 0.541 0.531]
RF Average Cross Validation score: 0.5589
RF Mean Absolute Error: 0.6454000000000001
Out[ ]:
Ttest_indResult(statistic=-3.8827833633961824, pvalue=0.0010904617094874593)
Classifier Results¶

Unfortunately, our classifiers were not able to reliably predict the number of crowns the winner would get given only the decks and card levels of the winner and loser. The K Nearest Neighbors model achieved around 52% cross-validation accuracy and the Random Forest model around 56%. The t-test performed on their cross-validation scores produced a p-value below 0.05, so we can reject the null hypothesis and say the Random Forest's average accuracy was better than KNN's. In the future, this could be revisited with different classifiers or hyperparameters, with different labels, or with additional features.
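To judge how weak "around 52-56%" really is, it helps to compare against the majority-class baseline: the accuracy of a trivial model that always predicts the most common crown count. A sketch with toy labels standing in for the real winner.crowns column:

```python
import pandas as pd

# Toy crown labels standing in for df['winner.crowns'].
y = pd.Series([1, 1, 2, 1, 3, 2, 1, 1, 3, 1])

# The majority-class rate is the accuracy floor any real model must beat.
baseline = y.value_counts(normalize=True).max()
print(baseline)
```

Running the same two lines on the real label column would show how much (if at all) the KNN and Random Forest accuracies exceed always guessing the modal crown count.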

Conclusion¶

In conclusion, by going through the data science pipeline, we were able to gain some insights into Clash Royale. We learned about the most popular cards, win conditions, and decks for both winners and losers. We inferred that the game is reasonably balanced, since the common decks and cards appear among both winners and losers. We performed t-tests on several columns to see how certain factors can affect a game, and we attempted to build a classifier to predict how many crowns a winner would take. All in all, we went through the data science lifecycle while digging a little deeper into Clash Royale.