import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OrdinalEncoder
import sklearn.preprocessing as preprocessing
import sklearn.model_selection as model_selection
import tensorflow as tf
import matplotlib.pyplot as plt
# A lot of this code is derived from:
# https://www.kaggle.com/code/cullensun/deep-learning-model-for-hong-kong-horse-racing
# Load the CSV file into a pandas DataFrame
data = pd.read_csv('all.csv')

# Ideas still to explore:
#   * train one model per class instead of a single model for everything
#   * more clean-up of the raw data

# The raw 'random' column encodes the course (6 = short course, 0 = long
# course). Re-code it to 0 = short, 1 = long. The order of the two steps
# matters: the long courses have to leave 0 before the short ones move onto it.
for raw_code, course_code in ((0, 1), (6, 0)):
    data.loc[data['random'] == raw_code, 'random'] = course_code
data = data.rename(columns={'random': 'course_length'})

# Fix the jockey order. This is done in two passes through temporary codes
# (6-9) so that a freshly written code is never picked up by a later rule.
# Net effect: 0 -> 3, 1 -> 2, 2 -> 0, 3 -> 1.
for old_code, placeholder in ((0, 9), (1, 8), (2, 7), (3, 6)):
    data.loc[data['jockey'] == old_code, 'jockey'] = placeholder
# bronze,silver,gold,plat
for placeholder, new_code in ((9, 3), (8, 2), (7, 0), (6, 1)):
    data.loc[data['jockey'] == placeholder, 'jockey'] = new_code

# Disabled experiment: golds are better than plats at short courses, so swap
# 3 and 2 there.
#data.loc[(data['course_length'] == 0) & (data['jockey'] == 3), 'jockey'] = 5
#data.loc[(data['course_length'] == 0) & (data['jockey'] == 2), 'jockey'] = 6
#data.loc[data['jockey'] == 5, 'jockey'] = 2
#data.loc[data['jockey'] == 6, 'jockey'] = 3

# Give the letter ranks numeric values: c, b, a, s -> 0, 1, 2, 3.
ordinal_encoder = OrdinalEncoder(categories=[['c', 'b', 'a', 's']])
data['rank'] = ordinal_encoder.fit_transform(data[['rank']])

# Every run of 6 consecutive rows is treated as one race: 'raceid' numbers the
# race and 'draw' (1-6) is the row's position inside it.
data['raceid'] = data.index // 6
data['draw'] = data.index % 6 + 1

# Keep only the columns we care about, in this order; everything else is
# dropped. 'raceid' and 'draw' stay first, 'winorder' is the training target.
# Consider training on less data, e.g.:
#data = data[['raceid','draw','rs1','ts','winorder']]
kept_columns = [
    'raceid', 'draw',
    'course_length', 'rank', 'ts', 'stamina1', 'sprinting', 'jockey',
    'rs1', 'intel', 'coop', 'acc',
    'winorder',
]
data = data[kept_columns]
# Expand each race into a single row with one column per (stat, draw) pair.
# Who a chocobo races against matters, not just its own stats, so the six
# entrants of a race are merged into one sample. (Another approach is to skip
# this and analyze the stats of each chocobo individually.)
value_columns = [name for name in data.columns if name not in ('raceid', 'draw')]
data = data.pivot(index='raceid', columns='draw', values=value_columns)
# Sort the columns alphabetically. The positional slicing below relies on
# 'winorder' sorting after every other stat name, so its 6 columns end up last.
rearranged_columns = sorted(data.columns)
data = data[rearranged_columns]
# Features: everything except the 6 trailing winorder columns.
X = data[data.columns[:-6]]
featureCount = X.shape[1]
# Finishing positions: the 6 trailing winorder columns.
W = data[data.columns[-6:]]
#print(X.head(10))
#print(W.head(10))

# This is where the science takes a turn to a little trial and error: we need
# to decide what the model's target looks like before training it.
# The referenced website trains on "who came first": 1 for the winner, 0 for
# everybody else.
# .42 success
y_won = (W < 2).astype(float)
# We actually care about first, second and third place with their order, so
# assign 1 for first, .67 for second and .33 for third.
# .32 success
#y_won = W.apply(lambda col: col.map({1: 1.0, 2: 0.67, 3: 0.33}).fillna(0.0))
# We could mark the whole top 3 as 1 and train on that as well.
# .29 success
#y_won = (W < 4).astype(float)
# We could leave the values alone and train on the entire placement.
# .17 success
#y_won = W
#print(y_won.head(10))
print(X.shape)
print(y_won.shape)
outShape = y_won.shape[1]

# Split into 80% train and 20% test sets. The raw finishing positions (W) go
# through the same call so that y_test1 is guaranteed to line up row-for-row
# with X_test / y_test; it is used later to score the predictions.
X_train, X_test, y_train, y_test, y_train1, y_test1 = model_selection.train_test_split(
    X, y_won, W, train_size=0.8, test_size=0.2, random_state=1)

# Standardize the features (the original notes report ~20% higher success
# rates with this). The scaler is fitted on the training rows only and then
# applied to the test rows, so no test-set statistics leak into training.
ss = preprocessing.StandardScaler()
X_train = pd.DataFrame(ss.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(ss.transform(X_test), columns=X_test.columns, index=X_test.index)

print('X_train', X_train.shape)
print('y_train', y_train.shape)
print('X_test', X_test.shape)
print('y_test', y_test.shape)
#print(y_test.head(10))

# Convert the test-set finishing positions into one 0/1 list per race that
# flags who came first or second, aka [[0,1,1,0,0,0],[1,1,0,0,0,0]].
# We'll use this later to see how well the model does.
#print(y_test1.head(10))
y_test1 = y_test1.isin([1, 2]).astype(int)
y_actual = y_test1.values.tolist()
#print(y_actual)
# Build and train the model, following the referenced website's code:
# one hidden ReLU layer feeding a softmax with one output per chocobo.
# 96 is arbitrary afaik
hidden_layer = tf.keras.layers.Dense(96, activation='relu', input_shape=(featureCount,))
output_layer = tf.keras.layers.Dense(outShape, activation='softmax')
model = tf.keras.Sequential([hidden_layer, output_layer])
model.compile(
    optimizer=tf.keras.optimizers.Adam(5e-04),
    loss=tf.keras.losses.CategoricalCrossentropy(),
    metrics=[tf.keras.metrics.Precision(name='precision')],
)

# Both splits are wrapped as shuffled tf.data pipelines in batches of 500.
# The test split doubles as the validation set during fitting.
train_dataset = (
    tf.data.Dataset
    .from_tensor_slices((X_train.values, y_train.values))
    .shuffle(len(X_train))
    .batch(500)
)
validation_dataset = (
    tf.data.Dataset
    .from_tensor_slices((X_test.values, y_test.values))
    .shuffle(len(X_test))
    .batch(500)
)

print("Start training..\n")
# The number of epochs is trial and error.
history = model.fit(train_dataset, epochs=100, validation_data=validation_dataset)
print("Done.")

# Predict on the test set. Each row of y_pred holds one value between 0 and 1
# per chocobo, e.g. [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]; the real values are not
# sorted and carry many more decimals.
y_pred = model.predict(X_test)
# process_result takes the highest 3 and uses them as the guesses
def process_result(data, top_n=3):
    """Turn one race's predicted chances into 0/1 picks for the best entries.

    Args:
        data: Sequence of scores, one per runner (a list or a 1-D array).
            It is only read, never modified.
        top_n: How many of the highest-scoring runners to pick. Defaults to 3;
            values below 0 are treated as 0.

    Returns:
        A new list of ints with the same length as ``data``: 1 at the
        positions of the ``top_n`` highest scores and 0 everywhere else.
        Ties are resolved in favour of the lower index. If ``data`` has fewer
        than ``top_n`` entries, every entry is picked.
    """
    size = len(data)
    # Rank positions by score, best first. sorted() is stable even with
    # reverse=True, so equal scores keep their original (lowest-first) order.
    ranked = sorted(range(size), key=lambda position: data[position], reverse=True)
    picked = set(ranked[:max(top_n, 0)])
    return [1 if position in picked else 0 for position in range(size)]
# Show the first three test races side by side.
print("\nOutputting first three races:")
# 0/1 flags of who actually finished first or second
print("Actual order")
for actual_row in y_actual[:3]:
    print(actual_row)
# Raw model output per chocobo
print("Predicted chances ")
for chance_row in y_pred[:3]:
    print(chance_row)
# The same predictions reduced to the picked chocobos
print("Guesses ")
for chance_row in y_pred[:3]:
    print(process_result(chance_row))

# Score every test race: turn the predicted chances into 0/1 guesses,
# aka [0.1,0.6,0.5,0.4,0.3,0.2] -> [0,1,1,1,0,0], and count the race as a
# success when both the actual first and second place are among the guesses.
print("\nAll races: ")
count = 0
success = 0
for race_number, chances in enumerate(y_pred):
    guess = process_result(chances)
    actual = y_actual[race_number]
    count += 1
    # Number of actual top-two finishers that were also guessed.
    hits = sum(
        1
        for position, placed in enumerate(actual)
        if placed == 1 and guess[position] == 1
    )
    if hits == 2:
        success += 1

print("Success: %s" % success)
print("Total: %s" % count)
print(success/count)