import chess.pgn
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, classification_report, precision_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

/Users/pranavshah/anaconda3/lib/python3.11/site-packages/pandas/core/arrays/masked.py:60: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.5' currently installed).
  from pandas.core import (

# Glob pattern describing where the raw PGN dumps live.
pgnFilePath = r'./data/*.pgn'
# Destination CSV for the extracted rows. The checkpoint write below needs this
# name to exist; it was previously only present as a comment (NameError).
outputFilePath = "./evaluations.csv"
totalGameCount = 0
totalEvalGameCount = 0
# NOTE(review): presumably left empty on purpose so that re-running the script
# does not re-parse the PGN dumps; restore the glob call below (and add
# `import glob`) to regenerate the CSV.
fileList = []
#glob.glob(pgnFilePath)
rows_list = [] # list of dictionaries for each game
for file in fileList:
    # Open the file of the current iteration (the original always re-opened
    # fileList[0]) and close the handle once the file has been consumed.
    with open(file) as pgn:
        game = chess.pgn.read_game(pgn)
        while game is not None:
            totalGameCount += 1
            variations = game.variations # list of either the eval or clock score
            if totalGameCount % 2000 == 0: # print every 2000 games
                print("Total Game Count: ", totalGameCount)
                print("Eval Game Count: ", totalEvalGameCount)
            if totalGameCount % 1000000 == 0:
                # Periodic checkpoint so a crash does not lose everything
                df = pd.DataFrame(rows_list)
                df.to_csv(outputFilePath)
            # Only keep games that carry computer evaluations
            if len(variations) > 0 and 'eval' in variations[0].comment:
                totalEvalGameCount += 1
                h = game.headers

                # Collecting dictionaries is faster than appending to a dataframe.
                # np.nan is used because the np.NaN alias was removed in NumPy 2.0.
                rows_list.append({
                    "UTCDate": h.get("UTCDate", np.nan),
                    "UTCTime": h.get("UTCTime", np.nan),
                    'WhiteElo': h.get('WhiteElo', np.nan),
                    'BlackElo': h.get('BlackElo', np.nan),
                    "Opening": h.get("Opening", np.nan),
                    "ECO": h.get("ECO", np.nan),
                    'Result': h.get('Result', np.nan),
                    "Termination": h.get("Termination", np.nan),
                    "Variations": str(variations[0]),
                    'WhiteRatingDiff': h.get('WhiteRatingDiff', np.nan),
                    'BlackRatingDiff': h.get("BlackRatingDiff", np.nan)
                })

            # Iterator reading next game
            game = chess.pgn.read_game(pgn)

# Write whatever was collected after the last checkpoint. Guarded so that a run
# with no input files never overwrites an existing CSV with an empty frame.
if rows_list:
    pd.DataFrame(rows_list).to_csv(outputFilePath)

# Read data back in from csv and ignore the first column because it just contains the indices.
# We only want white and black elo, opening category, result, and variations
# (usecols restricts the load to exactly those five columns).
smallDataset = "miniEvaluations.csv"
fullDataset = "evaluations.csv"
# Swap fullDataset for smallDataset here to work on the smaller file instead.
df= pd.read_csv(fullDataset, usecols=["WhiteElo", "BlackElo", "ECO", "Result", "Variations"])

# All data will be from the perspective of white 

# Get mistake differential from a game
def getMistakeDifferentials(variation):
    """Count blunders, mistakes and inaccuracies and return white-minus-black differentials.

    variation: move text containing "%eval <score>" annotations, one per ply.

    The score swing between two consecutive plies is classified as a blunder
    (> 3), a mistake (> 1) or an inaccuracy (> 0.5) and attributed to the side
    that made the later ply (even index = white, odd index = black).

    Returns a pd.Series with "BlunderDifferential", "MistakeDifferential" and
    "InaccuracyDifferential".
    """
    # Capture the full numeric score (any number of digits before the optional
    # decimal part) and also mate scores such as "#3" / "#-2". Mate tokens must
    # be kept so that the list index still equals the ply index; dropping them
    # would shift the white/black attribution of every later evaluation.
    evalTokens = re.findall(r'%eval (#?-?\d+(?:\.\d+)?)', variation)
    # Mate scores have no numeric pawn value, so they become None placeholders.
    evalList = [None if token.startswith("#") else float(token) for token in evalTokens]

    # tallies[side] = [blunders, mistakes, inaccuracies]; side 0 is white, 1 is black
    tallies = [[0, 0, 0], [0, 0, 0]]
    for i in range(1, len(evalList)):
        previous, current = evalList[i - 1], evalList[i]
        # A swing into or out of a forced mate cannot be measured; skip it
        if previous is None or current is None:
            continue
        swing = abs(current - previous)
        if swing > 3:
            severity = 0
        elif swing > 1:
            severity = 1
        elif swing > 0.5:
            severity = 2
        else:
            continue
        tallies[i % 2][severity] += 1

    white, black = tallies
    return pd.Series({"BlunderDifferential" : white[0] - black[0], "MistakeDifferential" : white[1] - black[1],
                      "InaccuracyDifferential" : white[2] - black[2]})

# Get time differential from a game
def getTimeDifferential(variation):
    """Return white's final clock minus black's, plus white's first clock reading.

    variation: move text containing "%clk h:mm:ss" annotations, one per ply.

    Returns a pd.Series with "TimeDifferential" (seconds) and "TimeControl"
    (white's first recorded clock, in seconds). Both are 0 when fewer than two
    clock readings are present.
    """
    # Convert every clock annotation to a number of seconds
    secondsLeft = []
    for stamp in re.findall(r'%clk \d:\d{2}:\d{2}', variation):
        hours, minutes, seconds = (int(part) for part in stamp.split(" ")[1].split(":"))
        secondsLeft.append(hours * 3600 + minutes * 60 + seconds)

    # With fewer than 2 readings there is nothing to compare
    if len(secondsLeft) < 2:
        return pd.Series({"TimeDifferential": 0, "TimeControl": 0})

    # Readings alternate white, black, white, ... so the parity of the count
    # tells which side owns the final reading.
    blackMovedLast = len(secondsLeft) % 2 == 0
    if blackMovedLast:
        whiteFinal, blackFinal = secondsLeft[-2], secondsLeft[-1]
    else:
        whiteFinal, blackFinal = secondsLeft[-1], secondsLeft[-2]

    return pd.Series({"TimeDifferential": whiteFinal - blackFinal, "TimeControl": secondsLeft[0]})

def getMoves(variation):
    """Return the number of evaluated plies in the game, rounded up to an even number."""
    plyCount = len(re.findall(r'%eval -?\d.\d*', variation))
    # Round odd counts up so the parity cannot reveal which side moved last
    return plyCount + plyCount % 2

# Turn string result into a number result 
def getResultForWhite(result):
    """Map a PGN result string to white's score: 1 for "1-0", 0 for "0-1", 0.5 for anything else."""
    if result == "1-0":
        return 1
    return 0 if result == "0-1" else 0.5
    
# Derive the per-game features from the raw move text. Each helper returns a
# pd.Series, so apply() yields one column per Series key.
df[["BlunderDifferential","MistakeDifferential","InaccuracyDifferential"]] = df["Variations"].apply(getMistakeDifferentials)

df[["TimeDifferential","TimeControl"]] = df["Variations"].apply(getTimeDifferential)

# Number of evaluated plies, rounded up to an even number by getMoves
df["Moves"] = df["Variations"].apply(getMoves)

# Rating gap from white's point of view (positive = white is higher rated)
df["EloDifferential"] = df["WhiteElo"] - df["BlackElo"]

df["AverageElo"] = (df["WhiteElo"] + df["BlackElo"]) / 2

# Replace the PGN result string by white's score (1, 0.5 or 0)
df["Result"] = df["Result"].apply(getResultForWhite)


# Show every column when printing the frame
pd.set_option('display.max_columns', None)
print(df.head())

   WhiteElo  BlackElo  ECO  Result  \
0      2344      2247  B09     0.0   
1      1605      1733  C33     1.0   
2      1897      1491  B12     1.0   
3      2026      1684  B13     1.0   
4      1520      1079  C40     1.0   

                                          Variations  BlunderDifferential  \
0  1. e4 { [%eval 0.13] [%clk 0:03:00] } 1... d6 ...                    0   
1  1. e4 { [%eval 0.13] [%clk 0:10:00] } 1... e5 ...                   -1   
2  1. e4 { [%eval 0.13] [%clk 0:30:00] } 1... c6 ...                   -1   
3  1. e4 { [%eval 0.13] [%clk 0:29:57] } 1... c6 ...                    0   
4  1. e4 { [%eval 0.13] [%clk 0:10:00] } 1... e5 ...                    0   

   MistakeDifferential  InaccuracyDifferential  TimeDifferential  TimeControl  \
0                    2                       1               -26          180   
1                    0                      -2                33          600   
2                   -2                       0               495         1800   
3                    1                       2               531         1797   
4                   -3                      -3              -121          600   

   Moves  EloDifferential  AverageElo  
0     36               97      2295.5  
1     48             -128      1669.0  
2     48              406      1694.0  
3     90              342      1855.0  
4     48              441      1299.5

import re

def setupGame():
    """Build the starting position as an 8x8 list of lists.

    Row 0 is white's back rank and row 7 is black's. Squares hold a two-letter
    code (colour "W"/"B" followed by the piece letter) or "" when empty.
    """
    backRank = ["R", "N", "B", "Q", "K", "B", "N", "R"]
    board = [[""] * 8 for _ in range(8)]
    board[0] = ["W" + letter for letter in backRank]
    board[1] = ["WP"] * 8
    board[6] = ["BP"] * 8
    board[7] = ["B" + letter for letter in backRank]
    return board

def makeMove(gameArr, move, moveNum):
    """Apply one move in algebraic notation to the board, in place.

    gameArr: 8x8 board as produced by setupGame
    move:    the move text, e.g. "Nf3", "exd5", "O-O", "e8=Q"
    moveNum: 0-based ply index; even plies are white's, odd plies are black's
    """
    piece, specifier, file, rank = breakMoveUp(move)
    color = "B" if moveNum % 2 else "W"

    # Castling only rearranges the king and one rook on the side's home rank
    if piece == "Castle":
        homeRank = 7 if color == "B" else 0
        gameArr[homeRank][4] = ""
        gameArr[homeRank][5] = color + "R"
        gameArr[homeRank][6] = color + "K"
        gameArr[homeRank][7] = ""
        return
    if piece == "Queenside Castle":
        homeRank = 7 if color == "B" else 0
        gameArr[homeRank][0] = ""
        gameArr[homeRank][2] = color + "K"
        gameArr[homeRank][3] = color + "R"
        gameArr[homeRank][4] = ""
        return

    # Promotion: play the pawn move first, then swap the pawn for the new piece
    if piece == "Promotion":
        pawnMove = move.split("=")[0]
        makeMove(gameArr, pawnMove, moveNum)
        piece, specifier, file, rank = breakMoveUp(pawnMove)
        targetCol = ord(file) - ord("a")
        targetRow = int(rank) - 1
        gameArr[targetRow][targetCol] = color + move.split("=")[1][0]
        return

    # Regular move: locate the piece that can reach the target square
    coloredPiece = color + piece
    targetCol = ord(file) - ord("a")
    targetRow = int(rank) - 1
    fromRow, fromCol = findPiecePos(coloredPiece, specifier, gameArr, targetCol, targetRow, color)
    # A capture onto an empty square is treated as en passant: the captured
    # pawn sits one row behind the target from the mover's point of view.
    if "x" in move and gameArr[targetRow][targetCol] == "":
        capturedRow = targetRow - 1 if color == "W" else targetRow + 1
        gameArr[capturedRow][targetCol] = ""
    gameArr[targetRow][targetCol] = coloredPiece
    gameArr[fromRow][fromCol] = ""
       
def _pathBlocked(gameArr, startRow, startCol, endRow, endCol):
    """Report whether any square strictly between start and end is occupied.

    Only meaningful when the two squares share a row, a column or a diagonal.
    The scan walks from the destination back toward the origin.
    """
    rowDistance = abs(endRow - startRow)
    colDistance = abs(endCol - startCol)
    rowStep = 0 if rowDistance == 0 else (-1 if startRow < endRow else 1)
    colStep = 0 if colDistance == 0 else (-1 if startCol < endCol else 1)
    blocked = False
    for k in range(1, max(rowDistance, colDistance)):
        if gameArr[endRow + rowStep*k][endCol + colStep*k] != "":
            blocked = True
    return blocked


def findPiecePos(piece, specifier, gameArr, endCol, endRow, color):
    """Find the square of the piece that is making a move.

    piece:     two-letter code of the moving piece, e.g. "WN"
    specifier: disambiguating file letter or rank digit from the notation, or None
    gameArr:   8x8 board
    endCol, endRow: destination square (0-based)
    color:     "W" or "B"

    Returns (row, col) of the first matching piece, scanning row by row, that
      1. follows its movement rules,
      2. has an unobstructed path to the destination, and
      3. does not leave its own king attacked (see kingChecked).
    Falls through (returning None) when no piece qualifies.
    """
    col = -1
    row = -1
    if specifier:
        fileIndex = ord(specifier) - ord("a")
        if 0 <= fileIndex <= 8:
            # Specifier is a file letter
            col = fileIndex
        else:
            # Specifier is a rank digit
            row = int(specifier) - 1

    # Brute-force scan of all 64 squares
    for i, boardRow in enumerate(gameArr):
        for j, occupant in enumerate(boardRow):
            # Skip squares that fail the piece or specifier constraints
            if occupant != piece:
                continue
            if col != -1 and j != col:
                continue
            if row != -1 and i != row:
                continue

            colDistance = abs(endCol - j)
            rowDistance = abs(endRow - i)
            kind = piece[1]

            if kind == "N":
                # Knight: two squares one way, one square the other
                if colDistance + rowDistance == 3 and min(colDistance, rowDistance) == 1 \
                        and not kingChecked(gameArr, color, i, j, endRow, endCol):
                    return (i, j)
            elif kind == "R":
                # Rook: same row or same column, nothing in between
                if min(colDistance, rowDistance) == 0 \
                        and not kingChecked(gameArr, color, i, j, endRow, endCol):
                    if not _pathBlocked(gameArr, i, j, endRow, endCol):
                        return (i, j)
            elif kind == "B":
                # Bishop: same diagonal, nothing in between
                if colDistance == rowDistance \
                        and not kingChecked(gameArr, color, i, j, endRow, endCol):
                    if not _pathBlocked(gameArr, i, j, endRow, endCol):
                        return (i, j)
            elif kind == "P":
                if kingChecked(gameArr, color, i, j, endRow, endCol):
                    continue
                onStartRow = (color == "W" and i == 1) or (color == "B" and i == 6)
                if onStartRow and rowDistance == 2 and colDistance == 0:
                    # Double step: the skipped square must be empty
                    if color == "W" and i < endRow:
                        if gameArr[endRow-1][endCol] == "":
                            return (i, j)
                    elif color == "B" and i > endRow:
                        if gameArr[endRow+1][endCol] == "":
                            return (i, j)
                elif rowDistance == 1 and colDistance <= 1:
                    # Single step or capture, always toward the opponent
                    if color == "W" and i < endRow:
                        return (i, j)
                    elif color == "B" and i > endRow:
                        return (i, j)
            elif kind == "K":
                # The first king of this colour found is returned without further checks
                return (i, j)
            elif kind == "Q":
                # Queen: rook-like or bishop-like line, nothing in between
                if (min(colDistance, rowDistance) == 0 or colDistance == rowDistance) \
                        and not kingChecked(gameArr, color, i, j, endRow, endCol):
                    if not _pathBlocked(gameArr, i, j, endRow, endCol):
                        return (i, j)
    return None

def _kingAttackedBySlider(gameArr, color):
    """Return True when a king of `color` is attacked along a line by an enemy rook, bishop or queen."""
    enemy = "B" if color == "W" else "W"
    straightRays = [(0, 1), (0, -1), (1, 0), (-1, 0)]
    diagonalRays = [(1, 1), (-1, -1), (1, -1), (-1, 1)]
    for i, boardRow in enumerate(gameArr):
        for j, occupant in enumerate(boardRow):
            if occupant != color + "K":
                continue
            for rays, attackers in ((straightRays, ["R", "Q"]), (diagonalRays, ["B", "Q"])):
                for dRow, dCol in rays:
                    r, c = i + dRow, j + dCol
                    # Walk outward until the board edge or the first piece
                    while 0 <= r <= 7 and 0 <= c <= 7:
                        square = gameArr[r][c]
                        if square != "":
                            pieceColor = square[0]
                            pieceKind = square[1]
                            if pieceColor == enemy and pieceKind in attackers:
                                return True
                            # Any other piece shields the king on this ray
                            break
                        r += dRow
                        c += dCol
    return False


def kingChecked(gameArr, color, startRow, startCol, endRow, endCol):
    """Tell whether moving a piece would leave the mover's king attacked on a line.

    The move start -> end is played on the board temporarily, the king of
    `color` is tested against enemy rooks, bishops and queens, and the board is
    put back before returning. Knight and pawn attacks are not considered.
    """
    # Play the move provisionally
    startPiece = gameArr[startRow][startCol]
    gameArr[startRow][startCol] = ""
    endPiece = gameArr[endRow][endCol]
    gameArr[endRow][endCol] = startPiece

    attacked = _kingAttackedBySlider(gameArr, color)

    # Undo the provisional move
    gameArr[startRow][startCol] = startPiece
    gameArr[endRow][endCol] = endPiece
    return attacked
    
def breakMoveUp(move):
    """Split a move in algebraic notation into its parts.

    Returns a tuple (piece, specifier, file, rank):
      piece     - "R", "N", "B", "Q", "K", "P", or one of the markers
                  "Castle", "Queenside Castle", "Promotion"
      specifier - file letter or rank digit that disambiguates the moving
                  piece (for pawns, always the file the pawn starts on), else None
      file      - destination file letter
      rank      - destination rank digit
    For the three marker values the remaining entries are all None.
    """
    files = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h']
    ranks = ['1', '2', '3', '4', '5', '6', '7', '8']
    piece = move[0]
    specifier = None
    file = None
    rank = None
    if piece not in ["R", "N", "B", "Q", "K"]:
        # Castle or Pawn move
        if piece == "O":
            # Decide by prefix rather than by length so that a trailing check
            # or mate marker ("O-O+", "O-O#") is not mistaken for "O-O-O".
            if move.startswith("O-O-O"):
                return "Queenside Castle", None, None, None
            return "Castle", None, None, None
        piece = "P"
    if "=" in move:
        return "Promotion", None, None, None

    # The first coordinate character is provisionally the specifier; later
    # ones fill in the destination file and rank.
    for ch in move:
        if ch in files or ch in ranks:
            if specifier is None:
                specifier = ch
            elif ch in files:
                file = ch
            else:
                rank = ch
    # Without a disambiguator the provisional specifier was really part of the destination
    if not file:
        file = specifier
        specifier = None
    if not rank:
        rank = specifier
        specifier = None

    # A pawn always starts on a known file: the destination file unless it captured
    if piece == "P" and specifier is None:
        specifier = file

    return piece, specifier, file, rank

def playMoves(moves):
    """Replay a list of moves from the starting position and return the resulting board."""
    board = setupGame()
    # The ply index tells makeMove whose turn it is (even = white)
    for ply, move in enumerate(moves):
        makeMove(board, move, ply)
    return board


def findKingSafetyDifferential(gameArr):
    """Return white's pawn-shield penalty minus black's.

    For each king the shield zone is the king's file and its neighbours, over
    the one or two rows directly in front of the king (rows above a white
    king, rows below a black king). Scoring is delegated to
    calculateKingSafetyPenalty.
    """
    whitePenalty = 0
    blackPenalty = 0
    for i, boardRow in enumerate(gameArr):
        for j, square in enumerate(boardRow):
            if square not in ("WK", "BK"):
                continue
            # Shield columns: the king's file plus its neighbours, clipped to the board
            leftCol = max(0, j - 1)
            rightCol = min(7, j + 1)
            if square == "WK":
                whitePenalty = calculateKingSafetyPenalty(i + 1, min(7, i + 2), leftCol, rightCol, "W", gameArr)
            else:
                blackPenalty = calculateKingSafetyPenalty(max(0, i - 2), i - 1, leftCol, rightCol, "B", gameArr)
    return whitePenalty - blackPenalty

def calculateKingSafetyPenalty(minRow, maxRow, minCol, maxCol, color, gameArr):
    """Score the pawn shield over columns minCol..maxCol and rows minRow..maxRow.

    Per column: 0 when a pawn of `color` stands inside the row range, -1 when
    it does not but some pawn (either colour) stands on the file in rows 0-6,
    and -3 when no pawn is found on the file there. Returns the column sum.
    """
    totalPenalty = 0
    for fileIndex in range(minCol, maxCol + 1):
        hasShieldPawn = any(gameArr[r][fileIndex] == color + "P" for r in range(minRow, maxRow + 1))
        if hasShieldPawn:
            continue
        # No shielding pawn: distinguish an open file from a merely missing pawn
        fileHasPawn = any(
            gameArr[r][fileIndex] != "" and gameArr[r][fileIndex][1] == "P"
            for r in range(7)
        )
        totalPenalty += -1 if fileHasPawn else -3
    return totalPenalty

def findMobilityDifferential(gameArr):
    """Return white's mobility minus black's.

    Pawns score one point for an empty square directly ahead plus one more for
    an available double step from their starting row; every other piece is
    scored by generateMobilityCombos.
    """
    whiteMobility = 0
    blackMobility = 0
    for i, boardRow in enumerate(gameArr):
        for j, square in enumerate(boardRow):
            if square == "":
                continue
            color = square[0]
            piece = square[1]

            if piece != "P":
                reachable = generateMobilityCombos(piece, i, j, gameArr)
                if color == "W":
                    whiteMobility += reachable
                else:
                    blackMobility += reachable
                continue

            # Pawns: white advances toward higher rows, black toward lower rows
            if color == "W":
                if gameArr[i+1][j] == "":
                    whiteMobility += 1
                    if i == 1 and gameArr[i+2][j] == "":
                        whiteMobility += 1
            else:
                if gameArr[i-1][j] == "":
                    blackMobility += 1
                    if i == 6 and gameArr[i-2][j] == "":
                        blackMobility += 1
    return whiteMobility - blackMobility
                        
def _countEmptySteps(gameArr, row, col, offsets):
    """Count the on-board empty squares reached by single jumps of `offsets` from (row, col)."""
    count = 0
    for dRow, dCol in offsets:
        r, c = row + dRow, col + dCol
        if 0 <= r <= 7 and 0 <= c <= 7 and gameArr[r][c] == "":
            count += 1
    return count


def _countEmptySlides(gameArr, row, col, directions):
    """Count the empty squares reachable by sliding from (row, col) until a piece or the edge."""
    count = 0
    for dRow, dCol in directions:
        r, c = row + dRow, col + dCol
        while 0 <= r <= 7 and 0 <= c <= 7 and gameArr[r][c] == "":
            count += 1
            r += dRow
            c += dCol
    return count


def generateMobilityCombos(piece, row, col, gameArr):
    """Return how many empty squares the piece on (row, col) can move to.

    piece:   piece letter without colour ("N", "B", "R", "Q" or "K")
    gameArr: 8x8 board

    Only moves onto empty squares are counted (captures are not), and checks
    are ignored. Any other piece letter yields 0.

    Sliding pieces are followed all the way to the first occupied square or
    the board edge; the previous implementation stopped after four squares in
    each direction and undercounted long-range mobility.
    """
    straight = [(0, -1), (0, 1), (1, 0), (-1, 0)]
    diagonal = [(-1, -1), (1, 1), (-1, 1), (1, -1)]
    if piece == "N":
        jumps = [(-1, -2), (-1, 2), (1, -2), (1, 2), (2, 1), (2, -1), (-2, 1), (-2, -1)]
        return _countEmptySteps(gameArr, row, col, jumps)
    if piece == "K":
        return _countEmptySteps(gameArr, row, col, straight + diagonal)
    if piece == "B":
        return _countEmptySlides(gameArr, row, col, diagonal)
    if piece == "R":
        return _countEmptySlides(gameArr, row, col, straight)
    if piece == "Q":
        return _countEmptySlides(gameArr, row, col, straight + diagonal)
    return 0

def findMaterialDifferential(gameArr):
    """Return white's material score minus black's.

    Pieces are tallied per colour and scored by calcPoints:
    P=1, B and N=3, R=5, Q=9, with +1 for a bishop pair, -1 for a rook pair,
    -1 for a knight pair and -1 for having no pawns.
    """
    pieceLetters = ["P", "B", "N", "R", "Q", "K"]
    blackCounts = {letter: 0 for letter in pieceLetters}
    whiteCounts = {letter: 0 for letter in pieceLetters}
    for boardRow in gameArr:
        for square in boardRow:
            if square == "":
                continue
            color = square[0]
            piece = square[1]
            # Anything not marked black is counted for white
            tally = blackCounts if color == "B" else whiteCounts
            tally[piece] = tally.get(piece, 0) + 1
    blackPoints = calcPoints(blackCounts)
    whitePoints = calcPoints(whiteCounts)
    return whitePoints - blackPoints

def calcPoints(dict):
    """Score one side's material from a piece-letter -> count mapping.

    Base values: B=3, N=3, P=1, Q=9, R=5. Adjustments: +1 for exactly two
    bishops, -1 for exactly two knights, -1 for exactly two rooks, -1 for no pawns.
    """
    pieceValues = (("B", 3), ("N", 3), ("P", 1), ("Q", 9), ("R", 5))
    points = sum(dict[letter] * value for letter, value in pieceValues)
    points += 1 if dict["B"] == 2 else 0
    points -= 1 if dict["N"] == 2 else 0
    points -= 1 if dict["R"] == 2 else 0
    points -= 1 if dict["P"] == 0 else 0
    return points
    
def findDevelopmentDifferential(moves):
    """Return white's count of non-pawn moves (castling included) minus black's.

    moves: list of moves in algebraic notation, alternating white, black, ...
    """
    whiteNonPawnMoves = 0
    blackNonPawnMoves = 0
    for ply, move in enumerate(moves):
        # Piece moves start with the piece letter, castling with "O"
        if move[0] not in ["N", "B", "R", "Q", "K", "O"]:
            continue
        if ply % 2 == 0:
            whiteNonPawnMoves += 1
        else:
            blackNonPawnMoves += 1
    return whiteNonPawnMoves - blackNonPawnMoves
def findStatistics(variation):
    """Compute the board-based features after the first 24 plies of a game.

    The moves are extracted from the move text, the first 24 are replayed on a
    fresh board, and the king-safety, mobility, material and development
    differentials are returned as a pd.Series.
    """
    # A move is a number, one or more dots, whitespace, then letters/digits/dashes/equals
    numberedMoves = re.findall(r'\d+\.+\s[\w\-\=]+',variation)
    allMoves = [token.split(" ")[1] for token in numberedMoves]
    openingMoves = allMoves[:24]
    board = playMoves(openingMoves)

    features = {
        "KingSafetyDifferential": findKingSafetyDifferential(board),
        "MobilityDifferential": findMobilityDifferential(board),
        "MaterialDifferential": findMaterialDifferential(board),
        "DevelopmentDifferential": findDevelopmentDifferential(openingMoves),
    }
    return pd.Series(features)

# Replay the opening of every game and attach the four board-based features
df[["KingSafetyDifferential","MobilityDifferential","MaterialDifferential", "DevelopmentDifferential"]] = df["Variations"].apply(findStatistics)

# The raw move text and individual ratings are no longer needed once the features exist
df = df.drop(["Variations", "WhiteElo", "BlackElo"], axis=1)
print(df.head())

   ECO  Result  BlunderDifferential  MistakeDifferential  \
0  B09     0.0                    0                    2   
1  C33     1.0                   -1                    0   
2  B12     1.0                   -1                   -2   
3  B13     1.0                    0                    1   
4  C40     1.0                    0                   -3   

   InaccuracyDifferential  TimeDifferential  TimeControl  Moves  \
0                       1               -26          180     36   
1                      -2                33          600     48   
2                       0               495         1800     48   
3                       2               531         1797     90   
4                      -3              -121          600     48   

   EloDifferential  AverageElo  KingSafetyDifferential  MobilityDifferential  \
0               97      2295.5                      -2                    10   
1             -128      1669.0                       0                     3   
2              406      1694.0                      -1                    11   
3              342      1855.0                       0                     1   
4              441      1299.5                       3                     7   

   MaterialDifferential  DevelopmentDifferential  
0                     1                       -1  
1                     3                       -2  
2                    -3                        1  
3                     0                        1  
4                     4                        0

# Bucket each error differential into fixed-width intervals; values outside the
# outermost bin edges become NaN.
df["Inaccuracy Differential Group"] = pd.cut(df['InaccuracyDifferential'], bins=[-12,-8,-4,0,4,8,12], precision=0)
df["Mistake Differential Group"] = pd.cut(df['MistakeDifferential'], bins=[-9,-6,-3,0,3,6,9], precision=0)
df["Blunder Differential Group"] = pd.cut(df['BlunderDifferential'], bins=[-6,-4,-2,0,2,4,6], precision=0)

# Drop na values caused by values that don't fit the bins
# (the sorted intervals are used as tick labels below)
inaccuracyGroups = df["Inaccuracy Differential Group"].dropna().unique().sort_values()
mistakeGroups = df["Mistake Differential Group"].dropna().unique().sort_values()
blunderGroups = df["Blunder Differential Group"].dropna().unique().sort_values()

# One bar chart per error type, stacked vertically
fig, axes = plt.subplots(3, 1, figsize=(10, 20))


# Mean result for white per inaccuracy-differential bucket
ax1 = axes[0]
x = range(len(inaccuracyGroups))
y = df.groupby("Inaccuracy Differential Group", observed=True)["Result"].mean()
ax1.bar(x,y)
ax1.set_xticks(x,inaccuracyGroups)
ax1.set_xlabel('Inaccuracy Differential Group')
ax1.set_ylabel('Mean Result')

# Mean result for white per mistake-differential bucket
ax2 = axes[1]
x = range(len(mistakeGroups))
y = df.groupby("Mistake Differential Group", observed=True)["Result"].mean()
ax2.bar(x,y)
ax2.set_xticks(x,mistakeGroups)
ax2.set_xlabel('Mistake Differential Group')
ax2.set_ylabel('Mean Result')

# Mean result for white per blunder-differential bucket
ax3 = axes[2]
x = range(len(blunderGroups))
y = df.groupby("Blunder Differential Group", observed=True)["Result"].mean()
ax3.bar(x,y)
ax3.set_xticks(x,blunderGroups)
ax3.set_xlabel('Blunder Differential Group')
ax3.set_ylabel('Mean Result')

Text(0, 0.5, 'Mean Result')

# Bucket the rating gap into 50-point intervals between -150 and 150
bins = [-150,-100,-50,0,50,100,150]
df["EloGroup"] = pd.cut(df['EloDifferential'], bins=bins, precision=0)

# Drop na values caused by values that don't fit the bins
groups = df["EloGroup"].dropna().unique().sort_values()

# Bar chart of white's mean result per rating-gap bucket
x = range(len(groups))
y = df.groupby("EloGroup", observed=True)["Result"].mean()
plt.figure(figsize=(10,6))
plt.bar(x, y)

plt.xticks(x, groups)
plt.xlabel('Elo Differential Group')
plt.ylabel('Mean Result')

Text(0, 0.5, 'Mean Result')

# Bucket the final clock difference into 30-second intervals between -150 and 150
bins = [-150,-120,-90,-60,-30,0,30,60,90,120,150]
df["TimeGroup"] = pd.cut(df['TimeDifferential'],bins=bins, precision=0)

# Drop na values caused by values that don't fit the bins
groups = df["TimeGroup"].dropna().unique().sort_values()

# Bar chart of white's mean result per clock-difference bucket
x = range(len(groups))
y = df.groupby("TimeGroup", observed=True)["Result"].mean()
plt.figure(figsize=(12,6))
plt.bar(x, y)
plt.xticks(x, groups)
plt.xlabel('Time Differential Group')
plt.ylabel('Mean Result')

Text(0, 0.5, 'Mean Result')

# Keep only the openings that account for more than 1% of the games in the dataset.
# We want frequently played openings because unusual openings have a large variance to their win percentage.
value_counts = df["ECO"].value_counts()

filtered_values = value_counts[value_counts > len(df.index)*.01].index
filtered_df = df[df['ECO'].isin(filtered_values)]

# Mean result for white per opening; groupby returns the openings sorted by ECO code
y = filtered_df.groupby("ECO", observed=True)["Result"].mean()

# Take the tick labels from the grouped result itself so that every bar is
# labelled with its own opening. unique() returns the codes in order of first
# appearance, which does not match the sorted order of the bar heights.
groups = y.index.to_numpy()

x = range(len(groups))
plt.figure(figsize=(16,6))
plt.bar(x, y)
plt.xticks(x, groups)
plt.xlabel('Openings')
plt.ylabel('Mean Result')

Text(0, 0.5, 'Mean Result')

# Bucket the development differential into width-2 intervals between -6 and 6
bins = [-6,-4,-2,0,2,4,6]
df["DevelopmentGroup"] = pd.cut(df['DevelopmentDifferential'],bins=bins, precision=0)

# Drop na values caused by values that don't fit the bins
groups = df["DevelopmentGroup"].dropna().unique().sort_values()

# Bar chart of white's mean result per development bucket
x = range(len(groups))
y = df.groupby("DevelopmentGroup", observed=True)["Result"].mean()
plt.figure(figsize=(12,6))
plt.bar(x, y)
plt.xticks(x, groups)
plt.xlabel('Development Differential Group')
plt.ylabel('Mean Result')

Text(0, 0.5, 'Mean Result')

# Bucket the king-safety differential into width-2 intervals between -4 and 4
bins = [-4,-2,0,2,4]
df["KingSafetyGroup"] = pd.cut(df['KingSafetyDifferential'],bins=bins, precision=0)

# Drop na values caused by values that don't fit the bins
groups = df["KingSafetyGroup"].dropna().unique().sort_values()

# Bar chart of white's mean result per king-safety bucket
x = range(len(groups))
y = df.groupby("KingSafetyGroup", observed=True)["Result"].mean()
plt.figure(figsize=(12,6))
plt.bar(x, y)
plt.xticks(x, groups)
plt.xlabel('King Safety Differential Group')
plt.ylabel('Mean Result')

Text(0, 0.5, 'Mean Result')

# Bucket the mobility differential into width-4 intervals between -12 and 12
bins = [-12,-8,-4,0,4,8,12]
df["MobilityGroup"] = pd.cut(df['MobilityDifferential'],bins=bins, precision=0)

# Drop na values caused by values that don't fit the bins
groups = df["MobilityGroup"].dropna().unique().sort_values()

# Bar chart of white's mean result per mobility bucket
x = range(len(groups))
y = df.groupby("MobilityGroup", observed=True)["Result"].mean()
plt.figure(figsize=(12,6))
plt.bar(x, y)
plt.xticks(x, groups)
plt.xlabel('Mobility Differential Group')
plt.ylabel('Mean Result')

Text(0, 0.5, 'Mean Result')

# Bucket the material differential into width-2 intervals between -4 and 4
bins = [-4,-2,0,2,4]
df["MaterialGroup"] = pd.cut(df['MaterialDifferential'],bins=bins, precision=0)

# Drop na values caused by values that don't fit the bins
groups = df["MaterialGroup"].dropna().unique().sort_values()

# Bar chart of white's mean result per material bucket
x = range(len(groups))
y = df.groupby("MaterialGroup", observed=True)["Result"].mean()
plt.figure(figsize=(12,6))
plt.bar(x, y)
plt.xticks(x, groups)
plt.xlabel('Material Differential Group')
plt.ylabel('Mean Result')

Text(0, 0.5, 'Mean Result')

# Minimum number of points passed to hexbin as mincnt for a cell to be drawn
mincnt = 300
# If using the small dataset, lower the threshold
if len(df.index) < 100000:
    mincnt = 10

# Filtering dataframe to remove outliers because outliers will expand the range of the graphs significantly and 
# make trends in the graph tougher to notice.
filtered_df = df[(abs(df["EloDifferential"]) < 200) & (abs(df["TimeDifferential"]) < 200)]
fig, axes = plt.subplots(3, 1, figsize=(10, 20))

# (x column, y column) pairs, one per subplot
axesArr = [("EloDifferential","MistakeDifferential"),("EloDifferential","TimeDifferential"),("TimeDifferential","MistakeDifferential")]
for i, ax in enumerate(axes.flat):
    x, y = axesArr[i]
    # Decided to use a hexbin as there is a significant amount of data, so scatter plots wouldn't work well
    ax.hexbin(filtered_df[x],filtered_df[y],gridsize=25,mincnt=mincnt, bins="log")
    # Getting a line of best fit using linear regression (degree-1 polynomial: slope m, intercept b)
    [m,b] = np.polyfit(filtered_df[x],filtered_df[y],1)
    # Plotting this line
    ax.plot(filtered_df[x],m*filtered_df[x]+b, 'r')
    
    print(f"Slope for {x} vs {y}: {m}")
    
    # Sum of squared residuals of the fitted line
    predicted_y = np.polyval([m,b], filtered_df[x])
    residuals = filtered_df[y] - predicted_y
    SSR = np.sum(residuals ** 2)
    
    # Total sum of squares around the mean of y
    mean_y = filtered_df[y].mean()
    difference = filtered_df[y] - mean_y
    SST = np.sum(difference ** 2)
    
    # Coefficient of determination of the fit
    Rsquared = 1-(SSR/SST)
    print(f"R^2 value {Rsquared}")
    
    ax.set_xlabel(x)
    ax.set_ylabel(y)

plt.show()

Slope for EloDifferential vs MistakeDifferential: -0.0019607645692288327
R^2 value 0.0027103792832799956
Slope for EloDifferential vs TimeDifferential: 0.039300400488498095
R^2 value 0.0014882666837612302
Slope for TimeDifferential vs MistakeDifferential: -0.0008304778469412856
R^2 value 0.0005046014153655687

def convertWinToCategory(val):
    """Translate a numeric game result into its outcome label.

    1.0 maps to "Win", 0.0 to "Loss" and 0.5 to "Tie". Any other value
    yields an empty string.
    """
    outcome_labels = ((1.0, "Win"), (0.0, "Loss"), (0.5, "Tie"))
    for score, label in outcome_labels:
        if val == score:
            return label
    return ""

# Columns used for modelling; "ECO" is one-hot encoded below and "Result"
# is the prediction target.
feature_columns = [
    "MobilityDifferential",
    "MaterialDifferential",
    "KingSafetyDifferential",
    "DevelopmentDifferential",
    "AverageElo",
    "TimeControl",
    "Moves",
    "InaccuracyDifferential",
    "MistakeDifferential",
    "BlunderDifferential",
    "TimeDifferential",
    "EloDifferential",
    "ECO",
    "Result",
]
train_feat_df = df.loc[:, feature_columns]

# Relabel the numeric results as "Win" / "Loss" / "Tie" classes.
train_feat_df["Result"] = train_feat_df["Result"].map(convertWinToCategory)

# Number of games per ECO value.
value_counts = train_feat_df["ECO"].value_counts()

# Keep only the ECO values that account for more than 2% of the rows and
# drop every row whose ECO value is rarer than that.
min_count = len(train_feat_df.index) * .02
filtered_values = value_counts.loc[value_counts > min_count].index
train_feat_df = train_feat_df[train_feat_df['ECO'].isin(filtered_values)]

# One-hot encode the remaining ECO values and swap the indicator columns in
# for the original "ECO" column.
one_hot_enc_df = pd.get_dummies(train_feat_df["ECO"], prefix='ECO')
train_feat_df = pd.concat([train_feat_df.drop("ECO", axis=1), one_hot_enc_df], axis=1)

# Candidate classifiers to compare, each with its default hyperparameters.
# NOTE(review): the features are passed in unscaled; the captured output shows
# LogisticRegression hitting its iteration limit (ConvergenceWarning) —
# consider scaling or a higher max_iter.
classifiers = [
    RandomForestClassifier(),
    GradientBoostingClassifier(),
    AdaBoostClassifier(),
    DecisionTreeClassifier(),
    KNeighborsClassifier(),
    SVC(),
    GaussianNB(),
    LogisticRegression(),
    MLPClassifier()
]

# Separate the data into features (X) and target (y), then into training and
# testing sets. The split is shuffled with a fixed seed so it is reproducible.
# NOTE(review): train_size=.25 trains on 25% of the rows and tests on the
# other 75% — confirm this is intended rather than test_size=.25.
X = train_feat_df.drop("Result", axis=1)
y = train_feat_df["Result"]
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.25, random_state=42, shuffle=True)

# One {'classifier', 'accuracy', 'precision'} dictionary per fitted model.
results = []

# Fit every candidate on the same training split and score it on the test split.
for clf in classifiers:
    clf_name = type(clf).__name__

    clf.fit(X_train, y_train)
    # Predict from the DataFrame itself, not X_test.values: the models were
    # fitted with feature names, and passing a bare array makes scikit-learn
    # warn "X does not have valid feature names" for every classifier.
    y_pred = clf.predict(X_test)

    # Extract the accuracy and the support-weighted precision.
    accuracy = accuracy_score(y_test, y_pred)
    # zero_division=0 scores a never-predicted class as 0.0, which is what the
    # default does as well, but without emitting UndefinedMetricWarning.
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)

    results.append({'classifier': clf_name, 'accuracy': accuracy, 'precision': precision})

# Plot accuracy and precision for every classifier as grouped bars.
accuracies = [result['accuracy'] for result in results]
precisions = [result['precision'] for result in results]
# Kept in a separate name so the `classifiers` list of estimators is not
# overwritten with strings.
clf_names = [result['classifier'] for result in results]

# Give each metric its own half of the slot; drawing both series at the same
# x position would hide one bar behind the other.
positions = np.arange(len(clf_names))
bar_width = 0.4

fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(positions - bar_width / 2, accuracies, width=bar_width, label='Accuracy')
ax.bar(positions + bar_width / 2, precisions, width=bar_width, label='Precision')
ax.set_xticks(positions)
ax.set_xticklabels(clf_names, rotation=45)
ax.set_xlabel('Classifier')
ax.set_ylabel('Score')
ax.set_title('Classifier Performance')
ax.legend()
plt.tight_layout()
plt.show()

/Users/pranavshah/anaconda3/lib/python3.11/site-packages/sklearn/base.py:464: UserWarning: X does not have valid feature names, but RandomForestClassifier was fitted with feature names
  warnings.warn(
/Users/pranavshah/anaconda3/lib/python3.11/site-packages/sklearn/base.py:464: UserWarning: X does not have valid feature names, but GradientBoostingClassifier was fitted with feature names
  warnings.warn(
/Users/pranavshah/anaconda3/lib/python3.11/site-packages/sklearn/base.py:464: UserWarning: X does not have valid feature names, but AdaBoostClassifier was fitted with feature names
  warnings.warn(
/Users/pranavshah/anaconda3/lib/python3.11/site-packages/sklearn/base.py:464: UserWarning: X does not have valid feature names, but DecisionTreeClassifier was fitted with feature names
  warnings.warn(
/Users/pranavshah/anaconda3/lib/python3.11/site-packages/sklearn/base.py:464: UserWarning: X does not have valid feature names, but KNeighborsClassifier was fitted with feature names
  warnings.warn(
/Users/pranavshah/anaconda3/lib/python3.11/site-packages/sklearn/base.py:464: UserWarning: X does not have valid feature names, but SVC was fitted with feature names
  warnings.warn(
/Users/pranavshah/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/Users/pranavshah/anaconda3/lib/python3.11/site-packages/sklearn/base.py:464: UserWarning: X does not have valid feature names, but GaussianNB was fitted with feature names
  warnings.warn(
/Users/pranavshah/anaconda3/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py:460: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
/Users/pranavshah/anaconda3/lib/python3.11/site-packages/sklearn/base.py:464: UserWarning: X does not have valid feature names, but LogisticRegression was fitted with feature names
  warnings.warn(
/Users/pranavshah/anaconda3/lib/python3.11/site-packages/sklearn/base.py:464: UserWarning: X does not have valid feature names, but MLPClassifier was fitted with feature names
  warnings.warn(

# Estimator to tune; the seed is fixed so repeated runs are comparable.
gb_classifier = GradientBoostingClassifier(random_state=42)

# Hyperparameter values to try: 3 x 3 x 3 = 27 combinations.
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
}

# Exhaustive search over the grid with 5-fold cross-validation, using all
# available cores.
grid_search = GridSearchCV(
    estimator=gb_classifier,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=True,
)
grid_search.fit(X_train, y_train)

# Report the best estimator found by the search.
best_model = grid_search.best_estimator_
print("Best Model:", best_model, sep="\n")

Fitting 5 folds for each of 27 candidates, totalling 135 fits

/Users/pranavshah/anaconda3/lib/python3.11/site-packages/pandas/core/arrays/masked.py:60: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.5' currently installed).
  from pandas.core import (
/Users/pranavshah/anaconda3/lib/python3.11/site-packages/pandas/core/arrays/masked.py:60: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.5' currently installed).
  from pandas.core import (
/Users/pranavshah/anaconda3/lib/python3.11/site-packages/pandas/core/arrays/masked.py:60: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.5' currently installed).
  from pandas.core import (
/Users/pranavshah/anaconda3/lib/python3.11/site-packages/pandas/core/arrays/masked.py:60: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.5' currently installed).
  from pandas.core import (
/Users/pranavshah/anaconda3/lib/python3.11/site-packages/pandas/core/arrays/masked.py:60: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.5' currently installed).
  from pandas.core import (
/Users/pranavshah/anaconda3/lib/python3.11/site-packages/pandas/core/arrays/masked.py:60: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.5' currently installed).
  from pandas.core import (
/Users/pranavshah/anaconda3/lib/python3.11/site-packages/pandas/core/arrays/masked.py:60: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.5' currently installed).
  from pandas.core import (
/Users/pranavshah/anaconda3/lib/python3.11/site-packages/pandas/core/arrays/masked.py:60: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.5' currently installed).
  from pandas.core import (
/Users/pranavshah/anaconda3/lib/python3.11/site-packages/pandas/core/arrays/masked.py:60: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.5' currently installed).
  from pandas.core import (
/Users/pranavshah/anaconda3/lib/python3.11/site-packages/pandas/core/arrays/masked.py:60: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.5' currently installed).
  from pandas.core import (

Best Model:
GradientBoostingClassifier(max_depth=7, random_state=42)

# best_model comes from GridSearchCV, which (refit=True by default) has
# already refit it on all of X_train, so it is not fitted again here.
y_pred_rand = best_model.predict(X_test)
# classification_report expects (y_true, y_pred). Passing them the other way
# round swaps precision with recall and reports the predicted class counts
# as the support.
print(classification_report(y_test, y_pred_rand))

              precision    recall  f1-score   support

        Loss       0.82      0.81      0.82    194960
         Tie       0.12      0.43      0.19      3313
         Win       0.84      0.82      0.83    208935

    accuracy                           0.81    407208
   macro avg       0.60      0.69      0.61    407208
weighted avg       0.83      0.81      0.82    407208

# Pair every feature column with the importance the tuned model assigned to it.
imp_feat_df = pd.DataFrame({
    'features': list(X.columns),
    'importance': best_model.feature_importances_,
})

# Rank from most to least important and renumber the rows from 0.
imp_feat_df = imp_feat_df.sort_values(by='importance', ascending=False).reset_index(drop=True)
imp_feat_df

# Horizontal bar chart of the importance of each feature.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(imp_feat_df['features'], imp_feat_df['importance'], color='darkblue')
ax.set_xlabel('Importance')
ax.set_ylabel('Features')
ax.set_title('Feature Importance for Making Predictions')

plt.show()

# Confusion matrix of the tuned model on the test split, with rows and
# columns in the model's own class order.
class_labels = best_model.classes_
cm = confusion_matrix(y_test, y_pred_rand, labels=class_labels)
disp = ConfusionMatrixDisplay(cm, display_labels=class_labels)
disp.plot()

plt.show()

# EloDifferential values within +/- tolerance are predicted as a tie by the
# Elo baseline.
tolerance = 10

# Baseline 1: predict from the sign of EloDifferential alone.
bench1 = X_test['EloDifferential'].apply(lambda x: "Win" if x > tolerance else ("Loss" if x < -tolerance else "Tie"))

# Baseline 2: uniformly random guess among the three outcomes.
bench2 = np.random.choice(["Loss", "Tie", "Win"], size=len(X_test))

# Baseline 3: Gradient Boosting with default hyperparameters. This is fitted
# explicitly because the y_pred left over from the model comparison loop
# belongs to whichever classifier ran last, not to Gradient Boosting.
bench3 = GradientBoostingClassifier(random_state=42).fit(X_train, y_train).predict(X_test)

# Baseline 4: the tuned Gradient Boosting model.
bench4 = y_pred_rand

ground_truth = y_test


# Collect every set of predictions next to the true outcome.
data = {'Greater ELO Wins': bench1, 'Random Guess': bench2, 'predictions (no hyperparameter)': bench3, 'predictions (hyperparameter)': bench4,'ground_truth': ground_truth}
test_df = pd.DataFrame(data)

accuracy_bench1 = accuracy_score(test_df['ground_truth'], test_df['Greater ELO Wins'])
accuracy_bench2 = accuracy_score(test_df['ground_truth'], test_df['Random Guess'])
accuracy_bench3 = accuracy_score(test_df['ground_truth'], test_df['predictions (no hyperparameter)'])
accuracy_bench4 = accuracy_score(test_df['ground_truth'], test_df['predictions (hyperparameter)'])


print(f"Accuracy of Greater ELO Wins Benchmark: {accuracy_bench1*100:.2f}%")
print(f"Accuracy of Randomly Guessing Winner: {accuracy_bench2*100:.2f}%")
print(f"Accuracy of Gradient Boosting Predictions (w/o hyperparameter): {accuracy_bench3*100:.2f}%")
print(f"Accuracy of Gradient Boosting Predictions (w/ hyperparameter): {accuracy_bench4*100:.2f}%")

Accuracy of Greater ELO Wins Benchmark: 42.68%
Accuracy of Randomly Guessing Winner: 33.41%
Accuracy of Gradient Boosting Predictions (w/o hyperparameter): 75.04%
Accuracy of Gradient Boosting Predictions (w/ hyperparameter): 81.22%

	features	importance
0	BlunderDifferential	0.379436
1	MistakeDifferential	0.233109
2	TimeDifferential	0.098432
3	TimeControl	0.077960
4	Moves	0.070639
5	InaccuracyDifferential	0.041791
6	AverageElo	0.036350
7	EloDifferential	0.022973
8	MobilityDifferential	0.015647
9	MaterialDifferential	0.006348
10	DevelopmentDifferential	0.004517
11	KingSafetyDifferential	0.004035
12	ECO_C41	0.001192
13	ECO_A00	0.001131
14	ECO_B10	0.001057
15	ECO_B01	0.000963
16	ECO_C20	0.000834
17	ECO_D00	0.000587
18	ECO_B00	0.000571
19	ECO_A40	0.000550
20	ECO_D02	0.000489
21	ECO_C50	0.000477
22	ECO_C44	0.000458
23	ECO_C00	0.000455

Mastering the Chessboard: A Data-Driven Approach¶

Mohammad Durrani and Pranav Shah¶

Contents¶

1. Introduction¶

Required Libraries and Imports¶

Import Libraries¶

Background / Further Reading: Chess¶

2. About The Data¶

3. Data Collection: Scraping¶

4. Data Processing: Cleaning¶

5. Exploratory Data Analysis¶

6. Machine Learning¶

Preliminary Reading: Classification and Categorical Encoding¶

Preparing Data For Training¶

Encoding Openings¶

Choosing A Model¶

Hyperparameter Optimization¶

Finding the most impactful features¶

Benchmark¶

7. Insights, Future Work, and Considerations¶

8. References and Additional Resources¶