class HypothesisTestResult(TypedDict):
    """Result record returned by :func:`hypothesis_test`."""

    # number of Bernoulli trials actually fed into the binomial test
    trials: int
    # two-sided p-value of the binomial test
    pvalue: float
    # observed success rate (score / trials); NaN when there are no trials
    statistic: float


def hypothesis_test(wins: int, draws: int, losses: int) -> HypothesisTestResult:
    """
    Two-sided binomial hypothesis test on the outcome of an engine match.

    Null Hypothesis: Both engines have the same strength, i.e. each scores
    half of the points on average.
    Alternative Hypothesis: The engines have different strength.

    A win counts as one point and a draw as half a point. Because the binomial
    test needs an integer number of successes, an unpaired draw (when the
    number of draws is odd) is discarded from both the score and the trials.

    :param wins: number of games won by the first engine
    :param draws: number of drawn games
    :param losses: number of games lost by the first engine
    :returns: dict with the keys ``trials``, ``pvalue`` and ``statistic``
    :raises ValueError: if any of the counts is negative
    """
    if min(wins, draws, losses) < 0:
        raise ValueError("wins, draws and losses must be non-negative")

    # wins give 1 point, and every pair of draws gives 1 point (1/2 each)
    score = wins + draws // 2

    # number of games
    trials = wins + draws + losses

    # score was rounded down above, so if the number of draws is odd the
    # left-over draw must not be counted as a trial either
    if draws % 2 != 0:
        trials -= 1

    if trials == 0:
        # binomtest requires at least one trial; without data there is no
        # evidence against the null hypothesis, so report a neutral result
        return {
            "trials": 0,
            "pvalue": 1.0,
            "statistic": float("nan"),
        }

    # if both engines have the same strength, each "wins" 50% on average
    expected_success_rate = 0.5

    result = binomtest(score, trials, expected_success_rate, alternative='two-sided')

    return {
        "trials": trials,
        "pvalue": result.pvalue,
        "statistic": result.statistic,
    }
({a_wins / games_played:.2%})") + print(f"Engine {b.get_name()} won {b_wins} games ({b_wins / games_played:.2%})") + print(f"{draws} games ({draws / games_played:.2%}) resulted in a draw") + print(f"Hypothesis test: trials={test_result['trials']}, pvalue={test_result['pvalue']:2.10f}, statistic={test_result['statistic']:2.4f}, reject_h0={reject_h0}") def read_arguments(): @@ -118,7 +125,8 @@ def read_arguments(): description='Compare two engines by playing multiple games against each other' ) - engines = {"ClassicMCTS": engine.ClassicMctsEngine, "BayesianMCTS": engine.BayesMctsEngine, "Random": engine.RandomEngine} + engines = {"ClassicMCTS": engine.ClassicMctsEngine, "BayesianMCTS": engine.BayesMctsEngine, + "Random": engine.RandomEngine} strategies = {"Random": RandomStrategy, "Stockfish": StockFishStrategy} if os.name == 'nt':