For my chess engine I use statistics to choose good moves. I collected them from millions of games. I'm interested in the current move, the next move, and how many times the next move was played given the current move.
Using a Python dictionary and storing it with pickle, the file is too large and hard to update with new games, so I decided to use SQLite.
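For reference, the in-memory structure was essentially a nested counter keyed by position; a rough sketch (illustrative names, not my exact code):
import pickle
from collections import Counter, defaultdict

# moves[current][next] = how many times `next` was played after `current`
moves = defaultdict(Counter)
start = "rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w"
after_e4 = "rnbqkbnr/pppppppp/8/8/4P3/8/PPPPPPPP/RNBQKBNR b"
moves[start][after_e4] += 1

with open("moves.pickle", "wb") as f:
    pickle.dump({k: dict(v) for k, v in moves.items()}, f)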
I created a class MovesDatabase:
class MovesDatabase:
    def __init__(self, work_dir):
        self.con = sqlite3.connect(os.path.join(work_dir, "moves.db"))
        self.con.execute('PRAGMA temp_store = MEMORY')
        self.con.execute('PRAGMA synchronous = NORMAL')
        self.con.execute('PRAGMA journal_mode = WAL')
        self.cur = self.con.cursor()
        self.cur.execute("CREATE TABLE IF NOT EXISTS moves("
                         "move TEXT,"
                         "next TEXT,"
                         "count INTEGER DEFAULT 1);")
move and next represent the state of a chess board in string format (FEN). Examples:
- rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR
- r1b1k1nr/p2p1pNp/n2B4/1p1NP2P/6P1/3P1Q2/P1P1K3/q5b1
- 8/8/8/4p1K1/2k1P3/8/8/8 b
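These strings are just the first two fields of the full FEN (piece placement and side to move); the castling rights, en passant square and move counters are dropped. For example, with python-chess (the library used below):
import chess

board = chess.Board()
board.push_san("e4")
fen = board.fen()  # 'rnbqkbnr/pppppppp/8/8/4P3/8/PPPPPPPP/RNBQKBNR b KQkq e3 0 1'
parts = fen.split(' ')
print(parts[0] + ' ' + parts[1])  # 'rnbqkbnr/pppppppp/8/8/4P3/8/PPPPPPPP/RNBQKBNR b'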
The method below is responsible for taking a games file, extracting the moves, and inserting the couple (move, next) if it is new, or updating its count if (move, next) already exists in the database:
    def insert_moves_from_file(self, file: str):
        print("Extracting moves to database from " + file)
        count = 0
        with open(file) as games_file:
            game = chess.pgn.read_game(games_file)
            while game is not None:
                batch = []
                board = game.board()
                state_one = board.fen().split(' ')[0] + ' ' + board.fen().split(' ')[1]
                for move in game.mainline_moves():
                    board.push(move)
                    fen = board.fen().split(' ')
                    state_two = fen[0] + ' ' + fen[1]
                    res = self.cur.execute("SELECT * FROM moves WHERE move=? AND next=?",
                                           (state_one, state_two))
                    res = res.fetchall()
                    if len(res) != 0:
                        self.cur.execute("UPDATE moves SET count=count+1 WHERE move=? AND next=?",
                                         (state_one, state_two))
                    else:
                        batch.append((state_one, state_two))
                    state_one = state_two
                self.cur.executemany("INSERT INTO moves(move, next) VALUES"
                                     "(?, ?)", batch)
                count += 1
                print('\r' + "%d games added to the database.." % count, end='')
                game = chess.pgn.read_game(games_file)
        self.con.commit()
        print("\n Finished!")
The couple (move, next) is unique.
I tested with a file containing approximately 4 million (move, next) pairs. It starts out inserting/updating about 3,000 rows/s, but after 50K rows it slows to 100 rows/s and keeps dropping. I designed this method to process multiple game files; that's why I chose an SQL database in the first place.
CodePudding user response:
It's not INSERTing that's slow here. Your move and next columns aren't indexed, so any SELECT or UPDATE involving those columns requires a full table scan.
If (move, next) is always unique, you'll want to add a UNIQUE index on that pair. It will also automagically make the queries that look up move/next pairs faster (but not necessarily those that query for only one of those two columns).
To create that index on your existing table:
CREATE UNIQUE INDEX ix_move_next ON moves (move, next);
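If you want to convince yourself the index is being used, EXPLAIN QUERY PLAN shows whether SQLite scans the table or searches the index. A quick sketch (the positions and the exact plan wording are illustrative; the output format varies by SQLite version):
import sqlite3

con = sqlite3.connect("moves.db")
# Without the index this typically reports something like 'SCAN moves';
# with ix_move_next in place it becomes 'SEARCH moves USING INDEX ix_move_next (move=? AND next=?)'.
for row in con.execute(
        "EXPLAIN QUERY PLAN SELECT * FROM moves WHERE move=? AND next=?",
        ("rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w",
         "rnbqkbnr/pppppppp/8/8/4P3/8/PPPPPPPP/RNBQKBNR b")):
    print(row)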
Finally, once you have that index in place, you can get rid of the whole SELECT/UPDATE thing too with an upsert:
INSERT INTO moves (move, next) VALUES (?, ?) ON CONFLICT (move, next) DO UPDATE SET count = count + 1;
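(One caveat: the ON CONFLICT ... DO UPDATE upsert syntax needs SQLite 3.24.0 or newer. If you're not sure which library version your Python is linked against, a quick check:)
import sqlite3
print(sqlite3.sqlite_version)  # version of the underlying SQLite library; upserts need >= 3.24.0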
Here's a slight refactoring that achieves about 6200 moves/second inserted on my machine. (It requires the tqdm library for a nice progress bar, and a pgns/ directory with PGN files.)
import glob
import sqlite3

import chess.pgn
import tqdm
from chess import WHITE


def board_to_state(board):
    # These were extracted from the implementation of `board.fen()`
    # so as to avoid doing extra work we don't need.
    bfen = board.board_fen(promoted=False)
    turn = ("w" if board.turn == WHITE else "b")
    return f'{bfen} {turn}'


def insert_game(cur, game):
    batch = []
    board = game.board()
    state_one = board_to_state(board)
    for move in game.mainline_moves():
        board.push(move)
        state_two = board_to_state(board)
        batch.append((state_one, state_two))
        state_one = state_two
    cur.executemany("INSERT INTO moves (move, next) VALUES (?, ?) ON CONFLICT (move, next) DO UPDATE SET count = count + 1", batch)
    n_moves = len(batch)
    return n_moves


def main():
    con = sqlite3.connect("moves.db")
    con.execute('PRAGMA temp_store = MEMORY')
    con.execute('PRAGMA synchronous = NORMAL')
    con.execute('PRAGMA journal_mode = WAL')
    con.execute('CREATE TABLE IF NOT EXISTS moves(move TEXT,next TEXT,count INTEGER DEFAULT 1);')
    con.execute('CREATE UNIQUE INDEX IF NOT EXISTS ix_move_next ON moves (move, next);')
    cur = con.cursor()
    for pgn_file in sorted(glob.glob("pgns/*.pgn")):
        with open(pgn_file) as games_file:
            n_games = 0
            with tqdm.tqdm(desc=pgn_file, unit="moves") as pbar:
                while (game := chess.pgn.read_game(games_file)):
                    n_moves = insert_game(cur, game)
                    n_games += 1
                    pbar.set_description(f"{pgn_file} ({n_games} games)", refresh=False)
                    pbar.update(n_moves)
            con.commit()


if __name__ == '__main__':
    main()