i have an algorithym that creates a B-tree using python, what I need is a way to save this B-tree in the disc and be able to read it later on with python, what is the best way of doing it?
CodePudding user response:
One approach to write a tree to file would be to come up with a record structure (text-based or binary) and iterate over all the nodes in the tree, starting with the leaves and going up to the root. As you write each node, you write the value and its neighbours to the file and you note the position that node was written to on the node. When writing its neighbours, you write the address in the file instead.
When loading the file, you start with the root record, set its value, get the address of the left neighbour, get that record, set its value, etc. and the same for the right neighbour.
I won't write and test all the code for the existing code you linked, but here's a simple example that shows the basic idea:
from random import randint
from dataclasses import dataclass
from typing import BinaryIO, ClassVar, Union
# A simple unbalanced tree node with an add method
@dataclass
class Node:
# write to file as 2-byte integers
int_size: ClassVar[int] = 2
value: int = None
address: int = None
left: Union['Node', int] = None
right: Union['Node', int] = None
def add(self, value):
p = self
while True:
if value > p.value:
if p.right is None:
p.right = Node(value=value)
break
else:
p = p.right
elif value < p.value:
if p.left is None:
p.left = Node(value=value)
break
else:
p = p.left
else:
# value already in tree
break
@classmethod
def _write_int(cls, f: BinaryIO, i: int):
f.write(i.to_bytes(length=cls.int_size, byteorder='big'))
@classmethod
def _write_node(cls, f: BinaryIO, node: 'Node'):
f.write(node.value.to_bytes(length=cls.int_size, byteorder='big'))
f.write((node.left.address if node.left is not None else 0).to_bytes(length=cls.int_size, byteorder='big'))
f.write((node.right.address if node.right is not None else 0).to_bytes(length=cls.int_size, byteorder='big'))
@classmethod
def _read_int(cls, f: BinaryIO):
return int.from_bytes(f.read(cls.int_size), byteorder='big')
@classmethod
def _read_node(cls, f: BinaryIO, node: 'Node'):
node.value = int.from_bytes(f.read(cls.int_size), byteorder='big')
node.left = int.from_bytes(f.read(cls.int_size), byteorder='big')
node.right = int.from_bytes(f.read(cls.int_size), byteorder='big')
def save(self, f: BinaryIO):
# seek the start of the file and write a dummy root record
f.seek(0)
self._write_int(f, self.value)
self._write_int(f, 0)
self._write_int(f, 0)
stack = []
p = self
while True:
if p.left is not None and p.left.address is None:
stack.append(p)
p = p.left
elif p.right is not None and p.right.address is None:
stack.append(p)
p = p.right
else:
p.address = f.tell()
self._write_node(f, p)
if stack:
p = stack.pop()
else:
break
# rewrite root node with correct location of left and right nodes
f.seek(0)
self._write_node(f, self)
def load(self, f: BinaryIO):
f.seek(0)
self.address = 0
def load_side(side: str):
# this function avoids repeating this code twice for .left and .right,
# working around Python not having pointers, otherwise you'd just point at the attributes
nonlocal f, p, stack
if not isinstance(getattr(p, side), int):
return False
if not getattr(p, side):
setattr(p, side, None)
else:
f.seek(getattr(p, side))
setattr(p, side, Node())
stack.append(p)
p = getattr(p, side)
self._read_node(f, p)
return True
stack = []
p = self
self._read_node(f, p)
while True:
if not load_side('left') and not load_side('right'):
if stack:
p = stack.pop()
else:
break
def print(self):
stack = []
p = self
left_done = False
right_done = False
while True:
if not left_done and p.left is not None:
stack.append((p, True, False))
p, left_done, right_done = p.left, False, False
elif not right_done and p.right is not None:
stack.append((p, True, True))
p, left_done, right_done = p.right, False, False
else:
print(f'value: {p.value}, '
f'left: {p.left.value if p.left else "none"}, '
f'right: {p.right.value if p.right else "none"}')
if stack:
p, left_done, right_done = stack.pop()
else:
break
values = [randint(0, 100) for __ in range(10)]
root = Node(values[0])
for v in values[1:]:
root.add(v)
print('\nTree after construction:')
root.print()
# save the tree
with open('tree.bin', 'wb') as f:
root.save(f)
# start a new and empty tree
root = Node()
# read the tree
with open('tree.bin', 'rb') as f:
root.load(f)
print('\nResult after load:')
root.print()
This writes the tree as a 2-byte unsigned integer binary file (changing that size is trivial) with the references to left and right children represented as the file position of the record on file.
The example avoids recursive calls, as a serious unbalanced b-tree could quickly approach maximum recursion depth.
Example result (the print function ends up printing the root at the bottom, a nicer print is certainly possible, but this is just to demonstrate it works):
Tree after construction:
value: 15, left: none, right: none
value: 18, left: 15, right: none
value: 30, left: 18, right: none
value: 41, left: 30, right: none
value: 11, left: none, right: 41
value: 54, left: none, right: none
value: 74, left: 54, right: none
value: 81, left: none, right: none
value: 78, left: 74, right: 81
value: 45, left: 11, right: 78
Result after load:
value: 15, left: none, right: none
value: 18, left: 15, right: none
value: 30, left: 18, right: none
value: 41, left: 30, right: none
value: 11, left: none, right: 41
value: 54, left: none, right: none
value: 74, left: 54, right: none
value: 81, left: none, right: none
value: 78, left: 74, right: 81
value: 45, left: 11, right: 78
Note that quite a bit of code is required even for a basic example. And in this particular example, it would probably make a lot more sense to just store the values in a file and reconstruct the tree the same way it was constructed originally, as the tree's structure is entirely deterministic.
But there are cases where you might not want to do that - for example, if you add functionality that allows rewriting of the value of a node while on file, the reconstruction would result in a different (though still correct) tree.
Also note that the code provided above has a flaw: if you try to save the same tree twice, it will fail as it uses the address to determine if a node has already been written - it's left as an exercise to the reader to fix that. Either you could keep track separately, or you could reset the addresses for the entire tree before saving.