Home > Blockchain >  save a b-tree file to be read later on using python
save a b-tree file to be read later on using python

Time:11-28

i have an algorithym that creates a B-tree using python, what I need is a way to save this B-tree in the disc and be able to read it later on with python, what is the best way of doing it?

CodePudding user response:

One approach to write a tree to file would be to come up with a record structure (text-based or binary) and iterate over all the nodes in the tree, starting with the leaves and going up to the root. As you write each node, you write the value and its neighbours to the file and you note the position that node was written to on the node. When writing its neighbours, you write the address in the file instead.

When loading the file, you start with the root record, set its value, get the address of the left neighbour, get that record, set its value, etc. and the same for the right neighbour.

I won't write and test all the code for the existing code you linked, but here's a simple example that shows the basic idea:

from random import randint
from dataclasses import dataclass
from typing import BinaryIO, ClassVar, Union

# A simple unbalanced tree node with an add method
@dataclass
class Node:
    # write to file as 2-byte integers
    int_size: ClassVar[int] = 2
    value: int = None
    address: int = None
    left: Union['Node', int] = None
    right: Union['Node', int] = None

    def add(self, value):
        p = self
        while True:
            if value > p.value:
                if p.right is None:
                    p.right = Node(value=value)
                    break
                else:
                    p = p.right
            elif value < p.value:
                if p.left is None:
                    p.left = Node(value=value)
                    break
                else:
                    p = p.left
            else:
                # value already in tree
                break

    @classmethod
    def _write_int(cls, f: BinaryIO, i: int):
        f.write(i.to_bytes(length=cls.int_size, byteorder='big'))

    @classmethod
    def _write_node(cls, f: BinaryIO, node: 'Node'):
        f.write(node.value.to_bytes(length=cls.int_size, byteorder='big'))
        f.write((node.left.address if node.left is not None else 0).to_bytes(length=cls.int_size, byteorder='big'))
        f.write((node.right.address if node.right is not None else 0).to_bytes(length=cls.int_size, byteorder='big'))

    @classmethod
    def _read_int(cls, f: BinaryIO):
        return int.from_bytes(f.read(cls.int_size), byteorder='big')

    @classmethod
    def _read_node(cls, f: BinaryIO, node: 'Node'):
        node.value = int.from_bytes(f.read(cls.int_size), byteorder='big')
        node.left = int.from_bytes(f.read(cls.int_size), byteorder='big')
        node.right = int.from_bytes(f.read(cls.int_size), byteorder='big')

    def save(self, f: BinaryIO):
        # seek the start of the file and write a dummy root record
        f.seek(0)
        self._write_int(f, self.value)
        self._write_int(f, 0)
        self._write_int(f, 0)

        stack = []
        p = self
        while True:
            if p.left is not None and p.left.address is None:
                stack.append(p)
                p = p.left
            elif p.right is not None and p.right.address is None:
                stack.append(p)
                p = p.right
            else:
                p.address = f.tell()
                self._write_node(f, p)
                if stack:
                    p = stack.pop()
                else:
                    break

        # rewrite root node with correct location of left and right nodes
        f.seek(0)
        self._write_node(f, self)

    def load(self, f: BinaryIO):
        f.seek(0)
        self.address = 0

        def load_side(side: str):
            # this function avoids repeating this code twice for .left and .right,
            # working around Python not having pointers, otherwise you'd just point at the attributes
            nonlocal f, p, stack
            if not isinstance(getattr(p, side), int):
                return False
            if not getattr(p, side):
                setattr(p, side, None)
            else:
                f.seek(getattr(p, side))
                setattr(p, side, Node())
                stack.append(p)
                p = getattr(p, side)
                self._read_node(f, p)
            return True

        stack = []
        p = self
        self._read_node(f, p)
        while True:
            if not load_side('left') and not load_side('right'):
                if stack:
                    p = stack.pop()
                else:
                    break

    def print(self):
        stack = []
        p = self
        left_done = False
        right_done = False
        while True:
            if not left_done and p.left is not None:
                stack.append((p, True, False))
                p, left_done, right_done = p.left, False, False
            elif not right_done and p.right is not None:
                stack.append((p, True, True))
                p, left_done, right_done = p.right, False, False
            else:
                print(f'value: {p.value}, '
                      f'left: {p.left.value if p.left else "none"}, '
                      f'right: {p.right.value if p.right else "none"}')
                if stack:
                    p, left_done, right_done = stack.pop()
                else:
                    break


values = [randint(0, 100) for __ in range(10)]
root = Node(values[0])
for v in values[1:]:
    root.add(v)
print('\nTree after construction:')
root.print()

# save the tree
with open('tree.bin', 'wb') as f:
    root.save(f)

# start a new and empty tree
root = Node()
# read the tree
with open('tree.bin', 'rb') as f:
    root.load(f)
print('\nResult after load:')
root.print()

This writes the tree as a 2-byte unsigned integer binary file (changing that size is trivial) with the references to left and right children represented as the file position of the record on file.

The example avoids recursive calls, as a serious unbalanced b-tree could quickly approach maximum recursion depth.

Example result (the print function ends up printing the root at the bottom, a nicer print is certainly possible, but this is just to demonstrate it works):

Tree after construction:
value: 15, left: none, right: none
value: 18, left: 15, right: none
value: 30, left: 18, right: none
value: 41, left: 30, right: none
value: 11, left: none, right: 41
value: 54, left: none, right: none
value: 74, left: 54, right: none
value: 81, left: none, right: none
value: 78, left: 74, right: 81
value: 45, left: 11, right: 78

Result after load:
value: 15, left: none, right: none
value: 18, left: 15, right: none
value: 30, left: 18, right: none
value: 41, left: 30, right: none
value: 11, left: none, right: 41
value: 54, left: none, right: none
value: 74, left: 54, right: none
value: 81, left: none, right: none
value: 78, left: 74, right: 81
value: 45, left: 11, right: 78

Note that quite a bit of code is required even for a basic example. And in this particular example, it would probably make a lot more sense to just store the values in a file and reconstruct the tree the same way it was constructed originally, as the tree's structure is entirely deterministic.

But there are cases where you might not want to do that - for example, if you add functionality that allows rewriting of the value of a node while on file, the reconstruction would result in a different (though still correct) tree.

Also note that the code provided above has a flaw: if you try to save the same tree twice, it will fail as it uses the address to determine if a node has already been written - it's left as an exercise to the reader to fix that. Either you could keep track separately, or you could reset the addresses for the entire tree before saving.

  • Related