Heap Dijkstra Implementation is slower than Naive Dijsktra Implementation-CodePudding

I have tried to implement the Naive and Heap Dijkstra as shown below but somehow my naive Dijkstra implementation is surprisingly faster. I debugged my code but couldn't understand where my problem in my implementations are.

Why is my heap-based implementation is slower than the naive implementation?

Raw data is stored here: https://www.algorithmsilluminated.org/datasets/problem9.8.txt

Data Import and Manipulation:

import time
with open("DijkstraTest2.txt", 'r') as input:
    lines = input.readlines()

lengths = {}
vertices = []
for line in lines:
    contents = line.split("\t")
    vertices.append(contents[0])
    for content in contents:
        content = content.replace('\n', '')
        if ',' in content:
            edge = contents[0]   '-'   content.split(',')[0]
            lengths[edge] = int(content.split(',')[1])

Naive Dijkstra:

def NaiveDijkstra(vertices, start_point, lengths):
    X = [start_point]
    shortest_paths = {}
    for vertex in vertices:
        if vertex == start_point:
            shortest_paths[vertex] = 0
        else:
            shortest_paths[vertex] = 999999999999
    subset = [key for key in lengths.keys() if start_point == key.split('-')[0]
              and key.split('-')[0] in X and key.split('-')[1] not in X]
    while len(subset) > 0:
        temp_min_dict = {}
        for edge in subset:
            temp_min = shortest_paths[edge.split('-')[0]]   lengths[edge]
            temp_min_dict[edge] = temp_min
        new_edge = min(temp_min_dict, key=temp_min_dict.get)
        X.append(new_edge.split('-')[1])
        shortest_paths[new_edge.split('-')[1]] = shortest_paths[new_edge.split('-')[0]]   lengths[new_edge]
        subset = []
        for key in lengths.keys():
            if key.split('-')[0] in X and key.split('-')[1] not in X:
                subset.append(key)
    return shortest_paths

start_time = time.time()
print(NaiveDijkstra(vertices = vertices, start_point = '1', lengths = lengths)['197'])
print(time.time() - start_time, "seconds")

My Heap based Dijkstra code:

class Heap:
    def __init__(self):
        self.size = 0
        self.lst = []

    def swap(self, a):
        if self.size == 1:
            return self.lst
        else:
            if a == 1:
                i = 1
            else:
                i = a // 2
            while i > 0:
                if i * 2 - 1 >= self.size:
                    break
                elif self.lst[i - 1][1] > self.lst[i * 2 - 1][1]:
                    temp = self.lst[i - 1]
                    self.lst[i - 1] = self.lst[i * 2 - 1]
                    self.lst[i * 2 - 1] = temp
                elif i * 2 >= self.size:
                    break
                elif self.lst[i - 1][1] > self.lst[i * 2][1]:
                    temp = self.lst[i - 1]
                    self.lst[i - 1] = self.lst[i * 2]
                    self.lst[i * 2] = temp
                i -= 1
        # print(f"output: {self.lst}")

    def insert(self, element):
        # print(f"input: {self.lst}")
        self.lst.append(element)
        self.size  = 1
        self.swap(self.size)

    def extractmin(self):
        val = self.lst.pop(0)[0]
        self.size -= 1
        self.swap(self.size - 1)
        return val

    def delete(self, deleted):
        ix = self.lst.index(deleted)
        temp = self.lst[-1]
        self.lst[ix] = temp
        self.lst[-1] = deleted
        self.lst.pop(-1)
        self.size -= 1
        #self.swap(self.size)


def FastDijkstra(vertices, start_point, lengths):
    X = []
    h = Heap()
    width = {}
    shortest_paths = {}
    for vertex in vertices:
        if vertex == start_point:
            width[vertex] = 0
            h.insert((vertex, width[vertex]))
        else:
            width[vertex] = 999999999999
            h.insert((vertex, width[vertex]))
    while h.size > 0:
        w = h.extractmin()
        X.append(w)
        shortest_paths[w] = width[w]
        Y = set(vertices).difference(X)
        for x in X:
            for y in Y:
                key = f"{x}-{y}"
                if lengths.get(key) is not None:
                    h.delete((y, width[y]))
                    if width[y] > shortest_paths[x]   lengths[key]:
                        width[y] = shortest_paths[x]   lengths[key]
                    h.insert((y, width[y]))
    return shortest_paths


start_time = time.time()
print(FastDijkstra(vertices=vertices, start_point='1', lengths=lengths)['197'])
print(time.time() - start_time, "seconds")

CodePudding user response：

The way you implemented the heap version is not efficient. Notably the following make it inefficient:

All nodes are put on the heap instead of only the direct neighbors of the visited nodes. This makes the heap large and slower than needed.
Y = set(vertices).difference(X) is a slow operation, and makes Y unnecessary large.
The nested loop that tries every pair in the Cartesian product of X and Y to see if it is an edge. This point together with the previous should be replaced with a collection of edges starting from X, and then discarding edges that lead to already visited nodes.
For every found edge to delete the target node from the heap, and re-insert it, even if the width didn't change! Deletion is a costly operation (see next point). Only if the Heap implementation supports a decrease-key operation, this is an option, but otherwise the heap should just get an extra entry for the same vertex, knowing that the one with the lowest cost will come out of the heap first.
The heap's delete method has a bad time complexity due to the .index() call.
The heap's extractmin method has a bad time complexity, due to the .pop(0) call. This has O(n) time complexity.

The heap's extractmin does not give correct results (again due to that pop(0)). Here is a little script that shows a mistake:

h = Heap()
for value in 4, 3, 5, 2, 1:
    h.insert((value, value))
print(h.extractmin())  # 1 = OK
print(h.extractmin())  # 2 = OK
print(h.extractmin())  # 4 = NOK. 3 expected.

The data structure lengths does not allow to quickly find the edges from a particular vertex. But this is a point that is also making the naive implementation slow. I would however suggest to turn that in a dict.

If this is done right it should run faster. Certainly when you would make use of the native heapq module you'll get good running times.

Here is a (much) faster implementation. It doesn't bother about unreachable vertices, and doesn't bother about possibly having multiple entries on the heap for the same node (with different distances). But it does start with only the starting node on the heap, and uses heapq:

from heapq import heappush, heappop
from collections import defaultdict

def FastDijkstra(vertices, start_point, lengths):
    # Create a dictionary for the edges, keyed by source node
    edges = defaultdict(list)
    for key, length in lengths.items():
        x, y = key.split("-")
        edges[x].append((length, y))

    heap = [(0, start_point)]
    shortest_paths = {}
    while heap:
        cost, x = heappop(heap)
        if x in shortest_paths:
            continue  # this vertex had already been on the heap before
        shortest_paths[x] = cost
        for length, y in edges[x]:
            if y not in shortest_paths:
                heappush(heap, (cost   length, y))
    return shortest_paths

In my tests this ran hundreds times faster.

CodePudding user response：

Thanks to the above answer (wonderful analysis) I adjusted my implementation which is way faster than the previous version. It is shown below.

class Heap:
    def __init__(self):
        self.size = 0
        self.lst = []

    def swap(self, a):
        if self.size == 1:
            return self.lst
        else:
            if a == 1:
                i = 1
            else:
                i = a // 2
            while i > 0:
                if i * 2 - 1 >= self.size:
                    break
                elif self.lst[i - 1][1] > self.lst[i * 2 - 1][1]:
                    temp = self.lst[i - 1]
                    self.lst[i - 1] = self.lst[i * 2 - 1]
                    self.lst[i * 2 - 1] = temp
                elif i * 2 >= self.size:
                    break
                elif self.lst[i - 1][1] > self.lst[i * 2][1]:
                    temp = self.lst[i - 1]
                    self.lst[i - 1] = self.lst[i * 2]
                    self.lst[i * 2] = temp
                i -= 1
        #print(f"output: {self.lst}")

    def insert(self, element):
        #print(f"input: {self.lst}")
        self.lst.append(element)
        self.size  = 1
        self.swap(self.size)

    def extractmin(self):
        val = self.lst[0][0]
        del self.lst[0]
        self.size -= 1
        self.swap(self.size)
        return val

    def delete(self, deleted):
        ix = self.lst.index(deleted)
        temp = self.lst[-1]
        self.lst[ix] = temp
        self.lst[-1] = deleted
        del self.lst[-1]
        self.size -= 1
        #self.swap(self.size)


def FastDijkstra(vertices, start_point, lengths):
    X = []
    h = Heap()
    width = {}
    shortest_paths = {}
    for vertex in vertices:
        if vertex == start_point:
            width[vertex] = 0
            h.insert((vertex, width[vertex]))
        else:
            width[vertex] = 999999999999
            h.insert((vertex, width[vertex]))
    while h.size > 0:
        w = h.extractmin()
        X.append(w)
        shortest_paths[w] = width[w]
        Y = set(vertices).difference(X)
        for y in Y:
            key = f"{w}-{y}"
            if lengths.get(key) is not None:
                h.delete((y, width[y]))
                if width[y] > shortest_paths[w]   lengths[key]:
                    width[y] = shortest_paths[w]   lengths[key]
                h.insert((y, width[y]))
    return shortest_paths


start_time = time.time()
print(FastDijkstra(vertices=vertices, start_point='1', lengths=lengths)['197'])
print(time.time() - start_time, "seconds")