Home > Back-end >  How to make good use of BFS for sub milisecond execution?
How to make good use of BFS for sub milisecond execution?

Time:12-20

I have an implementation of BFS that works just fine, but seems to get really CPU heavy, even at low depth (sub ms for a depth of 4 but 10 ms for a depth of 10). I'm quite confident that this algorithm should run sub ms event at a depth of 100 but i'm not quite sure what I'm missing. Here's the code :

using System.Collections.Generic;
using System.Linq;
using UnityEngine;

public class VisionGraph : MonoBehaviour
{
    public Transform Ground;
    public int Height;
    public int Width;
    private MeshFilter mf;
    public Vector3[] Vertices;
    public float precision;
    public Vector3 SelectedVertex;


    // Start is called before the first frame update
    private void Start()
    {
        mf = Ground.GetComponent<MeshFilter>();
        Matrix4x4 localToWorld = transform.localToWorldMatrix;

        Vector3 world_max = localToWorld.MultiplyPoint3x4(mf.mesh.vertices[0]);

        Width = Mathf.RoundToInt(world_max.z * (1 / precision));
        int maxIndex = Mathf.RoundToInt((world_max.z * (1 / precision)) * (Height * (1 / precision)) * world_max.x);
        Vertices = new Vector3[maxIndex];
        
        //This is the graph initialization. 
        //Indices increment by 1 while actual coordinates increments by `precision`. 
        //Indices are then reversed to coordinates in BFS with `position/precision`.
        int xInd, yInd, zInd; xInd = yInd = zInd = 0;
        float x, y, z; x = y = z = 0;

        int index = 0;

        while (index < Vertices.Length - 1)
        {
            index = (yInd * (Width * Width))   (zInd * Width)   xInd;

            Debug.Log(index   " "   maxIndex);
            Vertices[index] = new(x, y, z);
            x  = precision;
            xInd  ;

            if (x > world_max.x)
            {
                x = 0;
                xInd = 0;
                z  = precision;
                zInd  ;

                if (z > world_max.z)
                {
                    z = 0;
                    zInd = 0;
                    y  = precision;
                    yInd  ;
                }
            }
        }

        SelectedVertex = Vertices[600];
    }

    private void OnDrawGizmos()
    {
        // Needs to be turned into retrieve index from position.
        // but i'm not sure how to clamp the continuous position to `precision` steps.

        SelectedVertex = Vertices.Where(v => Vector3.Distance(v, SelectedVertex) <= precision - 0.0001 ).FirstOrDefault();

        var watch = System.Diagnostics.Stopwatch.StartNew();
        List<Vector3Int> closeVertices = BFS(SelectedVertex, 10); // second param is the search depth

        watch.Stop();
        Debug.Log(watch.ElapsedMilliseconds);
        foreach (var vert in closeVertices)
        {
            var index = (vert.y * (Width * Width))   (vert.z * Width)   vert.x;
            if (index >= Vertices.Length) continue;
            Gizmos.color = Color.red;
            Gizmos.DrawSphere(Vertices[index], 0.1f);
        }
    }

    private List<Vector3Int> BFS(Vector3 start, int depth)
    {
        Vector3Int startIndex = new((int)(start.x / precision), (int)(start.y / precision), (int)(start.z / precision));

        Dictionary<Vector3Int, bool> closedList = new();
        List<Vector3Int> queue = new() { startIndex };

        while (queue.Count > 0)
        {
            Vector3Int v = queue[^1];
            queue.RemoveAt(queue.Count-1);

            Vector3Int[] neighbors = new[]
            {
                v   Vector3Int.left,
                v   Vector3Int.right,
                v   Vector3Int.up,
                v   Vector3Int.down,
                v   Vector3Int.forward,
                v   Vector3Int.back,
            };
            foreach (Vector3Int n in neighbors)
            {
                if (n.x < 0 || n.y < 0 || n.z < 0) continue; // this will alos include the high limit of the grid but i dont "need" it at this point of the tests

                //For every implementation of graph search algorithms I make, this always seem to bee the weak part.
                if ((n - startIndex).sqrMagnitude > depth*depth || queue.Any(vert => vert == n) || closedList.ContainsKey(n)) continue;

                queue.Insert(0, n);
            }
            closedList.Add(v, true); 
        }

        return closedList.Keys.ToList();
    }
}

The use of a Dictionary for the closedList is a poor attempt at trying to reduce the List searching time, it used to be closedList.Any(vert => vert == n) but I didn't see great change by the use of each. I'd be really glad if somebody could pinpoint what really slow this down.

Second question : How do you even MultiThread with BFS ? Both the queue and the closed list are very dynamic, is there a solution to work this out with NativeLists ? Thank you for your time. Let me know if anything's unclear.

CodePudding user response:

I cannot make out at a glance what your code is doing ( there is no docs and my C# is rusty ) But your BFS has an surprising amount of code ( A BFS usually only takes a few lines ). Why do you need to create a new, large data structure on EVERY pass through the while loop? That is expensive, and as far as I recall C# uses some bedeviled garbage collector thingy that is going to be fighting you all the way

You have a variable named queue, but it is not a queue but rather a list. This is confusing, and probably incorrect.

As far as I can make out, you are creating the neighbors of a node on the fly as you go along. Terrible idea! You should model the graph either with an adjacency matrix ( simplest ) or node adjacency lists ( more complicated, but memory efficient for large graphs ) Then you can move through the graph using the node indices to look up the neighbours without needing to create them over and over again.

For comparison, here is a C code for a BFS that uses a real queue and concerns itself only with node indices.

    void cPathFinder::breadth(
        std::function<void(int v, int p)> visitor)
    {
        if (!nodeCount())
            throw std::runtime_error("breadth called on empty graph");

        std::vector<bool> visited(nodeCount(), false);
        std::queue<int> Q;

        visited[myStart] = true;
        Q.push(myStart);

        while (Q.size())
        {
            int v = Q.front();
            Q.pop();
            for (int w : adjacent(v))
            {
                if (!visited[w])
                {
                    // reached a new node
                    visitor(w, v);
                    visited[w] = true;
                    Q.push(w);
                }
            }
        }
    }

CodePudding user response:

Upfront, this may, or may not be suitable in your situation, but as an exercise for myself as much as anything, I thought it'd be interesting to see how fast I could get the execution of the given code.

I created three methods in my test.

  1. The first was your method verbatim.
  2. The second method was your code, but 'rejigged' but taking the neighbours array creation out of the loop and adding in a Queue<Vector3Int> instead of the List<Vector3Int> as I suspected that the insertion to element 0 of the List<T> might be a big part of the issue in your original code also replacing your Dictionary<Vector3Int, bool> with a HashSet<Vector3Int>. I also replaced Any(v => v == n) with Contains(v). Contains turned out to be significantly faster.
  3. Then the third method I created, I went even further to add a second HashSet to enable a fast lookup of items that were still in the queue.

I then ran the tests against each method using a StopWatch to record the times. I also ran the last method again, but that time from Task.Run.

Here are the two 'optimised' methods:

private List<Vector3Int> BFS_Optimised ( Vector3 start, int depth )
{
    Vector3Int startIndex = new((int)(start.x / precision), (int)(start.y / precision), (int)(start.z / precision));
    HashSet<Vector3Int> closedList = new ();
    Queue<Vector3Int> queue = new ();
    queue.Enqueue(startIndex);
    var dSquared = depth * depth;

    Vector3Int[] neighbors =
        new[] { Vector3Int.left, Vector3Int.right, Vector3Int.up, Vector3Int.down, Vector3Int.forward, Vector3Int.back };

    while ( queue.Count > 0 )
    {
        var v = queue.Dequeue();
        for ( int i = 0; i < 6;   i )
        {
            var n = v   neighbors[i];
            if ( n.x < 0 || n.y < 0 || n.z < 0 )
                continue; // this will alos include the high limit of the grid but i dont "need" it at this point of the tests
            if ( ( n - startIndex ).sqrMagnitude > dSquared 
                    || closedList.Contains ( n ) 
                    ||  queue.Contains ( n ) ) // queue.Any(v => v == n ) ) //
                    continue;
            queue.Enqueue ( n );
        }
        closedList.Add ( v );
    }
    return closedList.ToList ( );
}


private List<Vector3Int> BFS_Optimised2 ( Vector3 start, int depth )
{
    Vector3Int startIndex = new((int)(start.x / precision), (int)(start.y / precision), (int)(start.z / precision));
    HashSet<Vector3Int> closedList = new ();
    Queue<Vector3Int> queue = new ();
    queue.Enqueue ( startIndex );
    HashSet<Vector3Int> qHash = new ( ) { startIndex };
    var dSquared = depth * depth;

    Vector3Int[] neighbors = 
            new[] { Vector3Int.left, Vector3Int.right, Vector3Int.up, Vector3Int.down, Vector3Int.forward, Vector3Int.back };

    while ( queue.Count > 0 )
    {
        var v = queue.Dequeue();
        qHash.Remove ( v );
        for ( int i = 0; i < 6; i   )
        {
            var n = v   neighbors[i];
            if ( n.x < 0 || n.y < 0 || n.z < 0 )
                continue; // this will alos include the high limit of the grid but i dont "need" it at this point of the tests
            if ( ( n - startIndex ).sqrMagnitude > dSquared 
                    || closedList.Contains ( n ) 
                    || qHash.Contains ( n ) )
                continue;
            queue.Enqueue ( n );
            qHash.Add ( n );
        }
        closedList.Add ( v );
    }
    return closedList.ToList ( );
}

Here's the test (excuse the crudeness and lack of depth in tests):

async void Run ( )
{
    var iterations = 100;
    var d = 10;
    var v = new Vector3 ( 10, 10, 10 );

    List<Vector3Int> r1 = default;
    List<Vector3Int> r2 = default;
    List<Vector3Int> r3 = default;
    List<Vector3Int> r4 = default;

    Debug.Log ( "Waiting ... " );
    await Task.Delay ( 2000 );
    Debug.Log ( "Run ... " );

    Stopwatch sw = new();
    sw.Start ( );
    for ( int i = 0; i < iterations; i   )
        r1 = BFS ( v, d );
    sw.Stop ( );
    var t1 = sw.Elapsed.TotalMilliseconds;


    sw.Restart ( );
    for ( int i = 0; i < iterations; i   )
        r2 = BFS_Optimised ( v, d );
    sw.Stop ( );
    var t2 = sw.Elapsed.TotalMilliseconds;


    sw.Restart ( );
    for ( int i = 0; i < iterations; i   )
        r3 = BFS_Optimised2 ( v, d );
    sw.Stop ( );
    var t3 = sw.Elapsed.TotalMilliseconds;


    sw.Restart ( );
    r4 = await Task.Run ( ( ) => BFS_Optimised2 ( v, d ) );
    sw.Stop ( );
    var t4 = sw.Elapsed.TotalMilliseconds;



    StringBuilder sb = new();
    sb.AppendLine ( $"Original : {t1} ms [{r1.Count}] ({r1 [ 0 ]}) .. ({r1 [ ^1 ]})" );
    sb.AppendLine ( $"Optimised : {t2} ms [{r2.Count}] ({r2 [ 0 ]}) .. ({r2 [ ^1 ]})" );
    sb.AppendLine ( $"Optimised2 : {t3} ms [{r3.Count}] ({r3 [ 0 ]}) .. ({r3 [ ^1 ]})" );
    sb.AppendLine ( $"Optimised2 Task.Run : {t4} ms [{r4.Count}] ({r4 [ 0 ]}) .. ({r4 [ ^1 ]})" );
    Debug.Log ( sb.ToString ( ) );
}

And here are the results:

Original : 10701.7465 ms [4169] ((10, 10, 10)) .. ((15, 5, 3))
Optimised : 1830.9519 ms [4169] ((10, 10, 10)) .. ((15, 5, 3))
Optimised2 : 209.1559 ms [4169] ((10, 10, 10)) .. ((15, 5, 3))
Optimised2 Task.Run : 17.7353 ms [4169] ((10, 10, 10)) .. ((15, 5, 3))
  • Related