What causes the iteration of an array to conclude prematurely in Metal?


What I'm trying to do

I'm testing out Metal's capability to work with loops. Since I can't define new constants in Metal, I'm passing a uint into a buffer and using it to iterate over an array filled with integers. It looks like this in Swift.

let array1: [Int] = [1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6]

The problem(s)

However, when reading the result array buffer in Swift after the loop in Metal completes, it seems like not every element has been written.

#include <metal_stdlib>
using namespace metal;

kernel void shader(constant int *arr        [[ buffer(0) ]],
                   device   int *resultArray [[ buffer(1) ]],
                   constant uint &iter       [[ buffer(2) ]]) // value of 12
{
    
    for (uint i = 0; i < iter; i++) {
        resultArray[i] = arr[i];
    }
    
}

out

1
2
3
4
5
6
0
0
0
0
0
0

Similarly, using the iterator itself to set each element of resultArray yields strange results.

for (uint i = 0; i < iter; i++) {
    resultArray[i] = i;
}

out

4294967296
12884901890
21474836484
30064771078
38654705672
47244640266
0
0
0
0
0
0

Multiplication seems to work

for (uint i = 0; i < iter; i++) {
    resultArray[i] = arr[i] * i;
}

out

0
4
12
24
40
60
0
0
0
0
0
0

Addition does not

for (uint i = 0; i < iter; i++) {
    resultArray[i] = arr[i] + i;
}

out

4294967297
12884901892
21474836487
30064771082
38654705677
47244640272
0
0
0
0
0
0

However, when I set iter to a value of, for example, 24 or higher, it at least iterates over the whole array of size 12.

for (uint i = 0; i < iter; i++) { // iter now has a value of 100
    resultArray[i] = arr[i] * iter;
}

out
100
200
300
400
500
600
100
200
300
400
500
600

What is going on here?

MCVE

Yes, it's a lot of code to get a simple loop running in Metal; please bear with me.

main.swift

import MetalKit

let array1: [Int] = [1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6]

func gpuProcess(arr1: [Int]) {
    
    let size = arr1.count // value of 12

    // GPU we want to use
    let device = MTLCreateSystemDefaultDevice()

    // Fifo queue for sending commands to the gpu
    let commandQueue = device?.makeCommandQueue()

    // The library for getting our metal functions
    let gpuFunctionLibrary = device?.makeDefaultLibrary()

    // Grab gpu function
    let additionGPUFunction = gpuFunctionLibrary?.makeFunction(name: "shader")

    var additionComputePipelineState: MTLComputePipelineState!
    do {
        additionComputePipelineState = try device?.makeComputePipelineState(function: additionGPUFunction!)
    } catch {
      print(error)
    }

    // Create buffers to be sent to the gpu from our array
    let arr1Buff = device?.makeBuffer(bytes: arr1,
                                      length: MemoryLayout<Int>.size * size ,
                                      options: .storageModeShared)
    
    let resultBuff = device?.makeBuffer(length: MemoryLayout<Int>.size * size,
                                        options: .storageModeShared)

    // Create the buffer to be sent to the command queue
    let commandBuffer = commandQueue?.makeCommandBuffer()

    // Create an encoder to set values on the compute function
    let commandEncoder = commandBuffer?.makeComputeCommandEncoder()
    commandEncoder?.setComputePipelineState(additionComputePipelineState)

    // Set the parameters of our gpu function
    commandEncoder?.setBuffer(arr1Buff, offset: 0, index: 0)
    commandEncoder?.setBuffer(resultBuff, offset: 0, index: 1)
    
    // Set parameters for our iterator
    var count = size
    commandEncoder?.setBytes(&count, length: MemoryLayout.size(ofValue: count), index: 2)

    // Figure out how many threads we need to use for our operation
    let threadsPerGrid = MTLSize(width: 1, height: 1, depth: 1)
    let maxThreadsPerThreadgroup = additionComputePipelineState.maxTotalThreadsPerThreadgroup // 1024
    let threadsPerThreadgroup = MTLSize(width: maxThreadsPerThreadgroup, height: 1, depth: 1)
    commandEncoder?.dispatchThreads(threadsPerGrid,
                                    threadsPerThreadgroup: threadsPerThreadgroup)

    // Tell encoder that it is done encoding.  Now we can send this off to the gpu.
    commandEncoder?.endEncoding()

    // Push this command to the command queue for processing
    commandBuffer?.commit()

    // Wait until the gpu function completes before working with any of the data
    commandBuffer?.waitUntilCompleted()

    // Get the pointer to the beginning of our data
    var resultBufferPointer = resultBuff?.contents().bindMemory(to: Int.self,
                                                                capacity: MemoryLayout<Int>.size * size)

    // Print out all of our new added together array information
    for _ in 0..<size {
        print("\(Int(resultBufferPointer!.pointee) as Any)")
        resultBufferPointer = resultBufferPointer?.advanced(by: 1)
    }
    
}

// Call function
gpuProcess(arr1: array1)

compute.metal

#include <metal_stdlib>
using namespace metal;

kernel void shader(constant int *arr        [[ buffer(0) ]],
                   device   int *resultArray [[ buffer(1) ]],
                   constant uint &iter       [[ buffer(2) ]]) // value of 12
{
    
    for (uint i = 0; i < iter; i++) {
        resultArray[i] = arr[i] * iter;
    }
    
}

CodePudding user response:

You are using 64-bit Int in Swift but 32-bit int in MSL, and your GPU threads are also overlapping their work. Instead, use Int32 in Swift and make each thread process its own piece of data, as in the corrected code below.
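To see concretely how the first point produces the numbers in the resultArray[i] = i output above, here is a minimal CPU-only sketch (no Metal involved, written for this answer): it takes the 32-bit values the kernel writes and reads the same bytes back as the 64-bit integers the Swift readback binds the buffer to.

// The twelve 32-bit values the kernel writes for resultArray[i] = i with iter == 12
let written: [Int32] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]

// Reinterpret the same 48 bytes as 64-bit integers, the way the readback
// does when it binds the result buffer with bindMemory(to: Int.self)
written.withUnsafeBytes { raw in
    for value in raw.bindMemory(to: Int64.self) {
        print(value) // 4294967296, 12884901890, 21474836484, 30064771078, 38654705672, 47244640266
    }
}

Those are exactly the six non-zero values in the question's output: each 8-byte Int on the Swift side holds two of the kernel's 4-byte writes, and the twelve 4-byte writes only cover half of the 96-byte result buffer, which is why the remaining six slots come back as 0. With that in mind, the corrected host code: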

import MetalKit

let array1: [Int32] = [1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6]

func gpuProcess(arr1: [Int32]) {
    
    let size = arr1.count // value of 12

    // GPU we want to use
    let device = MTLCreateSystemDefaultDevice()

    // Fifo queue for sending commands to the gpu
    let commandQueue = device?.makeCommandQueue()

    // The library for getting our metal functions
    let gpuFunctionLibrary = device?.makeDefaultLibrary()

    // Grab gpu function
    let additionGPUFunction = gpuFunctionLibrary?.makeFunction(name: "shader")

    var additionComputePipelineState: MTLComputePipelineState!
    do {
        additionComputePipelineState = try device?.makeComputePipelineState(function: additionGPUFunction!)
    } catch {
      print(error)
    }

    // Create buffers to be sent to the gpu from our array
    let arr1Buff = device?.makeBuffer(bytes: arr1,
                                      length: MemoryLayout<Int32>.stride * size ,
                                      options: .storageModeShared)
    
    let resultBuff = device?.makeBuffer(length: MemoryLayout<Int32>.stride * size,
                                        options: .storageModeShared)

    // Create the buffer to be sent to the command queue
    let commandBuffer = commandQueue?.makeCommandBuffer()

    // Create an encoder to set values on the compute function
    let commandEncoder = commandBuffer?.makeComputeCommandEncoder()
    commandEncoder?.setComputePipelineState(additionComputePipelineState)

    // Set the parameters of our gpu function
    commandEncoder?.setBuffer(arr1Buff, offset: 0, index: 0)
    commandEncoder?.setBuffer(resultBuff, offset: 0, index: 1)
    
    // Set parameters for our iterator
    var count = UInt32(size) // the kernel reads this as constant uint &iter
    commandEncoder?.setBytes(&count, length: MemoryLayout.size(ofValue: count), index: 2)

    // Dispatch one thread per array element (the kernel indexes by thread_position_in_grid)
    let threadsPerGrid = MTLSize(width: size, height: 1, depth: 1)
    let maxThreadsPerThreadgroup = additionComputePipelineState.maxTotalThreadsPerThreadgroup // 1024
    let threadsPerThreadgroup = MTLSize(width: maxThreadsPerThreadgroup, height: 1, depth: 1)
    commandEncoder?.dispatchThreads(threadsPerGrid,
                                    threadsPerThreadgroup: threadsPerThreadgroup)

    // Tell encoder that it is done encoding.  Now we can send this off to the gpu.
    commandEncoder?.endEncoding()

    // Push this command to the command queue for processing
    commandBuffer?.commit()

    // Wait until the gpu function completes before working with any of the data
    commandBuffer?.waitUntilCompleted()

    // Get the pointer to the beginning of our data
    var resultBufferPointer = resultBuff?.contents().bindMemory(to: Int32.self,
                                                                capacity: size)

    // Print out all of our new added together array information
    for _ in 0..<size {
        print("\(Int32(resultBufferPointer!.pointee) as Any)")
        resultBufferPointer = resultBufferPointer?.advanced(by: 1)
    }
    
}

// Call function
gpuProcess(arr1: array1)

Kernel:

#include <metal_stdlib>
using namespace metal;

kernel void shader(constant int *arr        [[ buffer(0) ]],
                   device   int *resultArray [[ buffer(1) ]],
                   constant uint &iter       [[ buffer(2) ]],
                   uint gid [[ thread_position_in_grid ]]) // thread index in the grid; height and depth of the dispatch are 1 on the CPU side, so a 1D index is enough
{
    // Early out if gid is out of array bounds
    if(gid >= iter)
    {
        return;
    }

    // Each thread processes its own data
    resultArray[gid] = arr[gid] * iter;
}

For more information on how to use Metal for compute, refer to Apple's developer documentation; for details about attributes such as thread_position_in_grid, refer to the Metal Shading Language Specification.
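A simple way to sanity-check the GPU result is to compute the same thing on the CPU and compare. A small sketch, assuming the kernel above (resultArray[gid] = arr[gid] * iter) and the 12-element input array from the question:

// CPU reference for the corrected kernel: every element of arr multiplied by the count
let array1: [Int32] = [1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6]
let iter = Int32(array1.count)          // 12, the same value passed to the kernel
let expected = array1.map { $0 * iter } // what resultArray[gid] should hold after the dispatch
print(expected)                         // [12, 24, 36, 48, 60, 72, 12, 24, 36, 48, 60, 72]

Comparing these values against what is read back from resultBuff should confirm that each of the twelve elements is now written exactly once.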
