What I'm trying to do
I'm testing out metals capability to work with loops. Since I can't define new constants in metal, I'm passing a uint
into the buffer and use it to iterate over an array filled with integers. It looks like this in swift.
let array1: [Int] = [1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6]
The problem(s)
However when reading the result array buffer in Swift after completing the loop in metal, it seems like not every element has been allocated.
#include <metal_stdlib>
using namespace metal;
kernel void shader(constant int *arr [[ buffer(0) ]],
device int *resultArray [[ buffer(1) ]],
constant uint &iter [[ buffer(2) ]]) // value of 12
{
for (uint i = 0; i < iter; i ){
resultArray[i] = arr[i];
}
}
out
1
2
3
4
5
6
0
0
0
0
0
0
Similarly, using the iterator to set allocate each element of resultArray
, yields strange results
for (uint i = 0; i < iter; i ){
resultArray[i] = i;
}
out
4294967296
12884901890
21474836484
30064771078
38654705672
47244640266
0
0
0
0
0
0
Multiplication seems to work
for (uint i = 0; i < iter; i ){
resultArray[i] = arr[i] * i;
}
out
0
4
12
24
40
60
0
0
0
0
0
0
Addition does not
for (uint i = 0; i < iter; i ){
resultArray[i] = arr[i] i;
}
out
4294967297
12884901892
21474836487
30064771082
38654705677
47244640272
0
0
0
0
0
0
When however, I set iter
to a value of for example 24
or higher, it at least iterated over the whole arrays of size 12
.
for (uint i = 0; i < iter; i ){ // iter now value of 100
resultArray[i] = arr[i] * iter;
}
100
200
300
400
500
600
100
200
300
400
500
600
What is going on here?
MCVE
yes, it's a lot of code to get a simple loop running in metal, please bare with me
main.swift
import MetalKit
let array1: [Int] = [1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6]
func gpuProcess(arr1: [Int]) {
let size = arr1.count // value of 12
// GPU we want to use
let device = MTLCreateSystemDefaultDevice()
// Fifo queue for sending commands to the gpu
let commandQueue = device?.makeCommandQueue()
// The library for getting our metal functions
let gpuFunctionLibrary = device?.makeDefaultLibrary()
// Grab gpu function
let additionGPUFunction = gpuFunctionLibrary?.makeFunction(name: "shader")
var additionComputePipelineState: MTLComputePipelineState!
do {
additionComputePipelineState = try device?.makeComputePipelineState(function: additionGPUFunction!)
} catch {
print(error)
}
// Create buffers to be sent to the gpu from our array
let arr1Buff = device?.makeBuffer(bytes: arr1,
length: MemoryLayout<Int>.size * size ,
options: .storageModeShared)
let resultBuff = device?.makeBuffer(length: MemoryLayout<Int>.size * size,
options: .storageModeShared)
// Create the buffer to be sent to the command queue
let commandBuffer = commandQueue?.makeCommandBuffer()
// Create an encoder to set values on the compute function
let commandEncoder = commandBuffer?.makeComputeCommandEncoder()
commandEncoder?.setComputePipelineState(additionComputePipelineState)
// Set the parameters of our gpu function
commandEncoder?.setBuffer(arr1Buff, offset: 0, index: 0)
commandEncoder?.setBuffer(resultBuff, offset: 0, index: 1)
// Set parameters for our iterator
var count = size
commandEncoder?.setBytes(&count, length: MemoryLayout.size(ofValue: count), index: 2)
// Figure out how many threads we need to use for our operation
let threadsPerGrid = MTLSize(width: 1, height: 1, depth: 1)
let maxThreadsPerThreadgroup = additionComputePipelineState.maxTotalThreadsPerThreadgroup // 1024
let threadsPerThreadgroup = MTLSize(width: maxThreadsPerThreadgroup, height: 1, depth: 1)
commandEncoder?.dispatchThreads(threadsPerGrid,
threadsPerThreadgroup: threadsPerThreadgroup)
// Tell encoder that it is done encoding. Now we can send this off to the gpu.
commandEncoder?.endEncoding()
// Push this command to the command queue for processing
commandBuffer?.commit()
// Wait until the gpu function completes before working with any of the data
commandBuffer?.waitUntilCompleted()
// Get the pointer to the beginning of our data
var resultBufferPointer = resultBuff?.contents().bindMemory(to: Int.self,
capacity: MemoryLayout<Int>.size * size)
// Print out all of our new added together array information
for _ in 0..<size {
print("\(Int(resultBufferPointer!.pointee) as Any)")
resultBufferPointer = resultBufferPointer?.advanced(by: 1)
}
}
// Call function
gpuProcess(arr1: array1)
compute.metal
#include <metal_stdlib>
using namespace metal;
kernel void shader(constant int *arr [[ buffer(0) ]],
device int *resultArray [[ buffer(1) ]],
constant uint &iter [[ buffer(2) ]]) // value of 12
{
for (uint i = 0; i < iter; i ){
resultArray[i] = arr[i] * iter;
}
}
CodePudding user response:
You are using 64 bit Int
in Swift and 32 bit integers in MSL. Your GPU threads are also overlapping their work. Instead, use Int32
in Swift and make each thread process their own piece of data. Like this
import MetalKit
let array1: [Int32] = [1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6]
func gpuProcess(arr1: [Int32]) {
let size = arr1.count // value of 12
// GPU we want to use
let device = MTLCreateSystemDefaultDevice()
// Fifo queue for sending commands to the gpu
let commandQueue = device?.makeCommandQueue()
// The library for getting our metal functions
let gpuFunctionLibrary = device?.makeDefaultLibrary()
// Grab gpu function
let additionGPUFunction = gpuFunctionLibrary?.makeFunction(name: "shader")
var additionComputePipelineState: MTLComputePipelineState!
do {
additionComputePipelineState = try device?.makeComputePipelineState(function: additionGPUFunction!)
} catch {
print(error)
}
// Create buffers to be sent to the gpu from our array
let arr1Buff = device?.makeBuffer(bytes: arr1,
length: MemoryLayout<Int32>.stride * size ,
options: .storageModeShared)
let resultBuff = device?.makeBuffer(length: MemoryLayout<Int32>.stride * size,
options: .storageModeShared)
// Create the buffer to be sent to the command queue
let commandBuffer = commandQueue?.makeCommandBuffer()
// Create an encoder to set values on the compute function
let commandEncoder = commandBuffer?.makeComputeCommandEncoder()
commandEncoder?.setComputePipelineState(additionComputePipelineState)
// Set the parameters of our gpu function
commandEncoder?.setBuffer(arr1Buff, offset: 0, index: 0)
commandEncoder?.setBuffer(resultBuff, offset: 0, index: 1)
// Set parameters for our iterator
var count = size
commandEncoder?.setBytes(&count, length: MemoryLayout.size(ofValue: count), index: 2)
// Figure out how many threads we need to use for our operation
let threadsPerGrid = MTLSize(width: 1, height: 1, depth: 1)
let maxThreadsPerThreadgroup = additionComputePipelineState.maxTotalThreadsPerThreadgroup // 1024
let threadsPerThreadgroup = MTLSize(width: maxThreadsPerThreadgroup, height: 1, depth: 1)
commandEncoder?.dispatchThreads(threadsPerGrid,
threadsPerThreadgroup: threadsPerThreadgroup)
// Tell encoder that it is done encoding. Now we can send this off to the gpu.
commandEncoder?.endEncoding()
// Push this command to the command queue for processing
commandBuffer?.commit()
// Wait until the gpu function completes before working with any of the data
commandBuffer?.waitUntilCompleted()
// Get the pointer to the beginning of our data
var resultBufferPointer = resultBuff?.contents().bindMemory(to: Int32.self,
capacity: MemoryLayout<Int32>.stride * size)
// Print out all of our new added together array information
for _ in 0..<size {
print("\(Int32(resultBufferPointer!.pointee) as Any)")
resultBufferPointer = resultBufferPointer?.advanced(by: 1)
}
}
// Call function
gpuProcess(arr1: array1)
Kernel:
#include <metal_stdlib>
using namespace metal;
kernel void shader(constant int *arr [[ buffer(0) ]],
device int *resultArray [[ buffer(1) ]],
constant uint &iter [[ buffer(2) ]],
uint gid [[ thread_position_in_grid ]], // this is thread index in grid, since you have height and depth of a dispatch set to 1 in CPU code, you can use 1D `int` here.
)
{
// Early out if gid is out of array boudns
if(gid >= iter)
{
return;
}
// Each thread processes it's own data
resultArray[gid] = arr[gid] * iter;
}
For more information on how to use Metal for compute refer to developer docs and for the information about attributes such as thread_position_in_grid
refer to Metal Shading Language specification.