I am trying to make a CUDA stream instance automatically delete itself once all its usages have been removed, and I was wondering: when calling cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking), is the stream object created on the heap or not? (I am assuming it is, but I am not sure.)
In the end I want to do something like this:
// Custom deleter for shared_ptr<cudaStream_t>: destroys the CUDA stream the
// pointee refers to, but does NOT free the cudaStream_t object itself.
// NOTE(review): if the managed pointer were heap-allocated this would leak it;
// as used in the loop below it receives the address of a stack variable, so
// it runs on a dangling pointer — see the answer further down.
struct CUDAStreamDeleter {
void operator()(cudaStream_t* p) const
{
cudaStreamDestroy(*p);
}
};
int main() {
int numberOfStreams = 4;
vector<shared_ptr<cudaStream_t>> streams(numberOfStreams);
for (size_t i = 0; i < numberOfStreams; i)
{
cudaStream_t stream;
cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
streams[i] = shared_ptr<cudaStream_t>(&stream, CUDAStreamDeleter());
}
}
Edit:
As per @wohlstad, a better approach would be an RAII wrapper:
// RAII wrapper: creates a non-blocking CUDA stream on construction and
// destroys it on destruction, so the stream's lifetime is tied to the object.
class StreamWrapper {
public:
    StreamWrapper() {
        gpuErrchk(cudaStreamCreateWithFlags(&this->stream, cudaStreamNonBlocking));
    }
    ~StreamWrapper() { gpuErrchk(cudaStreamDestroy(stream)); }

    // Non-copyable: a copy would hold the same handle and the second
    // destructor would call cudaStreamDestroy on an already-destroyed stream.
    StreamWrapper(const StreamWrapper&) = delete;
    StreamWrapper& operator=(const StreamWrapper&) = delete;

    // Reference to the underlying handle (valid while this object lives).
    cudaStream_t& get() { return stream; }
    // Pointer to the underlying handle (valid while this object lives).
    cudaStream_t* ref() { return &this->stream; }

private:
    cudaStream_t stream;
};
int main(){
    int numberOfStreams = 10;
    vector<shared_ptr<StreamWrapper>> streamsTemp(numberOfStreams);
    for (int i = 0; i < numberOfStreams; ++i)  // was "i)" — increment was lost, loop never advanced
    {
        streamsTemp[i] = shared_ptr<StreamWrapper>(new StreamWrapper());
    }
    // Stream testing
    for (int i = 0; i < numberOfStreams; ++i)
    {
        int * d_i;
        gpuErrchk(cudaMallocAsync(&d_i, sizeof(int), streamsTemp[i]->get()));
        // Copy from an int local so the source really is sizeof(int) bytes
        // (the original copied sizeof(int) bytes out of a size_t loop index).
        int in = i;
        gpuErrchk(cudaMemcpyAsync(d_i, &in, sizeof(int), cudaMemcpyHostToDevice, streamsTemp[i]->get()));
        int out;
        gpuErrchk(cudaMemcpyAsync(&out, d_i, sizeof(int), cudaMemcpyDeviceToHost, streamsTemp[i]->get()));
        gpuErrchk(cudaFreeAsync(d_i, streamsTemp[i]->get()));
        // Block until this stream's copies have completed before reading `out`.
        gpuErrchk(cudaStreamSynchronize(streamsTemp[i]->get()));
        cout << "Out: " << to_string(out) << " In: " << to_string(i);
    }
}
CodePudding user response:
As mentioned in several comments above (including mine), your first attempt involves creating std::shared_ptrs managing dangling pointers.
This is because these pointers are actually addresses of automatic variables created on the stack in the scope of the loop body (and therefore become dangling once the variables get out of scope).
However - you can use the RAII idiom to achieve what you need:
StreamWrapper
will create the stream in the ctor, and destroy it in the dtor.
#include "cuda_runtime.h"
#include <vector>
#include <memory>
#include <iostream>
#include <string>
#define gpuErrchk(X) X // use your current definition of gpuErrchk
// RAII class:
// RAII class: the CUDA stream is created in the ctor and destroyed in the dtor.
class StreamWrapper {
public:
    StreamWrapper() { gpuErrchk(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); }
    ~StreamWrapper() { gpuErrchk(cudaStreamDestroy(stream)); }

    // Non-copyable: a copy would share the handle and the second destructor
    // would call cudaStreamDestroy on an already-destroyed stream.
    StreamWrapper(const StreamWrapper&) = delete;
    StreamWrapper& operator=(const StreamWrapper&) = delete;

    // Reference to the underlying handle (valid while this object lives).
    cudaStream_t& get() { return stream; }

private:
    cudaStream_t stream;
};
int main() {
int numberOfStreams = 10;
std::vector<std::shared_ptr<StreamWrapper>> streamsTemp(numberOfStreams);
for (size_t i = 0; i < numberOfStreams; i)
{
streamsTemp[i] = std::make_shared<StreamWrapper>();
}
// Stream testing
for (size_t i = 0; i < numberOfStreams; i)
{
int* d_i;
gpuErrchk(cudaMallocAsync(&d_i, sizeof(int), streamsTemp[i]->get()));
gpuErrchk(cudaMemcpyAsync(d_i, &i, sizeof(int), cudaMemcpyHostToDevice, streamsTemp[i]->get()));
int out;
gpuErrchk(cudaMemcpyAsync(&out, d_i, sizeof(int), cudaMemcpyDeviceToHost, streamsTemp[i]->get()));
gpuErrchk(cudaFreeAsync(d_i, streamsTemp[i]->get()));
gpuErrchk(cudaStreamSynchronize(streamsTemp[i]->get()));
std::cout << "Out: " << std::to_string(out) << " In: " << std::to_string(i) << std::endl;
}
}
Notes:
- When initializing a
std::shared_ptr
it is better to usestd::make_shared
. See here: Difference in make_shared and normal shared_ptr in C++. - Better to avoid
using namespace std
- see here: Why is "using namespace std;" considered bad practice?.