I wrote this piece of code to make a median Blur in CUDA but I am running into an issue, where the channel of image is blurred but it creates stripes which look unusual for blurring.
#include <iostream>
#include <opencv2/core.hpp>
#include <opencv2/imgcodecs.hpp>
using namespace std;
using namespace cv;
#define BLOCK_SIZE 16
#define TILE_SIZE 14
#define FILTER_WIDTH 3
#define FILTER_HEIGHT 3
__device__ void sort(unsigned char* filterVector)
{
for (int i = 0; i < FILTER_WIDTH*FILTER_HEIGHT; i ) {
for (int j = i 1; j < FILTER_WIDTH*FILTER_HEIGHT; j ) {
if (filterVector[i] > filterVector[j]) {
unsigned char tmp = filterVector[i];
filterVector[i] = filterVector[j];
filterVector[j] = tmp;
}
}
}
}
__global__ void medianFilter(unsigned char *srcImage, unsigned char *dstImage, unsigned int width, unsigned int height)
{
int x_o = TILE_SIZE * blockIdx.x threadIdx.x;
int y_o = TILE_SIZE * blockIdx.y threadIdx.y;
int x_i = x_o - (FILTER_HEIGHT / 2);
int y_i = y_o - (FILTER_WIDTH / 2);
__shared__ unsigned char sBuffer[BLOCK_SIZE][BLOCK_SIZE];
if ((x_i >= 0) && (x_i < width) && (y_i >= 0) && (y_i < height)) {
sBuffer[threadIdx.y][threadIdx.x] = srcImage[y_i * width x_i];
} else {
sBuffer[threadIdx.y][threadIdx.x] = 0;
}
__syncthreads();
unsigned char filterVector[FILTER_WIDTH*FILTER_HEIGHT];
// int size_vec = sizeof(filterVector) / sizeof(filterVector[0]);
// printf("%d \n", size_vec);
if (threadIdx.x < TILE_SIZE && threadIdx.y < TILE_SIZE) {
for (int r = 0; r < FILTER_HEIGHT; r ) {
for (int c = 0; c < FILTER_HEIGHT; c ) {
filterVector[r*FILTER_HEIGHT c] = sBuffer[threadIdx.y r][threadIdx.x c];
}
}
}
sort(filterVector);
if (x_o < width && y_o < height) {
dstImage[y_o * width x_o] = filterVector[4]; // (FILTER_WIDTH*FILTER_HEIGHT)/2
}
}
int main(int argc, char **argv)
{
std::string image_path = "./test.jpg";
cv::Mat img = imread(image_path, IMREAD_COLOR);
std::string output_file = "test_gpu.jpg";
if(img.empty())
{
std::cout << "Couldn't read img:" << image_path << std::endl;
}
Mat bgr[3];
split(img, bgr);
cv::Mat dstImg (bgr[1].size(), bgr[1].type());
const int inputSize = img.cols * img.rows;
const int outputSize = dstImg.cols * dstImg.rows;
unsigned char *d_input, *d_output;
cudaMalloc<unsigned char>(&d_input, inputSize);
cudaMalloc<unsigned char>(&d_output, outputSize);
cudaMemcpy(d_input, bgr[1].ptr(), inputSize, cudaMemcpyHostToDevice);
const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
const dim3 grid((dstImg.cols TILE_SIZE - 1)/TILE_SIZE, (dstImg.rows TILE_SIZE - 1)/TILE_SIZE);
medianFilter<<<grid,block>>>(d_input, d_output, dstImg.cols, dstImg.rows);
cudaMemcpy(dstImg.ptr(), d_output, outputSize, cudaMemcpyDeviceToHost);
cudaFree(d_input);
cudaFree(d_output);
imwrite(output_file, dstImg);
}
This is my original image:
and here is one blurred channel:
For some reason I get those stripes on the output image, which is just one of the channels for now. Any idea why this is happening?
CodePudding user response:
Your intention is that even though you are launching a block of dimension (BLOCK_SIZE
, BLOCK_SIZE
), you only intend (TILE_SIZE
, TILE_SIZE
) threads in that block to actually compute the values for output pixels.
However you are not properly accounting for that here:
if (x_o < width && y_o < height) {
that should be, instead:
if (x_o < width && y_o < height && threadIdx.x < TILE_SIZE && threadIdx.y < TILE_SIZE) {
(In fact, everything after the __syncthreads()
in your kernel can be conditioned to only execute if threadIdx.x < TILE_SIZE && threadIdx.y < TILE_SIZE
if you wish.)