cuda - Intro to Parallel Programming CS344 -
when doing exercise on udacity course, intro parallel programming cs344, got same errors. i'm not sure code right, , cannot input data. here's error message , code. exercise problem set 4.
---------- numelems:220480 numelems:220480 cuda error at: student_func.cu:238 unspecified launch failure cudagetlasterror() terminate called after throwing instance of 'thrust::system::system_error' what(): unload of cuda runtime failed unable execute code. did set grid and/or block size correctly? code compiled!
my code:
//udacity hw 4 //radix sorting #include "reference_calc.cpp" #include "utils.h" #include <cstdio> #define block_size 1024 /* red eye removal =============== assignment implementing red eye removal. accomplished first creating score every pixel tells how red eye pixel. have done - receiving scores , need sort them in ascending order know pixels alter remove red eye. note: ascending order == smallest largest each score associated position, when sort scores, must move positions accordingly. implementing parallel radix sort cuda ========================================== basic idea construct histogram on each pass of how many of each "digit" there are. scan histogram know put output of each digit. example, first 1 must come after 0s have know how many 0s there able start moving 1s correct position. 1) histogram of number of occurrences of each digit 2) exclusive prefix sum of histogram 3) determine relative offset of each digit example [0 0 1 1 0 0 1] -> [0 1 0 1 2 3 2] 4) combine results of steps 2 & 3 determine final output location each element , move there lsb radix sort out-of-place sort , need ping-pong values between input , output buffers have provided. make sure final sorted results end in output buffer! hint: may need copy @ end. */ __global__ void histogramkernel(unsigned int * input, unsigned int size, unsigned int* histogram, unsigned int pass) { int mid = threadidx.x + blockidx.x * blockdim.x; if (mid < size) { atomicadd(&histogram[(input[mid]>>pass) & 0x01], 1); } } __global__ void scan_sum_kernel(unsigned int* input_vals, unsigned int pass, unsigned int * output, unsigned int* output_block, unsigned int size, unsigned int block_num) { unsigned int tid = threadidx.x; unsigned int mid = threadidx.x + blockidx.x * blockdim.x; __shared__ unsigned int shared_input_vals[block_size]; __shared__ unsigned int shared_output[block_size]; if (mid >= size) { shared_input_vals[tid] = 0xffffffff; } else { shared_input_vals[tid] = input_vals[mid]; } __syncthreads(); if (tid == 0 || ((shared_input_vals[tid - 1] >> pass) & 0x01)) { shared_output[tid] = 0; } else { shared_output[tid] = 1; } __syncthreads(); (unsigned int = 1; < block_size; <<= 1) { unsigned int val = 0; if (tid >= i) { val = shared_output[tid - i]; } __syncthreads(); shared_output[tid] += val; __syncthreads(); } if (mid < size) { output[mid] = shared_output[tid]; if ((mid == size - 1) || ((tid == block_size-1) && (mid < size))) { output_block[mid/block_size] = shared_output[tid]; if (!((shared_input_vals[tid] >> pass) & 0x01)) { //output_block[mid/block_size] += 1; output_block[blockidx.x] += 1; } } } __syncthreads(); } __global__ void scan_kernel(unsigned int* output_block, unsigned int block_num) { __shared__ unsigned int shared_output[block_size]; if (threadidx.x >= block_num || threadidx.x == 0) { shared_output[threadidx.x] = 0x0; } else { shared_output[threadidx.x] = output_block[threadidx.x - 1]; } (unsigned int = 1; < block_num; <<= 1) { unsigned int val = 0; if (threadidx.x >= i) { val = shared_output[threadidx.x - i]; } __syncthreads(); shared_output[threadidx.x] += val; __syncthreads(); } if (threadidx.x < block_num) { output_block[threadidx.x] = shared_output[threadidx.x]; } __syncthreads(); } void show_data(unsigned int* d_data, unsigned int len, char* tag) { unsigned int* h_data = (unsigned int*) malloc(len * sizeof(unsigned int)); cudamemcpy(h_data, d_data, sizeof(unsigned int) * len, cudamemcpydevicetohost); (unsigned int = max((int)len - 1000, 0); < len; ++i) { printf("%s[%u]=%u; ", tag, i, h_data[i]); } free(h_data); } __global__ void scan_large_sum_kernel(unsigned int* output_block, unsigned int* output_val, unsigned int* output_pos, unsigned int* input_val, unsigned int* input_pos, unsigned int* histogram, unsigned int pass, unsigned int block_num, unsigned int size) { __shared__ unsigned int shared_prefix_sum[block_size]; unsigned int tid = threadidx.x; unsigned int mid = threadidx.x + blockidx.x * blockdim.x; shared_prefix_sum[tid] = 0; __syncthreads(); if (mid >= size) { shared_prefix_sum[tid] = output_val[size-1] + 1; } else { shared_prefix_sum[tid] = output_block[mid / block_size] + output_val[mid]; } __syncthreads(); if (mid < size) { unsigned int location = shared_prefix_sum[tid]; if ((input_val[mid] >> pass) & 0x01) { location = mid + histogram[0] - shared_prefix_sum[tid]; } //output_val[location] = input_val[mid]; //output_pos[location] = input_pos[mid]; output_val[mid] = location; } __syncthreads(); } __global__ void scatter_kernel(unsigned int* d_inputvals, unsigned int* d_inputpos, unsigned int* d_outputvals, unsigned int* d_outputpos, size_t numelems) { unsigned int tid = threadidx.x; unsigned int mid = threadidx.x + blockidx.x * blockdim.x; unsigned int val; if (mid < numelems) { val = d_outputvals[mid]; } __syncthreads(); if (mid < numelems) { d_outputvals[val] = d_inputvals[mid]; d_outputpos[val] = d_inputpos[mid]; } __syncthreads(); } void your_sort(unsigned int* const d_inputvals, unsigned int* const d_inputpos, unsigned int* const d_outputvals, unsigned int* const d_outputpos, const size_t numelems) { printf("numelems:%d\n", numelems); unsigned int block_nums; if (numelems % block_size == 0) { block_nums = numelems / block_size; } else { block_nums = numelems / block_size + 1; } dim3 scan_sum_blockdim(block_size); dim3 scan_sum_griddim(block_nums); dim3 scan_griddim(1); dim3 scan_blockdim(block_size); dim3 scan_large_sum_blockdim(block_size); dim3 scan_large_sum_griddim(block_nums); unsigned int* cu_inputvals; unsigned int* cu_inputpos; unsigned int* cu_outputvals; unsigned int* cu_outputpos; unsigned int* cu_block; cu_inputvals = d_inputvals; cu_inputpos = d_inputpos; cu_outputvals = d_outputvals; cu_outputpos = d_outputpos; unsigned int* d_histogram; checkcudaerrors(cudamalloc(&d_histogram, 2 * sizeof(unsigned int))); checkcudaerrors(cudamalloc(&cu_inputvals, numelems * sizeof(unsigned int))); checkcudaerrors(cudamalloc(&cu_inputpos, numelems * sizeof(unsigned int))); checkcudaerrors(cudamemcpy(cu_inputvals, d_inputvals, sizeof(unsigned int) * numelems, cudamemcpydevicetodevice)); checkcudaerrors(cudamemcpy(cu_inputpos, d_inputpos, sizeof(unsigned int) * numelems, cudamemcpydevicetodevice)); checkcudaerrors(cudamalloc(&cu_outputvals, numelems * sizeof(unsigned int))); checkcudaerrors(cudamalloc(&cu_outputpos, numelems * sizeof(unsigned int))); checkcudaerrors(cudamalloc(&cu_block, block_nums * sizeof(unsigned int))); //show_data(d_inputvals, 100, "cu_inputvals"); printf("numelems:%d\n", numelems); checkcudaerrors(cudamemset(d_outputvals, 0, block_nums * sizeof(unsigned int))); checkcudaerrors(cudamemset(cu_outputvals, 0, block_nums * sizeof(unsigned int))); checkcudaerrors(cudamemset(cu_outputpos, 0, block_nums * sizeof(unsigned int))); checkcudaerrors(cudamemset(d_outputpos, 0, block_nums * sizeof(unsigned int))); (unsigned int pass = 1; pass < 32; ++pass) { checkcudaerrors(cudamemset(d_histogram, 0, 2 * sizeof(unsigned int))); histogramkernel<<<scan_sum_griddim, scan_sum_blockdim>>>(d_inputvals, numelems, d_histogram, pass); checkcudaerrors(cudamemset(cu_block, 0, block_nums * sizeof(unsigned int))); scan_sum_kernel<<<scan_sum_griddim, scan_sum_blockdim>>>(cu_inputvals, pass, cu_outputvals, cu_block, numelems, block_nums); cudadevicesynchronize(); checkcudaerrors(cudagetlasterror()); scan_kernel<<<1, 256>>>(cu_block, block_nums); cudadevicesynchronize(); checkcudaerrors(cudagetlasterror()); scan_large_sum_kernel<<<scan_large_sum_griddim, scan_large_sum_blockdim>>>(cu_block, cu_outputvals, cu_outputpos, cu_inputvals, cu_inputpos, d_histogram, pass, block_nums, numelems); //cudadevicesynchronize(); checkcudaerrors(cudagetlasterror()); scatter_kernel<<<scan_sum_griddim, scan_sum_blockdim>>>(cu_inputvals, d_inputpos, cu_outputvals, cu_outputpos, numelems); checkcudaerrors(cudamemcpy(cu_inputvals, cu_outputvals, sizeof(unsigned int) * numelems, cudamemcpydevicetodevice)); checkcudaerrors(cudamemcpy(cu_inputpos, cu_outputpos, sizeof(unsigned int) * numelems, cudamemcpydevicetodevice)); } checkcudaerrors(cudamemcpy(d_outputvals, cu_outputvals, sizeof(unsigned int) * numelems, cudamemcpydevicetodevice)); checkcudaerrors(cudamemcpy(d_outputpos, cu_outputpos, sizeof(unsigned int) * numelems, cudamemcpydevicetodevice)); checkcudaerrors(cudafree(cu_block)); checkcudaerrors(cudafree(cu_inputvals)); checkcudaerrors(cudafree(cu_inputpos)); checkcudaerrors(cudafree(cu_outputvals)); checkcudaerrors(cudafree(cu_outputpos)); }
i had test program on small numbers. cannot find errors in code. can me?
Comments
Post a Comment