cuda - Intro to Parallel Programming CS344 -


when doing exercise on udacity course, intro parallel programming cs344, got same errors. i'm not sure code right, , cannot input data. here's error message , code. exercise problem set 4.

---------- numelems:220480 numelems:220480 cuda error at: student_func.cu:238 unspecified launch failure cudagetlasterror() terminate called after throwing instance of 'thrust::system::system_error'   what():  unload of cuda runtime failed  unable execute code. did set grid and/or block size correctly? code compiled!  

my code:

//udacity hw 4 //radix sorting  #include "reference_calc.cpp" #include "utils.h" #include <cstdio> #define block_size 1024  /* red eye removal    ===============     assignment implementing red eye removal.     accomplished first creating score every pixel tells how    red eye pixel.  have done -    receiving scores , need sort them in ascending order    know pixels alter remove red eye.     note: ascending order == smallest largest     each score associated position, when sort scores, must    move positions accordingly.     implementing parallel radix sort cuda    ==========================================     basic idea construct histogram on each pass of how many of each    "digit" there are.   scan histogram know put    output of each digit.  example, first 1 must come after    0s have know how many 0s there able start moving 1s    correct position.     1) histogram of number of occurrences of each digit    2) exclusive prefix sum of histogram    3) determine relative offset of each digit         example [0 0 1 1 0 0 1]                 ->  [0 1 0 1 2 3 2]    4) combine results of steps 2 & 3 determine final       output location each element , move there     lsb radix sort out-of-place sort , need ping-pong values    between input , output buffers have provided.  make sure final    sorted results end in output buffer!  hint: may need copy    @ end.   */  __global__ void histogramkernel(unsigned int * input, unsigned int size, unsigned int* histogram, unsigned int pass) {     int mid = threadidx.x + blockidx.x * blockdim.x;     if (mid < size) {         atomicadd(&histogram[(input[mid]>>pass) & 0x01], 1);     } }  __global__ void scan_sum_kernel(unsigned int* input_vals, unsigned int pass, unsigned int * output, unsigned int* output_block, unsigned int size, unsigned int block_num) {     unsigned int tid = threadidx.x;     unsigned int mid = threadidx.x + blockidx.x * blockdim.x;     __shared__ unsigned int shared_input_vals[block_size];     __shared__ unsigned int shared_output[block_size];     if (mid >= size) {         shared_input_vals[tid] = 0xffffffff;     } else {         shared_input_vals[tid] = input_vals[mid];     }      __syncthreads();      if (tid == 0 || ((shared_input_vals[tid - 1] >> pass) & 0x01)) {         shared_output[tid] = 0;     } else {         shared_output[tid] = 1;     }      __syncthreads();      (unsigned int = 1; < block_size; <<= 1) {         unsigned int val = 0;         if (tid >= i) {             val = shared_output[tid - i];         }         __syncthreads();         shared_output[tid] += val;         __syncthreads();     }      if (mid < size) {         output[mid] = shared_output[tid];         if ((mid == size - 1) || ((tid == block_size-1) && (mid < size))) {             output_block[mid/block_size] = shared_output[tid];             if (!((shared_input_vals[tid] >> pass) & 0x01)) {                 //output_block[mid/block_size] += 1;                 output_block[blockidx.x] += 1;             }         }     }     __syncthreads(); }  __global__ void scan_kernel(unsigned int* output_block, unsigned int block_num) {     __shared__ unsigned int shared_output[block_size];      if (threadidx.x >= block_num || threadidx.x == 0) {         shared_output[threadidx.x] = 0x0;     }  else {         shared_output[threadidx.x] = output_block[threadidx.x - 1];     }      (unsigned int = 1; < block_num; <<= 1) {         unsigned int val = 0;         if (threadidx.x >= i) {             val = shared_output[threadidx.x - i];         }         __syncthreads();         shared_output[threadidx.x] += val;         __syncthreads();     }      if (threadidx.x < block_num) {         output_block[threadidx.x] = shared_output[threadidx.x];     }     __syncthreads(); }  void show_data(unsigned int* d_data, unsigned int len, char* tag) {    unsigned int* h_data = (unsigned int*) malloc(len * sizeof(unsigned int));    cudamemcpy(h_data, d_data, sizeof(unsigned int) * len, cudamemcpydevicetohost);    (unsigned int = max((int)len - 1000, 0); < len; ++i) {        printf("%s[%u]=%u; ", tag, i, h_data[i]);    }    free(h_data); }  __global__ void scan_large_sum_kernel(unsigned int* output_block,                                       unsigned int* output_val,                                       unsigned int* output_pos,                                       unsigned int* input_val,                                       unsigned int* input_pos,                                       unsigned int* histogram,                                       unsigned int pass,                                       unsigned int block_num,                                       unsigned int size) {      __shared__ unsigned int shared_prefix_sum[block_size];     unsigned int tid = threadidx.x;     unsigned int mid = threadidx.x + blockidx.x * blockdim.x;     shared_prefix_sum[tid] = 0;     __syncthreads();     if (mid >= size) {         shared_prefix_sum[tid] = output_val[size-1] + 1;     } else {         shared_prefix_sum[tid] = output_block[mid / block_size] + output_val[mid];     }     __syncthreads();       if (mid < size) {         unsigned int location = shared_prefix_sum[tid];         if ((input_val[mid] >> pass) & 0x01) {             location = mid + histogram[0] - shared_prefix_sum[tid];         }          //output_val[location] = input_val[mid];         //output_pos[location] = input_pos[mid];         output_val[mid] = location;      }     __syncthreads();  }  __global__ void scatter_kernel(unsigned int* d_inputvals,                                unsigned int* d_inputpos,                                unsigned int* d_outputvals,                                unsigned int* d_outputpos,                                size_t numelems) {     unsigned int tid = threadidx.x;     unsigned int mid = threadidx.x + blockidx.x * blockdim.x;     unsigned int val;     if (mid < numelems) {         val = d_outputvals[mid];     }     __syncthreads();     if (mid < numelems) {         d_outputvals[val] = d_inputvals[mid];         d_outputpos[val] = d_inputpos[mid];     }     __syncthreads(); } void your_sort(unsigned int* const d_inputvals,                unsigned int* const d_inputpos,                unsigned int* const d_outputvals,                unsigned int* const d_outputpos,                const size_t numelems) {    printf("numelems:%d\n", numelems);     unsigned int block_nums;    if (numelems % block_size == 0) {       block_nums = numelems / block_size;    } else {       block_nums = numelems / block_size + 1;    }    dim3 scan_sum_blockdim(block_size);    dim3 scan_sum_griddim(block_nums);    dim3 scan_griddim(1);    dim3 scan_blockdim(block_size);    dim3 scan_large_sum_blockdim(block_size);    dim3 scan_large_sum_griddim(block_nums);    unsigned int* cu_inputvals;    unsigned int* cu_inputpos;    unsigned int* cu_outputvals;    unsigned int* cu_outputpos;    unsigned int* cu_block;    cu_inputvals = d_inputvals;    cu_inputpos = d_inputpos;    cu_outputvals = d_outputvals;    cu_outputpos = d_outputpos;    unsigned int* d_histogram;    checkcudaerrors(cudamalloc(&d_histogram, 2 * sizeof(unsigned int)));    checkcudaerrors(cudamalloc(&cu_inputvals, numelems * sizeof(unsigned int)));    checkcudaerrors(cudamalloc(&cu_inputpos, numelems * sizeof(unsigned int)));    checkcudaerrors(cudamemcpy(cu_inputvals, d_inputvals, sizeof(unsigned int) * numelems, cudamemcpydevicetodevice));    checkcudaerrors(cudamemcpy(cu_inputpos, d_inputpos, sizeof(unsigned int) * numelems, cudamemcpydevicetodevice));    checkcudaerrors(cudamalloc(&cu_outputvals, numelems * sizeof(unsigned int)));    checkcudaerrors(cudamalloc(&cu_outputpos, numelems * sizeof(unsigned int)));    checkcudaerrors(cudamalloc(&cu_block, block_nums * sizeof(unsigned int)));    //show_data(d_inputvals, 100, "cu_inputvals");    printf("numelems:%d\n", numelems);    checkcudaerrors(cudamemset(d_outputvals, 0, block_nums * sizeof(unsigned int)));    checkcudaerrors(cudamemset(cu_outputvals, 0, block_nums * sizeof(unsigned int)));    checkcudaerrors(cudamemset(cu_outputpos, 0, block_nums * sizeof(unsigned int)));    checkcudaerrors(cudamemset(d_outputpos, 0, block_nums * sizeof(unsigned int)));    (unsigned int pass = 1; pass < 32; ++pass) {         checkcudaerrors(cudamemset(d_histogram, 0, 2 * sizeof(unsigned int)));         histogramkernel<<<scan_sum_griddim, scan_sum_blockdim>>>(d_inputvals, numelems, d_histogram, pass);          checkcudaerrors(cudamemset(cu_block, 0, block_nums * sizeof(unsigned int)));         scan_sum_kernel<<<scan_sum_griddim, scan_sum_blockdim>>>(cu_inputvals, pass, cu_outputvals, cu_block, numelems, block_nums);         cudadevicesynchronize(); checkcudaerrors(cudagetlasterror());          scan_kernel<<<1, 256>>>(cu_block, block_nums);         cudadevicesynchronize(); checkcudaerrors(cudagetlasterror());          scan_large_sum_kernel<<<scan_large_sum_griddim, scan_large_sum_blockdim>>>(cu_block, cu_outputvals, cu_outputpos, cu_inputvals, cu_inputpos, d_histogram, pass, block_nums, numelems);         //cudadevicesynchronize(); checkcudaerrors(cudagetlasterror());          scatter_kernel<<<scan_sum_griddim, scan_sum_blockdim>>>(cu_inputvals, d_inputpos, cu_outputvals, cu_outputpos, numelems);         checkcudaerrors(cudamemcpy(cu_inputvals, cu_outputvals, sizeof(unsigned int) * numelems, cudamemcpydevicetodevice));         checkcudaerrors(cudamemcpy(cu_inputpos, cu_outputpos, sizeof(unsigned int) * numelems, cudamemcpydevicetodevice));     }    checkcudaerrors(cudamemcpy(d_outputvals, cu_outputvals, sizeof(unsigned int) * numelems, cudamemcpydevicetodevice));    checkcudaerrors(cudamemcpy(d_outputpos, cu_outputpos, sizeof(unsigned int) * numelems, cudamemcpydevicetodevice));     checkcudaerrors(cudafree(cu_block));    checkcudaerrors(cudafree(cu_inputvals));    checkcudaerrors(cudafree(cu_inputpos));    checkcudaerrors(cudafree(cu_outputvals));    checkcudaerrors(cudafree(cu_outputpos));  } 

i had test program on small numbers. cannot find errors in code. can me?


Comments

Popular posts from this blog

angular - Ionic slides - dynamically add slides before and after -

minify - Minimizing css files -

Add a dynamic header in angular 2 http provider -