fork download
  1.  
  2. #include "cuda_runtime.h"
  3. #include "device_launch_parameters.h"
  4.  
  5. #include <stdio.h>
  6.  
  7. #define BLOCK_DIM 2
  8. #define GRID_DIM 2
  9. #define STEPS 4
  10.  
  /*
   * Sums fixed-length runs of `input` into `output`, one run ("path") per thread.
   *
   * Launch layout: 1D grid of 1D blocks; the global thread index selects the
   * path. Threads whose index is >= paths do nothing, so the grid may be
   * larger than the number of paths.
   *
   * output: device array with at least `paths` ints; output[p] receives the sum.
   * input:  device array with at least paths * steps ints; path p occupies
   *         elements [p * steps, p * steps + steps).
   * paths:  number of runs to sum.
   * steps:  number of ints per run; a value <= 0 yields a sum of 0.
   *
   * No shared memory is used and no synchronization is required.
   */
  __global__ void addKernel(int *output, int *input, int paths, int steps)
  {
      const int idx = blockIdx.x * blockDim.x + threadIdx.x;
      if (idx >= paths)
          return;

      /*
       * The whole global index must be scaled by `steps`. Scaling only
       * threadIdx.x makes every block after the first read the wrong slice.
       * The product is widened so large inputs cannot overflow int.
       */
      const size_t base = (size_t)idx * (size_t)steps;

      int sum = 0;
      for (int s = 0; s < steps; ++s)
      {
          sum += input[base + s];
      }
      output[idx] = sum;
  }
  26.  
  /*
   * Reports a failed CUDA runtime call on stderr.
   * Returns true when `status` is cudaSuccess, false otherwise.
   */
  static bool cudaOk(cudaError_t status, const char *what)
  {
      if (status == cudaSuccess)
          return true;
      fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
      return false;
  }

  /*
   * Host driver: fills an input array with 0..Data_size-1, runs addKernel with
   * GRID_DIM blocks of BLOCK_DIM threads so each thread sums STEPS values, and
   * prints each per-path sum next to the addends it is expected to cover.
   *
   * Returns 0 on success and 1 if any CUDA call or the kernel launch fails.
   */
  int main()
  {
      const int Data_size = BLOCK_DIM * GRID_DIM * STEPS;
      const int PATHS = BLOCK_DIM * GRID_DIM;
      int h_input[Data_size];
      int h_output[PATHS] = { 0 };

      for (int i = 0; i < Data_size; i++)
      {
          h_input[i] = i;
      }

      /* Start as NULL so the cleanup below is safe even if allocation fails. */
      int *d_input = NULL;
      int *d_output = NULL;

      /* && short-circuits, so the first failing call stops the sequence. */
      bool ok =
          cudaOk(cudaSetDevice(0), "cudaSetDevice") &&
          cudaOk(cudaMalloc((void**)&d_input, sizeof(int) * Data_size), "cudaMalloc(d_input)") &&
          cudaOk(cudaMalloc((void**)&d_output, sizeof(int) * PATHS), "cudaMalloc(d_output)") &&
          cudaOk(cudaMemcpy(d_input, h_input, sizeof(int) * Data_size, cudaMemcpyHostToDevice),
                 "cudaMemcpy(host to device)") &&
          cudaOk(cudaMemset(d_output, 0, sizeof(int) * PATHS), "cudaMemset(d_output)");

      if (ok)
      {
          addKernel<<<GRID_DIM, BLOCK_DIM, 0>>>(d_output, d_input, PATHS, STEPS);

          /*
           * A launch returns no status of its own: cudaGetLastError reports a
           * bad launch configuration, and the synchronize reports faults
           * raised while the kernel ran.
           */
          ok = cudaOk(cudaGetLastError(), "addKernel launch") &&
               cudaOk(cudaDeviceSynchronize(), "addKernel execution") &&
               cudaOk(cudaMemcpy(h_output, d_output, sizeof(int) * PATHS, cudaMemcpyDeviceToHost),
                      "cudaMemcpy(device to host)");
      }

      if (ok)
      {
          for (int i = 0; i < PATHS; i++)
          {
              const int k = i * STEPS;
              /* Print the addends in a loop so the line stays right if STEPS changes. */
              printf("Output[%d]={", i);
              for (int s = 0; s < STEPS; s++)
              {
                  printf("%s%d", (s == 0) ? "" : "+", k + s);
              }
              printf("}=%d.\n", h_output[i]);
          }
      }

      /* cudaFree(NULL) is a no-op, so this is safe on every path. */
      cudaFree(d_input);
      cudaFree(d_output);
      cudaDeviceReset();
      return ok ? 0 : 1;
  }
  65.  
Compilation error #stdin compilation error #stdout 0s 0KB
stdin
Standard input is empty
compilation info
prog.c:2:26: fatal error: cuda_runtime.h: No such file or directory
 #include "cuda_runtime.h"
                          ^
compilation terminated.
stdout
Standard output is empty