fork download
  1. #include "cuda_runtime.h"
  2. #include "device_launch_parameters.h"
  3.  
  4. #include <stdio.h>
  5.  
  6. #define BLOCK_DIM 4
  7. #define GRID_DIM 1
  8. #define STEPS 4
  9.  
  /*
   * addKernel: every thread sums one contiguous run of `steps` integers taken
   * from `input` and adds that sum into its own element of `output`.
   *
   * Layout: 1D grid of 1D blocks. Thread with flat global index `idx` reads
   *         input[idx*steps .. idx*steps + steps - 1] and updates output[idx].
   *         Only the first BLOCK_DIM * GRID_DIM threads do any work.
   *
   * output  device array with at least BLOCK_DIM * GRID_DIM elements. The
   *         kernel accumulates with `+=`, so the caller must initialise it
   *         (e.g. to zero) before the launch.
   * input   device array with at least BLOCK_DIM * GRID_DIM * steps elements.
   * steps   number of consecutive input elements summed by each thread.
   *
   * No shared memory is used and no synchronisation is required: each thread
   * touches a distinct output element.
   */
  __global__ void addKernel(int *output,int *input,int steps)
  {
      // Flat global thread index; also the output slot owned by this thread.
      int idx = threadIdx.x + blockIdx.x * blockDim.x;

      if (idx < BLOCK_DIM * GRID_DIM)
      {
          // Start of this thread's run. The whole global index is scaled by
          // `steps`; scaling only threadIdx.x would make the runs of different
          // blocks overlap as soon as more than one block is launched.
          int first = idx * steps;

          // Accumulate in a register and touch global memory once.
          int sum = 0;
          for (int i = first; i < first + steps; i++)
          {
              sum += input[i];
          }
          output[idx] += sum;
      }
  }
  23.  
  // Evaluates a CUDA runtime call inside main(); on failure prints the error
  // string to stderr, resets the device (which releases this process's device
  // allocations) and makes main() return 1.
  #define MAIN_CUDA_CHECK(call)                                              \
      do {                                                                   \
          cudaError_t status_ = (call);                                      \
          if (status_ != cudaSuccess) {                                      \
              fprintf(stderr, "CUDA error at line %d: %s\n", __LINE__,       \
                      cudaGetErrorString(status_));                          \
              cudaDeviceReset();                                             \
              return 1;                                                      \
          }                                                                  \
      } while (0)

  /*
   * Host driver: fills an input array with 0..Data_size-1, lets addKernel sum
   * it in runs of STEPS elements (one run per thread), copies the per-thread
   * sums back and prints them.
   *
   * Returns 0 on success, 1 if any CUDA runtime call or the kernel launch fails.
   */
  int main()
  {
      const int Data_size = BLOCK_DIM * GRID_DIM * STEPS;
      const int PATHS = BLOCK_DIM * GRID_DIM;
      int h_input[Data_size];
      int h_output[PATHS] = { 0 };

      for (int i = 0; i < Data_size; i++)
      {
          h_input[i] = i;
      }

      MAIN_CUDA_CHECK(cudaSetDevice(0));

      int *d_input = NULL;
      int *d_output = NULL;

      MAIN_CUDA_CHECK(cudaMalloc((void**)&d_input, sizeof(int) * Data_size));
      MAIN_CUDA_CHECK(cudaMalloc((void**)&d_output, sizeof(int) * PATHS));

      MAIN_CUDA_CHECK(cudaMemcpy(d_input, h_input, sizeof(int) * Data_size, cudaMemcpyHostToDevice));
      // The kernel accumulates into output with `+=` and cudaMalloc does not
      // clear memory, so the zeroed host buffer has to be uploaded first.
      MAIN_CUDA_CHECK(cudaMemcpy(d_output, h_output, sizeof(int) * PATHS, cudaMemcpyHostToDevice));

      addKernel <<<GRID_DIM, BLOCK_DIM, 0 >>>(d_output, d_input, STEPS);
      // A launch returns no status itself: pick up configuration errors here,
      // and execution errors at the synchronisation that follows.
      MAIN_CUDA_CHECK(cudaGetLastError());
      MAIN_CUDA_CHECK(cudaDeviceSynchronize());

      MAIN_CUDA_CHECK(cudaMemcpy(h_output, d_output, sizeof(int) * PATHS, cudaMemcpyDeviceToHost));

      for (int i = 0; i < PATHS; i++)
      {
          // First input value of the run summed by thread i.
          int k = i*STEPS;

          // Prints "Output[i]={k+k+1+...}=sum." with exactly STEPS terms.
          printf("Output[%d]={", i);
          for (int s = 0; s < STEPS; s++)
          {
              printf("%s%d", (s == 0) ? "" : "+", k + s);
          }
          printf("}=%d.\n", h_output[i]);
      }

      MAIN_CUDA_CHECK(cudaFree(d_input));
      MAIN_CUDA_CHECK(cudaFree(d_output));
      MAIN_CUDA_CHECK(cudaDeviceReset());
      return 0;
  }
  #undef MAIN_CUDA_CHECK
Compilation error #stdin compilation error #stdout 0s 0KB
stdin
Standard input is empty
compilation info
prog.c:1:26: fatal error: cuda_runtime.h: No such file or directory
 #include "cuda_runtime.h"
                          ^
compilation terminated.
stdout
Standard output is empty