fork download
make
mkdir -p build
/usr/local/cuda/bin/nvcc -c -I/usr/local/cuda/include *.cu
nvcc warning : The 'compute_10' and 'sm_10' architectures are deprecated, and may be removed in a future release.
ptxas /var/folders/8v/1gw9hxtx40qdp2slztw3whq80000gn/T//tmpxft_00012625_00000000-5_CUDA_ConvNN.ptx, line 745; warning : Double is not supported. Demoting to float
clang++ -std=c++11 -c -m32 -I/usr/local/cuda/include *.cpp
/usr/local/cuda/bin/nvcc -m32 -L/usr/local/cuda/lib -lcuda -lcudart -lm -o build/main *.o
nvcc warning : The 'compute_10' and 'sm_10' architectures are deprecated, and may be removed in a future release.
clang: error: unknown argument: '-malign-double' [-Wunused-command-line-argument-hard-error-in-future]
clang: note: this will be a hard error (cannot be downgraded to a warning) in the future
make: *** [build] Error 1

Here is the makefile:
------------------
------------------
# Name of the executable produced in $(BUILD_DIR).
PROJECT_NAME = main

# Root of the CUDA toolkit installation. Override on the command line
# (e.g. `make CUDAPATH=/opt/cuda`) if the toolkit lives elsewhere; nvcc and
# the include/library search paths are all derived from it.
CUDAPATH = /usr/local/cuda

# NVCC is the path to nvcc, taken from the toolkit root above so the two
# can never point at different installations.
# CC is the compiler for the C++ host code (note: a C++ compiler, despite
# the conventional C name).
NVCC = $(CUDAPATH)/bin/nvcc
CC = clang++ -std=c++11

# Directory that receives the linked executable.
BUILD_DIR = build
# Host compiler, nvcc and the linker must all target the same word size.
# Building with -m32 makes nvcc hand -malign-double to the host compiler,
# which clang rejects, so the link step fails. Default to 64-bit everywhere;
# use `make BITS=32` only with a host compiler that accepts 32-bit builds.
BITS = 64

# Optional GPU architecture flag for nvcc, empty by default so nvcc keeps
# its own default. Example: `make GPU_ARCH=-arch=sm_30`.
# NOTE(review): the nvcc default (sm_10 in the reported build) is deprecated
# and appears to be why doubles are demoted to float - confirm the right
# value for the target GPU.
GPU_ARCH ?=

# CFLAGS    - compile-only flags for the C++ host sources.
# NVCCFLAGS - compile-only flags for the CUDA sources.
# LFLAGS    - link flags and libraries (CUDA driver, CUDA runtime, libm).
CFLAGS = -c -m$(BITS) -I$(CUDAPATH)/include
NVCCFLAGS = -c -m$(BITS) $(GPU_ARCH) -I$(CUDAPATH)/include
LFLAGS = -m$(BITS) -L$(CUDAPATH)/lib -lcuda -lcudart -lm

# Default goal: build the executable, then delete the intermediate objects.
# clean is invoked from the recipe instead of being listed as a prerequisite
# so that `make -j` cannot remove the .o files while build is still compiling
# or linking them.
.PHONY: all
all: build
	$(MAKE) clean

# Compile all sources, then link every object file in the current directory
# into $(BUILD_DIR)/$(PROJECT_NAME).
# Declared phony because it is a command name, not a file: with the default
# BUILD_DIR a directory called "build" exists after the first run and must
# not be mistaken for an up-to-date target.
# The objects come before $(LFLAGS) so that order-sensitive linkers still
# resolve symbols from the libraries named there.
.PHONY: build
build: build_dir gpu cpu
	$(NVCC) -o $(BUILD_DIR)/$(PROJECT_NAME) *.o $(LFLAGS)

# Create the output directory; -p makes this a no-op when it already exists.
# Phony: the target name is a command, not the directory itself.
.PHONY: build_dir
build_dir:
	mkdir -p $(BUILD_DIR)

# Compile every CUDA source in the current directory to an object file.
# Phony so that a stray file named "gpu" cannot suppress the compile.
.PHONY: gpu
gpu:
	$(NVCC) $(NVCCFLAGS) *.cu

# Compile every C++ host source in the current directory to an object file.
# Phony so that a stray file named "cpu" cannot suppress the compile.
.PHONY: cpu
cpu:
	$(CC) $(CFLAGS) *.cpp

# Remove the intermediate object files from the current directory.
# $(RM) is `rm -f`, so this succeeds even when there is nothing to delete.
.PHONY: clean
clean:
	$(RM) *.o

# Run the previously built executable. This does not trigger a build.
.PHONY: run
run:
	./$(BUILD_DIR)/$(PROJECT_NAME)
Not running #stdin #stdout 0s 0KB
stdin
#include <stdio.h>
#define SIZE	1024

// Element-wise integer vector addition: c[i] = a[i] + b[i] for 0 <= i < n.
//
// Each thread handles one element, selected by its global index across the
// whole grid, so the kernel is correct for multi-block launches as well as
// for a single block (where blockIdx.x is 0 and the index is threadIdx.x).
//
//   a, b  input arrays, at least n elements each
//   c     output array, at least n elements
//   n     number of elements to add
__global__ void VectorAdd(int *a, int *b, int *c, int n)
{
	int i = blockIdx.x * blockDim.x + threadIdx.x;

	// Threads past the end of the arrays do nothing.
	if (i < n)
		c[i] = a[i] + b[i];
}

// Reports a failed CUDA runtime call on stderr.
// Returns 1 when `status` is an error, 0 when it is cudaSuccess.
static int cudaFailed(cudaError_t status, const char *what)
{
	if (status == cudaSuccess)
		return 0;

	fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
	return 1;
}

// Adds two SIZE-element vectors on the GPU and prints the first 10 sums.
// Returns 0 on success, 1 if any CUDA call fails.
int main()
{
	// Start as NULL so the cudaFree calls below are safe even when an
	// allocation never happened.
	int *a = NULL, *b = NULL, *c = NULL;
	int failed;

	// || short-circuits: stop allocating after the first failure.
	failed = cudaFailed(cudaMallocManaged(&a, SIZE * sizeof(int)), "cudaMallocManaged(a)")
	      || cudaFailed(cudaMallocManaged(&b, SIZE * sizeof(int)), "cudaMallocManaged(b)")
	      || cudaFailed(cudaMallocManaged(&c, SIZE * sizeof(int)), "cudaMallocManaged(c)");

	if (!failed)
	{
		for (int i = 0; i < SIZE; ++i)
		{
			a[i] = i;
			b[i] = i;
			c[i] = 0;
		}

		// One block of SIZE threads, one thread per element.
		// NOTE(review): SIZE must not exceed the device's limit on threads
		// per block - confirm for the target GPU.
		VectorAdd <<<1, SIZE>>> (a, b, c, SIZE);

		// cudaGetLastError reports launch problems; the synchronize waits
		// for the kernel and reports failures that occur while it runs.
		failed = cudaFailed(cudaGetLastError(), "VectorAdd launch")
		      || cudaFailed(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
	}

	if (!failed)
	{
		for (int i = 0; i < 10; ++i)
			printf("c[%d] = %d\n", i, c[i]);
	}

	cudaFree(a);
	cudaFree(b);
	cudaFree(c);

	return failed ? 1 : 0;
}
stdout
Standard output is empty