#include <stdio.h>
#include <stdlib.h>

// Abort with a readable message if a CUDA runtime call fails. Without this,
// a failed allocation/copy/launch would go unnoticed and the program would
// print a stale host value.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

// Reads *number, adds 1 to it 255 times, and stores the result in *out.
//
// Both pointers must refer to device-accessible memory holding one int each.
// The kernel does no thread indexing: every launched thread computes and
// stores the same value, so it is launched below with a single thread.
__global__ void cuda_hello(int* number, int* out) {
    // Accumulate in a local variable and write the result back once, rather
    // than doing a read-modify-write through *out on every iteration.
    int value = *number;
    for (int i = 0; i < 255; i++) {
        value += 1;
    }
    *out = value;
}

// Host driver: uploads the input value 22, runs cuda_hello with one block of
// one thread, downloads the result, and prints it.
int main() {
    int number = 22;
    int out = 0;
    int* d_number = NULL;
    int* d_out = NULL;

    CUDA_CHECK(cudaMalloc((void**)&d_number, sizeof(int)));
    CUDA_CHECK(cudaMalloc((void**)&d_out, sizeof(int)));

    // Only the input needs uploading; the kernel overwrites *d_out
    // unconditionally, so its previous contents are irrelevant.
    CUDA_CHECK(cudaMemcpy(d_number, &number, sizeof(int), cudaMemcpyHostToDevice));

    cuda_hello<<<1, 1>>>(d_number, d_out);
    CUDA_CHECK(cudaGetLastError());       // launch-configuration errors
    CUDA_CHECK(cudaDeviceSynchronize());  // errors raised while the kernel ran

    CUDA_CHECK(cudaMemcpy(&out, d_out, sizeof(int), cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaFree(d_number));
    CUDA_CHECK(cudaFree(d_out));

    printf("Number: %i\n", out);
    return 0;
}