Commit 174c2a1b authored by Neil Gershenfeld

wip

parent 73d4808e
Pipeline #5090 passed
//
// cudapi.cu
// Neil Gershenfeld 3/1/20
// calculation of pi by a CUDA sum
// pi = 3.14159265358979323846
//
#include <cstdio>
#include <chrono>
#include <cstdint>

uint64_t blocks = 1024;
uint64_t threads = 1024;
uint64_t nloop = 1000000;
uint64_t npts = blocks*threads;

// each thread accumulates its own partial sum of nloop series terms
__global__ void init(double *arr,uint64_t nloop) {
    uint64_t i = blockIdx.x*blockDim.x+threadIdx.x;
    uint64_t start = nloop*i+1;
    uint64_t end = nloop*(i+1)+1;
    arr[i] = 0;
    for (uint64_t j = start; j < end; ++j)
        arr[i] += 0.5/((j-0.75)*(j-0.25));
}

// one pass of a pairwise reduction: fold the upper half onto the lower half
__global__ void reduce_sum(double *arr,uint64_t len) {
    uint64_t i = blockIdx.x*blockDim.x+threadIdx.x;
    if (i < len)
        arr[i] += arr[i+len];
}

// repeatedly halve until the total sum sits in arr[0]
void reduce(double *arr) {
    uint64_t len = npts >> 1;
    while (1) {
        reduce_sum<<<blocks,threads>>>(arr,len);
        len = len >> 1;
        if (len == 0)
            return;
    }
}

int main(void) {
    double harr[1],*darr;
    cudaMalloc(&darr,npts*sizeof(double));
    auto tstart = std::chrono::high_resolution_clock::now();
    init<<<blocks,threads>>>(darr,nloop);
    reduce(darr);
    cudaDeviceSynchronize();
    auto tend = std::chrono::high_resolution_clock::now();
    auto dt = std::chrono::duration_cast<std::chrono::microseconds>(tend-tstart).count();
    auto mflops = npts*nloop*5.0/dt;
    cudaMemcpy(harr,darr,sizeof(double),cudaMemcpyDeviceToHost);
    printf("npts = %lu, nloop = %lu, pi = %lf\n",npts,nloop,harr[0]);
    printf("time = %f, estimated MFlops = %f\n",1e-6*dt,mflops);
    cudaFree(darr);
    return 0;
}
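The sum that init() accumulates is the Leibniz series for pi in disguise; a partial-fraction split of each term (added here for clarity, not in the source) shows why the per-thread chunks converge to pi:

\[
\frac{1/2}{(j-\tfrac{3}{4})(j-\tfrac{1}{4})}
 = \frac{1}{j-\tfrac{3}{4}} - \frac{1}{j-\tfrac{1}{4}}
 = \frac{4}{4j-3} - \frac{4}{4j-1},
\qquad
\sum_{j=1}^{\infty}\left(\frac{4}{4j-3}-\frac{4}{4j-1}\right)
 = 4\left(1-\tfrac{1}{3}+\tfrac{1}{5}-\tfrac{1}{7}+\cdots\right)=\pi .
\]

The MFlops estimate counts the terms at five floating-point operations each (presumably two subtractions, a multiply, a divide, and an accumulate), which is where the npts*nloop*5.0 factor comes from.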
 #
 # numbapig.py
-# Neil Gershenfeld 2/9/20
-# calculation of pi by a Numba GPU sum
+# Neil Gershenfeld 3/1/20
+# calculation of pi by a Numba CUDA sum
 # pi = 3.14159265358979323846
 #
 from numba import cuda
 import numpy as np
 import time
-#
-# problem size
-#
-block_size = 2**10
-grid_size = 2**21
-NPTS = grid_size*block_size
-#
-# kernels and functions
-#
+block_size = 1024
+grid_size = 1024
+nloop = 1000000
+npts = grid_size*block_size
 @cuda.jit
-def init(arr):
-    i = 1+cuda.grid(1)
-    arr[i-1] = 0.5/((i-0.75)*(i-0.25))
-    #arr[i-1] = i # for testing reduction
-#
-@cuda.reduce
-def Numba_reduce(a,b):
-    return a+b
-#
+def init(arr,nloop):
+    i = cuda.blockIdx.x*cuda.blockDim.x+cuda.threadIdx.x
+    start = nloop*i+1
+    end = nloop*(i+1)+1
+    arr[i] = 0
+    for j in range(start,end):
+        arr[i] += 0.5/((j-0.75)*(j-0.25))
 @cuda.jit
-def CUDA_sum(arr,len):
-    i = cuda.grid(1)
+def reduce_sum(arr,len):
+    i = cuda.blockIdx.x*cuda.blockDim.x+cuda.threadIdx.x
     if (i < len):
         arr[i] += arr[i+len]
-#
-def CUDA_reduce(arr,NPTS):
-    len = NPTS >> 1
+def reduce(arr,npts):
+    len = npts >> 1
     while (1):
-        CUDA_sum[grid_size,block_size](arr,len)
-        cuda.synchronize()
+        reduce_sum[grid_size,block_size](arr,len)
         len = len >> 1
         if (len == 0):
             return
-#
-@cuda.jit
-def CUDA_result(arr,result):
-    i = cuda.grid(1)
-    if (i == 0):
-        result[0] = arr[0]
-#
-# device array
-#
-arr = cuda.device_array(NPTS,np.float32)
-result = cuda.device_array(1,np.float32)
-#arr = cuda.device_array(NPTS,np.int64) # for testing reduction
-#result = cuda.device_array(1,np.int64) # for testing reduction
-#
-# compile kernels by calling them
-#
-init[grid_size,block_size](arr)
-pi = Numba_reduce(arr)
-CUDA_reduce(arr,NPTS)
-CUDA_result(arr,result)
-#
-# CUDA kernel array calculation
-#
-start_time = time.time()
-init[grid_size,block_size](arr)
-cuda.synchronize()
-end_time = time.time()
-mflops = NPTS*4.0/(1.0e6*(end_time-start_time))
-print("CUDA kernel array calculation:")
-print(" time = %f, estimated MFlops = %f"%(end_time-start_time,mflops))
-#
-# Numba reduce
-#
-init[grid_size,block_size](arr)
+darr = cuda.device_array(npts,np.float64)
+init[grid_size,block_size](darr,nloop) # compile kernel
+reduce(darr,npts) # compile kernel
 start_time = time.time()
-pi = Numba_reduce(arr)
-end_time = time.time()
-mflops = NPTS*1.0/(1.0e6*(end_time-start_time))
-print("Numba reduce:")
-print(" time = %f, estimated MFlops = %f"%(end_time-start_time,mflops))
-#
-# both with Numba reduce
-#
-start_time = time.time()
-init[grid_size,block_size](arr)
-cuda.synchronize()
-pi = Numba_reduce(arr)
-end_time = time.time()
-mflops = NPTS*5.0/(1.0e6*(end_time-start_time))
-print("both with Numba reduce:")
-print(" NPTS = %d, pi = %f"%(NPTS,pi))
-print(" time = %f, estimated MFlops = %f"%(end_time-start_time,mflops))
-#
-# CUDA kernel reduction
-#
-init[grid_size,block_size](arr)
-start_time = time.time()
-CUDA_reduce(arr,NPTS)
-end_time = time.time()
-mflops = NPTS*1.0/(1.0e6*(end_time-start_time))
-print("CUDA kernel reduction:")
-print(" time = %f, estimated MFlops = %f"%(end_time-start_time,mflops))
-#
-# both with CUDA kernel reduction
-#
-start_time = time.time()
-init[grid_size,block_size](arr)
-cuda.synchronize()
-CUDA_reduce(arr,NPTS)
-CUDA_result(arr,result)
+init[grid_size,block_size](darr,nloop)
+reduce(darr,npts)
 cuda.synchronize()
 end_time = time.time()
-pi = result.copy_to_host()
-mflops = NPTS*5.0/(1.0e6*(end_time-start_time))
-print("both with CUDA kernel reduction:")
-print(" NPTS = %d, pi = %f"%(NPTS,pi[0]))
-print(" time = %f, estimated MFlops = %f"%(end_time-start_time,mflops))
-#
-# both with CUDA kernel reduction and transfer
-#
-start_time = time.time()
-init[grid_size,block_size](arr)
-CUDA_reduce(arr,NPTS)
-CUDA_result(arr,result)
-pi = result.copy_to_host()
-end_time = time.time()
-mflops = NPTS*5.0/(1.0e6*(end_time-start_time))
-print("both with CUDA kernel reduction and transfer:")
-print(" NPTS = %d, pi = %f"%(NPTS,pi[0]))
+mflops = npts*nloop*5.0/(1.0e6*(end_time-start_time))
+harr = darr.copy_to_host()
+print("npts = %d, nloop = %d, pi = %f"%(npts,nloop,harr[0]))
+print("time = %f, estimated MFlops = %f"%(end_time-start_time,mflops))
@@ -6,6 +6,8 @@
|88,333|[mpimppi.c](hybrid/mpimppi.c)|C, MPI+OpenMP, 1024 nodes, 64 cores/node, 4 threads/core<br>cc mpimppi.c -o mpimppi -O3 -ffast-math -fopenmp|Argonne ALCF Theta<br>Cray XC40|Oct 9, 2019|
|2,117|[mpipi2.c](MPI/mpipi2.c)|C, MPI, 10 nodes, 96 cores/node<br>mpicc mpipi2.c -o mpipi2 -O3 -ffast-math|Intel 2x Xeon Platinum 8175M|Oct 24, 2019|
|2,102|[mpipi2.py](Python/mpipi2.py)|Python, Numba, MPI<br>10 nodes, 96 cores/node|Intel 2x Xeon Platinum 8175M|Feb 6, 2020|
+|1,635|[cudapi.cu](CUDA/cudapi.cu)|C++, CUDA<br>5120 cores|NVIDIA V100|March 1, 2020|
+|1,090|[numbapig.py](Python/numbapig.py)|Python, Numba, CUDA<br>5120 cores|NVIDIA V100|March 1, 2020|
|315|[numbapip.py](Python/numbapip.py)|Python, Numba, parallel, fastmath<br>96 cores|Intel 2x Xeon Platinum 8175M|Feb 7, 2020|
|272|[threadpi.c](C/threadpi.c)|C, 96 threads<br>gcc threadpi.c -o threadpi -O3 -ffast-math -pthread|Intel 2x Xeon Platinum 8175M|Jun 3, 2019|
|211|[mpipi2.c](MPI/mpipi2.c)|C, MPI, 1 node, 96 cores<br>mpicc mpipi2.c -o mpipi2 -O3 -ffast-math|Intel 2x Xeon Platinum 8175M|Oct 24, 2019|
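The two added rows can be cross-checked against the programs' own operation count of npts*nloop*5 flops; assuming the leading column of the table is in GFlops (as the repository README labels it), a back-of-the-envelope check:

\[
1024\times1024\times10^{6}\times5 \approx 5.24\times10^{12}\ \text{flops},
\qquad
\frac{5.24\times10^{12}\ \text{flops}}{1.635\times10^{12}\ \text{flops/s}} \approx 3.2\ \text{s},
\]

so the cudapi.cu entry corresponds to a run of roughly three seconds on the V100, and numbapig.py's 1,090 implies about 4.8 s for the same operation count.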