// Repository-page residue from the original web capture (not source code):
// Git revision selector; commit by Jean-Christophe PLAGNIOL-VILLARD —
// "Archive released NEWS file as NEWS-0.5.0. Create new NEWS file from
// release script template."; note: "To find the state of this project's
// repository at the time of any of these versions, check out the tags."
// File listing entry: cudampipi.cu 2.44 KiB
//
// cudampipi.cu
// Neil Gershenfeld 10/14/20
// calculation of pi by a CUDA+MPI sum
// assumes one GPU per MPI rank
// pi = 3.14159265358979323846
//
#include <cstdint>
#include <cstdio>
#include <iostream>
#include <mpi.h>
//
using namespace std;
//
// Problem-size configuration.
// constexpr (rather than mutable globals) lets the compiler treat these as
// compile-time constants for unrolling/register allocation; values unchanged.
constexpr uint64_t blocks = 1024;            // grid size for every launch
constexpr uint64_t threads = 1024;           // block size for every launch
constexpr uint64_t npts = 1000000;           // series terms summed per thread
constexpr uint64_t nthreads = blocks*threads; // total threads = array length
constexpr int nloop = 10;                    // timed benchmark repetitions
//
// Each thread sums npts consecutive terms of the Leibniz-style series
//   0.5/((j-0.75)*(j-0.25))  (which totals pi over all j >= 1)
// into arr[global thread index].
// Grid layout: must be launched with exactly nthreads = gridDim.x*blockDim.x
// threads (no bounds guard); arr must hold nthreads doubles.
// rank offsets this MPI rank's slice of the global term range.
__global__ void init(double *arr,uint64_t npts,uint64_t nthreads,int rank) {
   uint64_t index = blockIdx.x*blockDim.x+threadIdx.x;
   // First term for this thread: 1-based, after all lower ranks and threads.
   uint64_t start = npts*index+npts*nthreads*rank+1;
   uint64_t end = start+npts;
   // Accumulate in a register; the original added into global memory every
   // iteration (a read-modify-write of arr[index] per term). One store at end.
   double sum = 0;
   for (uint64_t j = start; j < end; ++j)
      sum += 0.5/((j-0.75)*(j-0.25));
   arr[index] = sum;
}
//
// One round of a pairwise tree reduction over arr: folds the upper half
// [len, 2*len) into the lower half [0, len). Threads beyond len do nothing,
// so any grid size >= len is a valid launch.
__global__ void reduce_sum(double *arr,uint64_t len) {
   uint64_t i = blockIdx.x*blockDim.x+threadIdx.x;
   if (i >= len)
      return;
   arr[i] += arr[i+len];
}
//
// Host driver for the tree reduction: repeatedly halves the active length
// until arr[0] holds the sum of all nthreads partials.
// Fix: the original launched the full blocks x threads grid on every round,
// even when only a handful of threads pass the kernel's bounds guard; here
// the grid is right-sized per round (result is identical — reduce_sum guards).
// Launches are asynchronous; caller must synchronize before reading arr.
void reduce(double *arr) {
   for (uint64_t len = nthreads >> 1; len > 0; len >>= 1) {
      uint64_t nblocks = (len+threads-1)/threads; // ceil-div, at least 1
      reduce_sum<<<nblocks,threads>>>(arr,len);
   }
}
//
// Benchmark driver: every rank computes its slice of the series on its GPU,
// ranks combine partial sums with MPI_Reduce, and rank 0 reports pi and an
// estimated flop rate for each of nloop timed iterations.
// Fixes vs. original: the per-iteration body was duplicated verbatim across
// the rank-0 / non-rank-0 branches (now one loop, printing gated on rank);
// cudaMemcpy copied a magic "8" bytes (now sizeof(double)); uint64_t was
// printed with %ld (UB on LLP64 platforms — now %llu with an explicit cast);
// cudaMalloc's result was unchecked (a failure would silently corrupt results).
int main(int argc, char** argv) {
   int rank,nranks;
   MPI_Init(&argc,&argv);
   MPI_Comm_rank(MPI_COMM_WORLD,&rank);
   MPI_Comm_size(MPI_COMM_WORLD,&nranks);
   double *arr,result,pi;
   cudaError_t err = cudaMalloc(&arr,nthreads*sizeof(double));
   if (err != cudaSuccess) {
      fprintf(stderr,"rank %d: cudaMalloc failed: %s\n",
         rank,cudaGetErrorString(err));
      MPI_Abort(MPI_COMM_WORLD,1);
   }
   for (int i = 0; i < nloop; ++i) {
      MPI_Barrier(MPI_COMM_WORLD);
      double tstart = MPI_Wtime();
      init<<<blocks,threads>>>(arr,npts,nthreads,rank);
      reduce(arr);
      cudaDeviceSynchronize(); // wait for async launches before reading arr
      cudaMemcpy(&result,arr,sizeof(double),cudaMemcpyDeviceToHost);
      MPI_Reduce(&result,&pi,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD);
      double tend = MPI_Wtime();
      if (rank == 0) {
         double dt = tend-tstart;
         // 5 flops per series term (sub, sub, mul, div, add)
         double gflops = npts*nthreads*nranks*5.0/dt/1e9;
         printf("npts = %llu, nthreads = %llu, nranks = %d, pi = %lf\n",
            (unsigned long long)npts,(unsigned long long)nthreads,nranks,pi);
         printf("time = %f, estimated GFlops = %f\n",dt,gflops);
      }
   }
   cudaFree(arr);
   MPI_Finalize();
   return 0;
}