diff --git a/examples/hamilton-jacobi-parallel/tnlParallelEikonalSolver.h b/examples/hamilton-jacobi-parallel/tnlParallelEikonalSolver.h index 6e4cc4abd9c237fa58323a59472b1ebaf571b6d5..fad69121a0ebab30ddbfdc77eeec94b3c27b2d22 100644 --- a/examples/hamilton-jacobi-parallel/tnlParallelEikonalSolver.h +++ b/examples/hamilton-jacobi-parallel/tnlParallelEikonalSolver.h @@ -112,6 +112,7 @@ public: int*boundaryConditions_cuda; int* unusedCell_cuda; int* calculationsCount_cuda; + double* tmpw; //MeshTypeCUDA mesh_cuda, subMesh_cuda; //SchemeDevice scheme_cuda; //double delta_cuda, tau0_cuda, stopTime_cuda,cflCondition_cuda; @@ -156,7 +157,7 @@ template <typename SchemeHost, typename SchemeDevice, typename Device> __global__ void initRunCUDA(tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int >* caller); template <typename SchemeHost, typename SchemeDevice, typename Device> -__global__ void initCUDA( tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int >* cudaSolver, double* ptr); +__global__ void initCUDA( tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int >* cudaSolver, double* ptr, bool * ptr2); template <typename SchemeHost, typename SchemeDevice, typename Device> __global__ void synchronizeCUDA(tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int >* cudaSolver); diff --git a/examples/hamilton-jacobi-parallel/tnlParallelEikonalSolver_impl.h b/examples/hamilton-jacobi-parallel/tnlParallelEikonalSolver_impl.h index ce4344966f9b5fdd51527f8df6d42293ec9a2906..ea7127e9aba340ed18b88e72986015898ee7037a 100644 --- a/examples/hamilton-jacobi-parallel/tnlParallelEikonalSolver_impl.h +++ b/examples/hamilton-jacobi-parallel/tnlParallelEikonalSolver_impl.h @@ -113,25 +113,27 @@ bool tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::in if( !initCUDA(parameters, gridRows, gridCols) ) return false; }*/ - cout << "s" << endl; + //cout << "s" << endl; cudaMalloc(&(this->cudaSolver), sizeof(tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int >)); - cout << "s" << endl; + //cout << "s" << endl; cudaMemcpy(this->cudaSolver, this,sizeof(tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int >), cudaMemcpyHostToDevice); - cout << "s" << endl; + //cout << "s" << endl; double** tmpdev = NULL; cudaMalloc(&tmpdev, sizeof(double*)); - double* tmpw; - cudaMalloc(&tmpw, this->work_u.getSize()*sizeof(double)); - initCUDA<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<1,1>>>(this->cudaSolver, tmpw); + //double* tmpw; + cudaMalloc(&(this->tmpw), this->work_u.getSize()*sizeof(double)); + cudaMalloc(&(this->runcuda), sizeof(bool)); + initCUDA<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<1,1>>>(this->cudaSolver, (this->tmpw), (this->runcuda)); cudaDeviceSynchronize(); - cout << "s " << endl; + //cout << "s " << endl; //cudaMalloc(&(cudaSolver->work_u_cuda), this->work_u.getSize()*sizeof(double)); double* tmpu = NULL; cudaMemcpy(&tmpu, tmpdev,sizeof(double*), cudaMemcpyDeviceToHost); - printf("%p %p \n",tmpu,tmpw); - cudaMemcpy(tmpw, this->work_u.getData(), this->work_u.getSize()*sizeof(double), cudaMemcpyHostToDevice); - cout << "s "<< endl; + //printf("%p %p \n",tmpu,tmpw); + cudaMemcpy((this->tmpw), this->work_u.getData(), this->work_u.getSize()*sizeof(double), cudaMemcpyHostToDevice); + //cout << "s "<< endl; + } #endif @@ -168,13 +170,14 @@ bool tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::in #ifdef HAVE_CUDA else if(this->device == tnlCudaDevice) { - cout << "pre 1 kernel" << endl; +// cout << "pre 1 kernel" << endl; dim3 threadsPerBlock(this->n, this->n); dim3 numBlocks(this->gridCols,this->gridRows); cudaDeviceSynchronize(); initRunCUDA<SchemeTypeHost,SchemeTypeDevice, DeviceType><<<numBlocks,threadsPerBlock,this->n*this->n*sizeof(double)>>>(this->cudaSolver); cudaDeviceSynchronize(); - cout << "post 1 kernel" << endl; +// cout << "post 1 kernel" << endl; + } #endif @@ -186,8 +189,26 @@ bool tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::in else if(this->device == tnlCudaDevice) { cudaDeviceSynchronize(); + + + //double * test = (double*)malloc(this->work_u.getSize()*sizeof(double)); + //cout << test[0] <<" " << test[1] <<" " << test[2] <<" " << test[3] << endl; + //cudaMemcpy(/*this->work_u.getData()*/ test, (this->tmpw), this->work_u.getSize()*sizeof(double), cudaMemcpyDeviceToHost); + //cout << this->tmpw << " " << test[0] <<" " << test[1] << " " <<test[2] << " " <<test[3] << endl; + + checkCudaDevice; + synchronizeCUDA<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<1,1>>>(this->cudaSolver); + cudaDeviceSynchronize(); + checkCudaDevice; + //cout << test[0] << " " <<test[1] <<" " << test[2] << " " <<test[3] << endl; + //cudaMemcpy(/*this->work_u.getData()*/ test, (this->tmpw), this->work_u.getSize()*sizeof(double), cudaMemcpyDeviceToHost); + //checkCudaDevice; + //cout << this->tmpw << " " << test[0] << " " <<test[1] << " " <<test[2] <<" " << test[3] << endl; + //free(test); + } + #endif cout << "Solver initialized." << endl; @@ -281,40 +302,60 @@ void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::ru #ifdef HAVE_CUDA else if(this->device == tnlCudaDevice) { + //cout << "fn" << endl; bool end_cuda = false; dim3 threadsPerBlock(this->n, this->n); dim3 numBlocks(this->gridCols,this->gridRows); cudaDeviceSynchronize(); - cudaMalloc(&runcuda,sizeof(bool)); - cudaMemcpy(runcuda, &run_host, sizeof(bool), cudaMemcpyHostToDevice); + checkCudaDevice; + //cudaMalloc(&runcuda,sizeof(bool)); + //cudaMemcpy(runcuda, &run_host, sizeof(bool), cudaMemcpyHostToDevice); + //cout << "fn" << endl; bool* tmpb; + //cudaMemcpy(tmpb, &(cudaSolver->runcuda),sizeof(bool*), cudaMemcpyDeviceToHost); + //cudaDeviceSynchronize(); + //checkCudaDevice; + cudaMemcpy(&(this->run_host),this->runcuda,sizeof(bool), cudaMemcpyDeviceToHost); + cudaDeviceSynchronize(); + checkCudaDevice; + //cout << "fn" << endl; + int i = 1; while (run_host || !end_cuda) { - cout << "a" << endl; - if(this->boundaryConditions.max() == 0 ) + cout << "Computing at step "<< i++ << endl; + if(run_host == false ) end_cuda = true; else end_cuda = false; - cout << "a" << endl; + //cout << "a" << endl; cudaDeviceSynchronize(); + checkCudaDevice; runCUDA<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<numBlocks,threadsPerBlock,this->n*this->n*sizeof(double)>>>(this->cudaSolver); - cout << "a" << endl; + //cout << "a" << endl; cudaDeviceSynchronize(); synchronizeCUDA<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<1,1>>>(this->cudaSolver); - cout << "a" << endl; - run_host = false; - cout << "in kernel loop" << run_host << endl; - cudaMemcpy(tmpb, &(cudaSolver->runcuda),sizeof(double*), cudaMemcpyHostToDevice); - cudaMemcpy(&run_host, tmpb,sizeof(bool), cudaMemcpyDeviceToHost); - cout << "in kernel loop" << run_host << endl; + //cout << "a" << endl; + //run_host = false; + //cout << "in kernel loop" << run_host << endl; + //cudaMemcpy(tmpb, &(cudaSolver->runcuda),sizeof(bool*), cudaMemcpyDeviceToHost); + cudaMemcpy(&run_host, (this->runcuda),sizeof(bool), cudaMemcpyDeviceToHost); + //cout << "in kernel loop" << run_host << endl; } - cout << "b" << endl; + //cout << "b" << endl; - double* tmpu; - cudaMemcpy(tmpu, &(cudaSolver->work_u_cuda),sizeof(double*), cudaMemcpyHostToDevice); - cudaMemcpy(this->work_u.getData(), tmpu, this->work_u.getSize()*sizeof(double), cudaMemcpyDeviceToHost); + //double* tmpu; + //cudaMemcpy(tmpu, &(cudaSolver->work_u_cuda),sizeof(double*), cudaMemcpyHostToDevice); + //cudaMemcpy(this->work_u.getData(), tmpu, this->work_u.getSize()*sizeof(double), cudaMemcpyDeviceToHost); + //cout << this->work_u.getData()[0] << endl; + //double * test = (double*)malloc(this->work_u.getSize()*sizeof(double)); + //cout << test[0] << test[1] << test[2] << test[3] << endl; + cudaMemcpy(this->work_u.getData()/* test*/, (this->tmpw), this->work_u.getSize()*sizeof(double), cudaMemcpyDeviceToHost); + //cout << this->tmpw << " " << test[0] << test[1] << test[2] << test[3] << endl; + //free(test); + + cudaDeviceSynchronize(); } #endif contractGrid(); @@ -323,6 +364,15 @@ void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::ru cout << "Average number of calculations on one subgrid was " << ( (double) this->calculationsCount.sum() / (double) this->calculationsCount.getSize() ) << endl; cout << "Solver finished" << endl; +#ifdef HAVE_CUDA + if(this->device == tnlCudaDevice) + { + cudaFree(this->runcuda); + cudaFree(this->tmpw); + cudaFree(this->cudaSolver); + } +#endif + } //north - 1, east - 2, west - 4, south - 8 @@ -943,7 +993,6 @@ void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::ge + (i % caller->gridCols) * caller->n + (j/caller->n) * caller->n*caller->gridCols + (j % caller->n); - printf(" tX: %d, tY : %d, bDx: %d, Total: %d, Index: %d, pntr: %d, pntr2 %d \n", threadIdx.x,threadIdx.y,blockDim.x, j, th,caller->work_u_cuda,a); *a = caller->work_u_cuda[th]; //printf("Hi %f \n", *a); //return ret; @@ -955,7 +1004,9 @@ __device__ void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::insertSubgridCUDA( double u, const int i ) { + int j = threadIdx.x + threadIdx.y * blockDim.x; + //printf("j = %d, u = %f\n", j,u); int index = (i / this->gridCols)*this->n*this->n*this->gridCols + (i % this->gridCols)*this->n + (j/this->n)*this->n*this->gridCols + (j % this->n); @@ -964,8 +1015,10 @@ void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::in { this->work_u_cuda[index] = u; this->unusedCell_cuda[index] = 0; + } + } @@ -980,25 +1033,33 @@ void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::ru int l = threadIdx.y * blockDim.x + threadIdx.x; if(l == 0) + { tmp = 0; + /*for(int o = 0; o < this->n * this->n; o++) + printf("%d : %f\n", o, u[o]);*/ + } __syncthreads(); if(u[0]*u[l] <= 0.0) - atomicOr( &tmp, 1); + atomicMax( &tmp, 1); + __syncthreads(); + //printf("tmp = %d", tmp); + __shared__ double value; if(l == 0) value = 0.0; - + __syncthreads(); for(int o = 0; o < blockDim.x*blockDim.y; o++) { if(l == o) value=Max(value,fabs(u[l])); __syncthreads(); } + __syncthreads(); //atomicMax(&value,fabs(u[l])) if(l == 0) @@ -1007,7 +1068,7 @@ void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::ru __syncthreads(); - if(tmp) + if(tmp == 1) {} //north - 1, east - 2, west - 4, south - 8 else if(boundaryCondition == 4) @@ -1030,6 +1091,7 @@ void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::ru if(i > 0) u[i*this->n + j] = value; } + __syncthreads(); double time = 0.0; __shared__ double currentTau; @@ -1038,34 +1100,43 @@ void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::ru if(threadIdx.x * threadIdx.y == 0) { currentTau = this->tau0; - maxResidue = 10.0 * this->subMesh.getHx(); + maxResidue = 0.0;//10.0 * this->subMesh.getHx(); } double finalTime = this->stopTime; if( time + currentTau > finalTime ) currentTau = finalTime - time; __syncthreads(); + //printf("%d : %f\n", l, u[l]); while( time < finalTime ) { + /**** * Compute the RHS */ + __syncthreads(); - fu = schemeDevice.getValue( this->subMesh, l, this->subMesh.getCellCoordinates(i), u, time, boundaryCondition ); + fu = schemeHost.getValueDev( this->subMesh, l, this->subMesh.getCellCoordinates(l), u, time, boundaryCondition); __syncthreads(); + //printf("%d : %f\n", l, fu); //atomicMax(&maxResidue,fabs(fu));//maxResidue = fu. absMax(); + if(l == 0) + maxResidue = 0.0; + __syncthreads(); for(int o = 0; o < blockDim.x*blockDim.y; o++) { if(l == o) maxResidue=Max(maxResidue,fabs(fu)); __syncthreads(); } + __syncthreads(); + - if(threadIdx.x * threadIdx.y == 0) + if(l == 0) { if( this -> cflCondition * maxResidue != 0.0) currentTau = this -> cflCondition / maxResidue; @@ -1076,13 +1147,16 @@ void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::ru } if( time + currentTau > finalTime ) currentTau = finalTime - time; - maxResidue = 0.0; } + __syncthreads(); // - double tau2 = 1.0; + + double tau2 = finalTime; if((u[l]+currentTau * fu)*u[l] < 0.0 && fu != 0.0 && u[l] != 0.0 ) tau2 = fabs(u[l]/(2.0*fu)); + + __syncthreads(); //atomicMin(¤tTau, tau2); for(int o = 0; o < blockDim.x*blockDim.y; o++) { @@ -1093,93 +1167,22 @@ void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::ru // + + __syncthreads(); u[l] += currentTau * fu; + //if(l==0) + // printf("ct %f\n",currentTau); + time += currentTau; __syncthreads(); } + //printf("%d : %f\n", l, u[l]); } -template< typename SchemeHost, typename SchemeDevice, typename Device > -__global__ -void /*tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::*/runCUDA(tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int >* caller) -{ - extern __shared__ double u[]; - int i = blockIdx.y * gridDim.x + blockIdx.x; - int l = threadIdx.y * blockDim.x + threadIdx.x; - - if(caller->getSubgridValueCUDA(i) != INT_MAX) - { - double a; - caller->getSubgridCUDA(i,caller, &a); - u[l] = a; - int bound = caller->getBoundaryConditionCUDA(i); - - if(bound & 1) - { - caller->runSubgridCUDA(1,u,i); - //this->calculationsCount[i]++; - } - __syncthreads(); - caller->insertSubgridCUDA(u[l],i); - if(bound & 2) - { - caller->runSubgridCUDA(2,u,i); - //this->calculationsCount[i]++; - } - __syncthreads(); - caller->insertSubgridCUDA(u[l],i); - if(bound & 4) - { - caller->runSubgridCUDA(4,u,i); - //this->calculationsCount[i]++; - } - __syncthreads(); - caller->insertSubgridCUDA(u[l],i); - if(bound & 8) - { - caller->runSubgridCUDA(8,u,i); - //this->calculationsCount[i]++; - } - __syncthreads(); - caller->insertSubgridCUDA(u[l],i); - - if( ((bound & 2) )) - { - caller->runSubgridCUDA(3,u,i); - } - __syncthreads(); - caller->insertSubgridCUDA(u[l],i); - if( ((bound & 4) )) - { - caller->runSubgridCUDA(5,u,i); - } - __syncthreads(); - caller->insertSubgridCUDA(u[l],i); - if( ((bound & 2) )) - { - caller->runSubgridCUDA(10,u,i); - } - __syncthreads(); - caller->insertSubgridCUDA(u[l],i); - if( (bound & 4) ) - { - caller->runSubgridCUDA(12,u,i); - } - __syncthreads(); - caller->insertSubgridCUDA(u[l],i); - - - caller->setBoundaryConditionCUDA(i, 0); - caller->setSubgridValueCUDA(i, caller->getSubgridValueCUDA(i) - 1 ); - - - } -} - template< typename SchemeHost, typename SchemeDevice, typename Device> __device__ @@ -1230,14 +1233,17 @@ template <typename SchemeHost, typename SchemeDevice, typename Device> __global__ void /*tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::*/synchronizeCUDA(tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int >* cudaSolver) //needs fix ---- maybe not anymore --- but frankly: yeah, it does -- aaaa-and maybe fixed now { + //printf("I am not an empty kernel!\n"); //cout << "Synchronizig..." << endl; int tmp1, tmp2; int grid1, grid2; if(cudaSolver->currentStep & 1) { + //printf("I am not an empty kernel! 1\n"); for(int j = 0; j < cudaSolver->gridRows - 1; j++) { + //printf("I am not an empty kernel! 3\n"); for (int i = 0; i < cudaSolver->gridCols*cudaSolver->n; i++) { tmp1 = cudaSolver->gridCols*cudaSolver->n*((cudaSolver->n-1)+j*cudaSolver->n) + i; @@ -1247,6 +1253,7 @@ void /*tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>:: if ((fabs(cudaSolver->work_u_cuda[tmp1]) < fabs(cudaSolver->work_u_cuda[tmp2]) - cudaSolver->delta || grid2 == INT_MAX || grid2 == -INT_MAX) && (grid1 != INT_MAX && grid1 != -INT_MAX)) { + //printf("%d %d %d %d \n",tmp1,tmp2,cudaSolver->getOwnerCUDA(tmp1),cudaSolver->getOwnerCUDA(tmp2)); cudaSolver->work_u_cuda[tmp2] = cudaSolver->work_u_cuda[tmp1]; cudaSolver->unusedCell_cuda[tmp2] = 0; if(grid2 == INT_MAX) @@ -1258,6 +1265,7 @@ void /*tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>:: } else if ((fabs(cudaSolver->work_u_cuda[tmp1]) > fabs(cudaSolver->work_u_cuda[tmp2]) + cudaSolver->delta || grid1 == INT_MAX || grid1 == -INT_MAX) && (grid2 != INT_MAX && grid2 != -INT_MAX)) { + //printf("%d %d %d %d \n",tmp1,tmp2,cudaSolver->getOwnerCUDA(tmp1),cudaSolver->getOwnerCUDA(tmp2)); cudaSolver->work_u_cuda[tmp1] = cudaSolver->work_u_cuda[tmp2]; cudaSolver->unusedCell_cuda[tmp1] = 0; if(grid1 == INT_MAX) @@ -1273,10 +1281,13 @@ void /*tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>:: } else { + //printf("I am not an empty kernel! 2\n"); for(int i = 1; i < cudaSolver->gridCols; i++) { + //printf("I am not an empty kernel! 4\n"); for (int j = 0; j < cudaSolver->gridRows*cudaSolver->n; j++) { + tmp1 = cudaSolver->gridCols*cudaSolver->n*j + i*cudaSolver->n - 1; tmp2 = cudaSolver->gridCols*cudaSolver->n*j + i*cudaSolver->n ; grid1 = cudaSolver->getSubgridValueCUDA(cudaSolver->getOwnerCUDA(tmp1)); @@ -1284,6 +1295,7 @@ void /*tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>:: if ((fabs(cudaSolver->work_u_cuda[tmp1]) < fabs(cudaSolver->work_u_cuda[tmp2]) - cudaSolver->delta || grid2 == INT_MAX || grid2 == -INT_MAX) && (grid1 != INT_MAX && grid1 != -INT_MAX)) { + //printf("%d %d %d %d \n",tmp1,tmp2,cudaSolver->getOwnerCUDA(tmp1),cudaSolver->getOwnerCUDA(tmp2)); cudaSolver->work_u_cuda[tmp2] = cudaSolver->work_u_cuda[tmp1]; cudaSolver->unusedCell_cuda[tmp2] = 0; if(grid2 == INT_MAX) @@ -1295,6 +1307,7 @@ void /*tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>:: } else if ((fabs(cudaSolver->work_u_cuda[tmp1]) > fabs(cudaSolver->work_u_cuda[tmp2]) + cudaSolver->delta || grid1 == INT_MAX || grid1 == -INT_MAX) && (grid2 != INT_MAX && grid2 != -INT_MAX)) { + //printf("%d %d %d %d \n",tmp1,tmp2,cudaSolver->getOwnerCUDA(tmp1),cudaSolver->getOwnerCUDA(tmp2)); cudaSolver->work_u_cuda[tmp1] = cudaSolver->work_u_cuda[tmp2]; cudaSolver->unusedCell_cuda[tmp1] = 0; if(grid1 == INT_MAX) @@ -1307,21 +1320,25 @@ void /*tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>:: } } } + //printf("I am not an empty kernel! 5 cudaSolver->currentStep : %d \n", cudaSolver->currentStep); - - cudaSolver->currentStep++; + cudaSolver->currentStep = cudaSolver->currentStep + 1; int stepValue = cudaSolver->currentStep + 4; - for (int i = 0; i < cudaSolver->subgridValues.getSize(); i++) + for (int i = 0; i < cudaSolver->gridRows * cudaSolver->gridCols; i++) { if( cudaSolver->getSubgridValueCUDA(i) == -INT_MAX ) cudaSolver->setSubgridValueCUDA(i, stepValue); } int maxi = 0; - for(int q=0; q < cudaSolver->n*cudaSolver->n;q++) + for(int q=0; q < cudaSolver->gridRows*cudaSolver->gridCols;q++) + { + //printf("%d : %d\n", q, cudaSolver->boundaryConditions_cuda[q]); maxi=Max(maxi,cudaSolver->boundaryConditions_cuda[q]); - + } + //printf("I am not an empty kernel! %d\n", maxi); *(cudaSolver->runcuda) = (maxi > 0); + //printf("I am not an empty kernel! 7 %d\n", cudaSolver->boundaryConditions_cuda[0]); //cout << "Grid synchronized at step " << (this->currentStep - 1 ) << endl; } @@ -1335,7 +1352,7 @@ void /*tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>:: template< typename SchemeHost, typename SchemeDevice, typename Device> __global__ -void /*tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::*/initCUDA( tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int >* cudaSolver, double* ptr ) +void /*tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::*/initCUDA( tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int >* cudaSolver, double* ptr , bool* ptr2) { //cout << "Initializating solver..." << endl; //const tnlString& meshLocation = parameters.getParameter <tnlString>("mesh"); @@ -1377,9 +1394,22 @@ void /*tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>:: cudaSolver->unusedCell_cuda = (int*)malloc(cudaSolver->gridCols*cudaSolver->gridRows*cudaSolver->n*cudaSolver->n*sizeof(int)); cudaSolver->subgridValues_cuda =(int*)malloc(cudaSolver->gridCols*cudaSolver->gridRows*sizeof(int)); cudaSolver->boundaryConditions_cuda =(int*)malloc(cudaSolver->gridCols*cudaSolver->gridRows*sizeof(int)); + cudaSolver->runcuda = ptr2;//(bool*)malloc(sizeof(bool)); + *(cudaSolver->runcuda) = true; + cudaSolver->currentStep = 1; //cudaMemcpy(ptr,&(cudaSolver->work_u_cuda), sizeof(double*),cudaMemcpyDeviceToHost); ptr = cudaSolver->work_u_cuda; printf("GPU memory allocated. %p \n",cudaSolver->work_u_cuda); + + for(int i = 0; i < cudaSolver->gridCols*cudaSolver->gridRows; i++) + { + for(int j = 0; j < cudaSolver->n*cudaSolver->n; j++) + cudaSolver->unusedCell_cuda[i*cudaSolver->n*cudaSolver->n + j] = 1; + cudaSolver->subgridValues_cuda[i] = INT_MAX; + cudaSolver->boundaryConditions_cuda[i] = 0; + } + + //cudaSolver->work_u_cuda[50] = 32.153438; //// //// @@ -1427,28 +1457,149 @@ void /*tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>:: double a; caller->getSubgridCUDA(i,caller, &a); - printf("%f %f\n",a , u[l]); + //printf("%f %f\n",a , u[l]); u[l] = a; - printf("Hi %f \n", u[l]); + //printf("Hi %f \n", u[l]); __syncthreads(); - printf("hurewrwr %f \n", u[l]); + //printf("hurewrwr %f \n", u[l]); if(u[0] * u[l] <= 0.0) { - printf("000 \n"); + // printf("000 \n"); atomicMax( &containsCurve, 1); } __syncthreads(); - printf("hu"); + //printf("hu"); + //printf("%d : %f\n", l, u[l]); if(containsCurve) { + //printf("have curve \n"); caller->runSubgridCUDA(0,u,i); + //printf("%d : %f\n", l, u[l]); __syncthreads(); caller->insertSubgridCUDA(u[l],i); - caller->setSubgridValueCUDA(i, 4); + __syncthreads(); + if(l == 0) + caller->setSubgridValueCUDA(i, 4); } - printf("das"); + +} + + + + + +template< typename SchemeHost, typename SchemeDevice, typename Device > +__global__ +void /*tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::*/runCUDA(tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int >* caller) +{ + extern __shared__ double u[]; + int i = blockIdx.y * gridDim.x + blockIdx.x; + int l = threadIdx.y * blockDim.x + threadIdx.x; + + if(i+l == 0) + printf("a"); + if(caller->getSubgridValueCUDA(i) != INT_MAX) + { + double a; + caller->getSubgridCUDA(i,caller, &a); + u[l] = a; + int bound = caller->getBoundaryConditionCUDA(i); + if(l == 0) + //printf("i = %d, bound = %d\n",i,caller->getSubgridValueCUDA(i)); + if(bound & 1) + { + caller->runSubgridCUDA(1,u,i); + //this->calculationsCount[i]++; + } + __syncthreads(); + caller->insertSubgridCUDA(u[l],i); + __syncthreads(); + caller->getSubgridCUDA(i,caller, &a); + u[l] = a; + __syncthreads(); + if(bound & 2) + { + caller->runSubgridCUDA(2,u,i); + //this->calculationsCount[i]++; + } + __syncthreads(); + caller->insertSubgridCUDA(u[l],i); + __syncthreads(); + caller->getSubgridCUDA(i,caller, &a); + u[l] = a; + __syncthreads(); + if(bound & 4) + { + caller->runSubgridCUDA(4,u,i); + //this->calculationsCount[i]++; + } + __syncthreads(); + caller->insertSubgridCUDA(u[l],i); + __syncthreads(); + caller->getSubgridCUDA(i,caller, &a); + u[l] = a; + __syncthreads(); + if(bound & 8) + { + caller->runSubgridCUDA(8,u,i); + //this->calculationsCount[i]++; + } + __syncthreads(); + caller->insertSubgridCUDA(u[l],i); + __syncthreads(); + caller->getSubgridCUDA(i,caller, &a); + u[l] = a; + __syncthreads(); + + if( ((bound & 2) )) + { + caller->runSubgridCUDA(3,u,i); + } + __syncthreads(); + caller->insertSubgridCUDA(u[l],i); + __syncthreads(); + caller->getSubgridCUDA(i,caller, &a); + u[l] = a; + __syncthreads(); + if( ((bound & 4) )) + { + caller->runSubgridCUDA(5,u,i); + } + __syncthreads(); + caller->insertSubgridCUDA(u[l],i); + __syncthreads(); + caller->getSubgridCUDA(i,caller, &a); + u[l] = a; + __syncthreads(); + if( ((bound & 2) )) + { + caller->runSubgridCUDA(10,u,i); + } + __syncthreads(); + caller->insertSubgridCUDA(u[l],i); + __syncthreads(); + caller->getSubgridCUDA(i,caller, &a); + u[l] = a; + __syncthreads(); + if( (bound & 4) ) + { + caller->runSubgridCUDA(12,u,i); + } + __syncthreads(); + caller->insertSubgridCUDA(u[l],i); + + + caller->setBoundaryConditionCUDA(i, 0); + caller->setSubgridValueCUDA(i, caller->getSubgridValueCUDA(i) - 1 ); + + + } + + + if(i+l == 0) + printf("b"); } #endif /*HAVE_CUDA*/ diff --git a/src/operators/godunov-eikonal/parallelGodunovEikonal.h b/src/operators/godunov-eikonal/parallelGodunovEikonal.h index c0d83527b53ddbf339a8f7a05af98483d43a40b9..8233c9278ab1c8836c8198f1ecdec6382696c3fd 100644 --- a/src/operators/godunov-eikonal/parallelGodunovEikonal.h +++ b/src/operators/godunov-eikonal/parallelGodunovEikonal.h @@ -149,16 +149,16 @@ public: const RealType& time, const IndexType boundaryCondition ) const; - template< typename Vector > + #ifdef HAVE_CUDA - __device__ __host__ + __device__ #endif - Real getValue( const MeshType& mesh, + Real getValueDev( const MeshType& mesh, const IndexType cellIndex, const CoordinatesType& coordinates, const RealType* u, const RealType& time, - const IndexType boundaryCondition ) const; + const IndexType boundaryCondition) const; #ifdef HAVE_CUDA __device__ __host__ #endif diff --git a/src/operators/godunov-eikonal/parallelGodunovEikonal2D_impl.h b/src/operators/godunov-eikonal/parallelGodunovEikonal2D_impl.h index 9cc077f7661d78672fad2fe295e649bb62c015ab..c0dd0565f7ce2728f30be19174088a1af2643eb2 100644 --- a/src/operators/godunov-eikonal/parallelGodunovEikonal2D_impl.h +++ b/src/operators/godunov-eikonal/parallelGodunovEikonal2D_impl.h @@ -145,6 +145,7 @@ Real parallelGodunovEikonalScheme< tnlGrid< 2, MeshReal, Device, MeshIndex >, Re const IndexType boundaryCondition ) const { + if ( ((coordinates.x() == 0 && (boundaryCondition & 4)) or (coordinates.x() == mesh.getDimensions().x() - 1 && (boundaryCondition & 2)) or (coordinates.y() == 0 && (boundaryCondition & 8)) or @@ -165,6 +166,7 @@ Real parallelGodunovEikonalScheme< tnlGrid< 2, MeshReal, Device, MeshIndex >, Re signui = sign(u[cellIndex],epsilon); + if(fabs(u[cellIndex]) < acc) return 0.0; if(signui > 0.0) @@ -296,18 +298,19 @@ template< typename MeshReal, typename MeshIndex, typename Real, typename Index > -template< typename Vector > + #ifdef HAVE_CUDA -__device__ __host__ +__device__ #endif -Real parallelGodunovEikonalScheme< tnlGrid< 2, MeshReal, Device, MeshIndex >, Real, Index >:: getValue( const MeshType& mesh, +Real parallelGodunovEikonalScheme< tnlGrid< 2, MeshReal, Device, MeshIndex >, Real, Index >:: getValueDev( const MeshType& mesh, const IndexType cellIndex, const CoordinatesType& coordinates, const Real* u, const Real& time, - const IndexType boundaryCondition ) const + const IndexType boundaryCondition) const { + if ( ((coordinates.x() == 0 && (boundaryCondition & 4)) or (coordinates.x() == mesh.getDimensions().x() - 1 && (boundaryCondition & 2)) or (coordinates.y() == 0 && (boundaryCondition & 8)) or @@ -327,8 +330,10 @@ Real parallelGodunovEikonalScheme< tnlGrid< 2, MeshReal, Device, MeshIndex >, Re RealType nabla, xb, xf, yb, yf, signui; signui = sign(u[cellIndex],epsilon); - - if(fabs(u[cellIndex]) < acc) return 0.0; +#ifdef HAVE_CUDA + //printf("%d : %d ;;;; %d : %d\n",threadIdx.x, coordinates.x(), threadIdx.y,coordinates.y()); +#endif + //if(fabs(u[cellIndex]) < acc) return 0.0; if(signui > 0.0) { @@ -379,8 +384,8 @@ Real parallelGodunovEikonalScheme< tnlGrid< 2, MeshReal, Device, MeshIndex >, Re yb = 0.0; nabla = sqrt (xf*xf + xb*xb + yf*yf + yb*yb ); - if(fabs(1.0-nabla) < acc) - return 0.0; + // if(fabs(1.0-nabla) < acc) + // return 0.0; return signui*(1.0 - nabla); } else if (signui < 0.0) @@ -435,8 +440,8 @@ Real parallelGodunovEikonalScheme< tnlGrid< 2, MeshReal, Device, MeshIndex >, Re nabla = sqrt (xf*xf + xb*xb + yf*yf + yb*yb ); - if(fabs(1.0-nabla) < acc) - return 0.0; + // if(fabs(1.0-nabla) < acc) + // return 0.0; return signui*(1.0 - nabla); } else