diff --git a/examples/hamilton-jacobi-parallel/main.h b/examples/hamilton-jacobi-parallel/main.h
index ffda6cd5b4f5757ccf4c1430607b6a7bbf09c95a..a34ec1fafbcdba0f19c73679ee1639f5dd98c14a 100644
--- a/examples/hamilton-jacobi-parallel/main.h
+++ b/examples/hamilton-jacobi-parallel/main.h
@@ -44,12 +44,12 @@ int main( int argc, char* argv[] )
    tnlDeviceEnum device;
    device = tnlHostDevice;
 
-   typedef parallelGodunovEikonalScheme< tnlGrid<2,double,tnlHost, int>, double, int > SchemeTypeHost;
+   typedef parallelGodunovEikonalScheme< tnlGrid<3,double,tnlHost, int>, double, int > SchemeTypeHost;
 /*#ifdef HAVE_CUDA
    typedef parallelGodunovEikonalScheme< tnlGrid<2,double,tnlCuda, int>, double, int > SchemeTypeDevice;
 #endif
 #ifndef HAVE_CUDA*/
-   typedef parallelGodunovEikonalScheme< tnlGrid<2,double,tnlHost, int>, double, int > SchemeTypeDevice;
+   typedef parallelGodunovEikonalScheme< tnlGrid<3,double,tnlHost, int>, double, int > SchemeTypeDevice;
 /*#endif*/
 
    if(device==tnlHostDevice)
@@ -57,7 +57,7 @@ int main( int argc, char* argv[] )
    typedef tnlHost Device;
 
-   tnlParallelEikonalSolver<2,SchemeTypeHost,SchemeTypeDevice, Device> solver;
+   tnlParallelEikonalSolver<3,SchemeTypeHost,SchemeTypeDevice, Device> solver;
    if(!solver.init(parameters))
    {
       cerr << "Solver failed to initialize." << endl;
@@ -72,7 +72,7 @@ int main( int argc, char* argv[] )
    typedef tnlCuda Device;
    //typedef parallelGodunovEikonalScheme< tnlGrid<2,double,Device, int>, double, int > SchemeType;
 
-   tnlParallelEikonalSolver<2,SchemeTypeHost,SchemeTypeDevice, Device> solver;
+   tnlParallelEikonalSolver<3,SchemeTypeHost,SchemeTypeDevice, Device> solver;
    if(!solver.init(parameters))
    {
       cerr << "Solver failed to initialize." << endl;
diff --git a/examples/hamilton-jacobi-parallel/tnlParallelEikonalSolver3D_impl.h b/examples/hamilton-jacobi-parallel/tnlParallelEikonalSolver3D_impl.h
index 1786649fc1a56ea4e89c804681edbdeaea3bb446..9ffeb8e06698f167af7edf2042cd59cc6962de2a 100644
--- a/examples/hamilton-jacobi-parallel/tnlParallelEikonalSolver3D_impl.h
+++ b/examples/hamilton-jacobi-parallel/tnlParallelEikonalSolver3D_impl.h
@@ -59,8 +59,8 @@ bool tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::
    this->n = parameters.getParameter <int>("subgrid-size");
    cout << "Setting N to " << this->n << endl;
 
-   this->subMesh.setDimensions( this->n, this->n );
-   this->subMesh.setDomain( tnlStaticVector<3,double>(0.0, 0.0),
+   this->subMesh.setDimensions( this->n, this->n, this->n );
+   this->subMesh.setDomain( tnlStaticVector<3,double>(0.0, 0.0, 0.0),
                             tnlStaticVector<3,double>(mesh.template getSpaceStepsProducts< 1, 0, 0 >()*(double)(this->n), mesh.template getSpaceStepsProducts< 0, 1, 0 >()*(double)(this->n),mesh.template getSpaceStepsProducts< 0, 0, 1 >()*(double)(this->n)) );
 
    this->subMesh.save("submesh.tnl");
@@ -182,11 +182,11 @@ bool tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::
    // cout << "pre 1 kernel" << endl;
    cudaDeviceSynchronize();
    checkCudaDevice;
-   dim3 threadsPerBlock(this->n, this->n);
-   dim3 numBlocks(this->gridCols,this->gridRows);
+   dim3 threadsPerBlock(this->n, this->n, this->n);
+   dim3 numBlocks(this->gridCols,this->gridRows,this->gridLevels);
    cudaDeviceSynchronize();
    checkCudaDevice;
-   initRunCUDA3D<SchemeTypeHost,SchemeTypeDevice, DeviceType><<<numBlocks,threadsPerBlock,3*this->n*this->n*sizeof(double)>>>(this->cudaSolver);
+   initRunCUDA3D<SchemeTypeHost,SchemeTypeDevice, DeviceType><<<numBlocks,threadsPerBlock,3*this->n*this->n*this->n*sizeof(double)>>>(this->cudaSolver);
    cudaDeviceSynchronize();
    // cout << "post 1 kernel" << endl;
@@ -200,8 +200,8 @@ bool tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::
 #ifdef HAVE_CUDA
    else if(this->device == tnlCudaDevice)
    {
-      dim3 threadsPerBlock(this->n, this->n);
-      dim3 numBlocks(this->gridCols,this->gridRows);
+      dim3 threadsPerBlock(this->n, this->n, this->n);
+      dim3 numBlocks(this->gridCols,this->gridRows,this->gridLevels);
      //double * test = (double*)malloc(this->work_u.getSize()*sizeof(double));
      //cout << test[0] <<" " << test[1] <<" " << test[2] <<" " << test[3] << endl;
      //cudaMemcpy(/*this->work_u.getData()*/ test, (this->tmpw), this->work_u.getSize()*sizeof(double), cudaMemcpyDeviceToHost);
@@ -321,8 +321,8 @@ void tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::
   {
      //cout << "fn" << endl;
      bool end_cuda = false;
-      dim3 threadsPerBlock(this->n, this->n);
-      dim3 numBlocks(this->gridCols,this->gridRows);
+      dim3 threadsPerBlock(this->n, this->n, this->n);
+      dim3 numBlocks(this->gridCols,this->gridRows,this->gridLevels);
      cudaDeviceSynchronize();
      checkCudaDevice;
      //cudaMalloc(&runcuda,sizeof(bool));
@@ -349,7 +349,7 @@ void tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::
        cudaDeviceSynchronize();
        checkCudaDevice;
        start = std::clock();
-        runCUDA3D<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<numBlocks,threadsPerBlock,3*this->n*this->n*sizeof(double)>>>(this->cudaSolver);
+        runCUDA3D<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<numBlocks,threadsPerBlock,3*this->n*this->n*this->n*sizeof(double)>>>(this->cudaSolver);
        //cout << "a" << endl;
        cudaDeviceSynchronize();
        time_diff += (std::clock() - start) / (double)(CLOCKS_PER_SEC);
@@ -594,18 +594,17 @@ void tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::
           k+= (i/(this->n*this->gridCols) - idealStretch +1 )* this->mesh.getDimensions().x() ;
        }
-
        for( int j = 0; j<this->n*this->gridLevels; j++)
        {
           int l = j/this->n;
-           if(j%(this->n*this->gridLevels) - idealStretch >= 0)
+           if(j - idealStretch >= 0)
           {
-              l+= j%(this->n*this->gridLevels) - idealStretch + 1;
+              l+= j - idealStretch + 1;
           }
 
-           this->work_u[i+(j-l)*levelSize] = this->u0[i+(j-l)*levelSize-k];
+           this->work_u[i+(j-l)*levelSize] = this->u0[i+(j-l)*mesh.getDimensions().x()*mesh.getDimensions().y()-k];
        }
    }
@@ -637,7 +636,7 @@ void tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::
        {
           int l = j/this->n;
 
-           this->u0[i+(j-l)*levelSize-k] = this->work_u[i+(j-l)*levelSize];
+           this->u0[i+(j-l)*mesh.getDimensions().x()*mesh.getDimensions().y()-k] = this->work_u[i+(j-l)*levelSize];
        }
    }
@@ -978,7 +977,7 @@ tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::runSu
      Entity.setCoordinates(coords);
      Entity.refresh();
      neighbourEntities.refresh(subMesh,Entity.getIndex());
-      fu[ i ] = schemeHost.getValue( this->subMesh, i, coords,u, time, boundaryCondition );
+      fu[ i ] = schemeHost.getValue( this->subMesh, i, coords,u, time, boundaryCondition, neighbourEntities );
   }
   maxResidue = fu. absMax();
@@ -1084,10 +1083,10 @@ void tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::
   // int j = threadIdx.x + threadIdx.y * blockDim.x;
   //printf("j = %d, u = %f\n", j,u);
-   int index = (blockIdx.z*caller->n + threadIdx.z) * caller->n*caller->n*caller->gridCols*caller->gridRows
-             + (blockIdx.y) * caller->n*caller->n*caller->gridCols
-             + (blockIdx.x) * caller->n
-             + threadIdx.y * caller->n*caller->gridCols
+   int index = (blockIdx.z*this->n + threadIdx.z) * this->n*this->n*this->gridCols*this->gridRows
+             + (blockIdx.y) * this->n*this->n*this->gridCols
+             + (blockIdx.x) * this->n
+             + threadIdx.y * this->n*this->gridCols
             + threadIdx.x;
   //printf("i= %d,j= %d,index= %d\n",i,j,index);
@@ -1162,18 +1161,43 @@ void tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::
 //       u[l] = value;
      if(computeFU)
      {
+         tnlGridEntity<MeshType, 3, tnlGridEntityNoStencilStorage > Ent(subMesh);
        if(boundaryCondition == 4)
-            u[l] = u[this->subMesh.getCellCoordinates(0,j,k)] + Sign(u[0])*this->subMesh.template getSpaceStepsProducts< 1, 0, 0 >()*(threadIdx.x) ;//+ 2*Sign(u[0])*this->subMesh.template getSpaceStepsProducts< 1, 0, 0 >()*(threadIdx.x+this->n);
+         {
+            Ent.setCoordinates(tnlStaticVector<3,int>(0,j,k));
+            Ent.refresh();
+            u[l] = u[Ent.getIndex()] + Sign(u[0])*this->subMesh.template getSpaceStepsProducts< 1, 0, 0 >()*(threadIdx.x) ;//+ 2*Sign(u[0])*this->subMesh.template getSpaceStepsProducts< 1, 0, 0 >()*(threadIdx.x+this->n);
+         }
        else if(boundaryCondition == 2)
-            u[l] = u[this->subMesh.getCellCoordinates(blockDim.x - 1,j,k)] + Sign(u[0])*this->subMesh.template getSpaceStepsProducts< 1, 0, 0 >()*(this->n - 1 - threadIdx.x);//+ 2*Sign(u[0])*this->subMesh.template getSpaceStepsProducts< 1, 0, 0 >()*(blockDim.x - threadIdx.x - 1+this->n);
+         {
+            Ent.setCoordinates(tnlStaticVector<3,int>(blockDim.x - 1,j,k));
+            Ent.refresh();
+            u[l] = u[Ent.getIndex()] + Sign(u[0])*this->subMesh.template getSpaceStepsProducts< 1, 0, 0 >()*(this->n - 1 - threadIdx.x);//+ 2*Sign(u[0])*this->subMesh.template getSpaceStepsProducts< 1, 0, 0 >()*(blockDim.x - threadIdx.x - 1+this->n);
+         }
        else if(boundaryCondition == 8)
-            u[l] = u[this->subMesh.getCellCoordinates(i,0,k)] + Sign(u[0])*this->subMesh.template getSpaceStepsProducts< 1, 0, 0 >()*(threadIdx.y) ;//+ 2*Sign(u[0])*this->subMesh.template getSpaceStepsProducts< 1, 0, 0 >()*(threadIdx.y+this->n);
+         {
+            Ent.setCoordinates(tnlStaticVector<3,int>(i,0,k));
+            Ent.refresh();
+            u[l] = u[Ent.getIndex()] + Sign(u[0])*this->subMesh.template getSpaceStepsProducts< 1, 0, 0 >()*(threadIdx.y) ;//+ 2*Sign(u[0])*this->subMesh.template getSpaceStepsProducts< 1, 0, 0 >()*(threadIdx.y+this->n);
+         }
        else if(boundaryCondition == 1)
-            u[l] = u[this->subMesh.getCellCoordinates(i,blockDim.y - 1,k)] + Sign(u[0])*this->subMesh.template getSpaceStepsProducts< 1, 0, 0 >()*(this->n - 1 - threadIdx.y) ;//+ Sign(u[0])*this->subMesh.template getSpaceStepsProducts< 1, 0, 0 >()*(blockDim.y - threadIdx.y - 1 +this->n);
+         {
+            Ent.setCoordinates(tnlStaticVector<3,int>(i,blockDim.y - 1,k));
+            Ent.refresh();
+            u[l] = u[Ent.getIndex()] + Sign(u[0])*this->subMesh.template getSpaceStepsProducts< 1, 0, 0 >()*(this->n - 1 - threadIdx.y) ;//+ Sign(u[0])*this->subMesh.template getSpaceStepsProducts< 1, 0, 0 >()*(blockDim.y - threadIdx.y - 1 +this->n);
+         }
        else if(boundaryCondition == 32)
-            u[l] = u[this->subMesh.getCellCoordinates(i,j,0)] + Sign(u[0])*this->subMesh.template getSpaceStepsProducts< 1, 0, 0 >()*(threadIdx.z);
+         {
+            Ent.setCoordinates(tnlStaticVector<3,int>(i,j,0));
+            Ent.refresh();
+            u[l] = u[Ent.getIndex()] + Sign(u[0])*this->subMesh.template getSpaceStepsProducts< 1, 0, 0 >()*(threadIdx.z);
+         }
        else if(boundaryCondition == 16)
-            u[l] = u[this->subMesh.getCellCoordinates(i,j,blockDim.z - 1)] + Sign(u[0])*this->subMesh.template getSpaceStepsProducts< 1, 0, 0 >()*(this->n - 1 - threadIdx.z) ;
+         {
+            Ent.setCoordinates(tnlStaticVector<3,int>(i,j,blockDim.z - 1));
+            Ent.refresh();
+            u[l] = u[Ent.getIndex()] + Sign(u[0])*this->subMesh.template getSpaceStepsProducts< 1, 0, 0 >()*(this->n - 1 - threadIdx.z) ;
+         }
      }
   }
@@ -1199,7 +1223,7 @@ void tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::
   while( time < finalTime )
   {
      if(computeFU)
-         fu = schemeHost.getValueDev( this->subMesh, l, tnlStaticVector<3,int>(i,j,k)/*this->subMesh.getCellCoordinates(l)*/, u, time, boundaryCondition);
+         fu = schemeHost.getValueDev( this->subMesh, l, tnlStaticVector<3,int>(i,j,k)/*this->subMesh.getCellCoordinates(l)*/, u, time, boundaryCondition, neighbourEntities);
 
      sharedTau[l]=cfl/fabs(fu);
@@ -1236,7 +1260,7 @@ template< typename SchemeHost, typename SchemeDevice, typename Device>
 __device__
 int tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::getOwnerCUDA3D(int i) const
 {
-   int j = i % (this->gridCols*this->gridRows*this->n*this->n)
+   int j = i % (this->gridCols*this->gridRows*this->n*this->n);
 
   return ( (i / (this->gridCols*this->gridRows*this->n*this->n))/this->n + (j / (this->gridCols*this->n*this->n))*this->gridCols
@@ -1289,22 +1313,25 @@ void /*tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>
   __shared__ int newSubgridValue;
 
-   int gid = (blockDim.y*blockIdx.y + threadIdx.y)*blockDim.x*gridDim.x + blockDim.x*blockIdx.x + threadIdx.x;
+   int gid = (blockDim.y*blockIdx.y + threadIdx.y)*blockDim.x*gridDim.x + blockDim.x*blockIdx.x + threadIdx.x + (blockDim.z*blockIdx.z + threadIdx.z)*blockDim.x*gridDim.x*blockDim.y*gridDim.y;
 
   double u = cudaSolver->work_u_cuda[gid];
   double u_cmp;
   int subgridValue_cmp=INT_MAX;
   int boundary_index=0;
 
-   if(threadIdx.x+threadIdx.y == 0)
+   if(threadIdx.x+threadIdx.y+threadIdx.z == 0)
   {
-      subgridValue = cudaSolver->getSubgridValueCUDA3D(blockIdx.y*gridDim.x + blockIdx.x);
+      printf("z = %d, y = %d, x = %d\n",blockIdx.z,blockIdx.y,blockIdx.x);
+      subgridValue = cudaSolver->getSubgridValueCUDA3D(blockIdx.y*gridDim.x + blockIdx.x + blockIdx.z*gridDim.x*gridDim.y);
      boundary[0] = 0;
      boundary[1] = 0;
      boundary[2] = 0;
      boundary[3] = 0;
+      boundary[4] = 0;
+      boundary[5] = 0;
      newSubgridValue = 0;
-      //printf("%d %d\n", blockDim.x, gridDim.x);
+      printf("aaa z = %d, y = %d, x = %d\n",blockIdx.z,blockIdx.y,blockIdx.x);
   }
   __syncthreads();
@@ -1312,34 +1339,24 @@ void /*tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>
   if( (threadIdx.x == 0 /* && !(cudaSolver->currentStep & 1)*/) ||
      (threadIdx.y == 0 /* && (cudaSolver->currentStep & 1)*/) ||
+      (threadIdx.z == 0 /* && !(cudaSolver->currentStep & 1)*/) ||
      (threadIdx.x == blockDim.x - 1 /* && !(cudaSolver->currentStep & 1)*/) ||
-      (threadIdx.y == blockDim.y - 1 /* && (cudaSolver->currentStep & 1)*/) )
+      (threadIdx.y == blockDim.y - 1 /* && (cudaSolver->currentStep & 1)*/) ||
+      (threadIdx.z == blockDim.z - 1 /* && (cudaSolver->currentStep & 1)*/) )
   {
      if(threadIdx.x == 0 && (blockIdx.x != 0)/* && !(cudaSolver->currentStep & 1)*/)
      {
        u_cmp = cudaSolver->work_u_cuda[gid - 1];
-        subgridValue_cmp = cudaSolver->getSubgridValueCUDA3D(blockIdx.y*gridDim.x + blockIdx.x - 1);
+        subgridValue_cmp = cudaSolver->getSubgridValueCUDA3D(blockIdx.y*gridDim.x + blockIdx.x + blockIdx.z*gridDim.x*gridDim.y - 1);
        boundary_index = 2;
      }
      if(threadIdx.x == blockDim.x - 1 && (blockIdx.x != gridDim.x - 1)/* && !(cudaSolver->currentStep & 1)*/)
      {
        u_cmp = cudaSolver->work_u_cuda[gid + 1];
-        subgridValue_cmp = cudaSolver->getSubgridValueCUDA3D(blockIdx.y*gridDim.x + blockIdx.x + 1);
+        subgridValue_cmp = cudaSolver->getSubgridValueCUDA3D(blockIdx.y*gridDim.x + blockIdx.x + blockIdx.z*gridDim.x*gridDim.y + 1);
        boundary_index = 1;
      }
-//    if(threadIdx.y == 0 && (blockIdx.y != 0) && (cudaSolver->currentStep & 1))
-//    {
-//      u_cmp = cudaSolver->work_u_cuda[gid - blockDim.x*gridDim.x];
-//      subgridValue_cmp = cudaSolver->getSubgridValueCUDA3D((blockIdx.y - 1)*gridDim.x + blockIdx.x);
-//      boundary_index = 3;
-//    }
-//    if(threadIdx.y == blockDim.y - 1 && (blockIdx.y != gridDim.y - 1) && (cudaSolver->currentStep & 1))
-//    {
-//      u_cmp = cudaSolver->work_u_cuda[gid + blockDim.x*gridDim.x];
-//      subgridValue_cmp = cudaSolver->getSubgridValueCUDA3D((blockIdx.y + 1)*gridDim.x + blockIdx.x);
-//      boundary_index = 0;
-//    }
      __threadfence();
      if((subgridValue == INT_MAX || fabs(u_cmp) + cudaSolver->delta < fabs(u) ) && (subgridValue_cmp != INT_MAX && subgridValue_cmp != -INT_MAX))
@@ -1350,17 +1367,17 @@ void /*tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>
        cudaSolver->work_u_cuda[gid] = u_cmp;
        u=u_cmp;
      }
-      //__threadfence();
+      __threadfence();
      if(threadIdx.y == 0 && (blockIdx.y != 0)/* && (cudaSolver->currentStep & 1)*/)
      {
        u_cmp = cudaSolver->work_u_cuda[gid - blockDim.x*gridDim.x];
-        subgridValue_cmp = cudaSolver->getSubgridValueCUDA3D((blockIdx.y - 1)*gridDim.x + blockIdx.x);
+        subgridValue_cmp = cudaSolver->getSubgridValueCUDA3D((blockIdx.y-1)*gridDim.x + blockIdx.x + blockIdx.z*gridDim.x*gridDim.y);
        boundary_index = 3;
      }
      if(threadIdx.y == blockDim.y - 1 && (blockIdx.y != gridDim.y - 1)/* && (cudaSolver->currentStep & 1)*/)
      {
        u_cmp = cudaSolver->work_u_cuda[gid + blockDim.x*gridDim.x];
-        subgridValue_cmp = cudaSolver->getSubgridValueCUDA3D((blockIdx.y + 1)*gridDim.x + blockIdx.x);
+        subgridValue_cmp = cudaSolver->getSubgridValueCUDA3D((blockIdx.y + 1)*gridDim.x + blockIdx.x + blockIdx.z*gridDim.x*gridDim.y);
        boundary_index = 0;
      }
@@ -1372,130 +1389,44 @@ void /*tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>
        atomicMax(&boundary[boundary_index], 1);
        cudaSolver->work_u_cuda[gid] = u_cmp;
      }
-   }
-   __syncthreads();
-
-   if(threadIdx.x+threadIdx.y == 0)
-   {
-      if(subgridValue == INT_MAX && newSubgridValue !=0)
-         cudaSolver->setSubgridValueCUDA3D(blockIdx.y*gridDim.x + blockIdx.x, -INT_MAX);
-
-      cudaSolver->setBoundaryConditionCUDA3D(blockIdx.y*gridDim.x + blockIdx.x, boundary[0] +
-                                             2 * boundary[1] +
-                                             4 * boundary[2] +
-                                             8 * boundary[3]);
-   }
-
-
-   /*
-   //printf("I am not an empty kernel!\n");
-   //cout << "Synchronizig..." << endl;
-   int tmp1, tmp2;
-   int grid1, grid2;
+      __threadfence();
 
-   if(cudaSolver->currentStep & 1)
-   {
-      //printf("I am not an empty kernel! 1\n");
-      for(int j = 0; j < cudaSolver->gridRows - 1; j++)
+      if(threadIdx.z == 0 && (blockIdx.z != 0)/* && (cudaSolver->currentStep & 1)*/)
      {
3\n"); - for (int i = 0; i < cudaSolver->gridCols*cudaSolver->n; i++) - { - tmp1 = cudaSolver->gridCols*cudaSolver->n*((cudaSolver->n-1)+j*cudaSolver->n) + i; - tmp2 = cudaSolver->gridCols*cudaSolver->n*((cudaSolver->n)+j*cudaSolver->n) + i; - grid1 = cudaSolver->getSubgridValueCUDA3D(cudaSolver->getOwnerCUDA3D(tmp1)); - grid2 = cudaSolver->getSubgridValueCUDA3D(cudaSolver->getOwnerCUDA3D(tmp2)); - - if ((fabs(cudaSolver->work_u_cuda[tmp1]) < fabs(cudaSolver->work_u_cuda[tmp2]) - cudaSolver->delta || grid2 == INT_MAX || grid2 == -INT_MAX) && (grid1 != INT_MAX && grid1 != -INT_MAX)) - { - //printf("%d %d %d %d \n",tmp1,tmp2,cudaSolver->getOwnerCUDA3D(tmp1),cudaSolver->getOwnerCUDA3D(tmp2)); - cudaSolver->work_u_cuda[tmp2] = cudaSolver->work_u_cuda[tmp1]; - cudaSolver->unusedCell_cuda[tmp2] = 0; - if(grid2 == INT_MAX) - { - cudaSolver->setSubgridValueCUDA3D(cudaSolver->getOwnerCUDA3D(tmp2), -INT_MAX); - } - if(! (cudaSolver->getBoundaryConditionCUDA3D(cudaSolver->getOwnerCUDA3D(tmp2)) & 8) ) - cudaSolver->setBoundaryConditionCUDA3D(cudaSolver->getOwnerCUDA3D(tmp2), cudaSolver->getBoundaryConditionCUDA3D(cudaSolver->getOwnerCUDA3D(tmp2))+8); - } - else if ((fabs(cudaSolver->work_u_cuda[tmp1]) > fabs(cudaSolver->work_u_cuda[tmp2]) + cudaSolver->delta || grid1 == INT_MAX || grid1 == -INT_MAX) && (grid2 != INT_MAX && grid2 != -INT_MAX)) - { - //printf("%d %d %d %d \n",tmp1,tmp2,cudaSolver->getOwnerCUDA3D(tmp1),cudaSolver->getOwnerCUDA3D(tmp2)); - cudaSolver->work_u_cuda[tmp1] = cudaSolver->work_u_cuda[tmp2]; - cudaSolver->unusedCell_cuda[tmp1] = 0; - if(grid1 == INT_MAX) - { - cudaSolver->setSubgridValueCUDA3D(cudaSolver->getOwnerCUDA3D(tmp1), -INT_MAX); - } - if(! (cudaSolver->getBoundaryConditionCUDA3D(cudaSolver->getOwnerCUDA3D(tmp1)) & 1) ) - cudaSolver->setBoundaryConditionCUDA3D(cudaSolver->getOwnerCUDA3D(tmp1), cudaSolver->getBoundaryConditionCUDA3D(cudaSolver->getOwnerCUDA3D(tmp1))+1); - } - } + u_cmp = cudaSolver->work_u_cuda[gid - blockDim.x*gridDim.x*blockDim.y*gridDim.y]; + subgridValue_cmp = cudaSolver->getSubgridValueCUDA3D((blockIdx.y)*gridDim.x + blockIdx.x + (blockIdx.z-1)*gridDim.x*gridDim.y); + boundary_index = 5; } - - } - else - { - //printf("I am not an empty kernel! 2\n"); - for(int i = 1; i < cudaSolver->gridCols; i++) + if(threadIdx.z == blockDim.z - 1 && (blockIdx.z != gridDim.z - 1)/* && (cudaSolver->currentStep & 1)*/) { - //printf("I am not an empty kernel! 4\n"); - for (int j = 0; j < cudaSolver->gridRows*cudaSolver->n; j++) - { - - tmp1 = cudaSolver->gridCols*cudaSolver->n*j + i*cudaSolver->n - 1; - tmp2 = cudaSolver->gridCols*cudaSolver->n*j + i*cudaSolver->n ; - grid1 = cudaSolver->getSubgridValueCUDA3D(cudaSolver->getOwnerCUDA3D(tmp1)); - grid2 = cudaSolver->getSubgridValueCUDA3D(cudaSolver->getOwnerCUDA3D(tmp2)); - - if ((fabs(cudaSolver->work_u_cuda[tmp1]) < fabs(cudaSolver->work_u_cuda[tmp2]) - cudaSolver->delta || grid2 == INT_MAX || grid2 == -INT_MAX) && (grid1 != INT_MAX && grid1 != -INT_MAX)) - { - //printf("%d %d %d %d \n",tmp1,tmp2,cudaSolver->getOwnerCUDA3D(tmp1),cudaSolver->getOwnerCUDA3D(tmp2)); - cudaSolver->work_u_cuda[tmp2] = cudaSolver->work_u_cuda[tmp1]; - cudaSolver->unusedCell_cuda[tmp2] = 0; - if(grid2 == INT_MAX) - { - cudaSolver->setSubgridValueCUDA3D(cudaSolver->getOwnerCUDA3D(tmp2), -INT_MAX); - } - if(! 
-              if(! (cudaSolver->getBoundaryConditionCUDA3D(cudaSolver->getOwnerCUDA3D(tmp2)) & 4) )
-                 cudaSolver->setBoundaryConditionCUDA3D(cudaSolver->getOwnerCUDA3D(tmp2), cudaSolver->getBoundaryConditionCUDA3D(cudaSolver->getOwnerCUDA3D(tmp2))+4);
-           }
-           else if ((fabs(cudaSolver->work_u_cuda[tmp1]) > fabs(cudaSolver->work_u_cuda[tmp2]) + cudaSolver->delta || grid1 == INT_MAX || grid1 == -INT_MAX) && (grid2 != INT_MAX && grid2 != -INT_MAX))
-           {
-              //printf("%d %d %d %d \n",tmp1,tmp2,cudaSolver->getOwnerCUDA3D(tmp1),cudaSolver->getOwnerCUDA3D(tmp2));
-              cudaSolver->work_u_cuda[tmp1] = cudaSolver->work_u_cuda[tmp2];
-              cudaSolver->unusedCell_cuda[tmp1] = 0;
-              if(grid1 == INT_MAX)
-              {
-                 cudaSolver->setSubgridValueCUDA3D(cudaSolver->getOwnerCUDA3D(tmp1), -INT_MAX);
-              }
-              if(! (cudaSolver->getBoundaryConditionCUDA3D(cudaSolver->getOwnerCUDA3D(tmp1)) & 2) )
-                 cudaSolver->setBoundaryConditionCUDA3D(cudaSolver->getOwnerCUDA3D(tmp1), cudaSolver->getBoundaryConditionCUDA3D(cudaSolver->getOwnerCUDA3D(tmp1))+2);
-           }
-        }
+        u_cmp = cudaSolver->work_u_cuda[gid + blockDim.x*gridDim.x*blockDim.y*gridDim.y];
+        subgridValue_cmp = cudaSolver->getSubgridValueCUDA3D((blockIdx.y)*gridDim.x + blockIdx.x + (blockIdx.z+1)*gridDim.x*gridDim.y);
+        boundary_index = 4;
+      }
+      if((subgridValue == INT_MAX || fabs(u_cmp) + cudaSolver->delta < fabs(u) ) && (subgridValue_cmp != INT_MAX && subgridValue_cmp != -INT_MAX))
+      {
+        cudaSolver->unusedCell_cuda[gid] = 0;
+        atomicMax(&newSubgridValue, INT_MAX);
+        atomicMax(&boundary[boundary_index], 1);
+        cudaSolver->work_u_cuda[gid] = u_cmp;
      }
-   }
-      //printf("I am not an empty kernel! 5 cudaSolver->currentStep : %d \n", cudaSolver->currentStep);
-      cudaSolver->currentStep = cudaSolver->currentStep + 1;
-      int stepValue = cudaSolver->currentStep + 4;
-      for (int i = 0; i < cudaSolver->gridRows * cudaSolver->gridCols; i++)
-      {
-         if( cudaSolver->getSubgridValueCUDA3D(i) == -INT_MAX )
-            cudaSolver->setSubgridValueCUDA3D(i, stepValue);
   }
+   __syncthreads();
 
-   int maxi = 0;
-   for(int q=0; q < cudaSolver->gridRows*cudaSolver->gridCols;q++)
+   if(threadIdx.x+threadIdx.y+threadIdx.z == 0)
   {
-      //printf("%d : %d\n", q, cudaSolver->boundaryConditions_cuda[q]);
-      maxi=Max(maxi,cudaSolver->getBoundaryConditionCUDA3D(q));
+
+      if(subgridValue == INT_MAX && newSubgridValue !=0)
+         cudaSolver->setSubgridValueCUDA3D(blockIdx.y*gridDim.x + blockIdx.x + blockIdx.z*gridDim.x*gridDim.y, -INT_MAX);
+
+      cudaSolver->setBoundaryConditionCUDA3D(blockIdx.y*gridDim.x + blockIdx.x + blockIdx.z*gridDim.x*gridDim.y, 1 * boundary[0] +
+                                             2 * boundary[1] +
+                                             4 * boundary[2] +
+                                             8 * boundary[3] +
+                                             16 * boundary[4] +
+                                             32 * boundary[5]);
   }
-   //printf("I am not an empty kernel! %d\n", maxi);
-   *(cudaSolver->runcuda) = (maxi > 0);
7 %d\n", cudaSolver->boundaryConditions_cuda[0]); - //cout << "Grid synchronized at step " << (this->currentStep - 1 ) << endl; -*/ } @@ -1504,17 +1435,20 @@ template <typename SchemeHost, typename SchemeDevice, typename Device> __global__ void synchronize2CUDA3D(tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int >* cudaSolver) { - if(blockIdx.x+blockIdx.y ==0) + if(blockIdx.x+blockIdx.y+blockIdx.z ==0) { cudaSolver->currentStep = cudaSolver->currentStep + 1; *(cudaSolver->runcuda) = 0; } int stepValue = cudaSolver->currentStep + 4; - if( cudaSolver->getSubgridValueCUDA3D(blockIdx.y*gridDim.x + blockIdx.x) == -INT_MAX ) - cudaSolver->setSubgridValueCUDA3D(blockIdx.y*gridDim.x + blockIdx.x, stepValue); + if( cudaSolver->getSubgridValueCUDA3D(blockIdx.z*gridDim.x*gridDim.y + blockIdx.y*gridDim.x + blockIdx.x) == -INT_MAX ) + { + printf("z = %d, y = %d, x = %d\n",blockIdx.z,blockIdx.y,blockIdx.x); + cudaSolver->setSubgridValueCUDA3D(blockIdx.z*gridDim.x*gridDim.y + blockIdx.y*gridDim.x + blockIdx.x, stepValue); + } - atomicMax((cudaSolver->runcuda),cudaSolver->getBoundaryConditionCUDA3D(blockIdx.y*gridDim.x + blockIdx.x)); + atomicMax((cudaSolver->runcuda),cudaSolver->getBoundaryConditionCUDA3D(blockIdx.z*gridDim.x*gridDim.y + blockIdx.y*gridDim.x + blockIdx.x)); } @@ -1526,48 +1460,14 @@ void synchronize2CUDA3D(tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Dev template< typename SchemeHost, typename SchemeDevice, typename Device> __global__ -void /*tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::*/initCUDA3D( tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int >* cudaSolver, double* ptr , int* ptr2, int* ptr3) +void initCUDA3D( tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int >* cudaSolver, double* ptr , int* ptr2, int* ptr3) { - //cout << "Initializating solver..." 
-   //cout << "Initializating solver..." << endl;
-   //const tnlString& meshLocation = parameters.getParameter <tnlString>("mesh");
-   //this->mesh_cuda.load( meshLocation );
-
-   //this->n_cuda = parameters.getParameter <int>("subgrid-size");
-   //cout << "Setting N << this->n_cuda << endl;
-
-   //this->subMesh_cuda.setDimensions( this->n_cuda, this->n_cuda );
-   //this->subMesh_cuda.setDomain( tnlStaticVector<3,double>(0.0, 0.0),
-   //tnlStaticVector<3,double>(this->mesh_cuda.template getSpaceStepsProducts< 1, 0, 0 >()*(double)(this->n_cuda), this->mesh_cuda.template getSpaceStepsProducts< 0, 1, 0 >()*(double)(this->n_cuda)) );
-
-   //this->subMesh_cuda.save("submesh.tnl");
-
-// const tnlString& initialCondition = parameters.getParameter <tnlString>("initial-condition");
-// this->u0.load( initialCondition );
-   //cout << this->mesh.getCellCenter(0) << endl;
-
-   //this->delta_cuda = parameters.getParameter <double>("delta");
-   //this->delta_cuda *= this->mesh_cuda.template getSpaceStepsProducts< 1, 0, 0 >()*this->mesh_cuda.template getSpaceStepsProducts< 0, 1, 0 >();
-
-   //cout << "Setting delta to " << this->delta << endl;
-
-   //this->tau0_cuda = parameters.getParameter <double>("initial-tau");
-   //cout << "Setting initial tau to " << this->tau0_cuda << endl;
-   //this->stopTime_cuda = parameters.getParameter <double>("stop-time");
-
-   //this->cflCondition_cuda = parameters.getParameter <double>("cfl-condition");
-   //this -> cflCondition_cuda *= sqrt(this->mesh_cuda.template getSpaceStepsProducts< 1, 0, 0 >()*this->mesh_cuda.template getSpaceStepsProducts< 0, 1, 0 >());
-   //cout << "Setting CFL to " << this->cflCondition << endl;
-////
-////
-
-// this->gridRows_cuda = gridRows;
-// this->gridCols_cuda = gridCols;
 
   cudaSolver->work_u_cuda = ptr;//(double*)malloc(cudaSolver->gridCols*cudaSolver->gridRows*cudaSolver->n*cudaSolver->n*sizeof(double));
   cudaSolver->unusedCell_cuda = ptr3;//(int*)malloc(cudaSolver->gridCols*cudaSolver->gridRows*cudaSolver->n*cudaSolver->n*sizeof(int));
-   cudaSolver->subgridValues_cuda =(int*)malloc(cudaSolver->gridCols*cudaSolver->gridRows*sizeof(int));
-   cudaSolver->boundaryConditions_cuda =(int*)malloc(cudaSolver->gridCols*cudaSolver->gridRows*sizeof(int));
+   cudaSolver->subgridValues_cuda =(int*)malloc(cudaSolver->gridCols*cudaSolver->gridRows*cudaSolver->gridLevels*sizeof(int));
+   cudaSolver->boundaryConditions_cuda =(int*)malloc(cudaSolver->gridCols*cudaSolver->gridRows*cudaSolver->gridLevels*sizeof(int));
   cudaSolver->runcuda = ptr2;//(bool*)malloc(sizeof(bool));
   *(cudaSolver->runcuda) = 1;
   cudaSolver->currentStep = 1;
@@ -1575,7 +1475,7 @@ void /*tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>
   //ptr = cudaSolver->work_u_cuda;
   printf("GPU memory allocated.\n");
 
-   for(int i = 0; i < cudaSolver->gridCols*cudaSolver->gridRows; i++)
+   for(int i = 0; i < cudaSolver->gridCols*cudaSolver->gridRows*cudaSolver->gridLevels; i++)
   {
      cudaSolver->subgridValues_cuda[i] = INT_MAX;
      cudaSolver->boundaryConditions_cuda[i] = 0;
@@ -1587,30 +1487,6 @@ void /*tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>
      cudaSolver->unusedCell_cuda[ j] = 1;
   }*/
   printf("GPU memory initialized.\n");
-
-
-   //cudaSolver->work_u_cuda[50] = 32.153438;
-////
-////
-   //stretchGrid();
-   //this->stopTime_cuda /= (double)(this->gridCols_cuda);
-   //this->stopTime_cuda *= (1.0+1.0/((double)(this->n_cuda) - 1.0));
-   //cout << "Setting stopping time to " << this->stopTime << endl;
-   //this->stopTime_cuda = 1.5*((double)(this->n_cuda))*parameters.getParameter <double>("stop-time")*this->mesh_cuda.template getSpaceStepsProducts< 1, 0, 0 >();
-   //cout << "Setting stopping time to " << this->stopTime << endl;
-
-   //cout << "Initializating scheme..." << endl;
-   //if(!this->schemeDevice.init(parameters))
-// {
-   //cerr << "Scheme failed to initialize." << endl;
-// return false;
-// }
-   //cout << "Scheme initialized." << endl;
-
-   //test();
-
-// this->currentStep_cuda = 1;
-   //return true;
 }
@@ -1619,42 +1495,34 @@ void /*tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>
 //extern __shared__ double array[];
 template< typename SchemeHost, typename SchemeDevice, typename Device >
 __global__
-void /*tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::*/initRunCUDA3D(tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int >* caller)
+void initRunCUDA3D(tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int >* caller)
 {
   extern __shared__ double u[];
-   //printf("%p\n",caller->work_u_cuda);
-   int i = blockIdx.y * gridDim.x + blockIdx.x;
-   int l = threadIdx.y * blockDim.x + threadIdx.x;
+   int i = blockIdx.z*gridDim.y*gridDim.y + blockIdx.y * gridDim.x + blockIdx.x;
+   int l = threadIdx.z*blockDim.x*blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
 
   __shared__ int containsCurve;
   if(l == 0)
+   {
+      printf("z = %d, y = %d, x = %d\n",blockIdx.z,blockIdx.y,blockIdx.x);
      containsCurve = 0;
+   }
 
-   //double a;
   caller->getSubgridCUDA3D(i,caller, &u[l]);
-   //printf("%f %f\n",a , u[l]);
-   //u[l] = a;
-   //printf("Hi %f \n", u[l]);
   __syncthreads();
-   //printf("hurewrwr %f \n", u[l]);
 
   if(u[0] * u[l] <= 0.0)
   {
-      //printf("contains %d \n",i);
      atomicMax( &containsCurve, 1);
   }
   __syncthreads();
-   //printf("hu");
-   //printf("%d : %f\n", l, u[l]);
   if(containsCurve == 1)
   {
-      //printf("have curve \n");
      caller->runSubgridCUDA3D(0,u,i);
-      //printf("%d : %f\n", l, u[l]);
      __syncthreads();
      caller->insertSubgridCUDA3D(u[l],i);
      __syncthreads();
@@ -1671,11 +1539,11 @@ void /*tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>
 template< typename SchemeHost, typename SchemeDevice, typename Device >
 __global__
-void /*tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::*/runCUDA3D(tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int >* caller)
+void runCUDA3D(tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int >* caller)
 {
   extern __shared__ double u[];
-   int i = blockIdx.y * gridDim.x + blockIdx.x;
-   int l = threadIdx.y * blockDim.x + threadIdx.x;
+   int i = blockIdx.z*gridDim.y*gridDim.y + blockIdx.y * gridDim.x + blockIdx.x;
+   int l = threadIdx.z*blockDim.x*blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
 
   int bound = caller->getBoundaryConditionCUDA3D(i);
 
   if(caller->getSubgridValueCUDA3D(i) != INT_MAX && bound != 0 && caller->getSubgridValueCUDA3D(i) > 0)
@@ -1689,220 +1557,132 @@ void /*tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>
      if(bound & 1)
      {
        caller->runSubgridCUDA3D(1,u,i);
-        //__syncthreads();
-        //caller->insertSubgridCUDA3D(u[l],i);
-        //__syncthreads();
-        //caller->getSubgridCUDA3D(i,caller, &u[l]);
        caller->updateSubgridCUDA3D(i,caller, &u[l]);
        __syncthreads();
      }
      if(bound & 2 )
      {
        caller->runSubgridCUDA3D(2,u,i);
-        //__syncthreads();
-        //caller->insertSubgridCUDA3D(u[l],i);
-        //__syncthreads();
-        //caller->getSubgridCUDA3D(i,caller, &u[l]);
        caller->updateSubgridCUDA3D(i,caller, &u[l]);
        __syncthreads();
      }
      if(bound & 4)
      {
        caller->runSubgridCUDA3D(4,u,i);
-        //__syncthreads();
-        //caller->insertSubgridCUDA3D(u[l],i);
-        //__syncthreads();
-        //caller->getSubgridCUDA3D(i,caller, &u[l]);
        caller->updateSubgridCUDA3D(i,caller, &u[l]);
        __syncthreads();
      }
      if(bound & 8)
      {
        caller->runSubgridCUDA3D(8,u,i);
-        //__syncthreads();
-        //caller->insertSubgridCUDA3D(u[l],i);
-        //__syncthreads();
-        //caller->getSubgridCUDA3D(i,caller, &u[l]);
        caller->updateSubgridCUDA3D(i,caller, &u[l]);
        __syncthreads();
      }
-
-
-
-
-
-      if( ((bound & 3 )))
-      {
-        caller->runSubgridCUDA3D(3,u,i);
-        //__syncthreads();
-        //caller->insertSubgridCUDA3D(u[l],i);
-        //__syncthreads();
-        //caller->getSubgridCUDA3D(i,caller, &u[l]);
-        caller->updateSubgridCUDA3D(i,caller, &u[l]);
-        __syncthreads();
-      }
-      if( ((bound & 5 )))
-      {
-        caller->runSubgridCUDA3D(5,u,i);
-        //__syncthreads();
-        //caller->insertSubgridCUDA3D(u[l],i);
-        //__syncthreads();
-        //caller->getSubgridCUDA3D(i,caller, &u[l]);
-        caller->updateSubgridCUDA3D(i,caller, &u[l]);
-        __syncthreads();
-      }
-      if( ((bound & 10 )))
-      {
-        caller->runSubgridCUDA3D(10,u,i);
-        //__syncthreads();
-        //caller->insertSubgridCUDA3D(u[l],i);
-        //__syncthreads();
-        //caller->getSubgridCUDA3D(i,caller, &u[l]);
-        caller->updateSubgridCUDA3D(i,caller, &u[l]);
-        __syncthreads();
-      }
-      if( (bound & 12 ))
-      {
-        caller->runSubgridCUDA3D(12,u,i);
-        //__syncthreads();
-        //caller->insertSubgridCUDA3D(u[l],i);
-        //__syncthreads();
-        //caller->getSubgridCUDA3D(i,caller, &u[l]);
-        caller->updateSubgridCUDA3D(i,caller, &u[l]);
-        __syncthreads();
-      }
-
-
-
-
+      if(bound & 16)
+      {
+        caller->runSubgridCUDA3D(16,u,i);
+        caller->updateSubgridCUDA3D(i,caller, &u[l]);
+        __syncthreads();
+      }
+      if(bound & 32)
+      {
+        caller->runSubgridCUDA3D(32,u,i);
+        caller->updateSubgridCUDA3D(i,caller, &u[l]);
+        __syncthreads();
+      }
   }
-
-
   else
   {
-
-
-
-
-
-
-
-
-      if( ((bound == 2)))
-      {
-        caller->runSubgridCUDA3D(2,u,i);
-        //__syncthreads();
-        //caller->insertSubgridCUDA3D(u[l],i);
-        //__syncthreads();
-        //caller->getSubgridCUDA3D(i,caller, &u[l]);
-        caller->updateSubgridCUDA3D(i,caller, &u[l]);
-        __syncthreads();
-      }
-      if( ((bound == 1) ))
-      {
-        caller->runSubgridCUDA3D(1,u,i);
-        //__syncthreads();
-        //caller->insertSubgridCUDA3D(u[l],i);
-        //__syncthreads();
-        //caller->getSubgridCUDA3D(i,caller, &u[l]);
-        caller->updateSubgridCUDA3D(i,caller, &u[l]);
-        __syncthreads();
-      }
-      if( ((bound == 8) ))
-      {
-        caller->runSubgridCUDA3D(8,u,i);
-        //__syncthreads();
-        //caller->insertSubgridCUDA3D(u[l],i);
-        //__syncthreads();
-        //caller->getSubgridCUDA3D(i,caller, &u[l]);
-        caller->updateSubgridCUDA3D(i,caller, &u[l]);
-        __syncthreads();
-      }
-      if( (bound == 4))
-      {
-        caller->runSubgridCUDA3D(4,u,i);
-        //__syncthreads();
-        //caller->insertSubgridCUDA3D(u[l],i);
-        //__syncthreads();
-        //caller->getSubgridCUDA3D(i,caller, &u[l]);
-        caller->updateSubgridCUDA3D(i,caller, &u[l]);
-        __syncthreads();
-      }
-
-
-
-
-
-
-
-
-
-
-      if( ((bound & 3) ))
      {
-        caller->runSubgridCUDA3D(3,u,i);
-        //__syncthreads();
-        //caller->insertSubgridCUDA3D(u[l],i);
-        //__syncthreads();
-        //caller->getSubgridCUDA3D(i,caller, &u[l]);
+        caller->runSubgridCUDA3D(2,u,i);
+        caller->updateSubgridCUDA3D(i,caller, &u[l]);
+        __syncthreads();
+      }
+      if( ((bound == 1) ))
+      {
+        caller->runSubgridCUDA3D(1,u,i);
        caller->updateSubgridCUDA3D(i,caller, &u[l]);
        __syncthreads();
      }
-      if( ((bound & 5) ))
+      if( ((bound == 8) ))
      {
-        caller->runSubgridCUDA3D(5,u,i);
-        //__syncthreads();
-        //caller->insertSubgridCUDA3D(u[l],i);
-        //__syncthreads();
-        //caller->getSubgridCUDA3D(i,caller, &u[l]);
+        caller->runSubgridCUDA3D(8,u,i);
        caller->updateSubgridCUDA3D(i,caller, &u[l]);
        __syncthreads();
      }
-      if( ((bound & 10) ))
+      if( (bound == 4))
      {
-        caller->runSubgridCUDA3D(10,u,i);
-        //__syncthreads();
-        //caller->insertSubgridCUDA3D(u[l],i);
-        //__syncthreads();
-        //caller->getSubgridCUDA3D(i,caller, &u[l]);
+        caller->runSubgridCUDA3D(4,u,i);
        caller->updateSubgridCUDA3D(i,caller, &u[l]);
        __syncthreads();
      }
-      if( (bound & 12) )
+      if(bound == 16)
      {
-        caller->runSubgridCUDA3D(12,u,i);
-        //__syncthreads();
-        //caller->insertSubgridCUDA3D(u[l],i);
-        //__syncthreads();
-        //caller->getSubgridCUDA3D(i,caller, &u[l]);
+        caller->runSubgridCUDA3D(16,u,i);
        caller->updateSubgridCUDA3D(i,caller, &u[l]);
        __syncthreads();
      }
+      if(bound == 32)
+      {
+        caller->runSubgridCUDA3D(32,u,i);
+        caller->updateSubgridCUDA3D(i,caller, &u[l]);
+        __syncthreads();
+      }
   }
+   if( ((bound & 19 )))
+   {
+      caller->runSubgridCUDA3D(19,u,i);
+      caller->updateSubgridCUDA3D(i,caller, &u[l]);
+      __syncthreads();
+   }
+   if( ((bound & 21 )))
+   {
+      caller->runSubgridCUDA3D(21,u,i);
+      caller->updateSubgridCUDA3D(i,caller, &u[l]);
+      __syncthreads();
+   }
+   if( ((bound & 26 )))
+   {
+      caller->runSubgridCUDA3D(26,u,i);
+      caller->updateSubgridCUDA3D(i,caller, &u[l]);
+      __syncthreads();
+   }
+   if( (bound & 28 ))
+   {
+      caller->runSubgridCUDA3D(28,u,i);
+      caller->updateSubgridCUDA3D(i,caller, &u[l]);
+      __syncthreads();
+   }
-
-
-
-
-
-
-
-
+   if( ((bound & 35 )))
+   {
+      caller->runSubgridCUDA3D(35,u,i);
+      caller->updateSubgridCUDA3D(i,caller, &u[l]);
+      __syncthreads();
+   }
-   /*if( bound )
+   if( ((bound & 37 )))
   {
-      caller->runSubgridCUDA3D(15,u,i);
+      caller->runSubgridCUDA3D(37,u,i);
+      caller->updateSubgridCUDA3D(i,caller, &u[l]);
      __syncthreads();
-      //caller->insertSubgridCUDA3D(u[l],i);
-      //__syncthreads();
-      //caller->getSubgridCUDA3D(i,caller, &u[l]);
+   }
+   if( ((bound & 42 )))
+   {
+      caller->runSubgridCUDA3D(42,u,i);
      caller->updateSubgridCUDA3D(i,caller, &u[l]);
      __syncthreads();
-   }*/
+   }
+   if( (bound & 44 ))
+   {
+      caller->runSubgridCUDA3D(44,u,i);
+      caller->updateSubgridCUDA3D(i,caller, &u[l]);
+      __syncthreads();
+   }
 
   if(l==0)
   {
diff --git a/src/operators/godunov-eikonal/parallelGodunovEikonal.h b/src/operators/godunov-eikonal/parallelGodunovEikonal.h
index b15caf715a9f2bd008b010cf9807e7f76bfdbc22..73aa091c41728364ab2c41209d3a761f9d53d29d 100644
--- a/src/operators/godunov-eikonal/parallelGodunovEikonal.h
+++ b/src/operators/godunov-eikonal/parallelGodunovEikonal.h
@@ -224,7 +224,8 @@ public:
                      const CoordinatesType& coordinates,
                      const Vector& u,
                      const RealType& time,
-                      const IndexType boundaryCondition ) const;
+                      const IndexType boundaryCondition,
+                      const tnlNeighbourGridEntityGetter<tnlGridEntity< MeshType, 3, tnlGridEntityNoStencilStorage >,3> neighbourEntities ) const;
 
 #ifdef HAVE_CUDA
   __device__
@@ -234,7 +235,8 @@ public:
                      const CoordinatesType& coordinates,
                      const RealType* u,
                      const RealType& time,
-                      const IndexType boundaryCondition) const;
+                      const IndexType boundaryCondition,
+                      const tnlNeighbourGridEntityGetter<tnlGridEntity< MeshType, 3, tnlGridEntityNoStencilStorage >,3> neighbourEntities ) const;
 #ifdef HAVE_CUDA
   __device__ __host__
@@ -261,7 +263,7 @@ protected:
 
 //#include <operators/godunov-eikonal/parallelGodunovEikonal1D_impl.h>
 #include <operators/godunov-eikonal/parallelGodunovEikonal2D_impl.h>
-//#include <operators/godunov-eikonal/parallelGodunovEikonal3D_impl.h>
+#include <operators/godunov-eikonal/parallelGodunovEikonal3D_impl.h>
 
 #endif /* GODUNOVEIKONAL_H_ */
diff --git a/src/operators/godunov-eikonal/parallelGodunovEikonal3D_impl.h b/src/operators/godunov-eikonal/parallelGodunovEikonal3D_impl.h
index c375e94b68705bb6eb416b0ce84fe9301cb26608..ca818114ec1b4c358fc24a977618729d92183e11 100644
--- a/src/operators/godunov-eikonal/parallelGodunovEikonal3D_impl.h
+++ b/src/operators/godunov-eikonal/parallelGodunovEikonal3D_impl.h
@@ -126,7 +126,8 @@ Real parallelGodunovEikonalScheme< tnlGrid< 3, MeshReal, Device, MeshIndex >, Re
                  const CoordinatesType& coordinates,
                  const Vector& u,
                  const Real& time,
-                  const IndexType boundaryCondition ) const
+                  const IndexType boundaryCondition,
+                  const tnlNeighbourGridEntityGetter<tnlGridEntity< MeshType, 3, tnlGridEntityNoStencilStorage >,3> neighbourEntities ) const
 {
   if ( ((coordinates.x() == 0 && (boundaryCondition & 4)) or
       (coordinates.x() == mesh.getDimensions().x() - 1 && (boundaryCondition & 2)) or
@@ -159,35 +160,35 @@ Real parallelGodunovEikonalScheme< tnlGrid< 3, MeshReal, Device, MeshIndex >, Re
 
   if(coordinates.x() == mesh.getDimensions().x() - 1)
-      xf += u[mesh.template getCellNextToCell<-1,0,0>( cellIndex )];
+      xf += u[neighbourEntities.template getEntityIndex< -1, 0, 0 >()];
   else
-      xf += u[mesh.template getCellNextToCell<1,0,0>( cellIndex )];
+      xf += u[neighbourEntities.template getEntityIndex< 1, 0, 0 >()];
 
   if(coordinates.x() == 0)
-      xb -= u[mesh.template getCellNextToCell<1,0,0>( cellIndex )];
+      xb -= u[neighbourEntities.template getEntityIndex< 1, 0, 0 >()];
   else
-      xb -= u[mesh.template getCellNextToCell<-1,0,0>( cellIndex )];
+      xb -= u[neighbourEntities.template getEntityIndex< -1, 0, 0 >()];
 
   if(coordinates.y() == mesh.getDimensions().y() - 1)
-      yf += u[mesh.template getCellNextToCell<0,-1,0>( cellIndex )];
+      yf += u[neighbourEntities.template getEntityIndex< 0, -1, 0 >()];
   else
-      yf += u[mesh.template getCellNextToCell<0,1,0>( cellIndex )];
+      yf += u[neighbourEntities.template getEntityIndex< 0, 1, 0 >()];
 
   if(coordinates.y() == 0)
-      yb -= u[mesh.template getCellNextToCell<0,1,0>( cellIndex )];
+      yb -= u[neighbourEntities.template getEntityIndex< 0, 1, 0 >()];
   else
-      yb -= u[mesh.template getCellNextToCell<0,-1,0>( cellIndex )];
+      yb -= u[neighbourEntities.template getEntityIndex< 0, -1, 0 >()];
 
   if(coordinates.z() == mesh.getDimensions().z() - 1)
-      zf += u[mesh.template getCellNextToCell<0,0,-1>( cellIndex )];
+      zf += u[neighbourEntities.template getEntityIndex< 0, 0, -1 >()];
   else
-      zf += u[mesh.template getCellNextToCell<0,0,1>( cellIndex )];
+      zf += u[neighbourEntities.template getEntityIndex< 0, 0, 1 >()];
 
   if(coordinates.z() == 0)
-      zb -= u[mesh.template getCellNextToCell<0,0,1>( cellIndex )];
+      zb -= u[neighbourEntities.template getEntityIndex< 0, 0, 1 >()];
   else
-      zb -= u[mesh.template getCellNextToCell<0,0,-1>( cellIndex )];
+      zb -= u[neighbourEntities.template getEntityIndex< 0, 0, -1 >()];
 
   //xb *= ihx;
@@ -266,7 +267,9 @@ Real parallelGodunovEikonalScheme< tnlGrid< 3, MeshReal, Device, MeshIndex >, Re
                  const CoordinatesType& coordinates,
                  const Real* u,
                  const Real& time,
-                  const IndexType boundaryCondition) const
+                  const IndexType boundaryCondition,
+                  const tnlNeighbourGridEntityGetter<tnlGridEntity< MeshType, 3, tnlGridEntityNoStencilStorage >,3> neighbourEntities
+                  ) const
 {
 
 /*
@@ -300,35 +303,35 @@ Real parallelGodunovEikonalScheme< tnlGrid< 3, MeshReal, Device, MeshIndex >, Re
 
   if(coordinates.x() == mesh.getDimensions().x() - 1)
-      xf += u[mesh.template getCellNextToCell<-1,0,0>( cellIndex )];
+      xf += u[neighbourEntities.template getEntityIndex< -1, 0, 0 >()];
   else
-      xf += u[mesh.template getCellNextToCell<1,0,0>( cellIndex )];
+      xf += u[neighbourEntities.template getEntityIndex< 1, 0, 0 >()];
 
   if(coordinates.x() == 0)
-      xb -= u[mesh.template getCellNextToCell<1,0,0>( cellIndex )];
+      xb -= u[neighbourEntities.template getEntityIndex< 1, 0, 0 >()];
   else
-      xb -= u[mesh.template getCellNextToCell<-1,0,0>( cellIndex )];
+      xb -= u[neighbourEntities.template getEntityIndex< -1, 0, 0 >()];
 
   if(coordinates.y() == mesh.getDimensions().y() - 1)
-      yf += u[mesh.template getCellNextToCell<0,-1,0>( cellIndex )];
+      yf += u[neighbourEntities.template getEntityIndex< 0, -1, 0 >()];
   else
-      yf += u[mesh.template getCellNextToCell<0,1,0>( cellIndex )];
+      yf += u[neighbourEntities.template getEntityIndex< 0, 1, 0 >()];
 
   if(coordinates.y() == 0)
-      yb -= u[mesh.template getCellNextToCell<0,1,0>( cellIndex )];
+      yb -= u[neighbourEntities.template getEntityIndex< 0, 1, 0 >()];
  else
-      yb -= u[mesh.template getCellNextToCell<0,-1,0>( cellIndex )];
+      yb -= u[neighbourEntities.template getEntityIndex< 0, -1, 0 >()];
 
   if(coordinates.z() == mesh.getDimensions().z() - 1)
-      zf += u[mesh.template getCellNextToCell<0,0,-1>( cellIndex )];
+      zf += u[neighbourEntities.template getEntityIndex< 0, 0, -1 >()];
   else
-      zf += u[mesh.template getCellNextToCell<0,0,1>( cellIndex )];
+      zf += u[neighbourEntities.template getEntityIndex< 0, 0, 1 >()];
 
   if(coordinates.z() == 0)
-      zb -= u[mesh.template getCellNextToCell<0,0,1>( cellIndex )];
+      zb -= u[neighbourEntities.template getEntityIndex< 0, 0, 1 >()];
   else
-      zb -= u[mesh.template getCellNextToCell<0,0,-1>( cellIndex )];
+      zb -= u[neighbourEntities.template getEntityIndex< 0, 0, -1 >()];
 
   //xb *= ihx;