diff --git a/examples/fast-sweeping/main.h b/examples/fast-sweeping/main.h index 75850bbf6fb26033017272aad22a0dea3cff8499..aabaf684ca797a09f6c37c6fffc864dc000cc93e 100644 --- a/examples/fast-sweeping/main.h +++ b/examples/fast-sweeping/main.h @@ -17,9 +17,9 @@ #include "MainBuildConfig.h" //for HOST versions: -//#include "tnlFastSweeping.h" +#include "tnlFastSweeping.h" //for DEVICE versions: -#include "tnlFastSweeping_CUDA.h" +//#include "tnlFastSweeping_CUDA.h" #include "fastSweepingConfig.h" #include <solvers/tnlConfigTags.h> @@ -43,7 +43,7 @@ int main( int argc, char* argv[] ) if( ! parseCommandLine( argc, argv, configDescription, parameters ) ) return false; - tnlFastSweeping<tnlGrid<2,double,tnlHost, int>, double, int> solver; + tnlFastSweeping<tnlGrid<3,double,tnlHost, int>, double, int> solver; if(!solver.init(parameters)) { cerr << "Solver failed to initialize." << endl; diff --git a/examples/fast-sweeping/tnlFastSweeping.h b/examples/fast-sweeping/tnlFastSweeping.h index 5f0871b764b080ba34c06ee53a79a540a6989fc4..c13a28c710785884839fb2f4f9899bbe04d60875 100644 --- a/examples/fast-sweeping/tnlFastSweeping.h +++ b/examples/fast-sweeping/tnlFastSweeping.h @@ -107,9 +107,87 @@ protected: }; + + + + + + + + +template< typename MeshReal, + typename Device, + typename MeshIndex, + typename Real, + typename Index > +class tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > +{ + +public: + typedef Real RealType; + typedef Device DeviceType; + typedef Index IndexType; + typedef tnlGrid< 3, Real, Device, Index > MeshType; + typedef tnlVector< RealType, DeviceType, IndexType> DofVectorType; + typedef typename MeshType::CoordinatesType CoordinatesType; + + + + static tnlString getType(); + bool init( const tnlParameterContainer& parameters ); + + bool initGrid(); + bool run(); + + //for single core version use this implementation: + void updateValue(const Index i, const Index j, const Index k); + //for parallel version use this one instead: +// void updateValue(const Index i, const Index j, DofVectorType* grid); + + + void setupSquare1000(Index i, Index j); + void setupSquare1100(Index i, Index j); + void setupSquare1010(Index i, Index j); + void setupSquare1001(Index i, Index j); + void setupSquare1110(Index i, Index j); + void setupSquare1101(Index i, Index j); + void setupSquare1011(Index i, Index j); + void setupSquare1111(Index i, Index j); + void setupSquare0000(Index i, Index j); + void setupSquare0100(Index i, Index j); + void setupSquare0010(Index i, Index j); + void setupSquare0001(Index i, Index j); + void setupSquare0110(Index i, Index j); + void setupSquare0101(Index i, Index j); + void setupSquare0011(Index i, Index j); + void setupSquare0111(Index i, Index j); + + Real fabsMin(const Real x, const Real y); + + +protected: + + MeshType Mesh; + + bool exactInput; + + DofVectorType dofVector, dofVector2; + + RealType h; + +#ifdef HAVE_OPENMP +// omp_lock_t* gridLock; +#endif + + +}; + + //for single core version use this implementation: #include "tnlFastSweeping2D_impl.h" //for parallel version use this one instead: // #include "tnlFastSweeping2D_openMP_impl.h" +#include "tnlFastSweeping3D_impl.h" + #endif /* TNLFASTSWEEPING_H_ */ diff --git a/examples/fast-sweeping/tnlFastSweeping3D_CUDA_impl.h b/examples/fast-sweeping/tnlFastSweeping3D_CUDA_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..f3f8cc216a0361b57d1ebcaca032d75deba5e0ec --- /dev/null +++ b/examples/fast-sweeping/tnlFastSweeping3D_CUDA_impl.h @@ -0,0 +1,1253 @@ +/*************************************************************************** + tnlFastSweeping2D_CUDA_v4_impl.h - description + ------------------- + begin : Oct 15 , 2015 + copyright : (C) 2015 by Tomas Sobotik + ***************************************************************************/ + +/*************************************************************************** + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + ***************************************************************************/ +#ifndef TNLFASTSWEEPING3D_IMPL_H_ +#define TNLFASTSWEEPING3D_IMPL_H_ + +#include "tnlFastSweeping.h" + +//__device__ +//double fabsMin( double x, double y) +//{ +// double fx = abs(x); +// +// if(Min(fx,abs(y)) == fx) +// return x; +// else +// return y; +//} +// +//__device__ +//double atomicFabsMin(double* address, double val) +//{ +// unsigned long long int* address_as_ull = +// (unsigned long long int*)address; +// unsigned long long int old = *address_as_ull, assumed; +// do { +// assumed = old; +// old = atomicCAS(address_as_ull, assumed,__double_as_longlong( fabsMin(assumed,val) )); +// } while (assumed != old); +// return __longlong_as_double(old); +//} + +template< typename MeshReal, + typename Device, + typename MeshIndex, + typename Real, + typename Index > +tnlString tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: getType() +{ + return tnlString( "tnlFastSweeping< " ) + + MeshType::getType() + ", " + + ::getType< Real >() + ", " + + ::getType< Index >() + " >"; +} + + + + +template< typename MeshReal, + typename Device, + typename MeshIndex, + typename Real, + typename Index > +bool tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: init( const tnlParameterContainer& parameters ) +{ + const tnlString& meshFile = parameters.getParameter< tnlString >( "mesh" ); + + if( ! Mesh.load( meshFile ) ) + { + cerr << "I am not able to load the mesh from the file " << meshFile << "." << endl; + return false; + } + + + const tnlString& initialCondition = parameters.getParameter <tnlString>("initial-condition"); + if( ! dofVector.load( initialCondition ) ) + { + cerr << "I am not able to load the initial condition from the file " << meshFile << "." << endl; + return false; + } + + h = Mesh.getHx(); + counter = 0; + + const tnlString& exact_input = parameters.getParameter< tnlString >( "exact-input" ); + + if(exact_input == "no") + exactInput=false; + else + exactInput=true; + + +#ifdef HAVE_CUDA + + cudaMalloc(&(cudaDofVector), this->dofVector.getSize()*sizeof(double)); + cudaMemcpy(cudaDofVector, this->dofVector.getData(), this->dofVector.getSize()*sizeof(double), cudaMemcpyHostToDevice); + + cudaMalloc(&(cudaDofVector2), this->dofVector.getSize()*sizeof(double)); + cudaMemcpy(cudaDofVector2, this->dofVector.getData(), this->dofVector.getSize()*sizeof(double), cudaMemcpyHostToDevice); + + + cudaMalloc(&(this->cudaSolver), sizeof(tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index >)); + cudaMemcpy(this->cudaSolver, this,sizeof(tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index >), cudaMemcpyHostToDevice); + +#endif + + int n = Mesh.getDimensions().x(); + dim3 threadsPerBlock(8, 8,8); + dim3 numBlocks(n/8 + 1, n/8 +1, n/8 +1); + + initCUDA<<<numBlocks,threadsPerBlock>>>(this->cudaSolver); + cudaDeviceSynchronize(); + checkCudaDevice; + + return true; +} + + + + + +template< typename MeshReal, + typename Device, + typename MeshIndex, + typename Real, + typename Index > +bool tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: run() +{ +// +// for(Index i = 0; i < Mesh.getDimensions().x(); i++) +// { +// for(Index j = 0; j < Mesh.getDimensions().y(); j++) +// { +// updateValue(i,j); +// } +// } +// +///*---------------------------------------------------------------------------------------------------------------------------*/ +// +// for(Index i = Mesh.getDimensions().x() - 1; i > -1; i--) +// { +// for(Index j = 0; j < Mesh.getDimensions().y(); j++) +// { +// updateValue(i,j); +// } +// } +// +///*---------------------------------------------------------------------------------------------------------------------------*/ +// +// for(Index i = Mesh.getDimensions().x() - 1; i > -1; i--) +// { +// for(Index j = Mesh.getDimensions().y() - 1; j > -1; j--) +// { +// updateValue(i,j); +// } +// } +// +///*---------------------------------------------------------------------------------------------------------------------------*/ +// for(Index i = 0; i < Mesh.getDimensions().x(); i++) +// { +// for(Index j = Mesh.getDimensions().y() - 1; j > -1; j--) +// { +// updateValue(i,j); +// } +// } +// +///*---------------------------------------------------------------------------------------------------------------------------*/ +// +// +// dofVector.save("u-00001.tnl"); + + int n = Mesh.getDimensions().x(); + dim3 threadsPerBlock(1, 512); + dim3 numBlocks(8,1); + + + runCUDA<<<numBlocks,threadsPerBlock>>>(this->cudaSolver,0,0); + + cudaDeviceSynchronize(); + checkCudaDevice; + + cudaMemcpy(this->dofVector.getData(), cudaDofVector2, this->dofVector.getSize()*sizeof(double), cudaMemcpyDeviceToHost); + cudaDeviceSynchronize(); + cudaFree(cudaDofVector); + cudaFree(cudaDofVector2); + cudaFree(cudaSolver); + dofVector.save("u-00001.tnl"); + cudaDeviceSynchronize(); + return true; +} + + + + +#ifdef HAVE_CUDA + + +template< typename MeshReal, + typename Device, + typename MeshIndex, + typename Real, + typename Index > +__device__ +void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: updateValue( Index i, Index j, Index k) +{ + Index index = Mesh.getCellIndex(CoordinatesType(i,j,k)); + Real value = cudaDofVector2[index]; + Real a,b,c, tmp; + + if( i == 0 ) + a = cudaDofVector2[Mesh.template getCellNextToCell<1,0,0>(index)]; + else if( i == Mesh.getDimensions().x() - 1 ) + a = cudaDofVector2[Mesh.template getCellNextToCell<-1,0,0>(index)]; + else + { + a = fabsMin( cudaDofVector2[Mesh.template getCellNextToCell<-1,0,0>(index)], + cudaDofVector2[Mesh.template getCellNextToCell<1,0,0>(index)] ); + } + + if( j == 0 ) + b = cudaDofVector2[Mesh.template getCellNextToCell<0,1,0>(index)]; + else if( j == Mesh.getDimensions().y() - 1 ) + b = cudaDofVector2[Mesh.template getCellNextToCell<0,-1,0>(index)]; + else + { + b = fabsMin( cudaDofVector2[Mesh.template getCellNextToCell<0,-1,0>(index)], + cudaDofVector2[Mesh.template getCellNextToCell<0,1,0>(index)] ); + } + + + if( k == 0 ) + c = cudaDofVector2[Mesh.template getCellNextToCell<0,0,1>(index)]; + else if( k == Mesh.getDimensions().z() - 1 ) + c = cudaDofVector2[Mesh.template getCellNextToCell<0,0,-1>(index)]; + else + { + c = fabsMin( cudaDofVector2[Mesh.template getCellNextToCell<0,0,-1>(index)], + cudaDofVector2[Mesh.template getCellNextToCell<0,0,1>(index)] ); + } + + + Real hD= 2.0*(a*a+b*b+c*c-a*b-a*c-b*c); + + if(hD >= 3.0*h*h) + tmp = fabsMin(a,fabsMin(b,c)) + Sign(value)*h; + else + tmp = (1.0/3.0) * ( a + b + c + Sign(value)*sqrt(3.0*h*h-hD) ); + + + +// cudaDofVector2[index] = fabsMin(value, tmp); + atomicFabsMin(&cudaDofVector2[index],tmp); + +} + + +template< typename MeshReal, + typename Device, + typename MeshIndex, + typename Real, + typename Index > +__device__ +bool tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: initGrid() +{ + int i = threadIdx.x + blockDim.x*blockIdx.x; + int j = blockDim.y*blockIdx.y + threadIdx.y; + int k = blockDim.z*blockIdx.z + threadIdx.z; + int gid = Mesh.getCellIndex(CoordinatesType(i,j,k)); + + cudaDofVector2[gid] = INT_MAX*Sign(cudaDofVector[gid]); + if(abs(cudaDofVector[gid]) < 1.8*this->h) + cudaDofVector2[gid] = cudaDofVector[gid]; +// if(i+1 < Mesh.getDimensions().x() && j+1 < Mesh.getDimensions().y() ) +// { +// if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))] > 0) +// { +// if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))] > 0) +// { +// if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))] > 0) +// { +// if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i+1,j+1))] > 0) +// setupSquare1111(i,j); +// else +// setupSquare1110(i,j); +// } +// else +// { +// if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i+1,j+1))] > 0) +// setupSquare1101(i,j); +// else +// setupSquare1100(i,j); +// } +// } +// else +// { +// if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))] > 0) +// { +// if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i+1,j+1))] > 0) +// setupSquare1011(i,j); +// else +// setupSquare1010(i,j); +// } +// else +// { +// if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i+1,j+1))] > 0) +// setupSquare1001(i,j); +// else +// setupSquare1000(i,j); +// } +// } +// } +// else +// { +// if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))] > 0) +// { +// if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))] > 0) +// { +// if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i+1,j+1))] > 0) +// setupSquare0111(i,j); +// else +// setupSquare0110(i,j); +// } +// else +// { +// if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i+1,j+1))] > 0) +// setupSquare0101(i,j); +// else +// setupSquare0100(i,j); +// } +// } +// else +// { +// if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))] > 0) +// { +// if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i+1,j+1))] > 0) +// setupSquare0011(i,j); +// else +// setupSquare0010(i,j); +// } +// else +// { +// if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i+1,j+1))] > 0) +// setupSquare0001(i,j); +// else +// setupSquare0000(i,j); +// } +// } +// } +// +// } + + return true; +// +// int total = blockDim.x*gridDim.x; +// +// +// +// Real tmp = 0.0; +// int flag = 0; +// counter = 0; +// tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))]); +// +// +// if(!exactInput) +// { +// cudaDofVector[gid]=cudaDofVector[gid]=0.5*h*Sign(cudaDofVector[gid]); +// } +// __threadfence(); +//// printf("-----------------------------------------------------------------------------------\n"); +// +// __threadfence(); +// +// if(gx > 0 && gx < Mesh.getDimensions().x()-1) +// { +// if(gy > 0 && gy < Mesh.getDimensions().y()-1) +// { +// +// Index j = gy; +// Index i = gx; +//// tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))]); +// +// if(tmp == 0.0) +// {} +// else if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 || +// cudaDofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 || +// cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 || +// cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 ) +// {} +// else +// flag=1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX; +// } +// } +// +//// printf("gx: %d, gy: %d, gid: %d \n", gx, gy,gid); +//// printf("****************************************************************\n"); +//// printf("gx: %d, gy: %d, gid: %d \n", gx, gy,gid); +// if(gx > 0 && gx < Mesh.getDimensions().x()-1 && gy == 0) +// { +//// printf("gx: %d, gy: %d, gid: %d \n", gx, gy,gid); +// Index j = 0; +// Index i = gx; +//// tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))]); +// +// +// if(tmp == 0.0) +// {} +// else if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 || +// cudaDofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 || +// cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 ) +// {} +// else +// flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX; +// } +// +//// printf("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n"); +// if(gx > 0 && gx < Mesh.getDimensions().x()-1 && gy == Mesh.getDimensions().y() - 1) +// { +// Index i = gx; +// Index j = Mesh.getDimensions().y() - 1; +//// tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))]); +// +// +// if(tmp == 0.0) +// {} +// else if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 || +// cudaDofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 || +// cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 ) +// {} +// else +// flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX; +// } +// +//// printf("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"); +// if(gy > 0 && gy < Mesh.getDimensions().y()-1 && gx == 0) +// { +// Index j = gy; +// Index i = 0; +//// tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))]); +// +// +// if(tmp == 0.0) +// {} +// else if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 || +// cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 || +// cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 ) +// {} +// else +// flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX; +// } +//// printf("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\n"); +// if(gy > 0 && gy < Mesh.getDimensions().y()-1 && gx == Mesh.getDimensions().x() - 1) +// { +// Index j = gy; +// Index i = Mesh.getDimensions().x() - 1; +//// tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))]); +// +// +// if(tmp == 0.0) +// {} +// else if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 || +// cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 || +// cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 ) +// {} +// else +// flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX; +// } +// +//// printf("##################################################################################################\n"); +// if(gx == Mesh.getDimensions().x() - 1 && +// gy == Mesh.getDimensions().y() - 1) +// { +// +//// tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))]); +// if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx-1,gy))]*tmp > 0.0 && +// cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy-1))]*tmp > 0.0) +// +// flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))] = tmp*INT_MAX; +// } +// if(gx == Mesh.getDimensions().x() - 1 && +// gy == 0) +// { +// +//// tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))]); +// if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx-1,gy))]*tmp > 0.0 && +// cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy+1))]*tmp > 0.0) +// +// flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))] = tmp*INT_MAX; +// } +//// printf("$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$\n"); +// if(gx == 0 && +// gy == Mesh.getDimensions().y() - 1) +// { +// +//// tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))]); +// if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx+1,gy))]*tmp > 0.0 && +// cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy-1))]*tmp > 0.0) +// +// flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))] = tmp*INT_MAX; +// } +// if(gx == 0 && +// gy == 0) +// { +//// tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))]); +// if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx+1,gy))]*tmp > 0.0 && +// cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy+1))]*tmp > 0.0) +// +// flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))] = tmp*INT_MAX; +// } +// +// __threadfence(); +// +// if(flag==1) +// cudaDofVector[gid] = tmp*3; +} + + +template< typename MeshReal, + typename Device, + typename MeshIndex, + typename Real, + typename Index > +__device__ +Real tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: fabsMin( Real x, Real y) +{ + Real fx = abs(x); + //Real fy = abs(y); + + //Real tmpMin = Min(fx,abs(y)); + + if(Min(fx,abs(y)) == fx) + return x; + else + return y; + + +} + + + +__global__ void runCUDA(tnlFastSweeping< tnlGrid< 3,double, tnlHost, int >, double, int >* solver, int sweep, int i) +{ + + int gx = 0; + int gy = threadIdx.y; + + //if(solver->Mesh.getDimensions().x() <= gx || solver->Mesh.getDimensions().y() <= gy) + // return; + int n = solver->Mesh.getDimensions().x(); + int blockCount = n/blockDim.y +1; + //int gid = solver->Mesh.getDimensions().x() * gy + gx; + //int max = solver->Mesh.getDimensions().x()*solver->Mesh.getDimensions().x(); + + //int id1 = gx+gy; + //int id2 = (solver->Mesh.getDimensions().x() - gx - 1) + gy; + + + if(blockIdx.x==0) + { + for(int gz = 0; gz < n;gz++) + { + gx = 0; + gy = threadIdx.y; + for(int k = 0; k < n*blockCount + blockDim.y; k++) + { + if(threadIdx.y < k+1 && gy < n) + { + solver->updateValue(gx,gy,gz); + gx++; + if(gx==n) + { + gx=0; + gy+=blockDim.y; + } + } + + + __syncthreads(); + } + __syncthreads(); + } + } + else if(blockIdx.x==1) + { + for(int gz = 0; gz < n;gz++) + { + gx=n-1; + gy=threadIdx.y; + + for(int k = 0; k < n*blockCount + blockDim.y; k++) + { + if(threadIdx.y < k+1 && gy < n) + { + solver->updateValue(gx,gy,gz); + gx--; + if(gx==-1) + { + gx=n-1; + gy+=blockDim.y; + } + } + + + __syncthreads(); + } + } + } + else if(blockIdx.x==2) + { + + for(int gz = 0; gz < n;gz++) + { + gx=0; + gy=n-threadIdx.y; + for(int k = 0; k < n*blockCount + blockDim.y; k++) + { + if(threadIdx.y < k+1 && gy > -1) + { + solver->updateValue(gx,gy,gz); + gx++; + if(gx==n) + { + gx=0; + gy-=blockDim.y; + } + } + + + __syncthreads(); + } + } + } + else if(blockIdx.x==3) + { + for(int gz = 0; gz < n;gz++) + { + gx=n-1; + gy=n-threadIdx.y; + + for(int k = 0; k < n*blockCount + blockDim.y; k++) + { + if(threadIdx.y < k+1 && gy > -1) + { + solver->updateValue(gx,gy,gz); + gx--; + if(gx==-1) + { + gx=n-1; + gy-=blockDim.y; + } + } + + + __syncthreads(); + } + } + } + + + + + else if(blockIdx.x==4) + { + for(int gz = n-1; gz > -1;gz--) + { + gx = 0; + gy = threadIdx.y; + for(int k = 0; k < n*blockCount + blockDim.y; k++) + { + if(threadIdx.y < k+1 && gy < n) + { + solver->updateValue(gx,gy,gz); + gx++; + if(gx==n) + { + gx=0; + gy+=blockDim.y; + } + } + + + __syncthreads(); + } + } + } + else if(blockIdx.x==5) + { + for(int gz = n-1; gz > -1;gz--) + { + gx=n-1; + gy=threadIdx.y; + + for(int k = 0; k < n*blockCount + blockDim.y; k++) + { + if(threadIdx.y < k+1 && gy < n) + { + solver->updateValue(gx,gy,gz); + gx--; + if(gx==-1) + { + gx=n-1; + gy+=blockDim.y; + } + } + + + __syncthreads(); + } + } + } + else if(blockIdx.x==6) + { + + for(int gz = n-1; gz > -1;gz--) + { + gx=0; + gy=n-threadIdx.y; + for(int k = 0; k < n*blockCount + blockDim.y; k++) + { + if(threadIdx.y < k+1 && gy > -1) + { + solver->updateValue(gx,gy,gz); + gx++; + if(gx==n) + { + gx=0; + gy-=blockDim.y; + } + } + + + __syncthreads(); + } + } + } + else if(blockIdx.x==7) + { + for(int gz = n-1; gz > -1;gz--) + { + gx=n-1; + gy=n-threadIdx.y; + + for(int k = 0; k < n*blockCount + blockDim.y; k++) + { + if(threadIdx.y < k+1 && gy > -1) + { + solver->updateValue(gx,gy,gz); + gx--; + if(gx==-1) + { + gx=n-1; + gy-=blockDim.y; + } + } + + + __syncthreads(); + } + } + } + + + + +} + + +__global__ void initCUDA(tnlFastSweeping< tnlGrid< 3,double, tnlHost, int >, double, int >* solver) +{ + int gx = threadIdx.x + blockDim.x*blockIdx.x; + int gy = blockDim.y*blockIdx.y + threadIdx.y; + int gz = blockDim.z*blockIdx.z + threadIdx.z; + + if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy && solver->Mesh.getDimensions().z() > gz) + { + solver->initGrid(); + } + + +} + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +template< typename MeshReal, + typename Device, + typename MeshIndex, + typename Real, + typename Index > +void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1111( Index i, Index j) +{ + Index index = Mesh.getCellIndex(CoordinatesType(i,j)); + cudaDofVector2[index]=fabsMin(INT_MAX,cudaDofVector2[(index)]); + cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(INT_MAX,cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]); + cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(INT_MAX,cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]); + cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(INT_MAX,cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]); + +} + + +template< typename MeshReal, + typename Device, + typename MeshIndex, + typename Real, + typename Index > +void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0000( Index i, Index j) +{ + Index index = Mesh.getCellIndex(CoordinatesType(i,j)); + cudaDofVector2[index]=fabsMin(-INT_MAX,cudaDofVector2[(index)]); + cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(-INT_MAX,cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]); + cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(-INT_MAX,cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]); + cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(-INT_MAX,cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]); + +} + + +template< typename MeshReal, + typename Device, + typename MeshIndex, + typename Real, + typename Index > +void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1110( Index i, Index j) +{ + Index index = Mesh.getCellIndex(CoordinatesType(i,j)); + Real al,be, a,b,c,s; + al=abs(cudaDofVector[Mesh.template getCellNextToCell<1,1>(index)]/ + (cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)]- + cudaDofVector[Mesh.template getCellNextToCell<1,1>(index)])); + + be=abs(cudaDofVector[Mesh.template getCellNextToCell<1,1>(index)]/ + (cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)]- + cudaDofVector[Mesh.template getCellNextToCell<1,1>(index)])); + + a = be/al; + b=1.0; + c=-be; + s= h/sqrt(a*a+b*b); + + + cudaDofVector2[index]=fabsMin(abs(a*1+b*1+c)*s,cudaDofVector2[(index)]); + cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(abs(a*0+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]); + cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(-abs(a*0+b*0+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]); + cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(abs(a*1+b*0+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]); + +} + +template< typename MeshReal, + typename Device, + typename MeshIndex, + typename Real, + typename Index > +void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1101( Index i, Index j) +{ + Index index = Mesh.getCellIndex(CoordinatesType(i,j)); + Real al,be, a,b,c,s; + al=abs(cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)]/ + (cudaDofVector[Mesh.template getCellNextToCell<1,1>(index)]- + cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)])); + + be=abs(cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)]/ + (cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)]- + cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)])); + + a = be/al; + b=1.0; + c=-be; + s= h/sqrt(a*a+b*b); + + + cudaDofVector2[index]=fabsMin(abs(a*0+b*1+c)*s,cudaDofVector2[(index)]); + cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(abs(a*0+b*0+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]); + cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(abs(a*1+b*0+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]); + cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(-abs(a*1+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]); + +} + +template< typename MeshReal, + typename Device, + typename MeshIndex, + typename Real, + typename Index > +void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1011( Index i, Index j) +{ + Index index = Mesh.getCellIndex(CoordinatesType(i,j)); + Real al,be, a,b,c,s; + al=abs(cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)]/ + (cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)]- + cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)])); + + be=abs(cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)]/ + (cudaDofVector[Mesh.template getCellNextToCell<1,1>(index)]- + cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)])); + + a = be/al; + b=1.0; + c=-be; + s= h/sqrt(a*a+b*b); + + + cudaDofVector2[index]=fabsMin(abs(a*1+b*0+c)*s,cudaDofVector2[(index)]); + cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(-abs(a*1+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]); + cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(abs(a*0+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]); + cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(abs(a*0+b*0+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]); + +} + +template< typename MeshReal, + typename Device, + typename MeshIndex, + typename Real, + typename Index > +void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0111( Index i, Index j) +{ + Index index = Mesh.getCellIndex(CoordinatesType(i,j)); + Real al,be, a,b,c,s; + al=abs(cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)]/ + (cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)]- + cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)])); + + be=abs(cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)]/ + (cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)]- + cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)])); + + a = be/al; + b=1.0; + c=-be; + s= h/sqrt(a*a+b*b); + + + cudaDofVector2[index]=fabsMin(-abs(a*0+b*0+c)*s,cudaDofVector2[(index)]); + cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(abs(a*1+b*0+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]); + cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(abs(a*1+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]); + cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(abs(a*0+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]); + +} + + +template< typename MeshReal, + typename Device, + typename MeshIndex, + typename Real, + typename Index > +void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0001( Index i, Index j) +{ + Index index = Mesh.getCellIndex(CoordinatesType(i,j)); + Real al,be, a,b,c,s; + al=abs(cudaDofVector[Mesh.template getCellNextToCell<1,1>(index)]/ + (cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)]- + cudaDofVector[Mesh.template getCellNextToCell<1,1>(index)])); + + be=abs(cudaDofVector[Mesh.template getCellNextToCell<1,1>(index)]/ + (cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)]- + cudaDofVector[Mesh.template getCellNextToCell<1,1>(index)])); + + a = be/al; + b=1.0; + c=-be; + s= h/sqrt(a*a+b*b); + + + cudaDofVector2[index]=fabsMin(-abs(a*1+b*1+c)*s,cudaDofVector2[(index)]); + cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(-abs(a*0+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]); + cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(abs(a*0+b*0+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]); + cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(-abs(a*1+b*0+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]); + +} + +template< typename MeshReal, + typename Device, + typename MeshIndex, + typename Real, + typename Index > +void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0010( Index i, Index j) +{ + Index index = Mesh.getCellIndex(CoordinatesType(i,j)); + Real al,be, a,b,c,s; + al=abs(cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)]/ + (cudaDofVector[Mesh.template getCellNextToCell<1,1>(index)]- + cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)])); + + be=abs(cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)]/ + (cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)]- + cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)])); + + a = be/al; + b=1.0; + c=-be; + s= h/sqrt(a*a+b*b); + + + cudaDofVector2[index]=fabsMin(-abs(a*0+b*1+c)*s,cudaDofVector2[(index)]); + cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(-abs(a*0+b*0+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]); + cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(-abs(a*1+b*0+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]); + cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(abs(a*1+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]); + +} + +template< typename MeshReal, + typename Device, + typename MeshIndex, + typename Real, + typename Index > +void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0100( Index i, Index j) +{ + Index index = Mesh.getCellIndex(CoordinatesType(i,j)); + Real al,be, a,b,c,s; + al=abs(cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)]/ + (cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)]- + cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)])); + + be=abs(cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)]/ + (cudaDofVector[Mesh.template getCellNextToCell<1,1>(index)]- + cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)])); + + a = be/al; + b=1.0; + c=-be; + s= h/sqrt(a*a+b*b); + + + cudaDofVector2[index]=fabsMin(-abs(a*1+b*0+c)*s,cudaDofVector2[(index)]); + cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(abs(a*1+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]); + cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(-abs(a*0+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]); + cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(-abs(a*0+b*0+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]); + +} + +template< typename MeshReal, + typename Device, + typename MeshIndex, + typename Real, + typename Index > +void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1000( Index i, Index j) +{ + Index index = Mesh.getCellIndex(CoordinatesType(i,j)); + Real al,be, a,b,c,s; + al=abs(cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)]/ + (cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)]- + cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)])); + + be=abs(cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)]/ + (cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)]- + cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)])); + + a = be/al; + b=1.0; + c=-be; + s= h/sqrt(a*a+b*b); + + + cudaDofVector2[index]=fabsMin(abs(a*0+b*0+c)*s,cudaDofVector2[(index)]); + cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(-abs(a*1+b*0+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]); + cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(-abs(a*1+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]); + cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(-abs(a*0+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]); + +} + + + + + +template< typename MeshReal, + typename Device, + typename MeshIndex, + typename Real, + typename Index > +void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1100( Index i, Index j) +{ + Index index = Mesh.getCellIndex(CoordinatesType(i,j)); + Real al,be, a,b,c,s; + al=abs(cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)]/ + (cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)]- + cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)])); + + be=abs(cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)]/ + (cudaDofVector[Mesh.template getCellNextToCell<1,1>(index)]- + cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)])); + + a = al-be; + b=1.0; + c=-al; + s= h/sqrt(a*a+b*b); + + + cudaDofVector2[index]=fabsMin(abs(a*0+b*0+c)*s,cudaDofVector2[(index)]); + cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(-abs(a*0+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]); + cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(-abs(a*1+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]); + cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(abs(a*1+b*0+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]); + +} + +template< typename MeshReal, + typename Device, + typename MeshIndex, + typename Real, + typename Index > +void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1010( Index i, Index j) +{ + Index index = Mesh.getCellIndex(CoordinatesType(i,j)); + Real al,be, a,b,c,s; + al=abs(cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)]/ + (cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)]- + cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)])); + + be=abs(cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)]/ + (cudaDofVector[Mesh.template getCellNextToCell<1,1>(index)]- + cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)])); + + a = al-be; + b=1.0; + c=-be; + s= h/sqrt(a*a+b*b); + + + cudaDofVector2[index]=fabsMin(abs(a*0+b*0+c)*s,cudaDofVector2[(index)]); + cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(abs(a*1+b*0+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]); + cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(-abs(a*1+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]); + cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(-abs(a*0+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]); + +} + +template< typename MeshReal, + typename Device, + typename MeshIndex, + typename Real, + typename Index > +void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1001( Index i, Index j) +{ + Index index = Mesh.getCellIndex(CoordinatesType(i,j)); + cudaDofVector2[index]=fabsMin(cudaDofVector[index],cudaDofVector2[(index)]); + cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)],cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]); + cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(cudaDofVector[Mesh.template getCellNextToCell<1,1>(index)],cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]); + cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)],cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]); + +} + + + + + + + +template< typename MeshReal, + typename Device, + typename MeshIndex, + typename Real, + typename Index > +void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0011( Index i, Index j) +{ + Index index = Mesh.getCellIndex(CoordinatesType(i,j)); + Real al,be, a,b,c,s; + al=abs(cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)]/ + (cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)]- + cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)])); + + be=abs(cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)]/ + (cudaDofVector[Mesh.template getCellNextToCell<1,1>(index)]- + cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)])); + + a = al-be; + b=1.0; + c=-al; + s= h/sqrt(a*a+b*b); + + + cudaDofVector2[index]=fabsMin(-abs(a*0+b*0+c)*s,cudaDofVector2[(index)]); + cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(abs(a*0+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]); + cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(abs(a*1+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]); + cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(-abs(a*1+b*0+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]); + +} + +template< typename MeshReal, + typename Device, + typename MeshIndex, + typename Real, + typename Index > +void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0101( Index i, Index j) +{ + Index index = Mesh.getCellIndex(CoordinatesType(i,j)); + Real al,be, a,b,c,s; + al=abs(cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)]/ + (cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)]- + cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)])); + + be=abs(cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)]/ + (cudaDofVector[Mesh.template getCellNextToCell<1,1>(index)]- + cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)])); + + a = al-be; + b=1.0; + c=-be; + s= h/sqrt(a*a+b*b); + + + cudaDofVector2[index]=fabsMin(-abs(a*0+b*0+c)*s,cudaDofVector2[(index)]); + cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(-abs(a*1+b*0+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]); + cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(abs(a*1+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]); + cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(abs(a*0+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]); + +} + +template< typename MeshReal, + typename Device, + typename MeshIndex, + typename Real, + typename Index > +void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0110( Index i, Index j) +{ + Index index = Mesh.getCellIndex(CoordinatesType(i,j)); + cudaDofVector2[index]=fabsMin(cudaDofVector[index],cudaDofVector2[(index)]); + cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)],cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]); + cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(cudaDofVector[Mesh.template getCellNextToCell<1,1>(index)],cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]); + cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)],cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]); +} +#endif + + + + +#endif /* TNLFASTSWEEPING_IMPL_H_ */ diff --git a/examples/fast-sweeping/tnlFastSweeping3D_impl.h b/examples/fast-sweeping/tnlFastSweeping3D_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..2f5eeb07764dd7ac2d8137030967c932841fbc84 --- /dev/null +++ b/examples/fast-sweeping/tnlFastSweeping3D_impl.h @@ -0,0 +1,946 @@ +/*************************************************************************** + tnlFastSweeping2D_impl.h - description + ------------------- + begin : Oct 15 , 2015 + copyright : (C) 2015 by Tomas Sobotik + ***************************************************************************/ + +/*************************************************************************** + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + ***************************************************************************/ +#ifndef TNLFASTSWEEPING3D_IMPL_H_ +#define TNLFASTSWEEPING3D_IMPL_H_ + +#include "tnlFastSweeping.h" + +template< typename MeshReal, + typename Device, + typename MeshIndex, + typename Real, + typename Index > +tnlString tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: getType() +{ + return tnlString( "tnlFastSweeping< " ) + + MeshType::getType() + ", " + + ::getType< Real >() + ", " + + ::getType< Index >() + " >"; +} + + + + +template< typename MeshReal, + typename Device, + typename MeshIndex, + typename Real, + typename Index > +bool tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: init( const tnlParameterContainer& parameters ) +{ + const tnlString& meshFile = parameters.getParameter< tnlString >( "mesh" ); + + if( ! Mesh.load( meshFile ) ) + { + cerr << "I am not able to load the mesh from the file " << meshFile << "." << endl; + return false; + } + + + const tnlString& initialCondition = parameters.getParameter <tnlString>("initial-condition"); + if( ! dofVector.load( initialCondition ) ) + { + cerr << "I am not able to load the initial condition from the file " << meshFile << "." << endl; + return false; + } + dofVector2.setLike(dofVector); + + h = Mesh.getHx(); + + const tnlString& exact_input = parameters.getParameter< tnlString >( "exact-input" ); + + if(exact_input == "no") + exactInput=false; + else + exactInput=true; + cout << "bla "<<endl; + return initGrid(); +} + + +template< typename MeshReal, + typename Device, + typename MeshIndex, + typename Real, + typename Index > +bool tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: initGrid() +{ + + for(int i=0; i< Mesh.getDimensions().x()*Mesh.getDimensions().y()*Mesh.getDimensions().z();i++) + { + dofVector2[i]=INT_MAX*Sign(dofVector[i]); + if (abs(dofVector[i]) < 1.8*h) + dofVector2[i]=dofVector[i]; + } + cout << "bla 2"<<endl; +// +// for(int i = 0 ; i < Mesh.getDimensions().x()-1; i++) +// { +// for(int j = 0 ; j < Mesh.getDimensions().x()-1; j++) +// { +// if(dofVector[Mesh.getCellIndex(CoordinatesType(i,j))] > 0) +// { +// if(dofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))] > 0) +// { +// if(dofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))] > 0) +// { +// if(dofVector[Mesh.getCellIndex(CoordinatesType(i+1,j+1))] > 0) +// setupSquare1111(i,j); +// else +// setupSquare1110(i,j); +// } +// else +// { +// if(dofVector[Mesh.getCellIndex(CoordinatesType(i+1,j+1))] > 0) +// setupSquare1101(i,j); +// else +// setupSquare1100(i,j); +// } +// } +// else +// { +// if(dofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))] > 0) +// { +// if(dofVector[Mesh.getCellIndex(CoordinatesType(i+1,j+1))] > 0) +// setupSquare1011(i,j); +// else +// setupSquare1010(i,j); +// } +// else +// { +// if(dofVector[Mesh.getCellIndex(CoordinatesType(i+1,j+1))] > 0) +// setupSquare1001(i,j); +// else +// setupSquare1000(i,j); +// } +// } +// } +// else +// { +// if(dofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))] > 0) +// { +// if(dofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))] > 0) +// { +// if(dofVector[Mesh.getCellIndex(CoordinatesType(i+1,j+1))] > 0) +// setupSquare0111(i,j); +// else +// setupSquare0110(i,j); +// } +// else +// { +// if(dofVector[Mesh.getCellIndex(CoordinatesType(i+1,j+1))] > 0) +// setupSquare0101(i,j); +// else +// setupSquare0100(i,j); +// } +// } +// else +// { +// if(dofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))] > 0) +// { +// if(dofVector[Mesh.getCellIndex(CoordinatesType(i+1,j+1))] > 0) +// setupSquare0011(i,j); +// else +// setupSquare0010(i,j); +// } +// else +// { +// if(dofVector[Mesh.getCellIndex(CoordinatesType(i+1,j+1))] > 0) +// setupSquare0001(i,j); +// else +// setupSquare0000(i,j); +// } +// } +// } +// +// } +// } + +// Real tmp = 0.0; +// Real ax=0.5/sqrt(2.0); +// +// if(!exactInput) +// { +// for(Index i = 0; i < Mesh.getDimensions().x()*Mesh.getDimensions().y(); i++) +// dofVector[i]=0.5*h*Sign(dofVector[i]); +// } +// +// +// for(Index i = 1; i < Mesh.getDimensions().x()-1; i++) +// { +// for(Index j = 1; j < Mesh.getDimensions().y()-1; j++) +// { +// tmp = Sign(dofVector[Mesh.getCellIndex(CoordinatesType(i,j))]); +// +// if(tmp == 0.0) +// {} +// else if(dofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 || +// dofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 || +// dofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 || +// dofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 ) +// {} +// else +// dofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX; +// } +// } +// +// +// +// for(int i = 1; i < Mesh.getDimensions().x()-1; i++) +// { +// Index j = 0; +// tmp = Sign(dofVector[Mesh.getCellIndex(CoordinatesType(i,j))]); +// +// +// if(tmp == 0.0) +// {} +// else if(dofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 || +// dofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 || +// dofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 ) +// {} +// else +// dofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX; +// } +// +// for(int i = 1; i < Mesh.getDimensions().x()-1; i++) +// { +// Index j = Mesh.getDimensions().y() - 1; +// tmp = Sign(dofVector[Mesh.getCellIndex(CoordinatesType(i,j))]); +// +// +// if(tmp == 0.0) +// {} +// else if(dofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 || +// dofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 || +// dofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 ) +// {} +// else +// dofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX; +// } +// +// for(int j = 1; j < Mesh.getDimensions().y()-1; j++) +// { +// Index i = 0; +// tmp = Sign(dofVector[Mesh.getCellIndex(CoordinatesType(i,j))]); +// +// +// if(tmp == 0.0) +// {} +// else if(dofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 || +// dofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 || +// dofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 ) +// {} +// else +// dofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX; +// } +// +// for(int j = 1; j < Mesh.getDimensions().y()-1; j++) +// { +// Index i = Mesh.getDimensions().x() - 1; +// tmp = Sign(dofVector[Mesh.getCellIndex(CoordinatesType(i,j))]); +// +// +// if(tmp == 0.0) +// {} +// else if(dofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 || +// dofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 || +// dofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 ) +// {} +// else +// dofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX; +// } +// +// +// Index i = Mesh.getDimensions().x() - 1; +// Index j = Mesh.getDimensions().y() - 1; +// +// tmp = Sign(dofVector[Mesh.getCellIndex(CoordinatesType(i,j))]); +// if(dofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp > 0.0 && +// dofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp > 0.0) +// +// dofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX; +// +// +// +// j = 0; +// tmp = Sign(dofVector[Mesh.getCellIndex(CoordinatesType(i,j))]); +// if(dofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp > 0.0 && +// dofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp > 0.0) +// +// dofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX; +// +// +// +// i = 0; +// j = Mesh.getDimensions().y() -1; +// tmp = Sign(dofVector[Mesh.getCellIndex(CoordinatesType(i,j))]); +// if(dofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp > 0.0 && +// dofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp > 0.0) +// +// dofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX; +// +// +// +// j = 0; +// tmp = Sign(dofVector[Mesh.getCellIndex(CoordinatesType(i,j))]); +// if(dofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp > 0.0 && +// dofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp > 0.0) +// +// dofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX; + + + dofVector2.save("u-00000.tnl"); + + return true; +} + + + +template< typename MeshReal, + typename Device, + typename MeshIndex, + typename Real, + typename Index > +bool tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: run() +{ + + for(Index k = 0; k < Mesh.getDimensions().z(); k++) + { + for(Index i = 0; i < Mesh.getDimensions().x(); i++) + { + for(Index j = 0; j < Mesh.getDimensions().y(); j++) + { + updateValue(i,j,k); + } + } + } + +/*---------------------------------------------------------------------------------------------------------------------------*/ + + for(Index k = 0; k < Mesh.getDimensions().z(); k++) + { + for(Index i = Mesh.getDimensions().x() - 1; i > -1; i--) + { + for(Index j = 0; j < Mesh.getDimensions().y(); j++) + { + updateValue(i,j,k); + } + } + } + +/*---------------------------------------------------------------------------------------------------------------------------*/ + + for(Index k = 0; k < Mesh.getDimensions().z(); k++) + { + for(Index i = Mesh.getDimensions().x() - 1; i > -1; i--) + { + for(Index j = Mesh.getDimensions().y() - 1; j > -1; j--) + { + updateValue(i,j,k); + } + } + } + +/*---------------------------------------------------------------------------------------------------------------------------*/ + for(Index k = 0; k < Mesh.getDimensions().z(); k++) + { + for(Index i = 0; i < Mesh.getDimensions().x(); i++) + { + for(Index j = Mesh.getDimensions().y() - 1; j > -1; j--) + { + updateValue(i,j,k); + } + } + } + +/*---------------------------------------------------------------------------------------------------------------------------*/ + + + + + + + + + for(Index k = Mesh.getDimensions().z() -1; k > -1; k--) + { + for(Index i = 0; i < Mesh.getDimensions().x(); i++) + { + for(Index j = 0; j < Mesh.getDimensions().y(); j++) + { + updateValue(i,j,k); + } + } + } + +/*---------------------------------------------------------------------------------------------------------------------------*/ + + for(Index k = Mesh.getDimensions().z() -1; k > -1; k--) + { + for(Index i = Mesh.getDimensions().x() - 1; i > -1; i--) + { + for(Index j = 0; j < Mesh.getDimensions().y(); j++) + { + updateValue(i,j,k); + } + } + } + +/*---------------------------------------------------------------------------------------------------------------------------*/ + + for(Index k = Mesh.getDimensions().z() -1; k > -1; k--) + { + for(Index i = Mesh.getDimensions().x() - 1; i > -1; i--) + { + for(Index j = Mesh.getDimensions().y() - 1; j > -1; j--) + { + updateValue(i,j,k); + } + } + } + +/*---------------------------------------------------------------------------------------------------------------------------*/ + for(Index k = Mesh.getDimensions().z() -1; k > -1; k--) + { + for(Index i = 0; i < Mesh.getDimensions().x(); i++) + { + for(Index j = Mesh.getDimensions().y() - 1; j > -1; j--) + { + updateValue(i,j,k); + } + } + } + +/*---------------------------------------------------------------------------------------------------------------------------*/ + + + dofVector2.save("u-00001.tnl"); + + cout << "bla 3"<<endl; + return true; +} + + +template< typename MeshReal, + typename Device, + typename MeshIndex, + typename Real, + typename Index > +void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: updateValue( Index i, Index j, Index k) +{ + Index index = Mesh.getCellIndex(CoordinatesType(i,j,k)); + Real value = dofVector2[index]; + Real a,b,c, tmp; + + if( i == 0 ) + a = dofVector2[Mesh.template getCellNextToCell<1,0,0>(index)]; + else if( i == Mesh.getDimensions().x() - 1 ) + a = dofVector2[Mesh.template getCellNextToCell<-1,0,0>(index)]; + else + { + a = fabsMin( dofVector2[Mesh.template getCellNextToCell<-1,0,0>(index)], + dofVector2[Mesh.template getCellNextToCell<1,0,0>(index)] ); + } + + if( j == 0 ) + b = dofVector2[Mesh.template getCellNextToCell<0,1,0>(index)]; + else if( j == Mesh.getDimensions().y() - 1 ) + b = dofVector2[Mesh.template getCellNextToCell<0,-1,0>(index)]; + else + { + b = fabsMin( dofVector2[Mesh.template getCellNextToCell<0,-1,0>(index)], + dofVector2[Mesh.template getCellNextToCell<0,1,0>(index)] ); + } + + if( k == 0 ) + c = dofVector2[Mesh.template getCellNextToCell<0,0,1>(index)]; + else if( k == Mesh.getDimensions().z() - 1 ) + c = dofVector2[Mesh.template getCellNextToCell<0,0,-1>(index)]; + else + { + c = fabsMin( dofVector2[Mesh.template getCellNextToCell<0,0,-1>(index)], + dofVector2[Mesh.template getCellNextToCell<0,0,1>(index)] ); + } + + Real hD= 2.0*(a*a+b*b+c*c-a*b-a*c-b*c); + + if(hD >= 3.0*h) + tmp = fabsMin(a,fabsMin(b,c)) + Sign(value)*h; + else + tmp = (1.0/3.0) * ( a + b + c + Sign(value)*sqrt(3.0*h*h-hD) ); + + + dofVector2[index] = fabsMin(value, tmp); +} + + +template< typename MeshReal, + typename Device, + typename MeshIndex, + typename Real, + typename Index > +Real tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: fabsMin( Real x, Real y) +{ + Real fx = fabs(x); + Real fy = fabs(y); + + Real tmpMin = Min(fx,fy); + + if(tmpMin == fx) + return x; + else + return y; + +} + + + +template< typename MeshReal, + typename Device, + typename MeshIndex, + typename Real, + typename Index > +void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1111( Index i, Index j) +{ + Index index = Mesh.getCellIndex(CoordinatesType(i,j)); + dofVector2[index]=fabsMin(INT_MAX,dofVector2[(index)]); + dofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(INT_MAX,dofVector2[Mesh.template getCellNextToCell<0,1>(index)]); + dofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(INT_MAX,dofVector2[Mesh.template getCellNextToCell<1,1>(index)]); + dofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(INT_MAX,dofVector2[Mesh.template getCellNextToCell<1,0>(index)]); + +} + + +template< typename MeshReal, + typename Device, + typename MeshIndex, + typename Real, + typename Index > +void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0000( Index i, Index j) +{ + Index index = Mesh.getCellIndex(CoordinatesType(i,j)); + dofVector2[index]=fabsMin(-INT_MAX,dofVector2[(index)]); + dofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(-INT_MAX,dofVector2[Mesh.template getCellNextToCell<0,1>(index)]); + dofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(-INT_MAX,dofVector2[Mesh.template getCellNextToCell<1,1>(index)]); + dofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(-INT_MAX,dofVector2[Mesh.template getCellNextToCell<1,0>(index)]); + +} + + +template< typename MeshReal, + typename Device, + typename MeshIndex, + typename Real, + typename Index > +void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1110( Index i, Index j) +{ + Index index = Mesh.getCellIndex(CoordinatesType(i,j)); + Real al,be, a,b,c,s; + al=abs(dofVector[Mesh.template getCellNextToCell<1,1>(index)]/ + (dofVector[Mesh.template getCellNextToCell<1,0>(index)]- + dofVector[Mesh.template getCellNextToCell<1,1>(index)])); + + be=abs(dofVector[Mesh.template getCellNextToCell<1,1>(index)]/ + (dofVector[Mesh.template getCellNextToCell<0,1>(index)]- + dofVector[Mesh.template getCellNextToCell<1,1>(index)])); + + a = be/al; + b=1.0; + c=-be; + s= h/sqrt(a*a+b*b); + + + dofVector2[index]=fabsMin(abs(a*1+b*1+c)*s,dofVector2[(index)]); + dofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(abs(a*0+b*1+c)*s,dofVector2[Mesh.template getCellNextToCell<0,1>(index)]); + dofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(-abs(a*0+b*0+c)*s,dofVector2[Mesh.template getCellNextToCell<1,1>(index)]); + dofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(abs(a*1+b*0+c)*s,dofVector2[Mesh.template getCellNextToCell<1,0>(index)]); + +} + +template< typename MeshReal, + typename Device, + typename MeshIndex, + typename Real, + typename Index > +void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1101( Index i, Index j) +{ + Index index = Mesh.getCellIndex(CoordinatesType(i,j)); + Real al,be, a,b,c,s; + al=abs(dofVector[Mesh.template getCellNextToCell<0,1>(index)]/ + (dofVector[Mesh.template getCellNextToCell<1,1>(index)]- + dofVector[Mesh.template getCellNextToCell<0,1>(index)])); + + be=abs(dofVector[Mesh.template getCellNextToCell<0,1>(index)]/ + (dofVector[Mesh.template getCellNextToCell<0,0>(index)]- + dofVector[Mesh.template getCellNextToCell<0,1>(index)])); + + a = be/al; + b=1.0; + c=-be; + s= h/sqrt(a*a+b*b); + + + dofVector2[index]=fabsMin(abs(a*0+b*1+c)*s,dofVector2[(index)]); + dofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(abs(a*0+b*0+c)*s,dofVector2[Mesh.template getCellNextToCell<0,1>(index)]); + dofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(abs(a*1+b*0+c)*s,dofVector2[Mesh.template getCellNextToCell<1,1>(index)]); + dofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(-abs(a*1+b*1+c)*s,dofVector2[Mesh.template getCellNextToCell<1,0>(index)]); + +} + +template< typename MeshReal, + typename Device, + typename MeshIndex, + typename Real, + typename Index > +void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1011( Index i, Index j) +{ + Index index = Mesh.getCellIndex(CoordinatesType(i,j)); + Real al,be, a,b,c,s; + al=abs(dofVector[Mesh.template getCellNextToCell<1,0>(index)]/ + (dofVector[Mesh.template getCellNextToCell<0,0>(index)]- + dofVector[Mesh.template getCellNextToCell<1,0>(index)])); + + be=abs(dofVector[Mesh.template getCellNextToCell<1,0>(index)]/ + (dofVector[Mesh.template getCellNextToCell<1,1>(index)]- + dofVector[Mesh.template getCellNextToCell<1,0>(index)])); + + a = be/al; + b=1.0; + c=-be; + s= h/sqrt(a*a+b*b); + + + dofVector2[index]=fabsMin(abs(a*1+b*0+c)*s,dofVector2[(index)]); + dofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(-abs(a*1+b*1+c)*s,dofVector2[Mesh.template getCellNextToCell<0,1>(index)]); + dofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(abs(a*0+b*1+c)*s,dofVector2[Mesh.template getCellNextToCell<1,1>(index)]); + dofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(abs(a*0+b*0+c)*s,dofVector2[Mesh.template getCellNextToCell<1,0>(index)]); + +} + +template< typename MeshReal, + typename Device, + typename MeshIndex, + typename Real, + typename Index > +void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0111( Index i, Index j) +{ + Index index = Mesh.getCellIndex(CoordinatesType(i,j)); + Real al,be, a,b,c,s; + al=abs(dofVector[Mesh.template getCellNextToCell<0,0>(index)]/ + (dofVector[Mesh.template getCellNextToCell<0,1>(index)]- + dofVector[Mesh.template getCellNextToCell<0,0>(index)])); + + be=abs(dofVector[Mesh.template getCellNextToCell<0,0>(index)]/ + (dofVector[Mesh.template getCellNextToCell<1,0>(index)]- + dofVector[Mesh.template getCellNextToCell<0,0>(index)])); + + a = be/al; + b=1.0; + c=-be; + s= h/sqrt(a*a+b*b); + + + dofVector2[index]=fabsMin(-abs(a*0+b*0+c)*s,dofVector2[(index)]); + dofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(abs(a*1+b*0+c)*s,dofVector2[Mesh.template getCellNextToCell<0,1>(index)]); + dofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(abs(a*1+b*1+c)*s,dofVector2[Mesh.template getCellNextToCell<1,1>(index)]); + dofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(abs(a*0+b*1+c)*s,dofVector2[Mesh.template getCellNextToCell<1,0>(index)]); + +} + + +template< typename MeshReal, + typename Device, + typename MeshIndex, + typename Real, + typename Index > +void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0001( Index i, Index j) +{ + Index index = Mesh.getCellIndex(CoordinatesType(i,j)); + Real al,be, a,b,c,s; + al=abs(dofVector[Mesh.template getCellNextToCell<1,1>(index)]/ + (dofVector[Mesh.template getCellNextToCell<1,0>(index)]- + dofVector[Mesh.template getCellNextToCell<1,1>(index)])); + + be=abs(dofVector[Mesh.template getCellNextToCell<1,1>(index)]/ + (dofVector[Mesh.template getCellNextToCell<0,1>(index)]- + dofVector[Mesh.template getCellNextToCell<1,1>(index)])); + + a = be/al; + b=1.0; + c=-be; + s= h/sqrt(a*a+b*b); + + + dofVector2[index]=fabsMin(-abs(a*1+b*1+c)*s,dofVector2[(index)]); + dofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(-abs(a*0+b*1+c)*s,dofVector2[Mesh.template getCellNextToCell<0,1>(index)]); + dofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(abs(a*0+b*0+c)*s,dofVector2[Mesh.template getCellNextToCell<1,1>(index)]); + dofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(-abs(a*1+b*0+c)*s,dofVector2[Mesh.template getCellNextToCell<1,0>(index)]); + +} + +template< typename MeshReal, + typename Device, + typename MeshIndex, + typename Real, + typename Index > +void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0010( Index i, Index j) +{ + Index index = Mesh.getCellIndex(CoordinatesType(i,j)); + Real al,be, a,b,c,s; + al=abs(dofVector[Mesh.template getCellNextToCell<0,1>(index)]/ + (dofVector[Mesh.template getCellNextToCell<1,1>(index)]- + dofVector[Mesh.template getCellNextToCell<0,1>(index)])); + + be=abs(dofVector[Mesh.template getCellNextToCell<0,1>(index)]/ + (dofVector[Mesh.template getCellNextToCell<0,0>(index)]- + dofVector[Mesh.template getCellNextToCell<0,1>(index)])); + + a = be/al; + b=1.0; + c=-be; + s= h/sqrt(a*a+b*b); + + + dofVector2[index]=fabsMin(-abs(a*0+b*1+c)*s,dofVector2[(index)]); + dofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(-abs(a*0+b*0+c)*s,dofVector2[Mesh.template getCellNextToCell<0,1>(index)]); + dofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(-abs(a*1+b*0+c)*s,dofVector2[Mesh.template getCellNextToCell<1,1>(index)]); + dofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(abs(a*1+b*1+c)*s,dofVector2[Mesh.template getCellNextToCell<1,0>(index)]); + +} + +template< typename MeshReal, + typename Device, + typename MeshIndex, + typename Real, + typename Index > +void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0100( Index i, Index j) +{ + Index index = Mesh.getCellIndex(CoordinatesType(i,j)); + Real al,be, a,b,c,s; + al=abs(dofVector[Mesh.template getCellNextToCell<1,0>(index)]/ + (dofVector[Mesh.template getCellNextToCell<0,0>(index)]- + dofVector[Mesh.template getCellNextToCell<1,0>(index)])); + + be=abs(dofVector[Mesh.template getCellNextToCell<1,0>(index)]/ + (dofVector[Mesh.template getCellNextToCell<1,1>(index)]- + dofVector[Mesh.template getCellNextToCell<1,0>(index)])); + + a = be/al; + b=1.0; + c=-be; + s= h/sqrt(a*a+b*b); + + + dofVector2[index]=fabsMin(-abs(a*1+b*0+c)*s,dofVector2[(index)]); + dofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(abs(a*1+b*1+c)*s,dofVector2[Mesh.template getCellNextToCell<0,1>(index)]); + dofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(-abs(a*0+b*1+c)*s,dofVector2[Mesh.template getCellNextToCell<1,1>(index)]); + dofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(-abs(a*0+b*0+c)*s,dofVector2[Mesh.template getCellNextToCell<1,0>(index)]); + +} + +template< typename MeshReal, + typename Device, + typename MeshIndex, + typename Real, + typename Index > +void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1000( Index i, Index j) +{ + Index index = Mesh.getCellIndex(CoordinatesType(i,j)); + Real al,be, a,b,c,s; + al=abs(dofVector[Mesh.template getCellNextToCell<0,0>(index)]/ + (dofVector[Mesh.template getCellNextToCell<0,1>(index)]- + dofVector[Mesh.template getCellNextToCell<0,0>(index)])); + + be=abs(dofVector[Mesh.template getCellNextToCell<0,0>(index)]/ + (dofVector[Mesh.template getCellNextToCell<1,0>(index)]- + dofVector[Mesh.template getCellNextToCell<0,0>(index)])); + + a = be/al; + b=1.0; + c=-be; + s= h/sqrt(a*a+b*b); + + + dofVector2[index]=fabsMin(abs(a*0+b*0+c)*s,dofVector2[(index)]); + dofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(-abs(a*1+b*0+c)*s,dofVector2[Mesh.template getCellNextToCell<0,1>(index)]); + dofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(-abs(a*1+b*1+c)*s,dofVector2[Mesh.template getCellNextToCell<1,1>(index)]); + dofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(-abs(a*0+b*1+c)*s,dofVector2[Mesh.template getCellNextToCell<1,0>(index)]); + +} + + + + + +template< typename MeshReal, + typename Device, + typename MeshIndex, + typename Real, + typename Index > +void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1100( Index i, Index j) +{ + Index index = Mesh.getCellIndex(CoordinatesType(i,j)); + Real al,be, a,b,c,s; + al=abs(dofVector[Mesh.template getCellNextToCell<0,0>(index)]/ + (dofVector[Mesh.template getCellNextToCell<0,1>(index)]- + dofVector[Mesh.template getCellNextToCell<0,0>(index)])); + + be=abs(dofVector[Mesh.template getCellNextToCell<1,0>(index)]/ + (dofVector[Mesh.template getCellNextToCell<1,1>(index)]- + dofVector[Mesh.template getCellNextToCell<1,0>(index)])); + + a = al-be; + b=1.0; + c=-al; + s= h/sqrt(a*a+b*b); + + + dofVector2[index]=fabsMin(abs(a*0+b*0+c)*s,dofVector2[(index)]); + dofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(-abs(a*0+b*1+c)*s,dofVector2[Mesh.template getCellNextToCell<0,1>(index)]); + dofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(-abs(a*1+b*1+c)*s,dofVector2[Mesh.template getCellNextToCell<1,1>(index)]); + dofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(abs(a*1+b*0+c)*s,dofVector2[Mesh.template getCellNextToCell<1,0>(index)]); + +} + +template< typename MeshReal, + typename Device, + typename MeshIndex, + typename Real, + typename Index > +void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1010( Index i, Index j) +{ + Index index = Mesh.getCellIndex(CoordinatesType(i,j)); + Real al,be, a,b,c,s; + al=abs(dofVector[Mesh.template getCellNextToCell<0,0>(index)]/ + (dofVector[Mesh.template getCellNextToCell<1,0>(index)]- + dofVector[Mesh.template getCellNextToCell<0,0>(index)])); + + be=abs(dofVector[Mesh.template getCellNextToCell<0,1>(index)]/ + (dofVector[Mesh.template getCellNextToCell<1,1>(index)]- + dofVector[Mesh.template getCellNextToCell<0,1>(index)])); + + a = al-be; + b=1.0; + c=-be; + s= h/sqrt(a*a+b*b); + + + dofVector2[index]=fabsMin(abs(a*0+b*0+c)*s,dofVector2[(index)]); + dofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(abs(a*1+b*0+c)*s,dofVector2[Mesh.template getCellNextToCell<0,1>(index)]); + dofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(-abs(a*1+b*1+c)*s,dofVector2[Mesh.template getCellNextToCell<1,1>(index)]); + dofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(-abs(a*0+b*1+c)*s,dofVector2[Mesh.template getCellNextToCell<1,0>(index)]); + +} + +template< typename MeshReal, + typename Device, + typename MeshIndex, + typename Real, + typename Index > +void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1001( Index i, Index j) +{ + Index index = Mesh.getCellIndex(CoordinatesType(i,j)); + dofVector2[index]=fabsMin(dofVector[index],dofVector2[(index)]); + dofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(dofVector[Mesh.template getCellNextToCell<0,1>(index)],dofVector2[Mesh.template getCellNextToCell<0,1>(index)]); + dofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(dofVector[Mesh.template getCellNextToCell<1,1>(index)],dofVector2[Mesh.template getCellNextToCell<1,1>(index)]); + dofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(dofVector[Mesh.template getCellNextToCell<1,0>(index)],dofVector2[Mesh.template getCellNextToCell<1,0>(index)]); + +} + + + + + + + +template< typename MeshReal, + typename Device, + typename MeshIndex, + typename Real, + typename Index > +void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0011( Index i, Index j) +{ + Index index = Mesh.getCellIndex(CoordinatesType(i,j)); + Real al,be, a,b,c,s; + al=abs(dofVector[Mesh.template getCellNextToCell<0,0>(index)]/ + (dofVector[Mesh.template getCellNextToCell<0,1>(index)]- + dofVector[Mesh.template getCellNextToCell<0,0>(index)])); + + be=abs(dofVector[Mesh.template getCellNextToCell<1,0>(index)]/ + (dofVector[Mesh.template getCellNextToCell<1,1>(index)]- + dofVector[Mesh.template getCellNextToCell<1,0>(index)])); + + a = al-be; + b=1.0; + c=-al; + s= h/sqrt(a*a+b*b); + + + dofVector2[index]=fabsMin(-abs(a*0+b*0+c)*s,dofVector2[(index)]); + dofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(abs(a*0+b*1+c)*s,dofVector2[Mesh.template getCellNextToCell<0,1>(index)]); + dofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(abs(a*1+b*1+c)*s,dofVector2[Mesh.template getCellNextToCell<1,1>(index)]); + dofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(-abs(a*1+b*0+c)*s,dofVector2[Mesh.template getCellNextToCell<1,0>(index)]); + +} + +template< typename MeshReal, + typename Device, + typename MeshIndex, + typename Real, + typename Index > +void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0101( Index i, Index j) +{ + Index index = Mesh.getCellIndex(CoordinatesType(i,j)); + Real al,be, a,b,c,s; + al=abs(dofVector[Mesh.template getCellNextToCell<0,0>(index)]/ + (dofVector[Mesh.template getCellNextToCell<1,0>(index)]- + dofVector[Mesh.template getCellNextToCell<0,0>(index)])); + + be=abs(dofVector[Mesh.template getCellNextToCell<0,1>(index)]/ + (dofVector[Mesh.template getCellNextToCell<1,1>(index)]- + dofVector[Mesh.template getCellNextToCell<0,1>(index)])); + + a = al-be; + b=1.0; + c=-be; + s= h/sqrt(a*a+b*b); + + + dofVector2[index]=fabsMin(-abs(a*0+b*0+c)*s,dofVector2[(index)]); + dofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(-abs(a*1+b*0+c)*s,dofVector2[Mesh.template getCellNextToCell<0,1>(index)]); + dofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(abs(a*1+b*1+c)*s,dofVector2[Mesh.template getCellNextToCell<1,1>(index)]); + dofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(abs(a*0+b*1+c)*s,dofVector2[Mesh.template getCellNextToCell<1,0>(index)]); + +} + +template< typename MeshReal, + typename Device, + typename MeshIndex, + typename Real, + typename Index > +void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0110( Index i, Index j) +{ + Index index = Mesh.getCellIndex(CoordinatesType(i,j)); + dofVector2[index]=fabsMin(dofVector[index],dofVector2[(index)]); + dofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(dofVector[Mesh.template getCellNextToCell<0,1>(index)],dofVector2[Mesh.template getCellNextToCell<0,1>(index)]); + dofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(dofVector[Mesh.template getCellNextToCell<1,1>(index)],dofVector2[Mesh.template getCellNextToCell<1,1>(index)]); + dofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(dofVector[Mesh.template getCellNextToCell<1,0>(index)],dofVector2[Mesh.template getCellNextToCell<1,0>(index)]); +} + + + + +#endif /* TNLFASTSWEEPING_IMPL_H_ */ diff --git a/examples/fast-sweeping/tnlFastSweeping_CUDA.h b/examples/fast-sweeping/tnlFastSweeping_CUDA.h index 8d47d5cb1c5039fd284b33e6e5e4e7f755d7bf54..4313bbd3a79c1c85e798e0f5e7a04b157a43d12f 100644 --- a/examples/fast-sweeping/tnlFastSweeping_CUDA.h +++ b/examples/fast-sweeping/tnlFastSweeping_CUDA.h @@ -105,12 +105,90 @@ protected: + + + + + + +template< typename MeshReal, + typename Device, + typename MeshIndex, + typename Real, + typename Index > +class tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > +{ + +public: + typedef Real RealType; + typedef Device DeviceType; + typedef Index IndexType; + typedef tnlGrid< 3, Real, Device, Index > MeshType; + typedef tnlVector< RealType, DeviceType, IndexType> DofVectorType; + typedef typename MeshType::CoordinatesType CoordinatesType; + + + + __host__ static tnlString getType(); + __host__ bool init( const tnlParameterContainer& parameters ); + __host__ bool run(); + +#ifdef HAVE_CUDA + __device__ bool initGrid(); + __device__ void updateValue(const Index i, const Index j, const Index k); + __device__ void updateValue(const Index i, const Index j, const Index k, double** sharedMem, const int k3); + __device__ Real fabsMin(const Real x, const Real y); + + tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index >* cudaSolver; + double* cudaDofVector; + double* cudaDofVector2; + int counter; + __device__ void setupSquare1000(Index i, Index j); + __device__ void setupSquare1100(Index i, Index j); + __device__ void setupSquare1010(Index i, Index j); + __device__ void setupSquare1001(Index i, Index j); + __device__ void setupSquare1110(Index i, Index j); + __device__ void setupSquare1101(Index i, Index j); + __device__ void setupSquare1011(Index i, Index j); + __device__ void setupSquare1111(Index i, Index j); + __device__ void setupSquare0000(Index i, Index j); + __device__ void setupSquare0100(Index i, Index j); + __device__ void setupSquare0010(Index i, Index j); + __device__ void setupSquare0001(Index i, Index j); + __device__ void setupSquare0110(Index i, Index j); + __device__ void setupSquare0101(Index i, Index j); + __device__ void setupSquare0011(Index i, Index j); + __device__ void setupSquare0111(Index i, Index j); +#endif + + MeshType Mesh; + +protected: + + + + bool exactInput; + + DofVectorType dofVector; + + RealType h; + + +}; + + + + + + + #ifdef HAVE_CUDA //template<int sweep_t> __global__ void runCUDA(tnlFastSweeping< tnlGrid< 2,double, tnlHost, int >, double, int >* solver, int sweep, int i); -//__global__ void runCUDA(tnlFastSweeping< tnlGrid< 2,double, tnlHost, int >, double, int >* solver, int sweep, int i); +__global__ void runCUDA(tnlFastSweeping< tnlGrid< 3,double, tnlHost, int >, double, int >* solver, int sweep, int i); __global__ void initCUDA(tnlFastSweeping< tnlGrid< 2,double, tnlHost, int >, double, int >* solver); +__global__ void initCUDA(tnlFastSweeping< tnlGrid< 3,double, tnlHost, int >, double, int >* solver); #endif /*various implementtions.... choose one*/ @@ -119,5 +197,6 @@ __global__ void initCUDA(tnlFastSweeping< tnlGrid< 2,double, tnlHost, int >, dou //#include "tnlFastSweeping2D_CUDA_v3_impl.h" #include "tnlFastSweeping2D_CUDA_v4_impl.h" //#include "tnlFastSweeping2D_CUDA_v5_impl.h" +#include "tnlFastSweeping3D_CUDA_impl.h" #endif /* TNLFASTSWEEPING_H_ */ diff --git a/examples/hamilton-jacobi-parallel/main.h b/examples/hamilton-jacobi-parallel/main.h index b0edef69159dbfca6cc31fc30dc1d7f7a922e6ee..692a2bc504ab735f69e293132d839d15e07b9743 100644 --- a/examples/hamilton-jacobi-parallel/main.h +++ b/examples/hamilton-jacobi-parallel/main.h @@ -57,7 +57,7 @@ int main( int argc, char* argv[] ) typedef tnlHost Device; - tnlParallelEikonalSolver<SchemeTypeHost,SchemeTypeDevice, Device> solver; + tnlParallelEikonalSolver<2,SchemeTypeHost,SchemeTypeDevice, Device> solver; if(!solver.init(parameters)) { cerr << "Solver failed to initialize." << endl; @@ -72,7 +72,7 @@ int main( int argc, char* argv[] ) typedef tnlCuda Device; //typedef parallelGodunovEikonalScheme< tnlGrid<2,double,Device, int>, double, int > SchemeType; - tnlParallelEikonalSolver<SchemeTypeHost,SchemeTypeDevice, Device> solver; + tnlParallelEikonalSolver<2,SchemeTypeHost,SchemeTypeDevice, Device> solver; if(!solver.init(parameters)) { cerr << "Solver failed to initialize." << endl; diff --git a/examples/hamilton-jacobi-parallel/tnlParallelEikonalSolver.h b/examples/hamilton-jacobi-parallel/tnlParallelEikonalSolver.h index 547c58ec28a2c132f0ced721923eba5cc2f41e30..2d049321c003c5464c2f47f0521e7c31ad2ae40e 100644 --- a/examples/hamilton-jacobi-parallel/tnlParallelEikonalSolver.h +++ b/examples/hamilton-jacobi-parallel/tnlParallelEikonalSolver.h @@ -34,7 +34,8 @@ #endif -template< typename SchemeHost, +template< int Dimension, + typename SchemeHost, typename SchemeDevice, typename Device, typename RealType = double, @@ -42,8 +43,8 @@ template< typename SchemeHost, class tnlParallelEikonalSolver {}; -template< typename SchemeHost, typename SchemeDevice, typename Device> -class tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int > +template<typename SchemeHost, typename SchemeDevice, typename Device> +class tnlParallelEikonalSolver<2, SchemeHost, SchemeDevice, Device, double, int > { public: @@ -96,7 +97,7 @@ public: SchemeHost schemeHost; SchemeDevice schemeDevice; double delta, tau0, stopTime,cflCondition; - int gridRows, gridCols, currentStep, n; + int gridRows, gridCols, gridLevels, currentStep, n; std::clock_t start; double time_diff; @@ -104,14 +105,14 @@ public: tnlDeviceEnum device; - tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int >* getSelf() + tnlParallelEikonalSolver<2, SchemeHost, SchemeDevice, Device, double, int >* getSelf() { return this; }; #ifdef HAVE_CUDA - tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int >* cudaSolver; + tnlParallelEikonalSolver<2, SchemeHost, SchemeDevice, Device, double, int >* cudaSolver; double* work_u_cuda; @@ -129,27 +130,27 @@ public: int run_host; - __device__ void getSubgridCUDA( const int i, tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int >* caller, double* a); + __device__ void getSubgridCUDA2D( const int i, tnlParallelEikonalSolver<2, SchemeHost, SchemeDevice, Device, double, int >* caller, double* a); - __device__ void updateSubgridCUDA( const int i, tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int >* caller, double* a); + __device__ void updateSubgridCUDA2D( const int i, tnlParallelEikonalSolver<2, SchemeHost, SchemeDevice, Device, double, int >* caller, double* a); - __device__ void insertSubgridCUDA( double u, const int i ); + __device__ void insertSubgridCUDA2D( double u, const int i ); - __device__ void runSubgridCUDA( int boundaryCondition, double* u, int subGridID); + __device__ void runSubgridCUDA2D( int boundaryCondition, double* u, int subGridID); /*__global__ void runCUDA();*/ //__device__ void synchronizeCUDA(); - __device__ int getOwnerCUDA( int i) const; + __device__ int getOwnerCUDA2D( int i) const; - __device__ int getSubgridValueCUDA( int i ) const; + __device__ int getSubgridValueCUDA2D( int i ) const; - __device__ void setSubgridValueCUDA( int i, int value ); + __device__ void setSubgridValueCUDA2D( int i, int value ); - __device__ int getBoundaryConditionCUDA( int i ) const; + __device__ int getBoundaryConditionCUDA2D( int i ) const; - __device__ void setBoundaryConditionCUDA( int i, int value ); + __device__ void setBoundaryConditionCUDA2D( int i, int value ); //__device__ bool initCUDA( tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int >* cudaSolver); @@ -158,23 +159,173 @@ public: #endif }; + + + + + + + + template<typename SchemeHost, typename SchemeDevice, typename Device> + class tnlParallelEikonalSolver<3, SchemeHost, SchemeDevice, Device, double, int > + { + public: + + typedef SchemeDevice SchemeTypeDevice; + typedef SchemeHost SchemeTypeHost; + typedef Device DeviceType; + typedef tnlVector< double, tnlHost, int > VectorType; + typedef tnlVector< int, tnlHost, int > IntVectorType; + typedef tnlGrid< 3, double, tnlHost, int > MeshType; + #ifdef HAVE_CUDA + typedef tnlVector< double, tnlHost, int > VectorTypeCUDA; + typedef tnlVector< int, tnlHost, int > IntVectorTypeCUDA; + typedef tnlGrid< 3, double, tnlHost, int > MeshTypeCUDA; + #endif + tnlParallelEikonalSolver(); + bool init( const tnlParameterContainer& parameters ); + void run(); + + void test(); + + /*private:*/ + + + void synchronize(); + + int getOwner( int i) const; + + int getSubgridValue( int i ) const; + + void setSubgridValue( int i, int value ); + + int getBoundaryCondition( int i ) const; + + void setBoundaryCondition( int i, int value ); + + void stretchGrid(); + + void contractGrid(); + + VectorType getSubgrid( const int i ) const; + + void insertSubgrid( VectorType u, const int i ); + + VectorType runSubgrid( int boundaryCondition, VectorType u, int subGridID); + + + VectorType u0, work_u; + IntVectorType subgridValues, boundaryConditions, unusedCell, calculationsCount; + MeshType mesh, subMesh; + SchemeHost schemeHost; + SchemeDevice schemeDevice; + double delta, tau0, stopTime,cflCondition; + int gridRows, gridCols, gridLevels, currentStep, n; + + std::clock_t start; + double time_diff; + + + tnlDeviceEnum device; + + tnlParallelEikonalSolver<3, SchemeHost, SchemeDevice, Device, double, int >* getSelf() + { + return this; + }; + #ifdef HAVE_CUDA + + tnlParallelEikonalSolver<3, SchemeHost, SchemeDevice, Device, double, int >* cudaSolver; + + double* work_u_cuda; + + int* subgridValues_cuda; + int*boundaryConditions_cuda; + int* unusedCell_cuda; + int* calculationsCount_cuda; + double* tmpw; + //MeshTypeCUDA mesh_cuda, subMesh_cuda; + //SchemeDevice scheme_cuda; + //double delta_cuda, tau0_cuda, stopTime_cuda,cflCondition_cuda; + //int gridRows_cuda, gridCols_cuda, currentStep_cuda, n_cuda; + + int* runcuda; + int run_host; + + + __device__ void getSubgridCUDA3D( const int i, tnlParallelEikonalSolver<3, SchemeHost, SchemeDevice, Device, double, int >* caller, double* a); + + __device__ void updateSubgridCUDA3D( const int i, tnlParallelEikonalSolver<3, SchemeHost, SchemeDevice, Device, double, int >* caller, double* a); + + __device__ void insertSubgridCUDA3D( double u, const int i ); + + __device__ void runSubgridCUDA3D( int boundaryCondition, double* u, int subGridID); + + /*__global__ void runCUDA();*/ + + //__device__ void synchronizeCUDA(); + + __device__ int getOwnerCUDA3D( int i) const; + + __device__ int getSubgridValueCUDA3D( int i ) const; + + __device__ void setSubgridValueCUDA3D( int i, int value ); + + __device__ int getBoundaryConditionCUDA3D( int i ) const; + + __device__ void setBoundaryConditionCUDA3D( int i, int value ); + + //__device__ bool initCUDA( tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int >* cudaSolver); + + /*__global__ void initRunCUDA(tnlParallelEikonalSolver<Scheme, double, tnlHost, int >* caller);*/ + +#endif + +}; + + + + + + +#ifdef HAVE_CUDA +template <typename SchemeHost, typename SchemeDevice, typename Device> +__global__ void runCUDA2D(tnlParallelEikonalSolver<2, SchemeHost, SchemeDevice, Device, double, int >* caller); + +template <typename SchemeHost, typename SchemeDevice, typename Device> +__global__ void initRunCUDA2D(tnlParallelEikonalSolver<2, SchemeHost, SchemeDevice, Device, double, int >* caller); + +template <typename SchemeHost, typename SchemeDevice, typename Device> +__global__ void initCUDA2D( tnlParallelEikonalSolver<2, SchemeHost, SchemeDevice, Device, double, int >* cudaSolver, double* ptr, int * ptr2, int* ptr3); + +template <typename SchemeHost, typename SchemeDevice, typename Device> +__global__ void synchronizeCUDA2D(tnlParallelEikonalSolver<2, SchemeHost, SchemeDevice, Device, double, int >* cudaSolver); + +template <typename SchemeHost, typename SchemeDevice, typename Device> +__global__ void synchronize2CUDA2D(tnlParallelEikonalSolver<2, SchemeHost, SchemeDevice, Device, double, int >* cudaSolver); + + + + + + + template <typename SchemeHost, typename SchemeDevice, typename Device> -__global__ void runCUDA(tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int >* caller); +__global__ void runCUDA3D(tnlParallelEikonalSolver<3, SchemeHost, SchemeDevice, Device, double, int >* caller); template <typename SchemeHost, typename SchemeDevice, typename Device> -__global__ void initRunCUDA(tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int >* caller); +__global__ void initRunCUDA3D(tnlParallelEikonalSolver<3, SchemeHost, SchemeDevice, Device, double, int >* caller); template <typename SchemeHost, typename SchemeDevice, typename Device> -__global__ void initCUDA( tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int >* cudaSolver, double* ptr, int * ptr2, int* ptr3); +__global__ void initCUDA3D( tnlParallelEikonalSolver<3, SchemeHost, SchemeDevice, Device, double, int >* cudaSolver, double* ptr, int * ptr2, int* ptr3); template <typename SchemeHost, typename SchemeDevice, typename Device> -__global__ void synchronizeCUDA(tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int >* cudaSolver); +__global__ void synchronizeCUDA3D(tnlParallelEikonalSolver<3, SchemeHost, SchemeDevice, Device, double, int >* cudaSolver); template <typename SchemeHost, typename SchemeDevice, typename Device> -__global__ void synchronize2CUDA(tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int >* cudaSolver); +__global__ void synchronize2CUDA3D(tnlParallelEikonalSolver<3, SchemeHost, SchemeDevice, Device, double, int >* cudaSolver); #endif -#include "tnlParallelEikonalSolver_impl.h" +#include "tnlParallelEikonalSolver2D_impl.h" #endif /* TNLPARALLELEIKONALSOLVER_H_ */ diff --git a/examples/hamilton-jacobi-parallel/tnlParallelEikonalSolver_impl.h b/examples/hamilton-jacobi-parallel/tnlParallelEikonalSolver2D_impl.h similarity index 80% rename from examples/hamilton-jacobi-parallel/tnlParallelEikonalSolver_impl.h rename to examples/hamilton-jacobi-parallel/tnlParallelEikonalSolver2D_impl.h index a20abadec1a94388e6f6da19fa2be9f89f1204a9..a989361a89da748c5d74c439252bbedd9b7d0ccf 100644 --- a/examples/hamilton-jacobi-parallel/tnlParallelEikonalSolver_impl.h +++ b/examples/hamilton-jacobi-parallel/tnlParallelEikonalSolver2D_impl.h @@ -1,5 +1,5 @@ /*************************************************************************** - tnlParallelEikonalSolver_impl.h - description + tnlParallelEikonalSolver2D_impl.h - description ------------------- begin : Nov 28 , 2014 copyright : (C) 2014 by Tomas Sobotik @@ -14,15 +14,15 @@ * * ***************************************************************************/ -#ifndef TNLPARALLELEIKONALSOLVER_IMPL_H_ -#define TNLPARALLELEIKONALSOLVER_IMPL_H_ +#ifndef TNLPARALLELEIKONALSOLVER2D_IMPL_H_ +#define TNLPARALLELEIKONALSOLVER2D_IMPL_H_ #include "tnlParallelEikonalSolver.h" #include <core/mfilename.h> template< typename SchemeHost, typename SchemeDevice, typename Device> -tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::tnlParallelEikonalSolver() +tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::tnlParallelEikonalSolver() { cout << "a" << endl; this->device = tnlCudaDevice; /////////////// tnlCuda Device --- vypocet na GPU, tnlHostDevice --- vypocet na CPU @@ -38,7 +38,7 @@ tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::tnlPara } template< typename SchemeHost, typename SchemeDevice, typename Device> -void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::test() +void tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::test() { /* for(int i =0; i < this->subgridValues.getSize(); i++ ) @@ -50,7 +50,7 @@ void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::te template< typename SchemeHost, typename SchemeDevice, typename Device> -bool tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::init( const tnlParameterContainer& parameters ) +bool tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::init( const tnlParameterContainer& parameters ) { cout << "Initializating solver..." << endl; const tnlString& meshLocation = parameters.getParameter <tnlString>("mesh"); @@ -110,13 +110,13 @@ bool tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::in /*cout << "Testing... " << endl; if(this->device == tnlCudaDevice) { - if( !initCUDA(parameters, gridRows, gridCols) ) + if( !initCUDA2D(parameters, gridRows, gridCols) ) return false; }*/ //cout << "s" << endl; - cudaMalloc(&(this->cudaSolver), sizeof(tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int >)); + cudaMalloc(&(this->cudaSolver), sizeof(tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int >)); //cout << "s" << endl; - cudaMemcpy(this->cudaSolver, this,sizeof(tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int >), cudaMemcpyHostToDevice); + cudaMemcpy(this->cudaSolver, this,sizeof(tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int >), cudaMemcpyHostToDevice); //cout << "s" << endl; double** tmpdev = NULL; cudaMalloc(&tmpdev, sizeof(double*)); @@ -129,7 +129,7 @@ bool tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::in cudaMalloc(&(tmpUC), this->work_u.getSize()*sizeof(int)); cudaMemcpy(tmpUC, this->unusedCell.getData(), this->unusedCell.getSize()*sizeof(int), cudaMemcpyHostToDevice); - initCUDA<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<1,1>>>(this->cudaSolver, (this->tmpw), (this->runcuda),tmpUC); + initCUDA2D<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<1,1>>>(this->cudaSolver, (this->tmpw), (this->runcuda),tmpUC); cudaDeviceSynchronize(); checkCudaDevice; //cout << "s " << endl; @@ -186,7 +186,7 @@ bool tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::in dim3 numBlocks(this->gridCols,this->gridRows); cudaDeviceSynchronize(); checkCudaDevice; - initRunCUDA<SchemeTypeHost,SchemeTypeDevice, DeviceType><<<numBlocks,threadsPerBlock,3*this->n*this->n*sizeof(double)>>>(this->cudaSolver); + initRunCUDA2D<SchemeTypeHost,SchemeTypeDevice, DeviceType><<<numBlocks,threadsPerBlock,3*this->n*this->n*sizeof(double)>>>(this->cudaSolver); cudaDeviceSynchronize(); // cout << "post 1 kernel" << endl; @@ -209,10 +209,10 @@ bool tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::in checkCudaDevice; - synchronizeCUDA<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<numBlocks,threadsPerBlock>>>(this->cudaSolver); + synchronizeCUDA2D<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<numBlocks,threadsPerBlock>>>(this->cudaSolver); cudaDeviceSynchronize(); checkCudaDevice; - synchronize2CUDA<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<numBlocks,1>>>(this->cudaSolver); + synchronize2CUDA2D<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<numBlocks,1>>>(this->cudaSolver); cudaDeviceSynchronize(); checkCudaDevice; //cout << test[0] << " " <<test[1] <<" " << test[2] << " " <<test[3] << endl; @@ -230,7 +230,7 @@ bool tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::in } template< typename SchemeHost, typename SchemeDevice, typename Device> -void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::run() +void tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::run() { if(this->device == tnlHostDevice) { @@ -349,16 +349,16 @@ void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::ru cudaDeviceSynchronize(); checkCudaDevice; start = std::clock(); - runCUDA<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<numBlocks,threadsPerBlock,3*this->n*this->n*sizeof(double)>>>(this->cudaSolver); + runCUDA2D<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<numBlocks,threadsPerBlock,3*this->n*this->n*sizeof(double)>>>(this->cudaSolver); //cout << "a" << endl; cudaDeviceSynchronize(); time_diff += (std::clock() - start) / (double)(CLOCKS_PER_SEC); //start = std::clock(); - synchronizeCUDA<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<numBlocks,threadsPerBlock>>>(this->cudaSolver); + synchronizeCUDA2D<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<numBlocks,threadsPerBlock>>>(this->cudaSolver); cudaDeviceSynchronize(); checkCudaDevice; - synchronize2CUDA<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<numBlocks,1>>>(this->cudaSolver); + synchronize2CUDA2D<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<numBlocks,1>>>(this->cudaSolver); cudaDeviceSynchronize(); checkCudaDevice; //time_diff += (std::clock() - start) / (double)(CLOCKS_PER_SEC); @@ -407,7 +407,7 @@ void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::ru //north - 1, east - 2, west - 4, south - 8 template< typename SchemeHost, typename SchemeDevice, typename Device> -void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::synchronize() //needs fix ---- maybe not anymore --- but frankly: yeah, it does -- aaaa-and maybe fixed now +void tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::synchronize() //needs fix ---- maybe not anymore --- but frankly: yeah, it does -- aaaa-and maybe fixed now { cout << "Synchronizig..." << endl; int tmp1, tmp2; @@ -504,38 +504,38 @@ void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::sy template< typename SchemeHost, typename SchemeDevice, typename Device> -int tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::getOwner(int i) const +int tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::getOwner(int i) const { return (i / (this->gridCols*this->n*this->n))*this->gridCols + (i % (this->gridCols*this->n))/this->n; } template< typename SchemeHost, typename SchemeDevice, typename Device> -int tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::getSubgridValue( int i ) const +int tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::getSubgridValue( int i ) const { return this->subgridValues[i]; } template< typename SchemeHost, typename SchemeDevice, typename Device> -void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::setSubgridValue(int i, int value) +void tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::setSubgridValue(int i, int value) { this->subgridValues[i] = value; } template< typename SchemeHost, typename SchemeDevice, typename Device> -int tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::getBoundaryCondition( int i ) const +int tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::getBoundaryCondition( int i ) const { return this->boundaryConditions[i]; } template< typename SchemeHost, typename SchemeDevice, typename Device> -void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::setBoundaryCondition(int i, int value) +void tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::setBoundaryCondition(int i, int value) { this->boundaryConditions[i] = value; } template< typename SchemeHost, typename SchemeDevice, typename Device> -void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::stretchGrid() +void tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::stretchGrid() { cout << "Stretching grid..." << endl; @@ -612,7 +612,7 @@ void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::st } template< typename SchemeHost, typename SchemeDevice, typename Device> -void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::contractGrid() +void tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::contractGrid() { cout << "Contracting grid..." << endl; int stretchedSize = this->n*this->n*this->gridCols*this->gridRows; @@ -637,8 +637,8 @@ void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::co } template< typename SchemeHost, typename SchemeDevice, typename Device> -typename tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::VectorType -tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::getSubgrid( const int i ) const +typename tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::VectorType +tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::getSubgrid( const int i ) const { VectorType u; u.setSize(this->n*this->n); @@ -654,7 +654,7 @@ tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::getSubg } template< typename SchemeHost, typename SchemeDevice, typename Device> -void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::insertSubgrid( VectorType u, const int i ) +void tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::insertSubgrid( VectorType u, const int i ) { for( int j = 0; j < this->n*this->n; j++) @@ -671,8 +671,8 @@ void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::in } template< typename SchemeHost, typename SchemeDevice, typename Device> -typename tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::VectorType -tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::runSubgrid( int boundaryCondition, VectorType u, int subGridID) +typename tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::VectorType +tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::runSubgrid( int boundaryCondition, VectorType u, int subGridID) { VectorType fu; @@ -1020,7 +1020,7 @@ tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::runSubg template< typename SchemeHost, typename SchemeDevice, typename Device> __device__ -void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::getSubgridCUDA( const int i ,tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int >* caller, double* a) +void tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::getSubgridCUDA2D( const int i ,tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int >* caller, double* a) { //int j = threadIdx.x + threadIdx.y * blockDim.x; int th = (blockIdx.y) * caller->n*caller->n*caller->gridCols @@ -1036,7 +1036,7 @@ void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::ge template< typename SchemeHost, typename SchemeDevice, typename Device> __device__ -void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::updateSubgridCUDA( const int i ,tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int >* caller, double* a) +void tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::updateSubgridCUDA2D( const int i ,tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int >* caller, double* a) { int j = threadIdx.x + threadIdx.y * blockDim.x; int index = (blockIdx.y) * caller->n*caller->n*caller->gridCols @@ -1057,7 +1057,7 @@ void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::up template< typename SchemeHost, typename SchemeDevice, typename Device> __device__ -void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::insertSubgridCUDA( double u, const int i ) +void tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::insertSubgridCUDA2D( double u, const int i ) { @@ -1083,7 +1083,7 @@ void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::in template< typename SchemeHost, typename SchemeDevice, typename Device> __device__ -void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::runSubgridCUDA( int boundaryCondition, double* u, int subGridID) +void tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::runSubgridCUDA2D( int boundaryCondition, double* u, int subGridID) { __shared__ int tmp; @@ -1199,7 +1199,7 @@ void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::ru template< typename SchemeHost, typename SchemeDevice, typename Device> __device__ -int tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::getOwnerCUDA(int i) const +int tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::getOwnerCUDA2D(int i) const { return ((i / (this->gridCols*this->n*this->n))*this->gridCols @@ -1209,7 +1209,7 @@ int tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::get template< typename SchemeHost, typename SchemeDevice, typename Device> __device__ -int tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::getSubgridValueCUDA( int i ) const +int tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::getSubgridValueCUDA2D( int i ) const { return this->subgridValues_cuda[i]; } @@ -1217,7 +1217,7 @@ int tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::get template< typename SchemeHost, typename SchemeDevice, typename Device> __device__ -void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::setSubgridValueCUDA(int i, int value) +void tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::setSubgridValueCUDA2D(int i, int value) { this->subgridValues_cuda[i] = value; } @@ -1225,7 +1225,7 @@ void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::se template< typename SchemeHost, typename SchemeDevice, typename Device> __device__ -int tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::getBoundaryConditionCUDA( int i ) const +int tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::getBoundaryConditionCUDA2D( int i ) const { return this->boundaryConditions_cuda[i]; } @@ -1233,7 +1233,7 @@ int tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::get template< typename SchemeHost, typename SchemeDevice, typename Device> __device__ -void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::setBoundaryConditionCUDA(int i, int value) +void tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::setBoundaryConditionCUDA2D(int i, int value) { this->boundaryConditions_cuda[i] = value; } @@ -1244,7 +1244,7 @@ void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::se template <typename SchemeHost, typename SchemeDevice, typename Device> __global__ -void /*tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::*/synchronizeCUDA(tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int >* cudaSolver) //needs fix ---- maybe not anymore --- but frankly: yeah, it does -- aaaa-and maybe fixed now +void /*tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::*/synchronizeCUDA2D(tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int >* cudaSolver) //needs fix ---- maybe not anymore --- but frankly: yeah, it does -- aaaa-and maybe fixed now { __shared__ int boundary[4]; // north,east,west,south @@ -1261,7 +1261,7 @@ void /*tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>:: if(threadIdx.x+threadIdx.y == 0) { - subgridValue = cudaSolver->getSubgridValueCUDA(blockIdx.y*gridDim.x + blockIdx.x); + subgridValue = cudaSolver->getSubgridValueCUDA2D(blockIdx.y*gridDim.x + blockIdx.x); boundary[0] = 0; boundary[1] = 0; boundary[2] = 0; @@ -1281,26 +1281,26 @@ void /*tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>:: if(threadIdx.x == 0 && (blockIdx.x != 0)/* && !(cudaSolver->currentStep & 1)*/) { u_cmp = cudaSolver->work_u_cuda[gid - 1]; - subgridValue_cmp = cudaSolver->getSubgridValueCUDA(blockIdx.y*gridDim.x + blockIdx.x - 1); + subgridValue_cmp = cudaSolver->getSubgridValueCUDA2D(blockIdx.y*gridDim.x + blockIdx.x - 1); boundary_index = 2; } if(threadIdx.x == blockDim.x - 1 && (blockIdx.x != gridDim.x - 1)/* && !(cudaSolver->currentStep & 1)*/) { u_cmp = cudaSolver->work_u_cuda[gid + 1]; - subgridValue_cmp = cudaSolver->getSubgridValueCUDA(blockIdx.y*gridDim.x + blockIdx.x + 1); + subgridValue_cmp = cudaSolver->getSubgridValueCUDA2D(blockIdx.y*gridDim.x + blockIdx.x + 1); boundary_index = 1; } // if(threadIdx.y == 0 && (blockIdx.y != 0) && (cudaSolver->currentStep & 1)) // { // u_cmp = cudaSolver->work_u_cuda[gid - blockDim.x*gridDim.x]; -// subgridValue_cmp = cudaSolver->getSubgridValueCUDA((blockIdx.y - 1)*gridDim.x + blockIdx.x); +// subgridValue_cmp = cudaSolver->getSubgridValueCUDA2D((blockIdx.y - 1)*gridDim.x + blockIdx.x); // boundary_index = 3; // } // if(threadIdx.y == blockDim.y - 1 && (blockIdx.y != gridDim.y - 1) && (cudaSolver->currentStep & 1)) // { // u_cmp = cudaSolver->work_u_cuda[gid + blockDim.x*gridDim.x]; -// subgridValue_cmp = cudaSolver->getSubgridValueCUDA((blockIdx.y + 1)*gridDim.x + blockIdx.x); +// subgridValue_cmp = cudaSolver->getSubgridValueCUDA2D((blockIdx.y + 1)*gridDim.x + blockIdx.x); // boundary_index = 0; // } @@ -1317,13 +1317,13 @@ void /*tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>:: if(threadIdx.y == 0 && (blockIdx.y != 0)/* && (cudaSolver->currentStep & 1)*/) { u_cmp = cudaSolver->work_u_cuda[gid - blockDim.x*gridDim.x]; - subgridValue_cmp = cudaSolver->getSubgridValueCUDA((blockIdx.y - 1)*gridDim.x + blockIdx.x); + subgridValue_cmp = cudaSolver->getSubgridValueCUDA2D((blockIdx.y - 1)*gridDim.x + blockIdx.x); boundary_index = 3; } if(threadIdx.y == blockDim.y - 1 && (blockIdx.y != gridDim.y - 1)/* && (cudaSolver->currentStep & 1)*/) { u_cmp = cudaSolver->work_u_cuda[gid + blockDim.x*gridDim.x]; - subgridValue_cmp = cudaSolver->getSubgridValueCUDA((blockIdx.y + 1)*gridDim.x + blockIdx.x); + subgridValue_cmp = cudaSolver->getSubgridValueCUDA2D((blockIdx.y + 1)*gridDim.x + blockIdx.x); boundary_index = 0; } @@ -1341,9 +1341,9 @@ void /*tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>:: if(threadIdx.x+threadIdx.y == 0) { if(subgridValue == INT_MAX && newSubgridValue !=0) - cudaSolver->setSubgridValueCUDA(blockIdx.y*gridDim.x + blockIdx.x, -INT_MAX); + cudaSolver->setSubgridValueCUDA2D(blockIdx.y*gridDim.x + blockIdx.x, -INT_MAX); - cudaSolver->setBoundaryConditionCUDA(blockIdx.y*gridDim.x + blockIdx.x, boundary[0] + + cudaSolver->setBoundaryConditionCUDA2D(blockIdx.y*gridDim.x + blockIdx.x, boundary[0] + 2 * boundary[1] + 4 * boundary[2] + 8 * boundary[3]); @@ -1366,32 +1366,32 @@ void /*tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>:: { tmp1 = cudaSolver->gridCols*cudaSolver->n*((cudaSolver->n-1)+j*cudaSolver->n) + i; tmp2 = cudaSolver->gridCols*cudaSolver->n*((cudaSolver->n)+j*cudaSolver->n) + i; - grid1 = cudaSolver->getSubgridValueCUDA(cudaSolver->getOwnerCUDA(tmp1)); - grid2 = cudaSolver->getSubgridValueCUDA(cudaSolver->getOwnerCUDA(tmp2)); + grid1 = cudaSolver->getSubgridValueCUDA2D(cudaSolver->getOwnerCUDA2D(tmp1)); + grid2 = cudaSolver->getSubgridValueCUDA2D(cudaSolver->getOwnerCUDA2D(tmp2)); if ((fabs(cudaSolver->work_u_cuda[tmp1]) < fabs(cudaSolver->work_u_cuda[tmp2]) - cudaSolver->delta || grid2 == INT_MAX || grid2 == -INT_MAX) && (grid1 != INT_MAX && grid1 != -INT_MAX)) { - //printf("%d %d %d %d \n",tmp1,tmp2,cudaSolver->getOwnerCUDA(tmp1),cudaSolver->getOwnerCUDA(tmp2)); + //printf("%d %d %d %d \n",tmp1,tmp2,cudaSolver->getOwnerCUDA2D(tmp1),cudaSolver->getOwnerCUDA2D(tmp2)); cudaSolver->work_u_cuda[tmp2] = cudaSolver->work_u_cuda[tmp1]; cudaSolver->unusedCell_cuda[tmp2] = 0; if(grid2 == INT_MAX) { - cudaSolver->setSubgridValueCUDA(cudaSolver->getOwnerCUDA(tmp2), -INT_MAX); + cudaSolver->setSubgridValueCUDA2D(cudaSolver->getOwnerCUDA2D(tmp2), -INT_MAX); } - if(! (cudaSolver->getBoundaryConditionCUDA(cudaSolver->getOwnerCUDA(tmp2)) & 8) ) - cudaSolver->setBoundaryConditionCUDA(cudaSolver->getOwnerCUDA(tmp2), cudaSolver->getBoundaryConditionCUDA(cudaSolver->getOwnerCUDA(tmp2))+8); + if(! (cudaSolver->getBoundaryConditionCUDA2D(cudaSolver->getOwnerCUDA2D(tmp2)) & 8) ) + cudaSolver->setBoundaryConditionCUDA2D(cudaSolver->getOwnerCUDA2D(tmp2), cudaSolver->getBoundaryConditionCUDA2D(cudaSolver->getOwnerCUDA2D(tmp2))+8); } else if ((fabs(cudaSolver->work_u_cuda[tmp1]) > fabs(cudaSolver->work_u_cuda[tmp2]) + cudaSolver->delta || grid1 == INT_MAX || grid1 == -INT_MAX) && (grid2 != INT_MAX && grid2 != -INT_MAX)) { - //printf("%d %d %d %d \n",tmp1,tmp2,cudaSolver->getOwnerCUDA(tmp1),cudaSolver->getOwnerCUDA(tmp2)); + //printf("%d %d %d %d \n",tmp1,tmp2,cudaSolver->getOwnerCUDA2D(tmp1),cudaSolver->getOwnerCUDA2D(tmp2)); cudaSolver->work_u_cuda[tmp1] = cudaSolver->work_u_cuda[tmp2]; cudaSolver->unusedCell_cuda[tmp1] = 0; if(grid1 == INT_MAX) { - cudaSolver->setSubgridValueCUDA(cudaSolver->getOwnerCUDA(tmp1), -INT_MAX); + cudaSolver->setSubgridValueCUDA2D(cudaSolver->getOwnerCUDA2D(tmp1), -INT_MAX); } - if(! (cudaSolver->getBoundaryConditionCUDA(cudaSolver->getOwnerCUDA(tmp1)) & 1) ) - cudaSolver->setBoundaryConditionCUDA(cudaSolver->getOwnerCUDA(tmp1), cudaSolver->getBoundaryConditionCUDA(cudaSolver->getOwnerCUDA(tmp1))+1); + if(! (cudaSolver->getBoundaryConditionCUDA2D(cudaSolver->getOwnerCUDA2D(tmp1)) & 1) ) + cudaSolver->setBoundaryConditionCUDA2D(cudaSolver->getOwnerCUDA2D(tmp1), cudaSolver->getBoundaryConditionCUDA2D(cudaSolver->getOwnerCUDA2D(tmp1))+1); } } } @@ -1408,32 +1408,32 @@ void /*tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>:: tmp1 = cudaSolver->gridCols*cudaSolver->n*j + i*cudaSolver->n - 1; tmp2 = cudaSolver->gridCols*cudaSolver->n*j + i*cudaSolver->n ; - grid1 = cudaSolver->getSubgridValueCUDA(cudaSolver->getOwnerCUDA(tmp1)); - grid2 = cudaSolver->getSubgridValueCUDA(cudaSolver->getOwnerCUDA(tmp2)); + grid1 = cudaSolver->getSubgridValueCUDA2D(cudaSolver->getOwnerCUDA2D(tmp1)); + grid2 = cudaSolver->getSubgridValueCUDA2D(cudaSolver->getOwnerCUDA2D(tmp2)); if ((fabs(cudaSolver->work_u_cuda[tmp1]) < fabs(cudaSolver->work_u_cuda[tmp2]) - cudaSolver->delta || grid2 == INT_MAX || grid2 == -INT_MAX) && (grid1 != INT_MAX && grid1 != -INT_MAX)) { - //printf("%d %d %d %d \n",tmp1,tmp2,cudaSolver->getOwnerCUDA(tmp1),cudaSolver->getOwnerCUDA(tmp2)); + //printf("%d %d %d %d \n",tmp1,tmp2,cudaSolver->getOwnerCUDA2D(tmp1),cudaSolver->getOwnerCUDA2D(tmp2)); cudaSolver->work_u_cuda[tmp2] = cudaSolver->work_u_cuda[tmp1]; cudaSolver->unusedCell_cuda[tmp2] = 0; if(grid2 == INT_MAX) { - cudaSolver->setSubgridValueCUDA(cudaSolver->getOwnerCUDA(tmp2), -INT_MAX); + cudaSolver->setSubgridValueCUDA2D(cudaSolver->getOwnerCUDA2D(tmp2), -INT_MAX); } - if(! (cudaSolver->getBoundaryConditionCUDA(cudaSolver->getOwnerCUDA(tmp2)) & 4) ) - cudaSolver->setBoundaryConditionCUDA(cudaSolver->getOwnerCUDA(tmp2), cudaSolver->getBoundaryConditionCUDA(cudaSolver->getOwnerCUDA(tmp2))+4); + if(! (cudaSolver->getBoundaryConditionCUDA2D(cudaSolver->getOwnerCUDA2D(tmp2)) & 4) ) + cudaSolver->setBoundaryConditionCUDA2D(cudaSolver->getOwnerCUDA2D(tmp2), cudaSolver->getBoundaryConditionCUDA2D(cudaSolver->getOwnerCUDA2D(tmp2))+4); } else if ((fabs(cudaSolver->work_u_cuda[tmp1]) > fabs(cudaSolver->work_u_cuda[tmp2]) + cudaSolver->delta || grid1 == INT_MAX || grid1 == -INT_MAX) && (grid2 != INT_MAX && grid2 != -INT_MAX)) { - //printf("%d %d %d %d \n",tmp1,tmp2,cudaSolver->getOwnerCUDA(tmp1),cudaSolver->getOwnerCUDA(tmp2)); + //printf("%d %d %d %d \n",tmp1,tmp2,cudaSolver->getOwnerCUDA2D(tmp1),cudaSolver->getOwnerCUDA2D(tmp2)); cudaSolver->work_u_cuda[tmp1] = cudaSolver->work_u_cuda[tmp2]; cudaSolver->unusedCell_cuda[tmp1] = 0; if(grid1 == INT_MAX) { - cudaSolver->setSubgridValueCUDA(cudaSolver->getOwnerCUDA(tmp1), -INT_MAX); + cudaSolver->setSubgridValueCUDA2D(cudaSolver->getOwnerCUDA2D(tmp1), -INT_MAX); } - if(! (cudaSolver->getBoundaryConditionCUDA(cudaSolver->getOwnerCUDA(tmp1)) & 2) ) - cudaSolver->setBoundaryConditionCUDA(cudaSolver->getOwnerCUDA(tmp1), cudaSolver->getBoundaryConditionCUDA(cudaSolver->getOwnerCUDA(tmp1))+2); + if(! (cudaSolver->getBoundaryConditionCUDA2D(cudaSolver->getOwnerCUDA2D(tmp1)) & 2) ) + cudaSolver->setBoundaryConditionCUDA2D(cudaSolver->getOwnerCUDA2D(tmp1), cudaSolver->getBoundaryConditionCUDA2D(cudaSolver->getOwnerCUDA2D(tmp1))+2); } } } @@ -1444,15 +1444,15 @@ void /*tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>:: int stepValue = cudaSolver->currentStep + 4; for (int i = 0; i < cudaSolver->gridRows * cudaSolver->gridCols; i++) { - if( cudaSolver->getSubgridValueCUDA(i) == -INT_MAX ) - cudaSolver->setSubgridValueCUDA(i, stepValue); + if( cudaSolver->getSubgridValueCUDA2D(i) == -INT_MAX ) + cudaSolver->setSubgridValueCUDA2D(i, stepValue); } int maxi = 0; for(int q=0; q < cudaSolver->gridRows*cudaSolver->gridCols;q++) { //printf("%d : %d\n", q, cudaSolver->boundaryConditions_cuda[q]); - maxi=Max(maxi,cudaSolver->getBoundaryConditionCUDA(q)); + maxi=Max(maxi,cudaSolver->getBoundaryConditionCUDA2D(q)); } //printf("I am not an empty kernel! %d\n", maxi); *(cudaSolver->runcuda) = (maxi > 0); @@ -1465,7 +1465,7 @@ void /*tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>:: template <typename SchemeHost, typename SchemeDevice, typename Device> __global__ -void synchronize2CUDA(tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int >* cudaSolver) +void synchronize2CUDA2D(tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int >* cudaSolver) { if(blockIdx.x+blockIdx.y ==0) { @@ -1474,10 +1474,10 @@ void synchronize2CUDA(tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, } int stepValue = cudaSolver->currentStep + 4; - if( cudaSolver->getSubgridValueCUDA(blockIdx.y*gridDim.x + blockIdx.x) == -INT_MAX ) - cudaSolver->setSubgridValueCUDA(blockIdx.y*gridDim.x + blockIdx.x, stepValue); + if( cudaSolver->getSubgridValueCUDA2D(blockIdx.y*gridDim.x + blockIdx.x) == -INT_MAX ) + cudaSolver->setSubgridValueCUDA2D(blockIdx.y*gridDim.x + blockIdx.x, stepValue); - atomicMax((cudaSolver->runcuda),cudaSolver->getBoundaryConditionCUDA(blockIdx.y*gridDim.x + blockIdx.x)); + atomicMax((cudaSolver->runcuda),cudaSolver->getBoundaryConditionCUDA2D(blockIdx.y*gridDim.x + blockIdx.x)); } @@ -1489,7 +1489,7 @@ void synchronize2CUDA(tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, template< typename SchemeHost, typename SchemeDevice, typename Device> __global__ -void /*tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::*/initCUDA( tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int >* cudaSolver, double* ptr , int* ptr2, int* ptr3) +void /*tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::*/initCUDA2D( tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int >* cudaSolver, double* ptr , int* ptr2, int* ptr3) { //cout << "Initializating solver..." << endl; //const tnlString& meshLocation = parameters.getParameter <tnlString>("mesh"); @@ -1582,7 +1582,7 @@ void /*tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>:: //extern __shared__ double array[]; template< typename SchemeHost, typename SchemeDevice, typename Device > __global__ -void /*tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::*/initRunCUDA(tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int >* caller) +void /*tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::*/initRunCUDA2D(tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int >* caller) { @@ -1598,7 +1598,7 @@ void /*tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>:: containsCurve = 0; //double a; - caller->getSubgridCUDA(i,caller, &u[l]); + caller->getSubgridCUDA2D(i,caller, &u[l]); //printf("%f %f\n",a , u[l]); //u[l] = a; //printf("Hi %f \n", u[l]); @@ -1616,13 +1616,13 @@ void /*tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>:: if(containsCurve == 1) { //printf("have curve \n"); - caller->runSubgridCUDA(0,u,i); + caller->runSubgridCUDA2D(0,u,i); //printf("%d : %f\n", l, u[l]); __syncthreads(); - caller->insertSubgridCUDA(u[l],i); + caller->insertSubgridCUDA2D(u[l],i); __syncthreads(); if(l == 0) - caller->setSubgridValueCUDA(i, 4); + caller->setSubgridValueCUDA2D(i, 4); } @@ -1634,59 +1634,59 @@ void /*tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>:: template< typename SchemeHost, typename SchemeDevice, typename Device > __global__ -void /*tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::*/runCUDA(tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int >* caller) +void /*tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::*/runCUDA2D(tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int >* caller) { extern __shared__ double u[]; int i = blockIdx.y * gridDim.x + blockIdx.x; int l = threadIdx.y * blockDim.x + threadIdx.x; - int bound = caller->getBoundaryConditionCUDA(i); + int bound = caller->getBoundaryConditionCUDA2D(i); - if(caller->getSubgridValueCUDA(i) != INT_MAX && bound != 0 && caller->getSubgridValueCUDA(i) > 0) + if(caller->getSubgridValueCUDA2D(i) != INT_MAX && bound != 0 && caller->getSubgridValueCUDA2D(i) > 0) { - caller->getSubgridCUDA(i,caller, &u[l]); + caller->getSubgridCUDA2D(i,caller, &u[l]); //if(l == 0) - //printf("i = %d, bound = %d\n",i,caller->getSubgridValueCUDA(i)); - if(caller->getSubgridValueCUDA(i) == caller->currentStep+4) + //printf("i = %d, bound = %d\n",i,caller->getSubgridValueCUDA2D(i)); + if(caller->getSubgridValueCUDA2D(i) == caller->currentStep+4) { if(bound & 1) { - caller->runSubgridCUDA(1,u,i); + caller->runSubgridCUDA2D(1,u,i); //__syncthreads(); - //caller->insertSubgridCUDA(u[l],i); + //caller->insertSubgridCUDA2D(u[l],i); //__syncthreads(); - //caller->getSubgridCUDA(i,caller, &u[l]); - caller->updateSubgridCUDA(i,caller, &u[l]); + //caller->getSubgridCUDA2D(i,caller, &u[l]); + caller->updateSubgridCUDA2D(i,caller, &u[l]); __syncthreads(); } if(bound & 2 ) { - caller->runSubgridCUDA(2,u,i); + caller->runSubgridCUDA2D(2,u,i); //__syncthreads(); - //caller->insertSubgridCUDA(u[l],i); + //caller->insertSubgridCUDA2D(u[l],i); //__syncthreads(); - //caller->getSubgridCUDA(i,caller, &u[l]); - caller->updateSubgridCUDA(i,caller, &u[l]); + //caller->getSubgridCUDA2D(i,caller, &u[l]); + caller->updateSubgridCUDA2D(i,caller, &u[l]); __syncthreads(); } if(bound & 4) { - caller->runSubgridCUDA(4,u,i); + caller->runSubgridCUDA2D(4,u,i); //__syncthreads(); - //caller->insertSubgridCUDA(u[l],i); + //caller->insertSubgridCUDA2D(u[l],i); //__syncthreads(); - //caller->getSubgridCUDA(i,caller, &u[l]); - caller->updateSubgridCUDA(i,caller, &u[l]); + //caller->getSubgridCUDA2D(i,caller, &u[l]); + caller->updateSubgridCUDA2D(i,caller, &u[l]); __syncthreads(); } if(bound & 8) { - caller->runSubgridCUDA(8,u,i); + caller->runSubgridCUDA2D(8,u,i); //__syncthreads(); - //caller->insertSubgridCUDA(u[l],i); + //caller->insertSubgridCUDA2D(u[l],i); //__syncthreads(); - //caller->getSubgridCUDA(i,caller, &u[l]); - caller->updateSubgridCUDA(i,caller, &u[l]); + //caller->getSubgridCUDA2D(i,caller, &u[l]); + caller->updateSubgridCUDA2D(i,caller, &u[l]); __syncthreads(); } @@ -1696,42 +1696,42 @@ void /*tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>:: if( ((bound & 3 ))) { - caller->runSubgridCUDA(3,u,i); + caller->runSubgridCUDA2D(3,u,i); //__syncthreads(); - //caller->insertSubgridCUDA(u[l],i); + //caller->insertSubgridCUDA2D(u[l],i); //__syncthreads(); - //caller->getSubgridCUDA(i,caller, &u[l]); - caller->updateSubgridCUDA(i,caller, &u[l]); + //caller->getSubgridCUDA2D(i,caller, &u[l]); + caller->updateSubgridCUDA2D(i,caller, &u[l]); __syncthreads(); } if( ((bound & 5 ))) { - caller->runSubgridCUDA(5,u,i); + caller->runSubgridCUDA2D(5,u,i); //__syncthreads(); - //caller->insertSubgridCUDA(u[l],i); + //caller->insertSubgridCUDA2D(u[l],i); //__syncthreads(); - //caller->getSubgridCUDA(i,caller, &u[l]); - caller->updateSubgridCUDA(i,caller, &u[l]); + //caller->getSubgridCUDA2D(i,caller, &u[l]); + caller->updateSubgridCUDA2D(i,caller, &u[l]); __syncthreads(); } if( ((bound & 10 ))) { - caller->runSubgridCUDA(10,u,i); + caller->runSubgridCUDA2D(10,u,i); //__syncthreads(); - //caller->insertSubgridCUDA(u[l],i); + //caller->insertSubgridCUDA2D(u[l],i); //__syncthreads(); - //caller->getSubgridCUDA(i,caller, &u[l]); - caller->updateSubgridCUDA(i,caller, &u[l]); + //caller->getSubgridCUDA2D(i,caller, &u[l]); + caller->updateSubgridCUDA2D(i,caller, &u[l]); __syncthreads(); } if( (bound & 12 )) { - caller->runSubgridCUDA(12,u,i); + caller->runSubgridCUDA2D(12,u,i); //__syncthreads(); - //caller->insertSubgridCUDA(u[l],i); + //caller->insertSubgridCUDA2D(u[l],i); //__syncthreads(); - //caller->getSubgridCUDA(i,caller, &u[l]); - caller->updateSubgridCUDA(i,caller, &u[l]); + //caller->getSubgridCUDA2D(i,caller, &u[l]); + caller->updateSubgridCUDA2D(i,caller, &u[l]); __syncthreads(); } @@ -1755,42 +1755,42 @@ void /*tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>:: if( ((bound == 2))) { - caller->runSubgridCUDA(2,u,i); + caller->runSubgridCUDA2D(2,u,i); //__syncthreads(); - //caller->insertSubgridCUDA(u[l],i); + //caller->insertSubgridCUDA2D(u[l],i); //__syncthreads(); - //caller->getSubgridCUDA(i,caller, &u[l]); - caller->updateSubgridCUDA(i,caller, &u[l]); + //caller->getSubgridCUDA2D(i,caller, &u[l]); + caller->updateSubgridCUDA2D(i,caller, &u[l]); __syncthreads(); } if( ((bound == 1) )) { - caller->runSubgridCUDA(1,u,i); + caller->runSubgridCUDA2D(1,u,i); //__syncthreads(); - //caller->insertSubgridCUDA(u[l],i); + //caller->insertSubgridCUDA2D(u[l],i); //__syncthreads(); - //caller->getSubgridCUDA(i,caller, &u[l]); - caller->updateSubgridCUDA(i,caller, &u[l]); + //caller->getSubgridCUDA2D(i,caller, &u[l]); + caller->updateSubgridCUDA2D(i,caller, &u[l]); __syncthreads(); } if( ((bound == 8) )) { - caller->runSubgridCUDA(8,u,i); + caller->runSubgridCUDA2D(8,u,i); //__syncthreads(); - //caller->insertSubgridCUDA(u[l],i); + //caller->insertSubgridCUDA2D(u[l],i); //__syncthreads(); - //caller->getSubgridCUDA(i,caller, &u[l]); - caller->updateSubgridCUDA(i,caller, &u[l]); + //caller->getSubgridCUDA2D(i,caller, &u[l]); + caller->updateSubgridCUDA2D(i,caller, &u[l]); __syncthreads(); } if( (bound == 4)) { - caller->runSubgridCUDA(4,u,i); + caller->runSubgridCUDA2D(4,u,i); //__syncthreads(); - //caller->insertSubgridCUDA(u[l],i); + //caller->insertSubgridCUDA2D(u[l],i); //__syncthreads(); - //caller->getSubgridCUDA(i,caller, &u[l]); - caller->updateSubgridCUDA(i,caller, &u[l]); + //caller->getSubgridCUDA2D(i,caller, &u[l]); + caller->updateSubgridCUDA2D(i,caller, &u[l]); __syncthreads(); } @@ -1805,42 +1805,42 @@ void /*tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>:: if( ((bound & 3) )) { - caller->runSubgridCUDA(3,u,i); + caller->runSubgridCUDA2D(3,u,i); //__syncthreads(); - //caller->insertSubgridCUDA(u[l],i); + //caller->insertSubgridCUDA2D(u[l],i); //__syncthreads(); - //caller->getSubgridCUDA(i,caller, &u[l]); - caller->updateSubgridCUDA(i,caller, &u[l]); + //caller->getSubgridCUDA2D(i,caller, &u[l]); + caller->updateSubgridCUDA2D(i,caller, &u[l]); __syncthreads(); } if( ((bound & 5) )) { - caller->runSubgridCUDA(5,u,i); + caller->runSubgridCUDA2D(5,u,i); //__syncthreads(); - //caller->insertSubgridCUDA(u[l],i); + //caller->insertSubgridCUDA2D(u[l],i); //__syncthreads(); - //caller->getSubgridCUDA(i,caller, &u[l]); - caller->updateSubgridCUDA(i,caller, &u[l]); + //caller->getSubgridCUDA2D(i,caller, &u[l]); + caller->updateSubgridCUDA2D(i,caller, &u[l]); __syncthreads(); } if( ((bound & 10) )) { - caller->runSubgridCUDA(10,u,i); + caller->runSubgridCUDA2D(10,u,i); //__syncthreads(); - //caller->insertSubgridCUDA(u[l],i); + //caller->insertSubgridCUDA2D(u[l],i); //__syncthreads(); - //caller->getSubgridCUDA(i,caller, &u[l]); - caller->updateSubgridCUDA(i,caller, &u[l]); + //caller->getSubgridCUDA2D(i,caller, &u[l]); + caller->updateSubgridCUDA2D(i,caller, &u[l]); __syncthreads(); } if( (bound & 12) ) { - caller->runSubgridCUDA(12,u,i); + caller->runSubgridCUDA2D(12,u,i); //__syncthreads(); - //caller->insertSubgridCUDA(u[l],i); + //caller->insertSubgridCUDA2D(u[l],i); //__syncthreads(); - //caller->getSubgridCUDA(i,caller, &u[l]); - caller->updateSubgridCUDA(i,caller, &u[l]); + //caller->getSubgridCUDA2D(i,caller, &u[l]); + caller->updateSubgridCUDA2D(i,caller, &u[l]); __syncthreads(); } @@ -1858,19 +1858,19 @@ void /*tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>:: } /*if( bound ) { - caller->runSubgridCUDA(15,u,i); + caller->runSubgridCUDA2D(15,u,i); __syncthreads(); - //caller->insertSubgridCUDA(u[l],i); + //caller->insertSubgridCUDA2D(u[l],i); //__syncthreads(); - //caller->getSubgridCUDA(i,caller, &u[l]); - caller->updateSubgridCUDA(i,caller, &u[l]); + //caller->getSubgridCUDA2D(i,caller, &u[l]); + caller->updateSubgridCUDA2D(i,caller, &u[l]); __syncthreads(); }*/ if(l==0) { - caller->setBoundaryConditionCUDA(i, 0); - caller->setSubgridValueCUDA(i, caller->getSubgridValueCUDA(i) - 1 ); + caller->setBoundaryConditionCUDA2D(i, 0); + caller->setSubgridValueCUDA2D(i, caller->getSubgridValueCUDA2D(i) - 1 ); } @@ -1882,4 +1882,4 @@ void /*tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>:: #endif /*HAVE_CUDA*/ -#endif /* TNLPARALLELEIKONALSOLVER_IMPL_H_ */ +#endif /* TNLPARALLELEIKONALSOLVER2D_IMPL_H_ */ diff --git a/examples/hamilton-jacobi-parallel/tnlParallelEikonalSolver3D_impl.h b/examples/hamilton-jacobi-parallel/tnlParallelEikonalSolver3D_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..91ec8afa4f75c2ac031a037d1e9e8e027108609e --- /dev/null +++ b/examples/hamilton-jacobi-parallel/tnlParallelEikonalSolver3D_impl.h @@ -0,0 +1,1885 @@ +/*************************************************************************** + tnlParallelEikonalSolver2D_impl.h - description + ------------------- + begin : Nov 28 , 2014 + copyright : (C) 2014 by Tomas Sobotik + ***************************************************************************/ + +/*************************************************************************** + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + ***************************************************************************/ + +#ifndef TNLPARALLELEIKONALSOLVER3D_IMPL_H_ +#define TNLPARALLELEIKONALSOLVER3D_IMPL_H_ + + +#include "tnlParallelEikonalSolver.h" +#include <core/mfilename.h> + +template< typename SchemeHost, typename SchemeDevice, typename Device> +tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::tnlParallelEikonalSolver() +{ + cout << "a" << endl; + this->device = tnlCudaDevice; /////////////// tnlCuda Device --- vypocet na GPU, tnlHostDevice --- vypocet na CPU + +#ifdef HAVE_CUDA + if(this->device == tnlCudaDevice) + { + run_host = 1; + } +#endif + + cout << "b" << endl; +} + +template< typename SchemeHost, typename SchemeDevice, typename Device> +void tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::test() +{ +/* + for(int i =0; i < this->subgridValues.getSize(); i++ ) + { + insertSubgrid(getSubgrid(i), i); + } +*/ +} + +template< typename SchemeHost, typename SchemeDevice, typename Device> + +bool tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::init( const tnlParameterContainer& parameters ) +{ + cout << "Initializating solver..." << endl; + const tnlString& meshLocation = parameters.getParameter <tnlString>("mesh"); + this->mesh.load( meshLocation ); + + this->n = parameters.getParameter <int>("subgrid-size"); + cout << "Setting N to " << this->n << endl; + + this->subMesh.setDimensions( this->n, this->n ); + this->subMesh.setDomain( tnlStaticVector<3,double>(0.0, 0.0), + tnlStaticVector<3,double>(this->mesh.getHx()*(double)(this->n), this->mesh.getHy()*(double)(this->n),this->mesh.getHz()*(double)(this->n)) ); + + this->subMesh.save("submesh.tnl"); + + const tnlString& initialCondition = parameters.getParameter <tnlString>("initial-condition"); + this->u0.load( initialCondition ); + + //cout << this->mesh.getCellCenter(0) << endl; + + this->delta = parameters.getParameter <double>("delta"); + this->delta *= this->mesh.getHx()*this->mesh.getHy(); + + cout << "Setting delta to " << this->delta << endl; + + this->tau0 = parameters.getParameter <double>("initial-tau"); + cout << "Setting initial tau to " << this->tau0 << endl; + this->stopTime = parameters.getParameter <double>("stop-time"); + + this->cflCondition = parameters.getParameter <double>("cfl-condition"); + this -> cflCondition *= sqrt(this->mesh.getHx()*this->mesh.getHy()); + cout << "Setting CFL to " << this->cflCondition << endl; + + stretchGrid(); + this->stopTime /= (double)(this->gridCols); + this->stopTime *= (1.0+1.0/((double)(this->n) - 2.0)); + cout << "Setting stopping time to " << this->stopTime << endl; + //this->stopTime = 1.5*((double)(this->n))*parameters.getParameter <double>("stop-time")*this->mesh.getHx(); + //cout << "Setting stopping time to " << this->stopTime << endl; + + cout << "Initializating scheme..." << endl; + if(!this->schemeHost.init(parameters)) + { + cerr << "SchemeHost failed to initialize." << endl; + return false; + } + cout << "Scheme initialized." << endl; + + test(); + + VectorType* tmp = new VectorType[subgridValues.getSize()]; + bool containsCurve = false; + +#ifdef HAVE_CUDA + + if(this->device == tnlCudaDevice) + { + /*cout << "Testing... " << endl; + if(this->device == tnlCudaDevice) + { + if( !initCUDA3D(parameters, gridRows, gridCols) ) + return false; + }*/ + //cout << "s" << endl; + cudaMalloc(&(this->cudaSolver), sizeof(tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int >)); + //cout << "s" << endl; + cudaMemcpy(this->cudaSolver, this,sizeof(tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int >), cudaMemcpyHostToDevice); + //cout << "s" << endl; + double** tmpdev = NULL; + cudaMalloc(&tmpdev, sizeof(double*)); + //double* tmpw; + cudaMalloc(&(this->tmpw), this->work_u.getSize()*sizeof(double)); + cudaMalloc(&(this->runcuda), sizeof(int)); + cudaDeviceSynchronize(); + checkCudaDevice; + int* tmpUC; + cudaMalloc(&(tmpUC), this->work_u.getSize()*sizeof(int)); + cudaMemcpy(tmpUC, this->unusedCell.getData(), this->unusedCell.getSize()*sizeof(int), cudaMemcpyHostToDevice); + + initCUDA3D<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<1,1>>>(this->cudaSolver, (this->tmpw), (this->runcuda),tmpUC); + cudaDeviceSynchronize(); + checkCudaDevice; + //cout << "s " << endl; + //cudaMalloc(&(cudaSolver->work_u_cuda), this->work_u.getSize()*sizeof(double)); + double* tmpu = NULL; + + cudaMemcpy(&tmpu, tmpdev,sizeof(double*), cudaMemcpyDeviceToHost); + //printf("%p %p \n",tmpu,tmpw); + cudaMemcpy((this->tmpw), this->work_u.getData(), this->work_u.getSize()*sizeof(double), cudaMemcpyHostToDevice); + cudaDeviceSynchronize(); + checkCudaDevice; + //cout << "s "<< endl; + + } +#endif + + if(this->device == tnlHostDevice) + { + for(int i = 0; i < this->subgridValues.getSize(); i++) + { + + if(! tmp[i].setSize(this->n * this->n)) + cout << "Could not allocate tmp["<< i <<"] array." << endl; + tmp[i] = getSubgrid(i); + containsCurve = false; + + for(int j = 0; j < tmp[i].getSize(); j++) + { + if(tmp[i][0]*tmp[i][j] <= 0.0) + { + containsCurve = true; + j=tmp[i].getSize(); + } + + } + if(containsCurve) + { + //cout << "Computing initial SDF on subgrid " << i << "." << endl; + insertSubgrid(runSubgrid(0, tmp[i],i), i); + setSubgridValue(i, 4); + //cout << "Computed initial SDF on subgrid " << i << "." << endl; + } + containsCurve = false; + + } + } +#ifdef HAVE_CUDA + else if(this->device == tnlCudaDevice) + { +// cout << "pre 1 kernel" << endl; + cudaDeviceSynchronize(); + checkCudaDevice; + dim3 threadsPerBlock(this->n, this->n); + dim3 numBlocks(this->gridCols,this->gridRows); + cudaDeviceSynchronize(); + checkCudaDevice; + initRunCUDA3D<SchemeTypeHost,SchemeTypeDevice, DeviceType><<<numBlocks,threadsPerBlock,3*this->n*this->n*sizeof(double)>>>(this->cudaSolver); + cudaDeviceSynchronize(); +// cout << "post 1 kernel" << endl; + + } +#endif + + + this->currentStep = 1; + if(this->device == tnlHostDevice) + synchronize(); +#ifdef HAVE_CUDA + else if(this->device == tnlCudaDevice) + { + dim3 threadsPerBlock(this->n, this->n); + dim3 numBlocks(this->gridCols,this->gridRows); + //double * test = (double*)malloc(this->work_u.getSize()*sizeof(double)); + //cout << test[0] <<" " << test[1] <<" " << test[2] <<" " << test[3] << endl; + //cudaMemcpy(/*this->work_u.getData()*/ test, (this->tmpw), this->work_u.getSize()*sizeof(double), cudaMemcpyDeviceToHost); + //cout << this->tmpw << " " << test[0] <<" " << test[1] << " " <<test[2] << " " <<test[3] << endl; + + checkCudaDevice; + + synchronizeCUDA3D<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<numBlocks,threadsPerBlock>>>(this->cudaSolver); + cudaDeviceSynchronize(); + checkCudaDevice; + synchronize2CUDA3D<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<numBlocks,1>>>(this->cudaSolver); + cudaDeviceSynchronize(); + checkCudaDevice; + //cout << test[0] << " " <<test[1] <<" " << test[2] << " " <<test[3] << endl; + //cudaMemcpy(/*this->work_u.getData()*/ test, (this->tmpw), this->work_u.getSize()*sizeof(double), cudaMemcpyDeviceToHost); + //checkCudaDevice; + //cout << this->tmpw << " " << test[0] << " " <<test[1] << " " <<test[2] <<" " << test[3] << endl; + //free(test); + + } + +#endif + cout << "Solver initialized." << endl; + + return true; +} + +template< typename SchemeHost, typename SchemeDevice, typename Device> +void tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::run() +{ + if(this->device == tnlHostDevice) + { + + bool end = false; + while ((this->boundaryConditions.max() > 0 ) || !end) + { + if(this->boundaryConditions.max() == 0 ) + end=true; + else + end=false; +#ifdef HAVE_OPENMP +#pragma omp parallel for num_threads(4) schedule(dynamic) +#endif + for(int i = 0; i < this->subgridValues.getSize(); i++) + { + if(getSubgridValue(i) != INT_MAX) + { + //cout << "subMesh: " << i << ", BC: " << getBoundaryCondition(i) << endl; + + if(getSubgridValue(i) == currentStep+4) + { + + if(getBoundaryCondition(i) & 1) + { + insertSubgrid( runSubgrid(1, getSubgrid(i),i), i); + this->calculationsCount[i]++; + } + if(getBoundaryCondition(i) & 2) + { + insertSubgrid( runSubgrid(2, getSubgrid(i),i), i); + this->calculationsCount[i]++; + } + if(getBoundaryCondition(i) & 4) + { + insertSubgrid( runSubgrid(4, getSubgrid(i),i), i); + this->calculationsCount[i]++; + } + if(getBoundaryCondition(i) & 8) + { + insertSubgrid( runSubgrid(8, getSubgrid(i),i), i); + this->calculationsCount[i]++; + } + } + + if( ((getBoundaryCondition(i) & 2) )|| (getBoundaryCondition(i) & 1)//) + /* &&(!(getBoundaryCondition(i) & 5) && !(getBoundaryCondition(i) & 10)) */) + { + //cout << "3 @ " << getBoundaryCondition(i) << endl; + insertSubgrid( runSubgrid(3, getSubgrid(i),i), i); + } + if( ((getBoundaryCondition(i) & 4) )|| (getBoundaryCondition(i) & 1)//) + /* &&(!(getBoundaryCondition(i) & 3) && !(getBoundaryCondition(i) & 12)) */) + { + //cout << "5 @ " << getBoundaryCondition(i) << endl; + insertSubgrid( runSubgrid(5, getSubgrid(i),i), i); + } + if( ((getBoundaryCondition(i) & 2) )|| (getBoundaryCondition(i) & 8)//) + /* &&(!(getBoundaryCondition(i) & 12) && !(getBoundaryCondition(i) & 3))*/ ) + { + //cout << "10 @ " << getBoundaryCondition(i) << endl; + insertSubgrid( runSubgrid(10, getSubgrid(i),i), i); + } + if( ((getBoundaryCondition(i) & 4) )|| (getBoundaryCondition(i) & 8)//) + /*&&(!(getBoundaryCondition(i) & 10) && !(getBoundaryCondition(i) & 5)) */) + { + //cout << "12 @ " << getBoundaryCondition(i) << endl; + insertSubgrid( runSubgrid(12, getSubgrid(i),i), i); + } + + + /*if(getBoundaryCondition(i)) + { + insertSubgrid( runSubgrid(15, getSubgrid(i),i), i); + }*/ + + setBoundaryCondition(i, 0); + + setSubgridValue(i, getSubgridValue(i)-1); + + } + } + synchronize(); + } + } +#ifdef HAVE_CUDA + else if(this->device == tnlCudaDevice) + { + //cout << "fn" << endl; + bool end_cuda = false; + dim3 threadsPerBlock(this->n, this->n); + dim3 numBlocks(this->gridCols,this->gridRows); + cudaDeviceSynchronize(); + checkCudaDevice; + //cudaMalloc(&runcuda,sizeof(bool)); + //cudaMemcpy(runcuda, &run_host, sizeof(bool), cudaMemcpyHostToDevice); + //cout << "fn" << endl; + bool* tmpb; + //cudaMemcpy(tmpb, &(cudaSolver->runcuda),sizeof(bool*), cudaMemcpyDeviceToHost); + //cudaDeviceSynchronize(); + //checkCudaDevice; + cudaMemcpy(&(this->run_host),this->runcuda,sizeof(int), cudaMemcpyDeviceToHost); + cudaDeviceSynchronize(); + checkCudaDevice; + //cout << "fn" << endl; + int i = 1; + time_diff = 0.0; + while (run_host || !end_cuda) + { + cout << "Computing at step "<< i++ << endl; + if(run_host != 0 ) + end_cuda = true; + else + end_cuda = false; + //cout << "a" << endl; + cudaDeviceSynchronize(); + checkCudaDevice; + start = std::clock(); + runCUDA3D<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<numBlocks,threadsPerBlock,3*this->n*this->n*sizeof(double)>>>(this->cudaSolver); + //cout << "a" << endl; + cudaDeviceSynchronize(); + time_diff += (std::clock() - start) / (double)(CLOCKS_PER_SEC); + + //start = std::clock(); + synchronizeCUDA3D<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<numBlocks,threadsPerBlock>>>(this->cudaSolver); + cudaDeviceSynchronize(); + checkCudaDevice; + synchronize2CUDA3D<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<numBlocks,1>>>(this->cudaSolver); + cudaDeviceSynchronize(); + checkCudaDevice; + //time_diff += (std::clock() - start) / (double)(CLOCKS_PER_SEC); + + + //cout << "a" << endl; + //run_host = false; + //cout << "in kernel loop" << run_host << endl; + //cudaMemcpy(tmpb, &(cudaSolver->runcuda),sizeof(bool*), cudaMemcpyDeviceToHost); + cudaMemcpy(&run_host, (this->runcuda),sizeof(int), cudaMemcpyDeviceToHost); + //cout << "in kernel loop" << run_host << endl; + } + cout << "Solving time was: " << time_diff << endl; + //cout << "b" << endl; + + //double* tmpu; + //cudaMemcpy(tmpu, &(cudaSolver->work_u_cuda),sizeof(double*), cudaMemcpyHostToDevice); + //cudaMemcpy(this->work_u.getData(), tmpu, this->work_u.getSize()*sizeof(double), cudaMemcpyDeviceToHost); + //cout << this->work_u.getData()[0] << endl; + + //double * test = (double*)malloc(this->work_u.getSize()*sizeof(double)); + //cout << test[0] << test[1] << test[2] << test[3] << endl; + cudaMemcpy(this->work_u.getData()/* test*/, (this->tmpw), this->work_u.getSize()*sizeof(double), cudaMemcpyDeviceToHost); + //cout << this->tmpw << " " << test[0] << test[1] << test[2] << test[3] << endl; + //free(test); + + cudaDeviceSynchronize(); + } +#endif + contractGrid(); + this->u0.save("u-00001.tnl"); + cout << "Maximum number of calculations on one subgrid was " << this->calculationsCount.absMax() << endl; + cout << "Average number of calculations on one subgrid was " << ( (double) this->calculationsCount.sum() / (double) this->calculationsCount.getSize() ) << endl; + cout << "Solver finished" << endl; + +#ifdef HAVE_CUDA + if(this->device == tnlCudaDevice) + { + cudaFree(this->runcuda); + cudaFree(this->tmpw); + cudaFree(this->cudaSolver); + } +#endif + +} + +//north - 1, east - 2, west - 4, south - 8 +template< typename SchemeHost, typename SchemeDevice, typename Device> +void tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::synchronize() //needs fix ---- maybe not anymore --- but frankly: yeah, it does -- aaaa-and maybe fixed now +{ + cout << "Synchronizig..." << endl; + int tmp1, tmp2; + int grid1, grid2; + + if(this->currentStep & 1) + { + for(int j = 0; j < this->gridRows - 1; j++) + { + for (int i = 0; i < this->gridCols*this->n; i++) + { + tmp1 = this->gridCols*this->n*((this->n-1)+j*this->n) + i; + tmp2 = this->gridCols*this->n*((this->n)+j*this->n) + i; + grid1 = getSubgridValue(getOwner(tmp1)); + grid2 = getSubgridValue(getOwner(tmp2)); + if(getOwner(tmp1)==getOwner(tmp2)) + cout << "i, j" << i << "," << j << endl; + if ((fabs(this->work_u[tmp1]) < fabs(this->work_u[tmp2]) - this->delta || grid2 == INT_MAX || grid2 == -INT_MAX) && (grid1 != INT_MAX && grid1 != -INT_MAX)) + { + this->work_u[tmp2] = this->work_u[tmp1]; + this->unusedCell[tmp2] = 0; + if(grid2 == INT_MAX) + { + setSubgridValue(getOwner(tmp2), -INT_MAX); + } + if(! (getBoundaryCondition(getOwner(tmp2)) & 8) ) + setBoundaryCondition(getOwner(tmp2), getBoundaryCondition(getOwner(tmp2))+8); + } + else if ((fabs(this->work_u[tmp1]) > fabs(this->work_u[tmp2]) + this->delta || grid1 == INT_MAX || grid1 == -INT_MAX) && (grid2 != INT_MAX && grid2 != -INT_MAX)) + { + this->work_u[tmp1] = this->work_u[tmp2]; + this->unusedCell[tmp1] = 0; + if(grid1 == INT_MAX) + { + setSubgridValue(getOwner(tmp1), -INT_MAX); + } + if(! (getBoundaryCondition(getOwner(tmp1)) & 1) ) + setBoundaryCondition(getOwner(tmp1), getBoundaryCondition(getOwner(tmp1))+1); + } + } + } + + } + else + { + for(int i = 1; i < this->gridCols; i++) + { + for (int j = 0; j < this->gridRows*this->n; j++) + { + tmp1 = this->gridCols*this->n*j + i*this->n - 1; + tmp2 = this->gridCols*this->n*j + i*this->n ; + grid1 = getSubgridValue(getOwner(tmp1)); + grid2 = getSubgridValue(getOwner(tmp2)); + if(getOwner(tmp1)==getOwner(tmp2)) + cout << "i, j" << i << "," << j << endl; + if ((fabs(this->work_u[tmp1]) < fabs(this->work_u[tmp2]) - this->delta || grid2 == INT_MAX || grid2 == -INT_MAX) && (grid1 != INT_MAX && grid1 != -INT_MAX)) + { + this->work_u[tmp2] = this->work_u[tmp1]; + this->unusedCell[tmp2] = 0; + if(grid2 == INT_MAX) + { + setSubgridValue(getOwner(tmp2), -INT_MAX); + } + if(! (getBoundaryCondition(getOwner(tmp2)) & 4) ) + setBoundaryCondition(getOwner(tmp2), getBoundaryCondition(getOwner(tmp2))+4); + } + else if ((fabs(this->work_u[tmp1]) > fabs(this->work_u[tmp2]) + this->delta || grid1 == INT_MAX || grid1 == -INT_MAX) && (grid2 != INT_MAX && grid2 != -INT_MAX)) + { + this->work_u[tmp1] = this->work_u[tmp2]; + this->unusedCell[tmp1] = 0; + if(grid1 == INT_MAX) + { + setSubgridValue(getOwner(tmp1), -INT_MAX); + } + if(! (getBoundaryCondition(getOwner(tmp1)) & 2) ) + setBoundaryCondition(getOwner(tmp1), getBoundaryCondition(getOwner(tmp1))+2); + } + } + } + } + + + this->currentStep++; + int stepValue = this->currentStep + 4; + for (int i = 0; i < this->subgridValues.getSize(); i++) + { + if( getSubgridValue(i) == -INT_MAX ) + setSubgridValue(i, stepValue); + } + + cout << "Grid synchronized at step " << (this->currentStep - 1 ) << endl; + +} + + +template< typename SchemeHost, typename SchemeDevice, typename Device> +int tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::getOwner(int i) const +{ + + return (i / (this->gridCols*this->n*this->n))*this->gridCols + (i % (this->gridCols*this->n))/this->n; +} + +template< typename SchemeHost, typename SchemeDevice, typename Device> +int tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::getSubgridValue( int i ) const +{ + return this->subgridValues[i]; +} + +template< typename SchemeHost, typename SchemeDevice, typename Device> +void tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::setSubgridValue(int i, int value) +{ + this->subgridValues[i] = value; +} + +template< typename SchemeHost, typename SchemeDevice, typename Device> +int tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::getBoundaryCondition( int i ) const +{ + return this->boundaryConditions[i]; +} + +template< typename SchemeHost, typename SchemeDevice, typename Device> +void tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::setBoundaryCondition(int i, int value) +{ + this->boundaryConditions[i] = value; +} + +template< typename SchemeHost, typename SchemeDevice, typename Device> +void tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::stretchGrid() +{ + cout << "Stretching grid..." << endl; + + + this->gridCols = ceil( ((double)(this->mesh.getDimensions().x()-1)) / ((double)(this->n-1)) ); + this->gridRows = ceil( ((double)(this->mesh.getDimensions().y()-1)) / ((double)(this->n-1)) ); + + //this->gridCols = (this->mesh.getDimensions().x()-1) / (this->n-1) ; + //this->gridRows = (this->mesh.getDimensions().y()-1) / (this->n-1) ; + + cout << "Setting gridCols to " << this->gridCols << "." << endl; + cout << "Setting gridRows to " << this->gridRows << "." << endl; + + this->subgridValues.setSize(this->gridCols*this->gridRows); + this->subgridValues.setValue(0); + this->boundaryConditions.setSize(this->gridCols*this->gridRows); + this->boundaryConditions.setValue(0); + this->calculationsCount.setSize(this->gridCols*this->gridRows); + this->calculationsCount.setValue(0); + + for(int i = 0; i < this->subgridValues.getSize(); i++ ) + { + this->subgridValues[i] = INT_MAX; + this->boundaryConditions[i] = 0; + } + + int stretchedSize = this->n*this->n*this->gridCols*this->gridRows; + + if(!this->work_u.setSize(stretchedSize)) + cerr << "Could not allocate memory for stretched grid." << endl; + if(!this->unusedCell.setSize(stretchedSize)) + cerr << "Could not allocate memory for supporting stretched grid." << endl; + int idealStretch =this->mesh.getDimensions().x() + (this->mesh.getDimensions().x()-2)/(this->n-1); + cout << idealStretch << endl; + + for(int i = 0; i < stretchedSize; i++) + { + this->unusedCell[i] = 1; + int diff =(this->n*this->gridCols) - idealStretch ; + //cout << "diff = " << diff <<endl; + int k = i/this->n - i/(this->n*this->gridCols) + this->mesh.getDimensions().x()*(i/(this->n*this->n*this->gridCols)) + (i/(this->n*this->gridCols))*diff; + + if(i%(this->n*this->gridCols) - idealStretch >= 0) + { + //cout << i%(this->n*this->gridCols) - idealStretch +1 << endl; + k+= i%(this->n*this->gridCols) - idealStretch +1 ; + } + + if(i/(this->n*this->gridCols) - idealStretch + 1 > 0) + { + //cout << i/(this->n*this->gridCols) - idealStretch + 1 << endl; + k+= (i/(this->n*this->gridCols) - idealStretch +1 )* this->mesh.getDimensions().x() ; + } + + //cout << "i = " << i << " : i-k = " << i-k << endl; + /*int j=(i % (this->n*this->gridCols)) - ( (this->mesh.getDimensions().x() - this->n)/(this->n - 1) + this->mesh.getDimensions().x() - 1) + + (this->n*this->gridCols - this->mesh.getDimensions().x())*(i/(this->n*this->n*this->gridCols)) ; + + if(j > 0) + k += j; + + int l = i-k - (this->u0.getSize() - 1); + int m = (l % this->mesh.getDimensions().x()); + + if(l>0) + k+= l + ( (l / this->mesh.getDimensions().x()) + 1 )*this->mesh.getDimensions().x() - (l % this->mesh.getDimensions().x());*/ + + this->work_u[i] = this->u0[i-k]; + //cout << (i-k) <<endl; + } + + + cout << "Grid stretched." << endl; +} + +template< typename SchemeHost, typename SchemeDevice, typename Device> +void tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::contractGrid() +{ + cout << "Contracting grid..." << endl; + int stretchedSize = this->n*this->n*this->gridCols*this->gridRows; + + int idealStretch =this->mesh.getDimensions().x() + (this->mesh.getDimensions().x()-2)/(this->n-1); + cout << idealStretch << endl; + + for(int i = 0; i < stretchedSize; i++) + { + int diff =(this->n*this->gridCols) - idealStretch ; + int k = i/this->n - i/(this->n*this->gridCols) + this->mesh.getDimensions().x()*(i/(this->n*this->n*this->gridCols)) + (i/(this->n*this->gridCols))*diff; + + if((i%(this->n*this->gridCols) - idealStretch < 0) && (i/(this->n*this->gridCols) - idealStretch + 1 <= 0)) + { + //cout << i <<" : " <<i-k<< endl; + this->u0[i-k] = this->work_u[i]; + } + + } + + cout << "Grid contracted" << endl; +} + +template< typename SchemeHost, typename SchemeDevice, typename Device> +typename tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::VectorType +tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::getSubgrid( const int i ) const +{ + VectorType u; + u.setSize(this->n*this->n); + + for( int j = 0; j < u.getSize(); j++) + { + u[j] = this->work_u[ (i / this->gridCols) * this->n*this->n*this->gridCols + + (i % this->gridCols) * this->n + + (j/this->n) * this->n*this->gridCols + + (j % this->n) ]; + } + return u; +} + +template< typename SchemeHost, typename SchemeDevice, typename Device> +void tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::insertSubgrid( VectorType u, const int i ) +{ + + for( int j = 0; j < this->n*this->n; j++) + { + int index = (i / this->gridCols)*this->n*this->n*this->gridCols + (i % this->gridCols)*this->n + (j/this->n)*this->n*this->gridCols + (j % this->n); + //OMP LOCK index + if( (fabs(this->work_u[index]) > fabs(u[j])) || (this->unusedCell[index] == 1) ) + { + this->work_u[index] = u[j]; + this->unusedCell[index] = 0; + } + //OMP UNLOCK index + } +} + +template< typename SchemeHost, typename SchemeDevice, typename Device> +typename tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::VectorType +tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::runSubgrid( int boundaryCondition, VectorType u, int subGridID) +{ + + VectorType fu; + + fu.setLike(u); + fu.setValue( 0.0 ); + +/* + * Insert Euler-Solver Here + */ + + /**/ + + /*for(int i = 0; i < u.getSize(); i++) + { + int x = this->subMesh.getCellCoordinates(i).x(); + int y = this->subMesh.getCellCoordinates(i).y(); + + if(x == 0 && (boundaryCondition & 4) && y ==0) + { + if((u[subMesh.getCellYSuccessor( i )] - u[i])/subMesh.getHy() > 1.0) + { + //cout << "x = 0; y = 0" << endl; + u[i] = u[subMesh.getCellYSuccessor( i )] - subMesh.getHy(); + } + } + else if(x == 0 && (boundaryCondition & 4) && y == subMesh.getDimensions().y() - 1) + { + if((u[subMesh.getCellYPredecessor( i )] - u[i])/subMesh.getHy() > 1.0) + { + //cout << "x = 0; y = n" << endl; + u[i] = u[subMesh.getCellYPredecessor( i )] - subMesh.getHy(); + } + } + + + else if(x == subMesh.getDimensions().x() - 1 && (boundaryCondition & 2) && y ==0) + { + if((u[subMesh.getCellYSuccessor( i )] - u[i])/subMesh.getHy() > 1.0) + { + //cout << "x = n; y = 0" << endl; + u[i] = u[subMesh.getCellYSuccessor( i )] - subMesh.getHy(); + } + } + else if(x == subMesh.getDimensions().x() - 1 && (boundaryCondition & 2) && y == subMesh.getDimensions().y() - 1) + { + if((u[subMesh.getCellYPredecessor( i )] - u[i])/subMesh.getHy() > 1.0) + { + //cout << "x = n; y = n" << endl; + u[i] = u[subMesh.getCellYPredecessor( i )] - subMesh.getHy(); + } + } + + + else if(y == 0 && (boundaryCondition & 8) && x ==0) + { + if((u[subMesh.getCellXSuccessor( i )] - u[i])/subMesh.getHx() > 1.0) + { + //cout << "y = 0; x = 0" << endl; + u[i] = u[subMesh.getCellXSuccessor( i )] - subMesh.getHx(); + } + } + else if(y == 0 && (boundaryCondition & 8) && x == subMesh.getDimensions().x() - 1) + { + if((u[subMesh.getCellXPredecessor( i )] - u[i])/subMesh.getHx() > 1.0) + { + //cout << "y = 0; x = n" << endl; + u[i] = u[subMesh.getCellXPredecessor( i )] - subMesh.getHx(); + } + } + + + else if(y == subMesh.getDimensions().y() - 1 && (boundaryCondition & 1) && x ==0) + { + if((u[subMesh.getCellXSuccessor( i )] - u[i])/subMesh.getHx() > 1.0) { + //cout << "y = n; x = 0" << endl; + u[i] = u[subMesh.getCellXSuccessor( i )] - subMesh.getHx(); + } + } + else if(y == subMesh.getDimensions().y() - 1 && (boundaryCondition & 1) && x == subMesh.getDimensions().x() - 1) + { + if((u[subMesh.getCellXPredecessor( i )] - u[i])/subMesh.getHx() > 1.0) + { + //cout << "y = n; x = n" << endl; + u[i] = u[subMesh.getCellXPredecessor( i )] - subMesh.getHx(); + } + } + }*/ + + /**/ + + +/* bool tmp = false; + for(int i = 0; i < u.getSize(); i++) + { + if(u[0]*u[i] <= 0.0) + tmp=true; + } + + + if(tmp) + {} + else if(boundaryCondition == 4) + { + int i; + for(i = 0; i < u.getSize() - subMesh.getDimensions().x() ; i=subMesh.getCellYSuccessor(i)) + { + int j; + for(j = i; j < subMesh.getDimensions().x() - 1; j=subMesh.getCellXSuccessor(j)) + { + u[j] = u[i]; + } + u[j] = u[i]; + } + int j; + for(j = i; j < subMesh.getDimensions().x() - 1; j=subMesh.getCellXSuccessor(j)) + { + u[j] = u[i]; + } + u[j] = u[i]; + } + else if(boundaryCondition == 8) + { + int i; + for(i = 0; i < subMesh.getDimensions().x() - 1; i=subMesh.getCellXSuccessor(i)) + { + int j; + for(j = i; j < u.getSize() - subMesh.getDimensions().x(); j=subMesh.getCellYSuccessor(j)) + { + u[j] = u[i]; + } + u[j] = u[i]; + } + int j; + for(j = i; j < u.getSize() - subMesh.getDimensions().x(); j=subMesh.getCellYSuccessor(j)) + { + u[j] = u[i]; + } + u[j] = u[i]; + + } + else if(boundaryCondition == 2) + { + int i; + for(i = subMesh.getDimensions().x() - 1; i < u.getSize() - subMesh.getDimensions().x() ; i=subMesh.getCellYSuccessor(i)) + { + int j; + for(j = i; j > (i-1)*subMesh.getDimensions().x(); j=subMesh.getCellXPredecessor(j)) + { + u[j] = u[i]; + } + u[j] = u[i]; + } + int j; + for(j = i; j > (i-1)*subMesh.getDimensions().x(); j=subMesh.getCellXPredecessor(j)) + { + u[j] = u[i]; + } + u[j] = u[i]; + } + else if(boundaryCondition == 1) + { + int i; + for(i = (subMesh.getDimensions().y() - 1)*subMesh.getDimensions().x(); i < u.getSize() - 1; i=subMesh.getCellXSuccessor(i)) + { + int j; + for(j = i; j >=subMesh.getDimensions().x(); j=subMesh.getCellYPredecessor(j)) + { + u[j] = u[i]; + } + u[j] = u[i]; + } + int j; + for(j = i; j >=subMesh.getDimensions().x(); j=subMesh.getCellYPredecessor(j)) + { + u[j] = u[i]; + } + u[j] = u[i]; + } +*/ + /**/ + + + + bool tmp = false; + for(int i = 0; i < u.getSize(); i++) + { + if(u[0]*u[i] <= 0.0) + tmp=true; + int centreGID = (this->n*(subGridID / this->gridRows)+ (this->n >> 1))*(this->n*this->gridCols) + this->n*(subGridID % this->gridRows) + (this->n >> 1); + if(this->unusedCell[centreGID] == 0 || boundaryCondition == 0) + tmp = true; + } + //if(this->currentStep + 3 < getSubgridValue(subGridID)) + //tmp = true; + + + double value = Sign(u[0]) * u.absMax(); + + if(tmp) + {} + + + //north - 1, east - 2, west - 4, south - 8 + else if(boundaryCondition == 4) + { + for(int i = 0; i < this->n; i++) + for(int j = 1;j < this->n; j++) + //if(fabs(u[i*this->n + j]) < fabs(u[i*this->n])) + u[i*this->n + j] = value;// u[i*this->n]; + } + else if(boundaryCondition == 2) + { + for(int i = 0; i < this->n; i++) + for(int j =0 ;j < this->n -1; j++) + //if(fabs(u[i*this->n + j]) < fabs(u[(i+1)*this->n - 1])) + u[i*this->n + j] = value;// u[(i+1)*this->n - 1]; + } + else if(boundaryCondition == 1) + { + for(int j = 0; j < this->n; j++) + for(int i = 0;i < this->n - 1; i++) + //if(fabs(u[i*this->n + j]) < fabs(u[j + this->n*(this->n - 1)])) + u[i*this->n + j] = value;// u[j + this->n*(this->n - 1)]; + } + else if(boundaryCondition == 8) + { + for(int j = 0; j < this->n; j++) + for(int i = 1;i < this->n; i++) + //if(fabs(u[i*this->n + j]) < fabs(u[j])) + u[i*this->n + j] = value;// u[j]; + } + +/* + + else if(boundaryCondition == 5) + { + for(int i = 0; i < this->n - 1; i++) + for(int j = 1;j < this->n; j++) + //if(fabs(u[i*this->n + j]) < fabs(u[i*this->n])) + u[i*this->n + j] = value;// u[i*this->n]; + } + else if(boundaryCondition == 10) + { + for(int i = 1; i < this->n; i++) + for(int j =0 ;j < this->n -1; j++) + //if(fabs(u[i*this->n + j]) < fabs(u[(i+1)*this->n - 1])) + u[i*this->n + j] = value;// u[(i+1)*this->n - 1]; + } + else if(boundaryCondition == 3) + { + for(int j = 0; j < this->n - 1; j++) + for(int i = 0;i < this->n - 1; i++) + //if(fabs(u[i*this->n + j]) < fabs(u[j + this->n*(this->n - 1)])) + u[i*this->n + j] = value;// u[j + this->n*(this->n - 1)]; + } + else if(boundaryCondition == 12) + { + for(int j = 1; j < this->n; j++) + for(int i = 1;i < this->n; i++) + //if(fabs(u[i*this->n + j]) < fabs(u[j])) + u[i*this->n + j] = value;// u[j]; + } +*/ + + + /**/ + + /*if (u.max() > 0.0) + this->stopTime *=(double) this->gridCols;*/ + + + double time = 0.0; + double currentTau = this->tau0; + double finalTime = this->stopTime;// + 3.0*(u.max() - u.min()); + if( time + currentTau > finalTime ) currentTau = finalTime - time; + + double maxResidue( 1.0 ); + //double lastResidue( 10000.0 ); + while( time < finalTime /*|| maxResidue > subMesh.getHx()*/) + { + /**** + * Compute the RHS + */ + + for( int i = 0; i < fu.getSize(); i ++ ) + { + fu[ i ] = schemeHost.getValue( this->subMesh, i, this->subMesh.getCellCoordinates(i), u, time, boundaryCondition ); + } + maxResidue = fu. absMax(); + + + if( this -> cflCondition * maxResidue != 0.0) + currentTau = this -> cflCondition / maxResidue; + + /* if (maxResidue < 0.05) + cout << "Max < 0.05" << endl;*/ + if(currentTau > 1.0 * this->subMesh.getHx()) + { + //cout << currentTau << " >= " << 2.0 * this->subMesh.getHx() << endl; + currentTau = 1.0 * this->subMesh.getHx(); + } + /*if(maxResidue > lastResidue) + currentTau *=(1.0/10.0);*/ + + + if( time + currentTau > finalTime ) currentTau = finalTime - time; +// for( int i = 0; i < fu.getSize(); i ++ ) +// { +// //cout << "Too big RHS! i = " << i << ", fu = " << fu[i] << ", u = " << u[i] << endl; +// if((u[i]+currentTau * fu[ i ])*u[i] < 0.0 && fu[i] != 0.0 && u[i] != 0.0 ) +// currentTau = fabs(u[i]/(2.0*fu[i])); +// +// } + + + for( int i = 0; i < fu.getSize(); i ++ ) + { + double add = u[i] + currentTau * fu[ i ]; + //if( fabs(u[i]) < fabs(add) or (this->subgridValues[subGridID] == this->currentStep +4) ) + u[ i ] = add; + } + time += currentTau; + + //cout << '\r' << flush; + //cout << maxResidue << " " << currentTau << " @ " << time << flush; + //lastResidue = maxResidue; + } + //cout << "Time: " << time << ", Res: " << maxResidue <<endl; + /*if (u.max() > 0.0) + this->stopTime /=(double) this->gridCols;*/ + + VectorType solution; + solution.setLike(u); + for( int i = 0; i < u.getSize(); i ++ ) + { + solution[i]=u[i]; + } + return solution; +} + + +#ifdef HAVE_CUDA + + +template< typename SchemeHost, typename SchemeDevice, typename Device> +__device__ +void tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::getSubgridCUDA3D( const int i ,tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int >* caller, double* a) +{ + //int j = threadIdx.x + threadIdx.y * blockDim.x; + int th = (blockIdx.y) * caller->n*caller->n*caller->gridCols + + (blockIdx.x) * caller->n + + threadIdx.y * caller->n*caller->gridCols + + threadIdx.x; + //printf("i= %d,j= %d,th= %d\n",i,j,th); + *a = caller->work_u_cuda[th]; + //printf("Hi %f \n", *a); + //return ret; +} + + +template< typename SchemeHost, typename SchemeDevice, typename Device> +__device__ +void tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::updateSubgridCUDA3D( const int i ,tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int >* caller, double* a) +{ + int j = threadIdx.x + threadIdx.y * blockDim.x; + int index = (blockIdx.y) * caller->n*caller->n*caller->gridCols + + (blockIdx.x) * caller->n + + threadIdx.y * caller->n*caller->gridCols + + threadIdx.x; + + if( (fabs(caller->work_u_cuda[index]) > fabs(*a)) || (caller->unusedCell_cuda[index] == 1) ) + { + caller->work_u_cuda[index] = *a; + caller->unusedCell_cuda[index] = 0; + + } + + *a = caller->work_u_cuda[index]; +} + + +template< typename SchemeHost, typename SchemeDevice, typename Device> +__device__ +void tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::insertSubgridCUDA3D( double u, const int i ) +{ + + +// int j = threadIdx.x + threadIdx.y * blockDim.x; + //printf("j = %d, u = %f\n", j,u); + + int index = (blockIdx.y)*this->n*this->n*this->gridCols + + (blockIdx.x)*this->n + + threadIdx.y*this->n*this->gridCols + + threadIdx.x; + + //printf("i= %d,j= %d,index= %d\n",i,j,index); + if( (fabs(this->work_u_cuda[index]) > fabs(u)) || (this->unusedCell_cuda[index] == 1) ) + { + this->work_u_cuda[index] = u; + this->unusedCell_cuda[index] = 0; + + } + + +} + + +template< typename SchemeHost, typename SchemeDevice, typename Device> +__device__ +void tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::runSubgridCUDA3D( int boundaryCondition, double* u, int subGridID) +{ + + __shared__ int tmp; + __shared__ double value; + //double tmpRes = 0.0; + volatile double* sharedTau = &u[blockDim.x*blockDim.y]; + volatile double* absVal = &u[2*blockDim.x*blockDim.y]; + int i = threadIdx.x; + int j = threadIdx.y; + int l = threadIdx.y * blockDim.x + threadIdx.x; + bool computeFU = !((i == 0 && (boundaryCondition & 4)) or + (i == blockDim.x - 1 && (boundaryCondition & 2)) or + (j == 0 && (boundaryCondition & 8)) or + (j == blockDim.y - 1 && (boundaryCondition & 1))); + + if(l == 0) + { + tmp = 0; + int centreGID = (blockDim.y*blockIdx.y + (blockDim.y>>1))*(blockDim.x*gridDim.x) + blockDim.x*blockIdx.x + (blockDim.x>>1); + if(this->unusedCell_cuda[centreGID] == 0 || boundaryCondition == 0) + tmp = 1; + } + __syncthreads(); + + /*if(!tmp && (u[0]*u[l] <= 0.0)) + atomicMax( &tmp, 1);*/ + + __syncthreads(); + if(tmp !=1) + { +// if(computeFU) +// absVal[l]=0.0; +// else +// absVal[l] = fabs(u[l]); +// +// __syncthreads(); +// +// if((blockDim.x == 16) && (l < 128)) absVal[l] = Max(absVal[l],absVal[l+128]); +// __syncthreads(); +// if((blockDim.x == 16) && (l < 64)) absVal[l] = Max(absVal[l],absVal[l+64]); +// __syncthreads(); +// if(l < 32) absVal[l] = Max(absVal[l],absVal[l+32]); +// if(l < 16) absVal[l] = Max(absVal[l],absVal[l+16]); +// if(l < 8) absVal[l] = Max(absVal[l],absVal[l+8]); +// if(l < 4) absVal[l] = Max(absVal[l],absVal[l+4]); +// if(l < 2) absVal[l] = Max(absVal[l],absVal[l+2]); +// if(l < 1) value = Sign(u[0])*Max(absVal[l],absVal[l+1]); +// __syncthreads(); +// +// if(computeFU) +// u[l] = value; + if(computeFU) + { + if(boundaryCondition == 4) + u[l] = u[threadIdx.y * blockDim.x] + Sign(u[0])*this->subMesh.getHx()*(threadIdx.x) ;//+ 2*Sign(u[0])*this->subMesh.getHx()*(threadIdx.x+this->n); + else if(boundaryCondition == 2) + u[l] = u[threadIdx.y * blockDim.x + blockDim.x - 1] + Sign(u[0])*this->subMesh.getHx()*(this->n - 1 - threadIdx.x);//+ 2*Sign(u[0])*this->subMesh.getHx()*(blockDim.x - threadIdx.x - 1+this->n); + else if(boundaryCondition == 8) + u[l] = u[threadIdx.x] + Sign(u[0])*this->subMesh.getHx()*(threadIdx.y) ;//+ 2*Sign(u[0])*this->subMesh.getHx()*(threadIdx.y+this->n); + else if(boundaryCondition == 1) + u[l] = u[(blockDim.y - 1)* blockDim.x + threadIdx.x] + Sign(u[0])*this->subMesh.getHx()*(this->n - 1 - threadIdx.y) ;//+ Sign(u[0])*this->subMesh.getHx()*(blockDim.y - threadIdx.y - 1 +this->n); + } + } + + double time = 0.0; + __shared__ double currentTau; + double cfl = this->cflCondition; + double fu = 0.0; +// if(threadIdx.x * threadIdx.y == 0) +// { +// currentTau = this->tau0; +// } + double finalTime = this->stopTime; + __syncthreads(); + if( time + currentTau > finalTime ) currentTau = finalTime - time; + + + while( time < finalTime ) + { + if(computeFU) + fu = schemeHost.getValueDev( this->subMesh, l, tnlStaticVector<3,int>(i,j)/*this->subMesh.getCellCoordinates(l)*/, u, time, boundaryCondition); + + sharedTau[l]=cfl/fabs(fu); + + if(l == 0) + { + if(sharedTau[0] > 1.0 * this->subMesh.getHx()) sharedTau[0] = 1.0 * this->subMesh.getHx(); + } + else if(l == blockDim.x*blockDim.y - 1) + if( time + sharedTau[l] > finalTime ) sharedTau[l] = finalTime - time; + + + + if((blockDim.x == 16) && (l < 128)) sharedTau[l] = Min(sharedTau[l],sharedTau[l+128]); + __syncthreads(); + if((blockDim.x == 16) && (l < 64)) sharedTau[l] = Min(sharedTau[l],sharedTau[l+64]); + __syncthreads(); + if(l < 32) sharedTau[l] = Min(sharedTau[l],sharedTau[l+32]); + if(l < 16) sharedTau[l] = Min(sharedTau[l],sharedTau[l+16]); + if(l < 8) sharedTau[l] = Min(sharedTau[l],sharedTau[l+8]); + if(l < 4) sharedTau[l] = Min(sharedTau[l],sharedTau[l+4]); + if(l < 2) sharedTau[l] = Min(sharedTau[l],sharedTau[l+2]); + if(l < 1) currentTau = Min(sharedTau[l],sharedTau[l+1]); + __syncthreads(); + + u[l] += currentTau * fu; + time += currentTau; + } + + +} + + +template< typename SchemeHost, typename SchemeDevice, typename Device> +__device__ +int tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::getOwnerCUDA3D(int i) const +{ + + return ((i / (this->gridCols*this->n*this->n))*this->gridCols + + (i % (this->gridCols*this->n))/this->n); +} + + +template< typename SchemeHost, typename SchemeDevice, typename Device> +__device__ +int tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::getSubgridValueCUDA3D( int i ) const +{ + return this->subgridValues_cuda[i]; +} + + +template< typename SchemeHost, typename SchemeDevice, typename Device> +__device__ +void tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::setSubgridValueCUDA3D(int i, int value) +{ + this->subgridValues_cuda[i] = value; +} + + +template< typename SchemeHost, typename SchemeDevice, typename Device> +__device__ +int tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::getBoundaryConditionCUDA3D( int i ) const +{ + return this->boundaryConditions_cuda[i]; +} + + +template< typename SchemeHost, typename SchemeDevice, typename Device> +__device__ +void tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::setBoundaryConditionCUDA3D(int i, int value) +{ + this->boundaryConditions_cuda[i] = value; +} + + + +//north - 1, east - 2, west - 4, south - 8 + +template <typename SchemeHost, typename SchemeDevice, typename Device> +__global__ +void /*tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::*/synchronizeCUDA3D(tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int >* cudaSolver) //needs fix ---- maybe not anymore --- but frankly: yeah, it does -- aaaa-and maybe fixed now +{ + + __shared__ int boundary[4]; // north,east,west,south + __shared__ int subgridValue; + __shared__ int newSubgridValue; + + + int gid = (blockDim.y*blockIdx.y + threadIdx.y)*blockDim.x*gridDim.x + blockDim.x*blockIdx.x + threadIdx.x; + double u = cudaSolver->work_u_cuda[gid]; + double u_cmp; + int subgridValue_cmp=INT_MAX; + int boundary_index=0; + + + if(threadIdx.x+threadIdx.y == 0) + { + subgridValue = cudaSolver->getSubgridValueCUDA3D(blockIdx.y*gridDim.x + blockIdx.x); + boundary[0] = 0; + boundary[1] = 0; + boundary[2] = 0; + boundary[3] = 0; + newSubgridValue = 0; + //printf("%d %d\n", blockDim.x, gridDim.x); + } + __syncthreads(); + + + + if( (threadIdx.x == 0 /* && !(cudaSolver->currentStep & 1)*/) || + (threadIdx.y == 0 /* && (cudaSolver->currentStep & 1)*/) || + (threadIdx.x == blockDim.x - 1 /* && !(cudaSolver->currentStep & 1)*/) || + (threadIdx.y == blockDim.y - 1 /* && (cudaSolver->currentStep & 1)*/) ) + { + if(threadIdx.x == 0 && (blockIdx.x != 0)/* && !(cudaSolver->currentStep & 1)*/) + { + u_cmp = cudaSolver->work_u_cuda[gid - 1]; + subgridValue_cmp = cudaSolver->getSubgridValueCUDA3D(blockIdx.y*gridDim.x + blockIdx.x - 1); + boundary_index = 2; + } + + if(threadIdx.x == blockDim.x - 1 && (blockIdx.x != gridDim.x - 1)/* && !(cudaSolver->currentStep & 1)*/) + { + u_cmp = cudaSolver->work_u_cuda[gid + 1]; + subgridValue_cmp = cudaSolver->getSubgridValueCUDA3D(blockIdx.y*gridDim.x + blockIdx.x + 1); + boundary_index = 1; + } +// if(threadIdx.y == 0 && (blockIdx.y != 0) && (cudaSolver->currentStep & 1)) +// { +// u_cmp = cudaSolver->work_u_cuda[gid - blockDim.x*gridDim.x]; +// subgridValue_cmp = cudaSolver->getSubgridValueCUDA3D((blockIdx.y - 1)*gridDim.x + blockIdx.x); +// boundary_index = 3; +// } +// if(threadIdx.y == blockDim.y - 1 && (blockIdx.y != gridDim.y - 1) && (cudaSolver->currentStep & 1)) +// { +// u_cmp = cudaSolver->work_u_cuda[gid + blockDim.x*gridDim.x]; +// subgridValue_cmp = cudaSolver->getSubgridValueCUDA3D((blockIdx.y + 1)*gridDim.x + blockIdx.x); +// boundary_index = 0; +// } + + __threadfence(); + if((subgridValue == INT_MAX || fabs(u_cmp) + cudaSolver->delta < fabs(u) ) && (subgridValue_cmp != INT_MAX && subgridValue_cmp != -INT_MAX)) + { + cudaSolver->unusedCell_cuda[gid] = 0; + atomicMax(&newSubgridValue, INT_MAX); + atomicMax(&boundary[boundary_index], 1); + cudaSolver->work_u_cuda[gid] = u_cmp; + u=u_cmp; + } + //__threadfence(); + if(threadIdx.y == 0 && (blockIdx.y != 0)/* && (cudaSolver->currentStep & 1)*/) + { + u_cmp = cudaSolver->work_u_cuda[gid - blockDim.x*gridDim.x]; + subgridValue_cmp = cudaSolver->getSubgridValueCUDA3D((blockIdx.y - 1)*gridDim.x + blockIdx.x); + boundary_index = 3; + } + if(threadIdx.y == blockDim.y - 1 && (blockIdx.y != gridDim.y - 1)/* && (cudaSolver->currentStep & 1)*/) + { + u_cmp = cudaSolver->work_u_cuda[gid + blockDim.x*gridDim.x]; + subgridValue_cmp = cudaSolver->getSubgridValueCUDA3D((blockIdx.y + 1)*gridDim.x + blockIdx.x); + boundary_index = 0; + } + + __threadfence(); + if((subgridValue == INT_MAX || fabs(u_cmp) + cudaSolver->delta < fabs(u) ) && (subgridValue_cmp != INT_MAX && subgridValue_cmp != -INT_MAX)) + { + cudaSolver->unusedCell_cuda[gid] = 0; + atomicMax(&newSubgridValue, INT_MAX); + atomicMax(&boundary[boundary_index], 1); + cudaSolver->work_u_cuda[gid] = u_cmp; + } + } + __syncthreads(); + + if(threadIdx.x+threadIdx.y == 0) + { + if(subgridValue == INT_MAX && newSubgridValue !=0) + cudaSolver->setSubgridValueCUDA3D(blockIdx.y*gridDim.x + blockIdx.x, -INT_MAX); + + cudaSolver->setBoundaryConditionCUDA3D(blockIdx.y*gridDim.x + blockIdx.x, boundary[0] + + 2 * boundary[1] + + 4 * boundary[2] + + 8 * boundary[3]); + } + + + /* + //printf("I am not an empty kernel!\n"); + //cout << "Synchronizig..." << endl; + int tmp1, tmp2; + int grid1, grid2; + + if(cudaSolver->currentStep & 1) + { + //printf("I am not an empty kernel! 1\n"); + for(int j = 0; j < cudaSolver->gridRows - 1; j++) + { + //printf("I am not an empty kernel! 3\n"); + for (int i = 0; i < cudaSolver->gridCols*cudaSolver->n; i++) + { + tmp1 = cudaSolver->gridCols*cudaSolver->n*((cudaSolver->n-1)+j*cudaSolver->n) + i; + tmp2 = cudaSolver->gridCols*cudaSolver->n*((cudaSolver->n)+j*cudaSolver->n) + i; + grid1 = cudaSolver->getSubgridValueCUDA3D(cudaSolver->getOwnerCUDA3D(tmp1)); + grid2 = cudaSolver->getSubgridValueCUDA3D(cudaSolver->getOwnerCUDA3D(tmp2)); + + if ((fabs(cudaSolver->work_u_cuda[tmp1]) < fabs(cudaSolver->work_u_cuda[tmp2]) - cudaSolver->delta || grid2 == INT_MAX || grid2 == -INT_MAX) && (grid1 != INT_MAX && grid1 != -INT_MAX)) + { + //printf("%d %d %d %d \n",tmp1,tmp2,cudaSolver->getOwnerCUDA3D(tmp1),cudaSolver->getOwnerCUDA3D(tmp2)); + cudaSolver->work_u_cuda[tmp2] = cudaSolver->work_u_cuda[tmp1]; + cudaSolver->unusedCell_cuda[tmp2] = 0; + if(grid2 == INT_MAX) + { + cudaSolver->setSubgridValueCUDA3D(cudaSolver->getOwnerCUDA3D(tmp2), -INT_MAX); + } + if(! (cudaSolver->getBoundaryConditionCUDA3D(cudaSolver->getOwnerCUDA3D(tmp2)) & 8) ) + cudaSolver->setBoundaryConditionCUDA3D(cudaSolver->getOwnerCUDA3D(tmp2), cudaSolver->getBoundaryConditionCUDA3D(cudaSolver->getOwnerCUDA3D(tmp2))+8); + } + else if ((fabs(cudaSolver->work_u_cuda[tmp1]) > fabs(cudaSolver->work_u_cuda[tmp2]) + cudaSolver->delta || grid1 == INT_MAX || grid1 == -INT_MAX) && (grid2 != INT_MAX && grid2 != -INT_MAX)) + { + //printf("%d %d %d %d \n",tmp1,tmp2,cudaSolver->getOwnerCUDA3D(tmp1),cudaSolver->getOwnerCUDA3D(tmp2)); + cudaSolver->work_u_cuda[tmp1] = cudaSolver->work_u_cuda[tmp2]; + cudaSolver->unusedCell_cuda[tmp1] = 0; + if(grid1 == INT_MAX) + { + cudaSolver->setSubgridValueCUDA3D(cudaSolver->getOwnerCUDA3D(tmp1), -INT_MAX); + } + if(! (cudaSolver->getBoundaryConditionCUDA3D(cudaSolver->getOwnerCUDA3D(tmp1)) & 1) ) + cudaSolver->setBoundaryConditionCUDA3D(cudaSolver->getOwnerCUDA3D(tmp1), cudaSolver->getBoundaryConditionCUDA3D(cudaSolver->getOwnerCUDA3D(tmp1))+1); + } + } + } + + } + else + { + //printf("I am not an empty kernel! 2\n"); + for(int i = 1; i < cudaSolver->gridCols; i++) + { + //printf("I am not an empty kernel! 4\n"); + for (int j = 0; j < cudaSolver->gridRows*cudaSolver->n; j++) + { + + tmp1 = cudaSolver->gridCols*cudaSolver->n*j + i*cudaSolver->n - 1; + tmp2 = cudaSolver->gridCols*cudaSolver->n*j + i*cudaSolver->n ; + grid1 = cudaSolver->getSubgridValueCUDA3D(cudaSolver->getOwnerCUDA3D(tmp1)); + grid2 = cudaSolver->getSubgridValueCUDA3D(cudaSolver->getOwnerCUDA3D(tmp2)); + + if ((fabs(cudaSolver->work_u_cuda[tmp1]) < fabs(cudaSolver->work_u_cuda[tmp2]) - cudaSolver->delta || grid2 == INT_MAX || grid2 == -INT_MAX) && (grid1 != INT_MAX && grid1 != -INT_MAX)) + { + //printf("%d %d %d %d \n",tmp1,tmp2,cudaSolver->getOwnerCUDA3D(tmp1),cudaSolver->getOwnerCUDA3D(tmp2)); + cudaSolver->work_u_cuda[tmp2] = cudaSolver->work_u_cuda[tmp1]; + cudaSolver->unusedCell_cuda[tmp2] = 0; + if(grid2 == INT_MAX) + { + cudaSolver->setSubgridValueCUDA3D(cudaSolver->getOwnerCUDA3D(tmp2), -INT_MAX); + } + if(! (cudaSolver->getBoundaryConditionCUDA3D(cudaSolver->getOwnerCUDA3D(tmp2)) & 4) ) + cudaSolver->setBoundaryConditionCUDA3D(cudaSolver->getOwnerCUDA3D(tmp2), cudaSolver->getBoundaryConditionCUDA3D(cudaSolver->getOwnerCUDA3D(tmp2))+4); + } + else if ((fabs(cudaSolver->work_u_cuda[tmp1]) > fabs(cudaSolver->work_u_cuda[tmp2]) + cudaSolver->delta || grid1 == INT_MAX || grid1 == -INT_MAX) && (grid2 != INT_MAX && grid2 != -INT_MAX)) + { + //printf("%d %d %d %d \n",tmp1,tmp2,cudaSolver->getOwnerCUDA3D(tmp1),cudaSolver->getOwnerCUDA3D(tmp2)); + cudaSolver->work_u_cuda[tmp1] = cudaSolver->work_u_cuda[tmp2]; + cudaSolver->unusedCell_cuda[tmp1] = 0; + if(grid1 == INT_MAX) + { + cudaSolver->setSubgridValueCUDA3D(cudaSolver->getOwnerCUDA3D(tmp1), -INT_MAX); + } + if(! (cudaSolver->getBoundaryConditionCUDA3D(cudaSolver->getOwnerCUDA3D(tmp1)) & 2) ) + cudaSolver->setBoundaryConditionCUDA3D(cudaSolver->getOwnerCUDA3D(tmp1), cudaSolver->getBoundaryConditionCUDA3D(cudaSolver->getOwnerCUDA3D(tmp1))+2); + } + } + } + } + //printf("I am not an empty kernel! 5 cudaSolver->currentStep : %d \n", cudaSolver->currentStep); + + cudaSolver->currentStep = cudaSolver->currentStep + 1; + int stepValue = cudaSolver->currentStep + 4; + for (int i = 0; i < cudaSolver->gridRows * cudaSolver->gridCols; i++) + { + if( cudaSolver->getSubgridValueCUDA3D(i) == -INT_MAX ) + cudaSolver->setSubgridValueCUDA3D(i, stepValue); + } + + int maxi = 0; + for(int q=0; q < cudaSolver->gridRows*cudaSolver->gridCols;q++) + { + //printf("%d : %d\n", q, cudaSolver->boundaryConditions_cuda[q]); + maxi=Max(maxi,cudaSolver->getBoundaryConditionCUDA3D(q)); + } + //printf("I am not an empty kernel! %d\n", maxi); + *(cudaSolver->runcuda) = (maxi > 0); + //printf("I am not an empty kernel! 7 %d\n", cudaSolver->boundaryConditions_cuda[0]); + //cout << "Grid synchronized at step " << (this->currentStep - 1 ) << endl; +*/ +} + + + +template <typename SchemeHost, typename SchemeDevice, typename Device> +__global__ +void synchronize2CUDA3D(tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int >* cudaSolver) +{ + if(blockIdx.x+blockIdx.y ==0) + { + cudaSolver->currentStep = cudaSolver->currentStep + 1; + *(cudaSolver->runcuda) = 0; + } + + int stepValue = cudaSolver->currentStep + 4; + if( cudaSolver->getSubgridValueCUDA3D(blockIdx.y*gridDim.x + blockIdx.x) == -INT_MAX ) + cudaSolver->setSubgridValueCUDA3D(blockIdx.y*gridDim.x + blockIdx.x, stepValue); + + atomicMax((cudaSolver->runcuda),cudaSolver->getBoundaryConditionCUDA3D(blockIdx.y*gridDim.x + blockIdx.x)); +} + + + + + + + + +template< typename SchemeHost, typename SchemeDevice, typename Device> +__global__ +void /*tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::*/initCUDA3D( tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int >* cudaSolver, double* ptr , int* ptr2, int* ptr3) +{ + //cout << "Initializating solver..." << endl; + //const tnlString& meshLocation = parameters.getParameter <tnlString>("mesh"); + //this->mesh_cuda.load( meshLocation ); + + //this->n_cuda = parameters.getParameter <int>("subgrid-size"); + //cout << "Setting N << this->n_cuda << endl; + + //this->subMesh_cuda.setDimensions( this->n_cuda, this->n_cuda ); + //this->subMesh_cuda.setDomain( tnlStaticVector<3,double>(0.0, 0.0), + //tnlStaticVector<3,double>(this->mesh_cuda.getHx()*(double)(this->n_cuda), this->mesh_cuda.getHy()*(double)(this->n_cuda)) ); + + //this->subMesh_cuda.save("submesh.tnl"); + +// const tnlString& initialCondition = parameters.getParameter <tnlString>("initial-condition"); +// this->u0.load( initialCondition ); + + //cout << this->mesh.getCellCenter(0) << endl; + + //this->delta_cuda = parameters.getParameter <double>("delta"); + //this->delta_cuda *= this->mesh_cuda.getHx()*this->mesh_cuda.getHy(); + + //cout << "Setting delta to " << this->delta << endl; + + //this->tau0_cuda = parameters.getParameter <double>("initial-tau"); + //cout << "Setting initial tau to " << this->tau0_cuda << endl; + //this->stopTime_cuda = parameters.getParameter <double>("stop-time"); + + //this->cflCondition_cuda = parameters.getParameter <double>("cfl-condition"); + //this -> cflCondition_cuda *= sqrt(this->mesh_cuda.getHx()*this->mesh_cuda.getHy()); + //cout << "Setting CFL to " << this->cflCondition << endl; +//// +//// + +// this->gridRows_cuda = gridRows; +// this->gridCols_cuda = gridCols; + + cudaSolver->work_u_cuda = ptr;//(double*)malloc(cudaSolver->gridCols*cudaSolver->gridRows*cudaSolver->n*cudaSolver->n*sizeof(double)); + cudaSolver->unusedCell_cuda = ptr3;//(int*)malloc(cudaSolver->gridCols*cudaSolver->gridRows*cudaSolver->n*cudaSolver->n*sizeof(int)); + cudaSolver->subgridValues_cuda =(int*)malloc(cudaSolver->gridCols*cudaSolver->gridRows*sizeof(int)); + cudaSolver->boundaryConditions_cuda =(int*)malloc(cudaSolver->gridCols*cudaSolver->gridRows*sizeof(int)); + cudaSolver->runcuda = ptr2;//(bool*)malloc(sizeof(bool)); + *(cudaSolver->runcuda) = 1; + cudaSolver->currentStep = 1; + //cudaMemcpy(ptr,&(cudaSolver->work_u_cuda), sizeof(double*),cudaMemcpyDeviceToHost); + //ptr = cudaSolver->work_u_cuda; + printf("GPU memory allocated.\n"); + + for(int i = 0; i < cudaSolver->gridCols*cudaSolver->gridRows; i++) + { + cudaSolver->subgridValues_cuda[i] = INT_MAX; + cudaSolver->boundaryConditions_cuda[i] = 0; + } + + /*for(long int j = 0; j < cudaSolver->n*cudaSolver->n*cudaSolver->gridCols*cudaSolver->gridRows; j++) + { + printf("%d\n",j); + cudaSolver->unusedCell_cuda[ j] = 1; + }*/ + printf("GPU memory initialized.\n"); + + + //cudaSolver->work_u_cuda[50] = 32.153438; +//// +//// + //stretchGrid(); + //this->stopTime_cuda /= (double)(this->gridCols_cuda); + //this->stopTime_cuda *= (1.0+1.0/((double)(this->n_cuda) - 1.0)); + //cout << "Setting stopping time to " << this->stopTime << endl; + //this->stopTime_cuda = 1.5*((double)(this->n_cuda))*parameters.getParameter <double>("stop-time")*this->mesh_cuda.getHx(); + //cout << "Setting stopping time to " << this->stopTime << endl; + + //cout << "Initializating scheme..." << endl; + //if(!this->schemeDevice.init(parameters)) +// { + //cerr << "Scheme failed to initialize." << endl; +// return false; +// } + //cout << "Scheme initialized." << endl; + + //test(); + +// this->currentStep_cuda = 1; + //return true; +} + + + + +//extern __shared__ double array[]; +template< typename SchemeHost, typename SchemeDevice, typename Device > +__global__ +void /*tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::*/initRunCUDA3D(tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int >* caller) + +{ + + + extern __shared__ double u[]; + //printf("%p\n",caller->work_u_cuda); + + int i = blockIdx.y * gridDim.x + blockIdx.x; + int l = threadIdx.y * blockDim.x + threadIdx.x; + + __shared__ int containsCurve; + if(l == 0) + containsCurve = 0; + + //double a; + caller->getSubgridCUDA3D(i,caller, &u[l]); + //printf("%f %f\n",a , u[l]); + //u[l] = a; + //printf("Hi %f \n", u[l]); + __syncthreads(); + //printf("hurewrwr %f \n", u[l]); + if(u[0] * u[l] <= 0.0) + { + //printf("contains %d \n",i); + atomicMax( &containsCurve, 1); + } + + __syncthreads(); + //printf("hu"); + //printf("%d : %f\n", l, u[l]); + if(containsCurve == 1) + { + //printf("have curve \n"); + caller->runSubgridCUDA3D(0,u,i); + //printf("%d : %f\n", l, u[l]); + __syncthreads(); + caller->insertSubgridCUDA3D(u[l],i); + __syncthreads(); + if(l == 0) + caller->setSubgridValueCUDA3D(i, 4); + } + + +} + + + + + +template< typename SchemeHost, typename SchemeDevice, typename Device > +__global__ +void /*tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::*/runCUDA3D(tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int >* caller) +{ + extern __shared__ double u[]; + int i = blockIdx.y * gridDim.x + blockIdx.x; + int l = threadIdx.y * blockDim.x + threadIdx.x; + int bound = caller->getBoundaryConditionCUDA3D(i); + + if(caller->getSubgridValueCUDA3D(i) != INT_MAX && bound != 0 && caller->getSubgridValueCUDA3D(i) > 0) + { + caller->getSubgridCUDA3D(i,caller, &u[l]); + + //if(l == 0) + //printf("i = %d, bound = %d\n",i,caller->getSubgridValueCUDA3D(i)); + if(caller->getSubgridValueCUDA3D(i) == caller->currentStep+4) + { + if(bound & 1) + { + caller->runSubgridCUDA3D(1,u,i); + //__syncthreads(); + //caller->insertSubgridCUDA3D(u[l],i); + //__syncthreads(); + //caller->getSubgridCUDA3D(i,caller, &u[l]); + caller->updateSubgridCUDA3D(i,caller, &u[l]); + __syncthreads(); + } + if(bound & 2 ) + { + caller->runSubgridCUDA3D(2,u,i); + //__syncthreads(); + //caller->insertSubgridCUDA3D(u[l],i); + //__syncthreads(); + //caller->getSubgridCUDA3D(i,caller, &u[l]); + caller->updateSubgridCUDA3D(i,caller, &u[l]); + __syncthreads(); + } + if(bound & 4) + { + caller->runSubgridCUDA3D(4,u,i); + //__syncthreads(); + //caller->insertSubgridCUDA3D(u[l],i); + //__syncthreads(); + //caller->getSubgridCUDA3D(i,caller, &u[l]); + caller->updateSubgridCUDA3D(i,caller, &u[l]); + __syncthreads(); + } + if(bound & 8) + { + caller->runSubgridCUDA3D(8,u,i); + //__syncthreads(); + //caller->insertSubgridCUDA3D(u[l],i); + //__syncthreads(); + //caller->getSubgridCUDA3D(i,caller, &u[l]); + caller->updateSubgridCUDA3D(i,caller, &u[l]); + __syncthreads(); + } + + + + + + if( ((bound & 3 ))) + { + caller->runSubgridCUDA3D(3,u,i); + //__syncthreads(); + //caller->insertSubgridCUDA3D(u[l],i); + //__syncthreads(); + //caller->getSubgridCUDA3D(i,caller, &u[l]); + caller->updateSubgridCUDA3D(i,caller, &u[l]); + __syncthreads(); + } + if( ((bound & 5 ))) + { + caller->runSubgridCUDA3D(5,u,i); + //__syncthreads(); + //caller->insertSubgridCUDA3D(u[l],i); + //__syncthreads(); + //caller->getSubgridCUDA3D(i,caller, &u[l]); + caller->updateSubgridCUDA3D(i,caller, &u[l]); + __syncthreads(); + } + if( ((bound & 10 ))) + { + caller->runSubgridCUDA3D(10,u,i); + //__syncthreads(); + //caller->insertSubgridCUDA3D(u[l],i); + //__syncthreads(); + //caller->getSubgridCUDA3D(i,caller, &u[l]); + caller->updateSubgridCUDA3D(i,caller, &u[l]); + __syncthreads(); + } + if( (bound & 12 )) + { + caller->runSubgridCUDA3D(12,u,i); + //__syncthreads(); + //caller->insertSubgridCUDA3D(u[l],i); + //__syncthreads(); + //caller->getSubgridCUDA3D(i,caller, &u[l]); + caller->updateSubgridCUDA3D(i,caller, &u[l]); + __syncthreads(); + } + + + + + + } + + + else + { + + + + + + + + + + if( ((bound == 2))) + { + caller->runSubgridCUDA3D(2,u,i); + //__syncthreads(); + //caller->insertSubgridCUDA3D(u[l],i); + //__syncthreads(); + //caller->getSubgridCUDA3D(i,caller, &u[l]); + caller->updateSubgridCUDA3D(i,caller, &u[l]); + __syncthreads(); + } + if( ((bound == 1) )) + { + caller->runSubgridCUDA3D(1,u,i); + //__syncthreads(); + //caller->insertSubgridCUDA3D(u[l],i); + //__syncthreads(); + //caller->getSubgridCUDA3D(i,caller, &u[l]); + caller->updateSubgridCUDA3D(i,caller, &u[l]); + __syncthreads(); + } + if( ((bound == 8) )) + { + caller->runSubgridCUDA3D(8,u,i); + //__syncthreads(); + //caller->insertSubgridCUDA3D(u[l],i); + //__syncthreads(); + //caller->getSubgridCUDA3D(i,caller, &u[l]); + caller->updateSubgridCUDA3D(i,caller, &u[l]); + __syncthreads(); + } + if( (bound == 4)) + { + caller->runSubgridCUDA3D(4,u,i); + //__syncthreads(); + //caller->insertSubgridCUDA3D(u[l],i); + //__syncthreads(); + //caller->getSubgridCUDA3D(i,caller, &u[l]); + caller->updateSubgridCUDA3D(i,caller, &u[l]); + __syncthreads(); + } + + + + + + + + + + + if( ((bound & 3) )) + { + caller->runSubgridCUDA3D(3,u,i); + //__syncthreads(); + //caller->insertSubgridCUDA3D(u[l],i); + //__syncthreads(); + //caller->getSubgridCUDA3D(i,caller, &u[l]); + caller->updateSubgridCUDA3D(i,caller, &u[l]); + __syncthreads(); + } + if( ((bound & 5) )) + { + caller->runSubgridCUDA3D(5,u,i); + //__syncthreads(); + //caller->insertSubgridCUDA3D(u[l],i); + //__syncthreads(); + //caller->getSubgridCUDA3D(i,caller, &u[l]); + caller->updateSubgridCUDA3D(i,caller, &u[l]); + __syncthreads(); + } + if( ((bound & 10) )) + { + caller->runSubgridCUDA3D(10,u,i); + //__syncthreads(); + //caller->insertSubgridCUDA3D(u[l],i); + //__syncthreads(); + //caller->getSubgridCUDA3D(i,caller, &u[l]); + caller->updateSubgridCUDA3D(i,caller, &u[l]); + __syncthreads(); + } + if( (bound & 12) ) + { + caller->runSubgridCUDA3D(12,u,i); + //__syncthreads(); + //caller->insertSubgridCUDA3D(u[l],i); + //__syncthreads(); + //caller->getSubgridCUDA3D(i,caller, &u[l]); + caller->updateSubgridCUDA3D(i,caller, &u[l]); + __syncthreads(); + } + + + + + + + + + + + + + } + /*if( bound ) + { + caller->runSubgridCUDA3D(15,u,i); + __syncthreads(); + //caller->insertSubgridCUDA3D(u[l],i); + //__syncthreads(); + //caller->getSubgridCUDA3D(i,caller, &u[l]); + caller->updateSubgridCUDA3D(i,caller, &u[l]); + __syncthreads(); + }*/ + + if(l==0) + { + caller->setBoundaryConditionCUDA3D(i, 0); + caller->setSubgridValueCUDA3D(i, caller->getSubgridValueCUDA3D(i) - 1 ); + } + + + } + + + +} + +#endif /*HAVE_CUDA*/ + +#endif /* TNLPARALLELEIKONALSOLVER3D_IMPL_H_ */