diff --git a/examples/fast-sweeping/fastSweepingConfig.h b/examples/fast-sweeping/fastSweepingConfig.h index 6b8b7b9a910fde8135860c2cdfdbf464f5665ec6..ff1403630100c2ce49375eee25a3a74f4d2e3049 100644 --- a/examples/fast-sweeping/fastSweepingConfig.h +++ b/examples/fast-sweeping/fastSweepingConfig.h @@ -30,6 +30,7 @@ class fastSweepingConfig config.addEntry < tnlString > ( "problem-name", "This defines particular problem.", "fast-sweeping" ); config.addRequiredEntry < tnlString > ( "initial-condition", "Initial condition for solver"); config.addEntry < tnlString > ( "mesh", "Name of mesh.", "mesh.tnl" ); + config.addEntry < tnlString > ( "exact-input", "Are the function values near the curve equal to the SDF? (yes/no)", "no" ); } }; diff --git a/examples/fast-sweeping/main.h b/examples/fast-sweeping/main.h index c21b5913588c358248c81731d539609f913aea31..75850bbf6fb26033017272aad22a0dea3cff8499 100644 --- a/examples/fast-sweeping/main.h +++ b/examples/fast-sweeping/main.h @@ -16,7 +16,10 @@ #include "MainBuildConfig.h" -#include "tnlFastSweeping.h" + //for HOST versions: +//#include "tnlFastSweeping.h" + //for DEVICE versions: +#include "tnlFastSweeping_CUDA.h" #include "fastSweepingConfig.h" #include <solvers/tnlConfigTags.h> @@ -46,6 +49,7 @@ int main( int argc, char* argv[] ) cerr << "Solver failed to initialize." << endl; return EXIT_FAILURE; } + checkCudaDevice; cout << "-------------------------------------------------------------" << endl; cout << "Starting solver..." << endl; solver.run(); diff --git a/examples/fast-sweeping/tnlFastSweeping.h b/examples/fast-sweeping/tnlFastSweeping.h index 1e04925ecb220eded08dd163f7d9c280fd52269a..b18722b2a6f67fcae1c7120cf94a722110c8e6d6 100644 --- a/examples/fast-sweeping/tnlFastSweeping.h +++ b/examples/fast-sweeping/tnlFastSweeping.h @@ -24,6 +24,9 @@ #include <limits.h> #include <core/tnlDevice.h> #include <ctime> +#ifdef HAVE_OPENMP +#include <omp.h> +#endif @@ -60,7 +63,12 @@ public: bool initGrid(); bool run(); - void updateValue(const Index i, const Index j); + + //for single core version use this implementation: + //void updateValue(const Index i, const Index j); + //for parallel version use this one instead: + void updateValue(const Index i, const Index j, DofVectorType* grid); + Real fabsMin(const Real x, const Real y); @@ -68,14 +76,22 @@ protected: MeshType Mesh; + bool exactInput; + DofVectorType dofVector; RealType h; +#ifdef HAVE_OPENMP +// omp_lock_t* gridLock; +#endif -}; +}; -#include "tnlFastSweeping2D_impl.h" + //for single core version use this implementation: +//#include "tnlFastSweeping2D_impl.h" + //for parallel version use this one instead: + #include "tnlFastSweeping2D_openMP_impl.h" #endif /* TNLFASTSWEEPING_H_ */ diff --git a/examples/fast-sweeping/tnlFastSweeping2D_CUDA_impl.h b/examples/fast-sweeping/tnlFastSweeping2D_CUDA_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..5b90938c2a426f5a98217153017596a574796042 --- /dev/null +++ b/examples/fast-sweeping/tnlFastSweeping2D_CUDA_impl.h @@ -0,0 +1,522 @@ +/*************************************************************************** + tnlFastSweeping_impl.h - description + ------------------- + begin : Oct 15 , 2015 + copyright : (C) 2015 by Tomas Sobotik + ***************************************************************************/ + +/*************************************************************************** + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + ***************************************************************************/ +#ifndef TNLFASTSWEEPING2D_IMPL_H_ +#define TNLFASTSWEEPING2D_IMPL_H_ + +#include "tnlFastSweeping.h" + + +template< typename MeshReal, + typename Device, + typename MeshIndex, + typename Real, + typename Index > +tnlString tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: getType() +{ + return tnlString( "tnlFastSweeping< " ) + + MeshType::getType() + ", " + + ::getType< Real >() + ", " + + ::getType< Index >() + " >"; +} + + + + +template< typename MeshReal, + typename Device, + typename MeshIndex, + typename Real, + typename Index > +bool tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: init( const tnlParameterContainer& parameters ) +{ + const tnlString& meshFile = parameters.getParameter< tnlString >( "mesh" ); + + if( ! Mesh.load( meshFile ) ) + { + cerr << "I am not able to load the mesh from the file " << meshFile << "." << endl; + return false; + } + + + const tnlString& initialCondition = parameters.getParameter <tnlString>("initial-condition"); + if( ! dofVector.load( initialCondition ) ) + { + cerr << "I am not able to load the initial condition from the file " << meshFile << "." << endl; + return false; + } + + h = Mesh.getHx(); + counter = 0; + + const tnlString& exact_input = parameters.getParameter< tnlString >( "exact-input" ); + + if(exact_input == "no") + exactInput=false; + else + exactInput=true; + + +#ifdef HAVE_CUDA + + cudaMalloc(&(cudaDofVector), this->dofVector.getSize()*sizeof(double)); + cudaMemcpy(cudaDofVector, this->dofVector.getData(), this->dofVector.getSize()*sizeof(double), cudaMemcpyHostToDevice); + + cudaMalloc(&(cudaDofVector2), this->dofVector.getSize()*sizeof(double)); + cudaMemcpy(cudaDofVector2, this->dofVector.getData(), this->dofVector.getSize()*sizeof(double), cudaMemcpyHostToDevice); + + + cudaMalloc(&(this->cudaSolver), sizeof(tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index >)); + cudaMemcpy(this->cudaSolver, this,sizeof(tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index >), cudaMemcpyHostToDevice); + +#endif + + int n = Mesh.getDimensions().x(); + dim3 threadsPerBlock(16, 16); + dim3 numBlocks(n/16 + 1 ,n/16 +1); + + initCUDA<<<numBlocks,threadsPerBlock>>>(this->cudaSolver); + cudaDeviceSynchronize(); + checkCudaDevice; + + return true; +} + + + + + +template< typename MeshReal, + typename Device, + typename MeshIndex, + typename Real, + typename Index > +bool tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: run() +{ +// +// for(Index i = 0; i < Mesh.getDimensions().x(); i++) +// { +// for(Index j = 0; j < Mesh.getDimensions().y(); j++) +// { +// updateValue(i,j); +// } +// } +// +///*---------------------------------------------------------------------------------------------------------------------------*/ +// +// for(Index i = Mesh.getDimensions().x() - 1; i > -1; i--) +// { +// for(Index j = 0; j < Mesh.getDimensions().y(); j++) +// { +// updateValue(i,j); +// } +// } +// +///*---------------------------------------------------------------------------------------------------------------------------*/ +// +// for(Index i = Mesh.getDimensions().x() - 1; i > -1; i--) +// { +// for(Index j = Mesh.getDimensions().y() - 1; j > -1; j--) +// { +// updateValue(i,j); +// } +// } +// +///*---------------------------------------------------------------------------------------------------------------------------*/ +// for(Index i = 0; i < Mesh.getDimensions().x(); i++) +// { +// for(Index j = Mesh.getDimensions().y() - 1; j > -1; j--) +// { +// updateValue(i,j); +// } +// } +// +///*---------------------------------------------------------------------------------------------------------------------------*/ +// +// +// dofVector.save("u-00001.tnl"); + + int n = Mesh.getDimensions().x(); + dim3 threadsPerBlock(32, 32); + dim3 numBlocks(n/32 + 1 ,n/32 +1); + + for(int i = 2*n - 1; i > -1; i--) + { + runCUDA<<<numBlocks,threadsPerBlock>>>(this->cudaSolver,4,i); + cudaDeviceSynchronize(); + } + cudaDeviceSynchronize(); + checkCudaDevice; + for(int i = 0; i < 2*n ; i++) + { + runCUDA<<<numBlocks,threadsPerBlock>>>(this->cudaSolver,1,i); + cudaDeviceSynchronize(); + } + cudaDeviceSynchronize(); + checkCudaDevice; + for(int i = 0; i < 2*n ; i++) + { + runCUDA<<<numBlocks,threadsPerBlock>>>(this->cudaSolver,2,i); + cudaDeviceSynchronize(); + } + cudaDeviceSynchronize(); + checkCudaDevice; + for(int i = 2*n - 1; i > -1; i--) + { + runCUDA<<<numBlocks,threadsPerBlock>>>(this->cudaSolver,3,i); + cudaDeviceSynchronize(); + } + + cudaDeviceSynchronize(); + checkCudaDevice; + + cudaMemcpy(this->dofVector.getData(), cudaDofVector, this->dofVector.getSize()*sizeof(double), cudaMemcpyDeviceToHost); + cudaDeviceSynchronize(); + cudaFree(cudaDofVector); + cudaFree(cudaDofVector2); + cudaFree(cudaSolver); + dofVector.save("u-00001.tnl"); + cudaDeviceSynchronize(); + return true; +} + + + + +#ifdef HAVE_CUDA + + +template< typename MeshReal, + typename Device, + typename MeshIndex, + typename Real, + typename Index > +__device__ +void tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: updateValue( Index i, Index j) +{ + Index index = Mesh.getCellIndex(CoordinatesType(i,j)); + Real value = cudaDofVector[index]; + Real a,b, tmp; + + if( i == 0 ) + a = cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)]; + else if( i == Mesh.getDimensions().x() - 1 ) + a = cudaDofVector[Mesh.template getCellNextToCell<-1,0>(index)]; + else + { + a = fabsMin( cudaDofVector[Mesh.template getCellNextToCell<-1,0>(index)], + cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)] ); + } + + if( j == 0 ) + b = cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)]; + else if( j == Mesh.getDimensions().y() - 1 ) + b = cudaDofVector[Mesh.template getCellNextToCell<0,-1>(index)]; + else + { + b = fabsMin( cudaDofVector[Mesh.template getCellNextToCell<0,-1>(index)], + cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)] ); + } + + + if(abs(a-b) >= h) + tmp = fabsMin(a,b) + Sign(value)*h; + else + tmp = 0.5 * (a + b + Sign(value)*sqrt(2.0 * h * h - (a - b) * (a - b) ) ); + + cudaDofVector[index] = fabsMin(value, tmp); + +} + + +template< typename MeshReal, + typename Device, + typename MeshIndex, + typename Real, + typename Index > +__device__ +bool tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: initGrid() +{ + int gx = threadIdx.x + blockDim.x*blockIdx.x; + int gy = blockDim.y*blockIdx.y + threadIdx.y; + int gid = Mesh.getCellIndex(CoordinatesType(gx,gy)); + + int total = blockDim.x*gridDim.x; + + + + Real tmp = 0.0; + int flag = 0; + counter = 0; + tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))]); + + + if(!exactInput) + { + cudaDofVector[gid]=cudaDofVector[gid]=0.5*h*Sign(cudaDofVector[gid]); + } + __threadfence(); +// printf("-----------------------------------------------------------------------------------\n"); + + __threadfence(); + + if(gx > 0 && gx < Mesh.getDimensions().x()-1) + { + if(gy > 0 && gy < Mesh.getDimensions().y()-1) + { + + Index j = gy; + Index i = gx; +// tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))]); + + if(tmp == 0.0) + {} + else if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 || + cudaDofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 || + cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 || + cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 ) + {} + else + flag=1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX; + } + } + +// printf("gx: %d, gy: %d, gid: %d \n", gx, gy,gid); +// printf("****************************************************************\n"); +// printf("gx: %d, gy: %d, gid: %d \n", gx, gy,gid); + if(gx > 0 && gx < Mesh.getDimensions().x()-1 && gy == 0) + { +// printf("gx: %d, gy: %d, gid: %d \n", gx, gy,gid); + Index j = 0; + Index i = gx; +// tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))]); + + + if(tmp == 0.0) + {} + else if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 || + cudaDofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 || + cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 ) + {} + else + flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX; + } + +// printf("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n"); + if(gx > 0 && gx < Mesh.getDimensions().x()-1 && gy == Mesh.getDimensions().y() - 1) + { + Index i = gx; + Index j = Mesh.getDimensions().y() - 1; +// tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))]); + + + if(tmp == 0.0) + {} + else if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 || + cudaDofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 || + cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 ) + {} + else + flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX; + } + +// printf("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"); + if(gy > 0 && gy < Mesh.getDimensions().y()-1 && gx == 0) + { + Index j = gy; + Index i = 0; +// tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))]); + + + if(tmp == 0.0) + {} + else if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 || + cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 || + cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 ) + {} + else + flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX; + } +// printf("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\n"); + if(gy > 0 && gy < Mesh.getDimensions().y()-1 && gx == Mesh.getDimensions().x() - 1) + { + Index j = gy; + Index i = Mesh.getDimensions().x() - 1; +// tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))]); + + + if(tmp == 0.0) + {} + else if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 || + cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 || + cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 ) + {} + else + flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX; + } + +// printf("##################################################################################################\n"); + if(gx == Mesh.getDimensions().x() - 1 && + gy == Mesh.getDimensions().y() - 1) + { + +// tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))]); + if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx-1,gy))]*tmp > 0.0 && + cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy-1))]*tmp > 0.0) + + flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))] = tmp*INT_MAX; + } + if(gx == Mesh.getDimensions().x() - 1 && + gy == 0) + { + +// tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))]); + if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx-1,gy))]*tmp > 0.0 && + cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy+1))]*tmp > 0.0) + + flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))] = tmp*INT_MAX; + } +// printf("$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$\n"); + if(gx == 0 && + gy == Mesh.getDimensions().y() - 1) + { + +// tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))]); + if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx+1,gy))]*tmp > 0.0 && + cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy-1))]*tmp > 0.0) + + flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))] = tmp*INT_MAX; + } + if(gx == 0 && + gy == 0) + { +// tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))]); + if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx+1,gy))]*tmp > 0.0 && + cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy+1))]*tmp > 0.0) + + flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))] = tmp*INT_MAX; + } + + __threadfence(); + + if(flag==1) + cudaDofVector[gid] = tmp*3; +} + + +template< typename MeshReal, + typename Device, + typename MeshIndex, + typename Real, + typename Index > +__device__ +Real tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: fabsMin( Real x, Real y) +{ + Real fx = abs(x); + Real fy = abs(y); + + Real tmpMin = Min(fx,fy); + + if(tmpMin == fx) + return x; + else + return y; + + +} + + + +__global__ void runCUDA(tnlFastSweeping< tnlGrid< 2,double, tnlHost, int >, double, int >* solver, int sweep, int i) +{ + + int gx = threadIdx.x + blockDim.x*blockIdx.x; + int gy = blockDim.y*blockIdx.y + threadIdx.y; + if(solver->Mesh.getDimensions().x() <= gx || solver->Mesh.getDimensions().y() <= gy) + return; + int total = solver->Mesh.getDimensions().x(); + //int gid = solver->Mesh.getDimensions().x() * gy + gx; + int max = solver->Mesh.getDimensions().x()*solver->Mesh.getDimensions().x(); + + int id1 = gx+gy; + int id2 = (solver->Mesh.getDimensions().x() - gx - 1) + gy; + + /*---------------------------------------------------------------------------------------------------------------------------*/ + if(sweep == 1) +// for(int i = 0; i < 2*total - 1; i++) + { + if(id1 == i) + { + solver->updateValue(gx,gy); + return; + } + + } + /*---------------------------------------------------------------------------------------------------------------------------*/ + else if(sweep == 2) +// for(int i = 0; i < 2*total - 1; i++) + { + if(id2 == i) + { + solver->updateValue(gx,gy); + return; + } + } + /*---------------------------------------------------------------------------------------------------------------------------*/ + else if(sweep == 3) +// for(int i = 2*total - 2; i > -1; i--) + { + if(id1 == i) + { + solver->updateValue(gx,gy); + return; + } + } + /*---------------------------------------------------------------------------------------------------------------------------*/ + else if(sweep == 4) +// for(int i = 2*total - 2; i > -1; i--) + { + if(id2 == i) + { + solver->updateValue(gx,gy); + return; + } + } + /*---------------------------------------------------------------------------------------------------------------------------*/ + + + + +} + + +__global__ void initCUDA(tnlFastSweeping< tnlGrid< 2,double, tnlHost, int >, double, int >* solver) +{ + int gx = threadIdx.x + blockDim.x*blockIdx.x; + int gy = blockDim.y*blockIdx.y + threadIdx.y; + + if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy) + { + solver->initGrid(); + } + + +} +#endif + + + + +#endif /* TNLFASTSWEEPING_IMPL_H_ */ diff --git a/examples/fast-sweeping/tnlFastSweeping2D_CUDA_v2_impl.h b/examples/fast-sweeping/tnlFastSweeping2D_CUDA_v2_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..cbaf200b69edcf7d389a3466f34be5b3a4897415 --- /dev/null +++ b/examples/fast-sweeping/tnlFastSweeping2D_CUDA_v2_impl.h @@ -0,0 +1,588 @@ +/*************************************************************************** + tnlFastSweeping_impl.h - description + ------------------- + begin : Oct 15 , 2015 + copyright : (C) 2015 by Tomas Sobotik + ***************************************************************************/ + +/*************************************************************************** + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + ***************************************************************************/ +#ifndef TNLFASTSWEEPING2D_IMPL_H_ +#define TNLFASTSWEEPING2D_IMPL_H_ + +#include "tnlFastSweeping.h" + + +template< typename MeshReal, + typename Device, + typename MeshIndex, + typename Real, + typename Index > +tnlString tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: getType() +{ + return tnlString( "tnlFastSweeping< " ) + + MeshType::getType() + ", " + + ::getType< Real >() + ", " + + ::getType< Index >() + " >"; +} + + + + +template< typename MeshReal, + typename Device, + typename MeshIndex, + typename Real, + typename Index > +bool tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: init( const tnlParameterContainer& parameters ) +{ + const tnlString& meshFile = parameters.getParameter< tnlString >( "mesh" ); + + if( ! Mesh.load( meshFile ) ) + { + cerr << "I am not able to load the mesh from the file " << meshFile << "." << endl; + return false; + } + + + const tnlString& initialCondition = parameters.getParameter <tnlString>("initial-condition"); + if( ! dofVector.load( initialCondition ) ) + { + cerr << "I am not able to load the initial condition from the file " << meshFile << "." << endl; + return false; + } + + h = Mesh.getHx(); + counter = 0; + + const tnlString& exact_input = parameters.getParameter< tnlString >( "exact-input" ); + + if(exact_input == "no") + exactInput=false; + else + exactInput=true; + + +#ifdef HAVE_CUDA + + cudaMalloc(&(cudaDofVector), this->dofVector.getSize()*sizeof(double)); + cudaMemcpy(cudaDofVector, this->dofVector.getData(), this->dofVector.getSize()*sizeof(double), cudaMemcpyHostToDevice); + + cudaMalloc(&(cudaDofVector2), this->dofVector.getSize()*sizeof(double)); + cudaMemcpy(cudaDofVector2, this->dofVector.getData(), this->dofVector.getSize()*sizeof(double), cudaMemcpyHostToDevice); + + + cudaMalloc(&(this->cudaSolver), sizeof(tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index >)); + cudaMemcpy(this->cudaSolver, this,sizeof(tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index >), cudaMemcpyHostToDevice); + +#endif + + int n = Mesh.getDimensions().x(); + dim3 threadsPerBlock(16, 16); + dim3 numBlocks(n/16 + 1 ,n/16 +1); + + initCUDA<<<numBlocks,threadsPerBlock>>>(this->cudaSolver); + cudaDeviceSynchronize(); + checkCudaDevice; + + return true; +} + + + + + +template< typename MeshReal, + typename Device, + typename MeshIndex, + typename Real, + typename Index > +bool tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: run() +{ +// +// for(Index i = 0; i < Mesh.getDimensions().x(); i++) +// { +// for(Index j = 0; j < Mesh.getDimensions().y(); j++) +// { +// updateValue(i,j); +// } +// } +// +///*---------------------------------------------------------------------------------------------------------------------------*/ +// +// for(Index i = Mesh.getDimensions().x() - 1; i > -1; i--) +// { +// for(Index j = 0; j < Mesh.getDimensions().y(); j++) +// { +// updateValue(i,j); +// } +// } +// +///*---------------------------------------------------------------------------------------------------------------------------*/ +// +// for(Index i = Mesh.getDimensions().x() - 1; i > -1; i--) +// { +// for(Index j = Mesh.getDimensions().y() - 1; j > -1; j--) +// { +// updateValue(i,j); +// } +// } +// +///*---------------------------------------------------------------------------------------------------------------------------*/ +// for(Index i = 0; i < Mesh.getDimensions().x(); i++) +// { +// for(Index j = Mesh.getDimensions().y() - 1; j > -1; j--) +// { +// updateValue(i,j); +// } +// } +// +///*---------------------------------------------------------------------------------------------------------------------------*/ +// +// +// dofVector.save("u-00001.tnl"); + + int n = Mesh.getDimensions().x(); + dim3 threadsPerBlock(27, 27); + dim3 numBlocks(1 ,1); + +// for(int i = 2*n - 1; i > -1; i--) + { + runCUDA<<<numBlocks,threadsPerBlock>>>(this->cudaSolver,4,0); + cudaDeviceSynchronize(); + } + cudaDeviceSynchronize(); + checkCudaDevice; +//// for(int i = 0; i < 2*n ; i++) +// { +// runCUDA<<<numBlocks,threadsPerBlock>>>(this->cudaSolver,1,0); +// cudaDeviceSynchronize(); +// } +// cudaDeviceSynchronize(); +// checkCudaDevice; +//// for(int i = 0; i < 2*n ; i++) +// { +// runCUDA<<<numBlocks,threadsPerBlock>>>(this->cudaSolver,2,0); +// cudaDeviceSynchronize(); +// } +// cudaDeviceSynchronize(); +// checkCudaDevice; +//// for(int i = 2*n - 1; i > -1; i--) +// { +// runCUDA<<<numBlocks,threadsPerBlock>>>(this->cudaSolver,3,0); +// cudaDeviceSynchronize(); +// } +// +// cudaDeviceSynchronize(); +// checkCudaDevice; + + cudaMemcpy(this->dofVector.getData(), cudaDofVector, this->dofVector.getSize()*sizeof(double), cudaMemcpyDeviceToHost); + cudaDeviceSynchronize(); + cudaFree(cudaDofVector); + cudaFree(cudaDofVector2); + cudaFree(cudaSolver); + dofVector.save("u-00001.tnl"); + cudaDeviceSynchronize(); + return true; +} + + + + +#ifdef HAVE_CUDA + + +template< typename MeshReal, + typename Device, + typename MeshIndex, + typename Real, + typename Index > +__device__ +void tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: updateValue( Index i, Index j) +{ + Index index = Mesh.getCellIndex(CoordinatesType(i,j)); + Real value = cudaDofVector[index]; + Real a,b, tmp; + + if( i == 0 ) + a = cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)]; + else if( i == Mesh.getDimensions().x() - 1 ) + a = cudaDofVector[Mesh.template getCellNextToCell<-1,0>(index)]; + else + { + a = fabsMin( cudaDofVector[Mesh.template getCellNextToCell<-1,0>(index)], + cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)] ); + } + + if( j == 0 ) + b = cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)]; + else if( j == Mesh.getDimensions().y() - 1 ) + b = cudaDofVector[Mesh.template getCellNextToCell<0,-1>(index)]; + else + { + b = fabsMin( cudaDofVector[Mesh.template getCellNextToCell<0,-1>(index)], + cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)] ); + } + + + if(abs(a-b) >= h) + tmp = fabsMin(a,b) + Sign(value)*h; + else + tmp = 0.5 * (a + b + Sign(value)*sqrt(2.0 * h * h - (a - b) * (a - b) ) ); + + cudaDofVector[index] = fabsMin(value, tmp); + +} + + +template< typename MeshReal, + typename Device, + typename MeshIndex, + typename Real, + typename Index > +__device__ +bool tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: initGrid() +{ + int gx = threadIdx.x + blockDim.x*blockIdx.x; + int gy = blockDim.y*blockIdx.y + threadIdx.y; + int gid = Mesh.getCellIndex(CoordinatesType(gx,gy)); + + int total = blockDim.x*gridDim.x; + + + + Real tmp = 0.0; + int flag = 0; + counter = 0; + tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))]); + + + if(!exactInput) + { + cudaDofVector[gid]=cudaDofVector[gid]=0.5*h*Sign(cudaDofVector[gid]); + } + __threadfence(); +// printf("-----------------------------------------------------------------------------------\n"); + + __threadfence(); + + if(gx > 0 && gx < Mesh.getDimensions().x()-1) + { + if(gy > 0 && gy < Mesh.getDimensions().y()-1) + { + + Index j = gy; + Index i = gx; +// tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))]); + + if(tmp == 0.0) + {} + else if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 || + cudaDofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 || + cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 || + cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 ) + {} + else + flag=1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX; + } + } + +// printf("gx: %d, gy: %d, gid: %d \n", gx, gy,gid); +// printf("****************************************************************\n"); +// printf("gx: %d, gy: %d, gid: %d \n", gx, gy,gid); + if(gx > 0 && gx < Mesh.getDimensions().x()-1 && gy == 0) + { +// printf("gx: %d, gy: %d, gid: %d \n", gx, gy,gid); + Index j = 0; + Index i = gx; +// tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))]); + + + if(tmp == 0.0) + {} + else if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 || + cudaDofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 || + cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 ) + {} + else + flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX; + } + +// printf("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n"); + if(gx > 0 && gx < Mesh.getDimensions().x()-1 && gy == Mesh.getDimensions().y() - 1) + { + Index i = gx; + Index j = Mesh.getDimensions().y() - 1; +// tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))]); + + + if(tmp == 0.0) + {} + else if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 || + cudaDofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 || + cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 ) + {} + else + flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX; + } + +// printf("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"); + if(gy > 0 && gy < Mesh.getDimensions().y()-1 && gx == 0) + { + Index j = gy; + Index i = 0; +// tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))]); + + + if(tmp == 0.0) + {} + else if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 || + cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 || + cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 ) + {} + else + flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX; + } +// printf("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\n"); + if(gy > 0 && gy < Mesh.getDimensions().y()-1 && gx == Mesh.getDimensions().x() - 1) + { + Index j = gy; + Index i = Mesh.getDimensions().x() - 1; +// tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))]); + + + if(tmp == 0.0) + {} + else if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 || + cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 || + cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 ) + {} + else + flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX; + } + +// printf("##################################################################################################\n"); + if(gx == Mesh.getDimensions().x() - 1 && + gy == Mesh.getDimensions().y() - 1) + { + +// tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))]); + if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx-1,gy))]*tmp > 0.0 && + cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy-1))]*tmp > 0.0) + + flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))] = tmp*INT_MAX; + } + if(gx == Mesh.getDimensions().x() - 1 && + gy == 0) + { + +// tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))]); + if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx-1,gy))]*tmp > 0.0 && + cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy+1))]*tmp > 0.0) + + flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))] = tmp*INT_MAX; + } +// printf("$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$\n"); + if(gx == 0 && + gy == Mesh.getDimensions().y() - 1) + { + +// tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))]); + if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx+1,gy))]*tmp > 0.0 && + cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy-1))]*tmp > 0.0) + + flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))] = tmp*INT_MAX; + } + if(gx == 0 && + gy == 0) + { +// tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))]); + if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx+1,gy))]*tmp > 0.0 && + cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy+1))]*tmp > 0.0) + + flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))] = tmp*INT_MAX; + } + + __threadfence(); + + if(flag==1) + cudaDofVector[gid] = tmp*3; +} + + +template< typename MeshReal, + typename Device, + typename MeshIndex, + typename Real, + typename Index > +__device__ +Real tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: fabsMin( Real x, Real y) +{ + Real fx = abs(x); + + Real tmpMin = Min(fx,abs(y)); + + if(tmpMin == fx) + return x; + else + return y; + + +} + + + +__global__ void runCUDA(tnlFastSweeping< tnlGrid< 2,double, tnlHost, int >, double, int >* solver, int sweep, int k) +{ + + //int gx = threadIdx.x; + //int gy = threadIdx.y; + int id1,id2; + int nx = solver->Mesh.getDimensions().x()+ threadIdx.x; + int ny = solver->Mesh.getDimensions().y()+ threadIdx.y; + + int blockCount = solver->Mesh.getDimensions().x()/blockDim.x + 1; + + for(int gy = threadIdx.y; gy < ny;gy+=blockDim.y) + { + for(int gx = threadIdx.x; gx < nx;gx+=blockDim.x) + { +// if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy && gy > -1&& gx > -1) + { + id1 = threadIdx.x+threadIdx.y; + + for(int l = 0; l < 2*blockDim.x - 1; l++) + { + if(id1 == l) + { + if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy /*&& gy > -1&& gx > -1*/) + solver->updateValue(gx,gy); + } + __syncthreads(); + } + + } + //gx+=blockDim.x; + //__syncthreads(); + } + //gx = threadIdx.x; + //gy+=blockDim.y; + //__syncthreads(); + } + /*---------------------------------------------------------------------------------------------------------------------------*/ +// gx = blockDim.x*(blockCount-1) + threadIdx.x; +// gy = threadIdx.y; + for(int gy = threadIdx.y; gy < ny;gy+=blockDim.y) + { + for(int gx = blockDim.x*(blockCount-1) + threadIdx.x; gx >- 1;gx-=blockDim.x) + { +// if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy && gy > -1&& gx > -1) + { + id2 = (blockDim.x - threadIdx.x - 1) + threadIdx.y; + + for(int l = 0; l < 2*blockDim.x - 1; l++) + { + if(id2 == l) + { + if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy /*&& gy > -1&& gx > -1*/) + solver->updateValue(gx,gy); + } + __syncthreads(); + } + } + //gx-=blockDim.x; + //__syncthreads(); + } + //gx = blockDim.x*(blockCount-1) + threadIdx.x; + //gy+=blockDim.y; + //__syncthreads(); + } + /*---------------------------------------------------------------------------------------------------------------------------*/ +// gx = blockDim.x*(blockCount-1) + threadIdx.x; +// gy = blockDim.x*(blockCount-1) + threadIdx.y; + for(int gy = blockDim.x*(blockCount-1) +threadIdx.y; gy >- 1;gy-=blockDim.y) + { + for(int gx = blockDim.x*(blockCount-1) + threadIdx.x; gx >- 1;gx-=blockDim.x) + { +// if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy && gy > -1&& gx > -1) + { + id1 = threadIdx.x+threadIdx.y; + + for(int l = 2*blockDim.x - 2; l > -1; l--) + { + if(id1 == l) + { + if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy /*&& gy > -1&& gx > -1*/) + solver->updateValue(gx,gy); + } + __syncthreads(); + } + } + //gx-=blockDim.x; + //__syncthreads(); + } + //gx = blockDim.x*(blockCount-1) + threadIdx.x; + //gy-=blockDim.y; + //__syncthreads(); + } + /*---------------------------------------------------------------------------------------------------------------------------*/ + //gx = threadIdx.x; + //gy = blockDim.x*(blockCount-1) +threadIdx.y; + for(int gy = blockDim.x*(blockCount-1) +threadIdx.y; gy >- 1;gy-=blockDim.y) + { + for(int gx = threadIdx.x; gx < nx;gx+=blockDim.x) + { +// if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy && gy > -1&& gx > -1) + { + id2 = (blockDim.x - threadIdx.x - 1) + threadIdx.y; + + for(int l = 2*blockDim.x - 2; l > -1; l--) + { + if(id2 == l) + { + if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy /*&& gy > -1&& gx > -1*/) + solver->updateValue(gx,gy); + } + __syncthreads(); + } + } + //gx+=blockDim.x; + //__syncthreads(); + } + //gx = threadIdx.x; + //gy-=blockDim.y; + ///__syncthreads(); + } + /*---------------------------------------------------------------------------------------------------------------------------*/ + + + + + +} + + +__global__ void initCUDA(tnlFastSweeping< tnlGrid< 2,double, tnlHost, int >, double, int >* solver) +{ + int gx = threadIdx.x + blockDim.x*blockIdx.x; + int gy = blockDim.y*blockIdx.y + threadIdx.y; + + if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy) + { + solver->initGrid(); + } + + +} +#endif + + + + +#endif /* TNLFASTSWEEPING_IMPL_H_ */ diff --git a/examples/fast-sweeping/tnlFastSweeping2D_CUDA_v3_impl.h b/examples/fast-sweeping/tnlFastSweeping2D_CUDA_v3_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..44aa2fbc59111d134ae09433067532f9e9e02e63 --- /dev/null +++ b/examples/fast-sweeping/tnlFastSweeping2D_CUDA_v3_impl.h @@ -0,0 +1,920 @@ +/*************************************************************************** + tnlFastSweeping_impl.h - description + ------------------- + begin : Oct 15 , 2015 + copyright : (C) 2015 by Tomas Sobotik + ***************************************************************************/ + +/*************************************************************************** + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + ***************************************************************************/ +#ifndef TNLFASTSWEEPING2D_IMPL_H_ +#define TNLFASTSWEEPING2D_IMPL_H_ + +#include "tnlFastSweeping.h" + + + + + __device__ double atomicSet(double* address, double val) + { + unsigned long long int* address_as_ull = + (unsigned long long int*)address; + unsigned long long int old = *address_as_ull, assumed; + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed,__double_as_longlong(val )); + } while (assumed != old); + return __longlong_as_double(old); + } + + +template< typename MeshReal, + typename Device, + typename MeshIndex, + typename Real, + typename Index > +tnlString tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: getType() +{ + return tnlString( "tnlFastSweeping< " ) + + MeshType::getType() + ", " + + ::getType< Real >() + ", " + + ::getType< Index >() + " >"; +} + + + + +template< typename MeshReal, + typename Device, + typename MeshIndex, + typename Real, + typename Index > +bool tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: init( const tnlParameterContainer& parameters ) +{ + const tnlString& meshFile = parameters.getParameter< tnlString >( "mesh" ); + + if( ! Mesh.load( meshFile ) ) + { + cerr << "I am not able to load the mesh from the file " << meshFile << "." << endl; + return false; + } + + + const tnlString& initialCondition = parameters.getParameter <tnlString>("initial-condition"); + if( ! dofVector.load( initialCondition ) ) + { + cerr << "I am not able to load the initial condition from the file " << meshFile << "." << endl; + return false; + } + + h = Mesh.getHx(); + counter = 0; + + const tnlString& exact_input = parameters.getParameter< tnlString >( "exact-input" ); + + if(exact_input == "no") + exactInput=false; + else + exactInput=true; + + +#ifdef HAVE_CUDA + + cudaMalloc(&(cudaDofVector), this->dofVector.getSize()*sizeof(double)); + cudaMemcpy(cudaDofVector, this->dofVector.getData(), this->dofVector.getSize()*sizeof(double), cudaMemcpyHostToDevice); + + cudaMalloc(&(cudaDofVector2), this->dofVector.getSize()*sizeof(double)); + cudaMemcpy(cudaDofVector2, this->dofVector.getData(), this->dofVector.getSize()*sizeof(double), cudaMemcpyHostToDevice); + + + cudaMalloc(&(this->cudaSolver), sizeof(tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index >)); + cudaMemcpy(this->cudaSolver, this,sizeof(tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index >), cudaMemcpyHostToDevice); + +#endif + + int n = Mesh.getDimensions().x(); + dim3 threadsPerBlock(16, 16); + dim3 numBlocks(n/16 + 1 ,n/16 +1); + + initCUDA<<<numBlocks,threadsPerBlock>>>(this->cudaSolver); + cudaDeviceSynchronize(); + checkCudaDevice; + + return true; +} + + + + + +template< typename MeshReal, + typename Device, + typename MeshIndex, + typename Real, + typename Index > +bool tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: run() +{ +// +// for(Index i = 0; i < Mesh.getDimensions().x(); i++) +// { +// for(Index j = 0; j < Mesh.getDimensions().y(); j++) +// { +// updateValue(i,j); +// } +// } +// +///*---------------------------------------------------------------------------------------------------------------------------*/ +// +// for(Index i = Mesh.getDimensions().x() - 1; i > -1; i--) +// { +// for(Index j = 0; j < Mesh.getDimensions().y(); j++) +// { +// updateValue(i,j); +// } +// } +// +///*---------------------------------------------------------------------------------------------------------------------------*/ +// +// for(Index i = Mesh.getDimensions().x() - 1; i > -1; i--) +// { +// for(Index j = Mesh.getDimensions().y() - 1; j > -1; j--) +// { +// updateValue(i,j); +// } +// } +// +///*---------------------------------------------------------------------------------------------------------------------------*/ +// for(Index i = 0; i < Mesh.getDimensions().x(); i++) +// { +// for(Index j = Mesh.getDimensions().y() - 1; j > -1; j--) +// { +// updateValue(i,j); +// } +// } +// +///*---------------------------------------------------------------------------------------------------------------------------*/ +// +// +// dofVector.save("u-00001.tnl"); + + int n = Mesh.getDimensions().x(); + dim3 threadsPerBlock(16, 16); + dim3 numBlocks(n/16 +1 ,n/16 +1); + int m =n/16 +1; + + for(int i = 0; i < 2*m -1; i++) + { + runCUDA<15><<<numBlocks,threadsPerBlock>>>(this->cudaSolver,1,i); + //cudaDeviceSynchronize(); + } +// cudaDeviceSynchronize(); +// checkCudaDevice; +// for(int i = 0; i < 2*m -1; i++) +// { +// runCUDA<2><<<numBlocks,threadsPerBlock>>>(this->cudaSolver,2,i); +// cudaDeviceSynchronize(); +// } +// cudaDeviceSynchronize(); +// checkCudaDevice; +// for(int i = 0; i < 2*m -1; i++) +// { +// runCUDA<4><<<numBlocks,threadsPerBlock>>>(this->cudaSolver,4,i); +// cudaDeviceSynchronize(); +// } +// cudaDeviceSynchronize(); +// checkCudaDevice; +// for(int i = 0; i < 2*m -1; i++) +// { +// runCUDA<8><<<numBlocks,threadsPerBlock>>>(this->cudaSolver,8,i); +// cudaDeviceSynchronize(); +// } + + + + +// for(int i = 0; i < (2*m -1)/4 -1; i++) +// { +// runCUDA<15><<<numBlocks,threadsPerBlock>>>(this->cudaSolver,15,i);//all +// cudaDeviceSynchronize(); +// } +// for(int i = (2*m -1)/4 -1; i < (2*m -1)/2 -1; i++) +// { +// runCUDA<5><<<numBlocks,threadsPerBlock>>>(this->cudaSolver,5,i); //two +// cudaDeviceSynchronize(); +// runCUDA<10><<<numBlocks,threadsPerBlock>>>(this->cudaSolver,10,i); //two +// cudaDeviceSynchronize(); +// } +// for(int i = (2*m -1)/2 -1; i < (2*m -1)/2 +1; i++) +// { +// runCUDA<1><<<numBlocks,threadsPerBlock>>>(this->cudaSolver,1,i); //separate +// cudaDeviceSynchronize(); +// runCUDA<2><<<numBlocks,threadsPerBlock>>>(this->cudaSolver,2,i); //separate +// cudaDeviceSynchronize(); +// runCUDA<4><<<numBlocks,threadsPerBlock>>>(this->cudaSolver,4,i); //separate +// cudaDeviceSynchronize(); +// runCUDA<8><<<numBlocks,threadsPerBlock>>>(this->cudaSolver,8,i); //separate +// cudaDeviceSynchronize(); +// } +// for(int i = (2*m -1)/2 +1; i < (2*m -1/4)*3 +1; i++) +// { +// runCUDA<5><<<numBlocks,threadsPerBlock>>>(this->cudaSolver,5,i); //two +// cudaDeviceSynchronize(); +// runCUDA<10><<<numBlocks,threadsPerBlock>>>(this->cudaSolver,10,i); //two +// cudaDeviceSynchronize(); +// } +// for(int i = (2*m -1/4)*3 +1; i < 2*m -1; i++) +// { +// runCUDA<15><<<numBlocks,threadsPerBlock>>>(this->cudaSolver,15,i);//all +// cudaDeviceSynchronize(); +// } +cudaDeviceSynchronize(); + checkCudaDevice; + + cudaMemcpy(this->dofVector.getData(), cudaDofVector, this->dofVector.getSize()*sizeof(double), cudaMemcpyDeviceToHost); + cudaDeviceSynchronize(); + cudaFree(cudaDofVector); + cudaFree(cudaDofVector2); + cudaFree(cudaSolver); + dofVector.save("u-00001.tnl"); + cudaDeviceSynchronize(); + return true; +} + + + + +#ifdef HAVE_CUDA + + +template< typename MeshReal, + typename Device, + typename MeshIndex, + typename Real, + typename Index > +__device__ +void tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: updateValue( Index i, Index j) +{ + Index index = Mesh.getCellIndex(CoordinatesType(i,j)); + Real value = cudaDofVector[index]; + Real a,b, tmp; + + if( i == 0 ) + a = cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)]; + else if( i == Mesh.getDimensions().x() - 1 ) + a = cudaDofVector[Mesh.template getCellNextToCell<-1,0>(index)]; + else + { + a = fabsMin( cudaDofVector[Mesh.template getCellNextToCell<-1,0>(index)], + cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)] ); + } + + if( j == 0 ) + b = cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)]; + else if( j == Mesh.getDimensions().y() - 1 ) + b = cudaDofVector[Mesh.template getCellNextToCell<0,-1>(index)]; + else + { + b = fabsMin( cudaDofVector[Mesh.template getCellNextToCell<0,-1>(index)], + cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)] ); + } + + + if(abs(a-b) >= h) + tmp = fabsMin(a,b) + Sign(value)*h; + else + tmp = 0.5 * (a + b + Sign(value)*sqrt(2.0 * h * h - (a - b) * (a - b) ) ); + + atomicSet(&cudaDofVector[index],fabsMin(value, tmp)); + +} + + +template< typename MeshReal, + typename Device, + typename MeshIndex, + typename Real, + typename Index > +__device__ +bool tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: initGrid() +{ + int gx = threadIdx.x + blockDim.x*blockIdx.x; + int gy = blockDim.y*blockIdx.y + threadIdx.y; + int gid = Mesh.getCellIndex(CoordinatesType(gx,gy)); + + int total = blockDim.x*gridDim.x; + + + + Real tmp = 0.0; + int flag = 0; + counter = 0; + tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))]); + + + if(!exactInput) + { + cudaDofVector[gid]=cudaDofVector[gid]=0.5*h*Sign(cudaDofVector[gid]); + } + __threadfence(); +// printf("-----------------------------------------------------------------------------------\n"); + + __threadfence(); + + if(gx > 0 && gx < Mesh.getDimensions().x()-1) + { + if(gy > 0 && gy < Mesh.getDimensions().y()-1) + { + + Index j = gy; + Index i = gx; +// tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))]); + + if(tmp == 0.0) + {} + else if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 || + cudaDofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 || + cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 || + cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 ) + {} + else + flag=1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX; + } + } + +// printf("gx: %d, gy: %d, gid: %d \n", gx, gy,gid); +// printf("****************************************************************\n"); +// printf("gx: %d, gy: %d, gid: %d \n", gx, gy,gid); + if(gx > 0 && gx < Mesh.getDimensions().x()-1 && gy == 0) + { +// printf("gx: %d, gy: %d, gid: %d \n", gx, gy,gid); + Index j = 0; + Index i = gx; +// tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))]); + + + if(tmp == 0.0) + {} + else if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 || + cudaDofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 || + cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 ) + {} + else + flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX; + } + +// printf("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n"); + if(gx > 0 && gx < Mesh.getDimensions().x()-1 && gy == Mesh.getDimensions().y() - 1) + { + Index i = gx; + Index j = Mesh.getDimensions().y() - 1; +// tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))]); + + + if(tmp == 0.0) + {} + else if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 || + cudaDofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 || + cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 ) + {} + else + flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX; + } + +// printf("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"); + if(gy > 0 && gy < Mesh.getDimensions().y()-1 && gx == 0) + { + Index j = gy; + Index i = 0; +// tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))]); + + + if(tmp == 0.0) + {} + else if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 || + cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 || + cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 ) + {} + else + flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX; + } +// printf("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\n"); + if(gy > 0 && gy < Mesh.getDimensions().y()-1 && gx == Mesh.getDimensions().x() - 1) + { + Index j = gy; + Index i = Mesh.getDimensions().x() - 1; +// tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))]); + + + if(tmp == 0.0) + {} + else if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 || + cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 || + cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 ) + {} + else + flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX; + } + +// printf("##################################################################################################\n"); + if(gx == Mesh.getDimensions().x() - 1 && + gy == Mesh.getDimensions().y() - 1) + { + +// tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))]); + if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx-1,gy))]*tmp > 0.0 && + cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy-1))]*tmp > 0.0) + + flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))] = tmp*INT_MAX; + } + if(gx == Mesh.getDimensions().x() - 1 && + gy == 0) + { + +// tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))]); + if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx-1,gy))]*tmp > 0.0 && + cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy+1))]*tmp > 0.0) + + flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))] = tmp*INT_MAX; + } +// printf("$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$\n"); + if(gx == 0 && + gy == Mesh.getDimensions().y() - 1) + { + +// tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))]); + if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx+1,gy))]*tmp > 0.0 && + cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy-1))]*tmp > 0.0) + + flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))] = tmp*INT_MAX; + } + if(gx == 0 && + gy == 0) + { +// tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))]); + if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx+1,gy))]*tmp > 0.0 && + cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy+1))]*tmp > 0.0) + + flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))] = tmp*INT_MAX; + } + + __threadfence(); + + if(flag==1) + cudaDofVector[gid] = tmp*3; +} + + +template< typename MeshReal, + typename Device, + typename MeshIndex, + typename Real, + typename Index > +__device__ +Real tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: fabsMin( Real x, Real y) +{ +// Real fx = abs(x); +// +// Real tmpMin = Min(fx,abs(y)); + + if(abs(y) > abs(x)) + return x; + else + return y; + + +} + + +template<> +__global__ void runCUDA<1>(tnlFastSweeping< tnlGrid< 2,double, tnlHost, int >, double, int >* solver, int sweep, int k) +{ + + if(blockIdx.x+blockIdx.y == k) + { + int gx = threadIdx.x + blockDim.x*blockIdx.x; + int gy = threadIdx.y + blockDim.y*blockIdx.y; + + int id1 = threadIdx.x+threadIdx.y; + + for(int l = 0; l < 2*blockDim.x - 1; l++) + { + if(id1 == l) + { + if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy /*&& gy > -1&& gx > -1*/) + solver->updateValue(gx,gy); + } + __syncthreads(); + } + + } + /*---------------------------------------------------------------------------------------------------------------------------*/ +} + template<> + __global__ void runCUDA<2>(tnlFastSweeping< tnlGrid< 2,double, tnlHost, int >, double, int >* solver, int sweep, int k) + { + if((gridDim.x - blockIdx.x - 1)+blockIdx.y == k) + { + int gx = threadIdx.x + blockDim.x*blockIdx.x; + int gy = threadIdx.y + blockDim.y*blockIdx.y; + + int id2 = (blockDim.x - threadIdx.x - 1) + threadIdx.y; + + for(int l = 0; l < 2*blockDim.x - 1; l++) + { + if(id2 == l) + { + if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy /*&& gy > -1&& gx > -1*/) + solver->updateValue(gx,gy); + } + __syncthreads(); + } + + } + } /*---------------------------------------------------------------------------------------------------------------------------*/ + template<> + __global__ void runCUDA<4>(tnlFastSweeping< tnlGrid< 2,double, tnlHost, int >, double, int >* solver, int sweep, int k) + { + if(blockIdx.x+blockIdx.y == gridDim.x+gridDim.y-k-2) + { + int gx = threadIdx.x + blockDim.x*blockIdx.x; + int gy = threadIdx.y + blockDim.y*blockIdx.y; + + int id1 = threadIdx.x+threadIdx.y; + + for(int l = 2*blockDim.x - 2; l > -1; l--) + { + if(id1 == l) + { + if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy /*&& gy > -1&& gx > -1*/) + solver->updateValue(gx,gy); + return; + } + __syncthreads(); + } + + } + /*---------------------------------------------------------------------------------------------------------------------------*/ + + } + + template<> + __global__ void runCUDA<8>(tnlFastSweeping< tnlGrid< 2,double, tnlHost, int >, double, int >* solver, int sweep, int k) + { + if((gridDim.x - blockIdx.x - 1)+blockIdx.y == gridDim.x+gridDim.y-k-2) + { + int gx = threadIdx.x + blockDim.x*blockIdx.x; + int gy = threadIdx.y + blockDim.y*blockIdx.y; + + int id2 = (blockDim.x - threadIdx.x - 1) + threadIdx.y; + + for(int l = 2*blockDim.x - 2; l > -1; l--) + { + if(id2 == l) + { + if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy /*&& gy > -1&& gx > -1*/) + solver->updateValue(gx,gy); + return; + } + __syncthreads(); + } + + } + /*---------------------------------------------------------------------------------------------------------------------------*/ + + + + + +} + + + template<> + __global__ void runCUDA<5>(tnlFastSweeping< tnlGrid< 2,double, tnlHost, int >, double, int >* solver, int sweep, int k) + { + + if(blockIdx.x+blockIdx.y == k) + { + int gx = threadIdx.x + blockDim.x*blockIdx.x; + int gy = threadIdx.y + blockDim.y*blockIdx.y; + + int id1 = threadIdx.x+threadIdx.y; + + for(int l = 0; l < 2*blockDim.x - 1; l++) + { + if(id1 == l) + { + if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy /*&& gy > -1&& gx > -1*/) + solver->updateValue(gx,gy); + return; + } + __syncthreads(); + } + + } + else if(blockIdx.x+blockIdx.y == gridDim.x+gridDim.y-k-2) + { + int gx = threadIdx.x + blockDim.x*blockIdx.x; + int gy = threadIdx.y + blockDim.y*blockIdx.y; + + int id1 = threadIdx.x+threadIdx.y; + + for(int l = 2*blockDim.x - 2; l > -1; l--) + { + if(id1 == l) + { + if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy /*&& gy > -1&& gx > -1*/) + solver->updateValue(gx,gy); + return; + } + __syncthreads(); + } + + } + } + + + template<> + __global__ void runCUDA<10>(tnlFastSweeping< tnlGrid< 2,double, tnlHost, int >, double, int >* solver, int sweep, int k) + { + if((gridDim.x - blockIdx.x - 1)+blockIdx.y == k) + { + int gx = threadIdx.x + blockDim.x*blockIdx.x; + int gy = threadIdx.y + blockDim.y*blockIdx.y; + + int id2 = (blockDim.x - threadIdx.x - 1) + threadIdx.y; + + for(int l = 0; l < 2*blockDim.x - 1; l++) + { + if(id2 == l) + { + if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy /*&& gy > -1&& gx > -1*/) + solver->updateValue(gx,gy); + return; + } + __syncthreads(); + } + + } + + else if((gridDim.x - blockIdx.x - 1)+blockIdx.y == gridDim.x+gridDim.y-k-2) + { + int gx = threadIdx.x + blockDim.x*blockIdx.x; + int gy = threadIdx.y + blockDim.y*blockIdx.y; + + int id2 = (blockDim.x - threadIdx.x - 1) + threadIdx.y; + + for(int l = 2*blockDim.x - 2; l > -1; l--) + { + if(id2 == l) + { + if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy /*&& gy > -1&& gx > -1*/) + solver->updateValue(gx,gy); + return; + } + __syncthreads(); + } + + } + + } + + + + template<> + __global__ void runCUDA<15>(tnlFastSweeping< tnlGrid< 2,double, tnlHost, int >, double, int >* solver, int sweep, int k) + { + + if(blockIdx.x+blockIdx.y == k) + { + int gx = threadIdx.x + blockDim.x*blockIdx.x; + int gy = threadIdx.y + blockDim.y*blockIdx.y; + + int id1 = threadIdx.x+threadIdx.y; + + for(int l = 0; l < 2*blockDim.x - 1; l++) + { + if(id1 == l) + { + if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy /*&& gy > -1&& gx > -1*/) + solver->updateValue(gx,gy); + return; + } + __syncthreads(); + } + + } + /*---------------------------------------------------------------------------------------------------------------------------*/ + + if((gridDim.x - blockIdx.x - 1)+blockIdx.y == k) + { + int gx = threadIdx.x + blockDim.x*blockIdx.x; + int gy = threadIdx.y + blockDim.y*blockIdx.y; + + int id2 = (blockDim.x - threadIdx.x - 1) + threadIdx.y; + + for(int l = 0; l < 2*blockDim.x - 1; l++) + { + if(id2 == l) + { + if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy /*&& gy > -1&& gx > -1*/) + solver->updateValue(gx,gy); + return; + } + __syncthreads(); + } + + } + /*---------------------------------------------------------------------------------------------------------------------------*/ + + if(blockIdx.x+blockIdx.y == gridDim.x+gridDim.y-k-2) + { + int gx = threadIdx.x + blockDim.x*blockIdx.x; + int gy = threadIdx.y + blockDim.y*blockIdx.y; + + int id1 = threadIdx.x+threadIdx.y; + + for(int l = 2*blockDim.x - 2; l > -1; l--) + { + if(id1 == l) + { + if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy /*&& gy > -1&& gx > -1*/) + solver->updateValue(gx,gy); + return; + } + __syncthreads(); + } + + } + /*---------------------------------------------------------------------------------------------------------------------------*/ + + if((gridDim.x - blockIdx.x - 1)+blockIdx.y == gridDim.x+gridDim.y-k-2) + { + int gx = threadIdx.x + blockDim.x*blockIdx.x; + int gy = threadIdx.y + blockDim.y*blockIdx.y; + + int id2 = (blockDim.x - threadIdx.x - 1) + threadIdx.y; + + for(int l = 2*blockDim.x - 2; l > -1; l--) + { + if(id2 == l) + { + if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy /*&& gy > -1&& gx > -1*/) + solver->updateValue(gx,gy); + return; + } + __syncthreads(); + } + + } + /*---------------------------------------------------------------------------------------------------------------------------*/ + + + + + + } + + + + + + + + + + + + + + + + + + + + + + + + + + + +__global__ void initCUDA(tnlFastSweeping< tnlGrid< 2,double, tnlHost, int >, double, int >* solver) +{ + int gx = threadIdx.x + blockDim.x*blockIdx.x; + int gy = blockDim.y*blockIdx.y + threadIdx.y; + + if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy) + { + solver->initGrid(); + } + + +} +#endif + + + + + + +//__global__ void runCUDA(tnlFastSweeping< tnlGrid< 2,double, tnlHost, int >, double, int >* solver, int sweep, int k) +//{ +// +// if(sweep==1 && blockIdx.x+blockIdx.y == k) +// { +// int gx = threadIdx.x + blockDim.x*blockIdx.x; +// int gy = threadIdx.y + blockDim.y*blockIdx.y; +// +// int id1 = threadIdx.x+threadIdx.y; +// +// for(int l = 0; l < 2*blockDim.x - 1; l++) +// { +// if(id1 == l) +// { +// if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy /*&& gy > -1&& gx > -1*/) +// solver->updateValue(gx,gy); +// } +// __syncthreads(); +// } +// +// } +// /*---------------------------------------------------------------------------------------------------------------------------*/ +// +// else if(sweep==2 && (gridDim.x - blockIdx.x - 1)+blockIdx.y == k) +// { +// int gx = threadIdx.x + blockDim.x*blockIdx.x; +// int gy = threadIdx.y + blockDim.y*blockIdx.y; +// +// int id2 = (blockDim.x - threadIdx.x - 1) + threadIdx.y; +// +// for(int l = 0; l < 2*blockDim.x - 1; l++) +// { +// if(id2 == l) +// { +// if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy /*&& gy > -1&& gx > -1*/) +// solver->updateValue(gx,gy); +// } +// __syncthreads(); +// } +// +// } +// /*---------------------------------------------------------------------------------------------------------------------------*/ +// +// else if(sweep==4 && blockIdx.x+blockIdx.y == gridDim.x+gridDim.y-k-2) +// { +// int gx = threadIdx.x + blockDim.x*blockIdx.x; +// int gy = threadIdx.y + blockDim.y*blockIdx.y; +// +// int id1 = threadIdx.x+threadIdx.y; +// +// for(int l = 2*blockDim.x - 2; l > -1; l--) +// { +// if(id1 == l) +// { +// if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy /*&& gy > -1&& gx > -1*/) +// solver->updateValue(gx,gy); +// return; +// } +// __syncthreads(); +// } +// +// } +// /*---------------------------------------------------------------------------------------------------------------------------*/ +// +// else if(sweep==8 && (gridDim.x - blockIdx.x - 1)+blockIdx.y == gridDim.x+gridDim.y-k-2) +// { +// int gx = threadIdx.x + blockDim.x*blockIdx.x; +// int gy = threadIdx.y + blockDim.y*blockIdx.y; +// +// int id2 = (blockDim.x - threadIdx.x - 1) + threadIdx.y; +// +// for(int l = 2*blockDim.x - 2; l > -1; l--) +// { +// if(id2 == l) +// { +// if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy /*&& gy > -1&& gx > -1*/) +// solver->updateValue(gx,gy); +// return; +// } +// __syncthreads(); +// } +// +// } +// /*---------------------------------------------------------------------------------------------------------------------------*/ +// +// +// +// +// +//} + + +#endif /* TNLFASTSWEEPING_IMPL_H_ */ diff --git a/examples/fast-sweeping/tnlFastSweeping2D_impl.h b/examples/fast-sweeping/tnlFastSweeping2D_impl.h index 0abf197618f2e087706a7d327ab32e1d31a23ade..a2197864bbf3ee31cf8dfe9bb9d08c1210342c55 100644 --- a/examples/fast-sweeping/tnlFastSweeping2D_impl.h +++ b/examples/fast-sweeping/tnlFastSweeping2D_impl.h @@ -59,6 +59,13 @@ bool tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: h = Mesh.getHx(); + const tnlString& exact_input = parameters.getParameter< tnlString >( "exact-input" ); + + if(exact_input == "no") + exactInput=false; + else + exactInput=true; + return initGrid(); } @@ -72,6 +79,15 @@ bool tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: { Real tmp = 0.0; + Real ax=0.5/sqrt(2.0); + + if(!exactInput) + { + for(Index i = 0; i < Mesh.getDimensions().x()*Mesh.getDimensions().y(); i++) + dofVector[i]=0.5*h*Sign(dofVector[i]); + } + + for(Index i = 1; i < Mesh.getDimensions().x()-1; i++) { for(Index j = 1; j < Mesh.getDimensions().y()-1; j++) diff --git a/examples/fast-sweeping/tnlFastSweeping2D_openMP_impl.h b/examples/fast-sweeping/tnlFastSweeping2D_openMP_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..1bf8504439dcf62cea0ff53f3f622dc578e6ae39 --- /dev/null +++ b/examples/fast-sweeping/tnlFastSweeping2D_openMP_impl.h @@ -0,0 +1,399 @@ +/*************************************************************************** + tnlFastSweeping_impl.h - description + ------------------- + begin : Oct 15 , 2015 + copyright : (C) 2015 by Tomas Sobotik + ***************************************************************************/ + +/*************************************************************************** + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + ***************************************************************************/ +#ifndef TNLFASTSWEEPING2D_IMPL_H_ +#define TNLFASTSWEEPING2D_IMPL_H_ + +#include "tnlFastSweeping.h" + +template< typename MeshReal, + typename Device, + typename MeshIndex, + typename Real, + typename Index > +tnlString tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: getType() +{ + return tnlString( "tnlFastSweeping< " ) + + MeshType::getType() + ", " + + ::getType< Real >() + ", " + + ::getType< Index >() + " >"; +} + + + + +template< typename MeshReal, + typename Device, + typename MeshIndex, + typename Real, + typename Index > +bool tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: init( const tnlParameterContainer& parameters ) +{ + const tnlString& meshFile = parameters.getParameter< tnlString >( "mesh" ); + + if( ! Mesh.load( meshFile ) ) + { + cerr << "I am not able to load the mesh from the file " << meshFile << "." << endl; + return false; + } + + + const tnlString& initialCondition = parameters.getParameter <tnlString>("initial-condition"); + if( ! dofVector.load( initialCondition ) ) + { + cerr << "I am not able to load the initial condition from the file " << meshFile << "." << endl; + return false; + } + + h = Mesh.getHx(); + + + const tnlString& exact_input = parameters.getParameter< tnlString >( "exact-input" ); + + if(exact_input == "no") + exactInput=false; + else + exactInput=true; + +#ifdef HAVE_OPENMP +// gridLock = (omp_lock_t*) malloc(sizeof(omp_lock_t)*Mesh.getDimensions().x()*Mesh.getDimensions().y()); +// +// for(int i = 0; i < Mesh.getDimensions().x()*Mesh.getDimensions().y(); i++) +// omp_init_lock(&gridLock[i]); +#endif + + return initGrid(); +} + + +template< typename MeshReal, + typename Device, + typename MeshIndex, + typename Real, + typename Index > +bool tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: initGrid() +{ + + Real tmp = 0.0; + + if(!exactInput) + { + for(Index i = 0; i < Mesh.getDimensions().x()*Mesh.getDimensions().y(); i++) + dofVector[i]=0.5*h*Sign(dofVector[i]); + } + + + for(Index i = 1; i < Mesh.getDimensions().x()-1; i++) + { + for(Index j = 1; j < Mesh.getDimensions().y()-1; j++) + { + tmp = Sign(dofVector[Mesh.getCellIndex(CoordinatesType(i,j))]); + + + if(tmp == 0.0) + {} + else if(dofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 || + dofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 || + dofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 || + dofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 ) + {} + else + dofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX; + } + } + + + + for(int i = 1; i < Mesh.getDimensions().x()-1; i++) + { + Index j = 0; + tmp = Sign(dofVector[Mesh.getCellIndex(CoordinatesType(i,j))]); + + + if(tmp == 0.0) + {} + else if(dofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 || + dofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 || + dofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 ) + {} + else + dofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX; + } + + for(int i = 1; i < Mesh.getDimensions().x()-1; i++) + { + Index j = Mesh.getDimensions().y() - 1; + tmp = Sign(dofVector[Mesh.getCellIndex(CoordinatesType(i,j))]); + + + if(tmp == 0.0) + {} + else if(dofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 || + dofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 || + dofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 ) + {} + else + dofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX; + } + + for(int j = 1; j < Mesh.getDimensions().y()-1; j++) + { + Index i = 0; + tmp = Sign(dofVector[Mesh.getCellIndex(CoordinatesType(i,j))]); + + + if(tmp == 0.0) + {} + else if(dofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 || + dofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 || + dofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 ) + {} + else + dofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX; + } + + for(int j = 1; j < Mesh.getDimensions().y()-1; j++) + { + Index i = Mesh.getDimensions().x() - 1; + tmp = Sign(dofVector[Mesh.getCellIndex(CoordinatesType(i,j))]); + + + if(tmp == 0.0) + {} + else if(dofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 || + dofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 || + dofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 ) + {} + else + dofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX; + } + + + Index i = Mesh.getDimensions().x() - 1; + Index j = Mesh.getDimensions().y() - 1; + + tmp = Sign(dofVector[Mesh.getCellIndex(CoordinatesType(i,j))]); + if(dofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp > 0.0 && + dofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp > 0.0) + + dofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX; + + + + j = 0; + tmp = Sign(dofVector[Mesh.getCellIndex(CoordinatesType(i,j))]); + if(dofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp > 0.0 && + dofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp > 0.0) + + dofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX; + + + + i = 0; + j = Mesh.getDimensions().y() -1; + tmp = Sign(dofVector[Mesh.getCellIndex(CoordinatesType(i,j))]); + if(dofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp > 0.0 && + dofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp > 0.0) + + dofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX; + + + + j = 0; + tmp = Sign(dofVector[Mesh.getCellIndex(CoordinatesType(i,j))]); + if(dofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp > 0.0 && + dofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp > 0.0) + + dofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX; + + + dofVector.save("u-00000.tnl"); + + return true; +} + + + +template< typename MeshReal, + typename Device, + typename MeshIndex, + typename Real, + typename Index > +bool tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: run() +{ + + DofVectorType d2,d3,d4; + d2.setLike(dofVector); + d2=dofVector; + d3.setLike(dofVector); + d3=dofVector; + d4.setLike(dofVector); + d4=dofVector; + + +#ifdef HAVE_OPENMP +#pragma omp parallel sections num_threads(4) + { + { +#endif + + for(Index i = 0; i < Mesh.getDimensions().x(); i++) + { + for(Index j = 0; j < Mesh.getDimensions().y(); j++) + { + updateValue(i,j,&dofVector); + } + } + +/*---------------------------------------------------------------------------------------------------------------------------*/ +#ifdef HAVE_OPENMP + } +#pragma omp section + { +#endif + for(Index i = Mesh.getDimensions().x() - 1; i > -1; i--) + { + for(Index j = 0; j < Mesh.getDimensions().y(); j++) + { + updateValue(i,j,&d2); + } + } + +/*---------------------------------------------------------------------------------------------------------------------------*/ +#ifdef HAVE_OPENMP + } +#pragma omp section + { +#endif + for(Index i = Mesh.getDimensions().x() - 1; i > -1; i--) + { + for(Index j = Mesh.getDimensions().y() - 1; j > -1; j--) + { + updateValue(i,j, &d3); + } + } + +/*---------------------------------------------------------------------------------------------------------------------------*/ +#ifdef HAVE_OPENMP + } +#pragma omp section + { +#endif + for(Index i = 0; i < Mesh.getDimensions().x(); i++) + { + for(Index j = Mesh.getDimensions().y() - 1; j > -1; j--) + { + updateValue(i,j, &d4); + } + } + +/*---------------------------------------------------------------------------------------------------------------------------*/ +#ifdef HAVE_OPENMP + } + } +#endif + + +#ifdef HAVE_OPENMP +#pragma omp parallel for num_threads(4) schedule(dynamic) +#endif + for(Index i = 0; i < Mesh.getDimensions().x(); i++) + { + for(Index j = Mesh.getDimensions().y() - 1; j > -1; j--) + { + int index = Mesh.getCellIndex(CoordinatesType(i,j)); + dofVector[index] = fabsMin(dofVector[index], d2[index]); + dofVector[index] = fabsMin(dofVector[index], d3[index]); + dofVector[index] = fabsMin(dofVector[index], d4[index]); + } + } + + dofVector.save("u-00001.tnl"); + + return true; +} + + +template< typename MeshReal, + typename Device, + typename MeshIndex, + typename Real, + typename Index > +void tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: updateValue( Index i, Index j, DofVectorType* grid) +{ + Index index = Mesh.getCellIndex(CoordinatesType(i,j)); + Real value = (*grid)[index]; + Real a,b, tmp; + + if( i == 0 ) + a = (*grid)[Mesh.template getCellNextToCell<1,0>(index)]; + else if( i == Mesh.getDimensions().x() - 1 ) + a = (*grid)[Mesh.template getCellNextToCell<-1,0>(index)]; + else + { + a = fabsMin( (*grid)[Mesh.template getCellNextToCell<-1,0>(index)], + (*grid)[Mesh.template getCellNextToCell<1,0>(index)] ); + } + + if( j == 0 ) + b = (*grid)[Mesh.template getCellNextToCell<0,1>(index)]; + else if( j == Mesh.getDimensions().y() - 1 ) + b = (*grid)[Mesh.template getCellNextToCell<0,-1>(index)]; + else + { + b = fabsMin( (*grid)[Mesh.template getCellNextToCell<0,-1>(index)], + (*grid)[Mesh.template getCellNextToCell<0,1>(index)] ); + } + + + if(fabs(a-b) >= h) + tmp = fabsMin(a,b) + Sign(value)*h; + else + tmp = 0.5 * (a + b + Sign(value)*sqrt(2.0 * h * h - (a - b) * (a - b) ) ); + +#ifdef HAVE_OPENMP +// omp_set_lock(&gridLock[index]); +#endif + (*grid)[index] = fabsMin(value, tmp); +#ifdef HAVE_OPENMP +// omp_unset_lock(&gridLock[index]); +#endif +} + + +template< typename MeshReal, + typename Device, + typename MeshIndex, + typename Real, + typename Index > +Real tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: fabsMin( Real x, Real y) +{ + Real fx = fabs(x); + Real fy = fabs(y); + + Real tmpMin = Min(fx,fy); + + if(tmpMin == fx) + return x; + else + return y; + + +} + + + + +#endif /* TNLFASTSWEEPING_IMPL_H_ */ diff --git a/examples/fast-sweeping/tnlFastSweeping_CUDA.h b/examples/fast-sweeping/tnlFastSweeping_CUDA.h new file mode 100644 index 0000000000000000000000000000000000000000..e5023b7c3223ca2bdcbf34395deca241bd935c54 --- /dev/null +++ b/examples/fast-sweeping/tnlFastSweeping_CUDA.h @@ -0,0 +1,104 @@ +/*************************************************************************** + tnlFastSweeping.h - description + ------------------- + begin : Oct 15 , 2015 + copyright : (C) 2015 by Tomas Sobotik + ***************************************************************************/ + +/*************************************************************************** + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + ***************************************************************************/ +#ifndef TNLFASTSWEEPING_H_ +#define TNLFASTSWEEPING_H_ + +#include <config/tnlParameterContainer.h> +#include <core/vectors/tnlVector.h> +#include <core/vectors/tnlStaticVector.h> +#include <core/tnlHost.h> +#include <mesh/tnlGrid.h> +#include <limits.h> +#include <core/tnlDevice.h> +#include <ctime> + + + + + +template< typename Mesh, + typename Real, + typename Index > +class tnlFastSweeping +{}; + + + + +template< typename MeshReal, + typename Device, + typename MeshIndex, + typename Real, + typename Index > +class tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > +{ + +public: + typedef Real RealType; + typedef Device DeviceType; + typedef Index IndexType; + typedef tnlGrid< 2, Real, Device, Index > MeshType; + typedef tnlVector< RealType, DeviceType, IndexType> DofVectorType; + typedef typename MeshType::CoordinatesType CoordinatesType; + + + + __host__ static tnlString getType(); + __host__ bool init( const tnlParameterContainer& parameters ); + __host__ bool run(); + +#ifdef HAVE_CUDA + __device__ bool initGrid(); + __device__ void updateValue(const Index i, const Index j); + __device__ Real fabsMin(const Real x, const Real y); + + tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index >* cudaSolver; + double* cudaDofVector; + double* cudaDofVector2; + int counter; +#endif + + MeshType Mesh; + +protected: + + + + bool exactInput; + + DofVectorType dofVector; + + RealType h; + + +}; + + + +#ifdef HAVE_CUDA +template<int sweep_t> +__global__ void runCUDA(tnlFastSweeping< tnlGrid< 2,double, tnlHost, int >, double, int >* solver, int sweep, int i); +//__global__ void runCUDA(tnlFastSweeping< tnlGrid< 2,double, tnlHost, int >, double, int >* solver, int sweep, int i); + +__global__ void initCUDA(tnlFastSweeping< tnlGrid< 2,double, tnlHost, int >, double, int >* solver); +#endif + +/*various implementtions.... choose one*/ +//#include "tnlFastSweeping2D_CUDA_impl.h" +//#include "tnlFastSweeping2D_CUDA_v2_impl.h" +#include "tnlFastSweeping2D_CUDA_v3_impl.h" + +#endif /* TNLFASTSWEEPING_H_ */