From 69063c92dbf46bf92700b658bac7259305f1941f Mon Sep 17 00:00:00 2001
From: Tomas Sobotik <sobotto4@fjfi.cvut.cz>
Date: Tue, 3 Nov 2015 08:54:51 +0100
Subject: [PATCH] Merging done

---
 .../tnlParallelEikonalSolver_impl.h | 1485 +++++++++++++++++
 1 file changed, 1485 insertions(+)
 create mode 100644 examples/hamilton-jacobi-parallel/tnlParallelEikonalSolver_impl.h

diff --git a/examples/hamilton-jacobi-parallel/tnlParallelEikonalSolver_impl.h b/examples/hamilton-jacobi-parallel/tnlParallelEikonalSolver_impl.h
new file mode 100644
index 0000000000..320d85eacd
--- /dev/null
+++ b/examples/hamilton-jacobi-parallel/tnlParallelEikonalSolver_impl.h
@@ -0,0 +1,1485 @@
+/***************************************************************************
+                          tnlParallelEikonalSolver_impl.h  -  description
+                             -------------------
+    begin                : Nov 28 , 2014
+    copyright            : (C) 2014 by Tomas Sobotik
+ ***************************************************************************/
+
+/***************************************************************************
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify *
+ *   it under the terms of the GNU General Public License as published by *
+ *   the Free Software Foundation; either version 2 of the License, or    *
+ *   (at your option) any later version.                                  *
+ *                                                                         *
+ ***************************************************************************/
+
+#ifndef TNLPARALLELEIKONALSOLVER_IMPL_H_
+#define TNLPARALLELEIKONALSOLVER_IMPL_H_
+
+//north - 1, east - 2, west - 4, south - 8
+
+#define FROM_NORTH 1
+#define FROM_EAST 2
+#define FROM_WEST 4
+#define FROM_SOUTH 8
+
+#include "tnlParallelEikonalSolver.h"
+#include <core/mfilename.h>
+
+template< typename SchemeHost, typename SchemeDevice, typename Device>
+tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::tnlParallelEikonalSolver()
+{
+	cout << "a" << endl;
+	this->device = tnlCudaDevice;  /////////////// tnlCudaDevice --- computation on the GPU, tnlHostDevice --- computation on the CPU
+
+#ifdef HAVE_CUDA
+	if(this->device == tnlCudaDevice)
+	{
+		run_host = 1;
+	}
+#endif
+
+	cout << "b" << endl;
+}
+
+template< typename SchemeHost, typename SchemeDevice, typename Device>
+void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::test()
+{
+}
+
+template< typename SchemeHost, typename SchemeDevice, typename Device>
+bool tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::init( const tnlParameterContainer& parameters )
+{
+	cout << "Initializing solver..."
<< endl; + const tnlString& meshLocation = parameters.getParameter <tnlString>("mesh"); + this->mesh.load( meshLocation ); + + this->n = parameters.getParameter <int>("subgrid-size"); + cout << "Setting N to " << this->n << endl; + + this->subMesh.setDimensions( this->n, this->n ); + this->subMesh.setDomain( tnlStaticVector<2,double>(0.0, 0.0), + tnlStaticVector<2,double>(this->mesh.getHx()*(double)(this->n), this->mesh.getHy()*(double)(this->n)) ); + + this->subMesh.save("submesh.tnl"); + + const tnlString& initialCondition = parameters.getParameter <tnlString>("initial-condition"); + this->u0.load( initialCondition ); + + this->delta = parameters.getParameter <double>("delta"); + this->delta *= this->mesh.getHx()*this->mesh.getHy(); + + cout << "Setting delta to " << this->delta << endl; + + this->tau0 = parameters.getParameter <double>("initial-tau"); + cout << "Setting initial tau to " << this->tau0 << endl; + this->stopTime = parameters.getParameter <double>("stop-time"); + + this->cflCondition = parameters.getParameter <double>("cfl-condition"); + this -> cflCondition *= sqrt(this->mesh.getHx()*this->mesh.getHy()); + cout << "Setting CFL to " << this->cflCondition << endl; + + stretchGrid(); + this->stopTime /= (double)(this->gridCols); + this->stopTime *= (1.0+1.0/((double)(this->n) - 2.0)); + cout << "Setting stopping time to " << this->stopTime << endl; + //this->stopTime = 1.5*((double)(this->n))*parameters.getParameter <double>("stop-time")*this->mesh.getHx(); + //cout << "Setting stopping time to " << this->stopTime << endl; + + cout << "Initializating scheme..." << endl; + if(!this->schemeHost.init(parameters)) + { + cerr << "SchemeHost failed to initialize." << endl; + return false; + } + cout << "Scheme initialized." << endl; + + test(); + + VectorType* tmp = new VectorType[subgridValues.getSize()]; + bool containsCurve = false; + +#ifdef HAVE_CUDA + + if(this->device == tnlCudaDevice) + { + cudaMalloc(&(this->cudaSolver), sizeof(tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int >)); + cudaMemcpy(this->cudaSolver, this,sizeof(tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int >), cudaMemcpyHostToDevice); + double** tmpdev = NULL; + cudaMalloc(&tmpdev, sizeof(double*)); + cudaMalloc(&(this->tmpw), this->work_u.getSize()*sizeof(double)); + cudaMalloc(&(this->runcuda), sizeof(int)); + cudaDeviceSynchronize(); + checkCudaDevice; + int* tmpUC; + cudaMalloc(&(tmpUC), this->work_u.getSize()*sizeof(int)); + cudaMemcpy(tmpUC, this->unusedCell.getData(), this->unusedCell.getSize()*sizeof(int), cudaMemcpyHostToDevice); + + initCUDA<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<1,1>>>(this->cudaSolver, (this->tmpw), (this->runcuda),tmpUC); + cudaDeviceSynchronize(); + checkCudaDevice; + double* tmpu = NULL; + + cudaMemcpy(&tmpu, tmpdev,sizeof(double*), cudaMemcpyDeviceToHost); + cudaMemcpy((this->tmpw), this->work_u.getData(), this->work_u.getSize()*sizeof(double), cudaMemcpyHostToDevice); + cudaDeviceSynchronize(); + checkCudaDevice; + } +#endif + + if(this->device == tnlHostDevice) + { + for(int i = 0; i < this->subgridValues.getSize(); i++) + { + + if(! tmp[i].setSize(this->n * this->n)) + cout << "Could not allocate tmp["<< i <<"] array." 
<< endl;
+			tmp[i] = getSubgrid(i);
+			containsCurve = false;
+
+			for(int j = 0; j < tmp[i].getSize(); j++)
+			{
+				if(tmp[i][0]*tmp[i][j] <= 0.0)
+				{
+					containsCurve = true;
+					j = tmp[i].getSize();
+				}
+			}
+			if(containsCurve)
+			{
+				insertSubgrid(runSubgrid(0, tmp[i], i), i);
+				setSubgridValue(i, 4);
+			}
+			containsCurve = false;
+		}
+	}
+#ifdef HAVE_CUDA
+	else if(this->device == tnlCudaDevice)
+	{
+		cudaDeviceSynchronize();
+		checkCudaDevice;
+		dim3 threadsPerBlock(this->n, this->n);
+		dim3 numBlocks(this->gridCols, this->gridRows);
+		cudaDeviceSynchronize();
+		checkCudaDevice;
+		initRunCUDA<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<numBlocks,threadsPerBlock,3*this->n*this->n*sizeof(double)>>>(this->cudaSolver);
+		cudaDeviceSynchronize();
+	}
+#endif
+
+	delete [] tmp;   // free the temporary subgrid buffers allocated above
+
+	this->currentStep = 1;
+	if(this->device == tnlHostDevice)
+		synchronize();
+#ifdef HAVE_CUDA
+	else if(this->device == tnlCudaDevice)
+	{
+		dim3 threadsPerBlock(this->n, this->n);
+		dim3 numBlocks(this->gridCols, this->gridRows);
+
+		checkCudaDevice;
+
+		synchronizeCUDA<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<numBlocks,threadsPerBlock>>>(this->cudaSolver);
+		cudaDeviceSynchronize();
+		checkCudaDevice;
+		synchronize2CUDA<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<numBlocks,1>>>(this->cudaSolver);
+		cudaDeviceSynchronize();
+		checkCudaDevice;
+	}
+#endif
+	cout << "Solver initialized." << endl;
+
+	return true;
+}
+
+template< typename SchemeHost, typename SchemeDevice, typename Device>
+void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::run()
+{
+	if(this->device == tnlHostDevice)
+	{
+		bool end = false;
+		while ((this->boundaryConditions.max() > 0 ) || !end)
+		{
+			if(this->boundaryConditions.max() == 0 )
+				end = true;
+			else
+				end = false;
+#ifdef HAVE_OPENMP
+#pragma omp parallel for num_threads(4) schedule(dynamic)
+#endif
+			for(int i = 0; i < this->subgridValues.getSize(); i++)
+			{
+				if(getSubgridValue(i) != INT_MAX)
+				{
+					// a freshly activated subgrid is updated from every side it has received new data from
+					if(getSubgridValue(i) == currentStep+4)
+					{
+						if(getBoundaryCondition(i) & FROM_NORTH)
+						{
+							insertSubgrid( runSubgrid(FROM_NORTH, getSubgrid(i),i), i);
+							this->calculationsCount[i]++;
+						}
+						if(getBoundaryCondition(i) & FROM_EAST)
+						{
+							insertSubgrid( runSubgrid(FROM_EAST, getSubgrid(i),i), i);
+							this->calculationsCount[i]++;
+						}
+						if(getBoundaryCondition(i) & FROM_WEST)
+						{
+							insertSubgrid( runSubgrid(FROM_WEST, getSubgrid(i),i), i);
+							this->calculationsCount[i]++;
+						}
+						if(getBoundaryCondition(i) & FROM_SOUTH)
+						{
+							insertSubgrid( runSubgrid(FROM_SOUTH, getSubgrid(i),i), i);
+							this->calculationsCount[i]++;
+						}
+					}
+
+					// corner updates combine the two adjacent directions (e.g. FROM_NORTH + FROM_EAST)
+					if( (getBoundaryCondition(i) & FROM_EAST) || (getBoundaryCondition(i) & FROM_NORTH) )
+					{
+						insertSubgrid( runSubgrid(FROM_NORTH + FROM_EAST, getSubgrid(i),i), i);
+					}
+					if( (getBoundaryCondition(i) & FROM_WEST) || (getBoundaryCondition(i) & FROM_NORTH) )
+					{
+						insertSubgrid( runSubgrid(FROM_NORTH + FROM_WEST, getSubgrid(i),i), i);
+					}
+					if( (getBoundaryCondition(i) & FROM_EAST) || (getBoundaryCondition(i) & FROM_SOUTH) )
+					{
+						insertSubgrid( runSubgrid(FROM_SOUTH + FROM_EAST, getSubgrid(i),i), i);
+					}
+					if( (getBoundaryCondition(i) & FROM_WEST) || (getBoundaryCondition(i) & FROM_SOUTH) )
+					{
+						insertSubgrid( runSubgrid(FROM_SOUTH + FROM_WEST, getSubgrid(i),i), i);
+					}
+
+					setBoundaryCondition(i, 0);
+					setSubgridValue(i, getSubgridValue(i)-1);
+				}
+			}
+			synchronize();
+		}
+	}
+#ifdef HAVE_CUDA
+	else if(this->device == tnlCudaDevice)
+	{
+		bool end_cuda = false;
+		dim3 threadsPerBlock(this->n, this->n);
+		dim3 numBlocks(this->gridCols, this->gridRows);
+		cudaDeviceSynchronize();
+		checkCudaDevice;
+		bool* tmpb;
+		cudaMemcpy(&(this->run_host), this->runcuda, sizeof(int), cudaMemcpyDeviceToHost);
+		cudaDeviceSynchronize();
+		checkCudaDevice;
+		int i = 1;
+		time_diff = 0.0;
+		while (run_host || !end_cuda)
+		{
+			cout << "Computing at step "<< i++ << endl;
+			if(run_host != 0 )
+				end_cuda = true;
+			else
+				end_cuda = false;
+			cudaDeviceSynchronize();
+			checkCudaDevice;
+			start = std::clock();
+			runCUDA<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<numBlocks,threadsPerBlock,3*this->n*this->n*sizeof(double)>>>(this->cudaSolver);
+			cudaDeviceSynchronize();
+			time_diff += (std::clock() - start) / (double)(CLOCKS_PER_SEC);
+
+			//start = std::clock();
+			synchronizeCUDA<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<numBlocks,threadsPerBlock>>>(this->cudaSolver);
+			cudaDeviceSynchronize();
+			checkCudaDevice;
+			synchronize2CUDA<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<numBlocks,1>>>(this->cudaSolver);
+			cudaDeviceSynchronize();
+			checkCudaDevice;
+			//time_diff += (std::clock() - start) / (double)(CLOCKS_PER_SEC);
+
+			cudaMemcpy(&run_host, (this->runcuda), sizeof(int), cudaMemcpyDeviceToHost);
+		}
+		cout << "Solving time was: " << time_diff << endl;
+		cudaMemcpy(this->work_u.getData(), (this->tmpw), this->work_u.getSize()*sizeof(double), cudaMemcpyDeviceToHost);
+
+		cudaDeviceSynchronize();
+	}
+#endif
+	contractGrid();
+	this->u0.save("u-00001.tnl");
+	cout << "Maximum number of calculations on one subgrid was " << this->calculationsCount.absMax() << endl;
+	cout << "Average number of calculations on one subgrid was " << ( (double) this->calculationsCount.sum() / (double) this->calculationsCount.getSize() ) << endl;
+	cout << "Solver finished" << endl;
+
+#ifdef HAVE_CUDA
+	if(this->device == tnlCudaDevice)
+	{
+		cudaFree(this->runcuda);
+		cudaFree(this->tmpw);
+		cudaFree(this->cudaSolver);
+	}
+#endif
+}
+
+//north - 1, east - 2, west - 4, south - 8
+template< typename SchemeHost, typename SchemeDevice, typename Device>
+void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::synchronize()
+{
+	cout << "Synchronizing..."
<< endl; + int tmp1, tmp2; + int grid1, grid2; + + if(this->currentStep & 1) + { + for(int j = 0; j < this->gridRows - 1; j++) + { + for (int i = 0; i < this->gridCols*this->n; i++) + { + tmp1 = this->gridCols*this->n*((this->n-1)+j*this->n) + i; + tmp2 = this->gridCols*this->n*((this->n)+j*this->n) + i; + grid1 = getSubgridValue(getOwner(tmp1)); + grid2 = getSubgridValue(getOwner(tmp2)); + if(getOwner(tmp1)==getOwner(tmp2)) + cout << "i, j" << i << "," << j << endl; + if ((fabs(this->work_u[tmp1]) < fabs(this->work_u[tmp2]) - this->delta || grid2 == INT_MAX || grid2 == -INT_MAX) && (grid1 != INT_MAX && grid1 != -INT_MAX)) + { + this->work_u[tmp2] = this->work_u[tmp1]; + this->unusedCell[tmp2] = 0; + if(grid2 == INT_MAX) + { + setSubgridValue(getOwner(tmp2), -INT_MAX); + } + if(! (getBoundaryCondition(getOwner(tmp2)) & FROM_SOUTH) ) + setBoundaryCondition(getOwner(tmp2), getBoundaryCondition(getOwner(tmp2))+FROM_SOUTH); + } + else if ((fabs(this->work_u[tmp1]) > fabs(this->work_u[tmp2]) + this->delta || grid1 == INT_MAX || grid1 == -INT_MAX) && (grid2 != INT_MAX && grid2 != -INT_MAX)) + { + this->work_u[tmp1] = this->work_u[tmp2]; + this->unusedCell[tmp1] = 0; + if(grid1 == INT_MAX) + { + setSubgridValue(getOwner(tmp1), -INT_MAX); + } + if(! (getBoundaryCondition(getOwner(tmp1)) & FROM_NORTH) ) + setBoundaryCondition(getOwner(tmp1), getBoundaryCondition(getOwner(tmp1))+FROM_NORTH); + } + } + } + + } + else + { + for(int i = 1; i < this->gridCols; i++) + { + for (int j = 0; j < this->gridRows*this->n; j++) + { + tmp1 = this->gridCols*this->n*j + i*this->n - 1; + tmp2 = this->gridCols*this->n*j + i*this->n ; + grid1 = getSubgridValue(getOwner(tmp1)); + grid2 = getSubgridValue(getOwner(tmp2)); + if(getOwner(tmp1)==getOwner(tmp2)) + cout << "i, j" << i << "," << j << endl; + if ((fabs(this->work_u[tmp1]) < fabs(this->work_u[tmp2]) - this->delta || grid2 == INT_MAX || grid2 == -INT_MAX) && (grid1 != INT_MAX && grid1 != -INT_MAX)) + { + this->work_u[tmp2] = this->work_u[tmp1]; + this->unusedCell[tmp2] = 0; + if(grid2 == INT_MAX) + { + setSubgridValue(getOwner(tmp2), -INT_MAX); + } + if(! (getBoundaryCondition(getOwner(tmp2)) & FROM_WEST) ) + setBoundaryCondition(getOwner(tmp2), getBoundaryCondition(getOwner(tmp2))+FROM_WEST); + } + else if ((fabs(this->work_u[tmp1]) > fabs(this->work_u[tmp2]) + this->delta || grid1 == INT_MAX || grid1 == -INT_MAX) && (grid2 != INT_MAX && grid2 != -INT_MAX)) + { + this->work_u[tmp1] = this->work_u[tmp2]; + this->unusedCell[tmp1] = 0; + if(grid1 == INT_MAX) + { + setSubgridValue(getOwner(tmp1), -INT_MAX); + } + if(! 
(getBoundaryCondition(getOwner(tmp1)) & FROM_EAST) ) + setBoundaryCondition(getOwner(tmp1), getBoundaryCondition(getOwner(tmp1))+FROM_EAST); + } + } + } + } + + + this->currentStep++; + int stepValue = this->currentStep + 4; + for (int i = 0; i < this->subgridValues.getSize(); i++) + { + if( getSubgridValue(i) == -INT_MAX ) + setSubgridValue(i, stepValue); + } + + cout << "Grid synchronized at step " << (this->currentStep - 1 ) << endl; + +} + + +template< typename SchemeHost, typename SchemeDevice, typename Device> +int tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::getOwner(int i) const +{ + + return (i / (this->gridCols*this->n*this->n))*this->gridCols + (i % (this->gridCols*this->n))/this->n; +} + +template< typename SchemeHost, typename SchemeDevice, typename Device> +int tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::getSubgridValue( int i ) const +{ + return this->subgridValues[i]; +} + +template< typename SchemeHost, typename SchemeDevice, typename Device> +void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::setSubgridValue(int i, int value) +{ + this->subgridValues[i] = value; +} + +template< typename SchemeHost, typename SchemeDevice, typename Device> +int tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::getBoundaryCondition( int i ) const +{ + return this->boundaryConditions[i]; +} + +template< typename SchemeHost, typename SchemeDevice, typename Device> +void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::setBoundaryCondition(int i, int value) +{ + this->boundaryConditions[i] = value; +} + +template< typename SchemeHost, typename SchemeDevice, typename Device> +void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::stretchGrid() +{ + cout << "Stretching grid..." << endl; + + + this->gridCols = ceil( ((double)(this->mesh.getDimensions().x()-1)) / ((double)(this->n-1)) ); + this->gridRows = ceil( ((double)(this->mesh.getDimensions().y()-1)) / ((double)(this->n-1)) ); + + cout << "Setting gridCols to " << this->gridCols << "." << endl; + cout << "Setting gridRows to " << this->gridRows << "." << endl; + + this->subgridValues.setSize(this->gridCols*this->gridRows); + this->subgridValues.setValue(0); + this->boundaryConditions.setSize(this->gridCols*this->gridRows); + this->boundaryConditions.setValue(0); + this->calculationsCount.setSize(this->gridCols*this->gridRows); + this->calculationsCount.setValue(0); + + for(int i = 0; i < this->subgridValues.getSize(); i++ ) + { + this->subgridValues[i] = INT_MAX; + this->boundaryConditions[i] = 0; + } + + int stretchedSize = this->n*this->n*this->gridCols*this->gridRows; + + if(!this->work_u.setSize(stretchedSize)) + cerr << "Could not allocate memory for stretched grid." << endl; + if(!this->unusedCell.setSize(stretchedSize)) + cerr << "Could not allocate memory for supporting stretched grid." 
<< endl; + int idealStretch =this->mesh.getDimensions().x() + (this->mesh.getDimensions().x()-2)/(this->n-1); + cout << idealStretch << endl; + + for(int i = 0; i < stretchedSize; i++) + { + this->unusedCell[i] = 1; + int diff =(this->n*this->gridCols) - idealStretch ; + int k = i/this->n - i/(this->n*this->gridCols) + this->mesh.getDimensions().x()*(i/(this->n*this->n*this->gridCols)) + (i/(this->n*this->gridCols))*diff; + + if(i%(this->n*this->gridCols) - idealStretch >= 0) + { + k+= i%(this->n*this->gridCols) - idealStretch +1 ; + } + + if(i/(this->n*this->gridCols) - idealStretch + 1 > 0) + { + k+= (i/(this->n*this->gridCols) - idealStretch +1 )* this->mesh.getDimensions().x() ; + } + + this->work_u[i] = this->u0[i-k]; + } + + cout << "Grid stretched." << endl; +} + +template< typename SchemeHost, typename SchemeDevice, typename Device> +void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::contractGrid() +{ + cout << "Contracting grid..." << endl; + int stretchedSize = this->n*this->n*this->gridCols*this->gridRows; + + int idealStretch =this->mesh.getDimensions().x() + (this->mesh.getDimensions().x()-2)/(this->n-1); + cout << idealStretch << endl; + + for(int i = 0; i < stretchedSize; i++) + { + int diff =(this->n*this->gridCols) - idealStretch ; + int k = i/this->n - i/(this->n*this->gridCols) + this->mesh.getDimensions().x()*(i/(this->n*this->n*this->gridCols)) + (i/(this->n*this->gridCols))*diff; + + if((i%(this->n*this->gridCols) - idealStretch < 0) && (i/(this->n*this->gridCols) - idealStretch + 1 <= 0)) + { + this->u0[i-k] = this->work_u[i]; + } + + } + + cout << "Grid contracted" << endl; +} + +template< typename SchemeHost, typename SchemeDevice, typename Device> +typename tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::VectorType +tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::getSubgrid( const int i ) const +{ + VectorType u; + u.setSize(this->n*this->n); + + for( int j = 0; j < u.getSize(); j++) + { + u[j] = this->work_u[ (i / this->gridCols) * this->n*this->n*this->gridCols + + (i % this->gridCols) * this->n + + (j/this->n) * this->n*this->gridCols + + (j % this->n) ]; + } + return u; +} + +template< typename SchemeHost, typename SchemeDevice, typename Device> +void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::insertSubgrid( VectorType u, const int i ) +{ + + for( int j = 0; j < this->n*this->n; j++) + { + int index = (i / this->gridCols)*this->n*this->n*this->gridCols + (i % this->gridCols)*this->n + (j/this->n)*this->n*this->gridCols + (j % this->n); + //OMP LOCK index + if( (fabs(this->work_u[index]) > fabs(u[j])) || (this->unusedCell[index] == 1) ) + { + this->work_u[index] = u[j]; + this->unusedCell[index] = 0; + } + //OMP UNLOCK index + } +} + +template< typename SchemeHost, typename SchemeDevice, typename Device> +typename tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::VectorType +tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::runSubgrid( int boundaryCondition, VectorType u, int subGridID) +{ + + VectorType fu; + + fu.setLike(u); + fu.setValue( 0.0 ); + bool tmp = false; + for(int i = 0; i < u.getSize(); i++) + { + if(u[0]*u[i] <= 0.0) + tmp=true; + int centreGID = (this->n*(subGridID / this->gridRows)+ (this->n >> 1))*(this->n*this->gridCols) + this->n*(subGridID % this->gridRows) + (this->n >> 1); + if(this->unusedCell[centreGID] == 0 || boundaryCondition == 0) + tmp = true; + } + + double value = Sign(u[0]) * 
u.absMax(); + + if(tmp) + {} + + + //north - 1, east - 2, west - 4, south - 8 + else if(boundaryCondition == FROM_WEST) + { + for(int i = 0; i < this->n; i++) + for(int j = 1;j < this->n; j++) + u[i*this->n + j] = value;// u[i*this->n]; + } + else if(boundaryCondition == FROM_EAST) + { + for(int i = 0; i < this->n; i++) + for(int j =0 ;j < this->n -1; j++) + u[i*this->n + j] = value;// u[(i+1)*this->n - 1]; + } + else if(boundaryCondition == FROM_NORTH) + { + for(int j = 0; j < this->n; j++) + for(int i = 0;i < this->n - 1; i++) + u[i*this->n + j] = value;// u[j + this->n*(this->n - 1)]; + } + else if(boundaryCondition == FROM_SOUTH) + { + for(int j = 0; j < this->n; j++) + for(int i = 1;i < this->n; i++) + u[i*this->n + j] = value;// u[j]; + } + + double time = 0.0; + double currentTau = this->tau0; + double finalTime = this->stopTime;// + 3.0*(u.max() - u.min()); + if( time + currentTau > finalTime ) currentTau = finalTime - time; + + double maxResidue( 1.0 ); + while( time < finalTime /*|| maxResidue > subMesh.getHx()*/) + { + /**** + * Compute the RHS + */ + + for( int i = 0; i < fu.getSize(); i ++ ) + { + fu[ i ] = schemeHost.getValue( this->subMesh, i, this->subMesh.getCellCoordinates(i), u, time, boundaryCondition ); + } + maxResidue = fu. absMax(); + + + if( this -> cflCondition * maxResidue != 0.0) + currentTau = this -> cflCondition / maxResidue; + + if(currentTau > 1.0 * this->subMesh.getHx()) + { + currentTau = 1.0 * this->subMesh.getHx(); + } + + + if( time + currentTau > finalTime ) currentTau = finalTime - time; +<<<<<<< HEAD +// for( int i = 0; i < fu.getSize(); i ++ ) +// { +// //cout << "Too big RHS! i = " << i << ", fu = " << fu[i] << ", u = " << u[i] << endl; +// if((u[i]+currentTau * fu[ i ])*u[i] < 0.0 && fu[i] != 0.0 && u[i] != 0.0 ) +// currentTau = fabs(u[i]/(2.0*fu[i])); +// +// } +======= + for( int i = 0; i < fu.getSize(); i ++ ) + { + if((u[i]+currentTau * fu[ i ])*u[i] < 0.0 && fu[i] != 0.0 && u[i] != 0.0 ) + currentTau = fabs(u[i]/(2.0*fu[i])); + + } +>>>>>>> 86250e05537ea6311fa87d0f7bbae298788aa432 + + + for( int i = 0; i < fu.getSize(); i ++ ) + { + double add = u[i] + currentTau * fu[ i ]; + u[ i ] = add; + } + time += currentTau; + + } + VectorType solution; + solution.setLike(u); + for( int i = 0; i < u.getSize(); i ++ ) + { + solution[i]=u[i]; + } + return solution; +} + + +#ifdef HAVE_CUDA + + +template< typename SchemeHost, typename SchemeDevice, typename Device> +__device__ +void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::getSubgridCUDA( const int i ,tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int >* caller, double* a) +{ +<<<<<<< HEAD + //int j = threadIdx.x + threadIdx.y * blockDim.x; + int th = (blockIdx.y) * caller->n*caller->n*caller->gridCols + + (blockIdx.x) * caller->n + + threadIdx.y * caller->n*caller->gridCols + + threadIdx.x; + //printf("i= %d,j= %d,th= %d\n",i,j,th); +======= + int j = threadIdx.x + threadIdx.y * blockDim.x; + int th = (i / caller->gridCols) * caller->n*caller->n*caller->gridCols + + (i % caller->gridCols) * caller->n + + (j/caller->n) * caller->n*caller->gridCols + + (j % caller->n); +>>>>>>> 86250e05537ea6311fa87d0f7bbae298788aa432 + *a = caller->work_u_cuda[th]; +} + + +template< typename SchemeHost, typename SchemeDevice, typename Device> +__device__ +void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::updateSubgridCUDA( const int i ,tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int >* caller, double* a) +{ + int j = 
threadIdx.x + threadIdx.y * blockDim.x; + int index = (blockIdx.y) * caller->n*caller->n*caller->gridCols + + (blockIdx.x) * caller->n + + threadIdx.y * caller->n*caller->gridCols + + threadIdx.x; + + if( (fabs(caller->work_u_cuda[index]) > fabs(*a)) || (caller->unusedCell_cuda[index] == 1) ) + { + caller->work_u_cuda[index] = *a; + caller->unusedCell_cuda[index] = 0; + + } + + *a = caller->work_u_cuda[index]; +} + + +template< typename SchemeHost, typename SchemeDevice, typename Device> +__device__ +void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::insertSubgridCUDA( double u, const int i ) +{ + + +<<<<<<< HEAD +// int j = threadIdx.x + threadIdx.y * blockDim.x; + //printf("j = %d, u = %f\n", j,u); +======= + int j = threadIdx.x + threadIdx.y * blockDim.x; +>>>>>>> 86250e05537ea6311fa87d0f7bbae298788aa432 + + int index = (blockIdx.y)*this->n*this->n*this->gridCols + + (blockIdx.x)*this->n + + threadIdx.y*this->n*this->gridCols + + threadIdx.x; + + if( (fabs(this->work_u_cuda[index]) > fabs(u)) || (this->unusedCell_cuda[index] == 1) ) + { + this->work_u_cuda[index] = u; + this->unusedCell_cuda[index] = 0; + + } + + +} + + +template< typename SchemeHost, typename SchemeDevice, typename Device> +__device__ +void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::runSubgridCUDA( int boundaryCondition, double* u, int subGridID) +{ + + __shared__ int tmp; + __shared__ double value; + volatile double* sharedTau = &u[blockDim.x*blockDim.y]; + volatile double* absVal = &u[2*blockDim.x*blockDim.y]; + int i = threadIdx.x; + int j = threadIdx.y; + int l = threadIdx.y * blockDim.x + threadIdx.x; + bool computeFU = !((i == 0 && (boundaryCondition & FROM_WEST)) or + (i == blockDim.x - 1 && (boundaryCondition & FROM_EAST)) or + (j == 0 && (boundaryCondition & FROM_SOUTH)) or + (j == blockDim.y - 1 && (boundaryCondition & FROM_NORTH))); + + if(l == 0) + { + tmp = 0; + int centreGID = (blockDim.y*blockIdx.y + (blockDim.y>>1))*(blockDim.x*gridDim.x) + blockDim.x*blockIdx.x + (blockDim.x>>1); + if(this->unusedCell_cuda[centreGID] == 0 || boundaryCondition == 0) + tmp = 1; + } + __syncthreads(); + + + __syncthreads(); + if(tmp !=1) + { +// if(computeFU) +// absVal[l]=0.0; +// else +// absVal[l] = fabs(u[l]); +// +// __syncthreads(); +// +// if((blockDim.x == 16) && (l < 128)) absVal[l] = Max(absVal[l],absVal[l+128]); +// __syncthreads(); +// if((blockDim.x == 16) && (l < 64)) absVal[l] = Max(absVal[l],absVal[l+64]); +// __syncthreads(); +// if(l < 32) absVal[l] = Max(absVal[l],absVal[l+32]); +// if(l < 16) absVal[l] = Max(absVal[l],absVal[l+16]); +// if(l < 8) absVal[l] = Max(absVal[l],absVal[l+8]); +// if(l < 4) absVal[l] = Max(absVal[l],absVal[l+4]); +// if(l < 2) absVal[l] = Max(absVal[l],absVal[l+2]); +// if(l < 1) value = Sign(u[0])*Max(absVal[l],absVal[l+1]); +// __syncthreads(); +// +// if(computeFU) +// u[l] = value; + if(computeFU) +<<<<<<< HEAD + { + if(boundaryCondition == 4) + u[l] = u[threadIdx.y * blockDim.x] + Sign(u[0])*this->subMesh.getHx()*(threadIdx.x) ;//+ 2*Sign(u[0])*this->subMesh.getHx()*(threadIdx.x+this->n); + else if(boundaryCondition == 2) + u[l] = u[threadIdx.y * blockDim.x + blockDim.x - 1] + Sign(u[0])*this->subMesh.getHx()*(this->n - 1 - threadIdx.x);//+ 2*Sign(u[0])*this->subMesh.getHx()*(blockDim.x - threadIdx.x - 1+this->n); + else if(boundaryCondition == 8) + u[l] = u[threadIdx.x] + Sign(u[0])*this->subMesh.getHx()*(threadIdx.y) ;//+ 2*Sign(u[0])*this->subMesh.getHx()*(threadIdx.y+this->n); + else if(boundaryCondition == 1) + 
u[l] = u[(blockDim.y - 1)* blockDim.x + threadIdx.x] + Sign(u[0])*this->subMesh.getHx()*(this->n - 1 - threadIdx.y) ;//+ Sign(u[0])*this->subMesh.getHx()*(blockDim.y - threadIdx.y - 1 +this->n); +======= + absVal[l]=0.0; + else + absVal[l] = fabs(u[l]); + + __syncthreads(); + + if((blockDim.x == 16) && (l < 128)) absVal[l] = Max(absVal[l],absVal[l+128]); + __syncthreads(); + if((blockDim.x == 16) && (l < 64)) absVal[l] = Max(absVal[l],absVal[l+64]); + __syncthreads(); + if(l < 32) absVal[l] = Max(absVal[l],absVal[l+32]); + if(l < 16) absVal[l] = Max(absVal[l],absVal[l+16]); + if(l < 8) absVal[l] = Max(absVal[l],absVal[l+8]); + if(l < 4) absVal[l] = Max(absVal[l],absVal[l+4]); + if(l < 2) absVal[l] = Max(absVal[l],absVal[l+2]); + if(l < 1) value = Sign(u[0])*Max(absVal[l],absVal[l+1]); + __syncthreads(); + + if(computeFU) + { +// u[l] = value; + if(boundaryCondition==FROM_NORTH) + u[l] = u[i + blockDim.x*(blockDim.y-1)]; + else if(boundaryCondition==FROM_SOUTH) + u[l] = u[i]; + else if(boundaryCondition==FROM_EAST) + u[l] = u[blockDim.x-1 + blockDim.x*j]; + else if(boundaryCondition==FROM_WEST) + u[l] = u[blockDim.x*j]; +>>>>>>> 86250e05537ea6311fa87d0f7bbae298788aa432 + } + } + + double time = 0.0; + __shared__ double currentTau; + double cfl = this->cflCondition; + double fu = 0.0; +// if(threadIdx.x * threadIdx.y == 0) +// { +// currentTau = this->tau0; +// } + double finalTime = this->stopTime; + __syncthreads(); + if( time + currentTau > finalTime ) currentTau = finalTime - time; + + + while( time < finalTime ) + { + if(computeFU) + fu = schemeHost.getValueDev( this->subMesh, l, tnlStaticVector<2,int>(i,j)/*this->subMesh.getCellCoordinates(l)*/, u, time, boundaryCondition); + + sharedTau[l]=cfl/fabs(fu); + + if(l == 0) + { + if(sharedTau[0] > 1.0 * this->subMesh.getHx()) sharedTau[0] = 1.0 * this->subMesh.getHx(); + } + else if(l == blockDim.x*blockDim.y - 1) + if( time + sharedTau[l] > finalTime ) sharedTau[l] = finalTime - time; + + + + if((blockDim.x == 16) && (l < 128)) sharedTau[l] = Min(sharedTau[l],sharedTau[l+128]); + __syncthreads(); + if((blockDim.x == 16) && (l < 64)) sharedTau[l] = Min(sharedTau[l],sharedTau[l+64]); + __syncthreads(); + if(l < 32) sharedTau[l] = Min(sharedTau[l],sharedTau[l+32]); + if(l < 16) sharedTau[l] = Min(sharedTau[l],sharedTau[l+16]); + if(l < 8) sharedTau[l] = Min(sharedTau[l],sharedTau[l+8]); + if(l < 4) sharedTau[l] = Min(sharedTau[l],sharedTau[l+4]); + if(l < 2) sharedTau[l] = Min(sharedTau[l],sharedTau[l+2]); + if(l < 1) currentTau = Min(sharedTau[l],sharedTau[l+1]); + __syncthreads(); + + u[l] += currentTau * fu; + time += currentTau; + } + + +} + + +template< typename SchemeHost, typename SchemeDevice, typename Device> +__device__ +int tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::getOwnerCUDA(int i) const +{ + + return ((i / (this->gridCols*this->n*this->n))*this->gridCols + + (i % (this->gridCols*this->n))/this->n); +} + + +template< typename SchemeHost, typename SchemeDevice, typename Device> +__device__ +int tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::getSubgridValueCUDA( int i ) const +{ + return this->subgridValues_cuda[i]; +} + + +template< typename SchemeHost, typename SchemeDevice, typename Device> +__device__ +void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::setSubgridValueCUDA(int i, int value) +{ + this->subgridValues_cuda[i] = value; +} + + +template< typename SchemeHost, typename SchemeDevice, typename Device> +__device__ +int 
tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::getBoundaryConditionCUDA( int i ) const +{ + return this->boundaryConditions_cuda[i]; +} + + +template< typename SchemeHost, typename SchemeDevice, typename Device> +__device__ +void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::setBoundaryConditionCUDA(int i, int value) +{ + this->boundaryConditions_cuda[i] = value; +} + + + +//north - 1, east - 2, west - 4, south - 8 + +template <typename SchemeHost, typename SchemeDevice, typename Device> +__global__ +void synchronizeCUDA(tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int >* cudaSolver) //needs fix ---- maybe not anymore --- but frankly: yeah, it does -- aaaa-and maybe fixed now +{ + + __shared__ int boundary[4]; // north,east,west,south + __shared__ int subgridValue; + __shared__ int newSubgridValue; + + + int gid = (blockDim.y*blockIdx.y + threadIdx.y)*blockDim.x*gridDim.x + blockDim.x*blockIdx.x + threadIdx.x; + double u = cudaSolver->work_u_cuda[gid]; + double u_cmp; + int subgridValue_cmp=INT_MAX; + int boundary_index=0; + + + if(threadIdx.x+threadIdx.y == 0) + { + subgridValue = cudaSolver->getSubgridValueCUDA(blockIdx.y*gridDim.x + blockIdx.x); + boundary[0] = 0; + boundary[1] = 0; + boundary[2] = 0; + boundary[3] = 0; + newSubgridValue = 0; + } + __syncthreads(); + + + + if( (threadIdx.x == 0 /* && !(cudaSolver->currentStep & 1)*/) || + (threadIdx.y == 0 /* && (cudaSolver->currentStep & 1)*/) || + (threadIdx.x == blockDim.x - 1 /* && !(cudaSolver->currentStep & 1)*/) || + (threadIdx.y == blockDim.y - 1 /* && (cudaSolver->currentStep & 1)*/) ) + { + if(threadIdx.x == 0 && (blockIdx.x != 0)/* && !(cudaSolver->currentStep & 1)*/) + { + u_cmp = cudaSolver->work_u_cuda[gid - 1]; + subgridValue_cmp = cudaSolver->getSubgridValueCUDA(blockIdx.y*gridDim.x + blockIdx.x - 1); + boundary_index = 2; + } + + if(threadIdx.x == blockDim.x - 1 && (blockIdx.x != gridDim.x - 1)/* && !(cudaSolver->currentStep & 1)*/) + { + u_cmp = cudaSolver->work_u_cuda[gid + 1]; + subgridValue_cmp = cudaSolver->getSubgridValueCUDA(blockIdx.y*gridDim.x + blockIdx.x + 1); + boundary_index = 1; + } + __threadfence(); + if((subgridValue == INT_MAX || fabs(u_cmp) + cudaSolver->delta < fabs(u) ) && (subgridValue_cmp != INT_MAX && subgridValue_cmp != -INT_MAX)) + { + cudaSolver->unusedCell_cuda[gid] = 0; + atomicMax(&newSubgridValue, INT_MAX); + atomicMax(&boundary[boundary_index], 1); + cudaSolver->work_u_cuda[gid] = u_cmp; + u=u_cmp; + } + if(threadIdx.y == 0 && (blockIdx.y != 0)/* && (cudaSolver->currentStep & 1)*/) + { + u_cmp = cudaSolver->work_u_cuda[gid - blockDim.x*gridDim.x]; + subgridValue_cmp = cudaSolver->getSubgridValueCUDA((blockIdx.y - 1)*gridDim.x + blockIdx.x); + boundary_index = 3; + } + if(threadIdx.y == blockDim.y - 1 && (blockIdx.y != gridDim.y - 1)/* && (cudaSolver->currentStep & 1)*/) + { + u_cmp = cudaSolver->work_u_cuda[gid + blockDim.x*gridDim.x]; + subgridValue_cmp = cudaSolver->getSubgridValueCUDA((blockIdx.y + 1)*gridDim.x + blockIdx.x); + boundary_index = 0; + } + + __threadfence(); + if((subgridValue == INT_MAX || fabs(u_cmp) + cudaSolver->delta < fabs(u) ) && (subgridValue_cmp != INT_MAX && subgridValue_cmp != -INT_MAX)) + { + cudaSolver->unusedCell_cuda[gid] = 0; + atomicMax(&newSubgridValue, INT_MAX); + atomicMax(&boundary[boundary_index], 1); + cudaSolver->work_u_cuda[gid] = u_cmp; + } + } + __syncthreads(); + + if(threadIdx.x+threadIdx.y == 0) + { + if(subgridValue == INT_MAX && newSubgridValue !=0) + 
cudaSolver->setSubgridValueCUDA(blockIdx.y*gridDim.x + blockIdx.x, -INT_MAX); + + cudaSolver->setBoundaryConditionCUDA(blockIdx.y*gridDim.x + blockIdx.x, FROM_NORTH * boundary[0] + + FROM_EAST * boundary[1] + + FROM_WEST * boundary[2] + + FROM_SOUTH * boundary[3]); + } +} + + + +template <typename SchemeHost, typename SchemeDevice, typename Device> +__global__ +void synchronize2CUDA(tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int >* cudaSolver) +{ + if(blockIdx.x+blockIdx.y ==0) + { + cudaSolver->currentStep = cudaSolver->currentStep + 1; + *(cudaSolver->runcuda) = 0; + } + + int stepValue = cudaSolver->currentStep + 4; + if( cudaSolver->getSubgridValueCUDA(blockIdx.y*gridDim.x + blockIdx.x) == -INT_MAX ) + cudaSolver->setSubgridValueCUDA(blockIdx.y*gridDim.x + blockIdx.x, stepValue); + + atomicMax((cudaSolver->runcuda),cudaSolver->getBoundaryConditionCUDA(blockIdx.y*gridDim.x + blockIdx.x)); +} + + + + + + + + +template< typename SchemeHost, typename SchemeDevice, typename Device> +__global__ +void initCUDA( tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int >* cudaSolver, double* ptr , int* ptr2, int* ptr3) +{ + cudaSolver->work_u_cuda = ptr; + cudaSolver->unusedCell_cuda = ptr3; + cudaSolver->subgridValues_cuda =(int*)malloc(cudaSolver->gridCols*cudaSolver->gridRows*sizeof(int)); + cudaSolver->boundaryConditions_cuda =(int*)malloc(cudaSolver->gridCols*cudaSolver->gridRows*sizeof(int)); + cudaSolver->runcuda = ptr2; + *(cudaSolver->runcuda) = 1; + cudaSolver->currentStep = 1; + printf("GPU memory allocated.\n"); + + for(int i = 0; i < cudaSolver->gridCols*cudaSolver->gridRows; i++) + { + cudaSolver->subgridValues_cuda[i] = INT_MAX; + cudaSolver->boundaryConditions_cuda[i] = 0; + } + + printf("GPU memory initialized.\n"); +} + + + + +template< typename SchemeHost, typename SchemeDevice, typename Device > +__global__ +void initRunCUDA(tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int >* caller) + +{ + + + extern __shared__ double u[]; + + int i = blockIdx.y * gridDim.x + blockIdx.x; + int l = threadIdx.y * blockDim.x + threadIdx.x; + + __shared__ int containsCurve; + if(l == 0) + containsCurve = 0; + + caller->getSubgridCUDA(i,caller, &u[l]); + __syncthreads(); + if(u[0] * u[l] <= 0.0) + { + atomicMax( &containsCurve, 1); + } + + __syncthreads(); + if(containsCurve == 1) + { + caller->runSubgridCUDA(0,u,i); + __syncthreads(); + caller->insertSubgridCUDA(u[l],i); + __syncthreads(); + if(l == 0) + caller->setSubgridValueCUDA(i, 4); + } + + +} + + + + + +template< typename SchemeHost, typename SchemeDevice, typename Device > +__global__ +void runCUDA(tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int >* caller) +{ + extern __shared__ double u[]; + int i = blockIdx.y * gridDim.x + blockIdx.x; + int l = threadIdx.y * blockDim.x + threadIdx.x; + int bound = caller->getBoundaryConditionCUDA(i); + + if(caller->getSubgridValueCUDA(i) != INT_MAX && bound != 0 && caller->getSubgridValueCUDA(i) > 0) + { + caller->getSubgridCUDA(i,caller, &u[l]); +<<<<<<< HEAD + + //if(l == 0) + //printf("i = %d, bound = %d\n",i,caller->getSubgridValueCUDA(i)); + if(caller->getSubgridValueCUDA(i) == caller->currentStep+4) + { + if(bound & 1) + { + caller->runSubgridCUDA(1,u,i); + //__syncthreads(); + //caller->insertSubgridCUDA(u[l],i); + //__syncthreads(); + //caller->getSubgridCUDA(i,caller, &u[l]); + caller->updateSubgridCUDA(i,caller, &u[l]); + __syncthreads(); + } + if(bound & 2 ) + { + caller->runSubgridCUDA(2,u,i); + 
//__syncthreads(); + //caller->insertSubgridCUDA(u[l],i); + //__syncthreads(); + //caller->getSubgridCUDA(i,caller, &u[l]); + caller->updateSubgridCUDA(i,caller, &u[l]); + __syncthreads(); + } + if(bound & 4) + { + caller->runSubgridCUDA(4,u,i); + //__syncthreads(); + //caller->insertSubgridCUDA(u[l],i); + //__syncthreads(); + //caller->getSubgridCUDA(i,caller, &u[l]); + caller->updateSubgridCUDA(i,caller, &u[l]); + __syncthreads(); + } + if(bound & 8) + { + caller->runSubgridCUDA(8,u,i); + //__syncthreads(); + //caller->insertSubgridCUDA(u[l],i); + //__syncthreads(); + //caller->getSubgridCUDA(i,caller, &u[l]); + caller->updateSubgridCUDA(i,caller, &u[l]); + __syncthreads(); + } + + + + + + if( ((bound & 3 ))) + { + caller->runSubgridCUDA(3,u,i); + //__syncthreads(); + //caller->insertSubgridCUDA(u[l],i); + //__syncthreads(); + //caller->getSubgridCUDA(i,caller, &u[l]); + caller->updateSubgridCUDA(i,caller, &u[l]); + __syncthreads(); + } + if( ((bound & 5 ))) + { + caller->runSubgridCUDA(5,u,i); + //__syncthreads(); + //caller->insertSubgridCUDA(u[l],i); + //__syncthreads(); + //caller->getSubgridCUDA(i,caller, &u[l]); + caller->updateSubgridCUDA(i,caller, &u[l]); + __syncthreads(); + } + if( ((bound & 10 ))) + { + caller->runSubgridCUDA(10,u,i); + //__syncthreads(); + //caller->insertSubgridCUDA(u[l],i); + //__syncthreads(); + //caller->getSubgridCUDA(i,caller, &u[l]); + caller->updateSubgridCUDA(i,caller, &u[l]); + __syncthreads(); + } + if( (bound & 12 )) + { + caller->runSubgridCUDA(12,u,i); + //__syncthreads(); + //caller->insertSubgridCUDA(u[l],i); + //__syncthreads(); + //caller->getSubgridCUDA(i,caller, &u[l]); + caller->updateSubgridCUDA(i,caller, &u[l]); + __syncthreads(); + } + + + + + + } + + + else + { + + + + + + + + + + if( ((bound == 2))) + { + caller->runSubgridCUDA(2,u,i); + //__syncthreads(); + //caller->insertSubgridCUDA(u[l],i); + //__syncthreads(); + //caller->getSubgridCUDA(i,caller, &u[l]); + caller->updateSubgridCUDA(i,caller, &u[l]); + __syncthreads(); + } + if( ((bound == 1) )) + { + caller->runSubgridCUDA(1,u,i); + //__syncthreads(); + //caller->insertSubgridCUDA(u[l],i); + //__syncthreads(); + //caller->getSubgridCUDA(i,caller, &u[l]); + caller->updateSubgridCUDA(i,caller, &u[l]); + __syncthreads(); + } + if( ((bound == 8) )) + { + caller->runSubgridCUDA(8,u,i); + //__syncthreads(); + //caller->insertSubgridCUDA(u[l],i); + //__syncthreads(); + //caller->getSubgridCUDA(i,caller, &u[l]); + caller->updateSubgridCUDA(i,caller, &u[l]); + __syncthreads(); + } + if( (bound == 4)) + { + caller->runSubgridCUDA(4,u,i); + //__syncthreads(); + //caller->insertSubgridCUDA(u[l],i); + //__syncthreads(); + //caller->getSubgridCUDA(i,caller, &u[l]); + caller->updateSubgridCUDA(i,caller, &u[l]); + __syncthreads(); + } + + + + + + + + + + + if( ((bound & 3) )) + { + caller->runSubgridCUDA(3,u,i); + //__syncthreads(); + //caller->insertSubgridCUDA(u[l],i); + //__syncthreads(); + //caller->getSubgridCUDA(i,caller, &u[l]); + caller->updateSubgridCUDA(i,caller, &u[l]); + __syncthreads(); + } + if( ((bound & 5) )) + { + caller->runSubgridCUDA(5,u,i); + //__syncthreads(); + //caller->insertSubgridCUDA(u[l],i); + //__syncthreads(); + //caller->getSubgridCUDA(i,caller, &u[l]); + caller->updateSubgridCUDA(i,caller, &u[l]); + __syncthreads(); + } + if( ((bound & 10) )) + { + caller->runSubgridCUDA(10,u,i); + //__syncthreads(); + //caller->insertSubgridCUDA(u[l],i); + //__syncthreads(); + //caller->getSubgridCUDA(i,caller, &u[l]); + caller->updateSubgridCUDA(i,caller, &u[l]); + 
__syncthreads(); + } + if( (bound & 12) ) + { + caller->runSubgridCUDA(12,u,i); + //__syncthreads(); + //caller->insertSubgridCUDA(u[l],i); + //__syncthreads(); + //caller->getSubgridCUDA(i,caller, &u[l]); + caller->updateSubgridCUDA(i,caller, &u[l]); + __syncthreads(); + } + + + + + + + + + + + + + } + /*if( bound ) + { + caller->runSubgridCUDA(15,u,i); + __syncthreads(); + //caller->insertSubgridCUDA(u[l],i); + //__syncthreads(); + //caller->getSubgridCUDA(i,caller, &u[l]); + caller->updateSubgridCUDA(i,caller, &u[l]); + __syncthreads(); + }*/ +======= + int bound = caller->getBoundaryConditionCUDA(i); + if(caller->getSubgridValueCUDA(i) == caller->currentStep+4) + { + if(bound & FROM_NORTH) + { + caller->runSubgridCUDA(FROM_NORTH,u,i); + __syncthreads(); + caller->updateSubgridCUDA(i,caller, &u[l]); + __syncthreads(); + } + if(bound & FROM_EAST ) + { + caller->runSubgridCUDA(FROM_EAST,u,i); + __syncthreads(); + caller->updateSubgridCUDA(i,caller, &u[l]); + __syncthreads(); + } + if(bound & FROM_WEST) + { + caller->runSubgridCUDA(FROM_WEST,u,i); + __syncthreads(); + caller->updateSubgridCUDA(i,caller, &u[l]); + __syncthreads(); + } + if(bound & FROM_SOUTH) + { + caller->runSubgridCUDA(FROM_SOUTH,u,i); + __syncthreads(); + caller->updateSubgridCUDA(i,caller, &u[l]); + __syncthreads(); + } + } + + + + if( ((bound & FROM_EAST) || (bound & FROM_NORTH) )) + { + caller->runSubgridCUDA(FROM_NORTH + FROM_EAST,u,i); + __syncthreads(); + caller->updateSubgridCUDA(i,caller, &u[l]); + __syncthreads(); + } + if( ((bound & FROM_WEST) || (bound & FROM_NORTH) )) + { + caller->runSubgridCUDA(FROM_NORTH + FROM_WEST,u,i); + __syncthreads(); + caller->updateSubgridCUDA(i,caller, &u[l]); + __syncthreads(); + } + if( ((bound & FROM_EAST) || (bound & FROM_SOUTH) )) + { + caller->runSubgridCUDA(FROM_SOUTH + FROM_EAST,u,i); + __syncthreads(); + caller->updateSubgridCUDA(i,caller, &u[l]); + __syncthreads(); + } + if( (bound & FROM_WEST) || (bound & FROM_SOUTH) ) + { + caller->runSubgridCUDA(FROM_SOUTH + FROM_WEST,u,i); + __syncthreads(); + caller->updateSubgridCUDA(i,caller, &u[l]); + __syncthreads(); + } +>>>>>>> 86250e05537ea6311fa87d0f7bbae298788aa432 + + if(l==0) + { + caller->setBoundaryConditionCUDA(i, 0); + caller->setSubgridValueCUDA(i, caller->getSubgridValueCUDA(i) - 1 ); + } + + + } + + + +} + +#endif /*HAVE_CUDA*/ + +#endif /* TNLPARALLELEIKONALSOLVER_IMPL_H_ */ -- GitLab
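
For reference, a minimal standalone sketch of how the direction flags defined at the top of the patched file are meant to combine. The main() wrapper and the example bitmask value below are illustrative assumptions and not part of the patch: each side a subgrid has received new data from sets one bit, and the corner solves are requested by summing two adjacent flags, which is why runSubgrid()/runSubgridCUDA() are called with values such as FROM_NORTH + FROM_EAST.

    #include <cstdio>

    // Same encoding as in tnlParallelEikonalSolver_impl.h: north - 1, east - 2, west - 4, south - 8.
    #define FROM_NORTH 1
    #define FROM_EAST  2
    #define FROM_WEST  4
    #define FROM_SOUTH 8

    int main()
    {
       // Hypothetical boundary-condition word: new data arrived from the north and the east.
       int bound = FROM_NORTH | FROM_EAST;

       // Single-direction updates test one bit each.
       if( bound & FROM_NORTH ) std::printf( "update from north (flag %d)\n", FROM_NORTH );
       if( bound & FROM_EAST )  std::printf( "update from east (flag %d)\n", FROM_EAST );

       // A corner update is requested when either adjacent direction is set; the two flags
       // are summed to form the composite code passed to runSubgrid()/runSubgridCUDA().
       if( ( bound & FROM_NORTH ) || ( bound & FROM_EAST ) )
          std::printf( "update from north-east corner (flag %d)\n", FROM_NORTH + FROM_EAST );

       return 0;
    }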