Commit a306dd84 authored by Vojtěch Legler

Added custom makefiles

parent 4cb8baa2
......@@ -85,7 +85,7 @@ __cuda_callable__
ArrayView< Value, Device, Index >::
ArrayView( const std::initializer_list< Value >& elems )
{
this->getSize( elems.size() );
//this->getSize( elems.size() );
int i = 0;
for (const auto &elem : elems)
{
......
......@@ -66,13 +66,12 @@ addElement( IndexType i, RealType value, Scalar thisElementMultiplicator )
template< typename Real,
typename Index,
typename Expression >
__global__ void expressionTemplatesKernel( Real* dt, Expression* expression, Index size )
__global__ void expressionTemplatesKernel( Real* dt, Expression expression, Index size )
{
const int idx = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
for (int i = idx; i < size; i += stride)
dt[ i ] = expression[ i ];
//( *dt )[ idx ] = ( *expression )[ idx ];
}
#endif
......@@ -110,16 +109,10 @@ evaluateFor( const VectorOperationType& vo )
#ifdef HAVE_CUDA
else if( std::is_same< DeviceType, Devices::Cuda >::value )
{
//VectorOperationType* expression = Devices::Cuda::passToDevice( vo );
dim3 cudaBlockSize( 256 ), cudaGridSize( Devices::Cuda::getMaxGridSize() );
expressionTemplatesKernel<<< cudaGridSize, cudaBlockSize >>>( this->data, &vo, this->getSize() );
//expressionTemplatesKernel<<< cudaGridSize, cudaBlockSize >>>( this->data, expression, this->getSize() );
expressionTemplatesKernel<<< cudaGridSize, cudaBlockSize >>>( this->data, vo, this->getSize() );
TNL_CHECK_CUDA_DEVICE;
//Devices::Cuda::freeFromDevice( expression );
//TNL_CHECK_CUDA_DEVICE;
}
#endif
}
......
......@@ -3,7 +3,7 @@ set( headers StaticVectorExpressions.h
)
IF( BUILD_CUDA )
CUDA_ADD_EXECUTABLE( tnl-expression-templates expression-templates.cu )
CUDA_ADD_EXECUTABLE( tnl-expression-templates-cuda expression-templates.cu )
ELSE( BUILD_CUDA )
ADD_EXECUTABLE( tnl-expression-templates expression-templates.cpp )
ADD_EXECUTABLE( tnl-expression-templates-static expression-templates-static.cpp )
......@@ -11,8 +11,15 @@ ELSE( BUILD_CUDA )
ADD_EXECUTABLE( tnl-expression-templates-static-temp expression-templates-static-temp.cpp )
ENDIF( BUILD_CUDA )
INSTALL( TARGETS tnl-expression-templates
IF( BUILD_CUDA )
INSTALL( TARGETS tnl-expression-templates-cuda
RUNTIME DESTINATION bin
PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE )
ELSE( BUILD_CUDA )
INSTALL( TARGETS tnl-expression-templates
RUNTIME DESTINATION bin
PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE )
ENDIF( BUILD_CUDA )
INSTALL( FILES ${headers} DESTINATION ${TNL_TARGET_INCLUDE_DIRECTORY}/Experimental/ExpressionTemplates )
# Stand-alone makefile for the expression-template benchmarks:
# optimized (-O3, NDEBUG) build using g++, binaries go to $(OUTDIR).
# `make` / `make all` builds the four host benchmarks,
# `make cuda` builds the nvcc-compiled benchmark.
TNL_INCLUDE_DIR=${HOME}/tnl-dev/src
CXX = g++
CUDA_CXX = nvcc

# Options passed to both the host compiler and nvcc.
COMMON_FLAGS = -I$(TNL_INCLUDE_DIR) -std=c++11 -O3 -DNDEBUG
# Host-compiler-only options; nvcc rejects these when given directly.
HOST_FLAGS = -pthread -Wall -Wno-unused-local-typedefs -Wno-unused-variable -march=native -mtune=native
CXX_FLAGS = $(COMMON_FLAGS) $(HOST_FLAGS)
# Forward each host-only option to nvcc's host compiler via -Xcompiler.
CUDA_FLAGS = $(COMMON_FLAGS) $(foreach flag,$(HOST_FLAGS),-Xcompiler $(flag))

OUTDIR = release_gcc

.PHONY: all cuda

all: $(OUTDIR)/tnl-expression-templates $(OUTDIR)/tnl-expression-templates-static $(OUTDIR)/tnl-expression-templates-temp $(OUTDIR)/tnl-expression-templates-static-temp

cuda: $(OUTDIR)/tnl-expression-templates-cuda

# Output directory; used as an order-only prerequisite so that its
# timestamp never forces a rebuild of the binaries.
$(OUTDIR):
	mkdir -p $@

$(OUTDIR)/tnl-expression-templates: expression-templates.cpp | $(OUTDIR)
	$(CXX) -o $@ $< $(CXX_FLAGS)

$(OUTDIR)/tnl-expression-templates-static: expression-templates-static.cpp | $(OUTDIR)
	$(CXX) -o $@ $< $(CXX_FLAGS)

$(OUTDIR)/tnl-expression-templates-temp: expression-templates-temp.cpp | $(OUTDIR)
	$(CXX) -o $@ $< $(CXX_FLAGS)

$(OUTDIR)/tnl-expression-templates-static-temp: expression-templates-static-temp.cpp | $(OUTDIR)
	$(CXX) -o $@ $< $(CXX_FLAGS)

$(OUTDIR)/tnl-expression-templates-cuda: expression-templates.cu | $(OUTDIR)
	$(CUDA_CXX) -o $@ $< $(CUDA_FLAGS)
# Stand-alone makefile for the expression-template benchmarks:
# optimized (-O3, NDEBUG) build using clang++, binaries go to $(OUTDIR).
# `make` / `make all` builds the four host benchmarks,
# `make cuda` builds the nvcc-compiled benchmark.
TNL_INCLUDE_DIR=${HOME}/tnl-dev/src
CXX = clang++
CUDA_CXX = nvcc

# Options passed to both the host compiler and nvcc.
COMMON_FLAGS = -I$(TNL_INCLUDE_DIR) -std=c++11 -O3 -DNDEBUG
# Host-compiler-only options; nvcc rejects these when given directly.
HOST_FLAGS = -pthread -Wall -Wno-unused-local-typedefs -Wno-unused-variable -march=native -mtune=native
CXX_FLAGS = $(COMMON_FLAGS) $(HOST_FLAGS)
# Forward each host-only option to nvcc's host compiler via -Xcompiler.
CUDA_FLAGS = $(COMMON_FLAGS) $(foreach flag,$(HOST_FLAGS),-Xcompiler $(flag))

OUTDIR = release_clang

.PHONY: all cuda

all: $(OUTDIR)/tnl-expression-templates $(OUTDIR)/tnl-expression-templates-static $(OUTDIR)/tnl-expression-templates-temp $(OUTDIR)/tnl-expression-templates-static-temp

cuda: $(OUTDIR)/tnl-expression-templates-cuda

# Output directory; used as an order-only prerequisite so that its
# timestamp never forces a rebuild of the binaries.
$(OUTDIR):
	mkdir -p $@

$(OUTDIR)/tnl-expression-templates: expression-templates.cpp | $(OUTDIR)
	$(CXX) -o $@ $< $(CXX_FLAGS)

$(OUTDIR)/tnl-expression-templates-static: expression-templates-static.cpp | $(OUTDIR)
	$(CXX) -o $@ $< $(CXX_FLAGS)

$(OUTDIR)/tnl-expression-templates-temp: expression-templates-temp.cpp | $(OUTDIR)
	$(CXX) -o $@ $< $(CXX_FLAGS)

$(OUTDIR)/tnl-expression-templates-static-temp: expression-templates-static-temp.cpp | $(OUTDIR)
	$(CXX) -o $@ $< $(CXX_FLAGS)

$(OUTDIR)/tnl-expression-templates-cuda: expression-templates.cu | $(OUTDIR)
	$(CUDA_CXX) -o $@ $< $(CUDA_FLAGS)
# Stand-alone makefile for the expression-template benchmarks:
# debug (-g, assertions enabled) build using g++, binaries go to $(OUTDIR).
# `make` / `make all` builds the four host benchmarks,
# `make cuda` builds the nvcc-compiled benchmark.
TNL_INCLUDE_DIR=${HOME}/tnl-dev/src
CXX = g++
CUDA_CXX = nvcc

# Options passed to both the host compiler and nvcc.
COMMON_FLAGS = -I$(TNL_INCLUDE_DIR) -std=c++11 -g
# Host-compiler-only options; nvcc rejects these when given directly.
HOST_FLAGS = -pthread -Wall -Wno-unused-local-typedefs -Wno-unused-variable
CXX_FLAGS = $(COMMON_FLAGS) $(HOST_FLAGS)
# Forward each host-only option to nvcc's host compiler via -Xcompiler.
CUDA_FLAGS = $(COMMON_FLAGS) $(foreach flag,$(HOST_FLAGS),-Xcompiler $(flag))

OUTDIR = debug

.PHONY: all cuda

all: $(OUTDIR)/tnl-expression-templates $(OUTDIR)/tnl-expression-templates-static $(OUTDIR)/tnl-expression-templates-temp $(OUTDIR)/tnl-expression-templates-static-temp

cuda: $(OUTDIR)/tnl-expression-templates-cuda

# Output directory; used as an order-only prerequisite so that its
# timestamp never forces a rebuild of the binaries.
$(OUTDIR):
	mkdir -p $@

$(OUTDIR)/tnl-expression-templates: expression-templates.cpp | $(OUTDIR)
	$(CXX) -o $@ $< $(CXX_FLAGS)

$(OUTDIR)/tnl-expression-templates-static: expression-templates-static.cpp | $(OUTDIR)
	$(CXX) -o $@ $< $(CXX_FLAGS)

$(OUTDIR)/tnl-expression-templates-temp: expression-templates-temp.cpp | $(OUTDIR)
	$(CXX) -o $@ $< $(CXX_FLAGS)

$(OUTDIR)/tnl-expression-templates-static-temp: expression-templates-static-temp.cpp | $(OUTDIR)
	$(CXX) -o $@ $< $(CXX_FLAGS)

$(OUTDIR)/tnl-expression-templates-cuda: expression-templates.cu | $(OUTDIR)
	$(CUDA_CXX) -o $@ $< $(CUDA_FLAGS)
#pragma once
#include <iostream>
#include <iomanip>
#include <TNL/Timer.h>
#include "OverloadedOperators.h"
#include <TNL/Containers/Vector.h>
#include <TNL/Experimental/ExpressionTemplates/VectorExpressions.h>
//#include <TNL/Experimental/ExpressionTemplates/VectorExpressionsWithReferences.h>
using namespace std;
using namespace TNL;
using namespace TNL::Containers;
/**
 * Micro-benchmark of expression-template evaluation on vector views.
 *
 * Fills two 10-element host vectors with constants (1.5 and 3), wraps them
 * and a result vector in VectorViews, evaluates `dv1a + dv1b` into the
 * result view `numb` times, and prints the values returned by
 * TNL::Timer::getCPUCycles() and TNL::Timer::getCPUTime(), one per line.
 *
 * @return 0 on completion.
 */
int main()
{
   const int size = 10;

   // Operands and result storage.
   Vector< double, Devices::Host, int > d1a( size );
   for( int i = 0; i < size; i++ )
      d1a[ i ] = 1.5;

   Vector< double, Devices::Host, int > d1b( size );
   for( int i = 0; i < size; i++ )
      d1b[ i ] = 3;

   Vector< double, Devices::Host, int > dr1( size );

   // Views over the vectors above; the expression is built from views.
   VectorView< double, Devices::Host, int > dv1a( d1a );
   VectorView< double, Devices::Host, int > dv1b( d1b );
   VectorView< double, Devices::Host, int > dvr1( dr1 );

   TNL::Timer t4;

   long double dtm1 = 0;   // CPU cycles reported by the timer
   long double dtmt1 = 0;  // CPU time reported by the timer

   int numb = 200000;      // number of timed evaluations

   //dynamic vectors
   t4.start();
   for( int i = 0; i < numb; i++ )
      dvr1.evaluateFor( dv1a + dv1b );
   t4.stop();
   dtm1 = t4.getCPUCycles();
   dtmt1 = t4.getCPUTime();

   //std::cout << std::fixed;
   std::cout << std::setprecision( 2 );
   // Was "/n", which printed the two characters '/' and 'n' rather than
   // separating the two measurements with a line break.
   std::cout << dtm1 << "\n" << dtmt1 << std::endl;

   return 0;
}
......@@ -15,109 +15,109 @@ using namespace TNL::Containers;
int main()
{
Vector< double, Devices::Cuda, int > d1a( 10 );
Vector< double, Devices::Host, int > d1a( 10 );
for( int i = 0; i < 10; i++)
d1a[i] = 1.5;
Vector< double, Devices::Cuda, int > d1b( 10 );
Vector< double, Devices::Host, int > d1b( 10 );
for( int i = 0; i < 10; i++)
d1b[i] = 3;
Vector< double, Devices::Cuda, int > d1c( 10 );
Vector< double, Devices::Host, int > d1c( 10 );
for( int i = 0; i < 10; i++)
d1c[i] = 25;
Vector< double, Devices::Cuda, int > d2a( 100 );
Vector< double, Devices::Host, int > d2a( 100 );
for( int i = 0; i < 100; i++)
d2a[i] = 1.5;
Vector< double, Devices::Cuda, int > d2b( 100 );
Vector< double, Devices::Host, int > d2b( 100 );
for( int i = 0; i < 100; i++)
d2b[i] = 3;
Vector< double, Devices::Cuda, int > d2c( 100 );
Vector< double, Devices::Host, int > d2c( 100 );
for( int i = 0; i < 100; i++)
d2c[i] = 25;
Vector< double, Devices::Cuda, int > d3a( 510 );
Vector< double, Devices::Host, int > d3a( 510 );
for( int i = 0; i < 510; i++)
d3a[i] = 1.5;
Vector< double, Devices::Cuda, int > d3b( 510 );
Vector< double, Devices::Host, int > d3b( 510 );
for( int i = 0; i < 510; i++)
d3b[i] = 3;
Vector< double, Devices::Cuda, int > d3c( 510 );
Vector< double, Devices::Host, int > d3c( 510 );
for( int i = 0; i < 510; i++)
d3c[i] = 25;
Vector< double, Devices::Cuda, int > d4a( 515 );
Vector< double, Devices::Host, int > d4a( 515 );
for( int i = 0; i < 515; i++)
d4a[i] = 1.5;
Vector< double, Devices::Cuda, int > d4b( 515 );
Vector< double, Devices::Host, int > d4b( 515 );
for( int i = 0; i < 515; i++)
d4b[i] = 3;
Vector< double, Devices::Cuda, int > d4c( 515 );
Vector< double, Devices::Host, int > d4c( 515 );
for( int i = 0; i < 515; i++)
d4c[i] = 25;
Vector< double, Devices::Cuda, int > d5a( 2000 );
Vector< double, Devices::Host, int > d5a( 2000 );
for( int i = 0; i < 2000; i++)
d5a[i] = 1.5;
Vector< double, Devices::Cuda, int > d5b( 2000 );
Vector< double, Devices::Host, int > d5b( 2000 );
for( int i = 0; i < 2000; i++)
d5b[i] = 3;
Vector< double, Devices::Cuda, int > d5c( 2000 );
Vector< double, Devices::Host, int > d5c( 2000 );
for( int i = 0; i < 2000; i++)
d5c[i] = 25;
Vector< double, Devices::Cuda, int > d6a( 5000 );
Vector< double, Devices::Host, int > d6a( 5000 );
for( int i = 0; i < 5000; i++)
d6a[i] = 1.5;
Vector< double, Devices::Cuda, int > d6b( 5000 );
Vector< double, Devices::Host, int > d6b( 5000 );
for( int i = 0; i < 5000; i++)
d6b[i] = 3;
Vector< double, Devices::Cuda, int > d6c( 5000 );
Vector< double, Devices::Host, int > d6c( 5000 );
for( int i = 0; i < 5000; i++)
d6c[i] = 25;
Vector< double, Devices::Cuda, int > dr1( 10 );
Vector< double, Devices::Cuda, int > dr2( 100 );
Vector< double, Devices::Cuda, int > dr3( 510 );
Vector< double, Devices::Cuda, int > dr4( 515 );
Vector< double, Devices::Cuda, int > dr5( 2000 );
Vector< double, Devices::Cuda, int > dr6( 5000 );
Vector< double, Devices::Host, int > dr1( 10 );
Vector< double, Devices::Host, int > dr2( 100 );
Vector< double, Devices::Host, int > dr3( 510 );
Vector< double, Devices::Host, int > dr4( 515 );
Vector< double, Devices::Host, int > dr5( 2000 );
Vector< double, Devices::Host, int > dr6( 5000 );
VectorView< double, Devices::Cuda, int > dv1a( d1a );
VectorView< double, Devices::Cuda, int > dv1b( d1b );
VectorView< double, Devices::Cuda, int > dv1c( d1c );
VectorView< double, Devices::Cuda, int > dv2a( d2a );
VectorView< double, Devices::Cuda, int > dv2b( d2b );
VectorView< double, Devices::Cuda, int > dv2c( d2c );
VectorView< double, Devices::Cuda, int > dv3a( d3a );
VectorView< double, Devices::Cuda, int > dv3b( d3b );
VectorView< double, Devices::Cuda, int > dv3c( d3c );
VectorView< double, Devices::Cuda, int > dv4a( d4a );
VectorView< double, Devices::Cuda, int > dv4b( d4b );
VectorView< double, Devices::Cuda, int > dv4c( d4c );
VectorView< double, Devices::Cuda, int > dv5a( d5a );
VectorView< double, Devices::Cuda, int > dv5b( d5b );
VectorView< double, Devices::Cuda, int > dv5c( d5c );
VectorView< double, Devices::Cuda, int > dv6a( d6a );
VectorView< double, Devices::Cuda, int > dv6b( d6b );
VectorView< double, Devices::Cuda, int > dv6c( d6c );
VectorView< double, Devices::Cuda, int > dvr1( dr1 );
VectorView< double, Devices::Cuda, int > dvr2( dr2 );
VectorView< double, Devices::Cuda, int > dvr3( dr3 );
VectorView< double, Devices::Cuda, int > dvr4( dr4 );
VectorView< double, Devices::Cuda, int > dvr5( dr5 );
VectorView< double, Devices::Cuda, int > dvr6( dr6 );
VectorView< double, Devices::Cuda, int > dvr1_( dr1 );
VectorView< double, Devices::Cuda, int > dvr2_( dr2 );
VectorView< double, Devices::Cuda, int > dvr3_( dr3 );
VectorView< double, Devices::Cuda, int > dvr4_( dr4 );
VectorView< double, Devices::Cuda, int > dvr5_( dr5 );
VectorView< double, Devices::Cuda, int > dvr6_( dr6 );
VectorView< double, Devices::Cuda, int > cvr1( dr1 );
VectorView< double, Devices::Cuda, int > cvr2( dr2 );
VectorView< double, Devices::Cuda, int > cvr3( dr3 );
VectorView< double, Devices::Cuda, int > cvr4( dr4 );
VectorView< double, Devices::Cuda, int > cvr5( dr5 );
VectorView< double, Devices::Cuda, int > cvr6( dr6 );
VectorView< double, Devices::Cuda, int > cvr1_( dr1 );
VectorView< double, Devices::Cuda, int > cvr2_( dr2 );
VectorView< double, Devices::Cuda, int > cvr3_( dr3 );
VectorView< double, Devices::Cuda, int > cvr4_( dr4 );
VectorView< double, Devices::Cuda, int > cvr5_( dr5 );
VectorView< double, Devices::Cuda, int > cvr6_( dr6 );
VectorView< double, Devices::Host, int > dv1a( d1a );
VectorView< double, Devices::Host, int > dv1b( d1b );
VectorView< double, Devices::Host, int > dv1c( d1c );
VectorView< double, Devices::Host, int > dv2a( d2a );
VectorView< double, Devices::Host, int > dv2b( d2b );
VectorView< double, Devices::Host, int > dv2c( d2c );
VectorView< double, Devices::Host, int > dv3a( d3a );
VectorView< double, Devices::Host, int > dv3b( d3b );
VectorView< double, Devices::Host, int > dv3c( d3c );
VectorView< double, Devices::Host, int > dv4a( d4a );
VectorView< double, Devices::Host, int > dv4b( d4b );
VectorView< double, Devices::Host, int > dv4c( d4c );
VectorView< double, Devices::Host, int > dv5a( d5a );
VectorView< double, Devices::Host, int > dv5b( d5b );
VectorView< double, Devices::Host, int > dv5c( d5c );
VectorView< double, Devices::Host, int > dv6a( d6a );
VectorView< double, Devices::Host, int > dv6b( d6b );
VectorView< double, Devices::Host, int > dv6c( d6c );
VectorView< double, Devices::Host, int > dvr1( dr1 );
VectorView< double, Devices::Host, int > dvr2( dr2 );
VectorView< double, Devices::Host, int > dvr3( dr3 );
VectorView< double, Devices::Host, int > dvr4( dr4 );
VectorView< double, Devices::Host, int > dvr5( dr5 );
VectorView< double, Devices::Host, int > dvr6( dr6 );
VectorView< double, Devices::Host, int > dvr1_( dr1 );
VectorView< double, Devices::Host, int > dvr2_( dr2 );
VectorView< double, Devices::Host, int > dvr3_( dr3 );
VectorView< double, Devices::Host, int > dvr4_( dr4 );
VectorView< double, Devices::Host, int > dvr5_( dr5 );
VectorView< double, Devices::Host, int > dvr6_( dr6 );
VectorView< double, Devices::Host, int > cvr1( dr1 );
VectorView< double, Devices::Host, int > cvr2( dr2 );
VectorView< double, Devices::Host, int > cvr3( dr3 );
VectorView< double, Devices::Host, int > cvr4( dr4 );
VectorView< double, Devices::Host, int > cvr5( dr5 );
VectorView< double, Devices::Host, int > cvr6( dr6 );
VectorView< double, Devices::Host, int > cvr1_( dr1 );
VectorView< double, Devices::Host, int > cvr2_( dr2 );
VectorView< double, Devices::Host, int > cvr3_( dr3 );
VectorView< double, Devices::Host, int > cvr4_( dr4 );
VectorView< double, Devices::Host, int > cvr5_( dr5 );
VectorView< double, Devices::Host, int > cvr6_( dr6 );
std::vector<double> v1a( 10, 1.5 );
std::vector<double> v1b( 10, 3 );
......@@ -151,7 +151,7 @@ int main()
long double tmt1 = 0, tmt2 = 0, tmt3 = 0, tmt4 = 0, tmt5 = 0, tmt6 = 0, tmt1_ = 0, tmt2_ = 0, tmt3_ = 0, tmt4_ = 0, tmt5_ = 0, tmt6_ = 0;
long double ctmt1 = 0, ctmt2 = 0, ctmt3 = 0, ctmt4 = 0, ctmt5 = 0, ctmt6 = 0, ctmt1_ = 0, ctmt2_ = 0, ctmt3_ = 0, ctmt4_ = 0, ctmt5_ = 0, ctmt6_ = 0;
int numb = 200000;
int numb = 1000000;
//pure c
......
......@@ -87,7 +87,7 @@ int main()
long double tmt1 = 0, tmt2 = 0, tmt3 = 0, tmt4 = 0, tmt5 = 0, tmt6 = 0, tmt1_ = 0, tmt2_ = 0, tmt3_ = 0, tmt4_ = 0, tmt5_ = 0, tmt6_ = 0;
long double ctmt1 = 0, ctmt2 = 0, ctmt3 = 0, ctmt4 = 0, ctmt5 = 0, ctmt6 = 0, ctmt1_ = 0, ctmt2_ = 0, ctmt3_ = 0, ctmt4_ = 0, ctmt5_ = 0, ctmt6_ = 0;
int numb = 200000;
int numb = 1000000;
//static vectors
......
......@@ -151,7 +151,7 @@ int main()
long double tmt1 = 0, tmt2 = 0, tmt3 = 0, tmt4 = 0, tmt5 = 0, tmt6 = 0, tmt1_ = 0, tmt2_ = 0, tmt3_ = 0, tmt4_ = 0, tmt5_ = 0, tmt6_ = 0;
long double ctmt1 = 0, ctmt2 = 0, ctmt3 = 0, ctmt4 = 0, ctmt5 = 0, ctmt6 = 0, ctmt1_ = 0, ctmt2_ = 0, ctmt3_ = 0, ctmt4_ = 0, ctmt5_ = 0, ctmt6_ = 0;
int numb = 200000;
int numb = 1000000;
//pure c
......@@ -528,4 +528,4 @@ int main()
std::cout << "pure c" << "\t\t" << ctmt1_ << "\t" << ctmt2_ << "\t" << ctmt3_ << "\t" << ctmt4_ << "\t" << ctmt5_ << "\t" << ctmt6_ << std::endl;
return 0;
}
\ No newline at end of file
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment