diff --git a/src/implementation/core/memory-operations.h b/src/implementation/core/memory-operations.h
index 00871261f2cc9c343878b5717cfda4bd2b4436d3..dae87f9fa932883f76ff1d9191befc580c2e6cb7 100644
--- a/src/implementation/core/memory-operations.h
+++ b/src/implementation/core/memory-operations.h
@@ -70,11 +70,21 @@ bool setMemoryHost( Element* data,
 }
 
 #ifdef HAVE_CUDA
-template< typename Element >
+template< typename Element, typename Index >
 __global__ void setVectorValueCudaKernel( Element* data,
-                                          const Element value )
+                                          const Index size,
+                                          const Element value,
+                                          const Index elementsPerThread )
 {
-   data[ blockIdx. x * blockDim. x + threadIdx. x ] = value;
+   // Only the x dimension of the launch is used. Block b covers the range that
+   // starts at b * blockDim.x * elementsPerThread; each thread begins at its own
+   // threadIdx.x inside that range and steps by blockDim.x, so it writes at
+   // most elementsPerThread elements and never touches an index >= size.
+   Index position = blockDim. x * blockIdx. x * elementsPerThread + threadIdx. x;
+   for( Index written( 0 );
+        written < elementsPerThread && position < size;
+        written ++, position += blockDim. x )
+      data[ position ] = value;
 }
 #endif
 
@@ -85,11 +95,12 @@ bool setMemoryCuda( Element* data,
 {
 #ifdef HAVE_CUDA
       dim3 blockSize, gridSize;
-      blockSize. x = 512;
+      blockSize. x = 32;
       int blocksNumber = ceil( ( double ) size / ( double ) blockSize. x );
       int elementsPerThread = ceil( ( double ) blocksNumber / ( double ) maxCudaGridSize );
-
-      setVectorValueCudaKernel<<< gridSize, blockSize >>>( data, size, value, elementsPerThread );
+      gridSize. x = Min( blocksNumber, maxCudaGridSize );
+      // Execution configuration is <<< grid, block >>>: gridSize. x blocks of blockSize. x threads, each thread writing up to elementsPerThread elements.
+      setVectorValueCudaKernel<<< gridSize, blockSize >>>( data, size, value, elementsPerThread );
 
       return checkCudaDevice;
 #else
diff --git a/tests/unit-tests/core/cuda/tnlCudaMemoryOperationsTester.h b/tests/unit-tests/core/cuda/tnlCudaMemoryOperationsTester.h
index 22892819909416610f94639bbb9ebaa1a059a022..b65a8091a588a1367e3955d60bd5c0814e597888 100644
--- a/tests/unit-tests/core/cuda/tnlCudaMemoryOperationsTester.h
+++ b/tests/unit-tests/core/cuda/tnlCudaMemoryOperationsTester.h
@@ -43,6 +43,10 @@ class tnlCudaMemoryOperationsTester : public CppUnit :: TestCase
                                 "allocationTest",
                                 &tnlCudaMemoryOperationsTester :: allocationTest )
                                );
+      suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCudaMemoryOperationsTester >(
+                                "copyTest",
+                                &tnlCudaMemoryOperationsTester :: copyTest )
+                               );
       suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCudaMemoryOperationsTester >(
                                 "smallMemorySetTest",
                                 &tnlCudaMemoryOperationsTester :: smallMemorySetTest )
@@ -65,9 +69,25 @@ class tnlCudaMemoryOperationsTester : public CppUnit :: TestCase
       CPPUNIT_ASSERT( checkCudaDevice );
    };
 
+   void copyTest()
+   {
+      const int length = 1 << 20;
+      int *reference, *roundTrip, *onDevice;
+      allocateMemoryHost( reference, length );
+      allocateMemoryHost( roundTrip, length );
+      allocateMemoryCuda( onDevice, length );
+      setMemoryHost( reference, 13, length );                // fill the source buffer with 13
+      copyMemoryHostToCuda( onDevice, reference, length );   // host -> device
+      copyMemoryCudaToHost( roundTrip, onDevice, length );   // device -> host
+      CPPUNIT_ASSERT( compareMemoryHost( reference, roundTrip, length ) );   // data must come back unchanged
+      freeMemoryCuda( onDevice );
+      freeMemoryHost( roundTrip );
+      freeMemoryHost( reference );
+   };
+
    void smallMemorySetTest()
    {
-      const int size( 100 );
+      const int size( 1024 );
       int *hostData, *deviceData;
       allocateMemoryHost( hostData, size );
       allocateMemoryCuda( deviceData, size );
@@ -78,11 +98,13 @@ class tnlCudaMemoryOperationsTester : public CppUnit :: TestCase
       CPPUNIT_ASSERT( checkCudaDevice );
       for( int i = 0; i < size; i ++ )
          CPPUNIT_ASSERT( hostData[ i ] == 13 );
+      freeMemoryHost( hostData );
+      freeMemoryCuda( deviceData );
    };
 
    void bigMemorySetTest()
    {
-      const int size( 2.7 * maxCudaGridSize * maxCudaBlockSize );
+      const int size( 1.1 * maxCudaGridSize * maxCudaBlockSize );
       cout << "Size = " << size << endl;
       int *hostData, *deviceData;
       allocateMemoryHost( hostData, size );
@@ -95,9 +117,11 @@ class tnlCudaMemoryOperationsTester : public CppUnit :: TestCase
       for( int i = 0; i < size; i ++ )
       {
          if( hostData[ i ] != 13 )
-            cout << " i = " << i << endl;
+            cout << " i = " << i << " " << hostData[ i ] << endl;
          CPPUNIT_ASSERT( hostData[ i ] == 13 );
       }
+      freeMemoryHost( hostData );
+      freeMemoryCuda( deviceData );
    };
 
 };