Commit 3bbf860d authored by Tomáš Oberhuber

Added documentation for Segments namespace.

parent c2ee4999
1 merge request: !105 TO/matrices-adaptive-csr
ADD_SUBDIRECTORY( Segments )
IF( BUILD_CUDA )
CUDA_ADD_EXECUTABLE( SortingExampleCuda SortingExample.cu)
ADD_CUSTOM_COMMAND( COMMAND SortingExampleCuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SortingExample.out OUTPUT SortingExample.out )
......
File added
set( COMMON_EXAMPLES
SegmentsExample_General
)
if( BUILD_CUDA )
foreach( target IN ITEMS ${COMMON_EXAMPLES} )
cuda_add_executable( ${target}-cuda ${target}.cu OPTIONS )
add_custom_command( COMMAND ${target}-cuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/${target}.out OUTPUT ${target}.out )
set( CUDA_OUTPUTS ${CUDA_OUTPUTS} ${target}.out )
endforeach()
else()
foreach( target IN ITEMS ${COMMON_EXAMPLES} )
add_executable( ${target} ${target}.cpp )
add_custom_command( COMMAND ${target} > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/${target}.out OUTPUT ${target}.out )
set( HOST_OUTPUTS ${HOST_OUTPUTS} ${target}.out )
endforeach()
endif()
IF( BUILD_CUDA )
ADD_CUSTOM_TARGET( RunSegmentsExamples-cuda ALL DEPENDS ${CUDA_OUTPUTS} )
ELSE()
ADD_CUSTOM_TARGET( RunSegmentsExamples ALL DEPENDS ${HOST_OUTPUTS} )
ENDIF()
\ No newline at end of file
#include <iostream>
#include <functional>
#include <TNL/Containers/Array.h>
#include <TNL/Containers/Vector.h>
#include <TNL/Algorithms/Segments/CSR.h>
#include <TNL/Algorithms/Segments/Ellpack.h>
#include <TNL/Devices/Host.h>
#include <TNL/Devices/Cuda.h>
template< typename Segments >
void SegmentsExample()
{
   using DeviceType = typename Segments::DeviceType;
   using IndexType = typename Segments::IndexType;

   /***
    * Create segments with given segments sizes.
    */
   Segments segments{ 1, 2, 3, 4, 5 };
   std::cout << "Segments sizes are: " << segments << std::endl;

   /***
    * Allocate array for the segments.
    */
   TNL::Containers::Array< double, DeviceType > data( segments.getStorageSize() );
   data = 0.0;

   /***
    * Insert data into particular segments.
    */
   auto data_view = data.getView();
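   // The lambda receives the segment index, the element's local index within its segment,
   // its global index in the underlying array and a `compute` flag (not used in this example).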
   segments.forAllElements( [=] __cuda_callable__ ( IndexType segmentIdx, IndexType localIdx, IndexType globalIdx, bool& compute ) mutable {
      if( localIdx <= segmentIdx )
         data_view[ globalIdx ] = segmentIdx;
   } );

   /***
    * Print the data managed by the segments.
    */
   auto fetch = [=] __cuda_callable__ ( IndexType globalIdx ) -> double { return data_view[ globalIdx ]; };
   printSegments( segments, fetch, std::cout );

   /***
    * Compute sums of elements in particular segments.
    */
   TNL::Containers::Vector< double, DeviceType, IndexType > sums( segments.getSegmentsCount() );
   auto sums_view = sums.getView();
   auto sum_fetch = [=] __cuda_callable__ ( IndexType segmentIdx, IndexType localIdx, IndexType globalIdx, bool& compute ) -> double {
      return data_view[ globalIdx ];
   };
   auto keep = [=] __cuda_callable__ ( const IndexType& segmentIdx, const double& value ) mutable {
      sums_view[ segmentIdx ] = value;
   };
   segments.reduceAllSegments( sum_fetch, std::plus<>{}, keep, 0.0 );
   std::cout << "The sums are: " << sums << std::endl;
}
int main( int argc, char* argv[] )
{
   using HostCSR = TNL::Algorithms::Segments::CSR< TNL::Devices::Host, int >;
   using HostEllpack = TNL::Algorithms::Segments::Ellpack< TNL::Devices::Host, int >;
   using CudaCSR = TNL::Algorithms::Segments::CSR< TNL::Devices::Cuda, int >;
   using CudaEllpack = TNL::Algorithms::Segments::Ellpack< TNL::Devices::Cuda, int >;

   std::cout << "Example of CSR segments on host: " << std::endl;
   SegmentsExample< HostCSR >();

   std::cout << "Example of Ellpack segments on host: " << std::endl;
   SegmentsExample< HostEllpack >();

#ifdef HAVE_CUDA
   std::cout << "Example of CSR segments on CUDA GPU: " << std::endl;
   SegmentsExample< CudaCSR >();

   std::cout << "Example of Ellpack segments on CUDA GPU: " << std::endl;
   SegmentsExample< CudaEllpack >();
#endif
   return EXIT_SUCCESS;
}
SegmentsExample_General.cpp
\ No newline at end of file
/***************************************************************************
                          _NamespaceDoxy.h  -  description
                             -------------------
    begin                : Apr 1, 2021
    copyright            : (C) 2021 by Tomas Oberhuber
    email                : tomas.oberhuber@fjfi.cvut.cz
 ***************************************************************************/

/* See Copyright Notice in tnl/Copyright */
#pragma once
namespace TNL {
namespace Algorithms {
/**
* \brief Namespace holding segments data structures.
*Segments* represent a data structure for the manipulation of several local arrays (also denoted as segments),
which in general have different sizes. All the local arrays are supposed to be allocated within one contiguous global array.
The segments data structure offers a mapping between the indexes of particular local arrays and the indexes
of the global array. In addition, one can perform parallel operations, like parallel for or flexible reduction, on particular
local arrays.
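The following minimal sketch (based on the complete example included at the end of this page) creates CSR segments from the sizes of the particular local arrays and queries the number of segments and the size of the global array needed to store them:
\code{.cpp}
#include <iostream>
#include <TNL/Algorithms/Segments/CSR.h>
#include <TNL/Devices/Host.h>

int main()
{
   // Five segments with sizes 1, 2, 3, 4 and 5.
   TNL::Algorithms::Segments::CSR< TNL::Devices::Host, int > segments{ 1, 2, 3, 4, 5 };

   // Number of segments and size of the global array needed to store all of them.
   std::cout << "Segments count: " << segments.getSegmentsCount() << std::endl;
   std::cout << "Storage size:   " << segments.getStorageSize() << std::endl;
}
\endcode
For CSR, the storage size is simply the sum of the segment sizes (here 15); other formats, such as Ellpack, may allocate additional padding elements.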
A typical example of the use of *segments* is the implementation of sparse matrices. A sparse matrix like the following
\f[
\left(
\begin{array}{ccccc}
1 & 0 & 2 & 0 & 0 \\
0 & 0 & 5 & 0 & 0 \\
3 & 4 & 7 & 9 & 0 \\
0 & 0 & 0 & 0 & 12 \\
0 & 0 & 15 & 17 & 20
\end{array}
\right)
\f]
is usually compressed first, which means that the zero elements are omitted, resulting in the following "matrix":
\f[
\begin{array}{ccccc}
1 & 2 \\
5 \\
3 & 4 & 7 & 9 \\
12 \\
15 & 17 & 20
\end{array}
\f]
We also have to store the column index of each matrix element in a "matrix" like this:
\f[
\begin{array}{ccccc}
0 & 2 \\
2 \\
0 & 1 & 2 & 3 \\
4 \\
2 & 3 & 4
\end{array}
\f]
Such "matrices" can be stored in memory in a row-wise manner in one contiguous array because of the performance reasons. The first "matrix" (i.e. values of the matrix elements)
would be stored as follows
\f[
\begin{array}{|cc|c|cccc|c|cc|} 1 & 2 & 5 & 3 & 4 & 7 & 9 & 12 & 15 & 17 & 20 \end{array}
\f]
and the second one (i.e. column indexes of the matrix values) as follows
\f[
\begin{array}{|cc|c|cccc|c|cc|} 0 & 2 & 2 & 0 & 1 & 2 & 3 & 4 & 2 & 3 & 4 \end{array}
\f]
What we see above is the so-called [CSR sparse matrix format](https://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_row_(CSR,_CRS_or_Yale_format)).
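To make this layout concrete, the following standalone sketch (plain C++, not using TNL) stores the two arrays above together with row offsets that delimit the particular rows; the offsets 0, 2, 3, 7, 8, 11 follow directly from the row sizes 2, 1, 4, 1 and 3:
\code{.cpp}
#include <iostream>
#include <vector>

int main()
{
   // Values and column indexes of the nonzero elements, stored row by row (CSR).
   std::vector< double > values        { 1, 2,  5,  3, 4, 7, 9,  12,  15, 17, 20 };
   std::vector< int >    columnIndexes { 0, 2,  2,  0, 1, 2, 3,   4,   2,  3,  4 };

   // rowOffsets[ i ] is the position of the first element of the i-th row (segment);
   // the i-th row occupies the range [ rowOffsets[ i ], rowOffsets[ i + 1 ] ).
   std::vector< int > rowOffsets{ 0, 2, 3, 7, 8, 11 };

   for( std::size_t row = 0; row < rowOffsets.size() - 1; row++ ) {
      std::cout << "Row " << row << ": ";
      for( int i = rowOffsets[ row ]; i < rowOffsets[ row + 1 ]; i++ )
         std::cout << values[ i ] << " (column " << columnIndexes[ i ] << ")  ";
      std::cout << std::endl;
   }
}
\endcode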
CSR is the most popular format for the storage of sparse matrices and it is designed for high performance. However, it may not be the most efficient format for storing
sparse matrices on GPUs. Therefore, many other formats have been developed to achieve better performance. These formats often use a different layout
of the matrix elements in memory. They especially have to deal with two difficulties:
1. Efficient storage of the matrix elements in memory, to fulfill the requirements of coalesced memory accesses on GPUs or good spatial locality
for efficient use of caches on CPUs.
2. Efficient mapping of GPU threads to different matrix rows.
Working with this kind of data structure is not limited to sparse matrices. We could name at least a few other applications:
1. Efficient storage of [graphs](https://en.wikipedia.org/wiki/Graph_(discrete_mathematics)) - one segment represents one graph node,
the elements in one segment are the indexes of its neighbors.
2. [Unstructured numerical meshes](https://en.wikipedia.org/wiki/Types_of_mesh) - an unstructured numerical mesh is, in fact, a graph.
3. [Particle-in-cell method](https://en.wikipedia.org/wiki/Particle-in-cell) - one segment represents one cell, the elements in one segment
are the indexes of the particles in that cell.
4. [K-means clustering](https://en.wikipedia.org/wiki/K-means_clustering) - each segment represents one cluster, the elements represent the vectors
belonging to the given cluster.
5. [Hashing](https://arxiv.org/abs/1907.02900) - segments are particular rows of the hash table, the elements in a segment correspond to colliding
hashed elements.
In general, segments can be used for problems that somehow correspond to a 2D data structure where each row can have a different size and we need
to perform miscellaneous operations within the rows. The name *segments* comes from segmented parallel reduction or
[segmented scan (prefix-sum)](https://en.wikipedia.org/wiki/Segmented_scan).
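As an illustration, the following serial sketch performs a segmented reduction (a per-segment sum) over the CSR arrays shown above; in TNL the same operation is performed in parallel by methods such as `reduceAllSegments`, as the example below demonstrates:
\code{.cpp}
#include <iostream>
#include <vector>

int main()
{
   std::vector< double > values    { 1, 2,  5,  3, 4, 7, 9,  12,  15, 17, 20 };
   std::vector< int >    rowOffsets{ 0, 2, 3, 7, 8, 11 };

   // Reduce (sum) each segment separately: this is the essence of segmented reduction.
   for( std::size_t segment = 0; segment < rowOffsets.size() - 1; segment++ ) {
      double sum = 0.0;
      for( int i = rowOffsets[ segment ]; i < rowOffsets[ segment + 1 ]; i++ )
         sum += values[ i ];
      std::cout << "Sum of segment " << segment << " is " << sum << std::endl;
   }
}
\endcode
For the arrays above, this prints the sums 3, 5, 23, 12 and 52. The parallel versions differ mainly in how threads are mapped to segments, which is exactly where the different segment formats (CSR, Ellpack, ...) come into play.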
The following example demonstrates the essence of *segments* in TNL:
\includelineno Algorithms/Segments/SegmentsExample_General.cpp
The result looks as follows:
\include SegmentsExample_General.out
*/
namespace Segments {
} // namespace Segments
} // namespace Algorithms
} // namespace TNL