Loading MemoryInfo.h 0 → 100644 +202 −0 Original line number Diff line number Diff line #pragma once // References: // - https://stackoverflow.com/a/64166/4180822 // - https://lemire.me/blog/2020/03/03/calling-free-or-delete/ // - https://stackoverflow.com/questions/15529643/what-does-malloc-trim0-really-mean #include <stdlib.h> #include <stdio.h> #include <string.h> #include <sys/types.h> #include <sys/sysinfo.h> #include <malloc.h> inline long getTotalVirtualMemory() { struct sysinfo memInfo; sysinfo (&memInfo); long totalVirtualMem = memInfo.totalram; // Add other values in next statement to avoid int overflow on right hand side... totalVirtualMem += memInfo.totalswap; totalVirtualMem *= memInfo.mem_unit; return totalVirtualMem; } inline long getUsedVirtualMemory() { struct sysinfo memInfo; sysinfo (&memInfo); long virtualMemUsed = memInfo.totalram - memInfo.freeram; // Add other values in next statement to avoid int overflow on right hand side... virtualMemUsed += memInfo.totalswap - memInfo.freeswap; virtualMemUsed *= memInfo.mem_unit; return virtualMemUsed; } inline long parseLine(char* line) { // This assumes that a digit will be found and the line ends in " kB". int i = strlen(line); const char* p = line; while (*p <'0' || *p > '9') p++; line[i-3] = '\0'; return atol(p); } // virtual memory currently used by the calling process inline long getSelfVirtualMemory() { // explicitly release unused memory malloc_trim(0); FILE* file = fopen("/proc/self/status", "r"); long result = -1; char line[128]; while (fgets(line, 128, file) != NULL){ if (strncmp(line, "VmSize:", 7) == 0){ // convert from kB to B result = parseLine(line) * 1024; break; } } fclose(file); return result; } inline long getTotalPhysicalMemory() { struct sysinfo memInfo; sysinfo (&memInfo); long totalPhysMem = memInfo.totalram; //Multiply in next statement to avoid int overflow on right hand side... totalPhysMem *= memInfo.mem_unit; return totalPhysMem; } inline long getUsedPhysicalMemory() { struct sysinfo memInfo; sysinfo (&memInfo); long physMemUsed = memInfo.totalram - memInfo.freeram; //Multiply in next statement to avoid int overflow on right hand side... physMemUsed *= memInfo.mem_unit; return physMemUsed; } inline long getSelfPhysicalMemory() { // explicitly release unused memory malloc_trim(0); FILE* file = fopen("/proc/self/status", "r"); long result = -1; char line[128]; while (fgets(line, 128, file) != NULL){ if (strncmp(line, "VmRSS:", 6) == 0){ // convert from kB to B result = parseLine(line) * 1024; break; } } fclose(file); return result; } #include <TNL/Benchmarks/Benchmarks.h> #include <TNL/Config/ParameterContainer.h> #include <TNL/Containers/StaticVector.h> struct MemoryBenchmarkResult : public TNL::Benchmarks::BenchmarkResult { using HeaderElements = TNL::Benchmarks::Logging::HeaderElements; using RowElements = TNL::Benchmarks::Logging::RowElements; double memory = std::numeric_limits<double>::quiet_NaN(); double memstddev = std::numeric_limits<double>::quiet_NaN(); virtual HeaderElements getTableHeader() const override { return HeaderElements({ "time", "stddev", "stddev/time", "bandwidth", "speedup", "memory", "memstddev", "memstddev/memory" }); } virtual RowElements getRowElements() const override { RowElements elements; elements << time << stddev << stddev / time << bandwidth; if( speedup != 0 ) elements << speedup; else elements << "N/A"; elements << memory << memstddev << memstddev / memory; return elements; } }; template< typename Mesh > MemoryBenchmarkResult testMemoryUsage( const TNL::Config::ParameterContainer& parameters, const Mesh& mesh ) { const long baseline = getSelfPhysicalMemory(); const Mesh m1 = mesh; const long check1 = getSelfPhysicalMemory(); const Mesh m2 = mesh; const long check2 = getSelfPhysicalMemory(); const Mesh m3 = mesh; const long check3 = getSelfPhysicalMemory(); const Mesh m4 = mesh; const long check4 = getSelfPhysicalMemory(); const Mesh m5 = mesh; const long check5 = getSelfPhysicalMemory(); const Mesh m6 = mesh; const long check6 = getSelfPhysicalMemory(); const Mesh m7 = mesh; const long check7 = getSelfPhysicalMemory(); const Mesh m8 = mesh; const long check8 = getSelfPhysicalMemory(); const Mesh m9 = mesh; const long check9 = getSelfPhysicalMemory(); const Mesh m10 = mesh; const long check10 = getSelfPhysicalMemory(); TNL::Containers::StaticVector< 10, long > data; data[0] = check1 - baseline; data[1] = check2 - check1; data[2] = check3 - check2; data[3] = check4 - check3; data[4] = check5 - check4; data[5] = check6 - check5; data[6] = check7 - check6; data[7] = check8 - check7; data[8] = check9 - check8; data[9] = check10 - check9; const double mean = TNL::sum( data ) / (double) data.getSize(); const double stddev = 1.0 / std::sqrt( data.getSize() - 1 ) * TNL::l2Norm( data - mean ); MemoryBenchmarkResult result; result.memory = mean / 1024.0 / 1024.0; // MiB result.memstddev = stddev / 1024.0 / 1024.0; // MiB return result; } MeshBenchmarks.h +25 −18 Original line number Diff line number Diff line Loading @@ -16,7 +16,7 @@ #include <TNL/Meshes/Mesh.h> #include <TNL/Meshes/Geometry/getEntityCenter.h> #include <TNL/Meshes/Geometry/getEntityMeasure.h> #include <TNL/Meshes/TypeResolver/TypeResolver.h> #include <TNL/Meshes/TypeResolver/resolveMeshType.h> #include <TNL/Pointers/DevicePointer.h> #include <TNL/Algorithms/ParallelFor.h> #include <TNL/Algorithms/TemplateStaticFor.h> Loading @@ -27,8 +27,8 @@ #endif #include "MeshOrdering.h" #include "MeshConfigs.h" #include "MemoryInfo.h" using namespace TNL; using namespace TNL::Meshes; Loading Loading @@ -100,30 +100,37 @@ struct MeshBenchmarks return false; } // collect memory usage const MemoryBenchmarkResult meminfo = testMemoryUsage( parameters, mesh ); // natural ordering metadataColumns.back() = {"order", "nat"}; benchmark.setMetadataColumns( metadataColumns ); dispatchAlgorithms( benchmark, parameters, mesh ); // TODO: pass result to the dispatchAlgorithms to append the timings MemoryBenchmarkResult result = meminfo; auto noop = [](){}; benchmark.time< TNL::Devices::Host >( "CPU", noop, result ); // dispatchAlgorithms( benchmark, parameters, mesh ); // k-d tree ordering metadataColumns.back() = {"order", "kdt"}; benchmark.setMetadataColumns( metadataColumns ); using KdTreeOrdering = MeshOrdering< Mesh, KdTreeOrdering >; KdTreeOrdering kd; kd.reorder( mesh ); dispatchAlgorithms( benchmark, parameters, mesh ); // metadataColumns.back() = {"order", "kdt"}; // benchmark.setMetadataColumns( metadataColumns ); // using KdTreeOrdering = MeshOrdering< Mesh, KdTreeOrdering >; // KdTreeOrdering kd; // kd.reorder( mesh ); // dispatchAlgorithms( benchmark, parameters, mesh ); #ifdef HAVE_CUDA cudaProfilerStart(); #endif //#ifdef HAVE_CUDA // cudaProfilerStart(); //#endif // RCM ordering metadataColumns.back() = {"order", "rcm"}; benchmark.setMetadataColumns( metadataColumns ); using RCMOrdering = MeshOrdering< Mesh, CuthillMcKeeOrdering<> >; RCMOrdering rcm; rcm.reorder( mesh ); dispatchAlgorithms( benchmark, parameters, mesh ); // metadataColumns.back() = {"order", "rcm"}; // benchmark.setMetadataColumns( metadataColumns ); // using RCMOrdering = MeshOrdering< Mesh, CuthillMcKeeOrdering<> >; // RCMOrdering rcm; // rcm.reorder( mesh ); // dispatchAlgorithms( benchmark, parameters, mesh ); #ifdef HAVE_CUDA cudaProfilerStop(); Loading Loading
MemoryInfo.h 0 → 100644 +202 −0 Original line number Diff line number Diff line #pragma once // References: // - https://stackoverflow.com/a/64166/4180822 // - https://lemire.me/blog/2020/03/03/calling-free-or-delete/ // - https://stackoverflow.com/questions/15529643/what-does-malloc-trim0-really-mean #include <stdlib.h> #include <stdio.h> #include <string.h> #include <sys/types.h> #include <sys/sysinfo.h> #include <malloc.h> inline long getTotalVirtualMemory() { struct sysinfo memInfo; sysinfo (&memInfo); long totalVirtualMem = memInfo.totalram; // Add other values in next statement to avoid int overflow on right hand side... totalVirtualMem += memInfo.totalswap; totalVirtualMem *= memInfo.mem_unit; return totalVirtualMem; } inline long getUsedVirtualMemory() { struct sysinfo memInfo; sysinfo (&memInfo); long virtualMemUsed = memInfo.totalram - memInfo.freeram; // Add other values in next statement to avoid int overflow on right hand side... virtualMemUsed += memInfo.totalswap - memInfo.freeswap; virtualMemUsed *= memInfo.mem_unit; return virtualMemUsed; } inline long parseLine(char* line) { // This assumes that a digit will be found and the line ends in " kB". int i = strlen(line); const char* p = line; while (*p <'0' || *p > '9') p++; line[i-3] = '\0'; return atol(p); } // virtual memory currently used by the calling process inline long getSelfVirtualMemory() { // explicitly release unused memory malloc_trim(0); FILE* file = fopen("/proc/self/status", "r"); long result = -1; char line[128]; while (fgets(line, 128, file) != NULL){ if (strncmp(line, "VmSize:", 7) == 0){ // convert from kB to B result = parseLine(line) * 1024; break; } } fclose(file); return result; } inline long getTotalPhysicalMemory() { struct sysinfo memInfo; sysinfo (&memInfo); long totalPhysMem = memInfo.totalram; //Multiply in next statement to avoid int overflow on right hand side... totalPhysMem *= memInfo.mem_unit; return totalPhysMem; } inline long getUsedPhysicalMemory() { struct sysinfo memInfo; sysinfo (&memInfo); long physMemUsed = memInfo.totalram - memInfo.freeram; //Multiply in next statement to avoid int overflow on right hand side... physMemUsed *= memInfo.mem_unit; return physMemUsed; } inline long getSelfPhysicalMemory() { // explicitly release unused memory malloc_trim(0); FILE* file = fopen("/proc/self/status", "r"); long result = -1; char line[128]; while (fgets(line, 128, file) != NULL){ if (strncmp(line, "VmRSS:", 6) == 0){ // convert from kB to B result = parseLine(line) * 1024; break; } } fclose(file); return result; } #include <TNL/Benchmarks/Benchmarks.h> #include <TNL/Config/ParameterContainer.h> #include <TNL/Containers/StaticVector.h> struct MemoryBenchmarkResult : public TNL::Benchmarks::BenchmarkResult { using HeaderElements = TNL::Benchmarks::Logging::HeaderElements; using RowElements = TNL::Benchmarks::Logging::RowElements; double memory = std::numeric_limits<double>::quiet_NaN(); double memstddev = std::numeric_limits<double>::quiet_NaN(); virtual HeaderElements getTableHeader() const override { return HeaderElements({ "time", "stddev", "stddev/time", "bandwidth", "speedup", "memory", "memstddev", "memstddev/memory" }); } virtual RowElements getRowElements() const override { RowElements elements; elements << time << stddev << stddev / time << bandwidth; if( speedup != 0 ) elements << speedup; else elements << "N/A"; elements << memory << memstddev << memstddev / memory; return elements; } }; template< typename Mesh > MemoryBenchmarkResult testMemoryUsage( const TNL::Config::ParameterContainer& parameters, const Mesh& mesh ) { const long baseline = getSelfPhysicalMemory(); const Mesh m1 = mesh; const long check1 = getSelfPhysicalMemory(); const Mesh m2 = mesh; const long check2 = getSelfPhysicalMemory(); const Mesh m3 = mesh; const long check3 = getSelfPhysicalMemory(); const Mesh m4 = mesh; const long check4 = getSelfPhysicalMemory(); const Mesh m5 = mesh; const long check5 = getSelfPhysicalMemory(); const Mesh m6 = mesh; const long check6 = getSelfPhysicalMemory(); const Mesh m7 = mesh; const long check7 = getSelfPhysicalMemory(); const Mesh m8 = mesh; const long check8 = getSelfPhysicalMemory(); const Mesh m9 = mesh; const long check9 = getSelfPhysicalMemory(); const Mesh m10 = mesh; const long check10 = getSelfPhysicalMemory(); TNL::Containers::StaticVector< 10, long > data; data[0] = check1 - baseline; data[1] = check2 - check1; data[2] = check3 - check2; data[3] = check4 - check3; data[4] = check5 - check4; data[5] = check6 - check5; data[6] = check7 - check6; data[7] = check8 - check7; data[8] = check9 - check8; data[9] = check10 - check9; const double mean = TNL::sum( data ) / (double) data.getSize(); const double stddev = 1.0 / std::sqrt( data.getSize() - 1 ) * TNL::l2Norm( data - mean ); MemoryBenchmarkResult result; result.memory = mean / 1024.0 / 1024.0; // MiB result.memstddev = stddev / 1024.0 / 1024.0; // MiB return result; }
MeshBenchmarks.h +25 −18 Original line number Diff line number Diff line Loading @@ -16,7 +16,7 @@ #include <TNL/Meshes/Mesh.h> #include <TNL/Meshes/Geometry/getEntityCenter.h> #include <TNL/Meshes/Geometry/getEntityMeasure.h> #include <TNL/Meshes/TypeResolver/TypeResolver.h> #include <TNL/Meshes/TypeResolver/resolveMeshType.h> #include <TNL/Pointers/DevicePointer.h> #include <TNL/Algorithms/ParallelFor.h> #include <TNL/Algorithms/TemplateStaticFor.h> Loading @@ -27,8 +27,8 @@ #endif #include "MeshOrdering.h" #include "MeshConfigs.h" #include "MemoryInfo.h" using namespace TNL; using namespace TNL::Meshes; Loading Loading @@ -100,30 +100,37 @@ struct MeshBenchmarks return false; } // collect memory usage const MemoryBenchmarkResult meminfo = testMemoryUsage( parameters, mesh ); // natural ordering metadataColumns.back() = {"order", "nat"}; benchmark.setMetadataColumns( metadataColumns ); dispatchAlgorithms( benchmark, parameters, mesh ); // TODO: pass result to the dispatchAlgorithms to append the timings MemoryBenchmarkResult result = meminfo; auto noop = [](){}; benchmark.time< TNL::Devices::Host >( "CPU", noop, result ); // dispatchAlgorithms( benchmark, parameters, mesh ); // k-d tree ordering metadataColumns.back() = {"order", "kdt"}; benchmark.setMetadataColumns( metadataColumns ); using KdTreeOrdering = MeshOrdering< Mesh, KdTreeOrdering >; KdTreeOrdering kd; kd.reorder( mesh ); dispatchAlgorithms( benchmark, parameters, mesh ); // metadataColumns.back() = {"order", "kdt"}; // benchmark.setMetadataColumns( metadataColumns ); // using KdTreeOrdering = MeshOrdering< Mesh, KdTreeOrdering >; // KdTreeOrdering kd; // kd.reorder( mesh ); // dispatchAlgorithms( benchmark, parameters, mesh ); #ifdef HAVE_CUDA cudaProfilerStart(); #endif //#ifdef HAVE_CUDA // cudaProfilerStart(); //#endif // RCM ordering metadataColumns.back() = {"order", "rcm"}; benchmark.setMetadataColumns( metadataColumns ); using RCMOrdering = MeshOrdering< Mesh, CuthillMcKeeOrdering<> >; RCMOrdering rcm; rcm.reorder( mesh ); dispatchAlgorithms( benchmark, parameters, mesh ); // metadataColumns.back() = {"order", "rcm"}; // benchmark.setMetadataColumns( metadataColumns ); // using RCMOrdering = MeshOrdering< Mesh, CuthillMcKeeOrdering<> >; // RCMOrdering rcm; // rcm.reorder( mesh ); // dispatchAlgorithms( benchmark, parameters, mesh ); #ifdef HAVE_CUDA cudaProfilerStop(); Loading