Commit fc488c4d authored by Tomáš Oberhuber's avatar Tomáš Oberhuber Committed by Jakub Klinkovský
Browse files

Implementing Python script to analyze SpMV benchmarks results.

parent 35a017fa
Loading
Loading
Loading
Loading
+156 −24
Original line number Diff line number Diff line
#!/usr/bin/python3

import os
import json
import pandas as pd
from pandas.io.json import json_normalize
import matplotlib.pyplot as plt
import numpy as np


####
# Helper function
def slugify(s):
   s = str(s).strip().replace(' ', '_')
   return re.sub(r'(?u)[^-\w.]', '', s)

####
# Comparison with Cusparse
def cusparse_comparison( df, formats ):
   if not os.path.exists("Cusparse-bw"):
      os.mkdir("Cusparse-bw")
   df.sort_values(by=[('cusparse','GPU','bandwidth')],inplace=True,ascending=False)
   for format in formats:
      if not format in ['cusparse','CSR']:
         print( f"Writing comparison of {format} and Cusparse" )
         t = np.arange(df[(format,'GPU','bandwidth')].size )
         fig, axs = plt.subplots( 2, 1 )
         axs[0].plot( t, df[(format,'GPU','bandwidth')], '-o', ms=1, lw=1 )
         axs[0].plot( t, df[('cusparse','GPU','bandwidth')], '-o', ms=1, lw=1 )
         axs[0].legend( [ format, 'Cusparse' ], loc='upper right' )
         axs[0].set_ylabel( 'Bandwidth in GB/sec' )
         axs[1].set_yscale( 'log' )
         axs[1].plot( t, df[(format,'GPU','bandwidth')], '-o', ms=1, lw=1 )
         axs[1].plot( t, df[('cusparse','GPU','bandwidth')], '-o', ms=1, lw=1 )
         axs[1].legend( [ format, 'Cusparse' ], loc='upper right' )
         axs[1].set_xlabel( 'Matrix ID - sorted w.r.t. Cusparse' )
         axs[1].set_ylabel( 'Bandwidth in GB/sec' )
         plt.savefig( f"Cusparse-bw/{format}.pdf" )
         plt.close(fig)

####
# Comparison with CSR on CPU
def csr_comparison( df, formats ):
   if not os.path.exists("CSR-bw"):
      os.mkdir("CSR-bw")
   for format in formats:
      if not format in ['cusparse','CSR']:
         print( f"Writing comparison of {format} and CSR on CPU" )
         result.sort_values(by=[(format,'GPU','bandwidth')],inplace=True,ascending=False)
         fig, axs = plt.subplots( 2, 1 )
         t = np.arange(result[(format,'GPU','bandwidth')].size )
         axs[0].plot( t, result[(format,'GPU','bandwidth')], '-o', ms=1, lw=1 )
         axs[0].plot( t, result[('CSR','CPU','bandwidth')], '-o', ms=1, lw=1 )
         axs[0].legend( [ format, 'CSR on CPU' ], loc='upper right' )
         axs[1].set_yscale( 'log' )
         axs[1].plot( t, result[(format,'GPU','bandwidth')], '-o', ms=1, lw=1 )
         axs[1].plot( t, result[('CSR','CPU','bandwidth')], '-o', ms=1, lw=1 )
         axs[1].legend( [ format, 'CSR on CPU' ], loc='upper right' )
         axs[1].set_xlabel( f"Matrix ID - sorted w.r.t. {format}" )
         axs[1].set_ylabel( 'Bandwidth in GB/sec' )
         plt.savefig( f"CSR-bw/{format}.pdf")
         plt.close(fig)

####
# Comparison of Legacy formats
def legacy_formats_comparison( df, formats ):
   if not os.path.exists("Legacy-bw"):
      os.mkdir("Legacy-bw")
   for ref_format, legacy_format in [ ('Ellpack', 'Ellpack Legacy'),
                                    ('SlicedEllpack', 'SlicedEllpack Legacy'),
                                    ('ChunkedEllpack', 'ChunkedEllpack Legacy'),
                                    ('BiEllpack', 'BiEllpack Legacy'),
                                    ('CSR< Adaptive >', 'CSR Legacy Adaptive'),
                                    ('CSR< Scalar >', 'CSR Legacy Scalar'),
                                    ('CSR< Vector >', 'CSR Legacy Vector') ]:
      if ref_format in formats and legacy_format in formats:
         print( f"Writing comparison of {ref_format} and {legacy_format}" )
         result.sort_values(by=[(ref_format,'GPU','bandwidth')],inplace=True,ascending=False)
         fig, axs = plt.subplots( 2, 1 )
         t = np.arange(result[(ref_format,'GPU','bandwidth')].size )
         axs[0].plot( t, result[(ref_format,'GPU','bandwidth')], '-o', ms=1, lw=1 )
         axs[0].plot( t, result[(legacy_format,'GPU','bandwidth')], '-o', ms=1, lw=1 )
         axs[0].legend( [ ref_format, legacy_format ], loc='upper right' )
         axs[1].set_yscale( 'log' )
         axs[1].plot( t, result[(ref_format,'GPU','bandwidth')], '-o', ms=1, lw=1 )
         axs[1].plot( t, result[(legacy_format,'GPU','bandwidth')], '-o', ms=1, lw=1 )
         axs[1].legend( [ ref_format, legacy_format ], loc='upper right' )
         axs[1].set_xlabel( f"Matrix ID - sorted w.r.t. {ref_format}" )
         axs[1].set_ylabel( 'Bandwidth in GB/sec' )
         plt.savefig( f"Legacy-bw/{ref_format}.pdf")
         plt.close(fig)

####
# Parse input file
print( "Parsing input file...." )
@@ -17,16 +97,19 @@ with open('sparse-matrix-benchmark.log') as f:
input_df = json_normalize( d, record_path=['results'] )
#input_df.to_html( "orig-pandas.html" )


####
# Create multiindex for columns

# Get format names - TODO: the first benchmark might not have all of them
matrixName = input_df.iloc[0]['matrix name']
df_matrix = input_df.loc[input_df['matrix name'] == matrixName]
formats = df_matrix.loc[:,'format']
formats = df_matrix.loc[:,'format'].values.tolist()
formats = list(dict.fromkeys(formats)) # remove duplicates
level1 = [ 'Matrix name', 'rows', 'columns' ]
level2 = [ '',            '',     ''        ]
level3 = [ '',            '',     ''        ]
level4 = [ '',            '',     ''        ]
df_data = [[ ' ',' ',' ']]
for format in formats:
   for device in ['CPU','GPU']:
@@ -34,13 +117,21 @@ for format in formats:
         level1.append( format )
         level2.append( device )
         level3.append( data )
         level4.append( '' )
         df_data[ 0 ].append( ' ' )
multiColumns = pd.MultiIndex.from_arrays([ level1, level2, level3 ] )
   if not format in [ 'cusparse', 'CSR' ]:
      for speedup in [ 'cusparse', 'CSR CPU']:
         level1.append( format )
         level2.append( 'GPU' )
         level3.append( 'speed-up')
         level4.append( speedup )
         df_data[ 0 ].append( ' ' )
multiColumns = pd.MultiIndex.from_arrays([ level1, level2, level3, level4 ] )
frames = []

in_idx = 0
out_idx = 0
max_out_idx = 50
max_out_idx = 10
print( "Converting data..." )
while in_idx < len(input_df.index) and out_idx < max_out_idx:
   matrixName = input_df.iloc[in_idx]['matrix name']
@@ -54,7 +145,7 @@ while in_idx < len(input_df.index) and out_idx < max_out_idx:
      current_format = row['format']
      current_device = row['device']
      #print( current_format + " / " + current_device )
      aux_df.iloc[0][(current_format,current_device,'bandwidth')]   = row['bandwidth']
      aux_df.iloc[0][(current_format,current_device,'bandwidth','')]   = pd.to_numeric(row['bandwidth'], errors='coerce')
      #aux_df.iloc[0][(current_format,current_device,'time')]        = row['time']
      #aux_df.iloc[0][(current_format,current_device,'speed-up')]    = row['speedup']
      #aux_df.iloc[0][(current_format,current_device,'non-zeros')]   = row['non-zeros']
@@ -70,30 +161,71 @@ while in_idx < len(input_df.index) and out_idx < max_out_idx:
print( "Merging data into one frame..." )
result = pd.concat( frames )

print( "Setting data types..." )
for format in formats:
   for device in ['CPU','GPU']:
      #df['eps'] = pd.to_numeric(df['eps'], errors='coerce')
      print(result[(format,device,'bandwidth')].toList())
      result[(format,device,'bandwidth')] = pd.to_numeric( result[(format,device,'bandwidth')], errors='coerce' )
      #result[(format,device,'time')].astype('float64')
      #result[(format,device,'speed-up')].astype('float64')
      #result[(format,device,'non-zeros')].astype('int64')
      #result[(format,device,'stddev')].astype('float64')
      #result[(format,device,'stddev/time')].astype('float64')
      #result[(format,device,'diff.max')].astype('float64')
      #result[(format,device,'diff.l2')].astype('float64')
   if not format in [ 'cusparse', 'CSR' ]:
      print( 'Adding speed-up for ', format )
      format_bdw_list = result[(format,'GPU','bandwidth')]
      cusparse_bdw_list = result[('cusparse','GPU','bandwidth')]
      csr_bdw_list = result[('CSR','CPU','bandwidth')]
      cusparse_speedup_list = []
      csr_speedup_list = []
      for ( format_bdw, cusparse_bdw, csr_bdw ) in zip( format_bdw_list, cusparse_bdw_list,csr_bdw_list ):
         try:
            cusparse_speedup_list.append( format_bdw / cusparse_bdw )
         except:
            cusparse_speedup_list.append('')
         try:
            csr_speedup_list.append( format_bdw / csr_bdw )
         except:
            csr_speedup_list.append('')
         #print( f'**{type(format_bdw)}** -- {type(5.2)}' )
         #if type(format_bdw) == "<class 'numpy.float64'>":
         #   print( f'##########{format_bdw / cusparse_bdw}' )
         #   cusparse_speedup_list.append( format_bdw / cusparse_bdw )
         #   csr_speedup_list.append( format_bdw / csr_bdw )
         #else:
         #   cusparse_speedup_list.append('')
         #   csr_speedup_list.append('')

      result[(format,'GPU','speed-up','cusparse')] = cusparse_speedup_list
      result[(format,'GPU','speed-up','CSR CPU')] = csr_speedup_list

print( "Writting to HTML file..." )
result.to_html( 'output.html' )

result.replace( to_replace=' ',value=np.nan,inplace=True)

####
# Generate report = tables and figures

#result.sort_values(by=[('cusparse','GPU','bandwidth')],inplace=True,ascending=False)
#for format in formats:
#   cusparse_bw = result[('cusparse','GPU','bandwidth')].toList()
#   format_bw = result[(format,'GPU','bandwidth')].toList()
#
#cusparse_comparison( result, formats )
#csr_comparison( result, formats )
#legacy_formats_comparison( result, formats )

#for format in formats:
#   result.sort_values(by=[(format,'GPU','bandwidth')],inplace=True,ascending=False)
####
# Comparison of speed-up w.r.t. Cusparse
if not os.path.exists("Cusparse-speed-up"):
   os.mkdir("Cusparse-speed-up")
for format in formats:
   if not format in ['cusparse','CSR']:
      print( f"Writing comparison of speed-up of {format} compared to Cusparse" )
      result['tmp'] = result[(format, 'GPU','bandwidth')]
      filtered_df=result.dropna(subset=['rows'])
      filtered_df.to_html( 'tmp.html')
      break
      filtered_df.sort_values(by=[(format,'GPU','speed-up','cusparse')],inplace=True,ascending=False)
      fig, axs = plt.subplots( 2, 1 )
      size = result[(format,'GPU','bandwidth')].size
      t = np.arange( size )
      bar = np.full( size, 1 )
      axs[0].plot( t, filtered_df[(format,'GPU','speed-up','cusparse')], '-o', ms=1, lw=1 )
      axs[0].plot( t, bar, '-', ms=1, lw=1 )
      axs[0].legend( [ format, 'Cusparse' ], loc='upper right' )
      axs[1].set_yscale( 'log' )
      axs[1].plot( t, result[(format,'GPU','speed-up','cusparse')], '-o', ms=1, lw=1 )
      axs[1].plot( t, bar, '-', ms=1, lw=1 )
      axs[1].legend( [ format, 'Cusparse' ], loc='upper right' )
      axs[1].set_xlabel( f"Matrix ID - sorted w.r.t. {format}" )
      axs[1].set_ylabel( 'Bandwidth in GB/sec' )
      plt.savefig( f"Cusparse-speed-up/{format}.pdf")
      plt.close(fig)