Commit 15930e8a authored by Tomáš Oberhuber's avatar Tomáš Oberhuber Committed by Jakub Klinkovský
Browse files

Python script for SpMV benchmark results processing is working well.

parent fc488c4d
Loading
Loading
Loading
Loading
+176 −135
Original line number Diff line number Diff line
@@ -14,98 +14,17 @@ def slugify(s):
   return re.sub(r'(?u)[^-\w.]', '', s)

####
# Comparison with Cusparse
def cusparse_comparison( df, formats ):
   """Plot the GPU bandwidth of each format against Cusparse.

   Sorts `df` in place by the Cusparse GPU bandwidth (descending) and, for
   every format other than 'cusparse' and 'CSR', saves a two-panel figure
   (linear scale on top, logarithmic scale below) to Cusparse-bw/<format>.pdf.
   """
   out_dir = "Cusparse-bw"
   if not os.path.exists( out_dir ):
      os.mkdir( out_dir )
   cusparse_key = ('cusparse','GPU','bandwidth')
   df.sort_values( by=[cusparse_key], inplace=True, ascending=False )
   for fmt in formats:
      if fmt in ['cusparse','CSR']:
         continue
      print( f"Writing comparison of {fmt} and Cusparse" )
      fmt_key = (fmt,'GPU','bandwidth')
      matrix_ids = np.arange( df[fmt_key].size )
      fig, (linear_ax, log_ax) = plt.subplots( 2, 1 )
      # The scale is switched before plotting, as in the panel order above
      log_ax.set_yscale( 'log' )
      for ax in (linear_ax, log_ax):
         ax.plot( matrix_ids, df[fmt_key], '-o', ms=1, lw=1 )
         ax.plot( matrix_ids, df[cusparse_key], '-o', ms=1, lw=1 )
         ax.legend( [ fmt, 'Cusparse' ], loc='upper right' )
      linear_ax.set_ylabel( 'Bandwidth in GB/sec' )
      log_ax.set_xlabel( 'Matrix ID - sorted w.r.t. Cusparse' )
      log_ax.set_ylabel( 'Bandwidth in GB/sec' )
      plt.savefig( f"Cusparse-bw/{fmt}.pdf" )
      plt.close( fig )

####
# Comparison with CSR on CPU
def csr_comparison( df, formats ):
   """Plot the GPU bandwidth of each format against CSR on the CPU.

   For every format other than 'cusparse' and 'CSR', `df` is sorted in
   place by that format's GPU bandwidth (descending) and a two-panel figure
   (linear scale on top, logarithmic scale below) is saved to
   CSR-bw/<format>.pdf.

   df      -- table with (format, device, 'bandwidth') columns
   formats -- iterable of format names to process
   """
   os.makedirs( "CSR-bw", exist_ok=True )
   csr_key = ('CSR','CPU','bandwidth')
   for fmt in formats:
      if fmt in ['cusparse','CSR']:
         continue
      print( f"Writing comparison of {fmt} and CSR on CPU" )
      fmt_key = (fmt,'GPU','bandwidth')
      # Work on the frame that was passed in rather than on a module-level global
      df.sort_values( by=[fmt_key], inplace=True, ascending=False )
      fig, (linear_ax, log_ax) = plt.subplots( 2, 1 )
      matrix_ids = np.arange( df[fmt_key].size )
      log_ax.set_yscale( 'log' )
      for ax in (linear_ax, log_ax):
         ax.plot( matrix_ids, df[fmt_key], '-o', ms=1, lw=1 )
         ax.plot( matrix_ids, df[csr_key], '-o', ms=1, lw=1 )
         ax.legend( [ fmt, 'CSR on CPU' ], loc='upper right' )
      log_ax.set_xlabel( f"Matrix ID - sorted w.r.t. {fmt}" )
      log_ax.set_ylabel( 'Bandwidth in GB/sec' )
      plt.savefig( f"CSR-bw/{fmt}.pdf" )
      plt.close( fig )

####
# Comparison of Legacy formats
def legacy_formats_comparison( df, formats ):
   """Plot the GPU bandwidth of each format against its legacy implementation.

   For every (format, legacy format) pair where both names occur in
   `formats`, `df` is sorted in place by the current format's GPU bandwidth
   (descending) and a two-panel figure (linear scale on top, logarithmic
   scale below) is saved to Legacy-bw/<format>.pdf.

   df      -- table with (format, 'GPU', 'bandwidth') columns
   formats -- collection of format names present in `df`
   """
   os.makedirs( "Legacy-bw", exist_ok=True )
   format_pairs = [ ('Ellpack', 'Ellpack Legacy'),
                    ('SlicedEllpack', 'SlicedEllpack Legacy'),
                    ('ChunkedEllpack', 'ChunkedEllpack Legacy'),
                    ('BiEllpack', 'BiEllpack Legacy'),
                    ('CSR< Adaptive >', 'CSR Legacy Adaptive'),
                    ('CSR< Scalar >', 'CSR Legacy Scalar'),
                    ('CSR< Vector >', 'CSR Legacy Vector') ]
   for ref_format, legacy_format in format_pairs:
      if ref_format not in formats or legacy_format not in formats:
         continue
      print( f"Writing comparison of {ref_format} and {legacy_format}" )
      ref_key = (ref_format,'GPU','bandwidth')
      legacy_key = (legacy_format,'GPU','bandwidth')
      # Work on the frame that was passed in rather than on a module-level global
      df.sort_values( by=[ref_key], inplace=True, ascending=False )
      fig, (linear_ax, log_ax) = plt.subplots( 2, 1 )
      matrix_ids = np.arange( df[ref_key].size )
      log_ax.set_yscale( 'log' )
      for ax in (linear_ax, log_ax):
         ax.plot( matrix_ids, df[ref_key], '-o', ms=1, lw=1 )
         ax.plot( matrix_ids, df[legacy_key], '-o', ms=1, lw=1 )
         ax.legend( [ ref_format, legacy_format ], loc='upper right' )
      log_ax.set_xlabel( f"Matrix ID - sorted w.r.t. {ref_format}" )
      log_ax.set_ylabel( 'Bandwidth in GB/sec' )
      plt.savefig( f"Legacy-bw/{ref_format}.pdf" )
      plt.close( fig )

####
# Parse input file
# Read the JSON benchmark log and flatten its 'results' records into a table
print( "Parsing input file...." )
with open('sparse-matrix-benchmark.log') as log_file:
   d = json.load( log_file )
input_df = json_normalize( d, record_path=['results'] )
#input_df.to_html( "orig-pandas.html" )


####
# Create multiindex for columns

# Extract all formats
def get_formats( input_df ):
   """Return the distinct format names found in `input_df`.

   The whole 'format' column is scanned, so a format that is missing from
   the first benchmarked matrix is still reported. Names are returned in
   order of first appearance; an empty frame yields an empty list.

   input_df -- table with a 'format' column, one row per benchmark record
   """
   # dict.fromkeys discards repeats while keeping the first-seen order
   return list( dict.fromkeys( input_df['format'].tolist() ) )

####
# Create multiindex for columns
def get_multiindex( input_df, formats ):
   level1 = [ 'Matrix name', 'rows', 'columns' ]
   level2 = [ '',            '',     ''        ]
   level3 = [ '',            '',     ''        ]
@@ -127,17 +46,22 @@ for format in formats:
            level4.append( speedup )
            df_data[ 0 ].append( ' ' )
   multiColumns = pd.MultiIndex.from_arrays([ level1, level2, level3, level4 ] )
frames = []
   return multiColumns, df_data

####
# Convert input table to better structured one
def convert_data_frame( input_df, multicolumns, df_data, max_rows = -1 ):
   frames = []
   in_idx = 0
   out_idx = 0
max_out_idx = 10
print( "Converting data..." )
   max_out_idx = max_rows
   if max_out_idx == -1:
      max_out_idx = len(input_df.index)
   while in_idx < len(input_df.index) and out_idx < max_out_idx:
      matrixName = input_df.iloc[in_idx]['matrix name']
      df_matrix = input_df.loc[input_df['matrix name'] == matrixName]
      print( out_idx, ":", in_idx, "/", len(input_df.index), ":", matrixName )
   aux_df = pd.DataFrame( df_data, columns = multiColumns, index = [out_idx] )
      aux_df = pd.DataFrame( df_data, columns = multicolumns, index = [out_idx] )
      for index,row in df_matrix.iterrows():
         aux_df.iloc[0]['Matrix name'] = row['matrix name']
         aux_df.iloc[0]['rows']        = row['rows']
@@ -153,20 +77,21 @@ while in_idx < len(input_df.index) and out_idx < max_out_idx:
         #aux_df.iloc[0][(current_format,current_device,'stddev/time')] = row['stddev/time']
         #aux_df.iloc[0][(current_format,current_device,'diff.max')]    = row['CSR Diff.Max']
         #aux_df.iloc[0][(current_format,current_device,'diff.l2')]    = row['CSR Diff.L2']

      frames.append( aux_df )
      out_idx = out_idx + 1
      in_idx = in_idx + len(df_matrix.index)

print( "Merging data into one frame..." )
   result = pd.concat( frames )
   return result

####
# Compute speed-up of particular formats compared to Cusparse on GPU and CSR on CPU
def compute_speedup( df, formats ):
   for format in formats:
      if not format in [ 'cusparse', 'CSR' ]:
         print( 'Adding speed-up for ', format )
      format_bdw_list = result[(format,'GPU','bandwidth')]
      cusparse_bdw_list = result[('cusparse','GPU','bandwidth')]
      csr_bdw_list = result[('CSR','CPU','bandwidth')]
         format_bdw_list = df[(format,'GPU','bandwidth')]
         cusparse_bdw_list = df[('cusparse','GPU','bandwidth')]
         csr_bdw_list = df[('CSR','CPU','bandwidth')]
         cusparse_speedup_list = []
         csr_speedup_list = []
         for ( format_bdw, cusparse_bdw, csr_bdw ) in zip( format_bdw_list, cusparse_bdw_list,csr_bdw_list ):
@@ -186,46 +111,162 @@ for format in formats:
            #else:
            #   cusparse_speedup_list.append('')
            #   csr_speedup_list.append('')
         df[(format,'GPU','speed-up','cusparse')] = cusparse_speedup_list
         df[(format,'GPU','speed-up','CSR CPU')] = csr_speedup_list

      result[(format,'GPU','speed-up','cusparse')] = cusparse_speedup_list
      result[(format,'GPU','speed-up','CSR CPU')] = csr_speedup_list

# Write the whole table to output.html
print( "Writting to HTML file..." )
result.to_html( 'output.html' )

# After the dump, turn single-space placeholder cells into NaN
result.replace( to_replace=' ',value=np.nan,inplace=True)
####
# Comparison with Cusparse
def cusparse_comparison( df, formats, head_size=10 ):
   """Plot and tabulate the GPU bandwidth of each format against Cusparse.

   Sorts `df` in place by the Cusparse GPU bandwidth (descending). For every
   format other than 'cusparse' and 'CSR', the rows with a missing bandwidth
   for that format are left out, a two-panel figure (linear scale on top,
   logarithmic scale below) is saved to Cusparse-bw/<format>.pdf and the
   first `head_size` rows are written to Cusparse-bw/<format>-head.html.

   df        -- table with (format, device, 'bandwidth') columns
   formats   -- iterable of format names to process
   head_size -- number of leading rows exported to the HTML table
   """
   os.makedirs( "Cusparse-bw", exist_ok=True )
   cusparse_key = ('cusparse','GPU','bandwidth')
   df.sort_values( by=[cusparse_key], inplace=True, ascending=False )
   for fmt in formats:
      if fmt in ['cusparse','CSR']:
         continue
      print( f"Writing comparison of {fmt} and Cusparse" )
      fmt_key = (fmt,'GPU','bandwidth')
      filtered_df = df.dropna( subset=[(fmt,'GPU','bandwidth','')] )
      matrix_ids = np.arange( filtered_df[fmt_key].size )
      fig, (linear_ax, log_ax) = plt.subplots( 2, 1 )
      log_ax.set_yscale( 'log' )
      for ax in (linear_ax, log_ax):
         ax.plot( matrix_ids, filtered_df[fmt_key], '-o', ms=1, lw=1 )
         ax.plot( matrix_ids, filtered_df[cusparse_key], '-o', ms=1, lw=1 )
         ax.legend( [ fmt, 'Cusparse' ], loc='upper right' )
      linear_ax.set_ylabel( 'Bandwidth in GB/sec' )
      log_ax.set_xlabel( 'Matrix ID - sorted w.r.t. Cusparse' )
      log_ax.set_ylabel( 'Bandwidth in GB/sec' )
      plt.savefig( f"Cusparse-bw/{fmt}.pdf" )
      plt.close( fig )
      # Keep only the compared format and the two baselines in the table.
      # drop() returns a new frame here; mutating the result of head() in
      # place is the pattern pandas warns about.
      head_df = filtered_df.head( head_size )
      for other in formats:
         if other not in ['cusparse','CSR',fmt]:
            print( f"Droping {other}..." )
            head_df = head_df.drop( labels=other, axis='columns', level=0 )
      head_df.to_html( f"Cusparse-bw/{fmt}-head.html" )

####
# Generate report = tables and figures
# Comparison with CSR on CPU
def csr_comparison( df, formats, head_size=10 ):
   """Plot and tabulate the GPU bandwidth of each format against CSR on the CPU.

   For every format other than 'cusparse' and 'CSR', `df` is sorted in
   place by that format's GPU bandwidth (descending) and the rows with a
   missing bandwidth for that format are left out. A two-panel figure
   (linear scale on top, logarithmic scale below) is saved to
   CSR-bw/<format>.pdf and the first `head_size` rows are written to
   CSR-bw/<format>-head.html.

   df        -- table with (format, device, 'bandwidth') columns
   formats   -- iterable of format names to process
   head_size -- number of leading rows exported to the HTML table
   """
   os.makedirs( "CSR-bw", exist_ok=True )
   csr_key = ('CSR','CPU','bandwidth')
   for fmt in formats:
      if fmt in ['cusparse','CSR']:
         continue
      print( f"Writing comparison of {fmt} and CSR on CPU" )
      fmt_key = (fmt,'GPU','bandwidth')
      # Work on the frame that was passed in rather than on a module-level global
      df.sort_values( by=[fmt_key], inplace=True, ascending=False )
      # Same filtering as in the Cusparse comparison; it also defines the
      # frame the head table is taken from
      filtered_df = df.dropna( subset=[(fmt,'GPU','bandwidth','')] )
      fig, (linear_ax, log_ax) = plt.subplots( 2, 1 )
      matrix_ids = np.arange( filtered_df[fmt_key].size )
      log_ax.set_yscale( 'log' )
      for ax in (linear_ax, log_ax):
         ax.plot( matrix_ids, filtered_df[fmt_key], '-o', ms=1, lw=1 )
         ax.plot( matrix_ids, filtered_df[csr_key], '-o', ms=1, lw=1 )
         ax.legend( [ fmt, 'CSR on CPU' ], loc='upper right' )
      log_ax.set_xlabel( f"Matrix ID - sorted w.r.t. {fmt}" )
      log_ax.set_ylabel( 'Bandwidth in GB/sec' )
      plt.savefig( f"CSR-bw/{fmt}.pdf" )
      plt.close( fig )
      # Keep only the compared format and the two baselines in the table
      head_df = filtered_df.head( head_size )
      for other in formats:
         if other not in ['cusparse','CSR',fmt]:
            print( f"Droping {other}..." )
            head_df = head_df.drop( labels=other, axis='columns', level=0 )
      head_df.to_html( f"CSR-bw/{fmt}-head.html" )

#cusparse_comparison( result, formats )
#csr_comparison( result, formats )
#legacy_formats_comparison( result, formats )
####
# Comparison of Legacy formats
def legacy_formats_comparison( df, formats, head_size=10 ):
   """Plot and tabulate the GPU bandwidth of formats against their legacy versions.

   For every (format, legacy format) pair where both names occur in
   `formats`, `df` is sorted in place by the current format's GPU bandwidth
   (descending) and a two-panel figure (linear scale on top, logarithmic
   scale below) is saved to Legacy-bw/<format>.pdf. The first `head_size`
   rows that have a bandwidth for the current format are written to
   Legacy-bw/<format>-head.html.

   df        -- table with (format, 'GPU', 'bandwidth') columns
   formats   -- collection of format names present in `df`
   head_size -- number of leading rows exported to the HTML table
   """
   os.makedirs( "Legacy-bw", exist_ok=True )
   format_pairs = [ ('Ellpack', 'Ellpack Legacy'),
                    ('SlicedEllpack', 'SlicedEllpack Legacy'),
                    ('ChunkedEllpack', 'ChunkedEllpack Legacy'),
                    ('BiEllpack', 'BiEllpack Legacy'),
                    ('CSR< Adaptive >', 'CSR Legacy Adaptive'),
                    ('CSR< Scalar >', 'CSR Legacy Scalar'),
                    ('CSR< Vector >', 'CSR Legacy Vector') ]
   for ref_format, legacy_format in format_pairs:
      if ref_format not in formats or legacy_format not in formats:
         continue
      print( f"Writing comparison of {ref_format} and {legacy_format}" )
      ref_key = (ref_format,'GPU','bandwidth')
      legacy_key = (legacy_format,'GPU','bandwidth')
      df.sort_values( by=[ref_key], inplace=True, ascending=False )
      fig, (linear_ax, log_ax) = plt.subplots( 2, 1 )
      matrix_ids = np.arange( df[ref_key].size )
      log_ax.set_yscale( 'log' )
      for ax in (linear_ax, log_ax):
         ax.plot( matrix_ids, df[ref_key], '-o', ms=1, lw=1 )
         ax.plot( matrix_ids, df[legacy_key], '-o', ms=1, lw=1 )
         ax.legend( [ ref_format, legacy_format ], loc='upper right' )
      log_ax.set_xlabel( f"Matrix ID - sorted w.r.t. {ref_format}" )
      log_ax.set_ylabel( 'Bandwidth in GB/sec' )
      plt.savefig( f"Legacy-bw/{ref_format}.pdf" )
      plt.close( fig )
      # The head table is built from rows that have a value for the current
      # format, filtered the same way as in the Cusparse comparison
      filtered_df = df.dropna( subset=[(ref_format,'GPU','bandwidth','')] )
      head_df = filtered_df.head( head_size )
      # Both members of the pair stay in the table, next to the baselines;
      # the table is named after the current format, like the figure
      kept_formats = ['cusparse','CSR',ref_format,legacy_format]
      for other in formats:
         if other not in kept_formats:
            print( f"Droping {other}..." )
            head_df = head_df.drop( labels=other, axis='columns', level=0 )
      head_df.to_html( f"Legacy-bw/{ref_format}-head.html" )

####
# Comparison of speed-up w.r.t. Cusparse
def cusparse_speedup_comparison( df, formats, head_size=10 ):
   """Plot and tabulate the speed-up of each format relative to Cusparse.

   For every format other than 'cusparse' and 'CSR', the rows with a missing
   GPU bandwidth for that format are left out and the rest is sorted by the
   format's speed-up over Cusparse (descending). A two-panel figure (linear
   scale on top, logarithmic scale below) with a reference line at 1 is
   saved to Cusparse-speed-up/<format>.pdf and the first `head_size` rows
   are written to Cusparse-speed-up/<format>-head.html. `df` itself is not
   modified.

   df        -- table with (format, 'GPU', 'bandwidth') and
                (format, 'GPU', 'speed-up', 'cusparse') columns
   formats   -- iterable of format names to process
   head_size -- number of leading rows exported to the HTML table
   """
   os.makedirs( "Cusparse-speed-up", exist_ok=True )
   for fmt in formats:
      if fmt in ['cusparse','CSR']:
         continue
      print( f"Writing comparison of speed-up of {fmt} compared to Cusparse" )
      speedup_key = (fmt,'GPU','speed-up','cusparse')
      # Filter on the bandwidth column directly; no helper column is added
      # to the caller's frame
      filtered_df = df.dropna( subset=[(fmt,'GPU','bandwidth','')] )
      filtered_df = filtered_df.sort_values( by=[speedup_key], ascending=False )
      fig, (linear_ax, log_ax) = plt.subplots( 2, 1 )
      size = len( filtered_df[speedup_key].index )
      matrix_ids = np.arange( size )
      # Constant line at 1 = the Cusparse baseline
      bar = np.full( size, 1 )
      log_ax.set_yscale( 'log' )
      for ax in (linear_ax, log_ax):
         ax.plot( matrix_ids, filtered_df[speedup_key], '-o', ms=1, lw=1 )
         ax.plot( matrix_ids, bar, '-', ms=1, lw=1 )
         ax.legend( [ fmt, 'Cusparse' ], loc='upper right' )
      log_ax.set_xlabel( f"Matrix ID - sorted w.r.t. {fmt}" )
      # The plotted quantity is a ratio, not a bandwidth
      log_ax.set_ylabel( 'Speed-up' )
      plt.savefig( f"Cusparse-speed-up/{fmt}.pdf" )
      plt.close( fig )
      # Keep only the compared format and the two baselines in the table
      head_df = filtered_df.head( head_size )
      for other in formats:
         if other not in ['cusparse','CSR',fmt]:
            print( f"Droping {other}..." )
            head_df = head_df.drop( labels=other, axis='columns', level=0 )
      head_df.to_html( f"Cusparse-speed-up/{fmt}-head.html" )

####
# Parse input file: read the JSON log and flatten its 'results' records
print( "Parsing input file...." )
with open('sparse-matrix-benchmark.log') as log_file:
   d = json.load( log_file )
input_df = json_normalize( d, record_path=['results'] )
#input_df.to_html( "orig-pandas.html" )

# Build the column layout from the format names
formats = get_formats( input_df )
multicolumns, df_data = get_multiindex( input_df, formats )

# Restructure the table (limited to 200 output rows) and add the speed-ups
print( "Converting data..." )
result = convert_data_frame( input_df, multicolumns, df_data, max_rows=200 )
compute_speedup( result, formats )

print( "Writting to HTML file..." )
result.to_html( 'output.html' )

# Single-space placeholder cells become NaN before the reports are generated
result = result.replace( to_replace=' ', value=np.nan )

####
# Generate report = tables and figures
head_size = 10
for make_report in ( cusparse_comparison,
                     csr_comparison,
                     legacy_formats_comparison,
                     cusparse_speedup_comparison ):
   make_report( result, formats, head_size )