Skip to content

Commit 90ea8aa

Browse files
committed
cat: updates in vector-FLOPs benchmarks
Include output for kernels that only execute scalar floating-point operations. These changes have been tested on the Intel Sapphire Rapids and IBM POWER10 architectures.
1 parent 9f6f6e6 commit 90ea8aa

File tree

11 files changed

Lines changed: 692 additions & 325 deletions

11 files changed

Lines changed: 692 additions & 325 deletions

src/counter_analysis_toolkit/Makefile

Lines changed: 29 additions & 29 deletions
Original file line number | Diff line number | Diff line change
@@ -49,9 +49,9 @@ endif
4949
ifeq ($(ARCH),POWER)
5050
FLOP+=-maltivec -DPOWER
5151
VECSRC=vec_fma_hp.o vec_fma_sp.o vec_fma_dp.o vec_nonfma_hp.o vec_nonfma_sp.o vec_nonfma_dp.o
52-
VEC=-maltivec -O0 -DPOWER
53-
VEC_FMA=-maltivec -O0 -DPOWER
54-
VEC_ALL=$(VEC) -O0 -DPOWER
52+
VEC=-maltivec -DPOWER
53+
VEC_FMA=-maltivec -DPOWER
54+
VEC_ALL=$(VEC) -DPOWER
5555
endif
5656
ifeq ($(ARCH),ARM)
5757
FLOP+=-march=armv8.2-a+fp16 -DARM
@@ -109,58 +109,58 @@ weak_symbols.o: weak_symbols.c vec.h
109109
-$(CC) -c $(CFLAGS) weak_symbols.c
110110

111111
vec.o: vec.c vec.h
112-
-$(CC) -c $(CFLAGS) $(INCFLAGS) -D$(ARCH) $(VEC_META) vec.c
112+
-$(CC) -c $(CFLAGS) $(OPT1) $(INCFLAGS) -D$(ARCH) $(VEC_META) vec.c
113113

114114
vec_scalar_verify.o: vec_scalar_verify.c vec_scalar_verify.h cat_arch.h
115-
-$(CC) -c $(CFLAGS) $(INCFLAGS) $(VEC_ALL) vec_scalar_verify.c
115+
-$(CC) -c $(CFLAGS) $(OPT1) $(INCFLAGS) $(VEC_ALL) vec_scalar_verify.c
116116

117117
vec_fma_hp.o: vec_fma_hp.c vec_scalar_verify.h
118-
-$(CC) -c $(CFLAGS) $(INCFLAGS) $(VEC_FMA) vec_fma_hp.c
118+
-$(CC) -c $(CFLAGS) $(OPT1) $(INCFLAGS) $(VEC_FMA) vec_fma_hp.c
119119

120120
vec_fma_hp: vec_fma_hp.c vec_scalar_verify.h
121-
-$(CC) -c $(CFLAGS) $(INCFLAGS) $(VEC128_FMA) vec_fma_hp.c -o vec_fma_hp-128B.o
122-
-$(CC) -c $(CFLAGS) $(INCFLAGS) $(VEC256_FMA) vec_fma_hp.c -o vec_fma_hp-256B.o
123-
-$(CC) -c $(CFLAGS) $(INCFLAGS) $(VEC512_FMA) vec_fma_hp.c -o vec_fma_hp-512B.o
121+
-$(CC) -c $(CFLAGS) $(OPT1) $(INCFLAGS) $(VEC128_FMA) vec_fma_hp.c -o vec_fma_hp-128B.o
122+
-$(CC) -c $(CFLAGS) $(OPT1) $(INCFLAGS) $(VEC256_FMA) vec_fma_hp.c -o vec_fma_hp-256B.o
123+
-$(CC) -c $(CFLAGS) $(OPT1) $(INCFLAGS) $(VEC512_FMA) vec_fma_hp.c -o vec_fma_hp-512B.o
124124

125125
vec_fma_sp.o: vec_fma_sp.c vec_scalar_verify.h
126-
-$(CC) -c $(CFLAGS) $(INCFLAGS) $(VEC_FMA) vec_fma_sp.c
126+
-$(CC) -c $(CFLAGS) $(OPT1) $(INCFLAGS) $(VEC_FMA) vec_fma_sp.c
127127

128128
vec_fma_sp: vec_fma_sp.c vec_scalar_verify.h
129-
-$(CC) -c $(CFLAGS) $(INCFLAGS) $(VEC128_FMA) vec_fma_sp.c -o vec_fma_sp-128B.o
130-
-$(CC) -c $(CFLAGS) $(INCFLAGS) $(VEC256_FMA) vec_fma_sp.c -o vec_fma_sp-256B.o
131-
-$(CC) -c $(CFLAGS) $(INCFLAGS) $(VEC512_FMA) vec_fma_sp.c -o vec_fma_sp-512B.o
129+
-$(CC) -c $(CFLAGS) $(OPT1) $(INCFLAGS) $(VEC128_FMA) vec_fma_sp.c -o vec_fma_sp-128B.o
130+
-$(CC) -c $(CFLAGS) $(OPT1) $(INCFLAGS) $(VEC256_FMA) vec_fma_sp.c -o vec_fma_sp-256B.o
131+
-$(CC) -c $(CFLAGS) $(OPT1) $(INCFLAGS) $(VEC512_FMA) vec_fma_sp.c -o vec_fma_sp-512B.o
132132

133133
vec_fma_dp.o: vec_fma_dp.c vec_scalar_verify.h
134-
-$(CC) -c $(CFLAGS) $(INCFLAGS) $(VEC_FMA) vec_fma_dp.c
134+
-$(CC) -c $(CFLAGS) $(OPT1) $(INCFLAGS) $(VEC_FMA) vec_fma_dp.c
135135

136136
vec_fma_dp: vec_fma_dp.c vec_scalar_verify.h
137-
-$(CC) -c $(CFLAGS) $(INCFLAGS) $(VEC128_FMA) vec_fma_dp.c -o vec_fma_dp-128B.o
138-
-$(CC) -c $(CFLAGS) $(INCFLAGS) $(VEC256_FMA) vec_fma_dp.c -o vec_fma_dp-256B.o
139-
-$(CC) -c $(CFLAGS) $(INCFLAGS) $(VEC512_FMA) vec_fma_dp.c -o vec_fma_dp-512B.o
137+
-$(CC) -c $(CFLAGS) $(OPT1) $(INCFLAGS) $(VEC128_FMA) vec_fma_dp.c -o vec_fma_dp-128B.o
138+
-$(CC) -c $(CFLAGS) $(OPT1) $(INCFLAGS) $(VEC256_FMA) vec_fma_dp.c -o vec_fma_dp-256B.o
139+
-$(CC) -c $(CFLAGS) $(OPT1) $(INCFLAGS) $(VEC512_FMA) vec_fma_dp.c -o vec_fma_dp-512B.o
140140

141141
vec_nonfma_hp.o: vec_nonfma_hp.c vec_scalar_verify.h
142-
-$(CC) -c $(CFLAGS) $(INCFLAGS) $(VEC) vec_nonfma_hp.c
142+
-$(CC) -c $(CFLAGS) $(OPT1) $(INCFLAGS) $(VEC) vec_nonfma_hp.c
143143

144144
vec_nonfma_hp: vec_nonfma_hp.c vec_scalar_verify.h
145-
-$(CC) -c $(CFLAGS) $(INCFLAGS) $(VEC128) vec_nonfma_hp.c -o vec_nonfma_hp-128B.o
146-
-$(CC) -c $(CFLAGS) $(INCFLAGS) $(VEC256) vec_nonfma_hp.c -o vec_nonfma_hp-256B.o
147-
-$(CC) -c $(CFLAGS) $(INCFLAGS) $(VEC512) vec_nonfma_hp.c -o vec_nonfma_hp-512B.o
145+
-$(CC) -c $(CFLAGS) $(OPT1) $(INCFLAGS) $(VEC128) vec_nonfma_hp.c -o vec_nonfma_hp-128B.o
146+
-$(CC) -c $(CFLAGS) $(OPT1) $(INCFLAGS) $(VEC256) vec_nonfma_hp.c -o vec_nonfma_hp-256B.o
147+
-$(CC) -c $(CFLAGS) $(OPT1) $(INCFLAGS) $(VEC512) vec_nonfma_hp.c -o vec_nonfma_hp-512B.o
148148

149149
vec_nonfma_sp.o: vec_nonfma_sp.c vec_scalar_verify.h
150-
-$(CC) -c $(CFLAGS) $(INCFLAGS) $(VEC) vec_nonfma_sp.c
150+
-$(CC) -c $(CFLAGS) $(OPT1) $(INCFLAGS) $(VEC) vec_nonfma_sp.c
151151

152152
vec_nonfma_sp: vec_nonfma_sp.c vec_scalar_verify.h
153-
-$(CC) -c $(CFLAGS) $(INCFLAGS) $(VEC128) vec_nonfma_sp.c -o vec_nonfma_sp-128B.o
154-
-$(CC) -c $(CFLAGS) $(INCFLAGS) $(VEC256) vec_nonfma_sp.c -o vec_nonfma_sp-256B.o
155-
-$(CC) -c $(CFLAGS) $(INCFLAGS) $(VEC512) vec_nonfma_sp.c -o vec_nonfma_sp-512B.o
153+
-$(CC) -c $(CFLAGS) $(OPT1) $(INCFLAGS) $(VEC128) vec_nonfma_sp.c -o vec_nonfma_sp-128B.o
154+
-$(CC) -c $(CFLAGS) $(OPT1) $(INCFLAGS) $(VEC256) vec_nonfma_sp.c -o vec_nonfma_sp-256B.o
155+
-$(CC) -c $(CFLAGS) $(OPT1) $(INCFLAGS) $(VEC512) vec_nonfma_sp.c -o vec_nonfma_sp-512B.o
156156

157157
vec_nonfma_dp.o: vec_nonfma_dp.c vec_scalar_verify.h
158-
-$(CC) -c $(CFLAGS) $(INCFLAGS) $(VEC) vec_nonfma_dp.c
158+
-$(CC) -c $(CFLAGS) $(OPT1) $(INCFLAGS) $(VEC) vec_nonfma_dp.c
159159

160160
vec_nonfma_dp: vec_nonfma_dp.c vec_scalar_verify.h
161-
-$(CC) -c $(CFLAGS) $(INCFLAGS) $(VEC128) vec_nonfma_dp.c -o vec_nonfma_dp-128B.o
162-
-$(CC) -c $(CFLAGS) $(INCFLAGS) $(VEC256) vec_nonfma_dp.c -o vec_nonfma_dp-256B.o
163-
-$(CC) -c $(CFLAGS) $(INCFLAGS) $(VEC512) vec_nonfma_dp.c -o vec_nonfma_dp-512B.o
161+
-$(CC) -c $(CFLAGS) $(OPT1) $(INCFLAGS) $(VEC128) vec_nonfma_dp.c -o vec_nonfma_dp-128B.o
162+
-$(CC) -c $(CFLAGS) $(OPT1) $(INCFLAGS) $(VEC256) vec_nonfma_dp.c -o vec_nonfma_dp-256B.o
163+
-$(CC) -c $(CFLAGS) $(OPT1) $(INCFLAGS) $(VEC512) vec_nonfma_dp.c -o vec_nonfma_dp-512B.o
164164

165165
cat_collect:
166166
$(CC) $(CFLAGS) -fopenmp $(INCFLAGS) main.c $(wildcard *.o) -o cat_collect $(LDFLAGS)

src/counter_analysis_toolkit/cat_arch.h

Lines changed: 5 additions & 40 deletions
Original file line number | Diff line number | Diff line change
@@ -123,38 +123,17 @@ typedef float64x2_t DP_VEC_TYPE;
123123
#define ADD_VEC_SH(_I_,_J_) vaddh_f16( _I_ , _J_ );
124124
#define MUL_VEC_SH(_I_,_J_) vmulh_f16( _I_ , _J_ );
125125
#define SQRT_VEC_SH(_I_) vsqrth_f16( _I_ );
126-
#define FMA_VEC_SH(_out_,_I_,_J_,_K_) {\
127-
HP_VEC_TYPE arg1 = SET_VEC_PH(_I_);\
128-
HP_VEC_TYPE arg2 = SET_VEC_PH(_J_);\
129-
HP_VEC_TYPE arg3 = SET_VEC_PH(_K_);\
130-
HP_VEC_TYPE argTmp;\
131-
argTmp = FMA_VEC_PH( arg1 , arg2 , arg3 );\
132-
_out_ = ((half*)&(argTmp))[0];\
133-
}
126+
#define FMA_VEC_SH(_out_,_I_,_J_,_K_) _out_ = _I_ * _J_ + _K_;
134127

135128
#define SET_VEC_SS(_I_) _I_ ;
136129
#define ADD_VEC_SS(_I_,_J_) _I_ + _J_ ;
137130
#define MUL_VEC_SS(_I_,_J_) _I_ * _J_ ;
138-
#define FMA_VEC_SS(_out_,_I_,_J_,_K_) {\
139-
SP_VEC_TYPE arg1 = SET_VEC_PS(_I_);\
140-
SP_VEC_TYPE arg2 = SET_VEC_PS(_J_);\
141-
SP_VEC_TYPE arg3 = SET_VEC_PS(_K_);\
142-
SP_VEC_TYPE argTmp;\
143-
argTmp = FMA_VEC_PS( arg1 , arg2 , arg3 );\
144-
_out_ = ((SP_SCALAR_TYPE*)&(argTmp))[0];\
145-
}
131+
#define FMA_VEC_SS(_out_,_I_,_J_,_K_) _out_ = _I_ * _J_ + _K_;
146132

147133
#define SET_VEC_SD(_I_) _I_ ;
148134
#define ADD_VEC_SD(_I_,_J_) _I_ + _J_ ;
149135
#define MUL_VEC_SD(_I_,_J_) _I_ * _J_ ;
150-
#define FMA_VEC_SD(_out_,_I_,_J_,_K_) {\
151-
DP_VEC_TYPE arg1 = SET_VEC_PD(_I_);\
152-
DP_VEC_TYPE arg2 = SET_VEC_PD(_J_);\
153-
DP_VEC_TYPE arg3 = SET_VEC_PD(_K_);\
154-
DP_VEC_TYPE argTmp;\
155-
argTmp = FMA_VEC_PD( arg1 , arg2 , arg3 );\
156-
_out_ = ((DP_SCALAR_TYPE*)&(argTmp))[0];\
157-
}
136+
#define FMA_VEC_SD(_out_,_I_,_J_,_K_) _out_ = _I_ * _J_ + _K_;
158137

159138
#elif defined(POWER)
160139
void test_hp_power_VEC( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp );
@@ -187,25 +166,11 @@ typedef __vector double DP_VEC_TYPE;
187166
#define SET_VEC_SS(_I_) _I_ ;
188167
#define ADD_VEC_SS(_I_,_J_) _I_ + _J_ ;
189168
#define MUL_VEC_SS(_I_,_J_) _I_ * _J_ ;
190-
#define FMA_VEC_SS(_out_,_I_,_J_,_K_) {\
191-
SP_VEC_TYPE arg1 = SET_VEC_PS(_I_);\
192-
SP_VEC_TYPE arg2 = SET_VEC_PS(_J_);\
193-
SP_VEC_TYPE arg3 = SET_VEC_PS(_K_);\
194-
SP_VEC_TYPE argTmp;\
195-
argTmp = FMA_VEC_PS( arg1 , arg2 , arg3 );\
196-
_out_ = ((SP_SCALAR_TYPE*)&(argTmp))[0];\
197-
}
169+
#define FMA_VEC_SS(_out_,_I_,_J_,_K_) _out_ = _I_ * _J_ + _K_;
198170

199171
#define SET_VEC_SD(_I_) _I_ ;
200172
#define ADD_VEC_SD(_I_,_J_) _I_ + _J_ ;
201173
#define MUL_VEC_SD(_I_,_J_) _I_ * _J_ ;
202-
#define FMA_VEC_SD(_out_,_I_,_J_,_K_) {\
203-
DP_VEC_TYPE arg1 = SET_VEC_PD(_I_);\
204-
DP_VEC_TYPE arg2 = SET_VEC_PD(_J_);\
205-
DP_VEC_TYPE arg3 = SET_VEC_PD(_K_);\
206-
DP_VEC_TYPE argTmp;\
207-
argTmp = FMA_VEC_PD( arg1 , arg2 , arg3 );\
208-
_out_ = ((DP_SCALAR_TYPE*)&(argTmp))[0];\
209-
}
174+
#define FMA_VEC_SD(_out_,_I_,_J_,_K_) _out_ = _I_ * _J_ + _K_;
210175

211176
#endif

0 commit comments

Comments
 (0)