Skip to content

Commit a74efdb

Browse files
hyesoon and hyojongk authored
igpu transfer (#46)
Co-authored-by: hyojongk <[email protected]>
1 parent ad51942 commit a74efdb

29 files changed

+217
-164
lines changed

def/general.stat.def

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ DEF_STAT(EXE_TIME, COUNT, NO_RATIO)
3838
DEF_STAT(NUM_REPEAT, COUNT, NO_RATIO)
3939

4040
DEF_STAT(CYC_COUNT_X86, COUNT, NO_RATIO)
41-
DEF_STAT(CYC_COUNT_PTX, COUNT, NO_RATIO)
41+
DEF_STAT(CYC_COUNT_ACC, COUNT, NO_RATIO)
4242

4343
DEF_STAT(AVG_BLOCK_EXE_CYCLE, COUNT, NO_RATIO)
4444
DEF_STAT(AVG_BLOCK_EXE_CYCLE_BASE, COUNT, NO_RATIO)

macsimComponent.cpp

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,16 @@ macsimComponent::macsimComponent(ComponentId_t id, Params& params)
6060
m_clock_freq,
6161
new Clock::Handler<macsimComponent>(this, &macsimComponent::ticReceived));
6262

63-
m_ptx_core = params.find<bool>("ptx_core", 0);
63+
if (params.find<bool>("ptx_core", 0)) {
64+
m_acc_type = PTX_CORE;
65+
m_acc_core = 1;
66+
} else if (params.find<bool>("igpu_core", 0)) {
67+
m_acc_type = IGPU_CORE;
68+
m_acc_core = 1;
69+
} else {
70+
m_acc_core = 0;
71+
m_acc_type = NO_ACC;
72+
}
6473
m_num_link = params.find<uint32_t>("num_link", 1);
6574
configureLinks(params, tc);
6675

@@ -150,7 +159,7 @@ void macsimComponent::configureLinks(SST::Params& params, TimeConverter* tc) {
150159
m_data_cache_requests.push_back(std::map<uint64_t, uint64_t>());
151160
m_data_cache_responses.push_back(std::set<uint64_t>());
152161

153-
if (m_ptx_core) {
162+
if (m_acc_core) {
154163
auto ccache_link = loadUserSubComponent<Interfaces::SimpleMem>(
155164
"core" + std::to_string(l) + "-ccache", ComponentInfo::SHARE_NONE, tc,
156165
new Interfaces::SimpleMem::Handler<macsimComponent>(
@@ -194,7 +203,7 @@ void macsimComponent::configureLinks(SST::Params& params, TimeConverter* tc) {
194203
m_data_cache_request_counters = std::vector<uint64_t>(m_num_link, 0);
195204
m_data_cache_response_counters = std::vector<uint64_t>(m_num_link, 0);
196205

197-
if (m_ptx_core) {
206+
if (m_acc_core) {
198207
m_const_cache_request_counters = std::vector<uint64_t>(m_num_link, 0);
199208
m_const_cache_response_counters = std::vector<uint64_t>(m_num_link, 0);
200209
m_texture_cache_request_counters = std::vector<uint64_t>(m_num_link, 0);
@@ -275,7 +284,7 @@ void macsimComponent::setup() {
275284
new Callback<macsimComponent, bool, int, uint64_t>(
276285
this, &macsimComponent::strobeDataCacheRespQ);
277286

278-
if (m_ptx_core) {
287+
if (m_acc_core) {
279288
CallbackSendConstCacheRequest* scr =
280289
new Callback<macsimComponent, void, int, uint64_t, uint64_t, int>(
281290
this, &macsimComponent::sendConstCacheRequest);
@@ -347,7 +356,7 @@ bool macsimComponent::ticReceived(Cycle_t) {
347356
// Debugging
348357
if (m_cycle % 100000 == 0) {
349358
for (unsigned int l = 0; l < m_num_link; ++l) {
350-
if (m_ptx_core) {
359+
if (m_acc_core) {
351360
MSC_DEBUG(
352361
"Core[%2d] I$: (%lu, %lu), D$: (%lu, %lu) C$: (%lu, %lu), T$: (%lu, "
353362
"%lu)\n",

macsimComponent.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,8 @@ class macsimComponent : public SST::Component
105105

106106
macsim_c *m_macsim;
107107
bool m_sim_running;
108-
bool m_ptx_core;
108+
bool m_acc_core;
109+
ACC_Type m_acc_type;
109110
bool m_cube_connected;
110111
bool m_debug_all;
111112
int64_t m_debug_addr;

src/config.h

Lines changed: 74 additions & 47 deletions
Large diffs are not rendered by default.

src/dram_ctrl.cc

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -578,9 +578,9 @@ void dram_ctrl_c::send(void) {
578578
for (auto I = m_output_buffer->begin(), E = m_output_buffer->end(); I != E;
579579
++I) {
580580
mem_req_s* req = (*I);
581-
if (req_type_allowed[req->m_ptx] == false) continue;
581+
if (req_type_allowed[req->m_acc] == false) continue;
582582

583-
req_type_checked[req->m_ptx] = true;
583+
req_type_checked[req->m_acc] = true;
584584
req->m_msg_type = NOC_FILL;
585585

586586
bool insert_packet =
@@ -764,7 +764,7 @@ void dram_ctrl_c::channel_schedule_data(void) {
764764
m_current_list[bank]->m_req->m_id);
765765
ASSERT(m_current_list[bank]->m_state == DRAM_DATA);
766766
m_data_ready[bank] = acquire_data_bus(
767-
ii, m_current_list[bank]->m_size, m_current_list[bank]->m_req->m_ptx);
767+
ii, m_current_list[bank]->m_size, m_current_list[bank]->m_req->m_acc);
768768
m_data_avail[bank] = ULLONG_MAX;
769769
m_current_list[bank]->m_state = DRAM_DATA_WAIT;
770770
} else

src/exec.cc

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -538,7 +538,7 @@ bool exec_c::exec(int thread_id, int entry, uop_c* uop) {
538538
use_port(thread_id, entry);
539539

540540
// GPU : if we use load-block policy, block current thread due to load instruction
541-
if (uop_latency == -1 && m_ptx_sim &&
541+
if (uop_latency == -1 && m_acc_sim &&
542542
*m_simBase->m_knobs->KNOB_FETCH_ONLY_LOAD_READY) {
543543
m_frontend->set_load_wait(uop->m_thread_id, uop->m_uop_num);
544544

@@ -741,7 +741,7 @@ void exec_c::br_exec(uop_c* uop) {
741741
}
742742

743743
// GPU : stall on branch policy
744-
if (m_ptx_sim && *m_simBase->m_knobs->KNOB_MT_NO_FETCH_BR) {
744+
if (m_acc_sim && *m_simBase->m_knobs->KNOB_MT_NO_FETCH_BR) {
745745
m_frontend->set_br_ready(uop->m_thread_id);
746746
}
747747
}
@@ -793,7 +793,7 @@ void exec_c::run_a_cycle(void) {
793793
if (responseArrived) {
794794
DEBUG_CORE(m_core_id, "key found: 0x%lx, addr = 0x%llx\n", key,
795795
uop->m_vaddr);
796-
if (m_ptx_sim || m_igpu_sim) {
796+
if (m_acc_sim || m_igpu_sim) {
797797
if (uop->m_parent_uop) {
798798
uop_c* puop = uop->m_parent_uop;
799799
++puop->m_num_child_uops_done;
@@ -883,7 +883,7 @@ int exec_c::access_data_cache(uop_c* uop) {
883883
auto i = m_uop_buffer.find(key);
884884
ASSERTM(m_uop_buffer.end() == i, "uop has already been executed!\n");
885885

886-
int block_size = m_ptx_sim ? KNOB(KNOB_L1_SMALL_LINE_SIZE)->getValue()
886+
int block_size = m_acc_sim ? KNOB(KNOB_L1_SMALL_LINE_SIZE)->getValue()
887887
: KNOB(KNOB_L1_LARGE_LINE_SIZE)->getValue();
888888
// Addr block_addr = uop->m_vaddr & ~((uint64_t)block_size-1);
889889

@@ -936,7 +936,7 @@ int exec_c::access_data_cache(uop_c* uop) {
936936
}
937937

938938
int exec_c::access_const_texture_cache(uop_c* uop) {
939-
ASSERT(m_ptx_sim);
939+
ASSERT(m_acc_sim);
940940
ASSERT(uop->m_mem_type == MEM_LD_CM || uop->m_mem_type == MEM_LD_TM);
941941

942942
// assign a unique key to each memory request; it is used later for strobing

src/exec.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -184,8 +184,9 @@ class exec_c
184184
uns16 m_mem_sched_rate; /**< memory schedule rate */
185185
uns16 m_fp_sched_rate; /**< fp schedule rate */
186186
uns8 m_dcache_cycles; /**< L1 cache latency */
187-
bool m_ptx_sim; /**< gpu simulation */
187+
bool m_acc_sim; /**< accelerator simulation */
188188
bool m_igpu_sim; /**< intel gpu simulation */
189+
bool m_ptx_sim; /**< PTX simulation */
189190
int m_latency[NUM_UOP_TYPES]; /**< latency map */
190191
Counter m_cur_core_cycle; /**< current core cycle */
191192
int m_max_port[max_ALLOCQ]; /**< maximum port */

src/frontend.cc

Lines changed: 22 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,8 @@ void frontend_c::run_a_cycle(void) {
191191
// fetch every KNOB_FETCH_RATIO cycle
192192
// CPU : every cycle
193193
// NVIDIA G80 : 1/4 cycles, NVIDIA Fermi: 1/2 cycles
194+
// check core type for the fetch rate
195+
// Hyesoon: Aug-2020 please check whether this needs to be changed for hetero and igpu
194196
if (m_fetch_ratio != 1) {
195197
m_fetch_modulo++;
196198
if (m_fetch_modulo == m_fetch_ratio)
@@ -300,7 +302,7 @@ void frontend_c::run_a_cycle(void) {
300302

301303
// TONAGESH
302304
// nagesh - comments for BAR are incomplete...
303-
if (m_knob_ptx_sim) {
305+
if (m_ptx_sim) {
304306
// handling of BAR instruction in PTX - can/should this be moved?
305307
// do we have any blocks for which all warps have reached (retired)
306308
// their next barrier?
@@ -346,7 +348,7 @@ FRONTEND_MODE frontend_c::process_ifetch(unsigned int tid,
346348

347349
// First time : set up traces for current thread
348350
if (fetch_data->m_first_time) {
349-
m_simBase->m_trace_reader->setup_trace(m_core_id, tid, m_knob_ptx_sim);
351+
m_simBase->m_trace_reader->setup_trace(m_core_id, tid, m_ptx_sim);
350352
fetch_data->m_first_time = false;
351353

352354
++m_core->m_inst_fetched[tid]; /*! initial increase */
@@ -356,11 +358,18 @@ FRONTEND_MODE frontend_c::process_ifetch(unsigned int tid,
356358

357359
// set up initial fetch address
358360
thread_s *thread = m_core->get_trace_info(tid);
359-
if (thread->m_ptx) {
360-
trace_info_gpu_s *prev_trace_info =
361-
static_cast<trace_info_gpu_s *>(thread->m_prev_trace_info);
362-
fetch_data->m_MT_scheduler.m_next_fetch_addr =
363-
prev_trace_info->m_inst_addr;
361+
if (thread->m_acc) {
362+
if (m_ptx_sim) {
363+
trace_info_gpu_s *prev_trace_info =
364+
static_cast<trace_info_gpu_s *>(thread->m_prev_trace_info);
365+
fetch_data->m_MT_scheduler.m_next_fetch_addr =
366+
prev_trace_info->m_inst_addr;
367+
} else if (m_igpu_sim) {
368+
trace_info_igpu_s *prev_trace_info =
369+
static_cast<trace_info_igpu_s *>(thread->m_prev_trace_info);
370+
fetch_data->m_MT_scheduler.m_next_fetch_addr =
371+
prev_trace_info->m_instruction_addr;
372+
}
364373
} else {
365374
if (KNOB(KNOB_LARGE_CORE_TYPE)->getValue() == "x86") {
366375
trace_info_cpu_s *prev_trace_info =
@@ -372,11 +381,6 @@ FRONTEND_MODE frontend_c::process_ifetch(unsigned int tid,
372381
static_cast<trace_info_a64_s *>(thread->m_prev_trace_info);
373382
fetch_data->m_MT_scheduler.m_next_fetch_addr =
374383
prev_trace_info->m_instruction_addr;
375-
} else if (KNOB(KNOB_LARGE_CORE_TYPE)->getValue() == "igpu") {
376-
trace_info_igpu_s *prev_trace_info =
377-
static_cast<trace_info_igpu_s *>(thread->m_prev_trace_info);
378-
fetch_data->m_MT_scheduler.m_next_fetch_addr =
379-
prev_trace_info->m_instruction_addr;
380384
} else {
381385
ASSERTM(0, "Wrong core type %s\n",
382386
KNOB(KNOB_LARGE_CORE_TYPE)->getValue().c_str());
@@ -457,8 +461,8 @@ FRONTEND_MODE frontend_c::process_ifetch(unsigned int tid,
457461
ASSERT(new_uop);
458462

459463
// read an uop from the traces
460-
if (!m_simBase->m_trace_reader->get_uops_from_traces(
461-
m_core_id, new_uop, tid, m_knob_ptx_sim)) {
464+
if (!m_simBase->m_trace_reader->get_uops_from_traces(m_core_id, new_uop,
465+
tid, m_ptx_sim)) {
462466
// couldn't get an uop
463467
DEBUG_CORE(m_core_id, "not success\n");
464468
m_uop_pool->release_entry(new_uop->free());
@@ -631,7 +635,7 @@ bool frontend_c::access_icache(int tid, Addr fetch_addr,
631635
int result = m_simBase->m_memory->new_mem_req(
632636
MRT_IFETCH, line_addr, m_knob_icache_line_size, false, false, 0, NULL,
633637
icache_fill_line_wrapper, m_core->get_unique_uop_num(), NULL, m_core_id,
634-
tid, m_knob_ptx_sim);
638+
tid, m_ptx_sim);
635639

636640
// mshr full
637641
if (!result) return false;
@@ -712,7 +716,7 @@ bool frontend_c::icache_fill_line(mem_req_s *req) {
712716
if (m_icache->access_cache(req->m_addr, &line_addr, false, req->m_appl_id) ==
713717
NULL) {
714718
m_icache->insert_cache(req->m_addr, &line_addr, &repl_line_addr,
715-
req->m_appl_id, req->m_ptx);
719+
req->m_appl_id, req->m_acc);
716720
POWER_CORE_EVENT(req->m_core_id, POWER_ICACHE_W);
717721
}
718722

@@ -806,7 +810,7 @@ int frontend_c::predict_bpu(uop_c *uop) {
806810
// no branch prediction
807811
else {
808812
// GPU : stall on branch policy, stop fetching
809-
if (m_knob_ptx_sim && *m_simBase->m_knobs->KNOB_MT_NO_FETCH_BR) {
813+
if (m_ptx_sim && *m_simBase->m_knobs->KNOB_MT_NO_FETCH_BR) {
810814
set_br_wait(uop->m_thread_id);
811815
mispredicted = false;
812816
}
@@ -906,7 +910,7 @@ int frontend_c::fetch_rr(void) {
906910
}
907911

908912
// check the thread is ready to fetch
909-
if (m_knob_ptx_sim) {
913+
if (m_ptx_sim) {
910914
// GPU : stall on branch policy, check whether previous branch has been resolved
911915
if (*m_simBase->m_knobs->KNOB_MT_NO_FETCH_BR &&
912916
!check_br_ready(fetch_id)) {

src/frontend.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -407,7 +407,9 @@ class frontend_c
407407
uns m_knob_icache_line_size; /**< icache line size */
408408
bool m_fe_stall; /**< frontend stalled */
409409
bool m_fe_running; /**< enabled frontend */
410-
bool m_knob_ptx_sim; /**< GPU simulation */
410+
bool m_ptx_sim; /**< PTX simulation */
411+
bool m_igpu_sim; /**< iGPU simulation */
412+
bool m_acc_sim; /**< Accelerator simulation */
411413
bool m_ready_thread_available; /**< ready thread available */
412414
bool m_last_fetch_tid_failed;
413415
core_c* m_core; /**< core pointer */

src/global_types.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,4 +80,9 @@ typedef enum uop_latency_map { // enum for x86 latency maps - Michael
8080
NUM_LATENCY_MAPS
8181
} latency_map;
8282

83+
typedef enum _ACC_Type_enum {
84+
NO_ACC = 0, /**< no accelerator */
85+
PTX_CORE, /**< PTX core */
86+
IGPU_CORE /**< IGPU core */
87+
} ACC_Type;
8388
#endif

0 commit comments

Comments
 (0)