diff --git a/cifar10/problem.py b/cifar10/problem.py index dee804a..f2ef5e7 100644 --- a/cifar10/problem.py +++ b/cifar10/problem.py @@ -9,8 +9,8 @@ Problem.add_dim('pool_size',[2]) Problem.add_dim('conv2_out_chan',(3,64)) Problem.add_dim('conv2_kern',(3,8)) -Problem.add_dim('fc1_out',(64,256)) -Problem.add_dim('fc2_out',(32,128)) +Problem.add_dim('fc1_out',(64,16384)) +Problem.add_dim('fc2_out',(32,16384)) Problem.add_dim('fc3_out',[10]) Problem.add_dim('omp_num_threads',[64]) diff --git a/cifar10_parallel/cifar10_run.py b/cifar10_parallel/cifar10_run.py new file mode 100644 index 0000000..6ad67ee --- /dev/null +++ b/cifar10_parallel/cifar10_run.py @@ -0,0 +1,139 @@ +import time +import numpy as np + + +def run(point): + start = time.time() + try: + batch_size = point['batch_size'] + image_size = point['image_size'] + conv1_in_chan = point['conv1_in_chan'] + conv1_out_chan = point['conv1_out_chan'] + conv1_kern = point['conv1_kern'] + pool_size = point['pool_size'] + conv2_out_chan = point['conv2_out_chan'] + conv2_kern = point['conv2_kern'] + fc1_out = point['fc1_out'] + fc2_out = point['fc2_out'] + fc3_out = point['fc3_out'] + n_conv_block = point['n_conv_block'] + + omp_num_threads = point['omp_num_threads'] + + import os + os.environ['OMP_NUM_THREADS'] = str(omp_num_threads) + os.environ['MKL_NUM_THREADS'] = str(omp_num_threads) + os.environ['KMP_HW_SUBSET'] = '1s,%sc,2t' % str(omp_num_threads) + os.environ['KMP_AFFINITY'] = 'granularity=fine,verbose,compact,1,0' + os.environ['KMP_BLOCKTIME'] = str(0) + #os.environ['MKLDNN_VERBOSE'] = str(1) + import torch + + print('torch version: ',torch.__version__,' torch file: ',torch.__file__) + + class Net(torch.nn.Module): + def __init__(self, batch_size, + image_size, + conv1_in_chan,conv1_out_chan,conv1_kern, + pool_size, + conv2_out_chan,conv2_kern, + fc1_out, + fc2_out, + fc3_out, + n_conv_block, + ): + super(Net, self).__init__() + + self.flop = 0 + self.n_conv_block = n_conv_block + self.batch_size = batch_size + + self.conv1 = torch.nn.Conv2d(conv1_in_chan, conv1_out_chan, conv1_kern) + self.flop += conv1_kern**2 * conv1_in_chan * conv1_out_chan * image_size**2 * batch_size + self.pool = torch.nn.MaxPool2d(pool_size, pool_size) + self.flop += image_size**2 * conv1_out_chan * batch_size + self.conv2 = torch.nn.Conv2d(conv1_out_chan,conv2_out_chan,conv2_kern) + self.flop += conv2_kern**2 * conv1_out_chan * conv2_out_chan * int(image_size/pool_size)**2 * batch_size + self.view_size = conv2_out_chan * conv2_kern * conv2_kern + + self.fc1 = torch.nn.Linear(conv2_out_chan * conv2_kern * conv2_kern, fc1_out) + self.flop += (2*self.view_size - 1) * fc1_out * batch_size + self.fc2 = torch.nn.Linear(fc1_out, fc2_out) + self.flop += (2*fc1_out - 1) * fc2_out * batch_size + self.fc3 = torch.nn.Linear(fc2_out, fc3_out) + self.flop += (2*fc2_out - 1) * fc3_out * batch_size + + def forward(self, inputs): + block_output = torch.zeros(inputs.shape[0],self.view_size,dtype=torch.float) + for i in range(self.n_conv_block): + batch = inputs[i * self.batch_size:(i + 1) * self.batch_size] + + x = self.pool(torch.nn.functional.relu(self.conv1(batch))) + x = self.pool(torch.nn.functional.relu(self.conv2(x))) + x = x.view(-1,self.view_size) + block_output[i * self.batch_size:(i + 1) * self.batch_size] = x + + x = torch.nn.functional.relu(self.fc1(block_output)) + x = torch.nn.functional.relu(self.fc2(x)) + x = self.fc3(x) + return x + + inputs = torch.arange(batch_size * n_conv_block * image_size**2 * conv1_in_chan,dtype=torch.float) + inputs = inputs.view((batch_size * n_conv_block,conv1_in_chan,image_size,image_size)) + net = Net(batch_size, + image_size, + conv1_in_chan,conv1_out_chan,conv1_kern, + pool_size, + conv2_out_chan,conv2_kern, + fc1_out, + fc2_out, + fc3_out, + n_conv_block) + outputs = net(inputs) + + total_flop = net.flop + + runs = 5 + tot_time = 0. + tt = time.time() + for _ in range(runs): + outputs = net(inputs) + tot_time += time.time() - tt + tt = time.time() + + ave_time = tot_time / runs + + print('total_flop = ',total_flop,'ave_time = ',ave_time) + + ave_flops = total_flop / ave_time + runtime = time.time() - start + print('runtime=',runtime,'ave_flops=',ave_flops) + + return ave_flops + except Exception as e: + import traceback + print('received exception: ',str(e),'for point: ',point) + print(traceback.print_exc()) + print('runtime=',time.time() - start) + return 0. + + +if __name__ == '__main__': + point = { + 'batch_size': 10, + 'image_size': 32, + 'conv1_in_chan':3, + 'conv1_out_chan':6, + 'conv1_kern':5, + 'pool_size':2, + 'conv2_out_chan':16, + 'conv2_kern':5, + 'fc1_out':120, + 'fc2_out':84, + 'fc3_out': 10, + 'omp_num_threads':64, + 'n_conv_block': 6, + } + + print('flops for this setting =',run(point)) + diff --git a/cifar10_parallel/problem.py b/cifar10_parallel/problem.py new file mode 100644 index 0000000..f1d0901 --- /dev/null +++ b/cifar10_parallel/problem.py @@ -0,0 +1,23 @@ +from deephyper.benchmark import HpProblem + +Problem = HpProblem() +Problem.add_dim('batch_size',(1,32)) +Problem.add_dim('image_size',[32]) +Problem.add_dim('conv1_in_chan',[3]) +Problem.add_dim('conv1_out_chan',(3,64)) +Problem.add_dim('conv1_kern',(3,8)) +Problem.add_dim('pool_size',[2]) +Problem.add_dim('conv2_out_chan',(3,64)) +Problem.add_dim('conv2_kern',(3,8)) +Problem.add_dim('fc1_out',(64,512)) +Problem.add_dim('fc2_out',(32,512)) +Problem.add_dim('fc3_out',[10]) +Problem.add_dim('omp_num_threads',[64]) +Problem.add_dim('n_conv_block',(1,10)) + +Problem.add_starting_point(batch_size=10,image_size=32,conv1_in_chan=3,conv1_out_chan=16,conv1_kern=5, + pool_size=2,conv2_out_chan=16,conv2_kern=5,fc1_out=128,fc2_out=84, + fc3_out=10,omp_num_threads=64,n_conv_block=3) + +if __name__ == '__main__': + print(Problem) diff --git a/conv3d/conv3d_run.py b/conv3d/conv3d_run.py index 0fef6de..83c176b 100644 --- a/conv3d/conv3d_run.py +++ b/conv3d/conv3d_run.py @@ -1,7 +1,20 @@ -import time +import time,psutil,os +import multiprocessing as mp + +def print_mem_cpu(): + start = time.time() + while True: + mem = psutil.virtual_memory() + print('[%010d] pid=%010d total_mem=%010d free_mem=%05.2f cpu_usage=%05.2f' % (time.time()-start,os.getpid(),mem.total,mem.free/mem.total*100.,psutil.cpu_percent())) + time.sleep(1) + + def run(point): + print(point) start = time.time() + memorymon = mp.Process(target=print_mem_cpu) + memorymon.start() try: batch_size = point['batch_size'] image_size = point['image_size'] @@ -21,48 +34,60 @@ def run(point): print('torch version: ',torch.__version__,' torch file: ',torch.__file__) - - inputs = torch.arange(batch_size * image_size**3 * in_channels,dtype=torch.float).view((batch_size,in_channels,image_size,image_size,image_size)) - - layer = torch.nn.Conv3d(in_channels,out_channels,kernel_size,stride=1,padding=1) - outputs = layer(inputs) - - total_flop = kernel_size**3 * in_channels * out_channels * outputs.shape[-1] * outputs.shape[-2] * outputs.shape[-3] * batch_size - runs = 5 - tot_time = 0. - tt = time.time() - for _ in range(runs): + with torch.no_grad(): + inputs = torch.arange(batch_size * image_size**3 * in_channels,dtype=torch.float).view((batch_size,in_channels,image_size,image_size,image_size)) + print('creating layer') + layer = torch.nn.Conv3d(in_channels,out_channels,kernel_size,stride=1,padding=1) + layer.eval() + print('first execution') outputs = layer(inputs) - tot_time += time.time() - tt - tt = time.time() - ave_time = tot_time / runs - print('total_flop = ',total_flop,'ave_time = ',ave_time) + total_flop = kernel_size**3 * in_channels * out_channels * outputs.shape[-1] * outputs.shape[-2] * outputs.shape[-3] * batch_size + + runs = 25 + tot_time = 0. + tt = time.time() + print('loop') + for i in range(runs): + print('step',i) + outputs = layer(inputs) + tot_time += time.time() - tt + tt = time.time() + + ave_time = tot_time / runs - ave_flops = total_flop / ave_time - runtime = time.time() - start - print('runtime=',runtime,'ave_flops=',ave_flops) + print('total_flop = ',total_flop,'ave_time = ',ave_time) + ave_flops = total_flop / ave_time + runtime = time.time() - start + print('runtime=',runtime,'ave_flops=',ave_flops) + memorymon.terminate() + memorymon.join() return ave_flops except Exception as e: import traceback print('received exception: ',str(e),'for point: ',point) print(traceback.print_exc()) print('runtime=',time.time() - start) + memorymon.terminate() + memorymon.join() + return 0. if __name__ == '__main__': point = { 'batch_size': 10, - 'image_size': 128, + 'image_size': 64, 'in_channels': 3, 'out_channels': 3, - 'kernel_size': 4, + 'kernel_size': 3, 'omp_num_threads':64, } + #point = {'batch_size': 4, 'image_size': 88, 'in_channels': 56, 'kernel_size': 10, 'omp_num_threads': 64, 'out_channels': 47} + print('flops for this setting =',run(point)) diff --git a/conv3d/problem.py b/conv3d/problem.py index bfefd15..584e9d7 100644 --- a/conv3d/problem.py +++ b/conv3d/problem.py @@ -1,7 +1,7 @@ from deephyper.benchmark import HpProblem Problem = HpProblem() -Problem.add_dim('batch_size',(1,32)) +Problem.add_dim('batch_size',(1,64)) Problem.add_dim('image_size',(16,128)) Problem.add_dim('in_channels',(2,64)) Problem.add_dim('out_channels',(2,64))