From 9b0e27b5f48d66ffad297a7906720a31a43dc609 Mon Sep 17 00:00:00 2001 From: Mihir Pandya Date: Tue, 7 Apr 2015 18:56:04 -0400 Subject: [PATCH 01/33] Implemented getImages function for localDocker --- vmms/localDocker.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/vmms/localDocker.py b/vmms/localDocker.py index f48e7f3c..adfa8a46 100644 --- a/vmms/localDocker.py +++ b/vmms/localDocker.py @@ -193,7 +193,8 @@ def safeDestroyVM(self, vm): return def getVMs(self): - """ getVMs - Executes and parses `docker ps` + """ getVMs - Executes and parses `docker ps`. This function + is a lot of parsing and can break easily. """ # Get all volumes of docker containers machines = [] @@ -217,3 +218,21 @@ def existsVM(self, vm): ret = timeout(['docker', 'inspect', instanceName]) return (ret is 0) + def getImages(self): + """ getImages - Executes `docker images` and returns a list of + images that can be used to boot a docker container with. This + function is a lot of parsing and so can break easily. + """ + result = [] + cmd = "docker images" + o = subprocess.check_output("docker images", shell=True) + o_l = o.split('\n') + o_l.pop() + o_l.reverse() + o_l.pop() + for row in o_l: + row_l = row.split(' ') + result.append(row_l[0]) + return result + + From 7702d9fd5fd681a3ca30046c6420306b5b1153bc Mon Sep 17 00:00:00 2001 From: Mihir Pandya Date: Tue, 7 Apr 2015 18:58:44 -0400 Subject: [PATCH 02/33] Using a set to get unique image names. Tango is not going to support docker tags. Instead, it will always is the latest image. --- vmms/localDocker.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vmms/localDocker.py b/vmms/localDocker.py index adfa8a46..679bb56e 100644 --- a/vmms/localDocker.py +++ b/vmms/localDocker.py @@ -223,7 +223,7 @@ def getImages(self): images that can be used to boot a docker container with. This function is a lot of parsing and so can break easily. """ - result = [] + result = set() cmd = "docker images" o = subprocess.check_output("docker images", shell=True) o_l = o.split('\n') @@ -232,7 +232,7 @@ def getImages(self): o_l.pop() for row in o_l: row_l = row.split(' ') - result.append(row_l[0]) - return result + result.add(row_l[0]) + return list(result) From f8e754d029668519d6630bba99d3cc1c2bcf0067 Mon Sep 17 00:00:00 2001 From: Mihir Pandya Date: Tue, 7 Apr 2015 19:19:42 -0400 Subject: [PATCH 03/33] Updated validateJob in tangod to use getImages instead of casing on tashi. --- tangod.py | 32 ++++++++++++-------------------- 1 file changed, 12 insertions(+), 20 deletions(-) diff --git a/tangod.py b/tangod.py index 4db3799a..1c22a29c 100755 --- a/tangod.py +++ b/tangod.py @@ -43,6 +43,7 @@ from config import Config from tangoObjects import TangoJob +from datetime import datetime class TangoServer: @@ -247,25 +248,16 @@ def validateJob(job, vmms): (datetime.utcnow().ctime())) errors += 1 else: - if job.vm.vmms == "tashiSSH": - # Check if VM name exists in Tashi directory - imgList = os.listdir(Config.TASHI_IMAGE_PATH) - imgPath = Config.TASHI_IMAGE_PATH + job.vm.image - if job.vm.image not in imgList: - log.error("validateJob: Image not found: %s" % - job.vm.image) - job.appendTrace("%s|validateJob: Image not found: %s" % - (datetime.utcnow().ctime(), job.vm.image)) - errors += 1 - # Check if image has read permissions - elif not (os.stat(imgPath).st_mode & stat.S_IRUSR): - log.error("validateJob: Not readable: %s" % job.vm.image) - job.appendTrace("%s|validateJob: Not readable: %s" % - (datetime.utcnow().ctime(), job.vm.image)) - errors += 1 - else: - (base, ext) = os.path.splitext(job.vm.image) - job.vm.name = base + vobj = vmms[Config.VMMS_NAME] + imgList = vobj.getImages() + if job.vm.image not in imgList: + log.error("validateJob: Image not found: %s" % + job.vm.image) + job.appendTrace("%s|validateJob: Image not found: %s" % + (datetime.utcnow().ctime(), job.vm.image)) + errors += 1 + else: + job.vm.name = job.vm.image if not job.vm.vmms: log.error("validateJob: Missing job.vm.vmms") @@ -323,7 +315,7 @@ def validateJob(job, vmms): # Any problems, return an error status if errors > 0: log.error("validateJob: Job rejected: %d errors" % errors) - job.timerace.append("%s|validateJob: Job rejected: %d errors" % + job.appendTrace("%s|validateJob: Job rejected: %d errors" % (datetime.utcnow().ctime(), errors)) return -1 else: From c85c4fbc75ee25e40007ab5886742d5298b391a2 Mon Sep 17 00:00:00 2001 From: Yashas Kumar Date: Mon, 13 Apr 2015 16:06:58 -0400 Subject: [PATCH 04/33] Added pseudo job-ordering by ID back into Tango. Fixes #55 for tashiSSH --- config.template.py | 4 +--- tangoObjects.py | 3 ++- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/config.template.py b/config.template.py index 9a187b6c..e89a9ec6 100644 --- a/config.template.py +++ b/config.template.py @@ -23,7 +23,7 @@ class Config: PORT = 3000 # Log file. Setting this to None sends the server output to stdout - LOGFILE = "tango3.log" + LOGFILE = None # Logging level LOGLEVEL = logging.DEBUG @@ -100,11 +100,9 @@ class Config: # Path for tashi images TASHI_IMAGE_PATH = '' - # Optionally log finer-grained timing information LOG_TIMING = False - # Largest job ID MAX_JOBID = 500 diff --git a/tangoObjects.py b/tangoObjects.py index fad249b3..5d2c6d2c 100644 --- a/tangoObjects.py +++ b/tangoObjects.py @@ -326,7 +326,8 @@ def delete(self, id): del self.dict[str(id)] def iteritems(self): - return self.dict.iteritems() + return iter([(i, self.get(i)) for i in xrange(1,Config.MAX_JOBID+1) + if self.get(i) != None]) def _clean(self): # only for testing From fc13a88a6ad48c16dc21befbb8a4557063683b33 Mon Sep 17 00:00:00 2001 From: Mihir Pandya Date: Tue, 14 Apr 2015 14:29:17 -0400 Subject: [PATCH 05/33] Using getImages for validation in pool and prealloc. --- preallocator.py | 8 ++++---- restful-tango/tangoREST.py | 24 +++++++++++------------- tangod.py | 6 ++++-- vmms/tashiSSH.py | 1 + 4 files changed, 20 insertions(+), 19 deletions(-) diff --git a/preallocator.py b/preallocator.py index fdbc5fca..bbfdabf6 100644 --- a/preallocator.py +++ b/preallocator.py @@ -40,10 +40,10 @@ def update(self, vm, num): """ update - Updates the number of machines of a certain type to be preallocated. - This function is called via the TangoServer HTTP interface, and - therefore should do as little as possible before returning. - It will update the machine list, and then spawn child threads - to do the creation and destruction of machines as necessary. + This function is called via the TangoServer HTTP interface. + It will validate the request,update the machine list, and + then spawn child threads to do the creation and destruction + of machines as necessary. """ self.lock.acquire() if vm.name not in self.machines.keys(): diff --git a/restful-tango/tangoREST.py b/restful-tango/tangoREST.py index 12935836..a7340421 100644 --- a/restful-tango/tangoREST.py +++ b/restful-tango/tangoREST.py @@ -43,8 +43,8 @@ def __init__(self): self.wrong_courselab = self.create(-1, "Courselab not found") self.out_not_found = self.create(-1, "Output file not found") self.invalid_image = self.create(-1, "Invalid image name") + self.invalid_prealloc_size = self.create(-1, "Invalid prealloc size") self.pool_not_found = self.create(-1, "Pool not found") - self.prealloc_failed = self.create(-1, "Preallocate VM failed") def create(self, id, msg): """ create - Constructs a dict with the given ID and message @@ -414,13 +414,9 @@ def pool(self, key, image): """ self.log.debug("Received pool request(%s, %s)" % (key, image)) if self.validateKey(key): - if not image or image == "" or not image.endswith(".img"): - self.log.info("Invalid image name") - return self.status.invalid_image - image = image[:-4] info = self.preallocator.getPool(image) if len(info["pool"]) == 0: - self.log.info("Pool image not found: %s" % image) + self.log.info("Pool not found: %s" % image) return self.status.pool_not_found self.log.info("Pool image found: %s" % image) result = self.status.obtained_pool @@ -437,18 +433,20 @@ def prealloc(self, key, image, num, vmStr): self.log.debug("Received prealloc request(%s, %s, %s)" % (key, image, num)) if self.validateKey(key): - if not image or image == "" or not image.endswith(".img"): - self.log.info("Invalid image name") - return self.status.invalid_image if vmStr != "": vmObj = json.loads(vmStr) vm = self.createTangoMachine(image, vmObj=vmObj) else: vm = self.createTangoMachine(image) - success = self.tango.preallocVM(vm, int(num)) - if (success == -1): - self.log.info("Failed to preallocated VMs") - return self.status.prealloc_failed + + ret = self.tango.preallocVM(vm, int(num)) + + if ret == -1: + self.log.error("Invalid prealloc size") + return self.status.invalid_prealloc_size + if ret == -2: + self.log.error("Invalid image name") + return self.status.invalid_image self.log.info("Successfully preallocated VMs") return self.status.preallocated else: diff --git a/tangod.py b/tangod.py index 1c22a29c..17674600 100755 --- a/tangod.py +++ b/tangod.py @@ -111,10 +111,12 @@ def preallocVM(self, vm, num): self.log.debug("Received preallocVM(%s,%d)request" % (vm.name, num)) try: + vmms = self.vmms[vm.vmms] if not vm or num < 0: return -1 - (base, ext) = os.path.splitext(vm.image) - vm.name = base + if vm.name not in vmms.getImages(): + self.log.error("Invalid image name") + return -2 self.preallocator.update(vm, num) return 0 except Exception as err: diff --git a/vmms/tashiSSH.py b/vmms/tashiSSH.py index 54c3977a..bead3756 100644 --- a/vmms/tashiSSH.py +++ b/vmms/tashiSSH.py @@ -178,6 +178,7 @@ def initializeVM(self, vm): """ initializeVM - Ask Tashi to create a new VM instance """ # Create the instance + vm.image = vm.image + ".img" instance = self.tangoMachineToInstance(vm) tashiInst = self.tashiCall("createVm", [instance]) vm.instance_id = tashiInst.id From 9218d2874fe4c1ff9ca1943560341919a5f00074 Mon Sep 17 00:00:00 2001 From: Mihir Pandya Date: Tue, 14 Apr 2015 17:07:29 -0400 Subject: [PATCH 06/33] Basic implementation of distDocker. --- jobManager.py | 5 +- jobQueue.py | 4 +- vmms/distDocker.py | 328 +++++++++++++++++++++++++++++++++++++++++++++ vmms/tashiSSH.py | 1 - 4 files changed, 330 insertions(+), 8 deletions(-) create mode 100644 vmms/distDocker.py diff --git a/jobManager.py b/jobManager.py index ae7d0bbd..45db3492 100644 --- a/jobManager.py +++ b/jobManager.py @@ -9,14 +9,11 @@ # is launched that will handle things from here on. If anything goes # wrong, the job is made dead with the error. # -import time -import threading -import logging +import time, threading, logging from datetime import datetime from config import Config from worker import Worker - from jobQueue import JobQueue from preallocator import Preallocator diff --git a/jobQueue.py b/jobQueue.py index 11627bc9..1bddd92b 100644 --- a/jobQueue.py +++ b/jobQueue.py @@ -7,9 +7,7 @@ # JobManager: Class that creates a thread object that looks for new # work on the job queue and assigns it to workers. # -import time -import threading -import logging +import time, threading, logging from datetime import datetime from config import Config diff --git a/vmms/distDocker.py b/vmms/distDocker.py new file mode 100644 index 00000000..9a207114 --- /dev/null +++ b/vmms/distDocker.py @@ -0,0 +1,328 @@ +# +# distDocker.py +# +# Implements the Tango VMMS interface to run Tango jobs in +# docker containers on a list of host machines. This list of +# host machines must be able to run docker and be accessible +# by SSH. The IP address of the host machine is stored in the +# `domain_name` attribtue of TangoMachine. +# + +import random, subprocess, re, time, logging, threading, os, sys, shutil +import config +from tangoObjects import TangoMachine + +def timeout(command, time_out=1): + """ timeout - Run a unix command with a timeout. Return -1 on + timeout, otherwise return the return value from the command, which + is typically 0 for success, 1-255 for failure. + """ + + # Launch the command + p = subprocess.Popen(command, + stdout=open("/dev/null", 'w'), + stderr=subprocess.STDOUT) + + # Wait for the command to complete + t = 0.0 + while t < time_out and p.poll() is None: + time.sleep(config.Config.TIMER_POLL_INTERVAL) + t += config.Config.TIMER_POLL_INTERVAL + + # Determine why the while loop terminated + if p.poll() is None: + subprocess.call(["/bin/kill", "-9", str(p.pid)]) + returncode = -1 + else: + returncode = p.poll() + return returncode + +def timeoutWithReturnStatus(command, time_out, returnValue = 0): + """ timeoutWithReturnStatus - Run a Unix command with a timeout, + until the expected value is returned by the command; On timeout, + return last error code obtained from the command. + """ + p = subprocess.Popen(command, + stdout=open("/dev/null", 'w'), + stderr=subprocess.STDOUT) + t = 0.0 + while (t < time_out): + ret = p.poll() + if ret is None: + time.sleep(config.Config.TIMER_POLL_INTERVAL) + t += config.Config.TIMER_POLL_INTERVAL + elif ret == returnValue: + return ret + else: + p = subprocess.Popen(command, + stdout=open("/dev/null", 'w'), + stderr=subprocess.STDOUT) + return ret + +# +# User defined exceptions +# + +class DistDocker: + + _SSH_FLAGS = ["-q", "-i", "/Users/Mihir/Documents/prog/Autolab/mp_tango.pem", + "-o", "StrictHostKeyChecking=no", + "-o", "GSSAPIAuthentication=no"] + + def __init__(self): + """ Checks if the machine is ready to run docker containers. + Initialize boot2docker if running on OS X. + """ + try: + self.log = logging.getLogger("DistDocker") + self.hosts = ['127.0.0.1'] + self.hostIdx = 0 + self.hostLock = threading.Lock() + self.hostUser = "ubuntu" + + # Check import docker constants are defined in config + if len(config.Config.DOCKER_VOLUME_PATH) == 0: + raise Exception('DOCKER_VOLUME_PATH not defined in config.') + + except Exception as e: + self.log.error(str(e)) + exit(1) + + def getHost(self): + self.hostLock.acquire() + host = self.hosts[self.hostIdx] + self.hostIdx = self.hostIdx + 1 + if self.hostIdx >= len(self.hosts): + self.hostIdx = 0 + self.hostLock.release() + return host + + def instanceName(self, id, name): + """ instanceName - Constructs a Docker instance name. Always use + this function when you need a Docker instance name. Never generate + instance names manually. + """ + return "%s-%s-%s" % (config.Config.PREFIX, id, name) + + def getVolumePath(self, instanceName): + volumePath = config.Config.DOCKER_VOLUME_PATH + if '*' in volumePath: + volumePath = os.getcwd() + '/' + 'volumes/' + volumePath = volumePath + instanceName + '/' + return volumePath + + def domainName(self, vm): + """ Returns the domain name that is stored in the vm + instance. + """ + return vm.domain_name + + # + # VMMS API functions + # + def initializeVM(self, vm): + """ initializeVM - Nothing to do for initializeVM + """ + host = self.getHost() + vm.domain_name = host + self.log.info("Assign host %s to VM %s." % (host, vm.name)) + return vm + + def waitVM(self, vm, max_secs): + """ waitVM - Nothing to do for waitVM + """ + domain_name = self.domainName(vm) + + # First, wait for ping to the vm instance to work + instance_down = 1 + start_time = time.time() + while instance_down: + instance_down = subprocess.call("ping -c 1 %s" % (domain_name), + shell=True, + stdout=open('/dev/null', 'w'), + stderr=subprocess.STDOUT) + + # Wait a bit and then try again if we haven't exceeded + # timeout + if instance_down: + time.sleep(config.Config.TIMER_POLL_INTERVAL) + elapsed_secs = time.time() - start_time + if (elapsed_secs > max_secs): + return -1 + + # The ping worked, so now wait for SSH to work before + # declaring that the VM is ready + self.log.debug("VM %s: ping completed" % (domain_name)) + while (True): + + elapsed_secs = time.time() - start_time + + # Give up if the elapsed time exceeds the allowable time + if elapsed_secs > max_secs: + self.log.info("VM %s: SSH timeout after %d secs" % + (domain_name, elapsed_secs)) + return -1 + + # If the call to ssh returns timeout (-1) or ssh error + # (255), then success. Otherwise, keep trying until we run + # out of time. + ret = timeout(["ssh"] + DistDocker._SSH_FLAGS + + ["%s@%s" % (self.hostUser, domain_name), + "(:)"], max_secs - elapsed_secs) + self.log.debug("VM %s: ssh returned with %d" % + (domain_name, ret)) + if (ret != -1) and (ret != 255): + return 0 + + # Sleep a bit before trying again + time.sleep(config.Config.TIMER_POLL_INTERVAL) + + def copyIn(self, vm, inputFiles): + """ copyIn - Create a directory to be mounted as a volume + for the docker containers on the host machine for this VM. + Copy input files to this directory on the host machine. + """ + domainName = self.domainName(vm) + instanceName = self.instanceName(vm.id, vm.image) + volumePath = self.getVolumePath(instanceName) + + # Create a fresh volume + ret = timeout(["ssh"] + DistDocker._SSH_FLAGS + + ["%s@%s" % (self.hostUser, domainName), + "(rm -rf %s; mkdir %s)" % (volumePath, volumePath)], + config.Config.COPYIN_TIMEOUT) + if ret == 0: + self.log.debug("Volume directory created on VM.") + else: + return ret + + for file in inputFiles: + ret = timeout(["scp"] + DistDocker._SSH_FLAGS + file.localFile + + ["%s@%s:%s/%s" % + (self.hostUser, domainName, volumePath, file.destFile)], + config.Config.COPYIN_TIMEOUT) + if ret == 0: + self.log.debug('Copied in file %s to %s' % + (file.localFile, volumePath + file.destFile)) + else: + self.log.error( + "Error: failed to copy file %s to VM %s with status %s" % + (file.localFile, domain_name, str(ret))) + return ret + + return 0 + + def runJob(self, vm, runTimeout, maxOutputFileSize): + """ runJob - Run a docker container by doing the follows: + - mount directory corresponding to this job to /home/autolab + in the container + - run autodriver with corresponding ulimits and timeout as + autolab user + """ + domainName = self.domainName(vm) + instanceName = self.instanceName(vm.id, vm.image) + volumePath = self.getVolumePath(instanceName) + + autodriverCmd = 'autodriver -u %d -f %d -t %d -o %d autolab &> output/feedback' % \ + (config.Config.VM_ULIMIT_USER_PROC, + config.Config.VM_ULIMIT_FILE_SIZE, + runTimeout, config.Config.MAX_OUTPUT_FILE_SIZE) + + setupCmd = 'cp -r mount/* autolab/; su autolab -c "%s"; \ + cp output/feedback mount/feedback' % autodriverCmd + + args = '(docker run --name %s -v %s:/home/mount %s sh -c "%s")' % + (instanceName, volumePath, vm.image, setupCmd) + + self.log.debug('Running job: %s' % str(args)) + + ret = timeout(["ssh"] + DistDocker._SSH_FLAGS + + ["%s@%s" % (self.hostUser, domain_name), + args, config.Config.RUNJOB_TIMEOUT) + + self.log.debug('runJob return status %d' % ret) + + return ret + + + def copyOut(self, vm, destFile): + """ copyOut - Copy the autograder feedback from container to + destFile on the Tango host. Then, destroy that container. + Containers are never reused. + """ + domainName = self.domainName(vm) + instanceName = self.instanceName(vm.id, vm.image) + volumePath = self.getVolumePath(instanceName) + + ret = timeout(["scp"] + DistDocker._SSH_FLAGS + + ["%s@%s:%s" % + (self.hostUser, domain_name, volumePath + 'feedback'), + destFile], + config.Config.COPYOUT_TIMEOUT) + + self.log.debug('Copied feedback file to %s' % destFile) + self.destroyVM(vm) + + return 0 + + def destroyVM(self, vm): + """ destroyVM - Delete the docker container. + """ + domainName = self.domainName(vm) + instanceName = self.instanceName(vm.id, vm.image) + volumePath = self.getVolumePath('') + # Do a hard kill on corresponding docker container. + # Return status does not matter. + args = '(docker rm -f %s)' % (instanceName) + timeout(["ssh"] + DistDocker._SSH_FLAGS + + ["%s@%s" % (self.hostUser, domainName), args] + config.Config.DOCKER_RM_TIMEOUT) + # Destroy corresponding volume if it exists. + timeout(["ssh"] + DistDocker._SSH_FLAGS + + ["%s@%s" % (self.hostUser, domainName), + "(rm -rf %s" % (volumePath)]) + self.log.debug('Deleted volume %s' % instanceName) + return + + def safeDestroyVM(self, vm): + """ safeDestroyVM - Delete the docker container and make + sure it is removed. + """ + start_time = time.time() + while self.existsVM(vm): + if (time.time()-start_time > config.Config.DESTROY_SECS): + self.log.error("Failed to safely destroy container %s" + % vm.name) + return + self.destroyVM(vm) + return + + def getVMs(self): + """ getVMs - Executes and parses `docker ps` + """ + # Get all volumes of docker containers + machines = [] + volumePath = self.getVolumePath('') + for host in self.hosts: + volumes = subprocess.check_output(["ssh"] + DistDocker._SSH_FLAGS + + ["%s@%s" % (self.hostUser, host), + "(ls %s)" % volumePath]).split('\n') + for volume in volumes: + if re.match("%s-" % config.Config.PREFIX, volume): + machine = TangoMachine() + machine.vmms = 'distDocker' + machine.name = volume + machine.domain_name = host + volume_l = volume.split('-') + machine.id = volume_l[1] + machine.image = volume_l[2] + machines.append(machine) + return machines + + def existsVM(self, vm): + """ existsVM + """ + vms = self.getVMs() + vmnames = [vm.name for vm in vms] + return (vm.name in vmname) + diff --git a/vmms/tashiSSH.py b/vmms/tashiSSH.py index 54c3977a..6e9577b0 100644 --- a/vmms/tashiSSH.py +++ b/vmms/tashiSSH.py @@ -238,7 +238,6 @@ def copyIn(self, vm, inputFiles): """ copyIn - Copy input files to VM """ domain_name = self.domainName(vm.id, vm.name) - ssh_tries = 0 self.log.debug("Creating autolab directory on VM") # Create a fresh input directory ret = subprocess.call(["ssh"] + TashiSSH._SSH_FLAGS + From b57438df466c6f4277ae255ed8fc080185e4ec05 Mon Sep 17 00:00:00 2001 From: Mihir Pandya Date: Tue, 14 Apr 2015 17:57:59 -0400 Subject: [PATCH 07/33] Full roundtrip working with one host machine. --- jobManager.py | 3 +++ restful-tango/tangoREST.py | 3 +++ vmms/distDocker.py | 51 ++++++++++++++------------------------ 3 files changed, 24 insertions(+), 33 deletions(-) diff --git a/jobManager.py b/jobManager.py index 45db3492..d0e183a4 100644 --- a/jobManager.py +++ b/jobManager.py @@ -89,6 +89,9 @@ def __manage(self): elif Config.VMMS_NAME == "localDocker": from vmms.localDocker import LocalDocker vmms = LocalDocker() + elif Config.VMMS_NAME == "distDocker": + from vmms.distDocker import DistDocker + vmms = DistDocker() vmms = {Config.VMMS_NAME: vmms} preallocator = Preallocator(vmms) diff --git a/restful-tango/tangoREST.py b/restful-tango/tangoREST.py index 12935836..f5c8f25b 100644 --- a/restful-tango/tangoREST.py +++ b/restful-tango/tangoREST.py @@ -87,6 +87,9 @@ def __init__(self): elif Config.VMMS_NAME == "localDocker": from vmms.localDocker import LocalDocker vmms = LocalDocker() + elif Config.VMMS_NAME == "distDocker": + from vmms.distDocker import DistDocker + vmms = DistDocker() self.vmms = {Config.VMMS_NAME: vmms} diff --git a/vmms/distDocker.py b/vmms/distDocker.py index 9a207114..ecdd6791 100644 --- a/vmms/distDocker.py +++ b/vmms/distDocker.py @@ -75,7 +75,7 @@ def __init__(self): """ try: self.log = logging.getLogger("DistDocker") - self.hosts = ['127.0.0.1'] + self.hosts = ['54.186.238.205'] self.hostIdx = 0 self.hostLock = threading.Lock() self.hostUser = "ubuntu" @@ -133,26 +133,9 @@ def waitVM(self, vm, max_secs): """ domain_name = self.domainName(vm) - # First, wait for ping to the vm instance to work - instance_down = 1 start_time = time.time() - while instance_down: - instance_down = subprocess.call("ping -c 1 %s" % (domain_name), - shell=True, - stdout=open('/dev/null', 'w'), - stderr=subprocess.STDOUT) - - # Wait a bit and then try again if we haven't exceeded - # timeout - if instance_down: - time.sleep(config.Config.TIMER_POLL_INTERVAL) - elapsed_secs = time.time() - start_time - if (elapsed_secs > max_secs): - return -1 - - # The ping worked, so now wait for SSH to work before - # declaring that the VM is ready - self.log.debug("VM %s: ping completed" % (domain_name)) + + # Wait for SSH to work before declaring that the VM is ready while (True): elapsed_secs = time.time() - start_time @@ -197,8 +180,8 @@ def copyIn(self, vm, inputFiles): return ret for file in inputFiles: - ret = timeout(["scp"] + DistDocker._SSH_FLAGS + file.localFile + - ["%s@%s:%s/%s" % + ret = timeout(["scp"] + DistDocker._SSH_FLAGS + [file.localFile] + + ["%s@%s:%s/%s" % \ (self.hostUser, domainName, volumePath, file.destFile)], config.Config.COPYIN_TIMEOUT) if ret == 0: @@ -207,7 +190,7 @@ def copyIn(self, vm, inputFiles): else: self.log.error( "Error: failed to copy file %s to VM %s with status %s" % - (file.localFile, domain_name, str(ret))) + (file.localFile, domainName, str(ret))) return ret return 0 @@ -231,14 +214,14 @@ def runJob(self, vm, runTimeout, maxOutputFileSize): setupCmd = 'cp -r mount/* autolab/; su autolab -c "%s"; \ cp output/feedback mount/feedback' % autodriverCmd - args = '(docker run --name %s -v %s:/home/mount %s sh -c "%s")' % + args = "(docker run --name %s -v %s:/home/mount %s sh -c '%s')" % \ (instanceName, volumePath, vm.image, setupCmd) - self.log.debug('Running job: %s' % str(args)) + self.log.debug('Running job: %s' % args) ret = timeout(["ssh"] + DistDocker._SSH_FLAGS + - ["%s@%s" % (self.hostUser, domain_name), - args, config.Config.RUNJOB_TIMEOUT) + ["%s@%s" % (self.hostUser, domainName), args], + config.Config.RUNJOB_TIMEOUT) self.log.debug('runJob return status %d' % ret) @@ -256,12 +239,12 @@ def copyOut(self, vm, destFile): ret = timeout(["scp"] + DistDocker._SSH_FLAGS + ["%s@%s:%s" % - (self.hostUser, domain_name, volumePath + 'feedback'), + (self.hostUser, domainName, volumePath + 'feedback'), destFile], config.Config.COPYOUT_TIMEOUT) self.log.debug('Copied feedback file to %s' % destFile) - self.destroyVM(vm) + # self.destroyVM(vm) return 0 @@ -270,17 +253,19 @@ def destroyVM(self, vm): """ domainName = self.domainName(vm) instanceName = self.instanceName(vm.id, vm.image) - volumePath = self.getVolumePath('') + volumePath = self.getVolumePath(instanceName) + self.log.debug(volumePath) # Do a hard kill on corresponding docker container. # Return status does not matter. args = '(docker rm -f %s)' % (instanceName) timeout(["ssh"] + DistDocker._SSH_FLAGS + - ["%s@%s" % (self.hostUser, domainName), args] - config.Config.DOCKER_RM_TIMEOUT) + ["%s@%s" % (self.hostUser, domainName), args], + config.Config.DOCKER_RM_TIMEOUT) # Destroy corresponding volume if it exists. timeout(["ssh"] + DistDocker._SSH_FLAGS + ["%s@%s" % (self.hostUser, domainName), - "(rm -rf %s" % (volumePath)]) + "(rm -rf %s)" % (volumePath)], + config.Config.DOCKER_RM_TIMEOUT) self.log.debug('Deleted volume %s' % instanceName) return From 7a6ecaa49add4f1c5d247da5b00a0c1d9917a585 Mon Sep 17 00:00:00 2001 From: Mihir Pandya Date: Tue, 14 Apr 2015 20:02:40 -0400 Subject: [PATCH 08/33] Works with two host machines. Implemented getImages. --- vmms/distDocker.py | 72 ++++++++++++++++++++++++---------------------- 1 file changed, 37 insertions(+), 35 deletions(-) diff --git a/vmms/distDocker.py b/vmms/distDocker.py index ecdd6791..99c23696 100644 --- a/vmms/distDocker.py +++ b/vmms/distDocker.py @@ -75,7 +75,7 @@ def __init__(self): """ try: self.log = logging.getLogger("DistDocker") - self.hosts = ['54.186.238.205'] + self.hosts = ['54.186.238.205', '54.68.89.235'] self.hostIdx = 0 self.hostLock = threading.Lock() self.hostUser = "ubuntu" @@ -88,15 +88,6 @@ def __init__(self): self.log.error(str(e)) exit(1) - def getHost(self): - self.hostLock.acquire() - host = self.hosts[self.hostIdx] - self.hostIdx = self.hostIdx + 1 - if self.hostIdx >= len(self.hosts): - self.hostIdx = 0 - self.hostLock.release() - return host - def instanceName(self, id, name): """ instanceName - Constructs a Docker instance name. Always use this function when you need a Docker instance name. Never generate @@ -111,19 +102,18 @@ def getVolumePath(self, instanceName): volumePath = volumePath + instanceName + '/' return volumePath - def domainName(self, vm): - """ Returns the domain name that is stored in the vm - instance. - """ - return vm.domain_name - # # VMMS API functions # def initializeVM(self, vm): """ initializeVM - Nothing to do for initializeVM """ - host = self.getHost() + self.hostLock.acquire() + host = self.hosts[self.hostIdx] + self.hostIdx = self.hostIdx + 1 + if self.hostIdx >= len(self.hosts): + self.hostIdx = 0 + self.hostLock.release() vm.domain_name = host self.log.info("Assign host %s to VM %s." % (host, vm.name)) return vm @@ -131,8 +121,6 @@ def initializeVM(self, vm): def waitVM(self, vm, max_secs): """ waitVM - Nothing to do for waitVM """ - domain_name = self.domainName(vm) - start_time = time.time() # Wait for SSH to work before declaring that the VM is ready @@ -143,17 +131,17 @@ def waitVM(self, vm, max_secs): # Give up if the elapsed time exceeds the allowable time if elapsed_secs > max_secs: self.log.info("VM %s: SSH timeout after %d secs" % - (domain_name, elapsed_secs)) + (vm.domain_name, elapsed_secs)) return -1 # If the call to ssh returns timeout (-1) or ssh error # (255), then success. Otherwise, keep trying until we run # out of time. ret = timeout(["ssh"] + DistDocker._SSH_FLAGS + - ["%s@%s" % (self.hostUser, domain_name), + ["%s@%s" % (self.hostUser, vm.domain_name), "(:)"], max_secs - elapsed_secs) self.log.debug("VM %s: ssh returned with %d" % - (domain_name, ret)) + (vm.domain_name, ret)) if (ret != -1) and (ret != 255): return 0 @@ -165,13 +153,12 @@ def copyIn(self, vm, inputFiles): for the docker containers on the host machine for this VM. Copy input files to this directory on the host machine. """ - domainName = self.domainName(vm) instanceName = self.instanceName(vm.id, vm.image) volumePath = self.getVolumePath(instanceName) # Create a fresh volume ret = timeout(["ssh"] + DistDocker._SSH_FLAGS + - ["%s@%s" % (self.hostUser, domainName), + ["%s@%s" % (self.hostUser, vm.domain_name), "(rm -rf %s; mkdir %s)" % (volumePath, volumePath)], config.Config.COPYIN_TIMEOUT) if ret == 0: @@ -182,7 +169,7 @@ def copyIn(self, vm, inputFiles): for file in inputFiles: ret = timeout(["scp"] + DistDocker._SSH_FLAGS + [file.localFile] + ["%s@%s:%s/%s" % \ - (self.hostUser, domainName, volumePath, file.destFile)], + (self.hostUser, vm.domain_name, volumePath, file.destFile)], config.Config.COPYIN_TIMEOUT) if ret == 0: self.log.debug('Copied in file %s to %s' % @@ -190,7 +177,7 @@ def copyIn(self, vm, inputFiles): else: self.log.error( "Error: failed to copy file %s to VM %s with status %s" % - (file.localFile, domainName, str(ret))) + (file.localFile, vm.domain_name, str(ret))) return ret return 0 @@ -202,7 +189,6 @@ def runJob(self, vm, runTimeout, maxOutputFileSize): - run autodriver with corresponding ulimits and timeout as autolab user """ - domainName = self.domainName(vm) instanceName = self.instanceName(vm.id, vm.image) volumePath = self.getVolumePath(instanceName) @@ -220,7 +206,7 @@ def runJob(self, vm, runTimeout, maxOutputFileSize): self.log.debug('Running job: %s' % args) ret = timeout(["ssh"] + DistDocker._SSH_FLAGS + - ["%s@%s" % (self.hostUser, domainName), args], + ["%s@%s" % (self.hostUser, vm.domain_name), args], config.Config.RUNJOB_TIMEOUT) self.log.debug('runJob return status %d' % ret) @@ -233,37 +219,34 @@ def copyOut(self, vm, destFile): destFile on the Tango host. Then, destroy that container. Containers are never reused. """ - domainName = self.domainName(vm) instanceName = self.instanceName(vm.id, vm.image) volumePath = self.getVolumePath(instanceName) ret = timeout(["scp"] + DistDocker._SSH_FLAGS + ["%s@%s:%s" % - (self.hostUser, domainName, volumePath + 'feedback'), + (self.hostUser, vm.domain_name, volumePath + 'feedback'), destFile], config.Config.COPYOUT_TIMEOUT) self.log.debug('Copied feedback file to %s' % destFile) - # self.destroyVM(vm) + self.destroyVM(vm) return 0 def destroyVM(self, vm): """ destroyVM - Delete the docker container. """ - domainName = self.domainName(vm) instanceName = self.instanceName(vm.id, vm.image) volumePath = self.getVolumePath(instanceName) - self.log.debug(volumePath) # Do a hard kill on corresponding docker container. # Return status does not matter. args = '(docker rm -f %s)' % (instanceName) timeout(["ssh"] + DistDocker._SSH_FLAGS + - ["%s@%s" % (self.hostUser, domainName), args], + ["%s@%s" % (self.hostUser, vm.domain_name), args], config.Config.DOCKER_RM_TIMEOUT) # Destroy corresponding volume if it exists. timeout(["ssh"] + DistDocker._SSH_FLAGS + - ["%s@%s" % (self.hostUser, domainName), + ["%s@%s" % (self.hostUser, vm.domain_name), "(rm -rf %s)" % (volumePath)], config.Config.DOCKER_RM_TIMEOUT) self.log.debug('Deleted volume %s' % instanceName) @@ -311,3 +294,22 @@ def existsVM(self, vm): vmnames = [vm.name for vm in vms] return (vm.name in vmname) + def getImages(self): + """ getImages - Executes `docker images` on every host and + returns a list of images that can be used to boot a docker + container with. This function is a lot of parsing and so + can break easily. + """ + result = set() + for host in self.hosts: + o = subprocess.check_output(["ssh"] + DistDocker._SSH_FLAGS + + ["%s@%s" % (self.hostUser, host), + "(docker images)"]) + o_l.split('\n') + o_l.pop() + o_l.reverse() + o_l.pop() + for row in o_l: + row_l = row.split(' ') + result.add(row_l[0]) + return list(result) From fc6bb3053909d13e31374f27d4f16f6c2d1da6f7 Mon Sep 17 00:00:00 2001 From: Mihir Pandya Date: Wed, 15 Apr 2015 01:13:59 -0400 Subject: [PATCH 09/33] Clean up and comments. --- vmms/distDocker.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/vmms/distDocker.py b/vmms/distDocker.py index 99c23696..b7f63b2c 100644 --- a/vmms/distDocker.py +++ b/vmms/distDocker.py @@ -59,10 +59,6 @@ def timeoutWithReturnStatus(command, time_out, returnValue = 0): stderr=subprocess.STDOUT) return ret -# -# User defined exceptions -# - class DistDocker: _SSH_FLAGS = ["-q", "-i", "/Users/Mihir/Documents/prog/Autolab/mp_tango.pem", @@ -106,7 +102,8 @@ def getVolumePath(self, instanceName): # VMMS API functions # def initializeVM(self, vm): - """ initializeVM - Nothing to do for initializeVM + """ initializeVM - Assign a host machine for this container to + run on. """ self.hostLock.acquire() host = self.hosts[self.hostIdx] @@ -114,12 +111,14 @@ def initializeVM(self, vm): if self.hostIdx >= len(self.hosts): self.hostIdx = 0 self.hostLock.release() + vm.domain_name = host - self.log.info("Assign host %s to VM %s." % (host, vm.name)) + self.log.info("Assigned host %s to VM %s." % (host, vm.name)) return vm def waitVM(self, vm, max_secs): - """ waitVM - Nothing to do for waitVM + """ waitVM - Wait at most max_secs for a VM to become + ready. Return error if it takes too long. """ start_time = time.time() @@ -197,6 +196,9 @@ def runJob(self, vm, runTimeout, maxOutputFileSize): config.Config.VM_ULIMIT_FILE_SIZE, runTimeout, config.Config.MAX_OUTPUT_FILE_SIZE) + # IMPORTANT: The single and double quotes are important, since we + # are switching to the autolab user and then running + # bash commands. setupCmd = 'cp -r mount/* autolab/; su autolab -c "%s"; \ cp output/feedback mount/feedback' % autodriverCmd @@ -266,9 +268,8 @@ def safeDestroyVM(self, vm): return def getVMs(self): - """ getVMs - Executes and parses `docker ps` + """ getVMs - Get all volumes of docker containers """ - # Get all volumes of docker containers machines = [] volumePath = self.getVolumePath('') for host in self.hosts: @@ -288,7 +289,8 @@ def getVMs(self): return machines def existsVM(self, vm): - """ existsVM + """ existsVM - Returns true if volume exists for corresponding + container. """ vms = self.getVMs() vmnames = [vm.name for vm in vms] @@ -312,4 +314,5 @@ def getImages(self): for row in o_l: row_l = row.split(' ') result.add(row_l[0]) + return list(result) From a57460a3c36d50f790aa54c53719374fe4e90e71 Mon Sep 17 00:00:00 2001 From: Mihir Pandya Date: Wed, 15 Apr 2015 01:44:15 -0400 Subject: [PATCH 10/33] getImages for Tashi. --- config.template.py | 3 --- vmms/tashiSSH.py | 14 ++++++++++++-- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/config.template.py b/config.template.py index e89a9ec6..a0f59d4b 100644 --- a/config.template.py +++ b/config.template.py @@ -97,9 +97,6 @@ class Config: # Default vm pool size POOL_SIZE = 2 - # Path for tashi images - TASHI_IMAGE_PATH = '' - # Optionally log finer-grained timing information LOG_TIMING = False diff --git a/vmms/tashiSSH.py b/vmms/tashiSSH.py index bead3756..a71d9f9b 100644 --- a/vmms/tashiSSH.py +++ b/vmms/tashiSSH.py @@ -85,8 +85,6 @@ def timeoutWithReturnStatus(command, time_out, returnValue=0): # User defined exceptions # # tashiCall() exception - - class tashiCallError(Exception): pass @@ -96,6 +94,8 @@ class TashiSSH: "-o", "StrictHostKeyChecking=no", "-o", "GSSAPIAuthentication=no"] + TASHI_IMAGE_PATH = '/raid/tashi/images' + def __init__(self): self.config = getConfig(["Client"])[0] self.client = createClient(self.config) @@ -405,3 +405,13 @@ def existsVM(self, vm): if vm.instance_id == instance.id: return True return False + + def getImages(self): + """ getImages - Lists all images in TASHI_IMAGE_PATH + """ + imgList = os.listdir(Config.TASHI_IMAGE_PATH) + result = [] + for img in imgList: + (base, ext) = img.split('.') + result.append(base) + return result From 66136c4b6d8a5d04db096b51d32ac26590020484 Mon Sep 17 00:00:00 2001 From: Mihir Pandya Date: Mon, 20 Apr 2015 16:04:25 -0400 Subject: [PATCH 11/33] Committing changes before checking out. --- clients/tango-rest.py | 10 +++++----- restful-tango/tangoREST.py | 6 +++++- tangod.py | 4 ++-- vmms/Dockerfile | 16 +++++++++++++++- vmms/tashiSSH.py | 8 +------- 5 files changed, 28 insertions(+), 16 deletions(-) diff --git a/clients/tango-rest.py b/clients/tango-rest.py index e107a5f2..0e0c12a9 100755 --- a/clients/tango-rest.py +++ b/clients/tango-rest.py @@ -35,7 +35,7 @@ parser.add_argument('-o', '--open', action='store_true', help=open_help) upload_help = 'Uploads a file. Must specify key with -k, courselab with -l, and filename with --filename.' parser.add_argument('-u', '--upload', action='store_true', help=upload_help) -addJob_help = 'Submit a job. Must specify key with -k, courselab with -l, and input files with --infiles. Modify defaults with --image (rhel.img), --outputFile (result.out), --jobname (test_job), --maxsize(0), --timeout (0).' +addJob_help = 'Submit a job. Must specify key with -k, courselab with -l, and input files with --infiles. Modify defaults with --image (rhel), --outputFile (result.out), --jobname (test_job), --maxsize(0), --timeout (0).' parser.add_argument('-a', '--addJob', action='store_true', help=addJob_help) poll_help = 'Poll a given output file. Must specify key with -k, courselab with -l. Modify defaults with --outputFile (result.out).' parser.add_argument('-p', '--poll', action='store_true', help=poll_help) @@ -43,9 +43,9 @@ parser.add_argument('-i', '--info', action='store_true', help=info_help) jobs_help = 'Obtain information of live jobs (deadJobs == 0) or dead jobs (deadJobs == 1). Must specify key with -k. Modify defaults with --deadJobs (0).' parser.add_argument('-j', '--jobs', action='store_true', help=jobs_help) -pool_help = 'Obtain information about a pool of VMs spawned from a specific image. Must specify key with -k. Modify defaults with --image (rhel.img).' +pool_help = 'Obtain information about a pool of VMs spawned from a specific image. Must specify key with -k. Modify defaults with --image (rhel).' parser.add_argument('--pool', action='store_true', help=pool_help) -prealloc_help = 'Create a pool of instances spawned from a specific image. Must specify key with -k. Modify defaults with --image (rhel.img), --num (2), --vmms (tashiSSH), --cores (1), and --memory (512).' +prealloc_help = 'Create a pool of instances spawned from a specific image. Must specify key with -k. Modify defaults with --image (rhel), --num (2), --vmms (tashiSSH), --cores (1), and --memory (512).' parser.add_argument('--prealloc', action='store_true', help=prealloc_help) parser.add_argument('--runJob', help='Run a job from a specific directory') @@ -54,8 +54,8 @@ parser.add_argument('--vmms', default='tashiSSH', help='Choose vmms between localSSH, ec2SSH, tashiSSH') -parser.add_argument('--image', default='rhel.img', - help='VM image name (default "rhel.img")') +parser.add_argument('--image', default='rhel', + help='VM image name (default "rhel")') parser.add_argument( '--infiles', nargs='+', diff --git a/restful-tango/tangoREST.py b/restful-tango/tangoREST.py index a7340421..cafafc4c 100644 --- a/restful-tango/tangoREST.py +++ b/restful-tango/tangoREST.py @@ -45,6 +45,7 @@ def __init__(self): self.invalid_image = self.create(-1, "Invalid image name") self.invalid_prealloc_size = self.create(-1, "Invalid prealloc size") self.pool_not_found = self.create(-1, "Pool not found") + self.prealloc_failed = self.create(-1, "Preallocate VM failed") def create(self, id, msg): """ create - Constructs a dict with the given ID and message @@ -442,9 +443,12 @@ def prealloc(self, key, image, num, vmStr): ret = self.tango.preallocVM(vm, int(num)) if ret == -1: + self.log.error("Prealloc failed") + return self.status.prealloc_failed + if ret == -2: self.log.error("Invalid prealloc size") return self.status.invalid_prealloc_size - if ret == -2: + if ret == -3: self.log.error("Invalid image name") return self.status.invalid_image self.log.info("Successfully preallocated VMs") diff --git a/tangod.py b/tangod.py index 17674600..2647a2d7 100755 --- a/tangod.py +++ b/tangod.py @@ -113,10 +113,10 @@ def preallocVM(self, vm, num): try: vmms = self.vmms[vm.vmms] if not vm or num < 0: - return -1 + return -2 if vm.name not in vmms.getImages(): self.log.error("Invalid image name") - return -2 + return -3 self.preallocator.update(vm, num) return 0 except Exception as err: diff --git a/vmms/Dockerfile b/vmms/Dockerfile index e9053445..d5fad6c7 100644 --- a/vmms/Dockerfile +++ b/vmms/Dockerfile @@ -7,6 +7,7 @@ RUN apt-get update RUN apt-get install -y gcc RUN apt-get install -y make RUN apt-get install -y build-essential +RUN apt-get install -y wget # Install autodriver WORKDIR /home @@ -23,12 +24,25 @@ RUN make clean && make RUN cp autodriver /usr/bin/autodriver RUN chmod +s /usr/bin/autodriver +# Install C0 +WORKDIR /home +RUN wget http://c0.typesafety.net/dist/cc0-v0440-linux3.18.1-64bit-bin.tgz +RUN tar -xvzf cc0-* +WORKDIR /home/cc0 +RUN bin/cc0 -d doc/src/exp.c0 doc/src/exp-test.c0 +RUN ./a.out +RUN cp bin/cc0 /usr/bin/cc0 + # Clean up WORKDIR /home RUN apt-get remove -y git +RUN apt-get remove -y wget RUN apt-get -y autoremove RUN rm -rf Tango/ +RUN rm -f cc0-* +RUN rm -rf cc0/ # Check installation RUN ls -l /home -RUN which autodriver \ No newline at end of file +RUN which autodriver +RUN which cc0 \ No newline at end of file diff --git a/vmms/tashiSSH.py b/vmms/tashiSSH.py index a71d9f9b..1d98c6f7 100644 --- a/vmms/tashiSSH.py +++ b/vmms/tashiSSH.py @@ -178,7 +178,6 @@ def initializeVM(self, vm): """ initializeVM - Ask Tashi to create a new VM instance """ # Create the instance - vm.image = vm.image + ".img" instance = self.tangoMachineToInstance(vm) tashiInst = self.tashiCall("createVm", [instance]) vm.instance_id = tashiInst.id @@ -409,9 +408,4 @@ def existsVM(self, vm): def getImages(self): """ getImages - Lists all images in TASHI_IMAGE_PATH """ - imgList = os.listdir(Config.TASHI_IMAGE_PATH) - result = [] - for img in imgList: - (base, ext) = img.split('.') - result.append(base) - return result + return os.listdir(Config.TASHI_IMAGE_PATH) From 9329218bc902c1591f1f2286133a2b98db54d1db Mon Sep 17 00:00:00 2001 From: Mihir Pandya Date: Tue, 21 Apr 2015 15:54:43 -0400 Subject: [PATCH 12/33] Tested getImages and validateJob with Tashi. --- tangod.py | 3 ++- vmms/tashiSSH.py | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/tangod.py b/tangod.py index 2647a2d7..f6f4283b 100755 --- a/tangod.py +++ b/tangod.py @@ -259,7 +259,8 @@ def validateJob(job, vmms): (datetime.utcnow().ctime(), job.vm.image)) errors += 1 else: - job.vm.name = job.vm.image + (name, ext) = os.path.splitext(job.vm.image) + job.vm.name = name if not job.vm.vmms: log.error("validateJob: Missing job.vm.vmms") diff --git a/vmms/tashiSSH.py b/vmms/tashiSSH.py index 1d98c6f7..7bf0966b 100644 --- a/vmms/tashiSSH.py +++ b/vmms/tashiSSH.py @@ -406,6 +406,7 @@ def existsVM(self, vm): return False def getImages(self): - """ getImages - Lists all images in TASHI_IMAGE_PATH + """ getImages - Lists all images in TASHI_IMAGE_PATH that have the + .img extension """ - return os.listdir(Config.TASHI_IMAGE_PATH) + return [img for img in os.listdir(Config.TASHI_IMAGE_PATH) if img.endswith('.img')] From 9d12052f79c529168f7a41115a9224a48c86d356 Mon Sep 17 00:00:00 2001 From: Mihir Pandya Date: Tue, 21 Apr 2015 16:01:48 -0400 Subject: [PATCH 13/33] Checking name in prealloc. --- tangod.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tangod.py b/tangod.py index f6f4283b..af0ecaf3 100755 --- a/tangod.py +++ b/tangod.py @@ -114,9 +114,11 @@ def preallocVM(self, vm, num): vmms = self.vmms[vm.vmms] if not vm or num < 0: return -2 - if vm.name not in vmms.getImages(): + if vm.image not in vmms.getImages(): self.log.error("Invalid image name") return -3 + (name, ext) = os.path.splitext(vm.image) + vm.name = name self.preallocator.update(vm, num) return 0 except Exception as err: From b4a602c8309a78b1d86221b05903923584d19c3f Mon Sep 17 00:00:00 2001 From: Mihir Pandya Date: Tue, 21 Apr 2015 16:40:03 -0400 Subject: [PATCH 14/33] Updated API for pool to return all pools if no pool name is given. --- preallocator.py | 10 ++++++++-- restful-tango/server.py | 5 ++--- restful-tango/tangoREST.py | 12 ++++++++---- 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/preallocator.py b/preallocator.py index bbfdabf6..78c4dfcf 100644 --- a/preallocator.py +++ b/preallocator.py @@ -226,11 +226,17 @@ def destroyVM(self, vmName, id): else: return -1 + def getAllPools(self): + result = {} + for vmName in self.machines.keys(): + result[vmName] = self.getPool(vmName) + return result + def getPool(self, vmName): """ getPool - returns the members of a pool and its free list """ result = {} - result["pool"] = [] + result["total"] = [] result["free"] = [] if vmName not in self.machines.keys(): return result @@ -246,6 +252,6 @@ def getPool(self, vmName): self.machines.set(vmName, machine) self.lock.release() - result["pool"] = self.machines.get(vmName)[0] + result["total"] = self.machines.get(vmName)[0] result["free"] = free_list return result diff --git a/restful-tango/server.py b/restful-tango/server.py index 4977cb14..a30c1704 100755 --- a/restful-tango/server.py +++ b/restful-tango/server.py @@ -102,11 +102,10 @@ def get(self, key, deadJobs): """ get - Handles the get request to jobs.""" return tangoREST.jobs(key, deadJobs) - class PoolHandler(tornado.web.RequestHandler): @unblock - def get(self, key, image): + def get(self, key, image=''): """ get - Handles the get request to pool.""" return tangoREST.pool(key, image) @@ -128,7 +127,7 @@ def post(self, key, image, num): (SHA1_KEY, COURSELAB, OUTPUTFILE), PollHandler), (r"/info/(%s)/" % (SHA1_KEY), InfoHandler), (r"/jobs/(%s)/(%s)/" % (SHA1_KEY, DEADJOBS), JobsHandler), - (r"/pool/(%s)/(%s)/" % (SHA1_KEY, IMAGE), PoolHandler), + (r"/pool/(%s)/(%s)" % (SHA1_KEY, '.*'), PoolHandler), (r"/prealloc/(%s)/(%s)/(%s)/" % (SHA1_KEY, IMAGE, NUM), PreallocHandler), ]) diff --git a/restful-tango/tangoREST.py b/restful-tango/tangoREST.py index cafafc4c..d382bb01 100644 --- a/restful-tango/tangoREST.py +++ b/restful-tango/tangoREST.py @@ -415,14 +415,18 @@ def pool(self, key, image): """ self.log.debug("Received pool request(%s, %s)" % (key, image)) if self.validateKey(key): - info = self.preallocator.getPool(image) - if len(info["pool"]) == 0: + if image == "": + pools = self.preallocator.getAllPools() + else: + info = self.preallocator.getPool(image) + pools = {} + pools[image] = info + if len(pools) == 0: self.log.info("Pool not found: %s" % image) return self.status.pool_not_found self.log.info("Pool image found: %s" % image) result = self.status.obtained_pool - result["total"] = info["pool"] - result["free"] = info["free"] + result["pools"] = pools return result else: self.log.info("Key not recognized: %s" % key) From 1ad7ff604fec58760c0c417f1daee1901aec550f Mon Sep 17 00:00:00 2001 From: Mihir Pandya Date: Wed, 22 Apr 2015 21:20:39 -0400 Subject: [PATCH 15/33] Return empty pool instead of failure. Fixed argument parsing. --- restful-tango/server.py | 8 +++++++- restful-tango/tangoREST.py | 3 --- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/restful-tango/server.py b/restful-tango/server.py index a30c1704..fb73aa08 100755 --- a/restful-tango/server.py +++ b/restful-tango/server.py @@ -105,8 +105,13 @@ def get(self, key, deadJobs): class PoolHandler(tornado.web.RequestHandler): @unblock - def get(self, key, image=''): + def get(self, key): """ get - Handles the get request to pool.""" + image = '' + if '/' in key: + key_l = key.split('/') + key = key_l[0] + image = key_l[1] return tangoREST.pool(key, image) @@ -127,6 +132,7 @@ def post(self, key, image, num): (SHA1_KEY, COURSELAB, OUTPUTFILE), PollHandler), (r"/info/(%s)/" % (SHA1_KEY), InfoHandler), (r"/jobs/(%s)/(%s)/" % (SHA1_KEY, DEADJOBS), JobsHandler), + (r"/pool/(%s)/" % (SHA1_KEY), PoolHandler), (r"/pool/(%s)/(%s)" % (SHA1_KEY, '.*'), PoolHandler), (r"/prealloc/(%s)/(%s)/(%s)/" % (SHA1_KEY, IMAGE, NUM), PreallocHandler), ]) diff --git a/restful-tango/tangoREST.py b/restful-tango/tangoREST.py index d382bb01..bba32337 100644 --- a/restful-tango/tangoREST.py +++ b/restful-tango/tangoREST.py @@ -421,9 +421,6 @@ def pool(self, key, image): info = self.preallocator.getPool(image) pools = {} pools[image] = info - if len(pools) == 0: - self.log.info("Pool not found: %s" % image) - return self.status.pool_not_found self.log.info("Pool image found: %s" % image) result = self.status.obtained_pool result["pools"] = pools From 884c01d91bd4235144179377a01864b57786a6fd Mon Sep 17 00:00:00 2001 From: Mihir Pandya Date: Wed, 22 Apr 2015 21:26:10 -0400 Subject: [PATCH 16/33] Pool route was overloaded. --- restful-tango/server.py | 1 - 1 file changed, 1 deletion(-) diff --git a/restful-tango/server.py b/restful-tango/server.py index fb73aa08..eafc7306 100755 --- a/restful-tango/server.py +++ b/restful-tango/server.py @@ -133,7 +133,6 @@ def post(self, key, image, num): (r"/info/(%s)/" % (SHA1_KEY), InfoHandler), (r"/jobs/(%s)/(%s)/" % (SHA1_KEY, DEADJOBS), JobsHandler), (r"/pool/(%s)/" % (SHA1_KEY), PoolHandler), - (r"/pool/(%s)/(%s)" % (SHA1_KEY, '.*'), PoolHandler), (r"/prealloc/(%s)/(%s)/(%s)/" % (SHA1_KEY, IMAGE, NUM), PreallocHandler), ]) From 969d4c08370bdec8427d1dd484361a5a62b51f98 Mon Sep 17 00:00:00 2001 From: Mihir Pandya Date: Wed, 22 Apr 2015 21:37:18 -0400 Subject: [PATCH 17/33] Fixed status messages. --- preallocator.py | 4 ++-- restful-tango/tangoREST.py | 13 ++++++++++--- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/preallocator.py b/preallocator.py index 78c4dfcf..bc3c9a6a 100644 --- a/preallocator.py +++ b/preallocator.py @@ -236,11 +236,11 @@ def getPool(self, vmName): """ getPool - returns the members of a pool and its free list """ result = {} - result["total"] = [] - result["free"] = [] if vmName not in self.machines.keys(): return result + result["total"] = [] + result["free"] = [] free_list = [] self.lock.acquire() size = self.machines.get(vmName)[1].qsize() diff --git a/restful-tango/tangoREST.py b/restful-tango/tangoREST.py index bba32337..f880aa26 100644 --- a/restful-tango/tangoREST.py +++ b/restful-tango/tangoREST.py @@ -420,9 +420,16 @@ def pool(self, key, image): else: info = self.preallocator.getPool(image) pools = {} - pools[image] = info - self.log.info("Pool image found: %s" % image) - result = self.status.obtained_pool + if len(info) > 0: + pools[image] = info + + if len(pools) > 0: + self.log.info("Pool image found: %s" % image) + result = self.status.obtained_pool + else: + self.log.info("Invalid image name: %s" % image) + result = self.status.pool_not_found + result["pools"] = pools return result else: From c1f01f1c90434e4a9d4380d1140e05fe765e90d7 Mon Sep 17 00:00:00 2001 From: Mihir Pandya Date: Sun, 26 Apr 2015 19:27:55 -0400 Subject: [PATCH 18/33] Fixed getImages --- vmms/distDocker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vmms/distDocker.py b/vmms/distDocker.py index b7f63b2c..125775fe 100644 --- a/vmms/distDocker.py +++ b/vmms/distDocker.py @@ -307,7 +307,7 @@ def getImages(self): o = subprocess.check_output(["ssh"] + DistDocker._SSH_FLAGS + ["%s@%s" % (self.hostUser, host), "(docker images)"]) - o_l.split('\n') + o_l = o.split('\n') o_l.pop() o_l.reverse() o_l.pop() From 66ae6a6a16f6c7269e85b4eec25d9a3c34b86b42 Mon Sep 17 00:00:00 2001 From: Mihir Pandya Date: Sun, 26 Apr 2015 19:56:51 -0400 Subject: [PATCH 19/33] Added a file for all hosts to be listed. --- hosts | 2 ++ vmms/distDocker.py | 16 +++++++++++++++- 2 files changed, 17 insertions(+), 1 deletion(-) create mode 100644 hosts diff --git a/hosts b/hosts new file mode 100644 index 00000000..265c7c67 --- /dev/null +++ b/hosts @@ -0,0 +1,2 @@ +54.186.238.205 +54.68.89.235 \ No newline at end of file diff --git a/vmms/distDocker.py b/vmms/distDocker.py index 125775fe..ca4d8b9f 100644 --- a/vmms/distDocker.py +++ b/vmms/distDocker.py @@ -65,13 +65,15 @@ class DistDocker: "-o", "StrictHostKeyChecking=no", "-o", "GSSAPIAuthentication=no"] + HOSTS_FILE = 'hosts' + def __init__(self): """ Checks if the machine is ready to run docker containers. Initialize boot2docker if running on OS X. """ try: self.log = logging.getLogger("DistDocker") - self.hosts = ['54.186.238.205', '54.68.89.235'] + self.hosts = self.readHosts() self.hostIdx = 0 self.hostLock = threading.Lock() self.hostUser = "ubuntu" @@ -84,6 +86,18 @@ def __init__(self): self.log.error(str(e)) exit(1) + def readHosts(self): + f = open(self.HOSTS_FILE, 'r') + hosts = [] + hosts_str = f.read() + hosts_l = hosts_str.split('\n') + for host in hosts_l: + if len(host) > 0: + hosts.append(host) + self.log.info("Current host machines: %s" % hosts) + return hosts + + def instanceName(self, id, name): """ instanceName - Constructs a Docker instance name. Always use this function when you need a Docker instance name. Never generate From 3f6953d13f54ac2fde46c094e8e92bfd153911ec Mon Sep 17 00:00:00 2001 From: Mihir Pandya Date: Sun, 26 Apr 2015 23:51:43 -0400 Subject: [PATCH 20/33] Fixed existsVM --- vmms/distDocker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vmms/distDocker.py b/vmms/distDocker.py index ca4d8b9f..c8f6fce7 100644 --- a/vmms/distDocker.py +++ b/vmms/distDocker.py @@ -308,7 +308,7 @@ def existsVM(self, vm): """ vms = self.getVMs() vmnames = [vm.name for vm in vms] - return (vm.name in vmname) + return (vm.name in vmnames) def getImages(self): """ getImages - Executes `docker images` on every host and From 3462ad676cf8538bad6f282401c592677afb8ebd Mon Sep 17 00:00:00 2001 From: Dave OHallaron Date: Tue, 16 Jun 2015 12:55:13 -0400 Subject: [PATCH 21/33] Updated config file to use localDocker as default vmms --- config.template.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/config.template.py b/config.template.py index a0f59d4b..77df1d68 100644 --- a/config.template.py +++ b/config.template.py @@ -32,8 +32,9 @@ class Config: COURSELABS = "courselabs" # VMMS to use. Must be set to a VMMS implemented in vmms/ before - # starting Tango. Options are: "localSSH", "tashiSSH", "ec2SSH" - VMMS_NAME = "localSSH" + # starting Tango. Options are: "localDocker", "distDocker", + # "localSSH", "tashiSSH", "ec2SSH" + VMMS_NAME = "localDocker" ##### # Part 2: Constants that shouldn't need to change very often. @@ -88,7 +89,7 @@ class Config: # Frequency of retrying SSH connections (in seconds) SSH_INTERVAL = 0.5 - # Give Tashi this many seconds to destroy a VM before giving up + # Give VMMS this many seconds to destroy a VM before giving up DESTROY_SECS = 5 # Time to wait between creating VM instances to give DNS time to cool down From e5d5b8cdff7683dce041a212e2437898ccf488e5 Mon Sep 17 00:00:00 2001 From: Dave OHallaron Date: Tue, 16 Jun 2015 12:58:38 -0400 Subject: [PATCH 22/33] Updated config template to use localDocker as the default VMMS --- config.template.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config.template.py b/config.template.py index 77df1d68..3d7299e3 100644 --- a/config.template.py +++ b/config.template.py @@ -33,7 +33,7 @@ class Config: # VMMS to use. Must be set to a VMMS implemented in vmms/ before # starting Tango. Options are: "localDocker", "distDocker", - # "localSSH", "tashiSSH", "ec2SSH" + # "tashiSSH", "localSSH", and "ec2SSH" VMMS_NAME = "localDocker" ##### From 5ec90a2095283b7b62bea9864754dc651ca0acde Mon Sep 17 00:00:00 2001 From: Yiming Zong Date: Wed, 1 Jul 2015 09:14:30 -0700 Subject: [PATCH 23/33] Added Circle CI testing badge --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index cb41fcec..60b35dcb 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ A brief overview of the Tango respository: * vmms - VMM system library implementations * restful-tango - HTTP server layer on the main tango -## Testing +## Testing [![Circle CI](https://circleci.com/gh/autolab/Tango.svg?style=svg)](https://circleci.com/gh/autolab/Tango) To test whether Tango is running and accepting jobs, a tango command-line client is included in `clients/` along with sample jobs. From eb18878d49c0ca718fc994919ed08b92aa00a77b Mon Sep 17 00:00:00 2001 From: Mihir Pandya Date: Wed, 1 Jul 2015 14:56:45 -0700 Subject: [PATCH 24/33] Increased ceiling for timeout in runJob --- vmms/Dockerfile | 16 +------------- vmms/Dockerfile_122 | 49 ++++++++++++++++++++++++++++++++++++++++++ vmms/Dockerfile_ubuntu | 34 +++++++++++++++++++++++++++++ vmms/distDocker.py | 2 +- vmms/localDocker.py | 2 +- vmms/setup.sh | 16 ++++++++++++++ 6 files changed, 102 insertions(+), 17 deletions(-) create mode 100644 vmms/Dockerfile_122 create mode 100644 vmms/Dockerfile_ubuntu create mode 100644 vmms/setup.sh diff --git a/vmms/Dockerfile b/vmms/Dockerfile index d5fad6c7..e9053445 100644 --- a/vmms/Dockerfile +++ b/vmms/Dockerfile @@ -7,7 +7,6 @@ RUN apt-get update RUN apt-get install -y gcc RUN apt-get install -y make RUN apt-get install -y build-essential -RUN apt-get install -y wget # Install autodriver WORKDIR /home @@ -24,25 +23,12 @@ RUN make clean && make RUN cp autodriver /usr/bin/autodriver RUN chmod +s /usr/bin/autodriver -# Install C0 -WORKDIR /home -RUN wget http://c0.typesafety.net/dist/cc0-v0440-linux3.18.1-64bit-bin.tgz -RUN tar -xvzf cc0-* -WORKDIR /home/cc0 -RUN bin/cc0 -d doc/src/exp.c0 doc/src/exp-test.c0 -RUN ./a.out -RUN cp bin/cc0 /usr/bin/cc0 - # Clean up WORKDIR /home RUN apt-get remove -y git -RUN apt-get remove -y wget RUN apt-get -y autoremove RUN rm -rf Tango/ -RUN rm -f cc0-* -RUN rm -rf cc0/ # Check installation RUN ls -l /home -RUN which autodriver -RUN which cc0 \ No newline at end of file +RUN which autodriver \ No newline at end of file diff --git a/vmms/Dockerfile_122 b/vmms/Dockerfile_122 new file mode 100644 index 00000000..f021598d --- /dev/null +++ b/vmms/Dockerfile_122 @@ -0,0 +1,49 @@ +# Autolab - autograding docker image + +FROM ubuntu:14.04 +MAINTAINER Mihir Pandya + +# Install necessary packages +RUN apt-get update +RUN apt-get install -y gcc +RUN apt-get install -y make +RUN apt-get install -y build-essential +RUN apt-get install -y wget +RUN apt-get install -y git + +# Install autodriver +WORKDIR /home +RUN useradd autolab +RUN useradd autograde +RUN mkdir autolab autograde output +RUN chown autolab:autolab autolab +RUN chown autolab:autolab output +RUN chown autograde:autograde autograde +RUN git clone https://github.com/autolab/Tango.git +WORKDIR Tango/autodriver +RUN make clean && make +RUN cp autodriver /usr/bin/autodriver +RUN chmod +s /usr/bin/autodriver + +# Install C0 +WORKDIR /home +RUN wget http://c0.typesafety.net/dist/cc0-v0440-linux3.18.1-64bit-bin.tgz +RUN tar -xvzf cc0-* +WORKDIR /home/cc0 +RUN bin/cc0 -d doc/src/exp.c0 doc/src/exp-test.c0 +RUN ./a.out +RUN cp bin/cc0 /usr/bin/cc0 + +# Clean up +WORKDIR /home +RUN apt-get remove -y git +RUN apt-get remove -y wget +RUN apt-get -y autoremove +RUN rm -rf Tango/ +RUN rm -f cc0-* +RUN rm -rf cc0/ + +# Check installation +RUN ls -l /home +RUN which autodriver +RUN which cc0 \ No newline at end of file diff --git a/vmms/Dockerfile_ubuntu b/vmms/Dockerfile_ubuntu new file mode 100644 index 00000000..e9053445 --- /dev/null +++ b/vmms/Dockerfile_ubuntu @@ -0,0 +1,34 @@ +# Autolab - autograding docker image + +FROM ubuntu:14.04 +MAINTAINER Mihir Pandya + +RUN apt-get update +RUN apt-get install -y gcc +RUN apt-get install -y make +RUN apt-get install -y build-essential + +# Install autodriver +WORKDIR /home +RUN useradd autolab +RUN useradd autograde +RUN mkdir autolab autograde output +RUN chown autolab:autolab autolab +RUN chown autolab:autolab output +RUN chown autograde:autograde autograde +RUN apt-get install -y git +RUN git clone https://github.com/autolab/Tango.git +WORKDIR Tango/autodriver +RUN make clean && make +RUN cp autodriver /usr/bin/autodriver +RUN chmod +s /usr/bin/autodriver + +# Clean up +WORKDIR /home +RUN apt-get remove -y git +RUN apt-get -y autoremove +RUN rm -rf Tango/ + +# Check installation +RUN ls -l /home +RUN which autodriver \ No newline at end of file diff --git a/vmms/distDocker.py b/vmms/distDocker.py index c8f6fce7..e0090750 100644 --- a/vmms/distDocker.py +++ b/vmms/distDocker.py @@ -223,7 +223,7 @@ def runJob(self, vm, runTimeout, maxOutputFileSize): ret = timeout(["ssh"] + DistDocker._SSH_FLAGS + ["%s@%s" % (self.hostUser, vm.domain_name), args], - config.Config.RUNJOB_TIMEOUT) + runTimeout * 2) self.log.debug('runJob return status %d' % ret) diff --git a/vmms/localDocker.py b/vmms/localDocker.py index 679bb56e..2c36c491 100644 --- a/vmms/localDocker.py +++ b/vmms/localDocker.py @@ -145,7 +145,7 @@ def runJob(self, vm, runTimeout, maxOutputFileSize): autodriverCmd] self.log.debug('Running job: %s' % str(args)) - ret = timeout(args, runTimeout) + ret = timeout(args, runTimeout * 2) self.log.debug('runJob returning %d' % ret) return ret diff --git a/vmms/setup.sh b/vmms/setup.sh new file mode 100644 index 00000000..1e459c71 --- /dev/null +++ b/vmms/setup.sh @@ -0,0 +1,16 @@ +#!/bin/bash +# +# This script is to set up a shark machine to host docker +# containers for autograding. System must be running ubuntu +# kernel version 3.13 or higher. Check with `uname -r` +# +# Run as root + +wget -qO- https://get.docker.com/ | sh +adduser autolab +usermod -aG docker autolab +mkdir /home/autolab/volumes/ +mkdir /home/autolab/.ssh +# Append greatwhite:/usr/share/mpandyaAutolab/Tango/vmms/id_rsa.pub to +# host:/home/autolab/.ssh/authorized_keys +# Make docker image called `rhel` From efd17da907ef122cf51b8d0d3a753cda85d63cb5 Mon Sep 17 00:00:00 2001 From: Yashas Kumar Date: Thu, 10 Sep 2015 18:58:06 -0400 Subject: [PATCH 25/33] Update README Rewordings and edits to make the instructions more clear --- README.md | 58 +++++++++++++++++++++++++++++++------------------------ 1 file changed, 33 insertions(+), 25 deletions(-) diff --git a/README.md b/README.md index 60b35dcb..e218bd82 100644 --- a/README.md +++ b/README.md @@ -1,43 +1,51 @@ ![Welcome to Autolab](https://github.com/autolab/Autolab/blob/master/public/images/autolab_logo.png) -Tango +Tango [![Circle CI](https://circleci.com/gh/autolab/Tango.svg?style=svg)](https://circleci.com/gh/autolab/Tango) ====== -Tango is a standalone, RESTful Web service that provides autograding services for [Autolab](https://github.com/autolab/Autolab). This is the main repository that includes the application layer of the project. +Tango is a standalone RESTful Web service that runs and manages jobs. A job is a set of files that must satisfy the following constraints: -Tango allows submission of jobs that are to be run in pre-configured VMs. Tango also supports different Virtual Machine Management (VMM) systems by providing a high level VMM API. Users can implement the high level VMM API for a hypervisor or VMM of your choice such as KVM, Xen, Docker or EC2. +1. There must be exactly one `Makefile` that runs the job. +2. The output for the job should be printed to stdout. -Tango was started as part of the Autolab project at Carnegie Mellon University and has been extensively used for autograding programming assignments in CMU courses. +Example jobs are provided for the user to peruse in `clients/`. Tango has a [REST API](https://github.com/autolab/Tango/wiki/Tango-REST-API) which is used for job submission. -## Getting Started - -The easiest way to get started with Tango is by installing it on a vanilla EC2 Ubuntu instance. The detailed instructions can be found [here](https://github.com/autolab/Tango/wiki/Setting-up-Tango-and-VMs-on-Amazon-EC2). - -Tango has a REST API which can be used for job submission and other administrative tasks. The documentation of the API can be found [here](https://github.com/autolab/Tango/wiki/Tango-REST-API) - -In order to run Tango locally, the VMM API needs to be implemented such that jobs run locally. This is currently work in progress. +Upon receiving a job, Tango will copy all of the job's input files into a VM, run `make`, and copy the resulting output back to the host machine. Tango jobs are run in pre-configured VMs. Support for various Virtual Machine Management Systems (VMMSs) like KVM, Docker, or Amazon EC2 can be added by implementing a high level VMMS API that Tango provides. A brief overview of the Tango respository: -* tangod.py - Main tango program -* jobQueue.py - Manages the job queue -* preallocator.py - Manages a pool of preallocated VMs -* worker.py - Shepherds a job through its execution -* vmms - VMM system library implementations -* restful-tango - HTTP server layer on the main tango +* `tangod.py` - Main tango server +* `jobQueue.py` - Manages the job queue +* `jobManager.py` - Assigns jobs to free VMs +* `worker.py` - Shepherds a job through its execution +* `preallocator.py` - Manages pools of VMs +* `vmms/` - VMMS library implementations +* `restful-tango/` - HTTP server layer on the main Tango -## Testing [![Circle CI](https://circleci.com/gh/autolab/Tango.svg?style=svg)](https://circleci.com/gh/autolab/Tango) +Tango was developed as a distributed grading system for [Autolab](https://github.com/autolab/Autolab) at Carnegie Mellon University and has been extensively used for autograding programming assignments in CMU courses. -To test whether Tango is running and accepting jobs, a tango command-line client is included in `clients/` along with sample jobs. +## Using Tango -## Contributing +Please feel free to use Tango at your school/organization. If you run into any problems with the steps below, you can reach the core developers at `autolab-dev@andrew.cmu.edu` and we would be happy to help. -Contributing to Tango is greatly encouraged! Future issues and features will be posted on Github Issues. Also look at [Contributing to Autolab Guide](https://github.com/autolab/Autolab) for guidelines on how to proceed. [Join us!](http://contributors.autolabproject.org) +1. [Follow the steps to set up Tango](https://github.com/autolab/Tango/wiki/Set-up-Tango). +2. [Read the documentation for the REST API](https://github.com/autolab/Tango/wiki/Tango-REST-API). +3. Read the documentation for the VMMS API - coming soon. +4. [Test whether Tango is set up properly and can process jobs](https://github.com/autolab/Tango/wiki/Testing-Tango). -## License +## Contributing to Tango -Autolab is released under the [Apache License 2.0](http://opensource.org/licenses/Apache-2.0). +1. [Fork the Tango repository](https://github.com/autolab/Tango). +2. Create a local clone of the forked repo. +3. Make a branch for your feature and start committing changes. +3. Create a pull request (PR). +4. Address any comments by updating the PR and wait for it to be accepted. +5. Once your PR is accepted, a reviewer will ask you to squash the commits on your branch into one well-worded commit. +6. Squash your commits into one and push to your branch on your forked repo. +7. A reviewer will fetch from your repo, rebase your commit, and push to Tango. + +Please see [the git linear development guide](https://github.com/edx/edx-platform/wiki/How-to-Rebase-a-Pull-Request) for a more in-depth explanation of the version control model that we use. -## Using Tango +## License -Please feel free to use Tango at your school/organization. If you run into any problems, you can reach the core developers at `autolab-dev@andrew.cmu.edu` and we would be happy to help. On a case by case basis, we also provide servers for free. (Especially if you are an NGO or small high-school classroom) +Tango is released under the [Apache License 2.0](http://opensource.org/licenses/Apache-2.0). From de33ac59d110222a34e02f3880107ae02bfd8497 Mon Sep 17 00:00:00 2001 From: Yashas Kumar Date: Thu, 10 Sep 2015 23:58:52 -0400 Subject: [PATCH 26/33] Up job queue capacity Increase maximum job queue capacity to accommodate increasing usage of Tango. --- config.template.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config.template.py b/config.template.py index 3d7299e3..2c924bec 100644 --- a/config.template.py +++ b/config.template.py @@ -102,7 +102,7 @@ class Config: LOG_TIMING = False # Largest job ID - MAX_JOBID = 500 + MAX_JOBID = 1000 ###### # Part 3: Runtime info that you can retrieve using the /info route From 050e5fcd3a6e5af1644cf606c9a2eb9bb0fa5b42 Mon Sep 17 00:00:00 2001 From: Yashas Kumar Date: Thu, 10 Sep 2015 23:37:01 -0400 Subject: [PATCH 27/33] Check filename and md5 hash on file uploads Solves "caching" issue by only rejecting file uploads if the same file exists on the server **with the same filename**. Note that this also changes the REST API spec for /open from returning a list of dictionaries to returning just a simple dictionary. This will require a change on frontend that @ymzong is implementing. --- restful-tango/tangoREST.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/restful-tango/tangoREST.py b/restful-tango/tangoREST.py index b90f505b..42581776 100644 --- a/restful-tango/tangoREST.py +++ b/restful-tango/tangoREST.py @@ -146,12 +146,12 @@ def computeMD5(self, directory): """ computeMD5 - Computes the MD5 hash of given files in the given directory """ - result = [] + result = {} for elem in os.listdir(directory): try: body = open("%s/%s" % (directory, elem)).read() md5hash = hashlib.md5(body).hexdigest() - result.append({'md5': md5hash, 'localFile': elem}) + result[elem] = md5hash except IOError: continue return result @@ -264,7 +264,7 @@ def convertTangoJobObj(self, tangoJobObj): ## def open(self, key, courselab): - """ open - Return a list of md5 hashes for each input file in the + """ open - Return a dict of md5 hashes for each input file in the key-courselab directory and make one if the directory doesn't exist """ self.log.debug("Received open request(%s, %s)" % (key, courselab)) @@ -293,7 +293,8 @@ def open(self, key, courselab): return self.status.wrong_key def upload(self, key, courselab, file, body): - """ upload - Upload file as an input file in key-courselab + """ upload - Upload file as an input file in key-courselab if the + same file doesn't exist already """ self.log.debug("Received upload request(%s, %s, %s)" % (key, courselab, file)) @@ -301,12 +302,11 @@ def upload(self, key, courselab, file, body): labPath = self.getDirPath(key, courselab) try: if os.path.exists(labPath): + fileMD5 = hashlib.md5(body).hexdigest() + filesInDir = self.computeMD5(labPath) + if file in filesInDir and filesInDir[file] == fileMD5: + return self.status.file_exists absPath = "%s/%s" % (labPath, file) - if os.path.exists(absPath): - fileMD5 = hashlib.md5(body).hexdigest() - if fileMD5 in [obj["md5"] - for obj in self.computeMD5(labPath)]: - return self.status.file_exists fh = open(absPath, "wt") fh.write(body) fh.close() From 8a7aed9c5f21026e27094da9937fab34e0253b1a Mon Sep 17 00:00:00 2001 From: Yashas Kumar Date: Mon, 14 Sep 2015 15:39:48 -0400 Subject: [PATCH 28/33] Remove localSSH VMMS. Fixes #88. --- .gitignore | 1 + clients/tango-rest.py | 2 +- config.template.py | 2 +- jobManager.py | 5 +- restful-tango/tangoREST.py | 5 +- vmms/localSSH.py | 259 ------------------------------------- 6 files changed, 5 insertions(+), 269 deletions(-) delete mode 100644 vmms/localSSH.py diff --git a/.gitignore b/.gitignore index f17f11d5..937ecaed 100644 --- a/.gitignore +++ b/.gitignore @@ -14,6 +14,7 @@ config.py bin lib include +pip-selfcheck.json # Mac OS X custom attribute files .DS_Store diff --git a/clients/tango-rest.py b/clients/tango-rest.py index 0e0c12a9..0c8285a1 100755 --- a/clients/tango-rest.py +++ b/clients/tango-rest.py @@ -53,7 +53,7 @@ '--numJobs', type=int, default=1, help='Number of jobs to run') parser.add_argument('--vmms', default='tashiSSH', - help='Choose vmms between localSSH, ec2SSH, tashiSSH') + help='Choose vmms between ec2SSH, tashiSSH, localDocker, and distDocker') parser.add_argument('--image', default='rhel', help='VM image name (default "rhel")') parser.add_argument( diff --git a/config.template.py b/config.template.py index 2c924bec..f714a99a 100644 --- a/config.template.py +++ b/config.template.py @@ -33,7 +33,7 @@ class Config: # VMMS to use. Must be set to a VMMS implemented in vmms/ before # starting Tango. Options are: "localDocker", "distDocker", - # "tashiSSH", "localSSH", and "ec2SSH" + # "tashiSSH", and "ec2SSH" VMMS_NAME = "localDocker" ##### diff --git a/jobManager.py b/jobManager.py index d0e183a4..78472c44 100644 --- a/jobManager.py +++ b/jobManager.py @@ -77,10 +77,7 @@ def __manage(self): else: vmms = None - if Config.VMMS_NAME == "localSSH": - from vmms.localSSH import LocalSSH - vmms = LocalSSH() - elif Config.VMMS_NAME == "tashiSSH": + if Config.VMMS_NAME == "tashiSSH": from vmms.tashiSSH import TashiSSH vmms = TashiSSH() elif Config.VMMS_NAME == "ec2SSH": diff --git a/restful-tango/tangoREST.py b/restful-tango/tangoREST.py index 42581776..e3a2111d 100644 --- a/restful-tango/tangoREST.py +++ b/restful-tango/tangoREST.py @@ -76,10 +76,7 @@ def __init__(self): vmms = None - if Config.VMMS_NAME == "localSSH": - from vmms.localSSH import LocalSSH - vmms = LocalSSH() - elif Config.VMMS_NAME == "tashiSSH": + if Config.VMMS_NAME == "tashiSSH": from vmms.tashiSSH import TashiSSH vmms = TashiSSH() elif Config.VMMS_NAME == "ec2SSH": diff --git a/vmms/localSSH.py b/vmms/localSSH.py deleted file mode 100644 index 22ff69a4..00000000 --- a/vmms/localSSH.py +++ /dev/null @@ -1,259 +0,0 @@ -# -# localSSH.py - Implements the Tango VMMS interface to run Tango jobs locally. -# -import subprocess -import re -import time -import logging - -import config - - -def timeout(command, time_out=1): - """ timeout - Run a unix command with a timeout. Return -1 on - timeout, otherwise return the return value from the command, which - is typically 0 for success, 1-255 for failure. - """ - - # Launch the command - p = subprocess.Popen(command, - stdout=open("/dev/null", 'w'), - stderr=subprocess.STDOUT) - - # Wait for the command to complete - t = 0.0 - while t < time_out and p.poll() is None: - time.sleep(config.Config.TIMER_POLL_INTERVAL) - t += config.Config.TIMER_POLL_INTERVAL - - # Determine why the while loop terminated - if p.poll() is None: - subprocess.call(["/bin/kill", "-9", str(p.pid)]) - returncode = -1 - else: - returncode = p.poll() - return returncode - - -def timeoutWithReturnStatus(command, time_out, returnValue=0): - """ timeoutWithReturnStatus - Run a Unix command with a timeout, - until the expected value is returned by the command; On timeout, - return last error code obtained from the command. - """ - p = subprocess.Popen( - command, stdout=open("/dev/null", 'w'), stderr=subprocess.STDOUT) - t = 0.0 - while (t < time_out): - ret = p.poll() - if ret is None: - time.sleep(config.Config.TIMER_POLL_INTERVAL) - t += config.Config.TIMER_POLL_INTERVAL - elif ret == returnValue: - return ret - else: - p = subprocess.Popen(command, - stdout=open("/dev/null", 'w'), - stderr=subprocess.STDOUT) - return ret - -# -# User defined exceptions -# -# ec2Call() exception - - -class localCallError(Exception): - pass - - -class LocalSSH: - _SSH_FLAGS = [ - "-o", "StrictHostKeyChecking no", "-o", "GSSAPIAuthentication no"] - - def __init__(self): - """ - Checks if the machine is ready to run Tango jobs. - """ - self.log = logging.getLogger("LocalSSH") - - try: - checkBinary = subprocess.check_call(["which", "autodriver"]) - checkAutogradeUser = subprocess.check_call( - "getent passwd | grep 'autograde'", shell=True) - except subprocess.CalledProcessError as e: - print "Local machine has not been bootstrapped for autograding. Please run localBootstrap.sh" - self.log.error(e) - exit(1) - - - def instanceName(self, id, name): - """ instanceName - Constructs a VM instance name. Always use - this function when you need a VM instance name. Never generate - instance names manually. - """ - return "%s-%d-%s" % (config.Config.PREFIX, id, name) - - def domainName(self, vm): - """ Returns the domain name that is stored in the vm - instance. - """ - return vm.domain_name - - # - # VMMS API functions - # - def initializeVM(self, vm): - """ initializeVM - Set domain name to localhost - """ - # Create the instance and obtain the reservation - vm.domain_name = "127.0.0.1" - return vm - - def waitVM(self, vm, max_secs): - """ waitVM - Wait at most max_secs for a VM to become - ready. Return error if it takes too long. This should - be immediate since the VM is localhost. - """ - - # First, wait for ping to the vm instance to work - instance_down = 1 - instanceName = self.instanceName(vm.id, vm.name) - start_time = time.time() - domain_name = self.domainName(vm) - while instance_down: - instance_down = subprocess.call("ping -c 1 %s" % (domain_name), - shell=True, - stdout=open('/dev/null', 'w'), - stderr=subprocess.STDOUT) - - # Wait a bit and then try again if we haven't exceeded - # timeout - if instance_down: - time.sleep(config.Config.TIMER_POLL_INTERVAL) - elapsed_secs = time.time() - start_time - if (elapsed_secs > max_secs): - return -1 - - # The ping worked, so now wait for SSH to work before - # declaring that the VM is ready - self.log.debug("VM %s: ping completed" % (vm.name)) - while(True): - - elapsed_secs = time.time() - start_time - - # Give up if the elapsed time exceeds the allowable time - if elapsed_secs > max_secs: - self.log.info( - "VM %s: SSH timeout after %d secs" % - (instanceName, elapsed_secs)) - return -1 - - # If the call to ssh returns timeout (-1) or ssh error - # (255), then success. Otherwise, keep trying until we run - # out of time. - ret = timeout(["ssh"] + LocalSSH._SSH_FLAGS + - ["%s" % (domain_name), - "(:)"], max_secs - elapsed_secs) - - self.log.debug("VM %s: ssh returned with %d" % - (instanceName, ret)) - - if (ret != -1) and (ret != 255): - return 0 - - # Sleep a bit before trying again - time.sleep(config.Config.TIMER_POLL_INTERVAL) - - def copyIn(self, vm, inputFiles): - """ copyIn - Copy input files to VM - """ - domain_name = self.domainName(vm) - - # Create a fresh input directory - ret = subprocess.call(["ssh"] + LocalSSH._SSH_FLAGS + - ["%s" % (domain_name), - "(rm -rf autolab; mkdir autolab)"]) - - # Copy the input files to the input directory - for file in inputFiles: - ret = timeout(["scp"] + - LocalSSH._SSH_FLAGS + - [file.localFile, "%s:autolab/%s" % - (domain_name, file.destFile)], config.Config.COPYIN_TIMEOUT) - if ret != 0: - return ret - return 0 - - def runJob(self, vm, runTimeout, maxOutputFileSize): - """ runJob - Run the make command on a VM using SSH and - redirect output to file "output". - """ - print "IN RUN JOB!!!" - domain_name = self.domainName(vm) - self.log.debug("runJob: Running job on VM %s" % - self.instanceName(vm.id, vm.name)) - # Setting ulimits for VM and running job - runcmd = "/usr/bin/time --output=time.out autodriver -u %d -f %d -t \ - %d -o %d autolab &> output" % (config.Config.VM_ULIMIT_USER_PROC, - config.Config.VM_ULIMIT_FILE_SIZE, - runTimeout, - maxOutputFileSize) - return timeout(["ssh"] + LocalSSH._SSH_FLAGS + - ["%s" % (domain_name), runcmd], runTimeout * 2) - # runTimeout * 2 is a temporary hack. The driver will handle the timout - - def copyOut(self, vm, destFile): - """ copyOut - Copy the file output on the VM to the file - outputFile on the Tango host. - """ - domain_name = self.domainName(vm) - - # Optionally log finer grained runtime info. Adds about 1 sec - # to the job latency, so we typically skip this. - if config.Config.LOG_TIMING: - try: - # regular expression matcher for error message from cat - no_file = re.compile('No such file or directory') - - time_info = subprocess.check_output( - ['ssh'] + LocalSSH._SSH_FLAGS + ['%s' % (domain_name), 'cat time.out']).rstrip('\n') - - # If the output is empty, then ignore it (timing info wasn't - # collected), otherwise let's log it! - if no_file.match(time_info): - # runJob didn't produce an output file - pass - - else: - # remove newline character printed in timing info - # replaces first '\n' character with a space - time_info = re.sub('\n', ' ', time_info, count=1) - self.log.info('Timing (%s): %s' % (domain_name, time_info)) - - except subprocess.CalledProcessError as xxx_todo_changeme: - # Error copying out the timing data (probably runJob failed) - re.error = xxx_todo_changeme - # Error copying out the timing data (probably runJob failed) - pass - - return timeout(["scp"] + LocalSSH._SSH_FLAGS + - ["%s:output" % (domain_name), destFile], - config.Config.COPYOUT_TIMEOUT) - - def destroyVM(self, vm): - """ destroyVM - Nothing to destroy for local. - """ - return - - def safeDestroyVM(self, vm): - return self.destroyVM(vm) - - def getVMs(self): - """ getVMs - Nothing to return for local. - """ - return [] - - def existsVM(self, vm): - """ existsVM - VM is simply localhost which exists. - """ - return True From f779953331647955b55d0174b07b2ceaded452b5 Mon Sep 17 00:00:00 2001 From: Aatish Date: Thu, 17 Sep 2015 18:33:56 -0400 Subject: [PATCH 29/33] Change array of key-values to dictionary for getInfo --- tangod.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/tangod.py b/tangod.py index af0ecaf3..5f07df09 100755 --- a/tangod.py +++ b/tangod.py @@ -171,16 +171,17 @@ def getPool(self, vmName): def getInfo(self): """ getInfo - return various statistics about the Tango daemon """ - stats = [] - stats.append("elapsed_secs=%d" % (time.time() - Config.start_time)) - stats.append("job_requests=%d" % Config.job_requests) - stats.append("job_retries=%d" % Config.job_retries) - stats.append("waitvm_timeouts=%d" % Config.waitvm_timeouts) - stats.append("runjob_timeouts=%d" % Config.runjob_timeouts) - stats.append("copyin_errors=%d" % Config.copyin_errors) - stats.append("runjob_errors=%d" % Config.runjob_errors) - stats.append("copyout_errors=%d" % Config.copyout_errors) - stats.append("num_threads=%d" % threading.activeCount()) + stats = {} + stats['elapsed_secs'] = time.time() - Config.start_time; + stats['job_requests'] = Config.job_requests + stats['job_retries'] = Config.job_retries + stats['waitvm_timeouts'] = Config.waitvm_timeouts + stats['runjob_timeouts'] = Config.runjob_timeouts + stats['copyin_errors'] = Config.copyin_errors + stats['runjob_errors'] = Config.runjob_errors + stats['copyout_errors'] = Config.copyout_errors + stats['num_threads'] = threading.activeCount() + return stats # From 55fe85f2b1899db7efe320754a90b818c88f9289 Mon Sep 17 00:00:00 2001 From: Yashas Kumar Date: Thu, 17 Sep 2015 13:25:22 -0400 Subject: [PATCH 30/33] Remove unnecessary files, move components from API level to server level, misc cleanup tasks --- clients/Makefile | 6 - clients/{README => README.md} | 2 +- clients/reset.py | 24 ---- clients/runTests.sh | 8 -- clients/tango-rest.py | 17 ++- config.template.py | 4 + restful-tango/server.py | 6 +- restful-tango/tangoREST.py | 65 ++-------- startTangoREST.sh | 9 -- tangod.py | 225 +++++++++++++++++++--------------- 10 files changed, 148 insertions(+), 218 deletions(-) delete mode 100644 clients/Makefile rename clients/{README => README.md} (91%) delete mode 100755 clients/reset.py delete mode 100755 clients/runTests.sh delete mode 100755 startTangoREST.sh diff --git a/clients/Makefile b/clients/Makefile deleted file mode 100644 index 61b03558..00000000 --- a/clients/Makefile +++ /dev/null @@ -1,6 +0,0 @@ -all: - @echo "Run make with a specific rule" - -# main purpose of this is to clean up the job output files -clean: - rm -f *.out* *~ diff --git a/clients/README b/clients/README.md similarity index 91% rename from clients/README rename to clients/README.md index 3bed4c6f..bacd36b1 100644 --- a/clients/README +++ b/clients/README.md @@ -1,7 +1,7 @@ This directory contains the Tango client program and some example jobs that are useful for testing. -client.py - Tango client program +tango-rest.py - Tango client program Example jobs: job1 - simple hello job diff --git a/clients/reset.py b/clients/reset.py deleted file mode 100755 index 2bb51b7b..00000000 --- a/clients/reset.py +++ /dev/null @@ -1,24 +0,0 @@ -#!/usr/bin/python - -# reset.py - sends a TCP RST across a connection - -import socket -import struct - - -def client(host, port): - s = socket.socket(socket.AF_INET, socket.SOCK_STREAM, 0) - s.connect((host, port)) - l_onoff = 1 - l_linger = 0 - s.setsockopt(socket.SOL_SOCKET, socket.SO_LINGER, - struct.pack('ii', l_onoff, l_linger)) - s.send("this is a random string that we're sending to the server") - s.close() - - -def main(): - client("localhost", 9090) - -if __name__ == "__main__": - main() diff --git a/clients/runTests.sh b/clients/runTests.sh deleted file mode 100755 index 478c65f3..00000000 --- a/clients/runTests.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash - -./tango.py --addjob --jobname job1 --makefile autograde-Makefile --infiles hello.sh -./tango.py --addjob --jobname job2 --makefile autograde-Makefile --infiles hello.sh -./tango.py --addjob --jobname job4 --makefile autograde-Makefile --infiles hello.sh -./tango.py --addjob --jobname job5 --makefile autograde-Makefile --infiles hello.sh -./tango.py --addjob --jobname job6 --makefile autograde-Makefile --infiles hello.sh -./tango.py --addjob --jobname job7 --makefile autograde-Makefile --infiles bug diff --git a/clients/tango-rest.py b/clients/tango-rest.py index 0c8285a1..059a1682 100755 --- a/clients/tango-rest.py +++ b/clients/tango-rest.py @@ -4,13 +4,10 @@ # tango-rest.py - Command line client for the RESTful Tango. # - import os import sys sys.path.append('/usr/lib/python2.7/site-packages/') -sys.path.append('gen-py') -sys.path.append('/usr/share/Tango-prod/lib/requests-2.2.1/') import argparse import requests @@ -24,8 +21,8 @@ parser = argparse.ArgumentParser(description='') parser.add_argument('-s', '--server', default='http://localhost', help='Tango server endpoint (default = http://localhost)') -parser.add_argument('-P', '--port', default=8080, type=int, - help='Tango server port number (default = 8080)') +parser.add_argument('-P', '--port', default=3000, type=int, + help='Tango server port number (default = 3000)') parser.add_argument('-k', '--key', help='Key of client') parser.add_argument('-l', '--courselab', @@ -35,7 +32,7 @@ parser.add_argument('-o', '--open', action='store_true', help=open_help) upload_help = 'Uploads a file. Must specify key with -k, courselab with -l, and filename with --filename.' parser.add_argument('-u', '--upload', action='store_true', help=upload_help) -addJob_help = 'Submit a job. Must specify key with -k, courselab with -l, and input files with --infiles. Modify defaults with --image (rhel), --outputFile (result.out), --jobname (test_job), --maxsize(0), --timeout (0).' +addJob_help = 'Submit a job. Must specify key with -k, courselab with -l, and input files with --infiles. Modify defaults with --image (autograding_image), --outputFile (result.out), --jobname (test_job), --maxsize(0), --timeout (0).' parser.add_argument('-a', '--addJob', action='store_true', help=addJob_help) poll_help = 'Poll a given output file. Must specify key with -k, courselab with -l. Modify defaults with --outputFile (result.out).' parser.add_argument('-p', '--poll', action='store_true', help=poll_help) @@ -43,18 +40,18 @@ parser.add_argument('-i', '--info', action='store_true', help=info_help) jobs_help = 'Obtain information of live jobs (deadJobs == 0) or dead jobs (deadJobs == 1). Must specify key with -k. Modify defaults with --deadJobs (0).' parser.add_argument('-j', '--jobs', action='store_true', help=jobs_help) -pool_help = 'Obtain information about a pool of VMs spawned from a specific image. Must specify key with -k. Modify defaults with --image (rhel).' +pool_help = 'Obtain information about a pool of VMs spawned from a specific image. Must specify key with -k. Modify defaults with --image (autograding_image).' parser.add_argument('--pool', action='store_true', help=pool_help) -prealloc_help = 'Create a pool of instances spawned from a specific image. Must specify key with -k. Modify defaults with --image (rhel), --num (2), --vmms (tashiSSH), --cores (1), and --memory (512).' +prealloc_help = 'Create a pool of instances spawned from a specific image. Must specify key with -k. Modify defaults with --image (autograding_image), --num (2), --vmms (localDocker), --cores (1), and --memory (512).' parser.add_argument('--prealloc', action='store_true', help=prealloc_help) parser.add_argument('--runJob', help='Run a job from a specific directory') parser.add_argument( '--numJobs', type=int, default=1, help='Number of jobs to run') -parser.add_argument('--vmms', default='tashiSSH', +parser.add_argument('--vmms', default='localDocker', help='Choose vmms between ec2SSH, tashiSSH, localDocker, and distDocker') -parser.add_argument('--image', default='rhel', +parser.add_argument('--image', default='autograding_image', help='VM image name (default "rhel")') parser.add_argument( '--infiles', diff --git a/config.template.py b/config.template.py index f714a99a..bfe0e997 100644 --- a/config.template.py +++ b/config.template.py @@ -30,6 +30,10 @@ class Config: # Courselabs directory. Must be created before starting Tango COURSELABS = "courselabs" + + # Directory within each courselab where Tango will copy the output + # for jobs of that courselab + OUTPUT_FOLDER = "output" # VMMS to use. Must be set to a VMMS implemented in vmms/ before # starting Tango. Options are: "localDocker", "distDocker", diff --git a/restful-tango/server.py b/restful-tango/server.py index eafc7306..69756a00 100755 --- a/restful-tango/server.py +++ b/restful-tango/server.py @@ -140,10 +140,6 @@ def post(self, key, image, num): if __name__ == "__main__": port = Config.PORT - if len(sys.argv) > 1: - port = int(sys.argv[1]) - - print("Starting the RESTful Tango server on port %d..." % (port)) - tangoREST.resetTango() + tangoREST.tango.resetTango(tangoREST.tango.vmms) application.listen(port) tornado.ioloop.IOLoop.instance().start() diff --git a/restful-tango/tangoREST.py b/restful-tango/tangoREST.py index e3a2111d..993eb40f 100644 --- a/restful-tango/tangoREST.py +++ b/restful-tango/tangoREST.py @@ -10,7 +10,6 @@ import hashlib import json import logging -import logging.handlers currentdir = os.path.dirname( os.path.abspath(inspect.getfile(inspect.currentframe()))) @@ -18,9 +17,6 @@ sys.path.insert(0, parentdir) from tangod import TangoServer -from jobQueue import JobQueue -from jobManager import JobManager -from preallocator import Preallocator from tangoObjects import TangoJob, TangoMachine, InputFile from config import Config @@ -59,65 +55,31 @@ def create(self, id, msg): class TangoREST: COURSELABS = Config.COURSELABS - OUTPUT_FOLDER = "output" + OUTPUT_FOLDER = Config.OUTPUT_FOLDER LOGFILE = Config.LOGFILE # Replace with choice of key store and override validateKey. # This key is just for testing. - keys = Config.KEYS + KEYS = Config.KEYS def __init__(self): logging.basicConfig( - filename = self.LOGFILE, - format = "%(levelname)s|%(asctime)s|%(name)s|%(message)s", - level = Config.LOGLEVEL - ) - - vmms = None - - if Config.VMMS_NAME == "tashiSSH": - from vmms.tashiSSH import TashiSSH - vmms = TashiSSH() - elif Config.VMMS_NAME == "ec2SSH": - from vmms.ec2SSH import Ec2SSH - vmms = Ec2SSH() - elif Config.VMMS_NAME == "localDocker": - from vmms.localDocker import LocalDocker - vmms = LocalDocker() - elif Config.VMMS_NAME == "distDocker": - from vmms.distDocker import DistDocker - vmms = DistDocker() - - - self.vmms = {Config.VMMS_NAME: vmms} - self.preallocator = Preallocator(self.vmms) - self.queue = JobQueue(self.preallocator) - - if not Config.USE_REDIS: - # creates a local Job Manager if there is no persistent - # memory between processes. Otherwise, JobManager will - # be initiated separately - JobManager(self.queue, self.vmms, self.preallocator) - - self.tango = TangoServer(self.queue, self.preallocator, self.vmms) - - logging.basicConfig( - filename=self.LOGFILE, - format="%(levelname)s|%(asctime)s|%(name)s|%(message)s", - level=Config.LOGLEVEL + filename = self.LOGFILE, + format = "%(levelname)s|%(asctime)s|%(name)s|%(message)s", + level = Config.LOGLEVEL ) - - logging.getLogger('boto').setLevel(logging.INFO) self.log = logging.getLogger("TangoREST") self.log.info("Starting RESTful Tango server") + + self.tango = TangoServer() self.status = Status() def validateKey(self, key): """ validateKey - Validates key provided by client """ result = False - for el in self.keys: + for el in self.KEYS: if el == key: result = True return result @@ -416,9 +378,9 @@ def pool(self, key, image): self.log.debug("Received pool request(%s, %s)" % (key, image)) if self.validateKey(key): if image == "": - pools = self.preallocator.getAllPools() + pools = self.tango.preallocator.getAllPools() else: - info = self.preallocator.getPool(image) + info = self.tango.preallocator.getPool(image) pools = {} if len(info) > 0: pools[image] = info @@ -464,10 +426,3 @@ def prealloc(self, key, image, num, vmStr): else: self.log.info("Key not recognized: %s" % key) return self.status.wrong_key - - def resetTango(self): - """ Destroys VMs associated with this namespace. Used for admin - purposes only. - """ - self.log.debug("Received resetTango request.") - self.tango.resetTango(self.vmms) diff --git a/startTangoREST.sh b/startTangoREST.sh deleted file mode 100755 index b06d1996..00000000 --- a/startTangoREST.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash -# -# TangoREST dependencies have to be added to the PATH. -# -source $PWD/scripts/sourcelib; - -while [ 1 ]; do - python $PWD/restful-tango/server.py -done diff --git a/tangod.py b/tangod.py index 5f07df09..65762c81 100755 --- a/tangod.py +++ b/tangod.py @@ -42,6 +42,9 @@ import stat from config import Config +from jobQueue import JobQueue +from jobManager import JobManager +from preallocator import Preallocator from tangoObjects import TangoJob from datetime import datetime @@ -51,11 +54,34 @@ class TangoServer: """ TangoServer - Implements the API functions that the server accepts """ - def __init__(self, jobQueue, preallocator, vmms): + def __init__(self): self.daemon = True - self.jobQueue = jobQueue - self.preallocator = preallocator - self.vmms = vmms + + vmms = None + if Config.VMMS_NAME == "tashiSSH": + from vmms.tashiSSH import TashiSSH + vmms = TashiSSH() + elif Config.VMMS_NAME == "ec2SSH": + from vmms.ec2SSH import Ec2SSH + vmms = Ec2SSH() + elif Config.VMMS_NAME == "localDocker": + from vmms.localDocker import LocalDocker + vmms = LocalDocker() + elif Config.VMMS_NAME == "distDocker": + from vmms.distDocker import DistDocker + vmms = DistDocker() + + self.vmms = {Config.VMMS_NAME: vmms} + self.preallocator = Preallocator(self.vmms) + + self.preallocator = Preallocator(self.vmms) + self.jobQueue = JobQueue(self.preallocator) + if not Config.USE_REDIS: + # creates a local Job Manager if there is no persistent + # memory between processes. Otherwise, JobManager will + # be initiated separately + JobManager(self.jobQueue, self.vmms, self.preallocator) + logging.basicConfig( filename=Config.LOGFILE, format="%(levelname)s|%(asctime)s|%(name)s|%(message)s", @@ -69,7 +95,7 @@ def addJob(self, job): """ Config.job_requests += 1 self.log.debug("Received addJob request") - ret = validateJob(job, self.vmms) + ret = self.__validateJob(job, self.vmms) self.log.info("Done validating job") if ret == 0: return self.jobQueue.add(job) @@ -193,7 +219,7 @@ def resetTango(self, vmms): effect is that also checks that each supported VMMS is actually running. """ - log = logging.getLogger('Server') + self.log.debug("Received resetTango request.") try: # For each supported VMM system, get the instances it knows about, @@ -201,7 +227,7 @@ def resetTango(self, vmms): for vmms_name in vmms: vobj = vmms[vmms_name] vms = vobj.getVMs() - log.debug("Pre-existing VMs: %s" % [vm.name for vm in vms]) + self.log.debug("Pre-existing VMs: %s" % [vm.name for vm in vms]) namelist = [] for vm in vms: if re.match("%s-" % Config.PREFIX, vm.name): @@ -210,7 +236,7 @@ def resetTango(self, vmms): # interfaces namelist.append(vm.name) if namelist: - log.warning("Killed these %s VMs on restart: %s" % + self.log.warning("Killed these %s VMs on restart: %s" % (vmms_name, namelist)) for job in self.jobQueue.liveJobs.values(): @@ -218,111 +244,110 @@ def resetTango(self, vmms): (str(job.name), str(job.assigned))) except Exception as err: - log.error("resetTango: Call to VMMS %s failed: %s" % + self.log.error("resetTango: Call to VMMS %s failed: %s" % (vmms_name, err)) os._exit(1) -def validateJob(job, vmms): - """ validateJob - validate the input arguments in an addJob request. - """ - log = logging.getLogger('Server') - errors = 0 - - # If this isn't a Tango job then bail with an error - if (not isinstance(job, TangoJob)): - return -1 - - # Every job must have a name - if not job.name: - log.error("validateJob: Missing job.name") - job.appendTrace("%s|validateJob: Missing job.name" % - (datetime.utcnow().ctime())) - errors += 1 - - # Check the virtual machine field - if not job.vm: - log.error("validateJob: Missing job.vm") - job.appendTrace("%s|validateJob: Missing job.vm" % - (datetime.utcnow().ctime())) - errors += 1 - else: - if not job.vm.image: - log.error("validateJob: Missing job.vm.image") - job.appendTrace("%s|validateJob: Missing job.vm.image" % + def __validateJob(self, job, vmms): + """ validateJob - validate the input arguments in an addJob request. + """ + errors = 0 + + # If this isn't a Tango job then bail with an error + if (not isinstance(job, TangoJob)): + return -1 + + # Every job must have a name + if not job.name: + self.log.error("validateJob: Missing job.name") + job.appendTrace("%s|validateJob: Missing job.name" % (datetime.utcnow().ctime())) errors += 1 - else: - vobj = vmms[Config.VMMS_NAME] - imgList = vobj.getImages() - if job.vm.image not in imgList: - log.error("validateJob: Image not found: %s" % - job.vm.image) - job.appendTrace("%s|validateJob: Image not found: %s" % - (datetime.utcnow().ctime(), job.vm.image)) - errors += 1 - else: - (name, ext) = os.path.splitext(job.vm.image) - job.vm.name = name - if not job.vm.vmms: - log.error("validateJob: Missing job.vm.vmms") - job.appendTrace("%s|validateJob: Missing job.vm.vmms" % + # Check the virtual machine field + if not job.vm: + self.log.error("validateJob: Missing job.vm") + job.appendTrace("%s|validateJob: Missing job.vm" % (datetime.utcnow().ctime())) errors += 1 else: - if job.vm.vmms not in vmms: - log.error("validateJob: Invalid vmms name: %s" % job.vm.vmms) - job.appendTrace("%s|validateJob: Invalid vmms name: %s" % - (datetime.utcnow().ctime(), job.vm.vmms)) + if not job.vm.image: + self.log.error("validateJob: Missing job.vm.image") + job.appendTrace("%s|validateJob: Missing job.vm.image" % + (datetime.utcnow().ctime())) errors += 1 - - # Check the output file - if not job.outputFile: - log.error("validateJob: Missing job.outputFile") - job.appendTrace("%s|validateJob: Missing job.outputFile" % - (datetime.utcnow().ctime())) - errors += 1 - else: - if not os.path.exists(os.path.dirname(job.outputFile)): - log.error("validateJob: Bad output path: %s", job.outputFile) - job.appendTrace("%s|validateJob: Bad output path: %s" % - (datetime.utcnow().ctime(), job.outputFile)) - errors += 1 - - # Check for max output file size parameter - if not job.maxOutputFileSize: - log.debug("validateJob: Setting job.maxOutputFileSize " - "to default value: %d bytes", Config.MAX_OUTPUT_FILE_SIZE) - job.maxOutputFileSize = Config.MAX_OUTPUT_FILE_SIZE - - # Check the list of input files - for inputFile in job.input: - if not inputFile.localFile: - log.error("validateJob: Missing inputFile.localFile") - job.appendTrace("%s|validateJob: Missing inputFile.localFile" % + else: + vobj = vmms[Config.VMMS_NAME] + imgList = vobj.getImages() + if job.vm.image not in imgList: + self.log.error("validateJob: Image not found: %s" % + job.vm.image) + job.appendTrace("%s|validateJob: Image not found: %s" % + (datetime.utcnow().ctime(), job.vm.image)) + errors += 1 + else: + (name, ext) = os.path.splitext(job.vm.image) + job.vm.name = name + + if not job.vm.vmms: + self.log.error("validateJob: Missing job.vm.vmms") + job.appendTrace("%s|validateJob: Missing job.vm.vmms" % + (datetime.utcnow().ctime())) + errors += 1 + else: + if job.vm.vmms not in vmms: + self.log.error("validateJob: Invalid vmms name: %s" % job.vm.vmms) + job.appendTrace("%s|validateJob: Invalid vmms name: %s" % + (datetime.utcnow().ctime(), job.vm.vmms)) + errors += 1 + + # Check the output file + if not job.outputFile: + self.log.error("validateJob: Missing job.outputFile") + job.appendTrace("%s|validateJob: Missing job.outputFile" % (datetime.utcnow().ctime())) errors += 1 else: - if not os.path.exists(inputFile.localFile): - log.error("validateJob: Input file %s not found" % - (inputFile.localFile)) - job.appendTrace( - "%s|validateJob: Input file %s not found" % - (datetime.utcnow().ctime(), inputFile.localFile)) + if not os.path.exists(os.path.dirname(job.outputFile)): + self.log.error("validateJob: Bad output path: %s", job.outputFile) + job.appendTrace("%s|validateJob: Bad output path: %s" % + (datetime.utcnow().ctime(), job.outputFile)) errors += 1 - # Check if job timeout has been set; If not set timeout to default - if not job.timeout or job.timeout <= 0: - log.debug("validateJob: Setting job.timeout to" - " default config value: %d secs", Config.RUNJOB_TIMEOUT) - job.timeout = Config.RUNJOB_TIMEOUT - - # Any problems, return an error status - if errors > 0: - log.error("validateJob: Job rejected: %d errors" % errors) - job.appendTrace("%s|validateJob: Job rejected: %d errors" % - (datetime.utcnow().ctime(), errors)) - return -1 - else: - return 0 + # Check for max output file size parameter + if not job.maxOutputFileSize: + self.log.debug("validateJob: Setting job.maxOutputFileSize " + "to default value: %d bytes", Config.MAX_OUTPUT_FILE_SIZE) + job.maxOutputFileSize = Config.MAX_OUTPUT_FILE_SIZE + + # Check the list of input files + for inputFile in job.input: + if not inputFile.localFile: + self.log.error("validateJob: Missing inputFile.localFile") + job.appendTrace("%s|validateJob: Missing inputFile.localFile" % + (datetime.utcnow().ctime())) + errors += 1 + else: + if not os.path.exists(inputFile.localFile): + self.log.error("validateJob: Input file %s not found" % + (inputFile.localFile)) + job.appendTrace( + "%s|validateJob: Input file %s not found" % + (datetime.utcnow().ctime(), inputFile.localFile)) + errors += 1 + + # Check if job timeout has been set; If not set timeout to default + if not job.timeout or job.timeout <= 0: + self.log.debug("validateJob: Setting job.timeout to" + " default config value: %d secs", Config.RUNJOB_TIMEOUT) + job.timeout = Config.RUNJOB_TIMEOUT + + # Any problems, return an error status + if errors > 0: + self.log.error("validateJob: Job rejected: %d errors" % errors) + job.appendTrace("%s|validateJob: Job rejected: %d errors" % + (datetime.utcnow().ctime(), errors)) + return -1 + else: + return 0 From e6edf70f26d321e4caa266789a247ffb0b16733f Mon Sep 17 00:00:00 2001 From: Aatish Date: Sat, 19 Sep 2015 19:28:20 -0400 Subject: [PATCH 31/33] Check for job for Makefile when validating --- tangod.py | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/tangod.py b/tangod.py index 65762c81..6e93169c 100755 --- a/tangod.py +++ b/tangod.py @@ -305,8 +305,7 @@ def __validateJob(self, job, vmms): # Check the output file if not job.outputFile: self.log.error("validateJob: Missing job.outputFile") - job.appendTrace("%s|validateJob: Missing job.outputFile" % - (datetime.utcnow().ctime())) + job.appendTrace("%s|validateJob: Missing job.outputFile" % (datetime.utcnow().ctime())) errors += 1 else: if not os.path.exists(os.path.dirname(job.outputFile)): @@ -322,21 +321,29 @@ def __validateJob(self, job, vmms): job.maxOutputFileSize = Config.MAX_OUTPUT_FILE_SIZE # Check the list of input files + hasMakefile = False for inputFile in job.input: if not inputFile.localFile: self.log.error("validateJob: Missing inputFile.localFile") job.appendTrace("%s|validateJob: Missing inputFile.localFile" % - (datetime.utcnow().ctime())) + (datetime.utcnow().ctime())) errors += 1 else: - if not os.path.exists(inputFile.localFile): - self.log.error("validateJob: Input file %s not found" % - (inputFile.localFile)) - job.appendTrace( - "%s|validateJob: Input file %s not found" % - (datetime.utcnow().ctime(), inputFile.localFile)) + if not os.path.exists(os.path.dirname(job.outputFile)): + self.log.error("validateJob: Bad output path: %s", job.outputFile) + job.appendTrace("%s|validateJob: Bad output path: %s" % + (datetime.utcnow().ctime(), job.outputFile)) errors += 1 + if inputFile.destFile == 'Makefile': + hasMakefile = True + + # Check if input files include a Makefile + if not hasMakefile: + self.log.error("validateJob: Missing Makefile in input files.") + job.appendTrace("%s|validateJob: Missing Makefile in input files." % (datetime.utcnow().ctime())) + errors+=1 + # Check if job timeout has been set; If not set timeout to default if not job.timeout or job.timeout <= 0: self.log.debug("validateJob: Setting job.timeout to" From f736923fbd8c57e22d02511e1da3cec95ff8c821 Mon Sep 17 00:00:00 2001 From: Aatish Date: Sun, 20 Sep 2015 15:25:32 -0400 Subject: [PATCH 32/33] Set start_time to system time when Tango is started --- config.template.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/config.template.py b/config.template.py index bfe0e997..bcf00d3a 100644 --- a/config.template.py +++ b/config.template.py @@ -3,6 +3,7 @@ # import logging +import time # Config - defines @@ -111,7 +112,7 @@ class Config: ###### # Part 3: Runtime info that you can retrieve using the /info route # - start_time = 0 + start_time = time.time() job_requests = 0 job_retries = 0 waitvm_timeouts = 0 From d14aead7f847e849d5439d2a0905d2ac1e406261 Mon Sep 17 00:00:00 2001 From: Yashas Kumar Date: Sun, 27 Sep 2015 17:22:06 -0400 Subject: [PATCH 33/33] Display objects correctly as strings in log --- tangoObjects.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tangoObjects.py b/tangoObjects.py index 5d2c6d2c..25162409 100644 --- a/tangoObjects.py +++ b/tangoObjects.py @@ -30,6 +30,10 @@ def __init__(self, localFile, destFile): self.localFile = localFile self.destFile = destFile + def __repr__(self): + return "InputFile(localFile: %s, destFile: %s)" % (self.localFile, + self.destFile) + class TangoMachine(): @@ -54,6 +58,9 @@ def __init__(self, name="DefaultTestVM", image=None, vmms=None, self.id = id self.instance_id = id + def __repr__(self): + return "TangoMachine(image: %s, vmms: %s)" % (self.image, self.vmms) + class TangoJob():