Skip to content

Commit

Permalink
Try to fix status returns
Browse files: browse the repository at this point in the history
  • Loading branch information
b-d-e committed Jan 16, 2025
1 parent ed29235 commit 647eaf7
Showing 1 changed file with 20 additions and 1 deletion.
21 changes: 20 additions & 1 deletion src/kairos/scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,15 @@ def run_job(self, gpu_slot: GPUSlot, job: Job, job_index: int) -> int:
return return_code

def run_jobs(self, jobs: List[Job]) -> List[int]:
"""Run multiple jobs across available GPU slots."""
"""Run multiple jobs across available GPU slots.
Args:
jobs: List of Job objects to execute
Returns:
List[int]: Status code for each job
(0 for success, non-zero for failure)
"""
total_slots = len(self.gpu_slots)
self.logger.info(
f"Starting {len(jobs)} jobs across {self.n_gpus} "
Expand All @@ -147,6 +155,7 @@ def run_jobs(self, jobs: List[Job]) -> List[int]:
for slot in self.gpu_slots:
slot_queue.put(slot)

# Initialize results list with None values
results = [None] * len(jobs)
active_jobs = set()
job_lock = threading.Lock()
Expand Down Expand Up @@ -184,12 +193,22 @@ def worker():
) as executor:
futures = []

# Queue up all jobs
for i, job in enumerate(jobs):
job_queue.put((i, job))

# Start workers
for _ in range(total_slots):
futures.append(executor.submit(worker))

# Wait for all workers to complete
concurrent.futures.wait(futures)

self.logger.info("\nAll jobs completed!")

# Ensure all jobs have a result
assert all(
result is not None for result in results
), "Some jobs did not complete"

return results

0 comments on commit 647eaf7

Please sign in to comment.