@@ -132,7 +132,15 @@ def run_job(self, gpu_slot: GPUSlot, job: Job, job_index: int) -> int:
132132 return return_code
133133
134134 def run_jobs (self , jobs : List [Job ]) -> List [int ]:
135- """Run multiple jobs across available GPU slots."""
135+ """Run multiple jobs across available GPU slots.
136+
137+ Args:
138+ jobs: List of Job objects to execute
139+
140+ Returns:
141+ List[int]: Status code for each job
142+ (0 for success, non-zero for failure)
143+ """
136144 total_slots = len (self .gpu_slots )
137145 self .logger .info (
138146 f"Starting { len (jobs )} jobs across { self .n_gpus } "
@@ -147,6 +155,7 @@ def run_jobs(self, jobs: List[Job]) -> List[int]:
147155 for slot in self .gpu_slots :
148156 slot_queue .put (slot )
149157
158+ # Initialize results list with None values
150159 results = [None ] * len (jobs )
151160 active_jobs = set ()
152161 job_lock = threading .Lock ()
@@ -184,12 +193,22 @@ def worker():
184193 ) as executor :
185194 futures = []
186195
196+ # Queue up all jobs
187197 for i , job in enumerate (jobs ):
188198 job_queue .put ((i , job ))
189199
200+ # Start workers
190201 for _ in range (total_slots ):
191202 futures .append (executor .submit (worker ))
192203
204+ # Wait for all workers to complete
193205 concurrent .futures .wait (futures )
194206
195207 self .logger .info ("\n All jobs completed!" )
208+
209+ # Sanity-check that every job recorded a result (note: assert is skipped under python -O)
210+ assert all (
211+ result is not None for result in results
212+ ), "Some jobs did not complete"
213+
214+ return results
0 commit comments