Skip to content

Commit 5adeeba

Browse files
authored
Merge pull request #194 from njzjz/inf-group-size
support infinite group_size
2 parents deda42e + 0d87a36 commit 5adeeba

File tree

4 files changed

+49
-4
lines changed

4 files changed

+49
-4
lines changed

doc/examples/shell.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,4 +14,4 @@ The workstation has 48 cores of CPUs and 8 RTX3090 cards. Here we hope each card
1414
:linenos:
1515
```
1616

17-
Note that `group_size` should be set as large as possible to ensure there is only one job and avoid running multiple jobs at the same time.
17+
Note that `group_size` should be set to `0` (which means infinity) to ensure there is only one job and avoid running multiple jobs at the same time.

dpdispatcher/submission.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -311,11 +311,14 @@ def generate_jobs(self):
311311
if self.belonging_jobs:
312312
raise RuntimeError(f'Can not generate jobs when submission.belonging_jobs is not empty. debug:{self}')
313313
group_size = self.resources.group_size
314-
if ( group_size < 1 ) or ( type(group_size) is not int ):
314+
if ( group_size < 0 ) or ( type(group_size) is not int ):
315315
raise RuntimeError('group_size must be a positive number')
316316
task_num = len(self.belonging_tasks)
317317
if task_num == 0:
318318
raise RuntimeError("submission must have at least 1 task")
319+
if group_size == 0:
320+
# 0 means infinity
321+
group_size = task_num
319322
random.seed(42)
320323
random_task_index = list(range(task_num))
321324
random.shuffle(random_task_index)
@@ -815,7 +818,7 @@ def arginfo(detail_kwargs=True):
815818
doc_cpu_per_node = 'cpu numbers of each node assigned to each job.'
816819
doc_gpu_per_node = 'gpu numbers of each node assigned to each job.'
817820
doc_queue_name = 'The queue name of batch job scheduler system.'
818-
doc_group_size = 'The number of `tasks` in a `job`.'
821+
doc_group_size = 'The number of `tasks` in a `job`. 0 means infinity.'
819822
doc_custom_flags = 'The extra lines pass to job submitting script header'
820823
doc_para_deg = 'Decide how many tasks will be run in parallel.'
821824
doc_source_list = 'The env file to be sourced before the command execution.'

examples/resources/mandu.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
"cpu_per_node": 48,
44
"gpu_per_node": 8,
55
"queue_name": "shell",
6-
"group_size": 9999,
6+
"group_size": 0,
77
"strategy": {
88
"if_cuda_multi_devices": true
99
},

tests/test_group_size.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
"""Test `Submission.generate_jobs` with different group size."""
2+
3+
import os
4+
import sys
5+
import json
6+
from unittest import TestCase
7+
from pathlib import Path
8+
9+
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
10+
__package__ = 'tests'
11+
from .context import Machine, Resources, Task, Submission
12+
13+
# 99 tasks in total
14+
# group_size - expected_ntasks
15+
group_ntasks_pairs = [
16+
(1, 99),
17+
(3, 33),
18+
(10, 10),
19+
(100, 1),
20+
(0, 1),
21+
]
22+
23+
cwd = Path(__file__).parent
24+
with open(cwd / "jsons" / "machine.json") as f:
25+
j_machine = json.load(f)['machine']
26+
with open(cwd / "jsons" / "resources.json") as f:
27+
j_resources = json.load(f)
28+
with open(cwd / "jsons" / "task.json") as f:
29+
j_task = json.load(f)
30+
31+
32+
class TestGroupSize(TestCase):
33+
def test_works_as_expected(self):
34+
for group_size, ntasks in group_ntasks_pairs:
35+
with self.subTest(group_size):
36+
machine = Machine.load_from_dict(j_machine)
37+
j_resources['group_size'] = group_size
38+
resources = Resources.load_from_dict(j_resources)
39+
tasks = [Task.load_from_dict(j_task) for _ in range(99)]
40+
submission = Submission(".", machine, resources, task_list=tasks)
41+
submission.generate_jobs()
42+
self.assertEqual(len(submission.belonging_jobs), ntasks)

0 commit comments

Comments
 (0)