
Commit 1008db1

Attempting to merge
1 parent ad42245 commit 1008db1

9 files changed: +71 -48 lines changed

python_runs/config/nvme_test_kernel_hdd_ec63.yml

Lines changed: 5 additions & 3 deletions

@@ -1,12 +1,14 @@
 slurm_job_number:
 block_size: "4M,64K,4K"
-directory: /mnt/cephtest-fi5k/test-ec63-hdd/skrit/fio
-time: 120
+directory: /mnt/cephtest-fi5k/test-ec63-hdd/skrit/fio
+time: 120
 io_type: write
-platform_type: EC63
+platform_type: nvme_ec63_kernel_hdd
 split_hosts_file: 0
 job_number: '48,16,8,4,2,1'
 node_count: 20,16,8,4,2,1
 hosts_file:
 no_scrub: 0
+unit_restart: 1
+job_note: "This test is for hdd ec63 partition after swapping bad SAS (6GB/s) cables with 12GB/s cables"
 template_path: /mnt/home/skrit/Documents/testing_clones/clone1/PyBenchFramework/examples/template/template.fio

python_runs/config/nvme_test_kernel_hdd_rep3.yml

Lines changed: 5 additions & 3 deletions

@@ -1,12 +1,14 @@
 slurm_job_number:
 block_size: "4M,64K,4K"
-directory: /mnt/cephtest-fi5k/test-rep3-hdd/skrit/fio
-time: 120
+directory: /mnt/cephtest-fi5k/test-rep3-hdd/skrit/fio
+time: 120
 io_type: write
-platform_type: triple_replication
+platform_type: nvme_rep3_kernel_hdd
 split_hosts_file: 0
 job_number: '48,16,8,4,2,1'
 node_count: 20,16,8,4,2,1
 hosts_file:
 no_scrub: 0
+unit_restart: 1
+job_note: "This test is for rep3 hdd partition after we swap 6GB/s SAS cables with 12GB/s SAS cables"
 template_path: /mnt/home/skrit/Documents/testing_clones/clone1/PyBenchFramework/examples/template/template.fio

python_runs/config/nvme_test_kernel_ssd_ec63.yml

Lines changed: 1 addition & 0 deletions

@@ -10,4 +10,5 @@ node_count: 20,16,8,4,2,1
 hosts_file:
 no_scrub: 0
 unit_restart: 1
+job_note: "This test is for ec63 partition after we change the number of OSDs back from 1 OSD per SSD to 2 OSDs per SSD"
 template_path: /mnt/home/skrit/Documents/testing_clones/clone1/PyBenchFramework/examples/template/template.fio

python_runs/config/nvme_test_kernel_ssd_rep3.yml

Lines changed: 1 addition & 0 deletions

@@ -10,4 +10,5 @@ node_count: 20,16,8,4,2,1
 hosts_file:
 no_scrub: 0
 unit_restart: 1
+job_note: "This test is for rep3 partition after we change the number of OSDs back from 1 OSD per SSD to 2 OSDs per SSD"
 template_path: /mnt/home/skrit/Documents/testing_clones/clone1/PyBenchFramework/examples/template/template.fio
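
All four configs now carry a unit_restart flag and a free-form job_note recording why the run was made. The framework's config loader is not part of this commit, so the following is only a minimal sketch, assuming a plain PyYAML load, of how these new keys could be read and defaulted; load_bench_config() is an illustrative name, not an actual PyBenchFramework function.

# Illustrative only: the real PyBenchFramework config loader is not shown in
# this commit, so load_bench_config() and its defaults are assumptions.
import yaml  # PyYAML

def load_bench_config(path):
    """Read a benchmark YAML config and normalise the newly added keys."""
    with open(path) as fh:
        cfg = yaml.safe_load(fh) or {}
    # unit_restart: 1 presumably asks the runner to remount/restart the Ceph mount unit.
    cfg["unit_restart"] = int(cfg.get("unit_restart") or 0)
    # job_note is free-form text describing the purpose of the run.
    cfg["job_note"] = cfg.get("job_note") or ""
    return cfg

if __name__ == "__main__":
    cfg = load_bench_config("python_runs/config/nvme_test_kernel_ssd_rep3.yml")
    print(cfg["platform_type"], cfg["unit_restart"])
    print(cfg["job_note"])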

python_runs/miscellaneous.py

Lines changed: 30 additions & 25 deletions

@@ -95,7 +95,8 @@ def get_decrypted_password(opt_pass_file,opt_key_file):
 
 def restart_ceph_unit(path):
 
-    def check_ceph_is_active(escaped_path, sudo_password):
+    def check_ceph_is_active(escaped_path):
+    #, sudo_password):
 
         unit_filename = 'mnt-' + escaped_path + '.mount'
         unit_path = '/etc/systemd/system/' + unit_filename
@@ -104,44 +105,45 @@ def check_ceph_is_active(escaped_path, sudo_password):
             sys.exit(1)
 
         try:
-            command = ['sudo', '-S', 'systemctl', 'is-active', unit_filename]
+            command = ['systemctl', 'is-active', unit_filename]
 
-            process = subprocess.Popen(command, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
-            output, error = process.communicate(sudo_password + '\n')
+            result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+            #output, error = process.communicate(sudo_password + '\n')
 
-            print(f"{hostname} Ceph is active? Output: ", output)
-            print(f"{hostname} Ceph is active? Errors: ", error)
+            print(f"{hostname} Ceph is active? Output: ", result.stdout)
+            print(f"{hostname} Ceph is active? Errors: ", result.stderr)
 
-            result = output.strip()
+            is_active = result.stdout.strip()
 
             # Check the output
-            if result == 'active':
+            if is_active == 'active':
                 return 1
             else:
                 return 0
         except subprocess.CalledProcessError as e:
             print(f"Failed to get active status for {unit_filename}: {e.stderr}")
             sys.exit(1)
-
+
     hostname = socket.gethostname()
 
     m = re.match('/mnt/cephtest[-_\w]*$', path)
    if not m:
         print("ERROR: Remount path must be /mnt/cephtest...")
         sys.exit(1)
-
+
     if not os.path.exists(path):
         print("ERROR: Path '{}' does not exist".format(path))
         sys.exit(1)
-
+
     escaped_path = path[5:]
     try:
         while True:
             ix = escaped_path.index('-')
             escaped_path = escaped_path[:ix] + '\\x2d' + escaped_path[ix+1:]
     except ValueError:
         pass
-
+
+    '''
     try:
         key_file = os.getenv("KEY_FILE")
         if not key_file:
@@ -153,7 +155,7 @@ def check_ceph_is_active(escaped_path, sudo_password):
     except ValueError as ve:
         print(f"Error: {ve}")
         sys.exit(1)
-
+
     try:
         password_file = os.getenv("PASS_FILE")
         if not key_file:
@@ -165,24 +167,27 @@ def check_ceph_is_active(escaped_path, sudo_password):
     except ValueError as ve:
         print(f"Error: {ve}")
         sys.exit(1)
-
+
     print(key_file, password_file)
-
-    command = ['sudo', '-S', 'python', '/mnt/cephadm/bin/iotest_helper.py', 'remount', path]
-
-    sudo_password = get_decrypted_password(password_file, key_file)
-
-    process = subprocess.Popen(command, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
-    output, error = process.communicate(sudo_password + '\n')
-
-    print(f"{hostname} Ceph restart? Output: ", output)
-    print(f"{hostname} Ceph restart? Errors: ", error)
+    '''
+
+    #command = ['sudo', '-S', 'python', '/mnt/cephadm/bin/iotest_helper.py', 'remount', path]
+    command = ['sudo', '/mnt/cephadm/bin/iotest_helper.py', 'remount', path]
+
+    #sudo_password = get_decrypted_password(password_file, key_file)
+
+    result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+    #output, error = process.communicate(sudo_password + '\n')
+
+    print(f"{hostname} Ceph restart? Output: ", result.stdout)
+    print(f"{hostname} Ceph restart? Errors: ", result.stderr)
 
     active_status = 0
     active_counter = 0
 
     while active_status == 0:
-        active_status = check_ceph_is_active(escaped_path, sudo_password)
+        active_status = check_ceph_is_active(escaped_path)
+        #, sudo_password)
         time.sleep(1)
         active_counter += 1
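
Taken together, these hunks drop the decrypted-sudo-password plumbing: the systemctl is-active probe now runs unprivileged, the remount goes through sudo without -S (so it assumes passwordless sudo for the helper script), and the old KEY_FILE/PASS_FILE decryption block is kept in the file but fenced off inside a triple-quoted string. Below is a condensed sketch of the resulting flow, reassembled from the hunks above; the names match the diff, but indentation, imports, and the simplified error handling are assumptions rather than a verbatim copy of miscellaneous.py.

# Condensed sketch of the post-commit flow in restart_ceph_unit(); reassembled
# from the diff above, not a verbatim copy of miscellaneous.py.
import os
import re
import socket
import subprocess
import sys
import time

def restart_ceph_unit(path):
    def check_ceph_is_active(escaped_path):
        # systemd mount unit derived from the escaped mount path
        unit_filename = 'mnt-' + escaped_path + '.mount'
        # Plain, unprivileged status query (no more sudo -S password pipe).
        result = subprocess.run(['systemctl', 'is-active', unit_filename],
                                stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
        return 1 if result.stdout.strip() == 'active' else 0

    hostname = socket.gethostname()
    if not re.match(r'/mnt/cephtest[-_\w]*$', path) or not os.path.exists(path):
        sys.exit(1)

    # systemd escapes '-' in unit names as '\x2d'; drop the leading '/mnt/' first.
    escaped_path = path[5:].replace('-', '\\x2d')

    # Remount via the helper; assumes passwordless sudo is configured for it.
    result = subprocess.run(['sudo', '/mnt/cephadm/bin/iotest_helper.py', 'remount', path],
                            stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    print(f"{hostname} Ceph restart? Output: ", result.stdout)
    print(f"{hostname} Ceph restart? Errors: ", result.stderr)

    # Poll until the mount unit reports active again.
    while not check_ceph_is_active(escaped_path):
        time.sleep(1)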

submit_scripts/hdd_ec63_kernel.sh

Lines changed: 6 additions & 3 deletions

@@ -4,7 +4,7 @@
 #SBATCH -p scc
 #SBATCH --nodes=20
 #SBATCH --reservation=worker_test
-#SBATCH --nodelist=worker[7377-7396]
+#SBATCH --nodelist=worker[7377-7386,7388-7397]
 
 # Define root directory
 root_dir=$PyBench_root_dir
@@ -23,9 +23,12 @@ sleep 10
 
 srun --nodes=20 python python_runs/run.py --benchmark "fio-serverless" --slurm-job-number ${SLURM_JOB_ID} --block-size 4M --config python_runs/config/nvme_test_kernel_hdd_ec63.yml --first-node ${first_node} --total-node-count 20 --node-count 20 --template-path /mnt/home/skrit/Documents/testing_clones/clone1/PyBenchFramework/examples/template/starting_template.fio --job-number 48
 
-srun --nodes=20 python python_runs/run.py --benchmark "fio-serverless" --slurm-job-number ${SLURM_JOB_ID} --io-type "read" --config python_runs/config/nvme_test_kernel_hdd_ec63.yml --first-node ${first_node} --total-node-count 20
+rm -f results/write/nvme_ec63_kernel_hdd/${SLURM_JOB_ID}/*.tmp
+rm -f results/write/nvme_ec63_kernel_hdd/${SLURM_JOB_ID}/*.json
 
-#srun --nodes=20 python python_runs/run.py --benchmark "fio-serverless" --slurm-job-number ${SLURM_JOB_ID} --io-type "write" --config python_runs/config/nvme_test_kernel_hdd_ec63.yml --first-node ${first_node} --total-node-count 20
+srun --nodes=20 python python_runs/run.py --benchmark "fio-serverless" --slurm-job-number ${SLURM_JOB_ID} --io-type "read" --block-size 4M --config python_runs/config/nvme_test_kernel_hdd_ec63.yml --first-node ${first_node} --total-node-count 20
+
+srun --nodes=20 python python_runs/run.py --benchmark "fio-serverless" --slurm-job-number ${SLURM_JOB_ID} --io-type "write" --block-size 4M --config python_runs/config/nvme_test_kernel_hdd_ec63.yml --first-node ${first_node} --total-node-count 20
 
 srun --nodes=20 python python_runs/run.py --benchmark "fio-serverless" --slurm-job-number ${SLURM_JOB_ID} --io-type "randread" --config python_runs/config/nvme_test_kernel_hdd_ec63.yml --first-node ${first_node} --total-node-count 20

submit_scripts/hdd_rep3_kernel.sh

Lines changed: 9 additions & 6 deletions

@@ -4,7 +4,7 @@
 #SBATCH -p scc
 #SBATCH --nodes=20
 #SBATCH --reservation=worker_test
-#SBATCH --nodelist=worker[7377-7396]
+#SBATCH --nodelist=worker[7377-7386,7388-7397]
 
 # Define root directory
 root_dir=$PyBench_root_dir
@@ -21,12 +21,15 @@ python python_runs/prep_work.py --benchmark "fio-serverless" --slurm-job-number
 
 sleep 10
 
-srun --nodes=20 python python_runs/run.py --benchmark "fio-serverless" --slurm-job-number ${SLURM_JOB_ID} --block-size 4M --config python_runs/config/nvme_test_kernel_hdd_rep3.yml --first-node ${first_node} --total-node-count 20 --node-count 20 --template-path /mnt/home/skrit/Documents/testing_clones/clone1/PyBenchFramework/examples/template/starting_template.fio --job-number 48
+#srun --nodes=20 python python_runs/run.py --benchmark "fio-serverless" --slurm-job-number ${SLURM_JOB_ID} --block-size 4M --config python_runs/config/nvme_test_kernel_hdd_rep3.yml --first-node ${first_node} --total-node-count 20 --node-count 20 --template-path /mnt/home/skrit/Documents/testing_clones/clone1/PyBenchFramework/examples/template/starting_template.fio --job-number 48
 
-srun --nodes=20 python python_runs/run.py --benchmark "fio-serverless" --slurm-job-number ${SLURM_JOB_ID} --io-type "read" --config python_runs/config/nvme_test_kernel_hdd_rep3.yml --first-node ${first_node} --total-node-count 20
+#rm -f results/write/nvme_rep3_kernel_hdd/${SLURM_JOB_ID}/*.tmp
+#rm -f results/write/nvme_rep3_kernel_hdd/${SLURM_JOB_ID}/*.json
 
-#srun --nodes=20 python python_runs/run.py --benchmark "fio-serverless" --slurm-job-number ${SLURM_JOB_ID} --io-type "write" --config python_runs/config/nvme_test_kernel_hdd_rep3.yml --first-node ${first_node} --total-node-count 20
+srun --nodes=20 python python_runs/run.py --benchmark "fio-serverless" --slurm-job-number ${SLURM_JOB_ID} --io-type "read" --block-size 4M --config python_runs/config/nvme_test_kernel_hdd_rep3.yml --first-node ${first_node} --total-node-count 20
 
-srun --nodes=20 python python_runs/run.py --benchmark "fio-serverless" --slurm-job-number ${SLURM_JOB_ID} --io-type "randread" --config python_runs/config/nvme_test_kernel_hdd_rep3.yml --first-node ${first_node} --total-node-count 20
+#srun --nodes=20 python python_runs/run.py --benchmark "fio-serverless" --slurm-job-number ${SLURM_JOB_ID} --io-type "write" --block-size 4M --config python_runs/config/nvme_test_kernel_hdd_rep3.yml --first-node ${first_node} --total-node-count 20
 
-srun --nodes=20 python python_runs/run.py --benchmark "fio-serverless" --slurm-job-number ${SLURM_JOB_ID} --io-type "randwrite" --config python_runs/config/nvme_test_kernel_hdd_rep3.yml --first-node ${first_node} --total-node-count 20
+#srun --nodes=20 python python_runs/run.py --benchmark "fio-serverless" --slurm-job-number ${SLURM_JOB_ID} --io-type "randread" --config python_runs/config/nvme_test_kernel_hdd_rep3.yml --first-node ${first_node} --total-node-count 20
+
+#srun --nodes=20 python python_runs/run.py --benchmark "fio-serverless" --slurm-job-number ${SLURM_JOB_ID} --io-type "randwrite" --config python_runs/config/nvme_test_kernel_hdd_rep3.yml --first-node ${first_node} --total-node-count 20

submit_scripts/ssd_ec63_kernel.sh

Lines changed: 7 additions & 4 deletions

@@ -4,7 +4,7 @@
 #SBATCH -p scc
 #SBATCH --nodes=20
 #SBATCH --reservation=worker_test
-#SBATCH --nodelist=worker[7377-7396]
+#SBATCH --nodelist=worker[7377-7386,7388-7397]
 
 # Define root directory
 root_dir=$PyBench_root_dir
@@ -21,11 +21,14 @@ python python_runs/prep_work.py --benchmark "fio-serverless" --slurm-job-number
 
 sleep 10
 
-srun --nodes=20 python python_runs/run.py --benchmark "fio-serverless" --slurm-job-number ${SLURM_JOB_ID} --block-size 4M --config python_runs/config/nvme_test_kernel_ssd_ec63.yml --first-node ${first_node} --total-node-count 20 --node-count 20 --template-path /mnt/home/skrit/Documents/testing_clones/clone1/PyBenchFramework/examples/template/starting_template.fio --job-number 48
+#srun --nodes=20 python python_runs/run.py --benchmark "fio-serverless" --slurm-job-number ${SLURM_JOB_ID} --block-size 4M --config python_runs/config/nvme_test_kernel_ssd_ec63.yml --first-node ${first_node} --total-node-count 20 --node-count 20 --template-path /mnt/home/skrit/Documents/testing_clones/clone1/PyBenchFramework/examples/template/starting_template.fio --job-number 48
 
-srun --nodes=20 python python_runs/run.py --benchmark "fio-serverless" --slurm-job-number ${SLURM_JOB_ID} --io-type "read" --config python_runs/config/nvme_test_kernel_ssd_ec63.yml --first-node ${first_node} --total-node-count 20
+#rm -f results/write/nvme_rep3_kernel/${SLURM_JOB_ID}/*.tmp
+#rm -f results/write/nvme_rep3_kernel/${SLURM_JOB_ID}/*.json
 
-#srun --nodes=20 python python_runs/run.py --benchmark "fio-serverless" --slurm-job-number ${SLURM_JOB_ID} --io-type "write" --config python_runs/config/nvme_test_kernel_ssd_ec63.yml --first-node ${first_node} --total-node-count 20
+srun --nodes=20 python python_runs/run.py --benchmark "fio-serverless" --slurm-job-number ${SLURM_JOB_ID} --io-type "read" --block-size 4M --config python_runs/config/nvme_test_kernel_ssd_ec63.yml --first-node ${first_node} --total-node-count 20
+
+srun --nodes=20 python python_runs/run.py --benchmark "fio-serverless" --slurm-job-number ${SLURM_JOB_ID} --io-type "write" --block-size 4M --config python_runs/config/nvme_test_kernel_ssd_ec63.yml --first-node ${first_node} --total-node-count 20
 
 srun --nodes=20 python python_runs/run.py --benchmark "fio-serverless" --slurm-job-number ${SLURM_JOB_ID} --io-type "randread" --config python_runs/config/nvme_test_kernel_ssd_ec63.yml --first-node ${first_node} --total-node-count 20

submit_scripts/ssd_rep3_kernel.sh

Lines changed: 7 additions & 4 deletions

@@ -4,7 +4,7 @@
 #SBATCH -p scc
 #SBATCH --nodes=20
 #SBATCH --reservation=worker_test
-#SBATCH --nodelist=worker[7377-7396]
+#SBATCH --nodelist=worker[7377-7386,7388-7397]
 
 # Define root directory
 root_dir=$PyBench_root_dir
@@ -21,11 +21,14 @@ python python_runs/prep_work.py --benchmark "fio-serverless" --slurm-job-number
 
 sleep 10
 
-srun --nodes=20 python python_runs/run.py --benchmark "fio-serverless" --slurm-job-number ${SLURM_JOB_ID} --block-size 4M --config python_runs/config/nvme_test_kernel_ssd_rep3.yml --first-node ${first_node} --total-node-count 20 --node-count 20 --template-path /mnt/home/skrit/Documents/testing_clones/clone1/PyBenchFramework/examples/template/starting_template.fio --job-number 48
+#srun --nodes=20 python python_runs/run.py --benchmark "fio-serverless" --slurm-job-number ${SLURM_JOB_ID} --block-size 4M --config python_runs/config/nvme_test_kernel_ssd_rep3.yml --first-node ${first_node} --total-node-count 20 --node-count 20 --template-path /mnt/home/skrit/Documents/testing_clones/clone1/PyBenchFramework/examples/template/starting_template.fio --job-number 48
 
-srun --nodes=20 python python_runs/run.py --benchmark "fio-serverless" --slurm-job-number ${SLURM_JOB_ID} --io-type "read" --config python_runs/config/nvme_test_kernel_ssd_rep3.yml --first-node ${first_node} --total-node-count 20
+#rm -f results/write/nvme_rep3_kernel/${SLURM_JOB_ID}/*.tmp
+#rm -f results/write/nvme_rep3_kernel/${SLURM_JOB_ID}/*.json
 
-#srun --nodes=20 python python_runs/run.py --benchmark "fio-serverless" --slurm-job-number ${SLURM_JOB_ID} --io-type "write" --config python_runs/config/nvme_test_kernel_ssd_rep3.yml --first-node ${first_node} --total-node-count 20
+srun --nodes=20 python python_runs/run.py --benchmark "fio-serverless" --slurm-job-number ${SLURM_JOB_ID} --io-type "read" --block-size 4M --node-count 2,1 --config python_runs/config/nvme_test_kernel_ssd_rep3.yml --first-node ${first_node} --total-node-count 20
+
+srun --nodes=20 python python_runs/run.py --benchmark "fio-serverless" --slurm-job-number ${SLURM_JOB_ID} --io-type "write" --block-size 4M --config python_runs/config/nvme_test_kernel_ssd_rep3.yml --first-node ${first_node} --total-node-count 20
 
 srun --nodes=20 python python_runs/run.py --benchmark "fio-serverless" --slurm-job-number ${SLURM_JOB_ID} --io-type "randread" --config python_runs/config/nvme_test_kernel_ssd_rep3.yml --first-node ${first_node} --total-node-count 20
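
All four submit scripts drive the same entry point, python_runs/run.py, mixing a YAML --config with per-run flags such as --io-type and --block-size. run.py itself is not touched by this commit, so the sketch below is only a hypothetical mirror of the flags visible in the srun lines above: the option names come from those lines, while types, defaults, and the override behaviour are assumptions.

# Hypothetical sketch of the CLI surface the submit scripts rely on; the real
# python_runs/run.py is not in this diff, so argument handling here is assumed.
import argparse

def parse_args(argv=None):
    p = argparse.ArgumentParser(description="PyBenchFramework fio-serverless runner (sketch)")
    p.add_argument("--benchmark")                   # e.g. "fio-serverless"
    p.add_argument("--slurm-job-number")            # ${SLURM_JOB_ID} from sbatch
    p.add_argument("--config")                      # YAML such as nvme_test_kernel_ssd_rep3.yml
    p.add_argument("--io-type")                     # read / write / randread / randwrite
    p.add_argument("--block-size")                  # e.g. 4M
    p.add_argument("--node-count")                  # e.g. "20" or "2,1"
    p.add_argument("--total-node-count", type=int)  # nodes in the whole allocation
    p.add_argument("--first-node")                  # first host in the allocation
    p.add_argument("--template-path")               # fio job template
    p.add_argument("--job-number")                  # numjobs sweep, e.g. "48"
    return p.parse_args(argv)

if __name__ == "__main__":
    args = parse_args()
    # Presumably the CLI values override the matching keys in the YAML config.
    print(args.benchmark, args.io_type, args.block_size, args.config)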
