Skip to content

Commit 072cb1f

Browse files
committed
tcmur_device: skip reporting the event if device in recovery
If the device is in recovery, we can defer reporting the event in the recovery when reopening the device. And if the device is stopped or stopping we can just skip it. Just wait for the report event to finish when recoverying the device, because the recovery will close and then open the device during which the private data maybe released. And it may cause use-after-free crash in report event routine. Signed-off-by: Xiubo Li <[email protected]>
1 parent b4d1656 commit 072cb1f

File tree

7 files changed

+52
-10
lines changed

7 files changed

+52
-10
lines changed

kmod-devel-25-16.el8.x86_64.rpm

18.7 KB
Binary file not shown.

main.c

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1044,9 +1044,15 @@ static int dev_added(struct tcmu_device *dev)
10441044
goto cleanup_format_lock;
10451045
}
10461046

1047+
ret = pthread_cond_init(&rdev->report_event_cond, NULL);
1048+
if (ret) {
1049+
ret = -ret;
1050+
goto cleanup_rdev_lock;
1051+
}
1052+
10471053
ret = setup_io_work_queue(dev);
10481054
if (ret < 0)
1049-
goto cleanup_rdev_lock;
1055+
goto cleanup_event_cond;
10501056

10511057
ret = setup_aio_tracking(rdev);
10521058
if (ret < 0)
@@ -1088,6 +1094,8 @@ static int dev_added(struct tcmu_device *dev)
10881094
cleanup_aio_tracking(rdev);
10891095
cleanup_io_work_queue:
10901096
cleanup_io_work_queue(dev, true);
1097+
cleanup_event_cond:
1098+
pthread_cond_destroy(&rdev->report_event_cond);
10911099
cleanup_rdev_lock:
10921100
pthread_mutex_destroy(&rdev->rdev_lock);
10931101
cleanup_format_lock:
@@ -1130,6 +1138,10 @@ static void dev_removed(struct tcmu_device *dev)
11301138

11311139
tcmur_destroy_work(rdev->event_work);
11321140

1141+
ret = pthread_cond_destroy(&rdev->report_event_cond);
1142+
if (ret != 0)
1143+
tcmu_err("could not cleanup report event cond %d\n", ret);
1144+
11331145
ret = pthread_mutex_destroy(&rdev->rdev_lock);
11341146
if (ret != 0)
11351147
tcmu_err("could not cleanup state lock %d\n", ret);

rbd.c

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -146,17 +146,14 @@ static int tcmu_rbd_service_status_update(struct tcmu_device *dev,
146146

147147
#endif /* RBD_LOCK_ACQUIRE_SUPPORT */
148148

149-
static int tcmu_rbd_report_event(struct tcmu_device *dev)
149+
static int tcmu_rbd_report_event(struct tcmu_device *dev, bool has_lock)
150150
{
151-
struct tcmur_device *rdev = tcmu_dev_get_private(dev);
152-
153151
/*
154152
* We ignore the specific event and report all the current counter
155153
* values, because tools like gwcli/dashboard may not see every
156154
* update, and we do not want one event to overwrite the info.
157155
*/
158-
return tcmu_rbd_service_status_update(dev,
159-
rdev->lock_state == TCMUR_DEV_LOCK_WRITE_LOCKED ? true : false);
156+
return tcmu_rbd_service_status_update(dev, has_lock);
160157
}
161158

162159
static int tcmu_rbd_service_register(struct tcmu_device *dev)
@@ -215,7 +212,7 @@ static int tcmu_rbd_service_register(struct tcmu_device *dev)
215212
goto free_meta_buf;
216213
}
217214

218-
ret = tcmu_rbd_report_event(dev);
215+
ret = tcmu_rbd_report_event(dev, false);
219216

220217
free_meta_buf:
221218
free(metadata_buf);

target.c

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -252,6 +252,12 @@ static void tgt_port_grp_recovery_work_fn(void *arg)
252252
*/
253253
list_for_each_safe(&tpg->devs, rdev, tmp_rdev, recovery_entry) {
254254
list_del(&rdev->recovery_entry);
255+
256+
pthread_mutex_lock(&rdev->rdev_lock);
257+
if (rdev->flags & TCMUR_DEV_FLAG_REPORTING_EVENT)
258+
pthread_cond_wait(&rdev->report_event_cond, &rdev->rdev_lock);
259+
pthread_mutex_unlock(&rdev->rdev_lock);
260+
255261
ret = __tcmu_reopen_dev(rdev->dev, -1);
256262
if (ret) {
257263
tcmu_dev_err(rdev->dev, "Could not reinitialize device. (err %d).\n",

tcmu-runner.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -161,7 +161,7 @@ struct tcmur_handler {
161161
*
162162
* Return 0 on success and a -Exyz error code on error.
163163
*/
164-
int (*report_event)(struct tcmu_device *dev);
164+
int (*report_event)(struct tcmu_device *dev, bool has_lock);
165165

166166
/*
167167
* If the lock is acquired and the tag is not TCMU_INVALID_LOCK_TAG,

tcmur_device.c

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -86,8 +86,8 @@ int __tcmu_reopen_dev(struct tcmu_device *dev, int retries)
8686
rhandler->close(dev);
8787
}
8888

89-
pthread_mutex_lock(&rdev->rdev_lock);
9089
ret = -EIO;
90+
pthread_mutex_lock(&rdev->rdev_lock);
9191
while (ret != 0 && !(rdev->flags & TCMUR_DEV_FLAG_STOPPING) &&
9292
(retries < 0 || attempt <= retries)) {
9393
pthread_mutex_unlock(&rdev->rdev_lock);
@@ -128,7 +128,10 @@ int tcmu_reopen_dev(struct tcmu_device *dev, int retries)
128128
pthread_mutex_unlock(&rdev->rdev_lock);
129129
return -EBUSY;
130130
}
131+
131132
rdev->flags |= TCMUR_DEV_FLAG_IN_RECOVERY;
133+
if (rdev->flags & TCMUR_DEV_FLAG_REPORTING_EVENT)
134+
pthread_cond_wait(&rdev->report_event_cond, &rdev->rdev_lock);
132135
pthread_mutex_unlock(&rdev->rdev_lock);
133136

134137
return __tcmu_reopen_dev(dev, retries);
@@ -159,6 +162,7 @@ static void __tcmu_report_event(void *data)
159162
struct tcmu_device *dev = data;
160163
struct tcmur_device *rdev = tcmu_dev_get_private(dev);
161164
struct tcmur_handler *rhandler = tcmu_get_runner_handler(dev);
165+
bool has_lock;
162166
int ret;
163167

164168
/*
@@ -168,9 +172,29 @@ static void __tcmu_report_event(void *data)
168172
sleep(1);
169173

170174
pthread_mutex_lock(&rdev->rdev_lock);
171-
ret = rhandler->report_event(dev);
175+
/*
176+
* If the device is in recovery, will skip reporting the event
177+
* this time because the event should be report when the device
178+
* is reopening.
179+
*/
180+
if (rdev->flags & (TCMUR_DEV_FLAG_IN_RECOVERY |
181+
TCMUR_DEV_FLAG_STOPPING |
182+
TCMUR_DEV_FLAG_STOPPED)) {
183+
pthread_mutex_unlock(&rdev->rdev_lock);
184+
return;
185+
}
186+
187+
rdev->flags |= TCMUR_DEV_FLAG_REPORTING_EVENT;
188+
has_lock = rdev->lock_state == TCMUR_DEV_LOCK_WRITE_LOCKED;
189+
pthread_mutex_unlock(&rdev->rdev_lock);
190+
191+
ret = rhandler->report_event(dev, has_lock);
172192
if (ret)
173193
tcmu_dev_err(dev, "Could not report events. Error %d.\n", ret);
194+
195+
pthread_mutex_lock(&rdev->rdev_lock);
196+
rdev->flags &= ~TCMUR_DEV_FLAG_REPORTING_EVENT;
197+
pthread_cond_signal(&rdev->report_event_cond);
174198
pthread_mutex_unlock(&rdev->rdev_lock);
175199
}
176200

tcmur_device.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
#define TCMUR_DEV_FLAG_IS_OPEN (1 << 2)
2323
#define TCMUR_DEV_FLAG_STOPPING (1 << 3)
2424
#define TCMUR_DEV_FLAG_STOPPED (1 << 4)
25+
#define TCMUR_DEV_FLAG_REPORTING_EVENT (1 << 5)
2526

2627
#define TCMUR_UA_DEV_SIZE_CHANGED 0
2728

@@ -83,6 +84,8 @@ struct tcmur_device {
8384

8485
int cmd_time_out;
8586

87+
pthread_cond_t report_event_cond;
88+
8689
pthread_spinlock_t cmds_list_lock; /* protects cmds_list */
8790
struct list_head cmds_list;
8891
};

0 commit comments

Comments
 (0)