Skip to content

Commit

Permalink
tcmur_device: skip reporting the event if device in recovery
Browse files Browse the repository at this point in the history
If the device is in recovery, we can defer reporting the event in
the recovery when reopening the device. And if the device is stopped
or stopping we can just skip it.

Just wait for the report event to finish when recoverying the device,
because the recovery will close and then open the device during which
the private data maybe released. And it may cause use-after-free crash
in report event routine.

Signed-off-by: Xiubo Li <[email protected]>
  • Loading branch information
lxbsz committed Sep 14, 2021
1 parent b4d1656 commit 7f9ad1c
Show file tree
Hide file tree
Showing 6 changed files with 52 additions and 10 deletions.
14 changes: 13 additions & 1 deletion main.c
Original file line number Diff line number Diff line change
Expand Up @@ -1044,9 +1044,15 @@ static int dev_added(struct tcmu_device *dev)
goto cleanup_format_lock;
}

ret = pthread_cond_init(&rdev->report_event_cond, NULL);
if (ret) {
ret = -ret;
goto cleanup_rdev_lock;
}

ret = setup_io_work_queue(dev);
if (ret < 0)
goto cleanup_rdev_lock;
goto cleanup_event_cond;

ret = setup_aio_tracking(rdev);
if (ret < 0)
Expand Down Expand Up @@ -1088,6 +1094,8 @@ static int dev_added(struct tcmu_device *dev)
cleanup_aio_tracking(rdev);
cleanup_io_work_queue:
cleanup_io_work_queue(dev, true);
cleanup_event_cond:
pthread_cond_destroy(&rdev->report_event_cond);
cleanup_rdev_lock:
pthread_mutex_destroy(&rdev->rdev_lock);
cleanup_format_lock:
Expand Down Expand Up @@ -1130,6 +1138,10 @@ static void dev_removed(struct tcmu_device *dev)

tcmur_destroy_work(rdev->event_work);

ret = pthread_cond_destroy(&rdev->report_event_cond);
if (ret != 0)
tcmu_err("could not cleanup report event cond %d\n", ret);

ret = pthread_mutex_destroy(&rdev->rdev_lock);
if (ret != 0)
tcmu_err("could not cleanup state lock %d\n", ret);
Expand Down
9 changes: 3 additions & 6 deletions rbd.c
Original file line number Diff line number Diff line change
Expand Up @@ -146,17 +146,14 @@ static int tcmu_rbd_service_status_update(struct tcmu_device *dev,

#endif /* RBD_LOCK_ACQUIRE_SUPPORT */

static int tcmu_rbd_report_event(struct tcmu_device *dev)
static int tcmu_rbd_report_event(struct tcmu_device *dev, bool has_lock)
{
struct tcmur_device *rdev = tcmu_dev_get_private(dev);

/*
* We ignore the specific event and report all the current counter
* values, because tools like gwcli/dashboard may not see every
* update, and we do not want one event to overwrite the info.
*/
return tcmu_rbd_service_status_update(dev,
rdev->lock_state == TCMUR_DEV_LOCK_WRITE_LOCKED ? true : false);
return tcmu_rbd_service_status_update(dev, has_lock);
}

static int tcmu_rbd_service_register(struct tcmu_device *dev)
Expand Down Expand Up @@ -215,7 +212,7 @@ static int tcmu_rbd_service_register(struct tcmu_device *dev)
goto free_meta_buf;
}

ret = tcmu_rbd_report_event(dev);
ret = tcmu_rbd_report_event(dev, false);

free_meta_buf:
free(metadata_buf);
Expand Down
6 changes: 6 additions & 0 deletions target.c
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,12 @@ static void tgt_port_grp_recovery_work_fn(void *arg)
*/
list_for_each_safe(&tpg->devs, rdev, tmp_rdev, recovery_entry) {
list_del(&rdev->recovery_entry);

pthread_mutex_lock(&rdev->rdev_lock);
if (rdev->flags & TCMUR_DEV_FLAG_REPORTING_EVENT)
pthread_cond_wait(&rdev->report_event_cond, &rdev->rdev_lock);
pthread_mutex_unlock(&rdev->rdev_lock);

ret = __tcmu_reopen_dev(rdev->dev, -1);
if (ret) {
tcmu_dev_err(rdev->dev, "Could not reinitialize device. (err %d).\n",
Expand Down
2 changes: 1 addition & 1 deletion tcmu-runner.h
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,7 @@ struct tcmur_handler {
*
* Return 0 on success and a -Exyz error code on error.
*/
int (*report_event)(struct tcmu_device *dev);
int (*report_event)(struct tcmu_device *dev, bool has_lock);

/*
* If the lock is acquired and the tag is not TCMU_INVALID_LOCK_TAG,
Expand Down
28 changes: 26 additions & 2 deletions tcmur_device.c
Original file line number Diff line number Diff line change
Expand Up @@ -86,8 +86,8 @@ int __tcmu_reopen_dev(struct tcmu_device *dev, int retries)
rhandler->close(dev);
}

pthread_mutex_lock(&rdev->rdev_lock);
ret = -EIO;
pthread_mutex_lock(&rdev->rdev_lock);
while (ret != 0 && !(rdev->flags & TCMUR_DEV_FLAG_STOPPING) &&
(retries < 0 || attempt <= retries)) {
pthread_mutex_unlock(&rdev->rdev_lock);
Expand Down Expand Up @@ -128,7 +128,10 @@ int tcmu_reopen_dev(struct tcmu_device *dev, int retries)
pthread_mutex_unlock(&rdev->rdev_lock);
return -EBUSY;
}

rdev->flags |= TCMUR_DEV_FLAG_IN_RECOVERY;
if (rdev->flags & TCMUR_DEV_FLAG_REPORTING_EVENT)
pthread_cond_wait(&rdev->report_event_cond, &rdev->rdev_lock);
pthread_mutex_unlock(&rdev->rdev_lock);

return __tcmu_reopen_dev(dev, retries);
Expand Down Expand Up @@ -159,6 +162,7 @@ static void __tcmu_report_event(void *data)
struct tcmu_device *dev = data;
struct tcmur_device *rdev = tcmu_dev_get_private(dev);
struct tcmur_handler *rhandler = tcmu_get_runner_handler(dev);
bool has_lock;
int ret;

/*
Expand All @@ -168,9 +172,29 @@ static void __tcmu_report_event(void *data)
sleep(1);

pthread_mutex_lock(&rdev->rdev_lock);
ret = rhandler->report_event(dev);
/*
* If the device is in recovery, will skip reporting the event
* this time because the event should be report when the device
* is reopening.
*/
if (rdev->flags & (TCMUR_DEV_FLAG_IN_RECOVERY |
TCMUR_DEV_FLAG_STOPPING |
TCMUR_DEV_FLAG_STOPPED)) {
pthread_mutex_unlock(&rdev->rdev_lock);
return;
}

rdev->flags |= TCMUR_DEV_FLAG_REPORTING_EVENT;
has_lock = rdev->lock_state == TCMUR_DEV_LOCK_WRITE_LOCKED;
pthread_mutex_unlock(&rdev->rdev_lock);

ret = rhandler->report_event(dev, has_lock);
if (ret)
tcmu_dev_err(dev, "Could not report events. Error %d.\n", ret);

pthread_mutex_lock(&rdev->rdev_lock);
rdev->flags &= ~TCMUR_DEV_FLAG_REPORTING_EVENT;
pthread_cond_signal(&rdev->report_event_cond);
pthread_mutex_unlock(&rdev->rdev_lock);
}

Expand Down
3 changes: 3 additions & 0 deletions tcmur_device.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#define TCMUR_DEV_FLAG_IS_OPEN (1 << 2)
#define TCMUR_DEV_FLAG_STOPPING (1 << 3)
#define TCMUR_DEV_FLAG_STOPPED (1 << 4)
#define TCMUR_DEV_FLAG_REPORTING_EVENT (1 << 5)

#define TCMUR_UA_DEV_SIZE_CHANGED 0

Expand Down Expand Up @@ -83,6 +84,8 @@ struct tcmur_device {

int cmd_time_out;

pthread_cond_t report_event_cond;

pthread_spinlock_t cmds_list_lock; /* protects cmds_list */
struct list_head cmds_list;
};
Expand Down

0 comments on commit 7f9ad1c

Please sign in to comment.