From 9f8ad8ca00cf7677e9d982c9e98d4b7586e759f6 Mon Sep 17 00:00:00 2001 From: "J. R. Okajima" Date: Wed, 2 Jun 2010 23:38:37 +0900 Subject: [PATCH 1/3] aufs: possible bugfix, revalidate inode race between readdir and lookup Both of readdir and lookup operation need to assign the aufs inode number, but it requires other aufs locks including xi_nondir_mtx which prevents hard-linked inode number from race condition. There can happen a violation of the order of these locks. They acquire these locks. aufs_readdir("./dirA") + si_read_lock + fi_write_lock + di_write_lock + ii_write_lock for dirA + xi_nondir_mtx for non-dir aufs_lookup("./dirA/fileB") + si_read_lock + di_write_lock + xi_nondir_mtx for non-dir + ii_write_lock_nested for fileB Here the fileB may be in copy-up operation which acquires the parent's dentry-info and inode->info lock. So aufs_lookup() waits for the completion of copy-up, aufs_readdir() waits for xi_nondir_mtx, and the copy-up waits for the parent, but it is held by readdir. This is very complicated situation and I am afraid the design of aufs inode assignment is not godd. But I don't have other idea. This commit refines xi_nondir_mtx and releases it before "ii_write_lock_nested for fileB." Signed-off-by: J. R. Okajima --- fs/aufs/i_op.c | 18 ++---------------- fs/aufs/inode.c | 45 ++++++++++++++++++++++++++++++++++++--------- 2 files changed, 38 insertions(+), 25 deletions(-) diff --git a/fs/aufs/i_op.c b/fs/aufs/i_op.c index 31b49f99feec..fd82aa2e7574 100644 --- a/fs/aufs/i_op.c +++ b/fs/aufs/i_op.c @@ -139,11 +139,9 @@ static struct dentry *aufs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { struct dentry *ret, *parent; - struct inode *inode, *h_inode; - struct mutex *mtx; + struct inode *inode; struct super_block *sb; int err, npositive; - aufs_bindex_t bstart; IMustLock(dir); @@ -168,19 +166,7 @@ static struct dentry *aufs_lookup(struct inode *dir, struct dentry *dentry, inode = NULL; if (npositive) { - bstart = au_dbstart(dentry); - h_inode = au_h_dptr(dentry, bstart)->d_inode; - if (!S_ISDIR(h_inode->i_mode)) { - /* - * stop 'race'-ing between hardlinks under different - * parents. - */ - mtx = &au_sbr(sb, bstart)->br_xino.xi_nondir_mtx; - mutex_lock(mtx); - inode = au_new_inode(dentry, /*must_new*/0); - mutex_unlock(mtx); - } else - inode = au_new_inode(dentry, /*must_new*/0); + inode = au_new_inode(dentry, /*must_new*/0); ret = (void *)inode; } if (IS_ERR(inode)) diff --git a/fs/aufs/inode.c b/fs/aufs/inode.c index b5adb708da5e..473d88d30985 100644 --- a/fs/aufs/inode.c +++ b/fs/aufs/inode.c @@ -271,11 +271,10 @@ int au_ino(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino, { int err; struct mutex *mtx; - const int isdir = (d_type == DT_DIR); - /* prevent hardlinks from race condition */ + /* prevent hardlinked inode number from race condition */ mtx = NULL; - if (!isdir) { + if (d_type != DT_DIR) { mtx = &au_sbr(sb, bindex)->br_xino.xi_nondir_mtx; mutex_lock(mtx); } @@ -294,7 +293,7 @@ int au_ino(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino, } out: - if (!isdir) + if (mtx) mutex_unlock(mtx); return err; } @@ -303,9 +302,10 @@ int au_ino(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino, /* todo: return with unlocked? */ struct inode *au_new_inode(struct dentry *dentry, int must_new) { - struct inode *inode; + struct inode *inode, *h_inode; struct dentry *h_dentry; struct super_block *sb; + struct mutex *mtx; ino_t h_ino, ino; int err, match; aufs_bindex_t bstart; @@ -313,12 +313,25 @@ struct inode *au_new_inode(struct dentry *dentry, int must_new) sb = dentry->d_sb; bstart = au_dbstart(dentry); h_dentry = au_h_dptr(dentry, bstart); - h_ino = h_dentry->d_inode->i_ino; + h_inode = h_dentry->d_inode; + h_ino = h_inode->i_ino; + + /* + * stop 'race'-ing between hardlinks under different + * parents. + */ + mtx = NULL; + if (!S_ISDIR(h_inode->i_mode)) + mtx = &au_sbr(sb, bstart)->br_xino.xi_nondir_mtx; + + new_ino: + if (mtx) + mutex_lock(mtx); err = au_xino_read(sb, bstart, h_ino, &ino); inode = ERR_PTR(err); if (unlikely(err)) goto out; - new_ino: + if (!ino) { ino = au_xino_new_ino(sb); if (unlikely(!ino)) { @@ -346,11 +359,21 @@ struct inode *au_new_inode(struct dentry *dentry, int must_new) iget_failed(inode); goto out_err; } else if (!must_new) { + /* + * horrible race condition between lookup, readdir and copyup + * (or something). + */ + if (mtx) + mutex_unlock(mtx); err = reval_inode(inode, dentry, &match); - if (!err) + if (!err) { + mtx = NULL; goto out; /* success */ - else if (match) + } else if (match) { + mtx = NULL; goto out_iput; + } else if (mtx) + mutex_lock(mtx); } if (unlikely(au_test_fs_unique_ino(h_dentry->d_inode))) @@ -362,6 +385,8 @@ struct inode *au_new_inode(struct dentry *dentry, int must_new) err = au_xino_write(sb, bstart, h_ino, /*ino*/0); if (!err) { iput(inode); + if (mtx) + mutex_unlock(mtx); goto new_ino; } @@ -370,6 +395,8 @@ struct inode *au_new_inode(struct dentry *dentry, int must_new) out_err: inode = ERR_PTR(err); out: + if (mtx) + mutex_unlock(mtx); return inode; } From 92172f59e363f630ab44217e78135cdeff90ba0d Mon Sep 17 00:00:00 2001 From: "J. R. Okajima" Date: Fri, 4 Jun 2010 02:09:23 +0900 Subject: [PATCH 2/3] aufs: tiny, fake type-cast by union Signed-off-by: J. R. Okajima --- fs/aufs/cpup.c | 18 ++++++++++-------- fs/aufs/i_op.c | 18 ++++++++++-------- fs/aufs/vfsub.c | 14 ++++++++++++-- fs/aufs/xino.c | 18 ++++++++++++++---- 4 files changed, 46 insertions(+), 22 deletions(-) diff --git a/fs/aufs/cpup.c b/fs/aufs/cpup.c index 238bd8d9696c..a927cff7ab7e 100644 --- a/fs/aufs/cpup.c +++ b/fs/aufs/cpup.c @@ -409,29 +409,31 @@ static int au_do_cpup_symlink(struct path *h_path, struct dentry *h_src, { int err, symlen; mm_segment_t old_fs; - char *sym; + union { + char *k; + char __user *u; + } sym; err = -ENOSYS; if (unlikely(!h_src->d_inode->i_op->readlink)) goto out; err = -ENOMEM; - sym = au_getname(); - if (unlikely(!sym)) + sym.k = au_getname(); + if (unlikely(!sym.k)) goto out; old_fs = get_fs(); set_fs(KERNEL_DS); - symlen = h_src->d_inode->i_op->readlink(h_src, (char __user *)sym, - PATH_MAX); + symlen = h_src->d_inode->i_op->readlink(h_src, sym.u, PATH_MAX); err = symlen; set_fs(old_fs); if (symlen > 0) { - sym[symlen] = 0; - err = vfsub_symlink(h_dir, h_path, sym); + sym.k[symlen] = 0; + err = vfsub_symlink(h_dir, h_path, sym.k); } - __putname(sym); + __putname(sym.k); out: return err; diff --git a/fs/aufs/i_op.c b/fs/aufs/i_op.c index fd82aa2e7574..6d43d7bd121f 100644 --- a/fs/aufs/i_op.c +++ b/fs/aufs/i_op.c @@ -805,29 +805,31 @@ static int aufs_readlink(struct dentry *dentry, char __user *buf, int bufsiz) static void *aufs_follow_link(struct dentry *dentry, struct nameidata *nd) { int err; - char *buf; mm_segment_t old_fs; + union { + char *k; + char __user *u; + } buf; err = -ENOMEM; - buf = au_getname(); - if (unlikely(!buf)) + buf.k = au_getname(); + if (unlikely(!buf.k)) goto out; aufs_read_lock(dentry, AuLock_IR); old_fs = get_fs(); set_fs(KERNEL_DS); - err = h_readlink(dentry, au_dbstart(dentry), (char __user *)buf, - PATH_MAX); + err = h_readlink(dentry, au_dbstart(dentry), buf.u, PATH_MAX); set_fs(old_fs); aufs_read_unlock(dentry, AuLock_IR); if (err >= 0) { - buf[err] = 0; + buf.k[err] = 0; /* will be freed by put_link */ - nd_set_link(nd, buf); + nd_set_link(nd, buf.k); return NULL; /* success */ } - __putname(buf); + __putname(buf.k); out: path_put(&nd->path); diff --git a/fs/aufs/vfsub.c b/fs/aufs/vfsub.c index bb8730a7ed0c..910e70374357 100644 --- a/fs/aufs/vfsub.c +++ b/fs/aufs/vfsub.c @@ -354,10 +354,15 @@ ssize_t vfsub_read_k(struct file *file, void *kbuf, size_t count, { ssize_t err; mm_segment_t oldfs; + union { + void *k; + char __user *u; + } buf; + buf.k = kbuf; oldfs = get_fs(); set_fs(KERNEL_DS); - err = vfsub_read_u(file, (char __user *)kbuf, count, ppos); + err = vfsub_read_u(file, buf.u, count, ppos); set_fs(oldfs); return err; } @@ -379,10 +384,15 @@ ssize_t vfsub_write_k(struct file *file, void *kbuf, size_t count, loff_t *ppos) { ssize_t err; mm_segment_t oldfs; + union { + void *k; + const char __user *u; + } buf; + buf.k = kbuf; oldfs = get_fs(); set_fs(KERNEL_DS); - err = vfsub_write_u(file, (const char __user *)kbuf, count, ppos); + err = vfsub_write_u(file, buf.u, count, ppos); set_fs(oldfs); return err; } diff --git a/fs/aufs/xino.c b/fs/aufs/xino.c index 974135b11995..6cee7060a780 100644 --- a/fs/aufs/xino.c +++ b/fs/aufs/xino.c @@ -25,17 +25,22 @@ #include #include "aufs.h" -ssize_t xino_fread(au_readf_t func, struct file *file, void *buf, size_t size, +ssize_t xino_fread(au_readf_t func, struct file *file, void *kbuf, size_t size, loff_t *pos) { ssize_t err; mm_segment_t oldfs; + union { + void *k; + char __user *u; + } buf; + buf.k = kbuf; oldfs = get_fs(); set_fs(KERNEL_DS); do { /* todo: signal_pending? */ - err = func(file, (char __user *)buf, size, pos); + err = func(file, buf.u, size, pos); } while (err == -EAGAIN || err == -EINTR); set_fs(oldfs); @@ -49,18 +54,23 @@ ssize_t xino_fread(au_readf_t func, struct file *file, void *buf, size_t size, /* ---------------------------------------------------------------------- */ -static ssize_t do_xino_fwrite(au_writef_t func, struct file *file, void *buf, +static ssize_t do_xino_fwrite(au_writef_t func, struct file *file, void *kbuf, size_t size, loff_t *pos) { ssize_t err; mm_segment_t oldfs; + union { + void *k; + const char __user *u; + } buf; + buf.k = kbuf; oldfs = get_fs(); set_fs(KERNEL_DS); lockdep_off(); do { /* todo: signal_pending? */ - err = func(file, (const char __user *)buf, size, pos); + err = func(file, buf.u, size, pos); } while (err == -EAGAIN || err == -EINTR); lockdep_on(); set_fs(oldfs); From c503a51d10c9c7767f1eba8e47de545d31ab7858 Mon Sep 17 00:00:00 2001 From: "J. R. Okajima" Date: Fri, 4 Jun 2010 02:54:13 +0900 Subject: [PATCH 3/3] aufs: bugfix, the dentry paramter for security funcs Pass the correct parameter. Signed-off-by: J. R. Okajima --- fs/aufs/vfsub.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/aufs/vfsub.c b/fs/aufs/vfsub.c index 1de136d6b720..cc66fd8db60f 100644 --- a/fs/aufs/vfsub.c +++ b/fs/aufs/vfsub.c @@ -153,7 +153,7 @@ int vfsub_create(struct inode *dir, struct path *path, int mode) d = path->dentry; path->dentry = d->d_parent; - err = security_path_mknod(path, path->dentry, mode, 0); + err = security_path_mknod(path, d, mode, 0); path->dentry = d; if (unlikely(err)) goto out; @@ -200,7 +200,7 @@ int vfsub_symlink(struct inode *dir, struct path *path, const char *symname) d = path->dentry; path->dentry = d->d_parent; - err = security_path_symlink(path, path->dentry, symname); + err = security_path_symlink(path, d, symname); path->dentry = d; if (unlikely(err)) goto out; @@ -231,7 +231,7 @@ int vfsub_mknod(struct inode *dir, struct path *path, int mode, dev_t dev) d = path->dentry; path->dentry = d->d_parent; - err = security_path_mknod(path, path->dentry, mode, dev); + err = security_path_mknod(path, d, mode, dev); path->dentry = d; if (unlikely(err)) goto out; @@ -276,7 +276,7 @@ int vfsub_link(struct dentry *src_dentry, struct inode *dir, struct path *path) d = path->dentry; path->dentry = d->d_parent; - err = security_path_link(src_dentry, path, path->dentry); + err = security_path_link(src_dentry, path, d); path->dentry = d; if (unlikely(err)) goto out; @@ -318,7 +318,7 @@ int vfsub_rename(struct inode *src_dir, struct dentry *src_dentry, d = path->dentry; path->dentry = d->d_parent; tmp.dentry = src_dentry->d_parent; - err = security_path_rename(&tmp, src_dentry, path, path->dentry); + err = security_path_rename(&tmp, src_dentry, path, d); path->dentry = d; if (unlikely(err)) goto out; @@ -353,7 +353,7 @@ int vfsub_mkdir(struct inode *dir, struct path *path, int mode) d = path->dentry; path->dentry = d->d_parent; - err = security_path_mkdir(path, path->dentry, mode); + err = security_path_mkdir(path, d, mode); path->dentry = d; if (unlikely(err)) goto out; @@ -384,7 +384,7 @@ int vfsub_rmdir(struct inode *dir, struct path *path) d = path->dentry; path->dentry = d->d_parent; - err = security_path_rmdir(path, path->dentry); + err = security_path_rmdir(path, d); path->dentry = d; if (unlikely(err)) goto out;