Diffstat (limited to 'fs')
395 files changed, 15397 insertions, 11475 deletions
diff --git a/fs/9p/Makefile b/fs/9p/Makefile index 1a940ec7af61..91fba025fcbe 100644 --- a/fs/9p/Makefile +++ b/fs/9p/Makefile @@ -8,6 +8,8 @@ obj-$(CONFIG_9P_FS) := 9p.o vfs_dir.o \ vfs_dentry.o \ v9fs.o \ - fid.o + fid.o \ + xattr.o \ + xattr_user.o 9p-$(CONFIG_9P_FSCACHE) += cache.o diff --git a/fs/9p/fid.c b/fs/9p/fid.c index 7317b39b2815..358563689064 100644 --- a/fs/9p/fid.c +++ b/fs/9p/fid.c @@ -97,6 +97,34 @@ static struct p9_fid *v9fs_fid_find(struct dentry *dentry, u32 uid, int any) return ret; } +/* + * We need to hold v9ses->rename_sem as long as we hold references + * to returned path array. Array element contain pointers to + * dentry names. + */ +static int build_path_from_dentry(struct v9fs_session_info *v9ses, + struct dentry *dentry, char ***names) +{ + int n = 0, i; + char **wnames; + struct dentry *ds; + + for (ds = dentry; !IS_ROOT(ds); ds = ds->d_parent) + n++; + + wnames = kmalloc(sizeof(char *) * n, GFP_KERNEL); + if (!wnames) + goto err_out; + + for (ds = dentry, i = (n-1); i >= 0; i--, ds = ds->d_parent) + wnames[i] = (char *)ds->d_name.name; + + *names = wnames; + return n; +err_out: + return -ENOMEM; +} + /** * v9fs_fid_lookup - lookup for a fid, try to walk if not found * @dentry: dentry to look for fid in @@ -112,7 +140,7 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry) int i, n, l, clone, any, access; u32 uid; struct p9_fid *fid, *old_fid = NULL; - struct dentry *d, *ds; + struct dentry *ds; struct v9fs_session_info *v9ses; char **wnames, *uname; @@ -139,49 +167,62 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry) fid = v9fs_fid_find(dentry, uid, any); if (fid) return fid; - + /* + * we don't have a matching fid. To do a TWALK we need + * parent fid. We need to prevent rename when we want to + * look at the parent. + */ + down_read(&v9ses->rename_sem); ds = dentry->d_parent; fid = v9fs_fid_find(ds, uid, any); - if (!fid) { /* walk from the root */ - n = 0; - for (ds = dentry; !IS_ROOT(ds); ds = ds->d_parent) - n++; + if (fid) { + /* Found the parent fid do a lookup with that */ + fid = p9_client_walk(fid, 1, (char **)&dentry->d_name.name, 1); + goto fid_out; + } + up_read(&v9ses->rename_sem); - fid = v9fs_fid_find(ds, uid, any); - if (!fid) { /* the user is not attached to the fs yet */ - if (access == V9FS_ACCESS_SINGLE) - return ERR_PTR(-EPERM); + /* start from the root and try to do a lookup */ + fid = v9fs_fid_find(dentry->d_sb->s_root, uid, any); + if (!fid) { + /* the user is not attached to the fs yet */ + if (access == V9FS_ACCESS_SINGLE) + return ERR_PTR(-EPERM); - if (v9fs_proto_dotu(v9ses)) + if (v9fs_proto_dotu(v9ses) || v9fs_proto_dotl(v9ses)) uname = NULL; - else - uname = v9ses->uname; + else + uname = v9ses->uname; - fid = p9_client_attach(v9ses->clnt, NULL, uname, uid, - v9ses->aname); - - if (IS_ERR(fid)) - return fid; - - v9fs_fid_add(ds, fid); - } - } else /* walk from the parent */ - n = 1; + fid = p9_client_attach(v9ses->clnt, NULL, uname, uid, + v9ses->aname); + if (IS_ERR(fid)) + return fid; - if (ds == dentry) + v9fs_fid_add(dentry->d_sb->s_root, fid); + } + /* If we are root ourself just return that */ + if (dentry->d_sb->s_root == dentry) return fid; - - wnames = kmalloc(sizeof(char *) * n, GFP_KERNEL); - if (!wnames) - return ERR_PTR(-ENOMEM); - - for (d = dentry, i = (n-1); i >= 0; i--, d = d->d_parent) - wnames[i] = (char *) d->d_name.name; - + /* + * Do a multipath walk with attached root. 
+ * When walking parent we need to make sure we + * don't have a parallel rename happening + */ + down_read(&v9ses->rename_sem); + n = build_path_from_dentry(v9ses, dentry, &wnames); + if (n < 0) { + fid = ERR_PTR(n); + goto err_out; + } clone = 1; i = 0; while (i < n) { l = min(n - i, P9_MAXWELEM); + /* + * We need to hold rename lock when doing a multipath + * walk to ensure none of the patch component change + */ fid = p9_client_walk(fid, l, &wnames[i], clone); if (IS_ERR(fid)) { if (old_fid) { @@ -193,15 +234,17 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry) p9_client_clunk(old_fid); } kfree(wnames); - return fid; + goto err_out; } old_fid = fid; i += l; clone = 0; } - kfree(wnames); +fid_out: v9fs_fid_add(dentry, fid); +err_out: + up_read(&v9ses->rename_sem); return fid; } diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index f8b86e92cd66..38dc0e067599 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -237,6 +237,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, __putname(v9ses->uname); return ERR_PTR(-ENOMEM); } + init_rwsem(&v9ses->rename_sem); rc = bdi_setup_and_register(&v9ses->bdi, "9p", BDI_CAP_MAP_COPY); if (rc) { @@ -278,7 +279,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, v9ses->maxdata = v9ses->clnt->msize - P9_IOHDRSZ; /* for legacy mode, fall back to V9FS_ACCESS_ANY */ - if (!v9fs_proto_dotu(v9ses) && + if (!(v9fs_proto_dotu(v9ses) || v9fs_proto_dotl(v9ses)) && ((v9ses->flags&V9FS_ACCESS_MASK) == V9FS_ACCESS_USER)) { v9ses->flags &= ~V9FS_ACCESS_MASK; diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index bec4d0bcb458..4c963c9fc41f 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -104,6 +104,7 @@ struct v9fs_session_info { struct p9_client *clnt; /* 9p client */ struct list_head slist; /* list of sessions registered with v9fs */ struct backing_dev_info bdi; + struct rw_semaphore rename_sem; }; struct p9_fid *v9fs_session_init(struct v9fs_session_info *, const char *, diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h index 32ef4009d030..f47c6bbb01b3 100644 --- a/fs/9p/v9fs_vfs.h +++ b/fs/9p/v9fs_vfs.h @@ -55,6 +55,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode); void v9fs_clear_inode(struct inode *inode); ino_t v9fs_qid2ino(struct p9_qid *qid); void v9fs_stat2inode(struct p9_wstat *, struct inode *, struct super_block *); +void v9fs_stat2inode_dotl(struct p9_stat_dotl *, struct inode *); int v9fs_dir_release(struct inode *inode, struct file *filp); int v9fs_file_open(struct inode *inode, struct file *file); void v9fs_inode2stat(struct inode *inode, struct p9_wstat *stat); diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index d61e3b28ce37..16c8a2a98c1b 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -87,29 +87,19 @@ static void p9stat_init(struct p9_wstat *stbuf) } /** - * v9fs_dir_readdir - read a directory + * v9fs_alloc_rdir_buf - Allocate buffer used for read and readdir * @filp: opened file structure - * @dirent: directory structure ??? - * @filldir: function to populate directory structure ??? 
+ * @buflen: Length in bytes of buffer to allocate * */ -static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) +static int v9fs_alloc_rdir_buf(struct file *filp, int buflen) { - int over; - struct p9_wstat st; - int err = 0; - struct p9_fid *fid; - int buflen; - int reclen = 0; struct p9_rdir *rdir; + struct p9_fid *fid; + int err = 0; - P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name); fid = filp->private_data; - - buflen = fid->clnt->msize - P9_IOHDRSZ; - - /* allocate rdir on demand */ if (!fid->rdir) { rdir = kmalloc(sizeof(struct p9_rdir) + buflen, GFP_KERNEL); @@ -128,6 +118,36 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) spin_unlock(&filp->f_dentry->d_lock); kfree(rdir); } +exit: + return err; +} + +/** + * v9fs_dir_readdir - read a directory + * @filp: opened file structure + * @dirent: directory structure ??? + * @filldir: function to populate directory structure ??? + * + */ + +static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + int over; + struct p9_wstat st; + int err = 0; + struct p9_fid *fid; + int buflen; + int reclen = 0; + struct p9_rdir *rdir; + + P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name); + fid = filp->private_data; + + buflen = fid->clnt->msize - P9_IOHDRSZ; + + err = v9fs_alloc_rdir_buf(filp, buflen); + if (err) + goto exit; rdir = (struct p9_rdir *) fid->rdir; err = mutex_lock_interruptible(&rdir->mutex); @@ -146,7 +166,7 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) while (rdir->head < rdir->tail) { p9stat_init(&st); err = p9stat_read(rdir->buf + rdir->head, - buflen - rdir->head, &st, + rdir->tail - rdir->head, &st, fid->clnt->proto_version); if (err) { P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err); @@ -176,6 +196,88 @@ exit: return err; } +/** + * v9fs_dir_readdir_dotl - read a directory + * @filp: opened file structure + * @dirent: buffer to fill dirent structures + * @filldir: function to populate dirent structures + * + */ +static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent, + filldir_t filldir) +{ + int over; + int err = 0; + struct p9_fid *fid; + int buflen; + struct p9_rdir *rdir; + struct p9_dirent curdirent; + u64 oldoffset = 0; + + P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name); + fid = filp->private_data; + + buflen = fid->clnt->msize - P9_READDIRHDRSZ; + + err = v9fs_alloc_rdir_buf(filp, buflen); + if (err) + goto exit; + rdir = (struct p9_rdir *) fid->rdir; + + err = mutex_lock_interruptible(&rdir->mutex); + if (err) + return err; + + while (err == 0) { + if (rdir->tail == rdir->head) { + err = p9_client_readdir(fid, rdir->buf, buflen, + filp->f_pos); + if (err <= 0) + goto unlock_and_exit; + + rdir->head = 0; + rdir->tail = err; + } + + while (rdir->head < rdir->tail) { + + err = p9dirent_read(rdir->buf + rdir->head, + buflen - rdir->head, &curdirent, + fid->clnt->proto_version); + if (err < 0) { + P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err); + err = -EIO; + goto unlock_and_exit; + } + + /* d_off in dirent structure tracks the offset into + * the next dirent in the dir. However, filldir() + * expects offset into the current dirent. Hence + * while calling filldir send the offset from the + * previous dirent structure. 
+ */ + over = filldir(dirent, curdirent.d_name, + strlen(curdirent.d_name), + oldoffset, v9fs_qid2ino(&curdirent.qid), + curdirent.d_type); + oldoffset = curdirent.d_off; + + if (over) { + err = 0; + goto unlock_and_exit; + } + + filp->f_pos = curdirent.d_off; + rdir->head += err; + } + } + +unlock_and_exit: + mutex_unlock(&rdir->mutex); +exit: + return err; +} + /** * v9fs_dir_release - close a directory @@ -207,7 +309,7 @@ const struct file_operations v9fs_dir_operations = { const struct file_operations v9fs_dir_operations_dotl = { .read = generic_read_dir, .llseek = generic_file_llseek, - .readdir = v9fs_dir_readdir, + .readdir = v9fs_dir_readdir_dotl, .open = v9fs_file_open, .release = v9fs_dir_release, }; diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 25b300e1c9d7..e97c92bd6f16 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -59,9 +59,13 @@ int v9fs_file_open(struct inode *inode, struct file *file) struct p9_fid *fid; int omode; - P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p \n", inode, file); + P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, file); v9ses = v9fs_inode2v9ses(inode); - omode = v9fs_uflags2omode(file->f_flags, v9fs_proto_dotu(v9ses)); + if (v9fs_proto_dotl(v9ses)) + omode = file->f_flags; + else + omode = v9fs_uflags2omode(file->f_flags, + v9fs_proto_dotu(v9ses)); fid = file->private_data; if (!fid) { fid = v9fs_fid_clone(file->f_path.dentry); @@ -73,11 +77,12 @@ int v9fs_file_open(struct inode *inode, struct file *file) p9_client_clunk(fid); return err; } - if (omode & P9_OTRUNC) { + if (file->f_flags & O_TRUNC) { i_size_write(inode, 0); inode->i_blocks = 0; } - if ((file->f_flags & O_APPEND) && (!v9fs_proto_dotu(v9ses))) + if ((file->f_flags & O_APPEND) && + (!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses))) generic_file_llseek(file, 0, SEEK_END); } @@ -139,7 +144,7 @@ ssize_t v9fs_file_readn(struct file *filp, char *data, char __user *udata, u32 count, u64 offset) { - int n, total; + int n, total, size; struct p9_fid *fid = filp->private_data; P9_DPRINTK(P9_DEBUG_VFS, "fid %d offset %llu count %d\n", fid->fid, @@ -147,6 +152,7 @@ v9fs_file_readn(struct file *filp, char *data, char __user *udata, u32 count, n = 0; total = 0; + size = fid->iounit ? fid->iounit : fid->clnt->msize - P9_IOHDRSZ; do { n = p9_client_read(fid, data, udata, offset, count); if (n <= 0) @@ -160,7 +166,7 @@ v9fs_file_readn(struct file *filp, char *data, char __user *udata, u32 count, offset += n; count -= n; total += n; - } while (count > 0 && n == (fid->clnt->msize - P9_IOHDRSZ)); + } while (count > 0 && n == size); if (n < 0) total = n; @@ -183,11 +189,13 @@ v9fs_file_read(struct file *filp, char __user *udata, size_t count, { int ret; struct p9_fid *fid; + size_t size; P9_DPRINTK(P9_DEBUG_VFS, "count %zu offset %lld\n", count, *offset); fid = filp->private_data; - if (count > (fid->clnt->msize - P9_IOHDRSZ)) + size = fid->iounit ? fid->iounit : fid->clnt->msize - P9_IOHDRSZ; + if (count > size) ret = v9fs_file_readn(filp, NULL, udata, count, *offset); else ret = p9_client_read(fid, NULL, udata, *offset, count); @@ -224,9 +232,7 @@ v9fs_file_write(struct file *filp, const char __user * data, fid = filp->private_data; clnt = fid->clnt; - rsize = fid->iounit; - if (!rsize || rsize > clnt->msize-P9_IOHDRSZ) - rsize = clnt->msize - P9_IOHDRSZ; + rsize = fid->iounit ? 
fid->iounit : clnt->msize - P9_IOHDRSZ; do { if (count < rsize) @@ -257,15 +263,13 @@ v9fs_file_write(struct file *filp, const char __user * data, return total; } -static int v9fs_file_fsync(struct file *filp, struct dentry *dentry, - int datasync) +static int v9fs_file_fsync(struct file *filp, int datasync) { struct p9_fid *fid; struct p9_wstat wstat; int retval; - P9_DPRINTK(P9_DEBUG_VFS, "filp %p dentry %p datasync %x\n", filp, - dentry, datasync); + P9_DPRINTK(P9_DEBUG_VFS, "filp %p datasync %x\n", filp, datasync); fid = filp->private_data; v9fs_blank_wstat(&wstat); diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 4331b3b5ee1c..6e94f3247cec 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -35,6 +35,7 @@ #include <linux/idr.h> #include <linux/sched.h> #include <linux/slab.h> +#include <linux/xattr.h> #include <net/9p/9p.h> #include <net/9p/client.h> @@ -42,6 +43,7 @@ #include "v9fs_vfs.h" #include "fid.h" #include "cache.h" +#include "xattr.h" static const struct inode_operations v9fs_dir_inode_operations; static const struct inode_operations v9fs_dir_inode_operations_dotu; @@ -236,6 +238,41 @@ void v9fs_destroy_inode(struct inode *inode) #endif /** + * v9fs_get_fsgid_for_create - Helper function to get the gid for creating a + * new file system object. This checks the S_ISGID to determine the owning + * group of the new file system object. + */ + +static gid_t v9fs_get_fsgid_for_create(struct inode *dir_inode) +{ + BUG_ON(dir_inode == NULL); + + if (dir_inode->i_mode & S_ISGID) { + /* set_gid bit is set.*/ + return dir_inode->i_gid; + } + return current_fsgid(); +} + +/** + * v9fs_dentry_from_dir_inode - helper function to get the dentry from + * dir inode. + * + */ + +static struct dentry *v9fs_dentry_from_dir_inode(struct inode *inode) +{ + struct dentry *dentry; + + spin_lock(&dcache_lock); + /* Directory should have only one entry. 
*/ + BUG_ON(S_ISDIR(inode->i_mode) && !list_is_singular(&inode->i_dentry)); + dentry = list_entry(inode->i_dentry.next, struct dentry, d_alias); + spin_unlock(&dcache_lock); + return dentry; +} + +/** * v9fs_get_inode - helper function to setup an inode * @sb: superblock * @mode: mode to setup inode with @@ -267,7 +304,13 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode) case S_IFBLK: case S_IFCHR: case S_IFSOCK: - if (!v9fs_proto_dotu(v9ses)) { + if (v9fs_proto_dotl(v9ses)) { + inode->i_op = &v9fs_file_inode_operations_dotl; + inode->i_fop = &v9fs_file_operations_dotl; + } else if (v9fs_proto_dotu(v9ses)) { + inode->i_op = &v9fs_file_inode_operations; + inode->i_fop = &v9fs_file_operations; + } else { P9_DPRINTK(P9_DEBUG_ERROR, "special files without extended mode\n"); err = -EINVAL; @@ -396,23 +439,14 @@ void v9fs_clear_inode(struct inode *inode) #endif } -/** - * v9fs_inode_from_fid - populate an inode by issuing a attribute request - * @v9ses: session information - * @fid: fid to issue attribute request for - * @sb: superblock on which to create inode - * - */ - static struct inode * -v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, +v9fs_inode(struct v9fs_session_info *v9ses, struct p9_fid *fid, struct super_block *sb) { int err, umode; - struct inode *ret; + struct inode *ret = NULL; struct p9_wstat *st; - ret = NULL; st = p9_client_stat(fid); if (IS_ERR(st)) return ERR_CAST(st); @@ -433,15 +467,62 @@ v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, #endif p9stat_free(st); kfree(st); - return ret; - error: p9stat_free(st); kfree(st); return ERR_PTR(err); } +static struct inode * +v9fs_inode_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid, + struct super_block *sb) +{ + struct inode *ret = NULL; + int err; + struct p9_stat_dotl *st; + + st = p9_client_getattr_dotl(fid, P9_STATS_BASIC); + if (IS_ERR(st)) + return ERR_CAST(st); + + ret = v9fs_get_inode(sb, st->st_mode); + if (IS_ERR(ret)) { + err = PTR_ERR(ret); + goto error; + } + + v9fs_stat2inode_dotl(st, ret); + ret->i_ino = v9fs_qid2ino(&st->qid); +#ifdef CONFIG_9P_FSCACHE + v9fs_vcookie_set_qid(ret, &st->qid); + v9fs_cache_inode_get_cookie(ret); +#endif + kfree(st); + return ret; +error: + kfree(st); + return ERR_PTR(err); +} + +/** + * v9fs_inode_from_fid - Helper routine to populate an inode by + * issuing a attribute request + * @v9ses: session information + * @fid: fid to issue attribute request for + * @sb: superblock on which to create inode + * + */ +static inline struct inode * +v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, + struct super_block *sb) +{ + if (v9fs_proto_dotl(v9ses)) + return v9fs_inode_dotl(v9ses, fid, sb); + else + return v9fs_inode(v9ses, fid, sb); +} + /** * v9fs_remove - helper function to remove files and directories * @dir: directory inode that is being deleted @@ -563,6 +644,118 @@ error: } /** + * v9fs_vfs_create_dotl - VFS hook to create files for 9P2000.L protocol. 
+ * @dir: directory inode that is being created + * @dentry: dentry that is being deleted + * @mode: create permissions + * @nd: path information + * + */ + +static int +v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + int err = 0; + char *name = NULL; + gid_t gid; + int flags; + struct v9fs_session_info *v9ses; + struct p9_fid *fid = NULL; + struct p9_fid *dfid, *ofid; + struct file *filp; + struct p9_qid qid; + struct inode *inode; + + v9ses = v9fs_inode2v9ses(dir); + if (nd && nd->flags & LOOKUP_OPEN) + flags = nd->intent.open.flags - 1; + else + flags = O_RDWR; + + name = (char *) dentry->d_name.name; + P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_create_dotl: name:%s flags:0x%x " + "mode:0x%x\n", name, flags, mode); + + dfid = v9fs_fid_lookup(dentry->d_parent); + if (IS_ERR(dfid)) { + err = PTR_ERR(dfid); + P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err); + return err; + } + + /* clone a fid to use for creation */ + ofid = p9_client_walk(dfid, 0, NULL, 1); + if (IS_ERR(ofid)) { + err = PTR_ERR(ofid); + P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); + return err; + } + + gid = v9fs_get_fsgid_for_create(dir); + err = p9_client_create_dotl(ofid, name, flags, mode, gid, &qid); + if (err < 0) { + P9_DPRINTK(P9_DEBUG_VFS, + "p9_client_open_dotl failed in creat %d\n", + err); + goto error; + } + + /* No need to populate the inode if we are not opening the file AND + * not in cached mode. + */ + if (!v9ses->cache && !(nd && nd->flags & LOOKUP_OPEN)) { + /* Not in cached mode. No need to populate inode with stat */ + dentry->d_op = &v9fs_dentry_operations; + p9_client_clunk(ofid); + d_instantiate(dentry, NULL); + return 0; + } + + /* Now walk from the parent so we can get an unopened fid. 
*/ + fid = p9_client_walk(dfid, 1, &name, 1); + if (IS_ERR(fid)) { + err = PTR_ERR(fid); + P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); + fid = NULL; + goto error; + } + + /* instantiate inode and assign the unopened fid to dentry */ + inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err); + goto error; + } + dentry->d_op = &v9fs_cached_dentry_operations; + d_instantiate(dentry, inode); + err = v9fs_fid_add(dentry, fid); + if (err < 0) + goto error; + + /* if we are opening a file, assign the open fid to the file */ + if (nd && nd->flags & LOOKUP_OPEN) { + filp = lookup_instantiate_filp(nd, dentry, v9fs_open_created); + if (IS_ERR(filp)) { + p9_client_clunk(ofid); + return PTR_ERR(filp); + } + filp->private_data = ofid; + } else + p9_client_clunk(ofid); + + return 0; + +error: + if (ofid) + p9_client_clunk(ofid); + if (fid) + p9_client_clunk(fid); + return err; +} + +/** * v9fs_vfs_create - VFS hook to create files * @dir: directory inode that is being created * @dentry: dentry that is being deleted @@ -652,6 +845,83 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) return err; } + +/** + * v9fs_vfs_mkdir_dotl - VFS mkdir hook to create a directory + * @dir: inode that is being unlinked + * @dentry: dentry that is being unlinked + * @mode: mode for new directory + * + */ + +static int v9fs_vfs_mkdir_dotl(struct inode *dir, struct dentry *dentry, + int mode) +{ + int err; + struct v9fs_session_info *v9ses; + struct p9_fid *fid = NULL, *dfid = NULL; + gid_t gid; + char *name; + struct inode *inode; + struct p9_qid qid; + struct dentry *dir_dentry; + + P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name); + err = 0; + v9ses = v9fs_inode2v9ses(dir); + + mode |= S_IFDIR; + dir_dentry = v9fs_dentry_from_dir_inode(dir); + dfid = v9fs_fid_lookup(dir_dentry); + if (IS_ERR(dfid)) { + err = PTR_ERR(dfid); + P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err); + dfid = NULL; + goto error; + } + + gid = v9fs_get_fsgid_for_create(dir); + if (gid < 0) { + P9_DPRINTK(P9_DEBUG_VFS, "v9fs_get_fsgid_for_create failed\n"); + goto error; + } + + name = (char *) dentry->d_name.name; + err = p9_client_mkdir_dotl(dfid, name, mode, gid, &qid); + if (err < 0) + goto error; + + /* instantiate inode and assign the unopened fid to the dentry */ + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { + fid = p9_client_walk(dfid, 1, &name, 1); + if (IS_ERR(fid)) { + err = PTR_ERR(fid); + P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", + err); + fid = NULL; + goto error; + } + + inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", + err); + goto error; + } + dentry->d_op = &v9fs_cached_dentry_operations; + d_instantiate(dentry, inode); + err = v9fs_fid_add(dentry, fid); + if (err < 0) + goto error; + fid = NULL; + } +error: + if (fid) + p9_client_clunk(fid); + return err; +} + /** * v9fs_vfs_lookup - VFS lookup hook to "walk" to a new inode * @dir: inode that is being walked from @@ -678,6 +948,7 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, sb = dir->i_sb; v9ses = v9fs_inode2v9ses(dir); + /* We can walk d_parent because we hold the dir->i_mutex */ dfid = v9fs_fid_lookup(dentry->d_parent); if (IS_ERR(dfid)) return ERR_CAST(dfid); @@ -785,27 +1056,33 @@ v9fs_vfs_rename(struct inode *old_dir, struct 
dentry *old_dentry, goto clunk_olddir; } + down_write(&v9ses->rename_sem); if (v9fs_proto_dotl(v9ses)) { retval = p9_client_rename(oldfid, newdirfid, (char *) new_dentry->d_name.name); if (retval != -ENOSYS) goto clunk_newdir; } + if (old_dentry->d_parent != new_dentry->d_parent) { + /* + * 9P .u can only handle file rename in the same directory + */ - /* 9P can only handle file rename in the same directory */ - if (memcmp(&olddirfid->qid, &newdirfid->qid, sizeof(newdirfid->qid))) { P9_DPRINTK(P9_DEBUG_ERROR, "old dir and new dir are different\n"); retval = -EXDEV; goto clunk_newdir; } - v9fs_blank_wstat(&wstat); wstat.muid = v9ses->uname; wstat.name = (char *) new_dentry->d_name.name; retval = p9_client_wstat(oldfid, &wstat); clunk_newdir: + if (!retval) + /* successful rename */ + d_move(old_dentry, new_dentry); + up_write(&v9ses->rename_sem); p9_client_clunk(newdirfid); clunk_olddir: @@ -853,6 +1130,42 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, return 0; } +static int +v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + int err; + struct v9fs_session_info *v9ses; + struct p9_fid *fid; + struct p9_stat_dotl *st; + + P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry); + err = -EPERM; + v9ses = v9fs_inode2v9ses(dentry->d_inode); + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) + return simple_getattr(mnt, dentry, stat); + + fid = v9fs_fid_lookup(dentry); + if (IS_ERR(fid)) + return PTR_ERR(fid); + + /* Ask for all the fields in stat structure. Server will return + * whatever it supports + */ + + st = p9_client_getattr_dotl(fid, P9_STATS_ALL); + if (IS_ERR(st)) + return PTR_ERR(st); + + v9fs_stat2inode_dotl(st, dentry->d_inode); + generic_fillattr(dentry->d_inode, stat); + /* Change block size to what the server returned */ + stat->blksize = st->st_blksize; + + kfree(st); + return 0; +} + /** * v9fs_vfs_setattr - set file metadata * @dentry: file whose metadata to set @@ -903,6 +1216,49 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr) } /** + * v9fs_vfs_setattr_dotl - set file metadata + * @dentry: file whose metadata to set + * @iattr: metadata assignment structure + * + */ + +static int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr) +{ + int retval; + struct v9fs_session_info *v9ses; + struct p9_fid *fid; + struct p9_iattr_dotl p9attr; + + P9_DPRINTK(P9_DEBUG_VFS, "\n"); + + retval = inode_change_ok(dentry->d_inode, iattr); + if (retval) + return retval; + + p9attr.valid = iattr->ia_valid; + p9attr.mode = iattr->ia_mode; + p9attr.uid = iattr->ia_uid; + p9attr.gid = iattr->ia_gid; + p9attr.size = iattr->ia_size; + p9attr.atime_sec = iattr->ia_atime.tv_sec; + p9attr.atime_nsec = iattr->ia_atime.tv_nsec; + p9attr.mtime_sec = iattr->ia_mtime.tv_sec; + p9attr.mtime_nsec = iattr->ia_mtime.tv_nsec; + + retval = -EPERM; + v9ses = v9fs_inode2v9ses(dentry->d_inode); + fid = v9fs_fid_lookup(dentry); + if (IS_ERR(fid)) + return PTR_ERR(fid); + + retval = p9_client_setattr(fid, &p9attr); + if (retval >= 0) + retval = inode_setattr(dentry->d_inode, iattr); + + return retval; +} + +/** * v9fs_stat2inode - populate an inode structure with mistat info * @stat: Plan 9 metadata (mistat) structure * @inode: inode to populate @@ -980,6 +1336,77 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, } /** + * v9fs_stat2inode_dotl - populate an inode structure with stat info + * @stat: stat structure + * @inode: inode to populate + * @sb: superblock of filesystem + * + */ + +void 
+v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode) +{ + + if ((stat->st_result_mask & P9_STATS_BASIC) == P9_STATS_BASIC) { + inode->i_atime.tv_sec = stat->st_atime_sec; + inode->i_atime.tv_nsec = stat->st_atime_nsec; + inode->i_mtime.tv_sec = stat->st_mtime_sec; + inode->i_mtime.tv_nsec = stat->st_mtime_nsec; + inode->i_ctime.tv_sec = stat->st_ctime_sec; + inode->i_ctime.tv_nsec = stat->st_ctime_nsec; + inode->i_uid = stat->st_uid; + inode->i_gid = stat->st_gid; + inode->i_nlink = stat->st_nlink; + inode->i_mode = stat->st_mode; + inode->i_rdev = new_decode_dev(stat->st_rdev); + + if ((S_ISBLK(inode->i_mode)) || (S_ISCHR(inode->i_mode))) + init_special_inode(inode, inode->i_mode, inode->i_rdev); + + i_size_write(inode, stat->st_size); + inode->i_blocks = stat->st_blocks; + } else { + if (stat->st_result_mask & P9_STATS_ATIME) { + inode->i_atime.tv_sec = stat->st_atime_sec; + inode->i_atime.tv_nsec = stat->st_atime_nsec; + } + if (stat->st_result_mask & P9_STATS_MTIME) { + inode->i_mtime.tv_sec = stat->st_mtime_sec; + inode->i_mtime.tv_nsec = stat->st_mtime_nsec; + } + if (stat->st_result_mask & P9_STATS_CTIME) { + inode->i_ctime.tv_sec = stat->st_ctime_sec; + inode->i_ctime.tv_nsec = stat->st_ctime_nsec; + } + if (stat->st_result_mask & P9_STATS_UID) + inode->i_uid = stat->st_uid; + if (stat->st_result_mask & P9_STATS_GID) + inode->i_gid = stat->st_gid; + if (stat->st_result_mask & P9_STATS_NLINK) + inode->i_nlink = stat->st_nlink; + if (stat->st_result_mask & P9_STATS_MODE) { + inode->i_mode = stat->st_mode; + if ((S_ISBLK(inode->i_mode)) || + (S_ISCHR(inode->i_mode))) + init_special_inode(inode, inode->i_mode, + inode->i_rdev); + } + if (stat->st_result_mask & P9_STATS_RDEV) + inode->i_rdev = new_decode_dev(stat->st_rdev); + if (stat->st_result_mask & P9_STATS_SIZE) + i_size_write(inode, stat->st_size); + if (stat->st_result_mask & P9_STATS_BLOCKS) + inode->i_blocks = stat->st_blocks; + } + if (stat->st_result_mask & P9_STATS_GEN) + inode->i_generation = stat->st_gen; + + /* Currently we don't support P9_STATS_BTIME and P9_STATS_DATA_VERSION + * because the inode structure does not have fields for them. 
+ */ +} + +/** * v9fs_qid2ino - convert qid into inode number * @qid: qid to hash * @@ -1022,7 +1449,7 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen) if (IS_ERR(fid)) return PTR_ERR(fid); - if (!v9fs_proto_dotu(v9ses)) + if (!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses)) return -EBADF; st = p9_client_stat(fid); @@ -1128,6 +1555,99 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry, } /** + * v9fs_vfs_symlink_dotl - helper function to create symlinks + * @dir: directory inode containing symlink + * @dentry: dentry for symlink + * @symname: symlink data + * + * See Also: 9P2000.L RFC for more information + * + */ + +static int +v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry, + const char *symname) +{ + struct v9fs_session_info *v9ses; + struct p9_fid *dfid; + struct p9_fid *fid = NULL; + struct inode *inode; + struct p9_qid qid; + char *name; + int err; + gid_t gid; + + name = (char *) dentry->d_name.name; + P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_symlink_dotl : %lu,%s,%s\n", + dir->i_ino, name, symname); + v9ses = v9fs_inode2v9ses(dir); + + dfid = v9fs_fid_lookup(dentry->d_parent); + if (IS_ERR(dfid)) { + err = PTR_ERR(dfid); + P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err); + return err; + } + + gid = v9fs_get_fsgid_for_create(dir); + + if (gid < 0) { + P9_DPRINTK(P9_DEBUG_VFS, "v9fs_get_egid failed %d\n", gid); + goto error; + } + + /* Server doesn't alter fid on TSYMLINK. Hence no need to clone it. */ + err = p9_client_symlink(dfid, name, (char *)symname, gid, &qid); + + if (err < 0) { + P9_DPRINTK(P9_DEBUG_VFS, "p9_client_symlink failed %d\n", err); + goto error; + } + + if (v9ses->cache) { + /* Now walk from the parent so we can get an unopened fid. */ + fid = p9_client_walk(dfid, 1, &name, 1); + if (IS_ERR(fid)) { + err = PTR_ERR(fid); + P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", + err); + fid = NULL; + goto error; + } + + /* instantiate inode and assign the unopened fid to dentry */ + inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", + err); + goto error; + } + dentry->d_op = &v9fs_cached_dentry_operations; + d_instantiate(dentry, inode); + err = v9fs_fid_add(dentry, fid); + if (err < 0) + goto error; + fid = NULL; + } else { + /* Not in cached mode. 
No need to populate inode with stat */ + inode = v9fs_get_inode(dir->i_sb, S_IFLNK); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto error; + } + dentry->d_op = &v9fs_dentry_operations; + d_instantiate(dentry, inode); + } + +error: + if (fid) + p9_client_clunk(fid); + + return err; +} + +/** * v9fs_vfs_symlink - helper function to create symlinks * @dir: directory inode containing symlink * @dentry: dentry for symlink @@ -1186,6 +1706,76 @@ clunk_fid: } /** + * v9fs_vfs_link_dotl - create a hardlink for dotl + * @old_dentry: dentry for file to link to + * @dir: inode destination for new link + * @dentry: dentry for link + * + */ + +static int +v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + int err; + struct p9_fid *dfid, *oldfid; + char *name; + struct v9fs_session_info *v9ses; + struct dentry *dir_dentry; + + P9_DPRINTK(P9_DEBUG_VFS, "dir ino: %lu, old_name: %s, new_name: %s\n", + dir->i_ino, old_dentry->d_name.name, + dentry->d_name.name); + + v9ses = v9fs_inode2v9ses(dir); + dir_dentry = v9fs_dentry_from_dir_inode(dir); + dfid = v9fs_fid_lookup(dir_dentry); + if (IS_ERR(dfid)) + return PTR_ERR(dfid); + + oldfid = v9fs_fid_lookup(old_dentry); + if (IS_ERR(oldfid)) + return PTR_ERR(oldfid); + + name = (char *) dentry->d_name.name; + + err = p9_client_link(dfid, oldfid, (char *)dentry->d_name.name); + + if (err < 0) { + P9_DPRINTK(P9_DEBUG_VFS, "p9_client_link failed %d\n", err); + return err; + } + + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { + /* Get the latest stat info from server. */ + struct p9_fid *fid; + struct p9_stat_dotl *st; + + fid = v9fs_fid_lookup(old_dentry); + if (IS_ERR(fid)) + return PTR_ERR(fid); + + st = p9_client_getattr_dotl(fid, P9_STATS_BASIC); + if (IS_ERR(st)) + return PTR_ERR(st); + + v9fs_stat2inode_dotl(st, old_dentry->d_inode); + + kfree(st); + } else { + /* Caching disabled. No need to get upto date stat info. + * This dentry will be released immediately. 
So, just i_count++ + */ + atomic_inc(&old_dentry->d_inode->i_count); + } + + dentry->d_op = old_dentry->d_op; + d_instantiate(dentry, old_dentry->d_inode); + + return err; +} + +/** * v9fs_vfs_mknod - create a special file * @dir: inode destination for new link * @dentry: dentry for file @@ -1230,6 +1820,100 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) return retval; } +/** + * v9fs_vfs_mknod_dotl - create a special file + * @dir: inode destination for new link + * @dentry: dentry for file + * @mode: mode for creation + * @rdev: device associated with special file + * + */ +static int +v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int mode, + dev_t rdev) +{ + int err; + char *name; + struct v9fs_session_info *v9ses; + struct p9_fid *fid = NULL, *dfid = NULL; + struct inode *inode; + gid_t gid; + struct p9_qid qid; + struct dentry *dir_dentry; + + P9_DPRINTK(P9_DEBUG_VFS, + " %lu,%s mode: %x MAJOR: %u MINOR: %u\n", dir->i_ino, + dentry->d_name.name, mode, MAJOR(rdev), MINOR(rdev)); + + if (!new_valid_dev(rdev)) + return -EINVAL; + + v9ses = v9fs_inode2v9ses(dir); + dir_dentry = v9fs_dentry_from_dir_inode(dir); + dfid = v9fs_fid_lookup(dir_dentry); + if (IS_ERR(dfid)) { + err = PTR_ERR(dfid); + P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err); + dfid = NULL; + goto error; + } + + gid = v9fs_get_fsgid_for_create(dir); + if (gid < 0) { + P9_DPRINTK(P9_DEBUG_VFS, "v9fs_get_fsgid_for_create failed\n"); + goto error; + } + + name = (char *) dentry->d_name.name; + + err = p9_client_mknod_dotl(dfid, name, mode, rdev, gid, &qid); + if (err < 0) + goto error; + + /* instantiate inode and assign the unopened fid to the dentry */ + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { + fid = p9_client_walk(dfid, 1, &name, 1); + if (IS_ERR(fid)) { + err = PTR_ERR(fid); + P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", + err); + fid = NULL; + goto error; + } + + inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", + err); + goto error; + } + dentry->d_op = &v9fs_cached_dentry_operations; + d_instantiate(dentry, inode); + err = v9fs_fid_add(dentry, fid); + if (err < 0) + goto error; + fid = NULL; + } else { + /* + * Not in cached mode. No need to populate inode with stat. 
+ * socket syscall returns a fd, so we need instantiate + */ + inode = v9fs_get_inode(dir->i_sb, mode); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto error; + } + dentry->d_op = &v9fs_dentry_operations; + d_instantiate(dentry, inode); + } + +error: + if (fid) + p9_client_clunk(fid); + return err; +} + static const struct inode_operations v9fs_dir_inode_operations_dotu = { .create = v9fs_vfs_create, .lookup = v9fs_vfs_lookup, @@ -1238,24 +1922,29 @@ static const struct inode_operations v9fs_dir_inode_operations_dotu = { .unlink = v9fs_vfs_unlink, .mkdir = v9fs_vfs_mkdir, .rmdir = v9fs_vfs_rmdir, - .mknod = v9fs_vfs_mknod, + .mknod = v9fs_vfs_mknod_dotl, .rename = v9fs_vfs_rename, .getattr = v9fs_vfs_getattr, .setattr = v9fs_vfs_setattr, }; static const struct inode_operations v9fs_dir_inode_operations_dotl = { - .create = v9fs_vfs_create, + .create = v9fs_vfs_create_dotl, .lookup = v9fs_vfs_lookup, - .symlink = v9fs_vfs_symlink, - .link = v9fs_vfs_link, + .link = v9fs_vfs_link_dotl, + .symlink = v9fs_vfs_symlink_dotl, .unlink = v9fs_vfs_unlink, - .mkdir = v9fs_vfs_mkdir, + .mkdir = v9fs_vfs_mkdir_dotl, .rmdir = v9fs_vfs_rmdir, - .mknod = v9fs_vfs_mknod, + .mknod = v9fs_vfs_mknod_dotl, .rename = v9fs_vfs_rename, - .getattr = v9fs_vfs_getattr, - .setattr = v9fs_vfs_setattr, + .getattr = v9fs_vfs_getattr_dotl, + .setattr = v9fs_vfs_setattr_dotl, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .removexattr = generic_removexattr, + .listxattr = v9fs_listxattr, + }; static const struct inode_operations v9fs_dir_inode_operations = { @@ -1276,8 +1965,12 @@ static const struct inode_operations v9fs_file_inode_operations = { }; static const struct inode_operations v9fs_file_inode_operations_dotl = { - .getattr = v9fs_vfs_getattr, - .setattr = v9fs_vfs_setattr, + .getattr = v9fs_vfs_getattr_dotl, + .setattr = v9fs_vfs_setattr_dotl, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .removexattr = generic_removexattr, + .listxattr = v9fs_listxattr, }; static const struct inode_operations v9fs_symlink_inode_operations = { @@ -1292,6 +1985,10 @@ static const struct inode_operations v9fs_symlink_inode_operations_dotl = { .readlink = generic_readlink, .follow_link = v9fs_vfs_follow_link, .put_link = v9fs_vfs_put_link, - .getattr = v9fs_vfs_getattr, - .setattr = v9fs_vfs_setattr, + .getattr = v9fs_vfs_getattr_dotl, + .setattr = v9fs_vfs_setattr_dotl, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .removexattr = generic_removexattr, + .listxattr = v9fs_listxattr, }; diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index be74d020436e..4b9ede0b41b7 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -45,6 +45,7 @@ #include "v9fs.h" #include "v9fs_vfs.h" #include "fid.h" +#include "xattr.h" static const struct super_operations v9fs_super_ops, v9fs_super_ops_dotl; @@ -77,9 +78,10 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses, sb->s_blocksize_bits = fls(v9ses->maxdata - 1); sb->s_blocksize = 1 << sb->s_blocksize_bits; sb->s_magic = V9FS_MAGIC; - if (v9fs_proto_dotl(v9ses)) + if (v9fs_proto_dotl(v9ses)) { sb->s_op = &v9fs_super_ops_dotl; - else + sb->s_xattr = v9fs_xattr_handlers; + } else sb->s_op = &v9fs_super_ops; sb->s_bdi = &v9ses->bdi; @@ -107,7 +109,6 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags, struct inode *inode = NULL; struct dentry *root = NULL; struct v9fs_session_info *v9ses = NULL; - struct p9_wstat *st = NULL; int mode = S_IRWXUGO | S_ISVTX; struct p9_fid *fid; int retval = 0; 
@@ -124,16 +125,10 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags, goto close_session; } - st = p9_client_stat(fid); - if (IS_ERR(st)) { - retval = PTR_ERR(st); - goto clunk_fid; - } - sb = sget(fs_type, NULL, v9fs_set_super, v9ses); if (IS_ERR(sb)) { retval = PTR_ERR(sb); - goto free_stat; + goto clunk_fid; } v9fs_fill_super(sb, v9ses, flags, data); @@ -151,22 +146,38 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags, } sb->s_root = root; - root->d_inode->i_ino = v9fs_qid2ino(&st->qid); - v9fs_stat2inode(st, root->d_inode, sb); + if (v9fs_proto_dotl(v9ses)) { + struct p9_stat_dotl *st = NULL; + st = p9_client_getattr_dotl(fid, P9_STATS_BASIC); + if (IS_ERR(st)) { + retval = PTR_ERR(st); + goto clunk_fid; + } + + v9fs_stat2inode_dotl(st, root->d_inode); + kfree(st); + } else { + struct p9_wstat *st = NULL; + st = p9_client_stat(fid); + if (IS_ERR(st)) { + retval = PTR_ERR(st); + goto clunk_fid; + } + + root->d_inode->i_ino = v9fs_qid2ino(&st->qid); + v9fs_stat2inode(st, root->d_inode, sb); + + p9stat_free(st); + kfree(st); + } v9fs_fid_add(root, fid); - p9stat_free(st); - kfree(st); P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n"); simple_set_mnt(mnt, sb); return 0; -free_stat: - p9stat_free(st); - kfree(st); - clunk_fid: p9_client_clunk(fid); @@ -176,8 +187,6 @@ close_session: return retval; release_sb: - p9stat_free(st); - kfree(st); deactivate_locked_super(sb); return retval; } @@ -278,4 +287,5 @@ struct file_system_type v9fs_fs_type = { .get_sb = v9fs_get_sb, .kill_sb = v9fs_kill_super, .owner = THIS_MODULE, + .fs_flags = FS_RENAME_DOES_D_MOVE, }; diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c new file mode 100644 index 000000000000..f88e5c2dc873 --- /dev/null +++ b/fs/9p/xattr.c @@ -0,0 +1,160 @@ +/* + * Copyright IBM Corporation, 2010 + * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + */ + +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/sched.h> +#include <net/9p/9p.h> +#include <net/9p/client.h> + +#include "fid.h" +#include "xattr.h" + +/* + * v9fs_xattr_get() + * + * Copy an extended attribute into the buffer + * provided, or compute the buffer size required. + * Buffer is NULL to compute the size of the buffer required. + * + * Returns a negative error number on failure, or the number of bytes + * used / required on success. 
+ */ +ssize_t v9fs_xattr_get(struct dentry *dentry, const char *name, + void *buffer, size_t buffer_size) +{ + ssize_t retval; + int msize, read_count; + u64 offset = 0, attr_size; + struct p9_fid *fid, *attr_fid; + + P9_DPRINTK(P9_DEBUG_VFS, "%s: name = %s value_len = %zu\n", + __func__, name, buffer_size); + + fid = v9fs_fid_lookup(dentry); + if (IS_ERR(fid)) + return PTR_ERR(fid); + + attr_fid = p9_client_xattrwalk(fid, name, &attr_size); + if (IS_ERR(attr_fid)) { + retval = PTR_ERR(attr_fid); + P9_DPRINTK(P9_DEBUG_VFS, + "p9_client_attrwalk failed %zd\n", retval); + attr_fid = NULL; + goto error; + } + if (!buffer_size) { + /* request to get the attr_size */ + retval = attr_size; + goto error; + } + if (attr_size > buffer_size) { + retval = -ERANGE; + goto error; + } + msize = attr_fid->clnt->msize; + while (attr_size) { + if (attr_size > (msize - P9_IOHDRSZ)) + read_count = msize - P9_IOHDRSZ; + else + read_count = attr_size; + read_count = p9_client_read(attr_fid, ((char *)buffer)+offset, + NULL, offset, read_count); + if (read_count < 0) { + /* error in xattr read */ + retval = read_count; + goto error; + } + offset += read_count; + attr_size -= read_count; + } + /* Total read xattr bytes */ + retval = offset; +error: + if (attr_fid) + p9_client_clunk(attr_fid); + return retval; + +} + +/* + * v9fs_xattr_set() + * + * Create, replace or remove an extended attribute for this inode. Buffer + * is NULL to remove an existing extended attribute, and non-NULL to + * either replace an existing extended attribute, or create a new extended + * attribute. The flags XATTR_REPLACE and XATTR_CREATE + * specify that an extended attribute must exist and must not exist + * previous to the call, respectively. + * + * Returns 0, or a negative error number on failure. 
+ */ +int v9fs_xattr_set(struct dentry *dentry, const char *name, + const void *value, size_t value_len, int flags) +{ + u64 offset = 0; + int retval, msize, write_count; + struct p9_fid *fid = NULL; + + P9_DPRINTK(P9_DEBUG_VFS, "%s: name = %s value_len = %zu flags = %d\n", + __func__, name, value_len, flags); + + fid = v9fs_fid_clone(dentry); + if (IS_ERR(fid)) { + retval = PTR_ERR(fid); + fid = NULL; + goto error; + } + /* + * On success fid points to xattr + */ + retval = p9_client_xattrcreate(fid, name, value_len, flags); + if (retval < 0) { + P9_DPRINTK(P9_DEBUG_VFS, + "p9_client_xattrcreate failed %d\n", retval); + goto error; + } + msize = fid->clnt->msize;; + while (value_len) { + if (value_len > (msize - P9_IOHDRSZ)) + write_count = msize - P9_IOHDRSZ; + else + write_count = value_len; + write_count = p9_client_write(fid, ((char *)value)+offset, + NULL, offset, write_count); + if (write_count < 0) { + /* error in xattr write */ + retval = write_count; + goto error; + } + offset += write_count; + value_len -= write_count; + } + /* Total read xattr bytes */ + retval = offset; +error: + if (fid) + retval = p9_client_clunk(fid); + return retval; +} + +ssize_t v9fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) +{ + return v9fs_xattr_get(dentry, NULL, buffer, buffer_size); +} + +const struct xattr_handler *v9fs_xattr_handlers[] = { + &v9fs_xattr_user_handler, + NULL +}; diff --git a/fs/9p/xattr.h b/fs/9p/xattr.h new file mode 100644 index 000000000000..9ddf672ae5c4 --- /dev/null +++ b/fs/9p/xattr.h @@ -0,0 +1,27 @@ +/* + * Copyright IBM Corporation, 2010 + * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + */ +#ifndef FS_9P_XATTR_H +#define FS_9P_XATTR_H + +#include <linux/xattr.h> + +extern const struct xattr_handler *v9fs_xattr_handlers[]; +extern struct xattr_handler v9fs_xattr_user_handler; + +extern ssize_t v9fs_xattr_get(struct dentry *, const char *, + void *, size_t); +extern int v9fs_xattr_set(struct dentry *, const char *, + const void *, size_t, int); +extern ssize_t v9fs_listxattr(struct dentry *, char *, size_t); +#endif /* FS_9P_XATTR_H */ diff --git a/fs/9p/xattr_user.c b/fs/9p/xattr_user.c new file mode 100644 index 000000000000..d0b701b72080 --- /dev/null +++ b/fs/9p/xattr_user.c @@ -0,0 +1,80 @@ +/* + * Copyright IBM Corporation, 2010 + * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+ * + */ + + +#include <linux/module.h> +#include <linux/string.h> +#include <linux/fs.h> +#include <linux/slab.h> +#include "xattr.h" + +static int v9fs_xattr_user_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + int retval; + char *full_name; + size_t name_len; + size_t prefix_len = XATTR_USER_PREFIX_LEN; + + if (name == NULL) + return -EINVAL; + + if (strcmp(name, "") == 0) + return -EINVAL; + + name_len = strlen(name); + full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL); + if (!full_name) + return -ENOMEM; + memcpy(full_name, XATTR_USER_PREFIX, prefix_len); + memcpy(full_name+prefix_len, name, name_len); + full_name[prefix_len + name_len] = '\0'; + + retval = v9fs_xattr_get(dentry, full_name, buffer, size); + kfree(full_name); + return retval; +} + +static int v9fs_xattr_user_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + int retval; + char *full_name; + size_t name_len; + size_t prefix_len = XATTR_USER_PREFIX_LEN; + + if (name == NULL) + return -EINVAL; + + if (strcmp(name, "") == 0) + return -EINVAL; + + name_len = strlen(name); + full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL); + if (!full_name) + return -ENOMEM; + memcpy(full_name, XATTR_USER_PREFIX, prefix_len); + memcpy(full_name + prefix_len, name, name_len); + full_name[prefix_len + name_len] = '\0'; + + retval = v9fs_xattr_set(dentry, full_name, value, size, flags); + kfree(full_name); + return retval; +} + +struct xattr_handler v9fs_xattr_user_handler = { + .prefix = XATTR_USER_PREFIX, + .get = v9fs_xattr_user_get, + .set = v9fs_xattr_user_set, +}; diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c index 23aa52f548a0..f4287e4de744 100644 --- a/fs/adfs/dir.c +++ b/fs/adfs/dir.c @@ -197,7 +197,7 @@ const struct file_operations adfs_dir_operations = { .read = generic_read_dir, .llseek = generic_file_llseek, .readdir = adfs_readdir, - .fsync = simple_fsync, + .fsync = generic_file_fsync, }; static int diff --git a/fs/adfs/file.c b/fs/adfs/file.c index 005ea34d1758..a36da5382b40 100644 --- a/fs/adfs/file.c +++ b/fs/adfs/file.c @@ -26,7 +26,7 @@ const struct file_operations adfs_file_operations = { .read = do_sync_read, .aio_read = generic_file_aio_read, .mmap = generic_file_mmap, - .fsync = simple_fsync, + .fsync = generic_file_fsync, .write = do_sync_write, .aio_write = generic_file_aio_write, .splice_read = generic_file_splice_read, diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c index 0f5e30978135..6f850b06ab62 100644 --- a/fs/adfs/inode.c +++ b/fs/adfs/inode.c @@ -322,8 +322,9 @@ adfs_notify_change(struct dentry *dentry, struct iattr *attr) if (error) goto out; + /* XXX: this is missing some actual on-disk truncation.. 
*/ if (ia_valid & ATTR_SIZE) - error = vmtruncate(inode, attr->ia_size); + error = simple_setsize(inode, attr->ia_size); if (error) goto out; diff --git a/fs/affs/affs.h b/fs/affs/affs.h index 861dae68ac12..f05b6155ccc8 100644 --- a/fs/affs/affs.h +++ b/fs/affs/affs.h @@ -183,7 +183,7 @@ extern int affs_add_entry(struct inode *dir, struct inode *inode, struct dent void affs_free_prealloc(struct inode *inode); extern void affs_truncate(struct inode *); -int affs_file_fsync(struct file *, struct dentry *, int); +int affs_file_fsync(struct file *, int); /* dir.c */ diff --git a/fs/affs/file.c b/fs/affs/file.c index 184e55c1c9ba..322710c3eedf 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -916,9 +916,9 @@ affs_truncate(struct inode *inode) affs_free_prealloc(inode); } -int affs_file_fsync(struct file *filp, struct dentry *dentry, int datasync) +int affs_file_fsync(struct file *filp, int datasync) { - struct inode * inode = dentry->d_inode; + struct inode *inode = filp->f_mapping->host; int ret, err; ret = write_inode_now(inode, 0); diff --git a/fs/affs/namei.c b/fs/affs/namei.c index d70bbbac6b7b..914d1c0bc07a 100644 --- a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -224,7 +224,7 @@ affs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) affs_brelse(bh); inode = affs_iget(sb, ino); if (IS_ERR(inode)) - return ERR_PTR(PTR_ERR(inode)); + return ERR_CAST(inode); } dentry->d_op = AFFS_SB(sb)->s_flags & SF_INTL ? &affs_intl_dentry_operations : &affs_dentry_operations; d_add(dentry, inode); diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 807f284cc75e..5f679b77ce24 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -740,7 +740,7 @@ extern void afs_pages_written_back(struct afs_vnode *, struct afs_call *); extern ssize_t afs_file_write(struct kiocb *, const struct iovec *, unsigned long, loff_t); extern int afs_writeback_all(struct afs_vnode *); -extern int afs_fsync(struct file *, struct dentry *, int); +extern int afs_fsync(struct file *, int); /*****************************************************************************/ diff --git a/fs/afs/server.c b/fs/afs/server.c index f49099516675..9fdc7fe3a7bc 100644 --- a/fs/afs/server.c +++ b/fs/afs/server.c @@ -91,9 +91,10 @@ static struct afs_server *afs_alloc_server(struct afs_cell *cell, memcpy(&server->addr, addr, sizeof(struct in_addr)); server->addr.s_addr = addr->s_addr; + _leave(" = %p{%d}", server, atomic_read(&server->usage)); + } else { + _leave(" = NULL [nomem]"); } - - _leave(" = %p{%d}", server, atomic_read(&server->usage)); return server; } diff --git a/fs/afs/write.c b/fs/afs/write.c index 3bed54a294d4..722743b152d8 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -680,7 +680,6 @@ int afs_writeback_all(struct afs_vnode *vnode) { struct address_space *mapping = vnode->vfs_inode.i_mapping; struct writeback_control wbc = { - .bdi = mapping->backing_dev_info, .sync_mode = WB_SYNC_ALL, .nr_to_write = LONG_MAX, .range_cyclic = 1, @@ -701,8 +700,9 @@ int afs_writeback_all(struct afs_vnode *vnode) * - the return status from this call provides a reliable indication of * whether any write errors occurred for this process. 
*/ -int afs_fsync(struct file *file, struct dentry *dentry, int datasync) +int afs_fsync(struct file *file, int datasync) { + struct dentry *dentry = file->f_path.dentry; struct afs_writeback *wb, *xwb; struct afs_vnode *vnode = AFS_FS_I(dentry->d_inode); int ret; @@ -36,6 +36,7 @@ #include <linux/blkdev.h> #include <linux/mempool.h> #include <linux/hash.h> +#include <linux/compat.h> #include <asm/kmap_types.h> #include <asm/uaccess.h> @@ -526,7 +527,7 @@ static void aio_fput_routine(struct work_struct *data) /* Complete the fput(s) */ if (req->ki_filp != NULL) - __fput(req->ki_filp); + fput(req->ki_filp); /* Link the iocb into the context's free list */ spin_lock_irq(&ctx->ctx_lock); @@ -559,11 +560,11 @@ static int __aio_put_req(struct kioctx *ctx, struct kiocb *req) /* * Try to optimize the aio and eventfd file* puts, by avoiding to - * schedule work in case it is not __fput() time. In normal cases, + * schedule work in case it is not final fput() time. In normal cases, * we would not be holding the last reference to the file*, so * this function will be executed w/out any aio kthread wakeup. */ - if (unlikely(atomic_long_dec_and_test(&req->ki_filp->f_count))) { + if (unlikely(!fput_atomic(req->ki_filp))) { get_ioctx(ctx); spin_lock(&fput_lock); list_add(&req->ki_list, &fput_head); @@ -1384,13 +1385,22 @@ static ssize_t aio_fsync(struct kiocb *iocb) return ret; } -static ssize_t aio_setup_vectored_rw(int type, struct kiocb *kiocb) +static ssize_t aio_setup_vectored_rw(int type, struct kiocb *kiocb, bool compat) { ssize_t ret; - ret = rw_copy_check_uvector(type, (struct iovec __user *)kiocb->ki_buf, - kiocb->ki_nbytes, 1, - &kiocb->ki_inline_vec, &kiocb->ki_iovec); +#ifdef CONFIG_COMPAT + if (compat) + ret = compat_rw_copy_check_uvector(type, + (struct compat_iovec __user *)kiocb->ki_buf, + kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec, + &kiocb->ki_iovec); + else +#endif + ret = rw_copy_check_uvector(type, + (struct iovec __user *)kiocb->ki_buf, + kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec, + &kiocb->ki_iovec); if (ret < 0) goto out; @@ -1420,7 +1430,7 @@ static ssize_t aio_setup_single_vector(struct kiocb *kiocb) * Performs the initial checks and aio retry method * setup for the kiocb at the time of io submission. 
*/ -static ssize_t aio_setup_iocb(struct kiocb *kiocb) +static ssize_t aio_setup_iocb(struct kiocb *kiocb, bool compat) { struct file *file = kiocb->ki_filp; ssize_t ret = 0; @@ -1469,7 +1479,7 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb) ret = security_file_permission(file, MAY_READ); if (unlikely(ret)) break; - ret = aio_setup_vectored_rw(READ, kiocb); + ret = aio_setup_vectored_rw(READ, kiocb, compat); if (ret) break; ret = -EINVAL; @@ -1483,7 +1493,7 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb) ret = security_file_permission(file, MAY_WRITE); if (unlikely(ret)) break; - ret = aio_setup_vectored_rw(WRITE, kiocb); + ret = aio_setup_vectored_rw(WRITE, kiocb, compat); if (ret) break; ret = -EINVAL; @@ -1548,7 +1558,8 @@ static void aio_batch_free(struct hlist_head *batch_hash) } static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, - struct iocb *iocb, struct hlist_head *batch_hash) + struct iocb *iocb, struct hlist_head *batch_hash, + bool compat) { struct kiocb *req; struct file *file; @@ -1609,7 +1620,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, req->ki_left = req->ki_nbytes = iocb->aio_nbytes; req->ki_opcode = iocb->aio_lio_opcode; - ret = aio_setup_iocb(req); + ret = aio_setup_iocb(req, compat); if (ret) goto out_put_req; @@ -1637,20 +1648,8 @@ out_put_req: return ret; } -/* sys_io_submit: - * Queue the nr iocbs pointed to by iocbpp for processing. Returns - * the number of iocbs queued. May return -EINVAL if the aio_context - * specified by ctx_id is invalid, if nr is < 0, if the iocb at - * *iocbpp[0] is not properly initialized, if the operation specified - * is invalid for the file descriptor in the iocb. May fail with - * -EFAULT if any of the data structures point to invalid data. May - * fail with -EBADF if the file descriptor specified in the first - * iocb is invalid. May fail with -EAGAIN if insufficient resources - * are available to queue any iocbs. Will return 0 if nr is 0. Will - * fail with -ENOSYS if not implemented. - */ -SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr, - struct iocb __user * __user *, iocbpp) +long do_io_submit(aio_context_t ctx_id, long nr, + struct iocb __user *__user *iocbpp, bool compat) { struct kioctx *ctx; long ret = 0; @@ -1687,7 +1686,7 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr, break; } - ret = io_submit_one(ctx, user_iocb, &tmp, batch_hash); + ret = io_submit_one(ctx, user_iocb, &tmp, batch_hash, compat); if (ret) break; } @@ -1697,6 +1696,24 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr, return i ? i : ret; } +/* sys_io_submit: + * Queue the nr iocbs pointed to by iocbpp for processing. Returns + * the number of iocbs queued. May return -EINVAL if the aio_context + * specified by ctx_id is invalid, if nr is < 0, if the iocb at + * *iocbpp[0] is not properly initialized, if the operation specified + * is invalid for the file descriptor in the iocb. May fail with + * -EFAULT if any of the data structures point to invalid data. May + * fail with -EBADF if the file descriptor specified in the first + * iocb is invalid. May fail with -EAGAIN if insufficient resources + * are available to queue any iocbs. Will return 0 if nr is 0. Will + * fail with -ENOSYS if not implemented. + */ +SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr, + struct iocb __user * __user *, iocbpp) +{ + return do_io_submit(ctx_id, nr, iocbpp, 0); +} + /* lookup_kiocb * Finds a given iocb for cancellation. 
*/ diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index 9bd4b3876c99..e4b75d6eda83 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -205,7 +205,7 @@ static struct inode *anon_inode_mkinode(void) * that it already _is_ on the dirty list. */ inode->i_state = I_DIRTY; - inode->i_mode = S_IFREG | S_IRUSR | S_IWUSR; + inode->i_mode = S_IRUSR | S_IWUSR; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); inode->i_flags |= S_PRIVATE; diff --git a/fs/attr.c b/fs/attr.c index 0815e93bb487..b4fa3b0aa596 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -67,14 +67,14 @@ EXPORT_SYMBOL(inode_change_ok); * @offset: the new size to assign to the inode * @Returns: 0 on success, -ve errno on failure * + * inode_newsize_ok must be called with i_mutex held. + * * inode_newsize_ok will check filesystem limits and ulimits to check that the * new inode size is within limits. inode_newsize_ok will also send SIGXFSZ * when necessary. Caller must not proceed with inode size change if failure is * returned. @inode must be a file (not directory), with appropriate * permissions to allow truncate (inode_newsize_ok does NOT check these * conditions). - * - * inode_newsize_ok must be called with i_mutex held. */ int inode_newsize_ok(const struct inode *inode, loff_t offset) { @@ -104,17 +104,25 @@ out_big: } EXPORT_SYMBOL(inode_newsize_ok); -int inode_setattr(struct inode * inode, struct iattr * attr) +/** + * generic_setattr - copy simple metadata updates into the generic inode + * @inode: the inode to be updated + * @attr: the new attributes + * + * generic_setattr must be called with i_mutex held. + * + * generic_setattr updates the inode's metadata with that specified + * in attr. Noticably missing is inode size update, which is more complex + * as it requires pagecache updates. See simple_setsize. + * + * The inode is not marked as dirty after this operation. The rationale is + * that for "simple" filesystems, the struct inode is the inode storage. + * The caller is free to mark the inode dirty afterwards if needed. + */ +void generic_setattr(struct inode *inode, const struct iattr *attr) { unsigned int ia_valid = attr->ia_valid; - if (ia_valid & ATTR_SIZE && - attr->ia_size != i_size_read(inode)) { - int error = vmtruncate(inode, attr->ia_size); - if (error) - return error; - } - if (ia_valid & ATTR_UID) inode->i_uid = attr->ia_uid; if (ia_valid & ATTR_GID) @@ -135,6 +143,28 @@ int inode_setattr(struct inode * inode, struct iattr * attr) mode &= ~S_ISGID; inode->i_mode = mode; } +} +EXPORT_SYMBOL(generic_setattr); + +/* + * note this function is deprecated, the new truncate sequence should be + * used instead -- see eg. simple_setsize, generic_setattr. 
+ */ +int inode_setattr(struct inode *inode, const struct iattr *attr) +{ + unsigned int ia_valid = attr->ia_valid; + + if (ia_valid & ATTR_SIZE && + attr->ia_size != i_size_read(inode)) { + int error; + + error = vmtruncate(inode, attr->ia_size); + if (error) + return error; + } + + generic_setattr(inode, attr); + mark_inode_dirty(inode); return 0; diff --git a/fs/autofs/root.c b/fs/autofs/root.c index 8713c7cfbc79..9a0520b50663 100644 --- a/fs/autofs/root.c +++ b/fs/autofs/root.c @@ -28,6 +28,7 @@ static int autofs_root_mkdir(struct inode *,struct dentry *,int); static int autofs_root_ioctl(struct inode *, struct file *,unsigned int,unsigned long); const struct file_operations autofs_root_operations = { + .llseek = generic_file_llseek, .read = generic_read_dir, .readdir = autofs_root_readdir, .ioctl = autofs_root_ioctl, diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index d832062869f6..ba4a38b9c22f 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -95,7 +95,7 @@ static int check_dev_ioctl_version(int cmd, struct autofs_dev_ioctl *param) */ static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *in) { - struct autofs_dev_ioctl tmp, *ads; + struct autofs_dev_ioctl tmp; if (copy_from_user(&tmp, in, sizeof(tmp))) return ERR_PTR(-EFAULT); @@ -103,16 +103,7 @@ static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *i if (tmp.size < sizeof(tmp)) return ERR_PTR(-EINVAL); - ads = kmalloc(tmp.size, GFP_KERNEL); - if (!ads) - return ERR_PTR(-ENOMEM); - - if (copy_from_user(ads, in, tmp.size)) { - kfree(ads); - return ERR_PTR(-EFAULT); - } - - return ads; + return memdup_user(in, tmp.size); } static inline void free_dev_ioctl(struct autofs_dev_ioctl *param) diff --git a/fs/bad_inode.c b/fs/bad_inode.c index a05287a23f62..52e59bf4aa5f 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -93,8 +93,7 @@ static int bad_file_release(struct inode *inode, struct file *filp) return -EIO; } -static int bad_file_fsync(struct file *file, struct dentry *dentry, - int datasync) +static int bad_file_fsync(struct file *file, int datasync) { return -EIO; } diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c index 8f73841fc974..d967e052b779 100644 --- a/fs/bfs/dir.c +++ b/fs/bfs/dir.c @@ -78,7 +78,7 @@ static int bfs_readdir(struct file *f, void *dirent, filldir_t filldir) const struct file_operations bfs_dir_operations = { .read = generic_read_dir, .readdir = bfs_readdir, - .fsync = simple_fsync, + .fsync = generic_file_fsync, .llseek = generic_file_llseek, }; diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 2c5f9a0e5d72..63039ed9576f 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -990,10 +990,9 @@ static int elf_fdpic_map_file_constdisp_on_uclinux( /* clear any space allocated but not loaded */ if (phdr->p_filesz < phdr->p_memsz) { - ret = clear_user((void *) (seg->addr + phdr->p_filesz), - phdr->p_memsz - phdr->p_filesz); - if (ret) - return ret; + if (clear_user((void *) (seg->addr + phdr->p_filesz), + phdr->p_memsz - phdr->p_filesz)) + return -EFAULT; } if (mm) { @@ -1027,7 +1026,7 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, struct elf32_fdpic_loadseg *seg; struct elf32_phdr *phdr; unsigned long load_addr, delta_vaddr; - int loop, dvset, ret; + int loop, dvset; load_addr = params->load_addr; delta_vaddr = 0; @@ -1127,9 +1126,8 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, * PT_LOAD */ if (prot & PROT_WRITE && disp > 0) { kdebug("clear[%d] 
ad=%lx sz=%lx", loop, maddr, disp); - ret = clear_user((void __user *) maddr, disp); - if (ret) - return ret; + if (clear_user((void __user *) maddr, disp)) + return -EFAULT; maddr += disp; } @@ -1164,19 +1162,17 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, if (prot & PROT_WRITE && excess1 > 0) { kdebug("clear[%d] ad=%lx sz=%lx", loop, maddr + phdr->p_filesz, excess1); - ret = clear_user((void __user *) maddr + phdr->p_filesz, - excess1); - if (ret) - return ret; + if (clear_user((void __user *) maddr + phdr->p_filesz, + excess1)) + return -EFAULT; } #else if (excess > 0) { kdebug("clear[%d] ad=%lx sz=%lx", loop, maddr + phdr->p_filesz, excess); - ret = clear_user((void *) maddr + phdr->p_filesz, excess); - if (ret) - return ret; + if (clear_user((void *) maddr + phdr->p_filesz, excess)) + return -EFAULT; } #endif diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index 49566c1687d8..811384bec8de 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -56,16 +56,19 @@ #endif /* - * User data (stack, data section and bss) needs to be aligned - * for the same reasons as SLAB memory is, and to the same amount. - * Avoid duplicating architecture specific code by using the same - * macro as with SLAB allocation: + * User data (data section and bss) needs to be aligned. + * We pick 0x20 here because it is the max value elf2flt has always + * used in producing FLAT files, and because it seems to be large + * enough to make all the gcc alignment related tests happy. */ -#ifdef ARCH_SLAB_MINALIGN -#define FLAT_DATA_ALIGN (ARCH_SLAB_MINALIGN) -#else -#define FLAT_DATA_ALIGN (sizeof(void *)) -#endif +#define FLAT_DATA_ALIGN (0x20) + +/* + * User data (stack) also needs to be aligned. + * Here we can be a bit looser than the data sections since this + * needs to only meet arch ABI requirements. + */ +#define FLAT_STACK_ALIGN max_t(unsigned long, sizeof(void *), ARCH_SLAB_MINALIGN) #define RELOC_FAILED 0xff00ff01 /* Relocation incorrect somewhere */ #define UNLOADED_LIB 0x7ff000ff /* Placeholder for unused library */ @@ -129,7 +132,7 @@ static unsigned long create_flat_tables( sp = (unsigned long *)p; sp -= (envc + argc + 2) + 1 + (flat_argvp_envp_on_stack() ? 2 : 0); - sp = (unsigned long *) ((unsigned long)sp & -FLAT_DATA_ALIGN); + sp = (unsigned long *) ((unsigned long)sp & -FLAT_STACK_ALIGN); argv = sp + 1 + (flat_argvp_envp_on_stack() ? 
2 : 0); envp = argv + (argc + 1); @@ -589,7 +592,7 @@ static int load_flat_file(struct linux_binprm * bprm, if (IS_ERR_VALUE(result)) { printk("Unable to read data+bss, errno %d\n", (int)-result); do_munmap(current->mm, textpos, text_len); - do_munmap(current->mm, realdatastart, data_len + extra); + do_munmap(current->mm, realdatastart, len); ret = result; goto err; } @@ -876,7 +879,7 @@ static int load_flat_binary(struct linux_binprm * bprm, struct pt_regs * regs) stack_len = TOP_OF_ARGS - bprm->p; /* the strings */ stack_len += (bprm->argc + 1) * sizeof(char *); /* the argv array */ stack_len += (bprm->envc + 1) * sizeof(char *); /* the envp array */ - stack_len += FLAT_DATA_ALIGN - 1; /* reserve for upcoming alignment */ + stack_len += FLAT_STACK_ALIGN - 1; /* reserve for upcoming alignment */ res = load_flat_file(bprm, &libinfo, 0, &stack_len); if (IS_ERR_VALUE(res)) diff --git a/fs/block_dev.c b/fs/block_dev.c index 26e5f5026620..b3171fb0dc9a 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -172,8 +172,9 @@ blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; - return blockdev_direct_IO_no_locking(rw, iocb, inode, I_BDEV(inode), - iov, offset, nr_segs, blkdev_get_blocks, NULL); + return blockdev_direct_IO_no_locking_newtrunc(rw, iocb, inode, + I_BDEV(inode), iov, offset, nr_segs, + blkdev_get_blocks, NULL); } int __sync_blockdev(struct block_device *bdev, int wait) @@ -309,8 +310,8 @@ static int blkdev_write_begin(struct file *file, struct address_space *mapping, struct page **pagep, void **fsdata) { *pagep = NULL; - return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, - blkdev_get_block); + return block_write_begin_newtrunc(file, mapping, pos, len, flags, + pagep, fsdata, blkdev_get_block); } static int blkdev_write_end(struct file *file, struct address_space *mapping, @@ -358,12 +359,7 @@ static loff_t block_llseek(struct file *file, loff_t offset, int origin) return retval; } -/* - * Filp is never NULL; the only case when ->fsync() is called with - * NULL first argument is nfsd_sync_dir() and that's not a directory. - */ - -int blkdev_fsync(struct file *filp, struct dentry *dentry, int datasync) +int blkdev_fsync(struct file *filp, int datasync) { struct inode *bd_inode = filp->f_mapping->host; struct block_device *bdev = I_BDEV(bd_inode); @@ -685,8 +681,8 @@ retry: if (!bd_may_claim(bdev, whole, holder)) return -EBUSY; - /* if someone else is claiming, wait for it to finish */ - if (whole->bd_claiming && whole->bd_claiming != holder) { + /* if claiming is already in progress, wait for it to finish */ + if (whole->bd_claiming) { wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0); DEFINE_WAIT(wait); @@ -710,8 +706,13 @@ retry: * @bdev is about to be opened exclusively. Check @bdev can be opened * exclusively and mark that an exclusive open is in progress. Each * successful call to this function must be matched with a call to - * either bd_claim() or bd_abort_claiming(). If this function - * succeeds, the matching bd_claim() is guaranteed to succeed. + * either bd_finish_claiming() or bd_abort_claiming() (which do not + * fail). + * + * This function is used to gain exclusive access to the block device + * without actually causing other exclusive open attempts to fail. It + * should be used when the open sequence itself requires exclusive + * access but may subsequently fail. * * CONTEXT: * Might sleep. 
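As the rewritten bd_prepare_to_claim() comment above notes, every successful claim reservation must be resolved by either bd_finish_claiming() or bd_abort_claiming(). The hunks that follow (bd_start_claiming(), __bd_claim(), bd_finish_claiming(), and the blkdev_open()/open_bdev_exclusive() call sites) implement that protocol piece by piece; the sketch below only condenses the sequence into one place. It is illustrative: example_exclusive_open() is not part of the patch, error paths and mode checks are trimmed, and the helpers used are static to fs/block_dev.c.

	/* Illustrative sketch, as if written inside fs/block_dev.c; it mirrors
	 * the sequence blkdev_open() and open_bdev_exclusive() follow after
	 * this patch. */
	static int example_exclusive_open(struct block_device *bdev, fmode_t mode,
					  void *holder)
	{
		struct block_device *whole;
		int err;

		/* reserve the claim: concurrent exclusive opens now wait on
		 * bd_claiming instead of failing outright */
		whole = bd_start_claiming(bdev, holder);
		if (IS_ERR(whole))
			return PTR_ERR(whole);

		err = blkdev_get(bdev, mode);
		if (err == 0)
			bd_finish_claiming(bdev, whole, holder); /* cannot fail */
		else
			bd_abort_claiming(whole, holder);  /* drop the reservation */

		return err;
	}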
@@ -738,6 +739,7 @@ static struct block_device *bd_start_claiming(struct block_device *bdev, return ERR_PTR(-ENXIO); whole = bdget_disk(disk, 0); + module_put(disk->fops->owner); put_disk(disk); if (!whole) return ERR_PTR(-ENOMEM); @@ -786,15 +788,46 @@ static void bd_abort_claiming(struct block_device *whole, void *holder) __bd_abort_claiming(whole, holder); /* releases bdev_lock */ } +/* increment holders when we have a legitimate claim. requires bdev_lock */ +static void __bd_claim(struct block_device *bdev, struct block_device *whole, + void *holder) +{ + /* note that for a whole device bd_holders + * will be incremented twice, and bd_holder will + * be set to bd_claim before being set to holder + */ + whole->bd_holders++; + whole->bd_holder = bd_claim; + bdev->bd_holders++; + bdev->bd_holder = holder; +} + +/** + * bd_finish_claiming - finish claiming a block device + * @bdev: block device of interest (passed to bd_start_claiming()) + * @whole: whole block device returned by bd_start_claiming() + * @holder: holder trying to claim @bdev + * + * Finish a claiming block started by bd_start_claiming(). + * + * CONTEXT: + * Grabs and releases bdev_lock. + */ +static void bd_finish_claiming(struct block_device *bdev, + struct block_device *whole, void *holder) +{ + spin_lock(&bdev_lock); + BUG_ON(!bd_may_claim(bdev, whole, holder)); + __bd_claim(bdev, whole, holder); + __bd_abort_claiming(whole, holder); /* not actually an abort */ +} + /** * bd_claim - claim a block device * @bdev: block device to claim * @holder: holder trying to claim @bdev * - * Try to claim @bdev which must have been opened successfully. This - * function may be called with or without preceding - * blk_start_claiming(). In the former case, this function is always - * successful and terminates the claiming block. + * Try to claim @bdev which must have been opened successfully. * * CONTEXT: * Might sleep. 
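A second change that runs through this whole section is the new ->fsync() prototype: afs_fsync(), bad_file_fsync() and blkdev_fsync() above, and btrfs_sync_file() further down, all lose the dentry argument. The sketch below is a hypothetical implementation (example_fsync() is not in the patch) showing the pattern the converted functions use: anything previously reached through the dentry is taken from the struct file, and a simple filesystem can delegate to generic_file_fsync(), which this diff also switches bfs to.

	/* Hypothetical ->fsync() with the new two-argument prototype. */
	static int example_fsync(struct file *file, int datasync)
	{
		/* the inode is reached through the file now, e.g.:
		 *	struct inode *inode = file->f_mapping->host;
		 * filesystem-specific flushing would be keyed off it here;
		 * a simple filesystem can just use the generic helper. */
		return generic_file_fsync(file, datasync);
	}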
@@ -810,23 +843,10 @@ int bd_claim(struct block_device *bdev, void *holder) might_sleep(); spin_lock(&bdev_lock); - res = bd_prepare_to_claim(bdev, whole, holder); - if (res == 0) { - /* note that for a whole device bd_holders - * will be incremented twice, and bd_holder will - * be set to bd_claim before being set to holder - */ - whole->bd_holders++; - whole->bd_holder = bd_claim; - bdev->bd_holders++; - bdev->bd_holder = holder; - } - - if (whole->bd_claiming) - __bd_abort_claiming(whole, holder); /* releases bdev_lock */ - else - spin_unlock(&bdev_lock); + if (res == 0) + __bd_claim(bdev, whole, holder); + spin_unlock(&bdev_lock); return res; } @@ -1480,7 +1500,7 @@ static int blkdev_open(struct inode * inode, struct file * filp) if (whole) { if (res == 0) - BUG_ON(bd_claim(bdev, filp) != 0); + bd_finish_claiming(bdev, whole, filp); else bd_abort_claiming(whole, filp); } @@ -1716,7 +1736,7 @@ struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *h if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) goto out_blkdev_put; - BUG_ON(bd_claim(bdev, holder) != 0); + bd_finish_claiming(bdev, whole, holder); return bdev; out_blkdev_put: diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 8d432cd9d580..2222d161c7b6 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -60,6 +60,8 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type) size = __btrfs_getxattr(inode, name, value, size); if (size > 0) { acl = posix_acl_from_xattr(value, size); + if (IS_ERR(acl)) + return acl; set_cached_acl(inode, type, acl); } kfree(value); @@ -160,6 +162,12 @@ static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name, int ret; struct posix_acl *acl = NULL; + if (!is_owner_or_cap(dentry->d_inode)) + return -EPERM; + + if (!IS_POSIXACL(dentry->d_inode)) + return -EOPNOTSUPP; + if (value) { acl = posix_acl_from_xattr(value, size); if (acl == NULL) { diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 462859a30141..7ec14097fef1 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -377,6 +377,7 @@ again: if (!list_empty(&worker->pending) || !list_empty(&worker->prio_pending)) { spin_unlock_irq(&worker->lock); + set_current_state(TASK_RUNNING); goto again; } diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 7a4dee199832..6ad63f17eca0 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -137,8 +137,8 @@ struct btrfs_inode { * of extent items we've reserved metadata for. */ spinlock_t accounting_lock; + atomic_t outstanding_extents; int reserved_extents; - int outstanding_extents; /* * ordered_data_close is set by truncate when a file that used @@ -151,6 +151,7 @@ struct btrfs_inode { * of these. 
*/ unsigned ordered_data_close:1; + unsigned orphan_meta_reserved:1; unsigned dummy_inode:1; /* diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 6795a713b205..c3df14ce2cc2 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -280,7 +280,8 @@ int btrfs_block_can_be_shared(struct btrfs_root *root, static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, - struct extent_buffer *cow) + struct extent_buffer *cow, + int *last_ref) { u64 refs; u64 owner; @@ -366,6 +367,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, BUG_ON(ret); } clean_tree_block(trans, root, buf); + *last_ref = 1; } return 0; } @@ -392,6 +394,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_disk_key disk_key; struct extent_buffer *cow; int level; + int last_ref = 0; int unlock_orig = 0; u64 parent_start; @@ -442,7 +445,10 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, (unsigned long)btrfs_header_fsid(cow), BTRFS_FSID_SIZE); - update_ref_for_cow(trans, root, buf, cow); + update_ref_for_cow(trans, root, buf, cow, &last_ref); + + if (root->ref_cows) + btrfs_reloc_cow_block(trans, root, buf, cow); if (buf == root->node) { WARN_ON(parent && parent != buf); @@ -457,8 +463,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, extent_buffer_get(cow); spin_unlock(&root->node_lock); - btrfs_free_tree_block(trans, root, buf->start, buf->len, - parent_start, root->root_key.objectid, level); + btrfs_free_tree_block(trans, root, buf, parent_start, + last_ref); free_extent_buffer(buf); add_root_to_dirty_list(root); } else { @@ -473,8 +479,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, btrfs_set_node_ptr_generation(parent, parent_slot, trans->transid); btrfs_mark_buffer_dirty(parent); - btrfs_free_tree_block(trans, root, buf->start, buf->len, - parent_start, root->root_key.objectid, level); + btrfs_free_tree_block(trans, root, buf, parent_start, + last_ref); } if (unlock_orig) btrfs_tree_unlock(buf); @@ -949,6 +955,22 @@ int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, return bin_search(eb, key, level, slot); } +static void root_add_used(struct btrfs_root *root, u32 size) +{ + spin_lock(&root->accounting_lock); + btrfs_set_root_used(&root->root_item, + btrfs_root_used(&root->root_item) + size); + spin_unlock(&root->accounting_lock); +} + +static void root_sub_used(struct btrfs_root *root, u32 size) +{ + spin_lock(&root->accounting_lock); + btrfs_set_root_used(&root->root_item, + btrfs_root_used(&root->root_item) - size); + spin_unlock(&root->accounting_lock); +} + /* given a node and slot number, this reads the blocks it points to. The * extent buffer is returned with a reference taken (but unlocked). * NULL is returned on error. 
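The root_add_used()/root_sub_used() helpers introduced above are paired with tree block allocation and freeing throughout the later ctree.c hunks (insert_new_root(), split_node(), split_leaf(), balance_level(), btrfs_del_leaf()). The sketch below, using hypothetical example_* wrappers that are not part of the patch, gathers that invariant in one place: every block a root gains or loses is reflected in its root_item used counter under accounting_lock.

	/* Hypothetical wrappers (not in the patch) showing how the new
	 * accounting helpers pair with allocation and freeing. */
	static struct extent_buffer *example_alloc_node(struct btrfs_trans_handle *trans,
							struct btrfs_root *root,
							struct btrfs_disk_key *key,
							int level, u64 hint)
	{
		struct extent_buffer *eb;

		eb = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
					    root->root_key.objectid, key,
					    level, hint, 0);
		if (IS_ERR(eb))
			return eb;

		/* a successful allocation is charged to the root ... */
		root_add_used(root, root->nodesize);
		return eb;
	}

	static void example_free_node(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct extent_buffer *eb)
	{
		/* ... and freeing gives the space back before the block is
		 * dropped, mirroring balance_level() and btrfs_del_leaf() */
		root_sub_used(root, eb->len);
		btrfs_free_tree_block(trans, root, eb, 0, 1);
		free_extent_buffer(eb);
	}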
@@ -1019,7 +1041,11 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, btrfs_tree_lock(child); btrfs_set_lock_blocking(child); ret = btrfs_cow_block(trans, root, child, mid, 0, &child); - BUG_ON(ret); + if (ret) { + btrfs_tree_unlock(child); + free_extent_buffer(child); + goto enospc; + } spin_lock(&root->node_lock); root->node = child; @@ -1034,11 +1060,12 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, btrfs_tree_unlock(mid); /* once for the path */ free_extent_buffer(mid); - ret = btrfs_free_tree_block(trans, root, mid->start, mid->len, - 0, root->root_key.objectid, level); + + root_sub_used(root, mid->len); + btrfs_free_tree_block(trans, root, mid, 0, 1); /* once for the root ptr */ free_extent_buffer(mid); - return ret; + return 0; } if (btrfs_header_nritems(mid) > BTRFS_NODEPTRS_PER_BLOCK(root) / 4) @@ -1088,23 +1115,16 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (wret < 0 && wret != -ENOSPC) ret = wret; if (btrfs_header_nritems(right) == 0) { - u64 bytenr = right->start; - u32 blocksize = right->len; - clean_tree_block(trans, root, right); btrfs_tree_unlock(right); - free_extent_buffer(right); - right = NULL; wret = del_ptr(trans, root, path, level + 1, pslot + 1); if (wret) ret = wret; - wret = btrfs_free_tree_block(trans, root, - bytenr, blocksize, 0, - root->root_key.objectid, - level); - if (wret) - ret = wret; + root_sub_used(root, right->len); + btrfs_free_tree_block(trans, root, right, 0, 1); + free_extent_buffer(right); + right = NULL; } else { struct btrfs_disk_key right_key; btrfs_node_key(right, &right_key, 0); @@ -1136,21 +1156,15 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, BUG_ON(wret == 1); } if (btrfs_header_nritems(mid) == 0) { - /* we've managed to empty the middle node, drop it */ - u64 bytenr = mid->start; - u32 blocksize = mid->len; - clean_tree_block(trans, root, mid); btrfs_tree_unlock(mid); - free_extent_buffer(mid); - mid = NULL; wret = del_ptr(trans, root, path, level + 1, pslot); if (wret) ret = wret; - wret = btrfs_free_tree_block(trans, root, bytenr, blocksize, - 0, root->root_key.objectid, level); - if (wret) - ret = wret; + root_sub_used(root, mid->len); + btrfs_free_tree_block(trans, root, mid, 0, 1); + free_extent_buffer(mid); + mid = NULL; } else { /* update the parent key to reflect our changes */ struct btrfs_disk_key mid_key; @@ -1590,7 +1604,7 @@ read_block_for_search(struct btrfs_trans_handle *trans, btrfs_release_path(NULL, p); ret = -EAGAIN; - tmp = read_tree_block(root, blocknr, blocksize, gen); + tmp = read_tree_block(root, blocknr, blocksize, 0); if (tmp) { /* * If the read above didn't mark this buffer up to date, @@ -1740,7 +1754,6 @@ again: p->nodes[level + 1], p->slots[level + 1], &b); if (err) { - free_extent_buffer(b); ret = err; goto done; } @@ -2076,6 +2089,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, if (IS_ERR(c)) return PTR_ERR(c); + root_add_used(root, root->nodesize); + memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header)); btrfs_set_header_nritems(c, 1); btrfs_set_header_level(c, level); @@ -2134,6 +2149,7 @@ static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root int nritems; BUG_ON(!path->nodes[level]); + btrfs_assert_tree_locked(path->nodes[level]); lower = path->nodes[level]; nritems = btrfs_header_nritems(lower); BUG_ON(slot > nritems); @@ -2202,6 +2218,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans, if (IS_ERR(split)) return PTR_ERR(split); + 
root_add_used(root, root->nodesize); + memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header)); btrfs_set_header_level(split, btrfs_header_level(c)); btrfs_set_header_bytenr(split, split->start); @@ -2286,12 +2304,17 @@ noinline int btrfs_leaf_free_space(struct btrfs_root *root, return ret; } +/* + * min slot controls the lowest index we're willing to push to the + * right. We'll push up to and including min_slot, but no lower + */ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int data_size, int empty, struct extent_buffer *right, - int free_space, u32 left_nritems) + int free_space, u32 left_nritems, + u32 min_slot) { struct extent_buffer *left = path->nodes[0]; struct extent_buffer *upper = path->nodes[1]; @@ -2309,7 +2332,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, if (empty) nr = 0; else - nr = 1; + nr = max_t(u32, 1, min_slot); if (path->slots[0] >= left_nritems) push_space += data_size; @@ -2415,6 +2438,9 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, if (left_nritems) btrfs_mark_buffer_dirty(left); + else + clean_tree_block(trans, root, left); + btrfs_mark_buffer_dirty(right); btrfs_item_key(right, &disk_key, 0); @@ -2448,10 +2474,14 @@ out_unlock: * * returns 1 if the push failed because the other node didn't have enough * room, 0 if everything worked out and < 0 if there were major errors. + * + * this will push starting from min_slot to the end of the leaf. It won't + * push any slot lower than min_slot */ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_path *path, int data_size, - int empty) + *root, struct btrfs_path *path, + int min_data_size, int data_size, + int empty, u32 min_slot) { struct extent_buffer *left = path->nodes[0]; struct extent_buffer *right; @@ -2493,8 +2523,8 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root if (left_nritems == 0) goto out_unlock; - return __push_leaf_right(trans, root, path, data_size, empty, - right, free_space, left_nritems); + return __push_leaf_right(trans, root, path, min_data_size, empty, + right, free_space, left_nritems, min_slot); out_unlock: btrfs_tree_unlock(right); free_extent_buffer(right); @@ -2504,12 +2534,17 @@ out_unlock: /* * push some data in the path leaf to the left, trying to free up at * least data_size bytes. returns zero if the push worked, nonzero otherwise + * + * max_slot can put a limit on how far into the leaf we'll push items. The + * item at 'max_slot' won't be touched. 
Use (u32)-1 to make us do all the + * items */ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int data_size, int empty, struct extent_buffer *left, - int free_space, int right_nritems) + int free_space, u32 right_nritems, + u32 max_slot) { struct btrfs_disk_key disk_key; struct extent_buffer *right = path->nodes[0]; @@ -2528,9 +2563,9 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, slot = path->slots[1]; if (empty) - nr = right_nritems; + nr = min(right_nritems, max_slot); else - nr = right_nritems - 1; + nr = min(right_nritems - 1, max_slot); for (i = 0; i < nr; i++) { item = btrfs_item_nr(right, i); @@ -2660,6 +2695,8 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(left); if (right_nritems) btrfs_mark_buffer_dirty(right); + else + clean_tree_block(trans, root, right); btrfs_item_key(right, &disk_key, 0); wret = fixup_low_keys(trans, root, path, &disk_key, 1); @@ -2669,8 +2706,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, /* then fixup the leaf pointer in the path */ if (path->slots[0] < push_items) { path->slots[0] += old_left_nritems; - if (btrfs_header_nritems(path->nodes[0]) == 0) - clean_tree_block(trans, root, path->nodes[0]); btrfs_tree_unlock(path->nodes[0]); free_extent_buffer(path->nodes[0]); path->nodes[0] = left; @@ -2691,10 +2726,14 @@ out: /* * push some data in the path leaf to the left, trying to free up at * least data_size bytes. returns zero if the push worked, nonzero otherwise + * + * max_slot can put a limit on how far into the leaf we'll push items. The + * item at 'max_slot' won't be touched. Use (u32)-1 to make us push all the + * items */ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_path *path, int data_size, - int empty) + *root, struct btrfs_path *path, int min_data_size, + int data_size, int empty, u32 max_slot) { struct extent_buffer *right = path->nodes[0]; struct extent_buffer *left; @@ -2740,8 +2779,9 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root goto out; } - return __push_leaf_left(trans, root, path, data_size, - empty, left, free_space, right_nritems); + return __push_leaf_left(trans, root, path, min_data_size, + empty, left, free_space, right_nritems, + max_slot); out: btrfs_tree_unlock(left); free_extent_buffer(left); @@ -2834,6 +2874,64 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans, } /* + * double splits happen when we need to insert a big item in the middle + * of a leaf. A double split can leave us with 3 mostly empty leaves: + * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ] + * A B C + * + * We avoid this by trying to push the items on either side of our target + * into the adjacent leaves. If all goes well we can avoid the double split + * completely. + */ +static noinline int push_for_double_split(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + int data_size) +{ + int ret; + int progress = 0; + int slot; + u32 nritems; + + slot = path->slots[0]; + + /* + * try to push all the items after our slot into the + * right leaf + */ + ret = push_leaf_right(trans, root, path, 1, data_size, 0, slot); + if (ret < 0) + return ret; + + if (ret == 0) + progress++; + + nritems = btrfs_header_nritems(path->nodes[0]); + /* + * our goal is to get our slot at the start or end of a leaf. 
If + * we've done so we're done + */ + if (path->slots[0] == 0 || path->slots[0] == nritems) + return 0; + + if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size) + return 0; + + /* try to push all the items before our slot into the next leaf */ + slot = path->slots[0]; + ret = push_leaf_left(trans, root, path, 1, data_size, 0, slot); + if (ret < 0) + return ret; + + if (ret == 0) + progress++; + + if (progress) + return 0; + return 1; +} + +/* * split the path's leaf in two, making sure there is at least data_size * available for the resulting leaf level of the path. * @@ -2855,6 +2953,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, int wret; int split; int num_doubles = 0; + int tried_avoid_double = 0; l = path->nodes[0]; slot = path->slots[0]; @@ -2863,12 +2962,14 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, return -EOVERFLOW; /* first try to make some room by pushing left and right */ - if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) { - wret = push_leaf_right(trans, root, path, data_size, 0); + if (data_size) { + wret = push_leaf_right(trans, root, path, data_size, + data_size, 0, 0); if (wret < 0) return wret; if (wret) { - wret = push_leaf_left(trans, root, path, data_size, 0); + wret = push_leaf_left(trans, root, path, data_size, + data_size, 0, (u32)-1); if (wret < 0) return wret; } @@ -2902,6 +3003,8 @@ again: if (mid != nritems && leaf_space_used(l, mid, nritems - mid) + data_size > BTRFS_LEAF_DATA_SIZE(root)) { + if (data_size && !tried_avoid_double) + goto push_for_double; split = 2; } } @@ -2918,6 +3021,8 @@ again: if (mid != nritems && leaf_space_used(l, mid, nritems - mid) + data_size > BTRFS_LEAF_DATA_SIZE(root)) { + if (data_size && !tried_avoid_double) + goto push_for_double; split = 2 ; } } @@ -2932,10 +3037,10 @@ again: right = btrfs_alloc_free_block(trans, root, root->leafsize, 0, root->root_key.objectid, &disk_key, 0, l->start, 0); - if (IS_ERR(right)) { - BUG_ON(1); + if (IS_ERR(right)) return PTR_ERR(right); - } + + root_add_used(root, root->leafsize); memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header)); btrfs_set_header_bytenr(right, right->start); @@ -2998,6 +3103,13 @@ again: } return ret; + +push_for_double: + push_for_double_split(trans, root, path, data_size); + tried_avoid_double = 1; + if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size) + return 0; + goto again; } static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, @@ -3054,7 +3166,8 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, btrfs_set_path_blocking(path); ret = split_leaf(trans, root, &key, path, ins_len, 1); - BUG_ON(ret); + if (ret) + goto err; path->keep_locks = 0; btrfs_unlock_up_safe(path, 1); @@ -3796,9 +3909,10 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, */ btrfs_unlock_up_safe(path, 0); - ret = btrfs_free_tree_block(trans, root, leaf->start, leaf->len, - 0, root->root_key.objectid, 0); - return ret; + root_sub_used(root, leaf->len); + + btrfs_free_tree_block(trans, root, leaf, 0, 1); + return 0; } /* * delete the item at the leaf level in path. 
If that empties @@ -3865,6 +3979,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (leaf == root->node) { btrfs_set_header_level(leaf, 0); } else { + btrfs_set_path_blocking(path); + clean_tree_block(trans, root, leaf); ret = btrfs_del_leaf(trans, root, path, leaf); BUG_ON(ret); } @@ -3890,13 +4006,15 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, extent_buffer_get(leaf); btrfs_set_path_blocking(path); - wret = push_leaf_left(trans, root, path, 1, 1); + wret = push_leaf_left(trans, root, path, 1, 1, + 1, (u32)-1); if (wret < 0 && wret != -ENOSPC) ret = wret; if (path->nodes[0] == leaf && btrfs_header_nritems(leaf)) { - wret = push_leaf_right(trans, root, path, 1, 1); + wret = push_leaf_right(trans, root, path, 1, + 1, 1, 0); if (wret < 0 && wret != -ENOSPC) ret = wret; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 746a7248678e..29c20092847e 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -34,6 +34,7 @@ struct btrfs_trans_handle; struct btrfs_transaction; +struct btrfs_pending_snapshot; extern struct kmem_cache *btrfs_trans_handle_cachep; extern struct kmem_cache *btrfs_transaction_cachep; extern struct kmem_cache *btrfs_bit_radix_cachep; @@ -663,6 +664,7 @@ struct btrfs_csum_item { #define BTRFS_BLOCK_GROUP_RAID1 (1 << 4) #define BTRFS_BLOCK_GROUP_DUP (1 << 5) #define BTRFS_BLOCK_GROUP_RAID10 (1 << 6) +#define BTRFS_NR_RAID_TYPES 5 struct btrfs_block_group_item { __le64 used; @@ -674,42 +676,46 @@ struct btrfs_space_info { u64 flags; u64 total_bytes; /* total bytes in the space */ - u64 bytes_used; /* total bytes used on disk */ + u64 bytes_used; /* total bytes used, + this does't take mirrors into account */ u64 bytes_pinned; /* total bytes pinned, will be freed when the transaction finishes */ u64 bytes_reserved; /* total bytes the allocator has reserved for current allocations */ u64 bytes_readonly; /* total bytes that are read only */ - u64 bytes_super; /* total bytes reserved for the super blocks */ - u64 bytes_root; /* the number of bytes needed to commit a - transaction */ + u64 bytes_may_use; /* number of bytes that may be used for delalloc/allocations */ - u64 bytes_delalloc; /* number of bytes currently reserved for - delayed allocation */ + u64 disk_used; /* total bytes used on disk */ int full; /* indicates that we cannot allocate any more chunks for this space */ int force_alloc; /* set if we need to force a chunk alloc for this space */ - int force_delalloc; /* make people start doing filemap_flush until - we're under a threshold */ struct list_head list; - /* for controlling how we free up space for allocations */ - wait_queue_head_t allocate_wait; - wait_queue_head_t flush_wait; - int allocating_chunk; - int flushing; - /* for block groups in our same type */ - struct list_head block_groups; + struct list_head block_groups[BTRFS_NR_RAID_TYPES]; spinlock_t lock; struct rw_semaphore groups_sem; atomic_t caching_threads; }; +struct btrfs_block_rsv { + u64 size; + u64 reserved; + u64 freed[2]; + struct btrfs_space_info *space_info; + struct list_head list; + spinlock_t lock; + atomic_t usage; + unsigned int priority:8; + unsigned int durable:1; + unsigned int refill_used:1; + unsigned int full:1; +}; + /* * free clusters are used to claim free space in relatively large chunks, * allowing us to do less seeky writes. 
They are used for all metadata @@ -760,6 +766,7 @@ struct btrfs_block_group_cache { spinlock_t lock; u64 pinned; u64 reserved; + u64 reserved_pinned; u64 bytes_super; u64 flags; u64 sectorsize; @@ -825,6 +832,22 @@ struct btrfs_fs_info { /* logical->physical extent mapping */ struct btrfs_mapping_tree mapping_tree; + /* block reservation for extent, checksum and root tree */ + struct btrfs_block_rsv global_block_rsv; + /* block reservation for delay allocation */ + struct btrfs_block_rsv delalloc_block_rsv; + /* block reservation for metadata operations */ + struct btrfs_block_rsv trans_block_rsv; + /* block reservation for chunk tree */ + struct btrfs_block_rsv chunk_block_rsv; + + struct btrfs_block_rsv empty_block_rsv; + + /* list of block reservations that cross multiple transactions */ + struct list_head durable_block_rsv_list; + + struct mutex durable_block_rsv_mutex; + u64 generation; u64 last_trans_committed; @@ -927,7 +950,6 @@ struct btrfs_fs_info { struct btrfs_workers endio_meta_write_workers; struct btrfs_workers endio_write_workers; struct btrfs_workers submit_workers; - struct btrfs_workers enospc_workers; /* * fixup workers take dirty pages that didn't properly go through * the cow mechanism and make them safe to write. It happens @@ -943,6 +965,7 @@ struct btrfs_fs_info { int do_barriers; int closing; int log_root_recovering; + int enospc_unlink; u64 total_pinned; @@ -1012,6 +1035,9 @@ struct btrfs_root { struct completion kobj_unregister; struct mutex objectid_mutex; + spinlock_t accounting_lock; + struct btrfs_block_rsv *block_rsv; + struct mutex log_mutex; wait_queue_head_t log_writer_wait; wait_queue_head_t log_commit_wait[2]; @@ -1043,7 +1069,6 @@ struct btrfs_root { int ref_cows; int track_dirty; int in_radix; - int clean_orphans; u64 defrag_trans_start; struct btrfs_key defrag_progress; @@ -1057,8 +1082,11 @@ struct btrfs_root { struct list_head root_list; - spinlock_t list_lock; + spinlock_t orphan_lock; struct list_head orphan_list; + struct btrfs_block_rsv *orphan_block_rsv; + int orphan_item_inserted; + int orphan_cleanup_state; spinlock_t inode_lock; /* red-black tree that keeps track of in-memory inodes */ @@ -1965,6 +1993,9 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache); int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_root *root, unsigned long count); int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len); +int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u64 *refs, u64 *flags); int btrfs_pin_extent(struct btrfs_root *root, u64 bytenr, u64 num, int reserved); int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, @@ -1984,10 +2015,10 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, u64 parent, u64 root_objectid, struct btrfs_disk_key *key, int level, u64 hint, u64 empty_size); -int btrfs_free_tree_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytenr, u32 blocksize, - u64 parent, u64 root_objectid, int level); +void btrfs_free_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, + u64 parent, int last_ref); struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u32 blocksize, @@ -2041,27 +2072,49 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 size); int btrfs_remove_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 
group_start); -int btrfs_prepare_block_group_relocation(struct btrfs_root *root, - struct btrfs_block_group_cache *group); - u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde); void btrfs_clear_space_info_full(struct btrfs_fs_info *info); - -int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items); -int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items); -int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root, - struct inode *inode, int num_items); -int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root, - struct inode *inode, int num_items); -int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, - u64 bytes); -void btrfs_free_reserved_data_space(struct btrfs_root *root, - struct inode *inode, u64 bytes); -void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode, - u64 bytes); -void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode, - u64 bytes); +int btrfs_check_data_free_space(struct inode *inode, u64 bytes); +void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes); +int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + int num_items, int *retries); +void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, + struct inode *inode); +void btrfs_orphan_release_metadata(struct inode *inode); +int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans, + struct btrfs_pending_snapshot *pending); +int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes); +void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes); +int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes); +void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes); +void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv); +struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root); +void btrfs_free_block_rsv(struct btrfs_root *root, + struct btrfs_block_rsv *rsv); +void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *rsv); +int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes, int *retries); +int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 min_reserved, int min_factor); +int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, + struct btrfs_block_rsv *dst_rsv, + u64 num_bytes); +void btrfs_block_rsv_release(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes); +int btrfs_set_block_group_ro(struct btrfs_root *root, + struct btrfs_block_group_cache *cache); +int btrfs_set_block_group_rw(struct btrfs_root *root, + struct btrfs_block_group_cache *cache); /* ctree.c */ int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, int level, int *slot); @@ -2152,7 +2205,8 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf); -int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref); +int 
btrfs_drop_snapshot(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, int update_ref); int btrfs_drop_subtree(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *node, @@ -2245,6 +2299,12 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, int name_len, u64 inode_objectid, u64 ref_objectid, u64 *index); +struct btrfs_inode_ref * +btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, int mod); int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 objectid); @@ -2257,6 +2317,8 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 len); int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, struct bio *bio, u32 *dst); +int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode, + struct bio *bio, u64 logical_offset, u32 *dst); int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 objectid, u64 pos, @@ -2311,6 +2373,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, u32 min_type); int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput); +int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput); int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, struct extent_state **cached_state); int btrfs_writepages(struct address_space *mapping, @@ -2349,10 +2412,20 @@ int btrfs_update_inode(struct btrfs_trans_handle *trans, int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode); int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode); void btrfs_orphan_cleanup(struct btrfs_root *root); +void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans, + struct btrfs_pending_snapshot *pending, + u64 *bytes_to_reserve); +void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans, + struct btrfs_pending_snapshot *pending); +void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root); int btrfs_cont_expand(struct inode *inode, loff_t size); int btrfs_invalidate_inodes(struct btrfs_root *root); void btrfs_add_delayed_iput(struct inode *inode); void btrfs_run_delayed_iputs(struct btrfs_root *root); +int btrfs_prealloc_file_range(struct inode *inode, int mode, + u64 start, u64 num_bytes, u64 min_size, + loff_t actual_len, u64 *alloc_hint); extern const struct dentry_operations btrfs_dentry_operations; /* ioctl.c */ @@ -2361,7 +2434,7 @@ void btrfs_update_iflags(struct inode *inode); void btrfs_inherit_iflags(struct inode *inode, struct inode *dir); /* file.c */ -int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync); +int btrfs_sync_file(struct file *file, int datasync); int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, int skip_pinned); int btrfs_check_file(struct btrfs_root *root, struct inode *inode); @@ -2409,4 +2482,12 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_recover_relocation(struct btrfs_root *root); int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len); +void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *buf, + struct extent_buffer *cow); +void btrfs_reloc_pre_snapshot(struct 
btrfs_trans_handle *trans, + struct btrfs_pending_snapshot *pending, + u64 *bytes_to_reserve); +void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, + struct btrfs_pending_snapshot *pending); #endif diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 902ce507c4e3..e807b143b857 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -319,107 +319,6 @@ out: } /* - * helper function to lookup reference count and flags of extent. - * - * the head node for delayed ref is used to store the sum of all the - * reference count modifications queued up in the rbtree. the head - * node may also store the extent flags to set. This way you can check - * to see what the reference count and extent flags would be if all of - * the delayed refs are not processed. - */ -int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 bytenr, - u64 num_bytes, u64 *refs, u64 *flags) -{ - struct btrfs_delayed_ref_node *ref; - struct btrfs_delayed_ref_head *head; - struct btrfs_delayed_ref_root *delayed_refs; - struct btrfs_path *path; - struct btrfs_extent_item *ei; - struct extent_buffer *leaf; - struct btrfs_key key; - u32 item_size; - u64 num_refs; - u64 extent_flags; - int ret; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - key.objectid = bytenr; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = num_bytes; - delayed_refs = &trans->transaction->delayed_refs; -again: - ret = btrfs_search_slot(trans, root->fs_info->extent_root, - &key, path, 0, 0); - if (ret < 0) - goto out; - - if (ret == 0) { - leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); - if (item_size >= sizeof(*ei)) { - ei = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_item); - num_refs = btrfs_extent_refs(leaf, ei); - extent_flags = btrfs_extent_flags(leaf, ei); - } else { -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - struct btrfs_extent_item_v0 *ei0; - BUG_ON(item_size != sizeof(*ei0)); - ei0 = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_item_v0); - num_refs = btrfs_extent_refs_v0(leaf, ei0); - /* FIXME: this isn't correct for data */ - extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF; -#else - BUG(); -#endif - } - BUG_ON(num_refs == 0); - } else { - num_refs = 0; - extent_flags = 0; - ret = 0; - } - - spin_lock(&delayed_refs->lock); - ref = find_ref_head(&delayed_refs->root, bytenr, NULL); - if (ref) { - head = btrfs_delayed_node_to_head(ref); - if (!mutex_trylock(&head->mutex)) { - atomic_inc(&ref->refs); - spin_unlock(&delayed_refs->lock); - - btrfs_release_path(root->fs_info->extent_root, path); - - mutex_lock(&head->mutex); - mutex_unlock(&head->mutex); - btrfs_put_delayed_ref(ref); - goto again; - } - if (head->extent_op && head->extent_op->update_flags) - extent_flags |= head->extent_op->flags_to_set; - else - BUG_ON(num_refs == 0); - - num_refs += ref->ref_mod; - mutex_unlock(&head->mutex); - } - WARN_ON(num_refs == 0); - if (refs) - *refs = num_refs; - if (flags) - *flags = extent_flags; -out: - spin_unlock(&delayed_refs->lock); - btrfs_free_path(path); - return ret; -} - -/* * helper function to update an extent delayed ref in the * rbtree. 
existing and update must both have the same * bytenr and parent diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index f6fc67ddad36..50e3cf92fbda 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -167,9 +167,6 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head * btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr); int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr); -int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 bytenr, - u64 num_bytes, u64 *refs, u64 *flags); int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 orig_parent, u64 parent, u64 orig_ref_root, u64 ref_root, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index feca04197d02..34f7c375567e 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -74,6 +74,11 @@ struct async_submit_bio { int rw; int mirror_num; unsigned long bio_flags; + /* + * bio_offset is optional, can be used if the pages in the bio + * can't tell us where in the file the bio should go + */ + u64 bio_offset; struct btrfs_work work; }; @@ -534,7 +539,8 @@ static void run_one_async_start(struct btrfs_work *work) async = container_of(work, struct async_submit_bio, work); fs_info = BTRFS_I(async->inode)->root->fs_info; async->submit_bio_start(async->inode, async->rw, async->bio, - async->mirror_num, async->bio_flags); + async->mirror_num, async->bio_flags, + async->bio_offset); } static void run_one_async_done(struct btrfs_work *work) @@ -556,7 +562,8 @@ static void run_one_async_done(struct btrfs_work *work) wake_up(&fs_info->async_submit_wait); async->submit_bio_done(async->inode, async->rw, async->bio, - async->mirror_num, async->bio_flags); + async->mirror_num, async->bio_flags, + async->bio_offset); } static void run_one_async_free(struct btrfs_work *work) @@ -570,6 +577,7 @@ static void run_one_async_free(struct btrfs_work *work) int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, int rw, struct bio *bio, int mirror_num, unsigned long bio_flags, + u64 bio_offset, extent_submit_bio_hook_t *submit_bio_start, extent_submit_bio_hook_t *submit_bio_done) { @@ -592,6 +600,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, async->work.flags = 0; async->bio_flags = bio_flags; + async->bio_offset = bio_offset; atomic_inc(&fs_info->nr_async_submits); @@ -627,7 +636,8 @@ static int btree_csum_one_bio(struct bio *bio) static int __btree_submit_bio_start(struct inode *inode, int rw, struct bio *bio, int mirror_num, - unsigned long bio_flags) + unsigned long bio_flags, + u64 bio_offset) { /* * when we're called for a write, we're already in the async @@ -638,7 +648,8 @@ static int __btree_submit_bio_start(struct inode *inode, int rw, } static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio, - int mirror_num, unsigned long bio_flags) + int mirror_num, unsigned long bio_flags, + u64 bio_offset) { /* * when we're called for a write, we're already in the async @@ -648,7 +659,8 @@ static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio, } static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, - int mirror_num, unsigned long bio_flags) + int mirror_num, unsigned long bio_flags, + u64 bio_offset) { int ret; @@ -671,6 +683,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, */ return 
btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, inode, rw, bio, mirror_num, 0, + bio_offset, __btree_submit_bio_start, __btree_submit_bio_done); } @@ -894,7 +907,8 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, root->ref_cows = 0; root->track_dirty = 0; root->in_radix = 0; - root->clean_orphans = 0; + root->orphan_item_inserted = 0; + root->orphan_cleanup_state = 0; root->fs_info = fs_info; root->objectid = objectid; @@ -903,13 +917,16 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, root->name = NULL; root->in_sysfs = 0; root->inode_tree = RB_ROOT; + root->block_rsv = NULL; + root->orphan_block_rsv = NULL; INIT_LIST_HEAD(&root->dirty_list); INIT_LIST_HEAD(&root->orphan_list); INIT_LIST_HEAD(&root->root_list); spin_lock_init(&root->node_lock); - spin_lock_init(&root->list_lock); + spin_lock_init(&root->orphan_lock); spin_lock_init(&root->inode_lock); + spin_lock_init(&root->accounting_lock); mutex_init(&root->objectid_mutex); mutex_init(&root->log_mutex); init_waitqueue_head(&root->log_writer_wait); @@ -968,42 +985,6 @@ static int find_and_setup_root(struct btrfs_root *tree_root, return 0; } -int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) -{ - struct extent_buffer *eb; - struct btrfs_root *log_root_tree = fs_info->log_root_tree; - u64 start = 0; - u64 end = 0; - int ret; - - if (!log_root_tree) - return 0; - - while (1) { - ret = find_first_extent_bit(&log_root_tree->dirty_log_pages, - 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW); - if (ret) - break; - - clear_extent_bits(&log_root_tree->dirty_log_pages, start, end, - EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS); - } - eb = fs_info->log_root_tree->node; - - WARN_ON(btrfs_header_level(eb) != 0); - WARN_ON(btrfs_header_nritems(eb) != 0); - - ret = btrfs_free_reserved_extent(fs_info->tree_root, - eb->start, eb->len); - BUG_ON(ret); - - free_extent_buffer(eb); - kfree(fs_info->log_root_tree); - fs_info->log_root_tree = NULL; - return 0; -} - static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info) { @@ -1191,19 +1172,23 @@ again: if (root) return root; - ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid); - if (ret == 0) - ret = -ENOENT; - if (ret < 0) - return ERR_PTR(ret); - root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location); if (IS_ERR(root)) return root; - WARN_ON(btrfs_root_refs(&root->root_item) == 0); set_anon_super(&root->anon_super, NULL); + if (btrfs_root_refs(&root->root_item) == 0) { + ret = -ENOENT; + goto fail; + } + + ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid); + if (ret < 0) + goto fail; + if (ret == 0) + root->orphan_item_inserted = 1; + ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); if (ret) goto fail; @@ -1212,10 +1197,9 @@ again: ret = radix_tree_insert(&fs_info->fs_roots_radix, (unsigned long)root->root_key.objectid, root); - if (ret == 0) { + if (ret == 0) root->in_radix = 1; - root->clean_orphans = 1; - } + spin_unlock(&fs_info->fs_roots_radix_lock); radix_tree_preload_end(); if (ret) { @@ -1461,10 +1445,6 @@ static int cleaner_kthread(void *arg) struct btrfs_root *root = arg; do { - smp_mb(); - if (root->fs_info->closing) - break; - vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); if (!(root->fs_info->sb->s_flags & MS_RDONLY) && @@ -1477,11 +1457,9 @@ static int cleaner_kthread(void *arg) if (freezing(current)) { refrigerator(); } else { - smp_mb(); - if (root->fs_info->closing) - break; 
set_current_state(TASK_INTERRUPTIBLE); - schedule(); + if (!kthread_should_stop()) + schedule(); __set_current_state(TASK_RUNNING); } } while (!kthread_should_stop()); @@ -1493,36 +1471,40 @@ static int transaction_kthread(void *arg) struct btrfs_root *root = arg; struct btrfs_trans_handle *trans; struct btrfs_transaction *cur; + u64 transid; unsigned long now; unsigned long delay; int ret; do { - smp_mb(); - if (root->fs_info->closing) - break; - delay = HZ * 30; vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); mutex_lock(&root->fs_info->transaction_kthread_mutex); - mutex_lock(&root->fs_info->trans_mutex); + spin_lock(&root->fs_info->new_trans_lock); cur = root->fs_info->running_transaction; if (!cur) { - mutex_unlock(&root->fs_info->trans_mutex); + spin_unlock(&root->fs_info->new_trans_lock); goto sleep; } now = get_seconds(); - if (now < cur->start_time || now - cur->start_time < 30) { - mutex_unlock(&root->fs_info->trans_mutex); + if (!cur->blocked && + (now < cur->start_time || now - cur->start_time < 30)) { + spin_unlock(&root->fs_info->new_trans_lock); delay = HZ * 5; goto sleep; } - mutex_unlock(&root->fs_info->trans_mutex); - trans = btrfs_start_transaction(root, 1); - ret = btrfs_commit_transaction(trans, root); + transid = cur->transid; + spin_unlock(&root->fs_info->new_trans_lock); + trans = btrfs_join_transaction(root, 1); + if (transid == trans->transid) { + ret = btrfs_commit_transaction(trans, root); + BUG_ON(ret); + } else { + btrfs_end_transaction(trans, root); + } sleep: wake_up_process(root->fs_info->cleaner_kthread); mutex_unlock(&root->fs_info->transaction_kthread_mutex); @@ -1530,10 +1512,10 @@ sleep: if (freezing(current)) { refrigerator(); } else { - if (root->fs_info->closing) - break; set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(delay); + if (!kthread_should_stop() && + !btrfs_transaction_blocked(root->fs_info)) + schedule_timeout(delay); __set_current_state(TASK_RUNNING); } } while (!kthread_should_stop()); @@ -1620,6 +1602,13 @@ struct btrfs_root *open_ctree(struct super_block *sb, INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); INIT_LIST_HEAD(&fs_info->space_info); btrfs_mapping_init(&fs_info->mapping_tree); + btrfs_init_block_rsv(&fs_info->global_block_rsv); + btrfs_init_block_rsv(&fs_info->delalloc_block_rsv); + btrfs_init_block_rsv(&fs_info->trans_block_rsv); + btrfs_init_block_rsv(&fs_info->chunk_block_rsv); + btrfs_init_block_rsv(&fs_info->empty_block_rsv); + INIT_LIST_HEAD(&fs_info->durable_block_rsv_list); + mutex_init(&fs_info->durable_block_rsv_mutex); atomic_set(&fs_info->nr_async_submits, 0); atomic_set(&fs_info->async_delalloc_pages, 0); atomic_set(&fs_info->async_submit_draining, 0); @@ -1759,9 +1748,6 @@ struct btrfs_root *open_ctree(struct super_block *sb, min_t(u64, fs_devices->num_devices, fs_info->thread_pool_size), &fs_info->generic_worker); - btrfs_init_workers(&fs_info->enospc_workers, "enospc", - fs_info->thread_pool_size, - &fs_info->generic_worker); /* a higher idle thresh on the submit workers makes it much more * likely that bios will be send down in a sane order to the @@ -1809,7 +1795,6 @@ struct btrfs_root *open_ctree(struct super_block *sb, btrfs_start_workers(&fs_info->endio_meta_workers, 1); btrfs_start_workers(&fs_info->endio_meta_write_workers, 1); btrfs_start_workers(&fs_info->endio_write_workers, 1); - btrfs_start_workers(&fs_info->enospc_workers, 1); fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, @@ -1912,17 +1897,18 @@ struct btrfs_root 
*open_ctree(struct super_block *sb, csum_root->track_dirty = 1; + fs_info->generation = generation; + fs_info->last_trans_committed = generation; + fs_info->data_alloc_profile = (u64)-1; + fs_info->metadata_alloc_profile = (u64)-1; + fs_info->system_alloc_profile = fs_info->metadata_alloc_profile; + ret = btrfs_read_block_groups(extent_root); if (ret) { printk(KERN_ERR "Failed to read block groups: %d\n", ret); goto fail_block_groups; } - fs_info->generation = generation; - fs_info->last_trans_committed = generation; - fs_info->data_alloc_profile = (u64)-1; - fs_info->metadata_alloc_profile = (u64)-1; - fs_info->system_alloc_profile = fs_info->metadata_alloc_profile; fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root, "btrfs-cleaner"); if (IS_ERR(fs_info->cleaner_kthread)) @@ -1955,8 +1941,11 @@ struct btrfs_root *open_ctree(struct super_block *sb, btrfs_level_size(tree_root, btrfs_super_log_root_level(disk_super)); - log_tree_root = kzalloc(sizeof(struct btrfs_root), - GFP_NOFS); + log_tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS); + if (!log_tree_root) { + err = -ENOMEM; + goto fail_trans_kthread; + } __setup_root(nodesize, leafsize, sectorsize, stripesize, log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID); @@ -1977,6 +1966,9 @@ struct btrfs_root *open_ctree(struct super_block *sb, BUG_ON(ret); if (!(sb->s_flags & MS_RDONLY)) { + ret = btrfs_cleanup_fs_roots(fs_info); + BUG_ON(ret); + ret = btrfs_recover_relocation(tree_root); if (ret < 0) { printk(KERN_WARNING @@ -1993,6 +1985,10 @@ struct btrfs_root *open_ctree(struct super_block *sb, fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location); if (!fs_info->fs_root) goto fail_trans_kthread; + if (IS_ERR(fs_info->fs_root)) { + err = PTR_ERR(fs_info->fs_root); + goto fail_trans_kthread; + } if (!(sb->s_flags & MS_RDONLY)) { down_read(&fs_info->cleanup_work_sem); @@ -2040,7 +2036,6 @@ fail_sb_buffer: btrfs_stop_workers(&fs_info->endio_meta_write_workers); btrfs_stop_workers(&fs_info->endio_write_workers); btrfs_stop_workers(&fs_info->submit_workers); - btrfs_stop_workers(&fs_info->enospc_workers); fail_iput: invalidate_inode_pages2(fs_info->btree_inode->i_mapping); iput(fs_info->btree_inode); @@ -2405,11 +2400,11 @@ int btrfs_commit_super(struct btrfs_root *root) down_write(&root->fs_info->cleanup_work_sem); up_write(&root->fs_info->cleanup_work_sem); - trans = btrfs_start_transaction(root, 1); + trans = btrfs_join_transaction(root, 1); ret = btrfs_commit_transaction(trans, root); BUG_ON(ret); /* run commit again to drop the original snapshot */ - trans = btrfs_start_transaction(root, 1); + trans = btrfs_join_transaction(root, 1); btrfs_commit_transaction(trans, root); ret = btrfs_write_and_wait_transaction(NULL, root); BUG_ON(ret); @@ -2426,15 +2421,15 @@ int close_ctree(struct btrfs_root *root) fs_info->closing = 1; smp_mb(); - kthread_stop(root->fs_info->transaction_kthread); - kthread_stop(root->fs_info->cleaner_kthread); - if (!(fs_info->sb->s_flags & MS_RDONLY)) { ret = btrfs_commit_super(root); if (ret) printk(KERN_ERR "btrfs: commit super ret %d\n", ret); } + kthread_stop(root->fs_info->transaction_kthread); + kthread_stop(root->fs_info->cleaner_kthread); + fs_info->closing = 2; smp_mb(); @@ -2473,7 +2468,6 @@ int close_ctree(struct btrfs_root *root) btrfs_stop_workers(&fs_info->endio_meta_write_workers); btrfs_stop_workers(&fs_info->endio_write_workers); btrfs_stop_workers(&fs_info->submit_workers); - btrfs_stop_workers(&fs_info->enospc_workers); btrfs_close_devices(fs_info->fs_devices); 
btrfs_mapping_tree_free(&fs_info->mapping_tree); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index c958ecbc1916..88e825a0bf21 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -87,7 +87,7 @@ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, int metadata); int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, int rw, struct bio *bio, int mirror_num, - unsigned long bio_flags, + unsigned long bio_flags, u64 bio_offset, extent_submit_bio_hook_t *submit_bio_start, extent_submit_bio_hook_t *submit_bio_done); @@ -95,8 +95,6 @@ int btrfs_congested_async(struct btrfs_fs_info *info, int iodone); unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info); int btrfs_write_tree_block(struct extent_buffer *buf); int btrfs_wait_tree_block_writeback(struct extent_buffer *buf); -int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info); int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_add_log_tree(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index c6a4f459ad76..32d094002a57 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -35,10 +35,9 @@ static int update_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 bytenr, u64 num_bytes, int alloc, - int mark_free); -static int update_reserved_extents(struct btrfs_block_group_cache *cache, - u64 num_bytes, int reserve); + u64 bytenr, u64 num_bytes, int alloc); +static int update_reserved_bytes(struct btrfs_block_group_cache *cache, + u64 num_bytes, int reserve, int sinfo); static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, @@ -61,12 +60,6 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, static int do_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, u64 alloc_bytes, u64 flags, int force); -static int pin_down_bytes(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - u64 bytenr, u64 num_bytes, - int is_data, int reserved, - struct extent_buffer **must_clean); static int find_next_key(struct btrfs_path *path, int level, struct btrfs_key *key); static void dump_space_info(struct btrfs_space_info *info, u64 bytes, @@ -91,8 +84,12 @@ void btrfs_get_block_group(struct btrfs_block_group_cache *cache) void btrfs_put_block_group(struct btrfs_block_group_cache *cache) { - if (atomic_dec_and_test(&cache->count)) + if (atomic_dec_and_test(&cache->count)) { + WARN_ON(cache->pinned > 0); + WARN_ON(cache->reserved > 0); + WARN_ON(cache->reserved_pinned > 0); kfree(cache); + } } /* @@ -319,7 +316,7 @@ static int caching_kthread(void *data) exclude_super_stripes(extent_root, block_group); spin_lock(&block_group->space_info->lock); - block_group->space_info->bytes_super += block_group->bytes_super; + block_group->space_info->bytes_readonly += block_group->bytes_super; spin_unlock(&block_group->space_info->lock); last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); @@ -507,6 +504,9 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info, struct list_head *head = &info->space_info; struct btrfs_space_info *found; + flags &= BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_SYSTEM | + BTRFS_BLOCK_GROUP_METADATA; + rcu_read_lock(); list_for_each_entry_rcu(found, head, list) { if (found->flags == flags) { @@ -610,6 +610,113 @@ int 
btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len) } /* + * helper function to lookup reference count and flags of extent. + * + * the head node for delayed ref is used to store the sum of all the + * reference count modifications queued up in the rbtree. the head + * node may also store the extent flags to set. This way you can check + * to see what the reference count and extent flags would be if all of + * the delayed refs are not processed. + */ +int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u64 *refs, u64 *flags) +{ + struct btrfs_delayed_ref_head *head; + struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_path *path; + struct btrfs_extent_item *ei; + struct extent_buffer *leaf; + struct btrfs_key key; + u32 item_size; + u64 num_refs; + u64 extent_flags; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = num_bytes; + if (!trans) { + path->skip_locking = 1; + path->search_commit_root = 1; + } +again: + ret = btrfs_search_slot(trans, root->fs_info->extent_root, + &key, path, 0, 0); + if (ret < 0) + goto out_free; + + if (ret == 0) { + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + if (item_size >= sizeof(*ei)) { + ei = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_item); + num_refs = btrfs_extent_refs(leaf, ei); + extent_flags = btrfs_extent_flags(leaf, ei); + } else { +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + struct btrfs_extent_item_v0 *ei0; + BUG_ON(item_size != sizeof(*ei0)); + ei0 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_item_v0); + num_refs = btrfs_extent_refs_v0(leaf, ei0); + /* FIXME: this isn't correct for data */ + extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF; +#else + BUG(); +#endif + } + BUG_ON(num_refs == 0); + } else { + num_refs = 0; + extent_flags = 0; + ret = 0; + } + + if (!trans) + goto out; + + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + head = btrfs_find_delayed_ref_head(trans, bytenr); + if (head) { + if (!mutex_trylock(&head->mutex)) { + atomic_inc(&head->node.refs); + spin_unlock(&delayed_refs->lock); + + btrfs_release_path(root->fs_info->extent_root, path); + + mutex_lock(&head->mutex); + mutex_unlock(&head->mutex); + btrfs_put_delayed_ref(&head->node); + goto again; + } + if (head->extent_op && head->extent_op->update_flags) + extent_flags |= head->extent_op->flags_to_set; + else + BUG_ON(num_refs == 0); + + num_refs += head->node.ref_mod; + mutex_unlock(&head->mutex); + } + spin_unlock(&delayed_refs->lock); +out: + WARN_ON(num_refs == 0); + if (refs) + *refs = num_refs; + if (flags) + *flags = extent_flags; +out_free: + btrfs_free_path(path); + return ret; +} + +/* * Back reference rules. 
Back refs have three main goals: * * 1) differentiate between all holders of references to an extent so that @@ -1871,7 +1978,6 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, return ret; } - /* helper function to actually process a single delayed ref entry */ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -1891,32 +1997,14 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, BUG_ON(extent_op); head = btrfs_delayed_node_to_head(node); if (insert_reserved) { - int mark_free = 0; - struct extent_buffer *must_clean = NULL; - - ret = pin_down_bytes(trans, root, NULL, - node->bytenr, node->num_bytes, - head->is_data, 1, &must_clean); - if (ret > 0) - mark_free = 1; - - if (must_clean) { - clean_tree_block(NULL, root, must_clean); - btrfs_tree_unlock(must_clean); - free_extent_buffer(must_clean); - } + btrfs_pin_extent(root, node->bytenr, + node->num_bytes, 1); if (head->is_data) { ret = btrfs_del_csums(trans, root, node->bytenr, node->num_bytes); BUG_ON(ret); } - if (mark_free) { - ret = btrfs_free_reserved_extent(root, - node->bytenr, - node->num_bytes); - BUG_ON(ret); - } } mutex_unlock(&head->mutex); return 0; @@ -2347,6 +2435,8 @@ int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, ret = 0; out: btrfs_free_path(path); + if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) + WARN_ON(ret > 0); return ret; } @@ -2660,12 +2750,21 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, struct btrfs_space_info **space_info) { struct btrfs_space_info *found; + int i; + int factor; + + if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10)) + factor = 2; + else + factor = 1; found = __find_space_info(info, flags); if (found) { spin_lock(&found->lock); found->total_bytes += total_bytes; found->bytes_used += bytes_used; + found->disk_used += bytes_used * factor; found->full = 0; spin_unlock(&found->lock); *space_info = found; @@ -2675,18 +2774,20 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, if (!found) return -ENOMEM; - INIT_LIST_HEAD(&found->block_groups); + for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) + INIT_LIST_HEAD(&found->block_groups[i]); init_rwsem(&found->groups_sem); - init_waitqueue_head(&found->flush_wait); - init_waitqueue_head(&found->allocate_wait); spin_lock_init(&found->lock); - found->flags = flags; + found->flags = flags & (BTRFS_BLOCK_GROUP_DATA | + BTRFS_BLOCK_GROUP_SYSTEM | + BTRFS_BLOCK_GROUP_METADATA); found->total_bytes = total_bytes; found->bytes_used = bytes_used; + found->disk_used = bytes_used * factor; found->bytes_pinned = 0; found->bytes_reserved = 0; found->bytes_readonly = 0; - found->bytes_delalloc = 0; + found->bytes_may_use = 0; found->full = 0; found->force_alloc = 0; *space_info = found; @@ -2711,19 +2812,6 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) } } -static void set_block_group_readonly(struct btrfs_block_group_cache *cache) -{ - spin_lock(&cache->space_info->lock); - spin_lock(&cache->lock); - if (!cache->ro) { - cache->space_info->bytes_readonly += cache->key.offset - - btrfs_block_group_used(&cache->item); - cache->ro = 1; - } - spin_unlock(&cache->lock); - spin_unlock(&cache->space_info->lock); -} - u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) { u64 num_devices = root->fs_info->fs_devices->rw_devices; @@ -2752,491 +2840,50 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) return flags; } -static u64 
btrfs_get_alloc_profile(struct btrfs_root *root, u64 data) -{ - struct btrfs_fs_info *info = root->fs_info; - u64 alloc_profile; - - if (data) { - alloc_profile = info->avail_data_alloc_bits & - info->data_alloc_profile; - data = BTRFS_BLOCK_GROUP_DATA | alloc_profile; - } else if (root == root->fs_info->chunk_root) { - alloc_profile = info->avail_system_alloc_bits & - info->system_alloc_profile; - data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile; - } else { - alloc_profile = info->avail_metadata_alloc_bits & - info->metadata_alloc_profile; - data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile; - } - - return btrfs_reduce_alloc_profile(root, data); -} - -void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode) -{ - u64 alloc_target; - - alloc_target = btrfs_get_alloc_profile(root, 1); - BTRFS_I(inode)->space_info = __find_space_info(root->fs_info, - alloc_target); -} - -static u64 calculate_bytes_needed(struct btrfs_root *root, int num_items) -{ - u64 num_bytes; - int level; - - level = BTRFS_MAX_LEVEL - 2; - /* - * NOTE: these calculations are absolutely the worst possible case. - * This assumes that _every_ item we insert will require a new leaf, and - * that the tree has grown to its maximum level size. - */ - - /* - * for every item we insert we could insert both an extent item and a - * extent ref item. Then for ever item we insert, we will need to cow - * both the original leaf, plus the leaf to the left and right of it. - * - * Unless we are talking about the extent root, then we just want the - * number of items * 2, since we just need the extent item plus its ref. - */ - if (root == root->fs_info->extent_root) - num_bytes = num_items * 2; - else - num_bytes = (num_items + (2 * num_items)) * 3; - - /* - * num_bytes is total number of leaves we could need times the leaf - * size, and then for every leaf we could end up cow'ing 2 nodes per - * level, down to the leaf level. - */ - num_bytes = (num_bytes * root->leafsize) + - (num_bytes * (level * 2)) * root->nodesize; - - return num_bytes; -} - -/* - * Unreserve metadata space for delalloc. If we have less reserved credits than - * we have extents, this function does nothing. 
- */ -int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root, - struct inode *inode, int num_items) -{ - struct btrfs_fs_info *info = root->fs_info; - struct btrfs_space_info *meta_sinfo; - u64 num_bytes; - u64 alloc_target; - bool bug = false; - - /* get the space info for where the metadata will live */ - alloc_target = btrfs_get_alloc_profile(root, 0); - meta_sinfo = __find_space_info(info, alloc_target); - - num_bytes = calculate_bytes_needed(root->fs_info->extent_root, - num_items); - - spin_lock(&meta_sinfo->lock); - spin_lock(&BTRFS_I(inode)->accounting_lock); - if (BTRFS_I(inode)->reserved_extents <= - BTRFS_I(inode)->outstanding_extents) { - spin_unlock(&BTRFS_I(inode)->accounting_lock); - spin_unlock(&meta_sinfo->lock); - return 0; - } - spin_unlock(&BTRFS_I(inode)->accounting_lock); - - BTRFS_I(inode)->reserved_extents -= num_items; - BUG_ON(BTRFS_I(inode)->reserved_extents < 0); - - if (meta_sinfo->bytes_delalloc < num_bytes) { - bug = true; - meta_sinfo->bytes_delalloc = 0; - } else { - meta_sinfo->bytes_delalloc -= num_bytes; - } - spin_unlock(&meta_sinfo->lock); - - BUG_ON(bug); - - return 0; -} - -static void check_force_delalloc(struct btrfs_space_info *meta_sinfo) +static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) { - u64 thresh; - - thresh = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + - meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + - meta_sinfo->bytes_super + meta_sinfo->bytes_root + - meta_sinfo->bytes_may_use; - - thresh = meta_sinfo->total_bytes - thresh; - thresh *= 80; - do_div(thresh, 100); - if (thresh <= meta_sinfo->bytes_delalloc) - meta_sinfo->force_delalloc = 1; - else - meta_sinfo->force_delalloc = 0; + if (flags & BTRFS_BLOCK_GROUP_DATA) + flags |= root->fs_info->avail_data_alloc_bits & + root->fs_info->data_alloc_profile; + else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) + flags |= root->fs_info->avail_system_alloc_bits & + root->fs_info->system_alloc_profile; + else if (flags & BTRFS_BLOCK_GROUP_METADATA) + flags |= root->fs_info->avail_metadata_alloc_bits & + root->fs_info->metadata_alloc_profile; + return btrfs_reduce_alloc_profile(root, flags); } -struct async_flush { - struct btrfs_root *root; - struct btrfs_space_info *info; - struct btrfs_work work; -}; - -static noinline void flush_delalloc_async(struct btrfs_work *work) +static u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) { - struct async_flush *async; - struct btrfs_root *root; - struct btrfs_space_info *info; - - async = container_of(work, struct async_flush, work); - root = async->root; - info = async->info; - - btrfs_start_delalloc_inodes(root, 0); - wake_up(&info->flush_wait); - btrfs_wait_ordered_extents(root, 0, 0); - - spin_lock(&info->lock); - info->flushing = 0; - spin_unlock(&info->lock); - wake_up(&info->flush_wait); - - kfree(async); -} - -static void wait_on_flush(struct btrfs_space_info *info) -{ - DEFINE_WAIT(wait); - u64 used; - - while (1) { - prepare_to_wait(&info->flush_wait, &wait, - TASK_UNINTERRUPTIBLE); - spin_lock(&info->lock); - if (!info->flushing) { - spin_unlock(&info->lock); - break; - } - - used = info->bytes_used + info->bytes_reserved + - info->bytes_pinned + info->bytes_readonly + - info->bytes_super + info->bytes_root + - info->bytes_may_use + info->bytes_delalloc; - if (used < info->total_bytes) { - spin_unlock(&info->lock); - break; - } - spin_unlock(&info->lock); - schedule(); - } - finish_wait(&info->flush_wait, &wait); -} - -static void flush_delalloc(struct btrfs_root *root, - struct btrfs_space_info 
*info) -{ - struct async_flush *async; - bool wait = false; - - spin_lock(&info->lock); + u64 flags; - if (!info->flushing) - info->flushing = 1; + if (data) + flags = BTRFS_BLOCK_GROUP_DATA; + else if (root == root->fs_info->chunk_root) + flags = BTRFS_BLOCK_GROUP_SYSTEM; else - wait = true; - - spin_unlock(&info->lock); - - if (wait) { - wait_on_flush(info); - return; - } - - async = kzalloc(sizeof(*async), GFP_NOFS); - if (!async) - goto flush; - - async->root = root; - async->info = info; - async->work.func = flush_delalloc_async; - - btrfs_queue_worker(&root->fs_info->enospc_workers, - &async->work); - wait_on_flush(info); - return; - -flush: - btrfs_start_delalloc_inodes(root, 0); - btrfs_wait_ordered_extents(root, 0, 0); - - spin_lock(&info->lock); - info->flushing = 0; - spin_unlock(&info->lock); - wake_up(&info->flush_wait); -} - -static int maybe_allocate_chunk(struct btrfs_root *root, - struct btrfs_space_info *info) -{ - struct btrfs_super_block *disk_super = &root->fs_info->super_copy; - struct btrfs_trans_handle *trans; - bool wait = false; - int ret = 0; - u64 min_metadata; - u64 free_space; - - free_space = btrfs_super_total_bytes(disk_super); - /* - * we allow the metadata to grow to a max of either 10gb or 5% of the - * space in the volume. - */ - min_metadata = min((u64)10 * 1024 * 1024 * 1024, - div64_u64(free_space * 5, 100)); - if (info->total_bytes >= min_metadata) { - spin_unlock(&info->lock); - return 0; - } - - if (info->full) { - spin_unlock(&info->lock); - return 0; - } - - if (!info->allocating_chunk) { - info->force_alloc = 1; - info->allocating_chunk = 1; - } else { - wait = true; - } - - spin_unlock(&info->lock); - - if (wait) { - wait_event(info->allocate_wait, - !info->allocating_chunk); - return 1; - } - - trans = btrfs_start_transaction(root, 1); - if (!trans) { - ret = -ENOMEM; - goto out; - } - - ret = do_chunk_alloc(trans, root->fs_info->extent_root, - 4096 + 2 * 1024 * 1024, - info->flags, 0); - btrfs_end_transaction(trans, root); - if (ret) - goto out; -out: - spin_lock(&info->lock); - info->allocating_chunk = 0; - spin_unlock(&info->lock); - wake_up(&info->allocate_wait); - - if (ret) - return 0; - return 1; -} - -/* - * Reserve metadata space for delalloc. 
- */ -int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root, - struct inode *inode, int num_items) -{ - struct btrfs_fs_info *info = root->fs_info; - struct btrfs_space_info *meta_sinfo; - u64 num_bytes; - u64 used; - u64 alloc_target; - int flushed = 0; - int force_delalloc; - - /* get the space info for where the metadata will live */ - alloc_target = btrfs_get_alloc_profile(root, 0); - meta_sinfo = __find_space_info(info, alloc_target); - - num_bytes = calculate_bytes_needed(root->fs_info->extent_root, - num_items); -again: - spin_lock(&meta_sinfo->lock); - - force_delalloc = meta_sinfo->force_delalloc; - - if (unlikely(!meta_sinfo->bytes_root)) - meta_sinfo->bytes_root = calculate_bytes_needed(root, 6); - - if (!flushed) - meta_sinfo->bytes_delalloc += num_bytes; - - used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + - meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + - meta_sinfo->bytes_super + meta_sinfo->bytes_root + - meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc; - - if (used > meta_sinfo->total_bytes) { - flushed++; - - if (flushed == 1) { - if (maybe_allocate_chunk(root, meta_sinfo)) - goto again; - flushed++; - } else { - spin_unlock(&meta_sinfo->lock); - } - - if (flushed == 2) { - filemap_flush(inode->i_mapping); - goto again; - } else if (flushed == 3) { - flush_delalloc(root, meta_sinfo); - goto again; - } - spin_lock(&meta_sinfo->lock); - meta_sinfo->bytes_delalloc -= num_bytes; - spin_unlock(&meta_sinfo->lock); - printk(KERN_ERR "enospc, has %d, reserved %d\n", - BTRFS_I(inode)->outstanding_extents, - BTRFS_I(inode)->reserved_extents); - dump_space_info(meta_sinfo, 0, 0); - return -ENOSPC; - } + flags = BTRFS_BLOCK_GROUP_METADATA; - BTRFS_I(inode)->reserved_extents += num_items; - check_force_delalloc(meta_sinfo); - spin_unlock(&meta_sinfo->lock); - - if (!flushed && force_delalloc) - filemap_flush(inode->i_mapping); - - return 0; + return get_alloc_profile(root, flags); } -/* - * unreserve num_items number of items worth of metadata space. This needs to - * be paired with btrfs_reserve_metadata_space. - * - * NOTE: if you have the option, run this _AFTER_ you do a - * btrfs_end_transaction, since btrfs_end_transaction will run delayed ref - * oprations which will result in more used metadata, so we want to make sure we - * can do that without issue. - */ -int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items) -{ - struct btrfs_fs_info *info = root->fs_info; - struct btrfs_space_info *meta_sinfo; - u64 num_bytes; - u64 alloc_target; - bool bug = false; - - /* get the space info for where the metadata will live */ - alloc_target = btrfs_get_alloc_profile(root, 0); - meta_sinfo = __find_space_info(info, alloc_target); - - num_bytes = calculate_bytes_needed(root, num_items); - - spin_lock(&meta_sinfo->lock); - if (meta_sinfo->bytes_may_use < num_bytes) { - bug = true; - meta_sinfo->bytes_may_use = 0; - } else { - meta_sinfo->bytes_may_use -= num_bytes; - } - spin_unlock(&meta_sinfo->lock); - - BUG_ON(bug); - - return 0; -} - -/* - * Reserve some metadata space for use. We'll calculate the worste case number - * of bytes that would be needed to modify num_items number of items. If we - * have space, fantastic, if not, you get -ENOSPC. Please call - * btrfs_unreserve_metadata_space when you are done for the _SAME_ number of - * items you reserved, since whatever metadata you needed should have already - * been allocated. - * - * This will commit the transaction to make more space if we don't have enough - * metadata space. 
THe only time we don't do this is if we're reserving space - * inside of a transaction, then we will just return -ENOSPC and it is the - * callers responsibility to handle it properly. - */ -int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items) +void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode) { - struct btrfs_fs_info *info = root->fs_info; - struct btrfs_space_info *meta_sinfo; - u64 num_bytes; - u64 used; - u64 alloc_target; - int retries = 0; - - /* get the space info for where the metadata will live */ - alloc_target = btrfs_get_alloc_profile(root, 0); - meta_sinfo = __find_space_info(info, alloc_target); - - num_bytes = calculate_bytes_needed(root, num_items); -again: - spin_lock(&meta_sinfo->lock); - - if (unlikely(!meta_sinfo->bytes_root)) - meta_sinfo->bytes_root = calculate_bytes_needed(root, 6); - - if (!retries) - meta_sinfo->bytes_may_use += num_bytes; - - used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + - meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + - meta_sinfo->bytes_super + meta_sinfo->bytes_root + - meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc; - - if (used > meta_sinfo->total_bytes) { - retries++; - if (retries == 1) { - if (maybe_allocate_chunk(root, meta_sinfo)) - goto again; - retries++; - } else { - spin_unlock(&meta_sinfo->lock); - } - - if (retries == 2) { - flush_delalloc(root, meta_sinfo); - goto again; - } - spin_lock(&meta_sinfo->lock); - meta_sinfo->bytes_may_use -= num_bytes; - spin_unlock(&meta_sinfo->lock); - - dump_space_info(meta_sinfo, 0, 0); - return -ENOSPC; - } - - check_force_delalloc(meta_sinfo); - spin_unlock(&meta_sinfo->lock); - - return 0; + BTRFS_I(inode)->space_info = __find_space_info(root->fs_info, + BTRFS_BLOCK_GROUP_DATA); } /* * This will check the space that the inode allocates from to make sure we have * enough space for bytes. */ -int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, - u64 bytes) +int btrfs_check_data_free_space(struct inode *inode, u64 bytes) { struct btrfs_space_info *data_sinfo; + struct btrfs_root *root = BTRFS_I(inode)->root; u64 used; - int ret = 0, committed = 0, flushed = 0; + int ret = 0, committed = 0; /* make sure bytes are sectorsize aligned */ bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); @@ -3248,21 +2895,13 @@ int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, again: /* make sure we have enough space to handle the data first */ spin_lock(&data_sinfo->lock); - used = data_sinfo->bytes_used + data_sinfo->bytes_delalloc + - data_sinfo->bytes_reserved + data_sinfo->bytes_pinned + - data_sinfo->bytes_readonly + data_sinfo->bytes_may_use + - data_sinfo->bytes_super; + used = data_sinfo->bytes_used + data_sinfo->bytes_reserved + + data_sinfo->bytes_pinned + data_sinfo->bytes_readonly + + data_sinfo->bytes_may_use; if (used + bytes > data_sinfo->total_bytes) { struct btrfs_trans_handle *trans; - if (!flushed) { - spin_unlock(&data_sinfo->lock); - flush_delalloc(root, data_sinfo); - flushed = 1; - goto again; - } - /* * if we don't have enough free bytes in this space then we need * to alloc a new chunk. 
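For reference, the reworked btrfs_check_data_free_space() above rounds the request up to the sector size, sums bytes_used, bytes_reserved, bytes_pinned, bytes_readonly and bytes_may_use, and only charges the request to bytes_may_use if the total still fits in total_bytes; otherwise it falls back to chunk allocation or a transaction commit. The sketch below is plain userspace C that models only that accounting step; the struct and helper names are invented for illustration and are not the kernel API.

#include <stdint.h>
#include <stdio.h>

/* toy model of the fields btrfs_check_data_free_space() sums */
struct space_info_model {
        uint64_t total_bytes;
        uint64_t bytes_used;
        uint64_t bytes_reserved;
        uint64_t bytes_pinned;
        uint64_t bytes_readonly;
        uint64_t bytes_may_use;
};

/* round the request up to the sector size, as the kernel code does */
static uint64_t align_to_sector(uint64_t bytes, uint64_t sectorsize)
{
        return (bytes + sectorsize - 1) & ~(sectorsize - 1);
}

/*
 * 0 on success (request charged to bytes_may_use), -1 when the space_info
 * would be overcommitted and the caller would have to allocate a new chunk
 * or commit the running transaction instead.
 */
static int try_reserve_data(struct space_info_model *s, uint64_t bytes,
                            uint64_t sectorsize)
{
        uint64_t used;

        bytes = align_to_sector(bytes, sectorsize);
        used = s->bytes_used + s->bytes_reserved + s->bytes_pinned +
               s->bytes_readonly + s->bytes_may_use;
        if (used + bytes > s->total_bytes)
                return -1;
        s->bytes_may_use += bytes;
        return 0;
}

int main(void)
{
        struct space_info_model s = {
                .total_bytes = 1ULL << 30,      /* 1 GiB of data space */
                .bytes_used  = 900ULL << 20,    /* 900 MiB already used */
        };

        printf("reserve 64M : %d\n", try_reserve_data(&s, 64ULL << 20, 4096));
        printf("reserve 200M: %d\n", try_reserve_data(&s, 200ULL << 20, 4096));
        return 0;
}

The second call fails because the first reservation is still accounted in bytes_may_use; in the kernel that is the point where do_chunk_alloc() or btrfs_commit_transaction() is tried before returning -ENOSPC.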
@@ -3274,15 +2913,15 @@ again: spin_unlock(&data_sinfo->lock); alloc: alloc_target = btrfs_get_alloc_profile(root, 1); - trans = btrfs_start_transaction(root, 1); - if (!trans) - return -ENOMEM; + trans = btrfs_join_transaction(root, 1); + if (IS_ERR(trans)) + return PTR_ERR(trans); ret = do_chunk_alloc(trans, root->fs_info->extent_root, bytes + 2 * 1024 * 1024, alloc_target, 0); btrfs_end_transaction(trans, root); - if (ret) + if (ret < 0) return ret; if (!data_sinfo) { @@ -3297,25 +2936,26 @@ alloc: if (!committed && !root->fs_info->open_ioctl_trans) { committed = 1; trans = btrfs_join_transaction(root, 1); - if (!trans) - return -ENOMEM; + if (IS_ERR(trans)) + return PTR_ERR(trans); ret = btrfs_commit_transaction(trans, root); if (ret) return ret; goto again; } - printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes" - ", %llu bytes_used, %llu bytes_reserved, " - "%llu bytes_pinned, %llu bytes_readonly, %llu may use " - "%llu total\n", (unsigned long long)bytes, - (unsigned long long)data_sinfo->bytes_delalloc, +#if 0 /* I hope we never need this code again, just in case */ + printk(KERN_ERR "no space left, need %llu, %llu bytes_used, " + "%llu bytes_reserved, " "%llu bytes_pinned, " + "%llu bytes_readonly, %llu may use %llu total\n", + (unsigned long long)bytes, (unsigned long long)data_sinfo->bytes_used, (unsigned long long)data_sinfo->bytes_reserved, (unsigned long long)data_sinfo->bytes_pinned, (unsigned long long)data_sinfo->bytes_readonly, (unsigned long long)data_sinfo->bytes_may_use, (unsigned long long)data_sinfo->total_bytes); +#endif return -ENOSPC; } data_sinfo->bytes_may_use += bytes; @@ -3326,12 +2966,13 @@ alloc: } /* - * if there was an error for whatever reason after calling - * btrfs_check_data_free_space, call this so we can cleanup the counters. + * called when we are clearing an delalloc extent from the + * inode's io_tree or there was an error for whatever reason + * after calling btrfs_check_data_free_space */ -void btrfs_free_reserved_data_space(struct btrfs_root *root, - struct inode *inode, u64 bytes) +void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) { + struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_space_info *data_sinfo; /* make sure bytes are sectorsize aligned */ @@ -3344,48 +2985,6 @@ void btrfs_free_reserved_data_space(struct btrfs_root *root, spin_unlock(&data_sinfo->lock); } -/* called when we are adding a delalloc extent to the inode's io_tree */ -void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode, - u64 bytes) -{ - struct btrfs_space_info *data_sinfo; - - /* get the space info for where this inode will be storing its data */ - data_sinfo = BTRFS_I(inode)->space_info; - - /* make sure we have enough space to handle the data first */ - spin_lock(&data_sinfo->lock); - data_sinfo->bytes_delalloc += bytes; - - /* - * we are adding a delalloc extent without calling - * btrfs_check_data_free_space first. 
This happens on a weird - * writepage condition, but shouldn't hurt our accounting - */ - if (unlikely(bytes > BTRFS_I(inode)->reserved_bytes)) { - data_sinfo->bytes_may_use -= BTRFS_I(inode)->reserved_bytes; - BTRFS_I(inode)->reserved_bytes = 0; - } else { - data_sinfo->bytes_may_use -= bytes; - BTRFS_I(inode)->reserved_bytes -= bytes; - } - - spin_unlock(&data_sinfo->lock); -} - -/* called when we are clearing an delalloc extent from the inode's io_tree */ -void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode, - u64 bytes) -{ - struct btrfs_space_info *info; - - info = BTRFS_I(inode)->space_info; - - spin_lock(&info->lock); - info->bytes_delalloc -= bytes; - spin_unlock(&info->lock); -} - static void force_metadata_allocation(struct btrfs_fs_info *info) { struct list_head *head = &info->space_info; @@ -3399,13 +2998,28 @@ static void force_metadata_allocation(struct btrfs_fs_info *info) rcu_read_unlock(); } +static int should_alloc_chunk(struct btrfs_space_info *sinfo, + u64 alloc_bytes) +{ + u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly; + + if (sinfo->bytes_used + sinfo->bytes_reserved + + alloc_bytes + 256 * 1024 * 1024 < num_bytes) + return 0; + + if (sinfo->bytes_used + sinfo->bytes_reserved + + alloc_bytes < div_factor(num_bytes, 8)) + return 0; + + return 1; +} + static int do_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, u64 alloc_bytes, u64 flags, int force) { struct btrfs_space_info *space_info; struct btrfs_fs_info *fs_info = extent_root->fs_info; - u64 thresh; int ret = 0; mutex_lock(&fs_info->chunk_mutex); @@ -3428,11 +3042,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, goto out; } - thresh = space_info->total_bytes - space_info->bytes_readonly; - thresh = div_factor(thresh, 8); - if (!force && - (space_info->bytes_used + space_info->bytes_pinned + - space_info->bytes_reserved + alloc_bytes) < thresh) { + if (!force && !should_alloc_chunk(space_info, alloc_bytes)) { spin_unlock(&space_info->lock); goto out; } @@ -3454,6 +3064,8 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, spin_lock(&space_info->lock); if (ret) space_info->full = 1; + else + ret = 1; space_info->force_alloc = 0; spin_unlock(&space_info->lock); out: @@ -3461,13 +3073,713 @@ out: return ret; } +static int maybe_allocate_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_space_info *sinfo, u64 num_bytes) +{ + int ret; + int end_trans = 0; + + if (sinfo->full) + return 0; + + spin_lock(&sinfo->lock); + ret = should_alloc_chunk(sinfo, num_bytes + 2 * 1024 * 1024); + spin_unlock(&sinfo->lock); + if (!ret) + return 0; + + if (!trans) { + trans = btrfs_join_transaction(root, 1); + BUG_ON(IS_ERR(trans)); + end_trans = 1; + } + + ret = do_chunk_alloc(trans, root->fs_info->extent_root, + num_bytes + 2 * 1024 * 1024, + get_alloc_profile(root, sinfo->flags), 0); + + if (end_trans) + btrfs_end_transaction(trans, root); + + return ret == 1 ? 
1 : 0; +} + +/* + * shrink metadata reservation for delalloc + */ +static int shrink_delalloc(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 to_reclaim) +{ + struct btrfs_block_rsv *block_rsv; + u64 reserved; + u64 max_reclaim; + u64 reclaimed = 0; + int pause = 1; + int ret; + + block_rsv = &root->fs_info->delalloc_block_rsv; + spin_lock(&block_rsv->lock); + reserved = block_rsv->reserved; + spin_unlock(&block_rsv->lock); + + if (reserved == 0) + return 0; + + max_reclaim = min(reserved, to_reclaim); + + while (1) { + ret = btrfs_start_one_delalloc_inode(root, trans ? 1 : 0); + if (!ret) { + __set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(pause); + pause <<= 1; + if (pause > HZ / 10) + pause = HZ / 10; + } else { + pause = 1; + } + + spin_lock(&block_rsv->lock); + if (reserved > block_rsv->reserved) + reclaimed = reserved - block_rsv->reserved; + reserved = block_rsv->reserved; + spin_unlock(&block_rsv->lock); + + if (reserved == 0 || reclaimed >= max_reclaim) + break; + + if (trans && trans->transaction->blocked) + return -EAGAIN; + } + return reclaimed >= to_reclaim; +} + +static int should_retry_reserve(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes, int *retries) +{ + struct btrfs_space_info *space_info = block_rsv->space_info; + int ret; + + if ((*retries) > 2) + return -ENOSPC; + + ret = maybe_allocate_chunk(trans, root, space_info, num_bytes); + if (ret) + return 1; + + if (trans && trans->transaction->in_commit) + return -ENOSPC; + + ret = shrink_delalloc(trans, root, num_bytes); + if (ret) + return ret; + + spin_lock(&space_info->lock); + if (space_info->bytes_pinned < num_bytes) + ret = 1; + spin_unlock(&space_info->lock); + if (ret) + return -ENOSPC; + + (*retries)++; + + if (trans) + return -EAGAIN; + + trans = btrfs_join_transaction(root, 1); + BUG_ON(IS_ERR(trans)); + ret = btrfs_commit_transaction(trans, root); + BUG_ON(ret); + + return 1; +} + +static int reserve_metadata_bytes(struct btrfs_block_rsv *block_rsv, + u64 num_bytes) +{ + struct btrfs_space_info *space_info = block_rsv->space_info; + u64 unused; + int ret = -ENOSPC; + + spin_lock(&space_info->lock); + unused = space_info->bytes_used + space_info->bytes_reserved + + space_info->bytes_pinned + space_info->bytes_readonly; + + if (unused < space_info->total_bytes) + unused = space_info->total_bytes - unused; + else + unused = 0; + + if (unused >= num_bytes) { + if (block_rsv->priority >= 10) { + space_info->bytes_reserved += num_bytes; + ret = 0; + } else { + if ((unused + block_rsv->reserved) * + block_rsv->priority >= + (num_bytes + block_rsv->reserved) * 10) { + space_info->bytes_reserved += num_bytes; + ret = 0; + } + } + } + spin_unlock(&space_info->lock); + + return ret; +} + +static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_block_rsv *block_rsv; + if (root->ref_cows) + block_rsv = trans->block_rsv; + else + block_rsv = root->block_rsv; + + if (!block_rsv) + block_rsv = &root->fs_info->empty_block_rsv; + + return block_rsv; +} + +static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, + u64 num_bytes) +{ + int ret = -ENOSPC; + spin_lock(&block_rsv->lock); + if (block_rsv->reserved >= num_bytes) { + block_rsv->reserved -= num_bytes; + if (block_rsv->reserved < block_rsv->size) + block_rsv->full = 0; + ret = 0; + } + spin_unlock(&block_rsv->lock); + return ret; +} + +static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, 
+ u64 num_bytes, int update_size) +{ + spin_lock(&block_rsv->lock); + block_rsv->reserved += num_bytes; + if (update_size) + block_rsv->size += num_bytes; + else if (block_rsv->reserved >= block_rsv->size) + block_rsv->full = 1; + spin_unlock(&block_rsv->lock); +} + +void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv, + struct btrfs_block_rsv *dest, u64 num_bytes) +{ + struct btrfs_space_info *space_info = block_rsv->space_info; + + spin_lock(&block_rsv->lock); + if (num_bytes == (u64)-1) + num_bytes = block_rsv->size; + block_rsv->size -= num_bytes; + if (block_rsv->reserved >= block_rsv->size) { + num_bytes = block_rsv->reserved - block_rsv->size; + block_rsv->reserved = block_rsv->size; + block_rsv->full = 1; + } else { + num_bytes = 0; + } + spin_unlock(&block_rsv->lock); + + if (num_bytes > 0) { + if (dest) { + block_rsv_add_bytes(dest, num_bytes, 0); + } else { + spin_lock(&space_info->lock); + space_info->bytes_reserved -= num_bytes; + spin_unlock(&space_info->lock); + } + } +} + +static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src, + struct btrfs_block_rsv *dst, u64 num_bytes) +{ + int ret; + + ret = block_rsv_use_bytes(src, num_bytes); + if (ret) + return ret; + + block_rsv_add_bytes(dst, num_bytes, 1); + return 0; +} + +void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv) +{ + memset(rsv, 0, sizeof(*rsv)); + spin_lock_init(&rsv->lock); + atomic_set(&rsv->usage, 1); + rsv->priority = 6; + INIT_LIST_HEAD(&rsv->list); +} + +struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root) +{ + struct btrfs_block_rsv *block_rsv; + struct btrfs_fs_info *fs_info = root->fs_info; + u64 alloc_target; + + block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS); + if (!block_rsv) + return NULL; + + btrfs_init_block_rsv(block_rsv); + + alloc_target = btrfs_get_alloc_profile(root, 0); + block_rsv->space_info = __find_space_info(fs_info, + BTRFS_BLOCK_GROUP_METADATA); + + return block_rsv; +} + +void btrfs_free_block_rsv(struct btrfs_root *root, + struct btrfs_block_rsv *rsv) +{ + if (rsv && atomic_dec_and_test(&rsv->usage)) { + btrfs_block_rsv_release(root, rsv, (u64)-1); + if (!rsv->durable) + kfree(rsv); + } +} + +/* + * make the block_rsv struct be able to capture freed space. 
+ * the captured space will re-add to the the block_rsv struct + * after transaction commit + */ +void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *block_rsv) +{ + block_rsv->durable = 1; + mutex_lock(&fs_info->durable_block_rsv_mutex); + list_add_tail(&block_rsv->list, &fs_info->durable_block_rsv_list); + mutex_unlock(&fs_info->durable_block_rsv_mutex); +} + +int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes, int *retries) +{ + int ret; + + if (num_bytes == 0) + return 0; +again: + ret = reserve_metadata_bytes(block_rsv, num_bytes); + if (!ret) { + block_rsv_add_bytes(block_rsv, num_bytes, 1); + return 0; + } + + ret = should_retry_reserve(trans, root, block_rsv, num_bytes, retries); + if (ret > 0) + goto again; + + return ret; +} + +int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 min_reserved, int min_factor) +{ + u64 num_bytes = 0; + int commit_trans = 0; + int ret = -ENOSPC; + + if (!block_rsv) + return 0; + + spin_lock(&block_rsv->lock); + if (min_factor > 0) + num_bytes = div_factor(block_rsv->size, min_factor); + if (min_reserved > num_bytes) + num_bytes = min_reserved; + + if (block_rsv->reserved >= num_bytes) { + ret = 0; + } else { + num_bytes -= block_rsv->reserved; + if (block_rsv->durable && + block_rsv->freed[0] + block_rsv->freed[1] >= num_bytes) + commit_trans = 1; + } + spin_unlock(&block_rsv->lock); + if (!ret) + return 0; + + if (block_rsv->refill_used) { + ret = reserve_metadata_bytes(block_rsv, num_bytes); + if (!ret) { + block_rsv_add_bytes(block_rsv, num_bytes, 0); + return 0; + } + } + + if (commit_trans) { + if (trans) + return -EAGAIN; + + trans = btrfs_join_transaction(root, 1); + BUG_ON(IS_ERR(trans)); + ret = btrfs_commit_transaction(trans, root); + return 0; + } + + WARN_ON(1); + printk(KERN_INFO"block_rsv size %llu reserved %llu freed %llu %llu\n", + block_rsv->size, block_rsv->reserved, + block_rsv->freed[0], block_rsv->freed[1]); + + return -ENOSPC; +} + +int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, + struct btrfs_block_rsv *dst_rsv, + u64 num_bytes) +{ + return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); +} + +void btrfs_block_rsv_release(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes) +{ + struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; + if (global_rsv->full || global_rsv == block_rsv || + block_rsv->space_info != global_rsv->space_info) + global_rsv = NULL; + block_rsv_release_bytes(block_rsv, global_rsv, num_bytes); +} + +/* + * helper to calculate size of global block reservation. + * the desired value is sum of space used by extent tree, + * checksum tree and root tree + */ +static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info) +{ + struct btrfs_space_info *sinfo; + u64 num_bytes; + u64 meta_used; + u64 data_used; + int csum_size = btrfs_super_csum_size(&fs_info->super_copy); +#if 0 + /* + * per tree used space accounting can be inaccuracy, so we + * can't rely on it. 
+ */ + spin_lock(&fs_info->extent_root->accounting_lock); + num_bytes = btrfs_root_used(&fs_info->extent_root->root_item); + spin_unlock(&fs_info->extent_root->accounting_lock); + + spin_lock(&fs_info->csum_root->accounting_lock); + num_bytes += btrfs_root_used(&fs_info->csum_root->root_item); + spin_unlock(&fs_info->csum_root->accounting_lock); + + spin_lock(&fs_info->tree_root->accounting_lock); + num_bytes += btrfs_root_used(&fs_info->tree_root->root_item); + spin_unlock(&fs_info->tree_root->accounting_lock); +#endif + sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA); + spin_lock(&sinfo->lock); + data_used = sinfo->bytes_used; + spin_unlock(&sinfo->lock); + + sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); + spin_lock(&sinfo->lock); + meta_used = sinfo->bytes_used; + spin_unlock(&sinfo->lock); + + num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) * + csum_size * 2; + num_bytes += div64_u64(data_used + meta_used, 50); + + if (num_bytes * 3 > meta_used) + num_bytes = div64_u64(meta_used, 3); + + return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10); +} + +static void update_global_block_rsv(struct btrfs_fs_info *fs_info) +{ + struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; + struct btrfs_space_info *sinfo = block_rsv->space_info; + u64 num_bytes; + + num_bytes = calc_global_metadata_size(fs_info); + + spin_lock(&block_rsv->lock); + spin_lock(&sinfo->lock); + + block_rsv->size = num_bytes; + + num_bytes = sinfo->bytes_used + sinfo->bytes_pinned + + sinfo->bytes_reserved + sinfo->bytes_readonly; + + if (sinfo->total_bytes > num_bytes) { + num_bytes = sinfo->total_bytes - num_bytes; + block_rsv->reserved += num_bytes; + sinfo->bytes_reserved += num_bytes; + } + + if (block_rsv->reserved >= block_rsv->size) { + num_bytes = block_rsv->reserved - block_rsv->size; + sinfo->bytes_reserved -= num_bytes; + block_rsv->reserved = block_rsv->size; + block_rsv->full = 1; + } +#if 0 + printk(KERN_INFO"global block rsv size %llu reserved %llu\n", + block_rsv->size, block_rsv->reserved); +#endif + spin_unlock(&sinfo->lock); + spin_unlock(&block_rsv->lock); +} + +static void init_global_block_rsv(struct btrfs_fs_info *fs_info) +{ + struct btrfs_space_info *space_info; + + space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); + fs_info->chunk_block_rsv.space_info = space_info; + fs_info->chunk_block_rsv.priority = 10; + + space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); + fs_info->global_block_rsv.space_info = space_info; + fs_info->global_block_rsv.priority = 10; + fs_info->global_block_rsv.refill_used = 1; + fs_info->delalloc_block_rsv.space_info = space_info; + fs_info->trans_block_rsv.space_info = space_info; + fs_info->empty_block_rsv.space_info = space_info; + fs_info->empty_block_rsv.priority = 10; + + fs_info->extent_root->block_rsv = &fs_info->global_block_rsv; + fs_info->csum_root->block_rsv = &fs_info->global_block_rsv; + fs_info->dev_root->block_rsv = &fs_info->global_block_rsv; + fs_info->tree_root->block_rsv = &fs_info->global_block_rsv; + fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv; + + btrfs_add_durable_block_rsv(fs_info, &fs_info->global_block_rsv); + + btrfs_add_durable_block_rsv(fs_info, &fs_info->delalloc_block_rsv); + + update_global_block_rsv(fs_info); +} + +static void release_global_block_rsv(struct btrfs_fs_info *fs_info) +{ + block_rsv_release_bytes(&fs_info->global_block_rsv, NULL, (u64)-1); + WARN_ON(fs_info->delalloc_block_rsv.size > 0); + 
WARN_ON(fs_info->delalloc_block_rsv.reserved > 0); + WARN_ON(fs_info->trans_block_rsv.size > 0); + WARN_ON(fs_info->trans_block_rsv.reserved > 0); + WARN_ON(fs_info->chunk_block_rsv.size > 0); + WARN_ON(fs_info->chunk_block_rsv.reserved > 0); +} + +static u64 calc_trans_metadata_size(struct btrfs_root *root, int num_items) +{ + return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) * + 3 * num_items; +} + +int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + int num_items, int *retries) +{ + u64 num_bytes; + int ret; + + if (num_items == 0 || root->fs_info->chunk_root == root) + return 0; + + num_bytes = calc_trans_metadata_size(root, num_items); + ret = btrfs_block_rsv_add(trans, root, &root->fs_info->trans_block_rsv, + num_bytes, retries); + if (!ret) { + trans->bytes_reserved += num_bytes; + trans->block_rsv = &root->fs_info->trans_block_rsv; + } + return ret; +} + +void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + if (!trans->bytes_reserved) + return; + + BUG_ON(trans->block_rsv != &root->fs_info->trans_block_rsv); + btrfs_block_rsv_release(root, trans->block_rsv, + trans->bytes_reserved); + trans->bytes_reserved = 0; +} + +int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, + struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root); + struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv; + + /* + * one for deleting orphan item, one for updating inode and + * two for calling btrfs_truncate_inode_items. + * + * btrfs_truncate_inode_items is a delete operation, it frees + * more space than it uses in most cases. So two units of + * metadata space should be enough for calling it many times. + * If all of the metadata space is used, we can commit + * transaction and use space it freed. + */ + u64 num_bytes = calc_trans_metadata_size(root, 4); + return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); +} + +void btrfs_orphan_release_metadata(struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 num_bytes = calc_trans_metadata_size(root, 4); + btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes); +} + +int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans, + struct btrfs_pending_snapshot *pending) +{ + struct btrfs_root *root = pending->root; + struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root); + struct btrfs_block_rsv *dst_rsv = &pending->block_rsv; + /* + * two for root back/forward refs, two for directory entries + * and one for root of the snapshot. 
+ */ + u64 num_bytes = calc_trans_metadata_size(root, 5); + dst_rsv->space_info = src_rsv->space_info; + return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); +} + +static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes) +{ + return num_bytes >>= 3; +} + +int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv; + u64 to_reserve; + int nr_extents; + int retries = 0; + int ret; + + if (btrfs_transaction_in_commit(root->fs_info)) + schedule_timeout(1); + + num_bytes = ALIGN(num_bytes, root->sectorsize); +again: + spin_lock(&BTRFS_I(inode)->accounting_lock); + nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents) + 1; + if (nr_extents > BTRFS_I(inode)->reserved_extents) { + nr_extents -= BTRFS_I(inode)->reserved_extents; + to_reserve = calc_trans_metadata_size(root, nr_extents); + } else { + nr_extents = 0; + to_reserve = 0; + } + + to_reserve += calc_csum_metadata_size(inode, num_bytes); + ret = reserve_metadata_bytes(block_rsv, to_reserve); + if (ret) { + spin_unlock(&BTRFS_I(inode)->accounting_lock); + ret = should_retry_reserve(NULL, root, block_rsv, to_reserve, + &retries); + if (ret > 0) + goto again; + return ret; + } + + BTRFS_I(inode)->reserved_extents += nr_extents; + atomic_inc(&BTRFS_I(inode)->outstanding_extents); + spin_unlock(&BTRFS_I(inode)->accounting_lock); + + block_rsv_add_bytes(block_rsv, to_reserve, 1); + + if (block_rsv->size > 512 * 1024 * 1024) + shrink_delalloc(NULL, root, to_reserve); + + return 0; +} + +void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 to_free; + int nr_extents; + + num_bytes = ALIGN(num_bytes, root->sectorsize); + atomic_dec(&BTRFS_I(inode)->outstanding_extents); + + spin_lock(&BTRFS_I(inode)->accounting_lock); + nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents); + if (nr_extents < BTRFS_I(inode)->reserved_extents) { + nr_extents = BTRFS_I(inode)->reserved_extents - nr_extents; + BTRFS_I(inode)->reserved_extents -= nr_extents; + } else { + nr_extents = 0; + } + spin_unlock(&BTRFS_I(inode)->accounting_lock); + + to_free = calc_csum_metadata_size(inode, num_bytes); + if (nr_extents > 0) + to_free += calc_trans_metadata_size(root, nr_extents); + + btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv, + to_free); +} + +int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes) +{ + int ret; + + ret = btrfs_check_data_free_space(inode, num_bytes); + if (ret) + return ret; + + ret = btrfs_delalloc_reserve_metadata(inode, num_bytes); + if (ret) { + btrfs_free_reserved_data_space(inode, num_bytes); + return ret; + } + + return 0; +} + +void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes) +{ + btrfs_delalloc_release_metadata(inode, num_bytes); + btrfs_free_reserved_data_space(inode, num_bytes); +} + static int update_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 bytenr, u64 num_bytes, int alloc, - int mark_free) + u64 bytenr, u64 num_bytes, int alloc) { struct btrfs_block_group_cache *cache; struct btrfs_fs_info *info = root->fs_info; + int factor; u64 total = num_bytes; u64 old_val; u64 byte_in_group; @@ -3486,6 +3798,12 @@ static int update_block_group(struct btrfs_trans_handle *trans, cache = btrfs_lookup_block_group(info, bytenr); if (!cache) return -1; + if (cache->flags & (BTRFS_BLOCK_GROUP_DUP | + 
BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10)) + factor = 2; + else + factor = 1; byte_in_group = bytenr - cache->key.objectid; WARN_ON(byte_in_group > cache->key.offset); @@ -3498,31 +3816,24 @@ static int update_block_group(struct btrfs_trans_handle *trans, old_val += num_bytes; btrfs_set_block_group_used(&cache->item, old_val); cache->reserved -= num_bytes; - cache->space_info->bytes_used += num_bytes; cache->space_info->bytes_reserved -= num_bytes; - if (cache->ro) - cache->space_info->bytes_readonly -= num_bytes; + cache->space_info->bytes_used += num_bytes; + cache->space_info->disk_used += num_bytes * factor; spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); } else { old_val -= num_bytes; - cache->space_info->bytes_used -= num_bytes; - if (cache->ro) - cache->space_info->bytes_readonly += num_bytes; btrfs_set_block_group_used(&cache->item, old_val); + cache->pinned += num_bytes; + cache->space_info->bytes_pinned += num_bytes; + cache->space_info->bytes_used -= num_bytes; + cache->space_info->disk_used -= num_bytes * factor; spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); - if (mark_free) { - int ret; - - ret = btrfs_discard_extent(root, bytenr, - num_bytes); - WARN_ON(ret); - ret = btrfs_add_free_space(cache, bytenr, - num_bytes); - WARN_ON(ret); - } + set_extent_dirty(info->pinned_extents, + bytenr, bytenr + num_bytes - 1, + GFP_NOFS | __GFP_NOFAIL); } btrfs_put_block_group(cache); total -= num_bytes; @@ -3546,18 +3857,10 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start) return bytenr; } -/* - * this function must be called within transaction - */ -int btrfs_pin_extent(struct btrfs_root *root, - u64 bytenr, u64 num_bytes, int reserved) +static int pin_down_extent(struct btrfs_root *root, + struct btrfs_block_group_cache *cache, + u64 bytenr, u64 num_bytes, int reserved) { - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_block_group_cache *cache; - - cache = btrfs_lookup_block_group(fs_info, bytenr); - BUG_ON(!cache); - spin_lock(&cache->space_info->lock); spin_lock(&cache->lock); cache->pinned += num_bytes; @@ -3569,28 +3872,68 @@ int btrfs_pin_extent(struct btrfs_root *root, spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); - btrfs_put_block_group(cache); + set_extent_dirty(root->fs_info->pinned_extents, bytenr, + bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); + return 0; +} - set_extent_dirty(fs_info->pinned_extents, - bytenr, bytenr + num_bytes - 1, GFP_NOFS); +/* + * this function must be called within transaction + */ +int btrfs_pin_extent(struct btrfs_root *root, + u64 bytenr, u64 num_bytes, int reserved) +{ + struct btrfs_block_group_cache *cache; + + cache = btrfs_lookup_block_group(root->fs_info, bytenr); + BUG_ON(!cache); + + pin_down_extent(root, cache, bytenr, num_bytes, reserved); + + btrfs_put_block_group(cache); return 0; } -static int update_reserved_extents(struct btrfs_block_group_cache *cache, - u64 num_bytes, int reserve) +/* + * update size of reserved extents. this function may return -EAGAIN + * if 'reserve' is true or 'sinfo' is false. 
+ */ +static int update_reserved_bytes(struct btrfs_block_group_cache *cache, + u64 num_bytes, int reserve, int sinfo) { - spin_lock(&cache->space_info->lock); - spin_lock(&cache->lock); - if (reserve) { - cache->reserved += num_bytes; - cache->space_info->bytes_reserved += num_bytes; + int ret = 0; + if (sinfo) { + struct btrfs_space_info *space_info = cache->space_info; + spin_lock(&space_info->lock); + spin_lock(&cache->lock); + if (reserve) { + if (cache->ro) { + ret = -EAGAIN; + } else { + cache->reserved += num_bytes; + space_info->bytes_reserved += num_bytes; + } + } else { + if (cache->ro) + space_info->bytes_readonly += num_bytes; + cache->reserved -= num_bytes; + space_info->bytes_reserved -= num_bytes; + } + spin_unlock(&cache->lock); + spin_unlock(&space_info->lock); } else { - cache->reserved -= num_bytes; - cache->space_info->bytes_reserved -= num_bytes; + spin_lock(&cache->lock); + if (cache->ro) { + ret = -EAGAIN; + } else { + if (reserve) + cache->reserved += num_bytes; + else + cache->reserved -= num_bytes; + } + spin_unlock(&cache->lock); } - spin_unlock(&cache->lock); - spin_unlock(&cache->space_info->lock); - return 0; + return ret; } int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, @@ -3621,6 +3964,8 @@ int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, fs_info->pinned_extents = &fs_info->freed_extents[0]; up_write(&fs_info->extent_commit_sem); + + update_global_block_rsv(fs_info); return 0; } @@ -3647,14 +3992,21 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) btrfs_add_free_space(cache, start, len); } + start += len; + spin_lock(&cache->space_info->lock); spin_lock(&cache->lock); cache->pinned -= len; cache->space_info->bytes_pinned -= len; + if (cache->ro) { + cache->space_info->bytes_readonly += len; + } else if (cache->reserved_pinned > 0) { + len = min(len, cache->reserved_pinned); + cache->reserved_pinned -= len; + cache->space_info->bytes_reserved += len; + } spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); - - start += len; } if (cache) @@ -3667,8 +4019,11 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *fs_info = root->fs_info; struct extent_io_tree *unpin; + struct btrfs_block_rsv *block_rsv; + struct btrfs_block_rsv *next_rsv; u64 start; u64 end; + int idx; int ret; if (fs_info->pinned_extents == &fs_info->freed_extents[0]) @@ -3689,59 +4044,30 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, cond_resched(); } - return ret; -} - -static int pin_down_bytes(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - u64 bytenr, u64 num_bytes, - int is_data, int reserved, - struct extent_buffer **must_clean) -{ - int err = 0; - struct extent_buffer *buf; + mutex_lock(&fs_info->durable_block_rsv_mutex); + list_for_each_entry_safe(block_rsv, next_rsv, + &fs_info->durable_block_rsv_list, list) { - if (is_data) - goto pinit; - - /* - * discard is sloooow, and so triggering discards on - * individual btree blocks isn't a good plan. Just - * pin everything in discard mode. 
- */ - if (btrfs_test_opt(root, DISCARD)) - goto pinit; - - buf = btrfs_find_tree_block(root, bytenr, num_bytes); - if (!buf) - goto pinit; + idx = trans->transid & 0x1; + if (block_rsv->freed[idx] > 0) { + block_rsv_add_bytes(block_rsv, + block_rsv->freed[idx], 0); + block_rsv->freed[idx] = 0; + } + if (atomic_read(&block_rsv->usage) == 0) { + btrfs_block_rsv_release(root, block_rsv, (u64)-1); - /* we can reuse a block if it hasn't been written - * and it is from this transaction. We can't - * reuse anything from the tree log root because - * it has tiny sub-transactions. - */ - if (btrfs_buffer_uptodate(buf, 0) && - btrfs_try_tree_lock(buf)) { - u64 header_owner = btrfs_header_owner(buf); - u64 header_transid = btrfs_header_generation(buf); - if (header_owner != BTRFS_TREE_LOG_OBJECTID && - header_transid == trans->transid && - !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { - *must_clean = buf; - return 1; + if (block_rsv->freed[0] == 0 && + block_rsv->freed[1] == 0) { + list_del_init(&block_rsv->list); + kfree(block_rsv); + } + } else { + btrfs_block_rsv_release(root, block_rsv, 0); } - btrfs_tree_unlock(buf); } - free_extent_buffer(buf); -pinit: - if (path) - btrfs_set_path_blocking(path); - /* unlocks the pinned mutex */ - btrfs_pin_extent(root, bytenr, num_bytes, reserved); + mutex_unlock(&fs_info->durable_block_rsv_mutex); - BUG_ON(err < 0); return 0; } @@ -3902,9 +4228,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, BUG_ON(ret); } } else { - int mark_free = 0; - struct extent_buffer *must_clean = NULL; - if (found_extent) { BUG_ON(is_data && refs_to_drop != extent_data_ref_count(root, path, iref)); @@ -3917,31 +4240,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } } - ret = pin_down_bytes(trans, root, path, bytenr, - num_bytes, is_data, 0, &must_clean); - if (ret > 0) - mark_free = 1; - BUG_ON(ret < 0); - /* - * it is going to be very rare for someone to be waiting - * on the block we're freeing. del_items might need to - * schedule, so rather than get fancy, just force it - * to blocking here - */ - if (must_clean) - btrfs_set_lock_blocking(must_clean); - ret = btrfs_del_items(trans, extent_root, path, path->slots[0], num_to_del); BUG_ON(ret); btrfs_release_path(extent_root, path); - if (must_clean) { - clean_tree_block(NULL, root, must_clean); - btrfs_tree_unlock(must_clean); - free_extent_buffer(must_clean); - } - if (is_data) { ret = btrfs_del_csums(trans, root, bytenr, num_bytes); BUG_ON(ret); @@ -3951,8 +4254,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT); } - ret = update_block_group(trans, root, bytenr, num_bytes, 0, - mark_free); + ret = update_block_group(trans, root, bytenr, num_bytes, 0); BUG_ON(ret); } btrfs_free_path(path); @@ -3960,7 +4262,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } /* - * when we free an extent, it is possible (and likely) that we free the last + * when we free an block, it is possible (and likely) that we free the last * delayed ref for that extent as well. This searches the delayed ref tree for * a given extent, and if there are no other delayed refs to be processed, it * removes it from the tree. 
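The update_block_group() hunk above starts maintaining space_info->disk_used next to bytes_used, scaled by a replication factor: DUP, RAID1 and RAID10 block groups count every logical byte twice on disk, everything else counts it once. A minimal stand-alone sketch of that factor calculation; the BG_* flag values here are illustrative, not the kernel's BTRFS_BLOCK_GROUP_* constants.

#include <stdint.h>
#include <stdio.h>

#define BG_RAID0        (1ULL << 0)     /* illustrative flag values */
#define BG_RAID1        (1ULL << 1)
#define BG_DUP          (1ULL << 2)
#define BG_RAID10       (1ULL << 3)

/* mirrored profiles consume two raw bytes per logical byte, the rest one */
static uint64_t disk_used_delta(uint64_t bg_flags, uint64_t num_bytes)
{
        int factor = (bg_flags & (BG_DUP | BG_RAID1 | BG_RAID10)) ? 2 : 1;

        return num_bytes * factor;
}

int main(void)
{
        printf("raid1: %llu\n", (unsigned long long)disk_used_delta(BG_RAID1, 4096)); /* 8192 */
        printf("raid0: %llu\n", (unsigned long long)disk_used_delta(BG_RAID0, 4096)); /* 4096 */
        return 0;
}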
@@ -3972,7 +4274,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_delayed_ref_node *ref; struct rb_node *node; - int ret; + int ret = 0; delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); @@ -4024,17 +4326,100 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, list_del_init(&head->cluster); spin_unlock(&delayed_refs->lock); - ret = run_one_delayed_ref(trans, root->fs_info->tree_root, - &head->node, head->extent_op, - head->must_insert_reserved); - BUG_ON(ret); + BUG_ON(head->extent_op); + if (head->must_insert_reserved) + ret = 1; + + mutex_unlock(&head->mutex); btrfs_put_delayed_ref(&head->node); - return 0; + return ret; out: spin_unlock(&delayed_refs->lock); return 0; } +void btrfs_free_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, + u64 parent, int last_ref) +{ + struct btrfs_block_rsv *block_rsv; + struct btrfs_block_group_cache *cache = NULL; + int ret; + + if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { + ret = btrfs_add_delayed_tree_ref(trans, buf->start, buf->len, + parent, root->root_key.objectid, + btrfs_header_level(buf), + BTRFS_DROP_DELAYED_REF, NULL); + BUG_ON(ret); + } + + if (!last_ref) + return; + + block_rsv = get_block_rsv(trans, root); + cache = btrfs_lookup_block_group(root->fs_info, buf->start); + if (block_rsv->space_info != cache->space_info) + goto out; + + if (btrfs_header_generation(buf) == trans->transid) { + if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { + ret = check_ref_cleanup(trans, root, buf->start); + if (!ret) + goto pin; + } + + if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { + pin_down_extent(root, cache, buf->start, buf->len, 1); + goto pin; + } + + WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); + + btrfs_add_free_space(cache, buf->start, buf->len); + ret = update_reserved_bytes(cache, buf->len, 0, 0); + if (ret == -EAGAIN) { + /* block group became read-only */ + update_reserved_bytes(cache, buf->len, 0, 1); + goto out; + } + + ret = 1; + spin_lock(&block_rsv->lock); + if (block_rsv->reserved < block_rsv->size) { + block_rsv->reserved += buf->len; + ret = 0; + } + spin_unlock(&block_rsv->lock); + + if (ret) { + spin_lock(&cache->space_info->lock); + cache->space_info->bytes_reserved -= buf->len; + spin_unlock(&cache->space_info->lock); + } + goto out; + } +pin: + if (block_rsv->durable && !cache->ro) { + ret = 0; + spin_lock(&cache->lock); + if (!cache->ro) { + cache->reserved_pinned += buf->len; + ret = 1; + } + spin_unlock(&cache->lock); + + if (ret) { + spin_lock(&block_rsv->lock); + block_rsv->freed[trans->transid & 0x1] += buf->len; + spin_unlock(&block_rsv->lock); + } + } +out: + btrfs_put_block_group(cache); +} + int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, @@ -4056,8 +4441,6 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, parent, root_objectid, (int)owner, BTRFS_DROP_DELAYED_REF, NULL); BUG_ON(ret); - ret = check_ref_cleanup(trans, root, bytenr); - BUG_ON(ret); } else { ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, parent, root_objectid, owner, @@ -4067,21 +4450,6 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, return ret; } -int btrfs_free_tree_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytenr, u32 blocksize, - u64 parent, u64 root_objectid, int level) -{ - u64 used; - 
spin_lock(&root->node_lock); - used = btrfs_root_used(&root->root_item) - blocksize; - btrfs_set_root_used(&root->root_item, used); - spin_unlock(&root->node_lock); - - return btrfs_free_extent(trans, root, bytenr, blocksize, - parent, root_objectid, level, 0); -} - static u64 stripe_align(struct btrfs_root *root, u64 val) { u64 mask = ((u64)root->stripesize - 1); @@ -4134,6 +4502,22 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache) return 0; } +static int get_block_group_index(struct btrfs_block_group_cache *cache) +{ + int index; + if (cache->flags & BTRFS_BLOCK_GROUP_RAID10) + index = 0; + else if (cache->flags & BTRFS_BLOCK_GROUP_RAID1) + index = 1; + else if (cache->flags & BTRFS_BLOCK_GROUP_DUP) + index = 2; + else if (cache->flags & BTRFS_BLOCK_GROUP_RAID0) + index = 3; + else + index = 4; + return index; +} + enum btrfs_loop_type { LOOP_FIND_IDEAL = 0, LOOP_CACHING_NOWAIT = 1, @@ -4155,7 +4539,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, u64 num_bytes, u64 empty_size, u64 search_start, u64 search_end, u64 hint_byte, struct btrfs_key *ins, - u64 exclude_start, u64 exclude_nr, int data) { int ret = 0; @@ -4168,6 +4551,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_space_info *space_info; int last_ptr_loop = 0; int loop = 0; + int index = 0; bool found_uncached_bg = false; bool failed_cluster_refill = false; bool failed_alloc = false; @@ -4237,6 +4621,7 @@ ideal_cache: btrfs_put_block_group(block_group); up_read(&space_info->groups_sem); } else { + index = get_block_group_index(block_group); goto have_block_group; } } else if (block_group) { @@ -4245,7 +4630,8 @@ ideal_cache: } search: down_read(&space_info->groups_sem); - list_for_each_entry(block_group, &space_info->block_groups, list) { + list_for_each_entry(block_group, &space_info->block_groups[index], + list) { u64 offset; int cached; @@ -4436,23 +4822,22 @@ checks: goto loop; } - if (exclude_nr > 0 && - (search_start + num_bytes > exclude_start && - search_start < exclude_start + exclude_nr)) { - search_start = exclude_start + exclude_nr; + ins->objectid = search_start; + ins->offset = num_bytes; + + if (offset < search_start) + btrfs_add_free_space(block_group, offset, + search_start - offset); + BUG_ON(offset > search_start); + ret = update_reserved_bytes(block_group, num_bytes, 1, + (data & BTRFS_BLOCK_GROUP_DATA)); + if (ret == -EAGAIN) { btrfs_add_free_space(block_group, offset, num_bytes); - /* - * if search_start is still in this block group - * then we just re-search this block group - */ - if (search_start >= block_group->key.objectid && - search_start < (block_group->key.objectid + - block_group->key.offset)) - goto have_block_group; goto loop; } + /* we are all good, lets return */ ins->objectid = search_start; ins->offset = num_bytes; @@ -4460,18 +4845,18 @@ checks: btrfs_add_free_space(block_group, offset, search_start - offset); BUG_ON(offset > search_start); - - update_reserved_extents(block_group, num_bytes, 1); - - /* we are all good, lets return */ break; loop: failed_cluster_refill = false; failed_alloc = false; + BUG_ON(index != get_block_group_index(block_group)); btrfs_put_block_group(block_group); } up_read(&space_info->groups_sem); + if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES) + goto search; + /* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait for * for them to make caching progress. 
Also * determine the best possible bg to cache @@ -4485,6 +4870,7 @@ loop: if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE && (found_uncached_bg || empty_size || empty_cluster || allowed_chunk_alloc)) { + index = 0; if (loop == LOOP_FIND_IDEAL && found_uncached_bg) { found_uncached_bg = false; loop++; @@ -4567,31 +4953,30 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes, int dump_block_groups) { struct btrfs_block_group_cache *cache; + int index = 0; spin_lock(&info->lock); printk(KERN_INFO "space_info has %llu free, is %sfull\n", (unsigned long long)(info->total_bytes - info->bytes_used - info->bytes_pinned - info->bytes_reserved - - info->bytes_super), + info->bytes_readonly), (info->full) ? "" : "not "); - printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu," - " may_use=%llu, used=%llu, root=%llu, super=%llu, reserved=%llu" - "\n", + printk(KERN_INFO "space_info total=%llu, used=%llu, pinned=%llu, " + "reserved=%llu, may_use=%llu, readonly=%llu\n", (unsigned long long)info->total_bytes, + (unsigned long long)info->bytes_used, (unsigned long long)info->bytes_pinned, - (unsigned long long)info->bytes_delalloc, + (unsigned long long)info->bytes_reserved, (unsigned long long)info->bytes_may_use, - (unsigned long long)info->bytes_used, - (unsigned long long)info->bytes_root, - (unsigned long long)info->bytes_super, - (unsigned long long)info->bytes_reserved); + (unsigned long long)info->bytes_readonly); spin_unlock(&info->lock); if (!dump_block_groups) return; down_read(&info->groups_sem); - list_for_each_entry(cache, &info->block_groups, list) { +again: + list_for_each_entry(cache, &info->block_groups[index], list) { spin_lock(&cache->lock); printk(KERN_INFO "block group %llu has %llu bytes, %llu used " "%llu pinned %llu reserved\n", @@ -4603,6 +4988,8 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes, btrfs_dump_free_space(cache, bytes); spin_unlock(&cache->lock); } + if (++index < BTRFS_NR_RAID_TYPES) + goto again; up_read(&info->groups_sem); } @@ -4628,9 +5015,8 @@ again: WARN_ON(num_bytes < root->sectorsize); ret = find_free_extent(trans, root, num_bytes, empty_size, - search_start, search_end, hint_byte, ins, - trans->alloc_exclude_start, - trans->alloc_exclude_nr, data); + search_start, search_end, hint_byte, + ins, data); if (ret == -ENOSPC && num_bytes > min_alloc_size) { num_bytes = num_bytes >> 1; @@ -4668,7 +5054,7 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len) ret = btrfs_discard_extent(root, start, len); btrfs_add_free_space(cache, start, len); - update_reserved_extents(cache, len, 0); + update_reserved_bytes(cache, len, 0, 1); btrfs_put_block_group(cache); return ret; @@ -4731,8 +5117,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(path->nodes[0]); btrfs_free_path(path); - ret = update_block_group(trans, root, ins->objectid, ins->offset, - 1, 0); + ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); if (ret) { printk(KERN_ERR "btrfs update block group failed for %llu " "%llu\n", (unsigned long long)ins->objectid, @@ -4792,8 +5177,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_free_path(path); - ret = update_block_group(trans, root, ins->objectid, ins->offset, - 1, 0); + ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); if (ret) { printk(KERN_ERR "btrfs update block group failed for %llu " "%llu\n", (unsigned long 
long)ins->objectid, @@ -4869,73 +5253,14 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, put_caching_control(caching_ctl); } - update_reserved_extents(block_group, ins->offset, 1); + ret = update_reserved_bytes(block_group, ins->offset, 1, 1); + BUG_ON(ret); btrfs_put_block_group(block_group); ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 0, owner, offset, ins, 1); return ret; } -/* - * finds a free extent and does all the dirty work required for allocation - * returns the key for the extent through ins, and a tree buffer for - * the first block of the extent through buf. - * - * returns 0 if everything worked, non-zero otherwise. - */ -static int alloc_tree_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 num_bytes, u64 parent, u64 root_objectid, - struct btrfs_disk_key *key, int level, - u64 empty_size, u64 hint_byte, u64 search_end, - struct btrfs_key *ins) -{ - int ret; - u64 flags = 0; - - ret = btrfs_reserve_extent(trans, root, num_bytes, num_bytes, - empty_size, hint_byte, search_end, - ins, 0); - if (ret) - return ret; - - if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { - if (parent == 0) - parent = ins->objectid; - flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; - } else - BUG_ON(parent > 0); - - if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { - struct btrfs_delayed_extent_op *extent_op; - extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); - BUG_ON(!extent_op); - if (key) - memcpy(&extent_op->key, key, sizeof(extent_op->key)); - else - memset(&extent_op->key, 0, sizeof(extent_op->key)); - extent_op->flags_to_set = flags; - extent_op->update_key = 1; - extent_op->update_flags = 1; - extent_op->is_data = 0; - - ret = btrfs_add_delayed_tree_ref(trans, ins->objectid, - ins->offset, parent, root_objectid, - level, BTRFS_ADD_DELAYED_EXTENT, - extent_op); - BUG_ON(ret); - } - - if (root_objectid == root->root_key.objectid) { - u64 used; - spin_lock(&root->node_lock); - used = btrfs_root_used(&root->root_item) + num_bytes; - btrfs_set_root_used(&root->root_item, used); - spin_unlock(&root->node_lock); - } - return ret; -} - struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u32 blocksize, @@ -4974,8 +5299,45 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, return buf; } +static struct btrfs_block_rsv * +use_block_rsv(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u32 blocksize) +{ + struct btrfs_block_rsv *block_rsv; + int ret; + + block_rsv = get_block_rsv(trans, root); + + if (block_rsv->size == 0) { + ret = reserve_metadata_bytes(block_rsv, blocksize); + if (ret) + return ERR_PTR(ret); + return block_rsv; + } + + ret = block_rsv_use_bytes(block_rsv, blocksize); + if (!ret) + return block_rsv; + + WARN_ON(1); + printk(KERN_INFO"block_rsv size %llu reserved %llu freed %llu %llu\n", + block_rsv->size, block_rsv->reserved, + block_rsv->freed[0], block_rsv->freed[1]); + + return ERR_PTR(-ENOSPC); +} + +static void unuse_block_rsv(struct btrfs_block_rsv *block_rsv, u32 blocksize) +{ + block_rsv_add_bytes(block_rsv, blocksize, 0); + block_rsv_release_bytes(block_rsv, NULL, 0); +} + /* - * helper function to allocate a block for a given tree + * finds a free extent and does all the dirty work required for allocation + * returns the key for the extent through ins, and a tree buffer for + * the first block of the extent through buf. + * * returns the tree buffer or NULL. 
*/ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, @@ -4985,18 +5347,53 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, u64 hint, u64 empty_size) { struct btrfs_key ins; - int ret; + struct btrfs_block_rsv *block_rsv; struct extent_buffer *buf; + u64 flags = 0; + int ret; + - ret = alloc_tree_block(trans, root, blocksize, parent, root_objectid, - key, level, empty_size, hint, (u64)-1, &ins); + block_rsv = use_block_rsv(trans, root, blocksize); + if (IS_ERR(block_rsv)) + return ERR_CAST(block_rsv); + + ret = btrfs_reserve_extent(trans, root, blocksize, blocksize, + empty_size, hint, (u64)-1, &ins, 0); if (ret) { - BUG_ON(ret > 0); + unuse_block_rsv(block_rsv, blocksize); return ERR_PTR(ret); } buf = btrfs_init_new_buffer(trans, root, ins.objectid, blocksize, level); + BUG_ON(IS_ERR(buf)); + + if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { + if (parent == 0) + parent = ins.objectid; + flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; + } else + BUG_ON(parent > 0); + + if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { + struct btrfs_delayed_extent_op *extent_op; + extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); + BUG_ON(!extent_op); + if (key) + memcpy(&extent_op->key, key, sizeof(extent_op->key)); + else + memset(&extent_op->key, 0, sizeof(extent_op->key)); + extent_op->flags_to_set = flags; + extent_op->update_key = 1; + extent_op->update_flags = 1; + extent_op->is_data = 0; + + ret = btrfs_add_delayed_tree_ref(trans, ins.objectid, + ins.offset, parent, root_objectid, + level, BTRFS_ADD_DELAYED_EXTENT, + extent_op); + BUG_ON(ret); + } return buf; } @@ -5321,7 +5718,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct walk_control *wc) { - int ret = 0; + int ret; int level = wc->level; struct extent_buffer *eb = path->nodes[level]; u64 parent = 0; @@ -5399,13 +5796,11 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, btrfs_header_owner(path->nodes[level + 1])); } - ret = btrfs_free_extent(trans, root, eb->start, eb->len, parent, - root->root_key.objectid, level, 0); - BUG_ON(ret); + btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1); out: wc->refs[level] = 0; wc->flags[level] = 0; - return ret; + return 0; } static noinline int walk_down_tree(struct btrfs_trans_handle *trans, @@ -5483,7 +5878,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, * also make sure backrefs for the shared block and all lower level * blocks are properly updated. 
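In the btrfs_alloc_free_block() hunk above, the new tree block is charged to a block reservation through use_block_rsv() before btrfs_reserve_extent() runs, and unuse_block_rsv() hands the bytes back if the extent allocation fails. A heavily simplified stand-alone model of that consume/refund pattern; the real unuse_block_rsv() also releases anything above the reservation's target size, which this sketch ignores.

#include <stdint.h>
#include <stdio.h>

/* toy stand-in for struct btrfs_block_rsv */
struct toy_rsv {
        uint64_t size;          /* target amount the rsv wants to hold */
        uint64_t reserved;      /* bytes currently held */
};

/* consume bytes from the reservation, like block_rsv_use_bytes() */
static int rsv_use(struct toy_rsv *r, uint64_t bytes)
{
        if (r->reserved < bytes)
                return -1;      /* the kernel would return -ENOSPC */
        r->reserved -= bytes;
        return 0;
}

/* refund a consumed reservation, like unuse_block_rsv() (simplified) */
static void rsv_unuse(struct toy_rsv *r, uint64_t bytes)
{
        r->reserved += bytes;
}

int main(void)
{
        struct toy_rsv r = { .size = 16384, .reserved = 16384 };
        uint64_t blocksize = 4096;

        if (rsv_use(&r, blocksize) == 0) {
                /* pretend btrfs_reserve_extent() failed: give the bytes back */
                rsv_unuse(&r, blocksize);
        }
        printf("reserved after refund: %llu\n", (unsigned long long)r.reserved);
        return 0;
}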
*/ -int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref) +int btrfs_drop_snapshot(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, int update_ref) { struct btrfs_path *path; struct btrfs_trans_handle *trans; @@ -5501,7 +5897,9 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref) wc = kzalloc(sizeof(*wc), GFP_NOFS); BUG_ON(!wc); - trans = btrfs_start_transaction(tree_root, 1); + trans = btrfs_start_transaction(tree_root, 0); + if (block_rsv) + trans->block_rsv = block_rsv; if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { level = btrfs_header_level(root->node); @@ -5589,22 +5987,16 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref) } BUG_ON(wc->level == 0); - if (trans->transaction->in_commit || - trans->transaction->delayed_refs.flushing) { + if (btrfs_should_end_transaction(trans, tree_root)) { ret = btrfs_update_root(trans, tree_root, &root->root_key, root_item); BUG_ON(ret); - btrfs_end_transaction(trans, tree_root); - trans = btrfs_start_transaction(tree_root, 1); - } else { - unsigned long update; - update = trans->delayed_ref_updates; - trans->delayed_ref_updates = 0; - if (update) - btrfs_run_delayed_refs(trans, tree_root, - update); + btrfs_end_transaction_throttle(trans, tree_root); + trans = btrfs_start_transaction(tree_root, 0); + if (block_rsv) + trans->block_rsv = block_rsv; } } btrfs_release_path(root, path); @@ -5632,7 +6024,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref) kfree(root); } out: - btrfs_end_transaction(trans, tree_root); + btrfs_end_transaction_throttle(trans, tree_root); kfree(wc); btrfs_free_path(path); return err; @@ -7228,48 +7620,80 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) return flags; } -static int __alloc_chunk_for_shrink(struct btrfs_root *root, - struct btrfs_block_group_cache *shrink_block_group, - int force) +static int set_block_group_ro(struct btrfs_block_group_cache *cache) { - struct btrfs_trans_handle *trans; - u64 new_alloc_flags; - u64 calc; + struct btrfs_space_info *sinfo = cache->space_info; + u64 num_bytes; + int ret = -ENOSPC; - spin_lock(&shrink_block_group->lock); - if (btrfs_block_group_used(&shrink_block_group->item) + - shrink_block_group->reserved > 0) { - spin_unlock(&shrink_block_group->lock); + if (cache->ro) + return 0; - trans = btrfs_start_transaction(root, 1); - spin_lock(&shrink_block_group->lock); + spin_lock(&sinfo->lock); + spin_lock(&cache->lock); + num_bytes = cache->key.offset - cache->reserved - cache->pinned - + cache->bytes_super - btrfs_block_group_used(&cache->item); + + if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned + + sinfo->bytes_may_use + sinfo->bytes_readonly + + cache->reserved_pinned + num_bytes < sinfo->total_bytes) { + sinfo->bytes_readonly += num_bytes; + sinfo->bytes_reserved += cache->reserved_pinned; + cache->reserved_pinned = 0; + cache->ro = 1; + ret = 0; + } + spin_unlock(&cache->lock); + spin_unlock(&sinfo->lock); + return ret; +} - new_alloc_flags = update_block_group_flags(root, - shrink_block_group->flags); - if (new_alloc_flags != shrink_block_group->flags) { - calc = - btrfs_block_group_used(&shrink_block_group->item); - } else { - calc = shrink_block_group->key.offset; - } - spin_unlock(&shrink_block_group->lock); +int btrfs_set_block_group_ro(struct btrfs_root *root, + struct btrfs_block_group_cache *cache) - do_chunk_alloc(trans, root->fs_info->extent_root, - calc + 2 * 1024 * 1024, new_alloc_flags, force); +{ + struct btrfs_trans_handle 
*trans; + u64 alloc_flags; + int ret; - btrfs_end_transaction(trans, root); - } else - spin_unlock(&shrink_block_group->lock); - return 0; -} + BUG_ON(cache->ro); + + trans = btrfs_join_transaction(root, 1); + BUG_ON(IS_ERR(trans)); + alloc_flags = update_block_group_flags(root, cache->flags); + if (alloc_flags != cache->flags) + do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1); -int btrfs_prepare_block_group_relocation(struct btrfs_root *root, - struct btrfs_block_group_cache *group) + ret = set_block_group_ro(cache); + if (!ret) + goto out; + alloc_flags = get_alloc_profile(root, cache->space_info->flags); + ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1); + if (ret < 0) + goto out; + ret = set_block_group_ro(cache); +out: + btrfs_end_transaction(trans, root); + return ret; +} +int btrfs_set_block_group_rw(struct btrfs_root *root, + struct btrfs_block_group_cache *cache) { - __alloc_chunk_for_shrink(root, group, 1); - set_block_group_readonly(group); + struct btrfs_space_info *sinfo = cache->space_info; + u64 num_bytes; + + BUG_ON(!cache->ro); + + spin_lock(&sinfo->lock); + spin_lock(&cache->lock); + num_bytes = cache->key.offset - cache->reserved - cache->pinned - + cache->bytes_super - btrfs_block_group_used(&cache->item); + sinfo->bytes_readonly -= num_bytes; + cache->ro = 0; + spin_unlock(&cache->lock); + spin_unlock(&sinfo->lock); return 0; } @@ -7436,17 +7860,33 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) */ synchronize_rcu(); + release_global_block_rsv(info); + while(!list_empty(&info->space_info)) { space_info = list_entry(info->space_info.next, struct btrfs_space_info, list); - + if (space_info->bytes_pinned > 0 || + space_info->bytes_reserved > 0) { + WARN_ON(1); + dump_space_info(space_info, 0, 0); + } list_del(&space_info->list); kfree(space_info); } return 0; } +static void __link_block_group(struct btrfs_space_info *space_info, + struct btrfs_block_group_cache *cache) +{ + int index = get_block_group_index(cache); + + down_write(&space_info->groups_sem); + list_add_tail(&cache->list, &space_info->block_groups[index]); + up_write(&space_info->groups_sem); +} + int btrfs_read_block_groups(struct btrfs_root *root) { struct btrfs_path *path; @@ -7468,10 +7908,8 @@ int btrfs_read_block_groups(struct btrfs_root *root) while (1) { ret = find_first_block_group(root, path, &key); - if (ret > 0) { - ret = 0; - goto error; - } + if (ret > 0) + break; if (ret != 0) goto error; @@ -7480,7 +7918,7 @@ int btrfs_read_block_groups(struct btrfs_root *root) cache = kzalloc(sizeof(*cache), GFP_NOFS); if (!cache) { ret = -ENOMEM; - break; + goto error; } atomic_set(&cache->count, 1); @@ -7537,20 +7975,36 @@ int btrfs_read_block_groups(struct btrfs_root *root) BUG_ON(ret); cache->space_info = space_info; spin_lock(&cache->space_info->lock); - cache->space_info->bytes_super += cache->bytes_super; + cache->space_info->bytes_readonly += cache->bytes_super; spin_unlock(&cache->space_info->lock); - down_write(&space_info->groups_sem); - list_add_tail(&cache->list, &space_info->block_groups); - up_write(&space_info->groups_sem); + __link_block_group(space_info, cache); ret = btrfs_add_block_group_cache(root->fs_info, cache); BUG_ON(ret); set_avail_alloc_bits(root->fs_info, cache->flags); if (btrfs_chunk_readonly(root, cache->key.objectid)) - set_block_group_readonly(cache); + set_block_group_ro(cache); } + + list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) { + if (!(get_alloc_profile(root, space_info->flags) & + 
(BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_DUP))) + continue; + /* + * avoid allocating from un-mirrored block group if there are + * mirrored block groups. + */ + list_for_each_entry(cache, &space_info->block_groups[3], list) + set_block_group_ro(cache); + list_for_each_entry(cache, &space_info->block_groups[4], list) + set_block_group_ro(cache); + } + + init_global_block_rsv(info); ret = 0; error: btrfs_free_path(path); @@ -7611,12 +8065,10 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, BUG_ON(ret); spin_lock(&cache->space_info->lock); - cache->space_info->bytes_super += cache->bytes_super; + cache->space_info->bytes_readonly += cache->bytes_super; spin_unlock(&cache->space_info->lock); - down_write(&cache->space_info->groups_sem); - list_add_tail(&cache->list, &cache->space_info->block_groups); - up_write(&cache->space_info->groups_sem); + __link_block_group(cache->space_info, cache); ret = btrfs_add_block_group_cache(root->fs_info, cache); BUG_ON(ret); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d2d03684fab2..d74e6af9b53a 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -135,7 +135,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask) return state; } -static void free_extent_state(struct extent_state *state) +void free_extent_state(struct extent_state *state) { if (!state) return; @@ -335,21 +335,18 @@ static int merge_state(struct extent_io_tree *tree, } static int set_state_cb(struct extent_io_tree *tree, - struct extent_state *state, - unsigned long bits) + struct extent_state *state, int *bits) { if (tree->ops && tree->ops->set_bit_hook) { return tree->ops->set_bit_hook(tree->mapping->host, - state->start, state->end, - state->state, bits); + state, bits); } return 0; } static void clear_state_cb(struct extent_io_tree *tree, - struct extent_state *state, - unsigned long bits) + struct extent_state *state, int *bits) { if (tree->ops && tree->ops->clear_bit_hook) tree->ops->clear_bit_hook(tree->mapping->host, state, bits); @@ -367,9 +364,10 @@ static void clear_state_cb(struct extent_io_tree *tree, */ static int insert_state(struct extent_io_tree *tree, struct extent_state *state, u64 start, u64 end, - int bits) + int *bits) { struct rb_node *node; + int bits_to_set = *bits & ~EXTENT_CTLBITS; int ret; if (end < start) { @@ -384,9 +382,9 @@ static int insert_state(struct extent_io_tree *tree, if (ret) return ret; - if (bits & EXTENT_DIRTY) + if (bits_to_set & EXTENT_DIRTY) tree->dirty_bytes += end - start + 1; - state->state |= bits; + state->state |= bits_to_set; node = tree_insert(&tree->state, end, &state->rb_node); if (node) { struct extent_state *found; @@ -456,13 +454,13 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, * struct is freed and removed from the tree */ static int clear_state_bit(struct extent_io_tree *tree, - struct extent_state *state, int bits, int wake, - int delete) + struct extent_state *state, + int *bits, int wake) { - int bits_to_clear = bits & ~EXTENT_DO_ACCOUNTING; + int bits_to_clear = *bits & ~EXTENT_CTLBITS; int ret = state->state & bits_to_clear; - if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { + if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { u64 range = state->end - state->start + 1; WARN_ON(range > tree->dirty_bytes); tree->dirty_bytes -= range; @@ -471,9 +469,8 @@ static int clear_state_bit(struct extent_io_tree *tree, state->state &= ~bits_to_clear; if (wake) wake_up(&state->wq); - if 
(delete || state->state == 0) { + if (state->state == 0) { if (state->tree) { - clear_state_cb(tree, state, state->state); rb_erase(&state->rb_node, &tree->state); state->tree = NULL; free_extent_state(state); @@ -514,6 +511,10 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int set = 0; int clear = 0; + if (delete) + bits |= ~EXTENT_CTLBITS; + bits |= EXTENT_FIRST_DELALLOC; + if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY)) clear = 1; again: @@ -580,8 +581,7 @@ hit_next: if (err) goto out; if (state->end <= end) { - set |= clear_state_bit(tree, state, bits, wake, - delete); + set |= clear_state_bit(tree, state, &bits, wake); if (last_end == (u64)-1) goto out; start = last_end + 1; @@ -602,7 +602,7 @@ hit_next: if (wake) wake_up(&state->wq); - set |= clear_state_bit(tree, prealloc, bits, wake, delete); + set |= clear_state_bit(tree, prealloc, &bits, wake); prealloc = NULL; goto out; @@ -613,7 +613,7 @@ hit_next: else next_node = NULL; - set |= clear_state_bit(tree, state, bits, wake, delete); + set |= clear_state_bit(tree, state, &bits, wake); if (last_end == (u64)-1) goto out; start = last_end + 1; @@ -706,19 +706,19 @@ out: static int set_state_bits(struct extent_io_tree *tree, struct extent_state *state, - int bits) + int *bits) { int ret; + int bits_to_set = *bits & ~EXTENT_CTLBITS; ret = set_state_cb(tree, state, bits); if (ret) return ret; - - if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { + if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { u64 range = state->end - state->start + 1; tree->dirty_bytes += range; } - state->state |= bits; + state->state |= bits_to_set; return 0; } @@ -745,10 +745,9 @@ static void cache_state(struct extent_state *state, * [start, end] is inclusive This takes the tree lock. 
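In the extent_io.c hunks above, set_state_bits() and clear_state_bit() mask EXTENT_CTLBITS out of whatever the caller requested, so control bits such as EXTENT_DO_ACCOUNTING and EXTENT_FIRST_DELALLOC are visible to the set/clear hooks (which now receive "int *bits" and may modify them) but never land in state->state. A small stand-alone illustration of that masking; EXTENT_DO_ACCOUNTING, EXTENT_FIRST_DELALLOC and EXTENT_CTLBITS follow the extent_io.h hunk further down, while the EXTENT_DELALLOC value here is only illustrative.

#include <stdio.h>

#define EXTENT_DELALLOC         (1 << 5)        /* illustrative value */
#define EXTENT_DO_ACCOUNTING    (1 << 11)
#define EXTENT_FIRST_DELALLOC   (1 << 12)
#define EXTENT_CTLBITS          (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)

/* only non-control bits are stored; control bits exist for the hooks alone */
static int stored_bits(int requested)
{
        return requested & ~EXTENT_CTLBITS;
}

int main(void)
{
        int bits = EXTENT_DELALLOC | EXTENT_FIRST_DELALLOC | EXTENT_DO_ACCOUNTING;

        printf("requested 0x%x, stored 0x%x\n", bits, stored_bits(bits));
        return 0;
}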
*/ -static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - int bits, int exclusive_bits, u64 *failed_start, - struct extent_state **cached_state, - gfp_t mask) +int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + int bits, int exclusive_bits, u64 *failed_start, + struct extent_state **cached_state, gfp_t mask) { struct extent_state *state; struct extent_state *prealloc = NULL; @@ -757,6 +756,7 @@ static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u64 last_start; u64 last_end; + bits |= EXTENT_FIRST_DELALLOC; again: if (!prealloc && (mask & __GFP_WAIT)) { prealloc = alloc_extent_state(mask); @@ -778,7 +778,7 @@ again: */ node = tree_search(tree, start); if (!node) { - err = insert_state(tree, prealloc, start, end, bits); + err = insert_state(tree, prealloc, start, end, &bits); prealloc = NULL; BUG_ON(err == -EEXIST); goto out; @@ -802,7 +802,7 @@ hit_next: goto out; } - err = set_state_bits(tree, state, bits); + err = set_state_bits(tree, state, &bits); if (err) goto out; @@ -852,7 +852,7 @@ hit_next: if (err) goto out; if (state->end <= end) { - err = set_state_bits(tree, state, bits); + err = set_state_bits(tree, state, &bits); if (err) goto out; cache_state(state, cached_state); @@ -877,7 +877,7 @@ hit_next: else this_end = last_start - 1; err = insert_state(tree, prealloc, start, this_end, - bits); + &bits); BUG_ON(err == -EEXIST); if (err) { prealloc = NULL; @@ -903,7 +903,7 @@ hit_next: err = split_state(tree, state, prealloc, end + 1); BUG_ON(err == -EEXIST); - err = set_state_bits(tree, prealloc, bits); + err = set_state_bits(tree, prealloc, &bits); if (err) { prealloc = NULL; goto out; @@ -966,8 +966,7 @@ int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, { return clear_extent_bit(tree, start, end, EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING, 0, 0, - NULL, mask); + EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask); } int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, @@ -1435,9 +1434,6 @@ int extent_clear_unlock_delalloc(struct inode *inode, if (op & EXTENT_CLEAR_DELALLOC) clear_bits |= EXTENT_DELALLOC; - if (op & EXTENT_CLEAR_ACCOUNTING) - clear_bits |= EXTENT_DO_ACCOUNTING; - clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS); if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK | @@ -1916,7 +1912,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num, if (tree->ops && tree->ops->submit_bio_hook) tree->ops->submit_bio_hook(page->mapping->host, rw, bio, - mirror_num, bio_flags); + mirror_num, bio_flags, start); else submit_bio(rw, bio); if (bio_flagged(bio, BIO_EOPNOTSUPP)) @@ -2020,6 +2016,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree, sector_t sector; struct extent_map *em; struct block_device *bdev; + struct btrfs_ordered_extent *ordered; int ret; int nr = 0; size_t page_offset = 0; @@ -2031,7 +2028,15 @@ static int __extent_read_full_page(struct extent_io_tree *tree, set_page_extent_mapped(page); end = page_end; - lock_extent(tree, start, end, GFP_NOFS); + while (1) { + lock_extent(tree, start, end, GFP_NOFS); + ordered = btrfs_lookup_ordered_extent(inode, start); + if (!ordered) + break; + unlock_extent(tree, start, end, GFP_NOFS); + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + } if (page->index == last_byte >> PAGE_CACHE_SHIFT) { char *userpage; @@ -2589,7 +2594,6 @@ int extent_write_full_page(struct extent_io_tree *tree, 
struct page *page, .sync_io = wbc->sync_mode == WB_SYNC_ALL, }; struct writeback_control wbc_writepages = { - .bdi = wbc->bdi, .sync_mode = wbc->sync_mode, .older_than_this = NULL, .nr_to_write = 64, @@ -2623,7 +2627,6 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, .sync_io = mode == WB_SYNC_ALL, }; struct writeback_control wbc_writepages = { - .bdi = inode->i_mapping->backing_dev_info, .sync_mode = mode, .older_than_this = NULL, .nr_to_write = nr_pages * 2, diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index bbab4813646f..5691c7b590da 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -16,7 +16,9 @@ #define EXTENT_BOUNDARY (1 << 9) #define EXTENT_NODATASUM (1 << 10) #define EXTENT_DO_ACCOUNTING (1 << 11) +#define EXTENT_FIRST_DELALLOC (1 << 12) #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) +#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) /* flags for bio submission */ #define EXTENT_BIO_COMPRESSED 1 @@ -47,7 +49,7 @@ struct extent_state; typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw, struct bio *bio, int mirror_num, - unsigned long bio_flags); + unsigned long bio_flags, u64 bio_offset); struct extent_io_ops { int (*fill_delalloc)(struct inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, @@ -69,10 +71,10 @@ struct extent_io_ops { struct extent_state *state); int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end, struct extent_state *state, int uptodate); - int (*set_bit_hook)(struct inode *inode, u64 start, u64 end, - unsigned long old, unsigned long bits); + int (*set_bit_hook)(struct inode *inode, struct extent_state *state, + int *bits); int (*clear_bit_hook)(struct inode *inode, struct extent_state *state, - unsigned long bits); + int *bits); int (*merge_extent_hook)(struct inode *inode, struct extent_state *new, struct extent_state *other); @@ -176,6 +178,7 @@ u64 count_range_bits(struct extent_io_tree *tree, u64 *start, u64 search_end, u64 max_bytes, unsigned long bits); +void free_extent_state(struct extent_state *state); int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits, int filled, struct extent_state *cached_state); int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, @@ -185,6 +188,9 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, int bits, gfp_t mask); +int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + int bits, int exclusive_bits, u64 *failed_start, + struct extent_state **cached_state, gfp_t mask); int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 54a255065aa3..a562a250ae77 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -149,13 +149,14 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, } -int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, - struct bio *bio, u32 *dst) +static int __btrfs_lookup_bio_sums(struct btrfs_root *root, + struct inode *inode, struct bio *bio, + u64 logical_offset, u32 *dst, int dio) { u32 sum; struct bio_vec *bvec = bio->bi_io_vec; int bio_index = 0; - u64 offset; + u64 offset = 0; u64 item_start_offset = 0; u64 item_last_offset = 0; u64 disk_bytenr; @@ -174,8 +175,11 @@ int btrfs_lookup_bio_sums(struct btrfs_root 
*root, struct inode *inode, WARN_ON(bio->bi_vcnt <= 0); disk_bytenr = (u64)bio->bi_sector << 9; + if (dio) + offset = logical_offset; while (bio_index < bio->bi_vcnt) { - offset = page_offset(bvec->bv_page) + bvec->bv_offset; + if (!dio) + offset = page_offset(bvec->bv_page) + bvec->bv_offset; ret = btrfs_find_ordered_sum(inode, offset, disk_bytenr, &sum); if (ret == 0) goto found; @@ -238,6 +242,7 @@ found: else set_state_private(io_tree, offset, sum); disk_bytenr += bvec->bv_len; + offset += bvec->bv_len; bio_index++; bvec++; } @@ -245,6 +250,18 @@ found: return 0; } +int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, + struct bio *bio, u32 *dst) +{ + return __btrfs_lookup_bio_sums(root, inode, bio, 0, dst, 0); +} + +int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode, + struct bio *bio, u64 offset, u32 *dst) +{ + return __btrfs_lookup_bio_sums(root, inode, bio, offset, dst, 1); +} + int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct list_head *list) { @@ -657,6 +674,9 @@ again: goto found; } ret = PTR_ERR(item); + if (ret != -EFBIG && ret != -ENOENT) + goto fail_unlock; + if (ret == -EFBIG) { u32 item_size; /* we found one, but it isn't big enough yet */ diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 29ff749ff4ca..e354c33df082 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -46,32 +46,42 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, int write_bytes, struct page **prepared_pages, - const char __user *buf) + struct iov_iter *i) { - long page_fault = 0; - int i; + size_t copied; + int pg = 0; int offset = pos & (PAGE_CACHE_SIZE - 1); - for (i = 0; i < num_pages && write_bytes > 0; i++, offset = 0) { + while (write_bytes > 0) { size_t count = min_t(size_t, PAGE_CACHE_SIZE - offset, write_bytes); - struct page *page = prepared_pages[i]; - fault_in_pages_readable(buf, count); + struct page *page = prepared_pages[pg]; +again: + if (unlikely(iov_iter_fault_in_readable(i, count))) + return -EFAULT; /* Copy data from userspace to the current page */ - kmap(page); - page_fault = __copy_from_user(page_address(page) + offset, - buf, count); + copied = iov_iter_copy_from_user(page, i, offset, count); + /* Flush processor's dcache for this page */ flush_dcache_page(page); - kunmap(page); - buf += count; - write_bytes -= count; + iov_iter_advance(i, copied); + write_bytes -= copied; - if (page_fault) - break; + if (unlikely(copied == 0)) { + count = min_t(size_t, PAGE_CACHE_SIZE - offset, + iov_iter_single_seg_count(i)); + goto again; + } + + if (unlikely(copied < PAGE_CACHE_SIZE - offset)) { + offset += copied; + } else { + pg++; + offset = 0; + } } - return page_fault ? -EFAULT : 0; + return 0; } /* @@ -126,8 +136,7 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans, end_of_last_block = start_pos + num_bytes - 1; err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, NULL); - if (err) - return err; + BUG_ON(err); for (i = 0; i < num_pages; i++) { struct page *p = pages[i]; @@ -142,7 +151,7 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans, * at this time. 
*/ } - return err; + return 0; } /* @@ -823,45 +832,46 @@ again: return 0; } -static ssize_t btrfs_file_write(struct file *file, const char __user *buf, - size_t count, loff_t *ppos) +static ssize_t btrfs_file_aio_write(struct kiocb *iocb, + const struct iovec *iov, + unsigned long nr_segs, loff_t pos) { - loff_t pos; + struct file *file = iocb->ki_filp; + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct page *pinned[2]; + struct page **pages = NULL; + struct iov_iter i; + loff_t *ppos = &iocb->ki_pos; loff_t start_pos; ssize_t num_written = 0; ssize_t err = 0; + size_t count; + size_t ocount; int ret = 0; - struct inode *inode = fdentry(file)->d_inode; - struct btrfs_root *root = BTRFS_I(inode)->root; - struct page **pages = NULL; int nrptrs; - struct page *pinned[2]; unsigned long first_index; unsigned long last_index; int will_write; + int buffered = 0; will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) || (file->f_flags & O_DIRECT)); - nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE, - PAGE_CACHE_SIZE / (sizeof(struct page *))); pinned[0] = NULL; pinned[1] = NULL; - pos = *ppos; start_pos = pos; vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); - /* do the reserve before the mutex lock in case we have to do some - * flushing. We wouldn't deadlock, but this is more polite. - */ - err = btrfs_reserve_metadata_for_delalloc(root, inode, 1); - if (err) - goto out_nolock; - mutex_lock(&inode->i_mutex); + err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ); + if (err) + goto out; + count = ocount; + current->backing_dev_info = inode->i_mapping->backing_dev_info; err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); if (err) @@ -875,15 +885,53 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, goto out; file_update_time(file); + BTRFS_I(inode)->sequence++; + + if (unlikely(file->f_flags & O_DIRECT)) { + num_written = generic_file_direct_write(iocb, iov, &nr_segs, + pos, ppos, count, + ocount); + /* + * the generic O_DIRECT will update in-memory i_size after the + * DIOs are done. But our endio handlers that update the on + * disk i_size never update past the in memory i_size. So we + * need one more update here to catch any additions to the + * file + */ + if (inode->i_size != BTRFS_I(inode)->disk_i_size) { + btrfs_ordered_update_i_size(inode, inode->i_size, NULL); + mark_inode_dirty(inode); + } + if (num_written < 0) { + ret = num_written; + num_written = 0; + goto out; + } else if (num_written == count) { + /* pick up pos changes done by the generic code */ + pos = *ppos; + goto out; + } + /* + * We are going to do buffered for the rest of the range, so we + * need to make sure to invalidate the buffered pages when we're + * done. 
+ */ + buffered = 1; + pos += num_written; + } + + iov_iter_init(&i, iov, nr_segs, count, num_written); + nrptrs = min((iov_iter_count(&i) + PAGE_CACHE_SIZE - 1) / + PAGE_CACHE_SIZE, PAGE_CACHE_SIZE / + (sizeof(struct page *))); pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); /* generic_write_checks can change our pos */ start_pos = pos; - BTRFS_I(inode)->sequence++; first_index = pos >> PAGE_CACHE_SHIFT; - last_index = (pos + count) >> PAGE_CACHE_SHIFT; + last_index = (pos + iov_iter_count(&i)) >> PAGE_CACHE_SHIFT; /* * there are lots of better ways to do this, but this code @@ -900,7 +948,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, unlock_page(pinned[0]); } } - if ((pos + count) & (PAGE_CACHE_SIZE - 1)) { + if ((pos + iov_iter_count(&i)) & (PAGE_CACHE_SIZE - 1)) { pinned[1] = grab_cache_page(inode->i_mapping, last_index); if (!PageUptodate(pinned[1])) { ret = btrfs_readpage(NULL, pinned[1]); @@ -911,10 +959,10 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, } } - while (count > 0) { + while (iov_iter_count(&i) > 0) { size_t offset = pos & (PAGE_CACHE_SIZE - 1); - size_t write_bytes = min(count, nrptrs * - (size_t)PAGE_CACHE_SIZE - + size_t write_bytes = min(iov_iter_count(&i), + nrptrs * (size_t)PAGE_CACHE_SIZE - offset); size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; @@ -922,7 +970,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, WARN_ON(num_pages > nrptrs); memset(pages, 0, sizeof(struct page *) * nrptrs); - ret = btrfs_check_data_free_space(root, inode, write_bytes); + ret = btrfs_delalloc_reserve_space(inode, write_bytes); if (ret) goto out; @@ -930,26 +978,20 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, pos, first_index, last_index, write_bytes); if (ret) { - btrfs_free_reserved_data_space(root, inode, - write_bytes); + btrfs_delalloc_release_space(inode, write_bytes); goto out; } ret = btrfs_copy_from_user(pos, num_pages, - write_bytes, pages, buf); - if (ret) { - btrfs_free_reserved_data_space(root, inode, - write_bytes); - btrfs_drop_pages(pages, num_pages); - goto out; + write_bytes, pages, &i); + if (ret == 0) { + dirty_and_release_pages(NULL, root, file, pages, + num_pages, pos, write_bytes); } - ret = dirty_and_release_pages(NULL, root, file, pages, - num_pages, pos, write_bytes); btrfs_drop_pages(pages, num_pages); if (ret) { - btrfs_free_reserved_data_space(root, inode, - write_bytes); + btrfs_delalloc_release_space(inode, write_bytes); goto out; } @@ -965,8 +1007,6 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, btrfs_throttle(root); } - buf += write_bytes; - count -= write_bytes; pos += write_bytes; num_written += write_bytes; @@ -976,9 +1016,7 @@ out: mutex_unlock(&inode->i_mutex); if (ret) err = ret; - btrfs_unreserve_metadata_for_delalloc(root, inode, 1); -out_nolock: kfree(pages); if (pinned[0]) page_cache_release(pinned[0]); @@ -1008,7 +1046,7 @@ out_nolock: num_written = err; if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) { - trans = btrfs_start_transaction(root, 1); + trans = btrfs_start_transaction(root, 0); ret = btrfs_log_dentry_safe(trans, root, file->f_dentry); if (ret == 0) { @@ -1023,7 +1061,7 @@ out_nolock: btrfs_end_transaction(trans, root); } } - if (file->f_flags & O_DIRECT) { + if (file->f_flags & O_DIRECT && buffered) { invalidate_mapping_pages(inode->i_mapping, start_pos >> PAGE_CACHE_SHIFT, (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT); @@ 
-1063,8 +1101,9 @@ int btrfs_release_file(struct inode *inode, struct file *filp) * important optimization for directories because holding the mutex prevents * new operations on the dir while we write to disk. */ -int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) +int btrfs_sync_file(struct file *file, int datasync) { + struct dentry *dentry = file->f_path.dentry; struct inode *inode = dentry->d_inode; struct btrfs_root *root = BTRFS_I(inode)->root; int ret = 0; @@ -1101,12 +1140,12 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) /* * ok we haven't committed the transaction yet, lets do a commit */ - if (file && file->private_data) + if (file->private_data) btrfs_ioctl_trans_end(file); - trans = btrfs_start_transaction(root, 1); - if (!trans) { - ret = -ENOMEM; + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); goto out; } @@ -1151,17 +1190,25 @@ static const struct vm_operations_struct btrfs_file_vm_ops = { static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) { - vma->vm_ops = &btrfs_file_vm_ops; + struct address_space *mapping = filp->f_mapping; + + if (!mapping->a_ops->readpage) + return -ENOEXEC; + file_accessed(filp); + vma->vm_ops = &btrfs_file_vm_ops; + vma->vm_flags |= VM_CAN_NONLINEAR; + return 0; } const struct file_operations btrfs_file_operations = { .llseek = generic_file_llseek, .read = do_sync_read, + .write = do_sync_write, .aio_read = generic_file_aio_read, .splice_read = generic_file_splice_read, - .write = btrfs_file_write, + .aio_write = btrfs_file_aio_write, .mmap = btrfs_file_mmap, .open = generic_file_open, .release = btrfs_release_file, diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 72ce3c173d6a..64f1150bb48d 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -49,6 +49,33 @@ static int find_name_in_backref(struct btrfs_path *path, const char *name, return 0; } +struct btrfs_inode_ref * +btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, int mod) +{ + struct btrfs_key key; + struct btrfs_inode_ref *ref; + int ins_len = mod < 0 ? 
-1 : 0; + int cow = mod != 0; + int ret; + + key.objectid = inode_objectid; + key.type = BTRFS_INODE_REF_KEY; + key.offset = ref_objectid; + + ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); + if (ret < 0) + return ERR_PTR(ret); + if (ret > 0) + return NULL; + if (!find_name_in_backref(path, name, name_len, &ref)) + return NULL; + return ref; +} + int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, int name_len, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d601629b85d1..1bff92ad4744 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -252,6 +252,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, inline_len, compressed_size, compressed_pages); BUG_ON(ret); + btrfs_delalloc_release_metadata(inode, end + 1 - start); btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); return 0; } @@ -414,6 +415,7 @@ again: trans = btrfs_join_transaction(root, 1); BUG_ON(!trans); btrfs_set_trans_block_group(trans, inode); + trans->block_rsv = &root->fs_info->delalloc_block_rsv; /* lets try to make an inline extent */ if (ret || total_in < (actual_end - start)) { @@ -439,7 +441,6 @@ again: start, end, NULL, EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | EXTENT_CLEAR_DELALLOC | - EXTENT_CLEAR_ACCOUNTING | EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK); btrfs_end_transaction(trans, root); @@ -697,6 +698,38 @@ retry: return 0; } +static u64 get_extent_allocation_hint(struct inode *inode, u64 start, + u64 num_bytes) +{ + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_map *em; + u64 alloc_hint = 0; + + read_lock(&em_tree->lock); + em = search_extent_mapping(em_tree, start, num_bytes); + if (em) { + /* + * if block start isn't an actual block number then find the + * first block in this inode and use that as a hint. If that + * block is also bogus then just don't worry about it. + */ + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + free_extent_map(em); + em = search_extent_mapping(em_tree, 0, 0); + if (em && em->block_start < EXTENT_MAP_LAST_BYTE) + alloc_hint = em->block_start; + if (em) + free_extent_map(em); + } else { + alloc_hint = em->block_start; + free_extent_map(em); + } + } + read_unlock(&em_tree->lock); + + return alloc_hint; +} + /* * when extent_io.c finds a delayed allocation range in the file, * the call backs end up in this code. The basic idea is to @@ -734,6 +767,7 @@ static noinline int cow_file_range(struct inode *inode, trans = btrfs_join_transaction(root, 1); BUG_ON(!trans); btrfs_set_trans_block_group(trans, inode); + trans->block_rsv = &root->fs_info->delalloc_block_rsv; actual_end = min_t(u64, isize, end + 1); @@ -753,7 +787,6 @@ static noinline int cow_file_range(struct inode *inode, EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC | - EXTENT_CLEAR_ACCOUNTING | EXTENT_CLEAR_DIRTY | EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK); @@ -769,29 +802,7 @@ static noinline int cow_file_range(struct inode *inode, BUG_ON(disk_num_bytes > btrfs_super_total_bytes(&root->fs_info->super_copy)); - - read_lock(&BTRFS_I(inode)->extent_tree.lock); - em = search_extent_mapping(&BTRFS_I(inode)->extent_tree, - start, num_bytes); - if (em) { - /* - * if block start isn't an actual block number then find the - * first block in this inode and use that as a hint. If that - * block is also bogus then just don't worry about it. 
- */ - if (em->block_start >= EXTENT_MAP_LAST_BYTE) { - free_extent_map(em); - em = search_extent_mapping(em_tree, 0, 0); - if (em && em->block_start < EXTENT_MAP_LAST_BYTE) - alloc_hint = em->block_start; - if (em) - free_extent_map(em); - } else { - alloc_hint = em->block_start; - free_extent_map(em); - } - } - read_unlock(&BTRFS_I(inode)->extent_tree.lock); + alloc_hint = get_extent_allocation_hint(inode, start, num_bytes); btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); while (disk_num_bytes > 0) { @@ -1174,6 +1185,13 @@ out_check: num_bytes, num_bytes, type); BUG_ON(ret); + if (root->root_key.objectid == + BTRFS_DATA_RELOC_TREE_OBJECTID) { + ret = btrfs_reloc_clone_csums(inode, cur_offset, + num_bytes); + BUG_ON(ret); + } + extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, cur_offset, cur_offset + num_bytes - 1, locked_page, EXTENT_CLEAR_UNLOCK_PAGE | @@ -1226,15 +1244,13 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page, } static int btrfs_split_extent_hook(struct inode *inode, - struct extent_state *orig, u64 split) + struct extent_state *orig, u64 split) { + /* not delalloc, ignore it */ if (!(orig->state & EXTENT_DELALLOC)) return 0; - spin_lock(&BTRFS_I(inode)->accounting_lock); - BTRFS_I(inode)->outstanding_extents++; - spin_unlock(&BTRFS_I(inode)->accounting_lock); - + atomic_inc(&BTRFS_I(inode)->outstanding_extents); return 0; } @@ -1252,10 +1268,7 @@ static int btrfs_merge_extent_hook(struct inode *inode, if (!(other->state & EXTENT_DELALLOC)) return 0; - spin_lock(&BTRFS_I(inode)->accounting_lock); - BTRFS_I(inode)->outstanding_extents--; - spin_unlock(&BTRFS_I(inode)->accounting_lock); - + atomic_dec(&BTRFS_I(inode)->outstanding_extents); return 0; } @@ -1264,8 +1277,8 @@ static int btrfs_merge_extent_hook(struct inode *inode, * bytes in this file, and to maintain the list of inodes that * have pending delalloc work to be done. 
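The rewritten hooks below track outstanding delalloc extents per inode with an atomic counter: the first state record touched by a set_extent_bit() call carries EXTENT_FIRST_DELALLOC and is skipped, every further record in the range increments the counter, splits increment it, merges decrement it, and a clear that carries EXTENT_DO_ACCOUNTING releases the metadata reservation instead of decrementing. A toy single-threaded model of that counting; the struct and function names are made up for illustration, and the kernel uses atomic_inc()/atomic_dec() on an atomic_t rather than a plain int.

#include <stdio.h>

struct toy_inode {
        int outstanding_extents;        /* kernel: atomic_t in struct btrfs_inode */
};

/* delalloc bit set on one extent_state; "first" mirrors EXTENT_FIRST_DELALLOC */
static void set_delalloc(struct toy_inode *ino, int *first)
{
        if (*first)
                *first = 0;                     /* first record: not counted again */
        else
                ino->outstanding_extents++;     /* extra record in the same range */
}

static void split_delalloc(struct toy_inode *ino) { ino->outstanding_extents++; }
static void merge_delalloc(struct toy_inode *ino) { ino->outstanding_extents--; }

int main(void)
{
        struct toy_inode ino = { 0 };
        int first = 1;

        set_delalloc(&ino, &first);     /* counter stays 0 */
        split_delalloc(&ino);           /* a split adds one */
        merge_delalloc(&ino);           /* a merge takes it back */
        printf("outstanding: %d\n", ino.outstanding_extents);
        return 0;
}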
*/ -static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, - unsigned long old, unsigned long bits) +static int btrfs_set_bit_hook(struct inode *inode, + struct extent_state *state, int *bits) { /* @@ -1273,17 +1286,18 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, * but in this case, we are only testeing for the DELALLOC * bit, which is only set or cleared with irqs on */ - if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { + if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { struct btrfs_root *root = BTRFS_I(inode)->root; + u64 len = state->end + 1 - state->start; - spin_lock(&BTRFS_I(inode)->accounting_lock); - BTRFS_I(inode)->outstanding_extents++; - spin_unlock(&BTRFS_I(inode)->accounting_lock); - btrfs_delalloc_reserve_space(root, inode, end - start + 1); + if (*bits & EXTENT_FIRST_DELALLOC) + *bits &= ~EXTENT_FIRST_DELALLOC; + else + atomic_inc(&BTRFS_I(inode)->outstanding_extents); spin_lock(&root->fs_info->delalloc_lock); - BTRFS_I(inode)->delalloc_bytes += end - start + 1; - root->fs_info->delalloc_bytes += end - start + 1; + BTRFS_I(inode)->delalloc_bytes += len; + root->fs_info->delalloc_bytes += len; if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) { list_add_tail(&BTRFS_I(inode)->delalloc_inodes, &root->fs_info->delalloc_inodes); @@ -1297,45 +1311,32 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, * extent_io.c clear_bit_hook, see set_bit_hook for why */ static int btrfs_clear_bit_hook(struct inode *inode, - struct extent_state *state, unsigned long bits) + struct extent_state *state, int *bits) { /* * set_bit and clear bit hooks normally require _irqsave/restore * but in this case, we are only testeing for the DELALLOC * bit, which is only set or cleared with irqs on */ - if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { + if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { struct btrfs_root *root = BTRFS_I(inode)->root; + u64 len = state->end + 1 - state->start; - if (bits & EXTENT_DO_ACCOUNTING) { - spin_lock(&BTRFS_I(inode)->accounting_lock); - WARN_ON(!BTRFS_I(inode)->outstanding_extents); - BTRFS_I(inode)->outstanding_extents--; - spin_unlock(&BTRFS_I(inode)->accounting_lock); - btrfs_unreserve_metadata_for_delalloc(root, inode, 1); - } + if (*bits & EXTENT_FIRST_DELALLOC) + *bits &= ~EXTENT_FIRST_DELALLOC; + else if (!(*bits & EXTENT_DO_ACCOUNTING)) + atomic_dec(&BTRFS_I(inode)->outstanding_extents); + + if (*bits & EXTENT_DO_ACCOUNTING) + btrfs_delalloc_release_metadata(inode, len); + + if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) + btrfs_free_reserved_data_space(inode, len); spin_lock(&root->fs_info->delalloc_lock); - if (state->end - state->start + 1 > - root->fs_info->delalloc_bytes) { - printk(KERN_INFO "btrfs warning: delalloc account " - "%llu %llu\n", - (unsigned long long) - state->end - state->start + 1, - (unsigned long long) - root->fs_info->delalloc_bytes); - btrfs_delalloc_free_space(root, inode, (u64)-1); - root->fs_info->delalloc_bytes = 0; - BTRFS_I(inode)->delalloc_bytes = 0; - } else { - btrfs_delalloc_free_space(root, inode, - state->end - - state->start + 1); - root->fs_info->delalloc_bytes -= state->end - - state->start + 1; - BTRFS_I(inode)->delalloc_bytes -= state->end - - state->start + 1; - } + root->fs_info->delalloc_bytes -= len; + BTRFS_I(inode)->delalloc_bytes -= len; + if (BTRFS_I(inode)->delalloc_bytes == 0 && !list_empty(&BTRFS_I(inode)->delalloc_inodes)) { 
list_del_init(&BTRFS_I(inode)->delalloc_inodes); @@ -1384,7 +1385,8 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset, */ static int __btrfs_submit_bio_start(struct inode *inode, int rw, struct bio *bio, int mirror_num, - unsigned long bio_flags) + unsigned long bio_flags, + u64 bio_offset) { struct btrfs_root *root = BTRFS_I(inode)->root; int ret = 0; @@ -1403,7 +1405,8 @@ static int __btrfs_submit_bio_start(struct inode *inode, int rw, * are inserted into the btree */ static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio, - int mirror_num, unsigned long bio_flags) + int mirror_num, unsigned long bio_flags, + u64 bio_offset) { struct btrfs_root *root = BTRFS_I(inode)->root; return btrfs_map_bio(root, rw, bio, mirror_num, 1); @@ -1414,7 +1417,8 @@ static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio, * on write, or reading the csums from the tree before a read */ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, - int mirror_num, unsigned long bio_flags) + int mirror_num, unsigned long bio_flags, + u64 bio_offset) { struct btrfs_root *root = BTRFS_I(inode)->root; int ret = 0; @@ -1439,7 +1443,8 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, /* we're doing a write, do the async checksumming */ return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, inode, rw, bio, mirror_num, - bio_flags, __btrfs_submit_bio_start, + bio_flags, bio_offset, + __btrfs_submit_bio_start, __btrfs_submit_bio_done); } @@ -1520,6 +1525,7 @@ again: goto again; } + BUG(); btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state); ClearPageChecked(page); out: @@ -1650,7 +1656,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) { struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_trans_handle *trans; + struct btrfs_trans_handle *trans = NULL; struct btrfs_ordered_extent *ordered_extent = NULL; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_state *cached_state = NULL; @@ -1668,9 +1674,10 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); if (!ret) { trans = btrfs_join_transaction(root, 1); + btrfs_set_trans_block_group(trans, inode); + trans->block_rsv = &root->fs_info->delalloc_block_rsv; ret = btrfs_update_inode(trans, root, inode); BUG_ON(ret); - btrfs_end_transaction(trans, root); } goto out; } @@ -1680,6 +1687,8 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) 0, &cached_state, GFP_NOFS); trans = btrfs_join_transaction(root, 1); + btrfs_set_trans_block_group(trans, inode); + trans->block_rsv = &root->fs_info->delalloc_block_rsv; if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) compressed = 1; @@ -1711,12 +1720,13 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) add_pending_csums(trans, inode, ordered_extent->file_offset, &ordered_extent->list); - /* this also removes the ordered extent from the tree */ btrfs_ordered_update_i_size(inode, 0, ordered_extent); ret = btrfs_update_inode(trans, root, inode); BUG_ON(ret); - btrfs_end_transaction(trans, root); out: + btrfs_delalloc_release_metadata(inode, ordered_extent->len); + if (trans) + btrfs_end_transaction(trans, root); /* once for us */ btrfs_put_ordered_extent(ordered_extent); /* once for the tree */ @@ -1838,7 +1848,7 @@ 
static int btrfs_io_failed_hook(struct bio *failed_bio, BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio, failrec->last_mirror, - failrec->bio_flags); + failrec->bio_flags, 0); return 0; } @@ -1993,32 +2003,196 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root) } /* + * calculate extra metadata reservation when snapshotting a subvolume + * contains orphan files. + */ +void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans, + struct btrfs_pending_snapshot *pending, + u64 *bytes_to_reserve) +{ + struct btrfs_root *root; + struct btrfs_block_rsv *block_rsv; + u64 num_bytes; + int index; + + root = pending->root; + if (!root->orphan_block_rsv || list_empty(&root->orphan_list)) + return; + + block_rsv = root->orphan_block_rsv; + + /* orphan block reservation for the snapshot */ + num_bytes = block_rsv->size; + + /* + * after the snapshot is created, COWing tree blocks may use more + * space than it frees. So we should make sure there is enough + * reserved space. + */ + index = trans->transid & 0x1; + if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) { + num_bytes += block_rsv->size - + (block_rsv->reserved + block_rsv->freed[index]); + } + + *bytes_to_reserve += num_bytes; +} + +void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans, + struct btrfs_pending_snapshot *pending) +{ + struct btrfs_root *root = pending->root; + struct btrfs_root *snap = pending->snap; + struct btrfs_block_rsv *block_rsv; + u64 num_bytes; + int index; + int ret; + + if (!root->orphan_block_rsv || list_empty(&root->orphan_list)) + return; + + /* refill source subvolume's orphan block reservation */ + block_rsv = root->orphan_block_rsv; + index = trans->transid & 0x1; + if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) { + num_bytes = block_rsv->size - + (block_rsv->reserved + block_rsv->freed[index]); + ret = btrfs_block_rsv_migrate(&pending->block_rsv, + root->orphan_block_rsv, + num_bytes); + BUG_ON(ret); + } + + /* setup orphan block reservation for the snapshot */ + block_rsv = btrfs_alloc_block_rsv(snap); + BUG_ON(!block_rsv); + + btrfs_add_durable_block_rsv(root->fs_info, block_rsv); + snap->orphan_block_rsv = block_rsv; + + num_bytes = root->orphan_block_rsv->size; + ret = btrfs_block_rsv_migrate(&pending->block_rsv, + block_rsv, num_bytes); + BUG_ON(ret); + +#if 0 + /* insert orphan item for the snapshot */ + WARN_ON(!root->orphan_item_inserted); + ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root, + snap->root_key.objectid); + BUG_ON(ret); + snap->orphan_item_inserted = 1; +#endif +} + +enum btrfs_orphan_cleanup_state { + ORPHAN_CLEANUP_STARTED = 1, + ORPHAN_CLEANUP_DONE = 2, +}; + +/* + * This is called in transaction commmit time. If there are no orphan + * files in the subvolume, it removes orphan item and frees block_rsv + * structure. 
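btrfs_orphan_pre_snapshot() above sizes the extra reservation as the source subvolume's orphan block_rsv size plus any current shortfall, because COW after the snapshot may temporarily consume more space than it frees. A standalone sketch of just that arithmetic (not from the patch; the struct is simplified and the freed[] array is collapsed to a single field):

#include <stdio.h>

typedef unsigned long long u64;

struct toy_block_rsv {
	u64 size;	/* target reservation, in bytes */
	u64 reserved;	/* bytes currently reserved */
	u64 freed;	/* bytes freed back in the current transaction slot */
};

static u64 snapshot_orphan_reserve(const struct toy_block_rsv *rsv)
{
	u64 need = rsv->size;	/* orphan reservation for the snapshot itself */

	/* top up the source reservation if it is currently short */
	if (rsv->reserved + rsv->freed < rsv->size)
		need += rsv->size - (rsv->reserved + rsv->freed);
	return need;
}

int main(void)
{
	struct toy_block_rsv rsv = {
		.size = 1 << 20, .reserved = 512 << 10, .freed = 128 << 10,
	};

	printf("bytes to reserve: %llu\n", snapshot_orphan_reserve(&rsv));
	return 0;
}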
+ */ +void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + int ret; + + if (!list_empty(&root->orphan_list) || + root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) + return; + + if (root->orphan_item_inserted && + btrfs_root_refs(&root->root_item) > 0) { + ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root, + root->root_key.objectid); + BUG_ON(ret); + root->orphan_item_inserted = 0; + } + + if (root->orphan_block_rsv) { + WARN_ON(root->orphan_block_rsv->size > 0); + btrfs_free_block_rsv(root, root->orphan_block_rsv); + root->orphan_block_rsv = NULL; + } +} + +/* * This creates an orphan entry for the given inode in case something goes * wrong in the middle of an unlink/truncate. + * + * NOTE: caller of this function should reserve 5 units of metadata for + * this function. */ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; - int ret = 0; + struct btrfs_block_rsv *block_rsv = NULL; + int reserve = 0; + int insert = 0; + int ret; - spin_lock(&root->list_lock); + if (!root->orphan_block_rsv) { + block_rsv = btrfs_alloc_block_rsv(root); + BUG_ON(!block_rsv); + } - /* already on the orphan list, we're good */ - if (!list_empty(&BTRFS_I(inode)->i_orphan)) { - spin_unlock(&root->list_lock); - return 0; + spin_lock(&root->orphan_lock); + if (!root->orphan_block_rsv) { + root->orphan_block_rsv = block_rsv; + } else if (block_rsv) { + btrfs_free_block_rsv(root, block_rsv); + block_rsv = NULL; + } + + if (list_empty(&BTRFS_I(inode)->i_orphan)) { + list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); +#if 0 + /* + * For proper ENOSPC handling, we should do orphan + * cleanup when mounting. But this introduces backward + * compatibility issue. 
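btrfs_orphan_add() above (its body continues below) takes root->orphan_lock once, links the inode onto the in-memory orphan list at most once and marks the metadata reservation at most once, then performs the actual reservation and the on-disk item insert outside the lock. A small standalone model of that add-once pattern (not from the patch; the lock and flags are stand-ins):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct toy_inode {
	bool on_orphan_list;	/* linked into the root's orphan list */
	bool meta_reserved;	/* metadata reserved for the orphan item */
};

static pthread_mutex_t orphan_lock = PTHREAD_MUTEX_INITIALIZER;

static void orphan_add(struct toy_inode *inode, bool *insert_item,
		       bool *reserve_meta)
{
	*insert_item = false;
	*reserve_meta = false;

	pthread_mutex_lock(&orphan_lock);
	if (!inode->on_orphan_list) {
		inode->on_orphan_list = true;
		*insert_item = true;	/* first add: create the on-disk item */
	}
	if (!inode->meta_reserved) {
		inode->meta_reserved = true;
		*reserve_meta = true;	/* reserve once per orphaned inode */
	}
	pthread_mutex_unlock(&orphan_lock);
}

int main(void)
{
	struct toy_inode inode = { 0 };
	bool insert_item, reserve_meta;

	orphan_add(&inode, &insert_item, &reserve_meta);
	printf("insert=%d reserve=%d\n", insert_item, reserve_meta);
	orphan_add(&inode, &insert_item, &reserve_meta);	/* second call is a no-op */
	printf("insert=%d reserve=%d\n", insert_item, reserve_meta);
	return 0;
}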
+ */ + if (!xchg(&root->orphan_item_inserted, 1)) + insert = 2; + else + insert = 1; +#endif + insert = 1; + } else { + WARN_ON(!BTRFS_I(inode)->orphan_meta_reserved); } - list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); + if (!BTRFS_I(inode)->orphan_meta_reserved) { + BTRFS_I(inode)->orphan_meta_reserved = 1; + reserve = 1; + } + spin_unlock(&root->orphan_lock); - spin_unlock(&root->list_lock); + if (block_rsv) + btrfs_add_durable_block_rsv(root->fs_info, block_rsv); - /* - * insert an orphan item to track this unlinked/truncated file - */ - ret = btrfs_insert_orphan_item(trans, root, inode->i_ino); + /* grab metadata reservation from transaction handle */ + if (reserve) { + ret = btrfs_orphan_reserve_metadata(trans, inode); + BUG_ON(ret); + } - return ret; + /* insert an orphan item to track this unlinked/truncated file */ + if (insert >= 1) { + ret = btrfs_insert_orphan_item(trans, root, inode->i_ino); + BUG_ON(ret); + } + + /* insert an orphan item to track subvolume contains orphan files */ + if (insert >= 2) { + ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root, + root->root_key.objectid); + BUG_ON(ret); + } + return 0; } /* @@ -2028,26 +2202,31 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; + int delete_item = 0; + int release_rsv = 0; int ret = 0; - spin_lock(&root->list_lock); - - if (list_empty(&BTRFS_I(inode)->i_orphan)) { - spin_unlock(&root->list_lock); - return 0; + spin_lock(&root->orphan_lock); + if (!list_empty(&BTRFS_I(inode)->i_orphan)) { + list_del_init(&BTRFS_I(inode)->i_orphan); + delete_item = 1; } - list_del_init(&BTRFS_I(inode)->i_orphan); - if (!trans) { - spin_unlock(&root->list_lock); - return 0; + if (BTRFS_I(inode)->orphan_meta_reserved) { + BTRFS_I(inode)->orphan_meta_reserved = 0; + release_rsv = 1; } + spin_unlock(&root->orphan_lock); - spin_unlock(&root->list_lock); + if (trans && delete_item) { + ret = btrfs_del_orphan_item(trans, root, inode->i_ino); + BUG_ON(ret); + } - ret = btrfs_del_orphan_item(trans, root, inode->i_ino); + if (release_rsv) + btrfs_orphan_release_metadata(inode); - return ret; + return 0; } /* @@ -2064,7 +2243,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root) struct inode *inode; int ret = 0, nr_unlink = 0, nr_truncate = 0; - if (!xchg(&root->clean_orphans, 0)) + if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED)) return; path = btrfs_alloc_path(); @@ -2117,16 +2296,15 @@ void btrfs_orphan_cleanup(struct btrfs_root *root) found_key.type = BTRFS_INODE_ITEM_KEY; found_key.offset = 0; inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL); - if (IS_ERR(inode)) - break; + BUG_ON(IS_ERR(inode)); /* * add this inode to the orphan list so btrfs_orphan_del does * the proper thing when we hit it */ - spin_lock(&root->list_lock); + spin_lock(&root->orphan_lock); list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); - spin_unlock(&root->list_lock); + spin_unlock(&root->orphan_lock); /* * if this is a bad inode, means we actually succeeded in @@ -2135,7 +2313,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root) * do a destroy_inode */ if (is_bad_inode(inode)) { - trans = btrfs_start_transaction(root, 1); + trans = btrfs_start_transaction(root, 0); btrfs_orphan_del(trans, inode); btrfs_end_transaction(trans, root); iput(inode); @@ -2153,13 +2331,23 @@ void btrfs_orphan_cleanup(struct btrfs_root *root) /* this will do delete_inode and 
everything for us */ iput(inode); } + btrfs_free_path(path); + + root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE; + + if (root->orphan_block_rsv) + btrfs_block_rsv_release(root, root->orphan_block_rsv, + (u64)-1); + + if (root->orphan_block_rsv || root->orphan_item_inserted) { + trans = btrfs_join_transaction(root, 1); + btrfs_end_transaction(trans, root); + } if (nr_unlink) printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink); if (nr_truncate) printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate); - - btrfs_free_path(path); } /* @@ -2478,29 +2666,201 @@ out: return ret; } -static int btrfs_unlink(struct inode *dir, struct dentry *dentry) +/* helper to check if there is any shared block in the path */ +static int check_path_shared(struct btrfs_root *root, + struct btrfs_path *path) +{ + struct extent_buffer *eb; + int level; + int ret; + u64 refs = 1; + + for (level = 0; level < BTRFS_MAX_LEVEL; level++) { + if (!path->nodes[level]) + break; + eb = path->nodes[level]; + if (!btrfs_block_can_be_shared(root, eb)) + continue; + ret = btrfs_lookup_extent_info(NULL, root, eb->start, eb->len, + &refs, NULL); + if (refs > 1) + return 1; + } + return 0; +} + +/* + * helper to start transaction for unlink and rmdir. + * + * unlink and rmdir are special in btrfs, they do not always free space. + * so in enospc case, we should make sure they will free space before + * allowing them to use the global metadata reservation. + */ +static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir, + struct dentry *dentry) { - struct btrfs_root *root; struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_path *path; + struct btrfs_inode_ref *ref; + struct btrfs_dir_item *di; struct inode *inode = dentry->d_inode; + u64 index; + int check_link = 1; + int err = -ENOSPC; int ret; - unsigned long nr = 0; - root = BTRFS_I(dir)->root; + trans = btrfs_start_transaction(root, 10); + if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC) + return trans; - /* - * 5 items for unlink inode - * 1 for orphan - */ - ret = btrfs_reserve_metadata_space(root, 6); - if (ret) - return ret; + if (inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) + return ERR_PTR(-ENOSPC); + + /* check if there is someone else holds reference */ + if (S_ISDIR(inode->i_mode) && atomic_read(&inode->i_count) > 1) + return ERR_PTR(-ENOSPC); + + if (atomic_read(&inode->i_count) > 2) + return ERR_PTR(-ENOSPC); + + if (xchg(&root->fs_info->enospc_unlink, 1)) + return ERR_PTR(-ENOSPC); - trans = btrfs_start_transaction(root, 1); + path = btrfs_alloc_path(); + if (!path) { + root->fs_info->enospc_unlink = 0; + return ERR_PTR(-ENOMEM); + } + + trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { - btrfs_unreserve_metadata_space(root, 6); - return PTR_ERR(trans); + btrfs_free_path(path); + root->fs_info->enospc_unlink = 0; + return trans; + } + + path->skip_locking = 1; + path->search_commit_root = 1; + + ret = btrfs_lookup_inode(trans, root, path, + &BTRFS_I(dir)->location, 0); + if (ret < 0) { + err = ret; + goto out; + } + if (ret == 0) { + if (check_path_shared(root, path)) + goto out; + } else { + check_link = 0; + } + btrfs_release_path(root, path); + + ret = btrfs_lookup_inode(trans, root, path, + &BTRFS_I(inode)->location, 0); + if (ret < 0) { + err = ret; + goto out; + } + if (ret == 0) { + if (check_path_shared(root, path)) + goto out; + } else { + check_link = 0; + } + btrfs_release_path(root, path); + + if (ret == 0 && S_ISREG(inode->i_mode)) { + ret = 
btrfs_lookup_file_extent(trans, root, path, + inode->i_ino, (u64)-1, 0); + if (ret < 0) { + err = ret; + goto out; + } + BUG_ON(ret == 0); + if (check_path_shared(root, path)) + goto out; + btrfs_release_path(root, path); + } + + if (!check_link) { + err = 0; + goto out; + } + + di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino, + dentry->d_name.name, dentry->d_name.len, 0); + if (IS_ERR(di)) { + err = PTR_ERR(di); + goto out; + } + if (di) { + if (check_path_shared(root, path)) + goto out; + } else { + err = 0; + goto out; } + btrfs_release_path(root, path); + + ref = btrfs_lookup_inode_ref(trans, root, path, + dentry->d_name.name, dentry->d_name.len, + inode->i_ino, dir->i_ino, 0); + if (IS_ERR(ref)) { + err = PTR_ERR(ref); + goto out; + } + BUG_ON(!ref); + if (check_path_shared(root, path)) + goto out; + index = btrfs_inode_ref_index(path->nodes[0], ref); + btrfs_release_path(root, path); + + di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, index, + dentry->d_name.name, dentry->d_name.len, 0); + if (IS_ERR(di)) { + err = PTR_ERR(di); + goto out; + } + BUG_ON(ret == -ENOENT); + if (check_path_shared(root, path)) + goto out; + + err = 0; +out: + btrfs_free_path(path); + if (err) { + btrfs_end_transaction(trans, root); + root->fs_info->enospc_unlink = 0; + return ERR_PTR(err); + } + + trans->block_rsv = &root->fs_info->global_block_rsv; + return trans; +} + +static void __unlink_end_trans(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + if (trans->block_rsv == &root->fs_info->global_block_rsv) { + BUG_ON(!root->fs_info->enospc_unlink); + root->fs_info->enospc_unlink = 0; + } + btrfs_end_transaction_throttle(trans, root); +} + +static int btrfs_unlink(struct inode *dir, struct dentry *dentry) +{ + struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_trans_handle *trans; + struct inode *inode = dentry->d_inode; + int ret; + unsigned long nr = 0; + + trans = __unlink_start_trans(dir, dentry); + if (IS_ERR(trans)) + return PTR_ERR(trans); btrfs_set_trans_block_group(trans, dir); @@ -2508,14 +2868,15 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, dentry->d_name.name, dentry->d_name.len); + BUG_ON(ret); - if (inode->i_nlink == 0) + if (inode->i_nlink == 0) { ret = btrfs_orphan_add(trans, inode); + BUG_ON(ret); + } nr = trans->blocks_used; - - btrfs_end_transaction_throttle(trans, root); - btrfs_unreserve_metadata_space(root, 6); + __unlink_end_trans(trans, root); btrfs_btree_balance_dirty(root, nr); return ret; } @@ -2587,7 +2948,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) { struct inode *inode = dentry->d_inode; int err = 0; - int ret; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_trans_handle *trans; unsigned long nr = 0; @@ -2596,15 +2956,9 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) return -ENOTEMPTY; - ret = btrfs_reserve_metadata_space(root, 5); - if (ret) - return ret; - - trans = btrfs_start_transaction(root, 1); - if (IS_ERR(trans)) { - btrfs_unreserve_metadata_space(root, 5); + trans = __unlink_start_trans(dir, dentry); + if (IS_ERR(trans)) return PTR_ERR(trans); - } btrfs_set_trans_block_group(trans, dir); @@ -2627,12 +2981,9 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) btrfs_i_size_write(inode, 0); out: nr = trans->blocks_used; - ret = btrfs_end_transaction_throttle(trans, root); - btrfs_unreserve_metadata_space(root, 5); + 
__unlink_end_trans(trans, root); btrfs_btree_balance_dirty(root, nr); - if (ret && !err) - err = ret; return err; } @@ -3029,6 +3380,7 @@ out: if (pending_del_nr) { ret = btrfs_del_items(trans, root, path, pending_del_slot, pending_del_nr); + BUG_ON(ret); } btrfs_free_path(path); return err; @@ -3056,11 +3408,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from) if ((offset & (blocksize - 1)) == 0) goto out; - ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE); - if (ret) - goto out; - - ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1); + ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); if (ret) goto out; @@ -3068,8 +3416,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from) again: page = grab_cache_page(mapping, index); if (!page) { - btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); - btrfs_unreserve_metadata_for_delalloc(root, inode, 1); + btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); goto out; } @@ -3132,8 +3479,7 @@ again: out_unlock: if (ret) - btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); - btrfs_unreserve_metadata_for_delalloc(root, inode, 1); + btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); unlock_page(page); page_cache_release(page); out: @@ -3145,7 +3491,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size) struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; - struct extent_map *em; + struct extent_map *em = NULL; struct extent_state *cached_state = NULL; u64 mask = root->sectorsize - 1; u64 hole_start = (inode->i_size + mask) & ~mask; @@ -3183,11 +3529,11 @@ int btrfs_cont_expand(struct inode *inode, loff_t size) u64 hint_byte = 0; hole_size = last_byte - cur_offset; - err = btrfs_reserve_metadata_space(root, 2); - if (err) + trans = btrfs_start_transaction(root, 2); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); break; - - trans = btrfs_start_transaction(root, 1); + } btrfs_set_trans_block_group(trans, inode); err = btrfs_drop_extents(trans, inode, cur_offset, @@ -3205,14 +3551,15 @@ int btrfs_cont_expand(struct inode *inode, loff_t size) last_byte - 1, 0); btrfs_end_transaction(trans, root); - btrfs_unreserve_metadata_space(root, 2); } free_extent_map(em); + em = NULL; cur_offset = last_byte; if (cur_offset >= block_end) break; } + free_extent_map(em); unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state, GFP_NOFS); return err; @@ -3239,11 +3586,10 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr) } } - ret = btrfs_reserve_metadata_space(root, 1); - if (ret) - return ret; + trans = btrfs_start_transaction(root, 5); + if (IS_ERR(trans)) + return PTR_ERR(trans); - trans = btrfs_start_transaction(root, 1); btrfs_set_trans_block_group(trans, inode); ret = btrfs_orphan_add(trans, inode); @@ -3251,7 +3597,6 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr) nr = trans->blocks_used; btrfs_end_transaction(trans, root); - btrfs_unreserve_metadata_space(root, 1); btrfs_btree_balance_dirty(root, nr); if (attr->ia_size > inode->i_size) { @@ -3264,8 +3609,11 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr) i_size_write(inode, attr->ia_size); btrfs_ordered_update_i_size(inode, inode->i_size, NULL); - trans = btrfs_start_transaction(root, 1); + trans = btrfs_start_transaction(root, 0); + BUG_ON(IS_ERR(trans)); btrfs_set_trans_block_group(trans, inode); + trans->block_rsv = 
root->orphan_block_rsv; + BUG_ON(!trans->block_rsv); ret = btrfs_update_inode(trans, root, inode); BUG_ON(ret); @@ -3345,10 +3693,21 @@ void btrfs_delete_inode(struct inode *inode) btrfs_i_size_write(inode, 0); while (1) { - trans = btrfs_start_transaction(root, 1); + trans = btrfs_start_transaction(root, 0); + BUG_ON(IS_ERR(trans)); btrfs_set_trans_block_group(trans, inode); - ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); + trans->block_rsv = root->orphan_block_rsv; + + ret = btrfs_block_rsv_check(trans, root, + root->orphan_block_rsv, 0, 5); + if (ret) { + BUG_ON(ret != -EAGAIN); + ret = btrfs_commit_transaction(trans, root); + BUG_ON(ret); + continue; + } + ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); if (ret != -EAGAIN) break; @@ -3356,6 +3715,7 @@ void btrfs_delete_inode(struct inode *inode) btrfs_end_transaction(trans, root); trans = NULL; btrfs_btree_balance_dirty(root, nr); + } if (ret == 0) { @@ -3596,40 +3956,10 @@ again: return 0; } -static noinline void init_btrfs_i(struct inode *inode) -{ - struct btrfs_inode *bi = BTRFS_I(inode); - - bi->generation = 0; - bi->sequence = 0; - bi->last_trans = 0; - bi->last_sub_trans = 0; - bi->logged_trans = 0; - bi->delalloc_bytes = 0; - bi->reserved_bytes = 0; - bi->disk_i_size = 0; - bi->flags = 0; - bi->index_cnt = (u64)-1; - bi->last_unlink_trans = 0; - bi->ordered_data_close = 0; - bi->force_compress = 0; - extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS); - extent_io_tree_init(&BTRFS_I(inode)->io_tree, - inode->i_mapping, GFP_NOFS); - extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree, - inode->i_mapping, GFP_NOFS); - INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes); - INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations); - RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); - btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree); - mutex_init(&BTRFS_I(inode)->log_mutex); -} - static int btrfs_init_locked_inode(struct inode *inode, void *p) { struct btrfs_iget_args *args = p; inode->i_ino = args->ino; - init_btrfs_i(inode); BTRFS_I(inode)->root = args->root; btrfs_set_inode_space_info(args->root, inode); return 0; @@ -3692,8 +4022,6 @@ static struct inode *new_simple_dir(struct super_block *s, if (!inode) return ERR_PTR(-ENOMEM); - init_btrfs_i(inode); - BTRFS_I(inode)->root = root; memcpy(&BTRFS_I(inode)->location, key, sizeof(*key)); BTRFS_I(inode)->dummy_inode = 1; @@ -3950,7 +4278,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc) struct btrfs_trans_handle *trans; int ret = 0; - if (root->fs_info->btree_inode == inode) + if (BTRFS_I(inode)->dummy_inode) return 0; if (wbc->sync_mode == WB_SYNC_ALL) { @@ -3971,10 +4299,38 @@ void btrfs_dirty_inode(struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; + int ret; + + if (BTRFS_I(inode)->dummy_inode) + return; trans = btrfs_join_transaction(root, 1); btrfs_set_trans_block_group(trans, inode); - btrfs_update_inode(trans, root, inode); + + ret = btrfs_update_inode(trans, root, inode); + if (ret && ret == -ENOSPC) { + /* whoops, lets try again with the full transaction */ + btrfs_end_transaction(trans, root); + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + if (printk_ratelimit()) { + printk(KERN_ERR "btrfs: fail to " + "dirty inode %lu error %ld\n", + inode->i_ino, PTR_ERR(trans)); + } + return; + } + btrfs_set_trans_block_group(trans, inode); + + ret = btrfs_update_inode(trans, root, inode); + if (ret) { + if (printk_ratelimit()) { + printk(KERN_ERR 
"btrfs: fail to " + "dirty inode %lu error %d\n", + inode->i_ino, ret); + } + } + } btrfs_end_transaction(trans, root); } @@ -4092,7 +4448,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, * btrfs_get_inode_index_count has an explanation for the magic * number */ - init_btrfs_i(inode); BTRFS_I(inode)->index_cnt = 2; BTRFS_I(inode)->root = root; BTRFS_I(inode)->generation = trans->transid; @@ -4247,26 +4602,21 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, if (!new_valid_dev(rdev)) return -EINVAL; + err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid); + if (err) + return err; + /* * 2 for inode item and ref * 2 for dir items * 1 for xattr if selinux is on */ - err = btrfs_reserve_metadata_space(root, 5); - if (err) - return err; + trans = btrfs_start_transaction(root, 5); + if (IS_ERR(trans)) + return PTR_ERR(trans); - trans = btrfs_start_transaction(root, 1); - if (!trans) - goto fail; btrfs_set_trans_block_group(trans, dir); - err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); - if (err) { - err = -ENOSPC; - goto out_unlock; - } - inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, dentry->d_name.len, dentry->d_parent->d_inode->i_ino, objectid, @@ -4295,13 +4645,11 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, out_unlock: nr = trans->blocks_used; btrfs_end_transaction_throttle(trans, root); -fail: - btrfs_unreserve_metadata_space(root, 5); + btrfs_btree_balance_dirty(root, nr); if (drop_inode) { inode_dec_link_count(inode); iput(inode); } - btrfs_btree_balance_dirty(root, nr); return err; } @@ -4311,32 +4659,26 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(dir)->root; struct inode *inode = NULL; - int err; int drop_inode = 0; + int err; unsigned long nr = 0; u64 objectid; u64 index = 0; + err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid); + if (err) + return err; /* * 2 for inode item and ref * 2 for dir items * 1 for xattr if selinux is on */ - err = btrfs_reserve_metadata_space(root, 5); - if (err) - return err; + trans = btrfs_start_transaction(root, 5); + if (IS_ERR(trans)) + return PTR_ERR(trans); - trans = btrfs_start_transaction(root, 1); - if (!trans) - goto fail; btrfs_set_trans_block_group(trans, dir); - err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); - if (err) { - err = -ENOSPC; - goto out_unlock; - } - inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, dentry->d_name.len, dentry->d_parent->d_inode->i_ino, @@ -4368,8 +4710,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, out_unlock: nr = trans->blocks_used; btrfs_end_transaction_throttle(trans, root); -fail: - btrfs_unreserve_metadata_space(root, 5); if (drop_inode) { inode_dec_link_count(inode); iput(inode); @@ -4396,21 +4736,21 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, if (root->objectid != BTRFS_I(inode)->root->objectid) return -EPERM; - /* - * 1 item for inode ref - * 2 items for dir items - */ - err = btrfs_reserve_metadata_space(root, 3); - if (err) - return err; - btrfs_inc_nlink(inode); err = btrfs_set_inode_index(dir, &index); if (err) goto fail; - trans = btrfs_start_transaction(root, 1); + /* + * 1 item for inode ref + * 2 items for dir items + */ + trans = btrfs_start_transaction(root, 3); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto fail; + } btrfs_set_trans_block_group(trans, dir); 
atomic_inc(&inode->i_count); @@ -4429,7 +4769,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, nr = trans->blocks_used; btrfs_end_transaction_throttle(trans, root); fail: - btrfs_unreserve_metadata_space(root, 3); if (drop_inode) { inode_dec_link_count(inode); iput(inode); @@ -4449,28 +4788,20 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) u64 index = 0; unsigned long nr = 1; + err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid); + if (err) + return err; + /* * 2 items for inode and ref * 2 items for dir items * 1 for xattr if selinux is on */ - err = btrfs_reserve_metadata_space(root, 5); - if (err) - return err; - - trans = btrfs_start_transaction(root, 1); - if (!trans) { - err = -ENOMEM; - goto out_unlock; - } + trans = btrfs_start_transaction(root, 5); + if (IS_ERR(trans)) + return PTR_ERR(trans); btrfs_set_trans_block_group(trans, dir); - err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); - if (err) { - err = -ENOSPC; - goto out_fail; - } - inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, dentry->d_name.len, dentry->d_parent->d_inode->i_ino, objectid, @@ -4510,9 +4841,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) out_fail: nr = trans->blocks_used; btrfs_end_transaction_throttle(trans, root); - -out_unlock: - btrfs_unreserve_metadata_space(root, 5); if (drop_on_err) iput(inode); btrfs_btree_balance_dirty(root, nr); @@ -4770,6 +5098,7 @@ again: } flush_dcache_page(page); } else if (create && PageUptodate(page)) { + WARN_ON(1); if (!trans) { kunmap(page); free_extent_map(em); @@ -4866,11 +5195,651 @@ out: return em; } +static struct extent_map *btrfs_new_extent_direct(struct inode *inode, + u64 start, u64 len) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + struct extent_map *em; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct btrfs_key ins; + u64 alloc_hint; + int ret; + + btrfs_drop_extent_cache(inode, start, start + len - 1, 0); + + trans = btrfs_join_transaction(root, 0); + if (!trans) + return ERR_PTR(-ENOMEM); + + trans->block_rsv = &root->fs_info->delalloc_block_rsv; + + alloc_hint = get_extent_allocation_hint(inode, start, len); + ret = btrfs_reserve_extent(trans, root, len, root->sectorsize, 0, + alloc_hint, (u64)-1, &ins, 1); + if (ret) { + em = ERR_PTR(ret); + goto out; + } + + em = alloc_extent_map(GFP_NOFS); + if (!em) { + em = ERR_PTR(-ENOMEM); + goto out; + } + + em->start = start; + em->orig_start = em->start; + em->len = ins.offset; + + em->block_start = ins.objectid; + em->block_len = ins.offset; + em->bdev = root->fs_info->fs_devices->latest_bdev; + set_bit(EXTENT_FLAG_PINNED, &em->flags); + + while (1) { + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + write_unlock(&em_tree->lock); + if (ret != -EEXIST) + break; + btrfs_drop_extent_cache(inode, start, start + em->len - 1, 0); + } + + ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid, + ins.offset, ins.offset, 0); + if (ret) { + btrfs_free_reserved_extent(root, ins.objectid, ins.offset); + em = ERR_PTR(ret); + } +out: + btrfs_end_transaction(trans, root); + return em; +} + +/* + * returns 1 when the nocow is safe, < 1 on error, 0 if the + * block must be cow'd + */ +static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans, + struct inode *inode, u64 offset, u64 len) +{ + struct btrfs_path *path; + int ret; + struct extent_buffer *leaf; + struct btrfs_root *root = 
BTRFS_I(inode)->root; + struct btrfs_file_extent_item *fi; + struct btrfs_key key; + u64 disk_bytenr; + u64 backref_offset; + u64 extent_end; + u64 num_bytes; + int slot; + int found_type; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino, + offset, 0); + if (ret < 0) + goto out; + + slot = path->slots[0]; + if (ret == 1) { + if (slot == 0) { + /* can't find the item, must cow */ + ret = 0; + goto out; + } + slot--; + } + ret = 0; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid != inode->i_ino || + key.type != BTRFS_EXTENT_DATA_KEY) { + /* not our file or wrong item type, must cow */ + goto out; + } + + if (key.offset > offset) { + /* Wrong offset, must cow */ + goto out; + } + + fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); + found_type = btrfs_file_extent_type(leaf, fi); + if (found_type != BTRFS_FILE_EXTENT_REG && + found_type != BTRFS_FILE_EXTENT_PREALLOC) { + /* not a regular extent, must cow */ + goto out; + } + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + backref_offset = btrfs_file_extent_offset(leaf, fi); + + extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); + if (extent_end < offset + len) { + /* extent doesn't include our full range, must cow */ + goto out; + } + + if (btrfs_extent_readonly(root, disk_bytenr)) + goto out; + + /* + * look for other files referencing this extent, if we + * find any we must cow + */ + if (btrfs_cross_ref_exist(trans, root, inode->i_ino, + key.offset - backref_offset, disk_bytenr)) + goto out; + + /* + * adjust disk_bytenr and num_bytes to cover just the bytes + * in this extent we are about to write. If there + * are any csums in that range we have to cow in order + * to keep the csums correct + */ + disk_bytenr += backref_offset; + disk_bytenr += offset - key.offset; + num_bytes = min(offset + len, extent_end) - offset; + if (csum_exist_in_range(root, disk_bytenr, num_bytes)) + goto out; + /* + * all of the above have passed, it is safe to overwrite this extent + * without cow + */ + ret = 1; +out: + btrfs_free_path(path); + return ret; +} + +static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + struct extent_map *em; + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 start = iblock << inode->i_blkbits; + u64 len = bh_result->b_size; + struct btrfs_trans_handle *trans; + + em = btrfs_get_extent(inode, NULL, 0, start, len, 0); + if (IS_ERR(em)) + return PTR_ERR(em); + + /* + * Ok for INLINE and COMPRESSED extents we need to fallback on buffered + * io. INLINE is special, and we could probably kludge it in here, but + * it's still buffered so for safety lets just fall back to the generic + * buffered path. + * + * For COMPRESSED we _have_ to read the entire extent in so we can + * decompress it, so there will be buffering required no matter what we + * do, so go ahead and fallback to buffered. + * + * We return -ENOTBLK because thats what makes DIO go ahead and go back + * to buffered IO. Don't blame me, this is the price we pay for using + * the generic code. 
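can_nocow_odirect() above decides whether an O_DIRECT write may overwrite an existing extent in place. A condensed standalone version of that decision (not from the patch; toy_extent and its flags are simplified stand-ins for the file-extent item, backref and csum lookups):

#include <stdbool.h>
#include <stdio.h>

typedef unsigned long long u64;

struct toy_extent {
	u64 file_offset;	/* logical start covered by the extent */
	u64 num_bytes;		/* length of the extent */
	bool regular;		/* REG or PREALLOC, not inline/compressed */
	bool readonly;		/* backed by a read-only block group */
	bool shared;		/* referenced by another file or snapshot */
	bool has_csums;		/* checksums recorded for this range */
};

static bool can_overwrite_in_place(const struct toy_extent *ext,
				   u64 offset, u64 len)
{
	if (!ext->regular)
		return false;			/* inline/compressed: must COW */
	if (offset < ext->file_offset ||
	    offset + len > ext->file_offset + ext->num_bytes)
		return false;			/* range not fully covered */
	if (ext->readonly || ext->shared)
		return false;			/* someone else owns the blocks */
	if (ext->has_csums)
		return false;			/* csums would go stale */
	return true;
}

int main(void)
{
	struct toy_extent ext = {
		.file_offset = 0, .num_bytes = 1 << 20,
		.regular = true, .readonly = false,
		.shared = false, .has_csums = false,
	};

	printf("nocow ok: %d\n", can_overwrite_in_place(&ext, 4096, 4096));
	return 0;
}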
+ */ + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) || + em->block_start == EXTENT_MAP_INLINE) { + free_extent_map(em); + return -ENOTBLK; + } + + /* Just a good old fashioned hole, return */ + if (!create && (em->block_start == EXTENT_MAP_HOLE || + test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { + free_extent_map(em); + /* DIO will do one hole at a time, so just unlock a sector */ + unlock_extent(&BTRFS_I(inode)->io_tree, start, + start + root->sectorsize - 1, GFP_NOFS); + return 0; + } + + /* + * We don't allocate a new extent in the following cases + * + * 1) The inode is marked as NODATACOW. In this case we'll just use the + * existing extent. + * 2) The extent is marked as PREALLOC. We're good to go here and can + * just use the extent. + * + */ + if (!create) { + len = em->len - (start - em->start); + goto map; + } + + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || + ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && + em->block_start != EXTENT_MAP_HOLE)) { + int type; + int ret; + u64 block_start; + + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + type = BTRFS_ORDERED_PREALLOC; + else + type = BTRFS_ORDERED_NOCOW; + len = min(len, em->len - (start - em->start)); + block_start = em->block_start + (start - em->start); + + /* + * we're not going to log anything, but we do need + * to make sure the current transaction stays open + * while we look for nocow cross refs + */ + trans = btrfs_join_transaction(root, 0); + if (!trans) + goto must_cow; + + if (can_nocow_odirect(trans, inode, start, len) == 1) { + ret = btrfs_add_ordered_extent_dio(inode, start, + block_start, len, len, type); + btrfs_end_transaction(trans, root); + if (ret) { + free_extent_map(em); + return ret; + } + goto unlock; + } + btrfs_end_transaction(trans, root); + } +must_cow: + /* + * this will cow the extent, reset the len in case we changed + * it above + */ + len = bh_result->b_size; + free_extent_map(em); + em = btrfs_new_extent_direct(inode, start, len); + if (IS_ERR(em)) + return PTR_ERR(em); + len = min(len, em->len - (start - em->start)); +unlock: + clear_extent_bit(&BTRFS_I(inode)->io_tree, start, start + len - 1, + EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DIRTY, 1, + 0, NULL, GFP_NOFS); +map: + bh_result->b_blocknr = (em->block_start + (start - em->start)) >> + inode->i_blkbits; + bh_result->b_size = len; + bh_result->b_bdev = em->bdev; + set_buffer_mapped(bh_result); + if (create && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + set_buffer_new(bh_result); + + free_extent_map(em); + + return 0; +} + +struct btrfs_dio_private { + struct inode *inode; + u64 logical_offset; + u64 disk_bytenr; + u64 bytes; + u32 *csums; + void *private; +}; + +static void btrfs_endio_direct_read(struct bio *bio, int err) +{ + struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1; + struct bio_vec *bvec = bio->bi_io_vec; + struct btrfs_dio_private *dip = bio->bi_private; + struct inode *inode = dip->inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 start; + u32 *private = dip->csums; + + start = dip->logical_offset; + do { + if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { + struct page *page = bvec->bv_page; + char *kaddr; + u32 csum = ~(u32)0; + unsigned long flags; + + local_irq_save(flags); + kaddr = kmap_atomic(page, KM_IRQ0); + csum = btrfs_csum_data(root, kaddr + bvec->bv_offset, + csum, bvec->bv_len); + btrfs_csum_final(csum, (char *)&csum); + kunmap_atomic(kaddr, KM_IRQ0); + local_irq_restore(flags); + + flush_dcache_page(bvec->bv_page); + if (csum != *private) { + printk(KERN_ERR 
"btrfs csum failed ino %lu off" + " %llu csum %u private %u\n", + inode->i_ino, (unsigned long long)start, + csum, *private); + err = -EIO; + } + } + + start += bvec->bv_len; + private++; + bvec++; + } while (bvec <= bvec_end); + + unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset, + dip->logical_offset + dip->bytes - 1, GFP_NOFS); + bio->bi_private = dip->private; + + kfree(dip->csums); + kfree(dip); + dio_end_io(bio, err); +} + +static void btrfs_endio_direct_write(struct bio *bio, int err) +{ + struct btrfs_dio_private *dip = bio->bi_private; + struct inode *inode = dip->inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + struct btrfs_ordered_extent *ordered = NULL; + struct extent_state *cached_state = NULL; + int ret; + + if (err) + goto out_done; + + ret = btrfs_dec_test_ordered_pending(inode, &ordered, + dip->logical_offset, dip->bytes); + if (!ret) + goto out_done; + + BUG_ON(!ordered); + + trans = btrfs_join_transaction(root, 1); + if (!trans) { + err = -ENOMEM; + goto out; + } + trans->block_rsv = &root->fs_info->delalloc_block_rsv; + + if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) { + ret = btrfs_ordered_update_i_size(inode, 0, ordered); + if (!ret) + ret = btrfs_update_inode(trans, root, inode); + err = ret; + goto out; + } + + lock_extent_bits(&BTRFS_I(inode)->io_tree, ordered->file_offset, + ordered->file_offset + ordered->len - 1, 0, + &cached_state, GFP_NOFS); + + if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) { + ret = btrfs_mark_extent_written(trans, inode, + ordered->file_offset, + ordered->file_offset + + ordered->len); + if (ret) { + err = ret; + goto out_unlock; + } + } else { + ret = insert_reserved_file_extent(trans, inode, + ordered->file_offset, + ordered->start, + ordered->disk_len, + ordered->len, + ordered->len, + 0, 0, 0, + BTRFS_FILE_EXTENT_REG); + unpin_extent_cache(&BTRFS_I(inode)->extent_tree, + ordered->file_offset, ordered->len); + if (ret) { + err = ret; + WARN_ON(1); + goto out_unlock; + } + } + + add_pending_csums(trans, inode, ordered->file_offset, &ordered->list); + btrfs_ordered_update_i_size(inode, 0, ordered); + btrfs_update_inode(trans, root, inode); +out_unlock: + unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset, + ordered->file_offset + ordered->len - 1, + &cached_state, GFP_NOFS); +out: + btrfs_delalloc_release_metadata(inode, ordered->len); + btrfs_end_transaction(trans, root); + btrfs_put_ordered_extent(ordered); + btrfs_put_ordered_extent(ordered); +out_done: + bio->bi_private = dip->private; + + kfree(dip->csums); + kfree(dip); + dio_end_io(bio, err); +} + +static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw, + struct bio *bio, int mirror_num, + unsigned long bio_flags, u64 offset) +{ + int ret; + struct btrfs_root *root = BTRFS_I(inode)->root; + ret = btrfs_csum_one_bio(root, inode, bio, offset, 1); + BUG_ON(ret); + return 0; +} + +static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode, + loff_t file_offset) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_dio_private *dip; + struct bio_vec *bvec = bio->bi_io_vec; + u64 start; + int skip_sum; + int write = rw & (1 << BIO_RW); + int ret = 0; + + skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; + + dip = kmalloc(sizeof(*dip), GFP_NOFS); + if (!dip) { + ret = -ENOMEM; + goto free_ordered; + } + dip->csums = NULL; + + if (!skip_sum) { + dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS); + if (!dip->csums) { + ret = -ENOMEM; + 
goto free_ordered; + } + } + + dip->private = bio->bi_private; + dip->inode = inode; + dip->logical_offset = file_offset; + + start = dip->logical_offset; + dip->bytes = 0; + do { + dip->bytes += bvec->bv_len; + bvec++; + } while (bvec <= (bio->bi_io_vec + bio->bi_vcnt - 1)); + + dip->disk_bytenr = (u64)bio->bi_sector << 9; + bio->bi_private = dip; + + if (write) + bio->bi_end_io = btrfs_endio_direct_write; + else + bio->bi_end_io = btrfs_endio_direct_read; + + ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); + if (ret) + goto out_err; + + if (write && !skip_sum) { + ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, + inode, rw, bio, 0, 0, + dip->logical_offset, + __btrfs_submit_bio_start_direct_io, + __btrfs_submit_bio_done); + if (ret) + goto out_err; + return; + } else if (!skip_sum) + btrfs_lookup_bio_sums_dio(root, inode, bio, + dip->logical_offset, dip->csums); + + ret = btrfs_map_bio(root, rw, bio, 0, 1); + if (ret) + goto out_err; + return; +out_err: + kfree(dip->csums); + kfree(dip); +free_ordered: + /* + * If this is a write, we need to clean up the reserved space and kill + * the ordered extent. + */ + if (write) { + struct btrfs_ordered_extent *ordered; + ordered = btrfs_lookup_ordered_extent(inode, + dip->logical_offset); + if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) && + !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) + btrfs_free_reserved_extent(root, ordered->start, + ordered->disk_len); + btrfs_put_ordered_extent(ordered); + btrfs_put_ordered_extent(ordered); + } + bio_endio(bio, ret); +} + +static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb, + const struct iovec *iov, loff_t offset, + unsigned long nr_segs) +{ + int seg; + size_t size; + unsigned long addr; + unsigned blocksize_mask = root->sectorsize - 1; + ssize_t retval = -EINVAL; + loff_t end = offset; + + if (offset & blocksize_mask) + goto out; + + /* Check the memory alignment. Blocks cannot straddle pages */ + for (seg = 0; seg < nr_segs; seg++) { + addr = (unsigned long)iov[seg].iov_base; + size = iov[seg].iov_len; + end += size; + if ((addr & blocksize_mask) || (size & blocksize_mask)) + goto out; + } + retval = 0; +out: + return retval; +} static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs) { - return -EINVAL; + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + struct btrfs_ordered_extent *ordered; + struct extent_state *cached_state = NULL; + u64 lockstart, lockend; + ssize_t ret; + int writing = rw & WRITE; + int write_bits = 0; + size_t count = iov_length(iov, nr_segs); + + if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov, + offset, nr_segs)) { + return 0; + } + + lockstart = offset; + lockend = offset + count - 1; + + if (writing) { + ret = btrfs_delalloc_reserve_space(inode, count); + if (ret) + goto out; + } + + while (1) { + lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, + 0, &cached_state, GFP_NOFS); + /* + * We're concerned with the entire range that we're going to be + * doing DIO to, so we need to make sure theres no ordered + * extents in this range. 
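check_direct_IO() above only lets the request proceed as direct IO when the file offset and every iovec segment (base address and length) are sector aligned; anything else makes btrfs_direct_IO() return 0 so the caller falls back to buffered IO. A standalone version of the same alignment test (not from the patch; 4096 is just an example sector size):

#include <stdio.h>
#include <sys/uio.h>

static int dio_aligned(const struct iovec *iov, unsigned long nr_segs,
		       long long offset, unsigned int sectorsize)
{
	unsigned long mask = sectorsize - 1;	/* sectorsize is a power of two */
	unsigned long seg;

	if (offset & mask)
		return 0;
	for (seg = 0; seg < nr_segs; seg++) {
		if (((unsigned long)iov[seg].iov_base & mask) ||
		    (iov[seg].iov_len & mask))
			return 0;
	}
	return 1;
}

int main(void)
{
	static char buf[8192] __attribute__((aligned(4096)));
	struct iovec iov[2] = {
		{ .iov_base = buf, .iov_len = 4096 },
		{ .iov_base = buf + 4096, .iov_len = 4096 },
	};

	printf("aligned: %d\n", dio_aligned(iov, 2, 0, 4096));
	return 0;
}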
+ */ + ordered = btrfs_lookup_ordered_range(inode, lockstart, + lockend - lockstart + 1); + if (!ordered) + break; + unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, + &cached_state, GFP_NOFS); + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + cond_resched(); + } + + /* + * we don't use btrfs_set_extent_delalloc because we don't want + * the dirty or uptodate bits + */ + if (writing) { + write_bits = EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING; + ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, + EXTENT_DELALLOC, 0, NULL, &cached_state, + GFP_NOFS); + if (ret) { + clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, + lockend, EXTENT_LOCKED | write_bits, + 1, 0, &cached_state, GFP_NOFS); + goto out; + } + } + + free_extent_state(cached_state); + cached_state = NULL; + + ret = __blockdev_direct_IO(rw, iocb, inode, + BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev, + iov, offset, nr_segs, btrfs_get_blocks_direct, NULL, + btrfs_submit_direct, 0); + + if (ret < 0 && ret != -EIOCBQUEUED) { + clear_extent_bit(&BTRFS_I(inode)->io_tree, offset, + offset + iov_length(iov, nr_segs) - 1, + EXTENT_LOCKED | write_bits, 1, 0, + &cached_state, GFP_NOFS); + } else if (ret >= 0 && ret < iov_length(iov, nr_segs)) { + /* + * We're falling back to buffered, unlock the section we didn't + * do IO on. + */ + clear_extent_bit(&BTRFS_I(inode)->io_tree, offset + ret, + offset + iov_length(iov, nr_segs) - 1, + EXTENT_LOCKED | write_bits, 1, 0, + &cached_state, GFP_NOFS); + } +out: + free_extent_state(cached_state); + return ret; } static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, @@ -5034,7 +6003,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) u64 page_start; u64 page_end; - ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE); + ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); if (ret) { if (ret == -ENOMEM) ret = VM_FAULT_OOM; @@ -5043,13 +6012,6 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) goto out; } - ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1); - if (ret) { - btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); - ret = VM_FAULT_SIGBUS; - goto out; - } - ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ again: lock_page(page); @@ -5059,7 +6021,6 @@ again: if ((page->mapping != inode->i_mapping) || (page_start >= size)) { - btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); /* page got truncated out from underneath us */ goto out_unlock; } @@ -5100,7 +6061,6 @@ again: unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS); ret = VM_FAULT_SIGBUS; - btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); goto out_unlock; } ret = 0; @@ -5127,10 +6087,10 @@ again: unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS); out_unlock: - btrfs_unreserve_metadata_for_delalloc(root, inode, 1); if (!ret) return VM_FAULT_LOCKED; unlock_page(page); + btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); out: return ret; } @@ -5155,8 +6115,10 @@ static void btrfs_truncate(struct inode *inode) btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); btrfs_ordered_update_i_size(inode, inode->i_size, NULL); - trans = btrfs_start_transaction(root, 1); + trans = btrfs_start_transaction(root, 0); + BUG_ON(IS_ERR(trans)); btrfs_set_trans_block_group(trans, inode); + trans->block_rsv = root->orphan_block_rsv; /* * setattr is responsible for setting 
the ordered_data_close flag, @@ -5179,6 +6141,23 @@ static void btrfs_truncate(struct inode *inode) btrfs_add_ordered_operation(trans, root, inode); while (1) { + if (!trans) { + trans = btrfs_start_transaction(root, 0); + BUG_ON(IS_ERR(trans)); + btrfs_set_trans_block_group(trans, inode); + trans->block_rsv = root->orphan_block_rsv; + } + + ret = btrfs_block_rsv_check(trans, root, + root->orphan_block_rsv, 0, 5); + if (ret) { + BUG_ON(ret != -EAGAIN); + ret = btrfs_commit_transaction(trans, root); + BUG_ON(ret); + trans = NULL; + continue; + } + ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size, BTRFS_EXTENT_DATA_KEY); @@ -5190,10 +6169,8 @@ static void btrfs_truncate(struct inode *inode) nr = trans->blocks_used; btrfs_end_transaction(trans, root); + trans = NULL; btrfs_btree_balance_dirty(root, nr); - - trans = btrfs_start_transaction(root, 1); - btrfs_set_trans_block_group(trans, inode); } if (ret == 0 && inode->i_nlink > 0) { @@ -5254,21 +6231,47 @@ unsigned long btrfs_force_ra(struct address_space *mapping, struct inode *btrfs_alloc_inode(struct super_block *sb) { struct btrfs_inode *ei; + struct inode *inode; ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS); if (!ei) return NULL; + + ei->root = NULL; + ei->space_info = NULL; + ei->generation = 0; + ei->sequence = 0; ei->last_trans = 0; ei->last_sub_trans = 0; ei->logged_trans = 0; - ei->outstanding_extents = 0; - ei->reserved_extents = 0; - ei->root = NULL; + ei->delalloc_bytes = 0; + ei->reserved_bytes = 0; + ei->disk_i_size = 0; + ei->flags = 0; + ei->index_cnt = (u64)-1; + ei->last_unlink_trans = 0; + spin_lock_init(&ei->accounting_lock); + atomic_set(&ei->outstanding_extents, 0); + ei->reserved_extents = 0; + + ei->ordered_data_close = 0; + ei->orphan_meta_reserved = 0; + ei->dummy_inode = 0; + ei->force_compress = 0; + + inode = &ei->vfs_inode; + extent_map_tree_init(&ei->extent_tree, GFP_NOFS); + extent_io_tree_init(&ei->io_tree, &inode->i_data, GFP_NOFS); + extent_io_tree_init(&ei->io_failure_tree, &inode->i_data, GFP_NOFS); + mutex_init(&ei->log_mutex); btrfs_ordered_inode_tree_init(&ei->ordered_tree); INIT_LIST_HEAD(&ei->i_orphan); + INIT_LIST_HEAD(&ei->delalloc_inodes); INIT_LIST_HEAD(&ei->ordered_operations); - return &ei->vfs_inode; + RB_CLEAR_NODE(&ei->rb_node); + + return inode; } void btrfs_destroy_inode(struct inode *inode) @@ -5278,6 +6281,8 @@ void btrfs_destroy_inode(struct inode *inode) WARN_ON(!list_empty(&inode->i_dentry)); WARN_ON(inode->i_data.nrpages); + WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents)); + WARN_ON(BTRFS_I(inode)->reserved_extents); /* * This can happen where we create an inode, but somebody else also @@ -5298,13 +6303,13 @@ void btrfs_destroy_inode(struct inode *inode) spin_unlock(&root->fs_info->ordered_extent_lock); } - spin_lock(&root->list_lock); + spin_lock(&root->orphan_lock); if (!list_empty(&BTRFS_I(inode)->i_orphan)) { printk(KERN_INFO "BTRFS: inode %lu still on the orphan list\n", inode->i_ino); list_del_init(&BTRFS_I(inode)->i_orphan); } - spin_unlock(&root->list_lock); + spin_unlock(&root->orphan_lock); while (1) { ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); @@ -5425,19 +6430,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (S_ISDIR(old_inode->i_mode) && new_inode && new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) return -ENOTEMPTY; - - /* - * We want to reserve the absolute worst case amount of items. 
So if - * both inodes are subvols and we need to unlink them then that would - * require 4 item modifications, but if they are both normal inodes it - * would require 5 item modifications, so we'll assume their normal - * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items - * should cover the worst case number of items we'll modify. - */ - ret = btrfs_reserve_metadata_space(root, 11); - if (ret) - return ret; - /* * we're using rename to replace one file with another. * and the replacement file is large. Start IO on it now so @@ -5450,8 +6442,18 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, /* close the racy window with snapshot create/destroy ioctl */ if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) down_read(&root->fs_info->subvol_sem); + /* + * We want to reserve the absolute worst case amount of items. So if + * both inodes are subvols and we need to unlink them then that would + * require 4 item modifications, but if they are both normal inodes it + * would require 5 item modifications, so we'll assume their normal + * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items + * should cover the worst case number of items we'll modify. + */ + trans = btrfs_start_transaction(root, 20); + if (IS_ERR(trans)) + return PTR_ERR(trans); - trans = btrfs_start_transaction(root, 1); btrfs_set_trans_block_group(trans, new_dir); if (dest != root) @@ -5550,7 +6552,6 @@ out_fail: if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) up_read(&root->fs_info->subvol_sem); - btrfs_unreserve_metadata_space(root, 11); return ret; } @@ -5602,6 +6603,38 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) return 0; } +int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput) +{ + struct btrfs_inode *binode; + struct inode *inode = NULL; + + spin_lock(&root->fs_info->delalloc_lock); + while (!list_empty(&root->fs_info->delalloc_inodes)) { + binode = list_entry(root->fs_info->delalloc_inodes.next, + struct btrfs_inode, delalloc_inodes); + inode = igrab(&binode->vfs_inode); + if (inode) { + list_move_tail(&binode->delalloc_inodes, + &root->fs_info->delalloc_inodes); + break; + } + + list_del_init(&binode->delalloc_inodes); + cond_resched_lock(&root->fs_info->delalloc_lock); + } + spin_unlock(&root->fs_info->delalloc_lock); + + if (inode) { + write_inode_now(inode, 0); + if (delay_iput) + btrfs_add_delayed_iput(inode); + else + iput(inode); + return 1; + } + return 0; +} + static int btrfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) { @@ -5625,26 +6658,20 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) return -ENAMETOOLONG; + err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid); + if (err) + return err; /* * 2 items for inode item and ref * 2 items for dir items * 1 item for xattr if selinux is on */ - err = btrfs_reserve_metadata_space(root, 5); - if (err) - return err; + trans = btrfs_start_transaction(root, 5); + if (IS_ERR(trans)) + return PTR_ERR(trans); - trans = btrfs_start_transaction(root, 1); - if (!trans) - goto out_fail; btrfs_set_trans_block_group(trans, dir); - err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); - if (err) { - err = -ENOSPC; - goto out_unlock; - } - inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, dentry->d_name.len, dentry->d_parent->d_inode->i_ino, objectid, @@ -5716,8 +6743,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry 
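The rename path above now asks btrfs_start_transaction() for a worst-case item budget up front and simply returns the error if the reservation cannot be made, instead of reserving metadata in a separate step. A toy model of that reserve-at-start pattern (not from the patch; toy_pool, toy_trans and the numbers are illustrative):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

struct toy_pool {
	long free_items;	/* metadata item slots still available */
};

struct toy_trans {
	struct toy_pool *pool;
	long reserved;		/* items reserved when the transaction started */
};

/* reserve the worst-case item count up front, or fail before doing any work */
static struct toy_trans *start_transaction(struct toy_pool *pool, long items)
{
	struct toy_trans *trans;

	if (pool->free_items < items) {
		errno = ENOSPC;	/* the kernel returns ERR_PTR(-ENOSPC) here */
		return NULL;
	}
	trans = malloc(sizeof(*trans));
	if (!trans)
		return NULL;
	pool->free_items -= items;
	trans->pool = pool;
	trans->reserved = items;
	return trans;
}

static void end_transaction(struct toy_trans *trans)
{
	/* release the reservation when the transaction ends */
	trans->pool->free_items += trans->reserved;
	free(trans);
}

int main(void)
{
	struct toy_pool pool = { .free_items = 32 };
	/* generous worst-case budget, as in the rename path above */
	struct toy_trans *trans = start_transaction(&pool, 20);

	if (!trans) {
		perror("start_transaction");
		return 1;
	}
	/* ... modify the reserved items ... */
	end_transaction(trans);
	printf("free items after commit: %ld\n", pool.free_items);
	return 0;
}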
*dentry, out_unlock: nr = trans->blocks_used; btrfs_end_transaction_throttle(trans, root); -out_fail: - btrfs_unreserve_metadata_space(root, 5); if (drop_inode) { inode_dec_link_count(inode); iput(inode); @@ -5726,33 +6751,28 @@ out_fail: return err; } -static int prealloc_file_range(struct inode *inode, u64 start, u64 end, - u64 alloc_hint, int mode, loff_t actual_len) +int btrfs_prealloc_file_range(struct inode *inode, int mode, + u64 start, u64 num_bytes, u64 min_size, + loff_t actual_len, u64 *alloc_hint) { struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_key ins; u64 cur_offset = start; - u64 num_bytes = end - start; int ret = 0; - u64 i_size; while (num_bytes > 0) { - trans = btrfs_start_transaction(root, 1); - - ret = btrfs_reserve_extent(trans, root, num_bytes, - root->sectorsize, 0, alloc_hint, - (u64)-1, &ins, 1); - if (ret) { - WARN_ON(1); - goto stop_trans; + trans = btrfs_start_transaction(root, 3); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + break; } - ret = btrfs_reserve_metadata_space(root, 3); + ret = btrfs_reserve_extent(trans, root, num_bytes, min_size, + 0, *alloc_hint, (u64)-1, &ins, 1); if (ret) { - btrfs_free_reserved_extent(root, ins.objectid, - ins.offset); - goto stop_trans; + btrfs_end_transaction(trans, root); + break; } ret = insert_reserved_file_extent(trans, inode, @@ -5766,34 +6786,27 @@ static int prealloc_file_range(struct inode *inode, u64 start, u64 end, num_bytes -= ins.offset; cur_offset += ins.offset; - alloc_hint = ins.objectid + ins.offset; + *alloc_hint = ins.objectid + ins.offset; inode->i_ctime = CURRENT_TIME; BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC; if (!(mode & FALLOC_FL_KEEP_SIZE) && - (actual_len > inode->i_size) && - (cur_offset > inode->i_size)) { - + (actual_len > inode->i_size) && + (cur_offset > inode->i_size)) { if (cur_offset > actual_len) - i_size = actual_len; + i_size_write(inode, actual_len); else - i_size = cur_offset; - i_size_write(inode, i_size); - btrfs_ordered_update_i_size(inode, i_size, NULL); + i_size_write(inode, cur_offset); + i_size_write(inode, cur_offset); + btrfs_ordered_update_i_size(inode, cur_offset, NULL); } ret = btrfs_update_inode(trans, root, inode); BUG_ON(ret); btrfs_end_transaction(trans, root); - btrfs_unreserve_metadata_space(root, 3); } return ret; - -stop_trans: - btrfs_end_transaction(trans, root); - return ret; - } static long btrfs_fallocate(struct inode *inode, int mode, @@ -5826,8 +6839,7 @@ static long btrfs_fallocate(struct inode *inode, int mode, goto out; } - ret = btrfs_check_data_free_space(BTRFS_I(inode)->root, inode, - alloc_end - alloc_start); + ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start); if (ret) goto out; @@ -5872,16 +6884,16 @@ static long btrfs_fallocate(struct inode *inode, int mode, if (em->block_start == EXTENT_MAP_HOLE || (cur_offset >= inode->i_size && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { - ret = prealloc_file_range(inode, - cur_offset, last_byte, - alloc_hint, mode, offset+len); + ret = btrfs_prealloc_file_range(inode, mode, cur_offset, + last_byte - cur_offset, + 1 << inode->i_blkbits, + offset + len, + &alloc_hint); if (ret < 0) { free_extent_map(em); break; } } - if (em->block_start <= EXTENT_MAP_LAST_BYTE) - alloc_hint = em->block_start; free_extent_map(em); cur_offset = last_byte; @@ -5893,8 +6905,7 @@ static long btrfs_fallocate(struct inode *inode, int mode, unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, &cached_state, GFP_NOFS); - 
btrfs_free_reserved_data_space(BTRFS_I(inode)->root, inode, - alloc_end - alloc_start); + btrfs_free_reserved_data_space(inode, alloc_end - alloc_start); out: mutex_unlock(&inode->i_mutex); return ret; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 97a97839a867..9254b3d58dbe 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -239,23 +239,19 @@ static noinline int create_subvol(struct btrfs_root *root, u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID; u64 index = 0; + ret = btrfs_find_free_objectid(NULL, root->fs_info->tree_root, + 0, &objectid); + if (ret) + return ret; /* * 1 - inode item * 2 - refs * 1 - root item * 2 - dir items */ - ret = btrfs_reserve_metadata_space(root, 6); - if (ret) - return ret; - - trans = btrfs_start_transaction(root, 1); - BUG_ON(!trans); - - ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root, - 0, &objectid); - if (ret) - goto fail; + trans = btrfs_start_transaction(root, 6); + if (IS_ERR(trans)) + return PTR_ERR(trans); leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, objectid, NULL, 0, 0, 0); @@ -345,13 +341,10 @@ fail: err = btrfs_commit_transaction(trans, root); if (err && !ret) ret = err; - - btrfs_unreserve_metadata_space(root, 6); return ret; } -static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, - char *name, int namelen) +static int create_snapshot(struct btrfs_root *root, struct dentry *dentry) { struct inode *inode; struct btrfs_pending_snapshot *pending_snapshot; @@ -361,40 +354,33 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, if (!root->ref_cows) return -EINVAL; - /* - * 1 - inode item - * 2 - refs - * 1 - root item - * 2 - dir items - */ - ret = btrfs_reserve_metadata_space(root, 6); - if (ret) - goto fail; - pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); - if (!pending_snapshot) { - ret = -ENOMEM; - btrfs_unreserve_metadata_space(root, 6); - goto fail; - } - pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS); - if (!pending_snapshot->name) { - ret = -ENOMEM; - kfree(pending_snapshot); - btrfs_unreserve_metadata_space(root, 6); - goto fail; - } - memcpy(pending_snapshot->name, name, namelen); - pending_snapshot->name[namelen] = '\0'; + if (!pending_snapshot) + return -ENOMEM; + + btrfs_init_block_rsv(&pending_snapshot->block_rsv); pending_snapshot->dentry = dentry; - trans = btrfs_start_transaction(root, 1); - BUG_ON(!trans); pending_snapshot->root = root; + + trans = btrfs_start_transaction(root->fs_info->extent_root, 5); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto fail; + } + + ret = btrfs_snap_reserve_metadata(trans, pending_snapshot); + BUG_ON(ret); + list_add(&pending_snapshot->list, &trans->transaction->pending_snapshots); - ret = btrfs_commit_transaction(trans, root); + ret = btrfs_commit_transaction(trans, root->fs_info->extent_root); BUG_ON(ret); - btrfs_unreserve_metadata_space(root, 6); + + ret = pending_snapshot->error; + if (ret) + goto fail; + + btrfs_orphan_cleanup(pending_snapshot->snap); inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry); if (IS_ERR(inode)) { @@ -405,6 +391,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, d_instantiate(dentry, inode); ret = 0; fail: + kfree(pending_snapshot); return ret; } @@ -456,8 +443,7 @@ static noinline int btrfs_mksubvol(struct path *parent, goto out_up_read; if (snap_src) { - error = create_snapshot(snap_src, dentry, - name, namelen); + error = create_snapshot(snap_src, dentry); } else { error = create_subvol(BTRFS_I(dir)->root, 
dentry, name, namelen); @@ -601,19 +587,9 @@ static int btrfs_defrag_file(struct file *file, if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) BTRFS_I(inode)->force_compress = 1; - ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE); - if (ret) { - ret = -ENOSPC; - break; - } - - ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1); - if (ret) { - btrfs_free_reserved_data_space(root, inode, - PAGE_CACHE_SIZE); - ret = -ENOSPC; - break; - } + ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); + if (ret) + goto err_unlock; again: if (inode->i_size == 0 || i > ((inode->i_size - 1) >> PAGE_CACHE_SHIFT)) { @@ -622,8 +598,10 @@ again: } page = grab_cache_page(inode->i_mapping, i); - if (!page) + if (!page) { + ret = -ENOMEM; goto err_reservations; + } if (!PageUptodate(page)) { btrfs_readpage(NULL, page); @@ -631,6 +609,7 @@ again: if (!PageUptodate(page)) { unlock_page(page); page_cache_release(page); + ret = -EIO; goto err_reservations; } } @@ -644,8 +623,7 @@ again: wait_on_page_writeback(page); if (PageDirty(page)) { - btrfs_free_reserved_data_space(root, inode, - PAGE_CACHE_SIZE); + btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); goto loop_unlock; } @@ -683,7 +661,6 @@ loop_unlock: page_cache_release(page); mutex_unlock(&inode->i_mutex); - btrfs_unreserve_metadata_for_delalloc(root, inode, 1); balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1); i++; } @@ -713,9 +690,9 @@ loop_unlock: return 0; err_reservations: + btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); +err_unlock: mutex_unlock(&inode->i_mutex); - btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); - btrfs_unreserve_metadata_for_delalloc(root, inode, 1); return ret; } @@ -811,7 +788,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, device->name, (unsigned long long)new_size); if (new_size > old_size) { - trans = btrfs_start_transaction(root, 1); + trans = btrfs_start_transaction(root, 0); ret = btrfs_grow_device(trans, device, new_size); btrfs_commit_transaction(trans, root); } else { @@ -1300,7 +1277,13 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, if (err) goto out_up_write; - trans = btrfs_start_transaction(root, 1); + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto out_up_write; + } + trans->block_rsv = &root->fs_info->global_block_rsv; + ret = btrfs_unlink_subvol(trans, root, dir, dest->root_key.objectid, dentry->d_name.name, @@ -1314,10 +1297,12 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, dest->root_item.drop_level = 0; btrfs_set_root_refs(&dest->root_item, 0); - ret = btrfs_insert_orphan_item(trans, - root->fs_info->tree_root, - dest->root_key.objectid); - BUG_ON(ret); + if (!xchg(&dest->orphan_item_inserted, 1)) { + ret = btrfs_insert_orphan_item(trans, + root->fs_info->tree_root, + dest->root_key.objectid); + BUG_ON(ret); + } ret = btrfs_commit_transaction(trans, root); BUG_ON(ret); @@ -1358,8 +1343,10 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) ret = -EPERM; goto out; } - btrfs_defrag_root(root, 0); - btrfs_defrag_root(root->fs_info->extent_root, 0); + ret = btrfs_defrag_root(root, 0); + if (ret) + goto out; + ret = btrfs_defrag_root(root->fs_info->extent_root, 0); break; case S_IFREG: if (!(file->f_mode & FMODE_WRITE)) { @@ -1389,9 +1376,11 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) /* the rest are all set to zero by kzalloc */ range->len = (u64)-1; } - btrfs_defrag_file(file, range); + ret = 
btrfs_defrag_file(file, range); kfree(range); break; + default: + ret = -EINVAL; } out: mnt_drop_write(file->f_path.mnt); @@ -1469,7 +1458,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, */ /* the destination must be opened for writing */ - if (!(file->f_mode & FMODE_WRITE)) + if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND)) return -EINVAL; ret = mnt_want_write(file->f_path.mnt); @@ -1522,7 +1511,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, /* determine range to clone */ ret = -EINVAL; - if (off >= src->i_size || off + len > src->i_size) + if (off + len > src->i_size || off + len < off) goto out_unlock; if (len == 0) olen = len = src->i_size - off; @@ -1550,12 +1539,6 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, btrfs_wait_ordered_range(src, off, off+len); } - trans = btrfs_start_transaction(root, 1); - BUG_ON(!trans); - - /* punch hole in destination first */ - btrfs_drop_extents(trans, inode, off, off + len, &hint_byte, 1); - /* clone data */ key.objectid = src->i_ino; key.type = BTRFS_EXTENT_DATA_KEY; @@ -1566,7 +1549,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, * note the key will change type as we walk through the * tree. */ - ret = btrfs_search_slot(trans, root, &key, path, 0, 0); + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto out; @@ -1595,6 +1578,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, u64 disko = 0, diskl = 0; u64 datao = 0, datal = 0; u8 comp; + u64 endoff; size = btrfs_item_size_nr(leaf, slot); read_extent_buffer(leaf, buf, @@ -1629,12 +1613,31 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, new_key.objectid = inode->i_ino; new_key.offset = key.offset + destoff - off; + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + if (type == BTRFS_FILE_EXTENT_REG || type == BTRFS_FILE_EXTENT_PREALLOC) { + if (off > key.offset) { + datao += off - key.offset; + datal -= off - key.offset; + } + + if (key.offset + datal > off + len) + datal = off + len - key.offset; + + ret = btrfs_drop_extents(trans, inode, + new_key.offset, + new_key.offset + datal, + &hint_byte, 1); + BUG_ON(ret); + ret = btrfs_insert_empty_item(trans, root, path, &new_key, size); - if (ret) - goto out; + BUG_ON(ret); leaf = path->nodes[0]; slot = path->slots[0]; @@ -1645,14 +1648,6 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); - if (off > key.offset) { - datao += off - key.offset; - datal -= off - key.offset; - } - - if (key.offset + datal > off + len) - datal = off + len - key.offset; - /* disko == 0 means it's a hole */ if (!disko) datao = 0; @@ -1683,14 +1678,21 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, if (comp && (skip || trim)) { ret = -EINVAL; + btrfs_end_transaction(trans, root); goto out; } size -= skip + trim; datal -= skip + trim; + + ret = btrfs_drop_extents(trans, inode, + new_key.offset, + new_key.offset + datal, + &hint_byte, 1); + BUG_ON(ret); + ret = btrfs_insert_empty_item(trans, root, path, &new_key, size); - if (ret) - goto out; + BUG_ON(ret); if (skip) { u32 start = @@ -1708,8 +1710,26 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, } btrfs_mark_buffer_dirty(leaf); - } + btrfs_release_path(root, path); + 
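Not part of the patch, but as an illustration: the clone ioctl above now clamps each source extent to the requested range before dropping and re-inserting it in the destination file. Below is a minimal userspace sketch of that clamping arithmetic; the function name, struct and worked numbers are invented for this example.

#include <stdint.h>
#include <stdio.h>

struct clamp { uint64_t datao, datal; };

/* off/len: range being cloned; key_offset: where the extent starts in the
 * source file; datao/datal: offset into and length of the extent data. */
static struct clamp clamp_extent(uint64_t off, uint64_t len,
                                 uint64_t key_offset,
                                 uint64_t datao, uint64_t datal)
{
        struct clamp c = { datao, datal };

        if (off > key_offset) {                 /* extent begins before the range */
                c.datao += off - key_offset;
                c.datal -= off - key_offset;
        }
        if (key_offset + c.datal > off + len)   /* extent runs past the range */
                c.datal = off + len - key_offset;
        return c;
}

int main(void)
{
        /* an 8K extent at file offset 0, cloned via a request for [4K, 12K) */
        struct clamp c = clamp_extent(4096, 8192, 0, 0, 8192);

        printf("datao=%llu datal=%llu\n",
               (unsigned long long)c.datao, (unsigned long long)c.datal);
        return 0;
}

With those numbers only the second half of the 8K extent is copied, which is exactly the window the new per-extent transaction then drops and re-inserts.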
+ inode->i_mtime = inode->i_ctime = CURRENT_TIME; + /* + * we round up to the block size at eof when + * determining which extents to clone above, + * but shouldn't round up the file size + */ + endoff = new_key.offset + datal; + if (endoff > off+olen) + endoff = off+olen; + if (endoff > inode->i_size) + btrfs_i_size_write(inode, endoff); + + BTRFS_I(inode)->flags = BTRFS_I(src)->flags; + ret = btrfs_update_inode(trans, root, inode); + BUG_ON(ret); + btrfs_end_transaction(trans, root); + } next: btrfs_release_path(root, path); key.offset++; @@ -1717,17 +1737,7 @@ next: ret = 0; out: btrfs_release_path(root, path); - if (ret == 0) { - inode->i_mtime = inode->i_ctime = CURRENT_TIME; - if (destoff + olen > inode->i_size) - btrfs_i_size_write(inode, destoff + olen); - BTRFS_I(inode)->flags = BTRFS_I(src)->flags; - ret = btrfs_update_inode(trans, root, inode); - } - btrfs_end_transaction(trans, root); unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS); - if (ret) - vmtruncate(inode, 0); out_unlock: mutex_unlock(&src->i_mutex); mutex_unlock(&inode->i_mutex); @@ -1845,7 +1855,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path, dir_id, "default", 7, 1); - if (!di) { + if (IS_ERR_OR_NULL(di)) { btrfs_free_path(path); btrfs_end_transaction(trans, root); printk(KERN_ERR "Umm, you don't have the default dir item, " diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index a127c0ebb2dc..e56c72bc5add 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -124,6 +124,15 @@ static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset) return 1; } +static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset, + u64 len) +{ + if (file_offset + len <= entry->file_offset || + entry->file_offset + entry->len <= file_offset) + return 0; + return 1; +} + /* * look find the first ordered struct that has this offset, otherwise * the first one less than this offset @@ -161,8 +170,9 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, * The tree is given a single reference on the ordered extent that was * inserted. 
*/ -int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, - u64 start, u64 len, u64 disk_len, int type) +static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, + u64 start, u64 len, u64 disk_len, + int type, int dio) { struct btrfs_ordered_inode_tree *tree; struct rb_node *node; @@ -182,6 +192,9 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE) set_bit(type, &entry->flags); + if (dio) + set_bit(BTRFS_ORDERED_DIRECT, &entry->flags); + /* one ref for the tree */ atomic_set(&entry->refs, 1); init_waitqueue_head(&entry->wait); @@ -203,6 +216,20 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, return 0; } +int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, + u64 start, u64 len, u64 disk_len, int type) +{ + return __btrfs_add_ordered_extent(inode, file_offset, start, len, + disk_len, type, 0); +} + +int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, + u64 start, u64 len, u64 disk_len, int type) +{ + return __btrfs_add_ordered_extent(inode, file_offset, start, len, + disk_len, type, 1); +} + /* * Add a struct btrfs_ordered_sum into the list of checksums to be inserted * when an ordered extent is finished. If the list covers more than one @@ -311,13 +338,6 @@ static int __btrfs_remove_ordered_extent(struct inode *inode, tree->last = NULL; set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); - spin_lock(&BTRFS_I(inode)->accounting_lock); - WARN_ON(!BTRFS_I(inode)->outstanding_extents); - BTRFS_I(inode)->outstanding_extents--; - spin_unlock(&BTRFS_I(inode)->accounting_lock); - btrfs_unreserve_metadata_for_delalloc(BTRFS_I(inode)->root, - inode, 1); - spin_lock(&root->fs_info->ordered_extent_lock); list_del_init(&entry->root_extent_list); @@ -491,7 +511,8 @@ void btrfs_start_ordered_extent(struct inode *inode, * start IO on any dirty ones so the wait doesn't stall waiting * for pdflush to find them */ - filemap_fdatawrite_range(inode->i_mapping, start, end); + if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags)) + filemap_fdatawrite_range(inode->i_mapping, start, end); if (wait) { wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, &entry->flags)); @@ -588,6 +609,47 @@ out: return entry; } +/* Since the DIO code tries to lock a wide area we need to look for any ordered + * extents that exist in the range, rather than just the start of the range. + */ +struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode, + u64 file_offset, + u64 len) +{ + struct btrfs_ordered_inode_tree *tree; + struct rb_node *node; + struct btrfs_ordered_extent *entry = NULL; + + tree = &BTRFS_I(inode)->ordered_tree; + spin_lock(&tree->lock); + node = tree_search(tree, file_offset); + if (!node) { + node = tree_search(tree, file_offset + len); + if (!node) + goto out; + } + + while (1) { + entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); + if (range_overlaps(entry, file_offset, len)) + break; + + if (entry->file_offset >= file_offset + len) { + entry = NULL; + break; + } + entry = NULL; + node = rb_next(node); + if (!node) + break; + } +out: + if (entry) + atomic_inc(&entry->refs); + spin_unlock(&tree->lock); + return entry; +} + /* * lookup and return any extent before 'file_offset'. 
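Purely illustrative, not from the patch: the new btrfs_lookup_ordered_range() leans on range_overlaps(), which treats ordered extents as half-open byte intervals. A self-contained sketch of that test with made-up values:

#include <stdint.h>
#include <stdio.h>

/* [a_off, a_off + a_len) and [b_off, b_off + b_len) overlap unless one
 * ends at or before the point where the other begins. */
static int ranges_overlap(uint64_t a_off, uint64_t a_len,
                          uint64_t b_off, uint64_t b_len)
{
        if (a_off + a_len <= b_off || b_off + b_len <= a_off)
                return 0;
        return 1;
}

int main(void)
{
        printf("%d\n", ranges_overlap(0, 4096, 4096, 4096)); /* 0: merely adjacent */
        printf("%d\n", ranges_overlap(0, 8192, 4096, 4096)); /* 1: genuine overlap */
        return 0;
}

Adjacent ranges therefore do not count as overlapping, so a DIO lock over a wide area only picks up ordered extents that actually intersect it.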
NULL is returned * if none is found diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index c82f76a9f040..8ac365492a3f 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -72,6 +72,8 @@ struct btrfs_ordered_sum { #define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */ +#define BTRFS_ORDERED_DIRECT 5 /* set when we're doing DIO with this extent */ + struct btrfs_ordered_extent { /* logical offset in the file */ u64 file_offset; @@ -140,7 +142,9 @@ int btrfs_dec_test_ordered_pending(struct inode *inode, struct btrfs_ordered_extent **cached, u64 file_offset, u64 io_size); int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, - u64 start, u64 len, u64 disk_len, int tyep); + u64 start, u64 len, u64 disk_len, int type); +int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, + u64 start, u64 len, u64 disk_len, int type); int btrfs_add_ordered_sum(struct inode *inode, struct btrfs_ordered_extent *entry, struct btrfs_ordered_sum *sum); @@ -151,6 +155,9 @@ void btrfs_start_ordered_extent(struct inode *inode, int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len); struct btrfs_ordered_extent * btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset); +struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode, + u64 file_offset, + u64 len); int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, struct btrfs_ordered_extent *ordered); int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index e558dd941ded..b37d723b9d4a 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -44,8 +44,12 @@ struct tree_entry { struct backref_node { struct rb_node rb_node; u64 bytenr; - /* objectid tree block owner */ + + u64 new_bytenr; + /* objectid of tree block owner, can be not uptodate */ u64 owner; + /* link to pending, changed or detached list */ + struct list_head list; /* list of upper level blocks reference this block */ struct list_head upper; /* list of child blocks in the cache */ @@ -56,9 +60,9 @@ struct backref_node { struct extent_buffer *eb; /* level of tree block */ unsigned int level:8; - /* 1 if the block is root of old snapshot */ - unsigned int old_root:1; - /* 1 if no child blocks in the cache */ + /* is the block in non-reference counted tree */ + unsigned int cowonly:1; + /* 1 if no child node in the cache */ unsigned int lowest:1; /* is the extent buffer locked */ unsigned int locked:1; @@ -66,6 +70,16 @@ struct backref_node { unsigned int processed:1; /* have backrefs of this block been checked */ unsigned int checked:1; + /* + * 1 if corresponding block has been cowed but some upper + * level block pointers may not point to the new location + */ + unsigned int pending:1; + /* + * 1 if the backref node isn't connected to any other + * backref node. 
+ */ + unsigned int detached:1; }; /* @@ -74,7 +88,6 @@ struct backref_node { struct backref_edge { struct list_head list[2]; struct backref_node *node[2]; - u64 blockptr; }; #define LOWER 0 @@ -83,9 +96,25 @@ struct backref_edge { struct backref_cache { /* red black tree of all backref nodes in the cache */ struct rb_root rb_root; - /* list of backref nodes with no child block in the cache */ + /* for passing backref nodes to btrfs_reloc_cow_block */ + struct backref_node *path[BTRFS_MAX_LEVEL]; + /* + * list of blocks that have been cowed but some block + * pointers in upper level blocks may not reflect the + * new location + */ struct list_head pending[BTRFS_MAX_LEVEL]; - spinlock_t lock; + /* list of backref nodes with no child node */ + struct list_head leaves; + /* list of blocks that have been cowed in current transaction */ + struct list_head changed; + /* list of detached backref node. */ + struct list_head detached; + + u64 last_trans; + + int nr_nodes; + int nr_edges; }; /* @@ -113,15 +142,6 @@ struct tree_block { unsigned int key_ready:1; }; -/* inode vector */ -#define INODEVEC_SIZE 16 - -struct inodevec { - struct list_head list; - struct inode *inode[INODEVEC_SIZE]; - int nr; -}; - #define MAX_EXTENTS 128 struct file_extent_cluster { @@ -138,36 +158,43 @@ struct reloc_control { struct btrfs_root *extent_root; /* inode for moving data */ struct inode *data_inode; - struct btrfs_workers workers; + + struct btrfs_block_rsv *block_rsv; + + struct backref_cache backref_cache; + + struct file_extent_cluster cluster; /* tree blocks have been processed */ struct extent_io_tree processed_blocks; /* map start of tree root to corresponding reloc tree */ struct mapping_tree reloc_root_tree; /* list of reloc trees */ struct list_head reloc_roots; + /* size of metadata reservation for merging reloc trees */ + u64 merging_rsv_size; + /* size of relocated tree nodes */ + u64 nodes_relocated; + u64 search_start; u64 extents_found; - u64 extents_skipped; - int stage; - int create_reloc_root; + + int block_rsv_retries; + + unsigned int stage:8; + unsigned int create_reloc_tree:1; + unsigned int merge_reloc_tree:1; unsigned int found_file_extent:1; - unsigned int found_old_snapshot:1; + unsigned int commit_transaction:1; }; /* stages of data relocation */ #define MOVE_DATA_EXTENTS 0 #define UPDATE_DATA_PTRS 1 -/* - * merge reloc tree to corresponding fs tree in worker threads - */ -struct async_merge { - struct btrfs_work work; - struct reloc_control *rc; - struct btrfs_root *root; - struct completion *done; - atomic_t *num_pending; -}; +static void remove_backref_node(struct backref_cache *cache, + struct backref_node *node); +static void __mark_block_processed(struct reloc_control *rc, + struct backref_node *node); static void mapping_tree_init(struct mapping_tree *tree) { @@ -181,15 +208,80 @@ static void backref_cache_init(struct backref_cache *cache) cache->rb_root = RB_ROOT; for (i = 0; i < BTRFS_MAX_LEVEL; i++) INIT_LIST_HEAD(&cache->pending[i]); - spin_lock_init(&cache->lock); + INIT_LIST_HEAD(&cache->changed); + INIT_LIST_HEAD(&cache->detached); + INIT_LIST_HEAD(&cache->leaves); +} + +static void backref_cache_cleanup(struct backref_cache *cache) +{ + struct backref_node *node; + int i; + + while (!list_empty(&cache->detached)) { + node = list_entry(cache->detached.next, + struct backref_node, list); + remove_backref_node(cache, node); + } + + while (!list_empty(&cache->leaves)) { + node = list_entry(cache->leaves.next, + struct backref_node, lower); + remove_backref_node(cache, node); 
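A brief aside, not part of the patch: backref nodes and edges are now handed out by counted alloc/free helpers so that backref_cache_cleanup() can assert that nothing leaked. A tiny plain-C illustration of that pattern, with placeholder types and names:

#include <assert.h>
#include <stdlib.h>

struct cache { int nr_nodes; };
struct node  { int level; };

static struct node *alloc_node(struct cache *c)
{
        struct node *n = calloc(1, sizeof(*n));

        if (n)
                c->nr_nodes++;          /* count every successful allocation */
        return n;
}

static void free_node(struct cache *c, struct node *n)
{
        if (n) {
                c->nr_nodes--;          /* and count every free back out */
                free(n);
        }
}

int main(void)
{
        struct cache c = { 0 };
        struct node *n = alloc_node(&c);

        free_node(&c, n);
        assert(c.nr_nodes == 0);        /* cleanup can check nothing leaked */
        return 0;
}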
+ } + + cache->last_trans = 0; + + for (i = 0; i < BTRFS_MAX_LEVEL; i++) + BUG_ON(!list_empty(&cache->pending[i])); + BUG_ON(!list_empty(&cache->changed)); + BUG_ON(!list_empty(&cache->detached)); + BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root)); + BUG_ON(cache->nr_nodes); + BUG_ON(cache->nr_edges); +} + +static struct backref_node *alloc_backref_node(struct backref_cache *cache) +{ + struct backref_node *node; + + node = kzalloc(sizeof(*node), GFP_NOFS); + if (node) { + INIT_LIST_HEAD(&node->list); + INIT_LIST_HEAD(&node->upper); + INIT_LIST_HEAD(&node->lower); + RB_CLEAR_NODE(&node->rb_node); + cache->nr_nodes++; + } + return node; +} + +static void free_backref_node(struct backref_cache *cache, + struct backref_node *node) +{ + if (node) { + cache->nr_nodes--; + kfree(node); + } +} + +static struct backref_edge *alloc_backref_edge(struct backref_cache *cache) +{ + struct backref_edge *edge; + + edge = kzalloc(sizeof(*edge), GFP_NOFS); + if (edge) + cache->nr_edges++; + return edge; } -static void backref_node_init(struct backref_node *node) +static void free_backref_edge(struct backref_cache *cache, + struct backref_edge *edge) { - memset(node, 0, sizeof(*node)); - INIT_LIST_HEAD(&node->upper); - INIT_LIST_HEAD(&node->lower); - RB_CLEAR_NODE(&node->rb_node); + if (edge) { + cache->nr_edges--; + kfree(edge); + } } static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr, @@ -250,6 +342,7 @@ static struct backref_node *walk_up_backref(struct backref_node *node, edges[idx++] = edge; node = edge->node[UPPER]; } + BUG_ON(node->detached); *index = idx; return node; } @@ -281,13 +374,18 @@ static struct backref_node *walk_down_backref(struct backref_edge *edges[], return NULL; } +static void unlock_node_buffer(struct backref_node *node) +{ + if (node->locked) { + btrfs_tree_unlock(node->eb); + node->locked = 0; + } +} + static void drop_node_buffer(struct backref_node *node) { if (node->eb) { - if (node->locked) { - btrfs_tree_unlock(node->eb); - node->locked = 0; - } + unlock_node_buffer(node); free_extent_buffer(node->eb); node->eb = NULL; } @@ -296,14 +394,14 @@ static void drop_node_buffer(struct backref_node *node) static void drop_backref_node(struct backref_cache *tree, struct backref_node *node) { - BUG_ON(!node->lowest); BUG_ON(!list_empty(&node->upper)); drop_node_buffer(node); + list_del(&node->list); list_del(&node->lower); - - rb_erase(&node->rb_node, &tree->rb_root); - kfree(node); + if (!RB_EMPTY_NODE(&node->rb_node)) + rb_erase(&node->rb_node, &tree->rb_root); + free_backref_node(tree, node); } /* @@ -318,27 +416,121 @@ static void remove_backref_node(struct backref_cache *cache, if (!node) return; - BUG_ON(!node->lowest); + BUG_ON(!node->lowest && !node->detached); while (!list_empty(&node->upper)) { edge = list_entry(node->upper.next, struct backref_edge, list[LOWER]); upper = edge->node[UPPER]; list_del(&edge->list[LOWER]); list_del(&edge->list[UPPER]); - kfree(edge); + free_backref_edge(cache, edge); + + if (RB_EMPTY_NODE(&upper->rb_node)) { + BUG_ON(!list_empty(&node->upper)); + drop_backref_node(cache, node); + node = upper; + node->lowest = 1; + continue; + } /* - * add the node to pending list if no other + * add the node to leaf node list if no other * child block cached. 
*/ if (list_empty(&upper->lower)) { - list_add_tail(&upper->lower, - &cache->pending[upper->level]); + list_add_tail(&upper->lower, &cache->leaves); upper->lowest = 1; } } + drop_backref_node(cache, node); } +static void update_backref_node(struct backref_cache *cache, + struct backref_node *node, u64 bytenr) +{ + struct rb_node *rb_node; + rb_erase(&node->rb_node, &cache->rb_root); + node->bytenr = bytenr; + rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node); + BUG_ON(rb_node); +} + +/* + * update backref cache after a transaction commit + */ +static int update_backref_cache(struct btrfs_trans_handle *trans, + struct backref_cache *cache) +{ + struct backref_node *node; + int level = 0; + + if (cache->last_trans == 0) { + cache->last_trans = trans->transid; + return 0; + } + + if (cache->last_trans == trans->transid) + return 0; + + /* + * detached nodes are used to avoid unnecessary backref + * lookup. transaction commit changes the extent tree. + * so the detached nodes are no longer useful. + */ + while (!list_empty(&cache->detached)) { + node = list_entry(cache->detached.next, + struct backref_node, list); + remove_backref_node(cache, node); + } + + while (!list_empty(&cache->changed)) { + node = list_entry(cache->changed.next, + struct backref_node, list); + list_del_init(&node->list); + BUG_ON(node->pending); + update_backref_node(cache, node, node->new_bytenr); + } + + /* + * some nodes can be left in the pending list if there were + * errors during processing the pending nodes. + */ + for (level = 0; level < BTRFS_MAX_LEVEL; level++) { + list_for_each_entry(node, &cache->pending[level], list) { + BUG_ON(!node->pending); + if (node->bytenr == node->new_bytenr) + continue; + update_backref_node(cache, node, node->new_bytenr); + } + } + + cache->last_trans = 0; + return 1; +} + +static int should_ignore_root(struct btrfs_root *root) +{ + struct btrfs_root *reloc_root; + + if (!root->ref_cows) + return 0; + + reloc_root = root->reloc_root; + if (!reloc_root) + return 0; + + if (btrfs_root_last_snapshot(&reloc_root->root_item) == + root->fs_info->running_transaction->transid - 1) + return 0; + /* + * if there is reloc tree and it was created in previous + * transaction backref lookup can find the reloc tree, + * so backref node for the fs tree root is useless for + * relocation. + */ + return 1; +} + /* * find reloc tree by address of tree root */ @@ -453,11 +645,12 @@ int find_inline_backref(struct extent_buffer *leaf, int slot, * for all upper level blocks that directly/indirectly reference the * block are also cached. 
*/ -static struct backref_node *build_backref_tree(struct reloc_control *rc, - struct backref_cache *cache, - struct btrfs_key *node_key, - int level, u64 bytenr) +static noinline_for_stack +struct backref_node *build_backref_tree(struct reloc_control *rc, + struct btrfs_key *node_key, + int level, u64 bytenr) { + struct backref_cache *cache = &rc->backref_cache; struct btrfs_path *path1; struct btrfs_path *path2; struct extent_buffer *eb; @@ -473,6 +666,8 @@ static struct backref_node *build_backref_tree(struct reloc_control *rc, unsigned long end; unsigned long ptr; LIST_HEAD(list); + LIST_HEAD(useless); + int cowonly; int ret; int err = 0; @@ -483,15 +678,13 @@ static struct backref_node *build_backref_tree(struct reloc_control *rc, goto out; } - node = kmalloc(sizeof(*node), GFP_NOFS); + node = alloc_backref_node(cache); if (!node) { err = -ENOMEM; goto out; } - backref_node_init(node); node->bytenr = bytenr; - node->owner = 0; node->level = level; node->lowest = 1; cur = node; @@ -587,17 +780,21 @@ again: #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 if (key.type == BTRFS_SHARED_BLOCK_REF_KEY || key.type == BTRFS_EXTENT_REF_V0_KEY) { - if (key.objectid == key.offset && - key.type == BTRFS_EXTENT_REF_V0_KEY) { + if (key.type == BTRFS_EXTENT_REF_V0_KEY) { struct btrfs_extent_ref_v0 *ref0; ref0 = btrfs_item_ptr(eb, path1->slots[0], struct btrfs_extent_ref_v0); - root = find_tree_root(rc, eb, ref0); - if (root) - cur->root = root; - else - cur->old_root = 1; - break; + if (key.objectid == key.offset) { + root = find_tree_root(rc, eb, ref0); + if (root && !should_ignore_root(root)) + cur->root = root; + else + list_add(&cur->list, &useless); + break; + } + if (is_cowonly_root(btrfs_ref_root_v0(eb, + ref0))) + cur->cowonly = 1; } #else BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY); @@ -614,22 +811,20 @@ again: break; } - edge = kzalloc(sizeof(*edge), GFP_NOFS); + edge = alloc_backref_edge(cache); if (!edge) { err = -ENOMEM; goto out; } rb_node = tree_search(&cache->rb_root, key.offset); if (!rb_node) { - upper = kmalloc(sizeof(*upper), GFP_NOFS); + upper = alloc_backref_node(cache); if (!upper) { - kfree(edge); + free_backref_edge(cache, edge); err = -ENOMEM; goto out; } - backref_node_init(upper); upper->bytenr = key.offset; - upper->owner = 0; upper->level = cur->level + 1; /* * backrefs for the upper level block isn't @@ -639,11 +834,12 @@ again: } else { upper = rb_entry(rb_node, struct backref_node, rb_node); + BUG_ON(!upper->checked); INIT_LIST_HEAD(&edge->list[UPPER]); } - list_add(&edge->list[LOWER], &cur->upper); - edge->node[UPPER] = upper; + list_add_tail(&edge->list[LOWER], &cur->upper); edge->node[LOWER] = cur; + edge->node[UPPER] = upper; goto next; } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) { @@ -657,11 +853,17 @@ again: goto out; } + if (!root->ref_cows) + cur->cowonly = 1; + if (btrfs_root_level(&root->root_item) == cur->level) { /* tree root */ BUG_ON(btrfs_root_bytenr(&root->root_item) != cur->bytenr); - cur->root = root; + if (should_ignore_root(root)) + list_add(&cur->list, &useless); + else + cur->root = root; break; } @@ -692,11 +894,14 @@ again: if (!path2->nodes[level]) { BUG_ON(btrfs_root_bytenr(&root->root_item) != lower->bytenr); - lower->root = root; + if (should_ignore_root(root)) + list_add(&lower->list, &useless); + else + lower->root = root; break; } - edge = kzalloc(sizeof(*edge), GFP_NOFS); + edge = alloc_backref_edge(cache); if (!edge) { err = -ENOMEM; goto out; @@ -705,16 +910,17 @@ again: eb = path2->nodes[level]; rb_node = tree_search(&cache->rb_root, 
eb->start); if (!rb_node) { - upper = kmalloc(sizeof(*upper), GFP_NOFS); + upper = alloc_backref_node(cache); if (!upper) { - kfree(edge); + free_backref_edge(cache, edge); err = -ENOMEM; goto out; } - backref_node_init(upper); upper->bytenr = eb->start; upper->owner = btrfs_header_owner(eb); upper->level = lower->level + 1; + if (!root->ref_cows) + upper->cowonly = 1; /* * if we know the block isn't shared @@ -744,10 +950,12 @@ again: rb_node); BUG_ON(!upper->checked); INIT_LIST_HEAD(&edge->list[UPPER]); + if (!upper->owner) + upper->owner = btrfs_header_owner(eb); } list_add_tail(&edge->list[LOWER], &lower->upper); - edge->node[UPPER] = upper; edge->node[LOWER] = lower; + edge->node[UPPER] = upper; if (rb_node) break; @@ -785,8 +993,13 @@ next: * into the cache. */ BUG_ON(!node->checked); - rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node); - BUG_ON(rb_node); + cowonly = node->cowonly; + if (!cowonly) { + rb_node = tree_insert(&cache->rb_root, node->bytenr, + &node->rb_node); + BUG_ON(rb_node); + list_add_tail(&node->lower, &cache->leaves); + } list_for_each_entry(edge, &node->upper, list[LOWER]) list_add_tail(&edge->list[UPPER], &list); @@ -795,6 +1008,14 @@ next: edge = list_entry(list.next, struct backref_edge, list[UPPER]); list_del_init(&edge->list[UPPER]); upper = edge->node[UPPER]; + if (upper->detached) { + list_del(&edge->list[LOWER]); + lower = edge->node[LOWER]; + free_backref_edge(cache, edge); + if (list_empty(&lower->upper)) + list_add(&lower->list, &useless); + continue; + } if (!RB_EMPTY_NODE(&upper->rb_node)) { if (upper->lowest) { @@ -807,25 +1028,69 @@ next: } BUG_ON(!upper->checked); - rb_node = tree_insert(&cache->rb_root, upper->bytenr, - &upper->rb_node); - BUG_ON(rb_node); + BUG_ON(cowonly != upper->cowonly); + if (!cowonly) { + rb_node = tree_insert(&cache->rb_root, upper->bytenr, + &upper->rb_node); + BUG_ON(rb_node); + } list_add_tail(&edge->list[UPPER], &upper->lower); list_for_each_entry(edge, &upper->upper, list[LOWER]) list_add_tail(&edge->list[UPPER], &list); } + /* + * process useless backref nodes. backref nodes for tree leaves + * are deleted from the cache. backref nodes for upper level + * tree blocks are left in the cache to avoid unnecessary backref + * lookup. 
+ */ + while (!list_empty(&useless)) { + upper = list_entry(useless.next, struct backref_node, list); + list_del_init(&upper->list); + BUG_ON(!list_empty(&upper->upper)); + if (upper == node) + node = NULL; + if (upper->lowest) { + list_del_init(&upper->lower); + upper->lowest = 0; + } + while (!list_empty(&upper->lower)) { + edge = list_entry(upper->lower.next, + struct backref_edge, list[UPPER]); + list_del(&edge->list[UPPER]); + list_del(&edge->list[LOWER]); + lower = edge->node[LOWER]; + free_backref_edge(cache, edge); + + if (list_empty(&lower->upper)) + list_add(&lower->list, &useless); + } + __mark_block_processed(rc, upper); + if (upper->level > 0) { + list_add(&upper->list, &cache->detached); + upper->detached = 1; + } else { + rb_erase(&upper->rb_node, &cache->rb_root); + free_backref_node(cache, upper); + } + } out: btrfs_free_path(path1); btrfs_free_path(path2); if (err) { - INIT_LIST_HEAD(&list); + while (!list_empty(&useless)) { + lower = list_entry(useless.next, + struct backref_node, upper); + list_del_init(&lower->upper); + } upper = node; + INIT_LIST_HEAD(&list); while (upper) { if (RB_EMPTY_NODE(&upper->rb_node)) { list_splice_tail(&upper->upper, &list); - kfree(upper); + free_backref_node(cache, upper); } if (list_empty(&list)) @@ -833,15 +1098,104 @@ out: edge = list_entry(list.next, struct backref_edge, list[LOWER]); + list_del(&edge->list[LOWER]); upper = edge->node[UPPER]; - kfree(edge); + free_backref_edge(cache, edge); } return ERR_PTR(err); } + BUG_ON(node && node->detached); return node; } /* + * helper to add backref node for the newly created snapshot. + * the backref node is created by cloning backref node that + * corresponds to root of source tree + */ +static int clone_backref_node(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct btrfs_root *src, + struct btrfs_root *dest) +{ + struct btrfs_root *reloc_root = src->reloc_root; + struct backref_cache *cache = &rc->backref_cache; + struct backref_node *node = NULL; + struct backref_node *new_node; + struct backref_edge *edge; + struct backref_edge *new_edge; + struct rb_node *rb_node; + + if (cache->last_trans > 0) + update_backref_cache(trans, cache); + + rb_node = tree_search(&cache->rb_root, src->commit_root->start); + if (rb_node) { + node = rb_entry(rb_node, struct backref_node, rb_node); + if (node->detached) + node = NULL; + else + BUG_ON(node->new_bytenr != reloc_root->node->start); + } + + if (!node) { + rb_node = tree_search(&cache->rb_root, + reloc_root->commit_root->start); + if (rb_node) { + node = rb_entry(rb_node, struct backref_node, + rb_node); + BUG_ON(node->detached); + } + } + + if (!node) + return 0; + + new_node = alloc_backref_node(cache); + if (!new_node) + return -ENOMEM; + + new_node->bytenr = dest->node->start; + new_node->level = node->level; + new_node->lowest = node->lowest; + new_node->root = dest; + + if (!node->lowest) { + list_for_each_entry(edge, &node->lower, list[UPPER]) { + new_edge = alloc_backref_edge(cache); + if (!new_edge) + goto fail; + + new_edge->node[UPPER] = new_node; + new_edge->node[LOWER] = edge->node[LOWER]; + list_add_tail(&new_edge->list[UPPER], + &new_node->lower); + } + } + + rb_node = tree_insert(&cache->rb_root, new_node->bytenr, + &new_node->rb_node); + BUG_ON(rb_node); + + if (!new_node->lowest) { + list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) { + list_add_tail(&new_edge->list[LOWER], + &new_edge->node[LOWER]->upper); + } + } + return 0; +fail: + while (!list_empty(&new_node->lower)) { + new_edge = 
list_entry(new_node->lower.next, + struct backref_edge, list[UPPER]); + list_del(&new_edge->list[UPPER]); + free_backref_edge(cache, new_edge); + } + free_backref_node(cache, new_node); + return -ENOMEM; +} + +/* * helper to add 'address of tree root -> reloc tree' mapping */ static int __add_reloc_root(struct btrfs_root *root) @@ -901,12 +1255,8 @@ static int __update_reloc_root(struct btrfs_root *root, int del) return 0; } -/* - * create reloc tree for a given fs tree. reloc tree is just a - * snapshot of the fs tree with special root objectid. - */ -int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, - struct btrfs_root *root) +static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 objectid) { struct btrfs_root *reloc_root; struct extent_buffer *eb; @@ -914,36 +1264,45 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_key root_key; int ret; - if (root->reloc_root) { - reloc_root = root->reloc_root; - reloc_root->last_trans = trans->transid; - return 0; - } - - if (!root->fs_info->reloc_ctl || - !root->fs_info->reloc_ctl->create_reloc_root || - root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) - return 0; - root_item = kmalloc(sizeof(*root_item), GFP_NOFS); BUG_ON(!root_item); root_key.objectid = BTRFS_TREE_RELOC_OBJECTID; root_key.type = BTRFS_ROOT_ITEM_KEY; - root_key.offset = root->root_key.objectid; + root_key.offset = objectid; - ret = btrfs_copy_root(trans, root, root->commit_root, &eb, - BTRFS_TREE_RELOC_OBJECTID); - BUG_ON(ret); + if (root->root_key.objectid == objectid) { + /* called by btrfs_init_reloc_root */ + ret = btrfs_copy_root(trans, root, root->commit_root, &eb, + BTRFS_TREE_RELOC_OBJECTID); + BUG_ON(ret); + + btrfs_set_root_last_snapshot(&root->root_item, + trans->transid - 1); + } else { + /* + * called by btrfs_reloc_post_snapshot_hook. + * the source tree is a reloc tree, all tree blocks + * modified after it was created have RELOC flag + * set in their headers. so it's OK to not update + * the 'last_snapshot'. + */ + ret = btrfs_copy_root(trans, root, root->node, &eb, + BTRFS_TREE_RELOC_OBJECTID); + BUG_ON(ret); + } - btrfs_set_root_last_snapshot(&root->root_item, trans->transid - 1); memcpy(root_item, &root->root_item, sizeof(*root_item)); - btrfs_set_root_refs(root_item, 1); btrfs_set_root_bytenr(root_item, eb->start); btrfs_set_root_level(root_item, btrfs_header_level(eb)); btrfs_set_root_generation(root_item, trans->transid); - memset(&root_item->drop_progress, 0, sizeof(struct btrfs_disk_key)); - root_item->drop_level = 0; + + if (root->root_key.objectid == objectid) { + btrfs_set_root_refs(root_item, 0); + memset(&root_item->drop_progress, 0, + sizeof(struct btrfs_disk_key)); + root_item->drop_level = 0; + } btrfs_tree_unlock(eb); free_extent_buffer(eb); @@ -957,6 +1316,37 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, &root_key); BUG_ON(IS_ERR(reloc_root)); reloc_root->last_trans = trans->transid; + return reloc_root; +} + +/* + * create reloc tree for a given fs tree. reloc tree is just a + * snapshot of the fs tree with special root objectid. 
+ */ +int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_root *reloc_root; + struct reloc_control *rc = root->fs_info->reloc_ctl; + int clear_rsv = 0; + + if (root->reloc_root) { + reloc_root = root->reloc_root; + reloc_root->last_trans = trans->transid; + return 0; + } + + if (!rc || !rc->create_reloc_tree || + root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) + return 0; + + if (!trans->block_rsv) { + trans->block_rsv = rc->block_rsv; + clear_rsv = 1; + } + reloc_root = create_reloc_root(trans, root, root->root_key.objectid); + if (clear_rsv) + trans->block_rsv = NULL; __add_reloc_root(reloc_root); root->reloc_root = reloc_root; @@ -980,7 +1370,8 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, reloc_root = root->reloc_root; root_item = &reloc_root->root_item; - if (btrfs_root_refs(root_item) == 0) { + if (root->fs_info->reloc_ctl->merge_reloc_tree && + btrfs_root_refs(root_item) == 0) { root->reloc_root = NULL; del = 1; } @@ -1102,8 +1493,7 @@ static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr, goto out; } - if (new_bytenr) - *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); ret = 0; out: btrfs_free_path(path); @@ -1114,19 +1504,18 @@ out: * update file extent items in the tree leaf to point to * the new locations. */ -static int replace_file_extents(struct btrfs_trans_handle *trans, - struct reloc_control *rc, - struct btrfs_root *root, - struct extent_buffer *leaf, - struct list_head *inode_list) +static noinline_for_stack +int replace_file_extents(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct btrfs_root *root, + struct extent_buffer *leaf) { struct btrfs_key key; struct btrfs_file_extent_item *fi; struct inode *inode = NULL; - struct inodevec *ivec = NULL; u64 parent; u64 bytenr; - u64 new_bytenr; + u64 new_bytenr = 0; u64 num_bytes; u64 end; u32 nritems; @@ -1166,21 +1555,12 @@ static int replace_file_extents(struct btrfs_trans_handle *trans, * to complete and drop the extent cache */ if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { - if (!ivec || ivec->nr == INODEVEC_SIZE) { - ivec = kmalloc(sizeof(*ivec), GFP_NOFS); - BUG_ON(!ivec); - ivec->nr = 0; - list_add_tail(&ivec->list, inode_list); - } if (first) { inode = find_next_inode(root, key.objectid); - if (inode) - ivec->inode[ivec->nr++] = inode; first = 0; } else if (inode && inode->i_ino < key.objectid) { + btrfs_add_delayed_iput(inode); inode = find_next_inode(root, key.objectid); - if (inode) - ivec->inode[ivec->nr++] = inode; } if (inode && inode->i_ino == key.objectid) { end = key.offset + @@ -1204,8 +1584,10 @@ static int replace_file_extents(struct btrfs_trans_handle *trans, ret = get_new_location(rc->data_inode, &new_bytenr, bytenr, num_bytes); - if (ret > 0) + if (ret > 0) { + WARN_ON(1); continue; + } BUG_ON(ret < 0); btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr); @@ -1225,6 +1607,8 @@ static int replace_file_extents(struct btrfs_trans_handle *trans, } if (dirty) btrfs_mark_buffer_dirty(leaf); + if (inode) + btrfs_add_delayed_iput(inode); return 0; } @@ -1248,11 +1632,11 @@ int memcmp_node_keys(struct extent_buffer *eb, int slot, * if no block got replaced, 0 is returned. if there are other * errors, a negative error number is returned. 
*/ -static int replace_path(struct btrfs_trans_handle *trans, - struct btrfs_root *dest, struct btrfs_root *src, - struct btrfs_path *path, struct btrfs_key *next_key, - struct extent_buffer **leaf, - int lowest_level, int max_level) +static noinline_for_stack +int replace_path(struct btrfs_trans_handle *trans, + struct btrfs_root *dest, struct btrfs_root *src, + struct btrfs_path *path, struct btrfs_key *next_key, + int lowest_level, int max_level) { struct extent_buffer *eb; struct extent_buffer *parent; @@ -1263,16 +1647,16 @@ static int replace_path(struct btrfs_trans_handle *trans, u64 new_ptr_gen; u64 last_snapshot; u32 blocksize; + int cow = 0; int level; int ret; int slot; BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID); - BUG_ON(lowest_level > 1 && leaf); last_snapshot = btrfs_root_last_snapshot(&src->root_item); - +again: slot = path->slots[lowest_level]; btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot); @@ -1286,8 +1670,10 @@ static int replace_path(struct btrfs_trans_handle *trans, return 0; } - ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb); - BUG_ON(ret); + if (cow) { + ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb); + BUG_ON(ret); + } btrfs_set_lock_blocking(eb); if (next_key) { @@ -1331,7 +1717,7 @@ static int replace_path(struct btrfs_trans_handle *trans, if (new_bytenr == 0 || old_ptr_gen > last_snapshot || memcmp_node_keys(parent, slot, path, level)) { - if (level <= lowest_level && !leaf) { + if (level <= lowest_level) { ret = 0; break; } @@ -1339,16 +1725,12 @@ static int replace_path(struct btrfs_trans_handle *trans, eb = read_tree_block(dest, old_bytenr, blocksize, old_ptr_gen); btrfs_tree_lock(eb); - ret = btrfs_cow_block(trans, dest, eb, parent, - slot, &eb); - BUG_ON(ret); - btrfs_set_lock_blocking(eb); - - if (level <= lowest_level) { - *leaf = eb; - ret = 0; - break; + if (cow) { + ret = btrfs_cow_block(trans, dest, eb, parent, + slot, &eb); + BUG_ON(ret); } + btrfs_set_lock_blocking(eb); btrfs_tree_unlock(parent); free_extent_buffer(parent); @@ -1357,6 +1739,13 @@ static int replace_path(struct btrfs_trans_handle *trans, continue; } + if (!cow) { + btrfs_tree_unlock(parent); + free_extent_buffer(parent); + cow = 1; + goto again; + } + btrfs_node_key_to_cpu(path->nodes[level], &key, path->slots[level]); btrfs_release_path(src, path); @@ -1562,20 +1951,6 @@ static int invalidate_extent_cache(struct btrfs_root *root, return 0; } -static void put_inodes(struct list_head *list) -{ - struct inodevec *ivec; - while (!list_empty(list)) { - ivec = list_entry(list->next, struct inodevec, list); - list_del(&ivec->list); - while (ivec->nr > 0) { - ivec->nr--; - iput(ivec->inode[ivec->nr]); - } - kfree(ivec); - } -} - static int find_next_key(struct btrfs_path *path, int level, struct btrfs_key *key) @@ -1608,13 +1983,14 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, struct btrfs_root *reloc_root; struct btrfs_root_item *root_item; struct btrfs_path *path; - struct extent_buffer *leaf = NULL; + struct extent_buffer *leaf; unsigned long nr; int level; int max_level; int replaced = 0; int ret; int err = 0; + u32 min_reserved; path = btrfs_alloc_path(); if (!path) @@ -1648,34 +2024,23 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, btrfs_unlock_up_safe(path, 0); } - if (level == 0 && rc->stage == UPDATE_DATA_PTRS) { - trans = btrfs_start_transaction(root, 1); + min_reserved = root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2; + 
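As an illustration rather than patch content: merge_reloc_root() above sizes its reservation as nodesize * (BTRFS_MAX_LEVEL - 1) * 2 and commits-and-retries whenever btrfs_block_rsv_check() reports -EAGAIN. A standalone sketch of that sizing and retry loop; the 4096-byte node size and the simulate_check() helper are invented stand-ins, not kernel APIs.

#include <errno.h>
#include <stdio.h>

#define BTRFS_MAX_LEVEL 8       /* as defined in the btrfs headers */

static int attempts;

/* pretend the reservation is short on the first pass only */
static int simulate_check(unsigned long needed)
{
        (void)needed;
        return (attempts++ == 0) ? -EAGAIN : 0;
}

int main(void)
{
        unsigned long nodesize = 4096;  /* example value; the real one comes from the fs */
        unsigned long min_reserved = nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
        int ret;

        /* 4096 * 7 * 2 = 57344 bytes: roughly room to COW a full tree path, twice */
        printf("min_reserved = %lu\n", min_reserved);

        while (1) {
                ret = simulate_check(min_reserved);      /* btrfs_block_rsv_check() */
                if (ret == -EAGAIN) {
                        printf("short on space: commit and retry\n");
                        continue;                        /* btrfs_commit_transaction() */
                }
                printf("reservation ok, merge one pass\n");
                break;
        }
        return 0;
}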
memset(&next_key, 0, sizeof(next_key)); - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &key, 0); - btrfs_release_path(reloc_root, path); + while (1) { + trans = btrfs_start_transaction(root, 0); + trans->block_rsv = rc->block_rsv; - ret = btrfs_search_slot(trans, root, &key, path, 0, 1); - if (ret < 0) { - err = ret; - goto out; + ret = btrfs_block_rsv_check(trans, root, rc->block_rsv, + min_reserved, 0); + if (ret) { + BUG_ON(ret != -EAGAIN); + ret = btrfs_commit_transaction(trans, root); + BUG_ON(ret); + continue; } - leaf = path->nodes[0]; - btrfs_unlock_up_safe(path, 1); - ret = replace_file_extents(trans, rc, root, leaf, - &inode_list); - if (ret < 0) - err = ret; - goto out; - } - - memset(&next_key, 0, sizeof(next_key)); - - while (1) { - leaf = NULL; replaced = 0; - trans = btrfs_start_transaction(root, 1); max_level = level; ret = walk_down_reloc_tree(reloc_root, path, &level); @@ -1689,14 +2054,9 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, if (!find_next_key(path, level, &key) && btrfs_comp_cpu_keys(&next_key, &key) >= 0) { ret = 0; - } else if (level == 1 && rc->stage == UPDATE_DATA_PTRS) { - ret = replace_path(trans, root, reloc_root, - path, &next_key, &leaf, - level, max_level); } else { - ret = replace_path(trans, root, reloc_root, - path, &next_key, NULL, - level, max_level); + ret = replace_path(trans, root, reloc_root, path, + &next_key, level, max_level); } if (ret < 0) { err = ret; @@ -1708,16 +2068,6 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, btrfs_node_key_to_cpu(path->nodes[level], &key, path->slots[level]); replaced = 1; - } else if (leaf) { - /* - * no block got replaced, try replacing file extents - */ - btrfs_item_key_to_cpu(leaf, &key, 0); - ret = replace_file_extents(trans, rc, root, leaf, - &inode_list); - btrfs_tree_unlock(leaf); - free_extent_buffer(leaf); - BUG_ON(ret < 0); } ret = walk_up_reloc_tree(reloc_root, path, &level); @@ -1734,15 +2084,10 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, root_item->drop_level = level; nr = trans->blocks_used; - btrfs_end_transaction(trans, root); + btrfs_end_transaction_throttle(trans, root); btrfs_btree_balance_dirty(root, nr); - /* - * put inodes outside transaction, otherwise we may deadlock. - */ - put_inodes(&inode_list); - if (replaced && rc->stage == UPDATE_DATA_PTRS) invalidate_extent_cache(root, &key, &next_key); } @@ -1765,87 +2110,125 @@ out: sizeof(root_item->drop_progress)); root_item->drop_level = 0; btrfs_set_root_refs(root_item, 0); + btrfs_update_reloc_root(trans, root); } nr = trans->blocks_used; - btrfs_end_transaction(trans, root); + btrfs_end_transaction_throttle(trans, root); btrfs_btree_balance_dirty(root, nr); - put_inodes(&inode_list); - if (replaced && rc->stage == UPDATE_DATA_PTRS) invalidate_extent_cache(root, &key, &next_key); return err; } -/* - * callback for the work threads. - * this function merges reloc tree with corresponding fs tree, - * and then drops the reloc tree. 
- */ -static void merge_func(struct btrfs_work *work) +static noinline_for_stack +int prepare_to_merge(struct reloc_control *rc, int err) { - struct btrfs_trans_handle *trans; - struct btrfs_root *root; + struct btrfs_root *root = rc->extent_root; struct btrfs_root *reloc_root; - struct async_merge *async; + struct btrfs_trans_handle *trans; + LIST_HEAD(reloc_roots); + u64 num_bytes = 0; + int ret; + int retries = 0; + + mutex_lock(&root->fs_info->trans_mutex); + rc->merging_rsv_size += root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2; + rc->merging_rsv_size += rc->nodes_relocated * 2; + mutex_unlock(&root->fs_info->trans_mutex); +again: + if (!err) { + num_bytes = rc->merging_rsv_size; + ret = btrfs_block_rsv_add(NULL, root, rc->block_rsv, + num_bytes, &retries); + if (ret) + err = ret; + } + + trans = btrfs_join_transaction(rc->extent_root, 1); + + if (!err) { + if (num_bytes != rc->merging_rsv_size) { + btrfs_end_transaction(trans, rc->extent_root); + btrfs_block_rsv_release(rc->extent_root, + rc->block_rsv, num_bytes); + retries = 0; + goto again; + } + } - async = container_of(work, struct async_merge, work); - reloc_root = async->root; + rc->merge_reloc_tree = 1; + + while (!list_empty(&rc->reloc_roots)) { + reloc_root = list_entry(rc->reloc_roots.next, + struct btrfs_root, root_list); + list_del_init(&reloc_root->root_list); - if (btrfs_root_refs(&reloc_root->root_item) > 0) { root = read_fs_root(reloc_root->fs_info, reloc_root->root_key.offset); BUG_ON(IS_ERR(root)); BUG_ON(root->reloc_root != reloc_root); - merge_reloc_root(async->rc, root); - - trans = btrfs_start_transaction(root, 1); + /* + * set reference count to 1, so btrfs_recover_relocation + * knows it should resumes merging + */ + if (!err) + btrfs_set_root_refs(&reloc_root->root_item, 1); btrfs_update_reloc_root(trans, root); - btrfs_end_transaction(trans, root); - } - btrfs_drop_snapshot(reloc_root, 0); + list_add(&reloc_root->root_list, &reloc_roots); + } - if (atomic_dec_and_test(async->num_pending)) - complete(async->done); + list_splice(&reloc_roots, &rc->reloc_roots); - kfree(async); + if (!err) + btrfs_commit_transaction(trans, rc->extent_root); + else + btrfs_end_transaction(trans, rc->extent_root); + return err; } -static int merge_reloc_roots(struct reloc_control *rc) +static noinline_for_stack +int merge_reloc_roots(struct reloc_control *rc) { - struct async_merge *async; struct btrfs_root *root; - struct completion done; - atomic_t num_pending; + struct btrfs_root *reloc_root; + LIST_HEAD(reloc_roots); + int found = 0; + int ret; +again: + root = rc->extent_root; + mutex_lock(&root->fs_info->trans_mutex); + list_splice_init(&rc->reloc_roots, &reloc_roots); + mutex_unlock(&root->fs_info->trans_mutex); - init_completion(&done); - atomic_set(&num_pending, 1); + while (!list_empty(&reloc_roots)) { + found = 1; + reloc_root = list_entry(reloc_roots.next, + struct btrfs_root, root_list); - while (!list_empty(&rc->reloc_roots)) { - root = list_entry(rc->reloc_roots.next, - struct btrfs_root, root_list); - list_del_init(&root->root_list); + if (btrfs_root_refs(&reloc_root->root_item) > 0) { + root = read_fs_root(reloc_root->fs_info, + reloc_root->root_key.offset); + BUG_ON(IS_ERR(root)); + BUG_ON(root->reloc_root != reloc_root); - async = kmalloc(sizeof(*async), GFP_NOFS); - BUG_ON(!async); - async->work.func = merge_func; - async->work.flags = 0; - async->rc = rc; - async->root = root; - async->done = &done; - async->num_pending = &num_pending; - atomic_inc(&num_pending); - btrfs_queue_worker(&rc->workers, &async->work); 
+ ret = merge_reloc_root(rc, root); + BUG_ON(ret); + } else { + list_del_init(&reloc_root->root_list); + } + btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0); } - if (!atomic_dec_and_test(&num_pending)) - wait_for_completion(&done); - + if (found) { + found = 0; + goto again; + } BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root)); return 0; } @@ -1876,119 +2259,169 @@ static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans, return btrfs_record_root_in_trans(trans, root); } -/* - * select one tree from trees that references the block. - * for blocks in refernce counted trees, we preper reloc tree. - * if no reloc tree found and reloc_only is true, NULL is returned. - */ -static struct btrfs_root *__select_one_root(struct btrfs_trans_handle *trans, - struct backref_node *node, - struct backref_edge *edges[], - int *nr, int reloc_only) +static noinline_for_stack +struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct backref_node *node, + struct backref_edge *edges[], int *nr) { struct backref_node *next; struct btrfs_root *root; - int index; - int loop = 0; -again: - index = 0; + int index = 0; + next = node; while (1) { cond_resched(); next = walk_up_backref(next, edges, &index); root = next->root; - if (!root) { - BUG_ON(!node->old_root); - goto skip; - } - - /* no other choice for non-refernce counted tree */ - if (!root->ref_cows) { - BUG_ON(reloc_only); - break; - } + BUG_ON(!root); + BUG_ON(!root->ref_cows); if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { record_reloc_root_in_trans(trans, root); break; } - if (loop) { - btrfs_record_root_in_trans(trans, root); + btrfs_record_root_in_trans(trans, root); + root = root->reloc_root; + + if (next->new_bytenr != root->node->start) { + BUG_ON(next->new_bytenr); + BUG_ON(!list_empty(&next->list)); + next->new_bytenr = root->node->start; + next->root = root; + list_add_tail(&next->list, + &rc->backref_cache.changed); + __mark_block_processed(rc, next); break; } - if (reloc_only || next != node) { - if (!root->reloc_root) - btrfs_record_root_in_trans(trans, root); - root = root->reloc_root; - /* - * if the reloc tree was created in current - * transation, there is no node in backref tree - * corresponds to the root of the reloc tree. - */ - if (btrfs_root_last_snapshot(&root->root_item) == - trans->transid - 1) - break; - } -skip: + WARN_ON(1); root = NULL; next = walk_down_backref(edges, &index); if (!next || next->level <= node->level) break; } + if (!root) + return NULL; - if (!root && !loop && !reloc_only) { - loop = 1; - goto again; + *nr = index; + next = node; + /* setup backref node path for btrfs_reloc_cow_block */ + while (1) { + rc->backref_cache.path[next->level] = next; + if (--index < 0) + break; + next = edges[index]->node[UPPER]; } - - if (root) - *nr = index; - else - *nr = 0; - return root; } +/* + * select a tree root for relocation. return NULL if the block + * is reference counted. we should use do_relocation() in this + * case. return a tree root pointer if the block isn't reference + * counted. return -ENOENT if the block is root of reloc tree. 
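merge_reloc_roots() above drains rc->reloc_roots by splicing the whole list out under trans_mutex, processing the private copy unlocked, and looping until a full pass finds nothing, since merging can put roots back on the list. A small sketch of that drain-and-retry idiom, with stand-in types and a pthread mutex in place of trans_mutex:

#include <pthread.h>
#include <stdio.h>

struct node { struct node *next; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *pending;	/* may be refilled while we work */

static void process(struct node *n) { (void)n; }

static void drain_all(void)
{
	struct node *batch;
	int found;

again:
	found = 0;
	pthread_mutex_lock(&lock);
	batch = pending;		/* splice the list out... */
	pending = NULL;
	pthread_mutex_unlock(&lock);	/* ...then work on it unlocked */

	while (batch) {
		struct node *n = batch;

		batch = n->next;
		process(n);
		found = 1;
	}
	if (found)			/* entries may have been re-added */
		goto again;
}

int main(void)
{
	drain_all();
	printf("drained\n");
	return 0;
}
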
+ */ static noinline_for_stack struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans, struct backref_node *node) { + struct backref_node *next; + struct btrfs_root *root; + struct btrfs_root *fs_root = NULL; struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; - int nr; - return __select_one_root(trans, node, edges, &nr, 0); + int index = 0; + + next = node; + while (1) { + cond_resched(); + next = walk_up_backref(next, edges, &index); + root = next->root; + BUG_ON(!root); + + /* no other choice for non-reference counted tree */ + if (!root->ref_cows) + return root; + + if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) + fs_root = root; + + if (next != node) + return NULL; + + next = walk_down_backref(edges, &index); + if (!next || next->level <= node->level) + break; + } + + if (!fs_root) + return ERR_PTR(-ENOENT); + return fs_root; } static noinline_for_stack -struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, - struct backref_node *node, - struct backref_edge *edges[], int *nr) +u64 calcu_metadata_size(struct reloc_control *rc, + struct backref_node *node, int reserve) { - return __select_one_root(trans, node, edges, nr, 1); + struct backref_node *next = node; + struct backref_edge *edge; + struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; + u64 num_bytes = 0; + int index = 0; + + BUG_ON(reserve && node->processed); + + while (next) { + cond_resched(); + while (1) { + if (next->processed && (reserve || next != node)) + break; + + num_bytes += btrfs_level_size(rc->extent_root, + next->level); + + if (list_empty(&next->upper)) + break; + + edge = list_entry(next->upper.next, + struct backref_edge, list[LOWER]); + edges[index++] = edge; + next = edge->node[UPPER]; + } + next = walk_down_backref(edges, &index); + } + return num_bytes; } -static void grab_path_buffers(struct btrfs_path *path, - struct backref_node *node, - struct backref_edge *edges[], int nr) +static int reserve_metadata_space(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct backref_node *node) { - int i = 0; - while (1) { - drop_node_buffer(node); - node->eb = path->nodes[node->level]; - BUG_ON(!node->eb); - if (path->locks[node->level]) - node->locked = 1; - path->nodes[node->level] = NULL; - path->locks[node->level] = 0; - - if (i >= nr) - break; + struct btrfs_root *root = rc->extent_root; + u64 num_bytes; + int ret; + + num_bytes = calcu_metadata_size(rc, node, 1) * 2; - edges[i]->blockptr = node->eb->start; - node = edges[i]->node[UPPER]; - i++; + trans->block_rsv = rc->block_rsv; + ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes, + &rc->block_rsv_retries); + if (ret) { + if (ret == -EAGAIN) + rc->commit_transaction = 1; + return ret; } + + rc->block_rsv_retries = 0; + return 0; +} + +static void release_metadata_space(struct reloc_control *rc, + struct backref_node *node) +{ + u64 num_bytes = calcu_metadata_size(rc, node, 0) * 2; + btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, num_bytes); } /* @@ -1999,6 +2432,7 @@ static void grab_path_buffers(struct btrfs_path *path, * in that case this function just updates pointers. 
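calcu_metadata_size() above walks from a block toward the tree root, summing the size of every block that still has to be COWed, and reserve_metadata_space() doubles the result before charging it to the block reserve. A simplified model that uses a single parent pointer instead of the real backref edge lists (all types here are invented):

#include <stdio.h>

struct bnode { struct bnode *upper; int processed; unsigned int size; };

/* sum the sizes of the not-yet-processed blocks on the path upward */
static unsigned long long path_bytes(const struct bnode *n)
{
	unsigned long long bytes = 0;

	for (; n && !n->processed; n = n->upper)
		bytes += n->size;
	return bytes;
}

int main(void)
{
	struct bnode root = { NULL,  1, 4096 };	/* already processed */
	struct bnode mid  = { &root, 0, 4096 };
	struct bnode leaf = { &mid,  0, 4096 };

	/* doubled as in reserve_metadata_space(): half for COWing the
	 * reloc tree, half for COWing the fs tree */
	printf("reserve %llu bytes\n", path_bytes(&leaf) * 2);
	return 0;
}
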
*/ static int do_relocation(struct btrfs_trans_handle *trans, + struct reloc_control *rc, struct backref_node *node, struct btrfs_key *key, struct btrfs_path *path, int lowest) @@ -2019,18 +2453,25 @@ static int do_relocation(struct btrfs_trans_handle *trans, BUG_ON(lowest && node->eb); path->lowest_level = node->level + 1; + rc->backref_cache.path[node->level] = node; list_for_each_entry(edge, &node->upper, list[LOWER]) { cond_resched(); - if (node->eb && node->eb->start == edge->blockptr) - continue; upper = edge->node[UPPER]; - root = select_reloc_root(trans, upper, edges, &nr); - if (!root) - continue; - - if (upper->eb && !upper->locked) + root = select_reloc_root(trans, rc, upper, edges, &nr); + BUG_ON(!root); + + if (upper->eb && !upper->locked) { + if (!lowest) { + ret = btrfs_bin_search(upper->eb, key, + upper->level, &slot); + BUG_ON(ret); + bytenr = btrfs_node_blockptr(upper->eb, slot); + if (node->eb->start == bytenr) + goto next; + } drop_node_buffer(upper); + } if (!upper->eb) { ret = btrfs_search_slot(trans, root, key, path, 0, 1); @@ -2040,11 +2481,17 @@ static int do_relocation(struct btrfs_trans_handle *trans, } BUG_ON(ret > 0); - slot = path->slots[upper->level]; + if (!upper->eb) { + upper->eb = path->nodes[upper->level]; + path->nodes[upper->level] = NULL; + } else { + BUG_ON(upper->eb != path->nodes[upper->level]); + } - btrfs_unlock_up_safe(path, upper->level + 1); - grab_path_buffers(path, upper, edges, nr); + upper->locked = 1; + path->locks[upper->level] = 0; + slot = path->slots[upper->level]; btrfs_release_path(NULL, path); } else { ret = btrfs_bin_search(upper->eb, key, upper->level, @@ -2053,14 +2500,11 @@ static int do_relocation(struct btrfs_trans_handle *trans, } bytenr = btrfs_node_blockptr(upper->eb, slot); - if (!lowest) { - if (node->eb->start == bytenr) { - btrfs_tree_unlock(upper->eb); - upper->locked = 0; - continue; - } + if (lowest) { + BUG_ON(bytenr != node->bytenr); } else { - BUG_ON(node->bytenr != bytenr); + if (node->eb->start == bytenr) + goto next; } blocksize = btrfs_level_size(root, node->level); @@ -2072,13 +2516,13 @@ static int do_relocation(struct btrfs_trans_handle *trans, if (!node->eb) { ret = btrfs_cow_block(trans, root, eb, upper->eb, slot, &eb); + btrfs_tree_unlock(eb); + free_extent_buffer(eb); if (ret < 0) { err = ret; - break; + goto next; } - btrfs_set_lock_blocking(eb); - node->eb = eb; - node->locked = 1; + BUG_ON(node->eb != eb); } else { btrfs_set_node_blockptr(upper->eb, slot, node->eb->start); @@ -2096,67 +2540,80 @@ static int do_relocation(struct btrfs_trans_handle *trans, ret = btrfs_drop_subtree(trans, root, eb, upper->eb); BUG_ON(ret); } - if (!lowest) { - btrfs_tree_unlock(upper->eb); - upper->locked = 0; - } +next: + if (!upper->pending) + drop_node_buffer(upper); + else + unlock_node_buffer(upper); + if (err) + break; } + + if (!err && node->pending) { + drop_node_buffer(node); + list_move_tail(&node->list, &rc->backref_cache.changed); + node->pending = 0; + } + path->lowest_level = 0; + BUG_ON(err == -ENOSPC); return err; } static int link_to_upper(struct btrfs_trans_handle *trans, + struct reloc_control *rc, struct backref_node *node, struct btrfs_path *path) { struct btrfs_key key; - if (!node->eb || list_empty(&node->upper)) - return 0; btrfs_node_key_to_cpu(node->eb, &key, 0); - return do_relocation(trans, node, &key, path, 0); + return do_relocation(trans, rc, node, &key, path, 0); } static int finish_pending_nodes(struct btrfs_trans_handle *trans, - struct backref_cache *cache, - struct btrfs_path *path) 
+ struct reloc_control *rc, + struct btrfs_path *path, int err) { + LIST_HEAD(list); + struct backref_cache *cache = &rc->backref_cache; struct backref_node *node; int level; int ret; - int err = 0; for (level = 0; level < BTRFS_MAX_LEVEL; level++) { while (!list_empty(&cache->pending[level])) { node = list_entry(cache->pending[level].next, - struct backref_node, lower); - BUG_ON(node->level != level); + struct backref_node, list); + list_move_tail(&node->list, &list); + BUG_ON(!node->pending); - ret = link_to_upper(trans, node, path); - if (ret < 0) - err = ret; - /* - * this remove the node from the pending list and - * may add some other nodes to the level + 1 - * pending list - */ - remove_backref_node(cache, node); + if (!err) { + ret = link_to_upper(trans, rc, node, path); + if (ret < 0) + err = ret; + } } + list_splice_init(&list, &cache->pending[level]); } - BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root)); return err; } static void mark_block_processed(struct reloc_control *rc, - struct backref_node *node) + u64 bytenr, u32 blocksize) +{ + set_extent_bits(&rc->processed_blocks, bytenr, bytenr + blocksize - 1, + EXTENT_DIRTY, GFP_NOFS); +} + +static void __mark_block_processed(struct reloc_control *rc, + struct backref_node *node) { u32 blocksize; if (node->level == 0 || in_block_group(node->bytenr, rc->block_group)) { blocksize = btrfs_level_size(rc->extent_root, node->level); - set_extent_bits(&rc->processed_blocks, node->bytenr, - node->bytenr + blocksize - 1, EXTENT_DIRTY, - GFP_NOFS); + mark_block_processed(rc, node->bytenr, blocksize); } node->processed = 1; } @@ -2179,7 +2636,7 @@ static void update_processed_blocks(struct reloc_control *rc, if (next->processed) break; - mark_block_processed(rc, next); + __mark_block_processed(rc, next); if (list_empty(&next->upper)) break; @@ -2202,138 +2659,6 @@ static int tree_block_processed(u64 bytenr, u32 blocksize, return 0; } -/* - * check if there are any file extent pointers in the leaf point to - * data require processing - */ -static int check_file_extents(struct reloc_control *rc, - u64 bytenr, u32 blocksize, u64 ptr_gen) -{ - struct btrfs_key found_key; - struct btrfs_file_extent_item *fi; - struct extent_buffer *leaf; - u32 nritems; - int i; - int ret = 0; - - leaf = read_tree_block(rc->extent_root, bytenr, blocksize, ptr_gen); - - nritems = btrfs_header_nritems(leaf); - for (i = 0; i < nritems; i++) { - cond_resched(); - btrfs_item_key_to_cpu(leaf, &found_key, i); - if (found_key.type != BTRFS_EXTENT_DATA_KEY) - continue; - fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); - if (btrfs_file_extent_type(leaf, fi) == - BTRFS_FILE_EXTENT_INLINE) - continue; - bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); - if (bytenr == 0) - continue; - if (in_block_group(bytenr, rc->block_group)) { - ret = 1; - break; - } - } - free_extent_buffer(leaf); - return ret; -} - -/* - * scan child blocks of a given block to find blocks require processing - */ -static int add_child_blocks(struct btrfs_trans_handle *trans, - struct reloc_control *rc, - struct backref_node *node, - struct rb_root *blocks) -{ - struct tree_block *block; - struct rb_node *rb_node; - u64 bytenr; - u64 ptr_gen; - u32 blocksize; - u32 nritems; - int i; - int err = 0; - - nritems = btrfs_header_nritems(node->eb); - blocksize = btrfs_level_size(rc->extent_root, node->level - 1); - for (i = 0; i < nritems; i++) { - cond_resched(); - bytenr = btrfs_node_blockptr(node->eb, i); - ptr_gen = btrfs_node_ptr_generation(node->eb, i); - if (ptr_gen == trans->transid) - continue; - 
if (!in_block_group(bytenr, rc->block_group) && - (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS)) - continue; - if (tree_block_processed(bytenr, blocksize, rc)) - continue; - - readahead_tree_block(rc->extent_root, - bytenr, blocksize, ptr_gen); - } - - for (i = 0; i < nritems; i++) { - cond_resched(); - bytenr = btrfs_node_blockptr(node->eb, i); - ptr_gen = btrfs_node_ptr_generation(node->eb, i); - if (ptr_gen == trans->transid) - continue; - if (!in_block_group(bytenr, rc->block_group) && - (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS)) - continue; - if (tree_block_processed(bytenr, blocksize, rc)) - continue; - if (!in_block_group(bytenr, rc->block_group) && - !check_file_extents(rc, bytenr, blocksize, ptr_gen)) - continue; - - block = kmalloc(sizeof(*block), GFP_NOFS); - if (!block) { - err = -ENOMEM; - break; - } - block->bytenr = bytenr; - btrfs_node_key_to_cpu(node->eb, &block->key, i); - block->level = node->level - 1; - block->key_ready = 1; - rb_node = tree_insert(blocks, block->bytenr, &block->rb_node); - BUG_ON(rb_node); - } - if (err) - free_block_list(blocks); - return err; -} - -/* - * find adjacent blocks require processing - */ -static noinline_for_stack -int add_adjacent_blocks(struct btrfs_trans_handle *trans, - struct reloc_control *rc, - struct backref_cache *cache, - struct rb_root *blocks, int level, - struct backref_node **upper) -{ - struct backref_node *node; - int ret = 0; - - WARN_ON(!list_empty(&cache->pending[level])); - - if (list_empty(&cache->pending[level + 1])) - return 1; - - node = list_entry(cache->pending[level + 1].next, - struct backref_node, lower); - if (node->eb) - ret = add_child_blocks(trans, rc, node, blocks); - - *upper = node; - return ret; -} - static int get_tree_block_key(struct reloc_control *rc, struct tree_block *block) { @@ -2371,40 +2696,53 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans, struct btrfs_path *path) { struct btrfs_root *root; - int ret; + int release = 0; + int ret = 0; + if (!node) + return 0; + + BUG_ON(node->processed); root = select_one_root(trans, node); - if (unlikely(!root)) { - rc->found_old_snapshot = 1; + if (root == ERR_PTR(-ENOENT)) { update_processed_blocks(rc, node); - return 0; + goto out; } - if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { - ret = do_relocation(trans, node, key, path, 1); - if (ret < 0) - goto out; - if (node->level == 0 && rc->stage == UPDATE_DATA_PTRS) { - ret = replace_file_extents(trans, rc, root, - node->eb, NULL); - if (ret < 0) - goto out; - } - drop_node_buffer(node); - } else if (!root->ref_cows) { - path->lowest_level = node->level; - ret = btrfs_search_slot(trans, root, key, path, 0, 1); - btrfs_release_path(root, path); - if (ret < 0) + if (!root || root->ref_cows) { + ret = reserve_metadata_space(trans, rc, node); + if (ret) goto out; - } else if (root != node->root) { - WARN_ON(node->level > 0 || rc->stage != UPDATE_DATA_PTRS); + release = 1; } - update_processed_blocks(rc, node); - ret = 0; + if (root) { + if (root->ref_cows) { + BUG_ON(node->new_bytenr); + BUG_ON(!list_empty(&node->list)); + btrfs_record_root_in_trans(trans, root); + root = root->reloc_root; + node->new_bytenr = root->node->start; + node->root = root; + list_add_tail(&node->list, &rc->backref_cache.changed); + } else { + path->lowest_level = node->level; + ret = btrfs_search_slot(trans, root, key, path, 0, 1); + btrfs_release_path(root, path); + if (ret > 0) + ret = 0; + } + if (!ret) + update_processed_blocks(rc, node); + } else { + ret = do_relocation(trans, rc, 
node, key, path, 1); + } out: - drop_node_buffer(node); + if (ret || node->level == 0 || node->cowonly) { + if (release) + release_metadata_space(rc, node); + remove_backref_node(&rc->backref_cache, node); + } return ret; } @@ -2415,12 +2753,10 @@ static noinline_for_stack int relocate_tree_blocks(struct btrfs_trans_handle *trans, struct reloc_control *rc, struct rb_root *blocks) { - struct backref_cache *cache; struct backref_node *node; struct btrfs_path *path; struct tree_block *block; struct rb_node *rb_node; - int level = -1; int ret; int err = 0; @@ -2428,21 +2764,9 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - cache = kmalloc(sizeof(*cache), GFP_NOFS); - if (!cache) { - btrfs_free_path(path); - return -ENOMEM; - } - - backref_cache_init(cache); - rb_node = rb_first(blocks); while (rb_node) { block = rb_entry(rb_node, struct tree_block, rb_node); - if (level == -1) - level = block->level; - else - BUG_ON(level != block->level); if (!block->key_ready) reada_tree_block(rc, block); rb_node = rb_next(rb_node); @@ -2460,7 +2784,7 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans, while (rb_node) { block = rb_entry(rb_node, struct tree_block, rb_node); - node = build_backref_tree(rc, cache, &block->key, + node = build_backref_tree(rc, &block->key, block->level, block->bytenr); if (IS_ERR(node)) { err = PTR_ERR(node); @@ -2470,79 +2794,62 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans, ret = relocate_tree_block(trans, rc, node, &block->key, path); if (ret < 0) { - err = ret; + if (ret != -EAGAIN || rb_node == rb_first(blocks)) + err = ret; goto out; } - remove_backref_node(cache, node); rb_node = rb_next(rb_node); } - - if (level > 0) - goto out; - +out: free_block_list(blocks); + err = finish_pending_nodes(trans, rc, path, err); - /* - * now backrefs of some upper level tree blocks have been cached, - * try relocating blocks referenced by these upper level blocks. 
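Note the error policy in relocate_tree_blocks() above: a failure on the very first block is fatal, but a later -EAGAIN only ends the pass, and the caller commits the transaction and retries the remaining blocks. A compact model of that policy (relocate_one() is a stand-in that fails part-way through):

#include <errno.h>
#include <stdio.h>

static int relocate_one(int i)
{
	return (i == 3) ? -EAGAIN : 0;	/* reservation ran dry mid-pass */
}

int main(void)
{
	int err = 0;
	int i;

	for (i = 0; i < 8; i++) {
		int ret = relocate_one(i);

		if (ret < 0) {
			if (ret != -EAGAIN || i == 0)
				err = ret;	/* hard failure */
			break;	/* soft stop: commit and retry the rest */
		}
	}
	printf("pass finished, err=%d\n", err);
	return 0;
}
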
- */ - while (1) { - struct backref_node *upper = NULL; - if (trans->transaction->in_commit || - trans->transaction->delayed_refs.flushing) - break; + btrfs_free_path(path); + return err; +} - ret = add_adjacent_blocks(trans, rc, cache, blocks, level, - &upper); - if (ret < 0) - err = ret; - if (ret != 0) - break; +static noinline_for_stack +int prealloc_file_extent_cluster(struct inode *inode, + struct file_extent_cluster *cluster) +{ + u64 alloc_hint = 0; + u64 start; + u64 end; + u64 offset = BTRFS_I(inode)->index_cnt; + u64 num_bytes; + int nr = 0; + int ret = 0; - rb_node = rb_first(blocks); - while (rb_node) { - block = rb_entry(rb_node, struct tree_block, rb_node); - if (trans->transaction->in_commit || - trans->transaction->delayed_refs.flushing) - goto out; - BUG_ON(!block->key_ready); - node = build_backref_tree(rc, cache, &block->key, - level, block->bytenr); - if (IS_ERR(node)) { - err = PTR_ERR(node); - goto out; - } + BUG_ON(cluster->start != cluster->boundary[0]); + mutex_lock(&inode->i_mutex); - ret = relocate_tree_block(trans, rc, node, - &block->key, path); - if (ret < 0) { - err = ret; - goto out; - } - remove_backref_node(cache, node); - rb_node = rb_next(rb_node); - } - free_block_list(blocks); + ret = btrfs_check_data_free_space(inode, cluster->end + + 1 - cluster->start); + if (ret) + goto out; - if (upper) { - ret = link_to_upper(trans, upper, path); - if (ret < 0) { - err = ret; - break; - } - remove_backref_node(cache, upper); - } + while (nr < cluster->nr) { + start = cluster->boundary[nr] - offset; + if (nr + 1 < cluster->nr) + end = cluster->boundary[nr + 1] - 1 - offset; + else + end = cluster->end - offset; + + lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); + num_bytes = end + 1 - start; + ret = btrfs_prealloc_file_range(inode, 0, start, + num_bytes, num_bytes, + end + 1, &alloc_hint); + unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); + if (ret) + break; + nr++; } + btrfs_free_reserved_data_space(inode, cluster->end + + 1 - cluster->start); out: - free_block_list(blocks); - - ret = finish_pending_nodes(trans, cache, path); - if (ret < 0) - err = ret; - - kfree(cache); - btrfs_free_path(path); - return err; + mutex_unlock(&inode->i_mutex); + return ret; } static noinline_for_stack @@ -2588,7 +2895,6 @@ static int relocate_file_extent_cluster(struct inode *inode, u64 offset = BTRFS_I(inode)->index_cnt; unsigned long index; unsigned long last_index; - unsigned int dirty_page = 0; struct page *page; struct file_ra_state *ra; int nr = 0; @@ -2601,21 +2907,24 @@ static int relocate_file_extent_cluster(struct inode *inode, if (!ra) return -ENOMEM; - index = (cluster->start - offset) >> PAGE_CACHE_SHIFT; - last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT; + ret = prealloc_file_extent_cluster(inode, cluster); + if (ret) + goto out; - mutex_lock(&inode->i_mutex); + file_ra_state_init(ra, inode->i_mapping); - i_size_write(inode, cluster->end + 1 - offset); ret = setup_extent_mapping(inode, cluster->start - offset, cluster->end - offset, cluster->start); if (ret) - goto out_unlock; - - file_ra_state_init(ra, inode->i_mapping); + goto out; - WARN_ON(cluster->start != cluster->boundary[0]); + index = (cluster->start - offset) >> PAGE_CACHE_SHIFT; + last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT; while (index <= last_index) { + ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE); + if (ret) + goto out; + page = find_lock_page(inode->i_mapping, index); if (!page) { page_cache_sync_readahead(inode->i_mapping, @@ -2623,8 
+2932,10 @@ static int relocate_file_extent_cluster(struct inode *inode, last_index + 1 - index); page = grab_cache_page(inode->i_mapping, index); if (!page) { + btrfs_delalloc_release_metadata(inode, + PAGE_CACHE_SIZE); ret = -ENOMEM; - goto out_unlock; + goto out; } } @@ -2640,8 +2951,10 @@ static int relocate_file_extent_cluster(struct inode *inode, if (!PageUptodate(page)) { unlock_page(page); page_cache_release(page); + btrfs_delalloc_release_metadata(inode, + PAGE_CACHE_SIZE); ret = -EIO; - goto out_unlock; + goto out; } } @@ -2660,10 +2973,9 @@ static int relocate_file_extent_cluster(struct inode *inode, EXTENT_BOUNDARY, GFP_NOFS); nr++; } - btrfs_set_extent_delalloc(inode, page_start, page_end, NULL); + btrfs_set_extent_delalloc(inode, page_start, page_end, NULL); set_page_dirty(page); - dirty_page++; unlock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS); @@ -2671,20 +2983,11 @@ static int relocate_file_extent_cluster(struct inode *inode, page_cache_release(page); index++; - if (nr < cluster->nr && - page_end + 1 + offset == cluster->boundary[nr]) { - balance_dirty_pages_ratelimited_nr(inode->i_mapping, - dirty_page); - dirty_page = 0; - } - } - if (dirty_page) { - balance_dirty_pages_ratelimited_nr(inode->i_mapping, - dirty_page); + balance_dirty_pages_ratelimited(inode->i_mapping); + btrfs_throttle(BTRFS_I(inode)->root); } WARN_ON(nr != cluster->nr); -out_unlock: - mutex_unlock(&inode->i_mutex); +out: kfree(ra); return ret; } @@ -2870,9 +3173,6 @@ out: static int block_use_full_backref(struct reloc_control *rc, struct extent_buffer *eb) { - struct btrfs_path *path; - struct btrfs_extent_item *ei; - struct btrfs_key key; u64 flags; int ret; @@ -2880,28 +3180,14 @@ static int block_use_full_backref(struct reloc_control *rc, btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV) return 1; - path = btrfs_alloc_path(); - BUG_ON(!path); - - key.objectid = eb->start; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = eb->len; - - path->search_commit_root = 1; - path->skip_locking = 1; - ret = btrfs_search_slot(NULL, rc->extent_root, - &key, path, 0, 0); + ret = btrfs_lookup_extent_info(NULL, rc->extent_root, + eb->start, eb->len, NULL, &flags); BUG_ON(ret); - ei = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_extent_item); - flags = btrfs_extent_flags(path->nodes[0], ei); - BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)); if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) ret = 1; else ret = 0; - btrfs_free_path(path); return ret; } @@ -3074,22 +3360,10 @@ int add_data_references(struct reloc_control *rc, struct btrfs_extent_inline_ref *iref; unsigned long ptr; unsigned long end; - u32 blocksize; + u32 blocksize = btrfs_level_size(rc->extent_root, 0); int ret; int err = 0; - ret = get_new_location(rc->data_inode, NULL, extent_key->objectid, - extent_key->offset); - BUG_ON(ret < 0); - if (ret > 0) { - /* the relocated data is fragmented */ - rc->extents_skipped++; - btrfs_release_path(rc->extent_root, path); - return 0; - } - - blocksize = btrfs_level_size(rc->extent_root, 0); - eb = path->nodes[0]; ptr = btrfs_item_ptr_offset(eb, path->slots[0]); end = ptr + btrfs_item_size_nr(eb, path->slots[0]); @@ -3170,7 +3444,8 @@ int add_data_references(struct reloc_control *rc, */ static noinline_for_stack int find_next_extent(struct btrfs_trans_handle *trans, - struct reloc_control *rc, struct btrfs_path *path) + struct reloc_control *rc, struct btrfs_path *path, + struct btrfs_key *extent_key) { struct btrfs_key key; struct extent_buffer *leaf; @@ -3225,6 +3500,7 @@ 
next: rc->search_start = end + 1; } else { rc->search_start = key.objectid + key.offset; + memcpy(extent_key, &key, sizeof(key)); return 0; } } @@ -3262,12 +3538,49 @@ static int check_extent_flags(u64 flags) return 0; } +static noinline_for_stack +int prepare_to_relocate(struct reloc_control *rc) +{ + struct btrfs_trans_handle *trans; + int ret; + + rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root); + if (!rc->block_rsv) + return -ENOMEM; + + /* + * reserve some space for creating reloc trees. + * btrfs_init_reloc_root will use them when there + * is no reservation in transaction handle. + */ + ret = btrfs_block_rsv_add(NULL, rc->extent_root, rc->block_rsv, + rc->extent_root->nodesize * 256, + &rc->block_rsv_retries); + if (ret) + return ret; + + rc->block_rsv->refill_used = 1; + btrfs_add_durable_block_rsv(rc->extent_root->fs_info, rc->block_rsv); + + memset(&rc->cluster, 0, sizeof(rc->cluster)); + rc->search_start = rc->block_group->key.objectid; + rc->extents_found = 0; + rc->nodes_relocated = 0; + rc->merging_rsv_size = 0; + rc->block_rsv_retries = 0; + + rc->create_reloc_tree = 1; + set_reloc_control(rc); + + trans = btrfs_join_transaction(rc->extent_root, 1); + btrfs_commit_transaction(trans, rc->extent_root); + return 0; +} static noinline_for_stack int relocate_block_group(struct reloc_control *rc) { struct rb_root blocks = RB_ROOT; struct btrfs_key key; - struct file_extent_cluster *cluster; struct btrfs_trans_handle *trans = NULL; struct btrfs_path *path; struct btrfs_extent_item *ei; @@ -3277,33 +3590,25 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) int ret; int err = 0; - cluster = kzalloc(sizeof(*cluster), GFP_NOFS); - if (!cluster) - return -ENOMEM; - path = btrfs_alloc_path(); - if (!path) { - kfree(cluster); + if (!path) return -ENOMEM; - } - - rc->extents_found = 0; - rc->extents_skipped = 0; - - rc->search_start = rc->block_group->key.objectid; - clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY, - GFP_NOFS); - - rc->create_reloc_root = 1; - set_reloc_control(rc); - trans = btrfs_start_transaction(rc->extent_root, 1); - btrfs_commit_transaction(trans, rc->extent_root); + ret = prepare_to_relocate(rc); + if (ret) { + err = ret; + goto out_free; + } while (1) { - trans = btrfs_start_transaction(rc->extent_root, 1); + trans = btrfs_start_transaction(rc->extent_root, 0); + + if (update_backref_cache(trans, &rc->backref_cache)) { + btrfs_end_transaction(trans, rc->extent_root); + continue; + } - ret = find_next_extent(trans, rc, path); + ret = find_next_extent(trans, rc, path, &key); if (ret < 0) err = ret; if (ret != 0) @@ -3313,9 +3618,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) ei = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_extent_item); - btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - item_size = btrfs_item_size_nr(path->nodes[0], - path->slots[0]); + item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); if (item_size >= sizeof(*ei)) { flags = btrfs_extent_flags(path->nodes[0], ei); ret = check_extent_flags(flags); @@ -3356,73 +3659,100 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { ret = add_tree_block(rc, &key, path, &blocks); } else if (rc->stage == UPDATE_DATA_PTRS && - (flags & BTRFS_EXTENT_FLAG_DATA)) { + (flags & BTRFS_EXTENT_FLAG_DATA)) { ret = add_data_references(rc, &key, path, &blocks); } else { btrfs_release_path(rc->extent_root, path); ret = 0; } if 
(ret < 0) { - err = 0; + err = ret; break; } if (!RB_EMPTY_ROOT(&blocks)) { ret = relocate_tree_blocks(trans, rc, &blocks); if (ret < 0) { + if (ret != -EAGAIN) { + err = ret; + break; + } + rc->extents_found--; + rc->search_start = key.objectid; + } + } + + ret = btrfs_block_rsv_check(trans, rc->extent_root, + rc->block_rsv, 0, 5); + if (ret < 0) { + if (ret != -EAGAIN) { err = ret; + WARN_ON(1); break; } + rc->commit_transaction = 1; } - nr = trans->blocks_used; - btrfs_end_transaction(trans, rc->extent_root); + if (rc->commit_transaction) { + rc->commit_transaction = 0; + ret = btrfs_commit_transaction(trans, rc->extent_root); + BUG_ON(ret); + } else { + nr = trans->blocks_used; + btrfs_end_transaction_throttle(trans, rc->extent_root); + btrfs_btree_balance_dirty(rc->extent_root, nr); + } trans = NULL; - btrfs_btree_balance_dirty(rc->extent_root, nr); if (rc->stage == MOVE_DATA_EXTENTS && (flags & BTRFS_EXTENT_FLAG_DATA)) { rc->found_file_extent = 1; ret = relocate_data_extent(rc->data_inode, - &key, cluster); + &key, &rc->cluster); if (ret < 0) { err = ret; break; } } } - btrfs_free_path(path); + + btrfs_release_path(rc->extent_root, path); + clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY, + GFP_NOFS); if (trans) { nr = trans->blocks_used; - btrfs_end_transaction(trans, rc->extent_root); + btrfs_end_transaction_throttle(trans, rc->extent_root); btrfs_btree_balance_dirty(rc->extent_root, nr); } if (!err) { - ret = relocate_file_extent_cluster(rc->data_inode, cluster); + ret = relocate_file_extent_cluster(rc->data_inode, + &rc->cluster); if (ret < 0) err = ret; } - kfree(cluster); + rc->create_reloc_tree = 0; + set_reloc_control(rc); - rc->create_reloc_root = 0; - smp_mb(); + backref_cache_cleanup(&rc->backref_cache); + btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1); - if (rc->extents_found > 0) { - trans = btrfs_start_transaction(rc->extent_root, 1); - btrfs_commit_transaction(trans, rc->extent_root); - } + err = prepare_to_merge(rc, err); merge_reloc_roots(rc); + rc->merge_reloc_tree = 0; unset_reloc_control(rc); + btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1); /* get rid of pinned extents */ - trans = btrfs_start_transaction(rc->extent_root, 1); + trans = btrfs_join_transaction(rc->extent_root, 1); btrfs_commit_transaction(trans, rc->extent_root); - +out_free: + btrfs_free_block_rsv(rc->extent_root, rc->block_rsv); + btrfs_free_path(path); return err; } @@ -3448,7 +3778,8 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans, btrfs_set_inode_generation(leaf, item, 1); btrfs_set_inode_size(leaf, item, 0); btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); - btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS); + btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS | + BTRFS_INODE_PREALLOC); btrfs_mark_buffer_dirty(leaf); btrfs_release_path(root, path); out: @@ -3460,8 +3791,9 @@ out: * helper to create inode for data relocation. 
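After this rework, relocate_block_group() above reduces to three phases: prepare_to_relocate() sets up the block reserve and reloc control, a loop walks every extent in the block group, and prepare_to_merge()/merge_reloc_roots() always run at the end so reloc trees are merged back even on error. A rough outline of that control flow, with every helper stubbed out:

#include <stdio.h>

static int prepare(void)	{ return 0; }
static int find_next(void)	{ static int left = 3; return left-- == 0; }
static int move_blocks(void)	{ return 0; }
static int merge_back(int err)	{ return err; }

int main(void)
{
	int err = prepare();

	while (!err) {
		if (find_next())
			break;		/* every extent visited */
		err = move_blocks();
	}
	err = merge_back(err);		/* always merge reloc trees back */
	printf("relocation pass: %d\n", err);
	return 0;
}
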
* the inode is in data relocation tree and its link count is 0 */ -static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache *group) +static noinline_for_stack +struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *group) { struct inode *inode = NULL; struct btrfs_trans_handle *trans; @@ -3475,8 +3807,9 @@ static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, if (IS_ERR(root)) return ERR_CAST(root); - trans = btrfs_start_transaction(root, 1); - BUG_ON(!trans); + trans = btrfs_start_transaction(root, 6); + if (IS_ERR(trans)) + return ERR_CAST(trans); err = btrfs_find_free_objectid(trans, root, objectid, &objectid); if (err) @@ -3496,7 +3829,6 @@ static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, out: nr = trans->blocks_used; btrfs_end_transaction(trans, root); - btrfs_btree_balance_dirty(root, nr); if (err) { if (inode) @@ -3506,6 +3838,21 @@ out: return inode; } +static struct reloc_control *alloc_reloc_control(void) +{ + struct reloc_control *rc; + + rc = kzalloc(sizeof(*rc), GFP_NOFS); + if (!rc) + return NULL; + + INIT_LIST_HEAD(&rc->reloc_roots); + backref_cache_init(&rc->backref_cache); + mapping_tree_init(&rc->reloc_root_tree); + extent_io_tree_init(&rc->processed_blocks, NULL, GFP_NOFS); + return rc; +} + /* * function to relocate all extents in a block group. */ @@ -3514,24 +3861,26 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) struct btrfs_fs_info *fs_info = extent_root->fs_info; struct reloc_control *rc; int ret; + int rw = 0; int err = 0; - rc = kzalloc(sizeof(*rc), GFP_NOFS); + rc = alloc_reloc_control(); if (!rc) return -ENOMEM; - mapping_tree_init(&rc->reloc_root_tree); - extent_io_tree_init(&rc->processed_blocks, NULL, GFP_NOFS); - INIT_LIST_HEAD(&rc->reloc_roots); + rc->extent_root = extent_root; rc->block_group = btrfs_lookup_block_group(fs_info, group_start); BUG_ON(!rc->block_group); - btrfs_init_workers(&rc->workers, "relocate", - fs_info->thread_pool_size, NULL); - - rc->extent_root = extent_root; - btrfs_prepare_block_group_relocation(extent_root, rc->block_group); + if (!rc->block_group->ro) { + ret = btrfs_set_block_group_ro(extent_root, rc->block_group); + if (ret) { + err = ret; + goto out; + } + rw = 1; + } rc->data_inode = create_reloc_inode(fs_info, rc->block_group); if (IS_ERR(rc->data_inode)) { @@ -3548,9 +3897,6 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) btrfs_wait_ordered_extents(fs_info->tree_root, 0, 0); while (1) { - rc->extents_found = 0; - rc->extents_skipped = 0; - mutex_lock(&fs_info->cleaner_mutex); btrfs_clean_old_snapshots(fs_info->tree_root); @@ -3559,7 +3905,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) mutex_unlock(&fs_info->cleaner_mutex); if (ret < 0) { err = ret; - break; + goto out; } if (rc->extents_found == 0) @@ -3573,18 +3919,6 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) invalidate_mapping_pages(rc->data_inode->i_mapping, 0, -1); rc->stage = UPDATE_DATA_PTRS; - } else if (rc->stage == UPDATE_DATA_PTRS && - rc->extents_skipped >= rc->extents_found) { - iput(rc->data_inode); - rc->data_inode = create_reloc_inode(fs_info, - rc->block_group); - if (IS_ERR(rc->data_inode)) { - err = PTR_ERR(rc->data_inode); - rc->data_inode = NULL; - break; - } - rc->stage = MOVE_DATA_EXTENTS; - rc->found_file_extent = 0; } } @@ -3597,8 +3931,9 @@ int 
btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) WARN_ON(rc->block_group->reserved > 0); WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0); out: + if (err && rw) + btrfs_set_block_group_rw(extent_root, rc->block_group); iput(rc->data_inode); - btrfs_stop_workers(&rc->workers); btrfs_put_block_group(rc->block_group); kfree(rc); return err; @@ -3609,7 +3944,7 @@ static noinline_for_stack int mark_garbage_root(struct btrfs_root *root) struct btrfs_trans_handle *trans; int ret; - trans = btrfs_start_transaction(root->fs_info->tree_root, 1); + trans = btrfs_start_transaction(root->fs_info->tree_root, 0); memset(&root->root_item.drop_progress, 0, sizeof(root->root_item.drop_progress)); @@ -3702,20 +4037,20 @@ int btrfs_recover_relocation(struct btrfs_root *root) if (list_empty(&reloc_roots)) goto out; - rc = kzalloc(sizeof(*rc), GFP_NOFS); + rc = alloc_reloc_control(); if (!rc) { err = -ENOMEM; goto out; } - mapping_tree_init(&rc->reloc_root_tree); - INIT_LIST_HEAD(&rc->reloc_roots); - btrfs_init_workers(&rc->workers, "relocate", - root->fs_info->thread_pool_size, NULL); rc->extent_root = root->fs_info->extent_root; set_reloc_control(rc); + trans = btrfs_join_transaction(rc->extent_root, 1); + + rc->merge_reloc_tree = 1; + while (!list_empty(&reloc_roots)) { reloc_root = list_entry(reloc_roots.next, struct btrfs_root, root_list); @@ -3735,20 +4070,16 @@ int btrfs_recover_relocation(struct btrfs_root *root) fs_root->reloc_root = reloc_root; } - trans = btrfs_start_transaction(rc->extent_root, 1); btrfs_commit_transaction(trans, rc->extent_root); merge_reloc_roots(rc); unset_reloc_control(rc); - trans = btrfs_start_transaction(rc->extent_root, 1); + trans = btrfs_join_transaction(rc->extent_root, 1); btrfs_commit_transaction(trans, rc->extent_root); out: - if (rc) { - btrfs_stop_workers(&rc->workers); - kfree(rc); - } + kfree(rc); while (!list_empty(&reloc_roots)) { reloc_root = list_entry(reloc_roots.next, struct btrfs_root, root_list); @@ -3814,3 +4145,130 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len) btrfs_put_ordered_extent(ordered); return 0; } + +void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *buf, + struct extent_buffer *cow) +{ + struct reloc_control *rc; + struct backref_node *node; + int first_cow = 0; + int level; + int ret; + + rc = root->fs_info->reloc_ctl; + if (!rc) + return; + + BUG_ON(rc->stage == UPDATE_DATA_PTRS && + root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID); + + level = btrfs_header_level(buf); + if (btrfs_header_generation(buf) <= + btrfs_root_last_snapshot(&root->root_item)) + first_cow = 1; + + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID && + rc->create_reloc_tree) { + WARN_ON(!first_cow && level == 0); + + node = rc->backref_cache.path[level]; + BUG_ON(node->bytenr != buf->start && + node->new_bytenr != buf->start); + + drop_node_buffer(node); + extent_buffer_get(cow); + node->eb = cow; + node->new_bytenr = cow->start; + + if (!node->pending) { + list_move_tail(&node->list, + &rc->backref_cache.pending[level]); + node->pending = 1; + } + + if (first_cow) + __mark_block_processed(rc, node); + + if (first_cow && level > 0) + rc->nodes_relocated += buf->len; + } + + if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS) { + ret = replace_file_extents(trans, rc, root, cow); + BUG_ON(ret); + } +} + +/* + * called before creating snapshot. 
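btrfs_reloc_cow_block() above repoints the cached backref node at the freshly COWed buffer and queues it on the pending list so parent pointers can be fixed up later. A stand-alone model of that fix-up, with buffer/cnode as invented stand-ins for extent_buffer and backref_node:

#include <stdio.h>
#include <stdlib.h>

struct buffer { int refs; unsigned long long start; };
struct cnode  { struct buffer *eb; unsigned long long new_bytenr; int pending; };

static void drop_buffer(struct cnode *n)
{
	if (n->eb && --n->eb->refs == 0)
		free(n->eb);
	n->eb = NULL;
}

static void cow_fixup(struct cnode *node, struct buffer *cow)
{
	drop_buffer(node);		/* release the stale buffer */
	cow->refs++;			/* like extent_buffer_get() */
	node->eb = cow;
	node->new_bytenr = cow->start;
	node->pending = 1;		/* pointer fix-up happens later */
}

int main(void)
{
	struct buffer *old = calloc(1, sizeof(*old));
	struct buffer cow = { 1, 123456 };
	struct cnode node = { old, 0, 0 };

	if (!old)
		return 1;
	old->refs = 1;
	cow_fixup(&node, &cow);
	printf("node now at %llu, pending=%d\n", node.new_bytenr, node.pending);
	return 0;
}
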
it calculates metadata reservation + * requried for relocating tree blocks in the snapshot + */ +void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans, + struct btrfs_pending_snapshot *pending, + u64 *bytes_to_reserve) +{ + struct btrfs_root *root; + struct reloc_control *rc; + + root = pending->root; + if (!root->reloc_root) + return; + + rc = root->fs_info->reloc_ctl; + if (!rc->merge_reloc_tree) + return; + + root = root->reloc_root; + BUG_ON(btrfs_root_refs(&root->root_item) == 0); + /* + * relocation is in the stage of merging trees. the space + * used by merging a reloc tree is twice the size of + * relocated tree nodes in the worst case. half for cowing + * the reloc tree, half for cowing the fs tree. the space + * used by cowing the reloc tree will be freed after the + * tree is dropped. if we create snapshot, cowing the fs + * tree may use more space than it frees. so we need + * reserve extra space. + */ + *bytes_to_reserve += rc->nodes_relocated; +} + +/* + * called after snapshot is created. migrate block reservation + * and create reloc root for the newly created snapshot + */ +void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, + struct btrfs_pending_snapshot *pending) +{ + struct btrfs_root *root = pending->root; + struct btrfs_root *reloc_root; + struct btrfs_root *new_root; + struct reloc_control *rc; + int ret; + + if (!root->reloc_root) + return; + + rc = root->fs_info->reloc_ctl; + rc->merging_rsv_size += rc->nodes_relocated; + + if (rc->merge_reloc_tree) { + ret = btrfs_block_rsv_migrate(&pending->block_rsv, + rc->block_rsv, + rc->nodes_relocated); + BUG_ON(ret); + } + + new_root = pending->snap; + reloc_root = create_reloc_root(trans, root->reloc_root, + new_root->root_key.objectid); + + __add_reloc_root(reloc_root); + new_root->reloc_root = reloc_root; + + if (rc->create_reloc_tree) { + ret = clone_backref_node(trans, rc, root, reloc_root); + BUG_ON(ret); + } +} diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 67fa2d29d663..2d958be761c8 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -259,6 +259,8 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root) struct extent_buffer *leaf; struct btrfs_path *path; struct btrfs_key key; + struct btrfs_key root_key; + struct btrfs_root *root; int err = 0; int ret; @@ -270,6 +272,9 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root) key.type = BTRFS_ORPHAN_ITEM_KEY; key.offset = 0; + root_key.type = BTRFS_ROOT_ITEM_KEY; + root_key.offset = (u64)-1; + while (1) { ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0); if (ret < 0) { @@ -294,13 +299,25 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root) key.type != BTRFS_ORPHAN_ITEM_KEY) break; - ret = btrfs_find_dead_roots(tree_root, key.offset); - if (ret) { + root_key.objectid = key.offset; + key.offset++; + + root = btrfs_read_fs_root_no_name(tree_root->fs_info, + &root_key); + if (!IS_ERR(root)) + continue; + + ret = PTR_ERR(root); + if (ret != -ENOENT) { err = ret; break; } - key.offset++; + ret = btrfs_find_dead_roots(tree_root, root_key.objectid); + if (ret) { + err = ret; + break; + } } btrfs_free_path(path); @@ -313,7 +330,6 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, { struct btrfs_path *path; int ret; - u32 refs; struct btrfs_root_item *ri; struct extent_buffer *leaf; @@ -327,8 +343,6 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, leaf = path->nodes[0]; ri = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_item); - 
refs = btrfs_disk_root_refs(leaf, ri); - BUG_ON(refs != 0); ret = btrfs_del_item(trans, root, path); out: btrfs_free_path(path); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 2909a03e5230..f2393b390318 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -360,6 +360,8 @@ static struct dentry *get_default_root(struct super_block *sb, */ dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0); + if (IS_ERR(di)) + return ERR_CAST(di); if (!di) { /* * Ok the default dir item isn't there. This is weird since @@ -390,8 +392,8 @@ setup_root: location.offset = 0; inode = btrfs_iget(sb, &location, new_root, &new); - if (!inode) - return ERR_PTR(-ENOMEM); + if (IS_ERR(inode)) + return ERR_CAST(inode); /* * If we're just mounting the root most subvol put the inode and return @@ -498,7 +500,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait) btrfs_start_delalloc_inodes(root, 0); btrfs_wait_ordered_extents(root, 0, 0); - trans = btrfs_start_transaction(root, 1); + trans = btrfs_start_transaction(root, 0); ret = btrfs_commit_transaction(trans, root); return ret; } @@ -694,11 +696,11 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) if (btrfs_super_log_root(&root->fs_info->super_copy) != 0) return -EINVAL; - /* recover relocation */ - ret = btrfs_recover_relocation(root); + ret = btrfs_cleanup_fs_roots(root->fs_info); WARN_ON(ret); - ret = btrfs_cleanup_fs_roots(root->fs_info); + /* recover relocation */ + ret = btrfs_recover_relocation(root); WARN_ON(ret); sb->s_flags &= ~MS_RDONLY; @@ -714,34 +716,18 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) struct list_head *head = &root->fs_info->space_info; struct btrfs_space_info *found; u64 total_used = 0; - u64 data_used = 0; int bits = dentry->d_sb->s_blocksize_bits; __be32 *fsid = (__be32 *)root->fs_info->fsid; rcu_read_lock(); - list_for_each_entry_rcu(found, head, list) { - if (found->flags & (BTRFS_BLOCK_GROUP_DUP| - BTRFS_BLOCK_GROUP_RAID10| - BTRFS_BLOCK_GROUP_RAID1)) { - total_used += found->bytes_used; - if (found->flags & BTRFS_BLOCK_GROUP_DATA) - data_used += found->bytes_used; - else - data_used += found->total_bytes; - } - - total_used += found->bytes_used; - if (found->flags & BTRFS_BLOCK_GROUP_DATA) - data_used += found->bytes_used; - else - data_used += found->total_bytes; - } + list_for_each_entry_rcu(found, head, list) + total_used += found->disk_used; rcu_read_unlock(); buf->f_namelen = BTRFS_NAME_LEN; buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits; buf->f_bfree = buf->f_blocks - (total_used >> bits); - buf->f_bavail = buf->f_blocks - (data_used >> bits); + buf->f_bavail = buf->f_bfree; buf->f_bsize = dentry->d_sb->s_blocksize; buf->f_type = BTRFS_SUPER_MAGIC; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 2cb116099b90..66e4c66cc63b 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -165,54 +165,89 @@ enum btrfs_trans_type { TRANS_USERSPACE, }; +static int may_wait_transaction(struct btrfs_root *root, int type) +{ + if (!root->fs_info->log_root_recovering && + ((type == TRANS_START && !root->fs_info->open_ioctl_trans) || + type == TRANS_USERSPACE)) + return 1; + return 0; +} + static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, - int num_blocks, int type) + u64 num_items, int type) { - struct btrfs_trans_handle *h = - kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); + struct btrfs_trans_handle *h; + struct 
btrfs_transaction *cur_trans; + int retries = 0; int ret; +again: + h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); + if (!h) + return ERR_PTR(-ENOMEM); mutex_lock(&root->fs_info->trans_mutex); - if (!root->fs_info->log_root_recovering && - ((type == TRANS_START && !root->fs_info->open_ioctl_trans) || - type == TRANS_USERSPACE)) + if (may_wait_transaction(root, type)) wait_current_trans(root); + ret = join_transaction(root); BUG_ON(ret); - h->transid = root->fs_info->running_transaction->transid; - h->transaction = root->fs_info->running_transaction; - h->blocks_reserved = num_blocks; + cur_trans = root->fs_info->running_transaction; + cur_trans->use_count++; + mutex_unlock(&root->fs_info->trans_mutex); + + h->transid = cur_trans->transid; + h->transaction = cur_trans; h->blocks_used = 0; h->block_group = 0; - h->alloc_exclude_nr = 0; - h->alloc_exclude_start = 0; + h->bytes_reserved = 0; h->delayed_ref_updates = 0; + h->block_rsv = NULL; - if (!current->journal_info && type != TRANS_USERSPACE) - current->journal_info = h; + smp_mb(); + if (cur_trans->blocked && may_wait_transaction(root, type)) { + btrfs_commit_transaction(h, root); + goto again; + } + + if (num_items > 0) { + ret = btrfs_trans_reserve_metadata(h, root, num_items, + &retries); + if (ret == -EAGAIN) { + btrfs_commit_transaction(h, root); + goto again; + } + if (ret < 0) { + btrfs_end_transaction(h, root); + return ERR_PTR(ret); + } + } - root->fs_info->running_transaction->use_count++; + mutex_lock(&root->fs_info->trans_mutex); record_root_in_trans(h, root); mutex_unlock(&root->fs_info->trans_mutex); + + if (!current->journal_info && type != TRANS_USERSPACE) + current->journal_info = h; return h; } struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, - int num_blocks) + int num_items) { - return start_transaction(root, num_blocks, TRANS_START); + return start_transaction(root, num_items, TRANS_START); } struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root, int num_blocks) { - return start_transaction(root, num_blocks, TRANS_JOIN); + return start_transaction(root, 0, TRANS_JOIN); } struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r, int num_blocks) { - return start_transaction(r, num_blocks, TRANS_USERSPACE); + return start_transaction(r, 0, TRANS_USERSPACE); } /* wait for a transaction commit to be fully complete */ @@ -286,10 +321,36 @@ void btrfs_throttle(struct btrfs_root *root) mutex_unlock(&root->fs_info->trans_mutex); } +static int should_end_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + int ret; + ret = btrfs_block_rsv_check(trans, root, + &root->fs_info->global_block_rsv, 0, 5); + return ret ? 
1 : 0; +} + +int btrfs_should_end_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_transaction *cur_trans = trans->transaction; + int updates; + + if (cur_trans->blocked || cur_trans->delayed_refs.flushing) + return 1; + + updates = trans->delayed_ref_updates; + trans->delayed_ref_updates = 0; + if (updates) + btrfs_run_delayed_refs(trans, root, updates); + + return should_end_transaction(trans, root); +} + static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root, int throttle) { - struct btrfs_transaction *cur_trans; + struct btrfs_transaction *cur_trans = trans->transaction; struct btrfs_fs_info *info = root->fs_info; int count = 0; @@ -313,9 +374,21 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, count++; } + btrfs_trans_release_metadata(trans, root); + + if (!root->fs_info->open_ioctl_trans && + should_end_transaction(trans, root)) + trans->transaction->blocked = 1; + + if (cur_trans->blocked && !cur_trans->in_commit) { + if (throttle) + return btrfs_commit_transaction(trans, root); + else + wake_up_process(info->transaction_kthread); + } + mutex_lock(&info->trans_mutex); - cur_trans = info->running_transaction; - WARN_ON(cur_trans != trans->transaction); + WARN_ON(cur_trans != info->running_transaction); WARN_ON(cur_trans->num_writers < 1); cur_trans->num_writers--; @@ -603,6 +676,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans, btrfs_free_log(trans, root); btrfs_update_reloc_root(trans, root); + btrfs_orphan_commit_root(trans, root); if (root->commit_root != root->node) { switch_commit_root(root); @@ -627,30 +701,30 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans, int btrfs_defrag_root(struct btrfs_root *root, int cacheonly) { struct btrfs_fs_info *info = root->fs_info; - int ret; struct btrfs_trans_handle *trans; + int ret; unsigned long nr; - smp_mb(); - if (root->defrag_running) + if (xchg(&root->defrag_running, 1)) return 0; - trans = btrfs_start_transaction(root, 1); + while (1) { - root->defrag_running = 1; + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) + return PTR_ERR(trans); + ret = btrfs_defrag_leaves(trans, root, cacheonly); + nr = trans->blocks_used; btrfs_end_transaction(trans, root); btrfs_btree_balance_dirty(info->tree_root, nr); cond_resched(); - trans = btrfs_start_transaction(root, 1); if (root->fs_info->closing || ret != -EAGAIN) break; } root->defrag_running = 0; - smp_mb(); - btrfs_end_transaction(trans, root); - return 0; + return ret; } #if 0 @@ -758,47 +832,63 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root *root = pending->root; struct btrfs_root *parent_root; struct inode *parent_inode; + struct dentry *dentry; struct extent_buffer *tmp; struct extent_buffer *old; int ret; - u64 objectid; - int namelen; + int retries = 0; + u64 to_reserve = 0; u64 index = 0; - - parent_inode = pending->dentry->d_parent->d_inode; - parent_root = BTRFS_I(parent_inode)->root; + u64 objectid; new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); if (!new_root_item) { - ret = -ENOMEM; + pending->error = -ENOMEM; goto fail; } + ret = btrfs_find_free_objectid(trans, tree_root, 0, &objectid); - if (ret) + if (ret) { + pending->error = ret; goto fail; + } + + btrfs_reloc_pre_snapshot(trans, pending, &to_reserve); + btrfs_orphan_pre_snapshot(trans, pending, &to_reserve); + + if (to_reserve > 0) { + ret = btrfs_block_rsv_add(trans, root, &pending->block_rsv, + 
to_reserve, &retries); + if (ret) { + pending->error = ret; + goto fail; + } + } key.objectid = objectid; - /* record when the snapshot was created in key.offset */ - key.offset = trans->transid; - btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + key.offset = (u64)-1; + key.type = BTRFS_ROOT_ITEM_KEY; - memcpy(&pending->root_key, &key, sizeof(key)); - pending->root_key.offset = (u64)-1; + trans->block_rsv = &pending->block_rsv; + dentry = pending->dentry; + parent_inode = dentry->d_parent->d_inode; + parent_root = BTRFS_I(parent_inode)->root; record_root_in_trans(trans, parent_root); + /* * insert the directory item */ - namelen = strlen(pending->name); ret = btrfs_set_inode_index(parent_inode, &index); BUG_ON(ret); ret = btrfs_insert_dir_item(trans, parent_root, - pending->name, namelen, - parent_inode->i_ino, - &pending->root_key, BTRFS_FT_DIR, index); + dentry->d_name.name, dentry->d_name.len, + parent_inode->i_ino, &key, + BTRFS_FT_DIR, index); BUG_ON(ret); - btrfs_i_size_write(parent_inode, parent_inode->i_size + namelen * 2); + btrfs_i_size_write(parent_inode, parent_inode->i_size + + dentry->d_name.len * 2); ret = btrfs_update_inode(trans, parent_root, parent_inode); BUG_ON(ret); @@ -815,22 +905,32 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, free_extent_buffer(old); btrfs_set_root_node(new_root_item, tmp); - ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, - new_root_item); - BUG_ON(ret); + /* record when the snapshot was created in key.offset */ + key.offset = trans->transid; + ret = btrfs_insert_root(trans, tree_root, &key, new_root_item); btrfs_tree_unlock(tmp); free_extent_buffer(tmp); + BUG_ON(ret); - ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root, - pending->root_key.objectid, + /* + * insert root back/forward references + */ + ret = btrfs_add_root_ref(trans, tree_root, objectid, parent_root->root_key.objectid, - parent_inode->i_ino, index, pending->name, - namelen); + parent_inode->i_ino, index, + dentry->d_name.name, dentry->d_name.len); BUG_ON(ret); + key.offset = (u64)-1; + pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key); + BUG_ON(IS_ERR(pending->snap)); + + btrfs_reloc_post_snapshot(trans, pending); + btrfs_orphan_post_snapshot(trans, pending); fail: kfree(new_root_item); - return ret; + btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1); + return 0; } /* @@ -878,6 +978,16 @@ int btrfs_transaction_in_commit(struct btrfs_fs_info *info) return ret; } +int btrfs_transaction_blocked(struct btrfs_fs_info *info) +{ + int ret = 0; + spin_lock(&info->new_trans_lock); + if (info->running_transaction) + ret = info->running_transaction->blocked; + spin_unlock(&info->new_trans_lock); + return ret; +} + int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root) { @@ -899,6 +1009,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, ret = btrfs_run_delayed_refs(trans, root, 0); BUG_ON(ret); + btrfs_trans_release_metadata(trans, root); + cur_trans = trans->transaction; /* * set the flushing flag so procs in this transaction have to @@ -951,9 +1063,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, snap_pending = 1; WARN_ON(cur_trans != trans->transaction); - prepare_to_wait(&cur_trans->writer_wait, &wait, - TASK_UNINTERRUPTIBLE); - if (cur_trans->num_writers > 1) timeout = MAX_SCHEDULE_TIMEOUT; else if (should_grow) @@ -976,6 +1085,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, */ btrfs_run_ordered_operations(root, 
1); + prepare_to_wait(&cur_trans->writer_wait, &wait, + TASK_UNINTERRUPTIBLE); + smp_mb(); if (cur_trans->num_writers > 1 || should_grow) schedule_timeout(timeout); @@ -1103,9 +1215,9 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root) if (btrfs_header_backref_rev(root->node) < BTRFS_MIXED_BACKREF_REV) - btrfs_drop_snapshot(root, 0); + btrfs_drop_snapshot(root, NULL, 0); else - btrfs_drop_snapshot(root, 1); + btrfs_drop_snapshot(root, NULL, 1); } return 0; } diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 93c7ccb33118..e104986d0bfd 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -45,20 +45,23 @@ struct btrfs_transaction { struct btrfs_trans_handle { u64 transid; + u64 block_group; + u64 bytes_reserved; unsigned long blocks_reserved; unsigned long blocks_used; - struct btrfs_transaction *transaction; - u64 block_group; - u64 alloc_exclude_start; - u64 alloc_exclude_nr; unsigned long delayed_ref_updates; + struct btrfs_transaction *transaction; + struct btrfs_block_rsv *block_rsv; }; struct btrfs_pending_snapshot { struct dentry *dentry; struct btrfs_root *root; - char *name; - struct btrfs_key root_key; + struct btrfs_root *snap; + /* block reservation for the operation */ + struct btrfs_block_rsv block_rsv; + /* extra metadata reseration for relocation */ + int error; struct list_head list; }; @@ -85,11 +88,11 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans, int btrfs_end_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root); struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, - int num_blocks); + int num_items); struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root, - int num_blocks); + int num_blocks); struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r, - int num_blocks); + int num_blocks); int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, @@ -103,6 +106,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, struct btrfs_root *root); +int btrfs_should_end_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root); void btrfs_throttle(struct btrfs_root *root); int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root); @@ -112,5 +117,6 @@ int btrfs_write_marked_extents(struct btrfs_root *root, struct extent_io_tree *dirty_pages, int mark); int btrfs_wait_marked_extents(struct btrfs_root *root, struct extent_io_tree *dirty_pages, int mark); +int btrfs_transaction_blocked(struct btrfs_fs_info *info); int btrfs_transaction_in_commit(struct btrfs_fs_info *info); #endif diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c index b10eacdb1620..f7ac8e013ed7 100644 --- a/fs/btrfs/tree-defrag.c +++ b/fs/btrfs/tree-defrag.c @@ -117,13 +117,14 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, path->nodes[1], 0, cache_only, &last_ret, &root->defrag_progress); - WARN_ON(ret && ret != -EAGAIN); + if (ret) { + WARN_ON(ret == -EAGAIN); + goto out; + } if (next_key_ret == 0) { memcpy(&root->defrag_progress, &key, sizeof(key)); ret = -EAGAIN; } - - btrfs_release_path(root, path); out: if (path) btrfs_free_path(path); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index af57dd2b43d4..fb102a9aee9c 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ 
-135,6 +135,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root) { int ret; + int err = 0; mutex_lock(&root->log_mutex); if (root->log_root) { @@ -155,17 +156,19 @@ static int start_log_trans(struct btrfs_trans_handle *trans, mutex_lock(&root->fs_info->tree_log_mutex); if (!root->fs_info->log_root_tree) { ret = btrfs_init_log_root_tree(trans, root->fs_info); - BUG_ON(ret); + if (ret) + err = ret; } - if (!root->log_root) { + if (err == 0 && !root->log_root) { ret = btrfs_add_log_tree(trans, root); - BUG_ON(ret); + if (ret) + err = ret; } mutex_unlock(&root->fs_info->tree_log_mutex); root->log_batch++; atomic_inc(&root->log_writers); mutex_unlock(&root->log_mutex); - return 0; + return err; } /* @@ -376,7 +379,7 @@ insert: BUG_ON(ret); } } else if (ret) { - BUG(); + return ret; } dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); @@ -1699,9 +1702,9 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, next = btrfs_find_create_tree_block(root, bytenr, blocksize); - wc->process_func(root, next, wc, ptr_gen); - if (*level == 1) { + wc->process_func(root, next, wc, ptr_gen); + path->slots[*level]++; if (wc->free) { btrfs_read_buffer(next, ptr_gen); @@ -1734,35 +1737,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, WARN_ON(*level < 0); WARN_ON(*level >= BTRFS_MAX_LEVEL); - if (path->nodes[*level] == root->node) - parent = path->nodes[*level]; - else - parent = path->nodes[*level + 1]; - - bytenr = path->nodes[*level]->start; - - blocksize = btrfs_level_size(root, *level); - root_owner = btrfs_header_owner(parent); - root_gen = btrfs_header_generation(parent); - - wc->process_func(root, path->nodes[*level], wc, - btrfs_header_generation(path->nodes[*level])); - - if (wc->free) { - next = path->nodes[*level]; - btrfs_tree_lock(next); - clean_tree_block(trans, root, next); - btrfs_set_lock_blocking(next); - btrfs_wait_tree_block_writeback(next); - btrfs_tree_unlock(next); - - WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); - ret = btrfs_free_reserved_extent(root, bytenr, blocksize); - BUG_ON(ret); - } - free_extent_buffer(path->nodes[*level]); - path->nodes[*level] = NULL; - *level += 1; + path->slots[*level] = btrfs_header_nritems(path->nodes[*level]); cond_resched(); return 0; @@ -1781,7 +1756,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { slot = path->slots[i]; - if (slot < btrfs_header_nritems(path->nodes[i]) - 1) { + if (slot + 1 < btrfs_header_nritems(path->nodes[i])) { struct extent_buffer *node; node = path->nodes[i]; path->slots[i]++; @@ -2047,7 +2022,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, mutex_unlock(&log_root_tree->log_mutex); ret = update_log_root(trans, log); - BUG_ON(ret); mutex_lock(&log_root_tree->log_mutex); if (atomic_dec_and_test(&log_root_tree->log_writers)) { @@ -2056,6 +2030,15 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, wake_up(&log_root_tree->log_writer_wait); } + if (ret) { + BUG_ON(ret != -ENOSPC); + root->fs_info->last_trans_log_full_commit = trans->transid; + btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); + mutex_unlock(&log_root_tree->log_mutex); + ret = -EAGAIN; + goto out; + } + index2 = log_root_tree->log_transid % 2; if (atomic_read(&log_root_tree->log_commit[index2])) { btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); @@ -2129,15 +2112,10 @@ out: return 0; } -/* - * free all the extents used by the tree log. 
This should be called - * at commit time of the full transaction - */ -int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) +static void free_log_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *log) { int ret; - struct btrfs_root *log; - struct key; u64 start; u64 end; struct walk_control wc = { @@ -2145,10 +2123,6 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) .process_func = process_one_buffer }; - if (!root->log_root || root->fs_info->log_root_recovering) - return 0; - - log = root->log_root; ret = walk_log_tree(trans, log, &wc); BUG_ON(ret); @@ -2162,14 +2136,30 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS); } - if (log->log_transid > 0) { - ret = btrfs_del_root(trans, root->fs_info->log_root_tree, - &log->root_key); - BUG_ON(ret); - } - root->log_root = NULL; free_extent_buffer(log->node); kfree(log); +} + +/* + * free all the extents used by the tree log. This should be called + * at commit time of the full transaction + */ +int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) +{ + if (root->log_root) { + free_log_tree(trans, root->log_root); + root->log_root = NULL; + } + return 0; +} + +int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + if (fs_info->log_root_tree) { + free_log_tree(trans, fs_info->log_root_tree); + fs_info->log_root_tree = NULL; + } return 0; } @@ -2203,6 +2193,7 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, struct btrfs_dir_item *di; struct btrfs_path *path; int ret; + int err = 0; int bytes_del = 0; if (BTRFS_I(dir)->logged_trans < trans->transid) @@ -2218,7 +2209,11 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino, name, name_len, -1); - if (di && !IS_ERR(di)) { + if (IS_ERR(di)) { + err = PTR_ERR(di); + goto fail; + } + if (di) { ret = btrfs_delete_one_dir_name(trans, log, path, di); bytes_del += name_len; BUG_ON(ret); @@ -2226,7 +2221,11 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, btrfs_release_path(log, path); di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino, index, name, name_len, -1); - if (di && !IS_ERR(di)) { + if (IS_ERR(di)) { + err = PTR_ERR(di); + goto fail; + } + if (di) { ret = btrfs_delete_one_dir_name(trans, log, path, di); bytes_del += name_len; BUG_ON(ret); @@ -2244,6 +2243,10 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, btrfs_release_path(log, path); ret = btrfs_search_slot(trans, log, &key, path, 0, 1); + if (ret < 0) { + err = ret; + goto fail; + } if (ret == 0) { struct btrfs_inode_item *item; u64 i_size; @@ -2261,9 +2264,13 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, ret = 0; btrfs_release_path(log, path); } - +fail: btrfs_free_path(path); mutex_unlock(&BTRFS_I(dir)->log_mutex); + if (ret == -ENOSPC) { + root->fs_info->last_trans_log_full_commit = trans->transid; + ret = 0; + } btrfs_end_log_trans(root); return 0; @@ -2291,6 +2298,10 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino, dirid, &index); mutex_unlock(&BTRFS_I(inode)->log_mutex); + if (ret == -ENOSPC) { + root->fs_info->last_trans_log_full_commit = trans->transid; + ret = 0; + } btrfs_end_log_trans(root); return ret; @@ -2318,7 +2329,8 @@ static noinline int 
insert_dir_log_key(struct btrfs_trans_handle *trans, else key.type = BTRFS_DIR_LOG_INDEX_KEY; ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); - BUG_ON(ret); + if (ret) + return ret; item = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_dir_log_item); @@ -2343,6 +2355,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, struct btrfs_key max_key; struct btrfs_root *log = root->log_root; struct extent_buffer *src; + int err = 0; int ret; int i; int nritems; @@ -2405,6 +2418,10 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, ret = overwrite_item(trans, log, dst_path, path->nodes[0], path->slots[0], &tmp); + if (ret) { + err = ret; + goto done; + } } } btrfs_release_path(root, path); @@ -2432,7 +2449,10 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, goto done; ret = overwrite_item(trans, log, dst_path, src, i, &min_key); - BUG_ON(ret); + if (ret) { + err = ret; + goto done; + } } path->slots[0] = nritems; @@ -2454,22 +2474,30 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, ret = overwrite_item(trans, log, dst_path, path->nodes[0], path->slots[0], &tmp); - - BUG_ON(ret); - last_offset = tmp.offset; + if (ret) + err = ret; + else + last_offset = tmp.offset; goto done; } } done: - *last_offset_ret = last_offset; btrfs_release_path(root, path); btrfs_release_path(log, dst_path); - /* insert the log range keys to indicate where the log is valid */ - ret = insert_dir_log_key(trans, log, path, key_type, inode->i_ino, - first_offset, last_offset); - BUG_ON(ret); - return 0; + if (err == 0) { + *last_offset_ret = last_offset; + /* + * insert the log range keys to indicate where the log + * is valid + */ + ret = insert_dir_log_key(trans, log, path, key_type, + inode->i_ino, first_offset, + last_offset); + if (ret) + err = ret; + } + return err; } /* @@ -2501,7 +2529,8 @@ again: ret = log_dir_items(trans, root, inode, path, dst_path, key_type, min_key, &max_key); - BUG_ON(ret); + if (ret) + return ret; if (max_key == (u64)-1) break; min_key = max_key + 1; @@ -2535,8 +2564,8 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans, while (1) { ret = btrfs_search_slot(trans, log, &key, path, -1, 1); - - if (ret != 1) + BUG_ON(ret == 0); + if (ret < 0) break; if (path->slots[0] == 0) @@ -2554,7 +2583,7 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans, btrfs_release_path(log, path); } btrfs_release_path(log, path); - return 0; + return ret; } static noinline int copy_items(struct btrfs_trans_handle *trans, @@ -2587,7 +2616,10 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, } ret = btrfs_insert_empty_items(trans, log, dst_path, ins_keys, ins_sizes, nr); - BUG_ON(ret); + if (ret) { + kfree(ins_data); + return ret; + } for (i = 0; i < nr; i++, dst_path->slots[0]++) { dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], @@ -2660,16 +2692,17 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, * we have to do this after the loop above to avoid changing the * log tree while trying to change the log tree. */ + ret = 0; while (!list_empty(&ordered_sums)) { struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, struct btrfs_ordered_sum, list); - ret = btrfs_csum_file_blocks(trans, log, sums); - BUG_ON(ret); + if (!ret) + ret = btrfs_csum_file_blocks(trans, log, sums); list_del(&sums->list); kfree(sums); } - return 0; + return ret; } /* log a single inode in the tree log. 
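The ordered_sums loop in the copy_items() hunk above is reworked so the first checksum failure is remembered while the remaining list entries are still unlinked and freed. A minimal userspace model of that "record the first error, keep draining the list" pattern follows; the names (node, process_one, drain_list) are invented for illustration and this is a sketch of the idea, not btrfs code.

#include <stdio.h>
#include <stdlib.h>

struct node {
	int value;
	struct node *next;
};

/* stand-in for the per-entry work (btrfs_csum_file_blocks in the hunk above) */
static int process_one(struct node *n)
{
	return n->value < 0 ? -1 : 0;
}

static int drain_list(struct node *head)
{
	int ret = 0;

	while (head) {
		struct node *next = head->next;

		/* after the first failure, stop processing but keep freeing */
		if (!ret)
			ret = process_one(head);
		free(head);
		head = next;
	}
	return ret;	/* the first error (or 0) is what the caller sees */
}

int main(void)
{
	struct node *head = NULL;
	int vals[] = { 1, -2, 3 };

	for (int i = 2; i >= 0; i--) {
		struct node *n = malloc(sizeof(*n));
		if (!n)
			return 1;
		n->value = vals[i];
		n->next = head;
		head = n;
	}
	printf("drain_list returned %d\n", drain_list(head));
	return 0;
}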
@@ -2697,6 +2730,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_root *log = root->log_root; struct extent_buffer *src = NULL; u32 size; + int err = 0; int ret; int nritems; int ins_start_slot = 0; @@ -2739,7 +2773,10 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, } else { ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0); } - BUG_ON(ret); + if (ret) { + err = ret; + goto out_unlock; + } path->keep_locks = 1; while (1) { @@ -2768,7 +2805,10 @@ again: ret = copy_items(trans, log, dst_path, src, ins_start_slot, ins_nr, inode_only); - BUG_ON(ret); + if (ret) { + err = ret; + goto out_unlock; + } ins_nr = 1; ins_start_slot = path->slots[0]; next_slot: @@ -2784,7 +2824,10 @@ next_slot: ret = copy_items(trans, log, dst_path, src, ins_start_slot, ins_nr, inode_only); - BUG_ON(ret); + if (ret) { + err = ret; + goto out_unlock; + } ins_nr = 0; } btrfs_release_path(root, path); @@ -2802,7 +2845,10 @@ next_slot: ret = copy_items(trans, log, dst_path, src, ins_start_slot, ins_nr, inode_only); - BUG_ON(ret); + if (ret) { + err = ret; + goto out_unlock; + } ins_nr = 0; } WARN_ON(ins_nr); @@ -2810,14 +2856,18 @@ next_slot: btrfs_release_path(root, path); btrfs_release_path(log, dst_path); ret = log_directory_changes(trans, root, inode, path, dst_path); - BUG_ON(ret); + if (ret) { + err = ret; + goto out_unlock; + } } BTRFS_I(inode)->logged_trans = trans->transid; +out_unlock: mutex_unlock(&BTRFS_I(inode)->log_mutex); btrfs_free_path(path); btrfs_free_path(dst_path); - return 0; + return err; } /* @@ -2942,10 +2992,13 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, goto end_no_trans; } - start_log_trans(trans, root); + ret = start_log_trans(trans, root); + if (ret) + goto end_trans; ret = btrfs_log_inode(trans, root, inode, inode_only); - BUG_ON(ret); + if (ret) + goto end_trans; /* * for regular files, if its inode is already on disk, we don't @@ -2955,8 +3008,10 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, */ if (S_ISREG(inode->i_mode) && BTRFS_I(inode)->generation <= last_committed && - BTRFS_I(inode)->last_unlink_trans <= last_committed) - goto no_parent; + BTRFS_I(inode)->last_unlink_trans <= last_committed) { + ret = 0; + goto end_trans; + } inode_only = LOG_INODE_EXISTS; while (1) { @@ -2970,15 +3025,21 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, if (BTRFS_I(inode)->generation > root->fs_info->last_trans_committed) { ret = btrfs_log_inode(trans, root, inode, inode_only); - BUG_ON(ret); + if (ret) + goto end_trans; } if (IS_ROOT(parent)) break; parent = parent->d_parent; } -no_parent: ret = 0; +end_trans: + if (ret < 0) { + BUG_ON(ret != -ENOSPC); + root->fs_info->last_trans_log_full_commit = trans->transid; + ret = 1; + } btrfs_end_log_trans(root); end_no_trans: return ret; @@ -3020,7 +3081,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) path = btrfs_alloc_path(); BUG_ON(!path); - trans = btrfs_start_transaction(fs_info->tree_root, 1); + trans = btrfs_start_transaction(fs_info->tree_root, 0); wc.trans = trans; wc.pin = 1; diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 0776eacb5083..3dfae84c8cc8 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -25,6 +25,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); +int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); int btrfs_recover_log_trees(struct 
btrfs_root *tree_root); int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct dentry *dentry); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 8db7b14bbae8..d6e3af8be95b 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1097,7 +1097,7 @@ static int btrfs_rm_dev_item(struct btrfs_root *root, if (!path) return -ENOMEM; - trans = btrfs_start_transaction(root, 1); + trans = btrfs_start_transaction(root, 0); key.objectid = BTRFS_DEV_ITEMS_OBJECTID; key.type = BTRFS_DEV_ITEM_KEY; key.offset = device->devid; @@ -1486,7 +1486,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) goto error; } - trans = btrfs_start_transaction(root, 1); + trans = btrfs_start_transaction(root, 0); lock_chunks(root); device->barriers = 1; @@ -1751,9 +1751,10 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, /* step one, relocate all the extents inside this chunk */ ret = btrfs_relocate_block_group(extent_root, chunk_offset); - BUG_ON(ret); + if (ret) + return ret; - trans = btrfs_start_transaction(root, 1); + trans = btrfs_start_transaction(root, 0); BUG_ON(!trans); lock_chunks(root); @@ -1925,7 +1926,7 @@ int btrfs_balance(struct btrfs_root *dev_root) break; BUG_ON(ret); - trans = btrfs_start_transaction(dev_root, 1); + trans = btrfs_start_transaction(dev_root, 0); BUG_ON(!trans); ret = btrfs_grow_device(trans, device, old_size); @@ -2094,11 +2095,7 @@ again: } /* Shrinking succeeded, else we would be at "done". */ - trans = btrfs_start_transaction(root, 1); - if (!trans) { - ret = -ENOMEM; - goto done; - } + trans = btrfs_start_transaction(root, 0); lock_chunks(root); device->disk_total_bytes = new_size; diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 59acd3eb288a..88ecbb215878 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -154,15 +154,10 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans, if (trans) return do_setxattr(trans, inode, name, value, size, flags); - ret = btrfs_reserve_metadata_space(root, 2); - if (ret) - return ret; + trans = btrfs_start_transaction(root, 2); + if (IS_ERR(trans)) + return PTR_ERR(trans); - trans = btrfs_start_transaction(root, 1); - if (!trans) { - ret = -ENOMEM; - goto out; - } btrfs_set_trans_block_group(trans, inode); ret = do_setxattr(trans, inode, name, value, size, flags); @@ -174,7 +169,6 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans, BUG_ON(ret); out: btrfs_end_transaction_throttle(trans, root); - btrfs_unreserve_metadata_space(root, 2); return ret; } diff --git a/fs/buffer.c b/fs/buffer.c index e8aa7081d25c..d54812b198e9 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1949,14 +1949,11 @@ static int __block_commit_write(struct inode *inode, struct page *page, } /* - * block_write_begin takes care of the basic task of block allocation and - * bringing partial write blocks uptodate first. - * - * If *pagep is not NULL, then block_write_begin uses the locked page - * at *pagep rather than allocating its own. In this case, the page will - * not be unlocked or deallocated on failure. + * Filesystems implementing the new truncate sequence should use the + * _newtrunc postfix variant which won't incorrectly call vmtruncate. + * The filesystem needs to handle block truncation upon failure. 
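The buffer.c hunks here split each write_begin helper into a "_newtrunc" core that never calls vmtruncate() and a legacy wrapper that trims allocations past i_size only when the core fails. A rough userspace model of that wrapper split, under invented names (write_begin_core, write_begin_legacy, legacy_trim, struct fake_inode), is shown below; it only illustrates the calling convention, not the real page and block handling.

#include <stdio.h>

struct fake_inode {
	long long i_size;
	long long allocated;	/* stand-in for blocks instantiated on disk */
};

/* core helper: may leave allocations past i_size behind on failure */
static int write_begin_core(struct fake_inode *inode, long long pos, long long len)
{
	inode->allocated = pos + len;
	return pos + len > 100 ? -1 : 0;	/* arbitrary failure condition */
}

/* legacy cleanup, analogous to truncating back to i_size */
static void legacy_trim(struct fake_inode *inode)
{
	if (inode->allocated > inode->i_size)
		inode->allocated = inode->i_size;
}

/* legacy entry point: keeps the old "truncate on failure" behaviour */
static int write_begin_legacy(struct fake_inode *inode, long long pos, long long len)
{
	int ret = write_begin_core(inode, pos, len);

	if (ret && pos + len > inode->i_size)
		legacy_trim(inode);
	return ret;
}

int main(void)
{
	struct fake_inode inode = { .i_size = 50, .allocated = 50 };
	int ret = write_begin_legacy(&inode, 90, 20);

	printf("ret=%d allocated=%lld (trimmed back to i_size)\n",
	       ret, inode.allocated);
	return 0;
}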
*/ -int block_write_begin(struct file *file, struct address_space *mapping, +int block_write_begin_newtrunc(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata, get_block_t *get_block) @@ -1992,20 +1989,50 @@ int block_write_begin(struct file *file, struct address_space *mapping, unlock_page(page); page_cache_release(page); *pagep = NULL; - - /* - * prepare_write() may have instantiated a few blocks - * outside i_size. Trim these off again. Don't need - * i_size_read because we hold i_mutex. - */ - if (pos + len > inode->i_size) - vmtruncate(inode, inode->i_size); } } out: return status; } +EXPORT_SYMBOL(block_write_begin_newtrunc); + +/* + * block_write_begin takes care of the basic task of block allocation and + * bringing partial write blocks uptodate first. + * + * If *pagep is not NULL, then block_write_begin uses the locked page + * at *pagep rather than allocating its own. In this case, the page will + * not be unlocked or deallocated on failure. + */ +int block_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata, + get_block_t *get_block) +{ + int ret; + + ret = block_write_begin_newtrunc(file, mapping, pos, len, flags, + pagep, fsdata, get_block); + + /* + * prepare_write() may have instantiated a few blocks + * outside i_size. Trim these off again. Don't need + * i_size_read because we hold i_mutex. + * + * Filesystems which pass down their own page also cannot + * call into vmtruncate here because it would lead to lock + * inversion problems (*pagep is locked). This is a further + * example of where the old truncate sequence is inadequate. + */ + if (unlikely(ret) && *pagep == NULL) { + loff_t isize = mapping->host->i_size; + if (pos + len > isize) + vmtruncate(mapping->host, isize); + } + + return ret; +} EXPORT_SYMBOL(block_write_begin); int block_write_end(struct file *file, struct address_space *mapping, @@ -2324,7 +2351,7 @@ out: * For moronic filesystems that do not allow holes in file. * We may have to extend the file. */ -int cont_write_begin(struct file *file, struct address_space *mapping, +int cont_write_begin_newtrunc(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata, get_block_t *get_block, loff_t *bytes) @@ -2345,11 +2372,30 @@ int cont_write_begin(struct file *file, struct address_space *mapping, } *pagep = NULL; - err = block_write_begin(file, mapping, pos, len, + err = block_write_begin_newtrunc(file, mapping, pos, len, flags, pagep, fsdata, get_block); out: return err; } +EXPORT_SYMBOL(cont_write_begin_newtrunc); + +int cont_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata, + get_block_t *get_block, loff_t *bytes) +{ + int ret; + + ret = cont_write_begin_newtrunc(file, mapping, pos, len, flags, + pagep, fsdata, get_block, bytes); + if (unlikely(ret)) { + loff_t isize = mapping->host->i_size; + if (pos + len > isize) + vmtruncate(mapping->host, isize); + } + + return ret; +} EXPORT_SYMBOL(cont_write_begin); int block_prepare_write(struct page *page, unsigned from, unsigned to, @@ -2381,7 +2427,7 @@ EXPORT_SYMBOL(block_commit_write); * * We are not allowed to take the i_mutex here so we have to play games to * protect against truncate races as the page could now be beyond EOF. 
Because - * vmtruncate() writes the inode size before removing pages, once we have the + * truncate writes the inode size before removing pages, once we have the * page lock we can determine safely if the page is beyond EOF. If it is not * beyond EOF, then the page is guaranteed safe against truncation until we * unlock the page. @@ -2464,10 +2510,11 @@ static void attach_nobh_buffers(struct page *page, struct buffer_head *head) } /* - * On entry, the page is fully not uptodate. - * On exit the page is fully uptodate in the areas outside (from,to) + * Filesystems implementing the new truncate sequence should use the + * _newtrunc postfix variant which won't incorrectly call vmtruncate. + * The filesystem needs to handle block truncation upon failure. */ -int nobh_write_begin(struct file *file, struct address_space *mapping, +int nobh_write_begin_newtrunc(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata, get_block_t *get_block) @@ -2500,8 +2547,8 @@ int nobh_write_begin(struct file *file, struct address_space *mapping, unlock_page(page); page_cache_release(page); *pagep = NULL; - return block_write_begin(file, mapping, pos, len, flags, pagep, - fsdata, get_block); + return block_write_begin_newtrunc(file, mapping, pos, len, + flags, pagep, fsdata, get_block); } if (PageMappedToDisk(page)) @@ -2605,8 +2652,34 @@ out_release: page_cache_release(page); *pagep = NULL; - if (pos + len > inode->i_size) - vmtruncate(inode, inode->i_size); + return ret; +} +EXPORT_SYMBOL(nobh_write_begin_newtrunc); + +/* + * On entry, the page is fully not uptodate. + * On exit the page is fully uptodate in the areas outside (from,to) + */ +int nobh_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata, + get_block_t *get_block) +{ + int ret; + + ret = nobh_write_begin_newtrunc(file, mapping, pos, len, flags, + pagep, fsdata, get_block); + + /* + * prepare_write() may have instantiated a few blocks + * outside i_size. Trim these off again. Don't need + * i_size_read because we hold i_mutex. + */ + if (unlikely(ret)) { + loff_t isize = mapping->host->i_size; + if (pos + len > isize) + vmtruncate(mapping->host, isize); + } return ret; } diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig index 04b8280582a9..bc87b9c1d27e 100644 --- a/fs/ceph/Kconfig +++ b/fs/ceph/Kconfig @@ -2,7 +2,7 @@ config CEPH_FS tristate "Ceph distributed file system (EXPERIMENTAL)" depends on INET && EXPERIMENTAL select LIBCRC32C - select CONFIG_CRYPTO_AES + select CRYPTO_AES help Choose Y or M here to include support for mounting the experimental Ceph distributed file system. 
Ceph is an extremely diff --git a/fs/ceph/auth.c b/fs/ceph/auth.c index 9f46de2ba7a7..89490beaf537 100644 --- a/fs/ceph/auth.c +++ b/fs/ceph/auth.c @@ -1,7 +1,6 @@ #include "ceph_debug.h" #include <linux/module.h> -#include <linux/slab.h> #include <linux/err.h> #include <linux/slab.h> @@ -217,8 +216,8 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac, if (ac->protocol != protocol) { ret = ceph_auth_init_protocol(ac, protocol); if (ret) { - pr_err("error %d on auth method %s init\n", - ret, ac->ops->name); + pr_err("error %d on auth protocol %d init\n", + ret, protocol); goto out; } } @@ -247,7 +246,7 @@ int ceph_build_auth(struct ceph_auth_client *ac, if (!ac->protocol) return ceph_auth_build_hello(ac, msg_buf, msg_len); BUG_ON(!ac->ops); - if (!ac->ops->is_authenticated(ac)) + if (ac->ops->should_authenticate(ac)) return ceph_build_auth_request(ac, msg_buf, msg_len); return 0; } diff --git a/fs/ceph/auth.h b/fs/ceph/auth.h index 4429a707c021..d38a2fb4a137 100644 --- a/fs/ceph/auth.h +++ b/fs/ceph/auth.h @@ -24,6 +24,12 @@ struct ceph_auth_client_ops { int (*is_authenticated)(struct ceph_auth_client *ac); /* + * true if we should (re)authenticate, e.g., when our tickets + * are getting old and crusty. + */ + int (*should_authenticate)(struct ceph_auth_client *ac); + + /* * build requests and process replies during monitor * handshake. if handle_reply returns -EAGAIN, we build * another request. diff --git a/fs/ceph/auth_none.c b/fs/ceph/auth_none.c index 24407c119291..ad1dc21286c7 100644 --- a/fs/ceph/auth_none.c +++ b/fs/ceph/auth_none.c @@ -31,6 +31,13 @@ static int is_authenticated(struct ceph_auth_client *ac) return !xi->starting; } +static int should_authenticate(struct ceph_auth_client *ac) +{ + struct ceph_auth_none_info *xi = ac->private; + + return xi->starting; +} + /* * the generic auth code decode the global_id, and we carry no actual * authenticate state, so nothing happens here. 
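The auth changes above add a should_authenticate() operation next to is_authenticated() and make the generic layer consult it before building a new request. A small userspace sketch of that ops-table shape follows; the names mirror the ceph code only loosely and the single "none"-style protocol is made up for illustration.

#include <stdio.h>

struct auth_client;

struct auth_client_ops {
	const char *name;
	int (*is_authenticated)(struct auth_client *ac);
	int (*should_authenticate)(struct auth_client *ac);
};

struct auth_client {
	const struct auth_client_ops *ops;
	int starting;		/* toy protocol: authenticate exactly once */
};

static int none_is_authenticated(struct auth_client *ac)
{
	return !ac->starting;
}

static int none_should_authenticate(struct auth_client *ac)
{
	return ac->starting;
}

static const struct auth_client_ops auth_none_ops = {
	.name			= "none",
	.is_authenticated	= none_is_authenticated,
	.should_authenticate	= none_should_authenticate,
};

/* generic layer: build a request only when the protocol asks for one */
static int build_auth(struct auth_client *ac)
{
	if (ac->ops->should_authenticate(ac)) {
		printf("building auth request via %s\n", ac->ops->name);
		ac->starting = 0;
		return 1;
	}
	printf("nothing to do, authenticated=%d\n",
	       ac->ops->is_authenticated(ac));
	return 0;
}

int main(void)
{
	struct auth_client ac = { .ops = &auth_none_ops, .starting = 1 };

	build_auth(&ac);	/* first call: sends a request */
	build_auth(&ac);	/* second call: already authenticated */
	return 0;
}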
@@ -98,6 +105,7 @@ static const struct ceph_auth_client_ops ceph_auth_none_ops = { .reset = reset, .destroy = destroy, .is_authenticated = is_authenticated, + .should_authenticate = should_authenticate, .handle_reply = handle_reply, .create_authorizer = ceph_auth_none_create_authorizer, .destroy_authorizer = ceph_auth_none_destroy_authorizer, diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c index 7b206231566d..6d44053ecff1 100644 --- a/fs/ceph/auth_x.c +++ b/fs/ceph/auth_x.c @@ -27,6 +27,17 @@ static int ceph_x_is_authenticated(struct ceph_auth_client *ac) return (ac->want_keys & xi->have_keys) == ac->want_keys; } +static int ceph_x_should_authenticate(struct ceph_auth_client *ac) +{ + struct ceph_x_info *xi = ac->private; + int need; + + ceph_x_validate_tickets(ac, &need); + dout("ceph_x_should_authenticate want=%d need=%d have=%d\n", + ac->want_keys, need, xi->have_keys); + return need != 0; +} + static int ceph_x_encrypt_buflen(int ilen) { return sizeof(struct ceph_x_encrypt_header) + ilen + 16 + @@ -482,7 +493,7 @@ static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result, return -EAGAIN; } - op = le32_to_cpu(head->op); + op = le16_to_cpu(head->op); result = le32_to_cpu(head->result); dout("handle_reply op %d result %d\n", op, result); switch (op) { @@ -602,6 +613,9 @@ static void ceph_x_destroy(struct ceph_auth_client *ac) remove_ticket_handler(ac, th); } + if (xi->auth_authorizer.buf) + ceph_buffer_put(xi->auth_authorizer.buf); + kfree(ac->private); ac->private = NULL; } @@ -620,6 +634,7 @@ static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac, static const struct ceph_auth_client_ops ceph_x_ops = { .name = "x", .is_authenticated = ceph_x_is_authenticated, + .should_authenticate = ceph_x_should_authenticate, .build_request = ceph_x_build_request, .handle_reply = ceph_x_handle_reply, .create_authorizer = ceph_x_create_authorizer, diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 0dd0b81e64f7..b81be9a56487 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -244,8 +244,14 @@ static struct ceph_cap *get_cap(struct ceph_cap_reservation *ctx) struct ceph_cap *cap = NULL; /* temporary, until we do something about cap import/export */ - if (!ctx) - return kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS); + if (!ctx) { + cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS); + if (cap) { + caps_use_count++; + caps_total_count++; + } + return cap; + } spin_lock(&caps_list_lock); dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n", @@ -621,7 +627,7 @@ retry: if (fmode >= 0) __ceph_get_fmode(ci, fmode); spin_unlock(&inode->i_lock); - wake_up(&ci->i_cap_wq); + wake_up_all(&ci->i_cap_wq); return 0; } @@ -981,6 +987,46 @@ static int send_cap_msg(struct ceph_mds_session *session, return 0; } +static void __queue_cap_release(struct ceph_mds_session *session, + u64 ino, u64 cap_id, u32 migrate_seq, + u32 issue_seq) +{ + struct ceph_msg *msg; + struct ceph_mds_cap_release *head; + struct ceph_mds_cap_item *item; + + spin_lock(&session->s_cap_lock); + BUG_ON(!session->s_num_cap_releases); + msg = list_first_entry(&session->s_cap_releases, + struct ceph_msg, list_head); + + dout(" adding %llx release to mds%d msg %p (%d left)\n", + ino, session->s_mds, msg, session->s_num_cap_releases); + + BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE); + head = msg->front.iov_base; + head->num = cpu_to_le32(le32_to_cpu(head->num) + 1); + item = msg->front.iov_base + msg->front.iov_len; + item->ino = cpu_to_le64(ino); + item->cap_id = cpu_to_le64(cap_id); + item->migrate_seq 
= cpu_to_le32(migrate_seq); + item->seq = cpu_to_le32(issue_seq); + + session->s_num_cap_releases--; + + msg->front.iov_len += sizeof(*item); + if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) { + dout(" release msg %p full\n", msg); + list_move_tail(&msg->list_head, &session->s_cap_releases_done); + } else { + dout(" release msg %p at %d/%d (%d)\n", msg, + (int)le32_to_cpu(head->num), + (int)CEPH_CAPS_PER_RELEASE, + (int)msg->front.iov_len); + } + spin_unlock(&session->s_cap_lock); +} + /* * Queue cap releases when an inode is dropped from our cache. Since * inode is about to be destroyed, there is no need for i_lock. @@ -994,41 +1040,9 @@ void ceph_queue_caps_release(struct inode *inode) while (p) { struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node); struct ceph_mds_session *session = cap->session; - struct ceph_msg *msg; - struct ceph_mds_cap_release *head; - struct ceph_mds_cap_item *item; - spin_lock(&session->s_cap_lock); - BUG_ON(!session->s_num_cap_releases); - msg = list_first_entry(&session->s_cap_releases, - struct ceph_msg, list_head); - - dout(" adding %p release to mds%d msg %p (%d left)\n", - inode, session->s_mds, msg, session->s_num_cap_releases); - - BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE); - head = msg->front.iov_base; - head->num = cpu_to_le32(le32_to_cpu(head->num) + 1); - item = msg->front.iov_base + msg->front.iov_len; - item->ino = cpu_to_le64(ceph_ino(inode)); - item->cap_id = cpu_to_le64(cap->cap_id); - item->migrate_seq = cpu_to_le32(cap->mseq); - item->seq = cpu_to_le32(cap->issue_seq); - - session->s_num_cap_releases--; - - msg->front.iov_len += sizeof(*item); - if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) { - dout(" release msg %p full\n", msg); - list_move_tail(&msg->list_head, - &session->s_cap_releases_done); - } else { - dout(" release msg %p at %d/%d (%d)\n", msg, - (int)le32_to_cpu(head->num), - (int)CEPH_CAPS_PER_RELEASE, - (int)msg->front.iov_len); - } - spin_unlock(&session->s_cap_lock); + __queue_cap_release(session, ceph_ino(inode), cap->cap_id, + cap->mseq, cap->issue_seq); p = rb_next(p); __ceph_remove_cap(cap); } @@ -1167,7 +1181,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, } if (wake) - wake_up(&ci->i_cap_wq); + wake_up_all(&ci->i_cap_wq); return delayed; } @@ -1776,9 +1790,9 @@ out: spin_unlock(&ci->i_unsafe_lock); } -int ceph_fsync(struct file *file, struct dentry *dentry, int datasync) +int ceph_fsync(struct file *file, int datasync) { - struct inode *inode = dentry->d_inode; + struct inode *inode = file->f_mapping->host; struct ceph_inode_info *ci = ceph_inode(inode); unsigned flush_tid; int ret; @@ -2139,7 +2153,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) else if (flushsnaps) ceph_flush_snaps(ci); if (wake) - wake_up(&ci->i_cap_wq); + wake_up_all(&ci->i_cap_wq); if (put) iput(inode); } @@ -2215,7 +2229,7 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, iput(inode); } else if (complete_capsnap) { ceph_flush_snaps(ci); - wake_up(&ci->i_cap_wq); + wake_up_all(&ci->i_cap_wq); } if (drop_capsnap) iput(inode); @@ -2391,7 +2405,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, if (queue_invalidate) ceph_queue_invalidate(inode); if (wake) - wake_up(&ci->i_cap_wq); + wake_up_all(&ci->i_cap_wq); if (check_caps == 1) ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY, @@ -2446,7 +2460,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, struct ceph_inode_info, 
i_flushing_item)->vfs_inode); mdsc->num_cap_flushing--; - wake_up(&mdsc->cap_flushing_wq); + wake_up_all(&mdsc->cap_flushing_wq); dout(" inode %p now !flushing\n", inode); if (ci->i_dirty_caps == 0) { @@ -2458,7 +2472,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, } } spin_unlock(&mdsc->cap_dirty_lock); - wake_up(&ci->i_cap_wq); + wake_up_all(&ci->i_cap_wq); out: spin_unlock(&inode->i_lock); @@ -2655,7 +2669,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, struct ceph_mds_caps *h; int mds = session->s_mds; int op; - u32 seq; + u32 seq, mseq; struct ceph_vino vino; u64 cap_id; u64 size, max_size; @@ -2675,6 +2689,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, vino.snap = CEPH_NOSNAP; cap_id = le64_to_cpu(h->cap_id); seq = le32_to_cpu(h->seq); + mseq = le32_to_cpu(h->migrate_seq); size = le64_to_cpu(h->size); max_size = le64_to_cpu(h->max_size); @@ -2689,6 +2704,18 @@ void ceph_handle_caps(struct ceph_mds_session *session, vino.snap, inode); if (!inode) { dout(" i don't have ino %llx\n", vino.ino); + + if (op == CEPH_CAP_OP_IMPORT) + __queue_cap_release(session, vino.ino, cap_id, + mseq, seq); + + /* + * send any full release message to try to move things + * along for the mds (who clearly thinks we still have this + * cap). + */ + ceph_add_cap_releases(mdsc, session, -1); + ceph_send_cap_releases(mdsc, session); goto done; } @@ -2714,7 +2741,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, spin_lock(&inode->i_lock); cap = __get_cap_for_mds(ceph_inode(inode), mds); if (!cap) { - dout("no cap on %p ino %llx.%llx from mds%d, releasing\n", + dout(" no cap on %p ino %llx.%llx from mds%d\n", inode, ceph_ino(inode), ceph_snap(inode), mds); spin_unlock(&inode->i_lock); goto done; @@ -2865,18 +2892,19 @@ int ceph_encode_inode_release(void **p, struct inode *inode, struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_cap *cap; struct ceph_mds_request_release *rel = *p; + int used, dirty; int ret = 0; - int used = 0; spin_lock(&inode->i_lock); used = __ceph_caps_used(ci); + dirty = __ceph_caps_dirty(ci); - dout("encode_inode_release %p mds%d used %s drop %s unless %s\n", inode, - mds, ceph_cap_string(used), ceph_cap_string(drop), + dout("encode_inode_release %p mds%d used|dirty %s drop %s unless %s\n", + inode, mds, ceph_cap_string(used|dirty), ceph_cap_string(drop), ceph_cap_string(unless)); - /* only drop unused caps */ - drop &= ~used; + /* only drop unused, clean caps */ + drop &= ~(used | dirty); cap = __get_cap_for_mds(ci, mds); if (cap && __cap_is_valid(cap)) { @@ -2956,6 +2984,7 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry, memcpy(*p, dentry->d_name.name, dentry->d_name.len); *p += dentry->d_name.len; rel->dname_seq = cpu_to_le32(di->lease_seq); + __ceph_mdsc_drop_dentry_lease(dentry); } spin_unlock(&dentry->d_lock); return ret; diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h index 3b9eeed097b3..2fa992eaf7da 100644 --- a/fs/ceph/ceph_fs.h +++ b/fs/ceph/ceph_fs.h @@ -265,16 +265,17 @@ extern const char *ceph_mds_state_name(int s); * - they also define the lock ordering by the MDS * - a few of these are internal to the mds */ -#define CEPH_LOCK_DN 1 -#define CEPH_LOCK_ISNAP 2 -#define CEPH_LOCK_IVERSION 4 /* mds internal */ -#define CEPH_LOCK_IFILE 8 /* mds internal */ -#define CEPH_LOCK_IAUTH 32 -#define CEPH_LOCK_ILINK 64 -#define CEPH_LOCK_IDFT 128 /* dir frag tree */ -#define CEPH_LOCK_INEST 256 /* mds internal */ -#define CEPH_LOCK_IXATTR 512 -#define CEPH_LOCK_INO 2048 /* immutable inode bits; 
not a lock */ +#define CEPH_LOCK_DVERSION 1 +#define CEPH_LOCK_DN 2 +#define CEPH_LOCK_ISNAP 16 +#define CEPH_LOCK_IVERSION 32 /* mds internal */ +#define CEPH_LOCK_IFILE 64 +#define CEPH_LOCK_IAUTH 128 +#define CEPH_LOCK_ILINK 256 +#define CEPH_LOCK_IDFT 512 /* dir frag tree */ +#define CEPH_LOCK_INEST 1024 /* mds internal */ +#define CEPH_LOCK_IXATTR 2048 +#define CEPH_LOCK_INO 8192 /* immutable inode bits; not a lock */ /* client_session ops */ enum { diff --git a/fs/ceph/crush/mapper.c b/fs/ceph/crush/mapper.c index 9ba54efb6543..a4eec133258e 100644 --- a/fs/ceph/crush/mapper.c +++ b/fs/ceph/crush/mapper.c @@ -238,7 +238,7 @@ static int bucket_straw_choose(struct crush_bucket_straw *bucket, static int crush_bucket_choose(struct crush_bucket *in, int x, int r) { - dprintk("choose %d x=%d r=%d\n", in->id, x, r); + dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r); switch (in->alg) { case CRUSH_BUCKET_UNIFORM: return bucket_uniform_choose((struct crush_bucket_uniform *)in, @@ -264,7 +264,7 @@ static int crush_bucket_choose(struct crush_bucket *in, int x, int r) */ static int is_out(struct crush_map *map, __u32 *weight, int item, int x) { - if (weight[item] >= 0x1000) + if (weight[item] >= 0x10000) return 0; if (weight[item] == 0) return 1; @@ -305,7 +305,9 @@ static int crush_choose(struct crush_map *map, int itemtype; int collide, reject; const int orig_tries = 5; /* attempts before we fall back to search */ - dprintk("choose bucket %d x %d outpos %d\n", bucket->id, x, outpos); + + dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "", + bucket->id, x, outpos, numrep); for (rep = outpos; rep < numrep; rep++) { /* keep trying until we get a non-out, non-colliding item */ @@ -366,6 +368,7 @@ static int crush_choose(struct crush_map *map, BUG_ON(item >= 0 || (-1-item) >= map->max_buckets); in = map->buckets[-1-item]; + retry_bucket = 1; continue; } @@ -377,15 +380,25 @@ static int crush_choose(struct crush_map *map, } } - if (recurse_to_leaf && - item < 0 && - crush_choose(map, map->buckets[-1-item], - weight, - x, outpos+1, 0, - out2, outpos, - firstn, 0, NULL) <= outpos) { - reject = 1; - } else { + reject = 0; + if (recurse_to_leaf) { + if (item < 0) { + if (crush_choose(map, + map->buckets[-1-item], + weight, + x, outpos+1, 0, + out2, outpos, + firstn, 0, + NULL) <= outpos) + /* didn't get leaf */ + reject = 1; + } else { + /* we already have a leaf! */ + out2[outpos] = item; + } + } + + if (!reject) { /* out? 
*/ if (itemtype == 0) reject = is_out(map, weight, @@ -424,12 +437,12 @@ reject: continue; } - dprintk("choose got %d\n", item); + dprintk("CHOOSE got %d\n", item); out[outpos] = item; outpos++; } - dprintk("choose returns %d\n", outpos); + dprintk("CHOOSE returns %d\n", outpos); return outpos; } diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 3be33fb066cc..f2f5332ddbba 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -261,7 +261,7 @@ static int osdc_show(struct seq_file *s, void *pp) static int caps_show(struct seq_file *s, void *p) { - struct ceph_client *client = p; + struct ceph_client *client = s->private; int total, avail, used, reserved, min; ceph_reservation_status(client, &total, &avail, &used, &reserved, &min); diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 4fd30900eff7..f94ed3c7f6a5 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -266,6 +266,7 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir) spin_lock(&inode->i_lock); if ((filp->f_pos == 2 || fi->dentry) && !ceph_test_opt(client, NOASYNCREADDIR) && + ceph_snap(inode) != CEPH_SNAPDIR && (ci->i_ceph_flags & CEPH_I_COMPLETE) && __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) { err = __dcache_readdir(filp, dirent, filldir); @@ -587,7 +588,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP; req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS); if (IS_ERR(req)) - return ERR_PTR(PTR_ERR(req)); + return ERR_CAST(req); req->r_dentry = dget(dentry); req->r_num_caps = 2; /* we only need inode linkage */ @@ -1013,18 +1014,22 @@ out_touch: /* * When a dentry is released, clear the dir I_COMPLETE if it was part - * of the current dir gen. + * of the current dir gen or if this is in the snapshot namespace. */ static void ceph_dentry_release(struct dentry *dentry) { struct ceph_dentry_info *di = ceph_dentry(dentry); struct inode *parent_inode = dentry->d_parent->d_inode; + u64 snapid = ceph_snap(parent_inode); - if (parent_inode) { + dout("dentry_release %p parent %p\n", dentry, parent_inode); + + if (parent_inode && snapid != CEPH_SNAPDIR) { struct ceph_inode_info *ci = ceph_inode(parent_inode); spin_lock(&parent_inode->i_lock); - if (ci->i_shared_gen == di->lease_shared_gen) { + if (ci->i_shared_gen == di->lease_shared_gen || + snapid <= CEPH_MAXSNAP) { dout(" clearing %p complete (d_release)\n", parent_inode); ci->i_ceph_flags &= ~CEPH_I_COMPLETE; @@ -1107,10 +1112,9 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size, * an fsync() on a dir will wait for any uncommitted directory * operations to commit. 
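Several of the ceph hunks in this area replace return ERR_PTR(PTR_ERR(x)) with ERR_CAST(x), as in ceph_lookup() above. For readers less familiar with the idiom, here is a small userspace model of the error-pointer encoding; the helpers are reimplemented here purely for illustration (they mirror the kernel ones in shape and assume, as the kernel does, that pointers and long have the same width).

#include <stdio.h>
#include <errno.h>

#define MAX_ERRNO 4095

static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}
/* ERR_CAST: re-type an error pointer without the long round trip */
static inline void *ERR_CAST(const void *ptr) { return (void *)ptr; }

struct request { int id; };
struct dentry { int dummy; };

static struct request *create_request(int fail)
{
	static struct request req = { .id = 1 };

	return fail ? ERR_PTR(-ENOMEM) : &req;
}

static struct dentry *toy_lookup(int fail)
{
	struct request *req = create_request(fail);

	if (IS_ERR(req))
		return ERR_CAST(req);	/* instead of ERR_PTR(PTR_ERR(req)) */
	return NULL;			/* success path elided in this sketch */
}

int main(void)
{
	struct dentry *d = toy_lookup(1);

	if (IS_ERR(d))
		printf("lookup failed: %ld\n", PTR_ERR(d));
	return 0;
}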
*/ -static int ceph_dir_fsync(struct file *file, struct dentry *dentry, - int datasync) +static int ceph_dir_fsync(struct file *file, int datasync) { - struct inode *inode = dentry->d_inode; + struct inode *inode = file->f_path.dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); struct list_head *head = &ci->i_unsafe_dirops; struct ceph_mds_request *req; @@ -1242,7 +1246,9 @@ struct dentry_operations ceph_dentry_ops = { struct dentry_operations ceph_snapdir_dentry_ops = { .d_revalidate = ceph_snapdir_d_revalidate, + .d_release = ceph_dentry_release, }; struct dentry_operations ceph_snap_dentry_ops = { + .d_release = ceph_dentry_release, }; diff --git a/fs/ceph/export.c b/fs/ceph/export.c index 17447644d675..4480cb1c63e7 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -133,7 +133,7 @@ static struct dentry *__cfh_to_dentry(struct super_block *sb, req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPHASH, USE_ANY_MDS); if (IS_ERR(req)) - return ERR_PTR(PTR_ERR(req)); + return ERR_CAST(req); req->r_ino1 = vino; req->r_ino2.ino = cfh->parent_ino; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 6512b6701b9e..7c08698fad3e 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -230,7 +230,7 @@ struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry, /* do the open */ req = prepare_open_request(dir->i_sb, flags, mode); if (IS_ERR(req)) - return ERR_PTR(PTR_ERR(req)); + return ERR_CAST(req); req->r_dentry = dget(dentry); req->r_num_caps = 2; if (flags & O_CREAT) { @@ -265,7 +265,7 @@ int ceph_release(struct inode *inode, struct file *file) kmem_cache_free(ceph_file_cachep, cf); /* wake up anyone waiting for caps on this inode */ - wake_up(&ci->i_cap_wq); + wake_up_all(&ci->i_cap_wq); return 0; } diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index a81b8b662c7b..389f9dbd9949 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -69,7 +69,7 @@ struct inode *ceph_get_snapdir(struct inode *parent) BUG_ON(!S_ISDIR(parent->i_mode)); if (IS_ERR(inode)) - return ERR_PTR(PTR_ERR(inode)); + return inode; inode->i_mode = parent->i_mode; inode->i_uid = parent->i_uid; inode->i_gid = parent->i_gid; @@ -827,7 +827,7 @@ static void ceph_set_dentry_offset(struct dentry *dn) spin_lock(&dcache_lock); spin_lock(&dn->d_lock); - list_move_tail(&dir->d_subdirs, &dn->d_u.d_child); + list_move(&dn->d_u.d_child, &dir->d_subdirs); dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset, dn->d_u.d_child.prev, dn->d_u.d_child.next); spin_unlock(&dn->d_lock); @@ -854,8 +854,8 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in, d_drop(dn); realdn = d_materialise_unique(dn, in); if (IS_ERR(realdn)) { - pr_err("splice_dentry error %p inode %p ino %llx.%llx\n", - dn, in, ceph_vinop(in)); + pr_err("splice_dentry error %ld %p inode %p ino %llx.%llx\n", + PTR_ERR(realdn), dn, in, ceph_vinop(in)); if (prehash) *prehash = false; /* don't rehash on error */ dn = realdn; /* note realdn contains the error */ @@ -1199,8 +1199,10 @@ retry_lookup: goto out; } err = ceph_init_dentry(dn); - if (err < 0) + if (err < 0) { + dput(dn); goto out; + } } else if (dn->d_inode && (ceph_ino(dn->d_inode) != vino.ino || ceph_snap(dn->d_inode) != vino.snap)) { @@ -1234,18 +1236,23 @@ retry_lookup: goto out; } dn = splice_dentry(dn, in, NULL); + if (IS_ERR(dn)) + dn = NULL; } if (fill_inode(in, &rinfo->dir_in[i], NULL, session, req->r_request_started, -1, &req->r_caps_reservation) < 0) { pr_err("fill_inode badness on %p\n", in); - dput(dn); - continue; + goto next_item; } - 
update_dentry_lease(dn, rinfo->dir_dlease[i], - req->r_session, req->r_request_started); - dput(dn); + if (dn) + update_dentry_lease(dn, rinfo->dir_dlease[i], + req->r_session, + req->r_request_started); +next_item: + if (dn) + dput(dn); } req->r_did_prepopulate = true; @@ -1494,7 +1501,7 @@ retry: if (wrbuffer_refs == 0) ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); if (wake) - wake_up(&ci->i_cap_wq); + wake_up_all(&ci->i_cap_wq); } diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 885aa5710cfd..dd440bd438a9 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -868,7 +868,7 @@ static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap, { struct ceph_inode_info *ci = ceph_inode(inode); - wake_up(&ci->i_cap_wq); + wake_up_all(&ci->i_cap_wq); if (arg) { spin_lock(&inode->i_lock); ci->i_wanted_max_size = 0; @@ -1066,9 +1066,9 @@ static int trim_caps(struct ceph_mds_client *mdsc, * * Called under s_mutex. */ -static int add_cap_releases(struct ceph_mds_client *mdsc, - struct ceph_mds_session *session, - int extra) +int ceph_add_cap_releases(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, + int extra) { struct ceph_msg *msg; struct ceph_mds_cap_release *head; @@ -1176,8 +1176,8 @@ static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq) /* * called under s_mutex */ -static void send_cap_releases(struct ceph_mds_client *mdsc, - struct ceph_mds_session *session) +void ceph_send_cap_releases(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) { struct ceph_msg *msg; @@ -1514,6 +1514,9 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, ceph_encode_filepath(&p, end, ino1, path1); ceph_encode_filepath(&p, end, ino2, path2); + /* make note of release offset, in case we need to replay */ + req->r_request_release_offset = p - msg->front.iov_base; + /* cap releases */ releases = 0; if (req->r_inode_drop) @@ -1561,7 +1564,7 @@ static void complete_request(struct ceph_mds_client *mdsc, if (req->r_callback) req->r_callback(mdsc, req); else - complete(&req->r_completion); + complete_all(&req->r_completion); } /* @@ -1580,6 +1583,32 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc, dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req, req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts); + if (req->r_got_unsafe) { + /* + * Replay. Do not regenerate message (and rebuild + * paths, etc.); just use the original message. + * Rebuilding paths will break for renames because + * d_move mangles the src name. 
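The replay comment just above describes reusing the originally encoded MDS request rather than rebuilding it: flip a REPLAY flag in the existing header and truncate the stale cap/dentry releases at an offset recorded when the message was first built. The userspace sketch below models only that buffer manipulation; the structures, flag value, and sizes are invented and do not match the real wire format.

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define FLAG_REPLAY 0x1

struct req_header {
	uint32_t flags;
	uint32_t num_releases;
};

struct request {
	unsigned char buf[128];	/* header + body + releases, already encoded */
	size_t len;		/* current encoded length */
	size_t release_offset;	/* where the releases section starts */
};

static void encode_request(struct request *req)
{
	struct req_header hdr = { .flags = 0, .num_releases = 2 };

	memcpy(req->buf, &hdr, sizeof(hdr));
	memset(req->buf + sizeof(hdr), 0xab, 40);		/* pretend body */
	req->release_offset = sizeof(hdr) + 40;
	memset(req->buf + req->release_offset, 0xcd, 16);	/* pretend releases */
	req->len = req->release_offset + 16;
}

static void prepare_replay(struct request *req)
{
	struct req_header hdr;

	memcpy(&hdr, req->buf, sizeof(hdr));
	hdr.flags |= FLAG_REPLAY;	/* resend of the same tid, do not re-encode */
	hdr.num_releases = 0;		/* releases were consumed on the first send */
	memcpy(req->buf, &hdr, sizeof(hdr));
	req->len = req->release_offset;	/* drop the stale releases section */
}

int main(void)
{
	struct request req;

	encode_request(&req);
	printf("first send: %zu bytes\n", req.len);
	prepare_replay(&req);
	printf("replay:     %zu bytes (flags now include REPLAY)\n", req.len);
	return 0;
}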
+ */ + msg = req->r_request; + rhead = msg->front.iov_base; + + flags = le32_to_cpu(rhead->flags); + flags |= CEPH_MDS_FLAG_REPLAY; + rhead->flags = cpu_to_le32(flags); + + if (req->r_target_inode) + rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode)); + + rhead->num_retry = req->r_attempts - 1; + + /* remove cap/dentry releases from message */ + rhead->num_releases = 0; + msg->hdr.front_len = cpu_to_le32(req->r_request_release_offset); + msg->front.iov_len = req->r_request_release_offset; + return 0; + } + if (req->r_request) { ceph_msg_put(req->r_request); req->r_request = NULL; @@ -1601,13 +1630,9 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc, rhead->flags = cpu_to_le32(flags); rhead->num_fwd = req->r_num_fwd; rhead->num_retry = req->r_attempts - 1; + rhead->ino = 0; dout(" r_locked_dir = %p\n", req->r_locked_dir); - - if (req->r_target_inode && req->r_got_unsafe) - rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode)); - else - rhead->ino = 0; return 0; } @@ -1768,12 +1793,12 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, mutex_unlock(&mdsc->mutex); dout("do_request waiting\n"); if (req->r_timeout) { - err = (long)wait_for_completion_interruptible_timeout( + err = (long)wait_for_completion_killable_timeout( &req->r_completion, req->r_timeout); if (err == 0) err = -EIO; } else { - err = wait_for_completion_interruptible(&req->r_completion); + err = wait_for_completion_killable(&req->r_completion); } dout("do_request waited, got %d\n", err); mutex_lock(&mdsc->mutex); @@ -1907,7 +1932,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) if (head->safe) { req->r_got_safe = true; __unregister_request(mdsc, req); - complete(&req->r_safe_completion); + complete_all(&req->r_safe_completion); if (req->r_got_unsafe) { /* @@ -1922,7 +1947,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) /* last unsafe request during umount? */ if (mdsc->stopping && !__get_oldest_req(mdsc)) - complete(&mdsc->safe_umount_waiters); + complete_all(&mdsc->safe_umount_waiters); mutex_unlock(&mdsc->mutex); goto out; } @@ -1980,7 +2005,7 @@ out_err: } mutex_unlock(&mdsc->mutex); - add_cap_releases(mdsc, req->r_session, -1); + ceph_add_cap_releases(mdsc, req->r_session, -1); mutex_unlock(&session->s_mutex); /* kick calling process */ @@ -2014,16 +2039,21 @@ static void handle_forward(struct ceph_mds_client *mdsc, mutex_lock(&mdsc->mutex); req = __lookup_request(mdsc, tid); if (!req) { - dout("forward %llu to mds%d - req dne\n", tid, next_mds); + dout("forward tid %llu to mds%d - req dne\n", tid, next_mds); goto out; /* dup reply? */ } - if (fwd_seq <= req->r_num_fwd) { - dout("forward %llu to mds%d - old seq %d <= %d\n", + if (req->r_aborted) { + dout("forward tid %llu aborted, unregistering\n", tid); + __unregister_request(mdsc, req); + } else if (fwd_seq <= req->r_num_fwd) { + dout("forward tid %llu to mds%d - old seq %d <= %d\n", tid, next_mds, req->r_num_fwd, fwd_seq); } else { /* resend. 
forward race not possible; mds would drop */ - dout("forward %llu to mds%d (we resend)\n", tid, next_mds); + dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds); + BUG_ON(req->r_err); + BUG_ON(req->r_got_result); req->r_num_fwd = fwd_seq; req->r_resend_mds = next_mds; put_request_session(req); @@ -2096,7 +2126,7 @@ static void handle_session(struct ceph_mds_session *session, pr_info("mds%d reconnect denied\n", session->s_mds); remove_session_caps(session); wake = 1; /* for good measure */ - complete(&mdsc->session_close_waiters); + complete_all(&mdsc->session_close_waiters); kick_requests(mdsc, mds); break; @@ -2428,6 +2458,7 @@ static void handle_lease(struct ceph_mds_client *mdsc, struct ceph_dentry_info *di; int mds = session->s_mds; struct ceph_mds_lease *h = msg->front.iov_base; + u32 seq; struct ceph_vino vino; int mask; struct qstr dname; @@ -2441,6 +2472,7 @@ static void handle_lease(struct ceph_mds_client *mdsc, vino.ino = le64_to_cpu(h->ino); vino.snap = CEPH_NOSNAP; mask = le16_to_cpu(h->mask); + seq = le32_to_cpu(h->seq); dname.name = (void *)h + sizeof(*h) + sizeof(u32); dname.len = msg->front.iov_len - sizeof(*h) - sizeof(u32); if (dname.len != get_unaligned_le32(h+1)) @@ -2451,8 +2483,9 @@ static void handle_lease(struct ceph_mds_client *mdsc, /* lookup inode */ inode = ceph_find_inode(sb, vino); - dout("handle_lease '%s', mask %d, ino %llx %p\n", - ceph_lease_op_name(h->action), mask, vino.ino, inode); + dout("handle_lease %s, mask %d, ino %llx %p %.*s\n", + ceph_lease_op_name(h->action), mask, vino.ino, inode, + dname.len, dname.name); if (inode == NULL) { dout("handle_lease no inode %llx\n", vino.ino); goto release; @@ -2477,7 +2510,8 @@ static void handle_lease(struct ceph_mds_client *mdsc, switch (h->action) { case CEPH_MDS_LEASE_REVOKE: if (di && di->lease_session == session) { - h->seq = cpu_to_le32(di->lease_seq); + if (ceph_seq_cmp(di->lease_seq, seq) > 0) + h->seq = cpu_to_le32(di->lease_seq); __ceph_mdsc_drop_dentry_lease(dentry); } release = 1; @@ -2491,7 +2525,7 @@ static void handle_lease(struct ceph_mds_client *mdsc, unsigned long duration = le32_to_cpu(h->duration_ms) * HZ / 1000; - di->lease_seq = le32_to_cpu(h->seq); + di->lease_seq = seq; dentry->d_time = di->lease_renew_from + duration; di->lease_renew_after = di->lease_renew_from + (duration >> 1); @@ -2541,7 +2575,7 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session, return; lease = msg->front.iov_base; lease->action = action; - lease->mask = cpu_to_le16(CEPH_LOCK_DN); + lease->mask = cpu_to_le16(1); lease->ino = cpu_to_le64(ceph_vino(inode).ino); lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap); lease->seq = cpu_to_le32(seq); @@ -2571,7 +2605,7 @@ void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode, BUG_ON(inode == NULL); BUG_ON(dentry == NULL); - BUG_ON(mask != CEPH_LOCK_DN); + BUG_ON(mask == 0); /* is dentry lease valid? 
*/ spin_lock(&dentry->d_lock); @@ -2681,10 +2715,10 @@ static void delayed_work(struct work_struct *work) send_renew_caps(mdsc, s); else ceph_con_keepalive(&s->s_con); - add_cap_releases(mdsc, s, -1); + ceph_add_cap_releases(mdsc, s, -1); if (s->s_state == CEPH_MDS_SESSION_OPEN || s->s_state == CEPH_MDS_SESSION_HUNG) - send_cap_releases(mdsc, s); + ceph_send_cap_releases(mdsc, s); mutex_unlock(&s->s_mutex); ceph_put_mds_session(s); @@ -2774,6 +2808,12 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc) drop_leases(mdsc); ceph_flush_dirty_caps(mdsc); wait_requests(mdsc); + + /* + * wait for reply handlers to drop their request refs and + * their inode/dcache refs + */ + ceph_msgr_flush(); } /* diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index d9936c4f1212..952410c60d09 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -188,6 +188,7 @@ struct ceph_mds_request { int r_old_inode_drop, r_old_inode_unless; struct ceph_msg *r_request; /* original request */ + int r_request_release_offset; struct ceph_msg *r_reply; struct ceph_mds_reply_info_parsed r_reply_info; int r_err; @@ -322,6 +323,12 @@ static inline void ceph_mdsc_put_request(struct ceph_mds_request *req) kref_put(&req->r_kref, ceph_mdsc_release_request); } +extern int ceph_add_cap_releases(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, + int extra); +extern void ceph_send_cap_releases(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session); + extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc); extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base, diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c index 60b74839ebec..15167b2daa55 100644 --- a/fs/ceph/messenger.c +++ b/fs/ceph/messenger.c @@ -43,7 +43,8 @@ static void ceph_fault(struct ceph_connection *con); * nicely render a sockaddr as a string. 
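The messenger.c change that follows rewrites pr_addr() to format addresses with %pI4/%pI6c into a ring of fixed-size static buffers. A userspace sketch of the same rotating-buffer idea is shown below, using inet_ntop() in place of the kernel format specifiers and omitting the spinlock the kernel helper takes around the ring index; buffer counts and sizes are illustrative.

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

#define MAX_ADDR_STR 4
#define MAX_ADDR_STR_LEN 64

static char addr_str[MAX_ADDR_STR][MAX_ADDR_STR_LEN];
static int last_addr_str;

/* not thread-safe here, unlike the locked kernel version */
static const char *pr_addr(const struct sockaddr_storage *ss)
{
	char *s = addr_str[last_addr_str];

	last_addr_str = (last_addr_str + 1) % MAX_ADDR_STR;

	if (ss->ss_family == AF_INET) {
		const struct sockaddr_in *in4 = (const void *)ss;
		char ip[INET_ADDRSTRLEN];

		inet_ntop(AF_INET, &in4->sin_addr, ip, sizeof(ip));
		snprintf(s, MAX_ADDR_STR_LEN, "%s:%u", ip, ntohs(in4->sin_port));
	} else if (ss->ss_family == AF_INET6) {
		const struct sockaddr_in6 *in6 = (const void *)ss;
		char ip[INET6_ADDRSTRLEN];

		inet_ntop(AF_INET6, &in6->sin6_addr, ip, sizeof(ip));
		snprintf(s, MAX_ADDR_STR_LEN, "[%s]:%u", ip, ntohs(in6->sin6_port));
	} else {
		snprintf(s, MAX_ADDR_STR_LEN, "(unknown family %d)", ss->ss_family);
	}
	return s;
}

int main(void)
{
	struct sockaddr_storage ss;
	struct sockaddr_in *a = (struct sockaddr_in *)&ss;

	memset(&ss, 0, sizeof(ss));
	a->sin_family = AF_INET;
	a->sin_port = htons(6789);
	inet_pton(AF_INET, "10.0.0.1", &a->sin_addr);
	printf("peer %s\n", pr_addr(&ss));
	return 0;
}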
*/ #define MAX_ADDR_STR 20 -static char addr_str[MAX_ADDR_STR][40]; +#define MAX_ADDR_STR_LEN 60 +static char addr_str[MAX_ADDR_STR][MAX_ADDR_STR_LEN]; static DEFINE_SPINLOCK(addr_str_lock); static int last_addr_str; @@ -52,7 +53,6 @@ const char *pr_addr(const struct sockaddr_storage *ss) int i; char *s; struct sockaddr_in *in4 = (void *)ss; - unsigned char *quad = (void *)&in4->sin_addr.s_addr; struct sockaddr_in6 *in6 = (void *)ss; spin_lock(&addr_str_lock); @@ -64,25 +64,13 @@ const char *pr_addr(const struct sockaddr_storage *ss) switch (ss->ss_family) { case AF_INET: - sprintf(s, "%u.%u.%u.%u:%u", - (unsigned int)quad[0], - (unsigned int)quad[1], - (unsigned int)quad[2], - (unsigned int)quad[3], - (unsigned int)ntohs(in4->sin_port)); + snprintf(s, MAX_ADDR_STR_LEN, "%pI4:%u", &in4->sin_addr, + (unsigned int)ntohs(in4->sin_port)); break; case AF_INET6: - sprintf(s, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x:%u", - in6->sin6_addr.s6_addr16[0], - in6->sin6_addr.s6_addr16[1], - in6->sin6_addr.s6_addr16[2], - in6->sin6_addr.s6_addr16[3], - in6->sin6_addr.s6_addr16[4], - in6->sin6_addr.s6_addr16[5], - in6->sin6_addr.s6_addr16[6], - in6->sin6_addr.s6_addr16[7], - (unsigned int)ntohs(in6->sin6_port)); + snprintf(s, MAX_ADDR_STR_LEN, "[%pI6c]:%u", &in6->sin6_addr, + (unsigned int)ntohs(in6->sin6_port)); break; default: @@ -120,6 +108,12 @@ void ceph_msgr_exit(void) destroy_workqueue(ceph_msgr_wq); } +void ceph_msgr_flush() +{ + flush_workqueue(ceph_msgr_wq); +} + + /* * socket callback functions */ @@ -209,12 +203,13 @@ static void set_sock_callbacks(struct socket *sock, */ static struct socket *ceph_tcp_connect(struct ceph_connection *con) { - struct sockaddr *paddr = (struct sockaddr *)&con->peer_addr.in_addr; + struct sockaddr_storage *paddr = &con->peer_addr.in_addr; struct socket *sock; int ret; BUG_ON(con->sock); - ret = sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); + ret = sock_create_kern(con->peer_addr.in_addr.ss_family, SOCK_STREAM, + IPPROTO_TCP, &sock); if (ret) return ERR_PTR(ret); con->sock = sock; @@ -228,7 +223,8 @@ static struct socket *ceph_tcp_connect(struct ceph_connection *con) dout("connect %s\n", pr_addr(&con->peer_addr.in_addr)); - ret = sock->ops->connect(sock, paddr, sizeof(*paddr), O_NONBLOCK); + ret = sock->ops->connect(sock, (struct sockaddr *)paddr, sizeof(*paddr), + O_NONBLOCK); if (ret == -EINPROGRESS) { dout("connect %s EINPROGRESS sk_state = %u\n", pr_addr(&con->peer_addr.in_addr), @@ -651,7 +647,7 @@ static void prepare_write_connect(struct ceph_messenger *msgr, dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con, con->connect_seq, global_seq, proto); - con->out_connect.features = CEPH_FEATURE_SUPPORTED_CLIENT; + con->out_connect.features = cpu_to_le64(CEPH_FEATURE_SUPPORTED_CLIENT); con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT); con->out_connect.connect_seq = cpu_to_le32(con->connect_seq); con->out_connect.global_seq = cpu_to_le32(global_seq); @@ -1003,19 +999,32 @@ int ceph_parse_ips(const char *c, const char *end, struct sockaddr_in *in4 = (void *)ss; struct sockaddr_in6 *in6 = (void *)ss; int port; + char delim = ','; + + if (*p == '[') { + delim = ']'; + p++; + } memset(ss, 0, sizeof(*ss)); if (in4_pton(p, end - p, (u8 *)&in4->sin_addr.s_addr, - ',', &ipend)) { + delim, &ipend)) ss->ss_family = AF_INET; - } else if (in6_pton(p, end - p, (u8 *)&in6->sin6_addr.s6_addr, - ',', &ipend)) { + else if (in6_pton(p, end - p, (u8 *)&in6->sin6_addr.s6_addr, + delim, &ipend)) ss->ss_family = AF_INET6; - } else { + else 
goto bad; - } p = ipend; + if (delim == ']') { + if (*p != ']') { + dout("missing matching ']'\n"); + goto bad; + } + p++; + } + /* port? */ if (p < end && *p == ':') { port = 0; @@ -1049,7 +1058,7 @@ int ceph_parse_ips(const char *c, const char *end, return 0; bad: - pr_err("parse_ips bad ip '%s'\n", c); + pr_err("parse_ips bad ip '%.*s'\n", (int)(end - c), c); return -EINVAL; } @@ -1390,10 +1399,12 @@ static int read_partial_message(struct ceph_connection *con) if (!con->in_msg) { dout("got hdr type %d front %d data %d\n", con->in_hdr.type, con->in_hdr.front_len, con->in_hdr.data_len); + skip = 0; con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip); if (skip) { /* skip this message */ dout("alloc_msg said skip message\n"); + BUG_ON(con->in_msg); con->in_base_pos = -front_len - middle_len - data_len - sizeof(m->footer); con->in_tag = CEPH_MSGR_TAG_READY; @@ -2007,20 +2018,20 @@ void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg) { mutex_lock(&con->mutex); if (!list_empty(&msg->list_head)) { - dout("con_revoke %p msg %p\n", con, msg); + dout("con_revoke %p msg %p - was on queue\n", con, msg); list_del_init(&msg->list_head); ceph_msg_put(msg); msg->hdr.seq = 0; - if (con->out_msg == msg) { - ceph_msg_put(con->out_msg); - con->out_msg = NULL; - } + } + if (con->out_msg == msg) { + dout("con_revoke %p msg %p - was sending\n", con, msg); + con->out_msg = NULL; if (con->out_kvec_is_msg) { con->out_skip = con->out_kvec_bytes; con->out_kvec_is_msg = false; } - } else { - dout("con_revoke %p msg %p - not queued (sent?)\n", con, msg); + ceph_msg_put(msg); + msg->hdr.seq = 0; } mutex_unlock(&con->mutex); } diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h index 00a9430b1ffc..76fbc957bc13 100644 --- a/fs/ceph/messenger.h +++ b/fs/ceph/messenger.h @@ -213,6 +213,7 @@ extern int ceph_parse_ips(const char *c, const char *end, extern int ceph_msgr_init(void); extern void ceph_msgr_exit(void); +extern void ceph_msgr_flush(void); extern struct ceph_messenger *ceph_messenger_create( struct ceph_entity_addr *myaddr); diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c index f6510a476e7e..54fe01c50706 100644 --- a/fs/ceph/mon_client.c +++ b/fs/ceph/mon_client.c @@ -345,7 +345,7 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc, out: mutex_unlock(&monc->mutex); - wake_up(&client->auth_wq); + wake_up_all(&client->auth_wq); } /* @@ -400,6 +400,8 @@ static void release_generic_request(struct kref *kref) ceph_msg_put(req->reply); if (req->request) ceph_msg_put(req->request); + + kfree(req); } static void put_generic_request(struct ceph_mon_generic_request *req) @@ -460,7 +462,7 @@ static void handle_statfs_reply(struct ceph_mon_client *monc, } mutex_unlock(&monc->mutex); if (req) { - complete(&req->completion); + complete_all(&req->completion); put_generic_request(req); } return; @@ -704,8 +706,11 @@ static void handle_auth_reply(struct ceph_mon_client *monc, struct ceph_msg *msg) { int ret; + int was_auth = 0; mutex_lock(&monc->mutex); + if (monc->auth->ops) + was_auth = monc->auth->ops->is_authenticated(monc->auth); monc->pending_auth = 0; ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base, msg->front.iov_len, @@ -713,14 +718,15 @@ static void handle_auth_reply(struct ceph_mon_client *monc, monc->m_auth->front_max); if (ret < 0) { monc->client->auth_err = ret; - wake_up(&monc->client->auth_wq); + wake_up_all(&monc->client->auth_wq); } else if (ret > 0) { __send_prepared_auth_request(monc, ret); - } else if (monc->auth->ops->is_authenticated(monc->auth)) { 
+ } else if (!was_auth && monc->auth->ops->is_authenticated(monc->auth)) { dout("authenticated, starting session\n"); monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT; - monc->client->msgr->inst.name.num = monc->auth->global_id; + monc->client->msgr->inst.name.num = + cpu_to_le64(monc->auth->global_id); __send_subscribe(monc); __resend_generic_request(monc); diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c index afa7bb3895c4..e38522347898 100644 --- a/fs/ceph/osd_client.c +++ b/fs/ceph/osd_client.c @@ -361,8 +361,13 @@ static void put_osd(struct ceph_osd *osd) { dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref), atomic_read(&osd->o_ref) - 1); - if (atomic_dec_and_test(&osd->o_ref)) + if (atomic_dec_and_test(&osd->o_ref)) { + struct ceph_auth_client *ac = osd->o_osdc->client->monc.auth; + + if (osd->o_authorizer) + ac->ops->destroy_authorizer(ac, osd->o_authorizer); kfree(osd); + } } /* @@ -857,12 +862,12 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, if (req->r_callback) req->r_callback(req, msg); else - complete(&req->r_completion); + complete_all(&req->r_completion); if (flags & CEPH_OSD_FLAG_ONDISK) { if (req->r_safe_callback) req->r_safe_callback(req, msg); - complete(&req->r_safe_completion); /* fsync waiter */ + complete_all(&req->r_safe_completion); /* fsync waiter */ } done: @@ -1078,7 +1083,7 @@ done: if (newmap) kick_requests(osdc, NULL); up_read(&osdc->map_sem); - wake_up(&osdc->client->auth_wq); + wake_up_all(&osdc->client->auth_wq); return; bad: @@ -1339,7 +1344,7 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) int type = le16_to_cpu(msg->hdr.type); if (!osd) - return; + goto out; osdc = osd->o_osdc; switch (type) { @@ -1354,6 +1359,7 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) pr_err("received unknown message type %d %s\n", type, ceph_msg_type_name(type)); } +out: ceph_msg_put(msg); } diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c index cfdd8f4388b7..416d46adbf87 100644 --- a/fs/ceph/osdmap.c +++ b/fs/ceph/osdmap.c @@ -568,6 +568,7 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) if (ev > CEPH_PG_POOL_VERSION) { pr_warning("got unknown v %d > %d of ceph_pg_pool\n", ev, CEPH_PG_POOL_VERSION); + kfree(pi); goto bad; } __decode_pool(p, pi); @@ -706,7 +707,8 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, len, *p, end); newcrush = crush_decode(*p, min(*p+len, end)); if (IS_ERR(newcrush)) - return ERR_PTR(PTR_ERR(newcrush)); + return ERR_CAST(newcrush); + *p += len; } /* new flags? */ @@ -829,12 +831,13 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, /* remove any? 
*/ while (rbp && pgid_cmp(rb_entry(rbp, struct ceph_pg_mapping, node)->pgid, pgid) <= 0) { - struct rb_node *cur = rbp; + struct ceph_pg_mapping *cur = + rb_entry(rbp, struct ceph_pg_mapping, node); + rbp = rb_next(rbp); - dout(" removed pg_temp %llx\n", - *(u64 *)&rb_entry(cur, struct ceph_pg_mapping, - node)->pgid); - rb_erase(cur, &map->pg_temp); + dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid); + rb_erase(&cur->node, &map->pg_temp); + kfree(cur); } if (pglen) { @@ -850,19 +853,22 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, for (j = 0; j < pglen; j++) pg->osds[j] = ceph_decode_32(p); err = __insert_pg_mapping(pg, &map->pg_temp); - if (err) + if (err) { + kfree(pg); goto bad; + } dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid, pglen); } } while (rbp) { - struct rb_node *cur = rbp; + struct ceph_pg_mapping *cur = + rb_entry(rbp, struct ceph_pg_mapping, node); + rbp = rb_next(rbp); - dout(" removed pg_temp %llx\n", - *(u64 *)&rb_entry(cur, struct ceph_pg_mapping, - node)->pgid); - rb_erase(cur, &map->pg_temp); + dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid); + rb_erase(&cur->node, &map->pg_temp); + kfree(cur); } /* ignore the rest */ diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 7c663d9b9f81..fa87f51e38e1 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -89,7 +89,7 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_files = le64_to_cpu(st.num_objects); buf->f_ffree = -1; - buf->f_namelen = PATH_MAX; + buf->f_namelen = NAME_MAX; buf->f_frsize = PAGE_CACHE_SIZE; /* leave fsid little-endian, regardless of host endianness */ @@ -669,9 +669,17 @@ static void ceph_destroy_client(struct ceph_client *client) /* unmount */ ceph_mdsc_stop(&client->mdsc); - ceph_monc_stop(&client->monc); ceph_osdc_stop(&client->osdc); + /* + * make sure mds and osd connections close out before destroying + * the auth module, which is needed to free those connections' + * ceph_authorizers. + */ + ceph_msgr_flush(); + + ceph_monc_stop(&client->monc); + ceph_adjust_min_caps(-client->min_caps); ceph_debugfs_client_cleanup(client); @@ -738,7 +746,7 @@ static struct dentry *open_root_dentry(struct ceph_client *client, dout("open_root_inode opening '%s'\n", path); req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS); if (IS_ERR(req)) - return ERR_PTR(PTR_ERR(req)); + return ERR_CAST(req); req->r_path1 = kstrdup(path, GFP_NOFS); req->r_ino1.ino = CEPH_INO_ROOT; req->r_ino1.snap = CEPH_NOSNAP; @@ -918,7 +926,7 @@ static int ceph_compare_super(struct super_block *sb, void *data) /* * construct our own bdi so we can control readahead, etc. 
*/ -static atomic_long_t bdi_seq = ATOMIC_INIT(0); +static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client) { diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 3725c9ee9d08..10a4a406e887 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -10,7 +10,6 @@ #include <linux/fs.h> #include <linux/mempool.h> #include <linux/pagemap.h> -#include <linux/slab.h> #include <linux/wait.h> #include <linux/writeback.h> #include <linux/slab.h> @@ -811,7 +810,7 @@ extern void ceph_put_cap(struct ceph_cap *cap); extern void ceph_queue_caps_release(struct inode *inode); extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc); -extern int ceph_fsync(struct file *file, struct dentry *dentry, int datasync); +extern int ceph_fsync(struct file *file, int datasync); extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session); extern int ceph_get_cap_mds(struct inode *inode); diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig index 80f352596807..5739fd7f88b4 100644 --- a/fs/cifs/Kconfig +++ b/fs/cifs/Kconfig @@ -131,6 +131,15 @@ config CIFS_DFS_UPCALL IP addresses) which is needed for implicit mounts of DFS junction points. If unsure, say N. +config CIFS_FSCACHE + bool "Provide CIFS client caching support (EXPERIMENTAL)" + depends on EXPERIMENTAL + depends on CIFS=m && FSCACHE || CIFS=y && FSCACHE=y + help + Makes CIFS FS-Cache capable. Say Y here if you want your CIFS data + to be cached locally on disk through the general filesystem cache + manager. If unsure, say N. + config CIFS_EXPERIMENTAL bool "CIFS Experimental Features (EXPERIMENTAL)" depends on CIFS && EXPERIMENTAL diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile index 9948c0030e86..adefa60a9bdc 100644 --- a/fs/cifs/Makefile +++ b/fs/cifs/Makefile @@ -11,3 +11,5 @@ cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \ cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o cifs-$(CONFIG_CIFS_DFS_UPCALL) += dns_resolve.o cifs_dfs_ref.o + +cifs-$(CONFIG_CIFS_FSCACHE) += fscache.o cache.o diff --git a/fs/cifs/cache.c b/fs/cifs/cache.c new file mode 100644 index 000000000000..224d7bbd1fcc --- /dev/null +++ b/fs/cifs/cache.c @@ -0,0 +1,331 @@ +/* + * fs/cifs/cache.c - CIFS filesystem cache index structure definitions + * + * Copyright (c) 2010 Novell, Inc. + * Authors(s): Suresh Jayaraman (sjayaraman@suse.de> + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include "fscache.h" +#include "cifs_debug.h" + +/* + * CIFS filesystem definition for FS-Cache + */ +struct fscache_netfs cifs_fscache_netfs = { + .name = "cifs", + .version = 0, +}; + +/* + * Register CIFS for caching with FS-Cache + */ +int cifs_fscache_register(void) +{ + return fscache_register_netfs(&cifs_fscache_netfs); +} + +/* + * Unregister CIFS for caching + */ +void cifs_fscache_unregister(void) +{ + fscache_unregister_netfs(&cifs_fscache_netfs); +} + +/* + * Key layout of CIFS server cache index object + */ +struct cifs_server_key { + uint16_t family; /* address family */ + uint16_t port; /* IP port */ + union { + struct in_addr ipv4_addr; + struct in6_addr ipv6_addr; + } addr[0]; +}; + +/* + * Server object keyed by {IPaddress,port,family} tuple + */ +static uint16_t cifs_server_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t maxbuf) +{ + const struct TCP_Server_Info *server = cookie_netfs_data; + const struct sockaddr *sa = (struct sockaddr *) &server->addr.sockAddr; + struct cifs_server_key *key = buffer; + uint16_t key_len = sizeof(struct cifs_server_key); + + memset(key, 0, key_len); + + /* + * Should not be a problem as sin_family/sin6_family overlays + * sa_family field + */ + switch (sa->sa_family) { + case AF_INET: + key->family = server->addr.sockAddr.sin_family; + key->port = server->addr.sockAddr.sin_port; + key->addr[0].ipv4_addr = server->addr.sockAddr.sin_addr; + key_len += sizeof(key->addr[0].ipv4_addr); + break; + + case AF_INET6: + key->family = server->addr.sockAddr6.sin6_family; + key->port = server->addr.sockAddr6.sin6_port; + key->addr[0].ipv6_addr = server->addr.sockAddr6.sin6_addr; + key_len += sizeof(key->addr[0].ipv6_addr); + break; + + default: + cERROR(1, "CIFS: Unknown network family '%d'", sa->sa_family); + key_len = 0; + break; + } + + return key_len; +} + +/* + * Server object for FS-Cache + */ +const struct fscache_cookie_def cifs_fscache_server_index_def = { + .name = "CIFS.server", + .type = FSCACHE_COOKIE_TYPE_INDEX, + .get_key = cifs_server_get_key, +}; + +/* + * Auxiliary data attached to CIFS superblock within the cache + */ +struct cifs_fscache_super_auxdata { + u64 resource_id; /* unique server resource id */ +}; + +static char *extract_sharename(const char *treename) +{ + const char *src; + char *delim, *dst; + int len; + + /* skip double chars at the beginning */ + src = treename + 2; + + /* share name is always preceded by '\\' now */ + delim = strchr(src, '\\'); + if (!delim) + return ERR_PTR(-EINVAL); + delim++; + len = strlen(delim); + + /* caller has to free the memory */ + dst = kstrndup(delim, len, GFP_KERNEL); + if (!dst) + return ERR_PTR(-ENOMEM); + + return dst; +} + +/* + * Superblock object currently keyed by share name + */ +static uint16_t cifs_super_get_key(const void *cookie_netfs_data, void *buffer, + uint16_t maxbuf) +{ + const struct cifsTconInfo *tcon = cookie_netfs_data; + char *sharename; + uint16_t len; + + sharename = extract_sharename(tcon->treeName); + if (IS_ERR(sharename)) { + cFYI(1, "CIFS: couldn't extract sharename\n"); + sharename = NULL; + return 0; + } + + len = strlen(sharename); + if (len > maxbuf) + return 0; + + memcpy(buffer, sharename, len); + + kfree(sharename); + + return len; +} + +static uint16_t +cifs_fscache_super_get_aux(const void 
*cookie_netfs_data, void *buffer, + uint16_t maxbuf) +{ + struct cifs_fscache_super_auxdata auxdata; + const struct cifsTconInfo *tcon = cookie_netfs_data; + + memset(&auxdata, 0, sizeof(auxdata)); + auxdata.resource_id = tcon->resource_id; + + if (maxbuf > sizeof(auxdata)) + maxbuf = sizeof(auxdata); + + memcpy(buffer, &auxdata, maxbuf); + + return maxbuf; +} + +static enum +fscache_checkaux cifs_fscache_super_check_aux(void *cookie_netfs_data, + const void *data, + uint16_t datalen) +{ + struct cifs_fscache_super_auxdata auxdata; + const struct cifsTconInfo *tcon = cookie_netfs_data; + + if (datalen != sizeof(auxdata)) + return FSCACHE_CHECKAUX_OBSOLETE; + + memset(&auxdata, 0, sizeof(auxdata)); + auxdata.resource_id = tcon->resource_id; + + if (memcmp(data, &auxdata, datalen) != 0) + return FSCACHE_CHECKAUX_OBSOLETE; + + return FSCACHE_CHECKAUX_OKAY; +} + +/* + * Superblock object for FS-Cache + */ +const struct fscache_cookie_def cifs_fscache_super_index_def = { + .name = "CIFS.super", + .type = FSCACHE_COOKIE_TYPE_INDEX, + .get_key = cifs_super_get_key, + .get_aux = cifs_fscache_super_get_aux, + .check_aux = cifs_fscache_super_check_aux, +}; + +/* + * Auxiliary data attached to CIFS inode within the cache + */ +struct cifs_fscache_inode_auxdata { + struct timespec last_write_time; + struct timespec last_change_time; + u64 eof; +}; + +static uint16_t cifs_fscache_inode_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t maxbuf) +{ + const struct cifsInodeInfo *cifsi = cookie_netfs_data; + uint16_t keylen; + + /* use the UniqueId as the key */ + keylen = sizeof(cifsi->uniqueid); + if (keylen > maxbuf) + keylen = 0; + else + memcpy(buffer, &cifsi->uniqueid, keylen); + + return keylen; +} + +static void +cifs_fscache_inode_get_attr(const void *cookie_netfs_data, uint64_t *size) +{ + const struct cifsInodeInfo *cifsi = cookie_netfs_data; + + *size = cifsi->vfs_inode.i_size; +} + +static uint16_t +cifs_fscache_inode_get_aux(const void *cookie_netfs_data, void *buffer, + uint16_t maxbuf) +{ + struct cifs_fscache_inode_auxdata auxdata; + const struct cifsInodeInfo *cifsi = cookie_netfs_data; + + memset(&auxdata, 0, sizeof(auxdata)); + auxdata.eof = cifsi->server_eof; + auxdata.last_write_time = cifsi->vfs_inode.i_mtime; + auxdata.last_change_time = cifsi->vfs_inode.i_ctime; + + if (maxbuf > sizeof(auxdata)) + maxbuf = sizeof(auxdata); + + memcpy(buffer, &auxdata, maxbuf); + + return maxbuf; +} + +static enum +fscache_checkaux cifs_fscache_inode_check_aux(void *cookie_netfs_data, + const void *data, + uint16_t datalen) +{ + struct cifs_fscache_inode_auxdata auxdata; + struct cifsInodeInfo *cifsi = cookie_netfs_data; + + if (datalen != sizeof(auxdata)) + return FSCACHE_CHECKAUX_OBSOLETE; + + memset(&auxdata, 0, sizeof(auxdata)); + auxdata.eof = cifsi->server_eof; + auxdata.last_write_time = cifsi->vfs_inode.i_mtime; + auxdata.last_change_time = cifsi->vfs_inode.i_ctime; + + if (memcmp(data, &auxdata, datalen) != 0) + return FSCACHE_CHECKAUX_OBSOLETE; + + return FSCACHE_CHECKAUX_OKAY; +} + +static void cifs_fscache_inode_now_uncached(void *cookie_netfs_data) +{ + struct cifsInodeInfo *cifsi = cookie_netfs_data; + struct pagevec pvec; + pgoff_t first; + int loop, nr_pages; + + pagevec_init(&pvec, 0); + first = 0; + + cFYI(1, "cifs inode 0x%p now uncached", cifsi); + + for (;;) { + nr_pages = pagevec_lookup(&pvec, + cifsi->vfs_inode.i_mapping, first, + PAGEVEC_SIZE - pagevec_count(&pvec)); + if (!nr_pages) + break; + + for (loop = 0; loop < nr_pages; loop++) + 
ClearPageFsCache(pvec.pages[loop]); + + first = pvec.pages[nr_pages - 1]->index + 1; + + pvec.nr = nr_pages; + pagevec_release(&pvec); + cond_resched(); + } +} + +const struct fscache_cookie_def cifs_fscache_inode_object_def = { + .name = "CIFS.uniqueid", + .type = FSCACHE_COOKIE_TYPE_DATAFILE, + .get_key = cifs_fscache_inode_get_key, + .get_attr = cifs_fscache_inode_get_attr, + .get_aux = cifs_fscache_inode_get_aux, + .check_aux = cifs_fscache_inode_check_aux, + .now_uncached = cifs_fscache_inode_now_uncached, +}; diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index ac19a6f3dae0..dc1ed50ea06e 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -230,28 +230,22 @@ compose_mount_options_err: goto compose_mount_options_out; } - -static struct vfsmount *cifs_dfs_do_refmount(const struct vfsmount *mnt_parent, - struct dentry *dentry, const struct dfs_info3_param *ref) +/** + * cifs_dfs_do_refmount - mounts specified path using provided refferal + * @cifs_sb: parent/root superblock + * @fullpath: full path in UNC format + * @ref: server's referral + */ +static struct vfsmount *cifs_dfs_do_refmount(struct cifs_sb_info *cifs_sb, + const char *fullpath, const struct dfs_info3_param *ref) { - struct cifs_sb_info *cifs_sb; struct vfsmount *mnt; char *mountdata; char *devname = NULL; - char *fullpath; - - cifs_sb = CIFS_SB(dentry->d_inode->i_sb); - /* - * this function gives us a path with a double backslash prefix. We - * require a single backslash for DFS. - */ - fullpath = build_path_from_dentry(dentry); - if (!fullpath) - return ERR_PTR(-ENOMEM); + /* strip first '\' from fullpath */ mountdata = cifs_compose_mount_options(cifs_sb->mountdata, fullpath + 1, ref, &devname); - kfree(fullpath); if (IS_ERR(mountdata)) return (struct vfsmount *)mountdata; @@ -357,8 +351,8 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd) rc = -EINVAL; goto out_err; } - mnt = cifs_dfs_do_refmount(nd->path.mnt, - nd->path.dentry, referrals + i); + mnt = cifs_dfs_do_refmount(cifs_sb, + full_path, referrals + i); cFYI(1, "%s: cifs_dfs_do_refmount:%s , mnt:%p", __func__, referrals[i].node_name, mnt); diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h index 246a167cb913..9e771450c3b8 100644 --- a/fs/cifs/cifs_fs_sb.h +++ b/fs/cifs/cifs_fs_sb.h @@ -35,6 +35,7 @@ #define CIFS_MOUNT_DYNPERM 0x1000 /* allow in-memory only mode setting */ #define CIFS_MOUNT_NOPOSIXBRL 0x2000 /* mandatory not posix byte range lock */ #define CIFS_MOUNT_NOSSYNC 0x4000 /* don't do slow SMBflush on every sync*/ +#define CIFS_MOUNT_FSCACHE 0x8000 /* local caching enabled */ struct cifs_sb_info { struct cifsTconInfo *tcon; /* primary mount */ diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c index 379bd7d9c05f..6effccff85a5 100644 --- a/fs/cifs/cifs_spnego.c +++ b/fs/cifs/cifs_spnego.c @@ -144,6 +144,9 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo) sprintf(dp, ";uid=0x%x", sesInfo->linux_uid); dp = description + strlen(description); + sprintf(dp, ";creduid=0x%x", sesInfo->cred_uid); + + dp = description + strlen(description); sprintf(dp, ";user=%s", sesInfo->userName); dp = description + strlen(description); diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 78c02eb4cb1f..8a2cf129e535 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -47,6 +47,7 @@ #include <linux/key-type.h> #include "dns_resolve.h" #include "cifs_spnego.h" +#include "fscache.h" #define CIFS_MAGIC_NUMBER 0xFF534D42 /* the first four bytes of SMB PDUs */ int cifsFYI = 0; @@ -329,6 +330,12 @@ 
cifs_destroy_inode(struct inode *inode) } static void +cifs_clear_inode(struct inode *inode) +{ + cifs_fscache_release_inode_cookie(inode); +} + +static void cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server) { seq_printf(s, ",addr="); @@ -473,14 +480,25 @@ static int cifs_remount(struct super_block *sb, int *flags, char *data) return 0; } +void cifs_drop_inode(struct inode *inode) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) + return generic_drop_inode(inode); + + return generic_delete_inode(inode); +} + static const struct super_operations cifs_super_ops = { .put_super = cifs_put_super, .statfs = cifs_statfs, .alloc_inode = cifs_alloc_inode, .destroy_inode = cifs_destroy_inode, -/* .drop_inode = generic_delete_inode, - .delete_inode = cifs_delete_inode, */ /* Do not need above two - functions unless later we add lazy close of inodes or unless the + .drop_inode = cifs_drop_inode, + .clear_inode = cifs_clear_inode, +/* .delete_inode = cifs_delete_inode, */ /* Do not need above + function unless later we add lazy close of inodes or unless the kernel forgets to call us with the same number of releases (closes) as opens */ .show_options = cifs_show_options, @@ -892,6 +910,10 @@ init_cifs(void) cFYI(1, "cifs_max_pending set to max of 256"); } + rc = cifs_fscache_register(); + if (rc) + goto out; + rc = cifs_init_inodecache(); if (rc) goto out_clean_proc; @@ -913,7 +935,7 @@ init_cifs(void) goto out_unregister_filesystem; #endif #ifdef CONFIG_CIFS_DFS_UPCALL - rc = register_key_type(&key_type_dns_resolver); + rc = cifs_init_dns_resolver(); if (rc) goto out_unregister_key_type; #endif @@ -925,7 +947,7 @@ init_cifs(void) out_unregister_resolver_key: #ifdef CONFIG_CIFS_DFS_UPCALL - unregister_key_type(&key_type_dns_resolver); + cifs_exit_dns_resolver(); out_unregister_key_type: #endif #ifdef CONFIG_CIFS_UPCALL @@ -941,6 +963,8 @@ init_cifs(void) cifs_destroy_inodecache(); out_clean_proc: cifs_proc_clean(); + cifs_fscache_unregister(); + out: return rc; } @@ -949,9 +973,10 @@ exit_cifs(void) { cFYI(DBG2, "exit_cifs"); cifs_proc_clean(); + cifs_fscache_unregister(); #ifdef CONFIG_CIFS_DFS_UPCALL cifs_dfs_release_automount_timer(); - unregister_key_type(&key_type_dns_resolver); + cifs_exit_dns_resolver(); #endif #ifdef CONFIG_CIFS_UPCALL unregister_key_type(&cifs_spnego_key_type); diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 0242ff9cbf41..d82f5fb4761e 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -84,7 +84,7 @@ extern ssize_t cifs_user_read(struct file *file, char __user *read_data, extern ssize_t cifs_user_write(struct file *file, const char __user *write_data, size_t write_size, loff_t *poffset); extern int cifs_lock(struct file *, int, struct file_lock *); -extern int cifs_fsync(struct file *, struct dentry *, int); +extern int cifs_fsync(struct file *, int); extern int cifs_flush(struct file *, fl_owner_t id); extern int cifs_file_mmap(struct file * , struct vm_area_struct *); extern const struct file_operations cifs_dir_ops; @@ -114,5 +114,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); extern const struct export_operations cifs_export_ops; #endif /* EXPERIMENTAL */ -#define CIFS_VERSION "1.64" +#define CIFS_VERSION "1.65" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index a88479ceaad5..59906146ad36 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -16,6 +16,9 @@ * the GNU Lesser General Public License for 
more details. * */ +#ifndef _CIFS_GLOB_H +#define _CIFS_GLOB_H + #include <linux/in.h> #include <linux/in6.h> #include <linux/slab.h> @@ -34,7 +37,7 @@ #define MAX_SHARE_SIZE 64 /* used to be 20, this should still be enough */ #define MAX_USERNAME_SIZE 32 /* 32 is to allow for 15 char names + null termination then *2 for unicode versions */ -#define MAX_PASSWORD_SIZE 16 +#define MAX_PASSWORD_SIZE 512 /* max for windows seems to be 256 wide chars */ #define CIFS_MIN_RCV_POOL 4 @@ -80,8 +83,7 @@ enum statusEnum { }; enum securityEnum { - PLAINTXT = 0, /* Legacy with Plaintext passwords */ - LANMAN, /* Legacy LANMAN auth */ + LANMAN = 0, /* Legacy LANMAN auth */ NTLM, /* Legacy NTLM012 auth with NTLM hash */ NTLMv2, /* Legacy NTLM auth with NTLMv2 hash */ RawNTLMSSP, /* NTLMSSP without SPNEGO, NTLMv2 hash */ @@ -142,7 +144,6 @@ struct TCP_Server_Info { struct list_head pending_mid_q; void *Server_NlsInfo; /* BB - placeholder for future NLS info */ unsigned short server_codepage; /* codepage for the server */ - unsigned long ip_address; /* IP addr for the server if known */ enum protocolEnum protocolType; char versionMajor; char versionMinor; @@ -190,19 +191,9 @@ struct TCP_Server_Info { bool sec_mskerberos; /* supports legacy MS Kerberos */ bool sec_kerberosu2u; /* supports U2U Kerberos */ bool sec_ntlmssp; /* supports NTLMSSP */ -}; - -/* - * The following is our shortcut to user information. We surface the uid, - * and name. We always get the password on the fly in case it - * has changed. We also hang a list of sessions owned by this user off here. - */ -struct cifsUidInfo { - struct list_head userList; - struct list_head sessionList; /* SMB sessions for this user */ - uid_t linux_uid; - char user[MAX_USERNAME_SIZE + 1]; /* ascii name of user */ - /* BB may need ptr or callback for PAM or WinBind info */ +#ifdef CONFIG_CIFS_FSCACHE + struct fscache_cookie *fscache; /* client index cache cookie */ +#endif }; /* @@ -212,9 +203,6 @@ struct cifsSesInfo { struct list_head smb_ses_list; struct list_head tcon_list; struct mutex session_mutex; -#if 0 - struct cifsUidInfo *uidInfo; /* pointer to user info */ -#endif struct TCP_Server_Info *server; /* pointer to server info */ int ses_count; /* reference counter */ enum statusEnum status; @@ -226,7 +214,8 @@ struct cifsSesInfo { char *serverNOS; /* name of network operating system of server */ char *serverDomain; /* security realm of server */ int Suid; /* remote smb uid */ - uid_t linux_uid; /* local Linux uid */ + uid_t linux_uid; /* overriding owner of files on the mount */ + uid_t cred_uid; /* owner of credentials */ int capabilities; char serverName[SERVER_NAME_LEN_WITH_NULL * 2]; /* BB make bigger for TCP names - will ipv6 and sctp addresses fit? */ @@ -311,6 +300,10 @@ struct cifsTconInfo { bool local_lease:1; /* check leases (only) on local system not remote */ bool broken_posix_open; /* e.g. Samba server versions < 3.3.2, 3.2.9 */ bool need_reconnect:1; /* connection reset, tid now invalid */ +#ifdef CONFIG_CIFS_FSCACHE + u64 resource_id; /* server resource id */ + struct fscache_cookie *fscache; /* cookie for share */ +#endif /* BB add field for back pointer to sb struct(s)? 
*/ }; @@ -398,6 +391,9 @@ struct cifsInodeInfo { bool invalid_mapping:1; /* pagecache is invalid */ u64 server_eof; /* current file size on server */ u64 uniqueid; /* server inode number */ +#ifdef CONFIG_CIFS_FSCACHE + struct fscache_cookie *fscache; +#endif struct inode vfs_inode; }; @@ -733,3 +729,5 @@ GLOBAL_EXTERN unsigned int cifs_min_small; /* min size of small buf pool */ GLOBAL_EXTERN unsigned int cifs_max_pending; /* MAX requests at once to server*/ extern const struct slow_work_ops cifs_oplock_break_ops; + +#endif /* _CIFS_GLOB_H */ diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index fb1657e0fdb8..2eaebbd31132 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -86,7 +86,9 @@ extern unsigned int smbCalcSize(struct smb_hdr *ptr); extern unsigned int smbCalcSize_LE(struct smb_hdr *ptr); extern int decode_negTokenInit(unsigned char *security_blob, int length, struct TCP_Server_Info *server); -extern int cifs_convert_address(char *src, void *dst); +extern int cifs_convert_address(struct sockaddr *dst, char *src); +extern int cifs_fill_sockaddr(struct sockaddr *dst, char *src, + unsigned short int port); extern int map_smb_to_linux_error(struct smb_hdr *smb, int logErr); extern void header_assemble(struct smb_hdr *, char /* command */ , const struct cifsTconInfo *, int /* length of @@ -106,7 +108,6 @@ extern struct cifsFileInfo *cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle, struct file *file, struct vfsmount *mnt, unsigned int oflags); extern int cifs_posix_open(char *full_path, struct inode **pinode, - struct vfsmount *mnt, struct super_block *sb, int mode, int oflags, __u32 *poplock, __u16 *pnetfid, int xid); diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 2208f06e4c45..2a43a0aca965 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -48,6 +48,7 @@ #include "nterr.h" #include "rfc1002pdu.h" #include "cn_cifs.h" +#include "fscache.h" #define CIFS_PORT 445 #define RFC1001_PORT 139 @@ -66,6 +67,7 @@ struct smb_vol { char *iocharset; /* local code page for mapping to and from Unicode */ char source_rfc1001_name[16]; /* netbios name of client */ char target_rfc1001_name[16]; /* netbios name of server for Win9x/ME */ + uid_t cred_uid; uid_t linux_uid; gid_t linux_gid; mode_t file_mode; @@ -97,6 +99,7 @@ struct smb_vol { bool noblocksnd:1; bool noautotune:1; bool nostrictsync:1; /* do not force expensive SMBflush on every sync */ + bool fsc:1; /* enable fscache */ unsigned int rsize; unsigned int wsize; bool sockopt_tcp_nodelay:1; @@ -830,7 +833,8 @@ cifs_parse_mount_options(char *options, const char *devname, /* null target name indicates to use *SMBSERVR default called name if we end up sending RFC1001 session initialize */ vol->target_rfc1001_name[0] = 0; - vol->linux_uid = current_uid(); /* use current_euid() instead? 
*/ + vol->cred_uid = current_uid(); + vol->linux_uid = current_uid(); vol->linux_gid = current_gid(); /* default to only allowing write access to owner of the mount */ @@ -1257,6 +1261,12 @@ cifs_parse_mount_options(char *options, const char *devname, } else if ((strnicmp(data, "nocase", 6) == 0) || (strnicmp(data, "ignorecase", 10) == 0)) { vol->nocase = 1; + } else if (strnicmp(data, "mand", 4) == 0) { + /* ignore */ + } else if (strnicmp(data, "nomand", 6) == 0) { + /* ignore */ + } else if (strnicmp(data, "_netdev", 7) == 0) { + /* ignore */ } else if (strnicmp(data, "brl", 3) == 0) { vol->nobrl = 0; } else if ((strnicmp(data, "nobrl", 5) == 0) || @@ -1331,6 +1341,8 @@ cifs_parse_mount_options(char *options, const char *devname, printk(KERN_WARNING "CIFS: Mount option noac not " "supported. Instead set " "/proc/fs/cifs/LookupCacheEnabled to 0\n"); + } else if (strnicmp(data, "fsc", 3) == 0) { + vol->fsc = true; } else printk(KERN_WARNING "CIFS: Unknown mount option %s\n", data); @@ -1380,18 +1392,92 @@ cifs_parse_mount_options(char *options, const char *devname, return 0; } +static bool +match_address(struct TCP_Server_Info *server, struct sockaddr *addr) +{ + struct sockaddr_in *addr4 = (struct sockaddr_in *)addr; + struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)addr; + + switch (addr->sa_family) { + case AF_INET: + if (addr4->sin_addr.s_addr != + server->addr.sockAddr.sin_addr.s_addr) + return false; + if (addr4->sin_port && + addr4->sin_port != server->addr.sockAddr.sin_port) + return false; + break; + case AF_INET6: + if (!ipv6_addr_equal(&addr6->sin6_addr, + &server->addr.sockAddr6.sin6_addr)) + return false; + if (addr6->sin6_scope_id != + server->addr.sockAddr6.sin6_scope_id) + return false; + if (addr6->sin6_port && + addr6->sin6_port != server->addr.sockAddr6.sin6_port) + return false; + break; + } + + return true; +} + +static bool +match_security(struct TCP_Server_Info *server, struct smb_vol *vol) +{ + unsigned int secFlags; + + if (vol->secFlg & (~(CIFSSEC_MUST_SIGN | CIFSSEC_MUST_SEAL))) + secFlags = vol->secFlg; + else + secFlags = global_secflags | vol->secFlg; + + switch (server->secType) { + case LANMAN: + if (!(secFlags & (CIFSSEC_MAY_LANMAN|CIFSSEC_MAY_PLNTXT))) + return false; + break; + case NTLMv2: + if (!(secFlags & CIFSSEC_MAY_NTLMV2)) + return false; + break; + case NTLM: + if (!(secFlags & CIFSSEC_MAY_NTLM)) + return false; + break; + case Kerberos: + if (!(secFlags & CIFSSEC_MAY_KRB5)) + return false; + break; + case RawNTLMSSP: + if (!(secFlags & CIFSSEC_MAY_NTLMSSP)) + return false; + break; + default: + /* shouldn't happen */ + return false; + } + + /* now check if signing mode is acceptible */ + if ((secFlags & CIFSSEC_MAY_SIGN) == 0 && + (server->secMode & SECMODE_SIGN_REQUIRED)) + return false; + else if (((secFlags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN) && + (server->secMode & + (SECMODE_SIGN_ENABLED|SECMODE_SIGN_REQUIRED)) == 0) + return false; + + return true; +} + static struct TCP_Server_Info * -cifs_find_tcp_session(struct sockaddr_storage *addr, unsigned short int port) +cifs_find_tcp_session(struct sockaddr *addr, struct smb_vol *vol) { - struct list_head *tmp; struct TCP_Server_Info *server; - struct sockaddr_in *addr4 = (struct sockaddr_in *) addr; - struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *) addr; write_lock(&cifs_tcp_ses_lock); - list_for_each(tmp, &cifs_tcp_ses_list) { - server = list_entry(tmp, struct TCP_Server_Info, - tcp_ses_list); + list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) { /* * the demux 
thread can exit on its own while still in CifsNew * so don't accept any sockets in that state. Since the @@ -1401,37 +1487,11 @@ cifs_find_tcp_session(struct sockaddr_storage *addr, unsigned short int port) if (server->tcpStatus == CifsNew) continue; - switch (addr->ss_family) { - case AF_INET: - if (addr4->sin_addr.s_addr == - server->addr.sockAddr.sin_addr.s_addr) { - addr4->sin_port = htons(port); - /* user overrode default port? */ - if (addr4->sin_port) { - if (addr4->sin_port != - server->addr.sockAddr.sin_port) - continue; - } - break; - } else - continue; + if (!match_address(server, addr)) + continue; - case AF_INET6: - if (ipv6_addr_equal(&addr6->sin6_addr, - &server->addr.sockAddr6.sin6_addr) && - (addr6->sin6_scope_id == - server->addr.sockAddr6.sin6_scope_id)) { - addr6->sin6_port = htons(port); - /* user overrode default port? */ - if (addr6->sin6_port) { - if (addr6->sin6_port != - server->addr.sockAddr6.sin6_port) - continue; - } - break; - } else - continue; - } + if (!match_security(server, vol)) + continue; ++server->srv_count; write_unlock(&cifs_tcp_ses_lock); @@ -1460,6 +1520,8 @@ cifs_put_tcp_session(struct TCP_Server_Info *server) server->tcpStatus = CifsExiting; spin_unlock(&GlobalMid_Lock); + cifs_fscache_release_client_cookie(server); + task = xchg(&server->tsk, NULL); if (task) force_sig(SIGKILL, task); @@ -1479,7 +1541,9 @@ cifs_get_tcp_session(struct smb_vol *volume_info) cFYI(1, "UNC: %s ip: %s", volume_info->UNC, volume_info->UNCip); if (volume_info->UNCip && volume_info->UNC) { - rc = cifs_convert_address(volume_info->UNCip, &addr); + rc = cifs_fill_sockaddr((struct sockaddr *)&addr, + volume_info->UNCip, + volume_info->port); if (!rc) { /* we failed translating address */ rc = -EINVAL; @@ -1499,7 +1563,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info) } /* see if we already have a matching tcp_ses */ - tcp_ses = cifs_find_tcp_session(&addr, volume_info->port); + tcp_ses = cifs_find_tcp_session((struct sockaddr *)&addr, volume_info); if (tcp_ses) return tcp_ses; @@ -1543,12 +1607,10 @@ cifs_get_tcp_session(struct smb_vol *volume_info) cFYI(1, "attempting ipv6 connect"); /* BB should we allow ipv6 on port 139? 
*/ /* other OS never observed in Wild doing 139 with v6 */ - sin_server6->sin6_port = htons(volume_info->port); memcpy(&tcp_ses->addr.sockAddr6, sin_server6, sizeof(struct sockaddr_in6)); rc = ipv6_connect(tcp_ses); } else { - sin_server->sin_port = htons(volume_info->port); memcpy(&tcp_ses->addr.sockAddr, sin_server, sizeof(struct sockaddr_in)); rc = ipv4_connect(tcp_ses); @@ -1577,6 +1639,8 @@ cifs_get_tcp_session(struct smb_vol *volume_info) list_add(&tcp_ses->tcp_ses_list, &cifs_tcp_ses_list); write_unlock(&cifs_tcp_ses_lock); + cifs_fscache_get_client_cookie(tcp_ses); + return tcp_ses; out_err: @@ -1591,17 +1655,27 @@ out_err: } static struct cifsSesInfo * -cifs_find_smb_ses(struct TCP_Server_Info *server, char *username) +cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb_vol *vol) { - struct list_head *tmp; struct cifsSesInfo *ses; write_lock(&cifs_tcp_ses_lock); - list_for_each(tmp, &server->smb_ses_list) { - ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list); - if (strncmp(ses->userName, username, MAX_USERNAME_SIZE)) - continue; - + list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { + switch (server->secType) { + case Kerberos: + if (vol->cred_uid != ses->cred_uid) + continue; + break; + default: + /* anything else takes username/password */ + if (strncmp(ses->userName, vol->username, + MAX_USERNAME_SIZE)) + continue; + if (strlen(vol->username) != 0 && + strncmp(ses->password, vol->password, + MAX_PASSWORD_SIZE)) + continue; + } ++ses->ses_count; write_unlock(&cifs_tcp_ses_lock); return ses; @@ -1643,7 +1717,7 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info) xid = GetXid(); - ses = cifs_find_smb_ses(server, volume_info->username); + ses = cifs_find_smb_ses(server, volume_info); if (ses) { cFYI(1, "Existing smb sess found (status=%d)", ses->status); @@ -1706,6 +1780,7 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info) if (ses->domainName) strcpy(ses->domainName, volume_info->domainname); } + ses->cred_uid = volume_info->cred_uid; ses->linux_uid = volume_info->linux_uid; ses->overrideSecFlg = volume_info->secFlg; @@ -1773,6 +1848,7 @@ cifs_put_tcon(struct cifsTconInfo *tcon) CIFSSMBTDis(xid, tcon); _FreeXid(xid); + cifs_fscache_release_super_cookie(tcon); tconInfoFree(tcon); cifs_put_smb_ses(ses); } @@ -1843,6 +1919,8 @@ cifs_get_tcon(struct cifsSesInfo *ses, struct smb_vol *volume_info) list_add(&tcon->tcon_list, &ses->tcon_list); write_unlock(&cifs_tcp_ses_lock); + cifs_fscache_get_super_cookie(tcon); + return tcon; out_fail: @@ -2397,6 +2475,8 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info, cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_OVERR_GID; if (pvolume_info->dynperm) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DYNPERM; + if (pvolume_info->fsc) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_FSCACHE; if (pvolume_info->direct_io) { cFYI(1, "mounting share using direct i/o"); cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO; diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 391816b461ca..a7de5e9fff11 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -25,6 +25,7 @@ #include <linux/slab.h> #include <linux/namei.h> #include <linux/mount.h> +#include <linux/file.h> #include "cifsfs.h" #include "cifspdu.h" #include "cifsglob.h" @@ -129,12 +130,6 @@ cifs_bp_rename_retry: return full_path; } -/* - * When called with struct file pointer set to NULL, there is no way we could - * update file->private_data, but getting it stuck on openFileList provides a - * way to access it from cifs_fill_filedata and thereby 
set file->private_data - * from cifs_open. - */ struct cifsFileInfo * cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle, struct file *file, struct vfsmount *mnt, unsigned int oflags) @@ -184,12 +179,13 @@ cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle, } write_unlock(&GlobalSMBSeslock); + file->private_data = pCifsFile; + return pCifsFile; } int cifs_posix_open(char *full_path, struct inode **pinode, - struct vfsmount *mnt, struct super_block *sb, - int mode, int oflags, + struct super_block *sb, int mode, int oflags, __u32 *poplock, __u16 *pnetfid, int xid) { int rc; @@ -258,19 +254,6 @@ int cifs_posix_open(char *full_path, struct inode **pinode, cifs_fattr_to_inode(*pinode, &fattr); } - /* - * cifs_fill_filedata() takes care of setting cifsFileInfo pointer to - * file->private_data. - */ - if (mnt) { - struct cifsFileInfo *pfile_info; - - pfile_info = cifs_new_fileinfo(*pinode, *pnetfid, NULL, mnt, - oflags); - if (pfile_info == NULL) - rc = -ENOMEM; - } - posix_open_ret: kfree(presp_data); return rc; @@ -298,7 +281,6 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, int create_options = CREATE_NOT_DIR; __u32 oplock = 0; int oflags; - bool posix_create = false; /* * BB below access is probably too much for mknod to request * but we have to do query and setpathinfo so requesting @@ -339,7 +321,6 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, (CIFS_UNIX_POSIX_PATH_OPS_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability))) { rc = cifs_posix_open(full_path, &newinode, - nd ? nd->path.mnt : NULL, inode->i_sb, mode, oflags, &oplock, &fileHandle, xid); /* EIO could indicate that (posix open) operation is not supported, despite what server claimed in capability @@ -347,7 +328,6 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, handled in posix open */ if (rc == 0) { - posix_create = true; if (newinode == NULL) /* query inode info */ goto cifs_create_get_file_info; else /* success, no need to query */ @@ -478,21 +458,28 @@ cifs_create_set_dentry: else cFYI(1, "Create worked, get_inode_info failed rc = %d", rc); - /* nfsd case - nfs srv does not set nd */ - if ((nd == NULL) || (!(nd->flags & LOOKUP_OPEN))) { - /* mknod case - do not leave file open */ - CIFSSMBClose(xid, tcon, fileHandle); - } else if (!(posix_create) && (newinode)) { + if (newinode && nd && (nd->flags & LOOKUP_OPEN)) { struct cifsFileInfo *pfile_info; - /* - * cifs_fill_filedata() takes care of setting cifsFileInfo - * pointer to file->private_data. 
- */ - pfile_info = cifs_new_fileinfo(newinode, fileHandle, NULL, + struct file *filp; + + filp = lookup_instantiate_filp(nd, direntry, generic_file_open); + if (IS_ERR(filp)) { + rc = PTR_ERR(filp); + CIFSSMBClose(xid, tcon, fileHandle); + goto cifs_create_out; + } + + pfile_info = cifs_new_fileinfo(newinode, fileHandle, filp, nd->path.mnt, oflags); - if (pfile_info == NULL) + if (pfile_info == NULL) { + fput(filp); + CIFSSMBClose(xid, tcon, fileHandle); rc = -ENOMEM; + } + } else { + CIFSSMBClose(xid, tcon, fileHandle); } + cifs_create_out: kfree(buf); kfree(full_path); @@ -636,6 +623,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, bool posix_open = false; struct cifs_sb_info *cifs_sb; struct cifsTconInfo *pTcon; + struct cifsFileInfo *cfile; struct inode *newInode = NULL; char *full_path = NULL; struct file *filp; @@ -703,7 +691,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, if (nd && !(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY)) && (nd->flags & LOOKUP_OPEN) && !pTcon->broken_posix_open && (nd->intent.open.flags & O_CREAT)) { - rc = cifs_posix_open(full_path, &newInode, nd->path.mnt, + rc = cifs_posix_open(full_path, &newInode, parent_dir_inode->i_sb, nd->intent.open.create_mode, nd->intent.open.flags, &oplock, @@ -733,8 +721,25 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, else direntry->d_op = &cifs_dentry_ops; d_add(direntry, newInode); - if (posix_open) - filp = lookup_instantiate_filp(nd, direntry, NULL); + if (posix_open) { + filp = lookup_instantiate_filp(nd, direntry, + generic_file_open); + if (IS_ERR(filp)) { + rc = PTR_ERR(filp); + CIFSSMBClose(xid, pTcon, fileHandle); + goto lookup_out; + } + + cfile = cifs_new_fileinfo(newInode, fileHandle, filp, + nd->path.mnt, + nd->intent.open.flags); + if (cfile == NULL) { + fput(filp); + CIFSSMBClose(xid, pTcon, fileHandle); + rc = -ENOMEM; + goto lookup_out; + } + } /* since paths are not looked up by component - the parent directories are presumed to be good here */ renew_parental_timestamps(direntry); @@ -755,6 +760,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, is a common return code */ } +lookup_out: kfree(full_path); FreeXid(xid); return ERR_PTR(rc); diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c index 4db2c5e7283f..3ad7f4300c45 100644 --- a/fs/cifs/dns_resolve.c +++ b/fs/cifs/dns_resolve.c @@ -24,12 +24,16 @@ */ #include <linux/slab.h> +#include <linux/keyctl.h> +#include <linux/key-type.h> #include <keys/user-type.h> #include "dns_resolve.h" #include "cifsglob.h" #include "cifsproto.h" #include "cifs_debug.h" +static const struct cred *dns_resolver_cache; + /* Checks if supplied name is IP address * returns: * 1 - name is IP @@ -40,7 +44,7 @@ is_ip(char *name) { struct sockaddr_storage ss; - return cifs_convert_address(name, &ss); + return cifs_convert_address((struct sockaddr *)&ss, name); } static int @@ -94,6 +98,7 @@ struct key_type key_type_dns_resolver = { int dns_resolve_server_name_to_ip(const char *unc, char **ip_addr) { + const struct cred *saved_cred; int rc = -EAGAIN; struct key *rkey = ERR_PTR(-EAGAIN); char *name; @@ -133,8 +138,15 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr) goto skip_upcall; } + saved_cred = override_creds(dns_resolver_cache); rkey = request_key(&key_type_dns_resolver, name, ""); + revert_creds(saved_cred); if (!IS_ERR(rkey)) { + if (!(rkey->perm & KEY_USR_VIEW)) { + down_read(&rkey->sem); + rkey->perm |= KEY_USR_VIEW; + up_read(&rkey->sem); + } len = 
rkey->type_data.x[0]; data = rkey->payload.data; } else { @@ -165,4 +177,61 @@ out: return rc; } +int __init cifs_init_dns_resolver(void) +{ + struct cred *cred; + struct key *keyring; + int ret; + + printk(KERN_NOTICE "Registering the %s key type\n", + key_type_dns_resolver.name); + + /* create an override credential set with a special thread keyring in + * which DNS requests are cached + * + * this is used to prevent malicious redirections from being installed + * with add_key(). + */ + cred = prepare_kernel_cred(NULL); + if (!cred) + return -ENOMEM; + + keyring = key_alloc(&key_type_keyring, ".dns_resolver", 0, 0, cred, + (KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW | KEY_USR_READ, + KEY_ALLOC_NOT_IN_QUOTA); + if (IS_ERR(keyring)) { + ret = PTR_ERR(keyring); + goto failed_put_cred; + } + + ret = key_instantiate_and_link(keyring, NULL, 0, NULL, NULL); + if (ret < 0) + goto failed_put_key; + + ret = register_key_type(&key_type_dns_resolver); + if (ret < 0) + goto failed_put_key; + + /* instruct request_key() to use this special keyring as a cache for + * the results it looks up */ + cred->thread_keyring = keyring; + cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; + dns_resolver_cache = cred; + return 0; + +failed_put_key: + key_put(keyring); +failed_put_cred: + put_cred(cred); + return ret; +} +void cifs_exit_dns_resolver(void) +{ + key_revoke(dns_resolver_cache->thread_keyring); + unregister_key_type(&key_type_dns_resolver); + put_cred(dns_resolver_cache); + printk(KERN_NOTICE "Unregistered %s key type\n", + key_type_dns_resolver.name); +} diff --git a/fs/cifs/dns_resolve.h b/fs/cifs/dns_resolve.h index 966e9288930b..5d7f291df162 100644 --- a/fs/cifs/dns_resolve.h +++ b/fs/cifs/dns_resolve.h @@ -24,8 +24,8 @@ #define _DNS_RESOLVE_H #ifdef __KERNEL__ -#include <linux/key-type.h> -extern struct key_type key_type_dns_resolver; +extern int __init cifs_init_dns_resolver(void); +extern void cifs_exit_dns_resolver(void); extern int dns_resolve_server_name_to_ip(const char *unc, char **ip_addr); #endif /* KERNEL */ diff --git a/fs/cifs/file.c b/fs/cifs/file.c index a83541ec9713..fa04a00d126d 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -40,6 +40,7 @@ #include "cifs_unicode.h" #include "cifs_debug.h" #include "cifs_fs_sb.h" +#include "fscache.h" static inline int cifs_convert_flags(unsigned int flags) { @@ -162,44 +163,12 @@ psx_client_can_cache: return 0; } -static struct cifsFileInfo * -cifs_fill_filedata(struct file *file) -{ - struct list_head *tmp; - struct cifsFileInfo *pCifsFile = NULL; - struct cifsInodeInfo *pCifsInode = NULL; - - /* search inode for this file and fill in file->private_data */ - pCifsInode = CIFS_I(file->f_path.dentry->d_inode); - read_lock(&GlobalSMBSeslock); - list_for_each(tmp, &pCifsInode->openFileList) { - pCifsFile = list_entry(tmp, struct cifsFileInfo, flist); - if ((pCifsFile->pfile == NULL) && - (pCifsFile->pid == current->tgid)) { - /* mode set in cifs_create */ - - /* needed for writepage */ - pCifsFile->pfile = file; - file->private_data = pCifsFile; - break; - } - } - read_unlock(&GlobalSMBSeslock); - - if (file->private_data != NULL) { - return pCifsFile; - } else if ((file->f_flags & O_CREAT) && (file->f_flags & O_EXCL)) - cERROR(1, "could not find file instance for " - "new file %p", file); - return NULL; -} - /* all arguments to this function must be checked for validity in caller */ -static inline int cifs_open_inode_helper(struct inode *inode, struct file *file, - struct cifsInodeInfo *pCifsInode, struct cifsFileInfo *pCifsFile, +static 
inline int cifs_open_inode_helper(struct inode *inode, struct cifsTconInfo *pTcon, int *oplock, FILE_ALL_INFO *buf, char *full_path, int xid) { + struct cifsInodeInfo *pCifsInode = CIFS_I(inode); struct timespec temp; int rc; @@ -213,36 +182,35 @@ static inline int cifs_open_inode_helper(struct inode *inode, struct file *file, /* if not oplocked, invalidate inode pages if mtime or file size changed */ temp = cifs_NTtimeToUnix(buf->LastWriteTime); - if (timespec_equal(&file->f_path.dentry->d_inode->i_mtime, &temp) && - (file->f_path.dentry->d_inode->i_size == + if (timespec_equal(&inode->i_mtime, &temp) && + (inode->i_size == (loff_t)le64_to_cpu(buf->EndOfFile))) { cFYI(1, "inode unchanged on server"); } else { - if (file->f_path.dentry->d_inode->i_mapping) { + if (inode->i_mapping) { /* BB no need to lock inode until after invalidate since namei code should already have it locked? */ - rc = filemap_write_and_wait(file->f_path.dentry->d_inode->i_mapping); + rc = filemap_write_and_wait(inode->i_mapping); if (rc != 0) - CIFS_I(file->f_path.dentry->d_inode)->write_behind_rc = rc; + pCifsInode->write_behind_rc = rc; } cFYI(1, "invalidating remote inode since open detected it " "changed"); - invalidate_remote_inode(file->f_path.dentry->d_inode); + invalidate_remote_inode(inode); } client_can_cache: if (pTcon->unix_ext) - rc = cifs_get_inode_info_unix(&file->f_path.dentry->d_inode, - full_path, inode->i_sb, xid); + rc = cifs_get_inode_info_unix(&inode, full_path, inode->i_sb, + xid); else - rc = cifs_get_inode_info(&file->f_path.dentry->d_inode, - full_path, buf, inode->i_sb, xid, NULL); + rc = cifs_get_inode_info(&inode, full_path, buf, inode->i_sb, + xid, NULL); if ((*oplock & 0xF) == OPLOCK_EXCLUSIVE) { pCifsInode->clientCanCacheAll = true; pCifsInode->clientCanCacheRead = true; - cFYI(1, "Exclusive Oplock granted on inode %p", - file->f_path.dentry->d_inode); + cFYI(1, "Exclusive Oplock granted on inode %p", inode); } else if ((*oplock & 0xF) == OPLOCK_READ) pCifsInode->clientCanCacheRead = true; @@ -256,7 +224,7 @@ int cifs_open(struct inode *inode, struct file *file) __u32 oplock; struct cifs_sb_info *cifs_sb; struct cifsTconInfo *tcon; - struct cifsFileInfo *pCifsFile; + struct cifsFileInfo *pCifsFile = NULL; struct cifsInodeInfo *pCifsInode; char *full_path = NULL; int desiredAccess; @@ -270,12 +238,6 @@ int cifs_open(struct inode *inode, struct file *file) tcon = cifs_sb->tcon; pCifsInode = CIFS_I(file->f_path.dentry->d_inode); - pCifsFile = cifs_fill_filedata(file); - if (pCifsFile) { - rc = 0; - FreeXid(xid); - return rc; - } full_path = build_path_from_dentry(file->f_path.dentry); if (full_path == NULL) { @@ -299,8 +261,7 @@ int cifs_open(struct inode *inode, struct file *file) int oflags = (int) cifs_posix_convert_flags(file->f_flags); oflags |= SMB_O_CREAT; /* can not refresh inode info since size could be stale */ - rc = cifs_posix_open(full_path, &inode, file->f_path.mnt, - inode->i_sb, + rc = cifs_posix_open(full_path, &inode, inode->i_sb, cifs_sb->mnt_file_mode /* ignored */, oflags, &oplock, &netfid, xid); if (rc == 0) { @@ -308,9 +269,23 @@ int cifs_open(struct inode *inode, struct file *file) /* no need for special case handling of setting mode on read only files needed here */ - pCifsFile = cifs_fill_filedata(file); - cifs_posix_open_inode_helper(inode, file, pCifsInode, - oplock, netfid); + rc = cifs_posix_open_inode_helper(inode, file, + pCifsInode, oplock, netfid); + if (rc != 0) { + CIFSSMBClose(xid, tcon, netfid); + goto out; + } + + pCifsFile = cifs_new_fileinfo(inode, 
netfid, file, + file->f_path.mnt, + oflags); + if (pCifsFile == NULL) { + CIFSSMBClose(xid, tcon, netfid); + rc = -ENOMEM; + } + + cifs_fscache_set_inode_cookie(inode, file); + goto out; } else if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) { if (tcon->ses->serverNOS) @@ -391,16 +366,18 @@ int cifs_open(struct inode *inode, struct file *file) goto out; } + rc = cifs_open_inode_helper(inode, tcon, &oplock, buf, full_path, xid); + if (rc != 0) + goto out; + pCifsFile = cifs_new_fileinfo(inode, netfid, file, file->f_path.mnt, file->f_flags); - file->private_data = pCifsFile; - if (file->private_data == NULL) { + if (pCifsFile == NULL) { rc = -ENOMEM; goto out; } - rc = cifs_open_inode_helper(inode, file, pCifsInode, pCifsFile, tcon, - &oplock, buf, full_path, xid); + cifs_fscache_set_inode_cookie(inode, file); if (oplock & CIFS_CREATE_ACTION) { /* time to set mode which we can not set earlier due to @@ -456,7 +433,7 @@ static int cifs_reopen_file(struct file *file, bool can_flush) __u16 netfid; if (file->private_data) - pCifsFile = (struct cifsFileInfo *)file->private_data; + pCifsFile = file->private_data; else return -EBADF; @@ -513,8 +490,7 @@ reopen_error_exit: le64_to_cpu(tcon->fsUnixInfo.Capability))) { int oflags = (int) cifs_posix_convert_flags(file->f_flags); /* can not refresh inode info since size could be stale */ - rc = cifs_posix_open(full_path, NULL, file->f_path.mnt, - inode->i_sb, + rc = cifs_posix_open(full_path, NULL, inode->i_sb, cifs_sb->mnt_file_mode /* ignored */, oflags, &oplock, &netfid, xid); if (rc == 0) { @@ -595,8 +571,7 @@ int cifs_close(struct inode *inode, struct file *file) int xid, timeout; struct cifs_sb_info *cifs_sb; struct cifsTconInfo *pTcon; - struct cifsFileInfo *pSMBFile = - (struct cifsFileInfo *)file->private_data; + struct cifsFileInfo *pSMBFile = file->private_data; xid = GetXid(); @@ -671,8 +646,7 @@ int cifs_closedir(struct inode *inode, struct file *file) { int rc = 0; int xid; - struct cifsFileInfo *pCFileStruct = - (struct cifsFileInfo *)file->private_data; + struct cifsFileInfo *pCFileStruct = file->private_data; char *ptmp; cFYI(1, "Closedir inode = 0x%p", inode); @@ -893,8 +867,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock) length, pfLock, posix_lock_type, wait_flag); } else { - struct cifsFileInfo *fid = - (struct cifsFileInfo *)file->private_data; + struct cifsFileInfo *fid = file->private_data; if (numLock) { rc = CIFSSMBLock(xid, tcon, netfid, length, @@ -995,7 +968,7 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data, if (file->private_data == NULL) return -EBADF; - open_file = (struct cifsFileInfo *) file->private_data; + open_file = file->private_data; rc = generic_write_checks(file, poffset, &write_size, 0); if (rc) @@ -1097,7 +1070,7 @@ static ssize_t cifs_write(struct file *file, const char *write_data, if (file->private_data == NULL) return -EBADF; - open_file = (struct cifsFileInfo *)file->private_data; + open_file = file->private_data; xid = GetXid(); @@ -1676,19 +1649,18 @@ static int cifs_write_end(struct file *file, struct address_space *mapping, return rc; } -int cifs_fsync(struct file *file, struct dentry *dentry, int datasync) +int cifs_fsync(struct file *file, int datasync) { int xid; int rc = 0; struct cifsTconInfo *tcon; - struct cifsFileInfo *smbfile = - (struct cifsFileInfo *)file->private_data; + struct cifsFileInfo *smbfile = file->private_data; struct inode *inode = file->f_path.dentry->d_inode; xid = GetXid(); cFYI(1, "Sync file - name: %s datasync: 0x%x", - 
dentry->d_name.name, datasync); + file->f_path.dentry->d_name.name, datasync); rc = filemap_write_and_wait(inode->i_mapping); if (rc == 0) { @@ -1786,7 +1758,7 @@ ssize_t cifs_user_read(struct file *file, char __user *read_data, FreeXid(xid); return rc; } - open_file = (struct cifsFileInfo *)file->private_data; + open_file = file->private_data; if ((file->f_flags & O_ACCMODE) == O_WRONLY) cFYI(1, "attempting read on write only file instance"); @@ -1867,7 +1839,7 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size, FreeXid(xid); return rc; } - open_file = (struct cifsFileInfo *)file->private_data; + open_file = file->private_data; if ((file->f_flags & O_ACCMODE) == O_WRONLY) cFYI(1, "attempting read on write only file instance"); @@ -1952,6 +1924,7 @@ static void cifs_copy_cache_pages(struct address_space *mapping, bytes_read -= PAGE_CACHE_SIZE; continue; } + page_cache_release(page); target = kmap_atomic(page, KM_USER0); @@ -1971,6 +1944,9 @@ static void cifs_copy_cache_pages(struct address_space *mapping, SetPageUptodate(page); unlock_page(page); data += PAGE_CACHE_SIZE; + + /* add page to FS-Cache */ + cifs_readpage_to_fscache(mapping->host, page); } return; } @@ -1997,10 +1973,19 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, FreeXid(xid); return rc; } - open_file = (struct cifsFileInfo *)file->private_data; + open_file = file->private_data; cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); pTcon = cifs_sb->tcon; + /* + * Reads as many pages as possible from fscache. Returns -ENOBUFS + * immediately if the cookie is negative + */ + rc = cifs_readpages_from_fscache(mapping->host, mapping, page_list, + &num_pages); + if (rc == 0) + goto read_complete; + cFYI(DBG2, "rpages: num pages %d", num_pages); for (i = 0; i < num_pages; ) { unsigned contig_pages; @@ -2111,6 +2096,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, smb_read_data = NULL; } +read_complete: FreeXid(xid); return rc; } @@ -2121,6 +2107,11 @@ static int cifs_readpage_worker(struct file *file, struct page *page, char *read_data; int rc; + /* Is the page cached? 
*/ + rc = cifs_readpage_from_fscache(file->f_path.dentry->d_inode, page); + if (rc == 0) + goto read_complete; + page_cache_get(page); read_data = kmap(page); /* for reads over a certain size could initiate async read ahead */ @@ -2140,11 +2131,17 @@ static int cifs_readpage_worker(struct file *file, struct page *page, flush_dcache_page(page); SetPageUptodate(page); + + /* send this page to the cache */ + cifs_readpage_to_fscache(file->f_path.dentry->d_inode, page); + rc = 0; io_error: kunmap(page); page_cache_release(page); + +read_complete: return rc; } @@ -2294,6 +2291,22 @@ out: return rc; } +static int cifs_release_page(struct page *page, gfp_t gfp) +{ + if (PagePrivate(page)) + return 0; + + return cifs_fscache_release_page(page, gfp); +} + +static void cifs_invalidate_page(struct page *page, unsigned long offset) +{ + struct cifsInodeInfo *cifsi = CIFS_I(page->mapping->host); + + if (offset == 0) + cifs_fscache_invalidate_page(page, &cifsi->vfs_inode); +} + static void cifs_oplock_break(struct slow_work *work) { @@ -2367,6 +2380,8 @@ const struct address_space_operations cifs_addr_ops = { .write_begin = cifs_write_begin, .write_end = cifs_write_end, .set_page_dirty = __set_page_dirty_nobuffers, + .releasepage = cifs_release_page, + .invalidatepage = cifs_invalidate_page, /* .sync_page = cifs_sync_page, */ /* .direct_IO = */ }; @@ -2383,6 +2398,8 @@ const struct address_space_operations cifs_addr_ops_smallbuf = { .write_begin = cifs_write_begin, .write_end = cifs_write_end, .set_page_dirty = __set_page_dirty_nobuffers, + .releasepage = cifs_release_page, + .invalidatepage = cifs_invalidate_page, /* .sync_page = cifs_sync_page, */ /* .direct_IO = */ }; diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c new file mode 100644 index 000000000000..9f3f5c4be161 --- /dev/null +++ b/fs/cifs/fscache.c @@ -0,0 +1,236 @@ +/* + * fs/cifs/fscache.c - CIFS filesystem cache interface + * + * Copyright (c) 2010 Novell, Inc. + * Author(s): Suresh Jayaraman <sjayaraman@suse.de> + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include "fscache.h" +#include "cifsglob.h" +#include "cifs_debug.h" +#include "cifs_fs_sb.h" + +void cifs_fscache_get_client_cookie(struct TCP_Server_Info *server) +{ + server->fscache = + fscache_acquire_cookie(cifs_fscache_netfs.primary_index, + &cifs_fscache_server_index_def, server); + cFYI(1, "CIFS: get client cookie (0x%p/0x%p)", server, + server->fscache); +} + +void cifs_fscache_release_client_cookie(struct TCP_Server_Info *server) +{ + cFYI(1, "CIFS: release client cookie (0x%p/0x%p)", server, + server->fscache); + fscache_relinquish_cookie(server->fscache, 0); + server->fscache = NULL; +} + +void cifs_fscache_get_super_cookie(struct cifsTconInfo *tcon) +{ + struct TCP_Server_Info *server = tcon->ses->server; + + tcon->fscache = + fscache_acquire_cookie(server->fscache, + &cifs_fscache_super_index_def, tcon); + cFYI(1, "CIFS: get superblock cookie (0x%p/0x%p)", + server->fscache, tcon->fscache); +} + +void cifs_fscache_release_super_cookie(struct cifsTconInfo *tcon) +{ + cFYI(1, "CIFS: releasing superblock cookie (0x%p)", tcon->fscache); + fscache_relinquish_cookie(tcon->fscache, 0); + tcon->fscache = NULL; +} + +static void cifs_fscache_enable_inode_cookie(struct inode *inode) +{ + struct cifsInodeInfo *cifsi = CIFS_I(inode); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + + if (cifsi->fscache) + return; + + cifsi->fscache = fscache_acquire_cookie(cifs_sb->tcon->fscache, + &cifs_fscache_inode_object_def, + cifsi); + cFYI(1, "CIFS: got FH cookie (0x%p/0x%p)", + cifs_sb->tcon->fscache, cifsi->fscache); +} + +void cifs_fscache_release_inode_cookie(struct inode *inode) +{ + struct cifsInodeInfo *cifsi = CIFS_I(inode); + + if (cifsi->fscache) { + cFYI(1, "CIFS releasing inode cookie (0x%p)", + cifsi->fscache); + fscache_relinquish_cookie(cifsi->fscache, 0); + cifsi->fscache = NULL; + } +} + +static void cifs_fscache_disable_inode_cookie(struct inode *inode) +{ + struct cifsInodeInfo *cifsi = CIFS_I(inode); + + if (cifsi->fscache) { + cFYI(1, "CIFS disabling inode cookie (0x%p)", + cifsi->fscache); + fscache_relinquish_cookie(cifsi->fscache, 1); + cifsi->fscache = NULL; + } +} + +void cifs_fscache_set_inode_cookie(struct inode *inode, struct file *filp) +{ + if ((filp->f_flags & O_ACCMODE) != O_RDONLY) + cifs_fscache_disable_inode_cookie(inode); + else { + cifs_fscache_enable_inode_cookie(inode); + cFYI(1, "CIFS: fscache inode cookie set"); + } +} + +void cifs_fscache_reset_inode_cookie(struct inode *inode) +{ + struct cifsInodeInfo *cifsi = CIFS_I(inode); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct fscache_cookie *old = cifsi->fscache; + + if (cifsi->fscache) { + /* retire the current fscache cache and get a new one */ + fscache_relinquish_cookie(cifsi->fscache, 1); + + cifsi->fscache = fscache_acquire_cookie(cifs_sb->tcon->fscache, + &cifs_fscache_inode_object_def, + cifsi); + cFYI(1, "CIFS: new cookie 0x%p oldcookie 0x%p", + cifsi->fscache, old); + } +} + +int cifs_fscache_release_page(struct page *page, gfp_t gfp) +{ + if (PageFsCache(page)) { + struct inode *inode = page->mapping->host; + struct cifsInodeInfo *cifsi = CIFS_I(inode); + + cFYI(1, "CIFS: fscache release page (0x%p/0x%p)", + page, cifsi->fscache); + if (!fscache_maybe_release_page(cifsi->fscache, page, gfp)) + return 0; + } + + return 1; +} + +static void 
cifs_readpage_from_fscache_complete(struct page *page, void *ctx, + int error) +{ + cFYI(1, "CIFS: readpage_from_fscache_complete (0x%p/%d)", + page, error); + if (!error) + SetPageUptodate(page); + unlock_page(page); +} + +/* + * Retrieve a page from FS-Cache + */ +int __cifs_readpage_from_fscache(struct inode *inode, struct page *page) +{ + int ret; + + cFYI(1, "CIFS: readpage_from_fscache(fsc:%p, p:%p, i:0x%p", + CIFS_I(inode)->fscache, page, inode); + ret = fscache_read_or_alloc_page(CIFS_I(inode)->fscache, page, + cifs_readpage_from_fscache_complete, + NULL, + GFP_KERNEL); + switch (ret) { + + case 0: /* page found in fscache, read submitted */ + cFYI(1, "CIFS: readpage_from_fscache: submitted"); + return ret; + case -ENOBUFS: /* page won't be cached */ + case -ENODATA: /* page not in cache */ + cFYI(1, "CIFS: readpage_from_fscache %d", ret); + return 1; + + default: + cERROR(1, "unknown error ret = %d", ret); + } + return ret; +} + +/* + * Retrieve a set of pages from FS-Cache + */ +int __cifs_readpages_from_fscache(struct inode *inode, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages) +{ + int ret; + + cFYI(1, "CIFS: __cifs_readpages_from_fscache (0x%p/%u/0x%p)", + CIFS_I(inode)->fscache, *nr_pages, inode); + ret = fscache_read_or_alloc_pages(CIFS_I(inode)->fscache, mapping, + pages, nr_pages, + cifs_readpage_from_fscache_complete, + NULL, + mapping_gfp_mask(mapping)); + switch (ret) { + case 0: /* read submitted to the cache for all pages */ + cFYI(1, "CIFS: readpages_from_fscache: submitted"); + return ret; + + case -ENOBUFS: /* some pages are not cached and can't be */ + case -ENODATA: /* some pages are not cached */ + cFYI(1, "CIFS: readpages_from_fscache: no page"); + return 1; + + default: + cFYI(1, "unknown error ret = %d", ret); + } + + return ret; +} + +void __cifs_readpage_to_fscache(struct inode *inode, struct page *page) +{ + int ret; + + cFYI(1, "CIFS: readpage_to_fscache(fsc: %p, p: %p, i: %p", + CIFS_I(inode)->fscache, page, inode); + ret = fscache_write_page(CIFS_I(inode)->fscache, page, GFP_KERNEL); + if (ret != 0) + fscache_uncache_page(CIFS_I(inode)->fscache, page); +} + +void __cifs_fscache_invalidate_page(struct page *page, struct inode *inode) +{ + struct cifsInodeInfo *cifsi = CIFS_I(inode); + struct fscache_cookie *cookie = cifsi->fscache; + + cFYI(1, "CIFS: fscache invalidatepage (0x%p/0x%p)", page, cookie); + fscache_wait_on_page_write(cookie, page); + fscache_uncache_page(cookie, page); +} + diff --git a/fs/cifs/fscache.h b/fs/cifs/fscache.h new file mode 100644 index 000000000000..31b88ec2341e --- /dev/null +++ b/fs/cifs/fscache.h @@ -0,0 +1,136 @@ +/* + * fs/cifs/fscache.h - CIFS filesystem cache interface definitions + * + * Copyright (c) 2010 Novell, Inc. + * Author(s): Suresh Jayaraman <sjayaraman@suse.de> + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _CIFS_FSCACHE_H +#define _CIFS_FSCACHE_H + +#include <linux/fscache.h> + +#include "cifsglob.h" + +#ifdef CONFIG_CIFS_FSCACHE + +extern struct fscache_netfs cifs_fscache_netfs; +extern const struct fscache_cookie_def cifs_fscache_server_index_def; +extern const struct fscache_cookie_def cifs_fscache_super_index_def; +extern const struct fscache_cookie_def cifs_fscache_inode_object_def; + +extern int cifs_fscache_register(void); +extern void cifs_fscache_unregister(void); + +/* + * fscache.c + */ +extern void cifs_fscache_get_client_cookie(struct TCP_Server_Info *); +extern void cifs_fscache_release_client_cookie(struct TCP_Server_Info *); +extern void cifs_fscache_get_super_cookie(struct cifsTconInfo *); +extern void cifs_fscache_release_super_cookie(struct cifsTconInfo *); + +extern void cifs_fscache_release_inode_cookie(struct inode *); +extern void cifs_fscache_set_inode_cookie(struct inode *, struct file *); +extern void cifs_fscache_reset_inode_cookie(struct inode *); + +extern void __cifs_fscache_invalidate_page(struct page *, struct inode *); +extern int cifs_fscache_release_page(struct page *page, gfp_t gfp); +extern int __cifs_readpage_from_fscache(struct inode *, struct page *); +extern int __cifs_readpages_from_fscache(struct inode *, + struct address_space *, + struct list_head *, + unsigned *); + +extern void __cifs_readpage_to_fscache(struct inode *, struct page *); + +static inline void cifs_fscache_invalidate_page(struct page *page, + struct inode *inode) +{ + if (PageFsCache(page)) + __cifs_fscache_invalidate_page(page, inode); +} + +static inline int cifs_readpage_from_fscache(struct inode *inode, + struct page *page) +{ + if (CIFS_I(inode)->fscache) + return __cifs_readpage_from_fscache(inode, page); + + return -ENOBUFS; +} + +static inline int cifs_readpages_from_fscache(struct inode *inode, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages) +{ + if (CIFS_I(inode)->fscache) + return __cifs_readpages_from_fscache(inode, mapping, pages, + nr_pages); + return -ENOBUFS; +} + +static inline void cifs_readpage_to_fscache(struct inode *inode, + struct page *page) +{ + if (PageFsCache(page)) + __cifs_readpage_to_fscache(inode, page); +} + +#else /* CONFIG_CIFS_FSCACHE */ +static inline int cifs_fscache_register(void) { return 0; } +static inline void cifs_fscache_unregister(void) {} + +static inline void +cifs_fscache_get_client_cookie(struct TCP_Server_Info *server) {} +static inline void +cifs_fscache_release_client_cookie(struct TCP_Server_Info *server) {} +static inline void cifs_fscache_get_super_cookie(struct cifsTconInfo *tcon) {} +static inline void +cifs_fscache_release_super_cookie(struct cifsTconInfo *tcon) {} + +static inline void cifs_fscache_release_inode_cookie(struct inode *inode) {} +static inline void cifs_fscache_set_inode_cookie(struct inode *inode, + struct file *filp) {} +static inline void cifs_fscache_reset_inode_cookie(struct inode *inode) {} +static inline int cifs_fscache_release_page(struct page *page, gfp_t gfp) +{ + return 1; /* May release page */ +} + +static inline void cifs_fscache_invalidate_page(struct page *page, + struct inode *inode) {} +static inline int +cifs_readpage_from_fscache(struct inode *inode, struct page *page) +{ + return -ENOBUFS; +} + +static inline int 
cifs_readpages_from_fscache(struct inode *inode, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages) +{ + return -ENOBUFS; +} + +static inline void cifs_readpage_to_fscache(struct inode *inode, + struct page *page) {} + +#endif /* CONFIG_CIFS_FSCACHE */ + +#endif /* _CIFS_FSCACHE_H */ diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 62b324f26a56..a15b3a9bbff4 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -29,6 +29,7 @@ #include "cifsproto.h" #include "cifs_debug.h" #include "cifs_fs_sb.h" +#include "fscache.h" static void cifs_set_ops(struct inode *inode, const bool is_dfs_referral) @@ -288,7 +289,7 @@ int cifs_get_file_info_unix(struct file *filp) struct inode *inode = filp->f_path.dentry->d_inode; struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct cifsTconInfo *tcon = cifs_sb->tcon; - struct cifsFileInfo *cfile = (struct cifsFileInfo *) filp->private_data; + struct cifsFileInfo *cfile = filp->private_data; xid = GetXid(); rc = CIFSSMBUnixQFileInfo(xid, tcon, cfile->netfid, &find_data); @@ -515,7 +516,7 @@ int cifs_get_file_info(struct file *filp) struct inode *inode = filp->f_path.dentry->d_inode; struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct cifsTconInfo *tcon = cifs_sb->tcon; - struct cifsFileInfo *cfile = (struct cifsFileInfo *) filp->private_data; + struct cifsFileInfo *cfile = filp->private_data; xid = GetXid(); rc = CIFSSMBQFileInfo(xid, tcon, cfile->netfid, &find_data); @@ -723,9 +724,14 @@ cifs_find_inode(struct inode *inode, void *opaque) { struct cifs_fattr *fattr = (struct cifs_fattr *) opaque; + /* don't match inode with different uniqueid */ if (CIFS_I(inode)->uniqueid != fattr->cf_uniqueid) return 0; + /* don't match inode of different type */ + if ((inode->i_mode & S_IFMT) != (fattr->cf_mode & S_IFMT)) + return 0; + /* * uh oh -- it's a directory. We can't use it since hardlinked dirs are * verboten. 
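Taken together, the fscache.h pieces above implement a read-through idiom: the inline wrappers only call into the real __cifs_* routines when the inode has a cache cookie, -ENOBUFS consistently means "nothing cached, go to the server", and a page read from the wire is then pushed back into the cache. A minimal userspace sketch of that control flow, with hypothetical cache_lookup()/cache_store() helpers standing in for the FS-Cache calls:

#include <errno.h>
#include <stdio.h>

/* Hypothetical stand-ins for fscache_read_or_alloc_page() and
 * fscache_write_page(); this toy cache is always cold. */
static int cache_lookup(const char *key, char *buf, size_t len)
{
    (void)key; (void)buf; (void)len;
    return -ENOBUFS;            /* nothing cached, caller must fetch */
}

static void cache_store(const char *key, const char *buf)
{
    (void)key; (void)buf;       /* would copy the page into the cache */
}

static int server_read(const char *key, char *buf, size_t len)
{
    snprintf(buf, len, "data-for-%s", key);
    return 0;
}

/* The cifs_readpage_from_fscache()/cifs_readpage_to_fscache() shape:
 * try the cache, fall back to the server, then repopulate the cache. */
static int read_through(const char *key, char *buf, size_t len)
{
    int rc = cache_lookup(key, buf, len);
    if (rc == 0)
        return 0;
    rc = server_read(key, buf, len);
    if (rc == 0)
        cache_store(key, buf);
    return rc;
}

int main(void)
{
    char buf[64];
    if (read_through("page0", buf, sizeof(buf)) == 0)
        printf("%s\n", buf);
    return 0;
}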
Disable serverino and return it as if it were found, the @@ -776,6 +782,10 @@ retry_iget5_locked: inode->i_flags |= S_NOATIME | S_NOCMTIME; if (inode->i_state & I_NEW) { inode->i_ino = hash; +#ifdef CONFIG_CIFS_FSCACHE + /* initialize per-inode cache cookie pointer */ + CIFS_I(inode)->fscache = NULL; +#endif unlock_new_inode(inode); } } @@ -807,6 +817,11 @@ struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino) if (!inode) return ERR_PTR(-ENOMEM); +#ifdef CONFIG_CIFS_FSCACHE + /* populate tcon->resource_id */ + cifs_sb->tcon->resource_id = CIFS_I(inode)->uniqueid; +#endif + if (rc && cifs_sb->tcon->ipc) { cFYI(1, "ipc connection - fake read inode"); inode->i_mode |= S_IFDIR; @@ -1401,6 +1416,10 @@ cifs_do_rename(int xid, struct dentry *from_dentry, const char *fromPath, if (rc == 0 || rc != -ETXTBSY) return rc; + /* open-file renames don't work across directories */ + if (to_dentry->d_parent != from_dentry->d_parent) + return rc; + /* open the file to be renamed -- we need DELETE perms */ rc = CIFSSMBOpen(xid, pTcon, fromPath, FILE_OPEN, DELETE, CREATE_NOT_DIR, &srcfid, &oplock, NULL, @@ -1564,6 +1583,7 @@ cifs_invalidate_mapping(struct inode *inode) cifs_i->write_behind_rc = rc; } invalidate_remote_inode(inode); + cifs_fscache_reset_inode_cookie(inode); } int cifs_revalidate_file(struct file *filp) diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c index 505926f1ee6b..9d38a71c8e14 100644 --- a/fs/cifs/ioctl.c +++ b/fs/cifs/ioctl.c @@ -41,8 +41,7 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) __u64 ExtAttrMask = 0; __u64 caps; struct cifsTconInfo *tcon; - struct cifsFileInfo *pSMBFile = - (struct cifsFileInfo *)filep->private_data; + struct cifsFileInfo *pSMBFile = filep->private_data; #endif /* CONFIG_CIFS_POSIX */ xid = GetXid(); diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c index d35d52889cb5..c6721ee26dbc 100644 --- a/fs/cifs/netmisc.c +++ b/fs/cifs/netmisc.c @@ -61,6 +61,7 @@ static const struct smb_to_posix_error mapping_table_ERRDOS[] = { {ERRremcd, -EACCES}, {ERRdiffdevice, -EXDEV}, {ERRnofiles, -ENOENT}, + {ERRwriteprot, -EROFS}, {ERRbadshare, -ETXTBSY}, {ERRlock, -EACCES}, {ERRunsup, -EINVAL}, @@ -164,7 +165,7 @@ cifs_inet_pton(const int address_family, const char *cp, void *dst) * Returns 0 on failure. 
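The netmisc.c hunk that follows reworks the address helpers: cifs_convert_address() gains the conventional (dst, src) argument order and cifs_fill_sockaddr() layers port assignment on top, switching on sa_family so one call serves IPv4 and IPv6. The same shape in portable userspace C, using inet_pton() in place of the kernel's own parser and keeping the helper's 1-on-success/0-on-failure convention:

#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <stdio.h>
#include <string.h>

/* Parse src into dst and set the port, mirroring cifs_fill_sockaddr(). */
static int fill_sockaddr(struct sockaddr_storage *dst, const char *src,
                         unsigned short port)
{
    struct sockaddr_in *s4 = (struct sockaddr_in *)dst;
    struct sockaddr_in6 *s6 = (struct sockaddr_in6 *)dst;

    memset(dst, 0, sizeof(*dst));
    if (inet_pton(AF_INET, src, &s4->sin_addr) == 1) {
        s4->sin_family = AF_INET;
        s4->sin_port = htons(port);
        return 1;
    }
    if (inet_pton(AF_INET6, src, &s6->sin6_addr) == 1) {
        s6->sin6_family = AF_INET6;
        s6->sin6_port = htons(port);
        return 1;
    }
    return 0;   /* 0 means failure, as in the kernel helper */
}

int main(void)
{
    struct sockaddr_storage ss;
    printf("%d %d\n", fill_sockaddr(&ss, "192.168.0.1", 445),
           fill_sockaddr(&ss, "fe80::1", 445));
    return 0;
}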
*/ int -cifs_convert_address(char *src, void *dst) +cifs_convert_address(struct sockaddr *dst, char *src) { int rc; char *pct, *endp; @@ -201,6 +202,27 @@ cifs_convert_address(char *src, void *dst) return rc; } +int +cifs_fill_sockaddr(struct sockaddr *dst, char *src, + const unsigned short int port) +{ + if (!cifs_convert_address(dst, src)) + return 0; + + switch (dst->sa_family) { + case AF_INET: + ((struct sockaddr_in *)dst)->sin_port = htons(port); + break; + case AF_INET6: + ((struct sockaddr_in6 *)dst)->sin6_port = htons(port); + break; + default: + return 0; + } + + return 1; +} + /***************************************************************************** convert a NT status code to a dos class/code *****************************************************************************/ diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index daf1753af674..d5e591fab475 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -847,6 +847,11 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir) end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + max_len; tmp_buf = kmalloc(UNICODE_NAME_MAX, GFP_KERNEL); + if (tmp_buf == NULL) { + rc = -ENOMEM; + break; + } + for (i = 0; (i < num_to_fill) && (rc == 0); i++) { if (current_entry == NULL) { /* evaluate whether this case is an error */ diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 7707389bdf2c..0a57cb7db5dd 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -730,15 +730,7 @@ ssetup_ntlmssp_authenticate: /* calculate session key */ setup_ntlmv2_rsp(ses, v2_sess_key, nls_cp); - if (first_time) /* should this be moved into common code - with similar ntlmv2 path? */ - /* cifs_calculate_ntlmv2_mac_key(ses->server->mac_signing_key, - response BB FIXME, v2_sess_key); */ - - /* copy session key */ - - /* memcpy(bcc_ptr, (char *)ntlm_session_key,LM2_SESS_KEY_SIZE); - bcc_ptr += LM2_SESS_KEY_SIZE; */ + /* FIXME: calculate MAC key */ memcpy(bcc_ptr, (char *)v2_sess_key, sizeof(struct ntlmv2_resp)); bcc_ptr += sizeof(struct ntlmv2_resp); diff --git a/fs/cifs/smberr.h b/fs/cifs/smberr.h index c5084d27db7c..7f16cb825fe5 100644 --- a/fs/cifs/smberr.h +++ b/fs/cifs/smberr.h @@ -76,6 +76,7 @@ #define ERRnofiles 18 /* A File Search command can find no more files matching the specified criteria. 
*/ +#define ERRwriteprot 19 /* media is write protected */ #define ERRgeneral 31 #define ERRbadshare 32 /* The sharing mode specified for an Open conflicts with existing FIDs on diff --git a/fs/coda/coda_int.h b/fs/coda/coda_int.h index d99860a33890..6b443ff43a19 100644 --- a/fs/coda/coda_int.h +++ b/fs/coda/coda_int.h @@ -11,8 +11,7 @@ extern int coda_fake_statfs; void coda_destroy_inodecache(void); int coda_init_inodecache(void); -int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, - int datasync); +int coda_fsync(struct file *coda_file, int datasync); void coda_sysctl_init(void); void coda_sysctl_clean(void); diff --git a/fs/coda/file.c b/fs/coda/file.c index 7196077b1688..ad3cd2abeeb4 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -202,10 +202,10 @@ int coda_release(struct inode *coda_inode, struct file *coda_file) return 0; } -int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, int datasync) +int coda_fsync(struct file *coda_file, int datasync) { struct file *host_file; - struct inode *coda_inode = coda_dentry->d_inode; + struct inode *coda_inode = coda_file->f_path.dentry->d_inode; struct coda_file_info *cfi; int err = 0; diff --git a/fs/compat.c b/fs/compat.c index 05448730f840..6490d2134ff3 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -568,6 +568,79 @@ out: return ret; } +/* A write operation does a read from user space and vice versa */ +#define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ) + +ssize_t compat_rw_copy_check_uvector(int type, + const struct compat_iovec __user *uvector, unsigned long nr_segs, + unsigned long fast_segs, struct iovec *fast_pointer, + struct iovec **ret_pointer) +{ + compat_ssize_t tot_len; + struct iovec *iov = *ret_pointer = fast_pointer; + ssize_t ret = 0; + int seg; + + /* + * SuS says "The readv() function *may* fail if the iovcnt argument + * was less than or equal to 0, or greater than {IOV_MAX}. Linux has + * traditionally returned zero for zero segments, so... + */ + if (nr_segs == 0) + goto out; + + ret = -EINVAL; + if (nr_segs > UIO_MAXIOV || nr_segs < 0) + goto out; + if (nr_segs > fast_segs) { + ret = -ENOMEM; + iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL); + if (iov == NULL) { + *ret_pointer = fast_pointer; + goto out; + } + } + *ret_pointer = iov; + + /* + * Single unix specification: + * We should -EINVAL if an element length is not >= 0 and fitting an + * ssize_t. The total length is fitting an ssize_t + * + * Be careful here because iov_len is a size_t not an ssize_t + */ + tot_len = 0; + ret = -EINVAL; + for (seg = 0; seg < nr_segs; seg++) { + compat_ssize_t tmp = tot_len; + compat_uptr_t buf; + compat_ssize_t len; + + if (__get_user(len, &uvector->iov_len) || + __get_user(buf, &uvector->iov_base)) { + ret = -EFAULT; + goto out; + } + if (len < 0) /* size_t not fitting in compat_ssize_t .. 
*/ + goto out; + tot_len += len; + if (tot_len < tmp) /* maths overflow on the compat_ssize_t */ + goto out; + if (!access_ok(vrfy_dir(type), compat_ptr(buf), len)) { + ret = -EFAULT; + goto out; + } + iov->iov_base = compat_ptr(buf); + iov->iov_len = (compat_size_t) len; + uvector++; + iov++; + } + ret = tot_len; + +out: + return ret; +} + static inline long copy_iocb(long nr, u32 __user *ptr32, struct iocb __user * __user *ptr64) { @@ -600,7 +673,7 @@ compat_sys_io_submit(aio_context_t ctx_id, int nr, u32 __user *iocb) iocb64 = compat_alloc_user_space(nr * sizeof(*iocb64)); ret = copy_iocb(nr, iocb, iocb64); if (!ret) - ret = sys_io_submit(ctx_id, nr, iocb64); + ret = do_io_submit(ctx_id, nr, iocb64, 1); return ret; } @@ -1077,70 +1150,21 @@ static ssize_t compat_do_readv_writev(int type, struct file *file, { compat_ssize_t tot_len; struct iovec iovstack[UIO_FASTIOV]; - struct iovec *iov=iovstack, *vector; + struct iovec *iov; ssize_t ret; - int seg; io_fn_t fn; iov_fn_t fnv; - /* - * SuS says "The readv() function *may* fail if the iovcnt argument - * was less than or equal to 0, or greater than {IOV_MAX}. Linux has - * traditionally returned zero for zero segments, so... - */ - ret = 0; - if (nr_segs == 0) - goto out; - - /* - * First get the "struct iovec" from user memory and - * verify all the pointers - */ ret = -EINVAL; - if ((nr_segs > UIO_MAXIOV) || (nr_segs <= 0)) - goto out; if (!file->f_op) goto out; - if (nr_segs > UIO_FASTIOV) { - ret = -ENOMEM; - iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL); - if (!iov) - goto out; - } + ret = -EFAULT; if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector))) goto out; - /* - * Single unix specification: - * We should -EINVAL if an element length is not >= 0 and fitting an - * ssize_t. The total length is fitting an ssize_t - * - * Be careful here because iov_len is a size_t not an ssize_t - */ - tot_len = 0; - vector = iov; - ret = -EINVAL; - for (seg = 0 ; seg < nr_segs; seg++) { - compat_ssize_t tmp = tot_len; - compat_ssize_t len; - compat_uptr_t buf; - - if (__get_user(len, &uvector->iov_len) || - __get_user(buf, &uvector->iov_base)) { - ret = -EFAULT; - goto out; - } - if (len < 0) /* size_t not fitting an compat_ssize_t .. 
*/ - goto out; - tot_len += len; - if (tot_len < tmp) /* maths overflow on the compat_ssize_t */ - goto out; - vector->iov_base = compat_ptr(buf); - vector->iov_len = (compat_size_t) len; - uvector++; - vector++; - } + tot_len = compat_rw_copy_check_uvector(type, uvector, nr_segs, + UIO_FASTIOV, iovstack, &iov); if (tot_len == 0) { ret = 0; goto out; diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 641640dc7ae5..8ea5e3374507 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -601,8 +601,11 @@ static int ioc_settimeout(unsigned int fd, unsigned int cmd, } /* Bluetooth ioctls */ -#define HCIUARTSETPROTO _IOW('U', 200, int) -#define HCIUARTGETPROTO _IOR('U', 201, int) +#define HCIUARTSETPROTO _IOW('U', 200, int) +#define HCIUARTGETPROTO _IOR('U', 201, int) +#define HCIUARTGETDEVICE _IOR('U', 202, int) +#define HCIUARTSETFLAGS _IOW('U', 203, int) +#define HCIUARTGETFLAGS _IOR('U', 204, int) #define BNEPCONNADD _IOW('B', 200, int) #define BNEPCONNDEL _IOW('B', 201, int) @@ -1328,6 +1331,8 @@ COMPATIBLE_IOCTL(HCISETLINKPOL) COMPATIBLE_IOCTL(HCISETLINKMODE) COMPATIBLE_IOCTL(HCISETACLMTU) COMPATIBLE_IOCTL(HCISETSCOMTU) +COMPATIBLE_IOCTL(HCIBLOCKADDR) +COMPATIBLE_IOCTL(HCIUNBLOCKADDR) COMPATIBLE_IOCTL(HCIINQUIRY) COMPATIBLE_IOCTL(HCIUARTSETPROTO) COMPATIBLE_IOCTL(HCIUARTGETPROTO) diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index c8af2d91174b..cf78d44a8d6a 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -73,15 +73,6 @@ int configfs_setattr(struct dentry * dentry, struct iattr * iattr) return -EINVAL; sd_iattr = sd->s_iattr; - - error = inode_change_ok(inode, iattr); - if (error) - return error; - - error = inode_setattr(inode, iattr); - if (error) - return error; - if (!sd_iattr) { /* setting attributes for the first time, allocate now */ sd_iattr = kzalloc(sizeof(struct iattr), GFP_KERNEL); @@ -94,9 +85,12 @@ int configfs_setattr(struct dentry * dentry, struct iattr * iattr) sd_iattr->ia_atime = sd_iattr->ia_mtime = sd_iattr->ia_ctime = CURRENT_TIME; sd->s_iattr = sd_iattr; } - /* attributes were changed atleast once in past */ + error = simple_setattr(dentry, iattr); + if (error) + return error; + if (ia_valid & ATTR_UID) sd_iattr->ia_uid = iattr->ia_uid; if (ia_valid & ATTR_GID) diff --git a/fs/dcache.c b/fs/dcache.c index d96047b4a633..86d4db15473e 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -590,6 +590,8 @@ static void prune_dcache(int count) up_read(&sb->s_umount); } spin_lock(&sb_lock); + /* lock was dropped, must reset next */ + list_safe_reset_next(sb, n, s_list); count -= pruned; __put_super(sb); /* more work left to do? */ @@ -894,7 +896,7 @@ EXPORT_SYMBOL(shrink_dcache_parent); * * In this case we return -1 to tell the caller that we baled. 
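The prune_dcache() hunk above is a reminder of the rule behind the new list_safe_reset_next() helper: the next pointer pre-fetched by list_for_each_entry_safe() is only valid while the lock protecting the list is held, so after dropping and retaking sb_lock the cursor must be resampled. A self-contained toy list showing the resampling step (the concurrent removal is simulated in-line, since the point is the stale cursor, not the locking):

#include <stdio.h>

struct node { struct node *prev, *next; int id; };

static void list_del(struct node *n)
{
    n->prev->next = n->next;
    n->next->prev = n->prev;
}

int main(void)
{
    struct node head = { &head, &head, -1 };
    struct node a = { 0 }, b = { 0 }, c = { 0 };
    struct node *items[3] = { &a, &b, &c };
    struct node *pos, *nxt;
    int i;

    for (i = 0; i < 3; i++) {       /* build head -> a -> b -> c */
        items[i]->id = i;
        items[i]->prev = head.prev;
        items[i]->next = &head;
        head.prev->next = items[i];
        head.prev = items[i];
    }

    for (pos = head.next, nxt = pos->next; pos != &head;
         pos = nxt, nxt = pos->next) {
        printf("visiting %d\n", pos->id);
        if (pos == &a)
            list_del(&b);   /* "another thread" unlinks the entry we
                             * pre-fetched as next... */
        nxt = pos->next;    /* ...so resample the cursor, which is what
                             * list_safe_reset_next() does */
    }
    return 0;
}

Without the resample the walk would step through the unlinked (in the kernel, possibly freed) entry b; with it, the output is just "visiting 0" and "visiting 2".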
*/ -static int shrink_dcache_memory(int nr, gfp_t gfp_mask) +static int shrink_dcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask) { if (nr) { if (!(gfp_mask & __GFP_FS)) diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 4d74fc72c195..0210898458b2 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -277,8 +277,10 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_x32, debugfs_u32_get, debugfs_u32_set, "0x%08llx\n" DEFINE_SIMPLE_ATTRIBUTE(fops_x32_ro, debugfs_u32_get, NULL, "0x%08llx\n"); DEFINE_SIMPLE_ATTRIBUTE(fops_x32_wo, NULL, debugfs_u32_set, "0x%08llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_x64, debugfs_u64_get, debugfs_u64_set, "0x%016llx\n"); + /* - * debugfs_create_x{8,16,32} - create a debugfs file that is used to read and write an unsigned {8,16,32}-bit value + * debugfs_create_x{8,16,32,64} - create a debugfs file that is used to read and write an unsigned {8,16,32,64}-bit value * * These functions are exactly the same as the above functions (but use a hex * output for the decimal challenged). For details look at the above unsigned @@ -357,6 +359,23 @@ struct dentry *debugfs_create_x32(const char *name, mode_t mode, } EXPORT_SYMBOL_GPL(debugfs_create_x32); +/** + * debugfs_create_x64 - create a debugfs file that is used to read and write an unsigned 64-bit value + * @name: a pointer to a string containing the name of the file to create. + * @mode: the permission that the file should have + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is %NULL, then the + * file will be created in the root of the debugfs filesystem. + * @value: a pointer to the variable that the file should read to and write + * from. + */ +struct dentry *debugfs_create_x64(const char *name, mode_t mode, + struct dentry *parent, u64 *value) +{ + return debugfs_create_file(name, mode, parent, value, &fops_x64); +} +EXPORT_SYMBOL_GPL(debugfs_create_x64); + static int debugfs_size_t_set(void *data, u64 val) { diff --git a/fs/direct-io.c b/fs/direct-io.c index e82adc2debb7..a10cb91cadea 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -82,6 +82,8 @@ struct dio { int reap_counter; /* rate limit reaping */ get_block_t *get_block; /* block mapping function */ dio_iodone_t *end_io; /* IO completion function */ + dio_submit_t *submit_io; /* IO submission function */ + loff_t logical_offset_in_bio; /* current first logical block in bio */ sector_t final_block_in_bio; /* current final block in bio + 1 */ sector_t next_block_for_io; /* next block to be put under IO, in dio_blocks units */ @@ -96,6 +98,7 @@ struct dio { unsigned cur_page_offset; /* Offset into it, in bytes */ unsigned cur_page_len; /* Nr of bytes at cur_page_offset */ sector_t cur_page_block; /* Where it starts */ + loff_t cur_page_fs_offset; /* Offset in file */ /* BIO completion state */ spinlock_t bio_lock; /* protects BIO fields below */ @@ -215,7 +218,7 @@ static struct page *dio_get_page(struct dio *dio) * filesystems can use it to hold additional state between get_block calls and * dio_complete. 
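The debugfs hunk above fills the missing 64-bit slot in the x8/x16/x32 family. A sketch of how a module would use the new call; the module and attribute names are invented here, and this is shape only, written against the signature added in that hunk:

#include <linux/module.h>
#include <linux/debugfs.h>

static struct dentry *dir;
static u64 last_lba = 0x0123456789abcdefULL;

static int __init x64_demo_init(void)
{
        dir = debugfs_create_dir("x64-demo", NULL);
        if (!dir)
                return -ENOMEM;
        /* reads print 0x%016llx; root can write a new value back */
        debugfs_create_x64("last_lba", 0600, dir, &last_lba);
        return 0;
}

static void __exit x64_demo_exit(void)
{
        debugfs_remove_recursive(dir);
}

module_init(x64_demo_init);
module_exit(x64_demo_exit);
MODULE_LICENSE("GPL");

Reading /sys/kernel/debug/x64-demo/last_lba would then produce the value in the 0x%016llx format wired up through fops_x64.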
*/ -static int dio_complete(struct dio *dio, loff_t offset, int ret) +static int dio_complete(struct dio *dio, loff_t offset, int ret, bool is_async) { ssize_t transferred = 0; @@ -236,14 +239,6 @@ static int dio_complete(struct dio *dio, loff_t offset, int ret) transferred = dio->i_size - offset; } - if (dio->end_io && dio->result) - dio->end_io(dio->iocb, offset, transferred, - dio->map_bh.b_private); - - if (dio->flags & DIO_LOCKING) - /* lockdep: non-owner release */ - up_read_non_owner(&dio->inode->i_alloc_sem); - if (ret == 0) ret = dio->page_errors; if (ret == 0) @@ -251,6 +246,17 @@ static int dio_complete(struct dio *dio, loff_t offset, int ret) if (ret == 0) ret = transferred; + if (dio->end_io && dio->result) { + dio->end_io(dio->iocb, offset, transferred, + dio->map_bh.b_private, ret, is_async); + } else if (is_async) { + aio_complete(dio->iocb, ret, 0); + } + + if (dio->flags & DIO_LOCKING) + /* lockdep: non-owner release */ + up_read_non_owner(&dio->inode->i_alloc_sem); + return ret; } @@ -274,8 +280,7 @@ static void dio_bio_end_aio(struct bio *bio, int error) spin_unlock_irqrestore(&dio->bio_lock, flags); if (remaining == 0) { - int ret = dio_complete(dio, dio->iocb->ki_pos, 0); - aio_complete(dio->iocb, ret, 0); + dio_complete(dio, dio->iocb->ki_pos, 0, true); kfree(dio); } } @@ -300,6 +305,26 @@ static void dio_bio_end_io(struct bio *bio, int error) spin_unlock_irqrestore(&dio->bio_lock, flags); } +/** + * dio_end_io - handle the end io action for the given bio + * @bio: The direct io bio that's being completed + * @error: Error if there was one + * + * This is meant to be called by any filesystem that uses its own dio_submit_t + * so that the DIO specific endio actions are dealt with after the filesystem + * has done its completion work. + */ +void dio_end_io(struct bio *bio, int error) +{ + struct dio *dio = bio->bi_private; + + if (dio->is_async) + dio_bio_end_aio(bio, error); + else + dio_bio_end_io(bio, error); +} +EXPORT_SYMBOL_GPL(dio_end_io); + static int dio_bio_alloc(struct dio *dio, struct block_device *bdev, sector_t first_sector, int nr_vecs) @@ -316,6 +341,7 @@ dio_bio_alloc(struct dio *dio, struct block_device *bdev, bio->bi_end_io = dio_bio_end_io; dio->bio = bio; + dio->logical_offset_in_bio = dio->cur_page_fs_offset; return 0; } @@ -340,10 +366,15 @@ static void dio_bio_submit(struct dio *dio) if (dio->is_async && dio->rw == READ) bio_set_pages_dirty(bio); - submit_bio(dio->rw, bio); + if (dio->submit_io) + dio->submit_io(dio->rw, bio, dio->inode, + dio->logical_offset_in_bio); + else + submit_bio(dio->rw, bio); dio->bio = NULL; dio->boundary = 0; + dio->logical_offset_in_bio = 0; } /* @@ -603,10 +634,26 @@ static int dio_send_cur_page(struct dio *dio) int ret = 0; if (dio->bio) { + loff_t cur_offset = dio->block_in_file << dio->blkbits; + loff_t bio_next_offset = dio->logical_offset_in_bio + + dio->bio->bi_size; + /* - * See whether this new request is contiguous with the old + * See whether this new request is contiguous with the old. + * + * Btrfs cannot handle having logically non-contiguous requests + * submitted. For example if you have + * + * Logical: [0-4095][HOLE][8192-12287] + * Physical: [0-4095] [4096-8191] + * + * We cannot submit those pages together as one BIO. So if our + * current logical offset in the file does not equal what would + * be the next logical offset in the bio, submit the bio we + * have. 
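The arithmetic in that comment is worth seeing with the numbers plugged in. A small standalone program evaluating the new cur_offset/bio_next_offset test for the hole example, showing why physically adjacent blocks still force a submit when the logical offsets disagree:

#include <stdio.h>

int main(void)
{
    /* Numbers from the hole example above: the bio already holds the
     * page for file range 0-4095, and the next page to queue covers
     * file range 8192-12287 even though its blocks are physically
     * adjacent on disk. */
    unsigned int blkbits = 12;                      /* 4096-byte blocks */
    long long logical_offset_in_bio = 0;            /* bio starts at 0 */
    long long bi_size = 4096;                       /* one page queued */
    long long block_in_file = 8192 >> blkbits;      /* next page */

    long long cur_offset = block_in_file << blkbits;
    long long bio_next_offset = logical_offset_in_bio + bi_size;

    printf("cur_offset=%lld bio_next_offset=%lld -> %s\n",
           cur_offset, bio_next_offset,
           cur_offset == bio_next_offset ? "merge into bio"
                                         : "dio_bio_submit() first");
    return 0;
}

Here cur_offset is 8192 while bio_next_offset is 4096, so the bio is submitted first even though final_block_in_bio would have allowed the merge.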
*/ - if (dio->final_block_in_bio != dio->cur_page_block) + if (dio->final_block_in_bio != dio->cur_page_block || + cur_offset != bio_next_offset) dio_bio_submit(dio); /* * Submit now if the underlying fs is about to perform a @@ -701,6 +748,7 @@ submit_page_section(struct dio *dio, struct page *page, dio->cur_page_offset = offset; dio->cur_page_len = len; dio->cur_page_block = blocknr; + dio->cur_page_fs_offset = dio->block_in_file << dio->blkbits; out: return ret; } @@ -935,7 +983,7 @@ static ssize_t direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, const struct iovec *iov, loff_t offset, unsigned long nr_segs, unsigned blkbits, get_block_t get_block, dio_iodone_t end_io, - struct dio *dio) + dio_submit_t submit_io, struct dio *dio) { unsigned long user_addr; unsigned long flags; @@ -952,6 +1000,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, dio->get_block = get_block; dio->end_io = end_io; + dio->submit_io = submit_io; dio->final_block_in_bio = -1; dio->next_block_for_io = -1; @@ -1008,7 +1057,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, } } /* end iovec loop */ - if (ret == -ENOTBLK && (rw & WRITE)) { + if (ret == -ENOTBLK) { /* * The remaining part of the request will be * be handled by buffered I/O when we return @@ -1079,7 +1128,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, spin_unlock_irqrestore(&dio->bio_lock, flags); if (ret2 == 0) { - ret = dio_complete(dio, offset, ret); + ret = dio_complete(dio, offset, ret, false); kfree(dio); } else BUG_ON(ret != -EIOCBQUEUED); @@ -1087,30 +1136,11 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, return ret; } -/* - * This is a library function for use by filesystem drivers. - * - * The locking rules are governed by the flags parameter: - * - if the flags value contains DIO_LOCKING we use a fancy locking - * scheme for dumb filesystems. - * For writes this function is called under i_mutex and returns with - * i_mutex held, for reads, i_mutex is not held on entry, but it is - * taken and dropped again before returning. - * For reads and writes i_alloc_sem is taken in shared mode and released - * on I/O completion (which may happen asynchronously after returning to - * the caller). - * - * - if the flags value does NOT contain DIO_LOCKING we don't use any - * internal locking but rather rely on the filesystem to synchronize - * direct I/O reads/writes versus each other and truncate. - * For reads and writes both i_mutex and i_alloc_sem are not held on - * entry and are never taken. - */ ssize_t -__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, +__blockdev_direct_IO_newtrunc(int rw, struct kiocb *iocb, struct inode *inode, struct block_device *bdev, const struct iovec *iov, loff_t offset, unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, - int flags) + dio_submit_t submit_io, int flags) { int seg; size_t size; @@ -1197,11 +1227,49 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, (end > i_size_read(inode))); retval = direct_io_worker(rw, iocb, inode, iov, offset, - nr_segs, blkbits, get_block, end_io, dio); + nr_segs, blkbits, get_block, end_io, + submit_io, dio); + +out: + return retval; +} +EXPORT_SYMBOL(__blockdev_direct_IO_newtrunc); + +/* + * This is a library function for use by filesystem drivers. + * + * The locking rules are governed by the flags parameter: + * - if the flags value contains DIO_LOCKING we use a fancy locking + * scheme for dumb filesystems. 
+ * For writes this function is called under i_mutex and returns with + * i_mutex held, for reads, i_mutex is not held on entry, but it is + * taken and dropped again before returning. + * For reads and writes i_alloc_sem is taken in shared mode and released + * on I/O completion (which may happen asynchronously after returning to + * the caller). + * + * - if the flags value does NOT contain DIO_LOCKING we don't use any + * internal locking but rather rely on the filesystem to synchronize + * direct I/O reads/writes versus each other and truncate. + * For reads and writes both i_mutex and i_alloc_sem are not held on + * entry and are never taken. + */ +ssize_t +__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, + struct block_device *bdev, const struct iovec *iov, loff_t offset, + unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, + dio_submit_t submit_io, int flags) +{ + ssize_t retval; + retval = __blockdev_direct_IO_newtrunc(rw, iocb, inode, bdev, iov, + offset, nr_segs, get_block, end_io, submit_io, flags); /* * In case of error extending write may have instantiated a few * blocks outside i_size. Trim these off again for DIO_LOCKING. + * NOTE: DIO_NO_LOCK/DIO_OWN_LOCK callers have to handle this in + * their own manner. This is a further example of where the old + * truncate sequence is inadequate. * * NOTE: filesystems with their own locking have to handle this * on their own. @@ -1209,12 +1277,13 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, if (flags & DIO_LOCKING) { if (unlikely((rw & WRITE) && retval < 0)) { loff_t isize = i_size_read(inode); + loff_t end = offset + iov_length(iov, nr_segs); + if (end > isize) vmtruncate(inode, isize); } } -out: return retval; } EXPORT_SYMBOL(__blockdev_direct_IO); diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 3bdddbcc785f..e8fcf4e2ed7d 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -274,7 +274,7 @@ static int ecryptfs_release(struct inode *inode, struct file *file) } static int -ecryptfs_fsync(struct file *file, struct dentry *dentry, int datasync) +ecryptfs_fsync(struct file *file, int datasync) { return vfs_fsync(ecryptfs_file_to_lower(file), datasync); } diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 65dee2f336ae..31ef5252f0fe 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -805,7 +805,7 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia, - (ia->ia_size & ~PAGE_CACHE_MASK)); if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { - rc = vmtruncate(inode, ia->ia_size); + rc = simple_setsize(inode, ia->ia_size); if (rc) goto out; lower_ia->ia_size = ia->ia_size; @@ -830,7 +830,7 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia, goto out; } } - vmtruncate(inode, ia->ia_size); + simple_setsize(inode, ia->ia_size); rc = ecryptfs_write_inode_size_to_metadata(inode); if (rc) { printk(KERN_ERR "Problem with " diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c index 2d8dbce9d485..46c4dd8dfcc3 100644 --- a/fs/ecryptfs/messaging.c +++ b/fs/ecryptfs/messaging.c @@ -31,9 +31,9 @@ static struct mutex ecryptfs_msg_ctx_lists_mux; static struct hlist_head *ecryptfs_daemon_hash; struct mutex ecryptfs_daemon_hash_mux; -static int ecryptfs_hash_buckets; +static int ecryptfs_hash_bits; #define ecryptfs_uid_hash(uid) \ - hash_long((unsigned long)uid, ecryptfs_hash_buckets) + hash_long((unsigned long)uid, ecryptfs_hash_bits) static u32 ecryptfs_msg_counter; static struct ecryptfs_msg_ctx 
*ecryptfs_msg_ctx_arr; @@ -486,18 +486,19 @@ int ecryptfs_init_messaging(void) } mutex_init(&ecryptfs_daemon_hash_mux); mutex_lock(&ecryptfs_daemon_hash_mux); - ecryptfs_hash_buckets = 1; - while (ecryptfs_number_of_users >> ecryptfs_hash_buckets) - ecryptfs_hash_buckets++; + ecryptfs_hash_bits = 1; + while (ecryptfs_number_of_users >> ecryptfs_hash_bits) + ecryptfs_hash_bits++; ecryptfs_daemon_hash = kmalloc((sizeof(struct hlist_head) - * ecryptfs_hash_buckets), GFP_KERNEL); + * (1 << ecryptfs_hash_bits)), + GFP_KERNEL); if (!ecryptfs_daemon_hash) { rc = -ENOMEM; printk(KERN_ERR "%s: Failed to allocate memory\n", __func__); mutex_unlock(&ecryptfs_daemon_hash_mux); goto out; } - for (i = 0; i < ecryptfs_hash_buckets; i++) + for (i = 0; i < (1 << ecryptfs_hash_bits); i++) INIT_HLIST_HEAD(&ecryptfs_daemon_hash[i]); mutex_unlock(&ecryptfs_daemon_hash_mux); ecryptfs_msg_ctx_arr = kmalloc((sizeof(struct ecryptfs_msg_ctx) @@ -554,7 +555,7 @@ void ecryptfs_release_messaging(void) int i; mutex_lock(&ecryptfs_daemon_hash_mux); - for (i = 0; i < ecryptfs_hash_buckets; i++) { + for (i = 0; i < (1 << ecryptfs_hash_bits); i++) { int rc; hlist_for_each_entry(daemon, elem, diff --git a/fs/exec.c b/fs/exec.c index 9badbc0bfb1d..e19de6a80339 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -768,7 +768,6 @@ static int de_thread(struct task_struct *tsk) struct signal_struct *sig = tsk->signal; struct sighand_struct *oldsighand = tsk->sighand; spinlock_t *lock = &oldsighand->siglock; - int count; if (thread_group_empty(tsk)) goto no_thread_group; @@ -785,13 +784,13 @@ static int de_thread(struct task_struct *tsk) spin_unlock_irq(lock); return -EAGAIN; } + sig->group_exit_task = tsk; - zap_other_threads(tsk); + sig->notify_count = zap_other_threads(tsk); + if (!thread_group_leader(tsk)) + sig->notify_count--; - /* Account for the thread group leader hanging around: */ - count = thread_group_leader(tsk) ? 1 : 2; - sig->notify_count = count; - while (atomic_read(&sig->count) > count) { + while (sig->notify_count) { __set_current_state(TASK_UNINTERRUPTIBLE); spin_unlock_irq(lock); schedule(); @@ -1662,12 +1661,15 @@ static int coredump_wait(int exit_code, struct core_state *core_state) struct task_struct *tsk = current; struct mm_struct *mm = tsk->mm; struct completion *vfork_done; - int core_waiters; + int core_waiters = -EBUSY; init_completion(&core_state->startup); core_state->dumper.task = tsk; core_state->dumper.next = NULL; - core_waiters = zap_threads(tsk, mm, core_state, exit_code); + + down_write(&mm->mmap_sem); + if (!mm->core_state) + core_waiters = zap_threads(tsk, mm, core_state, exit_code); up_write(&mm->mmap_sem); if (unlikely(core_waiters < 0)) @@ -1787,21 +1789,61 @@ static void wait_for_dump_helpers(struct file *file) } +/* + * umh_pipe_setup + * helper function to customize the process used + * to collect the core in userspace. Specifically + * it sets up a pipe and installs it as fd 0 (stdin) + * for the process. Returns 0 on success, or + * PTR_ERR on failure. + * Note that it also sets the core limit to 1. This 
This + * is a special value that we use to trap recursive + * core dumps + */ +static int umh_pipe_setup(struct subprocess_info *info) +{ + struct file *rp, *wp; + struct fdtable *fdt; + struct coredump_params *cp = (struct coredump_params *)info->data; + struct files_struct *cf = current->files; + + wp = create_write_pipe(0); + if (IS_ERR(wp)) + return PTR_ERR(wp); + + rp = create_read_pipe(wp, 0); + if (IS_ERR(rp)) { + free_write_pipe(wp); + return PTR_ERR(rp); + } + + cp->file = wp; + + sys_close(0); + fd_install(0, rp); + spin_lock(&cf->file_lock); + fdt = files_fdtable(cf); + FD_SET(0, fdt->open_fds); + FD_CLR(0, fdt->close_on_exec); + spin_unlock(&cf->file_lock); + + /* and disallow core files too */ + current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1}; + + return 0; +} + void do_coredump(long signr, int exit_code, struct pt_regs *regs) { struct core_state core_state; char corename[CORENAME_MAX_SIZE + 1]; struct mm_struct *mm = current->mm; struct linux_binfmt * binfmt; - struct inode * inode; const struct cred *old_cred; struct cred *cred; int retval = 0; int flag = 0; - int ispipe = 0; - char **helper_argv = NULL; - int helper_argc = 0; - int dump_count = 0; + int ispipe; static atomic_t core_dump_count = ATOMIC_INIT(0); struct coredump_params cprm = { .signr = signr, @@ -1820,23 +1862,12 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs) binfmt = mm->binfmt; if (!binfmt || !binfmt->core_dump) goto fail; - - cred = prepare_creds(); - if (!cred) { - retval = -ENOMEM; + if (!__get_dumpable(cprm.mm_flags)) goto fail; - } - down_write(&mm->mmap_sem); - /* - * If another thread got here first, or we are not dumpable, bail out. - */ - if (mm->core_state || !__get_dumpable(cprm.mm_flags)) { - up_write(&mm->mmap_sem); - put_cred(cred); + cred = prepare_creds(); + if (!cred) goto fail; - } - /* * We cannot trust fsuid as being the "true" uid of the * process nor do we know its entire history. We only know it @@ -1849,10 +1880,8 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs) } retval = coredump_wait(exit_code, &core_state); - if (retval < 0) { - put_cred(cred); - goto fail; - } + if (retval < 0) + goto fail_creds; old_cred = override_creds(cred); @@ -1870,19 +1899,19 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs) ispipe = format_corename(corename, signr); unlock_kernel(); - if ((!ispipe) && (cprm.limit < binfmt->min_coredump)) - goto fail_unlock; - if (ispipe) { - if (cprm.limit == 0) { + int dump_count; + char **helper_argv; + + if (cprm.limit == 1) { /* * Normally core limits are irrelevant to pipes, since * we're not writing to the file system, but we use - * cprm.limit of 0 here as a speacial value. Any - * non-zero limit gets set to RLIM_INFINITY below, but + * cprm.limit of 1 here as a speacial value. Any + * non-1 limit gets set to RLIM_INFINITY below, but * a limit of 0 skips the dump. This is a consistent * way to catch recursive crashes. We can still crash - * if the core_pattern binary sets RLIM_CORE = !0 + * if the core_pattern binary sets RLIM_CORE = !1 * but it runs as root, and can do lots of stupid things * Note that we use task_tgid_vnr here to grab the pid * of the process group leader. That way we get the @@ -1890,11 +1919,12 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs) * core_pattern process dies. 
*/ printk(KERN_WARNING - "Process %d(%s) has RLIMIT_CORE set to 0\n", + "Process %d(%s) has RLIMIT_CORE set to 1\n", task_tgid_vnr(current), current->comm); printk(KERN_WARNING "Aborting core\n"); goto fail_unlock; } + cprm.limit = RLIM_INFINITY; dump_count = atomic_inc_return(&core_dump_count); if (core_pipe_limit && (core_pipe_limit < dump_count)) { @@ -1904,71 +1934,74 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs) goto fail_dropcount; } - helper_argv = argv_split(GFP_KERNEL, corename+1, &helper_argc); + helper_argv = argv_split(GFP_KERNEL, corename+1, NULL); if (!helper_argv) { printk(KERN_WARNING "%s failed to allocate memory\n", __func__); goto fail_dropcount; } - cprm.limit = RLIM_INFINITY; - - /* SIGPIPE can happen, but it's just never processed */ - if (call_usermodehelper_pipe(helper_argv[0], helper_argv, NULL, - &cprm.file)) { + retval = call_usermodehelper_fns(helper_argv[0], helper_argv, + NULL, UMH_WAIT_EXEC, umh_pipe_setup, + NULL, &cprm); + argv_free(helper_argv); + if (retval) { printk(KERN_INFO "Core dump to %s pipe failed\n", corename); - goto fail_dropcount; + goto close_fail; } - } else + } else { + struct inode *inode; + + if (cprm.limit < binfmt->min_coredump) + goto fail_unlock; + cprm.file = filp_open(corename, O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag, 0600); - if (IS_ERR(cprm.file)) - goto fail_dropcount; - inode = cprm.file->f_path.dentry->d_inode; - if (inode->i_nlink > 1) - goto close_fail; /* multiple links - don't dump */ - if (!ispipe && d_unhashed(cprm.file->f_path.dentry)) - goto close_fail; - - /* AK: actually i see no reason to not allow this for named pipes etc., - but keep the previous behaviour for now. */ - if (!ispipe && !S_ISREG(inode->i_mode)) - goto close_fail; - /* - * Dont allow local users get cute and trick others to coredump - * into their pre-created files: - * Note, this is not relevant for pipes - */ - if (!ispipe && (inode->i_uid != current_fsuid())) - goto close_fail; - if (!cprm.file->f_op) - goto close_fail; - if (!cprm.file->f_op->write) - goto close_fail; - if (!ispipe && - do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file) != 0) - goto close_fail; + if (IS_ERR(cprm.file)) + goto fail_unlock; - retval = binfmt->core_dump(&cprm); + inode = cprm.file->f_path.dentry->d_inode; + if (inode->i_nlink > 1) + goto close_fail; + if (d_unhashed(cprm.file->f_path.dentry)) + goto close_fail; + /* + * AK: actually i see no reason to not allow this for named + * pipes etc, but keep the previous behaviour for now. + */ + if (!S_ISREG(inode->i_mode)) + goto close_fail; + /* + * Dont allow local users get cute and trick others to coredump + * into their pre-created files. 
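Restated outside the kernel, the disk-dump checks above amount to: refuse symlinks, multiply linked files, non-regular files, and files owned by someone else, then truncate and write. A userspace approximation (illustrative only; the kernel applies these rules against fsuid, not getuid()):

#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

/* Open a core target the way do_coredump() vets one. */
static int open_core_target(const char *path)
{
    struct stat st;
    int fd = open(path, O_CREAT | O_WRONLY | O_NOFOLLOW, 0600);

    if (fd < 0)
        return -1;
    if (fstat(fd, &st) < 0 || !S_ISREG(st.st_mode) ||
        st.st_nlink > 1 || st.st_uid != getuid()) {
        close(fd);
        return -1;      /* a pre-created trap, refuse to dump */
    }
    if (ftruncate(fd, 0) < 0) {
        close(fd);
        return -1;
    }
    return fd;
}

int main(void)
{
    int fd = open_core_target("/tmp/core.demo");
    printf("target fd: %d\n", fd);
    if (fd >= 0)
        close(fd);
    return 0;
}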
+ */ + if (inode->i_uid != current_fsuid()) + goto close_fail; + if (!cprm.file->f_op || !cprm.file->f_op->write) + goto close_fail; + if (do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file)) + goto close_fail; + } + retval = binfmt->core_dump(&cprm); if (retval) current->signal->group_exit_code |= 0x80; -close_fail: + if (ispipe && core_pipe_limit) wait_for_dump_helpers(cprm.file); - filp_close(cprm.file, NULL); +close_fail: + if (cprm.file) + filp_close(cprm.file, NULL); fail_dropcount: - if (dump_count) + if (ispipe) atomic_dec(&core_dump_count); fail_unlock: - if (helper_argv) - argv_free(helper_argv); - + coredump_finish(mm); revert_creds(old_cred); +fail_creds: put_cred(cred); - coredump_finish(mm); fail: return; } diff --git a/fs/exofs/file.c b/fs/exofs/file.c index 839b9dc1e70f..fef6899be397 100644 --- a/fs/exofs/file.c +++ b/fs/exofs/file.c @@ -40,12 +40,11 @@ static int exofs_release_file(struct inode *inode, struct file *filp) return 0; } -static int exofs_file_fsync(struct file *filp, struct dentry *dentry, - int datasync) +static int exofs_file_fsync(struct file *filp, int datasync) { int ret; struct address_space *mapping = filp->f_mapping; - struct inode *inode = dentry->d_inode; + struct inode *inode = mapping->host; struct super_block *sb; ret = filemap_write_and_wait(mapping); @@ -66,7 +65,7 @@ static int exofs_file_fsync(struct file *filp, struct dentry *dentry, static int exofs_flush(struct file *file, fl_owner_t id) { - exofs_file_fsync(file, file->f_path.dentry, 1); + exofs_file_fsync(file, 1); /* TODO: Flush the OSD target */ return 0; } diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index ca7e2a0ed98a..2bcc0431bada 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -200,6 +200,7 @@ ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl) return error; else { inode->i_mode = mode; + inode->i_ctime = CURRENT_TIME_SEC; mark_inode_dirty(inode); if (error == 0) acl = NULL; diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index 0b038e47ad2f..52b34f1d2738 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -122,7 +122,6 @@ extern int ext2_write_inode (struct inode *, struct writeback_control *); extern void ext2_delete_inode (struct inode *); extern int ext2_sync_inode (struct inode *); extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int); -extern void ext2_truncate (struct inode *); extern int ext2_setattr (struct dentry *, struct iattr *); extern void ext2_set_inode_flags(struct inode *inode); extern void ext2_get_inode_flags(struct ext2_inode_info *); @@ -155,7 +154,7 @@ extern void ext2_write_super (struct super_block *); extern const struct file_operations ext2_dir_operations; /* file.c */ -extern int ext2_fsync(struct file *file, struct dentry *dentry, int datasync); +extern int ext2_fsync(struct file *file, int datasync); extern const struct inode_operations ext2_file_inode_operations; extern const struct file_operations ext2_file_operations; extern const struct file_operations ext2_xip_file_operations; diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 5d198d0697fb..49eec9456c5b 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -40,13 +40,13 @@ static int ext2_release_file (struct inode * inode, struct file * filp) return 0; } -int ext2_fsync(struct file *file, struct dentry *dentry, int datasync) +int ext2_fsync(struct file *file, int datasync) { int ret; - struct super_block *sb = dentry->d_inode->i_sb; + struct super_block *sb = file->f_mapping->host->i_sb; struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping; - ret = 
simple_fsync(file, dentry, datasync); + ret = generic_file_fsync(file, datasync); if (ret == -EIO || test_and_clear_bit(AS_EIO, &mapping->flags)) { /* We don't really know where the IO error happened... */ ext2_error(sb, __func__, @@ -95,7 +95,6 @@ const struct file_operations ext2_xip_file_operations = { #endif const struct inode_operations ext2_file_inode_operations = { - .truncate = ext2_truncate, #ifdef CONFIG_EXT2_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 527c46d9bc1f..3675088cb88c 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -54,6 +54,18 @@ static inline int ext2_inode_is_fast_symlink(struct inode *inode) inode->i_blocks - ea_blocks == 0); } +static void ext2_truncate_blocks(struct inode *inode, loff_t offset); + +static void ext2_write_failed(struct address_space *mapping, loff_t to) +{ + struct inode *inode = mapping->host; + + if (to > inode->i_size) { + truncate_pagecache(inode, to, inode->i_size); + ext2_truncate_blocks(inode, inode->i_size); + } +} + /* * Called at the last iput() if i_nlink is zero. */ @@ -71,7 +83,7 @@ void ext2_delete_inode (struct inode * inode) inode->i_size = 0; if (inode->i_blocks) - ext2_truncate (inode); + ext2_truncate_blocks(inode, 0); ext2_free_inode (inode); return; @@ -757,8 +769,8 @@ int __ext2_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) { - return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, - ext2_get_block); + return block_write_begin_newtrunc(file, mapping, pos, len, flags, + pagep, fsdata, ext2_get_block); } static int @@ -766,8 +778,25 @@ ext2_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) { + int ret; + *pagep = NULL; - return __ext2_write_begin(file, mapping, pos, len, flags, pagep,fsdata); + ret = __ext2_write_begin(file, mapping, pos, len, flags, pagep, fsdata); + if (ret < 0) + ext2_write_failed(mapping, pos + len); + return ret; +} + +static int ext2_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + int ret; + + ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); + if (ret < len) + ext2_write_failed(mapping, pos + len); + return ret; } static int @@ -775,13 +804,18 @@ ext2_nobh_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) { + int ret; + /* * Dir-in-pagecache still uses ext2_write_begin. Would have to rework * directory handling code to pass around offsets rather than struct * pages in order to make this work easily. 
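ext2_write_failed() above encodes a simple invariant for the new truncate sequence: if an extending write fails after some blocks were instantiated, trim both the page cache and the block tree back to i_size so no half-written tail survives. The same invariant restated with plain syscalls (the file name is made up):

#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
    char buf[4096] = { 0 };
    struct stat st;
    off_t old_size;
    int fd = open("/tmp/extend.demo", O_RDWR | O_CREAT, 0600);

    if (fd < 0 || fstat(fd, &st) < 0)
        return 1;
    old_size = st.st_size;

    /* an extending write; on a short or failed write, roll back */
    if (pwrite(fd, buf, sizeof(buf), old_size) != (ssize_t)sizeof(buf))
        ftruncate(fd, old_size);    /* undo the partial extension,
                                     * like ext2_write_failed() */
    close(fd);
    return 0;
}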
*/ - return nobh_write_begin(file, mapping, pos, len, flags, pagep, fsdata, - ext2_get_block); + ret = nobh_write_begin_newtrunc(file, mapping, pos, len, flags, pagep, + fsdata, ext2_get_block); + if (ret < 0) + ext2_write_failed(mapping, pos + len); + return ret; } static int ext2_nobh_writepage(struct page *page, @@ -800,10 +834,15 @@ ext2_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs) { struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; - - return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, - offset, nr_segs, ext2_get_block, NULL); + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + ssize_t ret; + + ret = blockdev_direct_IO_newtrunc(rw, iocb, inode, inode->i_sb->s_bdev, + iov, offset, nr_segs, ext2_get_block, NULL); + if (ret < 0 && (rw & WRITE)) + ext2_write_failed(mapping, offset + iov_length(iov, nr_segs)); + return ret; } static int @@ -818,7 +857,7 @@ const struct address_space_operations ext2_aops = { .writepage = ext2_writepage, .sync_page = block_sync_page, .write_begin = ext2_write_begin, - .write_end = generic_write_end, + .write_end = ext2_write_end, .bmap = ext2_bmap, .direct_IO = ext2_direct_IO, .writepages = ext2_writepages, @@ -1027,7 +1066,7 @@ static void ext2_free_branches(struct inode *inode, __le32 *p, __le32 *q, int de ext2_free_data(inode, p, q); } -void ext2_truncate(struct inode *inode) +static void __ext2_truncate_blocks(struct inode *inode, loff_t offset) { __le32 *i_data = EXT2_I(inode)->i_data; struct ext2_inode_info *ei = EXT2_I(inode); @@ -1039,27 +1078,8 @@ void ext2_truncate(struct inode *inode) int n; long iblock; unsigned blocksize; - - if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || - S_ISLNK(inode->i_mode))) - return; - if (ext2_inode_is_fast_symlink(inode)) - return; - if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) - return; - blocksize = inode->i_sb->s_blocksize; - iblock = (inode->i_size + blocksize-1) - >> EXT2_BLOCK_SIZE_BITS(inode->i_sb); - - if (mapping_is_xip(inode->i_mapping)) - xip_truncate_page(inode->i_mapping, inode->i_size); - else if (test_opt(inode->i_sb, NOBH)) - nobh_truncate_page(inode->i_mapping, - inode->i_size, ext2_get_block); - else - block_truncate_page(inode->i_mapping, - inode->i_size, ext2_get_block); + iblock = (offset + blocksize-1) >> EXT2_BLOCK_SIZE_BITS(inode->i_sb); n = ext2_block_to_path(inode, iblock, offsets, NULL); if (n == 0) @@ -1127,6 +1147,62 @@ do_indirects: ext2_discard_reservation(inode); mutex_unlock(&ei->truncate_mutex); +} + +static void ext2_truncate_blocks(struct inode *inode, loff_t offset) +{ + /* + * XXX: it seems like a bug here that we don't allow + * IS_APPEND inode to have blocks-past-i_size trimmed off. + * review and fix this. + * + * Also would be nice to be able to handle IO errors and such, + * but that's probably too much to ask. 
+ */ + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode))) + return; + if (ext2_inode_is_fast_symlink(inode)) + return; + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return; + __ext2_truncate_blocks(inode, offset); +} + +int ext2_setsize(struct inode *inode, loff_t newsize) +{ + loff_t oldsize; + int error; + + error = inode_newsize_ok(inode, newsize); + if (error) + return error; + + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode))) + return -EINVAL; + if (ext2_inode_is_fast_symlink(inode)) + return -EINVAL; + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return -EPERM; + + if (mapping_is_xip(inode->i_mapping)) + error = xip_truncate_page(inode->i_mapping, newsize); + else if (test_opt(inode->i_sb, NOBH)) + error = nobh_truncate_page(inode->i_mapping, + newsize, ext2_get_block); + else + error = block_truncate_page(inode->i_mapping, + newsize, ext2_get_block); + if (error) + return error; + + oldsize = inode->i_size; + i_size_write(inode, newsize); + truncate_pagecache(inode, oldsize, newsize); + + __ext2_truncate_blocks(inode, newsize); + inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; if (inode_needs_sync(inode)) { sync_mapping_buffers(inode->i_mapping); @@ -1134,6 +1210,8 @@ do_indirects: } else { mark_inode_dirty(inode); } + + return 0; } static struct ext2_inode *ext2_get_inode(struct super_block *sb, ino_t ino, @@ -1474,8 +1552,15 @@ int ext2_setattr(struct dentry *dentry, struct iattr *iattr) if (error) return error; } - error = inode_setattr(inode, iattr); - if (!error && (iattr->ia_valid & ATTR_MODE)) + if (iattr->ia_valid & ATTR_SIZE && iattr->ia_size != inode->i_size) { + error = ext2_setsize(inode, iattr->ia_size); + if (error) + return error; + } + generic_setattr(inode, iattr); + if (iattr->ia_valid & ATTR_MODE) error = ext2_acl_chmod(inode); + mark_inode_dirty(inode); + return error; } diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 71e9eb1fa696..7ff43f4a59cd 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -119,6 +119,8 @@ static void ext2_put_super (struct super_block * sb) int i; struct ext2_sb_info *sbi = EXT2_SB(sb); + dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); + if (sb->s_dirt) ext2_write_super(sb); @@ -1063,6 +1065,12 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) sb->s_op = &ext2_sops; sb->s_export_op = &ext2_export_ops; sb->s_xattr = ext2_xattr_handlers; + +#ifdef CONFIG_QUOTA + sb->dq_op = &dquot_operations; + sb->s_qcop = &dquot_quotactl_ops; +#endif + root = ext2_iget(sb, EXT2_ROOT_INO); if (IS_ERR(root)) { ret = PTR_ERR(root); @@ -1241,6 +1249,7 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data) spin_unlock(&sbi->s_lock); return 0; } + /* * OK, we are remounting a valid rw partition rdonly, so set * the rdonly flag and then mark the partition as valid again. 
@@ -1248,6 +1257,13 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data) es->s_state = cpu_to_le16(sbi->s_mount_state); es->s_mtime = cpu_to_le32(get_seconds()); spin_unlock(&sbi->s_lock); + + err = dquot_suspend(sb, -1); + if (err < 0) { + spin_lock(&sbi->s_lock); + goto restore_opts; + } + ext2_sync_super(sb, es, 1); } else { __le32 ret = EXT2_HAS_RO_COMPAT_FEATURE(sb, @@ -1269,8 +1285,12 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data) if (!ext2_setup_super (sb, es, 0)) sb->s_flags &= ~MS_RDONLY; spin_unlock(&sbi->s_lock); + ext2_write_super(sb); + + dquot_resume(sb, -1); } + return 0; restore_opts: sbi->s_mount_opt = old_opts.s_mount_opt; diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c index 01552abbca3c..8a11fe212183 100644 --- a/fs/ext3/acl.c +++ b/fs/ext3/acl.c @@ -205,6 +205,7 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type, return error; else { inode->i_mode = mode; + inode->i_ctime = CURRENT_TIME_SEC; ext3_mark_inode_dirty(handle, inode); if (error == 0) acl = NULL; diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c index 373fa90c796a..e2e72c367cf6 100644 --- a/fs/ext3/dir.c +++ b/fs/ext3/dir.c @@ -297,7 +297,7 @@ static void free_rb_tree_fname(struct rb_root *root) kfree (old); } if (!parent) - root->rb_node = NULL; + *root = RB_ROOT; else if (parent->rb_left == n) parent->rb_left = NULL; else if (parent->rb_right == n) diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c index fcf7487734b6..d7e9f74dc3a6 100644 --- a/fs/ext3/fsync.c +++ b/fs/ext3/fsync.c @@ -43,9 +43,9 @@ * inode to disk. */ -int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync) +int ext3_sync_file(struct file *file, int datasync) { - struct inode *inode = dentry->d_inode; + struct inode *inode = file->f_mapping->host; struct ext3_inode_info *ei = EXT3_I(inode); journal_t *journal = EXT3_SB(inode->i_sb)->s_journal; int ret, needs_barrier = 0; diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 0fc1293d0e96..6c953bb255e7 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -410,6 +410,8 @@ static void ext3_put_super (struct super_block * sb) struct ext3_super_block *es = sbi->s_es; int i, err; + dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); + lock_kernel(); ext3_xattr_put_super(sb); @@ -748,7 +750,7 @@ static int ext3_release_dquot(struct dquot *dquot); static int ext3_mark_dquot_dirty(struct dquot *dquot); static int ext3_write_info(struct super_block *sb, int type); static int ext3_quota_on(struct super_block *sb, int type, int format_id, - char *path, int remount); + char *path); static int ext3_quota_on_mount(struct super_block *sb, int type); static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data, size_t len, loff_t off); @@ -767,12 +769,12 @@ static const struct dquot_operations ext3_quota_operations = { static const struct quotactl_ops ext3_qctl_operations = { .quota_on = ext3_quota_on, - .quota_off = vfs_quota_off, - .quota_sync = vfs_quota_sync, - .get_info = vfs_get_dqinfo, - .set_info = vfs_set_dqinfo, - .get_dqblk = vfs_get_dqblk, - .set_dqblk = vfs_set_dqblk + .quota_off = dquot_quota_off, + .quota_sync = dquot_quota_sync, + .get_info = dquot_get_dqinfo, + .set_info = dquot_set_dqinfo, + .get_dqblk = dquot_get_dqblk, + .set_dqblk = dquot_set_dqblk }; #endif @@ -1527,7 +1529,7 @@ static void ext3_orphan_cleanup (struct super_block * sb, /* Turn quotas off */ for (i = 0; i < MAXQUOTAS; i++) { if (sb_dqopt(sb)->files[i]) - vfs_quota_off(sb, i, 0); + dquot_quota_off(sb, i); } #endif 
sb->s_flags = s_flags; /* Restore MS_RDONLY status */ @@ -2551,6 +2553,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) ext3_fsblk_t n_blocks_count = 0; unsigned long old_sb_flags; struct ext3_mount_options old_opts; + int enable_quota = 0; int err; #ifdef CONFIG_QUOTA int i; @@ -2597,6 +2600,10 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) } if (*flags & MS_RDONLY) { + err = dquot_suspend(sb, -1); + if (err < 0) + goto restore_opts; + /* * First of all, the unconditional stuff we have to do * to disable replay of the journal when we next remount @@ -2651,6 +2658,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) goto restore_opts; if (!ext3_setup_super (sb, es, 0)) sb->s_flags &= ~MS_RDONLY; + enable_quota = 1; } } #ifdef CONFIG_QUOTA @@ -2662,6 +2670,9 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) #endif unlock_super(sb); unlock_kernel(); + + if (enable_quota) + dquot_resume(sb, -1); return 0; restore_opts: sb->s_flags = old_sb_flags; @@ -2851,24 +2862,21 @@ static int ext3_write_info(struct super_block *sb, int type) */ static int ext3_quota_on_mount(struct super_block *sb, int type) { - return vfs_quota_on_mount(sb, EXT3_SB(sb)->s_qf_names[type], - EXT3_SB(sb)->s_jquota_fmt, type); + return dquot_quota_on_mount(sb, EXT3_SB(sb)->s_qf_names[type], + EXT3_SB(sb)->s_jquota_fmt, type); } /* * Standard function to be called on quota_on */ static int ext3_quota_on(struct super_block *sb, int type, int format_id, - char *name, int remount) + char *name) { int err; struct path path; if (!test_opt(sb, QUOTA)) return -EINVAL; - /* When remounting, no checks are needed and in fact, name is NULL */ - if (remount) - return vfs_quota_on(sb, type, format_id, name, remount); err = kern_path(name, LOOKUP_FOLLOW, &path); if (err) @@ -2906,7 +2914,7 @@ static int ext3_quota_on(struct super_block *sb, int type, int format_id, } } - err = vfs_quota_on_path(sb, type, format_id, &path); + err = dquot_quota_on_path(sb, type, format_id, &path); path_put(&path); return err; } diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index d2f37a5516c7..95b7594c76f9 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -591,14 +591,15 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, ret = ext4_mb_new_blocks(handle, &ar, errp); if (count) *count = ar.len; - /* - * Account for the allocated meta blocks + * Account for the allocated meta blocks. We will never + * fail EDQUOT for metdata, but we do account for it. 
*/ if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) { spin_lock(&EXT4_I(inode)->i_block_reservation_lock); EXT4_I(inode)->i_allocated_meta_blocks += ar.len; spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); + dquot_alloc_block_nofail(inode, ar.len); } return ret; } diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index 538c48655084..5b6973fbf1bd 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -72,9 +72,9 @@ static int add_system_zone(struct ext4_sb_info *sbi, else if (start_blk >= (entry->start_blk + entry->count)) n = &(*n)->rb_right; else { - if (start_blk + count > (entry->start_blk + + if (start_blk + count > (entry->start_blk + entry->count)) - entry->count = (start_blk + count - + entry->count = (start_blk + count - entry->start_blk); new_node = *n; new_entry = rb_entry(new_node, struct ext4_system_zone, diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 86cb6d86a048..ea5e6cb7e2a5 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -83,11 +83,10 @@ int ext4_check_dir_entry(const char *function, struct inode *dir, error_msg = "inode out of bounds"; if (error_msg != NULL) - __ext4_error(dir->i_sb, function, - "bad entry in directory #%lu: %s - block=%llu" + ext4_error_inode(function, dir, + "bad entry in directory: %s - block=%llu" "offset=%u(%u), inode=%u, rec_len=%d, name_len=%d", - dir->i_ino, error_msg, - (unsigned long long) bh->b_blocknr, + error_msg, (unsigned long long) bh->b_blocknr, (unsigned) (offset%bh->b_size), offset, le32_to_cpu(de->inode), rlen, de->name_len); @@ -111,7 +110,7 @@ static int ext4_readdir(struct file *filp, if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb, EXT4_FEATURE_COMPAT_DIR_INDEX) && - ((EXT4_I(inode)->i_flags & EXT4_INDEX_FL) || + ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) || ((inode->i_size >> sb->s_blocksize_bits) == 1))) { err = ext4_dx_readdir(filp, dirent, filldir); if (err != ERR_BAD_DX_DIR) { @@ -122,20 +121,20 @@ static int ext4_readdir(struct file *filp, * We don't set the inode dirty flag since it's not * critical that it get flushed back to the disk. 
*/ - EXT4_I(filp->f_path.dentry->d_inode)->i_flags &= ~EXT4_INDEX_FL; + ext4_clear_inode_flag(filp->f_path.dentry->d_inode, EXT4_INODE_INDEX); } stored = 0; offset = filp->f_pos & (sb->s_blocksize - 1); while (!error && !stored && filp->f_pos < inode->i_size) { - ext4_lblk_t blk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb); - struct buffer_head map_bh; + struct ext4_map_blocks map; struct buffer_head *bh = NULL; - map_bh.b_state = 0; - err = ext4_get_blocks(NULL, inode, blk, 1, &map_bh, 0); + map.m_lblk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb); + map.m_len = 1; + err = ext4_map_blocks(NULL, inode, &map, 0); if (err > 0) { - pgoff_t index = map_bh.b_blocknr >> + pgoff_t index = map.m_pblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits); if (!ra_has_index(&filp->f_ra, index)) page_cache_sync_readahead( @@ -143,7 +142,7 @@ static int ext4_readdir(struct file *filp, &filp->f_ra, filp, index, 1); filp->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; - bh = ext4_bread(NULL, inode, blk, 0, &err); + bh = ext4_bread(NULL, inode, map.m_lblk, 0, &err); } /* @@ -152,9 +151,8 @@ static int ext4_readdir(struct file *filp, */ if (!bh) { if (!dir_has_error) { - ext4_error(sb, "directory #%lu " + EXT4_ERROR_INODE(inode, "directory " "contains a hole at offset %Lu", - inode->i_ino, (unsigned long long) filp->f_pos); dir_has_error = 1; } diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index bf938cf7c5f0..19a4de57128a 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -29,6 +29,9 @@ #include <linux/wait.h> #include <linux/blockgroup_lock.h> #include <linux/percpu_counter.h> +#ifdef __KERNEL__ +#include <linux/compat.h> +#endif /* * The fourth extended filesystem constants/structures @@ -54,10 +57,10 @@ #endif #define EXT4_ERROR_INODE(inode, fmt, a...) \ - ext4_error_inode(__func__, (inode), (fmt), ## a); + ext4_error_inode(__func__, (inode), (fmt), ## a) #define EXT4_ERROR_FILE(file, fmt, a...) \ - ext4_error_file(__func__, (file), (fmt), ## a); + ext4_error_file(__func__, (file), (fmt), ## a) /* data type for block offset of block group */ typedef int ext4_grpblk_t; @@ -72,7 +75,7 @@ typedef __u32 ext4_lblk_t; typedef unsigned int ext4_group_t; /* - * Flags used in mballoc's allocation_context flags field. + * Flags used in mballoc's allocation_context flags field. * * Also used to show what's going on for debugging purposes when the * flag field is exported via the traceport interface @@ -126,6 +129,29 @@ struct ext4_allocation_request { }; /* + * Logical to physical block mapping, used by ext4_map_blocks() + * + * This structure is used to pass requests into ext4_map_blocks() as + * well as to store the information returned by ext4_map_blocks(). It + * takes less room on the stack than a struct buffer_head. 
+ */ +#define EXT4_MAP_NEW (1 << BH_New) +#define EXT4_MAP_MAPPED (1 << BH_Mapped) +#define EXT4_MAP_UNWRITTEN (1 << BH_Unwritten) +#define EXT4_MAP_BOUNDARY (1 << BH_Boundary) +#define EXT4_MAP_UNINIT (1 << BH_Uninit) +#define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\ + EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\ + EXT4_MAP_UNINIT) + +struct ext4_map_blocks { + ext4_fsblk_t m_pblk; + ext4_lblk_t m_lblk; + unsigned int m_len; + unsigned int m_flags; +}; + +/* * For delayed allocation tracking */ struct mpage_da_data { @@ -321,6 +347,83 @@ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags) return flags & EXT4_OTHER_FLMASK; } +/* + * Inode flags used for atomic set/get + */ +enum { + EXT4_INODE_SECRM = 0, /* Secure deletion */ + EXT4_INODE_UNRM = 1, /* Undelete */ + EXT4_INODE_COMPR = 2, /* Compress file */ + EXT4_INODE_SYNC = 3, /* Synchronous updates */ + EXT4_INODE_IMMUTABLE = 4, /* Immutable file */ + EXT4_INODE_APPEND = 5, /* writes to file may only append */ + EXT4_INODE_NODUMP = 6, /* do not dump file */ + EXT4_INODE_NOATIME = 7, /* do not update atime */ +/* Reserved for compression usage... */ + EXT4_INODE_DIRTY = 8, + EXT4_INODE_COMPRBLK = 9, /* One or more compressed clusters */ + EXT4_INODE_NOCOMPR = 10, /* Don't compress */ + EXT4_INODE_ECOMPR = 11, /* Compression error */ +/* End compression flags --- maybe not all used */ + EXT4_INODE_INDEX = 12, /* hash-indexed directory */ + EXT4_INODE_IMAGIC = 13, /* AFS directory */ + EXT4_INODE_JOURNAL_DATA = 14, /* file data should be journaled */ + EXT4_INODE_NOTAIL = 15, /* file tail should not be merged */ + EXT4_INODE_DIRSYNC = 16, /* dirsync behaviour (directories only) */ + EXT4_INODE_TOPDIR = 17, /* Top of directory hierarchies*/ + EXT4_INODE_HUGE_FILE = 18, /* Set to each huge file */ + EXT4_INODE_EXTENTS = 19, /* Inode uses extents */ + EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */ + EXT4_INODE_EOFBLOCKS = 22, /* Blocks allocated beyond EOF */ + EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */ +}; + +#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG)) +#define CHECK_FLAG_VALUE(FLAG) if (!TEST_FLAG_VALUE(FLAG)) { \ + printk(KERN_EMERG "EXT4 flag fail: " #FLAG ": %d %d\n", \ + EXT4_##FLAG##_FL, EXT4_INODE_##FLAG); BUG_ON(1); } + +/* + * Since it's pretty easy to mix up bit numbers and hex values, and we + * can't do a compile-time test for ENUM values, we use a run-time + * test to make sure that EXT4_XXX_FL is consistent with respect to + * EXT4_INODE_XXX. If all is well the printk and BUG_ON will all drop + * out so it won't cost any extra space in the compiled kernel image. + * But it's important that these values are the same, since we are + * using EXT4_INODE_XXX to test for the flag values, but EXT4_XX_FL + * must be consistent with the values of FS_XXX_FL defined in + * include/linux/fs.h and the on-disk values found in ext2, ext3, and + * ext4 filesystems, and of course the values defined in e2fsprogs. + * + * It's not paranoia if the Murphy's Law really *is* out to get you. 
:-) + */ +static inline void ext4_check_flag_values(void) +{ + CHECK_FLAG_VALUE(SECRM); + CHECK_FLAG_VALUE(UNRM); + CHECK_FLAG_VALUE(COMPR); + CHECK_FLAG_VALUE(SYNC); + CHECK_FLAG_VALUE(IMMUTABLE); + CHECK_FLAG_VALUE(APPEND); + CHECK_FLAG_VALUE(NODUMP); + CHECK_FLAG_VALUE(NOATIME); + CHECK_FLAG_VALUE(DIRTY); + CHECK_FLAG_VALUE(COMPRBLK); + CHECK_FLAG_VALUE(NOCOMPR); + CHECK_FLAG_VALUE(ECOMPR); + CHECK_FLAG_VALUE(INDEX); + CHECK_FLAG_VALUE(IMAGIC); + CHECK_FLAG_VALUE(JOURNAL_DATA); + CHECK_FLAG_VALUE(NOTAIL); + CHECK_FLAG_VALUE(DIRSYNC); + CHECK_FLAG_VALUE(TOPDIR); + CHECK_FLAG_VALUE(HUGE_FILE); + CHECK_FLAG_VALUE(EXTENTS); + CHECK_FLAG_VALUE(EA_INODE); + CHECK_FLAG_VALUE(EOFBLOCKS); + CHECK_FLAG_VALUE(RESERVED); +} + /* Used to pass group descriptor data when online resize is done */ struct ext4_new_group_input { __u32 group; /* Group number for this data */ @@ -332,6 +435,18 @@ struct ext4_new_group_input { __u16 unused; }; +#if defined(__KERNEL__) && defined(CONFIG_COMPAT) +struct compat_ext4_new_group_input { + u32 group; + compat_u64 block_bitmap; + compat_u64 inode_bitmap; + compat_u64 inode_table; + u32 blocks_count; + u16 reserved_blocks; + u16 unused; +}; +#endif + /* The struct ext4_new_group_input in kernel space, with free_blocks_count */ struct ext4_new_group_data { __u32 group; @@ -355,7 +470,7 @@ struct ext4_new_group_data { #define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT (EXT4_GET_BLOCKS_UNINIT_EXT|\ EXT4_GET_BLOCKS_CREATE) /* Caller is from the delayed allocation writeout path, - so set the magic i_delalloc_reserve_flag after taking the + so set the magic i_delalloc_reserve_flag after taking the inode allocation semaphore for */ #define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004 /* caller is from the direct IO path, request to creation of an @@ -398,6 +513,7 @@ struct ext4_new_group_data { #define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12) #define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent) +#if defined(__KERNEL__) && defined(CONFIG_COMPAT) /* * ioctl commands in 32 bit emulation */ @@ -408,11 +524,13 @@ struct ext4_new_group_data { #define EXT4_IOC32_GETRSVSZ _IOR('f', 5, int) #define EXT4_IOC32_SETRSVSZ _IOW('f', 6, int) #define EXT4_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int) +#define EXT4_IOC32_GROUP_ADD _IOW('f', 8, struct compat_ext4_new_group_input) #ifdef CONFIG_JBD2_DEBUG #define EXT4_IOC32_WAIT_FOR_READONLY _IOR('f', 99, int) #endif #define EXT4_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION #define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION +#endif /* @@ -616,9 +734,8 @@ struct ext4_ext_cache { */ struct ext4_inode_info { __le32 i_data[15]; /* unconverted */ - __u32 i_flags; - ext4_fsblk_t i_file_acl; __u32 i_dtime; + ext4_fsblk_t i_file_acl; /* * i_block_group is the number of the block group which contains @@ -629,6 +746,7 @@ struct ext4_inode_info { */ ext4_group_t i_block_group; unsigned long i_state_flags; /* Dynamic state flags */ + unsigned long i_flags; ext4_lblk_t i_dir_start_lookup; #ifdef CONFIG_EXT4_FS_XATTR @@ -1062,22 +1180,25 @@ enum { EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */ EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */ EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/ + EXT4_STATE_NEWENTRY, /* File just added to dir */ }; -static inline int ext4_test_inode_state(struct inode *inode, int bit) -{ - return test_bit(bit, &EXT4_I(inode)->i_state_flags); -} - -static inline void ext4_set_inode_state(struct inode *inode, int bit) -{ - set_bit(bit, &EXT4_I(inode)->i_state_flags); +#define EXT4_INODE_BIT_FNS(name, field) \ +static inline 
int ext4_test_inode_##name(struct inode *inode, int bit) \ +{ \ + return test_bit(bit, &EXT4_I(inode)->i_##field); \ +} \ +static inline void ext4_set_inode_##name(struct inode *inode, int bit) \ +{ \ + set_bit(bit, &EXT4_I(inode)->i_##field); \ +} \ +static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \ +{ \ + clear_bit(bit, &EXT4_I(inode)->i_##field); \ } -static inline void ext4_clear_inode_state(struct inode *inode, int bit) -{ - clear_bit(bit, &EXT4_I(inode)->i_state_flags); -} +EXT4_INODE_BIT_FNS(flag, flags) +EXT4_INODE_BIT_FNS(state, state_flags) #else /* Assume that user mode programs are passing in an ext4fs superblock, not * a kernel struct super_block. This will allow us to call the feature-test @@ -1264,7 +1385,7 @@ struct ext4_dir_entry_2 { #define is_dx(dir) (EXT4_HAS_COMPAT_FEATURE(dir->i_sb, \ EXT4_FEATURE_COMPAT_DIR_INDEX) && \ - (EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) + ext4_test_inode_flag((dir), EXT4_INODE_INDEX)) #define EXT4_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT4_LINK_MAX) #define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1) @@ -1398,7 +1519,7 @@ extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, extern void ext4_htree_free_dir_info(struct dir_private_info *p); /* fsync.c */ -extern int ext4_sync_file(struct file *, struct dentry *, int); +extern int ext4_sync_file(struct file *, int); /* hash.c */ extern int ext4fs_dirhash(const char *name, int len, struct @@ -1678,6 +1799,7 @@ struct ext4_group_info { ext4_grpblk_t bb_first_free; /* first free block */ ext4_grpblk_t bb_free; /* total free blocks */ ext4_grpblk_t bb_fragments; /* nr of freespace fragments */ + ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */ struct list_head bb_prealloc_list; #ifdef DOUBLE_CHECK void *bb_bitmap; @@ -1772,9 +1894,8 @@ extern int ext4_ext_tree_init(handle_t *handle, struct inode *); extern int ext4_ext_writepage_trans_blocks(struct inode *, int); extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk); -extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, - ext4_lblk_t iblock, unsigned int max_blocks, - struct buffer_head *bh_result, int flags); +extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags); extern void ext4_ext_truncate(struct inode *); extern void ext4_ext_init(struct super_block *); extern void ext4_ext_release(struct super_block *); @@ -1782,6 +1903,8 @@ extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len); extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, ssize_t len); +extern int ext4_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags); extern int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, unsigned int max_blocks, struct buffer_head *bh, int flags); diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index b79ad5126468..dade0c024797 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -273,7 +273,7 @@ static inline int ext4_should_journal_data(struct inode *inode) return 1; if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) return 1; - if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) + if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA)) return 1; return 0; } @@ -284,7 +284,7 @@ static inline int ext4_should_order_data(struct inode *inode) return 0; if (!S_ISREG(inode->i_mode)) return 0; - if 
(EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) + if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA)) return 0; if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) return 1; @@ -297,7 +297,7 @@ static inline int ext4_should_writeback_data(struct inode *inode) return 0; if (EXT4_JOURNAL(inode) == NULL) return 1; - if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) + if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA)) return 0; if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) return 1; @@ -321,7 +321,7 @@ static inline int ext4_should_dioread_nolock(struct inode *inode) return 0; if (!S_ISREG(inode->i_mode)) return 0; - if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) return 0; if (ext4_should_journal_data(inode)) return 0; diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 236b834b4ca8..377309c1af65 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -107,11 +107,8 @@ static int ext4_ext_truncate_extend_restart(handle_t *handle, if (err <= 0) return err; err = ext4_truncate_restart_trans(handle, inode, needed); - /* - * We have dropped i_data_sem so someone might have cached again - * an extent we are going to truncate. - */ - ext4_ext_invalidate_cache(inode); + if (err == 0) + err = -EAGAIN; return err; } @@ -185,10 +182,10 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) { /* * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME - * block groups per flexgroup, reserve the first block - * group for directories and special files. Regular + * block groups per flexgroup, reserve the first block + * group for directories and special files. Regular * files will start at the second block group. This - * tends to speed up directory access and improves + * tends to speed up directory access and improves * fsck times. 
*/ block_group &= ~(flex_size-1); @@ -439,10 +436,10 @@ static int __ext4_ext_check(const char *function, struct inode *inode, return 0; corrupted: - __ext4_error(inode->i_sb, function, - "bad header/extent in inode #%lu: %s - magic %x, " + ext4_error_inode(function, inode, + "bad header/extent: %s - magic %x, " "entries %u, max %u(%u), depth %u(%u)", - inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic), + error_msg, le16_to_cpu(eh->eh_magic), le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max), max, le16_to_cpu(eh->eh_depth), depth); @@ -1622,9 +1619,7 @@ int ext4_ext_try_to_merge(struct inode *inode, merge_done = 1; WARN_ON(eh->eh_entries == 0); if (!eh->eh_entries) - ext4_error(inode->i_sb, - "inode#%lu, eh->eh_entries = 0!", - inode->i_ino); + EXT4_ERROR_INODE(inode, "eh->eh_entries = 0!"); } return merge_done; @@ -2039,7 +2034,7 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block, struct ext4_ext_cache *cex; int ret = EXT4_EXT_CACHE_NO; - /* + /* * We borrow i_block_reservation_lock to protect i_cached_extent */ spin_lock(&EXT4_I(inode)->i_block_reservation_lock); @@ -2361,7 +2356,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) int depth = ext_depth(inode); struct ext4_ext_path *path; handle_t *handle; - int i = 0, err = 0; + int i, err; ext_debug("truncate since %u\n", start); @@ -2370,23 +2365,26 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) if (IS_ERR(handle)) return PTR_ERR(handle); +again: ext4_ext_invalidate_cache(inode); /* * We start scanning from right side, freeing all the blocks * after i_size and walking into the tree depth-wise. */ + depth = ext_depth(inode); path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_NOFS); if (path == NULL) { ext4_journal_stop(handle); return -ENOMEM; } + path[0].p_depth = depth; path[0].p_hdr = ext_inode_hdr(inode); if (ext4_ext_check(inode, path[0].p_hdr, depth)) { err = -EIO; goto out; } - path[0].p_depth = depth; + i = err = 0; while (i >= 0 && err == 0) { if (i == depth) { @@ -2480,6 +2478,8 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) out: ext4_ext_drop_refs(path); kfree(path); + if (err == -EAGAIN) + goto again; ext4_journal_stop(handle); return err; @@ -2544,7 +2544,7 @@ static void bi_complete(struct bio *bio, int error) /* FIXME!! we need to try to merge to left or right after zero-out */ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) { - int ret = -EIO; + int ret; struct bio *bio; int blkbits, blocksize; sector_t ee_pblock; @@ -2568,6 +2568,9 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) len = ee_len; bio = bio_alloc(GFP_NOIO, len); + if (!bio) + return -ENOMEM; + bio->bi_sector = ee_pblock; bio->bi_bdev = inode->i_sb->s_bdev; @@ -2595,22 +2598,20 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) submit_bio(WRITE, bio); wait_for_completion(&event); - if (test_bit(BIO_UPTODATE, &bio->bi_flags)) - ret = 0; - else { - ret = -EIO; - break; + if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { + bio_put(bio); + return -EIO; } bio_put(bio); ee_len -= done; ee_pblock += done << (blkbits - 9); } - return ret; + return 0; } #define EXT4_EXT_ZERO_LEN 7 /* - * This function is called by ext4_ext_get_blocks() if someone tries to write + * This function is called by ext4_ext_map_blocks() if someone tries to write * to an uninitialized extent. 
It may result in splitting the uninitialized * extent into multiple extents (upto three - one initialized and two * uninitialized). @@ -2620,39 +2621,55 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) * c> Splits in three extents: Somone is writing in middle of the extent */ static int ext4_ext_convert_to_initialized(handle_t *handle, - struct inode *inode, - struct ext4_ext_path *path, - ext4_lblk_t iblock, - unsigned int max_blocks) + struct inode *inode, + struct ext4_map_blocks *map, + struct ext4_ext_path *path) { struct ext4_extent *ex, newex, orig_ex; struct ext4_extent *ex1 = NULL; struct ext4_extent *ex2 = NULL; struct ext4_extent *ex3 = NULL; struct ext4_extent_header *eh; - ext4_lblk_t ee_block; + ext4_lblk_t ee_block, eof_block; unsigned int allocated, ee_len, depth; ext4_fsblk_t newblock; int err = 0; int ret = 0; + int may_zeroout; + + ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical" + "block %llu, max_blocks %u\n", inode->i_ino, + (unsigned long long)map->m_lblk, map->m_len); + + eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >> + inode->i_sb->s_blocksize_bits; + if (eof_block < map->m_lblk + map->m_len) + eof_block = map->m_lblk + map->m_len; depth = ext_depth(inode); eh = path[depth].p_hdr; ex = path[depth].p_ext; ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); - allocated = ee_len - (iblock - ee_block); - newblock = iblock - ee_block + ext_pblock(ex); + allocated = ee_len - (map->m_lblk - ee_block); + newblock = map->m_lblk - ee_block + ext_pblock(ex); + ex2 = ex; orig_ex.ee_block = ex->ee_block; orig_ex.ee_len = cpu_to_le16(ee_len); ext4_ext_store_pblock(&orig_ex, ext_pblock(ex)); + /* + * It is safe to convert extent to initialized via explicit + * zeroout only if extent is fully insde i_size or new_size. + */ + may_zeroout = ee_block + ee_len <= eof_block; + err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto out; /* If extent has less than 2*EXT4_EXT_ZERO_LEN zerout directly */ - if (ee_len <= 2*EXT4_EXT_ZERO_LEN) { + if (ee_len <= 2*EXT4_EXT_ZERO_LEN && may_zeroout) { err = ext4_ext_zeroout(inode, &orig_ex); if (err) goto fix_extent_len; @@ -2665,10 +2682,10 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, return allocated; } - /* ex1: ee_block to iblock - 1 : uninitialized */ - if (iblock > ee_block) { + /* ex1: ee_block to map->m_lblk - 1 : uninitialized */ + if (map->m_lblk > ee_block) { ex1 = ex; - ex1->ee_len = cpu_to_le16(iblock - ee_block); + ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block); ext4_ext_mark_uninitialized(ex1); ex2 = &newex; } @@ -2677,15 +2694,15 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, * we insert ex3, if ex1 is NULL. This is to avoid temporary * overlap of blocks. */ - if (!ex1 && allocated > max_blocks) - ex2->ee_len = cpu_to_le16(max_blocks); + if (!ex1 && allocated > map->m_len) + ex2->ee_len = cpu_to_le16(map->m_len); /* ex3: to ee_block + ee_len : uninitialised */ - if (allocated > max_blocks) { + if (allocated > map->m_len) { unsigned int newdepth; /* If extent has less than EXT4_EXT_ZERO_LEN zerout directly */ - if (allocated <= EXT4_EXT_ZERO_LEN) { + if (allocated <= EXT4_EXT_ZERO_LEN && may_zeroout) { /* - * iblock == ee_block is handled by the zerouout + * map->m_lblk == ee_block is handled by the zerouout * at the beginning. * Mark first half uninitialized. 
* Mark second half initialized and zero out the @@ -2698,7 +2715,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, ext4_ext_dirty(handle, inode, path + depth); ex3 = &newex; - ex3->ee_block = cpu_to_le32(iblock); + ex3->ee_block = cpu_to_le32(map->m_lblk); ext4_ext_store_pblock(ex3, newblock); ex3->ee_len = cpu_to_le16(allocated); err = ext4_ext_insert_extent(handle, inode, path, @@ -2711,7 +2728,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, ex->ee_len = orig_ex.ee_len; ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); ext4_ext_dirty(handle, inode, path + depth); - /* blocks available from iblock */ + /* blocks available from map->m_lblk */ return allocated; } else if (err) @@ -2733,8 +2750,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, */ depth = ext_depth(inode); ext4_ext_drop_refs(path); - path = ext4_ext_find_extent(inode, - iblock, path); + path = ext4_ext_find_extent(inode, map->m_lblk, + path); if (IS_ERR(path)) { err = PTR_ERR(path); return err; @@ -2754,12 +2771,12 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, return allocated; } ex3 = &newex; - ex3->ee_block = cpu_to_le32(iblock + max_blocks); - ext4_ext_store_pblock(ex3, newblock + max_blocks); - ex3->ee_len = cpu_to_le16(allocated - max_blocks); + ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len); + ext4_ext_store_pblock(ex3, newblock + map->m_len); + ex3->ee_len = cpu_to_le16(allocated - map->m_len); ext4_ext_mark_uninitialized(ex3); err = ext4_ext_insert_extent(handle, inode, path, ex3, 0); - if (err == -ENOSPC) { + if (err == -ENOSPC && may_zeroout) { err = ext4_ext_zeroout(inode, &orig_ex); if (err) goto fix_extent_len; @@ -2769,7 +2786,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); ext4_ext_dirty(handle, inode, path + depth); /* zeroed the full extent */ - /* blocks available from iblock */ + /* blocks available from map->m_lblk */ return allocated; } else if (err) @@ -2783,11 +2800,13 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, * update the extent length after successful insert of the * split extent */ - orig_ex.ee_len = cpu_to_le16(ee_len - - ext4_ext_get_actual_len(ex3)); + ee_len -= ext4_ext_get_actual_len(ex3); + orig_ex.ee_len = cpu_to_le16(ee_len); + may_zeroout = ee_block + ee_len <= eof_block; + depth = newdepth; ext4_ext_drop_refs(path); - path = ext4_ext_find_extent(inode, iblock, path); + path = ext4_ext_find_extent(inode, map->m_lblk, path); if (IS_ERR(path)) { err = PTR_ERR(path); goto out; @@ -2801,14 +2820,14 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, if (err) goto out; - allocated = max_blocks; + allocated = map->m_len; /* If extent has less than EXT4_EXT_ZERO_LEN and we are trying * to insert a extent in the middle zerout directly * otherwise give the extent a chance to merge to left */ if (le16_to_cpu(orig_ex.ee_len) <= EXT4_EXT_ZERO_LEN && - iblock != ee_block) { + map->m_lblk != ee_block && may_zeroout) { err = ext4_ext_zeroout(inode, &orig_ex); if (err) goto fix_extent_len; @@ -2818,7 +2837,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); ext4_ext_dirty(handle, inode, path + depth); /* zero out the first half */ - /* blocks available from iblock */ + /* blocks available from map->m_lblk */ return allocated; } } @@ -2829,12 +2848,12 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, */ if (ex1 && ex1 != ex) { ex1 = ex; - ex1->ee_len = 
cpu_to_le16(iblock - ee_block); + ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block); ext4_ext_mark_uninitialized(ex1); ex2 = &newex; } - /* ex2: iblock to iblock + maxblocks-1 : initialised */ - ex2->ee_block = cpu_to_le32(iblock); + /* ex2: map->m_lblk to map->m_lblk + maxblocks-1 : initialised */ + ex2->ee_block = cpu_to_le32(map->m_lblk); ext4_ext_store_pblock(ex2, newblock); ex2->ee_len = cpu_to_le16(allocated); if (ex2 != ex) @@ -2877,7 +2896,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, goto out; insert: err = ext4_ext_insert_extent(handle, inode, path, &newex, 0); - if (err == -ENOSPC) { + if (err == -ENOSPC && may_zeroout) { err = ext4_ext_zeroout(inode, &orig_ex); if (err) goto fix_extent_len; @@ -2904,7 +2923,7 @@ fix_extent_len: } /* - * This function is called by ext4_ext_get_blocks() from + * This function is called by ext4_ext_map_blocks() from * ext4_get_blocks_dio_write() when DIO to write * to an uninitialized extent. * @@ -2927,9 +2946,8 @@ fix_extent_len: */ static int ext4_split_unwritten_extents(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, struct ext4_ext_path *path, - ext4_lblk_t iblock, - unsigned int max_blocks, int flags) { struct ext4_extent *ex, newex, orig_ex; @@ -2937,41 +2955,55 @@ static int ext4_split_unwritten_extents(handle_t *handle, struct ext4_extent *ex2 = NULL; struct ext4_extent *ex3 = NULL; struct ext4_extent_header *eh; - ext4_lblk_t ee_block; + ext4_lblk_t ee_block, eof_block; unsigned int allocated, ee_len, depth; ext4_fsblk_t newblock; int err = 0; + int may_zeroout; + + ext_debug("ext4_split_unwritten_extents: inode %lu, logical" + "block %llu, max_blocks %u\n", inode->i_ino, + (unsigned long long)map->m_lblk, map->m_len); + + eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >> + inode->i_sb->s_blocksize_bits; + if (eof_block < map->m_lblk + map->m_len) + eof_block = map->m_lblk + map->m_len; - ext_debug("ext4_split_unwritten_extents: inode %lu," - "iblock %llu, max_blocks %u\n", inode->i_ino, - (unsigned long long)iblock, max_blocks); depth = ext_depth(inode); eh = path[depth].p_hdr; ex = path[depth].p_ext; ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); - allocated = ee_len - (iblock - ee_block); - newblock = iblock - ee_block + ext_pblock(ex); + allocated = ee_len - (map->m_lblk - ee_block); + newblock = map->m_lblk - ee_block + ext_pblock(ex); + ex2 = ex; orig_ex.ee_block = ex->ee_block; orig_ex.ee_len = cpu_to_le16(ee_len); ext4_ext_store_pblock(&orig_ex, ext_pblock(ex)); /* + * It is safe to convert extent to initialized via explicit + * zeroout only if extent is fully insde i_size or new_size. + */ + may_zeroout = ee_block + ee_len <= eof_block; + + /* * If the uninitialized extent begins at the same logical * block where the write begins, and the write completely * covers the extent, then we don't need to split it. 
*/ - if ((iblock == ee_block) && (allocated <= max_blocks)) + if ((map->m_lblk == ee_block) && (allocated <= map->m_len)) return allocated; err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto out; - /* ex1: ee_block to iblock - 1 : uninitialized */ - if (iblock > ee_block) { + /* ex1: ee_block to map->m_lblk - 1 : uninitialized */ + if (map->m_lblk > ee_block) { ex1 = ex; - ex1->ee_len = cpu_to_le16(iblock - ee_block); + ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block); ext4_ext_mark_uninitialized(ex1); ex2 = &newex; } @@ -2980,18 +3012,18 @@ static int ext4_split_unwritten_extents(handle_t *handle, * we insert ex3, if ex1 is NULL. This is to avoid temporary * overlap of blocks. */ - if (!ex1 && allocated > max_blocks) - ex2->ee_len = cpu_to_le16(max_blocks); + if (!ex1 && allocated > map->m_len) + ex2->ee_len = cpu_to_le16(map->m_len); /* ex3: to ee_block + ee_len : uninitialised */ - if (allocated > max_blocks) { + if (allocated > map->m_len) { unsigned int newdepth; ex3 = &newex; - ex3->ee_block = cpu_to_le32(iblock + max_blocks); - ext4_ext_store_pblock(ex3, newblock + max_blocks); - ex3->ee_len = cpu_to_le16(allocated - max_blocks); + ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len); + ext4_ext_store_pblock(ex3, newblock + map->m_len); + ex3->ee_len = cpu_to_le16(allocated - map->m_len); ext4_ext_mark_uninitialized(ex3); err = ext4_ext_insert_extent(handle, inode, path, ex3, flags); - if (err == -ENOSPC) { + if (err == -ENOSPC && may_zeroout) { err = ext4_ext_zeroout(inode, &orig_ex); if (err) goto fix_extent_len; @@ -3001,7 +3033,7 @@ static int ext4_split_unwritten_extents(handle_t *handle, ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); ext4_ext_dirty(handle, inode, path + depth); /* zeroed the full extent */ - /* blocks available from iblock */ + /* blocks available from map->m_lblk */ return allocated; } else if (err) @@ -3015,11 +3047,13 @@ static int ext4_split_unwritten_extents(handle_t *handle, * update the extent length after successful insert of the * split extent */ - orig_ex.ee_len = cpu_to_le16(ee_len - - ext4_ext_get_actual_len(ex3)); + ee_len -= ext4_ext_get_actual_len(ex3); + orig_ex.ee_len = cpu_to_le16(ee_len); + may_zeroout = ee_block + ee_len <= eof_block; + depth = newdepth; ext4_ext_drop_refs(path); - path = ext4_ext_find_extent(inode, iblock, path); + path = ext4_ext_find_extent(inode, map->m_lblk, path); if (IS_ERR(path)) { err = PTR_ERR(path); goto out; @@ -3033,7 +3067,7 @@ static int ext4_split_unwritten_extents(handle_t *handle, if (err) goto out; - allocated = max_blocks; + allocated = map->m_len; } /* * If there was a change of depth as part of the @@ -3042,15 +3076,15 @@ static int ext4_split_unwritten_extents(handle_t *handle, */ if (ex1 && ex1 != ex) { ex1 = ex; - ex1->ee_len = cpu_to_le16(iblock - ee_block); + ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block); ext4_ext_mark_uninitialized(ex1); ex2 = &newex; } /* - * ex2: iblock to iblock + maxblocks-1 : to be direct IO written, - * uninitialised still. + * ex2: map->m_lblk to map->m_lblk + map->m_len-1 : to be written + * using direct I/O, uninitialised still. 
*/ - ex2->ee_block = cpu_to_le32(iblock); + ex2->ee_block = cpu_to_le32(map->m_lblk); ext4_ext_store_pblock(ex2, newblock); ex2->ee_len = cpu_to_le16(allocated); ext4_ext_mark_uninitialized(ex2); @@ -3062,7 +3096,7 @@ static int ext4_split_unwritten_extents(handle_t *handle, goto out; insert: err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); - if (err == -ENOSPC) { + if (err == -ENOSPC && may_zeroout) { err = ext4_ext_zeroout(inode, &orig_ex); if (err) goto fix_extent_len; @@ -3152,10 +3186,9 @@ static void unmap_underlying_metadata_blocks(struct block_device *bdev, static int ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, - ext4_lblk_t iblock, unsigned int max_blocks, + struct ext4_map_blocks *map, struct ext4_ext_path *path, int flags, - unsigned int allocated, struct buffer_head *bh_result, - ext4_fsblk_t newblock) + unsigned int allocated, ext4_fsblk_t newblock) { int ret = 0; int err = 0; @@ -3163,15 +3196,14 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical" "block %llu, max_blocks %u, flags %d, allocated %u", - inode->i_ino, (unsigned long long)iblock, max_blocks, + inode->i_ino, (unsigned long long)map->m_lblk, map->m_len, flags, allocated); ext4_ext_show_leaf(inode, path); /* get_block() before submit the IO, split the extent */ if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { - ret = ext4_split_unwritten_extents(handle, - inode, path, iblock, - max_blocks, flags); + ret = ext4_split_unwritten_extents(handle, inode, map, + path, flags); /* * Flag the inode(non aio case) or end_io struct (aio case) * that this IO needs to convertion to written when IO is @@ -3182,7 +3214,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, else ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); if (ext4_should_dioread_nolock(inode)) - set_buffer_uninit(bh_result); + map->m_flags |= EXT4_MAP_UNINIT; goto out; } /* IO end_io complete, convert the filled extent to written */ @@ -3210,14 +3242,12 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, * the buffer head will be unmapped so that * a read from the block returns 0s. */ - set_buffer_unwritten(bh_result); + map->m_flags |= EXT4_MAP_UNWRITTEN; goto out1; } /* buffered write, writepage time, convert*/ - ret = ext4_ext_convert_to_initialized(handle, inode, - path, iblock, - max_blocks); + ret = ext4_ext_convert_to_initialized(handle, inode, map, path); if (ret >= 0) ext4_update_inode_fsync_trans(handle, inode, 1); out: @@ -3226,7 +3256,7 @@ out: goto out2; } else allocated = ret; - set_buffer_new(bh_result); + map->m_flags |= EXT4_MAP_NEW; /* * if we allocated more blocks than requested * we need to make sure we unmap the extra block @@ -3234,11 +3264,11 @@ out: * unmapped later when we find the buffer_head marked * new. 
*/ - if (allocated > max_blocks) { + if (allocated > map->m_len) { unmap_underlying_metadata_blocks(inode->i_sb->s_bdev, - newblock + max_blocks, - allocated - max_blocks); - allocated = max_blocks; + newblock + map->m_len, + allocated - map->m_len); + allocated = map->m_len; } /* @@ -3252,13 +3282,13 @@ out: ext4_da_update_reserve_space(inode, allocated, 0); map_out: - set_buffer_mapped(bh_result); + map->m_flags |= EXT4_MAP_MAPPED; out1: - if (allocated > max_blocks) - allocated = max_blocks; + if (allocated > map->m_len) + allocated = map->m_len; ext4_ext_show_leaf(inode, path); - bh_result->b_bdev = inode->i_sb->s_bdev; - bh_result->b_blocknr = newblock; + map->m_pblk = newblock; + map->m_len = allocated; out2: if (path) { ext4_ext_drop_refs(path); @@ -3284,26 +3314,23 @@ out2: * * return < 0, error case. */ -int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, - ext4_lblk_t iblock, - unsigned int max_blocks, struct buffer_head *bh_result, - int flags) +int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags) { struct ext4_ext_path *path = NULL; struct ext4_extent_header *eh; struct ext4_extent newex, *ex, *last_ex; ext4_fsblk_t newblock; - int err = 0, depth, ret, cache_type; + int i, err = 0, depth, ret, cache_type; unsigned int allocated = 0; struct ext4_allocation_request ar; ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; - __clear_bit(BH_New, &bh_result->b_state); ext_debug("blocks %u/%u requested for inode %lu\n", - iblock, max_blocks, inode->i_ino); + map->m_lblk, map->m_len, inode->i_ino); /* check in cache */ - cache_type = ext4_ext_in_cache(inode, iblock, &newex); + cache_type = ext4_ext_in_cache(inode, map->m_lblk, &newex); if (cache_type) { if (cache_type == EXT4_EXT_CACHE_GAP) { if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { @@ -3316,12 +3343,12 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, /* we should allocate requested block */ } else if (cache_type == EXT4_EXT_CACHE_EXTENT) { /* block is already allocated */ - newblock = iblock + newblock = map->m_lblk - le32_to_cpu(newex.ee_block) + ext_pblock(&newex); /* number of remaining blocks in the extent */ allocated = ext4_ext_get_actual_len(&newex) - - (iblock - le32_to_cpu(newex.ee_block)); + (map->m_lblk - le32_to_cpu(newex.ee_block)); goto out; } else { BUG(); @@ -3329,7 +3356,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, } /* find extent for this block */ - path = ext4_ext_find_extent(inode, iblock, NULL); + path = ext4_ext_find_extent(inode, map->m_lblk, NULL); if (IS_ERR(path)) { err = PTR_ERR(path); path = NULL; @@ -3345,8 +3372,9 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, */ if (unlikely(path[depth].p_ext == NULL && depth != 0)) { EXT4_ERROR_INODE(inode, "bad extent address " - "iblock: %d, depth: %d pblock %lld", - iblock, depth, path[depth].p_block); + "lblock: %lu, depth: %d pblock %lld", + (unsigned long) map->m_lblk, depth, + path[depth].p_block); err = -EIO; goto out2; } @@ -3364,12 +3392,12 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, */ ee_len = ext4_ext_get_actual_len(ex); /* if found extent covers block, simply return it */ - if (in_range(iblock, ee_block, ee_len)) { - newblock = iblock - ee_block + ee_start; + if (in_range(map->m_lblk, ee_block, ee_len)) { + newblock = map->m_lblk - ee_block + ee_start; /* number of remaining blocks in the extent */ - allocated = ee_len - (iblock - ee_block); - ext_debug("%u fit into %u:%d -> %llu\n", iblock, - ee_block, ee_len, 
newblock); + allocated = ee_len - (map->m_lblk - ee_block); + ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk, + ee_block, ee_len, newblock); /* Do not put uninitialized extent in the cache */ if (!ext4_ext_is_uninitialized(ex)) { @@ -3379,8 +3407,8 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, goto out; } ret = ext4_ext_handle_uninitialized_extents(handle, - inode, iblock, max_blocks, path, - flags, allocated, bh_result, newblock); + inode, map, path, flags, allocated, + newblock); return ret; } } @@ -3394,7 +3422,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, * put just found gap into cache to speed up * subsequent requests */ - ext4_ext_put_gap_in_cache(inode, path, iblock); + ext4_ext_put_gap_in_cache(inode, path, map->m_lblk); goto out2; } /* @@ -3402,11 +3430,11 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, */ /* find neighbour allocated blocks */ - ar.lleft = iblock; + ar.lleft = map->m_lblk; err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft); if (err) goto out2; - ar.lright = iblock; + ar.lright = map->m_lblk; err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright); if (err) goto out2; @@ -3417,26 +3445,26 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, * EXT_INIT_MAX_LEN and for an uninitialized extent this limit is * EXT_UNINIT_MAX_LEN. */ - if (max_blocks > EXT_INIT_MAX_LEN && + if (map->m_len > EXT_INIT_MAX_LEN && !(flags & EXT4_GET_BLOCKS_UNINIT_EXT)) - max_blocks = EXT_INIT_MAX_LEN; - else if (max_blocks > EXT_UNINIT_MAX_LEN && + map->m_len = EXT_INIT_MAX_LEN; + else if (map->m_len > EXT_UNINIT_MAX_LEN && (flags & EXT4_GET_BLOCKS_UNINIT_EXT)) - max_blocks = EXT_UNINIT_MAX_LEN; + map->m_len = EXT_UNINIT_MAX_LEN; - /* Check if we can really insert (iblock)::(iblock+max_blocks) extent */ - newex.ee_block = cpu_to_le32(iblock); - newex.ee_len = cpu_to_le16(max_blocks); + /* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */ + newex.ee_block = cpu_to_le32(map->m_lblk); + newex.ee_len = cpu_to_le16(map->m_len); err = ext4_ext_check_overlap(inode, &newex, path); if (err) allocated = ext4_ext_get_actual_len(&newex); else - allocated = max_blocks; + allocated = map->m_len; /* allocate new block */ ar.inode = inode; - ar.goal = ext4_ext_find_goal(inode, path, iblock); - ar.logical = iblock; + ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk); + ar.logical = map->m_lblk; ar.len = allocated; if (S_ISREG(inode->i_mode)) ar.flags = EXT4_MB_HINT_DATA; @@ -3470,21 +3498,33 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, EXT4_STATE_DIO_UNWRITTEN); } if (ext4_should_dioread_nolock(inode)) - set_buffer_uninit(bh_result); + map->m_flags |= EXT4_MAP_UNINIT; } - if (unlikely(EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL)) { + if (unlikely(ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))) { if (unlikely(!eh->eh_entries)) { EXT4_ERROR_INODE(inode, - "eh->eh_entries == 0 ee_block %d", - ex->ee_block); + "eh->eh_entries == 0 and " + "EOFBLOCKS_FL set"); err = -EIO; goto out2; } last_ex = EXT_LAST_EXTENT(eh); - if (iblock + ar.len > le32_to_cpu(last_ex->ee_block) - + ext4_ext_get_actual_len(last_ex)) - EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL; + /* + * If the current leaf block was reached by looking at + * the last index block all the way down the tree, and + * we are extending the inode beyond the last extent + * in the current leaf block, then clear the + * EOFBLOCKS_FL flag. 
+ */ + for (i = depth-1; i >= 0; i--) { + if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr)) + break; + } + if ((i < 0) && + (map->m_lblk + ar.len > le32_to_cpu(last_ex->ee_block) + + ext4_ext_get_actual_len(last_ex))) + ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS); } err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); if (err) { @@ -3500,9 +3540,9 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, /* previous routine could use block we allocated */ newblock = ext_pblock(&newex); allocated = ext4_ext_get_actual_len(&newex); - if (allocated > max_blocks) - allocated = max_blocks; - set_buffer_new(bh_result); + if (allocated > map->m_len) + allocated = map->m_len; + map->m_flags |= EXT4_MAP_NEW; /* * Update reserved blocks/metadata blocks after successful @@ -3516,18 +3556,18 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, * when it is _not_ an uninitialized extent. */ if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) { - ext4_ext_put_in_cache(inode, iblock, allocated, newblock, + ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock, EXT4_EXT_CACHE_EXTENT); ext4_update_inode_fsync_trans(handle, inode, 1); } else ext4_update_inode_fsync_trans(handle, inode, 0); out: - if (allocated > max_blocks) - allocated = max_blocks; + if (allocated > map->m_len) + allocated = map->m_len; ext4_ext_show_leaf(inode, path); - set_buffer_mapped(bh_result); - bh_result->b_bdev = inode->i_sb->s_bdev; - bh_result->b_blocknr = newblock; + map->m_flags |= EXT4_MAP_MAPPED; + map->m_pblk = newblock; + map->m_len = allocated; out2: if (path) { ext4_ext_drop_refs(path); @@ -3625,7 +3665,7 @@ static void ext4_falloc_update_inode(struct inode *inode, * can proceed even if the new size is the same as i_size. */ if (new_size > i_size_read(inode)) - EXT4_I(inode)->i_flags |= EXT4_EOFBLOCKS_FL; + ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS); } } @@ -3640,55 +3680,57 @@ static void ext4_falloc_update_inode(struct inode *inode, long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len) { handle_t *handle; - ext4_lblk_t block; loff_t new_size; unsigned int max_blocks; int ret = 0; int ret2 = 0; int retries = 0; - struct buffer_head map_bh; + struct ext4_map_blocks map; unsigned int credits, blkbits = inode->i_blkbits; /* * currently supporting (pre)allocate mode for extent-based * files _only_ */ - if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) return -EOPNOTSUPP; /* preallocation to directories is currently not supported */ if (S_ISDIR(inode->i_mode)) return -ENODEV; - block = offset >> blkbits; + map.m_lblk = offset >> blkbits; /* * We can't just convert len to max_blocks because * If blocksize = 4096 offset = 3072 and len = 2048 */ max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) - - block; + - map.m_lblk; /* * credits to insert 1 extent into extent tree */ credits = ext4_chunk_trans_blocks(inode, max_blocks); mutex_lock(&inode->i_mutex); + ret = inode_newsize_ok(inode, (len + offset)); + if (ret) { + mutex_unlock(&inode->i_mutex); + return ret; + } retry: while (ret >= 0 && ret < max_blocks) { - block = block + ret; - max_blocks = max_blocks - ret; + map.m_lblk = map.m_lblk + ret; + map.m_len = max_blocks = max_blocks - ret; handle = ext4_journal_start(inode, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); break; } - map_bh.b_state = 0; - ret = ext4_get_blocks(handle, inode, block, - max_blocks, &map_bh, + ret = ext4_map_blocks(handle, inode, &map, 
EXT4_GET_BLOCKS_CREATE_UNINIT_EXT); if (ret <= 0) { #ifdef EXT4FS_DEBUG WARN_ON(ret <= 0); - printk(KERN_ERR "%s: ext4_ext_get_blocks " + printk(KERN_ERR "%s: ext4_ext_map_blocks " "returned error inode#%lu, block=%u, " "max_blocks=%u", __func__, inode->i_ino, block, max_blocks); @@ -3697,14 +3739,14 @@ retry: ret2 = ext4_journal_stop(handle); break; } - if ((block + ret) >= (EXT4_BLOCK_ALIGN(offset + len, + if ((map.m_lblk + ret) >= (EXT4_BLOCK_ALIGN(offset + len, blkbits) >> blkbits)) new_size = offset + len; else - new_size = (block + ret) << blkbits; + new_size = (map.m_lblk + ret) << blkbits; ext4_falloc_update_inode(inode, mode, new_size, - buffer_new(&map_bh)); + (map.m_flags & EXT4_MAP_NEW)); ext4_mark_inode_dirty(handle, inode); ret2 = ext4_journal_stop(handle); if (ret2) @@ -3733,42 +3775,39 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, ssize_t len) { handle_t *handle; - ext4_lblk_t block; unsigned int max_blocks; int ret = 0; int ret2 = 0; - struct buffer_head map_bh; + struct ext4_map_blocks map; unsigned int credits, blkbits = inode->i_blkbits; - block = offset >> blkbits; + map.m_lblk = offset >> blkbits; /* * We can't just convert len to max_blocks because * If blocksize = 4096 offset = 3072 and len = 2048 */ - max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) - - block; + max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) - + map.m_lblk); /* * credits to insert 1 extent into extent tree */ credits = ext4_chunk_trans_blocks(inode, max_blocks); while (ret >= 0 && ret < max_blocks) { - block = block + ret; - max_blocks = max_blocks - ret; + map.m_lblk += ret; + map.m_len = (max_blocks -= ret); handle = ext4_journal_start(inode, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); break; } - map_bh.b_state = 0; - ret = ext4_get_blocks(handle, inode, block, - max_blocks, &map_bh, + ret = ext4_map_blocks(handle, inode, &map, EXT4_GET_BLOCKS_IO_CONVERT_EXT); if (ret <= 0) { WARN_ON(ret <= 0); - printk(KERN_ERR "%s: ext4_ext_get_blocks " + printk(KERN_ERR "%s: ext4_ext_map_blocks " "returned error inode#%lu, block=%u, " "max_blocks=%u", __func__, - inode->i_ino, block, max_blocks); + inode->i_ino, map.m_lblk, map.m_len); } ext4_mark_inode_dirty(handle, inode); ret2 = ext4_journal_stop(handle); @@ -3898,7 +3937,7 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, int error = 0; /* fallback to generic here if not in extents fmt */ - if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) return generic_block_fiemap(inode, fieinfo, start, len, ext4_get_block); diff --git a/fs/ext4/file.c b/fs/ext4/file.c index d0776e410f34..5313ae4cda2d 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -66,7 +66,7 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov, * is smaller than s_maxbytes, which is for extent-mapped files. 
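A quick worked example (not part of the patch) of the offset/length to block-range conversion used by ext4_fallocate() and ext4_convert_unwritten_extents() above, for the very case the comment mentions (blocksize 4096, offset 3072, len 2048). block_align() is an invented stand-in for EXT4_BLOCK_ALIGN():

#include <assert.h>
#include <stdio.h>

static unsigned long long block_align(unsigned long long size, unsigned blkbits)
{
	unsigned long long mask = (1ULL << blkbits) - 1;
	return (size + mask) & ~mask;	/* round up to a block boundary */
}

int main(void)
{
	unsigned blkbits = 12;			/* 4096-byte blocks */
	unsigned long long offset = 3072, len = 2048;

	unsigned long long lblk = offset >> blkbits;	/* first logical block: 0 */
	unsigned long long max_blocks =
		(block_align(offset + len, blkbits) >> blkbits) - lblk;

	/* len is only 2048 bytes, but the request still touches 2 blocks,
	 * which is why len >> blkbits alone would be wrong. */
	printf("m_lblk=%llu m_len=%llu\n", lblk, max_blocks);
	assert(max_blocks == 2);
	return 0;
}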
*/ - if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) { + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); size_t length = iov_length(iov, nr_segs); diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index ef3d980e67cb..592adf2e546e 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -35,6 +35,29 @@ #include <trace/events/ext4.h> /* + * If we're not journaling and this is a just-created file, we have to + * sync our parent directory (if it was freshly created) since + * otherwise it will only be written by writeback, leaving a huge + * window during which a crash may lose the file. This may apply for + * the parent directory's parent as well, and so on recursively, if + * they are also freshly created. + */ +static void ext4_sync_parent(struct inode *inode) +{ + struct dentry *dentry = NULL; + + while (inode && ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) { + ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY); + dentry = list_entry(inode->i_dentry.next, + struct dentry, d_alias); + if (!dentry || !dentry->d_parent || !dentry->d_parent->d_inode) + break; + inode = dentry->d_parent->d_inode; + sync_mapping_buffers(inode->i_mapping); + } +} + +/* * akpm: A new design for ext4_sync_file(). * * This is only called from sys_fsync(), sys_fdatasync() and sys_msync(). @@ -48,9 +71,9 @@ * i_mutex lock is held when entering and exiting this function */ -int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) +int ext4_sync_file(struct file *file, int datasync) { - struct inode *inode = dentry->d_inode; + struct inode *inode = file->f_mapping->host; struct ext4_inode_info *ei = EXT4_I(inode); journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; int ret; @@ -58,7 +81,7 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) J_ASSERT(ext4_journal_current_handle() == NULL); - trace_ext4_sync_file(file, dentry, datasync); + trace_ext4_sync_file(file, datasync); if (inode->i_sb->s_flags & MS_RDONLY) return 0; @@ -66,9 +89,13 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) ret = flush_completed_IO(inode); if (ret < 0) return ret; - - if (!journal) - return simple_fsync(file, dentry, datasync); + + if (!journal) { + ret = generic_file_fsync(file, datasync); + if (!ret && !list_empty(&inode->i_dentry)) + ext4_sync_parent(inode); + return ret; + } /* * data=writeback,ordered: @@ -102,7 +129,7 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) (journal->j_flags & JBD2_BARRIER)) blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL, BLKDEV_IFL_WAIT); - jbd2_log_wait_commit(journal, commit_tid); + ret = jbd2_log_wait_commit(journal, commit_tid); } else if (journal->j_flags & JBD2_BARRIER) blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL, BLKDEV_IFL_WAIT); diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 1a0e183a2f04..25c4b3173fd9 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -240,56 +240,49 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) if (fatal) goto error_return; - /* Ok, now we can actually update the inode bitmaps.. 
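The ext4_sync_parent() helper added above walks up the directory tree, syncing each parent's metadata buffers, as long as the current inode is still marked as a freshly added entry. A toy userspace model of just that walk; the struct and field names are invented for illustration and stand in for EXT4_STATE_NEWENTRY and the dentry/parent chase:

#include <stdbool.h>
#include <stdio.h>

struct toy_inode {
	const char *name;
	bool newentry;			/* models EXT4_STATE_NEWENTRY */
	struct toy_inode *parent;	/* models dentry->d_parent->d_inode */
};

static void sync_buffers(struct toy_inode *inode)
{
	printf("sync metadata buffers of %s\n", inode->name);
}

static void toy_sync_parent(struct toy_inode *inode)
{
	while (inode && inode->newentry) {
		inode->newentry = false;	/* clear so the walk is done once */
		inode = inode->parent;
		if (!inode)
			break;
		sync_buffers(inode);
	}
}

int main(void)
{
	struct toy_inode root = { "/", false, NULL };
	struct toy_inode a    = { "/a", true, &root };
	struct toy_inode b    = { "/a/b", true, &a };
	struct toy_inode f    = { "/a/b/file", true, &b };

	toy_sync_parent(&f);	/* syncs /a/b, /a and /, then stops */
	return 0;
}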
*/ - cleared = ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group), - bit, bitmap_bh->b_data); - if (!cleared) - ext4_error(sb, "bit already cleared for inode %lu", ino); - else { - gdp = ext4_get_group_desc(sb, block_group, &bh2); - + fatal = -ESRCH; + gdp = ext4_get_group_desc(sb, block_group, &bh2); + if (gdp) { BUFFER_TRACE(bh2, "get_write_access"); fatal = ext4_journal_get_write_access(handle, bh2); - if (fatal) goto error_return; - - if (gdp) { - ext4_lock_group(sb, block_group); - count = ext4_free_inodes_count(sb, gdp) + 1; - ext4_free_inodes_set(sb, gdp, count); - if (is_directory) { - count = ext4_used_dirs_count(sb, gdp) - 1; - ext4_used_dirs_set(sb, gdp, count); - if (sbi->s_log_groups_per_flex) { - ext4_group_t f; - - f = ext4_flex_group(sbi, block_group); - atomic_dec(&sbi->s_flex_groups[f].used_dirs); - } + } + ext4_lock_group(sb, block_group); + cleared = ext4_clear_bit(bit, bitmap_bh->b_data); + if (fatal || !cleared) { + ext4_unlock_group(sb, block_group); + goto out; + } - } - gdp->bg_checksum = ext4_group_desc_csum(sbi, - block_group, gdp); - ext4_unlock_group(sb, block_group); - percpu_counter_inc(&sbi->s_freeinodes_counter); - if (is_directory) - percpu_counter_dec(&sbi->s_dirs_counter); - - if (sbi->s_log_groups_per_flex) { - ext4_group_t f; - - f = ext4_flex_group(sbi, block_group); - atomic_inc(&sbi->s_flex_groups[f].free_inodes); - } - } - BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_metadata(handle, NULL, bh2); - if (!fatal) fatal = err; + count = ext4_free_inodes_count(sb, gdp) + 1; + ext4_free_inodes_set(sb, gdp, count); + if (is_directory) { + count = ext4_used_dirs_count(sb, gdp) - 1; + ext4_used_dirs_set(sb, gdp, count); + percpu_counter_dec(&sbi->s_dirs_counter); } - BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); - if (!fatal) - fatal = err; - sb->s_dirt = 1; + gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); + ext4_unlock_group(sb, block_group); + + percpu_counter_inc(&sbi->s_freeinodes_counter); + if (sbi->s_log_groups_per_flex) { + ext4_group_t f = ext4_flex_group(sbi, block_group); + + atomic_inc(&sbi->s_flex_groups[f].free_inodes); + if (is_directory) + atomic_dec(&sbi->s_flex_groups[f].used_dirs); + } + BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata"); + fatal = ext4_handle_dirty_metadata(handle, NULL, bh2); +out: + if (cleared) { + BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); + if (!fatal) + fatal = err; + sb->s_dirt = 1; + } else + ext4_error(sb, "bit already cleared for inode %lu", ino); + error_return: brelse(bitmap_bh); ext4_std_error(sb, fatal); @@ -499,7 +492,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, if (S_ISDIR(mode) && ((parent == sb->s_root->d_inode) || - (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL))) { + (ext4_test_inode_flag(parent, EXT4_INODE_TOPDIR)))) { int best_ndir = inodes_per_group; int ret = -1; @@ -1041,7 +1034,7 @@ got: if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { /* set extent flag only for directory, file and normal symlink*/ if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) { - EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL; + ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS); ext4_ext_tree_init(handle, inode); } } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 3e0f6af9d08d..0afc8c1d8cf3 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -149,7 +149,7 @@ int 
ext4_truncate_restart_trans(handle_t *handle, struct inode *inode, int ret; /* - * Drop i_data_sem to avoid deadlock with ext4_get_blocks At this + * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this * moment, get_block can be called only for blocks inside i_size since * page cache has been already dropped and writes are blocked by * i_mutex. So we can safely drop the i_data_sem here. @@ -348,9 +348,8 @@ static int __ext4_check_blockref(const char *function, struct inode *inode, if (blk && unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb), blk, 1))) { - __ext4_error(inode->i_sb, function, - "invalid block reference %u " - "in inode #%lu", blk, inode->i_ino); + ext4_error_inode(function, inode, + "invalid block reference %u", blk); return -EIO; } } @@ -785,7 +784,7 @@ failed: /* Allocation failed, free what we already allocated */ ext4_free_blocks(handle, inode, 0, new_blocks[0], 1, 0); for (i = 1; i <= n ; i++) { - /* + /* * branch[i].bh is newly allocated, so there is no * need to revoke the block, which is why we don't * need to set EXT4_FREE_BLOCKS_METADATA. @@ -875,7 +874,7 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode, err_out: for (i = 1; i <= num; i++) { - /* + /* * branch[i].bh is newly allocated, so there is no * need to revoke the block, which is why we don't * need to set EXT4_FREE_BLOCKS_METADATA. @@ -890,9 +889,9 @@ err_out: } /* - * The ext4_ind_get_blocks() function handles non-extents inodes + * The ext4_ind_map_blocks() function handles non-extents inodes * (i.e., using the traditional indirect/double-indirect i_blocks - * scheme) for ext4_get_blocks(). + * scheme) for ext4_map_blocks(). * * Allocation strategy is simple: if we have to allocate something, we will * have to go the whole way to leaf. So let's do it before attaching anything @@ -917,9 +916,8 @@ err_out: * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system * blocks. */ -static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, - ext4_lblk_t iblock, unsigned int maxblocks, - struct buffer_head *bh_result, +static int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags) { int err = -EIO; @@ -933,9 +931,9 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, int count = 0; ext4_fsblk_t first_block = 0; - J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)); + J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))); J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); - depth = ext4_block_to_path(inode, iblock, offsets, + depth = ext4_block_to_path(inode, map->m_lblk, offsets, &blocks_to_boundary); if (depth == 0) @@ -946,10 +944,9 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, /* Simplest case - block found, no allocation needed */ if (!partial) { first_block = le32_to_cpu(chain[depth - 1].key); - clear_buffer_new(bh_result); count++; /*map more blocks*/ - while (count < maxblocks && count <= blocks_to_boundary) { + while (count < map->m_len && count <= blocks_to_boundary) { ext4_fsblk_t blk; blk = le32_to_cpu(*(chain[depth-1].p + count)); @@ -969,7 +966,7 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, /* * Okay, we need to do block allocation. 
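The "map more blocks" loop in ext4_ind_map_blocks() above extends a lookup hit by counting how many of the following pointers in the indirect block are physically contiguous, capped by the request length and the end of the indirect block. A self-contained sketch of that counting; the array stands in for the indirect block and it assumes at least one block is already mapped:

#include <assert.h>
#include <stddef.h>

static unsigned count_contig(const unsigned long long *ptrs, size_t nptrs,
			     unsigned want, unsigned boundary)
{
	unsigned long long first = ptrs[0];
	unsigned count = 1;

	while (count < want && count <= boundary && count < nptrs) {
		if (ptrs[count] != first + count)
			break;		/* contiguous run ends here */
		count++;
	}
	return count;
}

int main(void)
{
	/* physical block numbers as they might sit in an indirect block */
	unsigned long long ptrs[] = { 1000, 1001, 1002, 1010, 1011 };

	assert(count_contig(ptrs, 5, 8, 7) == 3);	/* run breaks at 1010 */
	assert(count_contig(ptrs, 5, 2, 7) == 2);	/* capped by the request */
	return 0;
}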
*/ - goal = ext4_find_goal(inode, iblock, partial); + goal = ext4_find_goal(inode, map->m_lblk, partial); /* the number of blocks need to allocate for [d,t]indirect blocks */ indirect_blks = (chain + depth) - partial - 1; @@ -979,11 +976,11 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, * direct blocks to allocate for this branch. */ count = ext4_blks_to_allocate(partial, indirect_blks, - maxblocks, blocks_to_boundary); + map->m_len, blocks_to_boundary); /* * Block out ext4_truncate while we alter the tree */ - err = ext4_alloc_branch(handle, inode, iblock, indirect_blks, + err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks, &count, goal, offsets + (partial - chain), partial); @@ -995,18 +992,20 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, * may need to return -EAGAIN upwards in the worst case. --sct */ if (!err) - err = ext4_splice_branch(handle, inode, iblock, + err = ext4_splice_branch(handle, inode, map->m_lblk, partial, indirect_blks, count); if (err) goto cleanup; - set_buffer_new(bh_result); + map->m_flags |= EXT4_MAP_NEW; ext4_update_inode_fsync_trans(handle, inode, 1); got_it: - map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key)); + map->m_flags |= EXT4_MAP_MAPPED; + map->m_pblk = le32_to_cpu(chain[depth-1].key); + map->m_len = count; if (count > blocks_to_boundary) - set_buffer_boundary(bh_result); + map->m_flags |= EXT4_MAP_BOUNDARY; err = count; /* Clean up and exit */ partial = chain + depth - 1; /* the whole chain */ @@ -1016,7 +1015,6 @@ cleanup: brelse(partial->bh); partial--; } - BUFFER_TRACE(bh_result, "returned"); out: return err; } @@ -1061,7 +1059,7 @@ static int ext4_indirect_calc_metadata_amount(struct inode *inode, */ static int ext4_calc_metadata_amount(struct inode *inode, sector_t lblock) { - if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) return ext4_ext_calc_metadata_amount(inode, lblock); return ext4_indirect_calc_metadata_amount(inode, lblock); @@ -1076,7 +1074,6 @@ void ext4_da_update_reserve_space(struct inode *inode, { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); - int mdb_free = 0, allocated_meta_blocks = 0; spin_lock(&ei->i_block_reservation_lock); trace_ext4_da_update_reserve_space(inode, used); @@ -1091,11 +1088,10 @@ void ext4_da_update_reserve_space(struct inode *inode, /* Update per-inode reservations */ ei->i_reserved_data_blocks -= used; - used += ei->i_allocated_meta_blocks; ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks; - allocated_meta_blocks = ei->i_allocated_meta_blocks; + percpu_counter_sub(&sbi->s_dirtyblocks_counter, + used + ei->i_allocated_meta_blocks); ei->i_allocated_meta_blocks = 0; - percpu_counter_sub(&sbi->s_dirtyblocks_counter, used); if (ei->i_reserved_data_blocks == 0) { /* @@ -1103,30 +1099,23 @@ void ext4_da_update_reserve_space(struct inode *inode, * only when we have written all of the delayed * allocation blocks. 
*/ - mdb_free = ei->i_reserved_meta_blocks; + percpu_counter_sub(&sbi->s_dirtyblocks_counter, + ei->i_reserved_meta_blocks); ei->i_reserved_meta_blocks = 0; ei->i_da_metadata_calc_len = 0; - percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free); } spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); - /* Update quota subsystem */ - if (quota_claim) { + /* Update quota subsystem for data blocks */ + if (quota_claim) dquot_claim_block(inode, used); - if (mdb_free) - dquot_release_reservation_block(inode, mdb_free); - } else { + else { /* * We did fallocate with an offset that is already delayed * allocated. So on delayed allocated writeback we should - * not update the quota for allocated blocks. But then - * converting an fallocate region to initialized region would - * have caused a metadata allocation. So claim quota for - * that + * not re-claim the quota for fallocated blocks. */ - if (allocated_meta_blocks) - dquot_claim_block(inode, allocated_meta_blocks); - dquot_release_reservation_block(inode, mdb_free + used); + dquot_release_reservation_block(inode, used); } /* @@ -1139,15 +1128,15 @@ void ext4_da_update_reserve_space(struct inode *inode, ext4_discard_preallocations(inode); } -static int check_block_validity(struct inode *inode, const char *msg, - sector_t logical, sector_t phys, int len) +static int check_block_validity(struct inode *inode, const char *func, + struct ext4_map_blocks *map) { - if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) { - __ext4_error(inode->i_sb, msg, - "inode #%lu logical block %llu mapped to %llu " - "(size %d)", inode->i_ino, - (unsigned long long) logical, - (unsigned long long) phys, len); + if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk, + map->m_len)) { + ext4_error_inode(func, inode, + "lblock %lu mapped to illegal pblock %llu " + "(length %d)", (unsigned long) map->m_lblk, + map->m_pblk, map->m_len); return -EIO; } return 0; @@ -1212,15 +1201,15 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx, } /* - * The ext4_get_blocks() function tries to look up the requested blocks, + * The ext4_map_blocks() function tries to look up the requested blocks, * and returns if the blocks are already mapped. * * Otherwise it takes the write lock of the i_data_sem and allocate blocks * and store the allocated blocks in the result buffer head and mark it * mapped. * - * If file type is extents based, it will call ext4_ext_get_blocks(), - * Otherwise, call with ext4_ind_get_blocks() to handle indirect mapping + * If file type is extents based, it will call ext4_ext_map_blocks(), + * Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping * based files * * On success, it returns the number of blocks being mapped or allocate. @@ -1233,35 +1222,29 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx, * * It returns the error in case of allocation failure. 
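A sketch of how a caller interprets ext4_map_blocks() under the convention documented above: a positive return is the number of blocks mapped or allocated, zero means a hole, and a negative value is an error. The struct and flag names here are trimmed stand-ins, not the real ext4_map_blocks definition:

#include <stdio.h>

struct toy_map {
	unsigned long long m_pblk;	/* first physical block */
	unsigned long      m_lblk;	/* first logical block requested */
	unsigned int       m_len;	/* blocks requested / mapped */
	unsigned int       m_flags;	/* MAPPED/NEW/... */
};

#define TOY_MAP_NEW	0x1
#define TOY_MAP_MAPPED	0x2

static void handle_result(int ret, const struct toy_map *map)
{
	if (ret < 0) {
		fprintf(stderr, "lookup/allocation failed: %d\n", ret);
	} else if (ret == 0) {
		printf("hole at logical block %lu\n", map->m_lblk);
	} else {
		printf("%d block(s): logical %lu -> physical %llu%s\n",
		       ret, map->m_lblk, map->m_pblk,
		       (map->m_flags & TOY_MAP_NEW) ? " (newly allocated)" : "");
	}
}

int main(void)
{
	struct toy_map map = { .m_pblk = 8192, .m_lblk = 10,
			       .m_len = 4, .m_flags = TOY_MAP_MAPPED };

	handle_result(4, &map);		/* mapped extent */
	handle_result(0, &map);		/* hole */
	handle_result(-5, &map);	/* EIO-style error */
	return 0;
}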
*/ -int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, - unsigned int max_blocks, struct buffer_head *bh, - int flags) +int ext4_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags) { int retval; - clear_buffer_mapped(bh); - clear_buffer_unwritten(bh); - - ext_debug("ext4_get_blocks(): inode %lu, flag %d, max_blocks %u," - "logical block %lu\n", inode->i_ino, flags, max_blocks, - (unsigned long)block); + map->m_flags = 0; + ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u," + "logical block %lu\n", inode->i_ino, flags, map->m_len, + (unsigned long) map->m_lblk); /* * Try to see if we can get the block without requesting a new * file system block. */ down_read((&EXT4_I(inode)->i_data_sem)); - if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { - retval = ext4_ext_get_blocks(handle, inode, block, max_blocks, - bh, 0); + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + retval = ext4_ext_map_blocks(handle, inode, map, 0); } else { - retval = ext4_ind_get_blocks(handle, inode, block, max_blocks, - bh, 0); + retval = ext4_ind_map_blocks(handle, inode, map, 0); } up_read((&EXT4_I(inode)->i_data_sem)); - if (retval > 0 && buffer_mapped(bh)) { - int ret = check_block_validity(inode, "file system corruption", - block, bh->b_blocknr, retval); + if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { + int ret = check_block_validity(inode, __func__, map); if (ret != 0) return ret; } @@ -1277,7 +1260,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, * ext4_ext_get_block() returns th create = 0 * with buffer head unmapped. */ - if (retval > 0 && buffer_mapped(bh)) + if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) return retval; /* @@ -1290,7 +1273,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, * of BH_Unwritten and BH_Mapped flags being simultaneously * set on the buffer_head. */ - clear_buffer_unwritten(bh); + map->m_flags &= ~EXT4_MAP_UNWRITTEN; /* * New blocks allocate and/or writing to uninitialized extent @@ -1312,14 +1295,12 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, * We need to check for EXT4 here because migrate * could have changed the inode type in between */ - if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { - retval = ext4_ext_get_blocks(handle, inode, block, max_blocks, - bh, flags); + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + retval = ext4_ext_map_blocks(handle, inode, map, flags); } else { - retval = ext4_ind_get_blocks(handle, inode, block, - max_blocks, bh, flags); + retval = ext4_ind_map_blocks(handle, inode, map, flags); - if (retval > 0 && buffer_new(bh)) { + if (retval > 0 && map->m_flags & EXT4_MAP_NEW) { /* * We allocated new blocks which will result in * i_data's format changing. Force the migrate @@ -1342,10 +1323,10 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, EXT4_I(inode)->i_delalloc_reserved_flag = 0; up_write((&EXT4_I(inode)->i_data_sem)); - if (retval > 0 && buffer_mapped(bh)) { - int ret = check_block_validity(inode, "file system " - "corruption after allocation", - block, bh->b_blocknr, retval); + if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { + int ret = check_block_validity(inode, + "ext4_map_blocks_after_alloc", + map); if (ret != 0) return ret; } @@ -1355,109 +1336,109 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, /* Maximum number of blocks we map for direct IO at once. 
*/ #define DIO_MAX_BLOCKS 4096 -int ext4_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) +static int _ext4_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int flags) { handle_t *handle = ext4_journal_current_handle(); + struct ext4_map_blocks map; int ret = 0, started = 0; - unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; int dio_credits; - if (create && !handle) { + map.m_lblk = iblock; + map.m_len = bh->b_size >> inode->i_blkbits; + + if (flags && !handle) { /* Direct IO write... */ - if (max_blocks > DIO_MAX_BLOCKS) - max_blocks = DIO_MAX_BLOCKS; - dio_credits = ext4_chunk_trans_blocks(inode, max_blocks); + if (map.m_len > DIO_MAX_BLOCKS) + map.m_len = DIO_MAX_BLOCKS; + dio_credits = ext4_chunk_trans_blocks(inode, map.m_len); handle = ext4_journal_start(inode, dio_credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); - goto out; + return ret; } started = 1; } - ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result, - create ? EXT4_GET_BLOCKS_CREATE : 0); + ret = ext4_map_blocks(handle, inode, &map, flags); if (ret > 0) { - bh_result->b_size = (ret << inode->i_blkbits); + map_bh(bh, inode->i_sb, map.m_pblk); + bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; + bh->b_size = inode->i_sb->s_blocksize * map.m_len; ret = 0; } if (started) ext4_journal_stop(handle); -out: return ret; } +int ext4_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int create) +{ + return _ext4_get_block(inode, iblock, bh, + create ? EXT4_GET_BLOCKS_CREATE : 0); +} + /* * `handle' can be NULL if create is zero */ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, ext4_lblk_t block, int create, int *errp) { - struct buffer_head dummy; + struct ext4_map_blocks map; + struct buffer_head *bh; int fatal = 0, err; - int flags = 0; J_ASSERT(handle != NULL || create == 0); - dummy.b_state = 0; - dummy.b_blocknr = -1000; - buffer_trace_init(&dummy.b_history); - if (create) - flags |= EXT4_GET_BLOCKS_CREATE; - err = ext4_get_blocks(handle, inode, block, 1, &dummy, flags); - /* - * ext4_get_blocks() returns number of blocks mapped. 0 in - * case of a HOLE. - */ - if (err > 0) { - if (err > 1) - WARN_ON(1); - err = 0; + map.m_lblk = block; + map.m_len = 1; + err = ext4_map_blocks(handle, inode, &map, + create ? EXT4_GET_BLOCKS_CREATE : 0); + + if (err < 0) + *errp = err; + if (err <= 0) + return NULL; + *errp = 0; + + bh = sb_getblk(inode->i_sb, map.m_pblk); + if (!bh) { + *errp = -EIO; + return NULL; } - *errp = err; - if (!err && buffer_mapped(&dummy)) { - struct buffer_head *bh; - bh = sb_getblk(inode->i_sb, dummy.b_blocknr); - if (!bh) { - *errp = -EIO; - goto err; - } - if (buffer_new(&dummy)) { - J_ASSERT(create != 0); - J_ASSERT(handle != NULL); + if (map.m_flags & EXT4_MAP_NEW) { + J_ASSERT(create != 0); + J_ASSERT(handle != NULL); - /* - * Now that we do not always journal data, we should - * keep in mind whether this should always journal the - * new buffer as metadata. For now, regular file - * writes use ext4_get_block instead, so it's not a - * problem. 
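_ext4_get_block() above copies the EXT4_MAP_* bits into the buffer_head by masking out the old mapping-related state bits and OR-ing in the new ones, leaving unrelated state untouched. A minimal sketch of that masked merge; the bit values below are invented for illustration and do not match the kernel's definitions:

#include <assert.h>
#include <stdint.h>

#define MAP_NEW		(1u << 0)
#define MAP_MAPPED	(1u << 1)
#define MAP_UNWRITTEN	(1u << 2)
#define MAP_BOUNDARY	(1u << 3)
#define MAP_FLAGS	(MAP_NEW | MAP_MAPPED | MAP_UNWRITTEN | MAP_BOUNDARY)

#define STATE_UPTODATE	(1u << 8)	/* unrelated bit that must survive */

static uint32_t merge_map_flags(uint32_t state, uint32_t m_flags)
{
	/* drop stale mapping bits, keep everything else, add the new bits */
	return (state & ~MAP_FLAGS) | (m_flags & MAP_FLAGS);
}

int main(void)
{
	uint32_t state = STATE_UPTODATE | MAP_UNWRITTEN;	/* stale UNWRITTEN */

	state = merge_map_flags(state, MAP_MAPPED | MAP_NEW);
	assert(state == (STATE_UPTODATE | MAP_MAPPED | MAP_NEW));
	return 0;
}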
- */ - lock_buffer(bh); - BUFFER_TRACE(bh, "call get_create_access"); - fatal = ext4_journal_get_create_access(handle, bh); - if (!fatal && !buffer_uptodate(bh)) { - memset(bh->b_data, 0, inode->i_sb->s_blocksize); - set_buffer_uptodate(bh); - } - unlock_buffer(bh); - BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_metadata(handle, inode, bh); - if (!fatal) - fatal = err; - } else { - BUFFER_TRACE(bh, "not a new buffer"); - } - if (fatal) { - *errp = fatal; - brelse(bh); - bh = NULL; + /* + * Now that we do not always journal data, we should + * keep in mind whether this should always journal the + * new buffer as metadata. For now, regular file + * writes use ext4_get_block instead, so it's not a + * problem. + */ + lock_buffer(bh); + BUFFER_TRACE(bh, "call get_create_access"); + fatal = ext4_journal_get_create_access(handle, bh); + if (!fatal && !buffer_uptodate(bh)) { + memset(bh->b_data, 0, inode->i_sb->s_blocksize); + set_buffer_uptodate(bh); } - return bh; + unlock_buffer(bh); + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, inode, bh); + if (!fatal) + fatal = err; + } else { + BUFFER_TRACE(bh, "not a new buffer"); } -err: - return NULL; + if (fatal) { + *errp = fatal; + brelse(bh); + bh = NULL; + } + return bh; } struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, @@ -1860,7 +1841,7 @@ static int ext4_da_reserve_space(struct inode *inode, sector_t lblock) int retries = 0; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); - unsigned long md_needed, md_reserved; + unsigned long md_needed; int ret; /* @@ -1870,22 +1851,24 @@ static int ext4_da_reserve_space(struct inode *inode, sector_t lblock) */ repeat: spin_lock(&ei->i_block_reservation_lock); - md_reserved = ei->i_reserved_meta_blocks; md_needed = ext4_calc_metadata_amount(inode, lblock); trace_ext4_da_reserve_space(inode, md_needed); spin_unlock(&ei->i_block_reservation_lock); /* - * Make quota reservation here to prevent quota overflow - * later. Real quota accounting is done at pages writeout - * time. + * We will charge metadata quota at writeout time; this saves + * us from metadata over-estimation, though we may go over by + * a small amount in the end. Here we just reserve for data. */ - ret = dquot_reserve_block(inode, md_needed + 1); + ret = dquot_reserve_block(inode, 1); if (ret) return ret; - + /* + * We do still charge estimated metadata to the sb though; + * we cannot afford to run out of free blocks. + */ if (ext4_claim_free_blocks(sbi, md_needed + 1)) { - dquot_release_reservation_block(inode, md_needed + 1); + dquot_release_reservation_block(inode, 1); if (ext4_should_retry_alloc(inode->i_sb, &retries)) { yield(); goto repeat; @@ -1910,6 +1893,7 @@ static void ext4_da_release_space(struct inode *inode, int to_free) spin_lock(&EXT4_I(inode)->i_block_reservation_lock); + trace_ext4_da_release_space(inode, to_free); if (unlikely(to_free > ei->i_reserved_data_blocks)) { /* * if there aren't enough reserved blocks, then the @@ -1932,12 +1916,13 @@ static void ext4_da_release_space(struct inode *inode, int to_free) * only when we have written all of the delayed * allocation blocks. 
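A toy model of the delayed-allocation accounting touched above: space is reserved per block at buffered-write time and released again once writeback allocates the blocks (or the pages are dropped). The counter names mimic the kernel's, but this leaves out quota and metadata estimation and is only an illustration:

#include <assert.h>

struct toy_counters {
	long sb_dirty_blocks;		/* fs-wide, like s_dirtyblocks_counter */
	long inode_reserved_data;	/* like i_reserved_data_blocks */
};

static int toy_reserve(struct toy_counters *c, long free_blocks, long nblocks)
{
	if (c->sb_dirty_blocks + nblocks > free_blocks)
		return -1;		/* would over-commit, like ENOSPC */
	c->sb_dirty_blocks += nblocks;
	c->inode_reserved_data += nblocks;
	return 0;
}

static void toy_release(struct toy_counters *c, long nblocks)
{
	/* called when reserved blocks are written out or invalidated */
	c->inode_reserved_data -= nblocks;
	c->sb_dirty_blocks -= nblocks;
}

int main(void)
{
	struct toy_counters c = { 0, 0 };

	assert(toy_reserve(&c, 100, 8) == 0);	/* buffered write of 8 blocks */
	toy_release(&c, 8);			/* writeback allocated them */
	assert(c.sb_dirty_blocks == 0 && c.inode_reserved_data == 0);
	return 0;
}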
*/ - to_free += ei->i_reserved_meta_blocks; + percpu_counter_sub(&sbi->s_dirtyblocks_counter, + ei->i_reserved_meta_blocks); ei->i_reserved_meta_blocks = 0; ei->i_da_metadata_calc_len = 0; } - /* update fs dirty blocks counter */ + /* update fs dirty data blocks counter */ percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free); spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); @@ -2042,28 +2027,23 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd) /* * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers * - * @mpd->inode - inode to walk through - * @exbh->b_blocknr - first block on a disk - * @exbh->b_size - amount of space in bytes - * @logical - first logical block to start assignment with - * * the function goes through all passed space and put actual disk * block numbers into buffer heads, dropping BH_Delay and BH_Unwritten */ -static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical, - struct buffer_head *exbh) +static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, + struct ext4_map_blocks *map) { struct inode *inode = mpd->inode; struct address_space *mapping = inode->i_mapping; - int blocks = exbh->b_size >> inode->i_blkbits; - sector_t pblock = exbh->b_blocknr, cur_logical; + int blocks = map->m_len; + sector_t pblock = map->m_pblk, cur_logical; struct buffer_head *head, *bh; pgoff_t index, end; struct pagevec pvec; int nr_pages, i; - index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits); - end = (logical + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits); + index = map->m_lblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits); + end = (map->m_lblk + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits); cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); pagevec_init(&pvec, 0); @@ -2090,17 +2070,16 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical, /* skip blocks out of the range */ do { - if (cur_logical >= logical) + if (cur_logical >= map->m_lblk) break; cur_logical++; } while ((bh = bh->b_this_page) != head); do { - if (cur_logical >= logical + blocks) + if (cur_logical >= map->m_lblk + blocks) break; - if (buffer_delay(bh) || - buffer_unwritten(bh)) { + if (buffer_delay(bh) || buffer_unwritten(bh)) { BUG_ON(bh->b_bdev != inode->i_sb->s_bdev); @@ -2119,7 +2098,7 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical, } else if (buffer_mapped(bh)) BUG_ON(bh->b_blocknr != pblock); - if (buffer_uninit(exbh)) + if (map->m_flags & EXT4_MAP_UNINIT) set_buffer_uninit(bh); cur_logical++; pblock++; @@ -2130,21 +2109,6 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical, } -/* - * __unmap_underlying_blocks - just a helper function to unmap - * set of blocks described by @bh - */ -static inline void __unmap_underlying_blocks(struct inode *inode, - struct buffer_head *bh) -{ - struct block_device *bdev = inode->i_sb->s_bdev; - int blocks, i; - - blocks = bh->b_size >> inode->i_blkbits; - for (i = 0; i < blocks; i++) - unmap_underlying_metadata(bdev, bh->b_blocknr + i); -} - static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd, sector_t logical, long blk_cnt) { @@ -2206,7 +2170,7 @@ static void ext4_print_free_blocks(struct inode *inode) static int mpage_da_map_blocks(struct mpage_da_data *mpd) { int err, blks, get_blocks_flags; - struct buffer_head new; + struct ext4_map_blocks map; sector_t next = mpd->b_blocknr; unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits; loff_t disksize = EXT4_I(mpd->inode)->i_disksize; @@ 
-2247,15 +2211,15 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting * variables are updated after the blocks have been allocated. */ - new.b_state = 0; + map.m_lblk = next; + map.m_len = max_blocks; get_blocks_flags = EXT4_GET_BLOCKS_CREATE; if (ext4_should_dioread_nolock(mpd->inode)) get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; if (mpd->b_state & (1 << BH_Delay)) get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; - blks = ext4_get_blocks(handle, mpd->inode, next, max_blocks, - &new, get_blocks_flags); + blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags); if (blks < 0) { err = blks; /* @@ -2282,7 +2246,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) ext4_msg(mpd->inode->i_sb, KERN_CRIT, "delayed block allocation failed for inode %lu at " "logical offset %llu with max blocks %zd with " - "error %d\n", mpd->inode->i_ino, + "error %d", mpd->inode->i_ino, (unsigned long long) next, mpd->b_size >> mpd->inode->i_blkbits, err); printk(KERN_CRIT "This should not happen!! " @@ -2297,10 +2261,13 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) } BUG_ON(blks == 0); - new.b_size = (blks << mpd->inode->i_blkbits); + if (map.m_flags & EXT4_MAP_NEW) { + struct block_device *bdev = mpd->inode->i_sb->s_bdev; + int i; - if (buffer_new(&new)) - __unmap_underlying_blocks(mpd->inode, &new); + for (i = 0; i < map.m_len; i++) + unmap_underlying_metadata(bdev, map.m_pblk + i); + } /* * If blocks are delayed marked, we need to @@ -2308,7 +2275,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) */ if ((mpd->b_state & (1 << BH_Delay)) || (mpd->b_state & (1 << BH_Unwritten))) - mpage_put_bnr_to_bhs(mpd, next, &new); + mpage_put_bnr_to_bhs(mpd, &map); if (ext4_should_order_data(mpd->inode)) { err = ext4_jbd2_file_inode(handle, mpd->inode); @@ -2349,8 +2316,17 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, sector_t next; int nrblocks = mpd->b_size >> mpd->inode->i_blkbits; + /* + * XXX Don't go larger than mballoc is willing to allocate + * This is a stopgap solution. We eventually need to fold + * mpage_da_submit_io() into this function and then call + * ext4_get_blocks() multiple times in a loop + */ + if (nrblocks >= 8*1024*1024/mpd->inode->i_sb->s_blocksize) + goto flush_it; + /* check if thereserved journal credits might overflow */ - if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) { + if (!(ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS))) { if (nrblocks >= EXT4_MAX_TRANS_DATA) { /* * With non-extent format we are limited by the journal @@ -2423,17 +2399,6 @@ static int __mpage_da_writepage(struct page *page, struct buffer_head *bh, *head; sector_t logical; - if (mpd->io_done) { - /* - * Rest of the page in the page_vec - * redirty then and skip then. We will - * try to write them again after - * starting a new transaction - */ - redirty_page_for_writepage(wbc, page); - unlock_page(page); - return MPAGE_DA_EXTENT_TAIL; - } /* * Can we merge this page to current extent? */ @@ -2528,8 +2493,9 @@ static int __mpage_da_writepage(struct page *page, * initialized properly. 
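The stopgap cap added in mpage_add_bh_to_extent() above flushes the accumulated delalloc extent once it reaches 8 MiB worth of blocks. A quick illustration of what that works out to for common block sizes, using the same expression as the patch (plain arithmetic, not kernel code):

#include <stdio.h>

int main(void)
{
	unsigned int sizes[] = { 1024, 2048, 4096 };

	for (int i = 0; i < 3; i++) {
		unsigned int bs = sizes[i];

		/* same expression as the patch: 8*1024*1024/blocksize */
		printf("blocksize %u -> flush after %u blocks (8 MiB)\n",
		       bs, 8 * 1024 * 1024 / bs);
	}
	return 0;
}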
*/ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) + struct buffer_head *bh, int create) { + struct ext4_map_blocks map; int ret = 0; sector_t invalid_block = ~((sector_t) 0xffff); @@ -2537,16 +2503,22 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, invalid_block = ~0; BUG_ON(create == 0); - BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize); + BUG_ON(bh->b_size != inode->i_sb->s_blocksize); + + map.m_lblk = iblock; + map.m_len = 1; /* * first, we need to know whether the block is allocated already * preallocated blocks are unmapped but should treated * the same as allocated blocks. */ - ret = ext4_get_blocks(NULL, inode, iblock, 1, bh_result, 0); - if ((ret == 0) && !buffer_delay(bh_result)) { - /* the block isn't (pre)allocated yet, let's reserve space */ + ret = ext4_map_blocks(NULL, inode, &map, 0); + if (ret < 0) + return ret; + if (ret == 0) { + if (buffer_delay(bh)) + return 0; /* Not sure this could or should happen */ /* * XXX: __block_prepare_write() unmaps passed block, * is it OK? @@ -2556,26 +2528,26 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, /* not enough space to reserve */ return ret; - map_bh(bh_result, inode->i_sb, invalid_block); - set_buffer_new(bh_result); - set_buffer_delay(bh_result); - } else if (ret > 0) { - bh_result->b_size = (ret << inode->i_blkbits); - if (buffer_unwritten(bh_result)) { - /* A delayed write to unwritten bh should - * be marked new and mapped. Mapped ensures - * that we don't do get_block multiple times - * when we write to the same offset and new - * ensures that we do proper zero out for - * partial write. - */ - set_buffer_new(bh_result); - set_buffer_mapped(bh_result); - } - ret = 0; + map_bh(bh, inode->i_sb, invalid_block); + set_buffer_new(bh); + set_buffer_delay(bh); + return 0; } - return ret; + map_bh(bh, inode->i_sb, map.m_pblk); + bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; + + if (buffer_unwritten(bh)) { + /* A delayed write to unwritten bh should be marked + * new and mapped. Mapped ensures that we don't do + * get_block multiple times when we write to the same + * offset and new ensures that we do proper zero out + * for partial write. + */ + set_buffer_new(bh); + set_buffer_mapped(bh); + } + return 0; } /* @@ -2597,21 +2569,8 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, static int noalloc_get_block_write(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { - int ret = 0; - unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; - BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize); - - /* - * we don't want to do block allocation in writepage - * so call get_block_wrap with create = 0 - */ - ret = ext4_get_blocks(NULL, inode, iblock, max_blocks, bh_result, 0); - if (ret > 0) { - bh_result->b_size = (ret << inode->i_blkbits); - ret = 0; - } - return ret; + return _ext4_get_block(inode, iblock, bh_result, 0); } static int bget_one(handle_t *handle, struct buffer_head *bh) @@ -2821,13 +2780,131 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode) * number of contiguous block. 
So we will limit * number of contiguous block to a sane value */ - if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) && + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) && (max_blocks > EXT4_MAX_TRANS_DATA)) max_blocks = EXT4_MAX_TRANS_DATA; return ext4_chunk_trans_blocks(inode, max_blocks); } +/* + * write_cache_pages_da - walk the list of dirty pages of the given + * address space and call the callback function (which usually writes + * the pages). + * + * This is a forked version of write_cache_pages(). Differences: + * Range cyclic is ignored. + * no_nrwrite_index_update is always presumed true + */ +static int write_cache_pages_da(struct address_space *mapping, + struct writeback_control *wbc, + struct mpage_da_data *mpd) +{ + int ret = 0; + int done = 0; + struct pagevec pvec; + int nr_pages; + pgoff_t index; + pgoff_t end; /* Inclusive */ + long nr_to_write = wbc->nr_to_write; + + pagevec_init(&pvec, 0); + index = wbc->range_start >> PAGE_CACHE_SHIFT; + end = wbc->range_end >> PAGE_CACHE_SHIFT; + + while (!done && (index <= end)) { + int i; + + nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); + if (nr_pages == 0) + break; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + /* + * At this point, the page may be truncated or + * invalidated (changing page->mapping to NULL), or + * even swizzled back from swapper_space to tmpfs file + * mapping. However, page->index will not change + * because we have a reference on the page. + */ + if (page->index > end) { + done = 1; + break; + } + + lock_page(page); + + /* + * Page truncated or invalidated. We can freely skip it + * then, even for data integrity operations: the page + * has disappeared concurrently, so there could be no + * real expectation of this data interity operation + * even if there is now a new, dirty page at the same + * pagecache address. + */ + if (unlikely(page->mapping != mapping)) { +continue_unlock: + unlock_page(page); + continue; + } + + if (!PageDirty(page)) { + /* someone wrote it for us */ + goto continue_unlock; + } + + if (PageWriteback(page)) { + if (wbc->sync_mode != WB_SYNC_NONE) + wait_on_page_writeback(page); + else + goto continue_unlock; + } + + BUG_ON(PageWriteback(page)); + if (!clear_page_dirty_for_io(page)) + goto continue_unlock; + + ret = __mpage_da_writepage(page, wbc, mpd); + if (unlikely(ret)) { + if (ret == AOP_WRITEPAGE_ACTIVATE) { + unlock_page(page); + ret = 0; + } else { + done = 1; + break; + } + } + + if (nr_to_write > 0) { + nr_to_write--; + if (nr_to_write == 0 && + wbc->sync_mode == WB_SYNC_NONE) { + /* + * We stop writing back only if we are + * not doing integrity sync. In case of + * integrity sync we have to keep going + * because someone may be concurrently + * dirtying pages, and we might have + * synced a lot of newly appeared dirty + * pages, but have not synced all of the + * old dirty pages. 
+ */ + done = 1; + break; + } + } + } + pagevec_release(&pvec); + cond_resched(); + } + return ret; +} + + static int ext4_da_writepages(struct address_space *mapping, struct writeback_control *wbc) { @@ -2836,7 +2913,6 @@ static int ext4_da_writepages(struct address_space *mapping, handle_t *handle = NULL; struct mpage_da_data mpd; struct inode *inode = mapping->host; - int no_nrwrite_index_update; int pages_written = 0; long pages_skipped; unsigned int max_pages; @@ -2916,12 +2992,6 @@ static int ext4_da_writepages(struct address_space *mapping, mpd.wbc = wbc; mpd.inode = mapping->host; - /* - * we don't want write_cache_pages to update - * nr_to_write and writeback_index - */ - no_nrwrite_index_update = wbc->no_nrwrite_index_update; - wbc->no_nrwrite_index_update = 1; pages_skipped = wbc->pages_skipped; retry: @@ -2941,7 +3011,7 @@ retry: if (IS_ERR(handle)) { ret = PTR_ERR(handle); ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " - "%ld pages, ino %lu; err %d\n", __func__, + "%ld pages, ino %lu; err %d", __func__, wbc->nr_to_write, inode->i_ino, ret); goto out_writepages; } @@ -2963,8 +3033,7 @@ retry: mpd.io_done = 0; mpd.pages_written = 0; mpd.retval = 0; - ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, - &mpd); + ret = write_cache_pages_da(mapping, wbc, &mpd); /* * If we have a contiguous extent of pages and we * haven't done the I/O yet, map the blocks and submit @@ -3016,7 +3085,7 @@ retry: if (pages_skipped != wbc->pages_skipped) ext4_msg(inode->i_sb, KERN_CRIT, "This should not happen leaving %s " - "with nr_to_write = %ld ret = %d\n", + "with nr_to_write = %ld ret = %d", __func__, wbc->nr_to_write, ret); /* Update index */ @@ -3030,8 +3099,6 @@ retry: mapping->writeback_index = index; out_writepages: - if (!no_nrwrite_index_update) - wbc->no_nrwrite_index_update = 0; wbc->nr_to_write -= nr_to_writebump; wbc->range_start = range_start; trace_ext4_da_writepages_result(inode, wbc, ret, pages_written); @@ -3076,7 +3143,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) { - int ret, retries = 0, quota_retries = 0; + int ret, retries = 0; struct page *page; pgoff_t index; unsigned from, to; @@ -3135,22 +3202,6 @@ retry: if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; - - if ((ret == -EDQUOT) && - EXT4_I(inode)->i_reserved_meta_blocks && - (quota_retries++ < 3)) { - /* - * Since we often over-estimate the number of meta - * data blocks required, we may sometimes get a - * spurios out of quota error even though there would - * be enough space once we write the data blocks and - * find out how many meta data blocks were _really_ - * required. So try forcing the inode write to see if - * that helps. - */ - write_inode_now(inode, (quota_retries == 3)); - goto retry; - } out: return ret; } @@ -3546,46 +3597,18 @@ out: return ret; } +/* + * ext4_get_block used when preparing for a DIO write or buffer write. + * We allocate an uinitialized extent if blocks haven't been allocated. + * The extent will be converted to initialized after the IO is complete. 
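write_cache_pages_da() above scans the dirty-page range in pagevec-sized batches: each lookup asks for at most PAGEVEC_SIZE pages while never running past the inclusive end index. A standalone sketch of just that batching arithmetic; PAGEVEC_SIZE is assumed to be 14 here and the page lookup itself is stubbed out:

#include <stdio.h>

#define TOY_PAGEVEC_SIZE 14	/* assumed value of PAGEVEC_SIZE */

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

int main(void)
{
	unsigned long index = 100, end = 130;	/* inclusive range of page indexes */

	while (index <= end) {
		/* same shape as the lookup in write_cache_pages_da() */
		unsigned long want = min_ul(end - index,
					    TOY_PAGEVEC_SIZE - 1) + 1;

		printf("look up to %lu dirty pages starting at index %lu\n",
		       want, index);
		/* a real lookup may return fewer; assume it returns 'want' */
		index += want;
	}
	return 0;
}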
+ */ static int ext4_get_block_write(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { - handle_t *handle = ext4_journal_current_handle(); - int ret = 0; - unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; - int dio_credits; - int started = 0; - ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n", inode->i_ino, create); - /* - * ext4_get_block in prepare for a DIO write or buffer write. - * We allocate an uinitialized extent if blocks haven't been allocated. - * The extent will be converted to initialized after IO complete. - */ - create = EXT4_GET_BLOCKS_IO_CREATE_EXT; - - if (!handle) { - if (max_blocks > DIO_MAX_BLOCKS) - max_blocks = DIO_MAX_BLOCKS; - dio_credits = ext4_chunk_trans_blocks(inode, max_blocks); - handle = ext4_journal_start(inode, dio_credits); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out; - } - started = 1; - } - - ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result, - create); - if (ret > 0) { - bh_result->b_size = (ret << inode->i_blkbits); - ret = 0; - } - if (started) - ext4_journal_stop(handle); -out: - return ret; + return _ext4_get_block(inode, iblock, bh_result, + EXT4_GET_BLOCKS_IO_CREATE_EXT); } static void dump_completed_IO(struct inode * inode) @@ -3752,7 +3775,8 @@ static ext4_io_end_t *ext4_init_io_end (struct inode *inode, gfp_t flags) } static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, - ssize_t size, void *private) + ssize_t size, void *private, int ret, + bool is_async) { ext4_io_end_t *io_end = iocb->private; struct workqueue_struct *wq; @@ -3761,7 +3785,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, /* if not async direct IO or dio with 0 bytes write, just return */ if (!io_end || !size) - return; + goto out; ext_debug("ext4_end_io_dio(): io_end 0x%p" "for inode %lu, iocb 0x%p, offset %llu, size %llu\n", @@ -3772,7 +3796,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, if (io_end->flag != EXT4_IO_UNWRITTEN){ ext4_free_io_end(io_end); iocb->private = NULL; - return; + goto out; } io_end->offset = offset; @@ -3789,6 +3813,9 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, list_add_tail(&io_end->list, &ei->i_completed_io_list); spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); iocb->private = NULL; +out: + if (is_async) + aio_complete(iocb, ret, 0); } static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate) @@ -3973,7 +4000,7 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; - if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs); return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); @@ -4302,10 +4329,9 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode, if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free, count)) { - ext4_error(inode->i_sb, "inode #%lu: " - "attempt to clear blocks %llu len %lu, invalid", - inode->i_ino, (unsigned long long) block_to_free, - count); + EXT4_ERROR_INODE(inode, "attempt to clear invalid " + "blocks %llu len %lu", + (unsigned long long) block_to_free, count); return 1; } @@ -4410,11 +4436,10 @@ static void ext4_free_data(handle_t *handle, struct inode *inode, if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh)) ext4_handle_dirty_metadata(handle, inode, this_bh); else - ext4_error(inode->i_sb, - "circular indirect block 
detected, " - "inode=%lu, block=%llu", - inode->i_ino, - (unsigned long long) this_bh->b_blocknr); + EXT4_ERROR_INODE(inode, + "circular indirect block detected at " + "block %llu", + (unsigned long long) this_bh->b_blocknr); } } @@ -4452,11 +4477,10 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), nr, 1)) { - ext4_error(inode->i_sb, - "indirect mapped block in inode " - "#%lu invalid (level %d, blk #%lu)", - inode->i_ino, depth, - (unsigned long) nr); + EXT4_ERROR_INODE(inode, + "invalid indirect mapped " + "block %lu (level %d)", + (unsigned long) nr, depth); break; } @@ -4468,9 +4492,9 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, * (should be rare). */ if (!bh) { - ext4_error(inode->i_sb, - "Read failure, inode=%lu, block=%llu", - inode->i_ino, nr); + EXT4_ERROR_INODE(inode, + "Read failure block=%llu", + (unsigned long long) nr); continue; } @@ -4612,12 +4636,12 @@ void ext4_truncate(struct inode *inode) if (!ext4_can_truncate(inode)) return; - EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL; + ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS); if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); - if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { ext4_ext_truncate(inode); return; } @@ -4785,8 +4809,8 @@ static int __ext4_get_inode_loc(struct inode *inode, bh = sb_getblk(sb, block); if (!bh) { - ext4_error(sb, "unable to read inode block - " - "inode=%lu, block=%llu", inode->i_ino, block); + EXT4_ERROR_INODE(inode, "unable to read inode block - " + "block %llu", block); return -EIO; } if (!buffer_uptodate(bh)) { @@ -4884,8 +4908,8 @@ make_io: submit_bh(READ_META, bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) { - ext4_error(sb, "unable to read inode block - inode=%lu," - " block=%llu", inode->i_ino, block); + EXT4_ERROR_INODE(inode, "unable to read inode " + "block %llu", block); brelse(bh); return -EIO; } @@ -4922,20 +4946,26 @@ void ext4_set_inode_flags(struct inode *inode) /* Propagate flags from i_flags to EXT4_I(inode)->i_flags */ void ext4_get_inode_flags(struct ext4_inode_info *ei) { - unsigned int flags = ei->vfs_inode.i_flags; - - ei->i_flags &= ~(EXT4_SYNC_FL|EXT4_APPEND_FL| - EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL|EXT4_DIRSYNC_FL); - if (flags & S_SYNC) - ei->i_flags |= EXT4_SYNC_FL; - if (flags & S_APPEND) - ei->i_flags |= EXT4_APPEND_FL; - if (flags & S_IMMUTABLE) - ei->i_flags |= EXT4_IMMUTABLE_FL; - if (flags & S_NOATIME) - ei->i_flags |= EXT4_NOATIME_FL; - if (flags & S_DIRSYNC) - ei->i_flags |= EXT4_DIRSYNC_FL; + unsigned int vfs_fl; + unsigned long old_fl, new_fl; + + do { + vfs_fl = ei->vfs_inode.i_flags; + old_fl = ei->i_flags; + new_fl = old_fl & ~(EXT4_SYNC_FL|EXT4_APPEND_FL| + EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL| + EXT4_DIRSYNC_FL); + if (vfs_fl & S_SYNC) + new_fl |= EXT4_SYNC_FL; + if (vfs_fl & S_APPEND) + new_fl |= EXT4_APPEND_FL; + if (vfs_fl & S_IMMUTABLE) + new_fl |= EXT4_IMMUTABLE_FL; + if (vfs_fl & S_NOATIME) + new_fl |= EXT4_NOATIME_FL; + if (vfs_fl & S_DIRSYNC) + new_fl |= EXT4_DIRSYNC_FL; + } while (cmpxchg(&ei->i_flags, old_fl, new_fl) != old_fl); } static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, @@ -5096,8 +5126,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ret = 0; if (ei->i_file_acl && !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) { - ext4_error(sb, "bad extended attribute block 
%llu inode #%lu", - ei->i_file_acl, inode->i_ino); + EXT4_ERROR_INODE(inode, "bad extended attribute block %llu", + ei->i_file_acl); ret = -EIO; goto bad_inode; } else if (ei->i_flags & EXT4_EXTENTS_FL) { @@ -5142,8 +5172,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); } else { ret = -EIO; - ext4_error(inode->i_sb, "bogus i_mode (%o) for inode=%lu", - inode->i_mode, inode->i_ino); + EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode); goto bad_inode; } brelse(iloc.bh); @@ -5172,7 +5201,7 @@ static int ext4_inode_blocks_set(handle_t *handle, */ raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); raw_inode->i_blocks_high = 0; - ei->i_flags &= ~EXT4_HUGE_FILE_FL; + ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); return 0; } if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) @@ -5185,9 +5214,9 @@ static int ext4_inode_blocks_set(handle_t *handle, */ raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); - ei->i_flags &= ~EXT4_HUGE_FILE_FL; + ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); } else { - ei->i_flags |= EXT4_HUGE_FILE_FL; + ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE); /* i_block is stored in file system block size */ i_blocks = i_blocks >> (inode->i_blkbits - 9); raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); @@ -5381,9 +5410,9 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) if (wbc->sync_mode == WB_SYNC_ALL) sync_dirty_buffer(iloc.bh); if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { - ext4_error(inode->i_sb, "IO error syncing inode, " - "inode=%lu, block=%llu", inode->i_ino, - (unsigned long long)iloc.bh->b_blocknr); + EXT4_ERROR_INODE(inode, + "IO error syncing inode (block=%llu)", + (unsigned long long) iloc.bh->b_blocknr); err = -EIO; } brelse(iloc.bh); @@ -5455,7 +5484,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) } if (attr->ia_valid & ATTR_SIZE) { - if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) { + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); if (attr->ia_size > sbi->s_bitmap_maxbytes) { @@ -5468,7 +5497,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) if (S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE && (attr->ia_size < inode->i_size || - (EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL))) { + (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))) { handle_t *handle; handle = ext4_journal_start(inode, 3); @@ -5500,7 +5529,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) } } /* ext4_truncate will clear the flag */ - if ((EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL)) + if ((ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))) ext4_truncate(inode); } @@ -5576,7 +5605,7 @@ static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks, static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) { - if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) return ext4_indirect_trans_blocks(inode, nrblocks, chunk); return ext4_ext_index_trans_blocks(inode, nrblocks, chunk); } @@ -5911,9 +5940,9 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) */ if (val) - EXT4_I(inode)->i_flags |= EXT4_JOURNAL_DATA_FL; + ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); else - EXT4_I(inode)->i_flags &= ~EXT4_JOURNAL_DATA_FL; + ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); 
ext4_set_aops(inode); jbd2_journal_unlock_updates(journal); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 016d0249294f..bf5ae883b1bd 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -258,7 +258,7 @@ setversion_out: if (me.moved_len > 0) file_remove_suid(donor_filp); - if (copy_to_user((struct move_extent __user *)arg, + if (copy_to_user((struct move_extent __user *)arg, &me, sizeof(me))) err = -EFAULT; mext_out: @@ -373,7 +373,30 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case EXT4_IOC32_SETRSVSZ: cmd = EXT4_IOC_SETRSVSZ; break; - case EXT4_IOC_GROUP_ADD: + case EXT4_IOC32_GROUP_ADD: { + struct compat_ext4_new_group_input __user *uinput; + struct ext4_new_group_input input; + mm_segment_t old_fs; + int err; + + uinput = compat_ptr(arg); + err = get_user(input.group, &uinput->group); + err |= get_user(input.block_bitmap, &uinput->block_bitmap); + err |= get_user(input.inode_bitmap, &uinput->inode_bitmap); + err |= get_user(input.inode_table, &uinput->inode_table); + err |= get_user(input.blocks_count, &uinput->blocks_count); + err |= get_user(input.reserved_blocks, + &uinput->reserved_blocks); + if (err) + return -EFAULT; + old_fs = get_fs(); + set_fs(KERNEL_DS); + err = ext4_ioctl(file, EXT4_IOC_GROUP_ADD, + (unsigned long) &input); + set_fs(old_fs); + return err; + } + case EXT4_IOC_MOVE_EXT: break; default: return -ENOIOCTLCMD; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index b423a364dca3..12b3bc026a68 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -658,6 +658,27 @@ static void ext4_mb_mark_free_simple(struct super_block *sb, } } +/* + * Cache the order of the largest free extent we have available in this block + * group. + */ +static void +mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp) +{ + int i; + int bits; + + grp->bb_largest_free_order = -1; /* uninit */ + + bits = sb->s_blocksize_bits + 1; + for (i = bits; i >= 0; i--) { + if (grp->bb_counters[i] > 0) { + grp->bb_largest_free_order = i; + break; + } + } +} + static noinline_for_stack void ext4_mb_generate_buddy(struct super_block *sb, void *buddy, void *bitmap, ext4_group_t group) @@ -700,6 +721,7 @@ void ext4_mb_generate_buddy(struct super_block *sb, */ grp->bb_free = free; } + mb_set_largest_free_order(sb, grp); clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); @@ -725,6 +747,9 @@ void ext4_mb_generate_buddy(struct super_block *sb, * contain blocks_per_page (PAGE_CACHE_SIZE / blocksize) blocks. * So it can have information regarding groups_per_page which * is blocks_per_page/2 + * + * Locking note: This routine takes the block group lock of all groups + * for this page; do not hold this lock when calling this routine! 
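mb_set_largest_free_order() above caches, per block group, the highest buddy order that still has a free extent, so later allocation passes can reject a group without touching its buddy bitmap. A self-contained sketch of the same scan over a counters array; the array length is chosen for a 4K block size (orders 0 through s_blocksize_bits + 1) and is only an example:

#include <assert.h>

/*
 * counters[order] = number of free extents of size 2^order blocks.
 * Return the largest order with a free extent, or -1 if none.
 */
static int largest_free_order(const unsigned int *counters, int max_order)
{
	for (int i = max_order; i >= 0; i--)
		if (counters[i] > 0)
			return i;
	return -1;	/* group has no free extents at all */
}

int main(void)
{
	/* orders 0..13 for a 4K block size (s_blocksize_bits == 12) */
	unsigned int counters[14] = { 3, 1, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
	unsigned int empty[14] = { 0 };

	assert(largest_free_order(counters, 13) == 4);
	assert(largest_free_order(empty, 13) == -1);
	return 0;
}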
*/ static int ext4_mb_init_cache(struct page *page, char *incore) @@ -865,6 +890,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore) BUG_ON(incore == NULL); mb_debug(1, "put buddy for group %u in page %lu/%x\n", group, page->index, i * blocksize); + trace_ext4_mb_buddy_bitmap_load(sb, group); grinfo = ext4_get_group_info(sb, group); grinfo->bb_fragments = 0; memset(grinfo->bb_counters, 0, @@ -882,6 +908,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore) BUG_ON(incore != NULL); mb_debug(1, "put bitmap for group %u in page %lu/%x\n", group, page->index, i * blocksize); + trace_ext4_mb_bitmap_load(sb, group); /* see comments in ext4_mb_put_pa() */ ext4_lock_group(sb, group); @@ -910,6 +937,11 @@ out: return err; } +/* + * Locking note: This routine calls ext4_mb_init_cache(), which takes the + * block group lock of all groups for this page; do not hold the BG lock when + * calling this routine! + */ static noinline_for_stack int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) { @@ -1004,6 +1036,11 @@ err: return ret; } +/* + * Locking note: This routine calls ext4_mb_init_cache(), which takes the + * block group lock of all groups for this page; do not hold the BG lock when + * calling this routine! + */ static noinline_for_stack int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, struct ext4_buddy *e4b) @@ -1150,7 +1187,7 @@ err: return ret; } -static void ext4_mb_release_desc(struct ext4_buddy *e4b) +static void ext4_mb_unload_buddy(struct ext4_buddy *e4b) { if (e4b->bd_bitmap_page) page_cache_release(e4b->bd_bitmap_page); @@ -1299,6 +1336,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, buddy = buddy2; } while (1); } + mb_set_largest_free_order(sb, e4b->bd_info); mb_check_buddy(e4b); } @@ -1427,6 +1465,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) e4b->bd_info->bb_counters[ord]++; e4b->bd_info->bb_counters[ord]++; } + mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info); mb_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0); mb_check_buddy(e4b); @@ -1617,7 +1656,7 @@ int ext4_mb_try_best_found(struct ext4_allocation_context *ac, } ext4_unlock_group(ac->ac_sb, group); - ext4_mb_release_desc(e4b); + ext4_mb_unload_buddy(e4b); return 0; } @@ -1672,7 +1711,7 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, ext4_mb_use_best_found(ac, e4b); } ext4_unlock_group(ac->ac_sb, group); - ext4_mb_release_desc(e4b); + ext4_mb_unload_buddy(e4b); return 0; } @@ -1821,16 +1860,22 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, } } +/* This is now called BEFORE we load the buddy bitmap. 
*/ static int ext4_mb_good_group(struct ext4_allocation_context *ac, ext4_group_t group, int cr) { unsigned free, fragments; - unsigned i, bits; int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb)); struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); BUG_ON(cr < 0 || cr >= 4); - BUG_ON(EXT4_MB_GRP_NEED_INIT(grp)); + + /* We only do this if the grp has never been initialized */ + if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { + int ret = ext4_mb_init_group(ac->ac_sb, group); + if (ret) + return 0; + } free = grp->bb_free; fragments = grp->bb_fragments; @@ -1843,17 +1888,16 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac, case 0: BUG_ON(ac->ac_2order == 0); + if (grp->bb_largest_free_order < ac->ac_2order) + return 0; + /* Avoid using the first bg of a flexgroup for data files */ if ((ac->ac_flags & EXT4_MB_HINT_DATA) && (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) && ((group % flex_size) == 0)) return 0; - bits = ac->ac_sb->s_blocksize_bits + 1; - for (i = ac->ac_2order; i <= bits; i++) - if (grp->bb_counters[i] > 0) - return 1; - break; + return 1; case 1: if ((free / fragments) >= ac->ac_g_ex.fe_len) return 1; @@ -1964,7 +2008,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) sbi = EXT4_SB(sb); ngroups = ext4_get_groups_count(sb); /* non-extent files are limited to low blocks/groups */ - if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL)) + if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS))) ngroups = sbi->s_blockfile_groups; BUG_ON(ac->ac_status == AC_STATUS_FOUND); @@ -2024,15 +2068,11 @@ repeat: group = ac->ac_g_ex.fe_group; for (i = 0; i < ngroups; group++, i++) { - struct ext4_group_info *grp; - struct ext4_group_desc *desc; - if (group == ngroups) group = 0; - /* quick check to skip empty groups */ - grp = ext4_get_group_info(sb, group); - if (grp->bb_free == 0) + /* This now checks without needing the buddy page */ + if (!ext4_mb_good_group(ac, group, cr)) continue; err = ext4_mb_load_buddy(sb, group, &e4b); @@ -2040,15 +2080,18 @@ repeat: goto out; ext4_lock_group(sb, group); + + /* + * We need to check again after locking the + * block group + */ if (!ext4_mb_good_group(ac, group, cr)) { - /* someone did allocation from this group */ ext4_unlock_group(sb, group); - ext4_mb_release_desc(&e4b); + ext4_mb_unload_buddy(&e4b); continue; } ac->ac_groups_scanned++; - desc = ext4_get_group_desc(sb, group, NULL); if (cr == 0) ext4_mb_simple_scan_group(ac, &e4b); else if (cr == 1 && @@ -2058,7 +2101,7 @@ repeat: ext4_mb_complex_scan_group(ac, &e4b); ext4_unlock_group(sb, group); - ext4_mb_release_desc(&e4b); + ext4_mb_unload_buddy(&e4b); if (ac->ac_status != AC_STATUS_CONTINUE) break; @@ -2148,7 +2191,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) ext4_lock_group(sb, group); memcpy(&sg, ext4_get_group_info(sb, group), i); ext4_unlock_group(sb, group); - ext4_mb_release_desc(&e4b); + ext4_mb_unload_buddy(&e4b); seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free, sg.info.bb_fragments, sg.info.bb_first_free); @@ -2255,6 +2298,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); init_rwsem(&meta_group_info[i]->alloc_sem); meta_group_info[i]->bb_free_root = RB_ROOT; + meta_group_info[i]->bb_largest_free_order = -1; /* uninit */ #ifdef DOUBLE_CHECK { @@ -2536,6 +2580,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) entry->count, entry->group, entry); if (test_opt(sb, DISCARD)) { + 
int ret; ext4_fsblk_t discard_block; discard_block = entry->start_blk + @@ -2543,7 +2588,12 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) trace_ext4_discard_blocks(sb, (unsigned long long)discard_block, entry->count); - sb_issue_discard(sb, discard_block, entry->count); + ret = sb_issue_discard(sb, discard_block, entry->count); + if (ret == EOPNOTSUPP) { + ext4_warning(sb, + "discard not supported, disabling"); + clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD); + } } err = ext4_mb_load_buddy(sb, entry->group, &e4b); @@ -2568,7 +2618,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) } ext4_unlock_group(sb, entry->group); kmem_cache_free(ext4_free_ext_cachep, entry); - ext4_mb_release_desc(&e4b); + ext4_mb_unload_buddy(&e4b); } mb_debug(1, "freed %u blocks in %u structures\n", count, count2); @@ -2641,7 +2691,7 @@ int __init init_ext4_mballoc(void) void exit_ext4_mballoc(void) { - /* + /* * Wait for completion of call_rcu()'s on ext4_pspace_cachep * before destroying the slab cache. */ @@ -2981,7 +3031,7 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac) if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) { atomic_inc(&sbi->s_bal_reqs); atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated); - if (ac->ac_o_ex.fe_len >= ac->ac_g_ex.fe_len) + if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len) atomic_inc(&sbi->s_bal_success); atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned); if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && @@ -3123,7 +3173,7 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) continue; /* non-extent files can't have physical blocks past 2^32 */ - if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL) && + if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) && pa->pa_pstart + pa->pa_len > EXT4_MAX_BLOCK_FILE_PHYS) continue; @@ -3280,7 +3330,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac, spin_unlock(&pa->pa_lock); grp_blk = pa->pa_pstart; - /* + /* * If doing group-based preallocation, pa_pstart may be in the * next group when pa is used up */ @@ -3697,7 +3747,7 @@ out: ext4_unlock_group(sb, group); if (ac) kmem_cache_free(ext4_ac_cachep, ac); - ext4_mb_release_desc(&e4b); + ext4_mb_unload_buddy(&e4b); put_bh(bitmap_bh); return free; } @@ -3801,7 +3851,7 @@ repeat: if (bitmap_bh == NULL) { ext4_error(sb, "Error reading block bitmap for %u", group); - ext4_mb_release_desc(&e4b); + ext4_mb_unload_buddy(&e4b); continue; } @@ -3810,7 +3860,7 @@ repeat: ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac); ext4_unlock_group(sb, group); - ext4_mb_release_desc(&e4b); + ext4_mb_unload_buddy(&e4b); put_bh(bitmap_bh); list_del(&pa->u.pa_tmp_list); @@ -4074,7 +4124,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb, ext4_mb_release_group_pa(&e4b, pa, ac); ext4_unlock_group(sb, group); - ext4_mb_release_desc(&e4b); + ext4_mb_unload_buddy(&e4b); list_del(&pa->u.pa_tmp_list); call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); } @@ -4484,12 +4534,12 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, if (!bh) tbh = sb_find_get_block(inode->i_sb, block + i); - ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, + ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, inode, tbh, block + i); } } - /* + /* * We need to make sure we don't reuse the freed block until * after the transaction is committed, which we can do by * treating the block as metadata, below. 
We make an @@ -4610,7 +4660,7 @@ do_more: atomic_add(count, &sbi->s_flex_groups[flex_group].free_blocks); } - ext4_mb_release_desc(&e4b); + ext4_mb_unload_buddy(&e4b); freed += count; diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 34dcfc52ef44..6f3a27ec30bf 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -475,7 +475,7 @@ int ext4_ext_migrate(struct inode *inode) */ if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb, EXT4_FEATURE_INCOMPAT_EXTENTS) || - (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) + (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) return -EINVAL; if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0) diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index d1fc662cc311..52abfa12762a 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -482,6 +482,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode, int depth = ext_depth(orig_inode); int ret; + start_ext.ee_block = end_ext.ee_block = 0; o_start = o_end = oext = orig_path[depth].p_ext; oext_alen = ext4_ext_get_actual_len(oext); start_ext.ee_len = end_ext.ee_len = 0; @@ -529,7 +530,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode, * new_ext |-------| */ if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) { - ext4_error(orig_inode->i_sb, + EXT4_ERROR_INODE(orig_inode, "new_ext_end(%u) should be less than or equal to " "oext->ee_block(%u) + oext_alen(%d) - 1", new_ext_end, le32_to_cpu(oext->ee_block), @@ -692,12 +693,12 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode, while (1) { /* The extent for donor must be found. */ if (!dext) { - ext4_error(donor_inode->i_sb, + EXT4_ERROR_INODE(donor_inode, "The extent for donor must be found"); *err = -EIO; goto out; } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) { - ext4_error(donor_inode->i_sb, + EXT4_ERROR_INODE(donor_inode, "Donor offset(%u) and the first block of donor " "extent(%u) should be equal", donor_off, @@ -959,6 +960,9 @@ mext_check_arguments(struct inode *orig_inode, return -EINVAL; } + if (IS_IMMUTABLE(donor_inode) || IS_APPEND(donor_inode)) + return -EPERM; + /* Ext4 move extent does not support swapfile */ if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) { ext4_debug("ext4 move extent: The argument files should " @@ -976,11 +980,11 @@ mext_check_arguments(struct inode *orig_inode, } /* Ext4 move extent supports only extent based file */ - if (!(EXT4_I(orig_inode)->i_flags & EXT4_EXTENTS_FL)) { + if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS))) { ext4_debug("ext4 move extent: orig file is not extents " "based file [ino:orig %lu]\n", orig_inode->i_ino); return -EOPNOTSUPP; - } else if (!(EXT4_I(donor_inode)->i_flags & EXT4_EXTENTS_FL)) { + } else if (!(ext4_test_inode_flag(donor_inode, EXT4_INODE_EXTENTS))) { ext4_debug("ext4 move extent: donor file is not extents " "based file [ino:donor %lu]\n", donor_inode->i_ino); return -EOPNOTSUPP; @@ -1354,7 +1358,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, if (ret1 < 0) break; if (*moved_len > len) { - ext4_error(orig_inode->i_sb, + EXT4_ERROR_INODE(orig_inode, "We replaced blocks too much! 
" "sum of replaced: %llu requested: %llu", *moved_len, len); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 0c070fabd108..a43e6617b351 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -187,7 +187,7 @@ unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize) return blocksize; return (len & 65532) | ((len & 3) << 16); } - + __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize) { if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3)) @@ -197,7 +197,7 @@ __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize) if (len == blocksize) { if (blocksize == 65536) return cpu_to_le16(EXT4_MAX_REC_LEN); - else + else return cpu_to_le16(0); } return cpu_to_le16((len & 65532) | ((len >> 16) & 3)); @@ -349,7 +349,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, brelse(bh); } if (bcount) - printk(KERN_DEBUG "%snames %u, fullness %u (%u%%)\n", + printk(KERN_DEBUG "%snames %u, fullness %u (%u%%)\n", levels ? "" : " ", names, space/bcount, (space/bcount)*100/blocksize); return (struct stats) { names, space, bcount}; @@ -653,10 +653,10 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, int ret, err; __u32 hashval; - dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n", + dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n", start_hash, start_minor_hash)); dir = dir_file->f_path.dentry->d_inode; - if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) { + if (!(ext4_test_inode_flag(dir, EXT4_INODE_INDEX))) { hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; if (hinfo.hash_version <= DX_HASH_TEA) hinfo.hash_version += @@ -801,7 +801,7 @@ static void ext4_update_dx_flag(struct inode *inode) { if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) - EXT4_I(inode)->i_flags &= ~EXT4_INDEX_FL; + ext4_clear_inode_flag(inode, EXT4_INODE_INDEX); } /* @@ -943,8 +943,8 @@ restart: wait_on_buffer(bh); if (!buffer_uptodate(bh)) { /* read error, skip block & hope for the best */ - ext4_error(sb, "reading directory #%lu offset %lu", - dir->i_ino, (unsigned long)block); + EXT4_ERROR_INODE(dir, "reading directory lblock %lu", + (unsigned long) block); brelse(bh); goto next; } @@ -1066,15 +1066,15 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru __u32 ino = le32_to_cpu(de->inode); brelse(bh); if (!ext4_valid_inum(dir->i_sb, ino)) { - ext4_error(dir->i_sb, "bad inode number: %u", ino); + EXT4_ERROR_INODE(dir, "bad inode number: %u", ino); return ERR_PTR(-EIO); } inode = ext4_iget(dir->i_sb, ino); if (unlikely(IS_ERR(inode))) { if (PTR_ERR(inode) == -ESTALE) { - ext4_error(dir->i_sb, - "deleted inode referenced: %u", - ino); + EXT4_ERROR_INODE(dir, + "deleted inode referenced: %u", + ino); return ERR_PTR(-EIO); } else { return ERR_CAST(inode); @@ -1104,8 +1104,8 @@ struct dentry *ext4_get_parent(struct dentry *child) brelse(bh); if (!ext4_valid_inum(child->d_inode->i_sb, ino)) { - ext4_error(child->d_inode->i_sb, - "bad inode number: %u", ino); + EXT4_ERROR_INODE(child->d_inode, + "bad parent inode number: %u", ino); return ERR_PTR(-EIO); } @@ -1141,7 +1141,7 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count, unsigned rec_len = 0; while (count--) { - struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) + struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) (from + (map->offs<<2)); rec_len = EXT4_DIR_REC_LEN(de->name_len); memcpy (to, de, rec_len); @@ -1404,9 +1404,7 @@ static int make_indexed_dir(handle_t *handle, 
struct dentry *dentry, de = (struct ext4_dir_entry_2 *)((char *)fde + ext4_rec_len_from_disk(fde->rec_len, blocksize)); if ((char *) de >= (((char *) root) + blocksize)) { - ext4_error(dir->i_sb, - "invalid rec_len for '..' in inode %lu", - dir->i_ino); + EXT4_ERROR_INODE(dir, "invalid rec_len for '..'"); brelse(bh); return -EIO; } @@ -1418,7 +1416,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, brelse(bh); return retval; } - EXT4_I(dir)->i_flags |= EXT4_INDEX_FL; + ext4_set_inode_flag(dir, EXT4_INODE_INDEX); data1 = bh2->b_data; memcpy (data1, de, len); @@ -1491,7 +1489,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, retval = ext4_dx_add_entry(handle, dentry, inode); if (!retval || (retval != ERR_BAD_DX_DIR)) return retval; - EXT4_I(dir)->i_flags &= ~EXT4_INDEX_FL; + ext4_clear_inode_flag(dir, EXT4_INODE_INDEX); dx_fallback++; ext4_mark_inode_dirty(handle, dir); } @@ -1519,6 +1517,8 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize); retval = add_dirent_to_buf(handle, dentry, inode, de, bh); brelse(bh); + if (retval == 0) + ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY); return retval; } @@ -1915,9 +1915,8 @@ static int empty_dir(struct inode *inode) if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) || !(bh = ext4_bread(NULL, inode, 0, 0, &err))) { if (err) - ext4_error(inode->i_sb, - "error %d reading directory #%lu offset 0", - err, inode->i_ino); + EXT4_ERROR_INODE(inode, + "error %d reading directory lblock 0", err); else ext4_warning(inode->i_sb, "bad directory (dir #%lu) - no data block", @@ -1941,17 +1940,17 @@ static int empty_dir(struct inode *inode) de = ext4_next_entry(de1, sb->s_blocksize); while (offset < inode->i_size) { if (!bh || - (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) { + (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) { + unsigned int lblock; err = 0; brelse(bh); - bh = ext4_bread(NULL, inode, - offset >> EXT4_BLOCK_SIZE_BITS(sb), 0, &err); + lblock = offset >> EXT4_BLOCK_SIZE_BITS(sb); + bh = ext4_bread(NULL, inode, lblock, 0, &err); if (!bh) { if (err) - ext4_error(sb, - "error %d reading directory" - " #%lu offset %u", - err, inode->i_ino, offset); + EXT4_ERROR_INODE(inode, + "error %d reading directory " + "lblock %u", err, lblock); offset += sb->s_blocksize; continue; } @@ -2297,7 +2296,7 @@ retry: } } else { /* clear the extent format for fast symlink */ - EXT4_I(inode)->i_flags &= ~EXT4_EXTENTS_FL; + ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS); inode->i_op = &ext4_fast_symlink_inode_operations; memcpy((char *)&EXT4_I(inode)->i_data, symname, l); inode->i_size = l-1; diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 5692c48754a0..6df797eb9aeb 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -911,7 +911,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) percpu_counter_add(&sbi->s_freeinodes_counter, EXT4_INODES_PER_GROUP(sb)); - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) { + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) && + sbi->s_log_groups_per_flex) { ext4_group_t flex_group; flex_group = ext4_flex_group(sbi, input->group); atomic_add(input->free_blocks_count, diff --git a/fs/ext4/super.c b/fs/ext4/super.c index e14d22c170d5..4e8983a9811b 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -241,6 +241,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks) if (sb->s_flags & MS_RDONLY) return 
ERR_PTR(-EROFS); + vfs_check_frozen(sb, SB_FREEZE_WRITE); /* Special case here: if the journal has aborted behind our * backs (eg. EIO in the commit thread), then we still need to * take the FS itself readonly cleanly. */ @@ -645,6 +646,8 @@ static void ext4_put_super(struct super_block *sb) struct ext4_super_block *es = sbi->s_es; int i, err; + dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); + flush_workqueue(sbi->dio_unwritten_wq); destroy_workqueue(sbi->dio_unwritten_wq); @@ -941,6 +944,8 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0"); if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) seq_puts(seq, ",journal_async_commit"); + else if (test_opt(sb, JOURNAL_CHECKSUM)) + seq_puts(seq, ",journal_checksum"); if (test_opt(sb, NOBH)) seq_puts(seq, ",nobh"); if (test_opt(sb, I_VERSION)) @@ -1059,7 +1064,7 @@ static int ext4_release_dquot(struct dquot *dquot); static int ext4_mark_dquot_dirty(struct dquot *dquot); static int ext4_write_info(struct super_block *sb, int type); static int ext4_quota_on(struct super_block *sb, int type, int format_id, - char *path, int remount); + char *path); static int ext4_quota_on_mount(struct super_block *sb, int type); static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, size_t len, loff_t off); @@ -1081,12 +1086,12 @@ static const struct dquot_operations ext4_quota_operations = { static const struct quotactl_ops ext4_qctl_operations = { .quota_on = ext4_quota_on, - .quota_off = vfs_quota_off, - .quota_sync = vfs_quota_sync, - .get_info = vfs_get_dqinfo, - .set_info = vfs_set_dqinfo, - .get_dqblk = vfs_get_dqblk, - .set_dqblk = vfs_set_dqblk + .quota_off = dquot_quota_off, + .quota_sync = dquot_quota_sync, + .get_info = dquot_get_dqinfo, + .set_info = dquot_set_dqinfo, + .get_dqblk = dquot_get_dqblk, + .set_dqblk = dquot_set_dqblk }; #endif @@ -2051,7 +2056,7 @@ static void ext4_orphan_cleanup(struct super_block *sb, /* Turn quotas off */ for (i = 0; i < MAXQUOTAS; i++) { if (sb_dqopt(sb)->files[i]) - vfs_quota_off(sb, i, 0); + dquot_quota_off(sb, i); } #endif sb->s_flags = s_flags; /* Restore MS_RDONLY status */ @@ -2213,7 +2218,7 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi) struct ext4_attr { struct attribute attr; ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *); - ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *, + ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *, const char *, size_t); int offset; }; @@ -2430,6 +2435,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) __releases(kernel_lock) __acquires(kernel_lock) { + char *orig_data = kstrdup(data, GFP_KERNEL); struct buffer_head *bh; struct ext4_super_block *es = NULL; struct ext4_sb_info *sbi; @@ -2793,24 +2799,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) get_random_bytes(&sbi->s_next_generation, sizeof(u32)); spin_lock_init(&sbi->s_next_gen_lock); - err = percpu_counter_init(&sbi->s_freeblocks_counter, - ext4_count_free_blocks(sb)); - if (!err) { - err = percpu_counter_init(&sbi->s_freeinodes_counter, - ext4_count_free_inodes(sb)); - } - if (!err) { - err = percpu_counter_init(&sbi->s_dirs_counter, - ext4_count_dirs(sb)); - } - if (!err) { - err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0); - } - if (err) { - ext4_msg(sb, KERN_ERR, "insufficient memory"); - goto failed_mount3; - } - sbi->s_stripe = ext4_get_stripe_size(sbi); sbi->s_max_writeback_mb_bump = 
128; @@ -2910,6 +2898,20 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); no_journal: + err = percpu_counter_init(&sbi->s_freeblocks_counter, + ext4_count_free_blocks(sb)); + if (!err) + err = percpu_counter_init(&sbi->s_freeinodes_counter, + ext4_count_free_inodes(sb)); + if (!err) + err = percpu_counter_init(&sbi->s_dirs_counter, + ext4_count_dirs(sb)); + if (!err) + err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0); + if (err) { + ext4_msg(sb, KERN_ERR, "insufficient memory"); + goto failed_mount_wq; + } if (test_opt(sb, NOBH)) { if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) { ext4_msg(sb, KERN_WARNING, "Ignoring nobh option - " @@ -3001,7 +3003,7 @@ no_journal: err = ext4_setup_system_zone(sb); if (err) { ext4_msg(sb, KERN_ERR, "failed to initialize system " - "zone (%d)\n", err); + "zone (%d)", err); goto failed_mount4; } @@ -3040,9 +3042,11 @@ no_journal: } else descr = "out journal"; - ext4_msg(sb, KERN_INFO, "mounted filesystem with%s", descr); + ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. " + "Opts: %s", descr, orig_data); lock_kernel(); + kfree(orig_data); return 0; cantfind_ext4: @@ -3059,6 +3063,10 @@ failed_mount_wq: jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; } + percpu_counter_destroy(&sbi->s_freeblocks_counter); + percpu_counter_destroy(&sbi->s_freeinodes_counter); + percpu_counter_destroy(&sbi->s_dirs_counter); + percpu_counter_destroy(&sbi->s_dirtyblocks_counter); failed_mount3: if (sbi->s_flex_groups) { if (is_vmalloc_addr(sbi->s_flex_groups)) @@ -3066,10 +3074,6 @@ failed_mount3: else kfree(sbi->s_flex_groups); } - percpu_counter_destroy(&sbi->s_freeblocks_counter); - percpu_counter_destroy(&sbi->s_freeinodes_counter); - percpu_counter_destroy(&sbi->s_dirs_counter); - percpu_counter_destroy(&sbi->s_dirtyblocks_counter); failed_mount2: for (i = 0; i < db_count; i++) brelse(sbi->s_group_desc[i]); @@ -3089,6 +3093,7 @@ out_fail: kfree(sbi->s_blockgroup_lock); kfree(sbi); lock_kernel(); + kfree(orig_data); return ret; } @@ -3380,7 +3385,7 @@ static int ext4_commit_super(struct super_block *sb, int sync) if (!(sb->s_flags & MS_RDONLY)) es->s_wtime = cpu_to_le32(get_seconds()); es->s_kbytes_written = - cpu_to_le64(EXT4_SB(sb)->s_kbytes_written + + cpu_to_le64(EXT4_SB(sb)->s_kbytes_written + ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) - EXT4_SB(sb)->s_sectors_written_start) >> 1)); ext4_free_blocks_count_set(es, percpu_counter_sum_positive( @@ -3485,8 +3490,10 @@ int ext4_force_commit(struct super_block *sb) return 0; journal = EXT4_SB(sb)->s_journal; - if (journal) + if (journal) { + vfs_check_frozen(sb, SB_FREEZE_WRITE); ret = ext4_journal_force_commit(journal); + } return ret; } @@ -3535,18 +3542,16 @@ static int ext4_freeze(struct super_block *sb) * the journal. */ error = jbd2_journal_flush(journal); - if (error < 0) { - out: - jbd2_journal_unlock_updates(journal); - return error; - } + if (error < 0) + goto out; /* Journal blocked and flushed, clear needs_recovery flag. 
*/ EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); error = ext4_commit_super(sb, 1); - if (error) - goto out; - return 0; +out: + /* we rely on s_frozen to stop further updates */ + jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); + return error; } /* @@ -3563,7 +3568,6 @@ static int ext4_unfreeze(struct super_block *sb) EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); ext4_commit_super(sb, 1); unlock_super(sb); - jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); return 0; } @@ -3574,12 +3578,14 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) ext4_fsblk_t n_blocks_count = 0; unsigned long old_sb_flags; struct ext4_mount_options old_opts; + int enable_quota = 0; ext4_group_t g; unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; int err; #ifdef CONFIG_QUOTA int i; #endif + char *orig_data = kstrdup(data, GFP_KERNEL); lock_kernel(); @@ -3630,6 +3636,10 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) } if (*flags & MS_RDONLY) { + err = dquot_suspend(sb, -1); + if (err < 0) + goto restore_opts; + /* * First of all, the unconditional stuff we have to do * to disable replay of the journal when we next remount @@ -3698,6 +3708,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) goto restore_opts; if (!ext4_setup_super(sb, es, 0)) sb->s_flags &= ~MS_RDONLY; + enable_quota = 1; } } ext4_setup_system_zone(sb); @@ -3713,6 +3724,11 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) #endif unlock_super(sb); unlock_kernel(); + if (enable_quota) + dquot_resume(sb, -1); + + ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s", orig_data); + kfree(orig_data); return 0; restore_opts: @@ -3734,6 +3750,7 @@ restore_opts: #endif unlock_super(sb); unlock_kernel(); + kfree(orig_data); return err; } @@ -3906,24 +3923,21 @@ static int ext4_write_info(struct super_block *sb, int type) */ static int ext4_quota_on_mount(struct super_block *sb, int type) { - return vfs_quota_on_mount(sb, EXT4_SB(sb)->s_qf_names[type], - EXT4_SB(sb)->s_jquota_fmt, type); + return dquot_quota_on_mount(sb, EXT4_SB(sb)->s_qf_names[type], + EXT4_SB(sb)->s_jquota_fmt, type); } /* * Standard function to be called on quota_on */ static int ext4_quota_on(struct super_block *sb, int type, int format_id, - char *name, int remount) + char *name) { int err; struct path path; if (!test_opt(sb, QUOTA)) return -EINVAL; - /* When remounting, no checks are needed and in fact, name is NULL */ - if (remount) - return vfs_quota_on(sb, type, format_id, name, remount); err = kern_path(name, LOOKUP_FOLLOW, &path); if (err) @@ -3962,7 +3976,7 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, } } - err = vfs_quota_on_path(sb, type, format_id, &path); + err = dquot_quota_on_path(sb, type, format_id, &path); path_put(&path); return err; } @@ -4141,6 +4155,7 @@ static int __init init_ext4_fs(void) { int err; + ext4_check_flag_values(); err = init_ext4_system_zone(); if (err) return err; diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c index 00740cb32be3..ed9354aff279 100644 --- a/fs/ext4/symlink.c +++ b/fs/ext4/symlink.c @@ -34,6 +34,7 @@ const struct inode_operations ext4_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = page_follow_link_light, .put_link = page_put_link, + .setattr = ext4_setattr, #ifdef CONFIG_EXT4_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, @@ -45,6 +46,7 @@ const struct inode_operations ext4_symlink_inode_operations = { const struct 
inode_operations ext4_fast_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = ext4_follow_link, + .setattr = ext4_setattr, #ifdef CONFIG_EXT4_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 2de0e9515089..04338009793a 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -228,9 +228,8 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); if (ext4_xattr_check_block(bh)) { bad_block: - ext4_error(inode->i_sb, - "inode %lu: bad block %llu", inode->i_ino, - EXT4_I(inode)->i_file_acl); + EXT4_ERROR_INODE(inode, "bad block %llu", + EXT4_I(inode)->i_file_acl); error = -EIO; goto cleanup; } @@ -372,9 +371,8 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size) ea_bdebug(bh, "b_count=%d, refcount=%d", atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); if (ext4_xattr_check_block(bh)) { - ext4_error(inode->i_sb, - "inode %lu: bad block %llu", inode->i_ino, - EXT4_I(inode)->i_file_acl); + EXT4_ERROR_INODE(inode, "bad block %llu", + EXT4_I(inode)->i_file_acl); error = -EIO; goto cleanup; } @@ -666,8 +664,8 @@ ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i, atomic_read(&(bs->bh->b_count)), le32_to_cpu(BHDR(bs->bh)->h_refcount)); if (ext4_xattr_check_block(bs->bh)) { - ext4_error(sb, "inode %lu: bad block %llu", - inode->i_ino, EXT4_I(inode)->i_file_acl); + EXT4_ERROR_INODE(inode, "bad block %llu", + EXT4_I(inode)->i_file_acl); error = -EIO; goto cleanup; } @@ -820,7 +818,7 @@ inserted: EXT4_I(inode)->i_block_group); /* non-extent files can't have physical blocks past 2^32 */ - if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; block = ext4_new_meta_blocks(handle, inode, @@ -828,7 +826,7 @@ inserted: if (error) goto cleanup; - if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS); ea_idebug(inode, "creating block %d", block); @@ -880,8 +878,8 @@ cleanup_dquot: goto cleanup; bad_block: - ext4_error(inode->i_sb, "inode %lu: bad block %llu", - inode->i_ino, EXT4_I(inode)->i_file_acl); + EXT4_ERROR_INODE(inode, "bad block %llu", + EXT4_I(inode)->i_file_acl); goto cleanup; #undef header @@ -1194,8 +1192,8 @@ retry: if (!bh) goto cleanup; if (ext4_xattr_check_block(bh)) { - ext4_error(inode->i_sb, "inode %lu: bad block %llu", - inode->i_ino, EXT4_I(inode)->i_file_acl); + EXT4_ERROR_INODE(inode, "bad block %llu", + EXT4_I(inode)->i_file_acl); error = -EIO; goto cleanup; } @@ -1372,14 +1370,14 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode) goto cleanup; bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); if (!bh) { - ext4_error(inode->i_sb, "inode %lu: block %llu read error", - inode->i_ino, EXT4_I(inode)->i_file_acl); + EXT4_ERROR_INODE(inode, "block %llu read error", + EXT4_I(inode)->i_file_acl); goto cleanup; } if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || BHDR(bh)->h_blocks != cpu_to_le32(1)) { - ext4_error(inode->i_sb, "inode %lu: bad block %llu", - inode->i_ino, EXT4_I(inode)->i_file_acl); + EXT4_ERROR_INODE(inode, "bad block %llu", + EXT4_I(inode)->i_file_acl); goto cleanup; } ext4_xattr_release_block(handle, inode, bh); @@ -1504,9 +1502,8 @@ again: } bh = sb_bread(inode->i_sb, ce->e_block); if (!bh) { - ext4_error(inode->i_sb, - "inode 
%lu: block %lu read error", - inode->i_ino, (unsigned long) ce->e_block); + EXT4_ERROR_INODE(inode, "block %lu read error", + (unsigned long) ce->e_block); } else if (le32_to_cpu(BHDR(bh)->h_refcount) >= EXT4_XATTR_REFCOUNT_MAX) { ea_idebug(inode, "block %lu refcount %d>=%d", diff --git a/fs/fat/fat.h b/fs/fat/fat.h index 53dba57b49a1..27ac25725954 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -306,11 +306,11 @@ extern long fat_generic_ioctl(struct file *filp, unsigned int cmd, extern const struct file_operations fat_file_operations; extern const struct inode_operations fat_file_inode_operations; extern int fat_setattr(struct dentry * dentry, struct iattr * attr); -extern void fat_truncate(struct inode *inode); +extern int fat_setsize(struct inode *inode, loff_t offset); +extern void fat_truncate_blocks(struct inode *inode, loff_t offset); extern int fat_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); -extern int fat_file_fsync(struct file *file, struct dentry *dentry, - int datasync); +extern int fat_file_fsync(struct file *file, int datasync); /* fat/inode.c */ extern void fat_attach(struct inode *inode, loff_t i_pos); diff --git a/fs/fat/file.c b/fs/fat/file.c index a14c2f6a489e..990dfae022e5 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -149,12 +149,12 @@ static int fat_file_release(struct inode *inode, struct file *filp) return 0; } -int fat_file_fsync(struct file *filp, struct dentry *dentry, int datasync) +int fat_file_fsync(struct file *filp, int datasync) { - struct inode *inode = dentry->d_inode; + struct inode *inode = filp->f_mapping->host; int res, err; - res = simple_fsync(filp, dentry, datasync); + res = generic_file_fsync(filp, datasync); err = sync_mapping_buffers(MSDOS_SB(inode->i_sb)->fat_inode->i_mapping); return res ? res : err; @@ -283,7 +283,7 @@ static int fat_free(struct inode *inode, int skip) return fat_free_clusters(inode, free_start); } -void fat_truncate(struct inode *inode) +void fat_truncate_blocks(struct inode *inode, loff_t offset) { struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); const unsigned int cluster_size = sbi->cluster_size; @@ -293,10 +293,10 @@ void fat_truncate(struct inode *inode) * This protects against truncating a file bigger than it was then * trying to write into the hole. */ - if (MSDOS_I(inode)->mmu_private > inode->i_size) - MSDOS_I(inode)->mmu_private = inode->i_size; + if (MSDOS_I(inode)->mmu_private > offset) + MSDOS_I(inode)->mmu_private = offset; - nr_clusters = (inode->i_size + (cluster_size - 1)) >> sbi->cluster_bits; + nr_clusters = (offset + (cluster_size - 1)) >> sbi->cluster_bits; fat_free(inode, nr_clusters); fat_flush_inodes(inode->i_sb, inode, NULL); @@ -364,6 +364,18 @@ static int fat_allow_set_time(struct msdos_sb_info *sbi, struct inode *inode) return 0; } +int fat_setsize(struct inode *inode, loff_t offset) +{ + int error; + + error = simple_setsize(inode, offset); + if (error) + return error; + fat_truncate_blocks(inode, offset); + + return error; +} + #define TIMES_SET_FLAGS (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET) /* valid file mode bits */ #define FAT_VALID_MODE (S_IFREG | S_IFDIR | S_IRWXUGO) @@ -378,7 +390,8 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr) /* * Expand the file. Since inode_setattr() updates ->i_size * before calling the ->truncate(), but FAT needs to fill the - * hole before it. + * hole before it. XXX: this is no longer true with new truncate + * sequence. 
*/ if (attr->ia_valid & ATTR_SIZE) { if (attr->ia_size > inode->i_size) { @@ -427,15 +440,20 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr) attr->ia_valid &= ~ATTR_MODE; } - if (attr->ia_valid) - error = inode_setattr(inode, attr); + if (attr->ia_valid & ATTR_SIZE) { + error = fat_setsize(inode, attr->ia_size); + if (error) + goto out; + } + + generic_setattr(inode, attr); + mark_inode_dirty(inode); out: return error; } EXPORT_SYMBOL_GPL(fat_setattr); const struct inode_operations fat_file_inode_operations = { - .truncate = fat_truncate, .setattr = fat_setattr, .getattr = fat_getattr, }; diff --git a/fs/fat/inode.c b/fs/fat/inode.c index ed33904926ee..7bf45aee56d7 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -142,14 +142,29 @@ static int fat_readpages(struct file *file, struct address_space *mapping, return mpage_readpages(mapping, pages, nr_pages, fat_get_block); } +static void fat_write_failed(struct address_space *mapping, loff_t to) +{ + struct inode *inode = mapping->host; + + if (to > inode->i_size) { + truncate_pagecache(inode, to, inode->i_size); + fat_truncate_blocks(inode, inode->i_size); + } +} + static int fat_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) { + int err; + *pagep = NULL; - return cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, - fat_get_block, + err = cont_write_begin_newtrunc(file, mapping, pos, len, flags, + pagep, fsdata, fat_get_block, &MSDOS_I(mapping->host)->mmu_private); + if (err < 0) + fat_write_failed(mapping, pos + len); + return err; } static int fat_write_end(struct file *file, struct address_space *mapping, @@ -159,6 +174,8 @@ static int fat_write_end(struct file *file, struct address_space *mapping, struct inode *inode = mapping->host; int err; err = generic_write_end(file, mapping, pos, len, copied, pagep, fsdata); + if (err < len) + fat_write_failed(mapping, pos + len); if (!(err < 0) && !(MSDOS_I(inode)->i_attrs & ATTR_ARCH)) { inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; MSDOS_I(inode)->i_attrs |= ATTR_ARCH; @@ -172,7 +189,9 @@ static ssize_t fat_direct_IO(int rw, struct kiocb *iocb, loff_t offset, unsigned long nr_segs) { struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + ssize_t ret; if (rw == WRITE) { /* @@ -193,8 +212,12 @@ static ssize_t fat_direct_IO(int rw, struct kiocb *iocb, * FAT need to use the DIO_LOCKING for avoiding the race * condition of fat_get_block() and ->truncate(). 
*/ - return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, - offset, nr_segs, fat_get_block, NULL); + ret = blockdev_direct_IO_newtrunc(rw, iocb, inode, inode->i_sb->s_bdev, + iov, offset, nr_segs, fat_get_block, NULL); + if (ret < 0 && (rw & WRITE)) + fat_write_failed(mapping, offset + iov_length(iov, nr_segs)); + + return ret; } static sector_t _fat_bmap(struct address_space *mapping, sector_t block) @@ -429,7 +452,7 @@ static void fat_delete_inode(struct inode *inode) { truncate_inode_pages(&inode->i_data, 0); inode->i_size = 0; - fat_truncate(inode); + fat_truncate_blocks(inode, 0); clear_inode(inode); } diff --git a/fs/fcntl.c b/fs/fcntl.c index f74d270ba155..9d175d623aab 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -274,7 +274,7 @@ static int f_setown_ex(struct file *filp, unsigned long arg) ret = copy_from_user(&owner, owner_p, sizeof(owner)); if (ret) - return ret; + return -EFAULT; switch (owner.type) { case F_OWNER_TID: @@ -332,8 +332,11 @@ static int f_getown_ex(struct file *filp, unsigned long arg) } read_unlock(&filp->f_owner.lock); - if (!ret) + if (!ret) { ret = copy_to_user(owner_p, &owner, sizeof(owner)); + if (ret) + ret = -EFAULT; + } return ret; } @@ -730,12 +733,14 @@ static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band) { while (fa) { struct fown_struct *fown; + unsigned long flags; + if (fa->magic != FASYNC_MAGIC) { printk(KERN_ERR "kill_fasync: bad magic number in " "fasync_struct!\n"); return; } - spin_lock(&fa->fa_lock); + spin_lock_irqsave(&fa->fa_lock, flags); if (fa->fa_file) { fown = &fa->fa_file->f_owner; /* Don't send SIGURG to processes which have not set a @@ -744,7 +749,7 @@ static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band) if (!(sig == SIGURG && fown->signum == 0)) send_sigio(fown, fa->fa_fd, band); } - spin_unlock(&fa->fa_lock); + spin_unlock_irqrestore(&fa->fa_lock, flags); fa = rcu_dereference(fa->fa_next); } } diff --git a/fs/file_table.c b/fs/file_table.c index 32d12b78bac8..5c7d10ead4ad 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -194,14 +194,6 @@ struct file *alloc_file(struct path *path, fmode_t mode, } EXPORT_SYMBOL(alloc_file); -void fput(struct file *file) -{ - if (atomic_long_dec_and_test(&file->f_count)) - __fput(file); -} - -EXPORT_SYMBOL(fput); - /** * drop_file_write_access - give up ability to write to a file * @file: the file to which we will stop writing @@ -227,10 +219,9 @@ void drop_file_write_access(struct file *file) } EXPORT_SYMBOL_GPL(drop_file_write_access); -/* __fput is called from task context when aio completion releases the last - * last use of a struct file *. Do not use otherwise. 
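The file_table.c change beginning here makes __fput() static, moves its body ahead of fput(), and leaves fput() as the one public entry point that drops the final reference itself (the new definitions follow just below). The underlying idiom is the standard atomic last-reference test; a generic, self-contained sketch with hypothetical names (obj and obj_put are illustrative, not from this patch):

#include <linux/atomic.h>
#include <linux/slab.h>

struct obj {
	atomic_long_t count;		/* live references */
};

static void obj_release(struct obj *o)
{
	kfree(o);			/* last user is gone, tear down */
}

void obj_put(struct obj *o)
{
	/*
	 * atomic_long_dec_and_test() returns true for exactly one caller,
	 * the one whose decrement reaches zero, so only that caller runs
	 * the release path; everyone else just decrements and returns.
	 */
	if (atomic_long_dec_and_test(&o->count))
		obj_release(o);
}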
+/* the real guts of fput() - releasing the last reference to file */ -void __fput(struct file *file) +static void __fput(struct file *file) { struct dentry *dentry = file->f_path.dentry; struct vfsmount *mnt = file->f_path.mnt; @@ -268,6 +259,14 @@ void __fput(struct file *file) mntput(mnt); } +void fput(struct file *file) +{ + if (atomic_long_dec_and_test(&file->f_count)) + __fput(file); +} + +EXPORT_SYMBOL(fput); + struct file *fget(unsigned int fd) { struct file *file; diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c index aee049cb9f84..0ec7bb2c95c6 100644 --- a/fs/freevxfs/vxfs_lookup.c +++ b/fs/freevxfs/vxfs_lookup.c @@ -57,6 +57,8 @@ const struct inode_operations vxfs_dir_inode_ops = { }; const struct file_operations vxfs_dir_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, .readdir = vxfs_readdir, }; diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index ea8592b90696..d5be1693ac93 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -38,52 +38,18 @@ int nr_pdflush_threads; /* * Passed into wb_writeback(), essentially a subset of writeback_control */ -struct wb_writeback_args { +struct wb_writeback_work { long nr_pages; struct super_block *sb; enum writeback_sync_modes sync_mode; unsigned int for_kupdate:1; unsigned int range_cyclic:1; unsigned int for_background:1; - unsigned int sb_pinned:1; -}; -/* - * Work items for the bdi_writeback threads - */ -struct bdi_work { struct list_head list; /* pending work list */ - struct rcu_head rcu_head; /* for RCU free/clear of work */ - - unsigned long seen; /* threads that have seen this work */ - atomic_t pending; /* number of threads still to do work */ - - struct wb_writeback_args args; /* writeback arguments */ - - unsigned long state; /* flag bits, see WS_* */ + struct completion *done; /* set if the caller waits */ }; -enum { - WS_USED_B = 0, - WS_ONSTACK_B, -}; - -#define WS_USED (1 << WS_USED_B) -#define WS_ONSTACK (1 << WS_ONSTACK_B) - -static inline bool bdi_work_on_stack(struct bdi_work *work) -{ - return test_bit(WS_ONSTACK_B, &work->state); -} - -static inline void bdi_work_init(struct bdi_work *work, - struct wb_writeback_args *args) -{ - INIT_RCU_HEAD(&work->rcu_head); - work->args = *args; - work->state = WS_USED; -} - /** * writeback_in_progress - determine whether there is writeback in progress * @bdi: the device's backing_dev_info structure. @@ -96,76 +62,11 @@ int writeback_in_progress(struct backing_dev_info *bdi) return !list_empty(&bdi->work_list); } -static void bdi_work_clear(struct bdi_work *work) -{ - clear_bit(WS_USED_B, &work->state); - smp_mb__after_clear_bit(); - /* - * work can have disappeared at this point. bit waitq functions - * should be able to tolerate this, provided bdi_sched_wait does - * not dereference it's pointer argument. - */ - wake_up_bit(&work->state, WS_USED_B); -} - -static void bdi_work_free(struct rcu_head *head) -{ - struct bdi_work *work = container_of(head, struct bdi_work, rcu_head); - - if (!bdi_work_on_stack(work)) - kfree(work); - else - bdi_work_clear(work); -} - -static void wb_work_complete(struct bdi_work *work) -{ - const enum writeback_sync_modes sync_mode = work->args.sync_mode; - int onstack = bdi_work_on_stack(work); - - /* - * For allocated work, we can clear the done/seen bit right here. - * For on-stack work, we need to postpone both the clear and free - * to after the RCU grace period, since the stack could be invalidated - * as soon as bdi_work_clear() has done the wakeup. 
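Everything deleted in this stretch of fs-writeback.c (the seen/pending bookkeeping, the WS_* state bits, the bit-waitqueues and the RCU-deferred frees) existed only so that on-stack bdi_work items could be handed to multiple flusher threads safely. The replacement model, introduced in the hunks below, is a plain locked list plus an optional struct completion. A synchronous submitter then waits like this (condensed from the new sync_inodes_sb() at the end of this file; illustrative sketch):

DECLARE_COMPLETION_ONSTACK(done);
struct wb_writeback_work work = {
	.sb		= sb,
	.sync_mode	= WB_SYNC_ALL,
	.nr_pages	= LONG_MAX,
	.done		= &done,	/* ask the flusher thread to signal us */
};

bdi_queue_work(sb->s_bdi, &work);
/* the on-stack item stays valid until the flusher completes it */
wait_for_completion(&done);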
- */ - if (!onstack) - bdi_work_clear(work); - if (sync_mode == WB_SYNC_NONE || onstack) - call_rcu(&work->rcu_head, bdi_work_free); -} - -static void wb_clear_pending(struct bdi_writeback *wb, struct bdi_work *work) -{ - /* - * The caller has retrieved the work arguments from this work, - * drop our reference. If this is the last ref, delete and free it - */ - if (atomic_dec_and_test(&work->pending)) { - struct backing_dev_info *bdi = wb->bdi; - - spin_lock(&bdi->wb_lock); - list_del_rcu(&work->list); - spin_unlock(&bdi->wb_lock); - - wb_work_complete(work); - } -} - -static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work) +static void bdi_queue_work(struct backing_dev_info *bdi, + struct wb_writeback_work *work) { - work->seen = bdi->wb_mask; - BUG_ON(!work->seen); - atomic_set(&work->pending, bdi->wb_cnt); - BUG_ON(!bdi->wb_cnt); - - /* - * list_add_tail_rcu() contains the necessary barriers to - * make sure the above stores are seen before the item is - * noticed on the list - */ spin_lock(&bdi->wb_lock); - list_add_tail_rcu(&work->list, &bdi->work_list); + list_add_tail(&work->list, &bdi->work_list); spin_unlock(&bdi->wb_lock); /* @@ -182,107 +83,59 @@ static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work) } } -/* - * Used for on-stack allocated work items. The caller needs to wait until - * the wb threads have acked the work before it's safe to continue. - */ -static void bdi_wait_on_work_clear(struct bdi_work *work) -{ - wait_on_bit(&work->state, WS_USED_B, bdi_sched_wait, - TASK_UNINTERRUPTIBLE); -} - -static void bdi_alloc_queue_work(struct backing_dev_info *bdi, - struct wb_writeback_args *args, - int wait) +static void +__bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, + bool range_cyclic, bool for_background) { - struct bdi_work *work; + struct wb_writeback_work *work; /* * This is WB_SYNC_NONE writeback, so if allocation fails just * wakeup the thread for old dirty data writeback */ - work = kmalloc(sizeof(*work), GFP_ATOMIC); - if (work) { - bdi_work_init(work, args); - bdi_queue_work(bdi, work); - if (wait) - bdi_wait_on_work_clear(work); - } else { - struct bdi_writeback *wb = &bdi->wb; - - if (wb->task) - wake_up_process(wb->task); + work = kzalloc(sizeof(*work), GFP_ATOMIC); + if (!work) { + if (bdi->wb.task) + wake_up_process(bdi->wb.task); + return; } + + work->sync_mode = WB_SYNC_NONE; + work->nr_pages = nr_pages; + work->range_cyclic = range_cyclic; + work->for_background = for_background; + + bdi_queue_work(bdi, work); } /** - * bdi_sync_writeback - start and wait for writeback + * bdi_start_writeback - start writeback * @bdi: the backing device to write from - * @sb: write inodes from this super_block + * @nr_pages: the number of pages to write * * Description: - * This does WB_SYNC_ALL data integrity writeback and waits for the - * IO to complete. Callers must hold the sb s_umount semaphore for - * reading, to avoid having the super disappear before we are done. + * This does WB_SYNC_NONE opportunistic writeback. The IO is only + * started when this function returns, we make no guarentees on + * completion. Caller need not hold sb s_umount semaphore. 
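After this split there are two thin public wrappers around __bdi_start_writeback(); the second, bdi_start_background_writeback(), is defined just below. An illustrative caller (bdi here stands for any valid backing_dev_info, and the page count is arbitrary):

/* opportunistic: queue WB_SYNC_NONE writeback of up to 1024 pages, no waiting */
bdi_start_writeback(bdi, 1024);

/* background: keep writing until we drop below the dirty threshold */
bdi_start_background_writeback(bdi);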
+ * */ -static void bdi_sync_writeback(struct backing_dev_info *bdi, - struct super_block *sb) +void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages) { - struct wb_writeback_args args = { - .sb = sb, - .sync_mode = WB_SYNC_ALL, - .nr_pages = LONG_MAX, - .range_cyclic = 0, - /* - * Setting sb_pinned is not necessary for WB_SYNC_ALL, but - * lets make it explicitly clear. - */ - .sb_pinned = 1, - }; - struct bdi_work work; - - bdi_work_init(&work, &args); - work.state |= WS_ONSTACK; - - bdi_queue_work(bdi, &work); - bdi_wait_on_work_clear(&work); + __bdi_start_writeback(bdi, nr_pages, true, false); } /** - * bdi_start_writeback - start writeback + * bdi_start_background_writeback - start background writeback * @bdi: the backing device to write from - * @sb: write inodes from this super_block - * @nr_pages: the number of pages to write - * @sb_locked: caller already holds sb umount sem. * * Description: - * This does WB_SYNC_NONE opportunistic writeback. The IO is only + * This does WB_SYNC_NONE background writeback. The IO is only * started when this function returns, we make no guarentees on - * completion. Caller specifies whether sb umount sem is held already or not. - * + * completion. Caller need not hold sb s_umount semaphore. */ -void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb, - long nr_pages, int sb_locked) +void bdi_start_background_writeback(struct backing_dev_info *bdi) { - struct wb_writeback_args args = { - .sb = sb, - .sync_mode = WB_SYNC_NONE, - .nr_pages = nr_pages, - .range_cyclic = 1, - .sb_pinned = sb_locked, - }; - - /* - * We treat @nr_pages=0 as the special case to do background writeback, - * ie. to sync pages until the background dirty threshold is reached. - */ - if (!nr_pages) { - args.nr_pages = LONG_MAX; - args.for_background = 1; - } - - bdi_alloc_queue_work(bdi, &args, sb_locked); + __bdi_start_writeback(bdi, LONG_MAX, true, true); } /* @@ -572,75 +425,69 @@ select_queue: return ret; } -static void unpin_sb_for_writeback(struct super_block *sb) -{ - up_read(&sb->s_umount); - put_super(sb); -} - -enum sb_pin_state { - SB_PINNED, - SB_NOT_PINNED, - SB_PIN_FAILED -}; - /* - * For WB_SYNC_NONE writeback, the caller does not have the sb pinned + * For background writeback the caller does not have the sb pinned * before calling writeback. So make sure that we do pin it, so it doesn't * go away while we are writing inodes from it. */ -static enum sb_pin_state pin_sb_for_writeback(struct writeback_control *wbc, - struct super_block *sb) +static bool pin_sb_for_writeback(struct super_block *sb) { - /* - * Caller must already hold the ref for this - */ - if (wbc->sync_mode == WB_SYNC_ALL || wbc->sb_pinned) { - WARN_ON(!rwsem_is_locked(&sb->s_umount)); - return SB_NOT_PINNED; - } spin_lock(&sb_lock); + if (list_empty(&sb->s_instances)) { + spin_unlock(&sb_lock); + return false; + } + sb->s_count++; + spin_unlock(&sb_lock); + if (down_read_trylock(&sb->s_umount)) { - if (sb->s_root) { - spin_unlock(&sb_lock); - return SB_PINNED; - } - /* - * umounted, drop rwsem again and fall through to failure - */ + if (sb->s_root) + return true; up_read(&sb->s_umount); } - sb->s_count--; - spin_unlock(&sb_lock); - return SB_PIN_FAILED; + + put_super(sb); + return false; } /* * Write a portion of b_io inodes which belong to @sb. - * If @wbc->sb != NULL, then find and write all such + * + * If @only_this_sb is true, then find and write all such * inodes. Otherwise write only ones which go sequentially * in reverse order. 
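pin_sb_for_writeback() above also loses its three-way SB_PINNED/SB_NOT_PINNED/SB_PIN_FAILED result in favour of a plain bool, since the WB_SYNC_ALL case, where the caller already holds s_umount, no longer flows through it. The same logic as the hunk, annotated step by step for illustration:

static bool pin_sb_for_writeback(struct super_block *sb)
{
	spin_lock(&sb_lock);
	if (list_empty(&sb->s_instances)) {
		/* no longer on its fs type's list: being torn down */
		spin_unlock(&sb_lock);
		return false;
	}

	sb->s_count++;			/* passive ref: the struct won't be freed */
	spin_unlock(&sb_lock);

	if (down_read_trylock(&sb->s_umount)) {
		if (sb->s_root)		/* still mounted: pinned, caller drop_super()s later */
			return true;
		/* a concurrent umount got in first; back out */
		up_read(&sb->s_umount);
	}

	put_super(sb);			/* drop the passive ref again */
	return false;
}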
+ * * Return 1, if the caller writeback routine should be * interrupted. Otherwise return 0. */ -static int writeback_sb_inodes(struct super_block *sb, - struct bdi_writeback *wb, - struct writeback_control *wbc) +static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, + struct writeback_control *wbc, bool only_this_sb) { while (!list_empty(&wb->b_io)) { long pages_skipped; struct inode *inode = list_entry(wb->b_io.prev, struct inode, i_list); - if (wbc->sb && sb != inode->i_sb) { - /* super block given and doesn't - match, skip this inode */ - redirty_tail(inode); - continue; - } - if (sb != inode->i_sb) - /* finish with this superblock */ + + if (inode->i_sb != sb) { + if (only_this_sb) { + /* + * We only want to write back data for this + * superblock, move all inodes not belonging + * to it back onto the dirty list. + */ + redirty_tail(inode); + continue; + } + + /* + * The inode belongs to a different superblock. + * Bounce back to the caller to unpin this and + * pin the next superblock. + */ return 0; + } + if (inode->i_state & (I_NEW | I_WILL_FREE)) { requeue_io(inode); continue; @@ -678,8 +525,8 @@ static int writeback_sb_inodes(struct super_block *sb, return 1; } -static void writeback_inodes_wb(struct bdi_writeback *wb, - struct writeback_control *wbc) +void writeback_inodes_wb(struct bdi_writeback *wb, + struct writeback_control *wbc) { int ret = 0; @@ -692,24 +539,14 @@ static void writeback_inodes_wb(struct bdi_writeback *wb, struct inode *inode = list_entry(wb->b_io.prev, struct inode, i_list); struct super_block *sb = inode->i_sb; - enum sb_pin_state state; - - if (wbc->sb && sb != wbc->sb) { - /* super block given and doesn't - match, skip this inode */ - redirty_tail(inode); - continue; - } - state = pin_sb_for_writeback(wbc, sb); - if (state == SB_PIN_FAILED) { + if (!pin_sb_for_writeback(sb)) { requeue_io(inode); continue; } - ret = writeback_sb_inodes(sb, wb, wbc); + ret = writeback_sb_inodes(sb, wb, wbc, false); + drop_super(sb); - if (state == SB_PINNED) - unpin_sb_for_writeback(sb); if (ret) break; } @@ -717,11 +554,17 @@ static void writeback_inodes_wb(struct bdi_writeback *wb, /* Leave any unwritten inodes on b_io */ } -void writeback_inodes_wbc(struct writeback_control *wbc) +static void __writeback_inodes_sb(struct super_block *sb, + struct bdi_writeback *wb, struct writeback_control *wbc) { - struct backing_dev_info *bdi = wbc->bdi; + WARN_ON(!rwsem_is_locked(&sb->s_umount)); - writeback_inodes_wb(&bdi->wb, wbc); + wbc->wb_start = jiffies; /* livelock avoidance */ + spin_lock(&inode_lock); + if (!wbc->for_kupdate || list_empty(&wb->b_io)) + queue_io(wb, wbc->older_than_this); + writeback_sb_inodes(sb, wb, wbc, true); + spin_unlock(&inode_lock); } /* @@ -759,17 +602,14 @@ static inline bool over_bground_thresh(void) * all dirty pages if they are all attached to "old" mappings. 
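With the arguments folded into wb_writeback_work, the heart of wb_writeback() below reduces to two stop conditions and a page-accounting step; a condensed sketch of the loop (the kupdate timestamping and the more_io/livelock handling are elided here):

for (;;) {
	if (work->nr_pages <= 0)
		break;		/* requested amount of writeback done */
	if (work->for_background && !over_bground_thresh())
		break;		/* background target reached */

	wbc.more_io = 0;
	wbc.nr_to_write = MAX_WRITEBACK_PAGES;
	wbc.pages_skipped = 0;
	if (work->sb)
		__writeback_inodes_sb(work->sb, wb, &wbc);	/* sb pinned by caller */
	else
		writeback_inodes_wb(wb, &wbc);			/* pins each sb itself */

	work->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
	wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;
}

wbc.nr_to_write counts down as pages are written, so the difference is the number actually cleaned: if a pass comes back with nr_to_write at 200 and MAX_WRITEBACK_PAGES is 1024, that pass wrote 824 pages and nr_pages shrinks by 824.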
*/ static long wb_writeback(struct bdi_writeback *wb, - struct wb_writeback_args *args) + struct wb_writeback_work *work) { struct writeback_control wbc = { - .bdi = wb->bdi, - .sb = args->sb, - .sync_mode = args->sync_mode, + .sync_mode = work->sync_mode, .older_than_this = NULL, - .for_kupdate = args->for_kupdate, - .for_background = args->for_background, - .range_cyclic = args->range_cyclic, - .sb_pinned = args->sb_pinned, + .for_kupdate = work->for_kupdate, + .for_background = work->for_background, + .range_cyclic = work->range_cyclic, }; unsigned long oldest_jif; long wrote = 0; @@ -789,21 +629,24 @@ static long wb_writeback(struct bdi_writeback *wb, /* * Stop writeback when nr_pages has been consumed */ - if (args->nr_pages <= 0) + if (work->nr_pages <= 0) break; /* * For background writeout, stop when we are below the * background dirty threshold */ - if (args->for_background && !over_bground_thresh()) + if (work->for_background && !over_bground_thresh()) break; wbc.more_io = 0; wbc.nr_to_write = MAX_WRITEBACK_PAGES; wbc.pages_skipped = 0; - writeback_inodes_wb(wb, &wbc); - args->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; + if (work->sb) + __writeback_inodes_sb(work->sb, wb, &wbc); + else + writeback_inodes_wb(wb, &wbc); + work->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write; /* @@ -839,31 +682,21 @@ static long wb_writeback(struct bdi_writeback *wb, } /* - * Return the next bdi_work struct that hasn't been processed by this - * wb thread yet. ->seen is initially set for each thread that exists - * for this device, when a thread first notices a piece of work it - * clears its bit. Depending on writeback type, the thread will notify - * completion on either receiving the work (WB_SYNC_NONE) or after - * it is done (WB_SYNC_ALL). + * Return the next wb_writeback_work struct that hasn't been processed yet. 
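Contrast the retrieval below with the bdi_work scheme being deleted: there, a single RCU-listed item was broadcast to every flusher thread through the seen bitmask and a pending count. Now the work list is a single-consumer FIFO under wb_lock, and whichever thread pops an item owns it outright. Both halves of the protocol, condensed for illustration from bdi_queue_work() above and get_next_work_item() below:

/* submit side: append under the lock */
spin_lock(&bdi->wb_lock);
list_add_tail(&work->list, &bdi->work_list);
spin_unlock(&bdi->wb_lock);

/* consume side: pop the head under the same lock */
struct wb_writeback_work *work = NULL;

spin_lock(&bdi->wb_lock);
if (!list_empty(&bdi->work_list)) {
	work = list_entry(bdi->work_list.next,
			  struct wb_writeback_work, list);
	list_del_init(&work->list);	/* exactly one thread can claim it */
}
spin_unlock(&bdi->wb_lock);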
*/ -static struct bdi_work *get_next_work_item(struct backing_dev_info *bdi, - struct bdi_writeback *wb) +static struct wb_writeback_work * +get_next_work_item(struct backing_dev_info *bdi, struct bdi_writeback *wb) { - struct bdi_work *work, *ret = NULL; + struct wb_writeback_work *work = NULL; - rcu_read_lock(); - - list_for_each_entry_rcu(work, &bdi->work_list, list) { - if (!test_bit(wb->nr, &work->seen)) - continue; - clear_bit(wb->nr, &work->seen); - - ret = work; - break; + spin_lock(&bdi->wb_lock); + if (!list_empty(&bdi->work_list)) { + work = list_entry(bdi->work_list.next, + struct wb_writeback_work, list); + list_del_init(&work->list); } - - rcu_read_unlock(); - return ret; + spin_unlock(&bdi->wb_lock); + return work; } static long wb_check_old_data_flush(struct bdi_writeback *wb) @@ -888,14 +721,14 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb) (inodes_stat.nr_inodes - inodes_stat.nr_unused); if (nr_pages) { - struct wb_writeback_args args = { + struct wb_writeback_work work = { .nr_pages = nr_pages, .sync_mode = WB_SYNC_NONE, .for_kupdate = 1, .range_cyclic = 1, }; - return wb_writeback(wb, &args); + return wb_writeback(wb, &work); } return 0; @@ -907,36 +740,27 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb) long wb_do_writeback(struct bdi_writeback *wb, int force_wait) { struct backing_dev_info *bdi = wb->bdi; - struct bdi_work *work; + struct wb_writeback_work *work; long wrote = 0; while ((work = get_next_work_item(bdi, wb)) != NULL) { - struct wb_writeback_args args = work->args; - int post_clear; - /* * Override sync mode, in case we must wait for completion + * because this thread is exiting now. */ if (force_wait) - work->args.sync_mode = args.sync_mode = WB_SYNC_ALL; - - post_clear = WB_SYNC_ALL || args.sb_pinned; - - /* - * If this isn't a data integrity operation, just notify - * that we have seen this work and we are now starting it. - */ - if (!post_clear) - wb_clear_pending(wb, work); + work->sync_mode = WB_SYNC_ALL; - wrote += wb_writeback(wb, &args); + wrote += wb_writeback(wb, work); /* - * This is a data integrity writeback, so only do the - * notification when we have completed the work. + * Notify the caller of completion if this is a synchronous + * work item, otherwise just free it. */ - if (post_clear) - wb_clear_pending(wb, work); + if (work->done) + complete(work->done); + else + kfree(work); } /* @@ -993,42 +817,27 @@ int bdi_writeback_task(struct bdi_writeback *wb) } /* - * Schedule writeback for all backing devices. This does WB_SYNC_NONE - * writeback, for integrity writeback see bdi_sync_writeback(). + * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back + * the whole world. */ -static void bdi_writeback_all(struct super_block *sb, long nr_pages) +void wakeup_flusher_threads(long nr_pages) { - struct wb_writeback_args args = { - .sb = sb, - .nr_pages = nr_pages, - .sync_mode = WB_SYNC_NONE, - }; struct backing_dev_info *bdi; - rcu_read_lock(); + if (!nr_pages) { + nr_pages = global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS); + } + rcu_read_lock(); list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { if (!bdi_has_dirty_io(bdi)) continue; - - bdi_alloc_queue_work(bdi, &args, 0); + __bdi_start_writeback(bdi, nr_pages, false, false); } - rcu_read_unlock(); } -/* - * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back - * the whole world. 
- */ -void wakeup_flusher_threads(long nr_pages) -{ - if (nr_pages == 0) - nr_pages = global_page_state(NR_FILE_DIRTY) + - global_page_state(NR_UNSTABLE_NFS); - bdi_writeback_all(NULL, nr_pages); -} - static noinline void block_dump___mark_inode_dirty(struct inode *inode) { if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) { @@ -1220,18 +1029,6 @@ static void wait_sb_inodes(struct super_block *sb) iput(old_inode); } -static void __writeback_inodes_sb(struct super_block *sb, int sb_locked) -{ - unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY); - unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS); - long nr_to_write; - - nr_to_write = nr_dirty + nr_unstable + - (inodes_stat.nr_inodes - inodes_stat.nr_unused); - - bdi_start_writeback(sb->s_bdi, sb, nr_to_write, sb_locked); -} - /** * writeback_inodes_sb - writeback dirty inodes from given super_block * @sb: the superblock @@ -1243,21 +1040,24 @@ static void __writeback_inodes_sb(struct super_block *sb, int sb_locked) */ void writeback_inodes_sb(struct super_block *sb) { - __writeback_inodes_sb(sb, 0); -} -EXPORT_SYMBOL(writeback_inodes_sb); + unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY); + unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS); + DECLARE_COMPLETION_ONSTACK(done); + struct wb_writeback_work work = { + .sb = sb, + .sync_mode = WB_SYNC_NONE, + .done = &done, + }; -/** - * writeback_inodes_sb_locked - writeback dirty inodes from given super_block - * @sb: the superblock - * - * Like writeback_inodes_sb(), except the caller already holds the - * sb umount sem. - */ -void writeback_inodes_sb_locked(struct super_block *sb) -{ - __writeback_inodes_sb(sb, 1); + WARN_ON(!rwsem_is_locked(&sb->s_umount)); + + work.nr_pages = nr_dirty + nr_unstable + + (inodes_stat.nr_inodes - inodes_stat.nr_unused); + + bdi_queue_work(sb->s_bdi, &work); + wait_for_completion(&done); } +EXPORT_SYMBOL(writeback_inodes_sb); /** * writeback_inodes_sb_if_idle - start writeback if none underway @@ -1269,7 +1069,9 @@ void writeback_inodes_sb_locked(struct super_block *sb) int writeback_inodes_sb_if_idle(struct super_block *sb) { if (!writeback_in_progress(sb->s_bdi)) { + down_read(&sb->s_umount); writeback_inodes_sb(sb); + up_read(&sb->s_umount); return 1; } else return 0; @@ -1285,7 +1087,20 @@ EXPORT_SYMBOL(writeback_inodes_sb_if_idle); */ void sync_inodes_sb(struct super_block *sb) { - bdi_sync_writeback(sb->s_bdi, sb); + DECLARE_COMPLETION_ONSTACK(done); + struct wb_writeback_work work = { + .sb = sb, + .sync_mode = WB_SYNC_ALL, + .nr_pages = LONG_MAX, + .range_cyclic = 0, + .done = &done, + }; + + WARN_ON(!rwsem_is_locked(&sb->s_umount)); + + bdi_queue_work(sb->s_bdi, &work); + wait_for_completion(&done); + wait_sb_inodes(sb); } EXPORT_SYMBOL(sync_inodes_sb); diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c index 1e1f286dd70e..4a8eb31c5338 100644 --- a/fs/fscache/object-list.c +++ b/fs/fscache/object-list.c @@ -103,7 +103,7 @@ static struct fscache_object *fscache_objlist_lookup(loff_t *_pos) /* banners (can't represent line 0 by pos 0 as that would involve * returning a NULL pointer) */ if (pos == 0) - return (struct fscache_object *) ++(*_pos); + return (struct fscache_object *)(long)++(*_pos); if (pos < 3) return (struct fscache_object *)pos; diff --git a/fs/fscache/page.c b/fs/fscache/page.c index 47aefd376e54..723b889fd219 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -710,30 +710,26 @@ static void fscache_write_op(struct fscache_operation *_op) goto superseded; } - if (page) { 
- radix_tree_tag_set(&cookie->stores, page->index, - FSCACHE_COOKIE_STORING_TAG); - radix_tree_tag_clear(&cookie->stores, page->index, - FSCACHE_COOKIE_PENDING_TAG); - } + radix_tree_tag_set(&cookie->stores, page->index, + FSCACHE_COOKIE_STORING_TAG); + radix_tree_tag_clear(&cookie->stores, page->index, + FSCACHE_COOKIE_PENDING_TAG); spin_unlock(&cookie->stores_lock); spin_unlock(&object->lock); - if (page) { - fscache_set_op_state(&op->op, "Store"); - fscache_stat(&fscache_n_store_pages); - fscache_stat(&fscache_n_cop_write_page); - ret = object->cache->ops->write_page(op, page); - fscache_stat_d(&fscache_n_cop_write_page); - fscache_set_op_state(&op->op, "EndWrite"); - fscache_end_page_write(object, page); - if (ret < 0) { - fscache_set_op_state(&op->op, "Abort"); - fscache_abort_object(object); - } else { - fscache_enqueue_operation(&op->op); - } + fscache_set_op_state(&op->op, "Store"); + fscache_stat(&fscache_n_store_pages); + fscache_stat(&fscache_n_cop_write_page); + ret = object->cache->ops->write_page(op, page); + fscache_stat_d(&fscache_n_cop_write_page); + fscache_set_op_state(&op->op, "EndWrite"); + fscache_end_page_write(object, page); + if (ret < 0) { + fscache_set_op_state(&op->op, "Abort"); + fscache_abort_object(object); + } else { + fscache_enqueue_operation(&op->op); } _leave(""); diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index e53df5ebb2b8..9424796d6634 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -16,6 +16,9 @@ #include <linux/pagemap.h> #include <linux/file.h> #include <linux/slab.h> +#include <linux/pipe_fs_i.h> +#include <linux/swap.h> +#include <linux/splice.h> MODULE_ALIAS_MISCDEV(FUSE_MINOR); MODULE_ALIAS("devname:fuse"); @@ -499,6 +502,9 @@ struct fuse_copy_state { int write; struct fuse_req *req; const struct iovec *iov; + struct pipe_buffer *pipebufs; + struct pipe_buffer *currbuf; + struct pipe_inode_info *pipe; unsigned long nr_segs; unsigned long seglen; unsigned long addr; @@ -506,16 +512,16 @@ struct fuse_copy_state { void *mapaddr; void *buf; unsigned len; + unsigned move_pages:1; }; static void fuse_copy_init(struct fuse_copy_state *cs, struct fuse_conn *fc, - int write, struct fuse_req *req, + int write, const struct iovec *iov, unsigned long nr_segs) { memset(cs, 0, sizeof(*cs)); cs->fc = fc; cs->write = write; - cs->req = req; cs->iov = iov; cs->nr_segs = nr_segs; } @@ -523,7 +529,18 @@ static void fuse_copy_init(struct fuse_copy_state *cs, struct fuse_conn *fc, /* Unmap and put previous page of userspace buffer */ static void fuse_copy_finish(struct fuse_copy_state *cs) { - if (cs->mapaddr) { + if (cs->currbuf) { + struct pipe_buffer *buf = cs->currbuf; + + if (!cs->write) { + buf->ops->unmap(cs->pipe, buf, cs->mapaddr); + } else { + kunmap_atomic(cs->mapaddr, KM_USER0); + buf->len = PAGE_SIZE - cs->len; + } + cs->currbuf = NULL; + cs->mapaddr = NULL; + } else if (cs->mapaddr) { kunmap_atomic(cs->mapaddr, KM_USER0); if (cs->write) { flush_dcache_page(cs->pg); @@ -545,26 +562,61 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) unlock_request(cs->fc, cs->req); fuse_copy_finish(cs); - if (!cs->seglen) { - BUG_ON(!cs->nr_segs); - cs->seglen = cs->iov[0].iov_len; - cs->addr = (unsigned long) cs->iov[0].iov_base; - cs->iov++; - cs->nr_segs--; + if (cs->pipebufs) { + struct pipe_buffer *buf = cs->pipebufs; + + if (!cs->write) { + err = buf->ops->confirm(cs->pipe, buf); + if (err) + return err; + + BUG_ON(!cs->nr_segs); + cs->currbuf = buf; + cs->mapaddr = buf->ops->map(cs->pipe, buf, 1); + cs->len = buf->len; + cs->buf = cs->mapaddr + 
buf->offset; + cs->pipebufs++; + cs->nr_segs--; + } else { + struct page *page; + + if (cs->nr_segs == cs->pipe->buffers) + return -EIO; + + page = alloc_page(GFP_HIGHUSER); + if (!page) + return -ENOMEM; + + buf->page = page; + buf->offset = 0; + buf->len = 0; + + cs->currbuf = buf; + cs->mapaddr = kmap_atomic(page, KM_USER0); + cs->buf = cs->mapaddr; + cs->len = PAGE_SIZE; + cs->pipebufs++; + cs->nr_segs++; + } + } else { + if (!cs->seglen) { + BUG_ON(!cs->nr_segs); + cs->seglen = cs->iov[0].iov_len; + cs->addr = (unsigned long) cs->iov[0].iov_base; + cs->iov++; + cs->nr_segs--; + } + err = get_user_pages_fast(cs->addr, 1, cs->write, &cs->pg); + if (err < 0) + return err; + BUG_ON(err != 1); + offset = cs->addr % PAGE_SIZE; + cs->mapaddr = kmap_atomic(cs->pg, KM_USER0); + cs->buf = cs->mapaddr + offset; + cs->len = min(PAGE_SIZE - offset, cs->seglen); + cs->seglen -= cs->len; + cs->addr += cs->len; } - down_read(¤t->mm->mmap_sem); - err = get_user_pages(current, current->mm, cs->addr, 1, cs->write, 0, - &cs->pg, NULL); - up_read(¤t->mm->mmap_sem); - if (err < 0) - return err; - BUG_ON(err != 1); - offset = cs->addr % PAGE_SIZE; - cs->mapaddr = kmap_atomic(cs->pg, KM_USER0); - cs->buf = cs->mapaddr + offset; - cs->len = min(PAGE_SIZE - offset, cs->seglen); - cs->seglen -= cs->len; - cs->addr += cs->len; return lock_request(cs->fc, cs->req); } @@ -586,23 +638,178 @@ static int fuse_copy_do(struct fuse_copy_state *cs, void **val, unsigned *size) return ncpy; } +static int fuse_check_page(struct page *page) +{ + if (page_mapcount(page) || + page->mapping != NULL || + page_count(page) != 1 || + (page->flags & PAGE_FLAGS_CHECK_AT_PREP & + ~(1 << PG_locked | + 1 << PG_referenced | + 1 << PG_uptodate | + 1 << PG_lru | + 1 << PG_active | + 1 << PG_reclaim))) { + printk(KERN_WARNING "fuse: trying to steal weird page\n"); + printk(KERN_WARNING " page=%p index=%li flags=%08lx, count=%i, mapcount=%i, mapping=%p\n", page, page->index, page->flags, page_count(page), page_mapcount(page), page->mapping); + return 1; + } + return 0; +} + +static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) +{ + int err; + struct page *oldpage = *pagep; + struct page *newpage; + struct pipe_buffer *buf = cs->pipebufs; + struct address_space *mapping; + pgoff_t index; + + unlock_request(cs->fc, cs->req); + fuse_copy_finish(cs); + + err = buf->ops->confirm(cs->pipe, buf); + if (err) + return err; + + BUG_ON(!cs->nr_segs); + cs->currbuf = buf; + cs->len = buf->len; + cs->pipebufs++; + cs->nr_segs--; + + if (cs->len != PAGE_SIZE) + goto out_fallback; + + if (buf->ops->steal(cs->pipe, buf) != 0) + goto out_fallback; + + newpage = buf->page; + + if (WARN_ON(!PageUptodate(newpage))) + return -EIO; + + ClearPageMappedToDisk(newpage); + + if (fuse_check_page(newpage) != 0) + goto out_fallback_unlock; + + mapping = oldpage->mapping; + index = oldpage->index; + + /* + * This is a new and locked page, it shouldn't be mapped or + * have any special flags on it + */ + if (WARN_ON(page_mapped(oldpage))) + goto out_fallback_unlock; + if (WARN_ON(page_has_private(oldpage))) + goto out_fallback_unlock; + if (WARN_ON(PageDirty(oldpage) || PageWriteback(oldpage))) + goto out_fallback_unlock; + if (WARN_ON(PageMlocked(oldpage))) + goto out_fallback_unlock; + + remove_from_page_cache(oldpage); + page_cache_release(oldpage); + + err = add_to_page_cache_locked(newpage, mapping, index, GFP_KERNEL); + if (err) { + printk(KERN_WARNING "fuse_try_move_page: failed to add page"); + goto out_fallback_unlock; + } + 
page_cache_get(newpage); + + if (!(buf->flags & PIPE_BUF_FLAG_LRU)) + lru_cache_add_file(newpage); + + err = 0; + spin_lock(&cs->fc->lock); + if (cs->req->aborted) + err = -ENOENT; + else + *pagep = newpage; + spin_unlock(&cs->fc->lock); + + if (err) { + unlock_page(newpage); + page_cache_release(newpage); + return err; + } + + unlock_page(oldpage); + page_cache_release(oldpage); + cs->len = 0; + + return 0; + +out_fallback_unlock: + unlock_page(newpage); +out_fallback: + cs->mapaddr = buf->ops->map(cs->pipe, buf, 1); + cs->buf = cs->mapaddr + buf->offset; + + err = lock_request(cs->fc, cs->req); + if (err) + return err; + + return 1; +} + +static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page, + unsigned offset, unsigned count) +{ + struct pipe_buffer *buf; + + if (cs->nr_segs == cs->pipe->buffers) + return -EIO; + + unlock_request(cs->fc, cs->req); + fuse_copy_finish(cs); + + buf = cs->pipebufs; + page_cache_get(page); + buf->page = page; + buf->offset = offset; + buf->len = count; + + cs->pipebufs++; + cs->nr_segs++; + cs->len = 0; + + return 0; +} + /* * Copy a page in the request to/from the userspace buffer. Must be * done atomically */ -static int fuse_copy_page(struct fuse_copy_state *cs, struct page *page, +static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep, unsigned offset, unsigned count, int zeroing) { + int err; + struct page *page = *pagep; + if (page && zeroing && count < PAGE_SIZE) { void *mapaddr = kmap_atomic(page, KM_USER1); memset(mapaddr, 0, PAGE_SIZE); kunmap_atomic(mapaddr, KM_USER1); } while (count) { - if (!cs->len) { - int err = fuse_copy_fill(cs); - if (err) - return err; + if (cs->write && cs->pipebufs && page) { + return fuse_ref_page(cs, page, offset, count); + } else if (!cs->len) { + if (cs->move_pages && page && + offset == 0 && count == PAGE_SIZE) { + err = fuse_try_move_page(cs, pagep); + if (err <= 0) + return err; + } else { + err = fuse_copy_fill(cs); + if (err) + return err; + } } if (page) { void *mapaddr = kmap_atomic(page, KM_USER1); @@ -627,8 +834,10 @@ static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes, unsigned count = min(nbytes, (unsigned) PAGE_SIZE - offset); for (i = 0; i < req->num_pages && (nbytes || zeroing); i++) { - struct page *page = req->pages[i]; - int err = fuse_copy_page(cs, page, offset, count, zeroing); + int err; + + err = fuse_copy_page(cs, &req->pages[i], offset, count, + zeroing); if (err) return err; @@ -705,11 +914,10 @@ __acquires(&fc->lock) * * Called with fc->lock held, releases it */ -static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_req *req, - const struct iovec *iov, unsigned long nr_segs) +static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_copy_state *cs, + size_t nbytes, struct fuse_req *req) __releases(&fc->lock) { - struct fuse_copy_state cs; struct fuse_in_header ih; struct fuse_interrupt_in arg; unsigned reqsize = sizeof(ih) + sizeof(arg); @@ -725,14 +933,13 @@ __releases(&fc->lock) arg.unique = req->in.h.unique; spin_unlock(&fc->lock); - if (iov_length(iov, nr_segs) < reqsize) + if (nbytes < reqsize) return -EINVAL; - fuse_copy_init(&cs, fc, 1, NULL, iov, nr_segs); - err = fuse_copy_one(&cs, &ih, sizeof(ih)); + err = fuse_copy_one(cs, &ih, sizeof(ih)); if (!err) - err = fuse_copy_one(&cs, &arg, sizeof(arg)); - fuse_copy_finish(&cs); + err = fuse_copy_one(cs, &arg, sizeof(arg)); + fuse_copy_finish(cs); return err ? err : reqsize; } @@ -746,18 +953,13 @@ __releases(&fc->lock) * request_end(). 
Otherwise add it to the processing list, and set * the 'sent' flag. */ -static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +static ssize_t fuse_dev_do_read(struct fuse_conn *fc, struct file *file, + struct fuse_copy_state *cs, size_t nbytes) { int err; struct fuse_req *req; struct fuse_in *in; - struct fuse_copy_state cs; unsigned reqsize; - struct file *file = iocb->ki_filp; - struct fuse_conn *fc = fuse_get_conn(file); - if (!fc) - return -EPERM; restart: spin_lock(&fc->lock); @@ -777,7 +979,7 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov, if (!list_empty(&fc->interrupts)) { req = list_entry(fc->interrupts.next, struct fuse_req, intr_entry); - return fuse_read_interrupt(fc, req, iov, nr_segs); + return fuse_read_interrupt(fc, cs, nbytes, req); } req = list_entry(fc->pending.next, struct fuse_req, list); @@ -787,7 +989,7 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov, in = &req->in; reqsize = in->h.len; /* If request is too large, reply with an error and restart the read */ - if (iov_length(iov, nr_segs) < reqsize) { + if (nbytes < reqsize) { req->out.h.error = -EIO; /* SETXATTR is special, since it may contain too large data */ if (in->h.opcode == FUSE_SETXATTR) @@ -796,12 +998,12 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov, goto restart; } spin_unlock(&fc->lock); - fuse_copy_init(&cs, fc, 1, req, iov, nr_segs); - err = fuse_copy_one(&cs, &in->h, sizeof(in->h)); + cs->req = req; + err = fuse_copy_one(cs, &in->h, sizeof(in->h)); if (!err) - err = fuse_copy_args(&cs, in->numargs, in->argpages, + err = fuse_copy_args(cs, in->numargs, in->argpages, (struct fuse_arg *) in->args, 0); - fuse_copy_finish(&cs); + fuse_copy_finish(cs); spin_lock(&fc->lock); req->locked = 0; if (req->aborted) { @@ -829,6 +1031,110 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov, return err; } +static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct fuse_copy_state cs; + struct file *file = iocb->ki_filp; + struct fuse_conn *fc = fuse_get_conn(file); + if (!fc) + return -EPERM; + + fuse_copy_init(&cs, fc, 1, iov, nr_segs); + + return fuse_dev_do_read(fc, file, &cs, iov_length(iov, nr_segs)); +} + +static int fuse_dev_pipe_buf_steal(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + return 1; +} + +static const struct pipe_buf_operations fuse_dev_pipe_buf_ops = { + .can_merge = 0, + .map = generic_pipe_buf_map, + .unmap = generic_pipe_buf_unmap, + .confirm = generic_pipe_buf_confirm, + .release = generic_pipe_buf_release, + .steal = fuse_dev_pipe_buf_steal, + .get = generic_pipe_buf_get, +}; + +static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, unsigned int flags) +{ + int ret; + int page_nr = 0; + int do_wakeup = 0; + struct pipe_buffer *bufs; + struct fuse_copy_state cs; + struct fuse_conn *fc = fuse_get_conn(in); + if (!fc) + return -EPERM; + + bufs = kmalloc(pipe->buffers * sizeof (struct pipe_buffer), GFP_KERNEL); + if (!bufs) + return -ENOMEM; + + fuse_copy_init(&cs, fc, 1, NULL, 0); + cs.pipebufs = bufs; + cs.pipe = pipe; + ret = fuse_dev_do_read(fc, in, &cs, len); + if (ret < 0) + goto out; + + ret = 0; + pipe_lock(pipe); + + if (!pipe->readers) { + send_sig(SIGPIPE, current, 0); + if (!ret) + ret = -EPIPE; + goto out_unlock; + } + + if (pipe->nrbufs + cs.nr_segs > pipe->buffers) { + ret = -EIO; 
+ goto out_unlock; + } + + while (page_nr < cs.nr_segs) { + int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1); + struct pipe_buffer *buf = pipe->bufs + newbuf; + + buf->page = bufs[page_nr].page; + buf->offset = bufs[page_nr].offset; + buf->len = bufs[page_nr].len; + buf->ops = &fuse_dev_pipe_buf_ops; + + pipe->nrbufs++; + page_nr++; + ret += buf->len; + + if (pipe->inode) + do_wakeup = 1; + } + +out_unlock: + pipe_unlock(pipe); + + if (do_wakeup) { + smp_mb(); + if (waitqueue_active(&pipe->wait)) + wake_up_interruptible(&pipe->wait); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); + } + +out: + for (; page_nr < cs.nr_segs; page_nr++) + page_cache_release(bufs[page_nr].page); + + kfree(bufs); + return ret; +} + static int fuse_notify_poll(struct fuse_conn *fc, unsigned int size, struct fuse_copy_state *cs) { @@ -988,23 +1294,17 @@ static int copy_out_args(struct fuse_copy_state *cs, struct fuse_out *out, * it from the list and copy the rest of the buffer to the request. * The request is finished by calling request_end() */ -static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +static ssize_t fuse_dev_do_write(struct fuse_conn *fc, + struct fuse_copy_state *cs, size_t nbytes) { int err; - size_t nbytes = iov_length(iov, nr_segs); struct fuse_req *req; struct fuse_out_header oh; - struct fuse_copy_state cs; - struct fuse_conn *fc = fuse_get_conn(iocb->ki_filp); - if (!fc) - return -EPERM; - fuse_copy_init(&cs, fc, 0, NULL, iov, nr_segs); if (nbytes < sizeof(struct fuse_out_header)) return -EINVAL; - err = fuse_copy_one(&cs, &oh, sizeof(oh)); + err = fuse_copy_one(cs, &oh, sizeof(oh)); if (err) goto err_finish; @@ -1017,7 +1317,7 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov, * and error contains notification code. */ if (!oh.unique) { - err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), &cs); + err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), cs); return err ? 
err : nbytes; } @@ -1036,7 +1336,7 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov, if (req->aborted) { spin_unlock(&fc->lock); - fuse_copy_finish(&cs); + fuse_copy_finish(cs); spin_lock(&fc->lock); request_end(fc, req); return -ENOENT; @@ -1053,7 +1353,7 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov, queue_interrupt(fc, req); spin_unlock(&fc->lock); - fuse_copy_finish(&cs); + fuse_copy_finish(cs); return nbytes; } @@ -1061,11 +1361,13 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov, list_move(&req->list, &fc->io); req->out.h = oh; req->locked = 1; - cs.req = req; + cs->req = req; + if (!req->out.page_replace) + cs->move_pages = 0; spin_unlock(&fc->lock); - err = copy_out_args(&cs, &req->out, nbytes); - fuse_copy_finish(&cs); + err = copy_out_args(cs, &req->out, nbytes); + fuse_copy_finish(cs); spin_lock(&fc->lock); req->locked = 0; @@ -1081,10 +1383,101 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov, err_unlock: spin_unlock(&fc->lock); err_finish: - fuse_copy_finish(&cs); + fuse_copy_finish(cs); return err; } +static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct fuse_copy_state cs; + struct fuse_conn *fc = fuse_get_conn(iocb->ki_filp); + if (!fc) + return -EPERM; + + fuse_copy_init(&cs, fc, 0, iov, nr_segs); + + return fuse_dev_do_write(fc, &cs, iov_length(iov, nr_segs)); +} + +static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, + struct file *out, loff_t *ppos, + size_t len, unsigned int flags) +{ + unsigned nbuf; + unsigned idx; + struct pipe_buffer *bufs; + struct fuse_copy_state cs; + struct fuse_conn *fc; + size_t rem; + ssize_t ret; + + fc = fuse_get_conn(out); + if (!fc) + return -EPERM; + + bufs = kmalloc(pipe->buffers * sizeof (struct pipe_buffer), GFP_KERNEL); + if (!bufs) + return -ENOMEM; + + pipe_lock(pipe); + nbuf = 0; + rem = 0; + for (idx = 0; idx < pipe->nrbufs && rem < len; idx++) + rem += pipe->bufs[(pipe->curbuf + idx) & (pipe->buffers - 1)].len; + + ret = -EINVAL; + if (rem < len) { + pipe_unlock(pipe); + goto out; + } + + rem = len; + while (rem) { + struct pipe_buffer *ibuf; + struct pipe_buffer *obuf; + + BUG_ON(nbuf >= pipe->buffers); + BUG_ON(!pipe->nrbufs); + ibuf = &pipe->bufs[pipe->curbuf]; + obuf = &bufs[nbuf]; + + if (rem >= ibuf->len) { + *obuf = *ibuf; + ibuf->ops = NULL; + pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1); + pipe->nrbufs--; + } else { + ibuf->ops->get(pipe, ibuf); + *obuf = *ibuf; + obuf->flags &= ~PIPE_BUF_FLAG_GIFT; + obuf->len = rem; + ibuf->offset += obuf->len; + ibuf->len -= obuf->len; + } + nbuf++; + rem -= obuf->len; + } + pipe_unlock(pipe); + + fuse_copy_init(&cs, fc, 0, NULL, nbuf); + cs.pipebufs = bufs; + cs.pipe = pipe; + + if (flags & SPLICE_F_MOVE) + cs.move_pages = 1; + + ret = fuse_dev_do_write(fc, &cs, len); + + for (idx = 0; idx < nbuf; idx++) { + struct pipe_buffer *buf = &bufs[idx]; + buf->ops->release(pipe, buf); + } +out: + kfree(bufs); + return ret; +} + static unsigned fuse_dev_poll(struct file *file, poll_table *wait) { unsigned mask = POLLOUT | POLLWRNORM; @@ -1226,8 +1619,10 @@ const struct file_operations fuse_dev_operations = { .llseek = no_llseek, .read = do_sync_read, .aio_read = fuse_dev_read, + .splice_read = fuse_dev_splice_read, .write = do_sync_write, .aio_write = fuse_dev_write, + .splice_write = fuse_dev_splice_write, .poll = fuse_dev_poll, .release = fuse_dev_release, .fasync = fuse_dev_fasync, 
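
The fs/fuse/dev.c hunks above split the old read/write entry points into fuse_dev_do_read()/fuse_dev_do_write() plus thin iovec wrappers, teach fuse_copy_state about pipe buffers, and hook the new fuse_dev_splice_read()/fuse_dev_splice_write() into fuse_dev_operations. On the device write side, passing SPLICE_F_MOVE sets cs.move_pages, so fuse_try_move_page() can steal a whole pipe page into the page cache (when req->out.page_replace is set, as fuse_send_readpages() now does) instead of copying it. The sketch below is illustrative only and is not taken from the patch: it shows, under the assumptions noted in its comment, how a userspace daemon might use the new read-side hook to push a FUSE_WRITE payload into its backing file without staging it in a private buffer.

#define _GNU_SOURCE
#include <sys/types.h>
#include <fcntl.h>
#include <unistd.h>

/*
 * Illustrative sketch only -- not part of the diff.  fuse_fd is the
 * /dev/fuse descriptor obtained at mount time; backing_fd, max_req,
 * hdr_len and file_off would come from request parsing that is omitted
 * here, so treat them as assumptions.
 */
static int write_request_zero_copy(int fuse_fd, int backing_fd,
                                   size_t max_req, size_t hdr_len,
                                   loff_t file_off)
{
        char hdr[256];
        int p[2];
        ssize_t n, payload;

        if (hdr_len > sizeof(hdr) || pipe(p) == -1)
                return -1;

        /* splice_read on /dev/fuse: the whole request lands in the pipe */
        n = splice(fuse_fd, NULL, p[1], NULL, max_req, SPLICE_F_MOVE);
        if (n <= (ssize_t)hdr_len)
                goto fail;

        /* the headers are small; copy them out of the pipe and decode them */
        if (read(p[0], hdr, hdr_len) != (ssize_t)hdr_len)
                goto fail;

        /* splice the payload from the pipe into the backing file, avoiding
         * a round trip through a userspace buffer */
        payload = n - hdr_len;
        while (payload > 0) {
                ssize_t m = splice(p[0], NULL, backing_fd, &file_off,
                                   payload, SPLICE_F_MOVE);
                if (m <= 0)
                        goto fail;
                payload -= m;
        }
        close(p[0]);
        close(p[1]);
        return 0;
fail:
        close(p[0]);
        close(p[1]);
        return -1;
}

The reply path works the same way in reverse: the daemon builds the reply header and data in a pipe and splices it into the device, where fuse_dev_splice_write() collects the pipe buffers and, with SPLICE_F_MOVE, lets read replies replace page-cache pages rather than be copied.
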
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 4787ae6c5c1c..431be0795b6b 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1016,7 +1016,7 @@ static int fuse_permission(struct inode *inode, int mask) exist. So if permissions are revoked this won't be noticed immediately, only after the attribute timeout has expired */ - } else if (mask & MAY_ACCESS) { + } else if (mask & (MAY_ACCESS | MAY_CHDIR)) { err = fuse_access(inode, mask); } else if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode)) { if (!(inode->i_mode & S_IXUGO)) { @@ -1156,10 +1156,9 @@ static int fuse_dir_release(struct inode *inode, struct file *file) return 0; } -static int fuse_dir_fsync(struct file *file, struct dentry *de, int datasync) +static int fuse_dir_fsync(struct file *file, int datasync) { - /* nfsd can call this with no file */ - return file ? fuse_fsync_common(file, de, datasync, 1) : 0; + return fuse_fsync_common(file, datasync, 1); } static bool update_mtime(unsigned ivalid) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index a9f5e137f1d3..ada0adeb3bb5 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -351,10 +351,9 @@ static void fuse_sync_writes(struct inode *inode) fuse_release_nowrite(inode); } -int fuse_fsync_common(struct file *file, struct dentry *de, int datasync, - int isdir) +int fuse_fsync_common(struct file *file, int datasync, int isdir) { - struct inode *inode = de->d_inode; + struct inode *inode = file->f_mapping->host; struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_file *ff = file->private_data; struct fuse_req *req; @@ -403,9 +402,9 @@ int fuse_fsync_common(struct file *file, struct dentry *de, int datasync, return err; } -static int fuse_fsync(struct file *file, struct dentry *de, int datasync) +static int fuse_fsync(struct file *file, int datasync) { - return fuse_fsync_common(file, de, datasync, 0); + return fuse_fsync_common(file, datasync, 0); } void fuse_read_fill(struct fuse_req *req, struct file *file, loff_t pos, @@ -517,17 +516,26 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req) int i; size_t count = req->misc.read.in.size; size_t num_read = req->out.args[0].size; - struct inode *inode = req->pages[0]->mapping->host; + struct address_space *mapping = NULL; - /* - * Short read means EOF. If file size is larger, truncate it - */ - if (!req->out.h.error && num_read < count) { - loff_t pos = page_offset(req->pages[0]) + num_read; - fuse_read_update_size(inode, pos, req->misc.read.attr_ver); - } + for (i = 0; mapping == NULL && i < req->num_pages; i++) + mapping = req->pages[i]->mapping; - fuse_invalidate_attr(inode); /* atime changed */ + if (mapping) { + struct inode *inode = mapping->host; + + /* + * Short read means EOF. 
If file size is larger, truncate it + */ + if (!req->out.h.error && num_read < count) { + loff_t pos; + + pos = page_offset(req->pages[0]) + num_read; + fuse_read_update_size(inode, pos, + req->misc.read.attr_ver); + } + fuse_invalidate_attr(inode); /* atime changed */ + } for (i = 0; i < req->num_pages; i++) { struct page *page = req->pages[i]; @@ -536,6 +544,7 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req) else SetPageError(page); unlock_page(page); + page_cache_release(page); } if (req->ff) fuse_file_put(req->ff); @@ -550,6 +559,7 @@ static void fuse_send_readpages(struct fuse_req *req, struct file *file) req->out.argpages = 1; req->out.page_zeroing = 1; + req->out.page_replace = 1; fuse_read_fill(req, file, pos, count, FUSE_READ); req->misc.read.attr_ver = fuse_get_attr_version(fc); if (fc->async_read) { @@ -589,6 +599,7 @@ static int fuse_readpages_fill(void *_data, struct page *page) return PTR_ERR(req); } } + page_cache_get(page); req->pages[req->num_pages] = page; req->num_pages++; return 0; @@ -994,10 +1005,7 @@ static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf, nbytes = min_t(size_t, nbytes, FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT); npages = (nbytes + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; npages = clamp(npages, 1, FUSE_MAX_PAGES_PER_REQ); - down_read(¤t->mm->mmap_sem); - npages = get_user_pages(current, current->mm, user_addr, npages, !write, - 0, req->pages, NULL); - up_read(¤t->mm->mmap_sem); + npages = get_user_pages_fast(user_addr, npages, !write, req->pages); if (npages < 0) return npages; @@ -1580,9 +1588,9 @@ static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov, while (iov_iter_count(&ii)) { struct page *page = pages[page_idx++]; size_t todo = min_t(size_t, PAGE_SIZE, iov_iter_count(&ii)); - void *kaddr, *map; + void *kaddr; - kaddr = map = kmap(page); + kaddr = kmap(page); while (todo) { char __user *uaddr = ii.iov->iov_base + ii.iov_offset; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 01cc462ff45d..8f309f04064e 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -177,6 +177,9 @@ struct fuse_out { /** Zero partially or not copied pages */ unsigned page_zeroing:1; + /** Pages may be replaced with new ones */ + unsigned page_replace:1; + /** Number or arguments */ unsigned numargs; @@ -568,8 +571,7 @@ void fuse_release_common(struct file *file, int opcode); /** * Send FSYNC or FSYNCDIR request */ -int fuse_fsync_common(struct file *file, struct dentry *de, int datasync, - int isdir); +int fuse_fsync_common(struct file *file, int datasync, int isdir); /** * Notify poll wakeup diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index a739a0a48067..5e96cbd8a454 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -136,10 +136,7 @@ static int gfs2_writeback_writepage(struct page *page, if (ret <= 0) return ret; - ret = mpage_writepage(page, gfs2_get_block_noalloc, wbc); - if (ret == -EAGAIN) - ret = block_write_full_page(page, gfs2_get_block_noalloc, wbc); - return ret; + return nobh_writepage(page, gfs2_get_block_noalloc, wbc); } /** @@ -637,9 +634,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, } } - error = gfs2_write_alloc_required(ip, pos, len, &alloc_required); - if (error) - goto out_unlock; + alloc_required = gfs2_write_alloc_required(ip, pos, len); if (alloc_required || gfs2_is_jdata(ip)) gfs2_write_calc_reserv(ip, len, &data_blocks, &ind_blocks); @@ -700,8 +695,14 @@ out: return 0; page_cache_release(page); + + /* + * XXX(hch): the call below should 
probably be replaced with + * a call to the gfs2-specific truncate blocks helper to actually + * release disk blocks.. + */ if (pos + len > ip->i_inode.i_size) - vmtruncate(&ip->i_inode, ip->i_inode.i_size); + simple_setsize(&ip->i_inode, ip->i_inode.i_size); out_endtrans: gfs2_trans_end(sdp); out_trans_fail: diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 4a48c0f4b402..6f482809d1a3 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -1040,7 +1040,8 @@ static int trunc_start(struct gfs2_inode *ip, u64 size) goto out; if (gfs2_is_stuffed(ip)) { - u64 dsize = size + sizeof(struct gfs2_inode); + u64 dsize = size + sizeof(struct gfs2_dinode); + ip->i_disksize = size; ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; gfs2_trans_add_bh(ip->i_gl, dibh, 1); gfs2_dinode_out(ip, dibh->b_data); @@ -1243,13 +1244,12 @@ int gfs2_file_dealloc(struct gfs2_inode *ip) * @ip: the file being written to * @offset: the offset to write to * @len: the number of bytes being written - * @alloc_required: set to 1 if an alloc is required, 0 otherwise * - * Returns: errno + * Returns: 1 if an alloc is required, 0 otherwise */ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, - unsigned int len, int *alloc_required) + unsigned int len) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct buffer_head bh; @@ -1257,26 +1257,23 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, u64 lblock, lblock_stop, size; u64 end_of_file; - *alloc_required = 0; - if (!len) return 0; if (gfs2_is_stuffed(ip)) { if (offset + len > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) - *alloc_required = 1; + return 1; return 0; } - *alloc_required = 1; shift = sdp->sd_sb.sb_bsize_shift; BUG_ON(gfs2_is_dir(ip)); end_of_file = (ip->i_disksize + sdp->sd_sb.sb_bsize - 1) >> shift; lblock = offset >> shift; lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift; if (lblock_stop > end_of_file) - return 0; + return 1; size = (lblock_stop - lblock) << shift; do { @@ -1284,12 +1281,11 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, bh.b_size = size; gfs2_block_map(&ip->i_inode, lblock, &bh, 0); if (!buffer_mapped(&bh)) - return 0; + return 1; size -= bh.b_size; lblock += (bh.b_size >> ip->i_inode.i_blkbits); } while(size > 0); - *alloc_required = 0; return 0; } diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h index c983177e05ac..a20a5213135a 100644 --- a/fs/gfs2/bmap.h +++ b/fs/gfs2/bmap.h @@ -52,6 +52,6 @@ int gfs2_truncatei(struct gfs2_inode *ip, u64 size); int gfs2_truncatei_resume(struct gfs2_inode *ip); int gfs2_file_dealloc(struct gfs2_inode *ip); int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, - unsigned int len, int *alloc_required); + unsigned int len); #endif /* __BMAP_DOT_H__ */ diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 8295c5b5d4a9..b9dd88a78dd4 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -392,7 +392,7 @@ static int gfs2_dirent_find_space(const struct gfs2_dirent *dent, unsigned totlen = be16_to_cpu(dent->de_rec_len); if (gfs2_dirent_sentinel(dent)) - actual = GFS2_DIRENT_SIZE(0); + actual = 0; if (totlen - actual >= required) return 1; return 0; @@ -955,7 +955,12 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name) /* Change the pointers. Don't bother distinguishing stuffed from non-stuffed. This code is complicated enough already. 
*/ - lp = kmalloc(half_len * sizeof(__be64), GFP_NOFS | __GFP_NOFAIL); + lp = kmalloc(half_len * sizeof(__be64), GFP_NOFS); + if (!lp) { + error = -ENOMEM; + goto fail_brelse; + } + /* Change the pointers */ for (x = 0; x < half_len; x++) lp[x] = cpu_to_be64(bn); @@ -1063,7 +1068,9 @@ static int dir_double_exhash(struct gfs2_inode *dip) /* Allocate both the "from" and "to" buffers in one big chunk */ - buf = kcalloc(3, sdp->sd_hash_bsize, GFP_NOFS | __GFP_NOFAIL); + buf = kcalloc(3, sdp->sd_hash_bsize, GFP_NOFS); + if (!buf) + return -ENOMEM; for (block = dip->i_disksize >> sdp->sd_hash_bsize_shift; block--;) { error = gfs2_dir_read_data(dip, (char *)buf, @@ -1231,6 +1238,25 @@ static int do_filldir_main(struct gfs2_inode *dip, u64 *offset, return 0; } +static void *gfs2_alloc_sort_buffer(unsigned size) +{ + void *ptr = NULL; + + if (size < KMALLOC_MAX_SIZE) + ptr = kmalloc(size, GFP_NOFS | __GFP_NOWARN); + if (!ptr) + ptr = __vmalloc(size, GFP_NOFS, PAGE_KERNEL); + return ptr; +} + +static void gfs2_free_sort_buffer(void *ptr) +{ + if (is_vmalloc_addr(ptr)) + vfree(ptr); + else + kfree(ptr); +} + static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque, filldir_t filldir, int *copied, unsigned *depth, u64 leaf_no) @@ -1271,7 +1297,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque, * 99 is the maximum number of entries that can fit in a single * leaf block. */ - larr = vmalloc((leaves + entries + 99) * sizeof(void *)); + larr = gfs2_alloc_sort_buffer((leaves + entries + 99) * sizeof(void *)); if (!larr) goto out; darr = (const struct gfs2_dirent **)(larr + leaves); @@ -1282,7 +1308,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque, do { error = get_leaf(ip, lfn, &bh); if (error) - goto out_kfree; + goto out_free; lf = (struct gfs2_leaf *)bh->b_data; lfn = be64_to_cpu(lf->lf_next); if (lf->lf_entries) { @@ -1291,7 +1317,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque, gfs2_dirent_gather, NULL, &g); error = PTR_ERR(dent); if (IS_ERR(dent)) - goto out_kfree; + goto out_free; if (entries2 != g.offset) { fs_warn(sdp, "Number of entries corrupt in dir " "leaf %llu, entries2 (%u) != " @@ -1300,7 +1326,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque, entries2, g.offset); error = -EIO; - goto out_kfree; + goto out_free; } error = 0; larr[leaf++] = bh; @@ -1312,10 +1338,10 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque, BUG_ON(entries2 != entries); error = do_filldir_main(ip, offset, opaque, filldir, darr, entries, copied); -out_kfree: +out_free: for(i = 0; i < leaf; i++) brelse(larr[i]); - vfree(larr); + gfs2_free_sort_buffer(larr); out: return error; } diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index b20bfcc9fa2d..4edd662c8232 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -351,7 +351,6 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) unsigned long last_index; u64 pos = page->index << PAGE_CACHE_SHIFT; unsigned int data_blocks, ind_blocks, rblocks; - int alloc_required = 0; struct gfs2_holder gh; struct gfs2_alloc *al; int ret; @@ -364,8 +363,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) set_bit(GLF_DIRTY, &ip->i_gl->gl_flags); set_bit(GIF_SW_PAGED, &ip->i_flags); - ret = gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE, &alloc_required); - if (ret || !alloc_required) + if (!gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE)) goto 
out_unlock; ret = -ENOMEM; al = gfs2_alloc_get(ip); @@ -554,9 +552,9 @@ static int gfs2_close(struct inode *inode, struct file *file) * Returns: errno */ -static int gfs2_fsync(struct file *file, struct dentry *dentry, int datasync) +static int gfs2_fsync(struct file *file, int datasync) { - struct inode *inode = dentry->d_inode; + struct inode *inode = file->f_mapping->host; int sync_state = inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC); int ret = 0; diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index ddcdbf493536..9adf8f924e08 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -328,6 +328,30 @@ static void gfs2_holder_wake(struct gfs2_holder *gh) } /** + * do_error - Something unexpected has happened during a lock request + * + */ + +static inline void do_error(struct gfs2_glock *gl, const int ret) +{ + struct gfs2_holder *gh, *tmp; + + list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) { + if (test_bit(HIF_HOLDER, &gh->gh_iflags)) + continue; + if (ret & LM_OUT_ERROR) + gh->gh_error = -EIO; + else if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) + gh->gh_error = GLR_TRYFAILED; + else + continue; + list_del_init(&gh->gh_list); + trace_gfs2_glock_queue(gh, 0); + gfs2_holder_wake(gh); + } +} + +/** * do_promote - promote as many requests as possible on the current queue * @gl: The glock * @@ -375,36 +399,13 @@ restart: } if (gh->gh_list.prev == &gl->gl_holders) return 1; + do_error(gl, 0); break; } return 0; } /** - * do_error - Something unexpected has happened during a lock request - * - */ - -static inline void do_error(struct gfs2_glock *gl, const int ret) -{ - struct gfs2_holder *gh, *tmp; - - list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) { - if (test_bit(HIF_HOLDER, &gh->gh_iflags)) - continue; - if (ret & LM_OUT_ERROR) - gh->gh_error = -EIO; - else if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) - gh->gh_error = GLR_TRYFAILED; - else - continue; - list_del_init(&gh->gh_list); - trace_gfs2_glock_queue(gh, 0); - gfs2_holder_wake(gh); - } -} - -/** * find_first_waiter - find the first gh that's waiting for the glock * @gl: the glock */ @@ -1062,6 +1063,9 @@ int gfs2_glock_nq(struct gfs2_holder *gh) spin_lock(&gl->gl_spin); add_to_queue(gh); + if ((LM_FLAG_NOEXP & gh->gh_flags) && + test_and_clear_bit(GLF_FROZEN, &gl->gl_flags)) + set_bit(GLF_REPLY_PENDING, &gl->gl_flags); run_queue(gl, 1); spin_unlock(&gl->gl_spin); @@ -1319,6 +1323,36 @@ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state) } /** + * gfs2_should_freeze - Figure out if glock should be frozen + * @gl: The glock in question + * + * Glocks are not frozen if (a) the result of the dlm operation is + * an error, (b) the locking operation was an unlock operation or + * (c) if there is a "noexp" flagged request anywhere in the queue + * + * Returns: 1 if freezing should occur, 0 otherwise + */ + +static int gfs2_should_freeze(const struct gfs2_glock *gl) +{ + const struct gfs2_holder *gh; + + if (gl->gl_reply & ~LM_OUT_ST_MASK) + return 0; + if (gl->gl_target == LM_ST_UNLOCKED) + return 0; + + list_for_each_entry(gh, &gl->gl_holders, gh_list) { + if (test_bit(HIF_HOLDER, &gh->gh_iflags)) + continue; + if (LM_FLAG_NOEXP & gh->gh_flags) + return 0; + } + + return 1; +} + +/** * gfs2_glock_complete - Callback used by locking * @gl: Pointer to the glock * @ret: The return value from the dlm @@ -1328,18 +1362,17 @@ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state) void gfs2_glock_complete(struct gfs2_glock *gl, int ret) { struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct; 
+ gl->gl_reply = ret; + if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags))) { - struct gfs2_holder *gh; spin_lock(&gl->gl_spin); - gh = find_first_waiter(gl); - if ((!(gh && (gh->gh_flags & LM_FLAG_NOEXP)) && - (gl->gl_target != LM_ST_UNLOCKED)) || - ((ret & ~LM_OUT_ST_MASK) != 0)) + if (gfs2_should_freeze(gl)) { set_bit(GLF_FROZEN, &gl->gl_flags); - spin_unlock(&gl->gl_spin); - if (test_bit(GLF_FROZEN, &gl->gl_flags)) + spin_unlock(&gl->gl_spin); return; + } + spin_unlock(&gl->gl_spin); } set_bit(GLF_REPLY_PENDING, &gl->gl_flags); gfs2_glock_hold(gl); @@ -1348,7 +1381,7 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret) } -static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask) +static int gfs2_shrink_glock_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask) { struct gfs2_glock *gl; int may_demote; diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index b5d7363b22da..8fcbce48a128 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -460,6 +460,7 @@ enum { SDF_NOBARRIERS = 3, SDF_NORECOVERY = 4, SDF_DEMOTE = 5, + SDF_NOJOURNALID = 6, }; #define GFS2_FSNAME_LEN 256 diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index b5612cbb62a5..f03afd9c44bc 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -169,7 +169,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, { struct inode *inode; struct gfs2_inode *ip; - struct gfs2_glock *io_gl; + struct gfs2_glock *io_gl = NULL; int error; inode = gfs2_iget(sb, no_addr); @@ -198,6 +198,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, ip->i_iopen_gh.gh_gl->gl_object = ip; gfs2_glock_put(io_gl); + io_gl = NULL; if ((type == DT_UNKNOWN) && (no_formal_ino == 0)) goto gfs2_nfsbypass; @@ -228,7 +229,8 @@ gfs2_nfsbypass: fail_glock: gfs2_glock_dq(&ip->i_iopen_gh); fail_iopen: - gfs2_glock_put(io_gl); + if (io_gl) + gfs2_glock_put(io_gl); fail_put: if (inode->i_state & I_NEW) ip->i_gl->gl_object = NULL; @@ -256,7 +258,7 @@ void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr) { struct gfs2_sbd *sdp; struct gfs2_inode *ip; - struct gfs2_glock *io_gl; + struct gfs2_glock *io_gl = NULL; int error; struct gfs2_holder gh; struct inode *inode; @@ -293,6 +295,7 @@ void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr) ip->i_iopen_gh.gh_gl->gl_object = ip; gfs2_glock_put(io_gl); + io_gl = NULL; inode->i_mode = DT2IF(DT_UNKNOWN); @@ -319,7 +322,8 @@ void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr) fail_glock: gfs2_glock_dq(&ip->i_iopen_gh); fail_iopen: - gfs2_glock_put(io_gl); + if (io_gl) + gfs2_glock_put(io_gl); fail_put: ip->i_gl->gl_object = NULL; gfs2_glock_put(ip->i_gl); diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 3593b3a7290e..45a4a36195d8 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -76,7 +76,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) sb->s_fs_info = sdp; sdp->sd_vfs = sb; - + set_bit(SDF_NOJOURNALID, &sdp->sd_flags); gfs2_tune_init(&sdp->sd_tune); init_waitqueue_head(&sdp->sd_glock_wait); @@ -1050,7 +1050,8 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent) ret = match_int(&tmp[0], &option); if (ret || option < 0) goto hostdata_error; - ls->ls_jid = option; + if (test_and_clear_bit(SDF_NOJOURNALID, &sdp->sd_flags)) + ls->ls_jid = option; break; case Opt_id: /* Obsolete, but left for backward compat purposes */ @@ -1102,6 +1103,24 @@ void gfs2_lm_unmount(struct gfs2_sbd *sdp) lm->lm_unmount(sdp); } +static int gfs2_journalid_wait(void *word) +{ + if (signal_pending(current)) + return -EINTR; + 
schedule(); + return 0; +} + +static int wait_on_journal(struct gfs2_sbd *sdp) +{ + if (sdp->sd_args.ar_spectator) + return 0; + if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL) + return 0; + + return wait_on_bit(&sdp->sd_flags, SDF_NOJOURNALID, gfs2_journalid_wait, TASK_INTERRUPTIBLE); +} + void gfs2_online_uevent(struct gfs2_sbd *sdp) { struct super_block *sb = sdp->sd_vfs; @@ -1194,6 +1213,10 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent if (error) goto fail_locking; + error = wait_on_journal(sdp); + if (error) + goto fail_sb; + error = init_inodes(sdp, DO); if (error) goto fail_sb; diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c index 4e64352d49de..98cdd05f3316 100644 --- a/fs/gfs2/ops_inode.c +++ b/fs/gfs2/ops_inode.c @@ -1071,6 +1071,9 @@ int gfs2_permission(struct inode *inode, int mask) return error; } +/* + * XXX: should be changed to have proper ordering by opencoding simple_setsize + */ static int setattr_size(struct inode *inode, struct iattr *attr) { struct gfs2_inode *ip = GFS2_I(inode); @@ -1081,7 +1084,7 @@ static int setattr_size(struct inode *inode, struct iattr *attr) error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks); if (error) return error; - error = vmtruncate(inode, attr->ia_size); + error = simple_setsize(inode, attr->ia_size); gfs2_trans_end(sdp); if (error) return error; diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 49667d68769e..8bb643cb2658 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -77,7 +77,7 @@ static LIST_HEAD(qd_lru_list); static atomic_t qd_lru_count = ATOMIC_INIT(0); static DEFINE_SPINLOCK(qd_lru_lock); -int gfs2_shrink_qd_memory(int nr, gfp_t gfp_mask) +int gfs2_shrink_qd_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask) { struct gfs2_quota_data *qd; struct gfs2_sbd *sdp; @@ -694,10 +694,8 @@ get_a_page: if (!buffer_mapped(bh)) goto unlock_out; /* If it's a newly allocated disk block for quota, zero it */ - if (buffer_new(bh)) { - memset(bh->b_data, 0, bh->b_size); - set_buffer_uptodate(bh); - } + if (buffer_new(bh)) + zero_user(page, pos - blocksize, bh->b_size); } if (PageUptodate(page)) @@ -723,7 +721,7 @@ get_a_page: /* If quota straddles page boundary, we need to update the rest of the * quota at the beginning of the next page */ - if (offset != 0) { /* first page, offset is closer to PAGE_CACHE_SIZE */ + if ((offset + sizeof(struct gfs2_quota)) > PAGE_CACHE_SIZE) { ptr = ptr + nbytes; nbytes = sizeof(struct gfs2_quota) - nbytes; offset = 0; @@ -789,15 +787,9 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) goto out; for (x = 0; x < num_qd; x++) { - int alloc_required; - offset = qd2offset(qda[x]); - error = gfs2_write_alloc_required(ip, offset, - sizeof(struct gfs2_quota), - &alloc_required); - if (error) - goto out_gunlock; - if (alloc_required) + if (gfs2_write_alloc_required(ip, offset, + sizeof(struct gfs2_quota))) nalloc++; } @@ -1586,10 +1578,7 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id, goto out_i; offset = qd2offset(qd); - error = gfs2_write_alloc_required(ip, offset, sizeof(struct gfs2_quota), - &alloc_required); - if (error) - goto out_i; + alloc_required = gfs2_write_alloc_required(ip, offset, sizeof(struct gfs2_quota)); if (alloc_required) { al = gfs2_alloc_get(ip); if (al == NULL) diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h index 195f60c8bd14..e7d236ca48bd 100644 --- a/fs/gfs2/quota.h +++ b/fs/gfs2/quota.h @@ -51,7 +51,7 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip) return ret; } 
-extern int gfs2_shrink_qd_memory(int nr, gfp_t gfp_mask); +extern int gfs2_shrink_qd_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask); extern const struct quotactl_ops gfs2_quotactl_ops; #endif /* __QUOTA_DOT_H__ */ diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 4d1aad38f1b1..4140811a921c 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -342,8 +342,6 @@ int gfs2_jdesc_check(struct gfs2_jdesc *jd) { struct gfs2_inode *ip = GFS2_I(jd->jd_inode); struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); - int ar; - int error; if (ip->i_disksize < (8 << 20) || ip->i_disksize > (1 << 30) || (ip->i_disksize & (sdp->sd_sb.sb_bsize - 1))) { @@ -352,13 +350,12 @@ int gfs2_jdesc_check(struct gfs2_jdesc *jd) } jd->jd_blocks = ip->i_disksize >> sdp->sd_sb.sb_bsize_shift; - error = gfs2_write_alloc_required(ip, 0, ip->i_disksize, &ar); - if (!error && ar) { + if (gfs2_write_alloc_required(ip, 0, ip->i_disksize)) { gfs2_consist_inode(ip); - error = -EIO; + return -EIO; } - return error; + return 0; } /** diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index 37f5393e68e6..d019d0d55e00 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -325,6 +325,30 @@ static ssize_t lkfirst_show(struct gfs2_sbd *sdp, char *buf) return sprintf(buf, "%d\n", ls->ls_first); } +static ssize_t lkfirst_store(struct gfs2_sbd *sdp, const char *buf, size_t len) +{ + unsigned first; + int rv; + + rv = sscanf(buf, "%u", &first); + if (rv != 1 || first > 1) + return -EINVAL; + spin_lock(&sdp->sd_jindex_spin); + rv = -EBUSY; + if (test_bit(SDF_NOJOURNALID, &sdp->sd_flags) == 0) + goto out; + rv = -EINVAL; + if (sdp->sd_args.ar_spectator) + goto out; + if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL) + goto out; + sdp->sd_lockstruct.ls_first = first; + rv = 0; +out: + spin_unlock(&sdp->sd_jindex_spin); + return rv ? rv : len; +} + static ssize_t first_done_show(struct gfs2_sbd *sdp, char *buf) { struct lm_lockstruct *ls = &sdp->sd_lockstruct; @@ -377,14 +401,41 @@ static ssize_t jid_show(struct gfs2_sbd *sdp, char *buf) return sprintf(buf, "%u\n", sdp->sd_lockstruct.ls_jid); } +static ssize_t jid_store(struct gfs2_sbd *sdp, const char *buf, size_t len) +{ + unsigned jid; + int rv; + + rv = sscanf(buf, "%u", &jid); + if (rv != 1) + return -EINVAL; + + spin_lock(&sdp->sd_jindex_spin); + rv = -EINVAL; + if (sdp->sd_args.ar_spectator) + goto out; + if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL) + goto out; + rv = -EBUSY; + if (test_and_clear_bit(SDF_NOJOURNALID, &sdp->sd_flags) == 0) + goto out; + sdp->sd_lockstruct.ls_jid = jid; + smp_mb__after_clear_bit(); + wake_up_bit(&sdp->sd_flags, SDF_NOJOURNALID); + rv = 0; +out: + spin_unlock(&sdp->sd_jindex_spin); + return rv ? 
rv : len; +} + #define GDLM_ATTR(_name,_mode,_show,_store) \ static struct gfs2_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store) GDLM_ATTR(proto_name, 0444, proto_name_show, NULL); GDLM_ATTR(block, 0644, block_show, block_store); GDLM_ATTR(withdraw, 0644, withdraw_show, withdraw_store); -GDLM_ATTR(jid, 0444, jid_show, NULL); -GDLM_ATTR(first, 0444, lkfirst_show, NULL); +GDLM_ATTR(jid, 0644, jid_show, jid_store); +GDLM_ATTR(first, 0644, lkfirst_show, lkfirst_store); GDLM_ATTR(first_done, 0444, first_done_show, NULL); GDLM_ATTR(recover, 0600, NULL, recover_store); GDLM_ATTR(recover_done, 0444, recover_done_show, NULL); @@ -564,7 +615,7 @@ static int gfs2_uevent(struct kset *kset, struct kobject *kobj, add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name); add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name); - if (!sdp->sd_args.ar_spectator) + if (!test_bit(SDF_NOJOURNALID, &sdp->sd_flags)) add_uevent_var(env, "JOURNALID=%u", sdp->sd_lockstruct.ls_jid); if (gfs2_uuid_valid(uuid)) add_uevent_var(env, "UUID=%pUB", uuid); diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 3a029d8f4cf1..87ac1891a185 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -411,9 +411,9 @@ int hostfs_file_open(struct inode *ino, struct file *file) return 0; } -int hostfs_fsync(struct file *file, struct dentry *dentry, int datasync) +int hostfs_fsync(struct file *file, int datasync) { - return fsync_file(HOSTFS_I(dentry->d_inode)->fd, datasync); + return fsync_file(HOSTFS_I(file->f_mapping->host)->fd, datasync); } static const struct file_operations hostfs_file_fops = { diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index 3efabff00367..a9ae9bfa752f 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -19,9 +19,9 @@ static int hpfs_file_release(struct inode *inode, struct file *file) return 0; } -int hpfs_file_fsync(struct file *file, struct dentry *dentry, int datasync) +int hpfs_file_fsync(struct file *file, int datasync) { - /*return file_fsync(file, dentry);*/ + /*return file_fsync(file, datasync);*/ return 0; /* Don't fsync :-) */ } diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h index 97bf738cd5d6..75f9d4324851 100644 --- a/fs/hpfs/hpfs_fn.h +++ b/fs/hpfs/hpfs_fn.h @@ -268,7 +268,7 @@ void hpfs_set_ea(struct inode *, struct fnode *, const char *, /* file.c */ -int hpfs_file_fsync(struct file *, struct dentry *, int); +int hpfs_file_fsync(struct file *, int); extern const struct file_operations hpfs_file_ops; extern const struct inode_operations hpfs_file_iops; extern const struct address_space_operations hpfs_aops; diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c index 2e4dfa8593da..826c3f9d29ac 100644 --- a/fs/hppfs/hppfs.c +++ b/fs/hppfs/hppfs.c @@ -587,7 +587,7 @@ static int hppfs_readdir(struct file *file, void *ent, filldir_t filldir) return err; } -static int hppfs_fsync(struct file *file, struct dentry *dentry, int datasync) +static int hppfs_fsync(struct file *file, int datasync) { return 0; } diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index a0bbd3d1b41a..a4e9a7ec3691 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -688,7 +688,7 @@ static void init_once(void *foo) const struct file_operations hugetlbfs_file_operations = { .read = hugetlbfs_read, .mmap = hugetlbfs_file_mmap, - .fsync = simple_sync_file, + .fsync = noop_fsync, .get_unmapped_area = hugetlb_get_unmapped_area, }; diff --git a/fs/inode.c b/fs/inode.c index 2bee20ae3d65..722860b323a9 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -512,7 +512,7 @@ static void 
prune_icache(int nr_to_scan) * This function is passed the number of inodes to scan, and it returns the * total number of remaining possibly-reclaimable inodes. */ -static int shrink_icache_memory(int nr, gfp_t gfp_mask) +static int shrink_icache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask) { if (nr) { /* diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c index b9ab69b3a482..e0aca9a0ac68 100644 --- a/fs/isofs/dir.c +++ b/fs/isofs/dir.c @@ -272,6 +272,7 @@ static int isofs_readdir(struct file *filp, const struct file_operations isofs_dir_operations = { + .llseek = generic_file_llseek, .read = generic_read_dir, .readdir = isofs_readdir, }; diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index bc2ff5932769..036880895bfc 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -297,7 +297,6 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction, struct page *new_page; unsigned int new_offset; struct buffer_head *bh_in = jh2bh(jh_in); - struct jbd2_buffer_trigger_type *triggers; journal_t *journal = transaction->t_journal; /* @@ -328,21 +327,21 @@ repeat: done_copy_out = 1; new_page = virt_to_page(jh_in->b_frozen_data); new_offset = offset_in_page(jh_in->b_frozen_data); - triggers = jh_in->b_frozen_triggers; } else { new_page = jh2bh(jh_in)->b_page; new_offset = offset_in_page(jh2bh(jh_in)->b_data); - triggers = jh_in->b_triggers; } mapped_data = kmap_atomic(new_page, KM_USER0); /* - * Fire any commit trigger. Do this before checking for escaping, - * as the trigger may modify the magic offset. If a copy-out - * happens afterwards, it will have the correct data in the buffer. + * Fire data frozen trigger if data already wasn't frozen. Do this + * before checking for escaping, as the trigger may modify the magic + * offset. If a copy-out happens afterwards, it will have the correct + * data in the buffer. 
*/ - jbd2_buffer_commit_trigger(jh_in, mapped_data + new_offset, - triggers); + if (!done_copy_out) + jbd2_buffer_frozen_trigger(jh_in, mapped_data + new_offset, + jh_in->b_triggers); /* * Check for escaping diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index bfc70f57900f..b8e0806681bb 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -725,6 +725,9 @@ done: page = jh2bh(jh)->b_page; offset = ((unsigned long) jh2bh(jh)->b_data) & ~PAGE_MASK; source = kmap_atomic(page, KM_USER0); + /* Fire data frozen trigger just before we copy the data */ + jbd2_buffer_frozen_trigger(jh, source + offset, + jh->b_triggers); memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size); kunmap_atomic(source, KM_USER0); @@ -963,15 +966,15 @@ void jbd2_journal_set_triggers(struct buffer_head *bh, jh->b_triggers = type; } -void jbd2_buffer_commit_trigger(struct journal_head *jh, void *mapped_data, +void jbd2_buffer_frozen_trigger(struct journal_head *jh, void *mapped_data, struct jbd2_buffer_trigger_type *triggers) { struct buffer_head *bh = jh2bh(jh); - if (!triggers || !triggers->t_commit) + if (!triggers || !triggers->t_frozen) return; - triggers->t_commit(triggers, bh, mapped_data, bh->b_size); + triggers->t_frozen(triggers, bh, mapped_data, bh->b_size); } void jbd2_buffer_abort_trigger(struct journal_head *jh, @@ -1311,7 +1314,6 @@ int jbd2_journal_stop(handle_t *handle) if (handle->h_sync) transaction->t_synchronous_commit = 1; current->journal_info = NULL; - spin_lock(&journal->j_state_lock); spin_lock(&transaction->t_handle_lock); transaction->t_outstanding_credits -= handle->h_buffer_credits; transaction->t_updates--; @@ -1340,8 +1342,7 @@ int jbd2_journal_stop(handle_t *handle) jbd_debug(2, "transaction too old, requesting commit for " "handle %p\n", handle); /* This is non-blocking */ - __jbd2_log_start_commit(journal, transaction->t_tid); - spin_unlock(&journal->j_state_lock); + jbd2_log_start_commit(journal, transaction->t_tid); /* * Special case: JBD2_SYNC synchronous updates require us @@ -1351,7 +1352,6 @@ int jbd2_journal_stop(handle_t *handle) err = jbd2_log_wait_commit(journal, tid); } else { spin_unlock(&transaction->t_handle_lock); - spin_unlock(&journal->j_state_lock); } lock_map_release(&handle->h_lockdep_map); diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c index a33aab6b5e68..54a92fd02bbd 100644 --- a/fs/jffs2/acl.c +++ b/fs/jffs2/acl.c @@ -234,8 +234,9 @@ static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl) if (inode->i_mode != mode) { struct iattr attr; - attr.ia_valid = ATTR_MODE; + attr.ia_valid = ATTR_MODE | ATTR_CTIME; attr.ia_mode = mode; + attr.ia_ctime = CURRENT_TIME_SEC; rc = jffs2_do_setattr(inode, &attr); if (rc < 0) return rc; diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index 7aa4417e085f..166062a68230 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -222,15 +222,18 @@ static int jffs2_create(struct inode *dir_i, struct dentry *dentry, int mode, dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(ri->ctime)); jffs2_free_raw_inode(ri); - d_instantiate(dentry, inode); D1(printk(KERN_DEBUG "jffs2_create: Created ino #%lu with mode %o, nlink %d(%d). 
nrpages %ld\n", inode->i_ino, inode->i_mode, inode->i_nlink, f->inocache->pino_nlink, inode->i_mapping->nrpages)); + + d_instantiate(dentry, inode); + unlock_new_inode(inode); return 0; fail: make_bad_inode(inode); + unlock_new_inode(inode); iput(inode); jffs2_free_raw_inode(ri); return ret; @@ -360,8 +363,8 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char /* Eeek. Wave bye bye */ mutex_unlock(&f->sem); jffs2_complete_reservation(c); - jffs2_clear_inode(inode); - return PTR_ERR(fn); + ret = PTR_ERR(fn); + goto fail; } /* We use f->target field to store the target path. */ @@ -370,8 +373,8 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char printk(KERN_WARNING "Can't allocate %d bytes of memory\n", targetlen + 1); mutex_unlock(&f->sem); jffs2_complete_reservation(c); - jffs2_clear_inode(inode); - return -ENOMEM; + ret = -ENOMEM; + goto fail; } memcpy(f->target, target, targetlen + 1); @@ -386,30 +389,24 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char jffs2_complete_reservation(c); ret = jffs2_init_security(inode, dir_i); - if (ret) { - jffs2_clear_inode(inode); - return ret; - } + if (ret) + goto fail; + ret = jffs2_init_acl_post(inode); - if (ret) { - jffs2_clear_inode(inode); - return ret; - } + if (ret) + goto fail; ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen, ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen)); - if (ret) { - /* Eep. */ - jffs2_clear_inode(inode); - return ret; - } + if (ret) + goto fail; rd = jffs2_alloc_raw_dirent(); if (!rd) { /* Argh. Now we treat it like a normal delete */ jffs2_complete_reservation(c); - jffs2_clear_inode(inode); - return -ENOMEM; + ret = -ENOMEM; + goto fail; } dir_f = JFFS2_INODE_INFO(dir_i); @@ -437,8 +434,8 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char jffs2_complete_reservation(c); jffs2_free_raw_dirent(rd); mutex_unlock(&dir_f->sem); - jffs2_clear_inode(inode); - return PTR_ERR(fd); + ret = PTR_ERR(fd); + goto fail; } dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime)); @@ -453,7 +450,14 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char jffs2_complete_reservation(c); d_instantiate(dentry, inode); + unlock_new_inode(inode); return 0; + + fail: + make_bad_inode(inode); + unlock_new_inode(inode); + iput(inode); + return ret; } @@ -519,8 +523,8 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode) /* Eeek. Wave bye bye */ mutex_unlock(&f->sem); jffs2_complete_reservation(c); - jffs2_clear_inode(inode); - return PTR_ERR(fn); + ret = PTR_ERR(fn); + goto fail; } /* No data here. Only a metadata node, which will be obsoleted by the first data write @@ -531,30 +535,24 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode) jffs2_complete_reservation(c); ret = jffs2_init_security(inode, dir_i); - if (ret) { - jffs2_clear_inode(inode); - return ret; - } + if (ret) + goto fail; + ret = jffs2_init_acl_post(inode); - if (ret) { - jffs2_clear_inode(inode); - return ret; - } + if (ret) + goto fail; ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen, ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen)); - if (ret) { - /* Eep. */ - jffs2_clear_inode(inode); - return ret; - } + if (ret) + goto fail; rd = jffs2_alloc_raw_dirent(); if (!rd) { /* Argh. 
Now we treat it like a normal delete */ jffs2_complete_reservation(c); - jffs2_clear_inode(inode); - return -ENOMEM; + ret = -ENOMEM; + goto fail; } dir_f = JFFS2_INODE_INFO(dir_i); @@ -582,8 +580,8 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode) jffs2_complete_reservation(c); jffs2_free_raw_dirent(rd); mutex_unlock(&dir_f->sem); - jffs2_clear_inode(inode); - return PTR_ERR(fd); + ret = PTR_ERR(fd); + goto fail; } dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime)); @@ -599,7 +597,14 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode) jffs2_complete_reservation(c); d_instantiate(dentry, inode); + unlock_new_inode(inode); return 0; + + fail: + make_bad_inode(inode); + unlock_new_inode(inode); + iput(inode); + return ret; } static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry) @@ -693,8 +698,8 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de /* Eeek. Wave bye bye */ mutex_unlock(&f->sem); jffs2_complete_reservation(c); - jffs2_clear_inode(inode); - return PTR_ERR(fn); + ret = PTR_ERR(fn); + goto fail; } /* No data here. Only a metadata node, which will be obsoleted by the first data write @@ -705,30 +710,24 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de jffs2_complete_reservation(c); ret = jffs2_init_security(inode, dir_i); - if (ret) { - jffs2_clear_inode(inode); - return ret; - } + if (ret) + goto fail; + ret = jffs2_init_acl_post(inode); - if (ret) { - jffs2_clear_inode(inode); - return ret; - } + if (ret) + goto fail; ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen, ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen)); - if (ret) { - /* Eep. */ - jffs2_clear_inode(inode); - return ret; - } + if (ret) + goto fail; rd = jffs2_alloc_raw_dirent(); if (!rd) { /* Argh. 
Now we treat it like a normal delete */ jffs2_complete_reservation(c); - jffs2_clear_inode(inode); - return -ENOMEM; + ret = -ENOMEM; + goto fail; } dir_f = JFFS2_INODE_INFO(dir_i); @@ -759,8 +758,8 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de jffs2_complete_reservation(c); jffs2_free_raw_dirent(rd); mutex_unlock(&dir_f->sem); - jffs2_clear_inode(inode); - return PTR_ERR(fd); + ret = PTR_ERR(fd); + goto fail; } dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime)); @@ -775,8 +774,14 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de jffs2_complete_reservation(c); d_instantiate(dentry, inode); - + unlock_new_inode(inode); return 0; + + fail: + make_bad_inode(inode); + unlock_new_inode(inode); + iput(inode); + return ret; } static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry, diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index e7291c161a19..813497024437 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -26,9 +26,9 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, struct page **pagep, void **fsdata); static int jffs2_readpage (struct file *filp, struct page *pg); -int jffs2_fsync(struct file *filp, struct dentry *dentry, int datasync) +int jffs2_fsync(struct file *filp, int datasync) { - struct inode *inode = dentry->d_inode; + struct inode *inode = filp->f_mapping->host; struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); /* Trigger GC to flush any pending writes for this inode */ diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index 86e0821fc989..459d39d1ea0b 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -169,13 +169,13 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr) mutex_unlock(&f->sem); jffs2_complete_reservation(c); - /* We have to do the vmtruncate() without f->sem held, since + /* We have to do the simple_setsize() without f->sem held, since some pages may be locked and waiting for it in readpage(). We are protected from a simultaneous write() extending i_size back past iattr->ia_size, because do_truncate() holds the generic inode semaphore. 
*/ if (ivalid & ATTR_SIZE && inode->i_size > iattr->ia_size) { - vmtruncate(inode, iattr->ia_size); + simple_setsize(inode, iattr->ia_size); inode->i_blocks = (inode->i_size + 511) >> 9; } @@ -465,7 +465,12 @@ struct inode *jffs2_new_inode (struct inode *dir_i, int mode, struct jffs2_raw_i inode->i_blocks = 0; inode->i_size = 0; - insert_inode_hash(inode); + if (insert_inode_locked(inode) < 0) { + make_bad_inode(inode); + unlock_new_inode(inode); + iput(inode); + return ERR_PTR(-EINVAL); + } return inode; } diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h index 035a767f958b..4791aacf3084 100644 --- a/fs/jffs2/os-linux.h +++ b/fs/jffs2/os-linux.h @@ -158,7 +158,7 @@ extern const struct inode_operations jffs2_dir_inode_operations; extern const struct file_operations jffs2_file_operations; extern const struct inode_operations jffs2_file_inode_operations; extern const struct address_space_operations jffs2_file_address_operations; -int jffs2_fsync(struct file *, struct dentry *, int); +int jffs2_fsync(struct file *, int); int jffs2_do_readpage_unlock (struct inode *inode, struct page *pg); /* ioctl.c */ diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c index a2d58c96f1b4..d258e261bdc7 100644 --- a/fs/jffs2/xattr.c +++ b/fs/jffs2/xattr.c @@ -626,7 +626,7 @@ void jffs2_xattr_free_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *i static int check_xattr_ref_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic) { - /* success of check_xattr_ref_inode() means taht inode (ic) dose not have + /* success of check_xattr_ref_inode() means that inode (ic) dose not have * duplicate name/value pairs. If duplicate name/value pair would be found, * one will be removed. */ diff --git a/fs/jfs/file.c b/fs/jfs/file.c index 85d9ec659225..127263cc8657 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -27,9 +27,9 @@ #include "jfs_acl.h" #include "jfs_debug.h" -int jfs_fsync(struct file *file, struct dentry *dentry, int datasync) +int jfs_fsync(struct file *file, int datasync) { - struct inode *inode = dentry->d_inode; + struct inode *inode = file->f_mapping->host; int rc = 0; if (!(inode->i_state & I_DIRTY) || diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h index 9e6bda30a6e8..11042b1f44b5 100644 --- a/fs/jfs/jfs_inode.h +++ b/fs/jfs/jfs_inode.h @@ -21,7 +21,7 @@ struct fid; extern struct inode *ialloc(struct inode *, umode_t); -extern int jfs_fsync(struct file *, struct dentry *, int); +extern int jfs_fsync(struct file *, int); extern long jfs_ioctl(struct file *, unsigned int, unsigned long); extern long jfs_compat_ioctl(struct file *, unsigned int, unsigned long); extern struct inode *jfs_iget(struct super_block *, unsigned long); diff --git a/fs/jfs/super.c b/fs/jfs/super.c index b66832ac33ac..b38f96bef829 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -179,6 +179,8 @@ static void jfs_put_super(struct super_block *sb) jfs_info("In jfs_put_super"); + dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); + lock_kernel(); rc = jfs_umount(sb); @@ -396,10 +398,20 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data) JFS_SBI(sb)->flag = flag; ret = jfs_mount_rw(sb, 1); + + /* mark the fs r/w for quota activity */ + sb->s_flags &= ~MS_RDONLY; + unlock_kernel(); + dquot_resume(sb, -1); return ret; } if ((!(sb->s_flags & MS_RDONLY)) && (*flags & MS_RDONLY)) { + rc = dquot_suspend(sb, -1); + if (rc < 0) { + unlock_kernel(); + return rc; + } rc = jfs_umount_rw(sb); JFS_SBI(sb)->flag = flag; unlock_kernel(); @@ -469,6 +481,10 @@ static int jfs_fill_super(struct 
super_block *sb, void *data, int silent) */ sb->s_op = &jfs_super_operations; sb->s_export_op = &jfs_export_operations; +#ifdef CONFIG_QUOTA + sb->dq_op = &dquot_operations; + sb->s_qcop = &dquot_quotactl_ops; +#endif /* * Initialize direct-mapping inode/address-space diff --git a/fs/libfs.c b/fs/libfs.c index 232bea425b09..dcaf972cbf1b 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -8,6 +8,7 @@ #include <linux/slab.h> #include <linux/mount.h> #include <linux/vfs.h> +#include <linux/quotaops.h> #include <linux/mutex.h> #include <linux/exportfs.h> #include <linux/writeback.h> @@ -58,11 +59,6 @@ struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, struct na return NULL; } -int simple_sync_file(struct file * file, struct dentry *dentry, int datasync) -{ - return 0; -} - int dcache_dir_open(struct inode *inode, struct file *file) { static struct qstr cursor_name = {.len = 1, .name = "."}; @@ -190,7 +186,7 @@ const struct file_operations simple_dir_operations = { .llseek = dcache_dir_lseek, .read = generic_read_dir, .readdir = dcache_readdir, - .fsync = simple_sync_file, + .fsync = noop_fsync, }; const struct inode_operations simple_dir_inode_operations = { @@ -330,6 +326,81 @@ int simple_rename(struct inode *old_dir, struct dentry *old_dentry, return 0; } +/** + * simple_setsize - handle core mm and vfs requirements for file size change + * @inode: inode + * @newsize: new file size + * + * Returns 0 on success, -error on failure. + * + * simple_setsize must be called with inode_mutex held. + * + * simple_setsize will check that the requested new size is OK (see + * inode_newsize_ok), and then will perform the necessary i_size update + * and pagecache truncation (if necessary). It will be typically be called + * from the filesystem's setattr function when ATTR_SIZE is passed in. + * + * The inode itself must have correct permissions and attributes to allow + * i_size to be changed, this function then just checks that the new size + * requested is valid. + * + * In the case of simple in-memory filesystems with inodes stored solely + * in the inode cache, and file data in the pagecache, nothing more needs + * to be done to satisfy a truncate request. Filesystems with on-disk + * blocks for example will need to free them in the case of truncate, in + * that case it may be easier not to use simple_setsize (but each of its + * components will likely be required at some point to update pagecache + * and inode etc). + */ +int simple_setsize(struct inode *inode, loff_t newsize) +{ + loff_t oldsize; + int error; + + error = inode_newsize_ok(inode, newsize); + if (error) + return error; + + oldsize = inode->i_size; + i_size_write(inode, newsize); + truncate_pagecache(inode, oldsize, newsize); + + return error; +} +EXPORT_SYMBOL(simple_setsize); + +/** + * simple_setattr - setattr for simple in-memory filesystem + * @dentry: dentry + * @iattr: iattr structure + * + * Returns 0 on success, -error on failure. + * + * simple_setattr implements setattr for an in-memory filesystem which + * does not store its own file data or metadata (eg. uses the page cache + * and inode cache as its data store). 
+ */ +int simple_setattr(struct dentry *dentry, struct iattr *iattr) +{ + struct inode *inode = dentry->d_inode; + int error; + + error = inode_change_ok(inode, iattr); + if (error) + return error; + + if (iattr->ia_valid & ATTR_SIZE) { + error = simple_setsize(inode, iattr->ia_size); + if (error) + return error; + } + + generic_setattr(inode, iattr); + + return error; +} +EXPORT_SYMBOL(simple_setattr); + int simple_readpage(struct file *file, struct page *page) { clear_highpage(page); @@ -418,7 +489,8 @@ int simple_write_end(struct file *file, struct address_space *mapping, * unique inode values later for this filesystem, then you must take care * to pass it an appropriate max_reserved value to avoid collisions. */ -int simple_fill_super(struct super_block *s, int magic, struct tree_descr *files) +int simple_fill_super(struct super_block *s, unsigned long magic, + struct tree_descr *files) { struct inode *inode; struct dentry *root; @@ -851,13 +923,22 @@ struct dentry *generic_fh_to_parent(struct super_block *sb, struct fid *fid, } EXPORT_SYMBOL_GPL(generic_fh_to_parent); -int simple_fsync(struct file *file, struct dentry *dentry, int datasync) +/** + * generic_file_fsync - generic fsync implementation for simple filesystems + * @file: file to synchronize + * @datasync: only synchronize essential metadata if true + * + * This is a generic implementation of the fsync method for simple + * filesystems which track all non-inode metadata in the buffers list + * hanging off the address_space structure. + */ +int generic_file_fsync(struct file *file, int datasync) { struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, .nr_to_write = 0, /* metadata-only; caller takes care of data */ }; - struct inode *inode = dentry->d_inode; + struct inode *inode = file->f_mapping->host; int err; int ret; @@ -872,7 +953,15 @@ int simple_fsync(struct file *file, struct dentry *dentry, int datasync) ret = err; return ret; } -EXPORT_SYMBOL(simple_fsync); +EXPORT_SYMBOL(generic_file_fsync); + +/* + * No-op implementation of ->fsync for in-memory filesystems. 
+ */ +int noop_fsync(struct file *file, int datasync) +{ + return 0; +} EXPORT_SYMBOL(dcache_dir_close); EXPORT_SYMBOL(dcache_dir_lseek); @@ -895,7 +984,7 @@ EXPORT_SYMBOL(simple_release_fs); EXPORT_SYMBOL(simple_rename); EXPORT_SYMBOL(simple_rmdir); EXPORT_SYMBOL(simple_statfs); -EXPORT_SYMBOL(simple_sync_file); +EXPORT_SYMBOL(noop_fsync); EXPORT_SYMBOL(simple_unlink); EXPORT_SYMBOL(simple_read_from_buffer); EXPORT_SYMBOL(simple_write_to_buffer); diff --git a/fs/logfs/file.c b/fs/logfs/file.c index 0de524071870..abe1cafbd4c2 100644 --- a/fs/logfs/file.c +++ b/fs/logfs/file.c @@ -219,9 +219,9 @@ int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd, } } -int logfs_fsync(struct file *file, struct dentry *dentry, int datasync) +int logfs_fsync(struct file *file, int datasync) { - struct super_block *sb = dentry->d_inode->i_sb; + struct super_block *sb = file->f_mapping->host->i_sb; logfs_write_anchor(sb); return 0; diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h index 1a9db84f8d8f..c838c4d72111 100644 --- a/fs/logfs/logfs.h +++ b/fs/logfs/logfs.h @@ -506,7 +506,7 @@ extern const struct address_space_operations logfs_reg_aops; int logfs_readpage(struct file *file, struct page *page); int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg); -int logfs_fsync(struct file *file, struct dentry *dentry, int datasync); +int logfs_fsync(struct file *file, int datasync); /* gc.c */ u32 get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec); diff --git a/fs/mbcache.c b/fs/mbcache.c index ec88ff3d04a9..e28f21b95344 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -115,7 +115,7 @@ mb_cache_indexes(struct mb_cache *cache) * What the mbcache registers as to get shrunk dynamically. */ -static int mb_cache_shrink_fn(int nr_to_scan, gfp_t gfp_mask); +static int mb_cache_shrink_fn(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask); static struct shrinker mb_cache_shrinker = { .shrink = mb_cache_shrink_fn, @@ -191,13 +191,14 @@ forget: * This function is called by the kernel memory management when memory * gets low. * + * @shrink: (ignored) * @nr_to_scan: Number of objects to scan * @gfp_mask: (ignored) * * Returns the number of objects which are present in the cache. 
*/ static int -mb_cache_shrink_fn(int nr_to_scan, gfp_t gfp_mask) +mb_cache_shrink_fn(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) { LIST_HEAD(free_list); struct list_head *l, *ltmp; diff --git a/fs/minix/dir.c b/fs/minix/dir.c index 6198731d7fcd..1dbf921ca44b 100644 --- a/fs/minix/dir.c +++ b/fs/minix/dir.c @@ -22,7 +22,7 @@ const struct file_operations minix_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, .readdir = minix_readdir, - .fsync = simple_fsync, + .fsync = generic_file_fsync, }; static inline void dir_put_page(struct page *page) @@ -72,16 +72,9 @@ static struct page * dir_get_page(struct inode *dir, unsigned long n) { struct address_space *mapping = dir->i_mapping; struct page *page = read_mapping_page(mapping, n, NULL); - if (!IS_ERR(page)) { + if (!IS_ERR(page)) kmap(page); - if (!PageUptodate(page)) - goto fail; - } return page; - -fail: - dir_put_page(page); - return ERR_PTR(-EIO); } static inline void *minix_next_entry(void *de, struct minix_sb_info *sbi) diff --git a/fs/minix/file.c b/fs/minix/file.c index 3eec3e607a87..d5320ff23faf 100644 --- a/fs/minix/file.c +++ b/fs/minix/file.c @@ -19,7 +19,7 @@ const struct file_operations minix_file_operations = { .write = do_sync_write, .aio_write = generic_file_aio_write, .mmap = generic_file_mmap, - .fsync = simple_fsync, + .fsync = generic_file_fsync, .splice_read = generic_file_splice_read, }; diff --git a/fs/minix/itree_v2.c b/fs/minix/itree_v2.c index f23010969369..13487ad16894 100644 --- a/fs/minix/itree_v2.c +++ b/fs/minix/itree_v2.c @@ -20,6 +20,9 @@ static inline block_t *i_data(struct inode *inode) return (block_t *)minix_i(inode)->u.i2_data; } +#define DIRCOUNT 7 +#define INDIRCOUNT(sb) (1 << ((sb)->s_blocksize_bits - 2)) + static int block_to_path(struct inode * inode, long block, int offsets[DEPTH]) { int n = 0; @@ -34,21 +37,21 @@ static int block_to_path(struct inode * inode, long block, int offsets[DEPTH]) printk("MINIX-fs: block_to_path: " "block %ld too big on dev %s\n", block, bdevname(sb->s_bdev, b)); - } else if (block < 7) { + } else if (block < DIRCOUNT) { offsets[n++] = block; - } else if ((block -= 7) < 256) { - offsets[n++] = 7; + } else if ((block -= DIRCOUNT) < INDIRCOUNT(sb)) { + offsets[n++] = DIRCOUNT; offsets[n++] = block; - } else if ((block -= 256) < 256*256) { - offsets[n++] = 8; - offsets[n++] = block>>8; - offsets[n++] = block & 255; + } else if ((block -= INDIRCOUNT(sb)) < INDIRCOUNT(sb) * INDIRCOUNT(sb)) { + offsets[n++] = DIRCOUNT + 1; + offsets[n++] = block / INDIRCOUNT(sb); + offsets[n++] = block % INDIRCOUNT(sb); } else { - block -= 256*256; - offsets[n++] = 9; - offsets[n++] = block>>16; - offsets[n++] = (block>>8) & 255; - offsets[n++] = block & 255; + block -= INDIRCOUNT(sb) * INDIRCOUNT(sb); + offsets[n++] = DIRCOUNT + 2; + offsets[n++] = (block / INDIRCOUNT(sb)) / INDIRCOUNT(sb); + offsets[n++] = (block / INDIRCOUNT(sb)) % INDIRCOUNT(sb); + offsets[n++] = block % INDIRCOUNT(sb); } return n; } diff --git a/fs/namei.c b/fs/namei.c index 48e1f60520ea..42d2d28fb827 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -282,8 +282,7 @@ int inode_permission(struct inode *inode, int mask) if (retval) return retval; - return security_inode_permission(inode, - mask & (MAY_READ|MAY_WRITE|MAY_EXEC|MAY_APPEND)); + return security_inode_permission(inode, mask); } /** @@ -1484,8 +1483,7 @@ static int handle_truncate(struct path *path) */ error = locks_verify_locked(inode); if (!error) - error = security_path_truncate(path, 0, - ATTR_MTIME|ATTR_CTIME|ATTR_OPEN); + 
error = security_path_truncate(path); if (!error) { error = do_truncate(path->dentry, 0, ATTR_MTIME|ATTR_CTIME|ATTR_OPEN, @@ -1621,6 +1619,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path, case LAST_DOTDOT: follow_dotdot(nd); dir = nd->path.dentry; + case LAST_DOT: if (nd->path.mnt->mnt_sb->s_type->fs_flags & FS_REVAL_DOT) { if (!dir->d_op->d_revalidate(dir, nd)) { error = -ESTALE; @@ -1628,7 +1627,6 @@ static struct file *do_last(struct nameidata *nd, struct path *path, } } /* fallthrough */ - case LAST_DOT: case LAST_ROOT: if (open_flag & O_CREAT) goto exit; diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index 92dde6f8d893..9578cbe0cd58 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -49,6 +49,7 @@ extern int ncp_symlink(struct inode *, struct dentry *, const char *); const struct file_operations ncp_dir_operations = { + .llseek = generic_file_llseek, .read = generic_read_dir, .readdir = ncp_readdir, .unlocked_ioctl = ncp_ioctl, diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c index b93870892892..3639cc5cbdae 100644 --- a/fs/ncpfs/file.c +++ b/fs/ncpfs/file.c @@ -22,7 +22,7 @@ #include <linux/ncp_fs.h> #include "ncplib_kernel.h" -static int ncp_fsync(struct file *file, struct dentry *dentry, int datasync) +static int ncp_fsync(struct file *file, int datasync) { return 0; } diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 7ec9b34a59f8..d25b5257b7a1 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -1286,6 +1286,55 @@ static void nfs4_session_set_rwsize(struct nfs_server *server) #endif /* CONFIG_NFS_V4_1 */ } +static int nfs4_server_common_setup(struct nfs_server *server, + struct nfs_fh *mntfh) +{ + struct nfs_fattr *fattr; + int error; + + BUG_ON(!server->nfs_client); + BUG_ON(!server->nfs_client->rpc_ops); + BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops); + + fattr = nfs_alloc_fattr(); + if (fattr == NULL) + return -ENOMEM; + + /* We must ensure the session is initialised first */ + error = nfs4_init_session(server); + if (error < 0) + goto out; + + /* Probe the root fh to retrieve its FSID and filehandle */ + error = nfs4_get_rootfh(server, mntfh); + if (error < 0) + goto out; + + dprintk("Server FSID: %llx:%llx\n", + (unsigned long long) server->fsid.major, + (unsigned long long) server->fsid.minor); + dprintk("Mount FH: %d\n", mntfh->size); + + nfs4_session_set_rwsize(server); + + error = nfs_probe_fsinfo(server, mntfh, fattr); + if (error < 0) + goto out; + + if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN) + server->namelen = NFS4_MAXNAMLEN; + + spin_lock(&nfs_client_lock); + list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks); + list_add_tail(&server->master_link, &nfs_volume_list); + spin_unlock(&nfs_client_lock); + + server->mount_time = jiffies; +out: + nfs_free_fattr(fattr); + return error; +} + /* * Create a version 4 volume record */ @@ -1346,7 +1395,6 @@ error: struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data, struct nfs_fh *mntfh) { - struct nfs_fattr *fattr; struct nfs_server *server; int error; @@ -1356,55 +1404,19 @@ struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data, if (!server) return ERR_PTR(-ENOMEM); - error = -ENOMEM; - fattr = nfs_alloc_fattr(); - if (fattr == NULL) - goto error; - /* set up the general RPC client */ error = nfs4_init_server(server, data); if (error < 0) goto error; - BUG_ON(!server->nfs_client); - BUG_ON(!server->nfs_client->rpc_ops); - BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops); - - error = 
nfs4_init_session(server); - if (error < 0) - goto error; - - /* Probe the root fh to retrieve its FSID */ - error = nfs4_get_rootfh(server, mntfh); + error = nfs4_server_common_setup(server, mntfh); if (error < 0) goto error; - dprintk("Server FSID: %llx:%llx\n", - (unsigned long long) server->fsid.major, - (unsigned long long) server->fsid.minor); - dprintk("Mount FH: %d\n", mntfh->size); - - nfs4_session_set_rwsize(server); - - error = nfs_probe_fsinfo(server, mntfh, fattr); - if (error < 0) - goto error; - - if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN) - server->namelen = NFS4_MAXNAMLEN; - - spin_lock(&nfs_client_lock); - list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks); - list_add_tail(&server->master_link, &nfs_volume_list); - spin_unlock(&nfs_client_lock); - - server->mount_time = jiffies; dprintk("<-- nfs4_create_server() = %p\n", server); - nfs_free_fattr(fattr); return server; error: - nfs_free_fattr(fattr); nfs_free_server(server); dprintk("<-- nfs4_create_server() = error %d\n", error); return ERR_PTR(error); @@ -1418,7 +1430,6 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data, { struct nfs_client *parent_client; struct nfs_server *server, *parent_server; - struct nfs_fattr *fattr; int error; dprintk("--> nfs4_create_referral_server()\n"); @@ -1427,11 +1438,6 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data, if (!server) return ERR_PTR(-ENOMEM); - error = -ENOMEM; - fattr = nfs_alloc_fattr(); - if (fattr == NULL) - goto error; - parent_server = NFS_SB(data->sb); parent_client = parent_server->nfs_client; @@ -1456,40 +1462,14 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data, if (error < 0) goto error; - BUG_ON(!server->nfs_client); - BUG_ON(!server->nfs_client->rpc_ops); - BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops); - - /* Probe the root fh to retrieve its FSID and filehandle */ - error = nfs4_get_rootfh(server, mntfh); - if (error < 0) - goto error; - - /* probe the filesystem info for this server filesystem */ - error = nfs_probe_fsinfo(server, mntfh, fattr); + error = nfs4_server_common_setup(server, mntfh); if (error < 0) goto error; - if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN) - server->namelen = NFS4_MAXNAMLEN; - - dprintk("Referral FSID: %llx:%llx\n", - (unsigned long long) server->fsid.major, - (unsigned long long) server->fsid.minor); - - spin_lock(&nfs_client_lock); - list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks); - list_add_tail(&server->master_link, &nfs_volume_list); - spin_unlock(&nfs_client_lock); - - server->mount_time = jiffies; - - nfs_free_fattr(fattr); dprintk("<-- nfs_create_referral_server() = %p\n", server); return server; error: - nfs_free_fattr(fattr); nfs_free_server(server); dprintk("<-- nfs4_create_referral_server() = error %d\n", error); return ERR_PTR(error); diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index ee9a179ebdf3..832e9e239324 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -53,7 +53,7 @@ static int nfs_link(struct dentry *, struct inode *, struct dentry *); static int nfs_mknod(struct inode *, struct dentry *, int, dev_t); static int nfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); -static int nfs_fsync_dir(struct file *, struct dentry *, int); +static int nfs_fsync_dir(struct file *, int); static loff_t nfs_llseek_dir(struct file *, loff_t, int); const struct file_operations nfs_dir_operations = { @@ -641,8 +641,10 @@ out: * All 
directory operations under NFS are synchronous, so fsync() * is a dummy operation. */ -static int nfs_fsync_dir(struct file *filp, struct dentry *dentry, int datasync) +static int nfs_fsync_dir(struct file *filp, int datasync) { + struct dentry *dentry = filp->f_path.dentry; + dfprintk(FILE, "NFS: fsync dir(%s/%s) datasync %d\n", dentry->d_parent->d_name.name, dentry->d_name.name, datasync); @@ -1708,7 +1710,7 @@ static void nfs_access_free_list(struct list_head *head) } } -int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask) +int nfs_access_cache_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) { LIST_HEAD(head); struct nfs_inode *nfsi; @@ -1741,6 +1743,7 @@ remove_lru_entry: clear_bit(NFS_INO_ACL_LRU_SET, &nfsi->flags); smp_mb__after_clear_bit(); } + spin_unlock(&inode->i_lock); } spin_unlock(&nfs_access_lru_lock); nfs_access_free_list(&head); @@ -1950,7 +1953,7 @@ int nfs_permission(struct inode *inode, int mask) if ((mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0) goto out; /* Is this sys_access() ? */ - if (mask & MAY_ACCESS) + if (mask & (MAY_ACCESS | MAY_CHDIR)) goto force_lookup; switch (inode->i_mode & S_IFMT) { diff --git a/fs/nfs/file.c b/fs/nfs/file.c index cac96bcc91e4..f036153d9f50 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -27,6 +27,7 @@ #include <linux/pagemap.h> #include <linux/aio.h> #include <linux/gfp.h> +#include <linux/swap.h> #include <asm/uaccess.h> #include <asm/system.h> @@ -53,7 +54,7 @@ static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe, static ssize_t nfs_file_write(struct kiocb *, const struct iovec *iov, unsigned long nr_segs, loff_t pos); static int nfs_file_flush(struct file *, fl_owner_t id); -static int nfs_file_fsync(struct file *, struct dentry *dentry, int datasync); +static int nfs_file_fsync(struct file *, int datasync); static int nfs_check_flags(int flags); static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl); static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl); @@ -322,8 +323,9 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma) * whether any write errors occurred for this process. 
*/ static int -nfs_file_fsync(struct file *file, struct dentry *dentry, int datasync) +nfs_file_fsync(struct file *file, int datasync) { + struct dentry *dentry = file->f_path.dentry; struct nfs_open_context *ctx = nfs_file_open_context(file); struct inode *inode = dentry->d_inode; @@ -492,11 +494,19 @@ static void nfs_invalidate_page(struct page *page, unsigned long offset) */ static int nfs_release_page(struct page *page, gfp_t gfp) { + struct address_space *mapping = page->mapping; + dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page); /* Only do I/O if gfp is a superset of GFP_KERNEL */ - if ((gfp & GFP_KERNEL) == GFP_KERNEL) - nfs_wb_page(page->mapping->host, page); + if (mapping && (gfp & GFP_KERNEL) == GFP_KERNEL) { + int how = FLUSH_SYNC; + + /* Don't let kswapd deadlock waiting for OOM RPC calls */ + if (current_is_kswapd()) + how = 0; + nfs_commit_inode(mapping->host, how); + } /* If PagePrivate() is set, then the page is not freeable */ if (PagePrivate(page)) return 0; diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c index 7428f7d6273b..a70e446e1605 100644 --- a/fs/nfs/getroot.c +++ b/fs/nfs/getroot.c @@ -146,7 +146,7 @@ int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh) goto out; } - if (!(fsinfo.fattr->valid & NFS_ATTR_FATTR_MODE) + if (!(fsinfo.fattr->valid & NFS_ATTR_FATTR_TYPE) || !S_ISDIR(fsinfo.fattr->mode)) { printk(KERN_ERR "nfs4_get_rootfh:" " getroot encountered non-directory\n"); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index d8bd619e386c..e70f44b9b3f4 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -205,7 +205,8 @@ extern struct rpc_procinfo nfs4_procedures[]; void nfs_close_context(struct nfs_open_context *ctx, int is_sync); /* dir.c */ -extern int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask); +extern int nfs_access_cache_shrinker(struct shrinker *shrink, + int nr_to_scan, gfp_t gfp_mask); /* inode.c */ extern struct workqueue_struct *nfsiod_workqueue; diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 6bdef28efa33..65c8dae4b267 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -862,8 +862,8 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET; *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME); *p++ = cpu_to_be32(0); - *p++ = cpu_to_be32(iap->ia_mtime.tv_sec); - *p++ = cpu_to_be32(iap->ia_mtime.tv_nsec); + *p++ = cpu_to_be32(iap->ia_atime.tv_sec); + *p++ = cpu_to_be32(iap->ia_atime.tv_nsec); } else if (iap->ia_valid & ATTR_ATIME) { bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET; diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c index 6bd19d843af7..df101d9f546a 100644 --- a/fs/nfs/nfsroot.c +++ b/fs/nfs/nfsroot.c @@ -105,7 +105,7 @@ static char nfs_root_name[256] __initdata = ""; static __be32 servaddr __initdata = 0; /* Name of directory to mount */ -static char nfs_export_path[NFS_MAXPATHLEN] __initdata = { 0, }; +static char nfs_export_path[NFS_MAXPATHLEN + 1] __initdata = { 0, }; /* NFS-related data */ static struct nfs_mount_data nfs_data __initdata = { 0, };/* NFS mount info */ diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 04214fc5c304..f9df16de4a56 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -570,6 +570,22 @@ static void nfs_show_mountd_options(struct seq_file *m, struct nfs_server *nfss, nfs_show_mountd_netid(m, nfss, showdefaults); } +#ifdef CONFIG_NFS_V4 +static void nfs_show_nfsv4_options(struct seq_file *m, struct nfs_server *nfss, + int showdefaults) +{ + struct nfs_client *clp = nfss->nfs_client; + + seq_printf(m, 
",clientaddr=%s", clp->cl_ipaddr); + seq_printf(m, ",minorversion=%u", clp->cl_minorversion); +} +#else +static void nfs_show_nfsv4_options(struct seq_file *m, struct nfs_server *nfss, + int showdefaults) +{ +} +#endif + /* * Describe the mount options in force on this server representation */ @@ -631,11 +647,9 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, if (version != 4) nfs_show_mountd_options(m, nfss, showdefaults); + else + nfs_show_nfsv4_options(m, nfss, showdefaults); -#ifdef CONFIG_NFS_V4 - if (clp->rpc_ops->version == 4) - seq_printf(m, ",clientaddr=%s", clp->cl_ipaddr); -#endif if (nfss->options & NFS_OPTION_FSCACHE) seq_printf(m, ",fsc"); } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 3aea3ca98ab7..9f81bdd91c55 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -222,7 +222,7 @@ static void nfs_end_page_writeback(struct page *page) clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC); } -static struct nfs_page *nfs_find_and_lock_request(struct page *page) +static struct nfs_page *nfs_find_and_lock_request(struct page *page, bool nonblock) { struct inode *inode = page->mapping->host; struct nfs_page *req; @@ -241,7 +241,10 @@ static struct nfs_page *nfs_find_and_lock_request(struct page *page) * request as dirty (in which case we don't care). */ spin_unlock(&inode->i_lock); - ret = nfs_wait_on_request(req); + if (!nonblock) + ret = nfs_wait_on_request(req); + else + ret = -EAGAIN; nfs_release_request(req); if (ret != 0) return ERR_PTR(ret); @@ -256,12 +259,12 @@ static struct nfs_page *nfs_find_and_lock_request(struct page *page) * May return an error if the user signalled nfs_wait_on_request(). */ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, - struct page *page) + struct page *page, bool nonblock) { struct nfs_page *req; int ret = 0; - req = nfs_find_and_lock_request(page); + req = nfs_find_and_lock_request(page, nonblock); if (!req) goto out; ret = PTR_ERR(req); @@ -283,12 +286,20 @@ out: static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio) { struct inode *inode = page->mapping->host; + int ret; nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE); nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1); nfs_pageio_cond_complete(pgio, page->index); - return nfs_page_async_flush(pgio, page); + ret = nfs_page_async_flush(pgio, page, + wbc->sync_mode == WB_SYNC_NONE || + wbc->nonblocking != 0); + if (ret == -EAGAIN) { + redirty_page_for_writepage(wbc, page); + ret = 0; + } + return ret; } /* @@ -1379,14 +1390,14 @@ static const struct rpc_call_ops nfs_commit_ops = { .rpc_release = nfs_commit_release, }; -static int nfs_commit_inode(struct inode *inode, int how) +int nfs_commit_inode(struct inode *inode, int how) { LIST_HEAD(head); int may_wait = how & FLUSH_SYNC; int res = 0; if (!nfs_commit_set_lock(NFS_I(inode), may_wait)) - goto out; + goto out_mark_dirty; spin_lock(&inode->i_lock); res = nfs_scan_commit(inode, &head, 0, 0); spin_unlock(&inode->i_lock); @@ -1398,9 +1409,18 @@ static int nfs_commit_inode(struct inode *inode, int how) wait_on_bit(&NFS_I(inode)->flags, NFS_INO_COMMIT, nfs_wait_bit_killable, TASK_KILLABLE); + else + goto out_mark_dirty; } else nfs_commit_clear_lock(NFS_I(inode)); -out: + return res; + /* Note: If we exit without ensuring that the commit is complete, + * we must mark the inode as dirty. Otherwise, future calls to + * sync_inode() with the WB_SYNC_ALL flag set will fail to ensure + * that the data is on the disk. 
+ */ +out_mark_dirty: + __mark_inode_dirty(inode, I_DIRTY_DATASYNC); return res; } @@ -1434,11 +1454,6 @@ out_mark_dirty: return ret; } #else -static int nfs_commit_inode(struct inode *inode, int how) -{ - return 0; -} - static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc) { return 0; @@ -1509,14 +1524,17 @@ int nfs_wb_page(struct inode *inode, struct page *page) }; int ret; - while(PagePrivate(page)) { + for (;;) { wait_on_page_writeback(page); if (clear_page_dirty_for_io(page)) { ret = nfs_writepage_locked(page, &wbc); if (ret < 0) goto out_error; + continue; } - ret = sync_inode(inode, &wbc); + if (!PagePrivate(page)) + break; + ret = nfs_commit_inode(inode, FLUSH_SYNC); if (ret < 0) goto out_error; } @@ -1534,7 +1552,7 @@ int nfs_migrate_page(struct address_space *mapping, struct page *newpage, nfs_fscache_release_page(page, GFP_KERNEL); - req = nfs_find_and_lock_request(page); + req = nfs_find_and_lock_request(page, false); ret = PTR_ERR(req); if (IS_ERR(req)) goto out; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 12f7109720c2..4a2734758778 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4122,8 +4122,8 @@ nfs4_state_shutdown(void) nfs4_lock_state(); nfs4_release_reclaim(); __nfs4_state_shutdown(); - nfsd4_destroy_callback_queue(); nfs4_unlock_state(); + nfsd4_destroy_callback_queue(); } /* diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index ebbf3b6b2457..3c111120b619 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -443,8 +443,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, if (size_change) put_write_access(inode); if (!err) - if (EX_ISSYNC(fhp->fh_export)) - write_inode_now(inode, 1); + commit_metadata(fhp); out: return err; diff --git a/fs/nilfs2/btree.h b/fs/nilfs2/btree.h index af638d59e3bf..43c8c5b541fd 100644 --- a/fs/nilfs2/btree.h +++ b/fs/nilfs2/btree.h @@ -75,8 +75,6 @@ struct nilfs_btree_path { extern struct kmem_cache *nilfs_btree_path_cache; -int nilfs_btree_path_cache_init(void); -void nilfs_btree_path_cache_destroy(void); int nilfs_btree_init(struct nilfs_bmap *); int nilfs_btree_convert_and_insert(struct nilfs_bmap *, __u64, __u64, const __u64 *, const __u64 *, int); diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c index 30292df443ce..c9a30d7ff6fc 100644 --- a/fs/nilfs2/file.c +++ b/fs/nilfs2/file.c @@ -27,7 +27,7 @@ #include "nilfs.h" #include "segment.h" -int nilfs_sync_file(struct file *file, struct dentry *dentry, int datasync) +int nilfs_sync_file(struct file *file, int datasync) { /* * Called from fsync() system call @@ -37,7 +37,7 @@ int nilfs_sync_file(struct file *file, struct dentry *dentry, int datasync) * This function should be implemented when the writeback function * will be implemented. 
*/ - struct inode *inode = dentry->d_inode; + struct inode *inode = file->f_mapping->host; int err; if (!nilfs_inode_dirty(inode)) diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index 8723e5bfd071..47d6d7928122 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -228,7 +228,7 @@ extern void nilfs_set_link(struct inode *, struct nilfs_dir_entry *, struct page *, struct inode *); /* file.c */ -extern int nilfs_sync_file(struct file *, struct dentry *, int); +extern int nilfs_sync_file(struct file *, int); /* ioctl.c */ long nilfs_ioctl(struct file *, unsigned int, unsigned long); diff --git a/fs/nilfs2/segbuf.h b/fs/nilfs2/segbuf.h index fdf1c3b6d673..85fbb66455e2 100644 --- a/fs/nilfs2/segbuf.h +++ b/fs/nilfs2/segbuf.h @@ -127,8 +127,6 @@ struct nilfs_segment_buffer { extern struct kmem_cache *nilfs_segbuf_cachep; -int __init nilfs_init_segbuf_cache(void); -void nilfs_destroy_segbuf_cache(void); struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *); void nilfs_segbuf_free(struct nilfs_segment_buffer *); void nilfs_segbuf_map(struct nilfs_segment_buffer *, __u64, unsigned long, diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h index dca142361ccf..01e20dbb217d 100644 --- a/fs/nilfs2/segment.h +++ b/fs/nilfs2/segment.h @@ -221,8 +221,6 @@ enum { extern struct kmem_cache *nilfs_transaction_cachep; /* segment.c */ -extern int nilfs_init_transaction_cache(void); -extern void nilfs_destroy_transaction_cache(void); extern void nilfs_relax_pressure_in_lock(struct super_block *); extern int nilfs_construct_segment(struct super_block *); diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 03b34b738993..414ef68931cf 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -1130,13 +1130,13 @@ static void nilfs_segbuf_init_once(void *obj) static void nilfs_destroy_cachep(void) { - if (nilfs_inode_cachep) + if (nilfs_inode_cachep) kmem_cache_destroy(nilfs_inode_cachep); - if (nilfs_transaction_cachep) + if (nilfs_transaction_cachep) kmem_cache_destroy(nilfs_transaction_cachep); - if (nilfs_segbuf_cachep) + if (nilfs_segbuf_cachep) kmem_cache_destroy(nilfs_segbuf_cachep); - if (nilfs_btree_path_cache) + if (nilfs_btree_path_cache) kmem_cache_destroy(nilfs_btree_path_cache); } diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c index fe44d3feee4a..0f48e7c5d9e1 100644 --- a/fs/ntfs/dir.c +++ b/fs/ntfs/dir.c @@ -1527,10 +1527,9 @@ static int ntfs_dir_open(struct inode *vi, struct file *filp) * this problem for now. We do write the $BITMAP attribute if it is present * which is the important one for a directory so things are not too bad. */ -static int ntfs_dir_fsync(struct file *filp, struct dentry *dentry, - int datasync) +static int ntfs_dir_fsync(struct file *filp, int datasync) { - struct inode *bmp_vi, *vi = dentry->d_inode; + struct inode *bmp_vi, *vi = filp->f_mapping->host; int err, ret; ntfs_attr na; diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index a1924a0d2ab0..113ebd9f25a4 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -2133,7 +2133,6 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, /** * ntfs_file_fsync - sync a file to disk * @filp: file to be synced - * @dentry: dentry describing the file to sync * @datasync: if non-zero only flush user data and not metadata * * Data integrity sync of a file to disk. 
Used for fsync, fdatasync, and msync @@ -2149,19 +2148,15 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, * Also, if @datasync is true, we do not wait on the inode to be written out * but we always wait on the page cache pages to be written out. * - * Note: In the past @filp could be NULL so we ignore it as we don't need it - * anyway. - * * Locking: Caller must hold i_mutex on the inode. * * TODO: We should probably also write all attribute/index inodes associated * with this inode but since we have no simple way of getting to them we ignore * this problem for now. */ -static int ntfs_file_fsync(struct file *filp, struct dentry *dentry, - int datasync) +static int ntfs_file_fsync(struct file *filp, int datasync) { - struct inode *vi = dentry->d_inode; + struct inode *vi = filp->f_mapping->host; int err, ret = 0; ntfs_debug("Entering for inode 0x%lx.", vi->i_ino); diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 3623ca20cc18..96337a4fbbdf 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -196,15 +196,14 @@ int ocfs2_get_block(struct inode *inode, sector_t iblock, dump_stack(); goto bail; } - - past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); - mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino, - (unsigned long long)past_eof); - - if (create && (iblock >= past_eof)) - set_buffer_new(bh_result); } + past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); + mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino, + (unsigned long long)past_eof); + if (create && (iblock >= past_eof)) + set_buffer_new(bh_result); + bail: if (err < 0) err = -EIO; @@ -459,36 +458,6 @@ int walk_page_buffers( handle_t *handle, return ret; } -handle_t *ocfs2_start_walk_page_trans(struct inode *inode, - struct page *page, - unsigned from, - unsigned to) -{ - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - handle_t *handle; - int ret = 0; - - handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); - if (IS_ERR(handle)) { - ret = -ENOMEM; - mlog_errno(ret); - goto out; - } - - if (ocfs2_should_order_data(inode)) { - ret = ocfs2_jbd2_file_inode(handle, inode); - if (ret < 0) - mlog_errno(ret); - } -out: - if (ret) { - if (!IS_ERR(handle)) - ocfs2_commit_trans(osb, handle); - handle = ERR_PTR(ret); - } - return handle; -} - static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block) { sector_t status; @@ -609,7 +578,9 @@ bail: static void ocfs2_dio_end_io(struct kiocb *iocb, loff_t offset, ssize_t bytes, - void *private) + void *private, + int ret, + bool is_async) { struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; int level; @@ -623,6 +594,9 @@ static void ocfs2_dio_end_io(struct kiocb *iocb, if (!level) up_read(&inode->i_alloc_sem); ocfs2_rw_unlock(inode, level); + + if (is_async) + aio_complete(iocb, ret, 0); } /* @@ -1131,23 +1105,37 @@ out: */ static int ocfs2_grab_pages_for_write(struct address_space *mapping, struct ocfs2_write_ctxt *wc, - u32 cpos, loff_t user_pos, int new, + u32 cpos, loff_t user_pos, + unsigned user_len, int new, struct page *mmap_page) { int ret = 0, i; - unsigned long start, target_index, index; + unsigned long start, target_index, end_index, index; struct inode *inode = mapping->host; + loff_t last_byte; target_index = user_pos >> PAGE_CACHE_SHIFT; /* * Figure out how many pages we'll be manipulating here. For * non allocating write, we just change the one - * page. Otherwise, we'll need a whole clusters worth. + * page. Otherwise, we'll need a whole clusters worth. 
If we're + * writing past i_size, we only need enough pages to cover the + * last page of the write. */ if (new) { wc->w_num_pages = ocfs2_pages_per_cluster(inode->i_sb); start = ocfs2_align_clusters_to_page_index(inode->i_sb, cpos); + /* + * We need the index *past* the last page we could possibly + * touch. This is the page past the end of the write or + * i_size, whichever is greater. + */ + last_byte = max(user_pos + user_len, i_size_read(inode)); + BUG_ON(last_byte < 1); + end_index = ((last_byte - 1) >> PAGE_CACHE_SHIFT) + 1; + if ((start + wc->w_num_pages) > end_index) + wc->w_num_pages = end_index - start; } else { wc->w_num_pages = 1; start = target_index; @@ -1620,21 +1608,20 @@ out: * write path can treat it as an non-allocating write, which has no * special case code for sparse/nonsparse files. */ -static int ocfs2_expand_nonsparse_inode(struct inode *inode, loff_t pos, - unsigned len, +static int ocfs2_expand_nonsparse_inode(struct inode *inode, + struct buffer_head *di_bh, + loff_t pos, unsigned len, struct ocfs2_write_ctxt *wc) { int ret; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); loff_t newsize = pos + len; - if (ocfs2_sparse_alloc(osb)) - return 0; + BUG_ON(ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))); if (newsize <= i_size_read(inode)) return 0; - ret = ocfs2_extend_no_holes(inode, newsize, pos); + ret = ocfs2_extend_no_holes(inode, di_bh, newsize, pos); if (ret) mlog_errno(ret); @@ -1644,6 +1631,18 @@ static int ocfs2_expand_nonsparse_inode(struct inode *inode, loff_t pos, return ret; } +static int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh, + loff_t pos) +{ + int ret = 0; + + BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))); + if (pos > i_size_read(inode)) + ret = ocfs2_zero_extend(inode, di_bh, pos); + + return ret; +} + int ocfs2_write_begin_nolock(struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata, @@ -1679,7 +1678,11 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, } } - ret = ocfs2_expand_nonsparse_inode(inode, pos, len, wc); + if (ocfs2_sparse_alloc(osb)) + ret = ocfs2_zero_tail(inode, di_bh, pos); + else + ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos, len, + wc); if (ret) { mlog_errno(ret); goto out; @@ -1789,7 +1792,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, * that we can zero and flush if we error after adding the * extent. 
*/ - ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, + ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, len, cluster_of_pages, mmap_page); if (ret) { mlog_errno(ret); diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 6b5a492e1749..153abb5abef0 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -1671,7 +1671,7 @@ struct dlm_ctxt * dlm_register_domain(const char *domain, struct dlm_ctxt *dlm = NULL; struct dlm_ctxt *new_ctxt = NULL; - if (strlen(domain) > O2NM_MAX_NAME_LEN) { + if (strlen(domain) >= O2NM_MAX_NAME_LEN) { ret = -ENAMETOOLONG; mlog(ML_ERROR, "domain name length too long\n"); goto leave; @@ -1709,6 +1709,7 @@ retry: } if (dlm_protocol_compare(&dlm->fs_locking_proto, fs_proto)) { + spin_unlock(&dlm_domain_lock); mlog(ML_ERROR, "Requested locking protocol version is not " "compatible with already registered domain " diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 4a7506a4e314..94b97fc6a88e 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -2808,14 +2808,8 @@ again: mlog(0, "trying again...\n"); goto again; } - /* now that we are sure the MIGRATING state is there, drop - * the unneded state which blocked threads trying to DIRTY */ - spin_lock(&res->spinlock); - BUG_ON(!(res->state & DLM_LOCK_RES_BLOCK_DIRTY)); - BUG_ON(!(res->state & DLM_LOCK_RES_MIGRATING)); - res->state &= ~DLM_LOCK_RES_BLOCK_DIRTY; - spin_unlock(&res->spinlock); + ret = 0; /* did the target go down or die? */ spin_lock(&dlm->spinlock); if (!test_bit(target, dlm->domain_map)) { @@ -2826,9 +2820,21 @@ again: spin_unlock(&dlm->spinlock); /* + * if target is down, we need to clear DLM_LOCK_RES_BLOCK_DIRTY for + * another try; otherwise, we are sure the MIGRATING state is there, + * drop the unneded state which blocked threads trying to DIRTY + */ + spin_lock(&res->spinlock); + BUG_ON(!(res->state & DLM_LOCK_RES_BLOCK_DIRTY)); + res->state &= ~DLM_LOCK_RES_BLOCK_DIRTY; + if (!ret) + BUG_ON(!(res->state & DLM_LOCK_RES_MIGRATING)); + spin_unlock(&res->spinlock); + + /* * at this point: * - * o the DLM_LOCK_RES_MIGRATING flag is set + * o the DLM_LOCK_RES_MIGRATING flag is set if target not down * o there are no pending asts on this lockres * o all processes trying to reserve an ast on this * lockres must wait for the MIGRATING flag to clear diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index f8b75ce4be70..9dfaac73b36d 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -463,7 +463,7 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm) if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) { int bit; - bit = find_next_bit (dlm->recovery_map, O2NM_MAX_NODES+1, 0); + bit = find_next_bit (dlm->recovery_map, O2NM_MAX_NODES, 0); if (bit >= O2NM_MAX_NODES || bit < 0) dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM); else diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 97e54b9e654b..2b10b36d1577 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -175,13 +175,12 @@ static int ocfs2_dir_release(struct inode *inode, struct file *file) return 0; } -static int ocfs2_sync_file(struct file *file, - struct dentry *dentry, - int datasync) +static int ocfs2_sync_file(struct file *file, int datasync) { int err = 0; journal_t *journal; - struct inode *inode = dentry->d_inode; + struct dentry *dentry = file->f_path.dentry; + struct inode *inode = file->f_mapping->host; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, 
datasync, @@ -725,28 +724,55 @@ leave: return status; } +/* + * While a write will already be ordering the data, a truncate will not. + * Thus, we need to explicitly order the zeroed pages. + */ +static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + handle_t *handle = NULL; + int ret = 0; + + if (!ocfs2_should_order_data(inode)) + goto out; + + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_jbd2_file_inode(handle, inode); + if (ret < 0) + mlog_errno(ret); + +out: + if (ret) { + if (!IS_ERR(handle)) + ocfs2_commit_trans(osb, handle); + handle = ERR_PTR(ret); + } + return handle; +} + /* Some parts of this taken from generic_cont_expand, which turned out * to be too fragile to do exactly what we need without us having to * worry about recursive locking in ->write_begin() and ->write_end(). */ -static int ocfs2_write_zero_page(struct inode *inode, - u64 size) +static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, + u64 abs_to) { struct address_space *mapping = inode->i_mapping; struct page *page; - unsigned long index; - unsigned int offset; + unsigned long index = abs_from >> PAGE_CACHE_SHIFT; handle_t *handle = NULL; - int ret; + int ret = 0; + unsigned zero_from, zero_to, block_start, block_end; - offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */ - /* ugh. in prepare/commit_write, if from==to==start of block, we - ** skip the prepare. make sure we never send an offset for the start - ** of a block - */ - if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) { - offset++; - } - index = size >> PAGE_CACHE_SHIFT; + BUG_ON(abs_from >= abs_to); + BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT)); + BUG_ON(abs_from & (inode->i_blkbits - 1)); page = grab_cache_page(mapping, index); if (!page) { @@ -755,31 +781,56 @@ static int ocfs2_write_zero_page(struct inode *inode, goto out; } - ret = ocfs2_prepare_write_nolock(inode, page, offset, offset); - if (ret < 0) { - mlog_errno(ret); - goto out_unlock; - } + /* Get the offsets within the page that we want to zero */ + zero_from = abs_from & (PAGE_CACHE_SIZE - 1); + zero_to = abs_to & (PAGE_CACHE_SIZE - 1); + if (!zero_to) + zero_to = PAGE_CACHE_SIZE; - if (ocfs2_should_order_data(inode)) { - handle = ocfs2_start_walk_page_trans(inode, page, offset, - offset); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - handle = NULL; + mlog(0, + "abs_from = %llu, abs_to = %llu, index = %lu, zero_from = %u, zero_to = %u\n", + (unsigned long long)abs_from, (unsigned long long)abs_to, + index, zero_from, zero_to); + + /* We know that zero_from is block aligned */ + for (block_start = zero_from; block_start < zero_to; + block_start = block_end) { + block_end = block_start + (1 << inode->i_blkbits); + + /* + * block_start is block-aligned. Bump it by one to + * force ocfs2_{prepare,commit}_write() to zero the + * whole block. + */ + ret = ocfs2_prepare_write_nolock(inode, page, + block_start + 1, + block_start + 1); + if (ret < 0) { + mlog_errno(ret); goto out_unlock; } - } - /* must not update i_size! */ - ret = block_commit_write(page, offset, offset); - if (ret < 0) - mlog_errno(ret); - else - ret = 0; + if (!handle) { + handle = ocfs2_zero_start_ordered_transaction(inode); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + handle = NULL; + break; + } + } + + /* must not update i_size! 
*/ + ret = block_commit_write(page, block_start + 1, + block_start + 1); + if (ret < 0) + mlog_errno(ret); + else + ret = 0; + } if (handle) ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); + out_unlock: unlock_page(page); page_cache_release(page); @@ -787,22 +838,114 @@ out: return ret; } -static int ocfs2_zero_extend(struct inode *inode, - u64 zero_to_size) +/* + * Find the next range to zero. We do this in terms of bytes because + * that's what ocfs2_zero_extend() wants, and it is dealing with the + * pagecache. We may return multiple extents. + * + * zero_start and zero_end are ocfs2_zero_extend()s current idea of what + * needs to be zeroed. range_start and range_end return the next zeroing + * range. A subsequent call should pass the previous range_end as its + * zero_start. If range_end is 0, there's nothing to do. + * + * Unwritten extents are skipped over. Refcounted extents are CoWd. + */ +static int ocfs2_zero_extend_get_range(struct inode *inode, + struct buffer_head *di_bh, + u64 zero_start, u64 zero_end, + u64 *range_start, u64 *range_end) { - int ret = 0; - u64 start_off; - struct super_block *sb = inode->i_sb; + int rc = 0, needs_cow = 0; + u32 p_cpos, zero_clusters = 0; + u32 zero_cpos = + zero_start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits; + u32 last_cpos = ocfs2_clusters_for_bytes(inode->i_sb, zero_end); + unsigned int num_clusters = 0; + unsigned int ext_flags = 0; - start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode)); - while (start_off < zero_to_size) { - ret = ocfs2_write_zero_page(inode, start_off); - if (ret < 0) { - mlog_errno(ret); + while (zero_cpos < last_cpos) { + rc = ocfs2_get_clusters(inode, zero_cpos, &p_cpos, + &num_clusters, &ext_flags); + if (rc) { + mlog_errno(rc); + goto out; + } + + if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) { + zero_clusters = num_clusters; + if (ext_flags & OCFS2_EXT_REFCOUNTED) + needs_cow = 1; + break; + } + + zero_cpos += num_clusters; + } + if (!zero_clusters) { + *range_end = 0; + goto out; + } + + while ((zero_cpos + zero_clusters) < last_cpos) { + rc = ocfs2_get_clusters(inode, zero_cpos + zero_clusters, + &p_cpos, &num_clusters, + &ext_flags); + if (rc) { + mlog_errno(rc); goto out; } - start_off += sb->s_blocksize; + if (!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN)) + break; + if (ext_flags & OCFS2_EXT_REFCOUNTED) + needs_cow = 1; + zero_clusters += num_clusters; + } + if ((zero_cpos + zero_clusters) > last_cpos) + zero_clusters = last_cpos - zero_cpos; + + if (needs_cow) { + rc = ocfs2_refcount_cow(inode, di_bh, zero_cpos, zero_clusters, + UINT_MAX); + if (rc) { + mlog_errno(rc); + goto out; + } + } + + *range_start = ocfs2_clusters_to_bytes(inode->i_sb, zero_cpos); + *range_end = ocfs2_clusters_to_bytes(inode->i_sb, + zero_cpos + zero_clusters); + +out: + return rc; +} + +/* + * Zero one range returned from ocfs2_zero_extend_get_range(). The caller + * has made sure that the entire range needs zeroing. 
+ */ +static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start, + u64 range_end) +{ + int rc = 0; + u64 next_pos; + u64 zero_pos = range_start; + + mlog(0, "range_start = %llu, range_end = %llu\n", + (unsigned long long)range_start, + (unsigned long long)range_end); + BUG_ON(range_start >= range_end); + + while (zero_pos < range_end) { + next_pos = (zero_pos & PAGE_CACHE_MASK) + PAGE_CACHE_SIZE; + if (next_pos > range_end) + next_pos = range_end; + rc = ocfs2_write_zero_page(inode, zero_pos, next_pos); + if (rc < 0) { + mlog_errno(rc); + break; + } + zero_pos = next_pos; /* * Very large extends have the potential to lock up @@ -811,16 +954,63 @@ static int ocfs2_zero_extend(struct inode *inode, cond_resched(); } -out: + return rc; +} + +int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh, + loff_t zero_to_size) +{ + int ret = 0; + u64 zero_start, range_start = 0, range_end = 0; + struct super_block *sb = inode->i_sb; + + zero_start = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode)); + mlog(0, "zero_start %llu for i_size %llu\n", + (unsigned long long)zero_start, + (unsigned long long)i_size_read(inode)); + while (zero_start < zero_to_size) { + ret = ocfs2_zero_extend_get_range(inode, di_bh, zero_start, + zero_to_size, + &range_start, + &range_end); + if (ret) { + mlog_errno(ret); + break; + } + if (!range_end) + break; + /* Trim the ends */ + if (range_start < zero_start) + range_start = zero_start; + if (range_end > zero_to_size) + range_end = zero_to_size; + + ret = ocfs2_zero_extend_range(inode, range_start, + range_end); + if (ret) { + mlog_errno(ret); + break; + } + zero_start = range_end; + } + return ret; } -int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, u64 zero_to) +int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh, + u64 new_i_size, u64 zero_to) { int ret; u32 clusters_to_add; struct ocfs2_inode_info *oi = OCFS2_I(inode); + /* + * Only quota files call this without a bh, and they can't be + * refcounted. + */ + BUG_ON(!di_bh && (oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); + BUG_ON(!di_bh && !(oi->ip_flags & OCFS2_INODE_SYSTEM_FILE)); + clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size); if (clusters_to_add < oi->ip_clusters) clusters_to_add = 0; @@ -841,7 +1031,7 @@ int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, u64 zero_to) * still need to zero the area between the old i_size and the * new i_size. */ - ret = ocfs2_zero_extend(inode, zero_to); + ret = ocfs2_zero_extend(inode, di_bh, zero_to); if (ret < 0) mlog_errno(ret); @@ -863,27 +1053,15 @@ static int ocfs2_extend_file(struct inode *inode, goto out; if (i_size_read(inode) == new_i_size) - goto out; + goto out; BUG_ON(new_i_size < i_size_read(inode)); /* - * Fall through for converting inline data, even if the fs - * supports sparse files. - * - * The check for inline data here is legal - nobody can add - * the feature since we have i_mutex. We must check it again - * after acquiring ip_alloc_sem though, as paths like mmap - * might have raced us to converting the inode to extents. - */ - if (!(oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) - && ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) - goto out_update_size; - - /* * The alloc sem blocks people in read/write from reading our * allocation until we're done changing it. We depend on * i_mutex to block other extend/truncate calls while we're - * here. + * here. We even have to hold it for sparse files because there + * might be some tail zeroing. 
*/ down_write(&oi->ip_alloc_sem); @@ -900,14 +1078,16 @@ static int ocfs2_extend_file(struct inode *inode, ret = ocfs2_convert_inline_data_to_extents(inode, di_bh); if (ret) { up_write(&oi->ip_alloc_sem); - mlog_errno(ret); goto out; } } - if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) - ret = ocfs2_extend_no_holes(inode, new_i_size, new_i_size); + if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) + ret = ocfs2_zero_extend(inode, di_bh, new_i_size); + else + ret = ocfs2_extend_no_holes(inode, di_bh, new_i_size, + new_i_size); up_write(&oi->ip_alloc_sem); @@ -1053,7 +1233,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) } /* - * This will intentionally not wind up calling vmtruncate(), + * This will intentionally not wind up calling simple_setsize(), * since all the work for a size change has been done above. * Otherwise, we could get into problems with truncate as * ip_alloc_sem is used there to protect against i_size @@ -2119,9 +2299,13 @@ relock: * direct write may have instantiated a few * blocks outside i_size. Trim these off again. * Don't need i_size_read because we hold i_mutex. + * + * XXX(hch): this looks buggy because ocfs2 did not + * actually implement ->truncate. Take a look at + * the new truncate sequence and update this accordingly */ if (*ppos + count > inode->i_size) - vmtruncate(inode, inode->i_size); + simple_setsize(inode, inode->i_size); ret = written; goto out_dio; } diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h index d66cf4f7c70e..97bf761c9e7c 100644 --- a/fs/ocfs2/file.h +++ b/fs/ocfs2/file.h @@ -54,8 +54,10 @@ int ocfs2_add_inode_data(struct ocfs2_super *osb, int ocfs2_simple_size_update(struct inode *inode, struct buffer_head *di_bh, u64 new_i_size); -int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, - u64 zero_to); +int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh, + u64 new_i_size, u64 zero_to); +int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh, + loff_t zero_to); int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 47878cf16418..625de9d7088c 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -472,7 +472,7 @@ static inline struct ocfs2_triggers *to_ocfs2_trigger(struct jbd2_buffer_trigger return container_of(triggers, struct ocfs2_triggers, ot_triggers); } -static void ocfs2_commit_trigger(struct jbd2_buffer_trigger_type *triggers, +static void ocfs2_frozen_trigger(struct jbd2_buffer_trigger_type *triggers, struct buffer_head *bh, void *data, size_t size) { @@ -491,7 +491,7 @@ static void ocfs2_commit_trigger(struct jbd2_buffer_trigger_type *triggers, * Quota blocks have their own trigger because the struct ocfs2_block_check * offset depends on the blocksize. */ -static void ocfs2_dq_commit_trigger(struct jbd2_buffer_trigger_type *triggers, +static void ocfs2_dq_frozen_trigger(struct jbd2_buffer_trigger_type *triggers, struct buffer_head *bh, void *data, size_t size) { @@ -511,7 +511,7 @@ static void ocfs2_dq_commit_trigger(struct jbd2_buffer_trigger_type *triggers, * Directory blocks also have their own trigger because the * struct ocfs2_block_check offset depends on the blocksize. 
*/ -static void ocfs2_db_commit_trigger(struct jbd2_buffer_trigger_type *triggers, +static void ocfs2_db_frozen_trigger(struct jbd2_buffer_trigger_type *triggers, struct buffer_head *bh, void *data, size_t size) { @@ -544,7 +544,7 @@ static void ocfs2_abort_trigger(struct jbd2_buffer_trigger_type *triggers, static struct ocfs2_triggers di_triggers = { .ot_triggers = { - .t_commit = ocfs2_commit_trigger, + .t_frozen = ocfs2_frozen_trigger, .t_abort = ocfs2_abort_trigger, }, .ot_offset = offsetof(struct ocfs2_dinode, i_check), @@ -552,7 +552,7 @@ static struct ocfs2_triggers di_triggers = { static struct ocfs2_triggers eb_triggers = { .ot_triggers = { - .t_commit = ocfs2_commit_trigger, + .t_frozen = ocfs2_frozen_trigger, .t_abort = ocfs2_abort_trigger, }, .ot_offset = offsetof(struct ocfs2_extent_block, h_check), @@ -560,7 +560,7 @@ static struct ocfs2_triggers eb_triggers = { static struct ocfs2_triggers rb_triggers = { .ot_triggers = { - .t_commit = ocfs2_commit_trigger, + .t_frozen = ocfs2_frozen_trigger, .t_abort = ocfs2_abort_trigger, }, .ot_offset = offsetof(struct ocfs2_refcount_block, rf_check), @@ -568,7 +568,7 @@ static struct ocfs2_triggers rb_triggers = { static struct ocfs2_triggers gd_triggers = { .ot_triggers = { - .t_commit = ocfs2_commit_trigger, + .t_frozen = ocfs2_frozen_trigger, .t_abort = ocfs2_abort_trigger, }, .ot_offset = offsetof(struct ocfs2_group_desc, bg_check), @@ -576,14 +576,14 @@ static struct ocfs2_triggers gd_triggers = { static struct ocfs2_triggers db_triggers = { .ot_triggers = { - .t_commit = ocfs2_db_commit_trigger, + .t_frozen = ocfs2_db_frozen_trigger, .t_abort = ocfs2_abort_trigger, }, }; static struct ocfs2_triggers xb_triggers = { .ot_triggers = { - .t_commit = ocfs2_commit_trigger, + .t_frozen = ocfs2_frozen_trigger, .t_abort = ocfs2_abort_trigger, }, .ot_offset = offsetof(struct ocfs2_xattr_block, xb_check), @@ -591,14 +591,14 @@ static struct ocfs2_triggers xb_triggers = { static struct ocfs2_triggers dq_triggers = { .ot_triggers = { - .t_commit = ocfs2_dq_commit_trigger, + .t_frozen = ocfs2_dq_frozen_trigger, .t_abort = ocfs2_abort_trigger, }, }; static struct ocfs2_triggers dr_triggers = { .ot_triggers = { - .t_commit = ocfs2_commit_trigger, + .t_frozen = ocfs2_frozen_trigger, .t_abort = ocfs2_abort_trigger, }, .ot_offset = offsetof(struct ocfs2_dx_root_block, dr_check), @@ -606,7 +606,7 @@ static struct ocfs2_triggers dr_triggers = { static struct ocfs2_triggers dl_triggers = { .ot_triggers = { - .t_commit = ocfs2_commit_trigger, + .t_frozen = ocfs2_frozen_trigger, .t_abort = ocfs2_abort_trigger, }, .ot_offset = offsetof(struct ocfs2_dx_leaf, dl_check), @@ -1936,7 +1936,7 @@ void ocfs2_orphan_scan_work(struct work_struct *work) mutex_lock(&os->os_lock); ocfs2_queue_orphan_scan(osb); if (atomic_read(&os->os_state) == ORPHAN_SCAN_ACTIVE) - schedule_delayed_work(&os->os_orphan_scan_work, + queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work, ocfs2_orphan_scan_timeout()); mutex_unlock(&os->os_lock); } @@ -1976,8 +1976,8 @@ void ocfs2_orphan_scan_start(struct ocfs2_super *osb) atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE); else { atomic_set(&os->os_state, ORPHAN_SCAN_ACTIVE); - schedule_delayed_work(&os->os_orphan_scan_work, - ocfs2_orphan_scan_timeout()); + queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work, + ocfs2_orphan_scan_timeout()); } } diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index 3d7419682dc0..ec6adbf8f551 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -118,6 +118,7 @@ unsigned int 
ocfs2_la_default_mb(struct ocfs2_super *osb) { unsigned int la_mb; unsigned int gd_mb; + unsigned int la_max_mb; unsigned int megs_per_slot; struct super_block *sb = osb->sb; @@ -182,6 +183,12 @@ unsigned int ocfs2_la_default_mb(struct ocfs2_super *osb) if (megs_per_slot < la_mb) la_mb = megs_per_slot; + /* We can't store more bits than we can in a block. */ + la_max_mb = ocfs2_clusters_to_megabytes(osb->sb, + ocfs2_local_alloc_size(sb) * 8); + if (la_mb > la_max_mb) + la_mb = la_max_mb; + return la_mb; } diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index 2bb35fe00511..4607923eb24c 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -775,7 +775,7 @@ static int ocfs2_acquire_dquot(struct dquot *dquot) * locking allocators ranks above a transaction start */ WARN_ON(journal_current_handle()); - status = ocfs2_extend_no_holes(gqinode, + status = ocfs2_extend_no_holes(gqinode, NULL, gqinode->i_size + (need_alloc << sb->s_blocksize_bits), gqinode->i_size); if (status < 0) diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index 8bd70d4d184d..dc78764ccc4c 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -971,7 +971,7 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk( u64 p_blkno; /* We are protected by dqio_sem so no locking needed */ - status = ocfs2_extend_no_holes(lqinode, + status = ocfs2_extend_no_holes(lqinode, NULL, lqinode->i_size + 2 * sb->s_blocksize, lqinode->i_size); if (status < 0) { @@ -1114,7 +1114,7 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file( return ocfs2_local_quota_add_chunk(sb, type, offset); /* We are protected by dqio_sem so no locking needed */ - status = ocfs2_extend_no_holes(lqinode, + status = ocfs2_extend_no_holes(lqinode, NULL, lqinode->i_size + sb->s_blocksize, lqinode->i_size); if (status < 0) { diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 4793f36f6518..3ac5aa733e9c 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -2931,6 +2931,12 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle, offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits; end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits); + /* + * We only duplicate pages until we reach the page that contains i_size - 1. + * So trim 'end' to i_size.
+ */ + if (end > i_size_read(context->inode)) + end = i_size_read(context->inode); while (offset < end) { page_index = offset >> PAGE_CACHE_SHIFT; @@ -4166,6 +4172,12 @@ static int __ocfs2_reflink(struct dentry *old_dentry, struct inode *inode = old_dentry->d_inode; struct buffer_head *new_bh = NULL; + if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE) { + ret = -EINVAL; + mlog_errno(ret); + goto out; + } + ret = filemap_fdatawrite(inode->i_mapping); if (ret) { mlog_errno(ret); diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c index 40650021fc24..d8b6e4259b80 100644 --- a/fs/ocfs2/reservations.c +++ b/fs/ocfs2/reservations.c @@ -26,7 +26,6 @@ #include <linux/fs.h> #include <linux/types.h> -#include <linux/slab.h> #include <linux/highmem.h> #include <linux/bitops.h> #include <linux/list.h> diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index f4c2a9eb8c4d..a8e6a95a353f 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -741,7 +741,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb, le16_to_cpu(bg->bg_free_bits_count)); le32_add_cpu(&cl->cl_recs[alloc_rec].c_total, le16_to_cpu(bg->bg_bits)); - cl->cl_recs[alloc_rec].c_blkno = cpu_to_le64(bg->bg_blkno); + cl->cl_recs[alloc_rec].c_blkno = bg->bg_blkno; if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count)) le16_add_cpu(&cl->cl_next_free_rec, 1); diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 2c26ce251cb3..0eaa929a4dbf 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -879,13 +879,15 @@ static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend) if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) continue; if (unsuspend) - status = vfs_quota_enable( - sb_dqopt(sb)->files[type], - type, QFMT_OCFS2, - DQUOT_SUSPENDED); - else - status = vfs_quota_disable(sb, type, - DQUOT_SUSPENDED); + status = dquot_resume(sb, type); + else { + struct ocfs2_mem_dqinfo *oinfo; + + /* Cancel periodic syncing before suspending */ + oinfo = sb_dqinfo(sb, type)->dqi_priv; + cancel_delayed_work_sync(&oinfo->dqi_sync_work); + status = dquot_suspend(sb, type); + } if (status < 0) break; } @@ -916,8 +918,8 @@ static int ocfs2_enable_quotas(struct ocfs2_super *osb) status = -ENOENT; goto out_quota_off; } - status = vfs_quota_enable(inode[type], type, QFMT_OCFS2, - DQUOT_USAGE_ENABLED); + status = dquot_enable(inode[type], type, QFMT_OCFS2, + DQUOT_USAGE_ENABLED); if (status < 0) goto out_quota_off; } @@ -952,8 +954,8 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb) /* Turn off quotas. 
This will remove all dquot structures from * memory and so they will be automatically synced to global * quota files */ - vfs_quota_disable(sb, type, DQUOT_USAGE_ENABLED | - DQUOT_LIMITS_ENABLED); + dquot_disable(sb, type, DQUOT_USAGE_ENABLED | + DQUOT_LIMITS_ENABLED); if (!inode) continue; iput(inode); @@ -962,7 +964,7 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb) /* Handle quota on quotactl */ static int ocfs2_quota_on(struct super_block *sb, int type, int format_id, - char *path, int remount) + char *path) { unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; @@ -970,30 +972,24 @@ static int ocfs2_quota_on(struct super_block *sb, int type, int format_id, if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) return -EINVAL; - if (remount) - return 0; /* Just ignore it has been handled in - * ocfs2_remount() */ - return vfs_quota_enable(sb_dqopt(sb)->files[type], type, - format_id, DQUOT_LIMITS_ENABLED); + return dquot_enable(sb_dqopt(sb)->files[type], type, + format_id, DQUOT_LIMITS_ENABLED); } /* Handle quota off quotactl */ -static int ocfs2_quota_off(struct super_block *sb, int type, int remount) +static int ocfs2_quota_off(struct super_block *sb, int type) { - if (remount) - return 0; /* Ignore now and handle later in - * ocfs2_remount() */ - return vfs_quota_disable(sb, type, DQUOT_LIMITS_ENABLED); + return dquot_disable(sb, type, DQUOT_LIMITS_ENABLED); } static const struct quotactl_ops ocfs2_quotactl_ops = { .quota_on = ocfs2_quota_on, .quota_off = ocfs2_quota_off, - .quota_sync = vfs_quota_sync, - .get_info = vfs_get_dqinfo, - .set_info = vfs_set_dqinfo, - .get_dqblk = vfs_get_dqblk, - .set_dqblk = vfs_set_dqblk, + .quota_sync = dquot_quota_sync, + .get_info = dquot_get_dqinfo, + .set_info = dquot_set_dqinfo, + .get_dqblk = dquot_get_dqblk, + .set_dqblk = dquot_set_dqblk, }; static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index e97b34842cfe..d03469f61801 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -709,7 +709,7 @@ static int ocfs2_xattr_extend_allocation(struct inode *inode, struct ocfs2_xattr_value_buf *vb, struct ocfs2_xattr_set_ctxt *ctxt) { - int status = 0; + int status = 0, credits; handle_t *handle = ctxt->handle; enum ocfs2_alloc_restarted why; u32 prev_clusters, logical_start = le32_to_cpu(vb->vb_xv->xr_clusters); @@ -719,38 +719,54 @@ static int ocfs2_xattr_extend_allocation(struct inode *inode, ocfs2_init_xattr_value_extent_tree(&et, INODE_CACHE(inode), vb); - status = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (status < 0) { - mlog_errno(status); - goto leave; - } + while (clusters_to_add) { + status = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + break; + } - prev_clusters = le32_to_cpu(vb->vb_xv->xr_clusters); - status = ocfs2_add_clusters_in_btree(handle, - &et, - &logical_start, - clusters_to_add, - 0, - ctxt->data_ac, - ctxt->meta_ac, - &why); - if (status < 0) { - mlog_errno(status); - goto leave; - } + prev_clusters = le32_to_cpu(vb->vb_xv->xr_clusters); + status = ocfs2_add_clusters_in_btree(handle, + &et, + &logical_start, + clusters_to_add, + 0, + ctxt->data_ac, + ctxt->meta_ac, + &why); + if ((status < 0) && (status != -EAGAIN)) { + if (status != -ENOSPC) + mlog_errno(status); + break; + } - ocfs2_journal_dirty(handle, vb->vb_bh); + ocfs2_journal_dirty(handle, vb->vb_bh); - 
clusters_to_add -= le32_to_cpu(vb->vb_xv->xr_clusters) - prev_clusters; + clusters_to_add -= le32_to_cpu(vb->vb_xv->xr_clusters) - + prev_clusters; - /* - * We should have already allocated enough space before the transaction, - * so no need to restart. - */ - BUG_ON(why != RESTART_NONE || clusters_to_add); - -leave: + if (why != RESTART_NONE && clusters_to_add) { + /* + * We can only fail in case the alloc file doesn't give + * up enough clusters. + */ + BUG_ON(why == RESTART_META); + + mlog(0, "restarting xattr value extension for %u" + " clusters.\n", clusters_to_add); + credits = ocfs2_calc_extend_credits(inode->i_sb, + &vb->vb_xv->xr_list, + clusters_to_add); + status = ocfs2_extend_trans(handle, credits); + if (status < 0) { + status = -ENOMEM; + mlog_errno(status); + break; + } + } + } return status; } @@ -6788,16 +6804,15 @@ out: return ret; } -static int ocfs2_reflink_xattr_buckets(handle_t *handle, +static int ocfs2_reflink_xattr_bucket(handle_t *handle, u64 blkno, u64 new_blkno, u32 clusters, + u32 *cpos, int num_buckets, struct ocfs2_alloc_context *meta_ac, struct ocfs2_alloc_context *data_ac, struct ocfs2_reflink_xattr_tree_args *args) { int i, j, ret = 0; struct super_block *sb = args->reflink->old_inode->i_sb; - u32 bpc = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(sb)); - u32 num_buckets = clusters * bpc; int bpb = args->old_bucket->bu_blocks; struct ocfs2_xattr_value_buf vb = { .vb_access = ocfs2_journal_access, @@ -6816,14 +6831,6 @@ static int ocfs2_reflink_xattr_buckets(handle_t *handle, break; } - /* - * The real bucket num in this series of blocks is stored - * in the 1st bucket. - */ - if (i == 0) - num_buckets = le16_to_cpu( - bucket_xh(args->old_bucket)->xh_num_buckets); - ret = ocfs2_xattr_bucket_journal_access(handle, args->new_bucket, OCFS2_JOURNAL_ACCESS_CREATE); @@ -6837,6 +6844,18 @@ static int ocfs2_reflink_xattr_buckets(handle_t *handle, bucket_block(args->old_bucket, j), sb->s_blocksize); + /* + * Record the start cpos so that we can use it to initialize + * our xattr tree; we also set the xh_num_buckets for the new + * bucket.
+ */ + if (i == 0) { + *cpos = le32_to_cpu(bucket_xh(args->new_bucket)-> + xh_entries[0].xe_name_hash); + bucket_xh(args->new_bucket)->xh_num_buckets = + cpu_to_le16(num_buckets); + } + ocfs2_xattr_bucket_journal_dirty(handle, args->new_bucket); ret = ocfs2_reflink_xattr_header(handle, args->reflink, @@ -6866,6 +6885,7 @@ static int ocfs2_reflink_xattr_buckets(handle_t *handle, } ocfs2_xattr_bucket_journal_dirty(handle, args->new_bucket); + ocfs2_xattr_bucket_relse(args->old_bucket); ocfs2_xattr_bucket_relse(args->new_bucket); } @@ -6874,6 +6894,75 @@ static int ocfs2_reflink_xattr_buckets(handle_t *handle, ocfs2_xattr_bucket_relse(args->new_bucket); return ret; } + +static int ocfs2_reflink_xattr_buckets(handle_t *handle, + struct inode *inode, + struct ocfs2_reflink_xattr_tree_args *args, + struct ocfs2_extent_tree *et, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_alloc_context *data_ac, + u64 blkno, u32 cpos, u32 len) +{ + int ret, first_inserted = 0; + u32 p_cluster, num_clusters, reflink_cpos = 0; + u64 new_blkno; + unsigned int num_buckets, reflink_buckets; + unsigned int bpc = + ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)); + + ret = ocfs2_read_xattr_bucket(args->old_bucket, blkno); + if (ret) { + mlog_errno(ret); + goto out; + } + num_buckets = le16_to_cpu(bucket_xh(args->old_bucket)->xh_num_buckets); + ocfs2_xattr_bucket_relse(args->old_bucket); + + while (len && num_buckets) { + ret = ocfs2_claim_clusters(handle, data_ac, + 1, &p_cluster, &num_clusters); + if (ret) { + mlog_errno(ret); + goto out; + } + + new_blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster); + reflink_buckets = min(num_buckets, bpc * num_clusters); + + ret = ocfs2_reflink_xattr_bucket(handle, blkno, + new_blkno, num_clusters, + &reflink_cpos, reflink_buckets, + meta_ac, data_ac, args); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * For the 1st allocated cluster, we make it use the same cpos + * so that the xattr tree looks the same as the original one + * in most cases. + */ + if (!first_inserted) { + reflink_cpos = cpos; + first_inserted = 1; + } + ret = ocfs2_insert_extent(handle, et, reflink_cpos, new_blkno, + num_clusters, 0, meta_ac); + if (ret) + mlog_errno(ret); + + mlog(0, "insert new xattr extent rec start %llu len %u to %u\n", + (unsigned long long)new_blkno, num_clusters, reflink_cpos); + + len -= num_clusters; + blkno += ocfs2_clusters_to_blocks(inode->i_sb, num_clusters); + num_buckets -= reflink_buckets; + } +out: + return ret; +} + /* * Create the same xattr extent record in the new inode's xattr tree.
*/ @@ -6885,8 +6974,6 @@ static int ocfs2_reflink_xattr_rec(struct inode *inode, void *para) { int ret, credits = 0; - u32 p_cluster, num_clusters; - u64 new_blkno; handle_t *handle; struct ocfs2_reflink_xattr_tree_args *args = (struct ocfs2_reflink_xattr_tree_args *)para; @@ -6895,6 +6982,9 @@ static int ocfs2_reflink_xattr_rec(struct inode *inode, struct ocfs2_alloc_context *data_ac = NULL; struct ocfs2_extent_tree et; + mlog(0, "reflink xattr buckets %llu len %u\n", + (unsigned long long)blkno, len); + ocfs2_init_xattr_tree_extent_tree(&et, INODE_CACHE(args->reflink->new_inode), args->new_blk_bh); @@ -6914,32 +7004,12 @@ static int ocfs2_reflink_xattr_rec(struct inode *inode, goto out; } - ret = ocfs2_claim_clusters(handle, data_ac, - len, &p_cluster, &num_clusters); - if (ret) { - mlog_errno(ret); - goto out_commit; - } - - new_blkno = ocfs2_clusters_to_blocks(osb->sb, p_cluster); - - mlog(0, "reflink xattr buckets %llu to %llu, len %u\n", - (unsigned long long)blkno, (unsigned long long)new_blkno, len); - ret = ocfs2_reflink_xattr_buckets(handle, blkno, new_blkno, len, - meta_ac, data_ac, args); - if (ret) { - mlog_errno(ret); - goto out_commit; - } - - mlog(0, "insert new xattr extent rec start %llu len %u to %u\n", - (unsigned long long)new_blkno, len, cpos); - ret = ocfs2_insert_extent(handle, &et, cpos, new_blkno, - len, 0, meta_ac); + ret = ocfs2_reflink_xattr_buckets(handle, inode, args, &et, + meta_ac, data_ac, + blkno, cpos, len); if (ret) mlog_errno(ret); -out_commit: ocfs2_commit_trans(osb, handle); out: diff --git a/fs/omfs/file.c b/fs/omfs/file.c index 399487c09364..6e7a3291bbe8 100644 --- a/fs/omfs/file.c +++ b/fs/omfs/file.c @@ -329,7 +329,7 @@ const struct file_operations omfs_file_operations = { .aio_read = generic_file_aio_read, .aio_write = generic_file_aio_write, .mmap = generic_file_mmap, - .fsync = simple_fsync, + .fsync = generic_file_fsync, .splice_read = generic_file_splice_read, }; diff --git a/fs/open.c b/fs/open.c index 5463266db9e6..0d1fa3dc0efb 100644 --- a/fs/open.c +++ b/fs/open.c @@ -110,7 +110,7 @@ static long do_sys_truncate(const char __user *pathname, loff_t length) error = locks_verify_truncate(inode, NULL, length); if (!error) - error = security_path_truncate(&path, length, 0); + error = security_path_truncate(&path); if (!error) error = do_truncate(path.dentry, length, 0, NULL); @@ -165,8 +165,7 @@ static long do_sys_ftruncate(unsigned int fd, loff_t length, int small) error = locks_verify_truncate(inode, file, length); if (!error) - error = security_path_truncate(&file->f_path, length, - ATTR_MTIME|ATTR_CTIME); + error = security_path_truncate(&file->f_path); if (!error) error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, file); out_putf: @@ -367,7 +366,7 @@ SYSCALL_DEFINE1(chdir, const char __user *, filename) if (error) goto out; - error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_ACCESS); + error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR); if (error) goto dput_and_out; @@ -396,7 +395,7 @@ SYSCALL_DEFINE1(fchdir, unsigned int, fd) if (!S_ISDIR(inode->i_mode)) goto out_putf; - error = inode_permission(inode, MAY_EXEC | MAY_ACCESS); + error = inode_permission(inode, MAY_EXEC | MAY_CHDIR); if (!error) set_fs_pwd(current->fs, &file->f_path); out_putf: @@ -414,7 +413,7 @@ SYSCALL_DEFINE1(chroot, const char __user *, filename) if (error) goto out; - error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_ACCESS); + error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR); if (error) goto 
dput_and_out; diff --git a/fs/partitions/ibm.c b/fs/partitions/ibm.c index 3e73de5967ff..fc8497643fd0 100644 --- a/fs/partitions/ibm.c +++ b/fs/partitions/ibm.c @@ -74,6 +74,7 @@ int ibm_partition(struct parsed_partitions *state) } *label; unsigned char *data; Sector sect; + sector_t labelsect; res = 0; blocksize = bdev_logical_block_size(bdev); @@ -98,10 +99,19 @@ int ibm_partition(struct parsed_partitions *state) goto out_freeall; /* + * Special case for FBA disks: label sector does not depend on + * blocksize. + */ + if ((info->cu_type == 0x6310 && info->dev_type == 0x9336) || + (info->cu_type == 0x3880 && info->dev_type == 0x3370)) + labelsect = info->label_block; + else + labelsect = info->label_block * (blocksize >> 9); + + /* * Get volume label, extract name and type. */ - data = read_part_sector(state, info->label_block*(blocksize/512), - §); + data = read_part_sector(state, labelsect, §); if (data == NULL) goto out_readerr; diff --git a/fs/pipe.c b/fs/pipe.c index d79872eba09a..279eef96c51c 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -26,9 +26,14 @@ /* * The max size that a non-root user is allowed to grow the pipe. Can - * be set by root in /proc/sys/fs/pipe-max-pages + * be set by root in /proc/sys/fs/pipe-max-size */ -unsigned int pipe_max_pages = PIPE_DEF_BUFFERS * 16; +unsigned int pipe_max_size = 1048576; + +/* + * Minimum pipe size, as required by POSIX + */ +unsigned int pipe_min_size = PAGE_SIZE; /* * We use a start+len construction, which provides full use of the @@ -230,6 +235,7 @@ void *generic_pipe_buf_map(struct pipe_inode_info *pipe, return kmap(buf->page); } +EXPORT_SYMBOL(generic_pipe_buf_map); /** * generic_pipe_buf_unmap - unmap a previously mapped pipe buffer @@ -249,6 +255,7 @@ void generic_pipe_buf_unmap(struct pipe_inode_info *pipe, } else kunmap(buf->page); } +EXPORT_SYMBOL(generic_pipe_buf_unmap); /** * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer @@ -279,6 +286,7 @@ int generic_pipe_buf_steal(struct pipe_inode_info *pipe, return 1; } +EXPORT_SYMBOL(generic_pipe_buf_steal); /** * generic_pipe_buf_get - get a reference to a &struct pipe_buffer @@ -294,6 +302,7 @@ void generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { page_cache_get(buf->page); } +EXPORT_SYMBOL(generic_pipe_buf_get); /** * generic_pipe_buf_confirm - verify contents of the pipe buffer @@ -309,6 +318,7 @@ int generic_pipe_buf_confirm(struct pipe_inode_info *info, { return 0; } +EXPORT_SYMBOL(generic_pipe_buf_confirm); /** * generic_pipe_buf_release - put a reference to a &struct pipe_buffer @@ -323,6 +333,7 @@ void generic_pipe_buf_release(struct pipe_inode_info *pipe, { page_cache_release(buf->page); } +EXPORT_SYMBOL(generic_pipe_buf_release); static const struct pipe_buf_operations anon_pipe_buf_ops = { .can_merge = 1, @@ -1112,26 +1123,20 @@ SYSCALL_DEFINE1(pipe, int __user *, fildes) * Allocate a new array of pipe buffers and copy the info over. Returns the * pipe size if successful, or return -ERROR on error. */ -static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg) +static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages) { struct pipe_buffer *bufs; /* - * Must be a power-of-2 currently - */ - if (!is_power_of_2(arg)) - return -EINVAL; - - /* * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't * expect a lot of shrink+grow operations, just free and allocate * again like we would do for growing. If the pipe currently * contains more buffers than arg, then return busy. 
*/ - if (arg < pipe->nrbufs) + if (nr_pages < pipe->nrbufs) return -EBUSY; - bufs = kcalloc(arg, sizeof(struct pipe_buffer), GFP_KERNEL); + bufs = kcalloc(nr_pages, sizeof(struct pipe_buffer), GFP_KERNEL); if (unlikely(!bufs)) return -ENOMEM; @@ -1140,20 +1145,56 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg) * and adjust the indexes. */ if (pipe->nrbufs) { - const unsigned int tail = pipe->nrbufs & (pipe->buffers - 1); - const unsigned int head = pipe->nrbufs - tail; + unsigned int tail; + unsigned int head; + tail = pipe->curbuf + pipe->nrbufs; + if (tail < pipe->buffers) + tail = 0; + else + tail &= (pipe->buffers - 1); + + head = pipe->nrbufs - tail; if (head) memcpy(bufs, pipe->bufs + pipe->curbuf, head * sizeof(struct pipe_buffer)); if (tail) - memcpy(bufs + head, pipe->bufs + pipe->curbuf, tail * sizeof(struct pipe_buffer)); + memcpy(bufs + head, pipe->bufs, tail * sizeof(struct pipe_buffer)); } pipe->curbuf = 0; kfree(pipe->bufs); pipe->bufs = bufs; - pipe->buffers = arg; - return arg; + pipe->buffers = nr_pages; + return nr_pages * PAGE_SIZE; +} + +/* + * Currently we rely on the pipe array holding a power-of-2 number + * of pages. + */ +static inline unsigned int round_pipe_size(unsigned int size) +{ + unsigned long nr_pages; + + nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; + return roundup_pow_of_two(nr_pages) << PAGE_SHIFT; +} + +/* + * This should work even if CONFIG_PROC_FS isn't set, as proc_dointvec_minmax + * will return an error. + */ +int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf, + size_t *lenp, loff_t *ppos) +{ + int ret; + + ret = proc_dointvec_minmax(table, write, buf, lenp, ppos); + if (ret < 0 || !write) + return ret; + + pipe_max_size = round_pipe_size(pipe_max_size); + return ret; } long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) @@ -1168,25 +1209,32 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) mutex_lock(&pipe->inode->i_mutex); switch (cmd) { - case F_SETPIPE_SZ: - if (!capable(CAP_SYS_ADMIN) && arg > pipe_max_pages) - return -EINVAL; - /* - * The pipe needs to be at least 2 pages large to - * guarantee POSIX behaviour. - */ - if (arg < 2) - return -EINVAL; - ret = pipe_set_size(pipe, arg); + case F_SETPIPE_SZ: { + unsigned int size, nr_pages; + + size = round_pipe_size(arg); + nr_pages = size >> PAGE_SHIFT; + + ret = -EINVAL; + if (!nr_pages) + goto out; + + if (!capable(CAP_SYS_RESOURCE) && size > pipe_max_size) { + ret = -EPERM; + goto out; + } + ret = pipe_set_size(pipe, nr_pages); break; + } case F_GETPIPE_SZ: - ret = pipe->buffers; + ret = pipe->buffers * PAGE_SIZE; break; default: ret = -EINVAL; break; } +out: mutex_unlock(&pipe->inode->i_mutex); return ret; } diff --git a/fs/proc/array.c b/fs/proc/array.c index 885ab5513ac5..fff6572676ae 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -176,7 +176,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, if (tracer) tpid = task_pid_nr_ns(tracer, ns); } - cred = get_cred((struct cred *) __task_cred(p)); + cred = get_task_cred(p); seq_printf(m, "State:\t%s\n" "Tgid:\t%d\n" @@ -267,7 +267,7 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p) shpending = p->signal->shared_pending.signal; blocked = p->blocked; collect_sigign_sigcatch(p, &ignored, &caught); - num_threads = atomic_read(&p->signal->count); + num_threads = get_nr_threads(p); rcu_read_lock(); /* FIXME: is this correct? 
*/ qsize = atomic_read(&__task_cred(p)->user->sigpending); rcu_read_unlock(); @@ -410,7 +410,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, tty_nr = new_encode_dev(tty_devnum(sig->tty)); } - num_threads = atomic_read(&sig->count); + num_threads = get_nr_threads(task); collect_sigign_sigcatch(task, &sigign, &sigcatch); cmin_flt = sig->cmin_flt; diff --git a/fs/proc/base.c b/fs/proc/base.c index c7f9f23449dc..acb7ef80ea4f 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -166,18 +166,6 @@ static int get_fs_path(struct task_struct *task, struct path *path, bool root) return result; } -static int get_nr_threads(struct task_struct *tsk) -{ - unsigned long flags; - int count = 0; - - if (lock_task_sighand(tsk, &flags)) { - count = atomic_read(&tsk->signal->count); - unlock_task_sighand(tsk, &flags); - } - return count; -} - static int proc_cwd_link(struct inode *inode, struct path *path) { struct task_struct *task = get_proc_task(inode); @@ -2444,7 +2432,7 @@ static struct dentry *proc_base_instantiate(struct inode *dir, const struct pid_entry *p = ptr; struct inode *inode; struct proc_inode *ei; - struct dentry *error = ERR_PTR(-EINVAL); + struct dentry *error; /* Allocate the inode */ error = ERR_PTR(-ENOMEM); @@ -2794,7 +2782,7 @@ out: struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) { - struct dentry *result = ERR_PTR(-ENOENT); + struct dentry *result; struct task_struct *task; unsigned tgid; struct pid_namespace *ns; diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 43c127490606..2791907744ed 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -343,21 +343,6 @@ static DEFINE_SPINLOCK(proc_inum_lock); /* protects the above */ /* * Return an inode number between PROC_DYNAMIC_FIRST and * 0xffffffff, or zero on failure. - * - * Current inode allocations in the proc-fs (hex-numbers): - * - * 00000000 reserved - * 00000001-00000fff static entries (goners) - * 001 root-ino - * - * 00001000-00001fff unused - * 0001xxxx-7fffxxxx pid-dir entries for pid 1-7fff - * 80000000-efffffff unused - * f0000000-ffffffff dynamic entries - * - * Goal: - * Once we split the thing into several virtual filesystems, - * we will get rid of magical ranges (and this comment, BTW). 
*/ static unsigned int get_inode_number(void) { diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index c837a77351be..6f37c391468d 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -588,7 +588,7 @@ static struct kcore_list kcore_text; */ static void __init proc_kcore_text_init(void) { - kclist_add(&kcore_text, _stext, _end - _stext, KCORE_TEXT); + kclist_add(&kcore_text, _text, _end - _text, KCORE_TEXT); } #else static void __init proc_kcore_text_init(void) diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c index ce94801f48ca..d9396a4fc7ff 100644 --- a/fs/proc/proc_devtree.c +++ b/fs/proc/proc_devtree.c @@ -209,6 +209,9 @@ void proc_device_tree_add_node(struct device_node *np, for (pp = np->properties; pp != NULL; pp = pp->next) { p = pp->name; + if (strchr(p, '/')) + continue; + if (duplicate_name(de, p)) p = fixup_name(np, de, p); diff --git a/fs/proc/root.c b/fs/proc/root.c index 757c069f2a65..4258384ed22d 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -110,7 +110,6 @@ void __init proc_root_init(void) if (err) return; proc_mnt = kern_mount_data(&proc_fs_type, &init_pid_ns); - err = PTR_ERR(proc_mnt); if (IS_ERR(proc_mnt)) { unregister_filesystem(&proc_fs_type); return; diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 46d4b5d72bd3..cb6306e63843 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -122,11 +122,20 @@ int task_statm(struct mm_struct *mm, int *shared, int *text, return size; } +static void pad_len_spaces(struct seq_file *m, int len) +{ + len = 25 + sizeof(void*) * 6 - len; + if (len < 1) + len = 1; + seq_printf(m, "%*c", len, ' '); +} + /* * display a single VMA to a sequenced file */ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) { + struct mm_struct *mm = vma->vm_mm; unsigned long ino = 0; struct file *file; dev_t dev = 0; @@ -155,11 +164,14 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) MAJOR(dev), MINOR(dev), ino, &len); if (file) { - len = 25 + sizeof(void *) * 6 - len; - if (len < 1) - len = 1; - seq_printf(m, "%*c", len, ' '); + pad_len_spaces(m, len); seq_path(m, &file->f_path, ""); + } else if (mm) { + if (vma->vm_start <= mm->start_stack && + vma->vm_end >= mm->start_stack) { + pad_len_spaces(m, len); + seq_puts(m, "[stack]"); + } } seq_putc(m, '\n'); diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c index 6f30c3d5bcbf..6e8fc62b40a8 100644 --- a/fs/qnx4/dir.c +++ b/fs/qnx4/dir.c @@ -77,9 +77,10 @@ out: const struct file_operations qnx4_dir_operations = { + .llseek = generic_file_llseek, .read = generic_read_dir, .readdir = qnx4_readdir, - .fsync = simple_fsync, + .fsync = generic_file_fsync, }; const struct inode_operations qnx4_dir_inode_operations = diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 655a4c52b8c3..437d2ca2de97 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -228,10 +228,6 @@ static struct hlist_head *dquot_hash; struct dqstats dqstats; EXPORT_SYMBOL(dqstats); -#ifdef CONFIG_SMP -struct dqstats *dqstats_pcpu; -EXPORT_SYMBOL(dqstats_pcpu); -#endif static qsize_t inode_get_rsv_space(struct inode *inode); static void __dquot_initialize(struct inode *inode, int type); @@ -584,7 +580,7 @@ out: } EXPORT_SYMBOL(dquot_scan_active); -int vfs_quota_sync(struct super_block *sb, int type, int wait) +int dquot_quota_sync(struct super_block *sb, int type, int wait) { struct list_head *dirty; struct dquot *dquot; @@ -656,7 +652,7 @@ int vfs_quota_sync(struct super_block *sb, int type, int wait) return 0; } -EXPORT_SYMBOL(vfs_quota_sync); 
+EXPORT_SYMBOL(dquot_quota_sync); /* Free unused dquots from cache */ static void prune_dqcache(int count) @@ -676,35 +672,20 @@ static void prune_dqcache(int count) } } -static int dqstats_read(unsigned int type) -{ - int count = 0; -#ifdef CONFIG_SMP - int cpu; - for_each_possible_cpu(cpu) - count += per_cpu_ptr(dqstats_pcpu, cpu)->stat[type]; - /* Statistics reading is racy, but absolute accuracy isn't required */ - if (count < 0) - count = 0; -#else - count = dqstats.stat[type]; -#endif - return count; -} - /* * This is called from kswapd when we think we need some * more memory */ - -static int shrink_dqcache_memory(int nr, gfp_t gfp_mask) +static int shrink_dqcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask) { if (nr) { spin_lock(&dq_list_lock); prune_dqcache(nr); spin_unlock(&dq_list_lock); } - return (dqstats_read(DQST_FREE_DQUOTS)/100) * sysctl_vfs_cache_pressure; + return ((unsigned) + percpu_counter_read_positive(&dqstats.counter[DQST_FREE_DQUOTS]) + /100) * sysctl_vfs_cache_pressure; } static struct shrinker dqcache_shrinker = { @@ -1514,11 +1495,13 @@ static void inode_decr_space(struct inode *inode, qsize_t number, int reserve) /* * This operation can block, but only after everything is updated */ -int __dquot_alloc_space(struct inode *inode, qsize_t number, - int warn, int reserve) +int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags) { int cnt, ret = 0; char warntype[MAXQUOTAS]; + int warn = flags & DQUOT_SPACE_WARN; + int reserve = flags & DQUOT_SPACE_RESERVE; + int nofail = flags & DQUOT_SPACE_NOFAIL; /* * First test before acquiring mutex - solves deadlocks when we @@ -1539,7 +1522,7 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, continue; ret = check_bdq(inode->i_dquot[cnt], number, !warn, warntype+cnt); - if (ret) { + if (ret && !nofail) { spin_unlock(&dq_data_lock); goto out_flush_warn; } @@ -1638,10 +1621,11 @@ EXPORT_SYMBOL(dquot_claim_space_nodirty); /* * This operation can block, but only after everything is updated */ -void __dquot_free_space(struct inode *inode, qsize_t number, int reserve) +void __dquot_free_space(struct inode *inode, qsize_t number, int flags) { unsigned int cnt; char warntype[MAXQUOTAS]; + int reserve = flags & DQUOT_SPACE_RESERVE; /* First test before acquiring mutex - solves deadlocks when we * re-enter the quota code and are already holding the mutex */ @@ -1812,7 +1796,7 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr) if (iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) transfer_to[USRQUOTA] = dqget(sb, iattr->ia_uid, USRQUOTA); if (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid) - transfer_to[GRPQUOTA] = dqget(sb, iattr->ia_uid, GRPQUOTA); + transfer_to[GRPQUOTA] = dqget(sb, iattr->ia_gid, GRPQUOTA); ret = __dquot_transfer(inode, transfer_to); dqput_all(transfer_to); @@ -1847,6 +1831,7 @@ const struct dquot_operations dquot_operations = { .alloc_dquot = dquot_alloc, .destroy_dquot = dquot_destroy, }; +EXPORT_SYMBOL(dquot_operations); /* * Generic helper for ->open on filesystems supporting disk quotas. @@ -1865,7 +1850,7 @@ EXPORT_SYMBOL(dquot_file_open); /* * Turn quota off on a device. 
type == -1 ==> quotaoff for all types (umount) */ -int vfs_quota_disable(struct super_block *sb, int type, unsigned int flags) +int dquot_disable(struct super_block *sb, int type, unsigned int flags) { int cnt, ret = 0; struct quota_info *dqopt = sb_dqopt(sb); @@ -1995,14 +1980,15 @@ put_inodes: } return ret; } -EXPORT_SYMBOL(vfs_quota_disable); +EXPORT_SYMBOL(dquot_disable); -int vfs_quota_off(struct super_block *sb, int type, int remount) +int dquot_quota_off(struct super_block *sb, int type) { - return vfs_quota_disable(sb, type, remount ? DQUOT_SUSPENDED : - (DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED)); + return dquot_disable(sb, type, + DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); } -EXPORT_SYMBOL(vfs_quota_off); +EXPORT_SYMBOL(dquot_quota_off); + /* * Turn quotas on on a device */ @@ -2120,36 +2106,43 @@ out_fmt: } /* Reenable quotas on remount RW */ -static int vfs_quota_on_remount(struct super_block *sb, int type) +int dquot_resume(struct super_block *sb, int type) { struct quota_info *dqopt = sb_dqopt(sb); struct inode *inode; - int ret; + int ret = 0, cnt; unsigned int flags; - mutex_lock(&dqopt->dqonoff_mutex); - if (!sb_has_quota_suspended(sb, type)) { + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (type != -1 && cnt != type) + continue; + + mutex_lock(&dqopt->dqonoff_mutex); + if (!sb_has_quota_suspended(sb, cnt)) { + mutex_unlock(&dqopt->dqonoff_mutex); + continue; + } + inode = dqopt->files[cnt]; + dqopt->files[cnt] = NULL; + spin_lock(&dq_state_lock); + flags = dqopt->flags & dquot_state_flag(DQUOT_USAGE_ENABLED | + DQUOT_LIMITS_ENABLED, + cnt); + dqopt->flags &= ~dquot_state_flag(DQUOT_STATE_FLAGS, cnt); + spin_unlock(&dq_state_lock); mutex_unlock(&dqopt->dqonoff_mutex); - return 0; - } - inode = dqopt->files[type]; - dqopt->files[type] = NULL; - spin_lock(&dq_state_lock); - flags = dqopt->flags & dquot_state_flag(DQUOT_USAGE_ENABLED | - DQUOT_LIMITS_ENABLED, type); - dqopt->flags &= ~dquot_state_flag(DQUOT_STATE_FLAGS, type); - spin_unlock(&dq_state_lock); - mutex_unlock(&dqopt->dqonoff_mutex); - flags = dquot_generic_flag(flags, type); - ret = vfs_load_quota_inode(inode, type, dqopt->info[type].dqi_fmt_id, - flags); - iput(inode); + flags = dquot_generic_flag(flags, cnt); + ret = vfs_load_quota_inode(inode, cnt, + dqopt->info[cnt].dqi_fmt_id, flags); + iput(inode); + } return ret; } +EXPORT_SYMBOL(dquot_resume); -int vfs_quota_on_path(struct super_block *sb, int type, int format_id, +int dquot_quota_on_path(struct super_block *sb, int type, int format_id, struct path *path) { int error = security_quota_on(path->dentry); @@ -2164,40 +2157,36 @@ int vfs_quota_on_path(struct super_block *sb, int type, int format_id, DQUOT_LIMITS_ENABLED); return error; } -EXPORT_SYMBOL(vfs_quota_on_path); +EXPORT_SYMBOL(dquot_quota_on_path); -int vfs_quota_on(struct super_block *sb, int type, int format_id, char *name, - int remount) +int dquot_quota_on(struct super_block *sb, int type, int format_id, char *name) { struct path path; int error; - if (remount) - return vfs_quota_on_remount(sb, type); - error = kern_path(name, LOOKUP_FOLLOW, &path); if (!error) { - error = vfs_quota_on_path(sb, type, format_id, &path); + error = dquot_quota_on_path(sb, type, format_id, &path); path_put(&path); } return error; } -EXPORT_SYMBOL(vfs_quota_on); +EXPORT_SYMBOL(dquot_quota_on); /* * More powerful function for turning on quotas allowing setting * of individual quota flags */ -int vfs_quota_enable(struct inode *inode, int type, int format_id, - unsigned int flags) +int dquot_enable(struct inode 
*inode, int type, int format_id, + unsigned int flags) { int ret = 0; struct super_block *sb = inode->i_sb; struct quota_info *dqopt = sb_dqopt(sb); /* Just unsuspend quotas? */ - if (flags & DQUOT_SUSPENDED) - return vfs_quota_on_remount(sb, type); + BUG_ON(flags & DQUOT_SUSPENDED); + if (!flags) return 0; /* Just updating flags needed? */ @@ -2229,13 +2218,13 @@ out_lock: load_quota: return vfs_load_quota_inode(inode, type, format_id, flags); } -EXPORT_SYMBOL(vfs_quota_enable); +EXPORT_SYMBOL(dquot_enable); /* * This function is used when filesystem needs to initialize quotas * during mount time. */ -int vfs_quota_on_mount(struct super_block *sb, char *qf_name, +int dquot_quota_on_mount(struct super_block *sb, char *qf_name, int format_id, int type) { struct dentry *dentry; @@ -2261,24 +2250,7 @@ out: dput(dentry); return error; } -EXPORT_SYMBOL(vfs_quota_on_mount); - -/* Wrapper to turn on quotas when remounting rw */ -int vfs_dq_quota_on_remount(struct super_block *sb) -{ - int cnt; - int ret = 0, err; - - if (!sb->s_qcop || !sb->s_qcop->quota_on) - return -ENOSYS; - for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - err = sb->s_qcop->quota_on(sb, cnt, 0, NULL, 1); - if (err < 0 && !ret) - ret = err; - } - return ret; -} -EXPORT_SYMBOL(vfs_dq_quota_on_remount); +EXPORT_SYMBOL(dquot_quota_on_mount); static inline qsize_t qbtos(qsize_t blocks) { @@ -2313,8 +2285,8 @@ static void do_get_dqblk(struct dquot *dquot, struct fs_disk_quota *di) spin_unlock(&dq_data_lock); } -int vfs_get_dqblk(struct super_block *sb, int type, qid_t id, - struct fs_disk_quota *di) +int dquot_get_dqblk(struct super_block *sb, int type, qid_t id, + struct fs_disk_quota *di) { struct dquot *dquot; @@ -2326,7 +2298,7 @@ int vfs_get_dqblk(struct super_block *sb, int type, qid_t id, return 0; } -EXPORT_SYMBOL(vfs_get_dqblk); +EXPORT_SYMBOL(dquot_get_dqblk); #define VFS_FS_DQ_MASK \ (FS_DQ_BCOUNT | FS_DQ_BSOFT | FS_DQ_BHARD | \ @@ -2425,7 +2397,7 @@ static int do_set_dqblk(struct dquot *dquot, struct fs_disk_quota *di) return 0; } -int vfs_set_dqblk(struct super_block *sb, int type, qid_t id, +int dquot_set_dqblk(struct super_block *sb, int type, qid_t id, struct fs_disk_quota *di) { struct dquot *dquot; @@ -2441,10 +2413,10 @@ int vfs_set_dqblk(struct super_block *sb, int type, qid_t id, out: return rc; } -EXPORT_SYMBOL(vfs_set_dqblk); +EXPORT_SYMBOL(dquot_set_dqblk); /* Generic routine for getting common part of quota file information */ -int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii) +int dquot_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii) { struct mem_dqinfo *mi; @@ -2463,10 +2435,10 @@ int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii) mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); return 0; } -EXPORT_SYMBOL(vfs_get_dqinfo); +EXPORT_SYMBOL(dquot_get_dqinfo); /* Generic routine for setting common part of quota file information */ -int vfs_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii) +int dquot_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii) { struct mem_dqinfo *mi; int err = 0; @@ -2493,27 +2465,27 @@ out: mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); return err; } -EXPORT_SYMBOL(vfs_set_dqinfo); +EXPORT_SYMBOL(dquot_set_dqinfo); -const struct quotactl_ops vfs_quotactl_ops = { - .quota_on = vfs_quota_on, - .quota_off = vfs_quota_off, - .quota_sync = vfs_quota_sync, - .get_info = vfs_get_dqinfo, - .set_info = vfs_set_dqinfo, - .get_dqblk = vfs_get_dqblk, - .set_dqblk = vfs_set_dqblk +const struct quotactl_ops 
dquot_quotactl_ops = { + .quota_on = dquot_quota_on, + .quota_off = dquot_quota_off, + .quota_sync = dquot_quota_sync, + .get_info = dquot_get_dqinfo, + .set_info = dquot_set_dqinfo, + .get_dqblk = dquot_get_dqblk, + .set_dqblk = dquot_set_dqblk }; - +EXPORT_SYMBOL(dquot_quotactl_ops); static int do_proc_dqstats(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { -#ifdef CONFIG_SMP - /* Update global table */ unsigned int type = (int *)table->data - dqstats.stat; - dqstats.stat[type] = dqstats_read(type); -#endif + + /* Update global table */ + dqstats.stat[type] = + percpu_counter_sum_positive(&dqstats.counter[type]); return proc_dointvec(table, write, buffer, lenp, ppos); } @@ -2606,7 +2578,7 @@ static ctl_table sys_table[] = { static int __init dquot_init(void) { - int i; + int i, ret; unsigned long nr_hash, order; printk(KERN_NOTICE "VFS: Disk quotas %s\n", __DQUOT_VERSION__); @@ -2624,12 +2596,11 @@ static int __init dquot_init(void) if (!dquot_hash) panic("Cannot create dquot hash table"); -#ifdef CONFIG_SMP - dqstats_pcpu = alloc_percpu(struct dqstats); - if (!dqstats_pcpu) - panic("Cannot create dquot stats table"); -#endif - memset(&dqstats, 0, sizeof(struct dqstats)); + for (i = 0; i < _DQST_DQSTAT_LAST; i++) { + ret = percpu_counter_init(&dqstats.counter[i], 0); + if (ret) + panic("Cannot create dquot stat counters"); + } /* Find power-of-two hlist_heads which can fit into allocation */ nr_hash = (1UL << order) * PAGE_SIZE / sizeof(struct hlist_head); diff --git a/fs/quota/quota.c b/fs/quota/quota.c index ce3dfd066f59..b299961e1edb 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -73,7 +73,7 @@ static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id, if (IS_ERR(pathname)) return PTR_ERR(pathname); if (sb->s_qcop->quota_on) - ret = sb->s_qcop->quota_on(sb, type, id, pathname, 0); + ret = sb->s_qcop->quota_on(sb, type, id, pathname); putname(pathname); return ret; } @@ -260,7 +260,7 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, case Q_QUOTAOFF: if (!sb->s_qcop->quota_off) return -ENOSYS; - return sb->s_qcop->quota_off(sb, type, 0); + return sb->s_qcop->quota_off(sb, type); case Q_GETFMT: return quota_getfmt(sb, type, addr); case Q_GETINFO: diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c index 78f613cb9c76..4884ac5ae9be 100644 --- a/fs/ramfs/file-mmu.c +++ b/fs/ramfs/file-mmu.c @@ -43,12 +43,13 @@ const struct file_operations ramfs_file_operations = { .write = do_sync_write, .aio_write = generic_file_aio_write, .mmap = generic_file_mmap, - .fsync = simple_sync_file, + .fsync = noop_fsync, .splice_read = generic_file_splice_read, .splice_write = generic_file_splice_write, .llseek = generic_file_llseek, }; const struct inode_operations ramfs_file_inode_operations = { + .setattr = simple_setattr, .getattr = simple_getattr, }; diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 5ea4ad81a429..d532c20fc179 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -42,7 +42,7 @@ const struct file_operations ramfs_file_operations = { .aio_read = generic_file_aio_read, .write = do_sync_write, .aio_write = generic_file_aio_write, - .fsync = simple_sync_file, + .fsync = noop_fsync, .splice_read = generic_file_splice_read, .splice_write = generic_file_splice_write, .llseek = generic_file_llseek, @@ -146,7 +146,7 @@ static int ramfs_nommu_resize(struct inode *inode, loff_t newsize, loff_t size) return ret; } - ret = vmtruncate(inode, newsize); + ret = simple_setsize(inode, 
newsize); return ret; } @@ -169,7 +169,8 @@ static int ramfs_nommu_setattr(struct dentry *dentry, struct iattr *ia) /* pick out size-changing events */ if (ia->ia_valid & ATTR_SIZE) { - loff_t size = i_size_read(inode); + loff_t size = inode->i_size; + if (ia->ia_size != size) { ret = ramfs_nommu_resize(inode, ia->ia_size, size); if (ret < 0 || ia->ia_valid == ATTR_SIZE) @@ -182,7 +183,7 @@ static int ramfs_nommu_setattr(struct dentry *dentry, struct iattr *ia) } } - ret = inode_setattr(inode, ia); + generic_setattr(inode, ia); out: ia->ia_valid = old_ia_valid; return ret; diff --git a/fs/read_write.c b/fs/read_write.c index 113386d6fd2d..9c0485236e68 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -97,6 +97,23 @@ loff_t generic_file_llseek(struct file *file, loff_t offset, int origin) } EXPORT_SYMBOL(generic_file_llseek); +/** + * noop_llseek - No Operation Performed llseek implementation + * @file: file structure to seek on + * @offset: file offset to seek to + * @origin: type of seek + * + * This is an implementation of ->llseek useable for the rare special case when + * userspace expects the seek to succeed but the (device) file is actually not + * able to perform the seek. In this case you use noop_llseek() instead of + * falling back to the default implementation of ->llseek. + */ +loff_t noop_llseek(struct file *file, loff_t offset, int origin) +{ + return file->f_pos; +} +EXPORT_SYMBOL(noop_llseek); + loff_t no_llseek(struct file *file, loff_t offset, int origin) { return -ESPIPE; diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c index 07930449a958..198dabf1b2bb 100644 --- a/fs/reiserfs/dir.c +++ b/fs/reiserfs/dir.c @@ -14,10 +14,10 @@ extern const struct reiserfs_key MIN_KEY; static int reiserfs_readdir(struct file *, void *, filldir_t); -static int reiserfs_dir_fsync(struct file *filp, struct dentry *dentry, - int datasync); +static int reiserfs_dir_fsync(struct file *filp, int datasync); const struct file_operations reiserfs_dir_operations = { + .llseek = generic_file_llseek, .read = generic_read_dir, .readdir = reiserfs_readdir, .fsync = reiserfs_dir_fsync, @@ -27,10 +27,9 @@ const struct file_operations reiserfs_dir_operations = { #endif }; -static int reiserfs_dir_fsync(struct file *filp, struct dentry *dentry, - int datasync) +static int reiserfs_dir_fsync(struct file *filp, int datasync) { - struct inode *inode = dentry->d_inode; + struct inode *inode = filp->f_mapping->host; int err; reiserfs_write_lock(inode->i_sb); err = reiserfs_commit_for_inode(inode); diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index 9977df9f3a54..b82cdd8a45dd 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -134,10 +134,9 @@ static void reiserfs_vfs_truncate_file(struct inode *inode) * be removed... 
*/ -static int reiserfs_sync_file(struct file *filp, - struct dentry *dentry, int datasync) +static int reiserfs_sync_file(struct file *filp, int datasync) { - struct inode *inode = dentry->d_inode; + struct inode *inode = filp->f_mapping->host; int err; int barrier_done; diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 59125fb36d42..9822fa15118b 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -158,6 +158,7 @@ static int finish_unfinished(struct super_block *s) #ifdef CONFIG_QUOTA int i; int ms_active_set; + int quota_enabled[MAXQUOTAS]; #endif /* compose key to look for "save" links */ @@ -179,8 +180,15 @@ static int finish_unfinished(struct super_block *s) } /* Turn on quotas so that they are updated correctly */ for (i = 0; i < MAXQUOTAS; i++) { + quota_enabled[i] = 1; if (REISERFS_SB(s)->s_qf_names[i]) { - int ret = reiserfs_quota_on_mount(s, i); + int ret; + + if (sb_has_quota_active(s, i)) { + quota_enabled[i] = 0; + continue; + } + ret = reiserfs_quota_on_mount(s, i); if (ret < 0) reiserfs_warning(s, "reiserfs-2500", "cannot turn on journaled " @@ -304,8 +312,8 @@ static int finish_unfinished(struct super_block *s) #ifdef CONFIG_QUOTA /* Turn quotas off */ for (i = 0; i < MAXQUOTAS; i++) { - if (sb_dqopt(s)->files[i]) - vfs_quota_off(s, i, 0); + if (sb_dqopt(s)->files[i] && quota_enabled[i]) + dquot_quota_off(s, i); } if (ms_active_set) /* Restore the flag back */ @@ -466,6 +474,8 @@ static void reiserfs_put_super(struct super_block *s) struct reiserfs_transaction_handle th; th.t_trans_id = 0; + dquot_disable(s, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); + reiserfs_write_lock(s); if (s->s_dirt) @@ -620,7 +630,7 @@ static int reiserfs_acquire_dquot(struct dquot *); static int reiserfs_release_dquot(struct dquot *); static int reiserfs_mark_dquot_dirty(struct dquot *); static int reiserfs_write_info(struct super_block *, int); -static int reiserfs_quota_on(struct super_block *, int, int, char *, int); +static int reiserfs_quota_on(struct super_block *, int, int, char *); static const struct dquot_operations reiserfs_quota_operations = { .write_dquot = reiserfs_write_dquot, @@ -634,12 +644,12 @@ static const struct dquot_operations reiserfs_quota_operations = { static const struct quotactl_ops reiserfs_qctl_operations = { .quota_on = reiserfs_quota_on, - .quota_off = vfs_quota_off, - .quota_sync = vfs_quota_sync, - .get_info = vfs_get_dqinfo, - .set_info = vfs_set_dqinfo, - .get_dqblk = vfs_get_dqblk, - .set_dqblk = vfs_set_dqblk, + .quota_off = dquot_quota_off, + .quota_sync = dquot_quota_sync, + .get_info = dquot_get_dqinfo, + .set_info = dquot_set_dqinfo, + .get_dqblk = dquot_get_dqblk, + .set_dqblk = dquot_set_dqblk, }; #endif @@ -1242,6 +1252,11 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) if (s->s_flags & MS_RDONLY) /* it is read-only already */ goto out_ok; + + err = dquot_suspend(s, -1); + if (err < 0) + goto out_err; + /* try to remount file system with read-only permissions */ if (sb_umount_state(rs) == REISERFS_VALID_FS || REISERFS_SB(s)->s_mount_state != REISERFS_VALID_FS) { @@ -1295,6 +1310,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) s->s_dirt = 0; if (!(*mount_flags & MS_RDONLY)) { + dquot_resume(s, -1); finish_unfinished(s); reiserfs_xattr_init(s, *mount_flags); } @@ -2022,15 +2038,15 @@ static int reiserfs_write_info(struct super_block *sb, int type) */ static int reiserfs_quota_on_mount(struct super_block *sb, int type) { - return vfs_quota_on_mount(sb, 
REISERFS_SB(sb)->s_qf_names[type], - REISERFS_SB(sb)->s_jquota_fmt, type); + return dquot_quota_on_mount(sb, REISERFS_SB(sb)->s_qf_names[type], + REISERFS_SB(sb)->s_jquota_fmt, type); } /* * Standard function to be called on quota_on */ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, - char *name, int remount) + char *name) { int err; struct path path; @@ -2039,9 +2055,7 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, if (!(REISERFS_SB(sb)->s_mount_opt & (1 << REISERFS_QUOTA))) return -EINVAL; - /* No more checks needed? Path and format_id are bogus anyway... */ - if (remount) - return vfs_quota_on(sb, type, format_id, name, 1); + err = kern_path(name, LOOKUP_FOLLOW, &path); if (err) return err; @@ -2085,7 +2099,7 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, if (err) goto out; } - err = vfs_quota_on_path(sb, type, format_id, &path); + err = dquot_quota_on_path(sb, type, format_id, &path); out: path_put(&path); return err; diff --git a/fs/smbfs/dir.c b/fs/smbfs/dir.c index 6c978428892d..00a70cab1f36 100644 --- a/fs/smbfs/dir.c +++ b/fs/smbfs/dir.c @@ -37,6 +37,7 @@ static int smb_link(struct dentry *, struct inode *, struct dentry *); const struct file_operations smb_dir_operations = { + .llseek = generic_file_llseek, .read = generic_read_dir, .readdir = smb_readdir, .unlocked_ioctl = smb_ioctl, diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c index 84ecf0e43f91..8e187a0f94bb 100644 --- a/fs/smbfs/file.c +++ b/fs/smbfs/file.c @@ -28,8 +28,9 @@ #include "proto.h" static int -smb_fsync(struct file *file, struct dentry * dentry, int datasync) +smb_fsync(struct file *file, int datasync) { + struct dentry *dentry = file->f_path.dentry; struct smb_sb_info *server = server_from_dentry(dentry); int result; diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c index dfa1d67f8fca..9551cb6f7fe4 100644 --- a/fs/smbfs/inode.c +++ b/fs/smbfs/inode.c @@ -714,7 +714,7 @@ smb_notify_change(struct dentry *dentry, struct iattr *attr) error = server->ops->truncate(inode, attr->ia_size); if (error) goto out; - error = vmtruncate(inode, attr->ia_size); + error = simple_setsize(inode, attr->ia_size); if (error) goto out; refresh = 1; diff --git a/fs/splice.c b/fs/splice.c index ac22b00d86c3..efdbfece9932 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -354,7 +354,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, break; error = add_to_page_cache_lru(page, mapping, index, - mapping_gfp_mask(mapping)); + GFP_KERNEL); if (unlikely(error)) { page_cache_release(page); if (error == -EEXIST) @@ -1282,7 +1282,8 @@ static int direct_splice_actor(struct pipe_inode_info *pipe, { struct file *file = sd->u.file; - return do_splice_from(pipe, file, &sd->pos, sd->total_len, sd->flags); + return do_splice_from(pipe, file, &file->f_pos, sd->total_len, + sd->flags); } /** @@ -1371,8 +1372,7 @@ static long do_splice(struct file *in, loff_t __user *off_in, if (off_in) return -ESPIPE; if (off_out) { - if (!out->f_op || !out->f_op->llseek || - out->f_op->llseek == no_llseek) + if (!(out->f_mode & FMODE_PWRITE)) return -EINVAL; if (copy_from_user(&offset, off_out, sizeof(loff_t))) return -EFAULT; @@ -1392,8 +1392,7 @@ static long do_splice(struct file *in, loff_t __user *off_in, if (off_out) return -ESPIPE; if (off_in) { - if (!in->f_op || !in->f_op->llseek || - in->f_op->llseek == no_llseek) + if (!(in->f_mode & FMODE_PREAD)) return -EINVAL; if (copy_from_user(&offset, off_in, sizeof(loff_t))) return -EFAULT; diff --git 
a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig index 25a00d19d686..cc6ce8a84c21 100644 --- a/fs/squashfs/Kconfig +++ b/fs/squashfs/Kconfig @@ -26,6 +26,17 @@ config SQUASHFS If unsure, say N. +config SQUASHFS_XATTRS + bool "Squashfs XATTR support" + depends on SQUASHFS + default n + help + Saying Y here includes support for extended attributes (xattrs). + Xattrs are name:value pairs associated with inodes by + the kernel or by users (see the attr(5) manual page). + + If unsure, say N. + config SQUASHFS_EMBEDDED bool "Additional option for memory-constrained systems" diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile index df8a19ef870d..2cee3e9fa452 100644 --- a/fs/squashfs/Makefile +++ b/fs/squashfs/Makefile @@ -5,3 +5,5 @@ obj-$(CONFIG_SQUASHFS) += squashfs.o squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o squashfs-y += namei.o super.o symlink.o zlib_wrapper.o decompressor.o +squashfs-$(CONFIG_SQUASHFS_XATTRS) += xattr.o xattr_id.o + diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c index 49daaf669e41..62e63ad25075 100644 --- a/fs/squashfs/inode.c +++ b/fs/squashfs/inode.c @@ -40,11 +40,13 @@ #include <linux/fs.h> #include <linux/vfs.h> +#include <linux/xattr.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" #include "squashfs_fs_i.h" #include "squashfs.h" +#include "xattr.h" /* * Initialise VFS inode with the base inode information common to all @@ -111,6 +113,7 @@ int squashfs_read_inode(struct inode *inode, long long ino) int err, type, offset = SQUASHFS_INODE_OFFSET(ino); union squashfs_inode squashfs_ino; struct squashfs_base_inode *sqshb_ino = &squashfs_ino.base; + int xattr_id = SQUASHFS_INVALID_XATTR; TRACE("Entered squashfs_read_inode\n"); @@ -199,8 +202,10 @@ int squashfs_read_inode(struct inode *inode, long long ino) frag_offset = 0; } + xattr_id = le32_to_cpu(sqsh_ino->xattr); inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); inode->i_size = le64_to_cpu(sqsh_ino->file_size); + inode->i_op = &squashfs_inode_ops; inode->i_fop = &generic_ro_fops; inode->i_mode |= S_IFREG; inode->i_blocks = ((inode->i_size - @@ -251,6 +256,7 @@ int squashfs_read_inode(struct inode *inode, long long ino) if (err < 0) goto failed_read; + xattr_id = le32_to_cpu(sqsh_ino->xattr); inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); inode->i_size = le32_to_cpu(sqsh_ino->file_size); inode->i_op = &squashfs_dir_inode_ops; @@ -280,21 +286,33 @@ int squashfs_read_inode(struct inode *inode, long long ino) inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); inode->i_size = le32_to_cpu(sqsh_ino->symlink_size); - inode->i_op = &page_symlink_inode_operations; + inode->i_op = &squashfs_symlink_inode_ops; inode->i_data.a_ops = &squashfs_symlink_aops; inode->i_mode |= S_IFLNK; squashfs_i(inode)->start = block; squashfs_i(inode)->offset = offset; + if (type == SQUASHFS_LSYMLINK_TYPE) { + __le32 xattr; + + err = squashfs_read_metadata(sb, NULL, &block, + &offset, inode->i_size); + if (err < 0) + goto failed_read; + err = squashfs_read_metadata(sb, &xattr, &block, + &offset, sizeof(xattr)); + if (err < 0) + goto failed_read; + xattr_id = le32_to_cpu(xattr); + } + TRACE("Symbolic link inode %x:%x, start_block %llx, offset " "%x\n", SQUASHFS_INODE_BLK(ino), offset, block, offset); break; } case SQUASHFS_BLKDEV_TYPE: - case SQUASHFS_CHRDEV_TYPE: - case SQUASHFS_LBLKDEV_TYPE: - case SQUASHFS_LCHRDEV_TYPE: { + case SQUASHFS_CHRDEV_TYPE: { struct squashfs_dev_inode *sqsh_ino = &squashfs_ino.dev; unsigned int rdev; @@ -315,10 +333,32 @@ int squashfs_read_inode(struct inode *inode, long long ino) 
SQUASHFS_INODE_BLK(ino), offset, rdev); break; } + case SQUASHFS_LBLKDEV_TYPE: + case SQUASHFS_LCHRDEV_TYPE: { + struct squashfs_ldev_inode *sqsh_ino = &squashfs_ino.ldev; + unsigned int rdev; + + err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, + sizeof(*sqsh_ino)); + if (err < 0) + goto failed_read; + + if (type == SQUASHFS_LCHRDEV_TYPE) + inode->i_mode |= S_IFCHR; + else + inode->i_mode |= S_IFBLK; + xattr_id = le32_to_cpu(sqsh_ino->xattr); + inode->i_op = &squashfs_inode_ops; + inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); + rdev = le32_to_cpu(sqsh_ino->rdev); + init_special_inode(inode, inode->i_mode, new_decode_dev(rdev)); + + TRACE("Device inode %x:%x, rdev %x\n", + SQUASHFS_INODE_BLK(ino), offset, rdev); + break; + } case SQUASHFS_FIFO_TYPE: - case SQUASHFS_SOCKET_TYPE: - case SQUASHFS_LFIFO_TYPE: - case SQUASHFS_LSOCKET_TYPE: { + case SQUASHFS_SOCKET_TYPE: { struct squashfs_ipc_inode *sqsh_ino = &squashfs_ino.ipc; err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, @@ -334,14 +374,52 @@ int squashfs_read_inode(struct inode *inode, long long ino) init_special_inode(inode, inode->i_mode, 0); break; } + case SQUASHFS_LFIFO_TYPE: + case SQUASHFS_LSOCKET_TYPE: { + struct squashfs_lipc_inode *sqsh_ino = &squashfs_ino.lipc; + + err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, + sizeof(*sqsh_ino)); + if (err < 0) + goto failed_read; + + if (type == SQUASHFS_LFIFO_TYPE) + inode->i_mode |= S_IFIFO; + else + inode->i_mode |= S_IFSOCK; + xattr_id = le32_to_cpu(sqsh_ino->xattr); + inode->i_op = &squashfs_inode_ops; + inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); + init_special_inode(inode, inode->i_mode, 0); + break; + } default: ERROR("Unknown inode type %d in squashfs_iget!\n", type); return -EINVAL; } + if (xattr_id != SQUASHFS_INVALID_XATTR && msblk->xattr_id_table) { + err = squashfs_xattr_lookup(sb, xattr_id, + &squashfs_i(inode)->xattr_count, + &squashfs_i(inode)->xattr_size, + &squashfs_i(inode)->xattr); + if (err < 0) + goto failed_read; + inode->i_blocks += ((squashfs_i(inode)->xattr_size - 1) >> 9) + + 1; + } else + squashfs_i(inode)->xattr_count = 0; + return 0; failed_read: ERROR("Unable to read inode 0x%llx\n", ino); return err; } + + +const struct inode_operations squashfs_inode_ops = { + .getxattr = generic_getxattr, + .listxattr = squashfs_listxattr +}; + diff --git a/fs/squashfs/namei.c b/fs/squashfs/namei.c index 5266bd8ad932..7a9464d08cf6 100644 --- a/fs/squashfs/namei.c +++ b/fs/squashfs/namei.c @@ -57,11 +57,13 @@ #include <linux/slab.h> #include <linux/string.h> #include <linux/dcache.h> +#include <linux/xattr.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" #include "squashfs_fs_i.h" #include "squashfs.h" +#include "xattr.h" /* * Lookup name in the directory index, returning the location of the metadata @@ -237,5 +239,7 @@ failed: const struct inode_operations squashfs_dir_inode_ops = { - .lookup = squashfs_lookup + .lookup = squashfs_lookup, + .getxattr = generic_getxattr, + .listxattr = squashfs_listxattr }; diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h index fe2587af5512..733a17c42945 100644 --- a/fs/squashfs/squashfs.h +++ b/fs/squashfs/squashfs.h @@ -73,8 +73,11 @@ extern struct inode *squashfs_iget(struct super_block *, long long, unsigned int); extern int squashfs_read_inode(struct inode *, long long); +/* xattr.c */ +extern ssize_t squashfs_listxattr(struct dentry *, char *, size_t); + /* - * Inodes, files and decompressor operations + * Inodes, files, decompressor and xattr operations */ /* dir.c */ @@ 
-86,11 +89,18 @@ extern const struct export_operations squashfs_export_ops; /* file.c */ extern const struct address_space_operations squashfs_aops; +/* inode.c */ +extern const struct inode_operations squashfs_inode_ops; + /* namei.c */ extern const struct inode_operations squashfs_dir_inode_ops; /* symlink.c */ extern const struct address_space_operations squashfs_symlink_aops; +extern const struct inode_operations squashfs_symlink_inode_ops; + +/* xattr.c */ +extern const struct xattr_handler *squashfs_xattr_handlers[]; /* zlib_wrapper.c */ extern const struct squashfs_decompressor squashfs_zlib_comp_ops; diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h index 79024245ea00..8eabb808b78d 100644 --- a/fs/squashfs/squashfs_fs.h +++ b/fs/squashfs/squashfs_fs.h @@ -46,6 +46,7 @@ #define SQUASHFS_NAME_LEN 256 #define SQUASHFS_INVALID_FRAG (0xffffffffU) +#define SQUASHFS_INVALID_XATTR (0xffffffffU) #define SQUASHFS_INVALID_BLK (-1LL) /* Filesystem flags */ @@ -96,6 +97,13 @@ #define SQUASHFS_LFIFO_TYPE 13 #define SQUASHFS_LSOCKET_TYPE 14 +/* Xattr types */ +#define SQUASHFS_XATTR_USER 0 +#define SQUASHFS_XATTR_TRUSTED 1 +#define SQUASHFS_XATTR_SECURITY 2 +#define SQUASHFS_XATTR_VALUE_OOL 256 +#define SQUASHFS_XATTR_PREFIX_MASK 0xff + /* Flag whether block is compressed or uncompressed, bit is set if block is * uncompressed */ #define SQUASHFS_COMPRESSED_BIT (1 << 15) @@ -174,6 +182,24 @@ #define SQUASHFS_ID_BLOCK_BYTES(A) (SQUASHFS_ID_BLOCKS(A) *\ sizeof(u64)) +/* xattr id lookup table defines */ +#define SQUASHFS_XATTR_BYTES(A) ((A) * sizeof(struct squashfs_xattr_id)) + +#define SQUASHFS_XATTR_BLOCK(A) (SQUASHFS_XATTR_BYTES(A) / \ + SQUASHFS_METADATA_SIZE) + +#define SQUASHFS_XATTR_BLOCK_OFFSET(A) (SQUASHFS_XATTR_BYTES(A) % \ + SQUASHFS_METADATA_SIZE) + +#define SQUASHFS_XATTR_BLOCKS(A) ((SQUASHFS_XATTR_BYTES(A) + \ + SQUASHFS_METADATA_SIZE - 1) / \ + SQUASHFS_METADATA_SIZE) + +#define SQUASHFS_XATTR_BLOCK_BYTES(A) (SQUASHFS_XATTR_BLOCKS(A) *\ + sizeof(u64)) +#define SQUASHFS_XATTR_BLK(A) ((unsigned int) ((A) >> 16)) + +#define SQUASHFS_XATTR_OFFSET(A) ((unsigned int) ((A) & 0xffff)) /* cached data constants for filesystem */ #define SQUASHFS_CACHED_BLKS 8 @@ -228,7 +254,7 @@ struct squashfs_super_block { __le64 root_inode; __le64 bytes_used; __le64 id_table_start; - __le64 xattr_table_start; + __le64 xattr_id_table_start; __le64 inode_table_start; __le64 directory_table_start; __le64 fragment_table_start; @@ -261,6 +287,17 @@ struct squashfs_ipc_inode { __le32 nlink; }; +struct squashfs_lipc_inode { + __le16 inode_type; + __le16 mode; + __le16 uid; + __le16 guid; + __le32 mtime; + __le32 inode_number; + __le32 nlink; + __le32 xattr; +}; + struct squashfs_dev_inode { __le16 inode_type; __le16 mode; @@ -272,6 +309,18 @@ struct squashfs_dev_inode { __le32 rdev; }; +struct squashfs_ldev_inode { + __le16 inode_type; + __le16 mode; + __le16 uid; + __le16 guid; + __le32 mtime; + __le32 inode_number; + __le32 nlink; + __le32 rdev; + __le32 xattr; +}; + struct squashfs_symlink_inode { __le16 inode_type; __le16 mode; @@ -349,12 +398,14 @@ struct squashfs_ldir_inode { union squashfs_inode { struct squashfs_base_inode base; struct squashfs_dev_inode dev; + struct squashfs_ldev_inode ldev; struct squashfs_symlink_inode symlink; struct squashfs_reg_inode reg; struct squashfs_lreg_inode lreg; struct squashfs_dir_inode dir; struct squashfs_ldir_inode ldir; struct squashfs_ipc_inode ipc; + struct squashfs_lipc_inode lipc; }; struct squashfs_dir_entry { @@ -377,4 +428,27 @@ struct 
squashfs_fragment_entry { unsigned int unused; }; +struct squashfs_xattr_entry { + __le16 type; + __le16 size; + char data[0]; +}; + +struct squashfs_xattr_val { + __le32 vsize; + char value[0]; +}; + +struct squashfs_xattr_id { + __le64 xattr; + __le32 count; + __le32 size; +}; + +struct squashfs_xattr_id_table { + __le64 xattr_table_start; + __le32 xattr_ids; + __le32 unused; +}; + #endif diff --git a/fs/squashfs/squashfs_fs_i.h b/fs/squashfs/squashfs_fs_i.h index fbfca30c0c68..d3e3a37f28a1 100644 --- a/fs/squashfs/squashfs_fs_i.h +++ b/fs/squashfs/squashfs_fs_i.h @@ -26,6 +26,9 @@ struct squashfs_inode_info { u64 start; int offset; + u64 xattr; + unsigned int xattr_size; + int xattr_count; union { struct { u64 fragment_block; diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h index 2e77dc547e25..d9037a5215f0 100644 --- a/fs/squashfs/squashfs_fs_sb.h +++ b/fs/squashfs/squashfs_fs_sb.h @@ -61,6 +61,7 @@ struct squashfs_sb_info { int next_meta_index; __le64 *id_table; __le64 *fragment_index; + __le64 *xattr_id_table; struct mutex read_data_mutex; struct mutex meta_index_mutex; struct meta_index *meta_index; @@ -68,9 +69,11 @@ struct squashfs_sb_info { __le64 *inode_lookup_table; u64 inode_table; u64 directory_table; + u64 xattr_table; unsigned int block_size; unsigned short block_log; long long bytes_used; unsigned int inodes; + int xattr_ids; }; #endif diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 48b6f4a385a6..88b4f8606652 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -36,12 +36,14 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/magic.h> +#include <linux/xattr.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" #include "squashfs_fs_i.h" #include "squashfs.h" #include "decompressor.h" +#include "xattr.h" static struct file_system_type squashfs_fs_type; static const struct super_operations squashfs_super_ops; @@ -82,7 +84,7 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent) long long root_inode; unsigned short flags; unsigned int fragments; - u64 lookup_table_start; + u64 lookup_table_start, xattr_id_table_start; int err; TRACE("Entered squashfs_fill_superblock\n"); @@ -139,13 +141,6 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent) if (msblk->decompressor == NULL) goto failed_mount; - /* - * Check if there's xattrs in the filesystem. These are not - * supported in this version, so warn that they will be ignored. 
- */ - if (le64_to_cpu(sblk->xattr_table_start) != SQUASHFS_INVALID_BLK) - ERROR("Xattrs in filesystem, these will be ignored\n"); - /* Check the filesystem does not extend beyond the end of the block device */ msblk->bytes_used = le64_to_cpu(sblk->bytes_used); @@ -253,7 +248,7 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent) allocate_lookup_table: lookup_table_start = le64_to_cpu(sblk->lookup_table_start); if (lookup_table_start == SQUASHFS_INVALID_BLK) - goto allocate_root; + goto allocate_xattr_table; /* Allocate and read inode lookup table */ msblk->inode_lookup_table = squashfs_read_inode_lookup_table(sb, @@ -266,6 +261,21 @@ allocate_lookup_table: sb->s_export_op = &squashfs_export_ops; +allocate_xattr_table: + sb->s_xattr = squashfs_xattr_handlers; + xattr_id_table_start = le64_to_cpu(sblk->xattr_id_table_start); + if (xattr_id_table_start == SQUASHFS_INVALID_BLK) + goto allocate_root; + + /* Allocate and read xattr id lookup table */ + msblk->xattr_id_table = squashfs_read_xattr_id_table(sb, + xattr_id_table_start, &msblk->xattr_table, &msblk->xattr_ids); + if (IS_ERR(msblk->xattr_id_table)) { + err = PTR_ERR(msblk->xattr_id_table); + msblk->xattr_id_table = NULL; + if (err != -ENOTSUPP) + goto failed_mount; + } allocate_root: root = new_inode(sb); if (!root) { @@ -301,6 +311,7 @@ failed_mount: kfree(msblk->inode_lookup_table); kfree(msblk->fragment_index); kfree(msblk->id_table); + kfree(msblk->xattr_id_table); kfree(sb->s_fs_info); sb->s_fs_info = NULL; kfree(sblk); @@ -355,6 +366,7 @@ static void squashfs_put_super(struct super_block *sb) kfree(sbi->fragment_index); kfree(sbi->meta_index); kfree(sbi->inode_lookup_table); + kfree(sbi->xattr_id_table); kfree(sb->s_fs_info); sb->s_fs_info = NULL; } diff --git a/fs/squashfs/symlink.c b/fs/squashfs/symlink.c index 32b911f4ee39..ec86434921e1 100644 --- a/fs/squashfs/symlink.c +++ b/fs/squashfs/symlink.c @@ -35,11 +35,13 @@ #include <linux/kernel.h> #include <linux/string.h> #include <linux/pagemap.h> +#include <linux/xattr.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" #include "squashfs_fs_i.h" #include "squashfs.h" +#include "xattr.h" static int squashfs_symlink_readpage(struct file *file, struct page *page) { @@ -114,3 +116,12 @@ error_out: const struct address_space_operations squashfs_symlink_aops = { .readpage = squashfs_symlink_readpage }; + +const struct inode_operations squashfs_symlink_inode_ops = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, + .getxattr = generic_getxattr, + .listxattr = squashfs_listxattr +}; + diff --git a/fs/squashfs/xattr.c b/fs/squashfs/xattr.c new file mode 100644 index 000000000000..c7655e8b31cd --- /dev/null +++ b/fs/squashfs/xattr.c @@ -0,0 +1,323 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2010 + * Phillip Lougher <phillip@lougher.demon.co.uk> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * xattr_id.c + */ + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/string.h> +#include <linux/fs.h> +#include <linux/vfs.h> +#include <linux/xattr.h> +#include <linux/slab.h> + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs_fs_i.h" +#include "squashfs.h" + +static const struct xattr_handler *squashfs_xattr_handler(int); + +ssize_t squashfs_listxattr(struct dentry *d, char *buffer, + size_t buffer_size) +{ + struct inode *inode = d->d_inode; + struct super_block *sb = inode->i_sb; + struct squashfs_sb_info *msblk = sb->s_fs_info; + u64 start = SQUASHFS_XATTR_BLK(squashfs_i(inode)->xattr) + + msblk->xattr_table; + int offset = SQUASHFS_XATTR_OFFSET(squashfs_i(inode)->xattr); + int count = squashfs_i(inode)->xattr_count; + size_t rest = buffer_size; + int err; + + /* check that the file system has xattrs */ + if (msblk->xattr_id_table == NULL) + return -EOPNOTSUPP; + + /* loop reading each xattr name */ + while (count--) { + struct squashfs_xattr_entry entry; + struct squashfs_xattr_val val; + const struct xattr_handler *handler; + int name_size, prefix_size = 0; + + err = squashfs_read_metadata(sb, &entry, &start, &offset, + sizeof(entry)); + if (err < 0) + goto failed; + + name_size = le16_to_cpu(entry.size); + handler = squashfs_xattr_handler(le16_to_cpu(entry.type)); + if (handler) + prefix_size = handler->list(d, buffer, rest, NULL, + name_size, handler->flags); + if (prefix_size) { + if (buffer) { + if (prefix_size + name_size + 1 > rest) { + err = -ERANGE; + goto failed; + } + buffer += prefix_size; + } + err = squashfs_read_metadata(sb, buffer, &start, + &offset, name_size); + if (err < 0) + goto failed; + if (buffer) { + buffer[name_size] = '\0'; + buffer += name_size + 1; + } + rest -= prefix_size + name_size + 1; + } else { + /* no handler or insuffficient privileges, so skip */ + err = squashfs_read_metadata(sb, NULL, &start, + &offset, name_size); + if (err < 0) + goto failed; + } + + + /* skip remaining xattr entry */ + err = squashfs_read_metadata(sb, &val, &start, &offset, + sizeof(val)); + if (err < 0) + goto failed; + + err = squashfs_read_metadata(sb, NULL, &start, &offset, + le32_to_cpu(val.vsize)); + if (err < 0) + goto failed; + } + err = buffer_size - rest; + +failed: + return err; +} + + +static int squashfs_xattr_get(struct inode *inode, int name_index, + const char *name, void *buffer, size_t buffer_size) +{ + struct super_block *sb = inode->i_sb; + struct squashfs_sb_info *msblk = sb->s_fs_info; + u64 start = SQUASHFS_XATTR_BLK(squashfs_i(inode)->xattr) + + msblk->xattr_table; + int offset = SQUASHFS_XATTR_OFFSET(squashfs_i(inode)->xattr); + int count = squashfs_i(inode)->xattr_count; + int name_len = strlen(name); + int err, vsize; + char *target = kmalloc(name_len, GFP_KERNEL); + + if (target == NULL) + return -ENOMEM; + + /* loop reading each xattr name */ + for (; count; count--) { + struct squashfs_xattr_entry entry; + struct squashfs_xattr_val val; + int type, prefix, name_size; + + err = squashfs_read_metadata(sb, &entry, &start, &offset, + sizeof(entry)); + if (err < 0) + goto failed; + + name_size = le16_to_cpu(entry.size); + type = le16_to_cpu(entry.type); + prefix = type & SQUASHFS_XATTR_PREFIX_MASK; + + if (prefix == name_index && name_size == name_len) + err = squashfs_read_metadata(sb, 
target, &start, + &offset, name_size); + else + err = squashfs_read_metadata(sb, NULL, &start, + &offset, name_size); + if (err < 0) + goto failed; + + if (prefix == name_index && name_size == name_len && + strncmp(target, name, name_size) == 0) { + /* found xattr */ + if (type & SQUASHFS_XATTR_VALUE_OOL) { + __le64 xattr; + /* val is a reference to the real location */ + err = squashfs_read_metadata(sb, &val, &start, + &offset, sizeof(val)); + if (err < 0) + goto failed; + err = squashfs_read_metadata(sb, &xattr, &start, + &offset, sizeof(xattr)); + if (err < 0) + goto failed; + xattr = le64_to_cpu(xattr); + start = SQUASHFS_XATTR_BLK(xattr) + + msblk->xattr_table; + offset = SQUASHFS_XATTR_OFFSET(xattr); + } + /* read xattr value */ + err = squashfs_read_metadata(sb, &val, &start, &offset, + sizeof(val)); + if (err < 0) + goto failed; + + vsize = le32_to_cpu(val.vsize); + if (buffer) { + if (vsize > buffer_size) { + err = -ERANGE; + goto failed; + } + err = squashfs_read_metadata(sb, buffer, &start, + &offset, vsize); + if (err < 0) + goto failed; + } + break; + } + + /* no match, skip remaining xattr entry */ + err = squashfs_read_metadata(sb, &val, &start, &offset, + sizeof(val)); + if (err < 0) + goto failed; + err = squashfs_read_metadata(sb, NULL, &start, &offset, + le32_to_cpu(val.vsize)); + if (err < 0) + goto failed; + } + err = count ? vsize : -ENODATA; + +failed: + kfree(target); + return err; +} + + +/* + * User namespace support + */ +static size_t squashfs_user_list(struct dentry *d, char *list, size_t list_size, + const char *name, size_t name_len, int type) +{ + if (list && XATTR_USER_PREFIX_LEN <= list_size) + memcpy(list, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); + return XATTR_USER_PREFIX_LEN; +} + +static int squashfs_user_get(struct dentry *d, const char *name, void *buffer, + size_t size, int type) +{ + if (name[0] == '\0') + return -EINVAL; + + return squashfs_xattr_get(d->d_inode, SQUASHFS_XATTR_USER, name, + buffer, size); +} + +static const struct xattr_handler squashfs_xattr_user_handler = { + .prefix = XATTR_USER_PREFIX, + .list = squashfs_user_list, + .get = squashfs_user_get +}; + +/* + * Trusted namespace support + */ +static size_t squashfs_trusted_list(struct dentry *d, char *list, + size_t list_size, const char *name, size_t name_len, int type) +{ + if (!capable(CAP_SYS_ADMIN)) + return 0; + + if (list && XATTR_TRUSTED_PREFIX_LEN <= list_size) + memcpy(list, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN); + return XATTR_TRUSTED_PREFIX_LEN; +} + +static int squashfs_trusted_get(struct dentry *d, const char *name, + void *buffer, size_t size, int type) +{ + if (name[0] == '\0') + return -EINVAL; + + return squashfs_xattr_get(d->d_inode, SQUASHFS_XATTR_TRUSTED, name, + buffer, size); +} + +static const struct xattr_handler squashfs_xattr_trusted_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .list = squashfs_trusted_list, + .get = squashfs_trusted_get +}; + +/* + * Security namespace support + */ +static size_t squashfs_security_list(struct dentry *d, char *list, + size_t list_size, const char *name, size_t name_len, int type) +{ + if (list && XATTR_SECURITY_PREFIX_LEN <= list_size) + memcpy(list, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN); + return XATTR_SECURITY_PREFIX_LEN; +} + +static int squashfs_security_get(struct dentry *d, const char *name, + void *buffer, size_t size, int type) +{ + if (name[0] == '\0') + return -EINVAL; + + return squashfs_xattr_get(d->d_inode, SQUASHFS_XATTR_SECURITY, name, + buffer, size); +} + +static const struct 
xattr_handler squashfs_xattr_security_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .list = squashfs_security_list, + .get = squashfs_security_get +}; + +static inline const struct xattr_handler *squashfs_xattr_handler(int type) +{ + if (type & ~(SQUASHFS_XATTR_PREFIX_MASK | SQUASHFS_XATTR_VALUE_OOL)) + /* ignore unrecognised type */ + return NULL; + + switch (type & SQUASHFS_XATTR_PREFIX_MASK) { + case SQUASHFS_XATTR_USER: + return &squashfs_xattr_user_handler; + case SQUASHFS_XATTR_TRUSTED: + return &squashfs_xattr_trusted_handler; + case SQUASHFS_XATTR_SECURITY: + return &squashfs_xattr_security_handler; + default: + /* ignore unrecognised type */ + return NULL; + } +} + +const struct xattr_handler *squashfs_xattr_handlers[] = { + &squashfs_xattr_user_handler, + &squashfs_xattr_trusted_handler, + &squashfs_xattr_security_handler, + NULL +}; + diff --git a/fs/squashfs/xattr.h b/fs/squashfs/xattr.h new file mode 100644 index 000000000000..9da071ae181c --- /dev/null +++ b/fs/squashfs/xattr.h @@ -0,0 +1,46 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2010 + * Phillip Lougher <phillip@lougher.demon.co.uk> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * xattr.h + */ + +#ifdef CONFIG_SQUASHFS_XATTRS +extern __le64 *squashfs_read_xattr_id_table(struct super_block *, u64, + u64 *, int *); +extern int squashfs_xattr_lookup(struct super_block *, unsigned int, int *, + int *, unsigned long long *); +#else +static inline __le64 *squashfs_read_xattr_id_table(struct super_block *sb, + u64 start, u64 *xattr_table_start, int *xattr_ids) +{ + ERROR("Xattrs in filesystem, these will be ignored\n"); + return ERR_PTR(-ENOTSUPP); +} + +static inline int squashfs_xattr_lookup(struct super_block *sb, + unsigned int index, int *count, int *size, + unsigned long long *xattr) +{ + return 0; +} +#define squashfs_listxattr NULL +#define generic_getxattr NULL +#define squashfs_xattr_handlers NULL +#endif diff --git a/fs/squashfs/xattr_id.c b/fs/squashfs/xattr_id.c new file mode 100644 index 000000000000..cfb41106098f --- /dev/null +++ b/fs/squashfs/xattr_id.c @@ -0,0 +1,100 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2010 + * Phillip Lougher <phillip@lougher.demon.co.uk> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * xattr_id.c + */ + +/* + * This file implements code to map the 32-bit xattr id stored in the inode + * into the on disk location of the xattr data. + */ + +#include <linux/fs.h> +#include <linux/vfs.h> +#include <linux/slab.h> + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs_fs_i.h" +#include "squashfs.h" + +/* + * Map xattr id using the xattr id look up table + */ +int squashfs_xattr_lookup(struct super_block *sb, unsigned int index, + int *count, unsigned int *size, unsigned long long *xattr) +{ + struct squashfs_sb_info *msblk = sb->s_fs_info; + int block = SQUASHFS_XATTR_BLOCK(index); + int offset = SQUASHFS_XATTR_BLOCK_OFFSET(index); + u64 start_block = le64_to_cpu(msblk->xattr_id_table[block]); + struct squashfs_xattr_id id; + int err; + + err = squashfs_read_metadata(sb, &id, &start_block, &offset, + sizeof(id)); + if (err < 0) + return err; + + *xattr = le64_to_cpu(id.xattr); + *size = le32_to_cpu(id.size); + *count = le32_to_cpu(id.count); + return 0; +} + + +/* + * Read uncompressed xattr id lookup table indexes from disk into memory + */ +__le64 *squashfs_read_xattr_id_table(struct super_block *sb, u64 start, + u64 *xattr_table_start, int *xattr_ids) +{ + unsigned int len; + __le64 *xid_table; + struct squashfs_xattr_id_table id_table; + int err; + + err = squashfs_read_table(sb, &id_table, start, sizeof(id_table)); + if (err < 0) { + ERROR("unable to read xattr id table\n"); + return ERR_PTR(err); + } + *xattr_table_start = le64_to_cpu(id_table.xattr_table_start); + *xattr_ids = le32_to_cpu(id_table.xattr_ids); + len = SQUASHFS_XATTR_BLOCK_BYTES(*xattr_ids); + + TRACE("In read_xattr_index_table, length %d\n", len); + + /* Allocate xattr id lookup table indexes */ + xid_table = kmalloc(len, GFP_KERNEL); + if (xid_table == NULL) { + ERROR("Failed to allocate xattr id index table\n"); + return ERR_PTR(-ENOMEM); + } + + err = squashfs_read_table(sb, xid_table, start + sizeof(id_table), len); + if (err < 0) { + ERROR("unable to read xattr id index table\n"); + kfree(xid_table); + return ERR_PTR(err); + } + + return xid_table; +} diff --git a/fs/super.c b/fs/super.c index 69688b15f1fa..938119ab8dcb 100644 --- a/fs/super.c +++ b/fs/super.c @@ -24,7 +24,6 @@ #include <linux/slab.h> #include <linux/acct.h> #include <linux/blkdev.h> -#include <linux/quotaops.h> #include <linux/mount.h> #include <linux/security.h> #include <linux/writeback.h> /* for the emergency remount stuff */ @@ -94,8 +93,6 @@ static struct super_block *alloc_super(struct file_system_type *type) init_rwsem(&s->s_dquot.dqptr_sem); init_waitqueue_head(&s->s_wait_unfrozen); s->s_maxbytes = MAX_NON_LFS; - s->dq_op = sb_dquot_ops; - s->s_qcop = sb_quotactl_ops; s->s_op = &default_op; s->s_time_gran = 1000000000; } @@ -160,7 +157,6 @@ void deactivate_locked_super(struct super_block *s) { struct file_system_type *fs = s->s_type; if (atomic_dec_and_test(&s->s_active)) { - vfs_dq_off(s, 0); fs->kill_sb(s); put_filesystem(fs); put_super(s); @@ -378,6 +374,8 @@ void sync_supers(void) up_read(&sb->s_umount); spin_lock(&sb_lock); + /* lock was dropped, must reset next */ + list_safe_reset_next(sb, n, s_list); __put_super(sb); } } @@ -409,6 +407,8 @@ void iterate_supers(void (*f)(struct super_block *, void *), void *arg) up_read(&sb->s_umount); spin_lock(&sb_lock); + /* lock was 
dropped, must reset next */ + list_safe_reset_next(sb, n, s_list); __put_super(sb); } spin_unlock(&sb_lock); @@ -524,7 +524,7 @@ rescan: int do_remount_sb(struct super_block *sb, int flags, void *data, int force) { int retval; - int remount_rw, remount_ro; + int remount_ro; if (sb->s_frozen != SB_UNFROZEN) return -EBUSY; @@ -540,7 +540,6 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) sync_filesystem(sb); remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY); - remount_rw = !(flags & MS_RDONLY) && (sb->s_flags & MS_RDONLY); /* If we are remounting RDONLY and current sb is read/write, make sure there are no rw files opened */ @@ -549,9 +548,6 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) mark_files_ro(sb); else if (!fs_may_remount_ro(sb)) return -EBUSY; - retval = vfs_dq_off(sb, 1); - if (retval < 0 && retval != -ENOSYS) - return -EBUSY; } if (sb->s_op->remount_fs) { @@ -560,8 +556,7 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) return retval; } sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK); - if (remount_rw) - vfs_dq_quota_on_remount(sb); + /* * Some filesystems modify their metadata via some other path than the * bdev buffer cache (eg. use a private mapping, or directories in @@ -594,6 +589,8 @@ static void do_emergency_remount(struct work_struct *work) } up_write(&sb->s_umount); spin_lock(&sb_lock); + /* lock was dropped, must reset next */ + list_safe_reset_next(sb, n, s_list); __put_super(sb); } spin_unlock(&sb_lock); @@ -946,8 +943,8 @@ out: EXPORT_SYMBOL_GPL(vfs_kern_mount); /** - * freeze_super -- lock the filesystem and force it into a consistent state - * @super: the super to lock + * freeze_super - lock the filesystem and force it into a consistent state + * @sb: the super to lock * * Syncs the super to make sure the filesystem is consistent and calls the fs's * freeze_fs. Subsequent calls to this without first thawing the fs will return diff --git a/fs/sync.c b/fs/sync.c index e8cbd415e50a..15aa6f03b2da 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -42,7 +42,7 @@ static int __sync_filesystem(struct super_block *sb, int wait) if (wait) sync_inodes_sb(sb); else - writeback_inodes_sb_locked(sb); + writeback_inodes_sb(sb); if (sb->s_op->sync_fs) sb->s_op->sync_fs(sb, wait); @@ -130,12 +130,10 @@ void emergency_sync(void) /* * Generic function to fsync a file. - * - * filp may be NULL if called via the msync of a vma. */ -int file_fsync(struct file *filp, struct dentry *dentry, int datasync) +int file_fsync(struct file *filp, int datasync) { - struct inode * inode = dentry->d_inode; + struct inode *inode = filp->f_mapping->host; struct super_block * sb; int ret, err; @@ -183,7 +181,7 @@ int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync) * livelocks in fsync_buffers_list(). 
*/ mutex_lock(&mapping->host->i_mutex); - err = file->f_op->fsync(file, file->f_path.dentry, datasync); + err = file->f_op->fsync(file, datasync); if (!ret) ret = err; mutex_unlock(&mapping->host->i_mutex); diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index bbd77e95cf7f..0835a3b70e03 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -117,13 +117,13 @@ int sysfs_setattr(struct dentry *dentry, struct iattr *iattr) if (error) goto out; - iattr->ia_valid &= ~ATTR_SIZE; /* ignore size changes */ - - error = inode_setattr(inode, iattr); + error = sysfs_sd_setattr(sd, iattr); if (error) goto out; - error = sysfs_sd_setattr(sd, iattr); + /* this ignores size changes */ + generic_setattr(inode, iattr); + out: mutex_unlock(&sysfs_mutex); return error; diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index f71246bebfe4..a7ac78f8e67a 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -28,6 +28,7 @@ static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target, struct sysfs_dirent *target_sd = NULL; struct sysfs_dirent *sd = NULL; struct sysfs_addrm_cxt acxt; + enum kobj_ns_type ns_type; int error; BUG_ON(!name); @@ -58,16 +59,29 @@ static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target, if (!sd) goto out_put; - if (sysfs_ns_type(parent_sd)) + ns_type = sysfs_ns_type(parent_sd); + if (ns_type) sd->s_ns = target->ktype->namespace(target); sd->s_symlink.target_sd = target_sd; target_sd = NULL; /* reference is now owned by the symlink */ sysfs_addrm_start(&acxt, parent_sd); - if (warn) - error = sysfs_add_one(&acxt, sd); - else - error = __sysfs_add_one(&acxt, sd); + /* Symlinks must be between directories with the same ns_type */ + if (!ns_type || + (ns_type == sysfs_ns_type(sd->s_symlink.target_sd->s_parent))) { + if (warn) + error = sysfs_add_one(&acxt, sd); + else + error = __sysfs_add_one(&acxt, sd); + } else { + error = -EINVAL; + WARN(1, KERN_WARNING + "sysfs: symlink across ns_types %s/%s -> %s/%s\n", + parent_sd->s_name, + sd->s_name, + sd->s_symlink.target_sd->s_parent->s_name, + sd->s_symlink.target_sd->s_name); + } sysfs_addrm_finish(&acxt); if (error) @@ -122,7 +136,7 @@ void sysfs_delete_link(struct kobject *kobj, struct kobject *targ, { const void *ns = NULL; spin_lock(&sysfs_assoc_lock); - if (targ->sd) + if (targ->sd && sysfs_ns_type(kobj->sd)) ns = targ->sd->s_ns; spin_unlock(&sysfs_assoc_lock); sysfs_hash_and_remove(kobj->sd, ns, name); diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c index 1dabed286b4c..79941e4964a4 100644 --- a/fs/sysv/dir.c +++ b/fs/sysv/dir.c @@ -24,7 +24,7 @@ const struct file_operations sysv_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, .readdir = sysv_readdir, - .fsync = simple_fsync, + .fsync = generic_file_fsync, }; static inline void dir_put_page(struct page *page) diff --git a/fs/sysv/file.c b/fs/sysv/file.c index 96340c01f4a7..750cc22349bd 100644 --- a/fs/sysv/file.c +++ b/fs/sysv/file.c @@ -26,7 +26,7 @@ const struct file_operations sysv_file_operations = { .write = do_sync_write, .aio_write = generic_file_aio_write, .mmap = generic_file_mmap, - .fsync = simple_fsync, + .fsync = generic_file_fsync, .splice_read = generic_file_splice_read, }; diff --git a/fs/sysv/ialloc.c b/fs/sysv/ialloc.c index bbd69bdb0fa8..fcc498ec9b33 100644 --- a/fs/sysv/ialloc.c +++ b/fs/sysv/ialloc.c @@ -25,6 +25,7 @@ #include <linux/stat.h> #include <linux/string.h> #include <linux/buffer_head.h> +#include <linux/writeback.h> #include "sysv.h" /* We don't trust the value of @@ -139,6 +140,9 @@ struct 
inode * sysv_new_inode(const struct inode * dir, mode_t mode) struct inode *inode; sysv_ino_t ino; unsigned count; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_NONE + }; inode = new_inode(sb); if (!inode) @@ -168,7 +172,7 @@ struct inode * sysv_new_inode(const struct inode * dir, mode_t mode) insert_inode_hash(inode); mark_inode_dirty(inode); - sysv_write_inode(inode, 0); /* ensure inode not allocated again */ + sysv_write_inode(inode, &wbc); /* ensure inode not allocated again */ mark_inode_dirty(inode); /* cleared by sysv_write_inode() */ /* That's it. */ unlock_super(sb); diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index 4573734d723d..d4a5380b5669 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -43,6 +43,7 @@ static int sysv_sync_fs(struct super_block *sb, int wait) * then attach current time stamp. * But if the filesystem was marked clean, keep it clean. */ + sb->s_dirt = 0; old_time = fs32_to_cpu(sbi, *sbi->s_sb_time); if (sbi->s_type == FSTYPE_SYSV4) { if (*sbi->s_sb_state == cpu_to_fs32(sbi, 0x7c269d38 - old_time)) diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c index 076ca50e9933..c8ff0d1ae5d3 100644 --- a/fs/ubifs/budget.c +++ b/fs/ubifs/budget.c @@ -62,7 +62,9 @@ */ static void shrink_liability(struct ubifs_info *c, int nr_to_write) { + down_read(&c->vfs_sb->s_umount); writeback_inodes_sb(c->vfs_sb); + up_read(&c->vfs_sb->s_umount); } /** diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 5692cf72b807..12f445cee9f7 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -967,12 +967,15 @@ static int do_writepage(struct page *page, int len) * the page locked, and it locks @ui_mutex. However, write-back does take inode * @i_mutex, which means other VFS operations may be run on this inode at the * same time. And the problematic one is truncation to smaller size, from where - * we have to call 'vmtruncate()', which first changes @inode->i_size, then + * we have to call 'simple_setsize()', which first changes @inode->i_size, then * drops the truncated pages. And while dropping the pages, it takes the page - * lock. This means that 'do_truncation()' cannot call 'vmtruncate()' with + * lock. This means that 'do_truncation()' cannot call 'simple_setsize()' with * @ui_mutex locked, because it would deadlock with 'ubifs_writepage()'. This * means that @inode->i_size is changed while @ui_mutex is unlocked. * + * XXX: with the new truncate the above is not true anymore, the simple_setsize + * calls can be replaced with the individual components. + * * But in 'ubifs_writepage()' we have to guarantee that we do not write beyond * inode size. How do we do this if @inode->i_size may became smaller while we * are in the middle of 'ubifs_writepage()'? 
The UBIFS solution is the @@ -1125,7 +1128,7 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode, budgeted = 0; } - err = vmtruncate(inode, new_size); + err = simple_setsize(inode, new_size); if (err) goto out_budg; @@ -1214,7 +1217,7 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode, if (attr->ia_valid & ATTR_SIZE) { dbg_gen("size %lld -> %lld", inode->i_size, new_size); - err = vmtruncate(inode, new_size); + err = simple_setsize(inode, new_size); if (err) goto out; } @@ -1223,7 +1226,7 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode, if (attr->ia_valid & ATTR_SIZE) { /* Truncation changes inode [mc]time */ inode->i_mtime = inode->i_ctime = ubifs_current_time(inode); - /* 'vmtruncate()' changed @i_size, update @ui_size */ + /* 'simple_setsize()' changed @i_size, update @ui_size */ ui->ui_size = inode->i_size; } @@ -1304,9 +1307,9 @@ static void *ubifs_follow_link(struct dentry *dentry, struct nameidata *nd) return NULL; } -int ubifs_fsync(struct file *file, struct dentry *dentry, int datasync) +int ubifs_fsync(struct file *file, int datasync) { - struct inode *inode = dentry->d_inode; + struct inode *inode = file->f_mapping->host; struct ubifs_info *c = inode->i_sb->s_fs_info; int err; diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c index ad7f67b827ea..0084a33c4c69 100644 --- a/fs/ubifs/lpt.c +++ b/fs/ubifs/lpt.c @@ -1457,13 +1457,13 @@ struct ubifs_lprops *ubifs_lpt_lookup(struct ubifs_info *c, int lnum) shft -= UBIFS_LPT_FANOUT_SHIFT; nnode = ubifs_get_nnode(c, nnode, iip); if (IS_ERR(nnode)) - return ERR_PTR(PTR_ERR(nnode)); + return ERR_CAST(nnode); } iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1)); shft -= UBIFS_LPT_FANOUT_SHIFT; pnode = ubifs_get_pnode(c, nnode, iip); if (IS_ERR(pnode)) - return ERR_PTR(PTR_ERR(pnode)); + return ERR_CAST(pnode); iip = (i & (UBIFS_LPT_FANOUT - 1)); dbg_lp("LEB %d, free %d, dirty %d, flags %d", lnum, pnode->lprops[iip].free, pnode->lprops[iip].dirty, @@ -1586,7 +1586,7 @@ struct ubifs_lprops *ubifs_lpt_lookup_dirty(struct ubifs_info *c, int lnum) nnode = c->nroot; nnode = dirty_cow_nnode(c, nnode); if (IS_ERR(nnode)) - return ERR_PTR(PTR_ERR(nnode)); + return ERR_CAST(nnode); i = lnum - c->main_first; shft = c->lpt_hght * UBIFS_LPT_FANOUT_SHIFT; for (h = 1; h < c->lpt_hght; h++) { @@ -1594,19 +1594,19 @@ struct ubifs_lprops *ubifs_lpt_lookup_dirty(struct ubifs_info *c, int lnum) shft -= UBIFS_LPT_FANOUT_SHIFT; nnode = ubifs_get_nnode(c, nnode, iip); if (IS_ERR(nnode)) - return ERR_PTR(PTR_ERR(nnode)); + return ERR_CAST(nnode); nnode = dirty_cow_nnode(c, nnode); if (IS_ERR(nnode)) - return ERR_PTR(PTR_ERR(nnode)); + return ERR_CAST(nnode); } iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1)); shft -= UBIFS_LPT_FANOUT_SHIFT; pnode = ubifs_get_pnode(c, nnode, iip); if (IS_ERR(pnode)) - return ERR_PTR(PTR_ERR(pnode)); + return ERR_CAST(pnode); pnode = dirty_cow_pnode(c, pnode); if (IS_ERR(pnode)) - return ERR_PTR(PTR_ERR(pnode)); + return ERR_CAST(pnode); iip = (i & (UBIFS_LPT_FANOUT - 1)); dbg_lp("LEB %d, free %d, dirty %d, flags %d", lnum, pnode->lprops[iip].free, pnode->lprops[iip].dirty, diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c index 13cb7a4237bf..d12535b7fc78 100644 --- a/fs/ubifs/lpt_commit.c +++ b/fs/ubifs/lpt_commit.c @@ -646,7 +646,7 @@ static struct ubifs_pnode *pnode_lookup(struct ubifs_info *c, int i) shft -= UBIFS_LPT_FANOUT_SHIFT; nnode = ubifs_get_nnode(c, nnode, iip); if (IS_ERR(nnode)) - return ERR_PTR(PTR_ERR(nnode)); + return ERR_CAST(nnode); } iip = ((i >> shft) & 
(UBIFS_LPT_FANOUT - 1)); return ubifs_get_pnode(c, nnode, iip); diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c index 109c6ea03bb5..daae9e1f5382 100644 --- a/fs/ubifs/recovery.c +++ b/fs/ubifs/recovery.c @@ -24,7 +24,7 @@ * This file implements functions needed to recover from unclean un-mounts. * When UBIFS is mounted, it checks a flag on the master node to determine if * an un-mount was completed successfully. If not, the process of mounting - * incorparates additional checking and fixing of on-flash data structures. + * incorporates additional checking and fixing of on-flash data structures. * UBIFS always cleans away all remnants of an unclean un-mount, so that * errors do not accumulate. However UBIFS defers recovery if it is mounted * read-only, and the flash is not modified in that case. @@ -1063,8 +1063,21 @@ int ubifs_rcvry_gc_commit(struct ubifs_info *c) } err = ubifs_find_dirty_leb(c, &lp, wbuf->offs, 2); if (err) { - if (err == -ENOSPC) - dbg_err("could not find a dirty LEB"); + /* + * There are no dirty or empty LEBs subject to here being + * enough for the index. Try to use + * 'ubifs_find_free_leb_for_idx()', which will return any empty + * LEBs (ignoring index requirements). If the index then + * doesn't have enough LEBs the recovery commit will fail - + * which is the same result anyway i.e. recovery fails. So + * there is no problem ignoring index requirements and just + * grabbing a free LEB since we have already established there + * is not a dirty LEB we could have used instead. + */ + if (err == -ENOSPC) { + dbg_rcvry("could not find a dirty LEB"); + goto find_free; + } return err; } ubifs_assert(!(lp.flags & LPROPS_INDEX)); @@ -1139,8 +1152,8 @@ int ubifs_rcvry_gc_commit(struct ubifs_info *c) find_free: /* * There is no GC head LEB or the free space in the GC head LEB is too - * small. Allocate gc_lnum by calling 'ubifs_find_free_leb_for_idx()' so - * GC is not run. + * small, or there are not dirty LEBs. Allocate gc_lnum by calling + * 'ubifs_find_free_leb_for_idx()' so GC is not run. */ lnum = ubifs_find_free_leb_for_idx(c); if (lnum < 0) { diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c index 02feb59cefca..0b201114a5ad 100644 --- a/fs/ubifs/shrinker.c +++ b/fs/ubifs/shrinker.c @@ -277,7 +277,7 @@ static int kick_a_thread(void) return 0; } -int ubifs_shrinker(int nr, gfp_t gfp_mask) +int ubifs_shrinker(struct shrinker *shrink, int nr, gfp_t gfp_mask) { int freed, contention = 0; long clean_zn_cnt = atomic_long_read(&ubifs_clean_zn_cnt); diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 4d2f2157dd3f..5fc5a0988970 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -1307,6 +1307,8 @@ static int mount_ubifs(struct ubifs_info *c) if (err) goto out_orphans; err = ubifs_rcvry_gc_commit(c); + if (err) + goto out_orphans; } else { err = take_gc_lnum(c); if (err) @@ -1318,7 +1320,7 @@ static int mount_ubifs(struct ubifs_info *c) */ err = ubifs_leb_unmap(c, c->gc_lnum); if (err) - return err; + goto out_orphans; } err = dbg_check_lprops(c); diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index bd2542dad014..04310878f449 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -379,7 +379,7 @@ struct ubifs_gced_idx_leb { * The @ui_size is a "shadow" variable for @inode->i_size and UBIFS uses * @ui_size instead of @inode->i_size. 
The reason for this is that UBIFS cannot * make sure @inode->i_size is always changed under @ui_mutex, because it - * cannot call 'vmtruncate()' with @ui_mutex locked, because it would deadlock + * cannot call 'simple_setsize()' with @ui_mutex locked, because it would deadlock * with 'ubifs_writepage()' (see file.c). All the other inode fields are * changed under @ui_mutex, so they do not need "shadow" fields. Note, one * could consider to rework locking and base it on "shadow" fields. @@ -1575,7 +1575,7 @@ int ubifs_tnc_start_commit(struct ubifs_info *c, struct ubifs_zbranch *zroot); int ubifs_tnc_end_commit(struct ubifs_info *c); /* shrinker.c */ -int ubifs_shrinker(int nr_to_scan, gfp_t gfp_mask); +int ubifs_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask); /* commit.c */ int ubifs_bg_thread(void *info); @@ -1678,7 +1678,7 @@ const struct ubifs_lprops *ubifs_fast_find_frdi_idx(struct ubifs_info *c); int ubifs_calc_dark(const struct ubifs_info *c, int spc); /* file.c */ -int ubifs_fsync(struct file *file, struct dentry *dentry, int datasync); +int ubifs_fsync(struct file *file, int datasync); int ubifs_setattr(struct dentry *dentry, struct iattr *attr); /* dir.c */ diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c index 9a9378b4eb5a..b608efaa4cee 100644 --- a/fs/udf/balloc.c +++ b/fs/udf/balloc.c @@ -21,7 +21,6 @@ #include "udfdecl.h" -#include <linux/quotaops.h> #include <linux/buffer_head.h> #include <linux/bitops.h> @@ -159,8 +158,6 @@ static void udf_bitmap_free_blocks(struct super_block *sb, udf_debug("byte=%2x\n", ((char *)bh->b_data)[(bit + i) >> 3]); } else { - if (inode) - dquot_free_block(inode, 1); udf_add_free_space(sb, sbi->s_partition, 1); } } @@ -210,15 +207,8 @@ static int udf_bitmap_prealloc_blocks(struct super_block *sb, bit = block % (sb->s_blocksize << 3); while (bit < (sb->s_blocksize << 3) && block_count > 0) { - if (!udf_test_bit(bit, bh->b_data)) + if (!udf_clear_bit(bit, bh->b_data)) goto out; - else if (dquot_prealloc_block(inode, 1)) - goto out; - else if (!udf_clear_bit(bit, bh->b_data)) { - udf_debug("bit already cleared for block %d\n", bit); - dquot_free_block(inode, 1); - goto out; - } block_count--; alloc_count++; bit++; @@ -338,20 +328,6 @@ search_back: } got_block: - - /* - * Check quota for allocation of this block. - */ - if (inode) { - int ret = dquot_alloc_block(inode, 1); - - if (ret) { - mutex_unlock(&sbi->s_alloc_mutex); - *err = ret; - return 0; - } - } - newblock = bit + (block_group << (sb->s_blocksize_bits + 3)) - (sizeof(struct spaceBitmapDesc) << 3); @@ -401,10 +377,6 @@ static void udf_table_free_blocks(struct super_block *sb, } iinfo = UDF_I(table); - /* We do this up front - There are some error conditions that - could occure, but.. oh well */ - if (inode) - dquot_free_block(inode, count); udf_add_free_space(sb, sbi->s_partition, count); start = bloc->logicalBlockNum + offset; @@ -649,10 +621,7 @@ static int udf_table_prealloc_blocks(struct super_block *sb, epos.offset -= adsize; alloc_count = (elen >> sb->s_blocksize_bits); - if (inode && dquot_prealloc_block(inode, - alloc_count > block_count ? 
block_count : alloc_count)) - alloc_count = 0; - else if (alloc_count > block_count) { + if (alloc_count > block_count) { alloc_count = block_count; eloc.logicalBlockNum += alloc_count; elen -= (alloc_count << sb->s_blocksize_bits); @@ -752,14 +721,6 @@ static int udf_table_new_block(struct super_block *sb, newblock = goal_eloc.logicalBlockNum; goal_eloc.logicalBlockNum++; goal_elen -= sb->s_blocksize; - if (inode) { - *err = dquot_alloc_block(inode, 1); - if (*err) { - brelse(goal_epos.bh); - mutex_unlock(&sbi->s_alloc_mutex); - return 0; - } - } if (goal_elen) udf_write_aext(table, &goal_epos, &goal_eloc, goal_elen, 1); diff --git a/fs/udf/dir.c b/fs/udf/dir.c index 3a84455c2a77..51552bf50225 100644 --- a/fs/udf/dir.c +++ b/fs/udf/dir.c @@ -207,8 +207,9 @@ static int udf_readdir(struct file *filp, void *dirent, filldir_t filldir) /* readdir and lookup functions */ const struct file_operations udf_dir_operations = { + .llseek = generic_file_llseek, .read = generic_read_dir, .readdir = udf_readdir, .unlocked_ioctl = udf_ioctl, - .fsync = simple_fsync, + .fsync = generic_file_fsync, }; diff --git a/fs/udf/file.c b/fs/udf/file.c index baae3a723946..94e06d6bddbd 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -34,7 +34,6 @@ #include <linux/errno.h> #include <linux/smp_lock.h> #include <linux/pagemap.h> -#include <linux/quotaops.h> #include <linux/buffer_head.h> #include <linux/aio.h> #include <linux/smp_lock.h> @@ -219,39 +218,16 @@ const struct file_operations udf_file_operations = { .read = do_sync_read, .aio_read = generic_file_aio_read, .unlocked_ioctl = udf_ioctl, - .open = dquot_file_open, + .open = generic_file_open, .mmap = generic_file_mmap, .write = do_sync_write, .aio_write = udf_file_aio_write, .release = udf_release_file, - .fsync = simple_fsync, + .fsync = generic_file_fsync, .splice_read = generic_file_splice_read, .llseek = generic_file_llseek, }; -int udf_setattr(struct dentry *dentry, struct iattr *iattr) -{ - struct inode *inode = dentry->d_inode; - int error; - - error = inode_change_ok(inode, iattr); - if (error) - return error; - - if (is_quota_modification(inode, iattr)) - dquot_initialize(inode); - - if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) || - (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) { - error = dquot_transfer(inode, iattr); - if (error) - return error; - } - - return inode_setattr(inode, iattr); -} - const struct inode_operations udf_file_inode_operations = { .truncate = udf_truncate, - .setattr = udf_setattr, }; diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c index 2b5586c7f02a..18cd7111185d 100644 --- a/fs/udf/ialloc.c +++ b/fs/udf/ialloc.c @@ -20,7 +20,6 @@ #include "udfdecl.h" #include <linux/fs.h> -#include <linux/quotaops.h> #include <linux/sched.h> #include <linux/slab.h> @@ -32,13 +31,6 @@ void udf_free_inode(struct inode *inode) struct super_block *sb = inode->i_sb; struct udf_sb_info *sbi = UDF_SB(sb); - /* - * Note: we must free any quota before locking the superblock, - * as writing the quota to disk may need the lock as well. 
- */ - dquot_free_inode(inode); - dquot_drop(inode); - clear_inode(inode); mutex_lock(&sbi->s_alloc_mutex); @@ -61,7 +53,7 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err) struct super_block *sb = dir->i_sb; struct udf_sb_info *sbi = UDF_SB(sb); struct inode *inode; - int block, ret; + int block; uint32_t start = UDF_I(dir)->i_location.logicalBlockNum; struct udf_inode_info *iinfo; struct udf_inode_info *dinfo = UDF_I(dir); @@ -146,17 +138,6 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err) insert_inode_hash(inode); mark_inode_dirty(inode); - dquot_initialize(inode); - ret = dquot_alloc_inode(inode); - if (ret) { - dquot_drop(inode); - inode->i_flags |= S_NOQUOTA; - inode->i_nlink = 0; - iput(inode); - *err = ret; - return NULL; - } - *err = 0; return inode; } diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 8a3fbd177cab..124852bcf6fe 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -36,7 +36,6 @@ #include <linux/pagemap.h> #include <linux/buffer_head.h> #include <linux/writeback.h> -#include <linux/quotaops.h> #include <linux/slab.h> #include <linux/crc-itu-t.h> @@ -71,9 +70,6 @@ static int udf_get_block(struct inode *, sector_t, struct buffer_head *, int); void udf_delete_inode(struct inode *inode) { - if (!is_bad_inode(inode)) - dquot_initialize(inode); - truncate_inode_pages(&inode->i_data, 0); if (is_bad_inode(inode)) @@ -113,7 +109,6 @@ void udf_clear_inode(struct inode *inode) (unsigned long long)iinfo->i_lenExtents); } - dquot_drop(inode); kfree(iinfo->i_ext.i_data); iinfo->i_ext.i_data = NULL; } diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 585f733615dc..bf5fc674193c 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -27,7 +27,6 @@ #include <linux/errno.h> #include <linux/mm.h> #include <linux/slab.h> -#include <linux/quotaops.h> #include <linux/smp_lock.h> #include <linux/buffer_head.h> #include <linux/sched.h> @@ -563,8 +562,6 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode, int err; struct udf_inode_info *iinfo; - dquot_initialize(dir); - lock_kernel(); inode = udf_new_inode(dir, mode, &err); if (!inode) { @@ -617,8 +614,6 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode, if (!old_valid_dev(rdev)) return -EINVAL; - dquot_initialize(dir); - lock_kernel(); err = -EIO; inode = udf_new_inode(dir, mode, &err); @@ -664,8 +659,6 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode) struct udf_inode_info *dinfo = UDF_I(dir); struct udf_inode_info *iinfo; - dquot_initialize(dir); - lock_kernel(); err = -EMLINK; if (dir->i_nlink >= (256 << sizeof(dir->i_nlink)) - 1) @@ -800,8 +793,6 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry) struct fileIdentDesc *fi, cfi; struct kernel_lb_addr tloc; - dquot_initialize(dir); - retval = -ENOENT; lock_kernel(); fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); @@ -848,8 +839,6 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry) struct fileIdentDesc cfi; struct kernel_lb_addr tloc; - dquot_initialize(dir); - retval = -ENOENT; lock_kernel(); fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); @@ -904,8 +893,6 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry, struct buffer_head *bh; struct udf_inode_info *iinfo; - dquot_initialize(dir); - lock_kernel(); inode = udf_new_inode(dir, S_IFLNK | S_IRWXUGO, &err); if (!inode) @@ -1075,8 +1062,6 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir, int err; struct buffer_head *bh; - dquot_initialize(dir); - 
lock_kernel(); if (inode->i_nlink >= (256 << sizeof(inode->i_nlink)) - 1) { unlock_kernel(); @@ -1139,9 +1124,6 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry, struct kernel_lb_addr tloc; struct udf_inode_info *old_iinfo = UDF_I(old_inode); - dquot_initialize(old_dir); - dquot_initialize(new_dir); - lock_kernel(); ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi); if (ofi) { @@ -1387,7 +1369,6 @@ const struct export_operations udf_export_ops = { const struct inode_operations udf_dir_inode_operations = { .lookup = udf_lookup, .create = udf_create, - .setattr = udf_setattr, .link = udf_link, .unlink = udf_unlink, .symlink = udf_symlink, @@ -1400,5 +1381,4 @@ const struct inode_operations udf_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = page_follow_link_light, .put_link = page_put_link, - .setattr = udf_setattr, }; diff --git a/fs/udf/super.c b/fs/udf/super.c index 1e4543cbcd27..612d1e2e285a 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -557,6 +557,7 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options) { struct udf_options uopt; struct udf_sb_info *sbi = UDF_SB(sb); + int error = 0; uopt.flags = sbi->s_flags; uopt.uid = sbi->s_uid; @@ -582,17 +583,17 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options) *flags |= MS_RDONLY; } - if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { - unlock_kernel(); - return 0; - } + if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) + goto out_unlock; + if (*flags & MS_RDONLY) udf_close_lvid(sb); else udf_open_lvid(sb); +out_unlock: unlock_kernel(); - return 0; + return error; } /* Check Volume Structure Descriptors (ECMA 167 2/9.1) */ @@ -1939,7 +1940,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) /* Fill in the rest of the superblock */ sb->s_op = &udf_sb_ops; sb->s_export_op = &udf_export_ops; - sb->dq_op = NULL; + sb->s_dirt = 0; sb->s_magic = UDF_SUPER_MAGIC; sb->s_time_gran = 1000; diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index 9079ff7d6255..2bac0354891f 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h @@ -131,7 +131,6 @@ extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *, /* file.c */ extern long udf_ioctl(struct file *, unsigned int, unsigned long); -extern int udf_setattr(struct dentry *dentry, struct iattr *iattr); /* inode.c */ extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *); extern int udf_sync_inode(struct inode *); diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c index 5cfa4d85ccf2..048484fb10d2 100644 --- a/fs/ufs/balloc.c +++ b/fs/ufs/balloc.c @@ -12,7 +12,6 @@ #include <linux/stat.h> #include <linux/time.h> #include <linux/string.h> -#include <linux/quotaops.h> #include <linux/buffer_head.h> #include <linux/capability.h> #include <linux/bitops.h> @@ -85,9 +84,6 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count) "bit already cleared for fragment %u", i); } - dquot_free_block(inode, count); - - fs32_add(sb, &ucg->cg_cs.cs_nffree, count); uspi->cs_total.cs_nffree += count; fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count); @@ -195,7 +191,6 @@ do_more: ubh_setblock(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno); if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) ufs_clusteracct (sb, ucpi, blkno, 1); - dquot_free_block(inode, uspi->s_fpb); fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1); uspi->cs_total.cs_nbfree++; @@ -511,7 +506,6 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment, 
struct ufs_cg_private_info * ucpi; struct ufs_cylinder_group * ucg; unsigned cgno, fragno, fragoff, count, fragsize, i; - int ret; UFSD("ENTER, fragment %llu, oldcount %u, newcount %u\n", (unsigned long long)fragment, oldcount, newcount); @@ -557,11 +551,6 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment, fs32_add(sb, &ucg->cg_frsum[fragsize - count], 1); for (i = oldcount; i < newcount; i++) ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i); - ret = dquot_alloc_block(inode, count); - if (ret) { - *err = ret; - return 0; - } fs32_sub(sb, &ucg->cg_cs.cs_nffree, count); fs32_sub(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count); @@ -598,7 +587,6 @@ static u64 ufs_alloc_fragments(struct inode *inode, unsigned cgno, struct ufs_cylinder_group * ucg; unsigned oldcg, i, j, k, allocsize; u64 result; - int ret; UFSD("ENTER, ino %lu, cgno %u, goal %llu, count %u\n", inode->i_ino, cgno, (unsigned long long)goal, count); @@ -667,7 +655,6 @@ cg_found: for (i = count; i < uspi->s_fpb; i++) ubh_setbit (UCPI_UBH(ucpi), ucpi->c_freeoff, goal + i); i = uspi->s_fpb - count; - dquot_free_block(inode, i); fs32_add(sb, &ucg->cg_cs.cs_nffree, i); uspi->cs_total.cs_nffree += i; @@ -679,11 +666,6 @@ cg_found: result = ufs_bitmap_search (sb, ucpi, goal, allocsize); if (result == INVBLOCK) return 0; - ret = dquot_alloc_block(inode, count); - if (ret) { - *err = ret; - return 0; - } for (i = 0; i < count; i++) ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, result + i); @@ -718,7 +700,6 @@ static u64 ufs_alloccg_block(struct inode *inode, struct ufs_super_block_first * usb1; struct ufs_cylinder_group * ucg; u64 result, blkno; - int ret; UFSD("ENTER, goal %llu\n", (unsigned long long)goal); @@ -752,11 +733,6 @@ gotit: ubh_clrblock (UCPI_UBH(ucpi), ucpi->c_freeoff, blkno); if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) ufs_clusteracct (sb, ucpi, blkno, -1); - ret = dquot_alloc_block(inode, uspi->s_fpb); - if (ret) { - *err = ret; - return INVBLOCK; - } fs32_sub(sb, &ucg->cg_cs.cs_nbfree, 1); uspi->cs_total.cs_nbfree--; diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c index 317a0d444f6b..ec784756dc65 100644 --- a/fs/ufs/dir.c +++ b/fs/ufs/dir.c @@ -666,6 +666,6 @@ not_empty: const struct file_operations ufs_dir_operations = { .read = generic_read_dir, .readdir = ufs_readdir, - .fsync = simple_fsync, + .fsync = generic_file_fsync, .llseek = generic_file_llseek, }; diff --git a/fs/ufs/file.c b/fs/ufs/file.c index a8962cecde5b..33afa20d4509 100644 --- a/fs/ufs/file.c +++ b/fs/ufs/file.c @@ -24,7 +24,6 @@ */ #include <linux/fs.h> -#include <linux/quotaops.h> #include "ufs_fs.h" #include "ufs.h" @@ -41,7 +40,7 @@ const struct file_operations ufs_file_operations = { .write = do_sync_write, .aio_write = generic_file_aio_write, .mmap = generic_file_mmap, - .open = dquot_file_open, - .fsync = simple_fsync, + .open = generic_file_open, + .fsync = generic_file_fsync, .splice_read = generic_file_splice_read, }; diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c index 3a959d55084d..594480e537d2 100644 --- a/fs/ufs/ialloc.c +++ b/fs/ufs/ialloc.c @@ -27,7 +27,6 @@ #include <linux/time.h> #include <linux/stat.h> #include <linux/string.h> -#include <linux/quotaops.h> #include <linux/buffer_head.h> #include <linux/sched.h> #include <linux/bitops.h> @@ -95,9 +94,6 @@ void ufs_free_inode (struct inode * inode) is_directory = S_ISDIR(inode->i_mode); - dquot_free_inode(inode); - dquot_drop(inode); - clear_inode (inode); if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_iusedoff, bit)) @@ -347,21 +343,12 @@ cg_found: unlock_super (sb); - 
dquot_initialize(inode); - err = dquot_alloc_inode(inode); - if (err) { - dquot_drop(inode); - goto fail_without_unlock; - } - UFSD("allocating inode %lu\n", inode->i_ino); UFSD("EXIT\n"); return inode; fail_remove_inode: unlock_super(sb); -fail_without_unlock: - inode->i_flags |= S_NOQUOTA; inode->i_nlink = 0; iput(inode); UFSD("EXIT (FAILED): err %d\n", err); diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index cffa756f1047..73fe773aa034 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -37,7 +37,6 @@ #include <linux/smp_lock.h> #include <linux/buffer_head.h> #include <linux/writeback.h> -#include <linux/quotaops.h> #include "ufs_fs.h" #include "ufs.h" @@ -910,9 +909,6 @@ void ufs_delete_inode (struct inode * inode) { loff_t old_i_size; - if (!is_bad_inode(inode)) - dquot_initialize(inode); - truncate_inode_pages(&inode->i_data, 0); if (is_bad_inode(inode)) goto no_delete; diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index eabc02eb1294..b056f02b1fb3 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c @@ -30,7 +30,6 @@ #include <linux/time.h> #include <linux/fs.h> #include <linux/smp_lock.h> -#include <linux/quotaops.h> #include "ufs_fs.h" #include "ufs.h" @@ -86,8 +85,6 @@ static int ufs_create (struct inode * dir, struct dentry * dentry, int mode, UFSD("BEGIN\n"); - dquot_initialize(dir); - inode = ufs_new_inode(dir, mode); err = PTR_ERR(inode); @@ -112,8 +109,6 @@ static int ufs_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_t if (!old_valid_dev(rdev)) return -EINVAL; - dquot_initialize(dir); - inode = ufs_new_inode(dir, mode); err = PTR_ERR(inode); if (!IS_ERR(inode)) { @@ -138,8 +133,6 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry, if (l > sb->s_blocksize) goto out_notlocked; - dquot_initialize(dir); - lock_kernel(); inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO); err = PTR_ERR(inode); @@ -185,8 +178,6 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir, return -EMLINK; } - dquot_initialize(dir); - inode->i_ctime = CURRENT_TIME_SEC; inode_inc_link_count(inode); atomic_inc(&inode->i_count); @@ -204,8 +195,6 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, int mode) if (dir->i_nlink >= UFS_LINK_MAX) goto out; - dquot_initialize(dir); - lock_kernel(); inode_inc_link_count(dir); @@ -250,8 +239,6 @@ static int ufs_unlink(struct inode *dir, struct dentry *dentry) struct page *page; int err = -ENOENT; - dquot_initialize(dir); - de = ufs_find_entry(dir, &dentry->d_name, &page); if (!de) goto out; @@ -296,9 +283,6 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry, struct ufs_dir_entry *old_de; int err = -ENOENT; - dquot_initialize(old_dir); - dquot_initialize(new_dir); - old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_page); if (!old_de) goto out; diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 14743d935a93..3ec5a9eb6efb 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -77,7 +77,6 @@ #include <linux/errno.h> #include <linux/fs.h> -#include <linux/quotaops.h> #include <linux/slab.h> #include <linux/time.h> #include <linux/stat.h> @@ -918,6 +917,7 @@ again: sbi->s_bytesex = BYTESEX_LE; switch ((uspi->fs_magic = fs32_to_cpu(sb, usb3->fs_magic))) { case UFS_MAGIC: + case UFS_MAGIC_BW: case UFS2_MAGIC: case UFS_MAGIC_LFN: case UFS_MAGIC_FEA: @@ -927,6 +927,7 @@ again: sbi->s_bytesex = BYTESEX_BE; switch ((uspi->fs_magic = fs32_to_cpu(sb, usb3->fs_magic))) { case UFS_MAGIC: + case UFS_MAGIC_BW: case UFS2_MAGIC: case UFS_MAGIC_LFN: case UFS_MAGIC_FEA: @@ -1045,7 +1046,7 @@ 
magic_found: */ sb->s_op = &ufs_super_ops; sb->s_export_op = &ufs_export_ops; - sb->dq_op = NULL; /***/ + sb->s_magic = fs32_to_cpu(sb, usb3->fs_magic); uspi->s_sblkno = fs32_to_cpu(sb, usb1->fs_sblkno); @@ -1435,126 +1436,19 @@ static void destroy_inodecache(void) kmem_cache_destroy(ufs_inode_cachep); } -static void ufs_clear_inode(struct inode *inode) -{ - dquot_drop(inode); -} - -#ifdef CONFIG_QUOTA -static ssize_t ufs_quota_read(struct super_block *, int, char *,size_t, loff_t); -static ssize_t ufs_quota_write(struct super_block *, int, const char *, size_t, loff_t); -#endif - static const struct super_operations ufs_super_ops = { .alloc_inode = ufs_alloc_inode, .destroy_inode = ufs_destroy_inode, .write_inode = ufs_write_inode, .delete_inode = ufs_delete_inode, - .clear_inode = ufs_clear_inode, .put_super = ufs_put_super, .write_super = ufs_write_super, .sync_fs = ufs_sync_fs, .statfs = ufs_statfs, .remount_fs = ufs_remount, .show_options = ufs_show_options, -#ifdef CONFIG_QUOTA - .quota_read = ufs_quota_read, - .quota_write = ufs_quota_write, -#endif }; -#ifdef CONFIG_QUOTA - -/* Read data from quotafile - avoid pagecache and such because we cannot afford - * acquiring the locks... As quota files are never truncated and quota code - * itself serializes the operations (and noone else should touch the files) - * we don't have to be afraid of races */ -static ssize_t ufs_quota_read(struct super_block *sb, int type, char *data, - size_t len, loff_t off) -{ - struct inode *inode = sb_dqopt(sb)->files[type]; - sector_t blk = off >> sb->s_blocksize_bits; - int err = 0; - int offset = off & (sb->s_blocksize - 1); - int tocopy; - size_t toread; - struct buffer_head *bh; - loff_t i_size = i_size_read(inode); - - if (off > i_size) - return 0; - if (off+len > i_size) - len = i_size-off; - toread = len; - while (toread > 0) { - tocopy = sb->s_blocksize - offset < toread ? - sb->s_blocksize - offset : toread; - - bh = ufs_bread(inode, blk, 0, &err); - if (err) - return err; - if (!bh) /* A hole? */ - memset(data, 0, tocopy); - else { - memcpy(data, bh->b_data+offset, tocopy); - brelse(bh); - } - offset = 0; - toread -= tocopy; - data += tocopy; - blk++; - } - return len; -} - -/* Write to quotafile */ -static ssize_t ufs_quota_write(struct super_block *sb, int type, - const char *data, size_t len, loff_t off) -{ - struct inode *inode = sb_dqopt(sb)->files[type]; - sector_t blk = off >> sb->s_blocksize_bits; - int err = 0; - int offset = off & (sb->s_blocksize - 1); - int tocopy; - size_t towrite = len; - struct buffer_head *bh; - - mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); - while (towrite > 0) { - tocopy = sb->s_blocksize - offset < towrite ? 
- sb->s_blocksize - offset : towrite; - - bh = ufs_bread(inode, blk, 1, &err); - if (!bh) - goto out; - lock_buffer(bh); - memcpy(bh->b_data+offset, data, tocopy); - flush_dcache_page(bh->b_page); - set_buffer_uptodate(bh); - mark_buffer_dirty(bh); - unlock_buffer(bh); - brelse(bh); - offset = 0; - towrite -= tocopy; - data += tocopy; - blk++; - } -out: - if (len == towrite) { - mutex_unlock(&inode->i_mutex); - return err; - } - if (inode->i_size < off+len-towrite) - i_size_write(inode, off+len-towrite); - inode->i_version++; - inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; - mark_inode_dirty(inode); - mutex_unlock(&inode->i_mutex); - return len - towrite; -} - -#endif - static int ufs_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, struct vfsmount *mnt) { diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c index f294c44577dc..589e01a465ba 100644 --- a/fs/ufs/truncate.c +++ b/fs/ufs/truncate.c @@ -44,7 +44,6 @@ #include <linux/buffer_head.h> #include <linux/blkdev.h> #include <linux/sched.h> -#include <linux/quotaops.h> #include "ufs_fs.h" #include "ufs.h" @@ -501,12 +500,10 @@ out: return err; } - /* - * We don't define our `inode->i_op->truncate', and call it here, - * because of: - * - there is no way to know old size - * - there is no way inform user about error, if it happens in `truncate' + * TODO: + * - truncate case should use proper ordering instead of using + * simple_setsize */ int ufs_setattr(struct dentry *dentry, struct iattr *attr) { @@ -518,19 +515,10 @@ int ufs_setattr(struct dentry *dentry, struct iattr *attr) if (error) return error; - if (is_quota_modification(inode, attr)) - dquot_initialize(inode); - - if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || - (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { - error = dquot_transfer(inode, attr); - if (error) - return error; - } if (ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) { loff_t old_i_size = inode->i_size; - error = vmtruncate(inode, attr->ia_size); + error = simple_setsize(inode, attr->ia_size); if (error) return error; error = ufs_truncate(inode, old_i_size); diff --git a/fs/ufs/ufs_fs.h b/fs/ufs/ufs_fs.h index 6943ec677c0b..8aba544f9fad 100644 --- a/fs/ufs/ufs_fs.h +++ b/fs/ufs/ufs_fs.h @@ -48,6 +48,7 @@ typedef __u16 __bitwise __fs16; #define UFS_SECTOR_SIZE 512 #define UFS_SECTOR_BITS 9 #define UFS_MAGIC 0x00011954 +#define UFS_MAGIC_BW 0x0f242697 #define UFS2_MAGIC 0x19540119 #define UFS_CIGAM 0x54190100 /* byteswapped MAGIC */ diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index c8fb13f83b3f..0dce969d6cad 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -87,11 +87,9 @@ xfs-y += xfs_alloc.o \ xfs_trans_buf.o \ xfs_trans_extfree.o \ xfs_trans_inode.o \ - xfs_trans_item.o \ xfs_utils.o \ xfs_vnodeops.o \ - xfs_rw.o \ - xfs_dmops.o + xfs_rw.o xfs-$(CONFIG_XFS_TRACE) += xfs_btree_trace.o diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c index 9f769b5b38fc..b2771862fd3d 100644 --- a/fs/xfs/linux-2.6/xfs_acl.c +++ b/fs/xfs/linux-2.6/xfs_acl.c @@ -225,7 +225,7 @@ xfs_check_acl(struct inode *inode, int mask) struct posix_acl *acl; int error = -EAGAIN; - xfs_itrace_entry(ip); + trace_xfs_check_acl(ip); /* * If there is no attribute fork no ACL exists on this inode and diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 089eaca860b4..d24e78f32f3e 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -21,19 +21,12 @@ #include "xfs_inum.h" #include "xfs_sb.h" #include "xfs_ag.h" 
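The ufs_setattr() hunk earlier in this section swaps vmtruncate() for simple_setsize() to handle the in-core size and page-cache update, leaving on-disk block freeing to the filesystem's own truncate helper. A rough sketch of that shape for a hypothetical filesystem follows; example_truncate_blocks() is made up, the other helpers are the VFS routines the patch itself uses.

static int example_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = dentry->d_inode;
	int error;

	error = inode_change_ok(inode, attr);
	if (error)
		return error;

	if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size != inode->i_size) {
		loff_t old_size = inode->i_size;

		/* update i_size and truncate the page cache */
		error = simple_setsize(inode, attr->ia_size);
		if (error)
			return error;
		/* hypothetical: release the on-disk blocks past the new size */
		error = example_truncate_blocks(inode, old_size);
		if (error)
			return error;
	}
	return inode_setattr(inode, attr);
}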
-#include "xfs_dir2.h" #include "xfs_trans.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_alloc.h" -#include "xfs_btree.h" #include "xfs_error.h" #include "xfs_rw.h" #include "xfs_iomap.h" @@ -92,18 +85,15 @@ void xfs_count_page_state( struct page *page, int *delalloc, - int *unmapped, int *unwritten) { struct buffer_head *bh, *head; - *delalloc = *unmapped = *unwritten = 0; + *delalloc = *unwritten = 0; bh = head = page_buffers(page); do { - if (buffer_uptodate(bh) && !buffer_mapped(bh)) - (*unmapped) = 1; - else if (buffer_unwritten(bh)) + if (buffer_unwritten(bh)) (*unwritten) = 1; else if (buffer_delay(bh)) (*delalloc) = 1; @@ -212,23 +202,17 @@ xfs_setfilesize( } /* - * Schedule IO completion handling on a xfsdatad if this was - * the final hold on this ioend. If we are asked to wait, - * flush the workqueue. + * Schedule IO completion handling on the final put of an ioend. */ STATIC void xfs_finish_ioend( - xfs_ioend_t *ioend, - int wait) + struct xfs_ioend *ioend) { if (atomic_dec_and_test(&ioend->io_remaining)) { - struct workqueue_struct *wq; - - wq = (ioend->io_type == IO_UNWRITTEN) ? - xfsconvertd_workqueue : xfsdatad_workqueue; - queue_work(wq, &ioend->io_work); - if (wait) - flush_workqueue(wq); + if (ioend->io_type == IO_UNWRITTEN) + queue_work(xfsconvertd_workqueue, &ioend->io_work); + else + queue_work(xfsdatad_workqueue, &ioend->io_work); } } @@ -272,11 +256,25 @@ xfs_end_io( */ if (error == EAGAIN) { atomic_inc(&ioend->io_remaining); - xfs_finish_ioend(ioend, 0); + xfs_finish_ioend(ioend); /* ensure we don't spin on blocked ioends */ delay(1); - } else + } else { + if (ioend->io_iocb) + aio_complete(ioend->io_iocb, ioend->io_result, 0); xfs_destroy_ioend(ioend); + } +} + +/* + * Call IO completion handling in caller context on the final put of an ioend. + */ +STATIC void +xfs_finish_ioend_sync( + struct xfs_ioend *ioend) +{ + if (atomic_dec_and_test(&ioend->io_remaining)) + xfs_end_io(&ioend->io_work); } /* @@ -309,6 +307,8 @@ xfs_alloc_ioend( atomic_inc(&XFS_I(ioend->io_inode)->i_iocount); ioend->io_offset = 0; ioend->io_size = 0; + ioend->io_iocb = NULL; + ioend->io_result = 0; INIT_WORK(&ioend->io_work, xfs_end_io); return ioend; @@ -358,7 +358,7 @@ xfs_end_bio( bio->bi_end_io = NULL; bio_put(bio); - xfs_finish_ioend(ioend, 0); + xfs_finish_ioend(ioend); } STATIC void @@ -500,7 +500,7 @@ xfs_submit_ioend( } if (bio) xfs_submit_ioend_bio(wbc, ioend, bio); - xfs_finish_ioend(ioend, 0); + xfs_finish_ioend(ioend); } while ((ioend = next) != NULL); } @@ -614,31 +614,30 @@ xfs_map_at_offset( STATIC unsigned int xfs_probe_page( struct page *page, - unsigned int pg_offset, - int mapped) + unsigned int pg_offset) { + struct buffer_head *bh, *head; int ret = 0; if (PageWriteback(page)) return 0; + if (!PageDirty(page)) + return 0; + if (!page->mapping) + return 0; + if (!page_has_buffers(page)) + return 0; - if (page->mapping && PageDirty(page)) { - if (page_has_buffers(page)) { - struct buffer_head *bh, *head; - - bh = head = page_buffers(page); - do { - if (!buffer_uptodate(bh)) - break; - if (mapped != buffer_mapped(bh)) - break; - ret += bh->b_size; - if (ret >= pg_offset) - break; - } while ((bh = bh->b_this_page) != head); - } else - ret = mapped ? 
0 : PAGE_CACHE_SIZE; - } + bh = head = page_buffers(page); + do { + if (!buffer_uptodate(bh)) + break; + if (!buffer_mapped(bh)) + break; + ret += bh->b_size; + if (ret >= pg_offset) + break; + } while ((bh = bh->b_this_page) != head); return ret; } @@ -648,8 +647,7 @@ xfs_probe_cluster( struct inode *inode, struct page *startpage, struct buffer_head *bh, - struct buffer_head *head, - int mapped) + struct buffer_head *head) { struct pagevec pvec; pgoff_t tindex, tlast, tloff; @@ -658,7 +656,7 @@ xfs_probe_cluster( /* First sum forwards in this page */ do { - if (!buffer_uptodate(bh) || (mapped != buffer_mapped(bh))) + if (!buffer_uptodate(bh) || !buffer_mapped(bh)) return total; total += bh->b_size; } while ((bh = bh->b_this_page) != head); @@ -692,7 +690,7 @@ xfs_probe_cluster( pg_offset = PAGE_CACHE_SIZE; if (page->index == tindex && trylock_page(page)) { - pg_len = xfs_probe_page(page, pg_offset, mapped); + pg_len = xfs_probe_page(page, pg_offset); unlock_page(page); } @@ -761,7 +759,6 @@ xfs_convert_page( struct xfs_bmbt_irec *imap, xfs_ioend_t **ioendp, struct writeback_control *wbc, - int startio, int all_bh) { struct buffer_head *bh, *head; @@ -832,19 +829,14 @@ xfs_convert_page( ASSERT(imap->br_startblock != DELAYSTARTBLOCK); xfs_map_at_offset(inode, bh, imap, offset); - if (startio) { - xfs_add_to_ioend(inode, bh, offset, - type, ioendp, done); - } else { - set_buffer_dirty(bh); - unlock_buffer(bh); - mark_buffer_dirty(bh); - } + xfs_add_to_ioend(inode, bh, offset, type, + ioendp, done); + page_dirty--; count++; } else { type = IO_NEW; - if (buffer_mapped(bh) && all_bh && startio) { + if (buffer_mapped(bh) && all_bh) { lock_buffer(bh); xfs_add_to_ioend(inode, bh, offset, type, ioendp, done); @@ -859,14 +851,12 @@ xfs_convert_page( if (uptodate && bh == head) SetPageUptodate(page); - if (startio) { - if (count) { - wbc->nr_to_write--; - if (wbc->nr_to_write <= 0) - done = 1; - } - xfs_start_page_writeback(page, !page_dirty, count); + if (count) { + wbc->nr_to_write--; + if (wbc->nr_to_write <= 0) + done = 1; } + xfs_start_page_writeback(page, !page_dirty, count); return done; fail_unlock_page: @@ -886,7 +876,6 @@ xfs_cluster_write( struct xfs_bmbt_irec *imap, xfs_ioend_t **ioendp, struct writeback_control *wbc, - int startio, int all_bh, pgoff_t tlast) { @@ -902,7 +891,7 @@ xfs_cluster_write( for (i = 0; i < pagevec_count(&pvec); i++) { done = xfs_convert_page(inode, pvec.pages[i], tindex++, - imap, ioendp, wbc, startio, all_bh); + imap, ioendp, wbc, all_bh); if (done) break; } @@ -981,7 +970,7 @@ xfs_aops_discard_page( */ error = xfs_bmapi(NULL, ip, offset_fsb, 1, XFS_BMAPI_ENTIRE, NULL, 0, &imap, - &nimaps, NULL, NULL); + &nimaps, NULL); if (error) { /* something screwed, just bail */ @@ -1009,7 +998,7 @@ xfs_aops_discard_page( */ xfs_bmap_init(&flist, &firstblock); error = xfs_bunmapi(NULL, ip, offset_fsb, 1, 0, 1, &firstblock, - &flist, NULL, &done); + &flist, &done); ASSERT(!flist.xbf_count && !flist.xbf_first); if (error) { @@ -1032,50 +1021,66 @@ out_invalidate: } /* - * Calling this without startio set means we are being asked to make a dirty - * page ready for freeing it's buffers. When called with startio set then - * we are coming from writepage. + * Write out a dirty page. + * + * For delalloc space on the page we need to allocate space and flush it. + * For unwritten space on the page we need to start the conversion to + * regular allocated space. + * For any other dirty buffer heads on the page we should flush them. 
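The reworked writeback path above attaches every buffer to an ioend and drives completion with a reference count: each piece of in-flight I/O holds a reference, and the final put (xfs_finish_ioend() in the hunk further up) queues the completion work. A stripped-down sketch of that pattern with hypothetical names:

struct example_ioend {
	atomic_t		io_remaining;	/* one ref per in-flight piece of I/O */
	struct work_struct	io_work;	/* completion runs in process context */
};

static struct workqueue_struct *example_wq;	/* hypothetical completion workqueue */

static void example_finish_ioend(struct example_ioend *ioend)
{
	/* drop one reference; the final put schedules the completion handler */
	if (atomic_dec_and_test(&ioend->io_remaining))
		queue_work(example_wq, &ioend->io_work);
}

Both the submission path and every bio end_io callback drop a reference this way, so the work item runs exactly once, after the last bio has finished.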
* - * When called with startio set it is important that we write the WHOLE - * page if possible. - * The bh->b_state's cannot know if any of the blocks or which block for - * that matter are dirty due to mmap writes, and therefore bh uptodate is - * only valid if the page itself isn't completely uptodate. Some layers - * may clear the page dirty flag prior to calling write page, under the - * assumption the entire page will be written out; by not writing out the - * whole page the page can be reused before all valid dirty data is - * written out. Note: in the case of a page that has been dirty'd by - * mapwrite and but partially setup by block_prepare_write the - * bh->b_states's will not agree and only ones setup by BPW/BCW will have - * valid state, thus the whole page must be written out thing. + * If we detect that a transaction would be required to flush the page, we + * have to check the process flags first, if we are already in a transaction + * or disk I/O during allocations is off, we need to fail the writepage and + * redirty the page. */ - STATIC int -xfs_page_state_convert( - struct inode *inode, - struct page *page, - struct writeback_control *wbc, - int startio, - int unmapped) /* also implies page uptodate */ +xfs_vm_writepage( + struct page *page, + struct writeback_control *wbc) { + struct inode *inode = page->mapping->host; + int delalloc, unwritten; struct buffer_head *bh, *head; struct xfs_bmbt_irec imap; xfs_ioend_t *ioend = NULL, *iohead = NULL; loff_t offset; - unsigned long p_offset = 0; unsigned int type; __uint64_t end_offset; pgoff_t end_index, last_index; ssize_t size, len; int flags, err, imap_valid = 0, uptodate = 1; - int page_dirty, count = 0; - int trylock = 0; - int all_bh = unmapped; + int count = 0; + int all_bh = 0; - if (startio) { - if (wbc->sync_mode == WB_SYNC_NONE && wbc->nonblocking) - trylock |= BMAPI_TRYLOCK; - } + trace_xfs_writepage(inode, page, 0); + + ASSERT(page_has_buffers(page)); + + /* + * Refuse to write the page out if we are called from reclaim context. + * + * This avoids stack overflows when called from deeply used stacks in + * random callers for direct reclaim or memcg reclaim. We explicitly + * allow reclaim from kswapd as the stack usage there is relatively low. + * + * This should really be done by the core VM, but until that happens + * filesystems like XFS, btrfs and ext4 have to take care of this + * by themselves. + */ + if ((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == PF_MEMALLOC) + goto out_fail; + + /* + * We need a transaction if there are delalloc or unwritten buffers + * on the page. + * + * If we need a transaction and the process flags say we are already + * in a transaction, or no IO is allowed then mark the page dirty + * again and leave the page as is. + */ + xfs_count_page_state(page, &delalloc, &unwritten); + if ((current->flags & PF_FSTRANS) && (delalloc || unwritten)) + goto out_fail; /* Is this page beyond the end of the file? */ offset = i_size_read(inode); @@ -1084,50 +1089,33 @@ xfs_page_state_convert( if (page->index >= end_index) { if ((page->index >= end_index + 1) || !(i_size_read(inode) & (PAGE_CACHE_SIZE - 1))) { - if (startio) - unlock_page(page); + unlock_page(page); return 0; } } - /* - * page_dirty is initially a count of buffers on the page before - * EOF and is decremented as we move each into a cleanable state. - * - * Derivation: - * - * End offset is the highest offset that this page should represent. 
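The PF_MEMALLOC/PF_KSWAPD test just added above is the reclaim-context guard; its out_fail target, a few lines further down in the diff, redirties the page instead of writing it. As a hedged illustration, the same guard written inline would look like this (the flag test is the patch's own, the body is what out_fail does):

/* Direct reclaim (PF_MEMALLOC without PF_KSWAPD) can arrive on an
 * arbitrarily deep stack, so the page is redirtied rather than written. */
if ((current->flags & (PF_MEMALLOC | PF_KSWAPD)) == PF_MEMALLOC) {
	redirty_page_for_writepage(wbc, page);
	unlock_page(page);
	return 0;
}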
- * If we are on the last page, (end_offset & (PAGE_CACHE_SIZE - 1)) - * will evaluate non-zero and be less than PAGE_CACHE_SIZE and - * hence give us the correct page_dirty count. On any other page, - * it will be zero and in that case we need page_dirty to be the - * count of buffers on the page. - */ end_offset = min_t(unsigned long long, - (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT, offset); + (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT, + offset); len = 1 << inode->i_blkbits; - p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1), - PAGE_CACHE_SIZE); - p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE; - page_dirty = p_offset / len; bh = head = page_buffers(page); offset = page_offset(page); flags = BMAPI_READ; type = IO_NEW; - /* TODO: cleanup count and page_dirty */ - do { if (offset >= end_offset) break; if (!buffer_uptodate(bh)) uptodate = 0; - if (!(PageUptodate(page) || buffer_uptodate(bh)) && !startio) { - /* - * the iomap is actually still valid, but the ioend - * isn't. shouldn't happen too often. - */ + + /* + * A hole may still be marked uptodate because discard_buffer + * leaves the flag set. + */ + if (!buffer_mapped(bh) && buffer_uptodate(bh)) { + ASSERT(!buffer_dirty(bh)); imap_valid = 0; continue; } @@ -1135,19 +1123,7 @@ xfs_page_state_convert( if (imap_valid) imap_valid = xfs_imap_valid(inode, &imap, offset); - /* - * First case, map an unwritten extent and prepare for - * extent state conversion transaction on completion. - * - * Second case, allocate space for a delalloc buffer. - * We can return EAGAIN here in the release page case. - * - * Third case, an unmapped buffer was found, and we are - * in a path where we need to write the whole page out. - */ - if (buffer_unwritten(bh) || buffer_delay(bh) || - ((buffer_uptodate(bh) || PageUptodate(page)) && - !buffer_mapped(bh) && (unmapped || startio))) { + if (buffer_unwritten(bh) || buffer_delay(bh)) { int new_ioend = 0; /* @@ -1161,15 +1137,16 @@ xfs_page_state_convert( flags = BMAPI_WRITE | BMAPI_IGNSTATE; } else if (buffer_delay(bh)) { type = IO_DELAY; - flags = BMAPI_ALLOCATE | trylock; - } else { - type = IO_NEW; - flags = BMAPI_WRITE | BMAPI_MMAP; + flags = BMAPI_ALLOCATE; + + if (wbc->sync_mode == WB_SYNC_NONE && + wbc->nonblocking) + flags |= BMAPI_TRYLOCK; } if (!imap_valid) { /* - * if we didn't have a valid mapping then we + * If we didn't have a valid mapping then we * need to ensure that we put the new mapping * in a new ioend structure. This needs to be * done to ensure that the ioends correctly @@ -1177,14 +1154,7 @@ xfs_page_state_convert( * for unwritten extent conversion. */ new_ioend = 1; - if (type == IO_NEW) { - size = xfs_probe_cluster(inode, - page, bh, head, 0); - } else { - size = len; - } - - err = xfs_map_blocks(inode, offset, size, + err = xfs_map_blocks(inode, offset, len, &imap, flags); if (err) goto error; @@ -1193,19 +1163,11 @@ xfs_page_state_convert( } if (imap_valid) { xfs_map_at_offset(inode, bh, &imap, offset); - if (startio) { - xfs_add_to_ioend(inode, bh, offset, - type, &ioend, - new_ioend); - } else { - set_buffer_dirty(bh); - unlock_buffer(bh); - mark_buffer_dirty(bh); - } - page_dirty--; + xfs_add_to_ioend(inode, bh, offset, type, + &ioend, new_ioend); count++; } - } else if (buffer_uptodate(bh) && startio) { + } else if (buffer_uptodate(bh)) { /* * we got here because the buffer is already mapped. 
* That means it must already have extents allocated @@ -1213,8 +1175,7 @@ xfs_page_state_convert( */ if (!imap_valid || flags != BMAPI_READ) { flags = BMAPI_READ; - size = xfs_probe_cluster(inode, page, bh, - head, 1); + size = xfs_probe_cluster(inode, page, bh, head); err = xfs_map_blocks(inode, offset, size, &imap, flags); if (err) @@ -1233,18 +1194,16 @@ xfs_page_state_convert( */ type = IO_NEW; if (trylock_buffer(bh)) { - ASSERT(buffer_mapped(bh)); if (imap_valid) all_bh = 1; xfs_add_to_ioend(inode, bh, offset, type, &ioend, !imap_valid); - page_dirty--; count++; } else { imap_valid = 0; } - } else if ((buffer_uptodate(bh) || PageUptodate(page)) && - (unmapped || startio)) { + } else if (PageUptodate(page)) { + ASSERT(buffer_mapped(bh)); imap_valid = 0; } @@ -1256,8 +1215,7 @@ xfs_page_state_convert( if (uptodate && bh == head) SetPageUptodate(page); - if (startio) - xfs_start_page_writeback(page, 1, count); + xfs_start_page_writeback(page, 1, count); if (ioend && imap_valid) { xfs_off_t end_index; @@ -1275,124 +1233,27 @@ xfs_page_state_convert( end_index = last_index; xfs_cluster_write(inode, page->index + 1, &imap, &ioend, - wbc, startio, all_bh, end_index); + wbc, all_bh, end_index); } if (iohead) xfs_submit_ioend(wbc, iohead); - return page_dirty; + return 0; error: if (iohead) xfs_cancel_ioend(iohead); - /* - * If it's delalloc and we have nowhere to put it, - * throw it away, unless the lower layers told - * us to try again. - */ - if (err != -EAGAIN) { - if (!unmapped) - xfs_aops_discard_page(page); - ClearPageUptodate(page); - } + xfs_aops_discard_page(page); + ClearPageUptodate(page); + unlock_page(page); return err; -} - -/* - * writepage: Called from one of two places: - * - * 1. we are flushing a delalloc buffer head. - * - * 2. we are writing out a dirty page. Typically the page dirty - * state is cleared before we get here. In this case is it - * conceivable we have no buffer heads. - * - * For delalloc space on the page we need to allocate space and - * flush it. For unmapped buffer heads on the page we should - * allocate space if the page is uptodate. For any other dirty - * buffer heads on the page we should flush them. - * - * If we detect that a transaction would be required to flush - * the page, we have to check the process flags first, if we - * are already in a transaction or disk I/O during allocations - * is off, we need to fail the writepage and redirty the page. - */ - -STATIC int -xfs_vm_writepage( - struct page *page, - struct writeback_control *wbc) -{ - int error; - int need_trans; - int delalloc, unmapped, unwritten; - struct inode *inode = page->mapping->host; - - trace_xfs_writepage(inode, page, 0); - - /* - * We need a transaction if: - * 1. There are delalloc buffers on the page - * 2. The page is uptodate and we have unmapped buffers - * 3. The page is uptodate and we have no buffers - * 4. There are unwritten buffers on the page - */ - - if (!page_has_buffers(page)) { - unmapped = 1; - need_trans = 1; - } else { - xfs_count_page_state(page, &delalloc, &unmapped, &unwritten); - if (!PageUptodate(page)) - unmapped = 0; - need_trans = delalloc + unmapped + unwritten; - } - - /* - * If we need a transaction and the process flags say - * we are already in a transaction, or no IO is allowed - * then mark the page dirty again and leave the page - * as is. - */ - if (current_test_flags(PF_FSTRANS) && need_trans) - goto out_fail; - - /* - * Delay hooking up buffer heads until we have - * made our go/no-go decision. 
- */ - if (!page_has_buffers(page)) - create_empty_buffers(page, 1 << inode->i_blkbits, 0); - - - /* - * VM calculation for nr_to_write seems off. Bump it way - * up, this gets simple streaming writes zippy again. - * To be reviewed again after Jens' writeback changes. - */ - wbc->nr_to_write *= 4; - - /* - * Convert delayed allocate, unwritten or unmapped space - * to real space and flush out to disk. - */ - error = xfs_page_state_convert(inode, page, wbc, 1, unmapped); - if (error == -EAGAIN) - goto out_fail; - if (unlikely(error < 0)) - goto out_unlock; - - return 0; out_fail: redirty_page_for_writepage(wbc, page); unlock_page(page); return 0; -out_unlock: - unlock_page(page); - return error; } STATIC int @@ -1406,65 +1267,27 @@ xfs_vm_writepages( /* * Called to move a page into cleanable state - and from there - * to be released. Possibly the page is already clean. We always + * to be released. The page should already be clean. We always * have buffer heads in this call. * - * Returns 0 if the page is ok to release, 1 otherwise. - * - * Possible scenarios are: - * - * 1. We are being called to release a page which has been written - * to via regular I/O. buffer heads will be dirty and possibly - * delalloc. If no delalloc buffer heads in this case then we - * can just return zero. - * - * 2. We are called to release a page which has been written via - * mmap, all we need to do is ensure there is no delalloc - * state in the buffer heads, if not we can let the caller - * free them and we should come back later via writepage. + * Returns 1 if the page is ok to release, 0 otherwise. */ STATIC int xfs_vm_releasepage( struct page *page, gfp_t gfp_mask) { - struct inode *inode = page->mapping->host; - int dirty, delalloc, unmapped, unwritten; - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = 1, - }; + int delalloc, unwritten; - trace_xfs_releasepage(inode, page, 0); + trace_xfs_releasepage(page->mapping->host, page, 0); - if (!page_has_buffers(page)) - return 0; - - xfs_count_page_state(page, &delalloc, &unmapped, &unwritten); - if (!delalloc && !unwritten) - goto free_buffers; + xfs_count_page_state(page, &delalloc, &unwritten); - if (!(gfp_mask & __GFP_FS)) + if (WARN_ON(delalloc)) return 0; - - /* If we are already inside a transaction or the thread cannot - * do I/O, we cannot release this page. - */ - if (current_test_flags(PF_FSTRANS)) + if (WARN_ON(unwritten)) return 0; - /* - * Convert delalloc space to real space, do not flush the - * data out to disk, that will be done by the caller. - * Never need to allocate space here - we will always - * come back to writepage in that case. - */ - dirty = xfs_page_state_convert(inode, page, &wbc, 0, 0); - if (dirty == 0 && !unwritten) - goto free_buffers; - return 0; - -free_buffers: return try_to_free_buffers(page); } @@ -1474,9 +1297,9 @@ __xfs_get_blocks( sector_t iblock, struct buffer_head *bh_result, int create, - int direct, - bmapi_flags_t flags) + int direct) { + int flags = create ? BMAPI_WRITE : BMAPI_READ; struct xfs_bmbt_irec imap; xfs_off_t offset; ssize_t size; @@ -1491,8 +1314,11 @@ __xfs_get_blocks( if (!create && direct && offset >= i_size_read(inode)) return 0; - error = xfs_iomap(XFS_I(inode), offset, size, - create ? 
flags : BMAPI_READ, &imap, &nimap, &new); + if (direct && create) + flags |= BMAPI_DIRECT; + + error = xfs_iomap(XFS_I(inode), offset, size, flags, &imap, &nimap, + &new); if (error) return -error; if (nimap == 0) @@ -1572,8 +1398,7 @@ xfs_get_blocks( struct buffer_head *bh_result, int create) { - return __xfs_get_blocks(inode, iblock, - bh_result, create, 0, BMAPI_WRITE); + return __xfs_get_blocks(inode, iblock, bh_result, create, 0); } STATIC int @@ -1583,61 +1408,59 @@ xfs_get_blocks_direct( struct buffer_head *bh_result, int create) { - return __xfs_get_blocks(inode, iblock, - bh_result, create, 1, BMAPI_WRITE|BMAPI_DIRECT); + return __xfs_get_blocks(inode, iblock, bh_result, create, 1); } +/* + * Complete a direct I/O write request. + * + * If the private argument is non-NULL __xfs_get_blocks signals us that we + * need to issue a transaction to convert the range from unwritten to written + * extents. In case this is regular synchronous I/O we just call xfs_end_io + * to do this and we are done. But in case this was a successfull AIO + * request this handler is called from interrupt context, from which we + * can't start transactions. In that case offload the I/O completion to + * the workqueues we also use for buffered I/O completion. + */ STATIC void -xfs_end_io_direct( - struct kiocb *iocb, - loff_t offset, - ssize_t size, - void *private) +xfs_end_io_direct_write( + struct kiocb *iocb, + loff_t offset, + ssize_t size, + void *private, + int ret, + bool is_async) { - xfs_ioend_t *ioend = iocb->private; + struct xfs_ioend *ioend = iocb->private; /* - * Non-NULL private data means we need to issue a transaction to - * convert a range from unwritten to written extents. This needs - * to happen from process context but aio+dio I/O completion - * happens from irq context so we need to defer it to a workqueue. - * This is not necessary for synchronous direct I/O, but we do - * it anyway to keep the code uniform and simpler. - * - * Well, if only it were that simple. Because synchronous direct I/O - * requires extent conversion to occur *before* we return to userspace, - * we have to wait for extent conversion to complete. Look at the - * iocb that has been passed to us to determine if this is AIO or - * not. If it is synchronous, tell xfs_finish_ioend() to kick the - * workqueue and wait for it to complete. - * - * The core direct I/O code might be changed to always call the - * completion handler in the future, in which case all this can - * go away. + * blockdev_direct_IO can return an error even after the I/O + * completion handler was called. Thus we need to protect + * against double-freeing. */ + iocb->private = NULL; + ioend->io_offset = offset; ioend->io_size = size; - if (ioend->io_type == IO_READ) { - xfs_finish_ioend(ioend, 0); - } else if (private && size > 0) { - xfs_finish_ioend(ioend, is_sync_kiocb(iocb)); - } else { + if (private && size > 0) + ioend->io_type = IO_UNWRITTEN; + + if (is_async) { /* - * A direct I/O write ioend starts it's life in unwritten - * state in case they map an unwritten extent. This write - * didn't map an unwritten extent so switch it's completion - * handler. + * If we are converting an unwritten extent we need to delay + * the AIO completion until after the unwrittent extent + * conversion has completed, otherwise do it ASAP. 
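The comment above explains why a successful AIO write that still needs unwritten-extent conversion cannot be completed from interrupt context. A minimal sketch of the hand-off the patch sets up through the new io_iocb/io_result fields; aside from aio_complete(), the names here are hypothetical.

struct example_ioend {
	struct kiocb	*io_iocb;	/* non-NULL for async direct I/O writes */
	int		io_result;	/* value to hand to aio_complete() later */
	/* ... */
};

/* in interrupt context: stash the iocb instead of completing it */
static void example_defer_aio(struct example_ioend *ioend, struct kiocb *iocb, int ret)
{
	ioend->io_iocb = iocb;
	ioend->io_result = ret;
}

/* later, from the workqueue, once the unwritten-extent conversion is done */
static void example_complete_aio(struct example_ioend *ioend)
{
	if (ioend->io_iocb)
		aio_complete(ioend->io_iocb, ioend->io_result, 0);
}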
*/ - ioend->io_type = IO_NEW; - xfs_finish_ioend(ioend, 0); + if (ioend->io_type == IO_UNWRITTEN) { + ioend->io_iocb = iocb; + ioend->io_result = ret; + } else { + aio_complete(iocb, ret, 0); + } + xfs_finish_ioend(ioend); + } else { + xfs_finish_ioend_sync(ioend); } - - /* - * blockdev_direct_IO can return an error even after the I/O - * completion handler was called. Thus we need to protect - * against double-freeing. - */ - iocb->private = NULL; } STATIC ssize_t @@ -1648,23 +1471,26 @@ xfs_vm_direct_IO( loff_t offset, unsigned long nr_segs) { - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; - struct block_device *bdev; - ssize_t ret; - - bdev = xfs_find_bdev_for_inode(inode); - - iocb->private = xfs_alloc_ioend(inode, rw == WRITE ? - IO_UNWRITTEN : IO_READ); - - ret = blockdev_direct_IO_no_locking(rw, iocb, inode, bdev, iov, - offset, nr_segs, - xfs_get_blocks_direct, - xfs_end_io_direct); + struct inode *inode = iocb->ki_filp->f_mapping->host; + struct block_device *bdev = xfs_find_bdev_for_inode(inode); + ssize_t ret; + + if (rw & WRITE) { + iocb->private = xfs_alloc_ioend(inode, IO_NEW); + + ret = blockdev_direct_IO_no_locking(rw, iocb, inode, bdev, iov, + offset, nr_segs, + xfs_get_blocks_direct, + xfs_end_io_direct_write); + if (ret != -EIOCBQUEUED && iocb->private) + xfs_destroy_ioend(iocb->private); + } else { + ret = blockdev_direct_IO_no_locking(rw, iocb, inode, bdev, iov, + offset, nr_segs, + xfs_get_blocks_direct, + NULL); + } - if (unlikely(ret != -EIOCBQUEUED && iocb->private)) - xfs_destroy_ioend(iocb->private); return ret; } @@ -1679,8 +1505,8 @@ xfs_vm_write_begin( void **fsdata) { *pagep = NULL; - return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, - xfs_get_blocks); + return block_write_begin(file, mapping, pos, len, flags | AOP_FLAG_NOFS, + pagep, fsdata, xfs_get_blocks); } STATIC sector_t @@ -1691,7 +1517,7 @@ xfs_vm_bmap( struct inode *inode = (struct inode *)mapping->host; struct xfs_inode *ip = XFS_I(inode); - xfs_itrace_entry(XFS_I(inode)); + trace_xfs_vm_bmap(XFS_I(inode)); xfs_ilock(ip, XFS_IOLOCK_SHARED); xfs_flush_pages(ip, (xfs_off_t)0, -1, 0, FI_REMAPF); xfs_iunlock(ip, XFS_IOLOCK_SHARED); diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h index 4cfc6ea87df8..c5057fb6237a 100644 --- a/fs/xfs/linux-2.6/xfs_aops.h +++ b/fs/xfs/linux-2.6/xfs_aops.h @@ -37,6 +37,8 @@ typedef struct xfs_ioend { size_t io_size; /* size of the extent */ xfs_off_t io_offset; /* offset in the file */ struct work_struct io_work; /* xfsdatad work queue */ + struct kiocb *io_iocb; + int io_result; } xfs_ioend_t; extern const struct address_space_operations xfs_address_space_operations; @@ -45,6 +47,6 @@ extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int); extern void xfs_ioend_init(void); extern void xfs_ioend_wait(struct xfs_inode *); -extern void xfs_count_page_state(struct page *, int *, int *, int *); +extern void xfs_count_page_state(struct page *, int *, int *); #endif /* __XFS_AOPS_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 649ade8ef598..ea79072f5210 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -39,13 +39,12 @@ #include "xfs_inum.h" #include "xfs_log.h" #include "xfs_ag.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_trace.h" static kmem_zone_t *xfs_buf_zone; STATIC int xfsbufd(void *); -STATIC int xfsbufd_wakeup(int, gfp_t); +STATIC int xfsbufd_wakeup(struct shrinker *, int, gfp_t); STATIC 
void xfs_buf_delwri_queue(xfs_buf_t *, int); static struct shrinker xfs_buf_shake = { .shrink = xfsbufd_wakeup, @@ -340,7 +339,7 @@ _xfs_buf_lookup_pages( __func__, gfp_mask); XFS_STATS_INC(xb_page_retries); - xfsbufd_wakeup(0, gfp_mask); + xfsbufd_wakeup(NULL, 0, gfp_mask); congestion_wait(BLK_RW_ASYNC, HZ/50); goto retry; } @@ -579,9 +578,9 @@ _xfs_buf_read( XBF_READ_AHEAD | _XBF_RUN_QUEUES); status = xfs_buf_iorequest(bp); - if (!status && !(flags & XBF_ASYNC)) - status = xfs_buf_iowait(bp); - return status; + if (status || XFS_BUF_ISERROR(bp) || (flags & XBF_ASYNC)) + return status; + return xfs_buf_iowait(bp); } xfs_buf_t * @@ -897,36 +896,6 @@ xfs_buf_unlock( trace_xfs_buf_unlock(bp, _RET_IP_); } - -/* - * Pinning Buffer Storage in Memory - * Ensure that no attempt to force a buffer to disk will succeed. - */ -void -xfs_buf_pin( - xfs_buf_t *bp) -{ - trace_xfs_buf_pin(bp, _RET_IP_); - atomic_inc(&bp->b_pin_count); -} - -void -xfs_buf_unpin( - xfs_buf_t *bp) -{ - trace_xfs_buf_unpin(bp, _RET_IP_); - - if (atomic_dec_and_test(&bp->b_pin_count)) - wake_up_all(&bp->b_waiters); -} - -int -xfs_buf_ispin( - xfs_buf_t *bp) -{ - return atomic_read(&bp->b_pin_count); -} - STATIC void xfs_buf_wait_unpin( xfs_buf_t *bp) @@ -1018,13 +987,12 @@ xfs_bwrite( { int error; - bp->b_strat = xfs_bdstrat_cb; bp->b_mount = mp; bp->b_flags |= XBF_WRITE; bp->b_flags &= ~(XBF_ASYNC | XBF_READ); xfs_buf_delwri_dequeue(bp); - xfs_buf_iostrategy(bp); + xfs_bdstrat_cb(bp); error = xfs_buf_iowait(bp); if (error) @@ -1040,7 +1008,6 @@ xfs_bdwrite( { trace_xfs_buf_bdwrite(bp, _RET_IP_); - bp->b_strat = xfs_bdstrat_cb; bp->b_mount = mp; bp->b_flags &= ~XBF_READ; @@ -1075,7 +1042,6 @@ xfs_bioerror( XFS_BUF_UNDONE(bp); XFS_BUF_STALE(bp); - XFS_BUF_CLR_BDSTRAT_FUNC(bp); xfs_biodone(bp); return EIO; @@ -1105,7 +1071,6 @@ xfs_bioerror_relse( XFS_BUF_DONE(bp); XFS_BUF_STALE(bp); XFS_BUF_CLR_IODONE_FUNC(bp); - XFS_BUF_CLR_BDSTRAT_FUNC(bp); if (!(fl & XBF_ASYNC)) { /* * Mark b_error and B_ERROR _both_. @@ -1311,8 +1276,19 @@ submit_io: if (size) goto next_chunk; } else { - bio_put(bio); + /* + * if we get here, no pages were added to the bio. However, + * we can't just error out here - if the pages are locked then + * we have to unlock them otherwise we can hang on a later + * access to the page. 
+ */ xfs_buf_ioerror(bp, EIO); + if (bp->b_flags & _XBF_PAGE_LOCKED) { + int i; + for (i = 0; i < bp->b_page_count; i++) + unlock_page(bp->b_pages[i]); + } + bio_put(bio); } } @@ -1762,6 +1738,7 @@ xfs_buf_runall_queues( STATIC int xfsbufd_wakeup( + struct shrinker *shrink, int priority, gfp_t mask) { @@ -1803,7 +1780,7 @@ xfs_buf_delwri_split( trace_xfs_buf_delwri_split(bp, _RET_IP_); ASSERT(bp->b_flags & XBF_DELWRI); - if (!xfs_buf_ispin(bp) && !xfs_buf_cond_lock(bp)) { + if (!XFS_BUF_ISPINNED(bp) && !xfs_buf_cond_lock(bp)) { if (!force && time_before(jiffies, bp->b_queuetime + age)) { xfs_buf_unlock(bp); @@ -1888,7 +1865,7 @@ xfsbufd( struct xfs_buf *bp; bp = list_first_entry(&tmp, struct xfs_buf, b_list); list_del_init(&bp->b_list); - xfs_buf_iostrategy(bp); + xfs_bdstrat_cb(bp); count++; } if (count) @@ -1935,7 +1912,7 @@ xfs_flush_buftarg( bp->b_flags &= ~XBF_ASYNC; list_add(&bp->b_list, &wait_list); } - xfs_buf_iostrategy(bp); + xfs_bdstrat_cb(bp); } if (wait) { diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index 5fbecefa5dfd..d072e5ff923b 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -44,57 +44,57 @@ typedef enum { XBRW_ZERO = 3, /* Zero target memory */ } xfs_buf_rw_t; -typedef enum { - XBF_READ = (1 << 0), /* buffer intended for reading from device */ - XBF_WRITE = (1 << 1), /* buffer intended for writing to device */ - XBF_MAPPED = (1 << 2), /* buffer mapped (b_addr valid) */ - XBF_ASYNC = (1 << 4), /* initiator will not wait for completion */ - XBF_DONE = (1 << 5), /* all pages in the buffer uptodate */ - XBF_DELWRI = (1 << 6), /* buffer has dirty pages */ - XBF_STALE = (1 << 7), /* buffer has been staled, do not find it */ - XBF_FS_MANAGED = (1 << 8), /* filesystem controls freeing memory */ - XBF_ORDERED = (1 << 11), /* use ordered writes */ - XBF_READ_AHEAD = (1 << 12), /* asynchronous read-ahead */ - XBF_LOG_BUFFER = (1 << 13), /* this is a buffer used for the log */ - - /* flags used only as arguments to access routines */ - XBF_LOCK = (1 << 14), /* lock requested */ - XBF_TRYLOCK = (1 << 15), /* lock requested, but do not wait */ - XBF_DONT_BLOCK = (1 << 16), /* do not block in current thread */ - - /* flags used only internally */ - _XBF_PAGE_CACHE = (1 << 17),/* backed by pagecache */ - _XBF_PAGES = (1 << 18), /* backed by refcounted pages */ - _XBF_RUN_QUEUES = (1 << 19),/* run block device task queue */ - _XBF_DELWRI_Q = (1 << 21), /* buffer on delwri queue */ - - /* - * Special flag for supporting metadata blocks smaller than a FSB. - * - * In this case we can have multiple xfs_buf_t on a single page and - * need to lock out concurrent xfs_buf_t readers as they only - * serialise access to the buffer. - * - * If the FSB size >= PAGE_CACHE_SIZE case, we have no serialisation - * between reads of the page. Hence we can have one thread read the - * page and modify it, but then race with another thread that thinks - * the page is not up-to-date and hence reads it again. - * - * The result is that the first modifcation to the page is lost. - * This sort of AGF/AGI reading race can happen when unlinking inodes - * that require truncation and results in the AGI unlinked list - * modifications being lost. - */ - _XBF_PAGE_LOCKED = (1 << 22), - - /* - * If we try a barrier write, but it fails we have to communicate - * this to the upper layers. Unfortunately b_error gets overwritten - * when the buffer is re-issued so we have to add another flag to - * keep this information. 
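The xfs_buf_flags_t conversion removed above and completed just below drops the enum in favour of #defines over a plain unsigned int typedef. A tiny illustration of the idea, under the assumption that the point is to keep OR-ed flag words out of enum-typed variables; all names are made up.

typedef unsigned int example_flags_t;

#define EXAMPLE_READ	(1 << 0)
#define EXAMPLE_ASYNC	(1 << 1)

static void example_submit(example_flags_t flags)
{
	/* OR-ed combinations stay ordinary integers, no enum conversions needed */
	if (flags & EXAMPLE_ASYNC)
		return;		/* caller will not wait for completion */
	/* ... wait for the I/O here ... */
}

/* example: example_submit(EXAMPLE_READ | EXAMPLE_ASYNC); */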
- */ - _XFS_BARRIER_FAILED = (1 << 23), -} xfs_buf_flags_t; +#define XBF_READ (1 << 0) /* buffer intended for reading from device */ +#define XBF_WRITE (1 << 1) /* buffer intended for writing to device */ +#define XBF_MAPPED (1 << 2) /* buffer mapped (b_addr valid) */ +#define XBF_ASYNC (1 << 4) /* initiator will not wait for completion */ +#define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */ +#define XBF_DELWRI (1 << 6) /* buffer has dirty pages */ +#define XBF_STALE (1 << 7) /* buffer has been staled, do not find it */ +#define XBF_FS_MANAGED (1 << 8) /* filesystem controls freeing memory */ +#define XBF_ORDERED (1 << 11)/* use ordered writes */ +#define XBF_READ_AHEAD (1 << 12)/* asynchronous read-ahead */ +#define XBF_LOG_BUFFER (1 << 13)/* this is a buffer used for the log */ + +/* flags used only as arguments to access routines */ +#define XBF_LOCK (1 << 14)/* lock requested */ +#define XBF_TRYLOCK (1 << 15)/* lock requested, but do not wait */ +#define XBF_DONT_BLOCK (1 << 16)/* do not block in current thread */ + +/* flags used only internally */ +#define _XBF_PAGE_CACHE (1 << 17)/* backed by pagecache */ +#define _XBF_PAGES (1 << 18)/* backed by refcounted pages */ +#define _XBF_RUN_QUEUES (1 << 19)/* run block device task queue */ +#define _XBF_DELWRI_Q (1 << 21)/* buffer on delwri queue */ + +/* + * Special flag for supporting metadata blocks smaller than a FSB. + * + * In this case we can have multiple xfs_buf_t on a single page and + * need to lock out concurrent xfs_buf_t readers as they only + * serialise access to the buffer. + * + * If the FSB size >= PAGE_CACHE_SIZE case, we have no serialisation + * between reads of the page. Hence we can have one thread read the + * page and modify it, but then race with another thread that thinks + * the page is not up-to-date and hence reads it again. + * + * The result is that the first modifcation to the page is lost. + * This sort of AGF/AGI reading race can happen when unlinking inodes + * that require truncation and results in the AGI unlinked list + * modifications being lost. + */ +#define _XBF_PAGE_LOCKED (1 << 22) + +/* + * If we try a barrier write, but it fails we have to communicate + * this to the upper layers. Unfortunately b_error gets overwritten + * when the buffer is re-issued so we have to add another flag to + * keep this information. + */ +#define _XFS_BARRIER_FAILED (1 << 23) + +typedef unsigned int xfs_buf_flags_t; #define XFS_BUF_FLAGS \ { XBF_READ, "READ" }, \ @@ -187,7 +187,6 @@ typedef struct xfs_buf { atomic_t b_io_remaining; /* #outstanding I/O requests */ xfs_buf_iodone_t b_iodone; /* I/O completion function */ xfs_buf_relse_t b_relse; /* releasing function */ - xfs_buf_bdstrat_t b_strat; /* pre-write function */ struct completion b_iowait; /* queue for I/O waiters */ void *b_fspriv; void *b_fspriv2; @@ -245,11 +244,6 @@ extern int xfs_buf_iowait(xfs_buf_t *); extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *, xfs_buf_rw_t); -static inline int xfs_buf_iostrategy(xfs_buf_t *bp) -{ - return bp->b_strat ? bp->b_strat(bp) : xfs_buf_iorequest(bp); -} - static inline int xfs_buf_geterror(xfs_buf_t *bp) { return bp ? 
bp->b_error : ENOMEM; @@ -258,11 +252,6 @@ static inline int xfs_buf_geterror(xfs_buf_t *bp) /* Buffer Utility Routines */ extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t); -/* Pinning Buffer Storage in Memory */ -extern void xfs_buf_pin(xfs_buf_t *); -extern void xfs_buf_unpin(xfs_buf_t *); -extern int xfs_buf_ispin(xfs_buf_t *); - /* Delayed Write Buffer Routines */ extern void xfs_buf_delwri_dequeue(xfs_buf_t *); extern void xfs_buf_delwri_promote(xfs_buf_t *); @@ -326,8 +315,6 @@ extern void xfs_buf_terminate(void); #define XFS_BUF_IODONE_FUNC(bp) ((bp)->b_iodone) #define XFS_BUF_SET_IODONE_FUNC(bp, func) ((bp)->b_iodone = (func)) #define XFS_BUF_CLR_IODONE_FUNC(bp) ((bp)->b_iodone = NULL) -#define XFS_BUF_SET_BDSTRAT_FUNC(bp, func) ((bp)->b_strat = (func)) -#define XFS_BUF_CLR_BDSTRAT_FUNC(bp) ((bp)->b_strat = NULL) #define XFS_BUF_FSPRIVATE(bp, type) ((type)(bp)->b_fspriv) #define XFS_BUF_SET_FSPRIVATE(bp, val) ((bp)->b_fspriv = (void*)(val)) @@ -351,7 +338,7 @@ extern void xfs_buf_terminate(void); #define XFS_BUF_SET_VTYPE(bp, type) do { } while (0) #define XFS_BUF_SET_REF(bp, ref) do { } while (0) -#define XFS_BUF_ISPINNED(bp) xfs_buf_ispin(bp) +#define XFS_BUF_ISPINNED(bp) atomic_read(&((bp)->b_pin_count)) #define XFS_BUF_VALUSEMA(bp) xfs_buf_lock_value(bp) #define XFS_BUF_CPSEMA(bp) (xfs_buf_cond_lock(bp) == 0) @@ -370,8 +357,6 @@ static inline void xfs_buf_relse(xfs_buf_t *bp) xfs_buf_rele(bp); } -#define xfs_bpin(bp) xfs_buf_pin(bp) -#define xfs_bunpin(bp) xfs_buf_unpin(bp) #define xfs_biodone(bp) xfs_buf_ioend(bp, 0) #define xfs_biomove(bp, off, len, data, rw) \ diff --git a/fs/xfs/linux-2.6/xfs_dmapi_priv.h b/fs/xfs/linux-2.6/xfs_dmapi_priv.h deleted file mode 100644 index a8b0b1685eed..000000000000 --- a/fs/xfs/linux-2.6/xfs_dmapi_priv.h +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Copyright (c) 2000-2006 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_DMAPI_PRIV_H__ -#define __XFS_DMAPI_PRIV_H__ - -/* - * Based on IO_ISDIRECT, decide which i_ flag is set. - */ -#define DM_SEM_FLAG_RD(ioflags) (((ioflags) & IO_ISDIRECT) ? 
\ - DM_FLAGS_IMUX : 0) -#define DM_SEM_FLAG_WR (DM_FLAGS_IALLOCSEM_WR | DM_FLAGS_IMUX) - -#endif /*__XFS_DMAPI_PRIV_H__*/ diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c index 846b75aeb2ab..3764d74790ec 100644 --- a/fs/xfs/linux-2.6/xfs_export.c +++ b/fs/xfs/linux-2.6/xfs_export.c @@ -23,13 +23,13 @@ #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_export.h" #include "xfs_vnodeops.h" #include "xfs_bmap_btree.h" #include "xfs_inode.h" #include "xfs_inode_item.h" +#include "xfs_trace.h" /* * Note that we only accept fileids which are long enough rather than allow @@ -128,13 +128,11 @@ xfs_nfs_get_inode( return ERR_PTR(-ESTALE); /* - * The XFS_IGET_BULKSTAT means that an invalid inode number is just - * fine and not an indication of a corrupted filesystem. Because - * clients can send any kind of invalid file handle, e.g. after - * a restore on the server we have to deal with this case gracefully. + * The XFS_IGET_UNTRUSTED means that an invalid inode number is just + * fine and not an indication of a corrupted filesystem as clients can + * send invalid file handles and we have to handle it gracefully.. */ - error = xfs_iget(mp, NULL, ino, XFS_IGET_BULKSTAT, - XFS_ILOCK_SHARED, &ip, 0); + error = xfs_iget(mp, NULL, ino, XFS_IGET_UNTRUSTED, 0, &ip); if (error) { /* * EINVAL means the inode cluster doesn't exist anymore. @@ -149,11 +147,10 @@ xfs_nfs_get_inode( } if (ip->i_d.di_gen != generation) { - xfs_iput_new(ip, XFS_ILOCK_SHARED); + IRELE(ip); return ERR_PTR(-ENOENT); } - xfs_iunlock(ip, XFS_ILOCK_SHARED); return VFS_I(ip); } diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index d8fb1b5d6cb5..ba8ad422a165 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -22,23 +22,15 @@ #include "xfs_inum.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" #include "xfs_trans.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" #include "xfs_alloc.h" -#include "xfs_btree.h" -#include "xfs_attr_sf.h" -#include "xfs_dir2_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_inode_item.h" #include "xfs_bmap.h" #include "xfs_error.h" -#include "xfs_rw.h" #include "xfs_vnodeops.h" #include "xfs_da_btree.h" #include "xfs_ioctl.h" @@ -100,15 +92,15 @@ xfs_iozero( STATIC int xfs_file_fsync( struct file *file, - struct dentry *dentry, int datasync) { - struct xfs_inode *ip = XFS_I(dentry->d_inode); + struct inode *inode = file->f_mapping->host; + struct xfs_inode *ip = XFS_I(inode); struct xfs_trans *tp; int error = 0; int log_flushed = 0; - xfs_itrace_entry(ip); + trace_xfs_file_fsync(ip); if (XFS_FORCED_SHUTDOWN(ip->i_mount)) return -XFS_ERROR(EIO); @@ -140,8 +132,8 @@ xfs_file_fsync( * might gets cleared when the inode gets written out via the AIL * or xfs_iflush_cluster. */ - if (((dentry->d_inode->i_state & I_DIRTY_DATASYNC) || - ((dentry->d_inode->i_state & I_DIRTY_SYNC) && !datasync)) && + if (((inode->i_state & I_DIRTY_DATASYNC) || + ((inode->i_state & I_DIRTY_SYNC) && !datasync)) && ip->i_update_core) { /* * Kick off a transaction to log the inode core to get the @@ -166,8 +158,7 @@ xfs_file_fsync( * transaction. So we play it safe and fire off the * transaction anyway. 
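The fsync check in this hunk only kicks off the inode-core transaction when the dirty state matters for the call at hand: I_DIRTY_DATASYNC always forces it, while plain I_DIRTY_SYNC does so only for a full fsync, not fdatasync (the real test additionally requires ip->i_update_core). A small standalone model of that predicate, with made-up flag values standing in for the VFS bits:

#include <stdio.h>

/* illustrative stand-ins for the VFS dirty-state bits */
#define DIRTY_SYNC      (1u << 0)   /* timestamps etc. changed */
#define DIRTY_DATASYNC  (1u << 1)   /* data-critical metadata changed */

/* simplified mirror of the check in xfs_file_fsync: log the inode core? */
static int needs_inode_log(unsigned int state, int datasync)
{
        return (state & DIRTY_DATASYNC) ||
               ((state & DIRTY_SYNC) && !datasync);
}

int main(void)
{
        printf("%d\n", needs_inode_log(DIRTY_SYNC, 1));       /* 0: fdatasync skips it */
        printf("%d\n", needs_inode_log(DIRTY_SYNC, 0));       /* 1: full fsync logs it */
        printf("%d\n", needs_inode_log(DIRTY_DATASYNC, 1));   /* 1: always logged */
        return 0;
}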
*/ - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - xfs_trans_ihold(tp, ip); + xfs_trans_ijoin(tp, ip); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); xfs_trans_set_sync(tp); error = _xfs_trans_commit(tp, 0, &log_flushed); @@ -275,20 +266,6 @@ xfs_file_aio_read( mutex_lock(&inode->i_mutex); xfs_ilock(ip, XFS_IOLOCK_SHARED); - if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) { - int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags); - int iolock = XFS_IOLOCK_SHARED; - - ret = -XFS_SEND_DATA(mp, DM_EVENT_READ, ip, iocb->ki_pos, size, - dmflags, &iolock); - if (ret) { - xfs_iunlock(ip, XFS_IOLOCK_SHARED); - if (unlikely(ioflags & IO_ISDIRECT)) - mutex_unlock(&inode->i_mutex); - return ret; - } - } - if (unlikely(ioflags & IO_ISDIRECT)) { if (inode->i_mapping->nrpages) { ret = -xfs_flushinval_pages(ip, @@ -321,7 +298,6 @@ xfs_file_splice_read( unsigned int flags) { struct xfs_inode *ip = XFS_I(infilp->f_mapping->host); - struct xfs_mount *mp = ip->i_mount; int ioflags = 0; ssize_t ret; @@ -335,18 +311,6 @@ xfs_file_splice_read( xfs_ilock(ip, XFS_IOLOCK_SHARED); - if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) { - int iolock = XFS_IOLOCK_SHARED; - int error; - - error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip, *ppos, count, - FILP_DELAY_FLAG(infilp), &iolock); - if (error) { - xfs_iunlock(ip, XFS_IOLOCK_SHARED); - return -error; - } - } - trace_xfs_file_splice_read(ip, count, *ppos, ioflags); ret = generic_file_splice_read(infilp, ppos, pipe, count, flags); @@ -367,7 +331,6 @@ xfs_file_splice_write( { struct inode *inode = outfilp->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); - struct xfs_mount *mp = ip->i_mount; xfs_fsize_t isize, new_size; int ioflags = 0; ssize_t ret; @@ -382,18 +345,6 @@ xfs_file_splice_write( xfs_ilock(ip, XFS_IOLOCK_EXCL); - if (DM_EVENT_ENABLED(ip, DM_EVENT_WRITE) && !(ioflags & IO_INVIS)) { - int iolock = XFS_IOLOCK_EXCL; - int error; - - error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip, *ppos, count, - FILP_DELAY_FLAG(outfilp), &iolock); - if (error) { - xfs_iunlock(ip, XFS_IOLOCK_EXCL); - return -error; - } - } - new_size = *ppos + count; xfs_ilock(ip, XFS_ILOCK_EXCL); @@ -463,7 +414,7 @@ xfs_zero_last_block( last_fsb = XFS_B_TO_FSBT(mp, isize); nimaps = 1; error = xfs_bmapi(NULL, ip, last_fsb, 1, 0, NULL, 0, &imap, - &nimaps, NULL, NULL); + &nimaps, NULL); if (error) { return error; } @@ -558,7 +509,7 @@ xfs_zero_eof( nimaps = 1; zero_count_fsb = end_zero_fsb - start_zero_fsb + 1; error = xfs_bmapi(NULL, ip, start_zero_fsb, zero_count_fsb, - 0, NULL, 0, &imap, &nimaps, NULL, NULL); + 0, NULL, 0, &imap, &nimaps, NULL); if (error) { ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); return error; @@ -627,7 +578,6 @@ xfs_file_aio_write( int ioflags = 0; xfs_fsize_t isize, new_size; int iolock; - int eventsent = 0; size_t ocount = 0, count; int need_i_mutex; @@ -673,33 +623,6 @@ start: goto out_unlock_mutex; } - if ((DM_EVENT_ENABLED(ip, DM_EVENT_WRITE) && - !(ioflags & IO_INVIS) && !eventsent)) { - int dmflags = FILP_DELAY_FLAG(file); - - if (need_i_mutex) - dmflags |= DM_FLAGS_IMUX; - - xfs_iunlock(ip, XFS_ILOCK_EXCL); - error = XFS_SEND_DATA(ip->i_mount, DM_EVENT_WRITE, ip, - pos, count, dmflags, &iolock); - if (error) { - goto out_unlock_internal; - } - xfs_ilock(ip, XFS_ILOCK_EXCL); - eventsent = 1; - - /* - * The iolock was dropped and reacquired in XFS_SEND_DATA - * so we have to recheck the size when appending. 
- * We will only "goto start;" once, since having sent the - * event prevents another call to XFS_SEND_DATA, which is - * what allows the size to change in the first place. - */ - if ((file->f_flags & O_APPEND) && pos != ip->i_size) - goto start; - } - if (ioflags & IO_ISDIRECT) { xfs_buftarg_t *target = XFS_IS_REALTIME_INODE(ip) ? @@ -830,22 +753,6 @@ write_retry: xfs_iunlock(ip, XFS_ILOCK_EXCL); } - if (ret == -ENOSPC && - DM_EVENT_ENABLED(ip, DM_EVENT_NOSPACE) && !(ioflags & IO_INVIS)) { - xfs_iunlock(ip, iolock); - if (need_i_mutex) - mutex_unlock(&inode->i_mutex); - error = XFS_SEND_NAMESP(ip->i_mount, DM_EVENT_NOSPACE, ip, - DM_RIGHT_NULL, ip, DM_RIGHT_NULL, NULL, NULL, - 0, 0, 0); /* Delay flag intentionally unused */ - if (need_i_mutex) - mutex_lock(&inode->i_mutex); - xfs_ilock(ip, iolock); - if (error) - goto out_unlock_internal; - goto start; - } - error = -ret; if (ret <= 0) goto out_unlock_internal; @@ -868,7 +775,7 @@ write_retry: mutex_lock(&inode->i_mutex); xfs_ilock(ip, iolock); - error2 = -xfs_file_fsync(file, file->f_path.dentry, + error2 = -xfs_file_fsync(file, (file->f_flags & __O_SYNC) ? 0 : 1); if (!error) error = error2; @@ -1014,9 +921,6 @@ const struct file_operations xfs_file_operations = { .open = xfs_file_open, .release = xfs_file_release, .fsync = xfs_file_fsync, -#ifdef HAVE_FOP_OPEN_EXEC - .open_exec = xfs_file_open_exec, -#endif }; const struct file_operations xfs_dir_file_operations = { diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c index b6918d76bc7b..1f279b012f94 100644 --- a/fs/xfs/linux-2.6/xfs_fs_subr.c +++ b/fs/xfs/linux-2.6/xfs_fs_subr.c @@ -21,10 +21,6 @@ #include "xfs_inode.h" #include "xfs_trace.h" -int fs_noerr(void) { return 0; } -int fs_nosys(void) { return ENOSYS; } -void fs_noval(void) { return; } - /* * note: all filemap functions return negative error codes. These * need to be inverted before returning to the xfs core functions. diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.h b/fs/xfs/linux-2.6/xfs_fs_subr.h deleted file mode 100644 index 82bb19b2599e..000000000000 --- a/fs/xfs/linux-2.6/xfs_fs_subr.h +++ /dev/null @@ -1,25 +0,0 @@ -/* - * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_FS_SUBR_H__ -#define __XFS_FS_SUBR_H__ - -extern int fs_noerr(void); -extern int fs_nosys(void); -extern void fs_noval(void); - -#endif /* __XFS_FS_SUBR_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c index 699b60cbab9c..237f5ffb2ee8 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/linux-2.6/xfs_ioctl.c @@ -23,24 +23,15 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" #include "xfs_alloc.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_attr_sf.h" -#include "xfs_dir2_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_ioctl.h" -#include "xfs_btree.h" -#include "xfs_ialloc.h" #include "xfs_rtalloc.h" #include "xfs_itable.h" #include "xfs_error.h" -#include "xfs_rw.h" #include "xfs_attr.h" #include "xfs_bmap.h" #include "xfs_buf_item.h" @@ -679,10 +670,9 @@ xfs_ioc_bulkstat( error = xfs_bulkstat_single(mp, &inlast, bulkreq.ubuffer, &done); else /* XFS_IOC_FSBULKSTAT */ - error = xfs_bulkstat(mp, &inlast, &count, - (bulkstat_one_pf)xfs_bulkstat_one, NULL, - sizeof(xfs_bstat_t), bulkreq.ubuffer, - BULKSTAT_FG_QUICK, &done); + error = xfs_bulkstat(mp, &inlast, &count, xfs_bulkstat_one, + sizeof(xfs_bstat_t), bulkreq.ubuffer, + &done); if (error) return -error; @@ -909,7 +899,7 @@ xfs_ioctl_setattr( struct xfs_dquot *olddquot = NULL; int code; - xfs_itrace_entry(ip); + trace_xfs_ioctl_setattr(ip); if (mp->m_flags & XFS_MOUNT_RDONLY) return XFS_ERROR(EROFS); @@ -1044,8 +1034,7 @@ xfs_ioctl_setattr( } } - xfs_trans_ijoin(tp, ip, lock_flags); - xfs_trans_ihold(tp, ip); + xfs_trans_ijoin(tp, ip); /* * Change file ownership. Must be the owner or privileged. @@ -1117,16 +1106,7 @@ xfs_ioctl_setattr( xfs_qm_dqrele(udqp); xfs_qm_dqrele(gdqp); - if (code) - return code; - - if (DM_EVENT_ENABLED(ip, DM_EVENT_ATTRIBUTE)) { - XFS_SEND_NAMESP(mp, DM_EVENT_ATTRIBUTE, ip, DM_RIGHT_NULL, - NULL, DM_RIGHT_NULL, NULL, NULL, 0, 0, - (mask & FSX_NONBLOCK) ? DM_FLAGS_NDELAY : 0); - } - - return 0; + return code; error_return: xfs_qm_dqrele(udqp); @@ -1302,7 +1282,7 @@ xfs_file_ioctl( if (filp->f_mode & FMODE_NOCMTIME) ioflags |= IO_INVIS; - xfs_itrace_entry(ip); + trace_xfs_file_ioctl(ip); switch (cmd) { case XFS_IOC_ALLOCSP: diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c index 9287135e9bfc..6c83f7f62dc9 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl32.c +++ b/fs/xfs/linux-2.6/xfs_ioctl32.c @@ -28,12 +28,8 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_bmap_btree.h" -#include "xfs_attr_sf.h" -#include "xfs_dir2_sf.h" #include "xfs_vnode.h" #include "xfs_dinode.h" #include "xfs_inode.h" @@ -237,15 +233,12 @@ xfs_bulkstat_one_compat( xfs_ino_t ino, /* inode number to get data for */ void __user *buffer, /* buffer to place output in */ int ubsize, /* size of buffer */ - void *private_data, /* my private data */ - xfs_daddr_t bno, /* starting bno of inode cluster */ int *ubused, /* bytes used by me */ - void *dibuff, /* on-disk inode buffer */ int *stat) /* BULKSTAT_RV_... 
*/ { return xfs_bulkstat_one_int(mp, ino, buffer, ubsize, - xfs_bulkstat_one_fmt_compat, bno, - ubused, dibuff, stat); + xfs_bulkstat_one_fmt_compat, + ubused, stat); } /* copied from xfs_ioctl.c */ @@ -298,13 +291,11 @@ xfs_compat_ioc_bulkstat( int res; error = xfs_bulkstat_one_compat(mp, inlast, bulkreq.ubuffer, - sizeof(compat_xfs_bstat_t), - NULL, 0, NULL, NULL, &res); + sizeof(compat_xfs_bstat_t), 0, &res); } else if (cmd == XFS_IOC_FSBULKSTAT_32) { error = xfs_bulkstat(mp, &inlast, &count, - xfs_bulkstat_one_compat, NULL, - sizeof(compat_xfs_bstat_t), bulkreq.ubuffer, - BULKSTAT_FG_QUICK, &done); + xfs_bulkstat_one_compat, sizeof(compat_xfs_bstat_t), + bulkreq.ubuffer, &done); } else error = XFS_ERROR(EINVAL); if (error) @@ -549,7 +540,7 @@ xfs_file_compat_ioctl( if (filp->f_mode & FMODE_NOCMTIME) ioflags |= IO_INVIS; - xfs_itrace_entry(ip); + trace_xfs_file_compat_ioctl(ip); switch (cmd) { /* No size or alignment issues on any arch */ diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index 9c8019c78c92..536b81e63a3d 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -24,21 +24,13 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" #include "xfs_alloc.h" -#include "xfs_dmapi.h" #include "xfs_quota.h" #include "xfs_mount.h" #include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_bmap.h" -#include "xfs_btree.h" -#include "xfs_ialloc.h" #include "xfs_rtalloc.h" #include "xfs_error.h" #include "xfs_itable.h" @@ -496,7 +488,7 @@ xfs_vn_getattr( struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; - xfs_itrace_entry(ip); + trace_xfs_getattr(ip); if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); @@ -585,11 +577,20 @@ xfs_vn_fallocate( bf.l_len = len; xfs_ilock(ip, XFS_IOLOCK_EXCL); + + /* check the new inode size is valid before allocating */ + if (!(mode & FALLOC_FL_KEEP_SIZE) && + offset + len > i_size_read(inode)) { + new_size = offset + len; + error = inode_newsize_ok(inode, new_size); + if (error) + goto out_unlock; + } + error = -xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf, 0, XFS_ATTR_NOLOCK); - if (!error && !(mode & FALLOC_FL_KEEP_SIZE) && - offset + len > i_size_read(inode)) - new_size = offset + len; + if (error) + goto out_unlock; /* Change file size if needed */ if (new_size) { @@ -600,6 +601,7 @@ xfs_vn_fallocate( error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK); } +out_unlock: xfs_iunlock(ip, XFS_IOLOCK_EXCL); out_error: return error; diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h index facfb323a706..998a9d7fb9c8 100644 --- a/fs/xfs/linux-2.6/xfs_linux.h +++ b/fs/xfs/linux-2.6/xfs_linux.h @@ -87,7 +87,6 @@ #include <xfs_aops.h> #include <xfs_super.h> #include <xfs_globals.h> -#include <xfs_fs_subr.h> #include <xfs_buf.h> /* diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c index 9ac8aea91529..bfd5ac9d1f6f 100644 --- a/fs/xfs/linux-2.6/xfs_quotaops.c +++ b/fs/xfs/linux-2.6/xfs_quotaops.c @@ -16,14 +16,12 @@ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" -#include "xfs_dmapi.h" #include "xfs_sb.h" #include "xfs_inum.h" #include "xfs_log.h" #include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_quota.h" -#include "xfs_log.h" #include "xfs_trans.h" #include "xfs_bmap_btree.h" #include "xfs_inode.h" diff --git 
a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index f2d1718c9165..758df94690ed 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -25,14 +25,11 @@ #include "xfs_ag.h" #include "xfs_dir2.h" #include "xfs_alloc.h" -#include "xfs_dmapi.h" #include "xfs_quota.h" #include "xfs_mount.h" #include "xfs_bmap_btree.h" #include "xfs_alloc_btree.h" #include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_btree.h" @@ -43,7 +40,6 @@ #include "xfs_error.h" #include "xfs_itable.h" #include "xfs_fsops.h" -#include "xfs_rw.h" #include "xfs_attr.h" #include "xfs_buf_item.h" #include "xfs_utils.h" @@ -94,7 +90,6 @@ mempool_t *xfs_ioend_pool; #define MNTOPT_BARRIER "barrier" /* use writer barriers for log write and * unwritten extent conversion */ #define MNTOPT_NOBARRIER "nobarrier" /* .. disable */ -#define MNTOPT_OSYNCISOSYNC "osyncisosync" /* o_sync is REALLY o_sync */ #define MNTOPT_64BITINODE "inode64" /* inodes can be allocated anywhere */ #define MNTOPT_IKEEP "ikeep" /* do not free empty inode clusters */ #define MNTOPT_NOIKEEP "noikeep" /* free empty inode clusters */ @@ -116,9 +111,6 @@ mempool_t *xfs_ioend_pool; #define MNTOPT_GQUOTANOENF "gqnoenforce"/* group quota limit enforcement */ #define MNTOPT_PQUOTANOENF "pqnoenforce"/* project quota limit enforcement */ #define MNTOPT_QUOTANOENF "qnoenforce" /* same as uqnoenforce */ -#define MNTOPT_DMAPI "dmapi" /* DMI enabled (DMAPI / XDSM) */ -#define MNTOPT_XDSM "xdsm" /* DMI enabled (DMAPI / XDSM) */ -#define MNTOPT_DMI "dmi" /* DMI enabled (DMAPI / XDSM) */ #define MNTOPT_DELAYLOG "delaylog" /* Delayed loging enabled */ #define MNTOPT_NODELAYLOG "nodelaylog" /* Delayed loging disabled */ @@ -172,15 +164,13 @@ suffix_strtoul(char *s, char **endp, unsigned int base) STATIC int xfs_parseargs( struct xfs_mount *mp, - char *options, - char **mtpt) + char *options) { struct super_block *sb = mp->m_super; char *this_char, *value, *eov; int dsunit = 0; int dswidth = 0; int iosize = 0; - int dmapi_implies_ikeep = 1; __uint8_t iosizelog = 0; /* @@ -243,15 +233,10 @@ xfs_parseargs( if (!mp->m_logname) return ENOMEM; } else if (!strcmp(this_char, MNTOPT_MTPT)) { - if (!value || !*value) { - cmn_err(CE_WARN, - "XFS: %s option requires an argument", - this_char); - return EINVAL; - } - *mtpt = kstrndup(value, MAXNAMELEN, GFP_KERNEL); - if (!*mtpt) - return ENOMEM; + cmn_err(CE_WARN, + "XFS: %s option not allowed on this system", + this_char); + return EINVAL; } else if (!strcmp(this_char, MNTOPT_RTDEV)) { if (!value || !*value) { cmn_err(CE_WARN, @@ -288,8 +273,6 @@ xfs_parseargs( mp->m_flags &= ~XFS_MOUNT_GRPID; } else if (!strcmp(this_char, MNTOPT_WSYNC)) { mp->m_flags |= XFS_MOUNT_WSYNC; - } else if (!strcmp(this_char, MNTOPT_OSYNCISOSYNC)) { - mp->m_flags |= XFS_MOUNT_OSYNCISOSYNC; } else if (!strcmp(this_char, MNTOPT_NORECOVERY)) { mp->m_flags |= XFS_MOUNT_NORECOVERY; } else if (!strcmp(this_char, MNTOPT_NOALIGN)) { @@ -329,7 +312,6 @@ xfs_parseargs( } else if (!strcmp(this_char, MNTOPT_IKEEP)) { mp->m_flags |= XFS_MOUNT_IKEEP; } else if (!strcmp(this_char, MNTOPT_NOIKEEP)) { - dmapi_implies_ikeep = 0; mp->m_flags &= ~XFS_MOUNT_IKEEP; } else if (!strcmp(this_char, MNTOPT_LARGEIO)) { mp->m_flags &= ~XFS_MOUNT_COMPAT_IOSIZE; @@ -370,12 +352,6 @@ xfs_parseargs( } else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) { mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE); mp->m_qflags &= ~XFS_OQUOTA_ENFD; - } else if (!strcmp(this_char, 
MNTOPT_DMAPI)) { - mp->m_flags |= XFS_MOUNT_DMAPI; - } else if (!strcmp(this_char, MNTOPT_XDSM)) { - mp->m_flags |= XFS_MOUNT_DMAPI; - } else if (!strcmp(this_char, MNTOPT_DMI)) { - mp->m_flags |= XFS_MOUNT_DMAPI; } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) { mp->m_flags |= XFS_MOUNT_DELAYLOG; cmn_err(CE_WARN, @@ -387,9 +363,11 @@ xfs_parseargs( cmn_err(CE_WARN, "XFS: ihashsize no longer used, option is deprecated."); } else if (!strcmp(this_char, "osyncisdsync")) { - /* no-op, this is now the default */ cmn_err(CE_WARN, - "XFS: osyncisdsync is now the default, option is deprecated."); + "XFS: osyncisdsync has no effect, option is deprecated."); + } else if (!strcmp(this_char, "osyncisosync")) { + cmn_err(CE_WARN, + "XFS: osyncisosync has no effect, option is deprecated."); } else if (!strcmp(this_char, "irixsgid")) { cmn_err(CE_WARN, "XFS: irixsgid is now a sysctl(2) variable, option is deprecated."); @@ -430,12 +408,6 @@ xfs_parseargs( return EINVAL; } - if ((mp->m_flags & XFS_MOUNT_DMAPI) && (!*mtpt || *mtpt[0] == '\0')) { - printk("XFS: %s option needs the mount point option as well\n", - MNTOPT_DMAPI); - return EINVAL; - } - if ((dsunit && !dswidth) || (!dsunit && dswidth)) { cmn_err(CE_WARN, "XFS: sunit and swidth must be specified together"); @@ -449,18 +421,6 @@ xfs_parseargs( return EINVAL; } - /* - * Applications using DMI filesystems often expect the - * inode generation number to be monotonically increasing. - * If we delete inode chunks we break this assumption, so - * keep unused inode chunks on disk for DMI filesystems - * until we come up with a better solution. - * Note that if "ikeep" or "noikeep" mount options are - * supplied, then they are honored. - */ - if ((mp->m_flags & XFS_MOUNT_DMAPI) && dmapi_implies_ikeep) - mp->m_flags |= XFS_MOUNT_IKEEP; - done: if (!(mp->m_flags & XFS_MOUNT_NOALIGN)) { /* @@ -539,10 +499,8 @@ xfs_showargs( { XFS_MOUNT_SWALLOC, "," MNTOPT_SWALLOC }, { XFS_MOUNT_NOUUID, "," MNTOPT_NOUUID }, { XFS_MOUNT_NORECOVERY, "," MNTOPT_NORECOVERY }, - { XFS_MOUNT_OSYNCISOSYNC, "," MNTOPT_OSYNCISOSYNC }, { XFS_MOUNT_ATTR2, "," MNTOPT_ATTR2 }, { XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM }, - { XFS_MOUNT_DMAPI, "," MNTOPT_DMAPI }, { XFS_MOUNT_GRPID, "," MNTOPT_GRPID }, { XFS_MOUNT_DELAYLOG, "," MNTOPT_DELAYLOG }, { 0, NULL } @@ -947,7 +905,7 @@ xfs_fs_destroy_inode( { struct xfs_inode *ip = XFS_I(inode); - xfs_itrace_entry(ip); + trace_xfs_destroy_inode(ip); XFS_STATS_INC(vn_reclaim); @@ -1063,10 +1021,8 @@ xfs_log_inode( * an inode in another recent transaction. So we play it safe and * fire off the transaction anyway. */ - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - xfs_trans_ihold(tp, ip); + xfs_trans_ijoin(tp, ip); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - xfs_trans_set_sync(tp); error = xfs_trans_commit(tp, 0); xfs_ilock_demote(ip, XFS_ILOCK_EXCL); @@ -1082,27 +1038,18 @@ xfs_fs_write_inode( struct xfs_mount *mp = ip->i_mount; int error = EAGAIN; - xfs_itrace_entry(ip); + trace_xfs_write_inode(ip); if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); if (wbc->sync_mode == WB_SYNC_ALL) { /* - * Make sure the inode has hit stable storage. By using the - * log and the fsync transactions we reduce the IOs we have - * to do here from two (log and inode) to just the log. - * - * Note: We still need to do a delwri write of the inode after - * this to flush it to the backing buffer so that bulkstat - * works properly if this is the first time the inode has been - * written. 
Because we hold the ilock atomically over the - * transaction commit and the inode flush we are guaranteed - * that the inode is not pinned when it returns. If the flush - * lock is already held, then the inode has already been - * flushed once and we don't need to flush it again. Hence - * the code will only flush the inode if it isn't already - * being flushed. + * Make sure the inode has made it it into the log. Instead + * of forcing it all the way to stable storage using a + * synchronous transaction we let the log force inside the + * ->sync_fs call do that for thus, which reduces the number + * of synchronous log foces dramatically. */ xfs_ioend_wait(ip); xfs_ilock(ip, XFS_ILOCK_SHARED); @@ -1116,27 +1063,29 @@ xfs_fs_write_inode( * We make this non-blocking if the inode is contended, return * EAGAIN to indicate to the caller that they did not succeed. * This prevents the flush path from blocking on inodes inside - * another operation right now, they get caught later by xfs_sync. + * another operation right now, they get caught later by + * xfs_sync. */ if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) goto out; - } - if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip)) - goto out_unlock; + if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip)) + goto out_unlock; - /* - * Now we have the flush lock and the inode is not pinned, we can check - * if the inode is really clean as we know that there are no pending - * transaction completions, it is not waiting on the delayed write - * queue and there is no IO in progress. - */ - if (xfs_inode_clean(ip)) { - xfs_ifunlock(ip); - error = 0; - goto out_unlock; + /* + * Now we have the flush lock and the inode is not pinned, we + * can check if the inode is really clean as we know that + * there are no pending transaction completions, it is not + * waiting on the delayed write queue and there is no IO in + * progress. + */ + if (xfs_inode_clean(ip)) { + xfs_ifunlock(ip); + error = 0; + goto out_unlock; + } + error = xfs_iflush(ip, 0); } - error = xfs_iflush(ip, 0); out_unlock: xfs_iunlock(ip, XFS_ILOCK_SHARED); @@ -1156,7 +1105,8 @@ xfs_fs_clear_inode( { xfs_inode_t *ip = XFS_I(inode); - xfs_itrace_entry(ip); + trace_xfs_clear_inode(ip); + XFS_STATS_INC(vn_rele); XFS_STATS_INC(vn_remove); XFS_STATS_DEC(vn_active); @@ -1193,22 +1143,13 @@ xfs_fs_put_super( { struct xfs_mount *mp = XFS_M(sb); + /* + * Unregister the memory shrinker before we tear down the mount + * structure so we don't have memory reclaim racing with us here. + */ + xfs_inode_shrinker_unregister(mp); xfs_syncd_stop(mp); - if (!(sb->s_flags & MS_RDONLY)) { - /* - * XXX(hch): this should be SYNC_WAIT. - * - * Or more likely not needed at all because the VFS is already - * calling ->sync_fs after shutting down all filestem - * operations and just before calling ->put_super. - */ - xfs_sync_data(mp, 0); - xfs_sync_attr(mp, 0); - } - - XFS_SEND_PREUNMOUNT(mp); - /* * Blow away any referenced inode in the filestreams cache. 
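The comment just added to xfs_fs_put_super unregisters a per-mount inode shrinker before the mount structure is torn down; later in this diff the old global xfs_mount_list shrinker is replaced by a struct shrinker embedded in struct xfs_mount and recovered with container_of() inside the callback. A simplified userspace model of that embed-and-recover pattern, with illustrative types rather than the kernel ones:

#include <stddef.h>
#include <stdio.h>

/* minimal stand-in for the kernel's container_of() */
#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct shrinker {                       /* illustrative callback handle */
        int (*shrink)(struct shrinker *s, int nr_to_scan);
};

struct mount {                          /* the owning object embeds the handle */
        int reclaimable;
        struct shrinker inode_shrink;
};

static int mount_shrink(struct shrinker *s, int nr_to_scan)
{
        /* recover the per-mount state instead of walking a global list */
        struct mount *m = container_of(s, struct mount, inode_shrink);

        if (nr_to_scan > m->reclaimable)
                nr_to_scan = m->reclaimable;
        m->reclaimable -= nr_to_scan;
        return m->reclaimable;
}

int main(void)
{
        struct mount m = { .reclaimable = 10 };

        m.inode_shrink.shrink = mount_shrink;
        printf("left: %d\n", m.inode_shrink.shrink(&m.inode_shrink, 4));  /* left: 6 */
        return 0;
}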
* This can and will cause log traffic as inodes go inactive @@ -1218,14 +1159,10 @@ xfs_fs_put_super( XFS_bflush(mp->m_ddev_targp); - XFS_SEND_UNMOUNT(mp); - xfs_unmountfs(mp); xfs_freesb(mp); - xfs_inode_shrinker_unregister(mp); xfs_icsb_destroy_counters(mp); xfs_close_devices(mp); - xfs_dmops_put(mp); xfs_free_fsname(mp); kfree(mp); } @@ -1543,7 +1480,6 @@ xfs_fs_fill_super( struct inode *root; struct xfs_mount *mp = NULL; int flags = 0, error = ENOMEM; - char *mtpt = NULL; mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL); if (!mp) @@ -1559,7 +1495,7 @@ xfs_fs_fill_super( mp->m_super = sb; sb->s_fs_info = mp; - error = xfs_parseargs(mp, (char *)data, &mtpt); + error = xfs_parseargs(mp, (char *)data); if (error) goto out_free_fsname; @@ -1571,16 +1507,12 @@ xfs_fs_fill_super( #endif sb->s_op = &xfs_super_operations; - error = xfs_dmops_get(mp); - if (error) - goto out_free_fsname; - if (silent) flags |= XFS_MFSI_QUIET; error = xfs_open_devices(mp); if (error) - goto out_put_dmops; + goto out_free_fsname; if (xfs_icsb_init_counters(mp)) mp->m_flags |= XFS_MOUNT_NO_PERCPU_SB; @@ -1608,8 +1540,6 @@ xfs_fs_fill_super( if (error) goto out_filestream_unmount; - XFS_SEND_MOUNT(mp, DM_RIGHT_NULL, mtpt, mp->m_fsname); - sb->s_magic = XFS_SB_MAGIC; sb->s_blocksize = mp->m_sb.sb_blocksize; sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1; @@ -1638,7 +1568,6 @@ xfs_fs_fill_super( xfs_inode_shrinker_register(mp); - kfree(mtpt); return 0; out_filestream_unmount: @@ -1648,11 +1577,8 @@ xfs_fs_fill_super( out_destroy_counters: xfs_icsb_destroy_counters(mp); xfs_close_devices(mp); - out_put_dmops: - xfs_dmops_put(mp); out_free_fsname: xfs_free_fsname(mp); - kfree(mtpt); kfree(mp); out: return -error; @@ -1759,6 +1685,12 @@ xfs_init_zones(void) if (!xfs_trans_zone) goto out_destroy_ifork_zone; + xfs_log_item_desc_zone = + kmem_zone_init(sizeof(struct xfs_log_item_desc), + "xfs_log_item_desc"); + if (!xfs_log_item_desc_zone) + goto out_destroy_trans_zone; + /* * The size of the zone allocated buf log item is the maximum * size possible under XFS. 
This wastes a little bit of memory, @@ -1768,7 +1700,7 @@ xfs_init_zones(void) (((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) / NBWORD) * sizeof(int))), "xfs_buf_item"); if (!xfs_buf_item_zone) - goto out_destroy_trans_zone; + goto out_destroy_log_item_desc_zone; xfs_efd_zone = kmem_zone_init((sizeof(xfs_efd_log_item_t) + ((XFS_EFD_MAX_FAST_EXTENTS - 1) * @@ -1805,6 +1737,8 @@ xfs_init_zones(void) kmem_zone_destroy(xfs_efd_zone); out_destroy_buf_item_zone: kmem_zone_destroy(xfs_buf_item_zone); + out_destroy_log_item_desc_zone: + kmem_zone_destroy(xfs_log_item_desc_zone); out_destroy_trans_zone: kmem_zone_destroy(xfs_trans_zone); out_destroy_ifork_zone: @@ -1835,6 +1769,7 @@ xfs_destroy_zones(void) kmem_zone_destroy(xfs_efi_zone); kmem_zone_destroy(xfs_efd_zone); kmem_zone_destroy(xfs_buf_item_zone); + kmem_zone_destroy(xfs_log_item_desc_zone); kmem_zone_destroy(xfs_trans_zone); kmem_zone_destroy(xfs_ifork_zone); kmem_zone_destroy(xfs_dabuf_zone); @@ -1883,7 +1818,6 @@ init_xfs_fs(void) goto out_cleanup_procfs; vfs_initquota(); - xfs_inode_shrinker_init(); error = register_filesystem(&xfs_fs_type); if (error) @@ -1911,7 +1845,6 @@ exit_xfs_fs(void) { vfs_exitquota(); unregister_filesystem(&xfs_fs_type); - xfs_inode_shrinker_destroy(); xfs_sysctl_unregister(); xfs_cleanup_procfs(); xfs_buf_terminate(); diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h index 519618e9279e..1ef4a4d2d997 100644 --- a/fs/xfs/linux-2.6/xfs_super.h +++ b/fs/xfs/linux-2.6/xfs_super.h @@ -56,12 +56,6 @@ extern void xfs_qm_exit(void); # define XFS_BIGFS_STRING #endif -#ifdef CONFIG_XFS_DMAPI -# define XFS_DMAPI_STRING "dmapi support, " -#else -# define XFS_DMAPI_STRING -#endif - #ifdef DEBUG # define XFS_DBG_STRING "debug" #else @@ -72,7 +66,6 @@ extern void xfs_qm_exit(void); XFS_SECURITY_STRING \ XFS_REALTIME_STRING \ XFS_BIGFS_STRING \ - XFS_DMAPI_STRING \ XFS_DBG_STRING /* DBG must be last */ struct xfs_inode; diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index 3884e20bc14e..dfcbd98d1599 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -24,25 +24,14 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_inode.h" #include "xfs_dinode.h" #include "xfs_error.h" -#include "xfs_mru_cache.h" #include "xfs_filestream.h" #include "xfs_vnodeops.h" -#include "xfs_utils.h" -#include "xfs_buf_item.h" #include "xfs_inode_item.h" -#include "xfs_rw.h" #include "xfs_quota.h" #include "xfs_trace.h" @@ -144,6 +133,41 @@ restart: return last_error; } +/* + * Select the next per-ag structure to iterate during the walk. The reclaim + * walk is optimised only to walk AGs with reclaimable inodes in them. 
+ */ +static struct xfs_perag * +xfs_inode_ag_iter_next_pag( + struct xfs_mount *mp, + xfs_agnumber_t *first, + int tag) +{ + struct xfs_perag *pag = NULL; + + if (tag == XFS_ICI_RECLAIM_TAG) { + int found; + int ref; + + spin_lock(&mp->m_perag_lock); + found = radix_tree_gang_lookup_tag(&mp->m_perag_tree, + (void **)&pag, *first, 1, tag); + if (found <= 0) { + spin_unlock(&mp->m_perag_lock); + return NULL; + } + *first = pag->pag_agno + 1; + /* open coded pag reference increment */ + ref = atomic_inc_return(&pag->pag_ref); + spin_unlock(&mp->m_perag_lock); + trace_xfs_perag_get_reclaim(mp, pag->pag_agno, ref, _RET_IP_); + } else { + pag = xfs_perag_get(mp, *first); + (*first)++; + } + return pag; +} + int xfs_inode_ag_iterator( struct xfs_mount *mp, @@ -154,20 +178,15 @@ xfs_inode_ag_iterator( int exclusive, int *nr_to_scan) { + struct xfs_perag *pag; int error = 0; int last_error = 0; xfs_agnumber_t ag; int nr; nr = nr_to_scan ? *nr_to_scan : INT_MAX; - for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) { - struct xfs_perag *pag; - - pag = xfs_perag_get(mp, ag); - if (!pag->pag_ici_init) { - xfs_perag_put(pag); - continue; - } + ag = 0; + while ((pag = xfs_inode_ag_iter_next_pag(mp, &ag, tag))) { error = xfs_inode_ag_walk(mp, pag, execute, flags, tag, exclusive, &nr); xfs_perag_put(pag); @@ -289,7 +308,7 @@ xfs_sync_inode_attr( /* * Write out pagecache data for the whole filesystem. */ -int +STATIC int xfs_sync_data( struct xfs_mount *mp, int flags) @@ -310,7 +329,7 @@ xfs_sync_data( /* * Write out inode metadata (attributes) for the whole filesystem. */ -int +STATIC int xfs_sync_attr( struct xfs_mount *mp, int flags) @@ -343,8 +362,7 @@ xfs_commit_dummy_trans( xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - xfs_trans_ihold(tp, ip); + xfs_trans_ijoin(tp, ip); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); error = xfs_trans_commit(tp, 0); xfs_iunlock(ip, XFS_ILOCK_EXCL); @@ -644,6 +662,17 @@ __xfs_inode_set_reclaim_tag( radix_tree_tag_set(&pag->pag_ici_root, XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), XFS_ICI_RECLAIM_TAG); + + if (!pag->pag_ici_reclaimable) { + /* propagate the reclaim tag up into the perag radix tree */ + spin_lock(&ip->i_mount->m_perag_lock); + radix_tree_tag_set(&ip->i_mount->m_perag_tree, + XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), + XFS_ICI_RECLAIM_TAG); + spin_unlock(&ip->i_mount->m_perag_lock); + trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno, + -1, _RET_IP_); + } pag->pag_ici_reclaimable++; } @@ -678,6 +707,16 @@ __xfs_inode_clear_reclaim_tag( radix_tree_tag_clear(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); pag->pag_ici_reclaimable--; + if (!pag->pag_ici_reclaimable) { + /* clear the reclaim tag from the perag radix tree */ + spin_lock(&ip->i_mount->m_perag_lock); + radix_tree_tag_clear(&ip->i_mount->m_perag_tree, + XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), + XFS_ICI_RECLAIM_TAG); + spin_unlock(&ip->i_mount->m_perag_lock); + trace_xfs_perag_clear_reclaim(ip->i_mount, pag->pag_agno, + -1, _RET_IP_); + } } /* @@ -816,7 +855,36 @@ out: reclaim: xfs_ifunlock(ip); xfs_iunlock(ip, XFS_ILOCK_EXCL); - xfs_ireclaim(ip); + + XFS_STATS_INC(xs_ig_reclaims); + /* + * Remove the inode from the per-AG radix tree. + * + * Because radix_tree_delete won't complain even if the item was never + * added to the tree assert that it's been there before to catch + * problems with the inode life time early on. 
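The reclaim walk introduced here avoids touching allocation groups with nothing to reclaim by keeping a tag in the per-mount radix tree: the tag is set when an AG's reclaimable count goes from zero to non-zero, cleared when it drops back to zero, and the iterator only visits tagged entries. A condensed userspace model of that bookkeeping, with a plain array standing in for the radix tree:

#include <stdio.h>

#define NR_AGS 4

struct perag {
        int reclaimable;        /* # of reclaimable inodes in this AG */
        int tagged;             /* stand-in for the radix-tree reclaim tag */
};

static struct perag ags[NR_AGS];

static void set_reclaim(int agno)
{
        if (!ags[agno].reclaimable)
                ags[agno].tagged = 1;   /* 0 -> 1 transition: tag the AG */
        ags[agno].reclaimable++;
}

static void clear_reclaim(int agno)
{
        ags[agno].reclaimable--;
        if (!ags[agno].reclaimable)
                ags[agno].tagged = 0;   /* last one gone: untag the AG */
}

/* return the next tagged AG at or after *first, or -1; advances *first */
static int next_reclaim_ag(int *first)
{
        int agno;

        for (agno = *first; agno < NR_AGS; agno++) {
                if (ags[agno].tagged) {
                        *first = agno + 1;
                        return agno;
                }
        }
        return -1;
}

int main(void)
{
        int first = 0, agno;

        set_reclaim(2);
        set_reclaim(2);
        clear_reclaim(2);
        set_reclaim(3);

        while ((agno = next_reclaim_ag(&first)) >= 0)
                printf("walk AG %d (%d reclaimable)\n", agno, ags[agno].reclaimable);
        return 0;
}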
+ */ + write_lock(&pag->pag_ici_lock); + if (!radix_tree_delete(&pag->pag_ici_root, + XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino))) + ASSERT(0); + write_unlock(&pag->pag_ici_lock); + + /* + * Here we do an (almost) spurious inode lock in order to coordinate + * with inode cache radix tree lookups. This is because the lookup + * can reference the inodes in the cache without taking references. + * + * We make that OK here by ensuring that we wait until the inode is + * unlocked after the lookup before we go ahead and free it. We get + * both the ilock and the iolock because the code may need to drop the + * ilock one but will still hold the iolock. + */ + xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + xfs_qm_dqdetach(ip); + xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + + xfs_inode_free(ip); return error; } @@ -832,88 +900,52 @@ xfs_reclaim_inodes( /* * Shrinker infrastructure. - * - * This is all far more complex than it needs to be. It adds a global list of - * mounts because the shrinkers can only call a global context. We need to make - * the shrinkers pass a context to avoid the need for global state. */ -static LIST_HEAD(xfs_mount_list); -static struct rw_semaphore xfs_mount_list_lock; - static int xfs_reclaim_inode_shrink( + struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) { struct xfs_mount *mp; struct xfs_perag *pag; xfs_agnumber_t ag; - int reclaimable = 0; + int reclaimable; + mp = container_of(shrink, struct xfs_mount, m_inode_shrink); if (nr_to_scan) { if (!(gfp_mask & __GFP_FS)) return -1; - down_read(&xfs_mount_list_lock); - list_for_each_entry(mp, &xfs_mount_list, m_mplist) { - xfs_inode_ag_iterator(mp, xfs_reclaim_inode, 0, + xfs_inode_ag_iterator(mp, xfs_reclaim_inode, 0, XFS_ICI_RECLAIM_TAG, 1, &nr_to_scan); - if (nr_to_scan <= 0) - break; - } - up_read(&xfs_mount_list_lock); - } - - down_read(&xfs_mount_list_lock); - list_for_each_entry(mp, &xfs_mount_list, m_mplist) { - for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) { + /* if we don't exhaust the scan, don't bother coming back */ + if (nr_to_scan > 0) + return -1; + } - pag = xfs_perag_get(mp, ag); - if (!pag->pag_ici_init) { - xfs_perag_put(pag); - continue; - } - reclaimable += pag->pag_ici_reclaimable; - xfs_perag_put(pag); - } + reclaimable = 0; + ag = 0; + while ((pag = xfs_inode_ag_iter_next_pag(mp, &ag, + XFS_ICI_RECLAIM_TAG))) { + reclaimable += pag->pag_ici_reclaimable; + xfs_perag_put(pag); } - up_read(&xfs_mount_list_lock); return reclaimable; } -static struct shrinker xfs_inode_shrinker = { - .shrink = xfs_reclaim_inode_shrink, - .seeks = DEFAULT_SEEKS, -}; - -void __init -xfs_inode_shrinker_init(void) -{ - init_rwsem(&xfs_mount_list_lock); - register_shrinker(&xfs_inode_shrinker); -} - -void -xfs_inode_shrinker_destroy(void) -{ - ASSERT(list_empty(&xfs_mount_list)); - unregister_shrinker(&xfs_inode_shrinker); -} - void xfs_inode_shrinker_register( struct xfs_mount *mp) { - down_write(&xfs_mount_list_lock); - list_add_tail(&mp->m_mplist, &xfs_mount_list); - up_write(&xfs_mount_list_lock); + mp->m_inode_shrink.shrink = xfs_reclaim_inode_shrink; + mp->m_inode_shrink.seeks = DEFAULT_SEEKS; + register_shrinker(&mp->m_inode_shrink); } void xfs_inode_shrinker_unregister( struct xfs_mount *mp) { - down_write(&xfs_mount_list_lock); - list_del(&mp->m_mplist); - up_write(&xfs_mount_list_lock); + unregister_shrinker(&mp->m_inode_shrink); } diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h index cdcbaaca9880..fe78726196f8 100644 --- a/fs/xfs/linux-2.6/xfs_sync.h +++ 
b/fs/xfs/linux-2.6/xfs_sync.h @@ -35,9 +35,6 @@ typedef struct xfs_sync_work { int xfs_syncd_init(struct xfs_mount *mp); void xfs_syncd_stop(struct xfs_mount *mp); -int xfs_sync_attr(struct xfs_mount *mp, int flags); -int xfs_sync_data(struct xfs_mount *mp, int flags); - int xfs_quiesce_data(struct xfs_mount *mp); void xfs_quiesce_attr(struct xfs_mount *mp); @@ -55,8 +52,6 @@ int xfs_inode_ag_iterator(struct xfs_mount *mp, int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags), int flags, int tag, int write_lock, int *nr_to_scan); -void xfs_inode_shrinker_init(void); -void xfs_inode_shrinker_destroy(void); void xfs_inode_shrinker_register(struct xfs_mount *mp); void xfs_inode_shrinker_unregister(struct xfs_mount *mp); diff --git a/fs/xfs/linux-2.6/xfs_trace.c b/fs/xfs/linux-2.6/xfs_trace.c index 207fa77f63ae..88d25d4aa56e 100644 --- a/fs/xfs/linux-2.6/xfs_trace.c +++ b/fs/xfs/linux-2.6/xfs_trace.c @@ -24,17 +24,13 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" #include "xfs_da_btree.h" #include "xfs_bmap_btree.h" #include "xfs_alloc_btree.h" #include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_btree.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_ialloc.h" #include "xfs_itable.h" @@ -50,7 +46,6 @@ #include "quota/xfs_dquot_item.h" #include "quota/xfs_dquot.h" #include "xfs_log_recover.h" -#include "xfs_buf_item.h" #include "xfs_inode_item.h" /* diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h index ff6bc797baf2..c657cdca2cd2 100644 --- a/fs/xfs/linux-2.6/xfs_trace.h +++ b/fs/xfs/linux-2.6/xfs_trace.h @@ -82,33 +82,6 @@ DECLARE_EVENT_CLASS(xfs_attr_list_class, ) ) -#define DEFINE_PERAG_REF_EVENT(name) \ -TRACE_EVENT(name, \ - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount, \ - unsigned long caller_ip), \ - TP_ARGS(mp, agno, refcount, caller_ip), \ - TP_STRUCT__entry( \ - __field(dev_t, dev) \ - __field(xfs_agnumber_t, agno) \ - __field(int, refcount) \ - __field(unsigned long, caller_ip) \ - ), \ - TP_fast_assign( \ - __entry->dev = mp->m_super->s_dev; \ - __entry->agno = agno; \ - __entry->refcount = refcount; \ - __entry->caller_ip = caller_ip; \ - ), \ - TP_printk("dev %d:%d agno %u refcount %d caller %pf", \ - MAJOR(__entry->dev), MINOR(__entry->dev), \ - __entry->agno, \ - __entry->refcount, \ - (char *)__entry->caller_ip) \ -); - -DEFINE_PERAG_REF_EVENT(xfs_perag_get) -DEFINE_PERAG_REF_EVENT(xfs_perag_put) - #define DEFINE_ATTR_LIST_EVENT(name) \ DEFINE_EVENT(xfs_attr_list_class, name, \ TP_PROTO(struct xfs_attr_list_context *ctx), \ @@ -122,6 +95,40 @@ DEFINE_ATTR_LIST_EVENT(xfs_attr_list_add); DEFINE_ATTR_LIST_EVENT(xfs_attr_list_wrong_blk); DEFINE_ATTR_LIST_EVENT(xfs_attr_list_notfound); +DECLARE_EVENT_CLASS(xfs_perag_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount, + unsigned long caller_ip), + TP_ARGS(mp, agno, refcount, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(int, refcount) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->refcount = refcount; + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d agno %u refcount %d caller %pf", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->refcount, + (char *)__entry->caller_ip) +); + +#define DEFINE_PERAG_REF_EVENT(name) \ 
+DEFINE_EVENT(xfs_perag_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount, \ + unsigned long caller_ip), \ + TP_ARGS(mp, agno, refcount, caller_ip)) +DEFINE_PERAG_REF_EVENT(xfs_perag_get); +DEFINE_PERAG_REF_EVENT(xfs_perag_get_reclaim); +DEFINE_PERAG_REF_EVENT(xfs_perag_put); +DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim); +DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim); + TRACE_EVENT(xfs_attr_list_node_descend, TP_PROTO(struct xfs_attr_list_context *ctx, struct xfs_da_node_entry *btree), @@ -310,8 +317,6 @@ DEFINE_BUF_EVENT(xfs_buf_init); DEFINE_BUF_EVENT(xfs_buf_free); DEFINE_BUF_EVENT(xfs_buf_hold); DEFINE_BUF_EVENT(xfs_buf_rele); -DEFINE_BUF_EVENT(xfs_buf_pin); -DEFINE_BUF_EVENT(xfs_buf_unpin); DEFINE_BUF_EVENT(xfs_buf_iodone); DEFINE_BUF_EVENT(xfs_buf_iorequest); DEFINE_BUF_EVENT(xfs_buf_bawrite); @@ -534,7 +539,7 @@ DEFINE_LOCK_EVENT(xfs_ilock_nowait); DEFINE_LOCK_EVENT(xfs_ilock_demote); DEFINE_LOCK_EVENT(xfs_iunlock); -DECLARE_EVENT_CLASS(xfs_iget_class, +DECLARE_EVENT_CLASS(xfs_inode_class, TP_PROTO(struct xfs_inode *ip), TP_ARGS(ip), TP_STRUCT__entry( @@ -550,16 +555,38 @@ DECLARE_EVENT_CLASS(xfs_iget_class, __entry->ino) ) -#define DEFINE_IGET_EVENT(name) \ -DEFINE_EVENT(xfs_iget_class, name, \ +#define DEFINE_INODE_EVENT(name) \ +DEFINE_EVENT(xfs_inode_class, name, \ TP_PROTO(struct xfs_inode *ip), \ TP_ARGS(ip)) -DEFINE_IGET_EVENT(xfs_iget_skip); -DEFINE_IGET_EVENT(xfs_iget_reclaim); -DEFINE_IGET_EVENT(xfs_iget_found); -DEFINE_IGET_EVENT(xfs_iget_alloc); - -DECLARE_EVENT_CLASS(xfs_inode_class, +DEFINE_INODE_EVENT(xfs_iget_skip); +DEFINE_INODE_EVENT(xfs_iget_reclaim); +DEFINE_INODE_EVENT(xfs_iget_reclaim_fail); +DEFINE_INODE_EVENT(xfs_iget_hit); +DEFINE_INODE_EVENT(xfs_iget_miss); + +DEFINE_INODE_EVENT(xfs_getattr); +DEFINE_INODE_EVENT(xfs_setattr); +DEFINE_INODE_EVENT(xfs_readlink); +DEFINE_INODE_EVENT(xfs_alloc_file_space); +DEFINE_INODE_EVENT(xfs_free_file_space); +DEFINE_INODE_EVENT(xfs_readdir); +#ifdef CONFIG_XFS_POSIX_ACL +DEFINE_INODE_EVENT(xfs_check_acl); +#endif +DEFINE_INODE_EVENT(xfs_vm_bmap); +DEFINE_INODE_EVENT(xfs_file_ioctl); +DEFINE_INODE_EVENT(xfs_file_compat_ioctl); +DEFINE_INODE_EVENT(xfs_ioctl_setattr); +DEFINE_INODE_EVENT(xfs_file_fsync); +DEFINE_INODE_EVENT(xfs_destroy_inode); +DEFINE_INODE_EVENT(xfs_write_inode); +DEFINE_INODE_EVENT(xfs_clear_inode); + +DEFINE_INODE_EVENT(xfs_dquot_dqalloc); +DEFINE_INODE_EVENT(xfs_dquot_dqdetach); + +DECLARE_EVENT_CLASS(xfs_iref_class, TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), TP_ARGS(ip, caller_ip), TP_STRUCT__entry( @@ -584,20 +611,71 @@ DECLARE_EVENT_CLASS(xfs_inode_class, (char *)__entry->caller_ip) ) -#define DEFINE_INODE_EVENT(name) \ -DEFINE_EVENT(xfs_inode_class, name, \ +#define DEFINE_IREF_EVENT(name) \ +DEFINE_EVENT(xfs_iref_class, name, \ TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), \ TP_ARGS(ip, caller_ip)) -DEFINE_INODE_EVENT(xfs_ihold); -DEFINE_INODE_EVENT(xfs_irele); -DEFINE_INODE_EVENT(xfs_inode_pin); -DEFINE_INODE_EVENT(xfs_inode_unpin); -DEFINE_INODE_EVENT(xfs_inode_unpin_nowait); +DEFINE_IREF_EVENT(xfs_ihold); +DEFINE_IREF_EVENT(xfs_irele); +DEFINE_IREF_EVENT(xfs_inode_pin); +DEFINE_IREF_EVENT(xfs_inode_unpin); +DEFINE_IREF_EVENT(xfs_inode_unpin_nowait); + +DECLARE_EVENT_CLASS(xfs_namespace_class, + TP_PROTO(struct xfs_inode *dp, struct xfs_name *name), + TP_ARGS(dp, name), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, dp_ino) + __dynamic_array(char, name, name->len) + ), + TP_fast_assign( + __entry->dev = 
VFS_I(dp)->i_sb->s_dev; + __entry->dp_ino = dp->i_ino; + memcpy(__get_str(name), name->name, name->len); + ), + TP_printk("dev %d:%d dp ino 0x%llx name %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->dp_ino, + __get_str(name)) +) -/* the old xfs_itrace_entry tracer - to be replaced by s.th. in the VFS */ -DEFINE_INODE_EVENT(xfs_inode); -#define xfs_itrace_entry(ip) \ - trace_xfs_inode(ip, _THIS_IP_) +#define DEFINE_NAMESPACE_EVENT(name) \ +DEFINE_EVENT(xfs_namespace_class, name, \ + TP_PROTO(struct xfs_inode *dp, struct xfs_name *name), \ + TP_ARGS(dp, name)) +DEFINE_NAMESPACE_EVENT(xfs_remove); +DEFINE_NAMESPACE_EVENT(xfs_link); +DEFINE_NAMESPACE_EVENT(xfs_lookup); +DEFINE_NAMESPACE_EVENT(xfs_create); +DEFINE_NAMESPACE_EVENT(xfs_symlink); + +TRACE_EVENT(xfs_rename, + TP_PROTO(struct xfs_inode *src_dp, struct xfs_inode *target_dp, + struct xfs_name *src_name, struct xfs_name *target_name), + TP_ARGS(src_dp, target_dp, src_name, target_name), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, src_dp_ino) + __field(xfs_ino_t, target_dp_ino) + __dynamic_array(char, src_name, src_name->len) + __dynamic_array(char, target_name, target_name->len) + ), + TP_fast_assign( + __entry->dev = VFS_I(src_dp)->i_sb->s_dev; + __entry->src_dp_ino = src_dp->i_ino; + __entry->target_dp_ino = target_dp->i_ino; + memcpy(__get_str(src_name), src_name->name, src_name->len); + memcpy(__get_str(target_name), target_name->name, target_name->len); + ), + TP_printk("dev %d:%d src dp ino 0x%llx target dp ino 0x%llx" + " src name %s target name %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->src_dp_ino, + __entry->target_dp_ino, + __get_str(src_name), + __get_str(target_name)) +) DECLARE_EVENT_CLASS(xfs_dquot_class, TP_PROTO(struct xfs_dquot *dqp), @@ -677,9 +755,6 @@ DEFINE_DQUOT_EVENT(xfs_dqrele); DEFINE_DQUOT_EVENT(xfs_dqflush); DEFINE_DQUOT_EVENT(xfs_dqflush_force); DEFINE_DQUOT_EVENT(xfs_dqflush_done); -/* not really iget events, but we re-use the format */ -DEFINE_IGET_EVENT(xfs_dquot_dqalloc); -DEFINE_IGET_EVENT(xfs_dquot_dqdetach); DECLARE_EVENT_CLASS(xfs_loggrant_class, TP_PROTO(struct log *log, struct xlog_ticket *tic), @@ -775,165 +850,177 @@ DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_enter); DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_exit); DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_sub); -#define DEFINE_RW_EVENT(name) \ -TRACE_EVENT(name, \ - TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), \ - TP_ARGS(ip, count, offset, flags), \ - TP_STRUCT__entry( \ - __field(dev_t, dev) \ - __field(xfs_ino_t, ino) \ - __field(xfs_fsize_t, size) \ - __field(xfs_fsize_t, new_size) \ - __field(loff_t, offset) \ - __field(size_t, count) \ - __field(int, flags) \ - ), \ - TP_fast_assign( \ - __entry->dev = VFS_I(ip)->i_sb->s_dev; \ - __entry->ino = ip->i_ino; \ - __entry->size = ip->i_d.di_size; \ - __entry->new_size = ip->i_new_size; \ - __entry->offset = offset; \ - __entry->count = count; \ - __entry->flags = flags; \ - ), \ - TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " \ - "offset 0x%llx count 0x%zx ioflags %s", \ - MAJOR(__entry->dev), MINOR(__entry->dev), \ - __entry->ino, \ - __entry->size, \ - __entry->new_size, \ - __entry->offset, \ - __entry->count, \ - __print_flags(__entry->flags, "|", XFS_IO_FLAGS)) \ +DECLARE_EVENT_CLASS(xfs_file_class, + TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), + TP_ARGS(ip, count, offset, flags), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_fsize_t, size) + 
__field(xfs_fsize_t, new_size) + __field(loff_t, offset) + __field(size_t, count) + __field(int, flags) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->size = ip->i_d.di_size; + __entry->new_size = ip->i_new_size; + __entry->offset = offset; + __entry->count = count; + __entry->flags = flags; + ), + TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " + "offset 0x%llx count 0x%zx ioflags %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->size, + __entry->new_size, + __entry->offset, + __entry->count, + __print_flags(__entry->flags, "|", XFS_IO_FLAGS)) ) + +#define DEFINE_RW_EVENT(name) \ +DEFINE_EVENT(xfs_file_class, name, \ + TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), \ + TP_ARGS(ip, count, offset, flags)) DEFINE_RW_EVENT(xfs_file_read); DEFINE_RW_EVENT(xfs_file_buffered_write); DEFINE_RW_EVENT(xfs_file_direct_write); DEFINE_RW_EVENT(xfs_file_splice_read); DEFINE_RW_EVENT(xfs_file_splice_write); - -#define DEFINE_PAGE_EVENT(name) \ -TRACE_EVENT(name, \ - TP_PROTO(struct inode *inode, struct page *page, unsigned long off), \ - TP_ARGS(inode, page, off), \ - TP_STRUCT__entry( \ - __field(dev_t, dev) \ - __field(xfs_ino_t, ino) \ - __field(pgoff_t, pgoff) \ - __field(loff_t, size) \ - __field(unsigned long, offset) \ - __field(int, delalloc) \ - __field(int, unmapped) \ - __field(int, unwritten) \ - ), \ - TP_fast_assign( \ - int delalloc = -1, unmapped = -1, unwritten = -1; \ - \ - if (page_has_buffers(page)) \ - xfs_count_page_state(page, &delalloc, \ - &unmapped, &unwritten); \ - __entry->dev = inode->i_sb->s_dev; \ - __entry->ino = XFS_I(inode)->i_ino; \ - __entry->pgoff = page_offset(page); \ - __entry->size = i_size_read(inode); \ - __entry->offset = off; \ - __entry->delalloc = delalloc; \ - __entry->unmapped = unmapped; \ - __entry->unwritten = unwritten; \ - ), \ - TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx " \ - "delalloc %d unmapped %d unwritten %d", \ - MAJOR(__entry->dev), MINOR(__entry->dev), \ - __entry->ino, \ - __entry->pgoff, \ - __entry->size, \ - __entry->offset, \ - __entry->delalloc, \ - __entry->unmapped, \ - __entry->unwritten) \ +DECLARE_EVENT_CLASS(xfs_page_class, + TP_PROTO(struct inode *inode, struct page *page, unsigned long off), + TP_ARGS(inode, page, off), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(pgoff_t, pgoff) + __field(loff_t, size) + __field(unsigned long, offset) + __field(int, delalloc) + __field(int, unwritten) + ), + TP_fast_assign( + int delalloc = -1, unwritten = -1; + + if (page_has_buffers(page)) + xfs_count_page_state(page, &delalloc, &unwritten); + __entry->dev = inode->i_sb->s_dev; + __entry->ino = XFS_I(inode)->i_ino; + __entry->pgoff = page_offset(page); + __entry->size = i_size_read(inode); + __entry->offset = off; + __entry->delalloc = delalloc; + __entry->unwritten = unwritten; + ), + TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx " + "delalloc %d unwritten %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->pgoff, + __entry->size, + __entry->offset, + __entry->delalloc, + __entry->unwritten) ) + +#define DEFINE_PAGE_EVENT(name) \ +DEFINE_EVENT(xfs_page_class, name, \ + TP_PROTO(struct inode *inode, struct page *page, unsigned long off), \ + TP_ARGS(inode, page, off)) DEFINE_PAGE_EVENT(xfs_writepage); DEFINE_PAGE_EVENT(xfs_releasepage); DEFINE_PAGE_EVENT(xfs_invalidatepage); -#define DEFINE_IOMAP_EVENT(name) \ 
-TRACE_EVENT(name, \ - TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \ - int flags, struct xfs_bmbt_irec *irec), \ - TP_ARGS(ip, offset, count, flags, irec), \ - TP_STRUCT__entry( \ - __field(dev_t, dev) \ - __field(xfs_ino_t, ino) \ - __field(loff_t, size) \ - __field(loff_t, new_size) \ - __field(loff_t, offset) \ - __field(size_t, count) \ - __field(int, flags) \ - __field(xfs_fileoff_t, startoff) \ - __field(xfs_fsblock_t, startblock) \ - __field(xfs_filblks_t, blockcount) \ - ), \ - TP_fast_assign( \ - __entry->dev = VFS_I(ip)->i_sb->s_dev; \ - __entry->ino = ip->i_ino; \ - __entry->size = ip->i_d.di_size; \ - __entry->new_size = ip->i_new_size; \ - __entry->offset = offset; \ - __entry->count = count; \ - __entry->flags = flags; \ - __entry->startoff = irec ? irec->br_startoff : 0; \ - __entry->startblock = irec ? irec->br_startblock : 0; \ - __entry->blockcount = irec ? irec->br_blockcount : 0; \ - ), \ - TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " \ - "offset 0x%llx count %zd flags %s " \ - "startoff 0x%llx startblock %lld blockcount 0x%llx", \ - MAJOR(__entry->dev), MINOR(__entry->dev), \ - __entry->ino, \ - __entry->size, \ - __entry->new_size, \ - __entry->offset, \ - __entry->count, \ - __print_flags(__entry->flags, "|", BMAPI_FLAGS), \ - __entry->startoff, \ - (__int64_t)__entry->startblock, \ - __entry->blockcount) \ +DECLARE_EVENT_CLASS(xfs_iomap_class, + TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, + int flags, struct xfs_bmbt_irec *irec), + TP_ARGS(ip, offset, count, flags, irec), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(loff_t, size) + __field(loff_t, new_size) + __field(loff_t, offset) + __field(size_t, count) + __field(int, flags) + __field(xfs_fileoff_t, startoff) + __field(xfs_fsblock_t, startblock) + __field(xfs_filblks_t, blockcount) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->size = ip->i_d.di_size; + __entry->new_size = ip->i_new_size; + __entry->offset = offset; + __entry->count = count; + __entry->flags = flags; + __entry->startoff = irec ? irec->br_startoff : 0; + __entry->startblock = irec ? irec->br_startblock : 0; + __entry->blockcount = irec ? 
irec->br_blockcount : 0; + ), + TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " + "offset 0x%llx count %zd flags %s " + "startoff 0x%llx startblock %lld blockcount 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->size, + __entry->new_size, + __entry->offset, + __entry->count, + __print_flags(__entry->flags, "|", BMAPI_FLAGS), + __entry->startoff, + (__int64_t)__entry->startblock, + __entry->blockcount) ) + +#define DEFINE_IOMAP_EVENT(name) \ +DEFINE_EVENT(xfs_iomap_class, name, \ + TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \ + int flags, struct xfs_bmbt_irec *irec), \ + TP_ARGS(ip, offset, count, flags, irec)) DEFINE_IOMAP_EVENT(xfs_iomap_enter); DEFINE_IOMAP_EVENT(xfs_iomap_found); DEFINE_IOMAP_EVENT(xfs_iomap_alloc); -#define DEFINE_SIMPLE_IO_EVENT(name) \ -TRACE_EVENT(name, \ - TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), \ - TP_ARGS(ip, offset, count), \ - TP_STRUCT__entry( \ - __field(dev_t, dev) \ - __field(xfs_ino_t, ino) \ - __field(loff_t, size) \ - __field(loff_t, new_size) \ - __field(loff_t, offset) \ - __field(size_t, count) \ - ), \ - TP_fast_assign( \ - __entry->dev = VFS_I(ip)->i_sb->s_dev; \ - __entry->ino = ip->i_ino; \ - __entry->size = ip->i_d.di_size; \ - __entry->new_size = ip->i_new_size; \ - __entry->offset = offset; \ - __entry->count = count; \ - ), \ - TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " \ - "offset 0x%llx count %zd", \ - MAJOR(__entry->dev), MINOR(__entry->dev), \ - __entry->ino, \ - __entry->size, \ - __entry->new_size, \ - __entry->offset, \ - __entry->count) \ +DECLARE_EVENT_CLASS(xfs_simple_io_class, + TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), + TP_ARGS(ip, offset, count), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(loff_t, size) + __field(loff_t, new_size) + __field(loff_t, offset) + __field(size_t, count) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->size = ip->i_d.di_size; + __entry->new_size = ip->i_new_size; + __entry->offset = offset; + __entry->count = count; + ), + TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " + "offset 0x%llx count %zd", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->size, + __entry->new_size, + __entry->offset, + __entry->count) ); + +#define DEFINE_SIMPLE_IO_EVENT(name) \ +DEFINE_EVENT(xfs_simple_io_class, name, \ + TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), \ + TP_ARGS(ip, offset, count)) DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc); DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert); diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c index 585e7633dfc7..e1a2f6800e01 100644 --- a/fs/xfs/quota/xfs_dquot.c +++ b/fs/xfs/quota/xfs_dquot.c @@ -23,25 +23,15 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" #include "xfs_alloc.h" -#include "xfs_dmapi.h" #include "xfs_quota.h" #include "xfs_mount.h" #include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" -#include "xfs_dinode.h" #include "xfs_inode.h" -#include "xfs_btree.h" -#include "xfs_ialloc.h" #include "xfs_bmap.h" #include "xfs_rtalloc.h" #include "xfs_error.h" #include "xfs_itable.h" -#include "xfs_rw.h" #include "xfs_attr.h" #include "xfs_buf_item.h" #include "xfs_trans_space.h" @@ -64,8 +54,6 @@ flush lock - ditto. 
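The xfs_trace.h hunks above fold several near-identical TRACE_EVENT definitions into a single DECLARE_EVENT_CLASS plus one-line DEFINE_EVENT instances (xfs_page_class, xfs_iomap_class, xfs_simple_io_class). The kernel tracing macros cannot be built outside the tracing infrastructure, so the snippet below is only a small user-space analogy of that class/instance split; every name in it is invented for illustration.

#include <stdio.h>

/* "event class": the shared body, written once */
#define DECLARE_CLASS(class)                                            \
static void class##_print(const char *name, unsigned long long ino,    \
                          unsigned long off)                            \
{                                                                       \
        printf("%s: ino 0x%llx offset 0x%lx\n", name, ino, off);        \
}

/* "event instance": a one-line wrapper that reuses the class body */
#define DEFINE_EVENT(class, name)                                       \
static void trace_##name(unsigned long long ino, unsigned long off)    \
{                                                                       \
        class##_print(#name, ino, off);                                 \
}

DECLARE_CLASS(page_class)
DEFINE_EVENT(page_class, writepage)
DEFINE_EVENT(page_class, releasepage)

int main(void)
{
        trace_writepage(0x42, 4096);
        trace_releasepage(0x42, 0);
        return 0;
}

Adding another event of the same shape then costs one DEFINE_EVENT line instead of repeating the whole body, which is the point of the conversion above.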
*/ -STATIC void xfs_qm_dqflush_done(xfs_buf_t *, xfs_dq_logitem_t *); - #ifdef DEBUG xfs_buftarg_t *xfs_dqerror_target; int xfs_do_dqerror; @@ -390,21 +378,14 @@ xfs_qm_dqalloc( return (ESRCH); } - /* - * xfs_trans_commit normally decrements the vnode ref count - * when it unlocks the inode. Since we want to keep the quota - * inode around, we bump the vnode ref count now. - */ - IHOLD(quotip); - - xfs_trans_ijoin(tp, quotip, XFS_ILOCK_EXCL); + xfs_trans_ijoin_ref(tp, quotip, XFS_ILOCK_EXCL); nmaps = 1; if ((error = xfs_bmapi(tp, quotip, offset_fsb, XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA | XFS_BMAPI_WRITE, &firstblock, XFS_QM_DQALLOC_SPACE_RES(mp), - &map, &nmaps, &flist, NULL))) { + &map, &nmaps, &flist))) { goto error0; } ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB); @@ -520,7 +501,7 @@ xfs_qm_dqtobp( error = xfs_bmapi(NULL, quotip, dqp->q_fileoffset, XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA, - NULL, 0, &map, &nmaps, NULL, NULL); + NULL, 0, &map, &nmaps, NULL); xfs_iunlock(quotip, XFS_ILOCK_SHARED); if (error) @@ -1141,6 +1122,46 @@ xfs_qm_dqrele( xfs_qm_dqput(dqp); } +/* + * This is the dquot flushing I/O completion routine. It is called + * from interrupt level when the buffer containing the dquot is + * flushed to disk. It is responsible for removing the dquot logitem + * from the AIL if it has not been re-logged, and unlocking the dquot's + * flush lock. This behavior is very similar to that of inodes.. + */ +STATIC void +xfs_qm_dqflush_done( + struct xfs_buf *bp, + struct xfs_log_item *lip) +{ + xfs_dq_logitem_t *qip = (struct xfs_dq_logitem *)lip; + xfs_dquot_t *dqp = qip->qli_dquot; + struct xfs_ail *ailp = lip->li_ailp; + + /* + * We only want to pull the item from the AIL if its + * location in the log has not changed since we started the flush. + * Thus, we only bother if the dquot's lsn has + * not changed. First we check the lsn outside the lock + * since it's cheaper, and then we recheck while + * holding the lock before removing the dquot from the AIL. + */ + if ((lip->li_flags & XFS_LI_IN_AIL) && + lip->li_lsn == qip->qli_flush_lsn) { + + /* xfs_trans_ail_delete() drops the AIL lock. */ + spin_lock(&ailp->xa_lock); + if (lip->li_lsn == qip->qli_flush_lsn) + xfs_trans_ail_delete(ailp, lip); + else + spin_unlock(&ailp->xa_lock); + } + + /* + * Release the dq's flush lock since we're done with it. + */ + xfs_dqfunlock(dqp); +} /* * Write a modified dquot to disk. @@ -1222,8 +1243,9 @@ xfs_qm_dqflush( * Attach an iodone routine so that we can remove this dquot from the * AIL and release the flush lock once the dquot is synced to disk. */ - xfs_buf_attach_iodone(bp, (void(*)(xfs_buf_t *, xfs_log_item_t *)) - xfs_qm_dqflush_done, &(dqp->q_logitem.qli_item)); + xfs_buf_attach_iodone(bp, xfs_qm_dqflush_done, + &dqp->q_logitem.qli_item); + /* * If the buffer is pinned then push on the log so we won't * get stuck waiting in the write for too long. @@ -1247,50 +1269,6 @@ xfs_qm_dqflush( } -/* - * This is the dquot flushing I/O completion routine. It is called - * from interrupt level when the buffer containing the dquot is - * flushed to disk. It is responsible for removing the dquot logitem - * from the AIL if it has not been re-logged, and unlocking the dquot's - * flush lock. This behavior is very similar to that of inodes.. 
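The rewritten xfs_qm_dqflush_done() above keeps the existing "check the lsn outside the lock, then recheck under the lock" pattern while switching to the generic struct xfs_log_item callback signature. Below is a stripped-down, user-space sketch of that double-check idiom; the lock, flags, and lsn values are all hypothetical stand-ins, not the kernel's AIL machinery.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t ail_lock = PTHREAD_MUTEX_INITIALIZER;
static bool in_ail = true;
static long item_lsn = 100, flush_lsn = 100;

static void flush_done(void)
{
        /* cheap check without the lock first ... */
        if (in_ail && item_lsn == flush_lsn) {
                pthread_mutex_lock(&ail_lock);
                /* ... then recheck while holding the lock before acting */
                if (item_lsn == flush_lsn) {
                        in_ail = false;         /* "delete from the AIL" */
                        printf("item pulled from the AIL\n");
                }
                pthread_mutex_unlock(&ail_lock);
        }
}

int main(void)
{
        flush_done();
        return 0;
}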
- */ -/*ARGSUSED*/ -STATIC void -xfs_qm_dqflush_done( - xfs_buf_t *bp, - xfs_dq_logitem_t *qip) -{ - xfs_dquot_t *dqp; - struct xfs_ail *ailp; - - dqp = qip->qli_dquot; - ailp = qip->qli_item.li_ailp; - - /* - * We only want to pull the item from the AIL if its - * location in the log has not changed since we started the flush. - * Thus, we only bother if the dquot's lsn has - * not changed. First we check the lsn outside the lock - * since it's cheaper, and then we recheck while - * holding the lock before removing the dquot from the AIL. - */ - if ((qip->qli_item.li_flags & XFS_LI_IN_AIL) && - qip->qli_item.li_lsn == qip->qli_flush_lsn) { - - /* xfs_trans_ail_delete() drops the AIL lock. */ - spin_lock(&ailp->xa_lock); - if (qip->qli_item.li_lsn == qip->qli_flush_lsn) - xfs_trans_ail_delete(ailp, (xfs_log_item_t*)qip); - else - spin_unlock(&ailp->xa_lock); - } - - /* - * Release the dq's flush lock since we're done with it. - */ - xfs_dqfunlock(dqp); -} - int xfs_qm_dqlock_nowait( xfs_dquot_t *dqp) diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c index 8d89a24ae324..2a1f3dc10a02 100644 --- a/fs/xfs/quota/xfs_dquot_item.c +++ b/fs/xfs/quota/xfs_dquot_item.c @@ -23,42 +23,36 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" #include "xfs_alloc.h" -#include "xfs_dmapi.h" #include "xfs_quota.h" #include "xfs_mount.h" #include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" -#include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_bmap.h" -#include "xfs_btree.h" -#include "xfs_ialloc.h" #include "xfs_rtalloc.h" #include "xfs_error.h" #include "xfs_itable.h" -#include "xfs_rw.h" #include "xfs_attr.h" #include "xfs_buf_item.h" #include "xfs_trans_priv.h" #include "xfs_qm.h" +static inline struct xfs_dq_logitem *DQUOT_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_dq_logitem, qli_item); +} + /* * returns the number of iovecs needed to log the given dquot item. 
*/ -/* ARGSUSED */ STATIC uint xfs_qm_dquot_logitem_size( - xfs_dq_logitem_t *logitem) + struct xfs_log_item *lip) { /* * we need only two iovecs, one for the format, one for the real thing */ - return (2); + return 2; } /* @@ -66,22 +60,21 @@ xfs_qm_dquot_logitem_size( */ STATIC void xfs_qm_dquot_logitem_format( - xfs_dq_logitem_t *logitem, - xfs_log_iovec_t *logvec) + struct xfs_log_item *lip, + struct xfs_log_iovec *logvec) { - ASSERT(logitem); - ASSERT(logitem->qli_dquot); + struct xfs_dq_logitem *qlip = DQUOT_ITEM(lip); - logvec->i_addr = (xfs_caddr_t)&logitem->qli_format; + logvec->i_addr = &qlip->qli_format; logvec->i_len = sizeof(xfs_dq_logformat_t); logvec->i_type = XLOG_REG_TYPE_QFORMAT; logvec++; - logvec->i_addr = (xfs_caddr_t)&logitem->qli_dquot->q_core; + logvec->i_addr = &qlip->qli_dquot->q_core; logvec->i_len = sizeof(xfs_disk_dquot_t); logvec->i_type = XLOG_REG_TYPE_DQUOT; - ASSERT(2 == logitem->qli_item.li_desc->lid_size); - logitem->qli_format.qlf_size = 2; + ASSERT(2 == lip->li_desc->lid_size); + qlip->qli_format.qlf_size = 2; } @@ -90,9 +83,9 @@ xfs_qm_dquot_logitem_format( */ STATIC void xfs_qm_dquot_logitem_pin( - xfs_dq_logitem_t *logitem) + struct xfs_log_item *lip) { - xfs_dquot_t *dqp = logitem->qli_dquot; + struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot; ASSERT(XFS_DQ_IS_LOCKED(dqp)); atomic_inc(&dqp->q_pincount); @@ -104,27 +97,18 @@ xfs_qm_dquot_logitem_pin( * dquot must have been previously pinned with a call to * xfs_qm_dquot_logitem_pin(). */ -/* ARGSUSED */ STATIC void xfs_qm_dquot_logitem_unpin( - xfs_dq_logitem_t *logitem) + struct xfs_log_item *lip, + int remove) { - xfs_dquot_t *dqp = logitem->qli_dquot; + struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot; ASSERT(atomic_read(&dqp->q_pincount) > 0); if (atomic_dec_and_test(&dqp->q_pincount)) wake_up(&dqp->q_pinwait); } -/* ARGSUSED */ -STATIC void -xfs_qm_dquot_logitem_unpin_remove( - xfs_dq_logitem_t *logitem, - xfs_trans_t *tp) -{ - xfs_qm_dquot_logitem_unpin(logitem); -} - /* * Given the logitem, this writes the corresponding dquot entry to disk * asynchronously. This is called with the dquot entry securely locked; @@ -133,12 +117,10 @@ xfs_qm_dquot_logitem_unpin_remove( */ STATIC void xfs_qm_dquot_logitem_push( - xfs_dq_logitem_t *logitem) + struct xfs_log_item *lip) { - xfs_dquot_t *dqp; - int error; - - dqp = logitem->qli_dquot; + struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot; + int error; ASSERT(XFS_DQ_IS_LOCKED(dqp)); ASSERT(!completion_done(&dqp->q_flush)); @@ -160,27 +142,25 @@ xfs_qm_dquot_logitem_push( xfs_dqunlock(dqp); } -/*ARGSUSED*/ STATIC xfs_lsn_t xfs_qm_dquot_logitem_committed( - xfs_dq_logitem_t *l, + struct xfs_log_item *lip, xfs_lsn_t lsn) { /* * We always re-log the entire dquot when it becomes dirty, * so, the latest copy _is_ the only one that matters. */ - return (lsn); + return lsn; } - /* * This is called to wait for the given dquot to be unpinned. * Most of these pin/unpin routines are plagiarized from inode code. 
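The new DQUOT_ITEM() helper introduced above lets each log-item callback accept the generic struct xfs_log_item pointer and recover the enclosing dquot log item with container_of(). The following self-contained user-space sketch shows the same pattern; the struct layouts and field values are invented purely for illustration.

#include <stddef.h>
#include <stdio.h>

/* user-space version of the kernel's container_of() */
#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct log_item { int li_type; };

struct dq_logitem {
        struct log_item qli_item;       /* embedded generic item */
        int qli_id;
};

static struct dq_logitem *DQUOT_ITEM(struct log_item *lip)
{
        return container_of(lip, struct dq_logitem, qli_item);
}

int main(void)
{
        struct dq_logitem dq = { .qli_item = { .li_type = 0x123d }, .qli_id = 7 };
        struct log_item *lip = &dq.qli_item;    /* what the generic ops receive */

        printf("qli_id recovered via container_of: %d\n", DQUOT_ITEM(lip)->qli_id);
        return 0;
}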
*/ void xfs_qm_dqunpin_wait( - xfs_dquot_t *dqp) + struct xfs_dquot *dqp) { ASSERT(XFS_DQ_IS_LOCKED(dqp)); if (atomic_read(&dqp->q_pincount) == 0) @@ -206,13 +186,12 @@ xfs_qm_dqunpin_wait( */ STATIC void xfs_qm_dquot_logitem_pushbuf( - xfs_dq_logitem_t *qip) + struct xfs_log_item *lip) { - xfs_dquot_t *dqp; - xfs_mount_t *mp; - xfs_buf_t *bp; + struct xfs_dq_logitem *qlip = DQUOT_ITEM(lip); + struct xfs_dquot *dqp = qlip->qli_dquot; + struct xfs_buf *bp; - dqp = qip->qli_dquot; ASSERT(XFS_DQ_IS_LOCKED(dqp)); /* @@ -220,22 +199,20 @@ xfs_qm_dquot_logitem_pushbuf( * inode flush completed and the inode was taken off the AIL. * So, just get out. */ - if (completion_done(&dqp->q_flush) || - ((qip->qli_item.li_flags & XFS_LI_IN_AIL) == 0)) { + if (completion_done(&dqp->q_flush) || + !(lip->li_flags & XFS_LI_IN_AIL)) { xfs_dqunlock(dqp); return; } - mp = dqp->q_mount; - bp = xfs_incore(mp->m_ddev_targp, qip->qli_format.qlf_blkno, - mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK); + + bp = xfs_incore(dqp->q_mount->m_ddev_targp, qlip->qli_format.qlf_blkno, + dqp->q_mount->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK); xfs_dqunlock(dqp); if (!bp) return; if (XFS_BUF_ISDELAYWRITE(bp)) xfs_buf_delwri_promote(bp); xfs_buf_relse(bp); - return; - } /* @@ -250,15 +227,14 @@ xfs_qm_dquot_logitem_pushbuf( */ STATIC uint xfs_qm_dquot_logitem_trylock( - xfs_dq_logitem_t *qip) + struct xfs_log_item *lip) { - xfs_dquot_t *dqp; + struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot; - dqp = qip->qli_dquot; if (atomic_read(&dqp->q_pincount) > 0) return XFS_ITEM_PINNED; - if (! xfs_qm_dqlock_nowait(dqp)) + if (!xfs_qm_dqlock_nowait(dqp)) return XFS_ITEM_LOCKED; if (!xfs_dqflock_nowait(dqp)) { @@ -269,11 +245,10 @@ xfs_qm_dquot_logitem_trylock( return XFS_ITEM_PUSHBUF; } - ASSERT(qip->qli_item.li_flags & XFS_LI_IN_AIL); + ASSERT(lip->li_flags & XFS_LI_IN_AIL); return XFS_ITEM_SUCCESS; } - /* * Unlock the dquot associated with the log item. * Clear the fields of the dquot and dquot log item that @@ -282,12 +257,10 @@ xfs_qm_dquot_logitem_trylock( */ STATIC void xfs_qm_dquot_logitem_unlock( - xfs_dq_logitem_t *ql) + struct xfs_log_item *lip) { - xfs_dquot_t *dqp; + struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot; - ASSERT(ql != NULL); - dqp = ql->qli_dquot; ASSERT(XFS_DQ_IS_LOCKED(dqp)); /* @@ -304,43 +277,32 @@ xfs_qm_dquot_logitem_unlock( xfs_dqunlock(dqp); } - /* * this needs to stamp an lsn into the dquot, I think. 
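xfs_qm_dquot_logitem_pin() and _unpin() above keep a plain atomic pin count and wake waiters on the final unpin, and xfs_qm_dqunpin_wait() waits for that count to reach zero. A minimal user-space sketch of the counting side, using C11 atomics instead of the kernel primitives and a printf in place of the wakeup:

#include <stdatomic.h>
#include <stdio.h>

static atomic_int pincount;

static void item_pin(void)
{
        atomic_fetch_add(&pincount, 1);
}

static void item_unpin(void)
{
        /* old value 1 means this was the last unpin; the kernel would
         * wake_up() anyone sleeping on the pin-wait queue at this point */
        if (atomic_fetch_sub(&pincount, 1) == 1)
                printf("last unpin: wake waiters\n");
}

int main(void)
{
        item_pin();
        item_pin();
        item_unpin();
        item_unpin();
        printf("pincount is now %d\n", atomic_load(&pincount));
        return 0;
}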
* rpc's that look at user dquot's would then have to * push on the dependency recorded in the dquot */ -/* ARGSUSED */ STATIC void xfs_qm_dquot_logitem_committing( - xfs_dq_logitem_t *l, + struct xfs_log_item *lip, xfs_lsn_t lsn) { - return; } - /* * This is the ops vector for dquots */ static struct xfs_item_ops xfs_dquot_item_ops = { - .iop_size = (uint(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_size, - .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) - xfs_qm_dquot_logitem_format, - .iop_pin = (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_pin, - .iop_unpin = (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_unpin, - .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*)) - xfs_qm_dquot_logitem_unpin_remove, - .iop_trylock = (uint(*)(xfs_log_item_t*)) - xfs_qm_dquot_logitem_trylock, - .iop_unlock = (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_unlock, - .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t)) - xfs_qm_dquot_logitem_committed, - .iop_push = (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_push, - .iop_pushbuf = (void(*)(xfs_log_item_t*)) - xfs_qm_dquot_logitem_pushbuf, - .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t)) - xfs_qm_dquot_logitem_committing + .iop_size = xfs_qm_dquot_logitem_size, + .iop_format = xfs_qm_dquot_logitem_format, + .iop_pin = xfs_qm_dquot_logitem_pin, + .iop_unpin = xfs_qm_dquot_logitem_unpin, + .iop_trylock = xfs_qm_dquot_logitem_trylock, + .iop_unlock = xfs_qm_dquot_logitem_unlock, + .iop_committed = xfs_qm_dquot_logitem_committed, + .iop_push = xfs_qm_dquot_logitem_push, + .iop_pushbuf = xfs_qm_dquot_logitem_pushbuf, + .iop_committing = xfs_qm_dquot_logitem_committing }; /* @@ -350,10 +312,9 @@ static struct xfs_item_ops xfs_dquot_item_ops = { */ void xfs_qm_dquot_logitem_init( - struct xfs_dquot *dqp) + struct xfs_dquot *dqp) { - xfs_dq_logitem_t *lp; - lp = &dqp->q_logitem; + struct xfs_dq_logitem *lp = &dqp->q_logitem; xfs_log_item_init(dqp->q_mount, &lp->qli_item, XFS_LI_DQUOT, &xfs_dquot_item_ops); @@ -374,16 +335,22 @@ xfs_qm_dquot_logitem_init( /*------------------ QUOTAOFF LOG ITEMS -------------------*/ +static inline struct xfs_qoff_logitem *QOFF_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_qoff_logitem, qql_item); +} + + /* * This returns the number of iovecs needed to log the given quotaoff item. * We only need 1 iovec for an quotaoff item. It just logs the * quotaoff_log_format structure. */ -/*ARGSUSED*/ STATIC uint -xfs_qm_qoff_logitem_size(xfs_qoff_logitem_t *qf) +xfs_qm_qoff_logitem_size( + struct xfs_log_item *lip) { - return (1); + return 1; } /* @@ -394,53 +361,46 @@ xfs_qm_qoff_logitem_size(xfs_qoff_logitem_t *qf) * slots in the quotaoff item have been filled. */ STATIC void -xfs_qm_qoff_logitem_format(xfs_qoff_logitem_t *qf, - xfs_log_iovec_t *log_vector) +xfs_qm_qoff_logitem_format( + struct xfs_log_item *lip, + struct xfs_log_iovec *log_vector) { - ASSERT(qf->qql_format.qf_type == XFS_LI_QUOTAOFF); + struct xfs_qoff_logitem *qflip = QOFF_ITEM(lip); + + ASSERT(qflip->qql_format.qf_type == XFS_LI_QUOTAOFF); - log_vector->i_addr = (xfs_caddr_t)&(qf->qql_format); + log_vector->i_addr = &qflip->qql_format; log_vector->i_len = sizeof(xfs_qoff_logitem_t); log_vector->i_type = XLOG_REG_TYPE_QUOTAOFF; - qf->qql_format.qf_size = 1; + qflip->qql_format.qf_size = 1; } - /* * Pinning has no meaning for an quotaoff item, so just return. 
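With every callback now taking struct xfs_log_item *, the xfs_dquot_item_ops table above is filled in directly instead of through the function-pointer casts that the removed lines used. A small user-space sketch of an ops vector whose handlers already have the generic signature (all names here are invented):

#include <stdio.h>

struct item;

struct item_ops {
        unsigned int (*size)(struct item *);
        void (*format)(struct item *);
};

struct item {
        const struct item_ops *ops;
        const char *name;
};

static unsigned int dquot_size(struct item *item)
{
        (void)item;
        return 2;               /* two iovecs, as in the real callback */
}

static void dquot_format(struct item *item)
{
        printf("formatting %s\n", item->name);
}

/* handlers already match the generic signatures, so no casts are needed */
static const struct item_ops dquot_ops = {
        .size   = dquot_size,
        .format = dquot_format,
};

int main(void)
{
        struct item it = { .ops = &dquot_ops, .name = "dquot" };

        printf("iovecs needed: %u\n", it.ops->size(&it));
        it.ops->format(&it);
        return 0;
}

Keeping the signatures uniform also lets the compiler type-check each table entry, which the old casted initializers defeated.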
*/ -/*ARGSUSED*/ STATIC void -xfs_qm_qoff_logitem_pin(xfs_qoff_logitem_t *qf) +xfs_qm_qoff_logitem_pin( + struct xfs_log_item *lip) { - return; } - /* * Since pinning has no meaning for an quotaoff item, unpinning does * not either. */ -/*ARGSUSED*/ STATIC void -xfs_qm_qoff_logitem_unpin(xfs_qoff_logitem_t *qf) +xfs_qm_qoff_logitem_unpin( + struct xfs_log_item *lip, + int remove) { - return; -} - -/*ARGSUSED*/ -STATIC void -xfs_qm_qoff_logitem_unpin_remove(xfs_qoff_logitem_t *qf, xfs_trans_t *tp) -{ - return; } /* * Quotaoff items have no locking, so just return success. */ -/*ARGSUSED*/ STATIC uint -xfs_qm_qoff_logitem_trylock(xfs_qoff_logitem_t *qf) +xfs_qm_qoff_logitem_trylock( + struct xfs_log_item *lip) { return XFS_ITEM_LOCKED; } @@ -449,53 +409,51 @@ xfs_qm_qoff_logitem_trylock(xfs_qoff_logitem_t *qf) * Quotaoff items have no locking or pushing, so return failure * so that the caller doesn't bother with us. */ -/*ARGSUSED*/ STATIC void -xfs_qm_qoff_logitem_unlock(xfs_qoff_logitem_t *qf) +xfs_qm_qoff_logitem_unlock( + struct xfs_log_item *lip) { - return; } /* * The quotaoff-start-item is logged only once and cannot be moved in the log, * so simply return the lsn at which it's been logged. */ -/*ARGSUSED*/ STATIC xfs_lsn_t -xfs_qm_qoff_logitem_committed(xfs_qoff_logitem_t *qf, xfs_lsn_t lsn) +xfs_qm_qoff_logitem_committed( + struct xfs_log_item *lip, + xfs_lsn_t lsn) { - return (lsn); + return lsn; } /* * There isn't much you can do to push on an quotaoff item. It is simply * stuck waiting for the log to be flushed to disk. */ -/*ARGSUSED*/ STATIC void -xfs_qm_qoff_logitem_push(xfs_qoff_logitem_t *qf) +xfs_qm_qoff_logitem_push( + struct xfs_log_item *lip) { - return; } -/*ARGSUSED*/ STATIC xfs_lsn_t xfs_qm_qoffend_logitem_committed( - xfs_qoff_logitem_t *qfe, - xfs_lsn_t lsn) + struct xfs_log_item *lip, + xfs_lsn_t lsn) { - xfs_qoff_logitem_t *qfs; - struct xfs_ail *ailp; + struct xfs_qoff_logitem *qfe = QOFF_ITEM(lip); + struct xfs_qoff_logitem *qfs = qfe->qql_start_lip; + struct xfs_ail *ailp = qfs->qql_item.li_ailp; - qfs = qfe->qql_start_lip; - ailp = qfs->qql_item.li_ailp; - spin_lock(&ailp->xa_lock); /* * Delete the qoff-start logitem from the AIL. * xfs_trans_ail_delete() drops the AIL lock. */ + spin_lock(&ailp->xa_lock); xfs_trans_ail_delete(ailp, (xfs_log_item_t *)qfs); + kmem_free(qfs); kmem_free(qfe); return (xfs_lsn_t)-1; @@ -515,71 +473,52 @@ xfs_qm_qoffend_logitem_committed( * (truly makes the quotaoff irrevocable). If we do something else, * then maybe we don't need two. 
*/ -/* ARGSUSED */ -STATIC void -xfs_qm_qoff_logitem_committing(xfs_qoff_logitem_t *qip, xfs_lsn_t commit_lsn) -{ - return; -} - -/* ARGSUSED */ STATIC void -xfs_qm_qoffend_logitem_committing(xfs_qoff_logitem_t *qip, xfs_lsn_t commit_lsn) +xfs_qm_qoff_logitem_committing( + struct xfs_log_item *lip, + xfs_lsn_t commit_lsn) { - return; } static struct xfs_item_ops xfs_qm_qoffend_logitem_ops = { - .iop_size = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_size, - .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) - xfs_qm_qoff_logitem_format, - .iop_pin = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_pin, - .iop_unpin = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_unpin, - .iop_unpin_remove = (void(*)(xfs_log_item_t*,xfs_trans_t*)) - xfs_qm_qoff_logitem_unpin_remove, - .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_trylock, - .iop_unlock = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_unlock, - .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t)) - xfs_qm_qoffend_logitem_committed, - .iop_push = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_push, - .iop_pushbuf = NULL, - .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t)) - xfs_qm_qoffend_logitem_committing + .iop_size = xfs_qm_qoff_logitem_size, + .iop_format = xfs_qm_qoff_logitem_format, + .iop_pin = xfs_qm_qoff_logitem_pin, + .iop_unpin = xfs_qm_qoff_logitem_unpin, + .iop_trylock = xfs_qm_qoff_logitem_trylock, + .iop_unlock = xfs_qm_qoff_logitem_unlock, + .iop_committed = xfs_qm_qoffend_logitem_committed, + .iop_push = xfs_qm_qoff_logitem_push, + .iop_committing = xfs_qm_qoff_logitem_committing }; /* * This is the ops vector shared by all quotaoff-start log items. */ static struct xfs_item_ops xfs_qm_qoff_logitem_ops = { - .iop_size = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_size, - .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) - xfs_qm_qoff_logitem_format, - .iop_pin = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_pin, - .iop_unpin = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_unpin, - .iop_unpin_remove = (void(*)(xfs_log_item_t*,xfs_trans_t*)) - xfs_qm_qoff_logitem_unpin_remove, - .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_trylock, - .iop_unlock = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_unlock, - .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t)) - xfs_qm_qoff_logitem_committed, - .iop_push = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_push, - .iop_pushbuf = NULL, - .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t)) - xfs_qm_qoff_logitem_committing + .iop_size = xfs_qm_qoff_logitem_size, + .iop_format = xfs_qm_qoff_logitem_format, + .iop_pin = xfs_qm_qoff_logitem_pin, + .iop_unpin = xfs_qm_qoff_logitem_unpin, + .iop_trylock = xfs_qm_qoff_logitem_trylock, + .iop_unlock = xfs_qm_qoff_logitem_unlock, + .iop_committed = xfs_qm_qoff_logitem_committed, + .iop_push = xfs_qm_qoff_logitem_push, + .iop_committing = xfs_qm_qoff_logitem_committing }; /* * Allocate and initialize an quotaoff item of the correct quota type(s). */ -xfs_qoff_logitem_t * +struct xfs_qoff_logitem * xfs_qm_qoff_logitem_init( - struct xfs_mount *mp, - xfs_qoff_logitem_t *start, - uint flags) + struct xfs_mount *mp, + struct xfs_qoff_logitem *start, + uint flags) { - xfs_qoff_logitem_t *qf; + struct xfs_qoff_logitem *qf; - qf = (xfs_qoff_logitem_t*) kmem_zalloc(sizeof(xfs_qoff_logitem_t), KM_SLEEP); + qf = kmem_zalloc(sizeof(struct xfs_qoff_logitem), KM_SLEEP); xfs_log_item_init(mp, &qf->qql_item, XFS_LI_QUOTAOFF, start ? 
&xfs_qm_qoffend_logitem_ops : &xfs_qm_qoff_logitem_ops); @@ -587,5 +526,5 @@ xfs_qm_qoff_logitem_init( qf->qql_format.qf_type = XFS_LI_QUOTAOFF; qf->qql_format.qf_flags = flags; qf->qql_start_lip = start; - return (qf); + return qf; } diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c index 38e764146644..9a92407109a1 100644 --- a/fs/xfs/quota/xfs_qm.c +++ b/fs/xfs/quota/xfs_qm.c @@ -23,25 +23,18 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" #include "xfs_alloc.h" -#include "xfs_dmapi.h" #include "xfs_quota.h" #include "xfs_mount.h" #include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" #include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" -#include "xfs_btree.h" #include "xfs_ialloc.h" #include "xfs_itable.h" #include "xfs_rtalloc.h" #include "xfs_error.h" #include "xfs_bmap.h" -#include "xfs_rw.h" #include "xfs_attr.h" #include "xfs_buf_item.h" #include "xfs_trans_space.h" @@ -69,7 +62,7 @@ STATIC void xfs_qm_list_destroy(xfs_dqlist_t *); STATIC int xfs_qm_init_quotainos(xfs_mount_t *); STATIC int xfs_qm_init_quotainfo(xfs_mount_t *); -STATIC int xfs_qm_shake(int, gfp_t); +STATIC int xfs_qm_shake(struct shrinker *, int, gfp_t); static struct shrinker xfs_qm_shaker = { .shrink = xfs_qm_shake, @@ -249,8 +242,10 @@ xfs_qm_hold_quotafs_ref( if (!xfs_Gqm) { xfs_Gqm = xfs_Gqm_init(); - if (!xfs_Gqm) + if (!xfs_Gqm) { + mutex_unlock(&xfs_Gqm_lock); return ENOMEM; + } } /* @@ -1495,7 +1490,7 @@ xfs_qm_dqiterate( maxlblkcnt - lblkno, XFS_BMAPI_METADATA, NULL, - 0, map, &nmaps, NULL, NULL); + 0, map, &nmaps, NULL); xfs_iunlock(qip, XFS_ILOCK_SHARED); if (error) break; @@ -1630,10 +1625,7 @@ xfs_qm_dqusage_adjust( xfs_ino_t ino, /* inode number to get data for */ void __user *buffer, /* not used */ int ubsize, /* not used */ - void *private_data, /* not used */ - xfs_daddr_t bno, /* starting block of inode cluster */ int *ubused, /* not used */ - void *dip, /* on-disk inode pointer (not used) */ int *res) /* result code value */ { xfs_inode_t *ip; @@ -1658,7 +1650,7 @@ xfs_qm_dqusage_adjust( * the case in all other instances. It's OK that we do this because * quotacheck is done only at mount time. */ - if ((error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_EXCL, &ip, bno))) { + if ((error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_EXCL, &ip))) { *res = BULKSTAT_RV_NOTHING; return error; } @@ -1670,7 +1662,8 @@ xfs_qm_dqusage_adjust( * making us disable quotas for the file system. */ if ((error = xfs_qm_dqget_noattach(ip, &udqp, &gdqp))) { - xfs_iput(ip, XFS_ILOCK_EXCL); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + IRELE(ip); *res = BULKSTAT_RV_GIVEUP; return error; } @@ -1683,7 +1676,8 @@ xfs_qm_dqusage_adjust( * Walk thru the extent list and count the realtime blocks. */ if ((error = xfs_qm_get_rtblks(ip, &rtblks))) { - xfs_iput(ip, XFS_ILOCK_EXCL); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + IRELE(ip); if (udqp) xfs_qm_dqput(udqp); if (gdqp) @@ -1794,12 +1788,13 @@ xfs_qm_quotacheck( * Iterate thru all the inodes in the file system, * adjusting the corresponding dquot counters in core. */ - if ((error = xfs_bulkstat(mp, &lastino, &count, - xfs_qm_dqusage_adjust, NULL, - structsz, NULL, BULKSTAT_FG_IGET, &done))) + error = xfs_bulkstat(mp, &lastino, &count, + xfs_qm_dqusage_adjust, + structsz, NULL, &done); + if (error) break; - } while (! done); + } while (!done); /* * We've made all the changes that we need to make incore. 
@@ -1887,14 +1882,14 @@ xfs_qm_init_quotainos( mp->m_sb.sb_uquotino != NULLFSINO) { ASSERT(mp->m_sb.sb_uquotino > 0); if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, - 0, 0, &uip, 0))) + 0, 0, &uip))) return XFS_ERROR(error); } if (XFS_IS_OQUOTA_ON(mp) && mp->m_sb.sb_gquotino != NULLFSINO) { ASSERT(mp->m_sb.sb_gquotino > 0); if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, - 0, 0, &gip, 0))) { + 0, 0, &gip))) { if (uip) IRELE(uip); return XFS_ERROR(error); @@ -2117,7 +2112,10 @@ xfs_qm_shake_freelist( */ /* ARGSUSED */ STATIC int -xfs_qm_shake(int nr_to_scan, gfp_t gfp_mask) +xfs_qm_shake( + struct shrinker *shrink, + int nr_to_scan, + gfp_t gfp_mask) { int ndqused, nfree, n; diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c index 97b410c12794..bea02d786c5d 100644 --- a/fs/xfs/quota/xfs_qm_bhv.c +++ b/fs/xfs/quota/xfs_qm_bhv.c @@ -23,25 +23,15 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" #include "xfs_alloc.h" -#include "xfs_dmapi.h" #include "xfs_quota.h" #include "xfs_mount.h" #include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" -#include "xfs_dinode.h" #include "xfs_inode.h" -#include "xfs_ialloc.h" #include "xfs_itable.h" -#include "xfs_btree.h" #include "xfs_bmap.h" #include "xfs_rtalloc.h" #include "xfs_error.h" -#include "xfs_rw.h" #include "xfs_attr.h" #include "xfs_buf_item.h" #include "xfs_qm.h" diff --git a/fs/xfs/quota/xfs_qm_stats.c b/fs/xfs/quota/xfs_qm_stats.c index 3d1fc79532e2..8671a0b32644 100644 --- a/fs/xfs/quota/xfs_qm_stats.c +++ b/fs/xfs/quota/xfs_qm_stats.c @@ -23,25 +23,15 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" #include "xfs_alloc.h" -#include "xfs_dmapi.h" #include "xfs_quota.h" #include "xfs_mount.h" #include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" -#include "xfs_dinode.h" #include "xfs_inode.h" -#include "xfs_ialloc.h" #include "xfs_itable.h" #include "xfs_bmap.h" -#include "xfs_btree.h" #include "xfs_rtalloc.h" #include "xfs_error.h" -#include "xfs_rw.h" #include "xfs_attr.h" #include "xfs_buf_item.h" #include "xfs_qm.h" diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c index 92b002f1805f..d257eb8557c4 100644 --- a/fs/xfs/quota/xfs_qm_syscalls.c +++ b/fs/xfs/quota/xfs_qm_syscalls.c @@ -26,25 +26,15 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" #include "xfs_alloc.h" -#include "xfs_dmapi.h" #include "xfs_quota.h" #include "xfs_mount.h" #include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" -#include "xfs_dinode.h" #include "xfs_inode.h" -#include "xfs_ialloc.h" #include "xfs_itable.h" #include "xfs_bmap.h" -#include "xfs_btree.h" #include "xfs_rtalloc.h" #include "xfs_error.h" -#include "xfs_rw.h" #include "xfs_attr.h" #include "xfs_buf_item.h" #include "xfs_utils.h" @@ -248,40 +238,74 @@ out_unlock: return error; } +STATIC int +xfs_qm_scall_trunc_qfile( + struct xfs_mount *mp, + xfs_ino_t ino) +{ + struct xfs_inode *ip; + struct xfs_trans *tp; + int error; + + if (ino == NULLFSINO) + return 0; + + error = xfs_iget(mp, NULL, ino, 0, 0, &ip); + if (error) + return error; + + xfs_ilock(ip, XFS_IOLOCK_EXCL); + + tp = xfs_trans_alloc(mp, XFS_TRANS_TRUNCATE_FILE); + error = xfs_trans_reserve(tp, 0, 
XFS_ITRUNCATE_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, + XFS_ITRUNCATE_LOG_COUNT); + if (error) { + xfs_trans_cancel(tp, 0); + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + goto out_put; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip); + + error = xfs_itruncate_finish(&tp, ip, 0, XFS_DATA_FORK, 1); + if (error) { + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | + XFS_TRANS_ABORT); + goto out_unlock; + } + + xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + +out_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); +out_put: + IRELE(ip); + return error; +} + int xfs_qm_scall_trunc_qfiles( xfs_mount_t *mp, uint flags) { int error = 0, error2 = 0; - xfs_inode_t *qip; if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0) { qdprintk("qtrunc flags=%x m_qflags=%x\n", flags, mp->m_qflags); return XFS_ERROR(EINVAL); } - if ((flags & XFS_DQ_USER) && mp->m_sb.sb_uquotino != NULLFSINO) { - error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, 0, 0, &qip, 0); - if (!error) { - error = xfs_truncate_file(mp, qip); - IRELE(qip); - } - } - - if ((flags & (XFS_DQ_GROUP|XFS_DQ_PROJ)) && - mp->m_sb.sb_gquotino != NULLFSINO) { - error2 = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, 0, 0, &qip, 0); - if (!error2) { - error2 = xfs_truncate_file(mp, qip); - IRELE(qip); - } - } + if (flags & XFS_DQ_USER) + error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_uquotino); + if (flags & (XFS_DQ_GROUP|XFS_DQ_PROJ)) + error2 = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_gquotino); return error ? error : error2; } - /* * Switch on (a given) quota enforcement for a filesystem. This takes * effect immediately. @@ -417,12 +441,12 @@ xfs_qm_scall_getqstat( } if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) { if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, - 0, 0, &uip, 0) == 0) + 0, 0, &uip) == 0) tempuqip = B_TRUE; } if (!gip && mp->m_sb.sb_gquotino != NULLFSINO) { if (xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, - 0, 0, &gip, 0) == 0) + 0, 0, &gip) == 0) tempgqip = B_TRUE; } if (uip) { @@ -875,8 +899,9 @@ xfs_dqrele_inode( xfs_qm_dqrele(ip->i_gdquot); ip->i_gdquot = NULL; } - xfs_iput(ip, XFS_ILOCK_EXCL); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + IRELE(ip); return 0; } @@ -1109,10 +1134,7 @@ xfs_qm_internalqcheck_adjust( xfs_ino_t ino, /* inode number to get data for */ void __user *buffer, /* not used */ int ubsize, /* not used */ - void *private_data, /* not used */ - xfs_daddr_t bno, /* starting block of inode cluster */ int *ubused, /* not used */ - void *dip, /* not used */ int *res) /* bulkstat result code */ { xfs_inode_t *ip; @@ -1134,7 +1156,7 @@ xfs_qm_internalqcheck_adjust( ipreleased = B_FALSE; again: lock_flags = XFS_ILOCK_SHARED; - if ((error = xfs_iget(mp, NULL, ino, 0, lock_flags, &ip, bno))) { + if ((error = xfs_iget(mp, NULL, ino, 0, lock_flags, &ip))) { *res = BULKSTAT_RV_NOTHING; return (error); } @@ -1146,7 +1168,8 @@ xfs_qm_internalqcheck_adjust( * of those now. */ if (! 
ipreleased) { - xfs_iput(ip, lock_flags); + xfs_iunlock(ip, lock_flags); + IRELE(ip); ipreleased = B_TRUE; goto again; } @@ -1163,7 +1186,8 @@ xfs_qm_internalqcheck_adjust( ASSERT(gd); xfs_qm_internalqcheck_dqadjust(ip, gd); } - xfs_iput(ip, lock_flags); + xfs_iunlock(ip, lock_flags); + IRELE(ip); *res = BULKSTAT_RV_DIDONE; return (0); } @@ -1205,15 +1229,15 @@ xfs_qm_internalqcheck( * Iterate thru all the inodes in the file system, * adjusting the corresponding dquot counters */ - if ((error = xfs_bulkstat(mp, &lastino, &count, - xfs_qm_internalqcheck_adjust, NULL, - 0, NULL, BULKSTAT_FG_IGET, &done))) { + error = xfs_bulkstat(mp, &lastino, &count, + xfs_qm_internalqcheck_adjust, + 0, NULL, &done); + if (error) { + cmn_err(CE_DEBUG, "Bulkstat returned error 0x%x", error); break; } - } while (! done); - if (error) { - cmn_err(CE_DEBUG, "Bulkstat returned error 0x%x", error); - } + } while (!done); + cmn_err(CE_DEBUG, "Checking results against system dquots"); for (i = 0; i < qmtest_hashmask; i++) { xfs_dqtest_t *d, *n; diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/quota/xfs_trans_dquot.c index 061d827da33c..7de91d1b75c0 100644 --- a/fs/xfs/quota/xfs_trans_dquot.c +++ b/fs/xfs/quota/xfs_trans_dquot.c @@ -23,25 +23,15 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" #include "xfs_alloc.h" -#include "xfs_dmapi.h" #include "xfs_quota.h" #include "xfs_mount.h" #include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_attr_sf.h" -#include "xfs_dir2_sf.h" -#include "xfs_dinode.h" #include "xfs_inode.h" -#include "xfs_ialloc.h" #include "xfs_itable.h" -#include "xfs_btree.h" #include "xfs_bmap.h" #include "xfs_rtalloc.h" #include "xfs_error.h" -#include "xfs_rw.h" #include "xfs_attr.h" #include "xfs_buf_item.h" #include "xfs_trans_priv.h" @@ -59,16 +49,14 @@ xfs_trans_dqjoin( xfs_trans_t *tp, xfs_dquot_t *dqp) { - xfs_dq_logitem_t *lp = &dqp->q_logitem; - ASSERT(dqp->q_transp != tp); ASSERT(XFS_DQ_IS_LOCKED(dqp)); - ASSERT(lp->qli_dquot == dqp); + ASSERT(dqp->q_logitem.qli_dquot == dqp); /* * Get a log_item_desc to point at the new item. */ - (void) xfs_trans_add_item(tp, (xfs_log_item_t*)(lp)); + xfs_trans_add_item(tp, &dqp->q_logitem.qli_item); /* * Initialize i_transp so we can later determine if this dquot is @@ -93,16 +81,11 @@ xfs_trans_log_dquot( xfs_trans_t *tp, xfs_dquot_t *dqp) { - xfs_log_item_desc_t *lidp; - ASSERT(dqp->q_transp == tp); ASSERT(XFS_DQ_IS_LOCKED(dqp)); - lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)(&dqp->q_logitem)); - ASSERT(lidp != NULL); - tp->t_flags |= XFS_TRANS_DIRTY; - lidp->lid_flags |= XFS_LID_DIRTY; + dqp->q_logitem.qli_item.li_desc->lid_flags |= XFS_LID_DIRTY; } /* @@ -874,9 +857,8 @@ xfs_trans_get_qoff_item( /* * Get a log_item_desc to point at the new item. 
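xfs_qm_scall_trunc_qfile(), added in the xfs_qm_syscalls.c hunk further above, cancels the transaction and unwinds through its out_unlock/out_put labels when any step fails. The sketch below shows the same goto-label unwinding style in plain user-space C, with resources acquired in order and released in reverse; the function and resources are hypothetical, not the kernel code.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* goto-label unwinding: acquire in order, release in reverse on error */
static int copy_message(const char *msg, char **out)
{
        char *buf;
        FILE *f;
        int err = -1;

        buf = malloc(strlen(msg) + 1);
        if (!buf)
                goto out;

        f = tmpfile();
        if (!f)
                goto out_free;

        if (fputs(msg, f) < 0)
                goto out_close;

        strcpy(buf, msg);
        *out = buf;
        buf = NULL;             /* ownership passed to the caller */
        err = 0;

out_close:
        fclose(f);
out_free:
        free(buf);
out:
        return err;
}

int main(void)
{
        char *s;

        if (copy_message("quota file truncated", &s) == 0) {
                puts(s);
                free(s);
        }
        return 0;
}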
*/ - (void) xfs_trans_add_item(tp, (xfs_log_item_t*)q); - - return (q); + xfs_trans_add_item(tp, &q->qql_item); + return q; } @@ -890,13 +872,8 @@ xfs_trans_log_quotaoff_item( xfs_trans_t *tp, xfs_qoff_logitem_t *qlp) { - xfs_log_item_desc_t *lidp; - - lidp = xfs_trans_find_item(tp, (xfs_log_item_t *)qlp); - ASSERT(lidp != NULL); - tp->t_flags |= XFS_TRANS_DIRTY; - lidp->lid_flags |= XFS_LID_DIRTY; + qlp->qql_item.li_desc->lid_flags |= XFS_LID_DIRTY; } STATIC void diff --git a/fs/xfs/support/debug.c b/fs/xfs/support/debug.c index 3f3610a7ee05..975aa10e1a47 100644 --- a/fs/xfs/support/debug.c +++ b/fs/xfs/support/debug.c @@ -22,7 +22,6 @@ #include "xfs_sb.h" #include "xfs_inum.h" #include "xfs_ag.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_error.h" diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h index 401f364ad36c..4917d4eed4ed 100644 --- a/fs/xfs/xfs_ag.h +++ b/fs/xfs/xfs_ag.h @@ -227,7 +227,6 @@ typedef struct xfs_perag { atomic_t pagf_fstrms; /* # of filestreams active in this AG */ - int pag_ici_init; /* incore inode cache initialised */ rwlock_t pag_ici_lock; /* incore inode lock */ struct radix_tree_root pag_ici_root; /* incore inode cache root */ int pag_ici_reclaimable; /* reclaimable inodes */ diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index a7fbe8a99b12..af168faccc7a 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -24,18 +24,13 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_bmap_btree.h" #include "xfs_alloc_btree.h" #include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_btree.h" -#include "xfs_ialloc.h" #include "xfs_alloc.h" #include "xfs_error.h" #include "xfs_trace.h" @@ -688,8 +683,6 @@ xfs_alloc_ag_vextent_near( xfs_agblock_t ltbno; /* start bno of left side entry */ xfs_agblock_t ltbnoa; /* aligned ... */ xfs_extlen_t ltdiff; /* difference to left side entry */ - /*REFERENCED*/ - xfs_agblock_t ltend; /* end bno of left side entry */ xfs_extlen_t ltlen; /* length of left side entry */ xfs_extlen_t ltlena; /* aligned ... */ xfs_agblock_t ltnew; /* useful start bno of left side */ @@ -814,8 +807,7 @@ xfs_alloc_ag_vextent_near( if ((error = xfs_alloc_get_rec(cnt_cur, <bno, <len, &i))) goto error0; XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - ltend = ltbno + ltlen; - ASSERT(ltend <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); + ASSERT(ltbno + ltlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); args->len = blen; if (!xfs_alloc_fix_minleft(args)) { xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); @@ -828,7 +820,7 @@ xfs_alloc_ag_vextent_near( */ args->agbno = bnew; ASSERT(bnew >= ltbno); - ASSERT(bnew + blen <= ltend); + ASSERT(bnew + blen <= ltbno + ltlen); /* * Set up a cursor for the by-bno tree. */ @@ -1157,7 +1149,6 @@ xfs_alloc_ag_vextent_near( /* * Fix up the length and compute the useful address. 
*/ - ltend = ltbno + ltlen; args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); xfs_alloc_fix_len(args); if (!xfs_alloc_fix_minleft(args)) { @@ -1170,7 +1161,7 @@ xfs_alloc_ag_vextent_near( (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment, ltbno, ltlen, <new); ASSERT(ltnew >= ltbno); - ASSERT(ltnew + rlen <= ltend); + ASSERT(ltnew + rlen <= ltbno + ltlen); ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); args->agbno = ltnew; if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen, diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h index 6d05199b667c..895009a97271 100644 --- a/fs/xfs/xfs_alloc.h +++ b/fs/xfs/xfs_alloc.h @@ -27,16 +27,16 @@ struct xfs_busy_extent; /* * Freespace allocation types. Argument to xfs_alloc_[v]extent. */ -typedef enum xfs_alloctype -{ - XFS_ALLOCTYPE_ANY_AG, /* allocate anywhere, use rotor */ - XFS_ALLOCTYPE_FIRST_AG, /* ... start at ag 0 */ - XFS_ALLOCTYPE_START_AG, /* anywhere, start in this a.g. */ - XFS_ALLOCTYPE_THIS_AG, /* anywhere in this a.g. */ - XFS_ALLOCTYPE_START_BNO, /* near this block else anywhere */ - XFS_ALLOCTYPE_NEAR_BNO, /* in this a.g. and near this block */ - XFS_ALLOCTYPE_THIS_BNO /* at exactly this block */ -} xfs_alloctype_t; +#define XFS_ALLOCTYPE_ANY_AG 0x01 /* allocate anywhere, use rotor */ +#define XFS_ALLOCTYPE_FIRST_AG 0x02 /* ... start at ag 0 */ +#define XFS_ALLOCTYPE_START_AG 0x04 /* anywhere, start in this a.g. */ +#define XFS_ALLOCTYPE_THIS_AG 0x08 /* anywhere in this a.g. */ +#define XFS_ALLOCTYPE_START_BNO 0x10 /* near this block else anywhere */ +#define XFS_ALLOCTYPE_NEAR_BNO 0x20 /* in this a.g. and near this block */ +#define XFS_ALLOCTYPE_THIS_BNO 0x40 /* at exactly this block */ + +/* this should become an enum again when the tracing code is fixed */ +typedef unsigned int xfs_alloctype_t; #define XFS_ALLOC_TYPES \ { XFS_ALLOCTYPE_ANY_AG, "ANY_AG" }, \ diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c index 83f494218759..97f7328967fd 100644 --- a/fs/xfs/xfs_alloc_btree.c +++ b/fs/xfs/xfs_alloc_btree.c @@ -24,19 +24,14 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_bmap_btree.h" #include "xfs_alloc_btree.h" #include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_btree.h" #include "xfs_btree_trace.h" -#include "xfs_ialloc.h" #include "xfs_alloc.h" #include "xfs_error.h" #include "xfs_trace.h" diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index b9c196a53c42..c2568242a901 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -25,19 +25,13 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_da_btree.h" #include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" #include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_alloc.h" -#include "xfs_btree.h" #include "xfs_inode_item.h" #include "xfs_bmap.h" #include "xfs_attr.h" @@ -325,8 +319,7 @@ xfs_attr_set_int( return (error); } - xfs_trans_ijoin(args.trans, dp, XFS_ILOCK_EXCL); - xfs_trans_ihold(args.trans, dp); + xfs_trans_ijoin(args.trans, dp); /* * If the attribute list is non-existent or a shortform list, @@ -396,10 +389,8 @@ xfs_attr_set_int( * bmap_finish() may have committed the last trans and started * a new one. 
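The xfs_alloc.h hunk above turns the xfs_alloctype_t enum into preprocessor constants, apparently so the XFS_ALLOC_TYPES value/name table keeps working with the tracing code (the hunk's own comment says it should become an enum again once that is fixed). A user-space sketch of the value-to-name lookup such a table enables; only a subset of the real constants is shown and the lookup function is invented.

#include <stdio.h>

#define XFS_ALLOCTYPE_ANY_AG    0x01
#define XFS_ALLOCTYPE_NEAR_BNO  0x20
#define XFS_ALLOCTYPE_THIS_BNO  0x40

struct flag_name {
        unsigned int    flag;
        const char      *name;
};

static const struct flag_name alloctype_names[] = {
        { XFS_ALLOCTYPE_ANY_AG,   "ANY_AG" },
        { XFS_ALLOCTYPE_NEAR_BNO, "NEAR_BNO" },
        { XFS_ALLOCTYPE_THIS_BNO, "THIS_BNO" },
};

/* map a numeric allocation type to its name, as a trace formatter would */
static const char *alloctype_name(unsigned int type)
{
        unsigned int i;

        for (i = 0; i < sizeof(alloctype_names) / sizeof(alloctype_names[0]); i++)
                if (alloctype_names[i].flag == type)
                        return alloctype_names[i].name;
        return "UNKNOWN";
}

int main(void)
{
        printf("%s\n", alloctype_name(XFS_ALLOCTYPE_NEAR_BNO));
        return 0;
}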
We need the inode to be in all transactions. */ - if (committed) { - xfs_trans_ijoin(args.trans, dp, XFS_ILOCK_EXCL); - xfs_trans_ihold(args.trans, dp); - } + if (committed) + xfs_trans_ijoin(args.trans, dp); /* * Commit the leaf transformation. We'll need another (linked) @@ -544,8 +535,7 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags) * No need to make quota reservations here. We expect to release some * blocks not allocate in the common case. */ - xfs_trans_ijoin(args.trans, dp, XFS_ILOCK_EXCL); - xfs_trans_ihold(args.trans, dp); + xfs_trans_ijoin(args.trans, dp); /* * Decide on what work routines to call based on the inode size. @@ -821,8 +811,7 @@ xfs_attr_inactive(xfs_inode_t *dp) * No need to make quota reservations here. We expect to release some * blocks, not allocate, in the common case. */ - xfs_trans_ijoin(trans, dp, XFS_ILOCK_EXCL); - xfs_trans_ihold(trans, dp); + xfs_trans_ijoin(trans, dp); /* * Decide on what work routines to call based on the inode size. @@ -981,10 +970,8 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) * bmap_finish() may have committed the last trans and started * a new one. We need the inode to be in all transactions. */ - if (committed) { - xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL); - xfs_trans_ihold(args->trans, dp); - } + if (committed) + xfs_trans_ijoin(args->trans, dp); /* * Commit the current trans (including the inode) and start @@ -1085,10 +1072,8 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) * and started a new one. We need the inode to be * in all transactions. */ - if (committed) { - xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL); - xfs_trans_ihold(args->trans, dp); - } + if (committed) + xfs_trans_ijoin(args->trans, dp); } else xfs_da_buf_done(bp); @@ -1161,10 +1146,8 @@ xfs_attr_leaf_removename(xfs_da_args_t *args) * bmap_finish() may have committed the last trans and started * a new one. We need the inode to be in all transactions. */ - if (committed) { - xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL); - xfs_trans_ihold(args->trans, dp); - } + if (committed) + xfs_trans_ijoin(args->trans, dp); } else xfs_da_buf_done(bp); return(0); @@ -1317,10 +1300,8 @@ restart: * and started a new one. We need the inode to be * in all transactions. */ - if (committed) { - xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL); - xfs_trans_ihold(args->trans, dp); - } + if (committed) + xfs_trans_ijoin(args->trans, dp); /* * Commit the node conversion and start the next @@ -1356,10 +1337,8 @@ restart: * bmap_finish() may have committed the last trans and started * a new one. We need the inode to be in all transactions. */ - if (committed) { - xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL); - xfs_trans_ihold(args->trans, dp); - } + if (committed) + xfs_trans_ijoin(args->trans, dp); } else { /* * Addition succeeded, update Btree hashvals. @@ -1470,10 +1449,8 @@ restart: * and started a new one. We need the inode to be * in all transactions. */ - if (committed) { - xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL); - xfs_trans_ihold(args->trans, dp); - } + if (committed) + xfs_trans_ijoin(args->trans, dp); } /* @@ -1604,10 +1581,8 @@ xfs_attr_node_removename(xfs_da_args_t *args) * bmap_finish() may have committed the last trans and started * a new one. We need the inode to be in all transactions. */ - if (committed) { - xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL); - xfs_trans_ihold(args->trans, dp); - } + if (committed) + xfs_trans_ijoin(args->trans, dp); /* * Commit the Btree join operation and start a new trans. 
@@ -1658,10 +1633,8 @@ xfs_attr_node_removename(xfs_da_args_t *args) * and started a new one. We need the inode to be * in all transactions. */ - if (committed) { - xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL); - xfs_trans_ihold(args->trans, dp); - } + if (committed) + xfs_trans_ijoin(args->trans, dp); } else xfs_da_brelse(args->trans, bp); } @@ -2004,7 +1977,7 @@ xfs_attr_rmtval_get(xfs_da_args_t *args) error = xfs_bmapi(args->trans, args->dp, (xfs_fileoff_t)lblkno, args->rmtblkcnt, XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, - NULL, 0, map, &nmap, NULL, NULL); + NULL, 0, map, &nmap, NULL); if (error) return(error); ASSERT(nmap >= 1); @@ -2083,7 +2056,7 @@ xfs_attr_rmtval_set(xfs_da_args_t *args) XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA | XFS_BMAPI_WRITE, args->firstblock, args->total, &map, &nmap, - args->flist, NULL); + args->flist); if (!error) { error = xfs_bmap_finish(&args->trans, args->flist, &committed); @@ -2099,10 +2072,8 @@ xfs_attr_rmtval_set(xfs_da_args_t *args) * bmap_finish() may have committed the last trans and started * a new one. We need the inode to be in all transactions. */ - if (committed) { - xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL); - xfs_trans_ihold(args->trans, dp); - } + if (committed) + xfs_trans_ijoin(args->trans, dp); ASSERT(nmap == 1); ASSERT((map.br_startblock != DELAYSTARTBLOCK) && @@ -2136,7 +2107,7 @@ xfs_attr_rmtval_set(xfs_da_args_t *args) args->rmtblkcnt, XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, args->firstblock, 0, &map, &nmap, - NULL, NULL); + NULL); if (error) { return(error); } @@ -2201,7 +2172,7 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args) args->rmtblkcnt, XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, args->firstblock, 0, &map, &nmap, - args->flist, NULL); + args->flist); if (error) { return(error); } @@ -2239,7 +2210,7 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args) error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt, XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, 1, args->firstblock, args->flist, - NULL, &done); + &done); if (!error) { error = xfs_bmap_finish(&args->trans, args->flist, &committed); @@ -2255,10 +2226,8 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args) * bmap_finish() may have committed the last trans and started * a new one. We need the inode to be in all transactions. */ - if (committed) { - xfs_trans_ijoin(args->trans, args->dp, XFS_ILOCK_EXCL); - xfs_trans_ihold(args->trans, args->dp); - } + if (committed) + xfs_trans_ijoin(args->trans, args->dp); /* * Close out trans and start the next one in the chain. 
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index a90ce74fc256..a6cff8edcdb6 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -24,8 +24,6 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_da_btree.h" #include "xfs_bmap_btree.h" @@ -33,7 +31,6 @@ #include "xfs_ialloc_btree.h" #include "xfs_alloc.h" #include "xfs_btree.h" -#include "xfs_dir2_sf.h" #include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" @@ -2931,7 +2928,7 @@ xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp, nmap = 1; error = xfs_bmapi(*trans, dp, (xfs_fileoff_t)tblkno, tblkcnt, XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, - NULL, 0, &map, &nmap, NULL, NULL); + NULL, 0, &map, &nmap, NULL); if (error) { return(error); } diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 99587ded043f..23f14e595c18 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -30,13 +30,10 @@ #include "xfs_alloc_btree.h" #include "xfs_ialloc_btree.h" #include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_btree.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" -#include "xfs_ialloc.h" #include "xfs_itable.h" #include "xfs_dir2_data.h" #include "xfs_dir2_leaf.h" @@ -104,7 +101,6 @@ xfs_bmap_add_extent( xfs_fsblock_t *first, /* pointer to firstblock variable */ xfs_bmap_free_t *flist, /* list of extents to be freed */ int *logflagsp, /* inode logging flags */ - xfs_extdelta_t *delta, /* Change made to incore extents */ int whichfork, /* data or attr fork */ int rsvd); /* OK to allocate reserved blocks */ @@ -122,7 +118,6 @@ xfs_bmap_add_extent_delay_real( xfs_fsblock_t *first, /* pointer to firstblock variable */ xfs_bmap_free_t *flist, /* list of extents to be freed */ int *logflagsp, /* inode logging flags */ - xfs_extdelta_t *delta, /* Change made to incore extents */ int rsvd); /* OK to allocate reserved blocks */ /* @@ -135,7 +130,6 @@ xfs_bmap_add_extent_hole_delay( xfs_extnum_t idx, /* extent number to update/insert */ xfs_bmbt_irec_t *new, /* new data to add to file extents */ int *logflagsp,/* inode logging flags */ - xfs_extdelta_t *delta, /* Change made to incore extents */ int rsvd); /* OK to allocate reserved blocks */ /* @@ -149,7 +143,6 @@ xfs_bmap_add_extent_hole_real( xfs_btree_cur_t *cur, /* if null, not a btree */ xfs_bmbt_irec_t *new, /* new data to add to file extents */ int *logflagsp, /* inode logging flags */ - xfs_extdelta_t *delta, /* Change made to incore extents */ int whichfork); /* data or attr fork */ /* @@ -162,8 +155,7 @@ xfs_bmap_add_extent_unwritten_real( xfs_extnum_t idx, /* extent number to update/insert */ xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ xfs_bmbt_irec_t *new, /* new data to add to file extents */ - int *logflagsp, /* inode logging flags */ - xfs_extdelta_t *delta); /* Change made to incore extents */ + int *logflagsp); /* inode logging flags */ /* * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file. 
@@ -200,7 +192,6 @@ xfs_bmap_del_extent( xfs_btree_cur_t *cur, /* if null, not a btree */ xfs_bmbt_irec_t *new, /* new data to add to file extents */ int *logflagsp,/* inode logging flags */ - xfs_extdelta_t *delta, /* Change made to incore extents */ int whichfork, /* data or attr fork */ int rsvd); /* OK to allocate reserved blocks */ @@ -489,7 +480,6 @@ xfs_bmap_add_extent( xfs_fsblock_t *first, /* pointer to firstblock variable */ xfs_bmap_free_t *flist, /* list of extents to be freed */ int *logflagsp, /* inode logging flags */ - xfs_extdelta_t *delta, /* Change made to incore extents */ int whichfork, /* data or attr fork */ int rsvd) /* OK to use reserved data blocks */ { @@ -524,15 +514,6 @@ xfs_bmap_add_extent( logflags = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); } else logflags = 0; - /* DELTA: single new extent */ - if (delta) { - if (delta->xed_startoff > new->br_startoff) - delta->xed_startoff = new->br_startoff; - if (delta->xed_blockcount < - new->br_startoff + new->br_blockcount) - delta->xed_blockcount = new->br_startoff + - new->br_blockcount; - } } /* * Any kind of new delayed allocation goes here. @@ -542,7 +523,7 @@ xfs_bmap_add_extent( ASSERT((cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL) == 0); if ((error = xfs_bmap_add_extent_hole_delay(ip, idx, new, - &logflags, delta, rsvd))) + &logflags, rsvd))) goto done; } /* @@ -553,7 +534,7 @@ xfs_bmap_add_extent( ASSERT((cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL) == 0); if ((error = xfs_bmap_add_extent_hole_real(ip, idx, cur, new, - &logflags, delta, whichfork))) + &logflags, whichfork))) goto done; } else { xfs_bmbt_irec_t prev; /* old extent at offset idx */ @@ -578,17 +559,17 @@ xfs_bmap_add_extent( XFS_BTCUR_BPRV_WASDEL); if ((error = xfs_bmap_add_extent_delay_real(ip, idx, &cur, new, &da_new, first, flist, - &logflags, delta, rsvd))) + &logflags, rsvd))) goto done; } else if (new->br_state == XFS_EXT_NORM) { ASSERT(new->br_state == XFS_EXT_NORM); if ((error = xfs_bmap_add_extent_unwritten_real( - ip, idx, &cur, new, &logflags, delta))) + ip, idx, &cur, new, &logflags))) goto done; } else { ASSERT(new->br_state == XFS_EXT_UNWRITTEN); if ((error = xfs_bmap_add_extent_unwritten_real( - ip, idx, &cur, new, &logflags, delta))) + ip, idx, &cur, new, &logflags))) goto done; } ASSERT(*curp == cur || *curp == NULL); @@ -601,7 +582,7 @@ xfs_bmap_add_extent( ASSERT((cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL) == 0); if ((error = xfs_bmap_add_extent_hole_real(ip, idx, cur, - new, &logflags, delta, whichfork))) + new, &logflags, whichfork))) goto done; } } @@ -666,7 +647,6 @@ xfs_bmap_add_extent_delay_real( xfs_fsblock_t *first, /* pointer to firstblock variable */ xfs_bmap_free_t *flist, /* list of extents to be freed */ int *logflagsp, /* inode logging flags */ - xfs_extdelta_t *delta, /* Change made to incore extents */ int rsvd) /* OK to use reserved data block allocation */ { xfs_btree_cur_t *cur; /* btree cursor */ @@ -797,11 +777,6 @@ xfs_bmap_add_extent_delay_real( goto done; } *dnew = 0; - /* DELTA: Three in-core extents are replaced by one. */ - temp = LEFT.br_startoff; - temp2 = LEFT.br_blockcount + - PREV.br_blockcount + - RIGHT.br_blockcount; break; case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG: @@ -832,10 +807,6 @@ xfs_bmap_add_extent_delay_real( goto done; } *dnew = 0; - /* DELTA: Two in-core extents are replaced by one. 
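The large switch in xfs_bmap_add_extent_delay_real() above dispatches on filling/contiguity state bits (BMAP_LEFT_FILLING, BMAP_LEFT_CONTIG, BMAP_RIGHT_CONTIG, ...) to decide how many in-core extents merge; the removed DELTA lines only recorded the range each case touched. Below is a toy user-space sketch of the contiguity decision itself, with an invented extent layout and simplified state bits.

#include <stdio.h>

struct ext {
        long long off;
        long long len;
};

#define LEFT_CONTIG     0x1
#define RIGHT_CONTIG    0x2

/* work out which neighbours the incoming extent touches */
static int contig_state(struct ext left, struct ext cur, struct ext right)
{
        int state = 0;

        if (left.off + left.len == cur.off)
                state |= LEFT_CONTIG;
        if (cur.off + cur.len == right.off)
                state |= RIGHT_CONTIG;
        return state;
}

int main(void)
{
        struct ext left = { 0, 8 }, cur = { 8, 4 }, right = { 12, 16 };

        switch (contig_state(left, cur, right)) {
        case LEFT_CONTIG | RIGHT_CONTIG:
                printf("merge all three into one extent [%lld, %lld)\n",
                       left.off, right.off + right.len);
                break;
        case LEFT_CONTIG:
                printf("extend the left neighbour\n");
                break;
        case RIGHT_CONTIG:
                printf("extend the right neighbour\n");
                break;
        default:
                printf("insert as a new extent\n");
        }
        return 0;
}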
*/ - temp = LEFT.br_startoff; - temp2 = LEFT.br_blockcount + - PREV.br_blockcount; break; case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: @@ -867,10 +838,6 @@ xfs_bmap_add_extent_delay_real( goto done; } *dnew = 0; - /* DELTA: Two in-core extents are replaced by one. */ - temp = PREV.br_startoff; - temp2 = PREV.br_blockcount + - RIGHT.br_blockcount; break; case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING: @@ -900,9 +867,6 @@ xfs_bmap_add_extent_delay_real( XFS_WANT_CORRUPTED_GOTO(i == 1, done); } *dnew = 0; - /* DELTA: The in-core extent described by new changed type. */ - temp = new->br_startoff; - temp2 = new->br_blockcount; break; case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG: @@ -942,10 +906,6 @@ xfs_bmap_add_extent_delay_real( xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); *dnew = temp; - /* DELTA: The boundary between two in-core extents moved. */ - temp = LEFT.br_startoff; - temp2 = LEFT.br_blockcount + - PREV.br_blockcount; break; case BMAP_LEFT_FILLING: @@ -990,9 +950,6 @@ xfs_bmap_add_extent_delay_real( xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); trace_xfs_bmap_post_update(ip, idx + 1, state, _THIS_IP_); *dnew = temp; - /* DELTA: One in-core extent is split in two. */ - temp = PREV.br_startoff; - temp2 = PREV.br_blockcount; break; case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: @@ -1031,10 +988,6 @@ xfs_bmap_add_extent_delay_real( xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); *dnew = temp; - /* DELTA: The boundary between two in-core extents moved. */ - temp = PREV.br_startoff; - temp2 = PREV.br_blockcount + - RIGHT.br_blockcount; break; case BMAP_RIGHT_FILLING: @@ -1078,9 +1031,6 @@ xfs_bmap_add_extent_delay_real( xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); *dnew = temp; - /* DELTA: One in-core extent is split in two. */ - temp = PREV.br_startoff; - temp2 = PREV.br_blockcount; break; case 0: @@ -1161,9 +1111,6 @@ xfs_bmap_add_extent_delay_real( nullstartblock((int)temp2)); trace_xfs_bmap_post_update(ip, idx + 2, state, _THIS_IP_); *dnew = temp + temp2; - /* DELTA: One in-core extent is split in three. 
*/ - temp = PREV.br_startoff; - temp2 = PREV.br_blockcount; break; case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: @@ -1179,13 +1126,6 @@ xfs_bmap_add_extent_delay_real( ASSERT(0); } *curp = cur; - if (delta) { - temp2 += temp; - if (delta->xed_startoff > temp) - delta->xed_startoff = temp; - if (delta->xed_blockcount < temp2) - delta->xed_blockcount = temp2; - } done: *logflagsp = rval; return error; @@ -1204,8 +1144,7 @@ xfs_bmap_add_extent_unwritten_real( xfs_extnum_t idx, /* extent number to update/insert */ xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ xfs_bmbt_irec_t *new, /* new data to add to file extents */ - int *logflagsp, /* inode logging flags */ - xfs_extdelta_t *delta) /* Change made to incore extents */ + int *logflagsp) /* inode logging flags */ { xfs_btree_cur_t *cur; /* btree cursor */ xfs_bmbt_rec_host_t *ep; /* extent entry for idx */ @@ -1219,8 +1158,6 @@ xfs_bmap_add_extent_unwritten_real( /* left is 0, right is 1, prev is 2 */ int rval=0; /* return value (logging flags) */ int state = 0;/* state bits, accessed thru macros */ - xfs_filblks_t temp=0; - xfs_filblks_t temp2=0; #define LEFT r[0] #define RIGHT r[1] @@ -1341,11 +1278,6 @@ xfs_bmap_add_extent_unwritten_real( RIGHT.br_blockcount, LEFT.br_state))) goto done; } - /* DELTA: Three in-core extents are replaced by one. */ - temp = LEFT.br_startoff; - temp2 = LEFT.br_blockcount + - PREV.br_blockcount + - RIGHT.br_blockcount; break; case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG: @@ -1382,10 +1314,6 @@ xfs_bmap_add_extent_unwritten_real( LEFT.br_state))) goto done; } - /* DELTA: Two in-core extents are replaced by one. */ - temp = LEFT.br_startoff; - temp2 = LEFT.br_blockcount + - PREV.br_blockcount; break; case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: @@ -1422,10 +1350,6 @@ xfs_bmap_add_extent_unwritten_real( newext))) goto done; } - /* DELTA: Two in-core extents are replaced by one. */ - temp = PREV.br_startoff; - temp2 = PREV.br_blockcount + - RIGHT.br_blockcount; break; case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING: @@ -1453,9 +1377,6 @@ xfs_bmap_add_extent_unwritten_real( newext))) goto done; } - /* DELTA: The in-core extent described by new changed type. */ - temp = new->br_startoff; - temp2 = new->br_blockcount; break; case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG: @@ -1501,10 +1422,6 @@ xfs_bmap_add_extent_unwritten_real( LEFT.br_state)) goto done; } - /* DELTA: The boundary between two in-core extents moved. */ - temp = LEFT.br_startoff; - temp2 = LEFT.br_blockcount + - PREV.br_blockcount; break; case BMAP_LEFT_FILLING: @@ -1544,9 +1461,6 @@ xfs_bmap_add_extent_unwritten_real( goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); } - /* DELTA: One in-core extent is split in two. */ - temp = PREV.br_startoff; - temp2 = PREV.br_blockcount; break; case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: @@ -1587,10 +1501,6 @@ xfs_bmap_add_extent_unwritten_real( newext))) goto done; } - /* DELTA: The boundary between two in-core extents moved. */ - temp = PREV.br_startoff; - temp2 = PREV.br_blockcount + - RIGHT.br_blockcount; break; case BMAP_RIGHT_FILLING: @@ -1630,9 +1540,6 @@ xfs_bmap_add_extent_unwritten_real( goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); } - /* DELTA: One in-core extent is split in two. */ - temp = PREV.br_startoff; - temp2 = PREV.br_blockcount; break; case 0: @@ -1692,9 +1599,6 @@ xfs_bmap_add_extent_unwritten_real( goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); } - /* DELTA: One in-core extent is split in three. 
*/ - temp = PREV.br_startoff; - temp2 = PREV.br_blockcount; break; case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: @@ -1710,13 +1614,6 @@ xfs_bmap_add_extent_unwritten_real( ASSERT(0); } *curp = cur; - if (delta) { - temp2 += temp; - if (delta->xed_startoff > temp) - delta->xed_startoff = temp; - if (delta->xed_blockcount < temp2) - delta->xed_blockcount = temp2; - } done: *logflagsp = rval; return error; @@ -1736,7 +1633,6 @@ xfs_bmap_add_extent_hole_delay( xfs_extnum_t idx, /* extent number to update/insert */ xfs_bmbt_irec_t *new, /* new data to add to file extents */ int *logflagsp, /* inode logging flags */ - xfs_extdelta_t *delta, /* Change made to incore extents */ int rsvd) /* OK to allocate reserved blocks */ { xfs_bmbt_rec_host_t *ep; /* extent record for idx */ @@ -1747,7 +1643,6 @@ xfs_bmap_add_extent_hole_delay( xfs_bmbt_irec_t right; /* right neighbor extent entry */ int state; /* state bits, accessed thru macros */ xfs_filblks_t temp=0; /* temp for indirect calculations */ - xfs_filblks_t temp2=0; ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); ep = xfs_iext_get_ext(ifp, idx); @@ -1819,9 +1714,6 @@ xfs_bmap_add_extent_hole_delay( xfs_iext_remove(ip, idx, 1, state); ip->i_df.if_lastex = idx - 1; - /* DELTA: Two in-core extents were replaced by one. */ - temp2 = temp; - temp = left.br_startoff; break; case BMAP_LEFT_CONTIG: @@ -1841,9 +1733,6 @@ xfs_bmap_add_extent_hole_delay( trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); ip->i_df.if_lastex = idx - 1; - /* DELTA: One in-core extent grew into a hole. */ - temp2 = temp; - temp = left.br_startoff; break; case BMAP_RIGHT_CONTIG: @@ -1862,9 +1751,6 @@ xfs_bmap_add_extent_hole_delay( trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); ip->i_df.if_lastex = idx; - /* DELTA: One in-core extent grew into a hole. */ - temp2 = temp; - temp = new->br_startoff; break; case 0: @@ -1876,9 +1762,6 @@ xfs_bmap_add_extent_hole_delay( oldlen = newlen = 0; xfs_iext_insert(ip, idx, 1, new, state); ip->i_df.if_lastex = idx; - /* DELTA: A new in-core extent was added in a hole. */ - temp2 = new->br_blockcount; - temp = new->br_startoff; break; } if (oldlen != newlen) { @@ -1889,13 +1772,6 @@ xfs_bmap_add_extent_hole_delay( * Nothing to do for disk quota accounting here. */ } - if (delta) { - temp2 += temp; - if (delta->xed_startoff > temp) - delta->xed_startoff = temp; - if (delta->xed_blockcount < temp2) - delta->xed_blockcount = temp2; - } *logflagsp = 0; return 0; } @@ -1911,7 +1787,6 @@ xfs_bmap_add_extent_hole_real( xfs_btree_cur_t *cur, /* if null, not a btree */ xfs_bmbt_irec_t *new, /* new data to add to file extents */ int *logflagsp, /* inode logging flags */ - xfs_extdelta_t *delta, /* Change made to incore extents */ int whichfork) /* data or attr fork */ { xfs_bmbt_rec_host_t *ep; /* pointer to extent entry ins. point */ @@ -1922,8 +1797,6 @@ xfs_bmap_add_extent_hole_real( xfs_bmbt_irec_t right; /* right neighbor extent entry */ int rval=0; /* return value (logging flags) */ int state; /* state bits, accessed thru macros */ - xfs_filblks_t temp=0; - xfs_filblks_t temp2=0; ifp = XFS_IFORK_PTR(ip, whichfork); ASSERT(idx <= ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)); @@ -2020,11 +1893,6 @@ xfs_bmap_add_extent_hole_real( left.br_state))) goto done; } - /* DELTA: Two in-core extents were replaced by one. 
*/ - temp = left.br_startoff; - temp2 = left.br_blockcount + - new->br_blockcount + - right.br_blockcount; break; case BMAP_LEFT_CONTIG: @@ -2056,10 +1924,6 @@ xfs_bmap_add_extent_hole_real( left.br_state))) goto done; } - /* DELTA: One in-core extent grew. */ - temp = left.br_startoff; - temp2 = left.br_blockcount + - new->br_blockcount; break; case BMAP_RIGHT_CONTIG: @@ -2092,10 +1956,6 @@ xfs_bmap_add_extent_hole_real( right.br_state))) goto done; } - /* DELTA: One in-core extent grew. */ - temp = new->br_startoff; - temp2 = new->br_blockcount + - right.br_blockcount; break; case 0: @@ -2123,18 +1983,8 @@ xfs_bmap_add_extent_hole_real( goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); } - /* DELTA: A new extent was added in a hole. */ - temp = new->br_startoff; - temp2 = new->br_blockcount; break; } - if (delta) { - temp2 += temp; - if (delta->xed_startoff > temp) - delta->xed_startoff = temp; - if (delta->xed_blockcount < temp2) - delta->xed_blockcount = temp2; - } done: *logflagsp = rval; return error; @@ -2959,7 +2809,6 @@ xfs_bmap_del_extent( xfs_btree_cur_t *cur, /* if null, not a btree */ xfs_bmbt_irec_t *del, /* data to remove from extents */ int *logflagsp, /* inode logging flags */ - xfs_extdelta_t *delta, /* Change made to incore extents */ int whichfork, /* data or attr fork */ int rsvd) /* OK to allocate reserved blocks */ { @@ -3265,14 +3114,6 @@ xfs_bmap_del_extent( if (da_old > da_new) xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, (int64_t)(da_old - da_new), rsvd); - if (delta) { - /* DELTA: report the original extent. */ - if (delta->xed_startoff > got.br_startoff) - delta->xed_startoff = got.br_startoff; - if (delta->xed_blockcount < got.br_startoff+got.br_blockcount) - delta->xed_blockcount = got.br_startoff + - got.br_blockcount; - } done: *logflagsp = flags; return error; @@ -3754,9 +3595,10 @@ xfs_bmap_add_attrfork( ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; } ASSERT(ip->i_d.di_anextents == 0); - IHOLD(ip); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + + xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + switch (ip->i_d.di_format) { case XFS_DINODE_FMT_DEV: ip->i_d.di_forkoff = roundup(sizeof(xfs_dev_t), 8) >> 3; @@ -4483,8 +4325,7 @@ xfs_bmapi( xfs_extlen_t total, /* total blocks needed */ xfs_bmbt_irec_t *mval, /* output: map values */ int *nmap, /* i/o: mval size/count */ - xfs_bmap_free_t *flist, /* i/o: list extents to free */ - xfs_extdelta_t *delta) /* o: change made to incore extents */ + xfs_bmap_free_t *flist) /* i/o: list extents to free */ { xfs_fsblock_t abno; /* allocated block number */ xfs_extlen_t alen; /* allocated extent length */ @@ -4596,10 +4437,7 @@ xfs_bmapi( end = bno + len; obno = bno; bma.ip = NULL; - if (delta) { - delta->xed_startoff = NULLFILEOFF; - delta->xed_blockcount = 0; - } + while (bno < end && n < *nmap) { /* * Reading past eof, act as though there's a hole @@ -4620,19 +4458,13 @@ xfs_bmapi( * allocate the stuff asked for in this bmap call * but that wouldn't be as good. 
*/ - if (wasdelay && !(flags & XFS_BMAPI_EXACT)) { + if (wasdelay) { alen = (xfs_extlen_t)got.br_blockcount; aoff = got.br_startoff; if (lastx != NULLEXTNUM && lastx) { ep = xfs_iext_get_ext(ifp, lastx - 1); xfs_bmbt_get_all(ep, &prev); } - } else if (wasdelay) { - alen = (xfs_extlen_t) - XFS_FILBLKS_MIN(len, - (got.br_startoff + - got.br_blockcount) - bno); - aoff = bno; } else { alen = (xfs_extlen_t) XFS_FILBLKS_MIN(len, MAXEXTLEN); @@ -4831,7 +4663,7 @@ xfs_bmapi( got.br_state = XFS_EXT_UNWRITTEN; } error = xfs_bmap_add_extent(ip, lastx, &cur, &got, - firstblock, flist, &tmp_logflags, delta, + firstblock, flist, &tmp_logflags, whichfork, (flags & XFS_BMAPI_RSVBLOCKS)); logflags |= tmp_logflags; if (error) @@ -4927,7 +4759,7 @@ xfs_bmapi( } mval->br_state = XFS_EXT_NORM; error = xfs_bmap_add_extent(ip, lastx, &cur, mval, - firstblock, flist, &tmp_logflags, delta, + firstblock, flist, &tmp_logflags, whichfork, (flags & XFS_BMAPI_RSVBLOCKS)); logflags |= tmp_logflags; if (error) @@ -5017,14 +4849,6 @@ xfs_bmapi( ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE || XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max); error = 0; - if (delta && delta->xed_startoff != NULLFILEOFF) { - /* A change was actually made. - * Note that delta->xed_blockount is an offset at this - * point and needs to be converted to a block count. - */ - ASSERT(delta->xed_blockcount > delta->xed_startoff); - delta->xed_blockcount -= delta->xed_startoff; - } error0: /* * Log everything. Do this after conversion, there's no point in @@ -5136,8 +4960,6 @@ xfs_bunmapi( xfs_fsblock_t *firstblock, /* first allocated block controls a.g. for allocs */ xfs_bmap_free_t *flist, /* i/o: list extents to free */ - xfs_extdelta_t *delta, /* o: change made to incore - extents */ int *done) /* set if not done yet */ { xfs_btree_cur_t *cur; /* bmap btree cursor */ @@ -5196,10 +5018,7 @@ xfs_bunmapi( bno = start + len - 1; ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, &prev); - if (delta) { - delta->xed_startoff = NULLFILEOFF; - delta->xed_blockcount = 0; - } + /* * Check to see if the given block number is past the end of the * file, back up to the last block if so... @@ -5297,7 +5116,7 @@ xfs_bunmapi( } del.br_state = XFS_EXT_UNWRITTEN; error = xfs_bmap_add_extent(ip, lastx, &cur, &del, - firstblock, flist, &logflags, delta, + firstblock, flist, &logflags, XFS_DATA_FORK, 0); if (error) goto error0; @@ -5352,7 +5171,7 @@ xfs_bunmapi( prev.br_state = XFS_EXT_UNWRITTEN; error = xfs_bmap_add_extent(ip, lastx - 1, &cur, &prev, firstblock, flist, &logflags, - delta, XFS_DATA_FORK, 0); + XFS_DATA_FORK, 0); if (error) goto error0; goto nodelete; @@ -5361,7 +5180,7 @@ xfs_bunmapi( del.br_state = XFS_EXT_UNWRITTEN; error = xfs_bmap_add_extent(ip, lastx, &cur, &del, firstblock, flist, &logflags, - delta, XFS_DATA_FORK, 0); + XFS_DATA_FORK, 0); if (error) goto error0; goto nodelete; @@ -5414,7 +5233,7 @@ xfs_bunmapi( goto error0; } error = xfs_bmap_del_extent(ip, tp, lastx, flist, cur, &del, - &tmp_logflags, delta, whichfork, rsvd); + &tmp_logflags, whichfork, rsvd); logflags |= tmp_logflags; if (error) goto error0; @@ -5471,14 +5290,6 @@ nodelete: ASSERT(ifp->if_ext_max == XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); error = 0; - if (delta && delta->xed_startoff != NULLFILEOFF) { - /* A change was actually made. - * Note that delta->xed_blockount is an offset at this - * point and needs to be converted to a block count. 
- */ - ASSERT(delta->xed_blockcount > delta->xed_startoff); - delta->xed_blockcount -= delta->xed_startoff; - } error0: /* * Log everything. Do this after conversion, there's no point in @@ -5605,28 +5416,6 @@ xfs_getbmap( prealloced = 0; fixlen = 1LL << 32; } else { - /* - * If the BMV_IF_NO_DMAPI_READ interface bit specified, do - * not generate a DMAPI read event. Otherwise, if the - * DM_EVENT_READ bit is set for the file, generate a read - * event in order that the DMAPI application may do its thing - * before we return the extents. Usually this means restoring - * user file data to regions of the file that look like holes. - * - * The "old behavior" (from XFS_IOC_GETBMAP) is to not specify - * BMV_IF_NO_DMAPI_READ so that read events are generated. - * If this were not true, callers of ioctl(XFS_IOC_GETBMAP) - * could misinterpret holes in a DMAPI file as true holes, - * when in fact they may represent offline user data. - */ - if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && - !(iflags & BMV_IF_NO_DMAPI_READ)) { - error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip, - 0, 0, 0, NULL); - if (error) - return XFS_ERROR(error); - } - if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS && ip->i_d.di_format != XFS_DINODE_FMT_BTREE && ip->i_d.di_format != XFS_DINODE_FMT_LOCAL) @@ -5713,7 +5502,7 @@ xfs_getbmap( error = xfs_bmapi(NULL, ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset), XFS_BB_TO_FSB(mp, bmv->bmv_length), bmapi_flags, NULL, 0, map, &nmap, - NULL, NULL); + NULL); if (error) goto out_free_map; ASSERT(nmap <= subnex); @@ -5859,66 +5648,34 @@ xfs_bmap_eof( } #ifdef DEBUG -STATIC -xfs_buf_t * +STATIC struct xfs_buf * xfs_bmap_get_bp( - xfs_btree_cur_t *cur, + struct xfs_btree_cur *cur, xfs_fsblock_t bno) { - int i; - xfs_buf_t *bp; + struct xfs_log_item_desc *lidp; + int i; if (!cur) - return(NULL); - - bp = NULL; - for(i = 0; i < XFS_BTREE_MAXLEVELS; i++) { - bp = cur->bc_bufs[i]; - if (!bp) break; - if (XFS_BUF_ADDR(bp) == bno) - break; /* Found it */ - } - if (i == XFS_BTREE_MAXLEVELS) - bp = NULL; - - if (!bp) { /* Chase down all the log items to see if the bp is there */ - xfs_log_item_chunk_t *licp; - xfs_trans_t *tp; - - tp = cur->bc_tp; - licp = &tp->t_items; - while (!bp && licp != NULL) { - if (xfs_lic_are_all_free(licp)) { - licp = licp->lic_next; - continue; - } - for (i = 0; i < licp->lic_unused; i++) { - xfs_log_item_desc_t *lidp; - xfs_log_item_t *lip; - xfs_buf_log_item_t *bip; - xfs_buf_t *lbp; - - if (xfs_lic_isfree(licp, i)) { - continue; - } - - lidp = xfs_lic_slot(licp, i); - lip = lidp->lid_item; - if (lip->li_type != XFS_LI_BUF) - continue; + return NULL; - bip = (xfs_buf_log_item_t *)lip; - lbp = bip->bli_buf; + for (i = 0; i < XFS_BTREE_MAXLEVELS; i++) { + if (!cur->bc_bufs[i]) + break; + if (XFS_BUF_ADDR(cur->bc_bufs[i]) == bno) + return cur->bc_bufs[i]; + } - if (XFS_BUF_ADDR(lbp) == bno) { - bp = lbp; - break; /* Found it */ - } - } - licp = licp->lic_next; - } + /* Chase down all the log items to see if the bp is there */ + list_for_each_entry(lidp, &cur->bc_tp->t_items, lid_trans) { + struct xfs_buf_log_item *bip; + bip = (struct xfs_buf_log_item *)lidp->lid_item; + if (bip->bli_item.li_type == XFS_LI_BUF && + XFS_BUF_ADDR(bip->bli_buf) == bno) + return bip->bli_buf; } - return(bp); + + return NULL; } STATIC void diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index 419dafb9d87d..b13569a6179b 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -28,20 +28,6 @@ struct xfs_trans; extern kmem_zone_t *xfs_bmap_free_item_zone; /* - * DELTA: describe a change to the in-core 
extent list. - * - * Internally the use of xed_blockount is somewhat funky. - * xed_blockcount contains an offset much of the time because this - * makes merging changes easier. (xfs_fileoff_t and xfs_filblks_t are - * the same underlying type). - */ -typedef struct xfs_extdelta -{ - xfs_fileoff_t xed_startoff; /* offset of range */ - xfs_filblks_t xed_blockcount; /* blocks in range */ -} xfs_extdelta_t; - -/* * List of extents to be free "later". * The list is kept sorted on xbf_startblock. */ @@ -82,16 +68,13 @@ typedef struct xfs_bmap_free #define XFS_BMAPI_DELAY 0x002 /* delayed write operation */ #define XFS_BMAPI_ENTIRE 0x004 /* return entire extent, not trimmed */ #define XFS_BMAPI_METADATA 0x008 /* mapping metadata not user data */ -#define XFS_BMAPI_EXACT 0x010 /* allocate only to spec'd bounds */ -#define XFS_BMAPI_ATTRFORK 0x020 /* use attribute fork not data */ -#define XFS_BMAPI_ASYNC 0x040 /* bunmapi xactions can be async */ -#define XFS_BMAPI_RSVBLOCKS 0x080 /* OK to alloc. reserved data blocks */ -#define XFS_BMAPI_PREALLOC 0x100 /* preallocation op: unwritten space */ -#define XFS_BMAPI_IGSTATE 0x200 /* Ignore state - */ +#define XFS_BMAPI_ATTRFORK 0x010 /* use attribute fork not data */ +#define XFS_BMAPI_RSVBLOCKS 0x020 /* OK to alloc. reserved data blocks */ +#define XFS_BMAPI_PREALLOC 0x040 /* preallocation op: unwritten space */ +#define XFS_BMAPI_IGSTATE 0x080 /* Ignore state - */ /* combine contig. space */ -#define XFS_BMAPI_CONTIG 0x400 /* must allocate only one extent */ -/* XFS_BMAPI_DIRECT_IO 0x800 */ -#define XFS_BMAPI_CONVERT 0x1000 /* unwritten extent conversion - */ +#define XFS_BMAPI_CONTIG 0x100 /* must allocate only one extent */ +#define XFS_BMAPI_CONVERT 0x200 /* unwritten extent conversion - */ /* need write cache flushing and no */ /* additional allocation alignments */ @@ -100,9 +83,7 @@ typedef struct xfs_bmap_free { XFS_BMAPI_DELAY, "DELAY" }, \ { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ { XFS_BMAPI_METADATA, "METADATA" }, \ - { XFS_BMAPI_EXACT, "EXACT" }, \ { XFS_BMAPI_ATTRFORK, "ATTRFORK" }, \ - { XFS_BMAPI_ASYNC, "ASYNC" }, \ { XFS_BMAPI_RSVBLOCKS, "RSVBLOCKS" }, \ { XFS_BMAPI_PREALLOC, "PREALLOC" }, \ { XFS_BMAPI_IGSTATE, "IGSTATE" }, \ @@ -310,9 +291,7 @@ xfs_bmapi( xfs_extlen_t total, /* total blocks needed */ struct xfs_bmbt_irec *mval, /* output: map values */ int *nmap, /* i/o: mval size/count */ - xfs_bmap_free_t *flist, /* i/o: list extents to free */ - xfs_extdelta_t *delta); /* o: change made to incore - extents */ + xfs_bmap_free_t *flist); /* i/o: list extents to free */ /* * Map file blocks to filesystem blocks, simple version. @@ -346,8 +325,6 @@ xfs_bunmapi( xfs_fsblock_t *firstblock, /* first allocated block controls a.g. 
for allocs */ xfs_bmap_free_t *flist, /* i/o: list extents to free */ - xfs_extdelta_t *delta, /* o: change made to incore - extents */ int *done); /* set if not done yet */ /* diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c index 416e47e54b83..87d3c10b6954 100644 --- a/fs/xfs/xfs_bmap_btree.c +++ b/fs/xfs/xfs_bmap_btree.c @@ -24,21 +24,16 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_bmap_btree.h" #include "xfs_alloc_btree.h" #include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_inode_item.h" #include "xfs_alloc.h" #include "xfs_btree.h" #include "xfs_btree_trace.h" -#include "xfs_ialloc.h" #include "xfs_itable.h" #include "xfs_bmap.h" #include "xfs_error.h" diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index 96be4b0f2496..829af92f0fba 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c @@ -24,20 +24,15 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_bmap_btree.h" #include "xfs_alloc_btree.h" #include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_inode_item.h" #include "xfs_btree.h" #include "xfs_btree_trace.h" -#include "xfs_ialloc.h" #include "xfs_error.h" #include "xfs_trace.h" diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 02a80984aa05..1b09d7a280df 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -24,7 +24,6 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_buf_item.h" #include "xfs_trans_priv.h" @@ -34,6 +33,12 @@ kmem_zone_t *xfs_buf_item_zone; +static inline struct xfs_buf_log_item *BUF_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_buf_log_item, bli_item); +} + + #ifdef XFS_TRANS_DEBUG /* * This function uses an alternate strategy for tracking the bytes @@ -151,12 +156,13 @@ STATIC void xfs_buf_do_callbacks(xfs_buf_t *bp, xfs_log_item_t *lip); */ STATIC uint xfs_buf_item_size( - xfs_buf_log_item_t *bip) + struct xfs_log_item *lip) { - uint nvecs; - int next_bit; - int last_bit; - xfs_buf_t *bp; + struct xfs_buf_log_item *bip = BUF_ITEM(lip); + struct xfs_buf *bp = bip->bli_buf; + uint nvecs; + int next_bit; + int last_bit; ASSERT(atomic_read(&bip->bli_refcount) > 0); if (bip->bli_flags & XFS_BLI_STALE) { @@ -170,7 +176,6 @@ xfs_buf_item_size( return 1; } - bp = bip->bli_buf; ASSERT(bip->bli_flags & XFS_BLI_LOGGED); nvecs = 1; last_bit = xfs_next_bit(bip->bli_format.blf_data_map, @@ -219,13 +224,13 @@ xfs_buf_item_size( */ STATIC void xfs_buf_item_format( - xfs_buf_log_item_t *bip, - xfs_log_iovec_t *log_vector) + struct xfs_log_item *lip, + struct xfs_log_iovec *vecp) { + struct xfs_buf_log_item *bip = BUF_ITEM(lip); + struct xfs_buf *bp = bip->bli_buf; uint base_size; uint nvecs; - xfs_log_iovec_t *vecp; - xfs_buf_t *bp; int first_bit; int last_bit; int next_bit; @@ -235,8 +240,6 @@ xfs_buf_item_format( ASSERT(atomic_read(&bip->bli_refcount) > 0); ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || (bip->bli_flags & XFS_BLI_STALE)); - bp = bip->bli_buf; - vecp = log_vector; /* * The size of the base structure is the size of the @@ -248,7 +251,7 @@ xfs_buf_item_format( base_size = (uint)(sizeof(xfs_buf_log_format_t) + ((bip->bli_format.blf_map_size - 
1) * sizeof(uint))); - vecp->i_addr = (xfs_caddr_t)&bip->bli_format; + vecp->i_addr = &bip->bli_format; vecp->i_len = base_size; vecp->i_type = XLOG_REG_TYPE_BFORMAT; vecp++; @@ -263,7 +266,7 @@ xfs_buf_item_format( */ if (bip->bli_flags & XFS_BLI_INODE_BUF) { if (!((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && - xfs_log_item_in_current_chkpt(&bip->bli_item))) + xfs_log_item_in_current_chkpt(lip))) bip->bli_format.blf_flags |= XFS_BLF_INODE_BUF; bip->bli_flags &= ~XFS_BLI_INODE_BUF; } @@ -356,66 +359,90 @@ xfs_buf_item_format( /* * This is called to pin the buffer associated with the buf log item in memory - * so it cannot be written out. Simply call bpin() on the buffer to do this. + * so it cannot be written out. * * We also always take a reference to the buffer log item here so that the bli * is held while the item is pinned in memory. This means that we can * unconditionally drop the reference count a transaction holds when the * transaction is completed. */ - STATIC void xfs_buf_item_pin( - xfs_buf_log_item_t *bip) + struct xfs_log_item *lip) { - xfs_buf_t *bp; + struct xfs_buf_log_item *bip = BUF_ITEM(lip); - bp = bip->bli_buf; - ASSERT(XFS_BUF_ISBUSY(bp)); + ASSERT(XFS_BUF_ISBUSY(bip->bli_buf)); ASSERT(atomic_read(&bip->bli_refcount) > 0); ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || (bip->bli_flags & XFS_BLI_STALE)); - atomic_inc(&bip->bli_refcount); + trace_xfs_buf_item_pin(bip); - xfs_bpin(bp); -} + atomic_inc(&bip->bli_refcount); + atomic_inc(&bip->bli_buf->b_pin_count); +} /* * This is called to unpin the buffer associated with the buf log * item which was previously pinned with a call to xfs_buf_item_pin(). - * Just call bunpin() on the buffer to do this. * * Also drop the reference to the buf item for the current transaction. * If the XFS_BLI_STALE flag is set and we are the last reference, * then free up the buf log item and unlock the buffer. + * + * If the remove flag is set we are called from uncommit in the + * forced-shutdown path. If that is true and the reference count on + * the log item is going to drop to zero we need to free the item's + * descriptor in the transaction. */ STATIC void xfs_buf_item_unpin( - xfs_buf_log_item_t *bip) + struct xfs_log_item *lip, + int remove) { - struct xfs_ail *ailp; - xfs_buf_t *bp; - int freed; + struct xfs_buf_log_item *bip = BUF_ITEM(lip); + xfs_buf_t *bp = bip->bli_buf; + struct xfs_ail *ailp = lip->li_ailp; int stale = bip->bli_flags & XFS_BLI_STALE; + int freed; - bp = bip->bli_buf; - ASSERT(bp != NULL); ASSERT(XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *) == bip); ASSERT(atomic_read(&bip->bli_refcount) > 0); + trace_xfs_buf_item_unpin(bip); freed = atomic_dec_and_test(&bip->bli_refcount); - ailp = bip->bli_item.li_ailp; - xfs_bunpin(bp); + + if (atomic_dec_and_test(&bp->b_pin_count)) + wake_up_all(&bp->b_waiters); + if (freed && stale) { ASSERT(bip->bli_flags & XFS_BLI_STALE); ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); ASSERT(!(XFS_BUF_ISDELAYWRITE(bp))); ASSERT(XFS_BUF_ISSTALE(bp)); ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); + trace_xfs_buf_item_unpin_stale(bip); + if (remove) { + /* + * We have to remove the log item from the transaction + * as we are about to release our reference to the + * buffer. If we don't, the unlock that occurs later + * in xfs_trans_uncommit() will ry to reference the + * buffer which we no longer have a hold on. + */ + xfs_trans_del_item(lip); + + /* + * Since the transaction no longer refers to the buffer, + * the buffer should no longer refer to the transaction. 
+ */ + XFS_BUF_SET_FSPRIVATE2(bp, NULL); + } + /* * If we get called here because of an IO error, we may * or may not have the item on the AIL. xfs_trans_ail_delete() @@ -437,48 +464,6 @@ xfs_buf_item_unpin( } /* - * this is called from uncommit in the forced-shutdown path. - * we need to check to see if the reference count on the log item - * is going to drop to zero. If so, unpin will free the log item - * so we need to free the item's descriptor (that points to the item) - * in the transaction. - */ -STATIC void -xfs_buf_item_unpin_remove( - xfs_buf_log_item_t *bip, - xfs_trans_t *tp) -{ - /* will xfs_buf_item_unpin() call xfs_buf_item_relse()? */ - if ((atomic_read(&bip->bli_refcount) == 1) && - (bip->bli_flags & XFS_BLI_STALE)) { - /* - * yes -- We can safely do some work here and then call - * buf_item_unpin to do the rest because we are - * are holding the buffer locked so no one else will be - * able to bump up the refcount. We have to remove the - * log item from the transaction as we are about to release - * our reference to the buffer. If we don't, the unlock that - * occurs later in the xfs_trans_uncommit() will try to - * reference the buffer which we no longer have a hold on. - */ - struct xfs_log_item_desc *lidp; - - ASSERT(XFS_BUF_VALUSEMA(bip->bli_buf) <= 0); - trace_xfs_buf_item_unpin_stale(bip); - - lidp = xfs_trans_find_item(tp, (xfs_log_item_t *)bip); - xfs_trans_free_item(tp, lidp); - - /* - * Since the transaction no longer refers to the buffer, the - * buffer should no longer refer to the transaction. - */ - XFS_BUF_SET_FSPRIVATE2(bip->bli_buf, NULL); - } - xfs_buf_item_unpin(bip); -} - -/* * This is called to attempt to lock the buffer associated with this * buf log item. Don't sleep on the buffer lock. If we can't get * the lock right away, return 0. If we can get the lock, take a @@ -488,11 +473,11 @@ xfs_buf_item_unpin_remove( */ STATIC uint xfs_buf_item_trylock( - xfs_buf_log_item_t *bip) + struct xfs_log_item *lip) { - xfs_buf_t *bp; + struct xfs_buf_log_item *bip = BUF_ITEM(lip); + struct xfs_buf *bp = bip->bli_buf; - bp = bip->bli_buf; if (XFS_BUF_ISPINNED(bp)) return XFS_ITEM_PINNED; if (!XFS_BUF_CPSEMA(bp)) @@ -529,13 +514,12 @@ xfs_buf_item_trylock( */ STATIC void xfs_buf_item_unlock( - xfs_buf_log_item_t *bip) + struct xfs_log_item *lip) { - int aborted; - xfs_buf_t *bp; - uint hold; - - bp = bip->bli_buf; + struct xfs_buf_log_item *bip = BUF_ITEM(lip); + struct xfs_buf *bp = bip->bli_buf; + int aborted; + uint hold; /* Clear the buffer's association with this transaction. */ XFS_BUF_SET_FSPRIVATE2(bp, NULL); @@ -546,7 +530,7 @@ xfs_buf_item_unlock( * (cancelled) buffers at unpin time, but we'll never go through the * pin/unpin cycle if we abort inside commit. 
*/ - aborted = (bip->bli_item.li_flags & XFS_LI_ABORTED) != 0; + aborted = (lip->li_flags & XFS_LI_ABORTED) != 0; /* * Before possibly freeing the buf item, determine if we should @@ -607,16 +591,16 @@ xfs_buf_item_unlock( */ STATIC xfs_lsn_t xfs_buf_item_committed( - xfs_buf_log_item_t *bip, + struct xfs_log_item *lip, xfs_lsn_t lsn) { + struct xfs_buf_log_item *bip = BUF_ITEM(lip); + trace_xfs_buf_item_committed(bip); - if ((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && - (bip->bli_item.li_lsn != 0)) { - return bip->bli_item.li_lsn; - } - return (lsn); + if ((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && lip->li_lsn != 0) + return lip->li_lsn; + return lsn; } /* @@ -626,15 +610,16 @@ xfs_buf_item_committed( */ STATIC void xfs_buf_item_push( - xfs_buf_log_item_t *bip) + struct xfs_log_item *lip) { - xfs_buf_t *bp; + struct xfs_buf_log_item *bip = BUF_ITEM(lip); + struct xfs_buf *bp = bip->bli_buf; ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); + ASSERT(!XFS_BUF_ISDELAYWRITE(bp)); + trace_xfs_buf_item_push(bip); - bp = bip->bli_buf; - ASSERT(!XFS_BUF_ISDELAYWRITE(bp)); xfs_buf_relse(bp); } @@ -646,22 +631,24 @@ xfs_buf_item_push( */ STATIC void xfs_buf_item_pushbuf( - xfs_buf_log_item_t *bip) + struct xfs_log_item *lip) { - xfs_buf_t *bp; + struct xfs_buf_log_item *bip = BUF_ITEM(lip); + struct xfs_buf *bp = bip->bli_buf; ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); + ASSERT(XFS_BUF_ISDELAYWRITE(bp)); + trace_xfs_buf_item_pushbuf(bip); - bp = bip->bli_buf; - ASSERT(XFS_BUF_ISDELAYWRITE(bp)); xfs_buf_delwri_promote(bp); xfs_buf_relse(bp); } -/* ARGSUSED */ STATIC void -xfs_buf_item_committing(xfs_buf_log_item_t *bip, xfs_lsn_t commit_lsn) +xfs_buf_item_committing( + struct xfs_log_item *lip, + xfs_lsn_t commit_lsn) { } @@ -669,21 +656,16 @@ xfs_buf_item_committing(xfs_buf_log_item_t *bip, xfs_lsn_t commit_lsn) * This is the ops vector shared by all buf log items. */ static struct xfs_item_ops xfs_buf_item_ops = { - .iop_size = (uint(*)(xfs_log_item_t*))xfs_buf_item_size, - .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) - xfs_buf_item_format, - .iop_pin = (void(*)(xfs_log_item_t*))xfs_buf_item_pin, - .iop_unpin = (void(*)(xfs_log_item_t*))xfs_buf_item_unpin, - .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t *)) - xfs_buf_item_unpin_remove, - .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_buf_item_trylock, - .iop_unlock = (void(*)(xfs_log_item_t*))xfs_buf_item_unlock, - .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t)) - xfs_buf_item_committed, - .iop_push = (void(*)(xfs_log_item_t*))xfs_buf_item_push, - .iop_pushbuf = (void(*)(xfs_log_item_t*))xfs_buf_item_pushbuf, - .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t)) - xfs_buf_item_committing + .iop_size = xfs_buf_item_size, + .iop_format = xfs_buf_item_format, + .iop_pin = xfs_buf_item_pin, + .iop_unpin = xfs_buf_item_unpin, + .iop_trylock = xfs_buf_item_trylock, + .iop_unlock = xfs_buf_item_unlock, + .iop_committed = xfs_buf_item_committed, + .iop_push = xfs_buf_item_push, + .iop_pushbuf = xfs_buf_item_pushbuf, + .iop_committing = xfs_buf_item_committing }; @@ -712,7 +694,6 @@ xfs_buf_item_init( */ if (bp->b_mount != mp) bp->b_mount = mp; - XFS_BUF_SET_BDSTRAT_FUNC(bp, xfs_bdstrat_cb); if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) { lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); if (lip->li_type == XFS_LI_BUF) { @@ -1098,15 +1079,14 @@ xfs_buf_error_relse( * It is called by xfs_buf_iodone_callbacks() above which will take * care of cleaning up the buffer itself. 
*/ -/* ARGSUSED */ void xfs_buf_iodone( - xfs_buf_t *bp, - xfs_buf_log_item_t *bip) + struct xfs_buf *bp, + struct xfs_log_item *lip) { - struct xfs_ail *ailp = bip->bli_item.li_ailp; + struct xfs_ail *ailp = lip->li_ailp; - ASSERT(bip->bli_buf == bp); + ASSERT(BUF_ITEM(lip)->bli_buf == bp); xfs_buf_rele(bp); @@ -1120,6 +1100,6 @@ xfs_buf_iodone( * Either way, AIL is useless if we're forcing a shutdown. */ spin_lock(&ailp->xa_lock); - xfs_trans_ail_delete(ailp, (xfs_log_item_t *)bip); - xfs_buf_item_free(bip); + xfs_trans_ail_delete(ailp, lip); + xfs_buf_item_free(BUF_ITEM(lip)); } diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index f20bb472d582..0e2ed43f16c7 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -124,7 +124,7 @@ void xfs_buf_attach_iodone(struct xfs_buf *, void(*)(struct xfs_buf *, xfs_log_item_t *), xfs_log_item_t *); void xfs_buf_iodone_callbacks(struct xfs_buf *); -void xfs_buf_iodone(struct xfs_buf *, xfs_buf_log_item_t *); +void xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *); #ifdef XFS_TRANS_DEBUG void diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index 0ca556b4bf31..30fa0e206fba 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -25,19 +25,14 @@ #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_da_btree.h" #include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" #include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_inode_item.h" #include "xfs_alloc.h" -#include "xfs_btree.h" #include "xfs_bmap.h" #include "xfs_attr.h" #include "xfs_attr_leaf.h" @@ -581,16 +576,14 @@ xfs_da_node_add(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, xfs_da_intnode_t *node; xfs_da_node_entry_t *btree; int tmp; - xfs_mount_t *mp; node = oldblk->bp->data; - mp = state->mp; ASSERT(be16_to_cpu(node->hdr.info.magic) == XFS_DA_NODE_MAGIC); ASSERT((oldblk->index >= 0) && (oldblk->index <= be16_to_cpu(node->hdr.count))); ASSERT(newblk->blkno != 0); if (state->args->whichfork == XFS_DATA_FORK) - ASSERT(newblk->blkno >= mp->m_dirleafblk && - newblk->blkno < mp->m_dirfreeblk); + ASSERT(newblk->blkno >= state->mp->m_dirleafblk && + newblk->blkno < state->mp->m_dirfreeblk); /* * We may need to make some room before we insert the new node. 
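The buffer log item conversion above follows one pattern throughout: each iop_* callback now receives the generic struct xfs_log_item pointer and recovers its specific item through a container_of() helper (BUF_ITEM), instead of the old ops vector full of casted function pointers. A minimal stand-alone sketch of that pattern, using hypothetical log_item/buf_item types rather than the real XFS structures:

#include <stddef.h>
#include <stdio.h>

struct log_item {
	unsigned int	li_type;
	unsigned int	li_flags;
};

struct buf_item {
	struct log_item	li;	/* embedded generic item, like bli_item in XFS */
	int		pinned;	/* stand-in for the buffer state */
};

/* Same role as the BUF_ITEM() helper introduced by the patch. */
static inline struct buf_item *to_buf_item(struct log_item *lip)
{
	return (struct buf_item *)((char *)lip - offsetof(struct buf_item, li));
}

/* Callbacks take the generic item; no casts needed in the ops vector. */
static void buf_item_pin(struct log_item *lip)
{
	to_buf_item(lip)->pinned++;
}

int main(void)
{
	struct buf_item bi = { .li = { .li_type = 1 }, .pinned = 0 };

	buf_item_pin(&bi.li);
	printf("pinned=%d\n", bi.pinned);
	return 0;
}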
@@ -1601,7 +1594,7 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno) xfs_bmapi_aflag(w)|XFS_BMAPI_WRITE|XFS_BMAPI_METADATA| XFS_BMAPI_CONTIG, args->firstblock, args->total, &map, &nmap, - args->flist, NULL))) { + args->flist))) { return error; } ASSERT(nmap <= 1); @@ -1622,8 +1615,7 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno) xfs_bmapi_aflag(w)|XFS_BMAPI_WRITE| XFS_BMAPI_METADATA, args->firstblock, args->total, - &mapp[mapi], &nmap, args->flist, - NULL))) { + &mapp[mapi], &nmap, args->flist))) { kmem_free(mapp); return error; } @@ -1884,7 +1876,7 @@ xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno, */ if ((error = xfs_bunmapi(tp, dp, dead_blkno, count, xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA, - 0, args->firstblock, args->flist, NULL, + 0, args->firstblock, args->flist, &done)) == ENOSPC) { if (w != XFS_DATA_FORK) break; @@ -1989,7 +1981,7 @@ xfs_da_do_buf( nfsb, XFS_BMAPI_METADATA | xfs_bmapi_aflag(whichfork), - NULL, 0, mapp, &nmap, NULL, NULL))) + NULL, 0, mapp, &nmap, NULL))) goto exit0; } } else { diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index 5bba29a07812..3b9582c60a22 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -24,24 +24,15 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_inode_item.h" #include "xfs_bmap.h" -#include "xfs_btree.h" -#include "xfs_ialloc.h" #include "xfs_itable.h" #include "xfs_dfrag.h" #include "xfs_error.h" -#include "xfs_rw.h" #include "xfs_vnodeops.h" #include "xfs_trace.h" @@ -69,7 +60,9 @@ xfs_swapext( goto out; } - if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND)) { + if (!(file->f_mode & FMODE_WRITE) || + !(file->f_mode & FMODE_READ) || + (file->f_flags & O_APPEND)) { error = XFS_ERROR(EBADF); goto out_put_file; } @@ -81,6 +74,7 @@ xfs_swapext( } if (!(tmp_file->f_mode & FMODE_WRITE) || + !(tmp_file->f_mode & FMODE_READ) || (tmp_file->f_flags & O_APPEND)) { error = XFS_ERROR(EBADF); goto out_put_tmp_file; @@ -422,11 +416,8 @@ xfs_swap_extents( } - IHOLD(ip); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - - IHOLD(tip); - xfs_trans_ijoin(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + xfs_trans_ijoin_ref(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); xfs_trans_log_inode(tp, ip, ilf_fields); xfs_trans_log_inode(tp, tip, tilf_fields); diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c index 42520f041265..a1321bc7f192 100644 --- a/fs/xfs/xfs_dir2.c +++ b/fs/xfs/xfs_dir2.c @@ -25,13 +25,11 @@ #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_da_btree.h" #include "xfs_bmap_btree.h" #include "xfs_alloc_btree.h" #include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_inode_item.h" @@ -382,7 +380,7 @@ xfs_readdir( int rval; /* return value */ int v; /* type-checking value */ - xfs_itrace_entry(dp); + trace_xfs_readdir(dp); if (XFS_FORCED_SHUTDOWN(dp->i_mount)) return XFS_ERROR(EIO); @@ -549,7 +547,7 @@ xfs_dir2_grow_inode( if ((error = xfs_bmapi(tp, dp, bno, count, XFS_BMAPI_WRITE|XFS_BMAPI_METADATA|XFS_BMAPI_CONTIG, args->firstblock, args->total, &map, &nmap, - args->flist, NULL))) + 
args->flist))) return error; ASSERT(nmap <= 1); if (nmap == 1) { @@ -581,8 +579,7 @@ xfs_dir2_grow_inode( if ((error = xfs_bmapi(tp, dp, b, c, XFS_BMAPI_WRITE|XFS_BMAPI_METADATA, args->firstblock, args->total, - &mapp[mapi], &nmap, args->flist, - NULL))) { + &mapp[mapi], &nmap, args->flist))) { kmem_free(mapp); return error; } @@ -715,7 +712,7 @@ xfs_dir2_shrink_inode( */ if ((error = xfs_bunmapi(tp, dp, da, mp->m_dirblkfsbs, XFS_BMAPI_METADATA, 0, args->firstblock, args->flist, - NULL, &done))) { + &done))) { /* * ENOSPC actually can happen if we're in a removename with * no space reservation, and the resulting block removal diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c index 779a267b0a84..580d99cef9e7 100644 --- a/fs/xfs/xfs_dir2_block.c +++ b/fs/xfs/xfs_dir2_block.c @@ -24,12 +24,10 @@ #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_da_btree.h" #include "xfs_bmap_btree.h" #include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_inode_item.h" @@ -1073,10 +1071,10 @@ xfs_dir2_sf_to_block( */ buf_len = dp->i_df.if_bytes; - buf = kmem_alloc(dp->i_df.if_bytes, KM_SLEEP); + buf = kmem_alloc(buf_len, KM_SLEEP); - memcpy(buf, sfp, dp->i_df.if_bytes); - xfs_idata_realloc(dp, -dp->i_df.if_bytes, XFS_DATA_FORK); + memcpy(buf, sfp, buf_len); + xfs_idata_realloc(dp, -buf_len, XFS_DATA_FORK); dp->i_d.di_size = 0; xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); /* diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c index 498f8d694330..921595b84f5b 100644 --- a/fs/xfs/xfs_dir2_data.c +++ b/fs/xfs/xfs_dir2_data.c @@ -24,12 +24,10 @@ #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_da_btree.h" #include "xfs_bmap_btree.h" #include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_dir2_data.h" diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index e2d89854ec9e..504be8640e91 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -25,11 +25,9 @@ #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_da_btree.h" #include "xfs_bmap_btree.h" -#include "xfs_attr_sf.h" #include "xfs_dir2_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" @@ -875,7 +873,7 @@ xfs_dir2_leaf_getdents( xfs_dir2_byte_to_da(mp, XFS_DIR2_LEAF_OFFSET) - map_off, XFS_BMAPI_METADATA, NULL, 0, - &map[map_valid], &nmap, NULL, NULL); + &map[map_valid], &nmap, NULL); /* * Don't know if we should ignore this or * try to return an error. 
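The xfs_bmapi()/xfs_bunmapi() call sites in the directory and attr code above simply drop the trailing xfs_extdelta_t argument they always passed as NULL. The bookkeeping being deleted tracked the file range touched by a mapping call: each update widened the range starting at xed_startoff, with xed_blockcount holding an end offset while accumulating and being converted to a length on return. A small stand-alone sketch of that accounting, with generic names instead of the XFS types:

#include <stdio.h>

#define NULLOFF	(~0UL)	/* stands in for NULLFILEOFF */

struct extdelta {
	unsigned long	start;	/* lowest offset touched */
	unsigned long	end;	/* end offset while accumulating; a length on return */
};

static void delta_init(struct extdelta *d)
{
	d->start = NULLOFF;
	d->end = 0;
}

/* Widen the tracked range to cover [off, off + count). */
static void delta_update(struct extdelta *d, unsigned long off, unsigned long count)
{
	if (d->start > off)
		d->start = off;
	if (d->end < off + count)
		d->end = off + count;
}

int main(void)
{
	struct extdelta d;

	delta_init(&d);
	delta_update(&d, 10, 4);	/* blocks 10..13 changed */
	delta_update(&d, 6, 2);		/* blocks 6..7 changed */

	if (d.start != NULLOFF) {
		d.end -= d.start;	/* convert the end offset into a block count */
		printf("changed range: start %lu, %lu blocks\n", d.start, d.end);
	}
	return 0;
}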
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c index 78fc4d9ae756..f9a0864b696a 100644 --- a/fs/xfs/xfs_dir2_node.c +++ b/fs/xfs/xfs_dir2_node.c @@ -24,12 +24,10 @@ #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_da_btree.h" #include "xfs_bmap_btree.h" #include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_bmap.h" diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c index c1a5945d463a..b1bae6b1eed9 100644 --- a/fs/xfs/xfs_dir2_sf.c +++ b/fs/xfs/xfs_dir2_sf.c @@ -24,12 +24,10 @@ #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_da_btree.h" #include "xfs_bmap_btree.h" #include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_inode_item.h" diff --git a/fs/xfs/xfs_dmapi.h b/fs/xfs/xfs_dmapi.h deleted file mode 100644 index 2813cdd72375..000000000000 --- a/fs/xfs/xfs_dmapi.h +++ /dev/null @@ -1,170 +0,0 @@ -/* - * Copyright (c) 2000-2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_DMAPI_H__ -#define __XFS_DMAPI_H__ - -/* Values used to define the on-disk version of dm_attrname_t. All - * on-disk attribute names start with the 8-byte string "SGI_DMI_". - * - * In the on-disk inode, DMAPI attribute names consist of the user-provided - * name with the DMATTR_PREFIXSTRING pre-pended. This string must NEVER be - * changed. - */ - -#define DMATTR_PREFIXLEN 8 -#define DMATTR_PREFIXSTRING "SGI_DMI_" - -typedef enum { - DM_EVENT_INVALID = -1, - DM_EVENT_CANCEL = 0, /* not supported */ - DM_EVENT_MOUNT = 1, - DM_EVENT_PREUNMOUNT = 2, - DM_EVENT_UNMOUNT = 3, - DM_EVENT_DEBUT = 4, /* not supported */ - DM_EVENT_CREATE = 5, - DM_EVENT_CLOSE = 6, /* not supported */ - DM_EVENT_POSTCREATE = 7, - DM_EVENT_REMOVE = 8, - DM_EVENT_POSTREMOVE = 9, - DM_EVENT_RENAME = 10, - DM_EVENT_POSTRENAME = 11, - DM_EVENT_LINK = 12, - DM_EVENT_POSTLINK = 13, - DM_EVENT_SYMLINK = 14, - DM_EVENT_POSTSYMLINK = 15, - DM_EVENT_READ = 16, - DM_EVENT_WRITE = 17, - DM_EVENT_TRUNCATE = 18, - DM_EVENT_ATTRIBUTE = 19, - DM_EVENT_DESTROY = 20, - DM_EVENT_NOSPACE = 21, - DM_EVENT_USER = 22, - DM_EVENT_MAX = 23 -} dm_eventtype_t; -#define HAVE_DM_EVENTTYPE_T - -typedef enum { - DM_RIGHT_NULL, - DM_RIGHT_SHARED, - DM_RIGHT_EXCL -} dm_right_t; -#define HAVE_DM_RIGHT_T - -/* Defines for determining if an event message should be sent. 
*/ -#ifdef HAVE_DMAPI -#define DM_EVENT_ENABLED(ip, event) ( \ - unlikely ((ip)->i_mount->m_flags & XFS_MOUNT_DMAPI) && \ - ( ((ip)->i_d.di_dmevmask & (1 << event)) || \ - ((ip)->i_mount->m_dmevmask & (1 << event)) ) \ - ) -#else -#define DM_EVENT_ENABLED(ip, event) (0) -#endif - -#define DM_XFS_VALID_FS_EVENTS ( \ - (1 << DM_EVENT_PREUNMOUNT) | \ - (1 << DM_EVENT_UNMOUNT) | \ - (1 << DM_EVENT_NOSPACE) | \ - (1 << DM_EVENT_DEBUT) | \ - (1 << DM_EVENT_CREATE) | \ - (1 << DM_EVENT_POSTCREATE) | \ - (1 << DM_EVENT_REMOVE) | \ - (1 << DM_EVENT_POSTREMOVE) | \ - (1 << DM_EVENT_RENAME) | \ - (1 << DM_EVENT_POSTRENAME) | \ - (1 << DM_EVENT_LINK) | \ - (1 << DM_EVENT_POSTLINK) | \ - (1 << DM_EVENT_SYMLINK) | \ - (1 << DM_EVENT_POSTSYMLINK) | \ - (1 << DM_EVENT_ATTRIBUTE) | \ - (1 << DM_EVENT_DESTROY) ) - -/* Events valid in dm_set_eventlist() when called with a file handle for - a regular file or a symlink. These events are persistent. -*/ - -#define DM_XFS_VALID_FILE_EVENTS ( \ - (1 << DM_EVENT_ATTRIBUTE) | \ - (1 << DM_EVENT_DESTROY) ) - -/* Events valid in dm_set_eventlist() when called with a file handle for - a directory. These events are persistent. -*/ - -#define DM_XFS_VALID_DIRECTORY_EVENTS ( \ - (1 << DM_EVENT_CREATE) | \ - (1 << DM_EVENT_POSTCREATE) | \ - (1 << DM_EVENT_REMOVE) | \ - (1 << DM_EVENT_POSTREMOVE) | \ - (1 << DM_EVENT_RENAME) | \ - (1 << DM_EVENT_POSTRENAME) | \ - (1 << DM_EVENT_LINK) | \ - (1 << DM_EVENT_POSTLINK) | \ - (1 << DM_EVENT_SYMLINK) | \ - (1 << DM_EVENT_POSTSYMLINK) | \ - (1 << DM_EVENT_ATTRIBUTE) | \ - (1 << DM_EVENT_DESTROY) ) - -/* Events supported by the XFS filesystem. */ -#define DM_XFS_SUPPORTED_EVENTS ( \ - (1 << DM_EVENT_MOUNT) | \ - (1 << DM_EVENT_PREUNMOUNT) | \ - (1 << DM_EVENT_UNMOUNT) | \ - (1 << DM_EVENT_NOSPACE) | \ - (1 << DM_EVENT_CREATE) | \ - (1 << DM_EVENT_POSTCREATE) | \ - (1 << DM_EVENT_REMOVE) | \ - (1 << DM_EVENT_POSTREMOVE) | \ - (1 << DM_EVENT_RENAME) | \ - (1 << DM_EVENT_POSTRENAME) | \ - (1 << DM_EVENT_LINK) | \ - (1 << DM_EVENT_POSTLINK) | \ - (1 << DM_EVENT_SYMLINK) | \ - (1 << DM_EVENT_POSTSYMLINK) | \ - (1 << DM_EVENT_READ) | \ - (1 << DM_EVENT_WRITE) | \ - (1 << DM_EVENT_TRUNCATE) | \ - (1 << DM_EVENT_ATTRIBUTE) | \ - (1 << DM_EVENT_DESTROY) ) - - -/* - * Definitions used for the flags field on dm_send_*_event(). - */ - -#define DM_FLAGS_NDELAY 0x001 /* return EAGAIN after dm_pending() */ -#define DM_FLAGS_UNWANTED 0x002 /* event not in fsys dm_eventset_t */ -#define DM_FLAGS_IMUX 0x004 /* thread holds i_mutex */ -#define DM_FLAGS_IALLOCSEM_RD 0x010 /* thread holds i_alloc_sem rd */ -#define DM_FLAGS_IALLOCSEM_WR 0x020 /* thread holds i_alloc_sem wr */ - -/* - * Pull in platform specific event flags defines - */ -#include "xfs_dmapi_priv.h" - -/* - * Macros to turn caller specified delay/block flags into - * dm_send_xxxx_event flag DM_FLAGS_NDELAY. - */ - -#define FILP_DELAY_FLAG(filp) ((filp->f_flags&(O_NDELAY|O_NONBLOCK)) ? \ - DM_FLAGS_NDELAY : 0) -#define AT_DELAY_FLAG(f) ((f & XFS_ATTR_NONBLOCK) ? DM_FLAGS_NDELAY : 0) - -#endif /* __XFS_DMAPI_H__ */ diff --git a/fs/xfs/xfs_dmops.c b/fs/xfs/xfs_dmops.c deleted file mode 100644 index e71e2581c0c3..000000000000 --- a/fs/xfs/xfs_dmops.c +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include "xfs.h" -#include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_log.h" -#include "xfs_trans.h" -#include "xfs_sb.h" -#include "xfs_dmapi.h" -#include "xfs_inum.h" -#include "xfs_ag.h" -#include "xfs_mount.h" - - -static struct xfs_dmops xfs_dmcore_stub = { - .xfs_send_data = (xfs_send_data_t)fs_nosys, - .xfs_send_mmap = (xfs_send_mmap_t)fs_noerr, - .xfs_send_destroy = (xfs_send_destroy_t)fs_nosys, - .xfs_send_namesp = (xfs_send_namesp_t)fs_nosys, - .xfs_send_mount = (xfs_send_mount_t)fs_nosys, - .xfs_send_unmount = (xfs_send_unmount_t)fs_noerr, -}; - -int -xfs_dmops_get(struct xfs_mount *mp) -{ - if (mp->m_flags & XFS_MOUNT_DMAPI) { - cmn_err(CE_WARN, - "XFS: dmapi support not available in this kernel."); - return EINVAL; - } - - mp->m_dm_ops = &xfs_dmcore_stub; - return 0; -} - -void -xfs_dmops_put(struct xfs_mount *mp) -{ -} diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 047b8a8e5c29..ed9990267661 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -23,12 +23,8 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_bmap_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_utils.h" diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 409fe81585fd..a55e687bf562 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -24,7 +24,6 @@ #include "xfs_buf_item.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_trans_priv.h" #include "xfs_extfree_item.h" @@ -33,18 +32,19 @@ kmem_zone_t *xfs_efi_zone; kmem_zone_t *xfs_efd_zone; -STATIC void xfs_efi_item_unlock(xfs_efi_log_item_t *); +static inline struct xfs_efi_log_item *EFI_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_efi_log_item, efi_item); +} void -xfs_efi_item_free(xfs_efi_log_item_t *efip) +xfs_efi_item_free( + struct xfs_efi_log_item *efip) { - int nexts = efip->efi_format.efi_nextents; - - if (nexts > XFS_EFI_MAX_FAST_EXTENTS) { + if (efip->efi_format.efi_nextents > XFS_EFI_MAX_FAST_EXTENTS) kmem_free(efip); - } else { + else kmem_zone_free(xfs_efi_zone, efip); - } } /* @@ -52,9 +52,9 @@ xfs_efi_item_free(xfs_efi_log_item_t *efip) * We only need 1 iovec for an efi item. It just logs the efi_log_format * structure. */ -/*ARGSUSED*/ STATIC uint -xfs_efi_item_size(xfs_efi_log_item_t *efip) +xfs_efi_item_size( + struct xfs_log_item *lip) { return 1; } @@ -67,10 +67,12 @@ xfs_efi_item_size(xfs_efi_log_item_t *efip) * slots in the efi item have been filled. 
*/ STATIC void -xfs_efi_item_format(xfs_efi_log_item_t *efip, - xfs_log_iovec_t *log_vector) +xfs_efi_item_format( + struct xfs_log_item *lip, + struct xfs_log_iovec *log_vector) { - uint size; + struct xfs_efi_log_item *efip = EFI_ITEM(lip); + uint size; ASSERT(efip->efi_next_extent == efip->efi_format.efi_nextents); @@ -80,7 +82,7 @@ xfs_efi_item_format(xfs_efi_log_item_t *efip, size += (efip->efi_format.efi_nextents - 1) * sizeof(xfs_extent_t); efip->efi_format.efi_size = 1; - log_vector->i_addr = (xfs_caddr_t)&(efip->efi_format); + log_vector->i_addr = &efip->efi_format; log_vector->i_len = size; log_vector->i_type = XLOG_REG_TYPE_EFI_FORMAT; ASSERT(size >= sizeof(xfs_efi_log_format_t)); @@ -90,60 +92,33 @@ xfs_efi_item_format(xfs_efi_log_item_t *efip, /* * Pinning has no meaning for an efi item, so just return. */ -/*ARGSUSED*/ STATIC void -xfs_efi_item_pin(xfs_efi_log_item_t *efip) +xfs_efi_item_pin( + struct xfs_log_item *lip) { - return; } - /* * While EFIs cannot really be pinned, the unpin operation is the * last place at which the EFI is manipulated during a transaction. * Here we coordinate with xfs_efi_cancel() to determine who gets to * free the EFI. */ -/*ARGSUSED*/ -STATIC void -xfs_efi_item_unpin(xfs_efi_log_item_t *efip) -{ - struct xfs_ail *ailp = efip->efi_item.li_ailp; - - spin_lock(&ailp->xa_lock); - if (efip->efi_flags & XFS_EFI_CANCELED) { - /* xfs_trans_ail_delete() drops the AIL lock. */ - xfs_trans_ail_delete(ailp, (xfs_log_item_t *)efip); - xfs_efi_item_free(efip); - } else { - efip->efi_flags |= XFS_EFI_COMMITTED; - spin_unlock(&ailp->xa_lock); - } -} - -/* - * like unpin only we have to also clear the xaction descriptor - * pointing the log item if we free the item. This routine duplicates - * unpin because efi_flags is protected by the AIL lock. Freeing - * the descriptor and then calling unpin would force us to drop the AIL - * lock which would open up a race condition. - */ STATIC void -xfs_efi_item_unpin_remove(xfs_efi_log_item_t *efip, xfs_trans_t *tp) +xfs_efi_item_unpin( + struct xfs_log_item *lip, + int remove) { - struct xfs_ail *ailp = efip->efi_item.li_ailp; - xfs_log_item_desc_t *lidp; + struct xfs_efi_log_item *efip = EFI_ITEM(lip); + struct xfs_ail *ailp = lip->li_ailp; spin_lock(&ailp->xa_lock); if (efip->efi_flags & XFS_EFI_CANCELED) { - /* - * free the xaction descriptor pointing to this item - */ - lidp = xfs_trans_find_item(tp, (xfs_log_item_t *) efip); - xfs_trans_free_item(tp, lidp); + if (remove) + xfs_trans_del_item(lip); /* xfs_trans_ail_delete() drops the AIL lock. */ - xfs_trans_ail_delete(ailp, (xfs_log_item_t *)efip); + xfs_trans_ail_delete(ailp, lip); xfs_efi_item_free(efip); } else { efip->efi_flags |= XFS_EFI_COMMITTED; @@ -158,9 +133,9 @@ xfs_efi_item_unpin_remove(xfs_efi_log_item_t *efip, xfs_trans_t *tp) * XFS_ITEM_PINNED so that the caller will eventually flush the log. * This should help in getting the EFI out of the AIL. */ -/*ARGSUSED*/ STATIC uint -xfs_efi_item_trylock(xfs_efi_log_item_t *efip) +xfs_efi_item_trylock( + struct xfs_log_item *lip) { return XFS_ITEM_PINNED; } @@ -168,13 +143,12 @@ xfs_efi_item_trylock(xfs_efi_log_item_t *efip) /* * Efi items have no locking, so just return. 
*/ -/*ARGSUSED*/ STATIC void -xfs_efi_item_unlock(xfs_efi_log_item_t *efip) +xfs_efi_item_unlock( + struct xfs_log_item *lip) { - if (efip->efi_item.li_flags & XFS_LI_ABORTED) - xfs_efi_item_free(efip); - return; + if (lip->li_flags & XFS_LI_ABORTED) + xfs_efi_item_free(EFI_ITEM(lip)); } /* @@ -183,9 +157,10 @@ xfs_efi_item_unlock(xfs_efi_log_item_t *efip) * flag is not paid any attention here. Checking for that is delayed * until the EFI is unpinned. */ -/*ARGSUSED*/ STATIC xfs_lsn_t -xfs_efi_item_committed(xfs_efi_log_item_t *efip, xfs_lsn_t lsn) +xfs_efi_item_committed( + struct xfs_log_item *lip, + xfs_lsn_t lsn) { return lsn; } @@ -195,11 +170,10 @@ xfs_efi_item_committed(xfs_efi_log_item_t *efip, xfs_lsn_t lsn) * stuck waiting for all of its corresponding efd items to be * committed to disk. */ -/*ARGSUSED*/ STATIC void -xfs_efi_item_push(xfs_efi_log_item_t *efip) +xfs_efi_item_push( + struct xfs_log_item *lip) { - return; } /* @@ -209,61 +183,55 @@ xfs_efi_item_push(xfs_efi_log_item_t *efip) * example, for inodes, the inode is locked throughout the extent freeing * so the dependency should be recorded there. */ -/*ARGSUSED*/ STATIC void -xfs_efi_item_committing(xfs_efi_log_item_t *efip, xfs_lsn_t lsn) +xfs_efi_item_committing( + struct xfs_log_item *lip, + xfs_lsn_t lsn) { - return; } /* * This is the ops vector shared by all efi log items. */ static struct xfs_item_ops xfs_efi_item_ops = { - .iop_size = (uint(*)(xfs_log_item_t*))xfs_efi_item_size, - .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) - xfs_efi_item_format, - .iop_pin = (void(*)(xfs_log_item_t*))xfs_efi_item_pin, - .iop_unpin = (void(*)(xfs_log_item_t*))xfs_efi_item_unpin, - .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t *)) - xfs_efi_item_unpin_remove, - .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_efi_item_trylock, - .iop_unlock = (void(*)(xfs_log_item_t*))xfs_efi_item_unlock, - .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t)) - xfs_efi_item_committed, - .iop_push = (void(*)(xfs_log_item_t*))xfs_efi_item_push, - .iop_pushbuf = NULL, - .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t)) - xfs_efi_item_committing + .iop_size = xfs_efi_item_size, + .iop_format = xfs_efi_item_format, + .iop_pin = xfs_efi_item_pin, + .iop_unpin = xfs_efi_item_unpin, + .iop_trylock = xfs_efi_item_trylock, + .iop_unlock = xfs_efi_item_unlock, + .iop_committed = xfs_efi_item_committed, + .iop_push = xfs_efi_item_push, + .iop_committing = xfs_efi_item_committing }; /* * Allocate and initialize an efi item with the given number of extents. 
*/ -xfs_efi_log_item_t * -xfs_efi_init(xfs_mount_t *mp, - uint nextents) +struct xfs_efi_log_item * +xfs_efi_init( + struct xfs_mount *mp, + uint nextents) { - xfs_efi_log_item_t *efip; + struct xfs_efi_log_item *efip; uint size; ASSERT(nextents > 0); if (nextents > XFS_EFI_MAX_FAST_EXTENTS) { size = (uint)(sizeof(xfs_efi_log_item_t) + ((nextents - 1) * sizeof(xfs_extent_t))); - efip = (xfs_efi_log_item_t*)kmem_zalloc(size, KM_SLEEP); + efip = kmem_zalloc(size, KM_SLEEP); } else { - efip = (xfs_efi_log_item_t*)kmem_zone_zalloc(xfs_efi_zone, - KM_SLEEP); + efip = kmem_zone_zalloc(xfs_efi_zone, KM_SLEEP); } xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops); efip->efi_format.efi_nextents = nextents; efip->efi_format.efi_id = (__psint_t)(void*)efip; - return (efip); + return efip; } /* @@ -276,7 +244,7 @@ xfs_efi_init(xfs_mount_t *mp, int xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt) { - xfs_efi_log_format_t *src_efi_fmt = (xfs_efi_log_format_t *)buf->i_addr; + xfs_efi_log_format_t *src_efi_fmt = buf->i_addr; uint i; uint len = sizeof(xfs_efi_log_format_t) + (src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_t); @@ -289,8 +257,7 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt) memcpy((char *)dst_efi_fmt, (char*)src_efi_fmt, len); return 0; } else if (buf->i_len == len32) { - xfs_efi_log_format_32_t *src_efi_fmt_32 = - (xfs_efi_log_format_32_t *)buf->i_addr; + xfs_efi_log_format_32_t *src_efi_fmt_32 = buf->i_addr; dst_efi_fmt->efi_type = src_efi_fmt_32->efi_type; dst_efi_fmt->efi_size = src_efi_fmt_32->efi_size; @@ -304,8 +271,7 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt) } return 0; } else if (buf->i_len == len64) { - xfs_efi_log_format_64_t *src_efi_fmt_64 = - (xfs_efi_log_format_64_t *)buf->i_addr; + xfs_efi_log_format_64_t *src_efi_fmt_64 = buf->i_addr; dst_efi_fmt->efi_type = src_efi_fmt_64->efi_type; dst_efi_fmt->efi_size = src_efi_fmt_64->efi_size; @@ -356,16 +322,18 @@ xfs_efi_release(xfs_efi_log_item_t *efip, } } -STATIC void -xfs_efd_item_free(xfs_efd_log_item_t *efdp) +static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip) { - int nexts = efdp->efd_format.efd_nextents; + return container_of(lip, struct xfs_efd_log_item, efd_item); +} - if (nexts > XFS_EFD_MAX_FAST_EXTENTS) { +STATIC void +xfs_efd_item_free(struct xfs_efd_log_item *efdp) +{ + if (efdp->efd_format.efd_nextents > XFS_EFD_MAX_FAST_EXTENTS) kmem_free(efdp); - } else { + else kmem_zone_free(xfs_efd_zone, efdp); - } } /* @@ -373,9 +341,9 @@ xfs_efd_item_free(xfs_efd_log_item_t *efdp) * We only need 1 iovec for an efd item. It just logs the efd_log_format * structure. */ -/*ARGSUSED*/ STATIC uint -xfs_efd_item_size(xfs_efd_log_item_t *efdp) +xfs_efd_item_size( + struct xfs_log_item *lip) { return 1; } @@ -388,10 +356,12 @@ xfs_efd_item_size(xfs_efd_log_item_t *efdp) * slots in the efd item have been filled. 
*/ STATIC void -xfs_efd_item_format(xfs_efd_log_item_t *efdp, - xfs_log_iovec_t *log_vector) +xfs_efd_item_format( + struct xfs_log_item *lip, + struct xfs_log_iovec *log_vector) { - uint size; + struct xfs_efd_log_item *efdp = EFD_ITEM(lip); + uint size; ASSERT(efdp->efd_next_extent == efdp->efd_format.efd_nextents); @@ -401,48 +371,38 @@ xfs_efd_item_format(xfs_efd_log_item_t *efdp, size += (efdp->efd_format.efd_nextents - 1) * sizeof(xfs_extent_t); efdp->efd_format.efd_size = 1; - log_vector->i_addr = (xfs_caddr_t)&(efdp->efd_format); + log_vector->i_addr = &efdp->efd_format; log_vector->i_len = size; log_vector->i_type = XLOG_REG_TYPE_EFD_FORMAT; ASSERT(size >= sizeof(xfs_efd_log_format_t)); } - /* * Pinning has no meaning for an efd item, so just return. */ -/*ARGSUSED*/ STATIC void -xfs_efd_item_pin(xfs_efd_log_item_t *efdp) +xfs_efd_item_pin( + struct xfs_log_item *lip) { - return; } - /* * Since pinning has no meaning for an efd item, unpinning does * not either. */ -/*ARGSUSED*/ -STATIC void -xfs_efd_item_unpin(xfs_efd_log_item_t *efdp) -{ - return; -} - -/*ARGSUSED*/ STATIC void -xfs_efd_item_unpin_remove(xfs_efd_log_item_t *efdp, xfs_trans_t *tp) +xfs_efd_item_unpin( + struct xfs_log_item *lip, + int remove) { - return; } /* * Efd items have no locking, so just return success. */ -/*ARGSUSED*/ STATIC uint -xfs_efd_item_trylock(xfs_efd_log_item_t *efdp) +xfs_efd_item_trylock( + struct xfs_log_item *lip) { return XFS_ITEM_LOCKED; } @@ -451,13 +411,12 @@ xfs_efd_item_trylock(xfs_efd_log_item_t *efdp) * Efd items have no locking or pushing, so return failure * so that the caller doesn't bother with us. */ -/*ARGSUSED*/ STATIC void -xfs_efd_item_unlock(xfs_efd_log_item_t *efdp) +xfs_efd_item_unlock( + struct xfs_log_item *lip) { - if (efdp->efd_item.li_flags & XFS_LI_ABORTED) - xfs_efd_item_free(efdp); - return; + if (lip->li_flags & XFS_LI_ABORTED) + xfs_efd_item_free(EFD_ITEM(lip)); } /* @@ -467,15 +426,18 @@ xfs_efd_item_unlock(xfs_efd_log_item_t *efdp) * return -1 to keep the transaction code from further referencing * this item. */ -/*ARGSUSED*/ STATIC xfs_lsn_t -xfs_efd_item_committed(xfs_efd_log_item_t *efdp, xfs_lsn_t lsn) +xfs_efd_item_committed( + struct xfs_log_item *lip, + xfs_lsn_t lsn) { + struct xfs_efd_log_item *efdp = EFD_ITEM(lip); + /* * If we got a log I/O error, it's always the case that the LR with the * EFI got unpinned and freed before the EFD got aborted. */ - if ((efdp->efd_item.li_flags & XFS_LI_ABORTED) == 0) + if (!(lip->li_flags & XFS_LI_ABORTED)) xfs_efi_release(efdp->efd_efip, efdp->efd_format.efd_nextents); xfs_efd_item_free(efdp); @@ -486,11 +448,10 @@ xfs_efd_item_committed(xfs_efd_log_item_t *efdp, xfs_lsn_t lsn) * There isn't much you can do to push on an efd item. It is simply * stuck waiting for the log to be flushed to disk. */ -/*ARGSUSED*/ STATIC void -xfs_efd_item_push(xfs_efd_log_item_t *efdp) +xfs_efd_item_push( + struct xfs_log_item *lip) { - return; } /* @@ -500,55 +461,48 @@ xfs_efd_item_push(xfs_efd_log_item_t *efdp) * example, for inodes, the inode is locked throughout the extent freeing * so the dependency should be recorded there. */ -/*ARGSUSED*/ STATIC void -xfs_efd_item_committing(xfs_efd_log_item_t *efip, xfs_lsn_t lsn) +xfs_efd_item_committing( + struct xfs_log_item *lip, + xfs_lsn_t lsn) { - return; } /* * This is the ops vector shared by all efd log items. 
*/ static struct xfs_item_ops xfs_efd_item_ops = { - .iop_size = (uint(*)(xfs_log_item_t*))xfs_efd_item_size, - .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) - xfs_efd_item_format, - .iop_pin = (void(*)(xfs_log_item_t*))xfs_efd_item_pin, - .iop_unpin = (void(*)(xfs_log_item_t*))xfs_efd_item_unpin, - .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*)) - xfs_efd_item_unpin_remove, - .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_efd_item_trylock, - .iop_unlock = (void(*)(xfs_log_item_t*))xfs_efd_item_unlock, - .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t)) - xfs_efd_item_committed, - .iop_push = (void(*)(xfs_log_item_t*))xfs_efd_item_push, - .iop_pushbuf = NULL, - .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t)) - xfs_efd_item_committing + .iop_size = xfs_efd_item_size, + .iop_format = xfs_efd_item_format, + .iop_pin = xfs_efd_item_pin, + .iop_unpin = xfs_efd_item_unpin, + .iop_trylock = xfs_efd_item_trylock, + .iop_unlock = xfs_efd_item_unlock, + .iop_committed = xfs_efd_item_committed, + .iop_push = xfs_efd_item_push, + .iop_committing = xfs_efd_item_committing }; - /* * Allocate and initialize an efd item with the given number of extents. */ -xfs_efd_log_item_t * -xfs_efd_init(xfs_mount_t *mp, - xfs_efi_log_item_t *efip, - uint nextents) +struct xfs_efd_log_item * +xfs_efd_init( + struct xfs_mount *mp, + struct xfs_efi_log_item *efip, + uint nextents) { - xfs_efd_log_item_t *efdp; + struct xfs_efd_log_item *efdp; uint size; ASSERT(nextents > 0); if (nextents > XFS_EFD_MAX_FAST_EXTENTS) { size = (uint)(sizeof(xfs_efd_log_item_t) + ((nextents - 1) * sizeof(xfs_extent_t))); - efdp = (xfs_efd_log_item_t*)kmem_zalloc(size, KM_SLEEP); + efdp = kmem_zalloc(size, KM_SLEEP); } else { - efdp = (xfs_efd_log_item_t*)kmem_zone_zalloc(xfs_efd_zone, - KM_SLEEP); + efdp = kmem_zone_zalloc(xfs_efd_zone, KM_SLEEP); } xfs_log_item_init(mp, &efdp->efd_item, XFS_LI_EFD, &xfs_efd_item_ops); @@ -556,5 +510,5 @@ xfs_efd_init(xfs_mount_t *mp, efdp->efd_format.efd_nextents = nextents; efdp->efd_format.efd_efi_id = efip->efi_format.efi_id; - return (efdp); + return efdp; } diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index 390850ee6603..9b715dce5699 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -18,13 +18,9 @@ #include "xfs.h" #include "xfs_bmap_btree.h" #include "xfs_inum.h" -#include "xfs_dir2.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_ag.h" -#include "xfs_dmapi.h" #include "xfs_log.h" #include "xfs_trans.h" #include "xfs_sb.h" @@ -127,6 +123,82 @@ typedef struct fstrm_item xfs_inode_t *pip; /* Parent directory inode pointer. */ } fstrm_item_t; +/* + * Allocation group filestream associations are tracked with per-ag atomic + * counters. These counters allow _xfs_filestream_pick_ag() to tell whether a + * particular AG already has active filestreams associated with it. The mount + * point's m_peraglock is used to protect these counters from per-ag array + * re-allocation during a growfs operation. When xfs_growfs_data_private() is + * about to reallocate the array, it calls xfs_filestream_flush() with the + * m_peraglock held in write mode. + * + * Since xfs_mru_cache_flush() guarantees that all the free functions for all + * the cache elements have finished executing before it returns, it's safe for + * the free functions to use the atomic counters without m_peraglock protection. 
+ * This allows the implementation of xfs_fstrm_free_func() to be agnostic about + * whether it was called with the m_peraglock held in read mode, write mode or + * not held at all. The race condition this addresses is the following: + * + * - The work queue scheduler fires and pulls a filestream directory cache + * element off the LRU end of the cache for deletion, then gets pre-empted. + * - A growfs operation grabs the m_peraglock in write mode, flushes all the + * remaining items from the cache and reallocates the mount point's per-ag + * array, resetting all the counters to zero. + * - The work queue thread resumes and calls the free function for the element + * it started cleaning up earlier. In the process it decrements the + * filestreams counter for an AG that now has no references. + * + * With a shrinkfs feature, the above scenario could panic the system. + * + * All other uses of the following macros should be protected by either the + * m_peraglock held in read mode, or the cache's internal locking exposed by the + * interval between a call to xfs_mru_cache_lookup() and a call to + * xfs_mru_cache_done(). In addition, the m_peraglock must be held in read mode + * when new elements are added to the cache. + * + * Combined, these locking rules ensure that no associations will ever exist in + * the cache that reference per-ag array elements that have since been + * reallocated. + */ +static int +xfs_filestream_peek_ag( + xfs_mount_t *mp, + xfs_agnumber_t agno) +{ + struct xfs_perag *pag; + int ret; + + pag = xfs_perag_get(mp, agno); + ret = atomic_read(&pag->pagf_fstrms); + xfs_perag_put(pag); + return ret; +} + +static int +xfs_filestream_get_ag( + xfs_mount_t *mp, + xfs_agnumber_t agno) +{ + struct xfs_perag *pag; + int ret; + + pag = xfs_perag_get(mp, agno); + ret = atomic_inc_return(&pag->pagf_fstrms); + xfs_perag_put(pag); + return ret; +} + +static void +xfs_filestream_put_ag( + xfs_mount_t *mp, + xfs_agnumber_t agno) +{ + struct xfs_perag *pag; + + pag = xfs_perag_get(mp, agno); + atomic_dec(&pag->pagf_fstrms); + xfs_perag_put(pag); +} /* * Scan the AGs starting at startag looking for an AG that isn't in use and has @@ -355,16 +427,14 @@ xfs_fstrm_free_func( { fstrm_item_t *item = (fstrm_item_t *)data; xfs_inode_t *ip = item->ip; - int ref; ASSERT(ip->i_ino == ino); xfs_iflags_clear(ip, XFS_IFILESTREAM); /* Drop the reference taken on the AG when the item was added. */ - ref = xfs_filestream_put_ag(ip->i_mount, item->ag); + xfs_filestream_put_ag(ip->i_mount, item->ag); - ASSERT(ref >= 0); TRACE_FREE(ip->i_mount, ip, item->pip, item->ag, xfs_filestream_peek_ag(ip->i_mount, item->ag)); diff --git a/fs/xfs/xfs_filestream.h b/fs/xfs/xfs_filestream.h index 260f757bbc5d..09dd9af45434 100644 --- a/fs/xfs/xfs_filestream.h +++ b/fs/xfs/xfs_filestream.h @@ -42,88 +42,6 @@ extern ktrace_t *xfs_filestreams_trace_buf; #endif -/* - * Allocation group filestream associations are tracked with per-ag atomic - * counters. These counters allow _xfs_filestream_pick_ag() to tell whether a - * particular AG already has active filestreams associated with it. The mount - * point's m_peraglock is used to protect these counters from per-ag array - * re-allocation during a growfs operation. When xfs_growfs_data_private() is - * about to reallocate the array, it calls xfs_filestream_flush() with the - * m_peraglock held in write mode. 
- * - * Since xfs_mru_cache_flush() guarantees that all the free functions for all - * the cache elements have finished executing before it returns, it's safe for - * the free functions to use the atomic counters without m_peraglock protection. - * This allows the implementation of xfs_fstrm_free_func() to be agnostic about - * whether it was called with the m_peraglock held in read mode, write mode or - * not held at all. The race condition this addresses is the following: - * - * - The work queue scheduler fires and pulls a filestream directory cache - * element off the LRU end of the cache for deletion, then gets pre-empted. - * - A growfs operation grabs the m_peraglock in write mode, flushes all the - * remaining items from the cache and reallocates the mount point's per-ag - * array, resetting all the counters to zero. - * - The work queue thread resumes and calls the free function for the element - * it started cleaning up earlier. In the process it decrements the - * filestreams counter for an AG that now has no references. - * - * With a shrinkfs feature, the above scenario could panic the system. - * - * All other uses of the following macros should be protected by either the - * m_peraglock held in read mode, or the cache's internal locking exposed by the - * interval between a call to xfs_mru_cache_lookup() and a call to - * xfs_mru_cache_done(). In addition, the m_peraglock must be held in read mode - * when new elements are added to the cache. - * - * Combined, these locking rules ensure that no associations will ever exist in - * the cache that reference per-ag array elements that have since been - * reallocated. - */ -/* - * xfs_filestream_peek_ag is only used in tracing code - */ -static inline int -xfs_filestream_peek_ag( - xfs_mount_t *mp, - xfs_agnumber_t agno) -{ - struct xfs_perag *pag; - int ret; - - pag = xfs_perag_get(mp, agno); - ret = atomic_read(&pag->pagf_fstrms); - xfs_perag_put(pag); - return ret; -} - -static inline int -xfs_filestream_get_ag( - xfs_mount_t *mp, - xfs_agnumber_t agno) -{ - struct xfs_perag *pag; - int ret; - - pag = xfs_perag_get(mp, agno); - ret = atomic_inc_return(&pag->pagf_fstrms); - xfs_perag_put(pag); - return ret; -} - -static inline int -xfs_filestream_put_ag( - xfs_mount_t *mp, - xfs_agnumber_t agno) -{ - struct xfs_perag *pag; - int ret; - - pag = xfs_perag_get(mp, agno); - ret = atomic_dec_return(&pag->pagf_fstrms); - xfs_perag_put(pag); - return ret; -} - /* allocation selection flags */ typedef enum xfs_fstrm_alloc { XFS_PICK_USERDATA = 1, diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 37a6f62c57b6..dbca5f5c37ba 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -24,14 +24,10 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_bmap_btree.h" #include "xfs_alloc_btree.h" #include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_inode_item.h" @@ -626,8 +622,7 @@ xfs_fs_log_dummy( ip = mp->m_rootip; xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - xfs_trans_ihold(tp, ip); + xfs_trans_ijoin(tp, ip); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); xfs_trans_set_sync(tp); error = xfs_trans_commit(tp, 0); diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 9d884c127bb9..abf80ae1e95b 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -24,14 +24,10 @@ #include "xfs_trans.h" #include "xfs_sb.h" 
#include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_bmap_btree.h" #include "xfs_alloc_btree.h" #include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_btree.h" @@ -1203,6 +1199,63 @@ error0: return error; } +STATIC int +xfs_imap_lookup( + struct xfs_mount *mp, + struct xfs_trans *tp, + xfs_agnumber_t agno, + xfs_agino_t agino, + xfs_agblock_t agbno, + xfs_agblock_t *chunk_agbno, + xfs_agblock_t *offset_agbno, + int flags) +{ + struct xfs_inobt_rec_incore rec; + struct xfs_btree_cur *cur; + struct xfs_buf *agbp; + xfs_agino_t startino; + int error; + int i; + + error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); + if (error) { + xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: " + "xfs_ialloc_read_agi() returned " + "error %d, agno %d", + error, agno); + return error; + } + + /* + * derive and lookup the exact inode record for the given agino. If the + * record cannot be found, then it's an invalid inode number and we + * should abort. + */ + cur = xfs_inobt_init_cursor(mp, tp, agbp, agno); + startino = agino & ~(XFS_IALLOC_INODES(mp) - 1); + error = xfs_inobt_lookup(cur, startino, XFS_LOOKUP_EQ, &i); + if (!error) { + if (i) + error = xfs_inobt_get_rec(cur, &rec, &i); + if (!error && i == 0) + error = EINVAL; + } + + xfs_trans_brelse(tp, agbp); + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + if (error) + return error; + + /* for untrusted inodes check it is allocated first */ + if ((flags & XFS_IGET_UNTRUSTED) && + (rec.ir_free & XFS_INOBT_MASK(agino - rec.ir_startino))) + return EINVAL; + + *chunk_agbno = XFS_AGINO_TO_AGBNO(mp, rec.ir_startino); + *offset_agbno = agbno - *chunk_agbno; + return 0; +} + /* * Return the location of the inode in imap, for mapping it into a buffer. */ @@ -1235,8 +1288,11 @@ xfs_imap( if (agno >= mp->m_sb.sb_agcount || agbno >= mp->m_sb.sb_agblocks || ino != XFS_AGINO_TO_INO(mp, agno, agino)) { #ifdef DEBUG - /* no diagnostics for bulkstat, ino comes from userspace */ - if (flags & XFS_IGET_BULKSTAT) + /* + * Don't output diagnostic information for untrusted inodes + * as they can be invalid without implying corruption. + */ + if (flags & XFS_IGET_UNTRUSTED) return XFS_ERROR(EINVAL); if (agno >= mp->m_sb.sb_agcount) { xfs_fs_cmn_err(CE_ALERT, mp, @@ -1263,6 +1319,23 @@ xfs_imap( return XFS_ERROR(EINVAL); } + blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_blocklog; + + /* + * For bulkstat and handle lookups, we have an untrusted inode number + * that we have to verify is valid. We cannot do this just by reading + * the inode buffer as it may have been unlinked and removed leaving + * inodes in stale state on disk. Hence we have to do a btree lookup + * in all cases where an untrusted inode number is passed. + */ + if (flags & XFS_IGET_UNTRUSTED) { + error = xfs_imap_lookup(mp, tp, agno, agino, agbno, + &chunk_agbno, &offset_agbno, flags); + if (error) + return error; + goto out_map; + } + /* * If the inode cluster size is the same as the blocksize or * smaller we get to the buffer by simple arithmetics. @@ -1277,24 +1350,6 @@ xfs_imap( return 0; } - blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_blocklog; - - /* - * If we get a block number passed from bulkstat we can use it to - * find the buffer easily. 
- */ - if (imap->im_blkno) { - offset = XFS_INO_TO_OFFSET(mp, ino); - ASSERT(offset < mp->m_sb.sb_inopblock); - - cluster_agbno = xfs_daddr_to_agbno(mp, imap->im_blkno); - offset += (agbno - cluster_agbno) * mp->m_sb.sb_inopblock; - - imap->im_len = XFS_FSB_TO_BB(mp, blks_per_cluster); - imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog); - return 0; - } - /* * If the inode chunks are aligned then use simple maths to * find the location. Otherwise we have to do a btree @@ -1304,50 +1359,13 @@ xfs_imap( offset_agbno = agbno & mp->m_inoalign_mask; chunk_agbno = agbno - offset_agbno; } else { - xfs_btree_cur_t *cur; /* inode btree cursor */ - xfs_inobt_rec_incore_t chunk_rec; - xfs_buf_t *agbp; /* agi buffer */ - int i; /* temp state */ - - error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); - if (error) { - xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: " - "xfs_ialloc_read_agi() returned " - "error %d, agno %d", - error, agno); - return error; - } - - cur = xfs_inobt_init_cursor(mp, tp, agbp, agno); - error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i); - if (error) { - xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: " - "xfs_inobt_lookup() failed"); - goto error0; - } - - error = xfs_inobt_get_rec(cur, &chunk_rec, &i); - if (error) { - xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: " - "xfs_inobt_get_rec() failed"); - goto error0; - } - if (i == 0) { -#ifdef DEBUG - xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: " - "xfs_inobt_get_rec() failed"); -#endif /* DEBUG */ - error = XFS_ERROR(EINVAL); - } - error0: - xfs_trans_brelse(tp, agbp); - xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + error = xfs_imap_lookup(mp, tp, agno, agino, agbno, + &chunk_agbno, &offset_agbno, flags); if (error) return error; - chunk_agbno = XFS_AGINO_TO_AGBNO(mp, chunk_rec.ir_startino); - offset_agbno = agbno - chunk_agbno; } +out_map: ASSERT(agbno >= chunk_agbno); cluster_agbno = chunk_agbno + ((offset_agbno / blks_per_cluster) * blks_per_cluster); diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c index c282a9af5393..d352862cefa0 100644 --- a/fs/xfs/xfs_ialloc_btree.c +++ b/fs/xfs/xfs_ialloc_btree.c @@ -24,14 +24,10 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_bmap_btree.h" #include "xfs_alloc_btree.h" #include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_btree.h" diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index 6845db90818f..b1ecc6f97ade 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -25,14 +25,10 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_bmap_btree.h" #include "xfs_alloc_btree.h" #include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_btree.h" @@ -95,7 +91,7 @@ xfs_inode_alloc( return ip; } -STATIC void +void xfs_inode_free( struct xfs_inode *ip) { @@ -212,7 +208,7 @@ xfs_iget_cache_hit( ip->i_flags &= ~XFS_INEW; ip->i_flags |= XFS_IRECLAIMABLE; __xfs_inode_set_reclaim_tag(pag, ip); - trace_xfs_iget_reclaim(ip); + trace_xfs_iget_reclaim_fail(ip); goto out_error; } @@ -227,6 +223,7 @@ xfs_iget_cache_hit( } else { /* If the VFS inode is being torn down, pause and try again. 
*/ if (!igrab(inode)) { + trace_xfs_iget_skip(ip); error = EAGAIN; goto out_error; } @@ -234,6 +231,7 @@ xfs_iget_cache_hit( /* We've got a live one. */ spin_unlock(&ip->i_flags_lock); read_unlock(&pag->pag_ici_lock); + trace_xfs_iget_hit(ip); } if (lock_flags != 0) @@ -242,7 +240,6 @@ xfs_iget_cache_hit( xfs_iflags_clear(ip, XFS_ISTALE); XFS_STATS_INC(xs_ig_found); - trace_xfs_iget_found(ip); return 0; out_error: @@ -259,24 +256,22 @@ xfs_iget_cache_miss( xfs_trans_t *tp, xfs_ino_t ino, struct xfs_inode **ipp, - xfs_daddr_t bno, int flags, int lock_flags) { struct xfs_inode *ip; int error; - unsigned long first_index, mask; xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino); ip = xfs_inode_alloc(mp, ino); if (!ip) return ENOMEM; - error = xfs_iread(mp, tp, ip, bno, flags); + error = xfs_iread(mp, tp, ip, flags); if (error) goto out_destroy; - xfs_itrace_entry(ip); + trace_xfs_iget_miss(ip); if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) { error = ENOENT; @@ -302,8 +297,6 @@ xfs_iget_cache_miss( BUG(); } - mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1); - first_index = agino & mask; write_lock(&pag->pag_ici_lock); /* insert the new inode */ @@ -322,7 +315,6 @@ xfs_iget_cache_miss( write_unlock(&pag->pag_ici_lock); radix_tree_preload_end(); - trace_xfs_iget_alloc(ip); *ipp = ip; return 0; @@ -358,8 +350,6 @@ out_destroy: * within the file system for the inode being requested. * lock_flags -- flags indicating how to lock the inode. See the comment * for xfs_ilock() for a list of valid values. - * bno -- the block number starting the buffer containing the inode, - * if known (as by bulkstat), else 0. */ int xfs_iget( @@ -368,8 +358,7 @@ xfs_iget( xfs_ino_t ino, uint flags, uint lock_flags, - xfs_inode_t **ipp, - xfs_daddr_t bno) + xfs_inode_t **ipp) { xfs_inode_t *ip; int error; @@ -382,9 +371,6 @@ xfs_iget( /* get the perag structure and ensure that it's inode capable */ pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino)); - if (!pag->pagi_inodeok) - return EINVAL; - ASSERT(pag->pag_ici_init); agino = XFS_INO_TO_AGINO(mp, ino); again: @@ -400,7 +386,7 @@ again: read_unlock(&pag->pag_ici_lock); XFS_STATS_INC(xs_ig_missed); - error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip, bno, + error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip, flags, lock_flags); if (error) goto out_error_or_again; @@ -429,97 +415,6 @@ out_error_or_again: } /* - * Decrement reference count of an inode structure and unlock it. - * - * ip -- the inode being released - * lock_flags -- this parameter indicates the inode's locks to be - * to be released. See the comment on xfs_iunlock() for a list - * of valid values. - */ -void -xfs_iput(xfs_inode_t *ip, - uint lock_flags) -{ - xfs_itrace_entry(ip); - xfs_iunlock(ip, lock_flags); - IRELE(ip); -} - -/* - * Special iput for brand-new inodes that are still locked - */ -void -xfs_iput_new( - xfs_inode_t *ip, - uint lock_flags) -{ - struct inode *inode = VFS_I(ip); - - xfs_itrace_entry(ip); - - if ((ip->i_d.di_mode == 0)) { - ASSERT(!xfs_iflags_test(ip, XFS_IRECLAIMABLE)); - make_bad_inode(inode); - } - if (inode->i_state & I_NEW) - unlock_new_inode(inode); - if (lock_flags) - xfs_iunlock(ip, lock_flags); - IRELE(ip); -} - -/* - * This is called free all the memory associated with an inode. - * It must free the inode itself and any buffers allocated for - * if_extents/if_data and if_broot. It must also free the lock - * associated with the inode. 
- * - * Note: because we don't initialise everything on reallocation out - * of the zone, we must ensure we nullify everything correctly before - * freeing the structure. - */ -void -xfs_ireclaim( - struct xfs_inode *ip) -{ - struct xfs_mount *mp = ip->i_mount; - struct xfs_perag *pag; - xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); - - XFS_STATS_INC(xs_ig_reclaims); - - /* - * Remove the inode from the per-AG radix tree. - * - * Because radix_tree_delete won't complain even if the item was never - * added to the tree assert that it's been there before to catch - * problems with the inode life time early on. - */ - pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); - write_lock(&pag->pag_ici_lock); - if (!radix_tree_delete(&pag->pag_ici_root, agino)) - ASSERT(0); - write_unlock(&pag->pag_ici_lock); - xfs_perag_put(pag); - - /* - * Here we do an (almost) spurious inode lock in order to coordinate - * with inode cache radix tree lookups. This is because the lookup - * can reference the inodes in the cache without taking references. - * - * We make that OK here by ensuring that we wait until the inode is - * unlocked after the lookup before we go ahead and free it. We get - * both the ilock and the iolock because the code may need to drop the - * ilock one but will still hold the iolock. - */ - xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - xfs_qm_dqdetach(ip); - xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - - xfs_inode_free(ip); -} - -/* * This is a wrapper routine around the xfs_ilock() routine * used to centralize some grungy code. It is used in places * that wish to lock the inode solely for reading the extents. @@ -744,30 +639,24 @@ xfs_ilock_demote( } #ifdef DEBUG -/* - * Debug-only routine, without additional rw_semaphore APIs, we can - * now only answer requests regarding whether we hold the lock for write - * (reader state is outside our visibility, we only track writer state). - * - * Note: this means !xfs_isilocked would give false positives, so don't do that. 
- */ int xfs_isilocked( xfs_inode_t *ip, uint lock_flags) { - if ((lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) == - XFS_ILOCK_EXCL) { - if (!ip->i_lock.mr_writer) - return 0; + if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) { + if (!(lock_flags & XFS_ILOCK_SHARED)) + return !!ip->i_lock.mr_writer; + return rwsem_is_locked(&ip->i_lock.mr_lock); } - if ((lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) == - XFS_IOLOCK_EXCL) { - if (!ip->i_iolock.mr_writer) - return 0; + if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) { + if (!(lock_flags & XFS_IOLOCK_SHARED)) + return !!ip->i_iolock.mr_writer; + return rwsem_is_locked(&ip->i_iolock.mr_lock); } - return 1; + ASSERT(0); + return 0; } #endif diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 8cd6e8d8fe9c..68415cb4f23c 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -27,13 +27,10 @@ #include "xfs_trans_priv.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_bmap_btree.h" #include "xfs_alloc_btree.h" #include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" #include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" @@ -44,7 +41,6 @@ #include "xfs_alloc.h" #include "xfs_ialloc.h" #include "xfs_bmap.h" -#include "xfs_rw.h" #include "xfs_error.h" #include "xfs_utils.h" #include "xfs_quota.h" @@ -177,7 +173,7 @@ xfs_imap_to_bp( if (unlikely(XFS_TEST_ERROR(!di_ok, mp, XFS_ERRTAG_ITOBP_INOTOBP, XFS_RANDOM_ITOBP_INOTOBP))) { - if (iget_flags & XFS_IGET_BULKSTAT) { + if (iget_flags & XFS_IGET_UNTRUSTED) { xfs_trans_brelse(tp, bp); return XFS_ERROR(EINVAL); } @@ -426,7 +422,7 @@ xfs_iformat( if (!XFS_DFORK_Q(dip)) return 0; ASSERT(ip->i_afp == NULL); - ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP); + ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS); ip->i_afp->if_ext_max = XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); switch (dip->di_aformat) { @@ -509,7 +505,7 @@ xfs_iformat_local( ifp->if_u1.if_data = ifp->if_u2.if_inline_data; else { real_size = roundup(size, 4); - ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP); + ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS); } ifp->if_bytes = size; ifp->if_real_bytes = real_size; @@ -636,7 +632,7 @@ xfs_iformat_btree( } ifp->if_broot_bytes = size; - ifp->if_broot = kmem_alloc(size, KM_SLEEP); + ifp->if_broot = kmem_alloc(size, KM_SLEEP | KM_NOFS); ASSERT(ifp->if_broot != NULL); /* * Copy and convert from the on-disk structure @@ -787,7 +783,6 @@ xfs_iread( xfs_mount_t *mp, xfs_trans_t *tp, xfs_inode_t *ip, - xfs_daddr_t bno, uint iget_flags) { xfs_buf_t *bp; @@ -797,11 +792,9 @@ xfs_iread( /* * Fill in the location information in the in-core inode. */ - ip->i_imap.im_blkno = bno; error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, iget_flags); if (error) return error; - ASSERT(bno == 0 || bno == ip->i_imap.im_blkno); /* * Get pointers to the on-disk inode and the buffer containing it. 
@@ -925,7 +918,6 @@ xfs_iread_extents( int error; xfs_ifork_t *ifp; xfs_extnum_t nextents; - size_t size; if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) { XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW, @@ -933,7 +925,6 @@ xfs_iread_extents( return XFS_ERROR(EFSCORRUPTED); } nextents = XFS_IFORK_NEXTENTS(ip, whichfork); - size = nextents * sizeof(xfs_bmbt_rec_t); ifp = XFS_IFORK_PTR(ip, whichfork); /* @@ -1229,7 +1220,7 @@ xfs_isize_check( (xfs_ufsize_t)XFS_MAXIOFFSET(mp)) - map_first), XFS_BMAPI_ENTIRE, NULL, 0, imaps, &nimaps, - NULL, NULL)) + NULL)) return; ASSERT(nimaps == 1); ASSERT(imaps[0].br_startblock == HOLESTARTBLOCK); @@ -1463,7 +1454,7 @@ xfs_itruncate_finish( ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); ASSERT(ip->i_transp == *tp); ASSERT(ip->i_itemp != NULL); - ASSERT(ip->i_itemp->ili_flags & XFS_ILI_HOLD); + ASSERT(ip->i_itemp->ili_lock_flags == 0); ntp = *tp; @@ -1592,11 +1583,10 @@ xfs_itruncate_finish( xfs_bmap_init(&free_list, &first_block); error = xfs_bunmapi(ntp, ip, first_unmap_block, unmap_len, - xfs_bmapi_aflag(fork) | - (sync ? 0 : XFS_BMAPI_ASYNC), + xfs_bmapi_aflag(fork), XFS_ITRUNC_MAX_EXTENTS, &first_block, &free_list, - NULL, &done); + &done); if (error) { /* * If the bunmapi call encounters an error, @@ -1615,12 +1605,8 @@ xfs_itruncate_finish( */ error = xfs_bmap_finish(tp, &free_list, &committed); ntp = *tp; - if (committed) { - /* link the inode into the next xact in the chain */ - xfs_trans_ijoin(ntp, ip, - XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - xfs_trans_ihold(ntp, ip); - } + if (committed) + xfs_trans_ijoin(ntp, ip); if (error) { /* @@ -1649,9 +1635,7 @@ xfs_itruncate_finish( error = xfs_trans_commit(*tp, 0); *tp = ntp; - /* link the inode into the next transaction in the chain */ - xfs_trans_ijoin(ntp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - xfs_trans_ihold(ntp, ip); + xfs_trans_ijoin(ntp, ip); if (error) return error; @@ -1940,10 +1924,10 @@ xfs_ifree_cluster( int blks_per_cluster; int nbufs; int ninodes; - int i, j, found, pre_flushed; + int i, j; xfs_daddr_t blkno; xfs_buf_t *bp; - xfs_inode_t *ip, **ip_found; + xfs_inode_t *ip; xfs_inode_log_item_t *iip; xfs_log_item_t *lip; struct xfs_perag *pag; @@ -1960,114 +1944,97 @@ xfs_ifree_cluster( nbufs = XFS_IALLOC_BLOCKS(mp) / blks_per_cluster; } - ip_found = kmem_alloc(ninodes * sizeof(xfs_inode_t *), KM_NOFS); - for (j = 0; j < nbufs; j++, inum += ninodes) { + int found = 0; + blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum), XFS_INO_TO_AGBNO(mp, inum)); + /* + * We obtain and lock the backing buffer first in the process + * here, as we have to ensure that any dirty inode that we + * can't get the flush lock on is attached to the buffer. + * If we scan the in-memory inodes first, then buffer IO can + * complete before we get a lock on it, and hence we may fail + * to mark all the active inodes on the buffer stale. + */ + bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, + mp->m_bsize * blks_per_cluster, + XBF_LOCK); /* - * Look for each inode in memory and attempt to lock it, - * we can be racing with flush and tail pushing here. - * any inode we get the locks on, add to an array of - * inode items to process later. + * Walk the inodes already attached to the buffer and mark them + * stale. These will all have the flush locks held, so an + * in-memory inode walk can't lock them. 
+ */ + lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); + while (lip) { + if (lip->li_type == XFS_LI_INODE) { + iip = (xfs_inode_log_item_t *)lip; + ASSERT(iip->ili_logged == 1); + lip->li_cb = xfs_istale_done; + xfs_trans_ail_copy_lsn(mp->m_ail, + &iip->ili_flush_lsn, + &iip->ili_item.li_lsn); + xfs_iflags_set(iip->ili_inode, XFS_ISTALE); + found++; + } + lip = lip->li_bio_list; + } + + /* + * For each inode in memory attempt to add it to the inode + * buffer and set it up for being staled on buffer IO + * completion. This is safe as we've locked out tail pushing + * and flushing by locking the buffer. * - * The get the buffer lock, we could beat a flush - * or tail pushing thread to the lock here, in which - * case they will go looking for the inode buffer - * and fail, we need some other form of interlock - * here. + * We have already marked every inode that was part of a + * transaction stale above, which means there is no point in + * even trying to lock them. */ - found = 0; for (i = 0; i < ninodes; i++) { read_lock(&pag->pag_ici_lock); ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, (inum + i))); - /* Inode not in memory or we found it already, - * nothing to do - */ + /* Inode not in memory or stale, nothing to do */ if (!ip || xfs_iflags_test(ip, XFS_ISTALE)) { read_unlock(&pag->pag_ici_lock); continue; } - if (xfs_inode_clean(ip)) { - read_unlock(&pag->pag_ici_lock); - continue; - } - - /* If we can get the locks then add it to the - * list, otherwise by the time we get the bp lock - * below it will already be attached to the - * inode buffer. - */ - - /* This inode will already be locked - by us, lets - * keep it that way. - */ - - if (ip == free_ip) { - if (xfs_iflock_nowait(ip)) { - xfs_iflags_set(ip, XFS_ISTALE); - if (xfs_inode_clean(ip)) { - xfs_ifunlock(ip); - } else { - ip_found[found++] = ip; - } - } + /* don't try to lock/unlock the current inode */ + if (ip != free_ip && + !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { read_unlock(&pag->pag_ici_lock); continue; } + read_unlock(&pag->pag_ici_lock); - if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { - if (xfs_iflock_nowait(ip)) { - xfs_iflags_set(ip, XFS_ISTALE); - - if (xfs_inode_clean(ip)) { - xfs_ifunlock(ip); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - } else { - ip_found[found++] = ip; - } - } else { + if (!xfs_iflock_nowait(ip)) { + if (ip != free_ip) xfs_iunlock(ip, XFS_ILOCK_EXCL); - } + continue; } - read_unlock(&pag->pag_ici_lock); - } - - bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, - mp->m_bsize * blks_per_cluster, - XBF_LOCK); - pre_flushed = 0; - lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); - while (lip) { - if (lip->li_type == XFS_LI_INODE) { - iip = (xfs_inode_log_item_t *)lip; - ASSERT(iip->ili_logged == 1); - lip->li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*)) xfs_istale_done; - xfs_trans_ail_copy_lsn(mp->m_ail, - &iip->ili_flush_lsn, - &iip->ili_item.li_lsn); - xfs_iflags_set(iip->ili_inode, XFS_ISTALE); - pre_flushed++; + xfs_iflags_set(ip, XFS_ISTALE); + if (xfs_inode_clean(ip)) { + ASSERT(ip != free_ip); + xfs_ifunlock(ip); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + continue; } - lip = lip->li_bio_list; - } - for (i = 0; i < found; i++) { - ip = ip_found[i]; iip = ip->i_itemp; - if (!iip) { + /* inode with unlogged changes only */ + ASSERT(ip != free_ip); ip->i_update_core = 0; xfs_ifunlock(ip); xfs_iunlock(ip, XFS_ILOCK_EXCL); continue; } + found++; iip->ili_last_fields = iip->ili_format.ilf_fields; iip->ili_format.ilf_fields = 0; @@ -2075,20 +2042,18 @@ xfs_ifree_cluster( 
xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, &iip->ili_item.li_lsn); - xfs_buf_attach_iodone(bp, - (void(*)(xfs_buf_t*,xfs_log_item_t*)) - xfs_istale_done, (xfs_log_item_t *)iip); - if (ip != free_ip) { + xfs_buf_attach_iodone(bp, xfs_istale_done, + &iip->ili_item); + + if (ip != free_ip) xfs_iunlock(ip, XFS_ILOCK_EXCL); - } } - if (found || pre_flushed) + if (found) xfs_trans_stale_inode_buf(tp, bp); xfs_trans_binval(tp, bp); } - kmem_free(ip_found); xfs_perag_put(pag); } @@ -2224,7 +2189,7 @@ xfs_iroot_realloc( */ if (ifp->if_broot_bytes == 0) { new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(rec_diff); - ifp->if_broot = kmem_alloc(new_size, KM_SLEEP); + ifp->if_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS); ifp->if_broot_bytes = (int)new_size; return; } @@ -2240,7 +2205,7 @@ xfs_iroot_realloc( new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max); ifp->if_broot = kmem_realloc(ifp->if_broot, new_size, (size_t)XFS_BMAP_BROOT_SPACE_CALC(cur_max), /* old size */ - KM_SLEEP); + KM_SLEEP | KM_NOFS); op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, ifp->if_broot_bytes); np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, @@ -2266,7 +2231,7 @@ xfs_iroot_realloc( else new_size = 0; if (new_size > 0) { - new_broot = kmem_alloc(new_size, KM_SLEEP); + new_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS); /* * First copy over the btree block header. */ @@ -2370,7 +2335,8 @@ xfs_idata_realloc( real_size = roundup(new_size, 4); if (ifp->if_u1.if_data == NULL) { ASSERT(ifp->if_real_bytes == 0); - ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP); + ifp->if_u1.if_data = kmem_alloc(real_size, + KM_SLEEP | KM_NOFS); } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) { /* * Only do the realloc if the underlying size @@ -2381,11 +2347,12 @@ xfs_idata_realloc( kmem_realloc(ifp->if_u1.if_data, real_size, ifp->if_real_bytes, - KM_SLEEP); + KM_SLEEP | KM_NOFS); } } else { ASSERT(ifp->if_real_bytes == 0); - ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP); + ifp->if_u1.if_data = kmem_alloc(real_size, + KM_SLEEP | KM_NOFS); memcpy(ifp->if_u1.if_data, ifp->if_u2.if_inline_data, ifp->if_bytes); } @@ -2649,8 +2616,6 @@ xfs_iflush_cluster( int i; pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); - ASSERT(pag->pagi_inodeok); - ASSERT(pag->pag_ici_init); inodes_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog; ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *); @@ -2754,7 +2719,6 @@ cluster_corrupt_out: * mark it as stale and brelse. */ if (XFS_BUF_IODONE_FUNC(bp)) { - XFS_BUF_CLR_BDSTRAT_FUNC(bp); XFS_BUF_UNDONE(bp); XFS_BUF_STALE(bp); XFS_BUF_ERROR(bp,EIO); @@ -3092,8 +3056,7 @@ xfs_iflush_int( * and unlock the inode's flush lock when the inode is * completely written to disk. 
*/ - xfs_buf_attach_iodone(bp, (void(*)(xfs_buf_t*,xfs_log_item_t*)) - xfs_iflush_done, (xfs_log_item_t *)iip); + xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item); ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); ASSERT(XFS_BUF_IODONE_FUNC(bp) != NULL); @@ -3537,13 +3500,11 @@ xfs_iext_remove_indirect( xfs_extnum_t ext_diff; /* extents to remove in current list */ xfs_extnum_t nex1; /* number of extents before idx */ xfs_extnum_t nex2; /* extents after idx + count */ - int nlists; /* entries in indirection array */ int page_idx = idx; /* index in target extent list */ ASSERT(ifp->if_flags & XFS_IFEXTIREC); erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0); ASSERT(erp != NULL); - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; nex1 = page_idx; ext_cnt = count; while (ext_cnt) { diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 9965e40a4615..0898c5417d12 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -442,9 +442,7 @@ static inline void xfs_ifunlock(xfs_inode_t *ip) * xfs_iget.c prototypes. */ int xfs_iget(struct xfs_mount *, struct xfs_trans *, xfs_ino_t, - uint, uint, xfs_inode_t **, xfs_daddr_t); -void xfs_iput(xfs_inode_t *, uint); -void xfs_iput_new(xfs_inode_t *, uint); + uint, uint, xfs_inode_t **); void xfs_ilock(xfs_inode_t *, uint); int xfs_ilock_nowait(xfs_inode_t *, uint); void xfs_iunlock(xfs_inode_t *, uint); @@ -452,7 +450,7 @@ void xfs_ilock_demote(xfs_inode_t *, uint); int xfs_isilocked(xfs_inode_t *, uint); uint xfs_ilock_map_shared(xfs_inode_t *); void xfs_iunlock_map_shared(xfs_inode_t *, uint); -void xfs_ireclaim(xfs_inode_t *); +void xfs_inode_free(struct xfs_inode *ip); /* * xfs_inode.c prototypes. @@ -500,7 +498,7 @@ do { \ * Flags for xfs_iget() */ #define XFS_IGET_CREATE 0x1 -#define XFS_IGET_BULKSTAT 0x2 +#define XFS_IGET_UNTRUSTED 0x2 int xfs_inotobp(struct xfs_mount *, struct xfs_trans *, xfs_ino_t, struct xfs_dinode **, @@ -509,7 +507,7 @@ int xfs_itobp(struct xfs_mount *, struct xfs_trans *, struct xfs_inode *, struct xfs_dinode **, struct xfs_buf **, uint); int xfs_iread(struct xfs_mount *, struct xfs_trans *, - struct xfs_inode *, xfs_daddr_t, uint); + struct xfs_inode *, uint); void xfs_dinode_to_disk(struct xfs_dinode *, struct xfs_icdinode *); void xfs_idestroy_fork(struct xfs_inode *, int); diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index cf8249a60004..fe00777e2796 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -22,30 +22,26 @@ #include "xfs_log.h" #include "xfs_inum.h" #include "xfs_trans.h" -#include "xfs_buf_item.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_trans_priv.h" #include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_inode_item.h" -#include "xfs_btree.h" -#include "xfs_ialloc.h" -#include "xfs_rw.h" #include "xfs_error.h" #include "xfs_trace.h" kmem_zone_t *xfs_ili_zone; /* inode log item zone */ +static inline struct xfs_inode_log_item *INODE_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_inode_log_item, ili_item); +} + + /* * This returns the number of iovecs needed to log the given inode item. 
* @@ -55,13 +51,11 @@ kmem_zone_t *xfs_ili_zone; /* inode log item zone */ */ STATIC uint xfs_inode_item_size( - xfs_inode_log_item_t *iip) + struct xfs_log_item *lip) { - uint nvecs; - xfs_inode_t *ip; - - ip = iip->ili_inode; - nvecs = 2; + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + struct xfs_inode *ip = iip->ili_inode; + uint nvecs = 2; /* * Only log the data/extents/b-tree root if there is something @@ -212,21 +206,17 @@ xfs_inode_item_size( */ STATIC void xfs_inode_item_format( - xfs_inode_log_item_t *iip, - xfs_log_iovec_t *log_vector) + struct xfs_log_item *lip, + struct xfs_log_iovec *vecp) { + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + struct xfs_inode *ip = iip->ili_inode; uint nvecs; - xfs_log_iovec_t *vecp; - xfs_inode_t *ip; size_t data_bytes; xfs_bmbt_rec_t *ext_buffer; - int nrecs; xfs_mount_t *mp; - ip = iip->ili_inode; - vecp = log_vector; - - vecp->i_addr = (xfs_caddr_t)&iip->ili_format; + vecp->i_addr = &iip->ili_format; vecp->i_len = sizeof(xfs_inode_log_format_t); vecp->i_type = XLOG_REG_TYPE_IFORMAT; vecp++; @@ -277,7 +267,7 @@ xfs_inode_item_format( */ xfs_synchronize_times(ip); - vecp->i_addr = (xfs_caddr_t)&ip->i_d; + vecp->i_addr = &ip->i_d; vecp->i_len = sizeof(struct xfs_icdinode); vecp->i_type = XLOG_REG_TYPE_ICORE; vecp++; @@ -323,18 +313,17 @@ xfs_inode_item_format( ASSERT(ip->i_df.if_u1.if_extents != NULL); ASSERT(ip->i_d.di_nextents > 0); ASSERT(iip->ili_extents_buf == NULL); - nrecs = ip->i_df.if_bytes / - (uint)sizeof(xfs_bmbt_rec_t); - ASSERT(nrecs > 0); + ASSERT((ip->i_df.if_bytes / + (uint)sizeof(xfs_bmbt_rec_t)) > 0); #ifdef XFS_NATIVE_HOST - if (nrecs == ip->i_d.di_nextents) { + if (ip->i_d.di_nextents == ip->i_df.if_bytes / + (uint)sizeof(xfs_bmbt_rec_t)) { /* * There are no delayed allocation * extents, so just point to the * real extents array. */ - vecp->i_addr = - (char *)(ip->i_df.if_u1.if_extents); + vecp->i_addr = ip->i_df.if_u1.if_extents; vecp->i_len = ip->i_df.if_bytes; vecp->i_type = XLOG_REG_TYPE_IEXT; } else @@ -352,7 +341,7 @@ xfs_inode_item_format( ext_buffer = kmem_alloc(ip->i_df.if_bytes, KM_SLEEP); iip->ili_extents_buf = ext_buffer; - vecp->i_addr = (xfs_caddr_t)ext_buffer; + vecp->i_addr = ext_buffer; vecp->i_len = xfs_iextents_copy(ip, ext_buffer, XFS_DATA_FORK); vecp->i_type = XLOG_REG_TYPE_IEXT; @@ -371,7 +360,7 @@ xfs_inode_item_format( if (iip->ili_format.ilf_fields & XFS_ILOG_DBROOT) { ASSERT(ip->i_df.if_broot_bytes > 0); ASSERT(ip->i_df.if_broot != NULL); - vecp->i_addr = (xfs_caddr_t)ip->i_df.if_broot; + vecp->i_addr = ip->i_df.if_broot; vecp->i_len = ip->i_df.if_broot_bytes; vecp->i_type = XLOG_REG_TYPE_IBROOT; vecp++; @@ -389,7 +378,7 @@ xfs_inode_item_format( ASSERT(ip->i_df.if_u1.if_data != NULL); ASSERT(ip->i_d.di_size > 0); - vecp->i_addr = (xfs_caddr_t)ip->i_df.if_u1.if_data; + vecp->i_addr = ip->i_df.if_u1.if_data; /* * Round i_bytes up to a word boundary. * The underlying memory is guaranteed to @@ -437,7 +426,7 @@ xfs_inode_item_format( * Assert that no attribute-related log flags are set. 
*/ if (!XFS_IFORK_Q(ip)) { - ASSERT(nvecs == iip->ili_item.li_desc->lid_size); + ASSERT(nvecs == lip->li_desc->lid_size); iip->ili_format.ilf_size = nvecs; ASSERT(!(iip->ili_format.ilf_fields & (XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT))); @@ -449,21 +438,21 @@ xfs_inode_item_format( ASSERT(!(iip->ili_format.ilf_fields & (XFS_ILOG_ADATA | XFS_ILOG_ABROOT))); if (iip->ili_format.ilf_fields & XFS_ILOG_AEXT) { - ASSERT(ip->i_afp->if_bytes > 0); - ASSERT(ip->i_afp->if_u1.if_extents != NULL); - ASSERT(ip->i_d.di_anextents > 0); #ifdef DEBUG - nrecs = ip->i_afp->if_bytes / + int nrecs = ip->i_afp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); -#endif ASSERT(nrecs > 0); ASSERT(nrecs == ip->i_d.di_anextents); + ASSERT(ip->i_afp->if_bytes > 0); + ASSERT(ip->i_afp->if_u1.if_extents != NULL); + ASSERT(ip->i_d.di_anextents > 0); +#endif #ifdef XFS_NATIVE_HOST /* * There are not delayed allocation extents * for attributes, so just point at the array. */ - vecp->i_addr = (char *)(ip->i_afp->if_u1.if_extents); + vecp->i_addr = ip->i_afp->if_u1.if_extents; vecp->i_len = ip->i_afp->if_bytes; #else ASSERT(iip->ili_aextents_buf == NULL); @@ -473,7 +462,7 @@ xfs_inode_item_format( ext_buffer = kmem_alloc(ip->i_afp->if_bytes, KM_SLEEP); iip->ili_aextents_buf = ext_buffer; - vecp->i_addr = (xfs_caddr_t)ext_buffer; + vecp->i_addr = ext_buffer; vecp->i_len = xfs_iextents_copy(ip, ext_buffer, XFS_ATTR_FORK); #endif @@ -490,7 +479,7 @@ xfs_inode_item_format( if (iip->ili_format.ilf_fields & XFS_ILOG_ABROOT) { ASSERT(ip->i_afp->if_broot_bytes > 0); ASSERT(ip->i_afp->if_broot != NULL); - vecp->i_addr = (xfs_caddr_t)ip->i_afp->if_broot; + vecp->i_addr = ip->i_afp->if_broot; vecp->i_len = ip->i_afp->if_broot_bytes; vecp->i_type = XLOG_REG_TYPE_IATTR_BROOT; vecp++; @@ -506,7 +495,7 @@ xfs_inode_item_format( ASSERT(ip->i_afp->if_bytes > 0); ASSERT(ip->i_afp->if_u1.if_data != NULL); - vecp->i_addr = (xfs_caddr_t)ip->i_afp->if_u1.if_data; + vecp->i_addr = ip->i_afp->if_u1.if_data; /* * Round i_bytes up to a word boundary. * The underlying memory is guaranteed to @@ -528,7 +517,7 @@ xfs_inode_item_format( break; } - ASSERT(nvecs == iip->ili_item.li_desc->lid_size); + ASSERT(nvecs == lip->li_desc->lid_size); iip->ili_format.ilf_size = nvecs; } @@ -539,12 +528,14 @@ xfs_inode_item_format( */ STATIC void xfs_inode_item_pin( - xfs_inode_log_item_t *iip) + struct xfs_log_item *lip) { - ASSERT(xfs_isilocked(iip->ili_inode, XFS_ILOCK_EXCL)); + struct xfs_inode *ip = INODE_ITEM(lip)->ili_inode; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - trace_xfs_inode_pin(iip->ili_inode, _RET_IP_); - atomic_inc(&iip->ili_inode->i_pincount); + trace_xfs_inode_pin(ip, _RET_IP_); + atomic_inc(&ip->i_pincount); } @@ -554,12 +545,12 @@ xfs_inode_item_pin( * * Also wake up anyone in xfs_iunpin_wait() if the count goes to 0. 
*/ -/* ARGSUSED */ STATIC void xfs_inode_item_unpin( - xfs_inode_log_item_t *iip) + struct xfs_log_item *lip, + int remove) { - struct xfs_inode *ip = iip->ili_inode; + struct xfs_inode *ip = INODE_ITEM(lip)->ili_inode; trace_xfs_inode_unpin(ip, _RET_IP_); ASSERT(atomic_read(&ip->i_pincount) > 0); @@ -567,15 +558,6 @@ xfs_inode_item_unpin( wake_up(&ip->i_ipin_wait); } -/* ARGSUSED */ -STATIC void -xfs_inode_item_unpin_remove( - xfs_inode_log_item_t *iip, - xfs_trans_t *tp) -{ - xfs_inode_item_unpin(iip); -} - /* * This is called to attempt to lock the inode associated with this * inode log item, in preparation for the push routine which does the actual @@ -591,19 +573,16 @@ xfs_inode_item_unpin_remove( */ STATIC uint xfs_inode_item_trylock( - xfs_inode_log_item_t *iip) + struct xfs_log_item *lip) { - register xfs_inode_t *ip; - - ip = iip->ili_inode; + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + struct xfs_inode *ip = iip->ili_inode; - if (xfs_ipincount(ip) > 0) { + if (xfs_ipincount(ip) > 0) return XFS_ITEM_PINNED; - } - if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) { + if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) return XFS_ITEM_LOCKED; - } if (!xfs_iflock_nowait(ip)) { /* @@ -629,7 +608,7 @@ xfs_inode_item_trylock( if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { ASSERT(iip->ili_format.ilf_fields != 0); ASSERT(iip->ili_logged == 0); - ASSERT(iip->ili_item.li_flags & XFS_LI_IN_AIL); + ASSERT(lip->li_flags & XFS_LI_IN_AIL); } #endif return XFS_ITEM_SUCCESS; @@ -643,26 +622,18 @@ xfs_inode_item_trylock( */ STATIC void xfs_inode_item_unlock( - xfs_inode_log_item_t *iip) + struct xfs_log_item *lip) { - uint hold; - uint iolocked; - uint lock_flags; - xfs_inode_t *ip; + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + struct xfs_inode *ip = iip->ili_inode; + unsigned short lock_flags; - ASSERT(iip != NULL); ASSERT(iip->ili_inode->i_itemp != NULL); ASSERT(xfs_isilocked(iip->ili_inode, XFS_ILOCK_EXCL)); - ASSERT((!(iip->ili_inode->i_itemp->ili_flags & - XFS_ILI_IOLOCKED_EXCL)) || - xfs_isilocked(iip->ili_inode, XFS_IOLOCK_EXCL)); - ASSERT((!(iip->ili_inode->i_itemp->ili_flags & - XFS_ILI_IOLOCKED_SHARED)) || - xfs_isilocked(iip->ili_inode, XFS_IOLOCK_SHARED)); + /* * Clear the transaction pointer in the inode. */ - ip = iip->ili_inode; ip->i_transp = NULL; /* @@ -686,34 +657,11 @@ xfs_inode_item_unlock( iip->ili_aextents_buf = NULL; } - /* - * Figure out if we should unlock the inode or not. - */ - hold = iip->ili_flags & XFS_ILI_HOLD; - - /* - * Before clearing out the flags, remember whether we - * are holding the inode's IO lock. - */ - iolocked = iip->ili_flags & XFS_ILI_IOLOCKED_ANY; - - /* - * Clear out the fields of the inode log item particular - * to the current transaction. - */ - iip->ili_flags = 0; - - /* - * Unlock the inode if XFS_ILI_HOLD was not set. - */ - if (!hold) { - lock_flags = XFS_ILOCK_EXCL; - if (iolocked & XFS_ILI_IOLOCKED_EXCL) { - lock_flags |= XFS_IOLOCK_EXCL; - } else if (iolocked & XFS_ILI_IOLOCKED_SHARED) { - lock_flags |= XFS_IOLOCK_SHARED; - } - xfs_iput(iip->ili_inode, lock_flags); + lock_flags = iip->ili_lock_flags; + iip->ili_lock_flags = 0; + if (lock_flags) { + xfs_iunlock(iip->ili_inode, lock_flags); + IRELE(iip->ili_inode); } } @@ -725,13 +673,12 @@ xfs_inode_item_unlock( * is the only one that matters. Therefore, simply return the * given lsn. 
*/ -/*ARGSUSED*/ STATIC xfs_lsn_t xfs_inode_item_committed( - xfs_inode_log_item_t *iip, + struct xfs_log_item *lip, xfs_lsn_t lsn) { - return (lsn); + return lsn; } /* @@ -743,13 +690,12 @@ xfs_inode_item_committed( */ STATIC void xfs_inode_item_pushbuf( - xfs_inode_log_item_t *iip) + struct xfs_log_item *lip) { - xfs_inode_t *ip; - xfs_mount_t *mp; - xfs_buf_t *bp; + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + struct xfs_inode *ip = iip->ili_inode; + struct xfs_buf *bp; - ip = iip->ili_inode; ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED)); /* @@ -757,14 +703,13 @@ xfs_inode_item_pushbuf( * inode was taken off the AIL. So, just get out. */ if (completion_done(&ip->i_flush) || - ((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0)) { + !(lip->li_flags & XFS_LI_IN_AIL)) { xfs_iunlock(ip, XFS_ILOCK_SHARED); return; } - mp = ip->i_mount; - bp = xfs_incore(mp->m_ddev_targp, iip->ili_format.ilf_blkno, - iip->ili_format.ilf_len, XBF_TRYLOCK); + bp = xfs_incore(ip->i_mount->m_ddev_targp, iip->ili_format.ilf_blkno, + iip->ili_format.ilf_len, XBF_TRYLOCK); xfs_iunlock(ip, XFS_ILOCK_SHARED); if (!bp) @@ -772,10 +717,8 @@ xfs_inode_item_pushbuf( if (XFS_BUF_ISDELAYWRITE(bp)) xfs_buf_delwri_promote(bp); xfs_buf_relse(bp); - return; } - /* * This is called to asynchronously write the inode associated with this * inode log item out to disk. The inode will already have been locked by @@ -783,14 +726,14 @@ xfs_inode_item_pushbuf( */ STATIC void xfs_inode_item_push( - xfs_inode_log_item_t *iip) + struct xfs_log_item *lip) { - xfs_inode_t *ip; - - ip = iip->ili_inode; + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + struct xfs_inode *ip = iip->ili_inode; ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED)); ASSERT(!completion_done(&ip->i_flush)); + /* * Since we were able to lock the inode's flush lock and * we found it on the AIL, the inode must be dirty. This @@ -813,43 +756,34 @@ xfs_inode_item_push( */ (void) xfs_iflush(ip, 0); xfs_iunlock(ip, XFS_ILOCK_SHARED); - - return; } /* * XXX rcc - this one really has to do something. Probably needs * to stamp in a new field in the incore inode. */ -/* ARGSUSED */ STATIC void xfs_inode_item_committing( - xfs_inode_log_item_t *iip, + struct xfs_log_item *lip, xfs_lsn_t lsn) { - iip->ili_last_lsn = lsn; - return; + INODE_ITEM(lip)->ili_last_lsn = lsn; } /* * This is the ops vector shared by all buf log items. 
*/ static struct xfs_item_ops xfs_inode_item_ops = { - .iop_size = (uint(*)(xfs_log_item_t*))xfs_inode_item_size, - .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) - xfs_inode_item_format, - .iop_pin = (void(*)(xfs_log_item_t*))xfs_inode_item_pin, - .iop_unpin = (void(*)(xfs_log_item_t*))xfs_inode_item_unpin, - .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*)) - xfs_inode_item_unpin_remove, - .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_inode_item_trylock, - .iop_unlock = (void(*)(xfs_log_item_t*))xfs_inode_item_unlock, - .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t)) - xfs_inode_item_committed, - .iop_push = (void(*)(xfs_log_item_t*))xfs_inode_item_push, - .iop_pushbuf = (void(*)(xfs_log_item_t*))xfs_inode_item_pushbuf, - .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t)) - xfs_inode_item_committing + .iop_size = xfs_inode_item_size, + .iop_format = xfs_inode_item_format, + .iop_pin = xfs_inode_item_pin, + .iop_unpin = xfs_inode_item_unpin, + .iop_trylock = xfs_inode_item_trylock, + .iop_unlock = xfs_inode_item_unlock, + .iop_committed = xfs_inode_item_committed, + .iop_push = xfs_inode_item_push, + .iop_pushbuf = xfs_inode_item_pushbuf, + .iop_committing = xfs_inode_item_committing }; @@ -858,10 +792,10 @@ static struct xfs_item_ops xfs_inode_item_ops = { */ void xfs_inode_item_init( - xfs_inode_t *ip, - xfs_mount_t *mp) + struct xfs_inode *ip, + struct xfs_mount *mp) { - xfs_inode_log_item_t *iip; + struct xfs_inode_log_item *iip; ASSERT(ip->i_itemp == NULL); iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, KM_SLEEP); @@ -899,14 +833,14 @@ xfs_inode_item_destroy( * from the AIL if it has not been re-logged, and unlocking the inode's * flush lock. */ -/*ARGSUSED*/ void xfs_iflush_done( - xfs_buf_t *bp, - xfs_inode_log_item_t *iip) + struct xfs_buf *bp, + struct xfs_log_item *lip) { + struct xfs_inode_log_item *iip = INODE_ITEM(lip); xfs_inode_t *ip = iip->ili_inode; - struct xfs_ail *ailp = iip->ili_item.li_ailp; + struct xfs_ail *ailp = lip->li_ailp; /* * We only want to pull the item from the AIL if it is @@ -917,12 +851,11 @@ xfs_iflush_done( * the lock since it's cheaper, and then we recheck while * holding the lock before removing the inode from the AIL. */ - if (iip->ili_logged && - (iip->ili_item.li_lsn == iip->ili_flush_lsn)) { + if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) { spin_lock(&ailp->xa_lock); - if (iip->ili_item.li_lsn == iip->ili_flush_lsn) { + if (lip->li_lsn == iip->ili_flush_lsn) { /* xfs_trans_ail_delete() drops the AIL lock. */ - xfs_trans_ail_delete(ailp, (xfs_log_item_t*)iip); + xfs_trans_ail_delete(ailp, lip); } else { spin_unlock(&ailp->xa_lock); } @@ -940,8 +873,6 @@ xfs_iflush_done( * Release the inode's flush lock since we're done with it. 
*/ xfs_ifunlock(ip); - - return; } /* @@ -957,10 +888,8 @@ xfs_iflush_abort( xfs_inode_t *ip) { xfs_inode_log_item_t *iip = ip->i_itemp; - xfs_mount_t *mp; iip = ip->i_itemp; - mp = ip->i_mount; if (iip) { struct xfs_ail *ailp = iip->ili_item.li_ailp; if (iip->ili_item.li_flags & XFS_LI_IN_AIL) { @@ -991,10 +920,10 @@ xfs_iflush_abort( void xfs_istale_done( - xfs_buf_t *bp, - xfs_inode_log_item_t *iip) + struct xfs_buf *bp, + struct xfs_log_item *lip) { - xfs_iflush_abort(iip->ili_inode); + xfs_iflush_abort(INODE_ITEM(lip)->ili_inode); } /* @@ -1007,9 +936,8 @@ xfs_inode_item_format_convert( xfs_inode_log_format_t *in_f) { if (buf->i_len == sizeof(xfs_inode_log_format_32_t)) { - xfs_inode_log_format_32_t *in_f32; + xfs_inode_log_format_32_t *in_f32 = buf->i_addr; - in_f32 = (xfs_inode_log_format_32_t *)buf->i_addr; in_f->ilf_type = in_f32->ilf_type; in_f->ilf_size = in_f32->ilf_size; in_f->ilf_fields = in_f32->ilf_fields; @@ -1025,9 +953,8 @@ xfs_inode_item_format_convert( in_f->ilf_boffset = in_f32->ilf_boffset; return 0; } else if (buf->i_len == sizeof(xfs_inode_log_format_64_t)){ - xfs_inode_log_format_64_t *in_f64; + xfs_inode_log_format_64_t *in_f64 = buf->i_addr; - in_f64 = (xfs_inode_log_format_64_t *)buf->i_addr; in_f->ilf_type = in_f64->ilf_type; in_f->ilf_size = in_f64->ilf_size; in_f->ilf_fields = in_f64->ilf_fields; diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index 9a467958ecdd..d3dee61e6d91 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -103,12 +103,6 @@ typedef struct xfs_inode_log_format_64 { XFS_ILOG_ADATA | XFS_ILOG_AEXT | \ XFS_ILOG_ABROOT) -#define XFS_ILI_HOLD 0x1 -#define XFS_ILI_IOLOCKED_EXCL 0x2 -#define XFS_ILI_IOLOCKED_SHARED 0x4 - -#define XFS_ILI_IOLOCKED_ANY (XFS_ILI_IOLOCKED_EXCL | XFS_ILI_IOLOCKED_SHARED) - static inline int xfs_ilog_fbroot(int w) { return (w == XFS_DATA_FORK ? 
XFS_ILOG_DBROOT : XFS_ILOG_ABROOT); @@ -137,7 +131,7 @@ typedef struct xfs_inode_log_item { struct xfs_inode *ili_inode; /* inode ptr */ xfs_lsn_t ili_flush_lsn; /* lsn at last flush */ xfs_lsn_t ili_last_lsn; /* lsn at last transaction */ - unsigned short ili_flags; /* misc flags */ + unsigned short ili_lock_flags; /* lock flags */ unsigned short ili_logged; /* flushed logged data */ unsigned int ili_last_fields; /* fields when flushed */ struct xfs_bmbt_rec *ili_extents_buf; /* array of logged @@ -161,8 +155,8 @@ static inline int xfs_inode_clean(xfs_inode_t *ip) extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *); extern void xfs_inode_item_destroy(struct xfs_inode *); -extern void xfs_iflush_done(struct xfs_buf *, xfs_inode_log_item_t *); -extern void xfs_istale_done(struct xfs_buf *, xfs_inode_log_item_t *); +extern void xfs_iflush_done(struct xfs_buf *, struct xfs_log_item *); +extern void xfs_istale_done(struct xfs_buf *, struct xfs_log_item *); extern void xfs_iflush_abort(struct xfs_inode *); extern int xfs_inode_item_format_convert(xfs_log_iovec_t *, xfs_inode_log_format_t *); diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index ef14943829da..20576146369f 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -23,19 +23,14 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" #include "xfs_alloc.h" -#include "xfs_dmapi.h" #include "xfs_quota.h" #include "xfs_mount.h" #include "xfs_bmap_btree.h" #include "xfs_alloc_btree.h" #include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" -#include "xfs_ialloc.h" #include "xfs_btree.h" #include "xfs_bmap.h" #include "xfs_rtalloc.h" @@ -123,7 +118,7 @@ xfs_iomap( error = xfs_bmapi(NULL, ip, offset_fsb, (xfs_filblks_t)(end_fsb - offset_fsb), bmapi_flags, NULL, 0, imap, - nimaps, NULL, NULL); + nimaps, NULL); if (error) goto out; @@ -138,7 +133,7 @@ xfs_iomap( break; } - if (flags & (BMAPI_DIRECT|BMAPI_MMAP)) { + if (flags & BMAPI_DIRECT) { error = xfs_iomap_write_direct(ip, offset, count, flags, imap, nimaps); } else { @@ -247,7 +242,7 @@ xfs_iomap_write_direct( xfs_off_t offset, size_t count, int flags, - xfs_bmbt_irec_t *ret_imap, + xfs_bmbt_irec_t *imap, int *nmaps) { xfs_mount_t *mp = ip->i_mount; @@ -261,7 +256,6 @@ xfs_iomap_write_direct( int quota_flag; int rt; xfs_trans_t *tp; - xfs_bmbt_irec_t imap; xfs_bmap_free_t free_list; uint qblocks, resblks, resrtextents; int committed; @@ -285,10 +279,10 @@ xfs_iomap_write_direct( if (error) goto error_out; } else { - if (*nmaps && (ret_imap->br_startblock == HOLESTARTBLOCK)) + if (*nmaps && (imap->br_startblock == HOLESTARTBLOCK)) last_fsb = MIN(last_fsb, (xfs_fileoff_t) - ret_imap->br_blockcount + - ret_imap->br_startoff); + imap->br_blockcount + + imap->br_startoff); } count_fsb = last_fsb - offset_fsb; ASSERT(count_fsb > 0); @@ -334,20 +328,22 @@ xfs_iomap_write_direct( if (error) goto error1; - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - xfs_trans_ihold(tp, ip); + xfs_trans_ijoin(tp, ip); bmapi_flag = XFS_BMAPI_WRITE; if ((flags & BMAPI_DIRECT) && (offset < ip->i_size || extsz)) bmapi_flag |= XFS_BMAPI_PREALLOC; /* - * Issue the xfs_bmapi() call to allocate the blocks + * Issue the xfs_bmapi() call to allocate the blocks. + * + * From this point onwards we overwrite the imap pointer that the + * caller gave to us. 
*/ xfs_bmap_init(&free_list, &firstfsb); nimaps = 1; error = xfs_bmapi(tp, ip, offset_fsb, count_fsb, bmapi_flag, - &firstfsb, 0, &imap, &nimaps, &free_list, NULL); + &firstfsb, 0, imap, &nimaps, &free_list); if (error) goto error0; @@ -369,12 +365,11 @@ xfs_iomap_write_direct( goto error_out; } - if (!(imap.br_startblock || XFS_IS_REALTIME_INODE(ip))) { - error = xfs_cmn_err_fsblock_zero(ip, &imap); + if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) { + error = xfs_cmn_err_fsblock_zero(ip, imap); goto error_out; } - *ret_imap = imap; *nmaps = 1; return 0; @@ -425,7 +420,7 @@ xfs_iomap_eof_want_preallocate( imaps = nimaps; firstblock = NULLFSBLOCK; error = xfs_bmapi(NULL, ip, start_fsb, count_fsb, 0, - &firstblock, 0, imap, &imaps, NULL, NULL); + &firstblock, 0, imap, &imaps, NULL); if (error) return error; for (n = 0; n < imaps; n++) { @@ -500,7 +495,7 @@ retry: (xfs_filblks_t)(last_fsb - offset_fsb), XFS_BMAPI_DELAY | XFS_BMAPI_WRITE | XFS_BMAPI_ENTIRE, &firstblock, 1, imap, - &nimaps, NULL, NULL); + &nimaps, NULL); if (error && (error != ENOSPC)) return XFS_ERROR(error); @@ -548,7 +543,7 @@ xfs_iomap_write_allocate( xfs_inode_t *ip, xfs_off_t offset, size_t count, - xfs_bmbt_irec_t *map, + xfs_bmbt_irec_t *imap, int *retmap) { xfs_mount_t *mp = ip->i_mount; @@ -557,7 +552,6 @@ xfs_iomap_write_allocate( xfs_fsblock_t first_block; xfs_bmap_free_t free_list; xfs_filblks_t count_fsb; - xfs_bmbt_irec_t imap; xfs_trans_t *tp; int nimaps, committed; int error = 0; @@ -573,8 +567,8 @@ xfs_iomap_write_allocate( return XFS_ERROR(error); offset_fsb = XFS_B_TO_FSBT(mp, offset); - count_fsb = map->br_blockcount; - map_start_fsb = map->br_startoff; + count_fsb = imap->br_blockcount; + map_start_fsb = imap->br_startoff; XFS_STATS_ADD(xs_xstrat_bytes, XFS_FSB_TO_B(mp, count_fsb)); @@ -602,8 +596,7 @@ xfs_iomap_write_allocate( return XFS_ERROR(error); } xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - xfs_trans_ihold(tp, ip); + xfs_trans_ijoin(tp, ip); xfs_bmap_init(&free_list, &first_block); @@ -654,10 +647,15 @@ xfs_iomap_write_allocate( } } - /* Go get the actual blocks */ + /* + * Go get the actual blocks. + * + * From this point onwards we overwrite the imap + * pointer that the caller gave to us. + */ error = xfs_bmapi(tp, ip, map_start_fsb, count_fsb, XFS_BMAPI_WRITE, &first_block, 1, - &imap, &nimaps, &free_list, NULL); + imap, &nimaps, &free_list); if (error) goto trans_cancel; @@ -676,13 +674,12 @@ xfs_iomap_write_allocate( * See if we were able to allocate an extent that * covers at least part of the callers request */ - if (!(imap.br_startblock || XFS_IS_REALTIME_INODE(ip))) - return xfs_cmn_err_fsblock_zero(ip, &imap); + if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) + return xfs_cmn_err_fsblock_zero(ip, imap); - if ((offset_fsb >= imap.br_startoff) && - (offset_fsb < (imap.br_startoff + - imap.br_blockcount))) { - *map = imap; + if ((offset_fsb >= imap->br_startoff) && + (offset_fsb < (imap->br_startoff + + imap->br_blockcount))) { *retmap = 1; XFS_STATS_INC(xs_xstrat_quick); return 0; @@ -692,8 +689,8 @@ xfs_iomap_write_allocate( * So far we have not mapped the requested part of the * file, just surrounding data, try again. 
*/ - count_fsb -= imap.br_blockcount; - map_start_fsb = imap.br_startoff + imap.br_blockcount; + count_fsb -= imap->br_blockcount; + map_start_fsb = imap->br_startoff + imap->br_blockcount; } trans_cancel: @@ -766,8 +763,7 @@ xfs_iomap_write_unwritten( } xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - xfs_trans_ihold(tp, ip); + xfs_trans_ijoin(tp, ip); /* * Modify the unwritten extent state of the buffer. @@ -776,7 +772,7 @@ xfs_iomap_write_unwritten( nimaps = 1; error = xfs_bmapi(tp, ip, offset_fsb, count_fsb, XFS_BMAPI_WRITE|XFS_BMAPI_CONVERT, &firstfsb, - 1, &imap, &nimaps, &free_list, NULL); + 1, &imap, &nimaps, &free_list); if (error) goto error_on_bmapi_transaction; diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index 81ac4afd45b3..7748a430f50d 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -18,17 +18,16 @@ #ifndef __XFS_IOMAP_H__ #define __XFS_IOMAP_H__ -typedef enum { - /* base extent manipulation calls */ - BMAPI_READ = (1 << 0), /* read extents */ - BMAPI_WRITE = (1 << 1), /* create extents */ - BMAPI_ALLOCATE = (1 << 2), /* delayed allocate to real extents */ - /* modifiers */ - BMAPI_IGNSTATE = (1 << 4), /* ignore unwritten state on read */ - BMAPI_DIRECT = (1 << 5), /* direct instead of buffered write */ - BMAPI_MMAP = (1 << 6), /* allocate for mmap write */ - BMAPI_TRYLOCK = (1 << 7), /* non-blocking request */ -} bmapi_flags_t; +/* base extent manipulation calls */ +#define BMAPI_READ (1 << 0) /* read extents */ +#define BMAPI_WRITE (1 << 1) /* create extents */ +#define BMAPI_ALLOCATE (1 << 2) /* delayed allocate to real extents */ + +/* modifiers */ +#define BMAPI_IGNSTATE (1 << 4) /* ignore unwritten state on read */ +#define BMAPI_DIRECT (1 << 5) /* direct instead of buffered write */ +#define BMAPI_MMA (1 << 6) /* allocate for mmap write */ +#define BMAPI_TRYLOCK (1 << 7) /* non-blocking request */ #define BMAPI_FLAGS \ { BMAPI_READ, "READ" }, \ @@ -36,7 +35,6 @@ typedef enum { { BMAPI_ALLOCATE, "ALLOCATE" }, \ { BMAPI_IGNSTATE, "IGNSTATE" }, \ { BMAPI_DIRECT, "DIRECT" }, \ - { BMAPI_MMAP, "MMAP" }, \ { BMAPI_TRYLOCK, "TRYLOCK" } struct xfs_inode; diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index b1b801e4a28e..7e3626e5925c 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -24,20 +24,17 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_bmap_btree.h" #include "xfs_alloc_btree.h" #include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_ialloc.h" #include "xfs_itable.h" #include "xfs_error.h" #include "xfs_btree.h" +#include "xfs_trace.h" STATIC int xfs_internal_inum( @@ -49,24 +46,40 @@ xfs_internal_inum( (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino))); } -STATIC int -xfs_bulkstat_one_iget( - xfs_mount_t *mp, /* mount point for filesystem */ - xfs_ino_t ino, /* inode number to get data for */ - xfs_daddr_t bno, /* starting bno of inode cluster */ - xfs_bstat_t *buf, /* return buffer */ - int *stat) /* BULKSTAT_RV_... */ +/* + * Return stat information for one inode. + * Return 0 if ok, else errno. 
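 *
 * Annotation (not part of this patch): this function replaces the old
 * xfs_bulkstat_one_iget()/xfs_bulkstat_one_dinode() pair.  The caller no
 * longer passes a cluster block number or an on-disk inode buffer; the
 * lookup always goes through xfs_iget() with XFS_IGET_UNTRUSTED, so an
 * inode number supplied from user space is validated during the lookup
 * rather than trusted.  The filled xfs_bstat is then handed to the
 * formatter callback.  A minimal formatter sketch, modelled on
 * xfs_bulkstat_one_fmt() further down in this file:
 *
 *	static int example_bstat_fmt(void __user *ubuffer, int ubsize,
 *				     int *ubused, const xfs_bstat_t *buffer)
 *	{
 *		if (ubsize < sizeof(*buffer))
 *			return XFS_ERROR(ENOMEM);
 *		if (copy_to_user(ubuffer, buffer, sizeof(*buffer)))
 *			return XFS_ERROR(EFAULT);
 *		if (ubused)
 *			*ubused = sizeof(*buffer);
 *		return 0;
 *	}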
+ */ +int +xfs_bulkstat_one_int( + struct xfs_mount *mp, /* mount point for filesystem */ + xfs_ino_t ino, /* inode to get data for */ + void __user *buffer, /* buffer to place output in */ + int ubsize, /* size of buffer */ + bulkstat_one_fmt_pf formatter, /* formatter, copy to user */ + int *ubused, /* bytes used by me */ + int *stat) /* BULKSTAT_RV_... */ { - xfs_icdinode_t *dic; /* dinode core info pointer */ - xfs_inode_t *ip; /* incore inode pointer */ - struct inode *inode; - int error; + struct xfs_icdinode *dic; /* dinode core info pointer */ + struct xfs_inode *ip; /* incore inode pointer */ + struct inode *inode; + struct xfs_bstat *buf; /* return buffer */ + int error = 0; /* error value */ + + *stat = BULKSTAT_RV_NOTHING; + + if (!buffer || xfs_internal_inum(mp, ino)) + return XFS_ERROR(EINVAL); + + buf = kmem_alloc(sizeof(*buf), KM_SLEEP | KM_MAYFAIL); + if (!buf) + return XFS_ERROR(ENOMEM); error = xfs_iget(mp, NULL, ino, - XFS_IGET_BULKSTAT, XFS_ILOCK_SHARED, &ip, bno); + XFS_IGET_UNTRUSTED, XFS_ILOCK_SHARED, &ip); if (error) { *stat = BULKSTAT_RV_NOTHING; - return error; + goto out_free; } ASSERT(ip != NULL); @@ -127,77 +140,17 @@ xfs_bulkstat_one_iget( buf->bs_blocks = dic->di_nblocks + ip->i_delayed_blks; break; } + xfs_iunlock(ip, XFS_ILOCK_SHARED); + IRELE(ip); - xfs_iput(ip, XFS_ILOCK_SHARED); - return error; -} - -STATIC void -xfs_bulkstat_one_dinode( - xfs_mount_t *mp, /* mount point for filesystem */ - xfs_ino_t ino, /* inode number to get data for */ - xfs_dinode_t *dic, /* dinode inode pointer */ - xfs_bstat_t *buf) /* return buffer */ -{ - /* - * The inode format changed when we moved the link count and - * made it 32 bits long. If this is an old format inode, - * convert it in memory to look like a new one. If it gets - * flushed to disk we will convert back before flushing or - * logging it. We zero out the new projid field and the old link - * count field. We'll handle clearing the pad field (the remains - * of the old uuid field) when we actually convert the inode to - * the new format. We don't change the version number so that we - * can distinguish this from a real new format inode. 
- */ - if (dic->di_version == 1) { - buf->bs_nlink = be16_to_cpu(dic->di_onlink); - buf->bs_projid = 0; - } else { - buf->bs_nlink = be32_to_cpu(dic->di_nlink); - buf->bs_projid = be16_to_cpu(dic->di_projid); - } + error = formatter(buffer, ubsize, ubused, buf); - buf->bs_ino = ino; - buf->bs_mode = be16_to_cpu(dic->di_mode); - buf->bs_uid = be32_to_cpu(dic->di_uid); - buf->bs_gid = be32_to_cpu(dic->di_gid); - buf->bs_size = be64_to_cpu(dic->di_size); - buf->bs_atime.tv_sec = be32_to_cpu(dic->di_atime.t_sec); - buf->bs_atime.tv_nsec = be32_to_cpu(dic->di_atime.t_nsec); - buf->bs_mtime.tv_sec = be32_to_cpu(dic->di_mtime.t_sec); - buf->bs_mtime.tv_nsec = be32_to_cpu(dic->di_mtime.t_nsec); - buf->bs_ctime.tv_sec = be32_to_cpu(dic->di_ctime.t_sec); - buf->bs_ctime.tv_nsec = be32_to_cpu(dic->di_ctime.t_nsec); - buf->bs_xflags = xfs_dic2xflags(dic); - buf->bs_extsize = be32_to_cpu(dic->di_extsize) << mp->m_sb.sb_blocklog; - buf->bs_extents = be32_to_cpu(dic->di_nextents); - buf->bs_gen = be32_to_cpu(dic->di_gen); - memset(buf->bs_pad, 0, sizeof(buf->bs_pad)); - buf->bs_dmevmask = be32_to_cpu(dic->di_dmevmask); - buf->bs_dmstate = be16_to_cpu(dic->di_dmstate); - buf->bs_aextents = be16_to_cpu(dic->di_anextents); - buf->bs_forkoff = XFS_DFORK_BOFF(dic); + if (!error) + *stat = BULKSTAT_RV_DIDONE; - switch (dic->di_format) { - case XFS_DINODE_FMT_DEV: - buf->bs_rdev = xfs_dinode_get_rdev(dic); - buf->bs_blksize = BLKDEV_IOSIZE; - buf->bs_blocks = 0; - break; - case XFS_DINODE_FMT_LOCAL: - case XFS_DINODE_FMT_UUID: - buf->bs_rdev = 0; - buf->bs_blksize = mp->m_sb.sb_blocksize; - buf->bs_blocks = 0; - break; - case XFS_DINODE_FMT_EXTENTS: - case XFS_DINODE_FMT_BTREE: - buf->bs_rdev = 0; - buf->bs_blksize = mp->m_sb.sb_blocksize; - buf->bs_blocks = be64_to_cpu(dic->di_nblocks); - break; - } + out_free: + kmem_free(buf); + return error; } /* Return 0 on success or positive error */ @@ -217,118 +170,17 @@ xfs_bulkstat_one_fmt( return 0; } -/* - * Return stat information for one inode. - * Return 0 if ok, else errno. - */ -int /* error status */ -xfs_bulkstat_one_int( - xfs_mount_t *mp, /* mount point for filesystem */ - xfs_ino_t ino, /* inode number to get data for */ - void __user *buffer, /* buffer to place output in */ - int ubsize, /* size of buffer */ - bulkstat_one_fmt_pf formatter, /* formatter, copy to user */ - xfs_daddr_t bno, /* starting bno of inode cluster */ - int *ubused, /* bytes used by me */ - void *dibuff, /* on-disk inode buffer */ - int *stat) /* BULKSTAT_RV_... */ -{ - xfs_bstat_t *buf; /* return buffer */ - int error = 0; /* error value */ - xfs_dinode_t *dip; /* dinode inode pointer */ - - dip = (xfs_dinode_t *)dibuff; - *stat = BULKSTAT_RV_NOTHING; - - if (!buffer || xfs_internal_inum(mp, ino)) - return XFS_ERROR(EINVAL); - - buf = kmem_alloc(sizeof(*buf), KM_SLEEP); - - if (dip == NULL) { - /* We're not being passed a pointer to a dinode. This happens - * if BULKSTAT_FG_IGET is selected. Do the iget. 
- */ - error = xfs_bulkstat_one_iget(mp, ino, bno, buf, stat); - if (error) - goto out_free; - } else { - xfs_bulkstat_one_dinode(mp, ino, dip, buf); - } - - error = formatter(buffer, ubsize, ubused, buf); - if (error) - goto out_free; - - *stat = BULKSTAT_RV_DIDONE; - - out_free: - kmem_free(buf); - return error; -} - int xfs_bulkstat_one( xfs_mount_t *mp, /* mount point for filesystem */ xfs_ino_t ino, /* inode number to get data for */ void __user *buffer, /* buffer to place output in */ int ubsize, /* size of buffer */ - void *private_data, /* my private data */ - xfs_daddr_t bno, /* starting bno of inode cluster */ int *ubused, /* bytes used by me */ - void *dibuff, /* on-disk inode buffer */ int *stat) /* BULKSTAT_RV_... */ { return xfs_bulkstat_one_int(mp, ino, buffer, ubsize, - xfs_bulkstat_one_fmt, bno, - ubused, dibuff, stat); -} - -/* - * Test to see whether we can use the ondisk inode directly, based - * on the given bulkstat flags, filling in dipp accordingly. - * Returns zero if the inode is dodgey. - */ -STATIC int -xfs_bulkstat_use_dinode( - xfs_mount_t *mp, - int flags, - xfs_buf_t *bp, - int clustidx, - xfs_dinode_t **dipp) -{ - xfs_dinode_t *dip; - unsigned int aformat; - - *dipp = NULL; - if (!bp || (flags & BULKSTAT_FG_IGET)) - return 1; - dip = (xfs_dinode_t *) - xfs_buf_offset(bp, clustidx << mp->m_sb.sb_inodelog); - /* - * Check the buffer containing the on-disk inode for di_mode == 0. - * This is to prevent xfs_bulkstat from picking up just reclaimed - * inodes that have their in-core state initialized but not flushed - * to disk yet. This is a temporary hack that would require a proper - * fix in the future. - */ - if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC || - !XFS_DINODE_GOOD_VERSION(dip->di_version) || - !dip->di_mode) - return 0; - if (flags & BULKSTAT_FG_QUICK) { - *dipp = dip; - return 1; - } - /* BULKSTAT_FG_INLINE: if attr fork is local, or not there, use it */ - aformat = dip->di_aformat; - if ((XFS_DFORK_Q(dip) == 0) || - (aformat == XFS_DINODE_FMT_LOCAL) || - (aformat == XFS_DINODE_FMT_EXTENTS && !dip->di_anextents)) { - *dipp = dip; - return 1; - } - return 1; + xfs_bulkstat_one_fmt, ubused, stat); } #define XFS_BULKSTAT_UBLEFT(ubleft) ((ubleft) >= statstruct_size) @@ -342,10 +194,8 @@ xfs_bulkstat( xfs_ino_t *lastinop, /* last inode returned */ int *ubcountp, /* size of buffer/count returned */ bulkstat_one_pf formatter, /* func that'd fill a single buf */ - void *private_data,/* private data for formatter */ size_t statstruct_size, /* sizeof struct filling */ char __user *ubuffer, /* buffer with inode stats */ - int flags, /* defined in xfs_itable.h */ int *done) /* 1 if there are more stats to get */ { xfs_agblock_t agbno=0;/* allocation group block number */ @@ -380,14 +230,12 @@ xfs_bulkstat( int ubelem; /* spaces used in user's buffer */ int ubused; /* bytes used by formatter */ xfs_buf_t *bp; /* ptr to on-disk inode cluster buf */ - xfs_dinode_t *dip; /* ptr into bp for specific inode */ /* * Get the last inode value, see if there's nothing to do. 
*/ ino = (xfs_ino_t)*lastinop; lastino = ino; - dip = NULL; agno = XFS_INO_TO_AGNO(mp, ino); agino = XFS_INO_TO_AGINO(mp, ino); if (agno >= mp->m_sb.sb_agcount || @@ -612,37 +460,6 @@ xfs_bulkstat( irbp->ir_startino) + ((chunkidx & nimask) >> mp->m_sb.sb_inopblog); - - if (flags & (BULKSTAT_FG_QUICK | - BULKSTAT_FG_INLINE)) { - int offset; - - ino = XFS_AGINO_TO_INO(mp, agno, - agino); - bno = XFS_AGB_TO_DADDR(mp, agno, - agbno); - - /* - * Get the inode cluster buffer - */ - if (bp) - xfs_buf_relse(bp); - - error = xfs_inotobp(mp, NULL, ino, &dip, - &bp, &offset, - XFS_IGET_BULKSTAT); - - if (!error) - clustidx = offset / mp->m_sb.sb_inodesize; - if (XFS_TEST_ERROR(error != 0, - mp, XFS_ERRTAG_BULKSTAT_READ_CHUNK, - XFS_RANDOM_BULKSTAT_READ_CHUNK)) { - bp = NULL; - ubleft = 0; - rval = error; - break; - } - } } ino = XFS_AGINO_TO_INO(mp, agno, agino); bno = XFS_AGB_TO_DADDR(mp, agno, agbno); @@ -658,35 +475,13 @@ xfs_bulkstat( * when the chunk is used up. */ irbp->ir_freecount++; - if (!xfs_bulkstat_use_dinode(mp, flags, bp, - clustidx, &dip)) { - lastino = ino; - continue; - } - /* - * If we need to do an iget, cannot hold bp. - * Drop it, until starting the next cluster. - */ - if ((flags & BULKSTAT_FG_INLINE) && !dip) { - if (bp) - xfs_buf_relse(bp); - bp = NULL; - } /* * Get the inode and fill in a single buffer. - * BULKSTAT_FG_QUICK uses dip to fill it in. - * BULKSTAT_FG_IGET uses igets. - * BULKSTAT_FG_INLINE uses dip if we have an - * inline attr fork, else igets. - * See: xfs_bulkstat_one & xfs_dm_bulkstat_one. - * This is also used to count inodes/blks, etc - * in xfs_qm_quotacheck. */ ubused = statstruct_size; - error = formatter(mp, ino, ubufp, - ubleft, private_data, - bno, &ubused, dip, &fmterror); + error = formatter(mp, ino, ubufp, ubleft, + &ubused, &fmterror); if (fmterror == BULKSTAT_RV_NOTHING) { if (error && error != ENOENT && error != EINVAL) { @@ -778,8 +573,7 @@ xfs_bulkstat_single( */ ino = (xfs_ino_t)*lastinop; - error = xfs_bulkstat_one(mp, ino, buffer, sizeof(xfs_bstat_t), - NULL, 0, NULL, NULL, &res); + error = xfs_bulkstat_one(mp, ino, buffer, sizeof(xfs_bstat_t), 0, &res); if (error) { /* * Special case way failed, do it the "long" way @@ -788,8 +582,7 @@ xfs_bulkstat_single( (*lastinop)--; count = 1; if (xfs_bulkstat(mp, lastinop, &count, xfs_bulkstat_one, - NULL, sizeof(xfs_bstat_t), buffer, - BULKSTAT_FG_IGET, done)) + sizeof(xfs_bstat_t), buffer, done)) return error; if (count == 0 || (xfs_ino_t)*lastinop != ino) return error == EFSCORRUPTED ? diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h index 20792bf45946..97295d91d170 100644 --- a/fs/xfs/xfs_itable.h +++ b/fs/xfs/xfs_itable.h @@ -27,10 +27,7 @@ typedef int (*bulkstat_one_pf)(struct xfs_mount *mp, xfs_ino_t ino, void __user *buffer, int ubsize, - void *private_data, - xfs_daddr_t bno, int *ubused, - void *dip, int *stat); /* @@ -41,13 +38,6 @@ typedef int (*bulkstat_one_pf)(struct xfs_mount *mp, #define BULKSTAT_RV_GIVEUP 2 /* - * Values for bulkstat flag argument. - */ -#define BULKSTAT_FG_IGET 0x1 /* Go through the buffer cache */ -#define BULKSTAT_FG_QUICK 0x2 /* No iget, walk the dinode cluster */ -#define BULKSTAT_FG_INLINE 0x4 /* No iget if inline attrs */ - -/* * Return stat information in bulk (by-inode) for the filesystem. 
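 *
 * Annotation (not part of this patch): with the BULKSTAT_FG_* access
 * methods gone, the walker and its callback lose the private_data, bno and
 * on-disk-inode arguments.  After this patch the callback type reduces to:
 *
 *	typedef int (*bulkstat_one_pf)(struct xfs_mount *mp,
 *				       xfs_ino_t	ino,
 *				       void __user	*buffer,
 *				       int		ubsize,
 *				       int		*ubused,
 *				       int		*stat);
 *
 * and a caller now invokes the walker as
 * xfs_bulkstat(mp, lastinop, &count, xfs_bulkstat_one,
 * sizeof(xfs_bstat_t), buffer, done), matching the xfs_bulkstat_single()
 * hunk in xfs_itable.c above.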
*/ int /* error status */ @@ -56,10 +46,8 @@ xfs_bulkstat( xfs_ino_t *lastino, /* last inode returned */ int *count, /* size of buffer/count returned */ bulkstat_one_pf formatter, /* func that'd fill a single buf */ - void *private_data, /* private data for formatter */ size_t statstruct_size,/* sizeof struct that we're filling */ char __user *ubuffer,/* buffer with inode stats */ - int flags, /* flag to control access method */ int *done); /* 1 if there are more stats to get */ int @@ -82,9 +70,7 @@ xfs_bulkstat_one_int( void __user *buffer, int ubsize, bulkstat_one_fmt_pf formatter, - xfs_daddr_t bno, int *ubused, - void *dibuff, int *stat); int @@ -93,10 +79,7 @@ xfs_bulkstat_one( xfs_ino_t ino, void __user *buffer, int ubsize, - void *private_data, - xfs_daddr_t bno, int *ubused, - void *dibuff, int *stat); typedef int (*inumbers_fmt_pf)( diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 5215abc8023a..925d572bf0f4 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -24,8 +24,6 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_error.h" #include "xfs_log_priv.h" @@ -35,8 +33,6 @@ #include "xfs_ialloc_btree.h" #include "xfs_log_recover.h" #include "xfs_trans_priv.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_rw.h" @@ -337,7 +333,6 @@ xfs_log_reserve( int retval = 0; ASSERT(client == XFS_TRANSACTION || client == XFS_LOG); - ASSERT((flags & XFS_LOG_NOSLEEP) == 0); if (XLOG_FORCED_SHUTDOWN(log)) return XFS_ERROR(EIO); @@ -552,7 +547,7 @@ xfs_log_unmount_write(xfs_mount_t *mp) .magic = XLOG_UNMOUNT_TYPE, }; struct xfs_log_iovec reg = { - .i_addr = (void *)&magic, + .i_addr = &magic, .i_len = sizeof(magic), .i_type = XLOG_REG_TYPE_UNMOUNT, }; @@ -1047,7 +1042,6 @@ xlog_alloc_log(xfs_mount_t *mp, xlog_in_core_t *iclog, *prev_iclog=NULL; xfs_buf_t *bp; int i; - int iclogsize; int error = ENOMEM; uint log2_size = 0; @@ -1127,7 +1121,6 @@ xlog_alloc_log(xfs_mount_t *mp, * with different amounts of memory. See the definition of * xlog_in_core_t in xfs_log_priv.h for details. */ - iclogsize = log->l_iclog_size; ASSERT(log->l_iclog_size >= 4096); for (i=0; i < log->l_iclog_bufs; i++) { *iclogp = kmem_zalloc(sizeof(xlog_in_core_t), KM_MAYFAIL); @@ -1428,11 +1421,8 @@ xlog_sync(xlog_t *log, XFS_BUF_BUSY(bp); XFS_BUF_ASYNC(bp); bp->b_flags |= XBF_LOG_BUFFER; - /* - * Do an ordered write for the log block. - * Its unnecessary to flush the first split block in the log wrap case. - */ - if (!split && (log->l_mp->m_flags & XFS_MOUNT_BARRIER)) + + if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) XFS_BUF_ORDERED(bp); ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1); diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 04c78e642cc8..916eb7db14d9 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -55,14 +55,10 @@ static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2) /* * Flags to xfs_log_reserve() * - * XFS_LOG_SLEEP: If space is not available, sleep (default) - * XFS_LOG_NOSLEEP: If space is not available, return error * XFS_LOG_PERM_RESERV: Permanent reservation. When writes are * performed against this type of reservation, the reservation * is not decreased. Long running transactions should use this. 
*/ -#define XFS_LOG_SLEEP 0x0 -#define XFS_LOG_NOSLEEP 0x1 #define XFS_LOG_PERM_RESERV 0x2 /* @@ -104,7 +100,7 @@ static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2) #define XLOG_REG_TYPE_MAX 19 typedef struct xfs_log_iovec { - xfs_caddr_t i_addr; /* beginning address of region */ + void *i_addr; /* beginning address of region */ int i_len; /* length in bytes of region */ uint i_type; /* type of region */ } xfs_log_iovec_t; @@ -201,9 +197,4 @@ int xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); #endif - - -extern int xlog_debug; /* set to 1 to enable real log */ - - #endif /* __XFS_LOG_H__ */ diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index bb17cc044bf3..31e4ea2d19ac 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -26,8 +26,6 @@ #include "xfs_log_priv.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_error.h" #include "xfs_alloc.h" @@ -554,7 +552,7 @@ xlog_cil_push( thdr.th_type = XFS_TRANS_CHECKPOINT; thdr.th_tid = tic->t_tid; thdr.th_num_items = num_iovecs; - lhdr.i_addr = (xfs_caddr_t)&thdr; + lhdr.i_addr = &thdr; lhdr.i_len = sizeof(xfs_trans_header_t); lhdr.i_type = XLOG_REG_TYPE_TRANSHDR; tic->t_curr_res -= lhdr.i_len + sizeof(xlog_op_header_t); diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 14a69aec2c0b..6f3f5fa37acf 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -24,15 +24,11 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_error.h" #include "xfs_bmap_btree.h" #include "xfs_alloc_btree.h" #include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_inode_item.h" @@ -132,15 +128,10 @@ xlog_align( int nbblks, xfs_buf_t *bp) { - xfs_daddr_t offset; - xfs_caddr_t ptr; + xfs_daddr_t offset = blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1); - offset = blk_no & ((xfs_daddr_t) log->l_sectBBsize - 1); - ptr = XFS_BUF_PTR(bp) + BBTOB(offset); - - ASSERT(ptr + BBTOB(nbblks) <= XFS_BUF_PTR(bp) + XFS_BUF_SIZE(bp)); - - return ptr; + ASSERT(BBTOB(offset + nbblks) <= XFS_BUF_SIZE(bp)); + return XFS_BUF_PTR(bp) + BBTOB(offset); } @@ -1570,9 +1561,7 @@ xlog_recover_reorder_trans( list_splice_init(&trans->r_itemq, &sort_list); list_for_each_entry_safe(item, n, &sort_list, ri_list) { - xfs_buf_log_format_t *buf_f; - - buf_f = (xfs_buf_log_format_t *)item->ri_buf[0].i_addr; + xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; switch (ITEM_TYPE(item)) { case XFS_LI_BUF: @@ -1897,9 +1886,8 @@ xlog_recover_do_inode_buffer( * current di_next_unlinked field. Extract its value * and copy it to the buffer copy. */ - logged_nextp = (xfs_agino_t *) - ((char *)(item->ri_buf[item_index].i_addr) + - (next_unlinked_offset - reg_buf_offset)); + logged_nextp = item->ri_buf[item_index].i_addr + + next_unlinked_offset - reg_buf_offset; if (unlikely(*logged_nextp == 0)) { xfs_fs_cmn_err(CE_ALERT, mp, "bad inode buffer log record (ptr = 0x%p, bp = 0x%p). 
XFS trying to replay bad (0) inode di_next_unlinked field", @@ -1978,8 +1966,7 @@ xlog_recover_do_reg_buffer( item->ri_buf[i].i_len, __func__); goto next; } - error = xfs_qm_dqcheck((xfs_disk_dquot_t *) - item->ri_buf[i].i_addr, + error = xfs_qm_dqcheck(item->ri_buf[i].i_addr, -1, 0, XFS_QMOPT_DOWARN, "dquot_buf_recover"); if (error) @@ -2192,7 +2179,7 @@ xlog_recover_do_buffer_trans( xlog_recover_item_t *item, int pass) { - xfs_buf_log_format_t *buf_f; + xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; xfs_mount_t *mp; xfs_buf_t *bp; int error; @@ -2202,8 +2189,6 @@ xlog_recover_do_buffer_trans( ushort flags; uint buf_flags; - buf_f = (xfs_buf_log_format_t *)item->ri_buf[0].i_addr; - if (pass == XLOG_RECOVER_PASS1) { /* * In this pass we're only looking for buf items @@ -2324,10 +2309,9 @@ xlog_recover_do_inode_trans( } if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) { - in_f = (xfs_inode_log_format_t *)item->ri_buf[0].i_addr; + in_f = item->ri_buf[0].i_addr; } else { - in_f = (xfs_inode_log_format_t *)kmem_alloc( - sizeof(xfs_inode_log_format_t), KM_SLEEP); + in_f = kmem_alloc(sizeof(xfs_inode_log_format_t), KM_SLEEP); need_free = 1; error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f); if (error) @@ -2375,7 +2359,7 @@ xlog_recover_do_inode_trans( error = EFSCORRUPTED; goto error; } - dicp = (xfs_icdinode_t *)(item->ri_buf[1].i_addr); + dicp = item->ri_buf[1].i_addr; if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) { xfs_buf_relse(bp); xfs_fs_cmn_err(CE_ALERT, mp, @@ -2466,7 +2450,7 @@ xlog_recover_do_inode_trans( } /* The core is in in-core format */ - xfs_dinode_to_disk(dip, (xfs_icdinode_t *)item->ri_buf[1].i_addr); + xfs_dinode_to_disk(dip, item->ri_buf[1].i_addr); /* the rest is in on-disk format */ if (item->ri_buf[1].i_len > sizeof(struct xfs_icdinode)) { @@ -2583,7 +2567,7 @@ xlog_recover_do_quotaoff_trans( return (0); } - qoff_f = (xfs_qoff_logformat_t *)item->ri_buf[0].i_addr; + qoff_f = item->ri_buf[0].i_addr; ASSERT(qoff_f); /* @@ -2627,9 +2611,8 @@ xlog_recover_do_dquot_trans( if (mp->m_qflags == 0) return (0); - recddq = (xfs_disk_dquot_t *)item->ri_buf[1].i_addr; - - if (item->ri_buf[1].i_addr == NULL) { + recddq = item->ri_buf[1].i_addr; + if (recddq == NULL) { cmn_err(CE_ALERT, "XFS: NULL dquot in %s.", __func__); return XFS_ERROR(EIO); @@ -2659,7 +2642,7 @@ xlog_recover_do_dquot_trans( * The other possibility, of course, is that the quota subsystem was * removed since the last mount - ENOSYS. 
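 *
 * Annotation (not part of this patch): the cast removals in this and the
 * surrounding recovery hunks all follow from the xfs_log.h change earlier
 * in this diff, where xfs_log_iovec.i_addr became a void * instead of an
 * xfs_caddr_t, so assignments like the one just below need no cast:
 *
 *	typedef struct xfs_log_iovec {
 *		void	*i_addr;
 *		int	i_len;
 *		uint	i_type;
 *	} xfs_log_iovec_t;
 *
 *	xfs_dq_logformat_t *dq_f = item->ri_buf[0].i_addr;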
*/ - dq_f = (xfs_dq_logformat_t *)item->ri_buf[0].i_addr; + dq_f = item->ri_buf[0].i_addr; ASSERT(dq_f); if ((error = xfs_qm_dqcheck(recddq, dq_f->qlf_id, @@ -2726,7 +2709,7 @@ xlog_recover_do_efi_trans( return 0; } - efi_formatp = (xfs_efi_log_format_t *)item->ri_buf[0].i_addr; + efi_formatp = item->ri_buf[0].i_addr; mp = log->l_mp; efip = xfs_efi_init(mp, efi_formatp->efi_nextents); @@ -2772,7 +2755,7 @@ xlog_recover_do_efd_trans( return; } - efd_formatp = (xfs_efd_log_format_t *)item->ri_buf[0].i_addr; + efd_formatp = item->ri_buf[0].i_addr; ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) + ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) || (item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_64_t) + @@ -3203,7 +3186,7 @@ xlog_recover_process_one_iunlink( int error; ino = XFS_AGINO_TO_INO(mp, agno, agino); - error = xfs_iget(mp, NULL, ino, 0, 0, &ip, 0); + error = xfs_iget(mp, NULL, ino, 0, 0, &ip); if (error) goto fail; diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index d7bf38c8cd1c..aeb9d72ebf6e 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -25,13 +25,10 @@ #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_bmap_btree.h" #include "xfs_alloc_btree.h" #include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_btree.h" @@ -268,10 +265,10 @@ xfs_sb_validate_fsb_count( #if XFS_BIG_BLKNOS /* Limited by ULONG_MAX of page cache index */ if (nblocks >> (PAGE_CACHE_SHIFT - sbp->sb_blocklog) > ULONG_MAX) - return E2BIG; + return EFBIG; #else /* Limited by UINT_MAX of sectors */ if (nblocks << (sbp->sb_blocklog - BBSHIFT) > UINT_MAX) - return E2BIG; + return EFBIG; #endif return 0; } @@ -393,7 +390,7 @@ xfs_mount_validate_sb( xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) { xfs_fs_mount_cmn_err(flags, "file system too large to be mounted on this system."); - return XFS_ERROR(E2BIG); + return XFS_ERROR(EFBIG); } if (unlikely(sbp->sb_inprogress)) { @@ -413,17 +410,6 @@ xfs_mount_validate_sb( return 0; } -STATIC void -xfs_initialize_perag_icache( - xfs_perag_t *pag) -{ - if (!pag->pag_ici_init) { - rwlock_init(&pag->pag_ici_lock); - INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); - pag->pag_ici_init = 1; - } -} - int xfs_initialize_perag( xfs_mount_t *mp, @@ -436,13 +422,8 @@ xfs_initialize_perag( xfs_agino_t agino; xfs_ino_t ino; xfs_sb_t *sbp = &mp->m_sb; - xfs_ino_t max_inum = XFS_MAXINUMBER_32; int error = -ENOMEM; - /* Check to see if the filesystem can overflow 32 bit inodes */ - agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0); - ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino); - /* * Walk the current per-ag tree so we don't try to initialise AGs * that already exist (growfs case). 
Allocate and insert all the @@ -456,11 +437,18 @@ xfs_initialize_perag( } if (!first_initialised) first_initialised = index; + pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL); if (!pag) goto out_unwind; + pag->pag_agno = index; + pag->pag_mount = mp; + rwlock_init(&pag->pag_ici_lock); + INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); + if (radix_tree_preload(GFP_NOFS)) goto out_unwind; + spin_lock(&mp->m_perag_lock); if (radix_tree_insert(&mp->m_perag_tree, index, pag)) { BUG(); @@ -469,25 +457,26 @@ xfs_initialize_perag( error = -EEXIST; goto out_unwind; } - pag->pag_agno = index; - pag->pag_mount = mp; spin_unlock(&mp->m_perag_lock); radix_tree_preload_end(); } - /* Clear the mount flag if no inode can overflow 32 bits - * on this filesystem, or if specifically requested.. + /* + * If we mount with the inode64 option, or no inode overflows + * the legacy 32-bit address space clear the inode32 option. */ - if ((mp->m_flags & XFS_MOUNT_SMALL_INUMS) && ino > max_inum) { + agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0); + ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino); + + if ((mp->m_flags & XFS_MOUNT_SMALL_INUMS) && ino > XFS_MAXINUMBER_32) mp->m_flags |= XFS_MOUNT_32BITINODES; - } else { + else mp->m_flags &= ~XFS_MOUNT_32BITINODES; - } - /* If we can overflow then setup the ag headers accordingly */ if (mp->m_flags & XFS_MOUNT_32BITINODES) { - /* Calculate how much should be reserved for inodes to - * meet the max inode percentage. + /* + * Calculate how much should be reserved for inodes to meet + * the max inode percentage. */ if (mp->m_maxicount) { __uint64_t icount; @@ -500,30 +489,28 @@ xfs_initialize_perag( } else { max_metadata = agcount; } + for (index = 0; index < agcount; index++) { ino = XFS_AGINO_TO_INO(mp, index, agino); - if (ino > max_inum) { + if (ino > XFS_MAXINUMBER_32) { index++; break; } - /* This ag is preferred for inodes */ pag = xfs_perag_get(mp, index); pag->pagi_inodeok = 1; if (index < max_metadata) pag->pagf_metadata = 1; - xfs_initialize_perag_icache(pag); xfs_perag_put(pag); } } else { - /* Setup default behavior for smaller filesystems */ for (index = 0; index < agcount; index++) { pag = xfs_perag_get(mp, index); pag->pagi_inodeok = 1; - xfs_initialize_perag_icache(pag); xfs_perag_put(pag); } } + if (maxagi) *maxagi = index; return 0; @@ -1009,7 +996,7 @@ xfs_check_sizes(xfs_mount_t *mp) d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks); if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) { cmn_err(CE_WARN, "XFS: size check 1 failed"); - return XFS_ERROR(E2BIG); + return XFS_ERROR(EFBIG); } error = xfs_read_buf(mp, mp->m_ddev_targp, d - XFS_FSS_TO_BB(mp, 1), @@ -1019,7 +1006,7 @@ xfs_check_sizes(xfs_mount_t *mp) } else { cmn_err(CE_WARN, "XFS: size check 2 failed"); if (error == ENOSPC) - error = XFS_ERROR(E2BIG); + error = XFS_ERROR(EFBIG); return error; } @@ -1027,7 +1014,7 @@ xfs_check_sizes(xfs_mount_t *mp) d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks); if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) { cmn_err(CE_WARN, "XFS: size check 3 failed"); - return XFS_ERROR(E2BIG); + return XFS_ERROR(EFBIG); } error = xfs_read_buf(mp, mp->m_logdev_targp, d - XFS_FSB_TO_BB(mp, 1), @@ -1037,7 +1024,7 @@ xfs_check_sizes(xfs_mount_t *mp) } else { cmn_err(CE_WARN, "XFS: size check 3 failed"); if (error == ENOSPC) - error = XFS_ERROR(E2BIG); + error = XFS_ERROR(EFBIG); return error; } } @@ -1254,7 +1241,7 @@ xfs_mountfs( * Allocate and initialize the per-ag data. 
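 *
 * Annotation (not part of this patch): xfs_initialize_perag() now sets up
 * the inode-cache lock and radix tree as each xfs_perag is allocated, which
 * is why the lazy xfs_initialize_perag_icache() helper earlier in this
 * file's diff goes away.  The per-ag tree itself switches to GFP_ATOMIC,
 * presumably because insertions are done under the m_perag_lock spinlock
 * with the tree nodes supplied up front by radix_tree_preload().  A minimal
 * sketch of that insert pattern:
 *
 *	error = radix_tree_preload(GFP_NOFS);
 *	if (error)
 *		return error;
 *	spin_lock(&mp->m_perag_lock);
 *	error = radix_tree_insert(&mp->m_perag_tree, index, pag);
 *	spin_unlock(&mp->m_perag_lock);
 *	radix_tree_preload_end();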
*/ spin_lock_init(&mp->m_perag_lock); - INIT_RADIX_TREE(&mp->m_perag_tree, GFP_NOFS); + INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC); error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi); if (error) { cmn_err(CE_WARN, "XFS: Failed per-ag init: %d", error); @@ -1310,7 +1297,7 @@ xfs_mountfs( * Get and sanity-check the root inode. * Save the pointer to it in the mount structure. */ - error = xfs_iget(mp, NULL, sbp->sb_rootino, 0, XFS_ILOCK_EXCL, &rip, 0); + error = xfs_iget(mp, NULL, sbp->sb_rootino, 0, XFS_ILOCK_EXCL, &rip); if (error) { cmn_err(CE_WARN, "XFS: failed to read root inode"); goto out_log_dealloc; diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 1d2c7eed4eda..622da2179a57 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -66,65 +66,6 @@ struct xfs_nameops; struct xfs_ail; struct xfs_quotainfo; - -/* - * Prototypes and functions for the Data Migration subsystem. - */ - -typedef int (*xfs_send_data_t)(int, struct xfs_inode *, - xfs_off_t, size_t, int, int *); -typedef int (*xfs_send_mmap_t)(struct vm_area_struct *, uint); -typedef int (*xfs_send_destroy_t)(struct xfs_inode *, dm_right_t); -typedef int (*xfs_send_namesp_t)(dm_eventtype_t, struct xfs_mount *, - struct xfs_inode *, dm_right_t, - struct xfs_inode *, dm_right_t, - const unsigned char *, const unsigned char *, - mode_t, int, int); -typedef int (*xfs_send_mount_t)(struct xfs_mount *, dm_right_t, - char *, char *); -typedef void (*xfs_send_unmount_t)(struct xfs_mount *, struct xfs_inode *, - dm_right_t, mode_t, int, int); - -typedef struct xfs_dmops { - xfs_send_data_t xfs_send_data; - xfs_send_mmap_t xfs_send_mmap; - xfs_send_destroy_t xfs_send_destroy; - xfs_send_namesp_t xfs_send_namesp; - xfs_send_mount_t xfs_send_mount; - xfs_send_unmount_t xfs_send_unmount; -} xfs_dmops_t; - -#define XFS_DMAPI_UNMOUNT_FLAGS(mp) \ - (((mp)->m_dmevmask & (1 << DM_EVENT_UNMOUNT)) ? 
0 : DM_FLAGS_UNWANTED) - -#define XFS_SEND_DATA(mp, ev,ip,off,len,fl,lock) \ - (*(mp)->m_dm_ops->xfs_send_data)(ev,ip,off,len,fl,lock) -#define XFS_SEND_MMAP(mp, vma,fl) \ - (*(mp)->m_dm_ops->xfs_send_mmap)(vma,fl) -#define XFS_SEND_DESTROY(mp, ip,right) \ - (*(mp)->m_dm_ops->xfs_send_destroy)(ip,right) -#define XFS_SEND_NAMESP(mp, ev,b1,r1,b2,r2,n1,n2,mode,rval,fl) \ - (*(mp)->m_dm_ops->xfs_send_namesp)(ev,NULL,b1,r1,b2,r2,n1,n2,mode,rval,fl) -#define XFS_SEND_MOUNT(mp,right,path,name) \ - (*(mp)->m_dm_ops->xfs_send_mount)(mp,right,path,name) -#define XFS_SEND_PREUNMOUNT(mp) \ -do { \ - if (mp->m_flags & XFS_MOUNT_DMAPI) { \ - (*(mp)->m_dm_ops->xfs_send_namesp)(DM_EVENT_PREUNMOUNT, mp, \ - (mp)->m_rootip, DM_RIGHT_NULL, \ - (mp)->m_rootip, DM_RIGHT_NULL, \ - NULL, NULL, 0, 0, XFS_DMAPI_UNMOUNT_FLAGS(mp)); \ - } \ -} while (0) -#define XFS_SEND_UNMOUNT(mp) \ -do { \ - if (mp->m_flags & XFS_MOUNT_DMAPI) { \ - (*(mp)->m_dm_ops->xfs_send_unmount)(mp, (mp)->m_rootip, \ - DM_RIGHT_NULL, 0, 0, XFS_DMAPI_UNMOUNT_FLAGS(mp)); \ - } \ -} while (0) - - #ifdef HAVE_PERCPU_SB /* @@ -241,8 +182,6 @@ typedef struct xfs_mount { uint m_chsize; /* size of next field */ struct xfs_chash *m_chash; /* fs private inode per-cluster * hash table */ - struct xfs_dmops *m_dm_ops; /* vector of DMI ops */ - struct xfs_qmops *m_qm_ops; /* vector of XQM ops */ atomic_t m_active_trans; /* number trans frozen */ #ifdef HAVE_PERCPU_SB xfs_icsb_cnts_t __percpu *m_sb_cnts; /* per-cpu superblock counters */ @@ -259,7 +198,7 @@ typedef struct xfs_mount { wait_queue_head_t m_wait_single_sync_task; __int64_t m_update_flags; /* sb flags we need to update on the next remount,rw */ - struct list_head m_mplist; /* inode shrinker mount list */ + struct shrinker m_inode_shrink; /* inode reclaim shrinker */ } xfs_mount_t; /* @@ -269,7 +208,6 @@ typedef struct xfs_mount { must be synchronous except for space allocations */ #define XFS_MOUNT_DELAYLOG (1ULL << 1) /* delayed logging is enabled */ -#define XFS_MOUNT_DMAPI (1ULL << 2) /* dmapi is enabled */ #define XFS_MOUNT_WAS_CLEAN (1ULL << 3) #define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem operations, typically for @@ -282,8 +220,6 @@ typedef struct xfs_mount { #define XFS_MOUNT_GRPID (1ULL << 9) /* group-ID assigned from directory */ #define XFS_MOUNT_NORECOVERY (1ULL << 10) /* no recovery - dirty fs */ #define XFS_MOUNT_DFLT_IOSIZE (1ULL << 12) /* set default i/o size */ -#define XFS_MOUNT_OSYNCISOSYNC (1ULL << 13) /* o_sync is REALLY o_sync */ - /* osyncisdsync is now default*/ #define XFS_MOUNT_32BITINODES (1ULL << 14) /* do not create inodes above * 32 bits in size */ #define XFS_MOUNT_SMALL_INUMS (1ULL << 15) /* users wants 32bit inodes */ @@ -440,11 +376,6 @@ extern int xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t); extern int xfs_dev_is_read_only(struct xfs_mount *, char *); -extern int xfs_dmops_get(struct xfs_mount *); -extern void xfs_dmops_put(struct xfs_mount *); - -extern struct xfs_dmops xfs_dmcore_xfs; - #endif /* __KERNEL__ */ extern void xfs_mod_sb(struct xfs_trans *, __int64_t); diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c index fc1cda23b817..8fca957200df 100644 --- a/fs/xfs/xfs_rename.c +++ b/fs/xfs/xfs_rename.c @@ -24,12 +24,9 @@ #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_da_btree.h" #include "xfs_bmap_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_inode_item.h" @@ 
-116,20 +113,7 @@ xfs_rename( int spaceres; int num_inodes; - xfs_itrace_entry(src_dp); - xfs_itrace_entry(target_dp); - - if (DM_EVENT_ENABLED(src_dp, DM_EVENT_RENAME) || - DM_EVENT_ENABLED(target_dp, DM_EVENT_RENAME)) { - error = XFS_SEND_NAMESP(mp, DM_EVENT_RENAME, - src_dp, DM_RIGHT_NULL, - target_dp, DM_RIGHT_NULL, - src_name->name, target_name->name, - 0, 0, 0); - if (error) - return error; - } - /* Return through std_return after this point. */ + trace_xfs_rename(src_dp, target_dp, src_name, target_name); new_parent = (src_dp != target_dp); src_is_directory = ((src_ip->i_d.di_mode & S_IFMT) == S_IFDIR); @@ -184,26 +168,14 @@ xfs_rename( /* * Join all the inodes to the transaction. From this point on, * we can rely on either trans_commit or trans_cancel to unlock - * them. Note that we need to add a vnode reference to the - * directories since trans_commit & trans_cancel will decrement - * them when they unlock the inodes. Also, we need to be careful - * not to add an inode to the transaction more than once. + * them. */ - IHOLD(src_dp); - xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL); - - if (new_parent) { - IHOLD(target_dp); - xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL); - } - - IHOLD(src_ip); - xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL); - - if (target_ip) { - IHOLD(target_ip); - xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL); - } + xfs_trans_ijoin_ref(tp, src_dp, XFS_ILOCK_EXCL); + if (new_parent) + xfs_trans_ijoin_ref(tp, target_dp, XFS_ILOCK_EXCL); + xfs_trans_ijoin_ref(tp, src_ip, XFS_ILOCK_EXCL); + if (target_ip) + xfs_trans_ijoin_ref(tp, target_ip, XFS_ILOCK_EXCL); /* * If we are using project inheritance, we only allow renames @@ -369,26 +341,13 @@ xfs_rename( * trans_commit will unlock src_ip, target_ip & decrement * the vnode references. 
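 *
 * Annotation (not part of this patch): the join logic earlier in this hunk
 * shrinks because xfs_trans_ijoin_ref() appears to fold the explicit
 * reference grab and the lock handling into one call.  Per joined inode the
 * change is:
 *
 *	before:	IHOLD(src_dp);
 *		xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL);
 *
 *	after:	xfs_trans_ijoin_ref(tp, src_dp, XFS_ILOCK_EXCL);
 *
 * The DM_EVENT_RENAME/DM_EVENT_POSTRENAME callouts are dropped along with
 * the rest of the DMAPI support removed elsewhere in this diff, which is
 * why std_return now simply returns the error.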
*/ - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); - - /* Fall through to std_return with error = 0 or errno from - * xfs_trans_commit */ -std_return: - if (DM_EVENT_ENABLED(src_dp, DM_EVENT_POSTRENAME) || - DM_EVENT_ENABLED(target_dp, DM_EVENT_POSTRENAME)) { - (void) XFS_SEND_NAMESP (mp, DM_EVENT_POSTRENAME, - src_dp, DM_RIGHT_NULL, - target_dp, DM_RIGHT_NULL, - src_name->name, target_name->name, - 0, error, 0); - } - return error; + return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); abort_return: cancel_flags |= XFS_TRANS_ABORT; - /* FALLTHROUGH */ error_return: xfs_bmap_cancel(&free_list); xfs_trans_cancel(tp, cancel_flags); - goto std_return; + std_return: + return error; } diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 6be05f756d59..891260fea11e 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -25,17 +25,10 @@ #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" -#include "xfs_btree.h" -#include "xfs_ialloc.h" #include "xfs_alloc.h" #include "xfs_bmap.h" #include "xfs_rtalloc.h" @@ -129,7 +122,7 @@ xfs_growfs_rt_alloc( cancelflags |= XFS_TRANS_ABORT; error = xfs_bmapi(tp, ip, oblocks, nblocks - oblocks, XFS_BMAPI_WRITE | XFS_BMAPI_METADATA, &firstblock, - resblks, &map, &nmap, &flist, NULL); + resblks, &map, &nmap, &flist); if (!error && nmap < 1) error = XFS_ERROR(ENOSPC); if (error) @@ -2247,7 +2240,7 @@ xfs_rtmount_init( cmn_err(CE_WARN, "XFS: realtime mount -- %llu != %llu", (unsigned long long) XFS_BB_TO_FSB(mp, d), (unsigned long long) mp->m_sb.sb_rblocks); - return XFS_ERROR(E2BIG); + return XFS_ERROR(EFBIG); } error = xfs_read_buf(mp, mp->m_rtdev_targp, d - XFS_FSB_TO_BB(mp, 1), @@ -2256,7 +2249,7 @@ xfs_rtmount_init( cmn_err(CE_WARN, "XFS: realtime mount -- xfs_read_buf failed, returned %d", error); if (error == ENOSPC) - return XFS_ERROR(E2BIG); + return XFS_ERROR(EFBIG); return error; } xfs_buf_relse(bp); @@ -2277,12 +2270,12 @@ xfs_rtmount_inodes( sbp = &mp->m_sb; if (sbp->sb_rbmino == NULLFSINO) return 0; - error = xfs_iget(mp, NULL, sbp->sb_rbmino, 0, 0, &mp->m_rbmip, 0); + error = xfs_iget(mp, NULL, sbp->sb_rbmino, 0, 0, &mp->m_rbmip); if (error) return error; ASSERT(mp->m_rbmip != NULL); ASSERT(sbp->sb_rsumino != NULLFSINO); - error = xfs_iget(mp, NULL, sbp->sb_rsumino, 0, 0, &mp->m_rsumip, 0); + error = xfs_iget(mp, NULL, sbp->sb_rsumino, 0, 0, &mp->m_rsumip); if (error) { IRELE(mp->m_rbmip); return error; diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h index b2d67adb6a08..ff614c29b441 100644 --- a/fs/xfs/xfs_rtalloc.h +++ b/fs/xfs/xfs_rtalloc.h @@ -147,7 +147,16 @@ xfs_growfs_rt( # define xfs_rtfree_extent(t,b,l) (ENOSYS) # define xfs_rtpick_extent(m,t,l,rb) (ENOSYS) # define xfs_growfs_rt(mp,in) (ENOSYS) -# define xfs_rtmount_init(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS)) +static inline int /* error */ +xfs_rtmount_init( + xfs_mount_t *mp) /* file system mount structure */ +{ + if (mp->m_sb.sb_rblocks == 0) + return 0; + + cmn_err(CE_WARN, "XFS: Not built with CONFIG_XFS_RT"); + return ENOSYS; +} # define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 
0 : (ENOSYS)) # define xfs_rtunmount_inodes(m) #endif /* CONFIG_XFS_RT */ diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c index e336742a58a4..56861d5daaef 100644 --- a/fs/xfs/xfs_rw.c +++ b/fs/xfs/xfs_rw.c @@ -24,27 +24,12 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" -#include "xfs_inode_item.h" -#include "xfs_itable.h" -#include "xfs_btree.h" -#include "xfs_alloc.h" -#include "xfs_ialloc.h" -#include "xfs_attr.h" -#include "xfs_bmap.h" #include "xfs_error.h" -#include "xfs_buf_item.h" #include "xfs_rw.h" -#include "xfs_trace.h" /* * Force a shutdown of the filesystem instantly while keeping diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index ce558efa2ea0..fdca7416c754 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * Copyright (C) 2010 Red Hat, Inc. * All Rights Reserved. * * This program is free software; you can redistribute it and/or @@ -24,16 +25,12 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_error.h" #include "xfs_da_btree.h" #include "xfs_bmap_btree.h" #include "xfs_alloc_btree.h" #include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_btree.h" @@ -47,135 +44,491 @@ #include "xfs_trace.h" kmem_zone_t *xfs_trans_zone; +kmem_zone_t *xfs_log_item_desc_zone; + + +/* + * Various log reservation values. + * + * These are based on the size of the file system block because that is what + * most transactions manipulate. Each adds in an additional 128 bytes per + * item logged to try to account for the overhead of the transaction mechanism. + * + * Note: Most of the reservations underestimate the number of allocation + * groups into which they could free extents in the xfs_bmap_finish() call. + * This is because the number in the worst case is quite high and quite + * unusual. In order to fix this we need to change xfs_bmap_finish() to free + * extents in only a single AG at a time. This will require changes to the + * EFI code as well, however, so that the EFI for the extents not freed is + * logged again in each transaction. See SGI PV #261917. + * + * Reservation functions here avoid a huge stack in xfs_trans_init due to + * register overflow from temporaries in the calculations. + */ + /* - * Reservation functions here avoid a huge stack in xfs_trans_init - * due to register overflow from temporaries in the calculations. + * In a write transaction we can allocate a maximum of 2 + * extents. 
This gives: + * the inode getting the new extents: inode size + * the inode's bmap btree: max depth * block size + * the agfs of the ags from which the extents are allocated: 2 * sector + * the superblock free block counter: sector size + * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size + * And the bmap_finish transaction can free bmap blocks in a join: + * the agfs of the ags containing the blocks: 2 * sector size + * the agfls of the ags containing the blocks: 2 * sector size + * the super block free block counter: sector size + * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size */ STATIC uint -xfs_calc_write_reservation(xfs_mount_t *mp) +xfs_calc_write_reservation( + struct xfs_mount *mp) { - return XFS_CALC_WRITE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); + return XFS_DQUOT_LOGRES(mp) + + MAX((mp->m_sb.sb_inodesize + + XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) + + 2 * mp->m_sb.sb_sectsize + + mp->m_sb.sb_sectsize + + XFS_ALLOCFREE_LOG_RES(mp, 2) + + 128 * (4 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + + XFS_ALLOCFREE_LOG_COUNT(mp, 2))), + (2 * mp->m_sb.sb_sectsize + + 2 * mp->m_sb.sb_sectsize + + mp->m_sb.sb_sectsize + + XFS_ALLOCFREE_LOG_RES(mp, 2) + + 128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2)))); } +/* + * In truncating a file we free up to two extents at once. We can modify: + * the inode being truncated: inode size + * the inode's bmap btree: (max depth + 1) * block size + * And the bmap_finish transaction can free the blocks and bmap blocks: + * the agf for each of the ags: 4 * sector size + * the agfl for each of the ags: 4 * sector size + * the super block to reflect the freed blocks: sector size + * worst case split in allocation btrees per extent assuming 4 extents: + * 4 exts * 2 trees * (2 * max depth - 1) * block size + * the inode btree: max depth * blocksize + * the allocation btrees: 2 trees * (max depth - 1) * block size + */ STATIC uint -xfs_calc_itruncate_reservation(xfs_mount_t *mp) +xfs_calc_itruncate_reservation( + struct xfs_mount *mp) { - return XFS_CALC_ITRUNCATE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); + return XFS_DQUOT_LOGRES(mp) + + MAX((mp->m_sb.sb_inodesize + + XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1) + + 128 * (2 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK))), + (4 * mp->m_sb.sb_sectsize + + 4 * mp->m_sb.sb_sectsize + + mp->m_sb.sb_sectsize + + XFS_ALLOCFREE_LOG_RES(mp, 4) + + 128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4)) + + 128 * 5 + + XFS_ALLOCFREE_LOG_RES(mp, 1) + + 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels + + XFS_ALLOCFREE_LOG_COUNT(mp, 1)))); } +/* + * In renaming a files we can modify: + * the four inodes involved: 4 * inode size + * the two directory btrees: 2 * (max depth + v2) * dir block size + * the two directory bmap btrees: 2 * max depth * block size + * And the bmap_finish transaction can free dir and bmap blocks (two sets + * of bmap blocks) giving: + * the agf for the ags in which the blocks live: 3 * sector size + * the agfl for the ags in which the blocks live: 3 * sector size + * the superblock for the free block count: sector size + * the allocation btrees: 3 exts * 2 trees * (2 * max depth - 1) * block size + */ STATIC uint -xfs_calc_rename_reservation(xfs_mount_t *mp) +xfs_calc_rename_reservation( + struct xfs_mount *mp) { - return XFS_CALC_RENAME_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); + return XFS_DQUOT_LOGRES(mp) + + MAX((4 * mp->m_sb.sb_inodesize + + 2 * XFS_DIROP_LOG_RES(mp) + + 128 * (4 + 2 * XFS_DIROP_LOG_COUNT(mp))), + (3 * mp->m_sb.sb_sectsize + + 3 * 
mp->m_sb.sb_sectsize + + mp->m_sb.sb_sectsize + + XFS_ALLOCFREE_LOG_RES(mp, 3) + + 128 * (7 + XFS_ALLOCFREE_LOG_COUNT(mp, 3)))); } +/* + * For creating a link to an inode: + * the parent directory inode: inode size + * the linked inode: inode size + * the directory btree could split: (max depth + v2) * dir block size + * the directory bmap btree could join or split: (max depth + v2) * blocksize + * And the bmap_finish transaction can free some bmap blocks giving: + * the agf for the ag in which the blocks live: sector size + * the agfl for the ag in which the blocks live: sector size + * the superblock for the free block count: sector size + * the allocation btrees: 2 trees * (2 * max depth - 1) * block size + */ STATIC uint -xfs_calc_link_reservation(xfs_mount_t *mp) +xfs_calc_link_reservation( + struct xfs_mount *mp) { - return XFS_CALC_LINK_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); + return XFS_DQUOT_LOGRES(mp) + + MAX((mp->m_sb.sb_inodesize + + mp->m_sb.sb_inodesize + + XFS_DIROP_LOG_RES(mp) + + 128 * (2 + XFS_DIROP_LOG_COUNT(mp))), + (mp->m_sb.sb_sectsize + + mp->m_sb.sb_sectsize + + mp->m_sb.sb_sectsize + + XFS_ALLOCFREE_LOG_RES(mp, 1) + + 128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1)))); } +/* + * For removing a directory entry we can modify: + * the parent directory inode: inode size + * the removed inode: inode size + * the directory btree could join: (max depth + v2) * dir block size + * the directory bmap btree could join or split: (max depth + v2) * blocksize + * And the bmap_finish transaction can free the dir and bmap blocks giving: + * the agf for the ag in which the blocks live: 2 * sector size + * the agfl for the ag in which the blocks live: 2 * sector size + * the superblock for the free block count: sector size + * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size + */ STATIC uint -xfs_calc_remove_reservation(xfs_mount_t *mp) +xfs_calc_remove_reservation( + struct xfs_mount *mp) { - return XFS_CALC_REMOVE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); + return XFS_DQUOT_LOGRES(mp) + + MAX((mp->m_sb.sb_inodesize + + mp->m_sb.sb_inodesize + + XFS_DIROP_LOG_RES(mp) + + 128 * (2 + XFS_DIROP_LOG_COUNT(mp))), + (2 * mp->m_sb.sb_sectsize + + 2 * mp->m_sb.sb_sectsize + + mp->m_sb.sb_sectsize + + XFS_ALLOCFREE_LOG_RES(mp, 2) + + 128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2)))); } +/* + * For symlink we can modify: + * the parent directory inode: inode size + * the new inode: inode size + * the inode btree entry: 1 block + * the directory btree: (max depth + v2) * dir block size + * the directory inode's bmap btree: (max depth + v2) * block size + * the blocks for the symlink: 1 kB + * Or in the first xact we allocate some inodes giving: + * the agi and agf of the ag getting the new inodes: 2 * sectorsize + * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize + * the inode btree: max depth * blocksize + * the allocation btrees: 2 trees * (2 * max depth - 1) * block size + */ STATIC uint -xfs_calc_symlink_reservation(xfs_mount_t *mp) +xfs_calc_symlink_reservation( + struct xfs_mount *mp) { - return XFS_CALC_SYMLINK_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); + return XFS_DQUOT_LOGRES(mp) + + MAX((mp->m_sb.sb_inodesize + + mp->m_sb.sb_inodesize + + XFS_FSB_TO_B(mp, 1) + + XFS_DIROP_LOG_RES(mp) + + 1024 + + 128 * (4 + XFS_DIROP_LOG_COUNT(mp))), + (2 * mp->m_sb.sb_sectsize + + XFS_FSB_TO_B(mp, XFS_IALLOC_BLOCKS(mp)) + + XFS_FSB_TO_B(mp, mp->m_in_maxlevels) + + XFS_ALLOCFREE_LOG_RES(mp, 1) + + 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels + + XFS_ALLOCFREE_LOG_COUNT(mp, 1)))); 
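	/*
	 * Annotation (not part of this patch): most of the new
	 * xfs_calc_*_reservation() helpers share one shape, so as a rough
	 * illustrative sketch:
	 *
	 *	reservation = XFS_DQUOT_LOGRES(mp)
	 *		    + MAX(modification leg, bmap_finish free leg)
	 *
	 * where each leg sums the on-disk structures it may dirty plus
	 * 128 bytes of transaction overhead per region logged, as described
	 * in the block comment at the top of this group of functions.
	 */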
} +/* + * For create we can modify: + * the parent directory inode: inode size + * the new inode: inode size + * the inode btree entry: block size + * the superblock for the nlink flag: sector size + * the directory btree: (max depth + v2) * dir block size + * the directory inode's bmap btree: (max depth + v2) * block size + * Or in the first xact we allocate some inodes giving: + * the agi and agf of the ag getting the new inodes: 2 * sectorsize + * the superblock for the nlink flag: sector size + * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize + * the inode btree: max depth * blocksize + * the allocation btrees: 2 trees * (max depth - 1) * block size + */ STATIC uint -xfs_calc_create_reservation(xfs_mount_t *mp) +xfs_calc_create_reservation( + struct xfs_mount *mp) { - return XFS_CALC_CREATE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); + return XFS_DQUOT_LOGRES(mp) + + MAX((mp->m_sb.sb_inodesize + + mp->m_sb.sb_inodesize + + mp->m_sb.sb_sectsize + + XFS_FSB_TO_B(mp, 1) + + XFS_DIROP_LOG_RES(mp) + + 128 * (3 + XFS_DIROP_LOG_COUNT(mp))), + (3 * mp->m_sb.sb_sectsize + + XFS_FSB_TO_B(mp, XFS_IALLOC_BLOCKS(mp)) + + XFS_FSB_TO_B(mp, mp->m_in_maxlevels) + + XFS_ALLOCFREE_LOG_RES(mp, 1) + + 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels + + XFS_ALLOCFREE_LOG_COUNT(mp, 1)))); } +/* + * Making a new directory is the same as creating a new file. + */ STATIC uint -xfs_calc_mkdir_reservation(xfs_mount_t *mp) +xfs_calc_mkdir_reservation( + struct xfs_mount *mp) { - return XFS_CALC_MKDIR_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); + return xfs_calc_create_reservation(mp); } +/* + * In freeing an inode we can modify: + * the inode being freed: inode size + * the super block free inode counter: sector size + * the agi hash list and counters: sector size + * the inode btree entry: block size + * the on disk inode before ours in the agi hash list: inode cluster size + * the inode btree: max depth * blocksize + * the allocation btrees: 2 trees * (max depth - 1) * block size + */ STATIC uint -xfs_calc_ifree_reservation(xfs_mount_t *mp) +xfs_calc_ifree_reservation( + struct xfs_mount *mp) { - return XFS_CALC_IFREE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); + return XFS_DQUOT_LOGRES(mp) + + mp->m_sb.sb_inodesize + + mp->m_sb.sb_sectsize + + mp->m_sb.sb_sectsize + + XFS_FSB_TO_B(mp, 1) + + MAX((__uint16_t)XFS_FSB_TO_B(mp, 1), + XFS_INODE_CLUSTER_SIZE(mp)) + + 128 * 5 + + XFS_ALLOCFREE_LOG_RES(mp, 1) + + 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels + + XFS_ALLOCFREE_LOG_COUNT(mp, 1)); } +/* + * When only changing the inode we log the inode and possibly the superblock + * We also add a bit of slop for the transaction stuff. + */ STATIC uint -xfs_calc_ichange_reservation(xfs_mount_t *mp) +xfs_calc_ichange_reservation( + struct xfs_mount *mp) { - return XFS_CALC_ICHANGE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); + return XFS_DQUOT_LOGRES(mp) + + mp->m_sb.sb_inodesize + + mp->m_sb.sb_sectsize + + 512; + } +/* + * Growing the data section of the filesystem. + * superblock + * agi and agf + * allocation btrees + */ STATIC uint -xfs_calc_growdata_reservation(xfs_mount_t *mp) +xfs_calc_growdata_reservation( + struct xfs_mount *mp) { - return XFS_CALC_GROWDATA_LOG_RES(mp); + return mp->m_sb.sb_sectsize * 3 + + XFS_ALLOCFREE_LOG_RES(mp, 1) + + 128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1)); } +/* + * Growing the rt section of the filesystem. + * In the first set of transactions (ALLOC) we allocate space to the + * bitmap or summary files. 
+ * superblock: sector size + * agf of the ag from which the extent is allocated: sector size + * bmap btree for bitmap/summary inode: max depth * blocksize + * bitmap/summary inode: inode size + * allocation btrees for 1 block alloc: 2 * (2 * maxdepth - 1) * blocksize + */ STATIC uint -xfs_calc_growrtalloc_reservation(xfs_mount_t *mp) +xfs_calc_growrtalloc_reservation( + struct xfs_mount *mp) { - return XFS_CALC_GROWRTALLOC_LOG_RES(mp); + return 2 * mp->m_sb.sb_sectsize + + XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) + + mp->m_sb.sb_inodesize + + XFS_ALLOCFREE_LOG_RES(mp, 1) + + 128 * (3 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + + XFS_ALLOCFREE_LOG_COUNT(mp, 1)); } +/* + * Growing the rt section of the filesystem. + * In the second set of transactions (ZERO) we zero the new metadata blocks. + * one bitmap/summary block: blocksize + */ STATIC uint -xfs_calc_growrtzero_reservation(xfs_mount_t *mp) +xfs_calc_growrtzero_reservation( + struct xfs_mount *mp) { - return XFS_CALC_GROWRTZERO_LOG_RES(mp); + return mp->m_sb.sb_blocksize + 128; } +/* + * Growing the rt section of the filesystem. + * In the third set of transactions (FREE) we update metadata without + * allocating any new blocks. + * superblock: sector size + * bitmap inode: inode size + * summary inode: inode size + * one bitmap block: blocksize + * summary blocks: new summary size + */ STATIC uint -xfs_calc_growrtfree_reservation(xfs_mount_t *mp) +xfs_calc_growrtfree_reservation( + struct xfs_mount *mp) { - return XFS_CALC_GROWRTFREE_LOG_RES(mp); + return mp->m_sb.sb_sectsize + + 2 * mp->m_sb.sb_inodesize + + mp->m_sb.sb_blocksize + + mp->m_rsumsize + + 128 * 5; } +/* + * Logging the inode modification timestamp on a synchronous write. + * inode + */ STATIC uint -xfs_calc_swrite_reservation(xfs_mount_t *mp) +xfs_calc_swrite_reservation( + struct xfs_mount *mp) { - return XFS_CALC_SWRITE_LOG_RES(mp); + return mp->m_sb.sb_inodesize + 128; } +/* + * Logging the inode mode bits when writing a setuid/setgid file + * inode + */ STATIC uint xfs_calc_writeid_reservation(xfs_mount_t *mp) { - return XFS_CALC_WRITEID_LOG_RES(mp); + return mp->m_sb.sb_inodesize + 128; } +/* + * Converting the inode from non-attributed to attributed. 
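Each of these per-transaction values is cached in mp->m_reservations by xfs_trans_init() further down in this patch, and transaction setup code reads them back through the XFS_*_LOG_RES() wrappers kept in xfs_trans.h. Roughly the pattern a caller such as xfs_link() uses when reserving log space is shown below; this is only a fragment under assumed kernel context (tp, mp, resblks, error and the error_return label in scope), not a complete function.

	tp = xfs_trans_alloc(mp, XFS_TRANS_LINK);
	error = xfs_trans_reserve(tp, resblks, XFS_LINK_LOG_RES(mp), 0,
				  XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
	if (error)
		goto error_return;	/* caller cancels the transaction */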
+ * the inode being converted: inode size + * agf block and superblock (for block allocation) + * the new block (directory sized) + * bmap blocks for the new directory block + * allocation btrees + */ STATIC uint -xfs_calc_addafork_reservation(xfs_mount_t *mp) +xfs_calc_addafork_reservation( + struct xfs_mount *mp) { - return XFS_CALC_ADDAFORK_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); + return XFS_DQUOT_LOGRES(mp) + + mp->m_sb.sb_inodesize + + mp->m_sb.sb_sectsize * 2 + + mp->m_dirblksize + + XFS_FSB_TO_B(mp, XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1) + + XFS_ALLOCFREE_LOG_RES(mp, 1) + + 128 * (4 + XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1 + + XFS_ALLOCFREE_LOG_COUNT(mp, 1)); } +/* + * Removing the attribute fork of a file + * the inode being truncated: inode size + * the inode's bmap btree: max depth * block size + * And the bmap_finish transaction can free the blocks and bmap blocks: + * the agf for each of the ags: 4 * sector size + * the agfl for each of the ags: 4 * sector size + * the super block to reflect the freed blocks: sector size + * worst case split in allocation btrees per extent assuming 4 extents: + * 4 exts * 2 trees * (2 * max depth - 1) * block size + */ STATIC uint -xfs_calc_attrinval_reservation(xfs_mount_t *mp) +xfs_calc_attrinval_reservation( + struct xfs_mount *mp) { - return XFS_CALC_ATTRINVAL_LOG_RES(mp); + return MAX((mp->m_sb.sb_inodesize + + XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + + 128 * (1 + XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))), + (4 * mp->m_sb.sb_sectsize + + 4 * mp->m_sb.sb_sectsize + + mp->m_sb.sb_sectsize + + XFS_ALLOCFREE_LOG_RES(mp, 4) + + 128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4)))); } +/* + * Setting an attribute. + * the inode getting the attribute + * the superblock for allocations + * the agfs extents are allocated from + * the attribute btree * max depth + * the inode allocation btree + * Since attribute transaction space is dependent on the size of the attribute, + * the calculation is done partially at mount time and partially at runtime. + */ STATIC uint -xfs_calc_attrset_reservation(xfs_mount_t *mp) +xfs_calc_attrset_reservation( + struct xfs_mount *mp) { - return XFS_CALC_ATTRSET_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); + return XFS_DQUOT_LOGRES(mp) + + mp->m_sb.sb_inodesize + + mp->m_sb.sb_sectsize + + XFS_FSB_TO_B(mp, XFS_DA_NODE_MAXDEPTH) + + 128 * (2 + XFS_DA_NODE_MAXDEPTH); } +/* + * Removing an attribute. + * the inode: inode size + * the attribute btree could join: max depth * block size + * the inode bmap btree could join or split: max depth * block size + * And the bmap_finish transaction can free the attr blocks freed giving: + * the agf for the ag in which the blocks live: 2 * sector size + * the agfl for the ag in which the blocks live: 2 * sector size + * the superblock for the free block count: sector size + * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size + */ STATIC uint -xfs_calc_attrrm_reservation(xfs_mount_t *mp) +xfs_calc_attrrm_reservation( + struct xfs_mount *mp) { - return XFS_CALC_ATTRRM_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); + return XFS_DQUOT_LOGRES(mp) + + MAX((mp->m_sb.sb_inodesize + + XFS_FSB_TO_B(mp, XFS_DA_NODE_MAXDEPTH) + + XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + + 128 * (1 + XFS_DA_NODE_MAXDEPTH + + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK))), + (2 * mp->m_sb.sb_sectsize + + 2 * mp->m_sb.sb_sectsize + + mp->m_sb.sb_sectsize + + XFS_ALLOCFREE_LOG_RES(mp, 2) + + 128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2)))); } +/* + * Clearing a bad agino number in an agi hash bucket. 
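The attr set case is the only one split between mount time and runtime: xfs_calc_attrset_reservation() covers the fixed part, and XFS_ATTRSET_LOG_RES(mp, ext), which stays in xfs_trans.h, adds the extent-dependent terms when the transaction is set up. A small standalone sketch of how the two parts combine follows; the geometry, btree depth and the cached tr_attrset value are hypothetical.

/*
 * Worked example of the mount-time plus runtime split for attr set
 * reservations.  tr_attrset stands for the value cached by xfs_trans_init();
 * all numbers are illustrative.
 */
#include <stdio.h>

#define SECTSIZE	512U		/* hypothetical sb_sectsize */
#define BLOCKSIZE	4096U		/* hypothetical block size */
#define ATTR_BM_LEVELS	2U		/* stand-in for XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK) */

int main(void)
{
	unsigned int tr_attrset = 10240U;	/* pretend mount-time component */
	unsigned int ext = 3U;			/* extents the attribute needs */
	unsigned int res;

	/* runtime terms mirror XFS_ATTRSET_LOG_RES(mp, ext) */
	res = tr_attrset +
	      ext * SECTSIZE +
	      ext * ATTR_BM_LEVELS * BLOCKSIZE +
	      128U * (ext + ext * ATTR_BM_LEVELS);

	printf("attr set reservation for %u extents: %u bytes\n", ext, res);
	return 0;
}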
+ */ STATIC uint -xfs_calc_clear_agi_bucket_reservation(xfs_mount_t *mp) +xfs_calc_clear_agi_bucket_reservation( + struct xfs_mount *mp) { - return XFS_CALC_CLEAR_AGI_BUCKET_LOG_RES(mp); + return mp->m_sb.sb_sectsize + 128; } /* @@ -184,11 +537,10 @@ xfs_calc_clear_agi_bucket_reservation(xfs_mount_t *mp) */ void xfs_trans_init( - xfs_mount_t *mp) + struct xfs_mount *mp) { - xfs_trans_reservations_t *resp; + struct xfs_trans_reservations *resp = &mp->m_reservations; - resp = &(mp->m_reservations); resp->tr_write = xfs_calc_write_reservation(mp); resp->tr_itruncate = xfs_calc_itruncate_reservation(mp); resp->tr_rename = xfs_calc_rename_reservation(mp); @@ -243,8 +595,7 @@ _xfs_trans_alloc( tp->t_magic = XFS_TRANS_MAGIC; tp->t_type = type; tp->t_mountp = mp; - tp->t_items_free = XFS_LIC_NUM_SLOTS; - xfs_lic_init(&(tp->t_items)); + INIT_LIST_HEAD(&tp->t_items); INIT_LIST_HEAD(&tp->t_busy); return tp; } @@ -289,8 +640,7 @@ xfs_trans_dup( ntp->t_magic = XFS_TRANS_MAGIC; ntp->t_type = tp->t_type; ntp->t_mountp = tp->t_mountp; - ntp->t_items_free = XFS_LIC_NUM_SLOTS; - xfs_lic_init(&(ntp->t_items)); + INIT_LIST_HEAD(&ntp->t_items); INIT_LIST_HEAD(&ntp->t_busy); ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); @@ -770,6 +1120,108 @@ xfs_trans_unreserve_and_mod_sb( } /* + * Add the given log item to the transaction's list of log items. + * + * The log item will now point to its new descriptor with its li_desc field. + */ +void +xfs_trans_add_item( + struct xfs_trans *tp, + struct xfs_log_item *lip) +{ + struct xfs_log_item_desc *lidp; + + ASSERT(lip->li_mountp = tp->t_mountp); + ASSERT(lip->li_ailp = tp->t_mountp->m_ail); + + lidp = kmem_zone_zalloc(xfs_log_item_desc_zone, KM_SLEEP | KM_NOFS); + + lidp->lid_item = lip; + lidp->lid_flags = 0; + lidp->lid_size = 0; + list_add_tail(&lidp->lid_trans, &tp->t_items); + + lip->li_desc = lidp; +} + +STATIC void +xfs_trans_free_item_desc( + struct xfs_log_item_desc *lidp) +{ + list_del_init(&lidp->lid_trans); + kmem_zone_free(xfs_log_item_desc_zone, lidp); +} + +/* + * Unlink and free the given descriptor. + */ +void +xfs_trans_del_item( + struct xfs_log_item *lip) +{ + xfs_trans_free_item_desc(lip->li_desc); + lip->li_desc = NULL; +} + +/* + * Unlock all of the items of a transaction and free all the descriptors + * of that transaction. + */ +STATIC void +xfs_trans_free_items( + struct xfs_trans *tp, + xfs_lsn_t commit_lsn, + int flags) +{ + struct xfs_log_item_desc *lidp, *next; + + list_for_each_entry_safe(lidp, next, &tp->t_items, lid_trans) { + struct xfs_log_item *lip = lidp->lid_item; + + lip->li_desc = NULL; + + if (commit_lsn != NULLCOMMITLSN) + IOP_COMMITTING(lip, commit_lsn); + if (flags & XFS_TRANS_ABORT) + lip->li_flags |= XFS_LI_ABORTED; + IOP_UNLOCK(lip); + + xfs_trans_free_item_desc(lidp); + } +} + +/* + * Unlock the items associated with a transaction. + * + * Items which were not logged should be freed. Those which were logged must + * still be tracked so they can be unpinned when the transaction commits. + */ +STATIC void +xfs_trans_unlock_items( + struct xfs_trans *tp, + xfs_lsn_t commit_lsn) +{ + struct xfs_log_item_desc *lidp, *next; + + list_for_each_entry_safe(lidp, next, &tp->t_items, lid_trans) { + struct xfs_log_item *lip = lidp->lid_item; + + lip->li_desc = NULL; + + if (commit_lsn != NULLCOMMITLSN) + IOP_COMMITTING(lip, commit_lsn); + IOP_UNLOCK(lip); + + /* + * Free the descriptor if the item is not dirty + * within this transaction. 
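xfs_trans_add_item() and xfs_trans_del_item() above replace the old chunk-and-bitmask slot allocator with one zone-allocated descriptor per log item, linked into tp->t_items. A userspace toy of that lifecycle follows; list_head is re-implemented and the xfs types are stubbed so the sketch compiles on its own, so the names here are illustrative rather than the kernel's.

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

struct list_head { struct list_head *prev, *next; };

static void list_init(struct list_head *h) { h->prev = h->next = h; }

static void list_add_tail(struct list_head *n, struct list_head *h)
{
	n->prev = h->prev; n->next = h;
	h->prev->next = n; h->prev = n;
}

static void list_del(struct list_head *n)
{
	n->prev->next = n->next;
	n->next->prev = n->prev;
}

#define LID_DIRTY	0x1

struct log_item;

struct item_desc {
	struct log_item		*lid_item;
	unsigned char		lid_flags;
	struct list_head	lid_trans;	/* links into trans->t_items */
};

struct log_item { const char *name; struct item_desc *li_desc; };

struct trans { struct list_head t_items; };

static void trans_add_item(struct trans *tp, struct log_item *lip)
{
	struct item_desc *lidp = calloc(1, sizeof(*lidp));

	lidp->lid_item = lip;
	list_add_tail(&lidp->lid_trans, &tp->t_items);
	lip->li_desc = lidp;			/* item points back at its descriptor */
}

static void trans_del_item(struct log_item *lip)
{
	list_del(&lip->li_desc->lid_trans);
	free(lip->li_desc);
	lip->li_desc = NULL;
}

int main(void)
{
	struct trans tp;
	struct log_item ino = { "inode", NULL }, buf = { "buffer", NULL };
	struct list_head *pos;

	list_init(&tp.t_items);
	trans_add_item(&tp, &ino);
	trans_add_item(&tp, &buf);
	ino.li_desc->lid_flags |= LID_DIRTY;	/* what xfs_trans_log_inode() now does */

	for (pos = tp.t_items.next; pos != &tp.t_items; pos = pos->next) {
		struct item_desc *lidp = (struct item_desc *)
			((char *)pos - offsetof(struct item_desc, lid_trans));

		printf("%s: dirty=%d\n", lidp->lid_item->name,
		       !!(lidp->lid_flags & LID_DIRTY));
	}

	trans_del_item(&buf);
	trans_del_item(&ino);
	return 0;
}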
+ */ + if (!(lidp->lid_flags & XFS_LID_DIRTY)) + xfs_trans_free_item_desc(lidp); + } +} + +/* * Total up the number of log iovecs needed to commit this * transaction. The transaction itself needs one for the * transaction header. Ask each dirty item in turn how many @@ -780,30 +1232,27 @@ xfs_trans_count_vecs( struct xfs_trans *tp) { int nvecs; - xfs_log_item_desc_t *lidp; + struct xfs_log_item_desc *lidp; nvecs = 1; - lidp = xfs_trans_first_item(tp); - ASSERT(lidp != NULL); /* In the non-debug case we need to start bailing out if we * didn't find a log_item here, return zero and let trans_commit * deal with it. */ - if (lidp == NULL) + if (list_empty(&tp->t_items)) { + ASSERT(0); return 0; + } - while (lidp != NULL) { + list_for_each_entry(lidp, &tp->t_items, lid_trans) { /* * Skip items which aren't dirty in this transaction. */ - if (!(lidp->lid_flags & XFS_LID_DIRTY)) { - lidp = xfs_trans_next_item(tp, lidp); + if (!(lidp->lid_flags & XFS_LID_DIRTY)) continue; - } lidp->lid_size = IOP_SIZE(lidp->lid_item); nvecs += lidp->lid_size; - lidp = xfs_trans_next_item(tp, lidp); } return nvecs; @@ -823,7 +1272,7 @@ xfs_trans_fill_vecs( struct xfs_trans *tp, struct xfs_log_iovec *log_vector) { - xfs_log_item_desc_t *lidp; + struct xfs_log_item_desc *lidp; struct xfs_log_iovec *vecp; uint nitems; @@ -834,14 +1283,11 @@ xfs_trans_fill_vecs( vecp = log_vector + 1; nitems = 0; - lidp = xfs_trans_first_item(tp); - ASSERT(lidp); - while (lidp) { + ASSERT(!list_empty(&tp->t_items)); + list_for_each_entry(lidp, &tp->t_items, lid_trans) { /* Skip items which aren't dirty in this transaction. */ - if (!(lidp->lid_flags & XFS_LID_DIRTY)) { - lidp = xfs_trans_next_item(tp, lidp); + if (!(lidp->lid_flags & XFS_LID_DIRTY)) continue; - } /* * The item may be marked dirty but not log anything. This can @@ -852,7 +1298,6 @@ xfs_trans_fill_vecs( IOP_FORMAT(lidp->lid_item, vecp); vecp += lidp->lid_size; IOP_PIN(lidp->lid_item); - lidp = xfs_trans_next_item(tp, lidp); } /* @@ -930,7 +1375,7 @@ xfs_trans_item_committed( * log item flags, if anyone else stales the buffer we do not want to * pay any attention to it. */ - IOP_UNPIN(lip); + IOP_UNPIN(lip, 0); } /* @@ -947,24 +1392,15 @@ xfs_trans_committed( struct xfs_trans *tp, int abortflag) { - xfs_log_item_desc_t *lidp; - xfs_log_item_chunk_t *licp; - xfs_log_item_chunk_t *next_licp; + struct xfs_log_item_desc *lidp, *next; /* Call the transaction's completion callback if there is one. */ if (tp->t_callback != NULL) tp->t_callback(tp, tp->t_callarg); - for (lidp = xfs_trans_first_item(tp); - lidp != NULL; - lidp = xfs_trans_next_item(tp, lidp)) { + list_for_each_entry_safe(lidp, next, &tp->t_items, lid_trans) { xfs_trans_item_committed(lidp->lid_item, tp->t_lsn, abortflag); - } - - /* free the item chunks, ignoring the embedded chunk */ - for (licp = tp->t_items.lic_next; licp != NULL; licp = next_licp) { - next_licp = licp->lic_next; - kmem_free(licp); + xfs_trans_free_item_desc(lidp); } xfs_trans_free(tp); @@ -979,16 +1415,14 @@ xfs_trans_uncommit( struct xfs_trans *tp, uint flags) { - xfs_log_item_desc_t *lidp; + struct xfs_log_item_desc *lidp; - for (lidp = xfs_trans_first_item(tp); - lidp != NULL; - lidp = xfs_trans_next_item(tp, lidp)) { + list_for_each_entry(lidp, &tp->t_items, lid_trans) { /* * Unpin all but those that aren't dirty. 
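Note that xfs_trans_free_items(), xfs_trans_unlock_items() and xfs_trans_committed() walk t_items with list_for_each_entry_safe() because the loop body may free the descriptor it is standing on, while the read-only walks (xfs_trans_count_vecs(), xfs_trans_fill_vecs()) use the plain iterator. A minimal illustration of why the safe form is needed, as a toy singly linked list with nothing XFS-specific in it:

#include <stdlib.h>

struct desc { struct desc *next; };

/*
 * Free every node: the next pointer must be sampled before free(), which is
 * exactly what the extra cursor of list_for_each_entry_safe() provides.
 */
static void free_all(struct desc *head)
{
	struct desc *d = head;

	while (d) {
		struct desc *n = d->next;

		free(d);
		d = n;
	}
}

int main(void)
{
	struct desc *b = calloc(1, sizeof(*b));
	struct desc *a = calloc(1, sizeof(*a));

	a->next = b;
	free_all(a);
	return 0;
}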
*/ if (lidp->lid_flags & XFS_LID_DIRTY) - IOP_UNPIN_REMOVE(lidp->lid_item, tp); + IOP_UNPIN(lidp->lid_item, 1); } xfs_trans_unreserve_and_mod_sb(tp); @@ -1154,33 +1588,28 @@ STATIC struct xfs_log_vec * xfs_trans_alloc_log_vecs( xfs_trans_t *tp) { - xfs_log_item_desc_t *lidp; + struct xfs_log_item_desc *lidp; struct xfs_log_vec *lv = NULL; struct xfs_log_vec *ret_lv = NULL; - lidp = xfs_trans_first_item(tp); /* Bail out if we didn't find a log item. */ - if (!lidp) { + if (list_empty(&tp->t_items)) { ASSERT(0); return NULL; } - while (lidp != NULL) { + list_for_each_entry(lidp, &tp->t_items, lid_trans) { struct xfs_log_vec *new_lv; /* Skip items which aren't dirty in this transaction. */ - if (!(lidp->lid_flags & XFS_LID_DIRTY)) { - lidp = xfs_trans_next_item(tp, lidp); + if (!(lidp->lid_flags & XFS_LID_DIRTY)) continue; - } /* Skip items that do not have any vectors for writing */ lidp->lid_size = IOP_SIZE(lidp->lid_item); - if (!lidp->lid_size) { - lidp = xfs_trans_next_item(tp, lidp); + if (!lidp->lid_size) continue; - } new_lv = kmem_zalloc(sizeof(*new_lv) + lidp->lid_size * sizeof(struct xfs_log_iovec), @@ -1195,7 +1624,6 @@ xfs_trans_alloc_log_vecs( else lv->lv_next = new_lv; lv = new_lv; - lidp = xfs_trans_next_item(tp, lidp); } return ret_lv; @@ -1354,12 +1782,6 @@ xfs_trans_cancel( int flags) { int log_flags; -#ifdef DEBUG - xfs_log_item_chunk_t *licp; - xfs_log_item_desc_t *lidp; - xfs_log_item_t *lip; - int i; -#endif xfs_mount_t *mp = tp->t_mountp; /* @@ -1378,21 +1800,11 @@ xfs_trans_cancel( xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); } #ifdef DEBUG - if (!(flags & XFS_TRANS_ABORT)) { - licp = &(tp->t_items); - while (licp != NULL) { - lidp = licp->lic_descs; - for (i = 0; i < licp->lic_unused; i++, lidp++) { - if (xfs_lic_isfree(licp, i)) { - continue; - } - - lip = lidp->lid_item; - if (!XFS_FORCED_SHUTDOWN(mp)) - ASSERT(!(lip->li_type == XFS_LI_EFD)); - } - licp = licp->lic_next; - } + if (!(flags & XFS_TRANS_ABORT) && !XFS_FORCED_SHUTDOWN(mp)) { + struct xfs_log_item_desc *lidp; + + list_for_each_entry(lidp, &tp->t_items, lid_trans) + ASSERT(!(lidp->lid_item->li_type == XFS_LI_EFD)); } #endif xfs_trans_unreserve_and_mod_sb(tp); @@ -1480,7 +1892,6 @@ xfs_trans_roll( if (error) return error; - xfs_trans_ijoin(trans, dp, XFS_ILOCK_EXCL); - xfs_trans_ihold(trans, dp); + xfs_trans_ijoin(trans, dp); return 0; } diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 8c69e7824f68..c13c0f97b494 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -161,105 +161,14 @@ typedef struct xfs_trans_header { * the amount of space needed to log the item it describes * once we get to commit processing (see xfs_trans_commit()). */ -typedef struct xfs_log_item_desc { +struct xfs_log_item_desc { struct xfs_log_item *lid_item; - ushort lid_size; - unsigned char lid_flags; - unsigned char lid_index; -} xfs_log_item_desc_t; + ushort lid_size; + unsigned char lid_flags; + struct list_head lid_trans; +}; #define XFS_LID_DIRTY 0x1 -#define XFS_LID_PINNED 0x2 - -/* - * This structure is used to maintain a chunk list of log_item_desc - * structures. The free field is a bitmask indicating which descriptors - * in this chunk's array are free. The unused field is the first value - * not used since this chunk was allocated. 
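With per-transaction descriptor tracking gone from the unpin path, the separate iop_unpin_remove() callback collapses into iop_unpin() taking a remove flag, which the xfs_trans_uncommit() hunk above passes as 1 for dirty items. A toy ops table sketching the shape of the merged callback; the types and names are stubs, not the kernel definitions.

#include <stdio.h>

struct log_item { const char *name; };

struct item_ops {
	void (*iop_unpin)(struct log_item *lip, int remove);
};

static void buf_item_unpin(struct log_item *lip, int remove)
{
	printf("%s: unpin%s\n", lip->name,
	       remove ? ", and drop it from the aborted transaction" : "");
}

static const struct item_ops ops = { .iop_unpin = buf_item_unpin };

#define IOP_UNPIN(ip, remove)	(ops.iop_unpin)((ip), (remove))

int main(void)
{
	struct log_item item = { "buffer" };

	IOP_UNPIN(&item, 0);	/* normal unpin after commit */
	IOP_UNPIN(&item, 1);	/* unpin plus remove, e.g. xfs_trans_uncommit() */
	return 0;
}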
- */ -#define XFS_LIC_NUM_SLOTS 15 -typedef struct xfs_log_item_chunk { - struct xfs_log_item_chunk *lic_next; - ushort lic_free; - ushort lic_unused; - xfs_log_item_desc_t lic_descs[XFS_LIC_NUM_SLOTS]; -} xfs_log_item_chunk_t; - -#define XFS_LIC_MAX_SLOT (XFS_LIC_NUM_SLOTS - 1) -#define XFS_LIC_FREEMASK ((1 << XFS_LIC_NUM_SLOTS) - 1) - - -/* - * Initialize the given chunk. Set the chunk's free descriptor mask - * to indicate that all descriptors are free. The caller gets to set - * lic_unused to the right value (0 matches all free). The - * lic_descs.lid_index values are set up as each desc is allocated. - */ -static inline void xfs_lic_init(xfs_log_item_chunk_t *cp) -{ - cp->lic_free = XFS_LIC_FREEMASK; -} - -static inline void xfs_lic_init_slot(xfs_log_item_chunk_t *cp, int slot) -{ - cp->lic_descs[slot].lid_index = (unsigned char)(slot); -} - -static inline int xfs_lic_vacancy(xfs_log_item_chunk_t *cp) -{ - return cp->lic_free & XFS_LIC_FREEMASK; -} - -static inline void xfs_lic_all_free(xfs_log_item_chunk_t *cp) -{ - cp->lic_free = XFS_LIC_FREEMASK; -} - -static inline int xfs_lic_are_all_free(xfs_log_item_chunk_t *cp) -{ - return ((cp->lic_free & XFS_LIC_FREEMASK) == XFS_LIC_FREEMASK); -} - -static inline int xfs_lic_isfree(xfs_log_item_chunk_t *cp, int slot) -{ - return (cp->lic_free & (1 << slot)); -} - -static inline void xfs_lic_claim(xfs_log_item_chunk_t *cp, int slot) -{ - cp->lic_free &= ~(1 << slot); -} - -static inline void xfs_lic_relse(xfs_log_item_chunk_t *cp, int slot) -{ - cp->lic_free |= 1 << slot; -} - -static inline xfs_log_item_desc_t * -xfs_lic_slot(xfs_log_item_chunk_t *cp, int slot) -{ - return &(cp->lic_descs[slot]); -} - -static inline int xfs_lic_desc_to_slot(xfs_log_item_desc_t *dp) -{ - return (uint)dp->lid_index; -} - -/* - * Calculate the address of a chunk given a descriptor pointer: - * dp - dp->lid_index give the address of the start of the lic_descs array. - * From this we subtract the offset of the lic_descs field in a chunk. - * All of this yields the address of the chunk, which is - * cast to a chunk pointer. - */ -static inline xfs_log_item_chunk_t * -xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp) -{ - return (xfs_log_item_chunk_t*) \ - (((xfs_caddr_t)((dp) - (dp)->lid_index)) - \ - (xfs_caddr_t)(((xfs_log_item_chunk_t*)0)->lic_descs)); -} #define XFS_TRANS_MAGIC 0x5452414E /* 'TRAN' */ /* @@ -275,8 +184,6 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp) /* * Values for call flags parameter. */ -#define XFS_TRANS_NOSLEEP 0x1 -#define XFS_TRANS_WAIT 0x2 #define XFS_TRANS_RELEASE_LOG_RES 0x4 #define XFS_TRANS_ABORT 0x8 @@ -300,24 +207,6 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp) /* - * Various log reservation values. - * These are based on the size of the file system block - * because that is what most transactions manipulate. - * Each adds in an additional 128 bytes per item logged to - * try to account for the overhead of the transaction mechanism. - * - * Note: - * Most of the reservations underestimate the number of allocation - * groups into which they could free extents in the xfs_bmap_finish() - * call. This is because the number in the worst case is quite high - * and quite unusual. In order to fix this we need to change - * xfs_bmap_finish() to free extents in only a single AG at a time. - * This will require changes to the EFI code as well, however, so that - * the EFI for the extents not freed is logged again in each transaction. - * See bug 261917. 
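For contrast, the helpers being deleted above manage a fixed 15-slot chunk through a free bitmask: claiming a slot clears its bit, releasing sets it, and a chunk whose mask equals XFS_LIC_FREEMASK is entirely free. A compact userspace rendering of that bookkeeping, shown only to make clear what the list conversion removes:

#include <assert.h>

#define NUM_SLOTS	15
#define FREEMASK	((1 << NUM_SLOTS) - 1)

struct chunk { unsigned short free; };

static void chunk_init(struct chunk *cp)		{ cp->free = FREEMASK; }
static int  slot_is_free(struct chunk *cp, int i)	{ return cp->free & (1 << i); }
static void slot_claim(struct chunk *cp, int i)		{ cp->free &= ~(1 << i); }
static void slot_release(struct chunk *cp, int i)	{ cp->free |= 1 << i; }
static int  all_free(struct chunk *cp)			{ return (cp->free & FREEMASK) == FREEMASK; }

int main(void)
{
	struct chunk c;

	chunk_init(&c);
	slot_claim(&c, 3);
	assert(!slot_is_free(&c, 3));
	assert(!all_free(&c));
	slot_release(&c, 3);
	assert(all_free(&c));
	return 0;
}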
- */ - -/* * Per-extent log reservation for the allocation btree changes * involved in freeing or allocating an extent. * 2 trees * (2 blocks/level * max depth - 1) * block size @@ -341,429 +230,36 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp) (XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK) + \ XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1) -/* - * In a write transaction we can allocate a maximum of 2 - * extents. This gives: - * the inode getting the new extents: inode size - * the inode's bmap btree: max depth * block size - * the agfs of the ags from which the extents are allocated: 2 * sector - * the superblock free block counter: sector size - * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size - * And the bmap_finish transaction can free bmap blocks in a join: - * the agfs of the ags containing the blocks: 2 * sector size - * the agfls of the ags containing the blocks: 2 * sector size - * the super block free block counter: sector size - * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size - */ -#define XFS_CALC_WRITE_LOG_RES(mp) \ - (MAX( \ - ((mp)->m_sb.sb_inodesize + \ - XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) + \ - (2 * (mp)->m_sb.sb_sectsize) + \ - (mp)->m_sb.sb_sectsize + \ - XFS_ALLOCFREE_LOG_RES(mp, 2) + \ - (128 * (4 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + XFS_ALLOCFREE_LOG_COUNT(mp, 2)))),\ - ((2 * (mp)->m_sb.sb_sectsize) + \ - (2 * (mp)->m_sb.sb_sectsize) + \ - (mp)->m_sb.sb_sectsize + \ - XFS_ALLOCFREE_LOG_RES(mp, 2) + \ - (128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2)))))) #define XFS_WRITE_LOG_RES(mp) ((mp)->m_reservations.tr_write) - -/* - * In truncating a file we free up to two extents at once. We can modify: - * the inode being truncated: inode size - * the inode's bmap btree: (max depth + 1) * block size - * And the bmap_finish transaction can free the blocks and bmap blocks: - * the agf for each of the ags: 4 * sector size - * the agfl for each of the ags: 4 * sector size - * the super block to reflect the freed blocks: sector size - * worst case split in allocation btrees per extent assuming 4 extents: - * 4 exts * 2 trees * (2 * max depth - 1) * block size - * the inode btree: max depth * blocksize - * the allocation btrees: 2 trees * (max depth - 1) * block size - */ -#define XFS_CALC_ITRUNCATE_LOG_RES(mp) \ - (MAX( \ - ((mp)->m_sb.sb_inodesize + \ - XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1) + \ - (128 * (2 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)))), \ - ((4 * (mp)->m_sb.sb_sectsize) + \ - (4 * (mp)->m_sb.sb_sectsize) + \ - (mp)->m_sb.sb_sectsize + \ - XFS_ALLOCFREE_LOG_RES(mp, 4) + \ - (128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4))) + \ - (128 * 5) + \ - XFS_ALLOCFREE_LOG_RES(mp, 1) + \ - (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \ - XFS_ALLOCFREE_LOG_COUNT(mp, 1)))))) - #define XFS_ITRUNCATE_LOG_RES(mp) ((mp)->m_reservations.tr_itruncate) - -/* - * In renaming a files we can modify: - * the four inodes involved: 4 * inode size - * the two directory btrees: 2 * (max depth + v2) * dir block size - * the two directory bmap btrees: 2 * max depth * block size - * And the bmap_finish transaction can free dir and bmap blocks (two sets - * of bmap blocks) giving: - * the agf for the ags in which the blocks live: 3 * sector size - * the agfl for the ags in which the blocks live: 3 * sector size - * the superblock for the free block count: sector size - * the allocation btrees: 3 exts * 2 trees * (2 * max depth - 1) * block size - */ -#define XFS_CALC_RENAME_LOG_RES(mp) \ - (MAX( \ - ((4 * 
(mp)->m_sb.sb_inodesize) + \ - (2 * XFS_DIROP_LOG_RES(mp)) + \ - (128 * (4 + 2 * XFS_DIROP_LOG_COUNT(mp)))), \ - ((3 * (mp)->m_sb.sb_sectsize) + \ - (3 * (mp)->m_sb.sb_sectsize) + \ - (mp)->m_sb.sb_sectsize + \ - XFS_ALLOCFREE_LOG_RES(mp, 3) + \ - (128 * (7 + XFS_ALLOCFREE_LOG_COUNT(mp, 3)))))) - #define XFS_RENAME_LOG_RES(mp) ((mp)->m_reservations.tr_rename) - -/* - * For creating a link to an inode: - * the parent directory inode: inode size - * the linked inode: inode size - * the directory btree could split: (max depth + v2) * dir block size - * the directory bmap btree could join or split: (max depth + v2) * blocksize - * And the bmap_finish transaction can free some bmap blocks giving: - * the agf for the ag in which the blocks live: sector size - * the agfl for the ag in which the blocks live: sector size - * the superblock for the free block count: sector size - * the allocation btrees: 2 trees * (2 * max depth - 1) * block size - */ -#define XFS_CALC_LINK_LOG_RES(mp) \ - (MAX( \ - ((mp)->m_sb.sb_inodesize + \ - (mp)->m_sb.sb_inodesize + \ - XFS_DIROP_LOG_RES(mp) + \ - (128 * (2 + XFS_DIROP_LOG_COUNT(mp)))), \ - ((mp)->m_sb.sb_sectsize + \ - (mp)->m_sb.sb_sectsize + \ - (mp)->m_sb.sb_sectsize + \ - XFS_ALLOCFREE_LOG_RES(mp, 1) + \ - (128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1)))))) - #define XFS_LINK_LOG_RES(mp) ((mp)->m_reservations.tr_link) - -/* - * For removing a directory entry we can modify: - * the parent directory inode: inode size - * the removed inode: inode size - * the directory btree could join: (max depth + v2) * dir block size - * the directory bmap btree could join or split: (max depth + v2) * blocksize - * And the bmap_finish transaction can free the dir and bmap blocks giving: - * the agf for the ag in which the blocks live: 2 * sector size - * the agfl for the ag in which the blocks live: 2 * sector size - * the superblock for the free block count: sector size - * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size - */ -#define XFS_CALC_REMOVE_LOG_RES(mp) \ - (MAX( \ - ((mp)->m_sb.sb_inodesize + \ - (mp)->m_sb.sb_inodesize + \ - XFS_DIROP_LOG_RES(mp) + \ - (128 * (2 + XFS_DIROP_LOG_COUNT(mp)))), \ - ((2 * (mp)->m_sb.sb_sectsize) + \ - (2 * (mp)->m_sb.sb_sectsize) + \ - (mp)->m_sb.sb_sectsize + \ - XFS_ALLOCFREE_LOG_RES(mp, 2) + \ - (128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2)))))) - #define XFS_REMOVE_LOG_RES(mp) ((mp)->m_reservations.tr_remove) - -/* - * For symlink we can modify: - * the parent directory inode: inode size - * the new inode: inode size - * the inode btree entry: 1 block - * the directory btree: (max depth + v2) * dir block size - * the directory inode's bmap btree: (max depth + v2) * block size - * the blocks for the symlink: 1 kB - * Or in the first xact we allocate some inodes giving: - * the agi and agf of the ag getting the new inodes: 2 * sectorsize - * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize - * the inode btree: max depth * blocksize - * the allocation btrees: 2 trees * (2 * max depth - 1) * block size - */ -#define XFS_CALC_SYMLINK_LOG_RES(mp) \ - (MAX( \ - ((mp)->m_sb.sb_inodesize + \ - (mp)->m_sb.sb_inodesize + \ - XFS_FSB_TO_B(mp, 1) + \ - XFS_DIROP_LOG_RES(mp) + \ - 1024 + \ - (128 * (4 + XFS_DIROP_LOG_COUNT(mp)))), \ - (2 * (mp)->m_sb.sb_sectsize + \ - XFS_FSB_TO_B((mp), XFS_IALLOC_BLOCKS((mp))) + \ - XFS_FSB_TO_B((mp), (mp)->m_in_maxlevels) + \ - XFS_ALLOCFREE_LOG_RES(mp, 1) + \ - (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \ - XFS_ALLOCFREE_LOG_COUNT(mp, 1)))))) - #define 
XFS_SYMLINK_LOG_RES(mp) ((mp)->m_reservations.tr_symlink) - -/* - * For create we can modify: - * the parent directory inode: inode size - * the new inode: inode size - * the inode btree entry: block size - * the superblock for the nlink flag: sector size - * the directory btree: (max depth + v2) * dir block size - * the directory inode's bmap btree: (max depth + v2) * block size - * Or in the first xact we allocate some inodes giving: - * the agi and agf of the ag getting the new inodes: 2 * sectorsize - * the superblock for the nlink flag: sector size - * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize - * the inode btree: max depth * blocksize - * the allocation btrees: 2 trees * (max depth - 1) * block size - */ -#define XFS_CALC_CREATE_LOG_RES(mp) \ - (MAX( \ - ((mp)->m_sb.sb_inodesize + \ - (mp)->m_sb.sb_inodesize + \ - (mp)->m_sb.sb_sectsize + \ - XFS_FSB_TO_B(mp, 1) + \ - XFS_DIROP_LOG_RES(mp) + \ - (128 * (3 + XFS_DIROP_LOG_COUNT(mp)))), \ - (3 * (mp)->m_sb.sb_sectsize + \ - XFS_FSB_TO_B((mp), XFS_IALLOC_BLOCKS((mp))) + \ - XFS_FSB_TO_B((mp), (mp)->m_in_maxlevels) + \ - XFS_ALLOCFREE_LOG_RES(mp, 1) + \ - (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \ - XFS_ALLOCFREE_LOG_COUNT(mp, 1)))))) - #define XFS_CREATE_LOG_RES(mp) ((mp)->m_reservations.tr_create) - -/* - * Making a new directory is the same as creating a new file. - */ -#define XFS_CALC_MKDIR_LOG_RES(mp) XFS_CALC_CREATE_LOG_RES(mp) - #define XFS_MKDIR_LOG_RES(mp) ((mp)->m_reservations.tr_mkdir) - -/* - * In freeing an inode we can modify: - * the inode being freed: inode size - * the super block free inode counter: sector size - * the agi hash list and counters: sector size - * the inode btree entry: block size - * the on disk inode before ours in the agi hash list: inode cluster size - * the inode btree: max depth * blocksize - * the allocation btrees: 2 trees * (max depth - 1) * block size - */ -#define XFS_CALC_IFREE_LOG_RES(mp) \ - ((mp)->m_sb.sb_inodesize + \ - (mp)->m_sb.sb_sectsize + \ - (mp)->m_sb.sb_sectsize + \ - XFS_FSB_TO_B((mp), 1) + \ - MAX((__uint16_t)XFS_FSB_TO_B((mp), 1), XFS_INODE_CLUSTER_SIZE(mp)) + \ - (128 * 5) + \ - XFS_ALLOCFREE_LOG_RES(mp, 1) + \ - (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \ - XFS_ALLOCFREE_LOG_COUNT(mp, 1)))) - - #define XFS_IFREE_LOG_RES(mp) ((mp)->m_reservations.tr_ifree) - -/* - * When only changing the inode we log the inode and possibly the superblock - * We also add a bit of slop for the transaction stuff. - */ -#define XFS_CALC_ICHANGE_LOG_RES(mp) ((mp)->m_sb.sb_inodesize + \ - (mp)->m_sb.sb_sectsize + 512) - #define XFS_ICHANGE_LOG_RES(mp) ((mp)->m_reservations.tr_ichange) - -/* - * Growing the data section of the filesystem. - * superblock - * agi and agf - * allocation btrees - */ -#define XFS_CALC_GROWDATA_LOG_RES(mp) \ - ((mp)->m_sb.sb_sectsize * 3 + \ - XFS_ALLOCFREE_LOG_RES(mp, 1) + \ - (128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1)))) - #define XFS_GROWDATA_LOG_RES(mp) ((mp)->m_reservations.tr_growdata) - -/* - * Growing the rt section of the filesystem. - * In the first set of transactions (ALLOC) we allocate space to the - * bitmap or summary files. 
- * superblock: sector size - * agf of the ag from which the extent is allocated: sector size - * bmap btree for bitmap/summary inode: max depth * blocksize - * bitmap/summary inode: inode size - * allocation btrees for 1 block alloc: 2 * (2 * maxdepth - 1) * blocksize - */ -#define XFS_CALC_GROWRTALLOC_LOG_RES(mp) \ - (2 * (mp)->m_sb.sb_sectsize + \ - XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) + \ - (mp)->m_sb.sb_inodesize + \ - XFS_ALLOCFREE_LOG_RES(mp, 1) + \ - (128 * \ - (3 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + \ - XFS_ALLOCFREE_LOG_COUNT(mp, 1)))) - #define XFS_GROWRTALLOC_LOG_RES(mp) ((mp)->m_reservations.tr_growrtalloc) - -/* - * Growing the rt section of the filesystem. - * In the second set of transactions (ZERO) we zero the new metadata blocks. - * one bitmap/summary block: blocksize - */ -#define XFS_CALC_GROWRTZERO_LOG_RES(mp) \ - ((mp)->m_sb.sb_blocksize + 128) - #define XFS_GROWRTZERO_LOG_RES(mp) ((mp)->m_reservations.tr_growrtzero) - -/* - * Growing the rt section of the filesystem. - * In the third set of transactions (FREE) we update metadata without - * allocating any new blocks. - * superblock: sector size - * bitmap inode: inode size - * summary inode: inode size - * one bitmap block: blocksize - * summary blocks: new summary size - */ -#define XFS_CALC_GROWRTFREE_LOG_RES(mp) \ - ((mp)->m_sb.sb_sectsize + \ - 2 * (mp)->m_sb.sb_inodesize + \ - (mp)->m_sb.sb_blocksize + \ - (mp)->m_rsumsize + \ - (128 * 5)) - #define XFS_GROWRTFREE_LOG_RES(mp) ((mp)->m_reservations.tr_growrtfree) - -/* - * Logging the inode modification timestamp on a synchronous write. - * inode - */ -#define XFS_CALC_SWRITE_LOG_RES(mp) \ - ((mp)->m_sb.sb_inodesize + 128) - #define XFS_SWRITE_LOG_RES(mp) ((mp)->m_reservations.tr_swrite) - /* * Logging the inode timestamps on an fsync -- same as SWRITE * as long as SWRITE logs the entire inode core */ #define XFS_FSYNC_TS_LOG_RES(mp) ((mp)->m_reservations.tr_swrite) - -/* - * Logging the inode mode bits when writing a setuid/setgid file - * inode - */ -#define XFS_CALC_WRITEID_LOG_RES(mp) \ - ((mp)->m_sb.sb_inodesize + 128) - #define XFS_WRITEID_LOG_RES(mp) ((mp)->m_reservations.tr_swrite) - -/* - * Converting the inode from non-attributed to attributed. 
- * the inode being converted: inode size - * agf block and superblock (for block allocation) - * the new block (directory sized) - * bmap blocks for the new directory block - * allocation btrees - */ -#define XFS_CALC_ADDAFORK_LOG_RES(mp) \ - ((mp)->m_sb.sb_inodesize + \ - (mp)->m_sb.sb_sectsize * 2 + \ - (mp)->m_dirblksize + \ - XFS_FSB_TO_B(mp, (XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1)) + \ - XFS_ALLOCFREE_LOG_RES(mp, 1) + \ - (128 * (4 + (XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1) + \ - XFS_ALLOCFREE_LOG_COUNT(mp, 1)))) - #define XFS_ADDAFORK_LOG_RES(mp) ((mp)->m_reservations.tr_addafork) - -/* - * Removing the attribute fork of a file - * the inode being truncated: inode size - * the inode's bmap btree: max depth * block size - * And the bmap_finish transaction can free the blocks and bmap blocks: - * the agf for each of the ags: 4 * sector size - * the agfl for each of the ags: 4 * sector size - * the super block to reflect the freed blocks: sector size - * worst case split in allocation btrees per extent assuming 4 extents: - * 4 exts * 2 trees * (2 * max depth - 1) * block size - */ -#define XFS_CALC_ATTRINVAL_LOG_RES(mp) \ - (MAX( \ - ((mp)->m_sb.sb_inodesize + \ - XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + \ - (128 * (1 + XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)))), \ - ((4 * (mp)->m_sb.sb_sectsize) + \ - (4 * (mp)->m_sb.sb_sectsize) + \ - (mp)->m_sb.sb_sectsize + \ - XFS_ALLOCFREE_LOG_RES(mp, 4) + \ - (128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4)))))) - #define XFS_ATTRINVAL_LOG_RES(mp) ((mp)->m_reservations.tr_attrinval) - -/* - * Setting an attribute. - * the inode getting the attribute - * the superblock for allocations - * the agfs extents are allocated from - * the attribute btree * max depth - * the inode allocation btree - * Since attribute transaction space is dependent on the size of the attribute, - * the calculation is done partially at mount time and partially at runtime. - */ -#define XFS_CALC_ATTRSET_LOG_RES(mp) \ - ((mp)->m_sb.sb_inodesize + \ - (mp)->m_sb.sb_sectsize + \ - XFS_FSB_TO_B((mp), XFS_DA_NODE_MAXDEPTH) + \ - (128 * (2 + XFS_DA_NODE_MAXDEPTH))) - #define XFS_ATTRSET_LOG_RES(mp, ext) \ ((mp)->m_reservations.tr_attrset + \ (ext * (mp)->m_sb.sb_sectsize) + \ (ext * XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))) + \ (128 * (ext + (ext * XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))))) - -/* - * Removing an attribute. - * the inode: inode size - * the attribute btree could join: max depth * block size - * the inode bmap btree could join or split: max depth * block size - * And the bmap_finish transaction can free the attr blocks freed giving: - * the agf for the ag in which the blocks live: 2 * sector size - * the agfl for the ag in which the blocks live: 2 * sector size - * the superblock for the free block count: sector size - * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size - */ -#define XFS_CALC_ATTRRM_LOG_RES(mp) \ - (MAX( \ - ((mp)->m_sb.sb_inodesize + \ - XFS_FSB_TO_B((mp), XFS_DA_NODE_MAXDEPTH) + \ - XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + \ - (128 * (1 + XFS_DA_NODE_MAXDEPTH + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)))), \ - ((2 * (mp)->m_sb.sb_sectsize) + \ - (2 * (mp)->m_sb.sb_sectsize) + \ - (mp)->m_sb.sb_sectsize + \ - XFS_ALLOCFREE_LOG_RES(mp, 2) + \ - (128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2)))))) - #define XFS_ATTRRM_LOG_RES(mp) ((mp)->m_reservations.tr_attrrm) - -/* - * Clearing a bad agino number in an agi hash bucket. 
- */ -#define XFS_CALC_CLEAR_AGI_BUCKET_LOG_RES(mp) \ - ((mp)->m_sb.sb_sectsize + 128) - #define XFS_CLEAR_AGI_BUCKET_LOG_RES(mp) ((mp)->m_reservations.tr_clearagi) @@ -849,8 +345,7 @@ typedef struct xfs_item_ops { uint (*iop_size)(xfs_log_item_t *); void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *); void (*iop_pin)(xfs_log_item_t *); - void (*iop_unpin)(xfs_log_item_t *); - void (*iop_unpin_remove)(xfs_log_item_t *, struct xfs_trans *); + void (*iop_unpin)(xfs_log_item_t *, int remove); uint (*iop_trylock)(xfs_log_item_t *); void (*iop_unlock)(xfs_log_item_t *); xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t); @@ -862,8 +357,7 @@ typedef struct xfs_item_ops { #define IOP_SIZE(ip) (*(ip)->li_ops->iop_size)(ip) #define IOP_FORMAT(ip,vp) (*(ip)->li_ops->iop_format)(ip, vp) #define IOP_PIN(ip) (*(ip)->li_ops->iop_pin)(ip) -#define IOP_UNPIN(ip) (*(ip)->li_ops->iop_unpin)(ip) -#define IOP_UNPIN_REMOVE(ip,tp) (*(ip)->li_ops->iop_unpin_remove)(ip, tp) +#define IOP_UNPIN(ip, remove) (*(ip)->li_ops->iop_unpin)(ip, remove) #define IOP_TRYLOCK(ip) (*(ip)->li_ops->iop_trylock)(ip) #define IOP_UNLOCK(ip) (*(ip)->li_ops->iop_unlock)(ip) #define IOP_COMMITTED(ip, lsn) (*(ip)->li_ops->iop_committed)(ip, lsn) @@ -927,8 +421,7 @@ typedef struct xfs_trans { int64_t t_rblocks_delta;/* superblock rblocks change */ int64_t t_rextents_delta;/* superblocks rextents chg */ int64_t t_rextslog_delta;/* superblocks rextslog chg */ - unsigned int t_items_free; /* log item descs free */ - xfs_log_item_chunk_t t_items; /* first log item desc chunk */ + struct list_head t_items; /* log item descriptors */ xfs_trans_header_t t_header; /* header for in-log trans */ struct list_head t_busy; /* list of busy extents */ unsigned long t_pflags; /* saved process flags state */ @@ -980,8 +473,8 @@ void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint); void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *); int xfs_trans_iget(struct xfs_mount *, xfs_trans_t *, xfs_ino_t , uint, uint, struct xfs_inode **); -void xfs_trans_ijoin(xfs_trans_t *, struct xfs_inode *, uint); -void xfs_trans_ihold(xfs_trans_t *, struct xfs_inode *); +void xfs_trans_ijoin_ref(struct xfs_trans *, struct xfs_inode *, uint); +void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *); void xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint); void xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint); struct xfs_efi_log_item *xfs_trans_get_efi(xfs_trans_t *, uint); @@ -1006,6 +499,7 @@ int xfs_trans_ail_init(struct xfs_mount *); void xfs_trans_ail_destroy(struct xfs_mount *); extern kmem_zone_t *xfs_trans_zone; +extern kmem_zone_t *xfs_log_item_desc_zone; #endif /* __KERNEL__ */ diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index e799824f7245..dc9069568ff7 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -24,7 +24,6 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_trans_priv.h" #include "xfs_error.h" diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 63d81a22f4fd..90af025e6839 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -24,14 +24,10 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_bmap_btree.h" #include "xfs_alloc_btree.h" #include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" 
#include "xfs_buf_item.h" @@ -51,36 +47,17 @@ xfs_trans_buf_item_match( xfs_daddr_t blkno, int len) { - xfs_log_item_chunk_t *licp; - xfs_log_item_desc_t *lidp; - xfs_buf_log_item_t *blip; - int i; + struct xfs_log_item_desc *lidp; + struct xfs_buf_log_item *blip; len = BBTOB(len); - for (licp = &tp->t_items; licp != NULL; licp = licp->lic_next) { - if (xfs_lic_are_all_free(licp)) { - ASSERT(licp == &tp->t_items); - ASSERT(licp->lic_next == NULL); - return NULL; - } - - for (i = 0; i < licp->lic_unused; i++) { - /* - * Skip unoccupied slots. - */ - if (xfs_lic_isfree(licp, i)) - continue; - - lidp = xfs_lic_slot(licp, i); - blip = (xfs_buf_log_item_t *)lidp->lid_item; - if (blip->bli_item.li_type != XFS_LI_BUF) - continue; - - if (XFS_BUF_TARGET(blip->bli_buf) == target && - XFS_BUF_ADDR(blip->bli_buf) == blkno && - XFS_BUF_COUNT(blip->bli_buf) == len) - return blip->bli_buf; - } + list_for_each_entry(lidp, &tp->t_items, lid_trans) { + blip = (struct xfs_buf_log_item *)lidp->lid_item; + if (blip->bli_item.li_type == XFS_LI_BUF && + XFS_BUF_TARGET(blip->bli_buf) == target && + XFS_BUF_ADDR(blip->bli_buf) == blkno && + XFS_BUF_COUNT(blip->bli_buf) == len) + return blip->bli_buf; } return NULL; @@ -127,7 +104,7 @@ _xfs_trans_bjoin( /* * Get a log_item_desc to point at the new item. */ - (void) xfs_trans_add_item(tp, (xfs_log_item_t *)bip); + xfs_trans_add_item(tp, &bip->bli_item); /* * Initialize b_fsprivate2 so we can find it with incore_match() @@ -483,7 +460,6 @@ xfs_trans_brelse(xfs_trans_t *tp, { xfs_buf_log_item_t *bip; xfs_log_item_t *lip; - xfs_log_item_desc_t *lidp; /* * Default to a normal brelse() call if the tp is NULL. @@ -514,13 +490,6 @@ xfs_trans_brelse(xfs_trans_t *tp, ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL)); ASSERT(atomic_read(&bip->bli_refcount) > 0); - /* - * Find the item descriptor pointing to this buffer's - * log item. It must be there. - */ - lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)bip); - ASSERT(lidp != NULL); - trace_xfs_trans_brelse(bip); /* @@ -536,7 +505,7 @@ xfs_trans_brelse(xfs_trans_t *tp, * If the buffer is dirty within this transaction, we can't * release it until we commit. */ - if (lidp->lid_flags & XFS_LID_DIRTY) + if (bip->bli_item.li_desc->lid_flags & XFS_LID_DIRTY) return; /* @@ -553,7 +522,7 @@ xfs_trans_brelse(xfs_trans_t *tp, /* * Free up the log item descriptor tracking the released item. */ - xfs_trans_free_item(tp, lidp); + xfs_trans_del_item(&bip->bli_item); /* * Clear the hold flag in the buf log item if it is set. 
@@ -665,7 +634,6 @@ xfs_trans_log_buf(xfs_trans_t *tp, uint last) { xfs_buf_log_item_t *bip; - xfs_log_item_desc_t *lidp; ASSERT(XFS_BUF_ISBUSY(bp)); ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); @@ -690,7 +658,7 @@ xfs_trans_log_buf(xfs_trans_t *tp, bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); ASSERT(atomic_read(&bip->bli_refcount) > 0); XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks); - bip->bli_item.li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*))xfs_buf_iodone; + bip->bli_item.li_cb = xfs_buf_iodone; trace_xfs_trans_log_buf(bip); @@ -707,11 +675,8 @@ xfs_trans_log_buf(xfs_trans_t *tp, bip->bli_format.blf_flags &= ~XFS_BLF_CANCEL; } - lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)bip); - ASSERT(lidp != NULL); - tp->t_flags |= XFS_TRANS_DIRTY; - lidp->lid_flags |= XFS_LID_DIRTY; + bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY; bip->bli_flags |= XFS_BLI_LOGGED; xfs_buf_item_log(bip, first, last); } @@ -740,7 +705,6 @@ xfs_trans_binval( xfs_trans_t *tp, xfs_buf_t *bp) { - xfs_log_item_desc_t *lidp; xfs_buf_log_item_t *bip; ASSERT(XFS_BUF_ISBUSY(bp)); @@ -748,8 +712,6 @@ xfs_trans_binval( ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); - lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)bip); - ASSERT(lidp != NULL); ASSERT(atomic_read(&bip->bli_refcount) > 0); trace_xfs_trans_binval(bip); @@ -764,7 +726,7 @@ xfs_trans_binval( ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY))); ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_INODE_BUF)); ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); - ASSERT(lidp->lid_flags & XFS_LID_DIRTY); + ASSERT(bip->bli_item.li_desc->lid_flags & XFS_LID_DIRTY); ASSERT(tp->t_flags & XFS_TRANS_DIRTY); return; } @@ -797,7 +759,7 @@ xfs_trans_binval( bip->bli_format.blf_flags |= XFS_BLF_CANCEL; memset((char *)(bip->bli_format.blf_data_map), 0, (bip->bli_format.blf_map_size * sizeof(uint))); - lidp->lid_flags |= XFS_LID_DIRTY; + bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY; tp->t_flags |= XFS_TRANS_DIRTY; } @@ -853,12 +815,9 @@ xfs_trans_stale_inode_buf( ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_flags |= XFS_BLI_STALE_INODE; - bip->bli_item.li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*)) - xfs_buf_iodone; + bip->bli_item.li_cb = xfs_buf_iodone; } - - /* * Mark the buffer as being one which contains newly allocated * inodes. We need to make sure that even if this buffer is diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c index 27cce2a9c7e9..f783d5e9fa70 100644 --- a/fs/xfs/xfs_trans_extfree.c +++ b/fs/xfs/xfs_trans_extfree.c @@ -23,7 +23,6 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_trans_priv.h" #include "xfs_extfree_item.h" @@ -49,9 +48,8 @@ xfs_trans_get_efi(xfs_trans_t *tp, /* * Get a log_item_desc to point at the new item. 
*/ - (void) xfs_trans_add_item(tp, (xfs_log_item_t*)efip); - - return (efip); + xfs_trans_add_item(tp, &efip->efi_item); + return efip; } /* @@ -65,15 +63,11 @@ xfs_trans_log_efi_extent(xfs_trans_t *tp, xfs_fsblock_t start_block, xfs_extlen_t ext_len) { - xfs_log_item_desc_t *lidp; uint next_extent; xfs_extent_t *extp; - lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)efip); - ASSERT(lidp != NULL); - tp->t_flags |= XFS_TRANS_DIRTY; - lidp->lid_flags |= XFS_LID_DIRTY; + efip->efi_item.li_desc->lid_flags |= XFS_LID_DIRTY; next_extent = efip->efi_next_extent; ASSERT(next_extent < efip->efi_format.efi_nextents); @@ -106,9 +100,8 @@ xfs_trans_get_efd(xfs_trans_t *tp, /* * Get a log_item_desc to point at the new item. */ - (void) xfs_trans_add_item(tp, (xfs_log_item_t*)efdp); - - return (efdp); + xfs_trans_add_item(tp, &efdp->efd_item); + return efdp; } /* @@ -122,15 +115,11 @@ xfs_trans_log_efd_extent(xfs_trans_t *tp, xfs_fsblock_t start_block, xfs_extlen_t ext_len) { - xfs_log_item_desc_t *lidp; uint next_extent; xfs_extent_t *extp; - lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)efdp); - ASSERT(lidp != NULL); - tp->t_flags |= XFS_TRANS_DIRTY; - lidp->lid_flags |= XFS_LID_DIRTY; + efdp->efd_item.li_desc->lid_flags |= XFS_LID_DIRTY; next_extent = efdp->efd_next_extent; ASSERT(next_extent < efdp->efd_format.efd_nextents); diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c index 785ff101da0a..cdc53a1050c5 100644 --- a/fs/xfs/xfs_trans_inode.c +++ b/fs/xfs/xfs_trans_inode.c @@ -24,20 +24,16 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_bmap_btree.h" #include "xfs_alloc_btree.h" #include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_btree.h" -#include "xfs_ialloc.h" #include "xfs_trans_priv.h" #include "xfs_inode_item.h" +#include "xfs_trace.h" #ifdef XFS_TRANS_DEBUG STATIC void @@ -47,7 +43,6 @@ xfs_trans_inode_broot_debug( #define xfs_trans_inode_broot_debug(ip) #endif - /* * Get an inode and join it to the transaction. */ @@ -62,78 +57,66 @@ xfs_trans_iget( { int error; - error = xfs_iget(mp, tp, ino, flags, lock_flags, ipp, 0); - if (!error && tp) - xfs_trans_ijoin(tp, *ipp, lock_flags); + error = xfs_iget(mp, tp, ino, flags, lock_flags, ipp); + if (!error && tp) { + xfs_trans_ijoin(tp, *ipp); + (*ipp)->i_itemp->ili_lock_flags = lock_flags; + } return error; } /* - * Add the locked inode to the transaction. - * The inode must be locked, and it cannot be associated with any - * transaction. The caller must specify the locks already held - * on the inode. + * Add a locked inode to the transaction. + * + * The inode must be locked, and it cannot be associated with any transaction. */ void xfs_trans_ijoin( - xfs_trans_t *tp, - xfs_inode_t *ip, - uint lock_flags) + struct xfs_trans *tp, + struct xfs_inode *ip) { xfs_inode_log_item_t *iip; ASSERT(ip->i_transp == NULL); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - ASSERT(lock_flags & XFS_ILOCK_EXCL); if (ip->i_itemp == NULL) xfs_inode_item_init(ip, ip->i_mount); iip = ip->i_itemp; - ASSERT(iip->ili_flags == 0); + ASSERT(iip->ili_lock_flags == 0); /* * Get a log_item_desc to point at the new item. */ - (void) xfs_trans_add_item(tp, (xfs_log_item_t*)(iip)); + xfs_trans_add_item(tp, &iip->ili_item); xfs_trans_inode_broot_debug(ip); /* - * If the IO lock is already held, mark that in the inode log item. 
- */ - if (lock_flags & XFS_IOLOCK_EXCL) { - iip->ili_flags |= XFS_ILI_IOLOCKED_EXCL; - } else if (lock_flags & XFS_IOLOCK_SHARED) { - iip->ili_flags |= XFS_ILI_IOLOCKED_SHARED; - } - - /* * Initialize i_transp so we can find it with xfs_inode_incore() * in xfs_trans_iget() above. */ ip->i_transp = tp; } - - /* - * Mark the inode as not needing to be unlocked when the inode item's - * IOP_UNLOCK() routine is called. The inode must already be locked - * and associated with the given transaction. + * Add a locked inode to the transaction. + * + * + * Grabs a reference to the inode which will be dropped when the transaction + * is commited. The inode will also be unlocked at that point. The inode + * must be locked, and it cannot be associated with any transaction. */ -/*ARGSUSED*/ void -xfs_trans_ihold( - xfs_trans_t *tp, - xfs_inode_t *ip) +xfs_trans_ijoin_ref( + struct xfs_trans *tp, + struct xfs_inode *ip, + uint lock_flags) { - ASSERT(ip->i_transp == tp); - ASSERT(ip->i_itemp != NULL); - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - - ip->i_itemp->ili_flags |= XFS_ILI_HOLD; + xfs_trans_ijoin(tp, ip); + IHOLD(ip); + ip->i_itemp->ili_lock_flags = lock_flags; } - /* * This is called to mark the fields indicated in fieldmask as needing * to be logged when the transaction is committed. The inode must @@ -149,17 +132,12 @@ xfs_trans_log_inode( xfs_inode_t *ip, uint flags) { - xfs_log_item_desc_t *lidp; - ASSERT(ip->i_transp == tp); ASSERT(ip->i_itemp != NULL); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)(ip->i_itemp)); - ASSERT(lidp != NULL); - tp->t_flags |= XFS_TRANS_DIRTY; - lidp->lid_flags |= XFS_LID_DIRTY; + ip->i_itemp->ili_item.li_desc->lid_flags |= XFS_LID_DIRTY; /* * Always OR in the bits from the ili_last_fields field. diff --git a/fs/xfs/xfs_trans_item.c b/fs/xfs/xfs_trans_item.c deleted file mode 100644 index f11d37d06dcc..000000000000 --- a/fs/xfs/xfs_trans_item.c +++ /dev/null @@ -1,441 +0,0 @@ -/* - * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include "xfs.h" -#include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_log.h" -#include "xfs_inum.h" -#include "xfs_trans.h" -#include "xfs_trans_priv.h" -/* XXX: from here down needed until struct xfs_trans has its own ailp */ -#include "xfs_bit.h" -#include "xfs_buf_item.h" -#include "xfs_sb.h" -#include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" -#include "xfs_mount.h" - -STATIC int xfs_trans_unlock_chunk(xfs_log_item_chunk_t *, - int, int, xfs_lsn_t); - -/* - * This is called to add the given log item to the transaction's - * list of log items. It must find a free log item descriptor - * or allocate a new one and add the item to that descriptor. - * The function returns a pointer to item descriptor used to point - * to the new item. 
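The inode joining API also changes shape here: the old xfs_trans_ijoin(tp, ip, lock_flags), optionally followed by xfs_trans_ihold(), becomes either xfs_trans_ijoin(tp, ip), where the caller keeps the inode locked and referenced past the commit (this is the conversion visible in the xfs_trans_roll() hunk earlier in the patch), or xfs_trans_ijoin_ref(tp, ip, lock_flags), where the commit unlocks the named locks and drops the reference it grabbed. An illustrative fragment of the conversion, assuming kernel context with tp and ip in scope:

	/* before this patch: join, then hold so the commit leaves it locked */
	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
	xfs_trans_ihold(tp, ip);

	/* after this patch: the plain join keeps the inode across the commit ... */
	xfs_trans_ijoin(tp, ip);

	/*
	 * ... while ijoin_ref lets the commit unlock XFS_ILOCK_EXCL and drop
	 * the reference it took.
	 */
	xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);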
The log item will now point to its new descriptor - * with its li_desc field. - */ -xfs_log_item_desc_t * -xfs_trans_add_item(xfs_trans_t *tp, xfs_log_item_t *lip) -{ - xfs_log_item_desc_t *lidp; - xfs_log_item_chunk_t *licp; - int i=0; - - /* - * If there are no free descriptors, allocate a new chunk - * of them and put it at the front of the chunk list. - */ - if (tp->t_items_free == 0) { - licp = (xfs_log_item_chunk_t*) - kmem_alloc(sizeof(xfs_log_item_chunk_t), KM_SLEEP); - ASSERT(licp != NULL); - /* - * Initialize the chunk, and then - * claim the first slot in the newly allocated chunk. - */ - xfs_lic_init(licp); - xfs_lic_claim(licp, 0); - licp->lic_unused = 1; - xfs_lic_init_slot(licp, 0); - lidp = xfs_lic_slot(licp, 0); - - /* - * Link in the new chunk and update the free count. - */ - licp->lic_next = tp->t_items.lic_next; - tp->t_items.lic_next = licp; - tp->t_items_free = XFS_LIC_NUM_SLOTS - 1; - - /* - * Initialize the descriptor and the generic portion - * of the log item. - * - * Point the new slot at this item and return it. - * Also point the log item at its currently active - * descriptor and set the item's mount pointer. - */ - lidp->lid_item = lip; - lidp->lid_flags = 0; - lidp->lid_size = 0; - lip->li_desc = lidp; - lip->li_mountp = tp->t_mountp; - lip->li_ailp = tp->t_mountp->m_ail; - return lidp; - } - - /* - * Find the free descriptor. It is somewhere in the chunklist - * of descriptors. - */ - licp = &tp->t_items; - while (licp != NULL) { - if (xfs_lic_vacancy(licp)) { - if (licp->lic_unused <= XFS_LIC_MAX_SLOT) { - i = licp->lic_unused; - ASSERT(xfs_lic_isfree(licp, i)); - break; - } - for (i = 0; i <= XFS_LIC_MAX_SLOT; i++) { - if (xfs_lic_isfree(licp, i)) - break; - } - ASSERT(i <= XFS_LIC_MAX_SLOT); - break; - } - licp = licp->lic_next; - } - ASSERT(licp != NULL); - /* - * If we find a free descriptor, claim it, - * initialize it, and return it. - */ - xfs_lic_claim(licp, i); - if (licp->lic_unused <= i) { - licp->lic_unused = i + 1; - xfs_lic_init_slot(licp, i); - } - lidp = xfs_lic_slot(licp, i); - tp->t_items_free--; - lidp->lid_item = lip; - lidp->lid_flags = 0; - lidp->lid_size = 0; - lip->li_desc = lidp; - lip->li_mountp = tp->t_mountp; - lip->li_ailp = tp->t_mountp->m_ail; - return lidp; -} - -/* - * Free the given descriptor. - * - * This requires setting the bit in the chunk's free mask corresponding - * to the given slot. - */ -void -xfs_trans_free_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp) -{ - uint slot; - xfs_log_item_chunk_t *licp; - xfs_log_item_chunk_t **licpp; - - slot = xfs_lic_desc_to_slot(lidp); - licp = xfs_lic_desc_to_chunk(lidp); - xfs_lic_relse(licp, slot); - lidp->lid_item->li_desc = NULL; - tp->t_items_free++; - - /* - * If there are no more used items in the chunk and this is not - * the chunk embedded in the transaction structure, then free - * the chunk. First pull it from the chunk list and then - * free it back to the heap. We didn't bother with a doubly - * linked list here because the lists should be very short - * and this is not a performance path. It's better to save - * the memory of the extra pointer. - * - * Also decrement the transaction structure's count of free items - * by the number in a chunk since we are freeing an empty chunk. 
- */ - if (xfs_lic_are_all_free(licp) && (licp != &(tp->t_items))) { - licpp = &(tp->t_items.lic_next); - while (*licpp != licp) { - ASSERT(*licpp != NULL); - licpp = &((*licpp)->lic_next); - } - *licpp = licp->lic_next; - kmem_free(licp); - tp->t_items_free -= XFS_LIC_NUM_SLOTS; - } -} - -/* - * This is called to find the descriptor corresponding to the given - * log item. It returns a pointer to the descriptor. - * The log item MUST have a corresponding descriptor in the given - * transaction. This routine does not return NULL, it panics. - * - * The descriptor pointer is kept in the log item's li_desc field. - * Just return it. - */ -/*ARGSUSED*/ -xfs_log_item_desc_t * -xfs_trans_find_item(xfs_trans_t *tp, xfs_log_item_t *lip) -{ - ASSERT(lip->li_desc != NULL); - - return lip->li_desc; -} - - -/* - * Return a pointer to the first descriptor in the chunk list. - * This does not return NULL if there are none, it panics. - * - * The first descriptor must be in either the first or second chunk. - * This is because the only chunk allowed to be empty is the first. - * All others are freed when they become empty. - * - * At some point this and xfs_trans_next_item() should be optimized - * to quickly look at the mask to determine if there is anything to - * look at. - */ -xfs_log_item_desc_t * -xfs_trans_first_item(xfs_trans_t *tp) -{ - xfs_log_item_chunk_t *licp; - int i; - - licp = &tp->t_items; - /* - * If it's not in the first chunk, skip to the second. - */ - if (xfs_lic_are_all_free(licp)) { - licp = licp->lic_next; - } - - /* - * Return the first non-free descriptor in the chunk. - */ - ASSERT(!xfs_lic_are_all_free(licp)); - for (i = 0; i < licp->lic_unused; i++) { - if (xfs_lic_isfree(licp, i)) { - continue; - } - - return xfs_lic_slot(licp, i); - } - cmn_err(CE_WARN, "xfs_trans_first_item() -- no first item"); - return NULL; -} - - -/* - * Given a descriptor, return the next descriptor in the chunk list. - * This returns NULL if there are no more used descriptors in the list. - * - * We do this by first locating the chunk in which the descriptor resides, - * and then scanning forward in the chunk and the list for the next - * used descriptor. - */ -/*ARGSUSED*/ -xfs_log_item_desc_t * -xfs_trans_next_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp) -{ - xfs_log_item_chunk_t *licp; - int i; - - licp = xfs_lic_desc_to_chunk(lidp); - - /* - * First search the rest of the chunk. The for loop keeps us - * from referencing things beyond the end of the chunk. - */ - for (i = (int)xfs_lic_desc_to_slot(lidp) + 1; i < licp->lic_unused; i++) { - if (xfs_lic_isfree(licp, i)) { - continue; - } - - return xfs_lic_slot(licp, i); - } - - /* - * Now search the next chunk. It must be there, because the - * next chunk would have been freed if it were empty. - * If there is no next chunk, return NULL. - */ - if (licp->lic_next == NULL) { - return NULL; - } - - licp = licp->lic_next; - ASSERT(!xfs_lic_are_all_free(licp)); - for (i = 0; i < licp->lic_unused; i++) { - if (xfs_lic_isfree(licp, i)) { - continue; - } - - return xfs_lic_slot(licp, i); - } - ASSERT(0); - /* NOTREACHED */ - return NULL; /* keep gcc quite */ -} - -/* - * This is called to unlock all of the items of a transaction and to free - * all the descriptors of that transaction. - * - * It walks the list of descriptors and unlocks each item. It frees - * each chunk except that embedded in the transaction as it goes along. 
- */ -void -xfs_trans_free_items( - xfs_trans_t *tp, - xfs_lsn_t commit_lsn, - int flags) -{ - xfs_log_item_chunk_t *licp; - xfs_log_item_chunk_t *next_licp; - int abort; - - abort = flags & XFS_TRANS_ABORT; - licp = &tp->t_items; - /* - * Special case the embedded chunk so we don't free it below. - */ - if (!xfs_lic_are_all_free(licp)) { - (void) xfs_trans_unlock_chunk(licp, 1, abort, commit_lsn); - xfs_lic_all_free(licp); - licp->lic_unused = 0; - } - licp = licp->lic_next; - - /* - * Unlock each item in each chunk and free the chunks. - */ - while (licp != NULL) { - ASSERT(!xfs_lic_are_all_free(licp)); - (void) xfs_trans_unlock_chunk(licp, 1, abort, commit_lsn); - next_licp = licp->lic_next; - kmem_free(licp); - licp = next_licp; - } - - /* - * Reset the transaction structure's free item count. - */ - tp->t_items_free = XFS_LIC_NUM_SLOTS; - tp->t_items.lic_next = NULL; -} - - - -/* - * This is called to unlock the items associated with a transaction. - * Items which were not logged should be freed. - * Those which were logged must still be tracked so they can be unpinned - * when the transaction commits. - */ -void -xfs_trans_unlock_items(xfs_trans_t *tp, xfs_lsn_t commit_lsn) -{ - xfs_log_item_chunk_t *licp; - xfs_log_item_chunk_t *next_licp; - xfs_log_item_chunk_t **licpp; - int freed; - - freed = 0; - licp = &tp->t_items; - - /* - * Special case the embedded chunk so we don't free. - */ - if (!xfs_lic_are_all_free(licp)) { - freed = xfs_trans_unlock_chunk(licp, 0, 0, commit_lsn); - } - licpp = &(tp->t_items.lic_next); - licp = licp->lic_next; - - /* - * Unlock each item in each chunk, free non-dirty descriptors, - * and free empty chunks. - */ - while (licp != NULL) { - ASSERT(!xfs_lic_are_all_free(licp)); - freed += xfs_trans_unlock_chunk(licp, 0, 0, commit_lsn); - next_licp = licp->lic_next; - if (xfs_lic_are_all_free(licp)) { - *licpp = next_licp; - kmem_free(licp); - freed -= XFS_LIC_NUM_SLOTS; - } else { - licpp = &(licp->lic_next); - } - ASSERT(*licpp == next_licp); - licp = next_licp; - } - - /* - * Fix the free descriptor count in the transaction. - */ - tp->t_items_free += freed; -} - -/* - * Unlock each item pointed to by a descriptor in the given chunk. - * Stamp the commit lsn into each item if necessary. - * Free descriptors pointing to items which are not dirty if freeing_chunk - * is zero. If freeing_chunk is non-zero, then we need to unlock all - * items in the chunk. - * - * Return the number of descriptors freed. - */ -STATIC int -xfs_trans_unlock_chunk( - xfs_log_item_chunk_t *licp, - int freeing_chunk, - int abort, - xfs_lsn_t commit_lsn) -{ - xfs_log_item_desc_t *lidp; - xfs_log_item_t *lip; - int i; - int freed; - - freed = 0; - lidp = licp->lic_descs; - for (i = 0; i < licp->lic_unused; i++, lidp++) { - if (xfs_lic_isfree(licp, i)) { - continue; - } - lip = lidp->lid_item; - lip->li_desc = NULL; - - if (commit_lsn != NULLCOMMITLSN) - IOP_COMMITTING(lip, commit_lsn); - if (abort) - lip->li_flags |= XFS_LI_ABORTED; - IOP_UNLOCK(lip); - - /* - * Free the descriptor if the item is not dirty - * within this transaction and the caller is not - * going to just free the entire thing regardless. 
- */ - if (!(freeing_chunk) && - (!(lidp->lid_flags & XFS_LID_DIRTY) || abort)) { - xfs_lic_relse(licp, i); - freed++; - } - } - - return freed; -} diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index c6e4f2c8de6e..e2d93d8ead7b 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -23,22 +23,8 @@ struct xfs_log_item_desc; struct xfs_mount; struct xfs_trans; -/* - * From xfs_trans_item.c - */ -struct xfs_log_item_desc *xfs_trans_add_item(struct xfs_trans *, - struct xfs_log_item *); -void xfs_trans_free_item(struct xfs_trans *, - struct xfs_log_item_desc *); -struct xfs_log_item_desc *xfs_trans_find_item(struct xfs_trans *, - struct xfs_log_item *); -struct xfs_log_item_desc *xfs_trans_first_item(struct xfs_trans *); -struct xfs_log_item_desc *xfs_trans_next_item(struct xfs_trans *, - struct xfs_log_item_desc *); - -void xfs_trans_unlock_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn); -void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn, - int flags); +void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *); +void xfs_trans_del_item(struct xfs_log_item *); void xfs_trans_item_committed(struct xfs_log_item *lip, xfs_lsn_t commit_lsn, int aborted); diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c index 4d88616bde91..b7d5769d2df0 100644 --- a/fs/xfs/xfs_utils.c +++ b/fs/xfs/xfs_utils.c @@ -25,18 +25,14 @@ #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_bmap_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_inode_item.h" #include "xfs_bmap.h" #include "xfs_error.h" #include "xfs_quota.h" -#include "xfs_rw.h" #include "xfs_itable.h" #include "xfs_utils.h" @@ -324,86 +320,3 @@ xfs_bumplink( xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); return 0; } - -/* - * Try to truncate the given file to 0 length. Currently called - * only out of xfs_remove when it has to truncate a file to free - * up space for the remove to proceed. - */ -int -xfs_truncate_file( - xfs_mount_t *mp, - xfs_inode_t *ip) -{ - xfs_trans_t *tp; - int error; - -#ifdef QUOTADEBUG - /* - * This is called to truncate the quotainodes too. - */ - if (XFS_IS_UQUOTA_ON(mp)) { - if (ip->i_ino != mp->m_sb.sb_uquotino) - ASSERT(ip->i_udquot); - } - if (XFS_IS_OQUOTA_ON(mp)) { - if (ip->i_ino != mp->m_sb.sb_gquotino) - ASSERT(ip->i_gdquot); - } -#endif - /* - * Make the call to xfs_itruncate_start before starting the - * transaction, because we cannot make the call while we're - * in a transaction. - */ - xfs_ilock(ip, XFS_IOLOCK_EXCL); - error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, (xfs_fsize_t)0); - if (error) { - xfs_iunlock(ip, XFS_IOLOCK_EXCL); - return error; - } - - tp = xfs_trans_alloc(mp, XFS_TRANS_TRUNCATE_FILE); - if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, - XFS_ITRUNCATE_LOG_COUNT))) { - xfs_trans_cancel(tp, 0); - xfs_iunlock(ip, XFS_IOLOCK_EXCL); - return error; - } - - /* - * Follow the normal truncate locking protocol. Since we - * hold the inode in the transaction, we know that its number - * of references will stay constant. - */ - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - xfs_trans_ihold(tp, ip); - /* - * Signal a sync xaction. The only case where that isn't - * the case is if we're truncating an already unlinked file - * on a wsync fs. 
In that case, we know the blocks can't - * reappear in the file because the links to file are - * permanently toast. Currently, we're always going to - * want a sync transaction because this code is being - * called from places where nlink is guaranteed to be 1 - * but I'm leaving the tests in to protect against future - * changes -- rcc. - */ - error = xfs_itruncate_finish(&tp, ip, (xfs_fsize_t)0, - XFS_DATA_FORK, - ((ip->i_d.di_nlink != 0 || - !(mp->m_flags & XFS_MOUNT_WSYNC)) - ? 1 : 0)); - if (error) { - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | - XFS_TRANS_ABORT); - } else { - xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); - } - xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - - return error; -} diff --git a/fs/xfs/xfs_utils.h b/fs/xfs/xfs_utils.h index ef321225d269..f55b9678264f 100644 --- a/fs/xfs/xfs_utils.h +++ b/fs/xfs/xfs_utils.h @@ -18,7 +18,6 @@ #ifndef __XFS_UTILS_H__ #define __XFS_UTILS_H__ -extern int xfs_truncate_file(xfs_mount_t *, xfs_inode_t *); extern int xfs_dir_ialloc(xfs_trans_t **, xfs_inode_t *, mode_t, xfs_nlink_t, xfs_dev_t, cred_t *, prid_t, int, xfs_inode_t **, int *); diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 9d376be0ea38..3ac137dd531b 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -26,19 +26,14 @@ #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_da_btree.h" #include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" #include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_inode_item.h" #include "xfs_itable.h" -#include "xfs_btree.h" #include "xfs_ialloc.h" #include "xfs_alloc.h" #include "xfs_bmap.h" @@ -73,7 +68,7 @@ xfs_setattr( struct xfs_dquot *udqp, *gdqp, *olddquot1, *olddquot2; int need_iolock = 1; - xfs_itrace_entry(ip); + trace_xfs_setattr(ip); if (mp->m_flags & XFS_MOUNT_RDONLY) return XFS_ERROR(EROFS); @@ -143,16 +138,6 @@ xfs_setattr( goto error_return; } } else { - if (DM_EVENT_ENABLED(ip, DM_EVENT_TRUNCATE) && - !(flags & XFS_ATTR_DMI)) { - int dmflags = AT_DELAY_FLAG(flags) | DM_SEM_FLAG_WR; - code = XFS_SEND_DATA(mp, DM_EVENT_TRUNCATE, ip, - iattr->ia_size, 0, dmflags, NULL); - if (code) { - lock_flags = 0; - goto error_return; - } - } if (need_iolock) lock_flags |= XFS_IOLOCK_EXCL; } @@ -267,7 +252,7 @@ xfs_setattr( if (code) { ASSERT(tp == NULL); lock_flags &= ~XFS_ILOCK_EXCL; - ASSERT(lock_flags == XFS_IOLOCK_EXCL); + ASSERT(lock_flags == XFS_IOLOCK_EXCL || !need_iolock); goto error_return; } tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE); @@ -283,8 +268,7 @@ xfs_setattr( commit_flags = XFS_TRANS_RELEASE_LOG_RES; xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, lock_flags); - xfs_trans_ihold(tp, ip); + xfs_trans_ijoin(tp, ip); /* * Only change the c/mtime if we are changing the size @@ -334,8 +318,7 @@ xfs_setattr( xfs_iflags_set(ip, XFS_ITRUNCATED); } } else if (tp) { - xfs_trans_ijoin(tp, ip, lock_flags); - xfs_trans_ihold(tp, ip); + xfs_trans_ijoin(tp, ip); } /* @@ -470,17 +453,10 @@ xfs_setattr( return XFS_ERROR(code); } - if (DM_EVENT_ENABLED(ip, DM_EVENT_ATTRIBUTE) && - !(flags & XFS_ATTR_DMI)) { - (void) XFS_SEND_NAMESP(mp, DM_EVENT_ATTRIBUTE, ip, DM_RIGHT_NULL, - NULL, DM_RIGHT_NULL, NULL, NULL, - 0, 0, AT_DELAY_FLAG(flags)); - } return 0; abort_return: commit_flags |= XFS_TRANS_ABORT; - /* FALLTHROUGH */ error_return: xfs_qm_dqrele(udqp); 
xfs_qm_dqrele(gdqp); @@ -516,7 +492,7 @@ xfs_readlink_bmap( int error = 0; error = xfs_bmapi(NULL, ip, 0, XFS_B_TO_FSB(mp, pathlen), 0, NULL, 0, - mval, &nmaps, NULL, NULL); + mval, &nmaps, NULL); if (error) goto out; @@ -557,7 +533,7 @@ xfs_readlink( int pathlen; int error = 0; - xfs_itrace_entry(ip); + trace_xfs_readlink(ip); if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); @@ -613,14 +589,14 @@ xfs_free_eofblocks( */ end_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)ip->i_size)); last_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp)); - map_len = last_fsb - end_fsb; - if (map_len <= 0) + if (last_fsb <= end_fsb) return 0; + map_len = last_fsb - end_fsb; nimaps = 1; xfs_ilock(ip, XFS_ILOCK_SHARED); error = xfs_bmapi(NULL, ip, end_fsb, map_len, 0, - NULL, 0, &imap, &nimaps, NULL, NULL); + NULL, 0, &imap, &nimaps, NULL); xfs_iunlock(ip, XFS_ILOCK_SHARED); if (!error && (nimaps != 0) && @@ -675,10 +651,7 @@ xfs_free_eofblocks( } xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, - XFS_IOLOCK_EXCL | - XFS_ILOCK_EXCL); - xfs_trans_ihold(tp, ip); + xfs_trans_ijoin(tp, ip); error = xfs_itruncate_finish(&tp, ip, ip->i_size, @@ -750,8 +723,7 @@ xfs_inactive_symlink_rmt( xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); size = (int)ip->i_d.di_size; ip->i_d.di_size = 0; - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - xfs_trans_ihold(tp, ip); + xfs_trans_ijoin(tp, ip); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); /* * Find the block(s) so we can inval and unmap them. @@ -761,7 +733,7 @@ xfs_inactive_symlink_rmt( nmaps = ARRAY_SIZE(mval); if ((error = xfs_bmapi(tp, ip, 0, XFS_B_TO_FSB(mp, size), XFS_BMAPI_METADATA, &first_block, 0, mval, &nmaps, - &free_list, NULL))) + &free_list))) goto error0; /* * Invalidate the block(s). @@ -776,7 +748,7 @@ xfs_inactive_symlink_rmt( * Unmap the dead block(s) to the free_list. */ if ((error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps, - &first_block, &free_list, NULL, &done))) + &first_block, &free_list, &done))) goto error1; ASSERT(done); /* @@ -795,8 +767,7 @@ xfs_inactive_symlink_rmt( * Mark it dirty so it will be logged and moved forward in the log as * part of every commit. */ - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - xfs_trans_ihold(tp, ip); + xfs_trans_ijoin(tp, ip); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); /* * Get a new, empty transaction to return to our caller. @@ -929,8 +900,7 @@ xfs_inactive_attrs( goto error_cancel; xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); - xfs_trans_ihold(tp, ip); + xfs_trans_ijoin(tp, ip); xfs_idestroy_fork(ip, XFS_ATTR_FORK); ASSERT(ip->i_d.di_anextents == 0); @@ -1035,8 +1005,6 @@ xfs_inactive( int error; int truncate; - xfs_itrace_entry(ip); - /* * If the inode is already free, then there can be nothing * to clean up here. @@ -1060,9 +1028,6 @@ xfs_inactive( mp = ip->i_mount; - if (ip->i_d.di_nlink == 0 && DM_EVENT_ENABLED(ip, DM_EVENT_DESTROY)) - XFS_SEND_DESTROY(mp, ip, DM_RIGHT_NULL); - error = 0; /* If this is a read-only mount, don't do this (would generate I/O) */ @@ -1120,8 +1085,7 @@ xfs_inactive( } xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); - xfs_trans_ihold(tp, ip); + xfs_trans_ijoin(tp, ip); /* * normally, we have to run xfs_itruncate_finish sync. 
@@ -1154,8 +1118,7 @@ xfs_inactive( return VN_INACTIVE_CACHE; } - xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); - xfs_trans_ihold(tp, ip); + xfs_trans_ijoin(tp, ip); } else { error = xfs_trans_reserve(tp, 0, XFS_IFREE_LOG_RES(mp), @@ -1168,8 +1131,7 @@ xfs_inactive( } xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); - xfs_trans_ihold(tp, ip); + xfs_trans_ijoin(tp, ip); } /* @@ -1257,7 +1219,7 @@ xfs_lookup( int error; uint lock_mode; - xfs_itrace_entry(dp); + trace_xfs_lookup(dp, name); if (XFS_FORCED_SHUTDOWN(dp->i_mount)) return XFS_ERROR(EIO); @@ -1269,7 +1231,7 @@ xfs_lookup( if (error) goto out; - error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp, 0); + error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp); if (error) goto out_free_name; @@ -1309,21 +1271,11 @@ xfs_create( uint log_res; uint log_count; - xfs_itrace_entry(dp); + trace_xfs_create(dp, name); if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); - if (DM_EVENT_ENABLED(dp, DM_EVENT_CREATE)) { - error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE, - dp, DM_RIGHT_NULL, NULL, - DM_RIGHT_NULL, name->name, NULL, - mode, 0, 0); - - if (error) - return error; - } - if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) prid = dp->i_d.di_projid; else @@ -1427,8 +1379,7 @@ xfs_create( * the transaction cancel unlocking dp so don't do it explicitly in the * error path. */ - IHOLD(dp); - xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); + xfs_trans_ijoin_ref(tp, dp, XFS_ILOCK_EXCL); unlock_dp_on_error = B_FALSE; error = xfs_dir_createname(tp, dp, name, ip->i_ino, @@ -1487,16 +1438,7 @@ xfs_create( xfs_qm_dqrele(gdqp); *ipp = ip; - - /* Fallthrough to std_return with error = 0 */ - std_return: - if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTCREATE)) { - XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE, dp, DM_RIGHT_NULL, - ip, DM_RIGHT_NULL, name->name, NULL, mode, - error, 0); - } - - return error; + return 0; out_bmap_cancel: xfs_bmap_cancel(&free_list); @@ -1510,8 +1452,8 @@ xfs_create( if (unlock_dp_on_error) xfs_iunlock(dp, XFS_ILOCK_EXCL); - - goto std_return; + std_return: + return error; out_abort_rele: /* @@ -1726,20 +1668,11 @@ xfs_remove( uint resblks; uint log_count; - xfs_itrace_entry(dp); - xfs_itrace_entry(ip); + trace_xfs_remove(dp, name); if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); - if (DM_EVENT_ENABLED(dp, DM_EVENT_REMOVE)) { - error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE, dp, DM_RIGHT_NULL, - NULL, DM_RIGHT_NULL, name->name, NULL, - ip->i_d.di_mode, 0, 0); - if (error) - return error; - } - error = xfs_qm_dqattach(dp, 0); if (error) goto std_return; @@ -1782,15 +1715,8 @@ xfs_remove( xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL); - /* - * At this point, we've gotten both the directory and the entry - * inodes locked. - */ - IHOLD(ip); - xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); - - IHOLD(dp); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin_ref(tp, dp, XFS_ILOCK_EXCL); + xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); /* * If we're removing a directory perform some additional validation. 
@@ -1877,21 +1803,15 @@ xfs_remove( if (!is_dir && link_zero && xfs_inode_is_filestream(ip)) xfs_filestream_deassociate(ip); - std_return: - if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTREMOVE)) { - XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE, dp, DM_RIGHT_NULL, - NULL, DM_RIGHT_NULL, name->name, NULL, - ip->i_d.di_mode, error, 0); - } - - return error; + return 0; out_bmap_cancel: xfs_bmap_cancel(&free_list); cancel_flags |= XFS_TRANS_ABORT; out_trans_cancel: xfs_trans_cancel(tp, cancel_flags); - goto std_return; + std_return: + return error; } int @@ -1909,25 +1829,13 @@ xfs_link( int committed; int resblks; - xfs_itrace_entry(tdp); - xfs_itrace_entry(sip); + trace_xfs_link(tdp, target_name); ASSERT(!S_ISDIR(sip->i_d.di_mode)); if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); - if (DM_EVENT_ENABLED(tdp, DM_EVENT_LINK)) { - error = XFS_SEND_NAMESP(mp, DM_EVENT_LINK, - tdp, DM_RIGHT_NULL, - sip, DM_RIGHT_NULL, - target_name->name, NULL, 0, 0, 0); - if (error) - return error; - } - - /* Return through std_return after this point. */ - error = xfs_qm_dqattach(sip, 0); if (error) goto std_return; @@ -1953,15 +1861,8 @@ xfs_link( xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL); - /* - * Increment vnode ref counts since xfs_trans_commit & - * xfs_trans_cancel will both unlock the inodes and - * decrement the associated ref counts. - */ - IHOLD(sip); - IHOLD(tdp); - xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL); + xfs_trans_ijoin_ref(tp, sip, XFS_ILOCK_EXCL); + xfs_trans_ijoin_ref(tp, tdp, XFS_ILOCK_EXCL); /* * If the source has too many links, we can't make any more to it. @@ -2014,27 +1915,14 @@ xfs_link( goto abort_return; } - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); - if (error) - goto std_return; - - /* Fall through to std_return with error = 0. */ -std_return: - if (DM_EVENT_ENABLED(sip, DM_EVENT_POSTLINK)) { - (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTLINK, - tdp, DM_RIGHT_NULL, - sip, DM_RIGHT_NULL, - target_name->name, NULL, 0, error, 0); - } - return error; + return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); abort_return: cancel_flags |= XFS_TRANS_ABORT; - /* FALLTHROUGH */ - error_return: xfs_trans_cancel(tp, cancel_flags); - goto std_return; + std_return: + return error; } int @@ -2074,7 +1962,7 @@ xfs_symlink( ip = NULL; tp = NULL; - xfs_itrace_entry(dp); + trace_xfs_symlink(dp, link_name); if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); @@ -2086,17 +1974,6 @@ xfs_symlink( if (pathlen >= MAXPATHLEN) /* total string too long */ return XFS_ERROR(ENAMETOOLONG); - if (DM_EVENT_ENABLED(dp, DM_EVENT_SYMLINK)) { - error = XFS_SEND_NAMESP(mp, DM_EVENT_SYMLINK, dp, - DM_RIGHT_NULL, NULL, DM_RIGHT_NULL, - link_name->name, - (unsigned char *)target_path, 0, 0, 0); - if (error) - return error; - } - - /* Return through std_return after this point. */ - udqp = gdqp = NULL; if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) prid = dp->i_d.di_projid; @@ -2180,8 +2057,7 @@ xfs_symlink( * transaction cancel unlocking dp so don't do it explicitly in the * error path. 
*/ - IHOLD(dp); - xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); + xfs_trans_ijoin_ref(tp, dp, XFS_ILOCK_EXCL); unlock_dp_on_error = B_FALSE; /* @@ -2215,7 +2091,7 @@ xfs_symlink( error = xfs_bmapi(tp, ip, first_fsb, fs_blocks, XFS_BMAPI_WRITE | XFS_BMAPI_METADATA, &first_block, resblks, mval, &nmaps, - &free_list, NULL); + &free_list); if (error) { goto error1; } @@ -2278,21 +2154,8 @@ xfs_symlink( xfs_qm_dqrele(udqp); xfs_qm_dqrele(gdqp); - /* Fall through to std_return with error = 0 or errno from - * xfs_trans_commit */ -std_return: - if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTSYMLINK)) { - (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTSYMLINK, - dp, DM_RIGHT_NULL, - error ? NULL : ip, - DM_RIGHT_NULL, link_name->name, - (unsigned char *)target_path, - 0, error, 0); - } - - if (!error) - *ipp = ip; - return error; + *ipp = ip; + return 0; error2: IRELE(ip); @@ -2306,8 +2169,8 @@ std_return: if (unlock_dp_on_error) xfs_iunlock(dp, XFS_ILOCK_EXCL); - - goto std_return; + std_return: + return error; } int @@ -2333,13 +2196,12 @@ xfs_set_dmattrs( return error; } xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); ip->i_d.di_dmevmask = evmask; ip->i_d.di_dmstate = state; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - IHOLD(ip); error = xfs_trans_commit(tp, 0); return error; @@ -2390,7 +2252,7 @@ xfs_alloc_file_space( int committed; int error; - xfs_itrace_entry(ip); + trace_xfs_alloc_file_space(ip); if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); @@ -2412,25 +2274,9 @@ xfs_alloc_file_space( startoffset_fsb = XFS_B_TO_FSBT(mp, offset); allocatesize_fsb = XFS_B_TO_FSB(mp, count); - /* Generate a DMAPI event if needed. */ - if (alloc_type != 0 && offset < ip->i_size && - (attr_flags & XFS_ATTR_DMI) == 0 && - DM_EVENT_ENABLED(ip, DM_EVENT_WRITE)) { - xfs_off_t end_dmi_offset; - - end_dmi_offset = offset+len; - if (end_dmi_offset > ip->i_size) - end_dmi_offset = ip->i_size; - error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip, offset, - end_dmi_offset - offset, 0, NULL); - if (error) - return error; - } - /* * Allocate file space until done or until there is an error */ -retry: while (allocatesize_fsb && !error) { xfs_fileoff_t s, e; @@ -2488,8 +2334,7 @@ retry: if (error) goto error1; - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - xfs_trans_ihold(tp, ip); + xfs_trans_ijoin(tp, ip); /* * Issue the xfs_bmapi() call to allocate the blocks @@ -2498,7 +2343,7 @@ retry: error = xfs_bmapi(tp, ip, startoffset_fsb, allocatesize_fsb, bmapi_flag, &firstfsb, 0, imapp, &nimaps, - &free_list, NULL); + &free_list); if (error) { goto error0; } @@ -2527,17 +2372,6 @@ retry: startoffset_fsb += allocated_fsb; allocatesize_fsb -= allocated_fsb; } -dmapi_enospc_check: - if (error == ENOSPC && (attr_flags & XFS_ATTR_DMI) == 0 && - DM_EVENT_ENABLED(ip, DM_EVENT_NOSPACE)) { - error = XFS_SEND_NAMESP(mp, DM_EVENT_NOSPACE, - ip, DM_RIGHT_NULL, - ip, DM_RIGHT_NULL, - NULL, NULL, 0, 0, 0); /* Delay flag intentionally unused */ - if (error == 0) - goto retry; /* Maybe DMAPI app. 
has made space */ - /* else fall through with error from XFS_SEND_DATA */ - } return error; @@ -2548,7 +2382,7 @@ error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */ error1: /* Just cancel transaction */ xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); xfs_iunlock(ip, XFS_ILOCK_EXCL); - goto dmapi_enospc_check; + return error; } /* @@ -2598,7 +2432,7 @@ xfs_zero_remaining_bytes( offset_fsb = XFS_B_TO_FSBT(mp, offset); nimap = 1; error = xfs_bmapi(NULL, ip, offset_fsb, 1, 0, - NULL, 0, &imap, &nimap, NULL, NULL); + NULL, 0, &imap, &nimap, NULL); if (error || nimap < 1) break; ASSERT(imap.br_blockcount >= 1); @@ -2661,7 +2495,6 @@ xfs_free_file_space( { int committed; int done; - xfs_off_t end_dmi_offset; xfs_fileoff_t endoffset_fsb; int error; xfs_fsblock_t firstfsb; @@ -2680,7 +2513,7 @@ xfs_free_file_space( mp = ip->i_mount; - xfs_itrace_entry(ip); + trace_xfs_free_file_space(ip); error = xfs_qm_dqattach(ip, 0); if (error) @@ -2691,19 +2524,7 @@ xfs_free_file_space( return error; rt = XFS_IS_REALTIME_INODE(ip); startoffset_fsb = XFS_B_TO_FSB(mp, offset); - end_dmi_offset = offset + len; - endoffset_fsb = XFS_B_TO_FSBT(mp, end_dmi_offset); - - if (offset < ip->i_size && (attr_flags & XFS_ATTR_DMI) == 0 && - DM_EVENT_ENABLED(ip, DM_EVENT_WRITE)) { - if (end_dmi_offset > ip->i_size) - end_dmi_offset = ip->i_size; - error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip, - offset, end_dmi_offset - offset, - AT_DELAY_FLAG(attr_flags), NULL); - if (error) - return error; - } + endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len); if (attr_flags & XFS_ATTR_NOLOCK) need_iolock = 0; @@ -2731,7 +2552,7 @@ xfs_free_file_space( if (rt && !xfs_sb_version_hasextflgbit(&mp->m_sb)) { nimap = 1; error = xfs_bmapi(NULL, ip, startoffset_fsb, - 1, 0, NULL, 0, &imap, &nimap, NULL, NULL); + 1, 0, NULL, 0, &imap, &nimap, NULL); if (error) goto out_unlock_iolock; ASSERT(nimap == 0 || nimap == 1); @@ -2746,7 +2567,7 @@ xfs_free_file_space( } nimap = 1; error = xfs_bmapi(NULL, ip, endoffset_fsb - 1, - 1, 0, NULL, 0, &imap, &nimap, NULL, NULL); + 1, 0, NULL, 0, &imap, &nimap, NULL); if (error) goto out_unlock_iolock; ASSERT(nimap == 0 || nimap == 1); @@ -2814,8 +2635,7 @@ xfs_free_file_space( if (error) goto error1; - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - xfs_trans_ihold(tp, ip); + xfs_trans_ijoin(tp, ip); /* * issue the bunmapi() call to free the blocks @@ -2823,7 +2643,7 @@ xfs_free_file_space( xfs_bmap_init(&free_list, &firstfsb); error = xfs_bunmapi(tp, ip, startoffset_fsb, endoffset_fsb - startoffset_fsb, - 0, 2, &firstfsb, &free_list, NULL, &done); + 0, 2, &firstfsb, &free_list, &done); if (error) { goto error0; } @@ -2883,8 +2703,6 @@ xfs_change_file_space( xfs_trans_t *tp; struct iattr iattr; - xfs_itrace_entry(ip); - if (!S_ISREG(ip->i_d.di_mode)) return XFS_ERROR(EINVAL); @@ -2985,8 +2803,7 @@ xfs_change_file_space( xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - xfs_trans_ihold(tp, ip); + xfs_trans_ijoin(tp, ip); if ((attr_flags & XFS_ATTR_DMI) == 0) { ip->i_d.di_mode &= ~S_ISUID; |

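With the DMAPI plumbing removed, the block-mapping calls in these hunks also lose their trailing extent-delta argument; a minimal sketch of the new read-only lookup form, with the wrapper name example_map_one_block() and its parameter names assumed for illustration:

	/*
	 * Illustrative only: map a single block read-only with the
	 * post-DMAPI xfs_bmapi() signature (no trailing delta argument).
	 */
	STATIC int
	example_map_one_block(
		struct xfs_inode	*ip,
		xfs_fileoff_t		offset_fsb,
		struct xfs_bmbt_irec	*imap)
	{
		int			nimaps = 1;

		/* no transaction, no firstblock, no free list for a lookup */
		return xfs_bmapi(NULL, ip, offset_fsb, 1, 0,
				 NULL, 0, imap, &nimaps, NULL);
	}
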