Diffstat (limited to 'fs')
 fs/9p/Makefile | 4
 fs/9p/fid.c | 111
 fs/9p/v9fs.c | 3
 fs/9p/v9fs.h | 1
 fs/9p/v9fs_vfs.h | 1
 fs/9p/vfs_dir.c | 136
 fs/9p/vfs_file.c | 32
 fs/9p/vfs_inode.c | 757
 fs/9p/vfs_super.c | 50
 fs/9p/xattr.c | 160
 fs/9p/xattr.h | 27
 fs/9p/xattr_user.c | 80
 fs/adfs/dir.c | 2
 fs/adfs/file.c | 2
 fs/adfs/inode.c | 3
 fs/affs/affs.h | 2
 fs/affs/file.c | 4
 fs/affs/namei.c | 2
 fs/afs/internal.h | 2
 fs/afs/server.c | 5
 fs/afs/write.c | 4
 fs/aio.c | 71
 fs/anon_inodes.c | 2
 fs/attr.c | 50
 fs/autofs/root.c | 1
 fs/autofs4/dev-ioctl.c | 13
 fs/bad_inode.c | 3
 fs/bfs/dir.c | 2
 fs/binfmt_elf_fdpic.c | 26
 fs/binfmt_flat.c | 27
 fs/block_dev.c | 92
 fs/btrfs/acl.c | 8
 fs/btrfs/async-thread.c | 1
 fs/btrfs/btrfs_inode.h | 3
 fs/btrfs/ctree.c | 238
 fs/btrfs/ctree.h | 165
 fs/btrfs/delayed-ref.c | 101
 fs/btrfs/delayed-ref.h | 3
 fs/btrfs/disk-io.c | 180
 fs/btrfs/disk-io.h | 4
 fs/btrfs/extent-tree.c | 2256
 fs/btrfs/extent_io.c | 87
 fs/btrfs/extent_io.h | 14
 fs/btrfs/file-item.c | 28
 fs/btrfs/file.c | 181
 fs/btrfs/inode-item.c | 27
 fs/btrfs/inode.c | 1713
 fs/btrfs/ioctl.c | 222
 fs/btrfs/ordered-data.c | 82
 fs/btrfs/ordered-data.h | 9
 fs/btrfs/relocation.c | 1974
 fs/btrfs/root-tree.c | 26
 fs/btrfs/super.c | 36
 fs/btrfs/transaction.c | 232
 fs/btrfs/transaction.h | 24
 fs/btrfs/tree-defrag.c | 7
 fs/btrfs/tree-log.c | 241
 fs/btrfs/tree-log.h | 2
 fs/btrfs/volumes.c | 17
 fs/btrfs/xattr.c | 12
 fs/buffer.c | 123
 fs/ceph/Kconfig | 2
 fs/ceph/auth.c | 7
 fs/ceph/auth.h | 6
 fs/ceph/auth_none.c | 8
 fs/ceph/auth_x.c | 17
 fs/ceph/caps.c | 133
 fs/ceph/ceph_fs.h | 21
 fs/ceph/crush/mapper.c | 41
 fs/ceph/debugfs.c | 2
 fs/ceph/dir.c | 20
 fs/ceph/export.c | 2
 fs/ceph/file.c | 4
 fs/ceph/inode.c | 29
 fs/ceph/mds_client.c | 100
 fs/ceph/mds_client.h | 7
 fs/ceph/messenger.c | 81
 fs/ceph/messenger.h | 1
 fs/ceph/mon_client.c | 16
 fs/ceph/osd_client.c | 16
 fs/ceph/osdmap.c | 30
 fs/ceph/super.c | 16
 fs/ceph/super.h | 3
 fs/cifs/Kconfig | 9
 fs/cifs/Makefile | 2
 fs/cifs/cache.c | 331
 fs/cifs/cifs_dfs_ref.c | 28
 fs/cifs/cifs_fs_sb.h | 1
 fs/cifs/cifs_spnego.c | 3
 fs/cifs/cifsfs.c | 37
 fs/cifs/cifsfs.h | 4
 fs/cifs/cifsglob.h | 40
 fs/cifs/cifsproto.h | 5
 fs/cifs/connect.c | 180
 fs/cifs/dir.c | 82
 fs/cifs/dns_resolve.c | 71
 fs/cifs/dns_resolve.h | 4
 fs/cifs/file.c | 177
 fs/cifs/fscache.c | 236
 fs/cifs/fscache.h | 136
 fs/cifs/inode.c | 24
 fs/cifs/ioctl.c | 3
 fs/cifs/netmisc.c | 24
 fs/cifs/readdir.c | 5
 fs/cifs/sess.c | 10
 fs/cifs/smberr.h | 1
 fs/coda/coda_int.h | 3
 fs/coda/file.c | 4
 fs/compat.c | 132
 fs/compat_ioctl.c | 9
 fs/configfs/inode.c | 14
 fs/dcache.c | 4
 fs/debugfs/file.c | 21
 fs/direct-io.c | 149
 fs/ecryptfs/file.c | 2
 fs/ecryptfs/inode.c | 4
 fs/ecryptfs/messaging.c | 17
 fs/exec.c | 195
 fs/exofs/file.c | 7
 fs/ext2/acl.c | 1
 fs/ext2/ext2.h | 3
 fs/ext2/file.c | 7
 fs/ext2/inode.c | 153
 fs/ext2/super.c | 20
 fs/ext3/acl.c | 1
 fs/ext3/dir.c | 2
 fs/ext3/fsync.c | 4
 fs/ext3/super.c | 38
 fs/ext4/balloc.c | 5
 fs/ext4/block_validity.c | 4
 fs/ext4/dir.c | 26
 fs/ext4/ext4.h | 169
 fs/ext4/ext4_jbd2.h | 8
 fs/ext4/extents.c | 417
 fs/ext4/file.c | 2
 fs/ext4/fsync.c | 41
 fs/ext4/ialloc.c | 89
 fs/ext4/inode.c | 773
 fs/ext4/ioctl.c | 27
 fs/ext4/mballoc.c | 120
 fs/ext4/migrate.c | 2
 fs/ext4/move_extent.c | 16
 fs/ext4/namei.c | 61
 fs/ext4/resize.c | 3
 fs/ext4/super.c | 117
 fs/ext4/symlink.c | 2
 fs/ext4/xattr.c | 39
 fs/fat/fat.h | 6
 fs/fat/file.c | 40
 fs/fat/inode.c | 35
 fs/fcntl.c | 13
 fs/file_table.c | 21
 fs/freevxfs/vxfs_lookup.c | 2
 fs/fs-writeback.c | 509
 fs/fscache/object-list.c | 2
 fs/fscache/page.c | 36
 fs/fuse/dev.c | 527
 fs/fuse/dir.c | 7
 fs/fuse/file.c | 48
 fs/fuse/fuse_i.h | 6
 fs/gfs2/aops.c | 17
 fs/gfs2/bmap.c | 18
 fs/gfs2/bmap.h | 2
 fs/gfs2/dir.c | 44
 fs/gfs2/file.c | 8
 fs/gfs2/glock.c | 97
 fs/gfs2/incore.h | 1
 fs/gfs2/inode.c | 12
 fs/gfs2/ops_fstype.c | 27
 fs/gfs2/ops_inode.c | 5
 fs/gfs2/quota.c | 25
 fs/gfs2/quota.h | 2
 fs/gfs2/super.c | 9
 fs/gfs2/sys.c | 57
 fs/hostfs/hostfs_kern.c | 4
 fs/hpfs/file.c | 4
 fs/hpfs/hpfs_fn.h | 2
 fs/hppfs/hppfs.c | 2
 fs/hugetlbfs/inode.c | 2
 fs/inode.c | 2
 fs/isofs/dir.c | 1
 fs/jbd2/journal.c | 15
 fs/jbd2/transaction.c | 14
 fs/jffs2/acl.c | 3
 fs/jffs2/dir.c | 127
 fs/jffs2/file.c | 4
 fs/jffs2/fs.c | 11
 fs/jffs2/os-linux.h | 2
 fs/jffs2/xattr.c | 2
 fs/jfs/file.c | 4
 fs/jfs/jfs_inode.h | 2
 fs/jfs/super.c | 16
 fs/libfs.c | 111
 fs/logfs/file.c | 4
 fs/logfs/logfs.h | 2
 fs/mbcache.c | 5
 fs/minix/dir.c | 11
 fs/minix/file.c | 2
 fs/minix/itree_v2.c | 27
 fs/namei.c | 8
 fs/ncpfs/dir.c | 1
 fs/ncpfs/file.c | 2
 fs/nfs/client.c | 122
 fs/nfs/dir.c | 11
 fs/nfs/file.c | 18
 fs/nfs/getroot.c | 2
 fs/nfs/internal.h | 3
 fs/nfs/nfs4xdr.c | 4
 fs/nfs/nfsroot.c | 2
 fs/nfs/super.c | 22
 fs/nfs/write.c | 50
 fs/nfsd/nfs4state.c | 2
 fs/nfsd/vfs.c | 3
 fs/nilfs2/btree.h | 2
 fs/nilfs2/file.c | 4
 fs/nilfs2/nilfs.h | 2
 fs/nilfs2/segbuf.h | 2
 fs/nilfs2/segment.h | 2
 fs/nilfs2/super.c | 8
 fs/ntfs/dir.c | 5
 fs/ntfs/file.c | 9
 fs/ocfs2/aops.c | 101
 fs/ocfs2/dlm/dlmdomain.c | 3
 fs/ocfs2/dlm/dlmmaster.c | 22
 fs/ocfs2/dlm/dlmrecovery.c | 2
 fs/ocfs2/file.c | 324
 fs/ocfs2/file.h | 6
 fs/ocfs2/journal.c | 30
 fs/ocfs2/localalloc.c | 7
 fs/ocfs2/quota_global.c | 2
 fs/ocfs2/quota_local.c | 4
 fs/ocfs2/refcounttree.c | 12
 fs/ocfs2/reservations.c | 1
 fs/ocfs2/suballoc.c | 2
 fs/ocfs2/super.c | 50
 fs/ocfs2/xattr.c | 200
 fs/omfs/file.c | 2
 fs/open.c | 11
 fs/partitions/ibm.c | 14
 fs/pipe.c | 102
 fs/proc/array.c | 6
 fs/proc/base.c | 16
 fs/proc/generic.c | 15
 fs/proc/kcore.c | 2
 fs/proc/proc_devtree.c | 3
 fs/proc/root.c | 1
 fs/proc/task_nommu.c | 20
 fs/qnx4/dir.c | 3
 fs/quota/dquot.c | 203
 fs/quota/quota.c | 4
 fs/ramfs/file-mmu.c | 3
 fs/ramfs/file-nommu.c | 9
 fs/read_write.c | 17
 fs/reiserfs/dir.c | 9
 fs/reiserfs/file.c | 5
 fs/reiserfs/super.c | 48
 fs/smbfs/dir.c | 1
 fs/smbfs/file.c | 3
 fs/smbfs/inode.c | 2
 fs/splice.c | 11
 fs/squashfs/Kconfig | 11
 fs/squashfs/Makefile | 2
 fs/squashfs/inode.c | 92
 fs/squashfs/namei.c | 6
 fs/squashfs/squashfs.h | 12
 fs/squashfs/squashfs_fs.h | 76
 fs/squashfs/squashfs_fs_i.h | 3
 fs/squashfs/squashfs_fs_sb.h | 3
 fs/squashfs/super.c | 30
 fs/squashfs/symlink.c | 11
 fs/squashfs/xattr.c | 323
 fs/squashfs/xattr.h | 46
 fs/squashfs/xattr_id.c | 100
 fs/super.c | 23
 fs/sync.c | 10
 fs/sysfs/inode.c | 8
 fs/sysfs/symlink.c | 26
 fs/sysv/dir.c | 2
 fs/sysv/file.c | 2
 fs/sysv/ialloc.c | 6
 fs/sysv/inode.c | 1
 fs/ubifs/budget.c | 2
 fs/ubifs/file.c | 17
 fs/ubifs/lpt.c | 14
 fs/ubifs/lpt_commit.c | 2
 fs/ubifs/recovery.c | 23
 fs/ubifs/shrinker.c | 2
 fs/ubifs/super.c | 4
 fs/ubifs/ubifs.h | 6
 fs/udf/balloc.c | 43
 fs/udf/dir.c | 3
 fs/udf/file.c | 28
 fs/udf/ialloc.c | 21
 fs/udf/inode.c | 5
 fs/udf/namei.c | 20
 fs/udf/super.c | 13
 fs/udf/udfdecl.h | 1
 fs/ufs/balloc.c | 24
 fs/ufs/dir.c | 2
 fs/ufs/file.c | 5
 fs/ufs/ialloc.c | 13
 fs/ufs/inode.c | 4
 fs/ufs/namei.c | 16
 fs/ufs/super.c | 112
 fs/ufs/truncate.c | 20
 fs/ufs/ufs_fs.h | 1
 fs/xfs/Makefile | 4
 fs/xfs/linux-2.6/xfs_acl.c | 2
 fs/xfs/linux-2.6/xfs_aops.c | 604
 fs/xfs/linux-2.6/xfs_aops.h | 4
 fs/xfs/linux-2.6/xfs_buf.c | 67
 fs/xfs/linux-2.6/xfs_buf.h | 119
 fs/xfs/linux-2.6/xfs_dmapi_priv.h | 28
 fs/xfs/linux-2.6/xfs_export.c | 15
 fs/xfs/linux-2.6/xfs_file.c | 114
 fs/xfs/linux-2.6/xfs_fs_subr.c | 4
 fs/xfs/linux-2.6/xfs_fs_subr.h | 25
 fs/xfs/linux-2.6/xfs_ioctl.c | 34
 fs/xfs/linux-2.6/xfs_ioctl32.c | 21
 fs/xfs/linux-2.6/xfs_iops.c | 26
 fs/xfs/linux-2.6/xfs_linux.h | 1
 fs/xfs/linux-2.6/xfs_quotaops.c | 2
 fs/xfs/linux-2.6/xfs_super.c | 173
 fs/xfs/linux-2.6/xfs_super.h | 7
 fs/xfs/linux-2.6/xfs_sync.c | 188
 fs/xfs/linux-2.6/xfs_sync.h | 5
 fs/xfs/linux-2.6/xfs_trace.c | 5
 fs/xfs/linux-2.6/xfs_trace.h | 473
 fs/xfs/quota/xfs_dquot.c | 114
 fs/xfs/quota/xfs_dquot_item.c | 301
 fs/xfs/quota/xfs_qm.c | 44
 fs/xfs/quota/xfs_qm_bhv.c | 10
 fs/xfs/quota/xfs_qm_stats.c | 10
 fs/xfs/quota/xfs_qm_syscalls.c | 112
 fs/xfs/quota/xfs_trans_dquot.c | 35
 fs/xfs/support/debug.c | 1
 fs/xfs/xfs_ag.h | 1
 fs/xfs/xfs_alloc.c | 15
 fs/xfs/xfs_alloc.h | 20
 fs/xfs/xfs_alloc_btree.c | 5
 fs/xfs/xfs_attr.c | 91
 fs/xfs/xfs_attr_leaf.c | 5
 fs/xfs/xfs_bmap.c | 327
 fs/xfs/xfs_bmap.h | 37
 fs/xfs/xfs_bmap_btree.c | 5
 fs/xfs/xfs_btree.c | 5
 fs/xfs/xfs_buf_item.c | 228
 fs/xfs/xfs_buf_item.h | 2
 fs/xfs/xfs_da_btree.c | 20
 fs/xfs/xfs_dfrag.c | 21
 fs/xfs/xfs_dir2.c | 11
 fs/xfs/xfs_dir2_block.c | 8
 fs/xfs/xfs_dir2_data.c | 2
 fs/xfs/xfs_dir2_leaf.c | 4
 fs/xfs/xfs_dir2_node.c | 2
 fs/xfs/xfs_dir2_sf.c | 2
 fs/xfs/xfs_dmapi.h | 170
 fs/xfs/xfs_dmops.c | 55
 fs/xfs/xfs_error.c | 4
 fs/xfs/xfs_extfree_item.c | 278
 fs/xfs/xfs_filestream.c | 84
 fs/xfs/xfs_filestream.h | 82
 fs/xfs/xfs_fsops.c | 7
 fs/xfs/xfs_ialloc.c | 146
 fs/xfs/xfs_ialloc_btree.c | 4
 fs/xfs/xfs_iget.c | 147
 fs/xfs/xfs_inode.c | 207
 fs/xfs/xfs_inode.h | 10
 fs/xfs/xfs_inode_item.c | 273
 fs/xfs/xfs_inode_item.h | 12
 fs/xfs/xfs_iomap.c | 76
 fs/xfs/xfs_iomap.h | 22
 fs/xfs/xfs_itable.c | 293
 fs/xfs/xfs_itable.h | 17
 fs/xfs/xfs_log.c | 16
 fs/xfs/xfs_log.h | 11
 fs/xfs/xfs_log_cil.c | 4
 fs/xfs/xfs_log_recover.c | 55
 fs/xfs/xfs_mount.c | 73
 fs/xfs/xfs_mount.h | 71
 fs/xfs/xfs_rename.c | 63
 fs/xfs/xfs_rtalloc.c | 17
 fs/xfs/xfs_rtalloc.h | 11
 fs/xfs/xfs_rw.c | 15
 fs/xfs/xfs_trans.c | 657
 fs/xfs/xfs_trans.h | 528
 fs/xfs/xfs_trans_ail.c | 1
 fs/xfs/xfs_trans_buf.c | 75
 fs/xfs/xfs_trans_extfree.c | 23
 fs/xfs/xfs_trans_inode.c | 76
 fs/xfs/xfs_trans_item.c | 441
 fs/xfs/xfs_trans_priv.h | 18
 fs/xfs/xfs_utils.c | 87
 fs/xfs/xfs_utils.h | 1
 fs/xfs/xfs_vnodeops.c | 297
395 files changed, 15397 insertions, 11475 deletions
diff --git a/fs/9p/Makefile b/fs/9p/Makefile
index 1a940ec7af61..91fba025fcbe 100644
--- a/fs/9p/Makefile
+++ b/fs/9p/Makefile
@@ -8,6 +8,8 @@ obj-$(CONFIG_9P_FS) := 9p.o
vfs_dir.o \
vfs_dentry.o \
v9fs.o \
- fid.o
+ fid.o \
+ xattr.o \
+ xattr_user.o
9p-$(CONFIG_9P_FSCACHE) += cache.o
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index 7317b39b2815..358563689064 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -97,6 +97,34 @@ static struct p9_fid *v9fs_fid_find(struct dentry *dentry, u32 uid, int any)
return ret;
}
+/*
+ * We need to hold v9ses->rename_sem as long as we hold references
+ * to the returned path array. Array elements contain pointers to
+ * the dentry names.
+ */
+static int build_path_from_dentry(struct v9fs_session_info *v9ses,
+ struct dentry *dentry, char ***names)
+{
+ int n = 0, i;
+ char **wnames;
+ struct dentry *ds;
+
+ for (ds = dentry; !IS_ROOT(ds); ds = ds->d_parent)
+ n++;
+
+ wnames = kmalloc(sizeof(char *) * n, GFP_KERNEL);
+ if (!wnames)
+ goto err_out;
+
+ for (ds = dentry, i = (n-1); i >= 0; i--, ds = ds->d_parent)
+ wnames[i] = (char *)ds->d_name.name;
+
+ *names = wnames;
+ return n;
+err_out:
+ return -ENOMEM;
+}
+
/**
* v9fs_fid_lookup - lookup for a fid, try to walk if not found
* @dentry: dentry to look for fid in
@@ -112,7 +140,7 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
int i, n, l, clone, any, access;
u32 uid;
struct p9_fid *fid, *old_fid = NULL;
- struct dentry *d, *ds;
+ struct dentry *ds;
struct v9fs_session_info *v9ses;
char **wnames, *uname;
@@ -139,49 +167,62 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
fid = v9fs_fid_find(dentry, uid, any);
if (fid)
return fid;
-
+ /*
+ * we don't have a matching fid. To do a TWALK we need
+ * parent fid. We need to prevent rename when we want to
+ * look at the parent.
+ */
+ down_read(&v9ses->rename_sem);
ds = dentry->d_parent;
fid = v9fs_fid_find(ds, uid, any);
- if (!fid) { /* walk from the root */
- n = 0;
- for (ds = dentry; !IS_ROOT(ds); ds = ds->d_parent)
- n++;
+ if (fid) {
+ /* Found the parent fid do a lookup with that */
+ fid = p9_client_walk(fid, 1, (char **)&dentry->d_name.name, 1);
+ goto fid_out;
+ }
+ up_read(&v9ses->rename_sem);
- fid = v9fs_fid_find(ds, uid, any);
- if (!fid) { /* the user is not attached to the fs yet */
- if (access == V9FS_ACCESS_SINGLE)
- return ERR_PTR(-EPERM);
+ /* start from the root and try to do a lookup */
+ fid = v9fs_fid_find(dentry->d_sb->s_root, uid, any);
+ if (!fid) {
+ /* the user is not attached to the fs yet */
+ if (access == V9FS_ACCESS_SINGLE)
+ return ERR_PTR(-EPERM);
- if (v9fs_proto_dotu(v9ses))
+ if (v9fs_proto_dotu(v9ses) || v9fs_proto_dotl(v9ses))
uname = NULL;
- else
- uname = v9ses->uname;
+ else
+ uname = v9ses->uname;
- fid = p9_client_attach(v9ses->clnt, NULL, uname, uid,
- v9ses->aname);
-
- if (IS_ERR(fid))
- return fid;
-
- v9fs_fid_add(ds, fid);
- }
- } else /* walk from the parent */
- n = 1;
+ fid = p9_client_attach(v9ses->clnt, NULL, uname, uid,
+ v9ses->aname);
+ if (IS_ERR(fid))
+ return fid;
- if (ds == dentry)
+ v9fs_fid_add(dentry->d_sb->s_root, fid);
+ }
+ /* If we are root ourself just return that */
+ if (dentry->d_sb->s_root == dentry)
return fid;
-
- wnames = kmalloc(sizeof(char *) * n, GFP_KERNEL);
- if (!wnames)
- return ERR_PTR(-ENOMEM);
-
- for (d = dentry, i = (n-1); i >= 0; i--, d = d->d_parent)
- wnames[i] = (char *) d->d_name.name;
-
+ /*
+ * Do a multipath walk with attached root.
+ * When walking parent we need to make sure we
+ * don't have a parallel rename happening
+ */
+ down_read(&v9ses->rename_sem);
+ n = build_path_from_dentry(v9ses, dentry, &wnames);
+ if (n < 0) {
+ fid = ERR_PTR(n);
+ goto err_out;
+ }
clone = 1;
i = 0;
while (i < n) {
l = min(n - i, P9_MAXWELEM);
+ /*
+ * We need to hold the rename lock when doing a multipath
+ * walk to ensure none of the path components change
+ */
fid = p9_client_walk(fid, l, &wnames[i], clone);
if (IS_ERR(fid)) {
if (old_fid) {
@@ -193,15 +234,17 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
p9_client_clunk(old_fid);
}
kfree(wnames);
- return fid;
+ goto err_out;
}
old_fid = fid;
i += l;
clone = 0;
}
-
kfree(wnames);
+fid_out:
v9fs_fid_add(dentry, fid);
+err_out:
+ up_read(&v9ses->rename_sem);
return fid;
}
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index f8b86e92cd66..38dc0e067599 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -237,6 +237,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
__putname(v9ses->uname);
return ERR_PTR(-ENOMEM);
}
+ init_rwsem(&v9ses->rename_sem);
rc = bdi_setup_and_register(&v9ses->bdi, "9p", BDI_CAP_MAP_COPY);
if (rc) {
@@ -278,7 +279,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
v9ses->maxdata = v9ses->clnt->msize - P9_IOHDRSZ;
/* for legacy mode, fall back to V9FS_ACCESS_ANY */
- if (!v9fs_proto_dotu(v9ses) &&
+ if (!(v9fs_proto_dotu(v9ses) || v9fs_proto_dotl(v9ses)) &&
((v9ses->flags&V9FS_ACCESS_MASK) == V9FS_ACCESS_USER)) {
v9ses->flags &= ~V9FS_ACCESS_MASK;
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index bec4d0bcb458..4c963c9fc41f 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -104,6 +104,7 @@ struct v9fs_session_info {
struct p9_client *clnt; /* 9p client */
struct list_head slist; /* list of sessions registered with v9fs */
struct backing_dev_info bdi;
+ struct rw_semaphore rename_sem;
};
struct p9_fid *v9fs_session_init(struct v9fs_session_info *, const char *,
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index 32ef4009d030..f47c6bbb01b3 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -55,6 +55,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode);
void v9fs_clear_inode(struct inode *inode);
ino_t v9fs_qid2ino(struct p9_qid *qid);
void v9fs_stat2inode(struct p9_wstat *, struct inode *, struct super_block *);
+void v9fs_stat2inode_dotl(struct p9_stat_dotl *, struct inode *);
int v9fs_dir_release(struct inode *inode, struct file *filp);
int v9fs_file_open(struct inode *inode, struct file *file);
void v9fs_inode2stat(struct inode *inode, struct p9_wstat *stat);
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index d61e3b28ce37..16c8a2a98c1b 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -87,29 +87,19 @@ static void p9stat_init(struct p9_wstat *stbuf)
}
/**
- * v9fs_dir_readdir - read a directory
+ * v9fs_alloc_rdir_buf - Allocate buffer used for read and readdir
* @filp: opened file structure
- * @dirent: directory structure ???
- * @filldir: function to populate directory structure ???
+ * @buflen: Length in bytes of buffer to allocate
*
*/
-static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int v9fs_alloc_rdir_buf(struct file *filp, int buflen)
{
- int over;
- struct p9_wstat st;
- int err = 0;
- struct p9_fid *fid;
- int buflen;
- int reclen = 0;
struct p9_rdir *rdir;
+ struct p9_fid *fid;
+ int err = 0;
- P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name);
fid = filp->private_data;
-
- buflen = fid->clnt->msize - P9_IOHDRSZ;
-
- /* allocate rdir on demand */
if (!fid->rdir) {
rdir = kmalloc(sizeof(struct p9_rdir) + buflen, GFP_KERNEL);
@@ -128,6 +118,36 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
spin_unlock(&filp->f_dentry->d_lock);
kfree(rdir);
}
+exit:
+ return err;
+}
+
+/**
+ * v9fs_dir_readdir - read a directory
+ * @filp: opened file structure
+ * @dirent: directory structure ???
+ * @filldir: function to populate directory structure ???
+ *
+ */
+
+static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+ int over;
+ struct p9_wstat st;
+ int err = 0;
+ struct p9_fid *fid;
+ int buflen;
+ int reclen = 0;
+ struct p9_rdir *rdir;
+
+ P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name);
+ fid = filp->private_data;
+
+ buflen = fid->clnt->msize - P9_IOHDRSZ;
+
+ err = v9fs_alloc_rdir_buf(filp, buflen);
+ if (err)
+ goto exit;
rdir = (struct p9_rdir *) fid->rdir;
err = mutex_lock_interruptible(&rdir->mutex);
@@ -146,7 +166,7 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
while (rdir->head < rdir->tail) {
p9stat_init(&st);
err = p9stat_read(rdir->buf + rdir->head,
- buflen - rdir->head, &st,
+ rdir->tail - rdir->head, &st,
fid->clnt->proto_version);
if (err) {
P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err);
@@ -176,6 +196,88 @@ exit:
return err;
}
+/**
+ * v9fs_dir_readdir_dotl - read a directory
+ * @filp: opened file structure
+ * @dirent: buffer to fill dirent structures
+ * @filldir: function to populate dirent structures
+ *
+ */
+static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent,
+ filldir_t filldir)
+{
+ int over;
+ int err = 0;
+ struct p9_fid *fid;
+ int buflen;
+ struct p9_rdir *rdir;
+ struct p9_dirent curdirent;
+ u64 oldoffset = 0;
+
+ P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name);
+ fid = filp->private_data;
+
+ buflen = fid->clnt->msize - P9_READDIRHDRSZ;
+
+ err = v9fs_alloc_rdir_buf(filp, buflen);
+ if (err)
+ goto exit;
+ rdir = (struct p9_rdir *) fid->rdir;
+
+ err = mutex_lock_interruptible(&rdir->mutex);
+ if (err)
+ return err;
+
+ while (err == 0) {
+ if (rdir->tail == rdir->head) {
+ err = p9_client_readdir(fid, rdir->buf, buflen,
+ filp->f_pos);
+ if (err <= 0)
+ goto unlock_and_exit;
+
+ rdir->head = 0;
+ rdir->tail = err;
+ }
+
+ while (rdir->head < rdir->tail) {
+
+ err = p9dirent_read(rdir->buf + rdir->head,
+ buflen - rdir->head, &curdirent,
+ fid->clnt->proto_version);
+ if (err < 0) {
+ P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err);
+ err = -EIO;
+ goto unlock_and_exit;
+ }
+
+ /* d_off in dirent structure tracks the offset into
+ * the next dirent in the dir. However, filldir()
+ * expects offset into the current dirent. Hence
+ * while calling filldir send the offset from the
+ * previous dirent structure.
+ */
+ over = filldir(dirent, curdirent.d_name,
+ strlen(curdirent.d_name),
+ oldoffset, v9fs_qid2ino(&curdirent.qid),
+ curdirent.d_type);
+ oldoffset = curdirent.d_off;
+
+ if (over) {
+ err = 0;
+ goto unlock_and_exit;
+ }
+
+ filp->f_pos = curdirent.d_off;
+ rdir->head += err;
+ }
+ }
+
+unlock_and_exit:
+ mutex_unlock(&rdir->mutex);
+exit:
+ return err;
+}
+
/**
* v9fs_dir_release - close a directory
@@ -207,7 +309,7 @@ const struct file_operations v9fs_dir_operations = {
const struct file_operations v9fs_dir_operations_dotl = {
.read = generic_read_dir,
.llseek = generic_file_llseek,
- .readdir = v9fs_dir_readdir,
+ .readdir = v9fs_dir_readdir_dotl,
.open = v9fs_file_open,
.release = v9fs_dir_release,
};
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 25b300e1c9d7..e97c92bd6f16 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -59,9 +59,13 @@ int v9fs_file_open(struct inode *inode, struct file *file)
struct p9_fid *fid;
int omode;
- P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p \n", inode, file);
+ P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, file);
v9ses = v9fs_inode2v9ses(inode);
- omode = v9fs_uflags2omode(file->f_flags, v9fs_proto_dotu(v9ses));
+ if (v9fs_proto_dotl(v9ses))
+ omode = file->f_flags;
+ else
+ omode = v9fs_uflags2omode(file->f_flags,
+ v9fs_proto_dotu(v9ses));
fid = file->private_data;
if (!fid) {
fid = v9fs_fid_clone(file->f_path.dentry);
@@ -73,11 +77,12 @@ int v9fs_file_open(struct inode *inode, struct file *file)
p9_client_clunk(fid);
return err;
}
- if (omode & P9_OTRUNC) {
+ if (file->f_flags & O_TRUNC) {
i_size_write(inode, 0);
inode->i_blocks = 0;
}
- if ((file->f_flags & O_APPEND) && (!v9fs_proto_dotu(v9ses)))
+ if ((file->f_flags & O_APPEND) &&
+ (!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses)))
generic_file_llseek(file, 0, SEEK_END);
}
@@ -139,7 +144,7 @@ ssize_t
v9fs_file_readn(struct file *filp, char *data, char __user *udata, u32 count,
u64 offset)
{
- int n, total;
+ int n, total, size;
struct p9_fid *fid = filp->private_data;
P9_DPRINTK(P9_DEBUG_VFS, "fid %d offset %llu count %d\n", fid->fid,
@@ -147,6 +152,7 @@ v9fs_file_readn(struct file *filp, char *data, char __user *udata, u32 count,
n = 0;
total = 0;
+ size = fid->iounit ? fid->iounit : fid->clnt->msize - P9_IOHDRSZ;
do {
n = p9_client_read(fid, data, udata, offset, count);
if (n <= 0)
@@ -160,7 +166,7 @@ v9fs_file_readn(struct file *filp, char *data, char __user *udata, u32 count,
offset += n;
count -= n;
total += n;
- } while (count > 0 && n == (fid->clnt->msize - P9_IOHDRSZ));
+ } while (count > 0 && n == size);
if (n < 0)
total = n;
@@ -183,11 +189,13 @@ v9fs_file_read(struct file *filp, char __user *udata, size_t count,
{
int ret;
struct p9_fid *fid;
+ size_t size;
P9_DPRINTK(P9_DEBUG_VFS, "count %zu offset %lld\n", count, *offset);
fid = filp->private_data;
- if (count > (fid->clnt->msize - P9_IOHDRSZ))
+ size = fid->iounit ? fid->iounit : fid->clnt->msize - P9_IOHDRSZ;
+ if (count > size)
ret = v9fs_file_readn(filp, NULL, udata, count, *offset);
else
ret = p9_client_read(fid, NULL, udata, *offset, count);
@@ -224,9 +232,7 @@ v9fs_file_write(struct file *filp, const char __user * data,
fid = filp->private_data;
clnt = fid->clnt;
- rsize = fid->iounit;
- if (!rsize || rsize > clnt->msize-P9_IOHDRSZ)
- rsize = clnt->msize - P9_IOHDRSZ;
+ rsize = fid->iounit ? fid->iounit : clnt->msize - P9_IOHDRSZ;
do {
if (count < rsize)
@@ -257,15 +263,13 @@ v9fs_file_write(struct file *filp, const char __user * data,
return total;
}
-static int v9fs_file_fsync(struct file *filp, struct dentry *dentry,
- int datasync)
+static int v9fs_file_fsync(struct file *filp, int datasync)
{
struct p9_fid *fid;
struct p9_wstat wstat;
int retval;
- P9_DPRINTK(P9_DEBUG_VFS, "filp %p dentry %p datasync %x\n", filp,
- dentry, datasync);
+ P9_DPRINTK(P9_DEBUG_VFS, "filp %p datasync %x\n", filp, datasync);
fid = filp->private_data;
v9fs_blank_wstat(&wstat);
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 4331b3b5ee1c..6e94f3247cec 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -35,6 +35,7 @@
#include <linux/idr.h>
#include <linux/sched.h>
#include <linux/slab.h>
+#include <linux/xattr.h>
#include <net/9p/9p.h>
#include <net/9p/client.h>
@@ -42,6 +43,7 @@
#include "v9fs_vfs.h"
#include "fid.h"
#include "cache.h"
+#include "xattr.h"
static const struct inode_operations v9fs_dir_inode_operations;
static const struct inode_operations v9fs_dir_inode_operations_dotu;
@@ -236,6 +238,41 @@ void v9fs_destroy_inode(struct inode *inode)
#endif
/**
+ * v9fs_get_fsgid_for_create - Helper function to get the gid for creating a
+ * new file system object. This checks the S_ISGID to determine the owning
+ * group of the new file system object.
+ */
+
+static gid_t v9fs_get_fsgid_for_create(struct inode *dir_inode)
+{
+ BUG_ON(dir_inode == NULL);
+
+ if (dir_inode->i_mode & S_ISGID) {
+ /* set_gid bit is set.*/
+ return dir_inode->i_gid;
+ }
+ return current_fsgid();
+}
+
+/**
+ * v9fs_dentry_from_dir_inode - helper function to get the dentry from
+ * dir inode.
+ *
+ */
+
+static struct dentry *v9fs_dentry_from_dir_inode(struct inode *inode)
+{
+ struct dentry *dentry;
+
+ spin_lock(&dcache_lock);
+ /* Directory should have only one entry. */
+ BUG_ON(S_ISDIR(inode->i_mode) && !list_is_singular(&inode->i_dentry));
+ dentry = list_entry(inode->i_dentry.next, struct dentry, d_alias);
+ spin_unlock(&dcache_lock);
+ return dentry;
+}
+
+/**
* v9fs_get_inode - helper function to setup an inode
* @sb: superblock
* @mode: mode to setup inode with
@@ -267,7 +304,13 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
case S_IFBLK:
case S_IFCHR:
case S_IFSOCK:
- if (!v9fs_proto_dotu(v9ses)) {
+ if (v9fs_proto_dotl(v9ses)) {
+ inode->i_op = &v9fs_file_inode_operations_dotl;
+ inode->i_fop = &v9fs_file_operations_dotl;
+ } else if (v9fs_proto_dotu(v9ses)) {
+ inode->i_op = &v9fs_file_inode_operations;
+ inode->i_fop = &v9fs_file_operations;
+ } else {
P9_DPRINTK(P9_DEBUG_ERROR,
"special files without extended mode\n");
err = -EINVAL;
@@ -396,23 +439,14 @@ void v9fs_clear_inode(struct inode *inode)
#endif
}
-/**
- * v9fs_inode_from_fid - populate an inode by issuing a attribute request
- * @v9ses: session information
- * @fid: fid to issue attribute request for
- * @sb: superblock on which to create inode
- *
- */
-
static struct inode *
-v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
+v9fs_inode(struct v9fs_session_info *v9ses, struct p9_fid *fid,
struct super_block *sb)
{
int err, umode;
- struct inode *ret;
+ struct inode *ret = NULL;
struct p9_wstat *st;
- ret = NULL;
st = p9_client_stat(fid);
if (IS_ERR(st))
return ERR_CAST(st);
@@ -433,15 +467,62 @@ v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
#endif
p9stat_free(st);
kfree(st);
-
return ret;
-
error:
p9stat_free(st);
kfree(st);
return ERR_PTR(err);
}
+static struct inode *
+v9fs_inode_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid,
+ struct super_block *sb)
+{
+ struct inode *ret = NULL;
+ int err;
+ struct p9_stat_dotl *st;
+
+ st = p9_client_getattr_dotl(fid, P9_STATS_BASIC);
+ if (IS_ERR(st))
+ return ERR_CAST(st);
+
+ ret = v9fs_get_inode(sb, st->st_mode);
+ if (IS_ERR(ret)) {
+ err = PTR_ERR(ret);
+ goto error;
+ }
+
+ v9fs_stat2inode_dotl(st, ret);
+ ret->i_ino = v9fs_qid2ino(&st->qid);
+#ifdef CONFIG_9P_FSCACHE
+ v9fs_vcookie_set_qid(ret, &st->qid);
+ v9fs_cache_inode_get_cookie(ret);
+#endif
+ kfree(st);
+ return ret;
+error:
+ kfree(st);
+ return ERR_PTR(err);
+}
+
+/**
+ * v9fs_inode_from_fid - Helper routine to populate an inode by
+ * issuing a attribute request
+ * @v9ses: session information
+ * @fid: fid to issue attribute request for
+ * @sb: superblock on which to create inode
+ *
+ */
+static inline struct inode *
+v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
+ struct super_block *sb)
+{
+ if (v9fs_proto_dotl(v9ses))
+ return v9fs_inode_dotl(v9ses, fid, sb);
+ else
+ return v9fs_inode(v9ses, fid, sb);
+}
+
/**
* v9fs_remove - helper function to remove files and directories
* @dir: directory inode that is being deleted
@@ -563,6 +644,118 @@ error:
}
/**
+ * v9fs_vfs_create_dotl - VFS hook to create files for 9P2000.L protocol.
+ * @dir: directory inode in which the file is being created
+ * @dentry: dentry of the file being created
+ * @mode: create permissions
+ * @nd: path information
+ *
+ */
+
+static int
+v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int mode,
+ struct nameidata *nd)
+{
+ int err = 0;
+ char *name = NULL;
+ gid_t gid;
+ int flags;
+ struct v9fs_session_info *v9ses;
+ struct p9_fid *fid = NULL;
+ struct p9_fid *dfid, *ofid;
+ struct file *filp;
+ struct p9_qid qid;
+ struct inode *inode;
+
+ v9ses = v9fs_inode2v9ses(dir);
+ if (nd && nd->flags & LOOKUP_OPEN)
+ flags = nd->intent.open.flags - 1;
+ else
+ flags = O_RDWR;
+
+ name = (char *) dentry->d_name.name;
+ P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_create_dotl: name:%s flags:0x%x "
+ "mode:0x%x\n", name, flags, mode);
+
+ dfid = v9fs_fid_lookup(dentry->d_parent);
+ if (IS_ERR(dfid)) {
+ err = PTR_ERR(dfid);
+ P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
+ return err;
+ }
+
+ /* clone a fid to use for creation */
+ ofid = p9_client_walk(dfid, 0, NULL, 1);
+ if (IS_ERR(ofid)) {
+ err = PTR_ERR(ofid);
+ P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
+ return err;
+ }
+
+ gid = v9fs_get_fsgid_for_create(dir);
+ err = p9_client_create_dotl(ofid, name, flags, mode, gid, &qid);
+ if (err < 0) {
+ P9_DPRINTK(P9_DEBUG_VFS,
+ "p9_client_open_dotl failed in creat %d\n",
+ err);
+ goto error;
+ }
+
+ /* No need to populate the inode if we are not opening the file AND
+ * not in cached mode.
+ */
+ if (!v9ses->cache && !(nd && nd->flags & LOOKUP_OPEN)) {
+ /* Not in cached mode. No need to populate inode with stat */
+ dentry->d_op = &v9fs_dentry_operations;
+ p9_client_clunk(ofid);
+ d_instantiate(dentry, NULL);
+ return 0;
+ }
+
+ /* Now walk from the parent so we can get an unopened fid. */
+ fid = p9_client_walk(dfid, 1, &name, 1);
+ if (IS_ERR(fid)) {
+ err = PTR_ERR(fid);
+ P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
+ fid = NULL;
+ goto error;
+ }
+
+ /* instantiate inode and assign the unopened fid to dentry */
+ inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
+ P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err);
+ goto error;
+ }
+ dentry->d_op = &v9fs_cached_dentry_operations;
+ d_instantiate(dentry, inode);
+ err = v9fs_fid_add(dentry, fid);
+ if (err < 0)
+ goto error;
+
+ /* if we are opening a file, assign the open fid to the file */
+ if (nd && nd->flags & LOOKUP_OPEN) {
+ filp = lookup_instantiate_filp(nd, dentry, v9fs_open_created);
+ if (IS_ERR(filp)) {
+ p9_client_clunk(ofid);
+ return PTR_ERR(filp);
+ }
+ filp->private_data = ofid;
+ } else
+ p9_client_clunk(ofid);
+
+ return 0;
+
+error:
+ if (ofid)
+ p9_client_clunk(ofid);
+ if (fid)
+ p9_client_clunk(fid);
+ return err;
+}
+
+/**
* v9fs_vfs_create - VFS hook to create files
* @dir: directory inode that is being created
* @dentry: dentry that is being deleted
@@ -652,6 +845,83 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
return err;
}
+
+/**
+ * v9fs_vfs_mkdir_dotl - VFS mkdir hook to create a directory
+ * @dir: inode of the parent directory
+ * @dentry: dentry of the directory being created
+ * @mode: mode for new directory
+ *
+ */
+
+static int v9fs_vfs_mkdir_dotl(struct inode *dir, struct dentry *dentry,
+ int mode)
+{
+ int err;
+ struct v9fs_session_info *v9ses;
+ struct p9_fid *fid = NULL, *dfid = NULL;
+ gid_t gid;
+ char *name;
+ struct inode *inode;
+ struct p9_qid qid;
+ struct dentry *dir_dentry;
+
+ P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name);
+ err = 0;
+ v9ses = v9fs_inode2v9ses(dir);
+
+ mode |= S_IFDIR;
+ dir_dentry = v9fs_dentry_from_dir_inode(dir);
+ dfid = v9fs_fid_lookup(dir_dentry);
+ if (IS_ERR(dfid)) {
+ err = PTR_ERR(dfid);
+ P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
+ dfid = NULL;
+ goto error;
+ }
+
+ gid = v9fs_get_fsgid_for_create(dir);
+ if (gid < 0) {
+ P9_DPRINTK(P9_DEBUG_VFS, "v9fs_get_fsgid_for_create failed\n");
+ goto error;
+ }
+
+ name = (char *) dentry->d_name.name;
+ err = p9_client_mkdir_dotl(dfid, name, mode, gid, &qid);
+ if (err < 0)
+ goto error;
+
+ /* instantiate inode and assign the unopened fid to the dentry */
+ if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
+ fid = p9_client_walk(dfid, 1, &name, 1);
+ if (IS_ERR(fid)) {
+ err = PTR_ERR(fid);
+ P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
+ err);
+ fid = NULL;
+ goto error;
+ }
+
+ inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
+ P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
+ err);
+ goto error;
+ }
+ dentry->d_op = &v9fs_cached_dentry_operations;
+ d_instantiate(dentry, inode);
+ err = v9fs_fid_add(dentry, fid);
+ if (err < 0)
+ goto error;
+ fid = NULL;
+ }
+error:
+ if (fid)
+ p9_client_clunk(fid);
+ return err;
+}
+
/**
* v9fs_vfs_lookup - VFS lookup hook to "walk" to a new inode
* @dir: inode that is being walked from
@@ -678,6 +948,7 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
sb = dir->i_sb;
v9ses = v9fs_inode2v9ses(dir);
+ /* We can walk d_parent because we hold the dir->i_mutex */
dfid = v9fs_fid_lookup(dentry->d_parent);
if (IS_ERR(dfid))
return ERR_CAST(dfid);
@@ -785,27 +1056,33 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
goto clunk_olddir;
}
+ down_write(&v9ses->rename_sem);
if (v9fs_proto_dotl(v9ses)) {
retval = p9_client_rename(oldfid, newdirfid,
(char *) new_dentry->d_name.name);
if (retval != -ENOSYS)
goto clunk_newdir;
}
+ if (old_dentry->d_parent != new_dentry->d_parent) {
+ /*
+ * 9P .u can only handle file rename in the same directory
+ */
- /* 9P can only handle file rename in the same directory */
- if (memcmp(&olddirfid->qid, &newdirfid->qid, sizeof(newdirfid->qid))) {
P9_DPRINTK(P9_DEBUG_ERROR,
"old dir and new dir are different\n");
retval = -EXDEV;
goto clunk_newdir;
}
-
v9fs_blank_wstat(&wstat);
wstat.muid = v9ses->uname;
wstat.name = (char *) new_dentry->d_name.name;
retval = p9_client_wstat(oldfid, &wstat);
clunk_newdir:
+ if (!retval)
+ /* successful rename */
+ d_move(old_dentry, new_dentry);
+ up_write(&v9ses->rename_sem);
p9_client_clunk(newdirfid);
clunk_olddir:
@@ -853,6 +1130,42 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
return 0;
}
+static int
+v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry,
+ struct kstat *stat)
+{
+ int err;
+ struct v9fs_session_info *v9ses;
+ struct p9_fid *fid;
+ struct p9_stat_dotl *st;
+
+ P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry);
+ err = -EPERM;
+ v9ses = v9fs_inode2v9ses(dentry->d_inode);
+ if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
+ return simple_getattr(mnt, dentry, stat);
+
+ fid = v9fs_fid_lookup(dentry);
+ if (IS_ERR(fid))
+ return PTR_ERR(fid);
+
+ /* Ask for all the fields in stat structure. Server will return
+ * whatever it supports
+ */
+
+ st = p9_client_getattr_dotl(fid, P9_STATS_ALL);
+ if (IS_ERR(st))
+ return PTR_ERR(st);
+
+ v9fs_stat2inode_dotl(st, dentry->d_inode);
+ generic_fillattr(dentry->d_inode, stat);
+ /* Change block size to what the server returned */
+ stat->blksize = st->st_blksize;
+
+ kfree(st);
+ return 0;
+}
+
/**
* v9fs_vfs_setattr - set file metadata
* @dentry: file whose metadata to set
@@ -903,6 +1216,49 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
}
/**
+ * v9fs_vfs_setattr_dotl - set file metadata
+ * @dentry: file whose metadata to set
+ * @iattr: metadata assignment structure
+ *
+ */
+
+static int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
+{
+ int retval;
+ struct v9fs_session_info *v9ses;
+ struct p9_fid *fid;
+ struct p9_iattr_dotl p9attr;
+
+ P9_DPRINTK(P9_DEBUG_VFS, "\n");
+
+ retval = inode_change_ok(dentry->d_inode, iattr);
+ if (retval)
+ return retval;
+
+ p9attr.valid = iattr->ia_valid;
+ p9attr.mode = iattr->ia_mode;
+ p9attr.uid = iattr->ia_uid;
+ p9attr.gid = iattr->ia_gid;
+ p9attr.size = iattr->ia_size;
+ p9attr.atime_sec = iattr->ia_atime.tv_sec;
+ p9attr.atime_nsec = iattr->ia_atime.tv_nsec;
+ p9attr.mtime_sec = iattr->ia_mtime.tv_sec;
+ p9attr.mtime_nsec = iattr->ia_mtime.tv_nsec;
+
+ retval = -EPERM;
+ v9ses = v9fs_inode2v9ses(dentry->d_inode);
+ fid = v9fs_fid_lookup(dentry);
+ if (IS_ERR(fid))
+ return PTR_ERR(fid);
+
+ retval = p9_client_setattr(fid, &p9attr);
+ if (retval >= 0)
+ retval = inode_setattr(dentry->d_inode, iattr);
+
+ return retval;
+}
+
+/**
* v9fs_stat2inode - populate an inode structure with mistat info
* @stat: Plan 9 metadata (mistat) structure
* @inode: inode to populate
@@ -980,6 +1336,77 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
}
/**
+ * v9fs_stat2inode_dotl - populate an inode structure with stat info
+ * @stat: stat structure
+ * @inode: inode to populate
+ * @sb: superblock of filesystem
+ *
+ */
+
+void
+v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode)
+{
+
+ if ((stat->st_result_mask & P9_STATS_BASIC) == P9_STATS_BASIC) {
+ inode->i_atime.tv_sec = stat->st_atime_sec;
+ inode->i_atime.tv_nsec = stat->st_atime_nsec;
+ inode->i_mtime.tv_sec = stat->st_mtime_sec;
+ inode->i_mtime.tv_nsec = stat->st_mtime_nsec;
+ inode->i_ctime.tv_sec = stat->st_ctime_sec;
+ inode->i_ctime.tv_nsec = stat->st_ctime_nsec;
+ inode->i_uid = stat->st_uid;
+ inode->i_gid = stat->st_gid;
+ inode->i_nlink = stat->st_nlink;
+ inode->i_mode = stat->st_mode;
+ inode->i_rdev = new_decode_dev(stat->st_rdev);
+
+ if ((S_ISBLK(inode->i_mode)) || (S_ISCHR(inode->i_mode)))
+ init_special_inode(inode, inode->i_mode, inode->i_rdev);
+
+ i_size_write(inode, stat->st_size);
+ inode->i_blocks = stat->st_blocks;
+ } else {
+ if (stat->st_result_mask & P9_STATS_ATIME) {
+ inode->i_atime.tv_sec = stat->st_atime_sec;
+ inode->i_atime.tv_nsec = stat->st_atime_nsec;
+ }
+ if (stat->st_result_mask & P9_STATS_MTIME) {
+ inode->i_mtime.tv_sec = stat->st_mtime_sec;
+ inode->i_mtime.tv_nsec = stat->st_mtime_nsec;
+ }
+ if (stat->st_result_mask & P9_STATS_CTIME) {
+ inode->i_ctime.tv_sec = stat->st_ctime_sec;
+ inode->i_ctime.tv_nsec = stat->st_ctime_nsec;
+ }
+ if (stat->st_result_mask & P9_STATS_UID)
+ inode->i_uid = stat->st_uid;
+ if (stat->st_result_mask & P9_STATS_GID)
+ inode->i_gid = stat->st_gid;
+ if (stat->st_result_mask & P9_STATS_NLINK)
+ inode->i_nlink = stat->st_nlink;
+ if (stat->st_result_mask & P9_STATS_MODE) {
+ inode->i_mode = stat->st_mode;
+ if ((S_ISBLK(inode->i_mode)) ||
+ (S_ISCHR(inode->i_mode)))
+ init_special_inode(inode, inode->i_mode,
+ inode->i_rdev);
+ }
+ if (stat->st_result_mask & P9_STATS_RDEV)
+ inode->i_rdev = new_decode_dev(stat->st_rdev);
+ if (stat->st_result_mask & P9_STATS_SIZE)
+ i_size_write(inode, stat->st_size);
+ if (stat->st_result_mask & P9_STATS_BLOCKS)
+ inode->i_blocks = stat->st_blocks;
+ }
+ if (stat->st_result_mask & P9_STATS_GEN)
+ inode->i_generation = stat->st_gen;
+
+ /* Currently we don't support P9_STATS_BTIME and P9_STATS_DATA_VERSION
+ * because the inode structure does not have fields for them.
+ */
+}
+
+/**
* v9fs_qid2ino - convert qid into inode number
* @qid: qid to hash
*
@@ -1022,7 +1449,7 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen)
if (IS_ERR(fid))
return PTR_ERR(fid);
- if (!v9fs_proto_dotu(v9ses))
+ if (!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses))
return -EBADF;
st = p9_client_stat(fid);
@@ -1128,6 +1555,99 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry,
}
/**
+ * v9fs_vfs_symlink_dotl - helper function to create symlinks
+ * @dir: directory inode containing symlink
+ * @dentry: dentry for symlink
+ * @symname: symlink data
+ *
+ * See Also: 9P2000.L RFC for more information
+ *
+ */
+
+static int
+v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
+ const char *symname)
+{
+ struct v9fs_session_info *v9ses;
+ struct p9_fid *dfid;
+ struct p9_fid *fid = NULL;
+ struct inode *inode;
+ struct p9_qid qid;
+ char *name;
+ int err;
+ gid_t gid;
+
+ name = (char *) dentry->d_name.name;
+ P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_symlink_dotl : %lu,%s,%s\n",
+ dir->i_ino, name, symname);
+ v9ses = v9fs_inode2v9ses(dir);
+
+ dfid = v9fs_fid_lookup(dentry->d_parent);
+ if (IS_ERR(dfid)) {
+ err = PTR_ERR(dfid);
+ P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
+ return err;
+ }
+
+ gid = v9fs_get_fsgid_for_create(dir);
+
+ if (gid < 0) {
+ P9_DPRINTK(P9_DEBUG_VFS, "v9fs_get_egid failed %d\n", gid);
+ goto error;
+ }
+
+ /* Server doesn't alter fid on TSYMLINK. Hence no need to clone it. */
+ err = p9_client_symlink(dfid, name, (char *)symname, gid, &qid);
+
+ if (err < 0) {
+ P9_DPRINTK(P9_DEBUG_VFS, "p9_client_symlink failed %d\n", err);
+ goto error;
+ }
+
+ if (v9ses->cache) {
+ /* Now walk from the parent so we can get an unopened fid. */
+ fid = p9_client_walk(dfid, 1, &name, 1);
+ if (IS_ERR(fid)) {
+ err = PTR_ERR(fid);
+ P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
+ err);
+ fid = NULL;
+ goto error;
+ }
+
+ /* instantiate inode and assign the unopened fid to dentry */
+ inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
+ P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
+ err);
+ goto error;
+ }
+ dentry->d_op = &v9fs_cached_dentry_operations;
+ d_instantiate(dentry, inode);
+ err = v9fs_fid_add(dentry, fid);
+ if (err < 0)
+ goto error;
+ fid = NULL;
+ } else {
+ /* Not in cached mode. No need to populate inode with stat */
+ inode = v9fs_get_inode(dir->i_sb, S_IFLNK);
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
+ goto error;
+ }
+ dentry->d_op = &v9fs_dentry_operations;
+ d_instantiate(dentry, inode);
+ }
+
+error:
+ if (fid)
+ p9_client_clunk(fid);
+
+ return err;
+}
+
+/**
* v9fs_vfs_symlink - helper function to create symlinks
* @dir: directory inode containing symlink
* @dentry: dentry for symlink
@@ -1186,6 +1706,76 @@ clunk_fid:
}
/**
+ * v9fs_vfs_link_dotl - create a hardlink for dotl
+ * @old_dentry: dentry for file to link to
+ * @dir: inode destination for new link
+ * @dentry: dentry for link
+ *
+ */
+
+static int
+v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
+ struct dentry *dentry)
+{
+ int err;
+ struct p9_fid *dfid, *oldfid;
+ char *name;
+ struct v9fs_session_info *v9ses;
+ struct dentry *dir_dentry;
+
+ P9_DPRINTK(P9_DEBUG_VFS, "dir ino: %lu, old_name: %s, new_name: %s\n",
+ dir->i_ino, old_dentry->d_name.name,
+ dentry->d_name.name);
+
+ v9ses = v9fs_inode2v9ses(dir);
+ dir_dentry = v9fs_dentry_from_dir_inode(dir);
+ dfid = v9fs_fid_lookup(dir_dentry);
+ if (IS_ERR(dfid))
+ return PTR_ERR(dfid);
+
+ oldfid = v9fs_fid_lookup(old_dentry);
+ if (IS_ERR(oldfid))
+ return PTR_ERR(oldfid);
+
+ name = (char *) dentry->d_name.name;
+
+ err = p9_client_link(dfid, oldfid, (char *)dentry->d_name.name);
+
+ if (err < 0) {
+ P9_DPRINTK(P9_DEBUG_VFS, "p9_client_link failed %d\n", err);
+ return err;
+ }
+
+ if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
+ /* Get the latest stat info from server. */
+ struct p9_fid *fid;
+ struct p9_stat_dotl *st;
+
+ fid = v9fs_fid_lookup(old_dentry);
+ if (IS_ERR(fid))
+ return PTR_ERR(fid);
+
+ st = p9_client_getattr_dotl(fid, P9_STATS_BASIC);
+ if (IS_ERR(st))
+ return PTR_ERR(st);
+
+ v9fs_stat2inode_dotl(st, old_dentry->d_inode);
+
+ kfree(st);
+ } else {
+ /* Caching disabled. No need to get up-to-date stat info.
+ * This dentry will be released immediately. So, just i_count++
+ */
+ atomic_inc(&old_dentry->d_inode->i_count);
+ }
+
+ dentry->d_op = old_dentry->d_op;
+ d_instantiate(dentry, old_dentry->d_inode);
+
+ return err;
+}
+
+/**
* v9fs_vfs_mknod - create a special file
* @dir: inode destination for new link
* @dentry: dentry for file
@@ -1230,6 +1820,100 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
return retval;
}
+/**
+ * v9fs_vfs_mknod_dotl - create a special file
+ * @dir: inode destination for new link
+ * @dentry: dentry for file
+ * @mode: mode for creation
+ * @rdev: device associated with special file
+ *
+ */
+static int
+v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int mode,
+ dev_t rdev)
+{
+ int err;
+ char *name;
+ struct v9fs_session_info *v9ses;
+ struct p9_fid *fid = NULL, *dfid = NULL;
+ struct inode *inode;
+ gid_t gid;
+ struct p9_qid qid;
+ struct dentry *dir_dentry;
+
+ P9_DPRINTK(P9_DEBUG_VFS,
+ " %lu,%s mode: %x MAJOR: %u MINOR: %u\n", dir->i_ino,
+ dentry->d_name.name, mode, MAJOR(rdev), MINOR(rdev));
+
+ if (!new_valid_dev(rdev))
+ return -EINVAL;
+
+ v9ses = v9fs_inode2v9ses(dir);
+ dir_dentry = v9fs_dentry_from_dir_inode(dir);
+ dfid = v9fs_fid_lookup(dir_dentry);
+ if (IS_ERR(dfid)) {
+ err = PTR_ERR(dfid);
+ P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
+ dfid = NULL;
+ goto error;
+ }
+
+ gid = v9fs_get_fsgid_for_create(dir);
+ if (gid < 0) {
+ P9_DPRINTK(P9_DEBUG_VFS, "v9fs_get_fsgid_for_create failed\n");
+ goto error;
+ }
+
+ name = (char *) dentry->d_name.name;
+
+ err = p9_client_mknod_dotl(dfid, name, mode, rdev, gid, &qid);
+ if (err < 0)
+ goto error;
+
+ /* instantiate inode and assign the unopened fid to the dentry */
+ if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
+ fid = p9_client_walk(dfid, 1, &name, 1);
+ if (IS_ERR(fid)) {
+ err = PTR_ERR(fid);
+ P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
+ err);
+ fid = NULL;
+ goto error;
+ }
+
+ inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
+ P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
+ err);
+ goto error;
+ }
+ dentry->d_op = &v9fs_cached_dentry_operations;
+ d_instantiate(dentry, inode);
+ err = v9fs_fid_add(dentry, fid);
+ if (err < 0)
+ goto error;
+ fid = NULL;
+ } else {
+ /*
+ * Not in cached mode. No need to populate inode with stat.
+ * The socket syscall returns a fd, so we need to instantiate
+ */
+ inode = v9fs_get_inode(dir->i_sb, mode);
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
+ goto error;
+ }
+ dentry->d_op = &v9fs_dentry_operations;
+ d_instantiate(dentry, inode);
+ }
+
+error:
+ if (fid)
+ p9_client_clunk(fid);
+ return err;
+}
+
static const struct inode_operations v9fs_dir_inode_operations_dotu = {
.create = v9fs_vfs_create,
.lookup = v9fs_vfs_lookup,
@@ -1238,24 +1922,29 @@ static const struct inode_operations v9fs_dir_inode_operations_dotu = {
.unlink = v9fs_vfs_unlink,
.mkdir = v9fs_vfs_mkdir,
.rmdir = v9fs_vfs_rmdir,
- .mknod = v9fs_vfs_mknod,
+ .mknod = v9fs_vfs_mknod_dotl,
.rename = v9fs_vfs_rename,
.getattr = v9fs_vfs_getattr,
.setattr = v9fs_vfs_setattr,
};
static const struct inode_operations v9fs_dir_inode_operations_dotl = {
- .create = v9fs_vfs_create,
+ .create = v9fs_vfs_create_dotl,
.lookup = v9fs_vfs_lookup,
- .symlink = v9fs_vfs_symlink,
- .link = v9fs_vfs_link,
+ .link = v9fs_vfs_link_dotl,
+ .symlink = v9fs_vfs_symlink_dotl,
.unlink = v9fs_vfs_unlink,
- .mkdir = v9fs_vfs_mkdir,
+ .mkdir = v9fs_vfs_mkdir_dotl,
.rmdir = v9fs_vfs_rmdir,
- .mknod = v9fs_vfs_mknod,
+ .mknod = v9fs_vfs_mknod_dotl,
.rename = v9fs_vfs_rename,
- .getattr = v9fs_vfs_getattr,
- .setattr = v9fs_vfs_setattr,
+ .getattr = v9fs_vfs_getattr_dotl,
+ .setattr = v9fs_vfs_setattr_dotl,
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .removexattr = generic_removexattr,
+ .listxattr = v9fs_listxattr,
+
};
static const struct inode_operations v9fs_dir_inode_operations = {
@@ -1276,8 +1965,12 @@ static const struct inode_operations v9fs_file_inode_operations = {
};
static const struct inode_operations v9fs_file_inode_operations_dotl = {
- .getattr = v9fs_vfs_getattr,
- .setattr = v9fs_vfs_setattr,
+ .getattr = v9fs_vfs_getattr_dotl,
+ .setattr = v9fs_vfs_setattr_dotl,
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .removexattr = generic_removexattr,
+ .listxattr = v9fs_listxattr,
};
static const struct inode_operations v9fs_symlink_inode_operations = {
@@ -1292,6 +1985,10 @@ static const struct inode_operations v9fs_symlink_inode_operations_dotl = {
.readlink = generic_readlink,
.follow_link = v9fs_vfs_follow_link,
.put_link = v9fs_vfs_put_link,
- .getattr = v9fs_vfs_getattr,
- .setattr = v9fs_vfs_setattr,
+ .getattr = v9fs_vfs_getattr_dotl,
+ .setattr = v9fs_vfs_setattr_dotl,
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .removexattr = generic_removexattr,
+ .listxattr = v9fs_listxattr,
};
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index be74d020436e..4b9ede0b41b7 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -45,6 +45,7 @@
#include "v9fs.h"
#include "v9fs_vfs.h"
#include "fid.h"
+#include "xattr.h"
static const struct super_operations v9fs_super_ops, v9fs_super_ops_dotl;
@@ -77,9 +78,10 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
sb->s_blocksize_bits = fls(v9ses->maxdata - 1);
sb->s_blocksize = 1 << sb->s_blocksize_bits;
sb->s_magic = V9FS_MAGIC;
- if (v9fs_proto_dotl(v9ses))
+ if (v9fs_proto_dotl(v9ses)) {
sb->s_op = &v9fs_super_ops_dotl;
- else
+ sb->s_xattr = v9fs_xattr_handlers;
+ } else
sb->s_op = &v9fs_super_ops;
sb->s_bdi = &v9ses->bdi;
@@ -107,7 +109,6 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
struct inode *inode = NULL;
struct dentry *root = NULL;
struct v9fs_session_info *v9ses = NULL;
- struct p9_wstat *st = NULL;
int mode = S_IRWXUGO | S_ISVTX;
struct p9_fid *fid;
int retval = 0;
@@ -124,16 +125,10 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
goto close_session;
}
- st = p9_client_stat(fid);
- if (IS_ERR(st)) {
- retval = PTR_ERR(st);
- goto clunk_fid;
- }
-
sb = sget(fs_type, NULL, v9fs_set_super, v9ses);
if (IS_ERR(sb)) {
retval = PTR_ERR(sb);
- goto free_stat;
+ goto clunk_fid;
}
v9fs_fill_super(sb, v9ses, flags, data);
@@ -151,22 +146,38 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
}
sb->s_root = root;
- root->d_inode->i_ino = v9fs_qid2ino(&st->qid);
- v9fs_stat2inode(st, root->d_inode, sb);
+ if (v9fs_proto_dotl(v9ses)) {
+ struct p9_stat_dotl *st = NULL;
+ st = p9_client_getattr_dotl(fid, P9_STATS_BASIC);
+ if (IS_ERR(st)) {
+ retval = PTR_ERR(st);
+ goto clunk_fid;
+ }
+
+ v9fs_stat2inode_dotl(st, root->d_inode);
+ kfree(st);
+ } else {
+ struct p9_wstat *st = NULL;
+ st = p9_client_stat(fid);
+ if (IS_ERR(st)) {
+ retval = PTR_ERR(st);
+ goto clunk_fid;
+ }
+
+ root->d_inode->i_ino = v9fs_qid2ino(&st->qid);
+ v9fs_stat2inode(st, root->d_inode, sb);
+
+ p9stat_free(st);
+ kfree(st);
+ }
v9fs_fid_add(root, fid);
- p9stat_free(st);
- kfree(st);
P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n");
simple_set_mnt(mnt, sb);
return 0;
-free_stat:
- p9stat_free(st);
- kfree(st);
-
clunk_fid:
p9_client_clunk(fid);
@@ -176,8 +187,6 @@ close_session:
return retval;
release_sb:
- p9stat_free(st);
- kfree(st);
deactivate_locked_super(sb);
return retval;
}
@@ -278,4 +287,5 @@ struct file_system_type v9fs_fs_type = {
.get_sb = v9fs_get_sb,
.kill_sb = v9fs_kill_super,
.owner = THIS_MODULE,
+ .fs_flags = FS_RENAME_DOES_D_MOVE,
};
diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c
new file mode 100644
index 000000000000..f88e5c2dc873
--- /dev/null
+++ b/fs/9p/xattr.c
@@ -0,0 +1,160 @@
+/*
+ * Copyright IBM Corporation, 2010
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <net/9p/9p.h>
+#include <net/9p/client.h>
+
+#include "fid.h"
+#include "xattr.h"
+
+/*
+ * v9fs_xattr_get()
+ *
+ * Copy an extended attribute into the buffer
+ * provided, or compute the buffer size required.
+ * Buffer is NULL to compute the size of the buffer required.
+ *
+ * Returns a negative error number on failure, or the number of bytes
+ * used / required on success.
+ */
+ssize_t v9fs_xattr_get(struct dentry *dentry, const char *name,
+ void *buffer, size_t buffer_size)
+{
+ ssize_t retval;
+ int msize, read_count;
+ u64 offset = 0, attr_size;
+ struct p9_fid *fid, *attr_fid;
+
+ P9_DPRINTK(P9_DEBUG_VFS, "%s: name = %s value_len = %zu\n",
+ __func__, name, buffer_size);
+
+ fid = v9fs_fid_lookup(dentry);
+ if (IS_ERR(fid))
+ return PTR_ERR(fid);
+
+ attr_fid = p9_client_xattrwalk(fid, name, &attr_size);
+ if (IS_ERR(attr_fid)) {
+ retval = PTR_ERR(attr_fid);
+ P9_DPRINTK(P9_DEBUG_VFS,
+ "p9_client_attrwalk failed %zd\n", retval);
+ attr_fid = NULL;
+ goto error;
+ }
+ if (!buffer_size) {
+ /* request to get the attr_size */
+ retval = attr_size;
+ goto error;
+ }
+ if (attr_size > buffer_size) {
+ retval = -ERANGE;
+ goto error;
+ }
+ msize = attr_fid->clnt->msize;
+ while (attr_size) {
+ if (attr_size > (msize - P9_IOHDRSZ))
+ read_count = msize - P9_IOHDRSZ;
+ else
+ read_count = attr_size;
+ read_count = p9_client_read(attr_fid, ((char *)buffer)+offset,
+ NULL, offset, read_count);
+ if (read_count < 0) {
+ /* error in xattr read */
+ retval = read_count;
+ goto error;
+ }
+ offset += read_count;
+ attr_size -= read_count;
+ }
+ /* Total read xattr bytes */
+ retval = offset;
+error:
+ if (attr_fid)
+ p9_client_clunk(attr_fid);
+ return retval;
+
+}
+
+/*
+ * v9fs_xattr_set()
+ *
+ * Create, replace or remove an extended attribute for this inode. Buffer
+ * is NULL to remove an existing extended attribute, and non-NULL to
+ * either replace an existing extended attribute, or create a new extended
+ * attribute. The flags XATTR_REPLACE and XATTR_CREATE
+ * specify that an extended attribute must exist and must not exist
+ * previous to the call, respectively.
+ *
+ * Returns 0, or a negative error number on failure.
+ */
+int v9fs_xattr_set(struct dentry *dentry, const char *name,
+ const void *value, size_t value_len, int flags)
+{
+ u64 offset = 0;
+ int retval, msize, write_count;
+ struct p9_fid *fid = NULL;
+
+ P9_DPRINTK(P9_DEBUG_VFS, "%s: name = %s value_len = %zu flags = %d\n",
+ __func__, name, value_len, flags);
+
+ fid = v9fs_fid_clone(dentry);
+ if (IS_ERR(fid)) {
+ retval = PTR_ERR(fid);
+ fid = NULL;
+ goto error;
+ }
+ /*
+ * On success fid points to xattr
+ */
+ retval = p9_client_xattrcreate(fid, name, value_len, flags);
+ if (retval < 0) {
+ P9_DPRINTK(P9_DEBUG_VFS,
+ "p9_client_xattrcreate failed %d\n", retval);
+ goto error;
+ }
+ msize = fid->clnt->msize;
+ while (value_len) {
+ if (value_len > (msize - P9_IOHDRSZ))
+ write_count = msize - P9_IOHDRSZ;
+ else
+ write_count = value_len;
+ write_count = p9_client_write(fid, ((char *)value)+offset,
+ NULL, offset, write_count);
+ if (write_count < 0) {
+ /* error in xattr write */
+ retval = write_count;
+ goto error;
+ }
+ offset += write_count;
+ value_len -= write_count;
+ }
+ /* Total written xattr bytes */
+ retval = offset;
+error:
+ if (fid)
+ retval = p9_client_clunk(fid);
+ return retval;
+}
+
+ssize_t v9fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
+{
+ return v9fs_xattr_get(dentry, NULL, buffer, buffer_size);
+}
+
+const struct xattr_handler *v9fs_xattr_handlers[] = {
+ &v9fs_xattr_user_handler,
+ NULL
+};
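
Note: v9fs_xattr_get() above implements the same two-call convention as
the getxattr(2) syscall it ultimately serves: query the required size
with a NULL buffer, then fetch into a buffer of that size. A minimal
userspace sketch of that pattern (illustrative only, not part of this
patch; path and attribute name are made up):

	#include <stdio.h>
	#include <stdlib.h>
	#include <sys/xattr.h>

	static ssize_t dump_xattr(const char *path, const char *name)
	{
		/* first call: NULL buffer returns the required size */
		ssize_t size = getxattr(path, name, NULL, 0);
		if (size <= 0)
			return size;
		char *buf = malloc(size);
		if (!buf)
			return -1;
		/* second call copies the value; may still fail with
		 * ERANGE if the attribute grew in between */
		size = getxattr(path, name, buf, size);
		if (size > 0)
			fwrite(buf, 1, size, stdout);
		free(buf);
		return size;
	}
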
diff --git a/fs/9p/xattr.h b/fs/9p/xattr.h
new file mode 100644
index 000000000000..9ddf672ae5c4
--- /dev/null
+++ b/fs/9p/xattr.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright IBM Corporation, 2010
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+#ifndef FS_9P_XATTR_H
+#define FS_9P_XATTR_H
+
+#include <linux/xattr.h>
+
+extern const struct xattr_handler *v9fs_xattr_handlers[];
+extern struct xattr_handler v9fs_xattr_user_handler;
+
+extern ssize_t v9fs_xattr_get(struct dentry *, const char *,
+ void *, size_t);
+extern int v9fs_xattr_set(struct dentry *, const char *,
+ const void *, size_t, int);
+extern ssize_t v9fs_listxattr(struct dentry *, char *, size_t);
+#endif /* FS_9P_XATTR_H */
diff --git a/fs/9p/xattr_user.c b/fs/9p/xattr_user.c
new file mode 100644
index 000000000000..d0b701b72080
--- /dev/null
+++ b/fs/9p/xattr_user.c
@@ -0,0 +1,80 @@
+/*
+ * Copyright IBM Corporation, 2010
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include "xattr.h"
+
+static int v9fs_xattr_user_get(struct dentry *dentry, const char *name,
+ void *buffer, size_t size, int type)
+{
+ int retval;
+ char *full_name;
+ size_t name_len;
+ size_t prefix_len = XATTR_USER_PREFIX_LEN;
+
+ if (name == NULL)
+ return -EINVAL;
+
+ if (strcmp(name, "") == 0)
+ return -EINVAL;
+
+ name_len = strlen(name);
+ full_name = kmalloc(prefix_len + name_len + 1, GFP_KERNEL);
+ if (!full_name)
+ return -ENOMEM;
+ memcpy(full_name, XATTR_USER_PREFIX, prefix_len);
+ memcpy(full_name+prefix_len, name, name_len);
+ full_name[prefix_len + name_len] = '\0';
+
+ retval = v9fs_xattr_get(dentry, full_name, buffer, size);
+ kfree(full_name);
+ return retval;
+}
+
+static int v9fs_xattr_user_set(struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags, int type)
+{
+ int retval;
+ char *full_name;
+ size_t name_len;
+ size_t prefix_len = XATTR_USER_PREFIX_LEN;
+
+ if (name == NULL)
+ return -EINVAL;
+
+ if (strcmp(name, "") == 0)
+ return -EINVAL;
+
+ name_len = strlen(name);
+ full_name = kmalloc(prefix_len + name_len + 1, GFP_KERNEL);
+ if (!full_name)
+ return -ENOMEM;
+ memcpy(full_name, XATTR_USER_PREFIX, prefix_len);
+ memcpy(full_name + prefix_len, name, name_len);
+ full_name[prefix_len + name_len] = '\0';
+
+ retval = v9fs_xattr_set(dentry, full_name, value, size, flags);
+ kfree(full_name);
+ return retval;
+}
+
+struct xattr_handler v9fs_xattr_user_handler = {
+ .prefix = XATTR_USER_PREFIX,
+ .get = v9fs_xattr_user_get,
+ .set = v9fs_xattr_user_set,
+};
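
Note: the VFS strips the "user." prefix before invoking a handler's
->get/->set, which is why the helpers above rebuild the full name
before sending it over the wire. From userspace the attribute is
always addressed with the prefix; a small sketch (hypothetical path,
key and helper, not part of this patch):

	#include <sys/xattr.h>

	static int tag_file(const char *path)
	{
		char buf[64];

		/* create only if absent, then read back */
		if (setxattr(path, "user.mime_type", "text/plain", 10,
			     XATTR_CREATE) < 0)
			return -1;
		return getxattr(path, "user.mime_type", buf, sizeof(buf));
	}
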
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index 23aa52f548a0..f4287e4de744 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -197,7 +197,7 @@ const struct file_operations adfs_dir_operations = {
.read = generic_read_dir,
.llseek = generic_file_llseek,
.readdir = adfs_readdir,
- .fsync = simple_fsync,
+ .fsync = generic_file_fsync,
};
static int
diff --git a/fs/adfs/file.c b/fs/adfs/file.c
index 005ea34d1758..a36da5382b40 100644
--- a/fs/adfs/file.c
+++ b/fs/adfs/file.c
@@ -26,7 +26,7 @@ const struct file_operations adfs_file_operations = {
.read = do_sync_read,
.aio_read = generic_file_aio_read,
.mmap = generic_file_mmap,
- .fsync = simple_fsync,
+ .fsync = generic_file_fsync,
.write = do_sync_write,
.aio_write = generic_file_aio_write,
.splice_read = generic_file_splice_read,
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index 0f5e30978135..6f850b06ab62 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -322,8 +322,9 @@ adfs_notify_change(struct dentry *dentry, struct iattr *attr)
if (error)
goto out;
+ /* XXX: this is missing some actual on-disk truncation... */
if (ia_valid & ATTR_SIZE)
- error = vmtruncate(inode, attr->ia_size);
+ error = simple_setsize(inode, attr->ia_size);
if (error)
goto out;
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index 861dae68ac12..f05b6155ccc8 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -183,7 +183,7 @@ extern int affs_add_entry(struct inode *dir, struct inode *inode, struct dent
void affs_free_prealloc(struct inode *inode);
extern void affs_truncate(struct inode *);
-int affs_file_fsync(struct file *, struct dentry *, int);
+int affs_file_fsync(struct file *, int);
/* dir.c */
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 184e55c1c9ba..322710c3eedf 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -916,9 +916,9 @@ affs_truncate(struct inode *inode)
affs_free_prealloc(inode);
}
-int affs_file_fsync(struct file *filp, struct dentry *dentry, int datasync)
+int affs_file_fsync(struct file *filp, int datasync)
{
- struct inode * inode = dentry->d_inode;
+ struct inode *inode = filp->f_mapping->host;
int ret, err;
ret = write_inode_now(inode, 0);
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index d70bbbac6b7b..914d1c0bc07a 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -224,7 +224,7 @@ affs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
affs_brelse(bh);
inode = affs_iget(sb, ino);
if (IS_ERR(inode))
- return ERR_PTR(PTR_ERR(inode));
+ return ERR_CAST(inode);
}
dentry->d_op = AFFS_SB(sb)->s_flags & SF_INTL ? &affs_intl_dentry_operations : &affs_dentry_operations;
d_add(dentry, inode);
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 807f284cc75e..5f679b77ce24 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -740,7 +740,7 @@ extern void afs_pages_written_back(struct afs_vnode *, struct afs_call *);
extern ssize_t afs_file_write(struct kiocb *, const struct iovec *,
unsigned long, loff_t);
extern int afs_writeback_all(struct afs_vnode *);
-extern int afs_fsync(struct file *, struct dentry *, int);
+extern int afs_fsync(struct file *, int);
/*****************************************************************************/
diff --git a/fs/afs/server.c b/fs/afs/server.c
index f49099516675..9fdc7fe3a7bc 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -91,9 +91,10 @@ static struct afs_server *afs_alloc_server(struct afs_cell *cell,
memcpy(&server->addr, addr, sizeof(struct in_addr));
server->addr.s_addr = addr->s_addr;
+ _leave(" = %p{%d}", server, atomic_read(&server->usage));
+ } else {
+ _leave(" = NULL [nomem]");
}
-
- _leave(" = %p{%d}", server, atomic_read(&server->usage));
return server;
}
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 3bed54a294d4..722743b152d8 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -680,7 +680,6 @@ int afs_writeback_all(struct afs_vnode *vnode)
{
struct address_space *mapping = vnode->vfs_inode.i_mapping;
struct writeback_control wbc = {
- .bdi = mapping->backing_dev_info,
.sync_mode = WB_SYNC_ALL,
.nr_to_write = LONG_MAX,
.range_cyclic = 1,
@@ -701,8 +700,9 @@ int afs_writeback_all(struct afs_vnode *vnode)
* - the return status from this call provides a reliable indication of
* whether any write errors occurred for this process.
*/
-int afs_fsync(struct file *file, struct dentry *dentry, int datasync)
+int afs_fsync(struct file *file, int datasync)
{
+ struct dentry *dentry = file->f_path.dentry;
struct afs_writeback *wb, *xwb;
struct afs_vnode *vnode = AFS_FS_I(dentry->d_inode);
int ret;
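
Note: this series drops the dentry argument from ->fsync() tree-wide;
implementations now derive the inode from the struct file instead. The
conversion pattern, as a sketch (foo_fsync and foo_write_inode are
hypothetical names):

	/* before */
	static int foo_fsync(struct file *file, struct dentry *dentry,
			     int datasync)
	{
		struct inode *inode = dentry->d_inode;
		return foo_write_inode(inode, datasync); /* hypothetical */
	}

	/* after */
	static int foo_fsync(struct file *file, int datasync)
	{
		struct inode *inode = file->f_mapping->host;
		return foo_write_inode(inode, datasync); /* hypothetical */
	}
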
diff --git a/fs/aio.c b/fs/aio.c
index 1cf12b3dd83a..1ccf25cef1f0 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -36,6 +36,7 @@
#include <linux/blkdev.h>
#include <linux/mempool.h>
#include <linux/hash.h>
+#include <linux/compat.h>
#include <asm/kmap_types.h>
#include <asm/uaccess.h>
@@ -526,7 +527,7 @@ static void aio_fput_routine(struct work_struct *data)
/* Complete the fput(s) */
if (req->ki_filp != NULL)
- __fput(req->ki_filp);
+ fput(req->ki_filp);
/* Link the iocb into the context's free list */
spin_lock_irq(&ctx->ctx_lock);
@@ -559,11 +560,11 @@ static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
/*
* Try to optimize the aio and eventfd file* puts, by avoiding to
- * schedule work in case it is not __fput() time. In normal cases,
+ * schedule work in case it is not final fput() time. In normal cases,
* we would not be holding the last reference to the file*, so
* this function will be executed w/out any aio kthread wakeup.
*/
- if (unlikely(atomic_long_dec_and_test(&req->ki_filp->f_count))) {
+ if (unlikely(!fput_atomic(req->ki_filp))) {
get_ioctx(ctx);
spin_lock(&fput_lock);
list_add(&req->ki_list, &fput_head);
@@ -1384,13 +1385,22 @@ static ssize_t aio_fsync(struct kiocb *iocb)
return ret;
}
-static ssize_t aio_setup_vectored_rw(int type, struct kiocb *kiocb)
+static ssize_t aio_setup_vectored_rw(int type, struct kiocb *kiocb, bool compat)
{
ssize_t ret;
- ret = rw_copy_check_uvector(type, (struct iovec __user *)kiocb->ki_buf,
- kiocb->ki_nbytes, 1,
- &kiocb->ki_inline_vec, &kiocb->ki_iovec);
+#ifdef CONFIG_COMPAT
+ if (compat)
+ ret = compat_rw_copy_check_uvector(type,
+ (struct compat_iovec __user *)kiocb->ki_buf,
+ kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec,
+ &kiocb->ki_iovec);
+ else
+#endif
+ ret = rw_copy_check_uvector(type,
+ (struct iovec __user *)kiocb->ki_buf,
+ kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec,
+ &kiocb->ki_iovec);
if (ret < 0)
goto out;
@@ -1420,7 +1430,7 @@ static ssize_t aio_setup_single_vector(struct kiocb *kiocb)
* Performs the initial checks and aio retry method
* setup for the kiocb at the time of io submission.
*/
-static ssize_t aio_setup_iocb(struct kiocb *kiocb)
+static ssize_t aio_setup_iocb(struct kiocb *kiocb, bool compat)
{
struct file *file = kiocb->ki_filp;
ssize_t ret = 0;
@@ -1469,7 +1479,7 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb)
ret = security_file_permission(file, MAY_READ);
if (unlikely(ret))
break;
- ret = aio_setup_vectored_rw(READ, kiocb);
+ ret = aio_setup_vectored_rw(READ, kiocb, compat);
if (ret)
break;
ret = -EINVAL;
@@ -1483,7 +1493,7 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb)
ret = security_file_permission(file, MAY_WRITE);
if (unlikely(ret))
break;
- ret = aio_setup_vectored_rw(WRITE, kiocb);
+ ret = aio_setup_vectored_rw(WRITE, kiocb, compat);
if (ret)
break;
ret = -EINVAL;
@@ -1548,7 +1558,8 @@ static void aio_batch_free(struct hlist_head *batch_hash)
}
static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
- struct iocb *iocb, struct hlist_head *batch_hash)
+ struct iocb *iocb, struct hlist_head *batch_hash,
+ bool compat)
{
struct kiocb *req;
struct file *file;
@@ -1609,7 +1620,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
req->ki_left = req->ki_nbytes = iocb->aio_nbytes;
req->ki_opcode = iocb->aio_lio_opcode;
- ret = aio_setup_iocb(req);
+ ret = aio_setup_iocb(req, compat);
if (ret)
goto out_put_req;
@@ -1637,20 +1648,8 @@ out_put_req:
return ret;
}
-/* sys_io_submit:
- * Queue the nr iocbs pointed to by iocbpp for processing. Returns
- * the number of iocbs queued. May return -EINVAL if the aio_context
- * specified by ctx_id is invalid, if nr is < 0, if the iocb at
- * *iocbpp[0] is not properly initialized, if the operation specified
- * is invalid for the file descriptor in the iocb. May fail with
- * -EFAULT if any of the data structures point to invalid data. May
- * fail with -EBADF if the file descriptor specified in the first
- * iocb is invalid. May fail with -EAGAIN if insufficient resources
- * are available to queue any iocbs. Will return 0 if nr is 0. Will
- * fail with -ENOSYS if not implemented.
- */
-SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
- struct iocb __user * __user *, iocbpp)
+long do_io_submit(aio_context_t ctx_id, long nr,
+ struct iocb __user *__user *iocbpp, bool compat)
{
struct kioctx *ctx;
long ret = 0;
@@ -1687,7 +1686,7 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
break;
}
- ret = io_submit_one(ctx, user_iocb, &tmp, batch_hash);
+ ret = io_submit_one(ctx, user_iocb, &tmp, batch_hash, compat);
if (ret)
break;
}
@@ -1697,6 +1696,24 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
return i ? i : ret;
}
+/* sys_io_submit:
+ * Queue the nr iocbs pointed to by iocbpp for processing. Returns
+ * the number of iocbs queued. May return -EINVAL if the aio_context
+ * specified by ctx_id is invalid, if nr is < 0, if the iocb at
+ * *iocbpp[0] is not properly initialized, if the operation specified
+ * is invalid for the file descriptor in the iocb. May fail with
+ * -EFAULT if any of the data structures point to invalid data. May
+ * fail with -EBADF if the file descriptor specified in the first
+ * iocb is invalid. May fail with -EAGAIN if insufficient resources
+ * are available to queue any iocbs. Will return 0 if nr is 0. Will
+ * fail with -ENOSYS if not implemented.
+ */
+SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
+ struct iocb __user * __user *, iocbpp)
+{
+ return do_io_submit(ctx_id, nr, iocbpp, false);
+}
+
/* lookup_kiocb
* Finds a given iocb for cancellation.
*/
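
Note: the body of sys_io_submit() is split out as do_io_submit() so
that a compat entry point can share it with compat == true, which in
turn makes aio_setup_vectored_rw() parse the 32-bit iovec layout. A
rough sketch of such a wrapper (widen_iocb_pointers is a hypothetical
helper standing in for the real pointer-widening code):

	#ifdef CONFIG_COMPAT
	asmlinkage long compat_sys_io_submit(aio_context_t ctx_id, int nr,
					     compat_uptr_t __user *iocbpp32)
	{
		struct iocb __user * __user *iocbpp64;

		/* copy the 32-bit iocb pointers into a native-sized
		 * array the shared path can walk */
		iocbpp64 = widen_iocb_pointers(iocbpp32, nr);
		if (IS_ERR(iocbpp64))
			return PTR_ERR(iocbpp64);
		return do_io_submit(ctx_id, nr, iocbpp64, true);
	}
	#endif
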
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 9bd4b3876c99..e4b75d6eda83 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -205,7 +205,7 @@ static struct inode *anon_inode_mkinode(void)
* that it already _is_ on the dirty list.
*/
inode->i_state = I_DIRTY;
- inode->i_mode = S_IFREG | S_IRUSR | S_IWUSR;
+ inode->i_mode = S_IRUSR | S_IWUSR;
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
inode->i_flags |= S_PRIVATE;
diff --git a/fs/attr.c b/fs/attr.c
index 0815e93bb487..b4fa3b0aa596 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -67,14 +67,14 @@ EXPORT_SYMBOL(inode_change_ok);
* @offset: the new size to assign to the inode
* Returns: 0 on success, -ve errno on failure
*
+ * inode_newsize_ok must be called with i_mutex held.
+ *
* inode_newsize_ok will check filesystem limits and ulimits to check that the
* new inode size is within limits. inode_newsize_ok will also send SIGXFSZ
* when necessary. Caller must not proceed with inode size change if failure is
* returned. @inode must be a file (not directory), with appropriate
* permissions to allow truncate (inode_newsize_ok does NOT check these
* conditions).
- *
- * inode_newsize_ok must be called with i_mutex held.
*/
int inode_newsize_ok(const struct inode *inode, loff_t offset)
{
@@ -104,17 +104,25 @@ out_big:
}
EXPORT_SYMBOL(inode_newsize_ok);
-int inode_setattr(struct inode * inode, struct iattr * attr)
+/**
+ * generic_setattr - copy simple metadata updates into the generic inode
+ * @inode: the inode to be updated
+ * @attr: the new attributes
+ *
+ * generic_setattr must be called with i_mutex held.
+ *
+ * generic_setattr updates the inode's metadata with that specified
+ * in attr. Noticeably missing is inode size update, which is more complex
+ * as it requires pagecache updates. See simple_setsize.
+ *
+ * The inode is not marked as dirty after this operation. The rationale is
+ * that for "simple" filesystems, the struct inode is the inode storage.
+ * The caller is free to mark the inode dirty afterwards if needed.
+ */
+void generic_setattr(struct inode *inode, const struct iattr *attr)
{
unsigned int ia_valid = attr->ia_valid;
- if (ia_valid & ATTR_SIZE &&
- attr->ia_size != i_size_read(inode)) {
- int error = vmtruncate(inode, attr->ia_size);
- if (error)
- return error;
- }
-
if (ia_valid & ATTR_UID)
inode->i_uid = attr->ia_uid;
if (ia_valid & ATTR_GID)
@@ -135,6 +143,28 @@ int inode_setattr(struct inode * inode, struct iattr * attr)
mode &= ~S_ISGID;
inode->i_mode = mode;
}
+}
+EXPORT_SYMBOL(generic_setattr);
+
+/*
+ * Note: this function is deprecated; the new truncate sequence should be
+ * used instead -- see e.g. simple_setsize() and generic_setattr().
+ */
+int inode_setattr(struct inode *inode, const struct iattr *attr)
+{
+ unsigned int ia_valid = attr->ia_valid;
+
+ if (ia_valid & ATTR_SIZE &&
+ attr->ia_size != i_size_read(inode)) {
+ int error;
+
+ error = vmtruncate(inode, attr->ia_size);
+ if (error)
+ return error;
+ }
+
+ generic_setattr(inode, attr);
+
mark_inode_dirty(inode);
return 0;
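
Note: with inode_setattr() deprecated in favour of the sequence above,
a "simple" filesystem's ->setattr is expected to look roughly like the
following sketch (foo_setattr is hypothetical; assumes the generic
inode is the only inode storage):

	static int foo_setattr(struct dentry *dentry, struct iattr *attr)
	{
		struct inode *inode = dentry->d_inode;
		int error;

		error = inode_change_ok(inode, attr);
		if (error)
			return error;

		if ((attr->ia_valid & ATTR_SIZE) &&
		    attr->ia_size != i_size_read(inode)) {
			/* updates i_size and truncates pagecache */
			error = simple_setsize(inode, attr->ia_size);
			if (error)
				return error;
		}

		generic_setattr(inode, attr);
		mark_inode_dirty(inode);
		return 0;
	}
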
diff --git a/fs/autofs/root.c b/fs/autofs/root.c
index 8713c7cfbc79..9a0520b50663 100644
--- a/fs/autofs/root.c
+++ b/fs/autofs/root.c
@@ -28,6 +28,7 @@ static int autofs_root_mkdir(struct inode *,struct dentry *,int);
static int autofs_root_ioctl(struct inode *, struct file *,unsigned int,unsigned long);
const struct file_operations autofs_root_operations = {
+ .llseek = generic_file_llseek,
.read = generic_read_dir,
.readdir = autofs_root_readdir,
.ioctl = autofs_root_ioctl,
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index d832062869f6..ba4a38b9c22f 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -95,7 +95,7 @@ static int check_dev_ioctl_version(int cmd, struct autofs_dev_ioctl *param)
*/
static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *in)
{
- struct autofs_dev_ioctl tmp, *ads;
+ struct autofs_dev_ioctl tmp;
if (copy_from_user(&tmp, in, sizeof(tmp)))
return ERR_PTR(-EFAULT);
@@ -103,16 +103,7 @@ static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *i
if (tmp.size < sizeof(tmp))
return ERR_PTR(-EINVAL);
- ads = kmalloc(tmp.size, GFP_KERNEL);
- if (!ads)
- return ERR_PTR(-ENOMEM);
-
- if (copy_from_user(ads, in, tmp.size)) {
- kfree(ads);
- return ERR_PTR(-EFAULT);
- }
-
- return ads;
+ return memdup_user(in, tmp.size);
}
static inline void free_dev_ioctl(struct autofs_dev_ioctl *param)
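
Note: memdup_user() is simply the kmalloc + copy_from_user + unwind
pattern that the removed lines open-coded; roughly:

	/* what memdup_user(in, tmp.size) boils down to */
	void *p = kmalloc(tmp.size, GFP_KERNEL);
	if (!p)
		return ERR_PTR(-ENOMEM);
	if (copy_from_user(p, in, tmp.size)) {
		kfree(p);
		return ERR_PTR(-EFAULT);
	}
	return p;
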
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index a05287a23f62..52e59bf4aa5f 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -93,8 +93,7 @@ static int bad_file_release(struct inode *inode, struct file *filp)
return -EIO;
}
-static int bad_file_fsync(struct file *file, struct dentry *dentry,
- int datasync)
+static int bad_file_fsync(struct file *file, int datasync)
{
return -EIO;
}
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index 8f73841fc974..d967e052b779 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -78,7 +78,7 @@ static int bfs_readdir(struct file *f, void *dirent, filldir_t filldir)
const struct file_operations bfs_dir_operations = {
.read = generic_read_dir,
.readdir = bfs_readdir,
- .fsync = simple_fsync,
+ .fsync = generic_file_fsync,
.llseek = generic_file_llseek,
};
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 2c5f9a0e5d72..63039ed9576f 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -990,10 +990,9 @@ static int elf_fdpic_map_file_constdisp_on_uclinux(
/* clear any space allocated but not loaded */
if (phdr->p_filesz < phdr->p_memsz) {
- ret = clear_user((void *) (seg->addr + phdr->p_filesz),
- phdr->p_memsz - phdr->p_filesz);
- if (ret)
- return ret;
+ if (clear_user((void *) (seg->addr + phdr->p_filesz),
+ phdr->p_memsz - phdr->p_filesz))
+ return -EFAULT;
}
if (mm) {
@@ -1027,7 +1026,7 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
struct elf32_fdpic_loadseg *seg;
struct elf32_phdr *phdr;
unsigned long load_addr, delta_vaddr;
- int loop, dvset, ret;
+ int loop, dvset;
load_addr = params->load_addr;
delta_vaddr = 0;
@@ -1127,9 +1126,8 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
* PT_LOAD */
if (prot & PROT_WRITE && disp > 0) {
kdebug("clear[%d] ad=%lx sz=%lx", loop, maddr, disp);
- ret = clear_user((void __user *) maddr, disp);
- if (ret)
- return ret;
+ if (clear_user((void __user *) maddr, disp))
+ return -EFAULT;
maddr += disp;
}
@@ -1164,19 +1162,17 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
if (prot & PROT_WRITE && excess1 > 0) {
kdebug("clear[%d] ad=%lx sz=%lx",
loop, maddr + phdr->p_filesz, excess1);
- ret = clear_user((void __user *) maddr + phdr->p_filesz,
- excess1);
- if (ret)
- return ret;
+ if (clear_user((void __user *) maddr + phdr->p_filesz,
+ excess1))
+ return -EFAULT;
}
#else
if (excess > 0) {
kdebug("clear[%d] ad=%lx sz=%lx",
loop, maddr + phdr->p_filesz, excess);
- ret = clear_user((void *) maddr + phdr->p_filesz, excess);
- if (ret)
- return ret;
+ if (clear_user((void *) maddr + phdr->p_filesz, excess))
+ return -EFAULT;
}
#endif
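
Note: clear_user() returns the number of bytes it could NOT zero, not
an errno, so returning its result directly would hand a positive byte
count back to the caller; any nonzero residue has to be mapped to
-EFAULT by hand, as the hunks above now do:

	if (clear_user(dst, len))	/* nonzero => partial fault */
		return -EFAULT;
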
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 49566c1687d8..811384bec8de 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -56,16 +56,19 @@
#endif
/*
- * User data (stack, data section and bss) needs to be aligned
- * for the same reasons as SLAB memory is, and to the same amount.
- * Avoid duplicating architecture specific code by using the same
- * macro as with SLAB allocation:
+ * User data (data section and bss) needs to be aligned.
+ * We pick 0x20 here because it is the max value elf2flt has always
+ * used in producing FLAT files, and because it seems to be large
+ * enough to make all the gcc alignment related tests happy.
*/
-#ifdef ARCH_SLAB_MINALIGN
-#define FLAT_DATA_ALIGN (ARCH_SLAB_MINALIGN)
-#else
-#define FLAT_DATA_ALIGN (sizeof(void *))
-#endif
+#define FLAT_DATA_ALIGN (0x20)
+
+/*
+ * User data (stack) also needs to be aligned.
+ * Here we can be a bit looser than the data sections since this
+ * needs to only meet arch ABI requirements.
+ */
+#define FLAT_STACK_ALIGN max_t(unsigned long, sizeof(void *), ARCH_SLAB_MINALIGN)
#define RELOC_FAILED 0xff00ff01 /* Relocation incorrect somewhere */
#define UNLOADED_LIB 0x7ff000ff /* Placeholder for unused library */
@@ -129,7 +132,7 @@ static unsigned long create_flat_tables(
sp = (unsigned long *)p;
sp -= (envc + argc + 2) + 1 + (flat_argvp_envp_on_stack() ? 2 : 0);
- sp = (unsigned long *) ((unsigned long)sp & -FLAT_DATA_ALIGN);
+ sp = (unsigned long *) ((unsigned long)sp & -FLAT_STACK_ALIGN);
argv = sp + 1 + (flat_argvp_envp_on_stack() ? 2 : 0);
envp = argv + (argc + 1);
@@ -589,7 +592,7 @@ static int load_flat_file(struct linux_binprm * bprm,
if (IS_ERR_VALUE(result)) {
printk("Unable to read data+bss, errno %d\n", (int)-result);
do_munmap(current->mm, textpos, text_len);
- do_munmap(current->mm, realdatastart, data_len + extra);
+ do_munmap(current->mm, realdatastart, len);
ret = result;
goto err;
}
@@ -876,7 +879,7 @@ static int load_flat_binary(struct linux_binprm * bprm, struct pt_regs * regs)
stack_len = TOP_OF_ARGS - bprm->p; /* the strings */
stack_len += (bprm->argc + 1) * sizeof(char *); /* the argv array */
stack_len += (bprm->envc + 1) * sizeof(char *); /* the envp array */
- stack_len += FLAT_DATA_ALIGN - 1; /* reserve for upcoming alignment */
+ stack_len += FLAT_STACK_ALIGN - 1; /* reserve for upcoming alignment */
res = load_flat_file(bprm, &libinfo, 0, &stack_len);
if (IS_ERR_VALUE(res))
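
Note: the '& -FLAT_STACK_ALIGN' masking in create_flat_tables() relies
on the alignment being a power of two: in two's complement,
-ALIGN == ~(ALIGN - 1), so the AND rounds an address down to the
boundary. A worked sketch:

	/* round p down to a power-of-two boundary */
	static unsigned long align_down(unsigned long p, unsigned long align)
	{
		return p & -align;	/* e.g. 0x1234 & -0x20 == 0x1220 */
	}
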
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 26e5f5026620..b3171fb0dc9a 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -172,8 +172,9 @@ blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
- return blockdev_direct_IO_no_locking(rw, iocb, inode, I_BDEV(inode),
- iov, offset, nr_segs, blkdev_get_blocks, NULL);
+ return blockdev_direct_IO_no_locking_newtrunc(rw, iocb, inode,
+ I_BDEV(inode), iov, offset, nr_segs,
+ blkdev_get_blocks, NULL);
}
int __sync_blockdev(struct block_device *bdev, int wait)
@@ -309,8 +310,8 @@ static int blkdev_write_begin(struct file *file, struct address_space *mapping,
struct page **pagep, void **fsdata)
{
*pagep = NULL;
- return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
- blkdev_get_block);
+ return block_write_begin_newtrunc(file, mapping, pos, len, flags,
+ pagep, fsdata, blkdev_get_block);
}
static int blkdev_write_end(struct file *file, struct address_space *mapping,
@@ -358,12 +359,7 @@ static loff_t block_llseek(struct file *file, loff_t offset, int origin)
return retval;
}
-/*
- * Filp is never NULL; the only case when ->fsync() is called with
- * NULL first argument is nfsd_sync_dir() and that's not a directory.
- */
-
-int blkdev_fsync(struct file *filp, struct dentry *dentry, int datasync)
+int blkdev_fsync(struct file *filp, int datasync)
{
struct inode *bd_inode = filp->f_mapping->host;
struct block_device *bdev = I_BDEV(bd_inode);
@@ -685,8 +681,8 @@ retry:
if (!bd_may_claim(bdev, whole, holder))
return -EBUSY;
- /* if someone else is claiming, wait for it to finish */
- if (whole->bd_claiming && whole->bd_claiming != holder) {
+ /* if claiming is already in progress, wait for it to finish */
+ if (whole->bd_claiming) {
wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0);
DEFINE_WAIT(wait);
@@ -710,8 +706,13 @@ retry:
* @bdev is about to be opened exclusively. Check @bdev can be opened
* exclusively and mark that an exclusive open is in progress. Each
* successful call to this function must be matched with a call to
- * either bd_claim() or bd_abort_claiming(). If this function
- * succeeds, the matching bd_claim() is guaranteed to succeed.
+ * either bd_finish_claiming() or bd_abort_claiming() (which do not
+ * fail).
+ *
+ * This function is used to gain exclusive access to the block device
+ * without actually causing other exclusive open attempts to fail. It
+ * should be used when the open sequence itself requires exclusive
+ * access but may subsequently fail.
*
* CONTEXT:
* Might sleep.
@@ -738,6 +739,7 @@ static struct block_device *bd_start_claiming(struct block_device *bdev,
return ERR_PTR(-ENXIO);
whole = bdget_disk(disk, 0);
+ module_put(disk->fops->owner);
put_disk(disk);
if (!whole)
return ERR_PTR(-ENOMEM);
@@ -786,15 +788,46 @@ static void bd_abort_claiming(struct block_device *whole, void *holder)
__bd_abort_claiming(whole, holder); /* releases bdev_lock */
}
+/* increment holders when we have a legitimate claim. requires bdev_lock */
+static void __bd_claim(struct block_device *bdev, struct block_device *whole,
+ void *holder)
+{
+ /* note that for a whole device bd_holders
+ * will be incremented twice, and bd_holder will
+ * be set to bd_claim before being set to holder
+ */
+ whole->bd_holders++;
+ whole->bd_holder = bd_claim;
+ bdev->bd_holders++;
+ bdev->bd_holder = holder;
+}
+
+/**
+ * bd_finish_claiming - finish claiming a block device
+ * @bdev: block device of interest (passed to bd_start_claiming())
+ * @whole: whole block device returned by bd_start_claiming()
+ * @holder: holder trying to claim @bdev
+ *
+ * Finish a block device claim started by bd_start_claiming().
+ *
+ * CONTEXT:
+ * Grabs and releases bdev_lock.
+ */
+static void bd_finish_claiming(struct block_device *bdev,
+ struct block_device *whole, void *holder)
+{
+ spin_lock(&bdev_lock);
+ BUG_ON(!bd_may_claim(bdev, whole, holder));
+ __bd_claim(bdev, whole, holder);
+ __bd_abort_claiming(whole, holder); /* not actually an abort */
+}
+
/**
* bd_claim - claim a block device
* @bdev: block device to claim
* @holder: holder trying to claim @bdev
*
- * Try to claim @bdev which must have been opened successfully. This
- * function may be called with or without preceding
- * blk_start_claiming(). In the former case, this function is always
- * successful and terminates the claiming block.
+ * Try to claim @bdev which must have been opened successfully.
*
* CONTEXT:
* Might sleep.
@@ -810,23 +843,10 @@ int bd_claim(struct block_device *bdev, void *holder)
might_sleep();
spin_lock(&bdev_lock);
-
res = bd_prepare_to_claim(bdev, whole, holder);
- if (res == 0) {
- /* note that for a whole device bd_holders
- * will be incremented twice, and bd_holder will
- * be set to bd_claim before being set to holder
- */
- whole->bd_holders++;
- whole->bd_holder = bd_claim;
- bdev->bd_holders++;
- bdev->bd_holder = holder;
- }
-
- if (whole->bd_claiming)
- __bd_abort_claiming(whole, holder); /* releases bdev_lock */
- else
- spin_unlock(&bdev_lock);
+ if (res == 0)
+ __bd_claim(bdev, whole, holder);
+ spin_unlock(&bdev_lock);
return res;
}
@@ -1480,7 +1500,7 @@ static int blkdev_open(struct inode * inode, struct file * filp)
if (whole) {
if (res == 0)
- BUG_ON(bd_claim(bdev, filp) != 0);
+ bd_finish_claiming(bdev, whole, filp);
else
bd_abort_claiming(whole, filp);
}
@@ -1716,7 +1736,7 @@ struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *h
if ((mode & FMODE_WRITE) && bdev_read_only(bdev))
goto out_blkdev_put;
- BUG_ON(bd_claim(bdev, holder) != 0);
+ bd_finish_claiming(bdev, whole, holder);
return bdev;
out_blkdev_put:
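
Note: the claiming protocol introduced above is a three-step sequence;
sketched usage (do_open_step is a hypothetical stand-in for the actual
open work between the two calls):

	whole = bd_start_claiming(bdev, holder);	/* may sleep or fail */
	if (IS_ERR(whole))
		return PTR_ERR(whole);

	err = do_open_step(bdev);
	if (err)
		bd_abort_claiming(whole, holder);	/* releases the claim */
	else
		bd_finish_claiming(bdev, whole, holder); /* cannot fail */
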
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 8d432cd9d580..2222d161c7b6 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -60,6 +60,8 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
size = __btrfs_getxattr(inode, name, value, size);
if (size > 0) {
acl = posix_acl_from_xattr(value, size);
+ if (IS_ERR(acl))
+ return acl;
set_cached_acl(inode, type, acl);
}
kfree(value);
@@ -160,6 +162,12 @@ static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name,
int ret;
struct posix_acl *acl = NULL;
+ if (!is_owner_or_cap(dentry->d_inode))
+ return -EPERM;
+
+ if (!IS_POSIXACL(dentry->d_inode))
+ return -EOPNOTSUPP;
+
if (value) {
acl = posix_acl_from_xattr(value, size);
if (acl == NULL) {
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 462859a30141..7ec14097fef1 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -377,6 +377,7 @@ again:
if (!list_empty(&worker->pending) ||
!list_empty(&worker->prio_pending)) {
spin_unlock_irq(&worker->lock);
+ set_current_state(TASK_RUNNING);
goto again;
}
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 7a4dee199832..6ad63f17eca0 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -137,8 +137,8 @@ struct btrfs_inode {
* of extent items we've reserved metadata for.
*/
spinlock_t accounting_lock;
+ atomic_t outstanding_extents;
int reserved_extents;
- int outstanding_extents;
/*
* ordered_data_close is set by truncate when a file that used
@@ -151,6 +151,7 @@ struct btrfs_inode {
* of these.
*/
unsigned ordered_data_close:1;
+ unsigned orphan_meta_reserved:1;
unsigned dummy_inode:1;
/*
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 6795a713b205..c3df14ce2cc2 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -280,7 +280,8 @@ int btrfs_block_can_be_shared(struct btrfs_root *root,
static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf,
- struct extent_buffer *cow)
+ struct extent_buffer *cow,
+ int *last_ref)
{
u64 refs;
u64 owner;
@@ -366,6 +367,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
BUG_ON(ret);
}
clean_tree_block(trans, root, buf);
+ *last_ref = 1;
}
return 0;
}
@@ -392,6 +394,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_disk_key disk_key;
struct extent_buffer *cow;
int level;
+ int last_ref = 0;
int unlock_orig = 0;
u64 parent_start;
@@ -442,7 +445,10 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
(unsigned long)btrfs_header_fsid(cow),
BTRFS_FSID_SIZE);
- update_ref_for_cow(trans, root, buf, cow);
+ update_ref_for_cow(trans, root, buf, cow, &last_ref);
+
+ if (root->ref_cows)
+ btrfs_reloc_cow_block(trans, root, buf, cow);
if (buf == root->node) {
WARN_ON(parent && parent != buf);
@@ -457,8 +463,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
extent_buffer_get(cow);
spin_unlock(&root->node_lock);
- btrfs_free_tree_block(trans, root, buf->start, buf->len,
- parent_start, root->root_key.objectid, level);
+ btrfs_free_tree_block(trans, root, buf, parent_start,
+ last_ref);
free_extent_buffer(buf);
add_root_to_dirty_list(root);
} else {
@@ -473,8 +479,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
btrfs_set_node_ptr_generation(parent, parent_slot,
trans->transid);
btrfs_mark_buffer_dirty(parent);
- btrfs_free_tree_block(trans, root, buf->start, buf->len,
- parent_start, root->root_key.objectid, level);
+ btrfs_free_tree_block(trans, root, buf, parent_start,
+ last_ref);
}
if (unlock_orig)
btrfs_tree_unlock(buf);
@@ -949,6 +955,22 @@ int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
return bin_search(eb, key, level, slot);
}
+static void root_add_used(struct btrfs_root *root, u32 size)
+{
+ spin_lock(&root->accounting_lock);
+ btrfs_set_root_used(&root->root_item,
+ btrfs_root_used(&root->root_item) + size);
+ spin_unlock(&root->accounting_lock);
+}
+
+static void root_sub_used(struct btrfs_root *root, u32 size)
+{
+ spin_lock(&root->accounting_lock);
+ btrfs_set_root_used(&root->root_item,
+ btrfs_root_used(&root->root_item) - size);
+ spin_unlock(&root->accounting_lock);
+}
+
/* given a node and slot number, this reads the blocks it points to. The
* extent buffer is returned with a reference taken (but unlocked).
* NULL is returned on error.
@@ -1019,7 +1041,11 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
btrfs_tree_lock(child);
btrfs_set_lock_blocking(child);
ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_tree_unlock(child);
+ free_extent_buffer(child);
+ goto enospc;
+ }
spin_lock(&root->node_lock);
root->node = child;
@@ -1034,11 +1060,12 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
btrfs_tree_unlock(mid);
/* once for the path */
free_extent_buffer(mid);
- ret = btrfs_free_tree_block(trans, root, mid->start, mid->len,
- 0, root->root_key.objectid, level);
+
+ root_sub_used(root, mid->len);
+ btrfs_free_tree_block(trans, root, mid, 0, 1);
/* once for the root ptr */
free_extent_buffer(mid);
- return ret;
+ return 0;
}
if (btrfs_header_nritems(mid) >
BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
@@ -1088,23 +1115,16 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
if (wret < 0 && wret != -ENOSPC)
ret = wret;
if (btrfs_header_nritems(right) == 0) {
- u64 bytenr = right->start;
- u32 blocksize = right->len;
-
clean_tree_block(trans, root, right);
btrfs_tree_unlock(right);
- free_extent_buffer(right);
- right = NULL;
wret = del_ptr(trans, root, path, level + 1, pslot +
1);
if (wret)
ret = wret;
- wret = btrfs_free_tree_block(trans, root,
- bytenr, blocksize, 0,
- root->root_key.objectid,
- level);
- if (wret)
- ret = wret;
+ root_sub_used(root, right->len);
+ btrfs_free_tree_block(trans, root, right, 0, 1);
+ free_extent_buffer(right);
+ right = NULL;
} else {
struct btrfs_disk_key right_key;
btrfs_node_key(right, &right_key, 0);
@@ -1136,21 +1156,15 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
BUG_ON(wret == 1);
}
if (btrfs_header_nritems(mid) == 0) {
- /* we've managed to empty the middle node, drop it */
- u64 bytenr = mid->start;
- u32 blocksize = mid->len;
-
clean_tree_block(trans, root, mid);
btrfs_tree_unlock(mid);
- free_extent_buffer(mid);
- mid = NULL;
wret = del_ptr(trans, root, path, level + 1, pslot);
if (wret)
ret = wret;
- wret = btrfs_free_tree_block(trans, root, bytenr, blocksize,
- 0, root->root_key.objectid, level);
- if (wret)
- ret = wret;
+ root_sub_used(root, mid->len);
+ btrfs_free_tree_block(trans, root, mid, 0, 1);
+ free_extent_buffer(mid);
+ mid = NULL;
} else {
/* update the parent key to reflect our changes */
struct btrfs_disk_key mid_key;
@@ -1590,7 +1604,7 @@ read_block_for_search(struct btrfs_trans_handle *trans,
btrfs_release_path(NULL, p);
ret = -EAGAIN;
- tmp = read_tree_block(root, blocknr, blocksize, gen);
+ tmp = read_tree_block(root, blocknr, blocksize, 0);
if (tmp) {
/*
* If the read above didn't mark this buffer up to date,
@@ -1740,7 +1754,6 @@ again:
p->nodes[level + 1],
p->slots[level + 1], &b);
if (err) {
- free_extent_buffer(b);
ret = err;
goto done;
}
@@ -2076,6 +2089,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
if (IS_ERR(c))
return PTR_ERR(c);
+ root_add_used(root, root->nodesize);
+
memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header));
btrfs_set_header_nritems(c, 1);
btrfs_set_header_level(c, level);
@@ -2134,6 +2149,7 @@ static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
int nritems;
BUG_ON(!path->nodes[level]);
+ btrfs_assert_tree_locked(path->nodes[level]);
lower = path->nodes[level];
nritems = btrfs_header_nritems(lower);
BUG_ON(slot > nritems);
@@ -2202,6 +2218,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
if (IS_ERR(split))
return PTR_ERR(split);
+ root_add_used(root, root->nodesize);
+
memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header));
btrfs_set_header_level(split, btrfs_header_level(c));
btrfs_set_header_bytenr(split, split->start);
@@ -2286,12 +2304,17 @@ noinline int btrfs_leaf_free_space(struct btrfs_root *root,
return ret;
}
+/*
+ * min slot controls the lowest index we're willing to push to the
+ * right. We'll push up to and including min_slot, but no lower
+ */
static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
int data_size, int empty,
struct extent_buffer *right,
- int free_space, u32 left_nritems)
+ int free_space, u32 left_nritems,
+ u32 min_slot)
{
struct extent_buffer *left = path->nodes[0];
struct extent_buffer *upper = path->nodes[1];
@@ -2309,7 +2332,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
if (empty)
nr = 0;
else
- nr = 1;
+ nr = max_t(u32, 1, min_slot);
if (path->slots[0] >= left_nritems)
push_space += data_size;
@@ -2415,6 +2438,9 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
if (left_nritems)
btrfs_mark_buffer_dirty(left);
+ else
+ clean_tree_block(trans, root, left);
+
btrfs_mark_buffer_dirty(right);
btrfs_item_key(right, &disk_key, 0);
@@ -2448,10 +2474,14 @@ out_unlock:
*
* returns 1 if the push failed because the other node didn't have enough
* room, 0 if everything worked out and < 0 if there were major errors.
+ *
+ * this will push starting from min_slot to the end of the leaf. It won't
+ * push any slot lower than min_slot
*/
static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
- *root, struct btrfs_path *path, int data_size,
- int empty)
+ *root, struct btrfs_path *path,
+ int min_data_size, int data_size,
+ int empty, u32 min_slot)
{
struct extent_buffer *left = path->nodes[0];
struct extent_buffer *right;
@@ -2493,8 +2523,8 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
if (left_nritems == 0)
goto out_unlock;
- return __push_leaf_right(trans, root, path, data_size, empty,
- right, free_space, left_nritems);
+ return __push_leaf_right(trans, root, path, min_data_size, empty,
+ right, free_space, left_nritems, min_slot);
out_unlock:
btrfs_tree_unlock(right);
free_extent_buffer(right);
@@ -2504,12 +2534,17 @@ out_unlock:
/*
* push some data in the path leaf to the left, trying to free up at
* least data_size bytes. returns zero if the push worked, nonzero otherwise
+ *
+ * max_slot can put a limit on how far into the leaf we'll push items. The
* item at 'max_slot' won't be touched. Use (u32)-1 to make us push all the
+ * items
*/
static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, int data_size,
int empty, struct extent_buffer *left,
- int free_space, int right_nritems)
+ int free_space, u32 right_nritems,
+ u32 max_slot)
{
struct btrfs_disk_key disk_key;
struct extent_buffer *right = path->nodes[0];
@@ -2528,9 +2563,9 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
slot = path->slots[1];
if (empty)
- nr = right_nritems;
+ nr = min(right_nritems, max_slot);
else
- nr = right_nritems - 1;
+ nr = min(right_nritems - 1, max_slot);
for (i = 0; i < nr; i++) {
item = btrfs_item_nr(right, i);
@@ -2660,6 +2695,8 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(left);
if (right_nritems)
btrfs_mark_buffer_dirty(right);
+ else
+ clean_tree_block(trans, root, right);
btrfs_item_key(right, &disk_key, 0);
wret = fixup_low_keys(trans, root, path, &disk_key, 1);
@@ -2669,8 +2706,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
/* then fixup the leaf pointer in the path */
if (path->slots[0] < push_items) {
path->slots[0] += old_left_nritems;
- if (btrfs_header_nritems(path->nodes[0]) == 0)
- clean_tree_block(trans, root, path->nodes[0]);
btrfs_tree_unlock(path->nodes[0]);
free_extent_buffer(path->nodes[0]);
path->nodes[0] = left;
@@ -2691,10 +2726,14 @@ out:
/*
* push some data in the path leaf to the left, trying to free up at
* least data_size bytes. returns zero if the push worked, nonzero otherwise
+ *
+ * max_slot can put a limit on how far into the leaf we'll push items. The
+ * item at 'max_slot' won't be touched. Use (u32)-1 to make us push all the
+ * items
*/
static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
- *root, struct btrfs_path *path, int data_size,
- int empty)
+ *root, struct btrfs_path *path, int min_data_size,
+ int data_size, int empty, u32 max_slot)
{
struct extent_buffer *right = path->nodes[0];
struct extent_buffer *left;
@@ -2740,8 +2779,9 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
goto out;
}
- return __push_leaf_left(trans, root, path, data_size,
- empty, left, free_space, right_nritems);
+ return __push_leaf_left(trans, root, path, min_data_size,
+ empty, left, free_space, right_nritems,
+ max_slot);
out:
btrfs_tree_unlock(left);
free_extent_buffer(left);
@@ -2834,6 +2874,64 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans,
}
/*
+ * double splits happen when we need to insert a big item in the middle
+ * of a leaf. A double split can leave us with 3 mostly empty leaves:
+ * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
+ * A B C
+ *
+ * We avoid this by trying to push the items on either side of our target
+ * into the adjacent leaves. If all goes well we can avoid the double split
+ * completely.
+ */
+static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ int data_size)
+{
+ int ret;
+ int progress = 0;
+ int slot;
+ u32 nritems;
+
+ slot = path->slots[0];
+
+ /*
+ * try to push all the items after our slot into the
+ * right leaf
+ */
+ ret = push_leaf_right(trans, root, path, 1, data_size, 0, slot);
+ if (ret < 0)
+ return ret;
+
+ if (ret == 0)
+ progress++;
+
+ nritems = btrfs_header_nritems(path->nodes[0]);
+ /*
+ * our goal is to get our slot at the start or end of a leaf. If
+ * we've done so we're done
+ */
+ if (path->slots[0] == 0 || path->slots[0] == nritems)
+ return 0;
+
+ if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
+ return 0;
+
+ /* try to push all the items before our slot into the next leaf */
+ slot = path->slots[0];
+ ret = push_leaf_left(trans, root, path, 1, data_size, 0, slot);
+ if (ret < 0)
+ return ret;
+
+ if (ret == 0)
+ progress++;
+
+ if (progress)
+ return 0;
+ return 1;
+}
+
+/*
* split the path's leaf in two, making sure there is at least data_size
* available for the resulting leaf level of the path.
*
@@ -2855,6 +2953,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
int wret;
int split;
int num_doubles = 0;
+ int tried_avoid_double = 0;
l = path->nodes[0];
slot = path->slots[0];
@@ -2863,12 +2962,14 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
return -EOVERFLOW;
/* first try to make some room by pushing left and right */
- if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) {
- wret = push_leaf_right(trans, root, path, data_size, 0);
+ if (data_size) {
+ wret = push_leaf_right(trans, root, path, data_size,
+ data_size, 0, 0);
if (wret < 0)
return wret;
if (wret) {
- wret = push_leaf_left(trans, root, path, data_size, 0);
+ wret = push_leaf_left(trans, root, path, data_size,
+ data_size, 0, (u32)-1);
if (wret < 0)
return wret;
}
@@ -2902,6 +3003,8 @@ again:
if (mid != nritems &&
leaf_space_used(l, mid, nritems - mid) +
data_size > BTRFS_LEAF_DATA_SIZE(root)) {
+ if (data_size && !tried_avoid_double)
+ goto push_for_double;
split = 2;
}
}
@@ -2918,6 +3021,8 @@ again:
if (mid != nritems &&
leaf_space_used(l, mid, nritems - mid) +
data_size > BTRFS_LEAF_DATA_SIZE(root)) {
+ if (data_size && !tried_avoid_double)
+ goto push_for_double;
split = 2;
}
}
@@ -2932,10 +3037,10 @@ again:
right = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
root->root_key.objectid,
&disk_key, 0, l->start, 0);
- if (IS_ERR(right)) {
- BUG_ON(1);
+ if (IS_ERR(right))
return PTR_ERR(right);
- }
+
+ root_add_used(root, root->leafsize);
memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
btrfs_set_header_bytenr(right, right->start);
@@ -2998,6 +3103,13 @@ again:
}
return ret;
+
+push_for_double:
+ push_for_double_split(trans, root, path, data_size);
+ tried_avoid_double = 1;
+ if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
+ return 0;
+ goto again;
}
static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
@@ -3054,7 +3166,8 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
btrfs_set_path_blocking(path);
ret = split_leaf(trans, root, &key, path, ins_len, 1);
- BUG_ON(ret);
+ if (ret)
+ goto err;
path->keep_locks = 0;
btrfs_unlock_up_safe(path, 1);
@@ -3796,9 +3909,10 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
*/
btrfs_unlock_up_safe(path, 0);
- ret = btrfs_free_tree_block(trans, root, leaf->start, leaf->len,
- 0, root->root_key.objectid, 0);
- return ret;
+ root_sub_used(root, leaf->len);
+
+ btrfs_free_tree_block(trans, root, leaf, 0, 1);
+ return 0;
}
/*
* delete the item at the leaf level in path. If that empties
@@ -3865,6 +3979,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
if (leaf == root->node) {
btrfs_set_header_level(leaf, 0);
} else {
+ btrfs_set_path_blocking(path);
+ clean_tree_block(trans, root, leaf);
ret = btrfs_del_leaf(trans, root, path, leaf);
BUG_ON(ret);
}
@@ -3890,13 +4006,15 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
extent_buffer_get(leaf);
btrfs_set_path_blocking(path);
- wret = push_leaf_left(trans, root, path, 1, 1);
+ wret = push_leaf_left(trans, root, path, 1, 1,
+ 1, (u32)-1);
if (wret < 0 && wret != -ENOSPC)
ret = wret;
if (path->nodes[0] == leaf &&
btrfs_header_nritems(leaf)) {
- wret = push_leaf_right(trans, root, path, 1, 1);
+ wret = push_leaf_right(trans, root, path, 1,
+ 1, 1, 0);
if (wret < 0 && wret != -ENOSPC)
ret = wret;
}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 746a7248678e..29c20092847e 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -34,6 +34,7 @@
struct btrfs_trans_handle;
struct btrfs_transaction;
+struct btrfs_pending_snapshot;
extern struct kmem_cache *btrfs_trans_handle_cachep;
extern struct kmem_cache *btrfs_transaction_cachep;
extern struct kmem_cache *btrfs_bit_radix_cachep;
@@ -663,6 +664,7 @@ struct btrfs_csum_item {
#define BTRFS_BLOCK_GROUP_RAID1 (1 << 4)
#define BTRFS_BLOCK_GROUP_DUP (1 << 5)
#define BTRFS_BLOCK_GROUP_RAID10 (1 << 6)
+#define BTRFS_NR_RAID_TYPES 5
struct btrfs_block_group_item {
__le64 used;
@@ -674,42 +676,46 @@ struct btrfs_space_info {
u64 flags;
u64 total_bytes; /* total bytes in the space */
- u64 bytes_used; /* total bytes used on disk */
+ u64 bytes_used; /* total bytes used,
+ this doesn't take mirrors into account */
u64 bytes_pinned; /* total bytes pinned, will be freed when the
transaction finishes */
u64 bytes_reserved; /* total bytes the allocator has reserved for
current allocations */
u64 bytes_readonly; /* total bytes that are read only */
- u64 bytes_super; /* total bytes reserved for the super blocks */
- u64 bytes_root; /* the number of bytes needed to commit a
- transaction */
+
u64 bytes_may_use; /* number of bytes that may be used for
delalloc/allocations */
- u64 bytes_delalloc; /* number of bytes currently reserved for
- delayed allocation */
+ u64 disk_used; /* total bytes used on disk */
int full; /* indicates that we cannot allocate any more
chunks for this space */
int force_alloc; /* set if we need to force a chunk alloc for
this space */
- int force_delalloc; /* make people start doing filemap_flush until
- we're under a threshold */
struct list_head list;
- /* for controlling how we free up space for allocations */
- wait_queue_head_t allocate_wait;
- wait_queue_head_t flush_wait;
- int allocating_chunk;
- int flushing;
-
/* for block groups in our same type */
- struct list_head block_groups;
+ struct list_head block_groups[BTRFS_NR_RAID_TYPES];
spinlock_t lock;
struct rw_semaphore groups_sem;
atomic_t caching_threads;
};
+struct btrfs_block_rsv {
+ u64 size;
+ u64 reserved;
+ u64 freed[2];
+ struct btrfs_space_info *space_info;
+ struct list_head list;
+ spinlock_t lock;
+ atomic_t usage;
+ unsigned int priority:8;
+ unsigned int durable:1;
+ unsigned int refill_used:1;
+ unsigned int full:1;
+};
+
/*
* free clusters are used to claim free space in relatively large chunks,
* allowing us to do less seeky writes. They are used for all metadata
@@ -760,6 +766,7 @@ struct btrfs_block_group_cache {
spinlock_t lock;
u64 pinned;
u64 reserved;
+ u64 reserved_pinned;
u64 bytes_super;
u64 flags;
u64 sectorsize;
@@ -825,6 +832,22 @@ struct btrfs_fs_info {
/* logical->physical extent mapping */
struct btrfs_mapping_tree mapping_tree;
+ /* block reservation for extent, checksum and root tree */
+ struct btrfs_block_rsv global_block_rsv;
+ /* block reservation for delay allocation */
+ struct btrfs_block_rsv delalloc_block_rsv;
+ /* block reservation for metadata operations */
+ struct btrfs_block_rsv trans_block_rsv;
+ /* block reservation for chunk tree */
+ struct btrfs_block_rsv chunk_block_rsv;
+
+ struct btrfs_block_rsv empty_block_rsv;
+
+ /* list of block reservations that cross multiple transactions */
+ struct list_head durable_block_rsv_list;
+
+ struct mutex durable_block_rsv_mutex;
+
u64 generation;
u64 last_trans_committed;
@@ -927,7 +950,6 @@ struct btrfs_fs_info {
struct btrfs_workers endio_meta_write_workers;
struct btrfs_workers endio_write_workers;
struct btrfs_workers submit_workers;
- struct btrfs_workers enospc_workers;
/*
* fixup workers take dirty pages that didn't properly go through
* the cow mechanism and make them safe to write. It happens
@@ -943,6 +965,7 @@ struct btrfs_fs_info {
int do_barriers;
int closing;
int log_root_recovering;
+ int enospc_unlink;
u64 total_pinned;
@@ -1012,6 +1035,9 @@ struct btrfs_root {
struct completion kobj_unregister;
struct mutex objectid_mutex;
+ spinlock_t accounting_lock;
+ struct btrfs_block_rsv *block_rsv;
+
struct mutex log_mutex;
wait_queue_head_t log_writer_wait;
wait_queue_head_t log_commit_wait[2];
@@ -1043,7 +1069,6 @@ struct btrfs_root {
int ref_cows;
int track_dirty;
int in_radix;
- int clean_orphans;
u64 defrag_trans_start;
struct btrfs_key defrag_progress;
@@ -1057,8 +1082,11 @@ struct btrfs_root {
struct list_head root_list;
- spinlock_t list_lock;
+ spinlock_t orphan_lock;
struct list_head orphan_list;
+ struct btrfs_block_rsv *orphan_block_rsv;
+ int orphan_item_inserted;
+ int orphan_cleanup_state;
spinlock_t inode_lock;
/* red-black tree that keeps track of in-memory inodes */
@@ -1965,6 +1993,9 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_root *root, unsigned long count);
int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
+int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 bytenr,
+ u64 num_bytes, u64 *refs, u64 *flags);
int btrfs_pin_extent(struct btrfs_root *root,
u64 bytenr, u64 num, int reserved);
int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
@@ -1984,10 +2015,10 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
u64 parent, u64 root_objectid,
struct btrfs_disk_key *key, int level,
u64 hint, u64 empty_size);
-int btrfs_free_tree_block(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- u64 bytenr, u32 blocksize,
- u64 parent, u64 root_objectid, int level);
+void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *buf,
+ u64 parent, int last_ref);
struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u32 blocksize,
@@ -2041,27 +2072,49 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
u64 size);
int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 group_start);
-int btrfs_prepare_block_group_relocation(struct btrfs_root *root,
- struct btrfs_block_group_cache *group);
-
u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde);
void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
-
-int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items);
-int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items);
-int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
- struct inode *inode, int num_items);
-int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
- struct inode *inode, int num_items);
-int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
- u64 bytes);
-void btrfs_free_reserved_data_space(struct btrfs_root *root,
- struct inode *inode, u64 bytes);
-void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
- u64 bytes);
-void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
- u64 bytes);
+int btrfs_check_data_free_space(struct inode *inode, u64 bytes);
+void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
+int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ int num_items, int *retries);
+void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
+int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
+ struct inode *inode);
+void btrfs_orphan_release_metadata(struct inode *inode);
+int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
+ struct btrfs_pending_snapshot *pending);
+int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes);
+void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes);
+int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes);
+void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes);
+void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv);
+struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root);
+void btrfs_free_block_rsv(struct btrfs_root *root,
+ struct btrfs_block_rsv *rsv);
+void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_rsv *rsv);
+int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv,
+ u64 num_bytes, int *retries);
+int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv,
+ u64 min_reserved, int min_factor);
+int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
+ struct btrfs_block_rsv *dst_rsv,
+ u64 num_bytes);
+void btrfs_block_rsv_release(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv,
+ u64 num_bytes);
+int btrfs_set_block_group_ro(struct btrfs_root *root,
+ struct btrfs_block_group_cache *cache);
+int btrfs_set_block_group_rw(struct btrfs_root *root,
+ struct btrfs_block_group_cache *cache);
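
Note: a usage sketch for the block reservation API declared above
(illustrative only, not part of this patch; error handling and the
*retries protocol are elided):

	struct btrfs_block_rsv *rsv = btrfs_alloc_block_rsv(root);
	int retries = 0;

	ret = btrfs_block_rsv_add(trans, root, rsv, num_bytes, &retries);
	if (!ret) {
		/* consume the reservation for metadata allocations */
		btrfs_block_rsv_release(root, rsv, num_bytes);
	}
	btrfs_free_block_rsv(root, rsv);
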
/* ctree.c */
int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
int level, int *slot);
@@ -2152,7 +2205,8 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf);
-int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref);
+int btrfs_drop_snapshot(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv, int update_ref);
int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *node,
@@ -2245,6 +2299,12 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
const char *name, int name_len,
u64 inode_objectid, u64 ref_objectid, u64 *index);
+struct btrfs_inode_ref *
+btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ const char *name, int name_len,
+ u64 inode_objectid, u64 ref_objectid, int mod);
int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 objectid);
@@ -2257,6 +2317,8 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytenr, u64 len);
int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
struct bio *bio, u32 *dst);
+int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
+ struct bio *bio, u64 logical_offset, u32 *dst);
int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 objectid, u64 pos,
@@ -2311,6 +2373,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
u32 min_type);
int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
+int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput);
int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
struct extent_state **cached_state);
int btrfs_writepages(struct address_space *mapping,
@@ -2349,10 +2412,20 @@ int btrfs_update_inode(struct btrfs_trans_handle *trans,
int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
void btrfs_orphan_cleanup(struct btrfs_root *root);
+void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
+ struct btrfs_pending_snapshot *pending,
+ u64 *bytes_to_reserve);
+void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
+ struct btrfs_pending_snapshot *pending);
+void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
int btrfs_cont_expand(struct inode *inode, loff_t size);
int btrfs_invalidate_inodes(struct btrfs_root *root);
void btrfs_add_delayed_iput(struct inode *inode);
void btrfs_run_delayed_iputs(struct btrfs_root *root);
+int btrfs_prealloc_file_range(struct inode *inode, int mode,
+ u64 start, u64 num_bytes, u64 min_size,
+ loff_t actual_len, u64 *alloc_hint);
extern const struct dentry_operations btrfs_dentry_operations;
/* ioctl.c */
@@ -2361,7 +2434,7 @@ void btrfs_update_iflags(struct inode *inode);
void btrfs_inherit_iflags(struct inode *inode, struct inode *dir);
/* file.c */
-int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync);
+int btrfs_sync_file(struct file *file, int datasync);
int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
int skip_pinned);
int btrfs_check_file(struct btrfs_root *root, struct inode *inode);
@@ -2409,4 +2482,12 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_recover_relocation(struct btrfs_root *root);
int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len);
+void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct extent_buffer *buf,
+ struct extent_buffer *cow);
+void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans,
+ struct btrfs_pending_snapshot *pending,
+ u64 *bytes_to_reserve);
+void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
+ struct btrfs_pending_snapshot *pending);
#endif
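
The prototypes above replace the old per-item metadata accounting (btrfs_reserve_metadata_space() and friends) with pooled block reservations. A minimal usage sketch, assuming only the prototypes declared in this header, with error handling trimmed; the function below is illustrative and not part of the patch:

/*
 * Sketch: reserve metadata for an operation, then hand back whatever
 * is left.  btrfs_block_rsv_add() may allocate a chunk or flush
 * delalloc internally before returning -ENOSPC.
 */
static int sketch_reserve_metadata(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root, u64 bytes)
{
	struct btrfs_block_rsv *rsv;
	int retries = 0;
	int ret;

	rsv = btrfs_alloc_block_rsv(root);
	if (!rsv)
		return -ENOMEM;

	ret = btrfs_block_rsv_add(trans, root, rsv, bytes, &retries);
	if (!ret) {
		/* ... modify trees backed by the reservation ... */
		btrfs_block_rsv_release(root, rsv, bytes);
	}

	btrfs_free_block_rsv(root, rsv);
	return ret;
}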
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 902ce507c4e3..e807b143b857 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -319,107 +319,6 @@ out:
}
/*
- * helper function to lookup reference count and flags of extent.
- *
- * the head node for delayed ref is used to store the sum of all the
- * reference count modifications queued up in the rbtree. the head
- * node may also store the extent flags to set. This way you can check
- * to see what the reference count and extent flags would be if all of
- * the delayed refs are not processed.
- */
-int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u64 bytenr,
- u64 num_bytes, u64 *refs, u64 *flags)
-{
- struct btrfs_delayed_ref_node *ref;
- struct btrfs_delayed_ref_head *head;
- struct btrfs_delayed_ref_root *delayed_refs;
- struct btrfs_path *path;
- struct btrfs_extent_item *ei;
- struct extent_buffer *leaf;
- struct btrfs_key key;
- u32 item_size;
- u64 num_refs;
- u64 extent_flags;
- int ret;
-
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
- key.objectid = bytenr;
- key.type = BTRFS_EXTENT_ITEM_KEY;
- key.offset = num_bytes;
- delayed_refs = &trans->transaction->delayed_refs;
-again:
- ret = btrfs_search_slot(trans, root->fs_info->extent_root,
- &key, path, 0, 0);
- if (ret < 0)
- goto out;
-
- if (ret == 0) {
- leaf = path->nodes[0];
- item_size = btrfs_item_size_nr(leaf, path->slots[0]);
- if (item_size >= sizeof(*ei)) {
- ei = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_extent_item);
- num_refs = btrfs_extent_refs(leaf, ei);
- extent_flags = btrfs_extent_flags(leaf, ei);
- } else {
-#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
- struct btrfs_extent_item_v0 *ei0;
- BUG_ON(item_size != sizeof(*ei0));
- ei0 = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_extent_item_v0);
- num_refs = btrfs_extent_refs_v0(leaf, ei0);
- /* FIXME: this isn't correct for data */
- extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
-#else
- BUG();
-#endif
- }
- BUG_ON(num_refs == 0);
- } else {
- num_refs = 0;
- extent_flags = 0;
- ret = 0;
- }
-
- spin_lock(&delayed_refs->lock);
- ref = find_ref_head(&delayed_refs->root, bytenr, NULL);
- if (ref) {
- head = btrfs_delayed_node_to_head(ref);
- if (!mutex_trylock(&head->mutex)) {
- atomic_inc(&ref->refs);
- spin_unlock(&delayed_refs->lock);
-
- btrfs_release_path(root->fs_info->extent_root, path);
-
- mutex_lock(&head->mutex);
- mutex_unlock(&head->mutex);
- btrfs_put_delayed_ref(ref);
- goto again;
- }
- if (head->extent_op && head->extent_op->update_flags)
- extent_flags |= head->extent_op->flags_to_set;
- else
- BUG_ON(num_refs == 0);
-
- num_refs += ref->ref_mod;
- mutex_unlock(&head->mutex);
- }
- WARN_ON(num_refs == 0);
- if (refs)
- *refs = num_refs;
- if (flags)
- *flags = extent_flags;
-out:
- spin_unlock(&delayed_refs->lock);
- btrfs_free_path(path);
- return ret;
-}
-
-/*
* helper function to update an extent delayed ref in the
* rbtree. existing and update must both have the same
* bytenr and parent
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index f6fc67ddad36..50e3cf92fbda 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -167,9 +167,6 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *
btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr);
int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr);
-int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u64 bytenr,
- u64 num_bytes, u64 *refs, u64 *flags);
int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes, u64 orig_parent,
u64 parent, u64 orig_ref_root, u64 ref_root,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index feca04197d02..34f7c375567e 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -74,6 +74,11 @@ struct async_submit_bio {
int rw;
int mirror_num;
unsigned long bio_flags;
+ /*
+ * bio_offset is optional; it can be used if the pages in the bio
+ * can't tell us where in the file the bio should go
+ */
+ u64 bio_offset;
struct btrfs_work work;
};
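
Every submit hook gains the same trailing u64, as the __btree_submit_bio_start()/__btree_submit_bio_done() changes below show. A sketch of a hook after this patch (the function name is hypothetical); the motivating case appears to be direct IO, where the pages cannot supply the file position needed for checksum lookup:

/*
 * Illustrative only: matches the widened submit-hook shape used
 * throughout this patch.
 */
static int sketch_submit_bio_start(struct inode *inode, int rw,
				   struct bio *bio, int mirror_num,
				   unsigned long bio_flags,
				   u64 bio_offset)
{
	/* buffered IO can ignore bio_offset; direct IO threads it
	 * down to btrfs_lookup_bio_sums_dio() as logical_offset */
	return 0;
}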
@@ -534,7 +539,8 @@ static void run_one_async_start(struct btrfs_work *work)
async = container_of(work, struct async_submit_bio, work);
fs_info = BTRFS_I(async->inode)->root->fs_info;
async->submit_bio_start(async->inode, async->rw, async->bio,
- async->mirror_num, async->bio_flags);
+ async->mirror_num, async->bio_flags,
+ async->bio_offset);
}
static void run_one_async_done(struct btrfs_work *work)
@@ -556,7 +562,8 @@ static void run_one_async_done(struct btrfs_work *work)
wake_up(&fs_info->async_submit_wait);
async->submit_bio_done(async->inode, async->rw, async->bio,
- async->mirror_num, async->bio_flags);
+ async->mirror_num, async->bio_flags,
+ async->bio_offset);
}
static void run_one_async_free(struct btrfs_work *work)
@@ -570,6 +577,7 @@ static void run_one_async_free(struct btrfs_work *work)
int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
int rw, struct bio *bio, int mirror_num,
unsigned long bio_flags,
+ u64 bio_offset,
extent_submit_bio_hook_t *submit_bio_start,
extent_submit_bio_hook_t *submit_bio_done)
{
@@ -592,6 +600,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
async->work.flags = 0;
async->bio_flags = bio_flags;
+ async->bio_offset = bio_offset;
atomic_inc(&fs_info->nr_async_submits);
@@ -627,7 +636,8 @@ static int btree_csum_one_bio(struct bio *bio)
static int __btree_submit_bio_start(struct inode *inode, int rw,
struct bio *bio, int mirror_num,
- unsigned long bio_flags)
+ unsigned long bio_flags,
+ u64 bio_offset)
{
/*
* when we're called for a write, we're already in the async
@@ -638,7 +648,8 @@ static int __btree_submit_bio_start(struct inode *inode, int rw,
}
static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
- int mirror_num, unsigned long bio_flags)
+ int mirror_num, unsigned long bio_flags,
+ u64 bio_offset)
{
/*
* when we're called for a write, we're already in the async
@@ -648,7 +659,8 @@ static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
}
static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
- int mirror_num, unsigned long bio_flags)
+ int mirror_num, unsigned long bio_flags,
+ u64 bio_offset)
{
int ret;
@@ -671,6 +683,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
*/
return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
inode, rw, bio, mirror_num, 0,
+ bio_offset,
__btree_submit_bio_start,
__btree_submit_bio_done);
}
@@ -894,7 +907,8 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
root->ref_cows = 0;
root->track_dirty = 0;
root->in_radix = 0;
- root->clean_orphans = 0;
+ root->orphan_item_inserted = 0;
+ root->orphan_cleanup_state = 0;
root->fs_info = fs_info;
root->objectid = objectid;
@@ -903,13 +917,16 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
root->name = NULL;
root->in_sysfs = 0;
root->inode_tree = RB_ROOT;
+ root->block_rsv = NULL;
+ root->orphan_block_rsv = NULL;
INIT_LIST_HEAD(&root->dirty_list);
INIT_LIST_HEAD(&root->orphan_list);
INIT_LIST_HEAD(&root->root_list);
spin_lock_init(&root->node_lock);
- spin_lock_init(&root->list_lock);
+ spin_lock_init(&root->orphan_lock);
spin_lock_init(&root->inode_lock);
+ spin_lock_init(&root->accounting_lock);
mutex_init(&root->objectid_mutex);
mutex_init(&root->log_mutex);
init_waitqueue_head(&root->log_writer_wait);
@@ -968,42 +985,6 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
return 0;
}
-int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info)
-{
- struct extent_buffer *eb;
- struct btrfs_root *log_root_tree = fs_info->log_root_tree;
- u64 start = 0;
- u64 end = 0;
- int ret;
-
- if (!log_root_tree)
- return 0;
-
- while (1) {
- ret = find_first_extent_bit(&log_root_tree->dirty_log_pages,
- 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW);
- if (ret)
- break;
-
- clear_extent_bits(&log_root_tree->dirty_log_pages, start, end,
- EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
- }
- eb = fs_info->log_root_tree->node;
-
- WARN_ON(btrfs_header_level(eb) != 0);
- WARN_ON(btrfs_header_nritems(eb) != 0);
-
- ret = btrfs_free_reserved_extent(fs_info->tree_root,
- eb->start, eb->len);
- BUG_ON(ret);
-
- free_extent_buffer(eb);
- kfree(fs_info->log_root_tree);
- fs_info->log_root_tree = NULL;
- return 0;
-}
-
static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info)
{
@@ -1191,19 +1172,23 @@ again:
if (root)
return root;
- ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid);
- if (ret == 0)
- ret = -ENOENT;
- if (ret < 0)
- return ERR_PTR(ret);
-
root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location);
if (IS_ERR(root))
return root;
- WARN_ON(btrfs_root_refs(&root->root_item) == 0);
set_anon_super(&root->anon_super, NULL);
+ if (btrfs_root_refs(&root->root_item) == 0) {
+ ret = -ENOENT;
+ goto fail;
+ }
+
+ ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid);
+ if (ret < 0)
+ goto fail;
+ if (ret == 0)
+ root->orphan_item_inserted = 1;
+
ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
if (ret)
goto fail;
@@ -1212,10 +1197,9 @@ again:
ret = radix_tree_insert(&fs_info->fs_roots_radix,
(unsigned long)root->root_key.objectid,
root);
- if (ret == 0) {
+ if (ret == 0)
root->in_radix = 1;
- root->clean_orphans = 1;
- }
+
spin_unlock(&fs_info->fs_roots_radix_lock);
radix_tree_preload_end();
if (ret) {
@@ -1461,10 +1445,6 @@ static int cleaner_kthread(void *arg)
struct btrfs_root *root = arg;
do {
- smp_mb();
- if (root->fs_info->closing)
- break;
-
vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
if (!(root->fs_info->sb->s_flags & MS_RDONLY) &&
@@ -1477,11 +1457,9 @@ static int cleaner_kthread(void *arg)
if (freezing(current)) {
refrigerator();
} else {
- smp_mb();
- if (root->fs_info->closing)
- break;
set_current_state(TASK_INTERRUPTIBLE);
- schedule();
+ if (!kthread_should_stop())
+ schedule();
__set_current_state(TASK_RUNNING);
}
} while (!kthread_should_stop());
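
cleaner_kthread no longer polls fs_info->closing; it relies on the standard kthread sleep idiom, and the ordering is what makes that safe:

/*
 * Publish TASK_INTERRUPTIBLE before testing the stop flag.
 * kthread_stop() sets the flag and then wakes the task, so a stop
 * request arriving between the test and schedule() turns the
 * schedule() into an immediate wakeup instead of a lost sleep.
 */
set_current_state(TASK_INTERRUPTIBLE);
if (!kthread_should_stop())
	schedule();
__set_current_state(TASK_RUNNING);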
@@ -1493,36 +1471,40 @@ static int transaction_kthread(void *arg)
struct btrfs_root *root = arg;
struct btrfs_trans_handle *trans;
struct btrfs_transaction *cur;
+ u64 transid;
unsigned long now;
unsigned long delay;
int ret;
do {
- smp_mb();
- if (root->fs_info->closing)
- break;
-
delay = HZ * 30;
vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
mutex_lock(&root->fs_info->transaction_kthread_mutex);
- mutex_lock(&root->fs_info->trans_mutex);
+ spin_lock(&root->fs_info->new_trans_lock);
cur = root->fs_info->running_transaction;
if (!cur) {
- mutex_unlock(&root->fs_info->trans_mutex);
+ spin_unlock(&root->fs_info->new_trans_lock);
goto sleep;
}
now = get_seconds();
- if (now < cur->start_time || now - cur->start_time < 30) {
- mutex_unlock(&root->fs_info->trans_mutex);
+ if (!cur->blocked &&
+ (now < cur->start_time || now - cur->start_time < 30)) {
+ spin_unlock(&root->fs_info->new_trans_lock);
delay = HZ * 5;
goto sleep;
}
- mutex_unlock(&root->fs_info->trans_mutex);
- trans = btrfs_start_transaction(root, 1);
- ret = btrfs_commit_transaction(trans, root);
+ transid = cur->transid;
+ spin_unlock(&root->fs_info->new_trans_lock);
+ trans = btrfs_join_transaction(root, 1);
+ if (transid == trans->transid) {
+ ret = btrfs_commit_transaction(trans, root);
+ BUG_ON(ret);
+ } else {
+ btrfs_end_transaction(trans, root);
+ }
sleep:
wake_up_process(root->fs_info->cleaner_kthread);
mutex_unlock(&root->fs_info->transaction_kthread_mutex);
@@ -1530,10 +1512,10 @@ sleep:
if (freezing(current)) {
refrigerator();
} else {
- if (root->fs_info->closing)
- break;
set_current_state(TASK_INTERRUPTIBLE);
- schedule_timeout(delay);
+ if (!kthread_should_stop() &&
+ !btrfs_transaction_blocked(root->fs_info))
+ schedule_timeout(delay);
__set_current_state(TASK_RUNNING);
}
} while (!kthread_should_stop());
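
transaction_kthread now samples the running transaction under the new_trans_lock spinlock instead of holding trans_mutex across the whole decision. The join can attach to a later transaction if the sampled one commits in the meantime, which is why the transid is compared before committing; condensed, the pattern is:

spin_lock(&fs_info->new_trans_lock);
transid = fs_info->running_transaction->transid;
spin_unlock(&fs_info->new_trans_lock);

trans = btrfs_join_transaction(root, 1);
if (transid == trans->transid)
	btrfs_commit_transaction(trans, root);	/* still the same one */
else
	btrfs_end_transaction(trans, root);	/* someone committed first */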
@@ -1620,6 +1602,13 @@ struct btrfs_root *open_ctree(struct super_block *sb,
INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
INIT_LIST_HEAD(&fs_info->space_info);
btrfs_mapping_init(&fs_info->mapping_tree);
+ btrfs_init_block_rsv(&fs_info->global_block_rsv);
+ btrfs_init_block_rsv(&fs_info->delalloc_block_rsv);
+ btrfs_init_block_rsv(&fs_info->trans_block_rsv);
+ btrfs_init_block_rsv(&fs_info->chunk_block_rsv);
+ btrfs_init_block_rsv(&fs_info->empty_block_rsv);
+ INIT_LIST_HEAD(&fs_info->durable_block_rsv_list);
+ mutex_init(&fs_info->durable_block_rsv_mutex);
atomic_set(&fs_info->nr_async_submits, 0);
atomic_set(&fs_info->async_delalloc_pages, 0);
atomic_set(&fs_info->async_submit_draining, 0);
@@ -1759,9 +1748,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
min_t(u64, fs_devices->num_devices,
fs_info->thread_pool_size),
&fs_info->generic_worker);
- btrfs_init_workers(&fs_info->enospc_workers, "enospc",
- fs_info->thread_pool_size,
- &fs_info->generic_worker);
/* a higher idle thresh on the submit workers makes it much more
* likely that bios will be sent down in a sane order to the
@@ -1809,7 +1795,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
btrfs_start_workers(&fs_info->endio_meta_workers, 1);
btrfs_start_workers(&fs_info->endio_meta_write_workers, 1);
btrfs_start_workers(&fs_info->endio_write_workers, 1);
- btrfs_start_workers(&fs_info->enospc_workers, 1);
fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
@@ -1912,17 +1897,18 @@ struct btrfs_root *open_ctree(struct super_block *sb,
csum_root->track_dirty = 1;
+ fs_info->generation = generation;
+ fs_info->last_trans_committed = generation;
+ fs_info->data_alloc_profile = (u64)-1;
+ fs_info->metadata_alloc_profile = (u64)-1;
+ fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
+
ret = btrfs_read_block_groups(extent_root);
if (ret) {
printk(KERN_ERR "Failed to read block groups: %d\n", ret);
goto fail_block_groups;
}
- fs_info->generation = generation;
- fs_info->last_trans_committed = generation;
- fs_info->data_alloc_profile = (u64)-1;
- fs_info->metadata_alloc_profile = (u64)-1;
- fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
"btrfs-cleaner");
if (IS_ERR(fs_info->cleaner_kthread))
@@ -1955,8 +1941,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
btrfs_level_size(tree_root,
btrfs_super_log_root_level(disk_super));
- log_tree_root = kzalloc(sizeof(struct btrfs_root),
- GFP_NOFS);
+ log_tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
+ if (!log_tree_root) {
+ err = -ENOMEM;
+ goto fail_trans_kthread;
+ }
__setup_root(nodesize, leafsize, sectorsize, stripesize,
log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID);
@@ -1977,6 +1966,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
BUG_ON(ret);
if (!(sb->s_flags & MS_RDONLY)) {
+ ret = btrfs_cleanup_fs_roots(fs_info);
+ BUG_ON(ret);
+
ret = btrfs_recover_relocation(tree_root);
if (ret < 0) {
printk(KERN_WARNING
@@ -1993,6 +1985,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
if (!fs_info->fs_root)
goto fail_trans_kthread;
+ if (IS_ERR(fs_info->fs_root)) {
+ err = PTR_ERR(fs_info->fs_root);
+ goto fail_trans_kthread;
+ }
if (!(sb->s_flags & MS_RDONLY)) {
down_read(&fs_info->cleanup_work_sem);
@@ -2040,7 +2036,6 @@ fail_sb_buffer:
btrfs_stop_workers(&fs_info->endio_meta_write_workers);
btrfs_stop_workers(&fs_info->endio_write_workers);
btrfs_stop_workers(&fs_info->submit_workers);
- btrfs_stop_workers(&fs_info->enospc_workers);
fail_iput:
invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
iput(fs_info->btree_inode);
@@ -2405,11 +2400,11 @@ int btrfs_commit_super(struct btrfs_root *root)
down_write(&root->fs_info->cleanup_work_sem);
up_write(&root->fs_info->cleanup_work_sem);
- trans = btrfs_start_transaction(root, 1);
+ trans = btrfs_join_transaction(root, 1);
ret = btrfs_commit_transaction(trans, root);
BUG_ON(ret);
/* run commit again to drop the original snapshot */
- trans = btrfs_start_transaction(root, 1);
+ trans = btrfs_join_transaction(root, 1);
btrfs_commit_transaction(trans, root);
ret = btrfs_write_and_wait_transaction(NULL, root);
BUG_ON(ret);
@@ -2426,15 +2421,15 @@ int close_ctree(struct btrfs_root *root)
fs_info->closing = 1;
smp_mb();
- kthread_stop(root->fs_info->transaction_kthread);
- kthread_stop(root->fs_info->cleaner_kthread);
-
if (!(fs_info->sb->s_flags & MS_RDONLY)) {
ret = btrfs_commit_super(root);
if (ret)
printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
}
+ kthread_stop(root->fs_info->transaction_kthread);
+ kthread_stop(root->fs_info->cleaner_kthread);
+
fs_info->closing = 2;
smp_mb();
@@ -2473,7 +2468,6 @@ int close_ctree(struct btrfs_root *root)
btrfs_stop_workers(&fs_info->endio_meta_write_workers);
btrfs_stop_workers(&fs_info->endio_write_workers);
btrfs_stop_workers(&fs_info->submit_workers);
- btrfs_stop_workers(&fs_info->enospc_workers);
btrfs_close_devices(fs_info->fs_devices);
btrfs_mapping_tree_free(&fs_info->mapping_tree);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index c958ecbc1916..88e825a0bf21 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -87,7 +87,7 @@ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
int metadata);
int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
int rw, struct bio *bio, int mirror_num,
- unsigned long bio_flags,
+ unsigned long bio_flags, u64 bio_offset,
extent_submit_bio_hook_t *submit_bio_start,
extent_submit_bio_hook_t *submit_bio_done);
@@ -95,8 +95,6 @@ int btrfs_congested_async(struct btrfs_fs_info *info, int iodone);
unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info);
int btrfs_write_tree_block(struct extent_buffer *buf);
int btrfs_wait_tree_block_writeback(struct extent_buffer *buf);
-int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info);
int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index c6a4f459ad76..32d094002a57 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -35,10 +35,9 @@
static int update_block_group(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
- u64 bytenr, u64 num_bytes, int alloc,
- int mark_free);
-static int update_reserved_extents(struct btrfs_block_group_cache *cache,
- u64 num_bytes, int reserve);
+ u64 bytenr, u64 num_bytes, int alloc);
+static int update_reserved_bytes(struct btrfs_block_group_cache *cache,
+ u64 num_bytes, int reserve, int sinfo);
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent,
@@ -61,12 +60,6 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
static int do_chunk_alloc(struct btrfs_trans_handle *trans,
struct btrfs_root *extent_root, u64 alloc_bytes,
u64 flags, int force);
-static int pin_down_bytes(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- u64 bytenr, u64 num_bytes,
- int is_data, int reserved,
- struct extent_buffer **must_clean);
static int find_next_key(struct btrfs_path *path, int level,
struct btrfs_key *key);
static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
@@ -91,8 +84,12 @@ void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
{
- if (atomic_dec_and_test(&cache->count))
+ if (atomic_dec_and_test(&cache->count)) {
+ WARN_ON(cache->pinned > 0);
+ WARN_ON(cache->reserved > 0);
+ WARN_ON(cache->reserved_pinned > 0);
kfree(cache);
+ }
}
/*
@@ -319,7 +316,7 @@ static int caching_kthread(void *data)
exclude_super_stripes(extent_root, block_group);
spin_lock(&block_group->space_info->lock);
- block_group->space_info->bytes_super += block_group->bytes_super;
+ block_group->space_info->bytes_readonly += block_group->bytes_super;
spin_unlock(&block_group->space_info->lock);
last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
@@ -507,6 +504,9 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
struct list_head *head = &info->space_info;
struct btrfs_space_info *found;
+ flags &= BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_SYSTEM |
+ BTRFS_BLOCK_GROUP_METADATA;
+
rcu_read_lock();
list_for_each_entry_rcu(found, head, list) {
if (found->flags == flags) {
@@ -610,6 +610,113 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
}
/*
+ * helper function to look up the reference count and flags of an extent.
+ *
+ * the head node for a delayed ref is used to store the sum of all the
+ * reference count modifications queued up in the rbtree. the head
+ * node may also store the extent flags to set. This way you can check
+ * to see what the reference count and extent flags will be once all of
+ * the delayed refs have been processed.
+ */
+int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 bytenr,
+ u64 num_bytes, u64 *refs, u64 *flags)
+{
+ struct btrfs_delayed_ref_head *head;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ struct btrfs_path *path;
+ struct btrfs_extent_item *ei;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+ u32 item_size;
+ u64 num_refs;
+ u64 extent_flags;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = bytenr;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = num_bytes;
+ if (!trans) {
+ path->skip_locking = 1;
+ path->search_commit_root = 1;
+ }
+again:
+ ret = btrfs_search_slot(trans, root->fs_info->extent_root,
+ &key, path, 0, 0);
+ if (ret < 0)
+ goto out_free;
+
+ if (ret == 0) {
+ leaf = path->nodes[0];
+ item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ if (item_size >= sizeof(*ei)) {
+ ei = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_extent_item);
+ num_refs = btrfs_extent_refs(leaf, ei);
+ extent_flags = btrfs_extent_flags(leaf, ei);
+ } else {
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+ struct btrfs_extent_item_v0 *ei0;
+ BUG_ON(item_size != sizeof(*ei0));
+ ei0 = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_extent_item_v0);
+ num_refs = btrfs_extent_refs_v0(leaf, ei0);
+ /* FIXME: this isn't correct for data */
+ extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
+#else
+ BUG();
+#endif
+ }
+ BUG_ON(num_refs == 0);
+ } else {
+ num_refs = 0;
+ extent_flags = 0;
+ ret = 0;
+ }
+
+ if (!trans)
+ goto out;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+ spin_lock(&delayed_refs->lock);
+ head = btrfs_find_delayed_ref_head(trans, bytenr);
+ if (head) {
+ if (!mutex_trylock(&head->mutex)) {
+ atomic_inc(&head->node.refs);
+ spin_unlock(&delayed_refs->lock);
+
+ btrfs_release_path(root->fs_info->extent_root, path);
+
+ mutex_lock(&head->mutex);
+ mutex_unlock(&head->mutex);
+ btrfs_put_delayed_ref(&head->node);
+ goto again;
+ }
+ if (head->extent_op && head->extent_op->update_flags)
+ extent_flags |= head->extent_op->flags_to_set;
+ else
+ BUG_ON(num_refs == 0);
+
+ num_refs += head->node.ref_mod;
+ mutex_unlock(&head->mutex);
+ }
+ spin_unlock(&delayed_refs->lock);
+out:
+ WARN_ON(num_refs == 0);
+ if (refs)
+ *refs = num_refs;
+ if (flags)
+ *flags = extent_flags;
+out_free:
+ btrfs_free_path(path);
+ return ret;
+}
+
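
btrfs_lookup_extent_info() moves here from delayed-ref.c and gains a NULL-trans mode that searches the commit root without locking and skips the delayed-ref adjustment. A call sketch, illustrative only, where eb is some extent buffer of interest:

u64 refs, flags;
int ret;

/* pass a NULL trans to read the commit-root view instead */
ret = btrfs_lookup_extent_info(trans, root, eb->start, eb->len,
			       &refs, &flags);
if (!ret && (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
	/* the block is referenced through a full backref */
}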
+/*
* Back reference rules. Back refs have three main goals:
*
* 1) differentiate between all holders of references to an extent so that
@@ -1871,7 +1978,6 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
return ret;
}
-
/* helper function to actually process a single delayed ref entry */
static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -1891,32 +1997,14 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
BUG_ON(extent_op);
head = btrfs_delayed_node_to_head(node);
if (insert_reserved) {
- int mark_free = 0;
- struct extent_buffer *must_clean = NULL;
-
- ret = pin_down_bytes(trans, root, NULL,
- node->bytenr, node->num_bytes,
- head->is_data, 1, &must_clean);
- if (ret > 0)
- mark_free = 1;
-
- if (must_clean) {
- clean_tree_block(NULL, root, must_clean);
- btrfs_tree_unlock(must_clean);
- free_extent_buffer(must_clean);
- }
+ btrfs_pin_extent(root, node->bytenr,
+ node->num_bytes, 1);
if (head->is_data) {
ret = btrfs_del_csums(trans, root,
node->bytenr,
node->num_bytes);
BUG_ON(ret);
}
- if (mark_free) {
- ret = btrfs_free_reserved_extent(root,
- node->bytenr,
- node->num_bytes);
- BUG_ON(ret);
- }
}
mutex_unlock(&head->mutex);
return 0;
@@ -2347,6 +2435,8 @@ int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
ret = 0;
out:
btrfs_free_path(path);
+ if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
+ WARN_ON(ret > 0);
return ret;
}
@@ -2660,12 +2750,21 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
struct btrfs_space_info **space_info)
{
struct btrfs_space_info *found;
+ int i;
+ int factor;
+
+ if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_RAID10))
+ factor = 2;
+ else
+ factor = 1;
found = __find_space_info(info, flags);
if (found) {
spin_lock(&found->lock);
found->total_bytes += total_bytes;
found->bytes_used += bytes_used;
+ found->disk_used += bytes_used * factor;
found->full = 0;
spin_unlock(&found->lock);
*space_info = found;
@@ -2675,18 +2774,20 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
if (!found)
return -ENOMEM;
- INIT_LIST_HEAD(&found->block_groups);
+ for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
+ INIT_LIST_HEAD(&found->block_groups[i]);
init_rwsem(&found->groups_sem);
- init_waitqueue_head(&found->flush_wait);
- init_waitqueue_head(&found->allocate_wait);
spin_lock_init(&found->lock);
- found->flags = flags;
+ found->flags = flags & (BTRFS_BLOCK_GROUP_DATA |
+ BTRFS_BLOCK_GROUP_SYSTEM |
+ BTRFS_BLOCK_GROUP_METADATA);
found->total_bytes = total_bytes;
found->bytes_used = bytes_used;
+ found->disk_used = bytes_used * factor;
found->bytes_pinned = 0;
found->bytes_reserved = 0;
found->bytes_readonly = 0;
- found->bytes_delalloc = 0;
+ found->bytes_may_use = 0;
found->full = 0;
found->force_alloc = 0;
*space_info = found;
@@ -2711,19 +2812,6 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
}
}
-static void set_block_group_readonly(struct btrfs_block_group_cache *cache)
-{
- spin_lock(&cache->space_info->lock);
- spin_lock(&cache->lock);
- if (!cache->ro) {
- cache->space_info->bytes_readonly += cache->key.offset -
- btrfs_block_group_used(&cache->item);
- cache->ro = 1;
- }
- spin_unlock(&cache->lock);
- spin_unlock(&cache->space_info->lock);
-}
-
u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
{
u64 num_devices = root->fs_info->fs_devices->rw_devices;
@@ -2752,491 +2840,50 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
return flags;
}
-static u64 btrfs_get_alloc_profile(struct btrfs_root *root, u64 data)
-{
- struct btrfs_fs_info *info = root->fs_info;
- u64 alloc_profile;
-
- if (data) {
- alloc_profile = info->avail_data_alloc_bits &
- info->data_alloc_profile;
- data = BTRFS_BLOCK_GROUP_DATA | alloc_profile;
- } else if (root == root->fs_info->chunk_root) {
- alloc_profile = info->avail_system_alloc_bits &
- info->system_alloc_profile;
- data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
- } else {
- alloc_profile = info->avail_metadata_alloc_bits &
- info->metadata_alloc_profile;
- data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
- }
-
- return btrfs_reduce_alloc_profile(root, data);
-}
-
-void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
-{
- u64 alloc_target;
-
- alloc_target = btrfs_get_alloc_profile(root, 1);
- BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
- alloc_target);
-}
-
-static u64 calculate_bytes_needed(struct btrfs_root *root, int num_items)
-{
- u64 num_bytes;
- int level;
-
- level = BTRFS_MAX_LEVEL - 2;
- /*
- * NOTE: these calculations are absolutely the worst possible case.
- * This assumes that _every_ item we insert will require a new leaf, and
- * that the tree has grown to its maximum level size.
- */
-
- /*
- * for every item we insert we could insert both an extent item and a
- * extent ref item. Then for ever item we insert, we will need to cow
- * both the original leaf, plus the leaf to the left and right of it.
- *
- * Unless we are talking about the extent root, then we just want the
- * number of items * 2, since we just need the extent item plus its ref.
- */
- if (root == root->fs_info->extent_root)
- num_bytes = num_items * 2;
- else
- num_bytes = (num_items + (2 * num_items)) * 3;
-
- /*
- * num_bytes is total number of leaves we could need times the leaf
- * size, and then for every leaf we could end up cow'ing 2 nodes per
- * level, down to the leaf level.
- */
- num_bytes = (num_bytes * root->leafsize) +
- (num_bytes * (level * 2)) * root->nodesize;
-
- return num_bytes;
-}
-
-/*
- * Unreserve metadata space for delalloc. If we have less reserved credits than
- * we have extents, this function does nothing.
- */
-int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
- struct inode *inode, int num_items)
-{
- struct btrfs_fs_info *info = root->fs_info;
- struct btrfs_space_info *meta_sinfo;
- u64 num_bytes;
- u64 alloc_target;
- bool bug = false;
-
- /* get the space info for where the metadata will live */
- alloc_target = btrfs_get_alloc_profile(root, 0);
- meta_sinfo = __find_space_info(info, alloc_target);
-
- num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
- num_items);
-
- spin_lock(&meta_sinfo->lock);
- spin_lock(&BTRFS_I(inode)->accounting_lock);
- if (BTRFS_I(inode)->reserved_extents <=
- BTRFS_I(inode)->outstanding_extents) {
- spin_unlock(&BTRFS_I(inode)->accounting_lock);
- spin_unlock(&meta_sinfo->lock);
- return 0;
- }
- spin_unlock(&BTRFS_I(inode)->accounting_lock);
-
- BTRFS_I(inode)->reserved_extents -= num_items;
- BUG_ON(BTRFS_I(inode)->reserved_extents < 0);
-
- if (meta_sinfo->bytes_delalloc < num_bytes) {
- bug = true;
- meta_sinfo->bytes_delalloc = 0;
- } else {
- meta_sinfo->bytes_delalloc -= num_bytes;
- }
- spin_unlock(&meta_sinfo->lock);
-
- BUG_ON(bug);
-
- return 0;
-}
-
-static void check_force_delalloc(struct btrfs_space_info *meta_sinfo)
+static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
{
- u64 thresh;
-
- thresh = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
- meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
- meta_sinfo->bytes_super + meta_sinfo->bytes_root +
- meta_sinfo->bytes_may_use;
-
- thresh = meta_sinfo->total_bytes - thresh;
- thresh *= 80;
- do_div(thresh, 100);
- if (thresh <= meta_sinfo->bytes_delalloc)
- meta_sinfo->force_delalloc = 1;
- else
- meta_sinfo->force_delalloc = 0;
+ if (flags & BTRFS_BLOCK_GROUP_DATA)
+ flags |= root->fs_info->avail_data_alloc_bits &
+ root->fs_info->data_alloc_profile;
+ else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+ flags |= root->fs_info->avail_system_alloc_bits &
+ root->fs_info->system_alloc_profile;
+ else if (flags & BTRFS_BLOCK_GROUP_METADATA)
+ flags |= root->fs_info->avail_metadata_alloc_bits &
+ root->fs_info->metadata_alloc_profile;
+ return btrfs_reduce_alloc_profile(root, flags);
}
-struct async_flush {
- struct btrfs_root *root;
- struct btrfs_space_info *info;
- struct btrfs_work work;
-};
-
-static noinline void flush_delalloc_async(struct btrfs_work *work)
+static u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
{
- struct async_flush *async;
- struct btrfs_root *root;
- struct btrfs_space_info *info;
-
- async = container_of(work, struct async_flush, work);
- root = async->root;
- info = async->info;
-
- btrfs_start_delalloc_inodes(root, 0);
- wake_up(&info->flush_wait);
- btrfs_wait_ordered_extents(root, 0, 0);
-
- spin_lock(&info->lock);
- info->flushing = 0;
- spin_unlock(&info->lock);
- wake_up(&info->flush_wait);
-
- kfree(async);
-}
-
-static void wait_on_flush(struct btrfs_space_info *info)
-{
- DEFINE_WAIT(wait);
- u64 used;
-
- while (1) {
- prepare_to_wait(&info->flush_wait, &wait,
- TASK_UNINTERRUPTIBLE);
- spin_lock(&info->lock);
- if (!info->flushing) {
- spin_unlock(&info->lock);
- break;
- }
-
- used = info->bytes_used + info->bytes_reserved +
- info->bytes_pinned + info->bytes_readonly +
- info->bytes_super + info->bytes_root +
- info->bytes_may_use + info->bytes_delalloc;
- if (used < info->total_bytes) {
- spin_unlock(&info->lock);
- break;
- }
- spin_unlock(&info->lock);
- schedule();
- }
- finish_wait(&info->flush_wait, &wait);
-}
-
-static void flush_delalloc(struct btrfs_root *root,
- struct btrfs_space_info *info)
-{
- struct async_flush *async;
- bool wait = false;
-
- spin_lock(&info->lock);
+ u64 flags;
- if (!info->flushing)
- info->flushing = 1;
+ if (data)
+ flags = BTRFS_BLOCK_GROUP_DATA;
+ else if (root == root->fs_info->chunk_root)
+ flags = BTRFS_BLOCK_GROUP_SYSTEM;
else
- wait = true;
-
- spin_unlock(&info->lock);
-
- if (wait) {
- wait_on_flush(info);
- return;
- }
-
- async = kzalloc(sizeof(*async), GFP_NOFS);
- if (!async)
- goto flush;
-
- async->root = root;
- async->info = info;
- async->work.func = flush_delalloc_async;
-
- btrfs_queue_worker(&root->fs_info->enospc_workers,
- &async->work);
- wait_on_flush(info);
- return;
-
-flush:
- btrfs_start_delalloc_inodes(root, 0);
- btrfs_wait_ordered_extents(root, 0, 0);
-
- spin_lock(&info->lock);
- info->flushing = 0;
- spin_unlock(&info->lock);
- wake_up(&info->flush_wait);
-}
-
-static int maybe_allocate_chunk(struct btrfs_root *root,
- struct btrfs_space_info *info)
-{
- struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
- struct btrfs_trans_handle *trans;
- bool wait = false;
- int ret = 0;
- u64 min_metadata;
- u64 free_space;
-
- free_space = btrfs_super_total_bytes(disk_super);
- /*
- * we allow the metadata to grow to a max of either 10gb or 5% of the
- * space in the volume.
- */
- min_metadata = min((u64)10 * 1024 * 1024 * 1024,
- div64_u64(free_space * 5, 100));
- if (info->total_bytes >= min_metadata) {
- spin_unlock(&info->lock);
- return 0;
- }
-
- if (info->full) {
- spin_unlock(&info->lock);
- return 0;
- }
-
- if (!info->allocating_chunk) {
- info->force_alloc = 1;
- info->allocating_chunk = 1;
- } else {
- wait = true;
- }
-
- spin_unlock(&info->lock);
-
- if (wait) {
- wait_event(info->allocate_wait,
- !info->allocating_chunk);
- return 1;
- }
-
- trans = btrfs_start_transaction(root, 1);
- if (!trans) {
- ret = -ENOMEM;
- goto out;
- }
-
- ret = do_chunk_alloc(trans, root->fs_info->extent_root,
- 4096 + 2 * 1024 * 1024,
- info->flags, 0);
- btrfs_end_transaction(trans, root);
- if (ret)
- goto out;
-out:
- spin_lock(&info->lock);
- info->allocating_chunk = 0;
- spin_unlock(&info->lock);
- wake_up(&info->allocate_wait);
-
- if (ret)
- return 0;
- return 1;
-}
-
-/*
- * Reserve metadata space for delalloc.
- */
-int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
- struct inode *inode, int num_items)
-{
- struct btrfs_fs_info *info = root->fs_info;
- struct btrfs_space_info *meta_sinfo;
- u64 num_bytes;
- u64 used;
- u64 alloc_target;
- int flushed = 0;
- int force_delalloc;
-
- /* get the space info for where the metadata will live */
- alloc_target = btrfs_get_alloc_profile(root, 0);
- meta_sinfo = __find_space_info(info, alloc_target);
-
- num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
- num_items);
-again:
- spin_lock(&meta_sinfo->lock);
-
- force_delalloc = meta_sinfo->force_delalloc;
-
- if (unlikely(!meta_sinfo->bytes_root))
- meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
-
- if (!flushed)
- meta_sinfo->bytes_delalloc += num_bytes;
-
- used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
- meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
- meta_sinfo->bytes_super + meta_sinfo->bytes_root +
- meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
-
- if (used > meta_sinfo->total_bytes) {
- flushed++;
-
- if (flushed == 1) {
- if (maybe_allocate_chunk(root, meta_sinfo))
- goto again;
- flushed++;
- } else {
- spin_unlock(&meta_sinfo->lock);
- }
-
- if (flushed == 2) {
- filemap_flush(inode->i_mapping);
- goto again;
- } else if (flushed == 3) {
- flush_delalloc(root, meta_sinfo);
- goto again;
- }
- spin_lock(&meta_sinfo->lock);
- meta_sinfo->bytes_delalloc -= num_bytes;
- spin_unlock(&meta_sinfo->lock);
- printk(KERN_ERR "enospc, has %d, reserved %d\n",
- BTRFS_I(inode)->outstanding_extents,
- BTRFS_I(inode)->reserved_extents);
- dump_space_info(meta_sinfo, 0, 0);
- return -ENOSPC;
- }
+ flags = BTRFS_BLOCK_GROUP_METADATA;
- BTRFS_I(inode)->reserved_extents += num_items;
- check_force_delalloc(meta_sinfo);
- spin_unlock(&meta_sinfo->lock);
-
- if (!flushed && force_delalloc)
- filemap_flush(inode->i_mapping);
-
- return 0;
+ return get_alloc_profile(root, flags);
}
-/*
- * unreserve num_items number of items worth of metadata space. This needs to
- * be paired with btrfs_reserve_metadata_space.
- *
- * NOTE: if you have the option, run this _AFTER_ you do a
- * btrfs_end_transaction, since btrfs_end_transaction will run delayed ref
- * oprations which will result in more used metadata, so we want to make sure we
- * can do that without issue.
- */
-int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items)
-{
- struct btrfs_fs_info *info = root->fs_info;
- struct btrfs_space_info *meta_sinfo;
- u64 num_bytes;
- u64 alloc_target;
- bool bug = false;
-
- /* get the space info for where the metadata will live */
- alloc_target = btrfs_get_alloc_profile(root, 0);
- meta_sinfo = __find_space_info(info, alloc_target);
-
- num_bytes = calculate_bytes_needed(root, num_items);
-
- spin_lock(&meta_sinfo->lock);
- if (meta_sinfo->bytes_may_use < num_bytes) {
- bug = true;
- meta_sinfo->bytes_may_use = 0;
- } else {
- meta_sinfo->bytes_may_use -= num_bytes;
- }
- spin_unlock(&meta_sinfo->lock);
-
- BUG_ON(bug);
-
- return 0;
-}
-
-/*
- * Reserve some metadata space for use. We'll calculate the worste case number
- * of bytes that would be needed to modify num_items number of items. If we
- * have space, fantastic, if not, you get -ENOSPC. Please call
- * btrfs_unreserve_metadata_space when you are done for the _SAME_ number of
- * items you reserved, since whatever metadata you needed should have already
- * been allocated.
- *
- * This will commit the transaction to make more space if we don't have enough
- * metadata space. THe only time we don't do this is if we're reserving space
- * inside of a transaction, then we will just return -ENOSPC and it is the
- * callers responsibility to handle it properly.
- */
-int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items)
+void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
{
- struct btrfs_fs_info *info = root->fs_info;
- struct btrfs_space_info *meta_sinfo;
- u64 num_bytes;
- u64 used;
- u64 alloc_target;
- int retries = 0;
-
- /* get the space info for where the metadata will live */
- alloc_target = btrfs_get_alloc_profile(root, 0);
- meta_sinfo = __find_space_info(info, alloc_target);
-
- num_bytes = calculate_bytes_needed(root, num_items);
-again:
- spin_lock(&meta_sinfo->lock);
-
- if (unlikely(!meta_sinfo->bytes_root))
- meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
-
- if (!retries)
- meta_sinfo->bytes_may_use += num_bytes;
-
- used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
- meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
- meta_sinfo->bytes_super + meta_sinfo->bytes_root +
- meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
-
- if (used > meta_sinfo->total_bytes) {
- retries++;
- if (retries == 1) {
- if (maybe_allocate_chunk(root, meta_sinfo))
- goto again;
- retries++;
- } else {
- spin_unlock(&meta_sinfo->lock);
- }
-
- if (retries == 2) {
- flush_delalloc(root, meta_sinfo);
- goto again;
- }
- spin_lock(&meta_sinfo->lock);
- meta_sinfo->bytes_may_use -= num_bytes;
- spin_unlock(&meta_sinfo->lock);
-
- dump_space_info(meta_sinfo, 0, 0);
- return -ENOSPC;
- }
-
- check_force_delalloc(meta_sinfo);
- spin_unlock(&meta_sinfo->lock);
-
- return 0;
+ BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
+ BTRFS_BLOCK_GROUP_DATA);
}
/*
* This will check the space that the inode allocates from to make sure we have
* enough space for bytes.
*/
-int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
- u64 bytes)
+int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
{
struct btrfs_space_info *data_sinfo;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
u64 used;
- int ret = 0, committed = 0, flushed = 0;
+ int ret = 0, committed = 0;
/* make sure bytes are sectorsize aligned */
bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
@@ -3248,21 +2895,13 @@ int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
again:
/* make sure we have enough space to handle the data first */
spin_lock(&data_sinfo->lock);
- used = data_sinfo->bytes_used + data_sinfo->bytes_delalloc +
- data_sinfo->bytes_reserved + data_sinfo->bytes_pinned +
- data_sinfo->bytes_readonly + data_sinfo->bytes_may_use +
- data_sinfo->bytes_super;
+ used = data_sinfo->bytes_used + data_sinfo->bytes_reserved +
+ data_sinfo->bytes_pinned + data_sinfo->bytes_readonly +
+ data_sinfo->bytes_may_use;
if (used + bytes > data_sinfo->total_bytes) {
struct btrfs_trans_handle *trans;
- if (!flushed) {
- spin_unlock(&data_sinfo->lock);
- flush_delalloc(root, data_sinfo);
- flushed = 1;
- goto again;
- }
-
/*
* if we don't have enough free bytes in this space then we need
* to alloc a new chunk.
@@ -3274,15 +2913,15 @@ again:
spin_unlock(&data_sinfo->lock);
alloc:
alloc_target = btrfs_get_alloc_profile(root, 1);
- trans = btrfs_start_transaction(root, 1);
- if (!trans)
- return -ENOMEM;
+ trans = btrfs_join_transaction(root, 1);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
ret = do_chunk_alloc(trans, root->fs_info->extent_root,
bytes + 2 * 1024 * 1024,
alloc_target, 0);
btrfs_end_transaction(trans, root);
- if (ret)
+ if (ret < 0)
return ret;
if (!data_sinfo) {
@@ -3297,25 +2936,26 @@ alloc:
if (!committed && !root->fs_info->open_ioctl_trans) {
committed = 1;
trans = btrfs_join_transaction(root, 1);
- if (!trans)
- return -ENOMEM;
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
ret = btrfs_commit_transaction(trans, root);
if (ret)
return ret;
goto again;
}
- printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes"
- ", %llu bytes_used, %llu bytes_reserved, "
- "%llu bytes_pinned, %llu bytes_readonly, %llu may use "
- "%llu total\n", (unsigned long long)bytes,
- (unsigned long long)data_sinfo->bytes_delalloc,
+#if 0 /* I hope we never need this code again, just in case */
+ printk(KERN_ERR "no space left, need %llu, %llu bytes_used, "
+ "%llu bytes_reserved, " "%llu bytes_pinned, "
+ "%llu bytes_readonly, %llu may use %llu total\n",
+ (unsigned long long)bytes,
(unsigned long long)data_sinfo->bytes_used,
(unsigned long long)data_sinfo->bytes_reserved,
(unsigned long long)data_sinfo->bytes_pinned,
(unsigned long long)data_sinfo->bytes_readonly,
(unsigned long long)data_sinfo->bytes_may_use,
(unsigned long long)data_sinfo->total_bytes);
+#endif
return -ENOSPC;
}
data_sinfo->bytes_may_use += bytes;
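
The data-space helpers now take just the inode and derive the root from BTRFS_I(inode)->root. Paired as the ctree.h hunk declares them, a sketch (the page-dirtying step is hypothetical):

ret = btrfs_check_data_free_space(inode, num_bytes);
if (ret)
	return ret;	/* -ENOSPC even after chunk alloc and commit */

ret = dirty_some_pages(inode, num_bytes);	/* hypothetical step */
if (ret)
	btrfs_free_reserved_data_space(inode, num_bytes);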
@@ -3326,12 +2966,13 @@ alloc:
}
/*
- * if there was an error for whatever reason after calling
- * btrfs_check_data_free_space, call this so we can cleanup the counters.
+ * called when we are clearing a delalloc extent from the
+ * inode's io_tree, or when there was an error for whatever reason
+ * after calling btrfs_check_data_free_space
*/
-void btrfs_free_reserved_data_space(struct btrfs_root *root,
- struct inode *inode, u64 bytes)
+void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_space_info *data_sinfo;
/* make sure bytes are sectorsize aligned */
@@ -3344,48 +2985,6 @@ void btrfs_free_reserved_data_space(struct btrfs_root *root,
spin_unlock(&data_sinfo->lock);
}
-/* called when we are adding a delalloc extent to the inode's io_tree */
-void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
- u64 bytes)
-{
- struct btrfs_space_info *data_sinfo;
-
- /* get the space info for where this inode will be storing its data */
- data_sinfo = BTRFS_I(inode)->space_info;
-
- /* make sure we have enough space to handle the data first */
- spin_lock(&data_sinfo->lock);
- data_sinfo->bytes_delalloc += bytes;
-
- /*
- * we are adding a delalloc extent without calling
- * btrfs_check_data_free_space first. This happens on a weird
- * writepage condition, but shouldn't hurt our accounting
- */
- if (unlikely(bytes > BTRFS_I(inode)->reserved_bytes)) {
- data_sinfo->bytes_may_use -= BTRFS_I(inode)->reserved_bytes;
- BTRFS_I(inode)->reserved_bytes = 0;
- } else {
- data_sinfo->bytes_may_use -= bytes;
- BTRFS_I(inode)->reserved_bytes -= bytes;
- }
-
- spin_unlock(&data_sinfo->lock);
-}
-
-/* called when we are clearing an delalloc extent from the inode's io_tree */
-void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
- u64 bytes)
-{
- struct btrfs_space_info *info;
-
- info = BTRFS_I(inode)->space_info;
-
- spin_lock(&info->lock);
- info->bytes_delalloc -= bytes;
- spin_unlock(&info->lock);
-}
-
static void force_metadata_allocation(struct btrfs_fs_info *info)
{
struct list_head *head = &info->space_info;
@@ -3399,13 +2998,28 @@ static void force_metadata_allocation(struct btrfs_fs_info *info)
rcu_read_unlock();
}
+static int should_alloc_chunk(struct btrfs_space_info *sinfo,
+ u64 alloc_bytes)
+{
+ u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
+
+ if (sinfo->bytes_used + sinfo->bytes_reserved +
+ alloc_bytes + 256 * 1024 * 1024 < num_bytes)
+ return 0;
+
+ if (sinfo->bytes_used + sinfo->bytes_reserved +
+ alloc_bytes < div_factor(num_bytes, 8))
+ return 0;
+
+ return 1;
+}
+
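
A worked example for should_alloc_chunk(), assuming div_factor(n, 8) computes n * 8 / 10 as elsewhere in btrfs: for a 10 GiB space with no read-only bytes, the first test defers allocation while more than 256 MiB of headroom remains and the second while usage is under 8 GiB, so a new chunk is requested only once used + reserved + alloc_bytes reaches 9.75 GiB. The 80% test only becomes the stricter of the two for spaces smaller than 1.28 GiB, where 20% of the space is less than 256 MiB.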
static int do_chunk_alloc(struct btrfs_trans_handle *trans,
struct btrfs_root *extent_root, u64 alloc_bytes,
u64 flags, int force)
{
struct btrfs_space_info *space_info;
struct btrfs_fs_info *fs_info = extent_root->fs_info;
- u64 thresh;
int ret = 0;
mutex_lock(&fs_info->chunk_mutex);
@@ -3428,11 +3042,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
goto out;
}
- thresh = space_info->total_bytes - space_info->bytes_readonly;
- thresh = div_factor(thresh, 8);
- if (!force &&
- (space_info->bytes_used + space_info->bytes_pinned +
- space_info->bytes_reserved + alloc_bytes) < thresh) {
+ if (!force && !should_alloc_chunk(space_info, alloc_bytes)) {
spin_unlock(&space_info->lock);
goto out;
}
@@ -3454,6 +3064,8 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
spin_lock(&space_info->lock);
if (ret)
space_info->full = 1;
+ else
+ ret = 1;
space_info->force_alloc = 0;
spin_unlock(&space_info->lock);
out:
@@ -3461,13 +3073,713 @@ out:
return ret;
}
+static int maybe_allocate_chunk(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_space_info *sinfo, u64 num_bytes)
+{
+ int ret;
+ int end_trans = 0;
+
+ if (sinfo->full)
+ return 0;
+
+ spin_lock(&sinfo->lock);
+ ret = should_alloc_chunk(sinfo, num_bytes + 2 * 1024 * 1024);
+ spin_unlock(&sinfo->lock);
+ if (!ret)
+ return 0;
+
+ if (!trans) {
+ trans = btrfs_join_transaction(root, 1);
+ BUG_ON(IS_ERR(trans));
+ end_trans = 1;
+ }
+
+ ret = do_chunk_alloc(trans, root->fs_info->extent_root,
+ num_bytes + 2 * 1024 * 1024,
+ get_alloc_profile(root, sinfo->flags), 0);
+
+ if (end_trans)
+ btrfs_end_transaction(trans, root);
+
+ return ret == 1 ? 1 : 0;
+}
+
+/*
+ * shrink metadata reservation for delalloc
+ */
+static int shrink_delalloc(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 to_reclaim)
+{
+ struct btrfs_block_rsv *block_rsv;
+ u64 reserved;
+ u64 max_reclaim;
+ u64 reclaimed = 0;
+ int pause = 1;
+ int ret;
+
+ block_rsv = &root->fs_info->delalloc_block_rsv;
+ spin_lock(&block_rsv->lock);
+ reserved = block_rsv->reserved;
+ spin_unlock(&block_rsv->lock);
+
+ if (reserved == 0)
+ return 0;
+
+ max_reclaim = min(reserved, to_reclaim);
+
+ while (1) {
+ ret = btrfs_start_one_delalloc_inode(root, trans ? 1 : 0);
+ if (!ret) {
+ __set_current_state(TASK_INTERRUPTIBLE);
+ schedule_timeout(pause);
+ pause <<= 1;
+ if (pause > HZ / 10)
+ pause = HZ / 10;
+ } else {
+ pause = 1;
+ }
+
+ spin_lock(&block_rsv->lock);
+ if (reserved > block_rsv->reserved)
+ reclaimed = reserved - block_rsv->reserved;
+ reserved = block_rsv->reserved;
+ spin_unlock(&block_rsv->lock);
+
+ if (reserved == 0 || reclaimed >= max_reclaim)
+ break;
+
+ if (trans && trans->transaction->blocked)
+ return -EAGAIN;
+ }
+ return reclaimed >= to_reclaim;
+}
+
+static int should_retry_reserve(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv,
+ u64 num_bytes, int *retries)
+{
+ struct btrfs_space_info *space_info = block_rsv->space_info;
+ int ret;
+
+ if ((*retries) > 2)
+ return -ENOSPC;
+
+ ret = maybe_allocate_chunk(trans, root, space_info, num_bytes);
+ if (ret)
+ return 1;
+
+ if (trans && trans->transaction->in_commit)
+ return -ENOSPC;
+
+ ret = shrink_delalloc(trans, root, num_bytes);
+ if (ret)
+ return ret;
+
+ spin_lock(&space_info->lock);
+ if (space_info->bytes_pinned < num_bytes)
+ ret = 1;
+ spin_unlock(&space_info->lock);
+ if (ret)
+ return -ENOSPC;
+
+ (*retries)++;
+
+ if (trans)
+ return -EAGAIN;
+
+ trans = btrfs_join_transaction(root, 1);
+ BUG_ON(IS_ERR(trans));
+ ret = btrfs_commit_transaction(trans, root);
+ BUG_ON(ret);
+
+ return 1;
+}
+
+static int reserve_metadata_bytes(struct btrfs_block_rsv *block_rsv,
+ u64 num_bytes)
+{
+ struct btrfs_space_info *space_info = block_rsv->space_info;
+ u64 unused;
+ int ret = -ENOSPC;
+
+ spin_lock(&space_info->lock);
+ unused = space_info->bytes_used + space_info->bytes_reserved +
+ space_info->bytes_pinned + space_info->bytes_readonly;
+
+ if (unused < space_info->total_bytes)
+ unused = space_info->total_bytes - unused;
+ else
+ unused = 0;
+
+ if (unused >= num_bytes) {
+ if (block_rsv->priority >= 10) {
+ space_info->bytes_reserved += num_bytes;
+ ret = 0;
+ } else {
+ if ((unused + block_rsv->reserved) *
+ block_rsv->priority >=
+ (num_bytes + block_rsv->reserved) * 10) {
+ space_info->bytes_reserved += num_bytes;
+ ret = 0;
+ }
+ }
+ }
+ spin_unlock(&space_info->lock);
+
+ return ret;
+}
+
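
The priority check in reserve_metadata_bytes() weighs a request against the space actually free. With the default priority of 6 set by btrfs_init_block_rsv() and nothing yet reserved in the rsv, the weighted test reduces to unused * 6 >= num_bytes * 10, so roughly 1.67 times the request must be free; a reserve with priority 10 or higher takes the bytes whenever unused >= num_bytes.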
+static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_block_rsv *block_rsv;
+ if (root->ref_cows)
+ block_rsv = trans->block_rsv;
+ else
+ block_rsv = root->block_rsv;
+
+ if (!block_rsv)
+ block_rsv = &root->fs_info->empty_block_rsv;
+
+ return block_rsv;
+}
+
+static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
+ u64 num_bytes)
+{
+ int ret = -ENOSPC;
+ spin_lock(&block_rsv->lock);
+ if (block_rsv->reserved >= num_bytes) {
+ block_rsv->reserved -= num_bytes;
+ if (block_rsv->reserved < block_rsv->size)
+ block_rsv->full = 0;
+ ret = 0;
+ }
+ spin_unlock(&block_rsv->lock);
+ return ret;
+}
+
+static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
+ u64 num_bytes, int update_size)
+{
+ spin_lock(&block_rsv->lock);
+ block_rsv->reserved += num_bytes;
+ if (update_size)
+ block_rsv->size += num_bytes;
+ else if (block_rsv->reserved >= block_rsv->size)
+ block_rsv->full = 1;
+ spin_unlock(&block_rsv->lock);
+}
+
+void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
+ struct btrfs_block_rsv *dest, u64 num_bytes)
+{
+ struct btrfs_space_info *space_info = block_rsv->space_info;
+
+ spin_lock(&block_rsv->lock);
+ if (num_bytes == (u64)-1)
+ num_bytes = block_rsv->size;
+ block_rsv->size -= num_bytes;
+ if (block_rsv->reserved >= block_rsv->size) {
+ num_bytes = block_rsv->reserved - block_rsv->size;
+ block_rsv->reserved = block_rsv->size;
+ block_rsv->full = 1;
+ } else {
+ num_bytes = 0;
+ }
+ spin_unlock(&block_rsv->lock);
+
+ if (num_bytes > 0) {
+ if (dest) {
+ block_rsv_add_bytes(dest, num_bytes, 0);
+ } else {
+ spin_lock(&space_info->lock);
+ space_info->bytes_reserved -= num_bytes;
+ spin_unlock(&space_info->lock);
+ }
+ }
+}
+
+static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src,
+ struct btrfs_block_rsv *dst, u64 num_bytes)
+{
+ int ret;
+
+ ret = block_rsv_use_bytes(src, num_bytes);
+ if (ret)
+ return ret;
+
+ block_rsv_add_bytes(dst, num_bytes, 1);
+ return 0;
+}
+
+void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv)
+{
+ memset(rsv, 0, sizeof(*rsv));
+ spin_lock_init(&rsv->lock);
+ atomic_set(&rsv->usage, 1);
+ rsv->priority = 6;
+ INIT_LIST_HEAD(&rsv->list);
+}
+
+struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
+{
+ struct btrfs_block_rsv *block_rsv;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ u64 alloc_target;
+
+ block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
+ if (!block_rsv)
+ return NULL;
+
+ btrfs_init_block_rsv(block_rsv);
+
+ alloc_target = btrfs_get_alloc_profile(root, 0);
+ block_rsv->space_info = __find_space_info(fs_info,
+ BTRFS_BLOCK_GROUP_METADATA);
+
+ return block_rsv;
+}
+
+void btrfs_free_block_rsv(struct btrfs_root *root,
+ struct btrfs_block_rsv *rsv)
+{
+ if (rsv && atomic_dec_and_test(&rsv->usage)) {
+ btrfs_block_rsv_release(root, rsv, (u64)-1);
+ if (!rsv->durable)
+ kfree(rsv);
+ }
+}
+
+/*
+ * make the block_rsv struct able to capture freed space.
+ * the captured space will be re-added to the block_rsv struct
+ * after transaction commit
+ */
+void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_rsv *block_rsv)
+{
+ block_rsv->durable = 1;
+ mutex_lock(&fs_info->durable_block_rsv_mutex);
+ list_add_tail(&block_rsv->list, &fs_info->durable_block_rsv_list);
+ mutex_unlock(&fs_info->durable_block_rsv_mutex);
+}
+
+int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv,
+ u64 num_bytes, int *retries)
+{
+ int ret;
+
+ if (num_bytes == 0)
+ return 0;
+again:
+ ret = reserve_metadata_bytes(block_rsv, num_bytes);
+ if (!ret) {
+ block_rsv_add_bytes(block_rsv, num_bytes, 1);
+ return 0;
+ }
+
+ ret = should_retry_reserve(trans, root, block_rsv, num_bytes, retries);
+ if (ret > 0)
+ goto again;
+
+ return ret;
+}
+
+int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv,
+ u64 min_reserved, int min_factor)
+{
+ u64 num_bytes = 0;
+ int commit_trans = 0;
+ int ret = -ENOSPC;
+
+ if (!block_rsv)
+ return 0;
+
+ spin_lock(&block_rsv->lock);
+ if (min_factor > 0)
+ num_bytes = div_factor(block_rsv->size, min_factor);
+ if (min_reserved > num_bytes)
+ num_bytes = min_reserved;
+
+ if (block_rsv->reserved >= num_bytes) {
+ ret = 0;
+ } else {
+ num_bytes -= block_rsv->reserved;
+ if (block_rsv->durable &&
+ block_rsv->freed[0] + block_rsv->freed[1] >= num_bytes)
+ commit_trans = 1;
+ }
+ spin_unlock(&block_rsv->lock);
+ if (!ret)
+ return 0;
+
+ if (block_rsv->refill_used) {
+ ret = reserve_metadata_bytes(block_rsv, num_bytes);
+ if (!ret) {
+ block_rsv_add_bytes(block_rsv, num_bytes, 0);
+ return 0;
+ }
+ }
+
+ if (commit_trans) {
+ if (trans)
+ return -EAGAIN;
+
+ trans = btrfs_join_transaction(root, 1);
+ BUG_ON(IS_ERR(trans));
+ ret = btrfs_commit_transaction(trans, root);
+ return 0;
+ }
+
+ WARN_ON(1);
+	printk(KERN_INFO "block_rsv size %llu reserved %llu freed %llu %llu\n",
+ block_rsv->size, block_rsv->reserved,
+ block_rsv->freed[0], block_rsv->freed[1]);
+
+ return -ENOSPC;
+}
+
+int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
+ struct btrfs_block_rsv *dst_rsv,
+ u64 num_bytes)
+{
+ return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
+}
+
+void btrfs_block_rsv_release(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv,
+ u64 num_bytes)
+{
+ struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
+ if (global_rsv->full || global_rsv == block_rsv ||
+ block_rsv->space_info != global_rsv->space_info)
+ global_rsv = NULL;
+ block_rsv_release_bytes(block_rsv, global_rsv, num_bytes);
+}
+
+/*
+ * helper to calculate the size of the global block reservation.
+ * the desired value is the sum of space used by the extent tree,
+ * checksum tree and root tree
+ */
+static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_space_info *sinfo;
+ u64 num_bytes;
+ u64 meta_used;
+ u64 data_used;
+ int csum_size = btrfs_super_csum_size(&fs_info->super_copy);
+#if 0
+ /*
+	 * per tree used space accounting can be inaccurate, so we
+ * can't rely on it.
+ */
+ spin_lock(&fs_info->extent_root->accounting_lock);
+ num_bytes = btrfs_root_used(&fs_info->extent_root->root_item);
+ spin_unlock(&fs_info->extent_root->accounting_lock);
+
+ spin_lock(&fs_info->csum_root->accounting_lock);
+ num_bytes += btrfs_root_used(&fs_info->csum_root->root_item);
+ spin_unlock(&fs_info->csum_root->accounting_lock);
+
+ spin_lock(&fs_info->tree_root->accounting_lock);
+ num_bytes += btrfs_root_used(&fs_info->tree_root->root_item);
+ spin_unlock(&fs_info->tree_root->accounting_lock);
+#endif
+ sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
+ spin_lock(&sinfo->lock);
+ data_used = sinfo->bytes_used;
+ spin_unlock(&sinfo->lock);
+
+ sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
+ spin_lock(&sinfo->lock);
+ meta_used = sinfo->bytes_used;
+ spin_unlock(&sinfo->lock);
+
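+	/*
+	 * reserve room for two copies of the csums covering all data,
+	 * plus 2% of the total used space, but never more than a third
+	 * of the metadata space currently in use
+	 */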
+ num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) *
+ csum_size * 2;
+ num_bytes += div64_u64(data_used + meta_used, 50);
+
+ if (num_bytes * 3 > meta_used)
+ num_bytes = div64_u64(meta_used, 3);
+
+ return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10);
+}
+
+static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
+ struct btrfs_space_info *sinfo = block_rsv->space_info;
+ u64 num_bytes;
+
+ num_bytes = calc_global_metadata_size(fs_info);
+
+ spin_lock(&block_rsv->lock);
+ spin_lock(&sinfo->lock);
+
+ block_rsv->size = num_bytes;
+
+ num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
+ sinfo->bytes_reserved + sinfo->bytes_readonly;
+
+ if (sinfo->total_bytes > num_bytes) {
+ num_bytes = sinfo->total_bytes - num_bytes;
+ block_rsv->reserved += num_bytes;
+ sinfo->bytes_reserved += num_bytes;
+ }
+
+ if (block_rsv->reserved >= block_rsv->size) {
+ num_bytes = block_rsv->reserved - block_rsv->size;
+ sinfo->bytes_reserved -= num_bytes;
+ block_rsv->reserved = block_rsv->size;
+ block_rsv->full = 1;
+ }
+#if 0
+	printk(KERN_INFO "global block rsv size %llu reserved %llu\n",
+ block_rsv->size, block_rsv->reserved);
+#endif
+ spin_unlock(&sinfo->lock);
+ spin_unlock(&block_rsv->lock);
+}
+
+static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_space_info *space_info;
+
+ space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
+ fs_info->chunk_block_rsv.space_info = space_info;
+ fs_info->chunk_block_rsv.priority = 10;
+
+ space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
+ fs_info->global_block_rsv.space_info = space_info;
+ fs_info->global_block_rsv.priority = 10;
+ fs_info->global_block_rsv.refill_used = 1;
+ fs_info->delalloc_block_rsv.space_info = space_info;
+ fs_info->trans_block_rsv.space_info = space_info;
+ fs_info->empty_block_rsv.space_info = space_info;
+ fs_info->empty_block_rsv.priority = 10;
+
+ fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
+ fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
+ fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
+ fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
+ fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
+
+ btrfs_add_durable_block_rsv(fs_info, &fs_info->global_block_rsv);
+
+ btrfs_add_durable_block_rsv(fs_info, &fs_info->delalloc_block_rsv);
+
+ update_global_block_rsv(fs_info);
+}
+
+static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
+{
+ block_rsv_release_bytes(&fs_info->global_block_rsv, NULL, (u64)-1);
+ WARN_ON(fs_info->delalloc_block_rsv.size > 0);
+ WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
+ WARN_ON(fs_info->trans_block_rsv.size > 0);
+ WARN_ON(fs_info->trans_block_rsv.reserved > 0);
+ WARN_ON(fs_info->chunk_block_rsv.size > 0);
+ WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
+}
+
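+/*
+ * worst case per item: cow of one full path, i.e. a leaf plus one node
+ * for each of the remaining levels; the extra factor of 3 is presumably
+ * head-room for tree splits and balancing
+ */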
+static u64 calc_trans_metadata_size(struct btrfs_root *root, int num_items)
+{
+ return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
+ 3 * num_items;
+}
+
+int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ int num_items, int *retries)
+{
+ u64 num_bytes;
+ int ret;
+
+ if (num_items == 0 || root->fs_info->chunk_root == root)
+ return 0;
+
+ num_bytes = calc_trans_metadata_size(root, num_items);
+ ret = btrfs_block_rsv_add(trans, root, &root->fs_info->trans_block_rsv,
+ num_bytes, retries);
+ if (!ret) {
+ trans->bytes_reserved += num_bytes;
+ trans->block_rsv = &root->fs_info->trans_block_rsv;
+ }
+ return ret;
+}
+
+void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ if (!trans->bytes_reserved)
+ return;
+
+ BUG_ON(trans->block_rsv != &root->fs_info->trans_block_rsv);
+ btrfs_block_rsv_release(root, trans->block_rsv,
+ trans->bytes_reserved);
+ trans->bytes_reserved = 0;
+}
+
+int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
+ struct inode *inode)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
+ struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;
+
+ /*
+	 * one for deleting the orphan item, one for updating the inode and
+	 * two for calling btrfs_truncate_inode_items.
+	 *
+	 * btrfs_truncate_inode_items is a delete operation; in most cases
+	 * it frees more space than it uses. So two units of metadata
+	 * space should be enough for calling it many times. If all of
+	 * the metadata space is used, we can commit the transaction and
+	 * use the space it freed.
+ */
+ u64 num_bytes = calc_trans_metadata_size(root, 4);
+ return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
+}
+
+void btrfs_orphan_release_metadata(struct inode *inode)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ u64 num_bytes = calc_trans_metadata_size(root, 4);
+ btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
+}
+
+int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
+ struct btrfs_pending_snapshot *pending)
+{
+ struct btrfs_root *root = pending->root;
+ struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
+ struct btrfs_block_rsv *dst_rsv = &pending->block_rsv;
+ /*
+	 * two for root back/forward refs, two for directory entries
+	 * and one for the root of the snapshot.
+ */
+ u64 num_bytes = calc_trans_metadata_size(root, 5);
+ dst_rsv->space_info = src_rsv->space_info;
+ return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
+}
+
+static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes)
+{
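+	/* coarse heuristic: assume csum items cost about 1/8 of the data size */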
+	return num_bytes >> 3;
+}
+
+int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
+ u64 to_reserve;
+ int nr_extents;
+ int retries = 0;
+ int ret;
+
+ if (btrfs_transaction_in_commit(root->fs_info))
+ schedule_timeout(1);
+
+ num_bytes = ALIGN(num_bytes, root->sectorsize);
+again:
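+	/*
+	 * figure out how many extents are outstanding but not yet backed
+	 * by a reservation; we always reserve for one extra extent up front
+	 */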
+ spin_lock(&BTRFS_I(inode)->accounting_lock);
+ nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents) + 1;
+ if (nr_extents > BTRFS_I(inode)->reserved_extents) {
+ nr_extents -= BTRFS_I(inode)->reserved_extents;
+ to_reserve = calc_trans_metadata_size(root, nr_extents);
+ } else {
+ nr_extents = 0;
+ to_reserve = 0;
+ }
+
+ to_reserve += calc_csum_metadata_size(inode, num_bytes);
+ ret = reserve_metadata_bytes(block_rsv, to_reserve);
+ if (ret) {
+ spin_unlock(&BTRFS_I(inode)->accounting_lock);
+ ret = should_retry_reserve(NULL, root, block_rsv, to_reserve,
+ &retries);
+ if (ret > 0)
+ goto again;
+ return ret;
+ }
+
+ BTRFS_I(inode)->reserved_extents += nr_extents;
+ atomic_inc(&BTRFS_I(inode)->outstanding_extents);
+ spin_unlock(&BTRFS_I(inode)->accounting_lock);
+
+ block_rsv_add_bytes(block_rsv, to_reserve, 1);
+
+ if (block_rsv->size > 512 * 1024 * 1024)
+ shrink_delalloc(NULL, root, to_reserve);
+
+ return 0;
+}
+
+void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ u64 to_free;
+ int nr_extents;
+
+ num_bytes = ALIGN(num_bytes, root->sectorsize);
+ atomic_dec(&BTRFS_I(inode)->outstanding_extents);
+
+ spin_lock(&BTRFS_I(inode)->accounting_lock);
+ nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents);
+ if (nr_extents < BTRFS_I(inode)->reserved_extents) {
+ nr_extents = BTRFS_I(inode)->reserved_extents - nr_extents;
+ BTRFS_I(inode)->reserved_extents -= nr_extents;
+ } else {
+ nr_extents = 0;
+ }
+ spin_unlock(&BTRFS_I(inode)->accounting_lock);
+
+ to_free = calc_csum_metadata_size(inode, num_bytes);
+ if (nr_extents > 0)
+ to_free += calc_trans_metadata_size(root, nr_extents);
+
+ btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
+ to_free);
+}
+
+int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
+{
+ int ret;
+
+ ret = btrfs_check_data_free_space(inode, num_bytes);
+ if (ret)
+ return ret;
+
+ ret = btrfs_delalloc_reserve_metadata(inode, num_bytes);
+ if (ret) {
+ btrfs_free_reserved_data_space(inode, num_bytes);
+ return ret;
+ }
+
+ return 0;
+}
+
+void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
+{
+ btrfs_delalloc_release_metadata(inode, num_bytes);
+ btrfs_free_reserved_data_space(inode, num_bytes);
+}
+
static int update_block_group(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
- u64 bytenr, u64 num_bytes, int alloc,
- int mark_free)
+ u64 bytenr, u64 num_bytes, int alloc)
{
struct btrfs_block_group_cache *cache;
struct btrfs_fs_info *info = root->fs_info;
+ int factor;
u64 total = num_bytes;
u64 old_val;
u64 byte_in_group;
@@ -3486,6 +3798,12 @@ static int update_block_group(struct btrfs_trans_handle *trans,
cache = btrfs_lookup_block_group(info, bytenr);
if (!cache)
return -1;
+ if (cache->flags & (BTRFS_BLOCK_GROUP_DUP |
+ BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_RAID10))
+ factor = 2;
+ else
+ factor = 1;
byte_in_group = bytenr - cache->key.objectid;
WARN_ON(byte_in_group > cache->key.offset);
@@ -3498,31 +3816,24 @@ static int update_block_group(struct btrfs_trans_handle *trans,
old_val += num_bytes;
btrfs_set_block_group_used(&cache->item, old_val);
cache->reserved -= num_bytes;
- cache->space_info->bytes_used += num_bytes;
cache->space_info->bytes_reserved -= num_bytes;
- if (cache->ro)
- cache->space_info->bytes_readonly -= num_bytes;
+ cache->space_info->bytes_used += num_bytes;
+ cache->space_info->disk_used += num_bytes * factor;
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
} else {
old_val -= num_bytes;
- cache->space_info->bytes_used -= num_bytes;
- if (cache->ro)
- cache->space_info->bytes_readonly += num_bytes;
btrfs_set_block_group_used(&cache->item, old_val);
+ cache->pinned += num_bytes;
+ cache->space_info->bytes_pinned += num_bytes;
+ cache->space_info->bytes_used -= num_bytes;
+ cache->space_info->disk_used -= num_bytes * factor;
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
- if (mark_free) {
- int ret;
-
- ret = btrfs_discard_extent(root, bytenr,
- num_bytes);
- WARN_ON(ret);
- ret = btrfs_add_free_space(cache, bytenr,
- num_bytes);
- WARN_ON(ret);
- }
+ set_extent_dirty(info->pinned_extents,
+ bytenr, bytenr + num_bytes - 1,
+ GFP_NOFS | __GFP_NOFAIL);
}
btrfs_put_block_group(cache);
total -= num_bytes;
@@ -3546,18 +3857,10 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
return bytenr;
}
-/*
- * this function must be called within transaction
- */
-int btrfs_pin_extent(struct btrfs_root *root,
- u64 bytenr, u64 num_bytes, int reserved)
+static int pin_down_extent(struct btrfs_root *root,
+ struct btrfs_block_group_cache *cache,
+ u64 bytenr, u64 num_bytes, int reserved)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_block_group_cache *cache;
-
- cache = btrfs_lookup_block_group(fs_info, bytenr);
- BUG_ON(!cache);
-
spin_lock(&cache->space_info->lock);
spin_lock(&cache->lock);
cache->pinned += num_bytes;
@@ -3569,28 +3872,68 @@ int btrfs_pin_extent(struct btrfs_root *root,
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
- btrfs_put_block_group(cache);
+ set_extent_dirty(root->fs_info->pinned_extents, bytenr,
+ bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
+ return 0;
+}
- set_extent_dirty(fs_info->pinned_extents,
- bytenr, bytenr + num_bytes - 1, GFP_NOFS);
+/*
+ * this function must be called within a transaction
+ */
+int btrfs_pin_extent(struct btrfs_root *root,
+ u64 bytenr, u64 num_bytes, int reserved)
+{
+ struct btrfs_block_group_cache *cache;
+
+ cache = btrfs_lookup_block_group(root->fs_info, bytenr);
+ BUG_ON(!cache);
+
+ pin_down_extent(root, cache, bytenr, num_bytes, reserved);
+
+ btrfs_put_block_group(cache);
return 0;
}
-static int update_reserved_extents(struct btrfs_block_group_cache *cache,
- u64 num_bytes, int reserve)
+/*
+ * update the size of reserved extents. this function may return -EAGAIN
+ * if the block group has become read-only and either 'reserve' is true
+ * or 'sinfo' is false.
+ */
+static int update_reserved_bytes(struct btrfs_block_group_cache *cache,
+ u64 num_bytes, int reserve, int sinfo)
{
- spin_lock(&cache->space_info->lock);
- spin_lock(&cache->lock);
- if (reserve) {
- cache->reserved += num_bytes;
- cache->space_info->bytes_reserved += num_bytes;
+ int ret = 0;
+ if (sinfo) {
+ struct btrfs_space_info *space_info = cache->space_info;
+ spin_lock(&space_info->lock);
+ spin_lock(&cache->lock);
+ if (reserve) {
+ if (cache->ro) {
+ ret = -EAGAIN;
+ } else {
+ cache->reserved += num_bytes;
+ space_info->bytes_reserved += num_bytes;
+ }
+ } else {
+ if (cache->ro)
+ space_info->bytes_readonly += num_bytes;
+ cache->reserved -= num_bytes;
+ space_info->bytes_reserved -= num_bytes;
+ }
+ spin_unlock(&cache->lock);
+ spin_unlock(&space_info->lock);
} else {
- cache->reserved -= num_bytes;
- cache->space_info->bytes_reserved -= num_bytes;
+ spin_lock(&cache->lock);
+ if (cache->ro) {
+ ret = -EAGAIN;
+ } else {
+ if (reserve)
+ cache->reserved += num_bytes;
+ else
+ cache->reserved -= num_bytes;
+ }
+ spin_unlock(&cache->lock);
}
- spin_unlock(&cache->lock);
- spin_unlock(&cache->space_info->lock);
- return 0;
+ return ret;
}
int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
@@ -3621,6 +3964,8 @@ int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
fs_info->pinned_extents = &fs_info->freed_extents[0];
up_write(&fs_info->extent_commit_sem);
+
+ update_global_block_rsv(fs_info);
return 0;
}
@@ -3647,14 +3992,21 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
btrfs_add_free_space(cache, start, len);
}
+ start += len;
+
spin_lock(&cache->space_info->lock);
spin_lock(&cache->lock);
cache->pinned -= len;
cache->space_info->bytes_pinned -= len;
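+		/*
+		 * unpinned bytes in a read-only group stay unavailable and
+		 * move to bytes_readonly; otherwise bytes that were backing
+		 * a reservation (reserved_pinned) flow back to bytes_reserved
+		 */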
+ if (cache->ro) {
+ cache->space_info->bytes_readonly += len;
+ } else if (cache->reserved_pinned > 0) {
+ len = min(len, cache->reserved_pinned);
+ cache->reserved_pinned -= len;
+ cache->space_info->bytes_reserved += len;
+ }
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
-
- start += len;
}
if (cache)
@@ -3667,8 +4019,11 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_io_tree *unpin;
+ struct btrfs_block_rsv *block_rsv;
+ struct btrfs_block_rsv *next_rsv;
u64 start;
u64 end;
+ int idx;
int ret;
if (fs_info->pinned_extents == &fs_info->freed_extents[0])
@@ -3689,59 +4044,30 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
cond_resched();
}
- return ret;
-}
-
-static int pin_down_bytes(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- u64 bytenr, u64 num_bytes,
- int is_data, int reserved,
- struct extent_buffer **must_clean)
-{
- int err = 0;
- struct extent_buffer *buf;
+ mutex_lock(&fs_info->durable_block_rsv_mutex);
+ list_for_each_entry_safe(block_rsv, next_rsv,
+ &fs_info->durable_block_rsv_list, list) {
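+		/*
+		 * re-add the space freed during the committing transaction
+		 * (tracked per transid parity in freed[0]/freed[1]) now that
+		 * the commit makes it safe to reuse
+		 */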
- if (is_data)
- goto pinit;
-
- /*
- * discard is sloooow, and so triggering discards on
- * individual btree blocks isn't a good plan. Just
- * pin everything in discard mode.
- */
- if (btrfs_test_opt(root, DISCARD))
- goto pinit;
-
- buf = btrfs_find_tree_block(root, bytenr, num_bytes);
- if (!buf)
- goto pinit;
+ idx = trans->transid & 0x1;
+ if (block_rsv->freed[idx] > 0) {
+ block_rsv_add_bytes(block_rsv,
+ block_rsv->freed[idx], 0);
+ block_rsv->freed[idx] = 0;
+ }
+ if (atomic_read(&block_rsv->usage) == 0) {
+ btrfs_block_rsv_release(root, block_rsv, (u64)-1);
- /* we can reuse a block if it hasn't been written
- * and it is from this transaction. We can't
- * reuse anything from the tree log root because
- * it has tiny sub-transactions.
- */
- if (btrfs_buffer_uptodate(buf, 0) &&
- btrfs_try_tree_lock(buf)) {
- u64 header_owner = btrfs_header_owner(buf);
- u64 header_transid = btrfs_header_generation(buf);
- if (header_owner != BTRFS_TREE_LOG_OBJECTID &&
- header_transid == trans->transid &&
- !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
- *must_clean = buf;
- return 1;
+ if (block_rsv->freed[0] == 0 &&
+ block_rsv->freed[1] == 0) {
+ list_del_init(&block_rsv->list);
+ kfree(block_rsv);
+ }
+ } else {
+ btrfs_block_rsv_release(root, block_rsv, 0);
}
- btrfs_tree_unlock(buf);
}
- free_extent_buffer(buf);
-pinit:
- if (path)
- btrfs_set_path_blocking(path);
- /* unlocks the pinned mutex */
- btrfs_pin_extent(root, bytenr, num_bytes, reserved);
+ mutex_unlock(&fs_info->durable_block_rsv_mutex);
- BUG_ON(err < 0);
return 0;
}
@@ -3902,9 +4228,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
BUG_ON(ret);
}
} else {
- int mark_free = 0;
- struct extent_buffer *must_clean = NULL;
-
if (found_extent) {
BUG_ON(is_data && refs_to_drop !=
extent_data_ref_count(root, path, iref));
@@ -3917,31 +4240,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
}
- ret = pin_down_bytes(trans, root, path, bytenr,
- num_bytes, is_data, 0, &must_clean);
- if (ret > 0)
- mark_free = 1;
- BUG_ON(ret < 0);
- /*
- * it is going to be very rare for someone to be waiting
- * on the block we're freeing. del_items might need to
- * schedule, so rather than get fancy, just force it
- * to blocking here
- */
- if (must_clean)
- btrfs_set_lock_blocking(must_clean);
-
ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
num_to_del);
BUG_ON(ret);
btrfs_release_path(extent_root, path);
- if (must_clean) {
- clean_tree_block(NULL, root, must_clean);
- btrfs_tree_unlock(must_clean);
- free_extent_buffer(must_clean);
- }
-
if (is_data) {
ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
BUG_ON(ret);
@@ -3951,8 +4254,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
(bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT);
}
- ret = update_block_group(trans, root, bytenr, num_bytes, 0,
- mark_free);
+ ret = update_block_group(trans, root, bytenr, num_bytes, 0);
BUG_ON(ret);
}
btrfs_free_path(path);
@@ -3960,7 +4262,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
/*
- * when we free an extent, it is possible (and likely) that we free the last
+ * when we free a block, it is possible (and likely) that we free the last
* delayed ref for that extent as well. This searches the delayed ref tree for
* a given extent, and if there are no other delayed refs to be processed, it
* removes it from the tree.
@@ -3972,7 +4274,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_delayed_ref_node *ref;
struct rb_node *node;
- int ret;
+ int ret = 0;
delayed_refs = &trans->transaction->delayed_refs;
spin_lock(&delayed_refs->lock);
@@ -4024,17 +4326,100 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
list_del_init(&head->cluster);
spin_unlock(&delayed_refs->lock);
- ret = run_one_delayed_ref(trans, root->fs_info->tree_root,
- &head->node, head->extent_op,
- head->must_insert_reserved);
- BUG_ON(ret);
+ BUG_ON(head->extent_op);
+ if (head->must_insert_reserved)
+ ret = 1;
+
+ mutex_unlock(&head->mutex);
btrfs_put_delayed_ref(&head->node);
- return 0;
+ return ret;
out:
spin_unlock(&delayed_refs->lock);
return 0;
}
+void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *buf,
+ u64 parent, int last_ref)
+{
+ struct btrfs_block_rsv *block_rsv;
+ struct btrfs_block_group_cache *cache = NULL;
+ int ret;
+
+ if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
+ ret = btrfs_add_delayed_tree_ref(trans, buf->start, buf->len,
+ parent, root->root_key.objectid,
+ btrfs_header_level(buf),
+ BTRFS_DROP_DELAYED_REF, NULL);
+ BUG_ON(ret);
+ }
+
+ if (!last_ref)
+ return;
+
+ block_rsv = get_block_rsv(trans, root);
+ cache = btrfs_lookup_block_group(root->fs_info, buf->start);
+ if (block_rsv->space_info != cache->space_info)
+ goto out;
+
+ if (btrfs_header_generation(buf) == trans->transid) {
+ if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
+ ret = check_ref_cleanup(trans, root, buf->start);
+ if (!ret)
+ goto pin;
+ }
+
+ if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
+ pin_down_extent(root, cache, buf->start, buf->len, 1);
+ goto pin;
+ }
+
+ WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
+
+ btrfs_add_free_space(cache, buf->start, buf->len);
+ ret = update_reserved_bytes(cache, buf->len, 0, 0);
+ if (ret == -EAGAIN) {
+ /* block group became read-only */
+ update_reserved_bytes(cache, buf->len, 0, 1);
+ goto out;
+ }
+
+ ret = 1;
+ spin_lock(&block_rsv->lock);
+ if (block_rsv->reserved < block_rsv->size) {
+ block_rsv->reserved += buf->len;
+ ret = 0;
+ }
+ spin_unlock(&block_rsv->lock);
+
+ if (ret) {
+ spin_lock(&cache->space_info->lock);
+ cache->space_info->bytes_reserved -= buf->len;
+ spin_unlock(&cache->space_info->lock);
+ }
+ goto out;
+ }
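+	/*
+	 * the block was written out or belongs to an earlier transaction,
+	 * so it cannot be reused directly; pin it, and if the rsv is
+	 * durable let reserved_pinned/freed[] capture the space once the
+	 * pinned range is released
+	 */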
+pin:
+ if (block_rsv->durable && !cache->ro) {
+ ret = 0;
+ spin_lock(&cache->lock);
+ if (!cache->ro) {
+ cache->reserved_pinned += buf->len;
+ ret = 1;
+ }
+ spin_unlock(&cache->lock);
+
+ if (ret) {
+ spin_lock(&block_rsv->lock);
+ block_rsv->freed[trans->transid & 0x1] += buf->len;
+ spin_unlock(&block_rsv->lock);
+ }
+ }
+out:
+ btrfs_put_block_group(cache);
+}
+
int btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent,
@@ -4056,8 +4441,6 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
parent, root_objectid, (int)owner,
BTRFS_DROP_DELAYED_REF, NULL);
BUG_ON(ret);
- ret = check_ref_cleanup(trans, root, bytenr);
- BUG_ON(ret);
} else {
ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
parent, root_objectid, owner,
@@ -4067,21 +4450,6 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
return ret;
}
-int btrfs_free_tree_block(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- u64 bytenr, u32 blocksize,
- u64 parent, u64 root_objectid, int level)
-{
- u64 used;
- spin_lock(&root->node_lock);
- used = btrfs_root_used(&root->root_item) - blocksize;
- btrfs_set_root_used(&root->root_item, used);
- spin_unlock(&root->node_lock);
-
- return btrfs_free_extent(trans, root, bytenr, blocksize,
- parent, root_objectid, level, 0);
-}
-
static u64 stripe_align(struct btrfs_root *root, u64 val)
{
u64 mask = ((u64)root->stripesize - 1);
@@ -4134,6 +4502,22 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
return 0;
}
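+/*
+ * map a block group's raid profile to its per-space_info list index;
+ * more replicated profiles sort first so allocation prefers them
+ */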
+static int get_block_group_index(struct btrfs_block_group_cache *cache)
+{
+ int index;
+ if (cache->flags & BTRFS_BLOCK_GROUP_RAID10)
+ index = 0;
+ else if (cache->flags & BTRFS_BLOCK_GROUP_RAID1)
+ index = 1;
+ else if (cache->flags & BTRFS_BLOCK_GROUP_DUP)
+ index = 2;
+ else if (cache->flags & BTRFS_BLOCK_GROUP_RAID0)
+ index = 3;
+ else
+ index = 4;
+ return index;
+}
+
enum btrfs_loop_type {
LOOP_FIND_IDEAL = 0,
LOOP_CACHING_NOWAIT = 1,
@@ -4155,7 +4539,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
u64 num_bytes, u64 empty_size,
u64 search_start, u64 search_end,
u64 hint_byte, struct btrfs_key *ins,
- u64 exclude_start, u64 exclude_nr,
int data)
{
int ret = 0;
@@ -4168,6 +4551,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_space_info *space_info;
int last_ptr_loop = 0;
int loop = 0;
+ int index = 0;
bool found_uncached_bg = false;
bool failed_cluster_refill = false;
bool failed_alloc = false;
@@ -4237,6 +4621,7 @@ ideal_cache:
btrfs_put_block_group(block_group);
up_read(&space_info->groups_sem);
} else {
+ index = get_block_group_index(block_group);
goto have_block_group;
}
} else if (block_group) {
@@ -4245,7 +4630,8 @@ ideal_cache:
}
search:
down_read(&space_info->groups_sem);
- list_for_each_entry(block_group, &space_info->block_groups, list) {
+ list_for_each_entry(block_group, &space_info->block_groups[index],
+ list) {
u64 offset;
int cached;
@@ -4436,23 +4822,22 @@ checks:
goto loop;
}
- if (exclude_nr > 0 &&
- (search_start + num_bytes > exclude_start &&
- search_start < exclude_start + exclude_nr)) {
- search_start = exclude_start + exclude_nr;
+ ins->objectid = search_start;
+ ins->offset = num_bytes;
+
+ if (offset < search_start)
+ btrfs_add_free_space(block_group, offset,
+ search_start - offset);
+ BUG_ON(offset > search_start);
+ ret = update_reserved_bytes(block_group, num_bytes, 1,
+ (data & BTRFS_BLOCK_GROUP_DATA));
+ if (ret == -EAGAIN) {
btrfs_add_free_space(block_group, offset, num_bytes);
- /*
- * if search_start is still in this block group
- * then we just re-search this block group
- */
- if (search_start >= block_group->key.objectid &&
- search_start < (block_group->key.objectid +
- block_group->key.offset))
- goto have_block_group;
goto loop;
}
+		/* we are all good, let's return */
ins->objectid = search_start;
ins->offset = num_bytes;
@@ -4460,18 +4845,18 @@ checks:
btrfs_add_free_space(block_group, offset,
search_start - offset);
BUG_ON(offset > search_start);
-
- update_reserved_extents(block_group, num_bytes, 1);
-
- /* we are all good, lets return */
break;
loop:
failed_cluster_refill = false;
failed_alloc = false;
+ BUG_ON(index != get_block_group_index(block_group));
btrfs_put_block_group(block_group);
}
up_read(&space_info->groups_sem);
+ if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
+ goto search;
+
 /* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait
 * for them to make caching progress. Also
* determine the best possible bg to cache
@@ -4485,6 +4870,7 @@ loop:
if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE &&
(found_uncached_bg || empty_size || empty_cluster ||
allowed_chunk_alloc)) {
+ index = 0;
if (loop == LOOP_FIND_IDEAL && found_uncached_bg) {
found_uncached_bg = false;
loop++;
@@ -4567,31 +4953,30 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
int dump_block_groups)
{
struct btrfs_block_group_cache *cache;
+ int index = 0;
spin_lock(&info->lock);
printk(KERN_INFO "space_info has %llu free, is %sfull\n",
(unsigned long long)(info->total_bytes - info->bytes_used -
info->bytes_pinned - info->bytes_reserved -
- info->bytes_super),
+ info->bytes_readonly),
(info->full) ? "" : "not ");
- printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu,"
- " may_use=%llu, used=%llu, root=%llu, super=%llu, reserved=%llu"
- "\n",
+ printk(KERN_INFO "space_info total=%llu, used=%llu, pinned=%llu, "
+ "reserved=%llu, may_use=%llu, readonly=%llu\n",
(unsigned long long)info->total_bytes,
+ (unsigned long long)info->bytes_used,
(unsigned long long)info->bytes_pinned,
- (unsigned long long)info->bytes_delalloc,
+ (unsigned long long)info->bytes_reserved,
(unsigned long long)info->bytes_may_use,
- (unsigned long long)info->bytes_used,
- (unsigned long long)info->bytes_root,
- (unsigned long long)info->bytes_super,
- (unsigned long long)info->bytes_reserved);
+ (unsigned long long)info->bytes_readonly);
spin_unlock(&info->lock);
if (!dump_block_groups)
return;
down_read(&info->groups_sem);
- list_for_each_entry(cache, &info->block_groups, list) {
+again:
+ list_for_each_entry(cache, &info->block_groups[index], list) {
spin_lock(&cache->lock);
printk(KERN_INFO "block group %llu has %llu bytes, %llu used "
"%llu pinned %llu reserved\n",
@@ -4603,6 +4988,8 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
btrfs_dump_free_space(cache, bytes);
spin_unlock(&cache->lock);
}
+ if (++index < BTRFS_NR_RAID_TYPES)
+ goto again;
up_read(&info->groups_sem);
}
@@ -4628,9 +5015,8 @@ again:
WARN_ON(num_bytes < root->sectorsize);
ret = find_free_extent(trans, root, num_bytes, empty_size,
- search_start, search_end, hint_byte, ins,
- trans->alloc_exclude_start,
- trans->alloc_exclude_nr, data);
+ search_start, search_end, hint_byte,
+ ins, data);
if (ret == -ENOSPC && num_bytes > min_alloc_size) {
num_bytes = num_bytes >> 1;
@@ -4668,7 +5054,7 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
ret = btrfs_discard_extent(root, start, len);
btrfs_add_free_space(cache, start, len);
- update_reserved_extents(cache, len, 0);
+ update_reserved_bytes(cache, len, 0, 1);
btrfs_put_block_group(cache);
return ret;
@@ -4731,8 +5117,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(path->nodes[0]);
btrfs_free_path(path);
- ret = update_block_group(trans, root, ins->objectid, ins->offset,
- 1, 0);
+ ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
if (ret) {
printk(KERN_ERR "btrfs update block group failed for %llu "
"%llu\n", (unsigned long long)ins->objectid,
@@ -4792,8 +5177,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
btrfs_free_path(path);
- ret = update_block_group(trans, root, ins->objectid, ins->offset,
- 1, 0);
+ ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
if (ret) {
printk(KERN_ERR "btrfs update block group failed for %llu "
"%llu\n", (unsigned long long)ins->objectid,
@@ -4869,73 +5253,14 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
put_caching_control(caching_ctl);
}
- update_reserved_extents(block_group, ins->offset, 1);
+ ret = update_reserved_bytes(block_group, ins->offset, 1, 1);
+ BUG_ON(ret);
btrfs_put_block_group(block_group);
ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
0, owner, offset, ins, 1);
return ret;
}
-/*
- * finds a free extent and does all the dirty work required for allocation
- * returns the key for the extent through ins, and a tree buffer for
- * the first block of the extent through buf.
- *
- * returns 0 if everything worked, non-zero otherwise.
- */
-static int alloc_tree_block(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- u64 num_bytes, u64 parent, u64 root_objectid,
- struct btrfs_disk_key *key, int level,
- u64 empty_size, u64 hint_byte, u64 search_end,
- struct btrfs_key *ins)
-{
- int ret;
- u64 flags = 0;
-
- ret = btrfs_reserve_extent(trans, root, num_bytes, num_bytes,
- empty_size, hint_byte, search_end,
- ins, 0);
- if (ret)
- return ret;
-
- if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
- if (parent == 0)
- parent = ins->objectid;
- flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
- } else
- BUG_ON(parent > 0);
-
- if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
- struct btrfs_delayed_extent_op *extent_op;
- extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
- BUG_ON(!extent_op);
- if (key)
- memcpy(&extent_op->key, key, sizeof(extent_op->key));
- else
- memset(&extent_op->key, 0, sizeof(extent_op->key));
- extent_op->flags_to_set = flags;
- extent_op->update_key = 1;
- extent_op->update_flags = 1;
- extent_op->is_data = 0;
-
- ret = btrfs_add_delayed_tree_ref(trans, ins->objectid,
- ins->offset, parent, root_objectid,
- level, BTRFS_ADD_DELAYED_EXTENT,
- extent_op);
- BUG_ON(ret);
- }
-
- if (root_objectid == root->root_key.objectid) {
- u64 used;
- spin_lock(&root->node_lock);
- used = btrfs_root_used(&root->root_item) + num_bytes;
- btrfs_set_root_used(&root->root_item, used);
- spin_unlock(&root->node_lock);
- }
- return ret;
-}
-
struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u32 blocksize,
@@ -4974,8 +5299,45 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
return buf;
}
+static struct btrfs_block_rsv *
+use_block_rsv(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u32 blocksize)
+{
+ struct btrfs_block_rsv *block_rsv;
+ int ret;
+
+ block_rsv = get_block_rsv(trans, root);
+
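+	/*
+	 * a zero-sized rsv (e.g. the empty rsv) holds no bytes of its own,
+	 * so reserve directly from the space_info
+	 */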
+ if (block_rsv->size == 0) {
+ ret = reserve_metadata_bytes(block_rsv, blocksize);
+ if (ret)
+ return ERR_PTR(ret);
+ return block_rsv;
+ }
+
+ ret = block_rsv_use_bytes(block_rsv, blocksize);
+ if (!ret)
+ return block_rsv;
+
+ WARN_ON(1);
+	printk(KERN_INFO "block_rsv size %llu reserved %llu freed %llu %llu\n",
+ block_rsv->size, block_rsv->reserved,
+ block_rsv->freed[0], block_rsv->freed[1]);
+
+ return ERR_PTR(-ENOSPC);
+}
+
+static void unuse_block_rsv(struct btrfs_block_rsv *block_rsv, u32 blocksize)
+{
+ block_rsv_add_bytes(block_rsv, blocksize, 0);
+ block_rsv_release_bytes(block_rsv, NULL, 0);
+}
+
/*
- * helper function to allocate a block for a given tree
+ * finds a free extent and does all the dirty work required for
+ * allocation, then initializes a tree buffer for the first block
+ * of the extent.
+ *
* returns the tree buffer or NULL.
*/
struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
@@ -4985,18 +5347,53 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
u64 hint, u64 empty_size)
{
struct btrfs_key ins;
- int ret;
+ struct btrfs_block_rsv *block_rsv;
struct extent_buffer *buf;
+ u64 flags = 0;
+ int ret;
+
- ret = alloc_tree_block(trans, root, blocksize, parent, root_objectid,
- key, level, empty_size, hint, (u64)-1, &ins);
+ block_rsv = use_block_rsv(trans, root, blocksize);
+ if (IS_ERR(block_rsv))
+ return ERR_CAST(block_rsv);
+
+ ret = btrfs_reserve_extent(trans, root, blocksize, blocksize,
+ empty_size, hint, (u64)-1, &ins, 0);
if (ret) {
- BUG_ON(ret > 0);
+ unuse_block_rsv(block_rsv, blocksize);
return ERR_PTR(ret);
}
buf = btrfs_init_new_buffer(trans, root, ins.objectid,
blocksize, level);
+ BUG_ON(IS_ERR(buf));
+
+ if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
+ if (parent == 0)
+ parent = ins.objectid;
+ flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
+ } else
+ BUG_ON(parent > 0);
+
+ if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
+ struct btrfs_delayed_extent_op *extent_op;
+ extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
+ BUG_ON(!extent_op);
+ if (key)
+ memcpy(&extent_op->key, key, sizeof(extent_op->key));
+ else
+ memset(&extent_op->key, 0, sizeof(extent_op->key));
+ extent_op->flags_to_set = flags;
+ extent_op->update_key = 1;
+ extent_op->update_flags = 1;
+ extent_op->is_data = 0;
+
+ ret = btrfs_add_delayed_tree_ref(trans, ins.objectid,
+ ins.offset, parent, root_objectid,
+ level, BTRFS_ADD_DELAYED_EXTENT,
+ extent_op);
+ BUG_ON(ret);
+ }
return buf;
}
@@ -5321,7 +5718,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
struct walk_control *wc)
{
- int ret = 0;
+ int ret;
int level = wc->level;
struct extent_buffer *eb = path->nodes[level];
u64 parent = 0;
@@ -5399,13 +5796,11 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
btrfs_header_owner(path->nodes[level + 1]));
}
- ret = btrfs_free_extent(trans, root, eb->start, eb->len, parent,
- root->root_key.objectid, level, 0);
- BUG_ON(ret);
+ btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
out:
wc->refs[level] = 0;
wc->flags[level] = 0;
- return ret;
+ return 0;
}
static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
@@ -5483,7 +5878,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
* also make sure backrefs for the shared block and all lower level
* blocks are properly updated.
*/
-int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
+int btrfs_drop_snapshot(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv, int update_ref)
{
struct btrfs_path *path;
struct btrfs_trans_handle *trans;
@@ -5501,7 +5897,9 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
wc = kzalloc(sizeof(*wc), GFP_NOFS);
BUG_ON(!wc);
- trans = btrfs_start_transaction(tree_root, 1);
+ trans = btrfs_start_transaction(tree_root, 0);
+ if (block_rsv)
+ trans->block_rsv = block_rsv;
if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
level = btrfs_header_level(root->node);
@@ -5589,22 +5987,16 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
}
BUG_ON(wc->level == 0);
- if (trans->transaction->in_commit ||
- trans->transaction->delayed_refs.flushing) {
+ if (btrfs_should_end_transaction(trans, tree_root)) {
ret = btrfs_update_root(trans, tree_root,
&root->root_key,
root_item);
BUG_ON(ret);
- btrfs_end_transaction(trans, tree_root);
- trans = btrfs_start_transaction(tree_root, 1);
- } else {
- unsigned long update;
- update = trans->delayed_ref_updates;
- trans->delayed_ref_updates = 0;
- if (update)
- btrfs_run_delayed_refs(trans, tree_root,
- update);
+ btrfs_end_transaction_throttle(trans, tree_root);
+ trans = btrfs_start_transaction(tree_root, 0);
+ if (block_rsv)
+ trans->block_rsv = block_rsv;
}
}
btrfs_release_path(root, path);
@@ -5632,7 +6024,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
kfree(root);
}
out:
- btrfs_end_transaction(trans, tree_root);
+ btrfs_end_transaction_throttle(trans, tree_root);
kfree(wc);
btrfs_free_path(path);
return err;
@@ -7228,48 +7620,80 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
return flags;
}
-static int __alloc_chunk_for_shrink(struct btrfs_root *root,
- struct btrfs_block_group_cache *shrink_block_group,
- int force)
+static int set_block_group_ro(struct btrfs_block_group_cache *cache)
{
- struct btrfs_trans_handle *trans;
- u64 new_alloc_flags;
- u64 calc;
+ struct btrfs_space_info *sinfo = cache->space_info;
+ u64 num_bytes;
+ int ret = -ENOSPC;
- spin_lock(&shrink_block_group->lock);
- if (btrfs_block_group_used(&shrink_block_group->item) +
- shrink_block_group->reserved > 0) {
- spin_unlock(&shrink_block_group->lock);
+ if (cache->ro)
+ return 0;
- trans = btrfs_start_transaction(root, 1);
- spin_lock(&shrink_block_group->lock);
+ spin_lock(&sinfo->lock);
+ spin_lock(&cache->lock);
+ num_bytes = cache->key.offset - cache->reserved - cache->pinned -
+ cache->bytes_super - btrfs_block_group_used(&cache->item);
+
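+	/*
+	 * the group may go read-only only if marking its remaining free
+	 * space read-only still leaves the space_info enough room for
+	 * everything used, reserved, pinned or may_use
+	 */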
+ if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
+ sinfo->bytes_may_use + sinfo->bytes_readonly +
+ cache->reserved_pinned + num_bytes < sinfo->total_bytes) {
+ sinfo->bytes_readonly += num_bytes;
+ sinfo->bytes_reserved += cache->reserved_pinned;
+ cache->reserved_pinned = 0;
+ cache->ro = 1;
+ ret = 0;
+ }
+ spin_unlock(&cache->lock);
+ spin_unlock(&sinfo->lock);
+ return ret;
+}
- new_alloc_flags = update_block_group_flags(root,
- shrink_block_group->flags);
- if (new_alloc_flags != shrink_block_group->flags) {
- calc =
- btrfs_block_group_used(&shrink_block_group->item);
- } else {
- calc = shrink_block_group->key.offset;
- }
- spin_unlock(&shrink_block_group->lock);
+int btrfs_set_block_group_ro(struct btrfs_root *root,
+ struct btrfs_block_group_cache *cache)
- do_chunk_alloc(trans, root->fs_info->extent_root,
- calc + 2 * 1024 * 1024, new_alloc_flags, force);
+{
+ struct btrfs_trans_handle *trans;
+ u64 alloc_flags;
+ int ret;
- btrfs_end_transaction(trans, root);
- } else
- spin_unlock(&shrink_block_group->lock);
- return 0;
-}
+ BUG_ON(cache->ro);
+
+ trans = btrfs_join_transaction(root, 1);
+ BUG_ON(IS_ERR(trans));
+ alloc_flags = update_block_group_flags(root, cache->flags);
+ if (alloc_flags != cache->flags)
+ do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1);
-int btrfs_prepare_block_group_relocation(struct btrfs_root *root,
- struct btrfs_block_group_cache *group)
+ ret = set_block_group_ro(cache);
+ if (!ret)
+ goto out;
+ alloc_flags = get_alloc_profile(root, cache->space_info->flags);
+ ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1);
+ if (ret < 0)
+ goto out;
+ ret = set_block_group_ro(cache);
+out:
+ btrfs_end_transaction(trans, root);
+ return ret;
+}
+int btrfs_set_block_group_rw(struct btrfs_root *root,
+ struct btrfs_block_group_cache *cache)
{
- __alloc_chunk_for_shrink(root, group, 1);
- set_block_group_readonly(group);
+ struct btrfs_space_info *sinfo = cache->space_info;
+ u64 num_bytes;
+
+ BUG_ON(!cache->ro);
+
+ spin_lock(&sinfo->lock);
+ spin_lock(&cache->lock);
+ num_bytes = cache->key.offset - cache->reserved - cache->pinned -
+ cache->bytes_super - btrfs_block_group_used(&cache->item);
+ sinfo->bytes_readonly -= num_bytes;
+ cache->ro = 0;
+ spin_unlock(&cache->lock);
+ spin_unlock(&sinfo->lock);
return 0;
}
@@ -7436,17 +7860,33 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
*/
synchronize_rcu();
+ release_global_block_rsv(info);
+
while(!list_empty(&info->space_info)) {
space_info = list_entry(info->space_info.next,
struct btrfs_space_info,
list);
-
+ if (space_info->bytes_pinned > 0 ||
+ space_info->bytes_reserved > 0) {
+ WARN_ON(1);
+ dump_space_info(space_info, 0, 0);
+ }
list_del(&space_info->list);
kfree(space_info);
}
return 0;
}
+static void __link_block_group(struct btrfs_space_info *space_info,
+ struct btrfs_block_group_cache *cache)
+{
+ int index = get_block_group_index(cache);
+
+ down_write(&space_info->groups_sem);
+ list_add_tail(&cache->list, &space_info->block_groups[index]);
+ up_write(&space_info->groups_sem);
+}
+
int btrfs_read_block_groups(struct btrfs_root *root)
{
struct btrfs_path *path;
@@ -7468,10 +7908,8 @@ int btrfs_read_block_groups(struct btrfs_root *root)
while (1) {
ret = find_first_block_group(root, path, &key);
- if (ret > 0) {
- ret = 0;
- goto error;
- }
+ if (ret > 0)
+ break;
if (ret != 0)
goto error;
@@ -7480,7 +7918,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
cache = kzalloc(sizeof(*cache), GFP_NOFS);
if (!cache) {
ret = -ENOMEM;
- break;
+ goto error;
}
atomic_set(&cache->count, 1);
@@ -7537,20 +7975,36 @@ int btrfs_read_block_groups(struct btrfs_root *root)
BUG_ON(ret);
cache->space_info = space_info;
spin_lock(&cache->space_info->lock);
- cache->space_info->bytes_super += cache->bytes_super;
+ cache->space_info->bytes_readonly += cache->bytes_super;
spin_unlock(&cache->space_info->lock);
- down_write(&space_info->groups_sem);
- list_add_tail(&cache->list, &space_info->block_groups);
- up_write(&space_info->groups_sem);
+ __link_block_group(space_info, cache);
ret = btrfs_add_block_group_cache(root->fs_info, cache);
BUG_ON(ret);
set_avail_alloc_bits(root->fs_info, cache->flags);
if (btrfs_chunk_readonly(root, cache->key.objectid))
- set_block_group_readonly(cache);
+ set_block_group_ro(cache);
}
+
+ list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
+ if (!(get_alloc_profile(root, space_info->flags) &
+ (BTRFS_BLOCK_GROUP_RAID10 |
+ BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_DUP)))
+ continue;
+ /*
+ * avoid allocating from un-mirrored block group if there are
+ * mirrored block groups.
+ */
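+		/* index 3 is RAID0, index 4 is single; see get_block_group_index() */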
+ list_for_each_entry(cache, &space_info->block_groups[3], list)
+ set_block_group_ro(cache);
+ list_for_each_entry(cache, &space_info->block_groups[4], list)
+ set_block_group_ro(cache);
+ }
+
+ init_global_block_rsv(info);
ret = 0;
error:
btrfs_free_path(path);
@@ -7611,12 +8065,10 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
BUG_ON(ret);
spin_lock(&cache->space_info->lock);
- cache->space_info->bytes_super += cache->bytes_super;
+ cache->space_info->bytes_readonly += cache->bytes_super;
spin_unlock(&cache->space_info->lock);
- down_write(&cache->space_info->groups_sem);
- list_add_tail(&cache->list, &cache->space_info->block_groups);
- up_write(&cache->space_info->groups_sem);
+ __link_block_group(cache->space_info, cache);
ret = btrfs_add_block_group_cache(root->fs_info, cache);
BUG_ON(ret);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index d2d03684fab2..d74e6af9b53a 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -135,7 +135,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask)
return state;
}
-static void free_extent_state(struct extent_state *state)
+void free_extent_state(struct extent_state *state)
{
if (!state)
return;
@@ -335,21 +335,18 @@ static int merge_state(struct extent_io_tree *tree,
}
static int set_state_cb(struct extent_io_tree *tree,
- struct extent_state *state,
- unsigned long bits)
+ struct extent_state *state, int *bits)
{
if (tree->ops && tree->ops->set_bit_hook) {
return tree->ops->set_bit_hook(tree->mapping->host,
- state->start, state->end,
- state->state, bits);
+ state, bits);
}
return 0;
}
static void clear_state_cb(struct extent_io_tree *tree,
- struct extent_state *state,
- unsigned long bits)
+ struct extent_state *state, int *bits)
{
if (tree->ops && tree->ops->clear_bit_hook)
tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
@@ -367,9 +364,10 @@ static void clear_state_cb(struct extent_io_tree *tree,
*/
static int insert_state(struct extent_io_tree *tree,
struct extent_state *state, u64 start, u64 end,
- int bits)
+ int *bits)
{
struct rb_node *node;
+ int bits_to_set = *bits & ~EXTENT_CTLBITS;
int ret;
if (end < start) {
@@ -384,9 +382,9 @@ static int insert_state(struct extent_io_tree *tree,
if (ret)
return ret;
- if (bits & EXTENT_DIRTY)
+ if (bits_to_set & EXTENT_DIRTY)
tree->dirty_bytes += end - start + 1;
- state->state |= bits;
+ state->state |= bits_to_set;
node = tree_insert(&tree->state, end, &state->rb_node);
if (node) {
struct extent_state *found;
@@ -456,13 +454,13 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
* struct is freed and removed from the tree
*/
static int clear_state_bit(struct extent_io_tree *tree,
- struct extent_state *state, int bits, int wake,
- int delete)
+ struct extent_state *state,
+ int *bits, int wake)
{
- int bits_to_clear = bits & ~EXTENT_DO_ACCOUNTING;
+ int bits_to_clear = *bits & ~EXTENT_CTLBITS;
int ret = state->state & bits_to_clear;
- if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
+ if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
u64 range = state->end - state->start + 1;
WARN_ON(range > tree->dirty_bytes);
tree->dirty_bytes -= range;
@@ -471,9 +469,8 @@ static int clear_state_bit(struct extent_io_tree *tree,
state->state &= ~bits_to_clear;
if (wake)
wake_up(&state->wq);
- if (delete || state->state == 0) {
+ if (state->state == 0) {
if (state->tree) {
- clear_state_cb(tree, state, state->state);
rb_erase(&state->rb_node, &tree->state);
state->tree = NULL;
free_extent_state(state);
@@ -514,6 +511,10 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
int set = 0;
int clear = 0;
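+	/*
+	 * a delete clears every bit except the control bits; the first
+	 * delalloc flag only steers the set/clear hooks and is masked out
+	 * of the on-tree state via EXTENT_CTLBITS
+	 */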
+ if (delete)
+ bits |= ~EXTENT_CTLBITS;
+ bits |= EXTENT_FIRST_DELALLOC;
+
if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
clear = 1;
again:
@@ -580,8 +581,7 @@ hit_next:
if (err)
goto out;
if (state->end <= end) {
- set |= clear_state_bit(tree, state, bits, wake,
- delete);
+ set |= clear_state_bit(tree, state, &bits, wake);
if (last_end == (u64)-1)
goto out;
start = last_end + 1;
@@ -602,7 +602,7 @@ hit_next:
if (wake)
wake_up(&state->wq);
- set |= clear_state_bit(tree, prealloc, bits, wake, delete);
+ set |= clear_state_bit(tree, prealloc, &bits, wake);
prealloc = NULL;
goto out;
@@ -613,7 +613,7 @@ hit_next:
else
next_node = NULL;
- set |= clear_state_bit(tree, state, bits, wake, delete);
+ set |= clear_state_bit(tree, state, &bits, wake);
if (last_end == (u64)-1)
goto out;
start = last_end + 1;
@@ -706,19 +706,19 @@ out:
static int set_state_bits(struct extent_io_tree *tree,
struct extent_state *state,
- int bits)
+ int *bits)
{
int ret;
+ int bits_to_set = *bits & ~EXTENT_CTLBITS;
ret = set_state_cb(tree, state, bits);
if (ret)
return ret;
-
- if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
+ if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
u64 range = state->end - state->start + 1;
tree->dirty_bytes += range;
}
- state->state |= bits;
+ state->state |= bits_to_set;
return 0;
}
@@ -745,10 +745,9 @@ static void cache_state(struct extent_state *state,
* [start, end] is inclusive This takes the tree lock.
*/
-static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- int bits, int exclusive_bits, u64 *failed_start,
- struct extent_state **cached_state,
- gfp_t mask)
+int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ int bits, int exclusive_bits, u64 *failed_start,
+ struct extent_state **cached_state, gfp_t mask)
{
struct extent_state *state;
struct extent_state *prealloc = NULL;
@@ -757,6 +756,7 @@ static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
u64 last_start;
u64 last_end;
+ bits |= EXTENT_FIRST_DELALLOC;
again:
if (!prealloc && (mask & __GFP_WAIT)) {
prealloc = alloc_extent_state(mask);
@@ -778,7 +778,7 @@ again:
*/
node = tree_search(tree, start);
if (!node) {
- err = insert_state(tree, prealloc, start, end, bits);
+ err = insert_state(tree, prealloc, start, end, &bits);
prealloc = NULL;
BUG_ON(err == -EEXIST);
goto out;
@@ -802,7 +802,7 @@ hit_next:
goto out;
}
- err = set_state_bits(tree, state, bits);
+ err = set_state_bits(tree, state, &bits);
if (err)
goto out;
@@ -852,7 +852,7 @@ hit_next:
if (err)
goto out;
if (state->end <= end) {
- err = set_state_bits(tree, state, bits);
+ err = set_state_bits(tree, state, &bits);
if (err)
goto out;
cache_state(state, cached_state);
@@ -877,7 +877,7 @@ hit_next:
else
this_end = last_start - 1;
err = insert_state(tree, prealloc, start, this_end,
- bits);
+ &bits);
BUG_ON(err == -EEXIST);
if (err) {
prealloc = NULL;
@@ -903,7 +903,7 @@ hit_next:
err = split_state(tree, state, prealloc, end + 1);
BUG_ON(err == -EEXIST);
- err = set_state_bits(tree, prealloc, bits);
+ err = set_state_bits(tree, prealloc, &bits);
if (err) {
prealloc = NULL;
goto out;
@@ -966,8 +966,7 @@ int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
{
return clear_extent_bit(tree, start, end,
EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING, 0, 0,
- NULL, mask);
+ EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask);
}
int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
@@ -1435,9 +1434,6 @@ int extent_clear_unlock_delalloc(struct inode *inode,
if (op & EXTENT_CLEAR_DELALLOC)
clear_bits |= EXTENT_DELALLOC;
- if (op & EXTENT_CLEAR_ACCOUNTING)
- clear_bits |= EXTENT_DO_ACCOUNTING;
-
clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK |
@@ -1916,7 +1912,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
if (tree->ops && tree->ops->submit_bio_hook)
tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
- mirror_num, bio_flags);
+ mirror_num, bio_flags, start);
else
submit_bio(rw, bio);
if (bio_flagged(bio, BIO_EOPNOTSUPP))
@@ -2020,6 +2016,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
sector_t sector;
struct extent_map *em;
struct block_device *bdev;
+ struct btrfs_ordered_extent *ordered;
int ret;
int nr = 0;
size_t page_offset = 0;
@@ -2031,7 +2028,15 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
set_page_extent_mapped(page);
end = page_end;
- lock_extent(tree, start, end, GFP_NOFS);
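+	/*
+	 * wait for any ordered extent covering this range to finish so
+	 * the on-disk extents and csums are stable before reading
+	 */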
+ while (1) {
+ lock_extent(tree, start, end, GFP_NOFS);
+ ordered = btrfs_lookup_ordered_extent(inode, start);
+ if (!ordered)
+ break;
+ unlock_extent(tree, start, end, GFP_NOFS);
+ btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_put_ordered_extent(ordered);
+ }
if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
char *userpage;
@@ -2589,7 +2594,6 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
.sync_io = wbc->sync_mode == WB_SYNC_ALL,
};
struct writeback_control wbc_writepages = {
- .bdi = wbc->bdi,
.sync_mode = wbc->sync_mode,
.older_than_this = NULL,
.nr_to_write = 64,
@@ -2623,7 +2627,6 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
.sync_io = mode == WB_SYNC_ALL,
};
struct writeback_control wbc_writepages = {
- .bdi = inode->i_mapping->backing_dev_info,
.sync_mode = mode,
.older_than_this = NULL,
.nr_to_write = nr_pages * 2,
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index bbab4813646f..5691c7b590da 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -16,7 +16,9 @@
#define EXTENT_BOUNDARY (1 << 9)
#define EXTENT_NODATASUM (1 << 10)
#define EXTENT_DO_ACCOUNTING (1 << 11)
+#define EXTENT_FIRST_DELALLOC (1 << 12)
#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
+#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
/* flags for bio submission */
#define EXTENT_BIO_COMPRESSED 1
@@ -47,7 +49,7 @@ struct extent_state;
typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw,
struct bio *bio, int mirror_num,
- unsigned long bio_flags);
+ unsigned long bio_flags, u64 bio_offset);
struct extent_io_ops {
int (*fill_delalloc)(struct inode *inode, struct page *locked_page,
u64 start, u64 end, int *page_started,
@@ -69,10 +71,10 @@ struct extent_io_ops {
struct extent_state *state);
int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
struct extent_state *state, int uptodate);
- int (*set_bit_hook)(struct inode *inode, u64 start, u64 end,
- unsigned long old, unsigned long bits);
+ int (*set_bit_hook)(struct inode *inode, struct extent_state *state,
+ int *bits);
int (*clear_bit_hook)(struct inode *inode, struct extent_state *state,
- unsigned long bits);
+ int *bits);
int (*merge_extent_hook)(struct inode *inode,
struct extent_state *new,
struct extent_state *other);
@@ -176,6 +178,7 @@ u64 count_range_bits(struct extent_io_tree *tree,
u64 *start, u64 search_end,
u64 max_bytes, unsigned long bits);
+void free_extent_state(struct extent_state *state);
int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
int bits, int filled, struct extent_state *cached_state);
int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
@@ -185,6 +188,9 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask);
int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
int bits, gfp_t mask);
+int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ int bits, int exclusive_bits, u64 *failed_start,
+ struct extent_state **cached_state, gfp_t mask);
int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask);
int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 54a255065aa3..a562a250ae77 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -149,13 +149,14 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
}
-int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
- struct bio *bio, u32 *dst)
+static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
+ struct inode *inode, struct bio *bio,
+ u64 logical_offset, u32 *dst, int dio)
{
u32 sum;
struct bio_vec *bvec = bio->bi_io_vec;
int bio_index = 0;
- u64 offset;
+ u64 offset = 0;
u64 item_start_offset = 0;
u64 item_last_offset = 0;
u64 disk_bytenr;
@@ -174,8 +175,11 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
WARN_ON(bio->bi_vcnt <= 0);
disk_bytenr = (u64)bio->bi_sector << 9;
+ if (dio)
+ offset = logical_offset;
while (bio_index < bio->bi_vcnt) {
- offset = page_offset(bvec->bv_page) + bvec->bv_offset;
+ if (!dio)
+ offset = page_offset(bvec->bv_page) + bvec->bv_offset;
ret = btrfs_find_ordered_sum(inode, offset, disk_bytenr, &sum);
if (ret == 0)
goto found;
@@ -238,6 +242,7 @@ found:
else
set_state_private(io_tree, offset, sum);
disk_bytenr += bvec->bv_len;
+ offset += bvec->bv_len;
bio_index++;
bvec++;
}
@@ -245,6 +250,18 @@ found:
return 0;
}
+int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
+ struct bio *bio, u32 *dst)
+{
+ return __btrfs_lookup_bio_sums(root, inode, bio, 0, dst, 0);
+}
+
+int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
+ struct bio *bio, u64 offset, u32 *dst)
+{
+ return __btrfs_lookup_bio_sums(root, inode, bio, offset, dst, 1);
+}
+
int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
struct list_head *list)
{
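The refactor above keeps btrfs_lookup_bio_sums() as a thin wrapper and
adds a DIO-aware variant: for direct I/O the bvec pages carry no file
offset, so the caller passes the bio's logical file offset and the loop
advances it by bv_len per vector. A hypothetical call from a DIO submit
path (the logical_offset and csums names are illustrative, not from this
patch):

ret = btrfs_lookup_bio_sums_dio(root, inode, bio, logical_offset, csums);
if (ret)
	goto err;	/* csum lookup failed; fail the read */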
@@ -657,6 +674,9 @@ again:
goto found;
}
ret = PTR_ERR(item);
+ if (ret != -EFBIG && ret != -ENOENT)
+ goto fail_unlock;
+
if (ret == -EFBIG) {
u32 item_size;
/* we found one, but it isn't big enough yet */
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 29ff749ff4ca..e354c33df082 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -46,32 +46,42 @@
static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
int write_bytes,
struct page **prepared_pages,
- const char __user *buf)
+ struct iov_iter *i)
{
- long page_fault = 0;
- int i;
+ size_t copied;
+ int pg = 0;
int offset = pos & (PAGE_CACHE_SIZE - 1);
- for (i = 0; i < num_pages && write_bytes > 0; i++, offset = 0) {
+ while (write_bytes > 0) {
size_t count = min_t(size_t,
PAGE_CACHE_SIZE - offset, write_bytes);
- struct page *page = prepared_pages[i];
- fault_in_pages_readable(buf, count);
+ struct page *page = prepared_pages[pg];
+again:
+ if (unlikely(iov_iter_fault_in_readable(i, count)))
+ return -EFAULT;
/* Copy data from userspace to the current page */
- kmap(page);
- page_fault = __copy_from_user(page_address(page) + offset,
- buf, count);
+ copied = iov_iter_copy_from_user(page, i, offset, count);
+
/* Flush processor's dcache for this page */
flush_dcache_page(page);
- kunmap(page);
- buf += count;
- write_bytes -= count;
+ iov_iter_advance(i, copied);
+ write_bytes -= copied;
- if (page_fault)
- break;
+ if (unlikely(copied == 0)) {
+ count = min_t(size_t, PAGE_CACHE_SIZE - offset,
+ iov_iter_single_seg_count(i));
+ goto again;
+ }
+
+ if (unlikely(copied < PAGE_CACHE_SIZE - offset)) {
+ offset += copied;
+ } else {
+ pg++;
+ offset = 0;
+ }
}
- return page_fault ? -EFAULT : 0;
+ return 0;
}
/*
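The rewrite replaces the open-coded __copy_from_user() loop with the
iov_iter helpers, which tolerate partial copies across iovec segments.
The core idiom, annotated (a restatement of the loop above, not new
behavior):

if (iov_iter_fault_in_readable(i, count))	/* pre-fault user pages */
	return -EFAULT;
copied = iov_iter_copy_from_user(page, i, offset, count);
flush_dcache_page(page);
iov_iter_advance(i, copied);	/* consume exactly what was copied */
if (copied == 0)	/* faulted mid-copy: retry with a smaller chunk */
	count = min_t(size_t, PAGE_CACHE_SIZE - offset,
		      iov_iter_single_seg_count(i));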
@@ -126,8 +136,7 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
end_of_last_block = start_pos + num_bytes - 1;
err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
NULL);
- if (err)
- return err;
+ BUG_ON(err);
for (i = 0; i < num_pages; i++) {
struct page *p = pages[i];
@@ -142,7 +151,7 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
* at this time.
*/
}
- return err;
+ return 0;
}
/*
@@ -823,45 +832,46 @@ again:
return 0;
}
-static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
- size_t count, loff_t *ppos)
+static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
+ const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
{
- loff_t pos;
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = fdentry(file)->d_inode;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct page *pinned[2];
+ struct page **pages = NULL;
+ struct iov_iter i;
+ loff_t *ppos = &iocb->ki_pos;
loff_t start_pos;
ssize_t num_written = 0;
ssize_t err = 0;
+ size_t count;
+ size_t ocount;
int ret = 0;
- struct inode *inode = fdentry(file)->d_inode;
- struct btrfs_root *root = BTRFS_I(inode)->root;
- struct page **pages = NULL;
int nrptrs;
- struct page *pinned[2];
unsigned long first_index;
unsigned long last_index;
int will_write;
+ int buffered = 0;
will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) ||
(file->f_flags & O_DIRECT));
- nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE,
- PAGE_CACHE_SIZE / (sizeof(struct page *)));
pinned[0] = NULL;
pinned[1] = NULL;
- pos = *ppos;
start_pos = pos;
vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
- /* do the reserve before the mutex lock in case we have to do some
- * flushing. We wouldn't deadlock, but this is more polite.
- */
- err = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
- if (err)
- goto out_nolock;
-
mutex_lock(&inode->i_mutex);
+ err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
+ if (err)
+ goto out;
+ count = ocount;
+
current->backing_dev_info = inode->i_mapping->backing_dev_info;
err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
if (err)
@@ -875,15 +885,53 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
goto out;
file_update_time(file);
+ BTRFS_I(inode)->sequence++;
+
+ if (unlikely(file->f_flags & O_DIRECT)) {
+ num_written = generic_file_direct_write(iocb, iov, &nr_segs,
+ pos, ppos, count,
+ ocount);
+ /*
+ * the generic O_DIRECT path updates the in-memory i_size after the
+ * DIOs are done, but our endio handlers that update the on-disk
+ * i_size never update past the in-memory i_size. So we need one
+ * more update here to catch any additions to the file.
+ */
+ if (inode->i_size != BTRFS_I(inode)->disk_i_size) {
+ btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
+ mark_inode_dirty(inode);
+ }
+ if (num_written < 0) {
+ ret = num_written;
+ num_written = 0;
+ goto out;
+ } else if (num_written == count) {
+ /* pick up pos changes done by the generic code */
+ pos = *ppos;
+ goto out;
+ }
+ /*
+ * We are going to do buffered IO for the rest of the range, so we
+ * need to make sure to invalidate the buffered pages when we're
+ * done.
+ */
+ buffered = 1;
+ pos += num_written;
+ }
+
+ iov_iter_init(&i, iov, nr_segs, count, num_written);
+ nrptrs = min((iov_iter_count(&i) + PAGE_CACHE_SIZE - 1) /
+ PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
+ (sizeof(struct page *)));
pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
/* generic_write_checks can change our pos */
start_pos = pos;
- BTRFS_I(inode)->sequence++;
first_index = pos >> PAGE_CACHE_SHIFT;
- last_index = (pos + count) >> PAGE_CACHE_SHIFT;
+ last_index = (pos + iov_iter_count(&i)) >> PAGE_CACHE_SHIFT;
/*
* there are lots of better ways to do this, but this code
@@ -900,7 +948,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
unlock_page(pinned[0]);
}
}
- if ((pos + count) & (PAGE_CACHE_SIZE - 1)) {
+ if ((pos + iov_iter_count(&i)) & (PAGE_CACHE_SIZE - 1)) {
pinned[1] = grab_cache_page(inode->i_mapping, last_index);
if (!PageUptodate(pinned[1])) {
ret = btrfs_readpage(NULL, pinned[1]);
@@ -911,10 +959,10 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
}
}
- while (count > 0) {
+ while (iov_iter_count(&i) > 0) {
size_t offset = pos & (PAGE_CACHE_SIZE - 1);
- size_t write_bytes = min(count, nrptrs *
- (size_t)PAGE_CACHE_SIZE -
+ size_t write_bytes = min(iov_iter_count(&i),
+ nrptrs * (size_t)PAGE_CACHE_SIZE -
offset);
size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >>
PAGE_CACHE_SHIFT;
@@ -922,7 +970,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
WARN_ON(num_pages > nrptrs);
memset(pages, 0, sizeof(struct page *) * nrptrs);
- ret = btrfs_check_data_free_space(root, inode, write_bytes);
+ ret = btrfs_delalloc_reserve_space(inode, write_bytes);
if (ret)
goto out;
@@ -930,26 +978,20 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
pos, first_index, last_index,
write_bytes);
if (ret) {
- btrfs_free_reserved_data_space(root, inode,
- write_bytes);
+ btrfs_delalloc_release_space(inode, write_bytes);
goto out;
}
ret = btrfs_copy_from_user(pos, num_pages,
- write_bytes, pages, buf);
- if (ret) {
- btrfs_free_reserved_data_space(root, inode,
- write_bytes);
- btrfs_drop_pages(pages, num_pages);
- goto out;
+ write_bytes, pages, &i);
+ if (ret == 0) {
+ dirty_and_release_pages(NULL, root, file, pages,
+ num_pages, pos, write_bytes);
}
- ret = dirty_and_release_pages(NULL, root, file, pages,
- num_pages, pos, write_bytes);
btrfs_drop_pages(pages, num_pages);
if (ret) {
- btrfs_free_reserved_data_space(root, inode,
- write_bytes);
+ btrfs_delalloc_release_space(inode, write_bytes);
goto out;
}
@@ -965,8 +1007,6 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
btrfs_throttle(root);
}
- buf += write_bytes;
- count -= write_bytes;
pos += write_bytes;
num_written += write_bytes;
@@ -976,9 +1016,7 @@ out:
mutex_unlock(&inode->i_mutex);
if (ret)
err = ret;
- btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
-out_nolock:
kfree(pages);
if (pinned[0])
page_cache_release(pinned[0]);
@@ -1008,7 +1046,7 @@ out_nolock:
num_written = err;
if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
- trans = btrfs_start_transaction(root, 1);
+ trans = btrfs_start_transaction(root, 0);
ret = btrfs_log_dentry_safe(trans, root,
file->f_dentry);
if (ret == 0) {
@@ -1023,7 +1061,7 @@ out_nolock:
btrfs_end_transaction(trans, root);
}
}
- if (file->f_flags & O_DIRECT) {
+ if (file->f_flags & O_DIRECT && buffered) {
invalidate_mapping_pages(inode->i_mapping,
start_pos >> PAGE_CACHE_SHIFT,
(start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
@@ -1063,8 +1101,9 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
* important optimization for directories because holding the mutex prevents
* new operations on the dir while we write to disk.
*/
-int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
+int btrfs_sync_file(struct file *file, int datasync)
{
+ struct dentry *dentry = file->f_path.dentry;
struct inode *inode = dentry->d_inode;
struct btrfs_root *root = BTRFS_I(inode)->root;
int ret = 0;
@@ -1101,12 +1140,12 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
/*
* ok we haven't committed the transaction yet, lets do a commit
*/
- if (file && file->private_data)
+ if (file->private_data)
btrfs_ioctl_trans_end(file);
- trans = btrfs_start_transaction(root, 1);
- if (!trans) {
- ret = -ENOMEM;
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
goto out;
}
@@ -1151,17 +1190,25 @@ static const struct vm_operations_struct btrfs_file_vm_ops = {
static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
{
- vma->vm_ops = &btrfs_file_vm_ops;
+ struct address_space *mapping = filp->f_mapping;
+
+ if (!mapping->a_ops->readpage)
+ return -ENOEXEC;
+
file_accessed(filp);
+ vma->vm_ops = &btrfs_file_vm_ops;
+ vma->vm_flags |= VM_CAN_NONLINEAR;
+
return 0;
}
const struct file_operations btrfs_file_operations = {
.llseek = generic_file_llseek,
.read = do_sync_read,
+ .write = do_sync_write,
.aio_read = generic_file_aio_read,
.splice_read = generic_file_splice_read,
- .write = btrfs_file_write,
+ .aio_write = btrfs_file_aio_write,
.mmap = btrfs_file_mmap,
.open = generic_file_open,
.release = btrfs_release_file,
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 72ce3c173d6a..64f1150bb48d 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -49,6 +49,33 @@ static int find_name_in_backref(struct btrfs_path *path, const char *name,
return 0;
}
+struct btrfs_inode_ref *
+btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ const char *name, int name_len,
+ u64 inode_objectid, u64 ref_objectid, int mod)
+{
+ struct btrfs_key key;
+ struct btrfs_inode_ref *ref;
+ int ins_len = mod < 0 ? -1 : 0;
+ int cow = mod != 0;
+ int ret;
+
+ key.objectid = inode_objectid;
+ key.type = BTRFS_INODE_REF_KEY;
+ key.offset = ref_objectid;
+
+ ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
+ if (ret < 0)
+ return ERR_PTR(ret);
+ if (ret > 0)
+ return NULL;
+ if (!find_name_in_backref(path, name, name_len, &ref))
+ return NULL;
+ return ref;
+}
+
int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
const char *name, int name_len,
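btrfs_lookup_inode_ref() returns the ref item, NULL when no matching
backref exists, or ERR_PTR() on search failure, letting callers read
fields straight out of the leaf. __unlink_start_trans() later in this
same patch consumes it like this (sketch):

ref = btrfs_lookup_inode_ref(trans, root, path, name, name_len,
			     inode->i_ino, dir->i_ino, 0);
if (IS_ERR(ref))
	return PTR_ERR(ref);
if (ref)
	index = btrfs_inode_ref_index(path->nodes[0], ref);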
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d601629b85d1..1bff92ad4744 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -252,6 +252,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
inline_len, compressed_size,
compressed_pages);
BUG_ON(ret);
+ btrfs_delalloc_release_metadata(inode, end + 1 - start);
btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
return 0;
}
@@ -414,6 +415,7 @@ again:
trans = btrfs_join_transaction(root, 1);
BUG_ON(!trans);
btrfs_set_trans_block_group(trans, inode);
+ trans->block_rsv = &root->fs_info->delalloc_block_rsv;
/* let's try to make an inline extent */
if (ret || total_in < (actual_end - start)) {
@@ -439,7 +441,6 @@ again:
start, end, NULL,
EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
EXTENT_CLEAR_DELALLOC |
- EXTENT_CLEAR_ACCOUNTING |
EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK);
btrfs_end_transaction(trans, root);
@@ -697,6 +698,38 @@ retry:
return 0;
}
+static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
+ u64 num_bytes)
+{
+ struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ struct extent_map *em;
+ u64 alloc_hint = 0;
+
+ read_lock(&em_tree->lock);
+ em = search_extent_mapping(em_tree, start, num_bytes);
+ if (em) {
+ /*
+ * if block start isn't an actual block number then find the
+ * first block in this inode and use that as a hint. If that
+ * block is also bogus then just don't worry about it.
+ */
+ if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+ free_extent_map(em);
+ em = search_extent_mapping(em_tree, 0, 0);
+ if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
+ alloc_hint = em->block_start;
+ if (em)
+ free_extent_map(em);
+ } else {
+ alloc_hint = em->block_start;
+ free_extent_map(em);
+ }
+ }
+ read_unlock(&em_tree->lock);
+
+ return alloc_hint;
+}
+
/*
* when extent_io.c finds a delayed allocation range in the file,
* the call backs end up in this code. The basic idea is to
@@ -734,6 +767,7 @@ static noinline int cow_file_range(struct inode *inode,
trans = btrfs_join_transaction(root, 1);
BUG_ON(!trans);
btrfs_set_trans_block_group(trans, inode);
+ trans->block_rsv = &root->fs_info->delalloc_block_rsv;
actual_end = min_t(u64, isize, end + 1);
@@ -753,7 +787,6 @@ static noinline int cow_file_range(struct inode *inode,
EXTENT_CLEAR_UNLOCK_PAGE |
EXTENT_CLEAR_UNLOCK |
EXTENT_CLEAR_DELALLOC |
- EXTENT_CLEAR_ACCOUNTING |
EXTENT_CLEAR_DIRTY |
EXTENT_SET_WRITEBACK |
EXTENT_END_WRITEBACK);
@@ -769,29 +802,7 @@ static noinline int cow_file_range(struct inode *inode,
BUG_ON(disk_num_bytes >
btrfs_super_total_bytes(&root->fs_info->super_copy));
-
- read_lock(&BTRFS_I(inode)->extent_tree.lock);
- em = search_extent_mapping(&BTRFS_I(inode)->extent_tree,
- start, num_bytes);
- if (em) {
- /*
- * if block start isn't an actual block number then find the
- * first block in this inode and use that as a hint. If that
- * block is also bogus then just don't worry about it.
- */
- if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- free_extent_map(em);
- em = search_extent_mapping(em_tree, 0, 0);
- if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
- alloc_hint = em->block_start;
- if (em)
- free_extent_map(em);
- } else {
- alloc_hint = em->block_start;
- free_extent_map(em);
- }
- }
- read_unlock(&BTRFS_I(inode)->extent_tree.lock);
+ alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
while (disk_num_bytes > 0) {
@@ -1174,6 +1185,13 @@ out_check:
num_bytes, num_bytes, type);
BUG_ON(ret);
+ if (root->root_key.objectid ==
+ BTRFS_DATA_RELOC_TREE_OBJECTID) {
+ ret = btrfs_reloc_clone_csums(inode, cur_offset,
+ num_bytes);
+ BUG_ON(ret);
+ }
+
extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
cur_offset, cur_offset + num_bytes - 1,
locked_page, EXTENT_CLEAR_UNLOCK_PAGE |
@@ -1226,15 +1244,13 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
}
static int btrfs_split_extent_hook(struct inode *inode,
- struct extent_state *orig, u64 split)
+ struct extent_state *orig, u64 split)
{
+ /* not delalloc, ignore it */
if (!(orig->state & EXTENT_DELALLOC))
return 0;
- spin_lock(&BTRFS_I(inode)->accounting_lock);
- BTRFS_I(inode)->outstanding_extents++;
- spin_unlock(&BTRFS_I(inode)->accounting_lock);
-
+ atomic_inc(&BTRFS_I(inode)->outstanding_extents);
return 0;
}
@@ -1252,10 +1268,7 @@ static int btrfs_merge_extent_hook(struct inode *inode,
if (!(other->state & EXTENT_DELALLOC))
return 0;
- spin_lock(&BTRFS_I(inode)->accounting_lock);
- BTRFS_I(inode)->outstanding_extents--;
- spin_unlock(&BTRFS_I(inode)->accounting_lock);
-
+ atomic_dec(&BTRFS_I(inode)->outstanding_extents);
return 0;
}
@@ -1264,8 +1277,8 @@ static int btrfs_merge_extent_hook(struct inode *inode,
* bytes in this file, and to maintain the list of inodes that
* have pending delalloc work to be done.
*/
-static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
- unsigned long old, unsigned long bits)
+static int btrfs_set_bit_hook(struct inode *inode,
+ struct extent_state *state, int *bits)
{
/*
@@ -1273,17 +1286,18 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
* but in this case, we are only testeing for the DELALLOC
* bit, which is only set or cleared with irqs on
*/
- if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
+ if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
struct btrfs_root *root = BTRFS_I(inode)->root;
+ u64 len = state->end + 1 - state->start;
- spin_lock(&BTRFS_I(inode)->accounting_lock);
- BTRFS_I(inode)->outstanding_extents++;
- spin_unlock(&BTRFS_I(inode)->accounting_lock);
- btrfs_delalloc_reserve_space(root, inode, end - start + 1);
+ if (*bits & EXTENT_FIRST_DELALLOC)
+ *bits &= ~EXTENT_FIRST_DELALLOC;
+ else
+ atomic_inc(&BTRFS_I(inode)->outstanding_extents);
spin_lock(&root->fs_info->delalloc_lock);
- BTRFS_I(inode)->delalloc_bytes += end - start + 1;
- root->fs_info->delalloc_bytes += end - start + 1;
+ BTRFS_I(inode)->delalloc_bytes += len;
+ root->fs_info->delalloc_bytes += len;
if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
&root->fs_info->delalloc_inodes);
@@ -1297,45 +1311,32 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
* extent_io.c clear_bit_hook, see set_bit_hook for why
*/
static int btrfs_clear_bit_hook(struct inode *inode,
- struct extent_state *state, unsigned long bits)
+ struct extent_state *state, int *bits)
{
/*
* set_bit and clear bit hooks normally require _irqsave/restore
* but in this case, we are only testeing for the DELALLOC
* bit, which is only set or cleared with irqs on
*/
- if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
+ if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
struct btrfs_root *root = BTRFS_I(inode)->root;
+ u64 len = state->end + 1 - state->start;
- if (bits & EXTENT_DO_ACCOUNTING) {
- spin_lock(&BTRFS_I(inode)->accounting_lock);
- WARN_ON(!BTRFS_I(inode)->outstanding_extents);
- BTRFS_I(inode)->outstanding_extents--;
- spin_unlock(&BTRFS_I(inode)->accounting_lock);
- btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
- }
+ if (*bits & EXTENT_FIRST_DELALLOC)
+ *bits &= ~EXTENT_FIRST_DELALLOC;
+ else if (!(*bits & EXTENT_DO_ACCOUNTING))
+ atomic_dec(&BTRFS_I(inode)->outstanding_extents);
+
+ if (*bits & EXTENT_DO_ACCOUNTING)
+ btrfs_delalloc_release_metadata(inode, len);
+
+ if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID)
+ btrfs_free_reserved_data_space(inode, len);
spin_lock(&root->fs_info->delalloc_lock);
- if (state->end - state->start + 1 >
- root->fs_info->delalloc_bytes) {
- printk(KERN_INFO "btrfs warning: delalloc account "
- "%llu %llu\n",
- (unsigned long long)
- state->end - state->start + 1,
- (unsigned long long)
- root->fs_info->delalloc_bytes);
- btrfs_delalloc_free_space(root, inode, (u64)-1);
- root->fs_info->delalloc_bytes = 0;
- BTRFS_I(inode)->delalloc_bytes = 0;
- } else {
- btrfs_delalloc_free_space(root, inode,
- state->end -
- state->start + 1);
- root->fs_info->delalloc_bytes -= state->end -
- state->start + 1;
- BTRFS_I(inode)->delalloc_bytes -= state->end -
- state->start + 1;
- }
+ root->fs_info->delalloc_bytes -= len;
+ BTRFS_I(inode)->delalloc_bytes -= len;
+
if (BTRFS_I(inode)->delalloc_bytes == 0 &&
!list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
list_del_init(&BTRFS_I(inode)->delalloc_inodes);
@@ -1384,7 +1385,8 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
*/
static int __btrfs_submit_bio_start(struct inode *inode, int rw,
struct bio *bio, int mirror_num,
- unsigned long bio_flags)
+ unsigned long bio_flags,
+ u64 bio_offset)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
int ret = 0;
@@ -1403,7 +1405,8 @@ static int __btrfs_submit_bio_start(struct inode *inode, int rw,
* are inserted into the btree
*/
static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
- int mirror_num, unsigned long bio_flags)
+ int mirror_num, unsigned long bio_flags,
+ u64 bio_offset)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
return btrfs_map_bio(root, rw, bio, mirror_num, 1);
@@ -1414,7 +1417,8 @@ static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
* on write, or reading the csums from the tree before a read
*/
static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
- int mirror_num, unsigned long bio_flags)
+ int mirror_num, unsigned long bio_flags,
+ u64 bio_offset)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
int ret = 0;
@@ -1439,7 +1443,8 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
/* we're doing a write, do the async checksumming */
return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
inode, rw, bio, mirror_num,
- bio_flags, __btrfs_submit_bio_start,
+ bio_flags, bio_offset,
+ __btrfs_submit_bio_start,
__btrfs_submit_bio_done);
}
@@ -1520,6 +1525,7 @@ again:
goto again;
}
+ BUG();
btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state);
ClearPageChecked(page);
out:
@@ -1650,7 +1656,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
- struct btrfs_trans_handle *trans;
+ struct btrfs_trans_handle *trans = NULL;
struct btrfs_ordered_extent *ordered_extent = NULL;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct extent_state *cached_state = NULL;
@@ -1668,9 +1674,10 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
if (!ret) {
trans = btrfs_join_transaction(root, 1);
+ btrfs_set_trans_block_group(trans, inode);
+ trans->block_rsv = &root->fs_info->delalloc_block_rsv;
ret = btrfs_update_inode(trans, root, inode);
BUG_ON(ret);
- btrfs_end_transaction(trans, root);
}
goto out;
}
@@ -1680,6 +1687,8 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
0, &cached_state, GFP_NOFS);
trans = btrfs_join_transaction(root, 1);
+ btrfs_set_trans_block_group(trans, inode);
+ trans->block_rsv = &root->fs_info->delalloc_block_rsv;
if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
compressed = 1;
@@ -1711,12 +1720,13 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
add_pending_csums(trans, inode, ordered_extent->file_offset,
&ordered_extent->list);
- /* this also removes the ordered extent from the tree */
btrfs_ordered_update_i_size(inode, 0, ordered_extent);
ret = btrfs_update_inode(trans, root, inode);
BUG_ON(ret);
- btrfs_end_transaction(trans, root);
out:
+ btrfs_delalloc_release_metadata(inode, ordered_extent->len);
+ if (trans)
+ btrfs_end_transaction(trans, root);
/* once for us */
btrfs_put_ordered_extent(ordered_extent);
/* once for the tree */
@@ -1838,7 +1848,7 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
failrec->last_mirror,
- failrec->bio_flags);
+ failrec->bio_flags, 0);
return 0;
}
@@ -1993,32 +2003,196 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
}
/*
+ * calculate the extra metadata reservation needed when snapshotting a
+ * subvolume that contains orphan files.
+ */
+void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
+ struct btrfs_pending_snapshot *pending,
+ u64 *bytes_to_reserve)
+{
+ struct btrfs_root *root;
+ struct btrfs_block_rsv *block_rsv;
+ u64 num_bytes;
+ int index;
+
+ root = pending->root;
+ if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
+ return;
+
+ block_rsv = root->orphan_block_rsv;
+
+ /* orphan block reservation for the snapshot */
+ num_bytes = block_rsv->size;
+
+ /*
+ * after the snapshot is created, COWing tree blocks may use more
+ * space than they free. So we should make sure there is enough
+ * reserved space.
+ */
+ index = trans->transid & 0x1;
+ if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
+ num_bytes += block_rsv->size -
+ (block_rsv->reserved + block_rsv->freed[index]);
+ }
+
+ *bytes_to_reserve += num_bytes;
+}
+
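The freed[] slot selected by trans->transid & 0x1 appears to track space
returned to the reservation during the current transaction parity (that
reading is inferred from this hunk, not stated in it). A small worked
example of the top-up arithmetic:

/*
 * size = 8M, reserved = 3M, freed[index] = 2M:
 * reserved + freed[index] = 5M < size, so reserve the 3M shortfall.
 */
num_bytes += block_rsv->size -
	     (block_rsv->reserved + block_rsv->freed[index]);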
+void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
+ struct btrfs_pending_snapshot *pending)
+{
+ struct btrfs_root *root = pending->root;
+ struct btrfs_root *snap = pending->snap;
+ struct btrfs_block_rsv *block_rsv;
+ u64 num_bytes;
+ int index;
+ int ret;
+
+ if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
+ return;
+
+ /* refill source subvolume's orphan block reservation */
+ block_rsv = root->orphan_block_rsv;
+ index = trans->transid & 0x1;
+ if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
+ num_bytes = block_rsv->size -
+ (block_rsv->reserved + block_rsv->freed[index]);
+ ret = btrfs_block_rsv_migrate(&pending->block_rsv,
+ root->orphan_block_rsv,
+ num_bytes);
+ BUG_ON(ret);
+ }
+
+ /* setup orphan block reservation for the snapshot */
+ block_rsv = btrfs_alloc_block_rsv(snap);
+ BUG_ON(!block_rsv);
+
+ btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
+ snap->orphan_block_rsv = block_rsv;
+
+ num_bytes = root->orphan_block_rsv->size;
+ ret = btrfs_block_rsv_migrate(&pending->block_rsv,
+ block_rsv, num_bytes);
+ BUG_ON(ret);
+
+#if 0
+ /* insert orphan item for the snapshot */
+ WARN_ON(!root->orphan_item_inserted);
+ ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
+ snap->root_key.objectid);
+ BUG_ON(ret);
+ snap->orphan_item_inserted = 1;
+#endif
+}
+
+enum btrfs_orphan_cleanup_state {
+ ORPHAN_CLEANUP_STARTED = 1,
+ ORPHAN_CLEANUP_DONE = 2,
+};
+
+/*
+ * This is called at transaction commit time. If there are no orphan
+ * files left in the subvolume, it removes the orphan item and frees the
+ * block_rsv structure.
+ */
+void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ int ret;
+
+ if (!list_empty(&root->orphan_list) ||
+ root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
+ return;
+
+ if (root->orphan_item_inserted &&
+ btrfs_root_refs(&root->root_item) > 0) {
+ ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root,
+ root->root_key.objectid);
+ BUG_ON(ret);
+ root->orphan_item_inserted = 0;
+ }
+
+ if (root->orphan_block_rsv) {
+ WARN_ON(root->orphan_block_rsv->size > 0);
+ btrfs_free_block_rsv(root, root->orphan_block_rsv);
+ root->orphan_block_rsv = NULL;
+ }
+}
+
+/*
* This creates an orphan entry for the given inode in case something goes
* wrong in the middle of an unlink/truncate.
+ *
+ * NOTE: the caller of this function should reserve 5 units of metadata
+ * for this function; an illustrative caller sketch follows it below.
*/
int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
- int ret = 0;
+ struct btrfs_block_rsv *block_rsv = NULL;
+ int reserve = 0;
+ int insert = 0;
+ int ret;
- spin_lock(&root->list_lock);
+ if (!root->orphan_block_rsv) {
+ block_rsv = btrfs_alloc_block_rsv(root);
+ BUG_ON(!block_rsv);
+ }
- /* already on the orphan list, we're good */
- if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
- spin_unlock(&root->list_lock);
- return 0;
+ spin_lock(&root->orphan_lock);
+ if (!root->orphan_block_rsv) {
+ root->orphan_block_rsv = block_rsv;
+ } else if (block_rsv) {
+ btrfs_free_block_rsv(root, block_rsv);
+ block_rsv = NULL;
+ }
+
+ if (list_empty(&BTRFS_I(inode)->i_orphan)) {
+ list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
+#if 0
+ /*
+ * For proper ENOSPC handling, we should do orphan
+ * cleanup when mounting. But this introduces a backward
+ * compatibility issue.
+ */
+ if (!xchg(&root->orphan_item_inserted, 1))
+ insert = 2;
+ else
+ insert = 1;
+#endif
+ insert = 1;
+ } else {
+ WARN_ON(!BTRFS_I(inode)->orphan_meta_reserved);
}
- list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
+ if (!BTRFS_I(inode)->orphan_meta_reserved) {
+ BTRFS_I(inode)->orphan_meta_reserved = 1;
+ reserve = 1;
+ }
+ spin_unlock(&root->orphan_lock);
- spin_unlock(&root->list_lock);
+ if (block_rsv)
+ btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
- /*
- * insert an orphan item to track this unlinked/truncated file
- */
- ret = btrfs_insert_orphan_item(trans, root, inode->i_ino);
+ /* grab metadata reservation from transaction handle */
+ if (reserve) {
+ ret = btrfs_orphan_reserve_metadata(trans, inode);
+ BUG_ON(ret);
+ }
- return ret;
+ /* insert an orphan item to track this unlinked/truncated file */
+ if (insert >= 1) {
+ ret = btrfs_insert_orphan_item(trans, root, inode->i_ino);
+ BUG_ON(ret);
+ }
+
+ /* insert an orphan item to record that the subvolume contains orphan files */
+ if (insert >= 2) {
+ ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
+ root->root_key.objectid);
+ BUG_ON(ret);
+ }
+ return 0;
}
/*
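As the NOTE above says, btrfs_orphan_add() no longer reserves metadata
for itself. A hypothetical caller sketch, mirroring how
btrfs_setattr_size() does it later in this patch:

trans = btrfs_start_transaction(root, 5);	/* 5 units, per the NOTE */
if (IS_ERR(trans))
	return PTR_ERR(trans);
ret = btrfs_orphan_add(trans, inode);
BUG_ON(ret);	/* this patch treats orphan-add failure as fatal */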
@@ -2028,26 +2202,31 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
+ int delete_item = 0;
+ int release_rsv = 0;
int ret = 0;
- spin_lock(&root->list_lock);
-
- if (list_empty(&BTRFS_I(inode)->i_orphan)) {
- spin_unlock(&root->list_lock);
- return 0;
+ spin_lock(&root->orphan_lock);
+ if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
+ list_del_init(&BTRFS_I(inode)->i_orphan);
+ delete_item = 1;
}
- list_del_init(&BTRFS_I(inode)->i_orphan);
- if (!trans) {
- spin_unlock(&root->list_lock);
- return 0;
+ if (BTRFS_I(inode)->orphan_meta_reserved) {
+ BTRFS_I(inode)->orphan_meta_reserved = 0;
+ release_rsv = 1;
}
+ spin_unlock(&root->orphan_lock);
- spin_unlock(&root->list_lock);
+ if (trans && delete_item) {
+ ret = btrfs_del_orphan_item(trans, root, inode->i_ino);
+ BUG_ON(ret);
+ }
- ret = btrfs_del_orphan_item(trans, root, inode->i_ino);
+ if (release_rsv)
+ btrfs_orphan_release_metadata(inode);
- return ret;
+ return 0;
}
/*
@@ -2064,7 +2243,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
struct inode *inode;
int ret = 0, nr_unlink = 0, nr_truncate = 0;
- if (!xchg(&root->clean_orphans, 0))
+ if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
return;
path = btrfs_alloc_path();
@@ -2117,16 +2296,15 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
found_key.type = BTRFS_INODE_ITEM_KEY;
found_key.offset = 0;
inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
- if (IS_ERR(inode))
- break;
+ BUG_ON(IS_ERR(inode));
/*
* add this inode to the orphan list so btrfs_orphan_del does
* the proper thing when we hit it
*/
- spin_lock(&root->list_lock);
+ spin_lock(&root->orphan_lock);
list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
- spin_unlock(&root->list_lock);
+ spin_unlock(&root->orphan_lock);
/*
* if this is a bad inode, means we actually succeeded in
@@ -2135,7 +2313,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
* do a destroy_inode
*/
if (is_bad_inode(inode)) {
- trans = btrfs_start_transaction(root, 1);
+ trans = btrfs_start_transaction(root, 0);
btrfs_orphan_del(trans, inode);
btrfs_end_transaction(trans, root);
iput(inode);
@@ -2153,13 +2331,23 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
/* this will do delete_inode and everything for us */
iput(inode);
}
+ btrfs_free_path(path);
+
+ root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
+
+ if (root->orphan_block_rsv)
+ btrfs_block_rsv_release(root, root->orphan_block_rsv,
+ (u64)-1);
+
+ if (root->orphan_block_rsv || root->orphan_item_inserted) {
+ trans = btrfs_join_transaction(root, 1);
+ btrfs_end_transaction(trans, root);
+ }
if (nr_unlink)
printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink);
if (nr_truncate)
printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate);
-
- btrfs_free_path(path);
}
/*
@@ -2478,29 +2666,201 @@ out:
return ret;
}
-static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
+/* helper to check if there is any shared block in the path */
+static int check_path_shared(struct btrfs_root *root,
+ struct btrfs_path *path)
+{
+ struct extent_buffer *eb;
+ int level;
+ int ret;
+ u64 refs = 1;
+
+ for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
+ if (!path->nodes[level])
+ break;
+ eb = path->nodes[level];
+ if (!btrfs_block_can_be_shared(root, eb))
+ continue;
+ ret = btrfs_lookup_extent_info(NULL, root, eb->start, eb->len,
+ &refs, NULL);
+ if (refs > 1)
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * helper to start a transaction for unlink and rmdir.
+ *
+ * unlink and rmdir are special in btrfs: they do not always free space,
+ * so in the ENOSPC case we should make sure they will free space before
+ * allowing them to use the global metadata reservation.
+ */
+static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
+ struct dentry *dentry)
{
- struct btrfs_root *root;
struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct btrfs_path *path;
+ struct btrfs_inode_ref *ref;
+ struct btrfs_dir_item *di;
struct inode *inode = dentry->d_inode;
+ u64 index;
+ int check_link = 1;
+ int err = -ENOSPC;
int ret;
- unsigned long nr = 0;
- root = BTRFS_I(dir)->root;
+ trans = btrfs_start_transaction(root, 10);
+ if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
+ return trans;
- /*
- * 5 items for unlink inode
- * 1 for orphan
- */
- ret = btrfs_reserve_metadata_space(root, 6);
- if (ret)
- return ret;
+ if (inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
+ return ERR_PTR(-ENOSPC);
+
+ /* check if someone else holds a reference */
+ if (S_ISDIR(inode->i_mode) && atomic_read(&inode->i_count) > 1)
+ return ERR_PTR(-ENOSPC);
+
+ if (atomic_read(&inode->i_count) > 2)
+ return ERR_PTR(-ENOSPC);
+
+ if (xchg(&root->fs_info->enospc_unlink, 1))
+ return ERR_PTR(-ENOSPC);
- trans = btrfs_start_transaction(root, 1);
+ path = btrfs_alloc_path();
+ if (!path) {
+ root->fs_info->enospc_unlink = 0;
+ return ERR_PTR(-ENOMEM);
+ }
+
+ trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
- btrfs_unreserve_metadata_space(root, 6);
- return PTR_ERR(trans);
+ btrfs_free_path(path);
+ root->fs_info->enospc_unlink = 0;
+ return trans;
+ }
+
+ path->skip_locking = 1;
+ path->search_commit_root = 1;
+
+ ret = btrfs_lookup_inode(trans, root, path,
+ &BTRFS_I(dir)->location, 0);
+ if (ret < 0) {
+ err = ret;
+ goto out;
+ }
+ if (ret == 0) {
+ if (check_path_shared(root, path))
+ goto out;
+ } else {
+ check_link = 0;
+ }
+ btrfs_release_path(root, path);
+
+ ret = btrfs_lookup_inode(trans, root, path,
+ &BTRFS_I(inode)->location, 0);
+ if (ret < 0) {
+ err = ret;
+ goto out;
+ }
+ if (ret == 0) {
+ if (check_path_shared(root, path))
+ goto out;
+ } else {
+ check_link = 0;
+ }
+ btrfs_release_path(root, path);
+
+ if (ret == 0 && S_ISREG(inode->i_mode)) {
+ ret = btrfs_lookup_file_extent(trans, root, path,
+ inode->i_ino, (u64)-1, 0);
+ if (ret < 0) {
+ err = ret;
+ goto out;
+ }
+ BUG_ON(ret == 0);
+ if (check_path_shared(root, path))
+ goto out;
+ btrfs_release_path(root, path);
+ }
+
+ if (!check_link) {
+ err = 0;
+ goto out;
+ }
+
+ di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
+ dentry->d_name.name, dentry->d_name.len, 0);
+ if (IS_ERR(di)) {
+ err = PTR_ERR(di);
+ goto out;
+ }
+ if (di) {
+ if (check_path_shared(root, path))
+ goto out;
+ } else {
+ err = 0;
+ goto out;
}
+ btrfs_release_path(root, path);
+
+ ref = btrfs_lookup_inode_ref(trans, root, path,
+ dentry->d_name.name, dentry->d_name.len,
+ inode->i_ino, dir->i_ino, 0);
+ if (IS_ERR(ref)) {
+ err = PTR_ERR(ref);
+ goto out;
+ }
+ BUG_ON(!ref);
+ if (check_path_shared(root, path))
+ goto out;
+ index = btrfs_inode_ref_index(path->nodes[0], ref);
+ btrfs_release_path(root, path);
+
+ di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, index,
+ dentry->d_name.name, dentry->d_name.len, 0);
+ if (IS_ERR(di)) {
+ err = PTR_ERR(di);
+ goto out;
+ }
+ BUG_ON(ret == -ENOENT);
+ if (check_path_shared(root, path))
+ goto out;
+
+ err = 0;
+out:
+ btrfs_free_path(path);
+ if (err) {
+ btrfs_end_transaction(trans, root);
+ root->fs_info->enospc_unlink = 0;
+ return ERR_PTR(err);
+ }
+
+ trans->block_rsv = &root->fs_info->global_block_rsv;
+ return trans;
+}
+
+static void __unlink_end_trans(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ if (trans->block_rsv == &root->fs_info->global_block_rsv) {
+ BUG_ON(!root->fs_info->enospc_unlink);
+ root->fs_info->enospc_unlink = 0;
+ }
+ btrfs_end_transaction_throttle(trans, root);
+}
+
+static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
+{
+ struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct btrfs_trans_handle *trans;
+ struct inode *inode = dentry->d_inode;
+ int ret;
+ unsigned long nr = 0;
+
+ trans = __unlink_start_trans(dir, dentry);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
btrfs_set_trans_block_group(trans, dir);
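__unlink_start_trans() above is a two-step strategy: try a normal
10-unit reservation first, and only on -ENOSPC prove via
check_path_shared() that the unlink will actually free space before
falling back to the global reservation. The shape of a call site
(btrfs_unlink() above is the real one):

trans = __unlink_start_trans(dir, dentry);
if (IS_ERR(trans))
	return PTR_ERR(trans);	/* -ENOSPC if freeing space was not provable */
/* ... unlink work ... */
__unlink_end_trans(trans, root);	/* also clears fs_info->enospc_unlink */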
@@ -2508,14 +2868,15 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
dentry->d_name.name, dentry->d_name.len);
+ BUG_ON(ret);
- if (inode->i_nlink == 0)
+ if (inode->i_nlink == 0) {
ret = btrfs_orphan_add(trans, inode);
+ BUG_ON(ret);
+ }
nr = trans->blocks_used;
-
- btrfs_end_transaction_throttle(trans, root);
- btrfs_unreserve_metadata_space(root, 6);
+ __unlink_end_trans(trans, root);
btrfs_btree_balance_dirty(root, nr);
return ret;
}
@@ -2587,7 +2948,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
{
struct inode *inode = dentry->d_inode;
int err = 0;
- int ret;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_trans_handle *trans;
unsigned long nr = 0;
@@ -2596,15 +2956,9 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
return -ENOTEMPTY;
- ret = btrfs_reserve_metadata_space(root, 5);
- if (ret)
- return ret;
-
- trans = btrfs_start_transaction(root, 1);
- if (IS_ERR(trans)) {
- btrfs_unreserve_metadata_space(root, 5);
+ trans = __unlink_start_trans(dir, dentry);
+ if (IS_ERR(trans))
return PTR_ERR(trans);
- }
btrfs_set_trans_block_group(trans, dir);
@@ -2627,12 +2981,9 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
btrfs_i_size_write(inode, 0);
out:
nr = trans->blocks_used;
- ret = btrfs_end_transaction_throttle(trans, root);
- btrfs_unreserve_metadata_space(root, 5);
+ __unlink_end_trans(trans, root);
btrfs_btree_balance_dirty(root, nr);
- if (ret && !err)
- err = ret;
return err;
}
@@ -3029,6 +3380,7 @@ out:
if (pending_del_nr) {
ret = btrfs_del_items(trans, root, path, pending_del_slot,
pending_del_nr);
+ BUG_ON(ret);
}
btrfs_free_path(path);
return err;
@@ -3056,11 +3408,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
if ((offset & (blocksize - 1)) == 0)
goto out;
- ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE);
- if (ret)
- goto out;
-
- ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
+ ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
if (ret)
goto out;
@@ -3068,8 +3416,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
again:
page = grab_cache_page(mapping, index);
if (!page) {
- btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
- btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
+ btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
goto out;
}
@@ -3132,8 +3479,7 @@ again:
out_unlock:
if (ret)
- btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
- btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
+ btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
unlock_page(page);
page_cache_release(page);
out:
@@ -3145,7 +3491,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
- struct extent_map *em;
+ struct extent_map *em = NULL;
struct extent_state *cached_state = NULL;
u64 mask = root->sectorsize - 1;
u64 hole_start = (inode->i_size + mask) & ~mask;
@@ -3183,11 +3529,11 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
u64 hint_byte = 0;
hole_size = last_byte - cur_offset;
- err = btrfs_reserve_metadata_space(root, 2);
- if (err)
+ trans = btrfs_start_transaction(root, 2);
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
break;
-
- trans = btrfs_start_transaction(root, 1);
+ }
btrfs_set_trans_block_group(trans, inode);
err = btrfs_drop_extents(trans, inode, cur_offset,
@@ -3205,14 +3551,15 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
last_byte - 1, 0);
btrfs_end_transaction(trans, root);
- btrfs_unreserve_metadata_space(root, 2);
}
free_extent_map(em);
+ em = NULL;
cur_offset = last_byte;
if (cur_offset >= block_end)
break;
}
+ free_extent_map(em);
unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state,
GFP_NOFS);
return err;
@@ -3239,11 +3586,10 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
}
}
- ret = btrfs_reserve_metadata_space(root, 1);
- if (ret)
- return ret;
+ trans = btrfs_start_transaction(root, 5);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
- trans = btrfs_start_transaction(root, 1);
btrfs_set_trans_block_group(trans, inode);
ret = btrfs_orphan_add(trans, inode);
@@ -3251,7 +3597,6 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
nr = trans->blocks_used;
btrfs_end_transaction(trans, root);
- btrfs_unreserve_metadata_space(root, 1);
btrfs_btree_balance_dirty(root, nr);
if (attr->ia_size > inode->i_size) {
@@ -3264,8 +3609,11 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
i_size_write(inode, attr->ia_size);
btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
- trans = btrfs_start_transaction(root, 1);
+ trans = btrfs_start_transaction(root, 0);
+ BUG_ON(IS_ERR(trans));
btrfs_set_trans_block_group(trans, inode);
+ trans->block_rsv = root->orphan_block_rsv;
+ BUG_ON(!trans->block_rsv);
ret = btrfs_update_inode(trans, root, inode);
BUG_ON(ret);
@@ -3345,10 +3693,21 @@ void btrfs_delete_inode(struct inode *inode)
btrfs_i_size_write(inode, 0);
while (1) {
- trans = btrfs_start_transaction(root, 1);
+ trans = btrfs_start_transaction(root, 0);
+ BUG_ON(IS_ERR(trans));
btrfs_set_trans_block_group(trans, inode);
- ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
+ trans->block_rsv = root->orphan_block_rsv;
+
+ ret = btrfs_block_rsv_check(trans, root,
+ root->orphan_block_rsv, 0, 5);
+ if (ret) {
+ BUG_ON(ret != -EAGAIN);
+ ret = btrfs_commit_transaction(trans, root);
+ BUG_ON(ret);
+ continue;
+ }
+ ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
if (ret != -EAGAIN)
break;
@@ -3356,6 +3715,7 @@ void btrfs_delete_inode(struct inode *inode)
btrfs_end_transaction(trans, root);
trans = NULL;
btrfs_btree_balance_dirty(root, nr);
+
}
if (ret == 0) {
@@ -3596,40 +3956,10 @@ again:
return 0;
}
-static noinline void init_btrfs_i(struct inode *inode)
-{
- struct btrfs_inode *bi = BTRFS_I(inode);
-
- bi->generation = 0;
- bi->sequence = 0;
- bi->last_trans = 0;
- bi->last_sub_trans = 0;
- bi->logged_trans = 0;
- bi->delalloc_bytes = 0;
- bi->reserved_bytes = 0;
- bi->disk_i_size = 0;
- bi->flags = 0;
- bi->index_cnt = (u64)-1;
- bi->last_unlink_trans = 0;
- bi->ordered_data_close = 0;
- bi->force_compress = 0;
- extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
- extent_io_tree_init(&BTRFS_I(inode)->io_tree,
- inode->i_mapping, GFP_NOFS);
- extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
- inode->i_mapping, GFP_NOFS);
- INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
- INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations);
- RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
- btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
- mutex_init(&BTRFS_I(inode)->log_mutex);
-}
-
static int btrfs_init_locked_inode(struct inode *inode, void *p)
{
struct btrfs_iget_args *args = p;
inode->i_ino = args->ino;
- init_btrfs_i(inode);
BTRFS_I(inode)->root = args->root;
btrfs_set_inode_space_info(args->root, inode);
return 0;
@@ -3692,8 +4022,6 @@ static struct inode *new_simple_dir(struct super_block *s,
if (!inode)
return ERR_PTR(-ENOMEM);
- init_btrfs_i(inode);
-
BTRFS_I(inode)->root = root;
memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
BTRFS_I(inode)->dummy_inode = 1;
@@ -3950,7 +4278,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
struct btrfs_trans_handle *trans;
int ret = 0;
- if (root->fs_info->btree_inode == inode)
+ if (BTRFS_I(inode)->dummy_inode)
return 0;
if (wbc->sync_mode == WB_SYNC_ALL) {
@@ -3971,10 +4299,38 @@ void btrfs_dirty_inode(struct inode *inode)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans;
+ int ret;
+
+ if (BTRFS_I(inode)->dummy_inode)
+ return;
trans = btrfs_join_transaction(root, 1);
btrfs_set_trans_block_group(trans, inode);
- btrfs_update_inode(trans, root, inode);
+
+ ret = btrfs_update_inode(trans, root, inode);
+ if (ret && ret == -ENOSPC) {
+ /* whoops, let's try again with the full transaction */
+ btrfs_end_transaction(trans, root);
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans)) {
+ if (printk_ratelimit()) {
+ printk(KERN_ERR "btrfs: fail to "
+ "dirty inode %lu error %ld\n",
+ inode->i_ino, PTR_ERR(trans));
+ }
+ return;
+ }
+ btrfs_set_trans_block_group(trans, inode);
+
+ ret = btrfs_update_inode(trans, root, inode);
+ if (ret) {
+ if (printk_ratelimit()) {
+ printk(KERN_ERR "btrfs: fail to "
+ "dirty inode %lu error %d\n",
+ inode->i_ino, ret);
+ }
+ }
+ }
btrfs_end_transaction(trans, root);
}
@@ -4092,7 +4448,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
* btrfs_get_inode_index_count has an explanation for the magic
* number
*/
- init_btrfs_i(inode);
BTRFS_I(inode)->index_cnt = 2;
BTRFS_I(inode)->root = root;
BTRFS_I(inode)->generation = trans->transid;
@@ -4247,26 +4602,21 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
if (!new_valid_dev(rdev))
return -EINVAL;
+ err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
+ if (err)
+ return err;
+
/*
* 2 for inode item and ref
* 2 for dir items
* 1 for xattr if selinux is on
*/
- err = btrfs_reserve_metadata_space(root, 5);
- if (err)
- return err;
+ trans = btrfs_start_transaction(root, 5);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
- trans = btrfs_start_transaction(root, 1);
- if (!trans)
- goto fail;
btrfs_set_trans_block_group(trans, dir);
- err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
- if (err) {
- err = -ENOSPC;
- goto out_unlock;
- }
-
inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
dentry->d_name.len,
dentry->d_parent->d_inode->i_ino, objectid,
@@ -4295,13 +4645,11 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
out_unlock:
nr = trans->blocks_used;
btrfs_end_transaction_throttle(trans, root);
-fail:
- btrfs_unreserve_metadata_space(root, 5);
+ btrfs_btree_balance_dirty(root, nr);
if (drop_inode) {
inode_dec_link_count(inode);
iput(inode);
}
- btrfs_btree_balance_dirty(root, nr);
return err;
}
@@ -4311,32 +4659,26 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct inode *inode = NULL;
- int err;
int drop_inode = 0;
+ int err;
unsigned long nr = 0;
u64 objectid;
u64 index = 0;
+ err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
+ if (err)
+ return err;
/*
* 2 for inode item and ref
* 2 for dir items
* 1 for xattr if selinux is on
*/
- err = btrfs_reserve_metadata_space(root, 5);
- if (err)
- return err;
+ trans = btrfs_start_transaction(root, 5);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
- trans = btrfs_start_transaction(root, 1);
- if (!trans)
- goto fail;
btrfs_set_trans_block_group(trans, dir);
- err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
- if (err) {
- err = -ENOSPC;
- goto out_unlock;
- }
-
inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
dentry->d_name.len,
dentry->d_parent->d_inode->i_ino,
@@ -4368,8 +4710,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
out_unlock:
nr = trans->blocks_used;
btrfs_end_transaction_throttle(trans, root);
-fail:
- btrfs_unreserve_metadata_space(root, 5);
if (drop_inode) {
inode_dec_link_count(inode);
iput(inode);
@@ -4396,21 +4736,21 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
if (root->objectid != BTRFS_I(inode)->root->objectid)
return -EPERM;
- /*
- * 1 item for inode ref
- * 2 items for dir items
- */
- err = btrfs_reserve_metadata_space(root, 3);
- if (err)
- return err;
-
btrfs_inc_nlink(inode);
err = btrfs_set_inode_index(dir, &index);
if (err)
goto fail;
- trans = btrfs_start_transaction(root, 1);
+ /*
+ * 1 item for inode ref
+ * 2 items for dir items
+ */
+ trans = btrfs_start_transaction(root, 3);
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ goto fail;
+ }
btrfs_set_trans_block_group(trans, dir);
atomic_inc(&inode->i_count);
@@ -4429,7 +4769,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
nr = trans->blocks_used;
btrfs_end_transaction_throttle(trans, root);
fail:
- btrfs_unreserve_metadata_space(root, 3);
if (drop_inode) {
inode_dec_link_count(inode);
iput(inode);
@@ -4449,28 +4788,20 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
u64 index = 0;
unsigned long nr = 1;
+ err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
+ if (err)
+ return err;
+
/*
* 2 items for inode and ref
* 2 items for dir items
* 1 for xattr if selinux is on
*/
- err = btrfs_reserve_metadata_space(root, 5);
- if (err)
- return err;
-
- trans = btrfs_start_transaction(root, 1);
- if (!trans) {
- err = -ENOMEM;
- goto out_unlock;
- }
+ trans = btrfs_start_transaction(root, 5);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
btrfs_set_trans_block_group(trans, dir);
- err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
- if (err) {
- err = -ENOSPC;
- goto out_fail;
- }
-
inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
dentry->d_name.len,
dentry->d_parent->d_inode->i_ino, objectid,
@@ -4510,9 +4841,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
out_fail:
nr = trans->blocks_used;
btrfs_end_transaction_throttle(trans, root);
-
-out_unlock:
- btrfs_unreserve_metadata_space(root, 5);
if (drop_on_err)
iput(inode);
btrfs_btree_balance_dirty(root, nr);
@@ -4770,6 +5098,7 @@ again:
}
flush_dcache_page(page);
} else if (create && PageUptodate(page)) {
+ WARN_ON(1);
if (!trans) {
kunmap(page);
free_extent_map(em);
@@ -4866,11 +5195,651 @@ out:
return em;
}
+static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
+ u64 start, u64 len)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_trans_handle *trans;
+ struct extent_map *em;
+ struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ struct btrfs_key ins;
+ u64 alloc_hint;
+ int ret;
+
+ btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
+
+ trans = btrfs_join_transaction(root, 0);
+ if (!trans)
+ return ERR_PTR(-ENOMEM);
+
+ trans->block_rsv = &root->fs_info->delalloc_block_rsv;
+
+ alloc_hint = get_extent_allocation_hint(inode, start, len);
+ ret = btrfs_reserve_extent(trans, root, len, root->sectorsize, 0,
+ alloc_hint, (u64)-1, &ins, 1);
+ if (ret) {
+ em = ERR_PTR(ret);
+ goto out;
+ }
+
+ em = alloc_extent_map(GFP_NOFS);
+ if (!em) {
+ em = ERR_PTR(-ENOMEM);
+ goto out;
+ }
+
+ em->start = start;
+ em->orig_start = em->start;
+ em->len = ins.offset;
+
+ em->block_start = ins.objectid;
+ em->block_len = ins.offset;
+ em->bdev = root->fs_info->fs_devices->latest_bdev;
+ set_bit(EXTENT_FLAG_PINNED, &em->flags);
+
+ while (1) {
+ write_lock(&em_tree->lock);
+ ret = add_extent_mapping(em_tree, em);
+ write_unlock(&em_tree->lock);
+ if (ret != -EEXIST)
+ break;
+ btrfs_drop_extent_cache(inode, start, start + em->len - 1, 0);
+ }
+
+ ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
+ ins.offset, ins.offset, 0);
+ if (ret) {
+ btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
+ em = ERR_PTR(ret);
+ }
+out:
+ btrfs_end_transaction(trans, root);
+ return em;
+}
+
+/*
+ * returns 1 when nocow is safe, < 0 on error, 0 if the
+ * block must be cow'd
+ */
+static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
+ struct inode *inode, u64 offset, u64 len)
+{
+ struct btrfs_path *path;
+ int ret;
+ struct extent_buffer *leaf;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_file_extent_item *fi;
+ struct btrfs_key key;
+ u64 disk_bytenr;
+ u64 backref_offset;
+ u64 extent_end;
+ u64 num_bytes;
+ int slot;
+ int found_type;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
+ offset, 0);
+ if (ret < 0)
+ goto out;
+
+ slot = path->slots[0];
+ if (ret == 1) {
+ if (slot == 0) {
+ /* can't find the item, must cow */
+ ret = 0;
+ goto out;
+ }
+ slot--;
+ }
+ ret = 0;
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (key.objectid != inode->i_ino ||
+ key.type != BTRFS_EXTENT_DATA_KEY) {
+ /* not our file or wrong item type, must cow */
+ goto out;
+ }
+
+ if (key.offset > offset) {
+ /* Wrong offset, must cow */
+ goto out;
+ }
+
+ fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+ found_type = btrfs_file_extent_type(leaf, fi);
+ if (found_type != BTRFS_FILE_EXTENT_REG &&
+ found_type != BTRFS_FILE_EXTENT_PREALLOC) {
+ /* not a regular extent, must cow */
+ goto out;
+ }
+ disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+ backref_offset = btrfs_file_extent_offset(leaf, fi);
+
+ extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
+ if (extent_end < offset + len) {
+ /* extent doesn't include our full range, must cow */
+ goto out;
+ }
+
+ if (btrfs_extent_readonly(root, disk_bytenr))
+ goto out;
+
+ /*
+	 * look for other files referencing this extent; if we
+	 * find any we must cow
+ */
+ if (btrfs_cross_ref_exist(trans, root, inode->i_ino,
+ key.offset - backref_offset, disk_bytenr))
+ goto out;
+
+ /*
+ * adjust disk_bytenr and num_bytes to cover just the bytes
+ * in this extent we are about to write. If there
+ * are any csums in that range we have to cow in order
+ * to keep the csums correct
+ */
+ disk_bytenr += backref_offset;
+ disk_bytenr += offset - key.offset;
+ num_bytes = min(offset + len, extent_end) - offset;
+ if (csum_exist_in_range(root, disk_bytenr, num_bytes))
+ goto out;
+ /*
+	 * all of the above have passed; it is safe to overwrite this extent
+ * without cow
+ */
+ ret = 1;
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ struct extent_map *em;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ u64 start = iblock << inode->i_blkbits;
+ u64 len = bh_result->b_size;
+ struct btrfs_trans_handle *trans;
+
+ em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
+ if (IS_ERR(em))
+ return PTR_ERR(em);
+
+ /*
+	 * For INLINE and COMPRESSED extents we need to fall back on buffered
+	 * io. INLINE is special, and we could probably kludge it in here, but
+	 * it's still buffered so for safety let's just fall back to the generic
+	 * buffered path.
+	 *
+	 * For COMPRESSED we _have_ to read the entire extent in so we can
+	 * decompress it, so there will be buffering required no matter what we
+	 * do, so go ahead and fall back to buffered.
+	 *
+	 * We return -ENOTBLK because that's what makes DIO go ahead and fall
+	 * back to buffered IO. Don't blame me, this is the price we pay for
+	 * using the generic code.
+ */
+ if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
+ em->block_start == EXTENT_MAP_INLINE) {
+ free_extent_map(em);
+ return -ENOTBLK;
+ }
+
+	/* Just a good old-fashioned hole, return */
+ if (!create && (em->block_start == EXTENT_MAP_HOLE ||
+ test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
+ free_extent_map(em);
+ /* DIO will do one hole at a time, so just unlock a sector */
+ unlock_extent(&BTRFS_I(inode)->io_tree, start,
+ start + root->sectorsize - 1, GFP_NOFS);
+ return 0;
+ }
+
+ /*
+ * We don't allocate a new extent in the following cases
+ *
+ * 1) The inode is marked as NODATACOW. In this case we'll just use the
+ * existing extent.
+ * 2) The extent is marked as PREALLOC. We're good to go here and can
+ * just use the extent.
+ *
+ */
+ if (!create) {
+ len = em->len - (start - em->start);
+ goto map;
+ }
+
+ if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
+ ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
+ em->block_start != EXTENT_MAP_HOLE)) {
+ int type;
+ int ret;
+ u64 block_start;
+
+ if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ type = BTRFS_ORDERED_PREALLOC;
+ else
+ type = BTRFS_ORDERED_NOCOW;
+ len = min(len, em->len - (start - em->start));
+ block_start = em->block_start + (start - em->start);
+
+ /*
+ * we're not going to log anything, but we do need
+ * to make sure the current transaction stays open
+ * while we look for nocow cross refs
+ */
+ trans = btrfs_join_transaction(root, 0);
+ if (!trans)
+ goto must_cow;
+
+ if (can_nocow_odirect(trans, inode, start, len) == 1) {
+ ret = btrfs_add_ordered_extent_dio(inode, start,
+ block_start, len, len, type);
+ btrfs_end_transaction(trans, root);
+ if (ret) {
+ free_extent_map(em);
+ return ret;
+ }
+ goto unlock;
+ }
+ btrfs_end_transaction(trans, root);
+ }
+must_cow:
+ /*
+	 * this will cow the extent; reset the len in case we changed
+ * it above
+ */
+ len = bh_result->b_size;
+ free_extent_map(em);
+ em = btrfs_new_extent_direct(inode, start, len);
+ if (IS_ERR(em))
+ return PTR_ERR(em);
+ len = min(len, em->len - (start - em->start));
+unlock:
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, start, start + len - 1,
+ EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DIRTY, 1,
+ 0, NULL, GFP_NOFS);
+map:
+ bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
+ inode->i_blkbits;
+ bh_result->b_size = len;
+ bh_result->b_bdev = em->bdev;
+ set_buffer_mapped(bh_result);
+ if (create && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ set_buffer_new(bh_result);
+
+ free_extent_map(em);
+
+ return 0;
+}
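/*
 * A minimal user-space sketch of the buffer_head mapping arithmetic in
 * btrfs_get_blocks_direct() above: the disk block is the extent's start
 * block plus the offset into the extent, scaled by the block size. The
 * constants below are illustrative, not taken from the patch.
 */
#include <assert.h>
#include <stdint.h>

static uint64_t map_block(uint64_t em_start, uint64_t em_block_start,
			  uint64_t start, unsigned int blkbits)
{
	/* translate a byte offset inside the extent to a disk block number */
	return (em_block_start + (start - em_start)) >> blkbits;
}

int main(void)
{
	/* extent: file bytes at 1MiB map to disk bytes at 8MiB, 4KiB blocks */
	assert(map_block(1 << 20, 8 << 20, (1 << 20) + 4096, 12) ==
	       (((8 << 20) + 4096) >> 12));
	return 0;
}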
+
+struct btrfs_dio_private {
+ struct inode *inode;
+ u64 logical_offset;
+ u64 disk_bytenr;
+ u64 bytes;
+ u32 *csums;
+ void *private;
+};
+
+static void btrfs_endio_direct_read(struct bio *bio, int err)
+{
+ struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
+ struct bio_vec *bvec = bio->bi_io_vec;
+ struct btrfs_dio_private *dip = bio->bi_private;
+ struct inode *inode = dip->inode;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ u64 start;
+ u32 *private = dip->csums;
+
+ start = dip->logical_offset;
+ do {
+ if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
+ struct page *page = bvec->bv_page;
+ char *kaddr;
+ u32 csum = ~(u32)0;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ kaddr = kmap_atomic(page, KM_IRQ0);
+ csum = btrfs_csum_data(root, kaddr + bvec->bv_offset,
+ csum, bvec->bv_len);
+ btrfs_csum_final(csum, (char *)&csum);
+ kunmap_atomic(kaddr, KM_IRQ0);
+ local_irq_restore(flags);
+
+ flush_dcache_page(bvec->bv_page);
+ if (csum != *private) {
+ printk(KERN_ERR "btrfs csum failed ino %lu off"
+ " %llu csum %u private %u\n",
+ inode->i_ino, (unsigned long long)start,
+ csum, *private);
+ err = -EIO;
+ }
+ }
+
+ start += bvec->bv_len;
+ private++;
+ bvec++;
+ } while (bvec <= bvec_end);
+
+ unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
+ dip->logical_offset + dip->bytes - 1, GFP_NOFS);
+ bio->bi_private = dip->private;
+
+ kfree(dip->csums);
+ kfree(dip);
+ dio_end_io(bio, err);
+}
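/*
 * A user-space sketch of the verification walk in btrfs_endio_direct_read()
 * above: one precomputed checksum per segment, with the expected-checksum
 * pointer advanced in lockstep with the data. sum_bytes() is a hypothetical
 * stand-in for btrfs_csum_data(), not the real algorithm.
 */
#include <stdint.h>
#include <stdio.h>

static uint32_t sum_bytes(const unsigned char *buf, size_t len)
{
	uint32_t csum = 0;

	while (len--)
		csum += *buf++;		/* toy checksum, for shape only */
	return csum;
}

static int verify(const unsigned char *data, const size_t *seg_len,
		  const uint32_t *expected, int nr_segs)
{
	int err = 0;

	for (int i = 0; i < nr_segs; i++) {
		if (sum_bytes(data, seg_len[i]) != expected[i]) {
			fprintf(stderr, "csum mismatch in segment %d\n", i);
			err = -1;	/* keep walking, report at the end */
		}
		data += seg_len[i];
	}
	return err;
}

int main(void)
{
	const unsigned char data[] = "abcdef";
	const size_t seg_len[] = { 3, 3 };
	const uint32_t expected[] = { 'a' + 'b' + 'c', 'd' + 'e' + 'f' };

	return verify(data, seg_len, expected, 2) ? 1 : 0;
}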
+
+static void btrfs_endio_direct_write(struct bio *bio, int err)
+{
+ struct btrfs_dio_private *dip = bio->bi_private;
+ struct inode *inode = dip->inode;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_ordered_extent *ordered = NULL;
+ struct extent_state *cached_state = NULL;
+ int ret;
+
+ if (err)
+ goto out_done;
+
+ ret = btrfs_dec_test_ordered_pending(inode, &ordered,
+ dip->logical_offset, dip->bytes);
+ if (!ret)
+ goto out_done;
+
+ BUG_ON(!ordered);
+
+ trans = btrfs_join_transaction(root, 1);
+ if (!trans) {
+ err = -ENOMEM;
+ goto out;
+ }
+ trans->block_rsv = &root->fs_info->delalloc_block_rsv;
+
+ if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
+ ret = btrfs_ordered_update_i_size(inode, 0, ordered);
+ if (!ret)
+ ret = btrfs_update_inode(trans, root, inode);
+ err = ret;
+ goto out;
+ }
+
+ lock_extent_bits(&BTRFS_I(inode)->io_tree, ordered->file_offset,
+ ordered->file_offset + ordered->len - 1, 0,
+ &cached_state, GFP_NOFS);
+
+ if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) {
+ ret = btrfs_mark_extent_written(trans, inode,
+ ordered->file_offset,
+ ordered->file_offset +
+ ordered->len);
+ if (ret) {
+ err = ret;
+ goto out_unlock;
+ }
+ } else {
+ ret = insert_reserved_file_extent(trans, inode,
+ ordered->file_offset,
+ ordered->start,
+ ordered->disk_len,
+ ordered->len,
+ ordered->len,
+ 0, 0, 0,
+ BTRFS_FILE_EXTENT_REG);
+ unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
+ ordered->file_offset, ordered->len);
+ if (ret) {
+ err = ret;
+ WARN_ON(1);
+ goto out_unlock;
+ }
+ }
+
+ add_pending_csums(trans, inode, ordered->file_offset, &ordered->list);
+ btrfs_ordered_update_i_size(inode, 0, ordered);
+ btrfs_update_inode(trans, root, inode);
+out_unlock:
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset,
+ ordered->file_offset + ordered->len - 1,
+ &cached_state, GFP_NOFS);
+out:
+ btrfs_delalloc_release_metadata(inode, ordered->len);
+ btrfs_end_transaction(trans, root);
+ btrfs_put_ordered_extent(ordered);
+ btrfs_put_ordered_extent(ordered);
+out_done:
+ bio->bi_private = dip->private;
+
+ kfree(dip->csums);
+ kfree(dip);
+ dio_end_io(bio, err);
+}
+
+static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw,
+ struct bio *bio, int mirror_num,
+ unsigned long bio_flags, u64 offset)
+{
+ int ret;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ ret = btrfs_csum_one_bio(root, inode, bio, offset, 1);
+ BUG_ON(ret);
+ return 0;
+}
+
+static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
+ loff_t file_offset)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_dio_private *dip;
+ struct bio_vec *bvec = bio->bi_io_vec;
+ u64 start;
+ int skip_sum;
+ int write = rw & (1 << BIO_RW);
+ int ret = 0;
+
+ skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
+
+ dip = kmalloc(sizeof(*dip), GFP_NOFS);
+ if (!dip) {
+ ret = -ENOMEM;
+ goto free_ordered;
+ }
+ dip->csums = NULL;
+
+ if (!skip_sum) {
+ dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS);
+ if (!dip->csums) {
+ ret = -ENOMEM;
+ goto free_ordered;
+ }
+ }
+
+ dip->private = bio->bi_private;
+ dip->inode = inode;
+ dip->logical_offset = file_offset;
+
+ start = dip->logical_offset;
+ dip->bytes = 0;
+ do {
+ dip->bytes += bvec->bv_len;
+ bvec++;
+ } while (bvec <= (bio->bi_io_vec + bio->bi_vcnt - 1));
+
+ dip->disk_bytenr = (u64)bio->bi_sector << 9;
+ bio->bi_private = dip;
+
+ if (write)
+ bio->bi_end_io = btrfs_endio_direct_write;
+ else
+ bio->bi_end_io = btrfs_endio_direct_read;
+
+ ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+ if (ret)
+ goto out_err;
+
+ if (write && !skip_sum) {
+ ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
+ inode, rw, bio, 0, 0,
+ dip->logical_offset,
+ __btrfs_submit_bio_start_direct_io,
+ __btrfs_submit_bio_done);
+ if (ret)
+ goto out_err;
+ return;
+ } else if (!skip_sum)
+ btrfs_lookup_bio_sums_dio(root, inode, bio,
+ dip->logical_offset, dip->csums);
+
+ ret = btrfs_map_bio(root, rw, bio, 0, 1);
+ if (ret)
+ goto out_err;
+ return;
+out_err:
+ kfree(dip->csums);
+ kfree(dip);
+free_ordered:
+ /*
+ * If this is a write, we need to clean up the reserved space and kill
+ * the ordered extent.
+ */
+ if (write) {
+ struct btrfs_ordered_extent *ordered;
+ ordered = btrfs_lookup_ordered_extent(inode,
+ dip->logical_offset);
+ if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) &&
+ !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
+ btrfs_free_reserved_extent(root, ordered->start,
+ ordered->disk_len);
+ btrfs_put_ordered_extent(ordered);
+ btrfs_put_ordered_extent(ordered);
+ }
+ bio_endio(bio, ret);
+}
+
+static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb,
+ const struct iovec *iov, loff_t offset,
+ unsigned long nr_segs)
+{
+ int seg;
+ size_t size;
+ unsigned long addr;
+ unsigned blocksize_mask = root->sectorsize - 1;
+ ssize_t retval = -EINVAL;
+ loff_t end = offset;
+
+ if (offset & blocksize_mask)
+ goto out;
+
+ /* Check the memory alignment. Blocks cannot straddle pages */
+ for (seg = 0; seg < nr_segs; seg++) {
+ addr = (unsigned long)iov[seg].iov_base;
+ size = iov[seg].iov_len;
+ end += size;
+ if ((addr & blocksize_mask) || (size & blocksize_mask))
+ goto out;
+ }
+ retval = 0;
+out:
+ return retval;
+}
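/*
 * A minimal sketch of the alignment test used in check_direct_IO() above:
 * with a power-of-two sector size, "value & (sectorsize - 1)" is the
 * remainder, so a nonzero result means the offset or an iovec element is
 * misaligned. Values here are illustrative.
 */
#include <assert.h>
#include <stdint.h>

static int dio_aligned(uint64_t value, unsigned int sectorsize)
{
	return (value & (uint64_t)(sectorsize - 1)) == 0;
}

int main(void)
{
	assert(dio_aligned(8192, 4096));	/* two whole sectors */
	assert(!dio_aligned(4608, 4096));	/* 512 bytes into a sector */
	return 0;
}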
static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
const struct iovec *iov, loff_t offset,
unsigned long nr_segs)
{
- return -EINVAL;
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file->f_mapping->host;
+ struct btrfs_ordered_extent *ordered;
+ struct extent_state *cached_state = NULL;
+ u64 lockstart, lockend;
+ ssize_t ret;
+ int writing = rw & WRITE;
+ int write_bits = 0;
+ size_t count = iov_length(iov, nr_segs);
+
+ if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov,
+ offset, nr_segs)) {
+ return 0;
+ }
+
+ lockstart = offset;
+ lockend = offset + count - 1;
+
+ if (writing) {
+ ret = btrfs_delalloc_reserve_space(inode, count);
+ if (ret)
+ goto out;
+ }
+
+ while (1) {
+ lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ 0, &cached_state, GFP_NOFS);
+ /*
+ * We're concerned with the entire range that we're going to be
+		 * doing DIO to, so we need to make sure there are no ordered
+ * extents in this range.
+ */
+ ordered = btrfs_lookup_ordered_range(inode, lockstart,
+ lockend - lockstart + 1);
+ if (!ordered)
+ break;
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ &cached_state, GFP_NOFS);
+ btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_put_ordered_extent(ordered);
+ cond_resched();
+ }
+
+ /*
+ * we don't use btrfs_set_extent_delalloc because we don't want
+ * the dirty or uptodate bits
+ */
+ if (writing) {
+ write_bits = EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING;
+ ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ EXTENT_DELALLOC, 0, NULL, &cached_state,
+ GFP_NOFS);
+ if (ret) {
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
+ lockend, EXTENT_LOCKED | write_bits,
+ 1, 0, &cached_state, GFP_NOFS);
+ goto out;
+ }
+ }
+
+ free_extent_state(cached_state);
+ cached_state = NULL;
+
+ ret = __blockdev_direct_IO(rw, iocb, inode,
+ BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
+ iov, offset, nr_segs, btrfs_get_blocks_direct, NULL,
+ btrfs_submit_direct, 0);
+
+ if (ret < 0 && ret != -EIOCBQUEUED) {
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, offset,
+ offset + iov_length(iov, nr_segs) - 1,
+ EXTENT_LOCKED | write_bits, 1, 0,
+ &cached_state, GFP_NOFS);
+ } else if (ret >= 0 && ret < iov_length(iov, nr_segs)) {
+ /*
+ * We're falling back to buffered, unlock the section we didn't
+ * do IO on.
+ */
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, offset + ret,
+ offset + iov_length(iov, nr_segs) - 1,
+ EXTENT_LOCKED | write_bits, 1, 0,
+ &cached_state, GFP_NOFS);
+ }
+out:
+ free_extent_state(cached_state);
+ return ret;
}
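/*
 * The loop at the top of btrfs_direct_IO() locks the range, looks for a
 * conflicting ordered extent, and if one exists unlocks, waits for it to
 * finish, and retries. A single-threaded sketch of that shape; all of the
 * helpers here are hypothetical stand-ins for the extent-lock and
 * ordered-extent calls.
 */
#include <stdio.h>

static int pending_ordered = 2;	/* pretend two ordered extents exist */

static void lock_range(void)   { /* lock_extent_bits() would go here */ }
static void unlock_range(void) { /* unlock_extent_cached() would go here */ }

static int find_conflict(void)
{
	return pending_ordered > 0;
}

static void wait_for_conflict(void)
{
	pending_ordered--;	/* btrfs_start_ordered_extent() blocks here */
}

int main(void)
{
	for (;;) {
		lock_range();
		if (!find_conflict())
			break;			/* leave the range locked */
		unlock_range();
		wait_for_conflict();		/* then take the lock again */
	}
	printf("range locked with no ordered extents pending\n");
	return 0;
}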
static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
@@ -5034,7 +6003,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
u64 page_start;
u64 page_end;
- ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE);
+ ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
if (ret) {
if (ret == -ENOMEM)
ret = VM_FAULT_OOM;
@@ -5043,13 +6012,6 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
goto out;
}
- ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
- if (ret) {
- btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
- ret = VM_FAULT_SIGBUS;
- goto out;
- }
-
ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
again:
lock_page(page);
@@ -5059,7 +6021,6 @@ again:
if ((page->mapping != inode->i_mapping) ||
(page_start >= size)) {
- btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
/* page got truncated out from underneath us */
goto out_unlock;
}
@@ -5100,7 +6061,6 @@ again:
unlock_extent_cached(io_tree, page_start, page_end,
&cached_state, GFP_NOFS);
ret = VM_FAULT_SIGBUS;
- btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
goto out_unlock;
}
ret = 0;
@@ -5127,10 +6087,10 @@ again:
unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);
out_unlock:
- btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
if (!ret)
return VM_FAULT_LOCKED;
unlock_page(page);
+ btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
out:
return ret;
}
@@ -5155,8 +6115,10 @@ static void btrfs_truncate(struct inode *inode)
btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
- trans = btrfs_start_transaction(root, 1);
+ trans = btrfs_start_transaction(root, 0);
+ BUG_ON(IS_ERR(trans));
btrfs_set_trans_block_group(trans, inode);
+ trans->block_rsv = root->orphan_block_rsv;
/*
* setattr is responsible for setting the ordered_data_close flag,
@@ -5179,6 +6141,23 @@ static void btrfs_truncate(struct inode *inode)
btrfs_add_ordered_operation(trans, root, inode);
while (1) {
+ if (!trans) {
+ trans = btrfs_start_transaction(root, 0);
+ BUG_ON(IS_ERR(trans));
+ btrfs_set_trans_block_group(trans, inode);
+ trans->block_rsv = root->orphan_block_rsv;
+ }
+
+ ret = btrfs_block_rsv_check(trans, root,
+ root->orphan_block_rsv, 0, 5);
+ if (ret) {
+ BUG_ON(ret != -EAGAIN);
+ ret = btrfs_commit_transaction(trans, root);
+ BUG_ON(ret);
+ trans = NULL;
+ continue;
+ }
+
ret = btrfs_truncate_inode_items(trans, root, inode,
inode->i_size,
BTRFS_EXTENT_DATA_KEY);
@@ -5190,10 +6169,8 @@ static void btrfs_truncate(struct inode *inode)
nr = trans->blocks_used;
btrfs_end_transaction(trans, root);
+ trans = NULL;
btrfs_btree_balance_dirty(root, nr);
-
- trans = btrfs_start_transaction(root, 1);
- btrfs_set_trans_block_group(trans, inode);
}
if (ret == 0 && inode->i_nlink > 0) {
@@ -5254,21 +6231,47 @@ unsigned long btrfs_force_ra(struct address_space *mapping,
struct inode *btrfs_alloc_inode(struct super_block *sb)
{
struct btrfs_inode *ei;
+ struct inode *inode;
ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
if (!ei)
return NULL;
+
+ ei->root = NULL;
+ ei->space_info = NULL;
+ ei->generation = 0;
+ ei->sequence = 0;
ei->last_trans = 0;
ei->last_sub_trans = 0;
ei->logged_trans = 0;
- ei->outstanding_extents = 0;
- ei->reserved_extents = 0;
- ei->root = NULL;
+ ei->delalloc_bytes = 0;
+ ei->reserved_bytes = 0;
+ ei->disk_i_size = 0;
+ ei->flags = 0;
+ ei->index_cnt = (u64)-1;
+ ei->last_unlink_trans = 0;
+
spin_lock_init(&ei->accounting_lock);
+ atomic_set(&ei->outstanding_extents, 0);
+ ei->reserved_extents = 0;
+
+ ei->ordered_data_close = 0;
+ ei->orphan_meta_reserved = 0;
+ ei->dummy_inode = 0;
+ ei->force_compress = 0;
+
+ inode = &ei->vfs_inode;
+ extent_map_tree_init(&ei->extent_tree, GFP_NOFS);
+ extent_io_tree_init(&ei->io_tree, &inode->i_data, GFP_NOFS);
+ extent_io_tree_init(&ei->io_failure_tree, &inode->i_data, GFP_NOFS);
+ mutex_init(&ei->log_mutex);
btrfs_ordered_inode_tree_init(&ei->ordered_tree);
INIT_LIST_HEAD(&ei->i_orphan);
+ INIT_LIST_HEAD(&ei->delalloc_inodes);
INIT_LIST_HEAD(&ei->ordered_operations);
- return &ei->vfs_inode;
+ RB_CLEAR_NODE(&ei->rb_node);
+
+ return inode;
}
void btrfs_destroy_inode(struct inode *inode)
@@ -5278,6 +6281,8 @@ void btrfs_destroy_inode(struct inode *inode)
WARN_ON(!list_empty(&inode->i_dentry));
WARN_ON(inode->i_data.nrpages);
+ WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents));
+ WARN_ON(BTRFS_I(inode)->reserved_extents);
/*
* This can happen where we create an inode, but somebody else also
@@ -5298,13 +6303,13 @@ void btrfs_destroy_inode(struct inode *inode)
spin_unlock(&root->fs_info->ordered_extent_lock);
}
- spin_lock(&root->list_lock);
+ spin_lock(&root->orphan_lock);
if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
printk(KERN_INFO "BTRFS: inode %lu still on the orphan list\n",
inode->i_ino);
list_del_init(&BTRFS_I(inode)->i_orphan);
}
- spin_unlock(&root->list_lock);
+ spin_unlock(&root->orphan_lock);
while (1) {
ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
@@ -5425,19 +6430,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (S_ISDIR(old_inode->i_mode) && new_inode &&
new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
return -ENOTEMPTY;
-
- /*
- * We want to reserve the absolute worst case amount of items. So if
- * both inodes are subvols and we need to unlink them then that would
- * require 4 item modifications, but if they are both normal inodes it
- * would require 5 item modifications, so we'll assume their normal
- * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items
- * should cover the worst case number of items we'll modify.
- */
- ret = btrfs_reserve_metadata_space(root, 11);
- if (ret)
- return ret;
-
/*
* we're using rename to replace one file with another.
* and the replacement file is large. Start IO on it now so
@@ -5450,8 +6442,18 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
/* close the racy window with snapshot create/destroy ioctl */
if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
down_read(&root->fs_info->subvol_sem);
+ /*
+ * We want to reserve the absolute worst case amount of items. So if
+ * both inodes are subvols and we need to unlink them then that would
+ * require 4 item modifications, but if they are both normal inodes it
+	 * would require 5 item modifications, so we'll assume they're normal
+ * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items
+ * should cover the worst case number of items we'll modify.
+ */
+ trans = btrfs_start_transaction(root, 20);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
- trans = btrfs_start_transaction(root, 1);
btrfs_set_trans_block_group(trans, new_dir);
if (dest != root)
@@ -5550,7 +6552,6 @@ out_fail:
if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
up_read(&root->fs_info->subvol_sem);
- btrfs_unreserve_metadata_space(root, 11);
return ret;
}
@@ -5602,6 +6603,38 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
return 0;
}
+int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput)
+{
+ struct btrfs_inode *binode;
+ struct inode *inode = NULL;
+
+ spin_lock(&root->fs_info->delalloc_lock);
+ while (!list_empty(&root->fs_info->delalloc_inodes)) {
+ binode = list_entry(root->fs_info->delalloc_inodes.next,
+ struct btrfs_inode, delalloc_inodes);
+ inode = igrab(&binode->vfs_inode);
+ if (inode) {
+ list_move_tail(&binode->delalloc_inodes,
+ &root->fs_info->delalloc_inodes);
+ break;
+ }
+
+ list_del_init(&binode->delalloc_inodes);
+ cond_resched_lock(&root->fs_info->delalloc_lock);
+ }
+ spin_unlock(&root->fs_info->delalloc_lock);
+
+ if (inode) {
+ write_inode_now(inode, 0);
+ if (delay_iput)
+ btrfs_add_delayed_iput(inode);
+ else
+ iput(inode);
+ return 1;
+ }
+ return 0;
+}
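/*
 * btrfs_start_one_delalloc_inode() above pops a single entry off a shared
 * list under a spinlock, pins it, and does the writeback with the lock
 * dropped. The same shape in user space, with a pthread mutex standing in
 * for the spinlock and an array for the list:
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static int inodes[] = { 101, 102, 103 };
static int nr_inodes = 3;

static int take_one(void)
{
	int ino = -1;

	pthread_mutex_lock(&list_lock);
	if (nr_inodes > 0)
		ino = inodes[--nr_inodes];	/* "igrab" one entry */
	pthread_mutex_unlock(&list_lock);
	return ino;
}

int main(void)
{
	int ino;

	while ((ino = take_one()) != -1)	/* flush outside the lock */
		printf("flushing inode %d\n", ino);
	return 0;
}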
+
static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
const char *symname)
{
@@ -5625,26 +6658,20 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
return -ENAMETOOLONG;
+ err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
+ if (err)
+ return err;
/*
* 2 items for inode item and ref
* 2 items for dir items
* 1 item for xattr if selinux is on
*/
- err = btrfs_reserve_metadata_space(root, 5);
- if (err)
- return err;
+ trans = btrfs_start_transaction(root, 5);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
- trans = btrfs_start_transaction(root, 1);
- if (!trans)
- goto out_fail;
btrfs_set_trans_block_group(trans, dir);
- err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
- if (err) {
- err = -ENOSPC;
- goto out_unlock;
- }
-
inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
dentry->d_name.len,
dentry->d_parent->d_inode->i_ino, objectid,
@@ -5716,8 +6743,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
out_unlock:
nr = trans->blocks_used;
btrfs_end_transaction_throttle(trans, root);
-out_fail:
- btrfs_unreserve_metadata_space(root, 5);
if (drop_inode) {
inode_dec_link_count(inode);
iput(inode);
@@ -5726,33 +6751,28 @@ out_fail:
return err;
}
-static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
- u64 alloc_hint, int mode, loff_t actual_len)
+int btrfs_prealloc_file_range(struct inode *inode, int mode,
+ u64 start, u64 num_bytes, u64 min_size,
+ loff_t actual_len, u64 *alloc_hint)
{
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_key ins;
u64 cur_offset = start;
- u64 num_bytes = end - start;
int ret = 0;
- u64 i_size;
while (num_bytes > 0) {
- trans = btrfs_start_transaction(root, 1);
-
- ret = btrfs_reserve_extent(trans, root, num_bytes,
- root->sectorsize, 0, alloc_hint,
- (u64)-1, &ins, 1);
- if (ret) {
- WARN_ON(1);
- goto stop_trans;
+ trans = btrfs_start_transaction(root, 3);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ break;
}
- ret = btrfs_reserve_metadata_space(root, 3);
+ ret = btrfs_reserve_extent(trans, root, num_bytes, min_size,
+ 0, *alloc_hint, (u64)-1, &ins, 1);
if (ret) {
- btrfs_free_reserved_extent(root, ins.objectid,
- ins.offset);
- goto stop_trans;
+ btrfs_end_transaction(trans, root);
+ break;
}
ret = insert_reserved_file_extent(trans, inode,
@@ -5766,34 +6786,27 @@ static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
num_bytes -= ins.offset;
cur_offset += ins.offset;
- alloc_hint = ins.objectid + ins.offset;
+ *alloc_hint = ins.objectid + ins.offset;
inode->i_ctime = CURRENT_TIME;
BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
if (!(mode & FALLOC_FL_KEEP_SIZE) &&
- (actual_len > inode->i_size) &&
- (cur_offset > inode->i_size)) {
-
+ (actual_len > inode->i_size) &&
+ (cur_offset > inode->i_size)) {
if (cur_offset > actual_len)
- i_size = actual_len;
+ i_size_write(inode, actual_len);
else
- i_size = cur_offset;
- i_size_write(inode, i_size);
- btrfs_ordered_update_i_size(inode, i_size, NULL);
+				i_size_write(inode, cur_offset);
+
+			btrfs_ordered_update_i_size(inode, cur_offset, NULL);
}
ret = btrfs_update_inode(trans, root, inode);
BUG_ON(ret);
btrfs_end_transaction(trans, root);
- btrfs_unreserve_metadata_space(root, 3);
}
return ret;
-
-stop_trans:
- btrfs_end_transaction(trans, root);
- return ret;
-
}
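/*
 * btrfs_prealloc_file_range() above allocates in chunks until num_bytes is
 * covered, feeding each allocation's end back in as the next hint. A
 * user-space sketch of that loop; reserve() is a hypothetical stand-in for
 * btrfs_reserve_extent() and just pretends every allocation lands at the
 * hint.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t reserve(uint64_t want, uint64_t hint, uint64_t *got)
{
	*got = want > (1 << 20) ? (1 << 20) : want;	/* cap at 1MiB */
	return hint;					/* allocation start */
}

int main(void)
{
	uint64_t num_bytes = 5 << 20, cur_offset = 0, alloc_hint = 0, got;

	while (num_bytes > 0) {
		uint64_t objectid = reserve(num_bytes, alloc_hint, &got);

		/* a file extent for [cur_offset, cur_offset + got) goes here */
		printf("extent: file %llu -> disk %llu, %llu bytes\n",
		       (unsigned long long)cur_offset,
		       (unsigned long long)objectid,
		       (unsigned long long)got);
		num_bytes -= got;
		cur_offset += got;
		alloc_hint = objectid + got;
	}
	return 0;
}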
static long btrfs_fallocate(struct inode *inode, int mode,
@@ -5826,8 +6839,7 @@ static long btrfs_fallocate(struct inode *inode, int mode,
goto out;
}
- ret = btrfs_check_data_free_space(BTRFS_I(inode)->root, inode,
- alloc_end - alloc_start);
+ ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
if (ret)
goto out;
@@ -5872,16 +6884,16 @@ static long btrfs_fallocate(struct inode *inode, int mode,
if (em->block_start == EXTENT_MAP_HOLE ||
(cur_offset >= inode->i_size &&
!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
- ret = prealloc_file_range(inode,
- cur_offset, last_byte,
- alloc_hint, mode, offset+len);
+ ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
+ last_byte - cur_offset,
+ 1 << inode->i_blkbits,
+ offset + len,
+ &alloc_hint);
if (ret < 0) {
free_extent_map(em);
break;
}
}
- if (em->block_start <= EXTENT_MAP_LAST_BYTE)
- alloc_hint = em->block_start;
free_extent_map(em);
cur_offset = last_byte;
@@ -5893,8 +6905,7 @@ static long btrfs_fallocate(struct inode *inode, int mode,
unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
&cached_state, GFP_NOFS);
- btrfs_free_reserved_data_space(BTRFS_I(inode)->root, inode,
- alloc_end - alloc_start);
+ btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
out:
mutex_unlock(&inode->i_mutex);
return ret;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 97a97839a867..9254b3d58dbe 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -239,23 +239,19 @@ static noinline int create_subvol(struct btrfs_root *root,
u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
u64 index = 0;
+ ret = btrfs_find_free_objectid(NULL, root->fs_info->tree_root,
+ 0, &objectid);
+ if (ret)
+ return ret;
/*
* 1 - inode item
* 2 - refs
* 1 - root item
* 2 - dir items
*/
- ret = btrfs_reserve_metadata_space(root, 6);
- if (ret)
- return ret;
-
- trans = btrfs_start_transaction(root, 1);
- BUG_ON(!trans);
-
- ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root,
- 0, &objectid);
- if (ret)
- goto fail;
+ trans = btrfs_start_transaction(root, 6);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
0, objectid, NULL, 0, 0, 0);
@@ -345,13 +341,10 @@ fail:
err = btrfs_commit_transaction(trans, root);
if (err && !ret)
ret = err;
-
- btrfs_unreserve_metadata_space(root, 6);
return ret;
}
-static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
- char *name, int namelen)
+static int create_snapshot(struct btrfs_root *root, struct dentry *dentry)
{
struct inode *inode;
struct btrfs_pending_snapshot *pending_snapshot;
@@ -361,40 +354,33 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
if (!root->ref_cows)
return -EINVAL;
- /*
- * 1 - inode item
- * 2 - refs
- * 1 - root item
- * 2 - dir items
- */
- ret = btrfs_reserve_metadata_space(root, 6);
- if (ret)
- goto fail;
-
pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
- if (!pending_snapshot) {
- ret = -ENOMEM;
- btrfs_unreserve_metadata_space(root, 6);
- goto fail;
- }
- pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS);
- if (!pending_snapshot->name) {
- ret = -ENOMEM;
- kfree(pending_snapshot);
- btrfs_unreserve_metadata_space(root, 6);
- goto fail;
- }
- memcpy(pending_snapshot->name, name, namelen);
- pending_snapshot->name[namelen] = '\0';
+ if (!pending_snapshot)
+ return -ENOMEM;
+
+ btrfs_init_block_rsv(&pending_snapshot->block_rsv);
pending_snapshot->dentry = dentry;
- trans = btrfs_start_transaction(root, 1);
- BUG_ON(!trans);
pending_snapshot->root = root;
+
+ trans = btrfs_start_transaction(root->fs_info->extent_root, 5);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto fail;
+ }
+
+ ret = btrfs_snap_reserve_metadata(trans, pending_snapshot);
+ BUG_ON(ret);
+
list_add(&pending_snapshot->list,
&trans->transaction->pending_snapshots);
- ret = btrfs_commit_transaction(trans, root);
+ ret = btrfs_commit_transaction(trans, root->fs_info->extent_root);
BUG_ON(ret);
- btrfs_unreserve_metadata_space(root, 6);
+
+ ret = pending_snapshot->error;
+ if (ret)
+ goto fail;
+
+ btrfs_orphan_cleanup(pending_snapshot->snap);
inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry);
if (IS_ERR(inode)) {
@@ -405,6 +391,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
d_instantiate(dentry, inode);
ret = 0;
fail:
+ kfree(pending_snapshot);
return ret;
}
@@ -456,8 +443,7 @@ static noinline int btrfs_mksubvol(struct path *parent,
goto out_up_read;
if (snap_src) {
- error = create_snapshot(snap_src, dentry,
- name, namelen);
+ error = create_snapshot(snap_src, dentry);
} else {
error = create_subvol(BTRFS_I(dir)->root, dentry,
name, namelen);
@@ -601,19 +587,9 @@ static int btrfs_defrag_file(struct file *file,
if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
BTRFS_I(inode)->force_compress = 1;
- ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE);
- if (ret) {
- ret = -ENOSPC;
- break;
- }
-
- ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
- if (ret) {
- btrfs_free_reserved_data_space(root, inode,
- PAGE_CACHE_SIZE);
- ret = -ENOSPC;
- break;
- }
+ ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
+ if (ret)
+ goto err_unlock;
again:
if (inode->i_size == 0 ||
i > ((inode->i_size - 1) >> PAGE_CACHE_SHIFT)) {
@@ -622,8 +598,10 @@ again:
}
page = grab_cache_page(inode->i_mapping, i);
- if (!page)
+ if (!page) {
+ ret = -ENOMEM;
goto err_reservations;
+ }
if (!PageUptodate(page)) {
btrfs_readpage(NULL, page);
@@ -631,6 +609,7 @@ again:
if (!PageUptodate(page)) {
unlock_page(page);
page_cache_release(page);
+ ret = -EIO;
goto err_reservations;
}
}
@@ -644,8 +623,7 @@ again:
wait_on_page_writeback(page);
if (PageDirty(page)) {
- btrfs_free_reserved_data_space(root, inode,
- PAGE_CACHE_SIZE);
+ btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
goto loop_unlock;
}
@@ -683,7 +661,6 @@ loop_unlock:
page_cache_release(page);
mutex_unlock(&inode->i_mutex);
- btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1);
i++;
}
@@ -713,9 +690,9 @@ loop_unlock:
return 0;
err_reservations:
+ btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
+err_unlock:
mutex_unlock(&inode->i_mutex);
- btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
- btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
return ret;
}
@@ -811,7 +788,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
device->name, (unsigned long long)new_size);
if (new_size > old_size) {
- trans = btrfs_start_transaction(root, 1);
+ trans = btrfs_start_transaction(root, 0);
ret = btrfs_grow_device(trans, device, new_size);
btrfs_commit_transaction(trans, root);
} else {
@@ -1300,7 +1277,13 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
if (err)
goto out_up_write;
- trans = btrfs_start_transaction(root, 1);
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ goto out_up_write;
+ }
+ trans->block_rsv = &root->fs_info->global_block_rsv;
+
ret = btrfs_unlink_subvol(trans, root, dir,
dest->root_key.objectid,
dentry->d_name.name,
@@ -1314,10 +1297,12 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
dest->root_item.drop_level = 0;
btrfs_set_root_refs(&dest->root_item, 0);
- ret = btrfs_insert_orphan_item(trans,
- root->fs_info->tree_root,
- dest->root_key.objectid);
- BUG_ON(ret);
+ if (!xchg(&dest->orphan_item_inserted, 1)) {
+ ret = btrfs_insert_orphan_item(trans,
+ root->fs_info->tree_root,
+ dest->root_key.objectid);
+ BUG_ON(ret);
+ }
ret = btrfs_commit_transaction(trans, root);
BUG_ON(ret);
@@ -1358,8 +1343,10 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
ret = -EPERM;
goto out;
}
- btrfs_defrag_root(root, 0);
- btrfs_defrag_root(root->fs_info->extent_root, 0);
+ ret = btrfs_defrag_root(root, 0);
+ if (ret)
+ goto out;
+ ret = btrfs_defrag_root(root->fs_info->extent_root, 0);
break;
case S_IFREG:
if (!(file->f_mode & FMODE_WRITE)) {
@@ -1389,9 +1376,11 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
/* the rest are all set to zero by kzalloc */
range->len = (u64)-1;
}
- btrfs_defrag_file(file, range);
+ ret = btrfs_defrag_file(file, range);
kfree(range);
break;
+ default:
+ ret = -EINVAL;
}
out:
mnt_drop_write(file->f_path.mnt);
@@ -1469,7 +1458,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
*/
/* the destination must be opened for writing */
- if (!(file->f_mode & FMODE_WRITE))
+ if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND))
return -EINVAL;
ret = mnt_want_write(file->f_path.mnt);
@@ -1522,7 +1511,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
/* determine range to clone */
ret = -EINVAL;
- if (off >= src->i_size || off + len > src->i_size)
+ if (off + len > src->i_size || off + len < off)
goto out_unlock;
if (len == 0)
olen = len = src->i_size - off;
@@ -1550,12 +1539,6 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
btrfs_wait_ordered_range(src, off, off+len);
}
- trans = btrfs_start_transaction(root, 1);
- BUG_ON(!trans);
-
- /* punch hole in destination first */
- btrfs_drop_extents(trans, inode, off, off + len, &hint_byte, 1);
-
/* clone data */
key.objectid = src->i_ino;
key.type = BTRFS_EXTENT_DATA_KEY;
@@ -1566,7 +1549,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
* note the key will change type as we walk through the
* tree.
*/
- ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto out;
@@ -1595,6 +1578,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
u64 disko = 0, diskl = 0;
u64 datao = 0, datal = 0;
u8 comp;
+ u64 endoff;
size = btrfs_item_size_nr(leaf, slot);
read_extent_buffer(leaf, buf,
@@ -1629,12 +1613,31 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
new_key.objectid = inode->i_ino;
new_key.offset = key.offset + destoff - off;
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+
if (type == BTRFS_FILE_EXTENT_REG ||
type == BTRFS_FILE_EXTENT_PREALLOC) {
+ if (off > key.offset) {
+ datao += off - key.offset;
+ datal -= off - key.offset;
+ }
+
+ if (key.offset + datal > off + len)
+ datal = off + len - key.offset;
+
+ ret = btrfs_drop_extents(trans, inode,
+ new_key.offset,
+ new_key.offset + datal,
+ &hint_byte, 1);
+ BUG_ON(ret);
+
ret = btrfs_insert_empty_item(trans, root, path,
&new_key, size);
- if (ret)
- goto out;
+ BUG_ON(ret);
leaf = path->nodes[0];
slot = path->slots[0];
@@ -1645,14 +1648,6 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
extent = btrfs_item_ptr(leaf, slot,
struct btrfs_file_extent_item);
- if (off > key.offset) {
- datao += off - key.offset;
- datal -= off - key.offset;
- }
-
- if (key.offset + datal > off + len)
- datal = off + len - key.offset;
-
/* disko == 0 means it's a hole */
if (!disko)
datao = 0;
@@ -1683,14 +1678,21 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
if (comp && (skip || trim)) {
ret = -EINVAL;
+ btrfs_end_transaction(trans, root);
goto out;
}
size -= skip + trim;
datal -= skip + trim;
+
+ ret = btrfs_drop_extents(trans, inode,
+ new_key.offset,
+ new_key.offset + datal,
+ &hint_byte, 1);
+ BUG_ON(ret);
+
ret = btrfs_insert_empty_item(trans, root, path,
&new_key, size);
- if (ret)
- goto out;
+ BUG_ON(ret);
if (skip) {
u32 start =
@@ -1708,8 +1710,26 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
}
btrfs_mark_buffer_dirty(leaf);
- }
+ btrfs_release_path(root, path);
+
+ inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ /*
+ * we round up to the block size at eof when
+ * determining which extents to clone above,
+ * but shouldn't round up the file size
+ */
+ endoff = new_key.offset + datal;
+ if (endoff > off+olen)
+ endoff = off+olen;
+ if (endoff > inode->i_size)
+ btrfs_i_size_write(inode, endoff);
+
+ BTRFS_I(inode)->flags = BTRFS_I(src)->flags;
+ ret = btrfs_update_inode(trans, root, inode);
+ BUG_ON(ret);
+ btrfs_end_transaction(trans, root);
+ }
next:
btrfs_release_path(root, path);
key.offset++;
@@ -1717,17 +1737,7 @@ next:
ret = 0;
out:
btrfs_release_path(root, path);
- if (ret == 0) {
- inode->i_mtime = inode->i_ctime = CURRENT_TIME;
- if (destoff + olen > inode->i_size)
- btrfs_i_size_write(inode, destoff + olen);
- BTRFS_I(inode)->flags = BTRFS_I(src)->flags;
- ret = btrfs_update_inode(trans, root, inode);
- }
- btrfs_end_transaction(trans, root);
unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
- if (ret)
- vmtruncate(inode, 0);
out_unlock:
mutex_unlock(&src->i_mutex);
mutex_unlock(&inode->i_mutex);
@@ -1845,7 +1855,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
dir_id = btrfs_super_root_dir(&root->fs_info->super_copy);
di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path,
dir_id, "default", 7, 1);
- if (!di) {
+ if (IS_ERR_OR_NULL(di)) {
btrfs_free_path(path);
btrfs_end_transaction(trans, root);
printk(KERN_ERR "Umm, you don't have the default dir item, "
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index a127c0ebb2dc..e56c72bc5add 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -124,6 +124,15 @@ static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)
return 1;
}
+static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset,
+ u64 len)
+{
+ if (file_offset + len <= entry->file_offset ||
+ entry->file_offset + entry->len <= file_offset)
+ return 0;
+ return 1;
+}
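/*
 * A quick user-space check of the overlap predicate added above: two
 * half-open ranges [a, a + alen) and [b, b + blen) are disjoint exactly
 * when one ends at or before the other starts.
 */
#include <assert.h>
#include <stdint.h>

static int ranges_overlap(uint64_t a, uint64_t alen, uint64_t b, uint64_t blen)
{
	if (a + alen <= b || b + blen <= a)
		return 0;
	return 1;
}

int main(void)
{
	assert(!ranges_overlap(0, 4096, 4096, 4096));	/* touching only */
	assert(ranges_overlap(0, 8192, 4096, 4096));	/* nested */
	assert(ranges_overlap(4095, 2, 4096, 1));	/* one byte shared */
	return 0;
}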
+
/*
 * find the first ordered struct that has this offset, otherwise
* the first one less than this offset
@@ -161,8 +170,9 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
* The tree is given a single reference on the ordered extent that was
* inserted.
*/
-int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
- u64 start, u64 len, u64 disk_len, int type)
+static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
+ u64 start, u64 len, u64 disk_len,
+ int type, int dio)
{
struct btrfs_ordered_inode_tree *tree;
struct rb_node *node;
@@ -182,6 +192,9 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
set_bit(type, &entry->flags);
+ if (dio)
+ set_bit(BTRFS_ORDERED_DIRECT, &entry->flags);
+
/* one ref for the tree */
atomic_set(&entry->refs, 1);
init_waitqueue_head(&entry->wait);
@@ -203,6 +216,20 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
return 0;
}
+int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
+ u64 start, u64 len, u64 disk_len, int type)
+{
+ return __btrfs_add_ordered_extent(inode, file_offset, start, len,
+ disk_len, type, 0);
+}
+
+int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
+ u64 start, u64 len, u64 disk_len, int type)
+{
+ return __btrfs_add_ordered_extent(inode, file_offset, start, len,
+ disk_len, type, 1);
+}
+
/*
* Add a struct btrfs_ordered_sum into the list of checksums to be inserted
* when an ordered extent is finished. If the list covers more than one
@@ -311,13 +338,6 @@ static int __btrfs_remove_ordered_extent(struct inode *inode,
tree->last = NULL;
set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
- spin_lock(&BTRFS_I(inode)->accounting_lock);
- WARN_ON(!BTRFS_I(inode)->outstanding_extents);
- BTRFS_I(inode)->outstanding_extents--;
- spin_unlock(&BTRFS_I(inode)->accounting_lock);
- btrfs_unreserve_metadata_for_delalloc(BTRFS_I(inode)->root,
- inode, 1);
-
spin_lock(&root->fs_info->ordered_extent_lock);
list_del_init(&entry->root_extent_list);
@@ -491,7 +511,8 @@ void btrfs_start_ordered_extent(struct inode *inode,
* start IO on any dirty ones so the wait doesn't stall waiting
* for pdflush to find them
*/
- filemap_fdatawrite_range(inode->i_mapping, start, end);
+ if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
+ filemap_fdatawrite_range(inode->i_mapping, start, end);
if (wait) {
wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
&entry->flags));
@@ -588,6 +609,47 @@ out:
return entry;
}
+/* Since the DIO code tries to lock a wide area, we need to look for any ordered
+ * extents that exist in the range, rather than just the start of the range.
+ */
+struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
+ u64 file_offset,
+ u64 len)
+{
+ struct btrfs_ordered_inode_tree *tree;
+ struct rb_node *node;
+ struct btrfs_ordered_extent *entry = NULL;
+
+ tree = &BTRFS_I(inode)->ordered_tree;
+ spin_lock(&tree->lock);
+ node = tree_search(tree, file_offset);
+ if (!node) {
+ node = tree_search(tree, file_offset + len);
+ if (!node)
+ goto out;
+ }
+
+ while (1) {
+ entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+ if (range_overlaps(entry, file_offset, len))
+ break;
+
+ if (entry->file_offset >= file_offset + len) {
+ entry = NULL;
+ break;
+ }
+ entry = NULL;
+ node = rb_next(node);
+ if (!node)
+ break;
+ }
+out:
+ if (entry)
+ atomic_inc(&entry->refs);
+ spin_unlock(&tree->lock);
+ return entry;
+}
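/*
 * btrfs_lookup_ordered_range() above starts at the first entry at or after
 * file_offset and walks forward until an entry overlaps the range or starts
 * past its end. A sketch of that walk over a sorted array (the kernel walks
 * an rbtree); names here are illustrative.
 */
#include <stddef.h>
#include <stdint.h>

struct entry { uint64_t file_offset, len; };

static const struct entry *
first_overlap(const struct entry *e, size_t n, uint64_t off, uint64_t len)
{
	for (size_t i = 0; i < n; i++) {	/* sorted by file_offset */
		if (e[i].file_offset >= off + len)
			break;			/* starts past the range */
		if (off + len > e[i].file_offset &&
		    e[i].file_offset + e[i].len > off)
			return &e[i];		/* overlaps the range */
	}
	return NULL;
}

int main(void)
{
	const struct entry tree[] = { { 0, 4096 }, { 8192, 4096 } };

	return first_overlap(tree, 2, 4096, 8192) == &tree[1] ? 0 : 1;
}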
+
/*
* lookup and return any extent before 'file_offset'. NULL is returned
* if none is found
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index c82f76a9f040..8ac365492a3f 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -72,6 +72,8 @@ struct btrfs_ordered_sum {
#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */
+#define BTRFS_ORDERED_DIRECT 5 /* set when we're doing DIO with this extent */
+
struct btrfs_ordered_extent {
/* logical offset in the file */
u64 file_offset;
@@ -140,7 +142,9 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
struct btrfs_ordered_extent **cached,
u64 file_offset, u64 io_size);
int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
- u64 start, u64 len, u64 disk_len, int tyep);
+ u64 start, u64 len, u64 disk_len, int type);
+int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
+ u64 start, u64 len, u64 disk_len, int type);
int btrfs_add_ordered_sum(struct inode *inode,
struct btrfs_ordered_extent *entry,
struct btrfs_ordered_sum *sum);
@@ -151,6 +155,9 @@ void btrfs_start_ordered_extent(struct inode *inode,
int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
struct btrfs_ordered_extent *
btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
+struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
+ u64 file_offset,
+ u64 len);
int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
struct btrfs_ordered_extent *ordered);
int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index e558dd941ded..b37d723b9d4a 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -44,8 +44,12 @@ struct tree_entry {
struct backref_node {
struct rb_node rb_node;
u64 bytenr;
- /* objectid tree block owner */
+
+ u64 new_bytenr;
+	/* objectid of tree block owner; may not be up to date */
u64 owner;
+ /* link to pending, changed or detached list */
+ struct list_head list;
	/* list of upper level blocks that reference this block */
struct list_head upper;
/* list of child blocks in the cache */
@@ -56,9 +60,9 @@ struct backref_node {
struct extent_buffer *eb;
/* level of tree block */
unsigned int level:8;
- /* 1 if the block is root of old snapshot */
- unsigned int old_root:1;
- /* 1 if no child blocks in the cache */
+	/* is the block in a non-reference-counted tree */
+ unsigned int cowonly:1;
+ /* 1 if no child node in the cache */
unsigned int lowest:1;
/* is the extent buffer locked */
unsigned int locked:1;
@@ -66,6 +70,16 @@ struct backref_node {
unsigned int processed:1;
/* have backrefs of this block been checked */
unsigned int checked:1;
+ /*
+ * 1 if corresponding block has been cowed but some upper
+ * level block pointers may not point to the new location
+ */
+ unsigned int pending:1;
+ /*
+ * 1 if the backref node isn't connected to any other
+ * backref node.
+ */
+ unsigned int detached:1;
};
/*
@@ -74,7 +88,6 @@ struct backref_node {
struct backref_edge {
struct list_head list[2];
struct backref_node *node[2];
- u64 blockptr;
};
#define LOWER 0
@@ -83,9 +96,25 @@ struct backref_edge {
struct backref_cache {
/* red black tree of all backref nodes in the cache */
struct rb_root rb_root;
- /* list of backref nodes with no child block in the cache */
+ /* for passing backref nodes to btrfs_reloc_cow_block */
+ struct backref_node *path[BTRFS_MAX_LEVEL];
+ /*
+ * list of blocks that have been cowed but some block
+ * pointers in upper level blocks may not reflect the
+ * new location
+ */
struct list_head pending[BTRFS_MAX_LEVEL];
- spinlock_t lock;
+ /* list of backref nodes with no child node */
+ struct list_head leaves;
+ /* list of blocks that have been cowed in current transaction */
+ struct list_head changed;
+	/* list of detached backref nodes */
+ struct list_head detached;
+
+ u64 last_trans;
+
+ int nr_nodes;
+ int nr_edges;
};
/*
@@ -113,15 +142,6 @@ struct tree_block {
unsigned int key_ready:1;
};
-/* inode vector */
-#define INODEVEC_SIZE 16
-
-struct inodevec {
- struct list_head list;
- struct inode *inode[INODEVEC_SIZE];
- int nr;
-};
-
#define MAX_EXTENTS 128
struct file_extent_cluster {
@@ -138,36 +158,43 @@ struct reloc_control {
struct btrfs_root *extent_root;
/* inode for moving data */
struct inode *data_inode;
- struct btrfs_workers workers;
+
+ struct btrfs_block_rsv *block_rsv;
+
+ struct backref_cache backref_cache;
+
+ struct file_extent_cluster cluster;
/* tree blocks have been processed */
struct extent_io_tree processed_blocks;
/* map start of tree root to corresponding reloc tree */
struct mapping_tree reloc_root_tree;
/* list of reloc trees */
struct list_head reloc_roots;
+ /* size of metadata reservation for merging reloc trees */
+ u64 merging_rsv_size;
+ /* size of relocated tree nodes */
+ u64 nodes_relocated;
+
u64 search_start;
u64 extents_found;
- u64 extents_skipped;
- int stage;
- int create_reloc_root;
+
+ int block_rsv_retries;
+
+ unsigned int stage:8;
+ unsigned int create_reloc_tree:1;
+ unsigned int merge_reloc_tree:1;
unsigned int found_file_extent:1;
- unsigned int found_old_snapshot:1;
+ unsigned int commit_transaction:1;
};
/* stages of data relocation */
#define MOVE_DATA_EXTENTS 0
#define UPDATE_DATA_PTRS 1
-/*
- * merge reloc tree to corresponding fs tree in worker threads
- */
-struct async_merge {
- struct btrfs_work work;
- struct reloc_control *rc;
- struct btrfs_root *root;
- struct completion *done;
- atomic_t *num_pending;
-};
+static void remove_backref_node(struct backref_cache *cache,
+ struct backref_node *node);
+static void __mark_block_processed(struct reloc_control *rc,
+ struct backref_node *node);
static void mapping_tree_init(struct mapping_tree *tree)
{
@@ -181,15 +208,80 @@ static void backref_cache_init(struct backref_cache *cache)
cache->rb_root = RB_ROOT;
for (i = 0; i < BTRFS_MAX_LEVEL; i++)
INIT_LIST_HEAD(&cache->pending[i]);
- spin_lock_init(&cache->lock);
+ INIT_LIST_HEAD(&cache->changed);
+ INIT_LIST_HEAD(&cache->detached);
+ INIT_LIST_HEAD(&cache->leaves);
+}
+
+static void backref_cache_cleanup(struct backref_cache *cache)
+{
+ struct backref_node *node;
+ int i;
+
+ while (!list_empty(&cache->detached)) {
+ node = list_entry(cache->detached.next,
+ struct backref_node, list);
+ remove_backref_node(cache, node);
+ }
+
+ while (!list_empty(&cache->leaves)) {
+ node = list_entry(cache->leaves.next,
+ struct backref_node, lower);
+ remove_backref_node(cache, node);
+ }
+
+ cache->last_trans = 0;
+
+ for (i = 0; i < BTRFS_MAX_LEVEL; i++)
+ BUG_ON(!list_empty(&cache->pending[i]));
+ BUG_ON(!list_empty(&cache->changed));
+ BUG_ON(!list_empty(&cache->detached));
+ BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root));
+ BUG_ON(cache->nr_nodes);
+ BUG_ON(cache->nr_edges);
+}
+
+static struct backref_node *alloc_backref_node(struct backref_cache *cache)
+{
+ struct backref_node *node;
+
+ node = kzalloc(sizeof(*node), GFP_NOFS);
+ if (node) {
+ INIT_LIST_HEAD(&node->list);
+ INIT_LIST_HEAD(&node->upper);
+ INIT_LIST_HEAD(&node->lower);
+ RB_CLEAR_NODE(&node->rb_node);
+ cache->nr_nodes++;
+ }
+ return node;
+}
+
+static void free_backref_node(struct backref_cache *cache,
+ struct backref_node *node)
+{
+ if (node) {
+ cache->nr_nodes--;
+ kfree(node);
+ }
+}
+
+static struct backref_edge *alloc_backref_edge(struct backref_cache *cache)
+{
+ struct backref_edge *edge;
+
+ edge = kzalloc(sizeof(*edge), GFP_NOFS);
+ if (edge)
+ cache->nr_edges++;
+ return edge;
}
-static void backref_node_init(struct backref_node *node)
+static void free_backref_edge(struct backref_cache *cache,
+ struct backref_edge *edge)
{
- memset(node, 0, sizeof(*node));
- INIT_LIST_HEAD(&node->upper);
- INIT_LIST_HEAD(&node->lower);
- RB_CLEAR_NODE(&node->rb_node);
+ if (edge) {
+ cache->nr_edges--;
+ kfree(edge);
+ }
}
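/*
 * alloc_backref_node()/free_backref_node() above pair every allocation with
 * a counter so backref_cache_cleanup() can BUG_ON() a leak. The same
 * pattern in miniature, with calloc standing in for the kernel allocator:
 */
#include <assert.h>
#include <stdlib.h>

struct counted_cache { int nr_nodes; };

static void *cache_alloc(struct counted_cache *c, size_t size)
{
	void *p = calloc(1, size);

	if (p)
		c->nr_nodes++;		/* count successful allocations only */
	return p;
}

static void cache_free(struct counted_cache *c, void *p)
{
	if (p) {
		c->nr_nodes--;
		free(p);
	}
}

int main(void)
{
	struct counted_cache c = { 0 };
	void *node = cache_alloc(&c, 64);

	cache_free(&c, node);
	assert(c.nr_nodes == 0);	/* cleanup-time leak check */
	return 0;
}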
static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
@@ -250,6 +342,7 @@ static struct backref_node *walk_up_backref(struct backref_node *node,
edges[idx++] = edge;
node = edge->node[UPPER];
}
+ BUG_ON(node->detached);
*index = idx;
return node;
}
@@ -281,13 +374,18 @@ static struct backref_node *walk_down_backref(struct backref_edge *edges[],
return NULL;
}
+static void unlock_node_buffer(struct backref_node *node)
+{
+ if (node->locked) {
+ btrfs_tree_unlock(node->eb);
+ node->locked = 0;
+ }
+}
+
static void drop_node_buffer(struct backref_node *node)
{
if (node->eb) {
- if (node->locked) {
- btrfs_tree_unlock(node->eb);
- node->locked = 0;
- }
+ unlock_node_buffer(node);
free_extent_buffer(node->eb);
node->eb = NULL;
}
@@ -296,14 +394,14 @@ static void drop_node_buffer(struct backref_node *node)
static void drop_backref_node(struct backref_cache *tree,
struct backref_node *node)
{
- BUG_ON(!node->lowest);
BUG_ON(!list_empty(&node->upper));
drop_node_buffer(node);
+ list_del(&node->list);
list_del(&node->lower);
-
- rb_erase(&node->rb_node, &tree->rb_root);
- kfree(node);
+ if (!RB_EMPTY_NODE(&node->rb_node))
+ rb_erase(&node->rb_node, &tree->rb_root);
+ free_backref_node(tree, node);
}
/*
@@ -318,27 +416,121 @@ static void remove_backref_node(struct backref_cache *cache,
if (!node)
return;
- BUG_ON(!node->lowest);
+ BUG_ON(!node->lowest && !node->detached);
while (!list_empty(&node->upper)) {
edge = list_entry(node->upper.next, struct backref_edge,
list[LOWER]);
upper = edge->node[UPPER];
list_del(&edge->list[LOWER]);
list_del(&edge->list[UPPER]);
- kfree(edge);
+ free_backref_edge(cache, edge);
+
+ if (RB_EMPTY_NODE(&upper->rb_node)) {
+ BUG_ON(!list_empty(&node->upper));
+ drop_backref_node(cache, node);
+ node = upper;
+ node->lowest = 1;
+ continue;
+ }
/*
- * add the node to pending list if no other
+ * add the node to leaf node list if no other
		 * child block is cached.
*/
if (list_empty(&upper->lower)) {
- list_add_tail(&upper->lower,
- &cache->pending[upper->level]);
+ list_add_tail(&upper->lower, &cache->leaves);
upper->lowest = 1;
}
}
+
drop_backref_node(cache, node);
}
+static void update_backref_node(struct backref_cache *cache,
+ struct backref_node *node, u64 bytenr)
+{
+ struct rb_node *rb_node;
+ rb_erase(&node->rb_node, &cache->rb_root);
+ node->bytenr = bytenr;
+ rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node);
+ BUG_ON(rb_node);
+}
+
+/*
+ * update backref cache after a transaction commit
+ */
+static int update_backref_cache(struct btrfs_trans_handle *trans,
+ struct backref_cache *cache)
+{
+ struct backref_node *node;
+ int level = 0;
+
+ if (cache->last_trans == 0) {
+ cache->last_trans = trans->transid;
+ return 0;
+ }
+
+ if (cache->last_trans == trans->transid)
+ return 0;
+
+ /*
+	 * detached nodes are used to avoid unnecessary backref
+	 * lookups. a transaction commit changes the extent tree,
+	 * so the detached nodes are no longer useful.
+ */
+ while (!list_empty(&cache->detached)) {
+ node = list_entry(cache->detached.next,
+ struct backref_node, list);
+ remove_backref_node(cache, node);
+ }
+
+ while (!list_empty(&cache->changed)) {
+ node = list_entry(cache->changed.next,
+ struct backref_node, list);
+ list_del_init(&node->list);
+ BUG_ON(node->pending);
+ update_backref_node(cache, node, node->new_bytenr);
+ }
+
+ /*
+ * some nodes can be left in the pending list if there were
+	 * errors while processing the pending nodes.
+ */
+ for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
+ list_for_each_entry(node, &cache->pending[level], list) {
+ BUG_ON(!node->pending);
+ if (node->bytenr == node->new_bytenr)
+ continue;
+ update_backref_node(cache, node, node->new_bytenr);
+ }
+ }
+
+ cache->last_trans = 0;
+ return 1;
+}
+
+static int should_ignore_root(struct btrfs_root *root)
+{
+ struct btrfs_root *reloc_root;
+
+ if (!root->ref_cows)
+ return 0;
+
+ reloc_root = root->reloc_root;
+ if (!reloc_root)
+ return 0;
+
+ if (btrfs_root_last_snapshot(&reloc_root->root_item) ==
+ root->fs_info->running_transaction->transid - 1)
+ return 0;
+ /*
+	 * if there is a reloc tree and it was created in a previous
+	 * transaction, backref lookup can find the reloc tree, so the
+	 * backref node for the fs tree root is useless for relocation.
+ */
+ return 1;
+}
+
/*
* find reloc tree by address of tree root
*/
@@ -453,11 +645,12 @@ int find_inline_backref(struct extent_buffer *leaf, int slot,
* for all upper level blocks that directly/indirectly reference the
* block are also cached.
*/
-static struct backref_node *build_backref_tree(struct reloc_control *rc,
- struct backref_cache *cache,
- struct btrfs_key *node_key,
- int level, u64 bytenr)
+static noinline_for_stack
+struct backref_node *build_backref_tree(struct reloc_control *rc,
+ struct btrfs_key *node_key,
+ int level, u64 bytenr)
{
+ struct backref_cache *cache = &rc->backref_cache;
struct btrfs_path *path1;
struct btrfs_path *path2;
struct extent_buffer *eb;
@@ -473,6 +666,8 @@ static struct backref_node *build_backref_tree(struct reloc_control *rc,
unsigned long end;
unsigned long ptr;
LIST_HEAD(list);
+ LIST_HEAD(useless);
+ int cowonly;
int ret;
int err = 0;
@@ -483,15 +678,13 @@ static struct backref_node *build_backref_tree(struct reloc_control *rc,
goto out;
}
- node = kmalloc(sizeof(*node), GFP_NOFS);
+ node = alloc_backref_node(cache);
if (!node) {
err = -ENOMEM;
goto out;
}
- backref_node_init(node);
node->bytenr = bytenr;
- node->owner = 0;
node->level = level;
node->lowest = 1;
cur = node;
@@ -587,17 +780,21 @@ again:
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
if (key.type == BTRFS_SHARED_BLOCK_REF_KEY ||
key.type == BTRFS_EXTENT_REF_V0_KEY) {
- if (key.objectid == key.offset &&
- key.type == BTRFS_EXTENT_REF_V0_KEY) {
+ if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
struct btrfs_extent_ref_v0 *ref0;
ref0 = btrfs_item_ptr(eb, path1->slots[0],
struct btrfs_extent_ref_v0);
- root = find_tree_root(rc, eb, ref0);
- if (root)
- cur->root = root;
- else
- cur->old_root = 1;
- break;
+ if (key.objectid == key.offset) {
+ root = find_tree_root(rc, eb, ref0);
+ if (root && !should_ignore_root(root))
+ cur->root = root;
+ else
+ list_add(&cur->list, &useless);
+ break;
+ }
+ if (is_cowonly_root(btrfs_ref_root_v0(eb,
+ ref0)))
+ cur->cowonly = 1;
}
#else
BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
@@ -614,22 +811,20 @@ again:
break;
}
- edge = kzalloc(sizeof(*edge), GFP_NOFS);
+ edge = alloc_backref_edge(cache);
if (!edge) {
err = -ENOMEM;
goto out;
}
rb_node = tree_search(&cache->rb_root, key.offset);
if (!rb_node) {
- upper = kmalloc(sizeof(*upper), GFP_NOFS);
+ upper = alloc_backref_node(cache);
if (!upper) {
- kfree(edge);
+ free_backref_edge(cache, edge);
err = -ENOMEM;
goto out;
}
- backref_node_init(upper);
upper->bytenr = key.offset;
- upper->owner = 0;
upper->level = cur->level + 1;
/*
* backrefs for the upper level block isn't
@@ -639,11 +834,12 @@ again:
} else {
upper = rb_entry(rb_node, struct backref_node,
rb_node);
+ BUG_ON(!upper->checked);
INIT_LIST_HEAD(&edge->list[UPPER]);
}
- list_add(&edge->list[LOWER], &cur->upper);
- edge->node[UPPER] = upper;
+ list_add_tail(&edge->list[LOWER], &cur->upper);
edge->node[LOWER] = cur;
+ edge->node[UPPER] = upper;
goto next;
} else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) {
@@ -657,11 +853,17 @@ again:
goto out;
}
+ if (!root->ref_cows)
+ cur->cowonly = 1;
+
if (btrfs_root_level(&root->root_item) == cur->level) {
/* tree root */
BUG_ON(btrfs_root_bytenr(&root->root_item) !=
cur->bytenr);
- cur->root = root;
+ if (should_ignore_root(root))
+ list_add(&cur->list, &useless);
+ else
+ cur->root = root;
break;
}
@@ -692,11 +894,14 @@ again:
if (!path2->nodes[level]) {
BUG_ON(btrfs_root_bytenr(&root->root_item) !=
lower->bytenr);
- lower->root = root;
+ if (should_ignore_root(root))
+ list_add(&lower->list, &useless);
+ else
+ lower->root = root;
break;
}
- edge = kzalloc(sizeof(*edge), GFP_NOFS);
+ edge = alloc_backref_edge(cache);
if (!edge) {
err = -ENOMEM;
goto out;
@@ -705,16 +910,17 @@ again:
eb = path2->nodes[level];
rb_node = tree_search(&cache->rb_root, eb->start);
if (!rb_node) {
- upper = kmalloc(sizeof(*upper), GFP_NOFS);
+ upper = alloc_backref_node(cache);
if (!upper) {
- kfree(edge);
+ free_backref_edge(cache, edge);
err = -ENOMEM;
goto out;
}
- backref_node_init(upper);
upper->bytenr = eb->start;
upper->owner = btrfs_header_owner(eb);
upper->level = lower->level + 1;
+ if (!root->ref_cows)
+ upper->cowonly = 1;
/*
* if we know the block isn't shared
@@ -744,10 +950,12 @@ again:
rb_node);
BUG_ON(!upper->checked);
INIT_LIST_HEAD(&edge->list[UPPER]);
+ if (!upper->owner)
+ upper->owner = btrfs_header_owner(eb);
}
list_add_tail(&edge->list[LOWER], &lower->upper);
- edge->node[UPPER] = upper;
edge->node[LOWER] = lower;
+ edge->node[UPPER] = upper;
if (rb_node)
break;
@@ -785,8 +993,13 @@ next:
* into the cache.
*/
BUG_ON(!node->checked);
- rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node);
- BUG_ON(rb_node);
+ cowonly = node->cowonly;
+ if (!cowonly) {
+ rb_node = tree_insert(&cache->rb_root, node->bytenr,
+ &node->rb_node);
+ BUG_ON(rb_node);
+ list_add_tail(&node->lower, &cache->leaves);
+ }
list_for_each_entry(edge, &node->upper, list[LOWER])
list_add_tail(&edge->list[UPPER], &list);
@@ -795,6 +1008,14 @@ next:
edge = list_entry(list.next, struct backref_edge, list[UPPER]);
list_del_init(&edge->list[UPPER]);
upper = edge->node[UPPER];
+ if (upper->detached) {
+ list_del(&edge->list[LOWER]);
+ lower = edge->node[LOWER];
+ free_backref_edge(cache, edge);
+ if (list_empty(&lower->upper))
+ list_add(&lower->list, &useless);
+ continue;
+ }
if (!RB_EMPTY_NODE(&upper->rb_node)) {
if (upper->lowest) {
@@ -807,25 +1028,69 @@ next:
}
BUG_ON(!upper->checked);
- rb_node = tree_insert(&cache->rb_root, upper->bytenr,
- &upper->rb_node);
- BUG_ON(rb_node);
+ BUG_ON(cowonly != upper->cowonly);
+ if (!cowonly) {
+ rb_node = tree_insert(&cache->rb_root, upper->bytenr,
+ &upper->rb_node);
+ BUG_ON(rb_node);
+ }
list_add_tail(&edge->list[UPPER], &upper->lower);
list_for_each_entry(edge, &upper->upper, list[LOWER])
list_add_tail(&edge->list[UPPER], &list);
}
+ /*
+ * process useless backref nodes. backref nodes for tree leaves
+ * are deleted from the cache. backref nodes for upper level
+ * tree blocks are left in the cache to avoid unnecessary backref
+	 * lookups.
+ */
+ while (!list_empty(&useless)) {
+ upper = list_entry(useless.next, struct backref_node, list);
+ list_del_init(&upper->list);
+ BUG_ON(!list_empty(&upper->upper));
+ if (upper == node)
+ node = NULL;
+ if (upper->lowest) {
+ list_del_init(&upper->lower);
+ upper->lowest = 0;
+ }
+ while (!list_empty(&upper->lower)) {
+ edge = list_entry(upper->lower.next,
+ struct backref_edge, list[UPPER]);
+ list_del(&edge->list[UPPER]);
+ list_del(&edge->list[LOWER]);
+ lower = edge->node[LOWER];
+ free_backref_edge(cache, edge);
+
+ if (list_empty(&lower->upper))
+ list_add(&lower->list, &useless);
+ }
+ __mark_block_processed(rc, upper);
+ if (upper->level > 0) {
+ list_add(&upper->list, &cache->detached);
+ upper->detached = 1;
+ } else {
+ rb_erase(&upper->rb_node, &cache->rb_root);
+ free_backref_node(cache, upper);
+ }
+ }
out:
btrfs_free_path(path1);
btrfs_free_path(path2);
if (err) {
- INIT_LIST_HEAD(&list);
+ while (!list_empty(&useless)) {
+ lower = list_entry(useless.next,
+ struct backref_node, upper);
+ list_del_init(&lower->upper);
+ }
upper = node;
+ INIT_LIST_HEAD(&list);
while (upper) {
if (RB_EMPTY_NODE(&upper->rb_node)) {
list_splice_tail(&upper->upper, &list);
- kfree(upper);
+ free_backref_node(cache, upper);
}
if (list_empty(&list))
@@ -833,15 +1098,104 @@ out:
edge = list_entry(list.next, struct backref_edge,
list[LOWER]);
+ list_del(&edge->list[LOWER]);
upper = edge->node[UPPER];
- kfree(edge);
+ free_backref_edge(cache, edge);
}
return ERR_PTR(err);
}
+ BUG_ON(node && node->detached);
return node;
}
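/*
 * A minimal sketch of the graph that build_backref_tree() caches, with the
 * struct bodies trimmed to the essentials (the real structs also carry
 * rb-tree links, lock state and flags). Each edge sits on two lists at
 * once, one per endpoint, which is why the patch indexes edge->list[] and
 * edge->node[] with LOWER/UPPER. Names follow the patch; the code itself
 * is illustrative, not kernel API.
 */
struct toy_list_head { struct toy_list_head *next, *prev; };

enum { LOWER = 0, UPPER = 1 };

struct toy_backref_node {
	unsigned long long bytenr;	/* start of the tree block */
	int level;			/* btree level of the block */
	struct toy_list_head upper;	/* edges to blocks referencing this one */
	struct toy_list_head lower;	/* edges to blocks this one references */
};

struct toy_backref_edge {
	struct toy_list_head list[2];	/* list[LOWER] on lower->upper, list[UPPER] on upper->lower */
	struct toy_backref_node *node[2];	/* node[LOWER] and node[UPPER] */
};

int main(void)
{
	struct toy_backref_node leaf = { 4096, 0, { 0, 0 }, { 0, 0 } };
	struct toy_backref_node parent = { 8192, 1, { 0, 0 }, { 0, 0 } };
	struct toy_backref_edge e;

	e.node[LOWER] = &leaf;
	e.node[UPPER] = &parent;
	return e.node[UPPER]->level - (e.node[LOWER]->level + 1);	/* 0 */
}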
/*
+ * helper to add a backref node for the newly created snapshot.
+ * the backref node is created by cloning the backref node that
+ * corresponds to the root of the source tree.
+ */
+static int clone_backref_node(struct btrfs_trans_handle *trans,
+ struct reloc_control *rc,
+ struct btrfs_root *src,
+ struct btrfs_root *dest)
+{
+ struct btrfs_root *reloc_root = src->reloc_root;
+ struct backref_cache *cache = &rc->backref_cache;
+ struct backref_node *node = NULL;
+ struct backref_node *new_node;
+ struct backref_edge *edge;
+ struct backref_edge *new_edge;
+ struct rb_node *rb_node;
+
+ if (cache->last_trans > 0)
+ update_backref_cache(trans, cache);
+
+ rb_node = tree_search(&cache->rb_root, src->commit_root->start);
+ if (rb_node) {
+ node = rb_entry(rb_node, struct backref_node, rb_node);
+ if (node->detached)
+ node = NULL;
+ else
+ BUG_ON(node->new_bytenr != reloc_root->node->start);
+ }
+
+ if (!node) {
+ rb_node = tree_search(&cache->rb_root,
+ reloc_root->commit_root->start);
+ if (rb_node) {
+ node = rb_entry(rb_node, struct backref_node,
+ rb_node);
+ BUG_ON(node->detached);
+ }
+ }
+
+ if (!node)
+ return 0;
+
+ new_node = alloc_backref_node(cache);
+ if (!new_node)
+ return -ENOMEM;
+
+ new_node->bytenr = dest->node->start;
+ new_node->level = node->level;
+ new_node->lowest = node->lowest;
+ new_node->root = dest;
+
+ if (!node->lowest) {
+ list_for_each_entry(edge, &node->lower, list[UPPER]) {
+ new_edge = alloc_backref_edge(cache);
+ if (!new_edge)
+ goto fail;
+
+ new_edge->node[UPPER] = new_node;
+ new_edge->node[LOWER] = edge->node[LOWER];
+ list_add_tail(&new_edge->list[UPPER],
+ &new_node->lower);
+ }
+ }
+
+ rb_node = tree_insert(&cache->rb_root, new_node->bytenr,
+ &new_node->rb_node);
+ BUG_ON(rb_node);
+
+ if (!new_node->lowest) {
+ list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) {
+ list_add_tail(&new_edge->list[LOWER],
+ &new_edge->node[LOWER]->upper);
+ }
+ }
+ return 0;
+fail:
+ while (!list_empty(&new_node->lower)) {
+ new_edge = list_entry(new_node->lower.next,
+ struct backref_edge, list[UPPER]);
+ list_del(&new_edge->list[UPPER]);
+ free_backref_edge(cache, new_edge);
+ }
+ free_backref_node(cache, new_node);
+ return -ENOMEM;
+}
+
+/*
* helper to add 'address of tree root -> reloc tree' mapping
*/
static int __add_reloc_root(struct btrfs_root *root)
@@ -901,12 +1255,8 @@ static int __update_reloc_root(struct btrfs_root *root, int del)
return 0;
}
-/*
- * create reloc tree for a given fs tree. reloc tree is just a
- * snapshot of the fs tree with special root objectid.
- */
-int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
+static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 objectid)
{
struct btrfs_root *reloc_root;
struct extent_buffer *eb;
@@ -914,36 +1264,45 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
struct btrfs_key root_key;
int ret;
- if (root->reloc_root) {
- reloc_root = root->reloc_root;
- reloc_root->last_trans = trans->transid;
- return 0;
- }
-
- if (!root->fs_info->reloc_ctl ||
- !root->fs_info->reloc_ctl->create_reloc_root ||
- root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
- return 0;
-
root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
BUG_ON(!root_item);
root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
root_key.type = BTRFS_ROOT_ITEM_KEY;
- root_key.offset = root->root_key.objectid;
+ root_key.offset = objectid;
- ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
- BTRFS_TREE_RELOC_OBJECTID);
- BUG_ON(ret);
+ if (root->root_key.objectid == objectid) {
+ /* called by btrfs_init_reloc_root */
+ ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
+ BTRFS_TREE_RELOC_OBJECTID);
+ BUG_ON(ret);
+
+ btrfs_set_root_last_snapshot(&root->root_item,
+ trans->transid - 1);
+ } else {
+ /*
+ * called by btrfs_reloc_post_snapshot_hook.
+		 * the source tree is a reloc tree, and all tree blocks
+		 * modified after it was created have the RELOC flag
+		 * set in their headers, so it's OK not to update
+		 * 'last_snapshot'.
+ */
+ ret = btrfs_copy_root(trans, root, root->node, &eb,
+ BTRFS_TREE_RELOC_OBJECTID);
+ BUG_ON(ret);
+ }
- btrfs_set_root_last_snapshot(&root->root_item, trans->transid - 1);
memcpy(root_item, &root->root_item, sizeof(*root_item));
- btrfs_set_root_refs(root_item, 1);
btrfs_set_root_bytenr(root_item, eb->start);
btrfs_set_root_level(root_item, btrfs_header_level(eb));
btrfs_set_root_generation(root_item, trans->transid);
- memset(&root_item->drop_progress, 0, sizeof(struct btrfs_disk_key));
- root_item->drop_level = 0;
+
+ if (root->root_key.objectid == objectid) {
+ btrfs_set_root_refs(root_item, 0);
+ memset(&root_item->drop_progress, 0,
+ sizeof(struct btrfs_disk_key));
+ root_item->drop_level = 0;
+ }
btrfs_tree_unlock(eb);
free_extent_buffer(eb);
@@ -957,6 +1316,37 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
&root_key);
BUG_ON(IS_ERR(reloc_root));
reloc_root->last_trans = trans->transid;
+ return reloc_root;
+}
+
+/*
+ * create reloc tree for a given fs tree. reloc tree is just a
+ * snapshot of the fs tree with special root objectid.
+ */
+int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_root *reloc_root;
+ struct reloc_control *rc = root->fs_info->reloc_ctl;
+ int clear_rsv = 0;
+
+ if (root->reloc_root) {
+ reloc_root = root->reloc_root;
+ reloc_root->last_trans = trans->transid;
+ return 0;
+ }
+
+ if (!rc || !rc->create_reloc_tree ||
+ root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
+ return 0;
+
+ if (!trans->block_rsv) {
+ trans->block_rsv = rc->block_rsv;
+ clear_rsv = 1;
+ }
+ reloc_root = create_reloc_root(trans, root, root->root_key.objectid);
+ if (clear_rsv)
+ trans->block_rsv = NULL;
__add_reloc_root(reloc_root);
root->reloc_root = reloc_root;
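/*
 * A small sketch of the borrow-and-restore pattern used above for
 * trans->block_rsv: the fallback reservation is substituted only when the
 * handle carries none, and only then is it cleared afterwards, so a
 * caller's own reservation is never disturbed. Names are illustrative.
 */
#include <assert.h>
#include <stddef.h>

struct toy_rsv { long bytes; };
struct toy_trans { struct toy_rsv *block_rsv; };

static void toy_with_fallback_rsv(struct toy_trans *t, struct toy_rsv *fallback,
				  void (*op)(struct toy_trans *))
{
	int borrowed = 0;

	if (!t->block_rsv) {		/* borrow only if none is present */
		t->block_rsv = fallback;
		borrowed = 1;
	}
	op(t);
	if (borrowed)			/* restore exactly what we changed */
		t->block_rsv = NULL;
}

static void toy_op(struct toy_trans *t) { t->block_rsv->bytes -= 1; }

int main(void)
{
	struct toy_rsv pool = { 10 };
	struct toy_trans t = { NULL };

	toy_with_fallback_rsv(&t, &pool, toy_op);
	assert(t.block_rsv == NULL && pool.bytes == 9);
	return 0;
}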
@@ -980,7 +1370,8 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
reloc_root = root->reloc_root;
root_item = &reloc_root->root_item;
- if (btrfs_root_refs(root_item) == 0) {
+ if (root->fs_info->reloc_ctl->merge_reloc_tree &&
+ btrfs_root_refs(root_item) == 0) {
root->reloc_root = NULL;
del = 1;
}
@@ -1102,8 +1493,7 @@ static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
goto out;
}
- if (new_bytenr)
- *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+ *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
ret = 0;
out:
btrfs_free_path(path);
@@ -1114,19 +1504,18 @@ out:
* update file extent items in the tree leaf to point to
* the new locations.
*/
-static int replace_file_extents(struct btrfs_trans_handle *trans,
- struct reloc_control *rc,
- struct btrfs_root *root,
- struct extent_buffer *leaf,
- struct list_head *inode_list)
+static noinline_for_stack
+int replace_file_extents(struct btrfs_trans_handle *trans,
+ struct reloc_control *rc,
+ struct btrfs_root *root,
+ struct extent_buffer *leaf)
{
struct btrfs_key key;
struct btrfs_file_extent_item *fi;
struct inode *inode = NULL;
- struct inodevec *ivec = NULL;
u64 parent;
u64 bytenr;
- u64 new_bytenr;
+ u64 new_bytenr = 0;
u64 num_bytes;
u64 end;
u32 nritems;
@@ -1166,21 +1555,12 @@ static int replace_file_extents(struct btrfs_trans_handle *trans,
* to complete and drop the extent cache
*/
if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
- if (!ivec || ivec->nr == INODEVEC_SIZE) {
- ivec = kmalloc(sizeof(*ivec), GFP_NOFS);
- BUG_ON(!ivec);
- ivec->nr = 0;
- list_add_tail(&ivec->list, inode_list);
- }
if (first) {
inode = find_next_inode(root, key.objectid);
- if (inode)
- ivec->inode[ivec->nr++] = inode;
first = 0;
} else if (inode && inode->i_ino < key.objectid) {
+ btrfs_add_delayed_iput(inode);
inode = find_next_inode(root, key.objectid);
- if (inode)
- ivec->inode[ivec->nr++] = inode;
}
if (inode && inode->i_ino == key.objectid) {
end = key.offset +
@@ -1204,8 +1584,10 @@ static int replace_file_extents(struct btrfs_trans_handle *trans,
ret = get_new_location(rc->data_inode, &new_bytenr,
bytenr, num_bytes);
- if (ret > 0)
+ if (ret > 0) {
+ WARN_ON(1);
continue;
+ }
BUG_ON(ret < 0);
btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
@@ -1225,6 +1607,8 @@ static int replace_file_extents(struct btrfs_trans_handle *trans,
}
if (dirty)
btrfs_mark_buffer_dirty(leaf);
+ if (inode)
+ btrfs_add_delayed_iput(inode);
return 0;
}
@@ -1248,11 +1632,11 @@ int memcmp_node_keys(struct extent_buffer *eb, int slot,
* if no block got replaced, 0 is returned. if there are other
* errors, a negative error number is returned.
*/
-static int replace_path(struct btrfs_trans_handle *trans,
- struct btrfs_root *dest, struct btrfs_root *src,
- struct btrfs_path *path, struct btrfs_key *next_key,
- struct extent_buffer **leaf,
- int lowest_level, int max_level)
+static noinline_for_stack
+int replace_path(struct btrfs_trans_handle *trans,
+ struct btrfs_root *dest, struct btrfs_root *src,
+ struct btrfs_path *path, struct btrfs_key *next_key,
+ int lowest_level, int max_level)
{
struct extent_buffer *eb;
struct extent_buffer *parent;
@@ -1263,16 +1647,16 @@ static int replace_path(struct btrfs_trans_handle *trans,
u64 new_ptr_gen;
u64 last_snapshot;
u32 blocksize;
+ int cow = 0;
int level;
int ret;
int slot;
BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
- BUG_ON(lowest_level > 1 && leaf);
last_snapshot = btrfs_root_last_snapshot(&src->root_item);
-
+again:
slot = path->slots[lowest_level];
btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);
@@ -1286,8 +1670,10 @@ static int replace_path(struct btrfs_trans_handle *trans,
return 0;
}
- ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
- BUG_ON(ret);
+ if (cow) {
+ ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
+ BUG_ON(ret);
+ }
btrfs_set_lock_blocking(eb);
if (next_key) {
@@ -1331,7 +1717,7 @@ static int replace_path(struct btrfs_trans_handle *trans,
if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
memcmp_node_keys(parent, slot, path, level)) {
- if (level <= lowest_level && !leaf) {
+ if (level <= lowest_level) {
ret = 0;
break;
}
@@ -1339,16 +1725,12 @@ static int replace_path(struct btrfs_trans_handle *trans,
eb = read_tree_block(dest, old_bytenr, blocksize,
old_ptr_gen);
btrfs_tree_lock(eb);
- ret = btrfs_cow_block(trans, dest, eb, parent,
- slot, &eb);
- BUG_ON(ret);
- btrfs_set_lock_blocking(eb);
-
- if (level <= lowest_level) {
- *leaf = eb;
- ret = 0;
- break;
+ if (cow) {
+ ret = btrfs_cow_block(trans, dest, eb, parent,
+ slot, &eb);
+ BUG_ON(ret);
}
+ btrfs_set_lock_blocking(eb);
btrfs_tree_unlock(parent);
free_extent_buffer(parent);
@@ -1357,6 +1739,13 @@ static int replace_path(struct btrfs_trans_handle *trans,
continue;
}
+ if (!cow) {
+ btrfs_tree_unlock(parent);
+ free_extent_buffer(parent);
+ cow = 1;
+ goto again;
+ }
+
btrfs_node_key_to_cpu(path->nodes[level], &key,
path->slots[level]);
btrfs_release_path(src, path);
@@ -1562,20 +1951,6 @@ static int invalidate_extent_cache(struct btrfs_root *root,
return 0;
}
-static void put_inodes(struct list_head *list)
-{
- struct inodevec *ivec;
- while (!list_empty(list)) {
- ivec = list_entry(list->next, struct inodevec, list);
- list_del(&ivec->list);
- while (ivec->nr > 0) {
- ivec->nr--;
- iput(ivec->inode[ivec->nr]);
- }
- kfree(ivec);
- }
-}
-
static int find_next_key(struct btrfs_path *path, int level,
struct btrfs_key *key)
@@ -1608,13 +1983,14 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
struct btrfs_root *reloc_root;
struct btrfs_root_item *root_item;
struct btrfs_path *path;
- struct extent_buffer *leaf = NULL;
+ struct extent_buffer *leaf;
unsigned long nr;
int level;
int max_level;
int replaced = 0;
int ret;
int err = 0;
+ u32 min_reserved;
path = btrfs_alloc_path();
if (!path)
@@ -1648,34 +2024,23 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
btrfs_unlock_up_safe(path, 0);
}
- if (level == 0 && rc->stage == UPDATE_DATA_PTRS) {
- trans = btrfs_start_transaction(root, 1);
+ min_reserved = root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
+ memset(&next_key, 0, sizeof(next_key));
- leaf = path->nodes[0];
- btrfs_item_key_to_cpu(leaf, &key, 0);
- btrfs_release_path(reloc_root, path);
+ while (1) {
+ trans = btrfs_start_transaction(root, 0);
+ trans->block_rsv = rc->block_rsv;
- ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
- if (ret < 0) {
- err = ret;
- goto out;
+ ret = btrfs_block_rsv_check(trans, root, rc->block_rsv,
+ min_reserved, 0);
+ if (ret) {
+ BUG_ON(ret != -EAGAIN);
+ ret = btrfs_commit_transaction(trans, root);
+ BUG_ON(ret);
+ continue;
}
- leaf = path->nodes[0];
- btrfs_unlock_up_safe(path, 1);
- ret = replace_file_extents(trans, rc, root, leaf,
- &inode_list);
- if (ret < 0)
- err = ret;
- goto out;
- }
-
- memset(&next_key, 0, sizeof(next_key));
-
- while (1) {
- leaf = NULL;
replaced = 0;
- trans = btrfs_start_transaction(root, 1);
max_level = level;
ret = walk_down_reloc_tree(reloc_root, path, &level);
@@ -1689,14 +2054,9 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
if (!find_next_key(path, level, &key) &&
btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
ret = 0;
- } else if (level == 1 && rc->stage == UPDATE_DATA_PTRS) {
- ret = replace_path(trans, root, reloc_root,
- path, &next_key, &leaf,
- level, max_level);
} else {
- ret = replace_path(trans, root, reloc_root,
- path, &next_key, NULL,
- level, max_level);
+ ret = replace_path(trans, root, reloc_root, path,
+ &next_key, level, max_level);
}
if (ret < 0) {
err = ret;
@@ -1708,16 +2068,6 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
btrfs_node_key_to_cpu(path->nodes[level], &key,
path->slots[level]);
replaced = 1;
- } else if (leaf) {
- /*
- * no block got replaced, try replacing file extents
- */
- btrfs_item_key_to_cpu(leaf, &key, 0);
- ret = replace_file_extents(trans, rc, root, leaf,
- &inode_list);
- btrfs_tree_unlock(leaf);
- free_extent_buffer(leaf);
- BUG_ON(ret < 0);
}
ret = walk_up_reloc_tree(reloc_root, path, &level);
@@ -1734,15 +2084,10 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
root_item->drop_level = level;
nr = trans->blocks_used;
- btrfs_end_transaction(trans, root);
+ btrfs_end_transaction_throttle(trans, root);
btrfs_btree_balance_dirty(root, nr);
- /*
- * put inodes outside transaction, otherwise we may deadlock.
- */
- put_inodes(&inode_list);
-
if (replaced && rc->stage == UPDATE_DATA_PTRS)
invalidate_extent_cache(root, &key, &next_key);
}
@@ -1765,87 +2110,125 @@ out:
sizeof(root_item->drop_progress));
root_item->drop_level = 0;
btrfs_set_root_refs(root_item, 0);
+ btrfs_update_reloc_root(trans, root);
}
nr = trans->blocks_used;
- btrfs_end_transaction(trans, root);
+ btrfs_end_transaction_throttle(trans, root);
btrfs_btree_balance_dirty(root, nr);
- put_inodes(&inode_list);
-
if (replaced && rc->stage == UPDATE_DATA_PTRS)
invalidate_extent_cache(root, &key, &next_key);
return err;
}
-/*
- * callback for the work threads.
- * this function merges reloc tree with corresponding fs tree,
- * and then drops the reloc tree.
- */
-static void merge_func(struct btrfs_work *work)
+static noinline_for_stack
+int prepare_to_merge(struct reloc_control *rc, int err)
{
- struct btrfs_trans_handle *trans;
- struct btrfs_root *root;
+ struct btrfs_root *root = rc->extent_root;
struct btrfs_root *reloc_root;
- struct async_merge *async;
+ struct btrfs_trans_handle *trans;
+ LIST_HEAD(reloc_roots);
+ u64 num_bytes = 0;
+ int ret;
+ int retries = 0;
+
+ mutex_lock(&root->fs_info->trans_mutex);
+ rc->merging_rsv_size += root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
+ rc->merging_rsv_size += rc->nodes_relocated * 2;
+ mutex_unlock(&root->fs_info->trans_mutex);
+again:
+ if (!err) {
+ num_bytes = rc->merging_rsv_size;
+ ret = btrfs_block_rsv_add(NULL, root, rc->block_rsv,
+ num_bytes, &retries);
+ if (ret)
+ err = ret;
+ }
+
+ trans = btrfs_join_transaction(rc->extent_root, 1);
+
+ if (!err) {
+ if (num_bytes != rc->merging_rsv_size) {
+ btrfs_end_transaction(trans, rc->extent_root);
+ btrfs_block_rsv_release(rc->extent_root,
+ rc->block_rsv, num_bytes);
+ retries = 0;
+ goto again;
+ }
+ }
- async = container_of(work, struct async_merge, work);
- reloc_root = async->root;
+ rc->merge_reloc_tree = 1;
+
+ while (!list_empty(&rc->reloc_roots)) {
+ reloc_root = list_entry(rc->reloc_roots.next,
+ struct btrfs_root, root_list);
+ list_del_init(&reloc_root->root_list);
- if (btrfs_root_refs(&reloc_root->root_item) > 0) {
root = read_fs_root(reloc_root->fs_info,
reloc_root->root_key.offset);
BUG_ON(IS_ERR(root));
BUG_ON(root->reloc_root != reloc_root);
- merge_reloc_root(async->rc, root);
-
- trans = btrfs_start_transaction(root, 1);
+ /*
+		 * set the reference count to 1, so btrfs_recover_relocation
+		 * knows it should resume merging
+ */
+ if (!err)
+ btrfs_set_root_refs(&reloc_root->root_item, 1);
btrfs_update_reloc_root(trans, root);
- btrfs_end_transaction(trans, root);
- }
- btrfs_drop_snapshot(reloc_root, 0);
+ list_add(&reloc_root->root_list, &reloc_roots);
+ }
- if (atomic_dec_and_test(async->num_pending))
- complete(async->done);
+ list_splice(&reloc_roots, &rc->reloc_roots);
- kfree(async);
+ if (!err)
+ btrfs_commit_transaction(trans, rc->extent_root);
+ else
+ btrfs_end_transaction(trans, rc->extent_root);
+ return err;
}
-static int merge_reloc_roots(struct reloc_control *rc)
+static noinline_for_stack
+int merge_reloc_roots(struct reloc_control *rc)
{
- struct async_merge *async;
struct btrfs_root *root;
- struct completion done;
- atomic_t num_pending;
+ struct btrfs_root *reloc_root;
+ LIST_HEAD(reloc_roots);
+ int found = 0;
+ int ret;
+again:
+ root = rc->extent_root;
+ mutex_lock(&root->fs_info->trans_mutex);
+ list_splice_init(&rc->reloc_roots, &reloc_roots);
+ mutex_unlock(&root->fs_info->trans_mutex);
- init_completion(&done);
- atomic_set(&num_pending, 1);
+ while (!list_empty(&reloc_roots)) {
+ found = 1;
+ reloc_root = list_entry(reloc_roots.next,
+ struct btrfs_root, root_list);
- while (!list_empty(&rc->reloc_roots)) {
- root = list_entry(rc->reloc_roots.next,
- struct btrfs_root, root_list);
- list_del_init(&root->root_list);
+ if (btrfs_root_refs(&reloc_root->root_item) > 0) {
+ root = read_fs_root(reloc_root->fs_info,
+ reloc_root->root_key.offset);
+ BUG_ON(IS_ERR(root));
+ BUG_ON(root->reloc_root != reloc_root);
- async = kmalloc(sizeof(*async), GFP_NOFS);
- BUG_ON(!async);
- async->work.func = merge_func;
- async->work.flags = 0;
- async->rc = rc;
- async->root = root;
- async->done = &done;
- async->num_pending = &num_pending;
- atomic_inc(&num_pending);
- btrfs_queue_worker(&rc->workers, &async->work);
+ ret = merge_reloc_root(rc, root);
+ BUG_ON(ret);
+ } else {
+ list_del_init(&reloc_root->root_list);
+ }
+ btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0);
}
- if (!atomic_dec_and_test(&num_pending))
- wait_for_completion(&done);
-
+ if (found) {
+ found = 0;
+ goto again;
+ }
BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
return 0;
}
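/*
 * A sketch of the splice-under-lock loop shape used by merge_reloc_roots():
 * work items are moved off the shared list while the lock is held, then
 * processed without it, and the loop repeats because processing can queue
 * more roots. A pthread mutex and a flat array stand in for trans_mutex
 * and the kernel list; everything here is illustrative.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int shared[8], shared_nr;	/* toy "rc->reloc_roots" */

static void toy_drain(void)
{
	int local[8], nr, i, found;

	do {
		found = 0;
		pthread_mutex_lock(&lock);
		nr = shared_nr;			/* splice: take everything */
		for (i = 0; i < nr; i++)
			local[i] = shared[i];
		shared_nr = 0;
		pthread_mutex_unlock(&lock);

		for (i = 0; i < nr; i++) {	/* process without the lock */
			printf("merging root %d\n", local[i]);
			found = 1;
		}
	} while (found);	/* re-check: merging may have added roots */
}

int main(void)
{
	shared[0] = 1; shared[1] = 2; shared[2] = 3;
	shared_nr = 3;
	toy_drain();
	return 0;
}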
@@ -1876,119 +2259,169 @@ static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
return btrfs_record_root_in_trans(trans, root);
}
-/*
- * select one tree from trees that references the block.
- * for blocks in refernce counted trees, we preper reloc tree.
- * if no reloc tree found and reloc_only is true, NULL is returned.
- */
-static struct btrfs_root *__select_one_root(struct btrfs_trans_handle *trans,
- struct backref_node *node,
- struct backref_edge *edges[],
- int *nr, int reloc_only)
+static noinline_for_stack
+struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
+ struct reloc_control *rc,
+ struct backref_node *node,
+ struct backref_edge *edges[], int *nr)
{
struct backref_node *next;
struct btrfs_root *root;
- int index;
- int loop = 0;
-again:
- index = 0;
+ int index = 0;
+
next = node;
while (1) {
cond_resched();
next = walk_up_backref(next, edges, &index);
root = next->root;
- if (!root) {
- BUG_ON(!node->old_root);
- goto skip;
- }
-
- /* no other choice for non-refernce counted tree */
- if (!root->ref_cows) {
- BUG_ON(reloc_only);
- break;
- }
+ BUG_ON(!root);
+ BUG_ON(!root->ref_cows);
if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
record_reloc_root_in_trans(trans, root);
break;
}
- if (loop) {
- btrfs_record_root_in_trans(trans, root);
+ btrfs_record_root_in_trans(trans, root);
+ root = root->reloc_root;
+
+ if (next->new_bytenr != root->node->start) {
+ BUG_ON(next->new_bytenr);
+ BUG_ON(!list_empty(&next->list));
+ next->new_bytenr = root->node->start;
+ next->root = root;
+ list_add_tail(&next->list,
+ &rc->backref_cache.changed);
+ __mark_block_processed(rc, next);
break;
}
- if (reloc_only || next != node) {
- if (!root->reloc_root)
- btrfs_record_root_in_trans(trans, root);
- root = root->reloc_root;
- /*
- * if the reloc tree was created in current
- * transation, there is no node in backref tree
- * corresponds to the root of the reloc tree.
- */
- if (btrfs_root_last_snapshot(&root->root_item) ==
- trans->transid - 1)
- break;
- }
-skip:
+ WARN_ON(1);
root = NULL;
next = walk_down_backref(edges, &index);
if (!next || next->level <= node->level)
break;
}
+ if (!root)
+ return NULL;
- if (!root && !loop && !reloc_only) {
- loop = 1;
- goto again;
+ *nr = index;
+ next = node;
+	/* set up the backref node path for btrfs_reloc_cow_block */
+ while (1) {
+ rc->backref_cache.path[next->level] = next;
+ if (--index < 0)
+ break;
+ next = edges[index]->node[UPPER];
}
-
- if (root)
- *nr = index;
- else
- *nr = 0;
-
return root;
}
+/*
+ * select a tree root for relocation. return NULL if the block
+ * is reference counted; we should use do_relocation() in that
+ * case. return a tree root pointer if the block isn't reference
+ * counted. return -ENOENT if the block is the root of a reloc tree.
+ */
static noinline_for_stack
struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans,
struct backref_node *node)
{
+ struct backref_node *next;
+ struct btrfs_root *root;
+ struct btrfs_root *fs_root = NULL;
struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
- int nr;
- return __select_one_root(trans, node, edges, &nr, 0);
+ int index = 0;
+
+ next = node;
+ while (1) {
+ cond_resched();
+ next = walk_up_backref(next, edges, &index);
+ root = next->root;
+ BUG_ON(!root);
+
+		/* no other choice for a non-reference counted tree */
+ if (!root->ref_cows)
+ return root;
+
+ if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID)
+ fs_root = root;
+
+ if (next != node)
+ return NULL;
+
+ next = walk_down_backref(edges, &index);
+ if (!next || next->level <= node->level)
+ break;
+ }
+
+ if (!fs_root)
+ return ERR_PTR(-ENOENT);
+ return fs_root;
}
static noinline_for_stack
-struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
- struct backref_node *node,
- struct backref_edge *edges[], int *nr)
+u64 calcu_metadata_size(struct reloc_control *rc,
+ struct backref_node *node, int reserve)
{
- return __select_one_root(trans, node, edges, nr, 1);
+ struct backref_node *next = node;
+ struct backref_edge *edge;
+ struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
+ u64 num_bytes = 0;
+ int index = 0;
+
+ BUG_ON(reserve && node->processed);
+
+ while (next) {
+ cond_resched();
+ while (1) {
+ if (next->processed && (reserve || next != node))
+ break;
+
+ num_bytes += btrfs_level_size(rc->extent_root,
+ next->level);
+
+ if (list_empty(&next->upper))
+ break;
+
+ edge = list_entry(next->upper.next,
+ struct backref_edge, list[LOWER]);
+ edges[index++] = edge;
+ next = edge->node[UPPER];
+ }
+ next = walk_down_backref(edges, &index);
+ }
+ return num_bytes;
}
-static void grab_path_buffers(struct btrfs_path *path,
- struct backref_node *node,
- struct backref_edge *edges[], int nr)
+static int reserve_metadata_space(struct btrfs_trans_handle *trans,
+ struct reloc_control *rc,
+ struct backref_node *node)
{
- int i = 0;
- while (1) {
- drop_node_buffer(node);
- node->eb = path->nodes[node->level];
- BUG_ON(!node->eb);
- if (path->locks[node->level])
- node->locked = 1;
- path->nodes[node->level] = NULL;
- path->locks[node->level] = 0;
-
- if (i >= nr)
- break;
+ struct btrfs_root *root = rc->extent_root;
+ u64 num_bytes;
+ int ret;
+
+ num_bytes = calcu_metadata_size(rc, node, 1) * 2;
- edges[i]->blockptr = node->eb->start;
- node = edges[i]->node[UPPER];
- i++;
+ trans->block_rsv = rc->block_rsv;
+ ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes,
+ &rc->block_rsv_retries);
+ if (ret) {
+ if (ret == -EAGAIN)
+ rc->commit_transaction = 1;
+ return ret;
}
+
+ rc->block_rsv_retries = 0;
+ return 0;
+}
+
+static void release_metadata_space(struct reloc_control *rc,
+ struct backref_node *node)
+{
+ u64 num_bytes = calcu_metadata_size(rc, node, 0) * 2;
+ btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, num_bytes);
}
/*
@@ -1999,6 +2432,7 @@ static void grab_path_buffers(struct btrfs_path *path,
* in that case this function just updates pointers.
*/
static int do_relocation(struct btrfs_trans_handle *trans,
+ struct reloc_control *rc,
struct backref_node *node,
struct btrfs_key *key,
struct btrfs_path *path, int lowest)
@@ -2019,18 +2453,25 @@ static int do_relocation(struct btrfs_trans_handle *trans,
BUG_ON(lowest && node->eb);
path->lowest_level = node->level + 1;
+ rc->backref_cache.path[node->level] = node;
list_for_each_entry(edge, &node->upper, list[LOWER]) {
cond_resched();
- if (node->eb && node->eb->start == edge->blockptr)
- continue;
upper = edge->node[UPPER];
- root = select_reloc_root(trans, upper, edges, &nr);
- if (!root)
- continue;
-
- if (upper->eb && !upper->locked)
+ root = select_reloc_root(trans, rc, upper, edges, &nr);
+ BUG_ON(!root);
+
+ if (upper->eb && !upper->locked) {
+ if (!lowest) {
+ ret = btrfs_bin_search(upper->eb, key,
+ upper->level, &slot);
+ BUG_ON(ret);
+ bytenr = btrfs_node_blockptr(upper->eb, slot);
+ if (node->eb->start == bytenr)
+ goto next;
+ }
drop_node_buffer(upper);
+ }
if (!upper->eb) {
ret = btrfs_search_slot(trans, root, key, path, 0, 1);
@@ -2040,11 +2481,17 @@ static int do_relocation(struct btrfs_trans_handle *trans,
}
BUG_ON(ret > 0);
- slot = path->slots[upper->level];
+ if (!upper->eb) {
+ upper->eb = path->nodes[upper->level];
+ path->nodes[upper->level] = NULL;
+ } else {
+ BUG_ON(upper->eb != path->nodes[upper->level]);
+ }
- btrfs_unlock_up_safe(path, upper->level + 1);
- grab_path_buffers(path, upper, edges, nr);
+ upper->locked = 1;
+ path->locks[upper->level] = 0;
+ slot = path->slots[upper->level];
btrfs_release_path(NULL, path);
} else {
ret = btrfs_bin_search(upper->eb, key, upper->level,
@@ -2053,14 +2500,11 @@ static int do_relocation(struct btrfs_trans_handle *trans,
}
bytenr = btrfs_node_blockptr(upper->eb, slot);
- if (!lowest) {
- if (node->eb->start == bytenr) {
- btrfs_tree_unlock(upper->eb);
- upper->locked = 0;
- continue;
- }
+ if (lowest) {
+ BUG_ON(bytenr != node->bytenr);
} else {
- BUG_ON(node->bytenr != bytenr);
+ if (node->eb->start == bytenr)
+ goto next;
}
blocksize = btrfs_level_size(root, node->level);
@@ -2072,13 +2516,13 @@ static int do_relocation(struct btrfs_trans_handle *trans,
if (!node->eb) {
ret = btrfs_cow_block(trans, root, eb, upper->eb,
slot, &eb);
+ btrfs_tree_unlock(eb);
+ free_extent_buffer(eb);
if (ret < 0) {
err = ret;
- break;
+ goto next;
}
- btrfs_set_lock_blocking(eb);
- node->eb = eb;
- node->locked = 1;
+ BUG_ON(node->eb != eb);
} else {
btrfs_set_node_blockptr(upper->eb, slot,
node->eb->start);
@@ -2096,67 +2540,80 @@ static int do_relocation(struct btrfs_trans_handle *trans,
ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
BUG_ON(ret);
}
- if (!lowest) {
- btrfs_tree_unlock(upper->eb);
- upper->locked = 0;
- }
+next:
+ if (!upper->pending)
+ drop_node_buffer(upper);
+ else
+ unlock_node_buffer(upper);
+ if (err)
+ break;
}
+
+ if (!err && node->pending) {
+ drop_node_buffer(node);
+ list_move_tail(&node->list, &rc->backref_cache.changed);
+ node->pending = 0;
+ }
+
path->lowest_level = 0;
+ BUG_ON(err == -ENOSPC);
return err;
}
static int link_to_upper(struct btrfs_trans_handle *trans,
+ struct reloc_control *rc,
struct backref_node *node,
struct btrfs_path *path)
{
struct btrfs_key key;
- if (!node->eb || list_empty(&node->upper))
- return 0;
btrfs_node_key_to_cpu(node->eb, &key, 0);
- return do_relocation(trans, node, &key, path, 0);
+ return do_relocation(trans, rc, node, &key, path, 0);
}
static int finish_pending_nodes(struct btrfs_trans_handle *trans,
- struct backref_cache *cache,
- struct btrfs_path *path)
+ struct reloc_control *rc,
+ struct btrfs_path *path, int err)
{
+ LIST_HEAD(list);
+ struct backref_cache *cache = &rc->backref_cache;
struct backref_node *node;
int level;
int ret;
- int err = 0;
for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
while (!list_empty(&cache->pending[level])) {
node = list_entry(cache->pending[level].next,
- struct backref_node, lower);
- BUG_ON(node->level != level);
+ struct backref_node, list);
+ list_move_tail(&node->list, &list);
+ BUG_ON(!node->pending);
- ret = link_to_upper(trans, node, path);
- if (ret < 0)
- err = ret;
- /*
- * this remove the node from the pending list and
- * may add some other nodes to the level + 1
- * pending list
- */
- remove_backref_node(cache, node);
+ if (!err) {
+ ret = link_to_upper(trans, rc, node, path);
+ if (ret < 0)
+ err = ret;
+ }
}
+ list_splice_init(&list, &cache->pending[level]);
}
- BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root));
return err;
}
static void mark_block_processed(struct reloc_control *rc,
- struct backref_node *node)
+ u64 bytenr, u32 blocksize)
+{
+ set_extent_bits(&rc->processed_blocks, bytenr, bytenr + blocksize - 1,
+ EXTENT_DIRTY, GFP_NOFS);
+}
+
+static void __mark_block_processed(struct reloc_control *rc,
+ struct backref_node *node)
{
u32 blocksize;
if (node->level == 0 ||
in_block_group(node->bytenr, rc->block_group)) {
blocksize = btrfs_level_size(rc->extent_root, node->level);
- set_extent_bits(&rc->processed_blocks, node->bytenr,
- node->bytenr + blocksize - 1, EXTENT_DIRTY,
- GFP_NOFS);
+ mark_block_processed(rc, node->bytenr, blocksize);
}
node->processed = 1;
}
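/*
 * A sketch of the processed-blocks bookkeeping, modeled with a flat
 * byte-range list instead of the kernel's extent_io_tree: a block is
 * recorded once relocated, and tree_block_processed()-style queries just
 * test for containment. Purely illustrative.
 */
#include <stdio.h>

struct toy_range { unsigned long long start, end; };	/* inclusive */
static struct toy_range done[64];
static int done_nr;

static void toy_mark_processed(unsigned long long bytenr, unsigned int size)
{
	done[done_nr].start = bytenr;
	done[done_nr].end = bytenr + size - 1;
	done_nr++;
}

static int toy_block_processed(unsigned long long bytenr, unsigned int size)
{
	int i;

	for (i = 0; i < done_nr; i++)
		if (bytenr >= done[i].start && bytenr + size - 1 <= done[i].end)
			return 1;
	return 0;
}

int main(void)
{
	toy_mark_processed(16384, 4096);
	printf("%d %d\n", toy_block_processed(16384, 4096),
	       toy_block_processed(32768, 4096));	/* prints "1 0" */
	return 0;
}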
@@ -2179,7 +2636,7 @@ static void update_processed_blocks(struct reloc_control *rc,
if (next->processed)
break;
- mark_block_processed(rc, next);
+ __mark_block_processed(rc, next);
if (list_empty(&next->upper))
break;
@@ -2202,138 +2659,6 @@ static int tree_block_processed(u64 bytenr, u32 blocksize,
return 0;
}
-/*
- * check if there are any file extent pointers in the leaf point to
- * data require processing
- */
-static int check_file_extents(struct reloc_control *rc,
- u64 bytenr, u32 blocksize, u64 ptr_gen)
-{
- struct btrfs_key found_key;
- struct btrfs_file_extent_item *fi;
- struct extent_buffer *leaf;
- u32 nritems;
- int i;
- int ret = 0;
-
- leaf = read_tree_block(rc->extent_root, bytenr, blocksize, ptr_gen);
-
- nritems = btrfs_header_nritems(leaf);
- for (i = 0; i < nritems; i++) {
- cond_resched();
- btrfs_item_key_to_cpu(leaf, &found_key, i);
- if (found_key.type != BTRFS_EXTENT_DATA_KEY)
- continue;
- fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
- if (btrfs_file_extent_type(leaf, fi) ==
- BTRFS_FILE_EXTENT_INLINE)
- continue;
- bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
- if (bytenr == 0)
- continue;
- if (in_block_group(bytenr, rc->block_group)) {
- ret = 1;
- break;
- }
- }
- free_extent_buffer(leaf);
- return ret;
-}
-
-/*
- * scan child blocks of a given block to find blocks require processing
- */
-static int add_child_blocks(struct btrfs_trans_handle *trans,
- struct reloc_control *rc,
- struct backref_node *node,
- struct rb_root *blocks)
-{
- struct tree_block *block;
- struct rb_node *rb_node;
- u64 bytenr;
- u64 ptr_gen;
- u32 blocksize;
- u32 nritems;
- int i;
- int err = 0;
-
- nritems = btrfs_header_nritems(node->eb);
- blocksize = btrfs_level_size(rc->extent_root, node->level - 1);
- for (i = 0; i < nritems; i++) {
- cond_resched();
- bytenr = btrfs_node_blockptr(node->eb, i);
- ptr_gen = btrfs_node_ptr_generation(node->eb, i);
- if (ptr_gen == trans->transid)
- continue;
- if (!in_block_group(bytenr, rc->block_group) &&
- (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS))
- continue;
- if (tree_block_processed(bytenr, blocksize, rc))
- continue;
-
- readahead_tree_block(rc->extent_root,
- bytenr, blocksize, ptr_gen);
- }
-
- for (i = 0; i < nritems; i++) {
- cond_resched();
- bytenr = btrfs_node_blockptr(node->eb, i);
- ptr_gen = btrfs_node_ptr_generation(node->eb, i);
- if (ptr_gen == trans->transid)
- continue;
- if (!in_block_group(bytenr, rc->block_group) &&
- (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS))
- continue;
- if (tree_block_processed(bytenr, blocksize, rc))
- continue;
- if (!in_block_group(bytenr, rc->block_group) &&
- !check_file_extents(rc, bytenr, blocksize, ptr_gen))
- continue;
-
- block = kmalloc(sizeof(*block), GFP_NOFS);
- if (!block) {
- err = -ENOMEM;
- break;
- }
- block->bytenr = bytenr;
- btrfs_node_key_to_cpu(node->eb, &block->key, i);
- block->level = node->level - 1;
- block->key_ready = 1;
- rb_node = tree_insert(blocks, block->bytenr, &block->rb_node);
- BUG_ON(rb_node);
- }
- if (err)
- free_block_list(blocks);
- return err;
-}
-
-/*
- * find adjacent blocks require processing
- */
-static noinline_for_stack
-int add_adjacent_blocks(struct btrfs_trans_handle *trans,
- struct reloc_control *rc,
- struct backref_cache *cache,
- struct rb_root *blocks, int level,
- struct backref_node **upper)
-{
- struct backref_node *node;
- int ret = 0;
-
- WARN_ON(!list_empty(&cache->pending[level]));
-
- if (list_empty(&cache->pending[level + 1]))
- return 1;
-
- node = list_entry(cache->pending[level + 1].next,
- struct backref_node, lower);
- if (node->eb)
- ret = add_child_blocks(trans, rc, node, blocks);
-
- *upper = node;
- return ret;
-}
-
static int get_tree_block_key(struct reloc_control *rc,
struct tree_block *block)
{
@@ -2371,40 +2696,53 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_path *path)
{
struct btrfs_root *root;
- int ret;
+ int release = 0;
+ int ret = 0;
+ if (!node)
+ return 0;
+
+ BUG_ON(node->processed);
root = select_one_root(trans, node);
- if (unlikely(!root)) {
- rc->found_old_snapshot = 1;
+ if (root == ERR_PTR(-ENOENT)) {
update_processed_blocks(rc, node);
- return 0;
+ goto out;
}
- if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
- ret = do_relocation(trans, node, key, path, 1);
- if (ret < 0)
- goto out;
- if (node->level == 0 && rc->stage == UPDATE_DATA_PTRS) {
- ret = replace_file_extents(trans, rc, root,
- node->eb, NULL);
- if (ret < 0)
- goto out;
- }
- drop_node_buffer(node);
- } else if (!root->ref_cows) {
- path->lowest_level = node->level;
- ret = btrfs_search_slot(trans, root, key, path, 0, 1);
- btrfs_release_path(root, path);
- if (ret < 0)
+ if (!root || root->ref_cows) {
+ ret = reserve_metadata_space(trans, rc, node);
+ if (ret)
goto out;
- } else if (root != node->root) {
- WARN_ON(node->level > 0 || rc->stage != UPDATE_DATA_PTRS);
+ release = 1;
}
- update_processed_blocks(rc, node);
- ret = 0;
+ if (root) {
+ if (root->ref_cows) {
+ BUG_ON(node->new_bytenr);
+ BUG_ON(!list_empty(&node->list));
+ btrfs_record_root_in_trans(trans, root);
+ root = root->reloc_root;
+ node->new_bytenr = root->node->start;
+ node->root = root;
+ list_add_tail(&node->list, &rc->backref_cache.changed);
+ } else {
+ path->lowest_level = node->level;
+ ret = btrfs_search_slot(trans, root, key, path, 0, 1);
+ btrfs_release_path(root, path);
+ if (ret > 0)
+ ret = 0;
+ }
+ if (!ret)
+ update_processed_blocks(rc, node);
+ } else {
+ ret = do_relocation(trans, rc, node, key, path, 1);
+ }
out:
- drop_node_buffer(node);
+ if (ret || node->level == 0 || node->cowonly) {
+ if (release)
+ release_metadata_space(rc, node);
+ remove_backref_node(&rc->backref_cache, node);
+ }
return ret;
}
@@ -2415,12 +2753,10 @@ static noinline_for_stack
int relocate_tree_blocks(struct btrfs_trans_handle *trans,
struct reloc_control *rc, struct rb_root *blocks)
{
- struct backref_cache *cache;
struct backref_node *node;
struct btrfs_path *path;
struct tree_block *block;
struct rb_node *rb_node;
- int level = -1;
int ret;
int err = 0;
@@ -2428,21 +2764,9 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
- cache = kmalloc(sizeof(*cache), GFP_NOFS);
- if (!cache) {
- btrfs_free_path(path);
- return -ENOMEM;
- }
-
- backref_cache_init(cache);
-
rb_node = rb_first(blocks);
while (rb_node) {
block = rb_entry(rb_node, struct tree_block, rb_node);
- if (level == -1)
- level = block->level;
- else
- BUG_ON(level != block->level);
if (!block->key_ready)
reada_tree_block(rc, block);
rb_node = rb_next(rb_node);
@@ -2460,7 +2784,7 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
while (rb_node) {
block = rb_entry(rb_node, struct tree_block, rb_node);
- node = build_backref_tree(rc, cache, &block->key,
+ node = build_backref_tree(rc, &block->key,
block->level, block->bytenr);
if (IS_ERR(node)) {
err = PTR_ERR(node);
@@ -2470,79 +2794,62 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
ret = relocate_tree_block(trans, rc, node, &block->key,
path);
if (ret < 0) {
- err = ret;
+ if (ret != -EAGAIN || rb_node == rb_first(blocks))
+ err = ret;
goto out;
}
- remove_backref_node(cache, node);
rb_node = rb_next(rb_node);
}
-
- if (level > 0)
- goto out;
-
+out:
free_block_list(blocks);
+ err = finish_pending_nodes(trans, rc, path, err);
- /*
- * now backrefs of some upper level tree blocks have been cached,
- * try relocating blocks referenced by these upper level blocks.
- */
- while (1) {
- struct backref_node *upper = NULL;
- if (trans->transaction->in_commit ||
- trans->transaction->delayed_refs.flushing)
- break;
+ btrfs_free_path(path);
+ return err;
+}
- ret = add_adjacent_blocks(trans, rc, cache, blocks, level,
- &upper);
- if (ret < 0)
- err = ret;
- if (ret != 0)
- break;
+static noinline_for_stack
+int prealloc_file_extent_cluster(struct inode *inode,
+ struct file_extent_cluster *cluster)
+{
+ u64 alloc_hint = 0;
+ u64 start;
+ u64 end;
+ u64 offset = BTRFS_I(inode)->index_cnt;
+ u64 num_bytes;
+ int nr = 0;
+ int ret = 0;
- rb_node = rb_first(blocks);
- while (rb_node) {
- block = rb_entry(rb_node, struct tree_block, rb_node);
- if (trans->transaction->in_commit ||
- trans->transaction->delayed_refs.flushing)
- goto out;
- BUG_ON(!block->key_ready);
- node = build_backref_tree(rc, cache, &block->key,
- level, block->bytenr);
- if (IS_ERR(node)) {
- err = PTR_ERR(node);
- goto out;
- }
+ BUG_ON(cluster->start != cluster->boundary[0]);
+ mutex_lock(&inode->i_mutex);
- ret = relocate_tree_block(trans, rc, node,
- &block->key, path);
- if (ret < 0) {
- err = ret;
- goto out;
- }
- remove_backref_node(cache, node);
- rb_node = rb_next(rb_node);
- }
- free_block_list(blocks);
+ ret = btrfs_check_data_free_space(inode, cluster->end +
+ 1 - cluster->start);
+ if (ret)
+ goto out;
- if (upper) {
- ret = link_to_upper(trans, upper, path);
- if (ret < 0) {
- err = ret;
- break;
- }
- remove_backref_node(cache, upper);
- }
+ while (nr < cluster->nr) {
+ start = cluster->boundary[nr] - offset;
+ if (nr + 1 < cluster->nr)
+ end = cluster->boundary[nr + 1] - 1 - offset;
+ else
+ end = cluster->end - offset;
+
+ lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
+ num_bytes = end + 1 - start;
+ ret = btrfs_prealloc_file_range(inode, 0, start,
+ num_bytes, num_bytes,
+ end + 1, &alloc_hint);
+ unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
+ if (ret)
+ break;
+ nr++;
}
+ btrfs_free_reserved_data_space(inode, cluster->end +
+ 1 - cluster->start);
out:
- free_block_list(blocks);
-
- ret = finish_pending_nodes(trans, cache, path);
- if (ret < 0)
- err = ret;
-
- kfree(cache);
- btrfs_free_path(path);
- return err;
+ mutex_unlock(&inode->i_mutex);
+ return ret;
}
static noinline_for_stack
@@ -2588,7 +2895,6 @@ static int relocate_file_extent_cluster(struct inode *inode,
u64 offset = BTRFS_I(inode)->index_cnt;
unsigned long index;
unsigned long last_index;
- unsigned int dirty_page = 0;
struct page *page;
struct file_ra_state *ra;
int nr = 0;
@@ -2601,21 +2907,24 @@ static int relocate_file_extent_cluster(struct inode *inode,
if (!ra)
return -ENOMEM;
- index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
- last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
+ ret = prealloc_file_extent_cluster(inode, cluster);
+ if (ret)
+ goto out;
- mutex_lock(&inode->i_mutex);
+ file_ra_state_init(ra, inode->i_mapping);
- i_size_write(inode, cluster->end + 1 - offset);
ret = setup_extent_mapping(inode, cluster->start - offset,
cluster->end - offset, cluster->start);
if (ret)
- goto out_unlock;
-
- file_ra_state_init(ra, inode->i_mapping);
+ goto out;
- WARN_ON(cluster->start != cluster->boundary[0]);
+ index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
+ last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
while (index <= last_index) {
+ ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE);
+ if (ret)
+ goto out;
+
page = find_lock_page(inode->i_mapping, index);
if (!page) {
page_cache_sync_readahead(inode->i_mapping,
@@ -2623,8 +2932,10 @@ static int relocate_file_extent_cluster(struct inode *inode,
last_index + 1 - index);
page = grab_cache_page(inode->i_mapping, index);
if (!page) {
+ btrfs_delalloc_release_metadata(inode,
+ PAGE_CACHE_SIZE);
ret = -ENOMEM;
- goto out_unlock;
+ goto out;
}
}
@@ -2640,8 +2951,10 @@ static int relocate_file_extent_cluster(struct inode *inode,
if (!PageUptodate(page)) {
unlock_page(page);
page_cache_release(page);
+ btrfs_delalloc_release_metadata(inode,
+ PAGE_CACHE_SIZE);
ret = -EIO;
- goto out_unlock;
+ goto out;
}
}
@@ -2660,10 +2973,9 @@ static int relocate_file_extent_cluster(struct inode *inode,
EXTENT_BOUNDARY, GFP_NOFS);
nr++;
}
- btrfs_set_extent_delalloc(inode, page_start, page_end, NULL);
+ btrfs_set_extent_delalloc(inode, page_start, page_end, NULL);
set_page_dirty(page);
- dirty_page++;
unlock_extent(&BTRFS_I(inode)->io_tree,
page_start, page_end, GFP_NOFS);
@@ -2671,20 +2983,11 @@ static int relocate_file_extent_cluster(struct inode *inode,
page_cache_release(page);
index++;
- if (nr < cluster->nr &&
- page_end + 1 + offset == cluster->boundary[nr]) {
- balance_dirty_pages_ratelimited_nr(inode->i_mapping,
- dirty_page);
- dirty_page = 0;
- }
- }
- if (dirty_page) {
- balance_dirty_pages_ratelimited_nr(inode->i_mapping,
- dirty_page);
+ balance_dirty_pages_ratelimited(inode->i_mapping);
+ btrfs_throttle(BTRFS_I(inode)->root);
}
WARN_ON(nr != cluster->nr);
-out_unlock:
- mutex_unlock(&inode->i_mutex);
+out:
kfree(ra);
return ret;
}
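/*
 * A sketch of the per-page reserve/undo pattern in the loop above:
 * metadata is reserved before each page is grabbed, and released again on
 * every early-exit path so the reservation never leaks; once the page is
 * dirtied, delalloc owns it. Toy counters model the reservation pool.
 */
#include <stdio.h>

static long reserved;

static int toy_reserve(long bytes) { reserved += bytes; return 0; }
static void toy_release(long bytes) { reserved -= bytes; }

/* returns 0 on success; on failure the reservation is already undone */
static int toy_process_page(int page_ok)
{
	if (toy_reserve(4096))
		return -1;
	if (!page_ok) {			/* e.g. grab_cache_page() failed */
		toy_release(4096);	/* undo before bailing out */
		return -1;
	}
	/* ... dirty the page; delalloc now owns the reservation ... */
	return 0;
}

int main(void)
{
	toy_process_page(1);
	toy_process_page(0);
	printf("outstanding reservation: %ld\n", reserved);	/* 4096 */
	return 0;
}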
@@ -2870,9 +3173,6 @@ out:
static int block_use_full_backref(struct reloc_control *rc,
struct extent_buffer *eb)
{
- struct btrfs_path *path;
- struct btrfs_extent_item *ei;
- struct btrfs_key key;
u64 flags;
int ret;
@@ -2880,28 +3180,14 @@ static int block_use_full_backref(struct reloc_control *rc,
btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV)
return 1;
- path = btrfs_alloc_path();
- BUG_ON(!path);
-
- key.objectid = eb->start;
- key.type = BTRFS_EXTENT_ITEM_KEY;
- key.offset = eb->len;
-
- path->search_commit_root = 1;
- path->skip_locking = 1;
- ret = btrfs_search_slot(NULL, rc->extent_root,
- &key, path, 0, 0);
+ ret = btrfs_lookup_extent_info(NULL, rc->extent_root,
+ eb->start, eb->len, NULL, &flags);
BUG_ON(ret);
- ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
- struct btrfs_extent_item);
- flags = btrfs_extent_flags(path->nodes[0], ei);
- BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
ret = 1;
else
ret = 0;
- btrfs_free_path(path);
return ret;
}
@@ -3074,22 +3360,10 @@ int add_data_references(struct reloc_control *rc,
struct btrfs_extent_inline_ref *iref;
unsigned long ptr;
unsigned long end;
- u32 blocksize;
+ u32 blocksize = btrfs_level_size(rc->extent_root, 0);
int ret;
int err = 0;
- ret = get_new_location(rc->data_inode, NULL, extent_key->objectid,
- extent_key->offset);
- BUG_ON(ret < 0);
- if (ret > 0) {
- /* the relocated data is fragmented */
- rc->extents_skipped++;
- btrfs_release_path(rc->extent_root, path);
- return 0;
- }
-
- blocksize = btrfs_level_size(rc->extent_root, 0);
-
eb = path->nodes[0];
ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
end = ptr + btrfs_item_size_nr(eb, path->slots[0]);
@@ -3170,7 +3444,8 @@ int add_data_references(struct reloc_control *rc,
*/
static noinline_for_stack
int find_next_extent(struct btrfs_trans_handle *trans,
- struct reloc_control *rc, struct btrfs_path *path)
+ struct reloc_control *rc, struct btrfs_path *path,
+ struct btrfs_key *extent_key)
{
struct btrfs_key key;
struct extent_buffer *leaf;
@@ -3225,6 +3500,7 @@ next:
rc->search_start = end + 1;
} else {
rc->search_start = key.objectid + key.offset;
+ memcpy(extent_key, &key, sizeof(key));
return 0;
}
}
@@ -3262,12 +3538,49 @@ static int check_extent_flags(u64 flags)
return 0;
}
+static noinline_for_stack
+int prepare_to_relocate(struct reloc_control *rc)
+{
+ struct btrfs_trans_handle *trans;
+ int ret;
+
+ rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root);
+ if (!rc->block_rsv)
+ return -ENOMEM;
+
+ /*
+ * reserve some space for creating reloc trees.
+	 * btrfs_init_reloc_root will use it when there
+	 * is no reservation in the transaction handle.
+ */
+ ret = btrfs_block_rsv_add(NULL, rc->extent_root, rc->block_rsv,
+ rc->extent_root->nodesize * 256,
+ &rc->block_rsv_retries);
+ if (ret)
+ return ret;
+
+ rc->block_rsv->refill_used = 1;
+ btrfs_add_durable_block_rsv(rc->extent_root->fs_info, rc->block_rsv);
+
+ memset(&rc->cluster, 0, sizeof(rc->cluster));
+ rc->search_start = rc->block_group->key.objectid;
+ rc->extents_found = 0;
+ rc->nodes_relocated = 0;
+ rc->merging_rsv_size = 0;
+ rc->block_rsv_retries = 0;
+
+ rc->create_reloc_tree = 1;
+ set_reloc_control(rc);
+
+ trans = btrfs_join_transaction(rc->extent_root, 1);
+ btrfs_commit_transaction(trans, rc->extent_root);
+ return 0;
+}
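/*
 * Illustrative note, not part of the patch: the up-front reservation above
 * is sized at nodesize * 256, so with a hypothetical 4 KiB nodesize it
 * sets aside a 1 MiB cushion for creating reloc trees before any
 * per-block reservations have been taken.
 */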
static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
{
struct rb_root blocks = RB_ROOT;
struct btrfs_key key;
- struct file_extent_cluster *cluster;
struct btrfs_trans_handle *trans = NULL;
struct btrfs_path *path;
struct btrfs_extent_item *ei;
@@ -3277,33 +3590,25 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
int ret;
int err = 0;
- cluster = kzalloc(sizeof(*cluster), GFP_NOFS);
- if (!cluster)
- return -ENOMEM;
-
path = btrfs_alloc_path();
- if (!path) {
- kfree(cluster);
+ if (!path)
return -ENOMEM;
- }
-
- rc->extents_found = 0;
- rc->extents_skipped = 0;
-
- rc->search_start = rc->block_group->key.objectid;
- clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
- GFP_NOFS);
-
- rc->create_reloc_root = 1;
- set_reloc_control(rc);
- trans = btrfs_start_transaction(rc->extent_root, 1);
- btrfs_commit_transaction(trans, rc->extent_root);
+ ret = prepare_to_relocate(rc);
+ if (ret) {
+ err = ret;
+ goto out_free;
+ }
while (1) {
- trans = btrfs_start_transaction(rc->extent_root, 1);
+ trans = btrfs_start_transaction(rc->extent_root, 0);
+
+ if (update_backref_cache(trans, &rc->backref_cache)) {
+ btrfs_end_transaction(trans, rc->extent_root);
+ continue;
+ }
- ret = find_next_extent(trans, rc, path);
+ ret = find_next_extent(trans, rc, path, &key);
if (ret < 0)
err = ret;
if (ret != 0)
@@ -3313,9 +3618,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_extent_item);
- btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
- item_size = btrfs_item_size_nr(path->nodes[0],
- path->slots[0]);
+ item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
if (item_size >= sizeof(*ei)) {
flags = btrfs_extent_flags(path->nodes[0], ei);
ret = check_extent_flags(flags);
@@ -3356,73 +3659,100 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
ret = add_tree_block(rc, &key, path, &blocks);
} else if (rc->stage == UPDATE_DATA_PTRS &&
- (flags & BTRFS_EXTENT_FLAG_DATA)) {
+ (flags & BTRFS_EXTENT_FLAG_DATA)) {
ret = add_data_references(rc, &key, path, &blocks);
} else {
btrfs_release_path(rc->extent_root, path);
ret = 0;
}
if (ret < 0) {
- err = 0;
+ err = ret;
break;
}
if (!RB_EMPTY_ROOT(&blocks)) {
ret = relocate_tree_blocks(trans, rc, &blocks);
if (ret < 0) {
+ if (ret != -EAGAIN) {
+ err = ret;
+ break;
+ }
+ rc->extents_found--;
+ rc->search_start = key.objectid;
+ }
+ }
+
+ ret = btrfs_block_rsv_check(trans, rc->extent_root,
+ rc->block_rsv, 0, 5);
+ if (ret < 0) {
+ if (ret != -EAGAIN) {
err = ret;
+ WARN_ON(1);
break;
}
+ rc->commit_transaction = 1;
}
- nr = trans->blocks_used;
- btrfs_end_transaction(trans, rc->extent_root);
+ if (rc->commit_transaction) {
+ rc->commit_transaction = 0;
+ ret = btrfs_commit_transaction(trans, rc->extent_root);
+ BUG_ON(ret);
+ } else {
+ nr = trans->blocks_used;
+ btrfs_end_transaction_throttle(trans, rc->extent_root);
+ btrfs_btree_balance_dirty(rc->extent_root, nr);
+ }
trans = NULL;
- btrfs_btree_balance_dirty(rc->extent_root, nr);
if (rc->stage == MOVE_DATA_EXTENTS &&
(flags & BTRFS_EXTENT_FLAG_DATA)) {
rc->found_file_extent = 1;
ret = relocate_data_extent(rc->data_inode,
- &key, cluster);
+ &key, &rc->cluster);
if (ret < 0) {
err = ret;
break;
}
}
}
- btrfs_free_path(path);
+
+ btrfs_release_path(rc->extent_root, path);
+ clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
+ GFP_NOFS);
if (trans) {
nr = trans->blocks_used;
- btrfs_end_transaction(trans, rc->extent_root);
+ btrfs_end_transaction_throttle(trans, rc->extent_root);
btrfs_btree_balance_dirty(rc->extent_root, nr);
}
if (!err) {
- ret = relocate_file_extent_cluster(rc->data_inode, cluster);
+ ret = relocate_file_extent_cluster(rc->data_inode,
+ &rc->cluster);
if (ret < 0)
err = ret;
}
- kfree(cluster);
+ rc->create_reloc_tree = 0;
+ set_reloc_control(rc);
- rc->create_reloc_root = 0;
- smp_mb();
+ backref_cache_cleanup(&rc->backref_cache);
+ btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1);
- if (rc->extents_found > 0) {
- trans = btrfs_start_transaction(rc->extent_root, 1);
- btrfs_commit_transaction(trans, rc->extent_root);
- }
+ err = prepare_to_merge(rc, err);
merge_reloc_roots(rc);
+ rc->merge_reloc_tree = 0;
unset_reloc_control(rc);
+ btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1);
/* get rid of pinned extents */
- trans = btrfs_start_transaction(rc->extent_root, 1);
+ trans = btrfs_join_transaction(rc->extent_root, 1);
btrfs_commit_transaction(trans, rc->extent_root);
-
+out_free:
+ btrfs_free_block_rsv(rc->extent_root, rc->block_rsv);
+ btrfs_free_path(path);
return err;
}
@@ -3448,7 +3778,8 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
btrfs_set_inode_generation(leaf, item, 1);
btrfs_set_inode_size(leaf, item, 0);
btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
- btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS);
+ btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
+ BTRFS_INODE_PREALLOC);
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(root, path);
out:
@@ -3460,8 +3791,9 @@ out:
* helper to create inode for data relocation.
* the inode is in data relocation tree and its link count is 0
*/
-static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
- struct btrfs_block_group_cache *group)
+static noinline_for_stack
+struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *group)
{
struct inode *inode = NULL;
struct btrfs_trans_handle *trans;
@@ -3475,8 +3807,9 @@ static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
if (IS_ERR(root))
return ERR_CAST(root);
- trans = btrfs_start_transaction(root, 1);
- BUG_ON(!trans);
+ trans = btrfs_start_transaction(root, 6);
+ if (IS_ERR(trans))
+ return ERR_CAST(trans);
err = btrfs_find_free_objectid(trans, root, objectid, &objectid);
if (err)
@@ -3496,7 +3829,6 @@ static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
out:
nr = trans->blocks_used;
btrfs_end_transaction(trans, root);
-
btrfs_btree_balance_dirty(root, nr);
if (err) {
if (inode)
@@ -3506,6 +3838,21 @@ out:
return inode;
}
+static struct reloc_control *alloc_reloc_control(void)
+{
+ struct reloc_control *rc;
+
+ rc = kzalloc(sizeof(*rc), GFP_NOFS);
+ if (!rc)
+ return NULL;
+
+ INIT_LIST_HEAD(&rc->reloc_roots);
+ backref_cache_init(&rc->backref_cache);
+ mapping_tree_init(&rc->reloc_root_tree);
+ extent_io_tree_init(&rc->processed_blocks, NULL, GFP_NOFS);
+ return rc;
+}
+
/*
* function to relocate all extents in a block group.
*/
@@ -3514,24 +3861,26 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
struct btrfs_fs_info *fs_info = extent_root->fs_info;
struct reloc_control *rc;
int ret;
+ int rw = 0;
int err = 0;
- rc = kzalloc(sizeof(*rc), GFP_NOFS);
+ rc = alloc_reloc_control();
if (!rc)
return -ENOMEM;
- mapping_tree_init(&rc->reloc_root_tree);
- extent_io_tree_init(&rc->processed_blocks, NULL, GFP_NOFS);
- INIT_LIST_HEAD(&rc->reloc_roots);
+ rc->extent_root = extent_root;
rc->block_group = btrfs_lookup_block_group(fs_info, group_start);
BUG_ON(!rc->block_group);
- btrfs_init_workers(&rc->workers, "relocate",
- fs_info->thread_pool_size, NULL);
-
- rc->extent_root = extent_root;
- btrfs_prepare_block_group_relocation(extent_root, rc->block_group);
+ if (!rc->block_group->ro) {
+ ret = btrfs_set_block_group_ro(extent_root, rc->block_group);
+ if (ret) {
+ err = ret;
+ goto out;
+ }
+ rw = 1;
+ }
rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
if (IS_ERR(rc->data_inode)) {
@@ -3548,9 +3897,6 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
btrfs_wait_ordered_extents(fs_info->tree_root, 0, 0);
while (1) {
- rc->extents_found = 0;
- rc->extents_skipped = 0;
-
mutex_lock(&fs_info->cleaner_mutex);
btrfs_clean_old_snapshots(fs_info->tree_root);
@@ -3559,7 +3905,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
mutex_unlock(&fs_info->cleaner_mutex);
if (ret < 0) {
err = ret;
- break;
+ goto out;
}
if (rc->extents_found == 0)
@@ -3573,18 +3919,6 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
invalidate_mapping_pages(rc->data_inode->i_mapping,
0, -1);
rc->stage = UPDATE_DATA_PTRS;
- } else if (rc->stage == UPDATE_DATA_PTRS &&
- rc->extents_skipped >= rc->extents_found) {
- iput(rc->data_inode);
- rc->data_inode = create_reloc_inode(fs_info,
- rc->block_group);
- if (IS_ERR(rc->data_inode)) {
- err = PTR_ERR(rc->data_inode);
- rc->data_inode = NULL;
- break;
- }
- rc->stage = MOVE_DATA_EXTENTS;
- rc->found_file_extent = 0;
}
}
@@ -3597,8 +3931,9 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
WARN_ON(rc->block_group->reserved > 0);
WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0);
out:
+ if (err && rw)
+ btrfs_set_block_group_rw(extent_root, rc->block_group);
iput(rc->data_inode);
- btrfs_stop_workers(&rc->workers);
btrfs_put_block_group(rc->block_group);
kfree(rc);
return err;
@@ -3609,7 +3944,7 @@ static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
struct btrfs_trans_handle *trans;
int ret;
- trans = btrfs_start_transaction(root->fs_info->tree_root, 1);
+ trans = btrfs_start_transaction(root->fs_info->tree_root, 0);
memset(&root->root_item.drop_progress, 0,
sizeof(root->root_item.drop_progress));
@@ -3702,20 +4037,20 @@ int btrfs_recover_relocation(struct btrfs_root *root)
if (list_empty(&reloc_roots))
goto out;
- rc = kzalloc(sizeof(*rc), GFP_NOFS);
+ rc = alloc_reloc_control();
if (!rc) {
err = -ENOMEM;
goto out;
}
- mapping_tree_init(&rc->reloc_root_tree);
- INIT_LIST_HEAD(&rc->reloc_roots);
- btrfs_init_workers(&rc->workers, "relocate",
- root->fs_info->thread_pool_size, NULL);
rc->extent_root = root->fs_info->extent_root;
set_reloc_control(rc);
+ trans = btrfs_join_transaction(rc->extent_root, 1);
+
+ rc->merge_reloc_tree = 1;
+
while (!list_empty(&reloc_roots)) {
reloc_root = list_entry(reloc_roots.next,
struct btrfs_root, root_list);
@@ -3735,20 +4070,16 @@ int btrfs_recover_relocation(struct btrfs_root *root)
fs_root->reloc_root = reloc_root;
}
- trans = btrfs_start_transaction(rc->extent_root, 1);
btrfs_commit_transaction(trans, rc->extent_root);
merge_reloc_roots(rc);
unset_reloc_control(rc);
- trans = btrfs_start_transaction(rc->extent_root, 1);
+ trans = btrfs_join_transaction(rc->extent_root, 1);
btrfs_commit_transaction(trans, rc->extent_root);
out:
- if (rc) {
- btrfs_stop_workers(&rc->workers);
- kfree(rc);
- }
+ kfree(rc);
while (!list_empty(&reloc_roots)) {
reloc_root = list_entry(reloc_roots.next,
struct btrfs_root, root_list);
@@ -3814,3 +4145,130 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
btrfs_put_ordered_extent(ordered);
return 0;
}
+
+void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct extent_buffer *buf,
+ struct extent_buffer *cow)
+{
+ struct reloc_control *rc;
+ struct backref_node *node;
+ int first_cow = 0;
+ int level;
+ int ret;
+
+ rc = root->fs_info->reloc_ctl;
+ if (!rc)
+ return;
+
+ BUG_ON(rc->stage == UPDATE_DATA_PTRS &&
+ root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID);
+
+ level = btrfs_header_level(buf);
+ if (btrfs_header_generation(buf) <=
+ btrfs_root_last_snapshot(&root->root_item))
+ first_cow = 1;
+
+ if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID &&
+ rc->create_reloc_tree) {
+ WARN_ON(!first_cow && level == 0);
+
+ node = rc->backref_cache.path[level];
+ BUG_ON(node->bytenr != buf->start &&
+ node->new_bytenr != buf->start);
+
+ drop_node_buffer(node);
+ extent_buffer_get(cow);
+ node->eb = cow;
+ node->new_bytenr = cow->start;
+
+ if (!node->pending) {
+ list_move_tail(&node->list,
+ &rc->backref_cache.pending[level]);
+ node->pending = 1;
+ }
+
+ if (first_cow)
+ __mark_block_processed(rc, node);
+
+ if (first_cow && level > 0)
+ rc->nodes_relocated += buf->len;
+ }
+
+ if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS) {
+ ret = replace_file_extents(trans, rc, root, cow);
+ BUG_ON(ret);
+ }
+}
+
+/*
+ * called before creating a snapshot. it calculates metadata reservation
+ * required for relocating tree blocks in the snapshot
+ */
+void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans,
+ struct btrfs_pending_snapshot *pending,
+ u64 *bytes_to_reserve)
+{
+ struct btrfs_root *root;
+ struct reloc_control *rc;
+
+ root = pending->root;
+ if (!root->reloc_root)
+ return;
+
+ rc = root->fs_info->reloc_ctl;
+ if (!rc->merge_reloc_tree)
+ return;
+
+ root = root->reloc_root;
+ BUG_ON(btrfs_root_refs(&root->root_item) == 0);
+ /*
+ * relocation is in the stage of merging trees. the space
+ * used by merging a reloc tree is twice the size of
+ * relocated tree nodes in the worst case. half for cowing
+ * the reloc tree, half for cowing the fs tree. the space
+ * used by cowing the reloc tree will be freed after the
+ * tree is dropped. if we create a snapshot, cowing the fs
+ * tree may use more space than it frees. so we need to
+ * reserve extra space.
+ */
+ *bytes_to_reserve += rc->nodes_relocated;
+}
+
+/*
+ * called after snapshot is created. migrate block reservation
+ * and create reloc root for the newly created snapshot
+ */
+void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
+ struct btrfs_pending_snapshot *pending)
+{
+ struct btrfs_root *root = pending->root;
+ struct btrfs_root *reloc_root;
+ struct btrfs_root *new_root;
+ struct reloc_control *rc;
+ int ret;
+
+ if (!root->reloc_root)
+ return;
+
+ rc = root->fs_info->reloc_ctl;
+ rc->merging_rsv_size += rc->nodes_relocated;
+
+ if (rc->merge_reloc_tree) {
+ ret = btrfs_block_rsv_migrate(&pending->block_rsv,
+ rc->block_rsv,
+ rc->nodes_relocated);
+ BUG_ON(ret);
+ }
+
+ new_root = pending->snap;
+ reloc_root = create_reloc_root(trans, root->reloc_root,
+ new_root->root_key.objectid);
+
+ __add_reloc_root(reloc_root);
+ new_root->reloc_root = reloc_root;
+
+ if (rc->create_reloc_tree) {
+ ret = clone_backref_node(trans, rc, root, reloc_root);
+ BUG_ON(ret);
+ }
+}
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 67fa2d29d663..2d958be761c8 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -259,6 +259,8 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
struct extent_buffer *leaf;
struct btrfs_path *path;
struct btrfs_key key;
+ struct btrfs_key root_key;
+ struct btrfs_root *root;
int err = 0;
int ret;
@@ -270,6 +272,9 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
key.type = BTRFS_ORPHAN_ITEM_KEY;
key.offset = 0;
+ root_key.type = BTRFS_ROOT_ITEM_KEY;
+ root_key.offset = (u64)-1;
+
while (1) {
ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
if (ret < 0) {
@@ -294,13 +299,25 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
key.type != BTRFS_ORPHAN_ITEM_KEY)
break;
- ret = btrfs_find_dead_roots(tree_root, key.offset);
- if (ret) {
+ root_key.objectid = key.offset;
+ key.offset++;
+
+ root = btrfs_read_fs_root_no_name(tree_root->fs_info,
+ &root_key);
+ if (!IS_ERR(root))
+ continue;
+
+ ret = PTR_ERR(root);
+ if (ret != -ENOENT) {
err = ret;
break;
}
- key.offset++;
+ ret = btrfs_find_dead_roots(tree_root, root_key.objectid);
+ if (ret) {
+ err = ret;
+ break;
+ }
}
btrfs_free_path(path);
@@ -313,7 +330,6 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
{
struct btrfs_path *path;
int ret;
- u32 refs;
struct btrfs_root_item *ri;
struct extent_buffer *leaf;
@@ -327,8 +343,6 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
leaf = path->nodes[0];
ri = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_item);
- refs = btrfs_disk_root_refs(leaf, ri);
- BUG_ON(refs != 0);
ret = btrfs_del_item(trans, root, path);
out:
btrfs_free_path(path);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 2909a03e5230..f2393b390318 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -360,6 +360,8 @@ static struct dentry *get_default_root(struct super_block *sb,
*/
dir_id = btrfs_super_root_dir(&root->fs_info->super_copy);
di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0);
+ if (IS_ERR(di))
+ return ERR_CAST(di);
if (!di) {
/*
* Ok the default dir item isn't there. This is weird since
@@ -390,8 +392,8 @@ setup_root:
location.offset = 0;
inode = btrfs_iget(sb, &location, new_root, &new);
- if (!inode)
- return ERR_PTR(-ENOMEM);
+ if (IS_ERR(inode))
+ return ERR_CAST(inode);
/*
* If we're just mounting the root most subvol put the inode and return
@@ -498,7 +500,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
btrfs_start_delalloc_inodes(root, 0);
btrfs_wait_ordered_extents(root, 0, 0);
- trans = btrfs_start_transaction(root, 1);
+ trans = btrfs_start_transaction(root, 0);
ret = btrfs_commit_transaction(trans, root);
return ret;
}
@@ -694,11 +696,11 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
if (btrfs_super_log_root(&root->fs_info->super_copy) != 0)
return -EINVAL;
- /* recover relocation */
- ret = btrfs_recover_relocation(root);
+ ret = btrfs_cleanup_fs_roots(root->fs_info);
WARN_ON(ret);
- ret = btrfs_cleanup_fs_roots(root->fs_info);
+ /* recover relocation */
+ ret = btrfs_recover_relocation(root);
WARN_ON(ret);
sb->s_flags &= ~MS_RDONLY;
@@ -714,34 +716,18 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
struct list_head *head = &root->fs_info->space_info;
struct btrfs_space_info *found;
u64 total_used = 0;
- u64 data_used = 0;
int bits = dentry->d_sb->s_blocksize_bits;
__be32 *fsid = (__be32 *)root->fs_info->fsid;
rcu_read_lock();
- list_for_each_entry_rcu(found, head, list) {
- if (found->flags & (BTRFS_BLOCK_GROUP_DUP|
- BTRFS_BLOCK_GROUP_RAID10|
- BTRFS_BLOCK_GROUP_RAID1)) {
- total_used += found->bytes_used;
- if (found->flags & BTRFS_BLOCK_GROUP_DATA)
- data_used += found->bytes_used;
- else
- data_used += found->total_bytes;
- }
-
- total_used += found->bytes_used;
- if (found->flags & BTRFS_BLOCK_GROUP_DATA)
- data_used += found->bytes_used;
- else
- data_used += found->total_bytes;
- }
+ list_for_each_entry_rcu(found, head, list)
+ total_used += found->disk_used;
rcu_read_unlock();
buf->f_namelen = BTRFS_NAME_LEN;
buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits;
buf->f_bfree = buf->f_blocks - (total_used >> bits);
- buf->f_bavail = buf->f_blocks - (data_used >> bits);
+ buf->f_bavail = buf->f_bfree;
buf->f_bsize = dentry->d_sb->s_blocksize;
buf->f_type = BTRFS_SUPER_MAGIC;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 2cb116099b90..66e4c66cc63b 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -165,54 +165,89 @@ enum btrfs_trans_type {
TRANS_USERSPACE,
};
+static int may_wait_transaction(struct btrfs_root *root, int type)
+{
+ if (!root->fs_info->log_root_recovering &&
+ ((type == TRANS_START && !root->fs_info->open_ioctl_trans) ||
+ type == TRANS_USERSPACE))
+ return 1;
+ return 0;
+}
+
static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
- int num_blocks, int type)
+ u64 num_items, int type)
{
- struct btrfs_trans_handle *h =
- kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
+ struct btrfs_trans_handle *h;
+ struct btrfs_transaction *cur_trans;
+ int retries = 0;
int ret;
+again:
+ h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
+ if (!h)
+ return ERR_PTR(-ENOMEM);
mutex_lock(&root->fs_info->trans_mutex);
- if (!root->fs_info->log_root_recovering &&
- ((type == TRANS_START && !root->fs_info->open_ioctl_trans) ||
- type == TRANS_USERSPACE))
+ if (may_wait_transaction(root, type))
wait_current_trans(root);
+
ret = join_transaction(root);
BUG_ON(ret);
- h->transid = root->fs_info->running_transaction->transid;
- h->transaction = root->fs_info->running_transaction;
- h->blocks_reserved = num_blocks;
+ cur_trans = root->fs_info->running_transaction;
+ cur_trans->use_count++;
+ mutex_unlock(&root->fs_info->trans_mutex);
+
+ h->transid = cur_trans->transid;
+ h->transaction = cur_trans;
h->blocks_used = 0;
h->block_group = 0;
- h->alloc_exclude_nr = 0;
- h->alloc_exclude_start = 0;
+ h->bytes_reserved = 0;
h->delayed_ref_updates = 0;
+ h->block_rsv = NULL;
- if (!current->journal_info && type != TRANS_USERSPACE)
- current->journal_info = h;
+ smp_mb();
+ if (cur_trans->blocked && may_wait_transaction(root, type)) {
+ btrfs_commit_transaction(h, root);
+ goto again;
+ }
+
+ if (num_items > 0) {
+ ret = btrfs_trans_reserve_metadata(h, root, num_items,
+ &retries);
+ if (ret == -EAGAIN) {
+ btrfs_commit_transaction(h, root);
+ goto again;
+ }
+ if (ret < 0) {
+ btrfs_end_transaction(h, root);
+ return ERR_PTR(ret);
+ }
+ }
- root->fs_info->running_transaction->use_count++;
+ mutex_lock(&root->fs_info->trans_mutex);
record_root_in_trans(h, root);
mutex_unlock(&root->fs_info->trans_mutex);
+
+ if (!current->journal_info && type != TRANS_USERSPACE)
+ current->journal_info = h;
return h;
}
struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
- int num_blocks)
+ int num_items)
{
- return start_transaction(root, num_blocks, TRANS_START);
+ return start_transaction(root, num_items, TRANS_START);
}
struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
int num_blocks)
{
- return start_transaction(root, num_blocks, TRANS_JOIN);
+ return start_transaction(root, 0, TRANS_JOIN);
}
struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
int num_blocks)
{
- return start_transaction(r, num_blocks, TRANS_USERSPACE);
+ return start_transaction(r, 0, TRANS_USERSPACE);
}
/* wait for a transaction commit to be fully complete */
@@ -286,10 +321,36 @@ void btrfs_throttle(struct btrfs_root *root)
mutex_unlock(&root->fs_info->trans_mutex);
}
+static int should_end_transaction(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ int ret;
+ ret = btrfs_block_rsv_check(trans, root,
+ &root->fs_info->global_block_rsv, 0, 5);
+ return ret ? 1 : 0;
+}
+
+int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_transaction *cur_trans = trans->transaction;
+ int updates;
+
+ if (cur_trans->blocked || cur_trans->delayed_refs.flushing)
+ return 1;
+
+ updates = trans->delayed_ref_updates;
+ trans->delayed_ref_updates = 0;
+ if (updates)
+ btrfs_run_delayed_refs(trans, root, updates);
+
+ return should_end_transaction(trans, root);
+}
+
static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root, int throttle)
{
- struct btrfs_transaction *cur_trans;
+ struct btrfs_transaction *cur_trans = trans->transaction;
struct btrfs_fs_info *info = root->fs_info;
int count = 0;
@@ -313,9 +374,21 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
count++;
}
+ btrfs_trans_release_metadata(trans, root);
+
+ if (!root->fs_info->open_ioctl_trans &&
+ should_end_transaction(trans, root))
+ trans->transaction->blocked = 1;
+
+ if (cur_trans->blocked && !cur_trans->in_commit) {
+ if (throttle)
+ return btrfs_commit_transaction(trans, root);
+ else
+ wake_up_process(info->transaction_kthread);
+ }
+
mutex_lock(&info->trans_mutex);
- cur_trans = info->running_transaction;
- WARN_ON(cur_trans != trans->transaction);
+ WARN_ON(cur_trans != info->running_transaction);
WARN_ON(cur_trans->num_writers < 1);
cur_trans->num_writers--;
@@ -603,6 +676,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
btrfs_free_log(trans, root);
btrfs_update_reloc_root(trans, root);
+ btrfs_orphan_commit_root(trans, root);
if (root->commit_root != root->node) {
switch_commit_root(root);
@@ -627,30 +701,30 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
{
struct btrfs_fs_info *info = root->fs_info;
- int ret;
struct btrfs_trans_handle *trans;
+ int ret;
unsigned long nr;
- smp_mb();
- if (root->defrag_running)
+ if (xchg(&root->defrag_running, 1))
return 0;
- trans = btrfs_start_transaction(root, 1);
+
while (1) {
- root->defrag_running = 1;
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
ret = btrfs_defrag_leaves(trans, root, cacheonly);
+
nr = trans->blocks_used;
btrfs_end_transaction(trans, root);
btrfs_btree_balance_dirty(info->tree_root, nr);
cond_resched();
- trans = btrfs_start_transaction(root, 1);
if (root->fs_info->closing || ret != -EAGAIN)
break;
}
root->defrag_running = 0;
- smp_mb();
- btrfs_end_transaction(trans, root);
- return 0;
+ return ret;
}
#if 0
@@ -758,47 +832,63 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
struct btrfs_root *root = pending->root;
struct btrfs_root *parent_root;
struct inode *parent_inode;
+ struct dentry *dentry;
struct extent_buffer *tmp;
struct extent_buffer *old;
int ret;
- u64 objectid;
- int namelen;
+ int retries = 0;
+ u64 to_reserve = 0;
u64 index = 0;
-
- parent_inode = pending->dentry->d_parent->d_inode;
- parent_root = BTRFS_I(parent_inode)->root;
+ u64 objectid;
new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
if (!new_root_item) {
- ret = -ENOMEM;
+ pending->error = -ENOMEM;
goto fail;
}
+
ret = btrfs_find_free_objectid(trans, tree_root, 0, &objectid);
- if (ret)
+ if (ret) {
+ pending->error = ret;
goto fail;
+ }
+
+ btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
+ btrfs_orphan_pre_snapshot(trans, pending, &to_reserve);
+
+ if (to_reserve > 0) {
+ ret = btrfs_block_rsv_add(trans, root, &pending->block_rsv,
+ to_reserve, &retries);
+ if (ret) {
+ pending->error = ret;
+ goto fail;
+ }
+ }
key.objectid = objectid;
- /* record when the snapshot was created in key.offset */
- key.offset = trans->transid;
- btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+ key.offset = (u64)-1;
+ key.type = BTRFS_ROOT_ITEM_KEY;
- memcpy(&pending->root_key, &key, sizeof(key));
- pending->root_key.offset = (u64)-1;
+ trans->block_rsv = &pending->block_rsv;
+ dentry = pending->dentry;
+ parent_inode = dentry->d_parent->d_inode;
+ parent_root = BTRFS_I(parent_inode)->root;
record_root_in_trans(trans, parent_root);
+
/*
* insert the directory item
*/
- namelen = strlen(pending->name);
ret = btrfs_set_inode_index(parent_inode, &index);
BUG_ON(ret);
ret = btrfs_insert_dir_item(trans, parent_root,
- pending->name, namelen,
- parent_inode->i_ino,
- &pending->root_key, BTRFS_FT_DIR, index);
+ dentry->d_name.name, dentry->d_name.len,
+ parent_inode->i_ino, &key,
+ BTRFS_FT_DIR, index);
BUG_ON(ret);
- btrfs_i_size_write(parent_inode, parent_inode->i_size + namelen * 2);
+ btrfs_i_size_write(parent_inode, parent_inode->i_size +
+ dentry->d_name.len * 2);
ret = btrfs_update_inode(trans, parent_root, parent_inode);
BUG_ON(ret);
@@ -815,22 +905,32 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
free_extent_buffer(old);
btrfs_set_root_node(new_root_item, tmp);
- ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
- new_root_item);
- BUG_ON(ret);
+ /* record when the snapshot was created in key.offset */
+ key.offset = trans->transid;
+ ret = btrfs_insert_root(trans, tree_root, &key, new_root_item);
btrfs_tree_unlock(tmp);
free_extent_buffer(tmp);
+ BUG_ON(ret);
- ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root,
- pending->root_key.objectid,
+ /*
+ * insert root back/forward references
+ */
+ ret = btrfs_add_root_ref(trans, tree_root, objectid,
parent_root->root_key.objectid,
- parent_inode->i_ino, index, pending->name,
- namelen);
+ parent_inode->i_ino, index,
+ dentry->d_name.name, dentry->d_name.len);
BUG_ON(ret);
+ key.offset = (u64)-1;
+ pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key);
+ BUG_ON(IS_ERR(pending->snap));
+
+ btrfs_reloc_post_snapshot(trans, pending);
+ btrfs_orphan_post_snapshot(trans, pending);
fail:
kfree(new_root_item);
- return ret;
+ btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1);
+ return 0;
}
/*
@@ -878,6 +978,16 @@ int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
return ret;
}
+int btrfs_transaction_blocked(struct btrfs_fs_info *info)
+{
+ int ret = 0;
+ spin_lock(&info->new_trans_lock);
+ if (info->running_transaction)
+ ret = info->running_transaction->blocked;
+ spin_unlock(&info->new_trans_lock);
+ return ret;
+}
+
int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
@@ -899,6 +1009,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
ret = btrfs_run_delayed_refs(trans, root, 0);
BUG_ON(ret);
+ btrfs_trans_release_metadata(trans, root);
+
cur_trans = trans->transaction;
/*
* set the flushing flag so procs in this transaction have to
@@ -951,9 +1063,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
snap_pending = 1;
WARN_ON(cur_trans != trans->transaction);
- prepare_to_wait(&cur_trans->writer_wait, &wait,
- TASK_UNINTERRUPTIBLE);
-
if (cur_trans->num_writers > 1)
timeout = MAX_SCHEDULE_TIMEOUT;
else if (should_grow)
@@ -976,6 +1085,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
*/
btrfs_run_ordered_operations(root, 1);
+ prepare_to_wait(&cur_trans->writer_wait, &wait,
+ TASK_UNINTERRUPTIBLE);
+
smp_mb();
if (cur_trans->num_writers > 1 || should_grow)
schedule_timeout(timeout);
@@ -1103,9 +1215,9 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root)
if (btrfs_header_backref_rev(root->node) <
BTRFS_MIXED_BACKREF_REV)
- btrfs_drop_snapshot(root, 0);
+ btrfs_drop_snapshot(root, NULL, 0);
else
- btrfs_drop_snapshot(root, 1);
+ btrfs_drop_snapshot(root, NULL, 1);
}
return 0;
}
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 93c7ccb33118..e104986d0bfd 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -45,20 +45,23 @@ struct btrfs_transaction {
struct btrfs_trans_handle {
u64 transid;
+ u64 block_group;
+ u64 bytes_reserved;
unsigned long blocks_reserved;
unsigned long blocks_used;
- struct btrfs_transaction *transaction;
- u64 block_group;
- u64 alloc_exclude_start;
- u64 alloc_exclude_nr;
unsigned long delayed_ref_updates;
+ struct btrfs_transaction *transaction;
+ struct btrfs_block_rsv *block_rsv;
};
struct btrfs_pending_snapshot {
struct dentry *dentry;
struct btrfs_root *root;
- char *name;
- struct btrfs_key root_key;
+ struct btrfs_root *snap;
+ /* block reservation for the operation */
+ struct btrfs_block_rsv block_rsv;
+ /* extra metadata reservation for relocation */
+ int error;
struct list_head list;
};
@@ -85,11 +88,11 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
int btrfs_end_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
- int num_blocks);
+ int num_items);
struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
- int num_blocks);
+ int num_blocks);
struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
- int num_blocks);
+ int num_blocks);
int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
@@ -103,6 +106,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
+int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
void btrfs_throttle(struct btrfs_root *root);
int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
@@ -112,5 +117,6 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
struct extent_io_tree *dirty_pages, int mark);
int btrfs_wait_marked_extents(struct btrfs_root *root,
struct extent_io_tree *dirty_pages, int mark);
+int btrfs_transaction_blocked(struct btrfs_fs_info *info);
int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
#endif
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index b10eacdb1620..f7ac8e013ed7 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -117,13 +117,14 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
path->nodes[1], 0,
cache_only, &last_ret,
&root->defrag_progress);
- WARN_ON(ret && ret != -EAGAIN);
+ if (ret) {
+ WARN_ON(ret == -EAGAIN);
+ goto out;
+ }
if (next_key_ret == 0) {
memcpy(&root->defrag_progress, &key, sizeof(key));
ret = -EAGAIN;
}
-
- btrfs_release_path(root, path);
out:
if (path)
btrfs_free_path(path);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index af57dd2b43d4..fb102a9aee9c 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -135,6 +135,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
int ret;
+ int err = 0;
mutex_lock(&root->log_mutex);
if (root->log_root) {
@@ -155,17 +156,19 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
mutex_lock(&root->fs_info->tree_log_mutex);
if (!root->fs_info->log_root_tree) {
ret = btrfs_init_log_root_tree(trans, root->fs_info);
- BUG_ON(ret);
+ if (ret)
+ err = ret;
}
- if (!root->log_root) {
+ if (err == 0 && !root->log_root) {
ret = btrfs_add_log_tree(trans, root);
- BUG_ON(ret);
+ if (ret)
+ err = ret;
}
mutex_unlock(&root->fs_info->tree_log_mutex);
root->log_batch++;
atomic_inc(&root->log_writers);
mutex_unlock(&root->log_mutex);
- return 0;
+ return err;
}
/*
@@ -376,7 +379,7 @@ insert:
BUG_ON(ret);
}
} else if (ret) {
- BUG();
+ return ret;
}
dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
path->slots[0]);
@@ -1699,9 +1702,9 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
next = btrfs_find_create_tree_block(root, bytenr, blocksize);
- wc->process_func(root, next, wc, ptr_gen);
-
if (*level == 1) {
+ wc->process_func(root, next, wc, ptr_gen);
+
path->slots[*level]++;
if (wc->free) {
btrfs_read_buffer(next, ptr_gen);
@@ -1734,35 +1737,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
WARN_ON(*level < 0);
WARN_ON(*level >= BTRFS_MAX_LEVEL);
- if (path->nodes[*level] == root->node)
- parent = path->nodes[*level];
- else
- parent = path->nodes[*level + 1];
-
- bytenr = path->nodes[*level]->start;
-
- blocksize = btrfs_level_size(root, *level);
- root_owner = btrfs_header_owner(parent);
- root_gen = btrfs_header_generation(parent);
-
- wc->process_func(root, path->nodes[*level], wc,
- btrfs_header_generation(path->nodes[*level]));
-
- if (wc->free) {
- next = path->nodes[*level];
- btrfs_tree_lock(next);
- clean_tree_block(trans, root, next);
- btrfs_set_lock_blocking(next);
- btrfs_wait_tree_block_writeback(next);
- btrfs_tree_unlock(next);
-
- WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
- ret = btrfs_free_reserved_extent(root, bytenr, blocksize);
- BUG_ON(ret);
- }
- free_extent_buffer(path->nodes[*level]);
- path->nodes[*level] = NULL;
- *level += 1;
+ path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
cond_resched();
return 0;
@@ -1781,7 +1756,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
slot = path->slots[i];
- if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
+ if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
struct extent_buffer *node;
node = path->nodes[i];
path->slots[i]++;
@@ -2047,7 +2022,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
mutex_unlock(&log_root_tree->log_mutex);
ret = update_log_root(trans, log);
- BUG_ON(ret);
mutex_lock(&log_root_tree->log_mutex);
if (atomic_dec_and_test(&log_root_tree->log_writers)) {
@@ -2056,6 +2030,15 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
wake_up(&log_root_tree->log_writer_wait);
}
+ if (ret) {
+ BUG_ON(ret != -ENOSPC);
+ root->fs_info->last_trans_log_full_commit = trans->transid;
+ btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
+ mutex_unlock(&log_root_tree->log_mutex);
+ ret = -EAGAIN;
+ goto out;
+ }
+
index2 = log_root_tree->log_transid % 2;
if (atomic_read(&log_root_tree->log_commit[index2])) {
btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
@@ -2129,15 +2112,10 @@ out:
return 0;
}
-/*
- * free all the extents used by the tree log. This should be called
- * at commit time of the full transaction
- */
-int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
+static void free_log_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_root *log)
{
int ret;
- struct btrfs_root *log;
- struct key;
u64 start;
u64 end;
struct walk_control wc = {
@@ -2145,10 +2123,6 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
.process_func = process_one_buffer
};
- if (!root->log_root || root->fs_info->log_root_recovering)
- return 0;
-
- log = root->log_root;
ret = walk_log_tree(trans, log, &wc);
BUG_ON(ret);
@@ -2162,14 +2136,30 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
}
- if (log->log_transid > 0) {
- ret = btrfs_del_root(trans, root->fs_info->log_root_tree,
- &log->root_key);
- BUG_ON(ret);
- }
- root->log_root = NULL;
free_extent_buffer(log->node);
kfree(log);
+}
+
+/*
+ * free all the extents used by the tree log. This should be called
+ * at commit time of the full transaction
+ */
+int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
+{
+ if (root->log_root) {
+ free_log_tree(trans, root->log_root);
+ root->log_root = NULL;
+ }
+ return 0;
+}
+
+int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info)
+{
+ if (fs_info->log_root_tree) {
+ free_log_tree(trans, fs_info->log_root_tree);
+ fs_info->log_root_tree = NULL;
+ }
return 0;
}
@@ -2203,6 +2193,7 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
struct btrfs_dir_item *di;
struct btrfs_path *path;
int ret;
+ int err = 0;
int bytes_del = 0;
if (BTRFS_I(dir)->logged_trans < trans->transid)
@@ -2218,7 +2209,11 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
path = btrfs_alloc_path();
di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino,
name, name_len, -1);
- if (di && !IS_ERR(di)) {
+ if (IS_ERR(di)) {
+ err = PTR_ERR(di);
+ goto fail;
+ }
+ if (di) {
ret = btrfs_delete_one_dir_name(trans, log, path, di);
bytes_del += name_len;
BUG_ON(ret);
@@ -2226,7 +2221,11 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
btrfs_release_path(log, path);
di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino,
index, name, name_len, -1);
- if (di && !IS_ERR(di)) {
+ if (IS_ERR(di)) {
+ err = PTR_ERR(di);
+ goto fail;
+ }
+ if (di) {
ret = btrfs_delete_one_dir_name(trans, log, path, di);
bytes_del += name_len;
BUG_ON(ret);
@@ -2244,6 +2243,10 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
btrfs_release_path(log, path);
ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
+ if (ret < 0) {
+ err = ret;
+ goto fail;
+ }
if (ret == 0) {
struct btrfs_inode_item *item;
u64 i_size;
@@ -2261,9 +2264,13 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
ret = 0;
btrfs_release_path(log, path);
}
-
+fail:
btrfs_free_path(path);
mutex_unlock(&BTRFS_I(dir)->log_mutex);
+ if (ret == -ENOSPC) {
+ root->fs_info->last_trans_log_full_commit = trans->transid;
+ ret = 0;
+ }
btrfs_end_log_trans(root);
return 0;
@@ -2291,6 +2298,10 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino,
dirid, &index);
mutex_unlock(&BTRFS_I(inode)->log_mutex);
+ if (ret == -ENOSPC) {
+ root->fs_info->last_trans_log_full_commit = trans->transid;
+ ret = 0;
+ }
btrfs_end_log_trans(root);
return ret;
@@ -2318,7 +2329,8 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
else
key.type = BTRFS_DIR_LOG_INDEX_KEY;
ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
- BUG_ON(ret);
+ if (ret)
+ return ret;
item = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_dir_log_item);
@@ -2343,6 +2355,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
struct btrfs_key max_key;
struct btrfs_root *log = root->log_root;
struct extent_buffer *src;
+ int err = 0;
int ret;
int i;
int nritems;
@@ -2405,6 +2418,10 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
ret = overwrite_item(trans, log, dst_path,
path->nodes[0], path->slots[0],
&tmp);
+ if (ret) {
+ err = ret;
+ goto done;
+ }
}
}
btrfs_release_path(root, path);
@@ -2432,7 +2449,10 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
goto done;
ret = overwrite_item(trans, log, dst_path, src, i,
&min_key);
- BUG_ON(ret);
+ if (ret) {
+ err = ret;
+ goto done;
+ }
}
path->slots[0] = nritems;
@@ -2454,22 +2474,30 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
ret = overwrite_item(trans, log, dst_path,
path->nodes[0], path->slots[0],
&tmp);
-
- BUG_ON(ret);
- last_offset = tmp.offset;
+ if (ret)
+ err = ret;
+ else
+ last_offset = tmp.offset;
goto done;
}
}
done:
- *last_offset_ret = last_offset;
btrfs_release_path(root, path);
btrfs_release_path(log, dst_path);
- /* insert the log range keys to indicate where the log is valid */
- ret = insert_dir_log_key(trans, log, path, key_type, inode->i_ino,
- first_offset, last_offset);
- BUG_ON(ret);
- return 0;
+ if (err == 0) {
+ *last_offset_ret = last_offset;
+ /*
+ * insert the log range keys to indicate where the log
+ * is valid
+ */
+ ret = insert_dir_log_key(trans, log, path, key_type,
+ inode->i_ino, first_offset,
+ last_offset);
+ if (ret)
+ err = ret;
+ }
+ return err;
}
/*
@@ -2501,7 +2529,8 @@ again:
ret = log_dir_items(trans, root, inode, path,
dst_path, key_type, min_key,
&max_key);
- BUG_ON(ret);
+ if (ret)
+ return ret;
if (max_key == (u64)-1)
break;
min_key = max_key + 1;
@@ -2535,8 +2564,8 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
while (1) {
ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
-
- if (ret != 1)
+ BUG_ON(ret == 0);
+ if (ret < 0)
break;
if (path->slots[0] == 0)
@@ -2554,7 +2583,7 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
btrfs_release_path(log, path);
}
btrfs_release_path(log, path);
- return 0;
+ return ret;
}
static noinline int copy_items(struct btrfs_trans_handle *trans,
@@ -2587,7 +2616,10 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
}
ret = btrfs_insert_empty_items(trans, log, dst_path,
ins_keys, ins_sizes, nr);
- BUG_ON(ret);
+ if (ret) {
+ kfree(ins_data);
+ return ret;
+ }
for (i = 0; i < nr; i++, dst_path->slots[0]++) {
dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
@@ -2660,16 +2692,17 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
* we have to do this after the loop above to avoid changing the
* log tree while trying to change the log tree.
*/
+ ret = 0;
while (!list_empty(&ordered_sums)) {
struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
struct btrfs_ordered_sum,
list);
- ret = btrfs_csum_file_blocks(trans, log, sums);
- BUG_ON(ret);
+ if (!ret)
+ ret = btrfs_csum_file_blocks(trans, log, sums);
list_del(&sums->list);
kfree(sums);
}
- return 0;
+ return ret;
}
/* log a single inode in the tree log.
@@ -2697,6 +2730,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *log = root->log_root;
struct extent_buffer *src = NULL;
u32 size;
+ int err = 0;
int ret;
int nritems;
int ins_start_slot = 0;
@@ -2739,7 +2773,10 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
} else {
ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0);
}
- BUG_ON(ret);
+ if (ret) {
+ err = ret;
+ goto out_unlock;
+ }
path->keep_locks = 1;
while (1) {
@@ -2768,7 +2805,10 @@ again:
ret = copy_items(trans, log, dst_path, src, ins_start_slot,
ins_nr, inode_only);
- BUG_ON(ret);
+ if (ret) {
+ err = ret;
+ goto out_unlock;
+ }
ins_nr = 1;
ins_start_slot = path->slots[0];
next_slot:
@@ -2784,7 +2824,10 @@ next_slot:
ret = copy_items(trans, log, dst_path, src,
ins_start_slot,
ins_nr, inode_only);
- BUG_ON(ret);
+ if (ret) {
+ err = ret;
+ goto out_unlock;
+ }
ins_nr = 0;
}
btrfs_release_path(root, path);
@@ -2802,7 +2845,10 @@ next_slot:
ret = copy_items(trans, log, dst_path, src,
ins_start_slot,
ins_nr, inode_only);
- BUG_ON(ret);
+ if (ret) {
+ err = ret;
+ goto out_unlock;
+ }
ins_nr = 0;
}
WARN_ON(ins_nr);
@@ -2810,14 +2856,18 @@ next_slot:
btrfs_release_path(root, path);
btrfs_release_path(log, dst_path);
ret = log_directory_changes(trans, root, inode, path, dst_path);
- BUG_ON(ret);
+ if (ret) {
+ err = ret;
+ goto out_unlock;
+ }
}
BTRFS_I(inode)->logged_trans = trans->transid;
+out_unlock:
mutex_unlock(&BTRFS_I(inode)->log_mutex);
btrfs_free_path(path);
btrfs_free_path(dst_path);
- return 0;
+ return err;
}
/*
@@ -2942,10 +2992,13 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
goto end_no_trans;
}
- start_log_trans(trans, root);
+ ret = start_log_trans(trans, root);
+ if (ret)
+ goto end_trans;
ret = btrfs_log_inode(trans, root, inode, inode_only);
- BUG_ON(ret);
+ if (ret)
+ goto end_trans;
/*
* for regular files, if its inode is already on disk, we don't
@@ -2955,8 +3008,10 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
*/
if (S_ISREG(inode->i_mode) &&
BTRFS_I(inode)->generation <= last_committed &&
- BTRFS_I(inode)->last_unlink_trans <= last_committed)
- goto no_parent;
+ BTRFS_I(inode)->last_unlink_trans <= last_committed) {
+ ret = 0;
+ goto end_trans;
+ }
inode_only = LOG_INODE_EXISTS;
while (1) {
@@ -2970,15 +3025,21 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
if (BTRFS_I(inode)->generation >
root->fs_info->last_trans_committed) {
ret = btrfs_log_inode(trans, root, inode, inode_only);
- BUG_ON(ret);
+ if (ret)
+ goto end_trans;
}
if (IS_ROOT(parent))
break;
parent = parent->d_parent;
}
-no_parent:
ret = 0;
+end_trans:
+ if (ret < 0) {
+ BUG_ON(ret != -ENOSPC);
+ root->fs_info->last_trans_log_full_commit = trans->transid;
+ ret = 1;
+ }
btrfs_end_log_trans(root);
end_no_trans:
return ret;
@@ -3020,7 +3081,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
path = btrfs_alloc_path();
BUG_ON(!path);
- trans = btrfs_start_transaction(fs_info->tree_root, 1);
+ trans = btrfs_start_transaction(fs_info->tree_root, 0);
wc.trans = trans;
wc.pin = 1;
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 0776eacb5083..3dfae84c8cc8 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -25,6 +25,8 @@
int btrfs_sync_log(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
+int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info);
int btrfs_recover_log_trees(struct btrfs_root *tree_root);
int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct dentry *dentry);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 8db7b14bbae8..d6e3af8be95b 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1097,7 +1097,7 @@ static int btrfs_rm_dev_item(struct btrfs_root *root,
if (!path)
return -ENOMEM;
- trans = btrfs_start_transaction(root, 1);
+ trans = btrfs_start_transaction(root, 0);
key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
key.type = BTRFS_DEV_ITEM_KEY;
key.offset = device->devid;
@@ -1486,7 +1486,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
goto error;
}
- trans = btrfs_start_transaction(root, 1);
+ trans = btrfs_start_transaction(root, 0);
lock_chunks(root);
device->barriers = 1;
@@ -1751,9 +1751,10 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
/* step one, relocate all the extents inside this chunk */
ret = btrfs_relocate_block_group(extent_root, chunk_offset);
- BUG_ON(ret);
+ if (ret)
+ return ret;
- trans = btrfs_start_transaction(root, 1);
+ trans = btrfs_start_transaction(root, 0);
BUG_ON(!trans);
lock_chunks(root);
@@ -1925,7 +1926,7 @@ int btrfs_balance(struct btrfs_root *dev_root)
break;
BUG_ON(ret);
- trans = btrfs_start_transaction(dev_root, 1);
+ trans = btrfs_start_transaction(dev_root, 0);
BUG_ON(!trans);
ret = btrfs_grow_device(trans, device, old_size);
@@ -2094,11 +2095,7 @@ again:
}
/* Shrinking succeeded, else we would be at "done". */
- trans = btrfs_start_transaction(root, 1);
- if (!trans) {
- ret = -ENOMEM;
- goto done;
- }
+ trans = btrfs_start_transaction(root, 0);
lock_chunks(root);
device->disk_total_bytes = new_size;
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 59acd3eb288a..88ecbb215878 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -154,15 +154,10 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
if (trans)
return do_setxattr(trans, inode, name, value, size, flags);
- ret = btrfs_reserve_metadata_space(root, 2);
- if (ret)
- return ret;
+ trans = btrfs_start_transaction(root, 2);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
- trans = btrfs_start_transaction(root, 1);
- if (!trans) {
- ret = -ENOMEM;
- goto out;
- }
btrfs_set_trans_block_group(trans, inode);
ret = do_setxattr(trans, inode, name, value, size, flags);
@@ -174,7 +169,6 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
BUG_ON(ret);
out:
btrfs_end_transaction_throttle(trans, root);
- btrfs_unreserve_metadata_space(root, 2);
return ret;
}
diff --git a/fs/buffer.c b/fs/buffer.c
index e8aa7081d25c..d54812b198e9 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1949,14 +1949,11 @@ static int __block_commit_write(struct inode *inode, struct page *page,
}
/*
- * block_write_begin takes care of the basic task of block allocation and
- * bringing partial write blocks uptodate first.
- *
- * If *pagep is not NULL, then block_write_begin uses the locked page
- * at *pagep rather than allocating its own. In this case, the page will
- * not be unlocked or deallocated on failure.
+ * Filesystems implementing the new truncate sequence should use the
+ * _newtrunc postfix variant which won't incorrectly call vmtruncate.
+ * The filesystem needs to handle block truncation upon failure.
*/
-int block_write_begin(struct file *file, struct address_space *mapping,
+int block_write_begin_newtrunc(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata,
get_block_t *get_block)
@@ -1992,20 +1989,50 @@ int block_write_begin(struct file *file, struct address_space *mapping,
unlock_page(page);
page_cache_release(page);
*pagep = NULL;
-
- /*
- * prepare_write() may have instantiated a few blocks
- * outside i_size. Trim these off again. Don't need
- * i_size_read because we hold i_mutex.
- */
- if (pos + len > inode->i_size)
- vmtruncate(inode, inode->i_size);
}
}
out:
return status;
}
+EXPORT_SYMBOL(block_write_begin_newtrunc);
+
+/*
+ * block_write_begin takes care of the basic task of block allocation and
+ * bringing partial write blocks uptodate first.
+ *
+ * If *pagep is not NULL, then block_write_begin uses the locked page
+ * at *pagep rather than allocating its own. In this case, the page will
+ * not be unlocked or deallocated on failure.
+ */
+int block_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, void **fsdata,
+ get_block_t *get_block)
+{
+ int ret;
+
+ ret = block_write_begin_newtrunc(file, mapping, pos, len, flags,
+ pagep, fsdata, get_block);
+
+ /*
+ * prepare_write() may have instantiated a few blocks
+ * outside i_size. Trim these off again. Don't need
+ * i_size_read because we hold i_mutex.
+ *
+ * Filesystems which pass down their own page also cannot
+ * call into vmtruncate here because it would lead to lock
+ * inversion problems (*pagep is locked). This is a further
+ * example of where the old truncate sequence is inadequate.
+ */
+ if (unlikely(ret) && *pagep == NULL) {
+ loff_t isize = mapping->host->i_size;
+ if (pos + len > isize)
+ vmtruncate(mapping->host, isize);
+ }
+
+ return ret;
+}
EXPORT_SYMBOL(block_write_begin);
int block_write_end(struct file *file, struct address_space *mapping,
@@ -2324,7 +2351,7 @@ out:
* For moronic filesystems that do not allow holes in file.
* We may have to extend the file.
*/
-int cont_write_begin(struct file *file, struct address_space *mapping,
+int cont_write_begin_newtrunc(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata,
get_block_t *get_block, loff_t *bytes)
@@ -2345,11 +2372,30 @@ int cont_write_begin(struct file *file, struct address_space *mapping,
}
*pagep = NULL;
- err = block_write_begin(file, mapping, pos, len,
+ err = block_write_begin_newtrunc(file, mapping, pos, len,
flags, pagep, fsdata, get_block);
out:
return err;
}
+EXPORT_SYMBOL(cont_write_begin_newtrunc);
+
+int cont_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, void **fsdata,
+ get_block_t *get_block, loff_t *bytes)
+{
+ int ret;
+
+ ret = cont_write_begin_newtrunc(file, mapping, pos, len, flags,
+ pagep, fsdata, get_block, bytes);
+ if (unlikely(ret)) {
+ loff_t isize = mapping->host->i_size;
+ if (pos + len > isize)
+ vmtruncate(mapping->host, isize);
+ }
+
+ return ret;
+}
EXPORT_SYMBOL(cont_write_begin);
int block_prepare_write(struct page *page, unsigned from, unsigned to,
@@ -2381,7 +2427,7 @@ EXPORT_SYMBOL(block_commit_write);
*
* We are not allowed to take the i_mutex here so we have to play games to
* protect against truncate races as the page could now be beyond EOF. Because
- * vmtruncate() writes the inode size before removing pages, once we have the
+ * truncate writes the inode size before removing pages, once we have the
* page lock we can determine safely if the page is beyond EOF. If it is not
* beyond EOF, then the page is guaranteed safe against truncation until we
* unlock the page.
@@ -2464,10 +2510,11 @@ static void attach_nobh_buffers(struct page *page, struct buffer_head *head)
}
/*
- * On entry, the page is fully not uptodate.
- * On exit the page is fully uptodate in the areas outside (from,to)
+ * Filesystems implementing the new truncate sequence should use the
+ * _newtrunc postfix variant which won't incorrectly call vmtruncate.
+ * The filesystem needs to handle block truncation upon failure.
*/
-int nobh_write_begin(struct file *file, struct address_space *mapping,
+int nobh_write_begin_newtrunc(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata,
get_block_t *get_block)
@@ -2500,8 +2547,8 @@ int nobh_write_begin(struct file *file, struct address_space *mapping,
unlock_page(page);
page_cache_release(page);
*pagep = NULL;
- return block_write_begin(file, mapping, pos, len, flags, pagep,
- fsdata, get_block);
+ return block_write_begin_newtrunc(file, mapping, pos, len,
+ flags, pagep, fsdata, get_block);
}
if (PageMappedToDisk(page))
@@ -2605,8 +2652,34 @@ out_release:
page_cache_release(page);
*pagep = NULL;
- if (pos + len > inode->i_size)
- vmtruncate(inode, inode->i_size);
+ return ret;
+}
+EXPORT_SYMBOL(nobh_write_begin_newtrunc);
+
+/*
+ * On entry, the page is fully not uptodate.
+ * On exit the page is fully uptodate in the areas outside (from,to)
+ */
+int nobh_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, void **fsdata,
+ get_block_t *get_block)
+{
+ int ret;
+
+ ret = nobh_write_begin_newtrunc(file, mapping, pos, len, flags,
+ pagep, fsdata, get_block);
+
+ /*
+ * prepare_write() may have instantiated a few blocks
+ * outside i_size. Trim these off again. Don't need
+ * i_size_read because we hold i_mutex.
+ */
+ if (unlikely(ret)) {
+ loff_t isize = mapping->host->i_size;
+ if (pos + len > isize)
+ vmtruncate(mapping->host, isize);
+ }
return ret;
}
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig
index 04b8280582a9..bc87b9c1d27e 100644
--- a/fs/ceph/Kconfig
+++ b/fs/ceph/Kconfig
@@ -2,7 +2,7 @@ config CEPH_FS
tristate "Ceph distributed file system (EXPERIMENTAL)"
depends on INET && EXPERIMENTAL
select LIBCRC32C
- select CONFIG_CRYPTO_AES
+ select CRYPTO_AES
help
Choose Y or M here to include support for mounting the
experimental Ceph distributed file system. Ceph is an extremely
diff --git a/fs/ceph/auth.c b/fs/ceph/auth.c
index 9f46de2ba7a7..89490beaf537 100644
--- a/fs/ceph/auth.c
+++ b/fs/ceph/auth.c
@@ -1,7 +1,6 @@
#include "ceph_debug.h"
#include <linux/module.h>
-#include <linux/slab.h>
#include <linux/err.h>
#include <linux/slab.h>
@@ -217,8 +216,8 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac,
if (ac->protocol != protocol) {
ret = ceph_auth_init_protocol(ac, protocol);
if (ret) {
- pr_err("error %d on auth method %s init\n",
- ret, ac->ops->name);
+ pr_err("error %d on auth protocol %d init\n",
+ ret, protocol);
goto out;
}
}
@@ -247,7 +246,7 @@ int ceph_build_auth(struct ceph_auth_client *ac,
if (!ac->protocol)
return ceph_auth_build_hello(ac, msg_buf, msg_len);
BUG_ON(!ac->ops);
- if (!ac->ops->is_authenticated(ac))
+ if (ac->ops->should_authenticate(ac))
return ceph_build_auth_request(ac, msg_buf, msg_len);
return 0;
}
diff --git a/fs/ceph/auth.h b/fs/ceph/auth.h
index 4429a707c021..d38a2fb4a137 100644
--- a/fs/ceph/auth.h
+++ b/fs/ceph/auth.h
@@ -24,6 +24,12 @@ struct ceph_auth_client_ops {
int (*is_authenticated)(struct ceph_auth_client *ac);
/*
+ * true if we should (re)authenticate, e.g., when our tickets
+ * are getting old and crusty.
+ */
+ int (*should_authenticate)(struct ceph_auth_client *ac);
+
+ /*
* build requests and process replies during monitor
* handshake. if handle_reply returns -EAGAIN, we build
* another request.
diff --git a/fs/ceph/auth_none.c b/fs/ceph/auth_none.c
index 24407c119291..ad1dc21286c7 100644
--- a/fs/ceph/auth_none.c
+++ b/fs/ceph/auth_none.c
@@ -31,6 +31,13 @@ static int is_authenticated(struct ceph_auth_client *ac)
return !xi->starting;
}
+static int should_authenticate(struct ceph_auth_client *ac)
+{
+ struct ceph_auth_none_info *xi = ac->private;
+
+ return xi->starting;
+}
+
/*
* the generic auth code decode the global_id, and we carry no actual
* authenticate state, so nothing happens here.
@@ -98,6 +105,7 @@ static const struct ceph_auth_client_ops ceph_auth_none_ops = {
.reset = reset,
.destroy = destroy,
.is_authenticated = is_authenticated,
+ .should_authenticate = should_authenticate,
.handle_reply = handle_reply,
.create_authorizer = ceph_auth_none_create_authorizer,
.destroy_authorizer = ceph_auth_none_destroy_authorizer,
diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c
index 7b206231566d..6d44053ecff1 100644
--- a/fs/ceph/auth_x.c
+++ b/fs/ceph/auth_x.c
@@ -27,6 +27,17 @@ static int ceph_x_is_authenticated(struct ceph_auth_client *ac)
return (ac->want_keys & xi->have_keys) == ac->want_keys;
}
+static int ceph_x_should_authenticate(struct ceph_auth_client *ac)
+{
+ struct ceph_x_info *xi = ac->private;
+ int need;
+
+ ceph_x_validate_tickets(ac, &need);
+ dout("ceph_x_should_authenticate want=%d need=%d have=%d\n",
+ ac->want_keys, need, xi->have_keys);
+ return need != 0;
+}
+
static int ceph_x_encrypt_buflen(int ilen)
{
return sizeof(struct ceph_x_encrypt_header) + ilen + 16 +
@@ -482,7 +493,7 @@ static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result,
return -EAGAIN;
}
- op = le32_to_cpu(head->op);
+ op = le16_to_cpu(head->op);
result = le32_to_cpu(head->result);
dout("handle_reply op %d result %d\n", op, result);
switch (op) {
@@ -602,6 +613,9 @@ static void ceph_x_destroy(struct ceph_auth_client *ac)
remove_ticket_handler(ac, th);
}
+ if (xi->auth_authorizer.buf)
+ ceph_buffer_put(xi->auth_authorizer.buf);
+
kfree(ac->private);
ac->private = NULL;
}
@@ -620,6 +634,7 @@ static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
static const struct ceph_auth_client_ops ceph_x_ops = {
.name = "x",
.is_authenticated = ceph_x_is_authenticated,
+ .should_authenticate = ceph_x_should_authenticate,
.build_request = ceph_x_build_request,
.handle_reply = ceph_x_handle_reply,
.create_authorizer = ceph_x_create_authorizer,
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 0dd0b81e64f7..b81be9a56487 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -244,8 +244,14 @@ static struct ceph_cap *get_cap(struct ceph_cap_reservation *ctx)
struct ceph_cap *cap = NULL;
/* temporary, until we do something about cap import/export */
- if (!ctx)
- return kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
+ if (!ctx) {
+ cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
+ if (cap) {
+ caps_use_count++;
+ caps_total_count++;
+ }
+ return cap;
+ }
spin_lock(&caps_list_lock);
dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n",
@@ -621,7 +627,7 @@ retry:
if (fmode >= 0)
__ceph_get_fmode(ci, fmode);
spin_unlock(&inode->i_lock);
- wake_up(&ci->i_cap_wq);
+ wake_up_all(&ci->i_cap_wq);
return 0;
}
@@ -981,6 +987,46 @@ static int send_cap_msg(struct ceph_mds_session *session,
return 0;
}
+static void __queue_cap_release(struct ceph_mds_session *session,
+ u64 ino, u64 cap_id, u32 migrate_seq,
+ u32 issue_seq)
+{
+ struct ceph_msg *msg;
+ struct ceph_mds_cap_release *head;
+ struct ceph_mds_cap_item *item;
+
+ spin_lock(&session->s_cap_lock);
+ BUG_ON(!session->s_num_cap_releases);
+ msg = list_first_entry(&session->s_cap_releases,
+ struct ceph_msg, list_head);
+
+ dout(" adding %llx release to mds%d msg %p (%d left)\n",
+ ino, session->s_mds, msg, session->s_num_cap_releases);
+
+ BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE);
+ head = msg->front.iov_base;
+ head->num = cpu_to_le32(le32_to_cpu(head->num) + 1);
+ item = msg->front.iov_base + msg->front.iov_len;
+ item->ino = cpu_to_le64(ino);
+ item->cap_id = cpu_to_le64(cap_id);
+ item->migrate_seq = cpu_to_le32(migrate_seq);
+ item->seq = cpu_to_le32(issue_seq);
+
+ session->s_num_cap_releases--;
+
+ msg->front.iov_len += sizeof(*item);
+ if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) {
+ dout(" release msg %p full\n", msg);
+ list_move_tail(&msg->list_head, &session->s_cap_releases_done);
+ } else {
+ dout(" release msg %p at %d/%d (%d)\n", msg,
+ (int)le32_to_cpu(head->num),
+ (int)CEPH_CAPS_PER_RELEASE,
+ (int)msg->front.iov_len);
+ }
+ spin_unlock(&session->s_cap_lock);
+}
+
/*
* Queue cap releases when an inode is dropped from our cache. Since
* inode is about to be destroyed, there is no need for i_lock.
@@ -994,41 +1040,9 @@ void ceph_queue_caps_release(struct inode *inode)
while (p) {
struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
struct ceph_mds_session *session = cap->session;
- struct ceph_msg *msg;
- struct ceph_mds_cap_release *head;
- struct ceph_mds_cap_item *item;
- spin_lock(&session->s_cap_lock);
- BUG_ON(!session->s_num_cap_releases);
- msg = list_first_entry(&session->s_cap_releases,
- struct ceph_msg, list_head);
-
- dout(" adding %p release to mds%d msg %p (%d left)\n",
- inode, session->s_mds, msg, session->s_num_cap_releases);
-
- BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE);
- head = msg->front.iov_base;
- head->num = cpu_to_le32(le32_to_cpu(head->num) + 1);
- item = msg->front.iov_base + msg->front.iov_len;
- item->ino = cpu_to_le64(ceph_ino(inode));
- item->cap_id = cpu_to_le64(cap->cap_id);
- item->migrate_seq = cpu_to_le32(cap->mseq);
- item->seq = cpu_to_le32(cap->issue_seq);
-
- session->s_num_cap_releases--;
-
- msg->front.iov_len += sizeof(*item);
- if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) {
- dout(" release msg %p full\n", msg);
- list_move_tail(&msg->list_head,
- &session->s_cap_releases_done);
- } else {
- dout(" release msg %p at %d/%d (%d)\n", msg,
- (int)le32_to_cpu(head->num),
- (int)CEPH_CAPS_PER_RELEASE,
- (int)msg->front.iov_len);
- }
- spin_unlock(&session->s_cap_lock);
+ __queue_cap_release(session, ceph_ino(inode), cap->cap_id,
+ cap->mseq, cap->issue_seq);
p = rb_next(p);
__ceph_remove_cap(cap);
}
@@ -1167,7 +1181,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
}
if (wake)
- wake_up(&ci->i_cap_wq);
+ wake_up_all(&ci->i_cap_wq);
return delayed;
}
@@ -1776,9 +1790,9 @@ out:
spin_unlock(&ci->i_unsafe_lock);
}
-int ceph_fsync(struct file *file, struct dentry *dentry, int datasync)
+int ceph_fsync(struct file *file, int datasync)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = file->f_mapping->host;
struct ceph_inode_info *ci = ceph_inode(inode);
unsigned flush_tid;
int ret;
@@ -2139,7 +2153,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
else if (flushsnaps)
ceph_flush_snaps(ci);
if (wake)
- wake_up(&ci->i_cap_wq);
+ wake_up_all(&ci->i_cap_wq);
if (put)
iput(inode);
}
@@ -2215,7 +2229,7 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
iput(inode);
} else if (complete_capsnap) {
ceph_flush_snaps(ci);
- wake_up(&ci->i_cap_wq);
+ wake_up_all(&ci->i_cap_wq);
}
if (drop_capsnap)
iput(inode);
@@ -2391,7 +2405,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
if (queue_invalidate)
ceph_queue_invalidate(inode);
if (wake)
- wake_up(&ci->i_cap_wq);
+ wake_up_all(&ci->i_cap_wq);
if (check_caps == 1)
ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY,
@@ -2446,7 +2460,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
struct ceph_inode_info,
i_flushing_item)->vfs_inode);
mdsc->num_cap_flushing--;
- wake_up(&mdsc->cap_flushing_wq);
+ wake_up_all(&mdsc->cap_flushing_wq);
dout(" inode %p now !flushing\n", inode);
if (ci->i_dirty_caps == 0) {
@@ -2458,7 +2472,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
}
}
spin_unlock(&mdsc->cap_dirty_lock);
- wake_up(&ci->i_cap_wq);
+ wake_up_all(&ci->i_cap_wq);
out:
spin_unlock(&inode->i_lock);
@@ -2655,7 +2669,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
struct ceph_mds_caps *h;
int mds = session->s_mds;
int op;
- u32 seq;
+ u32 seq, mseq;
struct ceph_vino vino;
u64 cap_id;
u64 size, max_size;
@@ -2675,6 +2689,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
vino.snap = CEPH_NOSNAP;
cap_id = le64_to_cpu(h->cap_id);
seq = le32_to_cpu(h->seq);
+ mseq = le32_to_cpu(h->migrate_seq);
size = le64_to_cpu(h->size);
max_size = le64_to_cpu(h->max_size);
@@ -2689,6 +2704,18 @@ void ceph_handle_caps(struct ceph_mds_session *session,
vino.snap, inode);
if (!inode) {
dout(" i don't have ino %llx\n", vino.ino);
+
+ if (op == CEPH_CAP_OP_IMPORT)
+ __queue_cap_release(session, vino.ino, cap_id,
+ mseq, seq);
+
+ /*
+ * send any full release message to try to move things
+ * along for the mds (who clearly thinks we still have this
+ * cap).
+ */
+ ceph_add_cap_releases(mdsc, session, -1);
+ ceph_send_cap_releases(mdsc, session);
goto done;
}
@@ -2714,7 +2741,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
spin_lock(&inode->i_lock);
cap = __get_cap_for_mds(ceph_inode(inode), mds);
if (!cap) {
- dout("no cap on %p ino %llx.%llx from mds%d, releasing\n",
+ dout(" no cap on %p ino %llx.%llx from mds%d\n",
inode, ceph_ino(inode), ceph_snap(inode), mds);
spin_unlock(&inode->i_lock);
goto done;
@@ -2865,18 +2892,19 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_cap *cap;
struct ceph_mds_request_release *rel = *p;
+ int used, dirty;
int ret = 0;
- int used = 0;
spin_lock(&inode->i_lock);
used = __ceph_caps_used(ci);
+ dirty = __ceph_caps_dirty(ci);
- dout("encode_inode_release %p mds%d used %s drop %s unless %s\n", inode,
- mds, ceph_cap_string(used), ceph_cap_string(drop),
+ dout("encode_inode_release %p mds%d used|dirty %s drop %s unless %s\n",
+ inode, mds, ceph_cap_string(used|dirty), ceph_cap_string(drop),
ceph_cap_string(unless));
- /* only drop unused caps */
- drop &= ~used;
+ /* only drop unused, clean caps */
+ drop &= ~(used | dirty);
cap = __get_cap_for_mds(ci, mds);
if (cap && __cap_is_valid(cap)) {
@@ -2956,6 +2984,7 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry,
memcpy(*p, dentry->d_name.name, dentry->d_name.len);
*p += dentry->d_name.len;
rel->dname_seq = cpu_to_le32(di->lease_seq);
+ __ceph_mdsc_drop_dentry_lease(dentry);
}
spin_unlock(&dentry->d_lock);
return ret;
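__queue_cap_release() factors the old inline body of ceph_queue_caps_release() out so the handle_caps path can also queue a release for an ino it does not have. The batching it implements is simple: append items to the frontmost message until it holds CEPH_CAPS_PER_RELEASE entries, then move it to the done list for sending. A compilable toy model of that batching, with illustrative counts and types (CAPS_PER_RELEASE here is not the kernel constant):

	#include <stdio.h>

	#define CAPS_PER_RELEASE 3	/* kernel value differs */

	struct release_msg {
		int num;		/* items queued so far */
	};

	/* returns 1 once the message is full and moved to the done list */
	static int queue_release(struct release_msg *msg)
	{
		msg->num++;
		if (msg->num == CAPS_PER_RELEASE) {
			printf("release msg full, move to done list\n");
			return 1;
		}
		printf("release msg at %d/%d\n", msg->num, CAPS_PER_RELEASE);
		return 0;
	}

	int main(void)
	{
		struct release_msg m = { 0 };

		while (!queue_release(&m))
			;
		return 0;
	}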
diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h
index 3b9eeed097b3..2fa992eaf7da 100644
--- a/fs/ceph/ceph_fs.h
+++ b/fs/ceph/ceph_fs.h
@@ -265,16 +265,17 @@ extern const char *ceph_mds_state_name(int s);
* - they also define the lock ordering by the MDS
* - a few of these are internal to the mds
*/
-#define CEPH_LOCK_DN 1
-#define CEPH_LOCK_ISNAP 2
-#define CEPH_LOCK_IVERSION 4 /* mds internal */
-#define CEPH_LOCK_IFILE 8 /* mds internal */
-#define CEPH_LOCK_IAUTH 32
-#define CEPH_LOCK_ILINK 64
-#define CEPH_LOCK_IDFT 128 /* dir frag tree */
-#define CEPH_LOCK_INEST 256 /* mds internal */
-#define CEPH_LOCK_IXATTR 512
-#define CEPH_LOCK_INO 2048 /* immutable inode bits; not a lock */
+#define CEPH_LOCK_DVERSION 1
+#define CEPH_LOCK_DN 2
+#define CEPH_LOCK_ISNAP 16
+#define CEPH_LOCK_IVERSION 32 /* mds internal */
+#define CEPH_LOCK_IFILE 64
+#define CEPH_LOCK_IAUTH 128
+#define CEPH_LOCK_ILINK 256
+#define CEPH_LOCK_IDFT 512 /* dir frag tree */
+#define CEPH_LOCK_INEST 1024 /* mds internal */
+#define CEPH_LOCK_IXATTR 2048
+#define CEPH_LOCK_INO 8192 /* immutable inode bits; not a lock */
/* client_session ops */
enum {
diff --git a/fs/ceph/crush/mapper.c b/fs/ceph/crush/mapper.c
index 9ba54efb6543..a4eec133258e 100644
--- a/fs/ceph/crush/mapper.c
+++ b/fs/ceph/crush/mapper.c
@@ -238,7 +238,7 @@ static int bucket_straw_choose(struct crush_bucket_straw *bucket,
static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
{
- dprintk("choose %d x=%d r=%d\n", in->id, x, r);
+ dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r);
switch (in->alg) {
case CRUSH_BUCKET_UNIFORM:
return bucket_uniform_choose((struct crush_bucket_uniform *)in,
@@ -264,7 +264,7 @@ static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
*/
static int is_out(struct crush_map *map, __u32 *weight, int item, int x)
{
- if (weight[item] >= 0x1000)
+ if (weight[item] >= 0x10000)
return 0;
if (weight[item] == 0)
return 1;
@@ -305,7 +305,9 @@ static int crush_choose(struct crush_map *map,
int itemtype;
int collide, reject;
const int orig_tries = 5; /* attempts before we fall back to search */
- dprintk("choose bucket %d x %d outpos %d\n", bucket->id, x, outpos);
+
+ dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "",
+ bucket->id, x, outpos, numrep);
for (rep = outpos; rep < numrep; rep++) {
/* keep trying until we get a non-out, non-colliding item */
@@ -366,6 +368,7 @@ static int crush_choose(struct crush_map *map,
BUG_ON(item >= 0 ||
(-1-item) >= map->max_buckets);
in = map->buckets[-1-item];
+ retry_bucket = 1;
continue;
}
@@ -377,15 +380,25 @@ static int crush_choose(struct crush_map *map,
}
}
- if (recurse_to_leaf &&
- item < 0 &&
- crush_choose(map, map->buckets[-1-item],
- weight,
- x, outpos+1, 0,
- out2, outpos,
- firstn, 0, NULL) <= outpos) {
- reject = 1;
- } else {
+ reject = 0;
+ if (recurse_to_leaf) {
+ if (item < 0) {
+ if (crush_choose(map,
+ map->buckets[-1-item],
+ weight,
+ x, outpos+1, 0,
+ out2, outpos,
+ firstn, 0,
+ NULL) <= outpos)
+ /* didn't get leaf */
+ reject = 1;
+ } else {
+ /* we already have a leaf! */
+ out2[outpos] = item;
+ }
+ }
+
+ if (!reject) {
/* out? */
if (itemtype == 0)
reject = is_out(map, weight,
@@ -424,12 +437,12 @@ reject:
continue;
}
- dprintk("choose got %d\n", item);
+ dprintk("CHOOSE got %d\n", item);
out[outpos] = item;
outpos++;
}
- dprintk("choose returns %d\n", outpos);
+ dprintk("CHOOSE returns %d\n", outpos);
return outpos;
}
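The is_out() change corrects the full-weight threshold: CRUSH item weights are 16.16 fixed point, so a weight of 1.0 is 0x10000, not 0x1000, and with the old constant fully weighted devices were needlessly put through the probabilistic check. A hedged stand-alone rendering of the test (the 16-bit hash argument is a stand-in for the kernel's crush_hash32_2()):

	#include <stdio.h>

	/* weight is 16.16 fixed point: 0x10000 == 1.0 */
	static int is_out(unsigned weight, unsigned hash16)
	{
		if (weight >= 0x10000)
			return 0;		/* always in */
		if (weight == 0)
			return 1;		/* always out */
		return (hash16 & 0xffff) >= weight;	/* probabilistic */
	}

	int main(void)
	{
		printf("%d %d\n", is_out(0x10000, 0x8000), is_out(0, 0x8000));
		return 0;
	}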
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 3be33fb066cc..f2f5332ddbba 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -261,7 +261,7 @@ static int osdc_show(struct seq_file *s, void *pp)
static int caps_show(struct seq_file *s, void *p)
{
- struct ceph_client *client = p;
+ struct ceph_client *client = s->private;
int total, avail, used, reserved, min;
ceph_reservation_status(client, &total, &avail, &used, &reserved, &min);
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 4fd30900eff7..f94ed3c7f6a5 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -266,6 +266,7 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
spin_lock(&inode->i_lock);
if ((filp->f_pos == 2 || fi->dentry) &&
!ceph_test_opt(client, NOASYNCREADDIR) &&
+ ceph_snap(inode) != CEPH_SNAPDIR &&
(ci->i_ceph_flags & CEPH_I_COMPLETE) &&
__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
err = __dcache_readdir(filp, dirent, filldir);
@@ -587,7 +588,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
if (IS_ERR(req))
- return ERR_PTR(PTR_ERR(req));
+ return ERR_CAST(req);
req->r_dentry = dget(dentry);
req->r_num_caps = 2;
/* we only need inode linkage */
@@ -1013,18 +1014,22 @@ out_touch:
/*
* When a dentry is released, clear the dir I_COMPLETE if it was part
- * of the current dir gen.
+ * of the current dir gen or if this is in the snapshot namespace.
*/
static void ceph_dentry_release(struct dentry *dentry)
{
struct ceph_dentry_info *di = ceph_dentry(dentry);
struct inode *parent_inode = dentry->d_parent->d_inode;
+ u64 snapid = ceph_snap(parent_inode);
- if (parent_inode) {
+ dout("dentry_release %p parent %p\n", dentry, parent_inode);
+
+ if (parent_inode && snapid != CEPH_SNAPDIR) {
struct ceph_inode_info *ci = ceph_inode(parent_inode);
spin_lock(&parent_inode->i_lock);
- if (ci->i_shared_gen == di->lease_shared_gen) {
+ if (ci->i_shared_gen == di->lease_shared_gen ||
+ snapid <= CEPH_MAXSNAP) {
dout(" clearing %p complete (d_release)\n",
parent_inode);
ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
@@ -1107,10 +1112,9 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
* an fsync() on a dir will wait for any uncommitted directory
* operations to commit.
*/
-static int ceph_dir_fsync(struct file *file, struct dentry *dentry,
- int datasync)
+static int ceph_dir_fsync(struct file *file, int datasync)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = file->f_path.dentry->d_inode;
struct ceph_inode_info *ci = ceph_inode(inode);
struct list_head *head = &ci->i_unsafe_dirops;
struct ceph_mds_request *req;
@@ -1242,7 +1246,9 @@ struct dentry_operations ceph_dentry_ops = {
struct dentry_operations ceph_snapdir_dentry_ops = {
.d_revalidate = ceph_snapdir_d_revalidate,
+ .d_release = ceph_dentry_release,
};
struct dentry_operations ceph_snap_dentry_ops = {
+ .d_release = ceph_dentry_release,
};
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 17447644d675..4480cb1c63e7 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -133,7 +133,7 @@ static struct dentry *__cfh_to_dentry(struct super_block *sb,
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPHASH,
USE_ANY_MDS);
if (IS_ERR(req))
- return ERR_PTR(PTR_ERR(req));
+ return ERR_CAST(req);
req->r_ino1 = vino;
req->r_ino2.ino = cfh->parent_ino;
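Several hunks in this series replace ERR_PTR(PTR_ERR(p)) with ERR_CAST(p). Both yield the same encoded-error pointer; ERR_CAST simply skips the pointer-to-long-to-pointer round trip and documents the intent. A small user-space sketch using definitions equivalent to the kernel's <linux/err.h>:

	#include <stdio.h>

	static inline void *ERR_PTR(long error) { return (void *)error; }
	static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
	static inline void *ERR_CAST(const void *ptr) { return (void *)ptr; }

	int main(void)
	{
		void *req = ERR_PTR(-12);	/* -ENOMEM */

		/* both expressions yield the same error pointer */
		printf("%ld %ld\n", PTR_ERR(ERR_PTR(PTR_ERR(req))),
		       PTR_ERR(ERR_CAST(req)));
		return 0;
	}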
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 6512b6701b9e..7c08698fad3e 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -230,7 +230,7 @@ struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
/* do the open */
req = prepare_open_request(dir->i_sb, flags, mode);
if (IS_ERR(req))
- return ERR_PTR(PTR_ERR(req));
+ return ERR_CAST(req);
req->r_dentry = dget(dentry);
req->r_num_caps = 2;
if (flags & O_CREAT) {
@@ -265,7 +265,7 @@ int ceph_release(struct inode *inode, struct file *file)
kmem_cache_free(ceph_file_cachep, cf);
/* wake up anyone waiting for caps on this inode */
- wake_up(&ci->i_cap_wq);
+ wake_up_all(&ci->i_cap_wq);
return 0;
}
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index a81b8b662c7b..389f9dbd9949 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -69,7 +69,7 @@ struct inode *ceph_get_snapdir(struct inode *parent)
BUG_ON(!S_ISDIR(parent->i_mode));
if (IS_ERR(inode))
- return ERR_PTR(PTR_ERR(inode));
+ return inode;
inode->i_mode = parent->i_mode;
inode->i_uid = parent->i_uid;
inode->i_gid = parent->i_gid;
@@ -827,7 +827,7 @@ static void ceph_set_dentry_offset(struct dentry *dn)
spin_lock(&dcache_lock);
spin_lock(&dn->d_lock);
- list_move_tail(&dir->d_subdirs, &dn->d_u.d_child);
+ list_move(&dn->d_u.d_child, &dir->d_subdirs);
dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset,
dn->d_u.d_child.prev, dn->d_u.d_child.next);
spin_unlock(&dn->d_lock);
@@ -854,8 +854,8 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
d_drop(dn);
realdn = d_materialise_unique(dn, in);
if (IS_ERR(realdn)) {
- pr_err("splice_dentry error %p inode %p ino %llx.%llx\n",
- dn, in, ceph_vinop(in));
+ pr_err("splice_dentry error %ld %p inode %p ino %llx.%llx\n",
+ PTR_ERR(realdn), dn, in, ceph_vinop(in));
if (prehash)
*prehash = false; /* don't rehash on error */
dn = realdn; /* note realdn contains the error */
@@ -1199,8 +1199,10 @@ retry_lookup:
goto out;
}
err = ceph_init_dentry(dn);
- if (err < 0)
+ if (err < 0) {
+ dput(dn);
goto out;
+ }
} else if (dn->d_inode &&
(ceph_ino(dn->d_inode) != vino.ino ||
ceph_snap(dn->d_inode) != vino.snap)) {
@@ -1234,18 +1236,23 @@ retry_lookup:
goto out;
}
dn = splice_dentry(dn, in, NULL);
+ if (IS_ERR(dn))
+ dn = NULL;
}
if (fill_inode(in, &rinfo->dir_in[i], NULL, session,
req->r_request_started, -1,
&req->r_caps_reservation) < 0) {
pr_err("fill_inode badness on %p\n", in);
- dput(dn);
- continue;
+ goto next_item;
}
- update_dentry_lease(dn, rinfo->dir_dlease[i],
- req->r_session, req->r_request_started);
- dput(dn);
+ if (dn)
+ update_dentry_lease(dn, rinfo->dir_dlease[i],
+ req->r_session,
+ req->r_request_started);
+next_item:
+ if (dn)
+ dput(dn);
}
req->r_did_prepopulate = true;
@@ -1494,7 +1501,7 @@ retry:
if (wrbuffer_refs == 0)
ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
if (wake)
- wake_up(&ci->i_cap_wq);
+ wake_up_all(&ci->i_cap_wq);
}
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 885aa5710cfd..dd440bd438a9 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -868,7 +868,7 @@ static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,
{
struct ceph_inode_info *ci = ceph_inode(inode);
- wake_up(&ci->i_cap_wq);
+ wake_up_all(&ci->i_cap_wq);
if (arg) {
spin_lock(&inode->i_lock);
ci->i_wanted_max_size = 0;
@@ -1066,9 +1066,9 @@ static int trim_caps(struct ceph_mds_client *mdsc,
*
* Called under s_mutex.
*/
-static int add_cap_releases(struct ceph_mds_client *mdsc,
- struct ceph_mds_session *session,
- int extra)
+int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session,
+ int extra)
{
struct ceph_msg *msg;
struct ceph_mds_cap_release *head;
@@ -1176,8 +1176,8 @@ static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq)
/*
* called under s_mutex
*/
-static void send_cap_releases(struct ceph_mds_client *mdsc,
- struct ceph_mds_session *session)
+void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session)
{
struct ceph_msg *msg;
@@ -1514,6 +1514,9 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
ceph_encode_filepath(&p, end, ino1, path1);
ceph_encode_filepath(&p, end, ino2, path2);
+ /* make note of release offset, in case we need to replay */
+ req->r_request_release_offset = p - msg->front.iov_base;
+
/* cap releases */
releases = 0;
if (req->r_inode_drop)
@@ -1561,7 +1564,7 @@ static void complete_request(struct ceph_mds_client *mdsc,
if (req->r_callback)
req->r_callback(mdsc, req);
else
- complete(&req->r_completion);
+ complete_all(&req->r_completion);
}
/*
@@ -1580,6 +1583,32 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req,
req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts);
+ if (req->r_got_unsafe) {
+ /*
+ * Replay. Do not regenerate message (and rebuild
+ * paths, etc.); just use the original message.
+ * Rebuilding paths will break for renames because
+ * d_move mangles the src name.
+ */
+ msg = req->r_request;
+ rhead = msg->front.iov_base;
+
+ flags = le32_to_cpu(rhead->flags);
+ flags |= CEPH_MDS_FLAG_REPLAY;
+ rhead->flags = cpu_to_le32(flags);
+
+ if (req->r_target_inode)
+ rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));
+
+ rhead->num_retry = req->r_attempts - 1;
+
+ /* remove cap/dentry releases from message */
+ rhead->num_releases = 0;
+ msg->hdr.front_len = cpu_to_le32(req->r_request_release_offset);
+ msg->front.iov_len = req->r_request_release_offset;
+ return 0;
+ }
+
if (req->r_request) {
ceph_msg_put(req->r_request);
req->r_request = NULL;
@@ -1601,13 +1630,9 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
rhead->flags = cpu_to_le32(flags);
rhead->num_fwd = req->r_num_fwd;
rhead->num_retry = req->r_attempts - 1;
+ rhead->ino = 0;
dout(" r_locked_dir = %p\n", req->r_locked_dir);
-
- if (req->r_target_inode && req->r_got_unsafe)
- rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));
- else
- rhead->ino = 0;
return 0;
}
@@ -1768,12 +1793,12 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
mutex_unlock(&mdsc->mutex);
dout("do_request waiting\n");
if (req->r_timeout) {
- err = (long)wait_for_completion_interruptible_timeout(
+ err = (long)wait_for_completion_killable_timeout(
&req->r_completion, req->r_timeout);
if (err == 0)
err = -EIO;
} else {
- err = wait_for_completion_interruptible(&req->r_completion);
+ err = wait_for_completion_killable(&req->r_completion);
}
dout("do_request waited, got %d\n", err);
mutex_lock(&mdsc->mutex);
@@ -1907,7 +1932,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
if (head->safe) {
req->r_got_safe = true;
__unregister_request(mdsc, req);
- complete(&req->r_safe_completion);
+ complete_all(&req->r_safe_completion);
if (req->r_got_unsafe) {
/*
@@ -1922,7 +1947,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
/* last unsafe request during umount? */
if (mdsc->stopping && !__get_oldest_req(mdsc))
- complete(&mdsc->safe_umount_waiters);
+ complete_all(&mdsc->safe_umount_waiters);
mutex_unlock(&mdsc->mutex);
goto out;
}
@@ -1980,7 +2005,7 @@ out_err:
}
mutex_unlock(&mdsc->mutex);
- add_cap_releases(mdsc, req->r_session, -1);
+ ceph_add_cap_releases(mdsc, req->r_session, -1);
mutex_unlock(&session->s_mutex);
/* kick calling process */
@@ -2014,16 +2039,21 @@ static void handle_forward(struct ceph_mds_client *mdsc,
mutex_lock(&mdsc->mutex);
req = __lookup_request(mdsc, tid);
if (!req) {
- dout("forward %llu to mds%d - req dne\n", tid, next_mds);
+ dout("forward tid %llu to mds%d - req dne\n", tid, next_mds);
goto out; /* dup reply? */
}
- if (fwd_seq <= req->r_num_fwd) {
- dout("forward %llu to mds%d - old seq %d <= %d\n",
+ if (req->r_aborted) {
+ dout("forward tid %llu aborted, unregistering\n", tid);
+ __unregister_request(mdsc, req);
+ } else if (fwd_seq <= req->r_num_fwd) {
+ dout("forward tid %llu to mds%d - old seq %d <= %d\n",
tid, next_mds, req->r_num_fwd, fwd_seq);
} else {
/* resend. forward race not possible; mds would drop */
- dout("forward %llu to mds%d (we resend)\n", tid, next_mds);
+ dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds);
+ BUG_ON(req->r_err);
+ BUG_ON(req->r_got_result);
req->r_num_fwd = fwd_seq;
req->r_resend_mds = next_mds;
put_request_session(req);
@@ -2096,7 +2126,7 @@ static void handle_session(struct ceph_mds_session *session,
pr_info("mds%d reconnect denied\n", session->s_mds);
remove_session_caps(session);
wake = 1; /* for good measure */
- complete(&mdsc->session_close_waiters);
+ complete_all(&mdsc->session_close_waiters);
kick_requests(mdsc, mds);
break;
@@ -2428,6 +2458,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
struct ceph_dentry_info *di;
int mds = session->s_mds;
struct ceph_mds_lease *h = msg->front.iov_base;
+ u32 seq;
struct ceph_vino vino;
int mask;
struct qstr dname;
@@ -2441,6 +2472,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
vino.ino = le64_to_cpu(h->ino);
vino.snap = CEPH_NOSNAP;
mask = le16_to_cpu(h->mask);
+ seq = le32_to_cpu(h->seq);
dname.name = (void *)h + sizeof(*h) + sizeof(u32);
dname.len = msg->front.iov_len - sizeof(*h) - sizeof(u32);
if (dname.len != get_unaligned_le32(h+1))
@@ -2451,8 +2483,9 @@ static void handle_lease(struct ceph_mds_client *mdsc,
/* lookup inode */
inode = ceph_find_inode(sb, vino);
- dout("handle_lease '%s', mask %d, ino %llx %p\n",
- ceph_lease_op_name(h->action), mask, vino.ino, inode);
+ dout("handle_lease %s, mask %d, ino %llx %p %.*s\n",
+ ceph_lease_op_name(h->action), mask, vino.ino, inode,
+ dname.len, dname.name);
if (inode == NULL) {
dout("handle_lease no inode %llx\n", vino.ino);
goto release;
@@ -2477,7 +2510,8 @@ static void handle_lease(struct ceph_mds_client *mdsc,
switch (h->action) {
case CEPH_MDS_LEASE_REVOKE:
if (di && di->lease_session == session) {
- h->seq = cpu_to_le32(di->lease_seq);
+ if (ceph_seq_cmp(di->lease_seq, seq) > 0)
+ h->seq = cpu_to_le32(di->lease_seq);
__ceph_mdsc_drop_dentry_lease(dentry);
}
release = 1;
@@ -2491,7 +2525,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
unsigned long duration =
le32_to_cpu(h->duration_ms) * HZ / 1000;
- di->lease_seq = le32_to_cpu(h->seq);
+ di->lease_seq = seq;
dentry->d_time = di->lease_renew_from + duration;
di->lease_renew_after = di->lease_renew_from +
(duration >> 1);
@@ -2541,7 +2575,7 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
return;
lease = msg->front.iov_base;
lease->action = action;
- lease->mask = cpu_to_le16(CEPH_LOCK_DN);
+ lease->mask = cpu_to_le16(1);
lease->ino = cpu_to_le64(ceph_vino(inode).ino);
lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap);
lease->seq = cpu_to_le32(seq);
@@ -2571,7 +2605,7 @@ void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode,
BUG_ON(inode == NULL);
BUG_ON(dentry == NULL);
- BUG_ON(mask != CEPH_LOCK_DN);
+ BUG_ON(mask == 0);
/* is dentry lease valid? */
spin_lock(&dentry->d_lock);
@@ -2681,10 +2715,10 @@ static void delayed_work(struct work_struct *work)
send_renew_caps(mdsc, s);
else
ceph_con_keepalive(&s->s_con);
- add_cap_releases(mdsc, s, -1);
+ ceph_add_cap_releases(mdsc, s, -1);
if (s->s_state == CEPH_MDS_SESSION_OPEN ||
s->s_state == CEPH_MDS_SESSION_HUNG)
- send_cap_releases(mdsc, s);
+ ceph_send_cap_releases(mdsc, s);
mutex_unlock(&s->s_mutex);
ceph_put_mds_session(s);
@@ -2774,6 +2808,12 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
drop_leases(mdsc);
ceph_flush_dirty_caps(mdsc);
wait_requests(mdsc);
+
+ /*
+ * wait for reply handlers to drop their request refs and
+ * their inode/dcache refs
+ */
+ ceph_msgr_flush();
}
/*
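The replay path above reuses the original request message and truncates it at r_request_release_offset, recorded when the message was first built, so cap and dentry releases are not sent a second time. A minimal sketch of that truncation idea; struct msg and struct request here are illustrative, not the kernel's ceph_msg:

	#include <stddef.h>

	struct msg {
		char buf[4096];
		size_t len;
	};

	struct request {
		struct msg m;
		size_t release_offset;	/* noted at first build */
	};

	/* on replay, cut the message back so no releases go out twice */
	static void prepare_replay(struct request *req)
	{
		req->m.len = req->release_offset;
	}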
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index d9936c4f1212..952410c60d09 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -188,6 +188,7 @@ struct ceph_mds_request {
int r_old_inode_drop, r_old_inode_unless;
struct ceph_msg *r_request; /* original request */
+ int r_request_release_offset;
struct ceph_msg *r_reply;
struct ceph_mds_reply_info_parsed r_reply_info;
int r_err;
@@ -322,6 +323,12 @@ static inline void ceph_mdsc_put_request(struct ceph_mds_request *req)
kref_put(&req->r_kref, ceph_mdsc_release_request);
}
+extern int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session,
+ int extra);
+extern void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session);
+
extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc);
extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index 60b74839ebec..15167b2daa55 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -43,7 +43,8 @@ static void ceph_fault(struct ceph_connection *con);
* nicely render a sockaddr as a string.
*/
#define MAX_ADDR_STR 20
-static char addr_str[MAX_ADDR_STR][40];
+#define MAX_ADDR_STR_LEN 60
+static char addr_str[MAX_ADDR_STR][MAX_ADDR_STR_LEN];
static DEFINE_SPINLOCK(addr_str_lock);
static int last_addr_str;
@@ -52,7 +53,6 @@ const char *pr_addr(const struct sockaddr_storage *ss)
int i;
char *s;
struct sockaddr_in *in4 = (void *)ss;
- unsigned char *quad = (void *)&in4->sin_addr.s_addr;
struct sockaddr_in6 *in6 = (void *)ss;
spin_lock(&addr_str_lock);
@@ -64,25 +64,13 @@ const char *pr_addr(const struct sockaddr_storage *ss)
switch (ss->ss_family) {
case AF_INET:
- sprintf(s, "%u.%u.%u.%u:%u",
- (unsigned int)quad[0],
- (unsigned int)quad[1],
- (unsigned int)quad[2],
- (unsigned int)quad[3],
- (unsigned int)ntohs(in4->sin_port));
+ snprintf(s, MAX_ADDR_STR_LEN, "%pI4:%u", &in4->sin_addr,
+ (unsigned int)ntohs(in4->sin_port));
break;
case AF_INET6:
- sprintf(s, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x:%u",
- in6->sin6_addr.s6_addr16[0],
- in6->sin6_addr.s6_addr16[1],
- in6->sin6_addr.s6_addr16[2],
- in6->sin6_addr.s6_addr16[3],
- in6->sin6_addr.s6_addr16[4],
- in6->sin6_addr.s6_addr16[5],
- in6->sin6_addr.s6_addr16[6],
- in6->sin6_addr.s6_addr16[7],
- (unsigned int)ntohs(in6->sin6_port));
+ snprintf(s, MAX_ADDR_STR_LEN, "[%pI6c]:%u", &in6->sin6_addr,
+ (unsigned int)ntohs(in6->sin6_port));
break;
default:
@@ -120,6 +108,12 @@ void ceph_msgr_exit(void)
destroy_workqueue(ceph_msgr_wq);
}
+void ceph_msgr_flush(void)
+{
+ flush_workqueue(ceph_msgr_wq);
+}
+
+
/*
* socket callback functions
*/
@@ -209,12 +203,13 @@ static void set_sock_callbacks(struct socket *sock,
*/
static struct socket *ceph_tcp_connect(struct ceph_connection *con)
{
- struct sockaddr *paddr = (struct sockaddr *)&con->peer_addr.in_addr;
+ struct sockaddr_storage *paddr = &con->peer_addr.in_addr;
struct socket *sock;
int ret;
BUG_ON(con->sock);
- ret = sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
+ ret = sock_create_kern(con->peer_addr.in_addr.ss_family, SOCK_STREAM,
+ IPPROTO_TCP, &sock);
if (ret)
return ERR_PTR(ret);
con->sock = sock;
@@ -228,7 +223,8 @@ static struct socket *ceph_tcp_connect(struct ceph_connection *con)
dout("connect %s\n", pr_addr(&con->peer_addr.in_addr));
- ret = sock->ops->connect(sock, paddr, sizeof(*paddr), O_NONBLOCK);
+ ret = sock->ops->connect(sock, (struct sockaddr *)paddr, sizeof(*paddr),
+ O_NONBLOCK);
if (ret == -EINPROGRESS) {
dout("connect %s EINPROGRESS sk_state = %u\n",
pr_addr(&con->peer_addr.in_addr),
@@ -651,7 +647,7 @@ static void prepare_write_connect(struct ceph_messenger *msgr,
dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
con->connect_seq, global_seq, proto);
- con->out_connect.features = CEPH_FEATURE_SUPPORTED_CLIENT;
+ con->out_connect.features = cpu_to_le64(CEPH_FEATURE_SUPPORTED_CLIENT);
con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
con->out_connect.global_seq = cpu_to_le32(global_seq);
@@ -1003,19 +999,32 @@ int ceph_parse_ips(const char *c, const char *end,
struct sockaddr_in *in4 = (void *)ss;
struct sockaddr_in6 *in6 = (void *)ss;
int port;
+ char delim = ',';
+
+ if (*p == '[') {
+ delim = ']';
+ p++;
+ }
memset(ss, 0, sizeof(*ss));
if (in4_pton(p, end - p, (u8 *)&in4->sin_addr.s_addr,
- ',', &ipend)) {
+ delim, &ipend))
ss->ss_family = AF_INET;
- } else if (in6_pton(p, end - p, (u8 *)&in6->sin6_addr.s6_addr,
- ',', &ipend)) {
+ else if (in6_pton(p, end - p, (u8 *)&in6->sin6_addr.s6_addr,
+ delim, &ipend))
ss->ss_family = AF_INET6;
- } else {
+ else
goto bad;
- }
p = ipend;
+ if (delim == ']') {
+ if (*p != ']') {
+ dout("missing matching ']'\n");
+ goto bad;
+ }
+ p++;
+ }
+
/* port? */
if (p < end && *p == ':') {
port = 0;
@@ -1049,7 +1058,7 @@ int ceph_parse_ips(const char *c, const char *end,
return 0;
bad:
- pr_err("parse_ips bad ip '%s'\n", c);
+ pr_err("parse_ips bad ip '%.*s'\n", (int)(end - c), c);
return -EINVAL;
}
@@ -1390,10 +1399,12 @@ static int read_partial_message(struct ceph_connection *con)
if (!con->in_msg) {
dout("got hdr type %d front %d data %d\n", con->in_hdr.type,
con->in_hdr.front_len, con->in_hdr.data_len);
+ skip = 0;
con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip);
if (skip) {
/* skip this message */
dout("alloc_msg said skip message\n");
+ BUG_ON(con->in_msg);
con->in_base_pos = -front_len - middle_len - data_len -
sizeof(m->footer);
con->in_tag = CEPH_MSGR_TAG_READY;
@@ -2007,20 +2018,20 @@ void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg)
{
mutex_lock(&con->mutex);
if (!list_empty(&msg->list_head)) {
- dout("con_revoke %p msg %p\n", con, msg);
+ dout("con_revoke %p msg %p - was on queue\n", con, msg);
list_del_init(&msg->list_head);
ceph_msg_put(msg);
msg->hdr.seq = 0;
- if (con->out_msg == msg) {
- ceph_msg_put(con->out_msg);
- con->out_msg = NULL;
- }
+ }
+ if (con->out_msg == msg) {
+ dout("con_revoke %p msg %p - was sending\n", con, msg);
+ con->out_msg = NULL;
if (con->out_kvec_is_msg) {
con->out_skip = con->out_kvec_bytes;
con->out_kvec_is_msg = false;
}
- } else {
- dout("con_revoke %p msg %p - not queued (sent?)\n", con, msg);
+ ceph_msg_put(msg);
+ msg->hdr.seq = 0;
}
mutex_unlock(&con->mutex);
}
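ceph_parse_ips() now accepts bracketed IPv6 addresses of the form [::1]:6789: a leading '[' switches the scan delimiter from ',' to ']', and the matching bracket must be present before any port suffix. A user-space sketch of just the delimiter handling (parse_one() is a stand-in; the kernel scans with in4_pton()/in6_pton()):

	#include <stdio.h>
	#include <string.h>

	static int parse_one(const char *p)
	{
		char delim = ',';

		if (*p == '[') {
			delim = ']';
			p++;
		}
		/* in4_pton()/in6_pton() would scan up to delim here */
		if (delim == ']' && !strchr(p, ']'))
			return -1;	/* missing matching ']' */
		return 0;
	}

	int main(void)
	{
		printf("%d %d\n", parse_one("[::1]:6789"),
		       parse_one("[::1:6789"));
		return 0;
	}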
diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h
index 00a9430b1ffc..76fbc957bc13 100644
--- a/fs/ceph/messenger.h
+++ b/fs/ceph/messenger.h
@@ -213,6 +213,7 @@ extern int ceph_parse_ips(const char *c, const char *end,
extern int ceph_msgr_init(void);
extern void ceph_msgr_exit(void);
+extern void ceph_msgr_flush(void);
extern struct ceph_messenger *ceph_messenger_create(
struct ceph_entity_addr *myaddr);
diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
index f6510a476e7e..54fe01c50706 100644
--- a/fs/ceph/mon_client.c
+++ b/fs/ceph/mon_client.c
@@ -345,7 +345,7 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc,
out:
mutex_unlock(&monc->mutex);
- wake_up(&client->auth_wq);
+ wake_up_all(&client->auth_wq);
}
/*
@@ -400,6 +400,8 @@ static void release_generic_request(struct kref *kref)
ceph_msg_put(req->reply);
if (req->request)
ceph_msg_put(req->request);
+
+ kfree(req);
}
static void put_generic_request(struct ceph_mon_generic_request *req)
@@ -460,7 +462,7 @@ static void handle_statfs_reply(struct ceph_mon_client *monc,
}
mutex_unlock(&monc->mutex);
if (req) {
- complete(&req->completion);
+ complete_all(&req->completion);
put_generic_request(req);
}
return;
@@ -704,8 +706,11 @@ static void handle_auth_reply(struct ceph_mon_client *monc,
struct ceph_msg *msg)
{
int ret;
+ int was_auth = 0;
mutex_lock(&monc->mutex);
+ if (monc->auth->ops)
+ was_auth = monc->auth->ops->is_authenticated(monc->auth);
monc->pending_auth = 0;
ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
msg->front.iov_len,
@@ -713,14 +718,15 @@ static void handle_auth_reply(struct ceph_mon_client *monc,
monc->m_auth->front_max);
if (ret < 0) {
monc->client->auth_err = ret;
- wake_up(&monc->client->auth_wq);
+ wake_up_all(&monc->client->auth_wq);
} else if (ret > 0) {
__send_prepared_auth_request(monc, ret);
- } else if (monc->auth->ops->is_authenticated(monc->auth)) {
+ } else if (!was_auth && monc->auth->ops->is_authenticated(monc->auth)) {
dout("authenticated, starting session\n");
monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
- monc->client->msgr->inst.name.num = monc->auth->global_id;
+ monc->client->msgr->inst.name.num =
+ cpu_to_le64(monc->auth->global_id);
__send_subscribe(monc);
__resend_generic_request(monc);
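handle_auth_reply() now samples is_authenticated() before processing the reply and starts the session only on the false-to-true transition, so subsequent auth replies (e.g. ticket renewals) do not re-subscribe. A toy edge-trigger model of that logic, with illustrative names:

	#include <stdbool.h>
	#include <stdio.h>

	static bool authed;

	static void handle_auth_reply(bool now_authed)
	{
		bool was_auth = authed;

		authed = now_authed;
		if (!was_auth && now_authed)
			printf("authenticated, starting session\n");
	}

	int main(void)
	{
		handle_auth_reply(true);	/* starts the session */
		handle_auth_reply(true);	/* renewal: no restart */
		return 0;
	}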
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index afa7bb3895c4..e38522347898 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -361,8 +361,13 @@ static void put_osd(struct ceph_osd *osd)
{
dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref),
atomic_read(&osd->o_ref) - 1);
- if (atomic_dec_and_test(&osd->o_ref))
+ if (atomic_dec_and_test(&osd->o_ref)) {
+ struct ceph_auth_client *ac = osd->o_osdc->client->monc.auth;
+
+ if (osd->o_authorizer)
+ ac->ops->destroy_authorizer(ac, osd->o_authorizer);
kfree(osd);
+ }
}
/*
@@ -857,12 +862,12 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
if (req->r_callback)
req->r_callback(req, msg);
else
- complete(&req->r_completion);
+ complete_all(&req->r_completion);
if (flags & CEPH_OSD_FLAG_ONDISK) {
if (req->r_safe_callback)
req->r_safe_callback(req, msg);
- complete(&req->r_safe_completion); /* fsync waiter */
+ complete_all(&req->r_safe_completion); /* fsync waiter */
}
done:
@@ -1078,7 +1083,7 @@ done:
if (newmap)
kick_requests(osdc, NULL);
up_read(&osdc->map_sem);
- wake_up(&osdc->client->auth_wq);
+ wake_up_all(&osdc->client->auth_wq);
return;
bad:
@@ -1339,7 +1344,7 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
int type = le16_to_cpu(msg->hdr.type);
if (!osd)
- return;
+ goto out;
osdc = osd->o_osdc;
switch (type) {
@@ -1354,6 +1359,7 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
pr_err("received unknown message type %d %s\n", type,
ceph_msg_type_name(type));
}
+out:
ceph_msg_put(msg);
}
diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
index cfdd8f4388b7..416d46adbf87 100644
--- a/fs/ceph/osdmap.c
+++ b/fs/ceph/osdmap.c
@@ -568,6 +568,7 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
if (ev > CEPH_PG_POOL_VERSION) {
pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
ev, CEPH_PG_POOL_VERSION);
+ kfree(pi);
goto bad;
}
__decode_pool(p, pi);
@@ -706,7 +707,8 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
len, *p, end);
newcrush = crush_decode(*p, min(*p+len, end));
if (IS_ERR(newcrush))
- return ERR_PTR(PTR_ERR(newcrush));
+ return ERR_CAST(newcrush);
+ *p += len;
}
/* new flags? */
@@ -829,12 +831,13 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
/* remove any? */
while (rbp && pgid_cmp(rb_entry(rbp, struct ceph_pg_mapping,
node)->pgid, pgid) <= 0) {
- struct rb_node *cur = rbp;
+ struct ceph_pg_mapping *cur =
+ rb_entry(rbp, struct ceph_pg_mapping, node);
+
rbp = rb_next(rbp);
- dout(" removed pg_temp %llx\n",
- *(u64 *)&rb_entry(cur, struct ceph_pg_mapping,
- node)->pgid);
- rb_erase(cur, &map->pg_temp);
+ dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid);
+ rb_erase(&cur->node, &map->pg_temp);
+ kfree(cur);
}
if (pglen) {
@@ -850,19 +853,22 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
for (j = 0; j < pglen; j++)
pg->osds[j] = ceph_decode_32(p);
err = __insert_pg_mapping(pg, &map->pg_temp);
- if (err)
+ if (err) {
+ kfree(pg);
goto bad;
+ }
dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid,
pglen);
}
}
while (rbp) {
- struct rb_node *cur = rbp;
+ struct ceph_pg_mapping *cur =
+ rb_entry(rbp, struct ceph_pg_mapping, node);
+
rbp = rb_next(rbp);
- dout(" removed pg_temp %llx\n",
- *(u64 *)&rb_entry(cur, struct ceph_pg_mapping,
- node)->pgid);
- rb_erase(cur, &map->pg_temp);
+ dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid);
+ rb_erase(&cur->node, &map->pg_temp);
+ kfree(cur);
}
/* ignore the rest */
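Both pg_temp removal loops previously erased the rb_node but never freed the ceph_pg_mapping containing it; converting the cursor to the container makes the kfree() possible. A compilable sketch of the erase-then-free pattern with a hand-rolled container_of() (the rbtree types are stand-ins for the kernel's):

	#include <stdlib.h>
	#include <stddef.h>

	struct rb_node { struct rb_node *left, *right; };

	struct pg_mapping {
		unsigned long long pgid;
		struct rb_node node;	/* embedded in the map's rbtree */
	};

	#define container_of(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))

	static void remove_mapping(struct rb_node *n)
	{
		struct pg_mapping *cur =
			container_of(n, struct pg_mapping, node);

		/* rb_erase(&cur->node, &tree) detaches it here ... */
		free(cur);	/* ... and this free is what was missing */
	}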
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 7c663d9b9f81..fa87f51e38e1 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -89,7 +89,7 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_files = le64_to_cpu(st.num_objects);
buf->f_ffree = -1;
- buf->f_namelen = PATH_MAX;
+ buf->f_namelen = NAME_MAX;
buf->f_frsize = PAGE_CACHE_SIZE;
/* leave fsid little-endian, regardless of host endianness */
@@ -669,9 +669,17 @@ static void ceph_destroy_client(struct ceph_client *client)
/* unmount */
ceph_mdsc_stop(&client->mdsc);
- ceph_monc_stop(&client->monc);
ceph_osdc_stop(&client->osdc);
+ /*
+ * make sure mds and osd connections close out before destroying
+ * the auth module, which is needed to free those connections'
+ * ceph_authorizers.
+ */
+ ceph_msgr_flush();
+
+ ceph_monc_stop(&client->monc);
+
ceph_adjust_min_caps(-client->min_caps);
ceph_debugfs_client_cleanup(client);
@@ -738,7 +746,7 @@ static struct dentry *open_root_dentry(struct ceph_client *client,
dout("open_root_inode opening '%s'\n", path);
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
if (IS_ERR(req))
- return ERR_PTR(PTR_ERR(req));
+ return ERR_CAST(req);
req->r_path1 = kstrdup(path, GFP_NOFS);
req->r_ino1.ino = CEPH_INO_ROOT;
req->r_ino1.snap = CEPH_NOSNAP;
@@ -918,7 +926,7 @@ static int ceph_compare_super(struct super_block *sb, void *data)
/*
* construct our own bdi so we can control readahead, etc.
*/
-static atomic_long_t bdi_seq = ATOMIC_INIT(0);
+static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client)
{
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 3725c9ee9d08..10a4a406e887 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -10,7 +10,6 @@
#include <linux/fs.h>
#include <linux/mempool.h>
#include <linux/pagemap.h>
-#include <linux/slab.h>
#include <linux/wait.h>
#include <linux/writeback.h>
#include <linux/slab.h>
@@ -811,7 +810,7 @@ extern void ceph_put_cap(struct ceph_cap *cap);
extern void ceph_queue_caps_release(struct inode *inode);
extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc);
-extern int ceph_fsync(struct file *file, struct dentry *dentry, int datasync);
+extern int ceph_fsync(struct file *file, int datasync);
extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session);
extern int ceph_get_cap_mds(struct inode *inode);
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 80f352596807..5739fd7f88b4 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -131,6 +131,15 @@ config CIFS_DFS_UPCALL
IP addresses) which is needed for implicit mounts of DFS junction
points. If unsure, say N.
+config CIFS_FSCACHE
+ bool "Provide CIFS client caching support (EXPERIMENTAL)"
+ depends on EXPERIMENTAL
+ depends on CIFS=m && FSCACHE || CIFS=y && FSCACHE=y
+ help
+ Makes CIFS FS-Cache capable. Say Y here if you want your CIFS data
+ to be cached locally on disk through the general filesystem cache
+ manager. If unsure, say N.
+
config CIFS_EXPERIMENTAL
bool "CIFS Experimental Features (EXPERIMENTAL)"
depends on CIFS && EXPERIMENTAL
diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile
index 9948c0030e86..adefa60a9bdc 100644
--- a/fs/cifs/Makefile
+++ b/fs/cifs/Makefile
@@ -11,3 +11,5 @@ cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \
cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o
cifs-$(CONFIG_CIFS_DFS_UPCALL) += dns_resolve.o cifs_dfs_ref.o
+
+cifs-$(CONFIG_CIFS_FSCACHE) += fscache.o cache.o
diff --git a/fs/cifs/cache.c b/fs/cifs/cache.c
new file mode 100644
index 000000000000..224d7bbd1fcc
--- /dev/null
+++ b/fs/cifs/cache.c
@@ -0,0 +1,331 @@
+/*
+ * fs/cifs/cache.c - CIFS filesystem cache index structure definitions
+ *
+ * Copyright (c) 2010 Novell, Inc.
+ * Author(s): Suresh Jayaraman <sjayaraman@suse.de>
+ *
+ * This library is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#include "fscache.h"
+#include "cifs_debug.h"
+
+/*
+ * CIFS filesystem definition for FS-Cache
+ */
+struct fscache_netfs cifs_fscache_netfs = {
+ .name = "cifs",
+ .version = 0,
+};
+
+/*
+ * Register CIFS for caching with FS-Cache
+ */
+int cifs_fscache_register(void)
+{
+ return fscache_register_netfs(&cifs_fscache_netfs);
+}
+
+/*
+ * Unregister CIFS for caching
+ */
+void cifs_fscache_unregister(void)
+{
+ fscache_unregister_netfs(&cifs_fscache_netfs);
+}
+
+/*
+ * Key layout of CIFS server cache index object
+ */
+struct cifs_server_key {
+ uint16_t family; /* address family */
+ uint16_t port; /* IP port */
+ union {
+ struct in_addr ipv4_addr;
+ struct in6_addr ipv6_addr;
+ } addr[0];
+};
+
+/*
+ * Server object keyed by {IPaddress,port,family} tuple
+ */
+static uint16_t cifs_server_get_key(const void *cookie_netfs_data,
+ void *buffer, uint16_t maxbuf)
+{
+ const struct TCP_Server_Info *server = cookie_netfs_data;
+ const struct sockaddr *sa = (struct sockaddr *) &server->addr.sockAddr;
+ struct cifs_server_key *key = buffer;
+ uint16_t key_len = sizeof(struct cifs_server_key);
+
+ memset(key, 0, key_len);
+
+ /*
+ * Should not be a problem as sin_family/sin6_family overlays
+ * sa_family field
+ */
+ switch (sa->sa_family) {
+ case AF_INET:
+ key->family = server->addr.sockAddr.sin_family;
+ key->port = server->addr.sockAddr.sin_port;
+ key->addr[0].ipv4_addr = server->addr.sockAddr.sin_addr;
+ key_len += sizeof(key->addr[0].ipv4_addr);
+ break;
+
+ case AF_INET6:
+ key->family = server->addr.sockAddr6.sin6_family;
+ key->port = server->addr.sockAddr6.sin6_port;
+ key->addr[0].ipv6_addr = server->addr.sockAddr6.sin6_addr;
+ key_len += sizeof(key->addr[0].ipv6_addr);
+ break;
+
+ default:
+ cERROR(1, "CIFS: Unknown network family '%d'", sa->sa_family);
+ key_len = 0;
+ break;
+ }
+
+ return key_len;
+}
+
+/*
+ * Server object for FS-Cache
+ */
+const struct fscache_cookie_def cifs_fscache_server_index_def = {
+ .name = "CIFS.server",
+ .type = FSCACHE_COOKIE_TYPE_INDEX,
+ .get_key = cifs_server_get_key,
+};
+
+/*
+ * Auxiliary data attached to CIFS superblock within the cache
+ */
+struct cifs_fscache_super_auxdata {
+ u64 resource_id; /* unique server resource id */
+};
+
+static char *extract_sharename(const char *treename)
+{
+ const char *src;
+ char *delim, *dst;
+ int len;
+
+ /* skip double chars at the beginning */
+ src = treename + 2;
+
+ /* share name is always preceded by '\\' now */
+ delim = strchr(src, '\\');
+ if (!delim)
+ return ERR_PTR(-EINVAL);
+ delim++;
+ len = strlen(delim);
+
+ /* caller has to free the memory */
+ dst = kstrndup(delim, len, GFP_KERNEL);
+ if (!dst)
+ return ERR_PTR(-ENOMEM);
+
+ return dst;
+}
+
+/*
+ * Superblock object currently keyed by share name
+ */
+static uint16_t cifs_super_get_key(const void *cookie_netfs_data, void *buffer,
+ uint16_t maxbuf)
+{
+ const struct cifsTconInfo *tcon = cookie_netfs_data;
+ char *sharename;
+ uint16_t len;
+
+ sharename = extract_sharename(tcon->treeName);
+ if (IS_ERR(sharename)) {
+ cFYI(1, "CIFS: couldn't extract sharename\n");
+ sharename = NULL;
+ return 0;
+ }
+
+ len = strlen(sharename);
+ if (len > maxbuf)
+ return 0;
+
+ memcpy(buffer, sharename, len);
+
+ kfree(sharename);
+
+ return len;
+}
+
+static uint16_t
+cifs_fscache_super_get_aux(const void *cookie_netfs_data, void *buffer,
+ uint16_t maxbuf)
+{
+ struct cifs_fscache_super_auxdata auxdata;
+ const struct cifsTconInfo *tcon = cookie_netfs_data;
+
+ memset(&auxdata, 0, sizeof(auxdata));
+ auxdata.resource_id = tcon->resource_id;
+
+ if (maxbuf > sizeof(auxdata))
+ maxbuf = sizeof(auxdata);
+
+ memcpy(buffer, &auxdata, maxbuf);
+
+ return maxbuf;
+}
+
+static enum
+fscache_checkaux cifs_fscache_super_check_aux(void *cookie_netfs_data,
+ const void *data,
+ uint16_t datalen)
+{
+ struct cifs_fscache_super_auxdata auxdata;
+ const struct cifsTconInfo *tcon = cookie_netfs_data;
+
+ if (datalen != sizeof(auxdata))
+ return FSCACHE_CHECKAUX_OBSOLETE;
+
+ memset(&auxdata, 0, sizeof(auxdata));
+ auxdata.resource_id = tcon->resource_id;
+
+ if (memcmp(data, &auxdata, datalen) != 0)
+ return FSCACHE_CHECKAUX_OBSOLETE;
+
+ return FSCACHE_CHECKAUX_OKAY;
+}
+
+/*
+ * Superblock object for FS-Cache
+ */
+const struct fscache_cookie_def cifs_fscache_super_index_def = {
+ .name = "CIFS.super",
+ .type = FSCACHE_COOKIE_TYPE_INDEX,
+ .get_key = cifs_super_get_key,
+ .get_aux = cifs_fscache_super_get_aux,
+ .check_aux = cifs_fscache_super_check_aux,
+};
+
+/*
+ * Auxiliary data attached to CIFS inode within the cache
+ */
+struct cifs_fscache_inode_auxdata {
+ struct timespec last_write_time;
+ struct timespec last_change_time;
+ u64 eof;
+};
+
+static uint16_t cifs_fscache_inode_get_key(const void *cookie_netfs_data,
+ void *buffer, uint16_t maxbuf)
+{
+ const struct cifsInodeInfo *cifsi = cookie_netfs_data;
+ uint16_t keylen;
+
+ /* use the UniqueId as the key */
+ keylen = sizeof(cifsi->uniqueid);
+ if (keylen > maxbuf)
+ keylen = 0;
+ else
+ memcpy(buffer, &cifsi->uniqueid, keylen);
+
+ return keylen;
+}
+
+static void
+cifs_fscache_inode_get_attr(const void *cookie_netfs_data, uint64_t *size)
+{
+ const struct cifsInodeInfo *cifsi = cookie_netfs_data;
+
+ *size = cifsi->vfs_inode.i_size;
+}
+
+static uint16_t
+cifs_fscache_inode_get_aux(const void *cookie_netfs_data, void *buffer,
+ uint16_t maxbuf)
+{
+ struct cifs_fscache_inode_auxdata auxdata;
+ const struct cifsInodeInfo *cifsi = cookie_netfs_data;
+
+ memset(&auxdata, 0, sizeof(auxdata));
+ auxdata.eof = cifsi->server_eof;
+ auxdata.last_write_time = cifsi->vfs_inode.i_mtime;
+ auxdata.last_change_time = cifsi->vfs_inode.i_ctime;
+
+ if (maxbuf > sizeof(auxdata))
+ maxbuf = sizeof(auxdata);
+
+ memcpy(buffer, &auxdata, maxbuf);
+
+ return maxbuf;
+}
+
+static enum
+fscache_checkaux cifs_fscache_inode_check_aux(void *cookie_netfs_data,
+ const void *data,
+ uint16_t datalen)
+{
+ struct cifs_fscache_inode_auxdata auxdata;
+ struct cifsInodeInfo *cifsi = cookie_netfs_data;
+
+ if (datalen != sizeof(auxdata))
+ return FSCACHE_CHECKAUX_OBSOLETE;
+
+ memset(&auxdata, 0, sizeof(auxdata));
+ auxdata.eof = cifsi->server_eof;
+ auxdata.last_write_time = cifsi->vfs_inode.i_mtime;
+ auxdata.last_change_time = cifsi->vfs_inode.i_ctime;
+
+ if (memcmp(data, &auxdata, datalen) != 0)
+ return FSCACHE_CHECKAUX_OBSOLETE;
+
+ return FSCACHE_CHECKAUX_OKAY;
+}
+
+static void cifs_fscache_inode_now_uncached(void *cookie_netfs_data)
+{
+ struct cifsInodeInfo *cifsi = cookie_netfs_data;
+ struct pagevec pvec;
+ pgoff_t first;
+ int loop, nr_pages;
+
+ pagevec_init(&pvec, 0);
+ first = 0;
+
+ cFYI(1, "cifs inode 0x%p now uncached", cifsi);
+
+ for (;;) {
+ nr_pages = pagevec_lookup(&pvec,
+ cifsi->vfs_inode.i_mapping, first,
+ PAGEVEC_SIZE - pagevec_count(&pvec));
+ if (!nr_pages)
+ break;
+
+ for (loop = 0; loop < nr_pages; loop++)
+ ClearPageFsCache(pvec.pages[loop]);
+
+ first = pvec.pages[nr_pages - 1]->index + 1;
+
+ pvec.nr = nr_pages;
+ pagevec_release(&pvec);
+ cond_resched();
+ }
+}
+
+const struct fscache_cookie_def cifs_fscache_inode_object_def = {
+ .name = "CIFS.uniqueid",
+ .type = FSCACHE_COOKIE_TYPE_DATAFILE,
+ .get_key = cifs_fscache_inode_get_key,
+ .get_attr = cifs_fscache_inode_get_attr,
+ .get_aux = cifs_fscache_inode_get_aux,
+ .check_aux = cifs_fscache_inode_check_aux,
+ .now_uncached = cifs_fscache_inode_now_uncached,
+};
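extract_sharename() above reduces a UNC tree name such as \\server\share to just the share component, used as the superblock cache key. A user-space sketch of the same walk (strdup() stands in for kstrndup(), and error returns are simplified to NULL):

	#include <stdio.h>
	#include <string.h>
	#include <stdlib.h>

	static char *extract_sharename(const char *treename)
	{
		const char *src = treename + 2;	/* skip leading "\\" */
		const char *delim = strchr(src, '\\');

		if (!delim)
			return NULL;
		return strdup(delim + 1);	/* caller frees */
	}

	int main(void)
	{
		char *share = extract_sharename("\\\\srv1\\public");

		printf("%s\n", share ? share : "(error)");	/* public */
		free(share);
		return 0;
	}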
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index ac19a6f3dae0..dc1ed50ea06e 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -230,28 +230,22 @@ compose_mount_options_err:
goto compose_mount_options_out;
}
-
-static struct vfsmount *cifs_dfs_do_refmount(const struct vfsmount *mnt_parent,
- struct dentry *dentry, const struct dfs_info3_param *ref)
+/**
+ * cifs_dfs_do_refmount - mounts specified path using provided referral
+ * @cifs_sb: parent/root superblock
+ * @fullpath: full path in UNC format
+ * @ref: server's referral
+ */
+static struct vfsmount *cifs_dfs_do_refmount(struct cifs_sb_info *cifs_sb,
+ const char *fullpath, const struct dfs_info3_param *ref)
{
- struct cifs_sb_info *cifs_sb;
struct vfsmount *mnt;
char *mountdata;
char *devname = NULL;
- char *fullpath;
-
- cifs_sb = CIFS_SB(dentry->d_inode->i_sb);
- /*
- * this function gives us a path with a double backslash prefix. We
- * require a single backslash for DFS.
- */
- fullpath = build_path_from_dentry(dentry);
- if (!fullpath)
- return ERR_PTR(-ENOMEM);
+ /* strip first '\' from fullpath */
mountdata = cifs_compose_mount_options(cifs_sb->mountdata,
fullpath + 1, ref, &devname);
- kfree(fullpath);
if (IS_ERR(mountdata))
return (struct vfsmount *)mountdata;
@@ -357,8 +351,8 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
rc = -EINVAL;
goto out_err;
}
- mnt = cifs_dfs_do_refmount(nd->path.mnt,
- nd->path.dentry, referrals + i);
+ mnt = cifs_dfs_do_refmount(cifs_sb,
+ full_path, referrals + i);
cFYI(1, "%s: cifs_dfs_do_refmount:%s , mnt:%p", __func__,
referrals[i].node_name, mnt);
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index 246a167cb913..9e771450c3b8 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -35,6 +35,7 @@
#define CIFS_MOUNT_DYNPERM 0x1000 /* allow in-memory only mode setting */
#define CIFS_MOUNT_NOPOSIXBRL 0x2000 /* mandatory not posix byte range lock */
#define CIFS_MOUNT_NOSSYNC 0x4000 /* don't do slow SMBflush on every sync*/
+#define CIFS_MOUNT_FSCACHE 0x8000 /* local caching enabled */
struct cifs_sb_info {
struct cifsTconInfo *tcon; /* primary mount */
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 379bd7d9c05f..6effccff85a5 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -144,6 +144,9 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
sprintf(dp, ";uid=0x%x", sesInfo->linux_uid);
dp = description + strlen(description);
+ sprintf(dp, ";creduid=0x%x", sesInfo->cred_uid);
+
+ dp = description + strlen(description);
sprintf(dp, ";user=%s", sesInfo->userName);
dp = description + strlen(description);
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 78c02eb4cb1f..8a2cf129e535 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -47,6 +47,7 @@
#include <linux/key-type.h>
#include "dns_resolve.h"
#include "cifs_spnego.h"
+#include "fscache.h"
#define CIFS_MAGIC_NUMBER 0xFF534D42 /* the first four bytes of SMB PDUs */
int cifsFYI = 0;
@@ -329,6 +330,12 @@ cifs_destroy_inode(struct inode *inode)
}
static void
+cifs_clear_inode(struct inode *inode)
+{
+ cifs_fscache_release_inode_cookie(inode);
+}
+
+static void
cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server)
{
seq_printf(s, ",addr=");
@@ -473,14 +480,25 @@ static int cifs_remount(struct super_block *sb, int *flags, char *data)
return 0;
}
+void cifs_drop_inode(struct inode *inode)
+{
+ struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)
+ return generic_drop_inode(inode);
+
+ return generic_delete_inode(inode);
+}
+
static const struct super_operations cifs_super_ops = {
.put_super = cifs_put_super,
.statfs = cifs_statfs,
.alloc_inode = cifs_alloc_inode,
.destroy_inode = cifs_destroy_inode,
-/* .drop_inode = generic_delete_inode,
- .delete_inode = cifs_delete_inode, */ /* Do not need above two
- functions unless later we add lazy close of inodes or unless the
+ .drop_inode = cifs_drop_inode,
+ .clear_inode = cifs_clear_inode,
+/* .delete_inode = cifs_delete_inode, */ /* Do not need above
+ function unless later we add lazy close of inodes or unless the
kernel forgets to call us with the same number of releases (closes)
as opens */
.show_options = cifs_show_options,
@@ -892,6 +910,10 @@ init_cifs(void)
cFYI(1, "cifs_max_pending set to max of 256");
}
+ rc = cifs_fscache_register();
+ if (rc)
+ goto out;
+
rc = cifs_init_inodecache();
if (rc)
goto out_clean_proc;
@@ -913,7 +935,7 @@ init_cifs(void)
goto out_unregister_filesystem;
#endif
#ifdef CONFIG_CIFS_DFS_UPCALL
- rc = register_key_type(&key_type_dns_resolver);
+ rc = cifs_init_dns_resolver();
if (rc)
goto out_unregister_key_type;
#endif
@@ -925,7 +947,7 @@ init_cifs(void)
out_unregister_resolver_key:
#ifdef CONFIG_CIFS_DFS_UPCALL
- unregister_key_type(&key_type_dns_resolver);
+ cifs_exit_dns_resolver();
out_unregister_key_type:
#endif
#ifdef CONFIG_CIFS_UPCALL
@@ -941,6 +963,8 @@ init_cifs(void)
cifs_destroy_inodecache();
out_clean_proc:
cifs_proc_clean();
+ cifs_fscache_unregister();
+ out:
return rc;
}
@@ -949,9 +973,10 @@ exit_cifs(void)
{
cFYI(DBG2, "exit_cifs");
cifs_proc_clean();
+ cifs_fscache_unregister();
#ifdef CONFIG_CIFS_DFS_UPCALL
cifs_dfs_release_automount_timer();
- unregister_key_type(&key_type_dns_resolver);
+ cifs_exit_dns_resolver();
#endif
#ifdef CONFIG_CIFS_UPCALL
unregister_key_type(&cifs_spnego_key_type);
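cifs_drop_inode() above picks an eviction policy per mount: with server-provided inode numbers the inode may stay cached after the last reference (generic_drop_inode), otherwise it is deleted immediately (generic_delete_inode), since a recycled local inode number could alias a different file. A toy rendering of that decision; the enum and boolean flag are illustrative, not the kernel's mount-flag test:

	#include <stdbool.h>
	#include <stdio.h>

	enum drop_action { DROP_MAY_CACHE, DROP_DELETE };

	static enum drop_action drop_inode(bool server_inode_numbers)
	{
		return server_inode_numbers ? DROP_MAY_CACHE : DROP_DELETE;
	}

	int main(void)
	{
		printf("%d %d\n", drop_inode(true), drop_inode(false));
		return 0;
	}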
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 0242ff9cbf41..d82f5fb4761e 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -84,7 +84,7 @@ extern ssize_t cifs_user_read(struct file *file, char __user *read_data,
extern ssize_t cifs_user_write(struct file *file, const char __user *write_data,
size_t write_size, loff_t *poffset);
extern int cifs_lock(struct file *, int, struct file_lock *);
-extern int cifs_fsync(struct file *, struct dentry *, int);
+extern int cifs_fsync(struct file *, int);
extern int cifs_flush(struct file *, fl_owner_t id);
extern int cifs_file_mmap(struct file * , struct vm_area_struct *);
extern const struct file_operations cifs_dir_ops;
@@ -114,5 +114,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
extern const struct export_operations cifs_export_ops;
#endif /* EXPERIMENTAL */
-#define CIFS_VERSION "1.64"
+#define CIFS_VERSION "1.65"
#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index a88479ceaad5..59906146ad36 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -16,6 +16,9 @@
* the GNU Lesser General Public License for more details.
*
*/
+#ifndef _CIFS_GLOB_H
+#define _CIFS_GLOB_H
+
#include <linux/in.h>
#include <linux/in6.h>
#include <linux/slab.h>
@@ -34,7 +37,7 @@
#define MAX_SHARE_SIZE 64 /* used to be 20, this should still be enough */
#define MAX_USERNAME_SIZE 32 /* 32 is to allow for 15 char names + null
termination then *2 for unicode versions */
-#define MAX_PASSWORD_SIZE 16
+#define MAX_PASSWORD_SIZE 512 /* max for windows seems to be 256 wide chars */
#define CIFS_MIN_RCV_POOL 4
@@ -80,8 +83,7 @@ enum statusEnum {
};
enum securityEnum {
- PLAINTXT = 0, /* Legacy with Plaintext passwords */
- LANMAN, /* Legacy LANMAN auth */
+ LANMAN = 0, /* Legacy LANMAN auth */
NTLM, /* Legacy NTLM012 auth with NTLM hash */
NTLMv2, /* Legacy NTLM auth with NTLMv2 hash */
RawNTLMSSP, /* NTLMSSP without SPNEGO, NTLMv2 hash */
@@ -142,7 +144,6 @@ struct TCP_Server_Info {
struct list_head pending_mid_q;
void *Server_NlsInfo; /* BB - placeholder for future NLS info */
unsigned short server_codepage; /* codepage for the server */
- unsigned long ip_address; /* IP addr for the server if known */
enum protocolEnum protocolType;
char versionMajor;
char versionMinor;
@@ -190,19 +191,9 @@ struct TCP_Server_Info {
bool sec_mskerberos; /* supports legacy MS Kerberos */
bool sec_kerberosu2u; /* supports U2U Kerberos */
bool sec_ntlmssp; /* supports NTLMSSP */
-};
-
-/*
- * The following is our shortcut to user information. We surface the uid,
- * and name. We always get the password on the fly in case it
- * has changed. We also hang a list of sessions owned by this user off here.
- */
-struct cifsUidInfo {
- struct list_head userList;
- struct list_head sessionList; /* SMB sessions for this user */
- uid_t linux_uid;
- char user[MAX_USERNAME_SIZE + 1]; /* ascii name of user */
- /* BB may need ptr or callback for PAM or WinBind info */
+#ifdef CONFIG_CIFS_FSCACHE
+ struct fscache_cookie *fscache; /* client index cache cookie */
+#endif
};
/*
@@ -212,9 +203,6 @@ struct cifsSesInfo {
struct list_head smb_ses_list;
struct list_head tcon_list;
struct mutex session_mutex;
-#if 0
- struct cifsUidInfo *uidInfo; /* pointer to user info */
-#endif
struct TCP_Server_Info *server; /* pointer to server info */
int ses_count; /* reference counter */
enum statusEnum status;
@@ -226,7 +214,8 @@ struct cifsSesInfo {
char *serverNOS; /* name of network operating system of server */
char *serverDomain; /* security realm of server */
int Suid; /* remote smb uid */
- uid_t linux_uid; /* local Linux uid */
+ uid_t linux_uid; /* overriding owner of files on the mount */
+ uid_t cred_uid; /* owner of credentials */
int capabilities;
char serverName[SERVER_NAME_LEN_WITH_NULL * 2]; /* BB make bigger for
TCP names - will ipv6 and sctp addresses fit? */
@@ -311,6 +300,10 @@ struct cifsTconInfo {
bool local_lease:1; /* check leases (only) on local system not remote */
bool broken_posix_open; /* e.g. Samba server versions < 3.3.2, 3.2.9 */
bool need_reconnect:1; /* connection reset, tid now invalid */
+#ifdef CONFIG_CIFS_FSCACHE
+ u64 resource_id; /* server resource id */
+ struct fscache_cookie *fscache; /* cookie for share */
+#endif
/* BB add field for back pointer to sb struct(s)? */
};
@@ -398,6 +391,9 @@ struct cifsInodeInfo {
bool invalid_mapping:1; /* pagecache is invalid */
u64 server_eof; /* current file size on server */
u64 uniqueid; /* server inode number */
+#ifdef CONFIG_CIFS_FSCACHE
+ struct fscache_cookie *fscache;
+#endif
struct inode vfs_inode;
};
@@ -733,3 +729,5 @@ GLOBAL_EXTERN unsigned int cifs_min_small; /* min size of small buf pool */
GLOBAL_EXTERN unsigned int cifs_max_pending; /* MAX requests at once to server*/
extern const struct slow_work_ops cifs_oplock_break_ops;
+
+#endif /* _CIFS_GLOB_H */
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index fb1657e0fdb8..2eaebbd31132 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -86,7 +86,9 @@ extern unsigned int smbCalcSize(struct smb_hdr *ptr);
extern unsigned int smbCalcSize_LE(struct smb_hdr *ptr);
extern int decode_negTokenInit(unsigned char *security_blob, int length,
struct TCP_Server_Info *server);
-extern int cifs_convert_address(char *src, void *dst);
+extern int cifs_convert_address(struct sockaddr *dst, char *src);
+extern int cifs_fill_sockaddr(struct sockaddr *dst, char *src,
+ unsigned short int port);
extern int map_smb_to_linux_error(struct smb_hdr *smb, int logErr);
extern void header_assemble(struct smb_hdr *, char /* command */ ,
const struct cifsTconInfo *, int /* length of
@@ -106,7 +108,6 @@ extern struct cifsFileInfo *cifs_new_fileinfo(struct inode *newinode,
__u16 fileHandle, struct file *file,
struct vfsmount *mnt, unsigned int oflags);
extern int cifs_posix_open(char *full_path, struct inode **pinode,
- struct vfsmount *mnt,
struct super_block *sb,
int mode, int oflags,
__u32 *poplock, __u16 *pnetfid, int xid);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 2208f06e4c45..2a43a0aca965 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -48,6 +48,7 @@
#include "nterr.h"
#include "rfc1002pdu.h"
#include "cn_cifs.h"
+#include "fscache.h"
#define CIFS_PORT 445
#define RFC1001_PORT 139
@@ -66,6 +67,7 @@ struct smb_vol {
char *iocharset; /* local code page for mapping to and from Unicode */
char source_rfc1001_name[16]; /* netbios name of client */
char target_rfc1001_name[16]; /* netbios name of server for Win9x/ME */
+ uid_t cred_uid;
uid_t linux_uid;
gid_t linux_gid;
mode_t file_mode;
@@ -97,6 +99,7 @@ struct smb_vol {
bool noblocksnd:1;
bool noautotune:1;
bool nostrictsync:1; /* do not force expensive SMBflush on every sync */
+ bool fsc:1; /* enable fscache */
unsigned int rsize;
unsigned int wsize;
bool sockopt_tcp_nodelay:1;
@@ -830,7 +833,8 @@ cifs_parse_mount_options(char *options, const char *devname,
/* null target name indicates to use *SMBSERVR default called name
if we end up sending RFC1001 session initialize */
vol->target_rfc1001_name[0] = 0;
- vol->linux_uid = current_uid(); /* use current_euid() instead? */
+ vol->cred_uid = current_uid();
+ vol->linux_uid = current_uid();
vol->linux_gid = current_gid();
/* default to only allowing write access to owner of the mount */
@@ -1257,6 +1261,12 @@ cifs_parse_mount_options(char *options, const char *devname,
} else if ((strnicmp(data, "nocase", 6) == 0) ||
(strnicmp(data, "ignorecase", 10) == 0)) {
vol->nocase = 1;
+ } else if (strnicmp(data, "mand", 4) == 0) {
+ /* ignore */
+ } else if (strnicmp(data, "nomand", 6) == 0) {
+ /* ignore */
+ } else if (strnicmp(data, "_netdev", 7) == 0) {
+ /* ignore */
} else if (strnicmp(data, "brl", 3) == 0) {
vol->nobrl = 0;
} else if ((strnicmp(data, "nobrl", 5) == 0) ||
@@ -1331,6 +1341,8 @@ cifs_parse_mount_options(char *options, const char *devname,
printk(KERN_WARNING "CIFS: Mount option noac not "
"supported. Instead set "
"/proc/fs/cifs/LookupCacheEnabled to 0\n");
+ } else if (strnicmp(data, "fsc", 3) == 0) {
+ vol->fsc = true;
} else
printk(KERN_WARNING "CIFS: Unknown mount option %s\n",
data);
@@ -1380,18 +1392,92 @@ cifs_parse_mount_options(char *options, const char *devname,
return 0;
}
+static bool
+match_address(struct TCP_Server_Info *server, struct sockaddr *addr)
+{
+ struct sockaddr_in *addr4 = (struct sockaddr_in *)addr;
+ struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)addr;
+
+ switch (addr->sa_family) {
+ case AF_INET:
+ if (addr4->sin_addr.s_addr !=
+ server->addr.sockAddr.sin_addr.s_addr)
+ return false;
+ if (addr4->sin_port &&
+ addr4->sin_port != server->addr.sockAddr.sin_port)
+ return false;
+ break;
+ case AF_INET6:
+ if (!ipv6_addr_equal(&addr6->sin6_addr,
+ &server->addr.sockAddr6.sin6_addr))
+ return false;
+ if (addr6->sin6_scope_id !=
+ server->addr.sockAddr6.sin6_scope_id)
+ return false;
+ if (addr6->sin6_port &&
+ addr6->sin6_port != server->addr.sockAddr6.sin6_port)
+ return false;
+ break;
+ }
+
+ return true;
+}
+
+static bool
+match_security(struct TCP_Server_Info *server, struct smb_vol *vol)
+{
+ unsigned int secFlags;
+
+ if (vol->secFlg & (~(CIFSSEC_MUST_SIGN | CIFSSEC_MUST_SEAL)))
+ secFlags = vol->secFlg;
+ else
+ secFlags = global_secflags | vol->secFlg;
+
+ switch (server->secType) {
+ case LANMAN:
+ if (!(secFlags & (CIFSSEC_MAY_LANMAN|CIFSSEC_MAY_PLNTXT)))
+ return false;
+ break;
+ case NTLMv2:
+ if (!(secFlags & CIFSSEC_MAY_NTLMV2))
+ return false;
+ break;
+ case NTLM:
+ if (!(secFlags & CIFSSEC_MAY_NTLM))
+ return false;
+ break;
+ case Kerberos:
+ if (!(secFlags & CIFSSEC_MAY_KRB5))
+ return false;
+ break;
+ case RawNTLMSSP:
+ if (!(secFlags & CIFSSEC_MAY_NTLMSSP))
+ return false;
+ break;
+ default:
+ /* shouldn't happen */
+ return false;
+ }
+
+ /* now check if signing mode is acceptable */
+ if ((secFlags & CIFSSEC_MAY_SIGN) == 0 &&
+ (server->secMode & SECMODE_SIGN_REQUIRED))
+ return false;
+ else if (((secFlags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN) &&
+ (server->secMode &
+ (SECMODE_SIGN_ENABLED|SECMODE_SIGN_REQUIRED)) == 0)
+ return false;
+
+ return true;
+}
+
static struct TCP_Server_Info *
-cifs_find_tcp_session(struct sockaddr_storage *addr, unsigned short int port)
+cifs_find_tcp_session(struct sockaddr *addr, struct smb_vol *vol)
{
- struct list_head *tmp;
struct TCP_Server_Info *server;
- struct sockaddr_in *addr4 = (struct sockaddr_in *) addr;
- struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *) addr;
write_lock(&cifs_tcp_ses_lock);
- list_for_each(tmp, &cifs_tcp_ses_list) {
- server = list_entry(tmp, struct TCP_Server_Info,
- tcp_ses_list);
+ list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) {
/*
* the demux thread can exit on its own while still in CifsNew
* so don't accept any sockets in that state. Since the
@@ -1401,37 +1487,11 @@ cifs_find_tcp_session(struct sockaddr_storage *addr, unsigned short int port)
if (server->tcpStatus == CifsNew)
continue;
- switch (addr->ss_family) {
- case AF_INET:
- if (addr4->sin_addr.s_addr ==
- server->addr.sockAddr.sin_addr.s_addr) {
- addr4->sin_port = htons(port);
- /* user overrode default port? */
- if (addr4->sin_port) {
- if (addr4->sin_port !=
- server->addr.sockAddr.sin_port)
- continue;
- }
- break;
- } else
- continue;
+ if (!match_address(server, addr))
+ continue;
- case AF_INET6:
- if (ipv6_addr_equal(&addr6->sin6_addr,
- &server->addr.sockAddr6.sin6_addr) &&
- (addr6->sin6_scope_id ==
- server->addr.sockAddr6.sin6_scope_id)) {
- addr6->sin6_port = htons(port);
- /* user overrode default port? */
- if (addr6->sin6_port) {
- if (addr6->sin6_port !=
- server->addr.sockAddr6.sin6_port)
- continue;
- }
- break;
- } else
- continue;
- }
+ if (!match_security(server, vol))
+ continue;
++server->srv_count;
write_unlock(&cifs_tcp_ses_lock);
@@ -1460,6 +1520,8 @@ cifs_put_tcp_session(struct TCP_Server_Info *server)
server->tcpStatus = CifsExiting;
spin_unlock(&GlobalMid_Lock);
+ cifs_fscache_release_client_cookie(server);
+
task = xchg(&server->tsk, NULL);
if (task)
force_sig(SIGKILL, task);
@@ -1479,7 +1541,9 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
cFYI(1, "UNC: %s ip: %s", volume_info->UNC, volume_info->UNCip);
if (volume_info->UNCip && volume_info->UNC) {
- rc = cifs_convert_address(volume_info->UNCip, &addr);
+ rc = cifs_fill_sockaddr((struct sockaddr *)&addr,
+ volume_info->UNCip,
+ volume_info->port);
if (!rc) {
/* we failed translating address */
rc = -EINVAL;
@@ -1499,7 +1563,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
}
/* see if we already have a matching tcp_ses */
- tcp_ses = cifs_find_tcp_session(&addr, volume_info->port);
+ tcp_ses = cifs_find_tcp_session((struct sockaddr *)&addr, volume_info);
if (tcp_ses)
return tcp_ses;
@@ -1543,12 +1607,10 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
cFYI(1, "attempting ipv6 connect");
/* BB should we allow ipv6 on port 139? */
/* other OS never observed in Wild doing 139 with v6 */
- sin_server6->sin6_port = htons(volume_info->port);
memcpy(&tcp_ses->addr.sockAddr6, sin_server6,
sizeof(struct sockaddr_in6));
rc = ipv6_connect(tcp_ses);
} else {
- sin_server->sin_port = htons(volume_info->port);
memcpy(&tcp_ses->addr.sockAddr, sin_server,
sizeof(struct sockaddr_in));
rc = ipv4_connect(tcp_ses);
@@ -1577,6 +1639,8 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
list_add(&tcp_ses->tcp_ses_list, &cifs_tcp_ses_list);
write_unlock(&cifs_tcp_ses_lock);
+ cifs_fscache_get_client_cookie(tcp_ses);
+
return tcp_ses;
out_err:
@@ -1591,17 +1655,27 @@ out_err:
}
static struct cifsSesInfo *
-cifs_find_smb_ses(struct TCP_Server_Info *server, char *username)
+cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb_vol *vol)
{
- struct list_head *tmp;
struct cifsSesInfo *ses;
write_lock(&cifs_tcp_ses_lock);
- list_for_each(tmp, &server->smb_ses_list) {
- ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list);
- if (strncmp(ses->userName, username, MAX_USERNAME_SIZE))
- continue;
-
+ list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
+ switch (server->secType) {
+ case Kerberos:
+ if (vol->cred_uid != ses->cred_uid)
+ continue;
+ break;
+ default:
+ /* anything else takes username/password */
+ if (strncmp(ses->userName, vol->username,
+ MAX_USERNAME_SIZE))
+ continue;
+ if (strlen(vol->username) != 0 &&
+ strncmp(ses->password, vol->password,
+ MAX_PASSWORD_SIZE))
+ continue;
+ }
++ses->ses_count;
write_unlock(&cifs_tcp_ses_lock);
return ses;
@@ -1643,7 +1717,7 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
xid = GetXid();
- ses = cifs_find_smb_ses(server, volume_info->username);
+ ses = cifs_find_smb_ses(server, volume_info);
if (ses) {
cFYI(1, "Existing smb sess found (status=%d)", ses->status);
@@ -1706,6 +1780,7 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
if (ses->domainName)
strcpy(ses->domainName, volume_info->domainname);
}
+ ses->cred_uid = volume_info->cred_uid;
ses->linux_uid = volume_info->linux_uid;
ses->overrideSecFlg = volume_info->secFlg;
@@ -1773,6 +1848,7 @@ cifs_put_tcon(struct cifsTconInfo *tcon)
CIFSSMBTDis(xid, tcon);
_FreeXid(xid);
+ cifs_fscache_release_super_cookie(tcon);
tconInfoFree(tcon);
cifs_put_smb_ses(ses);
}
@@ -1843,6 +1919,8 @@ cifs_get_tcon(struct cifsSesInfo *ses, struct smb_vol *volume_info)
list_add(&tcon->tcon_list, &ses->tcon_list);
write_unlock(&cifs_tcp_ses_lock);
+ cifs_fscache_get_super_cookie(tcon);
+
return tcon;
out_fail:
@@ -2397,6 +2475,8 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_OVERR_GID;
if (pvolume_info->dynperm)
cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DYNPERM;
+ if (pvolume_info->fsc)
+ cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_FSCACHE;
if (pvolume_info->direct_io) {
cFYI(1, "mounting share using direct i/o");
cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO;
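With the port folded into the sockaddr before lookup, match_address() can treat a zero port as a wildcard and cifs_find_tcp_session() reduces to two predicate calls per list entry. A runnable user-space analogue of the address predicate, to illustrate the intended semantics (sockaddr_matches() is a hypothetical stand-in, not the kernel function):

	#include <stdbool.h>
	#include <string.h>
	#include <netinet/in.h>

	/* A zero port in 'want' matches any port on 'have'. */
	static bool sockaddr_matches(const struct sockaddr *have,
				     const struct sockaddr *want)
	{
		if (have->sa_family != want->sa_family)
			return false;

		if (want->sa_family == AF_INET) {
			const struct sockaddr_in *h = (const void *)have;
			const struct sockaddr_in *w = (const void *)want;

			if (h->sin_addr.s_addr != w->sin_addr.s_addr)
				return false;
			return !w->sin_port || h->sin_port == w->sin_port;
		} else if (want->sa_family == AF_INET6) {
			const struct sockaddr_in6 *h = (const void *)have;
			const struct sockaddr_in6 *w = (const void *)want;

			if (memcmp(&h->sin6_addr, &w->sin6_addr,
				   sizeof(h->sin6_addr)))
				return false;
			if (h->sin6_scope_id != w->sin6_scope_id)
				return false;
			return !w->sin6_port || h->sin6_port == w->sin6_port;
		}
		return false;
	}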
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 391816b461ca..a7de5e9fff11 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -25,6 +25,7 @@
#include <linux/slab.h>
#include <linux/namei.h>
#include <linux/mount.h>
+#include <linux/file.h>
#include "cifsfs.h"
#include "cifspdu.h"
#include "cifsglob.h"
@@ -129,12 +130,6 @@ cifs_bp_rename_retry:
return full_path;
}
-/*
- * When called with struct file pointer set to NULL, there is no way we could
- * update file->private_data, but getting it stuck on openFileList provides a
- * way to access it from cifs_fill_filedata and thereby set file->private_data
- * from cifs_open.
- */
struct cifsFileInfo *
cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle,
struct file *file, struct vfsmount *mnt, unsigned int oflags)
@@ -184,12 +179,13 @@ cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle,
}
write_unlock(&GlobalSMBSeslock);
+ file->private_data = pCifsFile;
+
return pCifsFile;
}
int cifs_posix_open(char *full_path, struct inode **pinode,
- struct vfsmount *mnt, struct super_block *sb,
- int mode, int oflags,
+ struct super_block *sb, int mode, int oflags,
__u32 *poplock, __u16 *pnetfid, int xid)
{
int rc;
@@ -258,19 +254,6 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
cifs_fattr_to_inode(*pinode, &fattr);
}
- /*
- * cifs_fill_filedata() takes care of setting cifsFileInfo pointer to
- * file->private_data.
- */
- if (mnt) {
- struct cifsFileInfo *pfile_info;
-
- pfile_info = cifs_new_fileinfo(*pinode, *pnetfid, NULL, mnt,
- oflags);
- if (pfile_info == NULL)
- rc = -ENOMEM;
- }
-
posix_open_ret:
kfree(presp_data);
return rc;
@@ -298,7 +281,6 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
int create_options = CREATE_NOT_DIR;
__u32 oplock = 0;
int oflags;
- bool posix_create = false;
/*
* BB below access is probably too much for mknod to request
* but we have to do query and setpathinfo so requesting
@@ -339,7 +321,6 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
(CIFS_UNIX_POSIX_PATH_OPS_CAP &
le64_to_cpu(tcon->fsUnixInfo.Capability))) {
rc = cifs_posix_open(full_path, &newinode,
- nd ? nd->path.mnt : NULL,
inode->i_sb, mode, oflags, &oplock, &fileHandle, xid);
/* EIO could indicate that (posix open) operation is not
supported, despite what server claimed in capability
@@ -347,7 +328,6 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
handled in posix open */
if (rc == 0) {
- posix_create = true;
if (newinode == NULL) /* query inode info */
goto cifs_create_get_file_info;
else /* success, no need to query */
@@ -478,21 +458,28 @@ cifs_create_set_dentry:
else
cFYI(1, "Create worked, get_inode_info failed rc = %d", rc);
- /* nfsd case - nfs srv does not set nd */
- if ((nd == NULL) || (!(nd->flags & LOOKUP_OPEN))) {
- /* mknod case - do not leave file open */
- CIFSSMBClose(xid, tcon, fileHandle);
- } else if (!(posix_create) && (newinode)) {
+ if (newinode && nd && (nd->flags & LOOKUP_OPEN)) {
struct cifsFileInfo *pfile_info;
- /*
- * cifs_fill_filedata() takes care of setting cifsFileInfo
- * pointer to file->private_data.
- */
- pfile_info = cifs_new_fileinfo(newinode, fileHandle, NULL,
+ struct file *filp;
+
+ filp = lookup_instantiate_filp(nd, direntry, generic_file_open);
+ if (IS_ERR(filp)) {
+ rc = PTR_ERR(filp);
+ CIFSSMBClose(xid, tcon, fileHandle);
+ goto cifs_create_out;
+ }
+
+ pfile_info = cifs_new_fileinfo(newinode, fileHandle, filp,
nd->path.mnt, oflags);
- if (pfile_info == NULL)
+ if (pfile_info == NULL) {
+ fput(filp);
+ CIFSSMBClose(xid, tcon, fileHandle);
rc = -ENOMEM;
+ }
+ } else {
+ CIFSSMBClose(xid, tcon, fileHandle);
}
+
cifs_create_out:
kfree(buf);
kfree(full_path);
@@ -636,6 +623,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
bool posix_open = false;
struct cifs_sb_info *cifs_sb;
struct cifsTconInfo *pTcon;
+ struct cifsFileInfo *cfile;
struct inode *newInode = NULL;
char *full_path = NULL;
struct file *filp;
@@ -703,7 +691,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
if (nd && !(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY)) &&
(nd->flags & LOOKUP_OPEN) && !pTcon->broken_posix_open &&
(nd->intent.open.flags & O_CREAT)) {
- rc = cifs_posix_open(full_path, &newInode, nd->path.mnt,
+ rc = cifs_posix_open(full_path, &newInode,
parent_dir_inode->i_sb,
nd->intent.open.create_mode,
nd->intent.open.flags, &oplock,
@@ -733,8 +721,25 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
else
direntry->d_op = &cifs_dentry_ops;
d_add(direntry, newInode);
- if (posix_open)
- filp = lookup_instantiate_filp(nd, direntry, NULL);
+ if (posix_open) {
+ filp = lookup_instantiate_filp(nd, direntry,
+ generic_file_open);
+ if (IS_ERR(filp)) {
+ rc = PTR_ERR(filp);
+ CIFSSMBClose(xid, pTcon, fileHandle);
+ goto lookup_out;
+ }
+
+ cfile = cifs_new_fileinfo(newInode, fileHandle, filp,
+ nd->path.mnt,
+ nd->intent.open.flags);
+ if (cfile == NULL) {
+ fput(filp);
+ CIFSSMBClose(xid, pTcon, fileHandle);
+ rc = -ENOMEM;
+ goto lookup_out;
+ }
+ }
/* since paths are not looked up by component - the parent
directories are presumed to be good here */
renew_parental_timestamps(direntry);
@@ -755,6 +760,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
is a common return code */
}
+lookup_out:
kfree(full_path);
FreeXid(xid);
return ERR_PTR(rc);
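cifs_create() and cifs_lookup() now share one open-intent shape: instantiate the struct file via lookup_instantiate_filp(), then attach the cifsFileInfo, unwinding both the filp and the server handle on failure. A condensed sketch of that ordering (assumes the surrounding locals from either caller; not a drop-in hunk):

	filp = lookup_instantiate_filp(nd, direntry, generic_file_open);
	if (IS_ERR(filp)) {
		rc = PTR_ERR(filp);
		CIFSSMBClose(xid, tcon, fileHandle);	/* nothing holds the netfid yet */
		goto out;
	}

	cfile = cifs_new_fileinfo(inode, fileHandle, filp, nd->path.mnt, oflags);
	if (cfile == NULL) {
		fput(filp);				/* drop the instantiated file */
		CIFSSMBClose(xid, tcon, fileHandle);	/* then the server handle */
		rc = -ENOMEM;
		goto out;
	}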
diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c
index 4db2c5e7283f..3ad7f4300c45 100644
--- a/fs/cifs/dns_resolve.c
+++ b/fs/cifs/dns_resolve.c
@@ -24,12 +24,16 @@
*/
#include <linux/slab.h>
+#include <linux/keyctl.h>
+#include <linux/key-type.h>
#include <keys/user-type.h>
#include "dns_resolve.h"
#include "cifsglob.h"
#include "cifsproto.h"
#include "cifs_debug.h"
+static const struct cred *dns_resolver_cache;
+
/* Checks if supplied name is IP address
* returns:
* 1 - name is IP
@@ -40,7 +44,7 @@ is_ip(char *name)
{
struct sockaddr_storage ss;
- return cifs_convert_address(name, &ss);
+ return cifs_convert_address((struct sockaddr *)&ss, name);
}
static int
@@ -94,6 +98,7 @@ struct key_type key_type_dns_resolver = {
int
dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
{
+ const struct cred *saved_cred;
int rc = -EAGAIN;
struct key *rkey = ERR_PTR(-EAGAIN);
char *name;
@@ -133,8 +138,15 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
goto skip_upcall;
}
+ saved_cred = override_creds(dns_resolver_cache);
rkey = request_key(&key_type_dns_resolver, name, "");
+ revert_creds(saved_cred);
if (!IS_ERR(rkey)) {
+ if (!(rkey->perm & KEY_USR_VIEW)) {
+ down_read(&rkey->sem);
+ rkey->perm |= KEY_USR_VIEW;
+ up_read(&rkey->sem);
+ }
len = rkey->type_data.x[0];
data = rkey->payload.data;
} else {
@@ -165,4 +177,61 @@ out:
return rc;
}
+int __init cifs_init_dns_resolver(void)
+{
+ struct cred *cred;
+ struct key *keyring;
+ int ret;
+
+ printk(KERN_NOTICE "Registering the %s key type\n",
+ key_type_dns_resolver.name);
+
+ /* create an override credential set with a special thread keyring in
+ * which DNS requests are cached
+ *
+ * this is used to prevent malicious redirections from being installed
+ * with add_key().
+ */
+ cred = prepare_kernel_cred(NULL);
+ if (!cred)
+ return -ENOMEM;
+
+ keyring = key_alloc(&key_type_keyring, ".dns_resolver", 0, 0, cred,
+ (KEY_POS_ALL & ~KEY_POS_SETATTR) |
+ KEY_USR_VIEW | KEY_USR_READ,
+ KEY_ALLOC_NOT_IN_QUOTA);
+ if (IS_ERR(keyring)) {
+ ret = PTR_ERR(keyring);
+ goto failed_put_cred;
+ }
+
+ ret = key_instantiate_and_link(keyring, NULL, 0, NULL, NULL);
+ if (ret < 0)
+ goto failed_put_key;
+
+ ret = register_key_type(&key_type_dns_resolver);
+ if (ret < 0)
+ goto failed_put_key;
+
+ /* instruct request_key() to use this special keyring as a cache for
+ * the results it looks up */
+ cred->thread_keyring = keyring;
+ cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;
+ dns_resolver_cache = cred;
+ return 0;
+
+failed_put_key:
+ key_put(keyring);
+failed_put_cred:
+ put_cred(cred);
+ return ret;
+}
+void cifs_exit_dns_resolver(void)
+{
+ key_revoke(dns_resolver_cache->thread_keyring);
+ unregister_key_type(&key_type_dns_resolver);
+ put_cred(dns_resolver_cache);
+ printk(KERN_NOTICE "Unregistered %s key type\n",
+ key_type_dns_resolver.name);
+}
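The point of the private credential set is that request_key() caches its results in the calling thread's keyring; by overriding to a kernel cred whose thread_keyring is the module-owned ".dns_resolver" ring, lookups are cached where userspace add_key() cannot pre-seed them. The runtime half of the pattern, reduced to its core (error handling trimmed; dns_resolver_cache is the cred built in cifs_init_dns_resolver() above):

	const struct cred *saved_cred;
	struct key *rkey;

	saved_cred = override_creds(dns_resolver_cache);
	rkey = request_key(&key_type_dns_resolver, name, "");
	revert_creds(saved_cred);

	if (!IS_ERR(rkey)) {
		/* consume rkey->payload.data, then drop the reference */
		key_put(rkey);
	}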
diff --git a/fs/cifs/dns_resolve.h b/fs/cifs/dns_resolve.h
index 966e9288930b..5d7f291df162 100644
--- a/fs/cifs/dns_resolve.h
+++ b/fs/cifs/dns_resolve.h
@@ -24,8 +24,8 @@
#define _DNS_RESOLVE_H
#ifdef __KERNEL__
-#include <linux/key-type.h>
-extern struct key_type key_type_dns_resolver;
+extern int __init cifs_init_dns_resolver(void);
+extern void cifs_exit_dns_resolver(void);
extern int dns_resolve_server_name_to_ip(const char *unc, char **ip_addr);
#endif /* KERNEL */
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index a83541ec9713..fa04a00d126d 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -40,6 +40,7 @@
#include "cifs_unicode.h"
#include "cifs_debug.h"
#include "cifs_fs_sb.h"
+#include "fscache.h"
static inline int cifs_convert_flags(unsigned int flags)
{
@@ -162,44 +163,12 @@ psx_client_can_cache:
return 0;
}
-static struct cifsFileInfo *
-cifs_fill_filedata(struct file *file)
-{
- struct list_head *tmp;
- struct cifsFileInfo *pCifsFile = NULL;
- struct cifsInodeInfo *pCifsInode = NULL;
-
- /* search inode for this file and fill in file->private_data */
- pCifsInode = CIFS_I(file->f_path.dentry->d_inode);
- read_lock(&GlobalSMBSeslock);
- list_for_each(tmp, &pCifsInode->openFileList) {
- pCifsFile = list_entry(tmp, struct cifsFileInfo, flist);
- if ((pCifsFile->pfile == NULL) &&
- (pCifsFile->pid == current->tgid)) {
- /* mode set in cifs_create */
-
- /* needed for writepage */
- pCifsFile->pfile = file;
- file->private_data = pCifsFile;
- break;
- }
- }
- read_unlock(&GlobalSMBSeslock);
-
- if (file->private_data != NULL) {
- return pCifsFile;
- } else if ((file->f_flags & O_CREAT) && (file->f_flags & O_EXCL))
- cERROR(1, "could not find file instance for "
- "new file %p", file);
- return NULL;
-}
-
/* all arguments to this function must be checked for validity in caller */
-static inline int cifs_open_inode_helper(struct inode *inode, struct file *file,
- struct cifsInodeInfo *pCifsInode, struct cifsFileInfo *pCifsFile,
+static inline int cifs_open_inode_helper(struct inode *inode,
struct cifsTconInfo *pTcon, int *oplock, FILE_ALL_INFO *buf,
char *full_path, int xid)
{
+ struct cifsInodeInfo *pCifsInode = CIFS_I(inode);
struct timespec temp;
int rc;
@@ -213,36 +182,35 @@ static inline int cifs_open_inode_helper(struct inode *inode, struct file *file,
/* if not oplocked, invalidate inode pages if mtime or file
size changed */
temp = cifs_NTtimeToUnix(buf->LastWriteTime);
- if (timespec_equal(&file->f_path.dentry->d_inode->i_mtime, &temp) &&
- (file->f_path.dentry->d_inode->i_size ==
+ if (timespec_equal(&inode->i_mtime, &temp) &&
+ (inode->i_size ==
(loff_t)le64_to_cpu(buf->EndOfFile))) {
cFYI(1, "inode unchanged on server");
} else {
- if (file->f_path.dentry->d_inode->i_mapping) {
+ if (inode->i_mapping) {
/* BB no need to lock inode until after invalidate
since namei code should already have it locked? */
- rc = filemap_write_and_wait(file->f_path.dentry->d_inode->i_mapping);
+ rc = filemap_write_and_wait(inode->i_mapping);
if (rc != 0)
- CIFS_I(file->f_path.dentry->d_inode)->write_behind_rc = rc;
+ pCifsInode->write_behind_rc = rc;
}
cFYI(1, "invalidating remote inode since open detected it "
"changed");
- invalidate_remote_inode(file->f_path.dentry->d_inode);
+ invalidate_remote_inode(inode);
}
client_can_cache:
if (pTcon->unix_ext)
- rc = cifs_get_inode_info_unix(&file->f_path.dentry->d_inode,
- full_path, inode->i_sb, xid);
+ rc = cifs_get_inode_info_unix(&inode, full_path, inode->i_sb,
+ xid);
else
- rc = cifs_get_inode_info(&file->f_path.dentry->d_inode,
- full_path, buf, inode->i_sb, xid, NULL);
+ rc = cifs_get_inode_info(&inode, full_path, buf, inode->i_sb,
+ xid, NULL);
if ((*oplock & 0xF) == OPLOCK_EXCLUSIVE) {
pCifsInode->clientCanCacheAll = true;
pCifsInode->clientCanCacheRead = true;
- cFYI(1, "Exclusive Oplock granted on inode %p",
- file->f_path.dentry->d_inode);
+ cFYI(1, "Exclusive Oplock granted on inode %p", inode);
} else if ((*oplock & 0xF) == OPLOCK_READ)
pCifsInode->clientCanCacheRead = true;
@@ -256,7 +224,7 @@ int cifs_open(struct inode *inode, struct file *file)
__u32 oplock;
struct cifs_sb_info *cifs_sb;
struct cifsTconInfo *tcon;
- struct cifsFileInfo *pCifsFile;
+ struct cifsFileInfo *pCifsFile = NULL;
struct cifsInodeInfo *pCifsInode;
char *full_path = NULL;
int desiredAccess;
@@ -270,12 +238,6 @@ int cifs_open(struct inode *inode, struct file *file)
tcon = cifs_sb->tcon;
pCifsInode = CIFS_I(file->f_path.dentry->d_inode);
- pCifsFile = cifs_fill_filedata(file);
- if (pCifsFile) {
- rc = 0;
- FreeXid(xid);
- return rc;
- }
full_path = build_path_from_dentry(file->f_path.dentry);
if (full_path == NULL) {
@@ -299,8 +261,7 @@ int cifs_open(struct inode *inode, struct file *file)
int oflags = (int) cifs_posix_convert_flags(file->f_flags);
oflags |= SMB_O_CREAT;
/* can not refresh inode info since size could be stale */
- rc = cifs_posix_open(full_path, &inode, file->f_path.mnt,
- inode->i_sb,
+ rc = cifs_posix_open(full_path, &inode, inode->i_sb,
cifs_sb->mnt_file_mode /* ignored */,
oflags, &oplock, &netfid, xid);
if (rc == 0) {
@@ -308,9 +269,23 @@ int cifs_open(struct inode *inode, struct file *file)
/* no need for special case handling of setting mode
on read only files needed here */
- pCifsFile = cifs_fill_filedata(file);
- cifs_posix_open_inode_helper(inode, file, pCifsInode,
- oplock, netfid);
+ rc = cifs_posix_open_inode_helper(inode, file,
+ pCifsInode, oplock, netfid);
+ if (rc != 0) {
+ CIFSSMBClose(xid, tcon, netfid);
+ goto out;
+ }
+
+ pCifsFile = cifs_new_fileinfo(inode, netfid, file,
+ file->f_path.mnt,
+ oflags);
+ if (pCifsFile == NULL) {
+ CIFSSMBClose(xid, tcon, netfid);
+ rc = -ENOMEM;
+ }
+
+ cifs_fscache_set_inode_cookie(inode, file);
+
goto out;
} else if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
if (tcon->ses->serverNOS)
@@ -391,16 +366,18 @@ int cifs_open(struct inode *inode, struct file *file)
goto out;
}
+ rc = cifs_open_inode_helper(inode, tcon, &oplock, buf, full_path, xid);
+ if (rc != 0)
+ goto out;
+
pCifsFile = cifs_new_fileinfo(inode, netfid, file, file->f_path.mnt,
file->f_flags);
- file->private_data = pCifsFile;
- if (file->private_data == NULL) {
+ if (pCifsFile == NULL) {
rc = -ENOMEM;
goto out;
}
- rc = cifs_open_inode_helper(inode, file, pCifsInode, pCifsFile, tcon,
- &oplock, buf, full_path, xid);
+ cifs_fscache_set_inode_cookie(inode, file);
if (oplock & CIFS_CREATE_ACTION) {
/* time to set mode which we can not set earlier due to
@@ -456,7 +433,7 @@ static int cifs_reopen_file(struct file *file, bool can_flush)
__u16 netfid;
if (file->private_data)
- pCifsFile = (struct cifsFileInfo *)file->private_data;
+ pCifsFile = file->private_data;
else
return -EBADF;
@@ -513,8 +490,7 @@ reopen_error_exit:
le64_to_cpu(tcon->fsUnixInfo.Capability))) {
int oflags = (int) cifs_posix_convert_flags(file->f_flags);
/* can not refresh inode info since size could be stale */
- rc = cifs_posix_open(full_path, NULL, file->f_path.mnt,
- inode->i_sb,
+ rc = cifs_posix_open(full_path, NULL, inode->i_sb,
cifs_sb->mnt_file_mode /* ignored */,
oflags, &oplock, &netfid, xid);
if (rc == 0) {
@@ -595,8 +571,7 @@ int cifs_close(struct inode *inode, struct file *file)
int xid, timeout;
struct cifs_sb_info *cifs_sb;
struct cifsTconInfo *pTcon;
- struct cifsFileInfo *pSMBFile =
- (struct cifsFileInfo *)file->private_data;
+ struct cifsFileInfo *pSMBFile = file->private_data;
xid = GetXid();
@@ -671,8 +646,7 @@ int cifs_closedir(struct inode *inode, struct file *file)
{
int rc = 0;
int xid;
- struct cifsFileInfo *pCFileStruct =
- (struct cifsFileInfo *)file->private_data;
+ struct cifsFileInfo *pCFileStruct = file->private_data;
char *ptmp;
cFYI(1, "Closedir inode = 0x%p", inode);
@@ -893,8 +867,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
length, pfLock,
posix_lock_type, wait_flag);
} else {
- struct cifsFileInfo *fid =
- (struct cifsFileInfo *)file->private_data;
+ struct cifsFileInfo *fid = file->private_data;
if (numLock) {
rc = CIFSSMBLock(xid, tcon, netfid, length,
@@ -995,7 +968,7 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
if (file->private_data == NULL)
return -EBADF;
- open_file = (struct cifsFileInfo *) file->private_data;
+ open_file = file->private_data;
rc = generic_write_checks(file, poffset, &write_size, 0);
if (rc)
@@ -1097,7 +1070,7 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
if (file->private_data == NULL)
return -EBADF;
- open_file = (struct cifsFileInfo *)file->private_data;
+ open_file = file->private_data;
xid = GetXid();
@@ -1676,19 +1649,18 @@ static int cifs_write_end(struct file *file, struct address_space *mapping,
return rc;
}
-int cifs_fsync(struct file *file, struct dentry *dentry, int datasync)
+int cifs_fsync(struct file *file, int datasync)
{
int xid;
int rc = 0;
struct cifsTconInfo *tcon;
- struct cifsFileInfo *smbfile =
- (struct cifsFileInfo *)file->private_data;
+ struct cifsFileInfo *smbfile = file->private_data;
struct inode *inode = file->f_path.dentry->d_inode;
xid = GetXid();
cFYI(1, "Sync file - name: %s datasync: 0x%x",
- dentry->d_name.name, datasync);
+ file->f_path.dentry->d_name.name, datasync);
rc = filemap_write_and_wait(inode->i_mapping);
if (rc == 0) {
@@ -1786,7 +1758,7 @@ ssize_t cifs_user_read(struct file *file, char __user *read_data,
FreeXid(xid);
return rc;
}
- open_file = (struct cifsFileInfo *)file->private_data;
+ open_file = file->private_data;
if ((file->f_flags & O_ACCMODE) == O_WRONLY)
cFYI(1, "attempting read on write only file instance");
@@ -1867,7 +1839,7 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
FreeXid(xid);
return rc;
}
- open_file = (struct cifsFileInfo *)file->private_data;
+ open_file = file->private_data;
if ((file->f_flags & O_ACCMODE) == O_WRONLY)
cFYI(1, "attempting read on write only file instance");
@@ -1952,6 +1924,7 @@ static void cifs_copy_cache_pages(struct address_space *mapping,
bytes_read -= PAGE_CACHE_SIZE;
continue;
}
+ page_cache_release(page);
target = kmap_atomic(page, KM_USER0);
@@ -1971,6 +1944,9 @@ static void cifs_copy_cache_pages(struct address_space *mapping,
SetPageUptodate(page);
unlock_page(page);
data += PAGE_CACHE_SIZE;
+
+ /* add page to FS-Cache */
+ cifs_readpage_to_fscache(mapping->host, page);
}
return;
}
@@ -1997,10 +1973,19 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
FreeXid(xid);
return rc;
}
- open_file = (struct cifsFileInfo *)file->private_data;
+ open_file = file->private_data;
cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
pTcon = cifs_sb->tcon;
+ /*
+ * Reads as many pages as possible from fscache. Returns -ENOBUFS
+ * immediately if the cookie is negative
+ */
+ rc = cifs_readpages_from_fscache(mapping->host, mapping, page_list,
+ &num_pages);
+ if (rc == 0)
+ goto read_complete;
+
cFYI(DBG2, "rpages: num pages %d", num_pages);
for (i = 0; i < num_pages; ) {
unsigned contig_pages;
@@ -2111,6 +2096,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
smb_read_data = NULL;
}
+read_complete:
FreeXid(xid);
return rc;
}
@@ -2121,6 +2107,11 @@ static int cifs_readpage_worker(struct file *file, struct page *page,
char *read_data;
int rc;
+ /* Is the page cached? */
+ rc = cifs_readpage_from_fscache(file->f_path.dentry->d_inode, page);
+ if (rc == 0)
+ goto read_complete;
+
page_cache_get(page);
read_data = kmap(page);
/* for reads over a certain size could initiate async read ahead */
@@ -2140,11 +2131,17 @@ static int cifs_readpage_worker(struct file *file, struct page *page,
flush_dcache_page(page);
SetPageUptodate(page);
+
+ /* send this page to the cache */
+ cifs_readpage_to_fscache(file->f_path.dentry->d_inode, page);
+
rc = 0;
io_error:
kunmap(page);
page_cache_release(page);
+
+read_complete:
return rc;
}
@@ -2294,6 +2291,22 @@ out:
return rc;
}
+static int cifs_release_page(struct page *page, gfp_t gfp)
+{
+ if (PagePrivate(page))
+ return 0;
+
+ return cifs_fscache_release_page(page, gfp);
+}
+
+static void cifs_invalidate_page(struct page *page, unsigned long offset)
+{
+ struct cifsInodeInfo *cifsi = CIFS_I(page->mapping->host);
+
+ if (offset == 0)
+ cifs_fscache_invalidate_page(page, &cifsi->vfs_inode);
+}
+
static void
cifs_oplock_break(struct slow_work *work)
{
@@ -2367,6 +2380,8 @@ const struct address_space_operations cifs_addr_ops = {
.write_begin = cifs_write_begin,
.write_end = cifs_write_end,
.set_page_dirty = __set_page_dirty_nobuffers,
+ .releasepage = cifs_release_page,
+ .invalidatepage = cifs_invalidate_page,
/* .sync_page = cifs_sync_page, */
/* .direct_IO = */
};
@@ -2383,6 +2398,8 @@ const struct address_space_operations cifs_addr_ops_smallbuf = {
.write_begin = cifs_write_begin,
.write_end = cifs_write_end,
.set_page_dirty = __set_page_dirty_nobuffers,
+ .releasepage = cifs_release_page,
+ .invalidatepage = cifs_invalidate_page,
/* .sync_page = cifs_sync_page, */
/* .direct_IO = */
};
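All of the read-side hooks share one convention: the fscache wrappers return 0 only when the cache has taken over the I/O, so a zero return short-circuits the network path entirely. A sketch of the readpage flow under that assumption (condensed from cifs_readpage_worker(), not a literal replacement):

	/* 1. ask the local cache first; 0 means a read was submitted */
	rc = cifs_readpage_from_fscache(inode, page);
	if (rc == 0)
		goto read_complete;

	/* 2. cache miss or no cookie: go to the server */
	rc = cifs_read(file, read_data, PAGE_CACHE_SIZE, poffset);
	if (rc < 0)
		goto io_error;

	/* 3. populate the cache with what the server returned */
	cifs_readpage_to_fscache(inode, page);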
diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c
new file mode 100644
index 000000000000..9f3f5c4be161
--- /dev/null
+++ b/fs/cifs/fscache.c
@@ -0,0 +1,236 @@
+/*
+ * fs/cifs/fscache.c - CIFS filesystem cache interface
+ *
+ * Copyright (c) 2010 Novell, Inc.
+ * Author(s): Suresh Jayaraman <sjayaraman@suse.de>
+ *
+ * This library is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#include "fscache.h"
+#include "cifsglob.h"
+#include "cifs_debug.h"
+#include "cifs_fs_sb.h"
+
+void cifs_fscache_get_client_cookie(struct TCP_Server_Info *server)
+{
+ server->fscache =
+ fscache_acquire_cookie(cifs_fscache_netfs.primary_index,
+ &cifs_fscache_server_index_def, server);
+ cFYI(1, "CIFS: get client cookie (0x%p/0x%p)", server,
+ server->fscache);
+}
+
+void cifs_fscache_release_client_cookie(struct TCP_Server_Info *server)
+{
+ cFYI(1, "CIFS: release client cookie (0x%p/0x%p)", server,
+ server->fscache);
+ fscache_relinquish_cookie(server->fscache, 0);
+ server->fscache = NULL;
+}
+
+void cifs_fscache_get_super_cookie(struct cifsTconInfo *tcon)
+{
+ struct TCP_Server_Info *server = tcon->ses->server;
+
+ tcon->fscache =
+ fscache_acquire_cookie(server->fscache,
+ &cifs_fscache_super_index_def, tcon);
+ cFYI(1, "CIFS: get superblock cookie (0x%p/0x%p)",
+ server->fscache, tcon->fscache);
+}
+
+void cifs_fscache_release_super_cookie(struct cifsTconInfo *tcon)
+{
+ cFYI(1, "CIFS: releasing superblock cookie (0x%p)", tcon->fscache);
+ fscache_relinquish_cookie(tcon->fscache, 0);
+ tcon->fscache = NULL;
+}
+
+static void cifs_fscache_enable_inode_cookie(struct inode *inode)
+{
+ struct cifsInodeInfo *cifsi = CIFS_I(inode);
+ struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+
+ if (cifsi->fscache)
+ return;
+
+ cifsi->fscache = fscache_acquire_cookie(cifs_sb->tcon->fscache,
+ &cifs_fscache_inode_object_def,
+ cifsi);
+ cFYI(1, "CIFS: got FH cookie (0x%p/0x%p)",
+ cifs_sb->tcon->fscache, cifsi->fscache);
+}
+
+void cifs_fscache_release_inode_cookie(struct inode *inode)
+{
+ struct cifsInodeInfo *cifsi = CIFS_I(inode);
+
+ if (cifsi->fscache) {
+ cFYI(1, "CIFS releasing inode cookie (0x%p)",
+ cifsi->fscache);
+ fscache_relinquish_cookie(cifsi->fscache, 0);
+ cifsi->fscache = NULL;
+ }
+}
+
+static void cifs_fscache_disable_inode_cookie(struct inode *inode)
+{
+ struct cifsInodeInfo *cifsi = CIFS_I(inode);
+
+ if (cifsi->fscache) {
+ cFYI(1, "CIFS disabling inode cookie (0x%p)",
+ cifsi->fscache);
+ fscache_relinquish_cookie(cifsi->fscache, 1);
+ cifsi->fscache = NULL;
+ }
+}
+
+void cifs_fscache_set_inode_cookie(struct inode *inode, struct file *filp)
+{
+ if ((filp->f_flags & O_ACCMODE) != O_RDONLY)
+ cifs_fscache_disable_inode_cookie(inode);
+ else {
+ cifs_fscache_enable_inode_cookie(inode);
+ cFYI(1, "CIFS: fscache inode cookie set");
+ }
+}
+
+void cifs_fscache_reset_inode_cookie(struct inode *inode)
+{
+ struct cifsInodeInfo *cifsi = CIFS_I(inode);
+ struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+ struct fscache_cookie *old = cifsi->fscache;
+
+ if (cifsi->fscache) {
+ /* retire the current fscache cookie and get a new one */
+ fscache_relinquish_cookie(cifsi->fscache, 1);
+
+ cifsi->fscache = fscache_acquire_cookie(cifs_sb->tcon->fscache,
+ &cifs_fscache_inode_object_def,
+ cifsi);
+ cFYI(1, "CIFS: new cookie 0x%p oldcookie 0x%p",
+ cifsi->fscache, old);
+ }
+}
+
+int cifs_fscache_release_page(struct page *page, gfp_t gfp)
+{
+ if (PageFsCache(page)) {
+ struct inode *inode = page->mapping->host;
+ struct cifsInodeInfo *cifsi = CIFS_I(inode);
+
+ cFYI(1, "CIFS: fscache release page (0x%p/0x%p)",
+ page, cifsi->fscache);
+ if (!fscache_maybe_release_page(cifsi->fscache, page, gfp))
+ return 0;
+ }
+
+ return 1;
+}
+
+static void cifs_readpage_from_fscache_complete(struct page *page, void *ctx,
+ int error)
+{
+ cFYI(1, "CFS: readpage_from_fscache_complete (0x%p/%d)",
+ page, error);
+ if (!error)
+ SetPageUptodate(page);
+ unlock_page(page);
+}
+
+/*
+ * Retrieve a page from FS-Cache
+ */
+int __cifs_readpage_from_fscache(struct inode *inode, struct page *page)
+{
+ int ret;
+
+ cFYI(1, "CIFS: readpage_from_fscache(fsc:%p, p:%p, i:0x%p",
+ CIFS_I(inode)->fscache, page, inode);
+ ret = fscache_read_or_alloc_page(CIFS_I(inode)->fscache, page,
+ cifs_readpage_from_fscache_complete,
+ NULL,
+ GFP_KERNEL);
+ switch (ret) {
+
+ case 0: /* page found in fscache, read submitted */
+ cFYI(1, "CIFS: readpage_from_fscache: submitted");
+ return ret;
+ case -ENOBUFS: /* page won't be cached */
+ case -ENODATA: /* page not in cache */
+ cFYI(1, "CIFS: readpage_from_fscache %d", ret);
+ return 1;
+
+ default:
+ cERROR(1, "unknown error ret = %d", ret);
+ }
+ return ret;
+}
+
+/*
+ * Retrieve a set of pages from FS-Cache
+ */
+int __cifs_readpages_from_fscache(struct inode *inode,
+ struct address_space *mapping,
+ struct list_head *pages,
+ unsigned *nr_pages)
+{
+ int ret;
+
+ cFYI(1, "CIFS: __cifs_readpages_from_fscache (0x%p/%u/0x%p)",
+ CIFS_I(inode)->fscache, *nr_pages, inode);
+ ret = fscache_read_or_alloc_pages(CIFS_I(inode)->fscache, mapping,
+ pages, nr_pages,
+ cifs_readpage_from_fscache_complete,
+ NULL,
+ mapping_gfp_mask(mapping));
+ switch (ret) {
+ case 0: /* read submitted to the cache for all pages */
+ cFYI(1, "CIFS: readpages_from_fscache: submitted");
+ return ret;
+
+ case -ENOBUFS: /* some pages are not cached and can't be */
+ case -ENODATA: /* some pages are not cached */
+ cFYI(1, "CIFS: readpages_from_fscache: no page");
+ return 1;
+
+ default:
+ cFYI(1, "unknown error ret = %d", ret);
+ }
+
+ return ret;
+}
+
+void __cifs_readpage_to_fscache(struct inode *inode, struct page *page)
+{
+ int ret;
+
+ cFYI(1, "CIFS: readpage_to_fscache(fsc: %p, p: %p, i: %p",
+ CIFS_I(inode)->fscache, page, inode);
+ ret = fscache_write_page(CIFS_I(inode)->fscache, page, GFP_KERNEL);
+ if (ret != 0)
+ fscache_uncache_page(CIFS_I(inode)->fscache, page);
+}
+
+void __cifs_fscache_invalidate_page(struct page *page, struct inode *inode)
+{
+ struct cifsInodeInfo *cifsi = CIFS_I(inode);
+ struct fscache_cookie *cookie = cifsi->fscache;
+
+ cFYI(1, "CIFS: fscache invalidatepage (0x%p/0x%p)", page, cookie);
+ fscache_wait_on_page_write(cookie, page);
+ fscache_uncache_page(cookie, page);
+}
+
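The cookies acquired here form a three-level index: client (TCP_Server_Info) -> share (cifsTconInfo) -> inode, each acquired against its parent's cookie. That forces child-first release at teardown, which is what the hook placement in connect.c above implements. The lifetime pairing at a glance (ordering of the real calls from this patch, shown out of context):

	/* setup: parent before child */
	cifs_fscache_get_client_cookie(server);	/* parent: netfs primary index */
	cifs_fscache_get_super_cookie(tcon);	/* parent: server->fscache */
	cifs_fscache_set_inode_cookie(inode, filp);	/* parent: tcon->fscache,
							   O_RDONLY opens only */

	/* teardown: child before parent */
	cifs_fscache_release_inode_cookie(inode);
	cifs_fscache_release_super_cookie(tcon);
	cifs_fscache_release_client_cookie(server);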
diff --git a/fs/cifs/fscache.h b/fs/cifs/fscache.h
new file mode 100644
index 000000000000..31b88ec2341e
--- /dev/null
+++ b/fs/cifs/fscache.h
@@ -0,0 +1,136 @@
+/*
+ * fs/cifs/fscache.h - CIFS filesystem cache interface definitions
+ *
+ * Copyright (c) 2010 Novell, Inc.
+ * Author(s): Suresh Jayaraman <sjayaraman@suse.de>
+ *
+ * This library is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#ifndef _CIFS_FSCACHE_H
+#define _CIFS_FSCACHE_H
+
+#include <linux/fscache.h>
+
+#include "cifsglob.h"
+
+#ifdef CONFIG_CIFS_FSCACHE
+
+extern struct fscache_netfs cifs_fscache_netfs;
+extern const struct fscache_cookie_def cifs_fscache_server_index_def;
+extern const struct fscache_cookie_def cifs_fscache_super_index_def;
+extern const struct fscache_cookie_def cifs_fscache_inode_object_def;
+
+extern int cifs_fscache_register(void);
+extern void cifs_fscache_unregister(void);
+
+/*
+ * fscache.c
+ */
+extern void cifs_fscache_get_client_cookie(struct TCP_Server_Info *);
+extern void cifs_fscache_release_client_cookie(struct TCP_Server_Info *);
+extern void cifs_fscache_get_super_cookie(struct cifsTconInfo *);
+extern void cifs_fscache_release_super_cookie(struct cifsTconInfo *);
+
+extern void cifs_fscache_release_inode_cookie(struct inode *);
+extern void cifs_fscache_set_inode_cookie(struct inode *, struct file *);
+extern void cifs_fscache_reset_inode_cookie(struct inode *);
+
+extern void __cifs_fscache_invalidate_page(struct page *, struct inode *);
+extern int cifs_fscache_release_page(struct page *page, gfp_t gfp);
+extern int __cifs_readpage_from_fscache(struct inode *, struct page *);
+extern int __cifs_readpages_from_fscache(struct inode *,
+ struct address_space *,
+ struct list_head *,
+ unsigned *);
+
+extern void __cifs_readpage_to_fscache(struct inode *, struct page *);
+
+static inline void cifs_fscache_invalidate_page(struct page *page,
+ struct inode *inode)
+{
+ if (PageFsCache(page))
+ __cifs_fscache_invalidate_page(page, inode);
+}
+
+static inline int cifs_readpage_from_fscache(struct inode *inode,
+ struct page *page)
+{
+ if (CIFS_I(inode)->fscache)
+ return __cifs_readpage_from_fscache(inode, page);
+
+ return -ENOBUFS;
+}
+
+static inline int cifs_readpages_from_fscache(struct inode *inode,
+ struct address_space *mapping,
+ struct list_head *pages,
+ unsigned *nr_pages)
+{
+ if (CIFS_I(inode)->fscache)
+ return __cifs_readpages_from_fscache(inode, mapping, pages,
+ nr_pages);
+ return -ENOBUFS;
+}
+
+static inline void cifs_readpage_to_fscache(struct inode *inode,
+ struct page *page)
+{
+ if (PageFsCache(page))
+ __cifs_readpage_to_fscache(inode, page);
+}
+
+#else /* CONFIG_CIFS_FSCACHE */
+static inline int cifs_fscache_register(void) { return 0; }
+static inline void cifs_fscache_unregister(void) {}
+
+static inline void
+cifs_fscache_get_client_cookie(struct TCP_Server_Info *server) {}
+static inline void
+cifs_fscache_release_client_cookie(struct TCP_Server_Info *server) {}
+static inline void cifs_fscache_get_super_cookie(struct cifsTconInfo *tcon) {}
+static inline void
+cifs_fscache_release_super_cookie(struct cifsTconInfo *tcon) {}
+
+static inline void cifs_fscache_release_inode_cookie(struct inode *inode) {}
+static inline void cifs_fscache_set_inode_cookie(struct inode *inode,
+ struct file *filp) {}
+static inline void cifs_fscache_reset_inode_cookie(struct inode *inode) {}
+static inline int cifs_fscache_release_page(struct page *page, gfp_t gfp)
+{
+ return 1; /* May release page */
+}
+
+static inline void cifs_fscache_invalidate_page(struct page *page,
+ struct inode *inode) {}
+static inline int
+cifs_readpage_from_fscache(struct inode *inode, struct page *page)
+{
+ return -ENOBUFS;
+}
+
+static inline int cifs_readpages_from_fscache(struct inode *inode,
+ struct address_space *mapping,
+ struct list_head *pages,
+ unsigned *nr_pages)
+{
+ return -ENOBUFS;
+}
+
+static inline void cifs_readpage_to_fscache(struct inode *inode,
+ struct page *page) {}
+
+#endif /* CONFIG_CIFS_FSCACHE */
+
+#endif /* _CIFS_FSCACHE_H */
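Because the !CONFIG_CIFS_FSCACHE half mirrors every declaration with an inline no-op (or a -ENOBUFS "not cached" answer), none of the call sites in file.c or connect.c need their own #ifdefs. The same stub idiom in miniature, with a hypothetical feature name:

	#ifdef CONFIG_EXAMPLE_CACHE
	extern int example_cache_readpage(struct inode *inode, struct page *page);
	#else
	static inline int example_cache_readpage(struct inode *inode,
						 struct page *page)
	{
		return -ENOBUFS;	/* same answer as "page not in the cache" */
	}
	#endif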
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 62b324f26a56..a15b3a9bbff4 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -29,6 +29,7 @@
#include "cifsproto.h"
#include "cifs_debug.h"
#include "cifs_fs_sb.h"
+#include "fscache.h"
static void cifs_set_ops(struct inode *inode, const bool is_dfs_referral)
@@ -288,7 +289,7 @@ int cifs_get_file_info_unix(struct file *filp)
struct inode *inode = filp->f_path.dentry->d_inode;
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct cifsTconInfo *tcon = cifs_sb->tcon;
- struct cifsFileInfo *cfile = (struct cifsFileInfo *) filp->private_data;
+ struct cifsFileInfo *cfile = filp->private_data;
xid = GetXid();
rc = CIFSSMBUnixQFileInfo(xid, tcon, cfile->netfid, &find_data);
@@ -515,7 +516,7 @@ int cifs_get_file_info(struct file *filp)
struct inode *inode = filp->f_path.dentry->d_inode;
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct cifsTconInfo *tcon = cifs_sb->tcon;
- struct cifsFileInfo *cfile = (struct cifsFileInfo *) filp->private_data;
+ struct cifsFileInfo *cfile = filp->private_data;
xid = GetXid();
rc = CIFSSMBQFileInfo(xid, tcon, cfile->netfid, &find_data);
@@ -723,9 +724,14 @@ cifs_find_inode(struct inode *inode, void *opaque)
{
struct cifs_fattr *fattr = (struct cifs_fattr *) opaque;
+ /* don't match inode with different uniqueid */
if (CIFS_I(inode)->uniqueid != fattr->cf_uniqueid)
return 0;
+ /* don't match inode of different type */
+ if ((inode->i_mode & S_IFMT) != (fattr->cf_mode & S_IFMT))
+ return 0;
+
/*
* uh oh -- it's a directory. We can't use it since hardlinked dirs are
* verboten. Disable serverino and return it as if it were found, the
@@ -776,6 +782,10 @@ retry_iget5_locked:
inode->i_flags |= S_NOATIME | S_NOCMTIME;
if (inode->i_state & I_NEW) {
inode->i_ino = hash;
+#ifdef CONFIG_CIFS_FSCACHE
+ /* initialize per-inode cache cookie pointer */
+ CIFS_I(inode)->fscache = NULL;
+#endif
unlock_new_inode(inode);
}
}
@@ -807,6 +817,11 @@ struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino)
if (!inode)
return ERR_PTR(-ENOMEM);
+#ifdef CONFIG_CIFS_FSCACHE
+ /* populate tcon->resource_id */
+ cifs_sb->tcon->resource_id = CIFS_I(inode)->uniqueid;
+#endif
+
if (rc && cifs_sb->tcon->ipc) {
cFYI(1, "ipc connection - fake read inode");
inode->i_mode |= S_IFDIR;
@@ -1401,6 +1416,10 @@ cifs_do_rename(int xid, struct dentry *from_dentry, const char *fromPath,
if (rc == 0 || rc != -ETXTBSY)
return rc;
+ /* open-file renames don't work across directories */
+ if (to_dentry->d_parent != from_dentry->d_parent)
+ return rc;
+
/* open the file to be renamed -- we need DELETE perms */
rc = CIFSSMBOpen(xid, pTcon, fromPath, FILE_OPEN, DELETE,
CREATE_NOT_DIR, &srcfid, &oplock, NULL,
@@ -1564,6 +1583,7 @@ cifs_invalidate_mapping(struct inode *inode)
cifs_i->write_behind_rc = rc;
}
invalidate_remote_inode(inode);
+ cifs_fscache_reset_inode_cookie(inode);
}
int cifs_revalidate_file(struct file *filp)
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index 505926f1ee6b..9d38a71c8e14 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -41,8 +41,7 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
__u64 ExtAttrMask = 0;
__u64 caps;
struct cifsTconInfo *tcon;
- struct cifsFileInfo *pSMBFile =
- (struct cifsFileInfo *)filep->private_data;
+ struct cifsFileInfo *pSMBFile = filep->private_data;
#endif /* CONFIG_CIFS_POSIX */
xid = GetXid();
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index d35d52889cb5..c6721ee26dbc 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -61,6 +61,7 @@ static const struct smb_to_posix_error mapping_table_ERRDOS[] = {
{ERRremcd, -EACCES},
{ERRdiffdevice, -EXDEV},
{ERRnofiles, -ENOENT},
+ {ERRwriteprot, -EROFS},
{ERRbadshare, -ETXTBSY},
{ERRlock, -EACCES},
{ERRunsup, -EINVAL},
@@ -164,7 +165,7 @@ cifs_inet_pton(const int address_family, const char *cp, void *dst)
* Returns 0 on failure.
*/
int
-cifs_convert_address(char *src, void *dst)
+cifs_convert_address(struct sockaddr *dst, char *src)
{
int rc;
char *pct, *endp;
@@ -201,6 +202,27 @@ cifs_convert_address(char *src, void *dst)
return rc;
}
+int
+cifs_fill_sockaddr(struct sockaddr *dst, char *src,
+ const unsigned short int port)
+{
+ if (!cifs_convert_address(dst, src))
+ return 0;
+
+ switch (dst->sa_family) {
+ case AF_INET:
+ ((struct sockaddr_in *)dst)->sin_port = htons(port);
+ break;
+ case AF_INET6:
+ ((struct sockaddr_in6 *)dst)->sin6_port = htons(port);
+ break;
+ default:
+ return 0;
+ }
+
+ return 1;
+}
+
/*****************************************************************************
convert a NT status code to a dos class/code
*****************************************************************************/
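cifs_fill_sockaddr() folds the old convert-then-set-port sequence into one helper, which is what lets cifs_get_tcp_session() drop its per-family htons() calls above. A runnable user-space analogue built on inet_pton(), to show the intended semantics (fill_sockaddr() here is a hypothetical stand-in, not the kernel helper):

	#include <arpa/inet.h>
	#include <netinet/in.h>
	#include <string.h>

	/* Returns 1 on success, 0 on failure, like the kernel helper. */
	static int fill_sockaddr(struct sockaddr_storage *dst, const char *src,
				 unsigned short port)
	{
		struct sockaddr_in *s4 = (struct sockaddr_in *)dst;
		struct sockaddr_in6 *s6 = (struct sockaddr_in6 *)dst;

		memset(dst, 0, sizeof(*dst));
		if (inet_pton(AF_INET, src, &s4->sin_addr) == 1) {
			s4->sin_family = AF_INET;
			s4->sin_port = htons(port);
			return 1;
		}
		if (inet_pton(AF_INET6, src, &s6->sin6_addr) == 1) {
			s6->sin6_family = AF_INET6;
			s6->sin6_port = htons(port);
			return 1;
		}
		return 0;
	}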
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index daf1753af674..d5e591fab475 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -847,6 +847,11 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + max_len;
tmp_buf = kmalloc(UNICODE_NAME_MAX, GFP_KERNEL);
+ if (tmp_buf == NULL) {
+ rc = -ENOMEM;
+ break;
+ }
+
for (i = 0; (i < num_to_fill) && (rc == 0); i++) {
if (current_entry == NULL) {
/* evaluate whether this case is an error */
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 7707389bdf2c..0a57cb7db5dd 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -730,15 +730,7 @@ ssetup_ntlmssp_authenticate:
/* calculate session key */
setup_ntlmv2_rsp(ses, v2_sess_key, nls_cp);
- if (first_time) /* should this be moved into common code
- with similar ntlmv2 path? */
- /* cifs_calculate_ntlmv2_mac_key(ses->server->mac_signing_key,
- response BB FIXME, v2_sess_key); */
-
- /* copy session key */
-
- /* memcpy(bcc_ptr, (char *)ntlm_session_key,LM2_SESS_KEY_SIZE);
- bcc_ptr += LM2_SESS_KEY_SIZE; */
+ /* FIXME: calculate MAC key */
memcpy(bcc_ptr, (char *)v2_sess_key,
sizeof(struct ntlmv2_resp));
bcc_ptr += sizeof(struct ntlmv2_resp);
diff --git a/fs/cifs/smberr.h b/fs/cifs/smberr.h
index c5084d27db7c..7f16cb825fe5 100644
--- a/fs/cifs/smberr.h
+++ b/fs/cifs/smberr.h
@@ -76,6 +76,7 @@
#define ERRnofiles 18 /* A File Search command can find no
more files matching the specified
criteria. */
+#define ERRwriteprot 19 /* media is write protected */
#define ERRgeneral 31
#define ERRbadshare 32 /* The sharing mode specified for an
Open conflicts with existing FIDs on
diff --git a/fs/coda/coda_int.h b/fs/coda/coda_int.h
index d99860a33890..6b443ff43a19 100644
--- a/fs/coda/coda_int.h
+++ b/fs/coda/coda_int.h
@@ -11,8 +11,7 @@ extern int coda_fake_statfs;
void coda_destroy_inodecache(void);
int coda_init_inodecache(void);
-int coda_fsync(struct file *coda_file, struct dentry *coda_dentry,
- int datasync);
+int coda_fsync(struct file *coda_file, int datasync);
void coda_sysctl_init(void);
void coda_sysctl_clean(void);
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 7196077b1688..ad3cd2abeeb4 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -202,10 +202,10 @@ int coda_release(struct inode *coda_inode, struct file *coda_file)
return 0;
}
-int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, int datasync)
+int coda_fsync(struct file *coda_file, int datasync)
{
struct file *host_file;
- struct inode *coda_inode = coda_dentry->d_inode;
+ struct inode *coda_inode = coda_file->f_path.dentry->d_inode;
struct coda_file_info *cfi;
int err = 0;
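The CIFS and Coda hunks track the same VFS interface change: ->fsync() no longer receives a dentry, so implementations derive everything from the struct file. Minimal shape of a post-change handler (hypothetical filesystem; assumes filemap_write_and_wait() is all the flushing it needs):

	static int example_fsync(struct file *file, int datasync)
	{
		struct inode *inode = file->f_path.dentry->d_inode;

		/* the dentry is reachable through the file, so the extra
		 * argument carried no information */
		return filemap_write_and_wait(inode->i_mapping);
	}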
diff --git a/fs/compat.c b/fs/compat.c
index 05448730f840..6490d2134ff3 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -568,6 +568,79 @@ out:
return ret;
}
+/* A write operation does a read from user space and vice versa */
+#define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ)
+
+ssize_t compat_rw_copy_check_uvector(int type,
+ const struct compat_iovec __user *uvector, unsigned long nr_segs,
+ unsigned long fast_segs, struct iovec *fast_pointer,
+ struct iovec **ret_pointer)
+{
+ compat_ssize_t tot_len;
+ struct iovec *iov = *ret_pointer = fast_pointer;
+ ssize_t ret = 0;
+ int seg;
+
+ /*
+ * SuS says "The readv() function *may* fail if the iovcnt argument
+ * was less than or equal to 0, or greater than {IOV_MAX}. Linux has
+ * traditionally returned zero for zero segments, so...
+ */
+ if (nr_segs == 0)
+ goto out;
+
+ ret = -EINVAL;
+ if (nr_segs > UIO_MAXIOV || nr_segs < 0)
+ goto out;
+ if (nr_segs > fast_segs) {
+ ret = -ENOMEM;
+ iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
+ if (iov == NULL) {
+ *ret_pointer = fast_pointer;
+ goto out;
+ }
+ }
+ *ret_pointer = iov;
+
+ /*
+ * Single unix specification:
+ * We should return -EINVAL if an element length is negative or does
+ * not fit in an ssize_t; the total length must also fit in an ssize_t.
+ *
+ * Be careful here because iov_len is a size_t not an ssize_t
+ */
+ tot_len = 0;
+ ret = -EINVAL;
+ for (seg = 0; seg < nr_segs; seg++) {
+ compat_ssize_t tmp = tot_len;
+ compat_uptr_t buf;
+ compat_ssize_t len;
+
+ if (__get_user(len, &uvector->iov_len) ||
+ __get_user(buf, &uvector->iov_base)) {
+ ret = -EFAULT;
+ goto out;
+ }
+ if (len < 0) /* size_t not fitting in compat_ssize_t .. */
+ goto out;
+ tot_len += len;
+ if (tot_len < tmp) /* maths overflow on the compat_ssize_t */
+ goto out;
+ if (!access_ok(vrfy_dir(type), compat_ptr(buf), len)) {
+ ret = -EFAULT;
+ goto out;
+ }
+ iov->iov_base = compat_ptr(buf);
+ iov->iov_len = (compat_size_t) len;
+ uvector++;
+ iov++;
+ }
+ ret = tot_len;
+
+out:
+ return ret;
+}
+
static inline long
copy_iocb(long nr, u32 __user *ptr32, struct iocb __user * __user *ptr64)
{
@@ -600,7 +673,7 @@ compat_sys_io_submit(aio_context_t ctx_id, int nr, u32 __user *iocb)
iocb64 = compat_alloc_user_space(nr * sizeof(*iocb64));
ret = copy_iocb(nr, iocb, iocb64);
if (!ret)
- ret = sys_io_submit(ctx_id, nr, iocb64);
+ ret = do_io_submit(ctx_id, nr, iocb64, 1);
return ret;
}
@@ -1077,70 +1150,21 @@ static ssize_t compat_do_readv_writev(int type, struct file *file,
{
compat_ssize_t tot_len;
struct iovec iovstack[UIO_FASTIOV];
- struct iovec *iov=iovstack, *vector;
+ struct iovec *iov;
ssize_t ret;
- int seg;
io_fn_t fn;
iov_fn_t fnv;
- /*
- * SuS says "The readv() function *may* fail if the iovcnt argument
- * was less than or equal to 0, or greater than {IOV_MAX}. Linux has
- * traditionally returned zero for zero segments, so...
- */
- ret = 0;
- if (nr_segs == 0)
- goto out;
-
- /*
- * First get the "struct iovec" from user memory and
- * verify all the pointers
- */
ret = -EINVAL;
- if ((nr_segs > UIO_MAXIOV) || (nr_segs <= 0))
- goto out;
if (!file->f_op)
goto out;
- if (nr_segs > UIO_FASTIOV) {
- ret = -ENOMEM;
- iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
- if (!iov)
- goto out;
- }
+
ret = -EFAULT;
if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector)))
goto out;
- /*
- * Single unix specification:
- * We should -EINVAL if an element length is not >= 0 and fitting an
- * ssize_t. The total length is fitting an ssize_t
- *
- * Be careful here because iov_len is a size_t not an ssize_t
- */
- tot_len = 0;
- vector = iov;
- ret = -EINVAL;
- for (seg = 0 ; seg < nr_segs; seg++) {
- compat_ssize_t tmp = tot_len;
- compat_ssize_t len;
- compat_uptr_t buf;
-
- if (__get_user(len, &uvector->iov_len) ||
- __get_user(buf, &uvector->iov_base)) {
- ret = -EFAULT;
- goto out;
- }
- if (len < 0) /* size_t not fitting an compat_ssize_t .. */
- goto out;
- tot_len += len;
- if (tot_len < tmp) /* maths overflow on the compat_ssize_t */
- goto out;
- vector->iov_base = compat_ptr(buf);
- vector->iov_len = (compat_size_t) len;
- uvector++;
- vector++;
- }
+ tot_len = compat_rw_copy_check_uvector(type, uvector, nr_segs,
+ UIO_FASTIOV, iovstack, &iov);
if (tot_len == 0) {
ret = 0;
goto out;
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 641640dc7ae5..8ea5e3374507 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -601,8 +601,11 @@ static int ioc_settimeout(unsigned int fd, unsigned int cmd,
}
/* Bluetooth ioctls */
-#define HCIUARTSETPROTO _IOW('U', 200, int)
-#define HCIUARTGETPROTO _IOR('U', 201, int)
+#define HCIUARTSETPROTO _IOW('U', 200, int)
+#define HCIUARTGETPROTO _IOR('U', 201, int)
+#define HCIUARTGETDEVICE _IOR('U', 202, int)
+#define HCIUARTSETFLAGS _IOW('U', 203, int)
+#define HCIUARTGETFLAGS _IOR('U', 204, int)
#define BNEPCONNADD _IOW('B', 200, int)
#define BNEPCONNDEL _IOW('B', 201, int)
@@ -1328,6 +1331,8 @@ COMPATIBLE_IOCTL(HCISETLINKPOL)
COMPATIBLE_IOCTL(HCISETLINKMODE)
COMPATIBLE_IOCTL(HCISETACLMTU)
COMPATIBLE_IOCTL(HCISETSCOMTU)
+COMPATIBLE_IOCTL(HCIBLOCKADDR)
+COMPATIBLE_IOCTL(HCIUNBLOCKADDR)
COMPATIBLE_IOCTL(HCIINQUIRY)
COMPATIBLE_IOCTL(HCIUARTSETPROTO)
COMPATIBLE_IOCTL(HCIUARTGETPROTO)
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index c8af2d91174b..cf78d44a8d6a 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -73,15 +73,6 @@ int configfs_setattr(struct dentry * dentry, struct iattr * iattr)
return -EINVAL;
sd_iattr = sd->s_iattr;
-
- error = inode_change_ok(inode, iattr);
- if (error)
- return error;
-
- error = inode_setattr(inode, iattr);
- if (error)
- return error;
-
if (!sd_iattr) {
/* setting attributes for the first time, allocate now */
sd_iattr = kzalloc(sizeof(struct iattr), GFP_KERNEL);
@@ -94,9 +85,12 @@ int configfs_setattr(struct dentry * dentry, struct iattr * iattr)
sd_iattr->ia_atime = sd_iattr->ia_mtime = sd_iattr->ia_ctime = CURRENT_TIME;
sd->s_iattr = sd_iattr;
}
-
	/* attributes were changed at least once in the past */
+ error = simple_setattr(dentry, iattr);
+ if (error)
+ return error;
+
if (ia_valid & ATTR_UID)
sd_iattr->ia_uid = iattr->ia_uid;
if (ia_valid & ATTR_GID)
diff --git a/fs/dcache.c b/fs/dcache.c
index d96047b4a633..86d4db15473e 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -590,6 +590,8 @@ static void prune_dcache(int count)
up_read(&sb->s_umount);
}
spin_lock(&sb_lock);
+ /* lock was dropped, must reset next */
+ list_safe_reset_next(sb, n, s_list);
count -= pruned;
__put_super(sb);
/* more work left to do? */
@@ -894,7 +896,7 @@ EXPORT_SYMBOL(shrink_dcache_parent);
*
 * In this case we return -1 to tell the caller that we bailed.
*/
-static int shrink_dcache_memory(int nr, gfp_t gfp_mask)
+static int shrink_dcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
{
if (nr) {
if (!(gfp_mask & __GFP_FS))
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 4d74fc72c195..0210898458b2 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -277,8 +277,10 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_x32, debugfs_u32_get, debugfs_u32_set, "0x%08llx\n"
DEFINE_SIMPLE_ATTRIBUTE(fops_x32_ro, debugfs_u32_get, NULL, "0x%08llx\n");
DEFINE_SIMPLE_ATTRIBUTE(fops_x32_wo, NULL, debugfs_u32_set, "0x%08llx\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_x64, debugfs_u64_get, debugfs_u64_set, "0x%016llx\n");
+
/*
- * debugfs_create_x{8,16,32} - create a debugfs file that is used to read and write an unsigned {8,16,32}-bit value
+ * debugfs_create_x{8,16,32,64} - create a debugfs file that is used to read and write an unsigned {8,16,32,64}-bit value
*
* These functions are exactly the same as the above functions (but use a hex
* output for the decimal challenged). For details look at the above unsigned
@@ -357,6 +359,23 @@ struct dentry *debugfs_create_x32(const char *name, mode_t mode,
}
EXPORT_SYMBOL_GPL(debugfs_create_x32);
+/**
+ * debugfs_create_x64 - create a debugfs file that is used to read and write an unsigned 64-bit value
+ * @name: a pointer to a string containing the name of the file to create.
+ * @mode: the permission that the file should have
+ * @parent: a pointer to the parent dentry for this file. This should be a
+ * directory dentry if set. If this parameter is %NULL, then the
+ * file will be created in the root of the debugfs filesystem.
+ * @value: a pointer to the variable that the file should read to and write
+ * from.
+ */
+struct dentry *debugfs_create_x64(const char *name, mode_t mode,
+ struct dentry *parent, u64 *value)
+{
+ return debugfs_create_file(name, mode, parent, value, &fops_x64);
+}
+EXPORT_SYMBOL_GPL(debugfs_create_x64);
+
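
A hedged usage sketch for the new helper; the driver, directory, and variable names below are hypothetical, not part of this patch. It is called exactly like its x8/x16/x32 siblings:

	#include <linux/debugfs.h>
	#include <linux/module.h>

	/* Hypothetical state exposed at /sys/kernel/debug/mydrv/reg. */
	static u64 my_reg_value;
	static struct dentry *my_dir;

	static int __init mydrv_debugfs_init(void)
	{
		my_dir = debugfs_create_dir("mydrv", NULL);
		if (!my_dir)
			return -ENOMEM;
		/* Hex read/write file backed by my_reg_value. */
		debugfs_create_x64("reg", 0644, my_dir, &my_reg_value);
		return 0;
	}

	static void __exit mydrv_debugfs_exit(void)
	{
		debugfs_remove_recursive(my_dir);
	}

	module_init(mydrv_debugfs_init);
	module_exit(mydrv_debugfs_exit);
	MODULE_LICENSE("GPL");
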
static int debugfs_size_t_set(void *data, u64 val)
{
diff --git a/fs/direct-io.c b/fs/direct-io.c
index e82adc2debb7..a10cb91cadea 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -82,6 +82,8 @@ struct dio {
int reap_counter; /* rate limit reaping */
get_block_t *get_block; /* block mapping function */
dio_iodone_t *end_io; /* IO completion function */
+	dio_submit_t *submit_io;	/* IO submission function */
+ loff_t logical_offset_in_bio; /* current first logical block in bio */
sector_t final_block_in_bio; /* current final block in bio + 1 */
sector_t next_block_for_io; /* next block to be put under IO,
in dio_blocks units */
@@ -96,6 +98,7 @@ struct dio {
unsigned cur_page_offset; /* Offset into it, in bytes */
unsigned cur_page_len; /* Nr of bytes at cur_page_offset */
sector_t cur_page_block; /* Where it starts */
+ loff_t cur_page_fs_offset; /* Offset in file */
/* BIO completion state */
spinlock_t bio_lock; /* protects BIO fields below */
@@ -215,7 +218,7 @@ static struct page *dio_get_page(struct dio *dio)
* filesystems can use it to hold additional state between get_block calls and
* dio_complete.
*/
-static int dio_complete(struct dio *dio, loff_t offset, int ret)
+static int dio_complete(struct dio *dio, loff_t offset, int ret, bool is_async)
{
ssize_t transferred = 0;
@@ -236,14 +239,6 @@ static int dio_complete(struct dio *dio, loff_t offset, int ret)
transferred = dio->i_size - offset;
}
- if (dio->end_io && dio->result)
- dio->end_io(dio->iocb, offset, transferred,
- dio->map_bh.b_private);
-
- if (dio->flags & DIO_LOCKING)
- /* lockdep: non-owner release */
- up_read_non_owner(&dio->inode->i_alloc_sem);
-
if (ret == 0)
ret = dio->page_errors;
if (ret == 0)
@@ -251,6 +246,17 @@ static int dio_complete(struct dio *dio, loff_t offset, int ret)
if (ret == 0)
ret = transferred;
+ if (dio->end_io && dio->result) {
+ dio->end_io(dio->iocb, offset, transferred,
+ dio->map_bh.b_private, ret, is_async);
+ } else if (is_async) {
+ aio_complete(dio->iocb, ret, 0);
+ }
+
+ if (dio->flags & DIO_LOCKING)
+ /* lockdep: non-owner release */
+ up_read_non_owner(&dio->inode->i_alloc_sem);
+
return ret;
}
@@ -274,8 +280,7 @@ static void dio_bio_end_aio(struct bio *bio, int error)
spin_unlock_irqrestore(&dio->bio_lock, flags);
if (remaining == 0) {
- int ret = dio_complete(dio, dio->iocb->ki_pos, 0);
- aio_complete(dio->iocb, ret, 0);
+ dio_complete(dio, dio->iocb->ki_pos, 0, true);
kfree(dio);
}
}
@@ -300,6 +305,26 @@ static void dio_bio_end_io(struct bio *bio, int error)
spin_unlock_irqrestore(&dio->bio_lock, flags);
}
+/**
+ * dio_end_io - handle the end io action for the given bio
+ * @bio: The direct io bio that's being completed
+ * @error: Error if there was one
+ *
+ * This is meant to be called by any filesystem that uses its own dio_submit_t
+ * so that the DIO-specific endio actions are dealt with after the filesystem
+ * has done its completion work.
+ */
+void dio_end_io(struct bio *bio, int error)
+{
+ struct dio *dio = bio->bi_private;
+
+ if (dio->is_async)
+ dio_bio_end_aio(bio, error);
+ else
+ dio_bio_end_io(bio, error);
+}
+EXPORT_SYMBOL_GPL(dio_end_io);
+
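
A hedged sketch of a dio_end_io() consumer. The hook and endio names are hypothetical; the dio_submit_t signature is taken from the dio_bio_submit() call site below, and bio->bi_private must be left pointing at the struct dio so dio_end_io() can dispatch correctly:

	/* Filesystem-private completion, run before the generic DIO one. */
	static void myfs_dio_endio(struct bio *bio, int error)
	{
		/* ... filesystem-specific work, e.g. checksum checks ... */

		/* Hand the bio back to the generic direct-io completion. */
		dio_end_io(bio, error);
	}

	/* dio_submit_t hook passed to __blockdev_direct_IO_newtrunc(). */
	static void myfs_submit_dio(int rw, struct bio *bio,
				    struct inode *inode, loff_t file_offset)
	{
		bio->bi_end_io = myfs_dio_endio;	/* intercept completion */
		submit_bio(rw, bio);
	}
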
static int
dio_bio_alloc(struct dio *dio, struct block_device *bdev,
sector_t first_sector, int nr_vecs)
@@ -316,6 +341,7 @@ dio_bio_alloc(struct dio *dio, struct block_device *bdev,
bio->bi_end_io = dio_bio_end_io;
dio->bio = bio;
+ dio->logical_offset_in_bio = dio->cur_page_fs_offset;
return 0;
}
@@ -340,10 +366,15 @@ static void dio_bio_submit(struct dio *dio)
if (dio->is_async && dio->rw == READ)
bio_set_pages_dirty(bio);
- submit_bio(dio->rw, bio);
+ if (dio->submit_io)
+ dio->submit_io(dio->rw, bio, dio->inode,
+ dio->logical_offset_in_bio);
+ else
+ submit_bio(dio->rw, bio);
dio->bio = NULL;
dio->boundary = 0;
+ dio->logical_offset_in_bio = 0;
}
/*
@@ -603,10 +634,26 @@ static int dio_send_cur_page(struct dio *dio)
int ret = 0;
if (dio->bio) {
+ loff_t cur_offset = dio->block_in_file << dio->blkbits;
+ loff_t bio_next_offset = dio->logical_offset_in_bio +
+ dio->bio->bi_size;
+
/*
- * See whether this new request is contiguous with the old
+ * See whether this new request is contiguous with the old.
+ *
+		 * Btrfs cannot handle having logically non-contiguous requests
+		 * submitted.  For example, if you have
+		 *
+		 * Logical:  [0-4095][HOLE][8192-12287]
+		 * Physical: [0-4095]      [4096-8191]
+ *
+ * We cannot submit those pages together as one BIO. So if our
+ * current logical offset in the file does not equal what would
+ * be the next logical offset in the bio, submit the bio we
+ * have.
*/
- if (dio->final_block_in_bio != dio->cur_page_block)
+ if (dio->final_block_in_bio != dio->cur_page_block ||
+ cur_offset != bio_next_offset)
dio_bio_submit(dio);
/*
* Submit now if the underlying fs is about to perform a
@@ -701,6 +748,7 @@ submit_page_section(struct dio *dio, struct page *page,
dio->cur_page_offset = offset;
dio->cur_page_len = len;
dio->cur_page_block = blocknr;
+ dio->cur_page_fs_offset = dio->block_in_file << dio->blkbits;
out:
return ret;
}
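
The rule described in the dio_send_cur_page() comment above reduces to a two-part predicate; a standalone sketch with illustrative names:

	#include <stdbool.h>
	#include <stdint.h>

	/*
	 * A page may extend the pending bio only if it is contiguous both
	 * physically (block numbers) and logically (file offsets); a hole
	 * between physically adjacent extents forces a submission.
	 */
	static bool can_extend_bio(uint64_t final_block_in_bio,
				   uint64_t cur_page_block,
				   uint64_t bio_next_offset,
				   uint64_t cur_file_offset)
	{
		return final_block_in_bio == cur_page_block &&
		       bio_next_offset == cur_file_offset;
	}

	int main(void)
	{
		/* The hole example above: physically contiguous, but the
		 * file offsets differ (8192 vs 4096), so no merge. */
		return can_extend_bio(1, 1, 4096, 8192);	/* exits 0 */
	}
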
@@ -935,7 +983,7 @@ static ssize_t
direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
const struct iovec *iov, loff_t offset, unsigned long nr_segs,
unsigned blkbits, get_block_t get_block, dio_iodone_t end_io,
- struct dio *dio)
+ dio_submit_t submit_io, struct dio *dio)
{
unsigned long user_addr;
unsigned long flags;
@@ -952,6 +1000,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
dio->get_block = get_block;
dio->end_io = end_io;
+ dio->submit_io = submit_io;
dio->final_block_in_bio = -1;
dio->next_block_for_io = -1;
@@ -1008,7 +1057,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
}
} /* end iovec loop */
- if (ret == -ENOTBLK && (rw & WRITE)) {
+ if (ret == -ENOTBLK) {
/*
* The remaining part of the request will be
* be handled by buffered I/O when we return
@@ -1079,7 +1128,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
spin_unlock_irqrestore(&dio->bio_lock, flags);
if (ret2 == 0) {
- ret = dio_complete(dio, offset, ret);
+ ret = dio_complete(dio, offset, ret, false);
kfree(dio);
} else
BUG_ON(ret != -EIOCBQUEUED);
@@ -1087,30 +1136,11 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
return ret;
}
-/*
- * This is a library function for use by filesystem drivers.
- *
- * The locking rules are governed by the flags parameter:
- * - if the flags value contains DIO_LOCKING we use a fancy locking
- * scheme for dumb filesystems.
- * For writes this function is called under i_mutex and returns with
- * i_mutex held, for reads, i_mutex is not held on entry, but it is
- * taken and dropped again before returning.
- * For reads and writes i_alloc_sem is taken in shared mode and released
- * on I/O completion (which may happen asynchronously after returning to
- * the caller).
- *
- * - if the flags value does NOT contain DIO_LOCKING we don't use any
- * internal locking but rather rely on the filesystem to synchronize
- * direct I/O reads/writes versus each other and truncate.
- * For reads and writes both i_mutex and i_alloc_sem are not held on
- * entry and are never taken.
- */
ssize_t
-__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
+__blockdev_direct_IO_newtrunc(int rw, struct kiocb *iocb, struct inode *inode,
struct block_device *bdev, const struct iovec *iov, loff_t offset,
unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
- int flags)
+ dio_submit_t submit_io, int flags)
{
int seg;
size_t size;
@@ -1197,11 +1227,49 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
(end > i_size_read(inode)));
retval = direct_io_worker(rw, iocb, inode, iov, offset,
- nr_segs, blkbits, get_block, end_io, dio);
+ nr_segs, blkbits, get_block, end_io,
+ submit_io, dio);
+
+out:
+ return retval;
+}
+EXPORT_SYMBOL(__blockdev_direct_IO_newtrunc);
+
+/*
+ * This is a library function for use by filesystem drivers.
+ *
+ * The locking rules are governed by the flags parameter:
+ * - if the flags value contains DIO_LOCKING we use a fancy locking
+ * scheme for dumb filesystems.
+ * For writes this function is called under i_mutex and returns with
+ * i_mutex held, for reads, i_mutex is not held on entry, but it is
+ * taken and dropped again before returning.
+ * For reads and writes i_alloc_sem is taken in shared mode and released
+ * on I/O completion (which may happen asynchronously after returning to
+ * the caller).
+ *
+ * - if the flags value does NOT contain DIO_LOCKING we don't use any
+ * internal locking but rather rely on the filesystem to synchronize
+ * direct I/O reads/writes versus each other and truncate.
+ * For reads and writes both i_mutex and i_alloc_sem are not held on
+ * entry and are never taken.
+ */
+ssize_t
+__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
+ struct block_device *bdev, const struct iovec *iov, loff_t offset,
+ unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
+ dio_submit_t submit_io, int flags)
+{
+ ssize_t retval;
+ retval = __blockdev_direct_IO_newtrunc(rw, iocb, inode, bdev, iov,
+ offset, nr_segs, get_block, end_io, submit_io, flags);
/*
* In case of error extending write may have instantiated a few
* blocks outside i_size. Trim these off again for DIO_LOCKING.
+ * NOTE: DIO_NO_LOCK/DIO_OWN_LOCK callers have to handle this in
+ * their own manner. This is a further example of where the old
+ * truncate sequence is inadequate.
*
* NOTE: filesystems with their own locking have to handle this
* on their own.
@@ -1209,12 +1277,13 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
if (flags & DIO_LOCKING) {
if (unlikely((rw & WRITE) && retval < 0)) {
loff_t isize = i_size_read(inode);
+ loff_t end = offset + iov_length(iov, nr_segs);
+
if (end > isize)
vmtruncate(inode, isize);
}
}
-out:
return retval;
}
EXPORT_SYMBOL(__blockdev_direct_IO);
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 3bdddbcc785f..e8fcf4e2ed7d 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -274,7 +274,7 @@ static int ecryptfs_release(struct inode *inode, struct file *file)
}
static int
-ecryptfs_fsync(struct file *file, struct dentry *dentry, int datasync)
+ecryptfs_fsync(struct file *file, int datasync)
{
return vfs_fsync(ecryptfs_file_to_lower(file), datasync);
}
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 65dee2f336ae..31ef5252f0fe 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -805,7 +805,7 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
- (ia->ia_size & ~PAGE_CACHE_MASK));
if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
- rc = vmtruncate(inode, ia->ia_size);
+ rc = simple_setsize(inode, ia->ia_size);
if (rc)
goto out;
lower_ia->ia_size = ia->ia_size;
@@ -830,7 +830,7 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
goto out;
}
}
- vmtruncate(inode, ia->ia_size);
+ simple_setsize(inode, ia->ia_size);
rc = ecryptfs_write_inode_size_to_metadata(inode);
if (rc) {
printk(KERN_ERR "Problem with "
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index 2d8dbce9d485..46c4dd8dfcc3 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -31,9 +31,9 @@ static struct mutex ecryptfs_msg_ctx_lists_mux;
static struct hlist_head *ecryptfs_daemon_hash;
struct mutex ecryptfs_daemon_hash_mux;
-static int ecryptfs_hash_buckets;
+static int ecryptfs_hash_bits;
#define ecryptfs_uid_hash(uid) \
- hash_long((unsigned long)uid, ecryptfs_hash_buckets)
+ hash_long((unsigned long)uid, ecryptfs_hash_bits)
static u32 ecryptfs_msg_counter;
static struct ecryptfs_msg_ctx *ecryptfs_msg_ctx_arr;
@@ -486,18 +486,19 @@ int ecryptfs_init_messaging(void)
}
mutex_init(&ecryptfs_daemon_hash_mux);
mutex_lock(&ecryptfs_daemon_hash_mux);
- ecryptfs_hash_buckets = 1;
- while (ecryptfs_number_of_users >> ecryptfs_hash_buckets)
- ecryptfs_hash_buckets++;
+ ecryptfs_hash_bits = 1;
+ while (ecryptfs_number_of_users >> ecryptfs_hash_bits)
+ ecryptfs_hash_bits++;
ecryptfs_daemon_hash = kmalloc((sizeof(struct hlist_head)
- * ecryptfs_hash_buckets), GFP_KERNEL);
+ * (1 << ecryptfs_hash_bits)),
+ GFP_KERNEL);
if (!ecryptfs_daemon_hash) {
rc = -ENOMEM;
printk(KERN_ERR "%s: Failed to allocate memory\n", __func__);
mutex_unlock(&ecryptfs_daemon_hash_mux);
goto out;
}
- for (i = 0; i < ecryptfs_hash_buckets; i++)
+ for (i = 0; i < (1 << ecryptfs_hash_bits); i++)
INIT_HLIST_HEAD(&ecryptfs_daemon_hash[i]);
mutex_unlock(&ecryptfs_daemon_hash_mux);
ecryptfs_msg_ctx_arr = kmalloc((sizeof(struct ecryptfs_msg_ctx)
@@ -554,7 +555,7 @@ void ecryptfs_release_messaging(void)
int i;
mutex_lock(&ecryptfs_daemon_hash_mux);
- for (i = 0; i < ecryptfs_hash_buckets; i++) {
+ for (i = 0; i < (1 << ecryptfs_hash_bits); i++) {
int rc;
hlist_for_each_entry(daemon, elem,
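
The rename from ecryptfs_hash_buckets to ecryptfs_hash_bits is the point of this hunk: hash_long() takes a number of bits, not a number of buckets, so the table must hold 1 << bits list heads. A standalone sketch of the sizing loop:

	#include <stdio.h>

	/* Pick enough bits that (1 << bits) buckets cover nr_users;
	 * mirrors the while loop in ecryptfs_init_messaging() above. */
	static int hash_bits_for(unsigned long nr_users)
	{
		int bits = 1;

		while (nr_users >> bits)
			bits++;
		return bits;
	}

	int main(void)
	{
		/* 100 users -> 7 bits -> 128 buckets. */
		printf("%d buckets\n", 1 << hash_bits_for(100));
		return 0;
	}
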
diff --git a/fs/exec.c b/fs/exec.c
index 9badbc0bfb1d..e19de6a80339 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -768,7 +768,6 @@ static int de_thread(struct task_struct *tsk)
struct signal_struct *sig = tsk->signal;
struct sighand_struct *oldsighand = tsk->sighand;
spinlock_t *lock = &oldsighand->siglock;
- int count;
if (thread_group_empty(tsk))
goto no_thread_group;
@@ -785,13 +784,13 @@ static int de_thread(struct task_struct *tsk)
spin_unlock_irq(lock);
return -EAGAIN;
}
+
sig->group_exit_task = tsk;
- zap_other_threads(tsk);
+ sig->notify_count = zap_other_threads(tsk);
+ if (!thread_group_leader(tsk))
+ sig->notify_count--;
- /* Account for the thread group leader hanging around: */
- count = thread_group_leader(tsk) ? 1 : 2;
- sig->notify_count = count;
- while (atomic_read(&sig->count) > count) {
+ while (sig->notify_count) {
__set_current_state(TASK_UNINTERRUPTIBLE);
spin_unlock_irq(lock);
schedule();
@@ -1662,12 +1661,15 @@ static int coredump_wait(int exit_code, struct core_state *core_state)
struct task_struct *tsk = current;
struct mm_struct *mm = tsk->mm;
struct completion *vfork_done;
- int core_waiters;
+ int core_waiters = -EBUSY;
init_completion(&core_state->startup);
core_state->dumper.task = tsk;
core_state->dumper.next = NULL;
- core_waiters = zap_threads(tsk, mm, core_state, exit_code);
+
+ down_write(&mm->mmap_sem);
+ if (!mm->core_state)
+ core_waiters = zap_threads(tsk, mm, core_state, exit_code);
up_write(&mm->mmap_sem);
if (unlikely(core_waiters < 0))
@@ -1787,21 +1789,61 @@ static void wait_for_dump_helpers(struct file *file)
}
+/*
+ * umh_pipe_setup
+ * helper function to customize the process used
+ * to collect the core in userspace. Specifically
+ * it sets up a pipe and installs it as fd 0 (stdin)
+ * for the process. Returns 0 on success, or
+ * PTR_ERR on failure.
+ * Note that it also sets the core limit to 1. This
+ * is a special value that we use to trap recursive
+ * core dumps.
+ */
+static int umh_pipe_setup(struct subprocess_info *info)
+{
+ struct file *rp, *wp;
+ struct fdtable *fdt;
+ struct coredump_params *cp = (struct coredump_params *)info->data;
+ struct files_struct *cf = current->files;
+
+ wp = create_write_pipe(0);
+ if (IS_ERR(wp))
+ return PTR_ERR(wp);
+
+ rp = create_read_pipe(wp, 0);
+ if (IS_ERR(rp)) {
+ free_write_pipe(wp);
+ return PTR_ERR(rp);
+ }
+
+ cp->file = wp;
+
+ sys_close(0);
+ fd_install(0, rp);
+ spin_lock(&cf->file_lock);
+ fdt = files_fdtable(cf);
+ FD_SET(0, fdt->open_fds);
+ FD_CLR(0, fdt->close_on_exec);
+ spin_unlock(&cf->file_lock);
+
+ /* and disallow core files too */
+ current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1};
+
+ return 0;
+}
+
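
What umh_pipe_setup() does in-kernel corresponds to the classic userspace idiom of installing a pipe's read end as a child's stdin before exec; a plain sketch for comparison:

	#include <stdio.h>
	#include <sys/wait.h>
	#include <unistd.h>

	int main(void)
	{
		int fds[2];

		if (pipe(fds) < 0) {
			perror("pipe");
			return 1;
		}
		if (fork() == 0) {
			/* Child: read end becomes fd 0, pipe fds closed. */
			dup2(fds[0], 0);
			close(fds[0]);
			close(fds[1]);
			execlp("wc", "wc", "-c", (char *)NULL);
			_exit(127);
		}
		/* Parent: feed data down the write end, much as the kernel
		 * feeds the core image to the helper. */
		close(fds[0]);
		write(fds[1], "hello\n", 6);
		close(fds[1]);
		wait(NULL);
		return 0;
	}
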
void do_coredump(long signr, int exit_code, struct pt_regs *regs)
{
struct core_state core_state;
char corename[CORENAME_MAX_SIZE + 1];
struct mm_struct *mm = current->mm;
struct linux_binfmt * binfmt;
- struct inode * inode;
const struct cred *old_cred;
struct cred *cred;
int retval = 0;
int flag = 0;
- int ispipe = 0;
- char **helper_argv = NULL;
- int helper_argc = 0;
- int dump_count = 0;
+ int ispipe;
static atomic_t core_dump_count = ATOMIC_INIT(0);
struct coredump_params cprm = {
.signr = signr,
@@ -1820,23 +1862,12 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
binfmt = mm->binfmt;
if (!binfmt || !binfmt->core_dump)
goto fail;
-
- cred = prepare_creds();
- if (!cred) {
- retval = -ENOMEM;
+ if (!__get_dumpable(cprm.mm_flags))
goto fail;
- }
- down_write(&mm->mmap_sem);
- /*
- * If another thread got here first, or we are not dumpable, bail out.
- */
- if (mm->core_state || !__get_dumpable(cprm.mm_flags)) {
- up_write(&mm->mmap_sem);
- put_cred(cred);
+ cred = prepare_creds();
+ if (!cred)
goto fail;
- }
-
/*
* We cannot trust fsuid as being the "true" uid of the
* process nor do we know its entire history. We only know it
@@ -1849,10 +1880,8 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
}
retval = coredump_wait(exit_code, &core_state);
- if (retval < 0) {
- put_cred(cred);
- goto fail;
- }
+ if (retval < 0)
+ goto fail_creds;
old_cred = override_creds(cred);
@@ -1870,19 +1899,19 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
ispipe = format_corename(corename, signr);
unlock_kernel();
- if ((!ispipe) && (cprm.limit < binfmt->min_coredump))
- goto fail_unlock;
-
if (ispipe) {
- if (cprm.limit == 0) {
+ int dump_count;
+ char **helper_argv;
+
+ if (cprm.limit == 1) {
/*
* Normally core limits are irrelevant to pipes, since
* we're not writing to the file system, but we use
- * cprm.limit of 0 here as a speacial value. Any
- * non-zero limit gets set to RLIM_INFINITY below, but
+			 * cprm.limit of 1 here as a special value. Any
+ * non-1 limit gets set to RLIM_INFINITY below, but
			 * a limit of 1 skips the dump. This is a consistent
* way to catch recursive crashes. We can still crash
- * if the core_pattern binary sets RLIM_CORE = !0
+			 * if the core_pattern binary sets RLIM_CORE to a value != 1,
* but it runs as root, and can do lots of stupid things
* Note that we use task_tgid_vnr here to grab the pid
* of the process group leader. That way we get the
@@ -1890,11 +1919,12 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
* core_pattern process dies.
*/
printk(KERN_WARNING
- "Process %d(%s) has RLIMIT_CORE set to 0\n",
+ "Process %d(%s) has RLIMIT_CORE set to 1\n",
task_tgid_vnr(current), current->comm);
printk(KERN_WARNING "Aborting core\n");
goto fail_unlock;
}
+ cprm.limit = RLIM_INFINITY;
dump_count = atomic_inc_return(&core_dump_count);
if (core_pipe_limit && (core_pipe_limit < dump_count)) {
@@ -1904,71 +1934,74 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
goto fail_dropcount;
}
- helper_argv = argv_split(GFP_KERNEL, corename+1, &helper_argc);
+ helper_argv = argv_split(GFP_KERNEL, corename+1, NULL);
if (!helper_argv) {
printk(KERN_WARNING "%s failed to allocate memory\n",
__func__);
goto fail_dropcount;
}
- cprm.limit = RLIM_INFINITY;
-
- /* SIGPIPE can happen, but it's just never processed */
- if (call_usermodehelper_pipe(helper_argv[0], helper_argv, NULL,
- &cprm.file)) {
+ retval = call_usermodehelper_fns(helper_argv[0], helper_argv,
+ NULL, UMH_WAIT_EXEC, umh_pipe_setup,
+ NULL, &cprm);
+ argv_free(helper_argv);
+ if (retval) {
printk(KERN_INFO "Core dump to %s pipe failed\n",
corename);
- goto fail_dropcount;
+ goto close_fail;
}
- } else
+ } else {
+ struct inode *inode;
+
+ if (cprm.limit < binfmt->min_coredump)
+ goto fail_unlock;
+
cprm.file = filp_open(corename,
O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag,
0600);
- if (IS_ERR(cprm.file))
- goto fail_dropcount;
- inode = cprm.file->f_path.dentry->d_inode;
- if (inode->i_nlink > 1)
- goto close_fail; /* multiple links - don't dump */
- if (!ispipe && d_unhashed(cprm.file->f_path.dentry))
- goto close_fail;
-
- /* AK: actually i see no reason to not allow this for named pipes etc.,
- but keep the previous behaviour for now. */
- if (!ispipe && !S_ISREG(inode->i_mode))
- goto close_fail;
- /*
- * Dont allow local users get cute and trick others to coredump
- * into their pre-created files:
- * Note, this is not relevant for pipes
- */
- if (!ispipe && (inode->i_uid != current_fsuid()))
- goto close_fail;
- if (!cprm.file->f_op)
- goto close_fail;
- if (!cprm.file->f_op->write)
- goto close_fail;
- if (!ispipe &&
- do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file) != 0)
- goto close_fail;
+ if (IS_ERR(cprm.file))
+ goto fail_unlock;
- retval = binfmt->core_dump(&cprm);
+ inode = cprm.file->f_path.dentry->d_inode;
+ if (inode->i_nlink > 1)
+ goto close_fail;
+ if (d_unhashed(cprm.file->f_path.dentry))
+ goto close_fail;
+ /*
+		 * AK: actually I see no reason not to allow this for named
+		 * pipes etc., but keep the previous behaviour for now.
+ */
+ if (!S_ISREG(inode->i_mode))
+ goto close_fail;
+ /*
+		 * Don't allow local users to get cute and trick others into
+		 * dumping core into their pre-created files.
+ */
+ if (inode->i_uid != current_fsuid())
+ goto close_fail;
+ if (!cprm.file->f_op || !cprm.file->f_op->write)
+ goto close_fail;
+ if (do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file))
+ goto close_fail;
+ }
+ retval = binfmt->core_dump(&cprm);
if (retval)
current->signal->group_exit_code |= 0x80;
-close_fail:
+
if (ispipe && core_pipe_limit)
wait_for_dump_helpers(cprm.file);
- filp_close(cprm.file, NULL);
+close_fail:
+ if (cprm.file)
+ filp_close(cprm.file, NULL);
fail_dropcount:
- if (dump_count)
+ if (ispipe)
atomic_dec(&core_dump_count);
fail_unlock:
- if (helper_argv)
- argv_free(helper_argv);
-
+ coredump_finish(mm);
revert_creds(old_cred);
+fail_creds:
put_cred(cred);
- coredump_finish(mm);
fail:
return;
}
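
The recursion trap above works because the pipe helper inherits RLIMIT_CORE == 1 from umh_pipe_setup(); from userspace the sentinel is visible with getrlimit() (sketch):

	#include <stdio.h>
	#include <sys/resource.h>

	int main(void)
	{
		struct rlimit rl;

		if (getrlimit(RLIMIT_CORE, &rl) == 0)
			printf("core limit: soft=%llu hard=%llu\n",
			       (unsigned long long)rl.rlim_cur,
			       (unsigned long long)rl.rlim_max);
		/*
		 * A core_pattern helper spawned by do_coredump() would print
		 * soft == hard == 1: the sentinel that aborts nested dumps.
		 */
		return 0;
	}
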
diff --git a/fs/exofs/file.c b/fs/exofs/file.c
index 839b9dc1e70f..fef6899be397 100644
--- a/fs/exofs/file.c
+++ b/fs/exofs/file.c
@@ -40,12 +40,11 @@ static int exofs_release_file(struct inode *inode, struct file *filp)
return 0;
}
-static int exofs_file_fsync(struct file *filp, struct dentry *dentry,
- int datasync)
+static int exofs_file_fsync(struct file *filp, int datasync)
{
int ret;
struct address_space *mapping = filp->f_mapping;
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = mapping->host;
struct super_block *sb;
ret = filemap_write_and_wait(mapping);
@@ -66,7 +65,7 @@ static int exofs_file_fsync(struct file *filp, struct dentry *dentry,
static int exofs_flush(struct file *file, fl_owner_t id)
{
- exofs_file_fsync(file, file->f_path.dentry, 1);
+ exofs_file_fsync(file, 1);
/* TODO: Flush the OSD target */
return 0;
}
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index ca7e2a0ed98a..2bcc0431bada 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -200,6 +200,7 @@ ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
return error;
else {
inode->i_mode = mode;
+ inode->i_ctime = CURRENT_TIME_SEC;
mark_inode_dirty(inode);
if (error == 0)
acl = NULL;
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 0b038e47ad2f..52b34f1d2738 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -122,7 +122,6 @@ extern int ext2_write_inode (struct inode *, struct writeback_control *);
extern void ext2_delete_inode (struct inode *);
extern int ext2_sync_inode (struct inode *);
extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int);
-extern void ext2_truncate (struct inode *);
extern int ext2_setattr (struct dentry *, struct iattr *);
extern void ext2_set_inode_flags(struct inode *inode);
extern void ext2_get_inode_flags(struct ext2_inode_info *);
@@ -155,7 +154,7 @@ extern void ext2_write_super (struct super_block *);
extern const struct file_operations ext2_dir_operations;
/* file.c */
-extern int ext2_fsync(struct file *file, struct dentry *dentry, int datasync);
+extern int ext2_fsync(struct file *file, int datasync);
extern const struct inode_operations ext2_file_inode_operations;
extern const struct file_operations ext2_file_operations;
extern const struct file_operations ext2_xip_file_operations;
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 5d198d0697fb..49eec9456c5b 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -40,13 +40,13 @@ static int ext2_release_file (struct inode * inode, struct file * filp)
return 0;
}
-int ext2_fsync(struct file *file, struct dentry *dentry, int datasync)
+int ext2_fsync(struct file *file, int datasync)
{
int ret;
- struct super_block *sb = dentry->d_inode->i_sb;
+ struct super_block *sb = file->f_mapping->host->i_sb;
struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping;
- ret = simple_fsync(file, dentry, datasync);
+ ret = generic_file_fsync(file, datasync);
if (ret == -EIO || test_and_clear_bit(AS_EIO, &mapping->flags)) {
/* We don't really know where the IO error happened... */
ext2_error(sb, __func__,
@@ -95,7 +95,6 @@ const struct file_operations ext2_xip_file_operations = {
#endif
const struct inode_operations ext2_file_inode_operations = {
- .truncate = ext2_truncate,
#ifdef CONFIG_EXT2_FS_XATTR
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 527c46d9bc1f..3675088cb88c 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -54,6 +54,18 @@ static inline int ext2_inode_is_fast_symlink(struct inode *inode)
inode->i_blocks - ea_blocks == 0);
}
+static void ext2_truncate_blocks(struct inode *inode, loff_t offset);
+
+static void ext2_write_failed(struct address_space *mapping, loff_t to)
+{
+ struct inode *inode = mapping->host;
+
+ if (to > inode->i_size) {
+ truncate_pagecache(inode, to, inode->i_size);
+ ext2_truncate_blocks(inode, inode->i_size);
+ }
+}
+
/*
* Called at the last iput() if i_nlink is zero.
*/
@@ -71,7 +83,7 @@ void ext2_delete_inode (struct inode * inode)
inode->i_size = 0;
if (inode->i_blocks)
- ext2_truncate (inode);
+ ext2_truncate_blocks(inode, 0);
ext2_free_inode (inode);
return;
@@ -757,8 +769,8 @@ int __ext2_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
{
- return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
- ext2_get_block);
+ return block_write_begin_newtrunc(file, mapping, pos, len, flags,
+ pagep, fsdata, ext2_get_block);
}
static int
@@ -766,8 +778,25 @@ ext2_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
{
+ int ret;
+
*pagep = NULL;
- return __ext2_write_begin(file, mapping, pos, len, flags, pagep,fsdata);
+ ret = __ext2_write_begin(file, mapping, pos, len, flags, pagep, fsdata);
+ if (ret < 0)
+ ext2_write_failed(mapping, pos + len);
+ return ret;
+}
+
+static int ext2_write_end(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
+{
+ int ret;
+
+ ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
+ if (ret < len)
+ ext2_write_failed(mapping, pos + len);
+ return ret;
}
static int
@@ -775,13 +804,18 @@ ext2_nobh_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
{
+ int ret;
+
/*
* Dir-in-pagecache still uses ext2_write_begin. Would have to rework
* directory handling code to pass around offsets rather than struct
* pages in order to make this work easily.
*/
- return nobh_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
- ext2_get_block);
+ ret = nobh_write_begin_newtrunc(file, mapping, pos, len, flags, pagep,
+ fsdata, ext2_get_block);
+ if (ret < 0)
+ ext2_write_failed(mapping, pos + len);
+ return ret;
}
static int ext2_nobh_writepage(struct page *page,
@@ -800,10 +834,15 @@ ext2_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
loff_t offset, unsigned long nr_segs)
{
struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_mapping->host;
-
- return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
- offset, nr_segs, ext2_get_block, NULL);
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
+ ssize_t ret;
+
+ ret = blockdev_direct_IO_newtrunc(rw, iocb, inode, inode->i_sb->s_bdev,
+ iov, offset, nr_segs, ext2_get_block, NULL);
+ if (ret < 0 && (rw & WRITE))
+ ext2_write_failed(mapping, offset + iov_length(iov, nr_segs));
+ return ret;
}
static int
@@ -818,7 +857,7 @@ const struct address_space_operations ext2_aops = {
.writepage = ext2_writepage,
.sync_page = block_sync_page,
.write_begin = ext2_write_begin,
- .write_end = generic_write_end,
+ .write_end = ext2_write_end,
.bmap = ext2_bmap,
.direct_IO = ext2_direct_IO,
.writepages = ext2_writepages,
@@ -1027,7 +1066,7 @@ static void ext2_free_branches(struct inode *inode, __le32 *p, __le32 *q, int de
ext2_free_data(inode, p, q);
}
-void ext2_truncate(struct inode *inode)
+static void __ext2_truncate_blocks(struct inode *inode, loff_t offset)
{
__le32 *i_data = EXT2_I(inode)->i_data;
struct ext2_inode_info *ei = EXT2_I(inode);
@@ -1039,27 +1078,8 @@ void ext2_truncate(struct inode *inode)
int n;
long iblock;
unsigned blocksize;
-
- if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
- S_ISLNK(inode->i_mode)))
- return;
- if (ext2_inode_is_fast_symlink(inode))
- return;
- if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
- return;
-
blocksize = inode->i_sb->s_blocksize;
- iblock = (inode->i_size + blocksize-1)
- >> EXT2_BLOCK_SIZE_BITS(inode->i_sb);
-
- if (mapping_is_xip(inode->i_mapping))
- xip_truncate_page(inode->i_mapping, inode->i_size);
- else if (test_opt(inode->i_sb, NOBH))
- nobh_truncate_page(inode->i_mapping,
- inode->i_size, ext2_get_block);
- else
- block_truncate_page(inode->i_mapping,
- inode->i_size, ext2_get_block);
+ iblock = (offset + blocksize-1) >> EXT2_BLOCK_SIZE_BITS(inode->i_sb);
n = ext2_block_to_path(inode, iblock, offsets, NULL);
if (n == 0)
@@ -1127,6 +1147,62 @@ do_indirects:
ext2_discard_reservation(inode);
mutex_unlock(&ei->truncate_mutex);
+}
+
+static void ext2_truncate_blocks(struct inode *inode, loff_t offset)
+{
+ /*
+ * XXX: it seems like a bug here that we don't allow
+ * IS_APPEND inode to have blocks-past-i_size trimmed off.
+ * review and fix this.
+ *
+ * Also would be nice to be able to handle IO errors and such,
+ * but that's probably too much to ask.
+ */
+ if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+ S_ISLNK(inode->i_mode)))
+ return;
+ if (ext2_inode_is_fast_symlink(inode))
+ return;
+ if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+ return;
+ __ext2_truncate_blocks(inode, offset);
+}
+
+int ext2_setsize(struct inode *inode, loff_t newsize)
+{
+ loff_t oldsize;
+ int error;
+
+ error = inode_newsize_ok(inode, newsize);
+ if (error)
+ return error;
+
+ if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+ S_ISLNK(inode->i_mode)))
+ return -EINVAL;
+ if (ext2_inode_is_fast_symlink(inode))
+ return -EINVAL;
+ if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+ return -EPERM;
+
+ if (mapping_is_xip(inode->i_mapping))
+ error = xip_truncate_page(inode->i_mapping, newsize);
+ else if (test_opt(inode->i_sb, NOBH))
+ error = nobh_truncate_page(inode->i_mapping,
+ newsize, ext2_get_block);
+ else
+ error = block_truncate_page(inode->i_mapping,
+ newsize, ext2_get_block);
+ if (error)
+ return error;
+
+ oldsize = inode->i_size;
+ i_size_write(inode, newsize);
+ truncate_pagecache(inode, oldsize, newsize);
+
+ __ext2_truncate_blocks(inode, newsize);
+
inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
if (inode_needs_sync(inode)) {
sync_mapping_buffers(inode->i_mapping);
@@ -1134,6 +1210,8 @@ do_indirects:
} else {
mark_inode_dirty(inode);
}
+
+ return 0;
}
static struct ext2_inode *ext2_get_inode(struct super_block *sb, ino_t ino,
@@ -1474,8 +1552,15 @@ int ext2_setattr(struct dentry *dentry, struct iattr *iattr)
if (error)
return error;
}
- error = inode_setattr(inode, iattr);
- if (!error && (iattr->ia_valid & ATTR_MODE))
+ if (iattr->ia_valid & ATTR_SIZE && iattr->ia_size != inode->i_size) {
+ error = ext2_setsize(inode, iattr->ia_size);
+ if (error)
+ return error;
+ }
+ generic_setattr(inode, iattr);
+ if (iattr->ia_valid & ATTR_MODE)
error = ext2_acl_chmod(inode);
+ mark_inode_dirty(inode);
+
return error;
}
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 71e9eb1fa696..7ff43f4a59cd 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -119,6 +119,8 @@ static void ext2_put_super (struct super_block * sb)
int i;
struct ext2_sb_info *sbi = EXT2_SB(sb);
+ dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
+
if (sb->s_dirt)
ext2_write_super(sb);
@@ -1063,6 +1065,12 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
sb->s_op = &ext2_sops;
sb->s_export_op = &ext2_export_ops;
sb->s_xattr = ext2_xattr_handlers;
+
+#ifdef CONFIG_QUOTA
+ sb->dq_op = &dquot_operations;
+ sb->s_qcop = &dquot_quotactl_ops;
+#endif
+
root = ext2_iget(sb, EXT2_ROOT_INO);
if (IS_ERR(root)) {
ret = PTR_ERR(root);
@@ -1241,6 +1249,7 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
spin_unlock(&sbi->s_lock);
return 0;
}
+
/*
* OK, we are remounting a valid rw partition rdonly, so set
* the rdonly flag and then mark the partition as valid again.
@@ -1248,6 +1257,13 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
es->s_state = cpu_to_le16(sbi->s_mount_state);
es->s_mtime = cpu_to_le32(get_seconds());
spin_unlock(&sbi->s_lock);
+
+ err = dquot_suspend(sb, -1);
+ if (err < 0) {
+ spin_lock(&sbi->s_lock);
+ goto restore_opts;
+ }
+
ext2_sync_super(sb, es, 1);
} else {
__le32 ret = EXT2_HAS_RO_COMPAT_FEATURE(sb,
@@ -1269,8 +1285,12 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
if (!ext2_setup_super (sb, es, 0))
sb->s_flags &= ~MS_RDONLY;
spin_unlock(&sbi->s_lock);
+
ext2_write_super(sb);
+
+ dquot_resume(sb, -1);
}
+
return 0;
restore_opts:
sbi->s_mount_opt = old_opts.s_mount_opt;
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index 01552abbca3c..8a11fe212183 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -205,6 +205,7 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type,
return error;
else {
inode->i_mode = mode;
+ inode->i_ctime = CURRENT_TIME_SEC;
ext3_mark_inode_dirty(handle, inode);
if (error == 0)
acl = NULL;
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index 373fa90c796a..e2e72c367cf6 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -297,7 +297,7 @@ static void free_rb_tree_fname(struct rb_root *root)
kfree (old);
}
if (!parent)
- root->rb_node = NULL;
+ *root = RB_ROOT;
else if (parent->rb_left == n)
parent->rb_left = NULL;
else if (parent->rb_right == n)
diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c
index fcf7487734b6..d7e9f74dc3a6 100644
--- a/fs/ext3/fsync.c
+++ b/fs/ext3/fsync.c
@@ -43,9 +43,9 @@
* inode to disk.
*/
-int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync)
+int ext3_sync_file(struct file *file, int datasync)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = file->f_mapping->host;
struct ext3_inode_info *ei = EXT3_I(inode);
journal_t *journal = EXT3_SB(inode->i_sb)->s_journal;
int ret, needs_barrier = 0;
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 0fc1293d0e96..6c953bb255e7 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -410,6 +410,8 @@ static void ext3_put_super (struct super_block * sb)
struct ext3_super_block *es = sbi->s_es;
int i, err;
+ dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
+
lock_kernel();
ext3_xattr_put_super(sb);
@@ -748,7 +750,7 @@ static int ext3_release_dquot(struct dquot *dquot);
static int ext3_mark_dquot_dirty(struct dquot *dquot);
static int ext3_write_info(struct super_block *sb, int type);
static int ext3_quota_on(struct super_block *sb, int type, int format_id,
- char *path, int remount);
+ char *path);
static int ext3_quota_on_mount(struct super_block *sb, int type);
static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data,
size_t len, loff_t off);
@@ -767,12 +769,12 @@ static const struct dquot_operations ext3_quota_operations = {
static const struct quotactl_ops ext3_qctl_operations = {
.quota_on = ext3_quota_on,
- .quota_off = vfs_quota_off,
- .quota_sync = vfs_quota_sync,
- .get_info = vfs_get_dqinfo,
- .set_info = vfs_set_dqinfo,
- .get_dqblk = vfs_get_dqblk,
- .set_dqblk = vfs_set_dqblk
+ .quota_off = dquot_quota_off,
+ .quota_sync = dquot_quota_sync,
+ .get_info = dquot_get_dqinfo,
+ .set_info = dquot_set_dqinfo,
+ .get_dqblk = dquot_get_dqblk,
+ .set_dqblk = dquot_set_dqblk
};
#endif
@@ -1527,7 +1529,7 @@ static void ext3_orphan_cleanup (struct super_block * sb,
/* Turn quotas off */
for (i = 0; i < MAXQUOTAS; i++) {
if (sb_dqopt(sb)->files[i])
- vfs_quota_off(sb, i, 0);
+ dquot_quota_off(sb, i);
}
#endif
sb->s_flags = s_flags; /* Restore MS_RDONLY status */
@@ -2551,6 +2553,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
ext3_fsblk_t n_blocks_count = 0;
unsigned long old_sb_flags;
struct ext3_mount_options old_opts;
+ int enable_quota = 0;
int err;
#ifdef CONFIG_QUOTA
int i;
@@ -2597,6 +2600,10 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
}
if (*flags & MS_RDONLY) {
+ err = dquot_suspend(sb, -1);
+ if (err < 0)
+ goto restore_opts;
+
/*
* First of all, the unconditional stuff we have to do
* to disable replay of the journal when we next remount
@@ -2651,6 +2658,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
goto restore_opts;
if (!ext3_setup_super (sb, es, 0))
sb->s_flags &= ~MS_RDONLY;
+ enable_quota = 1;
}
}
#ifdef CONFIG_QUOTA
@@ -2662,6 +2670,9 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
#endif
unlock_super(sb);
unlock_kernel();
+
+ if (enable_quota)
+ dquot_resume(sb, -1);
return 0;
restore_opts:
sb->s_flags = old_sb_flags;
@@ -2851,24 +2862,21 @@ static int ext3_write_info(struct super_block *sb, int type)
*/
static int ext3_quota_on_mount(struct super_block *sb, int type)
{
- return vfs_quota_on_mount(sb, EXT3_SB(sb)->s_qf_names[type],
- EXT3_SB(sb)->s_jquota_fmt, type);
+ return dquot_quota_on_mount(sb, EXT3_SB(sb)->s_qf_names[type],
+ EXT3_SB(sb)->s_jquota_fmt, type);
}
/*
* Standard function to be called on quota_on
*/
static int ext3_quota_on(struct super_block *sb, int type, int format_id,
- char *name, int remount)
+ char *name)
{
int err;
struct path path;
if (!test_opt(sb, QUOTA))
return -EINVAL;
- /* When remounting, no checks are needed and in fact, name is NULL */
- if (remount)
- return vfs_quota_on(sb, type, format_id, name, remount);
err = kern_path(name, LOOKUP_FOLLOW, &path);
if (err)
@@ -2906,7 +2914,7 @@ static int ext3_quota_on(struct super_block *sb, int type, int format_id,
}
}
- err = vfs_quota_on_path(sb, type, format_id, &path);
+ err = dquot_quota_on_path(sb, type, format_id, &path);
path_put(&path);
return err;
}
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index d2f37a5516c7..95b7594c76f9 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -591,14 +591,15 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
ret = ext4_mb_new_blocks(handle, &ar, errp);
if (count)
*count = ar.len;
-
/*
- * Account for the allocated meta blocks
+ * Account for the allocated meta blocks. We will never
+	 * fail with EDQUOT for metadata, but we do account for it.
*/
if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) {
spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
EXT4_I(inode)->i_allocated_meta_blocks += ar.len;
spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+ dquot_alloc_block_nofail(inode, ar.len);
}
return ret;
}
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 538c48655084..5b6973fbf1bd 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -72,9 +72,9 @@ static int add_system_zone(struct ext4_sb_info *sbi,
else if (start_blk >= (entry->start_blk + entry->count))
n = &(*n)->rb_right;
else {
- if (start_blk + count > (entry->start_blk +
+ if (start_blk + count > (entry->start_blk +
entry->count))
- entry->count = (start_blk + count -
+ entry->count = (start_blk + count -
entry->start_blk);
new_node = *n;
new_entry = rb_entry(new_node, struct ext4_system_zone,
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 86cb6d86a048..ea5e6cb7e2a5 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -83,11 +83,10 @@ int ext4_check_dir_entry(const char *function, struct inode *dir,
error_msg = "inode out of bounds";
if (error_msg != NULL)
- __ext4_error(dir->i_sb, function,
- "bad entry in directory #%lu: %s - block=%llu"
+ ext4_error_inode(function, dir,
+			"bad entry in directory: %s - block=%llu "
"offset=%u(%u), inode=%u, rec_len=%d, name_len=%d",
- dir->i_ino, error_msg,
- (unsigned long long) bh->b_blocknr,
+ error_msg, (unsigned long long) bh->b_blocknr,
(unsigned) (offset%bh->b_size), offset,
le32_to_cpu(de->inode),
rlen, de->name_len);
@@ -111,7 +110,7 @@ static int ext4_readdir(struct file *filp,
if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
EXT4_FEATURE_COMPAT_DIR_INDEX) &&
- ((EXT4_I(inode)->i_flags & EXT4_INDEX_FL) ||
+ ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) ||
((inode->i_size >> sb->s_blocksize_bits) == 1))) {
err = ext4_dx_readdir(filp, dirent, filldir);
if (err != ERR_BAD_DX_DIR) {
@@ -122,20 +121,20 @@ static int ext4_readdir(struct file *filp,
* We don't set the inode dirty flag since it's not
* critical that it get flushed back to the disk.
*/
- EXT4_I(filp->f_path.dentry->d_inode)->i_flags &= ~EXT4_INDEX_FL;
+ ext4_clear_inode_flag(filp->f_path.dentry->d_inode, EXT4_INODE_INDEX);
}
stored = 0;
offset = filp->f_pos & (sb->s_blocksize - 1);
while (!error && !stored && filp->f_pos < inode->i_size) {
- ext4_lblk_t blk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb);
- struct buffer_head map_bh;
+ struct ext4_map_blocks map;
struct buffer_head *bh = NULL;
- map_bh.b_state = 0;
- err = ext4_get_blocks(NULL, inode, blk, 1, &map_bh, 0);
+ map.m_lblk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb);
+ map.m_len = 1;
+ err = ext4_map_blocks(NULL, inode, &map, 0);
if (err > 0) {
- pgoff_t index = map_bh.b_blocknr >>
+ pgoff_t index = map.m_pblk >>
(PAGE_CACHE_SHIFT - inode->i_blkbits);
if (!ra_has_index(&filp->f_ra, index))
page_cache_sync_readahead(
@@ -143,7 +142,7 @@ static int ext4_readdir(struct file *filp,
&filp->f_ra, filp,
index, 1);
filp->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
- bh = ext4_bread(NULL, inode, blk, 0, &err);
+ bh = ext4_bread(NULL, inode, map.m_lblk, 0, &err);
}
/*
@@ -152,9 +151,8 @@ static int ext4_readdir(struct file *filp,
*/
if (!bh) {
if (!dir_has_error) {
- ext4_error(sb, "directory #%lu "
+ EXT4_ERROR_INODE(inode, "directory "
"contains a hole at offset %Lu",
- inode->i_ino,
(unsigned long long) filp->f_pos);
dir_has_error = 1;
}
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index bf938cf7c5f0..19a4de57128a 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -29,6 +29,9 @@
#include <linux/wait.h>
#include <linux/blockgroup_lock.h>
#include <linux/percpu_counter.h>
+#ifdef __KERNEL__
+#include <linux/compat.h>
+#endif
/*
* The fourth extended filesystem constants/structures
@@ -54,10 +57,10 @@
#endif
#define EXT4_ERROR_INODE(inode, fmt, a...) \
- ext4_error_inode(__func__, (inode), (fmt), ## a);
+ ext4_error_inode(__func__, (inode), (fmt), ## a)
#define EXT4_ERROR_FILE(file, fmt, a...) \
- ext4_error_file(__func__, (file), (fmt), ## a);
+ ext4_error_file(__func__, (file), (fmt), ## a)
/* data type for block offset of block group */
typedef int ext4_grpblk_t;
@@ -72,7 +75,7 @@ typedef __u32 ext4_lblk_t;
typedef unsigned int ext4_group_t;
/*
- * Flags used in mballoc's allocation_context flags field.
+ * Flags used in mballoc's allocation_context flags field.
*
* Also used to show what's going on for debugging purposes when the
* flag field is exported via the traceport interface
@@ -126,6 +129,29 @@ struct ext4_allocation_request {
};
/*
+ * Logical to physical block mapping, used by ext4_map_blocks()
+ *
+ * This structure is used to pass requests into ext4_map_blocks() as
+ * well as to store the information returned by ext4_map_blocks(). It
+ * takes less room on the stack than a struct buffer_head.
+ */
+#define EXT4_MAP_NEW (1 << BH_New)
+#define EXT4_MAP_MAPPED (1 << BH_Mapped)
+#define EXT4_MAP_UNWRITTEN (1 << BH_Unwritten)
+#define EXT4_MAP_BOUNDARY (1 << BH_Boundary)
+#define EXT4_MAP_UNINIT (1 << BH_Uninit)
+#define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\
+ EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\
+ EXT4_MAP_UNINIT)
+
+struct ext4_map_blocks {
+ ext4_fsblk_t m_pblk;
+ ext4_lblk_t m_lblk;
+ unsigned int m_len;
+ unsigned int m_flags;
+};
+
+/*
* For delayed allocation tracking
*/
struct mpage_da_data {
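
A hedged sketch of how a caller uses the new structure, modelled on the ext4_readdir() conversion later in this patch (the function name is hypothetical):

	/* Resolve one logical block of @inode without allocating. */
	static int myfs_lookup_block(struct inode *inode, ext4_lblk_t lblk,
				     ext4_fsblk_t *pblk)
	{
		struct ext4_map_blocks map;
		int err;

		map.m_lblk = lblk;	/* logical block to resolve */
		map.m_len = 1;		/* map at most one block */

		/* NULL handle: lookup only, no block allocation. */
		err = ext4_map_blocks(NULL, inode, &map, 0);
		if (err <= 0)
			return err;	/* 0: hole, <0: error */

		*pblk = map.m_pblk;	/* EXT4_MAP_* bits are in map.m_flags */
		return 0;
	}
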
@@ -321,6 +347,83 @@ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
return flags & EXT4_OTHER_FLMASK;
}
+/*
+ * Inode flags used for atomic set/get
+ */
+enum {
+ EXT4_INODE_SECRM = 0, /* Secure deletion */
+ EXT4_INODE_UNRM = 1, /* Undelete */
+ EXT4_INODE_COMPR = 2, /* Compress file */
+ EXT4_INODE_SYNC = 3, /* Synchronous updates */
+ EXT4_INODE_IMMUTABLE = 4, /* Immutable file */
+ EXT4_INODE_APPEND = 5, /* writes to file may only append */
+ EXT4_INODE_NODUMP = 6, /* do not dump file */
+ EXT4_INODE_NOATIME = 7, /* do not update atime */
+/* Reserved for compression usage... */
+ EXT4_INODE_DIRTY = 8,
+ EXT4_INODE_COMPRBLK = 9, /* One or more compressed clusters */
+ EXT4_INODE_NOCOMPR = 10, /* Don't compress */
+ EXT4_INODE_ECOMPR = 11, /* Compression error */
+/* End compression flags --- maybe not all used */
+ EXT4_INODE_INDEX = 12, /* hash-indexed directory */
+ EXT4_INODE_IMAGIC = 13, /* AFS directory */
+ EXT4_INODE_JOURNAL_DATA = 14, /* file data should be journaled */
+ EXT4_INODE_NOTAIL = 15, /* file tail should not be merged */
+ EXT4_INODE_DIRSYNC = 16, /* dirsync behaviour (directories only) */
+ EXT4_INODE_TOPDIR = 17, /* Top of directory hierarchies*/
+ EXT4_INODE_HUGE_FILE = 18, /* Set to each huge file */
+ EXT4_INODE_EXTENTS = 19, /* Inode uses extents */
+ EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */
+ EXT4_INODE_EOFBLOCKS = 22, /* Blocks allocated beyond EOF */
+ EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */
+};
+
+#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG))
+#define CHECK_FLAG_VALUE(FLAG) if (!TEST_FLAG_VALUE(FLAG)) { \
+ printk(KERN_EMERG "EXT4 flag fail: " #FLAG ": %d %d\n", \
+ EXT4_##FLAG##_FL, EXT4_INODE_##FLAG); BUG_ON(1); }
+
+/*
+ * Since it's pretty easy to mix up bit numbers and hex values, and we
+ * can't do a compile-time test for ENUM values, we use a run-time
+ * test to make sure that EXT4_XXX_FL is consistent with respect to
+ * EXT4_INODE_XXX. If all is well the printk and BUG_ON will all drop
+ * out so it won't cost any extra space in the compiled kernel image.
+ * But it's important that these values are the same, since we are
+ * using EXT4_INODE_XXX to test for the flag values, but EXT4_XX_FL
+ * must be consistent with the values of FS_XXX_FL defined in
+ * include/linux/fs.h and the on-disk values found in ext2, ext3, and
+ * ext4 filesystems, and of course the values defined in e2fsprogs.
+ *
+ * It's not paranoia if Murphy's Law really *is* out to get you.  :-)
+ */
+static inline void ext4_check_flag_values(void)
+{
+ CHECK_FLAG_VALUE(SECRM);
+ CHECK_FLAG_VALUE(UNRM);
+ CHECK_FLAG_VALUE(COMPR);
+ CHECK_FLAG_VALUE(SYNC);
+ CHECK_FLAG_VALUE(IMMUTABLE);
+ CHECK_FLAG_VALUE(APPEND);
+ CHECK_FLAG_VALUE(NODUMP);
+ CHECK_FLAG_VALUE(NOATIME);
+ CHECK_FLAG_VALUE(DIRTY);
+ CHECK_FLAG_VALUE(COMPRBLK);
+ CHECK_FLAG_VALUE(NOCOMPR);
+ CHECK_FLAG_VALUE(ECOMPR);
+ CHECK_FLAG_VALUE(INDEX);
+ CHECK_FLAG_VALUE(IMAGIC);
+ CHECK_FLAG_VALUE(JOURNAL_DATA);
+ CHECK_FLAG_VALUE(NOTAIL);
+ CHECK_FLAG_VALUE(DIRSYNC);
+ CHECK_FLAG_VALUE(TOPDIR);
+ CHECK_FLAG_VALUE(HUGE_FILE);
+ CHECK_FLAG_VALUE(EXTENTS);
+ CHECK_FLAG_VALUE(EA_INODE);
+ CHECK_FLAG_VALUE(EOFBLOCKS);
+ CHECK_FLAG_VALUE(RESERVED);
+}
+
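
The run-time check exists because enum values could not be tested at compile time here; with C11 the same invariant can be pinned statically. A standalone sketch with hypothetical names (in-kernel, BUILD_BUG_ON() inside a function plays the same role):

	#include <assert.h>

	/* Hypothetical pair mirroring EXT4_SYNC_FL / EXT4_INODE_SYNC. */
	#define DEMO_SYNC_FL	0x00000008
	enum { DEMO_INODE_SYNC = 3 };

	/* Compilation fails if the hex flag and bit number ever diverge. */
	static_assert(DEMO_SYNC_FL == (1 << DEMO_INODE_SYNC),
		      "DEMO_SYNC_FL inconsistent with DEMO_INODE_SYNC");

	int main(void)
	{
		return 0;
	}
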
/* Used to pass group descriptor data when online resize is done */
struct ext4_new_group_input {
__u32 group; /* Group number for this data */
@@ -332,6 +435,18 @@ struct ext4_new_group_input {
__u16 unused;
};
+#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
+struct compat_ext4_new_group_input {
+ u32 group;
+ compat_u64 block_bitmap;
+ compat_u64 inode_bitmap;
+ compat_u64 inode_table;
+ u32 blocks_count;
+ u16 reserved_blocks;
+ u16 unused;
+};
+#endif
+
/* The struct ext4_new_group_input in kernel space, with free_blocks_count */
struct ext4_new_group_data {
__u32 group;
@@ -355,7 +470,7 @@ struct ext4_new_group_data {
#define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT (EXT4_GET_BLOCKS_UNINIT_EXT|\
EXT4_GET_BLOCKS_CREATE)
/* Caller is from the delayed allocation writeout path,
- so set the magic i_delalloc_reserve_flag after taking the
+ so set the magic i_delalloc_reserve_flag after taking the
inode allocation semaphore for */
#define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004
/* caller is from the direct IO path, request to creation of an
@@ -398,6 +513,7 @@ struct ext4_new_group_data {
#define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12)
#define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent)
+#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
/*
* ioctl commands in 32 bit emulation
*/
@@ -408,11 +524,13 @@ struct ext4_new_group_data {
#define EXT4_IOC32_GETRSVSZ _IOR('f', 5, int)
#define EXT4_IOC32_SETRSVSZ _IOW('f', 6, int)
#define EXT4_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int)
+#define EXT4_IOC32_GROUP_ADD _IOW('f', 8, struct compat_ext4_new_group_input)
#ifdef CONFIG_JBD2_DEBUG
#define EXT4_IOC32_WAIT_FOR_READONLY _IOR('f', 99, int)
#endif
#define EXT4_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION
#define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION
+#endif
/*
@@ -616,9 +734,8 @@ struct ext4_ext_cache {
*/
struct ext4_inode_info {
__le32 i_data[15]; /* unconverted */
- __u32 i_flags;
- ext4_fsblk_t i_file_acl;
__u32 i_dtime;
+ ext4_fsblk_t i_file_acl;
/*
* i_block_group is the number of the block group which contains
@@ -629,6 +746,7 @@ struct ext4_inode_info {
*/
ext4_group_t i_block_group;
unsigned long i_state_flags; /* Dynamic state flags */
+ unsigned long i_flags;
ext4_lblk_t i_dir_start_lookup;
#ifdef CONFIG_EXT4_FS_XATTR
@@ -1062,22 +1180,25 @@ enum {
EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */
EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */
EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/
+ EXT4_STATE_NEWENTRY, /* File just added to dir */
};
-static inline int ext4_test_inode_state(struct inode *inode, int bit)
-{
- return test_bit(bit, &EXT4_I(inode)->i_state_flags);
-}
-
-static inline void ext4_set_inode_state(struct inode *inode, int bit)
-{
- set_bit(bit, &EXT4_I(inode)->i_state_flags);
+#define EXT4_INODE_BIT_FNS(name, field) \
+static inline int ext4_test_inode_##name(struct inode *inode, int bit) \
+{ \
+ return test_bit(bit, &EXT4_I(inode)->i_##field); \
+} \
+static inline void ext4_set_inode_##name(struct inode *inode, int bit) \
+{ \
+ set_bit(bit, &EXT4_I(inode)->i_##field); \
+} \
+static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \
+{ \
+ clear_bit(bit, &EXT4_I(inode)->i_##field); \
}
-static inline void ext4_clear_inode_state(struct inode *inode, int bit)
-{
- clear_bit(bit, &EXT4_I(inode)->i_state_flags);
-}
+EXT4_INODE_BIT_FNS(flag, flags)
+EXT4_INODE_BIT_FNS(state, state_flags)
#else
/* Assume that user mode programs are passing in an ext4fs superblock, not
* a kernel struct super_block. This will allow us to call the feature-test
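
EXT4_INODE_BIT_FNS is a small function-generator macro; the same pattern works anywhere several fields need identical accessors. A standalone sketch (plain bit operations, unlike the kernel's atomic set_bit/clear_bit/test_bit):

	#include <stdio.h>

	struct obj {
		unsigned long flags;
		unsigned long state;
	};

	/* Generate test/set/clear helpers for one bitfield member. */
	#define BIT_FNS(name, field)					\
	static int test_##name(struct obj *o, int bit)			\
	{ return (o->field >> bit) & 1; }				\
	static void set_##name(struct obj *o, int bit)			\
	{ o->field |= 1UL << bit; }					\
	static void clear_##name(struct obj *o, int bit)		\
	{ o->field &= ~(1UL << bit); }

	BIT_FNS(flag, flags)
	BIT_FNS(state, state)

	int main(void)
	{
		struct obj o = { 0, 0 };

		set_flag(&o, 3);
		printf("%d %d\n", test_flag(&o, 3), test_state(&o, 3)); /* 1 0 */
		return 0;
	}
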
@@ -1264,7 +1385,7 @@ struct ext4_dir_entry_2 {
#define is_dx(dir) (EXT4_HAS_COMPAT_FEATURE(dir->i_sb, \
EXT4_FEATURE_COMPAT_DIR_INDEX) && \
- (EXT4_I(dir)->i_flags & EXT4_INDEX_FL))
+ ext4_test_inode_flag((dir), EXT4_INODE_INDEX))
#define EXT4_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT4_LINK_MAX)
#define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1)
@@ -1398,7 +1519,7 @@ extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
extern void ext4_htree_free_dir_info(struct dir_private_info *p);
/* fsync.c */
-extern int ext4_sync_file(struct file *, struct dentry *, int);
+extern int ext4_sync_file(struct file *, int);
/* hash.c */
extern int ext4fs_dirhash(const char *name, int len, struct
@@ -1678,6 +1799,7 @@ struct ext4_group_info {
ext4_grpblk_t bb_first_free; /* first free block */
ext4_grpblk_t bb_free; /* total free blocks */
ext4_grpblk_t bb_fragments; /* nr of freespace fragments */
+ ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */
struct list_head bb_prealloc_list;
#ifdef DOUBLE_CHECK
void *bb_bitmap;
@@ -1772,9 +1894,8 @@ extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
int chunk);
-extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
- ext4_lblk_t iblock, unsigned int max_blocks,
- struct buffer_head *bh_result, int flags);
+extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
+ struct ext4_map_blocks *map, int flags);
extern void ext4_ext_truncate(struct inode *);
extern void ext4_ext_init(struct super_block *);
extern void ext4_ext_release(struct super_block *);
@@ -1782,6 +1903,8 @@ extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
loff_t len);
extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
ssize_t len);
+extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
+ struct ext4_map_blocks *map, int flags);
extern int ext4_get_blocks(handle_t *handle, struct inode *inode,
sector_t block, unsigned int max_blocks,
struct buffer_head *bh, int flags);
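These declarations introduce ext4_map_blocks(), which bundles the old (iblock, max_blocks, bh_result) triple into one struct. The struct definition itself is not shown in this hunk; the sketch below infers its shape from the call sites later in the diff (m_lblk/m_len in, m_pblk/m_len/m_flags out) — the field names match, everything else is a stand-in:

#include <stdio.h>
#include <stdint.h>

#define MAP_NEW		0x01	/* freshly allocated */
#define MAP_MAPPED	0x02	/* m_pblk is valid */

struct map_blocks {
	uint64_t m_pblk;	/* out: first physical block */
	uint32_t m_lblk;	/* in:  first logical block */
	uint32_t m_len;		/* in:  blocks wanted; out: blocks mapped */
	uint32_t m_flags;	/* out: MAP_* state bits */
};

/* Toy mapper: pretend logical block N lives at physical block N + 1000. */
static int toy_map_blocks(struct map_blocks *map)
{
	map->m_pblk = (uint64_t)map->m_lblk + 1000;
	map->m_flags = MAP_MAPPED;
	return map->m_len;	/* number of blocks mapped */
}

int main(void)
{
	struct map_blocks map = { .m_lblk = 42, .m_len = 8 };
	int ret = toy_map_blocks(&map);

	if (ret > 0 && (map.m_flags & MAP_MAPPED))
		printf("lblk %u -> pblk %llu, %d blocks\n", map.m_lblk,
		       (unsigned long long)map.m_pblk, ret);
	return 0;
}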
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index b79ad5126468..dade0c024797 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -273,7 +273,7 @@ static inline int ext4_should_journal_data(struct inode *inode)
return 1;
if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
return 1;
- if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL)
+ if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
return 1;
return 0;
}
@@ -284,7 +284,7 @@ static inline int ext4_should_order_data(struct inode *inode)
return 0;
if (!S_ISREG(inode->i_mode))
return 0;
- if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL)
+ if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
return 0;
if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
return 1;
@@ -297,7 +297,7 @@ static inline int ext4_should_writeback_data(struct inode *inode)
return 0;
if (EXT4_JOURNAL(inode) == NULL)
return 1;
- if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL)
+ if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
return 0;
if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
return 1;
@@ -321,7 +321,7 @@ static inline int ext4_should_dioread_nolock(struct inode *inode)
return 0;
if (!S_ISREG(inode->i_mode))
return 0;
- if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
return 0;
if (ext4_should_journal_data(inode))
return 0;
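Taken together, the ext4_should_*_data() helpers encode a fixed priority: non-regular files and inodes carrying the journal-data flag always journal, and only then does the mount-wide data= option choose between ordered and writeback. A condensed user-space sketch of that decision (the enum values are illustrative, not kernel constants):

#include <stdio.h>

enum data_mode { DATA_JOURNAL, DATA_ORDERED, DATA_WRITEBACK };

static enum data_mode pick_data_mode(int mount_mode, int inode_journal_flag,
				     int is_regular)
{
	if (!is_regular)	/* only regular files get ordered/writeback */
		return DATA_JOURNAL;
	if (inode_journal_flag)	/* per-inode flag overrides the mount option */
		return DATA_JOURNAL;
	return mount_mode;	/* otherwise follow data= */
}

int main(void)
{
	printf("%d\n", pick_data_mode(DATA_ORDERED, 1, 1));	/* 0: journal */
	printf("%d\n", pick_data_mode(DATA_WRITEBACK, 0, 1));	/* 2: writeback */
	return 0;
}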
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 236b834b4ca8..377309c1af65 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -107,11 +107,8 @@ static int ext4_ext_truncate_extend_restart(handle_t *handle,
if (err <= 0)
return err;
err = ext4_truncate_restart_trans(handle, inode, needed);
- /*
- * We have dropped i_data_sem so someone might have cached again
- * an extent we are going to truncate.
- */
- ext4_ext_invalidate_cache(inode);
+ if (err == 0)
+ err = -EAGAIN;
return err;
}
@@ -185,10 +182,10 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
/*
* If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME
- * block groups per flexgroup, reserve the first block
- * group for directories and special files. Regular
+ * block groups per flexgroup, reserve the first block
+ * group for directories and special files. Regular
* files will start at the second block group. This
- * tends to speed up directory access and improves
+ * tends to speed up directory access and improves
* fsck times.
*/
block_group &= ~(flex_size-1);
@@ -439,10 +436,10 @@ static int __ext4_ext_check(const char *function, struct inode *inode,
return 0;
corrupted:
- __ext4_error(inode->i_sb, function,
- "bad header/extent in inode #%lu: %s - magic %x, "
+ ext4_error_inode(function, inode,
+ "bad header/extent: %s - magic %x, "
"entries %u, max %u(%u), depth %u(%u)",
- inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic),
+ error_msg, le16_to_cpu(eh->eh_magic),
le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
max, le16_to_cpu(eh->eh_depth), depth);
@@ -1622,9 +1619,7 @@ int ext4_ext_try_to_merge(struct inode *inode,
merge_done = 1;
WARN_ON(eh->eh_entries == 0);
if (!eh->eh_entries)
- ext4_error(inode->i_sb,
- "inode#%lu, eh->eh_entries = 0!",
- inode->i_ino);
+ EXT4_ERROR_INODE(inode, "eh->eh_entries = 0!");
}
return merge_done;
@@ -2039,7 +2034,7 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
struct ext4_ext_cache *cex;
int ret = EXT4_EXT_CACHE_NO;
- /*
+ /*
* We borrow i_block_reservation_lock to protect i_cached_extent
*/
spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
@@ -2361,7 +2356,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
int depth = ext_depth(inode);
struct ext4_ext_path *path;
handle_t *handle;
- int i = 0, err = 0;
+ int i, err;
ext_debug("truncate since %u\n", start);
@@ -2370,23 +2365,26 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
if (IS_ERR(handle))
return PTR_ERR(handle);
+again:
ext4_ext_invalidate_cache(inode);
/*
* We start scanning from right side, freeing all the blocks
* after i_size and walking into the tree depth-wise.
*/
+ depth = ext_depth(inode);
path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_NOFS);
if (path == NULL) {
ext4_journal_stop(handle);
return -ENOMEM;
}
+ path[0].p_depth = depth;
path[0].p_hdr = ext_inode_hdr(inode);
if (ext4_ext_check(inode, path[0].p_hdr, depth)) {
err = -EIO;
goto out;
}
- path[0].p_depth = depth;
+ i = err = 0;
while (i >= 0 && err == 0) {
if (i == depth) {
@@ -2480,6 +2478,8 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
out:
ext4_ext_drop_refs(path);
kfree(path);
+ if (err == -EAGAIN)
+ goto again;
ext4_journal_stop(handle);
return err;
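Instead of re-invalidating the extent cache inside the restart helper, the truncate path now reports -EAGAIN and the caller jumps back to again:, re-reading the tree depth that may have changed while i_data_sem was dropped. The retry shape in miniature (user space; EAGAIN is the real errno constant, everything else is illustrative):

#include <stdio.h>
#include <errno.h>

static int attempts;

static int do_work(int *depth)
{
	if (++attempts < 3) {
		*depth += 1;	/* state changed while "unlocked" */
		return -EAGAIN;
	}
	return 0;
}

int main(void)
{
	int depth = 1;
	int err;

again:
	/* Re-read anything that may have changed before retrying. */
	err = do_work(&depth);
	if (err == -EAGAIN)
		goto again;
	printf("done after %d attempts, depth %d\n", attempts, depth);
	return err;
}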
@@ -2544,7 +2544,7 @@ static void bi_complete(struct bio *bio, int error)
/* FIXME!! we need to try to merge to left or right after zero-out */
static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
{
- int ret = -EIO;
+ int ret;
struct bio *bio;
int blkbits, blocksize;
sector_t ee_pblock;
@@ -2568,6 +2568,9 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
len = ee_len;
bio = bio_alloc(GFP_NOIO, len);
+ if (!bio)
+ return -ENOMEM;
+
bio->bi_sector = ee_pblock;
bio->bi_bdev = inode->i_sb->s_bdev;
@@ -2595,22 +2598,20 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
submit_bio(WRITE, bio);
wait_for_completion(&event);
- if (test_bit(BIO_UPTODATE, &bio->bi_flags))
- ret = 0;
- else {
- ret = -EIO;
- break;
+ if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
+ bio_put(bio);
+ return -EIO;
}
bio_put(bio);
ee_len -= done;
ee_pblock += done << (blkbits - 9);
}
- return ret;
+ return 0;
}
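The reworked zeroout loop checks bio_alloc() for failure and bails out with an immediate bio_put()/-EIO instead of threading a ret variable through the loop. The same early-return structure, sketched in user space with pwrite() standing in for bio submission (chunk size and file name are arbitrary):

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

#define CHUNK 4096

static int zero_range(int fd, off_t off, size_t len)
{
	static const char zbuf[CHUNK];	/* zero-filled */

	while (len) {
		size_t n = len < CHUNK ? len : CHUNK;
		ssize_t done = pwrite(fd, zbuf, n, off);

		if (done < 0)
			return -errno;	/* fail fast, no ret variable */
		off += done;
		len -= done;
	}
	return 0;
}

int main(void)
{
	int fd = open("zeroout.demo", O_CREAT | O_RDWR | O_TRUNC, 0644);

	if (fd < 0)
		return 1;
	printf("zero_range: %d\n", zero_range(fd, 1024, 10000));
	close(fd);
	return 0;
}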
#define EXT4_EXT_ZERO_LEN 7
/*
- * This function is called by ext4_ext_get_blocks() if someone tries to write
+ * This function is called by ext4_ext_map_blocks() if someone tries to write
* to an uninitialized extent. It may result in splitting the uninitialized
* extent into multiple extents (up to three - one initialized and two
* uninitialized).
@@ -2620,39 +2621,55 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
* c> Splits in three extents: Someone is writing in the middle of the extent
*/
static int ext4_ext_convert_to_initialized(handle_t *handle,
- struct inode *inode,
- struct ext4_ext_path *path,
- ext4_lblk_t iblock,
- unsigned int max_blocks)
+ struct inode *inode,
+ struct ext4_map_blocks *map,
+ struct ext4_ext_path *path)
{
struct ext4_extent *ex, newex, orig_ex;
struct ext4_extent *ex1 = NULL;
struct ext4_extent *ex2 = NULL;
struct ext4_extent *ex3 = NULL;
struct ext4_extent_header *eh;
- ext4_lblk_t ee_block;
+ ext4_lblk_t ee_block, eof_block;
unsigned int allocated, ee_len, depth;
ext4_fsblk_t newblock;
int err = 0;
int ret = 0;
+ int may_zeroout;
+
+ ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical"
+ "block %llu, max_blocks %u\n", inode->i_ino,
+ (unsigned long long)map->m_lblk, map->m_len);
+
+ eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
+ inode->i_sb->s_blocksize_bits;
+ if (eof_block < map->m_lblk + map->m_len)
+ eof_block = map->m_lblk + map->m_len;
depth = ext_depth(inode);
eh = path[depth].p_hdr;
ex = path[depth].p_ext;
ee_block = le32_to_cpu(ex->ee_block);
ee_len = ext4_ext_get_actual_len(ex);
- allocated = ee_len - (iblock - ee_block);
- newblock = iblock - ee_block + ext_pblock(ex);
+ allocated = ee_len - (map->m_lblk - ee_block);
+ newblock = map->m_lblk - ee_block + ext_pblock(ex);
+
ex2 = ex;
orig_ex.ee_block = ex->ee_block;
orig_ex.ee_len = cpu_to_le16(ee_len);
ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
+ /*
+ * It is safe to convert extent to initialized via explicit
+ * zeroout only if the extent is fully inside i_size or new_size.
+ */
+ may_zeroout = ee_block + ee_len <= eof_block;
+
err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
goto out;
/* If extent has less than 2*EXT4_EXT_ZERO_LEN zerout directly */
- if (ee_len <= 2*EXT4_EXT_ZERO_LEN) {
+ if (ee_len <= 2*EXT4_EXT_ZERO_LEN && may_zeroout) {
err = ext4_ext_zeroout(inode, &orig_ex);
if (err)
goto fix_extent_len;
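may_zeroout above is the new safety gate: eof_block is i_size rounded up to block units and extended to cover the current write, and an uninitialized extent may be zeroed out only if it lies entirely below that boundary — otherwise stale on-disk contents past EOF could become readable. The arithmetic on plain integers (blkbits = 12 models a 4K block size; all numbers are made up):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	unsigned blkbits = 12;
	uint64_t blocksize = 1ULL << blkbits;
	uint64_t i_size = 10000;		/* bytes */
	uint32_t m_lblk = 2, m_len = 4;		/* write range, in blocks */
	uint32_t ee_block = 1, ee_len = 6;	/* extent being converted */

	/* First block fully beyond i_size, rounded up... */
	uint64_t eof_block = (i_size + blocksize - 1) >> blkbits;
	/* ...but never below the end of the write itself. */
	if (eof_block < (uint64_t)m_lblk + m_len)
		eof_block = (uint64_t)m_lblk + m_len;

	/* Zeroout is safe only if the extent sits entirely below eof. */
	int may_zeroout = (uint64_t)ee_block + ee_len <= eof_block;

	printf("eof_block=%llu may_zeroout=%d\n",
	       (unsigned long long)eof_block, may_zeroout);
	return 0;
}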
@@ -2665,10 +2682,10 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
return allocated;
}
- /* ex1: ee_block to iblock - 1 : uninitialized */
- if (iblock > ee_block) {
+ /* ex1: ee_block to map->m_lblk - 1 : uninitialized */
+ if (map->m_lblk > ee_block) {
ex1 = ex;
- ex1->ee_len = cpu_to_le16(iblock - ee_block);
+ ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
ext4_ext_mark_uninitialized(ex1);
ex2 = &newex;
}
@@ -2677,15 +2694,15 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
* we insert ex3, if ex1 is NULL. This is to avoid temporary
* overlap of blocks.
*/
- if (!ex1 && allocated > max_blocks)
- ex2->ee_len = cpu_to_le16(max_blocks);
+ if (!ex1 && allocated > map->m_len)
+ ex2->ee_len = cpu_to_le16(map->m_len);
/* ex3: to ee_block + ee_len : uninitialised */
- if (allocated > max_blocks) {
+ if (allocated > map->m_len) {
unsigned int newdepth;
/* If extent has less than EXT4_EXT_ZERO_LEN zerout directly */
- if (allocated <= EXT4_EXT_ZERO_LEN) {
+ if (allocated <= EXT4_EXT_ZERO_LEN && may_zeroout) {
/*
- * iblock == ee_block is handled by the zerouout
+ * map->m_lblk == ee_block is handled by the zerouout
* at the beginning.
* Mark first half uninitialized.
* Mark second half initialized and zero out the
@@ -2698,7 +2715,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
ext4_ext_dirty(handle, inode, path + depth);
ex3 = &newex;
- ex3->ee_block = cpu_to_le32(iblock);
+ ex3->ee_block = cpu_to_le32(map->m_lblk);
ext4_ext_store_pblock(ex3, newblock);
ex3->ee_len = cpu_to_le16(allocated);
err = ext4_ext_insert_extent(handle, inode, path,
@@ -2711,7 +2728,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
ex->ee_len = orig_ex.ee_len;
ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
ext4_ext_dirty(handle, inode, path + depth);
- /* blocks available from iblock */
+ /* blocks available from map->m_lblk */
return allocated;
} else if (err)
@@ -2733,8 +2750,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
*/
depth = ext_depth(inode);
ext4_ext_drop_refs(path);
- path = ext4_ext_find_extent(inode,
- iblock, path);
+ path = ext4_ext_find_extent(inode, map->m_lblk,
+ path);
if (IS_ERR(path)) {
err = PTR_ERR(path);
return err;
@@ -2754,12 +2771,12 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
return allocated;
}
ex3 = &newex;
- ex3->ee_block = cpu_to_le32(iblock + max_blocks);
- ext4_ext_store_pblock(ex3, newblock + max_blocks);
- ex3->ee_len = cpu_to_le16(allocated - max_blocks);
+ ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len);
+ ext4_ext_store_pblock(ex3, newblock + map->m_len);
+ ex3->ee_len = cpu_to_le16(allocated - map->m_len);
ext4_ext_mark_uninitialized(ex3);
err = ext4_ext_insert_extent(handle, inode, path, ex3, 0);
- if (err == -ENOSPC) {
+ if (err == -ENOSPC && may_zeroout) {
err = ext4_ext_zeroout(inode, &orig_ex);
if (err)
goto fix_extent_len;
@@ -2769,7 +2786,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
ext4_ext_dirty(handle, inode, path + depth);
/* zeroed the full extent */
- /* blocks available from iblock */
+ /* blocks available from map->m_lblk */
return allocated;
} else if (err)
@@ -2783,11 +2800,13 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
* update the extent length after successful insert of the
* split extent
*/
- orig_ex.ee_len = cpu_to_le16(ee_len -
- ext4_ext_get_actual_len(ex3));
+ ee_len -= ext4_ext_get_actual_len(ex3);
+ orig_ex.ee_len = cpu_to_le16(ee_len);
+ may_zeroout = ee_block + ee_len <= eof_block;
+
depth = newdepth;
ext4_ext_drop_refs(path);
- path = ext4_ext_find_extent(inode, iblock, path);
+ path = ext4_ext_find_extent(inode, map->m_lblk, path);
if (IS_ERR(path)) {
err = PTR_ERR(path);
goto out;
@@ -2801,14 +2820,14 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
if (err)
goto out;
- allocated = max_blocks;
+ allocated = map->m_len;
/* If extent has less than EXT4_EXT_ZERO_LEN and we are trying
* to insert an extent in the middle, zeroout directly,
* otherwise give the extent a chance to merge to the left
*/
if (le16_to_cpu(orig_ex.ee_len) <= EXT4_EXT_ZERO_LEN &&
- iblock != ee_block) {
+ map->m_lblk != ee_block && may_zeroout) {
err = ext4_ext_zeroout(inode, &orig_ex);
if (err)
goto fix_extent_len;
@@ -2818,7 +2837,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
ext4_ext_dirty(handle, inode, path + depth);
/* zero out the first half */
- /* blocks available from iblock */
+ /* blocks available from map->m_lblk */
return allocated;
}
}
@@ -2829,12 +2848,12 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
*/
if (ex1 && ex1 != ex) {
ex1 = ex;
- ex1->ee_len = cpu_to_le16(iblock - ee_block);
+ ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
ext4_ext_mark_uninitialized(ex1);
ex2 = &newex;
}
- /* ex2: iblock to iblock + maxblocks-1 : initialised */
- ex2->ee_block = cpu_to_le32(iblock);
+ /* ex2: map->m_lblk to map->m_lblk + maxblocks-1 : initialised */
+ ex2->ee_block = cpu_to_le32(map->m_lblk);
ext4_ext_store_pblock(ex2, newblock);
ex2->ee_len = cpu_to_le16(allocated);
if (ex2 != ex)
@@ -2877,7 +2896,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
goto out;
insert:
err = ext4_ext_insert_extent(handle, inode, path, &newex, 0);
- if (err == -ENOSPC) {
+ if (err == -ENOSPC && may_zeroout) {
err = ext4_ext_zeroout(inode, &orig_ex);
if (err)
goto fix_extent_len;
@@ -2904,7 +2923,7 @@ fix_extent_len:
}
/*
- * This function is called by ext4_ext_get_blocks() from
+ * This function is called by ext4_ext_map_blocks() from
* ext4_get_blocks_dio_write() when DIO is used to write
* to an uninitialized extent.
*
@@ -2927,9 +2946,8 @@ fix_extent_len:
*/
static int ext4_split_unwritten_extents(handle_t *handle,
struct inode *inode,
+ struct ext4_map_blocks *map,
struct ext4_ext_path *path,
- ext4_lblk_t iblock,
- unsigned int max_blocks,
int flags)
{
struct ext4_extent *ex, newex, orig_ex;
@@ -2937,41 +2955,55 @@ static int ext4_split_unwritten_extents(handle_t *handle,
struct ext4_extent *ex2 = NULL;
struct ext4_extent *ex3 = NULL;
struct ext4_extent_header *eh;
- ext4_lblk_t ee_block;
+ ext4_lblk_t ee_block, eof_block;
unsigned int allocated, ee_len, depth;
ext4_fsblk_t newblock;
int err = 0;
+ int may_zeroout;
+
+ ext_debug("ext4_split_unwritten_extents: inode %lu, logical"
+ "block %llu, max_blocks %u\n", inode->i_ino,
+ (unsigned long long)map->m_lblk, map->m_len);
+
+ eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
+ inode->i_sb->s_blocksize_bits;
+ if (eof_block < map->m_lblk + map->m_len)
+ eof_block = map->m_lblk + map->m_len;
- ext_debug("ext4_split_unwritten_extents: inode %lu,"
- "iblock %llu, max_blocks %u\n", inode->i_ino,
- (unsigned long long)iblock, max_blocks);
depth = ext_depth(inode);
eh = path[depth].p_hdr;
ex = path[depth].p_ext;
ee_block = le32_to_cpu(ex->ee_block);
ee_len = ext4_ext_get_actual_len(ex);
- allocated = ee_len - (iblock - ee_block);
- newblock = iblock - ee_block + ext_pblock(ex);
+ allocated = ee_len - (map->m_lblk - ee_block);
+ newblock = map->m_lblk - ee_block + ext_pblock(ex);
+
ex2 = ex;
orig_ex.ee_block = ex->ee_block;
orig_ex.ee_len = cpu_to_le16(ee_len);
ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
/*
+ * It is safe to convert extent to initialized via explicit
+ * zeroout only if the extent is fully inside i_size or new_size.
+ */
+ may_zeroout = ee_block + ee_len <= eof_block;
+
+ /*
* If the uninitialized extent begins at the same logical
* block where the write begins, and the write completely
* covers the extent, then we don't need to split it.
*/
- if ((iblock == ee_block) && (allocated <= max_blocks))
+ if ((map->m_lblk == ee_block) && (allocated <= map->m_len))
return allocated;
err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
goto out;
- /* ex1: ee_block to iblock - 1 : uninitialized */
- if (iblock > ee_block) {
+ /* ex1: ee_block to map->m_lblk - 1 : uninitialized */
+ if (map->m_lblk > ee_block) {
ex1 = ex;
- ex1->ee_len = cpu_to_le16(iblock - ee_block);
+ ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
ext4_ext_mark_uninitialized(ex1);
ex2 = &newex;
}
@@ -2980,18 +3012,18 @@ static int ext4_split_unwritten_extents(handle_t *handle,
* we insert ex3, if ex1 is NULL. This is to avoid temporary
* overlap of blocks.
*/
- if (!ex1 && allocated > max_blocks)
- ex2->ee_len = cpu_to_le16(max_blocks);
+ if (!ex1 && allocated > map->m_len)
+ ex2->ee_len = cpu_to_le16(map->m_len);
/* ex3: to ee_block + ee_len : uninitialised */
- if (allocated > max_blocks) {
+ if (allocated > map->m_len) {
unsigned int newdepth;
ex3 = &newex;
- ex3->ee_block = cpu_to_le32(iblock + max_blocks);
- ext4_ext_store_pblock(ex3, newblock + max_blocks);
- ex3->ee_len = cpu_to_le16(allocated - max_blocks);
+ ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len);
+ ext4_ext_store_pblock(ex3, newblock + map->m_len);
+ ex3->ee_len = cpu_to_le16(allocated - map->m_len);
ext4_ext_mark_uninitialized(ex3);
err = ext4_ext_insert_extent(handle, inode, path, ex3, flags);
- if (err == -ENOSPC) {
+ if (err == -ENOSPC && may_zeroout) {
err = ext4_ext_zeroout(inode, &orig_ex);
if (err)
goto fix_extent_len;
@@ -3001,7 +3033,7 @@ static int ext4_split_unwritten_extents(handle_t *handle,
ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
ext4_ext_dirty(handle, inode, path + depth);
/* zeroed the full extent */
- /* blocks available from iblock */
+ /* blocks available from map->m_lblk */
return allocated;
} else if (err)
@@ -3015,11 +3047,13 @@ static int ext4_split_unwritten_extents(handle_t *handle,
* update the extent length after successful insert of the
* split extent
*/
- orig_ex.ee_len = cpu_to_le16(ee_len -
- ext4_ext_get_actual_len(ex3));
+ ee_len -= ext4_ext_get_actual_len(ex3);
+ orig_ex.ee_len = cpu_to_le16(ee_len);
+ may_zeroout = ee_block + ee_len <= eof_block;
+
depth = newdepth;
ext4_ext_drop_refs(path);
- path = ext4_ext_find_extent(inode, iblock, path);
+ path = ext4_ext_find_extent(inode, map->m_lblk, path);
if (IS_ERR(path)) {
err = PTR_ERR(path);
goto out;
@@ -3033,7 +3067,7 @@ static int ext4_split_unwritten_extents(handle_t *handle,
if (err)
goto out;
- allocated = max_blocks;
+ allocated = map->m_len;
}
/*
* If there was a change of depth as part of the
@@ -3042,15 +3076,15 @@ static int ext4_split_unwritten_extents(handle_t *handle,
*/
if (ex1 && ex1 != ex) {
ex1 = ex;
- ex1->ee_len = cpu_to_le16(iblock - ee_block);
+ ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
ext4_ext_mark_uninitialized(ex1);
ex2 = &newex;
}
/*
- * ex2: iblock to iblock + maxblocks-1 : to be direct IO written,
- * uninitialised still.
+ * ex2: map->m_lblk to map->m_lblk + map->m_len-1 : to be written
+ * using direct I/O, uninitialised still.
*/
- ex2->ee_block = cpu_to_le32(iblock);
+ ex2->ee_block = cpu_to_le32(map->m_lblk);
ext4_ext_store_pblock(ex2, newblock);
ex2->ee_len = cpu_to_le16(allocated);
ext4_ext_mark_uninitialized(ex2);
@@ -3062,7 +3096,7 @@ static int ext4_split_unwritten_extents(handle_t *handle,
goto out;
insert:
err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
- if (err == -ENOSPC) {
+ if (err == -ENOSPC && may_zeroout) {
err = ext4_ext_zeroout(inode, &orig_ex);
if (err)
goto fix_extent_len;
@@ -3152,10 +3186,9 @@ static void unmap_underlying_metadata_blocks(struct block_device *bdev,
static int
ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
- ext4_lblk_t iblock, unsigned int max_blocks,
+ struct ext4_map_blocks *map,
struct ext4_ext_path *path, int flags,
- unsigned int allocated, struct buffer_head *bh_result,
- ext4_fsblk_t newblock)
+ unsigned int allocated, ext4_fsblk_t newblock)
{
int ret = 0;
int err = 0;
@@ -3163,15 +3196,14 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical"
"block %llu, max_blocks %u, flags %d, allocated %u",
- inode->i_ino, (unsigned long long)iblock, max_blocks,
+ inode->i_ino, (unsigned long long)map->m_lblk, map->m_len,
flags, allocated);
ext4_ext_show_leaf(inode, path);
/* get_block() before submit the IO, split the extent */
if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
- ret = ext4_split_unwritten_extents(handle,
- inode, path, iblock,
- max_blocks, flags);
+ ret = ext4_split_unwritten_extents(handle, inode, map,
+ path, flags);
/*
* Flag the inode (non-AIO case) or the end_io struct (AIO case)
* to indicate that this IO needs conversion to written when IO is
@@ -3182,7 +3214,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
else
ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
if (ext4_should_dioread_nolock(inode))
- set_buffer_uninit(bh_result);
+ map->m_flags |= EXT4_MAP_UNINIT;
goto out;
}
/* IO end_io complete, convert the filled extent to written */
@@ -3210,14 +3242,12 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
* the buffer head will be unmapped so that
* a read from the block returns 0s.
*/
- set_buffer_unwritten(bh_result);
+ map->m_flags |= EXT4_MAP_UNWRITTEN;
goto out1;
}
/* buffered write, writepage time, convert*/
- ret = ext4_ext_convert_to_initialized(handle, inode,
- path, iblock,
- max_blocks);
+ ret = ext4_ext_convert_to_initialized(handle, inode, map, path);
if (ret >= 0)
ext4_update_inode_fsync_trans(handle, inode, 1);
out:
@@ -3226,7 +3256,7 @@ out:
goto out2;
} else
allocated = ret;
- set_buffer_new(bh_result);
+ map->m_flags |= EXT4_MAP_NEW;
/*
* if we allocated more blocks than requested
* we need to make sure we unmap the extra block
@@ -3234,11 +3264,11 @@ out:
* unmapped later when we find the buffer_head marked
* new.
*/
- if (allocated > max_blocks) {
+ if (allocated > map->m_len) {
unmap_underlying_metadata_blocks(inode->i_sb->s_bdev,
- newblock + max_blocks,
- allocated - max_blocks);
- allocated = max_blocks;
+ newblock + map->m_len,
+ allocated - map->m_len);
+ allocated = map->m_len;
}
/*
@@ -3252,13 +3282,13 @@ out:
ext4_da_update_reserve_space(inode, allocated, 0);
map_out:
- set_buffer_mapped(bh_result);
+ map->m_flags |= EXT4_MAP_MAPPED;
out1:
- if (allocated > max_blocks)
- allocated = max_blocks;
+ if (allocated > map->m_len)
+ allocated = map->m_len;
ext4_ext_show_leaf(inode, path);
- bh_result->b_bdev = inode->i_sb->s_bdev;
- bh_result->b_blocknr = newblock;
+ map->m_pblk = newblock;
+ map->m_len = allocated;
out2:
if (path) {
ext4_ext_drop_refs(path);
@@ -3284,26 +3314,23 @@ out2:
*
* return < 0, error case.
*/
-int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
- ext4_lblk_t iblock,
- unsigned int max_blocks, struct buffer_head *bh_result,
- int flags)
+int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
+ struct ext4_map_blocks *map, int flags)
{
struct ext4_ext_path *path = NULL;
struct ext4_extent_header *eh;
struct ext4_extent newex, *ex, *last_ex;
ext4_fsblk_t newblock;
- int err = 0, depth, ret, cache_type;
+ int i, err = 0, depth, ret, cache_type;
unsigned int allocated = 0;
struct ext4_allocation_request ar;
ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
- __clear_bit(BH_New, &bh_result->b_state);
ext_debug("blocks %u/%u requested for inode %lu\n",
- iblock, max_blocks, inode->i_ino);
+ map->m_lblk, map->m_len, inode->i_ino);
/* check in cache */
- cache_type = ext4_ext_in_cache(inode, iblock, &newex);
+ cache_type = ext4_ext_in_cache(inode, map->m_lblk, &newex);
if (cache_type) {
if (cache_type == EXT4_EXT_CACHE_GAP) {
if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
@@ -3316,12 +3343,12 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
/* we should allocate requested block */
} else if (cache_type == EXT4_EXT_CACHE_EXTENT) {
/* block is already allocated */
- newblock = iblock
+ newblock = map->m_lblk
- le32_to_cpu(newex.ee_block)
+ ext_pblock(&newex);
/* number of remaining blocks in the extent */
allocated = ext4_ext_get_actual_len(&newex) -
- (iblock - le32_to_cpu(newex.ee_block));
+ (map->m_lblk - le32_to_cpu(newex.ee_block));
goto out;
} else {
BUG();
@@ -3329,7 +3356,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
}
/* find extent for this block */
- path = ext4_ext_find_extent(inode, iblock, NULL);
+ path = ext4_ext_find_extent(inode, map->m_lblk, NULL);
if (IS_ERR(path)) {
err = PTR_ERR(path);
path = NULL;
@@ -3345,8 +3372,9 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
*/
if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
EXT4_ERROR_INODE(inode, "bad extent address "
- "iblock: %d, depth: %d pblock %lld",
- iblock, depth, path[depth].p_block);
+ "lblock: %lu, depth: %d pblock %lld",
+ (unsigned long) map->m_lblk, depth,
+ path[depth].p_block);
err = -EIO;
goto out2;
}
@@ -3364,12 +3392,12 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
*/
ee_len = ext4_ext_get_actual_len(ex);
/* if found extent covers block, simply return it */
- if (in_range(iblock, ee_block, ee_len)) {
- newblock = iblock - ee_block + ee_start;
+ if (in_range(map->m_lblk, ee_block, ee_len)) {
+ newblock = map->m_lblk - ee_block + ee_start;
/* number of remaining blocks in the extent */
- allocated = ee_len - (iblock - ee_block);
- ext_debug("%u fit into %u:%d -> %llu\n", iblock,
- ee_block, ee_len, newblock);
+ allocated = ee_len - (map->m_lblk - ee_block);
+ ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
+ ee_block, ee_len, newblock);
/* Do not put uninitialized extent in the cache */
if (!ext4_ext_is_uninitialized(ex)) {
@@ -3379,8 +3407,8 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
goto out;
}
ret = ext4_ext_handle_uninitialized_extents(handle,
- inode, iblock, max_blocks, path,
- flags, allocated, bh_result, newblock);
+ inode, map, path, flags, allocated,
+ newblock);
return ret;
}
}
@@ -3394,7 +3422,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
* put just found gap into cache to speed up
* subsequent requests
*/
- ext4_ext_put_gap_in_cache(inode, path, iblock);
+ ext4_ext_put_gap_in_cache(inode, path, map->m_lblk);
goto out2;
}
/*
@@ -3402,11 +3430,11 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
*/
/* find neighbour allocated blocks */
- ar.lleft = iblock;
+ ar.lleft = map->m_lblk;
err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft);
if (err)
goto out2;
- ar.lright = iblock;
+ ar.lright = map->m_lblk;
err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright);
if (err)
goto out2;
@@ -3417,26 +3445,26 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
* EXT_INIT_MAX_LEN and for an uninitialized extent this limit is
* EXT_UNINIT_MAX_LEN.
*/
- if (max_blocks > EXT_INIT_MAX_LEN &&
+ if (map->m_len > EXT_INIT_MAX_LEN &&
!(flags & EXT4_GET_BLOCKS_UNINIT_EXT))
- max_blocks = EXT_INIT_MAX_LEN;
- else if (max_blocks > EXT_UNINIT_MAX_LEN &&
+ map->m_len = EXT_INIT_MAX_LEN;
+ else if (map->m_len > EXT_UNINIT_MAX_LEN &&
(flags & EXT4_GET_BLOCKS_UNINIT_EXT))
- max_blocks = EXT_UNINIT_MAX_LEN;
+ map->m_len = EXT_UNINIT_MAX_LEN;
- /* Check if we can really insert (iblock)::(iblock+max_blocks) extent */
- newex.ee_block = cpu_to_le32(iblock);
- newex.ee_len = cpu_to_le16(max_blocks);
+ /* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */
+ newex.ee_block = cpu_to_le32(map->m_lblk);
+ newex.ee_len = cpu_to_le16(map->m_len);
err = ext4_ext_check_overlap(inode, &newex, path);
if (err)
allocated = ext4_ext_get_actual_len(&newex);
else
- allocated = max_blocks;
+ allocated = map->m_len;
/* allocate new block */
ar.inode = inode;
- ar.goal = ext4_ext_find_goal(inode, path, iblock);
- ar.logical = iblock;
+ ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk);
+ ar.logical = map->m_lblk;
ar.len = allocated;
if (S_ISREG(inode->i_mode))
ar.flags = EXT4_MB_HINT_DATA;
@@ -3470,21 +3498,33 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
EXT4_STATE_DIO_UNWRITTEN);
}
if (ext4_should_dioread_nolock(inode))
- set_buffer_uninit(bh_result);
+ map->m_flags |= EXT4_MAP_UNINIT;
}
- if (unlikely(EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL)) {
+ if (unlikely(ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))) {
if (unlikely(!eh->eh_entries)) {
EXT4_ERROR_INODE(inode,
- "eh->eh_entries == 0 ee_block %d",
- ex->ee_block);
+ "eh->eh_entries == 0 and "
+ "EOFBLOCKS_FL set");
err = -EIO;
goto out2;
}
last_ex = EXT_LAST_EXTENT(eh);
- if (iblock + ar.len > le32_to_cpu(last_ex->ee_block)
- + ext4_ext_get_actual_len(last_ex))
- EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL;
+ /*
+ * If the current leaf block was reached by looking at
+ * the last index block all the way down the tree, and
+ * we are extending the inode beyond the last extent
+ * in the current leaf block, then clear the
+ * EOFBLOCKS_FL flag.
+ */
+ for (i = depth-1; i >= 0; i--) {
+ if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
+ break;
+ }
+ if ((i < 0) &&
+ (map->m_lblk + ar.len > le32_to_cpu(last_ex->ee_block) +
+ ext4_ext_get_actual_len(last_ex)))
+ ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
}
err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
if (err) {
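The new loop only clears EOFBLOCKS when the leaf was reached through the last index at every interior level — i.e. it really is the rightmost leaf — and the write extends past the last extent in it. The rightmost-path test in isolation, with the extent-tree path reduced to parallel index arrays (purely illustrative):

#include <stdio.h>

static int on_rightmost_path(const int *idx, const int *last, int depth)
{
	int i;

	/* Walk from just above the leaf toward the root. */
	for (i = depth - 1; i >= 0; i--) {
		if (idx[i] != last[i])
			break;	/* some level has siblings to the right */
	}
	return i < 0;		/* fell off the top: rightmost everywhere */
}

int main(void)
{
	int idx[]  = { 4, 7 };	/* chosen index at each of 2 levels */
	int last[] = { 4, 7 };	/* last valid index at each level */

	printf("rightmost=%d\n", on_rightmost_path(idx, last, 2));
	idx[1] = 3;		/* leaf reached via a non-final index */
	printf("rightmost=%d\n", on_rightmost_path(idx, last, 2));
	return 0;
}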
@@ -3500,9 +3540,9 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
/* previous routine could use block we allocated */
newblock = ext_pblock(&newex);
allocated = ext4_ext_get_actual_len(&newex);
- if (allocated > max_blocks)
- allocated = max_blocks;
- set_buffer_new(bh_result);
+ if (allocated > map->m_len)
+ allocated = map->m_len;
+ map->m_flags |= EXT4_MAP_NEW;
/*
* Update reserved blocks/metadata blocks after successful
@@ -3516,18 +3556,18 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
* when it is _not_ an uninitialized extent.
*/
if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) {
- ext4_ext_put_in_cache(inode, iblock, allocated, newblock,
+ ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock,
EXT4_EXT_CACHE_EXTENT);
ext4_update_inode_fsync_trans(handle, inode, 1);
} else
ext4_update_inode_fsync_trans(handle, inode, 0);
out:
- if (allocated > max_blocks)
- allocated = max_blocks;
+ if (allocated > map->m_len)
+ allocated = map->m_len;
ext4_ext_show_leaf(inode, path);
- set_buffer_mapped(bh_result);
- bh_result->b_bdev = inode->i_sb->s_bdev;
- bh_result->b_blocknr = newblock;
+ map->m_flags |= EXT4_MAP_MAPPED;
+ map->m_pblk = newblock;
+ map->m_len = allocated;
out2:
if (path) {
ext4_ext_drop_refs(path);
@@ -3625,7 +3665,7 @@ static void ext4_falloc_update_inode(struct inode *inode,
* can proceed even if the new size is the same as i_size.
*/
if (new_size > i_size_read(inode))
- EXT4_I(inode)->i_flags |= EXT4_EOFBLOCKS_FL;
+ ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
}
}
@@ -3640,55 +3680,57 @@ static void ext4_falloc_update_inode(struct inode *inode,
long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
{
handle_t *handle;
- ext4_lblk_t block;
loff_t new_size;
unsigned int max_blocks;
int ret = 0;
int ret2 = 0;
int retries = 0;
- struct buffer_head map_bh;
+ struct ext4_map_blocks map;
unsigned int credits, blkbits = inode->i_blkbits;
/*
* currently supporting (pre)allocate mode for extent-based
* files _only_
*/
- if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
return -EOPNOTSUPP;
/* preallocation to directories is currently not supported */
if (S_ISDIR(inode->i_mode))
return -ENODEV;
- block = offset >> blkbits;
+ map.m_lblk = offset >> blkbits;
/*
* We can't just convert len to max_blocks because a request with,
* say, blocksize = 4096, offset = 3072 and len = 2048 spans two blocks
*/
max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
- - block;
+ - map.m_lblk;
/*
* credits to insert 1 extent into extent tree
*/
credits = ext4_chunk_trans_blocks(inode, max_blocks);
mutex_lock(&inode->i_mutex);
+ ret = inode_newsize_ok(inode, (len + offset));
+ if (ret) {
+ mutex_unlock(&inode->i_mutex);
+ return ret;
+ }
retry:
while (ret >= 0 && ret < max_blocks) {
- block = block + ret;
- max_blocks = max_blocks - ret;
+ map.m_lblk = map.m_lblk + ret;
+ map.m_len = max_blocks = max_blocks - ret;
handle = ext4_journal_start(inode, credits);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
break;
}
- map_bh.b_state = 0;
- ret = ext4_get_blocks(handle, inode, block,
- max_blocks, &map_bh,
+ ret = ext4_map_blocks(handle, inode, &map,
EXT4_GET_BLOCKS_CREATE_UNINIT_EXT);
if (ret <= 0) {
#ifdef EXT4FS_DEBUG
WARN_ON(ret <= 0);
- printk(KERN_ERR "%s: ext4_ext_get_blocks "
+ printk(KERN_ERR "%s: ext4_ext_map_blocks "
"returned error inode#%lu, block=%u, "
"max_blocks=%u", __func__,
inode->i_ino, block, max_blocks);
@@ -3697,14 +3739,14 @@ retry:
ret2 = ext4_journal_stop(handle);
break;
}
- if ((block + ret) >= (EXT4_BLOCK_ALIGN(offset + len,
+ if ((map.m_lblk + ret) >= (EXT4_BLOCK_ALIGN(offset + len,
blkbits) >> blkbits))
new_size = offset + len;
else
- new_size = (block + ret) << blkbits;
+ new_size = (map.m_lblk + ret) << blkbits;
ext4_falloc_update_inode(inode, mode, new_size,
- buffer_new(&map_bh));
+ (map.m_flags & EXT4_MAP_NEW));
ext4_mark_inode_dirty(handle, inode);
ret2 = ext4_journal_stop(handle);
if (ret2)
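Each pass of the fallocate loop advances map.m_lblk by the number of blocks the previous call actually mapped and shrinks max_blocks to match, so partial progress is never redone. The bookkeeping in a user-space sketch (alloc_some() is a made-up stand-in for ext4_map_blocks() that maps at most 3 blocks per call):

#include <stdio.h>

static int alloc_some(unsigned lblk, unsigned len)
{
	unsigned n = len < 3 ? len : 3;

	printf("mapped %u blocks at %u\n", n, lblk);
	return (int)n;		/* <= 0 would mean error / no progress */
}

int main(void)
{
	unsigned m_lblk = 100;
	unsigned max_blocks = 8;
	int ret = 0;

	while (ret >= 0 && ret < (int)max_blocks) {
		m_lblk += ret;		/* advance past what we got */
		max_blocks -= ret;	/* shrink the remaining request */
		ret = alloc_some(m_lblk, max_blocks);
		if (ret <= 0)
			break;
	}
	return ret < 0 ? 1 : 0;
}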
@@ -3733,42 +3775,39 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
ssize_t len)
{
handle_t *handle;
- ext4_lblk_t block;
unsigned int max_blocks;
int ret = 0;
int ret2 = 0;
- struct buffer_head map_bh;
+ struct ext4_map_blocks map;
unsigned int credits, blkbits = inode->i_blkbits;
- block = offset >> blkbits;
+ map.m_lblk = offset >> blkbits;
/*
* We can't just convert len to max_blocks because a request with,
* say, blocksize = 4096, offset = 3072 and len = 2048 spans two blocks
*/
- max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
- - block;
+ max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) -
+ map.m_lblk);
/*
* credits to insert 1 extent into extent tree
*/
credits = ext4_chunk_trans_blocks(inode, max_blocks);
while (ret >= 0 && ret < max_blocks) {
- block = block + ret;
- max_blocks = max_blocks - ret;
+ map.m_lblk += ret;
+ map.m_len = (max_blocks -= ret);
handle = ext4_journal_start(inode, credits);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
break;
}
- map_bh.b_state = 0;
- ret = ext4_get_blocks(handle, inode, block,
- max_blocks, &map_bh,
+ ret = ext4_map_blocks(handle, inode, &map,
EXT4_GET_BLOCKS_IO_CONVERT_EXT);
if (ret <= 0) {
WARN_ON(ret <= 0);
- printk(KERN_ERR "%s: ext4_ext_get_blocks "
+ printk(KERN_ERR "%s: ext4_ext_map_blocks "
"returned error inode#%lu, block=%u, "
"max_blocks=%u", __func__,
- inode->i_ino, block, max_blocks);
+ inode->i_ino, map.m_lblk, map.m_len);
}
ext4_mark_inode_dirty(handle, inode);
ret2 = ext4_journal_stop(handle);
@@ -3898,7 +3937,7 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
int error = 0;
/* fallback to generic here if not in extents fmt */
- if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
return generic_block_fiemap(inode, fieinfo, start, len,
ext4_get_block);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index d0776e410f34..5313ae4cda2d 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -66,7 +66,7 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
* is smaller than s_maxbytes, which is for extent-mapped files.
*/
- if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) {
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
size_t length = iov_length(iov, nr_segs);
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index ef3d980e67cb..592adf2e546e 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -35,6 +35,29 @@
#include <trace/events/ext4.h>
/*
+ * If we're not journaling and this is a just-created file, we have to
+ * sync our parent directory (if it was freshly created) since
+ * otherwise it will only be written by writeback, leaving a huge
+ * window during which a crash may lose the file. This may apply for
+ * the parent directory's parent as well, and so on recursively, if
+ * they are also freshly created.
+ */
+static void ext4_sync_parent(struct inode *inode)
+{
+ struct dentry *dentry = NULL;
+
+ while (inode && ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) {
+ ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY);
+ dentry = list_entry(inode->i_dentry.next,
+ struct dentry, d_alias);
+ if (!dentry || !dentry->d_parent || !dentry->d_parent->d_inode)
+ break;
+ inode = dentry->d_parent->d_inode;
+ sync_mapping_buffers(inode->i_mapping);
+ }
+}
+
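ext4_sync_parent() climbs from the inode through freshly created parent directories, flushing each one so a crash cannot orphan the new file. The same idea from the application side — fsync each ancestor directory after creating a deep path — sketched for glibc, where dirname() truncates its argument in place (path and error handling are illustrative):

#include <fcntl.h>
#include <libgen.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static void sync_parents(const char *path)
{
	char buf[4096];
	char *dir = buf;

	snprintf(buf, sizeof(buf), "%s", path);
	for (;;) {
		dir = dirname(dir);	/* step one level up (glibc: in place) */
		int fd = open(dir, O_RDONLY | O_DIRECTORY);

		if (fd >= 0) {
			fsync(fd);	/* flush the new directory entry */
			close(fd);
		}
		if (!strcmp(dir, "/") || !strcmp(dir, "."))
			break;
	}
}

int main(void)
{
	sync_parents("/tmp/a/b/c/file");
	return 0;
}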
+/*
* akpm: A new design for ext4_sync_file().
*
* This is only called from sys_fsync(), sys_fdatasync() and sys_msync().
@@ -48,9 +71,9 @@
* i_mutex lock is held when entering and exiting this function
*/
-int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
+int ext4_sync_file(struct file *file, int datasync)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = file->f_mapping->host;
struct ext4_inode_info *ei = EXT4_I(inode);
journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
int ret;
@@ -58,7 +81,7 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
J_ASSERT(ext4_journal_current_handle() == NULL);
- trace_ext4_sync_file(file, dentry, datasync);
+ trace_ext4_sync_file(file, datasync);
if (inode->i_sb->s_flags & MS_RDONLY)
return 0;
@@ -66,9 +89,13 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
ret = flush_completed_IO(inode);
if (ret < 0)
return ret;
-
- if (!journal)
- return simple_fsync(file, dentry, datasync);
+
+ if (!journal) {
+ ret = generic_file_fsync(file, datasync);
+ if (!ret && !list_empty(&inode->i_dentry))
+ ext4_sync_parent(inode);
+ return ret;
+ }
/*
* data=writeback,ordered:
@@ -102,7 +129,7 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
(journal->j_flags & JBD2_BARRIER))
blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL,
NULL, BLKDEV_IFL_WAIT);
- jbd2_log_wait_commit(journal, commit_tid);
+ ret = jbd2_log_wait_commit(journal, commit_tid);
} else if (journal->j_flags & JBD2_BARRIER)
blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
BLKDEV_IFL_WAIT);
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 1a0e183a2f04..25c4b3173fd9 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -240,56 +240,49 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
if (fatal)
goto error_return;
- /* Ok, now we can actually update the inode bitmaps.. */
- cleared = ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group),
- bit, bitmap_bh->b_data);
- if (!cleared)
- ext4_error(sb, "bit already cleared for inode %lu", ino);
- else {
- gdp = ext4_get_group_desc(sb, block_group, &bh2);
-
+ fatal = -ESRCH;
+ gdp = ext4_get_group_desc(sb, block_group, &bh2);
+ if (gdp) {
BUFFER_TRACE(bh2, "get_write_access");
fatal = ext4_journal_get_write_access(handle, bh2);
- if (fatal) goto error_return;
-
- if (gdp) {
- ext4_lock_group(sb, block_group);
- count = ext4_free_inodes_count(sb, gdp) + 1;
- ext4_free_inodes_set(sb, gdp, count);
- if (is_directory) {
- count = ext4_used_dirs_count(sb, gdp) - 1;
- ext4_used_dirs_set(sb, gdp, count);
- if (sbi->s_log_groups_per_flex) {
- ext4_group_t f;
-
- f = ext4_flex_group(sbi, block_group);
- atomic_dec(&sbi->s_flex_groups[f].used_dirs);
- }
+ }
+ ext4_lock_group(sb, block_group);
+ cleared = ext4_clear_bit(bit, bitmap_bh->b_data);
+ if (fatal || !cleared) {
+ ext4_unlock_group(sb, block_group);
+ goto out;
+ }
- }
- gdp->bg_checksum = ext4_group_desc_csum(sbi,
- block_group, gdp);
- ext4_unlock_group(sb, block_group);
- percpu_counter_inc(&sbi->s_freeinodes_counter);
- if (is_directory)
- percpu_counter_dec(&sbi->s_dirs_counter);
-
- if (sbi->s_log_groups_per_flex) {
- ext4_group_t f;
-
- f = ext4_flex_group(sbi, block_group);
- atomic_inc(&sbi->s_flex_groups[f].free_inodes);
- }
- }
- BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
- err = ext4_handle_dirty_metadata(handle, NULL, bh2);
- if (!fatal) fatal = err;
+ count = ext4_free_inodes_count(sb, gdp) + 1;
+ ext4_free_inodes_set(sb, gdp, count);
+ if (is_directory) {
+ count = ext4_used_dirs_count(sb, gdp) - 1;
+ ext4_used_dirs_set(sb, gdp, count);
+ percpu_counter_dec(&sbi->s_dirs_counter);
}
- BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata");
- err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
- if (!fatal)
- fatal = err;
- sb->s_dirt = 1;
+ gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
+ ext4_unlock_group(sb, block_group);
+
+ percpu_counter_inc(&sbi->s_freeinodes_counter);
+ if (sbi->s_log_groups_per_flex) {
+ ext4_group_t f = ext4_flex_group(sbi, block_group);
+
+ atomic_inc(&sbi->s_flex_groups[f].free_inodes);
+ if (is_directory)
+ atomic_dec(&sbi->s_flex_groups[f].used_dirs);
+ }
+ BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
+ fatal = ext4_handle_dirty_metadata(handle, NULL, bh2);
+out:
+ if (cleared) {
+ BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata");
+ err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
+ if (!fatal)
+ fatal = err;
+ sb->s_dirt = 1;
+ } else
+ ext4_error(sb, "bit already cleared for inode %lu", ino);
+
error_return:
brelse(bitmap_bh);
ext4_std_error(sb, fatal);
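The restructured free path computes the journal-access result before taking the group lock, clears the bitmap bit under the lock, and routes both the fatal and double-free cases through a single out: label that still reports "bit already cleared". The control-flow shape reduced to a pthread mutex (all names and the counter bookkeeping are illustrative):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t group_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long bitmap = 0x1UL;	/* inode bit 0 currently allocated */

static int free_inode(int bit, int fatal)
{
	int cleared;

	pthread_mutex_lock(&group_lock);
	cleared = (bitmap >> bit) & 1;	/* test... */
	bitmap &= ~(1UL << bit);	/* ...and clear under the lock */
	if (fatal || !cleared) {
		pthread_mutex_unlock(&group_lock);
		goto out;
	}
	/* counters would be updated here, still under the lock */
	pthread_mutex_unlock(&group_lock);
out:
	if (!cleared)
		fprintf(stderr, "bit %d already cleared\n", bit);
	return fatal;
}

int main(void)
{
	printf("%d\n", free_inode(0, 0));	/* normal free */
	printf("%d\n", free_inode(0, 0));	/* double free -> warning */
	return 0;
}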
@@ -499,7 +492,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
if (S_ISDIR(mode) &&
((parent == sb->s_root->d_inode) ||
- (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL))) {
+ (ext4_test_inode_flag(parent, EXT4_INODE_TOPDIR)))) {
int best_ndir = inodes_per_group;
int ret = -1;
@@ -1041,7 +1034,7 @@ got:
if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
/* set extent flag only for directory, file and normal symlink*/
if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) {
- EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL;
+ ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
ext4_ext_tree_init(handle, inode);
}
}
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 3e0f6af9d08d..0afc8c1d8cf3 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -149,7 +149,7 @@ int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
int ret;
/*
- * Drop i_data_sem to avoid deadlock with ext4_get_blocks At this
+ * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this
* moment, get_block can be called only for blocks inside i_size since
* page cache has been already dropped and writes are blocked by
* i_mutex. So we can safely drop the i_data_sem here.
@@ -348,9 +348,8 @@ static int __ext4_check_blockref(const char *function, struct inode *inode,
if (blk &&
unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb),
blk, 1))) {
- __ext4_error(inode->i_sb, function,
- "invalid block reference %u "
- "in inode #%lu", blk, inode->i_ino);
+ ext4_error_inode(function, inode,
+ "invalid block reference %u", blk);
return -EIO;
}
}
@@ -785,7 +784,7 @@ failed:
/* Allocation failed, free what we already allocated */
ext4_free_blocks(handle, inode, 0, new_blocks[0], 1, 0);
for (i = 1; i <= n ; i++) {
- /*
+ /*
* branch[i].bh is newly allocated, so there is no
* need to revoke the block, which is why we don't
* need to set EXT4_FREE_BLOCKS_METADATA.
@@ -875,7 +874,7 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
err_out:
for (i = 1; i <= num; i++) {
- /*
+ /*
* branch[i].bh is newly allocated, so there is no
* need to revoke the block, which is why we don't
* need to set EXT4_FREE_BLOCKS_METADATA.
@@ -890,9 +889,9 @@ err_out:
}
/*
- * The ext4_ind_get_blocks() function handles non-extents inodes
+ * The ext4_ind_map_blocks() function handles non-extents inodes
* (i.e., using the traditional indirect/double-indirect i_blocks
- * scheme) for ext4_get_blocks().
+ * scheme) for ext4_map_blocks().
*
* Allocation strategy is simple: if we have to allocate something, we will
* have to go the whole way to leaf. So let's do it before attaching anything
@@ -917,9 +916,8 @@ err_out:
* down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system
* blocks.
*/
-static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
- ext4_lblk_t iblock, unsigned int maxblocks,
- struct buffer_head *bh_result,
+static int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
+ struct ext4_map_blocks *map,
int flags)
{
int err = -EIO;
@@ -933,9 +931,9 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
int count = 0;
ext4_fsblk_t first_block = 0;
- J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL));
+ J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)));
J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
- depth = ext4_block_to_path(inode, iblock, offsets,
+ depth = ext4_block_to_path(inode, map->m_lblk, offsets,
&blocks_to_boundary);
if (depth == 0)
@@ -946,10 +944,9 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
/* Simplest case - block found, no allocation needed */
if (!partial) {
first_block = le32_to_cpu(chain[depth - 1].key);
- clear_buffer_new(bh_result);
count++;
/*map more blocks*/
- while (count < maxblocks && count <= blocks_to_boundary) {
+ while (count < map->m_len && count <= blocks_to_boundary) {
ext4_fsblk_t blk;
blk = le32_to_cpu(*(chain[depth-1].p + count));
@@ -969,7 +966,7 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
/*
* Okay, we need to do block allocation.
*/
- goal = ext4_find_goal(inode, iblock, partial);
+ goal = ext4_find_goal(inode, map->m_lblk, partial);
/* the number of blocks need to allocate for [d,t]indirect blocks */
indirect_blks = (chain + depth) - partial - 1;
@@ -979,11 +976,11 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
* direct blocks to allocate for this branch.
*/
count = ext4_blks_to_allocate(partial, indirect_blks,
- maxblocks, blocks_to_boundary);
+ map->m_len, blocks_to_boundary);
/*
* Block out ext4_truncate while we alter the tree
*/
- err = ext4_alloc_branch(handle, inode, iblock, indirect_blks,
+ err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks,
&count, goal,
offsets + (partial - chain), partial);
@@ -995,18 +992,20 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
* may need to return -EAGAIN upwards in the worst case. --sct
*/
if (!err)
- err = ext4_splice_branch(handle, inode, iblock,
+ err = ext4_splice_branch(handle, inode, map->m_lblk,
partial, indirect_blks, count);
if (err)
goto cleanup;
- set_buffer_new(bh_result);
+ map->m_flags |= EXT4_MAP_NEW;
ext4_update_inode_fsync_trans(handle, inode, 1);
got_it:
- map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
+ map->m_flags |= EXT4_MAP_MAPPED;
+ map->m_pblk = le32_to_cpu(chain[depth-1].key);
+ map->m_len = count;
if (count > blocks_to_boundary)
- set_buffer_boundary(bh_result);
+ map->m_flags |= EXT4_MAP_BOUNDARY;
err = count;
/* Clean up and exit */
partial = chain + depth - 1; /* the whole chain */
@@ -1016,7 +1015,6 @@ cleanup:
brelse(partial->bh);
partial--;
}
- BUFFER_TRACE(bh_result, "returned");
out:
return err;
}
@@ -1061,7 +1059,7 @@ static int ext4_indirect_calc_metadata_amount(struct inode *inode,
*/
static int ext4_calc_metadata_amount(struct inode *inode, sector_t lblock)
{
- if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
return ext4_ext_calc_metadata_amount(inode, lblock);
return ext4_indirect_calc_metadata_amount(inode, lblock);
@@ -1076,7 +1074,6 @@ void ext4_da_update_reserve_space(struct inode *inode,
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct ext4_inode_info *ei = EXT4_I(inode);
- int mdb_free = 0, allocated_meta_blocks = 0;
spin_lock(&ei->i_block_reservation_lock);
trace_ext4_da_update_reserve_space(inode, used);
@@ -1091,11 +1088,10 @@ void ext4_da_update_reserve_space(struct inode *inode,
/* Update per-inode reservations */
ei->i_reserved_data_blocks -= used;
- used += ei->i_allocated_meta_blocks;
ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks;
- allocated_meta_blocks = ei->i_allocated_meta_blocks;
+ percpu_counter_sub(&sbi->s_dirtyblocks_counter,
+ used + ei->i_allocated_meta_blocks);
ei->i_allocated_meta_blocks = 0;
- percpu_counter_sub(&sbi->s_dirtyblocks_counter, used);
if (ei->i_reserved_data_blocks == 0) {
/*
@@ -1103,30 +1099,23 @@ void ext4_da_update_reserve_space(struct inode *inode,
* only when we have written all of the delayed
* allocation blocks.
*/
- mdb_free = ei->i_reserved_meta_blocks;
+ percpu_counter_sub(&sbi->s_dirtyblocks_counter,
+ ei->i_reserved_meta_blocks);
ei->i_reserved_meta_blocks = 0;
ei->i_da_metadata_calc_len = 0;
- percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free);
}
spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
- /* Update quota subsystem */
- if (quota_claim) {
+ /* Update quota subsystem for data blocks */
+ if (quota_claim)
dquot_claim_block(inode, used);
- if (mdb_free)
- dquot_release_reservation_block(inode, mdb_free);
- } else {
+ else {
/*
* We did fallocate with an offset that is already delayed
* allocated. So on delayed allocated writeback we should
- * not update the quota for allocated blocks. But then
- * converting an fallocate region to initialized region would
- * have caused a metadata allocation. So claim quota for
- * that
+ * not re-claim the quota for fallocated blocks.
*/
- if (allocated_meta_blocks)
- dquot_claim_block(inode, allocated_meta_blocks);
- dquot_release_reservation_block(inode, mdb_free + used);
+ dquot_release_reservation_block(inode, used);
}
/*
@@ -1139,15 +1128,15 @@ void ext4_da_update_reserve_space(struct inode *inode,
ext4_discard_preallocations(inode);
}
-static int check_block_validity(struct inode *inode, const char *msg,
- sector_t logical, sector_t phys, int len)
+static int check_block_validity(struct inode *inode, const char *func,
+ struct ext4_map_blocks *map)
{
- if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) {
- __ext4_error(inode->i_sb, msg,
- "inode #%lu logical block %llu mapped to %llu "
- "(size %d)", inode->i_ino,
- (unsigned long long) logical,
- (unsigned long long) phys, len);
+ if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk,
+ map->m_len)) {
+ ext4_error_inode(func, inode,
+ "lblock %lu mapped to illegal pblock %llu "
+ "(length %d)", (unsigned long) map->m_lblk,
+ map->m_pblk, map->m_len);
return -EIO;
}
return 0;
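check_block_validity() now receives the whole map and rejects any mapping whose physical range escapes the filesystem. A sketch of the shape of such a range check — the real ext4_data_block_valid() additionally excludes block-group metadata, which is omitted here (the bounds are assumed numbers):

#include <stdio.h>
#include <stdint.h>

static int range_valid(uint64_t first_data, uint64_t total,
		       uint64_t pblk, uint32_t len)
{
	return pblk >= first_data &&
	       len <= total &&
	       pblk + len <= total;	/* no run past the device */
}

int main(void)
{
	/* 1-block offset start, 1M-block filesystem (assumed numbers). */
	printf("%d\n", range_valid(1, 1 << 20, 4096, 8));		/* ok */
	printf("%d\n", range_valid(1, 1 << 20, (1 << 20) - 4, 8));	/* bad */
	return 0;
}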
@@ -1212,15 +1201,15 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
}
/*
- * The ext4_get_blocks() function tries to look up the requested blocks,
+ * The ext4_map_blocks() function tries to look up the requested blocks,
* and returns if the blocks are already mapped.
*
* Otherwise it takes the write lock of i_data_sem, allocates blocks,
* stores the allocated blocks in the result buffer head and marks it
* mapped.
*
- * If file type is extents based, it will call ext4_ext_get_blocks(),
- * Otherwise, call with ext4_ind_get_blocks() to handle indirect mapping
+ * If the file is extent-based, it will call ext4_ext_map_blocks();
+ * otherwise, it calls ext4_ind_map_blocks() to handle indirect-mapping
* based files
*
* On success, it returns the number of blocks mapped or allocated.
@@ -1233,35 +1222,29 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
*
* It returns the error in case of allocation failure.
*/
-int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
- unsigned int max_blocks, struct buffer_head *bh,
- int flags)
+int ext4_map_blocks(handle_t *handle, struct inode *inode,
+ struct ext4_map_blocks *map, int flags)
{
int retval;
- clear_buffer_mapped(bh);
- clear_buffer_unwritten(bh);
-
- ext_debug("ext4_get_blocks(): inode %lu, flag %d, max_blocks %u,"
- "logical block %lu\n", inode->i_ino, flags, max_blocks,
- (unsigned long)block);
+ map->m_flags = 0;
+ ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u,"
+ "logical block %lu\n", inode->i_ino, flags, map->m_len,
+ (unsigned long) map->m_lblk);
/*
* Try to see if we can get the block without requesting a new
* file system block.
*/
down_read((&EXT4_I(inode)->i_data_sem));
- if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
- retval = ext4_ext_get_blocks(handle, inode, block, max_blocks,
- bh, 0);
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+ retval = ext4_ext_map_blocks(handle, inode, map, 0);
} else {
- retval = ext4_ind_get_blocks(handle, inode, block, max_blocks,
- bh, 0);
+ retval = ext4_ind_map_blocks(handle, inode, map, 0);
}
up_read((&EXT4_I(inode)->i_data_sem));
- if (retval > 0 && buffer_mapped(bh)) {
- int ret = check_block_validity(inode, "file system corruption",
- block, bh->b_blocknr, retval);
+ if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
+ int ret = check_block_validity(inode, __func__, map);
if (ret != 0)
return ret;
}
@@ -1277,7 +1260,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
* ext4_ext_get_block() returns the create = 0 case
* with the buffer head unmapped.
*/
- if (retval > 0 && buffer_mapped(bh))
+ if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
return retval;
/*
@@ -1290,7 +1273,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
* of BH_Unwritten and BH_Mapped flags being simultaneously
* set on the buffer_head.
*/
- clear_buffer_unwritten(bh);
+ map->m_flags &= ~EXT4_MAP_UNWRITTEN;
/*
* New blocks allocate and/or writing to uninitialized extent
@@ -1312,14 +1295,12 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
* We need to check for EXT4 here because migrate
* could have changed the inode type in between
*/
- if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
- retval = ext4_ext_get_blocks(handle, inode, block, max_blocks,
- bh, flags);
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+ retval = ext4_ext_map_blocks(handle, inode, map, flags);
} else {
- retval = ext4_ind_get_blocks(handle, inode, block,
- max_blocks, bh, flags);
+ retval = ext4_ind_map_blocks(handle, inode, map, flags);
- if (retval > 0 && buffer_new(bh)) {
+ if (retval > 0 && map->m_flags & EXT4_MAP_NEW) {
/*
* We allocated new blocks which will result in
* i_data's format changing. Force the migrate
@@ -1342,10 +1323,10 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
EXT4_I(inode)->i_delalloc_reserved_flag = 0;
up_write((&EXT4_I(inode)->i_data_sem));
- if (retval > 0 && buffer_mapped(bh)) {
- int ret = check_block_validity(inode, "file system "
- "corruption after allocation",
- block, bh->b_blocknr, retval);
+ if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
+ int ret = check_block_validity(inode,
+ "ext4_map_blocks_after_alloc",
+ map);
if (ret != 0)
return ret;
}
@@ -1355,109 +1336,109 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
/* Maximum number of blocks we map for direct IO at once. */
#define DIO_MAX_BLOCKS 4096
-int ext4_get_block(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create)
+static int _ext4_get_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh, int flags)
{
handle_t *handle = ext4_journal_current_handle();
+ struct ext4_map_blocks map;
int ret = 0, started = 0;
- unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
int dio_credits;
- if (create && !handle) {
+ map.m_lblk = iblock;
+ map.m_len = bh->b_size >> inode->i_blkbits;
+
+ if (flags && !handle) {
/* Direct IO write... */
- if (max_blocks > DIO_MAX_BLOCKS)
- max_blocks = DIO_MAX_BLOCKS;
- dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
+ if (map.m_len > DIO_MAX_BLOCKS)
+ map.m_len = DIO_MAX_BLOCKS;
+ dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
handle = ext4_journal_start(inode, dio_credits);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
- goto out;
+ return ret;
}
started = 1;
}
- ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
- create ? EXT4_GET_BLOCKS_CREATE : 0);
+ ret = ext4_map_blocks(handle, inode, &map, flags);
if (ret > 0) {
- bh_result->b_size = (ret << inode->i_blkbits);
+ map_bh(bh, inode->i_sb, map.m_pblk);
+ bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
+ bh->b_size = inode->i_sb->s_blocksize * map.m_len;
ret = 0;
}
if (started)
ext4_journal_stop(handle);
-out:
return ret;
}
+int ext4_get_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh, int create)
+{
+ return _ext4_get_block(inode, iblock, bh,
+ create ? EXT4_GET_BLOCKS_CREATE : 0);
+}
+
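The conversion above funnels every get_block variant through a single map-based helper: callers describe a logical run in, and get a physical run plus state flags out. A minimal user-space sketch of the pattern, using invented names (blk_map, map_blocks) rather than the kernel API:

#include <stdio.h>

/* Illustrative stand-in for struct ext4_map_blocks: a logical run in,
 * a physical run plus state flags out. */
struct blk_map {
	unsigned long long m_lblk;   /* first logical block (in) */
	unsigned long long m_pblk;   /* first physical block (out) */
	unsigned int       m_len;    /* number of blocks */
	unsigned int       m_flags;  /* MAP_MAPPED, MAP_NEW, ... (out) */
};

#define MAP_MAPPED 0x1
#define MAP_NEW    0x2

/* Pretend lookup: map every logical block to lblk + 1000, already allocated. */
static int map_blocks(struct blk_map *map, int create)
{
	map->m_pblk = map->m_lblk + 1000;
	map->m_flags = MAP_MAPPED | (create ? MAP_NEW : 0);
	return (int)map->m_len;       /* >0: blocks mapped, 0: hole, <0: error */
}

int main(void)
{
	struct blk_map map = { .m_lblk = 42, .m_len = 8 };
	int ret = map_blocks(&map, 0);

	if (ret > 0 && (map.m_flags & MAP_MAPPED))
		printf("mapped %d blocks at pblk %llu\n", ret, map.m_pblk);
	return 0;
}

Keeping the result in a small struct instead of a buffer head is what lets _ext4_get_block() above copy the flags into bh->b_state with a single bit-splice.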
/*
* `handle' can be NULL if create is zero
*/
struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
ext4_lblk_t block, int create, int *errp)
{
- struct buffer_head dummy;
+ struct ext4_map_blocks map;
+ struct buffer_head *bh;
int fatal = 0, err;
- int flags = 0;
J_ASSERT(handle != NULL || create == 0);
- dummy.b_state = 0;
- dummy.b_blocknr = -1000;
- buffer_trace_init(&dummy.b_history);
- if (create)
- flags |= EXT4_GET_BLOCKS_CREATE;
- err = ext4_get_blocks(handle, inode, block, 1, &dummy, flags);
- /*
- * ext4_get_blocks() returns number of blocks mapped. 0 in
- * case of a HOLE.
- */
- if (err > 0) {
- if (err > 1)
- WARN_ON(1);
- err = 0;
+ map.m_lblk = block;
+ map.m_len = 1;
+ err = ext4_map_blocks(handle, inode, &map,
+ create ? EXT4_GET_BLOCKS_CREATE : 0);
+
+ if (err < 0)
+ *errp = err;
+ if (err <= 0)
+ return NULL;
+ *errp = 0;
+
+ bh = sb_getblk(inode->i_sb, map.m_pblk);
+ if (!bh) {
+ *errp = -EIO;
+ return NULL;
}
- *errp = err;
- if (!err && buffer_mapped(&dummy)) {
- struct buffer_head *bh;
- bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
- if (!bh) {
- *errp = -EIO;
- goto err;
- }
- if (buffer_new(&dummy)) {
- J_ASSERT(create != 0);
- J_ASSERT(handle != NULL);
+ if (map.m_flags & EXT4_MAP_NEW) {
+ J_ASSERT(create != 0);
+ J_ASSERT(handle != NULL);
- /*
- * Now that we do not always journal data, we should
- * keep in mind whether this should always journal the
- * new buffer as metadata. For now, regular file
- * writes use ext4_get_block instead, so it's not a
- * problem.
- */
- lock_buffer(bh);
- BUFFER_TRACE(bh, "call get_create_access");
- fatal = ext4_journal_get_create_access(handle, bh);
- if (!fatal && !buffer_uptodate(bh)) {
- memset(bh->b_data, 0, inode->i_sb->s_blocksize);
- set_buffer_uptodate(bh);
- }
- unlock_buffer(bh);
- BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
- err = ext4_handle_dirty_metadata(handle, inode, bh);
- if (!fatal)
- fatal = err;
- } else {
- BUFFER_TRACE(bh, "not a new buffer");
- }
- if (fatal) {
- *errp = fatal;
- brelse(bh);
- bh = NULL;
+ /*
+ * Now that we do not always journal data, we should
+ * keep in mind whether this should always journal the
+ * new buffer as metadata. For now, regular file
+ * writes use ext4_get_block instead, so it's not a
+ * problem.
+ */
+ lock_buffer(bh);
+ BUFFER_TRACE(bh, "call get_create_access");
+ fatal = ext4_journal_get_create_access(handle, bh);
+ if (!fatal && !buffer_uptodate(bh)) {
+ memset(bh->b_data, 0, inode->i_sb->s_blocksize);
+ set_buffer_uptodate(bh);
}
- return bh;
+ unlock_buffer(bh);
+ BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+ err = ext4_handle_dirty_metadata(handle, inode, bh);
+ if (!fatal)
+ fatal = err;
+ } else {
+ BUFFER_TRACE(bh, "not a new buffer");
}
-err:
- return NULL;
+ if (fatal) {
+ *errp = fatal;
+ brelse(bh);
+ bh = NULL;
+ }
+ return bh;
}
struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
@@ -1860,7 +1841,7 @@ static int ext4_da_reserve_space(struct inode *inode, sector_t lblock)
int retries = 0;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct ext4_inode_info *ei = EXT4_I(inode);
- unsigned long md_needed, md_reserved;
+ unsigned long md_needed;
int ret;
/*
@@ -1870,22 +1851,24 @@ static int ext4_da_reserve_space(struct inode *inode, sector_t lblock)
*/
repeat:
spin_lock(&ei->i_block_reservation_lock);
- md_reserved = ei->i_reserved_meta_blocks;
md_needed = ext4_calc_metadata_amount(inode, lblock);
trace_ext4_da_reserve_space(inode, md_needed);
spin_unlock(&ei->i_block_reservation_lock);
/*
- * Make quota reservation here to prevent quota overflow
- * later. Real quota accounting is done at pages writeout
- * time.
+ * We will charge metadata quota at writeout time; this saves
+ * us from metadata over-estimation, though we may go over by
+ * a small amount in the end. Here we just reserve for data.
*/
- ret = dquot_reserve_block(inode, md_needed + 1);
+ ret = dquot_reserve_block(inode, 1);
if (ret)
return ret;
-
+ /*
+ * We do still charge estimated metadata to the sb though;
+ * we cannot afford to run out of free blocks.
+ */
if (ext4_claim_free_blocks(sbi, md_needed + 1)) {
- dquot_release_reservation_block(inode, md_needed + 1);
+ dquot_release_reservation_block(inode, 1);
if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
yield();
goto repeat;
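The reworked reservation charges quota for just the one data block, while the metadata estimate is claimed only against the filesystem's free-block counters. A toy model of that split, with all names invented:

#include <stdio.h>

static long quota_reserved;     /* per-user quota reservation */
static long fs_free_blocks = 4; /* free blocks left in the fs */

static int claim_free_blocks(long n)
{
	if (fs_free_blocks < n)
		return -1;
	fs_free_blocks -= n;
	return 0;
}

/* Reserve one data block against quota; metadata (md_needed) is charged
 * to the fs counters only, and to quota later at writeout time. */
static int reserve_delalloc_block(long md_needed)
{
	quota_reserved += 1;                /* dquot_reserve_block(inode, 1) */
	if (claim_free_blocks(md_needed + 1)) {
		quota_reserved -= 1;        /* release on failure */
		return -1;                  /* ENOSPC */
	}
	return 0;
}

int main(void)
{
	if (reserve_delalloc_block(2) == 0)
		printf("reserved: quota=%ld fs_free=%ld\n",
		       quota_reserved, fs_free_blocks);
	return 0;
}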
@@ -1910,6 +1893,7 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+ trace_ext4_da_release_space(inode, to_free);
if (unlikely(to_free > ei->i_reserved_data_blocks)) {
/*
* if there aren't enough reserved blocks, then the
@@ -1932,12 +1916,13 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
* only when we have written all of the delayed
* allocation blocks.
*/
- to_free += ei->i_reserved_meta_blocks;
+ percpu_counter_sub(&sbi->s_dirtyblocks_counter,
+ ei->i_reserved_meta_blocks);
ei->i_reserved_meta_blocks = 0;
ei->i_da_metadata_calc_len = 0;
}
- /* update fs dirty blocks counter */
+ /* update fs dirty data blocks counter */
percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free);
spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
@@ -2042,28 +2027,23 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
/*
* mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers
*
- * @mpd->inode - inode to walk through
- * @exbh->b_blocknr - first block on a disk
- * @exbh->b_size - amount of space in bytes
- * @logical - first logical block to start assignment with
- *
* the function goes through all passed space and puts actual disk
* block numbers into buffer heads, dropping BH_Delay and BH_Unwritten
*/
-static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
- struct buffer_head *exbh)
+static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd,
+ struct ext4_map_blocks *map)
{
struct inode *inode = mpd->inode;
struct address_space *mapping = inode->i_mapping;
- int blocks = exbh->b_size >> inode->i_blkbits;
- sector_t pblock = exbh->b_blocknr, cur_logical;
+ int blocks = map->m_len;
+ sector_t pblock = map->m_pblk, cur_logical;
struct buffer_head *head, *bh;
pgoff_t index, end;
struct pagevec pvec;
int nr_pages, i;
- index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
- end = (logical + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ index = map->m_lblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ end = (map->m_lblk + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
pagevec_init(&pvec, 0);
@@ -2090,17 +2070,16 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
/* skip blocks out of the range */
do {
- if (cur_logical >= logical)
+ if (cur_logical >= map->m_lblk)
break;
cur_logical++;
} while ((bh = bh->b_this_page) != head);
do {
- if (cur_logical >= logical + blocks)
+ if (cur_logical >= map->m_lblk + blocks)
break;
- if (buffer_delay(bh) ||
- buffer_unwritten(bh)) {
+ if (buffer_delay(bh) || buffer_unwritten(bh)) {
BUG_ON(bh->b_bdev != inode->i_sb->s_bdev);
@@ -2119,7 +2098,7 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
} else if (buffer_mapped(bh))
BUG_ON(bh->b_blocknr != pblock);
- if (buffer_uninit(exbh))
+ if (map->m_flags & EXT4_MAP_UNINIT)
set_buffer_uninit(bh);
cur_logical++;
pblock++;
@@ -2130,21 +2109,6 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
}
-/*
- * __unmap_underlying_blocks - just a helper function to unmap
- * set of blocks described by @bh
- */
-static inline void __unmap_underlying_blocks(struct inode *inode,
- struct buffer_head *bh)
-{
- struct block_device *bdev = inode->i_sb->s_bdev;
- int blocks, i;
-
- blocks = bh->b_size >> inode->i_blkbits;
- for (i = 0; i < blocks; i++)
- unmap_underlying_metadata(bdev, bh->b_blocknr + i);
-}
-
static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
sector_t logical, long blk_cnt)
{
@@ -2206,7 +2170,7 @@ static void ext4_print_free_blocks(struct inode *inode)
static int mpage_da_map_blocks(struct mpage_da_data *mpd)
{
int err, blks, get_blocks_flags;
- struct buffer_head new;
+ struct ext4_map_blocks map;
sector_t next = mpd->b_blocknr;
unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
loff_t disksize = EXT4_I(mpd->inode)->i_disksize;
@@ -2247,15 +2211,15 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
* EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting
* variables are updated after the blocks have been allocated.
*/
- new.b_state = 0;
+ map.m_lblk = next;
+ map.m_len = max_blocks;
get_blocks_flags = EXT4_GET_BLOCKS_CREATE;
if (ext4_should_dioread_nolock(mpd->inode))
get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
if (mpd->b_state & (1 << BH_Delay))
get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
- blks = ext4_get_blocks(handle, mpd->inode, next, max_blocks,
- &new, get_blocks_flags);
+ blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags);
if (blks < 0) {
err = blks;
/*
@@ -2282,7 +2246,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
ext4_msg(mpd->inode->i_sb, KERN_CRIT,
"delayed block allocation failed for inode %lu at "
"logical offset %llu with max blocks %zd with "
- "error %d\n", mpd->inode->i_ino,
+ "error %d", mpd->inode->i_ino,
(unsigned long long) next,
mpd->b_size >> mpd->inode->i_blkbits, err);
printk(KERN_CRIT "This should not happen!! "
@@ -2297,10 +2261,13 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
}
BUG_ON(blks == 0);
- new.b_size = (blks << mpd->inode->i_blkbits);
+ if (map.m_flags & EXT4_MAP_NEW) {
+ struct block_device *bdev = mpd->inode->i_sb->s_bdev;
+ int i;
- if (buffer_new(&new))
- __unmap_underlying_blocks(mpd->inode, &new);
+ for (i = 0; i < map.m_len; i++)
+ unmap_underlying_metadata(bdev, map.m_pblk + i);
+ }
/*
* If blocks are delayed marked, we need to
@@ -2308,7 +2275,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
*/
if ((mpd->b_state & (1 << BH_Delay)) ||
(mpd->b_state & (1 << BH_Unwritten)))
- mpage_put_bnr_to_bhs(mpd, next, &new);
+ mpage_put_bnr_to_bhs(mpd, &map);
if (ext4_should_order_data(mpd->inode)) {
err = ext4_jbd2_file_inode(handle, mpd->inode);
@@ -2349,8 +2316,17 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
sector_t next;
int nrblocks = mpd->b_size >> mpd->inode->i_blkbits;
+ /*
+	 * XXX Don't go larger than mballoc is willing to allocate.
+ * This is a stopgap solution. We eventually need to fold
+ * mpage_da_submit_io() into this function and then call
+ * ext4_get_blocks() multiple times in a loop
+ */
+ if (nrblocks >= 8*1024*1024/mpd->inode->i_sb->s_blocksize)
+ goto flush_it;
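+	/* (e.g. with 4 KiB blocks: 8*1024*1024 / 4096 = 2048 blocks) */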
+
/* check if the reserved journal credits might overflow */
- if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) {
+ if (!(ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS))) {
if (nrblocks >= EXT4_MAX_TRANS_DATA) {
/*
* With non-extent format we are limited by the journal
@@ -2423,17 +2399,6 @@ static int __mpage_da_writepage(struct page *page,
struct buffer_head *bh, *head;
sector_t logical;
- if (mpd->io_done) {
- /*
- * Rest of the page in the page_vec
- * redirty then and skip then. We will
- * try to write them again after
- * starting a new transaction
- */
- redirty_page_for_writepage(wbc, page);
- unlock_page(page);
- return MPAGE_DA_EXTENT_TAIL;
- }
/*
* Can we merge this page to current extent?
*/
@@ -2528,8 +2493,9 @@ static int __mpage_da_writepage(struct page *page,
* initialized properly.
*/
static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create)
+ struct buffer_head *bh, int create)
{
+ struct ext4_map_blocks map;
int ret = 0;
sector_t invalid_block = ~((sector_t) 0xffff);
@@ -2537,16 +2503,22 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
invalid_block = ~0;
BUG_ON(create == 0);
- BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
+ BUG_ON(bh->b_size != inode->i_sb->s_blocksize);
+
+ map.m_lblk = iblock;
+ map.m_len = 1;
/*
* first, we need to know whether the block is allocated already
* preallocated blocks are unmapped but should be treated
* the same as allocated blocks.
*/
- ret = ext4_get_blocks(NULL, inode, iblock, 1, bh_result, 0);
- if ((ret == 0) && !buffer_delay(bh_result)) {
- /* the block isn't (pre)allocated yet, let's reserve space */
+ ret = ext4_map_blocks(NULL, inode, &map, 0);
+ if (ret < 0)
+ return ret;
+ if (ret == 0) {
+ if (buffer_delay(bh))
+ return 0; /* Not sure this could or should happen */
/*
* XXX: __block_prepare_write() unmaps passed block,
* is it OK?
@@ -2556,26 +2528,26 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
/* not enough space to reserve */
return ret;
- map_bh(bh_result, inode->i_sb, invalid_block);
- set_buffer_new(bh_result);
- set_buffer_delay(bh_result);
- } else if (ret > 0) {
- bh_result->b_size = (ret << inode->i_blkbits);
- if (buffer_unwritten(bh_result)) {
- /* A delayed write to unwritten bh should
- * be marked new and mapped. Mapped ensures
- * that we don't do get_block multiple times
- * when we write to the same offset and new
- * ensures that we do proper zero out for
- * partial write.
- */
- set_buffer_new(bh_result);
- set_buffer_mapped(bh_result);
- }
- ret = 0;
+ map_bh(bh, inode->i_sb, invalid_block);
+ set_buffer_new(bh);
+ set_buffer_delay(bh);
+ return 0;
}
- return ret;
+ map_bh(bh, inode->i_sb, map.m_pblk);
+ bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
+
+ if (buffer_unwritten(bh)) {
+ /* A delayed write to unwritten bh should be marked
+ * new and mapped. Mapped ensures that we don't do
+ * get_block multiple times when we write to the same
+ * offset and new ensures that we do proper zero out
+ * for partial write.
+ */
+ set_buffer_new(bh);
+ set_buffer_mapped(bh);
+ }
+ return 0;
}
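Both _ext4_get_block() and ext4_da_get_block_prep() now install the mapping result with the same one-line bit-splice on bh->b_state. A self-contained illustration of the idiom, with made-up flag values:

#include <assert.h>

#define MAP_FLAGS  0x7u   /* bits owned by the mapping layer */
#define MAP_MAPPED 0x1u
#define MAP_NEW    0x2u

#define BH_PRIVATE 0x8u   /* unrelated buffer-head state that must survive */

int main(void)
{
	unsigned int b_state = BH_PRIVATE | MAP_NEW;  /* stale mapping bits */
	unsigned int m_flags = MAP_MAPPED;            /* fresh lookup result */

	/* Clear only the mapping-owned bits, then install the new ones. */
	b_state = (b_state & ~MAP_FLAGS) | m_flags;

	assert(b_state == (BH_PRIVATE | MAP_MAPPED));
	return 0;
}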
/*
@@ -2597,21 +2569,8 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
{
- int ret = 0;
- unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
-
BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
-
- /*
- * we don't want to do block allocation in writepage
- * so call get_block_wrap with create = 0
- */
- ret = ext4_get_blocks(NULL, inode, iblock, max_blocks, bh_result, 0);
- if (ret > 0) {
- bh_result->b_size = (ret << inode->i_blkbits);
- ret = 0;
- }
- return ret;
+ return _ext4_get_block(inode, iblock, bh_result, 0);
}
static int bget_one(handle_t *handle, struct buffer_head *bh)
@@ -2821,13 +2780,131 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
* number of contiguous block. So we will limit
* number of contiguous block to a sane value
*/
- if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) &&
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) &&
(max_blocks > EXT4_MAX_TRANS_DATA))
max_blocks = EXT4_MAX_TRANS_DATA;
return ext4_chunk_trans_blocks(inode, max_blocks);
}
+/*
+ * write_cache_pages_da - walk the list of dirty pages of the given
+ * address space and call the callback function (which usually writes
+ * the pages).
+ *
+ * This is a forked version of write_cache_pages(). Differences:
+ * Range cyclic is ignored.
+ * no_nrwrite_index_update is always presumed true
+ */
+static int write_cache_pages_da(struct address_space *mapping,
+ struct writeback_control *wbc,
+ struct mpage_da_data *mpd)
+{
+ int ret = 0;
+ int done = 0;
+ struct pagevec pvec;
+ int nr_pages;
+ pgoff_t index;
+ pgoff_t end; /* Inclusive */
+ long nr_to_write = wbc->nr_to_write;
+
+ pagevec_init(&pvec, 0);
+ index = wbc->range_start >> PAGE_CACHE_SHIFT;
+ end = wbc->range_end >> PAGE_CACHE_SHIFT;
+
+ while (!done && (index <= end)) {
+ int i;
+
+ nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+ PAGECACHE_TAG_DIRTY,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
+ if (nr_pages == 0)
+ break;
+
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+
+ /*
+ * At this point, the page may be truncated or
+ * invalidated (changing page->mapping to NULL), or
+ * even swizzled back from swapper_space to tmpfs file
+ * mapping. However, page->index will not change
+ * because we have a reference on the page.
+ */
+ if (page->index > end) {
+ done = 1;
+ break;
+ }
+
+ lock_page(page);
+
+ /*
+ * Page truncated or invalidated. We can freely skip it
+ * then, even for data integrity operations: the page
+ * has disappeared concurrently, so there could be no
+			 * real expectation of this data integrity operation
+ * even if there is now a new, dirty page at the same
+ * pagecache address.
+ */
+ if (unlikely(page->mapping != mapping)) {
+continue_unlock:
+ unlock_page(page);
+ continue;
+ }
+
+ if (!PageDirty(page)) {
+ /* someone wrote it for us */
+ goto continue_unlock;
+ }
+
+ if (PageWriteback(page)) {
+ if (wbc->sync_mode != WB_SYNC_NONE)
+ wait_on_page_writeback(page);
+ else
+ goto continue_unlock;
+ }
+
+ BUG_ON(PageWriteback(page));
+ if (!clear_page_dirty_for_io(page))
+ goto continue_unlock;
+
+ ret = __mpage_da_writepage(page, wbc, mpd);
+ if (unlikely(ret)) {
+ if (ret == AOP_WRITEPAGE_ACTIVATE) {
+ unlock_page(page);
+ ret = 0;
+ } else {
+ done = 1;
+ break;
+ }
+ }
+
+ if (nr_to_write > 0) {
+ nr_to_write--;
+ if (nr_to_write == 0 &&
+ wbc->sync_mode == WB_SYNC_NONE) {
+ /*
+ * We stop writing back only if we are
+ * not doing integrity sync. In case of
+ * integrity sync we have to keep going
+ * because someone may be concurrently
+ * dirtying pages, and we might have
+ * synced a lot of newly appeared dirty
+ * pages, but have not synced all of the
+ * old dirty pages.
+ */
+ done = 1;
+ break;
+ }
+ }
+ }
+ pagevec_release(&pvec);
+ cond_resched();
+ }
+ return ret;
+}
+
+
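write_cache_pages_da() gathers dirty pages in pagevec-sized batches, processes each batch, then releases it and rescans from the updated index. A user-space model of that gather-process-release loop, using an array in place of the pagevec:

#include <stdio.h>

#define BATCH 14   /* roughly PAGEVEC_SIZE */

/* Pretend index space with a dirty bit per "page". */
static int dirty[100];

/* Collect up to BATCH dirty indexes, advancing *index past what we saw. */
static int lookup_dirty(int *batch, int *index, int end)
{
	int n = 0;

	while (*index <= end && n < BATCH) {
		if (dirty[*index])
			batch[n++] = *index;
		(*index)++;
	}
	return n;
}

int main(void)
{
	int batch[BATCH], index = 0, end = 99, nr, i;

	dirty[3] = dirty[42] = dirty[97] = 1;
	while ((nr = lookup_dirty(batch, &index, end)) != 0) {
		for (i = 0; i < nr; i++) {
			dirty[batch[i]] = 0;   /* clear_page_dirty_for_io() */
			printf("wrote page %d\n", batch[i]);
		}
		/* pagevec_release() + cond_resched() would go here */
	}
	return 0;
}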
static int ext4_da_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
@@ -2836,7 +2913,6 @@ static int ext4_da_writepages(struct address_space *mapping,
handle_t *handle = NULL;
struct mpage_da_data mpd;
struct inode *inode = mapping->host;
- int no_nrwrite_index_update;
int pages_written = 0;
long pages_skipped;
unsigned int max_pages;
@@ -2916,12 +2992,6 @@ static int ext4_da_writepages(struct address_space *mapping,
mpd.wbc = wbc;
mpd.inode = mapping->host;
- /*
- * we don't want write_cache_pages to update
- * nr_to_write and writeback_index
- */
- no_nrwrite_index_update = wbc->no_nrwrite_index_update;
- wbc->no_nrwrite_index_update = 1;
pages_skipped = wbc->pages_skipped;
retry:
@@ -2941,7 +3011,7 @@ retry:
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
- "%ld pages, ino %lu; err %d\n", __func__,
+ "%ld pages, ino %lu; err %d", __func__,
wbc->nr_to_write, inode->i_ino, ret);
goto out_writepages;
}
@@ -2963,8 +3033,7 @@ retry:
mpd.io_done = 0;
mpd.pages_written = 0;
mpd.retval = 0;
- ret = write_cache_pages(mapping, wbc, __mpage_da_writepage,
- &mpd);
+ ret = write_cache_pages_da(mapping, wbc, &mpd);
/*
* If we have a contiguous extent of pages and we
* haven't done the I/O yet, map the blocks and submit
@@ -3016,7 +3085,7 @@ retry:
if (pages_skipped != wbc->pages_skipped)
ext4_msg(inode->i_sb, KERN_CRIT,
"This should not happen leaving %s "
- "with nr_to_write = %ld ret = %d\n",
+ "with nr_to_write = %ld ret = %d",
__func__, wbc->nr_to_write, ret);
/* Update index */
@@ -3030,8 +3099,6 @@ retry:
mapping->writeback_index = index;
out_writepages:
- if (!no_nrwrite_index_update)
- wbc->no_nrwrite_index_update = 0;
wbc->nr_to_write -= nr_to_writebump;
wbc->range_start = range_start;
trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
@@ -3076,7 +3143,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
{
- int ret, retries = 0, quota_retries = 0;
+ int ret, retries = 0;
struct page *page;
pgoff_t index;
unsigned from, to;
@@ -3135,22 +3202,6 @@ retry:
if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
goto retry;
-
- if ((ret == -EDQUOT) &&
- EXT4_I(inode)->i_reserved_meta_blocks &&
- (quota_retries++ < 3)) {
- /*
- * Since we often over-estimate the number of meta
- * data blocks required, we may sometimes get a
- * spurios out of quota error even though there would
- * be enough space once we write the data blocks and
- * find out how many meta data blocks were _really_
- * required. So try forcing the inode write to see if
- * that helps.
- */
- write_inode_now(inode, (quota_retries == 3));
- goto retry;
- }
out:
return ret;
}
@@ -3546,46 +3597,18 @@ out:
return ret;
}
+/*
+ * ext4_get_block used when preparing for a DIO write or buffer write.
+ * We allocate an uninitialized extent if blocks haven't been allocated.
+ * The extent will be converted to initialized after the IO is complete.
+ */
static int ext4_get_block_write(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
{
- handle_t *handle = ext4_journal_current_handle();
- int ret = 0;
- unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
- int dio_credits;
- int started = 0;
-
ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n",
inode->i_ino, create);
- /*
- * ext4_get_block in prepare for a DIO write or buffer write.
- * We allocate an uinitialized extent if blocks haven't been allocated.
- * The extent will be converted to initialized after IO complete.
- */
- create = EXT4_GET_BLOCKS_IO_CREATE_EXT;
-
- if (!handle) {
- if (max_blocks > DIO_MAX_BLOCKS)
- max_blocks = DIO_MAX_BLOCKS;
- dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
- handle = ext4_journal_start(inode, dio_credits);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto out;
- }
- started = 1;
- }
-
- ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
- create);
- if (ret > 0) {
- bh_result->b_size = (ret << inode->i_blkbits);
- ret = 0;
- }
- if (started)
- ext4_journal_stop(handle);
-out:
- return ret;
+ return _ext4_get_block(inode, iblock, bh_result,
+ EXT4_GET_BLOCKS_IO_CREATE_EXT);
}
static void dump_completed_IO(struct inode * inode)
@@ -3752,7 +3775,8 @@ static ext4_io_end_t *ext4_init_io_end (struct inode *inode, gfp_t flags)
}
static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
- ssize_t size, void *private)
+ ssize_t size, void *private, int ret,
+ bool is_async)
{
ext4_io_end_t *io_end = iocb->private;
struct workqueue_struct *wq;
@@ -3761,7 +3785,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
/* if not async direct IO or dio with 0 bytes write, just return */
if (!io_end || !size)
- return;
+ goto out;
ext_debug("ext4_end_io_dio(): io_end 0x%p"
"for inode %lu, iocb 0x%p, offset %llu, size %llu\n",
@@ -3772,7 +3796,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
if (io_end->flag != EXT4_IO_UNWRITTEN){
ext4_free_io_end(io_end);
iocb->private = NULL;
- return;
+ goto out;
}
io_end->offset = offset;
@@ -3789,6 +3813,9 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
list_add_tail(&io_end->list, &ei->i_completed_io_list);
spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
iocb->private = NULL;
+out:
+ if (is_async)
+ aio_complete(iocb, ret, 0);
}
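Because the completion hook now receives ret and is_async, every early exit above has to fall through to the aio_complete() call rather than returning, hence the goto out conversions. A compact user-space model of that contract (the callback and names are invented):

#include <stdio.h>

static void aio_done(long ret) { printf("completed, ret=%ld\n", ret); }

static void end_io(void *private, long size, long ret, int is_async)
{
	if (!private || !size)
		goto out;        /* nothing to queue, but still complete */

	/* ... queue conversion work, hand ownership to a workqueue ... */

out:
	if (is_async)
		aio_done(ret);   /* async callers are completed here, not by the submitter */
}

int main(void)
{
	end_io(NULL, 0, 123, 1);
	return 0;
}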
static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
@@ -3973,7 +4000,7 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
- if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
@@ -4302,10 +4329,9 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free,
count)) {
- ext4_error(inode->i_sb, "inode #%lu: "
- "attempt to clear blocks %llu len %lu, invalid",
- inode->i_ino, (unsigned long long) block_to_free,
- count);
+ EXT4_ERROR_INODE(inode, "attempt to clear invalid "
+ "blocks %llu len %lu",
+ (unsigned long long) block_to_free, count);
return 1;
}
@@ -4410,11 +4436,10 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh))
ext4_handle_dirty_metadata(handle, inode, this_bh);
else
- ext4_error(inode->i_sb,
- "circular indirect block detected, "
- "inode=%lu, block=%llu",
- inode->i_ino,
- (unsigned long long) this_bh->b_blocknr);
+ EXT4_ERROR_INODE(inode,
+ "circular indirect block detected at "
+ "block %llu",
+ (unsigned long long) this_bh->b_blocknr);
}
}
@@ -4452,11 +4477,10 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
if (!ext4_data_block_valid(EXT4_SB(inode->i_sb),
nr, 1)) {
- ext4_error(inode->i_sb,
- "indirect mapped block in inode "
- "#%lu invalid (level %d, blk #%lu)",
- inode->i_ino, depth,
- (unsigned long) nr);
+ EXT4_ERROR_INODE(inode,
+ "invalid indirect mapped "
+ "block %lu (level %d)",
+ (unsigned long) nr, depth);
break;
}
@@ -4468,9 +4492,9 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
* (should be rare).
*/
if (!bh) {
- ext4_error(inode->i_sb,
- "Read failure, inode=%lu, block=%llu",
- inode->i_ino, nr);
+ EXT4_ERROR_INODE(inode,
+ "Read failure block=%llu",
+ (unsigned long long) nr);
continue;
}
@@ -4612,12 +4636,12 @@ void ext4_truncate(struct inode *inode)
if (!ext4_can_truncate(inode))
return;
- EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL;
+ ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
- if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
ext4_ext_truncate(inode);
return;
}
@@ -4785,8 +4809,8 @@ static int __ext4_get_inode_loc(struct inode *inode,
bh = sb_getblk(sb, block);
if (!bh) {
- ext4_error(sb, "unable to read inode block - "
- "inode=%lu, block=%llu", inode->i_ino, block);
+ EXT4_ERROR_INODE(inode, "unable to read inode block - "
+ "block %llu", block);
return -EIO;
}
if (!buffer_uptodate(bh)) {
@@ -4884,8 +4908,8 @@ make_io:
submit_bh(READ_META, bh);
wait_on_buffer(bh);
if (!buffer_uptodate(bh)) {
- ext4_error(sb, "unable to read inode block - inode=%lu,"
- " block=%llu", inode->i_ino, block);
+ EXT4_ERROR_INODE(inode, "unable to read inode "
+ "block %llu", block);
brelse(bh);
return -EIO;
}
@@ -4922,20 +4946,26 @@ void ext4_set_inode_flags(struct inode *inode)
/* Propagate flags from i_flags to EXT4_I(inode)->i_flags */
void ext4_get_inode_flags(struct ext4_inode_info *ei)
{
- unsigned int flags = ei->vfs_inode.i_flags;
-
- ei->i_flags &= ~(EXT4_SYNC_FL|EXT4_APPEND_FL|
- EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL|EXT4_DIRSYNC_FL);
- if (flags & S_SYNC)
- ei->i_flags |= EXT4_SYNC_FL;
- if (flags & S_APPEND)
- ei->i_flags |= EXT4_APPEND_FL;
- if (flags & S_IMMUTABLE)
- ei->i_flags |= EXT4_IMMUTABLE_FL;
- if (flags & S_NOATIME)
- ei->i_flags |= EXT4_NOATIME_FL;
- if (flags & S_DIRSYNC)
- ei->i_flags |= EXT4_DIRSYNC_FL;
+ unsigned int vfs_fl;
+ unsigned long old_fl, new_fl;
+
+ do {
+ vfs_fl = ei->vfs_inode.i_flags;
+ old_fl = ei->i_flags;
+ new_fl = old_fl & ~(EXT4_SYNC_FL|EXT4_APPEND_FL|
+ EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL|
+ EXT4_DIRSYNC_FL);
+ if (vfs_fl & S_SYNC)
+ new_fl |= EXT4_SYNC_FL;
+ if (vfs_fl & S_APPEND)
+ new_fl |= EXT4_APPEND_FL;
+ if (vfs_fl & S_IMMUTABLE)
+ new_fl |= EXT4_IMMUTABLE_FL;
+ if (vfs_fl & S_NOATIME)
+ new_fl |= EXT4_NOATIME_FL;
+ if (vfs_fl & S_DIRSYNC)
+ new_fl |= EXT4_DIRSYNC_FL;
+ } while (cmpxchg(&ei->i_flags, old_fl, new_fl) != old_fl);
}
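The rewritten flag propagation is a lock-free read-modify-write: snapshot i_flags, recompute the derived bits from that snapshot, and retry if another CPU raced in between. A user-space sketch of the same retry loop using GCC's __sync compare-and-swap builtin, with illustrative flag values:

#include <stdio.h>

#define FL_SYNC   0x1ul
#define FL_APPEND 0x2ul

static unsigned long i_flags = FL_APPEND;  /* shared word, updated concurrently */

static void propagate(unsigned long vfs_sync)
{
	unsigned long old_fl, new_fl;

	do {
		old_fl = i_flags;            /* snapshot */
		new_fl = old_fl & ~FL_SYNC;  /* drop the bits we own */
		if (vfs_sync)
			new_fl |= FL_SYNC;   /* recompute them */
		/* Retry if i_flags changed since the snapshot. */
	} while (__sync_val_compare_and_swap(&i_flags, old_fl, new_fl) != old_fl);
}

int main(void)
{
	propagate(1);
	printf("i_flags = %#lx\n", i_flags);   /* FL_APPEND | FL_SYNC */
	return 0;
}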
static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
@@ -5096,8 +5126,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
ret = 0;
if (ei->i_file_acl &&
!ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) {
- ext4_error(sb, "bad extended attribute block %llu inode #%lu",
- ei->i_file_acl, inode->i_ino);
+ EXT4_ERROR_INODE(inode, "bad extended attribute block %llu",
+ ei->i_file_acl);
ret = -EIO;
goto bad_inode;
} else if (ei->i_flags & EXT4_EXTENTS_FL) {
@@ -5142,8 +5172,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
} else {
ret = -EIO;
- ext4_error(inode->i_sb, "bogus i_mode (%o) for inode=%lu",
- inode->i_mode, inode->i_ino);
+ EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode);
goto bad_inode;
}
brelse(iloc.bh);
@@ -5172,7 +5201,7 @@ static int ext4_inode_blocks_set(handle_t *handle,
*/
raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
raw_inode->i_blocks_high = 0;
- ei->i_flags &= ~EXT4_HUGE_FILE_FL;
+ ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
return 0;
}
if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE))
@@ -5185,9 +5214,9 @@ static int ext4_inode_blocks_set(handle_t *handle,
*/
raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
- ei->i_flags &= ~EXT4_HUGE_FILE_FL;
+ ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
} else {
- ei->i_flags |= EXT4_HUGE_FILE_FL;
+ ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE);
/* i_block is stored in file system block size */
i_blocks = i_blocks >> (inode->i_blkbits - 9);
raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
@@ -5381,9 +5410,9 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
if (wbc->sync_mode == WB_SYNC_ALL)
sync_dirty_buffer(iloc.bh);
if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
- ext4_error(inode->i_sb, "IO error syncing inode, "
- "inode=%lu, block=%llu", inode->i_ino,
- (unsigned long long)iloc.bh->b_blocknr);
+ EXT4_ERROR_INODE(inode,
+ "IO error syncing inode (block=%llu)",
+ (unsigned long long) iloc.bh->b_blocknr);
err = -EIO;
}
brelse(iloc.bh);
@@ -5455,7 +5484,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
}
if (attr->ia_valid & ATTR_SIZE) {
- if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) {
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
if (attr->ia_size > sbi->s_bitmap_maxbytes) {
@@ -5468,7 +5497,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
if (S_ISREG(inode->i_mode) &&
attr->ia_valid & ATTR_SIZE &&
(attr->ia_size < inode->i_size ||
- (EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL))) {
+ (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))) {
handle_t *handle;
handle = ext4_journal_start(inode, 3);
@@ -5500,7 +5529,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
}
}
/* ext4_truncate will clear the flag */
- if ((EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL))
+ if ((ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))
ext4_truncate(inode);
}
@@ -5576,7 +5605,7 @@ static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks,
static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
{
- if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
return ext4_indirect_trans_blocks(inode, nrblocks, chunk);
return ext4_ext_index_trans_blocks(inode, nrblocks, chunk);
}
@@ -5911,9 +5940,9 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
*/
if (val)
- EXT4_I(inode)->i_flags |= EXT4_JOURNAL_DATA_FL;
+ ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
else
- EXT4_I(inode)->i_flags &= ~EXT4_JOURNAL_DATA_FL;
+ ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
ext4_set_aops(inode);
jbd2_journal_unlock_updates(journal);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 016d0249294f..bf5ae883b1bd 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -258,7 +258,7 @@ setversion_out:
if (me.moved_len > 0)
file_remove_suid(donor_filp);
- if (copy_to_user((struct move_extent __user *)arg,
+ if (copy_to_user((struct move_extent __user *)arg,
&me, sizeof(me)))
err = -EFAULT;
mext_out:
@@ -373,7 +373,30 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case EXT4_IOC32_SETRSVSZ:
cmd = EXT4_IOC_SETRSVSZ;
break;
- case EXT4_IOC_GROUP_ADD:
+ case EXT4_IOC32_GROUP_ADD: {
+ struct compat_ext4_new_group_input __user *uinput;
+ struct ext4_new_group_input input;
+ mm_segment_t old_fs;
+ int err;
+
+ uinput = compat_ptr(arg);
+ err = get_user(input.group, &uinput->group);
+ err |= get_user(input.block_bitmap, &uinput->block_bitmap);
+ err |= get_user(input.inode_bitmap, &uinput->inode_bitmap);
+ err |= get_user(input.inode_table, &uinput->inode_table);
+ err |= get_user(input.blocks_count, &uinput->blocks_count);
+ err |= get_user(input.reserved_blocks,
+ &uinput->reserved_blocks);
+ if (err)
+ return -EFAULT;
+ old_fs = get_fs();
+ set_fs(KERNEL_DS);
+ err = ext4_ioctl(file, EXT4_IOC_GROUP_ADD,
+ (unsigned long) &input);
+ set_fs(old_fs);
+ return err;
+ }
+ case EXT4_IOC_MOVE_EXT:
break;
default:
return -ENOIOCTLCMD;
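The EXT4_IOC32_GROUP_ADD thunk above copies the argument field by field because the 32-bit and native layouts of the struct differ: 64-bit members are only 4-byte aligned in compat userspace. A small user-space demonstration of the layout mismatch (struct names hypothetical):

#include <stdio.h>
#include <stdint.h>

/* 32-bit userspace layout: u64 fields are 4-byte aligned, no padding. */
struct compat_group_input {
	uint32_t group;
	uint64_t block_bitmap;
} __attribute__((packed, aligned(4)));

/* Native 64-bit layout: u64 fields are 8-byte aligned, so there is
 * padding after 'group' and the two structs are not interchangeable. */
struct group_input {
	uint32_t group;
	uint64_t block_bitmap;
};

int main(void)
{
	struct compat_group_input cin = { .group = 5, .block_bitmap = 1234 };
	struct group_input in;

	/* Field-by-field copy, as the ioctl thunk does with get_user(). */
	in.group = cin.group;
	in.block_bitmap = cin.block_bitmap;

	printf("sizes: compat=%zu native=%zu\n", sizeof cin, sizeof in);
	return 0;
}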
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index b423a364dca3..12b3bc026a68 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -658,6 +658,27 @@ static void ext4_mb_mark_free_simple(struct super_block *sb,
}
}
+/*
+ * Cache the order of the largest free extent we have available in this block
+ * group.
+ */
+static void
+mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp)
+{
+ int i;
+ int bits;
+
+ grp->bb_largest_free_order = -1; /* uninit */
+
+ bits = sb->s_blocksize_bits + 1;
+ for (i = bits; i >= 0; i--) {
+ if (grp->bb_counters[i] > 0) {
+ grp->bb_largest_free_order = i;
+ break;
+ }
+ }
+}
+
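A standalone model of the new cache: walk the per-order free-extent counters from the top and remember the highest non-empty order (names and sizes illustrative):

#include <stdio.h>

#define MAX_ORDER 14   /* e.g. blocksize_bits + 1 for a 4 KiB-block fs */

/* bb_counters[i] = number of free extents of size 2^i blocks */
static int bb_counters[MAX_ORDER + 1];

static int largest_free_order(void)
{
	int i;

	for (i = MAX_ORDER; i >= 0; i--)
		if (bb_counters[i] > 0)
			return i;
	return -1;   /* no free extents tracked */
}

int main(void)
{
	int largest, wanted_order = 5;

	bb_counters[3] = 2;
	bb_counters[7] = 1;

	largest = largest_free_order();   /* 7 */
	/* O(1) group check: can this group satisfy a 2^5-block request? */
	printf("largest=%d, good group: %s\n", largest,
	       largest >= wanted_order ? "yes" : "no");
	return 0;
}

ext4_mb_good_group() below can then reject a group with a single comparison against ac->ac_2order instead of rescanning bb_counters[].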
static noinline_for_stack
void ext4_mb_generate_buddy(struct super_block *sb,
void *buddy, void *bitmap, ext4_group_t group)
@@ -700,6 +721,7 @@ void ext4_mb_generate_buddy(struct super_block *sb,
*/
grp->bb_free = free;
}
+ mb_set_largest_free_order(sb, grp);
clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
@@ -725,6 +747,9 @@ void ext4_mb_generate_buddy(struct super_block *sb,
* contain blocks_per_page (PAGE_CACHE_SIZE / blocksize) blocks.
* So it can have information regarding groups_per_page which
* is blocks_per_page/2
+ *
+ * Locking note: This routine takes the block group lock of all groups
+ * for this page; do not hold this lock when calling this routine!
*/
static int ext4_mb_init_cache(struct page *page, char *incore)
@@ -865,6 +890,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
BUG_ON(incore == NULL);
mb_debug(1, "put buddy for group %u in page %lu/%x\n",
group, page->index, i * blocksize);
+ trace_ext4_mb_buddy_bitmap_load(sb, group);
grinfo = ext4_get_group_info(sb, group);
grinfo->bb_fragments = 0;
memset(grinfo->bb_counters, 0,
@@ -882,6 +908,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
BUG_ON(incore != NULL);
mb_debug(1, "put bitmap for group %u in page %lu/%x\n",
group, page->index, i * blocksize);
+ trace_ext4_mb_bitmap_load(sb, group);
/* see comments in ext4_mb_put_pa() */
ext4_lock_group(sb, group);
@@ -910,6 +937,11 @@ out:
return err;
}
+/*
+ * Locking note: This routine calls ext4_mb_init_cache(), which takes the
+ * block group lock of all groups for this page; do not hold the BG lock when
+ * calling this routine!
+ */
static noinline_for_stack
int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
{
@@ -1004,6 +1036,11 @@ err:
return ret;
}
+/*
+ * Locking note: This routine calls ext4_mb_init_cache(), which takes the
+ * block group lock of all groups for this page; do not hold the BG lock when
+ * calling this routine!
+ */
static noinline_for_stack int
ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
struct ext4_buddy *e4b)
@@ -1150,7 +1187,7 @@ err:
return ret;
}
-static void ext4_mb_release_desc(struct ext4_buddy *e4b)
+static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
{
if (e4b->bd_bitmap_page)
page_cache_release(e4b->bd_bitmap_page);
@@ -1299,6 +1336,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
buddy = buddy2;
} while (1);
}
+ mb_set_largest_free_order(sb, e4b->bd_info);
mb_check_buddy(e4b);
}
@@ -1427,6 +1465,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
e4b->bd_info->bb_counters[ord]++;
e4b->bd_info->bb_counters[ord]++;
}
+ mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);
mb_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0);
mb_check_buddy(e4b);
@@ -1617,7 +1656,7 @@ int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
}
ext4_unlock_group(ac->ac_sb, group);
- ext4_mb_release_desc(e4b);
+ ext4_mb_unload_buddy(e4b);
return 0;
}
@@ -1672,7 +1711,7 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
ext4_mb_use_best_found(ac, e4b);
}
ext4_unlock_group(ac->ac_sb, group);
- ext4_mb_release_desc(e4b);
+ ext4_mb_unload_buddy(e4b);
return 0;
}
@@ -1821,16 +1860,22 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
}
}
+/* This is now called BEFORE we load the buddy bitmap. */
static int ext4_mb_good_group(struct ext4_allocation_context *ac,
ext4_group_t group, int cr)
{
unsigned free, fragments;
- unsigned i, bits;
int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
BUG_ON(cr < 0 || cr >= 4);
- BUG_ON(EXT4_MB_GRP_NEED_INIT(grp));
+
+ /* We only do this if the grp has never been initialized */
+ if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
+ int ret = ext4_mb_init_group(ac->ac_sb, group);
+ if (ret)
+ return 0;
+ }
free = grp->bb_free;
fragments = grp->bb_fragments;
@@ -1843,17 +1888,16 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
case 0:
BUG_ON(ac->ac_2order == 0);
+ if (grp->bb_largest_free_order < ac->ac_2order)
+ return 0;
+
/* Avoid using the first bg of a flexgroup for data files */
if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
(flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
((group % flex_size) == 0))
return 0;
- bits = ac->ac_sb->s_blocksize_bits + 1;
- for (i = ac->ac_2order; i <= bits; i++)
- if (grp->bb_counters[i] > 0)
- return 1;
- break;
+ return 1;
case 1:
if ((free / fragments) >= ac->ac_g_ex.fe_len)
return 1;
@@ -1964,7 +2008,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
sbi = EXT4_SB(sb);
ngroups = ext4_get_groups_count(sb);
/* non-extent files are limited to low blocks/groups */
- if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL))
+ if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)))
ngroups = sbi->s_blockfile_groups;
BUG_ON(ac->ac_status == AC_STATUS_FOUND);
@@ -2024,15 +2068,11 @@ repeat:
group = ac->ac_g_ex.fe_group;
for (i = 0; i < ngroups; group++, i++) {
- struct ext4_group_info *grp;
- struct ext4_group_desc *desc;
-
if (group == ngroups)
group = 0;
- /* quick check to skip empty groups */
- grp = ext4_get_group_info(sb, group);
- if (grp->bb_free == 0)
+ /* This now checks without needing the buddy page */
+ if (!ext4_mb_good_group(ac, group, cr))
continue;
err = ext4_mb_load_buddy(sb, group, &e4b);
@@ -2040,15 +2080,18 @@ repeat:
goto out;
ext4_lock_group(sb, group);
+
+ /*
+ * We need to check again after locking the
+ * block group
+ */
if (!ext4_mb_good_group(ac, group, cr)) {
- /* someone did allocation from this group */
ext4_unlock_group(sb, group);
- ext4_mb_release_desc(&e4b);
+ ext4_mb_unload_buddy(&e4b);
continue;
}
ac->ac_groups_scanned++;
- desc = ext4_get_group_desc(sb, group, NULL);
if (cr == 0)
ext4_mb_simple_scan_group(ac, &e4b);
else if (cr == 1 &&
@@ -2058,7 +2101,7 @@ repeat:
ext4_mb_complex_scan_group(ac, &e4b);
ext4_unlock_group(sb, group);
- ext4_mb_release_desc(&e4b);
+ ext4_mb_unload_buddy(&e4b);
if (ac->ac_status != AC_STATUS_CONTINUE)
break;
@@ -2148,7 +2191,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
ext4_lock_group(sb, group);
memcpy(&sg, ext4_get_group_info(sb, group), i);
ext4_unlock_group(sb, group);
- ext4_mb_release_desc(&e4b);
+ ext4_mb_unload_buddy(&e4b);
seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free,
sg.info.bb_fragments, sg.info.bb_first_free);
@@ -2255,6 +2298,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
init_rwsem(&meta_group_info[i]->alloc_sem);
meta_group_info[i]->bb_free_root = RB_ROOT;
+ meta_group_info[i]->bb_largest_free_order = -1; /* uninit */
#ifdef DOUBLE_CHECK
{
@@ -2536,6 +2580,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
entry->count, entry->group, entry);
if (test_opt(sb, DISCARD)) {
+ int ret;
ext4_fsblk_t discard_block;
discard_block = entry->start_blk +
@@ -2543,7 +2588,12 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
trace_ext4_discard_blocks(sb,
(unsigned long long)discard_block,
entry->count);
- sb_issue_discard(sb, discard_block, entry->count);
+ ret = sb_issue_discard(sb, discard_block, entry->count);
+			if (ret == -EOPNOTSUPP) {
+ ext4_warning(sb,
+ "discard not supported, disabling");
+ clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD);
+ }
}
err = ext4_mb_load_buddy(sb, entry->group, &e4b);
@@ -2568,7 +2618,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
}
ext4_unlock_group(sb, entry->group);
kmem_cache_free(ext4_free_ext_cachep, entry);
- ext4_mb_release_desc(&e4b);
+ ext4_mb_unload_buddy(&e4b);
}
mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
@@ -2641,7 +2691,7 @@ int __init init_ext4_mballoc(void)
void exit_ext4_mballoc(void)
{
- /*
+ /*
* Wait for completion of call_rcu()'s on ext4_pspace_cachep
* before destroying the slab cache.
*/
@@ -2981,7 +3031,7 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) {
atomic_inc(&sbi->s_bal_reqs);
atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated);
- if (ac->ac_o_ex.fe_len >= ac->ac_g_ex.fe_len)
+ if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len)
atomic_inc(&sbi->s_bal_success);
atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned);
if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
@@ -3123,7 +3173,7 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
continue;
/* non-extent files can't have physical blocks past 2^32 */
- if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL) &&
+ if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) &&
pa->pa_pstart + pa->pa_len > EXT4_MAX_BLOCK_FILE_PHYS)
continue;
@@ -3280,7 +3330,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
spin_unlock(&pa->pa_lock);
grp_blk = pa->pa_pstart;
- /*
+ /*
* If doing group-based preallocation, pa_pstart may be in the
* next group when pa is used up
*/
@@ -3697,7 +3747,7 @@ out:
ext4_unlock_group(sb, group);
if (ac)
kmem_cache_free(ext4_ac_cachep, ac);
- ext4_mb_release_desc(&e4b);
+ ext4_mb_unload_buddy(&e4b);
put_bh(bitmap_bh);
return free;
}
@@ -3801,7 +3851,7 @@ repeat:
if (bitmap_bh == NULL) {
ext4_error(sb, "Error reading block bitmap for %u",
group);
- ext4_mb_release_desc(&e4b);
+ ext4_mb_unload_buddy(&e4b);
continue;
}
@@ -3810,7 +3860,7 @@ repeat:
ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac);
ext4_unlock_group(sb, group);
- ext4_mb_release_desc(&e4b);
+ ext4_mb_unload_buddy(&e4b);
put_bh(bitmap_bh);
list_del(&pa->u.pa_tmp_list);
@@ -4074,7 +4124,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
ext4_mb_release_group_pa(&e4b, pa, ac);
ext4_unlock_group(sb, group);
- ext4_mb_release_desc(&e4b);
+ ext4_mb_unload_buddy(&e4b);
list_del(&pa->u.pa_tmp_list);
call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
}
@@ -4484,12 +4534,12 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
if (!bh)
tbh = sb_find_get_block(inode->i_sb,
block + i);
- ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
+ ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
inode, tbh, block + i);
}
}
- /*
+ /*
* We need to make sure we don't reuse the freed block until
* after the transaction is committed, which we can do by
* treating the block as metadata, below. We make an
@@ -4610,7 +4660,7 @@ do_more:
atomic_add(count, &sbi->s_flex_groups[flex_group].free_blocks);
}
- ext4_mb_release_desc(&e4b);
+ ext4_mb_unload_buddy(&e4b);
freed += count;
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 34dcfc52ef44..6f3a27ec30bf 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -475,7 +475,7 @@ int ext4_ext_migrate(struct inode *inode)
*/
if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb,
EXT4_FEATURE_INCOMPAT_EXTENTS) ||
- (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+ (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
return -EINVAL;
if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0)
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index d1fc662cc311..52abfa12762a 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -482,6 +482,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
int depth = ext_depth(orig_inode);
int ret;
+ start_ext.ee_block = end_ext.ee_block = 0;
o_start = o_end = oext = orig_path[depth].p_ext;
oext_alen = ext4_ext_get_actual_len(oext);
start_ext.ee_len = end_ext.ee_len = 0;
@@ -529,7 +530,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
* new_ext |-------|
*/
if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) {
- ext4_error(orig_inode->i_sb,
+ EXT4_ERROR_INODE(orig_inode,
"new_ext_end(%u) should be less than or equal to "
"oext->ee_block(%u) + oext_alen(%d) - 1",
new_ext_end, le32_to_cpu(oext->ee_block),
@@ -692,12 +693,12 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
while (1) {
/* The extent for donor must be found. */
if (!dext) {
- ext4_error(donor_inode->i_sb,
+ EXT4_ERROR_INODE(donor_inode,
"The extent for donor must be found");
*err = -EIO;
goto out;
} else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) {
- ext4_error(donor_inode->i_sb,
+ EXT4_ERROR_INODE(donor_inode,
"Donor offset(%u) and the first block of donor "
"extent(%u) should be equal",
donor_off,
@@ -959,6 +960,9 @@ mext_check_arguments(struct inode *orig_inode,
return -EINVAL;
}
+ if (IS_IMMUTABLE(donor_inode) || IS_APPEND(donor_inode))
+ return -EPERM;
+
/* Ext4 move extent does not support swapfile */
if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) {
ext4_debug("ext4 move extent: The argument files should "
@@ -976,11 +980,11 @@ mext_check_arguments(struct inode *orig_inode,
}
/* Ext4 move extent supports only extent based file */
- if (!(EXT4_I(orig_inode)->i_flags & EXT4_EXTENTS_FL)) {
+ if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS))) {
ext4_debug("ext4 move extent: orig file is not extents "
"based file [ino:orig %lu]\n", orig_inode->i_ino);
return -EOPNOTSUPP;
- } else if (!(EXT4_I(donor_inode)->i_flags & EXT4_EXTENTS_FL)) {
+ } else if (!(ext4_test_inode_flag(donor_inode, EXT4_INODE_EXTENTS))) {
ext4_debug("ext4 move extent: donor file is not extents "
"based file [ino:donor %lu]\n", donor_inode->i_ino);
return -EOPNOTSUPP;
@@ -1354,7 +1358,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
if (ret1 < 0)
break;
if (*moved_len > len) {
- ext4_error(orig_inode->i_sb,
+ EXT4_ERROR_INODE(orig_inode,
"We replaced blocks too much! "
"sum of replaced: %llu requested: %llu",
*moved_len, len);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 0c070fabd108..a43e6617b351 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -187,7 +187,7 @@ unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize)
return blocksize;
return (len & 65532) | ((len & 3) << 16);
}
-
+
__le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
{
if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3))
@@ -197,7 +197,7 @@ __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
if (len == blocksize) {
if (blocksize == 65536)
return cpu_to_le16(EXT4_MAX_REC_LEN);
- else
+ else
return cpu_to_le16(0);
}
return cpu_to_le16((len & 65532) | ((len >> 16) & 3));
@@ -349,7 +349,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
brelse(bh);
}
if (bcount)
- printk(KERN_DEBUG "%snames %u, fullness %u (%u%%)\n",
+ printk(KERN_DEBUG "%snames %u, fullness %u (%u%%)\n",
levels ? "" : " ", names, space/bcount,
(space/bcount)*100/blocksize);
return (struct stats) { names, space, bcount};
@@ -653,10 +653,10 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
int ret, err;
__u32 hashval;
- dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n",
+ dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n",
start_hash, start_minor_hash));
dir = dir_file->f_path.dentry->d_inode;
- if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) {
+ if (!(ext4_test_inode_flag(dir, EXT4_INODE_INDEX))) {
hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
if (hinfo.hash_version <= DX_HASH_TEA)
hinfo.hash_version +=
@@ -801,7 +801,7 @@ static void ext4_update_dx_flag(struct inode *inode)
{
if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
EXT4_FEATURE_COMPAT_DIR_INDEX))
- EXT4_I(inode)->i_flags &= ~EXT4_INDEX_FL;
+ ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
}
/*
@@ -943,8 +943,8 @@ restart:
wait_on_buffer(bh);
if (!buffer_uptodate(bh)) {
/* read error, skip block & hope for the best */
- ext4_error(sb, "reading directory #%lu offset %lu",
- dir->i_ino, (unsigned long)block);
+ EXT4_ERROR_INODE(dir, "reading directory lblock %lu",
+ (unsigned long) block);
brelse(bh);
goto next;
}
@@ -1066,15 +1066,15 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
__u32 ino = le32_to_cpu(de->inode);
brelse(bh);
if (!ext4_valid_inum(dir->i_sb, ino)) {
- ext4_error(dir->i_sb, "bad inode number: %u", ino);
+ EXT4_ERROR_INODE(dir, "bad inode number: %u", ino);
return ERR_PTR(-EIO);
}
inode = ext4_iget(dir->i_sb, ino);
if (unlikely(IS_ERR(inode))) {
if (PTR_ERR(inode) == -ESTALE) {
- ext4_error(dir->i_sb,
- "deleted inode referenced: %u",
- ino);
+ EXT4_ERROR_INODE(dir,
+ "deleted inode referenced: %u",
+ ino);
return ERR_PTR(-EIO);
} else {
return ERR_CAST(inode);
@@ -1104,8 +1104,8 @@ struct dentry *ext4_get_parent(struct dentry *child)
brelse(bh);
if (!ext4_valid_inum(child->d_inode->i_sb, ino)) {
- ext4_error(child->d_inode->i_sb,
- "bad inode number: %u", ino);
+ EXT4_ERROR_INODE(child->d_inode,
+ "bad parent inode number: %u", ino);
return ERR_PTR(-EIO);
}
@@ -1141,7 +1141,7 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count,
unsigned rec_len = 0;
while (count--) {
- struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *)
+ struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *)
(from + (map->offs<<2));
rec_len = EXT4_DIR_REC_LEN(de->name_len);
memcpy (to, de, rec_len);
@@ -1404,9 +1404,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
de = (struct ext4_dir_entry_2 *)((char *)fde +
ext4_rec_len_from_disk(fde->rec_len, blocksize));
if ((char *) de >= (((char *) root) + blocksize)) {
- ext4_error(dir->i_sb,
- "invalid rec_len for '..' in inode %lu",
- dir->i_ino);
+ EXT4_ERROR_INODE(dir, "invalid rec_len for '..'");
brelse(bh);
return -EIO;
}
@@ -1418,7 +1416,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
brelse(bh);
return retval;
}
- EXT4_I(dir)->i_flags |= EXT4_INDEX_FL;
+ ext4_set_inode_flag(dir, EXT4_INODE_INDEX);
data1 = bh2->b_data;
memcpy (data1, de, len);
@@ -1491,7 +1489,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
retval = ext4_dx_add_entry(handle, dentry, inode);
if (!retval || (retval != ERR_BAD_DX_DIR))
return retval;
- EXT4_I(dir)->i_flags &= ~EXT4_INDEX_FL;
+ ext4_clear_inode_flag(dir, EXT4_INODE_INDEX);
dx_fallback++;
ext4_mark_inode_dirty(handle, dir);
}
@@ -1519,6 +1517,8 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize);
retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
brelse(bh);
+ if (retval == 0)
+ ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY);
return retval;
}
@@ -1915,9 +1915,8 @@ static int empty_dir(struct inode *inode)
if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) ||
!(bh = ext4_bread(NULL, inode, 0, 0, &err))) {
if (err)
- ext4_error(inode->i_sb,
- "error %d reading directory #%lu offset 0",
- err, inode->i_ino);
+ EXT4_ERROR_INODE(inode,
+ "error %d reading directory lblock 0", err);
else
ext4_warning(inode->i_sb,
"bad directory (dir #%lu) - no data block",
@@ -1941,17 +1940,17 @@ static int empty_dir(struct inode *inode)
de = ext4_next_entry(de1, sb->s_blocksize);
while (offset < inode->i_size) {
if (!bh ||
- (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
+ (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
+ unsigned int lblock;
err = 0;
brelse(bh);
- bh = ext4_bread(NULL, inode,
- offset >> EXT4_BLOCK_SIZE_BITS(sb), 0, &err);
+ lblock = offset >> EXT4_BLOCK_SIZE_BITS(sb);
+ bh = ext4_bread(NULL, inode, lblock, 0, &err);
if (!bh) {
if (err)
- ext4_error(sb,
- "error %d reading directory"
- " #%lu offset %u",
- err, inode->i_ino, offset);
+ EXT4_ERROR_INODE(inode,
+ "error %d reading directory "
+ "lblock %u", err, lblock);
offset += sb->s_blocksize;
continue;
}
@@ -2297,7 +2296,7 @@ retry:
}
} else {
/* clear the extent format for fast symlink */
- EXT4_I(inode)->i_flags &= ~EXT4_EXTENTS_FL;
+ ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
inode->i_op = &ext4_fast_symlink_inode_operations;
memcpy((char *)&EXT4_I(inode)->i_data, symname, l);
inode->i_size = l-1;
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 5692c48754a0..6df797eb9aeb 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -911,7 +911,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
percpu_counter_add(&sbi->s_freeinodes_counter,
EXT4_INODES_PER_GROUP(sb));
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
+ sbi->s_log_groups_per_flex) {
ext4_group_t flex_group;
flex_group = ext4_flex_group(sbi, input->group);
atomic_add(input->free_blocks_count,
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index e14d22c170d5..4e8983a9811b 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -241,6 +241,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
if (sb->s_flags & MS_RDONLY)
return ERR_PTR(-EROFS);
+ vfs_check_frozen(sb, SB_FREEZE_WRITE);
/* Special case here: if the journal has aborted behind our
* backs (eg. EIO in the commit thread), then we still need to
* take the FS itself readonly cleanly. */
@@ -645,6 +646,8 @@ static void ext4_put_super(struct super_block *sb)
struct ext4_super_block *es = sbi->s_es;
int i, err;
+ dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
+
flush_workqueue(sbi->dio_unwritten_wq);
destroy_workqueue(sbi->dio_unwritten_wq);
@@ -941,6 +944,8 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0");
if (test_opt(sb, JOURNAL_ASYNC_COMMIT))
seq_puts(seq, ",journal_async_commit");
+ else if (test_opt(sb, JOURNAL_CHECKSUM))
+ seq_puts(seq, ",journal_checksum");
if (test_opt(sb, NOBH))
seq_puts(seq, ",nobh");
if (test_opt(sb, I_VERSION))
@@ -1059,7 +1064,7 @@ static int ext4_release_dquot(struct dquot *dquot);
static int ext4_mark_dquot_dirty(struct dquot *dquot);
static int ext4_write_info(struct super_block *sb, int type);
static int ext4_quota_on(struct super_block *sb, int type, int format_id,
- char *path, int remount);
+ char *path);
static int ext4_quota_on_mount(struct super_block *sb, int type);
static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
size_t len, loff_t off);
@@ -1081,12 +1086,12 @@ static const struct dquot_operations ext4_quota_operations = {
static const struct quotactl_ops ext4_qctl_operations = {
.quota_on = ext4_quota_on,
- .quota_off = vfs_quota_off,
- .quota_sync = vfs_quota_sync,
- .get_info = vfs_get_dqinfo,
- .set_info = vfs_set_dqinfo,
- .get_dqblk = vfs_get_dqblk,
- .set_dqblk = vfs_set_dqblk
+ .quota_off = dquot_quota_off,
+ .quota_sync = dquot_quota_sync,
+ .get_info = dquot_get_dqinfo,
+ .set_info = dquot_set_dqinfo,
+ .get_dqblk = dquot_get_dqblk,
+ .set_dqblk = dquot_set_dqblk
};
#endif
@@ -2051,7 +2056,7 @@ static void ext4_orphan_cleanup(struct super_block *sb,
/* Turn quotas off */
for (i = 0; i < MAXQUOTAS; i++) {
if (sb_dqopt(sb)->files[i])
- vfs_quota_off(sb, i, 0);
+ dquot_quota_off(sb, i);
}
#endif
sb->s_flags = s_flags; /* Restore MS_RDONLY status */
@@ -2213,7 +2218,7 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
struct ext4_attr {
struct attribute attr;
ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *);
- ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *,
+ ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *,
const char *, size_t);
int offset;
};
@@ -2430,6 +2435,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
__releases(kernel_lock)
__acquires(kernel_lock)
{
+ char *orig_data = kstrdup(data, GFP_KERNEL);
struct buffer_head *bh;
struct ext4_super_block *es = NULL;
struct ext4_sb_info *sbi;
@@ -2793,24 +2799,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
get_random_bytes(&sbi->s_next_generation, sizeof(u32));
spin_lock_init(&sbi->s_next_gen_lock);
- err = percpu_counter_init(&sbi->s_freeblocks_counter,
- ext4_count_free_blocks(sb));
- if (!err) {
- err = percpu_counter_init(&sbi->s_freeinodes_counter,
- ext4_count_free_inodes(sb));
- }
- if (!err) {
- err = percpu_counter_init(&sbi->s_dirs_counter,
- ext4_count_dirs(sb));
- }
- if (!err) {
- err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
- }
- if (err) {
- ext4_msg(sb, KERN_ERR, "insufficient memory");
- goto failed_mount3;
- }
-
sbi->s_stripe = ext4_get_stripe_size(sbi);
sbi->s_max_writeback_mb_bump = 128;
@@ -2910,6 +2898,20 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
no_journal:
+ err = percpu_counter_init(&sbi->s_freeblocks_counter,
+ ext4_count_free_blocks(sb));
+ if (!err)
+ err = percpu_counter_init(&sbi->s_freeinodes_counter,
+ ext4_count_free_inodes(sb));
+ if (!err)
+ err = percpu_counter_init(&sbi->s_dirs_counter,
+ ext4_count_dirs(sb));
+ if (!err)
+ err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
+ if (err) {
+ ext4_msg(sb, KERN_ERR, "insufficient memory");
+ goto failed_mount_wq;
+ }
if (test_opt(sb, NOBH)) {
if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) {
ext4_msg(sb, KERN_WARNING, "Ignoring nobh option - "
@@ -3001,7 +3003,7 @@ no_journal:
err = ext4_setup_system_zone(sb);
if (err) {
ext4_msg(sb, KERN_ERR, "failed to initialize system "
- "zone (%d)\n", err);
+ "zone (%d)", err);
goto failed_mount4;
}
@@ -3040,9 +3042,11 @@ no_journal:
} else
descr = "out journal";
- ext4_msg(sb, KERN_INFO, "mounted filesystem with%s", descr);
+ ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. "
+ "Opts: %s", descr, orig_data);
lock_kernel();
+ kfree(orig_data);
return 0;
cantfind_ext4:
@@ -3059,6 +3063,10 @@ failed_mount_wq:
jbd2_journal_destroy(sbi->s_journal);
sbi->s_journal = NULL;
}
+ percpu_counter_destroy(&sbi->s_freeblocks_counter);
+ percpu_counter_destroy(&sbi->s_freeinodes_counter);
+ percpu_counter_destroy(&sbi->s_dirs_counter);
+ percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
failed_mount3:
if (sbi->s_flex_groups) {
if (is_vmalloc_addr(sbi->s_flex_groups))
@@ -3066,10 +3074,6 @@ failed_mount3:
else
kfree(sbi->s_flex_groups);
}
- percpu_counter_destroy(&sbi->s_freeblocks_counter);
- percpu_counter_destroy(&sbi->s_freeinodes_counter);
- percpu_counter_destroy(&sbi->s_dirs_counter);
- percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
failed_mount2:
for (i = 0; i < db_count; i++)
brelse(sbi->s_group_desc[i]);
@@ -3089,6 +3093,7 @@ out_fail:
kfree(sbi->s_blockgroup_lock);
kfree(sbi);
lock_kernel();
+ kfree(orig_data);
return ret;
}
@@ -3380,7 +3385,7 @@ static int ext4_commit_super(struct super_block *sb, int sync)
if (!(sb->s_flags & MS_RDONLY))
es->s_wtime = cpu_to_le32(get_seconds());
es->s_kbytes_written =
- cpu_to_le64(EXT4_SB(sb)->s_kbytes_written +
+ cpu_to_le64(EXT4_SB(sb)->s_kbytes_written +
((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
EXT4_SB(sb)->s_sectors_written_start) >> 1));
ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
@@ -3485,8 +3490,10 @@ int ext4_force_commit(struct super_block *sb)
return 0;
journal = EXT4_SB(sb)->s_journal;
- if (journal)
+ if (journal) {
+ vfs_check_frozen(sb, SB_FREEZE_WRITE);
ret = ext4_journal_force_commit(journal);
+ }
return ret;
}
@@ -3535,18 +3542,16 @@ static int ext4_freeze(struct super_block *sb)
* the journal.
*/
error = jbd2_journal_flush(journal);
- if (error < 0) {
- out:
- jbd2_journal_unlock_updates(journal);
- return error;
- }
+ if (error < 0)
+ goto out;
/* Journal blocked and flushed, clear needs_recovery flag. */
EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
error = ext4_commit_super(sb, 1);
- if (error)
- goto out;
- return 0;
+out:
+ /* we rely on s_frozen to stop further updates */
+ jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+ return error;
}
/*
@@ -3563,7 +3568,6 @@ static int ext4_unfreeze(struct super_block *sb)
EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
ext4_commit_super(sb, 1);
unlock_super(sb);
- jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
return 0;
}
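A note on the freeze rework above: ext4_freeze() now drops the jbd2 transaction lock before returning and relies on the VFS s_frozen state to fence writers; the vfs_check_frozen() calls added to ext4_journal_start_sb() and ext4_force_commit() are the other half of that protocol. A minimal sketch of such a fenced journal-start path (demo_journal_start() and DEMO_SB() are illustrative names, not ext4 code):

	#include <linux/err.h>
	#include <linux/fs.h>
	#include <linux/jbd2.h>

	handle_t *demo_journal_start(struct super_block *sb, int nblocks)
	{
		if (sb->s_flags & MS_RDONLY)
			return ERR_PTR(-EROFS);
		/* sleep here until the fs is thawed, rather than
		 * blocking inside jbd2 with updates still locked */
		vfs_check_frozen(sb, SB_FREEZE_WRITE);
		return jbd2_journal_start(DEMO_SB(sb)->s_journal, nblocks);
	}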
@@ -3574,12 +3578,14 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
ext4_fsblk_t n_blocks_count = 0;
unsigned long old_sb_flags;
struct ext4_mount_options old_opts;
+ int enable_quota = 0;
ext4_group_t g;
unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
int err;
#ifdef CONFIG_QUOTA
int i;
#endif
+ char *orig_data = kstrdup(data, GFP_KERNEL);
lock_kernel();
@@ -3630,6 +3636,10 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
}
if (*flags & MS_RDONLY) {
+ err = dquot_suspend(sb, -1);
+ if (err < 0)
+ goto restore_opts;
+
/*
* First of all, the unconditional stuff we have to do
* to disable replay of the journal when we next remount
@@ -3698,6 +3708,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
goto restore_opts;
if (!ext4_setup_super(sb, es, 0))
sb->s_flags &= ~MS_RDONLY;
+ enable_quota = 1;
}
}
ext4_setup_system_zone(sb);
@@ -3713,6 +3724,11 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
#endif
unlock_super(sb);
unlock_kernel();
+ if (enable_quota)
+ dquot_resume(sb, -1);
+
+ ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s", orig_data);
+ kfree(orig_data);
return 0;
restore_opts:
@@ -3734,6 +3750,7 @@ restore_opts:
#endif
unlock_super(sb);
unlock_kernel();
+ kfree(orig_data);
return err;
}
@@ -3906,24 +3923,21 @@ static int ext4_write_info(struct super_block *sb, int type)
*/
static int ext4_quota_on_mount(struct super_block *sb, int type)
{
- return vfs_quota_on_mount(sb, EXT4_SB(sb)->s_qf_names[type],
- EXT4_SB(sb)->s_jquota_fmt, type);
+ return dquot_quota_on_mount(sb, EXT4_SB(sb)->s_qf_names[type],
+ EXT4_SB(sb)->s_jquota_fmt, type);
}
/*
* Standard function to be called on quota_on
*/
static int ext4_quota_on(struct super_block *sb, int type, int format_id,
- char *name, int remount)
+ char *name)
{
int err;
struct path path;
if (!test_opt(sb, QUOTA))
return -EINVAL;
- /* When remounting, no checks are needed and in fact, name is NULL */
- if (remount)
- return vfs_quota_on(sb, type, format_id, name, remount);
err = kern_path(name, LOOKUP_FOLLOW, &path);
if (err)
@@ -3962,7 +3976,7 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
}
}
- err = vfs_quota_on_path(sb, type, format_id, &path);
+ err = dquot_quota_on_path(sb, type, format_id, &path);
path_put(&path);
return err;
}
@@ -4141,6 +4155,7 @@ static int __init init_ext4_fs(void)
{
int err;
+ ext4_check_flag_values();
err = init_ext4_system_zone();
if (err)
return err;
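The fill_super hunks above move the percpu counter setup to after journal recovery (the no_journal label), so the free block and inode counts are sampled from the recovered on-disk state, and the error unwind moves with them (failed_mount_wq now destroys the counters). A minimal sketch of the chained init-or-roll-back pattern, with illustrative demo_* names; the containing struct is assumed zero-initialized (kzalloc), so destroying a counter whose init never succeeded is a safe no-op:

	#include <linux/percpu_counter.h>

	struct demo_counters {
		struct percpu_counter free_blocks;
		struct percpu_counter free_inodes;
	};

	static int demo_counters_init(struct demo_counters *c,
				      s64 blocks, s64 inodes)
	{
		int err;

		err = percpu_counter_init(&c->free_blocks, blocks);
		if (!err)
			err = percpu_counter_init(&c->free_inodes, inodes);
		if (err) {
			/* percpu_counter_destroy() checks for a NULL
			 * percpu pointer, so this is safe even for the
			 * counter whose init failed */
			percpu_counter_destroy(&c->free_blocks);
			percpu_counter_destroy(&c->free_inodes);
		}
		return err;
	}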
diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
index 00740cb32be3..ed9354aff279 100644
--- a/fs/ext4/symlink.c
+++ b/fs/ext4/symlink.c
@@ -34,6 +34,7 @@ const struct inode_operations ext4_symlink_inode_operations = {
.readlink = generic_readlink,
.follow_link = page_follow_link_light,
.put_link = page_put_link,
+ .setattr = ext4_setattr,
#ifdef CONFIG_EXT4_FS_XATTR
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
@@ -45,6 +46,7 @@ const struct inode_operations ext4_symlink_inode_operations = {
const struct inode_operations ext4_fast_symlink_inode_operations = {
.readlink = generic_readlink,
.follow_link = ext4_follow_link,
+ .setattr = ext4_setattr,
#ifdef CONFIG_EXT4_FS_XATTR
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 2de0e9515089..04338009793a 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -228,9 +228,8 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
if (ext4_xattr_check_block(bh)) {
bad_block:
- ext4_error(inode->i_sb,
- "inode %lu: bad block %llu", inode->i_ino,
- EXT4_I(inode)->i_file_acl);
+ EXT4_ERROR_INODE(inode, "bad block %llu",
+ EXT4_I(inode)->i_file_acl);
error = -EIO;
goto cleanup;
}
@@ -372,9 +371,8 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
ea_bdebug(bh, "b_count=%d, refcount=%d",
atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
if (ext4_xattr_check_block(bh)) {
- ext4_error(inode->i_sb,
- "inode %lu: bad block %llu", inode->i_ino,
- EXT4_I(inode)->i_file_acl);
+ EXT4_ERROR_INODE(inode, "bad block %llu",
+ EXT4_I(inode)->i_file_acl);
error = -EIO;
goto cleanup;
}
@@ -666,8 +664,8 @@ ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i,
atomic_read(&(bs->bh->b_count)),
le32_to_cpu(BHDR(bs->bh)->h_refcount));
if (ext4_xattr_check_block(bs->bh)) {
- ext4_error(sb, "inode %lu: bad block %llu",
- inode->i_ino, EXT4_I(inode)->i_file_acl);
+ EXT4_ERROR_INODE(inode, "bad block %llu",
+ EXT4_I(inode)->i_file_acl);
error = -EIO;
goto cleanup;
}
@@ -820,7 +818,7 @@ inserted:
EXT4_I(inode)->i_block_group);
/* non-extent files can't have physical blocks past 2^32 */
- if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
block = ext4_new_meta_blocks(handle, inode,
@@ -828,7 +826,7 @@ inserted:
if (error)
goto cleanup;
- if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS);
ea_idebug(inode, "creating block %d", block);
@@ -880,8 +878,8 @@ cleanup_dquot:
goto cleanup;
bad_block:
- ext4_error(inode->i_sb, "inode %lu: bad block %llu",
- inode->i_ino, EXT4_I(inode)->i_file_acl);
+ EXT4_ERROR_INODE(inode, "bad block %llu",
+ EXT4_I(inode)->i_file_acl);
goto cleanup;
#undef header
@@ -1194,8 +1192,8 @@ retry:
if (!bh)
goto cleanup;
if (ext4_xattr_check_block(bh)) {
- ext4_error(inode->i_sb, "inode %lu: bad block %llu",
- inode->i_ino, EXT4_I(inode)->i_file_acl);
+ EXT4_ERROR_INODE(inode, "bad block %llu",
+ EXT4_I(inode)->i_file_acl);
error = -EIO;
goto cleanup;
}
@@ -1372,14 +1370,14 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode)
goto cleanup;
bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
if (!bh) {
- ext4_error(inode->i_sb, "inode %lu: block %llu read error",
- inode->i_ino, EXT4_I(inode)->i_file_acl);
+ EXT4_ERROR_INODE(inode, "block %llu read error",
+ EXT4_I(inode)->i_file_acl);
goto cleanup;
}
if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
BHDR(bh)->h_blocks != cpu_to_le32(1)) {
- ext4_error(inode->i_sb, "inode %lu: bad block %llu",
- inode->i_ino, EXT4_I(inode)->i_file_acl);
+ EXT4_ERROR_INODE(inode, "bad block %llu",
+ EXT4_I(inode)->i_file_acl);
goto cleanup;
}
ext4_xattr_release_block(handle, inode, bh);
@@ -1504,9 +1502,8 @@ again:
}
bh = sb_bread(inode->i_sb, ce->e_block);
if (!bh) {
- ext4_error(inode->i_sb,
- "inode %lu: block %lu read error",
- inode->i_ino, (unsigned long) ce->e_block);
+ EXT4_ERROR_INODE(inode, "block %lu read error",
+ (unsigned long) ce->e_block);
} else if (le32_to_cpu(BHDR(bh)->h_refcount) >=
EXT4_XATTR_REFCOUNT_MAX) {
ea_idebug(inode, "block %lu refcount %d>=%d",
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index 53dba57b49a1..27ac25725954 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -306,11 +306,11 @@ extern long fat_generic_ioctl(struct file *filp, unsigned int cmd,
extern const struct file_operations fat_file_operations;
extern const struct inode_operations fat_file_inode_operations;
extern int fat_setattr(struct dentry * dentry, struct iattr * attr);
-extern void fat_truncate(struct inode *inode);
+extern int fat_setsize(struct inode *inode, loff_t offset);
+extern void fat_truncate_blocks(struct inode *inode, loff_t offset);
extern int fat_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat);
-extern int fat_file_fsync(struct file *file, struct dentry *dentry,
- int datasync);
+extern int fat_file_fsync(struct file *file, int datasync);
/* fat/inode.c */
extern void fat_attach(struct inode *inode, loff_t i_pos);
diff --git a/fs/fat/file.c b/fs/fat/file.c
index a14c2f6a489e..990dfae022e5 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -149,12 +149,12 @@ static int fat_file_release(struct inode *inode, struct file *filp)
return 0;
}
-int fat_file_fsync(struct file *filp, struct dentry *dentry, int datasync)
+int fat_file_fsync(struct file *filp, int datasync)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = filp->f_mapping->host;
int res, err;
- res = simple_fsync(filp, dentry, datasync);
+ res = generic_file_fsync(filp, datasync);
err = sync_mapping_buffers(MSDOS_SB(inode->i_sb)->fat_inode->i_mapping);
return res ? res : err;
@@ -283,7 +283,7 @@ static int fat_free(struct inode *inode, int skip)
return fat_free_clusters(inode, free_start);
}
-void fat_truncate(struct inode *inode)
+void fat_truncate_blocks(struct inode *inode, loff_t offset)
{
struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
const unsigned int cluster_size = sbi->cluster_size;
@@ -293,10 +293,10 @@ void fat_truncate(struct inode *inode)
* This protects against truncating a file bigger than it was then
* trying to write into the hole.
*/
- if (MSDOS_I(inode)->mmu_private > inode->i_size)
- MSDOS_I(inode)->mmu_private = inode->i_size;
+ if (MSDOS_I(inode)->mmu_private > offset)
+ MSDOS_I(inode)->mmu_private = offset;
- nr_clusters = (inode->i_size + (cluster_size - 1)) >> sbi->cluster_bits;
+ nr_clusters = (offset + (cluster_size - 1)) >> sbi->cluster_bits;
fat_free(inode, nr_clusters);
fat_flush_inodes(inode->i_sb, inode, NULL);
@@ -364,6 +364,18 @@ static int fat_allow_set_time(struct msdos_sb_info *sbi, struct inode *inode)
return 0;
}
+int fat_setsize(struct inode *inode, loff_t offset)
+{
+ int error;
+
+ error = simple_setsize(inode, offset);
+ if (error)
+ return error;
+ fat_truncate_blocks(inode, offset);
+
+ return error;
+}
+
#define TIMES_SET_FLAGS (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)
/* valid file mode bits */
#define FAT_VALID_MODE (S_IFREG | S_IFDIR | S_IRWXUGO)
@@ -378,7 +390,8 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
/*
	 * Expand the file. inode_setattr() updates ->i_size
	 * before calling ->truncate(), but FAT needs to fill the
- * hole before it.
+ * hole before it. XXX: this is no longer true with new truncate
+ * sequence.
*/
if (attr->ia_valid & ATTR_SIZE) {
if (attr->ia_size > inode->i_size) {
@@ -427,15 +440,20 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
attr->ia_valid &= ~ATTR_MODE;
}
- if (attr->ia_valid)
- error = inode_setattr(inode, attr);
+ if (attr->ia_valid & ATTR_SIZE) {
+ error = fat_setsize(inode, attr->ia_size);
+ if (error)
+ goto out;
+ }
+
+ generic_setattr(inode, attr);
+ mark_inode_dirty(inode);
out:
return error;
}
EXPORT_SYMBOL_GPL(fat_setattr);
const struct inode_operations fat_file_inode_operations = {
- .truncate = fat_truncate,
.setattr = fat_setattr,
.getattr = fat_getattr,
};
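With ->truncate gone from fat_file_inode_operations, the size change happens entirely inside ->setattr: simple_setsize() adjusts i_size and the pagecache, then the filesystem frees its blocks before the remaining attributes are copied in. The same sequence, condensed, with hypothetical demo_* names (demo_truncate_blocks() stands in for a filesystem's block-freeing helper):

	#include <linux/fs.h>

	static int demo_setattr(struct dentry *dentry, struct iattr *attr)
	{
		struct inode *inode = dentry->d_inode;
		int error = inode_change_ok(inode, attr);

		if (error)
			return error;

		if (attr->ia_valid & ATTR_SIZE) {
			/* i_size and pagecache first ... */
			error = simple_setsize(inode, attr->ia_size);
			if (error)
				return error;
			/* ... then the on-disk blocks */
			demo_truncate_blocks(inode, attr->ia_size);
		}

		generic_setattr(inode, attr);	/* mode, uid/gid, times */
		mark_inode_dirty(inode);
		return 0;
	}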
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index ed33904926ee..7bf45aee56d7 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -142,14 +142,29 @@ static int fat_readpages(struct file *file, struct address_space *mapping,
return mpage_readpages(mapping, pages, nr_pages, fat_get_block);
}
+static void fat_write_failed(struct address_space *mapping, loff_t to)
+{
+ struct inode *inode = mapping->host;
+
+ if (to > inode->i_size) {
+ truncate_pagecache(inode, to, inode->i_size);
+ fat_truncate_blocks(inode, inode->i_size);
+ }
+}
+
static int fat_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
{
+ int err;
+
*pagep = NULL;
- return cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
- fat_get_block,
+ err = cont_write_begin_newtrunc(file, mapping, pos, len, flags,
+ pagep, fsdata, fat_get_block,
&MSDOS_I(mapping->host)->mmu_private);
+ if (err < 0)
+ fat_write_failed(mapping, pos + len);
+ return err;
}
static int fat_write_end(struct file *file, struct address_space *mapping,
@@ -159,6 +174,8 @@ static int fat_write_end(struct file *file, struct address_space *mapping,
struct inode *inode = mapping->host;
int err;
err = generic_write_end(file, mapping, pos, len, copied, pagep, fsdata);
+ if (err < len)
+ fat_write_failed(mapping, pos + len);
if (!(err < 0) && !(MSDOS_I(inode)->i_attrs & ATTR_ARCH)) {
inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
MSDOS_I(inode)->i_attrs |= ATTR_ARCH;
@@ -172,7 +189,9 @@ static ssize_t fat_direct_IO(int rw, struct kiocb *iocb,
loff_t offset, unsigned long nr_segs)
{
struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_mapping->host;
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
+ ssize_t ret;
if (rw == WRITE) {
/*
@@ -193,8 +212,12 @@ static ssize_t fat_direct_IO(int rw, struct kiocb *iocb,
* FAT need to use the DIO_LOCKING for avoiding the race
* condition of fat_get_block() and ->truncate().
*/
- return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
- offset, nr_segs, fat_get_block, NULL);
+ ret = blockdev_direct_IO_newtrunc(rw, iocb, inode, inode->i_sb->s_bdev,
+ iov, offset, nr_segs, fat_get_block, NULL);
+ if (ret < 0 && (rw & WRITE))
+ fat_write_failed(mapping, offset + iov_length(iov, nr_segs));
+
+ return ret;
}
static sector_t _fat_bmap(struct address_space *mapping, sector_t block)
@@ -429,7 +452,7 @@ static void fat_delete_inode(struct inode *inode)
{
truncate_inode_pages(&inode->i_data, 0);
inode->i_size = 0;
- fat_truncate(inode);
+ fat_truncate_blocks(inode, 0);
clear_inode(inode);
}
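The *_newtrunc write helpers used above leave cleanup of a failed write to the filesystem: if the attempt instantiated pages or blocks past i_size, the caller must trim them back itself. The undo step, restated without the diff noise and with hypothetical demo_* names:

	#include <linux/fs.h>
	#include <linux/mm.h>

	static void demo_write_failed(struct address_space *mapping, loff_t to)
	{
		struct inode *inode = mapping->host;

		if (to > inode->i_size) {
			/* drop pages between the current EOF and the
			 * end of the failed write ... */
			truncate_pagecache(inode, to, inode->i_size);
			/* ... and free the blocks allocated for them */
			demo_truncate_blocks(inode, inode->i_size);
		}
	}

Both the buffered path (cont_write_begin_newtrunc()) and the direct-IO path (blockdev_direct_IO_newtrunc()) funnel their error cases through this one helper.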
diff --git a/fs/fcntl.c b/fs/fcntl.c
index f74d270ba155..9d175d623aab 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -274,7 +274,7 @@ static int f_setown_ex(struct file *filp, unsigned long arg)
ret = copy_from_user(&owner, owner_p, sizeof(owner));
if (ret)
- return ret;
+ return -EFAULT;
switch (owner.type) {
case F_OWNER_TID:
@@ -332,8 +332,11 @@ static int f_getown_ex(struct file *filp, unsigned long arg)
}
read_unlock(&filp->f_owner.lock);
- if (!ret)
+ if (!ret) {
ret = copy_to_user(owner_p, &owner, sizeof(owner));
+ if (ret)
+ ret = -EFAULT;
+ }
return ret;
}
@@ -730,12 +733,14 @@ static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band)
{
while (fa) {
struct fown_struct *fown;
+ unsigned long flags;
+
if (fa->magic != FASYNC_MAGIC) {
printk(KERN_ERR "kill_fasync: bad magic number in "
"fasync_struct!\n");
return;
}
- spin_lock(&fa->fa_lock);
+ spin_lock_irqsave(&fa->fa_lock, flags);
if (fa->fa_file) {
fown = &fa->fa_file->f_owner;
/* Don't send SIGURG to processes which have not set a
@@ -744,7 +749,7 @@ static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band)
if (!(sig == SIGURG && fown->signum == 0))
send_sigio(fown, fa->fa_fd, band);
}
- spin_unlock(&fa->fa_lock);
+ spin_unlock_irqrestore(&fa->fa_lock, flags);
fa = rcu_dereference(fa->fa_next);
}
}
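The two f_setown_ex()/f_getown_ex() fixes above come down to one rule: copy_from_user() and copy_to_user() return the number of bytes left uncopied, never an errno, so a nonzero result has to be translated into -EFAULT by the caller. A minimal illustration (demo_owner is a made-up struct):

	#include <linux/uaccess.h>

	struct demo_owner { int type; int pid; };

	static int demo_copy_in(struct demo_owner *dst,
				struct demo_owner __user *src)
	{
		/* nonzero means bytes-not-copied, not an error code */
		if (copy_from_user(dst, src, sizeof(*dst)))
			return -EFAULT;
		return 0;
	}

The kill_fasync_rcu() change is separate: kill_fasync() can run in interrupt context, so fa_lock has to be taken with spin_lock_irqsave() to avoid deadlocking against an interrupt on the same CPU.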
diff --git a/fs/file_table.c b/fs/file_table.c
index 32d12b78bac8..5c7d10ead4ad 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -194,14 +194,6 @@ struct file *alloc_file(struct path *path, fmode_t mode,
}
EXPORT_SYMBOL(alloc_file);
-void fput(struct file *file)
-{
- if (atomic_long_dec_and_test(&file->f_count))
- __fput(file);
-}
-
-EXPORT_SYMBOL(fput);
-
/**
* drop_file_write_access - give up ability to write to a file
* @file: the file to which we will stop writing
@@ -227,10 +219,9 @@ void drop_file_write_access(struct file *file)
}
EXPORT_SYMBOL_GPL(drop_file_write_access);
-/* __fput is called from task context when aio completion releases the last
- * last use of a struct file *. Do not use otherwise.
+/* the real guts of fput() - releasing the last reference to file
*/
-void __fput(struct file *file)
+static void __fput(struct file *file)
{
struct dentry *dentry = file->f_path.dentry;
struct vfsmount *mnt = file->f_path.mnt;
@@ -268,6 +259,14 @@ void __fput(struct file *file)
mntput(mnt);
}
+void fput(struct file *file)
+{
+ if (atomic_long_dec_and_test(&file->f_count))
+ __fput(file);
+}
+
+EXPORT_SYMBOL(fput);
+
struct file *fget(unsigned int fd)
{
struct file *file;
diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c
index aee049cb9f84..0ec7bb2c95c6 100644
--- a/fs/freevxfs/vxfs_lookup.c
+++ b/fs/freevxfs/vxfs_lookup.c
@@ -57,6 +57,8 @@ const struct inode_operations vxfs_dir_inode_ops = {
};
const struct file_operations vxfs_dir_operations = {
+ .llseek = generic_file_llseek,
+ .read = generic_read_dir,
.readdir = vxfs_readdir,
};
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index ea8592b90696..d5be1693ac93 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -38,52 +38,18 @@ int nr_pdflush_threads;
/*
* Passed into wb_writeback(), essentially a subset of writeback_control
*/
-struct wb_writeback_args {
+struct wb_writeback_work {
long nr_pages;
struct super_block *sb;
enum writeback_sync_modes sync_mode;
unsigned int for_kupdate:1;
unsigned int range_cyclic:1;
unsigned int for_background:1;
- unsigned int sb_pinned:1;
-};
-/*
- * Work items for the bdi_writeback threads
- */
-struct bdi_work {
struct list_head list; /* pending work list */
- struct rcu_head rcu_head; /* for RCU free/clear of work */
-
- unsigned long seen; /* threads that have seen this work */
- atomic_t pending; /* number of threads still to do work */
-
- struct wb_writeback_args args; /* writeback arguments */
-
- unsigned long state; /* flag bits, see WS_* */
+ struct completion *done; /* set if the caller waits */
};
-enum {
- WS_USED_B = 0,
- WS_ONSTACK_B,
-};
-
-#define WS_USED (1 << WS_USED_B)
-#define WS_ONSTACK (1 << WS_ONSTACK_B)
-
-static inline bool bdi_work_on_stack(struct bdi_work *work)
-{
- return test_bit(WS_ONSTACK_B, &work->state);
-}
-
-static inline void bdi_work_init(struct bdi_work *work,
- struct wb_writeback_args *args)
-{
- INIT_RCU_HEAD(&work->rcu_head);
- work->args = *args;
- work->state = WS_USED;
-}
-
/**
* writeback_in_progress - determine whether there is writeback in progress
* @bdi: the device's backing_dev_info structure.
@@ -96,76 +62,11 @@ int writeback_in_progress(struct backing_dev_info *bdi)
return !list_empty(&bdi->work_list);
}
-static void bdi_work_clear(struct bdi_work *work)
-{
- clear_bit(WS_USED_B, &work->state);
- smp_mb__after_clear_bit();
- /*
- * work can have disappeared at this point. bit waitq functions
- * should be able to tolerate this, provided bdi_sched_wait does
- * not dereference it's pointer argument.
- */
- wake_up_bit(&work->state, WS_USED_B);
-}
-
-static void bdi_work_free(struct rcu_head *head)
-{
- struct bdi_work *work = container_of(head, struct bdi_work, rcu_head);
-
- if (!bdi_work_on_stack(work))
- kfree(work);
- else
- bdi_work_clear(work);
-}
-
-static void wb_work_complete(struct bdi_work *work)
-{
- const enum writeback_sync_modes sync_mode = work->args.sync_mode;
- int onstack = bdi_work_on_stack(work);
-
- /*
- * For allocated work, we can clear the done/seen bit right here.
- * For on-stack work, we need to postpone both the clear and free
- * to after the RCU grace period, since the stack could be invalidated
- * as soon as bdi_work_clear() has done the wakeup.
- */
- if (!onstack)
- bdi_work_clear(work);
- if (sync_mode == WB_SYNC_NONE || onstack)
- call_rcu(&work->rcu_head, bdi_work_free);
-}
-
-static void wb_clear_pending(struct bdi_writeback *wb, struct bdi_work *work)
-{
- /*
- * The caller has retrieved the work arguments from this work,
- * drop our reference. If this is the last ref, delete and free it
- */
- if (atomic_dec_and_test(&work->pending)) {
- struct backing_dev_info *bdi = wb->bdi;
-
- spin_lock(&bdi->wb_lock);
- list_del_rcu(&work->list);
- spin_unlock(&bdi->wb_lock);
-
- wb_work_complete(work);
- }
-}
-
-static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work)
+static void bdi_queue_work(struct backing_dev_info *bdi,
+ struct wb_writeback_work *work)
{
- work->seen = bdi->wb_mask;
- BUG_ON(!work->seen);
- atomic_set(&work->pending, bdi->wb_cnt);
- BUG_ON(!bdi->wb_cnt);
-
- /*
- * list_add_tail_rcu() contains the necessary barriers to
- * make sure the above stores are seen before the item is
- * noticed on the list
- */
spin_lock(&bdi->wb_lock);
- list_add_tail_rcu(&work->list, &bdi->work_list);
+ list_add_tail(&work->list, &bdi->work_list);
spin_unlock(&bdi->wb_lock);
/*
@@ -182,107 +83,59 @@ static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work)
}
}
-/*
- * Used for on-stack allocated work items. The caller needs to wait until
- * the wb threads have acked the work before it's safe to continue.
- */
-static void bdi_wait_on_work_clear(struct bdi_work *work)
-{
- wait_on_bit(&work->state, WS_USED_B, bdi_sched_wait,
- TASK_UNINTERRUPTIBLE);
-}
-
-static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
- struct wb_writeback_args *args,
- int wait)
+static void
+__bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
+ bool range_cyclic, bool for_background)
{
- struct bdi_work *work;
+ struct wb_writeback_work *work;
/*
* This is WB_SYNC_NONE writeback, so if allocation fails just
* wakeup the thread for old dirty data writeback
*/
- work = kmalloc(sizeof(*work), GFP_ATOMIC);
- if (work) {
- bdi_work_init(work, args);
- bdi_queue_work(bdi, work);
- if (wait)
- bdi_wait_on_work_clear(work);
- } else {
- struct bdi_writeback *wb = &bdi->wb;
-
- if (wb->task)
- wake_up_process(wb->task);
+ work = kzalloc(sizeof(*work), GFP_ATOMIC);
+ if (!work) {
+ if (bdi->wb.task)
+ wake_up_process(bdi->wb.task);
+ return;
}
+
+ work->sync_mode = WB_SYNC_NONE;
+ work->nr_pages = nr_pages;
+ work->range_cyclic = range_cyclic;
+ work->for_background = for_background;
+
+ bdi_queue_work(bdi, work);
}
/**
- * bdi_sync_writeback - start and wait for writeback
+ * bdi_start_writeback - start writeback
* @bdi: the backing device to write from
- * @sb: write inodes from this super_block
+ * @nr_pages: the number of pages to write
*
* Description:
- * This does WB_SYNC_ALL data integrity writeback and waits for the
- * IO to complete. Callers must hold the sb s_umount semaphore for
- * reading, to avoid having the super disappear before we are done.
+ * This does WB_SYNC_NONE opportunistic writeback. The IO is only
+ * started when this function returns, we make no guarantees on
+ * completion. Caller need not hold sb s_umount semaphore.
+ *
*/
-static void bdi_sync_writeback(struct backing_dev_info *bdi,
- struct super_block *sb)
+void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
{
- struct wb_writeback_args args = {
- .sb = sb,
- .sync_mode = WB_SYNC_ALL,
- .nr_pages = LONG_MAX,
- .range_cyclic = 0,
- /*
- * Setting sb_pinned is not necessary for WB_SYNC_ALL, but
- * lets make it explicitly clear.
- */
- .sb_pinned = 1,
- };
- struct bdi_work work;
-
- bdi_work_init(&work, &args);
- work.state |= WS_ONSTACK;
-
- bdi_queue_work(bdi, &work);
- bdi_wait_on_work_clear(&work);
+ __bdi_start_writeback(bdi, nr_pages, true, false);
}
/**
- * bdi_start_writeback - start writeback
+ * bdi_start_background_writeback - start background writeback
* @bdi: the backing device to write from
- * @sb: write inodes from this super_block
- * @nr_pages: the number of pages to write
- * @sb_locked: caller already holds sb umount sem.
*
* Description:
- * This does WB_SYNC_NONE opportunistic writeback. The IO is only
+ * This does WB_SYNC_NONE background writeback. The IO is only
* started when this function returns, we make no guarentees on
- * completion. Caller specifies whether sb umount sem is held already or not.
- *
+ * completion. Caller need not hold sb s_umount semaphore.
*/
-void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
- long nr_pages, int sb_locked)
+void bdi_start_background_writeback(struct backing_dev_info *bdi)
{
- struct wb_writeback_args args = {
- .sb = sb,
- .sync_mode = WB_SYNC_NONE,
- .nr_pages = nr_pages,
- .range_cyclic = 1,
- .sb_pinned = sb_locked,
- };
-
- /*
- * We treat @nr_pages=0 as the special case to do background writeback,
- * ie. to sync pages until the background dirty threshold is reached.
- */
- if (!nr_pages) {
- args.nr_pages = LONG_MAX;
- args.for_background = 1;
- }
-
- bdi_alloc_queue_work(bdi, &args, sb_locked);
+ __bdi_start_writeback(bdi, LONG_MAX, true, true);
}
/*
@@ -572,75 +425,69 @@ select_queue:
return ret;
}
-static void unpin_sb_for_writeback(struct super_block *sb)
-{
- up_read(&sb->s_umount);
- put_super(sb);
-}
-
-enum sb_pin_state {
- SB_PINNED,
- SB_NOT_PINNED,
- SB_PIN_FAILED
-};
-
/*
- * For WB_SYNC_NONE writeback, the caller does not have the sb pinned
+ * For background writeback the caller does not have the sb pinned
* before calling writeback. So make sure that we do pin it, so it doesn't
* go away while we are writing inodes from it.
*/
-static enum sb_pin_state pin_sb_for_writeback(struct writeback_control *wbc,
- struct super_block *sb)
+static bool pin_sb_for_writeback(struct super_block *sb)
{
- /*
- * Caller must already hold the ref for this
- */
- if (wbc->sync_mode == WB_SYNC_ALL || wbc->sb_pinned) {
- WARN_ON(!rwsem_is_locked(&sb->s_umount));
- return SB_NOT_PINNED;
- }
spin_lock(&sb_lock);
+ if (list_empty(&sb->s_instances)) {
+ spin_unlock(&sb_lock);
+ return false;
+ }
+
sb->s_count++;
+ spin_unlock(&sb_lock);
+
if (down_read_trylock(&sb->s_umount)) {
- if (sb->s_root) {
- spin_unlock(&sb_lock);
- return SB_PINNED;
- }
- /*
- * umounted, drop rwsem again and fall through to failure
- */
+ if (sb->s_root)
+ return true;
up_read(&sb->s_umount);
}
- sb->s_count--;
- spin_unlock(&sb_lock);
- return SB_PIN_FAILED;
+
+ put_super(sb);
+ return false;
}
/*
* Write a portion of b_io inodes which belong to @sb.
- * If @wbc->sb != NULL, then find and write all such
+ *
+ * If @only_this_sb is true, then find and write all such
* inodes. Otherwise write only ones which go sequentially
* in reverse order.
+ *
* Return 1, if the caller writeback routine should be
* interrupted. Otherwise return 0.
*/
-static int writeback_sb_inodes(struct super_block *sb,
- struct bdi_writeback *wb,
- struct writeback_control *wbc)
+static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
+ struct writeback_control *wbc, bool only_this_sb)
{
while (!list_empty(&wb->b_io)) {
long pages_skipped;
struct inode *inode = list_entry(wb->b_io.prev,
struct inode, i_list);
- if (wbc->sb && sb != inode->i_sb) {
- /* super block given and doesn't
- match, skip this inode */
- redirty_tail(inode);
- continue;
- }
- if (sb != inode->i_sb)
- /* finish with this superblock */
+
+ if (inode->i_sb != sb) {
+ if (only_this_sb) {
+ /*
+ * We only want to write back data for this
+ * superblock, move all inodes not belonging
+ * to it back onto the dirty list.
+ */
+ redirty_tail(inode);
+ continue;
+ }
+
+ /*
+ * The inode belongs to a different superblock.
+ * Bounce back to the caller to unpin this and
+ * pin the next superblock.
+ */
return 0;
+ }
+
if (inode->i_state & (I_NEW | I_WILL_FREE)) {
requeue_io(inode);
continue;
@@ -678,8 +525,8 @@ static int writeback_sb_inodes(struct super_block *sb,
return 1;
}
-static void writeback_inodes_wb(struct bdi_writeback *wb,
- struct writeback_control *wbc)
+void writeback_inodes_wb(struct bdi_writeback *wb,
+ struct writeback_control *wbc)
{
int ret = 0;
@@ -692,24 +539,14 @@ static void writeback_inodes_wb(struct bdi_writeback *wb,
struct inode *inode = list_entry(wb->b_io.prev,
struct inode, i_list);
struct super_block *sb = inode->i_sb;
- enum sb_pin_state state;
-
- if (wbc->sb && sb != wbc->sb) {
- /* super block given and doesn't
- match, skip this inode */
- redirty_tail(inode);
- continue;
- }
- state = pin_sb_for_writeback(wbc, sb);
- if (state == SB_PIN_FAILED) {
+ if (!pin_sb_for_writeback(sb)) {
requeue_io(inode);
continue;
}
- ret = writeback_sb_inodes(sb, wb, wbc);
+ ret = writeback_sb_inodes(sb, wb, wbc, false);
+ drop_super(sb);
- if (state == SB_PINNED)
- unpin_sb_for_writeback(sb);
if (ret)
break;
}
@@ -717,11 +554,17 @@ static void writeback_inodes_wb(struct bdi_writeback *wb,
/* Leave any unwritten inodes on b_io */
}
-void writeback_inodes_wbc(struct writeback_control *wbc)
+static void __writeback_inodes_sb(struct super_block *sb,
+ struct bdi_writeback *wb, struct writeback_control *wbc)
{
- struct backing_dev_info *bdi = wbc->bdi;
+ WARN_ON(!rwsem_is_locked(&sb->s_umount));
- writeback_inodes_wb(&bdi->wb, wbc);
+ wbc->wb_start = jiffies; /* livelock avoidance */
+ spin_lock(&inode_lock);
+ if (!wbc->for_kupdate || list_empty(&wb->b_io))
+ queue_io(wb, wbc->older_than_this);
+ writeback_sb_inodes(sb, wb, wbc, true);
+ spin_unlock(&inode_lock);
}
/*
@@ -759,17 +602,14 @@ static inline bool over_bground_thresh(void)
* all dirty pages if they are all attached to "old" mappings.
*/
static long wb_writeback(struct bdi_writeback *wb,
- struct wb_writeback_args *args)
+ struct wb_writeback_work *work)
{
struct writeback_control wbc = {
- .bdi = wb->bdi,
- .sb = args->sb,
- .sync_mode = args->sync_mode,
+ .sync_mode = work->sync_mode,
.older_than_this = NULL,
- .for_kupdate = args->for_kupdate,
- .for_background = args->for_background,
- .range_cyclic = args->range_cyclic,
- .sb_pinned = args->sb_pinned,
+ .for_kupdate = work->for_kupdate,
+ .for_background = work->for_background,
+ .range_cyclic = work->range_cyclic,
};
unsigned long oldest_jif;
long wrote = 0;
@@ -789,21 +629,24 @@ static long wb_writeback(struct bdi_writeback *wb,
/*
* Stop writeback when nr_pages has been consumed
*/
- if (args->nr_pages <= 0)
+ if (work->nr_pages <= 0)
break;
/*
* For background writeout, stop when we are below the
* background dirty threshold
*/
- if (args->for_background && !over_bground_thresh())
+ if (work->for_background && !over_bground_thresh())
break;
wbc.more_io = 0;
wbc.nr_to_write = MAX_WRITEBACK_PAGES;
wbc.pages_skipped = 0;
- writeback_inodes_wb(wb, &wbc);
- args->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
+ if (work->sb)
+ __writeback_inodes_sb(work->sb, wb, &wbc);
+ else
+ writeback_inodes_wb(wb, &wbc);
+ work->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;
/*
@@ -839,31 +682,21 @@ static long wb_writeback(struct bdi_writeback *wb,
}
/*
- * Return the next bdi_work struct that hasn't been processed by this
- * wb thread yet. ->seen is initially set for each thread that exists
- * for this device, when a thread first notices a piece of work it
- * clears its bit. Depending on writeback type, the thread will notify
- * completion on either receiving the work (WB_SYNC_NONE) or after
- * it is done (WB_SYNC_ALL).
+ * Return the next wb_writeback_work struct that hasn't been processed yet.
*/
-static struct bdi_work *get_next_work_item(struct backing_dev_info *bdi,
- struct bdi_writeback *wb)
+static struct wb_writeback_work *
+get_next_work_item(struct backing_dev_info *bdi, struct bdi_writeback *wb)
{
- struct bdi_work *work, *ret = NULL;
+ struct wb_writeback_work *work = NULL;
- rcu_read_lock();
-
- list_for_each_entry_rcu(work, &bdi->work_list, list) {
- if (!test_bit(wb->nr, &work->seen))
- continue;
- clear_bit(wb->nr, &work->seen);
-
- ret = work;
- break;
+ spin_lock(&bdi->wb_lock);
+ if (!list_empty(&bdi->work_list)) {
+ work = list_entry(bdi->work_list.next,
+ struct wb_writeback_work, list);
+ list_del_init(&work->list);
}
-
- rcu_read_unlock();
- return ret;
+ spin_unlock(&bdi->wb_lock);
+ return work;
}
static long wb_check_old_data_flush(struct bdi_writeback *wb)
@@ -888,14 +721,14 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
(inodes_stat.nr_inodes - inodes_stat.nr_unused);
if (nr_pages) {
- struct wb_writeback_args args = {
+ struct wb_writeback_work work = {
.nr_pages = nr_pages,
.sync_mode = WB_SYNC_NONE,
.for_kupdate = 1,
.range_cyclic = 1,
};
- return wb_writeback(wb, &args);
+ return wb_writeback(wb, &work);
}
return 0;
@@ -907,36 +740,27 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
{
struct backing_dev_info *bdi = wb->bdi;
- struct bdi_work *work;
+ struct wb_writeback_work *work;
long wrote = 0;
while ((work = get_next_work_item(bdi, wb)) != NULL) {
- struct wb_writeback_args args = work->args;
- int post_clear;
-
/*
* Override sync mode, in case we must wait for completion
+ * because this thread is exiting now.
*/
if (force_wait)
- work->args.sync_mode = args.sync_mode = WB_SYNC_ALL;
-
- post_clear = WB_SYNC_ALL || args.sb_pinned;
-
- /*
- * If this isn't a data integrity operation, just notify
- * that we have seen this work and we are now starting it.
- */
- if (!post_clear)
- wb_clear_pending(wb, work);
+ work->sync_mode = WB_SYNC_ALL;
- wrote += wb_writeback(wb, &args);
+ wrote += wb_writeback(wb, work);
/*
- * This is a data integrity writeback, so only do the
- * notification when we have completed the work.
+ * Notify the caller of completion if this is a synchronous
+ * work item, otherwise just free it.
*/
- if (post_clear)
- wb_clear_pending(wb, work);
+ if (work->done)
+ complete(work->done);
+ else
+ kfree(work);
}
/*
@@ -993,42 +817,27 @@ int bdi_writeback_task(struct bdi_writeback *wb)
}
/*
- * Schedule writeback for all backing devices. This does WB_SYNC_NONE
- * writeback, for integrity writeback see bdi_sync_writeback().
+ * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
+ * the whole world.
*/
-static void bdi_writeback_all(struct super_block *sb, long nr_pages)
+void wakeup_flusher_threads(long nr_pages)
{
- struct wb_writeback_args args = {
- .sb = sb,
- .nr_pages = nr_pages,
- .sync_mode = WB_SYNC_NONE,
- };
struct backing_dev_info *bdi;
- rcu_read_lock();
+ if (!nr_pages) {
+ nr_pages = global_page_state(NR_FILE_DIRTY) +
+ global_page_state(NR_UNSTABLE_NFS);
+ }
+ rcu_read_lock();
list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
if (!bdi_has_dirty_io(bdi))
continue;
-
- bdi_alloc_queue_work(bdi, &args, 0);
+ __bdi_start_writeback(bdi, nr_pages, false, false);
}
-
rcu_read_unlock();
}
-/*
- * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
- * the whole world.
- */
-void wakeup_flusher_threads(long nr_pages)
-{
- if (nr_pages == 0)
- nr_pages = global_page_state(NR_FILE_DIRTY) +
- global_page_state(NR_UNSTABLE_NFS);
- bdi_writeback_all(NULL, nr_pages);
-}
-
static noinline void block_dump___mark_inode_dirty(struct inode *inode)
{
if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
@@ -1220,18 +1029,6 @@ static void wait_sb_inodes(struct super_block *sb)
iput(old_inode);
}
-static void __writeback_inodes_sb(struct super_block *sb, int sb_locked)
-{
- unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
- unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
- long nr_to_write;
-
- nr_to_write = nr_dirty + nr_unstable +
- (inodes_stat.nr_inodes - inodes_stat.nr_unused);
-
- bdi_start_writeback(sb->s_bdi, sb, nr_to_write, sb_locked);
-}
-
/**
* writeback_inodes_sb - writeback dirty inodes from given super_block
* @sb: the superblock
@@ -1243,21 +1040,24 @@ static void __writeback_inodes_sb(struct super_block *sb, int sb_locked)
*/
void writeback_inodes_sb(struct super_block *sb)
{
- __writeback_inodes_sb(sb, 0);
-}
-EXPORT_SYMBOL(writeback_inodes_sb);
+ unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
+ unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
+ DECLARE_COMPLETION_ONSTACK(done);
+ struct wb_writeback_work work = {
+ .sb = sb,
+ .sync_mode = WB_SYNC_NONE,
+ .done = &done,
+ };
-/**
- * writeback_inodes_sb_locked - writeback dirty inodes from given super_block
- * @sb: the superblock
- *
- * Like writeback_inodes_sb(), except the caller already holds the
- * sb umount sem.
- */
-void writeback_inodes_sb_locked(struct super_block *sb)
-{
- __writeback_inodes_sb(sb, 1);
+ WARN_ON(!rwsem_is_locked(&sb->s_umount));
+
+ work.nr_pages = nr_dirty + nr_unstable +
+ (inodes_stat.nr_inodes - inodes_stat.nr_unused);
+
+ bdi_queue_work(sb->s_bdi, &work);
+ wait_for_completion(&done);
}
+EXPORT_SYMBOL(writeback_inodes_sb);
/**
* writeback_inodes_sb_if_idle - start writeback if none underway
@@ -1269,7 +1069,9 @@ void writeback_inodes_sb_locked(struct super_block *sb)
int writeback_inodes_sb_if_idle(struct super_block *sb)
{
if (!writeback_in_progress(sb->s_bdi)) {
+ down_read(&sb->s_umount);
writeback_inodes_sb(sb);
+ up_read(&sb->s_umount);
return 1;
} else
return 0;
@@ -1285,7 +1087,20 @@ EXPORT_SYMBOL(writeback_inodes_sb_if_idle);
*/
void sync_inodes_sb(struct super_block *sb)
{
- bdi_sync_writeback(sb->s_bdi, sb);
+ DECLARE_COMPLETION_ONSTACK(done);
+ struct wb_writeback_work work = {
+ .sb = sb,
+ .sync_mode = WB_SYNC_ALL,
+ .nr_pages = LONG_MAX,
+ .range_cyclic = 0,
+ .done = &done,
+ };
+
+ WARN_ON(!rwsem_is_locked(&sb->s_umount));
+
+ bdi_queue_work(sb->s_bdi, &work);
+ wait_for_completion(&done);
+
wait_sb_inodes(sb);
}
EXPORT_SYMBOL(sync_inodes_sb);
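The whole fs-writeback.c rewrite above hangs on one simplification: work items become plain wb_writeback_work structs on a spinlock-protected list, and a synchronous submitter simply embeds a completion instead of the old RCU/bitmask bdi_work machinery. The lifetime rule it depends on, sketched with illustrative demo_* names:

	#include <linux/completion.h>
	#include <linux/list.h>
	#include <linux/slab.h>
	#include <linux/spinlock.h>

	struct demo_work {
		struct list_head list;
		struct completion *done;	/* NULL: fire and forget */
	};

	struct demo_queue {
		spinlock_t lock;
		struct list_head list;
	};

	/* synchronous submitter, mirrors the new sync_inodes_sb() */
	static void demo_run_and_wait(struct demo_queue *q)
	{
		DECLARE_COMPLETION_ONSTACK(done);
		struct demo_work work = { .done = &done };

		spin_lock(&q->lock);
		list_add_tail(&work.list, &q->list);
		spin_unlock(&q->lock);

		/* the on-stack item stays valid until the worker calls
		 * complete(), so no RCU grace period is needed */
		wait_for_completion(&done);
	}

	/* worker side, mirrors the tail of the new wb_do_writeback() */
	static void demo_finish(struct demo_work *work)
	{
		if (work->done)
			complete(work->done);
		else
			kfree(work);	/* async items were heap-allocated */
	}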
diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c
index 1e1f286dd70e..4a8eb31c5338 100644
--- a/fs/fscache/object-list.c
+++ b/fs/fscache/object-list.c
@@ -103,7 +103,7 @@ static struct fscache_object *fscache_objlist_lookup(loff_t *_pos)
/* banners (can't represent line 0 by pos 0 as that would involve
* returning a NULL pointer) */
if (pos == 0)
- return (struct fscache_object *) ++(*_pos);
+ return (struct fscache_object *)(long)++(*_pos);
if (pos < 3)
return (struct fscache_object *)pos;
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index 47aefd376e54..723b889fd219 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -710,30 +710,26 @@ static void fscache_write_op(struct fscache_operation *_op)
goto superseded;
}
- if (page) {
- radix_tree_tag_set(&cookie->stores, page->index,
- FSCACHE_COOKIE_STORING_TAG);
- radix_tree_tag_clear(&cookie->stores, page->index,
- FSCACHE_COOKIE_PENDING_TAG);
- }
+ radix_tree_tag_set(&cookie->stores, page->index,
+ FSCACHE_COOKIE_STORING_TAG);
+ radix_tree_tag_clear(&cookie->stores, page->index,
+ FSCACHE_COOKIE_PENDING_TAG);
spin_unlock(&cookie->stores_lock);
spin_unlock(&object->lock);
- if (page) {
- fscache_set_op_state(&op->op, "Store");
- fscache_stat(&fscache_n_store_pages);
- fscache_stat(&fscache_n_cop_write_page);
- ret = object->cache->ops->write_page(op, page);
- fscache_stat_d(&fscache_n_cop_write_page);
- fscache_set_op_state(&op->op, "EndWrite");
- fscache_end_page_write(object, page);
- if (ret < 0) {
- fscache_set_op_state(&op->op, "Abort");
- fscache_abort_object(object);
- } else {
- fscache_enqueue_operation(&op->op);
- }
+ fscache_set_op_state(&op->op, "Store");
+ fscache_stat(&fscache_n_store_pages);
+ fscache_stat(&fscache_n_cop_write_page);
+ ret = object->cache->ops->write_page(op, page);
+ fscache_stat_d(&fscache_n_cop_write_page);
+ fscache_set_op_state(&op->op, "EndWrite");
+ fscache_end_page_write(object, page);
+ if (ret < 0) {
+ fscache_set_op_state(&op->op, "Abort");
+ fscache_abort_object(object);
+ } else {
+ fscache_enqueue_operation(&op->op);
}
_leave("");
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index e53df5ebb2b8..9424796d6634 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -16,6 +16,9 @@
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/slab.h>
+#include <linux/pipe_fs_i.h>
+#include <linux/swap.h>
+#include <linux/splice.h>
MODULE_ALIAS_MISCDEV(FUSE_MINOR);
MODULE_ALIAS("devname:fuse");
@@ -499,6 +502,9 @@ struct fuse_copy_state {
int write;
struct fuse_req *req;
const struct iovec *iov;
+ struct pipe_buffer *pipebufs;
+ struct pipe_buffer *currbuf;
+ struct pipe_inode_info *pipe;
unsigned long nr_segs;
unsigned long seglen;
unsigned long addr;
@@ -506,16 +512,16 @@ struct fuse_copy_state {
void *mapaddr;
void *buf;
unsigned len;
+ unsigned move_pages:1;
};
static void fuse_copy_init(struct fuse_copy_state *cs, struct fuse_conn *fc,
- int write, struct fuse_req *req,
+ int write,
const struct iovec *iov, unsigned long nr_segs)
{
memset(cs, 0, sizeof(*cs));
cs->fc = fc;
cs->write = write;
- cs->req = req;
cs->iov = iov;
cs->nr_segs = nr_segs;
}
@@ -523,7 +529,18 @@ static void fuse_copy_init(struct fuse_copy_state *cs, struct fuse_conn *fc,
/* Unmap and put previous page of userspace buffer */
static void fuse_copy_finish(struct fuse_copy_state *cs)
{
- if (cs->mapaddr) {
+ if (cs->currbuf) {
+ struct pipe_buffer *buf = cs->currbuf;
+
+ if (!cs->write) {
+ buf->ops->unmap(cs->pipe, buf, cs->mapaddr);
+ } else {
+ kunmap_atomic(cs->mapaddr, KM_USER0);
+ buf->len = PAGE_SIZE - cs->len;
+ }
+ cs->currbuf = NULL;
+ cs->mapaddr = NULL;
+ } else if (cs->mapaddr) {
kunmap_atomic(cs->mapaddr, KM_USER0);
if (cs->write) {
flush_dcache_page(cs->pg);
@@ -545,26 +562,61 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
unlock_request(cs->fc, cs->req);
fuse_copy_finish(cs);
- if (!cs->seglen) {
- BUG_ON(!cs->nr_segs);
- cs->seglen = cs->iov[0].iov_len;
- cs->addr = (unsigned long) cs->iov[0].iov_base;
- cs->iov++;
- cs->nr_segs--;
+ if (cs->pipebufs) {
+ struct pipe_buffer *buf = cs->pipebufs;
+
+ if (!cs->write) {
+ err = buf->ops->confirm(cs->pipe, buf);
+ if (err)
+ return err;
+
+ BUG_ON(!cs->nr_segs);
+ cs->currbuf = buf;
+ cs->mapaddr = buf->ops->map(cs->pipe, buf, 1);
+ cs->len = buf->len;
+ cs->buf = cs->mapaddr + buf->offset;
+ cs->pipebufs++;
+ cs->nr_segs--;
+ } else {
+ struct page *page;
+
+ if (cs->nr_segs == cs->pipe->buffers)
+ return -EIO;
+
+ page = alloc_page(GFP_HIGHUSER);
+ if (!page)
+ return -ENOMEM;
+
+ buf->page = page;
+ buf->offset = 0;
+ buf->len = 0;
+
+ cs->currbuf = buf;
+ cs->mapaddr = kmap_atomic(page, KM_USER0);
+ cs->buf = cs->mapaddr;
+ cs->len = PAGE_SIZE;
+ cs->pipebufs++;
+ cs->nr_segs++;
+ }
+ } else {
+ if (!cs->seglen) {
+ BUG_ON(!cs->nr_segs);
+ cs->seglen = cs->iov[0].iov_len;
+ cs->addr = (unsigned long) cs->iov[0].iov_base;
+ cs->iov++;
+ cs->nr_segs--;
+ }
+ err = get_user_pages_fast(cs->addr, 1, cs->write, &cs->pg);
+ if (err < 0)
+ return err;
+ BUG_ON(err != 1);
+ offset = cs->addr % PAGE_SIZE;
+ cs->mapaddr = kmap_atomic(cs->pg, KM_USER0);
+ cs->buf = cs->mapaddr + offset;
+ cs->len = min(PAGE_SIZE - offset, cs->seglen);
+ cs->seglen -= cs->len;
+ cs->addr += cs->len;
}
- down_read(&current->mm->mmap_sem);
- err = get_user_pages(current, current->mm, cs->addr, 1, cs->write, 0,
- &cs->pg, NULL);
- up_read(&current->mm->mmap_sem);
- if (err < 0)
- return err;
- BUG_ON(err != 1);
- offset = cs->addr % PAGE_SIZE;
- cs->mapaddr = kmap_atomic(cs->pg, KM_USER0);
- cs->buf = cs->mapaddr + offset;
- cs->len = min(PAGE_SIZE - offset, cs->seglen);
- cs->seglen -= cs->len;
- cs->addr += cs->len;
return lock_request(cs->fc, cs->req);
}
@@ -586,23 +638,178 @@ static int fuse_copy_do(struct fuse_copy_state *cs, void **val, unsigned *size)
return ncpy;
}
+static int fuse_check_page(struct page *page)
+{
+ if (page_mapcount(page) ||
+ page->mapping != NULL ||
+ page_count(page) != 1 ||
+ (page->flags & PAGE_FLAGS_CHECK_AT_PREP &
+ ~(1 << PG_locked |
+ 1 << PG_referenced |
+ 1 << PG_uptodate |
+ 1 << PG_lru |
+ 1 << PG_active |
+ 1 << PG_reclaim))) {
+ printk(KERN_WARNING "fuse: trying to steal weird page\n");
+ printk(KERN_WARNING " page=%p index=%li flags=%08lx, count=%i, mapcount=%i, mapping=%p\n", page, page->index, page->flags, page_count(page), page_mapcount(page), page->mapping);
+ return 1;
+ }
+ return 0;
+}
+
+static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
+{
+ int err;
+ struct page *oldpage = *pagep;
+ struct page *newpage;
+ struct pipe_buffer *buf = cs->pipebufs;
+ struct address_space *mapping;
+ pgoff_t index;
+
+ unlock_request(cs->fc, cs->req);
+ fuse_copy_finish(cs);
+
+ err = buf->ops->confirm(cs->pipe, buf);
+ if (err)
+ return err;
+
+ BUG_ON(!cs->nr_segs);
+ cs->currbuf = buf;
+ cs->len = buf->len;
+ cs->pipebufs++;
+ cs->nr_segs--;
+
+ if (cs->len != PAGE_SIZE)
+ goto out_fallback;
+
+ if (buf->ops->steal(cs->pipe, buf) != 0)
+ goto out_fallback;
+
+ newpage = buf->page;
+
+ if (WARN_ON(!PageUptodate(newpage)))
+ return -EIO;
+
+ ClearPageMappedToDisk(newpage);
+
+ if (fuse_check_page(newpage) != 0)
+ goto out_fallback_unlock;
+
+ mapping = oldpage->mapping;
+ index = oldpage->index;
+
+ /*
+ * This is a new and locked page, it shouldn't be mapped or
+ * have any special flags on it
+ */
+ if (WARN_ON(page_mapped(oldpage)))
+ goto out_fallback_unlock;
+ if (WARN_ON(page_has_private(oldpage)))
+ goto out_fallback_unlock;
+ if (WARN_ON(PageDirty(oldpage) || PageWriteback(oldpage)))
+ goto out_fallback_unlock;
+ if (WARN_ON(PageMlocked(oldpage)))
+ goto out_fallback_unlock;
+
+ remove_from_page_cache(oldpage);
+ page_cache_release(oldpage);
+
+ err = add_to_page_cache_locked(newpage, mapping, index, GFP_KERNEL);
+ if (err) {
+ printk(KERN_WARNING "fuse_try_move_page: failed to add page");
+ goto out_fallback_unlock;
+ }
+ page_cache_get(newpage);
+
+ if (!(buf->flags & PIPE_BUF_FLAG_LRU))
+ lru_cache_add_file(newpage);
+
+ err = 0;
+ spin_lock(&cs->fc->lock);
+ if (cs->req->aborted)
+ err = -ENOENT;
+ else
+ *pagep = newpage;
+ spin_unlock(&cs->fc->lock);
+
+ if (err) {
+ unlock_page(newpage);
+ page_cache_release(newpage);
+ return err;
+ }
+
+ unlock_page(oldpage);
+ page_cache_release(oldpage);
+ cs->len = 0;
+
+ return 0;
+
+out_fallback_unlock:
+ unlock_page(newpage);
+out_fallback:
+ cs->mapaddr = buf->ops->map(cs->pipe, buf, 1);
+ cs->buf = cs->mapaddr + buf->offset;
+
+ err = lock_request(cs->fc, cs->req);
+ if (err)
+ return err;
+
+ return 1;
+}
+
+static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page,
+ unsigned offset, unsigned count)
+{
+ struct pipe_buffer *buf;
+
+ if (cs->nr_segs == cs->pipe->buffers)
+ return -EIO;
+
+ unlock_request(cs->fc, cs->req);
+ fuse_copy_finish(cs);
+
+ buf = cs->pipebufs;
+ page_cache_get(page);
+ buf->page = page;
+ buf->offset = offset;
+ buf->len = count;
+
+ cs->pipebufs++;
+ cs->nr_segs++;
+ cs->len = 0;
+
+ return 0;
+}
+
/*
* Copy a page in the request to/from the userspace buffer. Must be
* done atomically
*/
-static int fuse_copy_page(struct fuse_copy_state *cs, struct page *page,
+static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep,
unsigned offset, unsigned count, int zeroing)
{
+ int err;
+ struct page *page = *pagep;
+
if (page && zeroing && count < PAGE_SIZE) {
void *mapaddr = kmap_atomic(page, KM_USER1);
memset(mapaddr, 0, PAGE_SIZE);
kunmap_atomic(mapaddr, KM_USER1);
}
while (count) {
- if (!cs->len) {
- int err = fuse_copy_fill(cs);
- if (err)
- return err;
+ if (cs->write && cs->pipebufs && page) {
+ return fuse_ref_page(cs, page, offset, count);
+ } else if (!cs->len) {
+ if (cs->move_pages && page &&
+ offset == 0 && count == PAGE_SIZE) {
+ err = fuse_try_move_page(cs, pagep);
+ if (err <= 0)
+ return err;
+ } else {
+ err = fuse_copy_fill(cs);
+ if (err)
+ return err;
+ }
}
if (page) {
void *mapaddr = kmap_atomic(page, KM_USER1);
@@ -627,8 +834,10 @@ static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes,
unsigned count = min(nbytes, (unsigned) PAGE_SIZE - offset);
for (i = 0; i < req->num_pages && (nbytes || zeroing); i++) {
- struct page *page = req->pages[i];
- int err = fuse_copy_page(cs, page, offset, count, zeroing);
+ int err;
+
+ err = fuse_copy_page(cs, &req->pages[i], offset, count,
+ zeroing);
if (err)
return err;
@@ -705,11 +914,10 @@ __acquires(&fc->lock)
*
* Called with fc->lock held, releases it
*/
-static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_req *req,
- const struct iovec *iov, unsigned long nr_segs)
+static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_copy_state *cs,
+ size_t nbytes, struct fuse_req *req)
__releases(&fc->lock)
{
- struct fuse_copy_state cs;
struct fuse_in_header ih;
struct fuse_interrupt_in arg;
unsigned reqsize = sizeof(ih) + sizeof(arg);
@@ -725,14 +933,13 @@ __releases(&fc->lock)
arg.unique = req->in.h.unique;
spin_unlock(&fc->lock);
- if (iov_length(iov, nr_segs) < reqsize)
+ if (nbytes < reqsize)
return -EINVAL;
- fuse_copy_init(&cs, fc, 1, NULL, iov, nr_segs);
- err = fuse_copy_one(&cs, &ih, sizeof(ih));
+ err = fuse_copy_one(cs, &ih, sizeof(ih));
if (!err)
- err = fuse_copy_one(&cs, &arg, sizeof(arg));
- fuse_copy_finish(&cs);
+ err = fuse_copy_one(cs, &arg, sizeof(arg));
+ fuse_copy_finish(cs);
return err ? err : reqsize;
}
@@ -746,18 +953,13 @@ __releases(&fc->lock)
* request_end(). Otherwise add it to the processing list, and set
* the 'sent' flag.
*/
-static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+static ssize_t fuse_dev_do_read(struct fuse_conn *fc, struct file *file,
+ struct fuse_copy_state *cs, size_t nbytes)
{
int err;
struct fuse_req *req;
struct fuse_in *in;
- struct fuse_copy_state cs;
unsigned reqsize;
- struct file *file = iocb->ki_filp;
- struct fuse_conn *fc = fuse_get_conn(file);
- if (!fc)
- return -EPERM;
restart:
spin_lock(&fc->lock);
@@ -777,7 +979,7 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
if (!list_empty(&fc->interrupts)) {
req = list_entry(fc->interrupts.next, struct fuse_req,
intr_entry);
- return fuse_read_interrupt(fc, req, iov, nr_segs);
+ return fuse_read_interrupt(fc, cs, nbytes, req);
}
req = list_entry(fc->pending.next, struct fuse_req, list);
@@ -787,7 +989,7 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
in = &req->in;
reqsize = in->h.len;
/* If request is too large, reply with an error and restart the read */
- if (iov_length(iov, nr_segs) < reqsize) {
+ if (nbytes < reqsize) {
req->out.h.error = -EIO;
/* SETXATTR is special, since it may contain too large data */
if (in->h.opcode == FUSE_SETXATTR)
@@ -796,12 +998,12 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
goto restart;
}
spin_unlock(&fc->lock);
- fuse_copy_init(&cs, fc, 1, req, iov, nr_segs);
- err = fuse_copy_one(&cs, &in->h, sizeof(in->h));
+ cs->req = req;
+ err = fuse_copy_one(cs, &in->h, sizeof(in->h));
if (!err)
- err = fuse_copy_args(&cs, in->numargs, in->argpages,
+ err = fuse_copy_args(cs, in->numargs, in->argpages,
(struct fuse_arg *) in->args, 0);
- fuse_copy_finish(&cs);
+ fuse_copy_finish(cs);
spin_lock(&fc->lock);
req->locked = 0;
if (req->aborted) {
@@ -829,6 +1031,110 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
return err;
}
+static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
+{
+ struct fuse_copy_state cs;
+ struct file *file = iocb->ki_filp;
+ struct fuse_conn *fc = fuse_get_conn(file);
+ if (!fc)
+ return -EPERM;
+
+ fuse_copy_init(&cs, fc, 1, iov, nr_segs);
+
+ return fuse_dev_do_read(fc, file, &cs, iov_length(iov, nr_segs));
+}
+
+static int fuse_dev_pipe_buf_steal(struct pipe_inode_info *pipe,
+ struct pipe_buffer *buf)
+{
+ return 1;
+}
+
+static const struct pipe_buf_operations fuse_dev_pipe_buf_ops = {
+ .can_merge = 0,
+ .map = generic_pipe_buf_map,
+ .unmap = generic_pipe_buf_unmap,
+ .confirm = generic_pipe_buf_confirm,
+ .release = generic_pipe_buf_release,
+ .steal = fuse_dev_pipe_buf_steal,
+ .get = generic_pipe_buf_get,
+};
+
+static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
+ struct pipe_inode_info *pipe,
+ size_t len, unsigned int flags)
+{
+ int ret;
+ int page_nr = 0;
+ int do_wakeup = 0;
+ struct pipe_buffer *bufs;
+ struct fuse_copy_state cs;
+ struct fuse_conn *fc = fuse_get_conn(in);
+ if (!fc)
+ return -EPERM;
+
+ bufs = kmalloc(pipe->buffers * sizeof (struct pipe_buffer), GFP_KERNEL);
+ if (!bufs)
+ return -ENOMEM;
+
+ fuse_copy_init(&cs, fc, 1, NULL, 0);
+ cs.pipebufs = bufs;
+ cs.pipe = pipe;
+ ret = fuse_dev_do_read(fc, in, &cs, len);
+ if (ret < 0)
+ goto out;
+
+ ret = 0;
+ pipe_lock(pipe);
+
+ if (!pipe->readers) {
+ send_sig(SIGPIPE, current, 0);
+ if (!ret)
+ ret = -EPIPE;
+ goto out_unlock;
+ }
+
+ if (pipe->nrbufs + cs.nr_segs > pipe->buffers) {
+ ret = -EIO;
+ goto out_unlock;
+ }
+
+ while (page_nr < cs.nr_segs) {
+ int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
+ struct pipe_buffer *buf = pipe->bufs + newbuf;
+
+ buf->page = bufs[page_nr].page;
+ buf->offset = bufs[page_nr].offset;
+ buf->len = bufs[page_nr].len;
+ buf->ops = &fuse_dev_pipe_buf_ops;
+
+ pipe->nrbufs++;
+ page_nr++;
+ ret += buf->len;
+
+ if (pipe->inode)
+ do_wakeup = 1;
+ }
+
+out_unlock:
+ pipe_unlock(pipe);
+
+ if (do_wakeup) {
+ smp_mb();
+ if (waitqueue_active(&pipe->wait))
+ wake_up_interruptible(&pipe->wait);
+ kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
+ }
+
+out:
+ for (; page_nr < cs.nr_segs; page_nr++)
+ page_cache_release(bufs[page_nr].page);
+
+ kfree(bufs);
+ return ret;
+}
+
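The splice path above enables zero-copy request retrieval. A minimal userspace sketch of how a FUSE server might use it -- illustrative only, not part of this patch:

#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>

/* Illustrative: move a request from an open /dev/fuse descriptor into
 * a pipe via the new splice_read hook, then drain it into a buffer.
 * A real server might splice onward instead of copying. */
static ssize_t read_request_via_pipe(int fuse_fd, int pipefd[2],
                                     void *buf, size_t len)
{
        ssize_t n = splice(fuse_fd, NULL, pipefd[1], NULL, len,
                           SPLICE_F_MOVE);
        if (n <= 0)
                return n;
        return read(pipefd[0], buf, (size_t)n);
}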
static int fuse_notify_poll(struct fuse_conn *fc, unsigned int size,
struct fuse_copy_state *cs)
{
@@ -988,23 +1294,17 @@ static int copy_out_args(struct fuse_copy_state *cs, struct fuse_out *out,
* it from the list and copy the rest of the buffer to the request.
* The request is finished by calling request_end()
*/
-static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+static ssize_t fuse_dev_do_write(struct fuse_conn *fc,
+ struct fuse_copy_state *cs, size_t nbytes)
{
int err;
- size_t nbytes = iov_length(iov, nr_segs);
struct fuse_req *req;
struct fuse_out_header oh;
- struct fuse_copy_state cs;
- struct fuse_conn *fc = fuse_get_conn(iocb->ki_filp);
- if (!fc)
- return -EPERM;
- fuse_copy_init(&cs, fc, 0, NULL, iov, nr_segs);
if (nbytes < sizeof(struct fuse_out_header))
return -EINVAL;
- err = fuse_copy_one(&cs, &oh, sizeof(oh));
+ err = fuse_copy_one(cs, &oh, sizeof(oh));
if (err)
goto err_finish;
@@ -1017,7 +1317,7 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
* and error contains notification code.
*/
if (!oh.unique) {
- err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), &cs);
+ err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), cs);
return err ? err : nbytes;
}
@@ -1036,7 +1336,7 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
if (req->aborted) {
spin_unlock(&fc->lock);
- fuse_copy_finish(&cs);
+ fuse_copy_finish(cs);
spin_lock(&fc->lock);
request_end(fc, req);
return -ENOENT;
@@ -1053,7 +1353,7 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
queue_interrupt(fc, req);
spin_unlock(&fc->lock);
- fuse_copy_finish(&cs);
+ fuse_copy_finish(cs);
return nbytes;
}
@@ -1061,11 +1361,13 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
list_move(&req->list, &fc->io);
req->out.h = oh;
req->locked = 1;
- cs.req = req;
+ cs->req = req;
+ if (!req->out.page_replace)
+ cs->move_pages = 0;
spin_unlock(&fc->lock);
- err = copy_out_args(&cs, &req->out, nbytes);
- fuse_copy_finish(&cs);
+ err = copy_out_args(cs, &req->out, nbytes);
+ fuse_copy_finish(cs);
spin_lock(&fc->lock);
req->locked = 0;
@@ -1081,10 +1383,101 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
err_unlock:
spin_unlock(&fc->lock);
err_finish:
- fuse_copy_finish(&cs);
+ fuse_copy_finish(cs);
return err;
}
+static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
+{
+ struct fuse_copy_state cs;
+ struct fuse_conn *fc = fuse_get_conn(iocb->ki_filp);
+ if (!fc)
+ return -EPERM;
+
+ fuse_copy_init(&cs, fc, 0, iov, nr_segs);
+
+ return fuse_dev_do_write(fc, &cs, iov_length(iov, nr_segs));
+}
+
+static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
+ struct file *out, loff_t *ppos,
+ size_t len, unsigned int flags)
+{
+ unsigned nbuf;
+ unsigned idx;
+ struct pipe_buffer *bufs;
+ struct fuse_copy_state cs;
+ struct fuse_conn *fc;
+ size_t rem;
+ ssize_t ret;
+
+ fc = fuse_get_conn(out);
+ if (!fc)
+ return -EPERM;
+
+ bufs = kmalloc(pipe->buffers * sizeof (struct pipe_buffer), GFP_KERNEL);
+ if (!bufs)
+ return -ENOMEM;
+
+ pipe_lock(pipe);
+ nbuf = 0;
+ rem = 0;
+ for (idx = 0; idx < pipe->nrbufs && rem < len; idx++)
+ rem += pipe->bufs[(pipe->curbuf + idx) & (pipe->buffers - 1)].len;
+
+ ret = -EINVAL;
+ if (rem < len) {
+ pipe_unlock(pipe);
+ goto out;
+ }
+
+ rem = len;
+ while (rem) {
+ struct pipe_buffer *ibuf;
+ struct pipe_buffer *obuf;
+
+ BUG_ON(nbuf >= pipe->buffers);
+ BUG_ON(!pipe->nrbufs);
+ ibuf = &pipe->bufs[pipe->curbuf];
+ obuf = &bufs[nbuf];
+
+ if (rem >= ibuf->len) {
+ *obuf = *ibuf;
+ ibuf->ops = NULL;
+ pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
+ pipe->nrbufs--;
+ } else {
+ ibuf->ops->get(pipe, ibuf);
+ *obuf = *ibuf;
+ obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
+ obuf->len = rem;
+ ibuf->offset += obuf->len;
+ ibuf->len -= obuf->len;
+ }
+ nbuf++;
+ rem -= obuf->len;
+ }
+ pipe_unlock(pipe);
+
+ fuse_copy_init(&cs, fc, 0, NULL, nbuf);
+ cs.pipebufs = bufs;
+ cs.pipe = pipe;
+
+ if (flags & SPLICE_F_MOVE)
+ cs.move_pages = 1;
+
+ ret = fuse_dev_do_write(fc, &cs, len);
+
+ for (idx = 0; idx < nbuf; idx++) {
+ struct pipe_buffer *buf = &bufs[idx];
+ buf->ops->release(pipe, buf);
+ }
+out:
+ kfree(bufs);
+ return ret;
+}
+
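The write-side counterpart: SPLICE_F_MOVE is what sets cs.move_pages above, letting reply pages be stolen rather than copied. Again a hedged userspace sketch, not patch code:

#define _GNU_SOURCE
#include <fcntl.h>
#include <sys/uio.h>
#include <unistd.h>

/* Illustrative: vmsplice() a reply into a pipe, then splice() it to
 * /dev/fuse; SPLICE_F_MOVE asks the kernel to steal the pages
 * (cs.move_pages above) instead of copying them. */
static ssize_t write_reply_via_pipe(int fuse_fd, int pipefd[2],
                                    const struct iovec *iov, int iovcnt,
                                    size_t len)
{
        ssize_t n = vmsplice(pipefd[1], iov, iovcnt, SPLICE_F_GIFT);
        if (n < 0)
                return n;
        return splice(pipefd[0], NULL, fuse_fd, NULL, len, SPLICE_F_MOVE);
}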
static unsigned fuse_dev_poll(struct file *file, poll_table *wait)
{
unsigned mask = POLLOUT | POLLWRNORM;
@@ -1226,8 +1619,10 @@ const struct file_operations fuse_dev_operations = {
.llseek = no_llseek,
.read = do_sync_read,
.aio_read = fuse_dev_read,
+ .splice_read = fuse_dev_splice_read,
.write = do_sync_write,
.aio_write = fuse_dev_write,
+ .splice_write = fuse_dev_splice_write,
.poll = fuse_dev_poll,
.release = fuse_dev_release,
.fasync = fuse_dev_fasync,
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 4787ae6c5c1c..431be0795b6b 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1016,7 +1016,7 @@ static int fuse_permission(struct inode *inode, int mask)
exist. So if permissions are revoked this won't be
noticed immediately, only after the attribute
timeout has expired */
- } else if (mask & MAY_ACCESS) {
+ } else if (mask & (MAY_ACCESS | MAY_CHDIR)) {
err = fuse_access(inode, mask);
} else if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode)) {
if (!(inode->i_mode & S_IXUGO)) {
@@ -1156,10 +1156,9 @@ static int fuse_dir_release(struct inode *inode, struct file *file)
return 0;
}
-static int fuse_dir_fsync(struct file *file, struct dentry *de, int datasync)
+static int fuse_dir_fsync(struct file *file, int datasync)
{
- /* nfsd can call this with no file */
- return file ? fuse_fsync_common(file, de, datasync, 1) : 0;
+ return fuse_fsync_common(file, datasync, 1);
}
static bool update_mtime(unsigned ivalid)
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index a9f5e137f1d3..ada0adeb3bb5 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -351,10 +351,9 @@ static void fuse_sync_writes(struct inode *inode)
fuse_release_nowrite(inode);
}
-int fuse_fsync_common(struct file *file, struct dentry *de, int datasync,
- int isdir)
+int fuse_fsync_common(struct file *file, int datasync, int isdir)
{
- struct inode *inode = de->d_inode;
+ struct inode *inode = file->f_mapping->host;
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_file *ff = file->private_data;
struct fuse_req *req;
@@ -403,9 +402,9 @@ int fuse_fsync_common(struct file *file, struct dentry *de, int datasync,
return err;
}
-static int fuse_fsync(struct file *file, struct dentry *de, int datasync)
+static int fuse_fsync(struct file *file, int datasync)
{
- return fuse_fsync_common(file, de, datasync, 0);
+ return fuse_fsync_common(file, datasync, 0);
}
void fuse_read_fill(struct fuse_req *req, struct file *file, loff_t pos,
@@ -517,17 +516,26 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
int i;
size_t count = req->misc.read.in.size;
size_t num_read = req->out.args[0].size;
- struct inode *inode = req->pages[0]->mapping->host;
+ struct address_space *mapping = NULL;
- /*
- * Short read means EOF. If file size is larger, truncate it
- */
- if (!req->out.h.error && num_read < count) {
- loff_t pos = page_offset(req->pages[0]) + num_read;
- fuse_read_update_size(inode, pos, req->misc.read.attr_ver);
- }
+ for (i = 0; mapping == NULL && i < req->num_pages; i++)
+ mapping = req->pages[i]->mapping;
- fuse_invalidate_attr(inode); /* atime changed */
+ if (mapping) {
+ struct inode *inode = mapping->host;
+
+ /*
+ * Short read means EOF. If file size is larger, truncate it
+ */
+ if (!req->out.h.error && num_read < count) {
+ loff_t pos;
+
+ pos = page_offset(req->pages[0]) + num_read;
+ fuse_read_update_size(inode, pos,
+ req->misc.read.attr_ver);
+ }
+ fuse_invalidate_attr(inode); /* atime changed */
+ }
for (i = 0; i < req->num_pages; i++) {
struct page *page = req->pages[i];
@@ -536,6 +544,7 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
else
SetPageError(page);
unlock_page(page);
+ page_cache_release(page);
}
if (req->ff)
fuse_file_put(req->ff);
@@ -550,6 +559,7 @@ static void fuse_send_readpages(struct fuse_req *req, struct file *file)
req->out.argpages = 1;
req->out.page_zeroing = 1;
+ req->out.page_replace = 1;
fuse_read_fill(req, file, pos, count, FUSE_READ);
req->misc.read.attr_ver = fuse_get_attr_version(fc);
if (fc->async_read) {
@@ -589,6 +599,7 @@ static int fuse_readpages_fill(void *_data, struct page *page)
return PTR_ERR(req);
}
}
+ page_cache_get(page);
req->pages[req->num_pages] = page;
req->num_pages++;
return 0;
@@ -994,10 +1005,7 @@ static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf,
nbytes = min_t(size_t, nbytes, FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT);
npages = (nbytes + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
npages = clamp(npages, 1, FUSE_MAX_PAGES_PER_REQ);
- down_read(&current->mm->mmap_sem);
- npages = get_user_pages(current, current->mm, user_addr, npages, !write,
- 0, req->pages, NULL);
- up_read(&current->mm->mmap_sem);
+ npages = get_user_pages_fast(user_addr, npages, !write, req->pages);
if (npages < 0)
return npages;
@@ -1580,9 +1588,9 @@ static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov,
while (iov_iter_count(&ii)) {
struct page *page = pages[page_idx++];
size_t todo = min_t(size_t, PAGE_SIZE, iov_iter_count(&ii));
- void *kaddr, *map;
+ void *kaddr;
- kaddr = map = kmap(page);
+ kaddr = kmap(page);
while (todo) {
char __user *uaddr = ii.iov->iov_base + ii.iov_offset;
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 01cc462ff45d..8f309f04064e 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -177,6 +177,9 @@ struct fuse_out {
/** Zero partially or not copied pages */
unsigned page_zeroing:1;
+ /** Pages may be replaced with new ones */
+ unsigned page_replace:1;
+
/** Number of arguments */
unsigned numargs;
@@ -568,8 +571,7 @@ void fuse_release_common(struct file *file, int opcode);
/**
* Send FSYNC or FSYNCDIR request
*/
-int fuse_fsync_common(struct file *file, struct dentry *de, int datasync,
- int isdir);
+int fuse_fsync_common(struct file *file, int datasync, int isdir);
/**
* Notify poll wakeup
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index a739a0a48067..5e96cbd8a454 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -136,10 +136,7 @@ static int gfs2_writeback_writepage(struct page *page,
if (ret <= 0)
return ret;
- ret = mpage_writepage(page, gfs2_get_block_noalloc, wbc);
- if (ret == -EAGAIN)
- ret = block_write_full_page(page, gfs2_get_block_noalloc, wbc);
- return ret;
+ return nobh_writepage(page, gfs2_get_block_noalloc, wbc);
}
/**
@@ -637,9 +634,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
}
}
- error = gfs2_write_alloc_required(ip, pos, len, &alloc_required);
- if (error)
- goto out_unlock;
+ alloc_required = gfs2_write_alloc_required(ip, pos, len);
if (alloc_required || gfs2_is_jdata(ip))
gfs2_write_calc_reserv(ip, len, &data_blocks, &ind_blocks);
@@ -700,8 +695,14 @@ out:
return 0;
page_cache_release(page);
+
+ /*
+ * XXX(hch): the call below should probably be replaced with
+ * a call to the gfs2-specific truncate blocks helper to actually
+ * release disk blocks..
+ */
if (pos + len > ip->i_inode.i_size)
- vmtruncate(&ip->i_inode, ip->i_inode.i_size);
+ simple_setsize(&ip->i_inode, ip->i_inode.i_size);
out_endtrans:
gfs2_trans_end(sdp);
out_trans_fail:
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 4a48c0f4b402..6f482809d1a3 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -1040,7 +1040,8 @@ static int trunc_start(struct gfs2_inode *ip, u64 size)
goto out;
if (gfs2_is_stuffed(ip)) {
- u64 dsize = size + sizeof(struct gfs2_inode);
+ u64 dsize = size + sizeof(struct gfs2_dinode);
+ ip->i_disksize = size;
ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
gfs2_trans_add_bh(ip->i_gl, dibh, 1);
gfs2_dinode_out(ip, dibh->b_data);
@@ -1243,13 +1244,12 @@ int gfs2_file_dealloc(struct gfs2_inode *ip)
* @ip: the file being written to
* @offset: the offset to write to
* @len: the number of bytes being written
- * @alloc_required: set to 1 if an alloc is required, 0 otherwise
*
- * Returns: errno
+ * Returns: 1 if an alloc is required, 0 otherwise
*/
int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
- unsigned int len, int *alloc_required)
+ unsigned int len)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct buffer_head bh;
@@ -1257,26 +1257,23 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
u64 lblock, lblock_stop, size;
u64 end_of_file;
- *alloc_required = 0;
-
if (!len)
return 0;
if (gfs2_is_stuffed(ip)) {
if (offset + len >
sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode))
- *alloc_required = 1;
+ return 1;
return 0;
}
- *alloc_required = 1;
shift = sdp->sd_sb.sb_bsize_shift;
BUG_ON(gfs2_is_dir(ip));
end_of_file = (ip->i_disksize + sdp->sd_sb.sb_bsize - 1) >> shift;
lblock = offset >> shift;
lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
if (lblock_stop > end_of_file)
- return 0;
+ return 1;
size = (lblock_stop - lblock) << shift;
do {
@@ -1284,12 +1281,11 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
bh.b_size = size;
gfs2_block_map(&ip->i_inode, lblock, &bh, 0);
if (!buffer_mapped(&bh))
- return 0;
+ return 1;
size -= bh.b_size;
lblock += (bh.b_size >> ip->i_inode.i_blkbits);
} while(size > 0);
- *alloc_required = 0;
return 0;
}
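A caller of the changed interface now reads like this sketch; example_reserve_blocks() is a hypothetical helper, not part of this patch:

/* Hedged sketch of the new calling convention: a boolean result, no
 * errno and no output parameter. */
static int example_prepare_write(struct gfs2_inode *ip, u64 offset,
                                 unsigned int len)
{
        if (gfs2_write_alloc_required(ip, offset, len))
                return example_reserve_blocks(ip, len);
        return 0;
}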
diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h
index c983177e05ac..a20a5213135a 100644
--- a/fs/gfs2/bmap.h
+++ b/fs/gfs2/bmap.h
@@ -52,6 +52,6 @@ int gfs2_truncatei(struct gfs2_inode *ip, u64 size);
int gfs2_truncatei_resume(struct gfs2_inode *ip);
int gfs2_file_dealloc(struct gfs2_inode *ip);
int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
- unsigned int len, int *alloc_required);
+ unsigned int len);
#endif /* __BMAP_DOT_H__ */
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 8295c5b5d4a9..b9dd88a78dd4 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -392,7 +392,7 @@ static int gfs2_dirent_find_space(const struct gfs2_dirent *dent,
unsigned totlen = be16_to_cpu(dent->de_rec_len);
if (gfs2_dirent_sentinel(dent))
- actual = GFS2_DIRENT_SIZE(0);
+ actual = 0;
if (totlen - actual >= required)
return 1;
return 0;
@@ -955,7 +955,12 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)
/* Change the pointers.
Don't bother distinguishing stuffed from non-stuffed.
This code is complicated enough already. */
- lp = kmalloc(half_len * sizeof(__be64), GFP_NOFS | __GFP_NOFAIL);
+ lp = kmalloc(half_len * sizeof(__be64), GFP_NOFS);
+ if (!lp) {
+ error = -ENOMEM;
+ goto fail_brelse;
+ }
+
/* Change the pointers */
for (x = 0; x < half_len; x++)
lp[x] = cpu_to_be64(bn);
@@ -1063,7 +1068,9 @@ static int dir_double_exhash(struct gfs2_inode *dip)
/* Allocate both the "from" and "to" buffers in one big chunk */
- buf = kcalloc(3, sdp->sd_hash_bsize, GFP_NOFS | __GFP_NOFAIL);
+ buf = kcalloc(3, sdp->sd_hash_bsize, GFP_NOFS);
+ if (!buf)
+ return -ENOMEM;
for (block = dip->i_disksize >> sdp->sd_hash_bsize_shift; block--;) {
error = gfs2_dir_read_data(dip, (char *)buf,
@@ -1231,6 +1238,25 @@ static int do_filldir_main(struct gfs2_inode *dip, u64 *offset,
return 0;
}
+static void *gfs2_alloc_sort_buffer(unsigned size)
+{
+ void *ptr = NULL;
+
+ if (size < KMALLOC_MAX_SIZE)
+ ptr = kmalloc(size, GFP_NOFS | __GFP_NOWARN);
+ if (!ptr)
+ ptr = __vmalloc(size, GFP_NOFS, PAGE_KERNEL);
+ return ptr;
+}
+
+static void gfs2_free_sort_buffer(void *ptr)
+{
+ if (is_vmalloc_addr(ptr))
+ vfree(ptr);
+ else
+ kfree(ptr);
+}
+
static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
filldir_t filldir, int *copied, unsigned *depth,
u64 leaf_no)
@@ -1271,7 +1297,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
* 99 is the maximum number of entries that can fit in a single
* leaf block.
*/
- larr = vmalloc((leaves + entries + 99) * sizeof(void *));
+ larr = gfs2_alloc_sort_buffer((leaves + entries + 99) * sizeof(void *));
if (!larr)
goto out;
darr = (const struct gfs2_dirent **)(larr + leaves);
@@ -1282,7 +1308,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
do {
error = get_leaf(ip, lfn, &bh);
if (error)
- goto out_kfree;
+ goto out_free;
lf = (struct gfs2_leaf *)bh->b_data;
lfn = be64_to_cpu(lf->lf_next);
if (lf->lf_entries) {
@@ -1291,7 +1317,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
gfs2_dirent_gather, NULL, &g);
error = PTR_ERR(dent);
if (IS_ERR(dent))
- goto out_kfree;
+ goto out_free;
if (entries2 != g.offset) {
fs_warn(sdp, "Number of entries corrupt in dir "
"leaf %llu, entries2 (%u) != "
@@ -1300,7 +1326,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
entries2, g.offset);
error = -EIO;
- goto out_kfree;
+ goto out_free;
}
error = 0;
larr[leaf++] = bh;
@@ -1312,10 +1338,10 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
BUG_ON(entries2 != entries);
error = do_filldir_main(ip, offset, opaque, filldir, darr,
entries, copied);
-out_kfree:
+out_free:
for(i = 0; i < leaf; i++)
brelse(larr[i]);
- vfree(larr);
+ gfs2_free_sort_buffer(larr);
out:
return error;
}
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index b20bfcc9fa2d..4edd662c8232 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -351,7 +351,6 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
unsigned long last_index;
u64 pos = page->index << PAGE_CACHE_SHIFT;
unsigned int data_blocks, ind_blocks, rblocks;
- int alloc_required = 0;
struct gfs2_holder gh;
struct gfs2_alloc *al;
int ret;
@@ -364,8 +363,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
set_bit(GLF_DIRTY, &ip->i_gl->gl_flags);
set_bit(GIF_SW_PAGED, &ip->i_flags);
- ret = gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE, &alloc_required);
- if (ret || !alloc_required)
+ if (!gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE))
goto out_unlock;
ret = -ENOMEM;
al = gfs2_alloc_get(ip);
@@ -554,9 +552,9 @@ static int gfs2_close(struct inode *inode, struct file *file)
* Returns: errno
*/
-static int gfs2_fsync(struct file *file, struct dentry *dentry, int datasync)
+static int gfs2_fsync(struct file *file, int datasync)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = file->f_mapping->host;
int sync_state = inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC);
int ret = 0;
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index ddcdbf493536..9adf8f924e08 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -328,6 +328,30 @@ static void gfs2_holder_wake(struct gfs2_holder *gh)
}
/**
+ * do_error - Something unexpected has happened during a lock request
+ * @gl: The glock
+ * @ret: The status returned by the DLM
+ */
+
+static inline void do_error(struct gfs2_glock *gl, const int ret)
+{
+ struct gfs2_holder *gh, *tmp;
+
+ list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) {
+ if (test_bit(HIF_HOLDER, &gh->gh_iflags))
+ continue;
+ if (ret & LM_OUT_ERROR)
+ gh->gh_error = -EIO;
+ else if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))
+ gh->gh_error = GLR_TRYFAILED;
+ else
+ continue;
+ list_del_init(&gh->gh_list);
+ trace_gfs2_glock_queue(gh, 0);
+ gfs2_holder_wake(gh);
+ }
+}
+
+/**
* do_promote - promote as many requests as possible on the current queue
* @gl: The glock
*
@@ -375,36 +399,13 @@ restart:
}
if (gh->gh_list.prev == &gl->gl_holders)
return 1;
+ do_error(gl, 0);
break;
}
return 0;
}
/**
- * do_error - Something unexpected has happened during a lock request
- *
- */
-
-static inline void do_error(struct gfs2_glock *gl, const int ret)
-{
- struct gfs2_holder *gh, *tmp;
-
- list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) {
- if (test_bit(HIF_HOLDER, &gh->gh_iflags))
- continue;
- if (ret & LM_OUT_ERROR)
- gh->gh_error = -EIO;
- else if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))
- gh->gh_error = GLR_TRYFAILED;
- else
- continue;
- list_del_init(&gh->gh_list);
- trace_gfs2_glock_queue(gh, 0);
- gfs2_holder_wake(gh);
- }
-}
-
-/**
* find_first_waiter - find the first gh that's waiting for the glock
* @gl: the glock
*/
@@ -1062,6 +1063,9 @@ int gfs2_glock_nq(struct gfs2_holder *gh)
spin_lock(&gl->gl_spin);
add_to_queue(gh);
+ if ((LM_FLAG_NOEXP & gh->gh_flags) &&
+ test_and_clear_bit(GLF_FROZEN, &gl->gl_flags))
+ set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
run_queue(gl, 1);
spin_unlock(&gl->gl_spin);
@@ -1319,6 +1323,36 @@ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state)
}
/**
+ * gfs2_should_freeze - Figure out if glock should be frozen
+ * @gl: The glock in question
+ *
+ * Glocks are not frozen if (a) the result of the dlm operation is
+ * an error, (b) the locking operation was an unlock operation, or
+ * (c) there is a "noexp" flagged request anywhere in the queue.
+ *
+ * Returns: 1 if freezing should occur, 0 otherwise
+ */
+
+static int gfs2_should_freeze(const struct gfs2_glock *gl)
+{
+ const struct gfs2_holder *gh;
+
+ if (gl->gl_reply & ~LM_OUT_ST_MASK)
+ return 0;
+ if (gl->gl_target == LM_ST_UNLOCKED)
+ return 0;
+
+ list_for_each_entry(gh, &gl->gl_holders, gh_list) {
+ if (test_bit(HIF_HOLDER, &gh->gh_iflags))
+ continue;
+ if (LM_FLAG_NOEXP & gh->gh_flags)
+ return 0;
+ }
+
+ return 1;
+}
+
+/**
* gfs2_glock_complete - Callback used by locking
* @gl: Pointer to the glock
* @ret: The return value from the dlm
@@ -1328,18 +1362,17 @@ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state)
void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
{
struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct;
+
gl->gl_reply = ret;
+
if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags))) {
- struct gfs2_holder *gh;
spin_lock(&gl->gl_spin);
- gh = find_first_waiter(gl);
- if ((!(gh && (gh->gh_flags & LM_FLAG_NOEXP)) &&
- (gl->gl_target != LM_ST_UNLOCKED)) ||
- ((ret & ~LM_OUT_ST_MASK) != 0))
+ if (gfs2_should_freeze(gl)) {
set_bit(GLF_FROZEN, &gl->gl_flags);
- spin_unlock(&gl->gl_spin);
- if (test_bit(GLF_FROZEN, &gl->gl_flags))
+ spin_unlock(&gl->gl_spin);
return;
+ }
+ spin_unlock(&gl->gl_spin);
}
set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
gfs2_glock_hold(gl);
@@ -1348,7 +1381,7 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
}
-static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask)
+static int gfs2_shrink_glock_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
{
struct gfs2_glock *gl;
int may_demote;
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index b5d7363b22da..8fcbce48a128 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -460,6 +460,7 @@ enum {
SDF_NOBARRIERS = 3,
SDF_NORECOVERY = 4,
SDF_DEMOTE = 5,
+ SDF_NOJOURNALID = 6,
};
#define GFS2_FSNAME_LEN 256
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index b5612cbb62a5..f03afd9c44bc 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -169,7 +169,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb,
{
struct inode *inode;
struct gfs2_inode *ip;
- struct gfs2_glock *io_gl;
+ struct gfs2_glock *io_gl = NULL;
int error;
inode = gfs2_iget(sb, no_addr);
@@ -198,6 +198,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb,
ip->i_iopen_gh.gh_gl->gl_object = ip;
gfs2_glock_put(io_gl);
+ io_gl = NULL;
if ((type == DT_UNKNOWN) && (no_formal_ino == 0))
goto gfs2_nfsbypass;
@@ -228,7 +229,8 @@ gfs2_nfsbypass:
fail_glock:
gfs2_glock_dq(&ip->i_iopen_gh);
fail_iopen:
- gfs2_glock_put(io_gl);
+ if (io_gl)
+ gfs2_glock_put(io_gl);
fail_put:
if (inode->i_state & I_NEW)
ip->i_gl->gl_object = NULL;
@@ -256,7 +258,7 @@ void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr)
{
struct gfs2_sbd *sdp;
struct gfs2_inode *ip;
- struct gfs2_glock *io_gl;
+ struct gfs2_glock *io_gl = NULL;
int error;
struct gfs2_holder gh;
struct inode *inode;
@@ -293,6 +295,7 @@ void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr)
ip->i_iopen_gh.gh_gl->gl_object = ip;
gfs2_glock_put(io_gl);
+ io_gl = NULL;
inode->i_mode = DT2IF(DT_UNKNOWN);
@@ -319,7 +322,8 @@ void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr)
fail_glock:
gfs2_glock_dq(&ip->i_iopen_gh);
fail_iopen:
- gfs2_glock_put(io_gl);
+ if (io_gl)
+ gfs2_glock_put(io_gl);
fail_put:
ip->i_gl->gl_object = NULL;
gfs2_glock_put(ip->i_gl);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 3593b3a7290e..45a4a36195d8 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -76,7 +76,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
sb->s_fs_info = sdp;
sdp->sd_vfs = sb;
-
+ set_bit(SDF_NOJOURNALID, &sdp->sd_flags);
gfs2_tune_init(&sdp->sd_tune);
init_waitqueue_head(&sdp->sd_glock_wait);
@@ -1050,7 +1050,8 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
ret = match_int(&tmp[0], &option);
if (ret || option < 0)
goto hostdata_error;
- ls->ls_jid = option;
+ if (test_and_clear_bit(SDF_NOJOURNALID, &sdp->sd_flags))
+ ls->ls_jid = option;
break;
case Opt_id:
/* Obsolete, but left for backward compat purposes */
@@ -1102,6 +1103,24 @@ void gfs2_lm_unmount(struct gfs2_sbd *sdp)
lm->lm_unmount(sdp);
}
+static int gfs2_journalid_wait(void *word)
+{
+ if (signal_pending(current))
+ return -EINTR;
+ schedule();
+ return 0;
+}
+
+static int wait_on_journal(struct gfs2_sbd *sdp)
+{
+ if (sdp->sd_args.ar_spectator)
+ return 0;
+ if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL)
+ return 0;
+
+ return wait_on_bit(&sdp->sd_flags, SDF_NOJOURNALID, gfs2_journalid_wait, TASK_INTERRUPTIBLE);
+}
+
void gfs2_online_uevent(struct gfs2_sbd *sdp)
{
struct super_block *sb = sdp->sd_vfs;
@@ -1194,6 +1213,10 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
if (error)
goto fail_locking;
+ error = wait_on_journal(sdp);
+ if (error)
+ goto fail_sb;
+
error = init_inodes(sdp, DO);
if (error)
goto fail_sb;
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 4e64352d49de..98cdd05f3316 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -1071,6 +1071,9 @@ int gfs2_permission(struct inode *inode, int mask)
return error;
}
+/*
+ * XXX: should be changed to have proper ordering by open-coding simple_setsize
+ */
static int setattr_size(struct inode *inode, struct iattr *attr)
{
struct gfs2_inode *ip = GFS2_I(inode);
@@ -1081,7 +1084,7 @@ static int setattr_size(struct inode *inode, struct iattr *attr)
error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
if (error)
return error;
- error = vmtruncate(inode, attr->ia_size);
+ error = simple_setsize(inode, attr->ia_size);
gfs2_trans_end(sdp);
if (error)
return error;
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 49667d68769e..8bb643cb2658 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -77,7 +77,7 @@ static LIST_HEAD(qd_lru_list);
static atomic_t qd_lru_count = ATOMIC_INIT(0);
static DEFINE_SPINLOCK(qd_lru_lock);
-int gfs2_shrink_qd_memory(int nr, gfp_t gfp_mask)
+int gfs2_shrink_qd_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
{
struct gfs2_quota_data *qd;
struct gfs2_sbd *sdp;
@@ -694,10 +694,8 @@ get_a_page:
if (!buffer_mapped(bh))
goto unlock_out;
/* If it's a newly allocated disk block for quota, zero it */
- if (buffer_new(bh)) {
- memset(bh->b_data, 0, bh->b_size);
- set_buffer_uptodate(bh);
- }
+ if (buffer_new(bh))
+ zero_user(page, pos - blocksize, bh->b_size);
}
if (PageUptodate(page))
@@ -723,7 +721,7 @@ get_a_page:
/* If quota straddles page boundary, we need to update the rest of the
* quota at the beginning of the next page */
- if (offset != 0) { /* first page, offset is closer to PAGE_CACHE_SIZE */
+ if ((offset + sizeof(struct gfs2_quota)) > PAGE_CACHE_SIZE) {
ptr = ptr + nbytes;
nbytes = sizeof(struct gfs2_quota) - nbytes;
offset = 0;
@@ -789,15 +787,9 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
goto out;
for (x = 0; x < num_qd; x++) {
- int alloc_required;
-
offset = qd2offset(qda[x]);
- error = gfs2_write_alloc_required(ip, offset,
- sizeof(struct gfs2_quota),
- &alloc_required);
- if (error)
- goto out_gunlock;
- if (alloc_required)
+ if (gfs2_write_alloc_required(ip, offset,
+ sizeof(struct gfs2_quota)))
nalloc++;
}
@@ -1586,10 +1578,7 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
goto out_i;
offset = qd2offset(qd);
- error = gfs2_write_alloc_required(ip, offset, sizeof(struct gfs2_quota),
- &alloc_required);
- if (error)
- goto out_i;
+ alloc_required = gfs2_write_alloc_required(ip, offset, sizeof(struct gfs2_quota));
if (alloc_required) {
al = gfs2_alloc_get(ip);
if (al == NULL)
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index 195f60c8bd14..e7d236ca48bd 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -51,7 +51,7 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip)
return ret;
}
-extern int gfs2_shrink_qd_memory(int nr, gfp_t gfp_mask);
+extern int gfs2_shrink_qd_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask);
extern const struct quotactl_ops gfs2_quotactl_ops;
#endif /* __QUOTA_DOT_H__ */
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 4d1aad38f1b1..4140811a921c 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -342,8 +342,6 @@ int gfs2_jdesc_check(struct gfs2_jdesc *jd)
{
struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
- int ar;
- int error;
if (ip->i_disksize < (8 << 20) || ip->i_disksize > (1 << 30) ||
(ip->i_disksize & (sdp->sd_sb.sb_bsize - 1))) {
@@ -352,13 +350,12 @@ int gfs2_jdesc_check(struct gfs2_jdesc *jd)
}
jd->jd_blocks = ip->i_disksize >> sdp->sd_sb.sb_bsize_shift;
- error = gfs2_write_alloc_required(ip, 0, ip->i_disksize, &ar);
- if (!error && ar) {
+ if (gfs2_write_alloc_required(ip, 0, ip->i_disksize)) {
gfs2_consist_inode(ip);
- error = -EIO;
+ return -EIO;
}
- return error;
+ return 0;
}
/**
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 37f5393e68e6..d019d0d55e00 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -325,6 +325,30 @@ static ssize_t lkfirst_show(struct gfs2_sbd *sdp, char *buf)
return sprintf(buf, "%d\n", ls->ls_first);
}
+static ssize_t lkfirst_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
+{
+ unsigned first;
+ int rv;
+
+ rv = sscanf(buf, "%u", &first);
+ if (rv != 1 || first > 1)
+ return -EINVAL;
+ spin_lock(&sdp->sd_jindex_spin);
+ rv = -EBUSY;
+ if (test_bit(SDF_NOJOURNALID, &sdp->sd_flags) == 0)
+ goto out;
+ rv = -EINVAL;
+ if (sdp->sd_args.ar_spectator)
+ goto out;
+ if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL)
+ goto out;
+ sdp->sd_lockstruct.ls_first = first;
+ rv = 0;
+out:
+ spin_unlock(&sdp->sd_jindex_spin);
+ return rv ? rv : len;
+}
+
static ssize_t first_done_show(struct gfs2_sbd *sdp, char *buf)
{
struct lm_lockstruct *ls = &sdp->sd_lockstruct;
@@ -377,14 +401,41 @@ static ssize_t jid_show(struct gfs2_sbd *sdp, char *buf)
return sprintf(buf, "%u\n", sdp->sd_lockstruct.ls_jid);
}
+static ssize_t jid_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
+{
+ unsigned jid;
+ int rv;
+
+ rv = sscanf(buf, "%u", &jid);
+ if (rv != 1)
+ return -EINVAL;
+
+ spin_lock(&sdp->sd_jindex_spin);
+ rv = -EINVAL;
+ if (sdp->sd_args.ar_spectator)
+ goto out;
+ if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL)
+ goto out;
+ rv = -EBUSY;
+ if (test_and_clear_bit(SDF_NOJOURNALID, &sdp->sd_flags) == 0)
+ goto out;
+ sdp->sd_lockstruct.ls_jid = jid;
+ smp_mb__after_clear_bit();
+ wake_up_bit(&sdp->sd_flags, SDF_NOJOURNALID);
+ rv = 0;
+out:
+ spin_unlock(&sdp->sd_jindex_spin);
+ return rv ? rv : len;
+}
+
#define GDLM_ATTR(_name,_mode,_show,_store) \
static struct gfs2_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store)
GDLM_ATTR(proto_name, 0444, proto_name_show, NULL);
GDLM_ATTR(block, 0644, block_show, block_store);
GDLM_ATTR(withdraw, 0644, withdraw_show, withdraw_store);
-GDLM_ATTR(jid, 0444, jid_show, NULL);
-GDLM_ATTR(first, 0444, lkfirst_show, NULL);
+GDLM_ATTR(jid, 0644, jid_show, jid_store);
+GDLM_ATTR(first, 0644, lkfirst_show, lkfirst_store);
GDLM_ATTR(first_done, 0444, first_done_show, NULL);
GDLM_ATTR(recover, 0600, NULL, recover_store);
GDLM_ATTR(recover_done, 0444, recover_done_show, NULL);
@@ -564,7 +615,7 @@ static int gfs2_uevent(struct kset *kset, struct kobject *kobj,
add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name);
add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name);
- if (!sdp->sd_args.ar_spectator)
+ if (!test_bit(SDF_NOJOURNALID, &sdp->sd_flags))
add_uevent_var(env, "JOURNALID=%u", sdp->sd_lockstruct.ls_jid);
if (gfs2_uuid_valid(uuid))
add_uevent_var(env, "UUID=%pUB", uuid);
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 3a029d8f4cf1..87ac1891a185 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -411,9 +411,9 @@ int hostfs_file_open(struct inode *ino, struct file *file)
return 0;
}
-int hostfs_fsync(struct file *file, struct dentry *dentry, int datasync)
+int hostfs_fsync(struct file *file, int datasync)
{
- return fsync_file(HOSTFS_I(dentry->d_inode)->fd, datasync);
+ return fsync_file(HOSTFS_I(file->f_mapping->host)->fd, datasync);
}
static const struct file_operations hostfs_file_fops = {
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index 3efabff00367..a9ae9bfa752f 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -19,9 +19,9 @@ static int hpfs_file_release(struct inode *inode, struct file *file)
return 0;
}
-int hpfs_file_fsync(struct file *file, struct dentry *dentry, int datasync)
+int hpfs_file_fsync(struct file *file, int datasync)
{
- /*return file_fsync(file, dentry);*/
+ /*return file_fsync(file, datasync);*/
return 0; /* Don't fsync :-) */
}
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index 97bf738cd5d6..75f9d4324851 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -268,7 +268,7 @@ void hpfs_set_ea(struct inode *, struct fnode *, const char *,
/* file.c */
-int hpfs_file_fsync(struct file *, struct dentry *, int);
+int hpfs_file_fsync(struct file *, int);
extern const struct file_operations hpfs_file_ops;
extern const struct inode_operations hpfs_file_iops;
extern const struct address_space_operations hpfs_aops;
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index 2e4dfa8593da..826c3f9d29ac 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -587,7 +587,7 @@ static int hppfs_readdir(struct file *file, void *ent, filldir_t filldir)
return err;
}
-static int hppfs_fsync(struct file *file, struct dentry *dentry, int datasync)
+static int hppfs_fsync(struct file *file, int datasync)
{
return 0;
}
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index a0bbd3d1b41a..a4e9a7ec3691 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -688,7 +688,7 @@ static void init_once(void *foo)
const struct file_operations hugetlbfs_file_operations = {
.read = hugetlbfs_read,
.mmap = hugetlbfs_file_mmap,
- .fsync = simple_sync_file,
+ .fsync = noop_fsync,
.get_unmapped_area = hugetlb_get_unmapped_area,
};
diff --git a/fs/inode.c b/fs/inode.c
index 2bee20ae3d65..722860b323a9 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -512,7 +512,7 @@ static void prune_icache(int nr_to_scan)
* This function is passed the number of inodes to scan, and it returns the
* total number of remaining possibly-reclaimable inodes.
*/
-static int shrink_icache_memory(int nr, gfp_t gfp_mask)
+static int shrink_icache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
{
if (nr) {
/*
diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c
index b9ab69b3a482..e0aca9a0ac68 100644
--- a/fs/isofs/dir.c
+++ b/fs/isofs/dir.c
@@ -272,6 +272,7 @@ static int isofs_readdir(struct file *filp,
const struct file_operations isofs_dir_operations =
{
+ .llseek = generic_file_llseek,
.read = generic_read_dir,
.readdir = isofs_readdir,
};
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index bc2ff5932769..036880895bfc 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -297,7 +297,6 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
struct page *new_page;
unsigned int new_offset;
struct buffer_head *bh_in = jh2bh(jh_in);
- struct jbd2_buffer_trigger_type *triggers;
journal_t *journal = transaction->t_journal;
/*
@@ -328,21 +327,21 @@ repeat:
done_copy_out = 1;
new_page = virt_to_page(jh_in->b_frozen_data);
new_offset = offset_in_page(jh_in->b_frozen_data);
- triggers = jh_in->b_frozen_triggers;
} else {
new_page = jh2bh(jh_in)->b_page;
new_offset = offset_in_page(jh2bh(jh_in)->b_data);
- triggers = jh_in->b_triggers;
}
mapped_data = kmap_atomic(new_page, KM_USER0);
/*
- * Fire any commit trigger. Do this before checking for escaping,
- * as the trigger may modify the magic offset. If a copy-out
- * happens afterwards, it will have the correct data in the buffer.
+ * Fire the data frozen trigger if the data wasn't already frozen. Do this
+ * before checking for escaping, as the trigger may modify the magic
+ * offset. If a copy-out happens afterwards, it will have the correct
+ * data in the buffer.
*/
- jbd2_buffer_commit_trigger(jh_in, mapped_data + new_offset,
- triggers);
+ if (!done_copy_out)
+ jbd2_buffer_frozen_trigger(jh_in, mapped_data + new_offset,
+ jh_in->b_triggers);
/*
* Check for escaping
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index bfc70f57900f..b8e0806681bb 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -725,6 +725,9 @@ done:
page = jh2bh(jh)->b_page;
offset = ((unsigned long) jh2bh(jh)->b_data) & ~PAGE_MASK;
source = kmap_atomic(page, KM_USER0);
+ /* Fire data frozen trigger just before we copy the data */
+ jbd2_buffer_frozen_trigger(jh, source + offset,
+ jh->b_triggers);
memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size);
kunmap_atomic(source, KM_USER0);
@@ -963,15 +966,15 @@ void jbd2_journal_set_triggers(struct buffer_head *bh,
jh->b_triggers = type;
}
-void jbd2_buffer_commit_trigger(struct journal_head *jh, void *mapped_data,
+void jbd2_buffer_frozen_trigger(struct journal_head *jh, void *mapped_data,
struct jbd2_buffer_trigger_type *triggers)
{
struct buffer_head *bh = jh2bh(jh);
- if (!triggers || !triggers->t_commit)
+ if (!triggers || !triggers->t_frozen)
return;
- triggers->t_commit(triggers, bh, mapped_data, bh->b_size);
+ triggers->t_frozen(triggers, bh, mapped_data, bh->b_size);
}
void jbd2_buffer_abort_trigger(struct journal_head *jh,
@@ -1311,7 +1314,6 @@ int jbd2_journal_stop(handle_t *handle)
if (handle->h_sync)
transaction->t_synchronous_commit = 1;
current->journal_info = NULL;
- spin_lock(&journal->j_state_lock);
spin_lock(&transaction->t_handle_lock);
transaction->t_outstanding_credits -= handle->h_buffer_credits;
transaction->t_updates--;
@@ -1340,8 +1342,7 @@ int jbd2_journal_stop(handle_t *handle)
jbd_debug(2, "transaction too old, requesting commit for "
"handle %p\n", handle);
/* This is non-blocking */
- __jbd2_log_start_commit(journal, transaction->t_tid);
- spin_unlock(&journal->j_state_lock);
+ jbd2_log_start_commit(journal, transaction->t_tid);
/*
* Special case: JBD2_SYNC synchronous updates require us
@@ -1351,7 +1352,6 @@ int jbd2_journal_stop(handle_t *handle)
err = jbd2_log_wait_commit(journal, tid);
} else {
spin_unlock(&transaction->t_handle_lock);
- spin_unlock(&journal->j_state_lock);
}
lock_map_release(&handle->h_lockdep_map);
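Users of the renamed hook implement t_frozen instead of t_commit; it now fires when the buffer data is frozen (in either path patched above) rather than at commit time. An illustrative trigger, with a made-up checksum use case:

#include <linux/jbd2.h>

/* Illustrative trigger: adjust per-block metadata (say, a checksum)
 * in the frozen image just before it goes to the journal. Modify
 * mapped_data, not bh->b_data. */
static void example_frozen(struct jbd2_buffer_trigger_type *triggers,
                           struct buffer_head *bh, void *mapped_data,
                           size_t size)
{
        /* recompute checksum over mapped_data[0..size) here */
}

static struct jbd2_buffer_trigger_type example_triggers = {
        .t_frozen = example_frozen,
        .t_abort  = NULL,
};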
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index a33aab6b5e68..54a92fd02bbd 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -234,8 +234,9 @@ static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
if (inode->i_mode != mode) {
struct iattr attr;
- attr.ia_valid = ATTR_MODE;
+ attr.ia_valid = ATTR_MODE | ATTR_CTIME;
attr.ia_mode = mode;
+ attr.ia_ctime = CURRENT_TIME_SEC;
rc = jffs2_do_setattr(inode, &attr);
if (rc < 0)
return rc;
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 7aa4417e085f..166062a68230 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -222,15 +222,18 @@ static int jffs2_create(struct inode *dir_i, struct dentry *dentry, int mode,
dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(ri->ctime));
jffs2_free_raw_inode(ri);
- d_instantiate(dentry, inode);
D1(printk(KERN_DEBUG "jffs2_create: Created ino #%lu with mode %o, nlink %d(%d). nrpages %ld\n",
inode->i_ino, inode->i_mode, inode->i_nlink,
f->inocache->pino_nlink, inode->i_mapping->nrpages));
+
+ d_instantiate(dentry, inode);
+ unlock_new_inode(inode);
return 0;
fail:
make_bad_inode(inode);
+ unlock_new_inode(inode);
iput(inode);
jffs2_free_raw_inode(ri);
return ret;
@@ -360,8 +363,8 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
/* Eeek. Wave bye bye */
mutex_unlock(&f->sem);
jffs2_complete_reservation(c);
- jffs2_clear_inode(inode);
- return PTR_ERR(fn);
+ ret = PTR_ERR(fn);
+ goto fail;
}
/* We use f->target field to store the target path. */
@@ -370,8 +373,8 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
printk(KERN_WARNING "Can't allocate %d bytes of memory\n", targetlen + 1);
mutex_unlock(&f->sem);
jffs2_complete_reservation(c);
- jffs2_clear_inode(inode);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto fail;
}
memcpy(f->target, target, targetlen + 1);
@@ -386,30 +389,24 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
jffs2_complete_reservation(c);
ret = jffs2_init_security(inode, dir_i);
- if (ret) {
- jffs2_clear_inode(inode);
- return ret;
- }
+ if (ret)
+ goto fail;
+
ret = jffs2_init_acl_post(inode);
- if (ret) {
- jffs2_clear_inode(inode);
- return ret;
- }
+ if (ret)
+ goto fail;
ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen,
ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
- if (ret) {
- /* Eep. */
- jffs2_clear_inode(inode);
- return ret;
- }
+ if (ret)
+ goto fail;
rd = jffs2_alloc_raw_dirent();
if (!rd) {
/* Argh. Now we treat it like a normal delete */
jffs2_complete_reservation(c);
- jffs2_clear_inode(inode);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto fail;
}
dir_f = JFFS2_INODE_INFO(dir_i);
@@ -437,8 +434,8 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
jffs2_complete_reservation(c);
jffs2_free_raw_dirent(rd);
mutex_unlock(&dir_f->sem);
- jffs2_clear_inode(inode);
- return PTR_ERR(fd);
+ ret = PTR_ERR(fd);
+ goto fail;
}
dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime));
@@ -453,7 +450,14 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
jffs2_complete_reservation(c);
d_instantiate(dentry, inode);
+ unlock_new_inode(inode);
return 0;
+
+ fail:
+ make_bad_inode(inode);
+ unlock_new_inode(inode);
+ iput(inode);
+ return ret;
}
@@ -519,8 +523,8 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode)
/* Eeek. Wave bye bye */
mutex_unlock(&f->sem);
jffs2_complete_reservation(c);
- jffs2_clear_inode(inode);
- return PTR_ERR(fn);
+ ret = PTR_ERR(fn);
+ goto fail;
}
/* No data here. Only a metadata node, which will be
obsoleted by the first data write
@@ -531,30 +535,24 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode)
jffs2_complete_reservation(c);
ret = jffs2_init_security(inode, dir_i);
- if (ret) {
- jffs2_clear_inode(inode);
- return ret;
- }
+ if (ret)
+ goto fail;
+
ret = jffs2_init_acl_post(inode);
- if (ret) {
- jffs2_clear_inode(inode);
- return ret;
- }
+ if (ret)
+ goto fail;
ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen,
ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
- if (ret) {
- /* Eep. */
- jffs2_clear_inode(inode);
- return ret;
- }
+ if (ret)
+ goto fail;
rd = jffs2_alloc_raw_dirent();
if (!rd) {
/* Argh. Now we treat it like a normal delete */
jffs2_complete_reservation(c);
- jffs2_clear_inode(inode);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto fail;
}
dir_f = JFFS2_INODE_INFO(dir_i);
@@ -582,8 +580,8 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode)
jffs2_complete_reservation(c);
jffs2_free_raw_dirent(rd);
mutex_unlock(&dir_f->sem);
- jffs2_clear_inode(inode);
- return PTR_ERR(fd);
+ ret = PTR_ERR(fd);
+ goto fail;
}
dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime));
@@ -599,7 +597,14 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode)
jffs2_complete_reservation(c);
d_instantiate(dentry, inode);
+ unlock_new_inode(inode);
return 0;
+
+ fail:
+ make_bad_inode(inode);
+ unlock_new_inode(inode);
+ iput(inode);
+ return ret;
}
static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry)
@@ -693,8 +698,8 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de
/* Eeek. Wave bye bye */
mutex_unlock(&f->sem);
jffs2_complete_reservation(c);
- jffs2_clear_inode(inode);
- return PTR_ERR(fn);
+ ret = PTR_ERR(fn);
+ goto fail;
}
/* No data here. Only a metadata node, which will be
obsoleted by the first data write
@@ -705,30 +710,24 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de
jffs2_complete_reservation(c);
ret = jffs2_init_security(inode, dir_i);
- if (ret) {
- jffs2_clear_inode(inode);
- return ret;
- }
+ if (ret)
+ goto fail;
+
ret = jffs2_init_acl_post(inode);
- if (ret) {
- jffs2_clear_inode(inode);
- return ret;
- }
+ if (ret)
+ goto fail;
ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen,
ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
- if (ret) {
- /* Eep. */
- jffs2_clear_inode(inode);
- return ret;
- }
+ if (ret)
+ goto fail;
rd = jffs2_alloc_raw_dirent();
if (!rd) {
/* Argh. Now we treat it like a normal delete */
jffs2_complete_reservation(c);
- jffs2_clear_inode(inode);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto fail;
}
dir_f = JFFS2_INODE_INFO(dir_i);
@@ -759,8 +758,8 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de
jffs2_complete_reservation(c);
jffs2_free_raw_dirent(rd);
mutex_unlock(&dir_f->sem);
- jffs2_clear_inode(inode);
- return PTR_ERR(fd);
+ ret = PTR_ERR(fd);
+ goto fail;
}
dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime));
@@ -775,8 +774,14 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de
jffs2_complete_reservation(c);
d_instantiate(dentry, inode);
-
+ unlock_new_inode(inode);
return 0;
+
+ fail:
+ make_bad_inode(inode);
+ unlock_new_inode(inode);
+ iput(inode);
+ return ret;
}
static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index e7291c161a19..813497024437 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -26,9 +26,9 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
struct page **pagep, void **fsdata);
static int jffs2_readpage (struct file *filp, struct page *pg);
-int jffs2_fsync(struct file *filp, struct dentry *dentry, int datasync)
+int jffs2_fsync(struct file *filp, int datasync)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = filp->f_mapping->host;
struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
/* Trigger GC to flush any pending writes for this inode */
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 86e0821fc989..459d39d1ea0b 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -169,13 +169,13 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr)
mutex_unlock(&f->sem);
jffs2_complete_reservation(c);
- /* We have to do the vmtruncate() without f->sem held, since
+ /* We have to do the simple_setsize() without f->sem held, since
some pages may be locked and waiting for it in readpage().
We are protected from a simultaneous write() extending i_size
back past iattr->ia_size, because do_truncate() holds the
generic inode semaphore. */
if (ivalid & ATTR_SIZE && inode->i_size > iattr->ia_size) {
- vmtruncate(inode, iattr->ia_size);
+ simple_setsize(inode, iattr->ia_size);
inode->i_blocks = (inode->i_size + 511) >> 9;
}
@@ -465,7 +465,12 @@ struct inode *jffs2_new_inode (struct inode *dir_i, int mode, struct jffs2_raw_i
inode->i_blocks = 0;
inode->i_size = 0;
- insert_inode_hash(inode);
+ if (insert_inode_locked(inode) < 0) {
+ make_bad_inode(inode);
+ unlock_new_inode(inode);
+ iput(inode);
+ return ERR_PTR(-EINVAL);
+ }
return inode;
}
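With insert_inode_locked() in place, every create-style path must unlock the new inode exactly once; the dir.c hunks above all follow the shape sketched here (kernel context assumed, example_do_create() is hypothetical):

/* Hedged sketch of the common create-path shape after this change. */
static int example_create(struct inode *dir_i, struct dentry *dentry,
                          int mode, struct jffs2_raw_inode *ri)
{
        struct inode *inode = jffs2_new_inode(dir_i, mode, ri);
        int ret;

        if (IS_ERR(inode))
                return PTR_ERR(inode);

        ret = example_do_create(inode);
        if (ret)
                goto fail;

        d_instantiate(dentry, inode);
        unlock_new_inode(inode);        /* success: unlock exactly once */
        return 0;

 fail:
        make_bad_inode(inode);
        unlock_new_inode(inode);        /* failure paths unlock too */
        iput(inode);
        return ret;
}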
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index 035a767f958b..4791aacf3084 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -158,7 +158,7 @@ extern const struct inode_operations jffs2_dir_inode_operations;
extern const struct file_operations jffs2_file_operations;
extern const struct inode_operations jffs2_file_inode_operations;
extern const struct address_space_operations jffs2_file_address_operations;
-int jffs2_fsync(struct file *, struct dentry *, int);
+int jffs2_fsync(struct file *, int);
int jffs2_do_readpage_unlock (struct inode *inode, struct page *pg);
/* ioctl.c */
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index a2d58c96f1b4..d258e261bdc7 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -626,7 +626,7 @@ void jffs2_xattr_free_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *i
static int check_xattr_ref_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic)
{
- /* success of check_xattr_ref_inode() means taht inode (ic) dose not have
+ /* success of check_xattr_ref_inode() means that inode (ic) does not have
* duplicate name/value pairs. If a duplicate name/value pair is found,
* one will be removed.
*/
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 85d9ec659225..127263cc8657 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -27,9 +27,9 @@
#include "jfs_acl.h"
#include "jfs_debug.h"
-int jfs_fsync(struct file *file, struct dentry *dentry, int datasync)
+int jfs_fsync(struct file *file, int datasync)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = file->f_mapping->host;
int rc = 0;
if (!(inode->i_state & I_DIRTY) ||
diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h
index 9e6bda30a6e8..11042b1f44b5 100644
--- a/fs/jfs/jfs_inode.h
+++ b/fs/jfs/jfs_inode.h
@@ -21,7 +21,7 @@
struct fid;
extern struct inode *ialloc(struct inode *, umode_t);
-extern int jfs_fsync(struct file *, struct dentry *, int);
+extern int jfs_fsync(struct file *, int);
extern long jfs_ioctl(struct file *, unsigned int, unsigned long);
extern long jfs_compat_ioctl(struct file *, unsigned int, unsigned long);
extern struct inode *jfs_iget(struct super_block *, unsigned long);
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index b66832ac33ac..b38f96bef829 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -179,6 +179,8 @@ static void jfs_put_super(struct super_block *sb)
jfs_info("In jfs_put_super");
+ dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
+
lock_kernel();
rc = jfs_umount(sb);
@@ -396,10 +398,20 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data)
JFS_SBI(sb)->flag = flag;
ret = jfs_mount_rw(sb, 1);
+
+ /* mark the fs r/w for quota activity */
+ sb->s_flags &= ~MS_RDONLY;
+
unlock_kernel();
+ dquot_resume(sb, -1);
return ret;
}
if ((!(sb->s_flags & MS_RDONLY)) && (*flags & MS_RDONLY)) {
+ rc = dquot_suspend(sb, -1);
+ if (rc < 0) {
+ unlock_kernel();
+ return rc;
+ }
rc = jfs_umount_rw(sb);
JFS_SBI(sb)->flag = flag;
unlock_kernel();
@@ -469,6 +481,10 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
*/
sb->s_op = &jfs_super_operations;
sb->s_export_op = &jfs_export_operations;
+#ifdef CONFIG_QUOTA
+ sb->dq_op = &dquot_operations;
+ sb->s_qcop = &dquot_quotactl_ops;
+#endif
/*
* Initialize direct-mapping inode/address-space
diff --git a/fs/libfs.c b/fs/libfs.c
index 232bea425b09..dcaf972cbf1b 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -8,6 +8,7 @@
#include <linux/slab.h>
#include <linux/mount.h>
#include <linux/vfs.h>
+#include <linux/quotaops.h>
#include <linux/mutex.h>
#include <linux/exportfs.h>
#include <linux/writeback.h>
@@ -58,11 +59,6 @@ struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, struct na
return NULL;
}
-int simple_sync_file(struct file * file, struct dentry *dentry, int datasync)
-{
- return 0;
-}
-
int dcache_dir_open(struct inode *inode, struct file *file)
{
static struct qstr cursor_name = {.len = 1, .name = "."};
@@ -190,7 +186,7 @@ const struct file_operations simple_dir_operations = {
.llseek = dcache_dir_lseek,
.read = generic_read_dir,
.readdir = dcache_readdir,
- .fsync = simple_sync_file,
+ .fsync = noop_fsync,
};
const struct inode_operations simple_dir_inode_operations = {
@@ -330,6 +326,81 @@ int simple_rename(struct inode *old_dir, struct dentry *old_dentry,
return 0;
}
+/**
+ * simple_setsize - handle core mm and vfs requirements for file size change
+ * @inode: inode
+ * @newsize: new file size
+ *
+ * Returns 0 on success, -error on failure.
+ *
+ * simple_setsize must be called with the inode's i_mutex held.
+ *
+ * simple_setsize will check that the requested new size is OK (see
+ * inode_newsize_ok), and then will perform the necessary i_size update
+ * and pagecache truncation (if necessary). It will typically be called
+ * from the filesystem's setattr function when ATTR_SIZE is passed in.
+ *
+ * The inode itself must have correct permissions and attributes to allow
+ * i_size to be changed; this function then just checks that the requested
+ * new size is valid.
+ *
+ * In the case of simple in-memory filesystems with inodes stored solely
+ * in the inode cache, and file data in the pagecache, nothing more needs
+ * to be done to satisfy a truncate request. Filesystems with on-disk
+ * blocks, for example, will need to free them on truncate; in that
+ * case it may be easier not to use simple_setsize (though each of its
+ * components will likely be required at some point to update the
+ * pagecache and inode).
+ */
+int simple_setsize(struct inode *inode, loff_t newsize)
+{
+ loff_t oldsize;
+ int error;
+
+ error = inode_newsize_ok(inode, newsize);
+ if (error)
+ return error;
+
+ oldsize = inode->i_size;
+ i_size_write(inode, newsize);
+ truncate_pagecache(inode, oldsize, newsize);
+
+ return error;
+}
+EXPORT_SYMBOL(simple_setsize);
+
+/**
+ * simple_setattr - setattr for simple in-memory filesystem
+ * @dentry: dentry
+ * @iattr: iattr structure
+ *
+ * Returns 0 on success, -error on failure.
+ *
+ * simple_setattr implements setattr for an in-memory filesystem which
+ * does not store its own file data or metadata (e.g. it uses the page cache
+ * and inode cache as its data store).
+ */
+int simple_setattr(struct dentry *dentry, struct iattr *iattr)
+{
+ struct inode *inode = dentry->d_inode;
+ int error;
+
+ error = inode_change_ok(inode, iattr);
+ if (error)
+ return error;
+
+ if (iattr->ia_valid & ATTR_SIZE) {
+ error = simple_setsize(inode, iattr->ia_size);
+ if (error)
+ return error;
+ }
+
+ generic_setattr(inode, iattr);
+
+ return error;
+}
+EXPORT_SYMBOL(simple_setattr);
+
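A purely in-memory filesystem can now delegate size changes wholesale; a minimal wiring sketch, illustrative only:

/* Illustrative inode_operations for a ramfs-like filesystem; ATTR_SIZE
 * is handled through simple_setsize() inside simple_setattr(). */
static const struct inode_operations example_file_iops = {
        .setattr = simple_setattr,
        .getattr = simple_getattr,
};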
int simple_readpage(struct file *file, struct page *page)
{
clear_highpage(page);
@@ -418,7 +489,8 @@ int simple_write_end(struct file *file, struct address_space *mapping,
* unique inode values later for this filesystem, then you must take care
* to pass it an appropriate max_reserved value to avoid collisions.
*/
-int simple_fill_super(struct super_block *s, int magic, struct tree_descr *files)
+int simple_fill_super(struct super_block *s, unsigned long magic,
+ struct tree_descr *files)
{
struct inode *inode;
struct dentry *root;
@@ -851,13 +923,22 @@ struct dentry *generic_fh_to_parent(struct super_block *sb, struct fid *fid,
}
EXPORT_SYMBOL_GPL(generic_fh_to_parent);
-int simple_fsync(struct file *file, struct dentry *dentry, int datasync)
+/**
+ * generic_file_fsync - generic fsync implementation for simple filesystems
+ * @file: file to synchronize
+ * @datasync: only synchronize essential metadata if true
+ *
+ * This is a generic implementation of the fsync method for simple
+ * filesystems which track all non-inode metadata in the buffers list
+ * hanging off the address_space structure.
+ */
+int generic_file_fsync(struct file *file, int datasync)
{
struct writeback_control wbc = {
.sync_mode = WB_SYNC_ALL,
.nr_to_write = 0, /* metadata-only; caller takes care of data */
};
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = file->f_mapping->host;
int err;
int ret;
@@ -872,7 +953,15 @@ int simple_fsync(struct file *file, struct dentry *dentry, int datasync)
ret = err;
return ret;
}
-EXPORT_SYMBOL(simple_fsync);
+EXPORT_SYMBOL(generic_file_fsync);
+
+/*
+ * No-op implementation of ->fsync for in-memory filesystems.
+ */
+int noop_fsync(struct file *file, int datasync)
+{
+ return 0;
+}
EXPORT_SYMBOL(dcache_dir_close);
EXPORT_SYMBOL(dcache_dir_lseek);
@@ -895,7 +984,7 @@ EXPORT_SYMBOL(simple_release_fs);
EXPORT_SYMBOL(simple_rename);
EXPORT_SYMBOL(simple_rmdir);
EXPORT_SYMBOL(simple_statfs);
-EXPORT_SYMBOL(simple_sync_file);
+EXPORT_SYMBOL(noop_fsync);
EXPORT_SYMBOL(simple_unlink);
EXPORT_SYMBOL(simple_read_from_buffer);
EXPORT_SYMBOL(simple_write_to_buffer);
diff --git a/fs/logfs/file.c b/fs/logfs/file.c
index 0de524071870..abe1cafbd4c2 100644
--- a/fs/logfs/file.c
+++ b/fs/logfs/file.c
@@ -219,9 +219,9 @@ int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
}
}
-int logfs_fsync(struct file *file, struct dentry *dentry, int datasync)
+int logfs_fsync(struct file *file, int datasync)
{
- struct super_block *sb = dentry->d_inode->i_sb;
+ struct super_block *sb = file->f_mapping->host->i_sb;
logfs_write_anchor(sb);
return 0;
diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h
index 1a9db84f8d8f..c838c4d72111 100644
--- a/fs/logfs/logfs.h
+++ b/fs/logfs/logfs.h
@@ -506,7 +506,7 @@ extern const struct address_space_operations logfs_reg_aops;
int logfs_readpage(struct file *file, struct page *page);
int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
unsigned long arg);
-int logfs_fsync(struct file *file, struct dentry *dentry, int datasync);
+int logfs_fsync(struct file *file, int datasync);
/* gc.c */
u32 get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec);
diff --git a/fs/mbcache.c b/fs/mbcache.c
index ec88ff3d04a9..e28f21b95344 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -115,7 +115,7 @@ mb_cache_indexes(struct mb_cache *cache)
* What the mbcache registers as to get shrunk dynamically.
*/
-static int mb_cache_shrink_fn(int nr_to_scan, gfp_t gfp_mask);
+static int mb_cache_shrink_fn(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask);
static struct shrinker mb_cache_shrinker = {
.shrink = mb_cache_shrink_fn,
@@ -191,13 +191,14 @@ forget:
* This function is called by the kernel memory management when memory
* gets low.
*
+ * @shrink: (ignored)
* @nr_to_scan: Number of objects to scan
* @gfp_mask: (ignored)
*
* Returns the number of objects which are present in the cache.
*/
static int
-mb_cache_shrink_fn(int nr_to_scan, gfp_t gfp_mask)
+mb_cache_shrink_fn(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
{
LIST_HEAD(free_list);
struct list_head *l, *ltmp;
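The callback now receives the registered struct shrinker itself, so a single function can serve several caches via container_of(). A hedged sketch of the new signature in use (every "example_*" name is an assumption):

/* Illustrative sketch of the post-change shrinker API. */
static int example_shrink(struct shrinker *shrink, int nr_to_scan,
			  gfp_t gfp_mask)
{
	if (nr_to_scan)
		example_free_some(nr_to_scan, gfp_mask); /* hypothetical */
	return example_count_objects();			 /* hypothetical */
}

static struct shrinker example_shrinker = {
	.shrink	= example_shrink,
	.seeks	= DEFAULT_SEEKS,
};

/* register_shrinker(&example_shrinker) at init time,
 * unregister_shrinker(&example_shrinker) on teardown. */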
diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index 6198731d7fcd..1dbf921ca44b 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -22,7 +22,7 @@ const struct file_operations minix_dir_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
.readdir = minix_readdir,
- .fsync = simple_fsync,
+ .fsync = generic_file_fsync,
};
static inline void dir_put_page(struct page *page)
@@ -72,16 +72,9 @@ static struct page * dir_get_page(struct inode *dir, unsigned long n)
{
struct address_space *mapping = dir->i_mapping;
struct page *page = read_mapping_page(mapping, n, NULL);
- if (!IS_ERR(page)) {
+ if (!IS_ERR(page))
kmap(page);
- if (!PageUptodate(page))
- goto fail;
- }
return page;
-
-fail:
- dir_put_page(page);
- return ERR_PTR(-EIO);
}
static inline void *minix_next_entry(void *de, struct minix_sb_info *sbi)
diff --git a/fs/minix/file.c b/fs/minix/file.c
index 3eec3e607a87..d5320ff23faf 100644
--- a/fs/minix/file.c
+++ b/fs/minix/file.c
@@ -19,7 +19,7 @@ const struct file_operations minix_file_operations = {
.write = do_sync_write,
.aio_write = generic_file_aio_write,
.mmap = generic_file_mmap,
- .fsync = simple_fsync,
+ .fsync = generic_file_fsync,
.splice_read = generic_file_splice_read,
};
diff --git a/fs/minix/itree_v2.c b/fs/minix/itree_v2.c
index f23010969369..13487ad16894 100644
--- a/fs/minix/itree_v2.c
+++ b/fs/minix/itree_v2.c
@@ -20,6 +20,9 @@ static inline block_t *i_data(struct inode *inode)
return (block_t *)minix_i(inode)->u.i2_data;
}
+#define DIRCOUNT 7
+#define INDIRCOUNT(sb) (1 << ((sb)->s_blocksize_bits - 2))
+
static int block_to_path(struct inode * inode, long block, int offsets[DEPTH])
{
int n = 0;
@@ -34,21 +37,21 @@ static int block_to_path(struct inode * inode, long block, int offsets[DEPTH])
printk("MINIX-fs: block_to_path: "
"block %ld too big on dev %s\n",
block, bdevname(sb->s_bdev, b));
- } else if (block < 7) {
+ } else if (block < DIRCOUNT) {
offsets[n++] = block;
- } else if ((block -= 7) < 256) {
- offsets[n++] = 7;
+ } else if ((block -= DIRCOUNT) < INDIRCOUNT(sb)) {
+ offsets[n++] = DIRCOUNT;
offsets[n++] = block;
- } else if ((block -= 256) < 256*256) {
- offsets[n++] = 8;
- offsets[n++] = block>>8;
- offsets[n++] = block & 255;
+ } else if ((block -= INDIRCOUNT(sb)) < INDIRCOUNT(sb) * INDIRCOUNT(sb)) {
+ offsets[n++] = DIRCOUNT + 1;
+ offsets[n++] = block / INDIRCOUNT(sb);
+ offsets[n++] = block % INDIRCOUNT(sb);
} else {
- block -= 256*256;
- offsets[n++] = 9;
- offsets[n++] = block>>16;
- offsets[n++] = (block>>8) & 255;
- offsets[n++] = block & 255;
+ block -= INDIRCOUNT(sb) * INDIRCOUNT(sb);
+ offsets[n++] = DIRCOUNT + 2;
+ offsets[n++] = (block / INDIRCOUNT(sb)) / INDIRCOUNT(sb);
+ offsets[n++] = (block / INDIRCOUNT(sb)) % INDIRCOUNT(sb);
+ offsets[n++] = block % INDIRCOUNT(sb);
}
return n;
}
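A worked example of the new mapping (the old constants 7, 256, and 256*256 were only correct for 1KB blocks, where INDIRCOUNT(sb) = 1 << (10 - 2) = 256): for logical block 300 on a 1KB-block filesystem, 300 >= 7 and 300 - 7 = 293 >= 256, so the double-indirect branch runs with block = 293 - 256 = 37, yielding the path {7 + 1, 37 / 256, 37 % 256} = {8, 0, 37}. On a 4KB-block filesystem, INDIRCOUNT = 1 << (12 - 2) = 1024, so the same block resolves through a single indirection as {7, 293}, which the hard-coded version would have mis-mapped as {8, 0, 37}.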
diff --git a/fs/namei.c b/fs/namei.c
index 48e1f60520ea..42d2d28fb827 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -282,8 +282,7 @@ int inode_permission(struct inode *inode, int mask)
if (retval)
return retval;
- return security_inode_permission(inode,
- mask & (MAY_READ|MAY_WRITE|MAY_EXEC|MAY_APPEND));
+ return security_inode_permission(inode, mask);
}
/**
@@ -1484,8 +1483,7 @@ static int handle_truncate(struct path *path)
*/
error = locks_verify_locked(inode);
if (!error)
- error = security_path_truncate(path, 0,
- ATTR_MTIME|ATTR_CTIME|ATTR_OPEN);
+ error = security_path_truncate(path);
if (!error) {
error = do_truncate(path->dentry, 0,
ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
@@ -1621,6 +1619,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
case LAST_DOTDOT:
follow_dotdot(nd);
dir = nd->path.dentry;
+ case LAST_DOT:
if (nd->path.mnt->mnt_sb->s_type->fs_flags & FS_REVAL_DOT) {
if (!dir->d_op->d_revalidate(dir, nd)) {
error = -ESTALE;
@@ -1628,7 +1627,6 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
}
}
/* fallthrough */
- case LAST_DOT:
case LAST_ROOT:
if (open_flag & O_CREAT)
goto exit;
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 92dde6f8d893..9578cbe0cd58 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -49,6 +49,7 @@ extern int ncp_symlink(struct inode *, struct dentry *, const char *);
const struct file_operations ncp_dir_operations =
{
+ .llseek = generic_file_llseek,
.read = generic_read_dir,
.readdir = ncp_readdir,
.unlocked_ioctl = ncp_ioctl,
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index b93870892892..3639cc5cbdae 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -22,7 +22,7 @@
#include <linux/ncp_fs.h>
#include "ncplib_kernel.h"
-static int ncp_fsync(struct file *file, struct dentry *dentry, int datasync)
+static int ncp_fsync(struct file *file, int datasync)
{
return 0;
}
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 7ec9b34a59f8..d25b5257b7a1 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -1286,6 +1286,55 @@ static void nfs4_session_set_rwsize(struct nfs_server *server)
#endif /* CONFIG_NFS_V4_1 */
}
+static int nfs4_server_common_setup(struct nfs_server *server,
+ struct nfs_fh *mntfh)
+{
+ struct nfs_fattr *fattr;
+ int error;
+
+ BUG_ON(!server->nfs_client);
+ BUG_ON(!server->nfs_client->rpc_ops);
+ BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);
+
+ fattr = nfs_alloc_fattr();
+ if (fattr == NULL)
+ return -ENOMEM;
+
+ /* We must ensure the session is initialised first */
+ error = nfs4_init_session(server);
+ if (error < 0)
+ goto out;
+
+ /* Probe the root fh to retrieve its FSID and filehandle */
+ error = nfs4_get_rootfh(server, mntfh);
+ if (error < 0)
+ goto out;
+
+ dprintk("Server FSID: %llx:%llx\n",
+ (unsigned long long) server->fsid.major,
+ (unsigned long long) server->fsid.minor);
+ dprintk("Mount FH: %d\n", mntfh->size);
+
+ nfs4_session_set_rwsize(server);
+
+ error = nfs_probe_fsinfo(server, mntfh, fattr);
+ if (error < 0)
+ goto out;
+
+ if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN)
+ server->namelen = NFS4_MAXNAMLEN;
+
+ spin_lock(&nfs_client_lock);
+ list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks);
+ list_add_tail(&server->master_link, &nfs_volume_list);
+ spin_unlock(&nfs_client_lock);
+
+ server->mount_time = jiffies;
+out:
+ nfs_free_fattr(fattr);
+ return error;
+}
+
/*
* Create a version 4 volume record
*/
@@ -1346,7 +1395,6 @@ error:
struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data,
struct nfs_fh *mntfh)
{
- struct nfs_fattr *fattr;
struct nfs_server *server;
int error;
@@ -1356,55 +1404,19 @@ struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data,
if (!server)
return ERR_PTR(-ENOMEM);
- error = -ENOMEM;
- fattr = nfs_alloc_fattr();
- if (fattr == NULL)
- goto error;
-
/* set up the general RPC client */
error = nfs4_init_server(server, data);
if (error < 0)
goto error;
- BUG_ON(!server->nfs_client);
- BUG_ON(!server->nfs_client->rpc_ops);
- BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);
-
- error = nfs4_init_session(server);
- if (error < 0)
- goto error;
-
- /* Probe the root fh to retrieve its FSID */
- error = nfs4_get_rootfh(server, mntfh);
+ error = nfs4_server_common_setup(server, mntfh);
if (error < 0)
goto error;
- dprintk("Server FSID: %llx:%llx\n",
- (unsigned long long) server->fsid.major,
- (unsigned long long) server->fsid.minor);
- dprintk("Mount FH: %d\n", mntfh->size);
-
- nfs4_session_set_rwsize(server);
-
- error = nfs_probe_fsinfo(server, mntfh, fattr);
- if (error < 0)
- goto error;
-
- if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN)
- server->namelen = NFS4_MAXNAMLEN;
-
- spin_lock(&nfs_client_lock);
- list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks);
- list_add_tail(&server->master_link, &nfs_volume_list);
- spin_unlock(&nfs_client_lock);
-
- server->mount_time = jiffies;
dprintk("<-- nfs4_create_server() = %p\n", server);
- nfs_free_fattr(fattr);
return server;
error:
- nfs_free_fattr(fattr);
nfs_free_server(server);
dprintk("<-- nfs4_create_server() = error %d\n", error);
return ERR_PTR(error);
@@ -1418,7 +1430,6 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
{
struct nfs_client *parent_client;
struct nfs_server *server, *parent_server;
- struct nfs_fattr *fattr;
int error;
dprintk("--> nfs4_create_referral_server()\n");
@@ -1427,11 +1438,6 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
if (!server)
return ERR_PTR(-ENOMEM);
- error = -ENOMEM;
- fattr = nfs_alloc_fattr();
- if (fattr == NULL)
- goto error;
-
parent_server = NFS_SB(data->sb);
parent_client = parent_server->nfs_client;
@@ -1456,40 +1462,14 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
if (error < 0)
goto error;
- BUG_ON(!server->nfs_client);
- BUG_ON(!server->nfs_client->rpc_ops);
- BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);
-
- /* Probe the root fh to retrieve its FSID and filehandle */
- error = nfs4_get_rootfh(server, mntfh);
- if (error < 0)
- goto error;
-
- /* probe the filesystem info for this server filesystem */
- error = nfs_probe_fsinfo(server, mntfh, fattr);
+ error = nfs4_server_common_setup(server, mntfh);
if (error < 0)
goto error;
- if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN)
- server->namelen = NFS4_MAXNAMLEN;
-
- dprintk("Referral FSID: %llx:%llx\n",
- (unsigned long long) server->fsid.major,
- (unsigned long long) server->fsid.minor);
-
- spin_lock(&nfs_client_lock);
- list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks);
- list_add_tail(&server->master_link, &nfs_volume_list);
- spin_unlock(&nfs_client_lock);
-
- server->mount_time = jiffies;
-
- nfs_free_fattr(fattr);
dprintk("<-- nfs_create_referral_server() = %p\n", server);
return server;
error:
- nfs_free_fattr(fattr);
nfs_free_server(server);
dprintk("<-- nfs4_create_referral_server() = error %d\n", error);
return ERR_PTR(error);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index ee9a179ebdf3..832e9e239324 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -53,7 +53,7 @@ static int nfs_link(struct dentry *, struct inode *, struct dentry *);
static int nfs_mknod(struct inode *, struct dentry *, int, dev_t);
static int nfs_rename(struct inode *, struct dentry *,
struct inode *, struct dentry *);
-static int nfs_fsync_dir(struct file *, struct dentry *, int);
+static int nfs_fsync_dir(struct file *, int);
static loff_t nfs_llseek_dir(struct file *, loff_t, int);
const struct file_operations nfs_dir_operations = {
@@ -641,8 +641,10 @@ out:
* All directory operations under NFS are synchronous, so fsync()
* is a dummy operation.
*/
-static int nfs_fsync_dir(struct file *filp, struct dentry *dentry, int datasync)
+static int nfs_fsync_dir(struct file *filp, int datasync)
{
+ struct dentry *dentry = filp->f_path.dentry;
+
dfprintk(FILE, "NFS: fsync dir(%s/%s) datasync %d\n",
dentry->d_parent->d_name.name, dentry->d_name.name,
datasync);
@@ -1708,7 +1710,7 @@ static void nfs_access_free_list(struct list_head *head)
}
}
-int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask)
+int nfs_access_cache_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
{
LIST_HEAD(head);
struct nfs_inode *nfsi;
@@ -1741,6 +1743,7 @@ remove_lru_entry:
clear_bit(NFS_INO_ACL_LRU_SET, &nfsi->flags);
smp_mb__after_clear_bit();
}
+ spin_unlock(&inode->i_lock);
}
spin_unlock(&nfs_access_lru_lock);
nfs_access_free_list(&head);
@@ -1950,7 +1953,7 @@ int nfs_permission(struct inode *inode, int mask)
if ((mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
goto out;
/* Is this sys_access() ? */
- if (mask & MAY_ACCESS)
+ if (mask & (MAY_ACCESS | MAY_CHDIR))
goto force_lookup;
switch (inode->i_mode & S_IFMT) {
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index cac96bcc91e4..f036153d9f50 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -27,6 +27,7 @@
#include <linux/pagemap.h>
#include <linux/aio.h>
#include <linux/gfp.h>
+#include <linux/swap.h>
#include <asm/uaccess.h>
#include <asm/system.h>
@@ -53,7 +54,7 @@ static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe,
static ssize_t nfs_file_write(struct kiocb *, const struct iovec *iov,
unsigned long nr_segs, loff_t pos);
static int nfs_file_flush(struct file *, fl_owner_t id);
-static int nfs_file_fsync(struct file *, struct dentry *dentry, int datasync);
+static int nfs_file_fsync(struct file *, int datasync);
static int nfs_check_flags(int flags);
static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl);
static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl);
@@ -322,8 +323,9 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma)
* whether any write errors occurred for this process.
*/
static int
-nfs_file_fsync(struct file *file, struct dentry *dentry, int datasync)
+nfs_file_fsync(struct file *file, int datasync)
{
+ struct dentry *dentry = file->f_path.dentry;
struct nfs_open_context *ctx = nfs_file_open_context(file);
struct inode *inode = dentry->d_inode;
@@ -492,11 +494,19 @@ static void nfs_invalidate_page(struct page *page, unsigned long offset)
*/
static int nfs_release_page(struct page *page, gfp_t gfp)
{
+ struct address_space *mapping = page->mapping;
+
dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page);
/* Only do I/O if gfp is a superset of GFP_KERNEL */
- if ((gfp & GFP_KERNEL) == GFP_KERNEL)
- nfs_wb_page(page->mapping->host, page);
+ if (mapping && (gfp & GFP_KERNEL) == GFP_KERNEL) {
+ int how = FLUSH_SYNC;
+
+ /* Don't let kswapd deadlock waiting for OOM RPC calls */
+ if (current_is_kswapd())
+ how = 0;
+ nfs_commit_inode(mapping->host, how);
+ }
/* If PagePrivate() is set, then the page is not freeable */
if (PagePrivate(page))
return 0;
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index 7428f7d6273b..a70e446e1605 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -146,7 +146,7 @@ int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh)
goto out;
}
- if (!(fsinfo.fattr->valid & NFS_ATTR_FATTR_MODE)
+ if (!(fsinfo.fattr->valid & NFS_ATTR_FATTR_TYPE)
|| !S_ISDIR(fsinfo.fattr->mode)) {
printk(KERN_ERR "nfs4_get_rootfh:"
" getroot encountered non-directory\n");
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index d8bd619e386c..e70f44b9b3f4 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -205,7 +205,8 @@ extern struct rpc_procinfo nfs4_procedures[];
void nfs_close_context(struct nfs_open_context *ctx, int is_sync);
/* dir.c */
-extern int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask);
+extern int nfs_access_cache_shrinker(struct shrinker *shrink,
+ int nr_to_scan, gfp_t gfp_mask);
/* inode.c */
extern struct workqueue_struct *nfsiod_workqueue;
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 6bdef28efa33..65c8dae4b267 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -862,8 +862,8 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET;
*p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME);
*p++ = cpu_to_be32(0);
- *p++ = cpu_to_be32(iap->ia_mtime.tv_sec);
- *p++ = cpu_to_be32(iap->ia_mtime.tv_nsec);
+ *p++ = cpu_to_be32(iap->ia_atime.tv_sec);
+ *p++ = cpu_to_be32(iap->ia_atime.tv_nsec);
}
else if (iap->ia_valid & ATTR_ATIME) {
bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET;
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index 6bd19d843af7..df101d9f546a 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -105,7 +105,7 @@ static char nfs_root_name[256] __initdata = "";
static __be32 servaddr __initdata = 0;
/* Name of directory to mount */
-static char nfs_export_path[NFS_MAXPATHLEN] __initdata = { 0, };
+static char nfs_export_path[NFS_MAXPATHLEN + 1] __initdata = { 0, };
/* NFS-related data */
static struct nfs_mount_data nfs_data __initdata = { 0, };/* NFS mount info */
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 04214fc5c304..f9df16de4a56 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -570,6 +570,22 @@ static void nfs_show_mountd_options(struct seq_file *m, struct nfs_server *nfss,
nfs_show_mountd_netid(m, nfss, showdefaults);
}
+#ifdef CONFIG_NFS_V4
+static void nfs_show_nfsv4_options(struct seq_file *m, struct nfs_server *nfss,
+ int showdefaults)
+{
+ struct nfs_client *clp = nfss->nfs_client;
+
+ seq_printf(m, ",clientaddr=%s", clp->cl_ipaddr);
+ seq_printf(m, ",minorversion=%u", clp->cl_minorversion);
+}
+#else
+static void nfs_show_nfsv4_options(struct seq_file *m, struct nfs_server *nfss,
+ int showdefaults)
+{
+}
+#endif
+
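With these helpers, an NFSv4.1 entry in /proc/mounts would carry options in the form emitted by the seq_printf() calls above (the address and version here are hypothetical values):

	...,clientaddr=192.0.2.10,minorversion=1,...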
/*
* Describe the mount options in force on this server representation
*/
@@ -631,11 +647,9 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
if (version != 4)
nfs_show_mountd_options(m, nfss, showdefaults);
+ else
+ nfs_show_nfsv4_options(m, nfss, showdefaults);
-#ifdef CONFIG_NFS_V4
- if (clp->rpc_ops->version == 4)
- seq_printf(m, ",clientaddr=%s", clp->cl_ipaddr);
-#endif
if (nfss->options & NFS_OPTION_FSCACHE)
seq_printf(m, ",fsc");
}
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 3aea3ca98ab7..9f81bdd91c55 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -222,7 +222,7 @@ static void nfs_end_page_writeback(struct page *page)
clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC);
}
-static struct nfs_page *nfs_find_and_lock_request(struct page *page)
+static struct nfs_page *nfs_find_and_lock_request(struct page *page, bool nonblock)
{
struct inode *inode = page->mapping->host;
struct nfs_page *req;
@@ -241,7 +241,10 @@ static struct nfs_page *nfs_find_and_lock_request(struct page *page)
* request as dirty (in which case we don't care).
*/
spin_unlock(&inode->i_lock);
- ret = nfs_wait_on_request(req);
+ if (!nonblock)
+ ret = nfs_wait_on_request(req);
+ else
+ ret = -EAGAIN;
nfs_release_request(req);
if (ret != 0)
return ERR_PTR(ret);
@@ -256,12 +259,12 @@ static struct nfs_page *nfs_find_and_lock_request(struct page *page)
* May return an error if the user signalled nfs_wait_on_request().
*/
static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
- struct page *page)
+ struct page *page, bool nonblock)
{
struct nfs_page *req;
int ret = 0;
- req = nfs_find_and_lock_request(page);
+ req = nfs_find_and_lock_request(page, nonblock);
if (!req)
goto out;
ret = PTR_ERR(req);
@@ -283,12 +286,20 @@ out:
static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio)
{
struct inode *inode = page->mapping->host;
+ int ret;
nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1);
nfs_pageio_cond_complete(pgio, page->index);
- return nfs_page_async_flush(pgio, page);
+ ret = nfs_page_async_flush(pgio, page,
+ wbc->sync_mode == WB_SYNC_NONE ||
+ wbc->nonblocking != 0);
+ if (ret == -EAGAIN) {
+ redirty_page_for_writepage(wbc, page);
+ ret = 0;
+ }
+ return ret;
}
/*
@@ -1379,14 +1390,14 @@ static const struct rpc_call_ops nfs_commit_ops = {
.rpc_release = nfs_commit_release,
};
-static int nfs_commit_inode(struct inode *inode, int how)
+int nfs_commit_inode(struct inode *inode, int how)
{
LIST_HEAD(head);
int may_wait = how & FLUSH_SYNC;
int res = 0;
if (!nfs_commit_set_lock(NFS_I(inode), may_wait))
- goto out;
+ goto out_mark_dirty;
spin_lock(&inode->i_lock);
res = nfs_scan_commit(inode, &head, 0, 0);
spin_unlock(&inode->i_lock);
@@ -1398,9 +1409,18 @@ static int nfs_commit_inode(struct inode *inode, int how)
wait_on_bit(&NFS_I(inode)->flags, NFS_INO_COMMIT,
nfs_wait_bit_killable,
TASK_KILLABLE);
+ else
+ goto out_mark_dirty;
} else
nfs_commit_clear_lock(NFS_I(inode));
-out:
+ return res;
+ /* Note: If we exit without ensuring that the commit is complete,
+ * we must mark the inode as dirty. Otherwise, future calls to
+ * sync_inode() with the WB_SYNC_ALL flag set will fail to ensure
+ * that the data is on the disk.
+ */
+out_mark_dirty:
+ __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
return res;
}
@@ -1434,11 +1454,6 @@ out_mark_dirty:
return ret;
}
#else
-static int nfs_commit_inode(struct inode *inode, int how)
-{
- return 0;
-}
-
static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc)
{
return 0;
@@ -1509,14 +1524,17 @@ int nfs_wb_page(struct inode *inode, struct page *page)
};
int ret;
- while(PagePrivate(page)) {
+ for (;;) {
wait_on_page_writeback(page);
if (clear_page_dirty_for_io(page)) {
ret = nfs_writepage_locked(page, &wbc);
if (ret < 0)
goto out_error;
+ continue;
}
- ret = sync_inode(inode, &wbc);
+ if (!PagePrivate(page))
+ break;
+ ret = nfs_commit_inode(inode, FLUSH_SYNC);
if (ret < 0)
goto out_error;
}
@@ -1534,7 +1552,7 @@ int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
nfs_fscache_release_page(page, GFP_KERNEL);
- req = nfs_find_and_lock_request(page);
+ req = nfs_find_and_lock_request(page, false);
ret = PTR_ERR(req);
if (IS_ERR(req))
goto out;
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 12f7109720c2..4a2734758778 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -4122,8 +4122,8 @@ nfs4_state_shutdown(void)
nfs4_lock_state();
nfs4_release_reclaim();
__nfs4_state_shutdown();
- nfsd4_destroy_callback_queue();
nfs4_unlock_state();
+ nfsd4_destroy_callback_queue();
}
/*
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index ebbf3b6b2457..3c111120b619 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -443,8 +443,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
if (size_change)
put_write_access(inode);
if (!err)
- if (EX_ISSYNC(fhp->fh_export))
- write_inode_now(inode, 1);
+ commit_metadata(fhp);
out:
return err;
diff --git a/fs/nilfs2/btree.h b/fs/nilfs2/btree.h
index af638d59e3bf..43c8c5b541fd 100644
--- a/fs/nilfs2/btree.h
+++ b/fs/nilfs2/btree.h
@@ -75,8 +75,6 @@ struct nilfs_btree_path {
extern struct kmem_cache *nilfs_btree_path_cache;
-int nilfs_btree_path_cache_init(void);
-void nilfs_btree_path_cache_destroy(void);
int nilfs_btree_init(struct nilfs_bmap *);
int nilfs_btree_convert_and_insert(struct nilfs_bmap *, __u64, __u64,
const __u64 *, const __u64 *, int);
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 30292df443ce..c9a30d7ff6fc 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -27,7 +27,7 @@
#include "nilfs.h"
#include "segment.h"
-int nilfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
+int nilfs_sync_file(struct file *file, int datasync)
{
/*
* Called from fsync() system call
@@ -37,7 +37,7 @@ int nilfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
* This function should be implemented when the writeback function
* will be implemented.
*/
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = file->f_mapping->host;
int err;
if (!nilfs_inode_dirty(inode))
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 8723e5bfd071..47d6d7928122 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -228,7 +228,7 @@ extern void nilfs_set_link(struct inode *, struct nilfs_dir_entry *,
struct page *, struct inode *);
/* file.c */
-extern int nilfs_sync_file(struct file *, struct dentry *, int);
+extern int nilfs_sync_file(struct file *, int);
/* ioctl.c */
long nilfs_ioctl(struct file *, unsigned int, unsigned long);
diff --git a/fs/nilfs2/segbuf.h b/fs/nilfs2/segbuf.h
index fdf1c3b6d673..85fbb66455e2 100644
--- a/fs/nilfs2/segbuf.h
+++ b/fs/nilfs2/segbuf.h
@@ -127,8 +127,6 @@ struct nilfs_segment_buffer {
extern struct kmem_cache *nilfs_segbuf_cachep;
-int __init nilfs_init_segbuf_cache(void);
-void nilfs_destroy_segbuf_cache(void);
struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *);
void nilfs_segbuf_free(struct nilfs_segment_buffer *);
void nilfs_segbuf_map(struct nilfs_segment_buffer *, __u64, unsigned long,
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index dca142361ccf..01e20dbb217d 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -221,8 +221,6 @@ enum {
extern struct kmem_cache *nilfs_transaction_cachep;
/* segment.c */
-extern int nilfs_init_transaction_cache(void);
-extern void nilfs_destroy_transaction_cache(void);
extern void nilfs_relax_pressure_in_lock(struct super_block *);
extern int nilfs_construct_segment(struct super_block *);
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 03b34b738993..414ef68931cf 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -1130,13 +1130,13 @@ static void nilfs_segbuf_init_once(void *obj)
static void nilfs_destroy_cachep(void)
{
- if (nilfs_inode_cachep)
+ if (nilfs_inode_cachep)
kmem_cache_destroy(nilfs_inode_cachep);
- if (nilfs_transaction_cachep)
+ if (nilfs_transaction_cachep)
kmem_cache_destroy(nilfs_transaction_cachep);
- if (nilfs_segbuf_cachep)
+ if (nilfs_segbuf_cachep)
kmem_cache_destroy(nilfs_segbuf_cachep);
- if (nilfs_btree_path_cache)
+ if (nilfs_btree_path_cache)
kmem_cache_destroy(nilfs_btree_path_cache);
}
diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c
index fe44d3feee4a..0f48e7c5d9e1 100644
--- a/fs/ntfs/dir.c
+++ b/fs/ntfs/dir.c
@@ -1527,10 +1527,9 @@ static int ntfs_dir_open(struct inode *vi, struct file *filp)
* this problem for now. We do write the $BITMAP attribute if it is present
* which is the important one for a directory so things are not too bad.
*/
-static int ntfs_dir_fsync(struct file *filp, struct dentry *dentry,
- int datasync)
+static int ntfs_dir_fsync(struct file *filp, int datasync)
{
- struct inode *bmp_vi, *vi = dentry->d_inode;
+ struct inode *bmp_vi, *vi = filp->f_mapping->host;
int err, ret;
ntfs_attr na;
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index a1924a0d2ab0..113ebd9f25a4 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -2133,7 +2133,6 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
/**
* ntfs_file_fsync - sync a file to disk
* @filp: file to be synced
- * @dentry: dentry describing the file to sync
* @datasync: if non-zero only flush user data and not metadata
*
* Data integrity sync of a file to disk. Used for fsync, fdatasync, and msync
@@ -2149,19 +2148,15 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
* Also, if @datasync is true, we do not wait on the inode to be written out
* but we always wait on the page cache pages to be written out.
*
- * Note: In the past @filp could be NULL so we ignore it as we don't need it
- * anyway.
- *
* Locking: Caller must hold i_mutex on the inode.
*
* TODO: We should probably also write all attribute/index inodes associated
* with this inode but since we have no simple way of getting to them we ignore
* this problem for now.
*/
-static int ntfs_file_fsync(struct file *filp, struct dentry *dentry,
- int datasync)
+static int ntfs_file_fsync(struct file *filp, int datasync)
{
- struct inode *vi = dentry->d_inode;
+ struct inode *vi = filp->f_mapping->host;
int err, ret = 0;
ntfs_debug("Entering for inode 0x%lx.", vi->i_ino);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 3623ca20cc18..96337a4fbbdf 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -196,15 +196,14 @@ int ocfs2_get_block(struct inode *inode, sector_t iblock,
dump_stack();
goto bail;
}
-
- past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
- mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino,
- (unsigned long long)past_eof);
-
- if (create && (iblock >= past_eof))
- set_buffer_new(bh_result);
}
+ past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
+ mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino,
+ (unsigned long long)past_eof);
+ if (create && (iblock >= past_eof))
+ set_buffer_new(bh_result);
+
bail:
if (err < 0)
err = -EIO;
@@ -459,36 +458,6 @@ int walk_page_buffers( handle_t *handle,
return ret;
}
-handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
- struct page *page,
- unsigned from,
- unsigned to)
-{
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- handle_t *handle;
- int ret = 0;
-
- handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
- if (IS_ERR(handle)) {
- ret = -ENOMEM;
- mlog_errno(ret);
- goto out;
- }
-
- if (ocfs2_should_order_data(inode)) {
- ret = ocfs2_jbd2_file_inode(handle, inode);
- if (ret < 0)
- mlog_errno(ret);
- }
-out:
- if (ret) {
- if (!IS_ERR(handle))
- ocfs2_commit_trans(osb, handle);
- handle = ERR_PTR(ret);
- }
- return handle;
-}
-
static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
{
sector_t status;
@@ -609,7 +578,9 @@ bail:
static void ocfs2_dio_end_io(struct kiocb *iocb,
loff_t offset,
ssize_t bytes,
- void *private)
+ void *private,
+ int ret,
+ bool is_async)
{
struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
int level;
@@ -623,6 +594,9 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
if (!level)
up_read(&inode->i_alloc_sem);
ocfs2_rw_unlock(inode, level);
+
+ if (is_async)
+ aio_complete(iocb, ret, 0);
}
/*
@@ -1131,23 +1105,37 @@ out:
*/
static int ocfs2_grab_pages_for_write(struct address_space *mapping,
struct ocfs2_write_ctxt *wc,
- u32 cpos, loff_t user_pos, int new,
+ u32 cpos, loff_t user_pos,
+ unsigned user_len, int new,
struct page *mmap_page)
{
int ret = 0, i;
- unsigned long start, target_index, index;
+ unsigned long start, target_index, end_index, index;
struct inode *inode = mapping->host;
+ loff_t last_byte;
target_index = user_pos >> PAGE_CACHE_SHIFT;
/*
* Figure out how many pages we'll be manipulating here. For
* non allocating write, we just change the one
- * page. Otherwise, we'll need a whole clusters worth.
+ * page. Otherwise, we'll need a whole cluster's worth. If we're
+ * writing past i_size, we only need enough pages to cover the
+ * last page of the write.
*/
if (new) {
wc->w_num_pages = ocfs2_pages_per_cluster(inode->i_sb);
start = ocfs2_align_clusters_to_page_index(inode->i_sb, cpos);
+ /*
+ * We need the index *past* the last page we could possibly
+ * touch. This is the page past the end of the write or
+ * i_size, whichever is greater.
+ */
+ last_byte = max(user_pos + user_len, i_size_read(inode));
+ BUG_ON(last_byte < 1);
+ end_index = ((last_byte - 1) >> PAGE_CACHE_SHIFT) + 1;
+ if ((start + wc->w_num_pages) > end_index)
+ wc->w_num_pages = end_index - start;
} else {
wc->w_num_pages = 1;
start = target_index;
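A worked example of the new clamp (assuming 4KB pages, so PAGE_CACHE_SHIFT = 12): with user_pos = 10000, user_len = 3000 and i_size = 8192, last_byte = max(10000 + 3000, 8192) = 13000 and end_index = ((13000 - 1) >> 12) + 1 = 4, so for a cluster of 8 pages starting at index 0 the write grabs only w_num_pages = 4 - 0 = 4 pages instead of all 8.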
@@ -1620,21 +1608,20 @@ out:
* write path can treat it as an non-allocating write, which has no
* special case code for sparse/nonsparse files.
*/
-static int ocfs2_expand_nonsparse_inode(struct inode *inode, loff_t pos,
- unsigned len,
+static int ocfs2_expand_nonsparse_inode(struct inode *inode,
+ struct buffer_head *di_bh,
+ loff_t pos, unsigned len,
struct ocfs2_write_ctxt *wc)
{
int ret;
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
loff_t newsize = pos + len;
- if (ocfs2_sparse_alloc(osb))
- return 0;
+ BUG_ON(ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)));
if (newsize <= i_size_read(inode))
return 0;
- ret = ocfs2_extend_no_holes(inode, newsize, pos);
+ ret = ocfs2_extend_no_holes(inode, di_bh, newsize, pos);
if (ret)
mlog_errno(ret);
@@ -1644,6 +1631,18 @@ static int ocfs2_expand_nonsparse_inode(struct inode *inode, loff_t pos,
return ret;
}
+static int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh,
+ loff_t pos)
+{
+ int ret = 0;
+
+ BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)));
+ if (pos > i_size_read(inode))
+ ret = ocfs2_zero_extend(inode, di_bh, pos);
+
+ return ret;
+}
+
int ocfs2_write_begin_nolock(struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata,
@@ -1679,7 +1678,11 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
}
}
- ret = ocfs2_expand_nonsparse_inode(inode, pos, len, wc);
+ if (ocfs2_sparse_alloc(osb))
+ ret = ocfs2_zero_tail(inode, di_bh, pos);
+ else
+ ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos, len,
+ wc);
if (ret) {
mlog_errno(ret);
goto out;
@@ -1789,7 +1792,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
* that we can zero and flush if we error after adding the
* extent.
*/
- ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos,
+ ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, len,
cluster_of_pages, mmap_page);
if (ret) {
mlog_errno(ret);
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 6b5a492e1749..153abb5abef0 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -1671,7 +1671,7 @@ struct dlm_ctxt * dlm_register_domain(const char *domain,
struct dlm_ctxt *dlm = NULL;
struct dlm_ctxt *new_ctxt = NULL;
- if (strlen(domain) > O2NM_MAX_NAME_LEN) {
+ if (strlen(domain) >= O2NM_MAX_NAME_LEN) {
ret = -ENAMETOOLONG;
mlog(ML_ERROR, "domain name length too long\n");
goto leave;
@@ -1709,6 +1709,7 @@ retry:
}
if (dlm_protocol_compare(&dlm->fs_locking_proto, fs_proto)) {
+ spin_unlock(&dlm_domain_lock);
mlog(ML_ERROR,
"Requested locking protocol version is not "
"compatible with already registered domain "
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 4a7506a4e314..94b97fc6a88e 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -2808,14 +2808,8 @@ again:
mlog(0, "trying again...\n");
goto again;
}
- /* now that we are sure the MIGRATING state is there, drop
- * the unneded state which blocked threads trying to DIRTY */
- spin_lock(&res->spinlock);
- BUG_ON(!(res->state & DLM_LOCK_RES_BLOCK_DIRTY));
- BUG_ON(!(res->state & DLM_LOCK_RES_MIGRATING));
- res->state &= ~DLM_LOCK_RES_BLOCK_DIRTY;
- spin_unlock(&res->spinlock);
+ ret = 0;
/* did the target go down or die? */
spin_lock(&dlm->spinlock);
if (!test_bit(target, dlm->domain_map)) {
@@ -2826,9 +2820,21 @@ again:
spin_unlock(&dlm->spinlock);
/*
+ * if the target is down, we need to clear DLM_LOCK_RES_BLOCK_DIRTY for
+ * another try; otherwise, we are sure the MIGRATING state is there, so
+ * drop the unneeded state which blocked threads trying to DIRTY
+ */
+ spin_lock(&res->spinlock);
+ BUG_ON(!(res->state & DLM_LOCK_RES_BLOCK_DIRTY));
+ res->state &= ~DLM_LOCK_RES_BLOCK_DIRTY;
+ if (!ret)
+ BUG_ON(!(res->state & DLM_LOCK_RES_MIGRATING));
+ spin_unlock(&res->spinlock);
+
+ /*
* at this point:
*
- * o the DLM_LOCK_RES_MIGRATING flag is set
+ * o the DLM_LOCK_RES_MIGRATING flag is set if the target is not down
* o there are no pending asts on this lockres
* o all processes trying to reserve an ast on this
* lockres must wait for the MIGRATING flag to clear
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index f8b75ce4be70..9dfaac73b36d 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -463,7 +463,7 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) {
int bit;
- bit = find_next_bit (dlm->recovery_map, O2NM_MAX_NODES+1, 0);
+ bit = find_next_bit (dlm->recovery_map, O2NM_MAX_NODES, 0);
if (bit >= O2NM_MAX_NODES || bit < 0)
dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM);
else
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 97e54b9e654b..2b10b36d1577 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -175,13 +175,12 @@ static int ocfs2_dir_release(struct inode *inode, struct file *file)
return 0;
}
-static int ocfs2_sync_file(struct file *file,
- struct dentry *dentry,
- int datasync)
+static int ocfs2_sync_file(struct file *file, int datasync)
{
int err = 0;
journal_t *journal;
- struct inode *inode = dentry->d_inode;
+ struct dentry *dentry = file->f_path.dentry;
+ struct inode *inode = file->f_mapping->host;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync,
@@ -725,28 +724,55 @@ leave:
return status;
}
+/*
+ * While a write will already be ordering the data, a truncate will not.
+ * Thus, we need to explicitly order the zeroed pages.
+ */
+static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode)
+{
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ handle_t *handle = NULL;
+ int ret = 0;
+
+ if (!ocfs2_should_order_data(inode))
+ goto out;
+
+ handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+ if (IS_ERR(handle)) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_jbd2_file_inode(handle, inode);
+ if (ret < 0)
+ mlog_errno(ret);
+
+out:
+ if (ret) {
+ if (!IS_ERR(handle))
+ ocfs2_commit_trans(osb, handle);
+ handle = ERR_PTR(ret);
+ }
+ return handle;
+}
+
/* Some parts of this taken from generic_cont_expand, which turned out
* to be too fragile to do exactly what we need without us having to
* worry about recursive locking in ->write_begin() and ->write_end(). */
-static int ocfs2_write_zero_page(struct inode *inode,
- u64 size)
+static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
+ u64 abs_to)
{
struct address_space *mapping = inode->i_mapping;
struct page *page;
- unsigned long index;
- unsigned int offset;
+ unsigned long index = abs_from >> PAGE_CACHE_SHIFT;
handle_t *handle = NULL;
- int ret;
+ int ret = 0;
+ unsigned zero_from, zero_to, block_start, block_end;
- offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */
- /* ugh. in prepare/commit_write, if from==to==start of block, we
- ** skip the prepare. make sure we never send an offset for the start
- ** of a block
- */
- if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
- offset++;
- }
- index = size >> PAGE_CACHE_SHIFT;
+ BUG_ON(abs_from >= abs_to);
+ BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT));
+ BUG_ON(abs_from & (inode->i_blkbits - 1));
page = grab_cache_page(mapping, index);
if (!page) {
@@ -755,31 +781,56 @@ static int ocfs2_write_zero_page(struct inode *inode,
goto out;
}
- ret = ocfs2_prepare_write_nolock(inode, page, offset, offset);
- if (ret < 0) {
- mlog_errno(ret);
- goto out_unlock;
- }
+ /* Get the offsets within the page that we want to zero */
+ zero_from = abs_from & (PAGE_CACHE_SIZE - 1);
+ zero_to = abs_to & (PAGE_CACHE_SIZE - 1);
+ if (!zero_to)
+ zero_to = PAGE_CACHE_SIZE;
- if (ocfs2_should_order_data(inode)) {
- handle = ocfs2_start_walk_page_trans(inode, page, offset,
- offset);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- handle = NULL;
+ mlog(0,
+ "abs_from = %llu, abs_to = %llu, index = %lu, zero_from = %u, zero_to = %u\n",
+ (unsigned long long)abs_from, (unsigned long long)abs_to,
+ index, zero_from, zero_to);
+
+ /* We know that zero_from is block aligned */
+ for (block_start = zero_from; block_start < zero_to;
+ block_start = block_end) {
+ block_end = block_start + (1 << inode->i_blkbits);
+
+ /*
+ * block_start is block-aligned. Bump it by one to
+ * force ocfs2_{prepare,commit}_write() to zero the
+ * whole block.
+ */
+ ret = ocfs2_prepare_write_nolock(inode, page,
+ block_start + 1,
+ block_start + 1);
+ if (ret < 0) {
+ mlog_errno(ret);
goto out_unlock;
}
- }
- /* must not update i_size! */
- ret = block_commit_write(page, offset, offset);
- if (ret < 0)
- mlog_errno(ret);
- else
- ret = 0;
+ if (!handle) {
+ handle = ocfs2_zero_start_ordered_transaction(inode);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ handle = NULL;
+ break;
+ }
+ }
+
+ /* must not update i_size! */
+ ret = block_commit_write(page, block_start + 1,
+ block_start + 1);
+ if (ret < 0)
+ mlog_errno(ret);
+ else
+ ret = 0;
+ }
if (handle)
ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
+
out_unlock:
unlock_page(page);
page_cache_release(page);
@@ -787,22 +838,114 @@ out:
return ret;
}
-static int ocfs2_zero_extend(struct inode *inode,
- u64 zero_to_size)
+/*
+ * Find the next range to zero. We do this in terms of bytes because
+ * that's what ocfs2_zero_extend() wants, and it is dealing with the
+ * pagecache. We may return multiple extents.
+ *
+ * zero_start and zero_end are ocfs2_zero_extend()'s current idea of what
+ * needs to be zeroed. range_start and range_end return the next zeroing
+ * range. A subsequent call should pass the previous range_end as its
+ * zero_start. If range_end is 0, there's nothing to do.
+ *
+ * Unwritten extents are skipped over. Refcounted extents are CoW'd.
+ */
+static int ocfs2_zero_extend_get_range(struct inode *inode,
+ struct buffer_head *di_bh,
+ u64 zero_start, u64 zero_end,
+ u64 *range_start, u64 *range_end)
{
- int ret = 0;
- u64 start_off;
- struct super_block *sb = inode->i_sb;
+ int rc = 0, needs_cow = 0;
+ u32 p_cpos, zero_clusters = 0;
+ u32 zero_cpos =
+ zero_start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
+ u32 last_cpos = ocfs2_clusters_for_bytes(inode->i_sb, zero_end);
+ unsigned int num_clusters = 0;
+ unsigned int ext_flags = 0;
- start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
- while (start_off < zero_to_size) {
- ret = ocfs2_write_zero_page(inode, start_off);
- if (ret < 0) {
- mlog_errno(ret);
+ while (zero_cpos < last_cpos) {
+ rc = ocfs2_get_clusters(inode, zero_cpos, &p_cpos,
+ &num_clusters, &ext_flags);
+ if (rc) {
+ mlog_errno(rc);
+ goto out;
+ }
+
+ if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) {
+ zero_clusters = num_clusters;
+ if (ext_flags & OCFS2_EXT_REFCOUNTED)
+ needs_cow = 1;
+ break;
+ }
+
+ zero_cpos += num_clusters;
+ }
+ if (!zero_clusters) {
+ *range_end = 0;
+ goto out;
+ }
+
+ while ((zero_cpos + zero_clusters) < last_cpos) {
+ rc = ocfs2_get_clusters(inode, zero_cpos + zero_clusters,
+ &p_cpos, &num_clusters,
+ &ext_flags);
+ if (rc) {
+ mlog_errno(rc);
goto out;
}
- start_off += sb->s_blocksize;
+ if (!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN))
+ break;
+ if (ext_flags & OCFS2_EXT_REFCOUNTED)
+ needs_cow = 1;
+ zero_clusters += num_clusters;
+ }
+ if ((zero_cpos + zero_clusters) > last_cpos)
+ zero_clusters = last_cpos - zero_cpos;
+
+ if (needs_cow) {
+ rc = ocfs2_refcount_cow(inode, di_bh, zero_cpos, zero_clusters,
+ UINT_MAX);
+ if (rc) {
+ mlog_errno(rc);
+ goto out;
+ }
+ }
+
+ *range_start = ocfs2_clusters_to_bytes(inode->i_sb, zero_cpos);
+ *range_end = ocfs2_clusters_to_bytes(inode->i_sb,
+ zero_cpos + zero_clusters);
+
+out:
+ return rc;
+}
+
+/*
+ * Zero one range returned from ocfs2_zero_extend_get_range(). The caller
+ * has made sure that the entire range needs zeroing.
+ */
+static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start,
+ u64 range_end)
+{
+ int rc = 0;
+ u64 next_pos;
+ u64 zero_pos = range_start;
+
+ mlog(0, "range_start = %llu, range_end = %llu\n",
+ (unsigned long long)range_start,
+ (unsigned long long)range_end);
+ BUG_ON(range_start >= range_end);
+
+ while (zero_pos < range_end) {
+ next_pos = (zero_pos & PAGE_CACHE_MASK) + PAGE_CACHE_SIZE;
+ if (next_pos > range_end)
+ next_pos = range_end;
+ rc = ocfs2_write_zero_page(inode, zero_pos, next_pos);
+ if (rc < 0) {
+ mlog_errno(rc);
+ break;
+ }
+ zero_pos = next_pos;
/*
* Very large extends have the potential to lock up
@@ -811,16 +954,63 @@ static int ocfs2_zero_extend(struct inode *inode,
cond_resched();
}
-out:
+ return rc;
+}
+
+int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
+ loff_t zero_to_size)
+{
+ int ret = 0;
+ u64 zero_start, range_start = 0, range_end = 0;
+ struct super_block *sb = inode->i_sb;
+
+ zero_start = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
+ mlog(0, "zero_start %llu for i_size %llu\n",
+ (unsigned long long)zero_start,
+ (unsigned long long)i_size_read(inode));
+ while (zero_start < zero_to_size) {
+ ret = ocfs2_zero_extend_get_range(inode, di_bh, zero_start,
+ zero_to_size,
+ &range_start,
+ &range_end);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+ if (!range_end)
+ break;
+ /* Trim the ends */
+ if (range_start < zero_start)
+ range_start = zero_start;
+ if (range_end > zero_to_size)
+ range_end = zero_to_size;
+
+ ret = ocfs2_zero_extend_range(inode, range_start,
+ range_end);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+ zero_start = range_end;
+ }
+
return ret;
}
-int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, u64 zero_to)
+int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh,
+ u64 new_i_size, u64 zero_to)
{
int ret;
u32 clusters_to_add;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ /*
+ * Only quota files call this without a bh, and they can't be
+ * refcounted.
+ */
+ BUG_ON(!di_bh && (oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
+ BUG_ON(!di_bh && !(oi->ip_flags & OCFS2_INODE_SYSTEM_FILE));
+
clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size);
if (clusters_to_add < oi->ip_clusters)
clusters_to_add = 0;
@@ -841,7 +1031,7 @@ int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, u64 zero_to)
* still need to zero the area between the old i_size and the
* new i_size.
*/
- ret = ocfs2_zero_extend(inode, zero_to);
+ ret = ocfs2_zero_extend(inode, di_bh, zero_to);
if (ret < 0)
mlog_errno(ret);
@@ -863,27 +1053,15 @@ static int ocfs2_extend_file(struct inode *inode,
goto out;
if (i_size_read(inode) == new_i_size)
- goto out;
+ goto out;
BUG_ON(new_i_size < i_size_read(inode));
/*
- * Fall through for converting inline data, even if the fs
- * supports sparse files.
- *
- * The check for inline data here is legal - nobody can add
- * the feature since we have i_mutex. We must check it again
- * after acquiring ip_alloc_sem though, as paths like mmap
- * might have raced us to converting the inode to extents.
- */
- if (!(oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
- && ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
- goto out_update_size;
-
- /*
* The alloc sem blocks people in read/write from reading our
* allocation until we're done changing it. We depend on
* i_mutex to block other extend/truncate calls while we're
- * here.
+ * here. We even have to hold it for sparse files because there
+ * might be some tail zeroing.
*/
down_write(&oi->ip_alloc_sem);
@@ -900,14 +1078,16 @@ static int ocfs2_extend_file(struct inode *inode,
ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
if (ret) {
up_write(&oi->ip_alloc_sem);
-
mlog_errno(ret);
goto out;
}
}
- if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
- ret = ocfs2_extend_no_holes(inode, new_i_size, new_i_size);
+ if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
+ ret = ocfs2_zero_extend(inode, di_bh, new_i_size);
+ else
+ ret = ocfs2_extend_no_holes(inode, di_bh, new_i_size,
+ new_i_size);
up_write(&oi->ip_alloc_sem);
@@ -1053,7 +1233,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
}
/*
- * This will intentionally not wind up calling vmtruncate(),
+ * This will intentionally not wind up calling simple_setsize(),
* since all the work for a size change has been done above.
* Otherwise, we could get into problems with truncate as
* ip_alloc_sem is used there to protect against i_size
@@ -2119,9 +2299,13 @@ relock:
* direct write may have instantiated a few
* blocks outside i_size. Trim these off again.
* Don't need i_size_read because we hold i_mutex.
+ *
+ * XXX(hch): this looks buggy because ocfs2 did not
+ * actually implement ->truncate. Take a look at
+ * the new truncate sequence and update this accordingly
*/
if (*ppos + count > inode->i_size)
- vmtruncate(inode, inode->i_size);
+ simple_setsize(inode, inode->i_size);
ret = written;
goto out_dio;
}
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index d66cf4f7c70e..97bf761c9e7c 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -54,8 +54,10 @@ int ocfs2_add_inode_data(struct ocfs2_super *osb,
int ocfs2_simple_size_update(struct inode *inode,
struct buffer_head *di_bh,
u64 new_i_size);
-int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size,
- u64 zero_to);
+int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh,
+ u64 new_i_size, u64 zero_to);
+int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
+ loff_t zero_to);
int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat);
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 47878cf16418..625de9d7088c 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -472,7 +472,7 @@ static inline struct ocfs2_triggers *to_ocfs2_trigger(struct jbd2_buffer_trigger
return container_of(triggers, struct ocfs2_triggers, ot_triggers);
}
-static void ocfs2_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
+static void ocfs2_frozen_trigger(struct jbd2_buffer_trigger_type *triggers,
struct buffer_head *bh,
void *data, size_t size)
{
@@ -491,7 +491,7 @@ static void ocfs2_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
* Quota blocks have their own trigger because the struct ocfs2_block_check
* offset depends on the blocksize.
*/
-static void ocfs2_dq_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
+static void ocfs2_dq_frozen_trigger(struct jbd2_buffer_trigger_type *triggers,
struct buffer_head *bh,
void *data, size_t size)
{
@@ -511,7 +511,7 @@ static void ocfs2_dq_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
* Directory blocks also have their own trigger because the
* struct ocfs2_block_check offset depends on the blocksize.
*/
-static void ocfs2_db_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
+static void ocfs2_db_frozen_trigger(struct jbd2_buffer_trigger_type *triggers,
struct buffer_head *bh,
void *data, size_t size)
{
@@ -544,7 +544,7 @@ static void ocfs2_abort_trigger(struct jbd2_buffer_trigger_type *triggers,
static struct ocfs2_triggers di_triggers = {
.ot_triggers = {
- .t_commit = ocfs2_commit_trigger,
+ .t_frozen = ocfs2_frozen_trigger,
.t_abort = ocfs2_abort_trigger,
},
.ot_offset = offsetof(struct ocfs2_dinode, i_check),
@@ -552,7 +552,7 @@ static struct ocfs2_triggers di_triggers = {
static struct ocfs2_triggers eb_triggers = {
.ot_triggers = {
- .t_commit = ocfs2_commit_trigger,
+ .t_frozen = ocfs2_frozen_trigger,
.t_abort = ocfs2_abort_trigger,
},
.ot_offset = offsetof(struct ocfs2_extent_block, h_check),
@@ -560,7 +560,7 @@ static struct ocfs2_triggers eb_triggers = {
static struct ocfs2_triggers rb_triggers = {
.ot_triggers = {
- .t_commit = ocfs2_commit_trigger,
+ .t_frozen = ocfs2_frozen_trigger,
.t_abort = ocfs2_abort_trigger,
},
.ot_offset = offsetof(struct ocfs2_refcount_block, rf_check),
@@ -568,7 +568,7 @@ static struct ocfs2_triggers rb_triggers = {
static struct ocfs2_triggers gd_triggers = {
.ot_triggers = {
- .t_commit = ocfs2_commit_trigger,
+ .t_frozen = ocfs2_frozen_trigger,
.t_abort = ocfs2_abort_trigger,
},
.ot_offset = offsetof(struct ocfs2_group_desc, bg_check),
@@ -576,14 +576,14 @@ static struct ocfs2_triggers gd_triggers = {
static struct ocfs2_triggers db_triggers = {
.ot_triggers = {
- .t_commit = ocfs2_db_commit_trigger,
+ .t_frozen = ocfs2_db_frozen_trigger,
.t_abort = ocfs2_abort_trigger,
},
};
static struct ocfs2_triggers xb_triggers = {
.ot_triggers = {
- .t_commit = ocfs2_commit_trigger,
+ .t_frozen = ocfs2_frozen_trigger,
.t_abort = ocfs2_abort_trigger,
},
.ot_offset = offsetof(struct ocfs2_xattr_block, xb_check),
@@ -591,14 +591,14 @@ static struct ocfs2_triggers xb_triggers = {
static struct ocfs2_triggers dq_triggers = {
.ot_triggers = {
- .t_commit = ocfs2_dq_commit_trigger,
+ .t_frozen = ocfs2_dq_frozen_trigger,
.t_abort = ocfs2_abort_trigger,
},
};
static struct ocfs2_triggers dr_triggers = {
.ot_triggers = {
- .t_commit = ocfs2_commit_trigger,
+ .t_frozen = ocfs2_frozen_trigger,
.t_abort = ocfs2_abort_trigger,
},
.ot_offset = offsetof(struct ocfs2_dx_root_block, dr_check),
@@ -606,7 +606,7 @@ static struct ocfs2_triggers dr_triggers = {
static struct ocfs2_triggers dl_triggers = {
.ot_triggers = {
- .t_commit = ocfs2_commit_trigger,
+ .t_frozen = ocfs2_frozen_trigger,
.t_abort = ocfs2_abort_trigger,
},
.ot_offset = offsetof(struct ocfs2_dx_leaf, dl_check),
@@ -1936,7 +1936,7 @@ void ocfs2_orphan_scan_work(struct work_struct *work)
mutex_lock(&os->os_lock);
ocfs2_queue_orphan_scan(osb);
if (atomic_read(&os->os_state) == ORPHAN_SCAN_ACTIVE)
- schedule_delayed_work(&os->os_orphan_scan_work,
+ queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work,
ocfs2_orphan_scan_timeout());
mutex_unlock(&os->os_lock);
}
@@ -1976,8 +1976,8 @@ void ocfs2_orphan_scan_start(struct ocfs2_super *osb)
atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE);
else {
atomic_set(&os->os_state, ORPHAN_SCAN_ACTIVE);
- schedule_delayed_work(&os->os_orphan_scan_work,
- ocfs2_orphan_scan_timeout());
+ queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work,
+ ocfs2_orphan_scan_timeout());
}
}
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 3d7419682dc0..ec6adbf8f551 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -118,6 +118,7 @@ unsigned int ocfs2_la_default_mb(struct ocfs2_super *osb)
{
unsigned int la_mb;
unsigned int gd_mb;
+ unsigned int la_max_mb;
unsigned int megs_per_slot;
struct super_block *sb = osb->sb;
@@ -182,6 +183,12 @@ unsigned int ocfs2_la_default_mb(struct ocfs2_super *osb)
if (megs_per_slot < la_mb)
la_mb = megs_per_slot;
+ /* We can't store more bits than we can in a block. */
+ la_max_mb = ocfs2_clusters_to_megabytes(osb->sb,
+ ocfs2_local_alloc_size(sb) * 8);
+ if (la_mb > la_max_mb)
+ la_mb = la_max_mb;
+
return la_mb;
}
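A worked example of the new cap (all numbers are assumptions for illustration): if ocfs2_local_alloc_size() reports 3900 usable bitmap bytes and the cluster size is 4KB, the local alloc bitmap can track at most 3900 * 8 = 31200 clusters, i.e. 31200 * 4KB = 121.875MB, so a computed default larger than that is clamped to la_max_mb.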
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index 2bb35fe00511..4607923eb24c 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -775,7 +775,7 @@ static int ocfs2_acquire_dquot(struct dquot *dquot)
* locking allocators ranks above a transaction start
*/
WARN_ON(journal_current_handle());
- status = ocfs2_extend_no_holes(gqinode,
+ status = ocfs2_extend_no_holes(gqinode, NULL,
gqinode->i_size + (need_alloc << sb->s_blocksize_bits),
gqinode->i_size);
if (status < 0)
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index 8bd70d4d184d..dc78764ccc4c 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -971,7 +971,7 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
u64 p_blkno;
/* We are protected by dqio_sem so no locking needed */
- status = ocfs2_extend_no_holes(lqinode,
+ status = ocfs2_extend_no_holes(lqinode, NULL,
lqinode->i_size + 2 * sb->s_blocksize,
lqinode->i_size);
if (status < 0) {
@@ -1114,7 +1114,7 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
return ocfs2_local_quota_add_chunk(sb, type, offset);
/* We are protected by dqio_sem so no locking needed */
- status = ocfs2_extend_no_holes(lqinode,
+ status = ocfs2_extend_no_holes(lqinode, NULL,
lqinode->i_size + sb->s_blocksize,
lqinode->i_size);
if (status < 0) {
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 4793f36f6518..3ac5aa733e9c 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -2931,6 +2931,12 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits);
+ /*
+ * We only duplicate pages until we reach the page contains i_size - 1.
+ * So trim 'end' to i_size.
+ */
+ if (end > i_size_read(context->inode))
+ end = i_size_read(context->inode);
while (offset < end) {
page_index = offset >> PAGE_CACHE_SHIFT;
@@ -4166,6 +4172,12 @@ static int __ocfs2_reflink(struct dentry *old_dentry,
struct inode *inode = old_dentry->d_inode;
struct buffer_head *new_bh = NULL;
+ if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE) {
+ ret = -EINVAL;
+ mlog_errno(ret);
+ goto out;
+ }
+
ret = filemap_fdatawrite(inode->i_mapping);
if (ret) {
mlog_errno(ret);
diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c
index 40650021fc24..d8b6e4259b80 100644
--- a/fs/ocfs2/reservations.c
+++ b/fs/ocfs2/reservations.c
@@ -26,7 +26,6 @@
#include <linux/fs.h>
#include <linux/types.h>
-#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/bitops.h>
#include <linux/list.h>
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index f4c2a9eb8c4d..a8e6a95a353f 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -741,7 +741,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
le16_to_cpu(bg->bg_free_bits_count));
le32_add_cpu(&cl->cl_recs[alloc_rec].c_total,
le16_to_cpu(bg->bg_bits));
- cl->cl_recs[alloc_rec].c_blkno = cpu_to_le64(bg->bg_blkno);
+ cl->cl_recs[alloc_rec].c_blkno = bg->bg_blkno;
if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count))
le16_add_cpu(&cl->cl_next_free_rec, 1);
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 2c26ce251cb3..0eaa929a4dbf 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -879,13 +879,15 @@ static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend)
if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
continue;
if (unsuspend)
- status = vfs_quota_enable(
- sb_dqopt(sb)->files[type],
- type, QFMT_OCFS2,
- DQUOT_SUSPENDED);
- else
- status = vfs_quota_disable(sb, type,
- DQUOT_SUSPENDED);
+ status = dquot_resume(sb, type);
+ else {
+ struct ocfs2_mem_dqinfo *oinfo;
+
+ /* Cancel periodic syncing before suspending */
+ oinfo = sb_dqinfo(sb, type)->dqi_priv;
+ cancel_delayed_work_sync(&oinfo->dqi_sync_work);
+ status = dquot_suspend(sb, type);
+ }
if (status < 0)
break;
}
@@ -916,8 +918,8 @@ static int ocfs2_enable_quotas(struct ocfs2_super *osb)
status = -ENOENT;
goto out_quota_off;
}
- status = vfs_quota_enable(inode[type], type, QFMT_OCFS2,
- DQUOT_USAGE_ENABLED);
+ status = dquot_enable(inode[type], type, QFMT_OCFS2,
+ DQUOT_USAGE_ENABLED);
if (status < 0)
goto out_quota_off;
}
@@ -952,8 +954,8 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb)
/* Turn off quotas. This will remove all dquot structures from
* memory and so they will be automatically synced to global
* quota files */
- vfs_quota_disable(sb, type, DQUOT_USAGE_ENABLED |
- DQUOT_LIMITS_ENABLED);
+ dquot_disable(sb, type, DQUOT_USAGE_ENABLED |
+ DQUOT_LIMITS_ENABLED);
if (!inode)
continue;
iput(inode);
@@ -962,7 +964,7 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb)
/* Handle quota on quotactl */
static int ocfs2_quota_on(struct super_block *sb, int type, int format_id,
- char *path, int remount)
+ char *path)
{
unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
@@ -970,30 +972,24 @@ static int ocfs2_quota_on(struct super_block *sb, int type, int format_id,
if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
return -EINVAL;
- if (remount)
- return 0; /* Just ignore it has been handled in
- * ocfs2_remount() */
- return vfs_quota_enable(sb_dqopt(sb)->files[type], type,
- format_id, DQUOT_LIMITS_ENABLED);
+ return dquot_enable(sb_dqopt(sb)->files[type], type,
+ format_id, DQUOT_LIMITS_ENABLED);
}
/* Handle quota off quotactl */
-static int ocfs2_quota_off(struct super_block *sb, int type, int remount)
+static int ocfs2_quota_off(struct super_block *sb, int type)
{
- if (remount)
- return 0; /* Ignore now and handle later in
- * ocfs2_remount() */
- return vfs_quota_disable(sb, type, DQUOT_LIMITS_ENABLED);
+ return dquot_disable(sb, type, DQUOT_LIMITS_ENABLED);
}
static const struct quotactl_ops ocfs2_quotactl_ops = {
.quota_on = ocfs2_quota_on,
.quota_off = ocfs2_quota_off,
- .quota_sync = vfs_quota_sync,
- .get_info = vfs_get_dqinfo,
- .set_info = vfs_set_dqinfo,
- .get_dqblk = vfs_get_dqblk,
- .set_dqblk = vfs_set_dqblk,
+ .quota_sync = dquot_quota_sync,
+ .get_info = dquot_get_dqinfo,
+ .set_info = dquot_set_dqinfo,
+ .get_dqblk = dquot_get_dqblk,
+ .set_dqblk = dquot_set_dqblk,
};
static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index e97b34842cfe..d03469f61801 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -709,7 +709,7 @@ static int ocfs2_xattr_extend_allocation(struct inode *inode,
struct ocfs2_xattr_value_buf *vb,
struct ocfs2_xattr_set_ctxt *ctxt)
{
- int status = 0;
+ int status = 0, credits;
handle_t *handle = ctxt->handle;
enum ocfs2_alloc_restarted why;
u32 prev_clusters, logical_start = le32_to_cpu(vb->vb_xv->xr_clusters);
@@ -719,38 +719,54 @@ static int ocfs2_xattr_extend_allocation(struct inode *inode,
ocfs2_init_xattr_value_extent_tree(&et, INODE_CACHE(inode), vb);
- status = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
+ while (clusters_to_add) {
+ status = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ mlog_errno(status);
+ break;
+ }
- prev_clusters = le32_to_cpu(vb->vb_xv->xr_clusters);
- status = ocfs2_add_clusters_in_btree(handle,
- &et,
- &logical_start,
- clusters_to_add,
- 0,
- ctxt->data_ac,
- ctxt->meta_ac,
- &why);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
+ prev_clusters = le32_to_cpu(vb->vb_xv->xr_clusters);
+ status = ocfs2_add_clusters_in_btree(handle,
+ &et,
+ &logical_start,
+ clusters_to_add,
+ 0,
+ ctxt->data_ac,
+ ctxt->meta_ac,
+ &why);
+ if ((status < 0) && (status != -EAGAIN)) {
+ if (status != -ENOSPC)
+ mlog_errno(status);
+ break;
+ }
- ocfs2_journal_dirty(handle, vb->vb_bh);
+ ocfs2_journal_dirty(handle, vb->vb_bh);
- clusters_to_add -= le32_to_cpu(vb->vb_xv->xr_clusters) - prev_clusters;
+ clusters_to_add -= le32_to_cpu(vb->vb_xv->xr_clusters) -
+ prev_clusters;
- /*
- * We should have already allocated enough space before the transaction,
- * so no need to restart.
- */
- BUG_ON(why != RESTART_NONE || clusters_to_add);
-
-leave:
+ if (why != RESTART_NONE && clusters_to_add) {
+ /*
+ * We can only fail in case the alloc file doesn't give
+ * up enough clusters.
+ */
+ BUG_ON(why == RESTART_META);
+
+ mlog(0, "restarting xattr value extension for %u"
+ " clusters,.\n", clusters_to_add);
+ credits = ocfs2_calc_extend_credits(inode->i_sb,
+ &vb->vb_xv->xr_list,
+ clusters_to_add);
+ status = ocfs2_extend_trans(handle, credits);
+ if (status < 0) {
+ status = -ENOMEM;
+ mlog_errno(status);
+ break;
+ }
+ }
+ }
return status;
}
@@ -6788,16 +6804,15 @@ out:
return ret;
}
-static int ocfs2_reflink_xattr_buckets(handle_t *handle,
+static int ocfs2_reflink_xattr_bucket(handle_t *handle,
u64 blkno, u64 new_blkno, u32 clusters,
+ u32 *cpos, int num_buckets,
struct ocfs2_alloc_context *meta_ac,
struct ocfs2_alloc_context *data_ac,
struct ocfs2_reflink_xattr_tree_args *args)
{
int i, j, ret = 0;
struct super_block *sb = args->reflink->old_inode->i_sb;
- u32 bpc = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(sb));
- u32 num_buckets = clusters * bpc;
int bpb = args->old_bucket->bu_blocks;
struct ocfs2_xattr_value_buf vb = {
.vb_access = ocfs2_journal_access,
@@ -6816,14 +6831,6 @@ static int ocfs2_reflink_xattr_buckets(handle_t *handle,
break;
}
- /*
- * The real bucket num in this series of blocks is stored
- * in the 1st bucket.
- */
- if (i == 0)
- num_buckets = le16_to_cpu(
- bucket_xh(args->old_bucket)->xh_num_buckets);
-
ret = ocfs2_xattr_bucket_journal_access(handle,
args->new_bucket,
OCFS2_JOURNAL_ACCESS_CREATE);
@@ -6837,6 +6844,18 @@ static int ocfs2_reflink_xattr_buckets(handle_t *handle,
bucket_block(args->old_bucket, j),
sb->s_blocksize);
+ /*
+		 * Record the start cpos so that we can use it to initialize
+		 * our xattr tree; we also set xh_num_buckets for the new
+		 * bucket.
+ */
+ if (i == 0) {
+ *cpos = le32_to_cpu(bucket_xh(args->new_bucket)->
+ xh_entries[0].xe_name_hash);
+ bucket_xh(args->new_bucket)->xh_num_buckets =
+ cpu_to_le16(num_buckets);
+ }
+
ocfs2_xattr_bucket_journal_dirty(handle, args->new_bucket);
ret = ocfs2_reflink_xattr_header(handle, args->reflink,
@@ -6866,6 +6885,7 @@ static int ocfs2_reflink_xattr_buckets(handle_t *handle,
}
ocfs2_xattr_bucket_journal_dirty(handle, args->new_bucket);
+
ocfs2_xattr_bucket_relse(args->old_bucket);
ocfs2_xattr_bucket_relse(args->new_bucket);
}
@@ -6874,6 +6894,75 @@ static int ocfs2_reflink_xattr_buckets(handle_t *handle,
ocfs2_xattr_bucket_relse(args->new_bucket);
return ret;
}
+
+static int ocfs2_reflink_xattr_buckets(handle_t *handle,
+ struct inode *inode,
+ struct ocfs2_reflink_xattr_tree_args *args,
+ struct ocfs2_extent_tree *et,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_alloc_context *data_ac,
+ u64 blkno, u32 cpos, u32 len)
+{
+ int ret, first_inserted = 0;
+ u32 p_cluster, num_clusters, reflink_cpos = 0;
+ u64 new_blkno;
+ unsigned int num_buckets, reflink_buckets;
+ unsigned int bpc =
+ ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb));
+
+ ret = ocfs2_read_xattr_bucket(args->old_bucket, blkno);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ num_buckets = le16_to_cpu(bucket_xh(args->old_bucket)->xh_num_buckets);
+ ocfs2_xattr_bucket_relse(args->old_bucket);
+
+ while (len && num_buckets) {
+ ret = ocfs2_claim_clusters(handle, data_ac,
+ 1, &p_cluster, &num_clusters);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ new_blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
+ reflink_buckets = min(num_buckets, bpc * num_clusters);
+
+ ret = ocfs2_reflink_xattr_bucket(handle, blkno,
+ new_blkno, num_clusters,
+ &reflink_cpos, reflink_buckets,
+ meta_ac, data_ac, args);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * For the 1st allocated cluster, we make it use the same cpos
+ * so that the xattr tree looks the same as the original one
+		 * in most cases.
+ */
+ if (!first_inserted) {
+ reflink_cpos = cpos;
+ first_inserted = 1;
+ }
+ ret = ocfs2_insert_extent(handle, et, reflink_cpos, new_blkno,
+ num_clusters, 0, meta_ac);
+ if (ret)
+ mlog_errno(ret);
+
+ mlog(0, "insert new xattr extent rec start %llu len %u to %u\n",
+ (unsigned long long)new_blkno, num_clusters, reflink_cpos);
+
+ len -= num_clusters;
+ blkno += ocfs2_clusters_to_blocks(inode->i_sb, num_clusters);
+ num_buckets -= reflink_buckets;
+ }
+out:
+ return ret;
+}
+
/*
* Create the same xattr extent record in the new inode's xattr tree.
*/
@@ -6885,8 +6974,6 @@ static int ocfs2_reflink_xattr_rec(struct inode *inode,
void *para)
{
int ret, credits = 0;
- u32 p_cluster, num_clusters;
- u64 new_blkno;
handle_t *handle;
struct ocfs2_reflink_xattr_tree_args *args =
(struct ocfs2_reflink_xattr_tree_args *)para;
@@ -6895,6 +6982,9 @@ static int ocfs2_reflink_xattr_rec(struct inode *inode,
struct ocfs2_alloc_context *data_ac = NULL;
struct ocfs2_extent_tree et;
+ mlog(0, "reflink xattr buckets %llu len %u\n",
+ (unsigned long long)blkno, len);
+
ocfs2_init_xattr_tree_extent_tree(&et,
INODE_CACHE(args->reflink->new_inode),
args->new_blk_bh);
@@ -6914,32 +7004,12 @@ static int ocfs2_reflink_xattr_rec(struct inode *inode,
goto out;
}
- ret = ocfs2_claim_clusters(handle, data_ac,
- len, &p_cluster, &num_clusters);
- if (ret) {
- mlog_errno(ret);
- goto out_commit;
- }
-
- new_blkno = ocfs2_clusters_to_blocks(osb->sb, p_cluster);
-
- mlog(0, "reflink xattr buckets %llu to %llu, len %u\n",
- (unsigned long long)blkno, (unsigned long long)new_blkno, len);
- ret = ocfs2_reflink_xattr_buckets(handle, blkno, new_blkno, len,
- meta_ac, data_ac, args);
- if (ret) {
- mlog_errno(ret);
- goto out_commit;
- }
-
- mlog(0, "insert new xattr extent rec start %llu len %u to %u\n",
- (unsigned long long)new_blkno, len, cpos);
- ret = ocfs2_insert_extent(handle, &et, cpos, new_blkno,
- len, 0, meta_ac);
+ ret = ocfs2_reflink_xattr_buckets(handle, inode, args, &et,
+ meta_ac, data_ac,
+ blkno, cpos, len);
if (ret)
mlog_errno(ret);
-out_commit:
ocfs2_commit_trans(osb, handle);
out:
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index 399487c09364..6e7a3291bbe8 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -329,7 +329,7 @@ const struct file_operations omfs_file_operations = {
.aio_read = generic_file_aio_read,
.aio_write = generic_file_aio_write,
.mmap = generic_file_mmap,
- .fsync = simple_fsync,
+ .fsync = generic_file_fsync,
.splice_read = generic_file_splice_read,
};
diff --git a/fs/open.c b/fs/open.c
index 5463266db9e6..0d1fa3dc0efb 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -110,7 +110,7 @@ static long do_sys_truncate(const char __user *pathname, loff_t length)
error = locks_verify_truncate(inode, NULL, length);
if (!error)
- error = security_path_truncate(&path, length, 0);
+ error = security_path_truncate(&path);
if (!error)
error = do_truncate(path.dentry, length, 0, NULL);
@@ -165,8 +165,7 @@ static long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
error = locks_verify_truncate(inode, file, length);
if (!error)
- error = security_path_truncate(&file->f_path, length,
- ATTR_MTIME|ATTR_CTIME);
+ error = security_path_truncate(&file->f_path);
if (!error)
error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, file);
out_putf:
@@ -367,7 +366,7 @@ SYSCALL_DEFINE1(chdir, const char __user *, filename)
if (error)
goto out;
- error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_ACCESS);
+ error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR);
if (error)
goto dput_and_out;
@@ -396,7 +395,7 @@ SYSCALL_DEFINE1(fchdir, unsigned int, fd)
if (!S_ISDIR(inode->i_mode))
goto out_putf;
- error = inode_permission(inode, MAY_EXEC | MAY_ACCESS);
+ error = inode_permission(inode, MAY_EXEC | MAY_CHDIR);
if (!error)
set_fs_pwd(current->fs, &file->f_path);
out_putf:
@@ -414,7 +413,7 @@ SYSCALL_DEFINE1(chroot, const char __user *, filename)
if (error)
goto out;
- error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_ACCESS);
+ error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR);
if (error)
goto dput_and_out;
diff --git a/fs/partitions/ibm.c b/fs/partitions/ibm.c
index 3e73de5967ff..fc8497643fd0 100644
--- a/fs/partitions/ibm.c
+++ b/fs/partitions/ibm.c
@@ -74,6 +74,7 @@ int ibm_partition(struct parsed_partitions *state)
} *label;
unsigned char *data;
Sector sect;
+ sector_t labelsect;
res = 0;
blocksize = bdev_logical_block_size(bdev);
@@ -98,10 +99,19 @@ int ibm_partition(struct parsed_partitions *state)
goto out_freeall;
/*
+ * Special case for FBA disks: label sector does not depend on
+ * blocksize.
+ */
+ if ((info->cu_type == 0x6310 && info->dev_type == 0x9336) ||
+ (info->cu_type == 0x3880 && info->dev_type == 0x3370))
+ labelsect = info->label_block;
+ else
+ labelsect = info->label_block * (blocksize >> 9);
+
+ /*
* Get volume label, extract name and type.
*/
- data = read_part_sector(state, info->label_block*(blocksize/512),
- &sect);
+ data = read_part_sector(state, labelsect, &sect);
if (data == NULL)
goto out_readerr;
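
The effect of the special case, with assumed values: for label_block 2 on a 4 KB device, an ECKD disk reads 512-byte sector 2 * (4096 >> 9) = 16, while the listed FBA models (3880/3370 and 6310/9336) read sector 2 directly. A small stand-alone sketch of both rules:

    /* Hedged sketch of the two label-sector rules (values assumed). */
    #include <stdio.h>

    int main(void)
    {
        unsigned int blocksize = 4096;   /* assumed logical block size */
        unsigned long label_block = 2;   /* assumed label block number */

        unsigned long fba  = label_block;                     /* FBA  */
        unsigned long eckd = label_block * (blocksize >> 9);  /* ECKD */

        printf("FBA sector %lu, ECKD sector %lu\n", fba, eckd); /* 2, 16 */
        return 0;
    }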
diff --git a/fs/pipe.c b/fs/pipe.c
index d79872eba09a..279eef96c51c 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -26,9 +26,14 @@
/*
* The max size that a non-root user is allowed to grow the pipe. Can
- * be set by root in /proc/sys/fs/pipe-max-pages
+ * be set by root in /proc/sys/fs/pipe-max-size
*/
-unsigned int pipe_max_pages = PIPE_DEF_BUFFERS * 16;
+unsigned int pipe_max_size = 1048576;
+
+/*
+ * Minimum pipe size, as required by POSIX
+ */
+unsigned int pipe_min_size = PAGE_SIZE;
/*
* We use a start+len construction, which provides full use of the
@@ -230,6 +235,7 @@ void *generic_pipe_buf_map(struct pipe_inode_info *pipe,
return kmap(buf->page);
}
+EXPORT_SYMBOL(generic_pipe_buf_map);
/**
* generic_pipe_buf_unmap - unmap a previously mapped pipe buffer
@@ -249,6 +255,7 @@ void generic_pipe_buf_unmap(struct pipe_inode_info *pipe,
} else
kunmap(buf->page);
}
+EXPORT_SYMBOL(generic_pipe_buf_unmap);
/**
* generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer
@@ -279,6 +286,7 @@ int generic_pipe_buf_steal(struct pipe_inode_info *pipe,
return 1;
}
+EXPORT_SYMBOL(generic_pipe_buf_steal);
/**
* generic_pipe_buf_get - get a reference to a &struct pipe_buffer
@@ -294,6 +302,7 @@ void generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
{
page_cache_get(buf->page);
}
+EXPORT_SYMBOL(generic_pipe_buf_get);
/**
* generic_pipe_buf_confirm - verify contents of the pipe buffer
@@ -309,6 +318,7 @@ int generic_pipe_buf_confirm(struct pipe_inode_info *info,
{
return 0;
}
+EXPORT_SYMBOL(generic_pipe_buf_confirm);
/**
* generic_pipe_buf_release - put a reference to a &struct pipe_buffer
@@ -323,6 +333,7 @@ void generic_pipe_buf_release(struct pipe_inode_info *pipe,
{
page_cache_release(buf->page);
}
+EXPORT_SYMBOL(generic_pipe_buf_release);
static const struct pipe_buf_operations anon_pipe_buf_ops = {
.can_merge = 1,
@@ -1112,26 +1123,20 @@ SYSCALL_DEFINE1(pipe, int __user *, fildes)
* Allocate a new array of pipe buffers and copy the info over. Returns the
* pipe size if successful, or return -ERROR on error.
*/
-static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
+static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages)
{
struct pipe_buffer *bufs;
/*
- * Must be a power-of-2 currently
- */
- if (!is_power_of_2(arg))
- return -EINVAL;
-
- /*
* We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't
* expect a lot of shrink+grow operations, just free and allocate
* again like we would do for growing. If the pipe currently
* contains more buffers than arg, then return busy.
*/
- if (arg < pipe->nrbufs)
+ if (nr_pages < pipe->nrbufs)
return -EBUSY;
- bufs = kcalloc(arg, sizeof(struct pipe_buffer), GFP_KERNEL);
+ bufs = kcalloc(nr_pages, sizeof(struct pipe_buffer), GFP_KERNEL);
if (unlikely(!bufs))
return -ENOMEM;
@@ -1140,20 +1145,56 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
* and adjust the indexes.
*/
if (pipe->nrbufs) {
- const unsigned int tail = pipe->nrbufs & (pipe->buffers - 1);
- const unsigned int head = pipe->nrbufs - tail;
+ unsigned int tail;
+ unsigned int head;
+ tail = pipe->curbuf + pipe->nrbufs;
+ if (tail < pipe->buffers)
+ tail = 0;
+ else
+ tail &= (pipe->buffers - 1);
+
+ head = pipe->nrbufs - tail;
if (head)
memcpy(bufs, pipe->bufs + pipe->curbuf, head * sizeof(struct pipe_buffer));
if (tail)
- memcpy(bufs + head, pipe->bufs + pipe->curbuf, tail * sizeof(struct pipe_buffer));
+ memcpy(bufs + head, pipe->bufs, tail * sizeof(struct pipe_buffer));
}
pipe->curbuf = 0;
kfree(pipe->bufs);
pipe->bufs = bufs;
- pipe->buffers = arg;
- return arg;
+ pipe->buffers = nr_pages;
+ return nr_pages * PAGE_SIZE;
+}
+
+/*
+ * Currently we rely on the pipe array holding a power-of-2 number
+ * of pages.
+ */
+static inline unsigned int round_pipe_size(unsigned int size)
+{
+ unsigned long nr_pages;
+
+ nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ return roundup_pow_of_two(nr_pages) << PAGE_SHIFT;
+}
+
+/*
+ * This should work even if CONFIG_PROC_FS isn't set, as proc_dointvec_minmax
+ * will return an error.
+ */
+int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf,
+ size_t *lenp, loff_t *ppos)
+{
+ int ret;
+
+ ret = proc_dointvec_minmax(table, write, buf, lenp, ppos);
+ if (ret < 0 || !write)
+ return ret;
+
+ pipe_max_size = round_pipe_size(pipe_max_size);
+ return ret;
}
long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
@@ -1168,25 +1209,32 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
mutex_lock(&pipe->inode->i_mutex);
switch (cmd) {
- case F_SETPIPE_SZ:
- if (!capable(CAP_SYS_ADMIN) && arg > pipe_max_pages)
- return -EINVAL;
- /*
- * The pipe needs to be at least 2 pages large to
- * guarantee POSIX behaviour.
- */
- if (arg < 2)
- return -EINVAL;
- ret = pipe_set_size(pipe, arg);
+ case F_SETPIPE_SZ: {
+ unsigned int size, nr_pages;
+
+ size = round_pipe_size(arg);
+ nr_pages = size >> PAGE_SHIFT;
+
+ ret = -EINVAL;
+ if (!nr_pages)
+ goto out;
+
+ if (!capable(CAP_SYS_RESOURCE) && size > pipe_max_size) {
+ ret = -EPERM;
+ goto out;
+ }
+ ret = pipe_set_size(pipe, nr_pages);
break;
+ }
case F_GETPIPE_SZ:
- ret = pipe->buffers;
+ ret = pipe->buffers * PAGE_SIZE;
break;
default:
ret = -EINVAL;
break;
}
+out:
mutex_unlock(&pipe->inode->i_mutex);
return ret;
}
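
From userspace the new interface is byte-based: F_SETPIPE_SZ rounds the request up to a power-of-two number of pages (capped at pipe-max-size for callers without CAP_SYS_RESOURCE) and returns the size actually granted, which F_GETPIPE_SZ also reports. A hedged sketch (the 100000-byte request and its 131072-byte grant assume 4 KB pages):

    /* Hedged userspace sketch of the byte-based pipe-size fcntls. */
    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        int fds[2];

        if (pipe(fds))
            return 1;

        /* 100000 bytes -> 25 pages -> rounded up to 32 pages. */
        long set = fcntl(fds[1], F_SETPIPE_SZ, 100000);
        long got = fcntl(fds[1], F_GETPIPE_SZ);

        printf("granted %ld bytes, F_GETPIPE_SZ says %ld\n", set, got);
        close(fds[0]);
        close(fds[1]);
        return 0;
    }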
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 885ab5513ac5..fff6572676ae 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -176,7 +176,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
if (tracer)
tpid = task_pid_nr_ns(tracer, ns);
}
- cred = get_cred((struct cred *) __task_cred(p));
+ cred = get_task_cred(p);
seq_printf(m,
"State:\t%s\n"
"Tgid:\t%d\n"
@@ -267,7 +267,7 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p)
shpending = p->signal->shared_pending.signal;
blocked = p->blocked;
collect_sigign_sigcatch(p, &ignored, &caught);
- num_threads = atomic_read(&p->signal->count);
+ num_threads = get_nr_threads(p);
rcu_read_lock(); /* FIXME: is this correct? */
qsize = atomic_read(&__task_cred(p)->user->sigpending);
rcu_read_unlock();
@@ -410,7 +410,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
tty_nr = new_encode_dev(tty_devnum(sig->tty));
}
- num_threads = atomic_read(&sig->count);
+ num_threads = get_nr_threads(task);
collect_sigign_sigcatch(task, &sigign, &sigcatch);
cmin_flt = sig->cmin_flt;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index c7f9f23449dc..acb7ef80ea4f 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -166,18 +166,6 @@ static int get_fs_path(struct task_struct *task, struct path *path, bool root)
return result;
}
-static int get_nr_threads(struct task_struct *tsk)
-{
- unsigned long flags;
- int count = 0;
-
- if (lock_task_sighand(tsk, &flags)) {
- count = atomic_read(&tsk->signal->count);
- unlock_task_sighand(tsk, &flags);
- }
- return count;
-}
-
static int proc_cwd_link(struct inode *inode, struct path *path)
{
struct task_struct *task = get_proc_task(inode);
@@ -2444,7 +2432,7 @@ static struct dentry *proc_base_instantiate(struct inode *dir,
const struct pid_entry *p = ptr;
struct inode *inode;
struct proc_inode *ei;
- struct dentry *error = ERR_PTR(-EINVAL);
+ struct dentry *error;
/* Allocate the inode */
error = ERR_PTR(-ENOMEM);
@@ -2794,7 +2782,7 @@ out:
struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
{
- struct dentry *result = ERR_PTR(-ENOENT);
+ struct dentry *result;
struct task_struct *task;
unsigned tgid;
struct pid_namespace *ns;
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 43c127490606..2791907744ed 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -343,21 +343,6 @@ static DEFINE_SPINLOCK(proc_inum_lock); /* protects the above */
/*
* Return an inode number between PROC_DYNAMIC_FIRST and
* 0xffffffff, or zero on failure.
- *
- * Current inode allocations in the proc-fs (hex-numbers):
- *
- * 00000000 reserved
- * 00000001-00000fff static entries (goners)
- * 001 root-ino
- *
- * 00001000-00001fff unused
- * 0001xxxx-7fffxxxx pid-dir entries for pid 1-7fff
- * 80000000-efffffff unused
- * f0000000-ffffffff dynamic entries
- *
- * Goal:
- * Once we split the thing into several virtual filesystems,
- * we will get rid of magical ranges (and this comment, BTW).
*/
static unsigned int get_inode_number(void)
{
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index c837a77351be..6f37c391468d 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -588,7 +588,7 @@ static struct kcore_list kcore_text;
*/
static void __init proc_kcore_text_init(void)
{
- kclist_add(&kcore_text, _stext, _end - _stext, KCORE_TEXT);
+ kclist_add(&kcore_text, _text, _end - _text, KCORE_TEXT);
}
#else
static void __init proc_kcore_text_init(void)
diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c
index ce94801f48ca..d9396a4fc7ff 100644
--- a/fs/proc/proc_devtree.c
+++ b/fs/proc/proc_devtree.c
@@ -209,6 +209,9 @@ void proc_device_tree_add_node(struct device_node *np,
for (pp = np->properties; pp != NULL; pp = pp->next) {
p = pp->name;
+ if (strchr(p, '/'))
+ continue;
+
if (duplicate_name(de, p))
p = fixup_name(np, de, p);
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 757c069f2a65..4258384ed22d 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -110,7 +110,6 @@ void __init proc_root_init(void)
if (err)
return;
proc_mnt = kern_mount_data(&proc_fs_type, &init_pid_ns);
- err = PTR_ERR(proc_mnt);
if (IS_ERR(proc_mnt)) {
unregister_filesystem(&proc_fs_type);
return;
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 46d4b5d72bd3..cb6306e63843 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -122,11 +122,20 @@ int task_statm(struct mm_struct *mm, int *shared, int *text,
return size;
}
+static void pad_len_spaces(struct seq_file *m, int len)
+{
+ len = 25 + sizeof(void*) * 6 - len;
+ if (len < 1)
+ len = 1;
+ seq_printf(m, "%*c", len, ' ');
+}
+
/*
* display a single VMA to a sequenced file
*/
static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
{
+ struct mm_struct *mm = vma->vm_mm;
unsigned long ino = 0;
struct file *file;
dev_t dev = 0;
@@ -155,11 +164,14 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
MAJOR(dev), MINOR(dev), ino, &len);
if (file) {
- len = 25 + sizeof(void *) * 6 - len;
- if (len < 1)
- len = 1;
- seq_printf(m, "%*c", len, ' ');
+ pad_len_spaces(m, len);
seq_path(m, &file->f_path, "");
+ } else if (mm) {
+ if (vma->vm_start <= mm->start_stack &&
+ vma->vm_end >= mm->start_stack) {
+ pad_len_spaces(m, len);
+ seq_puts(m, "[stack]");
+ }
}
seq_putc(m, '\n');
diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c
index 6f30c3d5bcbf..6e8fc62b40a8 100644
--- a/fs/qnx4/dir.c
+++ b/fs/qnx4/dir.c
@@ -77,9 +77,10 @@ out:
const struct file_operations qnx4_dir_operations =
{
+ .llseek = generic_file_llseek,
.read = generic_read_dir,
.readdir = qnx4_readdir,
- .fsync = simple_fsync,
+ .fsync = generic_file_fsync,
};
const struct inode_operations qnx4_dir_inode_operations =
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 655a4c52b8c3..437d2ca2de97 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -228,10 +228,6 @@ static struct hlist_head *dquot_hash;
struct dqstats dqstats;
EXPORT_SYMBOL(dqstats);
-#ifdef CONFIG_SMP
-struct dqstats *dqstats_pcpu;
-EXPORT_SYMBOL(dqstats_pcpu);
-#endif
static qsize_t inode_get_rsv_space(struct inode *inode);
static void __dquot_initialize(struct inode *inode, int type);
@@ -584,7 +580,7 @@ out:
}
EXPORT_SYMBOL(dquot_scan_active);
-int vfs_quota_sync(struct super_block *sb, int type, int wait)
+int dquot_quota_sync(struct super_block *sb, int type, int wait)
{
struct list_head *dirty;
struct dquot *dquot;
@@ -656,7 +652,7 @@ int vfs_quota_sync(struct super_block *sb, int type, int wait)
return 0;
}
-EXPORT_SYMBOL(vfs_quota_sync);
+EXPORT_SYMBOL(dquot_quota_sync);
/* Free unused dquots from cache */
static void prune_dqcache(int count)
@@ -676,35 +672,20 @@ static void prune_dqcache(int count)
}
}
-static int dqstats_read(unsigned int type)
-{
- int count = 0;
-#ifdef CONFIG_SMP
- int cpu;
- for_each_possible_cpu(cpu)
- count += per_cpu_ptr(dqstats_pcpu, cpu)->stat[type];
- /* Statistics reading is racy, but absolute accuracy isn't required */
- if (count < 0)
- count = 0;
-#else
- count = dqstats.stat[type];
-#endif
- return count;
-}
-
/*
* This is called from kswapd when we think we need some
* more memory
*/
-
-static int shrink_dqcache_memory(int nr, gfp_t gfp_mask)
+static int shrink_dqcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
{
if (nr) {
spin_lock(&dq_list_lock);
prune_dqcache(nr);
spin_unlock(&dq_list_lock);
}
- return (dqstats_read(DQST_FREE_DQUOTS)/100) * sysctl_vfs_cache_pressure;
+ return ((unsigned)
+ percpu_counter_read_positive(&dqstats.counter[DQST_FREE_DQUOTS])
+ /100) * sysctl_vfs_cache_pressure;
}
static struct shrinker dqcache_shrinker = {
@@ -1514,11 +1495,13 @@ static void inode_decr_space(struct inode *inode, qsize_t number, int reserve)
/*
* This operation can block, but only after everything is updated
*/
-int __dquot_alloc_space(struct inode *inode, qsize_t number,
- int warn, int reserve)
+int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags)
{
int cnt, ret = 0;
char warntype[MAXQUOTAS];
+ int warn = flags & DQUOT_SPACE_WARN;
+ int reserve = flags & DQUOT_SPACE_RESERVE;
+ int nofail = flags & DQUOT_SPACE_NOFAIL;
/*
* First test before acquiring mutex - solves deadlocks when we
@@ -1539,7 +1522,7 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number,
continue;
ret = check_bdq(inode->i_dquot[cnt], number, !warn,
warntype+cnt);
- if (ret) {
+ if (ret && !nofail) {
spin_unlock(&dq_data_lock);
goto out_flush_warn;
}
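
Callers now pass one flags word instead of separate warn/reserve booleans. A hedged sketch of the two ends of the spectrum (inode and nr_bytes are assumed to be in scope):

    /* Hedged sketch of callers after the flag conversion. */

    /* Ordinary data allocation: fail over quota, warn the user. */
    ret = __dquot_alloc_space(inode, nr_bytes, DQUOT_SPACE_WARN);
    if (ret)
        return ret;

    /* Unavoidable metadata write: charge the space even over quota
     * (return value ignored here for brevity). */
    __dquot_alloc_space(inode, nr_bytes, DQUOT_SPACE_NOFAIL);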
@@ -1638,10 +1621,11 @@ EXPORT_SYMBOL(dquot_claim_space_nodirty);
/*
* This operation can block, but only after everything is updated
*/
-void __dquot_free_space(struct inode *inode, qsize_t number, int reserve)
+void __dquot_free_space(struct inode *inode, qsize_t number, int flags)
{
unsigned int cnt;
char warntype[MAXQUOTAS];
+ int reserve = flags & DQUOT_SPACE_RESERVE;
/* First test before acquiring mutex - solves deadlocks when we
* re-enter the quota code and are already holding the mutex */
@@ -1812,7 +1796,7 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
if (iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid)
transfer_to[USRQUOTA] = dqget(sb, iattr->ia_uid, USRQUOTA);
if (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)
- transfer_to[GRPQUOTA] = dqget(sb, iattr->ia_uid, GRPQUOTA);
+ transfer_to[GRPQUOTA] = dqget(sb, iattr->ia_gid, GRPQUOTA);
ret = __dquot_transfer(inode, transfer_to);
dqput_all(transfer_to);
@@ -1847,6 +1831,7 @@ const struct dquot_operations dquot_operations = {
.alloc_dquot = dquot_alloc,
.destroy_dquot = dquot_destroy,
};
+EXPORT_SYMBOL(dquot_operations);
/*
* Generic helper for ->open on filesystems supporting disk quotas.
@@ -1865,7 +1850,7 @@ EXPORT_SYMBOL(dquot_file_open);
/*
* Turn quota off on a device. type == -1 ==> quotaoff for all types (umount)
*/
-int vfs_quota_disable(struct super_block *sb, int type, unsigned int flags)
+int dquot_disable(struct super_block *sb, int type, unsigned int flags)
{
int cnt, ret = 0;
struct quota_info *dqopt = sb_dqopt(sb);
@@ -1995,14 +1980,15 @@ put_inodes:
}
return ret;
}
-EXPORT_SYMBOL(vfs_quota_disable);
+EXPORT_SYMBOL(dquot_disable);
-int vfs_quota_off(struct super_block *sb, int type, int remount)
+int dquot_quota_off(struct super_block *sb, int type)
{
- return vfs_quota_disable(sb, type, remount ? DQUOT_SUSPENDED :
- (DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED));
+ return dquot_disable(sb, type,
+ DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
}
-EXPORT_SYMBOL(vfs_quota_off);
+EXPORT_SYMBOL(dquot_quota_off);
+
/*
* Turn quotas on on a device
*/
@@ -2120,36 +2106,43 @@ out_fmt:
}
/* Reenable quotas on remount RW */
-static int vfs_quota_on_remount(struct super_block *sb, int type)
+int dquot_resume(struct super_block *sb, int type)
{
struct quota_info *dqopt = sb_dqopt(sb);
struct inode *inode;
- int ret;
+ int ret = 0, cnt;
unsigned int flags;
- mutex_lock(&dqopt->dqonoff_mutex);
- if (!sb_has_quota_suspended(sb, type)) {
+ for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+ if (type != -1 && cnt != type)
+ continue;
+
+ mutex_lock(&dqopt->dqonoff_mutex);
+ if (!sb_has_quota_suspended(sb, cnt)) {
+ mutex_unlock(&dqopt->dqonoff_mutex);
+ continue;
+ }
+ inode = dqopt->files[cnt];
+ dqopt->files[cnt] = NULL;
+ spin_lock(&dq_state_lock);
+ flags = dqopt->flags & dquot_state_flag(DQUOT_USAGE_ENABLED |
+ DQUOT_LIMITS_ENABLED,
+ cnt);
+ dqopt->flags &= ~dquot_state_flag(DQUOT_STATE_FLAGS, cnt);
+ spin_unlock(&dq_state_lock);
mutex_unlock(&dqopt->dqonoff_mutex);
- return 0;
- }
- inode = dqopt->files[type];
- dqopt->files[type] = NULL;
- spin_lock(&dq_state_lock);
- flags = dqopt->flags & dquot_state_flag(DQUOT_USAGE_ENABLED |
- DQUOT_LIMITS_ENABLED, type);
- dqopt->flags &= ~dquot_state_flag(DQUOT_STATE_FLAGS, type);
- spin_unlock(&dq_state_lock);
- mutex_unlock(&dqopt->dqonoff_mutex);
- flags = dquot_generic_flag(flags, type);
- ret = vfs_load_quota_inode(inode, type, dqopt->info[type].dqi_fmt_id,
- flags);
- iput(inode);
+ flags = dquot_generic_flag(flags, cnt);
+ ret = vfs_load_quota_inode(inode, cnt,
+ dqopt->info[cnt].dqi_fmt_id, flags);
+ iput(inode);
+ }
return ret;
}
+EXPORT_SYMBOL(dquot_resume);
-int vfs_quota_on_path(struct super_block *sb, int type, int format_id,
+int dquot_quota_on_path(struct super_block *sb, int type, int format_id,
struct path *path)
{
int error = security_quota_on(path->dentry);
@@ -2164,40 +2157,36 @@ int vfs_quota_on_path(struct super_block *sb, int type, int format_id,
DQUOT_LIMITS_ENABLED);
return error;
}
-EXPORT_SYMBOL(vfs_quota_on_path);
+EXPORT_SYMBOL(dquot_quota_on_path);
-int vfs_quota_on(struct super_block *sb, int type, int format_id, char *name,
- int remount)
+int dquot_quota_on(struct super_block *sb, int type, int format_id, char *name)
{
struct path path;
int error;
- if (remount)
- return vfs_quota_on_remount(sb, type);
-
error = kern_path(name, LOOKUP_FOLLOW, &path);
if (!error) {
- error = vfs_quota_on_path(sb, type, format_id, &path);
+ error = dquot_quota_on_path(sb, type, format_id, &path);
path_put(&path);
}
return error;
}
-EXPORT_SYMBOL(vfs_quota_on);
+EXPORT_SYMBOL(dquot_quota_on);
/*
* More powerful function for turning on quotas allowing setting
* of individual quota flags
*/
-int vfs_quota_enable(struct inode *inode, int type, int format_id,
- unsigned int flags)
+int dquot_enable(struct inode *inode, int type, int format_id,
+ unsigned int flags)
{
int ret = 0;
struct super_block *sb = inode->i_sb;
struct quota_info *dqopt = sb_dqopt(sb);
/* Just unsuspend quotas? */
- if (flags & DQUOT_SUSPENDED)
- return vfs_quota_on_remount(sb, type);
+ BUG_ON(flags & DQUOT_SUSPENDED);
+
if (!flags)
return 0;
/* Just updating flags needed? */
@@ -2229,13 +2218,13 @@ out_lock:
load_quota:
return vfs_load_quota_inode(inode, type, format_id, flags);
}
-EXPORT_SYMBOL(vfs_quota_enable);
+EXPORT_SYMBOL(dquot_enable);
/*
* This function is used when filesystem needs to initialize quotas
* during mount time.
*/
-int vfs_quota_on_mount(struct super_block *sb, char *qf_name,
+int dquot_quota_on_mount(struct super_block *sb, char *qf_name,
int format_id, int type)
{
struct dentry *dentry;
@@ -2261,24 +2250,7 @@ out:
dput(dentry);
return error;
}
-EXPORT_SYMBOL(vfs_quota_on_mount);
-
-/* Wrapper to turn on quotas when remounting rw */
-int vfs_dq_quota_on_remount(struct super_block *sb)
-{
- int cnt;
- int ret = 0, err;
-
- if (!sb->s_qcop || !sb->s_qcop->quota_on)
- return -ENOSYS;
- for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
- err = sb->s_qcop->quota_on(sb, cnt, 0, NULL, 1);
- if (err < 0 && !ret)
- ret = err;
- }
- return ret;
-}
-EXPORT_SYMBOL(vfs_dq_quota_on_remount);
+EXPORT_SYMBOL(dquot_quota_on_mount);
static inline qsize_t qbtos(qsize_t blocks)
{
@@ -2313,8 +2285,8 @@ static void do_get_dqblk(struct dquot *dquot, struct fs_disk_quota *di)
spin_unlock(&dq_data_lock);
}
-int vfs_get_dqblk(struct super_block *sb, int type, qid_t id,
- struct fs_disk_quota *di)
+int dquot_get_dqblk(struct super_block *sb, int type, qid_t id,
+ struct fs_disk_quota *di)
{
struct dquot *dquot;
@@ -2326,7 +2298,7 @@ int vfs_get_dqblk(struct super_block *sb, int type, qid_t id,
return 0;
}
-EXPORT_SYMBOL(vfs_get_dqblk);
+EXPORT_SYMBOL(dquot_get_dqblk);
#define VFS_FS_DQ_MASK \
(FS_DQ_BCOUNT | FS_DQ_BSOFT | FS_DQ_BHARD | \
@@ -2425,7 +2397,7 @@ static int do_set_dqblk(struct dquot *dquot, struct fs_disk_quota *di)
return 0;
}
-int vfs_set_dqblk(struct super_block *sb, int type, qid_t id,
+int dquot_set_dqblk(struct super_block *sb, int type, qid_t id,
struct fs_disk_quota *di)
{
struct dquot *dquot;
@@ -2441,10 +2413,10 @@ int vfs_set_dqblk(struct super_block *sb, int type, qid_t id,
out:
return rc;
}
-EXPORT_SYMBOL(vfs_set_dqblk);
+EXPORT_SYMBOL(dquot_set_dqblk);
/* Generic routine for getting common part of quota file information */
-int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
+int dquot_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
{
struct mem_dqinfo *mi;
@@ -2463,10 +2435,10 @@ int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
return 0;
}
-EXPORT_SYMBOL(vfs_get_dqinfo);
+EXPORT_SYMBOL(dquot_get_dqinfo);
/* Generic routine for setting common part of quota file information */
-int vfs_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
+int dquot_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
{
struct mem_dqinfo *mi;
int err = 0;
@@ -2493,27 +2465,27 @@ out:
mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
return err;
}
-EXPORT_SYMBOL(vfs_set_dqinfo);
+EXPORT_SYMBOL(dquot_set_dqinfo);
-const struct quotactl_ops vfs_quotactl_ops = {
- .quota_on = vfs_quota_on,
- .quota_off = vfs_quota_off,
- .quota_sync = vfs_quota_sync,
- .get_info = vfs_get_dqinfo,
- .set_info = vfs_set_dqinfo,
- .get_dqblk = vfs_get_dqblk,
- .set_dqblk = vfs_set_dqblk
+const struct quotactl_ops dquot_quotactl_ops = {
+ .quota_on = dquot_quota_on,
+ .quota_off = dquot_quota_off,
+ .quota_sync = dquot_quota_sync,
+ .get_info = dquot_get_dqinfo,
+ .set_info = dquot_set_dqinfo,
+ .get_dqblk = dquot_get_dqblk,
+ .set_dqblk = dquot_set_dqblk
};
-
+EXPORT_SYMBOL(dquot_quotactl_ops);
static int do_proc_dqstats(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
-#ifdef CONFIG_SMP
- /* Update global table */
unsigned int type = (int *)table->data - dqstats.stat;
- dqstats.stat[type] = dqstats_read(type);
-#endif
+
+ /* Update global table */
+ dqstats.stat[type] =
+ percpu_counter_sum_positive(&dqstats.counter[type]);
return proc_dointvec(table, write, buffer, lenp, ppos);
}
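
The hand-rolled per-CPU array is replaced by the generic percpu_counter API: increments stay on the local CPU, and readers fold the per-CPU deltas only when an exact figure is needed. A hedged sketch of that pattern (nr_widgets is a hypothetical counter, not one of the dqstats entries):

    /* Hedged sketch of the percpu_counter pattern adopted here. */
    #include <linux/percpu_counter.h>

    static struct percpu_counter nr_widgets;

    static int __init widgets_init(void)
    {
        return percpu_counter_init(&nr_widgets, 0);
    }

    static void widget_created(void)
    {
        percpu_counter_inc(&nr_widgets);   /* cheap per-CPU fast path */
    }

    static s64 widgets_exact(void)
    {
        /* Slow path: sum all per-CPU deltas, clamped to >= 0. */
        return percpu_counter_sum_positive(&nr_widgets);
    }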
@@ -2606,7 +2578,7 @@ static ctl_table sys_table[] = {
static int __init dquot_init(void)
{
- int i;
+ int i, ret;
unsigned long nr_hash, order;
printk(KERN_NOTICE "VFS: Disk quotas %s\n", __DQUOT_VERSION__);
@@ -2624,12 +2596,11 @@ static int __init dquot_init(void)
if (!dquot_hash)
panic("Cannot create dquot hash table");
-#ifdef CONFIG_SMP
- dqstats_pcpu = alloc_percpu(struct dqstats);
- if (!dqstats_pcpu)
- panic("Cannot create dquot stats table");
-#endif
- memset(&dqstats, 0, sizeof(struct dqstats));
+ for (i = 0; i < _DQST_DQSTAT_LAST; i++) {
+ ret = percpu_counter_init(&dqstats.counter[i], 0);
+ if (ret)
+ panic("Cannot create dquot stat counters");
+ }
/* Find power-of-two hlist_heads which can fit into allocation */
nr_hash = (1UL << order) * PAGE_SIZE / sizeof(struct hlist_head);
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index ce3dfd066f59..b299961e1edb 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -73,7 +73,7 @@ static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id,
if (IS_ERR(pathname))
return PTR_ERR(pathname);
if (sb->s_qcop->quota_on)
- ret = sb->s_qcop->quota_on(sb, type, id, pathname, 0);
+ ret = sb->s_qcop->quota_on(sb, type, id, pathname);
putname(pathname);
return ret;
}
@@ -260,7 +260,7 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
case Q_QUOTAOFF:
if (!sb->s_qcop->quota_off)
return -ENOSYS;
- return sb->s_qcop->quota_off(sb, type, 0);
+ return sb->s_qcop->quota_off(sb, type);
case Q_GETFMT:
return quota_getfmt(sb, type, addr);
case Q_GETINFO:
diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c
index 78f613cb9c76..4884ac5ae9be 100644
--- a/fs/ramfs/file-mmu.c
+++ b/fs/ramfs/file-mmu.c
@@ -43,12 +43,13 @@ const struct file_operations ramfs_file_operations = {
.write = do_sync_write,
.aio_write = generic_file_aio_write,
.mmap = generic_file_mmap,
- .fsync = simple_sync_file,
+ .fsync = noop_fsync,
.splice_read = generic_file_splice_read,
.splice_write = generic_file_splice_write,
.llseek = generic_file_llseek,
};
const struct inode_operations ramfs_file_inode_operations = {
+ .setattr = simple_setattr,
.getattr = simple_getattr,
};
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 5ea4ad81a429..d532c20fc179 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -42,7 +42,7 @@ const struct file_operations ramfs_file_operations = {
.aio_read = generic_file_aio_read,
.write = do_sync_write,
.aio_write = generic_file_aio_write,
- .fsync = simple_sync_file,
+ .fsync = noop_fsync,
.splice_read = generic_file_splice_read,
.splice_write = generic_file_splice_write,
.llseek = generic_file_llseek,
@@ -146,7 +146,7 @@ static int ramfs_nommu_resize(struct inode *inode, loff_t newsize, loff_t size)
return ret;
}
- ret = vmtruncate(inode, newsize);
+ ret = simple_setsize(inode, newsize);
return ret;
}
@@ -169,7 +169,8 @@ static int ramfs_nommu_setattr(struct dentry *dentry, struct iattr *ia)
/* pick out size-changing events */
if (ia->ia_valid & ATTR_SIZE) {
- loff_t size = i_size_read(inode);
+ loff_t size = inode->i_size;
+
if (ia->ia_size != size) {
ret = ramfs_nommu_resize(inode, ia->ia_size, size);
if (ret < 0 || ia->ia_valid == ATTR_SIZE)
@@ -182,7 +183,7 @@ static int ramfs_nommu_setattr(struct dentry *dentry, struct iattr *ia)
}
}
- ret = inode_setattr(inode, ia);
+ generic_setattr(inode, ia);
out:
ia->ia_valid = old_ia_valid;
return ret;
diff --git a/fs/read_write.c b/fs/read_write.c
index 113386d6fd2d..9c0485236e68 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -97,6 +97,23 @@ loff_t generic_file_llseek(struct file *file, loff_t offset, int origin)
}
EXPORT_SYMBOL(generic_file_llseek);
+/**
+ * noop_llseek - No Operation Performed llseek implementation
+ * @file: file structure to seek on
+ * @offset: file offset to seek to
+ * @origin: type of seek
+ *
+ * This is an implementation of ->llseek usable for the rare special case when
+ * userspace expects the seek to succeed but the (device) file is actually not
+ * able to perform the seek. In this case you use noop_llseek() instead of
+ * falling back to the default implementation of ->llseek.
+ */
+loff_t noop_llseek(struct file *file, loff_t offset, int origin)
+{
+ return file->f_pos;
+}
+EXPORT_SYMBOL(noop_llseek);
+
loff_t no_llseek(struct file *file, loff_t offset, int origin)
{
return -ESPIPE;
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index 07930449a958..198dabf1b2bb 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -14,10 +14,10 @@
extern const struct reiserfs_key MIN_KEY;
static int reiserfs_readdir(struct file *, void *, filldir_t);
-static int reiserfs_dir_fsync(struct file *filp, struct dentry *dentry,
- int datasync);
+static int reiserfs_dir_fsync(struct file *filp, int datasync);
const struct file_operations reiserfs_dir_operations = {
+ .llseek = generic_file_llseek,
.read = generic_read_dir,
.readdir = reiserfs_readdir,
.fsync = reiserfs_dir_fsync,
@@ -27,10 +27,9 @@ const struct file_operations reiserfs_dir_operations = {
#endif
};
-static int reiserfs_dir_fsync(struct file *filp, struct dentry *dentry,
- int datasync)
+static int reiserfs_dir_fsync(struct file *filp, int datasync)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = filp->f_mapping->host;
int err;
reiserfs_write_lock(inode->i_sb);
err = reiserfs_commit_for_inode(inode);
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 9977df9f3a54..b82cdd8a45dd 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -134,10 +134,9 @@ static void reiserfs_vfs_truncate_file(struct inode *inode)
* be removed...
*/
-static int reiserfs_sync_file(struct file *filp,
- struct dentry *dentry, int datasync)
+static int reiserfs_sync_file(struct file *filp, int datasync)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = filp->f_mapping->host;
int err;
int barrier_done;
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 59125fb36d42..9822fa15118b 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -158,6 +158,7 @@ static int finish_unfinished(struct super_block *s)
#ifdef CONFIG_QUOTA
int i;
int ms_active_set;
+ int quota_enabled[MAXQUOTAS];
#endif
/* compose key to look for "save" links */
@@ -179,8 +180,15 @@ static int finish_unfinished(struct super_block *s)
}
/* Turn on quotas so that they are updated correctly */
for (i = 0; i < MAXQUOTAS; i++) {
+ quota_enabled[i] = 1;
if (REISERFS_SB(s)->s_qf_names[i]) {
- int ret = reiserfs_quota_on_mount(s, i);
+ int ret;
+
+ if (sb_has_quota_active(s, i)) {
+ quota_enabled[i] = 0;
+ continue;
+ }
+ ret = reiserfs_quota_on_mount(s, i);
if (ret < 0)
reiserfs_warning(s, "reiserfs-2500",
"cannot turn on journaled "
@@ -304,8 +312,8 @@ static int finish_unfinished(struct super_block *s)
#ifdef CONFIG_QUOTA
/* Turn quotas off */
for (i = 0; i < MAXQUOTAS; i++) {
- if (sb_dqopt(s)->files[i])
- vfs_quota_off(s, i, 0);
+ if (sb_dqopt(s)->files[i] && quota_enabled[i])
+ dquot_quota_off(s, i);
}
if (ms_active_set)
/* Restore the flag back */
@@ -466,6 +474,8 @@ static void reiserfs_put_super(struct super_block *s)
struct reiserfs_transaction_handle th;
th.t_trans_id = 0;
+ dquot_disable(s, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
+
reiserfs_write_lock(s);
if (s->s_dirt)
@@ -620,7 +630,7 @@ static int reiserfs_acquire_dquot(struct dquot *);
static int reiserfs_release_dquot(struct dquot *);
static int reiserfs_mark_dquot_dirty(struct dquot *);
static int reiserfs_write_info(struct super_block *, int);
-static int reiserfs_quota_on(struct super_block *, int, int, char *, int);
+static int reiserfs_quota_on(struct super_block *, int, int, char *);
static const struct dquot_operations reiserfs_quota_operations = {
.write_dquot = reiserfs_write_dquot,
@@ -634,12 +644,12 @@ static const struct dquot_operations reiserfs_quota_operations = {
static const struct quotactl_ops reiserfs_qctl_operations = {
.quota_on = reiserfs_quota_on,
- .quota_off = vfs_quota_off,
- .quota_sync = vfs_quota_sync,
- .get_info = vfs_get_dqinfo,
- .set_info = vfs_set_dqinfo,
- .get_dqblk = vfs_get_dqblk,
- .set_dqblk = vfs_set_dqblk,
+ .quota_off = dquot_quota_off,
+ .quota_sync = dquot_quota_sync,
+ .get_info = dquot_get_dqinfo,
+ .set_info = dquot_set_dqinfo,
+ .get_dqblk = dquot_get_dqblk,
+ .set_dqblk = dquot_set_dqblk,
};
#endif
@@ -1242,6 +1252,11 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
if (s->s_flags & MS_RDONLY)
/* it is read-only already */
goto out_ok;
+
+ err = dquot_suspend(s, -1);
+ if (err < 0)
+ goto out_err;
+
/* try to remount file system with read-only permissions */
if (sb_umount_state(rs) == REISERFS_VALID_FS
|| REISERFS_SB(s)->s_mount_state != REISERFS_VALID_FS) {
@@ -1295,6 +1310,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
s->s_dirt = 0;
if (!(*mount_flags & MS_RDONLY)) {
+ dquot_resume(s, -1);
finish_unfinished(s);
reiserfs_xattr_init(s, *mount_flags);
}
@@ -2022,15 +2038,15 @@ static int reiserfs_write_info(struct super_block *sb, int type)
*/
static int reiserfs_quota_on_mount(struct super_block *sb, int type)
{
- return vfs_quota_on_mount(sb, REISERFS_SB(sb)->s_qf_names[type],
- REISERFS_SB(sb)->s_jquota_fmt, type);
+ return dquot_quota_on_mount(sb, REISERFS_SB(sb)->s_qf_names[type],
+ REISERFS_SB(sb)->s_jquota_fmt, type);
}
/*
* Standard function to be called on quota_on
*/
static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
- char *name, int remount)
+ char *name)
{
int err;
struct path path;
@@ -2039,9 +2055,7 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
if (!(REISERFS_SB(sb)->s_mount_opt & (1 << REISERFS_QUOTA)))
return -EINVAL;
- /* No more checks needed? Path and format_id are bogus anyway... */
- if (remount)
- return vfs_quota_on(sb, type, format_id, name, 1);
+
err = kern_path(name, LOOKUP_FOLLOW, &path);
if (err)
return err;
@@ -2085,7 +2099,7 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
if (err)
goto out;
}
- err = vfs_quota_on_path(sb, type, format_id, &path);
+ err = dquot_quota_on_path(sb, type, format_id, &path);
out:
path_put(&path);
return err;
diff --git a/fs/smbfs/dir.c b/fs/smbfs/dir.c
index 6c978428892d..00a70cab1f36 100644
--- a/fs/smbfs/dir.c
+++ b/fs/smbfs/dir.c
@@ -37,6 +37,7 @@ static int smb_link(struct dentry *, struct inode *, struct dentry *);
const struct file_operations smb_dir_operations =
{
+ .llseek = generic_file_llseek,
.read = generic_read_dir,
.readdir = smb_readdir,
.unlocked_ioctl = smb_ioctl,
diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c
index 84ecf0e43f91..8e187a0f94bb 100644
--- a/fs/smbfs/file.c
+++ b/fs/smbfs/file.c
@@ -28,8 +28,9 @@
#include "proto.h"
static int
-smb_fsync(struct file *file, struct dentry * dentry, int datasync)
+smb_fsync(struct file *file, int datasync)
{
+ struct dentry *dentry = file->f_path.dentry;
struct smb_sb_info *server = server_from_dentry(dentry);
int result;
diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c
index dfa1d67f8fca..9551cb6f7fe4 100644
--- a/fs/smbfs/inode.c
+++ b/fs/smbfs/inode.c
@@ -714,7 +714,7 @@ smb_notify_change(struct dentry *dentry, struct iattr *attr)
error = server->ops->truncate(inode, attr->ia_size);
if (error)
goto out;
- error = vmtruncate(inode, attr->ia_size);
+ error = simple_setsize(inode, attr->ia_size);
if (error)
goto out;
refresh = 1;
diff --git a/fs/splice.c b/fs/splice.c
index ac22b00d86c3..efdbfece9932 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -354,7 +354,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
break;
error = add_to_page_cache_lru(page, mapping, index,
- mapping_gfp_mask(mapping));
+ GFP_KERNEL);
if (unlikely(error)) {
page_cache_release(page);
if (error == -EEXIST)
@@ -1282,7 +1282,8 @@ static int direct_splice_actor(struct pipe_inode_info *pipe,
{
struct file *file = sd->u.file;
- return do_splice_from(pipe, file, &sd->pos, sd->total_len, sd->flags);
+ return do_splice_from(pipe, file, &file->f_pos, sd->total_len,
+ sd->flags);
}
/**
@@ -1371,8 +1372,7 @@ static long do_splice(struct file *in, loff_t __user *off_in,
if (off_in)
return -ESPIPE;
if (off_out) {
- if (!out->f_op || !out->f_op->llseek ||
- out->f_op->llseek == no_llseek)
+ if (!(out->f_mode & FMODE_PWRITE))
return -EINVAL;
if (copy_from_user(&offset, off_out, sizeof(loff_t)))
return -EFAULT;
@@ -1392,8 +1392,7 @@ static long do_splice(struct file *in, loff_t __user *off_in,
if (off_out)
return -ESPIPE;
if (off_in) {
- if (!in->f_op || !in->f_op->llseek ||
- in->f_op->llseek == no_llseek)
+ if (!(in->f_mode & FMODE_PREAD))
return -EINVAL;
if (copy_from_user(&offset, off_in, sizeof(loff_t)))
return -EFAULT;
diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig
index 25a00d19d686..cc6ce8a84c21 100644
--- a/fs/squashfs/Kconfig
+++ b/fs/squashfs/Kconfig
@@ -26,6 +26,17 @@ config SQUASHFS
If unsure, say N.
+config SQUASHFS_XATTRS
+ bool "Squashfs XATTR support"
+ depends on SQUASHFS
+ default n
+ help
+ Saying Y here includes support for extended attributes (xattrs).
+ Xattrs are name:value pairs associated with inodes by
+ the kernel or by users (see the attr(5) manual page).
+
+ If unsure, say N.
+
config SQUASHFS_EMBEDDED
bool "Additional option for memory-constrained systems"
diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile
index df8a19ef870d..2cee3e9fa452 100644
--- a/fs/squashfs/Makefile
+++ b/fs/squashfs/Makefile
@@ -5,3 +5,5 @@
obj-$(CONFIG_SQUASHFS) += squashfs.o
squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o
squashfs-y += namei.o super.o symlink.o zlib_wrapper.o decompressor.o
+squashfs-$(CONFIG_SQUASHFS_XATTRS) += xattr.o xattr_id.o
+
diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c
index 49daaf669e41..62e63ad25075 100644
--- a/fs/squashfs/inode.c
+++ b/fs/squashfs/inode.c
@@ -40,11 +40,13 @@
#include <linux/fs.h>
#include <linux/vfs.h>
+#include <linux/xattr.h>
#include "squashfs_fs.h"
#include "squashfs_fs_sb.h"
#include "squashfs_fs_i.h"
#include "squashfs.h"
+#include "xattr.h"
/*
* Initialise VFS inode with the base inode information common to all
@@ -111,6 +113,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
int err, type, offset = SQUASHFS_INODE_OFFSET(ino);
union squashfs_inode squashfs_ino;
struct squashfs_base_inode *sqshb_ino = &squashfs_ino.base;
+ int xattr_id = SQUASHFS_INVALID_XATTR;
TRACE("Entered squashfs_read_inode\n");
@@ -199,8 +202,10 @@ int squashfs_read_inode(struct inode *inode, long long ino)
frag_offset = 0;
}
+ xattr_id = le32_to_cpu(sqsh_ino->xattr);
inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
inode->i_size = le64_to_cpu(sqsh_ino->file_size);
+ inode->i_op = &squashfs_inode_ops;
inode->i_fop = &generic_ro_fops;
inode->i_mode |= S_IFREG;
inode->i_blocks = ((inode->i_size -
@@ -251,6 +256,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
if (err < 0)
goto failed_read;
+ xattr_id = le32_to_cpu(sqsh_ino->xattr);
inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
inode->i_size = le32_to_cpu(sqsh_ino->file_size);
inode->i_op = &squashfs_dir_inode_ops;
@@ -280,21 +286,33 @@ int squashfs_read_inode(struct inode *inode, long long ino)
inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
inode->i_size = le32_to_cpu(sqsh_ino->symlink_size);
- inode->i_op = &page_symlink_inode_operations;
+ inode->i_op = &squashfs_symlink_inode_ops;
inode->i_data.a_ops = &squashfs_symlink_aops;
inode->i_mode |= S_IFLNK;
squashfs_i(inode)->start = block;
squashfs_i(inode)->offset = offset;
+ if (type == SQUASHFS_LSYMLINK_TYPE) {
+ __le32 xattr;
+
+ err = squashfs_read_metadata(sb, NULL, &block,
+ &offset, inode->i_size);
+ if (err < 0)
+ goto failed_read;
+ err = squashfs_read_metadata(sb, &xattr, &block,
+ &offset, sizeof(xattr));
+ if (err < 0)
+ goto failed_read;
+ xattr_id = le32_to_cpu(xattr);
+ }
+
TRACE("Symbolic link inode %x:%x, start_block %llx, offset "
"%x\n", SQUASHFS_INODE_BLK(ino), offset,
block, offset);
break;
}
case SQUASHFS_BLKDEV_TYPE:
- case SQUASHFS_CHRDEV_TYPE:
- case SQUASHFS_LBLKDEV_TYPE:
- case SQUASHFS_LCHRDEV_TYPE: {
+ case SQUASHFS_CHRDEV_TYPE: {
struct squashfs_dev_inode *sqsh_ino = &squashfs_ino.dev;
unsigned int rdev;
@@ -315,10 +333,32 @@ int squashfs_read_inode(struct inode *inode, long long ino)
SQUASHFS_INODE_BLK(ino), offset, rdev);
break;
}
+ case SQUASHFS_LBLKDEV_TYPE:
+ case SQUASHFS_LCHRDEV_TYPE: {
+ struct squashfs_ldev_inode *sqsh_ino = &squashfs_ino.ldev;
+ unsigned int rdev;
+
+ err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
+ sizeof(*sqsh_ino));
+ if (err < 0)
+ goto failed_read;
+
+ if (type == SQUASHFS_LCHRDEV_TYPE)
+ inode->i_mode |= S_IFCHR;
+ else
+ inode->i_mode |= S_IFBLK;
+ xattr_id = le32_to_cpu(sqsh_ino->xattr);
+ inode->i_op = &squashfs_inode_ops;
+ inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
+ rdev = le32_to_cpu(sqsh_ino->rdev);
+ init_special_inode(inode, inode->i_mode, new_decode_dev(rdev));
+
+ TRACE("Device inode %x:%x, rdev %x\n",
+ SQUASHFS_INODE_BLK(ino), offset, rdev);
+ break;
+ }
case SQUASHFS_FIFO_TYPE:
- case SQUASHFS_SOCKET_TYPE:
- case SQUASHFS_LFIFO_TYPE:
- case SQUASHFS_LSOCKET_TYPE: {
+ case SQUASHFS_SOCKET_TYPE: {
struct squashfs_ipc_inode *sqsh_ino = &squashfs_ino.ipc;
err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
@@ -334,14 +374,52 @@ int squashfs_read_inode(struct inode *inode, long long ino)
init_special_inode(inode, inode->i_mode, 0);
break;
}
+ case SQUASHFS_LFIFO_TYPE:
+ case SQUASHFS_LSOCKET_TYPE: {
+ struct squashfs_lipc_inode *sqsh_ino = &squashfs_ino.lipc;
+
+ err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
+ sizeof(*sqsh_ino));
+ if (err < 0)
+ goto failed_read;
+
+ if (type == SQUASHFS_LFIFO_TYPE)
+ inode->i_mode |= S_IFIFO;
+ else
+ inode->i_mode |= S_IFSOCK;
+ xattr_id = le32_to_cpu(sqsh_ino->xattr);
+ inode->i_op = &squashfs_inode_ops;
+ inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
+ init_special_inode(inode, inode->i_mode, 0);
+ break;
+ }
default:
ERROR("Unknown inode type %d in squashfs_iget!\n", type);
return -EINVAL;
}
+ if (xattr_id != SQUASHFS_INVALID_XATTR && msblk->xattr_id_table) {
+ err = squashfs_xattr_lookup(sb, xattr_id,
+ &squashfs_i(inode)->xattr_count,
+ &squashfs_i(inode)->xattr_size,
+ &squashfs_i(inode)->xattr);
+ if (err < 0)
+ goto failed_read;
+ inode->i_blocks += ((squashfs_i(inode)->xattr_size - 1) >> 9)
+ + 1;
+ } else
+ squashfs_i(inode)->xattr_count = 0;
+
return 0;
failed_read:
ERROR("Unable to read inode 0x%llx\n", ino);
return err;
}
+
+
+const struct inode_operations squashfs_inode_ops = {
+ .getxattr = generic_getxattr,
+ .listxattr = squashfs_listxattr
+};
+
diff --git a/fs/squashfs/namei.c b/fs/squashfs/namei.c
index 5266bd8ad932..7a9464d08cf6 100644
--- a/fs/squashfs/namei.c
+++ b/fs/squashfs/namei.c
@@ -57,11 +57,13 @@
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/dcache.h>
+#include <linux/xattr.h>
#include "squashfs_fs.h"
#include "squashfs_fs_sb.h"
#include "squashfs_fs_i.h"
#include "squashfs.h"
+#include "xattr.h"
/*
* Lookup name in the directory index, returning the location of the metadata
@@ -237,5 +239,7 @@ failed:
const struct inode_operations squashfs_dir_inode_ops = {
- .lookup = squashfs_lookup
+ .lookup = squashfs_lookup,
+ .getxattr = generic_getxattr,
+ .listxattr = squashfs_listxattr
};
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
index fe2587af5512..733a17c42945 100644
--- a/fs/squashfs/squashfs.h
+++ b/fs/squashfs/squashfs.h
@@ -73,8 +73,11 @@ extern struct inode *squashfs_iget(struct super_block *, long long,
unsigned int);
extern int squashfs_read_inode(struct inode *, long long);
+/* xattr.c */
+extern ssize_t squashfs_listxattr(struct dentry *, char *, size_t);
+
/*
- * Inodes, files and decompressor operations
+ * Inodes, files, decompressor and xattr operations
*/
/* dir.c */
@@ -86,11 +89,18 @@ extern const struct export_operations squashfs_export_ops;
/* file.c */
extern const struct address_space_operations squashfs_aops;
+/* inode.c */
+extern const struct inode_operations squashfs_inode_ops;
+
/* namei.c */
extern const struct inode_operations squashfs_dir_inode_ops;
/* symlink.c */
extern const struct address_space_operations squashfs_symlink_aops;
+extern const struct inode_operations squashfs_symlink_inode_ops;
+
+/* xattr.c */
+extern const struct xattr_handler *squashfs_xattr_handlers[];
/* zlib_wrapper.c */
extern const struct squashfs_decompressor squashfs_zlib_comp_ops;
diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
index 79024245ea00..8eabb808b78d 100644
--- a/fs/squashfs/squashfs_fs.h
+++ b/fs/squashfs/squashfs_fs.h
@@ -46,6 +46,7 @@
#define SQUASHFS_NAME_LEN 256
#define SQUASHFS_INVALID_FRAG (0xffffffffU)
+#define SQUASHFS_INVALID_XATTR (0xffffffffU)
#define SQUASHFS_INVALID_BLK (-1LL)
/* Filesystem flags */
@@ -96,6 +97,13 @@
#define SQUASHFS_LFIFO_TYPE 13
#define SQUASHFS_LSOCKET_TYPE 14
+/* Xattr types */
+#define SQUASHFS_XATTR_USER 0
+#define SQUASHFS_XATTR_TRUSTED 1
+#define SQUASHFS_XATTR_SECURITY 2
+#define SQUASHFS_XATTR_VALUE_OOL 256
+#define SQUASHFS_XATTR_PREFIX_MASK 0xff
+
/* Flag whether block is compressed or uncompressed, bit is set if block is
* uncompressed */
#define SQUASHFS_COMPRESSED_BIT (1 << 15)
@@ -174,6 +182,24 @@
#define SQUASHFS_ID_BLOCK_BYTES(A) (SQUASHFS_ID_BLOCKS(A) *\
sizeof(u64))
+/* xattr id lookup table defines */
+#define SQUASHFS_XATTR_BYTES(A) ((A) * sizeof(struct squashfs_xattr_id))
+
+#define SQUASHFS_XATTR_BLOCK(A) (SQUASHFS_XATTR_BYTES(A) / \
+ SQUASHFS_METADATA_SIZE)
+
+#define SQUASHFS_XATTR_BLOCK_OFFSET(A) (SQUASHFS_XATTR_BYTES(A) % \
+ SQUASHFS_METADATA_SIZE)
+
+#define SQUASHFS_XATTR_BLOCKS(A) ((SQUASHFS_XATTR_BYTES(A) + \
+ SQUASHFS_METADATA_SIZE - 1) / \
+ SQUASHFS_METADATA_SIZE)
+
+#define SQUASHFS_XATTR_BLOCK_BYTES(A) (SQUASHFS_XATTR_BLOCKS(A) *\
+ sizeof(u64))
+#define SQUASHFS_XATTR_BLK(A) ((unsigned int) ((A) >> 16))
+
+#define SQUASHFS_XATTR_OFFSET(A) ((unsigned int) ((A) & 0xffff))
/* cached data constants for filesystem */
#define SQUASHFS_CACHED_BLKS 8
@@ -228,7 +254,7 @@ struct squashfs_super_block {
__le64 root_inode;
__le64 bytes_used;
__le64 id_table_start;
- __le64 xattr_table_start;
+ __le64 xattr_id_table_start;
__le64 inode_table_start;
__le64 directory_table_start;
__le64 fragment_table_start;
@@ -261,6 +287,17 @@ struct squashfs_ipc_inode {
__le32 nlink;
};
+struct squashfs_lipc_inode {
+ __le16 inode_type;
+ __le16 mode;
+ __le16 uid;
+ __le16 guid;
+ __le32 mtime;
+ __le32 inode_number;
+ __le32 nlink;
+ __le32 xattr;
+};
+
struct squashfs_dev_inode {
__le16 inode_type;
__le16 mode;
@@ -272,6 +309,18 @@ struct squashfs_dev_inode {
__le32 rdev;
};
+struct squashfs_ldev_inode {
+ __le16 inode_type;
+ __le16 mode;
+ __le16 uid;
+ __le16 guid;
+ __le32 mtime;
+ __le32 inode_number;
+ __le32 nlink;
+ __le32 rdev;
+ __le32 xattr;
+};
+
struct squashfs_symlink_inode {
__le16 inode_type;
__le16 mode;
@@ -349,12 +398,14 @@ struct squashfs_ldir_inode {
union squashfs_inode {
struct squashfs_base_inode base;
struct squashfs_dev_inode dev;
+ struct squashfs_ldev_inode ldev;
struct squashfs_symlink_inode symlink;
struct squashfs_reg_inode reg;
struct squashfs_lreg_inode lreg;
struct squashfs_dir_inode dir;
struct squashfs_ldir_inode ldir;
struct squashfs_ipc_inode ipc;
+ struct squashfs_lipc_inode lipc;
};
struct squashfs_dir_entry {
@@ -377,4 +428,27 @@ struct squashfs_fragment_entry {
unsigned int unused;
};
+struct squashfs_xattr_entry {
+ __le16 type;
+ __le16 size;
+ char data[0];
+};
+
+struct squashfs_xattr_val {
+ __le32 vsize;
+ char value[0];
+};
+
+struct squashfs_xattr_id {
+ __le64 xattr;
+ __le32 count;
+ __le32 size;
+};
+
+struct squashfs_xattr_id_table {
+ __le64 xattr_table_start;
+ __le32 xattr_ids;
+ __le32 unused;
+};
+
#endif
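
As a quick illustration of the reference encoding behind SQUASHFS_XATTR_BLK() and SQUASHFS_XATTR_OFFSET() (a user-space sketch, assuming only the packing visible in the macros above; the sample value is hypothetical):

#include <stdio.h>
#include <stdint.h>

#define SQUASHFS_XATTR_BLK(A)		((unsigned int) ((A) >> 16))
#define SQUASHFS_XATTR_OFFSET(A)	((unsigned int) ((A) & 0xffff))

int main(void)
{
	/* a 64-bit reference: metadata block start in the upper 48 bits,
	 * offset within the uncompressed block in the lower 16 bits */
	uint64_t ref = ((uint64_t)0x1234 << 16) | 0x42;

	printf("block %#x, offset %#x\n",
	       SQUASHFS_XATTR_BLK(ref), SQUASHFS_XATTR_OFFSET(ref));
	return 0;
}
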
diff --git a/fs/squashfs/squashfs_fs_i.h b/fs/squashfs/squashfs_fs_i.h
index fbfca30c0c68..d3e3a37f28a1 100644
--- a/fs/squashfs/squashfs_fs_i.h
+++ b/fs/squashfs/squashfs_fs_i.h
@@ -26,6 +26,9 @@
struct squashfs_inode_info {
u64 start;
int offset;
+ u64 xattr;
+ unsigned int xattr_size;
+ int xattr_count;
union {
struct {
u64 fragment_block;
diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h
index 2e77dc547e25..d9037a5215f0 100644
--- a/fs/squashfs/squashfs_fs_sb.h
+++ b/fs/squashfs/squashfs_fs_sb.h
@@ -61,6 +61,7 @@ struct squashfs_sb_info {
int next_meta_index;
__le64 *id_table;
__le64 *fragment_index;
+ __le64 *xattr_id_table;
struct mutex read_data_mutex;
struct mutex meta_index_mutex;
struct meta_index *meta_index;
@@ -68,9 +69,11 @@ struct squashfs_sb_info {
__le64 *inode_lookup_table;
u64 inode_table;
u64 directory_table;
+ u64 xattr_table;
unsigned int block_size;
unsigned short block_log;
long long bytes_used;
unsigned int inodes;
+ int xattr_ids;
};
#endif
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 48b6f4a385a6..88b4f8606652 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -36,12 +36,14 @@
#include <linux/init.h>
#include <linux/module.h>
#include <linux/magic.h>
+#include <linux/xattr.h>
#include "squashfs_fs.h"
#include "squashfs_fs_sb.h"
#include "squashfs_fs_i.h"
#include "squashfs.h"
#include "decompressor.h"
+#include "xattr.h"
static struct file_system_type squashfs_fs_type;
static const struct super_operations squashfs_super_ops;
@@ -82,7 +84,7 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
long long root_inode;
unsigned short flags;
unsigned int fragments;
- u64 lookup_table_start;
+ u64 lookup_table_start, xattr_id_table_start;
int err;
TRACE("Entered squashfs_fill_superblock\n");
@@ -139,13 +141,6 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
if (msblk->decompressor == NULL)
goto failed_mount;
- /*
- * Check if there's xattrs in the filesystem. These are not
- * supported in this version, so warn that they will be ignored.
- */
- if (le64_to_cpu(sblk->xattr_table_start) != SQUASHFS_INVALID_BLK)
- ERROR("Xattrs in filesystem, these will be ignored\n");
-
/* Check the filesystem does not extend beyond the end of the
block device */
msblk->bytes_used = le64_to_cpu(sblk->bytes_used);
@@ -253,7 +248,7 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
allocate_lookup_table:
lookup_table_start = le64_to_cpu(sblk->lookup_table_start);
if (lookup_table_start == SQUASHFS_INVALID_BLK)
- goto allocate_root;
+ goto allocate_xattr_table;
/* Allocate and read inode lookup table */
msblk->inode_lookup_table = squashfs_read_inode_lookup_table(sb,
@@ -266,6 +261,21 @@ allocate_lookup_table:
sb->s_export_op = &squashfs_export_ops;
+allocate_xattr_table:
+ sb->s_xattr = squashfs_xattr_handlers;
+ xattr_id_table_start = le64_to_cpu(sblk->xattr_id_table_start);
+ if (xattr_id_table_start == SQUASHFS_INVALID_BLK)
+ goto allocate_root;
+
+ /* Allocate and read xattr id lookup table */
+ msblk->xattr_id_table = squashfs_read_xattr_id_table(sb,
+ xattr_id_table_start, &msblk->xattr_table, &msblk->xattr_ids);
+ if (IS_ERR(msblk->xattr_id_table)) {
+ err = PTR_ERR(msblk->xattr_id_table);
+ msblk->xattr_id_table = NULL;
+ if (err != -ENOTSUPP)
+ goto failed_mount;
+ }
allocate_root:
root = new_inode(sb);
if (!root) {
@@ -301,6 +311,7 @@ failed_mount:
kfree(msblk->inode_lookup_table);
kfree(msblk->fragment_index);
kfree(msblk->id_table);
+ kfree(msblk->xattr_id_table);
kfree(sb->s_fs_info);
sb->s_fs_info = NULL;
kfree(sblk);
@@ -355,6 +366,7 @@ static void squashfs_put_super(struct super_block *sb)
kfree(sbi->fragment_index);
kfree(sbi->meta_index);
kfree(sbi->inode_lookup_table);
+ kfree(sbi->xattr_id_table);
kfree(sb->s_fs_info);
sb->s_fs_info = NULL;
}
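
The allocate_xattr_table hunk is an instance of an "optional table" mount pattern: a missing table leaves the feature disabled, -ENOTSUPP (a kernel built without xattr support, see xattr.h below) is tolerated so such images still mount, and any other error is fatal. A kernel-style shape sketch, with read_optional_table() as a hypothetical helper:

	table = read_optional_table(sb);	/* hypothetical helper */
	if (IS_ERR(table)) {
		err = PTR_ERR(table);
		table = NULL;		/* keep the cleanup paths safe */
		if (err != -ENOTSUPP)
			goto failed_mount;	/* a real I/O error is fatal */
	}
	/* a NULL table simply disables the feature at runtime */
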
diff --git a/fs/squashfs/symlink.c b/fs/squashfs/symlink.c
index 32b911f4ee39..ec86434921e1 100644
--- a/fs/squashfs/symlink.c
+++ b/fs/squashfs/symlink.c
@@ -35,11 +35,13 @@
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/pagemap.h>
+#include <linux/xattr.h>
#include "squashfs_fs.h"
#include "squashfs_fs_sb.h"
#include "squashfs_fs_i.h"
#include "squashfs.h"
+#include "xattr.h"
static int squashfs_symlink_readpage(struct file *file, struct page *page)
{
@@ -114,3 +116,12 @@ error_out:
const struct address_space_operations squashfs_symlink_aops = {
.readpage = squashfs_symlink_readpage
};
+
+const struct inode_operations squashfs_symlink_inode_ops = {
+ .readlink = generic_readlink,
+ .follow_link = page_follow_link_light,
+ .put_link = page_put_link,
+ .getxattr = generic_getxattr,
+ .listxattr = squashfs_listxattr
+};
+
diff --git a/fs/squashfs/xattr.c b/fs/squashfs/xattr.c
new file mode 100644
index 000000000000..c7655e8b31cd
--- /dev/null
+++ b/fs/squashfs/xattr.c
@@ -0,0 +1,323 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2010
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * xattr.c
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/xattr.h>
+#include <linux/slab.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "squashfs.h"
+
+static const struct xattr_handler *squashfs_xattr_handler(int);
+
+ssize_t squashfs_listxattr(struct dentry *d, char *buffer,
+ size_t buffer_size)
+{
+ struct inode *inode = d->d_inode;
+ struct super_block *sb = inode->i_sb;
+ struct squashfs_sb_info *msblk = sb->s_fs_info;
+ u64 start = SQUASHFS_XATTR_BLK(squashfs_i(inode)->xattr)
+ + msblk->xattr_table;
+ int offset = SQUASHFS_XATTR_OFFSET(squashfs_i(inode)->xattr);
+ int count = squashfs_i(inode)->xattr_count;
+ size_t rest = buffer_size;
+ int err;
+
+ /* check that the file system has xattrs */
+ if (msblk->xattr_id_table == NULL)
+ return -EOPNOTSUPP;
+
+ /* loop reading each xattr name */
+ while (count--) {
+ struct squashfs_xattr_entry entry;
+ struct squashfs_xattr_val val;
+ const struct xattr_handler *handler;
+ int name_size, prefix_size = 0;
+
+ err = squashfs_read_metadata(sb, &entry, &start, &offset,
+ sizeof(entry));
+ if (err < 0)
+ goto failed;
+
+ name_size = le16_to_cpu(entry.size);
+ handler = squashfs_xattr_handler(le16_to_cpu(entry.type));
+ if (handler)
+ prefix_size = handler->list(d, buffer, rest, NULL,
+ name_size, handler->flags);
+ if (prefix_size) {
+ if (buffer) {
+ if (prefix_size + name_size + 1 > rest) {
+ err = -ERANGE;
+ goto failed;
+ }
+ buffer += prefix_size;
+ }
+ err = squashfs_read_metadata(sb, buffer, &start,
+ &offset, name_size);
+ if (err < 0)
+ goto failed;
+ if (buffer) {
+ buffer[name_size] = '\0';
+ buffer += name_size + 1;
+ }
+ rest -= prefix_size + name_size + 1;
+ } else {
+			/* no handler or insufficient privileges, so skip */
+ err = squashfs_read_metadata(sb, NULL, &start,
+ &offset, name_size);
+ if (err < 0)
+ goto failed;
+ }
+
+
+ /* skip remaining xattr entry */
+ err = squashfs_read_metadata(sb, &val, &start, &offset,
+ sizeof(val));
+ if (err < 0)
+ goto failed;
+
+ err = squashfs_read_metadata(sb, NULL, &start, &offset,
+ le32_to_cpu(val.vsize));
+ if (err < 0)
+ goto failed;
+ }
+ err = buffer_size - rest;
+
+failed:
+ return err;
+}
+
+
+static int squashfs_xattr_get(struct inode *inode, int name_index,
+ const char *name, void *buffer, size_t buffer_size)
+{
+ struct super_block *sb = inode->i_sb;
+ struct squashfs_sb_info *msblk = sb->s_fs_info;
+ u64 start = SQUASHFS_XATTR_BLK(squashfs_i(inode)->xattr)
+ + msblk->xattr_table;
+ int offset = SQUASHFS_XATTR_OFFSET(squashfs_i(inode)->xattr);
+ int count = squashfs_i(inode)->xattr_count;
+ int name_len = strlen(name);
+ int err, vsize;
+ char *target = kmalloc(name_len, GFP_KERNEL);
+
+ if (target == NULL)
+ return -ENOMEM;
+
+ /* loop reading each xattr name */
+ for (; count; count--) {
+ struct squashfs_xattr_entry entry;
+ struct squashfs_xattr_val val;
+ int type, prefix, name_size;
+
+ err = squashfs_read_metadata(sb, &entry, &start, &offset,
+ sizeof(entry));
+ if (err < 0)
+ goto failed;
+
+ name_size = le16_to_cpu(entry.size);
+ type = le16_to_cpu(entry.type);
+ prefix = type & SQUASHFS_XATTR_PREFIX_MASK;
+
+ if (prefix == name_index && name_size == name_len)
+ err = squashfs_read_metadata(sb, target, &start,
+ &offset, name_size);
+ else
+ err = squashfs_read_metadata(sb, NULL, &start,
+ &offset, name_size);
+ if (err < 0)
+ goto failed;
+
+ if (prefix == name_index && name_size == name_len &&
+ strncmp(target, name, name_size) == 0) {
+ /* found xattr */
+ if (type & SQUASHFS_XATTR_VALUE_OOL) {
+ __le64 xattr;
+ /* val is a reference to the real location */
+ err = squashfs_read_metadata(sb, &val, &start,
+ &offset, sizeof(val));
+ if (err < 0)
+ goto failed;
+ err = squashfs_read_metadata(sb, &xattr, &start,
+ &offset, sizeof(xattr));
+ if (err < 0)
+ goto failed;
+ xattr = le64_to_cpu(xattr);
+ start = SQUASHFS_XATTR_BLK(xattr) +
+ msblk->xattr_table;
+ offset = SQUASHFS_XATTR_OFFSET(xattr);
+ }
+ /* read xattr value */
+ err = squashfs_read_metadata(sb, &val, &start, &offset,
+ sizeof(val));
+ if (err < 0)
+ goto failed;
+
+ vsize = le32_to_cpu(val.vsize);
+ if (buffer) {
+ if (vsize > buffer_size) {
+ err = -ERANGE;
+ goto failed;
+ }
+ err = squashfs_read_metadata(sb, buffer, &start,
+ &offset, vsize);
+ if (err < 0)
+ goto failed;
+ }
+ break;
+ }
+
+ /* no match, skip remaining xattr entry */
+ err = squashfs_read_metadata(sb, &val, &start, &offset,
+ sizeof(val));
+ if (err < 0)
+ goto failed;
+ err = squashfs_read_metadata(sb, NULL, &start, &offset,
+ le32_to_cpu(val.vsize));
+ if (err < 0)
+ goto failed;
+ }
+ err = count ? vsize : -ENODATA;
+
+failed:
+ kfree(target);
+ return err;
+}
+
+
+/*
+ * User namespace support
+ */
+static size_t squashfs_user_list(struct dentry *d, char *list, size_t list_size,
+ const char *name, size_t name_len, int type)
+{
+ if (list && XATTR_USER_PREFIX_LEN <= list_size)
+ memcpy(list, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
+ return XATTR_USER_PREFIX_LEN;
+}
+
+static int squashfs_user_get(struct dentry *d, const char *name, void *buffer,
+ size_t size, int type)
+{
+ if (name[0] == '\0')
+ return -EINVAL;
+
+ return squashfs_xattr_get(d->d_inode, SQUASHFS_XATTR_USER, name,
+ buffer, size);
+}
+
+static const struct xattr_handler squashfs_xattr_user_handler = {
+ .prefix = XATTR_USER_PREFIX,
+ .list = squashfs_user_list,
+ .get = squashfs_user_get
+};
+
+/*
+ * Trusted namespace support
+ */
+static size_t squashfs_trusted_list(struct dentry *d, char *list,
+ size_t list_size, const char *name, size_t name_len, int type)
+{
+ if (!capable(CAP_SYS_ADMIN))
+ return 0;
+
+ if (list && XATTR_TRUSTED_PREFIX_LEN <= list_size)
+ memcpy(list, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN);
+ return XATTR_TRUSTED_PREFIX_LEN;
+}
+
+static int squashfs_trusted_get(struct dentry *d, const char *name,
+ void *buffer, size_t size, int type)
+{
+ if (name[0] == '\0')
+ return -EINVAL;
+
+ return squashfs_xattr_get(d->d_inode, SQUASHFS_XATTR_TRUSTED, name,
+ buffer, size);
+}
+
+static const struct xattr_handler squashfs_xattr_trusted_handler = {
+ .prefix = XATTR_TRUSTED_PREFIX,
+ .list = squashfs_trusted_list,
+ .get = squashfs_trusted_get
+};
+
+/*
+ * Security namespace support
+ */
+static size_t squashfs_security_list(struct dentry *d, char *list,
+ size_t list_size, const char *name, size_t name_len, int type)
+{
+ if (list && XATTR_SECURITY_PREFIX_LEN <= list_size)
+ memcpy(list, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN);
+ return XATTR_SECURITY_PREFIX_LEN;
+}
+
+static int squashfs_security_get(struct dentry *d, const char *name,
+ void *buffer, size_t size, int type)
+{
+ if (name[0] == '\0')
+ return -EINVAL;
+
+ return squashfs_xattr_get(d->d_inode, SQUASHFS_XATTR_SECURITY, name,
+ buffer, size);
+}
+
+static const struct xattr_handler squashfs_xattr_security_handler = {
+ .prefix = XATTR_SECURITY_PREFIX,
+ .list = squashfs_security_list,
+ .get = squashfs_security_get
+};
+
+static inline const struct xattr_handler *squashfs_xattr_handler(int type)
+{
+ if (type & ~(SQUASHFS_XATTR_PREFIX_MASK | SQUASHFS_XATTR_VALUE_OOL))
+ /* ignore unrecognised type */
+ return NULL;
+
+ switch (type & SQUASHFS_XATTR_PREFIX_MASK) {
+ case SQUASHFS_XATTR_USER:
+ return &squashfs_xattr_user_handler;
+ case SQUASHFS_XATTR_TRUSTED:
+ return &squashfs_xattr_trusted_handler;
+ case SQUASHFS_XATTR_SECURITY:
+ return &squashfs_xattr_security_handler;
+ default:
+ /* ignore unrecognised type */
+ return NULL;
+ }
+}
+
+const struct xattr_handler *squashfs_xattr_handlers[] = {
+ &squashfs_xattr_user_handler,
+ &squashfs_xattr_trusted_handler,
+ &squashfs_xattr_security_handler,
+ NULL
+};
+
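
For context, user space consumes this interface with the usual two-call pattern, which is what the buffer == NULL path in squashfs_listxattr() serves. A minimal sketch (error handling trimmed):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/xattr.h>

int main(int argc, char **argv)
{
	ssize_t len;
	char *names, *p;

	if (argc != 2)
		return 1;

	/* size query: a NULL buffer returns the space required */
	len = llistxattr(argv[1], NULL, 0);
	if (len <= 0)
		return 1;

	names = malloc(len);
	if (names == NULL)
		return 1;
	len = llistxattr(argv[1], names, len);
	if (len < 0)
		return 1;

	/* the buffer holds NUL-terminated, prefixed names, e.g.
	 * "user.foo\0security.selinux\0" */
	for (p = names; p < names + len; p += strlen(p) + 1)
		printf("%s\n", p);
	free(names);
	return 0;
}
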
diff --git a/fs/squashfs/xattr.h b/fs/squashfs/xattr.h
new file mode 100644
index 000000000000..9da071ae181c
--- /dev/null
+++ b/fs/squashfs/xattr.h
@@ -0,0 +1,46 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2010
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * xattr.h
+ */
+
+#ifdef CONFIG_SQUASHFS_XATTRS
+extern __le64 *squashfs_read_xattr_id_table(struct super_block *, u64,
+ u64 *, int *);
+extern int squashfs_xattr_lookup(struct super_block *, unsigned int, int *,
+ int *, unsigned long long *);
+#else
+static inline __le64 *squashfs_read_xattr_id_table(struct super_block *sb,
+ u64 start, u64 *xattr_table_start, int *xattr_ids)
+{
+ ERROR("Xattrs in filesystem, these will be ignored\n");
+ return ERR_PTR(-ENOTSUPP);
+}
+
+static inline int squashfs_xattr_lookup(struct super_block *sb,
+ unsigned int index, int *count, int *size,
+ unsigned long long *xattr)
+{
+ return 0;
+}
+#define squashfs_listxattr NULL
+#define generic_getxattr NULL
+#define squashfs_xattr_handlers NULL
+#endif
diff --git a/fs/squashfs/xattr_id.c b/fs/squashfs/xattr_id.c
new file mode 100644
index 000000000000..cfb41106098f
--- /dev/null
+++ b/fs/squashfs/xattr_id.c
@@ -0,0 +1,100 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2010
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * xattr_id.c
+ */
+
+/*
+ * This file implements code to map the 32-bit xattr id stored in the inode
+ * into the on disk location of the xattr data.
+ */
+
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/slab.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "squashfs.h"
+
+/*
+ * Map xattr id using the xattr id look up table
+ */
+int squashfs_xattr_lookup(struct super_block *sb, unsigned int index,
+ int *count, unsigned int *size, unsigned long long *xattr)
+{
+ struct squashfs_sb_info *msblk = sb->s_fs_info;
+ int block = SQUASHFS_XATTR_BLOCK(index);
+ int offset = SQUASHFS_XATTR_BLOCK_OFFSET(index);
+ u64 start_block = le64_to_cpu(msblk->xattr_id_table[block]);
+ struct squashfs_xattr_id id;
+ int err;
+
+ err = squashfs_read_metadata(sb, &id, &start_block, &offset,
+ sizeof(id));
+ if (err < 0)
+ return err;
+
+ *xattr = le64_to_cpu(id.xattr);
+ *size = le32_to_cpu(id.size);
+ *count = le32_to_cpu(id.count);
+ return 0;
+}
+
+
+/*
+ * Read uncompressed xattr id lookup table indexes from disk into memory
+ */
+__le64 *squashfs_read_xattr_id_table(struct super_block *sb, u64 start,
+ u64 *xattr_table_start, int *xattr_ids)
+{
+ unsigned int len;
+ __le64 *xid_table;
+ struct squashfs_xattr_id_table id_table;
+ int err;
+
+ err = squashfs_read_table(sb, &id_table, start, sizeof(id_table));
+ if (err < 0) {
+ ERROR("unable to read xattr id table\n");
+ return ERR_PTR(err);
+ }
+ *xattr_table_start = le64_to_cpu(id_table.xattr_table_start);
+ *xattr_ids = le32_to_cpu(id_table.xattr_ids);
+ len = SQUASHFS_XATTR_BLOCK_BYTES(*xattr_ids);
+
+ TRACE("In read_xattr_index_table, length %d\n", len);
+
+ /* Allocate xattr id lookup table indexes */
+ xid_table = kmalloc(len, GFP_KERNEL);
+ if (xid_table == NULL) {
+ ERROR("Failed to allocate xattr id index table\n");
+ return ERR_PTR(-ENOMEM);
+ }
+
+ err = squashfs_read_table(sb, xid_table, start + sizeof(id_table), len);
+ if (err < 0) {
+ ERROR("unable to read xattr id index table\n");
+ kfree(xid_table);
+ return ERR_PTR(err);
+ }
+
+ return xid_table;
+}
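
Each struct squashfs_xattr_id is 16 bytes, so the macro arithmetic used by squashfs_xattr_lookup() maps an id to a table block and offset as below (a standalone sketch; SQUASHFS_METADATA_SIZE is 8192 elsewhere in squashfs_fs.h):

#include <stdio.h>

#define METADATA_SIZE	8192	/* SQUASHFS_METADATA_SIZE */
#define ID_SIZE		16	/* sizeof(struct squashfs_xattr_id) */

int main(void)
{
	unsigned int index = 1000;	/* hypothetical xattr id */
	unsigned int bytes = index * ID_SIZE;

	/* mirrors SQUASHFS_XATTR_BLOCK() / SQUASHFS_XATTR_BLOCK_OFFSET() */
	printf("id %u -> table block %u, offset %u\n",
	       index, bytes / METADATA_SIZE, bytes % METADATA_SIZE);
	return 0;
}
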
diff --git a/fs/super.c b/fs/super.c
index 69688b15f1fa..938119ab8dcb 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -24,7 +24,6 @@
#include <linux/slab.h>
#include <linux/acct.h>
#include <linux/blkdev.h>
-#include <linux/quotaops.h>
#include <linux/mount.h>
#include <linux/security.h>
#include <linux/writeback.h> /* for the emergency remount stuff */
@@ -94,8 +93,6 @@ static struct super_block *alloc_super(struct file_system_type *type)
init_rwsem(&s->s_dquot.dqptr_sem);
init_waitqueue_head(&s->s_wait_unfrozen);
s->s_maxbytes = MAX_NON_LFS;
- s->dq_op = sb_dquot_ops;
- s->s_qcop = sb_quotactl_ops;
s->s_op = &default_op;
s->s_time_gran = 1000000000;
}
@@ -160,7 +157,6 @@ void deactivate_locked_super(struct super_block *s)
{
struct file_system_type *fs = s->s_type;
if (atomic_dec_and_test(&s->s_active)) {
- vfs_dq_off(s, 0);
fs->kill_sb(s);
put_filesystem(fs);
put_super(s);
@@ -378,6 +374,8 @@ void sync_supers(void)
up_read(&sb->s_umount);
spin_lock(&sb_lock);
+ /* lock was dropped, must reset next */
+ list_safe_reset_next(sb, n, s_list);
__put_super(sb);
}
}
@@ -409,6 +407,8 @@ void iterate_supers(void (*f)(struct super_block *, void *), void *arg)
up_read(&sb->s_umount);
spin_lock(&sb_lock);
+ /* lock was dropped, must reset next */
+ list_safe_reset_next(sb, n, s_list);
__put_super(sb);
}
spin_unlock(&sb_lock);
@@ -524,7 +524,7 @@ rescan:
int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
{
int retval;
- int remount_rw, remount_ro;
+ int remount_ro;
if (sb->s_frozen != SB_UNFROZEN)
return -EBUSY;
@@ -540,7 +540,6 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
sync_filesystem(sb);
remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY);
- remount_rw = !(flags & MS_RDONLY) && (sb->s_flags & MS_RDONLY);
/* If we are remounting RDONLY and current sb is read/write,
make sure there are no rw files opened */
@@ -549,9 +548,6 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
mark_files_ro(sb);
else if (!fs_may_remount_ro(sb))
return -EBUSY;
- retval = vfs_dq_off(sb, 1);
- if (retval < 0 && retval != -ENOSYS)
- return -EBUSY;
}
if (sb->s_op->remount_fs) {
@@ -560,8 +556,7 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
return retval;
}
sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK);
- if (remount_rw)
- vfs_dq_quota_on_remount(sb);
+
/*
* Some filesystems modify their metadata via some other path than the
* bdev buffer cache (eg. use a private mapping, or directories in
@@ -594,6 +589,8 @@ static void do_emergency_remount(struct work_struct *work)
}
up_write(&sb->s_umount);
spin_lock(&sb_lock);
+ /* lock was dropped, must reset next */
+ list_safe_reset_next(sb, n, s_list);
__put_super(sb);
}
spin_unlock(&sb_lock);
@@ -946,8 +943,8 @@ out:
EXPORT_SYMBOL_GPL(vfs_kern_mount);
/**
- * freeze_super -- lock the filesystem and force it into a consistent state
- * @super: the super to lock
+ * freeze_super - lock the filesystem and force it into a consistent state
+ * @sb: the super to lock
*
* Syncs the super to make sure the filesystem is consistent and calls the fs's
* freeze_fs. Subsequent calls to this without first thawing the fs will return
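
All three list_safe_reset_next() additions guard the same idiom: a _safe list walk caches the next element, and that cache goes stale once sb_lock is dropped. A condensed sketch of the loop shape these hunks protect (mirroring iterate_supers()):

	list_for_each_entry_safe(sb, n, &super_blocks, s_list) {
		sb->s_count++;
		spin_unlock(&sb_lock);

		/* ... work on sb without sb_lock held ... */

		spin_lock(&sb_lock);
		/* the cached next pointer may now be stale; recompute
		 * it from the current element before continuing */
		list_safe_reset_next(sb, n, s_list);
		__put_super(sb);
	}
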
diff --git a/fs/sync.c b/fs/sync.c
index e8cbd415e50a..15aa6f03b2da 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -42,7 +42,7 @@ static int __sync_filesystem(struct super_block *sb, int wait)
if (wait)
sync_inodes_sb(sb);
else
- writeback_inodes_sb_locked(sb);
+ writeback_inodes_sb(sb);
if (sb->s_op->sync_fs)
sb->s_op->sync_fs(sb, wait);
@@ -130,12 +130,10 @@ void emergency_sync(void)
/*
* Generic function to fsync a file.
- *
- * filp may be NULL if called via the msync of a vma.
*/
-int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
+int file_fsync(struct file *filp, int datasync)
{
- struct inode * inode = dentry->d_inode;
+ struct inode *inode = filp->f_mapping->host;
struct super_block * sb;
int ret, err;
@@ -183,7 +181,7 @@ int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync)
* livelocks in fsync_buffers_list().
*/
mutex_lock(&mapping->host->i_mutex);
- err = file->f_op->fsync(file, file->f_path.dentry, datasync);
+ err = file->f_op->fsync(file, datasync);
if (!ret)
ret = err;
mutex_unlock(&mapping->host->i_mutex);
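
The ->fsync conversions throughout this series follow the file_fsync() change above: the dentry parameter is dropped, so implementations derive the inode from the file's mapping. A sketch of a conforming method (the name and body are hypothetical):

static int example_fsync(struct file *file, int datasync)
{
	/* no dentry argument any more; use the mapping's host inode */
	struct inode *inode = file->f_mapping->host;

	/* write back inode state here; when datasync is set,
	 * non-essential metadata may be skipped */
	return 0;
}
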
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index bbd77e95cf7f..0835a3b70e03 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -117,13 +117,13 @@ int sysfs_setattr(struct dentry *dentry, struct iattr *iattr)
if (error)
goto out;
- iattr->ia_valid &= ~ATTR_SIZE; /* ignore size changes */
-
- error = inode_setattr(inode, iattr);
+ error = sysfs_sd_setattr(sd, iattr);
if (error)
goto out;
- error = sysfs_sd_setattr(sd, iattr);
+ /* this ignores size changes */
+ generic_setattr(inode, iattr);
+
out:
mutex_unlock(&sysfs_mutex);
return error;
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index f71246bebfe4..a7ac78f8e67a 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -28,6 +28,7 @@ static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target,
struct sysfs_dirent *target_sd = NULL;
struct sysfs_dirent *sd = NULL;
struct sysfs_addrm_cxt acxt;
+ enum kobj_ns_type ns_type;
int error;
BUG_ON(!name);
@@ -58,16 +59,29 @@ static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target,
if (!sd)
goto out_put;
- if (sysfs_ns_type(parent_sd))
+ ns_type = sysfs_ns_type(parent_sd);
+ if (ns_type)
sd->s_ns = target->ktype->namespace(target);
sd->s_symlink.target_sd = target_sd;
target_sd = NULL; /* reference is now owned by the symlink */
sysfs_addrm_start(&acxt, parent_sd);
- if (warn)
- error = sysfs_add_one(&acxt, sd);
- else
- error = __sysfs_add_one(&acxt, sd);
+ /* Symlinks must be between directories with the same ns_type */
+ if (!ns_type ||
+ (ns_type == sysfs_ns_type(sd->s_symlink.target_sd->s_parent))) {
+ if (warn)
+ error = sysfs_add_one(&acxt, sd);
+ else
+ error = __sysfs_add_one(&acxt, sd);
+ } else {
+ error = -EINVAL;
+ WARN(1, KERN_WARNING
+ "sysfs: symlink across ns_types %s/%s -> %s/%s\n",
+ parent_sd->s_name,
+ sd->s_name,
+ sd->s_symlink.target_sd->s_parent->s_name,
+ sd->s_symlink.target_sd->s_name);
+ }
sysfs_addrm_finish(&acxt);
if (error)
@@ -122,7 +136,7 @@ void sysfs_delete_link(struct kobject *kobj, struct kobject *targ,
{
const void *ns = NULL;
spin_lock(&sysfs_assoc_lock);
- if (targ->sd)
+ if (targ->sd && sysfs_ns_type(kobj->sd))
ns = targ->sd->s_ns;
spin_unlock(&sysfs_assoc_lock);
sysfs_hash_and_remove(kobj->sd, ns, name);
diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c
index 1dabed286b4c..79941e4964a4 100644
--- a/fs/sysv/dir.c
+++ b/fs/sysv/dir.c
@@ -24,7 +24,7 @@ const struct file_operations sysv_dir_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
.readdir = sysv_readdir,
- .fsync = simple_fsync,
+ .fsync = generic_file_fsync,
};
static inline void dir_put_page(struct page *page)
diff --git a/fs/sysv/file.c b/fs/sysv/file.c
index 96340c01f4a7..750cc22349bd 100644
--- a/fs/sysv/file.c
+++ b/fs/sysv/file.c
@@ -26,7 +26,7 @@ const struct file_operations sysv_file_operations = {
.write = do_sync_write,
.aio_write = generic_file_aio_write,
.mmap = generic_file_mmap,
- .fsync = simple_fsync,
+ .fsync = generic_file_fsync,
.splice_read = generic_file_splice_read,
};
diff --git a/fs/sysv/ialloc.c b/fs/sysv/ialloc.c
index bbd69bdb0fa8..fcc498ec9b33 100644
--- a/fs/sysv/ialloc.c
+++ b/fs/sysv/ialloc.c
@@ -25,6 +25,7 @@
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/buffer_head.h>
+#include <linux/writeback.h>
#include "sysv.h"
/* We don't trust the value of
@@ -139,6 +140,9 @@ struct inode * sysv_new_inode(const struct inode * dir, mode_t mode)
struct inode *inode;
sysv_ino_t ino;
unsigned count;
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_NONE
+ };
inode = new_inode(sb);
if (!inode)
@@ -168,7 +172,7 @@ struct inode * sysv_new_inode(const struct inode * dir, mode_t mode)
insert_inode_hash(inode);
mark_inode_dirty(inode);
- sysv_write_inode(inode, 0); /* ensure inode not allocated again */
+ sysv_write_inode(inode, &wbc); /* ensure inode not allocated again */
mark_inode_dirty(inode); /* cleared by sysv_write_inode() */
/* That's it. */
unlock_super(sb);
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 4573734d723d..d4a5380b5669 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -43,6 +43,7 @@ static int sysv_sync_fs(struct super_block *sb, int wait)
* then attach current time stamp.
* But if the filesystem was marked clean, keep it clean.
*/
+ sb->s_dirt = 0;
old_time = fs32_to_cpu(sbi, *sbi->s_sb_time);
if (sbi->s_type == FSTYPE_SYSV4) {
if (*sbi->s_sb_state == cpu_to_fs32(sbi, 0x7c269d38 - old_time))
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index 076ca50e9933..c8ff0d1ae5d3 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -62,7 +62,9 @@
*/
static void shrink_liability(struct ubifs_info *c, int nr_to_write)
{
+ down_read(&c->vfs_sb->s_umount);
writeback_inodes_sb(c->vfs_sb);
+ up_read(&c->vfs_sb->s_umount);
}
/**
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 5692cf72b807..12f445cee9f7 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -967,12 +967,15 @@ static int do_writepage(struct page *page, int len)
* the page locked, and it locks @ui_mutex. However, write-back does take inode
* @i_mutex, which means other VFS operations may be run on this inode at the
* same time. And the problematic one is truncation to smaller size, from where
- * we have to call 'vmtruncate()', which first changes @inode->i_size, then
+ * we have to call 'simple_setsize()', which first changes @inode->i_size, then
* drops the truncated pages. And while dropping the pages, it takes the page
- * lock. This means that 'do_truncation()' cannot call 'vmtruncate()' with
+ * lock. This means that 'do_truncation()' cannot call 'simple_setsize()' with
* @ui_mutex locked, because it would deadlock with 'ubifs_writepage()'. This
* means that @inode->i_size is changed while @ui_mutex is unlocked.
*
+ * XXX: with the new truncate the above is not true anymore, the simple_setsize
+ * calls can be replaced with the individual components.
+ *
* But in 'ubifs_writepage()' we have to guarantee that we do not write beyond
* inode size. How do we do this if @inode->i_size may became smaller while we
* are in the middle of 'ubifs_writepage()'? The UBIFS solution is the
@@ -1125,7 +1128,7 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode,
budgeted = 0;
}
- err = vmtruncate(inode, new_size);
+ err = simple_setsize(inode, new_size);
if (err)
goto out_budg;
@@ -1214,7 +1217,7 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode,
if (attr->ia_valid & ATTR_SIZE) {
dbg_gen("size %lld -> %lld", inode->i_size, new_size);
- err = vmtruncate(inode, new_size);
+ err = simple_setsize(inode, new_size);
if (err)
goto out;
}
@@ -1223,7 +1226,7 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode,
if (attr->ia_valid & ATTR_SIZE) {
/* Truncation changes inode [mc]time */
inode->i_mtime = inode->i_ctime = ubifs_current_time(inode);
- /* 'vmtruncate()' changed @i_size, update @ui_size */
+ /* 'simple_setsize()' changed @i_size, update @ui_size */
ui->ui_size = inode->i_size;
}
@@ -1304,9 +1307,9 @@ static void *ubifs_follow_link(struct dentry *dentry, struct nameidata *nd)
return NULL;
}
-int ubifs_fsync(struct file *file, struct dentry *dentry, int datasync)
+int ubifs_fsync(struct file *file, int datasync)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = file->f_mapping->host;
struct ubifs_info *c = inode->i_sb->s_fs_info;
int err;
diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c
index ad7f67b827ea..0084a33c4c69 100644
--- a/fs/ubifs/lpt.c
+++ b/fs/ubifs/lpt.c
@@ -1457,13 +1457,13 @@ struct ubifs_lprops *ubifs_lpt_lookup(struct ubifs_info *c, int lnum)
shft -= UBIFS_LPT_FANOUT_SHIFT;
nnode = ubifs_get_nnode(c, nnode, iip);
if (IS_ERR(nnode))
- return ERR_PTR(PTR_ERR(nnode));
+ return ERR_CAST(nnode);
}
iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
shft -= UBIFS_LPT_FANOUT_SHIFT;
pnode = ubifs_get_pnode(c, nnode, iip);
if (IS_ERR(pnode))
- return ERR_PTR(PTR_ERR(pnode));
+ return ERR_CAST(pnode);
iip = (i & (UBIFS_LPT_FANOUT - 1));
dbg_lp("LEB %d, free %d, dirty %d, flags %d", lnum,
pnode->lprops[iip].free, pnode->lprops[iip].dirty,
@@ -1586,7 +1586,7 @@ struct ubifs_lprops *ubifs_lpt_lookup_dirty(struct ubifs_info *c, int lnum)
nnode = c->nroot;
nnode = dirty_cow_nnode(c, nnode);
if (IS_ERR(nnode))
- return ERR_PTR(PTR_ERR(nnode));
+ return ERR_CAST(nnode);
i = lnum - c->main_first;
shft = c->lpt_hght * UBIFS_LPT_FANOUT_SHIFT;
for (h = 1; h < c->lpt_hght; h++) {
@@ -1594,19 +1594,19 @@ struct ubifs_lprops *ubifs_lpt_lookup_dirty(struct ubifs_info *c, int lnum)
shft -= UBIFS_LPT_FANOUT_SHIFT;
nnode = ubifs_get_nnode(c, nnode, iip);
if (IS_ERR(nnode))
- return ERR_PTR(PTR_ERR(nnode));
+ return ERR_CAST(nnode);
nnode = dirty_cow_nnode(c, nnode);
if (IS_ERR(nnode))
- return ERR_PTR(PTR_ERR(nnode));
+ return ERR_CAST(nnode);
}
iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
shft -= UBIFS_LPT_FANOUT_SHIFT;
pnode = ubifs_get_pnode(c, nnode, iip);
if (IS_ERR(pnode))
- return ERR_PTR(PTR_ERR(pnode));
+ return ERR_CAST(pnode);
pnode = dirty_cow_pnode(c, pnode);
if (IS_ERR(pnode))
- return ERR_PTR(PTR_ERR(pnode));
+ return ERR_CAST(pnode);
iip = (i & (UBIFS_LPT_FANOUT - 1));
dbg_lp("LEB %d, free %d, dirty %d, flags %d", lnum,
pnode->lprops[iip].free, pnode->lprops[iip].dirty,
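
ERR_PTR(PTR_ERR(p)) round-trips an error pointer through a long just to change its type; ERR_CAST() from <linux/err.h> says the same thing directly. A shape sketch (struct foo/bar and the helpers are hypothetical):

static struct foo *lookup_foo(void)
{
	struct bar *b = get_bar();	/* hypothetical */

	if (IS_ERR(b))
		return ERR_CAST(b);	/* was: ERR_PTR(PTR_ERR(b)) */
	return foo_of(b);		/* hypothetical conversion */
}
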
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index 13cb7a4237bf..d12535b7fc78 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -646,7 +646,7 @@ static struct ubifs_pnode *pnode_lookup(struct ubifs_info *c, int i)
shft -= UBIFS_LPT_FANOUT_SHIFT;
nnode = ubifs_get_nnode(c, nnode, iip);
if (IS_ERR(nnode))
- return ERR_PTR(PTR_ERR(nnode));
+ return ERR_CAST(nnode);
}
iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
return ubifs_get_pnode(c, nnode, iip);
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index 109c6ea03bb5..daae9e1f5382 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -24,7 +24,7 @@
* This file implements functions needed to recover from unclean un-mounts.
* When UBIFS is mounted, it checks a flag on the master node to determine if
* an un-mount was completed successfully. If not, the process of mounting
- * incorparates additional checking and fixing of on-flash data structures.
+ * incorporates additional checking and fixing of on-flash data structures.
* UBIFS always cleans away all remnants of an unclean un-mount, so that
* errors do not accumulate. However UBIFS defers recovery if it is mounted
* read-only, and the flash is not modified in that case.
@@ -1063,8 +1063,21 @@ int ubifs_rcvry_gc_commit(struct ubifs_info *c)
}
err = ubifs_find_dirty_leb(c, &lp, wbuf->offs, 2);
if (err) {
- if (err == -ENOSPC)
- dbg_err("could not find a dirty LEB");
+ /*
+		 * No dirty or empty LEB is available once enough space has
+		 * been reserved for the index. Try to use
+		 * 'ubifs_find_free_leb_for_idx()', which will return any empty
+		 * LEBs (ignoring index requirements). If the index then
+		 * doesn't have enough LEBs the recovery commit will fail -
+		 * which is the same result anyway, i.e. recovery fails. So
+ * there is no problem ignoring index requirements and just
+ * grabbing a free LEB since we have already established there
+ * is not a dirty LEB we could have used instead.
+ */
+ if (err == -ENOSPC) {
+ dbg_rcvry("could not find a dirty LEB");
+ goto find_free;
+ }
return err;
}
ubifs_assert(!(lp.flags & LPROPS_INDEX));
@@ -1139,8 +1152,8 @@ int ubifs_rcvry_gc_commit(struct ubifs_info *c)
find_free:
/*
* There is no GC head LEB or the free space in the GC head LEB is too
- * small. Allocate gc_lnum by calling 'ubifs_find_free_leb_for_idx()' so
- * GC is not run.
+	 * small, or there are no dirty LEBs. Allocate gc_lnum by calling
+ * 'ubifs_find_free_leb_for_idx()' so GC is not run.
*/
lnum = ubifs_find_free_leb_for_idx(c);
if (lnum < 0) {
diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c
index 02feb59cefca..0b201114a5ad 100644
--- a/fs/ubifs/shrinker.c
+++ b/fs/ubifs/shrinker.c
@@ -277,7 +277,7 @@ static int kick_a_thread(void)
return 0;
}
-int ubifs_shrinker(int nr, gfp_t gfp_mask)
+int ubifs_shrinker(struct shrinker *shrink, int nr, gfp_t gfp_mask)
{
int freed, contention = 0;
long clean_zn_cnt = atomic_long_read(&ubifs_clean_zn_cnt);
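
The shrinker callback now receives the struct shrinker it was registered with, which lets a single callback locate per-instance state. Under the API of this era registration looks like the sketch below (names hypothetical; by convention nr_to_scan == 0 means "just report the count"):

static int example_shrink(struct shrinker *shrink, int nr_to_scan,
			  gfp_t gfp_mask)
{
	if (nr_to_scan == 0)
		return example_object_count();	/* hypothetical */
	return example_free_objects(nr_to_scan);
}

static struct shrinker example_shrinker = {
	.shrink	= example_shrink,
	.seeks	= DEFAULT_SEEKS,
};

/* register_shrinker(&example_shrinker) at init,
 * unregister_shrinker(&example_shrinker) on teardown */
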
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 4d2f2157dd3f..5fc5a0988970 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1307,6 +1307,8 @@ static int mount_ubifs(struct ubifs_info *c)
if (err)
goto out_orphans;
err = ubifs_rcvry_gc_commit(c);
+ if (err)
+ goto out_orphans;
} else {
err = take_gc_lnum(c);
if (err)
@@ -1318,7 +1320,7 @@ static int mount_ubifs(struct ubifs_info *c)
*/
err = ubifs_leb_unmap(c, c->gc_lnum);
if (err)
- return err;
+ goto out_orphans;
}
err = dbg_check_lprops(c);
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index bd2542dad014..04310878f449 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -379,7 +379,7 @@ struct ubifs_gced_idx_leb {
* The @ui_size is a "shadow" variable for @inode->i_size and UBIFS uses
* @ui_size instead of @inode->i_size. The reason for this is that UBIFS cannot
* make sure @inode->i_size is always changed under @ui_mutex, because it
- * cannot call 'vmtruncate()' with @ui_mutex locked, because it would deadlock
+ * cannot call 'simple_setsize()' with @ui_mutex locked, because it would deadlock
* with 'ubifs_writepage()' (see file.c). All the other inode fields are
* changed under @ui_mutex, so they do not need "shadow" fields. Note, one
* could consider to rework locking and base it on "shadow" fields.
@@ -1575,7 +1575,7 @@ int ubifs_tnc_start_commit(struct ubifs_info *c, struct ubifs_zbranch *zroot);
int ubifs_tnc_end_commit(struct ubifs_info *c);
/* shrinker.c */
-int ubifs_shrinker(int nr_to_scan, gfp_t gfp_mask);
+int ubifs_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask);
/* commit.c */
int ubifs_bg_thread(void *info);
@@ -1678,7 +1678,7 @@ const struct ubifs_lprops *ubifs_fast_find_frdi_idx(struct ubifs_info *c);
int ubifs_calc_dark(const struct ubifs_info *c, int spc);
/* file.c */
-int ubifs_fsync(struct file *file, struct dentry *dentry, int datasync);
+int ubifs_fsync(struct file *file, int datasync);
int ubifs_setattr(struct dentry *dentry, struct iattr *attr);
/* dir.c */
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index 9a9378b4eb5a..b608efaa4cee 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -21,7 +21,6 @@
#include "udfdecl.h"
-#include <linux/quotaops.h>
#include <linux/buffer_head.h>
#include <linux/bitops.h>
@@ -159,8 +158,6 @@ static void udf_bitmap_free_blocks(struct super_block *sb,
udf_debug("byte=%2x\n",
((char *)bh->b_data)[(bit + i) >> 3]);
} else {
- if (inode)
- dquot_free_block(inode, 1);
udf_add_free_space(sb, sbi->s_partition, 1);
}
}
@@ -210,15 +207,8 @@ static int udf_bitmap_prealloc_blocks(struct super_block *sb,
bit = block % (sb->s_blocksize << 3);
while (bit < (sb->s_blocksize << 3) && block_count > 0) {
- if (!udf_test_bit(bit, bh->b_data))
+ if (!udf_clear_bit(bit, bh->b_data))
goto out;
- else if (dquot_prealloc_block(inode, 1))
- goto out;
- else if (!udf_clear_bit(bit, bh->b_data)) {
- udf_debug("bit already cleared for block %d\n", bit);
- dquot_free_block(inode, 1);
- goto out;
- }
block_count--;
alloc_count++;
bit++;
@@ -338,20 +328,6 @@ search_back:
}
got_block:
-
- /*
- * Check quota for allocation of this block.
- */
- if (inode) {
- int ret = dquot_alloc_block(inode, 1);
-
- if (ret) {
- mutex_unlock(&sbi->s_alloc_mutex);
- *err = ret;
- return 0;
- }
- }
-
newblock = bit + (block_group << (sb->s_blocksize_bits + 3)) -
(sizeof(struct spaceBitmapDesc) << 3);
@@ -401,10 +377,6 @@ static void udf_table_free_blocks(struct super_block *sb,
}
iinfo = UDF_I(table);
- /* We do this up front - There are some error conditions that
- could occure, but.. oh well */
- if (inode)
- dquot_free_block(inode, count);
udf_add_free_space(sb, sbi->s_partition, count);
start = bloc->logicalBlockNum + offset;
@@ -649,10 +621,7 @@ static int udf_table_prealloc_blocks(struct super_block *sb,
epos.offset -= adsize;
alloc_count = (elen >> sb->s_blocksize_bits);
- if (inode && dquot_prealloc_block(inode,
- alloc_count > block_count ? block_count : alloc_count))
- alloc_count = 0;
- else if (alloc_count > block_count) {
+ if (alloc_count > block_count) {
alloc_count = block_count;
eloc.logicalBlockNum += alloc_count;
elen -= (alloc_count << sb->s_blocksize_bits);
@@ -752,14 +721,6 @@ static int udf_table_new_block(struct super_block *sb,
newblock = goal_eloc.logicalBlockNum;
goal_eloc.logicalBlockNum++;
goal_elen -= sb->s_blocksize;
- if (inode) {
- *err = dquot_alloc_block(inode, 1);
- if (*err) {
- brelse(goal_epos.bh);
- mutex_unlock(&sbi->s_alloc_mutex);
- return 0;
- }
- }
if (goal_elen)
udf_write_aext(table, &goal_epos, &goal_eloc, goal_elen, 1);
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index 3a84455c2a77..51552bf50225 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -207,8 +207,9 @@ static int udf_readdir(struct file *filp, void *dirent, filldir_t filldir)
/* readdir and lookup functions */
const struct file_operations udf_dir_operations = {
+ .llseek = generic_file_llseek,
.read = generic_read_dir,
.readdir = udf_readdir,
.unlocked_ioctl = udf_ioctl,
- .fsync = simple_fsync,
+ .fsync = generic_file_fsync,
};
diff --git a/fs/udf/file.c b/fs/udf/file.c
index baae3a723946..94e06d6bddbd 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -34,7 +34,6 @@
#include <linux/errno.h>
#include <linux/smp_lock.h>
#include <linux/pagemap.h>
-#include <linux/quotaops.h>
#include <linux/buffer_head.h>
#include <linux/aio.h>
#include <linux/smp_lock.h>
@@ -219,39 +218,16 @@ const struct file_operations udf_file_operations = {
.read = do_sync_read,
.aio_read = generic_file_aio_read,
.unlocked_ioctl = udf_ioctl,
- .open = dquot_file_open,
+ .open = generic_file_open,
.mmap = generic_file_mmap,
.write = do_sync_write,
.aio_write = udf_file_aio_write,
.release = udf_release_file,
- .fsync = simple_fsync,
+ .fsync = generic_file_fsync,
.splice_read = generic_file_splice_read,
.llseek = generic_file_llseek,
};
-int udf_setattr(struct dentry *dentry, struct iattr *iattr)
-{
- struct inode *inode = dentry->d_inode;
- int error;
-
- error = inode_change_ok(inode, iattr);
- if (error)
- return error;
-
- if (is_quota_modification(inode, iattr))
- dquot_initialize(inode);
-
- if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) ||
- (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) {
- error = dquot_transfer(inode, iattr);
- if (error)
- return error;
- }
-
- return inode_setattr(inode, iattr);
-}
-
const struct inode_operations udf_file_inode_operations = {
.truncate = udf_truncate,
- .setattr = udf_setattr,
};
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index 2b5586c7f02a..18cd7111185d 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -20,7 +20,6 @@
#include "udfdecl.h"
#include <linux/fs.h>
-#include <linux/quotaops.h>
#include <linux/sched.h>
#include <linux/slab.h>
@@ -32,13 +31,6 @@ void udf_free_inode(struct inode *inode)
struct super_block *sb = inode->i_sb;
struct udf_sb_info *sbi = UDF_SB(sb);
- /*
- * Note: we must free any quota before locking the superblock,
- * as writing the quota to disk may need the lock as well.
- */
- dquot_free_inode(inode);
- dquot_drop(inode);
-
clear_inode(inode);
mutex_lock(&sbi->s_alloc_mutex);
@@ -61,7 +53,7 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
struct super_block *sb = dir->i_sb;
struct udf_sb_info *sbi = UDF_SB(sb);
struct inode *inode;
- int block, ret;
+ int block;
uint32_t start = UDF_I(dir)->i_location.logicalBlockNum;
struct udf_inode_info *iinfo;
struct udf_inode_info *dinfo = UDF_I(dir);
@@ -146,17 +138,6 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
insert_inode_hash(inode);
mark_inode_dirty(inode);
- dquot_initialize(inode);
- ret = dquot_alloc_inode(inode);
- if (ret) {
- dquot_drop(inode);
- inode->i_flags |= S_NOQUOTA;
- inode->i_nlink = 0;
- iput(inode);
- *err = ret;
- return NULL;
- }
-
*err = 0;
return inode;
}
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 8a3fbd177cab..124852bcf6fe 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -36,7 +36,6 @@
#include <linux/pagemap.h>
#include <linux/buffer_head.h>
#include <linux/writeback.h>
-#include <linux/quotaops.h>
#include <linux/slab.h>
#include <linux/crc-itu-t.h>
@@ -71,9 +70,6 @@ static int udf_get_block(struct inode *, sector_t, struct buffer_head *, int);
void udf_delete_inode(struct inode *inode)
{
- if (!is_bad_inode(inode))
- dquot_initialize(inode);
-
truncate_inode_pages(&inode->i_data, 0);
if (is_bad_inode(inode))
@@ -113,7 +109,6 @@ void udf_clear_inode(struct inode *inode)
(unsigned long long)iinfo->i_lenExtents);
}
- dquot_drop(inode);
kfree(iinfo->i_ext.i_data);
iinfo->i_ext.i_data = NULL;
}
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 585f733615dc..bf5fc674193c 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -27,7 +27,6 @@
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/slab.h>
-#include <linux/quotaops.h>
#include <linux/smp_lock.h>
#include <linux/buffer_head.h>
#include <linux/sched.h>
@@ -563,8 +562,6 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode,
int err;
struct udf_inode_info *iinfo;
- dquot_initialize(dir);
-
lock_kernel();
inode = udf_new_inode(dir, mode, &err);
if (!inode) {
@@ -617,8 +614,6 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode,
if (!old_valid_dev(rdev))
return -EINVAL;
- dquot_initialize(dir);
-
lock_kernel();
err = -EIO;
inode = udf_new_inode(dir, mode, &err);
@@ -664,8 +659,6 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode)
struct udf_inode_info *dinfo = UDF_I(dir);
struct udf_inode_info *iinfo;
- dquot_initialize(dir);
-
lock_kernel();
err = -EMLINK;
if (dir->i_nlink >= (256 << sizeof(dir->i_nlink)) - 1)
@@ -800,8 +793,6 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry)
struct fileIdentDesc *fi, cfi;
struct kernel_lb_addr tloc;
- dquot_initialize(dir);
-
retval = -ENOENT;
lock_kernel();
fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi);
@@ -848,8 +839,6 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry)
struct fileIdentDesc cfi;
struct kernel_lb_addr tloc;
- dquot_initialize(dir);
-
retval = -ENOENT;
lock_kernel();
fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi);
@@ -904,8 +893,6 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
struct buffer_head *bh;
struct udf_inode_info *iinfo;
- dquot_initialize(dir);
-
lock_kernel();
inode = udf_new_inode(dir, S_IFLNK | S_IRWXUGO, &err);
if (!inode)
@@ -1075,8 +1062,6 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir,
int err;
struct buffer_head *bh;
- dquot_initialize(dir);
-
lock_kernel();
if (inode->i_nlink >= (256 << sizeof(inode->i_nlink)) - 1) {
unlock_kernel();
@@ -1139,9 +1124,6 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
struct kernel_lb_addr tloc;
struct udf_inode_info *old_iinfo = UDF_I(old_inode);
- dquot_initialize(old_dir);
- dquot_initialize(new_dir);
-
lock_kernel();
ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi);
if (ofi) {
@@ -1387,7 +1369,6 @@ const struct export_operations udf_export_ops = {
const struct inode_operations udf_dir_inode_operations = {
.lookup = udf_lookup,
.create = udf_create,
- .setattr = udf_setattr,
.link = udf_link,
.unlink = udf_unlink,
.symlink = udf_symlink,
@@ -1400,5 +1381,4 @@ const struct inode_operations udf_symlink_inode_operations = {
.readlink = generic_readlink,
.follow_link = page_follow_link_light,
.put_link = page_put_link,
- .setattr = udf_setattr,
};
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 1e4543cbcd27..612d1e2e285a 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -557,6 +557,7 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
{
struct udf_options uopt;
struct udf_sb_info *sbi = UDF_SB(sb);
+ int error = 0;
uopt.flags = sbi->s_flags;
uopt.uid = sbi->s_uid;
@@ -582,17 +583,17 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
*flags |= MS_RDONLY;
}
- if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
- unlock_kernel();
- return 0;
- }
+ if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
+ goto out_unlock;
+
if (*flags & MS_RDONLY)
udf_close_lvid(sb);
else
udf_open_lvid(sb);
+out_unlock:
unlock_kernel();
- return 0;
+ return error;
}
/* Check Volume Structure Descriptors (ECMA 167 2/9.1) */
@@ -1939,7 +1940,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
/* Fill in the rest of the superblock */
sb->s_op = &udf_sb_ops;
sb->s_export_op = &udf_export_ops;
- sb->dq_op = NULL;
+
sb->s_dirt = 0;
sb->s_magic = UDF_SUPER_MAGIC;
sb->s_time_gran = 1000;
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index 9079ff7d6255..2bac0354891f 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -131,7 +131,6 @@ extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *,
/* file.c */
extern long udf_ioctl(struct file *, unsigned int, unsigned long);
-extern int udf_setattr(struct dentry *dentry, struct iattr *iattr);
/* inode.c */
extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *);
extern int udf_sync_inode(struct inode *);
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index 5cfa4d85ccf2..048484fb10d2 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -12,7 +12,6 @@
#include <linux/stat.h>
#include <linux/time.h>
#include <linux/string.h>
-#include <linux/quotaops.h>
#include <linux/buffer_head.h>
#include <linux/capability.h>
#include <linux/bitops.h>
@@ -85,9 +84,6 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count)
"bit already cleared for fragment %u", i);
}
- dquot_free_block(inode, count);
-
-
fs32_add(sb, &ucg->cg_cs.cs_nffree, count);
uspi->cs_total.cs_nffree += count;
fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count);
@@ -195,7 +191,6 @@ do_more:
ubh_setblock(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno);
if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD)
ufs_clusteracct (sb, ucpi, blkno, 1);
- dquot_free_block(inode, uspi->s_fpb);
fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1);
uspi->cs_total.cs_nbfree++;
@@ -511,7 +506,6 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment,
struct ufs_cg_private_info * ucpi;
struct ufs_cylinder_group * ucg;
unsigned cgno, fragno, fragoff, count, fragsize, i;
- int ret;
UFSD("ENTER, fragment %llu, oldcount %u, newcount %u\n",
(unsigned long long)fragment, oldcount, newcount);
@@ -557,11 +551,6 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment,
fs32_add(sb, &ucg->cg_frsum[fragsize - count], 1);
for (i = oldcount; i < newcount; i++)
ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i);
- ret = dquot_alloc_block(inode, count);
- if (ret) {
- *err = ret;
- return 0;
- }
fs32_sub(sb, &ucg->cg_cs.cs_nffree, count);
fs32_sub(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count);
@@ -598,7 +587,6 @@ static u64 ufs_alloc_fragments(struct inode *inode, unsigned cgno,
struct ufs_cylinder_group * ucg;
unsigned oldcg, i, j, k, allocsize;
u64 result;
- int ret;
UFSD("ENTER, ino %lu, cgno %u, goal %llu, count %u\n",
inode->i_ino, cgno, (unsigned long long)goal, count);
@@ -667,7 +655,6 @@ cg_found:
for (i = count; i < uspi->s_fpb; i++)
ubh_setbit (UCPI_UBH(ucpi), ucpi->c_freeoff, goal + i);
i = uspi->s_fpb - count;
- dquot_free_block(inode, i);
fs32_add(sb, &ucg->cg_cs.cs_nffree, i);
uspi->cs_total.cs_nffree += i;
@@ -679,11 +666,6 @@ cg_found:
result = ufs_bitmap_search (sb, ucpi, goal, allocsize);
if (result == INVBLOCK)
return 0;
- ret = dquot_alloc_block(inode, count);
- if (ret) {
- *err = ret;
- return 0;
- }
for (i = 0; i < count; i++)
ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, result + i);
@@ -718,7 +700,6 @@ static u64 ufs_alloccg_block(struct inode *inode,
struct ufs_super_block_first * usb1;
struct ufs_cylinder_group * ucg;
u64 result, blkno;
- int ret;
UFSD("ENTER, goal %llu\n", (unsigned long long)goal);
@@ -752,11 +733,6 @@ gotit:
ubh_clrblock (UCPI_UBH(ucpi), ucpi->c_freeoff, blkno);
if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD)
ufs_clusteracct (sb, ucpi, blkno, -1);
- ret = dquot_alloc_block(inode, uspi->s_fpb);
- if (ret) {
- *err = ret;
- return INVBLOCK;
- }
fs32_sub(sb, &ucg->cg_cs.cs_nbfree, 1);
uspi->cs_total.cs_nbfree--;
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index 317a0d444f6b..ec784756dc65 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -666,6 +666,6 @@ not_empty:
const struct file_operations ufs_dir_operations = {
.read = generic_read_dir,
.readdir = ufs_readdir,
- .fsync = simple_fsync,
+ .fsync = generic_file_fsync,
.llseek = generic_file_llseek,
};
diff --git a/fs/ufs/file.c b/fs/ufs/file.c
index a8962cecde5b..33afa20d4509 100644
--- a/fs/ufs/file.c
+++ b/fs/ufs/file.c
@@ -24,7 +24,6 @@
*/
#include <linux/fs.h>
-#include <linux/quotaops.h>
#include "ufs_fs.h"
#include "ufs.h"
@@ -41,7 +40,7 @@ const struct file_operations ufs_file_operations = {
.write = do_sync_write,
.aio_write = generic_file_aio_write,
.mmap = generic_file_mmap,
- .open = dquot_file_open,
- .fsync = simple_fsync,
+ .open = generic_file_open,
+ .fsync = generic_file_fsync,
.splice_read = generic_file_splice_read,
};
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index 3a959d55084d..594480e537d2 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -27,7 +27,6 @@
#include <linux/time.h>
#include <linux/stat.h>
#include <linux/string.h>
-#include <linux/quotaops.h>
#include <linux/buffer_head.h>
#include <linux/sched.h>
#include <linux/bitops.h>
@@ -95,9 +94,6 @@ void ufs_free_inode (struct inode * inode)
is_directory = S_ISDIR(inode->i_mode);
- dquot_free_inode(inode);
- dquot_drop(inode);
-
clear_inode (inode);
if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_iusedoff, bit))
@@ -347,21 +343,12 @@ cg_found:
unlock_super (sb);
- dquot_initialize(inode);
- err = dquot_alloc_inode(inode);
- if (err) {
- dquot_drop(inode);
- goto fail_without_unlock;
- }
-
UFSD("allocating inode %lu\n", inode->i_ino);
UFSD("EXIT\n");
return inode;
fail_remove_inode:
unlock_super(sb);
-fail_without_unlock:
- inode->i_flags |= S_NOQUOTA;
inode->i_nlink = 0;
iput(inode);
UFSD("EXIT (FAILED): err %d\n", err);
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index cffa756f1047..73fe773aa034 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -37,7 +37,6 @@
#include <linux/smp_lock.h>
#include <linux/buffer_head.h>
#include <linux/writeback.h>
-#include <linux/quotaops.h>
#include "ufs_fs.h"
#include "ufs.h"
@@ -910,9 +909,6 @@ void ufs_delete_inode (struct inode * inode)
{
loff_t old_i_size;
- if (!is_bad_inode(inode))
- dquot_initialize(inode);
-
truncate_inode_pages(&inode->i_data, 0);
if (is_bad_inode(inode))
goto no_delete;
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index eabc02eb1294..b056f02b1fb3 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -30,7 +30,6 @@
#include <linux/time.h>
#include <linux/fs.h>
#include <linux/smp_lock.h>
-#include <linux/quotaops.h>
#include "ufs_fs.h"
#include "ufs.h"
@@ -86,8 +85,6 @@ static int ufs_create (struct inode * dir, struct dentry * dentry, int mode,
UFSD("BEGIN\n");
- dquot_initialize(dir);
-
inode = ufs_new_inode(dir, mode);
err = PTR_ERR(inode);
@@ -112,8 +109,6 @@ static int ufs_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_t
if (!old_valid_dev(rdev))
return -EINVAL;
- dquot_initialize(dir);
-
inode = ufs_new_inode(dir, mode);
err = PTR_ERR(inode);
if (!IS_ERR(inode)) {
@@ -138,8 +133,6 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry,
if (l > sb->s_blocksize)
goto out_notlocked;
- dquot_initialize(dir);
-
lock_kernel();
inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO);
err = PTR_ERR(inode);
@@ -185,8 +178,6 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir,
return -EMLINK;
}
- dquot_initialize(dir);
-
inode->i_ctime = CURRENT_TIME_SEC;
inode_inc_link_count(inode);
atomic_inc(&inode->i_count);
@@ -204,8 +195,6 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, int mode)
if (dir->i_nlink >= UFS_LINK_MAX)
goto out;
- dquot_initialize(dir);
-
lock_kernel();
inode_inc_link_count(dir);
@@ -250,8 +239,6 @@ static int ufs_unlink(struct inode *dir, struct dentry *dentry)
struct page *page;
int err = -ENOENT;
- dquot_initialize(dir);
-
de = ufs_find_entry(dir, &dentry->d_name, &page);
if (!de)
goto out;
@@ -296,9 +283,6 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct ufs_dir_entry *old_de;
int err = -ENOENT;
- dquot_initialize(old_dir);
- dquot_initialize(new_dir);
-
old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_page);
if (!old_de)
goto out;
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 14743d935a93..3ec5a9eb6efb 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -77,7 +77,6 @@
#include <linux/errno.h>
#include <linux/fs.h>
-#include <linux/quotaops.h>
#include <linux/slab.h>
#include <linux/time.h>
#include <linux/stat.h>
@@ -918,6 +917,7 @@ again:
sbi->s_bytesex = BYTESEX_LE;
switch ((uspi->fs_magic = fs32_to_cpu(sb, usb3->fs_magic))) {
case UFS_MAGIC:
+ case UFS_MAGIC_BW:
case UFS2_MAGIC:
case UFS_MAGIC_LFN:
case UFS_MAGIC_FEA:
@@ -927,6 +927,7 @@ again:
sbi->s_bytesex = BYTESEX_BE;
switch ((uspi->fs_magic = fs32_to_cpu(sb, usb3->fs_magic))) {
case UFS_MAGIC:
+ case UFS_MAGIC_BW:
case UFS2_MAGIC:
case UFS_MAGIC_LFN:
case UFS_MAGIC_FEA:
@@ -1045,7 +1046,7 @@ magic_found:
*/
sb->s_op = &ufs_super_ops;
sb->s_export_op = &ufs_export_ops;
- sb->dq_op = NULL; /***/
+
sb->s_magic = fs32_to_cpu(sb, usb3->fs_magic);
uspi->s_sblkno = fs32_to_cpu(sb, usb1->fs_sblkno);
@@ -1435,126 +1436,19 @@ static void destroy_inodecache(void)
kmem_cache_destroy(ufs_inode_cachep);
}
-static void ufs_clear_inode(struct inode *inode)
-{
- dquot_drop(inode);
-}
-
-#ifdef CONFIG_QUOTA
-static ssize_t ufs_quota_read(struct super_block *, int, char *,size_t, loff_t);
-static ssize_t ufs_quota_write(struct super_block *, int, const char *, size_t, loff_t);
-#endif
-
static const struct super_operations ufs_super_ops = {
.alloc_inode = ufs_alloc_inode,
.destroy_inode = ufs_destroy_inode,
.write_inode = ufs_write_inode,
.delete_inode = ufs_delete_inode,
- .clear_inode = ufs_clear_inode,
.put_super = ufs_put_super,
.write_super = ufs_write_super,
.sync_fs = ufs_sync_fs,
.statfs = ufs_statfs,
.remount_fs = ufs_remount,
.show_options = ufs_show_options,
-#ifdef CONFIG_QUOTA
- .quota_read = ufs_quota_read,
- .quota_write = ufs_quota_write,
-#endif
};
-#ifdef CONFIG_QUOTA
-
-/* Read data from quotafile - avoid pagecache and such because we cannot afford
- * acquiring the locks... As quota files are never truncated and quota code
- * itself serializes the operations (and noone else should touch the files)
- * we don't have to be afraid of races */
-static ssize_t ufs_quota_read(struct super_block *sb, int type, char *data,
- size_t len, loff_t off)
-{
- struct inode *inode = sb_dqopt(sb)->files[type];
- sector_t blk = off >> sb->s_blocksize_bits;
- int err = 0;
- int offset = off & (sb->s_blocksize - 1);
- int tocopy;
- size_t toread;
- struct buffer_head *bh;
- loff_t i_size = i_size_read(inode);
-
- if (off > i_size)
- return 0;
- if (off+len > i_size)
- len = i_size-off;
- toread = len;
- while (toread > 0) {
- tocopy = sb->s_blocksize - offset < toread ?
- sb->s_blocksize - offset : toread;
-
- bh = ufs_bread(inode, blk, 0, &err);
- if (err)
- return err;
- if (!bh) /* A hole? */
- memset(data, 0, tocopy);
- else {
- memcpy(data, bh->b_data+offset, tocopy);
- brelse(bh);
- }
- offset = 0;
- toread -= tocopy;
- data += tocopy;
- blk++;
- }
- return len;
-}
-
-/* Write to quotafile */
-static ssize_t ufs_quota_write(struct super_block *sb, int type,
- const char *data, size_t len, loff_t off)
-{
- struct inode *inode = sb_dqopt(sb)->files[type];
- sector_t blk = off >> sb->s_blocksize_bits;
- int err = 0;
- int offset = off & (sb->s_blocksize - 1);
- int tocopy;
- size_t towrite = len;
- struct buffer_head *bh;
-
- mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
- while (towrite > 0) {
- tocopy = sb->s_blocksize - offset < towrite ?
- sb->s_blocksize - offset : towrite;
-
- bh = ufs_bread(inode, blk, 1, &err);
- if (!bh)
- goto out;
- lock_buffer(bh);
- memcpy(bh->b_data+offset, data, tocopy);
- flush_dcache_page(bh->b_page);
- set_buffer_uptodate(bh);
- mark_buffer_dirty(bh);
- unlock_buffer(bh);
- brelse(bh);
- offset = 0;
- towrite -= tocopy;
- data += tocopy;
- blk++;
- }
-out:
- if (len == towrite) {
- mutex_unlock(&inode->i_mutex);
- return err;
- }
- if (inode->i_size < off+len-towrite)
- i_size_write(inode, off+len-towrite);
- inode->i_version++;
- inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
- mark_inode_dirty(inode);
- mutex_unlock(&inode->i_mutex);
- return len - towrite;
-}
-
-#endif
-
static int ufs_get_sb(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data, struct vfsmount *mnt)
{
diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c
index f294c44577dc..589e01a465ba 100644
--- a/fs/ufs/truncate.c
+++ b/fs/ufs/truncate.c
@@ -44,7 +44,6 @@
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/sched.h>
-#include <linux/quotaops.h>
#include "ufs_fs.h"
#include "ufs.h"
@@ -501,12 +500,10 @@ out:
return err;
}
-
/*
- * We don't define our `inode->i_op->truncate', and call it here,
- * because of:
- * - there is no way to know old size
- * - there is no way inform user about error, if it happens in `truncate'
+ * TODO:
+ * - truncate case should use proper ordering instead of using
+ * simple_setsize
*/
int ufs_setattr(struct dentry *dentry, struct iattr *attr)
{
@@ -518,19 +515,10 @@ int ufs_setattr(struct dentry *dentry, struct iattr *attr)
if (error)
return error;
- if (is_quota_modification(inode, attr))
- dquot_initialize(inode);
-
- if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
- (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
- error = dquot_transfer(inode, attr);
- if (error)
- return error;
- }
if (ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) {
loff_t old_i_size = inode->i_size;
- error = vmtruncate(inode, attr->ia_size);
+ error = simple_setsize(inode, attr->ia_size);
if (error)
return error;
error = ufs_truncate(inode, old_i_size);
diff --git a/fs/ufs/ufs_fs.h b/fs/ufs/ufs_fs.h
index 6943ec677c0b..8aba544f9fad 100644
--- a/fs/ufs/ufs_fs.h
+++ b/fs/ufs/ufs_fs.h
@@ -48,6 +48,7 @@ typedef __u16 __bitwise __fs16;
#define UFS_SECTOR_SIZE 512
#define UFS_SECTOR_BITS 9
#define UFS_MAGIC 0x00011954
+#define UFS_MAGIC_BW 0x0f242697
#define UFS2_MAGIC 0x19540119
#define UFS_CIGAM 0x54190100 /* byteswapped MAGIC */
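UFS_MAGIC_BW (0x0f242697) is the new magic value recognised by the super.c hunks above, which probe the superblock magic in both byte orders (the BYTESEX_LE and BYTESEX_BE switches each gain a case). A stand-alone sketch of a two-byte-order magic probe (an illustration using glibc's <byteswap.h>, not the kernel's fs32_to_cpu() path):

    #include <stdint.h>
    #include <stdio.h>
    #include <byteswap.h>

    #define UFS_MAGIC       0x00011954u
    #define UFS_MAGIC_BW    0x0f242697u     /* added by this patch */

    /* Accept a raw on-disk magic if it matches in either byte order. */
    static int ufs_magic_matches(uint32_t raw)
    {
            const uint32_t known[] = { UFS_MAGIC, UFS_MAGIC_BW };

            for (unsigned int i = 0; i < 2; i++)
                    if (raw == known[i] || bswap_32(raw) == known[i])
                            return 1;
            return 0;
    }

    int main(void)
    {
            /* 0x54190100 is UFS_CIGAM, the byteswapped UFS_MAGIC. */
            printf("%d %d\n", ufs_magic_matches(0x54190100),
                              ufs_magic_matches(0xdeadbeef));
            return 0;
    }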
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index c8fb13f83b3f..0dce969d6cad 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -87,11 +87,9 @@ xfs-y += xfs_alloc.o \
xfs_trans_buf.o \
xfs_trans_extfree.o \
xfs_trans_inode.o \
- xfs_trans_item.o \
xfs_utils.o \
xfs_vnodeops.o \
- xfs_rw.o \
- xfs_dmops.o
+ xfs_rw.o
xfs-$(CONFIG_XFS_TRACE) += xfs_btree_trace.o
diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c
index 9f769b5b38fc..b2771862fd3d 100644
--- a/fs/xfs/linux-2.6/xfs_acl.c
+++ b/fs/xfs/linux-2.6/xfs_acl.c
@@ -225,7 +225,7 @@ xfs_check_acl(struct inode *inode, int mask)
struct posix_acl *acl;
int error = -EAGAIN;
- xfs_itrace_entry(ip);
+ trace_xfs_check_acl(ip);
/*
* If there is no attribute fork no ACL exists on this inode and
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 089eaca860b4..d24e78f32f3e 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -21,19 +21,12 @@
#include "xfs_inum.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_dir2.h"
#include "xfs_trans.h"
-#include "xfs_dmapi.h"
#include "xfs_mount.h"
#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_alloc.h"
-#include "xfs_btree.h"
#include "xfs_error.h"
#include "xfs_rw.h"
#include "xfs_iomap.h"
@@ -92,18 +85,15 @@ void
xfs_count_page_state(
struct page *page,
int *delalloc,
- int *unmapped,
int *unwritten)
{
struct buffer_head *bh, *head;
- *delalloc = *unmapped = *unwritten = 0;
+ *delalloc = *unwritten = 0;
bh = head = page_buffers(page);
do {
- if (buffer_uptodate(bh) && !buffer_mapped(bh))
- (*unmapped) = 1;
- else if (buffer_unwritten(bh))
+ if (buffer_unwritten(bh))
(*unwritten) = 1;
else if (buffer_delay(bh))
(*delalloc) = 1;
@@ -212,23 +202,17 @@ xfs_setfilesize(
}
/*
- * Schedule IO completion handling on a xfsdatad if this was
- * the final hold on this ioend. If we are asked to wait,
- * flush the workqueue.
+ * Schedule IO completion handling on the final put of an ioend.
*/
STATIC void
xfs_finish_ioend(
- xfs_ioend_t *ioend,
- int wait)
+ struct xfs_ioend *ioend)
{
if (atomic_dec_and_test(&ioend->io_remaining)) {
- struct workqueue_struct *wq;
-
- wq = (ioend->io_type == IO_UNWRITTEN) ?
- xfsconvertd_workqueue : xfsdatad_workqueue;
- queue_work(wq, &ioend->io_work);
- if (wait)
- flush_workqueue(wq);
+ if (ioend->io_type == IO_UNWRITTEN)
+ queue_work(xfsconvertd_workqueue, &ioend->io_work);
+ else
+ queue_work(xfsdatad_workqueue, &ioend->io_work);
}
}
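With the wait argument gone, xfs_finish_ioend() only ever queues the completion work: the caller that drops the last reference picks the conversion workqueue for unwritten-extent I/O and the data workqueue for everything else, while the new xfs_finish_ioend_sync() added below serves callers that need the completion run in their own context. A user-space analogue of the queue-on-final-put pattern (a sketch with C11 atomics, not the kernel API):

    #include <stdatomic.h>
    #include <stdio.h>

    struct ioend {
            atomic_int remaining;           /* outstanding bios sharing this ioend */
            int needs_conversion;           /* stands in for IO_UNWRITTEN */
    };

    static void queue_completion(struct ioend *io)
    {
            printf("queued on %s workqueue\n",
                   io->needs_conversion ? "convert" : "data");
    }

    /* Only the caller that drops the last reference queues the work. */
    static void finish_ioend(struct ioend *io)
    {
            if (atomic_fetch_sub(&io->remaining, 1) == 1)
                    queue_completion(io);
    }

    int main(void)
    {
            struct ioend io = { .needs_conversion = 1 };

            atomic_init(&io.remaining, 3);
            finish_ioend(&io);
            finish_ioend(&io);
            finish_ioend(&io);              /* this call queues the completion */
            return 0;
    }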
@@ -272,11 +256,25 @@ xfs_end_io(
*/
if (error == EAGAIN) {
atomic_inc(&ioend->io_remaining);
- xfs_finish_ioend(ioend, 0);
+ xfs_finish_ioend(ioend);
/* ensure we don't spin on blocked ioends */
delay(1);
- } else
+ } else {
+ if (ioend->io_iocb)
+ aio_complete(ioend->io_iocb, ioend->io_result, 0);
xfs_destroy_ioend(ioend);
+ }
+}
+
+/*
+ * Call IO completion handling in caller context on the final put of an ioend.
+ */
+STATIC void
+xfs_finish_ioend_sync(
+ struct xfs_ioend *ioend)
+{
+ if (atomic_dec_and_test(&ioend->io_remaining))
+ xfs_end_io(&ioend->io_work);
}
/*
@@ -309,6 +307,8 @@ xfs_alloc_ioend(
atomic_inc(&XFS_I(ioend->io_inode)->i_iocount);
ioend->io_offset = 0;
ioend->io_size = 0;
+ ioend->io_iocb = NULL;
+ ioend->io_result = 0;
INIT_WORK(&ioend->io_work, xfs_end_io);
return ioend;
@@ -358,7 +358,7 @@ xfs_end_bio(
bio->bi_end_io = NULL;
bio_put(bio);
- xfs_finish_ioend(ioend, 0);
+ xfs_finish_ioend(ioend);
}
STATIC void
@@ -500,7 +500,7 @@ xfs_submit_ioend(
}
if (bio)
xfs_submit_ioend_bio(wbc, ioend, bio);
- xfs_finish_ioend(ioend, 0);
+ xfs_finish_ioend(ioend);
} while ((ioend = next) != NULL);
}
@@ -614,31 +614,30 @@ xfs_map_at_offset(
STATIC unsigned int
xfs_probe_page(
struct page *page,
- unsigned int pg_offset,
- int mapped)
+ unsigned int pg_offset)
{
+ struct buffer_head *bh, *head;
int ret = 0;
if (PageWriteback(page))
return 0;
+ if (!PageDirty(page))
+ return 0;
+ if (!page->mapping)
+ return 0;
+ if (!page_has_buffers(page))
+ return 0;
- if (page->mapping && PageDirty(page)) {
- if (page_has_buffers(page)) {
- struct buffer_head *bh, *head;
-
- bh = head = page_buffers(page);
- do {
- if (!buffer_uptodate(bh))
- break;
- if (mapped != buffer_mapped(bh))
- break;
- ret += bh->b_size;
- if (ret >= pg_offset)
- break;
- } while ((bh = bh->b_this_page) != head);
- } else
- ret = mapped ? 0 : PAGE_CACHE_SIZE;
- }
+ bh = head = page_buffers(page);
+ do {
+ if (!buffer_uptodate(bh))
+ break;
+ if (!buffer_mapped(bh))
+ break;
+ ret += bh->b_size;
+ if (ret >= pg_offset)
+ break;
+ } while ((bh = bh->b_this_page) != head);
return ret;
}
@@ -648,8 +647,7 @@ xfs_probe_cluster(
struct inode *inode,
struct page *startpage,
struct buffer_head *bh,
- struct buffer_head *head,
- int mapped)
+ struct buffer_head *head)
{
struct pagevec pvec;
pgoff_t tindex, tlast, tloff;
@@ -658,7 +656,7 @@ xfs_probe_cluster(
/* First sum forwards in this page */
do {
- if (!buffer_uptodate(bh) || (mapped != buffer_mapped(bh)))
+ if (!buffer_uptodate(bh) || !buffer_mapped(bh))
return total;
total += bh->b_size;
} while ((bh = bh->b_this_page) != head);
@@ -692,7 +690,7 @@ xfs_probe_cluster(
pg_offset = PAGE_CACHE_SIZE;
if (page->index == tindex && trylock_page(page)) {
- pg_len = xfs_probe_page(page, pg_offset, mapped);
+ pg_len = xfs_probe_page(page, pg_offset);
unlock_page(page);
}
@@ -761,7 +759,6 @@ xfs_convert_page(
struct xfs_bmbt_irec *imap,
xfs_ioend_t **ioendp,
struct writeback_control *wbc,
- int startio,
int all_bh)
{
struct buffer_head *bh, *head;
@@ -832,19 +829,14 @@ xfs_convert_page(
ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
xfs_map_at_offset(inode, bh, imap, offset);
- if (startio) {
- xfs_add_to_ioend(inode, bh, offset,
- type, ioendp, done);
- } else {
- set_buffer_dirty(bh);
- unlock_buffer(bh);
- mark_buffer_dirty(bh);
- }
+ xfs_add_to_ioend(inode, bh, offset, type,
+ ioendp, done);
+
page_dirty--;
count++;
} else {
type = IO_NEW;
- if (buffer_mapped(bh) && all_bh && startio) {
+ if (buffer_mapped(bh) && all_bh) {
lock_buffer(bh);
xfs_add_to_ioend(inode, bh, offset,
type, ioendp, done);
@@ -859,14 +851,12 @@ xfs_convert_page(
if (uptodate && bh == head)
SetPageUptodate(page);
- if (startio) {
- if (count) {
- wbc->nr_to_write--;
- if (wbc->nr_to_write <= 0)
- done = 1;
- }
- xfs_start_page_writeback(page, !page_dirty, count);
+ if (count) {
+ wbc->nr_to_write--;
+ if (wbc->nr_to_write <= 0)
+ done = 1;
}
+ xfs_start_page_writeback(page, !page_dirty, count);
return done;
fail_unlock_page:
@@ -886,7 +876,6 @@ xfs_cluster_write(
struct xfs_bmbt_irec *imap,
xfs_ioend_t **ioendp,
struct writeback_control *wbc,
- int startio,
int all_bh,
pgoff_t tlast)
{
@@ -902,7 +891,7 @@ xfs_cluster_write(
for (i = 0; i < pagevec_count(&pvec); i++) {
done = xfs_convert_page(inode, pvec.pages[i], tindex++,
- imap, ioendp, wbc, startio, all_bh);
+ imap, ioendp, wbc, all_bh);
if (done)
break;
}
@@ -981,7 +970,7 @@ xfs_aops_discard_page(
*/
error = xfs_bmapi(NULL, ip, offset_fsb, 1,
XFS_BMAPI_ENTIRE, NULL, 0, &imap,
- &nimaps, NULL, NULL);
+ &nimaps, NULL);
if (error) {
/* something screwed, just bail */
@@ -1009,7 +998,7 @@ xfs_aops_discard_page(
*/
xfs_bmap_init(&flist, &firstblock);
error = xfs_bunmapi(NULL, ip, offset_fsb, 1, 0, 1, &firstblock,
- &flist, NULL, &done);
+ &flist, &done);
ASSERT(!flist.xbf_count && !flist.xbf_first);
if (error) {
@@ -1032,50 +1021,66 @@ out_invalidate:
}
/*
- * Calling this without startio set means we are being asked to make a dirty
- * page ready for freeing it's buffers. When called with startio set then
- * we are coming from writepage.
+ * Write out a dirty page.
+ *
+ * For delalloc space on the page we need to allocate space and flush it.
+ * For unwritten space on the page we need to start the conversion to
+ * regular allocated space.
+ * For any other dirty buffer heads on the page we should flush them.
*
- * When called with startio set it is important that we write the WHOLE
- * page if possible.
- * The bh->b_state's cannot know if any of the blocks or which block for
- * that matter are dirty due to mmap writes, and therefore bh uptodate is
- * only valid if the page itself isn't completely uptodate. Some layers
- * may clear the page dirty flag prior to calling write page, under the
- * assumption the entire page will be written out; by not writing out the
- * whole page the page can be reused before all valid dirty data is
- * written out. Note: in the case of a page that has been dirty'd by
- * mapwrite and but partially setup by block_prepare_write the
- * bh->b_states's will not agree and only ones setup by BPW/BCW will have
- * valid state, thus the whole page must be written out thing.
+ * If we detect that a transaction would be required to flush the page, we
+ * have to check the process flags first: if we are already in a transaction
+ * or disk I/O during allocations is off, we need to fail the writepage and
+ * redirty the page.
*/
-
STATIC int
-xfs_page_state_convert(
- struct inode *inode,
- struct page *page,
- struct writeback_control *wbc,
- int startio,
- int unmapped) /* also implies page uptodate */
+xfs_vm_writepage(
+ struct page *page,
+ struct writeback_control *wbc)
{
+ struct inode *inode = page->mapping->host;
+ int delalloc, unwritten;
struct buffer_head *bh, *head;
struct xfs_bmbt_irec imap;
xfs_ioend_t *ioend = NULL, *iohead = NULL;
loff_t offset;
- unsigned long p_offset = 0;
unsigned int type;
__uint64_t end_offset;
pgoff_t end_index, last_index;
ssize_t size, len;
int flags, err, imap_valid = 0, uptodate = 1;
- int page_dirty, count = 0;
- int trylock = 0;
- int all_bh = unmapped;
+ int count = 0;
+ int all_bh = 0;
- if (startio) {
- if (wbc->sync_mode == WB_SYNC_NONE && wbc->nonblocking)
- trylock |= BMAPI_TRYLOCK;
- }
+ trace_xfs_writepage(inode, page, 0);
+
+ ASSERT(page_has_buffers(page));
+
+ /*
+ * Refuse to write the page out if we are called from reclaim context.
+ *
+ * This avoids stack overflows when called from deeply used stacks in
+ * random callers for direct reclaim or memcg reclaim. We explicitly
+ * allow reclaim from kswapd as the stack usage there is relatively low.
+ *
+ * This should really be done by the core VM, but until that happens
+ * filesystems like XFS, btrfs and ext4 have to take care of this
+ * by themselves.
+ */
+ if ((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == PF_MEMALLOC)
+ goto out_fail;
+
+ /*
+ * We need a transaction if there are delalloc or unwritten buffers
+ * on the page.
+ *
+ * If we need a transaction and the process flags say we are already
+ * in a transaction, or no IO is allowed then mark the page dirty
+ * again and leave the page as is.
+ */
+ xfs_count_page_state(page, &delalloc, &unwritten);
+ if ((current->flags & PF_FSTRANS) && (delalloc || unwritten))
+ goto out_fail;
/* Is this page beyond the end of the file? */
offset = i_size_read(inode);
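The guard added near the top of the new xfs_vm_writepage() refuses pages submitted from direct or memcg reclaim while still allowing kswapd: the expression (current->flags & (PF_MEMALLOC|PF_KSWAPD)) == PF_MEMALLOC is true only when PF_MEMALLOC is set and PF_KSWAPD is clear. A tiny stand-alone check of that mask logic (the flag values here are illustrative, not the real PF_* constants):

    #include <stdio.h>

    #define PF_MEMALLOC     0x1     /* illustrative values only */
    #define PF_KSWAPD       0x2

    /* True only for direct/memcg reclaim: PF_MEMALLOC set, PF_KSWAPD clear. */
    static int refuse_writepage(unsigned int flags)
    {
            return (flags & (PF_MEMALLOC | PF_KSWAPD)) == PF_MEMALLOC;
    }

    int main(void)
    {
            printf("%d\n", refuse_writepage(PF_MEMALLOC));                  /* 1 */
            printf("%d\n", refuse_writepage(PF_MEMALLOC | PF_KSWAPD));      /* 0 */
            printf("%d\n", refuse_writepage(0));                            /* 0 */
            return 0;
    }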
@@ -1084,50 +1089,33 @@ xfs_page_state_convert(
if (page->index >= end_index) {
if ((page->index >= end_index + 1) ||
!(i_size_read(inode) & (PAGE_CACHE_SIZE - 1))) {
- if (startio)
- unlock_page(page);
+ unlock_page(page);
return 0;
}
}
- /*
- * page_dirty is initially a count of buffers on the page before
- * EOF and is decremented as we move each into a cleanable state.
- *
- * Derivation:
- *
- * End offset is the highest offset that this page should represent.
- * If we are on the last page, (end_offset & (PAGE_CACHE_SIZE - 1))
- * will evaluate non-zero and be less than PAGE_CACHE_SIZE and
- * hence give us the correct page_dirty count. On any other page,
- * it will be zero and in that case we need page_dirty to be the
- * count of buffers on the page.
- */
end_offset = min_t(unsigned long long,
- (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT, offset);
+ (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT,
+ offset);
len = 1 << inode->i_blkbits;
- p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1),
- PAGE_CACHE_SIZE);
- p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE;
- page_dirty = p_offset / len;
bh = head = page_buffers(page);
offset = page_offset(page);
flags = BMAPI_READ;
type = IO_NEW;
- /* TODO: cleanup count and page_dirty */
-
do {
if (offset >= end_offset)
break;
if (!buffer_uptodate(bh))
uptodate = 0;
- if (!(PageUptodate(page) || buffer_uptodate(bh)) && !startio) {
- /*
- * the iomap is actually still valid, but the ioend
- * isn't. shouldn't happen too often.
- */
+
+ /*
+ * A hole may still be marked uptodate because discard_buffer
+ * leaves the flag set.
+ */
+ if (!buffer_mapped(bh) && buffer_uptodate(bh)) {
+ ASSERT(!buffer_dirty(bh));
imap_valid = 0;
continue;
}
@@ -1135,19 +1123,7 @@ xfs_page_state_convert(
if (imap_valid)
imap_valid = xfs_imap_valid(inode, &imap, offset);
- /*
- * First case, map an unwritten extent and prepare for
- * extent state conversion transaction on completion.
- *
- * Second case, allocate space for a delalloc buffer.
- * We can return EAGAIN here in the release page case.
- *
- * Third case, an unmapped buffer was found, and we are
- * in a path where we need to write the whole page out.
- */
- if (buffer_unwritten(bh) || buffer_delay(bh) ||
- ((buffer_uptodate(bh) || PageUptodate(page)) &&
- !buffer_mapped(bh) && (unmapped || startio))) {
+ if (buffer_unwritten(bh) || buffer_delay(bh)) {
int new_ioend = 0;
/*
@@ -1161,15 +1137,16 @@ xfs_page_state_convert(
flags = BMAPI_WRITE | BMAPI_IGNSTATE;
} else if (buffer_delay(bh)) {
type = IO_DELAY;
- flags = BMAPI_ALLOCATE | trylock;
- } else {
- type = IO_NEW;
- flags = BMAPI_WRITE | BMAPI_MMAP;
+ flags = BMAPI_ALLOCATE;
+
+ if (wbc->sync_mode == WB_SYNC_NONE &&
+ wbc->nonblocking)
+ flags |= BMAPI_TRYLOCK;
}
if (!imap_valid) {
/*
- * if we didn't have a valid mapping then we
+ * If we didn't have a valid mapping then we
* need to ensure that we put the new mapping
* in a new ioend structure. This needs to be
* done to ensure that the ioends correctly
@@ -1177,14 +1154,7 @@ xfs_page_state_convert(
* for unwritten extent conversion.
*/
new_ioend = 1;
- if (type == IO_NEW) {
- size = xfs_probe_cluster(inode,
- page, bh, head, 0);
- } else {
- size = len;
- }
-
- err = xfs_map_blocks(inode, offset, size,
+ err = xfs_map_blocks(inode, offset, len,
&imap, flags);
if (err)
goto error;
@@ -1193,19 +1163,11 @@ xfs_page_state_convert(
}
if (imap_valid) {
xfs_map_at_offset(inode, bh, &imap, offset);
- if (startio) {
- xfs_add_to_ioend(inode, bh, offset,
- type, &ioend,
- new_ioend);
- } else {
- set_buffer_dirty(bh);
- unlock_buffer(bh);
- mark_buffer_dirty(bh);
- }
- page_dirty--;
+ xfs_add_to_ioend(inode, bh, offset, type,
+ &ioend, new_ioend);
count++;
}
- } else if (buffer_uptodate(bh) && startio) {
+ } else if (buffer_uptodate(bh)) {
/*
* we got here because the buffer is already mapped.
* That means it must already have extents allocated
@@ -1213,8 +1175,7 @@ xfs_page_state_convert(
*/
if (!imap_valid || flags != BMAPI_READ) {
flags = BMAPI_READ;
- size = xfs_probe_cluster(inode, page, bh,
- head, 1);
+ size = xfs_probe_cluster(inode, page, bh, head);
err = xfs_map_blocks(inode, offset, size,
&imap, flags);
if (err)
@@ -1233,18 +1194,16 @@ xfs_page_state_convert(
*/
type = IO_NEW;
if (trylock_buffer(bh)) {
- ASSERT(buffer_mapped(bh));
if (imap_valid)
all_bh = 1;
xfs_add_to_ioend(inode, bh, offset, type,
&ioend, !imap_valid);
- page_dirty--;
count++;
} else {
imap_valid = 0;
}
- } else if ((buffer_uptodate(bh) || PageUptodate(page)) &&
- (unmapped || startio)) {
+ } else if (PageUptodate(page)) {
+ ASSERT(buffer_mapped(bh));
imap_valid = 0;
}
@@ -1256,8 +1215,7 @@ xfs_page_state_convert(
if (uptodate && bh == head)
SetPageUptodate(page);
- if (startio)
- xfs_start_page_writeback(page, 1, count);
+ xfs_start_page_writeback(page, 1, count);
if (ioend && imap_valid) {
xfs_off_t end_index;
@@ -1275,124 +1233,27 @@ xfs_page_state_convert(
end_index = last_index;
xfs_cluster_write(inode, page->index + 1, &imap, &ioend,
- wbc, startio, all_bh, end_index);
+ wbc, all_bh, end_index);
}
if (iohead)
xfs_submit_ioend(wbc, iohead);
- return page_dirty;
+ return 0;
error:
if (iohead)
xfs_cancel_ioend(iohead);
- /*
- * If it's delalloc and we have nowhere to put it,
- * throw it away, unless the lower layers told
- * us to try again.
- */
- if (err != -EAGAIN) {
- if (!unmapped)
- xfs_aops_discard_page(page);
- ClearPageUptodate(page);
- }
+ xfs_aops_discard_page(page);
+ ClearPageUptodate(page);
+ unlock_page(page);
return err;
-}
-
-/*
- * writepage: Called from one of two places:
- *
- * 1. we are flushing a delalloc buffer head.
- *
- * 2. we are writing out a dirty page. Typically the page dirty
- * state is cleared before we get here. In this case is it
- * conceivable we have no buffer heads.
- *
- * For delalloc space on the page we need to allocate space and
- * flush it. For unmapped buffer heads on the page we should
- * allocate space if the page is uptodate. For any other dirty
- * buffer heads on the page we should flush them.
- *
- * If we detect that a transaction would be required to flush
- * the page, we have to check the process flags first, if we
- * are already in a transaction or disk I/O during allocations
- * is off, we need to fail the writepage and redirty the page.
- */
-
-STATIC int
-xfs_vm_writepage(
- struct page *page,
- struct writeback_control *wbc)
-{
- int error;
- int need_trans;
- int delalloc, unmapped, unwritten;
- struct inode *inode = page->mapping->host;
-
- trace_xfs_writepage(inode, page, 0);
-
- /*
- * We need a transaction if:
- * 1. There are delalloc buffers on the page
- * 2. The page is uptodate and we have unmapped buffers
- * 3. The page is uptodate and we have no buffers
- * 4. There are unwritten buffers on the page
- */
-
- if (!page_has_buffers(page)) {
- unmapped = 1;
- need_trans = 1;
- } else {
- xfs_count_page_state(page, &delalloc, &unmapped, &unwritten);
- if (!PageUptodate(page))
- unmapped = 0;
- need_trans = delalloc + unmapped + unwritten;
- }
-
- /*
- * If we need a transaction and the process flags say
- * we are already in a transaction, or no IO is allowed
- * then mark the page dirty again and leave the page
- * as is.
- */
- if (current_test_flags(PF_FSTRANS) && need_trans)
- goto out_fail;
-
- /*
- * Delay hooking up buffer heads until we have
- * made our go/no-go decision.
- */
- if (!page_has_buffers(page))
- create_empty_buffers(page, 1 << inode->i_blkbits, 0);
-
-
- /*
- * VM calculation for nr_to_write seems off. Bump it way
- * up, this gets simple streaming writes zippy again.
- * To be reviewed again after Jens' writeback changes.
- */
- wbc->nr_to_write *= 4;
-
- /*
- * Convert delayed allocate, unwritten or unmapped space
- * to real space and flush out to disk.
- */
- error = xfs_page_state_convert(inode, page, wbc, 1, unmapped);
- if (error == -EAGAIN)
- goto out_fail;
- if (unlikely(error < 0))
- goto out_unlock;
-
- return 0;
out_fail:
redirty_page_for_writepage(wbc, page);
unlock_page(page);
return 0;
-out_unlock:
- unlock_page(page);
- return error;
}
STATIC int
@@ -1406,65 +1267,27 @@ xfs_vm_writepages(
/*
* Called to move a page into cleanable state - and from there
- * to be released. Possibly the page is already clean. We always
+ * to be released. The page should already be clean. We always
* have buffer heads in this call.
*
- * Returns 0 if the page is ok to release, 1 otherwise.
- *
- * Possible scenarios are:
- *
- * 1. We are being called to release a page which has been written
- * to via regular I/O. buffer heads will be dirty and possibly
- * delalloc. If no delalloc buffer heads in this case then we
- * can just return zero.
- *
- * 2. We are called to release a page which has been written via
- * mmap, all we need to do is ensure there is no delalloc
- * state in the buffer heads, if not we can let the caller
- * free them and we should come back later via writepage.
+ * Returns 1 if the page is ok to release, 0 otherwise.
*/
STATIC int
xfs_vm_releasepage(
struct page *page,
gfp_t gfp_mask)
{
- struct inode *inode = page->mapping->host;
- int dirty, delalloc, unmapped, unwritten;
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_ALL,
- .nr_to_write = 1,
- };
+ int delalloc, unwritten;
- trace_xfs_releasepage(inode, page, 0);
+ trace_xfs_releasepage(page->mapping->host, page, 0);
- if (!page_has_buffers(page))
- return 0;
-
- xfs_count_page_state(page, &delalloc, &unmapped, &unwritten);
- if (!delalloc && !unwritten)
- goto free_buffers;
+ xfs_count_page_state(page, &delalloc, &unwritten);
- if (!(gfp_mask & __GFP_FS))
+ if (WARN_ON(delalloc))
return 0;
-
- /* If we are already inside a transaction or the thread cannot
- * do I/O, we cannot release this page.
- */
- if (current_test_flags(PF_FSTRANS))
+ if (WARN_ON(unwritten))
return 0;
- /*
- * Convert delalloc space to real space, do not flush the
- * data out to disk, that will be done by the caller.
- * Never need to allocate space here - we will always
- * come back to writepage in that case.
- */
- dirty = xfs_page_state_convert(inode, page, &wbc, 0, 0);
- if (dirty == 0 && !unwritten)
- goto free_buffers;
- return 0;
-
-free_buffers:
return try_to_free_buffers(page);
}
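The rewritten xfs_vm_releasepage() no longer converts delalloc space itself; finding delalloc or unwritten buffers at release time is now treated as a bug. The idiom works because the kernel's WARN_ON() evaluates to its condition, so the new if (WARN_ON(delalloc)) return 0; both logs a warning and refuses the release. A minimal user-space stand-in for that behaviour (an assumption for illustration, using a GCC statement expression much like the kernel macro):

    #include <stdio.h>

    /* Stand-in for WARN_ON(): log when the condition is true, and also
     * evaluate to that condition so it can drive an if (). */
    #define WARN_ON(cond) ({                                        \
            int __c = !!(cond);                                     \
            if (__c)                                                \
                    fprintf(stderr, "WARNING: %s\n", #cond);        \
            __c;                                                    \
    })

    static int releasepage(int delalloc, int unwritten)
    {
            if (WARN_ON(delalloc))
                    return 0;
            if (WARN_ON(unwritten))
                    return 0;
            return 1;                       /* ok to free the buffers */
    }

    int main(void)
    {
            printf("%d %d\n", releasepage(0, 0), releasepage(1, 0));
            return 0;
    }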
@@ -1474,9 +1297,9 @@ __xfs_get_blocks(
sector_t iblock,
struct buffer_head *bh_result,
int create,
- int direct,
- bmapi_flags_t flags)
+ int direct)
{
+ int flags = create ? BMAPI_WRITE : BMAPI_READ;
struct xfs_bmbt_irec imap;
xfs_off_t offset;
ssize_t size;
@@ -1491,8 +1314,11 @@ __xfs_get_blocks(
if (!create && direct && offset >= i_size_read(inode))
return 0;
- error = xfs_iomap(XFS_I(inode), offset, size,
- create ? flags : BMAPI_READ, &imap, &nimap, &new);
+ if (direct && create)
+ flags |= BMAPI_DIRECT;
+
+ error = xfs_iomap(XFS_I(inode), offset, size, flags, &imap, &nimap,
+ &new);
if (error)
return -error;
if (nimap == 0)
@@ -1572,8 +1398,7 @@ xfs_get_blocks(
struct buffer_head *bh_result,
int create)
{
- return __xfs_get_blocks(inode, iblock,
- bh_result, create, 0, BMAPI_WRITE);
+ return __xfs_get_blocks(inode, iblock, bh_result, create, 0);
}
STATIC int
@@ -1583,61 +1408,59 @@ xfs_get_blocks_direct(
struct buffer_head *bh_result,
int create)
{
- return __xfs_get_blocks(inode, iblock,
- bh_result, create, 1, BMAPI_WRITE|BMAPI_DIRECT);
+ return __xfs_get_blocks(inode, iblock, bh_result, create, 1);
}
+/*
+ * Complete a direct I/O write request.
+ *
+ * If the private argument is non-NULL __xfs_get_blocks signals us that we
+ * need to issue a transaction to convert the range from unwritten to written
+ * extents. In case this is regular synchronous I/O we just call xfs_end_io
+ * to do this and we are done.  But in case this was a successful AIO
+ * request this handler is called from interrupt context, from which we
+ * can't start transactions. In that case offload the I/O completion to
+ * the workqueues we also use for buffered I/O completion.
+ */
STATIC void
-xfs_end_io_direct(
- struct kiocb *iocb,
- loff_t offset,
- ssize_t size,
- void *private)
+xfs_end_io_direct_write(
+ struct kiocb *iocb,
+ loff_t offset,
+ ssize_t size,
+ void *private,
+ int ret,
+ bool is_async)
{
- xfs_ioend_t *ioend = iocb->private;
+ struct xfs_ioend *ioend = iocb->private;
/*
- * Non-NULL private data means we need to issue a transaction to
- * convert a range from unwritten to written extents. This needs
- * to happen from process context but aio+dio I/O completion
- * happens from irq context so we need to defer it to a workqueue.
- * This is not necessary for synchronous direct I/O, but we do
- * it anyway to keep the code uniform and simpler.
- *
- * Well, if only it were that simple. Because synchronous direct I/O
- * requires extent conversion to occur *before* we return to userspace,
- * we have to wait for extent conversion to complete. Look at the
- * iocb that has been passed to us to determine if this is AIO or
- * not. If it is synchronous, tell xfs_finish_ioend() to kick the
- * workqueue and wait for it to complete.
- *
- * The core direct I/O code might be changed to always call the
- * completion handler in the future, in which case all this can
- * go away.
+ * blockdev_direct_IO can return an error even after the I/O
+ * completion handler was called. Thus we need to protect
+ * against double-freeing.
*/
+ iocb->private = NULL;
+
ioend->io_offset = offset;
ioend->io_size = size;
- if (ioend->io_type == IO_READ) {
- xfs_finish_ioend(ioend, 0);
- } else if (private && size > 0) {
- xfs_finish_ioend(ioend, is_sync_kiocb(iocb));
- } else {
+ if (private && size > 0)
+ ioend->io_type = IO_UNWRITTEN;
+
+ if (is_async) {
/*
- * A direct I/O write ioend starts it's life in unwritten
- * state in case they map an unwritten extent. This write
- * didn't map an unwritten extent so switch it's completion
- * handler.
+ * If we are converting an unwritten extent we need to delay
+ * the AIO completion until after the unwritten extent
+ * conversion has completed, otherwise do it ASAP.
*/
- ioend->io_type = IO_NEW;
- xfs_finish_ioend(ioend, 0);
+ if (ioend->io_type == IO_UNWRITTEN) {
+ ioend->io_iocb = iocb;
+ ioend->io_result = ret;
+ } else {
+ aio_complete(iocb, ret, 0);
+ }
+ xfs_finish_ioend(ioend);
+ } else {
+ xfs_finish_ioend_sync(ioend);
}
-
- /*
- * blockdev_direct_IO can return an error even after the I/O
- * completion handler was called. Thus we need to protect
- * against double-freeing.
- */
- iocb->private = NULL;
}
STATIC ssize_t
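The comment above spells out the constraint the new xfs_end_io_direct_write() works around: unwritten-extent conversion needs a transaction, which cannot be started from the interrupt context an AIO completion runs in, so async requests that still need conversion stash the iocb and result on the ioend and aio_complete() fires later from the workqueue, while synchronous requests run the completion in the caller's context. A user-space analogue of that deferral decision (an illustrative sketch, not the kernel interfaces):

    #include <stdio.h>

    struct dio_ioend {
            int needs_conversion;           /* stands in for IO_UNWRITTEN */
            void (*deferred_cb)(long);      /* stands in for io_iocb */
            long deferred_res;              /* stands in for io_result */
    };

    static void complete_cb(long res)
    {
            printf("completed: %ld\n", res);
    }

    static void end_io_direct_write(struct dio_ioend *io, long res, int is_async)
    {
            if (is_async && io->needs_conversion) {
                    /* Defer: the conversion worker will fire this later. */
                    io->deferred_cb = complete_cb;
                    io->deferred_res = res;
                    return;
            }
            complete_cb(res);               /* nothing to convert, or sync I/O */
    }

    int main(void)
    {
            struct dio_ioend io = { .needs_conversion = 1 };

            end_io_direct_write(&io, 4096, 1);       /* deferred */
            if (io.deferred_cb)
                    io.deferred_cb(io.deferred_res); /* done after conversion */

            end_io_direct_write(&io, 512, 0);        /* completes immediately */
            return 0;
    }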
@@ -1648,23 +1471,26 @@ xfs_vm_direct_IO(
loff_t offset,
unsigned long nr_segs)
{
- struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_mapping->host;
- struct block_device *bdev;
- ssize_t ret;
-
- bdev = xfs_find_bdev_for_inode(inode);
-
- iocb->private = xfs_alloc_ioend(inode, rw == WRITE ?
- IO_UNWRITTEN : IO_READ);
-
- ret = blockdev_direct_IO_no_locking(rw, iocb, inode, bdev, iov,
- offset, nr_segs,
- xfs_get_blocks_direct,
- xfs_end_io_direct);
+ struct inode *inode = iocb->ki_filp->f_mapping->host;
+ struct block_device *bdev = xfs_find_bdev_for_inode(inode);
+ ssize_t ret;
+
+ if (rw & WRITE) {
+ iocb->private = xfs_alloc_ioend(inode, IO_NEW);
+
+ ret = blockdev_direct_IO_no_locking(rw, iocb, inode, bdev, iov,
+ offset, nr_segs,
+ xfs_get_blocks_direct,
+ xfs_end_io_direct_write);
+ if (ret != -EIOCBQUEUED && iocb->private)
+ xfs_destroy_ioend(iocb->private);
+ } else {
+ ret = blockdev_direct_IO_no_locking(rw, iocb, inode, bdev, iov,
+ offset, nr_segs,
+ xfs_get_blocks_direct,
+ NULL);
+ }
- if (unlikely(ret != -EIOCBQUEUED && iocb->private))
- xfs_destroy_ioend(iocb->private);
return ret;
}
@@ -1679,8 +1505,8 @@ xfs_vm_write_begin(
void **fsdata)
{
*pagep = NULL;
- return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
- xfs_get_blocks);
+ return block_write_begin(file, mapping, pos, len, flags | AOP_FLAG_NOFS,
+ pagep, fsdata, xfs_get_blocks);
}
STATIC sector_t
@@ -1691,7 +1517,7 @@ xfs_vm_bmap(
struct inode *inode = (struct inode *)mapping->host;
struct xfs_inode *ip = XFS_I(inode);
- xfs_itrace_entry(XFS_I(inode));
+ trace_xfs_vm_bmap(XFS_I(inode));
xfs_ilock(ip, XFS_IOLOCK_SHARED);
xfs_flush_pages(ip, (xfs_off_t)0, -1, 0, FI_REMAPF);
xfs_iunlock(ip, XFS_IOLOCK_SHARED);
diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h
index 4cfc6ea87df8..c5057fb6237a 100644
--- a/fs/xfs/linux-2.6/xfs_aops.h
+++ b/fs/xfs/linux-2.6/xfs_aops.h
@@ -37,6 +37,8 @@ typedef struct xfs_ioend {
size_t io_size; /* size of the extent */
xfs_off_t io_offset; /* offset in the file */
struct work_struct io_work; /* xfsdatad work queue */
+ struct kiocb *io_iocb;
+ int io_result;
} xfs_ioend_t;
extern const struct address_space_operations xfs_address_space_operations;
@@ -45,6 +47,6 @@ extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int);
extern void xfs_ioend_init(void);
extern void xfs_ioend_wait(struct xfs_inode *);
-extern void xfs_count_page_state(struct page *, int *, int *, int *);
+extern void xfs_count_page_state(struct page *, int *, int *);
#endif /* __XFS_AOPS_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 649ade8ef598..ea79072f5210 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -39,13 +39,12 @@
#include "xfs_inum.h"
#include "xfs_log.h"
#include "xfs_ag.h"
-#include "xfs_dmapi.h"
#include "xfs_mount.h"
#include "xfs_trace.h"
static kmem_zone_t *xfs_buf_zone;
STATIC int xfsbufd(void *);
-STATIC int xfsbufd_wakeup(int, gfp_t);
+STATIC int xfsbufd_wakeup(struct shrinker *, int, gfp_t);
STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int);
static struct shrinker xfs_buf_shake = {
.shrink = xfsbufd_wakeup,
@@ -340,7 +339,7 @@ _xfs_buf_lookup_pages(
__func__, gfp_mask);
XFS_STATS_INC(xb_page_retries);
- xfsbufd_wakeup(0, gfp_mask);
+ xfsbufd_wakeup(NULL, 0, gfp_mask);
congestion_wait(BLK_RW_ASYNC, HZ/50);
goto retry;
}
@@ -579,9 +578,9 @@ _xfs_buf_read(
XBF_READ_AHEAD | _XBF_RUN_QUEUES);
status = xfs_buf_iorequest(bp);
- if (!status && !(flags & XBF_ASYNC))
- status = xfs_buf_iowait(bp);
- return status;
+ if (status || XFS_BUF_ISERROR(bp) || (flags & XBF_ASYNC))
+ return status;
+ return xfs_buf_iowait(bp);
}
xfs_buf_t *
@@ -897,36 +896,6 @@ xfs_buf_unlock(
trace_xfs_buf_unlock(bp, _RET_IP_);
}
-
-/*
- * Pinning Buffer Storage in Memory
- * Ensure that no attempt to force a buffer to disk will succeed.
- */
-void
-xfs_buf_pin(
- xfs_buf_t *bp)
-{
- trace_xfs_buf_pin(bp, _RET_IP_);
- atomic_inc(&bp->b_pin_count);
-}
-
-void
-xfs_buf_unpin(
- xfs_buf_t *bp)
-{
- trace_xfs_buf_unpin(bp, _RET_IP_);
-
- if (atomic_dec_and_test(&bp->b_pin_count))
- wake_up_all(&bp->b_waiters);
-}
-
-int
-xfs_buf_ispin(
- xfs_buf_t *bp)
-{
- return atomic_read(&bp->b_pin_count);
-}
-
STATIC void
xfs_buf_wait_unpin(
xfs_buf_t *bp)
@@ -1018,13 +987,12 @@ xfs_bwrite(
{
int error;
- bp->b_strat = xfs_bdstrat_cb;
bp->b_mount = mp;
bp->b_flags |= XBF_WRITE;
bp->b_flags &= ~(XBF_ASYNC | XBF_READ);
xfs_buf_delwri_dequeue(bp);
- xfs_buf_iostrategy(bp);
+ xfs_bdstrat_cb(bp);
error = xfs_buf_iowait(bp);
if (error)
@@ -1040,7 +1008,6 @@ xfs_bdwrite(
{
trace_xfs_buf_bdwrite(bp, _RET_IP_);
- bp->b_strat = xfs_bdstrat_cb;
bp->b_mount = mp;
bp->b_flags &= ~XBF_READ;
@@ -1075,7 +1042,6 @@ xfs_bioerror(
XFS_BUF_UNDONE(bp);
XFS_BUF_STALE(bp);
- XFS_BUF_CLR_BDSTRAT_FUNC(bp);
xfs_biodone(bp);
return EIO;
@@ -1105,7 +1071,6 @@ xfs_bioerror_relse(
XFS_BUF_DONE(bp);
XFS_BUF_STALE(bp);
XFS_BUF_CLR_IODONE_FUNC(bp);
- XFS_BUF_CLR_BDSTRAT_FUNC(bp);
if (!(fl & XBF_ASYNC)) {
/*
* Mark b_error and B_ERROR _both_.
@@ -1311,8 +1276,19 @@ submit_io:
if (size)
goto next_chunk;
} else {
- bio_put(bio);
+ /*
+ * if we get here, no pages were added to the bio. However,
+ * we can't just error out here - if the pages are locked then
+ * we have to unlock them otherwise we can hang on a later
+ * access to the page.
+ */
xfs_buf_ioerror(bp, EIO);
+ if (bp->b_flags & _XBF_PAGE_LOCKED) {
+ int i;
+ for (i = 0; i < bp->b_page_count; i++)
+ unlock_page(bp->b_pages[i]);
+ }
+ bio_put(bio);
}
}
@@ -1762,6 +1738,7 @@ xfs_buf_runall_queues(
STATIC int
xfsbufd_wakeup(
+ struct shrinker *shrink,
int priority,
gfp_t mask)
{
@@ -1803,7 +1780,7 @@ xfs_buf_delwri_split(
trace_xfs_buf_delwri_split(bp, _RET_IP_);
ASSERT(bp->b_flags & XBF_DELWRI);
- if (!xfs_buf_ispin(bp) && !xfs_buf_cond_lock(bp)) {
+ if (!XFS_BUF_ISPINNED(bp) && !xfs_buf_cond_lock(bp)) {
if (!force &&
time_before(jiffies, bp->b_queuetime + age)) {
xfs_buf_unlock(bp);
@@ -1888,7 +1865,7 @@ xfsbufd(
struct xfs_buf *bp;
bp = list_first_entry(&tmp, struct xfs_buf, b_list);
list_del_init(&bp->b_list);
- xfs_buf_iostrategy(bp);
+ xfs_bdstrat_cb(bp);
count++;
}
if (count)
@@ -1935,7 +1912,7 @@ xfs_flush_buftarg(
bp->b_flags &= ~XBF_ASYNC;
list_add(&bp->b_list, &wait_list);
}
- xfs_buf_iostrategy(bp);
+ xfs_bdstrat_cb(bp);
}
if (wait) {
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 5fbecefa5dfd..d072e5ff923b 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -44,57 +44,57 @@ typedef enum {
XBRW_ZERO = 3, /* Zero target memory */
} xfs_buf_rw_t;
-typedef enum {
- XBF_READ = (1 << 0), /* buffer intended for reading from device */
- XBF_WRITE = (1 << 1), /* buffer intended for writing to device */
- XBF_MAPPED = (1 << 2), /* buffer mapped (b_addr valid) */
- XBF_ASYNC = (1 << 4), /* initiator will not wait for completion */
- XBF_DONE = (1 << 5), /* all pages in the buffer uptodate */
- XBF_DELWRI = (1 << 6), /* buffer has dirty pages */
- XBF_STALE = (1 << 7), /* buffer has been staled, do not find it */
- XBF_FS_MANAGED = (1 << 8), /* filesystem controls freeing memory */
- XBF_ORDERED = (1 << 11), /* use ordered writes */
- XBF_READ_AHEAD = (1 << 12), /* asynchronous read-ahead */
- XBF_LOG_BUFFER = (1 << 13), /* this is a buffer used for the log */
-
- /* flags used only as arguments to access routines */
- XBF_LOCK = (1 << 14), /* lock requested */
- XBF_TRYLOCK = (1 << 15), /* lock requested, but do not wait */
- XBF_DONT_BLOCK = (1 << 16), /* do not block in current thread */
-
- /* flags used only internally */
- _XBF_PAGE_CACHE = (1 << 17),/* backed by pagecache */
- _XBF_PAGES = (1 << 18), /* backed by refcounted pages */
- _XBF_RUN_QUEUES = (1 << 19),/* run block device task queue */
- _XBF_DELWRI_Q = (1 << 21), /* buffer on delwri queue */
-
- /*
- * Special flag for supporting metadata blocks smaller than a FSB.
- *
- * In this case we can have multiple xfs_buf_t on a single page and
- * need to lock out concurrent xfs_buf_t readers as they only
- * serialise access to the buffer.
- *
- * If the FSB size >= PAGE_CACHE_SIZE case, we have no serialisation
- * between reads of the page. Hence we can have one thread read the
- * page and modify it, but then race with another thread that thinks
- * the page is not up-to-date and hence reads it again.
- *
- * The result is that the first modifcation to the page is lost.
- * This sort of AGF/AGI reading race can happen when unlinking inodes
- * that require truncation and results in the AGI unlinked list
- * modifications being lost.
- */
- _XBF_PAGE_LOCKED = (1 << 22),
-
- /*
- * If we try a barrier write, but it fails we have to communicate
- * this to the upper layers. Unfortunately b_error gets overwritten
- * when the buffer is re-issued so we have to add another flag to
- * keep this information.
- */
- _XFS_BARRIER_FAILED = (1 << 23),
-} xfs_buf_flags_t;
+#define XBF_READ (1 << 0) /* buffer intended for reading from device */
+#define XBF_WRITE (1 << 1) /* buffer intended for writing to device */
+#define XBF_MAPPED (1 << 2) /* buffer mapped (b_addr valid) */
+#define XBF_ASYNC (1 << 4) /* initiator will not wait for completion */
+#define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */
+#define XBF_DELWRI (1 << 6) /* buffer has dirty pages */
+#define XBF_STALE (1 << 7) /* buffer has been staled, do not find it */
+#define XBF_FS_MANAGED (1 << 8) /* filesystem controls freeing memory */
+#define XBF_ORDERED (1 << 11)/* use ordered writes */
+#define XBF_READ_AHEAD (1 << 12)/* asynchronous read-ahead */
+#define XBF_LOG_BUFFER (1 << 13)/* this is a buffer used for the log */
+
+/* flags used only as arguments to access routines */
+#define XBF_LOCK (1 << 14)/* lock requested */
+#define XBF_TRYLOCK (1 << 15)/* lock requested, but do not wait */
+#define XBF_DONT_BLOCK (1 << 16)/* do not block in current thread */
+
+/* flags used only internally */
+#define _XBF_PAGE_CACHE (1 << 17)/* backed by pagecache */
+#define _XBF_PAGES (1 << 18)/* backed by refcounted pages */
+#define _XBF_RUN_QUEUES (1 << 19)/* run block device task queue */
+#define _XBF_DELWRI_Q (1 << 21)/* buffer on delwri queue */
+
+/*
+ * Special flag for supporting metadata blocks smaller than a FSB.
+ *
+ * In this case we can have multiple xfs_buf_t on a single page and
+ * need to lock out concurrent xfs_buf_t readers as they only
+ * serialise access to the buffer.
+ *
+ * If the FSB size >= PAGE_CACHE_SIZE case, we have no serialisation
+ * between reads of the page. Hence we can have one thread read the
+ * page and modify it, but then race with another thread that thinks
+ * the page is not up-to-date and hence reads it again.
+ *
+ * The result is that the first modification to the page is lost.
+ * This sort of AGF/AGI reading race can happen when unlinking inodes
+ * that require truncation and results in the AGI unlinked list
+ * modifications being lost.
+ */
+#define _XBF_PAGE_LOCKED (1 << 22)
+
+/*
+ * If we try a barrier write, but it fails we have to communicate
+ * this to the upper layers. Unfortunately b_error gets overwritten
+ * when the buffer is re-issued so we have to add another flag to
+ * keep this information.
+ */
+#define _XFS_BARRIER_FAILED (1 << 23)
+
+typedef unsigned int xfs_buf_flags_t;
#define XFS_BUF_FLAGS \
{ XBF_READ, "READ" }, \
@@ -187,7 +187,6 @@ typedef struct xfs_buf {
atomic_t b_io_remaining; /* #outstanding I/O requests */
xfs_buf_iodone_t b_iodone; /* I/O completion function */
xfs_buf_relse_t b_relse; /* releasing function */
- xfs_buf_bdstrat_t b_strat; /* pre-write function */
struct completion b_iowait; /* queue for I/O waiters */
void *b_fspriv;
void *b_fspriv2;
@@ -245,11 +244,6 @@ extern int xfs_buf_iowait(xfs_buf_t *);
extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *,
xfs_buf_rw_t);
-static inline int xfs_buf_iostrategy(xfs_buf_t *bp)
-{
- return bp->b_strat ? bp->b_strat(bp) : xfs_buf_iorequest(bp);
-}
-
static inline int xfs_buf_geterror(xfs_buf_t *bp)
{
return bp ? bp->b_error : ENOMEM;
@@ -258,11 +252,6 @@ static inline int xfs_buf_geterror(xfs_buf_t *bp)
/* Buffer Utility Routines */
extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t);
-/* Pinning Buffer Storage in Memory */
-extern void xfs_buf_pin(xfs_buf_t *);
-extern void xfs_buf_unpin(xfs_buf_t *);
-extern int xfs_buf_ispin(xfs_buf_t *);
-
/* Delayed Write Buffer Routines */
extern void xfs_buf_delwri_dequeue(xfs_buf_t *);
extern void xfs_buf_delwri_promote(xfs_buf_t *);
@@ -326,8 +315,6 @@ extern void xfs_buf_terminate(void);
#define XFS_BUF_IODONE_FUNC(bp) ((bp)->b_iodone)
#define XFS_BUF_SET_IODONE_FUNC(bp, func) ((bp)->b_iodone = (func))
#define XFS_BUF_CLR_IODONE_FUNC(bp) ((bp)->b_iodone = NULL)
-#define XFS_BUF_SET_BDSTRAT_FUNC(bp, func) ((bp)->b_strat = (func))
-#define XFS_BUF_CLR_BDSTRAT_FUNC(bp) ((bp)->b_strat = NULL)
#define XFS_BUF_FSPRIVATE(bp, type) ((type)(bp)->b_fspriv)
#define XFS_BUF_SET_FSPRIVATE(bp, val) ((bp)->b_fspriv = (void*)(val))
@@ -351,7 +338,7 @@ extern void xfs_buf_terminate(void);
#define XFS_BUF_SET_VTYPE(bp, type) do { } while (0)
#define XFS_BUF_SET_REF(bp, ref) do { } while (0)
-#define XFS_BUF_ISPINNED(bp) xfs_buf_ispin(bp)
+#define XFS_BUF_ISPINNED(bp) atomic_read(&((bp)->b_pin_count))
#define XFS_BUF_VALUSEMA(bp) xfs_buf_lock_value(bp)
#define XFS_BUF_CPSEMA(bp) (xfs_buf_cond_lock(bp) == 0)
@@ -370,8 +357,6 @@ static inline void xfs_buf_relse(xfs_buf_t *bp)
xfs_buf_rele(bp);
}
-#define xfs_bpin(bp) xfs_buf_pin(bp)
-#define xfs_bunpin(bp) xfs_buf_unpin(bp)
#define xfs_biodone(bp) xfs_buf_ioend(bp, 0)
#define xfs_biomove(bp, off, len, data, rw) \
diff --git a/fs/xfs/linux-2.6/xfs_dmapi_priv.h b/fs/xfs/linux-2.6/xfs_dmapi_priv.h
deleted file mode 100644
index a8b0b1685eed..000000000000
--- a/fs/xfs/linux-2.6/xfs_dmapi_priv.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
- * Copyright (c) 2000-2006 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#ifndef __XFS_DMAPI_PRIV_H__
-#define __XFS_DMAPI_PRIV_H__
-
-/*
- * Based on IO_ISDIRECT, decide which i_ flag is set.
- */
-#define DM_SEM_FLAG_RD(ioflags) (((ioflags) & IO_ISDIRECT) ? \
- DM_FLAGS_IMUX : 0)
-#define DM_SEM_FLAG_WR (DM_FLAGS_IALLOCSEM_WR | DM_FLAGS_IMUX)
-
-#endif /*__XFS_DMAPI_PRIV_H__*/
diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
index 846b75aeb2ab..3764d74790ec 100644
--- a/fs/xfs/linux-2.6/xfs_export.c
+++ b/fs/xfs/linux-2.6/xfs_export.c
@@ -23,13 +23,13 @@
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
#include "xfs_mount.h"
#include "xfs_export.h"
#include "xfs_vnodeops.h"
#include "xfs_bmap_btree.h"
#include "xfs_inode.h"
#include "xfs_inode_item.h"
+#include "xfs_trace.h"
/*
* Note that we only accept fileids which are long enough rather than allow
@@ -128,13 +128,11 @@ xfs_nfs_get_inode(
return ERR_PTR(-ESTALE);
/*
- * The XFS_IGET_BULKSTAT means that an invalid inode number is just
- * fine and not an indication of a corrupted filesystem. Because
- * clients can send any kind of invalid file handle, e.g. after
- * a restore on the server we have to deal with this case gracefully.
+ * The XFS_IGET_UNTRUSTED means that an invalid inode number is just
+ * fine and not an indication of a corrupted filesystem as clients can
+ * send invalid file handles and we have to handle it gracefully.
*/
- error = xfs_iget(mp, NULL, ino, XFS_IGET_BULKSTAT,
- XFS_ILOCK_SHARED, &ip, 0);
+ error = xfs_iget(mp, NULL, ino, XFS_IGET_UNTRUSTED, 0, &ip);
if (error) {
/*
* EINVAL means the inode cluster doesn't exist anymore.
@@ -149,11 +147,10 @@ xfs_nfs_get_inode(
}
if (ip->i_d.di_gen != generation) {
- xfs_iput_new(ip, XFS_ILOCK_SHARED);
+ IRELE(ip);
return ERR_PTR(-ENOENT);
}
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
return VFS_I(ip);
}
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index d8fb1b5d6cb5..ba8ad422a165 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -22,23 +22,15 @@
#include "xfs_inum.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_dir2.h"
#include "xfs_trans.h"
-#include "xfs_dmapi.h"
#include "xfs_mount.h"
#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
#include "xfs_alloc.h"
-#include "xfs_btree.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dir2_sf.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_inode_item.h"
#include "xfs_bmap.h"
#include "xfs_error.h"
-#include "xfs_rw.h"
#include "xfs_vnodeops.h"
#include "xfs_da_btree.h"
#include "xfs_ioctl.h"
@@ -100,15 +92,15 @@ xfs_iozero(
STATIC int
xfs_file_fsync(
struct file *file,
- struct dentry *dentry,
int datasync)
{
- struct xfs_inode *ip = XFS_I(dentry->d_inode);
+ struct inode *inode = file->f_mapping->host;
+ struct xfs_inode *ip = XFS_I(inode);
struct xfs_trans *tp;
int error = 0;
int log_flushed = 0;
- xfs_itrace_entry(ip);
+ trace_xfs_file_fsync(ip);
if (XFS_FORCED_SHUTDOWN(ip->i_mount))
return -XFS_ERROR(EIO);
@@ -140,8 +132,8 @@ xfs_file_fsync(
* might gets cleared when the inode gets written out via the AIL
* or xfs_iflush_cluster.
*/
- if (((dentry->d_inode->i_state & I_DIRTY_DATASYNC) ||
- ((dentry->d_inode->i_state & I_DIRTY_SYNC) && !datasync)) &&
+ if (((inode->i_state & I_DIRTY_DATASYNC) ||
+ ((inode->i_state & I_DIRTY_SYNC) && !datasync)) &&
ip->i_update_core) {
/*
* Kick off a transaction to log the inode core to get the
@@ -166,8 +158,7 @@ xfs_file_fsync(
* transaction. So we play it safe and fire off the
* transaction anyway.
*/
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
- xfs_trans_ihold(tp, ip);
+ xfs_trans_ijoin(tp, ip);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
xfs_trans_set_sync(tp);
error = _xfs_trans_commit(tp, 0, &log_flushed);
@@ -275,20 +266,6 @@ xfs_file_aio_read(
mutex_lock(&inode->i_mutex);
xfs_ilock(ip, XFS_IOLOCK_SHARED);
- if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) {
- int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags);
- int iolock = XFS_IOLOCK_SHARED;
-
- ret = -XFS_SEND_DATA(mp, DM_EVENT_READ, ip, iocb->ki_pos, size,
- dmflags, &iolock);
- if (ret) {
- xfs_iunlock(ip, XFS_IOLOCK_SHARED);
- if (unlikely(ioflags & IO_ISDIRECT))
- mutex_unlock(&inode->i_mutex);
- return ret;
- }
- }
-
if (unlikely(ioflags & IO_ISDIRECT)) {
if (inode->i_mapping->nrpages) {
ret = -xfs_flushinval_pages(ip,
@@ -321,7 +298,6 @@ xfs_file_splice_read(
unsigned int flags)
{
struct xfs_inode *ip = XFS_I(infilp->f_mapping->host);
- struct xfs_mount *mp = ip->i_mount;
int ioflags = 0;
ssize_t ret;
@@ -335,18 +311,6 @@ xfs_file_splice_read(
xfs_ilock(ip, XFS_IOLOCK_SHARED);
- if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) {
- int iolock = XFS_IOLOCK_SHARED;
- int error;
-
- error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip, *ppos, count,
- FILP_DELAY_FLAG(infilp), &iolock);
- if (error) {
- xfs_iunlock(ip, XFS_IOLOCK_SHARED);
- return -error;
- }
- }
-
trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
@@ -367,7 +331,6 @@ xfs_file_splice_write(
{
struct inode *inode = outfilp->f_mapping->host;
struct xfs_inode *ip = XFS_I(inode);
- struct xfs_mount *mp = ip->i_mount;
xfs_fsize_t isize, new_size;
int ioflags = 0;
ssize_t ret;
@@ -382,18 +345,6 @@ xfs_file_splice_write(
xfs_ilock(ip, XFS_IOLOCK_EXCL);
- if (DM_EVENT_ENABLED(ip, DM_EVENT_WRITE) && !(ioflags & IO_INVIS)) {
- int iolock = XFS_IOLOCK_EXCL;
- int error;
-
- error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip, *ppos, count,
- FILP_DELAY_FLAG(outfilp), &iolock);
- if (error) {
- xfs_iunlock(ip, XFS_IOLOCK_EXCL);
- return -error;
- }
- }
-
new_size = *ppos + count;
xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -463,7 +414,7 @@ xfs_zero_last_block(
last_fsb = XFS_B_TO_FSBT(mp, isize);
nimaps = 1;
error = xfs_bmapi(NULL, ip, last_fsb, 1, 0, NULL, 0, &imap,
- &nimaps, NULL, NULL);
+ &nimaps, NULL);
if (error) {
return error;
}
@@ -558,7 +509,7 @@ xfs_zero_eof(
nimaps = 1;
zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
error = xfs_bmapi(NULL, ip, start_zero_fsb, zero_count_fsb,
- 0, NULL, 0, &imap, &nimaps, NULL, NULL);
+ 0, NULL, 0, &imap, &nimaps, NULL);
if (error) {
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
return error;
@@ -627,7 +578,6 @@ xfs_file_aio_write(
int ioflags = 0;
xfs_fsize_t isize, new_size;
int iolock;
- int eventsent = 0;
size_t ocount = 0, count;
int need_i_mutex;
@@ -673,33 +623,6 @@ start:
goto out_unlock_mutex;
}
- if ((DM_EVENT_ENABLED(ip, DM_EVENT_WRITE) &&
- !(ioflags & IO_INVIS) && !eventsent)) {
- int dmflags = FILP_DELAY_FLAG(file);
-
- if (need_i_mutex)
- dmflags |= DM_FLAGS_IMUX;
-
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- error = XFS_SEND_DATA(ip->i_mount, DM_EVENT_WRITE, ip,
- pos, count, dmflags, &iolock);
- if (error) {
- goto out_unlock_internal;
- }
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- eventsent = 1;
-
- /*
- * The iolock was dropped and reacquired in XFS_SEND_DATA
- * so we have to recheck the size when appending.
- * We will only "goto start;" once, since having sent the
- * event prevents another call to XFS_SEND_DATA, which is
- * what allows the size to change in the first place.
- */
- if ((file->f_flags & O_APPEND) && pos != ip->i_size)
- goto start;
- }
-
if (ioflags & IO_ISDIRECT) {
xfs_buftarg_t *target =
XFS_IS_REALTIME_INODE(ip) ?
@@ -830,22 +753,6 @@ write_retry:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
}
- if (ret == -ENOSPC &&
- DM_EVENT_ENABLED(ip, DM_EVENT_NOSPACE) && !(ioflags & IO_INVIS)) {
- xfs_iunlock(ip, iolock);
- if (need_i_mutex)
- mutex_unlock(&inode->i_mutex);
- error = XFS_SEND_NAMESP(ip->i_mount, DM_EVENT_NOSPACE, ip,
- DM_RIGHT_NULL, ip, DM_RIGHT_NULL, NULL, NULL,
- 0, 0, 0); /* Delay flag intentionally unused */
- if (need_i_mutex)
- mutex_lock(&inode->i_mutex);
- xfs_ilock(ip, iolock);
- if (error)
- goto out_unlock_internal;
- goto start;
- }
-
error = -ret;
if (ret <= 0)
goto out_unlock_internal;
@@ -868,7 +775,7 @@ write_retry:
mutex_lock(&inode->i_mutex);
xfs_ilock(ip, iolock);
- error2 = -xfs_file_fsync(file, file->f_path.dentry,
+ error2 = -xfs_file_fsync(file,
(file->f_flags & __O_SYNC) ? 0 : 1);
if (!error)
error = error2;
@@ -1014,9 +921,6 @@ const struct file_operations xfs_file_operations = {
.open = xfs_file_open,
.release = xfs_file_release,
.fsync = xfs_file_fsync,
-#ifdef HAVE_FOP_OPEN_EXEC
- .open_exec = xfs_file_open_exec,
-#endif
};
const struct file_operations xfs_dir_file_operations = {
diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c
index b6918d76bc7b..1f279b012f94 100644
--- a/fs/xfs/linux-2.6/xfs_fs_subr.c
+++ b/fs/xfs/linux-2.6/xfs_fs_subr.c
@@ -21,10 +21,6 @@
#include "xfs_inode.h"
#include "xfs_trace.h"
-int fs_noerr(void) { return 0; }
-int fs_nosys(void) { return ENOSYS; }
-void fs_noval(void) { return; }
-
/*
* note: all filemap functions return negative error codes. These
* need to be inverted before returning to the xfs core functions.
diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.h b/fs/xfs/linux-2.6/xfs_fs_subr.h
deleted file mode 100644
index 82bb19b2599e..000000000000
--- a/fs/xfs/linux-2.6/xfs_fs_subr.h
+++ /dev/null
@@ -1,25 +0,0 @@
-/*
- * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#ifndef __XFS_FS_SUBR_H__
-#define __XFS_FS_SUBR_H__
-
-extern int fs_noerr(void);
-extern int fs_nosys(void);
-extern void fs_noval(void);
-
-#endif /* __XFS_FS_SUBR_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 699b60cbab9c..237f5ffb2ee8 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -23,24 +23,15 @@
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_dir2.h"
#include "xfs_alloc.h"
-#include "xfs_dmapi.h"
#include "xfs_mount.h"
#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dir2_sf.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_ioctl.h"
-#include "xfs_btree.h"
-#include "xfs_ialloc.h"
#include "xfs_rtalloc.h"
#include "xfs_itable.h"
#include "xfs_error.h"
-#include "xfs_rw.h"
#include "xfs_attr.h"
#include "xfs_bmap.h"
#include "xfs_buf_item.h"
@@ -679,10 +670,9 @@ xfs_ioc_bulkstat(
error = xfs_bulkstat_single(mp, &inlast,
bulkreq.ubuffer, &done);
else /* XFS_IOC_FSBULKSTAT */
- error = xfs_bulkstat(mp, &inlast, &count,
- (bulkstat_one_pf)xfs_bulkstat_one, NULL,
- sizeof(xfs_bstat_t), bulkreq.ubuffer,
- BULKSTAT_FG_QUICK, &done);
+ error = xfs_bulkstat(mp, &inlast, &count, xfs_bulkstat_one,
+ sizeof(xfs_bstat_t), bulkreq.ubuffer,
+ &done);
if (error)
return -error;
@@ -909,7 +899,7 @@ xfs_ioctl_setattr(
struct xfs_dquot *olddquot = NULL;
int code;
- xfs_itrace_entry(ip);
+ trace_xfs_ioctl_setattr(ip);
if (mp->m_flags & XFS_MOUNT_RDONLY)
return XFS_ERROR(EROFS);
@@ -1044,8 +1034,7 @@ xfs_ioctl_setattr(
}
}
- xfs_trans_ijoin(tp, ip, lock_flags);
- xfs_trans_ihold(tp, ip);
+ xfs_trans_ijoin(tp, ip);
/*
* Change file ownership. Must be the owner or privileged.
@@ -1117,16 +1106,7 @@ xfs_ioctl_setattr(
xfs_qm_dqrele(udqp);
xfs_qm_dqrele(gdqp);
- if (code)
- return code;
-
- if (DM_EVENT_ENABLED(ip, DM_EVENT_ATTRIBUTE)) {
- XFS_SEND_NAMESP(mp, DM_EVENT_ATTRIBUTE, ip, DM_RIGHT_NULL,
- NULL, DM_RIGHT_NULL, NULL, NULL, 0, 0,
- (mask & FSX_NONBLOCK) ? DM_FLAGS_NDELAY : 0);
- }
-
- return 0;
+ return code;
error_return:
xfs_qm_dqrele(udqp);
@@ -1302,7 +1282,7 @@ xfs_file_ioctl(
if (filp->f_mode & FMODE_NOCMTIME)
ioflags |= IO_INVIS;
- xfs_itrace_entry(ip);
+ trace_xfs_file_ioctl(ip);
switch (cmd) {
case XFS_IOC_ALLOCSP:
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index 9287135e9bfc..6c83f7f62dc9 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -28,12 +28,8 @@
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
#include "xfs_mount.h"
#include "xfs_bmap_btree.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dir2_sf.h"
#include "xfs_vnode.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
@@ -237,15 +233,12 @@ xfs_bulkstat_one_compat(
xfs_ino_t ino, /* inode number to get data for */
void __user *buffer, /* buffer to place output in */
int ubsize, /* size of buffer */
- void *private_data, /* my private data */
- xfs_daddr_t bno, /* starting bno of inode cluster */
int *ubused, /* bytes used by me */
- void *dibuff, /* on-disk inode buffer */
int *stat) /* BULKSTAT_RV_... */
{
return xfs_bulkstat_one_int(mp, ino, buffer, ubsize,
- xfs_bulkstat_one_fmt_compat, bno,
- ubused, dibuff, stat);
+ xfs_bulkstat_one_fmt_compat,
+ ubused, stat);
}
/* copied from xfs_ioctl.c */
@@ -298,13 +291,11 @@ xfs_compat_ioc_bulkstat(
int res;
error = xfs_bulkstat_one_compat(mp, inlast, bulkreq.ubuffer,
- sizeof(compat_xfs_bstat_t),
- NULL, 0, NULL, NULL, &res);
+ sizeof(compat_xfs_bstat_t), 0, &res);
} else if (cmd == XFS_IOC_FSBULKSTAT_32) {
error = xfs_bulkstat(mp, &inlast, &count,
- xfs_bulkstat_one_compat, NULL,
- sizeof(compat_xfs_bstat_t), bulkreq.ubuffer,
- BULKSTAT_FG_QUICK, &done);
+ xfs_bulkstat_one_compat, sizeof(compat_xfs_bstat_t),
+ bulkreq.ubuffer, &done);
} else
error = XFS_ERROR(EINVAL);
if (error)
@@ -549,7 +540,7 @@ xfs_file_compat_ioctl(
if (filp->f_mode & FMODE_NOCMTIME)
ioflags |= IO_INVIS;
- xfs_itrace_entry(ip);
+ trace_xfs_file_compat_ioctl(ip);
switch (cmd) {
/* No size or alignment issues on any arch */
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 9c8019c78c92..536b81e63a3d 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -24,21 +24,13 @@
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_dir2.h"
#include "xfs_alloc.h"
-#include "xfs_dmapi.h"
#include "xfs_quota.h"
#include "xfs_mount.h"
#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_bmap.h"
-#include "xfs_btree.h"
-#include "xfs_ialloc.h"
#include "xfs_rtalloc.h"
#include "xfs_error.h"
#include "xfs_itable.h"
@@ -496,7 +488,7 @@ xfs_vn_getattr(
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
- xfs_itrace_entry(ip);
+ trace_xfs_getattr(ip);
if (XFS_FORCED_SHUTDOWN(mp))
return XFS_ERROR(EIO);
@@ -585,11 +577,20 @@ xfs_vn_fallocate(
bf.l_len = len;
xfs_ilock(ip, XFS_IOLOCK_EXCL);
+
+ /* check the new inode size is valid before allocating */
+ if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+ offset + len > i_size_read(inode)) {
+ new_size = offset + len;
+ error = inode_newsize_ok(inode, new_size);
+ if (error)
+ goto out_unlock;
+ }
+
error = -xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf,
0, XFS_ATTR_NOLOCK);
- if (!error && !(mode & FALLOC_FL_KEEP_SIZE) &&
- offset + len > i_size_read(inode))
- new_size = offset + len;
+ if (error)
+ goto out_unlock;
/* Change file size if needed */
if (new_size) {
@@ -600,6 +601,7 @@ xfs_vn_fallocate(
error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK);
}
+out_unlock:
xfs_iunlock(ip, XFS_IOLOCK_EXCL);
out_error:
return error;
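The fallocate hunk above is an ordering fix: the would-be file size is validated with inode_newsize_ok() before any blocks are allocated, so an over-limit request fails cleanly instead of leaving allocated space behind. A minimal sketch of that ordering against a generic inode (the allocation and size-update bodies are elided):

#include <linux/fs.h>

static int prealloc_check_size_first(struct inode *inode, loff_t offset,
				     loff_t len, int keep_size)
{
	loff_t new_size = 0;
	int error;

	/* Fail on rlimit/s_maxbytes violations before allocating. */
	if (!keep_size && offset + len > i_size_read(inode)) {
		new_size = offset + len;
		error = inode_newsize_ok(inode, new_size);
		if (error)
			return error;
	}

	/* ... allocate blocks for [offset, offset + len) here ... */

	if (new_size) {
		/* ... extend i_size to new_size here ... */
	}
	return 0;
}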
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index facfb323a706..998a9d7fb9c8 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -87,7 +87,6 @@
#include <xfs_aops.h>
#include <xfs_super.h>
#include <xfs_globals.h>
-#include <xfs_fs_subr.h>
#include <xfs_buf.h>
/*
diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c
index 9ac8aea91529..bfd5ac9d1f6f 100644
--- a/fs/xfs/linux-2.6/xfs_quotaops.c
+++ b/fs/xfs/linux-2.6/xfs_quotaops.c
@@ -16,14 +16,12 @@
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "xfs.h"
-#include "xfs_dmapi.h"
#include "xfs_sb.h"
#include "xfs_inum.h"
#include "xfs_log.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_quota.h"
-#include "xfs_log.h"
#include "xfs_trans.h"
#include "xfs_bmap_btree.h"
#include "xfs_inode.h"
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index f2d1718c9165..758df94690ed 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -25,14 +25,11 @@
#include "xfs_ag.h"
#include "xfs_dir2.h"
#include "xfs_alloc.h"
-#include "xfs_dmapi.h"
#include "xfs_quota.h"
#include "xfs_mount.h"
#include "xfs_bmap_btree.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
@@ -43,7 +40,6 @@
#include "xfs_error.h"
#include "xfs_itable.h"
#include "xfs_fsops.h"
-#include "xfs_rw.h"
#include "xfs_attr.h"
#include "xfs_buf_item.h"
#include "xfs_utils.h"
@@ -94,7 +90,6 @@ mempool_t *xfs_ioend_pool;
#define MNTOPT_BARRIER "barrier" /* use writer barriers for log write and
* unwritten extent conversion */
#define MNTOPT_NOBARRIER "nobarrier" /* .. disable */
-#define MNTOPT_OSYNCISOSYNC "osyncisosync" /* o_sync is REALLY o_sync */
#define MNTOPT_64BITINODE "inode64" /* inodes can be allocated anywhere */
#define MNTOPT_IKEEP "ikeep" /* do not free empty inode clusters */
#define MNTOPT_NOIKEEP "noikeep" /* free empty inode clusters */
@@ -116,9 +111,6 @@ mempool_t *xfs_ioend_pool;
#define MNTOPT_GQUOTANOENF "gqnoenforce"/* group quota limit enforcement */
#define MNTOPT_PQUOTANOENF "pqnoenforce"/* project quota limit enforcement */
#define MNTOPT_QUOTANOENF "qnoenforce" /* same as uqnoenforce */
-#define MNTOPT_DMAPI "dmapi" /* DMI enabled (DMAPI / XDSM) */
-#define MNTOPT_XDSM "xdsm" /* DMI enabled (DMAPI / XDSM) */
-#define MNTOPT_DMI "dmi" /* DMI enabled (DMAPI / XDSM) */
#define MNTOPT_DELAYLOG "delaylog" /* Delayed logging enabled */
#define MNTOPT_NODELAYLOG "nodelaylog" /* Delayed logging disabled */
@@ -172,15 +164,13 @@ suffix_strtoul(char *s, char **endp, unsigned int base)
STATIC int
xfs_parseargs(
struct xfs_mount *mp,
- char *options,
- char **mtpt)
+ char *options)
{
struct super_block *sb = mp->m_super;
char *this_char, *value, *eov;
int dsunit = 0;
int dswidth = 0;
int iosize = 0;
- int dmapi_implies_ikeep = 1;
__uint8_t iosizelog = 0;
/*
@@ -243,15 +233,10 @@ xfs_parseargs(
if (!mp->m_logname)
return ENOMEM;
} else if (!strcmp(this_char, MNTOPT_MTPT)) {
- if (!value || !*value) {
- cmn_err(CE_WARN,
- "XFS: %s option requires an argument",
- this_char);
- return EINVAL;
- }
- *mtpt = kstrndup(value, MAXNAMELEN, GFP_KERNEL);
- if (!*mtpt)
- return ENOMEM;
+ cmn_err(CE_WARN,
+ "XFS: %s option not allowed on this system",
+ this_char);
+ return EINVAL;
} else if (!strcmp(this_char, MNTOPT_RTDEV)) {
if (!value || !*value) {
cmn_err(CE_WARN,
@@ -288,8 +273,6 @@ xfs_parseargs(
mp->m_flags &= ~XFS_MOUNT_GRPID;
} else if (!strcmp(this_char, MNTOPT_WSYNC)) {
mp->m_flags |= XFS_MOUNT_WSYNC;
- } else if (!strcmp(this_char, MNTOPT_OSYNCISOSYNC)) {
- mp->m_flags |= XFS_MOUNT_OSYNCISOSYNC;
} else if (!strcmp(this_char, MNTOPT_NORECOVERY)) {
mp->m_flags |= XFS_MOUNT_NORECOVERY;
} else if (!strcmp(this_char, MNTOPT_NOALIGN)) {
@@ -329,7 +312,6 @@ xfs_parseargs(
} else if (!strcmp(this_char, MNTOPT_IKEEP)) {
mp->m_flags |= XFS_MOUNT_IKEEP;
} else if (!strcmp(this_char, MNTOPT_NOIKEEP)) {
- dmapi_implies_ikeep = 0;
mp->m_flags &= ~XFS_MOUNT_IKEEP;
} else if (!strcmp(this_char, MNTOPT_LARGEIO)) {
mp->m_flags &= ~XFS_MOUNT_COMPAT_IOSIZE;
@@ -370,12 +352,6 @@ xfs_parseargs(
} else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) {
mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE);
mp->m_qflags &= ~XFS_OQUOTA_ENFD;
- } else if (!strcmp(this_char, MNTOPT_DMAPI)) {
- mp->m_flags |= XFS_MOUNT_DMAPI;
- } else if (!strcmp(this_char, MNTOPT_XDSM)) {
- mp->m_flags |= XFS_MOUNT_DMAPI;
- } else if (!strcmp(this_char, MNTOPT_DMI)) {
- mp->m_flags |= XFS_MOUNT_DMAPI;
} else if (!strcmp(this_char, MNTOPT_DELAYLOG)) {
mp->m_flags |= XFS_MOUNT_DELAYLOG;
cmn_err(CE_WARN,
@@ -387,9 +363,11 @@ xfs_parseargs(
cmn_err(CE_WARN,
"XFS: ihashsize no longer used, option is deprecated.");
} else if (!strcmp(this_char, "osyncisdsync")) {
- /* no-op, this is now the default */
cmn_err(CE_WARN,
- "XFS: osyncisdsync is now the default, option is deprecated.");
+ "XFS: osyncisdsync has no effect, option is deprecated.");
+ } else if (!strcmp(this_char, "osyncisosync")) {
+ cmn_err(CE_WARN,
+ "XFS: osyncisosync has no effect, option is deprecated.");
} else if (!strcmp(this_char, "irixsgid")) {
cmn_err(CE_WARN,
"XFS: irixsgid is now a sysctl(2) variable, option is deprecated.");
@@ -430,12 +408,6 @@ xfs_parseargs(
return EINVAL;
}
- if ((mp->m_flags & XFS_MOUNT_DMAPI) && (!*mtpt || *mtpt[0] == '\0')) {
- printk("XFS: %s option needs the mount point option as well\n",
- MNTOPT_DMAPI);
- return EINVAL;
- }
-
if ((dsunit && !dswidth) || (!dsunit && dswidth)) {
cmn_err(CE_WARN,
"XFS: sunit and swidth must be specified together");
@@ -449,18 +421,6 @@ xfs_parseargs(
return EINVAL;
}
- /*
- * Applications using DMI filesystems often expect the
- * inode generation number to be monotonically increasing.
- * If we delete inode chunks we break this assumption, so
- * keep unused inode chunks on disk for DMI filesystems
- * until we come up with a better solution.
- * Note that if "ikeep" or "noikeep" mount options are
- * supplied, then they are honored.
- */
- if ((mp->m_flags & XFS_MOUNT_DMAPI) && dmapi_implies_ikeep)
- mp->m_flags |= XFS_MOUNT_IKEEP;
-
done:
if (!(mp->m_flags & XFS_MOUNT_NOALIGN)) {
/*
@@ -539,10 +499,8 @@ xfs_showargs(
{ XFS_MOUNT_SWALLOC, "," MNTOPT_SWALLOC },
{ XFS_MOUNT_NOUUID, "," MNTOPT_NOUUID },
{ XFS_MOUNT_NORECOVERY, "," MNTOPT_NORECOVERY },
- { XFS_MOUNT_OSYNCISOSYNC, "," MNTOPT_OSYNCISOSYNC },
{ XFS_MOUNT_ATTR2, "," MNTOPT_ATTR2 },
{ XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM },
- { XFS_MOUNT_DMAPI, "," MNTOPT_DMAPI },
{ XFS_MOUNT_GRPID, "," MNTOPT_GRPID },
{ XFS_MOUNT_DELAYLOG, "," MNTOPT_DELAYLOG },
{ 0, NULL }
@@ -947,7 +905,7 @@ xfs_fs_destroy_inode(
{
struct xfs_inode *ip = XFS_I(inode);
- xfs_itrace_entry(ip);
+ trace_xfs_destroy_inode(ip);
XFS_STATS_INC(vn_reclaim);
@@ -1063,10 +1021,8 @@ xfs_log_inode(
* an inode in another recent transaction. So we play it safe and
* fire off the transaction anyway.
*/
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
- xfs_trans_ihold(tp, ip);
+ xfs_trans_ijoin(tp, ip);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- xfs_trans_set_sync(tp);
error = xfs_trans_commit(tp, 0);
xfs_ilock_demote(ip, XFS_ILOCK_EXCL);
@@ -1082,27 +1038,18 @@ xfs_fs_write_inode(
struct xfs_mount *mp = ip->i_mount;
int error = EAGAIN;
- xfs_itrace_entry(ip);
+ trace_xfs_write_inode(ip);
if (XFS_FORCED_SHUTDOWN(mp))
return XFS_ERROR(EIO);
if (wbc->sync_mode == WB_SYNC_ALL) {
/*
- * Make sure the inode has hit stable storage. By using the
- * log and the fsync transactions we reduce the IOs we have
- * to do here from two (log and inode) to just the log.
- *
- * Note: We still need to do a delwri write of the inode after
- * this to flush it to the backing buffer so that bulkstat
- * works properly if this is the first time the inode has been
- * written. Because we hold the ilock atomically over the
- * transaction commit and the inode flush we are guaranteed
- * that the inode is not pinned when it returns. If the flush
- * lock is already held, then the inode has already been
- * flushed once and we don't need to flush it again. Hence
- * the code will only flush the inode if it isn't already
- * being flushed.
+ * Make sure the inode has made it into the log. Instead
+ * of forcing it all the way to stable storage using a
+ * synchronous transaction we let the log force inside the
+ * ->sync_fs call do that for us, which reduces the number
+ * of synchronous log forces dramatically.
*/
xfs_ioend_wait(ip);
xfs_ilock(ip, XFS_ILOCK_SHARED);
@@ -1116,27 +1063,29 @@ xfs_fs_write_inode(
* We make this non-blocking if the inode is contended, return
* EAGAIN to indicate to the caller that they did not succeed.
* This prevents the flush path from blocking on inodes inside
- * another operation right now, they get caught later by xfs_sync.
+ * another operation right now, they get caught later by
+ * xfs_sync.
*/
if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))
goto out;
- }
- if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip))
- goto out_unlock;
+ if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip))
+ goto out_unlock;
- /*
- * Now we have the flush lock and the inode is not pinned, we can check
- * if the inode is really clean as we know that there are no pending
- * transaction completions, it is not waiting on the delayed write
- * queue and there is no IO in progress.
- */
- if (xfs_inode_clean(ip)) {
- xfs_ifunlock(ip);
- error = 0;
- goto out_unlock;
+ /*
+ * Now we have the flush lock and the inode is not pinned, we
+ * can check if the inode is really clean as we know that
+ * there are no pending transaction completions, it is not
+ * waiting on the delayed write queue and there is no IO in
+ * progress.
+ */
+ if (xfs_inode_clean(ip)) {
+ xfs_ifunlock(ip);
+ error = 0;
+ goto out_unlock;
+ }
+ error = xfs_iflush(ip, 0);
}
- error = xfs_iflush(ip, 0);
out_unlock:
xfs_iunlock(ip, XFS_ILOCK_SHARED);
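The restructured write-back path above keeps XFS's non-blocking convention for the flusher: if the inode lock cannot be taken immediately, the flush returns EAGAIN and the inode is picked up again later by xfs_sync. A minimal sketch of that trylock-or-retry convention, with an illustrative mutex standing in for the XFS ilock:

#include <linux/errno.h>
#include <linux/mutex.h>

static int flush_one_nonblocking(struct mutex *lock)
{
	if (!mutex_trylock(lock))
		return -EAGAIN;	/* contended: let the caller retry later */

	/* ... write the object back here ... */

	mutex_unlock(lock);
	return 0;
}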
@@ -1156,7 +1105,8 @@ xfs_fs_clear_inode(
{
xfs_inode_t *ip = XFS_I(inode);
- xfs_itrace_entry(ip);
+ trace_xfs_clear_inode(ip);
+
XFS_STATS_INC(vn_rele);
XFS_STATS_INC(vn_remove);
XFS_STATS_DEC(vn_active);
@@ -1193,22 +1143,13 @@ xfs_fs_put_super(
{
struct xfs_mount *mp = XFS_M(sb);
+ /*
+ * Unregister the memory shrinker before we tear down the mount
+ * structure so we don't have memory reclaim racing with us here.
+ */
+ xfs_inode_shrinker_unregister(mp);
xfs_syncd_stop(mp);
- if (!(sb->s_flags & MS_RDONLY)) {
- /*
- * XXX(hch): this should be SYNC_WAIT.
- *
- * Or more likely not needed at all because the VFS is already
- * calling ->sync_fs after shutting down all filestem
- * operations and just before calling ->put_super.
- */
- xfs_sync_data(mp, 0);
- xfs_sync_attr(mp, 0);
- }
-
- XFS_SEND_PREUNMOUNT(mp);
-
/*
* Blow away any referenced inode in the filestreams cache.
* This can and will cause log traffic as inodes go inactive
@@ -1218,14 +1159,10 @@ xfs_fs_put_super(
XFS_bflush(mp->m_ddev_targp);
- XFS_SEND_UNMOUNT(mp);
-
xfs_unmountfs(mp);
xfs_freesb(mp);
- xfs_inode_shrinker_unregister(mp);
xfs_icsb_destroy_counters(mp);
xfs_close_devices(mp);
- xfs_dmops_put(mp);
xfs_free_fsname(mp);
kfree(mp);
}
@@ -1543,7 +1480,6 @@ xfs_fs_fill_super(
struct inode *root;
struct xfs_mount *mp = NULL;
int flags = 0, error = ENOMEM;
- char *mtpt = NULL;
mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL);
if (!mp)
@@ -1559,7 +1495,7 @@ xfs_fs_fill_super(
mp->m_super = sb;
sb->s_fs_info = mp;
- error = xfs_parseargs(mp, (char *)data, &mtpt);
+ error = xfs_parseargs(mp, (char *)data);
if (error)
goto out_free_fsname;
@@ -1571,16 +1507,12 @@ xfs_fs_fill_super(
#endif
sb->s_op = &xfs_super_operations;
- error = xfs_dmops_get(mp);
- if (error)
- goto out_free_fsname;
-
if (silent)
flags |= XFS_MFSI_QUIET;
error = xfs_open_devices(mp);
if (error)
- goto out_put_dmops;
+ goto out_free_fsname;
if (xfs_icsb_init_counters(mp))
mp->m_flags |= XFS_MOUNT_NO_PERCPU_SB;
@@ -1608,8 +1540,6 @@ xfs_fs_fill_super(
if (error)
goto out_filestream_unmount;
- XFS_SEND_MOUNT(mp, DM_RIGHT_NULL, mtpt, mp->m_fsname);
-
sb->s_magic = XFS_SB_MAGIC;
sb->s_blocksize = mp->m_sb.sb_blocksize;
sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1;
@@ -1638,7 +1568,6 @@ xfs_fs_fill_super(
xfs_inode_shrinker_register(mp);
- kfree(mtpt);
return 0;
out_filestream_unmount:
@@ -1648,11 +1577,8 @@ xfs_fs_fill_super(
out_destroy_counters:
xfs_icsb_destroy_counters(mp);
xfs_close_devices(mp);
- out_put_dmops:
- xfs_dmops_put(mp);
out_free_fsname:
xfs_free_fsname(mp);
- kfree(mtpt);
kfree(mp);
out:
return -error;
@@ -1759,6 +1685,12 @@ xfs_init_zones(void)
if (!xfs_trans_zone)
goto out_destroy_ifork_zone;
+ xfs_log_item_desc_zone =
+ kmem_zone_init(sizeof(struct xfs_log_item_desc),
+ "xfs_log_item_desc");
+ if (!xfs_log_item_desc_zone)
+ goto out_destroy_trans_zone;
+
/*
* The size of the zone allocated buf log item is the maximum
* size possible under XFS. This wastes a little bit of memory,
@@ -1768,7 +1700,7 @@ xfs_init_zones(void)
(((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) /
NBWORD) * sizeof(int))), "xfs_buf_item");
if (!xfs_buf_item_zone)
- goto out_destroy_trans_zone;
+ goto out_destroy_log_item_desc_zone;
xfs_efd_zone = kmem_zone_init((sizeof(xfs_efd_log_item_t) +
((XFS_EFD_MAX_FAST_EXTENTS - 1) *
@@ -1805,6 +1737,8 @@ xfs_init_zones(void)
kmem_zone_destroy(xfs_efd_zone);
out_destroy_buf_item_zone:
kmem_zone_destroy(xfs_buf_item_zone);
+ out_destroy_log_item_desc_zone:
+ kmem_zone_destroy(xfs_log_item_desc_zone);
out_destroy_trans_zone:
kmem_zone_destroy(xfs_trans_zone);
out_destroy_ifork_zone:
@@ -1835,6 +1769,7 @@ xfs_destroy_zones(void)
kmem_zone_destroy(xfs_efi_zone);
kmem_zone_destroy(xfs_efd_zone);
kmem_zone_destroy(xfs_buf_item_zone);
+ kmem_zone_destroy(xfs_log_item_desc_zone);
kmem_zone_destroy(xfs_trans_zone);
kmem_zone_destroy(xfs_ifork_zone);
kmem_zone_destroy(xfs_dabuf_zone);
@@ -1883,7 +1818,6 @@ init_xfs_fs(void)
goto out_cleanup_procfs;
vfs_initquota();
- xfs_inode_shrinker_init();
error = register_filesystem(&xfs_fs_type);
if (error)
@@ -1911,7 +1845,6 @@ exit_xfs_fs(void)
{
vfs_exitquota();
unregister_filesystem(&xfs_fs_type);
- xfs_inode_shrinker_destroy();
xfs_sysctl_unregister();
xfs_cleanup_procfs();
xfs_buf_terminate();
diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h
index 519618e9279e..1ef4a4d2d997 100644
--- a/fs/xfs/linux-2.6/xfs_super.h
+++ b/fs/xfs/linux-2.6/xfs_super.h
@@ -56,12 +56,6 @@ extern void xfs_qm_exit(void);
# define XFS_BIGFS_STRING
#endif
-#ifdef CONFIG_XFS_DMAPI
-# define XFS_DMAPI_STRING "dmapi support, "
-#else
-# define XFS_DMAPI_STRING
-#endif
-
#ifdef DEBUG
# define XFS_DBG_STRING "debug"
#else
@@ -72,7 +66,6 @@ extern void xfs_qm_exit(void);
XFS_SECURITY_STRING \
XFS_REALTIME_STRING \
XFS_BIGFS_STRING \
- XFS_DMAPI_STRING \
XFS_DBG_STRING /* DBG must be last */
struct xfs_inode;
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 3884e20bc14e..dfcbd98d1599 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -24,25 +24,14 @@
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
#include "xfs_mount.h"
#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
#include "xfs_inode.h"
#include "xfs_dinode.h"
#include "xfs_error.h"
-#include "xfs_mru_cache.h"
#include "xfs_filestream.h"
#include "xfs_vnodeops.h"
-#include "xfs_utils.h"
-#include "xfs_buf_item.h"
#include "xfs_inode_item.h"
-#include "xfs_rw.h"
#include "xfs_quota.h"
#include "xfs_trace.h"
@@ -144,6 +133,41 @@ restart:
return last_error;
}
+/*
+ * Select the next per-ag structure to iterate during the walk. The reclaim
+ * walk is optimised to only walk AGs with reclaimable inodes in them.
+ */
+static struct xfs_perag *
+xfs_inode_ag_iter_next_pag(
+ struct xfs_mount *mp,
+ xfs_agnumber_t *first,
+ int tag)
+{
+ struct xfs_perag *pag = NULL;
+
+ if (tag == XFS_ICI_RECLAIM_TAG) {
+ int found;
+ int ref;
+
+ spin_lock(&mp->m_perag_lock);
+ found = radix_tree_gang_lookup_tag(&mp->m_perag_tree,
+ (void **)&pag, *first, 1, tag);
+ if (found <= 0) {
+ spin_unlock(&mp->m_perag_lock);
+ return NULL;
+ }
+ *first = pag->pag_agno + 1;
+ /* open coded pag reference increment */
+ ref = atomic_inc_return(&pag->pag_ref);
+ spin_unlock(&mp->m_perag_lock);
+ trace_xfs_perag_get_reclaim(mp, pag->pag_agno, ref, _RET_IP_);
+ } else {
+ pag = xfs_perag_get(mp, *first);
+ (*first)++;
+ }
+ return pag;
+}
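xfs_inode_ag_iter_next_pag() above leans on radix_tree_gang_lookup_tag(), which returns only slots carrying a given tag, so untagged AGs are never visited. A minimal sketch of the same lookup step, with an illustrative item type in place of struct xfs_perag:

#include <linux/radix-tree.h>
#include <linux/spinlock.h>

struct item {
	unsigned long	index;		/* slot this item occupies */
};

static struct item *
next_tagged(struct radix_tree_root *root, spinlock_t *lock,
	    unsigned long *first, unsigned int tag)
{
	struct item	*item = NULL;
	int		found;

	spin_lock(lock);
	found = radix_tree_gang_lookup_tag(root, (void **)&item,
					   *first, 1, tag);
	if (found <= 0) {
		spin_unlock(lock);
		return NULL;		/* no tagged slots left */
	}
	*first = item->index + 1;	/* resume past this slot next call */
	spin_unlock(lock);
	return item;
}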
+
int
xfs_inode_ag_iterator(
struct xfs_mount *mp,
@@ -154,20 +178,15 @@ xfs_inode_ag_iterator(
int exclusive,
int *nr_to_scan)
{
+ struct xfs_perag *pag;
int error = 0;
int last_error = 0;
xfs_agnumber_t ag;
int nr;
nr = nr_to_scan ? *nr_to_scan : INT_MAX;
- for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
- struct xfs_perag *pag;
-
- pag = xfs_perag_get(mp, ag);
- if (!pag->pag_ici_init) {
- xfs_perag_put(pag);
- continue;
- }
+ ag = 0;
+ while ((pag = xfs_inode_ag_iter_next_pag(mp, &ag, tag))) {
error = xfs_inode_ag_walk(mp, pag, execute, flags, tag,
exclusive, &nr);
xfs_perag_put(pag);
@@ -289,7 +308,7 @@ xfs_sync_inode_attr(
/*
* Write out pagecache data for the whole filesystem.
*/
-int
+STATIC int
xfs_sync_data(
struct xfs_mount *mp,
int flags)
@@ -310,7 +329,7 @@ xfs_sync_data(
/*
* Write out inode metadata (attributes) for the whole filesystem.
*/
-int
+STATIC int
xfs_sync_attr(
struct xfs_mount *mp,
int flags)
@@ -343,8 +362,7 @@ xfs_commit_dummy_trans(
xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
- xfs_trans_ihold(tp, ip);
+ xfs_trans_ijoin(tp, ip);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
error = xfs_trans_commit(tp, 0);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
@@ -644,6 +662,17 @@ __xfs_inode_set_reclaim_tag(
radix_tree_tag_set(&pag->pag_ici_root,
XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
XFS_ICI_RECLAIM_TAG);
+
+ if (!pag->pag_ici_reclaimable) {
+ /* propagate the reclaim tag up into the perag radix tree */
+ spin_lock(&ip->i_mount->m_perag_lock);
+ radix_tree_tag_set(&ip->i_mount->m_perag_tree,
+ XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
+ XFS_ICI_RECLAIM_TAG);
+ spin_unlock(&ip->i_mount->m_perag_lock);
+ trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno,
+ -1, _RET_IP_);
+ }
pag->pag_ici_reclaimable++;
}
@@ -678,6 +707,16 @@ __xfs_inode_clear_reclaim_tag(
radix_tree_tag_clear(&pag->pag_ici_root,
XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
pag->pag_ici_reclaimable--;
+ if (!pag->pag_ici_reclaimable) {
+ /* clear the reclaim tag from the perag radix tree */
+ spin_lock(&ip->i_mount->m_perag_lock);
+ radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
+ XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
+ XFS_ICI_RECLAIM_TAG);
+ spin_unlock(&ip->i_mount->m_perag_lock);
+ trace_xfs_perag_clear_reclaim(ip->i_mount, pag->pag_agno,
+ -1, _RET_IP_);
+ }
}
/*
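The set and clear hunks above maintain one invariant: the per-mount tree's reclaim tag is set exactly while the AG's reclaimable count is non-zero, so the tag only moves on 0 <-> 1 transitions and the common case touches no global lock. A minimal sketch of that counted-tag pattern (tree, lock, and counter are illustrative):

#include <linux/radix-tree.h>
#include <linux/spinlock.h>

static void mark_reclaimable(struct radix_tree_root *tree, spinlock_t *lock,
			     unsigned long index, unsigned int tag,
			     unsigned int *count)
{
	if (!(*count)++) {		/* first reclaimable object here */
		spin_lock(lock);
		radix_tree_tag_set(tree, index, tag);
		spin_unlock(lock);
	}
}

static void clear_reclaimable(struct radix_tree_root *tree, spinlock_t *lock,
			      unsigned long index, unsigned int tag,
			      unsigned int *count)
{
	if (!--(*count)) {		/* last reclaimable object went away */
		spin_lock(lock);
		radix_tree_tag_clear(tree, index, tag);
		spin_unlock(lock);
	}
}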
@@ -816,7 +855,36 @@ out:
reclaim:
xfs_ifunlock(ip);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
- xfs_ireclaim(ip);
+
+ XFS_STATS_INC(xs_ig_reclaims);
+ /*
+ * Remove the inode from the per-AG radix tree.
+ *
+ * Because radix_tree_delete won't complain even if the item was never
+ * added to the tree, assert that it has been there before, to catch
+ * problems with the inode lifetime early on.
+ */
+ write_lock(&pag->pag_ici_lock);
+ if (!radix_tree_delete(&pag->pag_ici_root,
+ XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino)))
+ ASSERT(0);
+ write_unlock(&pag->pag_ici_lock);
+
+ /*
+ * Here we do an (almost) spurious inode lock in order to coordinate
+ * with inode cache radix tree lookups. This is because the lookup
+ * can reference the inodes in the cache without taking references.
+ *
+ * We make that OK here by ensuring that we wait until the inode is
+ * unlocked after the lookup before we go ahead and free it. We get
+ * both the ilock and the iolock because the code may need to drop
+ * the ilock but will still hold the iolock.
+ */
+ xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+ xfs_qm_dqdetach(ip);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+
+ xfs_inode_free(ip);
return error;
}
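The reclaim tail above ends with a lock/unlock cycle whose only purpose is synchronization: lockless radix tree lookups may already hold a reference-free pointer to the inode, so the freeing path waits for them to drop the lock before the memory goes away. A minimal sketch of that drain-before-free handshake, assuming the object was unhashed first:

#include <linux/mutex.h>
#include <linux/slab.h>

struct obj {
	struct mutex	lock;
	/* ... payload ... */
};

static void obj_free(struct obj *obj)
{
	/*
	 * Nothing new can find the object (it is already unhashed);
	 * cycling its lock waits out any lookup that found it earlier
	 * and is still using it under the lock.
	 */
	mutex_lock(&obj->lock);
	mutex_unlock(&obj->lock);
	kfree(obj);
}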
@@ -832,88 +900,52 @@ xfs_reclaim_inodes(
/*
* Shrinker infrastructure.
- *
- * This is all far more complex than it needs to be. It adds a global list of
- * mounts because the shrinkers can only call a global context. We need to make
- * the shrinkers pass a context to avoid the need for global state.
*/
-static LIST_HEAD(xfs_mount_list);
-static struct rw_semaphore xfs_mount_list_lock;
-
static int
xfs_reclaim_inode_shrink(
+ struct shrinker *shrink,
int nr_to_scan,
gfp_t gfp_mask)
{
struct xfs_mount *mp;
struct xfs_perag *pag;
xfs_agnumber_t ag;
- int reclaimable = 0;
+ int reclaimable;
+ mp = container_of(shrink, struct xfs_mount, m_inode_shrink);
if (nr_to_scan) {
if (!(gfp_mask & __GFP_FS))
return -1;
- down_read(&xfs_mount_list_lock);
- list_for_each_entry(mp, &xfs_mount_list, m_mplist) {
- xfs_inode_ag_iterator(mp, xfs_reclaim_inode, 0,
+ xfs_inode_ag_iterator(mp, xfs_reclaim_inode, 0,
XFS_ICI_RECLAIM_TAG, 1, &nr_to_scan);
- if (nr_to_scan <= 0)
- break;
- }
- up_read(&xfs_mount_list_lock);
- }
-
- down_read(&xfs_mount_list_lock);
- list_for_each_entry(mp, &xfs_mount_list, m_mplist) {
- for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
+ /* if we don't exhaust the scan, don't bother coming back */
+ if (nr_to_scan > 0)
+ return -1;
+ }
- pag = xfs_perag_get(mp, ag);
- if (!pag->pag_ici_init) {
- xfs_perag_put(pag);
- continue;
- }
- reclaimable += pag->pag_ici_reclaimable;
- xfs_perag_put(pag);
- }
+ reclaimable = 0;
+ ag = 0;
+ while ((pag = xfs_inode_ag_iter_next_pag(mp, &ag,
+ XFS_ICI_RECLAIM_TAG))) {
+ reclaimable += pag->pag_ici_reclaimable;
+ xfs_perag_put(pag);
}
- up_read(&xfs_mount_list_lock);
return reclaimable;
}
-static struct shrinker xfs_inode_shrinker = {
- .shrink = xfs_reclaim_inode_shrink,
- .seeks = DEFAULT_SEEKS,
-};
-
-void __init
-xfs_inode_shrinker_init(void)
-{
- init_rwsem(&xfs_mount_list_lock);
- register_shrinker(&xfs_inode_shrinker);
-}
-
-void
-xfs_inode_shrinker_destroy(void)
-{
- ASSERT(list_empty(&xfs_mount_list));
- unregister_shrinker(&xfs_inode_shrinker);
-}
-
void
xfs_inode_shrinker_register(
struct xfs_mount *mp)
{
- down_write(&xfs_mount_list_lock);
- list_add_tail(&mp->m_mplist, &xfs_mount_list);
- up_write(&xfs_mount_list_lock);
+ mp->m_inode_shrink.shrink = xfs_reclaim_inode_shrink;
+ mp->m_inode_shrink.seeks = DEFAULT_SEEKS;
+ register_shrinker(&mp->m_inode_shrink);
}
void
xfs_inode_shrinker_unregister(
struct xfs_mount *mp)
{
- down_write(&xfs_mount_list_lock);
- list_del(&mp->m_mplist);
- up_write(&xfs_mount_list_lock);
+ unregister_shrinker(&mp->m_inode_shrink);
}
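The shrinker rework above drops the global mount list: each mount embeds its own struct shrinker and the callback recovers the owner with container_of(). A minimal sketch against the shrinker API of this kernel generation (struct myfs and its fields are illustrative):

#include <linux/kernel.h>
#include <linux/mm.h>

struct myfs {
	struct shrinker	shrink;
	int		nr_cached;	/* reclaimable objects */
};

static int myfs_shrink(struct shrinker *shrink, int nr_to_scan,
		       gfp_t gfp_mask)
{
	struct myfs *fs = container_of(shrink, struct myfs, shrink);

	if (nr_to_scan) {
		if (!(gfp_mask & __GFP_FS))
			return -1;	/* must not recurse into the fs */
		/* ... free up to nr_to_scan cached objects ... */
	}
	return fs->nr_cached;		/* what remains shrinkable */
}

static void myfs_shrinker_register(struct myfs *fs)
{
	fs->shrink.shrink = myfs_shrink;
	fs->shrink.seeks = DEFAULT_SEEKS;
	register_shrinker(&fs->shrink);
}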
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index cdcbaaca9880..fe78726196f8 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -35,9 +35,6 @@ typedef struct xfs_sync_work {
int xfs_syncd_init(struct xfs_mount *mp);
void xfs_syncd_stop(struct xfs_mount *mp);
-int xfs_sync_attr(struct xfs_mount *mp, int flags);
-int xfs_sync_data(struct xfs_mount *mp, int flags);
-
int xfs_quiesce_data(struct xfs_mount *mp);
void xfs_quiesce_attr(struct xfs_mount *mp);
@@ -55,8 +52,6 @@ int xfs_inode_ag_iterator(struct xfs_mount *mp,
int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags),
int flags, int tag, int write_lock, int *nr_to_scan);
-void xfs_inode_shrinker_init(void);
-void xfs_inode_shrinker_destroy(void);
void xfs_inode_shrinker_register(struct xfs_mount *mp);
void xfs_inode_shrinker_unregister(struct xfs_mount *mp);
diff --git a/fs/xfs/linux-2.6/xfs_trace.c b/fs/xfs/linux-2.6/xfs_trace.c
index 207fa77f63ae..88d25d4aa56e 100644
--- a/fs/xfs/linux-2.6/xfs_trace.c
+++ b/fs/xfs/linux-2.6/xfs_trace.c
@@ -24,17 +24,13 @@
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_dir2.h"
#include "xfs_da_btree.h"
#include "xfs_bmap_btree.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
-#include "xfs_dmapi.h"
#include "xfs_mount.h"
#include "xfs_ialloc.h"
#include "xfs_itable.h"
@@ -50,7 +46,6 @@
#include "quota/xfs_dquot_item.h"
#include "quota/xfs_dquot.h"
#include "xfs_log_recover.h"
-#include "xfs_buf_item.h"
#include "xfs_inode_item.h"
/*
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index ff6bc797baf2..c657cdca2cd2 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -82,33 +82,6 @@ DECLARE_EVENT_CLASS(xfs_attr_list_class,
)
)
-#define DEFINE_PERAG_REF_EVENT(name) \
-TRACE_EVENT(name, \
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount, \
- unsigned long caller_ip), \
- TP_ARGS(mp, agno, refcount, caller_ip), \
- TP_STRUCT__entry( \
- __field(dev_t, dev) \
- __field(xfs_agnumber_t, agno) \
- __field(int, refcount) \
- __field(unsigned long, caller_ip) \
- ), \
- TP_fast_assign( \
- __entry->dev = mp->m_super->s_dev; \
- __entry->agno = agno; \
- __entry->refcount = refcount; \
- __entry->caller_ip = caller_ip; \
- ), \
- TP_printk("dev %d:%d agno %u refcount %d caller %pf", \
- MAJOR(__entry->dev), MINOR(__entry->dev), \
- __entry->agno, \
- __entry->refcount, \
- (char *)__entry->caller_ip) \
-);
-
-DEFINE_PERAG_REF_EVENT(xfs_perag_get)
-DEFINE_PERAG_REF_EVENT(xfs_perag_put)
-
#define DEFINE_ATTR_LIST_EVENT(name) \
DEFINE_EVENT(xfs_attr_list_class, name, \
TP_PROTO(struct xfs_attr_list_context *ctx), \
@@ -122,6 +95,40 @@ DEFINE_ATTR_LIST_EVENT(xfs_attr_list_add);
DEFINE_ATTR_LIST_EVENT(xfs_attr_list_wrong_blk);
DEFINE_ATTR_LIST_EVENT(xfs_attr_list_notfound);
+DECLARE_EVENT_CLASS(xfs_perag_class,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount,
+ unsigned long caller_ip),
+ TP_ARGS(mp, agno, refcount, caller_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(int, refcount)
+ __field(unsigned long, caller_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->refcount = refcount;
+ __entry->caller_ip = caller_ip;
+ ),
+ TP_printk("dev %d:%d agno %u refcount %d caller %pf",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->refcount,
+ (char *)__entry->caller_ip)
+);
+
+#define DEFINE_PERAG_REF_EVENT(name) \
+DEFINE_EVENT(xfs_perag_class, name, \
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount, \
+ unsigned long caller_ip), \
+ TP_ARGS(mp, agno, refcount, caller_ip))
+DEFINE_PERAG_REF_EVENT(xfs_perag_get);
+DEFINE_PERAG_REF_EVENT(xfs_perag_get_reclaim);
+DEFINE_PERAG_REF_EVENT(xfs_perag_put);
+DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim);
+DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim);
+
TRACE_EVENT(xfs_attr_list_node_descend,
TP_PROTO(struct xfs_attr_list_context *ctx,
struct xfs_da_node_entry *btree),
@@ -310,8 +317,6 @@ DEFINE_BUF_EVENT(xfs_buf_init);
DEFINE_BUF_EVENT(xfs_buf_free);
DEFINE_BUF_EVENT(xfs_buf_hold);
DEFINE_BUF_EVENT(xfs_buf_rele);
-DEFINE_BUF_EVENT(xfs_buf_pin);
-DEFINE_BUF_EVENT(xfs_buf_unpin);
DEFINE_BUF_EVENT(xfs_buf_iodone);
DEFINE_BUF_EVENT(xfs_buf_iorequest);
DEFINE_BUF_EVENT(xfs_buf_bawrite);
@@ -534,7 +539,7 @@ DEFINE_LOCK_EVENT(xfs_ilock_nowait);
DEFINE_LOCK_EVENT(xfs_ilock_demote);
DEFINE_LOCK_EVENT(xfs_iunlock);
-DECLARE_EVENT_CLASS(xfs_iget_class,
+DECLARE_EVENT_CLASS(xfs_inode_class,
TP_PROTO(struct xfs_inode *ip),
TP_ARGS(ip),
TP_STRUCT__entry(
@@ -550,16 +555,38 @@ DECLARE_EVENT_CLASS(xfs_iget_class,
__entry->ino)
)
-#define DEFINE_IGET_EVENT(name) \
-DEFINE_EVENT(xfs_iget_class, name, \
+#define DEFINE_INODE_EVENT(name) \
+DEFINE_EVENT(xfs_inode_class, name, \
TP_PROTO(struct xfs_inode *ip), \
TP_ARGS(ip))
-DEFINE_IGET_EVENT(xfs_iget_skip);
-DEFINE_IGET_EVENT(xfs_iget_reclaim);
-DEFINE_IGET_EVENT(xfs_iget_found);
-DEFINE_IGET_EVENT(xfs_iget_alloc);
-
-DECLARE_EVENT_CLASS(xfs_inode_class,
+DEFINE_INODE_EVENT(xfs_iget_skip);
+DEFINE_INODE_EVENT(xfs_iget_reclaim);
+DEFINE_INODE_EVENT(xfs_iget_reclaim_fail);
+DEFINE_INODE_EVENT(xfs_iget_hit);
+DEFINE_INODE_EVENT(xfs_iget_miss);
+
+DEFINE_INODE_EVENT(xfs_getattr);
+DEFINE_INODE_EVENT(xfs_setattr);
+DEFINE_INODE_EVENT(xfs_readlink);
+DEFINE_INODE_EVENT(xfs_alloc_file_space);
+DEFINE_INODE_EVENT(xfs_free_file_space);
+DEFINE_INODE_EVENT(xfs_readdir);
+#ifdef CONFIG_XFS_POSIX_ACL
+DEFINE_INODE_EVENT(xfs_check_acl);
+#endif
+DEFINE_INODE_EVENT(xfs_vm_bmap);
+DEFINE_INODE_EVENT(xfs_file_ioctl);
+DEFINE_INODE_EVENT(xfs_file_compat_ioctl);
+DEFINE_INODE_EVENT(xfs_ioctl_setattr);
+DEFINE_INODE_EVENT(xfs_file_fsync);
+DEFINE_INODE_EVENT(xfs_destroy_inode);
+DEFINE_INODE_EVENT(xfs_write_inode);
+DEFINE_INODE_EVENT(xfs_clear_inode);
+
+DEFINE_INODE_EVENT(xfs_dquot_dqalloc);
+DEFINE_INODE_EVENT(xfs_dquot_dqdetach);
+
+DECLARE_EVENT_CLASS(xfs_iref_class,
TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip),
TP_ARGS(ip, caller_ip),
TP_STRUCT__entry(
@@ -584,20 +611,71 @@ DECLARE_EVENT_CLASS(xfs_inode_class,
(char *)__entry->caller_ip)
)
-#define DEFINE_INODE_EVENT(name) \
-DEFINE_EVENT(xfs_inode_class, name, \
+#define DEFINE_IREF_EVENT(name) \
+DEFINE_EVENT(xfs_iref_class, name, \
TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), \
TP_ARGS(ip, caller_ip))
-DEFINE_INODE_EVENT(xfs_ihold);
-DEFINE_INODE_EVENT(xfs_irele);
-DEFINE_INODE_EVENT(xfs_inode_pin);
-DEFINE_INODE_EVENT(xfs_inode_unpin);
-DEFINE_INODE_EVENT(xfs_inode_unpin_nowait);
+DEFINE_IREF_EVENT(xfs_ihold);
+DEFINE_IREF_EVENT(xfs_irele);
+DEFINE_IREF_EVENT(xfs_inode_pin);
+DEFINE_IREF_EVENT(xfs_inode_unpin);
+DEFINE_IREF_EVENT(xfs_inode_unpin_nowait);
+
+DECLARE_EVENT_CLASS(xfs_namespace_class,
+ TP_PROTO(struct xfs_inode *dp, struct xfs_name *name),
+ TP_ARGS(dp, name),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, dp_ino)
+ __dynamic_array(char, name, name->len)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(dp)->i_sb->s_dev;
+ __entry->dp_ino = dp->i_ino;
+ memcpy(__get_str(name), name->name, name->len);
+ ),
+ TP_printk("dev %d:%d dp ino 0x%llx name %s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->dp_ino,
+ __get_str(name))
+)
-/* the old xfs_itrace_entry tracer - to be replaced by s.th. in the VFS */
-DEFINE_INODE_EVENT(xfs_inode);
-#define xfs_itrace_entry(ip) \
- trace_xfs_inode(ip, _THIS_IP_)
+#define DEFINE_NAMESPACE_EVENT(name) \
+DEFINE_EVENT(xfs_namespace_class, name, \
+ TP_PROTO(struct xfs_inode *dp, struct xfs_name *name), \
+ TP_ARGS(dp, name))
+DEFINE_NAMESPACE_EVENT(xfs_remove);
+DEFINE_NAMESPACE_EVENT(xfs_link);
+DEFINE_NAMESPACE_EVENT(xfs_lookup);
+DEFINE_NAMESPACE_EVENT(xfs_create);
+DEFINE_NAMESPACE_EVENT(xfs_symlink);
+
+TRACE_EVENT(xfs_rename,
+ TP_PROTO(struct xfs_inode *src_dp, struct xfs_inode *target_dp,
+ struct xfs_name *src_name, struct xfs_name *target_name),
+ TP_ARGS(src_dp, target_dp, src_name, target_name),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, src_dp_ino)
+ __field(xfs_ino_t, target_dp_ino)
+ __dynamic_array(char, src_name, src_name->len)
+ __dynamic_array(char, target_name, target_name->len)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(src_dp)->i_sb->s_dev;
+ __entry->src_dp_ino = src_dp->i_ino;
+ __entry->target_dp_ino = target_dp->i_ino;
+ memcpy(__get_str(src_name), src_name->name, src_name->len);
+ memcpy(__get_str(target_name), target_name->name, target_name->len);
+ ),
+ TP_printk("dev %d:%d src dp ino 0x%llx target dp ino 0x%llx"
+ " src name %s target name %s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->src_dp_ino,
+ __entry->target_dp_ino,
+ __get_str(src_name),
+ __get_str(target_name))
+)
DECLARE_EVENT_CLASS(xfs_dquot_class,
TP_PROTO(struct xfs_dquot *dqp),
@@ -677,9 +755,6 @@ DEFINE_DQUOT_EVENT(xfs_dqrele);
DEFINE_DQUOT_EVENT(xfs_dqflush);
DEFINE_DQUOT_EVENT(xfs_dqflush_force);
DEFINE_DQUOT_EVENT(xfs_dqflush_done);
-/* not really iget events, but we re-use the format */
-DEFINE_IGET_EVENT(xfs_dquot_dqalloc);
-DEFINE_IGET_EVENT(xfs_dquot_dqdetach);
DECLARE_EVENT_CLASS(xfs_loggrant_class,
TP_PROTO(struct log *log, struct xlog_ticket *tic),
@@ -775,165 +850,177 @@ DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_enter);
DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_exit);
DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_sub);
-#define DEFINE_RW_EVENT(name) \
-TRACE_EVENT(name, \
- TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), \
- TP_ARGS(ip, count, offset, flags), \
- TP_STRUCT__entry( \
- __field(dev_t, dev) \
- __field(xfs_ino_t, ino) \
- __field(xfs_fsize_t, size) \
- __field(xfs_fsize_t, new_size) \
- __field(loff_t, offset) \
- __field(size_t, count) \
- __field(int, flags) \
- ), \
- TP_fast_assign( \
- __entry->dev = VFS_I(ip)->i_sb->s_dev; \
- __entry->ino = ip->i_ino; \
- __entry->size = ip->i_d.di_size; \
- __entry->new_size = ip->i_new_size; \
- __entry->offset = offset; \
- __entry->count = count; \
- __entry->flags = flags; \
- ), \
- TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " \
- "offset 0x%llx count 0x%zx ioflags %s", \
- MAJOR(__entry->dev), MINOR(__entry->dev), \
- __entry->ino, \
- __entry->size, \
- __entry->new_size, \
- __entry->offset, \
- __entry->count, \
- __print_flags(__entry->flags, "|", XFS_IO_FLAGS)) \
+DECLARE_EVENT_CLASS(xfs_file_class,
+ TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags),
+ TP_ARGS(ip, count, offset, flags),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_fsize_t, size)
+ __field(xfs_fsize_t, new_size)
+ __field(loff_t, offset)
+ __field(size_t, count)
+ __field(int, flags)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ip)->i_sb->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->size = ip->i_d.di_size;
+ __entry->new_size = ip->i_new_size;
+ __entry->offset = offset;
+ __entry->count = count;
+ __entry->flags = flags;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx "
+ "offset 0x%llx count 0x%zx ioflags %s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->size,
+ __entry->new_size,
+ __entry->offset,
+ __entry->count,
+ __print_flags(__entry->flags, "|", XFS_IO_FLAGS))
)
+
+#define DEFINE_RW_EVENT(name) \
+DEFINE_EVENT(xfs_file_class, name, \
+ TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), \
+ TP_ARGS(ip, count, offset, flags))
DEFINE_RW_EVENT(xfs_file_read);
DEFINE_RW_EVENT(xfs_file_buffered_write);
DEFINE_RW_EVENT(xfs_file_direct_write);
DEFINE_RW_EVENT(xfs_file_splice_read);
DEFINE_RW_EVENT(xfs_file_splice_write);
-
-#define DEFINE_PAGE_EVENT(name) \
-TRACE_EVENT(name, \
- TP_PROTO(struct inode *inode, struct page *page, unsigned long off), \
- TP_ARGS(inode, page, off), \
- TP_STRUCT__entry( \
- __field(dev_t, dev) \
- __field(xfs_ino_t, ino) \
- __field(pgoff_t, pgoff) \
- __field(loff_t, size) \
- __field(unsigned long, offset) \
- __field(int, delalloc) \
- __field(int, unmapped) \
- __field(int, unwritten) \
- ), \
- TP_fast_assign( \
- int delalloc = -1, unmapped = -1, unwritten = -1; \
- \
- if (page_has_buffers(page)) \
- xfs_count_page_state(page, &delalloc, \
- &unmapped, &unwritten); \
- __entry->dev = inode->i_sb->s_dev; \
- __entry->ino = XFS_I(inode)->i_ino; \
- __entry->pgoff = page_offset(page); \
- __entry->size = i_size_read(inode); \
- __entry->offset = off; \
- __entry->delalloc = delalloc; \
- __entry->unmapped = unmapped; \
- __entry->unwritten = unwritten; \
- ), \
- TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx " \
- "delalloc %d unmapped %d unwritten %d", \
- MAJOR(__entry->dev), MINOR(__entry->dev), \
- __entry->ino, \
- __entry->pgoff, \
- __entry->size, \
- __entry->offset, \
- __entry->delalloc, \
- __entry->unmapped, \
- __entry->unwritten) \
+DECLARE_EVENT_CLASS(xfs_page_class,
+ TP_PROTO(struct inode *inode, struct page *page, unsigned long off),
+ TP_ARGS(inode, page, off),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(pgoff_t, pgoff)
+ __field(loff_t, size)
+ __field(unsigned long, offset)
+ __field(int, delalloc)
+ __field(int, unwritten)
+ ),
+ TP_fast_assign(
+ int delalloc = -1, unwritten = -1;
+
+ if (page_has_buffers(page))
+ xfs_count_page_state(page, &delalloc, &unwritten);
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->ino = XFS_I(inode)->i_ino;
+ __entry->pgoff = page_offset(page);
+ __entry->size = i_size_read(inode);
+ __entry->offset = off;
+ __entry->delalloc = delalloc;
+ __entry->unwritten = unwritten;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx "
+ "delalloc %d unwritten %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->pgoff,
+ __entry->size,
+ __entry->offset,
+ __entry->delalloc,
+ __entry->unwritten)
)
+
+#define DEFINE_PAGE_EVENT(name) \
+DEFINE_EVENT(xfs_page_class, name, \
+ TP_PROTO(struct inode *inode, struct page *page, unsigned long off), \
+ TP_ARGS(inode, page, off))
DEFINE_PAGE_EVENT(xfs_writepage);
DEFINE_PAGE_EVENT(xfs_releasepage);
DEFINE_PAGE_EVENT(xfs_invalidatepage);
-#define DEFINE_IOMAP_EVENT(name) \
-TRACE_EVENT(name, \
- TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \
- int flags, struct xfs_bmbt_irec *irec), \
- TP_ARGS(ip, offset, count, flags, irec), \
- TP_STRUCT__entry( \
- __field(dev_t, dev) \
- __field(xfs_ino_t, ino) \
- __field(loff_t, size) \
- __field(loff_t, new_size) \
- __field(loff_t, offset) \
- __field(size_t, count) \
- __field(int, flags) \
- __field(xfs_fileoff_t, startoff) \
- __field(xfs_fsblock_t, startblock) \
- __field(xfs_filblks_t, blockcount) \
- ), \
- TP_fast_assign( \
- __entry->dev = VFS_I(ip)->i_sb->s_dev; \
- __entry->ino = ip->i_ino; \
- __entry->size = ip->i_d.di_size; \
- __entry->new_size = ip->i_new_size; \
- __entry->offset = offset; \
- __entry->count = count; \
- __entry->flags = flags; \
- __entry->startoff = irec ? irec->br_startoff : 0; \
- __entry->startblock = irec ? irec->br_startblock : 0; \
- __entry->blockcount = irec ? irec->br_blockcount : 0; \
- ), \
- TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " \
- "offset 0x%llx count %zd flags %s " \
- "startoff 0x%llx startblock %lld blockcount 0x%llx", \
- MAJOR(__entry->dev), MINOR(__entry->dev), \
- __entry->ino, \
- __entry->size, \
- __entry->new_size, \
- __entry->offset, \
- __entry->count, \
- __print_flags(__entry->flags, "|", BMAPI_FLAGS), \
- __entry->startoff, \
- (__int64_t)__entry->startblock, \
- __entry->blockcount) \
+DECLARE_EVENT_CLASS(xfs_iomap_class,
+ TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
+ int flags, struct xfs_bmbt_irec *irec),
+ TP_ARGS(ip, offset, count, flags, irec),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(loff_t, size)
+ __field(loff_t, new_size)
+ __field(loff_t, offset)
+ __field(size_t, count)
+ __field(int, flags)
+ __field(xfs_fileoff_t, startoff)
+ __field(xfs_fsblock_t, startblock)
+ __field(xfs_filblks_t, blockcount)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ip)->i_sb->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->size = ip->i_d.di_size;
+ __entry->new_size = ip->i_new_size;
+ __entry->offset = offset;
+ __entry->count = count;
+ __entry->flags = flags;
+ __entry->startoff = irec ? irec->br_startoff : 0;
+ __entry->startblock = irec ? irec->br_startblock : 0;
+ __entry->blockcount = irec ? irec->br_blockcount : 0;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx "
+ "offset 0x%llx count %zd flags %s "
+ "startoff 0x%llx startblock %lld blockcount 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->size,
+ __entry->new_size,
+ __entry->offset,
+ __entry->count,
+ __print_flags(__entry->flags, "|", BMAPI_FLAGS),
+ __entry->startoff,
+ (__int64_t)__entry->startblock,
+ __entry->blockcount)
)
+
+#define DEFINE_IOMAP_EVENT(name) \
+DEFINE_EVENT(xfs_iomap_class, name, \
+ TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \
+ int flags, struct xfs_bmbt_irec *irec), \
+ TP_ARGS(ip, offset, count, flags, irec))
DEFINE_IOMAP_EVENT(xfs_iomap_enter);
DEFINE_IOMAP_EVENT(xfs_iomap_found);
DEFINE_IOMAP_EVENT(xfs_iomap_alloc);
-#define DEFINE_SIMPLE_IO_EVENT(name) \
-TRACE_EVENT(name, \
- TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), \
- TP_ARGS(ip, offset, count), \
- TP_STRUCT__entry( \
- __field(dev_t, dev) \
- __field(xfs_ino_t, ino) \
- __field(loff_t, size) \
- __field(loff_t, new_size) \
- __field(loff_t, offset) \
- __field(size_t, count) \
- ), \
- TP_fast_assign( \
- __entry->dev = VFS_I(ip)->i_sb->s_dev; \
- __entry->ino = ip->i_ino; \
- __entry->size = ip->i_d.di_size; \
- __entry->new_size = ip->i_new_size; \
- __entry->offset = offset; \
- __entry->count = count; \
- ), \
- TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " \
- "offset 0x%llx count %zd", \
- MAJOR(__entry->dev), MINOR(__entry->dev), \
- __entry->ino, \
- __entry->size, \
- __entry->new_size, \
- __entry->offset, \
- __entry->count) \
+DECLARE_EVENT_CLASS(xfs_simple_io_class,
+ TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
+ TP_ARGS(ip, offset, count),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(loff_t, size)
+ __field(loff_t, new_size)
+ __field(loff_t, offset)
+ __field(size_t, count)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ip)->i_sb->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->size = ip->i_d.di_size;
+ __entry->new_size = ip->i_new_size;
+ __entry->offset = offset;
+ __entry->count = count;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx "
+ "offset 0x%llx count %zd",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->size,
+ __entry->new_size,
+ __entry->offset,
+ __entry->count)
);
+
+#define DEFINE_SIMPLE_IO_EVENT(name) \
+DEFINE_EVENT(xfs_simple_io_class, name, \
+ TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), \
+ TP_ARGS(ip, offset, count))
DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc);
DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert);
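Every conversion in this file follows the same template: the shared TP_STRUCT__entry/TP_fast_assign/TP_printk body moves into a DECLARE_EVENT_CLASS, and each tracepoint becomes a one-line DEFINE_EVENT that reuses the class code instead of duplicating it. A minimal sketch of the pattern with illustrative event names:

DECLARE_EVENT_CLASS(myfs_inode_class,
	TP_PROTO(struct inode *inode),
	TP_ARGS(inode),
	TP_STRUCT__entry(
		__field(dev_t, dev)
		__field(unsigned long, ino)
	),
	TP_fast_assign(
		__entry->dev = inode->i_sb->s_dev;
		__entry->ino = inode->i_ino;
	),
	TP_printk("dev %d:%d ino 0x%lx",
		  MAJOR(__entry->dev), MINOR(__entry->dev),
		  __entry->ino)
)

#define DEFINE_MYFS_INODE_EVENT(name) \
DEFINE_EVENT(myfs_inode_class, name, \
	TP_PROTO(struct inode *inode), \
	TP_ARGS(inode))
DEFINE_MYFS_INODE_EVENT(myfs_file_open);
DEFINE_MYFS_INODE_EVENT(myfs_file_release);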
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index 585e7633dfc7..e1a2f6800e01 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -23,25 +23,15 @@
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_dir2.h"
#include "xfs_alloc.h"
-#include "xfs_dmapi.h"
#include "xfs_quota.h"
#include "xfs_mount.h"
#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
-#include "xfs_btree.h"
-#include "xfs_ialloc.h"
#include "xfs_bmap.h"
#include "xfs_rtalloc.h"
#include "xfs_error.h"
#include "xfs_itable.h"
-#include "xfs_rw.h"
#include "xfs_attr.h"
#include "xfs_buf_item.h"
#include "xfs_trans_space.h"
@@ -64,8 +54,6 @@
flush lock - ditto.
*/
-STATIC void xfs_qm_dqflush_done(xfs_buf_t *, xfs_dq_logitem_t *);
-
#ifdef DEBUG
xfs_buftarg_t *xfs_dqerror_target;
int xfs_do_dqerror;
@@ -390,21 +378,14 @@ xfs_qm_dqalloc(
return (ESRCH);
}
- /*
- * xfs_trans_commit normally decrements the vnode ref count
- * when it unlocks the inode. Since we want to keep the quota
- * inode around, we bump the vnode ref count now.
- */
- IHOLD(quotip);
-
- xfs_trans_ijoin(tp, quotip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin_ref(tp, quotip, XFS_ILOCK_EXCL);
nmaps = 1;
if ((error = xfs_bmapi(tp, quotip,
offset_fsb, XFS_DQUOT_CLUSTER_SIZE_FSB,
XFS_BMAPI_METADATA | XFS_BMAPI_WRITE,
&firstblock,
XFS_QM_DQALLOC_SPACE_RES(mp),
- &map, &nmaps, &flist, NULL))) {
+ &map, &nmaps, &flist))) {
goto error0;
}
ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB);
@@ -520,7 +501,7 @@ xfs_qm_dqtobp(
error = xfs_bmapi(NULL, quotip, dqp->q_fileoffset,
XFS_DQUOT_CLUSTER_SIZE_FSB,
XFS_BMAPI_METADATA,
- NULL, 0, &map, &nmaps, NULL, NULL);
+ NULL, 0, &map, &nmaps, NULL);
xfs_iunlock(quotip, XFS_ILOCK_SHARED);
if (error)
@@ -1141,6 +1122,46 @@ xfs_qm_dqrele(
xfs_qm_dqput(dqp);
}
+/*
+ * This is the dquot flushing I/O completion routine. It is called
+ * from interrupt level when the buffer containing the dquot is
+ * flushed to disk. It is responsible for removing the dquot logitem
+ * from the AIL if it has not been re-logged, and unlocking the dquot's
+ * flush lock. This behavior is very similar to that of inodes.
+ */
+STATIC void
+xfs_qm_dqflush_done(
+ struct xfs_buf *bp,
+ struct xfs_log_item *lip)
+{
+ xfs_dq_logitem_t *qip = (struct xfs_dq_logitem *)lip;
+ xfs_dquot_t *dqp = qip->qli_dquot;
+ struct xfs_ail *ailp = lip->li_ailp;
+
+ /*
+ * We only want to pull the item from the AIL if its
+ * location in the log has not changed since we started the flush.
+ * Thus, we only bother if the dquot's lsn has
+ * not changed. First we check the lsn outside the lock
+ * since it's cheaper, and then we recheck while
+ * holding the lock before removing the dquot from the AIL.
+ */
+ if ((lip->li_flags & XFS_LI_IN_AIL) &&
+ lip->li_lsn == qip->qli_flush_lsn) {
+
+ /* xfs_trans_ail_delete() drops the AIL lock. */
+ spin_lock(&ailp->xa_lock);
+ if (lip->li_lsn == qip->qli_flush_lsn)
+ xfs_trans_ail_delete(ailp, lip);
+ else
+ spin_unlock(&ailp->xa_lock);
+ }
+
+ /*
+ * Release the dq's flush lock since we're done with it.
+ */
+ xfs_dqfunlock(dqp);
+}
/*
* Write a modified dquot to disk.
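The relocated completion routine above (and its comment) describe a check/lock/recheck idiom: the lsn comparison runs once without the AIL lock, since most completions need no removal, and is repeated under the lock before the item is deleted. A minimal sketch of that optimistic double check with illustrative types:

#include <linux/spinlock.h>

#define ITEM_IN_LIST	0x1

struct tracked_item {
	spinlock_t	*list_lock;	/* protects list membership */
	unsigned int	flags;
	long long	lsn;		/* current position in the log */
	long long	flush_lsn;	/* position when the flush began */
};

static void completion_maybe_remove(struct tracked_item *item)
{
	/* cheap unlocked test first: skip the lock in the common case */
	if ((item->flags & ITEM_IN_LIST) && item->lsn == item->flush_lsn) {
		spin_lock(item->list_lock);
		/* recheck: the item may have been re-logged meanwhile */
		if (item->lsn == item->flush_lsn) {
			/* list_del(&item->list_entry); */
			item->flags &= ~ITEM_IN_LIST;
		}
		spin_unlock(item->list_lock);
	}
}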
@@ -1222,8 +1243,9 @@ xfs_qm_dqflush(
* Attach an iodone routine so that we can remove this dquot from the
* AIL and release the flush lock once the dquot is synced to disk.
*/
- xfs_buf_attach_iodone(bp, (void(*)(xfs_buf_t *, xfs_log_item_t *))
- xfs_qm_dqflush_done, &(dqp->q_logitem.qli_item));
+ xfs_buf_attach_iodone(bp, xfs_qm_dqflush_done,
+ &dqp->q_logitem.qli_item);
+
/*
* If the buffer is pinned then push on the log so we won't
* get stuck waiting in the write for too long.
@@ -1247,50 +1269,6 @@ xfs_qm_dqflush(
}
-/*
- * This is the dquot flushing I/O completion routine. It is called
- * from interrupt level when the buffer containing the dquot is
- * flushed to disk. It is responsible for removing the dquot logitem
- * from the AIL if it has not been re-logged, and unlocking the dquot's
- * flush lock. This behavior is very similar to that of inodes..
- */
-/*ARGSUSED*/
-STATIC void
-xfs_qm_dqflush_done(
- xfs_buf_t *bp,
- xfs_dq_logitem_t *qip)
-{
- xfs_dquot_t *dqp;
- struct xfs_ail *ailp;
-
- dqp = qip->qli_dquot;
- ailp = qip->qli_item.li_ailp;
-
- /*
- * We only want to pull the item from the AIL if its
- * location in the log has not changed since we started the flush.
- * Thus, we only bother if the dquot's lsn has
- * not changed. First we check the lsn outside the lock
- * since it's cheaper, and then we recheck while
- * holding the lock before removing the dquot from the AIL.
- */
- if ((qip->qli_item.li_flags & XFS_LI_IN_AIL) &&
- qip->qli_item.li_lsn == qip->qli_flush_lsn) {
-
- /* xfs_trans_ail_delete() drops the AIL lock. */
- spin_lock(&ailp->xa_lock);
- if (qip->qli_item.li_lsn == qip->qli_flush_lsn)
- xfs_trans_ail_delete(ailp, (xfs_log_item_t*)qip);
- else
- spin_unlock(&ailp->xa_lock);
- }
-
- /*
- * Release the dq's flush lock since we're done with it.
- */
- xfs_dqfunlock(dqp);
-}
-
int
xfs_qm_dqlock_nowait(
xfs_dquot_t *dqp)
diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c
index 8d89a24ae324..2a1f3dc10a02 100644
--- a/fs/xfs/quota/xfs_dquot_item.c
+++ b/fs/xfs/quota/xfs_dquot_item.c
@@ -23,42 +23,36 @@
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_dir2.h"
#include "xfs_alloc.h"
-#include "xfs_dmapi.h"
#include "xfs_quota.h"
#include "xfs_mount.h"
#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_bmap.h"
-#include "xfs_btree.h"
-#include "xfs_ialloc.h"
#include "xfs_rtalloc.h"
#include "xfs_error.h"
#include "xfs_itable.h"
-#include "xfs_rw.h"
#include "xfs_attr.h"
#include "xfs_buf_item.h"
#include "xfs_trans_priv.h"
#include "xfs_qm.h"
+static inline struct xfs_dq_logitem *DQUOT_ITEM(struct xfs_log_item *lip)
+{
+ return container_of(lip, struct xfs_dq_logitem, qli_item);
+}
+
/*
* returns the number of iovecs needed to log the given dquot item.
*/
-/* ARGSUSED */
STATIC uint
xfs_qm_dquot_logitem_size(
- xfs_dq_logitem_t *logitem)
+ struct xfs_log_item *lip)
{
/*
* we need only two iovecs, one for the format, one for the real thing
*/
- return (2);
+ return 2;
}
/*
@@ -66,22 +60,21 @@ xfs_qm_dquot_logitem_size(
*/
STATIC void
xfs_qm_dquot_logitem_format(
- xfs_dq_logitem_t *logitem,
- xfs_log_iovec_t *logvec)
+ struct xfs_log_item *lip,
+ struct xfs_log_iovec *logvec)
{
- ASSERT(logitem);
- ASSERT(logitem->qli_dquot);
+ struct xfs_dq_logitem *qlip = DQUOT_ITEM(lip);
- logvec->i_addr = (xfs_caddr_t)&logitem->qli_format;
+ logvec->i_addr = &qlip->qli_format;
logvec->i_len = sizeof(xfs_dq_logformat_t);
logvec->i_type = XLOG_REG_TYPE_QFORMAT;
logvec++;
- logvec->i_addr = (xfs_caddr_t)&logitem->qli_dquot->q_core;
+ logvec->i_addr = &qlip->qli_dquot->q_core;
logvec->i_len = sizeof(xfs_disk_dquot_t);
logvec->i_type = XLOG_REG_TYPE_DQUOT;
- ASSERT(2 == logitem->qli_item.li_desc->lid_size);
- logitem->qli_format.qlf_size = 2;
+ ASSERT(2 == lip->li_desc->lid_size);
+ qlip->qli_format.qlf_size = 2;
}
@@ -90,9 +83,9 @@ xfs_qm_dquot_logitem_format(
*/
STATIC void
xfs_qm_dquot_logitem_pin(
- xfs_dq_logitem_t *logitem)
+ struct xfs_log_item *lip)
{
- xfs_dquot_t *dqp = logitem->qli_dquot;
+ struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot;
ASSERT(XFS_DQ_IS_LOCKED(dqp));
atomic_inc(&dqp->q_pincount);
@@ -104,27 +97,18 @@ xfs_qm_dquot_logitem_pin(
* dquot must have been previously pinned with a call to
* xfs_qm_dquot_logitem_pin().
*/
-/* ARGSUSED */
STATIC void
xfs_qm_dquot_logitem_unpin(
- xfs_dq_logitem_t *logitem)
+ struct xfs_log_item *lip,
+ int remove)
{
- xfs_dquot_t *dqp = logitem->qli_dquot;
+ struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot;
ASSERT(atomic_read(&dqp->q_pincount) > 0);
if (atomic_dec_and_test(&dqp->q_pincount))
wake_up(&dqp->q_pinwait);
}
-/* ARGSUSED */
-STATIC void
-xfs_qm_dquot_logitem_unpin_remove(
- xfs_dq_logitem_t *logitem,
- xfs_trans_t *tp)
-{
- xfs_qm_dquot_logitem_unpin(logitem);
-}
-
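With ->iop_unpin() now carrying an explicit remove flag, the separate unpin_remove variant deleted above is redundant: the transaction-abort path can call the single hook with the flag set. Assumed call shape (the dispatching code is outside this patch):

	/* Sketch: abort-time unpin via the one remaining hook. */
	lip->li_ops->iop_unpin(lip, 1);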
/*
* Given the logitem, this writes the corresponding dquot entry to disk
* asynchronously. This is called with the dquot entry securely locked;
@@ -133,12 +117,10 @@ xfs_qm_dquot_logitem_unpin_remove(
*/
STATIC void
xfs_qm_dquot_logitem_push(
- xfs_dq_logitem_t *logitem)
+ struct xfs_log_item *lip)
{
- xfs_dquot_t *dqp;
- int error;
-
- dqp = logitem->qli_dquot;
+ struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot;
+ int error;
ASSERT(XFS_DQ_IS_LOCKED(dqp));
ASSERT(!completion_done(&dqp->q_flush));
@@ -160,27 +142,25 @@ xfs_qm_dquot_logitem_push(
xfs_dqunlock(dqp);
}
-/*ARGSUSED*/
STATIC xfs_lsn_t
xfs_qm_dquot_logitem_committed(
- xfs_dq_logitem_t *l,
+ struct xfs_log_item *lip,
xfs_lsn_t lsn)
{
/*
* We always re-log the entire dquot when it becomes dirty,
* so, the latest copy _is_ the only one that matters.
*/
- return (lsn);
+ return lsn;
}
-
/*
* This is called to wait for the given dquot to be unpinned.
* Most of these pin/unpin routines are plagiarized from inode code.
*/
void
xfs_qm_dqunpin_wait(
- xfs_dquot_t *dqp)
+ struct xfs_dquot *dqp)
{
ASSERT(XFS_DQ_IS_LOCKED(dqp));
if (atomic_read(&dqp->q_pincount) == 0)
@@ -206,13 +186,12 @@ xfs_qm_dqunpin_wait(
*/
STATIC void
xfs_qm_dquot_logitem_pushbuf(
- xfs_dq_logitem_t *qip)
+ struct xfs_log_item *lip)
{
- xfs_dquot_t *dqp;
- xfs_mount_t *mp;
- xfs_buf_t *bp;
+ struct xfs_dq_logitem *qlip = DQUOT_ITEM(lip);
+ struct xfs_dquot *dqp = qlip->qli_dquot;
+ struct xfs_buf *bp;
- dqp = qip->qli_dquot;
ASSERT(XFS_DQ_IS_LOCKED(dqp));
/*
@@ -220,22 +199,20 @@ xfs_qm_dquot_logitem_pushbuf(
 * dquot flush completed and the dquot was taken off the AIL.
* So, just get out.
*/
- if (completion_done(&dqp->q_flush) ||
- ((qip->qli_item.li_flags & XFS_LI_IN_AIL) == 0)) {
+ if (completion_done(&dqp->q_flush) ||
+ !(lip->li_flags & XFS_LI_IN_AIL)) {
xfs_dqunlock(dqp);
return;
}
- mp = dqp->q_mount;
- bp = xfs_incore(mp->m_ddev_targp, qip->qli_format.qlf_blkno,
- mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK);
+
+ bp = xfs_incore(dqp->q_mount->m_ddev_targp, qlip->qli_format.qlf_blkno,
+ dqp->q_mount->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK);
xfs_dqunlock(dqp);
if (!bp)
return;
if (XFS_BUF_ISDELAYWRITE(bp))
xfs_buf_delwri_promote(bp);
xfs_buf_relse(bp);
- return;
-
}
/*
@@ -250,15 +227,14 @@ xfs_qm_dquot_logitem_pushbuf(
*/
STATIC uint
xfs_qm_dquot_logitem_trylock(
- xfs_dq_logitem_t *qip)
+ struct xfs_log_item *lip)
{
- xfs_dquot_t *dqp;
+ struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot;
- dqp = qip->qli_dquot;
if (atomic_read(&dqp->q_pincount) > 0)
return XFS_ITEM_PINNED;
- if (! xfs_qm_dqlock_nowait(dqp))
+ if (!xfs_qm_dqlock_nowait(dqp))
return XFS_ITEM_LOCKED;
if (!xfs_dqflock_nowait(dqp)) {
@@ -269,11 +245,10 @@ xfs_qm_dquot_logitem_trylock(
return XFS_ITEM_PUSHBUF;
}
- ASSERT(qip->qli_item.li_flags & XFS_LI_IN_AIL);
+ ASSERT(lip->li_flags & XFS_LI_IN_AIL);
return XFS_ITEM_SUCCESS;
}
-
/*
* Unlock the dquot associated with the log item.
* Clear the fields of the dquot and dquot log item that
@@ -282,12 +257,10 @@ xfs_qm_dquot_logitem_trylock(
*/
STATIC void
xfs_qm_dquot_logitem_unlock(
- xfs_dq_logitem_t *ql)
+ struct xfs_log_item *lip)
{
- xfs_dquot_t *dqp;
+ struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot;
- ASSERT(ql != NULL);
- dqp = ql->qli_dquot;
ASSERT(XFS_DQ_IS_LOCKED(dqp));
/*
@@ -304,43 +277,32 @@ xfs_qm_dquot_logitem_unlock(
xfs_dqunlock(dqp);
}
-
/*
 * This needs to stamp an LSN into the dquot, I think.
 * RPCs that look at user dquots would then have to
 * push on the dependency recorded in the dquot.
*/
-/* ARGSUSED */
STATIC void
xfs_qm_dquot_logitem_committing(
- xfs_dq_logitem_t *l,
+ struct xfs_log_item *lip,
xfs_lsn_t lsn)
{
- return;
}
-
/*
* This is the ops vector for dquots
*/
static struct xfs_item_ops xfs_dquot_item_ops = {
- .iop_size = (uint(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_size,
- .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
- xfs_qm_dquot_logitem_format,
- .iop_pin = (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_pin,
- .iop_unpin = (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_unpin,
- .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*))
- xfs_qm_dquot_logitem_unpin_remove,
- .iop_trylock = (uint(*)(xfs_log_item_t*))
- xfs_qm_dquot_logitem_trylock,
- .iop_unlock = (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_unlock,
- .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
- xfs_qm_dquot_logitem_committed,
- .iop_push = (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_push,
- .iop_pushbuf = (void(*)(xfs_log_item_t*))
- xfs_qm_dquot_logitem_pushbuf,
- .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
- xfs_qm_dquot_logitem_committing
+ .iop_size = xfs_qm_dquot_logitem_size,
+ .iop_format = xfs_qm_dquot_logitem_format,
+ .iop_pin = xfs_qm_dquot_logitem_pin,
+ .iop_unpin = xfs_qm_dquot_logitem_unpin,
+ .iop_trylock = xfs_qm_dquot_logitem_trylock,
+ .iop_unlock = xfs_qm_dquot_logitem_unlock,
+ .iop_committed = xfs_qm_dquot_logitem_committed,
+ .iop_push = xfs_qm_dquot_logitem_push,
+ .iop_pushbuf = xfs_qm_dquot_logitem_pushbuf,
+ .iop_committing = xfs_qm_dquot_logitem_committing
};
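Because every handler now takes struct xfs_log_item * directly, the ops vector can name the functions without casts. A sketch of the matching vector type, abridged (the authoritative definition lives in the transaction headers):

	struct xfs_item_ops {
		uint		(*iop_size)(struct xfs_log_item *);
		void		(*iop_format)(struct xfs_log_item *,
					      struct xfs_log_iovec *);
		void		(*iop_unpin)(struct xfs_log_item *, int remove);
		xfs_lsn_t	(*iop_committed)(struct xfs_log_item *,
						 xfs_lsn_t);
		/* ... */
	};

The old casts defeated type checking entirely; a signature drift such as the new remove argument would have compiled silently while invoking the callback through a mismatched type, which is undefined behavior.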
/*
@@ -350,10 +312,9 @@ static struct xfs_item_ops xfs_dquot_item_ops = {
*/
void
xfs_qm_dquot_logitem_init(
- struct xfs_dquot *dqp)
+ struct xfs_dquot *dqp)
{
- xfs_dq_logitem_t *lp;
- lp = &dqp->q_logitem;
+ struct xfs_dq_logitem *lp = &dqp->q_logitem;
xfs_log_item_init(dqp->q_mount, &lp->qli_item, XFS_LI_DQUOT,
&xfs_dquot_item_ops);
@@ -374,16 +335,22 @@ xfs_qm_dquot_logitem_init(
/*------------------ QUOTAOFF LOG ITEMS -------------------*/
+static inline struct xfs_qoff_logitem *QOFF_ITEM(struct xfs_log_item *lip)
+{
+ return container_of(lip, struct xfs_qoff_logitem, qql_item);
+}
+
+
/*
* This returns the number of iovecs needed to log the given quotaoff item.
 * We only need 1 iovec for a quotaoff item. It just logs the
* quotaoff_log_format structure.
*/
-/*ARGSUSED*/
STATIC uint
-xfs_qm_qoff_logitem_size(xfs_qoff_logitem_t *qf)
+xfs_qm_qoff_logitem_size(
+ struct xfs_log_item *lip)
{
- return (1);
+ return 1;
}
/*
@@ -394,53 +361,46 @@ xfs_qm_qoff_logitem_size(xfs_qoff_logitem_t *qf)
* slots in the quotaoff item have been filled.
*/
STATIC void
-xfs_qm_qoff_logitem_format(xfs_qoff_logitem_t *qf,
- xfs_log_iovec_t *log_vector)
+xfs_qm_qoff_logitem_format(
+ struct xfs_log_item *lip,
+ struct xfs_log_iovec *log_vector)
{
- ASSERT(qf->qql_format.qf_type == XFS_LI_QUOTAOFF);
+ struct xfs_qoff_logitem *qflip = QOFF_ITEM(lip);
+
+ ASSERT(qflip->qql_format.qf_type == XFS_LI_QUOTAOFF);
- log_vector->i_addr = (xfs_caddr_t)&(qf->qql_format);
+ log_vector->i_addr = &qflip->qql_format;
log_vector->i_len = sizeof(xfs_qoff_logitem_t);
log_vector->i_type = XLOG_REG_TYPE_QUOTAOFF;
- qf->qql_format.qf_size = 1;
+ qflip->qql_format.qf_size = 1;
}
-
/*
 * Pinning has no meaning for a quotaoff item, so just return.
*/
-/*ARGSUSED*/
STATIC void
-xfs_qm_qoff_logitem_pin(xfs_qoff_logitem_t *qf)
+xfs_qm_qoff_logitem_pin(
+ struct xfs_log_item *lip)
{
- return;
}
-
/*
 * Since pinning has no meaning for a quotaoff item, unpinning does
* not either.
*/
-/*ARGSUSED*/
STATIC void
-xfs_qm_qoff_logitem_unpin(xfs_qoff_logitem_t *qf)
+xfs_qm_qoff_logitem_unpin(
+ struct xfs_log_item *lip,
+ int remove)
{
- return;
-}
-
-/*ARGSUSED*/
-STATIC void
-xfs_qm_qoff_logitem_unpin_remove(xfs_qoff_logitem_t *qf, xfs_trans_t *tp)
-{
- return;
}
/*
* Quotaoff items have no locking, so just return success.
*/
-/*ARGSUSED*/
STATIC uint
-xfs_qm_qoff_logitem_trylock(xfs_qoff_logitem_t *qf)
+xfs_qm_qoff_logitem_trylock(
+ struct xfs_log_item *lip)
{
return XFS_ITEM_LOCKED;
}
@@ -449,53 +409,51 @@ xfs_qm_qoff_logitem_trylock(xfs_qoff_logitem_t *qf)
* Quotaoff items have no locking or pushing, so return failure
* so that the caller doesn't bother with us.
*/
-/*ARGSUSED*/
STATIC void
-xfs_qm_qoff_logitem_unlock(xfs_qoff_logitem_t *qf)
+xfs_qm_qoff_logitem_unlock(
+ struct xfs_log_item *lip)
{
- return;
}
/*
* The quotaoff-start-item is logged only once and cannot be moved in the log,
* so simply return the lsn at which it's been logged.
*/
-/*ARGSUSED*/
STATIC xfs_lsn_t
-xfs_qm_qoff_logitem_committed(xfs_qoff_logitem_t *qf, xfs_lsn_t lsn)
+xfs_qm_qoff_logitem_committed(
+ struct xfs_log_item *lip,
+ xfs_lsn_t lsn)
{
- return (lsn);
+ return lsn;
}
/*
 * There isn't much you can do to push on a quotaoff item. It is simply
* stuck waiting for the log to be flushed to disk.
*/
-/*ARGSUSED*/
STATIC void
-xfs_qm_qoff_logitem_push(xfs_qoff_logitem_t *qf)
+xfs_qm_qoff_logitem_push(
+ struct xfs_log_item *lip)
{
- return;
}
-/*ARGSUSED*/
STATIC xfs_lsn_t
xfs_qm_qoffend_logitem_committed(
- xfs_qoff_logitem_t *qfe,
- xfs_lsn_t lsn)
+ struct xfs_log_item *lip,
+ xfs_lsn_t lsn)
{
- xfs_qoff_logitem_t *qfs;
- struct xfs_ail *ailp;
+ struct xfs_qoff_logitem *qfe = QOFF_ITEM(lip);
+ struct xfs_qoff_logitem *qfs = qfe->qql_start_lip;
+ struct xfs_ail *ailp = qfs->qql_item.li_ailp;
- qfs = qfe->qql_start_lip;
- ailp = qfs->qql_item.li_ailp;
- spin_lock(&ailp->xa_lock);
/*
* Delete the qoff-start logitem from the AIL.
* xfs_trans_ail_delete() drops the AIL lock.
*/
+ spin_lock(&ailp->xa_lock);
xfs_trans_ail_delete(ailp, (xfs_log_item_t *)qfs);
+
kmem_free(qfs);
kmem_free(qfe);
return (xfs_lsn_t)-1;
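Returning (xfs_lsn_t)-1 from an ->iop_committed handler appears to be the convention telling the commit path not to reposition the item in the AIL, which matters here because both log items have just been freed; the dispatching code is outside this patch.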
@@ -515,71 +473,52 @@ xfs_qm_qoffend_logitem_committed(
* (truly makes the quotaoff irrevocable). If we do something else,
* then maybe we don't need two.
*/
-/* ARGSUSED */
-STATIC void
-xfs_qm_qoff_logitem_committing(xfs_qoff_logitem_t *qip, xfs_lsn_t commit_lsn)
-{
- return;
-}
-
-/* ARGSUSED */
STATIC void
-xfs_qm_qoffend_logitem_committing(xfs_qoff_logitem_t *qip, xfs_lsn_t commit_lsn)
+xfs_qm_qoff_logitem_committing(
+ struct xfs_log_item *lip,
+ xfs_lsn_t commit_lsn)
{
- return;
}
static struct xfs_item_ops xfs_qm_qoffend_logitem_ops = {
- .iop_size = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_size,
- .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
- xfs_qm_qoff_logitem_format,
- .iop_pin = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_pin,
- .iop_unpin = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_unpin,
- .iop_unpin_remove = (void(*)(xfs_log_item_t*,xfs_trans_t*))
- xfs_qm_qoff_logitem_unpin_remove,
- .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_trylock,
- .iop_unlock = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_unlock,
- .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
- xfs_qm_qoffend_logitem_committed,
- .iop_push = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_push,
- .iop_pushbuf = NULL,
- .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
- xfs_qm_qoffend_logitem_committing
+ .iop_size = xfs_qm_qoff_logitem_size,
+ .iop_format = xfs_qm_qoff_logitem_format,
+ .iop_pin = xfs_qm_qoff_logitem_pin,
+ .iop_unpin = xfs_qm_qoff_logitem_unpin,
+ .iop_trylock = xfs_qm_qoff_logitem_trylock,
+ .iop_unlock = xfs_qm_qoff_logitem_unlock,
+ .iop_committed = xfs_qm_qoffend_logitem_committed,
+ .iop_push = xfs_qm_qoff_logitem_push,
+ .iop_committing = xfs_qm_qoff_logitem_committing
};
/*
* This is the ops vector shared by all quotaoff-start log items.
*/
static struct xfs_item_ops xfs_qm_qoff_logitem_ops = {
- .iop_size = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_size,
- .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
- xfs_qm_qoff_logitem_format,
- .iop_pin = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_pin,
- .iop_unpin = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_unpin,
- .iop_unpin_remove = (void(*)(xfs_log_item_t*,xfs_trans_t*))
- xfs_qm_qoff_logitem_unpin_remove,
- .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_trylock,
- .iop_unlock = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_unlock,
- .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
- xfs_qm_qoff_logitem_committed,
- .iop_push = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_push,
- .iop_pushbuf = NULL,
- .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
- xfs_qm_qoff_logitem_committing
+ .iop_size = xfs_qm_qoff_logitem_size,
+ .iop_format = xfs_qm_qoff_logitem_format,
+ .iop_pin = xfs_qm_qoff_logitem_pin,
+ .iop_unpin = xfs_qm_qoff_logitem_unpin,
+ .iop_trylock = xfs_qm_qoff_logitem_trylock,
+ .iop_unlock = xfs_qm_qoff_logitem_unlock,
+ .iop_committed = xfs_qm_qoff_logitem_committed,
+ .iop_push = xfs_qm_qoff_logitem_push,
+ .iop_committing = xfs_qm_qoff_logitem_committing
};
/*
 * Allocate and initialize a quotaoff item of the correct quota type(s).
*/
-xfs_qoff_logitem_t *
+struct xfs_qoff_logitem *
xfs_qm_qoff_logitem_init(
- struct xfs_mount *mp,
- xfs_qoff_logitem_t *start,
- uint flags)
+ struct xfs_mount *mp,
+ struct xfs_qoff_logitem *start,
+ uint flags)
{
- xfs_qoff_logitem_t *qf;
+ struct xfs_qoff_logitem *qf;
- qf = (xfs_qoff_logitem_t*) kmem_zalloc(sizeof(xfs_qoff_logitem_t), KM_SLEEP);
+ qf = kmem_zalloc(sizeof(struct xfs_qoff_logitem), KM_SLEEP);
xfs_log_item_init(mp, &qf->qql_item, XFS_LI_QUOTAOFF, start ?
&xfs_qm_qoffend_logitem_ops : &xfs_qm_qoff_logitem_ops);
@@ -587,5 +526,5 @@ xfs_qm_qoff_logitem_init(
qf->qql_format.qf_type = XFS_LI_QUOTAOFF;
qf->qql_format.qf_flags = flags;
qf->qql_start_lip = start;
- return (qf);
+ return qf;
}
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 38e764146644..9a92407109a1 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -23,25 +23,18 @@
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_dir2.h"
#include "xfs_alloc.h"
-#include "xfs_dmapi.h"
#include "xfs_quota.h"
#include "xfs_mount.h"
#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
#include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
-#include "xfs_btree.h"
#include "xfs_ialloc.h"
#include "xfs_itable.h"
#include "xfs_rtalloc.h"
#include "xfs_error.h"
#include "xfs_bmap.h"
-#include "xfs_rw.h"
#include "xfs_attr.h"
#include "xfs_buf_item.h"
#include "xfs_trans_space.h"
@@ -69,7 +62,7 @@ STATIC void xfs_qm_list_destroy(xfs_dqlist_t *);
STATIC int xfs_qm_init_quotainos(xfs_mount_t *);
STATIC int xfs_qm_init_quotainfo(xfs_mount_t *);
-STATIC int xfs_qm_shake(int, gfp_t);
+STATIC int xfs_qm_shake(struct shrinker *, int, gfp_t);
static struct shrinker xfs_qm_shaker = {
.shrink = xfs_qm_shake,
@@ -249,8 +242,10 @@ xfs_qm_hold_quotafs_ref(
if (!xfs_Gqm) {
xfs_Gqm = xfs_Gqm_init();
- if (!xfs_Gqm)
+ if (!xfs_Gqm) {
+ mutex_unlock(&xfs_Gqm_lock);
return ENOMEM;
+ }
}
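Beyond the prototype churn, this hunk fixes a lock leak: the ENOMEM return from a failed xfs_Gqm_init() previously left xfs_Gqm_lock held, deadlocking the next caller. The corrected shape (the surrounding lock/unlock sit outside the hunk and are shown as presumed context):

	mutex_lock(&xfs_Gqm_lock);
	if (!xfs_Gqm) {
		xfs_Gqm = xfs_Gqm_init();
		if (!xfs_Gqm) {
			mutex_unlock(&xfs_Gqm_lock);	/* previously missing */
			return ENOMEM;
		}
	}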
/*
@@ -1495,7 +1490,7 @@ xfs_qm_dqiterate(
maxlblkcnt - lblkno,
XFS_BMAPI_METADATA,
NULL,
- 0, map, &nmaps, NULL, NULL);
+ 0, map, &nmaps, NULL);
xfs_iunlock(qip, XFS_ILOCK_SHARED);
if (error)
break;
@@ -1630,10 +1625,7 @@ xfs_qm_dqusage_adjust(
xfs_ino_t ino, /* inode number to get data for */
void __user *buffer, /* not used */
int ubsize, /* not used */
- void *private_data, /* not used */
- xfs_daddr_t bno, /* starting block of inode cluster */
int *ubused, /* not used */
- void *dip, /* on-disk inode pointer (not used) */
int *res) /* result code value */
{
xfs_inode_t *ip;
@@ -1658,7 +1650,7 @@ xfs_qm_dqusage_adjust(
* the case in all other instances. It's OK that we do this because
* quotacheck is done only at mount time.
*/
- if ((error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_EXCL, &ip, bno))) {
+ if ((error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_EXCL, &ip))) {
*res = BULKSTAT_RV_NOTHING;
return error;
}
@@ -1670,7 +1662,8 @@ xfs_qm_dqusage_adjust(
* making us disable quotas for the file system.
*/
if ((error = xfs_qm_dqget_noattach(ip, &udqp, &gdqp))) {
- xfs_iput(ip, XFS_ILOCK_EXCL);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ IRELE(ip);
*res = BULKSTAT_RV_GIVEUP;
return error;
}
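The xfs_iput() calls in these files are open-coded as an unlock plus a reference drop, presumably ahead of retiring the helper itself. Assumed equivalence:

	/* Presumed body of the retired helper: */
	static inline void xfs_iput(struct xfs_inode *ip, uint lock_flags)
	{
		xfs_iunlock(ip, lock_flags);
		IRELE(ip);
	}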
@@ -1683,7 +1676,8 @@ xfs_qm_dqusage_adjust(
* Walk thru the extent list and count the realtime blocks.
*/
if ((error = xfs_qm_get_rtblks(ip, &rtblks))) {
- xfs_iput(ip, XFS_ILOCK_EXCL);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ IRELE(ip);
if (udqp)
xfs_qm_dqput(udqp);
if (gdqp)
@@ -1794,12 +1788,13 @@ xfs_qm_quotacheck(
* Iterate thru all the inodes in the file system,
* adjusting the corresponding dquot counters in core.
*/
- if ((error = xfs_bulkstat(mp, &lastino, &count,
- xfs_qm_dqusage_adjust, NULL,
- structsz, NULL, BULKSTAT_FG_IGET, &done)))
+ error = xfs_bulkstat(mp, &lastino, &count,
+ xfs_qm_dqusage_adjust,
+ structsz, NULL, &done);
+ if (error)
break;
- } while (! done);
+ } while (!done);
/*
* We've made all the changes that we need to make incore.
@@ -1887,14 +1882,14 @@ xfs_qm_init_quotainos(
mp->m_sb.sb_uquotino != NULLFSINO) {
ASSERT(mp->m_sb.sb_uquotino > 0);
if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
- 0, 0, &uip, 0)))
+ 0, 0, &uip)))
return XFS_ERROR(error);
}
if (XFS_IS_OQUOTA_ON(mp) &&
mp->m_sb.sb_gquotino != NULLFSINO) {
ASSERT(mp->m_sb.sb_gquotino > 0);
if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino,
- 0, 0, &gip, 0))) {
+ 0, 0, &gip))) {
if (uip)
IRELE(uip);
return XFS_ERROR(error);
@@ -2117,7 +2112,10 @@ xfs_qm_shake_freelist(
*/
/* ARGSUSED */
STATIC int
-xfs_qm_shake(int nr_to_scan, gfp_t gfp_mask)
+xfs_qm_shake(
+ struct shrinker *shrink,
+ int nr_to_scan,
+ gfp_t gfp_mask)
{
int ndqused, nfree, n;
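xfs_qm_shake() gains a struct shrinker * argument to track the updated shrinker API; the registration itself is unchanged. A sketch (only .shrink is visible in this patch; anything else here is an assumption):

	static int xfs_qm_shake(struct shrinker *shrink, int nr_to_scan,
				gfp_t gfp_mask);

	static struct shrinker xfs_qm_shaker = {
		.shrink = xfs_qm_shake,
	};

	register_shrinker(&xfs_qm_shaker);	/* e.g. at quota-manager init */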
diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c
index 97b410c12794..bea02d786c5d 100644
--- a/fs/xfs/quota/xfs_qm_bhv.c
+++ b/fs/xfs/quota/xfs_qm_bhv.c
@@ -23,25 +23,15 @@
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_dir2.h"
#include "xfs_alloc.h"
-#include "xfs_dmapi.h"
#include "xfs_quota.h"
#include "xfs_mount.h"
#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
-#include "xfs_ialloc.h"
#include "xfs_itable.h"
-#include "xfs_btree.h"
#include "xfs_bmap.h"
#include "xfs_rtalloc.h"
#include "xfs_error.h"
-#include "xfs_rw.h"
#include "xfs_attr.h"
#include "xfs_buf_item.h"
#include "xfs_qm.h"
diff --git a/fs/xfs/quota/xfs_qm_stats.c b/fs/xfs/quota/xfs_qm_stats.c
index 3d1fc79532e2..8671a0b32644 100644
--- a/fs/xfs/quota/xfs_qm_stats.c
+++ b/fs/xfs/quota/xfs_qm_stats.c
@@ -23,25 +23,15 @@
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_dir2.h"
#include "xfs_alloc.h"
-#include "xfs_dmapi.h"
#include "xfs_quota.h"
#include "xfs_mount.h"
#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
-#include "xfs_ialloc.h"
#include "xfs_itable.h"
#include "xfs_bmap.h"
-#include "xfs_btree.h"
#include "xfs_rtalloc.h"
#include "xfs_error.h"
-#include "xfs_rw.h"
#include "xfs_attr.h"
#include "xfs_buf_item.h"
#include "xfs_qm.h"
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index 92b002f1805f..d257eb8557c4 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -26,25 +26,15 @@
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_dir2.h"
#include "xfs_alloc.h"
-#include "xfs_dmapi.h"
#include "xfs_quota.h"
#include "xfs_mount.h"
#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
-#include "xfs_ialloc.h"
#include "xfs_itable.h"
#include "xfs_bmap.h"
-#include "xfs_btree.h"
#include "xfs_rtalloc.h"
#include "xfs_error.h"
-#include "xfs_rw.h"
#include "xfs_attr.h"
#include "xfs_buf_item.h"
#include "xfs_utils.h"
@@ -248,40 +238,74 @@ out_unlock:
return error;
}
+STATIC int
+xfs_qm_scall_trunc_qfile(
+ struct xfs_mount *mp,
+ xfs_ino_t ino)
+{
+ struct xfs_inode *ip;
+ struct xfs_trans *tp;
+ int error;
+
+ if (ino == NULLFSINO)
+ return 0;
+
+ error = xfs_iget(mp, NULL, ino, 0, 0, &ip);
+ if (error)
+ return error;
+
+ xfs_ilock(ip, XFS_IOLOCK_EXCL);
+
+ tp = xfs_trans_alloc(mp, XFS_TRANS_TRUNCATE_FILE);
+ error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
+ XFS_TRANS_PERM_LOG_RES,
+ XFS_ITRUNCATE_LOG_COUNT);
+ if (error) {
+ xfs_trans_cancel(tp, 0);
+ xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+ goto out_put;
+ }
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip);
+
+ error = xfs_itruncate_finish(&tp, ip, 0, XFS_DATA_FORK, 1);
+ if (error) {
+ xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
+ XFS_TRANS_ABORT);
+ goto out_unlock;
+ }
+
+ xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+ error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+
+out_unlock:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+out_put:
+ IRELE(ip);
+ return error;
+}
+
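The new helper gives both quota inodes one truncation path: it runs a proper transaction with an XFS_ITRUNCATE_LOG_RES reservation instead of going through xfs_truncate_file(), and it handles the NULLFSINO case itself, which is what lets the caller below shed its per-inode checks. Usage, as in the rewritten caller:

	error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_uquotino);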
int
xfs_qm_scall_trunc_qfiles(
xfs_mount_t *mp,
uint flags)
{
int error = 0, error2 = 0;
- xfs_inode_t *qip;
if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0) {
qdprintk("qtrunc flags=%x m_qflags=%x\n", flags, mp->m_qflags);
return XFS_ERROR(EINVAL);
}
- if ((flags & XFS_DQ_USER) && mp->m_sb.sb_uquotino != NULLFSINO) {
- error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, 0, 0, &qip, 0);
- if (!error) {
- error = xfs_truncate_file(mp, qip);
- IRELE(qip);
- }
- }
-
- if ((flags & (XFS_DQ_GROUP|XFS_DQ_PROJ)) &&
- mp->m_sb.sb_gquotino != NULLFSINO) {
- error2 = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, 0, 0, &qip, 0);
- if (!error2) {
- error2 = xfs_truncate_file(mp, qip);
- IRELE(qip);
- }
- }
+ if (flags & XFS_DQ_USER)
+ error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_uquotino);
+ if (flags & (XFS_DQ_GROUP|XFS_DQ_PROJ))
+ error2 = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_gquotino);
return error ? error : error2;
}
-
/*
* Switch on (a given) quota enforcement for a filesystem. This takes
* effect immediately.
@@ -417,12 +441,12 @@ xfs_qm_scall_getqstat(
}
if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) {
if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
- 0, 0, &uip, 0) == 0)
+ 0, 0, &uip) == 0)
tempuqip = B_TRUE;
}
if (!gip && mp->m_sb.sb_gquotino != NULLFSINO) {
if (xfs_iget(mp, NULL, mp->m_sb.sb_gquotino,
- 0, 0, &gip, 0) == 0)
+ 0, 0, &gip) == 0)
tempgqip = B_TRUE;
}
if (uip) {
@@ -875,8 +899,9 @@ xfs_dqrele_inode(
xfs_qm_dqrele(ip->i_gdquot);
ip->i_gdquot = NULL;
}
- xfs_iput(ip, XFS_ILOCK_EXCL);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ IRELE(ip);
return 0;
}
@@ -1109,10 +1134,7 @@ xfs_qm_internalqcheck_adjust(
xfs_ino_t ino, /* inode number to get data for */
void __user *buffer, /* not used */
int ubsize, /* not used */
- void *private_data, /* not used */
- xfs_daddr_t bno, /* starting block of inode cluster */
int *ubused, /* not used */
- void *dip, /* not used */
int *res) /* bulkstat result code */
{
xfs_inode_t *ip;
@@ -1134,7 +1156,7 @@ xfs_qm_internalqcheck_adjust(
ipreleased = B_FALSE;
again:
lock_flags = XFS_ILOCK_SHARED;
- if ((error = xfs_iget(mp, NULL, ino, 0, lock_flags, &ip, bno))) {
+ if ((error = xfs_iget(mp, NULL, ino, 0, lock_flags, &ip))) {
*res = BULKSTAT_RV_NOTHING;
return (error);
}
@@ -1146,7 +1168,8 @@ xfs_qm_internalqcheck_adjust(
* of those now.
*/
if (! ipreleased) {
- xfs_iput(ip, lock_flags);
+ xfs_iunlock(ip, lock_flags);
+ IRELE(ip);
ipreleased = B_TRUE;
goto again;
}
@@ -1163,7 +1186,8 @@ xfs_qm_internalqcheck_adjust(
ASSERT(gd);
xfs_qm_internalqcheck_dqadjust(ip, gd);
}
- xfs_iput(ip, lock_flags);
+ xfs_iunlock(ip, lock_flags);
+ IRELE(ip);
*res = BULKSTAT_RV_DIDONE;
return (0);
}
@@ -1205,15 +1229,15 @@ xfs_qm_internalqcheck(
* Iterate thru all the inodes in the file system,
* adjusting the corresponding dquot counters
*/
- if ((error = xfs_bulkstat(mp, &lastino, &count,
- xfs_qm_internalqcheck_adjust, NULL,
- 0, NULL, BULKSTAT_FG_IGET, &done))) {
+ error = xfs_bulkstat(mp, &lastino, &count,
+ xfs_qm_internalqcheck_adjust,
+ 0, NULL, &done);
+ if (error) {
+ cmn_err(CE_DEBUG, "Bulkstat returned error 0x%x", error);
break;
}
- } while (! done);
- if (error) {
- cmn_err(CE_DEBUG, "Bulkstat returned error 0x%x", error);
- }
+ } while (!done);
+
cmn_err(CE_DEBUG, "Checking results against system dquots");
for (i = 0; i < qmtest_hashmask; i++) {
xfs_dqtest_t *d, *n;
diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/quota/xfs_trans_dquot.c
index 061d827da33c..7de91d1b75c0 100644
--- a/fs/xfs/quota/xfs_trans_dquot.c
+++ b/fs/xfs/quota/xfs_trans_dquot.c
@@ -23,25 +23,15 @@
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_dir2.h"
#include "xfs_alloc.h"
-#include "xfs_dmapi.h"
#include "xfs_quota.h"
#include "xfs_mount.h"
#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
-#include "xfs_ialloc.h"
#include "xfs_itable.h"
-#include "xfs_btree.h"
#include "xfs_bmap.h"
#include "xfs_rtalloc.h"
#include "xfs_error.h"
-#include "xfs_rw.h"
#include "xfs_attr.h"
#include "xfs_buf_item.h"
#include "xfs_trans_priv.h"
@@ -59,16 +49,14 @@ xfs_trans_dqjoin(
xfs_trans_t *tp,
xfs_dquot_t *dqp)
{
- xfs_dq_logitem_t *lp = &dqp->q_logitem;
-
ASSERT(dqp->q_transp != tp);
ASSERT(XFS_DQ_IS_LOCKED(dqp));
- ASSERT(lp->qli_dquot == dqp);
+ ASSERT(dqp->q_logitem.qli_dquot == dqp);
/*
* Get a log_item_desc to point at the new item.
*/
- (void) xfs_trans_add_item(tp, (xfs_log_item_t*)(lp));
+ xfs_trans_add_item(tp, &dqp->q_logitem.qli_item);
/*
* Initialize i_transp so we can later determine if this dquot is
@@ -93,16 +81,11 @@ xfs_trans_log_dquot(
xfs_trans_t *tp,
xfs_dquot_t *dqp)
{
- xfs_log_item_desc_t *lidp;
-
ASSERT(dqp->q_transp == tp);
ASSERT(XFS_DQ_IS_LOCKED(dqp));
- lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)(&dqp->q_logitem));
- ASSERT(lidp != NULL);
-
tp->t_flags |= XFS_TRANS_DIRTY;
- lidp->lid_flags |= XFS_LID_DIRTY;
+ dqp->q_logitem.qli_item.li_desc->lid_flags |= XFS_LID_DIRTY;
}
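xfs_trans_find_item() drops out because the log item now records its own descriptor once xfs_trans_add_item() has run, turning the old linear search into a pointer walk:

	/* Before (sketch): O(n) scan of the transaction's item list. */
	lidp = xfs_trans_find_item(tp, lip);
	lidp->lid_flags |= XFS_LID_DIRTY;

	/* After: the descriptor is cached on the item at add time. */
	lip->li_desc->lid_flags |= XFS_LID_DIRTY;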
/*
@@ -874,9 +857,8 @@ xfs_trans_get_qoff_item(
/*
* Get a log_item_desc to point at the new item.
*/
- (void) xfs_trans_add_item(tp, (xfs_log_item_t*)q);
-
- return (q);
+ xfs_trans_add_item(tp, &q->qql_item);
+ return q;
}
@@ -890,13 +872,8 @@ xfs_trans_log_quotaoff_item(
xfs_trans_t *tp,
xfs_qoff_logitem_t *qlp)
{
- xfs_log_item_desc_t *lidp;
-
- lidp = xfs_trans_find_item(tp, (xfs_log_item_t *)qlp);
- ASSERT(lidp != NULL);
-
tp->t_flags |= XFS_TRANS_DIRTY;
- lidp->lid_flags |= XFS_LID_DIRTY;
+ qlp->qql_item.li_desc->lid_flags |= XFS_LID_DIRTY;
}
STATIC void
diff --git a/fs/xfs/support/debug.c b/fs/xfs/support/debug.c
index 3f3610a7ee05..975aa10e1a47 100644
--- a/fs/xfs/support/debug.c
+++ b/fs/xfs/support/debug.c
@@ -22,7 +22,6 @@
#include "xfs_sb.h"
#include "xfs_inum.h"
#include "xfs_ag.h"
-#include "xfs_dmapi.h"
#include "xfs_mount.h"
#include "xfs_error.h"
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 401f364ad36c..4917d4eed4ed 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -227,7 +227,6 @@ typedef struct xfs_perag {
atomic_t pagf_fstrms; /* # of filestreams active in this AG */
- int pag_ici_init; /* incore inode cache initialised */
rwlock_t pag_ici_lock; /* incore inode lock */
struct radix_tree_root pag_ici_root; /* incore inode cache root */
int pag_ici_reclaimable; /* reclaimable inodes */
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index a7fbe8a99b12..af168faccc7a 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -24,18 +24,13 @@
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
#include "xfs_mount.h"
#include "xfs_bmap_btree.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
-#include "xfs_ialloc.h"
#include "xfs_alloc.h"
#include "xfs_error.h"
#include "xfs_trace.h"
@@ -688,8 +683,6 @@ xfs_alloc_ag_vextent_near(
xfs_agblock_t ltbno; /* start bno of left side entry */
xfs_agblock_t ltbnoa; /* aligned ... */
xfs_extlen_t ltdiff; /* difference to left side entry */
- /*REFERENCED*/
- xfs_agblock_t ltend; /* end bno of left side entry */
xfs_extlen_t ltlen; /* length of left side entry */
xfs_extlen_t ltlena; /* aligned ... */
xfs_agblock_t ltnew; /* useful start bno of left side */
@@ -814,8 +807,7 @@ xfs_alloc_ag_vextent_near(
if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i)))
goto error0;
XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- ltend = ltbno + ltlen;
- ASSERT(ltend <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
+ ASSERT(ltbno + ltlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
args->len = blen;
if (!xfs_alloc_fix_minleft(args)) {
xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
@@ -828,7 +820,7 @@ xfs_alloc_ag_vextent_near(
*/
args->agbno = bnew;
ASSERT(bnew >= ltbno);
- ASSERT(bnew + blen <= ltend);
+ ASSERT(bnew + blen <= ltbno + ltlen);
/*
* Set up a cursor for the by-bno tree.
*/
@@ -1157,7 +1149,6 @@ xfs_alloc_ag_vextent_near(
/*
* Fix up the length and compute the useful address.
*/
- ltend = ltbno + ltlen;
args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
xfs_alloc_fix_len(args);
if (!xfs_alloc_fix_minleft(args)) {
@@ -1170,7 +1161,7 @@ xfs_alloc_ag_vextent_near(
(void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment, ltbno,
ltlen, &ltnew);
ASSERT(ltnew >= ltbno);
- ASSERT(ltnew + rlen <= ltend);
+ ASSERT(ltnew + rlen <= ltbno + ltlen);
ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
args->agbno = ltnew;
if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen,
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 6d05199b667c..895009a97271 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -27,16 +27,16 @@ struct xfs_busy_extent;
/*
* Freespace allocation types. Argument to xfs_alloc_[v]extent.
*/
-typedef enum xfs_alloctype
-{
- XFS_ALLOCTYPE_ANY_AG, /* allocate anywhere, use rotor */
- XFS_ALLOCTYPE_FIRST_AG, /* ... start at ag 0 */
- XFS_ALLOCTYPE_START_AG, /* anywhere, start in this a.g. */
- XFS_ALLOCTYPE_THIS_AG, /* anywhere in this a.g. */
- XFS_ALLOCTYPE_START_BNO, /* near this block else anywhere */
- XFS_ALLOCTYPE_NEAR_BNO, /* in this a.g. and near this block */
- XFS_ALLOCTYPE_THIS_BNO /* at exactly this block */
-} xfs_alloctype_t;
+#define XFS_ALLOCTYPE_ANY_AG 0x01 /* allocate anywhere, use rotor */
+#define XFS_ALLOCTYPE_FIRST_AG 0x02 /* ... start at ag 0 */
+#define XFS_ALLOCTYPE_START_AG 0x04 /* anywhere, start in this a.g. */
+#define XFS_ALLOCTYPE_THIS_AG 0x08 /* anywhere in this a.g. */
+#define XFS_ALLOCTYPE_START_BNO 0x10 /* near this block else anywhere */
+#define XFS_ALLOCTYPE_NEAR_BNO 0x20 /* in this a.g. and near this block */
+#define XFS_ALLOCTYPE_THIS_BNO 0x40 /* at exactly this block */
+
+/* this should become an enum again when the tracing code is fixed */
+typedef unsigned int xfs_alloctype_t;
#define XFS_ALLOC_TYPES \
{ XFS_ALLOCTYPE_ANY_AG, "ANY_AG" }, \
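Per the retained comment, the enum is demoted to defines only until the tracing code can cope; the switch to distinct bit values suggests a table-driven trace decoder is the consumer. Illustrative only (the actual tracepoint definitions live in xfs_trace.h):

	__print_flags(alloctype, "|", XFS_ALLOC_TYPES)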
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index 83f494218759..97f7328967fd 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -24,19 +24,14 @@
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
#include "xfs_mount.h"
#include "xfs_bmap_btree.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_btree_trace.h"
-#include "xfs_ialloc.h"
#include "xfs_alloc.h"
#include "xfs_error.h"
#include "xfs_trace.h"
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index b9c196a53c42..c2568242a901 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -25,19 +25,13 @@
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
#include "xfs_mount.h"
#include "xfs_da_btree.h"
#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
#include "xfs_attr_sf.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_alloc.h"
-#include "xfs_btree.h"
#include "xfs_inode_item.h"
#include "xfs_bmap.h"
#include "xfs_attr.h"
@@ -325,8 +319,7 @@ xfs_attr_set_int(
return (error);
}
- xfs_trans_ijoin(args.trans, dp, XFS_ILOCK_EXCL);
- xfs_trans_ihold(args.trans, dp);
+ xfs_trans_ijoin(args.trans, dp);
/*
* If the attribute list is non-existent or a shortform list,
@@ -396,10 +389,8 @@ xfs_attr_set_int(
* bmap_finish() may have committed the last trans and started
* a new one. We need the inode to be in all transactions.
*/
- if (committed) {
- xfs_trans_ijoin(args.trans, dp, XFS_ILOCK_EXCL);
- xfs_trans_ihold(args.trans, dp);
- }
+ if (committed)
+ xfs_trans_ijoin(args.trans, dp);
/*
* Commit the leaf transformation. We'll need another (linked)
@@ -544,8 +535,7 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags)
* No need to make quota reservations here. We expect to release some
 * blocks, not allocate, in the common case.
*/
- xfs_trans_ijoin(args.trans, dp, XFS_ILOCK_EXCL);
- xfs_trans_ihold(args.trans, dp);
+ xfs_trans_ijoin(args.trans, dp);
/*
* Decide on what work routines to call based on the inode size.
@@ -821,8 +811,7 @@ xfs_attr_inactive(xfs_inode_t *dp)
* No need to make quota reservations here. We expect to release some
* blocks, not allocate, in the common case.
*/
- xfs_trans_ijoin(trans, dp, XFS_ILOCK_EXCL);
- xfs_trans_ihold(trans, dp);
+ xfs_trans_ijoin(trans, dp);
/*
* Decide on what work routines to call based on the inode size.
@@ -981,10 +970,8 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
* bmap_finish() may have committed the last trans and started
* a new one. We need the inode to be in all transactions.
*/
- if (committed) {
- xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
- xfs_trans_ihold(args->trans, dp);
- }
+ if (committed)
+ xfs_trans_ijoin(args->trans, dp);
/*
* Commit the current trans (including the inode) and start
@@ -1085,10 +1072,8 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
* and started a new one. We need the inode to be
* in all transactions.
*/
- if (committed) {
- xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
- xfs_trans_ihold(args->trans, dp);
- }
+ if (committed)
+ xfs_trans_ijoin(args->trans, dp);
} else
xfs_da_buf_done(bp);
@@ -1161,10 +1146,8 @@ xfs_attr_leaf_removename(xfs_da_args_t *args)
* bmap_finish() may have committed the last trans and started
* a new one. We need the inode to be in all transactions.
*/
- if (committed) {
- xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
- xfs_trans_ihold(args->trans, dp);
- }
+ if (committed)
+ xfs_trans_ijoin(args->trans, dp);
} else
xfs_da_buf_done(bp);
return(0);
@@ -1317,10 +1300,8 @@ restart:
* and started a new one. We need the inode to be
* in all transactions.
*/
- if (committed) {
- xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
- xfs_trans_ihold(args->trans, dp);
- }
+ if (committed)
+ xfs_trans_ijoin(args->trans, dp);
/*
* Commit the node conversion and start the next
@@ -1356,10 +1337,8 @@ restart:
* bmap_finish() may have committed the last trans and started
* a new one. We need the inode to be in all transactions.
*/
- if (committed) {
- xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
- xfs_trans_ihold(args->trans, dp);
- }
+ if (committed)
+ xfs_trans_ijoin(args->trans, dp);
} else {
/*
* Addition succeeded, update Btree hashvals.
@@ -1470,10 +1449,8 @@ restart:
* and started a new one. We need the inode to be
* in all transactions.
*/
- if (committed) {
- xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
- xfs_trans_ihold(args->trans, dp);
- }
+ if (committed)
+ xfs_trans_ijoin(args->trans, dp);
}
/*
@@ -1604,10 +1581,8 @@ xfs_attr_node_removename(xfs_da_args_t *args)
* bmap_finish() may have committed the last trans and started
* a new one. We need the inode to be in all transactions.
*/
- if (committed) {
- xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
- xfs_trans_ihold(args->trans, dp);
- }
+ if (committed)
+ xfs_trans_ijoin(args->trans, dp);
/*
* Commit the Btree join operation and start a new trans.
@@ -1658,10 +1633,8 @@ xfs_attr_node_removename(xfs_da_args_t *args)
* and started a new one. We need the inode to be
* in all transactions.
*/
- if (committed) {
- xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
- xfs_trans_ihold(args->trans, dp);
- }
+ if (committed)
+ xfs_trans_ijoin(args->trans, dp);
} else
xfs_da_brelse(args->trans, bp);
}
@@ -2004,7 +1977,7 @@ xfs_attr_rmtval_get(xfs_da_args_t *args)
error = xfs_bmapi(args->trans, args->dp, (xfs_fileoff_t)lblkno,
args->rmtblkcnt,
XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
- NULL, 0, map, &nmap, NULL, NULL);
+ NULL, 0, map, &nmap, NULL);
if (error)
return(error);
ASSERT(nmap >= 1);
@@ -2083,7 +2056,7 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA |
XFS_BMAPI_WRITE,
args->firstblock, args->total, &map, &nmap,
- args->flist, NULL);
+ args->flist);
if (!error) {
error = xfs_bmap_finish(&args->trans, args->flist,
&committed);
@@ -2099,10 +2072,8 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
* bmap_finish() may have committed the last trans and started
* a new one. We need the inode to be in all transactions.
*/
- if (committed) {
- xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
- xfs_trans_ihold(args->trans, dp);
- }
+ if (committed)
+ xfs_trans_ijoin(args->trans, dp);
ASSERT(nmap == 1);
ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
@@ -2136,7 +2107,7 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
args->rmtblkcnt,
XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
args->firstblock, 0, &map, &nmap,
- NULL, NULL);
+ NULL);
if (error) {
return(error);
}
@@ -2201,7 +2172,7 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
args->rmtblkcnt,
XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
args->firstblock, 0, &map, &nmap,
- args->flist, NULL);
+ args->flist);
if (error) {
return(error);
}
@@ -2239,7 +2210,7 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt,
XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
1, args->firstblock, args->flist,
- NULL, &done);
+ &done);
if (!error) {
error = xfs_bmap_finish(&args->trans, args->flist,
&committed);
@@ -2255,10 +2226,8 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
* bmap_finish() may have committed the last trans and started
* a new one. We need the inode to be in all transactions.
*/
- if (committed) {
- xfs_trans_ijoin(args->trans, args->dp, XFS_ILOCK_EXCL);
- xfs_trans_ihold(args->trans, args->dp);
- }
+ if (committed)
+ xfs_trans_ijoin(args->trans, args->dp);
/*
* Close out trans and start the next one in the chain.
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index a90ce74fc256..a6cff8edcdb6 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -24,8 +24,6 @@
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
#include "xfs_mount.h"
#include "xfs_da_btree.h"
#include "xfs_bmap_btree.h"
@@ -33,7 +31,6 @@
#include "xfs_ialloc_btree.h"
#include "xfs_alloc.h"
#include "xfs_btree.h"
-#include "xfs_dir2_sf.h"
#include "xfs_attr_sf.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
@@ -2931,7 +2928,7 @@ xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp,
nmap = 1;
error = xfs_bmapi(*trans, dp, (xfs_fileoff_t)tblkno, tblkcnt,
XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
- NULL, 0, &map, &nmap, NULL, NULL);
+ NULL, 0, &map, &nmap, NULL);
if (error) {
return(error);
}
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 99587ded043f..23f14e595c18 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -30,13 +30,10 @@
#include "xfs_alloc_btree.h"
#include "xfs_ialloc_btree.h"
#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
-#include "xfs_dmapi.h"
#include "xfs_mount.h"
-#include "xfs_ialloc.h"
#include "xfs_itable.h"
#include "xfs_dir2_data.h"
#include "xfs_dir2_leaf.h"
@@ -104,7 +101,6 @@ xfs_bmap_add_extent(
xfs_fsblock_t *first, /* pointer to firstblock variable */
xfs_bmap_free_t *flist, /* list of extents to be freed */
int *logflagsp, /* inode logging flags */
- xfs_extdelta_t *delta, /* Change made to incore extents */
int whichfork, /* data or attr fork */
int rsvd); /* OK to allocate reserved blocks */
@@ -122,7 +118,6 @@ xfs_bmap_add_extent_delay_real(
xfs_fsblock_t *first, /* pointer to firstblock variable */
xfs_bmap_free_t *flist, /* list of extents to be freed */
int *logflagsp, /* inode logging flags */
- xfs_extdelta_t *delta, /* Change made to incore extents */
int rsvd); /* OK to allocate reserved blocks */
/*
@@ -135,7 +130,6 @@ xfs_bmap_add_extent_hole_delay(
xfs_extnum_t idx, /* extent number to update/insert */
xfs_bmbt_irec_t *new, /* new data to add to file extents */
int *logflagsp,/* inode logging flags */
- xfs_extdelta_t *delta, /* Change made to incore extents */
int rsvd); /* OK to allocate reserved blocks */
/*
@@ -149,7 +143,6 @@ xfs_bmap_add_extent_hole_real(
xfs_btree_cur_t *cur, /* if null, not a btree */
xfs_bmbt_irec_t *new, /* new data to add to file extents */
int *logflagsp, /* inode logging flags */
- xfs_extdelta_t *delta, /* Change made to incore extents */
int whichfork); /* data or attr fork */
/*
@@ -162,8 +155,7 @@ xfs_bmap_add_extent_unwritten_real(
xfs_extnum_t idx, /* extent number to update/insert */
xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
xfs_bmbt_irec_t *new, /* new data to add to file extents */
- int *logflagsp, /* inode logging flags */
- xfs_extdelta_t *delta); /* Change made to incore extents */
+ int *logflagsp); /* inode logging flags */
/*
* xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file.
@@ -200,7 +192,6 @@ xfs_bmap_del_extent(
xfs_btree_cur_t *cur, /* if null, not a btree */
xfs_bmbt_irec_t *new, /* new data to add to file extents */
int *logflagsp,/* inode logging flags */
- xfs_extdelta_t *delta, /* Change made to incore extents */
int whichfork, /* data or attr fork */
int rsvd); /* OK to allocate reserved blocks */
@@ -489,7 +480,6 @@ xfs_bmap_add_extent(
xfs_fsblock_t *first, /* pointer to firstblock variable */
xfs_bmap_free_t *flist, /* list of extents to be freed */
int *logflagsp, /* inode logging flags */
- xfs_extdelta_t *delta, /* Change made to incore extents */
int whichfork, /* data or attr fork */
int rsvd) /* OK to use reserved data blocks */
{
@@ -524,15 +514,6 @@ xfs_bmap_add_extent(
logflags = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
} else
logflags = 0;
- /* DELTA: single new extent */
- if (delta) {
- if (delta->xed_startoff > new->br_startoff)
- delta->xed_startoff = new->br_startoff;
- if (delta->xed_blockcount <
- new->br_startoff + new->br_blockcount)
- delta->xed_blockcount = new->br_startoff +
- new->br_blockcount;
- }
}
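This begins a file-wide removal of the xfs_extdelta_t out-parameter, the DELTA bookkeeping that reported the range of in-core extents a call had touched. Every function in the chain loses one argument; for example, the mapping entry point is now called as:

	/* Sketch matching the new prototype in this patch: */
	error = xfs_bmapi(tp, ip, bno, len, flags, &firstblock, total,
			  mval, &nmap, flist);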
/*
* Any kind of new delayed allocation goes here.
@@ -542,7 +523,7 @@ xfs_bmap_add_extent(
ASSERT((cur->bc_private.b.flags &
XFS_BTCUR_BPRV_WASDEL) == 0);
if ((error = xfs_bmap_add_extent_hole_delay(ip, idx, new,
- &logflags, delta, rsvd)))
+ &logflags, rsvd)))
goto done;
}
/*
@@ -553,7 +534,7 @@ xfs_bmap_add_extent(
ASSERT((cur->bc_private.b.flags &
XFS_BTCUR_BPRV_WASDEL) == 0);
if ((error = xfs_bmap_add_extent_hole_real(ip, idx, cur, new,
- &logflags, delta, whichfork)))
+ &logflags, whichfork)))
goto done;
} else {
xfs_bmbt_irec_t prev; /* old extent at offset idx */
@@ -578,17 +559,17 @@ xfs_bmap_add_extent(
XFS_BTCUR_BPRV_WASDEL);
if ((error = xfs_bmap_add_extent_delay_real(ip,
idx, &cur, new, &da_new, first, flist,
- &logflags, delta, rsvd)))
+ &logflags, rsvd)))
goto done;
} else if (new->br_state == XFS_EXT_NORM) {
ASSERT(new->br_state == XFS_EXT_NORM);
if ((error = xfs_bmap_add_extent_unwritten_real(
- ip, idx, &cur, new, &logflags, delta)))
+ ip, idx, &cur, new, &logflags)))
goto done;
} else {
ASSERT(new->br_state == XFS_EXT_UNWRITTEN);
if ((error = xfs_bmap_add_extent_unwritten_real(
- ip, idx, &cur, new, &logflags, delta)))
+ ip, idx, &cur, new, &logflags)))
goto done;
}
ASSERT(*curp == cur || *curp == NULL);
@@ -601,7 +582,7 @@ xfs_bmap_add_extent(
ASSERT((cur->bc_private.b.flags &
XFS_BTCUR_BPRV_WASDEL) == 0);
if ((error = xfs_bmap_add_extent_hole_real(ip, idx, cur,
- new, &logflags, delta, whichfork)))
+ new, &logflags, whichfork)))
goto done;
}
}
@@ -666,7 +647,6 @@ xfs_bmap_add_extent_delay_real(
xfs_fsblock_t *first, /* pointer to firstblock variable */
xfs_bmap_free_t *flist, /* list of extents to be freed */
int *logflagsp, /* inode logging flags */
- xfs_extdelta_t *delta, /* Change made to incore extents */
int rsvd) /* OK to use reserved data block allocation */
{
xfs_btree_cur_t *cur; /* btree cursor */
@@ -797,11 +777,6 @@ xfs_bmap_add_extent_delay_real(
goto done;
}
*dnew = 0;
- /* DELTA: Three in-core extents are replaced by one. */
- temp = LEFT.br_startoff;
- temp2 = LEFT.br_blockcount +
- PREV.br_blockcount +
- RIGHT.br_blockcount;
break;
case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
@@ -832,10 +807,6 @@ xfs_bmap_add_extent_delay_real(
goto done;
}
*dnew = 0;
- /* DELTA: Two in-core extents are replaced by one. */
- temp = LEFT.br_startoff;
- temp2 = LEFT.br_blockcount +
- PREV.br_blockcount;
break;
case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
@@ -867,10 +838,6 @@ xfs_bmap_add_extent_delay_real(
goto done;
}
*dnew = 0;
- /* DELTA: Two in-core extents are replaced by one. */
- temp = PREV.br_startoff;
- temp2 = PREV.br_blockcount +
- RIGHT.br_blockcount;
break;
case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING:
@@ -900,9 +867,6 @@ xfs_bmap_add_extent_delay_real(
XFS_WANT_CORRUPTED_GOTO(i == 1, done);
}
*dnew = 0;
- /* DELTA: The in-core extent described by new changed type. */
- temp = new->br_startoff;
- temp2 = new->br_blockcount;
break;
case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG:
@@ -942,10 +906,6 @@ xfs_bmap_add_extent_delay_real(
xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
*dnew = temp;
- /* DELTA: The boundary between two in-core extents moved. */
- temp = LEFT.br_startoff;
- temp2 = LEFT.br_blockcount +
- PREV.br_blockcount;
break;
case BMAP_LEFT_FILLING:
@@ -990,9 +950,6 @@ xfs_bmap_add_extent_delay_real(
xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
trace_xfs_bmap_post_update(ip, idx + 1, state, _THIS_IP_);
*dnew = temp;
- /* DELTA: One in-core extent is split in two. */
- temp = PREV.br_startoff;
- temp2 = PREV.br_blockcount;
break;
case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
@@ -1031,10 +988,6 @@ xfs_bmap_add_extent_delay_real(
xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
*dnew = temp;
- /* DELTA: The boundary between two in-core extents moved. */
- temp = PREV.br_startoff;
- temp2 = PREV.br_blockcount +
- RIGHT.br_blockcount;
break;
case BMAP_RIGHT_FILLING:
@@ -1078,9 +1031,6 @@ xfs_bmap_add_extent_delay_real(
xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
*dnew = temp;
- /* DELTA: One in-core extent is split in two. */
- temp = PREV.br_startoff;
- temp2 = PREV.br_blockcount;
break;
case 0:
@@ -1161,9 +1111,6 @@ xfs_bmap_add_extent_delay_real(
nullstartblock((int)temp2));
trace_xfs_bmap_post_update(ip, idx + 2, state, _THIS_IP_);
*dnew = temp + temp2;
- /* DELTA: One in-core extent is split in three. */
- temp = PREV.br_startoff;
- temp2 = PREV.br_blockcount;
break;
case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
@@ -1179,13 +1126,6 @@ xfs_bmap_add_extent_delay_real(
ASSERT(0);
}
*curp = cur;
- if (delta) {
- temp2 += temp;
- if (delta->xed_startoff > temp)
- delta->xed_startoff = temp;
- if (delta->xed_blockcount < temp2)
- delta->xed_blockcount = temp2;
- }
done:
*logflagsp = rval;
return error;
@@ -1204,8 +1144,7 @@ xfs_bmap_add_extent_unwritten_real(
xfs_extnum_t idx, /* extent number to update/insert */
xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
xfs_bmbt_irec_t *new, /* new data to add to file extents */
- int *logflagsp, /* inode logging flags */
- xfs_extdelta_t *delta) /* Change made to incore extents */
+ int *logflagsp) /* inode logging flags */
{
xfs_btree_cur_t *cur; /* btree cursor */
xfs_bmbt_rec_host_t *ep; /* extent entry for idx */
@@ -1219,8 +1158,6 @@ xfs_bmap_add_extent_unwritten_real(
/* left is 0, right is 1, prev is 2 */
int rval=0; /* return value (logging flags) */
int state = 0;/* state bits, accessed thru macros */
- xfs_filblks_t temp=0;
- xfs_filblks_t temp2=0;
#define LEFT r[0]
#define RIGHT r[1]
@@ -1341,11 +1278,6 @@ xfs_bmap_add_extent_unwritten_real(
RIGHT.br_blockcount, LEFT.br_state)))
goto done;
}
- /* DELTA: Three in-core extents are replaced by one. */
- temp = LEFT.br_startoff;
- temp2 = LEFT.br_blockcount +
- PREV.br_blockcount +
- RIGHT.br_blockcount;
break;
case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
@@ -1382,10 +1314,6 @@ xfs_bmap_add_extent_unwritten_real(
LEFT.br_state)))
goto done;
}
- /* DELTA: Two in-core extents are replaced by one. */
- temp = LEFT.br_startoff;
- temp2 = LEFT.br_blockcount +
- PREV.br_blockcount;
break;
case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
@@ -1422,10 +1350,6 @@ xfs_bmap_add_extent_unwritten_real(
newext)))
goto done;
}
- /* DELTA: Two in-core extents are replaced by one. */
- temp = PREV.br_startoff;
- temp2 = PREV.br_blockcount +
- RIGHT.br_blockcount;
break;
case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING:
@@ -1453,9 +1377,6 @@ xfs_bmap_add_extent_unwritten_real(
newext)))
goto done;
}
- /* DELTA: The in-core extent described by new changed type. */
- temp = new->br_startoff;
- temp2 = new->br_blockcount;
break;
case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG:
@@ -1501,10 +1422,6 @@ xfs_bmap_add_extent_unwritten_real(
LEFT.br_state))
goto done;
}
- /* DELTA: The boundary between two in-core extents moved. */
- temp = LEFT.br_startoff;
- temp2 = LEFT.br_blockcount +
- PREV.br_blockcount;
break;
case BMAP_LEFT_FILLING:
@@ -1544,9 +1461,6 @@ xfs_bmap_add_extent_unwritten_real(
goto done;
XFS_WANT_CORRUPTED_GOTO(i == 1, done);
}
- /* DELTA: One in-core extent is split in two. */
- temp = PREV.br_startoff;
- temp2 = PREV.br_blockcount;
break;
case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
@@ -1587,10 +1501,6 @@ xfs_bmap_add_extent_unwritten_real(
newext)))
goto done;
}
- /* DELTA: The boundary between two in-core extents moved. */
- temp = PREV.br_startoff;
- temp2 = PREV.br_blockcount +
- RIGHT.br_blockcount;
break;
case BMAP_RIGHT_FILLING:
@@ -1630,9 +1540,6 @@ xfs_bmap_add_extent_unwritten_real(
goto done;
XFS_WANT_CORRUPTED_GOTO(i == 1, done);
}
- /* DELTA: One in-core extent is split in two. */
- temp = PREV.br_startoff;
- temp2 = PREV.br_blockcount;
break;
case 0:
@@ -1692,9 +1599,6 @@ xfs_bmap_add_extent_unwritten_real(
goto done;
XFS_WANT_CORRUPTED_GOTO(i == 1, done);
}
- /* DELTA: One in-core extent is split in three. */
- temp = PREV.br_startoff;
- temp2 = PREV.br_blockcount;
break;
case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
@@ -1710,13 +1614,6 @@ xfs_bmap_add_extent_unwritten_real(
ASSERT(0);
}
*curp = cur;
- if (delta) {
- temp2 += temp;
- if (delta->xed_startoff > temp)
- delta->xed_startoff = temp;
- if (delta->xed_blockcount < temp2)
- delta->xed_blockcount = temp2;
- }
done:
*logflagsp = rval;
return error;
@@ -1736,7 +1633,6 @@ xfs_bmap_add_extent_hole_delay(
xfs_extnum_t idx, /* extent number to update/insert */
xfs_bmbt_irec_t *new, /* new data to add to file extents */
int *logflagsp, /* inode logging flags */
- xfs_extdelta_t *delta, /* Change made to incore extents */
int rsvd) /* OK to allocate reserved blocks */
{
xfs_bmbt_rec_host_t *ep; /* extent record for idx */
@@ -1747,7 +1643,6 @@ xfs_bmap_add_extent_hole_delay(
xfs_bmbt_irec_t right; /* right neighbor extent entry */
int state; /* state bits, accessed thru macros */
xfs_filblks_t temp=0; /* temp for indirect calculations */
- xfs_filblks_t temp2=0;
ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
ep = xfs_iext_get_ext(ifp, idx);
@@ -1819,9 +1714,6 @@ xfs_bmap_add_extent_hole_delay(
xfs_iext_remove(ip, idx, 1, state);
ip->i_df.if_lastex = idx - 1;
- /* DELTA: Two in-core extents were replaced by one. */
- temp2 = temp;
- temp = left.br_startoff;
break;
case BMAP_LEFT_CONTIG:
@@ -1841,9 +1733,6 @@ xfs_bmap_add_extent_hole_delay(
trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
ip->i_df.if_lastex = idx - 1;
- /* DELTA: One in-core extent grew into a hole. */
- temp2 = temp;
- temp = left.br_startoff;
break;
case BMAP_RIGHT_CONTIG:
@@ -1862,9 +1751,6 @@ xfs_bmap_add_extent_hole_delay(
trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
ip->i_df.if_lastex = idx;
- /* DELTA: One in-core extent grew into a hole. */
- temp2 = temp;
- temp = new->br_startoff;
break;
case 0:
@@ -1876,9 +1762,6 @@ xfs_bmap_add_extent_hole_delay(
oldlen = newlen = 0;
xfs_iext_insert(ip, idx, 1, new, state);
ip->i_df.if_lastex = idx;
- /* DELTA: A new in-core extent was added in a hole. */
- temp2 = new->br_blockcount;
- temp = new->br_startoff;
break;
}
if (oldlen != newlen) {
@@ -1889,13 +1772,6 @@ xfs_bmap_add_extent_hole_delay(
* Nothing to do for disk quota accounting here.
*/
}
- if (delta) {
- temp2 += temp;
- if (delta->xed_startoff > temp)
- delta->xed_startoff = temp;
- if (delta->xed_blockcount < temp2)
- delta->xed_blockcount = temp2;
- }
*logflagsp = 0;
return 0;
}
@@ -1911,7 +1787,6 @@ xfs_bmap_add_extent_hole_real(
xfs_btree_cur_t *cur, /* if null, not a btree */
xfs_bmbt_irec_t *new, /* new data to add to file extents */
int *logflagsp, /* inode logging flags */
- xfs_extdelta_t *delta, /* Change made to incore extents */
int whichfork) /* data or attr fork */
{
xfs_bmbt_rec_host_t *ep; /* pointer to extent entry ins. point */
@@ -1922,8 +1797,6 @@ xfs_bmap_add_extent_hole_real(
xfs_bmbt_irec_t right; /* right neighbor extent entry */
int rval=0; /* return value (logging flags) */
int state; /* state bits, accessed thru macros */
- xfs_filblks_t temp=0;
- xfs_filblks_t temp2=0;
ifp = XFS_IFORK_PTR(ip, whichfork);
ASSERT(idx <= ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t));
@@ -2020,11 +1893,6 @@ xfs_bmap_add_extent_hole_real(
left.br_state)))
goto done;
}
- /* DELTA: Two in-core extents were replaced by one. */
- temp = left.br_startoff;
- temp2 = left.br_blockcount +
- new->br_blockcount +
- right.br_blockcount;
break;
case BMAP_LEFT_CONTIG:
@@ -2056,10 +1924,6 @@ xfs_bmap_add_extent_hole_real(
left.br_state)))
goto done;
}
- /* DELTA: One in-core extent grew. */
- temp = left.br_startoff;
- temp2 = left.br_blockcount +
- new->br_blockcount;
break;
case BMAP_RIGHT_CONTIG:
@@ -2092,10 +1956,6 @@ xfs_bmap_add_extent_hole_real(
right.br_state)))
goto done;
}
- /* DELTA: One in-core extent grew. */
- temp = new->br_startoff;
- temp2 = new->br_blockcount +
- right.br_blockcount;
break;
case 0:
@@ -2123,18 +1983,8 @@ xfs_bmap_add_extent_hole_real(
goto done;
XFS_WANT_CORRUPTED_GOTO(i == 1, done);
}
- /* DELTA: A new extent was added in a hole. */
- temp = new->br_startoff;
- temp2 = new->br_blockcount;
break;
}
- if (delta) {
- temp2 += temp;
- if (delta->xed_startoff > temp)
- delta->xed_startoff = temp;
- if (delta->xed_blockcount < temp2)
- delta->xed_blockcount = temp2;
- }
done:
*logflagsp = rval;
return error;
@@ -2959,7 +2809,6 @@ xfs_bmap_del_extent(
xfs_btree_cur_t *cur, /* if null, not a btree */
xfs_bmbt_irec_t *del, /* data to remove from extents */
int *logflagsp, /* inode logging flags */
- xfs_extdelta_t *delta, /* Change made to incore extents */
int whichfork, /* data or attr fork */
int rsvd) /* OK to allocate reserved blocks */
{
@@ -3265,14 +3114,6 @@ xfs_bmap_del_extent(
if (da_old > da_new)
xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, (int64_t)(da_old - da_new),
rsvd);
- if (delta) {
- /* DELTA: report the original extent. */
- if (delta->xed_startoff > got.br_startoff)
- delta->xed_startoff = got.br_startoff;
- if (delta->xed_blockcount < got.br_startoff+got.br_blockcount)
- delta->xed_blockcount = got.br_startoff +
- got.br_blockcount;
- }
done:
*logflagsp = flags;
return error;
@@ -3754,9 +3595,10 @@ xfs_bmap_add_attrfork(
ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
}
ASSERT(ip->i_d.di_anextents == 0);
- IHOLD(ip);
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+
+ xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
switch (ip->i_d.di_format) {
case XFS_DINODE_FMT_DEV:
ip->i_d.di_forkoff = roundup(sizeof(xfs_dev_t), 8) >> 3;
@@ -4483,8 +4325,7 @@ xfs_bmapi(
xfs_extlen_t total, /* total blocks needed */
xfs_bmbt_irec_t *mval, /* output: map values */
int *nmap, /* i/o: mval size/count */
- xfs_bmap_free_t *flist, /* i/o: list extents to free */
- xfs_extdelta_t *delta) /* o: change made to incore extents */
+ xfs_bmap_free_t *flist) /* i/o: list extents to free */
{
xfs_fsblock_t abno; /* allocated block number */
xfs_extlen_t alen; /* allocated extent length */
@@ -4596,10 +4437,7 @@ xfs_bmapi(
end = bno + len;
obno = bno;
bma.ip = NULL;
- if (delta) {
- delta->xed_startoff = NULLFILEOFF;
- delta->xed_blockcount = 0;
- }
+
while (bno < end && n < *nmap) {
/*
* Reading past eof, act as though there's a hole
@@ -4620,19 +4458,13 @@ xfs_bmapi(
* allocate the stuff asked for in this bmap call
* but that wouldn't be as good.
*/
- if (wasdelay && !(flags & XFS_BMAPI_EXACT)) {
+ if (wasdelay) {
alen = (xfs_extlen_t)got.br_blockcount;
aoff = got.br_startoff;
if (lastx != NULLEXTNUM && lastx) {
ep = xfs_iext_get_ext(ifp, lastx - 1);
xfs_bmbt_get_all(ep, &prev);
}
- } else if (wasdelay) {
- alen = (xfs_extlen_t)
- XFS_FILBLKS_MIN(len,
- (got.br_startoff +
- got.br_blockcount) - bno);
- aoff = bno;
} else {
alen = (xfs_extlen_t)
XFS_FILBLKS_MIN(len, MAXEXTLEN);
@@ -4831,7 +4663,7 @@ xfs_bmapi(
got.br_state = XFS_EXT_UNWRITTEN;
}
error = xfs_bmap_add_extent(ip, lastx, &cur, &got,
- firstblock, flist, &tmp_logflags, delta,
+ firstblock, flist, &tmp_logflags,
whichfork, (flags & XFS_BMAPI_RSVBLOCKS));
logflags |= tmp_logflags;
if (error)
@@ -4927,7 +4759,7 @@ xfs_bmapi(
}
mval->br_state = XFS_EXT_NORM;
error = xfs_bmap_add_extent(ip, lastx, &cur, mval,
- firstblock, flist, &tmp_logflags, delta,
+ firstblock, flist, &tmp_logflags,
whichfork, (flags & XFS_BMAPI_RSVBLOCKS));
logflags |= tmp_logflags;
if (error)
@@ -5017,14 +4849,6 @@ xfs_bmapi(
ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE ||
XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max);
error = 0;
- if (delta && delta->xed_startoff != NULLFILEOFF) {
- /* A change was actually made.
- * Note that delta->xed_blockount is an offset at this
- * point and needs to be converted to a block count.
- */
- ASSERT(delta->xed_blockcount > delta->xed_startoff);
- delta->xed_blockcount -= delta->xed_startoff;
- }
error0:
/*
* Log everything. Do this after conversion, there's no point in
@@ -5136,8 +4960,6 @@ xfs_bunmapi(
xfs_fsblock_t *firstblock, /* first allocated block
controls a.g. for allocs */
xfs_bmap_free_t *flist, /* i/o: list extents to free */
- xfs_extdelta_t *delta, /* o: change made to incore
- extents */
int *done) /* set if not done yet */
{
xfs_btree_cur_t *cur; /* bmap btree cursor */
@@ -5196,10 +5018,7 @@ xfs_bunmapi(
bno = start + len - 1;
ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got,
&prev);
- if (delta) {
- delta->xed_startoff = NULLFILEOFF;
- delta->xed_blockcount = 0;
- }
+
/*
* Check to see if the given block number is past the end of the
* file, back up to the last block if so...
@@ -5297,7 +5116,7 @@ xfs_bunmapi(
}
del.br_state = XFS_EXT_UNWRITTEN;
error = xfs_bmap_add_extent(ip, lastx, &cur, &del,
- firstblock, flist, &logflags, delta,
+ firstblock, flist, &logflags,
XFS_DATA_FORK, 0);
if (error)
goto error0;
@@ -5352,7 +5171,7 @@ xfs_bunmapi(
prev.br_state = XFS_EXT_UNWRITTEN;
error = xfs_bmap_add_extent(ip, lastx - 1, &cur,
&prev, firstblock, flist, &logflags,
- delta, XFS_DATA_FORK, 0);
+ XFS_DATA_FORK, 0);
if (error)
goto error0;
goto nodelete;
@@ -5361,7 +5180,7 @@ xfs_bunmapi(
del.br_state = XFS_EXT_UNWRITTEN;
error = xfs_bmap_add_extent(ip, lastx, &cur,
&del, firstblock, flist, &logflags,
- delta, XFS_DATA_FORK, 0);
+ XFS_DATA_FORK, 0);
if (error)
goto error0;
goto nodelete;
@@ -5414,7 +5233,7 @@ xfs_bunmapi(
goto error0;
}
error = xfs_bmap_del_extent(ip, tp, lastx, flist, cur, &del,
- &tmp_logflags, delta, whichfork, rsvd);
+ &tmp_logflags, whichfork, rsvd);
logflags |= tmp_logflags;
if (error)
goto error0;
@@ -5471,14 +5290,6 @@ nodelete:
ASSERT(ifp->if_ext_max ==
XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
error = 0;
- if (delta && delta->xed_startoff != NULLFILEOFF) {
- /* A change was actually made.
- * Note that delta->xed_blockount is an offset at this
- * point and needs to be converted to a block count.
- */
- ASSERT(delta->xed_blockcount > delta->xed_startoff);
- delta->xed_blockcount -= delta->xed_startoff;
- }
error0:
/*
* Log everything. Do this after conversion, there's no point in
@@ -5605,28 +5416,6 @@ xfs_getbmap(
prealloced = 0;
fixlen = 1LL << 32;
} else {
- /*
- * If the BMV_IF_NO_DMAPI_READ interface bit specified, do
- * not generate a DMAPI read event. Otherwise, if the
- * DM_EVENT_READ bit is set for the file, generate a read
- * event in order that the DMAPI application may do its thing
- * before we return the extents. Usually this means restoring
- * user file data to regions of the file that look like holes.
- *
- * The "old behavior" (from XFS_IOC_GETBMAP) is to not specify
- * BMV_IF_NO_DMAPI_READ so that read events are generated.
- * If this were not true, callers of ioctl(XFS_IOC_GETBMAP)
- * could misinterpret holes in a DMAPI file as true holes,
- * when in fact they may represent offline user data.
- */
- if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) &&
- !(iflags & BMV_IF_NO_DMAPI_READ)) {
- error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip,
- 0, 0, 0, NULL);
- if (error)
- return XFS_ERROR(error);
- }
-
if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS &&
ip->i_d.di_format != XFS_DINODE_FMT_BTREE &&
ip->i_d.di_format != XFS_DINODE_FMT_LOCAL)
@@ -5713,7 +5502,7 @@ xfs_getbmap(
error = xfs_bmapi(NULL, ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset),
XFS_BB_TO_FSB(mp, bmv->bmv_length),
bmapi_flags, NULL, 0, map, &nmap,
- NULL, NULL);
+ NULL);
if (error)
goto out_free_map;
ASSERT(nmap <= subnex);
@@ -5859,66 +5648,34 @@ xfs_bmap_eof(
}
#ifdef DEBUG
-STATIC
-xfs_buf_t *
+STATIC struct xfs_buf *
xfs_bmap_get_bp(
- xfs_btree_cur_t *cur,
+ struct xfs_btree_cur *cur,
xfs_fsblock_t bno)
{
- int i;
- xfs_buf_t *bp;
+ struct xfs_log_item_desc *lidp;
+ int i;
if (!cur)
- return(NULL);
-
- bp = NULL;
- for(i = 0; i < XFS_BTREE_MAXLEVELS; i++) {
- bp = cur->bc_bufs[i];
- if (!bp) break;
- if (XFS_BUF_ADDR(bp) == bno)
- break; /* Found it */
- }
- if (i == XFS_BTREE_MAXLEVELS)
- bp = NULL;
-
- if (!bp) { /* Chase down all the log items to see if the bp is there */
- xfs_log_item_chunk_t *licp;
- xfs_trans_t *tp;
-
- tp = cur->bc_tp;
- licp = &tp->t_items;
- while (!bp && licp != NULL) {
- if (xfs_lic_are_all_free(licp)) {
- licp = licp->lic_next;
- continue;
- }
- for (i = 0; i < licp->lic_unused; i++) {
- xfs_log_item_desc_t *lidp;
- xfs_log_item_t *lip;
- xfs_buf_log_item_t *bip;
- xfs_buf_t *lbp;
-
- if (xfs_lic_isfree(licp, i)) {
- continue;
- }
-
- lidp = xfs_lic_slot(licp, i);
- lip = lidp->lid_item;
- if (lip->li_type != XFS_LI_BUF)
- continue;
+ return NULL;
- bip = (xfs_buf_log_item_t *)lip;
- lbp = bip->bli_buf;
+ for (i = 0; i < XFS_BTREE_MAXLEVELS; i++) {
+ if (!cur->bc_bufs[i])
+ break;
+ if (XFS_BUF_ADDR(cur->bc_bufs[i]) == bno)
+ return cur->bc_bufs[i];
+ }
- if (XFS_BUF_ADDR(lbp) == bno) {
- bp = lbp;
- break; /* Found it */
- }
- }
- licp = licp->lic_next;
- }
+ /* Chase down all the log items to see if the bp is there */
+ list_for_each_entry(lidp, &cur->bc_tp->t_items, lid_trans) {
+ struct xfs_buf_log_item *bip;
+ bip = (struct xfs_buf_log_item *)lidp->lid_item;
+ if (bip->bli_item.li_type == XFS_LI_BUF &&
+ XFS_BUF_ADDR(bip->bli_buf) == bno)
+ return bip->bli_buf;
}
- return(bp);
+
+ return NULL;
}
STATIC void
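The rewritten xfs_bmap_get_bp() above drops the open-coded walk of log item chunks in favour of a list_for_each_entry() traversal of the transaction's t_items list. The idiom underneath is container_of(): recover the enclosing structure from a pointer to a list node embedded inside it. The following is a minimal userspace sketch of that idiom, not kernel code; all names in it are hypothetical.

    #include <stddef.h>
    #include <stdio.h>

    #define container_of(ptr, type, member) \
    	((type *)((char *)(ptr) - offsetof(type, member)))

    struct list_node {
    	struct list_node *next;
    };

    struct item {
    	long addr;		/* stands in for XFS_BUF_ADDR() */
    	struct list_node node;	/* embedded the way lid_trans is */
    };

    /* Walk the chain and recover each enclosing item, as
     * list_for_each_entry() does in the kernel proper. */
    static struct item *find_item(struct list_node *head, long addr)
    {
    	struct list_node *n;

    	for (n = head; n; n = n->next) {
    		struct item *it = container_of(n, struct item, node);

    		if (it->addr == addr)
    			return it;
    	}
    	return NULL;
    }

    int main(void)
    {
    	struct item a = { 10, { NULL } };
    	struct item b = { 20, { &a.node } };

    	printf("found addr %ld\n", find_item(&b.node, 10)->addr);
    	return 0;
    }

The payoff in the hunk above is that the helper no longer needs to know the layout of log item chunks at all; it only needs the list head and the embedded member name.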
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 419dafb9d87d..b13569a6179b 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -28,20 +28,6 @@ struct xfs_trans;
extern kmem_zone_t *xfs_bmap_free_item_zone;
/*
- * DELTA: describe a change to the in-core extent list.
- *
- * Internally the use of xed_blockount is somewhat funky.
- * xed_blockcount contains an offset much of the time because this
- * makes merging changes easier. (xfs_fileoff_t and xfs_filblks_t are
- * the same underlying type).
- */
-typedef struct xfs_extdelta
-{
- xfs_fileoff_t xed_startoff; /* offset of range */
- xfs_filblks_t xed_blockcount; /* blocks in range */
-} xfs_extdelta_t;
-
-/*
* List of extents to be free "later".
* The list is kept sorted on xbf_startblock.
*/
@@ -82,16 +68,13 @@ typedef struct xfs_bmap_free
#define XFS_BMAPI_DELAY 0x002 /* delayed write operation */
#define XFS_BMAPI_ENTIRE 0x004 /* return entire extent, not trimmed */
#define XFS_BMAPI_METADATA 0x008 /* mapping metadata not user data */
-#define XFS_BMAPI_EXACT 0x010 /* allocate only to spec'd bounds */
-#define XFS_BMAPI_ATTRFORK 0x020 /* use attribute fork not data */
-#define XFS_BMAPI_ASYNC 0x040 /* bunmapi xactions can be async */
-#define XFS_BMAPI_RSVBLOCKS 0x080 /* OK to alloc. reserved data blocks */
-#define XFS_BMAPI_PREALLOC 0x100 /* preallocation op: unwritten space */
-#define XFS_BMAPI_IGSTATE 0x200 /* Ignore state - */
+#define XFS_BMAPI_ATTRFORK 0x010 /* use attribute fork not data */
+#define XFS_BMAPI_RSVBLOCKS 0x020 /* OK to alloc. reserved data blocks */
+#define XFS_BMAPI_PREALLOC 0x040 /* preallocation op: unwritten space */
+#define XFS_BMAPI_IGSTATE 0x080 /* Ignore state - */
/* combine contig. space */
-#define XFS_BMAPI_CONTIG 0x400 /* must allocate only one extent */
-/* XFS_BMAPI_DIRECT_IO 0x800 */
-#define XFS_BMAPI_CONVERT 0x1000 /* unwritten extent conversion - */
+#define XFS_BMAPI_CONTIG 0x100 /* must allocate only one extent */
+#define XFS_BMAPI_CONVERT 0x200 /* unwritten extent conversion - */
/* need write cache flushing and no */
/* additional allocation alignments */
@@ -100,9 +83,7 @@ typedef struct xfs_bmap_free
{ XFS_BMAPI_DELAY, "DELAY" }, \
{ XFS_BMAPI_ENTIRE, "ENTIRE" }, \
{ XFS_BMAPI_METADATA, "METADATA" }, \
- { XFS_BMAPI_EXACT, "EXACT" }, \
{ XFS_BMAPI_ATTRFORK, "ATTRFORK" }, \
- { XFS_BMAPI_ASYNC, "ASYNC" }, \
{ XFS_BMAPI_RSVBLOCKS, "RSVBLOCKS" }, \
{ XFS_BMAPI_PREALLOC, "PREALLOC" }, \
{ XFS_BMAPI_IGSTATE, "IGSTATE" }, \
@@ -310,9 +291,7 @@ xfs_bmapi(
xfs_extlen_t total, /* total blocks needed */
struct xfs_bmbt_irec *mval, /* output: map values */
int *nmap, /* i/o: mval size/count */
- xfs_bmap_free_t *flist, /* i/o: list extents to free */
- xfs_extdelta_t *delta); /* o: change made to incore
- extents */
+ xfs_bmap_free_t *flist); /* i/o: list extents to free */
/*
* Map file blocks to filesystem blocks, simple version.
@@ -346,8 +325,6 @@ xfs_bunmapi(
xfs_fsblock_t *firstblock, /* first allocated block
controls a.g. for allocs */
xfs_bmap_free_t *flist, /* i/o: list extents to free */
- xfs_extdelta_t *delta, /* o: change made to incore
- extents */
int *done); /* set if not done yet */
/*
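The XFS_BMAPI_FLAGS table in the header hunk above pairs each remaining flag with a printable name so the renumbered bits can be decoded in trace output. A decoder over such a { mask, name } table looks roughly like the sketch below; the values mirror the pattern of the defines above but the names here are hypothetical.

    #include <stdio.h>

    #define BMAPI_WRITE	0x001	/* hypothetical, mirrors the defines above */
    #define BMAPI_DELAY	0x002
    #define BMAPI_ENTIRE	0x004
    #define BMAPI_METADATA	0x008

    static const struct {
    	unsigned int	mask;
    	const char	*name;
    } flag_names[] = {
    	{ BMAPI_WRITE,		"WRITE" },
    	{ BMAPI_DELAY,		"DELAY" },
    	{ BMAPI_ENTIRE,		"ENTIRE" },
    	{ BMAPI_METADATA,	"METADATA" },
    };

    /* Print the name of every bit set in flags, table-driven. */
    static void print_flags(unsigned int flags)
    {
    	size_t i;

    	for (i = 0; i < sizeof(flag_names) / sizeof(flag_names[0]); i++)
    		if (flags & flag_names[i].mask)
    			printf("%s ", flag_names[i].name);
    	printf("\n");
    }

    int main(void)
    {
    	print_flags(BMAPI_WRITE | BMAPI_METADATA);	/* WRITE METADATA */
    	return 0;
    }

Keeping the table next to the defines, as the header does, is what lets the flags be renumbered in one place without breaking the trace decoding.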
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index 416e47e54b83..87d3c10b6954 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -24,21 +24,16 @@
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
#include "xfs_mount.h"
#include "xfs_bmap_btree.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_inode_item.h"
#include "xfs_alloc.h"
#include "xfs_btree.h"
#include "xfs_btree_trace.h"
-#include "xfs_ialloc.h"
#include "xfs_itable.h"
#include "xfs_bmap.h"
#include "xfs_error.h"
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 96be4b0f2496..829af92f0fba 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -24,20 +24,15 @@
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
#include "xfs_mount.h"
#include "xfs_bmap_btree.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_inode_item.h"
#include "xfs_btree.h"
#include "xfs_btree_trace.h"
-#include "xfs_ialloc.h"
#include "xfs_error.h"
#include "xfs_trace.h"
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 02a80984aa05..1b09d7a280df 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -24,7 +24,6 @@
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_dmapi.h"
#include "xfs_mount.h"
#include "xfs_buf_item.h"
#include "xfs_trans_priv.h"
@@ -34,6 +33,12 @@
kmem_zone_t *xfs_buf_item_zone;
+static inline struct xfs_buf_log_item *BUF_ITEM(struct xfs_log_item *lip)
+{
+ return container_of(lip, struct xfs_buf_log_item, bli_item);
+}
+
+
#ifdef XFS_TRANS_DEBUG
/*
* This function uses an alternate strategy for tracking the bytes
@@ -151,12 +156,13 @@ STATIC void xfs_buf_do_callbacks(xfs_buf_t *bp, xfs_log_item_t *lip);
*/
STATIC uint
xfs_buf_item_size(
- xfs_buf_log_item_t *bip)
+ struct xfs_log_item *lip)
{
- uint nvecs;
- int next_bit;
- int last_bit;
- xfs_buf_t *bp;
+ struct xfs_buf_log_item *bip = BUF_ITEM(lip);
+ struct xfs_buf *bp = bip->bli_buf;
+ uint nvecs;
+ int next_bit;
+ int last_bit;
ASSERT(atomic_read(&bip->bli_refcount) > 0);
if (bip->bli_flags & XFS_BLI_STALE) {
@@ -170,7 +176,6 @@ xfs_buf_item_size(
return 1;
}
- bp = bip->bli_buf;
ASSERT(bip->bli_flags & XFS_BLI_LOGGED);
nvecs = 1;
last_bit = xfs_next_bit(bip->bli_format.blf_data_map,
@@ -219,13 +224,13 @@ xfs_buf_item_size(
*/
STATIC void
xfs_buf_item_format(
- xfs_buf_log_item_t *bip,
- xfs_log_iovec_t *log_vector)
+ struct xfs_log_item *lip,
+ struct xfs_log_iovec *vecp)
{
+ struct xfs_buf_log_item *bip = BUF_ITEM(lip);
+ struct xfs_buf *bp = bip->bli_buf;
uint base_size;
uint nvecs;
- xfs_log_iovec_t *vecp;
- xfs_buf_t *bp;
int first_bit;
int last_bit;
int next_bit;
@@ -235,8 +240,6 @@ xfs_buf_item_format(
ASSERT(atomic_read(&bip->bli_refcount) > 0);
ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
(bip->bli_flags & XFS_BLI_STALE));
- bp = bip->bli_buf;
- vecp = log_vector;
/*
* The size of the base structure is the size of the
@@ -248,7 +251,7 @@ xfs_buf_item_format(
base_size =
(uint)(sizeof(xfs_buf_log_format_t) +
((bip->bli_format.blf_map_size - 1) * sizeof(uint)));
- vecp->i_addr = (xfs_caddr_t)&bip->bli_format;
+ vecp->i_addr = &bip->bli_format;
vecp->i_len = base_size;
vecp->i_type = XLOG_REG_TYPE_BFORMAT;
vecp++;
@@ -263,7 +266,7 @@ xfs_buf_item_format(
*/
if (bip->bli_flags & XFS_BLI_INODE_BUF) {
if (!((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
- xfs_log_item_in_current_chkpt(&bip->bli_item)))
+ xfs_log_item_in_current_chkpt(lip)))
bip->bli_format.blf_flags |= XFS_BLF_INODE_BUF;
bip->bli_flags &= ~XFS_BLI_INODE_BUF;
}
@@ -356,66 +359,90 @@ xfs_buf_item_format(
/*
* This is called to pin the buffer associated with the buf log item in memory
- * so it cannot be written out. Simply call bpin() on the buffer to do this.
+ * so it cannot be written out.
*
* We also always take a reference to the buffer log item here so that the bli
* is held while the item is pinned in memory. This means that we can
* unconditionally drop the reference count a transaction holds when the
* transaction is completed.
*/
-
STATIC void
xfs_buf_item_pin(
- xfs_buf_log_item_t *bip)
+ struct xfs_log_item *lip)
{
- xfs_buf_t *bp;
+ struct xfs_buf_log_item *bip = BUF_ITEM(lip);
- bp = bip->bli_buf;
- ASSERT(XFS_BUF_ISBUSY(bp));
+ ASSERT(XFS_BUF_ISBUSY(bip->bli_buf));
ASSERT(atomic_read(&bip->bli_refcount) > 0);
ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
(bip->bli_flags & XFS_BLI_STALE));
- atomic_inc(&bip->bli_refcount);
+
trace_xfs_buf_item_pin(bip);
- xfs_bpin(bp);
-}
+ atomic_inc(&bip->bli_refcount);
+ atomic_inc(&bip->bli_buf->b_pin_count);
+}
/*
* This is called to unpin the buffer associated with the buf log
* item which was previously pinned with a call to xfs_buf_item_pin().
- * Just call bunpin() on the buffer to do this.
*
* Also drop the reference to the buf item for the current transaction.
* If the XFS_BLI_STALE flag is set and we are the last reference,
* then free up the buf log item and unlock the buffer.
+ *
+ * If the remove flag is set we are called from uncommit in the
+ * forced-shutdown path. If that is true and the reference count on
+ * the log item is going to drop to zero we need to free the item's
+ * descriptor in the transaction.
*/
STATIC void
xfs_buf_item_unpin(
- xfs_buf_log_item_t *bip)
+ struct xfs_log_item *lip,
+ int remove)
{
- struct xfs_ail *ailp;
- xfs_buf_t *bp;
- int freed;
+ struct xfs_buf_log_item *bip = BUF_ITEM(lip);
+ xfs_buf_t *bp = bip->bli_buf;
+ struct xfs_ail *ailp = lip->li_ailp;
int stale = bip->bli_flags & XFS_BLI_STALE;
+ int freed;
- bp = bip->bli_buf;
- ASSERT(bp != NULL);
ASSERT(XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *) == bip);
ASSERT(atomic_read(&bip->bli_refcount) > 0);
+
trace_xfs_buf_item_unpin(bip);
freed = atomic_dec_and_test(&bip->bli_refcount);
- ailp = bip->bli_item.li_ailp;
- xfs_bunpin(bp);
+
+ if (atomic_dec_and_test(&bp->b_pin_count))
+ wake_up_all(&bp->b_waiters);
+
if (freed && stale) {
ASSERT(bip->bli_flags & XFS_BLI_STALE);
ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
ASSERT(XFS_BUF_ISSTALE(bp));
ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
+
trace_xfs_buf_item_unpin_stale(bip);
+ if (remove) {
+ /*
+ * We have to remove the log item from the transaction
+ * as we are about to release our reference to the
+ * buffer. If we don't, the unlock that occurs later
+			 * in xfs_trans_uncommit() will try to reference the
+ * buffer which we no longer have a hold on.
+ */
+ xfs_trans_del_item(lip);
+
+ /*
+ * Since the transaction no longer refers to the buffer,
+ * the buffer should no longer refer to the transaction.
+ */
+ XFS_BUF_SET_FSPRIVATE2(bp, NULL);
+ }
+
/*
* If we get called here because of an IO error, we may
* or may not have the item on the AIL. xfs_trans_ail_delete()
@@ -437,48 +464,6 @@ xfs_buf_item_unpin(
}
/*
- * this is called from uncommit in the forced-shutdown path.
- * we need to check to see if the reference count on the log item
- * is going to drop to zero. If so, unpin will free the log item
- * so we need to free the item's descriptor (that points to the item)
- * in the transaction.
- */
-STATIC void
-xfs_buf_item_unpin_remove(
- xfs_buf_log_item_t *bip,
- xfs_trans_t *tp)
-{
- /* will xfs_buf_item_unpin() call xfs_buf_item_relse()? */
- if ((atomic_read(&bip->bli_refcount) == 1) &&
- (bip->bli_flags & XFS_BLI_STALE)) {
- /*
- * yes -- We can safely do some work here and then call
- * buf_item_unpin to do the rest because we are
- * are holding the buffer locked so no one else will be
- * able to bump up the refcount. We have to remove the
- * log item from the transaction as we are about to release
- * our reference to the buffer. If we don't, the unlock that
- * occurs later in the xfs_trans_uncommit() will try to
- * reference the buffer which we no longer have a hold on.
- */
- struct xfs_log_item_desc *lidp;
-
- ASSERT(XFS_BUF_VALUSEMA(bip->bli_buf) <= 0);
- trace_xfs_buf_item_unpin_stale(bip);
-
- lidp = xfs_trans_find_item(tp, (xfs_log_item_t *)bip);
- xfs_trans_free_item(tp, lidp);
-
- /*
- * Since the transaction no longer refers to the buffer, the
- * buffer should no longer refer to the transaction.
- */
- XFS_BUF_SET_FSPRIVATE2(bip->bli_buf, NULL);
- }
- xfs_buf_item_unpin(bip);
-}
-
-/*
* This is called to attempt to lock the buffer associated with this
* buf log item. Don't sleep on the buffer lock. If we can't get
* the lock right away, return 0. If we can get the lock, take a
@@ -488,11 +473,11 @@ xfs_buf_item_unpin_remove(
*/
STATIC uint
xfs_buf_item_trylock(
- xfs_buf_log_item_t *bip)
+ struct xfs_log_item *lip)
{
- xfs_buf_t *bp;
+ struct xfs_buf_log_item *bip = BUF_ITEM(lip);
+ struct xfs_buf *bp = bip->bli_buf;
- bp = bip->bli_buf;
if (XFS_BUF_ISPINNED(bp))
return XFS_ITEM_PINNED;
if (!XFS_BUF_CPSEMA(bp))
@@ -529,13 +514,12 @@ xfs_buf_item_trylock(
*/
STATIC void
xfs_buf_item_unlock(
- xfs_buf_log_item_t *bip)
+ struct xfs_log_item *lip)
{
- int aborted;
- xfs_buf_t *bp;
- uint hold;
-
- bp = bip->bli_buf;
+ struct xfs_buf_log_item *bip = BUF_ITEM(lip);
+ struct xfs_buf *bp = bip->bli_buf;
+ int aborted;
+ uint hold;
/* Clear the buffer's association with this transaction. */
XFS_BUF_SET_FSPRIVATE2(bp, NULL);
@@ -546,7 +530,7 @@ xfs_buf_item_unlock(
* (cancelled) buffers at unpin time, but we'll never go through the
* pin/unpin cycle if we abort inside commit.
*/
- aborted = (bip->bli_item.li_flags & XFS_LI_ABORTED) != 0;
+ aborted = (lip->li_flags & XFS_LI_ABORTED) != 0;
/*
* Before possibly freeing the buf item, determine if we should
@@ -607,16 +591,16 @@ xfs_buf_item_unlock(
*/
STATIC xfs_lsn_t
xfs_buf_item_committed(
- xfs_buf_log_item_t *bip,
+ struct xfs_log_item *lip,
xfs_lsn_t lsn)
{
+ struct xfs_buf_log_item *bip = BUF_ITEM(lip);
+
trace_xfs_buf_item_committed(bip);
- if ((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
- (bip->bli_item.li_lsn != 0)) {
- return bip->bli_item.li_lsn;
- }
- return (lsn);
+ if ((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && lip->li_lsn != 0)
+ return lip->li_lsn;
+ return lsn;
}
/*
@@ -626,15 +610,16 @@ xfs_buf_item_committed(
*/
STATIC void
xfs_buf_item_push(
- xfs_buf_log_item_t *bip)
+ struct xfs_log_item *lip)
{
- xfs_buf_t *bp;
+ struct xfs_buf_log_item *bip = BUF_ITEM(lip);
+ struct xfs_buf *bp = bip->bli_buf;
ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
+ ASSERT(!XFS_BUF_ISDELAYWRITE(bp));
+
trace_xfs_buf_item_push(bip);
- bp = bip->bli_buf;
- ASSERT(!XFS_BUF_ISDELAYWRITE(bp));
xfs_buf_relse(bp);
}
@@ -646,22 +631,24 @@ xfs_buf_item_push(
*/
STATIC void
xfs_buf_item_pushbuf(
- xfs_buf_log_item_t *bip)
+ struct xfs_log_item *lip)
{
- xfs_buf_t *bp;
+ struct xfs_buf_log_item *bip = BUF_ITEM(lip);
+ struct xfs_buf *bp = bip->bli_buf;
ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
+ ASSERT(XFS_BUF_ISDELAYWRITE(bp));
+
trace_xfs_buf_item_pushbuf(bip);
- bp = bip->bli_buf;
- ASSERT(XFS_BUF_ISDELAYWRITE(bp));
xfs_buf_delwri_promote(bp);
xfs_buf_relse(bp);
}
-/* ARGSUSED */
STATIC void
-xfs_buf_item_committing(xfs_buf_log_item_t *bip, xfs_lsn_t commit_lsn)
+xfs_buf_item_committing(
+ struct xfs_log_item *lip,
+ xfs_lsn_t commit_lsn)
{
}
@@ -669,21 +656,16 @@ xfs_buf_item_committing(xfs_buf_log_item_t *bip, xfs_lsn_t commit_lsn)
* This is the ops vector shared by all buf log items.
*/
static struct xfs_item_ops xfs_buf_item_ops = {
- .iop_size = (uint(*)(xfs_log_item_t*))xfs_buf_item_size,
- .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
- xfs_buf_item_format,
- .iop_pin = (void(*)(xfs_log_item_t*))xfs_buf_item_pin,
- .iop_unpin = (void(*)(xfs_log_item_t*))xfs_buf_item_unpin,
- .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t *))
- xfs_buf_item_unpin_remove,
- .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_buf_item_trylock,
- .iop_unlock = (void(*)(xfs_log_item_t*))xfs_buf_item_unlock,
- .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
- xfs_buf_item_committed,
- .iop_push = (void(*)(xfs_log_item_t*))xfs_buf_item_push,
- .iop_pushbuf = (void(*)(xfs_log_item_t*))xfs_buf_item_pushbuf,
- .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
- xfs_buf_item_committing
+ .iop_size = xfs_buf_item_size,
+ .iop_format = xfs_buf_item_format,
+ .iop_pin = xfs_buf_item_pin,
+ .iop_unpin = xfs_buf_item_unpin,
+ .iop_trylock = xfs_buf_item_trylock,
+ .iop_unlock = xfs_buf_item_unlock,
+ .iop_committed = xfs_buf_item_committed,
+ .iop_push = xfs_buf_item_push,
+ .iop_pushbuf = xfs_buf_item_pushbuf,
+ .iop_committing = xfs_buf_item_committing
};
@@ -712,7 +694,6 @@ xfs_buf_item_init(
*/
if (bp->b_mount != mp)
bp->b_mount = mp;
- XFS_BUF_SET_BDSTRAT_FUNC(bp, xfs_bdstrat_cb);
if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) {
lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
if (lip->li_type == XFS_LI_BUF) {
@@ -1098,15 +1079,14 @@ xfs_buf_error_relse(
* It is called by xfs_buf_iodone_callbacks() above which will take
* care of cleaning up the buffer itself.
*/
-/* ARGSUSED */
void
xfs_buf_iodone(
- xfs_buf_t *bp,
- xfs_buf_log_item_t *bip)
+ struct xfs_buf *bp,
+ struct xfs_log_item *lip)
{
- struct xfs_ail *ailp = bip->bli_item.li_ailp;
+ struct xfs_ail *ailp = lip->li_ailp;
- ASSERT(bip->bli_buf == bp);
+ ASSERT(BUF_ITEM(lip)->bli_buf == bp);
xfs_buf_rele(bp);
@@ -1120,6 +1100,6 @@ xfs_buf_iodone(
* Either way, AIL is useless if we're forcing a shutdown.
*/
spin_lock(&ailp->xa_lock);
- xfs_trans_ail_delete(ailp, (xfs_log_item_t *)bip);
- xfs_buf_item_free(bip);
+ xfs_trans_ail_delete(ailp, lip);
+ xfs_buf_item_free(BUF_ITEM(lip));
}
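The pin/unpin conversion above replaces xfs_bpin()/xfs_bunpin() with a bare atomic b_pin_count, and the final unpin does wake_up_all() on b_waiters. A userspace analogue of that scheme, using a pthread condition variable in place of the kernel wait queue, might look like the following; all names are hypothetical and the locking is simplified.

    #include <pthread.h>
    #include <stdio.h>

    struct buf {
    	int pin_count;			/* like b_pin_count */
    	pthread_mutex_t lock;
    	pthread_cond_t waiters;		/* like b_waiters */
    };

    static void buf_pin(struct buf *bp)
    {
    	pthread_mutex_lock(&bp->lock);
    	bp->pin_count++;
    	pthread_mutex_unlock(&bp->lock);
    }

    static void buf_unpin(struct buf *bp)
    {
    	pthread_mutex_lock(&bp->lock);
    	if (--bp->pin_count == 0)
    		pthread_cond_broadcast(&bp->waiters);	/* wake_up_all() */
    	pthread_mutex_unlock(&bp->lock);
    }

    /* Block until the buffer is unpinned and may be written out. */
    static void buf_wait_unpin(struct buf *bp)
    {
    	pthread_mutex_lock(&bp->lock);
    	while (bp->pin_count > 0)
    		pthread_cond_wait(&bp->waiters, &bp->lock);
    	pthread_mutex_unlock(&bp->lock);
    }

    int main(void)
    {
    	struct buf b = { 0, PTHREAD_MUTEX_INITIALIZER,
    			 PTHREAD_COND_INITIALIZER };

    	buf_pin(&b);
    	buf_unpin(&b);
    	buf_wait_unpin(&b);	/* returns at once: count is zero */
    	printf("unpinned\n");
    	return 0;
    }

The same conversion also retypes the whole ops vector: every iop_* callback now takes struct xfs_log_item * and uses a container_of() accessor (BUF_ITEM above), which removes the long block of function-pointer casts at the bottom of the file.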
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index f20bb472d582..0e2ed43f16c7 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -124,7 +124,7 @@ void xfs_buf_attach_iodone(struct xfs_buf *,
void(*)(struct xfs_buf *, xfs_log_item_t *),
xfs_log_item_t *);
void xfs_buf_iodone_callbacks(struct xfs_buf *);
-void xfs_buf_iodone(struct xfs_buf *, xfs_buf_log_item_t *);
+void xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *);
#ifdef XFS_TRANS_DEBUG
void
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 0ca556b4bf31..30fa0e206fba 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -25,19 +25,14 @@
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
#include "xfs_mount.h"
#include "xfs_da_btree.h"
#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_inode_item.h"
#include "xfs_alloc.h"
-#include "xfs_btree.h"
#include "xfs_bmap.h"
#include "xfs_attr.h"
#include "xfs_attr_leaf.h"
@@ -581,16 +576,14 @@ xfs_da_node_add(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
xfs_da_intnode_t *node;
xfs_da_node_entry_t *btree;
int tmp;
- xfs_mount_t *mp;
node = oldblk->bp->data;
- mp = state->mp;
ASSERT(be16_to_cpu(node->hdr.info.magic) == XFS_DA_NODE_MAGIC);
ASSERT((oldblk->index >= 0) && (oldblk->index <= be16_to_cpu(node->hdr.count)));
ASSERT(newblk->blkno != 0);
if (state->args->whichfork == XFS_DATA_FORK)
- ASSERT(newblk->blkno >= mp->m_dirleafblk &&
- newblk->blkno < mp->m_dirfreeblk);
+ ASSERT(newblk->blkno >= state->mp->m_dirleafblk &&
+ newblk->blkno < state->mp->m_dirfreeblk);
/*
* We may need to make some room before we insert the new node.
@@ -1601,7 +1594,7 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno)
xfs_bmapi_aflag(w)|XFS_BMAPI_WRITE|XFS_BMAPI_METADATA|
XFS_BMAPI_CONTIG,
args->firstblock, args->total, &map, &nmap,
- args->flist, NULL))) {
+ args->flist))) {
return error;
}
ASSERT(nmap <= 1);
@@ -1622,8 +1615,7 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno)
xfs_bmapi_aflag(w)|XFS_BMAPI_WRITE|
XFS_BMAPI_METADATA,
args->firstblock, args->total,
- &mapp[mapi], &nmap, args->flist,
- NULL))) {
+ &mapp[mapi], &nmap, args->flist))) {
kmem_free(mapp);
return error;
}
@@ -1884,7 +1876,7 @@ xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
*/
if ((error = xfs_bunmapi(tp, dp, dead_blkno, count,
xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA,
- 0, args->firstblock, args->flist, NULL,
+ 0, args->firstblock, args->flist,
&done)) == ENOSPC) {
if (w != XFS_DATA_FORK)
break;
@@ -1989,7 +1981,7 @@ xfs_da_do_buf(
nfsb,
XFS_BMAPI_METADATA |
xfs_bmapi_aflag(whichfork),
- NULL, 0, mapp, &nmap, NULL, NULL)))
+ NULL, 0, mapp, &nmap, NULL)))
goto exit0;
}
} else {
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index 5bba29a07812..3b9582c60a22 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -24,24 +24,15 @@
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
#include "xfs_mount.h"
#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_inode_item.h"
#include "xfs_bmap.h"
-#include "xfs_btree.h"
-#include "xfs_ialloc.h"
#include "xfs_itable.h"
#include "xfs_dfrag.h"
#include "xfs_error.h"
-#include "xfs_rw.h"
#include "xfs_vnodeops.h"
#include "xfs_trace.h"
@@ -69,7 +60,9 @@ xfs_swapext(
goto out;
}
- if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND)) {
+ if (!(file->f_mode & FMODE_WRITE) ||
+ !(file->f_mode & FMODE_READ) ||
+ (file->f_flags & O_APPEND)) {
error = XFS_ERROR(EBADF);
goto out_put_file;
}
@@ -81,6 +74,7 @@ xfs_swapext(
}
if (!(tmp_file->f_mode & FMODE_WRITE) ||
+ !(tmp_file->f_mode & FMODE_READ) ||
(tmp_file->f_flags & O_APPEND)) {
error = XFS_ERROR(EBADF);
goto out_put_tmp_file;
@@ -422,11 +416,8 @@ xfs_swap_extents(
}
- IHOLD(ip);
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-
- IHOLD(tip);
- xfs_trans_ijoin(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+ xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+ xfs_trans_ijoin_ref(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
xfs_trans_log_inode(tp, ip, ilf_fields);
xfs_trans_log_inode(tp, tip, tilf_fields);
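The xfs_swapext() change above tightens the descriptor checks: because an extent swap modifies both inodes, both files must now be open for reading and writing, and neither may be in append mode. A hedged userspace sketch of the same validation, using fcntl() rather than the kernel's f_mode/f_flags fields:

    #include <fcntl.h>
    #include <errno.h>
    #include <stdio.h>

    /* Reject descriptors that are not read-write or are append-only,
     * mirroring the FMODE_READ/FMODE_WRITE/O_APPEND checks above. */
    static int check_swappable(int fd)
    {
    	int flags = fcntl(fd, F_GETFL);

    	if (flags < 0)
    		return -1;
    	if ((flags & O_ACCMODE) != O_RDWR || (flags & O_APPEND)) {
    		errno = EBADF;
    		return -1;
    	}
    	return 0;
    }

    int main(void)
    {
    	int fd = open("/tmp/swap_test", O_RDWR | O_CREAT, 0600);

    	if (fd >= 0)
    		printf("swappable: %s\n",
    		       check_swappable(fd) == 0 ? "yes" : "no");
    	return 0;
    }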
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index 42520f041265..a1321bc7f192 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -25,13 +25,11 @@
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
#include "xfs_mount.h"
#include "xfs_da_btree.h"
#include "xfs_bmap_btree.h"
#include "xfs_alloc_btree.h"
#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_inode_item.h"
@@ -382,7 +380,7 @@ xfs_readdir(
int rval; /* return value */
int v; /* type-checking value */
- xfs_itrace_entry(dp);
+ trace_xfs_readdir(dp);
if (XFS_FORCED_SHUTDOWN(dp->i_mount))
return XFS_ERROR(EIO);
@@ -549,7 +547,7 @@ xfs_dir2_grow_inode(
if ((error = xfs_bmapi(tp, dp, bno, count,
XFS_BMAPI_WRITE|XFS_BMAPI_METADATA|XFS_BMAPI_CONTIG,
args->firstblock, args->total, &map, &nmap,
- args->flist, NULL)))
+ args->flist)))
return error;
ASSERT(nmap <= 1);
if (nmap == 1) {
@@ -581,8 +579,7 @@ xfs_dir2_grow_inode(
if ((error = xfs_bmapi(tp, dp, b, c,
XFS_BMAPI_WRITE|XFS_BMAPI_METADATA,
args->firstblock, args->total,
- &mapp[mapi], &nmap, args->flist,
- NULL))) {
+ &mapp[mapi], &nmap, args->flist))) {
kmem_free(mapp);
return error;
}
@@ -715,7 +712,7 @@ xfs_dir2_shrink_inode(
*/
if ((error = xfs_bunmapi(tp, dp, da, mp->m_dirblkfsbs,
XFS_BMAPI_METADATA, 0, args->firstblock, args->flist,
- NULL, &done))) {
+ &done))) {
/*
* ENOSPC actually can happen if we're in a removename with
* no space reservation, and the resulting block removal
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index 779a267b0a84..580d99cef9e7 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -24,12 +24,10 @@
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
#include "xfs_mount.h"
#include "xfs_da_btree.h"
#include "xfs_bmap_btree.h"
#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_inode_item.h"
@@ -1073,10 +1071,10 @@ xfs_dir2_sf_to_block(
*/
buf_len = dp->i_df.if_bytes;
- buf = kmem_alloc(dp->i_df.if_bytes, KM_SLEEP);
+ buf = kmem_alloc(buf_len, KM_SLEEP);
- memcpy(buf, sfp, dp->i_df.if_bytes);
- xfs_idata_realloc(dp, -dp->i_df.if_bytes, XFS_DATA_FORK);
+ memcpy(buf, sfp, buf_len);
+ xfs_idata_realloc(dp, -buf_len, XFS_DATA_FORK);
dp->i_d.di_size = 0;
xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
/*
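The xfs_dir2_sf_to_block() hunk above switches every use of dp->i_df.if_bytes over to the buf_len snapshot taken before xfs_idata_realloc() tears the inline fork down, so the size is read once while it is still valid. A minimal sketch of that capture-then-mutate discipline, with hypothetical names:

    #include <stdlib.h>
    #include <string.h>

    struct ifork {
    	char	*data;
    	int	bytes;
    };

    /* Copy the inline data out, then tear the fork down.  The length is
     * snapshotted up front so nothing re-reads f->bytes after it has
     * been zeroed. */
    static char *detach_copy(struct ifork *f)
    {
    	int len = f->bytes;		/* capture before mutation */
    	char *buf = malloc(len);

    	if (!buf)
    		return NULL;
    	memcpy(buf, f->data, len);

    	free(f->data);			/* invalidates data and bytes */
    	f->data = NULL;
    	f->bytes = 0;

    	return buf;
    }

    int main(void)
    {
    	struct ifork f = { malloc(4), 4 };
    	char *copy;

    	if (!f.data)
    		return 1;
    	memcpy(f.data, "abc", 4);
    	copy = detach_copy(&f);
    	free(copy);
    	return 0;
    }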
diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c
index 498f8d694330..921595b84f5b 100644
--- a/fs/xfs/xfs_dir2_data.c
+++ b/fs/xfs/xfs_dir2_data.c
@@ -24,12 +24,10 @@
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
#include "xfs_mount.h"
#include "xfs_da_btree.h"
#include "xfs_bmap_btree.h"
#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_dir2_data.h"
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index e2d89854ec9e..504be8640e91 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -25,11 +25,9 @@
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
#include "xfs_mount.h"
#include "xfs_da_btree.h"
#include "xfs_bmap_btree.h"
-#include "xfs_attr_sf.h"
#include "xfs_dir2_sf.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
@@ -875,7 +873,7 @@ xfs_dir2_leaf_getdents(
xfs_dir2_byte_to_da(mp,
XFS_DIR2_LEAF_OFFSET) - map_off,
XFS_BMAPI_METADATA, NULL, 0,
- &map[map_valid], &nmap, NULL, NULL);
+ &map[map_valid], &nmap, NULL);
/*
* Don't know if we should ignore this or
* try to return an error.
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index 78fc4d9ae756..f9a0864b696a 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -24,12 +24,10 @@
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
#include "xfs_mount.h"
#include "xfs_da_btree.h"
#include "xfs_bmap_btree.h"
#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_bmap.h"
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c
index c1a5945d463a..b1bae6b1eed9 100644
--- a/fs/xfs/xfs_dir2_sf.c
+++ b/fs/xfs/xfs_dir2_sf.c
@@ -24,12 +24,10 @@
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
#include "xfs_mount.h"
#include "xfs_da_btree.h"
#include "xfs_bmap_btree.h"
#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_inode_item.h"
diff --git a/fs/xfs/xfs_dmapi.h b/fs/xfs/xfs_dmapi.h
deleted file mode 100644
index 2813cdd72375..000000000000
--- a/fs/xfs/xfs_dmapi.h
+++ /dev/null
@@ -1,170 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#ifndef __XFS_DMAPI_H__
-#define __XFS_DMAPI_H__
-
-/* Values used to define the on-disk version of dm_attrname_t. All
- * on-disk attribute names start with the 8-byte string "SGI_DMI_".
- *
- * In the on-disk inode, DMAPI attribute names consist of the user-provided
- * name with the DMATTR_PREFIXSTRING pre-pended. This string must NEVER be
- * changed.
- */
-
-#define DMATTR_PREFIXLEN 8
-#define DMATTR_PREFIXSTRING "SGI_DMI_"
-
-typedef enum {
- DM_EVENT_INVALID = -1,
- DM_EVENT_CANCEL = 0, /* not supported */
- DM_EVENT_MOUNT = 1,
- DM_EVENT_PREUNMOUNT = 2,
- DM_EVENT_UNMOUNT = 3,
- DM_EVENT_DEBUT = 4, /* not supported */
- DM_EVENT_CREATE = 5,
- DM_EVENT_CLOSE = 6, /* not supported */
- DM_EVENT_POSTCREATE = 7,
- DM_EVENT_REMOVE = 8,
- DM_EVENT_POSTREMOVE = 9,
- DM_EVENT_RENAME = 10,
- DM_EVENT_POSTRENAME = 11,
- DM_EVENT_LINK = 12,
- DM_EVENT_POSTLINK = 13,
- DM_EVENT_SYMLINK = 14,
- DM_EVENT_POSTSYMLINK = 15,
- DM_EVENT_READ = 16,
- DM_EVENT_WRITE = 17,
- DM_EVENT_TRUNCATE = 18,
- DM_EVENT_ATTRIBUTE = 19,
- DM_EVENT_DESTROY = 20,
- DM_EVENT_NOSPACE = 21,
- DM_EVENT_USER = 22,
- DM_EVENT_MAX = 23
-} dm_eventtype_t;
-#define HAVE_DM_EVENTTYPE_T
-
-typedef enum {
- DM_RIGHT_NULL,
- DM_RIGHT_SHARED,
- DM_RIGHT_EXCL
-} dm_right_t;
-#define HAVE_DM_RIGHT_T
-
-/* Defines for determining if an event message should be sent. */
-#ifdef HAVE_DMAPI
-#define DM_EVENT_ENABLED(ip, event) ( \
- unlikely ((ip)->i_mount->m_flags & XFS_MOUNT_DMAPI) && \
- ( ((ip)->i_d.di_dmevmask & (1 << event)) || \
- ((ip)->i_mount->m_dmevmask & (1 << event)) ) \
- )
-#else
-#define DM_EVENT_ENABLED(ip, event) (0)
-#endif
-
-#define DM_XFS_VALID_FS_EVENTS ( \
- (1 << DM_EVENT_PREUNMOUNT) | \
- (1 << DM_EVENT_UNMOUNT) | \
- (1 << DM_EVENT_NOSPACE) | \
- (1 << DM_EVENT_DEBUT) | \
- (1 << DM_EVENT_CREATE) | \
- (1 << DM_EVENT_POSTCREATE) | \
- (1 << DM_EVENT_REMOVE) | \
- (1 << DM_EVENT_POSTREMOVE) | \
- (1 << DM_EVENT_RENAME) | \
- (1 << DM_EVENT_POSTRENAME) | \
- (1 << DM_EVENT_LINK) | \
- (1 << DM_EVENT_POSTLINK) | \
- (1 << DM_EVENT_SYMLINK) | \
- (1 << DM_EVENT_POSTSYMLINK) | \
- (1 << DM_EVENT_ATTRIBUTE) | \
- (1 << DM_EVENT_DESTROY) )
-
-/* Events valid in dm_set_eventlist() when called with a file handle for
- a regular file or a symlink. These events are persistent.
-*/
-
-#define DM_XFS_VALID_FILE_EVENTS ( \
- (1 << DM_EVENT_ATTRIBUTE) | \
- (1 << DM_EVENT_DESTROY) )
-
-/* Events valid in dm_set_eventlist() when called with a file handle for
- a directory. These events are persistent.
-*/
-
-#define DM_XFS_VALID_DIRECTORY_EVENTS ( \
- (1 << DM_EVENT_CREATE) | \
- (1 << DM_EVENT_POSTCREATE) | \
- (1 << DM_EVENT_REMOVE) | \
- (1 << DM_EVENT_POSTREMOVE) | \
- (1 << DM_EVENT_RENAME) | \
- (1 << DM_EVENT_POSTRENAME) | \
- (1 << DM_EVENT_LINK) | \
- (1 << DM_EVENT_POSTLINK) | \
- (1 << DM_EVENT_SYMLINK) | \
- (1 << DM_EVENT_POSTSYMLINK) | \
- (1 << DM_EVENT_ATTRIBUTE) | \
- (1 << DM_EVENT_DESTROY) )
-
-/* Events supported by the XFS filesystem. */
-#define DM_XFS_SUPPORTED_EVENTS ( \
- (1 << DM_EVENT_MOUNT) | \
- (1 << DM_EVENT_PREUNMOUNT) | \
- (1 << DM_EVENT_UNMOUNT) | \
- (1 << DM_EVENT_NOSPACE) | \
- (1 << DM_EVENT_CREATE) | \
- (1 << DM_EVENT_POSTCREATE) | \
- (1 << DM_EVENT_REMOVE) | \
- (1 << DM_EVENT_POSTREMOVE) | \
- (1 << DM_EVENT_RENAME) | \
- (1 << DM_EVENT_POSTRENAME) | \
- (1 << DM_EVENT_LINK) | \
- (1 << DM_EVENT_POSTLINK) | \
- (1 << DM_EVENT_SYMLINK) | \
- (1 << DM_EVENT_POSTSYMLINK) | \
- (1 << DM_EVENT_READ) | \
- (1 << DM_EVENT_WRITE) | \
- (1 << DM_EVENT_TRUNCATE) | \
- (1 << DM_EVENT_ATTRIBUTE) | \
- (1 << DM_EVENT_DESTROY) )
-
-
-/*
- * Definitions used for the flags field on dm_send_*_event().
- */
-
-#define DM_FLAGS_NDELAY 0x001 /* return EAGAIN after dm_pending() */
-#define DM_FLAGS_UNWANTED 0x002 /* event not in fsys dm_eventset_t */
-#define DM_FLAGS_IMUX 0x004 /* thread holds i_mutex */
-#define DM_FLAGS_IALLOCSEM_RD 0x010 /* thread holds i_alloc_sem rd */
-#define DM_FLAGS_IALLOCSEM_WR 0x020 /* thread holds i_alloc_sem wr */
-
-/*
- * Pull in platform specific event flags defines
- */
-#include "xfs_dmapi_priv.h"
-
-/*
- * Macros to turn caller specified delay/block flags into
- * dm_send_xxxx_event flag DM_FLAGS_NDELAY.
- */
-
-#define FILP_DELAY_FLAG(filp) ((filp->f_flags&(O_NDELAY|O_NONBLOCK)) ? \
- DM_FLAGS_NDELAY : 0)
-#define AT_DELAY_FLAG(f) ((f & XFS_ATTR_NONBLOCK) ? DM_FLAGS_NDELAY : 0)
-
-#endif /* __XFS_DMAPI_H__ */
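The deleted xfs_dmapi.h above gated event delivery on per-file and per-mount bitmasks (see DM_EVENT_ENABLED and the DM_XFS_*_EVENTS sets). For reference, a stripped-down sketch of that style of bitmask gating, with hypothetical names:

    #include <stdio.h>

    enum ev { EV_READ = 16, EV_WRITE = 17 };	/* cf. DM_EVENT_READ/WRITE */

    struct target {
    	unsigned long evmask;	/* one bit per enabled event */
    };

    static int event_enabled(const struct target *t, enum ev e)
    {
    	return (t->evmask & (1UL << e)) != 0;
    }

    int main(void)
    {
    	struct target t = { 1UL << EV_READ };

    	printf("read: %d write: %d\n",
    	       event_enabled(&t, EV_READ),
    	       event_enabled(&t, EV_WRITE));	/* read: 1 write: 0 */
    	return 0;
    }

With DMAPI gone, every DM_EVENT_ENABLED() call site and XFS_SEND_* hook in the patch (for example the xfs_getbmap() read-event block removed earlier) collapses to nothing.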
diff --git a/fs/xfs/xfs_dmops.c b/fs/xfs/xfs_dmops.c
deleted file mode 100644
index e71e2581c0c3..000000000000
--- a/fs/xfs/xfs_dmops.c
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_dmapi.h"
-#include "xfs_inum.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-
-
-static struct xfs_dmops xfs_dmcore_stub = {
- .xfs_send_data = (xfs_send_data_t)fs_nosys,
- .xfs_send_mmap = (xfs_send_mmap_t)fs_noerr,
- .xfs_send_destroy = (xfs_send_destroy_t)fs_nosys,
- .xfs_send_namesp = (xfs_send_namesp_t)fs_nosys,
- .xfs_send_mount = (xfs_send_mount_t)fs_nosys,
- .xfs_send_unmount = (xfs_send_unmount_t)fs_noerr,
-};
-
-int
-xfs_dmops_get(struct xfs_mount *mp)
-{
- if (mp->m_flags & XFS_MOUNT_DMAPI) {
- cmn_err(CE_WARN,
- "XFS: dmapi support not available in this kernel.");
- return EINVAL;
- }
-
- mp->m_dm_ops = &xfs_dmcore_stub;
- return 0;
-}
-
-void
-xfs_dmops_put(struct xfs_mount *mp)
-{
-}
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 047b8a8e5c29..ed9990267661 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -23,12 +23,8 @@
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
#include "xfs_mount.h"
#include "xfs_bmap_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_utils.h"
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 409fe81585fd..a55e687bf562 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -24,7 +24,6 @@
#include "xfs_buf_item.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_dmapi.h"
#include "xfs_mount.h"
#include "xfs_trans_priv.h"
#include "xfs_extfree_item.h"
@@ -33,18 +32,19 @@
kmem_zone_t *xfs_efi_zone;
kmem_zone_t *xfs_efd_zone;
-STATIC void xfs_efi_item_unlock(xfs_efi_log_item_t *);
+static inline struct xfs_efi_log_item *EFI_ITEM(struct xfs_log_item *lip)
+{
+ return container_of(lip, struct xfs_efi_log_item, efi_item);
+}
void
-xfs_efi_item_free(xfs_efi_log_item_t *efip)
+xfs_efi_item_free(
+ struct xfs_efi_log_item *efip)
{
- int nexts = efip->efi_format.efi_nextents;
-
- if (nexts > XFS_EFI_MAX_FAST_EXTENTS) {
+ if (efip->efi_format.efi_nextents > XFS_EFI_MAX_FAST_EXTENTS)
kmem_free(efip);
- } else {
+ else
kmem_zone_free(xfs_efi_zone, efip);
- }
}
/*
@@ -52,9 +52,9 @@ xfs_efi_item_free(xfs_efi_log_item_t *efip)
* We only need 1 iovec for an efi item. It just logs the efi_log_format
* structure.
*/
-/*ARGSUSED*/
STATIC uint
-xfs_efi_item_size(xfs_efi_log_item_t *efip)
+xfs_efi_item_size(
+ struct xfs_log_item *lip)
{
return 1;
}
@@ -67,10 +67,12 @@ xfs_efi_item_size(xfs_efi_log_item_t *efip)
* slots in the efi item have been filled.
*/
STATIC void
-xfs_efi_item_format(xfs_efi_log_item_t *efip,
- xfs_log_iovec_t *log_vector)
+xfs_efi_item_format(
+ struct xfs_log_item *lip,
+ struct xfs_log_iovec *log_vector)
{
- uint size;
+ struct xfs_efi_log_item *efip = EFI_ITEM(lip);
+ uint size;
ASSERT(efip->efi_next_extent == efip->efi_format.efi_nextents);
@@ -80,7 +82,7 @@ xfs_efi_item_format(xfs_efi_log_item_t *efip,
size += (efip->efi_format.efi_nextents - 1) * sizeof(xfs_extent_t);
efip->efi_format.efi_size = 1;
- log_vector->i_addr = (xfs_caddr_t)&(efip->efi_format);
+ log_vector->i_addr = &efip->efi_format;
log_vector->i_len = size;
log_vector->i_type = XLOG_REG_TYPE_EFI_FORMAT;
ASSERT(size >= sizeof(xfs_efi_log_format_t));
@@ -90,60 +92,33 @@ xfs_efi_item_format(xfs_efi_log_item_t *efip,
/*
* Pinning has no meaning for an efi item, so just return.
*/
-/*ARGSUSED*/
STATIC void
-xfs_efi_item_pin(xfs_efi_log_item_t *efip)
+xfs_efi_item_pin(
+ struct xfs_log_item *lip)
{
- return;
}
-
/*
* While EFIs cannot really be pinned, the unpin operation is the
* last place at which the EFI is manipulated during a transaction.
* Here we coordinate with xfs_efi_cancel() to determine who gets to
* free the EFI.
*/
-/*ARGSUSED*/
-STATIC void
-xfs_efi_item_unpin(xfs_efi_log_item_t *efip)
-{
- struct xfs_ail *ailp = efip->efi_item.li_ailp;
-
- spin_lock(&ailp->xa_lock);
- if (efip->efi_flags & XFS_EFI_CANCELED) {
- /* xfs_trans_ail_delete() drops the AIL lock. */
- xfs_trans_ail_delete(ailp, (xfs_log_item_t *)efip);
- xfs_efi_item_free(efip);
- } else {
- efip->efi_flags |= XFS_EFI_COMMITTED;
- spin_unlock(&ailp->xa_lock);
- }
-}
-
-/*
- * like unpin only we have to also clear the xaction descriptor
- * pointing the log item if we free the item. This routine duplicates
- * unpin because efi_flags is protected by the AIL lock. Freeing
- * the descriptor and then calling unpin would force us to drop the AIL
- * lock which would open up a race condition.
- */
STATIC void
-xfs_efi_item_unpin_remove(xfs_efi_log_item_t *efip, xfs_trans_t *tp)
+xfs_efi_item_unpin(
+ struct xfs_log_item *lip,
+ int remove)
{
- struct xfs_ail *ailp = efip->efi_item.li_ailp;
- xfs_log_item_desc_t *lidp;
+ struct xfs_efi_log_item *efip = EFI_ITEM(lip);
+ struct xfs_ail *ailp = lip->li_ailp;
spin_lock(&ailp->xa_lock);
if (efip->efi_flags & XFS_EFI_CANCELED) {
- /*
- * free the xaction descriptor pointing to this item
- */
- lidp = xfs_trans_find_item(tp, (xfs_log_item_t *) efip);
- xfs_trans_free_item(tp, lidp);
+ if (remove)
+ xfs_trans_del_item(lip);
/* xfs_trans_ail_delete() drops the AIL lock. */
- xfs_trans_ail_delete(ailp, (xfs_log_item_t *)efip);
+ xfs_trans_ail_delete(ailp, lip);
xfs_efi_item_free(efip);
} else {
efip->efi_flags |= XFS_EFI_COMMITTED;
@@ -158,9 +133,9 @@ xfs_efi_item_unpin_remove(xfs_efi_log_item_t *efip, xfs_trans_t *tp)
* XFS_ITEM_PINNED so that the caller will eventually flush the log.
* This should help in getting the EFI out of the AIL.
*/
-/*ARGSUSED*/
STATIC uint
-xfs_efi_item_trylock(xfs_efi_log_item_t *efip)
+xfs_efi_item_trylock(
+ struct xfs_log_item *lip)
{
return XFS_ITEM_PINNED;
}
@@ -168,13 +143,12 @@ xfs_efi_item_trylock(xfs_efi_log_item_t *efip)
/*
* Efi items have no locking, so just return.
*/
-/*ARGSUSED*/
STATIC void
-xfs_efi_item_unlock(xfs_efi_log_item_t *efip)
+xfs_efi_item_unlock(
+ struct xfs_log_item *lip)
{
- if (efip->efi_item.li_flags & XFS_LI_ABORTED)
- xfs_efi_item_free(efip);
- return;
+ if (lip->li_flags & XFS_LI_ABORTED)
+ xfs_efi_item_free(EFI_ITEM(lip));
}
/*
@@ -183,9 +157,10 @@ xfs_efi_item_unlock(xfs_efi_log_item_t *efip)
* flag is not paid any attention here. Checking for that is delayed
* until the EFI is unpinned.
*/
-/*ARGSUSED*/
STATIC xfs_lsn_t
-xfs_efi_item_committed(xfs_efi_log_item_t *efip, xfs_lsn_t lsn)
+xfs_efi_item_committed(
+ struct xfs_log_item *lip,
+ xfs_lsn_t lsn)
{
return lsn;
}
@@ -195,11 +170,10 @@ xfs_efi_item_committed(xfs_efi_log_item_t *efip, xfs_lsn_t lsn)
* stuck waiting for all of its corresponding efd items to be
* committed to disk.
*/
-/*ARGSUSED*/
STATIC void
-xfs_efi_item_push(xfs_efi_log_item_t *efip)
+xfs_efi_item_push(
+ struct xfs_log_item *lip)
{
- return;
}
/*
@@ -209,61 +183,55 @@ xfs_efi_item_push(xfs_efi_log_item_t *efip)
* example, for inodes, the inode is locked throughout the extent freeing
* so the dependency should be recorded there.
*/
-/*ARGSUSED*/
STATIC void
-xfs_efi_item_committing(xfs_efi_log_item_t *efip, xfs_lsn_t lsn)
+xfs_efi_item_committing(
+ struct xfs_log_item *lip,
+ xfs_lsn_t lsn)
{
- return;
}
/*
* This is the ops vector shared by all efi log items.
*/
static struct xfs_item_ops xfs_efi_item_ops = {
- .iop_size = (uint(*)(xfs_log_item_t*))xfs_efi_item_size,
- .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
- xfs_efi_item_format,
- .iop_pin = (void(*)(xfs_log_item_t*))xfs_efi_item_pin,
- .iop_unpin = (void(*)(xfs_log_item_t*))xfs_efi_item_unpin,
- .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t *))
- xfs_efi_item_unpin_remove,
- .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_efi_item_trylock,
- .iop_unlock = (void(*)(xfs_log_item_t*))xfs_efi_item_unlock,
- .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
- xfs_efi_item_committed,
- .iop_push = (void(*)(xfs_log_item_t*))xfs_efi_item_push,
- .iop_pushbuf = NULL,
- .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
- xfs_efi_item_committing
+ .iop_size = xfs_efi_item_size,
+ .iop_format = xfs_efi_item_format,
+ .iop_pin = xfs_efi_item_pin,
+ .iop_unpin = xfs_efi_item_unpin,
+ .iop_trylock = xfs_efi_item_trylock,
+ .iop_unlock = xfs_efi_item_unlock,
+ .iop_committed = xfs_efi_item_committed,
+ .iop_push = xfs_efi_item_push,
+ .iop_committing = xfs_efi_item_committing
};
/*
* Allocate and initialize an efi item with the given number of extents.
*/
-xfs_efi_log_item_t *
-xfs_efi_init(xfs_mount_t *mp,
- uint nextents)
+struct xfs_efi_log_item *
+xfs_efi_init(
+ struct xfs_mount *mp,
+ uint nextents)
{
- xfs_efi_log_item_t *efip;
+ struct xfs_efi_log_item *efip;
uint size;
ASSERT(nextents > 0);
if (nextents > XFS_EFI_MAX_FAST_EXTENTS) {
size = (uint)(sizeof(xfs_efi_log_item_t) +
((nextents - 1) * sizeof(xfs_extent_t)));
- efip = (xfs_efi_log_item_t*)kmem_zalloc(size, KM_SLEEP);
+ efip = kmem_zalloc(size, KM_SLEEP);
} else {
- efip = (xfs_efi_log_item_t*)kmem_zone_zalloc(xfs_efi_zone,
- KM_SLEEP);
+ efip = kmem_zone_zalloc(xfs_efi_zone, KM_SLEEP);
}
xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops);
efip->efi_format.efi_nextents = nextents;
efip->efi_format.efi_id = (__psint_t)(void*)efip;
- return (efip);
+ return efip;
}
/*
@@ -276,7 +244,7 @@ xfs_efi_init(xfs_mount_t *mp,
int
xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt)
{
- xfs_efi_log_format_t *src_efi_fmt = (xfs_efi_log_format_t *)buf->i_addr;
+ xfs_efi_log_format_t *src_efi_fmt = buf->i_addr;
uint i;
uint len = sizeof(xfs_efi_log_format_t) +
(src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_t);
@@ -289,8 +257,7 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt)
memcpy((char *)dst_efi_fmt, (char*)src_efi_fmt, len);
return 0;
} else if (buf->i_len == len32) {
- xfs_efi_log_format_32_t *src_efi_fmt_32 =
- (xfs_efi_log_format_32_t *)buf->i_addr;
+ xfs_efi_log_format_32_t *src_efi_fmt_32 = buf->i_addr;
dst_efi_fmt->efi_type = src_efi_fmt_32->efi_type;
dst_efi_fmt->efi_size = src_efi_fmt_32->efi_size;
@@ -304,8 +271,7 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt)
}
return 0;
} else if (buf->i_len == len64) {
- xfs_efi_log_format_64_t *src_efi_fmt_64 =
- (xfs_efi_log_format_64_t *)buf->i_addr;
+ xfs_efi_log_format_64_t *src_efi_fmt_64 = buf->i_addr;
dst_efi_fmt->efi_type = src_efi_fmt_64->efi_type;
dst_efi_fmt->efi_size = src_efi_fmt_64->efi_size;
@@ -356,16 +322,18 @@ xfs_efi_release(xfs_efi_log_item_t *efip,
}
}
-STATIC void
-xfs_efd_item_free(xfs_efd_log_item_t *efdp)
+static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip)
{
- int nexts = efdp->efd_format.efd_nextents;
+ return container_of(lip, struct xfs_efd_log_item, efd_item);
+}
- if (nexts > XFS_EFD_MAX_FAST_EXTENTS) {
+STATIC void
+xfs_efd_item_free(struct xfs_efd_log_item *efdp)
+{
+ if (efdp->efd_format.efd_nextents > XFS_EFD_MAX_FAST_EXTENTS)
kmem_free(efdp);
- } else {
+ else
kmem_zone_free(xfs_efd_zone, efdp);
- }
}
/*
@@ -373,9 +341,9 @@ xfs_efd_item_free(xfs_efd_log_item_t *efdp)
* We only need 1 iovec for an efd item. It just logs the efd_log_format
* structure.
*/
-/*ARGSUSED*/
STATIC uint
-xfs_efd_item_size(xfs_efd_log_item_t *efdp)
+xfs_efd_item_size(
+ struct xfs_log_item *lip)
{
return 1;
}
@@ -388,10 +356,12 @@ xfs_efd_item_size(xfs_efd_log_item_t *efdp)
* slots in the efd item have been filled.
*/
STATIC void
-xfs_efd_item_format(xfs_efd_log_item_t *efdp,
- xfs_log_iovec_t *log_vector)
+xfs_efd_item_format(
+ struct xfs_log_item *lip,
+ struct xfs_log_iovec *log_vector)
{
- uint size;
+ struct xfs_efd_log_item *efdp = EFD_ITEM(lip);
+ uint size;
ASSERT(efdp->efd_next_extent == efdp->efd_format.efd_nextents);
@@ -401,48 +371,38 @@ xfs_efd_item_format(xfs_efd_log_item_t *efdp,
size += (efdp->efd_format.efd_nextents - 1) * sizeof(xfs_extent_t);
efdp->efd_format.efd_size = 1;
- log_vector->i_addr = (xfs_caddr_t)&(efdp->efd_format);
+ log_vector->i_addr = &efdp->efd_format;
log_vector->i_len = size;
log_vector->i_type = XLOG_REG_TYPE_EFD_FORMAT;
ASSERT(size >= sizeof(xfs_efd_log_format_t));
}
-
/*
* Pinning has no meaning for an efd item, so just return.
*/
-/*ARGSUSED*/
STATIC void
-xfs_efd_item_pin(xfs_efd_log_item_t *efdp)
+xfs_efd_item_pin(
+ struct xfs_log_item *lip)
{
- return;
}
-
/*
* Since pinning has no meaning for an efd item, unpinning does
* not either.
*/
-/*ARGSUSED*/
-STATIC void
-xfs_efd_item_unpin(xfs_efd_log_item_t *efdp)
-{
- return;
-}
-
-/*ARGSUSED*/
STATIC void
-xfs_efd_item_unpin_remove(xfs_efd_log_item_t *efdp, xfs_trans_t *tp)
+xfs_efd_item_unpin(
+ struct xfs_log_item *lip,
+ int remove)
{
- return;
}
/*
* Efd items have no locking, so just return success.
*/
-/*ARGSUSED*/
STATIC uint
-xfs_efd_item_trylock(xfs_efd_log_item_t *efdp)
+xfs_efd_item_trylock(
+ struct xfs_log_item *lip)
{
return XFS_ITEM_LOCKED;
}
@@ -451,13 +411,12 @@ xfs_efd_item_trylock(xfs_efd_log_item_t *efdp)
* Efd items have no locking or pushing, so return failure
* so that the caller doesn't bother with us.
*/
-/*ARGSUSED*/
STATIC void
-xfs_efd_item_unlock(xfs_efd_log_item_t *efdp)
+xfs_efd_item_unlock(
+ struct xfs_log_item *lip)
{
- if (efdp->efd_item.li_flags & XFS_LI_ABORTED)
- xfs_efd_item_free(efdp);
- return;
+ if (lip->li_flags & XFS_LI_ABORTED)
+ xfs_efd_item_free(EFD_ITEM(lip));
}
/*
@@ -467,15 +426,18 @@ xfs_efd_item_unlock(xfs_efd_log_item_t *efdp)
* return -1 to keep the transaction code from further referencing
* this item.
*/
-/*ARGSUSED*/
STATIC xfs_lsn_t
-xfs_efd_item_committed(xfs_efd_log_item_t *efdp, xfs_lsn_t lsn)
+xfs_efd_item_committed(
+ struct xfs_log_item *lip,
+ xfs_lsn_t lsn)
{
+ struct xfs_efd_log_item *efdp = EFD_ITEM(lip);
+
/*
* If we got a log I/O error, it's always the case that the LR with the
* EFI got unpinned and freed before the EFD got aborted.
*/
- if ((efdp->efd_item.li_flags & XFS_LI_ABORTED) == 0)
+ if (!(lip->li_flags & XFS_LI_ABORTED))
xfs_efi_release(efdp->efd_efip, efdp->efd_format.efd_nextents);
xfs_efd_item_free(efdp);
@@ -486,11 +448,10 @@ xfs_efd_item_committed(xfs_efd_log_item_t *efdp, xfs_lsn_t lsn)
* There isn't much you can do to push on an efd item. It is simply
* stuck waiting for the log to be flushed to disk.
*/
-/*ARGSUSED*/
STATIC void
-xfs_efd_item_push(xfs_efd_log_item_t *efdp)
+xfs_efd_item_push(
+ struct xfs_log_item *lip)
{
- return;
}
/*
@@ -500,55 +461,48 @@ xfs_efd_item_push(xfs_efd_log_item_t *efdp)
* example, for inodes, the inode is locked throughout the extent freeing
* so the dependency should be recorded there.
*/
-/*ARGSUSED*/
STATIC void
-xfs_efd_item_committing(xfs_efd_log_item_t *efip, xfs_lsn_t lsn)
+xfs_efd_item_committing(
+ struct xfs_log_item *lip,
+ xfs_lsn_t lsn)
{
- return;
}
/*
* This is the ops vector shared by all efd log items.
*/
static struct xfs_item_ops xfs_efd_item_ops = {
- .iop_size = (uint(*)(xfs_log_item_t*))xfs_efd_item_size,
- .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
- xfs_efd_item_format,
- .iop_pin = (void(*)(xfs_log_item_t*))xfs_efd_item_pin,
- .iop_unpin = (void(*)(xfs_log_item_t*))xfs_efd_item_unpin,
- .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*))
- xfs_efd_item_unpin_remove,
- .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_efd_item_trylock,
- .iop_unlock = (void(*)(xfs_log_item_t*))xfs_efd_item_unlock,
- .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
- xfs_efd_item_committed,
- .iop_push = (void(*)(xfs_log_item_t*))xfs_efd_item_push,
- .iop_pushbuf = NULL,
- .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
- xfs_efd_item_committing
+ .iop_size = xfs_efd_item_size,
+ .iop_format = xfs_efd_item_format,
+ .iop_pin = xfs_efd_item_pin,
+ .iop_unpin = xfs_efd_item_unpin,
+ .iop_trylock = xfs_efd_item_trylock,
+ .iop_unlock = xfs_efd_item_unlock,
+ .iop_committed = xfs_efd_item_committed,
+ .iop_push = xfs_efd_item_push,
+ .iop_committing = xfs_efd_item_committing
};
-
/*
* Allocate and initialize an efd item with the given number of extents.
*/
-xfs_efd_log_item_t *
-xfs_efd_init(xfs_mount_t *mp,
- xfs_efi_log_item_t *efip,
- uint nextents)
+struct xfs_efd_log_item *
+xfs_efd_init(
+ struct xfs_mount *mp,
+ struct xfs_efi_log_item *efip,
+ uint nextents)
{
- xfs_efd_log_item_t *efdp;
+ struct xfs_efd_log_item *efdp;
uint size;
ASSERT(nextents > 0);
if (nextents > XFS_EFD_MAX_FAST_EXTENTS) {
size = (uint)(sizeof(xfs_efd_log_item_t) +
((nextents - 1) * sizeof(xfs_extent_t)));
- efdp = (xfs_efd_log_item_t*)kmem_zalloc(size, KM_SLEEP);
+ efdp = kmem_zalloc(size, KM_SLEEP);
} else {
- efdp = (xfs_efd_log_item_t*)kmem_zone_zalloc(xfs_efd_zone,
- KM_SLEEP);
+ efdp = kmem_zone_zalloc(xfs_efd_zone, KM_SLEEP);
}
xfs_log_item_init(mp, &efdp->efd_item, XFS_LI_EFD, &xfs_efd_item_ops);
@@ -556,5 +510,5 @@ xfs_efd_init(xfs_mount_t *mp,
efdp->efd_format.efd_nextents = nextents;
efdp->efd_format.efd_efi_id = efip->efi_format.efi_id;
- return (efdp);
+ return efdp;
}
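
Note on the ops-vector cleanup above: the long casted function pointers in the old table existed only to paper over each handler's type-specific first argument. Now that every handler takes struct xfs_log_item * directly, the table can name the functions plainly, and each handler recovers its own type with a container_of() helper. The EFD handlers above rely on such a helper defined earlier in the patch; a sketch of it, matching the INODE_ITEM() helper this same patch adds in xfs_inode_item.c further down:

    /* Sketch: recover the EFD-specific item from the generic log item. */
    static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip)
    {
            return container_of(lip, struct xfs_efd_log_item, efd_item);
    }
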
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index 390850ee6603..9b715dce5699 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -18,13 +18,9 @@
#include "xfs.h"
#include "xfs_bmap_btree.h"
#include "xfs_inum.h"
-#include "xfs_dir2.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_ag.h"
-#include "xfs_dmapi.h"
#include "xfs_log.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
@@ -127,6 +123,82 @@ typedef struct fstrm_item
xfs_inode_t *pip; /* Parent directory inode pointer. */
} fstrm_item_t;
+/*
+ * Allocation group filestream associations are tracked with per-ag atomic
+ * counters. These counters allow _xfs_filestream_pick_ag() to tell whether a
+ * particular AG already has active filestreams associated with it. The mount
+ * point's m_peraglock is used to protect these counters from per-ag array
+ * re-allocation during a growfs operation. When xfs_growfs_data_private() is
+ * about to reallocate the array, it calls xfs_filestream_flush() with the
+ * m_peraglock held in write mode.
+ *
+ * Since xfs_mru_cache_flush() guarantees that all the free functions for all
+ * the cache elements have finished executing before it returns, it's safe for
+ * the free functions to use the atomic counters without m_peraglock protection.
+ * This allows the implementation of xfs_fstrm_free_func() to be agnostic about
+ * whether it was called with the m_peraglock held in read mode, write mode or
+ * not held at all. The race condition this addresses is the following:
+ *
+ * - The work queue scheduler fires and pulls a filestream directory cache
+ * element off the LRU end of the cache for deletion, then gets pre-empted.
+ * - A growfs operation grabs the m_peraglock in write mode, flushes all the
+ * remaining items from the cache and reallocates the mount point's per-ag
+ * array, resetting all the counters to zero.
+ * - The work queue thread resumes and calls the free function for the element
+ * it started cleaning up earlier. In the process it decrements the
+ * filestreams counter for an AG that now has no references.
+ *
+ * With a shrinkfs feature, the above scenario could panic the system.
+ *
+ * All other uses of the following macros should be protected by either the
+ * m_peraglock held in read mode, or the cache's internal locking exposed by the
+ * interval between a call to xfs_mru_cache_lookup() and a call to
+ * xfs_mru_cache_done(). In addition, the m_peraglock must be held in read mode
+ * when new elements are added to the cache.
+ *
+ * Combined, these locking rules ensure that no associations will ever exist in
+ * the cache that reference per-ag array elements that have since been
+ * reallocated.
+ */
+static int
+xfs_filestream_peek_ag(
+ xfs_mount_t *mp,
+ xfs_agnumber_t agno)
+{
+ struct xfs_perag *pag;
+ int ret;
+
+ pag = xfs_perag_get(mp, agno);
+ ret = atomic_read(&pag->pagf_fstrms);
+ xfs_perag_put(pag);
+ return ret;
+}
+
+static int
+xfs_filestream_get_ag(
+ xfs_mount_t *mp,
+ xfs_agnumber_t agno)
+{
+ struct xfs_perag *pag;
+ int ret;
+
+ pag = xfs_perag_get(mp, agno);
+ ret = atomic_inc_return(&pag->pagf_fstrms);
+ xfs_perag_put(pag);
+ return ret;
+}
+
+static void
+xfs_filestream_put_ag(
+ xfs_mount_t *mp,
+ xfs_agnumber_t agno)
+{
+ struct xfs_perag *pag;
+
+ pag = xfs_perag_get(mp, agno);
+ atomic_dec(&pag->pagf_fstrms);
+ xfs_perag_put(pag);
+}
/*
* Scan the AGs starting at startag looking for an AG that isn't in use and has
@@ -355,16 +427,14 @@ xfs_fstrm_free_func(
{
fstrm_item_t *item = (fstrm_item_t *)data;
xfs_inode_t *ip = item->ip;
- int ref;
ASSERT(ip->i_ino == ino);
xfs_iflags_clear(ip, XFS_IFILESTREAM);
/* Drop the reference taken on the AG when the item was added. */
- ref = xfs_filestream_put_ag(ip->i_mount, item->ag);
+ xfs_filestream_put_ag(ip->i_mount, item->ag);
- ASSERT(ref >= 0);
TRACE_FREE(ip->i_mount, ip, item->pip, item->ag,
xfs_filestream_peek_ag(ip->i_mount, item->ag));
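
The three helpers moved into xfs_filestream.c above all follow one shape: take the per-AG reference with xfs_perag_get(), touch the atomic counter, drop the reference. Outside a cache free callback, the locking rule from the big comment applies; a hedged sketch of reading the counter inside the MRU cache's lookup/done window (example_peek_in_cache_window() is hypothetical; the cache calls are the existing xfs_mru_cache API and mp->m_filestream is the mount's MRU cache):

    /* Sketch: peek at an AG's stream count under the cache's own locking. */
    static int example_peek_in_cache_window(struct xfs_mount *mp,
                                            struct xfs_inode *ip,
                                            xfs_agnumber_t agno)
    {
            fstrm_item_t *item;
            int count = -1;

            item = xfs_mru_cache_lookup(mp->m_filestream, ip->i_ino);
            if (item) {
                    count = xfs_filestream_peek_ag(mp, agno);
                    xfs_mru_cache_done(mp->m_filestream);
            }
            return count;
    }
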
diff --git a/fs/xfs/xfs_filestream.h b/fs/xfs/xfs_filestream.h
index 260f757bbc5d..09dd9af45434 100644
--- a/fs/xfs/xfs_filestream.h
+++ b/fs/xfs/xfs_filestream.h
@@ -42,88 +42,6 @@ extern ktrace_t *xfs_filestreams_trace_buf;
#endif
-/*
- * Allocation group filestream associations are tracked with per-ag atomic
- * counters. These counters allow _xfs_filestream_pick_ag() to tell whether a
- * particular AG already has active filestreams associated with it. The mount
- * point's m_peraglock is used to protect these counters from per-ag array
- * re-allocation during a growfs operation. When xfs_growfs_data_private() is
- * about to reallocate the array, it calls xfs_filestream_flush() with the
- * m_peraglock held in write mode.
- *
- * Since xfs_mru_cache_flush() guarantees that all the free functions for all
- * the cache elements have finished executing before it returns, it's safe for
- * the free functions to use the atomic counters without m_peraglock protection.
- * This allows the implementation of xfs_fstrm_free_func() to be agnostic about
- * whether it was called with the m_peraglock held in read mode, write mode or
- * not held at all. The race condition this addresses is the following:
- *
- * - The work queue scheduler fires and pulls a filestream directory cache
- * element off the LRU end of the cache for deletion, then gets pre-empted.
- * - A growfs operation grabs the m_peraglock in write mode, flushes all the
- * remaining items from the cache and reallocates the mount point's per-ag
- * array, resetting all the counters to zero.
- * - The work queue thread resumes and calls the free function for the element
- * it started cleaning up earlier. In the process it decrements the
- * filestreams counter for an AG that now has no references.
- *
- * With a shrinkfs feature, the above scenario could panic the system.
- *
- * All other uses of the following macros should be protected by either the
- * m_peraglock held in read mode, or the cache's internal locking exposed by the
- * interval between a call to xfs_mru_cache_lookup() and a call to
- * xfs_mru_cache_done(). In addition, the m_peraglock must be held in read mode
- * when new elements are added to the cache.
- *
- * Combined, these locking rules ensure that no associations will ever exist in
- * the cache that reference per-ag array elements that have since been
- * reallocated.
- */
-/*
- * xfs_filestream_peek_ag is only used in tracing code
- */
-static inline int
-xfs_filestream_peek_ag(
- xfs_mount_t *mp,
- xfs_agnumber_t agno)
-{
- struct xfs_perag *pag;
- int ret;
-
- pag = xfs_perag_get(mp, agno);
- ret = atomic_read(&pag->pagf_fstrms);
- xfs_perag_put(pag);
- return ret;
-}
-
-static inline int
-xfs_filestream_get_ag(
- xfs_mount_t *mp,
- xfs_agnumber_t agno)
-{
- struct xfs_perag *pag;
- int ret;
-
- pag = xfs_perag_get(mp, agno);
- ret = atomic_inc_return(&pag->pagf_fstrms);
- xfs_perag_put(pag);
- return ret;
-}
-
-static inline int
-xfs_filestream_put_ag(
- xfs_mount_t *mp,
- xfs_agnumber_t agno)
-{
- struct xfs_perag *pag;
- int ret;
-
- pag = xfs_perag_get(mp, agno);
- ret = atomic_dec_return(&pag->pagf_fstrms);
- xfs_perag_put(pag);
- return ret;
-}
-
/* allocation selection flags */
typedef enum xfs_fstrm_alloc {
XFS_PICK_USERDATA = 1,
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 37a6f62c57b6..dbca5f5c37ba 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -24,14 +24,10 @@
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
#include "xfs_mount.h"
#include "xfs_bmap_btree.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_inode_item.h"
@@ -626,8 +622,7 @@ xfs_fs_log_dummy(
ip = mp->m_rootip;
xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
- xfs_trans_ihold(tp, ip);
+ xfs_trans_ijoin(tp, ip);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
xfs_trans_set_sync(tp);
error = xfs_trans_commit(tp, 0);
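
The old two-step idiom, xfs_trans_ijoin() followed by xfs_trans_ihold(), collapses into a single xfs_trans_ijoin(tp, ip) call throughout this series. A sketch of the resulting pattern, modelled on xfs_fs_log_dummy() above; the lock lifetime is inferred from the new xfs_inode_item_unlock() later in the patch, where a zero ili_lock_flags means the commit path does not drop the ILOCK:

    /* Sketch: log an inode core under the new single-call join. */
    static int example_log_inode(struct xfs_trans *tp, struct xfs_inode *ip)
    {
            int error;

            xfs_ilock(ip, XFS_ILOCK_EXCL);
            xfs_trans_ijoin(tp, ip);        /* replaces ijoin + ihold */
            xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
            error = xfs_trans_commit(tp, 0);
            xfs_iunlock(ip, XFS_ILOCK_EXCL); /* still the caller's lock */
            return error;
    }
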
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 9d884c127bb9..abf80ae1e95b 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -24,14 +24,10 @@
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
#include "xfs_mount.h"
#include "xfs_bmap_btree.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
@@ -1203,6 +1199,63 @@ error0:
return error;
}
+STATIC int
+xfs_imap_lookup(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ xfs_agino_t agino,
+ xfs_agblock_t agbno,
+ xfs_agblock_t *chunk_agbno,
+ xfs_agblock_t *offset_agbno,
+ int flags)
+{
+ struct xfs_inobt_rec_incore rec;
+ struct xfs_btree_cur *cur;
+ struct xfs_buf *agbp;
+ xfs_agino_t startino;
+ int error;
+ int i;
+
+ error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
+ if (error) {
+ xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
+ "xfs_ialloc_read_agi() returned "
+ "error %d, agno %d",
+ error, agno);
+ return error;
+ }
+
+ /*
+ * derive and lookup the exact inode record for the given agino. If the
+ * record cannot be found, then it's an invalid inode number and we
+ * should abort.
+ */
+ cur = xfs_inobt_init_cursor(mp, tp, agbp, agno);
+ startino = agino & ~(XFS_IALLOC_INODES(mp) - 1);
+ error = xfs_inobt_lookup(cur, startino, XFS_LOOKUP_EQ, &i);
+ if (!error) {
+ if (i)
+ error = xfs_inobt_get_rec(cur, &rec, &i);
+ if (!error && i == 0)
+ error = EINVAL;
+ }
+
+ xfs_trans_brelse(tp, agbp);
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ if (error)
+ return error;
+
+ /* for untrusted inodes check it is allocated first */
+ if ((flags & XFS_IGET_UNTRUSTED) &&
+ (rec.ir_free & XFS_INOBT_MASK(agino - rec.ir_startino)))
+ return EINVAL;
+
+ *chunk_agbno = XFS_AGINO_TO_AGBNO(mp, rec.ir_startino);
+ *offset_agbno = agbno - *chunk_agbno;
+ return 0;
+}
+
/*
* Return the location of the inode in imap, for mapping it into a buffer.
*/
@@ -1235,8 +1288,11 @@ xfs_imap(
if (agno >= mp->m_sb.sb_agcount || agbno >= mp->m_sb.sb_agblocks ||
ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
#ifdef DEBUG
- /* no diagnostics for bulkstat, ino comes from userspace */
- if (flags & XFS_IGET_BULKSTAT)
+ /*
+ * Don't output diagnostic information for untrusted inodes
+ * as they can be invalid without implying corruption.
+ */
+ if (flags & XFS_IGET_UNTRUSTED)
return XFS_ERROR(EINVAL);
if (agno >= mp->m_sb.sb_agcount) {
xfs_fs_cmn_err(CE_ALERT, mp,
@@ -1263,6 +1319,23 @@ xfs_imap(
return XFS_ERROR(EINVAL);
}
+ blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_blocklog;
+
+ /*
+ * For bulkstat and handle lookups, we have an untrusted inode number
+ * that we have to verify is valid. We cannot do this just by reading
+ * the inode buffer as it may have been unlinked and removed leaving
+ * inodes in stale state on disk. Hence we have to do a btree lookup
+ * in all cases where an untrusted inode number is passed.
+ */
+ if (flags & XFS_IGET_UNTRUSTED) {
+ error = xfs_imap_lookup(mp, tp, agno, agino, agbno,
+ &chunk_agbno, &offset_agbno, flags);
+ if (error)
+ return error;
+ goto out_map;
+ }
+
/*
* If the inode cluster size is the same as the blocksize or
* smaller we get to the buffer by simple arithmetics.
@@ -1277,24 +1350,6 @@ xfs_imap(
return 0;
}
- blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_blocklog;
-
- /*
- * If we get a block number passed from bulkstat we can use it to
- * find the buffer easily.
- */
- if (imap->im_blkno) {
- offset = XFS_INO_TO_OFFSET(mp, ino);
- ASSERT(offset < mp->m_sb.sb_inopblock);
-
- cluster_agbno = xfs_daddr_to_agbno(mp, imap->im_blkno);
- offset += (agbno - cluster_agbno) * mp->m_sb.sb_inopblock;
-
- imap->im_len = XFS_FSB_TO_BB(mp, blks_per_cluster);
- imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog);
- return 0;
- }
-
/*
* If the inode chunks are aligned then use simple maths to
* find the location. Otherwise we have to do a btree
@@ -1304,50 +1359,13 @@ xfs_imap(
offset_agbno = agbno & mp->m_inoalign_mask;
chunk_agbno = agbno - offset_agbno;
} else {
- xfs_btree_cur_t *cur; /* inode btree cursor */
- xfs_inobt_rec_incore_t chunk_rec;
- xfs_buf_t *agbp; /* agi buffer */
- int i; /* temp state */
-
- error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
- if (error) {
- xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
- "xfs_ialloc_read_agi() returned "
- "error %d, agno %d",
- error, agno);
- return error;
- }
-
- cur = xfs_inobt_init_cursor(mp, tp, agbp, agno);
- error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i);
- if (error) {
- xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
- "xfs_inobt_lookup() failed");
- goto error0;
- }
-
- error = xfs_inobt_get_rec(cur, &chunk_rec, &i);
- if (error) {
- xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
- "xfs_inobt_get_rec() failed");
- goto error0;
- }
- if (i == 0) {
-#ifdef DEBUG
- xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
- "xfs_inobt_get_rec() failed");
-#endif /* DEBUG */
- error = XFS_ERROR(EINVAL);
- }
- error0:
- xfs_trans_brelse(tp, agbp);
- xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ error = xfs_imap_lookup(mp, tp, agno, agino, agbno,
+ &chunk_agbno, &offset_agbno, flags);
if (error)
return error;
- chunk_agbno = XFS_AGINO_TO_AGBNO(mp, chunk_rec.ir_startino);
- offset_agbno = agbno - chunk_agbno;
}
+out_map:
ASSERT(agbno >= chunk_agbno);
cluster_agbno = chunk_agbno +
((offset_agbno / blks_per_cluster) * blks_per_cluster);
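
xfs_imap_lookup() above rounds the AG-relative inode number down to its chunk boundary before the exact-match btree lookup, then tests the free mask for untrusted callers. A worked example under an assumed geometry (64 inodes per chunk here; the real XFS_IALLOC_INODES(mp) depends on the filesystem):

    /* Sketch of the chunk arithmetic; 64 inodes/chunk is an assumption. */
    static int example_untrusted_check(struct xfs_inobt_rec_incore *rec,
                                       xfs_agino_t agino)
    {
            /* e.g. agino 0x1234 -> startino 0x1200, offset 0x34 */
            xfs_agino_t startino = agino & ~(xfs_agino_t)(64 - 1);

            ASSERT(rec->ir_startino == startino);
            if (rec->ir_free & XFS_INOBT_MASK(agino - rec->ir_startino))
                    return EINVAL;  /* inode is free: invalid number */
            return 0;
    }
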
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c
index c282a9af5393..d352862cefa0 100644
--- a/fs/xfs/xfs_ialloc_btree.c
+++ b/fs/xfs/xfs_ialloc_btree.c
@@ -24,14 +24,10 @@
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
#include "xfs_mount.h"
#include "xfs_bmap_btree.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 6845db90818f..b1ecc6f97ade 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -25,14 +25,10 @@
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
#include "xfs_mount.h"
#include "xfs_bmap_btree.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
@@ -95,7 +91,7 @@ xfs_inode_alloc(
return ip;
}
-STATIC void
+void
xfs_inode_free(
struct xfs_inode *ip)
{
@@ -212,7 +208,7 @@ xfs_iget_cache_hit(
ip->i_flags &= ~XFS_INEW;
ip->i_flags |= XFS_IRECLAIMABLE;
__xfs_inode_set_reclaim_tag(pag, ip);
- trace_xfs_iget_reclaim(ip);
+ trace_xfs_iget_reclaim_fail(ip);
goto out_error;
}
@@ -227,6 +223,7 @@ xfs_iget_cache_hit(
} else {
/* If the VFS inode is being torn down, pause and try again. */
if (!igrab(inode)) {
+ trace_xfs_iget_skip(ip);
error = EAGAIN;
goto out_error;
}
@@ -234,6 +231,7 @@ xfs_iget_cache_hit(
/* We've got a live one. */
spin_unlock(&ip->i_flags_lock);
read_unlock(&pag->pag_ici_lock);
+ trace_xfs_iget_hit(ip);
}
if (lock_flags != 0)
@@ -242,7 +240,6 @@ xfs_iget_cache_hit(
xfs_iflags_clear(ip, XFS_ISTALE);
XFS_STATS_INC(xs_ig_found);
- trace_xfs_iget_found(ip);
return 0;
out_error:
@@ -259,24 +256,22 @@ xfs_iget_cache_miss(
xfs_trans_t *tp,
xfs_ino_t ino,
struct xfs_inode **ipp,
- xfs_daddr_t bno,
int flags,
int lock_flags)
{
struct xfs_inode *ip;
int error;
- unsigned long first_index, mask;
xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino);
ip = xfs_inode_alloc(mp, ino);
if (!ip)
return ENOMEM;
- error = xfs_iread(mp, tp, ip, bno, flags);
+ error = xfs_iread(mp, tp, ip, flags);
if (error)
goto out_destroy;
- xfs_itrace_entry(ip);
+ trace_xfs_iget_miss(ip);
if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
error = ENOENT;
@@ -302,8 +297,6 @@ xfs_iget_cache_miss(
BUG();
}
- mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
- first_index = agino & mask;
write_lock(&pag->pag_ici_lock);
/* insert the new inode */
@@ -322,7 +315,6 @@ xfs_iget_cache_miss(
write_unlock(&pag->pag_ici_lock);
radix_tree_preload_end();
- trace_xfs_iget_alloc(ip);
*ipp = ip;
return 0;
@@ -358,8 +350,6 @@ out_destroy:
* within the file system for the inode being requested.
* lock_flags -- flags indicating how to lock the inode. See the comment
* for xfs_ilock() for a list of valid values.
- * bno -- the block number starting the buffer containing the inode,
- * if known (as by bulkstat), else 0.
*/
int
xfs_iget(
@@ -368,8 +358,7 @@ xfs_iget(
xfs_ino_t ino,
uint flags,
uint lock_flags,
- xfs_inode_t **ipp,
- xfs_daddr_t bno)
+ xfs_inode_t **ipp)
{
xfs_inode_t *ip;
int error;
@@ -382,9 +371,6 @@ xfs_iget(
/* get the perag structure and ensure that it's inode capable */
pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
- if (!pag->pagi_inodeok)
- return EINVAL;
- ASSERT(pag->pag_ici_init);
agino = XFS_INO_TO_AGINO(mp, ino);
again:
@@ -400,7 +386,7 @@ again:
read_unlock(&pag->pag_ici_lock);
XFS_STATS_INC(xs_ig_missed);
- error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip, bno,
+ error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
flags, lock_flags);
if (error)
goto out_error_or_again;
@@ -429,97 +415,6 @@ out_error_or_again:
}
/*
- * Decrement reference count of an inode structure and unlock it.
- *
- * ip -- the inode being released
- * lock_flags -- this parameter indicates the inode's locks to be
- * to be released. See the comment on xfs_iunlock() for a list
- * of valid values.
- */
-void
-xfs_iput(xfs_inode_t *ip,
- uint lock_flags)
-{
- xfs_itrace_entry(ip);
- xfs_iunlock(ip, lock_flags);
- IRELE(ip);
-}
-
-/*
- * Special iput for brand-new inodes that are still locked
- */
-void
-xfs_iput_new(
- xfs_inode_t *ip,
- uint lock_flags)
-{
- struct inode *inode = VFS_I(ip);
-
- xfs_itrace_entry(ip);
-
- if ((ip->i_d.di_mode == 0)) {
- ASSERT(!xfs_iflags_test(ip, XFS_IRECLAIMABLE));
- make_bad_inode(inode);
- }
- if (inode->i_state & I_NEW)
- unlock_new_inode(inode);
- if (lock_flags)
- xfs_iunlock(ip, lock_flags);
- IRELE(ip);
-}
-
-/*
- * This is called free all the memory associated with an inode.
- * It must free the inode itself and any buffers allocated for
- * if_extents/if_data and if_broot. It must also free the lock
- * associated with the inode.
- *
- * Note: because we don't initialise everything on reallocation out
- * of the zone, we must ensure we nullify everything correctly before
- * freeing the structure.
- */
-void
-xfs_ireclaim(
- struct xfs_inode *ip)
-{
- struct xfs_mount *mp = ip->i_mount;
- struct xfs_perag *pag;
- xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
-
- XFS_STATS_INC(xs_ig_reclaims);
-
- /*
- * Remove the inode from the per-AG radix tree.
- *
- * Because radix_tree_delete won't complain even if the item was never
- * added to the tree assert that it's been there before to catch
- * problems with the inode life time early on.
- */
- pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
- write_lock(&pag->pag_ici_lock);
- if (!radix_tree_delete(&pag->pag_ici_root, agino))
- ASSERT(0);
- write_unlock(&pag->pag_ici_lock);
- xfs_perag_put(pag);
-
- /*
- * Here we do an (almost) spurious inode lock in order to coordinate
- * with inode cache radix tree lookups. This is because the lookup
- * can reference the inodes in the cache without taking references.
- *
- * We make that OK here by ensuring that we wait until the inode is
- * unlocked after the lookup before we go ahead and free it. We get
- * both the ilock and the iolock because the code may need to drop the
- * ilock one but will still hold the iolock.
- */
- xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
- xfs_qm_dqdetach(ip);
- xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-
- xfs_inode_free(ip);
-}
-
-/*
* This is a wrapper routine around the xfs_ilock() routine
* used to centralize some grungy code. It is used in places
* that wish to lock the inode solely for reading the extents.
@@ -744,30 +639,24 @@ xfs_ilock_demote(
}
#ifdef DEBUG
-/*
- * Debug-only routine, without additional rw_semaphore APIs, we can
- * now only answer requests regarding whether we hold the lock for write
- * (reader state is outside our visibility, we only track writer state).
- *
- * Note: this means !xfs_isilocked would give false positives, so don't do that.
- */
int
xfs_isilocked(
xfs_inode_t *ip,
uint lock_flags)
{
- if ((lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) ==
- XFS_ILOCK_EXCL) {
- if (!ip->i_lock.mr_writer)
- return 0;
+ if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) {
+ if (!(lock_flags & XFS_ILOCK_SHARED))
+ return !!ip->i_lock.mr_writer;
+ return rwsem_is_locked(&ip->i_lock.mr_lock);
}
- if ((lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) ==
- XFS_IOLOCK_EXCL) {
- if (!ip->i_iolock.mr_writer)
- return 0;
+ if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) {
+ if (!(lock_flags & XFS_IOLOCK_SHARED))
+ return !!ip->i_iolock.mr_writer;
+ return rwsem_is_locked(&ip->i_iolock.mr_lock);
}
- return 1;
+ ASSERT(0);
+ return 0;
}
#endif
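
Note the semantic shift in the rewritten xfs_isilocked(): asking about XFS_ILOCK_SHARED now answers "is the rwsem held at all" via rwsem_is_locked(), while XFS_ILOCK_EXCL alone still checks only the writer state that the mrlock tracks. Sketched assertion usage under that reading:

    /* Sketch of the post-rewrite assertion semantics. */
    static void example_lock_asserts(struct xfs_inode *ip)
    {
            /* true only when the ILOCK is held exclusively */
            ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
            /* true when the ILOCK is held shared OR exclusive */
            ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED));
    }
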
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 8cd6e8d8fe9c..68415cb4f23c 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -27,13 +27,10 @@
#include "xfs_trans_priv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
#include "xfs_mount.h"
#include "xfs_bmap_btree.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
#include "xfs_attr_sf.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
@@ -44,7 +41,6 @@
#include "xfs_alloc.h"
#include "xfs_ialloc.h"
#include "xfs_bmap.h"
-#include "xfs_rw.h"
#include "xfs_error.h"
#include "xfs_utils.h"
#include "xfs_quota.h"
@@ -177,7 +173,7 @@ xfs_imap_to_bp(
if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
XFS_ERRTAG_ITOBP_INOTOBP,
XFS_RANDOM_ITOBP_INOTOBP))) {
- if (iget_flags & XFS_IGET_BULKSTAT) {
+ if (iget_flags & XFS_IGET_UNTRUSTED) {
xfs_trans_brelse(tp, bp);
return XFS_ERROR(EINVAL);
}
@@ -426,7 +422,7 @@ xfs_iformat(
if (!XFS_DFORK_Q(dip))
return 0;
ASSERT(ip->i_afp == NULL);
- ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP);
+ ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS);
ip->i_afp->if_ext_max =
XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
switch (dip->di_aformat) {
@@ -509,7 +505,7 @@ xfs_iformat_local(
ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
else {
real_size = roundup(size, 4);
- ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP);
+ ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS);
}
ifp->if_bytes = size;
ifp->if_real_bytes = real_size;
@@ -636,7 +632,7 @@ xfs_iformat_btree(
}
ifp->if_broot_bytes = size;
- ifp->if_broot = kmem_alloc(size, KM_SLEEP);
+ ifp->if_broot = kmem_alloc(size, KM_SLEEP | KM_NOFS);
ASSERT(ifp->if_broot != NULL);
/*
* Copy and convert from the on-disk structure
@@ -787,7 +783,6 @@ xfs_iread(
xfs_mount_t *mp,
xfs_trans_t *tp,
xfs_inode_t *ip,
- xfs_daddr_t bno,
uint iget_flags)
{
xfs_buf_t *bp;
@@ -797,11 +792,9 @@ xfs_iread(
/*
* Fill in the location information in the in-core inode.
*/
- ip->i_imap.im_blkno = bno;
error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, iget_flags);
if (error)
return error;
- ASSERT(bno == 0 || bno == ip->i_imap.im_blkno);
/*
* Get pointers to the on-disk inode and the buffer containing it.
@@ -925,7 +918,6 @@ xfs_iread_extents(
int error;
xfs_ifork_t *ifp;
xfs_extnum_t nextents;
- size_t size;
if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW,
@@ -933,7 +925,6 @@ xfs_iread_extents(
return XFS_ERROR(EFSCORRUPTED);
}
nextents = XFS_IFORK_NEXTENTS(ip, whichfork);
- size = nextents * sizeof(xfs_bmbt_rec_t);
ifp = XFS_IFORK_PTR(ip, whichfork);
/*
@@ -1229,7 +1220,7 @@ xfs_isize_check(
(xfs_ufsize_t)XFS_MAXIOFFSET(mp)) -
map_first),
XFS_BMAPI_ENTIRE, NULL, 0, imaps, &nimaps,
- NULL, NULL))
+ NULL))
return;
ASSERT(nimaps == 1);
ASSERT(imaps[0].br_startblock == HOLESTARTBLOCK);
@@ -1463,7 +1454,7 @@ xfs_itruncate_finish(
ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
ASSERT(ip->i_transp == *tp);
ASSERT(ip->i_itemp != NULL);
- ASSERT(ip->i_itemp->ili_flags & XFS_ILI_HOLD);
+ ASSERT(ip->i_itemp->ili_lock_flags == 0);
ntp = *tp;
@@ -1592,11 +1583,10 @@ xfs_itruncate_finish(
xfs_bmap_init(&free_list, &first_block);
error = xfs_bunmapi(ntp, ip,
first_unmap_block, unmap_len,
- xfs_bmapi_aflag(fork) |
- (sync ? 0 : XFS_BMAPI_ASYNC),
+ xfs_bmapi_aflag(fork),
XFS_ITRUNC_MAX_EXTENTS,
&first_block, &free_list,
- NULL, &done);
+ &done);
if (error) {
/*
* If the bunmapi call encounters an error,
@@ -1615,12 +1605,8 @@ xfs_itruncate_finish(
*/
error = xfs_bmap_finish(tp, &free_list, &committed);
ntp = *tp;
- if (committed) {
- /* link the inode into the next xact in the chain */
- xfs_trans_ijoin(ntp, ip,
- XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
- xfs_trans_ihold(ntp, ip);
- }
+ if (committed)
+ xfs_trans_ijoin(ntp, ip);
if (error) {
/*
@@ -1649,9 +1635,7 @@ xfs_itruncate_finish(
error = xfs_trans_commit(*tp, 0);
*tp = ntp;
- /* link the inode into the next transaction in the chain */
- xfs_trans_ijoin(ntp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
- xfs_trans_ihold(ntp, ip);
+ xfs_trans_ijoin(ntp, ip);
if (error)
return error;
@@ -1940,10 +1924,10 @@ xfs_ifree_cluster(
int blks_per_cluster;
int nbufs;
int ninodes;
- int i, j, found, pre_flushed;
+ int i, j;
xfs_daddr_t blkno;
xfs_buf_t *bp;
- xfs_inode_t *ip, **ip_found;
+ xfs_inode_t *ip;
xfs_inode_log_item_t *iip;
xfs_log_item_t *lip;
struct xfs_perag *pag;
@@ -1960,114 +1944,97 @@ xfs_ifree_cluster(
nbufs = XFS_IALLOC_BLOCKS(mp) / blks_per_cluster;
}
- ip_found = kmem_alloc(ninodes * sizeof(xfs_inode_t *), KM_NOFS);
-
for (j = 0; j < nbufs; j++, inum += ninodes) {
+ int found = 0;
+
blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
XFS_INO_TO_AGBNO(mp, inum));
+ /*
+ * We obtain and lock the backing buffer first in the process
+ * here, as we have to ensure that any dirty inode that we
+ * can't get the flush lock on is attached to the buffer.
+ * If we scan the in-memory inodes first, then buffer IO can
+ * complete before we get a lock on it, and hence we may fail
+ * to mark all the active inodes on the buffer stale.
+ */
+ bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
+ mp->m_bsize * blks_per_cluster,
+ XBF_LOCK);
/*
- * Look for each inode in memory and attempt to lock it,
- * we can be racing with flush and tail pushing here.
- * any inode we get the locks on, add to an array of
- * inode items to process later.
+ * Walk the inodes already attached to the buffer and mark them
+ * stale. These will all have the flush locks held, so an
+ * in-memory inode walk can't lock them.
+ */
+ lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
+ while (lip) {
+ if (lip->li_type == XFS_LI_INODE) {
+ iip = (xfs_inode_log_item_t *)lip;
+ ASSERT(iip->ili_logged == 1);
+ lip->li_cb = xfs_istale_done;
+ xfs_trans_ail_copy_lsn(mp->m_ail,
+ &iip->ili_flush_lsn,
+ &iip->ili_item.li_lsn);
+ xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
+ found++;
+ }
+ lip = lip->li_bio_list;
+ }
+
+ /*
+ * For each inode in memory attempt to add it to the inode
+ * buffer and set it up for being staled on buffer IO
+ * completion. This is safe as we've locked out tail pushing
+ * and flushing by locking the buffer.
*
- * The get the buffer lock, we could beat a flush
- * or tail pushing thread to the lock here, in which
- * case they will go looking for the inode buffer
- * and fail, we need some other form of interlock
- * here.
+ * We have already marked every inode that was part of a
+ * transaction stale above, which means there is no point in
+ * even trying to lock them.
*/
- found = 0;
for (i = 0; i < ninodes; i++) {
read_lock(&pag->pag_ici_lock);
ip = radix_tree_lookup(&pag->pag_ici_root,
XFS_INO_TO_AGINO(mp, (inum + i)));
- /* Inode not in memory or we found it already,
- * nothing to do
- */
+ /* Inode not in memory or stale, nothing to do */
if (!ip || xfs_iflags_test(ip, XFS_ISTALE)) {
read_unlock(&pag->pag_ici_lock);
continue;
}
- if (xfs_inode_clean(ip)) {
- read_unlock(&pag->pag_ici_lock);
- continue;
- }
-
- /* If we can get the locks then add it to the
- * list, otherwise by the time we get the bp lock
- * below it will already be attached to the
- * inode buffer.
- */
-
- /* This inode will already be locked - by us, lets
- * keep it that way.
- */
-
- if (ip == free_ip) {
- if (xfs_iflock_nowait(ip)) {
- xfs_iflags_set(ip, XFS_ISTALE);
- if (xfs_inode_clean(ip)) {
- xfs_ifunlock(ip);
- } else {
- ip_found[found++] = ip;
- }
- }
+ /* don't try to lock/unlock the current inode */
+ if (ip != free_ip &&
+ !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
read_unlock(&pag->pag_ici_lock);
continue;
}
+ read_unlock(&pag->pag_ici_lock);
- if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
- if (xfs_iflock_nowait(ip)) {
- xfs_iflags_set(ip, XFS_ISTALE);
-
- if (xfs_inode_clean(ip)) {
- xfs_ifunlock(ip);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- } else {
- ip_found[found++] = ip;
- }
- } else {
+ if (!xfs_iflock_nowait(ip)) {
+ if (ip != free_ip)
xfs_iunlock(ip, XFS_ILOCK_EXCL);
- }
+ continue;
}
- read_unlock(&pag->pag_ici_lock);
- }
-
- bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
- mp->m_bsize * blks_per_cluster,
- XBF_LOCK);
- pre_flushed = 0;
- lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
- while (lip) {
- if (lip->li_type == XFS_LI_INODE) {
- iip = (xfs_inode_log_item_t *)lip;
- ASSERT(iip->ili_logged == 1);
- lip->li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*)) xfs_istale_done;
- xfs_trans_ail_copy_lsn(mp->m_ail,
- &iip->ili_flush_lsn,
- &iip->ili_item.li_lsn);
- xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
- pre_flushed++;
+ xfs_iflags_set(ip, XFS_ISTALE);
+ if (xfs_inode_clean(ip)) {
+ ASSERT(ip != free_ip);
+ xfs_ifunlock(ip);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ continue;
}
- lip = lip->li_bio_list;
- }
- for (i = 0; i < found; i++) {
- ip = ip_found[i];
iip = ip->i_itemp;
-
if (!iip) {
+ /* inode with unlogged changes only */
+ ASSERT(ip != free_ip);
ip->i_update_core = 0;
xfs_ifunlock(ip);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
continue;
}
+ found++;
iip->ili_last_fields = iip->ili_format.ilf_fields;
iip->ili_format.ilf_fields = 0;
@@ -2075,20 +2042,18 @@ xfs_ifree_cluster(
xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
&iip->ili_item.li_lsn);
- xfs_buf_attach_iodone(bp,
- (void(*)(xfs_buf_t*,xfs_log_item_t*))
- xfs_istale_done, (xfs_log_item_t *)iip);
- if (ip != free_ip) {
+ xfs_buf_attach_iodone(bp, xfs_istale_done,
+ &iip->ili_item);
+
+ if (ip != free_ip)
xfs_iunlock(ip, XFS_ILOCK_EXCL);
- }
}
- if (found || pre_flushed)
+ if (found)
xfs_trans_stale_inode_buf(tp, bp);
xfs_trans_binval(tp, bp);
}
- kmem_free(ip_found);
xfs_perag_put(pag);
}
@@ -2224,7 +2189,7 @@ xfs_iroot_realloc(
*/
if (ifp->if_broot_bytes == 0) {
new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(rec_diff);
- ifp->if_broot = kmem_alloc(new_size, KM_SLEEP);
+ ifp->if_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS);
ifp->if_broot_bytes = (int)new_size;
return;
}
@@ -2240,7 +2205,7 @@ xfs_iroot_realloc(
new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max);
ifp->if_broot = kmem_realloc(ifp->if_broot, new_size,
(size_t)XFS_BMAP_BROOT_SPACE_CALC(cur_max), /* old size */
- KM_SLEEP);
+ KM_SLEEP | KM_NOFS);
op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
ifp->if_broot_bytes);
np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
@@ -2266,7 +2231,7 @@ xfs_iroot_realloc(
else
new_size = 0;
if (new_size > 0) {
- new_broot = kmem_alloc(new_size, KM_SLEEP);
+ new_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS);
/*
* First copy over the btree block header.
*/
@@ -2370,7 +2335,8 @@ xfs_idata_realloc(
real_size = roundup(new_size, 4);
if (ifp->if_u1.if_data == NULL) {
ASSERT(ifp->if_real_bytes == 0);
- ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP);
+ ifp->if_u1.if_data = kmem_alloc(real_size,
+ KM_SLEEP | KM_NOFS);
} else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
/*
* Only do the realloc if the underlying size
@@ -2381,11 +2347,12 @@ xfs_idata_realloc(
kmem_realloc(ifp->if_u1.if_data,
real_size,
ifp->if_real_bytes,
- KM_SLEEP);
+ KM_SLEEP | KM_NOFS);
}
} else {
ASSERT(ifp->if_real_bytes == 0);
- ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP);
+ ifp->if_u1.if_data = kmem_alloc(real_size,
+ KM_SLEEP | KM_NOFS);
memcpy(ifp->if_u1.if_data, ifp->if_u2.if_inline_data,
ifp->if_bytes);
}
@@ -2649,8 +2616,6 @@ xfs_iflush_cluster(
int i;
pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
- ASSERT(pag->pagi_inodeok);
- ASSERT(pag->pag_ici_init);
inodes_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog;
ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *);
@@ -2754,7 +2719,6 @@ cluster_corrupt_out:
* mark it as stale and brelse.
*/
if (XFS_BUF_IODONE_FUNC(bp)) {
- XFS_BUF_CLR_BDSTRAT_FUNC(bp);
XFS_BUF_UNDONE(bp);
XFS_BUF_STALE(bp);
XFS_BUF_ERROR(bp,EIO);
@@ -3092,8 +3056,7 @@ xfs_iflush_int(
* and unlock the inode's flush lock when the inode is
* completely written to disk.
*/
- xfs_buf_attach_iodone(bp, (void(*)(xfs_buf_t*,xfs_log_item_t*))
- xfs_iflush_done, (xfs_log_item_t *)iip);
+ xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item);
ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
ASSERT(XFS_BUF_IODONE_FUNC(bp) != NULL);
@@ -3537,13 +3500,11 @@ xfs_iext_remove_indirect(
xfs_extnum_t ext_diff; /* extents to remove in current list */
xfs_extnum_t nex1; /* number of extents before idx */
xfs_extnum_t nex2; /* extents after idx + count */
- int nlists; /* entries in indirection array */
int page_idx = idx; /* index in target extent list */
ASSERT(ifp->if_flags & XFS_IFEXTIREC);
erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0);
ASSERT(erp != NULL);
- nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
nex1 = page_idx;
ext_cnt = count;
while (ext_cnt) {
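
A recurring detail across this file's hunks is KM_NOFS being added to allocations made with inode locks held: a plain KM_SLEEP allocation may enter memory reclaim, and reclaim re-entering the filesystem could deadlock on the locks already held. A minimal sketch of the pattern:

    /* Sketch: a fork-data allocation that must not recurse into the fs. */
    static void *example_nofs_alloc(size_t size)
    {
            /*
             * KM_SLEEP may block waiting for memory; KM_NOFS keeps that
             * wait from recursing into filesystem reclaim while we hold
             * the inode locked.
             */
            return kmem_alloc(size, KM_SLEEP | KM_NOFS);
    }
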
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 9965e40a4615..0898c5417d12 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -442,9 +442,7 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
* xfs_iget.c prototypes.
*/
int xfs_iget(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
- uint, uint, xfs_inode_t **, xfs_daddr_t);
-void xfs_iput(xfs_inode_t *, uint);
-void xfs_iput_new(xfs_inode_t *, uint);
+ uint, uint, xfs_inode_t **);
void xfs_ilock(xfs_inode_t *, uint);
int xfs_ilock_nowait(xfs_inode_t *, uint);
void xfs_iunlock(xfs_inode_t *, uint);
@@ -452,7 +450,7 @@ void xfs_ilock_demote(xfs_inode_t *, uint);
int xfs_isilocked(xfs_inode_t *, uint);
uint xfs_ilock_map_shared(xfs_inode_t *);
void xfs_iunlock_map_shared(xfs_inode_t *, uint);
-void xfs_ireclaim(xfs_inode_t *);
+void xfs_inode_free(struct xfs_inode *ip);
/*
* xfs_inode.c prototypes.
@@ -500,7 +498,7 @@ do { \
* Flags for xfs_iget()
*/
#define XFS_IGET_CREATE 0x1
-#define XFS_IGET_BULKSTAT 0x2
+#define XFS_IGET_UNTRUSTED 0x2
int xfs_inotobp(struct xfs_mount *, struct xfs_trans *,
xfs_ino_t, struct xfs_dinode **,
@@ -509,7 +507,7 @@ int xfs_itobp(struct xfs_mount *, struct xfs_trans *,
struct xfs_inode *, struct xfs_dinode **,
struct xfs_buf **, uint);
int xfs_iread(struct xfs_mount *, struct xfs_trans *,
- struct xfs_inode *, xfs_daddr_t, uint);
+ struct xfs_inode *, uint);
void xfs_dinode_to_disk(struct xfs_dinode *,
struct xfs_icdinode *);
void xfs_idestroy_fork(struct xfs_inode *, int);
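
With the bno parameter gone from xfs_iget() and XFS_IGET_BULKSTAT renamed, callers holding an inode number that came from userspace (bulkstat, filehandles) pass XFS_IGET_UNTRUSTED instead, which forces the btree validation in xfs_imap(). A sketch of such a caller under the new prototype:

    /* Sketch: look up an inode number supplied by userspace. */
    static int example_untrusted_iget(struct xfs_mount *mp, xfs_ino_t ino,
                                      struct xfs_inode **ipp)
    {
            /*
             * XFS_IGET_UNTRUSTED makes xfs_imap() verify the number
             * against the inode btree, so a bogus value fails with
             * EINVAL rather than mapping a stale or unrelated buffer.
             */
            return xfs_iget(mp, NULL, ino, XFS_IGET_UNTRUSTED, 0, ipp);
    }
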
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index cf8249a60004..fe00777e2796 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -22,30 +22,26 @@
#include "xfs_log.h"
#include "xfs_inum.h"
#include "xfs_trans.h"
-#include "xfs_buf_item.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
#include "xfs_mount.h"
#include "xfs_trans_priv.h"
#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_inode_item.h"
-#include "xfs_btree.h"
-#include "xfs_ialloc.h"
-#include "xfs_rw.h"
#include "xfs_error.h"
#include "xfs_trace.h"
kmem_zone_t *xfs_ili_zone; /* inode log item zone */
+static inline struct xfs_inode_log_item *INODE_ITEM(struct xfs_log_item *lip)
+{
+ return container_of(lip, struct xfs_inode_log_item, ili_item);
+}
+
+
/*
* This returns the number of iovecs needed to log the given inode item.
*
@@ -55,13 +51,11 @@ kmem_zone_t *xfs_ili_zone; /* inode log item zone */
*/
STATIC uint
xfs_inode_item_size(
- xfs_inode_log_item_t *iip)
+ struct xfs_log_item *lip)
{
- uint nvecs;
- xfs_inode_t *ip;
-
- ip = iip->ili_inode;
- nvecs = 2;
+ struct xfs_inode_log_item *iip = INODE_ITEM(lip);
+ struct xfs_inode *ip = iip->ili_inode;
+ uint nvecs = 2;
/*
* Only log the data/extents/b-tree root if there is something
@@ -212,21 +206,17 @@ xfs_inode_item_size(
*/
STATIC void
xfs_inode_item_format(
- xfs_inode_log_item_t *iip,
- xfs_log_iovec_t *log_vector)
+ struct xfs_log_item *lip,
+ struct xfs_log_iovec *vecp)
{
+ struct xfs_inode_log_item *iip = INODE_ITEM(lip);
+ struct xfs_inode *ip = iip->ili_inode;
uint nvecs;
- xfs_log_iovec_t *vecp;
- xfs_inode_t *ip;
size_t data_bytes;
xfs_bmbt_rec_t *ext_buffer;
- int nrecs;
xfs_mount_t *mp;
- ip = iip->ili_inode;
- vecp = log_vector;
-
- vecp->i_addr = (xfs_caddr_t)&iip->ili_format;
+ vecp->i_addr = &iip->ili_format;
vecp->i_len = sizeof(xfs_inode_log_format_t);
vecp->i_type = XLOG_REG_TYPE_IFORMAT;
vecp++;
@@ -277,7 +267,7 @@ xfs_inode_item_format(
*/
xfs_synchronize_times(ip);
- vecp->i_addr = (xfs_caddr_t)&ip->i_d;
+ vecp->i_addr = &ip->i_d;
vecp->i_len = sizeof(struct xfs_icdinode);
vecp->i_type = XLOG_REG_TYPE_ICORE;
vecp++;
@@ -323,18 +313,17 @@ xfs_inode_item_format(
ASSERT(ip->i_df.if_u1.if_extents != NULL);
ASSERT(ip->i_d.di_nextents > 0);
ASSERT(iip->ili_extents_buf == NULL);
- nrecs = ip->i_df.if_bytes /
- (uint)sizeof(xfs_bmbt_rec_t);
- ASSERT(nrecs > 0);
+ ASSERT((ip->i_df.if_bytes /
+ (uint)sizeof(xfs_bmbt_rec_t)) > 0);
#ifdef XFS_NATIVE_HOST
- if (nrecs == ip->i_d.di_nextents) {
+ if (ip->i_d.di_nextents == ip->i_df.if_bytes /
+ (uint)sizeof(xfs_bmbt_rec_t)) {
/*
* There are no delayed allocation
* extents, so just point to the
* real extents array.
*/
- vecp->i_addr =
- (char *)(ip->i_df.if_u1.if_extents);
+ vecp->i_addr = ip->i_df.if_u1.if_extents;
vecp->i_len = ip->i_df.if_bytes;
vecp->i_type = XLOG_REG_TYPE_IEXT;
} else
@@ -352,7 +341,7 @@ xfs_inode_item_format(
ext_buffer = kmem_alloc(ip->i_df.if_bytes,
KM_SLEEP);
iip->ili_extents_buf = ext_buffer;
- vecp->i_addr = (xfs_caddr_t)ext_buffer;
+ vecp->i_addr = ext_buffer;
vecp->i_len = xfs_iextents_copy(ip, ext_buffer,
XFS_DATA_FORK);
vecp->i_type = XLOG_REG_TYPE_IEXT;
@@ -371,7 +360,7 @@ xfs_inode_item_format(
if (iip->ili_format.ilf_fields & XFS_ILOG_DBROOT) {
ASSERT(ip->i_df.if_broot_bytes > 0);
ASSERT(ip->i_df.if_broot != NULL);
- vecp->i_addr = (xfs_caddr_t)ip->i_df.if_broot;
+ vecp->i_addr = ip->i_df.if_broot;
vecp->i_len = ip->i_df.if_broot_bytes;
vecp->i_type = XLOG_REG_TYPE_IBROOT;
vecp++;
@@ -389,7 +378,7 @@ xfs_inode_item_format(
ASSERT(ip->i_df.if_u1.if_data != NULL);
ASSERT(ip->i_d.di_size > 0);
- vecp->i_addr = (xfs_caddr_t)ip->i_df.if_u1.if_data;
+ vecp->i_addr = ip->i_df.if_u1.if_data;
/*
* Round i_bytes up to a word boundary.
* The underlying memory is guaranteed to
@@ -437,7 +426,7 @@ xfs_inode_item_format(
* Assert that no attribute-related log flags are set.
*/
if (!XFS_IFORK_Q(ip)) {
- ASSERT(nvecs == iip->ili_item.li_desc->lid_size);
+ ASSERT(nvecs == lip->li_desc->lid_size);
iip->ili_format.ilf_size = nvecs;
ASSERT(!(iip->ili_format.ilf_fields &
(XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT)));
@@ -449,21 +438,21 @@ xfs_inode_item_format(
ASSERT(!(iip->ili_format.ilf_fields &
(XFS_ILOG_ADATA | XFS_ILOG_ABROOT)));
if (iip->ili_format.ilf_fields & XFS_ILOG_AEXT) {
- ASSERT(ip->i_afp->if_bytes > 0);
- ASSERT(ip->i_afp->if_u1.if_extents != NULL);
- ASSERT(ip->i_d.di_anextents > 0);
#ifdef DEBUG
- nrecs = ip->i_afp->if_bytes /
+ int nrecs = ip->i_afp->if_bytes /
(uint)sizeof(xfs_bmbt_rec_t);
-#endif
ASSERT(nrecs > 0);
ASSERT(nrecs == ip->i_d.di_anextents);
+ ASSERT(ip->i_afp->if_bytes > 0);
+ ASSERT(ip->i_afp->if_u1.if_extents != NULL);
+ ASSERT(ip->i_d.di_anextents > 0);
+#endif
#ifdef XFS_NATIVE_HOST
/*
* There are not delayed allocation extents
* for attributes, so just point at the array.
*/
- vecp->i_addr = (char *)(ip->i_afp->if_u1.if_extents);
+ vecp->i_addr = ip->i_afp->if_u1.if_extents;
vecp->i_len = ip->i_afp->if_bytes;
#else
ASSERT(iip->ili_aextents_buf == NULL);
@@ -473,7 +462,7 @@ xfs_inode_item_format(
ext_buffer = kmem_alloc(ip->i_afp->if_bytes,
KM_SLEEP);
iip->ili_aextents_buf = ext_buffer;
- vecp->i_addr = (xfs_caddr_t)ext_buffer;
+ vecp->i_addr = ext_buffer;
vecp->i_len = xfs_iextents_copy(ip, ext_buffer,
XFS_ATTR_FORK);
#endif
@@ -490,7 +479,7 @@ xfs_inode_item_format(
if (iip->ili_format.ilf_fields & XFS_ILOG_ABROOT) {
ASSERT(ip->i_afp->if_broot_bytes > 0);
ASSERT(ip->i_afp->if_broot != NULL);
- vecp->i_addr = (xfs_caddr_t)ip->i_afp->if_broot;
+ vecp->i_addr = ip->i_afp->if_broot;
vecp->i_len = ip->i_afp->if_broot_bytes;
vecp->i_type = XLOG_REG_TYPE_IATTR_BROOT;
vecp++;
@@ -506,7 +495,7 @@ xfs_inode_item_format(
ASSERT(ip->i_afp->if_bytes > 0);
ASSERT(ip->i_afp->if_u1.if_data != NULL);
- vecp->i_addr = (xfs_caddr_t)ip->i_afp->if_u1.if_data;
+ vecp->i_addr = ip->i_afp->if_u1.if_data;
/*
* Round i_bytes up to a word boundary.
* The underlying memory is guaranteed to
@@ -528,7 +517,7 @@ xfs_inode_item_format(
break;
}
- ASSERT(nvecs == iip->ili_item.li_desc->lid_size);
+ ASSERT(nvecs == lip->li_desc->lid_size);
iip->ili_format.ilf_size = nvecs;
}
@@ -539,12 +528,14 @@ xfs_inode_item_format(
*/
STATIC void
xfs_inode_item_pin(
- xfs_inode_log_item_t *iip)
+ struct xfs_log_item *lip)
{
- ASSERT(xfs_isilocked(iip->ili_inode, XFS_ILOCK_EXCL));
+ struct xfs_inode *ip = INODE_ITEM(lip)->ili_inode;
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- trace_xfs_inode_pin(iip->ili_inode, _RET_IP_);
- atomic_inc(&iip->ili_inode->i_pincount);
+ trace_xfs_inode_pin(ip, _RET_IP_);
+ atomic_inc(&ip->i_pincount);
}
@@ -554,12 +545,12 @@ xfs_inode_item_pin(
*
* Also wake up anyone in xfs_iunpin_wait() if the count goes to 0.
*/
-/* ARGSUSED */
STATIC void
xfs_inode_item_unpin(
- xfs_inode_log_item_t *iip)
+ struct xfs_log_item *lip,
+ int remove)
{
- struct xfs_inode *ip = iip->ili_inode;
+ struct xfs_inode *ip = INODE_ITEM(lip)->ili_inode;
trace_xfs_inode_unpin(ip, _RET_IP_);
ASSERT(atomic_read(&ip->i_pincount) > 0);
@@ -567,15 +558,6 @@ xfs_inode_item_unpin(
wake_up(&ip->i_ipin_wait);
}
-/* ARGSUSED */
-STATIC void
-xfs_inode_item_unpin_remove(
- xfs_inode_log_item_t *iip,
- xfs_trans_t *tp)
-{
- xfs_inode_item_unpin(iip);
-}
-
/*
* This is called to attempt to lock the inode associated with this
* inode log item, in preparation for the push routine which does the actual
@@ -591,19 +573,16 @@ xfs_inode_item_unpin_remove(
*/
STATIC uint
xfs_inode_item_trylock(
- xfs_inode_log_item_t *iip)
+ struct xfs_log_item *lip)
{
- register xfs_inode_t *ip;
-
- ip = iip->ili_inode;
+ struct xfs_inode_log_item *iip = INODE_ITEM(lip);
+ struct xfs_inode *ip = iip->ili_inode;
- if (xfs_ipincount(ip) > 0) {
+ if (xfs_ipincount(ip) > 0)
return XFS_ITEM_PINNED;
- }
- if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
+ if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))
return XFS_ITEM_LOCKED;
- }
if (!xfs_iflock_nowait(ip)) {
/*
@@ -629,7 +608,7 @@ xfs_inode_item_trylock(
if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
ASSERT(iip->ili_format.ilf_fields != 0);
ASSERT(iip->ili_logged == 0);
- ASSERT(iip->ili_item.li_flags & XFS_LI_IN_AIL);
+ ASSERT(lip->li_flags & XFS_LI_IN_AIL);
}
#endif
return XFS_ITEM_SUCCESS;
@@ -643,26 +622,18 @@ xfs_inode_item_trylock(
*/
STATIC void
xfs_inode_item_unlock(
- xfs_inode_log_item_t *iip)
+ struct xfs_log_item *lip)
{
- uint hold;
- uint iolocked;
- uint lock_flags;
- xfs_inode_t *ip;
+ struct xfs_inode_log_item *iip = INODE_ITEM(lip);
+ struct xfs_inode *ip = iip->ili_inode;
+ unsigned short lock_flags;
- ASSERT(iip != NULL);
ASSERT(iip->ili_inode->i_itemp != NULL);
ASSERT(xfs_isilocked(iip->ili_inode, XFS_ILOCK_EXCL));
- ASSERT((!(iip->ili_inode->i_itemp->ili_flags &
- XFS_ILI_IOLOCKED_EXCL)) ||
- xfs_isilocked(iip->ili_inode, XFS_IOLOCK_EXCL));
- ASSERT((!(iip->ili_inode->i_itemp->ili_flags &
- XFS_ILI_IOLOCKED_SHARED)) ||
- xfs_isilocked(iip->ili_inode, XFS_IOLOCK_SHARED));
+
/*
* Clear the transaction pointer in the inode.
*/
- ip = iip->ili_inode;
ip->i_transp = NULL;
/*
@@ -686,34 +657,11 @@ xfs_inode_item_unlock(
iip->ili_aextents_buf = NULL;
}
- /*
- * Figure out if we should unlock the inode or not.
- */
- hold = iip->ili_flags & XFS_ILI_HOLD;
-
- /*
- * Before clearing out the flags, remember whether we
- * are holding the inode's IO lock.
- */
- iolocked = iip->ili_flags & XFS_ILI_IOLOCKED_ANY;
-
- /*
- * Clear out the fields of the inode log item particular
- * to the current transaction.
- */
- iip->ili_flags = 0;
-
- /*
- * Unlock the inode if XFS_ILI_HOLD was not set.
- */
- if (!hold) {
- lock_flags = XFS_ILOCK_EXCL;
- if (iolocked & XFS_ILI_IOLOCKED_EXCL) {
- lock_flags |= XFS_IOLOCK_EXCL;
- } else if (iolocked & XFS_ILI_IOLOCKED_SHARED) {
- lock_flags |= XFS_IOLOCK_SHARED;
- }
- xfs_iput(iip->ili_inode, lock_flags);
+ lock_flags = iip->ili_lock_flags;
+ iip->ili_lock_flags = 0;
+ if (lock_flags) {
+ xfs_iunlock(iip->ili_inode, lock_flags);
+ IRELE(iip->ili_inode);
}
}
@@ -725,13 +673,12 @@ xfs_inode_item_unlock(
* is the only one that matters. Therefore, simply return the
* given lsn.
*/
-/*ARGSUSED*/
STATIC xfs_lsn_t
xfs_inode_item_committed(
- xfs_inode_log_item_t *iip,
+ struct xfs_log_item *lip,
xfs_lsn_t lsn)
{
- return (lsn);
+ return lsn;
}
/*
@@ -743,13 +690,12 @@ xfs_inode_item_committed(
*/
STATIC void
xfs_inode_item_pushbuf(
- xfs_inode_log_item_t *iip)
+ struct xfs_log_item *lip)
{
- xfs_inode_t *ip;
- xfs_mount_t *mp;
- xfs_buf_t *bp;
+ struct xfs_inode_log_item *iip = INODE_ITEM(lip);
+ struct xfs_inode *ip = iip->ili_inode;
+ struct xfs_buf *bp;
- ip = iip->ili_inode;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED));
/*
@@ -757,14 +703,13 @@ xfs_inode_item_pushbuf(
* inode was taken off the AIL. So, just get out.
*/
if (completion_done(&ip->i_flush) ||
- ((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0)) {
+ !(lip->li_flags & XFS_LI_IN_AIL)) {
xfs_iunlock(ip, XFS_ILOCK_SHARED);
return;
}
- mp = ip->i_mount;
- bp = xfs_incore(mp->m_ddev_targp, iip->ili_format.ilf_blkno,
- iip->ili_format.ilf_len, XBF_TRYLOCK);
+ bp = xfs_incore(ip->i_mount->m_ddev_targp, iip->ili_format.ilf_blkno,
+ iip->ili_format.ilf_len, XBF_TRYLOCK);
xfs_iunlock(ip, XFS_ILOCK_SHARED);
if (!bp)
@@ -772,10 +717,8 @@ xfs_inode_item_pushbuf(
if (XFS_BUF_ISDELAYWRITE(bp))
xfs_buf_delwri_promote(bp);
xfs_buf_relse(bp);
- return;
}
-
/*
* This is called to asynchronously write the inode associated with this
* inode log item out to disk. The inode will already have been locked by
@@ -783,14 +726,14 @@ xfs_inode_item_pushbuf(
*/
STATIC void
xfs_inode_item_push(
- xfs_inode_log_item_t *iip)
+ struct xfs_log_item *lip)
{
- xfs_inode_t *ip;
-
- ip = iip->ili_inode;
+ struct xfs_inode_log_item *iip = INODE_ITEM(lip);
+ struct xfs_inode *ip = iip->ili_inode;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED));
ASSERT(!completion_done(&ip->i_flush));
+
/*
* Since we were able to lock the inode's flush lock and
* we found it on the AIL, the inode must be dirty. This
@@ -813,43 +756,34 @@ xfs_inode_item_push(
*/
(void) xfs_iflush(ip, 0);
xfs_iunlock(ip, XFS_ILOCK_SHARED);
-
- return;
}
/*
* XXX rcc - this one really has to do something. Probably needs
* to stamp in a new field in the incore inode.
*/
-/* ARGSUSED */
STATIC void
xfs_inode_item_committing(
- xfs_inode_log_item_t *iip,
+ struct xfs_log_item *lip,
xfs_lsn_t lsn)
{
- iip->ili_last_lsn = lsn;
- return;
+ INODE_ITEM(lip)->ili_last_lsn = lsn;
}
/*
* This is the ops vector shared by all buf log items.
*/
static struct xfs_item_ops xfs_inode_item_ops = {
- .iop_size = (uint(*)(xfs_log_item_t*))xfs_inode_item_size,
- .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
- xfs_inode_item_format,
- .iop_pin = (void(*)(xfs_log_item_t*))xfs_inode_item_pin,
- .iop_unpin = (void(*)(xfs_log_item_t*))xfs_inode_item_unpin,
- .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*))
- xfs_inode_item_unpin_remove,
- .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_inode_item_trylock,
- .iop_unlock = (void(*)(xfs_log_item_t*))xfs_inode_item_unlock,
- .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
- xfs_inode_item_committed,
- .iop_push = (void(*)(xfs_log_item_t*))xfs_inode_item_push,
- .iop_pushbuf = (void(*)(xfs_log_item_t*))xfs_inode_item_pushbuf,
- .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
- xfs_inode_item_committing
+ .iop_size = xfs_inode_item_size,
+ .iop_format = xfs_inode_item_format,
+ .iop_pin = xfs_inode_item_pin,
+ .iop_unpin = xfs_inode_item_unpin,
+ .iop_trylock = xfs_inode_item_trylock,
+ .iop_unlock = xfs_inode_item_unlock,
+ .iop_committed = xfs_inode_item_committed,
+ .iop_push = xfs_inode_item_push,
+ .iop_pushbuf = xfs_inode_item_pushbuf,
+ .iop_committing = xfs_inode_item_committing
};
@@ -858,10 +792,10 @@ static struct xfs_item_ops xfs_inode_item_ops = {
*/
void
xfs_inode_item_init(
- xfs_inode_t *ip,
- xfs_mount_t *mp)
+ struct xfs_inode *ip,
+ struct xfs_mount *mp)
{
- xfs_inode_log_item_t *iip;
+ struct xfs_inode_log_item *iip;
ASSERT(ip->i_itemp == NULL);
iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, KM_SLEEP);
@@ -899,14 +833,14 @@ xfs_inode_item_destroy(
* from the AIL if it has not been re-logged, and unlocking the inode's
* flush lock.
*/
-/*ARGSUSED*/
void
xfs_iflush_done(
- xfs_buf_t *bp,
- xfs_inode_log_item_t *iip)
+ struct xfs_buf *bp,
+ struct xfs_log_item *lip)
{
+ struct xfs_inode_log_item *iip = INODE_ITEM(lip);
xfs_inode_t *ip = iip->ili_inode;
- struct xfs_ail *ailp = iip->ili_item.li_ailp;
+ struct xfs_ail *ailp = lip->li_ailp;
/*
* We only want to pull the item from the AIL if it is
@@ -917,12 +851,11 @@ xfs_iflush_done(
* the lock since it's cheaper, and then we recheck while
* holding the lock before removing the inode from the AIL.
*/
- if (iip->ili_logged &&
- (iip->ili_item.li_lsn == iip->ili_flush_lsn)) {
+ if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) {
spin_lock(&ailp->xa_lock);
- if (iip->ili_item.li_lsn == iip->ili_flush_lsn) {
+ if (lip->li_lsn == iip->ili_flush_lsn) {
/* xfs_trans_ail_delete() drops the AIL lock. */
- xfs_trans_ail_delete(ailp, (xfs_log_item_t*)iip);
+ xfs_trans_ail_delete(ailp, lip);
} else {
spin_unlock(&ailp->xa_lock);
}
@@ -940,8 +873,6 @@ xfs_iflush_done(
* Release the inode's flush lock since we're done with it.
*/
xfs_ifunlock(ip);
-
- return;
}
/*
@@ -957,10 +888,8 @@ xfs_iflush_abort(
xfs_inode_t *ip)
{
xfs_inode_log_item_t *iip = ip->i_itemp;
- xfs_mount_t *mp;
iip = ip->i_itemp;
- mp = ip->i_mount;
if (iip) {
struct xfs_ail *ailp = iip->ili_item.li_ailp;
if (iip->ili_item.li_flags & XFS_LI_IN_AIL) {
@@ -991,10 +920,10 @@ xfs_iflush_abort(
void
xfs_istale_done(
- xfs_buf_t *bp,
- xfs_inode_log_item_t *iip)
+ struct xfs_buf *bp,
+ struct xfs_log_item *lip)
{
- xfs_iflush_abort(iip->ili_inode);
+ xfs_iflush_abort(INODE_ITEM(lip)->ili_inode);
}
/*
@@ -1007,9 +936,8 @@ xfs_inode_item_format_convert(
xfs_inode_log_format_t *in_f)
{
if (buf->i_len == sizeof(xfs_inode_log_format_32_t)) {
- xfs_inode_log_format_32_t *in_f32;
+ xfs_inode_log_format_32_t *in_f32 = buf->i_addr;
- in_f32 = (xfs_inode_log_format_32_t *)buf->i_addr;
in_f->ilf_type = in_f32->ilf_type;
in_f->ilf_size = in_f32->ilf_size;
in_f->ilf_fields = in_f32->ilf_fields;
@@ -1025,9 +953,8 @@ xfs_inode_item_format_convert(
in_f->ilf_boffset = in_f32->ilf_boffset;
return 0;
} else if (buf->i_len == sizeof(xfs_inode_log_format_64_t)){
- xfs_inode_log_format_64_t *in_f64;
+ xfs_inode_log_format_64_t *in_f64 = buf->i_addr;
- in_f64 = (xfs_inode_log_format_64_t *)buf->i_addr;
in_f->ilf_type = in_f64->ilf_type;
in_f->ilf_size = in_f64->ilf_size;
in_f->ilf_fields = in_f64->ilf_fields;
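
The XFS_ILI_HOLD/IOLOCKED flag set removed in xfs_inode_item.h below is replaced by a single rule driven by ili_lock_flags, condensed here as a sketch (field names as in the hunks above): if the transaction recorded lock flags when the inode was joined, the item unlock drops that lock and the reference; otherwise the inode stays locked for the caller.

    /* Sketch of the unlock rule that replaces XFS_ILI_HOLD. */
    static void example_item_unlock(struct xfs_inode_log_item *iip)
    {
            unsigned short lock_flags = iip->ili_lock_flags;

            iip->ili_lock_flags = 0;
            if (lock_flags) {
                    /* the transaction owned the lock: drop it + the ref */
                    xfs_iunlock(iip->ili_inode, lock_flags);
                    IRELE(iip->ili_inode);
            }
            /* lock_flags == 0: the joining caller keeps its lock */
    }
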
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index 9a467958ecdd..d3dee61e6d91 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -103,12 +103,6 @@ typedef struct xfs_inode_log_format_64 {
XFS_ILOG_ADATA | XFS_ILOG_AEXT | \
XFS_ILOG_ABROOT)
-#define XFS_ILI_HOLD 0x1
-#define XFS_ILI_IOLOCKED_EXCL 0x2
-#define XFS_ILI_IOLOCKED_SHARED 0x4
-
-#define XFS_ILI_IOLOCKED_ANY (XFS_ILI_IOLOCKED_EXCL | XFS_ILI_IOLOCKED_SHARED)
-
static inline int xfs_ilog_fbroot(int w)
{
return (w == XFS_DATA_FORK ? XFS_ILOG_DBROOT : XFS_ILOG_ABROOT);
@@ -137,7 +131,7 @@ typedef struct xfs_inode_log_item {
struct xfs_inode *ili_inode; /* inode ptr */
xfs_lsn_t ili_flush_lsn; /* lsn at last flush */
xfs_lsn_t ili_last_lsn; /* lsn at last transaction */
- unsigned short ili_flags; /* misc flags */
+ unsigned short ili_lock_flags; /* lock flags */
unsigned short ili_logged; /* flushed logged data */
unsigned int ili_last_fields; /* fields when flushed */
struct xfs_bmbt_rec *ili_extents_buf; /* array of logged
@@ -161,8 +155,8 @@ static inline int xfs_inode_clean(xfs_inode_t *ip)
extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *);
extern void xfs_inode_item_destroy(struct xfs_inode *);
-extern void xfs_iflush_done(struct xfs_buf *, xfs_inode_log_item_t *);
-extern void xfs_istale_done(struct xfs_buf *, xfs_inode_log_item_t *);
+extern void xfs_iflush_done(struct xfs_buf *, struct xfs_log_item *);
+extern void xfs_istale_done(struct xfs_buf *, struct xfs_log_item *);
extern void xfs_iflush_abort(struct xfs_inode *);
extern int xfs_inode_item_format_convert(xfs_log_iovec_t *,
xfs_inode_log_format_t *);
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index ef14943829da..20576146369f 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -23,19 +23,14 @@
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_dir2.h"
#include "xfs_alloc.h"
-#include "xfs_dmapi.h"
#include "xfs_quota.h"
#include "xfs_mount.h"
#include "xfs_bmap_btree.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
-#include "xfs_ialloc.h"
#include "xfs_btree.h"
#include "xfs_bmap.h"
#include "xfs_rtalloc.h"
@@ -123,7 +118,7 @@ xfs_iomap(
error = xfs_bmapi(NULL, ip, offset_fsb,
(xfs_filblks_t)(end_fsb - offset_fsb),
bmapi_flags, NULL, 0, imap,
- nimaps, NULL, NULL);
+ nimaps, NULL);
if (error)
goto out;
@@ -138,7 +133,7 @@ xfs_iomap(
break;
}
- if (flags & (BMAPI_DIRECT|BMAPI_MMAP)) {
+ if (flags & BMAPI_DIRECT) {
error = xfs_iomap_write_direct(ip, offset, count, flags,
imap, nimaps);
} else {
@@ -247,7 +242,7 @@ xfs_iomap_write_direct(
xfs_off_t offset,
size_t count,
int flags,
- xfs_bmbt_irec_t *ret_imap,
+ xfs_bmbt_irec_t *imap,
int *nmaps)
{
xfs_mount_t *mp = ip->i_mount;
@@ -261,7 +256,6 @@ xfs_iomap_write_direct(
int quota_flag;
int rt;
xfs_trans_t *tp;
- xfs_bmbt_irec_t imap;
xfs_bmap_free_t free_list;
uint qblocks, resblks, resrtextents;
int committed;
@@ -285,10 +279,10 @@ xfs_iomap_write_direct(
if (error)
goto error_out;
} else {
- if (*nmaps && (ret_imap->br_startblock == HOLESTARTBLOCK))
+ if (*nmaps && (imap->br_startblock == HOLESTARTBLOCK))
last_fsb = MIN(last_fsb, (xfs_fileoff_t)
- ret_imap->br_blockcount +
- ret_imap->br_startoff);
+ imap->br_blockcount +
+ imap->br_startoff);
}
count_fsb = last_fsb - offset_fsb;
ASSERT(count_fsb > 0);
@@ -334,20 +328,22 @@ xfs_iomap_write_direct(
if (error)
goto error1;
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
- xfs_trans_ihold(tp, ip);
+ xfs_trans_ijoin(tp, ip);
bmapi_flag = XFS_BMAPI_WRITE;
if ((flags & BMAPI_DIRECT) && (offset < ip->i_size || extsz))
bmapi_flag |= XFS_BMAPI_PREALLOC;
/*
- * Issue the xfs_bmapi() call to allocate the blocks
+ * Issue the xfs_bmapi() call to allocate the blocks.
+ *
+ * From this point onwards we overwrite the imap pointer that the
+ * caller gave to us.
*/
xfs_bmap_init(&free_list, &firstfsb);
nimaps = 1;
error = xfs_bmapi(tp, ip, offset_fsb, count_fsb, bmapi_flag,
- &firstfsb, 0, &imap, &nimaps, &free_list, NULL);
+ &firstfsb, 0, imap, &nimaps, &free_list);
if (error)
goto error0;
@@ -369,12 +365,11 @@ xfs_iomap_write_direct(
goto error_out;
}
- if (!(imap.br_startblock || XFS_IS_REALTIME_INODE(ip))) {
- error = xfs_cmn_err_fsblock_zero(ip, &imap);
+ if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) {
+ error = xfs_cmn_err_fsblock_zero(ip, imap);
goto error_out;
}
- *ret_imap = imap;
*nmaps = 1;
return 0;
@@ -425,7 +420,7 @@ xfs_iomap_eof_want_preallocate(
imaps = nimaps;
firstblock = NULLFSBLOCK;
error = xfs_bmapi(NULL, ip, start_fsb, count_fsb, 0,
- &firstblock, 0, imap, &imaps, NULL, NULL);
+ &firstblock, 0, imap, &imaps, NULL);
if (error)
return error;
for (n = 0; n < imaps; n++) {
@@ -500,7 +495,7 @@ retry:
(xfs_filblks_t)(last_fsb - offset_fsb),
XFS_BMAPI_DELAY | XFS_BMAPI_WRITE |
XFS_BMAPI_ENTIRE, &firstblock, 1, imap,
- &nimaps, NULL, NULL);
+ &nimaps, NULL);
if (error && (error != ENOSPC))
return XFS_ERROR(error);
@@ -548,7 +543,7 @@ xfs_iomap_write_allocate(
xfs_inode_t *ip,
xfs_off_t offset,
size_t count,
- xfs_bmbt_irec_t *map,
+ xfs_bmbt_irec_t *imap,
int *retmap)
{
xfs_mount_t *mp = ip->i_mount;
@@ -557,7 +552,6 @@ xfs_iomap_write_allocate(
xfs_fsblock_t first_block;
xfs_bmap_free_t free_list;
xfs_filblks_t count_fsb;
- xfs_bmbt_irec_t imap;
xfs_trans_t *tp;
int nimaps, committed;
int error = 0;
@@ -573,8 +567,8 @@ xfs_iomap_write_allocate(
return XFS_ERROR(error);
offset_fsb = XFS_B_TO_FSBT(mp, offset);
- count_fsb = map->br_blockcount;
- map_start_fsb = map->br_startoff;
+ count_fsb = imap->br_blockcount;
+ map_start_fsb = imap->br_startoff;
XFS_STATS_ADD(xs_xstrat_bytes, XFS_FSB_TO_B(mp, count_fsb));
@@ -602,8 +596,7 @@ xfs_iomap_write_allocate(
return XFS_ERROR(error);
}
xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
- xfs_trans_ihold(tp, ip);
+ xfs_trans_ijoin(tp, ip);
xfs_bmap_init(&free_list, &first_block);
@@ -654,10 +647,15 @@ xfs_iomap_write_allocate(
}
}
- /* Go get the actual blocks */
+ /*
+ * Go get the actual blocks.
+ *
+ * From this point onwards we overwrite the imap
+ * pointer that the caller gave to us.
+ */
error = xfs_bmapi(tp, ip, map_start_fsb, count_fsb,
XFS_BMAPI_WRITE, &first_block, 1,
- &imap, &nimaps, &free_list, NULL);
+ imap, &nimaps, &free_list);
if (error)
goto trans_cancel;
@@ -676,13 +674,12 @@ xfs_iomap_write_allocate(
* See if we were able to allocate an extent that
* covers at least part of the callers request
*/
- if (!(imap.br_startblock || XFS_IS_REALTIME_INODE(ip)))
- return xfs_cmn_err_fsblock_zero(ip, &imap);
+ if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip)))
+ return xfs_cmn_err_fsblock_zero(ip, imap);
- if ((offset_fsb >= imap.br_startoff) &&
- (offset_fsb < (imap.br_startoff +
- imap.br_blockcount))) {
- *map = imap;
+ if ((offset_fsb >= imap->br_startoff) &&
+ (offset_fsb < (imap->br_startoff +
+ imap->br_blockcount))) {
*retmap = 1;
XFS_STATS_INC(xs_xstrat_quick);
return 0;
@@ -692,8 +689,8 @@ xfs_iomap_write_allocate(
* So far we have not mapped the requested part of the
* file, just surrounding data, try again.
*/
- count_fsb -= imap.br_blockcount;
- map_start_fsb = imap.br_startoff + imap.br_blockcount;
+ count_fsb -= imap->br_blockcount;
+ map_start_fsb = imap->br_startoff + imap->br_blockcount;
}
trans_cancel:
@@ -766,8 +763,7 @@ xfs_iomap_write_unwritten(
}
xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
- xfs_trans_ihold(tp, ip);
+ xfs_trans_ijoin(tp, ip);
/*
* Modify the unwritten extent state of the buffer.
@@ -776,7 +772,7 @@ xfs_iomap_write_unwritten(
nimaps = 1;
error = xfs_bmapi(tp, ip, offset_fsb, count_fsb,
XFS_BMAPI_WRITE|XFS_BMAPI_CONVERT, &firstfsb,
- 1, &imap, &nimaps, &free_list, NULL);
+ 1, &imap, &nimaps, &free_list);
if (error)
goto error_on_bmapi_transaction;
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 81ac4afd45b3..7748a430f50d 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -18,17 +18,16 @@
#ifndef __XFS_IOMAP_H__
#define __XFS_IOMAP_H__
-typedef enum {
- /* base extent manipulation calls */
- BMAPI_READ = (1 << 0), /* read extents */
- BMAPI_WRITE = (1 << 1), /* create extents */
- BMAPI_ALLOCATE = (1 << 2), /* delayed allocate to real extents */
- /* modifiers */
- BMAPI_IGNSTATE = (1 << 4), /* ignore unwritten state on read */
- BMAPI_DIRECT = (1 << 5), /* direct instead of buffered write */
- BMAPI_MMAP = (1 << 6), /* allocate for mmap write */
- BMAPI_TRYLOCK = (1 << 7), /* non-blocking request */
-} bmapi_flags_t;
+/* base extent manipulation calls */
+#define BMAPI_READ (1 << 0) /* read extents */
+#define BMAPI_WRITE (1 << 1) /* create extents */
+#define BMAPI_ALLOCATE (1 << 2) /* delayed allocate to real extents */
+
+/* modifiers */
+#define BMAPI_IGNSTATE (1 << 4) /* ignore unwritten state on read */
+#define BMAPI_DIRECT (1 << 5) /* direct instead of buffered write */
+#define BMAPI_MMAP (1 << 6) /* allocate for mmap write */
+#define BMAPI_TRYLOCK (1 << 7) /* non-blocking request */
#define BMAPI_FLAGS \
{ BMAPI_READ, "READ" }, \
@@ -36,7 +35,6 @@ typedef enum {
{ BMAPI_ALLOCATE, "ALLOCATE" }, \
{ BMAPI_IGNSTATE, "IGNSTATE" }, \
{ BMAPI_DIRECT, "DIRECT" }, \
- { BMAPI_MMAP, "MMAP" }, \
{ BMAPI_TRYLOCK, "TRYLOCK" }
struct xfs_inode;
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index b1b801e4a28e..7e3626e5925c 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -24,20 +24,17 @@
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
#include "xfs_mount.h"
#include "xfs_bmap_btree.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_ialloc.h"
#include "xfs_itable.h"
#include "xfs_error.h"
#include "xfs_btree.h"
+#include "xfs_trace.h"
STATIC int
xfs_internal_inum(
@@ -49,24 +46,40 @@ xfs_internal_inum(
(ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino)));
}
-STATIC int
-xfs_bulkstat_one_iget(
- xfs_mount_t *mp, /* mount point for filesystem */
- xfs_ino_t ino, /* inode number to get data for */
- xfs_daddr_t bno, /* starting bno of inode cluster */
- xfs_bstat_t *buf, /* return buffer */
- int *stat) /* BULKSTAT_RV_... */
+/*
+ * Return stat information for one inode.
+ * Return 0 if ok, else errno.
+ */
+int
+xfs_bulkstat_one_int(
+ struct xfs_mount *mp, /* mount point for filesystem */
+ xfs_ino_t ino, /* inode to get data for */
+ void __user *buffer, /* buffer to place output in */
+ int ubsize, /* size of buffer */
+ bulkstat_one_fmt_pf formatter, /* formatter, copy to user */
+ int *ubused, /* bytes used by me */
+ int *stat) /* BULKSTAT_RV_... */
{
- xfs_icdinode_t *dic; /* dinode core info pointer */
- xfs_inode_t *ip; /* incore inode pointer */
- struct inode *inode;
- int error;
+ struct xfs_icdinode *dic; /* dinode core info pointer */
+ struct xfs_inode *ip; /* incore inode pointer */
+ struct inode *inode;
+ struct xfs_bstat *buf; /* return buffer */
+ int error = 0; /* error value */
+
+ *stat = BULKSTAT_RV_NOTHING;
+
+ if (!buffer || xfs_internal_inum(mp, ino))
+ return XFS_ERROR(EINVAL);
+
+ buf = kmem_alloc(sizeof(*buf), KM_SLEEP | KM_MAYFAIL);
+ if (!buf)
+ return XFS_ERROR(ENOMEM);
error = xfs_iget(mp, NULL, ino,
- XFS_IGET_BULKSTAT, XFS_ILOCK_SHARED, &ip, bno);
+ XFS_IGET_UNTRUSTED, XFS_ILOCK_SHARED, &ip);
if (error) {
*stat = BULKSTAT_RV_NOTHING;
- return error;
+ goto out_free;
}
ASSERT(ip != NULL);
@@ -127,77 +140,17 @@ xfs_bulkstat_one_iget(
buf->bs_blocks = dic->di_nblocks + ip->i_delayed_blks;
break;
}
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ IRELE(ip);
- xfs_iput(ip, XFS_ILOCK_SHARED);
- return error;
-}
-
-STATIC void
-xfs_bulkstat_one_dinode(
- xfs_mount_t *mp, /* mount point for filesystem */
- xfs_ino_t ino, /* inode number to get data for */
- xfs_dinode_t *dic, /* dinode inode pointer */
- xfs_bstat_t *buf) /* return buffer */
-{
- /*
- * The inode format changed when we moved the link count and
- * made it 32 bits long. If this is an old format inode,
- * convert it in memory to look like a new one. If it gets
- * flushed to disk we will convert back before flushing or
- * logging it. We zero out the new projid field and the old link
- * count field. We'll handle clearing the pad field (the remains
- * of the old uuid field) when we actually convert the inode to
- * the new format. We don't change the version number so that we
- * can distinguish this from a real new format inode.
- */
- if (dic->di_version == 1) {
- buf->bs_nlink = be16_to_cpu(dic->di_onlink);
- buf->bs_projid = 0;
- } else {
- buf->bs_nlink = be32_to_cpu(dic->di_nlink);
- buf->bs_projid = be16_to_cpu(dic->di_projid);
- }
+ error = formatter(buffer, ubsize, ubused, buf);
- buf->bs_ino = ino;
- buf->bs_mode = be16_to_cpu(dic->di_mode);
- buf->bs_uid = be32_to_cpu(dic->di_uid);
- buf->bs_gid = be32_to_cpu(dic->di_gid);
- buf->bs_size = be64_to_cpu(dic->di_size);
- buf->bs_atime.tv_sec = be32_to_cpu(dic->di_atime.t_sec);
- buf->bs_atime.tv_nsec = be32_to_cpu(dic->di_atime.t_nsec);
- buf->bs_mtime.tv_sec = be32_to_cpu(dic->di_mtime.t_sec);
- buf->bs_mtime.tv_nsec = be32_to_cpu(dic->di_mtime.t_nsec);
- buf->bs_ctime.tv_sec = be32_to_cpu(dic->di_ctime.t_sec);
- buf->bs_ctime.tv_nsec = be32_to_cpu(dic->di_ctime.t_nsec);
- buf->bs_xflags = xfs_dic2xflags(dic);
- buf->bs_extsize = be32_to_cpu(dic->di_extsize) << mp->m_sb.sb_blocklog;
- buf->bs_extents = be32_to_cpu(dic->di_nextents);
- buf->bs_gen = be32_to_cpu(dic->di_gen);
- memset(buf->bs_pad, 0, sizeof(buf->bs_pad));
- buf->bs_dmevmask = be32_to_cpu(dic->di_dmevmask);
- buf->bs_dmstate = be16_to_cpu(dic->di_dmstate);
- buf->bs_aextents = be16_to_cpu(dic->di_anextents);
- buf->bs_forkoff = XFS_DFORK_BOFF(dic);
+ if (!error)
+ *stat = BULKSTAT_RV_DIDONE;
- switch (dic->di_format) {
- case XFS_DINODE_FMT_DEV:
- buf->bs_rdev = xfs_dinode_get_rdev(dic);
- buf->bs_blksize = BLKDEV_IOSIZE;
- buf->bs_blocks = 0;
- break;
- case XFS_DINODE_FMT_LOCAL:
- case XFS_DINODE_FMT_UUID:
- buf->bs_rdev = 0;
- buf->bs_blksize = mp->m_sb.sb_blocksize;
- buf->bs_blocks = 0;
- break;
- case XFS_DINODE_FMT_EXTENTS:
- case XFS_DINODE_FMT_BTREE:
- buf->bs_rdev = 0;
- buf->bs_blksize = mp->m_sb.sb_blocksize;
- buf->bs_blocks = be64_to_cpu(dic->di_nblocks);
- break;
- }
+ out_free:
+ kmem_free(buf);
+ return error;
}
/* Return 0 on success or positive error */
@@ -217,118 +170,17 @@ xfs_bulkstat_one_fmt(
return 0;
}
-/*
- * Return stat information for one inode.
- * Return 0 if ok, else errno.
- */
-int /* error status */
-xfs_bulkstat_one_int(
- xfs_mount_t *mp, /* mount point for filesystem */
- xfs_ino_t ino, /* inode number to get data for */
- void __user *buffer, /* buffer to place output in */
- int ubsize, /* size of buffer */
- bulkstat_one_fmt_pf formatter, /* formatter, copy to user */
- xfs_daddr_t bno, /* starting bno of inode cluster */
- int *ubused, /* bytes used by me */
- void *dibuff, /* on-disk inode buffer */
- int *stat) /* BULKSTAT_RV_... */
-{
- xfs_bstat_t *buf; /* return buffer */
- int error = 0; /* error value */
- xfs_dinode_t *dip; /* dinode inode pointer */
-
- dip = (xfs_dinode_t *)dibuff;
- *stat = BULKSTAT_RV_NOTHING;
-
- if (!buffer || xfs_internal_inum(mp, ino))
- return XFS_ERROR(EINVAL);
-
- buf = kmem_alloc(sizeof(*buf), KM_SLEEP);
-
- if (dip == NULL) {
- /* We're not being passed a pointer to a dinode. This happens
- * if BULKSTAT_FG_IGET is selected. Do the iget.
- */
- error = xfs_bulkstat_one_iget(mp, ino, bno, buf, stat);
- if (error)
- goto out_free;
- } else {
- xfs_bulkstat_one_dinode(mp, ino, dip, buf);
- }
-
- error = formatter(buffer, ubsize, ubused, buf);
- if (error)
- goto out_free;
-
- *stat = BULKSTAT_RV_DIDONE;
-
- out_free:
- kmem_free(buf);
- return error;
-}
-
int
xfs_bulkstat_one(
xfs_mount_t *mp, /* mount point for filesystem */
xfs_ino_t ino, /* inode number to get data for */
void __user *buffer, /* buffer to place output in */
int ubsize, /* size of buffer */
- void *private_data, /* my private data */
- xfs_daddr_t bno, /* starting bno of inode cluster */
int *ubused, /* bytes used by me */
- void *dibuff, /* on-disk inode buffer */
int *stat) /* BULKSTAT_RV_... */
{
return xfs_bulkstat_one_int(mp, ino, buffer, ubsize,
- xfs_bulkstat_one_fmt, bno,
- ubused, dibuff, stat);
-}
-
-/*
- * Test to see whether we can use the ondisk inode directly, based
- * on the given bulkstat flags, filling in dipp accordingly.
- * Returns zero if the inode is dodgey.
- */
-STATIC int
-xfs_bulkstat_use_dinode(
- xfs_mount_t *mp,
- int flags,
- xfs_buf_t *bp,
- int clustidx,
- xfs_dinode_t **dipp)
-{
- xfs_dinode_t *dip;
- unsigned int aformat;
-
- *dipp = NULL;
- if (!bp || (flags & BULKSTAT_FG_IGET))
- return 1;
- dip = (xfs_dinode_t *)
- xfs_buf_offset(bp, clustidx << mp->m_sb.sb_inodelog);
- /*
- * Check the buffer containing the on-disk inode for di_mode == 0.
- * This is to prevent xfs_bulkstat from picking up just reclaimed
- * inodes that have their in-core state initialized but not flushed
- * to disk yet. This is a temporary hack that would require a proper
- * fix in the future.
- */
- if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC ||
- !XFS_DINODE_GOOD_VERSION(dip->di_version) ||
- !dip->di_mode)
- return 0;
- if (flags & BULKSTAT_FG_QUICK) {
- *dipp = dip;
- return 1;
- }
- /* BULKSTAT_FG_INLINE: if attr fork is local, or not there, use it */
- aformat = dip->di_aformat;
- if ((XFS_DFORK_Q(dip) == 0) ||
- (aformat == XFS_DINODE_FMT_LOCAL) ||
- (aformat == XFS_DINODE_FMT_EXTENTS && !dip->di_anextents)) {
- *dipp = dip;
- return 1;
- }
- return 1;
+ xfs_bulkstat_one_fmt, ubused, stat);
}
#define XFS_BULKSTAT_UBLEFT(ubleft) ((ubleft) >= statstruct_size)
@@ -342,10 +194,8 @@ xfs_bulkstat(
xfs_ino_t *lastinop, /* last inode returned */
int *ubcountp, /* size of buffer/count returned */
bulkstat_one_pf formatter, /* func that'd fill a single buf */
- void *private_data,/* private data for formatter */
size_t statstruct_size, /* sizeof struct filling */
char __user *ubuffer, /* buffer with inode stats */
- int flags, /* defined in xfs_itable.h */
int *done) /* 1 if there are more stats to get */
{
xfs_agblock_t agbno=0;/* allocation group block number */
@@ -380,14 +230,12 @@ xfs_bulkstat(
int ubelem; /* spaces used in user's buffer */
int ubused; /* bytes used by formatter */
xfs_buf_t *bp; /* ptr to on-disk inode cluster buf */
- xfs_dinode_t *dip; /* ptr into bp for specific inode */
/*
* Get the last inode value, see if there's nothing to do.
*/
ino = (xfs_ino_t)*lastinop;
lastino = ino;
- dip = NULL;
agno = XFS_INO_TO_AGNO(mp, ino);
agino = XFS_INO_TO_AGINO(mp, ino);
if (agno >= mp->m_sb.sb_agcount ||
@@ -612,37 +460,6 @@ xfs_bulkstat(
irbp->ir_startino) +
((chunkidx & nimask) >>
mp->m_sb.sb_inopblog);
-
- if (flags & (BULKSTAT_FG_QUICK |
- BULKSTAT_FG_INLINE)) {
- int offset;
-
- ino = XFS_AGINO_TO_INO(mp, agno,
- agino);
- bno = XFS_AGB_TO_DADDR(mp, agno,
- agbno);
-
- /*
- * Get the inode cluster buffer
- */
- if (bp)
- xfs_buf_relse(bp);
-
- error = xfs_inotobp(mp, NULL, ino, &dip,
- &bp, &offset,
- XFS_IGET_BULKSTAT);
-
- if (!error)
- clustidx = offset / mp->m_sb.sb_inodesize;
- if (XFS_TEST_ERROR(error != 0,
- mp, XFS_ERRTAG_BULKSTAT_READ_CHUNK,
- XFS_RANDOM_BULKSTAT_READ_CHUNK)) {
- bp = NULL;
- ubleft = 0;
- rval = error;
- break;
- }
- }
}
ino = XFS_AGINO_TO_INO(mp, agno, agino);
bno = XFS_AGB_TO_DADDR(mp, agno, agbno);
@@ -658,35 +475,13 @@ xfs_bulkstat(
* when the chunk is used up.
*/
irbp->ir_freecount++;
- if (!xfs_bulkstat_use_dinode(mp, flags, bp,
- clustidx, &dip)) {
- lastino = ino;
- continue;
- }
- /*
- * If we need to do an iget, cannot hold bp.
- * Drop it, until starting the next cluster.
- */
- if ((flags & BULKSTAT_FG_INLINE) && !dip) {
- if (bp)
- xfs_buf_relse(bp);
- bp = NULL;
- }
/*
* Get the inode and fill in a single buffer.
- * BULKSTAT_FG_QUICK uses dip to fill it in.
- * BULKSTAT_FG_IGET uses igets.
- * BULKSTAT_FG_INLINE uses dip if we have an
- * inline attr fork, else igets.
- * See: xfs_bulkstat_one & xfs_dm_bulkstat_one.
- * This is also used to count inodes/blks, etc
- * in xfs_qm_quotacheck.
*/
ubused = statstruct_size;
- error = formatter(mp, ino, ubufp,
- ubleft, private_data,
- bno, &ubused, dip, &fmterror);
+ error = formatter(mp, ino, ubufp, ubleft,
+ &ubused, &fmterror);
if (fmterror == BULKSTAT_RV_NOTHING) {
if (error && error != ENOENT &&
error != EINVAL) {
@@ -778,8 +573,7 @@ xfs_bulkstat_single(
*/
ino = (xfs_ino_t)*lastinop;
- error = xfs_bulkstat_one(mp, ino, buffer, sizeof(xfs_bstat_t),
- NULL, 0, NULL, NULL, &res);
+ error = xfs_bulkstat_one(mp, ino, buffer, sizeof(xfs_bstat_t), 0, &res);
if (error) {
/*
* Special case way failed, do it the "long" way
@@ -788,8 +582,7 @@ xfs_bulkstat_single(
(*lastinop)--;
count = 1;
if (xfs_bulkstat(mp, lastinop, &count, xfs_bulkstat_one,
- NULL, sizeof(xfs_bstat_t), buffer,
- BULKSTAT_FG_IGET, done))
+ sizeof(xfs_bstat_t), buffer, done))
return error;
if (count == 0 || (xfs_ino_t)*lastinop != ino)
return error == EFSCORRUPTED ?
diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h
index 20792bf45946..97295d91d170 100644
--- a/fs/xfs/xfs_itable.h
+++ b/fs/xfs/xfs_itable.h
@@ -27,10 +27,7 @@ typedef int (*bulkstat_one_pf)(struct xfs_mount *mp,
xfs_ino_t ino,
void __user *buffer,
int ubsize,
- void *private_data,
- xfs_daddr_t bno,
int *ubused,
- void *dip,
int *stat);
/*
@@ -41,13 +38,6 @@ typedef int (*bulkstat_one_pf)(struct xfs_mount *mp,
#define BULKSTAT_RV_GIVEUP 2
/*
- * Values for bulkstat flag argument.
- */
-#define BULKSTAT_FG_IGET 0x1 /* Go through the buffer cache */
-#define BULKSTAT_FG_QUICK 0x2 /* No iget, walk the dinode cluster */
-#define BULKSTAT_FG_INLINE 0x4 /* No iget if inline attrs */
-
-/*
* Return stat information in bulk (by-inode) for the filesystem.
*/
int /* error status */
@@ -56,10 +46,8 @@ xfs_bulkstat(
xfs_ino_t *lastino, /* last inode returned */
int *count, /* size of buffer/count returned */
bulkstat_one_pf formatter, /* func that'd fill a single buf */
- void *private_data, /* private data for formatter */
size_t statstruct_size,/* sizeof struct that we're filling */
char __user *ubuffer,/* buffer with inode stats */
- int flags, /* flag to control access method */
int *done); /* 1 if there are more stats to get */
int
@@ -82,9 +70,7 @@ xfs_bulkstat_one_int(
void __user *buffer,
int ubsize,
bulkstat_one_fmt_pf formatter,
- xfs_daddr_t bno,
int *ubused,
- void *dibuff,
int *stat);
int
@@ -93,10 +79,7 @@ xfs_bulkstat_one(
xfs_ino_t ino,
void __user *buffer,
int ubsize,
- void *private_data,
- xfs_daddr_t bno,
int *ubused,
- void *dibuff,
int *stat);
typedef int (*inumbers_fmt_pf)(
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 5215abc8023a..925d572bf0f4 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -24,8 +24,6 @@
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
#include "xfs_mount.h"
#include "xfs_error.h"
#include "xfs_log_priv.h"
@@ -35,8 +33,6 @@
#include "xfs_ialloc_btree.h"
#include "xfs_log_recover.h"
#include "xfs_trans_priv.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_rw.h"
@@ -337,7 +333,6 @@ xfs_log_reserve(
int retval = 0;
ASSERT(client == XFS_TRANSACTION || client == XFS_LOG);
- ASSERT((flags & XFS_LOG_NOSLEEP) == 0);
if (XLOG_FORCED_SHUTDOWN(log))
return XFS_ERROR(EIO);
@@ -552,7 +547,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
.magic = XLOG_UNMOUNT_TYPE,
};
struct xfs_log_iovec reg = {
- .i_addr = (void *)&magic,
+ .i_addr = &magic,
.i_len = sizeof(magic),
.i_type = XLOG_REG_TYPE_UNMOUNT,
};
@@ -1047,7 +1042,6 @@ xlog_alloc_log(xfs_mount_t *mp,
xlog_in_core_t *iclog, *prev_iclog=NULL;
xfs_buf_t *bp;
int i;
- int iclogsize;
int error = ENOMEM;
uint log2_size = 0;
@@ -1127,7 +1121,6 @@ xlog_alloc_log(xfs_mount_t *mp,
* with different amounts of memory. See the definition of
* xlog_in_core_t in xfs_log_priv.h for details.
*/
- iclogsize = log->l_iclog_size;
ASSERT(log->l_iclog_size >= 4096);
for (i=0; i < log->l_iclog_bufs; i++) {
*iclogp = kmem_zalloc(sizeof(xlog_in_core_t), KM_MAYFAIL);
@@ -1428,11 +1421,8 @@ xlog_sync(xlog_t *log,
XFS_BUF_BUSY(bp);
XFS_BUF_ASYNC(bp);
bp->b_flags |= XBF_LOG_BUFFER;
- /*
- * Do an ordered write for the log block.
- * Its unnecessary to flush the first split block in the log wrap case.
- */
- if (!split && (log->l_mp->m_flags & XFS_MOUNT_BARRIER))
+
+ if (log->l_mp->m_flags & XFS_MOUNT_BARRIER)
XFS_BUF_ORDERED(bp);
ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 04c78e642cc8..916eb7db14d9 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -55,14 +55,10 @@ static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2)
/*
* Flags to xfs_log_reserve()
*
- * XFS_LOG_SLEEP: If space is not available, sleep (default)
- * XFS_LOG_NOSLEEP: If space is not available, return error
* XFS_LOG_PERM_RESERV: Permanent reservation. When writes are
* performed against this type of reservation, the reservation
* is not decreased. Long running transactions should use this.
*/
-#define XFS_LOG_SLEEP 0x0
-#define XFS_LOG_NOSLEEP 0x1
#define XFS_LOG_PERM_RESERV 0x2
/*
@@ -104,7 +100,7 @@ static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2)
#define XLOG_REG_TYPE_MAX 19
typedef struct xfs_log_iovec {
- xfs_caddr_t i_addr; /* beginning address of region */
+ void *i_addr; /* beginning address of region */
int i_len; /* length in bytes of region */
uint i_type; /* type of region */
} xfs_log_iovec_t;
@@ -201,9 +197,4 @@ int xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
#endif
-
-
-extern int xlog_debug; /* set to 1 to enable real log */
-
-
#endif /* __XFS_LOG_H__ */
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index bb17cc044bf3..31e4ea2d19ac 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -26,8 +26,6 @@
#include "xfs_log_priv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
#include "xfs_mount.h"
#include "xfs_error.h"
#include "xfs_alloc.h"
@@ -554,7 +552,7 @@ xlog_cil_push(
thdr.th_type = XFS_TRANS_CHECKPOINT;
thdr.th_tid = tic->t_tid;
thdr.th_num_items = num_iovecs;
- lhdr.i_addr = (xfs_caddr_t)&thdr;
+ lhdr.i_addr = &thdr;
lhdr.i_len = sizeof(xfs_trans_header_t);
lhdr.i_type = XLOG_REG_TYPE_TRANSHDR;
tic->t_curr_res -= lhdr.i_len + sizeof(xlog_op_header_t);
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 14a69aec2c0b..6f3f5fa37acf 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -24,15 +24,11 @@
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
#include "xfs_mount.h"
#include "xfs_error.h"
#include "xfs_bmap_btree.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_inode_item.h"
@@ -132,15 +128,10 @@ xlog_align(
int nbblks,
xfs_buf_t *bp)
{
- xfs_daddr_t offset;
- xfs_caddr_t ptr;
+ xfs_daddr_t offset = blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1);
- offset = blk_no & ((xfs_daddr_t) log->l_sectBBsize - 1);
- ptr = XFS_BUF_PTR(bp) + BBTOB(offset);
-
- ASSERT(ptr + BBTOB(nbblks) <= XFS_BUF_PTR(bp) + XFS_BUF_SIZE(bp));
-
- return ptr;
+ ASSERT(BBTOB(offset + nbblks) <= XFS_BUF_SIZE(bp));
+ return XFS_BUF_PTR(bp) + BBTOB(offset);
}
@@ -1570,9 +1561,7 @@ xlog_recover_reorder_trans(
list_splice_init(&trans->r_itemq, &sort_list);
list_for_each_entry_safe(item, n, &sort_list, ri_list) {
- xfs_buf_log_format_t *buf_f;
-
- buf_f = (xfs_buf_log_format_t *)item->ri_buf[0].i_addr;
+ xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
switch (ITEM_TYPE(item)) {
case XFS_LI_BUF:
@@ -1897,9 +1886,8 @@ xlog_recover_do_inode_buffer(
* current di_next_unlinked field. Extract its value
* and copy it to the buffer copy.
*/
- logged_nextp = (xfs_agino_t *)
- ((char *)(item->ri_buf[item_index].i_addr) +
- (next_unlinked_offset - reg_buf_offset));
+ logged_nextp = item->ri_buf[item_index].i_addr +
+ next_unlinked_offset - reg_buf_offset;
if (unlikely(*logged_nextp == 0)) {
xfs_fs_cmn_err(CE_ALERT, mp,
"bad inode buffer log record (ptr = 0x%p, bp = 0x%p). XFS trying to replay bad (0) inode di_next_unlinked field",
@@ -1978,8 +1966,7 @@ xlog_recover_do_reg_buffer(
item->ri_buf[i].i_len, __func__);
goto next;
}
- error = xfs_qm_dqcheck((xfs_disk_dquot_t *)
- item->ri_buf[i].i_addr,
+ error = xfs_qm_dqcheck(item->ri_buf[i].i_addr,
-1, 0, XFS_QMOPT_DOWARN,
"dquot_buf_recover");
if (error)
@@ -2192,7 +2179,7 @@ xlog_recover_do_buffer_trans(
xlog_recover_item_t *item,
int pass)
{
- xfs_buf_log_format_t *buf_f;
+ xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
xfs_mount_t *mp;
xfs_buf_t *bp;
int error;
@@ -2202,8 +2189,6 @@ xlog_recover_do_buffer_trans(
ushort flags;
uint buf_flags;
- buf_f = (xfs_buf_log_format_t *)item->ri_buf[0].i_addr;
-
if (pass == XLOG_RECOVER_PASS1) {
/*
* In this pass we're only looking for buf items
@@ -2324,10 +2309,9 @@ xlog_recover_do_inode_trans(
}
if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) {
- in_f = (xfs_inode_log_format_t *)item->ri_buf[0].i_addr;
+ in_f = item->ri_buf[0].i_addr;
} else {
- in_f = (xfs_inode_log_format_t *)kmem_alloc(
- sizeof(xfs_inode_log_format_t), KM_SLEEP);
+ in_f = kmem_alloc(sizeof(xfs_inode_log_format_t), KM_SLEEP);
need_free = 1;
error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f);
if (error)
@@ -2375,7 +2359,7 @@ xlog_recover_do_inode_trans(
error = EFSCORRUPTED;
goto error;
}
- dicp = (xfs_icdinode_t *)(item->ri_buf[1].i_addr);
+ dicp = item->ri_buf[1].i_addr;
if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) {
xfs_buf_relse(bp);
xfs_fs_cmn_err(CE_ALERT, mp,
@@ -2466,7 +2450,7 @@ xlog_recover_do_inode_trans(
}
/* The core is in in-core format */
- xfs_dinode_to_disk(dip, (xfs_icdinode_t *)item->ri_buf[1].i_addr);
+ xfs_dinode_to_disk(dip, item->ri_buf[1].i_addr);
/* the rest is in on-disk format */
if (item->ri_buf[1].i_len > sizeof(struct xfs_icdinode)) {
@@ -2583,7 +2567,7 @@ xlog_recover_do_quotaoff_trans(
return (0);
}
- qoff_f = (xfs_qoff_logformat_t *)item->ri_buf[0].i_addr;
+ qoff_f = item->ri_buf[0].i_addr;
ASSERT(qoff_f);
/*
@@ -2627,9 +2611,8 @@ xlog_recover_do_dquot_trans(
if (mp->m_qflags == 0)
return (0);
- recddq = (xfs_disk_dquot_t *)item->ri_buf[1].i_addr;
-
- if (item->ri_buf[1].i_addr == NULL) {
+ recddq = item->ri_buf[1].i_addr;
+ if (recddq == NULL) {
cmn_err(CE_ALERT,
"XFS: NULL dquot in %s.", __func__);
return XFS_ERROR(EIO);
@@ -2659,7 +2642,7 @@ xlog_recover_do_dquot_trans(
* The other possibility, of course, is that the quota subsystem was
* removed since the last mount - ENOSYS.
*/
- dq_f = (xfs_dq_logformat_t *)item->ri_buf[0].i_addr;
+ dq_f = item->ri_buf[0].i_addr;
ASSERT(dq_f);
if ((error = xfs_qm_dqcheck(recddq,
dq_f->qlf_id,
@@ -2726,7 +2709,7 @@ xlog_recover_do_efi_trans(
return 0;
}
- efi_formatp = (xfs_efi_log_format_t *)item->ri_buf[0].i_addr;
+ efi_formatp = item->ri_buf[0].i_addr;
mp = log->l_mp;
efip = xfs_efi_init(mp, efi_formatp->efi_nextents);
@@ -2772,7 +2755,7 @@ xlog_recover_do_efd_trans(
return;
}
- efd_formatp = (xfs_efd_log_format_t *)item->ri_buf[0].i_addr;
+ efd_formatp = item->ri_buf[0].i_addr;
ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) +
((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) ||
(item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_64_t) +
@@ -3203,7 +3186,7 @@ xlog_recover_process_one_iunlink(
int error;
ino = XFS_AGINO_TO_INO(mp, agno, agino);
- error = xfs_iget(mp, NULL, ino, 0, 0, &ip, 0);
+ error = xfs_iget(mp, NULL, ino, 0, 0, &ip);
if (error)
goto fail;
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index d7bf38c8cd1c..aeb9d72ebf6e 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -25,13 +25,10 @@
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
#include "xfs_mount.h"
#include "xfs_bmap_btree.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
@@ -268,10 +265,10 @@ xfs_sb_validate_fsb_count(
#if XFS_BIG_BLKNOS /* Limited by ULONG_MAX of page cache index */
if (nblocks >> (PAGE_CACHE_SHIFT - sbp->sb_blocklog) > ULONG_MAX)
- return E2BIG;
+ return EFBIG;
#else /* Limited by UINT_MAX of sectors */
if (nblocks << (sbp->sb_blocklog - BBSHIFT) > UINT_MAX)
- return E2BIG;
+ return EFBIG;
#endif
return 0;
}
@@ -393,7 +390,7 @@ xfs_mount_validate_sb(
xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) {
xfs_fs_mount_cmn_err(flags,
"file system too large to be mounted on this system.");
- return XFS_ERROR(E2BIG);
+ return XFS_ERROR(EFBIG);
}
if (unlikely(sbp->sb_inprogress)) {
@@ -413,17 +410,6 @@ xfs_mount_validate_sb(
return 0;
}
-STATIC void
-xfs_initialize_perag_icache(
- xfs_perag_t *pag)
-{
- if (!pag->pag_ici_init) {
- rwlock_init(&pag->pag_ici_lock);
- INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
- pag->pag_ici_init = 1;
- }
-}
-
int
xfs_initialize_perag(
xfs_mount_t *mp,
@@ -436,13 +422,8 @@ xfs_initialize_perag(
xfs_agino_t agino;
xfs_ino_t ino;
xfs_sb_t *sbp = &mp->m_sb;
- xfs_ino_t max_inum = XFS_MAXINUMBER_32;
int error = -ENOMEM;
- /* Check to see if the filesystem can overflow 32 bit inodes */
- agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0);
- ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino);
-
/*
* Walk the current per-ag tree so we don't try to initialise AGs
* that already exist (growfs case). Allocate and insert all the
@@ -456,11 +437,18 @@ xfs_initialize_perag(
}
if (!first_initialised)
first_initialised = index;
+
pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL);
if (!pag)
goto out_unwind;
+ pag->pag_agno = index;
+ pag->pag_mount = mp;
+ rwlock_init(&pag->pag_ici_lock);
+ INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
+
if (radix_tree_preload(GFP_NOFS))
goto out_unwind;
+
spin_lock(&mp->m_perag_lock);
if (radix_tree_insert(&mp->m_perag_tree, index, pag)) {
BUG();
@@ -469,25 +457,26 @@ xfs_initialize_perag(
error = -EEXIST;
goto out_unwind;
}
- pag->pag_agno = index;
- pag->pag_mount = mp;
spin_unlock(&mp->m_perag_lock);
radix_tree_preload_end();
}
- /* Clear the mount flag if no inode can overflow 32 bits
- * on this filesystem, or if specifically requested..
+ /*
+ * If we mount with the inode64 option, or no inode overflows
+ * the legacy 32-bit address space clear the inode32 option.
*/
- if ((mp->m_flags & XFS_MOUNT_SMALL_INUMS) && ino > max_inum) {
+ agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0);
+ ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino);
+
+ if ((mp->m_flags & XFS_MOUNT_SMALL_INUMS) && ino > XFS_MAXINUMBER_32)
mp->m_flags |= XFS_MOUNT_32BITINODES;
- } else {
+ else
mp->m_flags &= ~XFS_MOUNT_32BITINODES;
- }
- /* If we can overflow then setup the ag headers accordingly */
if (mp->m_flags & XFS_MOUNT_32BITINODES) {
- /* Calculate how much should be reserved for inodes to
- * meet the max inode percentage.
+ /*
+ * Calculate how much should be reserved for inodes to meet
+ * the max inode percentage.
*/
if (mp->m_maxicount) {
__uint64_t icount;
@@ -500,30 +489,28 @@ xfs_initialize_perag(
} else {
max_metadata = agcount;
}
+
for (index = 0; index < agcount; index++) {
ino = XFS_AGINO_TO_INO(mp, index, agino);
- if (ino > max_inum) {
+ if (ino > XFS_MAXINUMBER_32) {
index++;
break;
}
- /* This ag is preferred for inodes */
pag = xfs_perag_get(mp, index);
pag->pagi_inodeok = 1;
if (index < max_metadata)
pag->pagf_metadata = 1;
- xfs_initialize_perag_icache(pag);
xfs_perag_put(pag);
}
} else {
- /* Setup default behavior for smaller filesystems */
for (index = 0; index < agcount; index++) {
pag = xfs_perag_get(mp, index);
pag->pagi_inodeok = 1;
- xfs_initialize_perag_icache(pag);
xfs_perag_put(pag);
}
}
+
if (maxagi)
*maxagi = index;
return 0;
@@ -1009,7 +996,7 @@ xfs_check_sizes(xfs_mount_t *mp)
d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) {
cmn_err(CE_WARN, "XFS: size check 1 failed");
- return XFS_ERROR(E2BIG);
+ return XFS_ERROR(EFBIG);
}
error = xfs_read_buf(mp, mp->m_ddev_targp,
d - XFS_FSS_TO_BB(mp, 1),
@@ -1019,7 +1006,7 @@ xfs_check_sizes(xfs_mount_t *mp)
} else {
cmn_err(CE_WARN, "XFS: size check 2 failed");
if (error == ENOSPC)
- error = XFS_ERROR(E2BIG);
+ error = XFS_ERROR(EFBIG);
return error;
}
@@ -1027,7 +1014,7 @@ xfs_check_sizes(xfs_mount_t *mp)
d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks);
if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) {
cmn_err(CE_WARN, "XFS: size check 3 failed");
- return XFS_ERROR(E2BIG);
+ return XFS_ERROR(EFBIG);
}
error = xfs_read_buf(mp, mp->m_logdev_targp,
d - XFS_FSB_TO_BB(mp, 1),
@@ -1037,7 +1024,7 @@ xfs_check_sizes(xfs_mount_t *mp)
} else {
cmn_err(CE_WARN, "XFS: size check 3 failed");
if (error == ENOSPC)
- error = XFS_ERROR(E2BIG);
+ error = XFS_ERROR(EFBIG);
return error;
}
}
@@ -1254,7 +1241,7 @@ xfs_mountfs(
* Allocate and initialize the per-ag data.
*/
spin_lock_init(&mp->m_perag_lock);
- INIT_RADIX_TREE(&mp->m_perag_tree, GFP_NOFS);
+ INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC);
error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi);
if (error) {
cmn_err(CE_WARN, "XFS: Failed per-ag init: %d", error);
@@ -1310,7 +1297,7 @@ xfs_mountfs(
* Get and sanity-check the root inode.
* Save the pointer to it in the mount structure.
*/
- error = xfs_iget(mp, NULL, sbp->sb_rootino, 0, XFS_ILOCK_EXCL, &rip, 0);
+ error = xfs_iget(mp, NULL, sbp->sb_rootino, 0, XFS_ILOCK_EXCL, &rip);
if (error) {
cmn_err(CE_WARN, "XFS: failed to read root inode");
goto out_log_dealloc;
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 1d2c7eed4eda..622da2179a57 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -66,65 +66,6 @@ struct xfs_nameops;
struct xfs_ail;
struct xfs_quotainfo;
-
-/*
- * Prototypes and functions for the Data Migration subsystem.
- */
-
-typedef int (*xfs_send_data_t)(int, struct xfs_inode *,
- xfs_off_t, size_t, int, int *);
-typedef int (*xfs_send_mmap_t)(struct vm_area_struct *, uint);
-typedef int (*xfs_send_destroy_t)(struct xfs_inode *, dm_right_t);
-typedef int (*xfs_send_namesp_t)(dm_eventtype_t, struct xfs_mount *,
- struct xfs_inode *, dm_right_t,
- struct xfs_inode *, dm_right_t,
- const unsigned char *, const unsigned char *,
- mode_t, int, int);
-typedef int (*xfs_send_mount_t)(struct xfs_mount *, dm_right_t,
- char *, char *);
-typedef void (*xfs_send_unmount_t)(struct xfs_mount *, struct xfs_inode *,
- dm_right_t, mode_t, int, int);
-
-typedef struct xfs_dmops {
- xfs_send_data_t xfs_send_data;
- xfs_send_mmap_t xfs_send_mmap;
- xfs_send_destroy_t xfs_send_destroy;
- xfs_send_namesp_t xfs_send_namesp;
- xfs_send_mount_t xfs_send_mount;
- xfs_send_unmount_t xfs_send_unmount;
-} xfs_dmops_t;
-
-#define XFS_DMAPI_UNMOUNT_FLAGS(mp) \
- (((mp)->m_dmevmask & (1 << DM_EVENT_UNMOUNT)) ? 0 : DM_FLAGS_UNWANTED)
-
-#define XFS_SEND_DATA(mp, ev,ip,off,len,fl,lock) \
- (*(mp)->m_dm_ops->xfs_send_data)(ev,ip,off,len,fl,lock)
-#define XFS_SEND_MMAP(mp, vma,fl) \
- (*(mp)->m_dm_ops->xfs_send_mmap)(vma,fl)
-#define XFS_SEND_DESTROY(mp, ip,right) \
- (*(mp)->m_dm_ops->xfs_send_destroy)(ip,right)
-#define XFS_SEND_NAMESP(mp, ev,b1,r1,b2,r2,n1,n2,mode,rval,fl) \
- (*(mp)->m_dm_ops->xfs_send_namesp)(ev,NULL,b1,r1,b2,r2,n1,n2,mode,rval,fl)
-#define XFS_SEND_MOUNT(mp,right,path,name) \
- (*(mp)->m_dm_ops->xfs_send_mount)(mp,right,path,name)
-#define XFS_SEND_PREUNMOUNT(mp) \
-do { \
- if (mp->m_flags & XFS_MOUNT_DMAPI) { \
- (*(mp)->m_dm_ops->xfs_send_namesp)(DM_EVENT_PREUNMOUNT, mp, \
- (mp)->m_rootip, DM_RIGHT_NULL, \
- (mp)->m_rootip, DM_RIGHT_NULL, \
- NULL, NULL, 0, 0, XFS_DMAPI_UNMOUNT_FLAGS(mp)); \
- } \
-} while (0)
-#define XFS_SEND_UNMOUNT(mp) \
-do { \
- if (mp->m_flags & XFS_MOUNT_DMAPI) { \
- (*(mp)->m_dm_ops->xfs_send_unmount)(mp, (mp)->m_rootip, \
- DM_RIGHT_NULL, 0, 0, XFS_DMAPI_UNMOUNT_FLAGS(mp)); \
- } \
-} while (0)
-
-
#ifdef HAVE_PERCPU_SB
/*
@@ -241,8 +182,6 @@ typedef struct xfs_mount {
uint m_chsize; /* size of next field */
struct xfs_chash *m_chash; /* fs private inode per-cluster
* hash table */
- struct xfs_dmops *m_dm_ops; /* vector of DMI ops */
- struct xfs_qmops *m_qm_ops; /* vector of XQM ops */
atomic_t m_active_trans; /* number trans frozen */
#ifdef HAVE_PERCPU_SB
xfs_icsb_cnts_t __percpu *m_sb_cnts; /* per-cpu superblock counters */
@@ -259,7 +198,7 @@ typedef struct xfs_mount {
wait_queue_head_t m_wait_single_sync_task;
__int64_t m_update_flags; /* sb flags we need to update
on the next remount,rw */
- struct list_head m_mplist; /* inode shrinker mount list */
+ struct shrinker m_inode_shrink; /* inode reclaim shrinker */
} xfs_mount_t;
/*
@@ -269,7 +208,6 @@ typedef struct xfs_mount {
must be synchronous except
for space allocations */
#define XFS_MOUNT_DELAYLOG (1ULL << 1) /* delayed logging is enabled */
-#define XFS_MOUNT_DMAPI (1ULL << 2) /* dmapi is enabled */
#define XFS_MOUNT_WAS_CLEAN (1ULL << 3)
#define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem
operations, typically for
@@ -282,8 +220,6 @@ typedef struct xfs_mount {
#define XFS_MOUNT_GRPID (1ULL << 9) /* group-ID assigned from directory */
#define XFS_MOUNT_NORECOVERY (1ULL << 10) /* no recovery - dirty fs */
#define XFS_MOUNT_DFLT_IOSIZE (1ULL << 12) /* set default i/o size */
-#define XFS_MOUNT_OSYNCISOSYNC (1ULL << 13) /* o_sync is REALLY o_sync */
- /* osyncisdsync is now default*/
#define XFS_MOUNT_32BITINODES (1ULL << 14) /* do not create inodes above
* 32 bits in size */
#define XFS_MOUNT_SMALL_INUMS (1ULL << 15) /* users wants 32bit inodes */
@@ -440,11 +376,6 @@ extern int xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t);
extern int xfs_dev_is_read_only(struct xfs_mount *, char *);
-extern int xfs_dmops_get(struct xfs_mount *);
-extern void xfs_dmops_put(struct xfs_mount *);
-
-extern struct xfs_dmops xfs_dmcore_xfs;
-
#endif /* __KERNEL__ */
extern void xfs_mod_sb(struct xfs_trans *, __int64_t);
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index fc1cda23b817..8fca957200df 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -24,12 +24,9 @@
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
#include "xfs_mount.h"
#include "xfs_da_btree.h"
#include "xfs_bmap_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_inode_item.h"
@@ -116,20 +113,7 @@ xfs_rename(
int spaceres;
int num_inodes;
- xfs_itrace_entry(src_dp);
- xfs_itrace_entry(target_dp);
-
- if (DM_EVENT_ENABLED(src_dp, DM_EVENT_RENAME) ||
- DM_EVENT_ENABLED(target_dp, DM_EVENT_RENAME)) {
- error = XFS_SEND_NAMESP(mp, DM_EVENT_RENAME,
- src_dp, DM_RIGHT_NULL,
- target_dp, DM_RIGHT_NULL,
- src_name->name, target_name->name,
- 0, 0, 0);
- if (error)
- return error;
- }
- /* Return through std_return after this point. */
+ trace_xfs_rename(src_dp, target_dp, src_name, target_name);
new_parent = (src_dp != target_dp);
src_is_directory = ((src_ip->i_d.di_mode & S_IFMT) == S_IFDIR);
@@ -184,26 +168,14 @@ xfs_rename(
/*
* Join all the inodes to the transaction. From this point on,
* we can rely on either trans_commit or trans_cancel to unlock
- * them. Note that we need to add a vnode reference to the
- * directories since trans_commit & trans_cancel will decrement
- * them when they unlock the inodes. Also, we need to be careful
- * not to add an inode to the transaction more than once.
+ * them.
*/
- IHOLD(src_dp);
- xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL);
-
- if (new_parent) {
- IHOLD(target_dp);
- xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL);
- }
-
- IHOLD(src_ip);
- xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL);
-
- if (target_ip) {
- IHOLD(target_ip);
- xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL);
- }
+ xfs_trans_ijoin_ref(tp, src_dp, XFS_ILOCK_EXCL);
+ if (new_parent)
+ xfs_trans_ijoin_ref(tp, target_dp, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin_ref(tp, src_ip, XFS_ILOCK_EXCL);
+ if (target_ip)
+ xfs_trans_ijoin_ref(tp, target_ip, XFS_ILOCK_EXCL);
/*
* If we are using project inheritance, we only allow renames
@@ -369,26 +341,13 @@ xfs_rename(
* trans_commit will unlock src_ip, target_ip & decrement
* the vnode references.
*/
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
-
- /* Fall through to std_return with error = 0 or errno from
- * xfs_trans_commit */
-std_return:
- if (DM_EVENT_ENABLED(src_dp, DM_EVENT_POSTRENAME) ||
- DM_EVENT_ENABLED(target_dp, DM_EVENT_POSTRENAME)) {
- (void) XFS_SEND_NAMESP (mp, DM_EVENT_POSTRENAME,
- src_dp, DM_RIGHT_NULL,
- target_dp, DM_RIGHT_NULL,
- src_name->name, target_name->name,
- 0, error, 0);
- }
- return error;
+ return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
abort_return:
cancel_flags |= XFS_TRANS_ABORT;
- /* FALLTHROUGH */
error_return:
xfs_bmap_cancel(&free_list);
xfs_trans_cancel(tp, cancel_flags);
- goto std_return;
+ std_return:
+ return error;
}
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 6be05f756d59..891260fea11e 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -25,17 +25,10 @@
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
#include "xfs_mount.h"
#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
-#include "xfs_btree.h"
-#include "xfs_ialloc.h"
#include "xfs_alloc.h"
#include "xfs_bmap.h"
#include "xfs_rtalloc.h"
@@ -129,7 +122,7 @@ xfs_growfs_rt_alloc(
cancelflags |= XFS_TRANS_ABORT;
error = xfs_bmapi(tp, ip, oblocks, nblocks - oblocks,
XFS_BMAPI_WRITE | XFS_BMAPI_METADATA, &firstblock,
- resblks, &map, &nmap, &flist, NULL);
+ resblks, &map, &nmap, &flist);
if (!error && nmap < 1)
error = XFS_ERROR(ENOSPC);
if (error)
@@ -2247,7 +2240,7 @@ xfs_rtmount_init(
cmn_err(CE_WARN, "XFS: realtime mount -- %llu != %llu",
(unsigned long long) XFS_BB_TO_FSB(mp, d),
(unsigned long long) mp->m_sb.sb_rblocks);
- return XFS_ERROR(E2BIG);
+ return XFS_ERROR(EFBIG);
}
error = xfs_read_buf(mp, mp->m_rtdev_targp,
d - XFS_FSB_TO_BB(mp, 1),
@@ -2256,7 +2249,7 @@ xfs_rtmount_init(
cmn_err(CE_WARN,
"XFS: realtime mount -- xfs_read_buf failed, returned %d", error);
if (error == ENOSPC)
- return XFS_ERROR(E2BIG);
+ return XFS_ERROR(EFBIG);
return error;
}
xfs_buf_relse(bp);
@@ -2277,12 +2270,12 @@ xfs_rtmount_inodes(
sbp = &mp->m_sb;
if (sbp->sb_rbmino == NULLFSINO)
return 0;
- error = xfs_iget(mp, NULL, sbp->sb_rbmino, 0, 0, &mp->m_rbmip, 0);
+ error = xfs_iget(mp, NULL, sbp->sb_rbmino, 0, 0, &mp->m_rbmip);
if (error)
return error;
ASSERT(mp->m_rbmip != NULL);
ASSERT(sbp->sb_rsumino != NULLFSINO);
- error = xfs_iget(mp, NULL, sbp->sb_rsumino, 0, 0, &mp->m_rsumip, 0);
+ error = xfs_iget(mp, NULL, sbp->sb_rsumino, 0, 0, &mp->m_rsumip);
if (error) {
IRELE(mp->m_rbmip);
return error;
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index b2d67adb6a08..ff614c29b441 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -147,7 +147,16 @@ xfs_growfs_rt(
# define xfs_rtfree_extent(t,b,l) (ENOSYS)
# define xfs_rtpick_extent(m,t,l,rb) (ENOSYS)
# define xfs_growfs_rt(mp,in) (ENOSYS)
-# define xfs_rtmount_init(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS))
+static inline int /* error */
+xfs_rtmount_init(
+ xfs_mount_t *mp) /* file system mount structure */
+{
+ if (mp->m_sb.sb_rblocks == 0)
+ return 0;
+
+ cmn_err(CE_WARN, "XFS: Not built with CONFIG_XFS_RT");
+ return ENOSYS;
+}
# define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS))
# define xfs_rtunmount_inodes(m)
#endif /* CONFIG_XFS_RT */
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
index e336742a58a4..56861d5daaef 100644
--- a/fs/xfs/xfs_rw.c
+++ b/fs/xfs/xfs_rw.c
@@ -24,27 +24,12 @@
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
#include "xfs_mount.h"
#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
-#include "xfs_inode_item.h"
-#include "xfs_itable.h"
-#include "xfs_btree.h"
-#include "xfs_alloc.h"
-#include "xfs_ialloc.h"
-#include "xfs_attr.h"
-#include "xfs_bmap.h"
#include "xfs_error.h"
-#include "xfs_buf_item.h"
#include "xfs_rw.h"
-#include "xfs_trace.h"
/*
* Force a shutdown of the filesystem instantly while keeping
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index ce558efa2ea0..fdca7416c754 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -1,5 +1,6 @@
/*
* Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * Copyright (C) 2010 Red Hat, Inc.
* All Rights Reserved.
*
* This program is free software; you can redistribute it and/or
@@ -24,16 +25,12 @@
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
#include "xfs_mount.h"
#include "xfs_error.h"
#include "xfs_da_btree.h"
#include "xfs_bmap_btree.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
@@ -47,135 +44,491 @@
#include "xfs_trace.h"
kmem_zone_t *xfs_trans_zone;
+kmem_zone_t *xfs_log_item_desc_zone;
+
+
+/*
+ * Various log reservation values.
+ *
+ * These are based on the size of the file system block because that is what
+ * most transactions manipulate. Each adds in an additional 128 bytes per
+ * item logged to try to account for the overhead of the transaction mechanism.
+ *
+ * Note: Most of the reservations underestimate the number of allocation
+ * groups into which they could free extents in the xfs_bmap_finish() call.
+ * This is because the number in the worst case is quite high and quite
+ * unusual. In order to fix this we need to change xfs_bmap_finish() to free
+ * extents in only a single AG at a time. This will require changes to the
+ * EFI code as well, however, so that the EFI for the extents not freed is
+ * logged again in each transaction. See SGI PV #261917.
+ *
+ * Reservation functions here avoid a huge stack in xfs_trans_init due to
+ * register overflow from temporaries in the calculations.
+ */
+
/*
- * Reservation functions here avoid a huge stack in xfs_trans_init
- * due to register overflow from temporaries in the calculations.
+ * In a write transaction we can allocate a maximum of 2
+ * extents. This gives:
+ * the inode getting the new extents: inode size
+ * the inode's bmap btree: max depth * block size
+ * the agfs of the ags from which the extents are allocated: 2 * sector
+ * the superblock free block counter: sector size
+ * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
+ * And the bmap_finish transaction can free bmap blocks in a join:
+ * the agfs of the ags containing the blocks: 2 * sector size
+ * the agfls of the ags containing the blocks: 2 * sector size
+ * the super block free block counter: sector size
+ * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
*/
STATIC uint
-xfs_calc_write_reservation(xfs_mount_t *mp)
+xfs_calc_write_reservation(
+ struct xfs_mount *mp)
{
- return XFS_CALC_WRITE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp);
+ return XFS_DQUOT_LOGRES(mp) +
+ MAX((mp->m_sb.sb_inodesize +
+ XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) +
+ 2 * mp->m_sb.sb_sectsize +
+ mp->m_sb.sb_sectsize +
+ XFS_ALLOCFREE_LOG_RES(mp, 2) +
+ 128 * (4 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) +
+ XFS_ALLOCFREE_LOG_COUNT(mp, 2))),
+ (2 * mp->m_sb.sb_sectsize +
+ 2 * mp->m_sb.sb_sectsize +
+ mp->m_sb.sb_sectsize +
+ XFS_ALLOCFREE_LOG_RES(mp, 2) +
+ 128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))));
}
+/*
+ * In truncating a file we free up to two extents at once. We can modify:
+ * the inode being truncated: inode size
+ * the inode's bmap btree: (max depth + 1) * block size
+ * And the bmap_finish transaction can free the blocks and bmap blocks:
+ * the agf for each of the ags: 4 * sector size
+ * the agfl for each of the ags: 4 * sector size
+ * the super block to reflect the freed blocks: sector size
+ * worst case split in allocation btrees per extent assuming 4 extents:
+ * 4 exts * 2 trees * (2 * max depth - 1) * block size
+ * the inode btree: max depth * blocksize
+ * the allocation btrees: 2 trees * (max depth - 1) * block size
+ */
STATIC uint
-xfs_calc_itruncate_reservation(xfs_mount_t *mp)
+xfs_calc_itruncate_reservation(
+ struct xfs_mount *mp)
{
- return XFS_CALC_ITRUNCATE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp);
+ return XFS_DQUOT_LOGRES(mp) +
+ MAX((mp->m_sb.sb_inodesize +
+ XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1) +
+ 128 * (2 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK))),
+ (4 * mp->m_sb.sb_sectsize +
+ 4 * mp->m_sb.sb_sectsize +
+ mp->m_sb.sb_sectsize +
+ XFS_ALLOCFREE_LOG_RES(mp, 4) +
+ 128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4)) +
+ 128 * 5 +
+ XFS_ALLOCFREE_LOG_RES(mp, 1) +
+ 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels +
+ XFS_ALLOCFREE_LOG_COUNT(mp, 1))));
}
+/*
+ * In renaming a files we can modify:
+ * the four inodes involved: 4 * inode size
+ * the two directory btrees: 2 * (max depth + v2) * dir block size
+ * the two directory bmap btrees: 2 * max depth * block size
+ * And the bmap_finish transaction can free dir and bmap blocks (two sets
+ * of bmap blocks) giving:
+ * the agf for the ags in which the blocks live: 3 * sector size
+ * the agfl for the ags in which the blocks live: 3 * sector size
+ * the superblock for the free block count: sector size
+ * the allocation btrees: 3 exts * 2 trees * (2 * max depth - 1) * block size
+ */
STATIC uint
-xfs_calc_rename_reservation(xfs_mount_t *mp)
+xfs_calc_rename_reservation(
+ struct xfs_mount *mp)
{
- return XFS_CALC_RENAME_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp);
+ return XFS_DQUOT_LOGRES(mp) +
+ MAX((4 * mp->m_sb.sb_inodesize +
+ 2 * XFS_DIROP_LOG_RES(mp) +
+ 128 * (4 + 2 * XFS_DIROP_LOG_COUNT(mp))),
+ (3 * mp->m_sb.sb_sectsize +
+ 3 * mp->m_sb.sb_sectsize +
+ mp->m_sb.sb_sectsize +
+ XFS_ALLOCFREE_LOG_RES(mp, 3) +
+ 128 * (7 + XFS_ALLOCFREE_LOG_COUNT(mp, 3))));
}
+/*
+ * For creating a link to an inode:
+ * the parent directory inode: inode size
+ * the linked inode: inode size
+ * the directory btree could split: (max depth + v2) * dir block size
+ * the directory bmap btree could join or split: (max depth + v2) * blocksize
+ * And the bmap_finish transaction can free some bmap blocks giving:
+ * the agf for the ag in which the blocks live: sector size
+ * the agfl for the ag in which the blocks live: sector size
+ * the superblock for the free block count: sector size
+ * the allocation btrees: 2 trees * (2 * max depth - 1) * block size
+ */
STATIC uint
-xfs_calc_link_reservation(xfs_mount_t *mp)
+xfs_calc_link_reservation(
+ struct xfs_mount *mp)
{
- return XFS_CALC_LINK_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp);
+ return XFS_DQUOT_LOGRES(mp) +
+ MAX((mp->m_sb.sb_inodesize +
+ mp->m_sb.sb_inodesize +
+ XFS_DIROP_LOG_RES(mp) +
+ 128 * (2 + XFS_DIROP_LOG_COUNT(mp))),
+ (mp->m_sb.sb_sectsize +
+ mp->m_sb.sb_sectsize +
+ mp->m_sb.sb_sectsize +
+ XFS_ALLOCFREE_LOG_RES(mp, 1) +
+ 128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1))));
}
+/*
+ * For removing a directory entry we can modify:
+ * the parent directory inode: inode size
+ * the removed inode: inode size
+ * the directory btree could join: (max depth + v2) * dir block size
+ * the directory bmap btree could join or split: (max depth + v2) * blocksize
+ * And the bmap_finish transaction can free the dir and bmap blocks giving:
+ * the agf for the ag in which the blocks live: 2 * sector size
+ * the agfl for the ag in which the blocks live: 2 * sector size
+ * the superblock for the free block count: sector size
+ * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
+ */
STATIC uint
-xfs_calc_remove_reservation(xfs_mount_t *mp)
+xfs_calc_remove_reservation(
+ struct xfs_mount *mp)
{
- return XFS_CALC_REMOVE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp);
+ return XFS_DQUOT_LOGRES(mp) +
+ MAX((mp->m_sb.sb_inodesize +
+ mp->m_sb.sb_inodesize +
+ XFS_DIROP_LOG_RES(mp) +
+ 128 * (2 + XFS_DIROP_LOG_COUNT(mp))),
+ (2 * mp->m_sb.sb_sectsize +
+ 2 * mp->m_sb.sb_sectsize +
+ mp->m_sb.sb_sectsize +
+ XFS_ALLOCFREE_LOG_RES(mp, 2) +
+ 128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))));
}
+/*
+ * For symlink we can modify:
+ * the parent directory inode: inode size
+ * the new inode: inode size
+ * the inode btree entry: 1 block
+ * the directory btree: (max depth + v2) * dir block size
+ * the directory inode's bmap btree: (max depth + v2) * block size
+ * the blocks for the symlink: 1 kB
+ * Or in the first xact we allocate some inodes giving:
+ * the agi and agf of the ag getting the new inodes: 2 * sectorsize
+ * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
+ * the inode btree: max depth * blocksize
+ * the allocation btrees: 2 trees * (2 * max depth - 1) * block size
+ */
STATIC uint
-xfs_calc_symlink_reservation(xfs_mount_t *mp)
+xfs_calc_symlink_reservation(
+ struct xfs_mount *mp)
{
- return XFS_CALC_SYMLINK_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp);
+ return XFS_DQUOT_LOGRES(mp) +
+ MAX((mp->m_sb.sb_inodesize +
+ mp->m_sb.sb_inodesize +
+ XFS_FSB_TO_B(mp, 1) +
+ XFS_DIROP_LOG_RES(mp) +
+ 1024 +
+ 128 * (4 + XFS_DIROP_LOG_COUNT(mp))),
+ (2 * mp->m_sb.sb_sectsize +
+ XFS_FSB_TO_B(mp, XFS_IALLOC_BLOCKS(mp)) +
+ XFS_FSB_TO_B(mp, mp->m_in_maxlevels) +
+ XFS_ALLOCFREE_LOG_RES(mp, 1) +
+ 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels +
+ XFS_ALLOCFREE_LOG_COUNT(mp, 1))));
}
+/*
+ * For create we can modify:
+ * the parent directory inode: inode size
+ * the new inode: inode size
+ * the inode btree entry: block size
+ * the superblock for the nlink flag: sector size
+ * the directory btree: (max depth + v2) * dir block size
+ * the directory inode's bmap btree: (max depth + v2) * block size
+ * Or in the first xact we allocate some inodes giving:
+ * the agi and agf of the ag getting the new inodes: 2 * sectorsize
+ * the superblock for the nlink flag: sector size
+ * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
+ * the inode btree: max depth * blocksize
+ * the allocation btrees: 2 trees * (max depth - 1) * block size
+ */
STATIC uint
-xfs_calc_create_reservation(xfs_mount_t *mp)
+xfs_calc_create_reservation(
+ struct xfs_mount *mp)
{
- return XFS_CALC_CREATE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp);
+ return XFS_DQUOT_LOGRES(mp) +
+ MAX((mp->m_sb.sb_inodesize +
+ mp->m_sb.sb_inodesize +
+ mp->m_sb.sb_sectsize +
+ XFS_FSB_TO_B(mp, 1) +
+ XFS_DIROP_LOG_RES(mp) +
+ 128 * (3 + XFS_DIROP_LOG_COUNT(mp))),
+ (3 * mp->m_sb.sb_sectsize +
+ XFS_FSB_TO_B(mp, XFS_IALLOC_BLOCKS(mp)) +
+ XFS_FSB_TO_B(mp, mp->m_in_maxlevels) +
+ XFS_ALLOCFREE_LOG_RES(mp, 1) +
+ 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels +
+ XFS_ALLOCFREE_LOG_COUNT(mp, 1))));
}
+/*
+ * Making a new directory is the same as creating a new file.
+ */
STATIC uint
-xfs_calc_mkdir_reservation(xfs_mount_t *mp)
+xfs_calc_mkdir_reservation(
+ struct xfs_mount *mp)
{
- return XFS_CALC_MKDIR_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp);
+ return xfs_calc_create_reservation(mp);
}
+/*
+ * In freeing an inode we can modify:
+ * the inode being freed: inode size
+ * the super block free inode counter: sector size
+ * the agi hash list and counters: sector size
+ * the inode btree entry: block size
+ * the on disk inode before ours in the agi hash list: inode cluster size
+ * the inode btree: max depth * blocksize
+ * the allocation btrees: 2 trees * (max depth - 1) * block size
+ */
STATIC uint
-xfs_calc_ifree_reservation(xfs_mount_t *mp)
+xfs_calc_ifree_reservation(
+ struct xfs_mount *mp)
{
- return XFS_CALC_IFREE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp);
+ return XFS_DQUOT_LOGRES(mp) +
+ mp->m_sb.sb_inodesize +
+ mp->m_sb.sb_sectsize +
+ mp->m_sb.sb_sectsize +
+ XFS_FSB_TO_B(mp, 1) +
+ MAX((__uint16_t)XFS_FSB_TO_B(mp, 1),
+ XFS_INODE_CLUSTER_SIZE(mp)) +
+ 128 * 5 +
+ XFS_ALLOCFREE_LOG_RES(mp, 1) +
+ 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels +
+ XFS_ALLOCFREE_LOG_COUNT(mp, 1));
}
+/*
+ * When only changing the inode we log the inode and possibly the superblock
+ * We also add a bit of slop for the transaction stuff.
+ */
STATIC uint
-xfs_calc_ichange_reservation(xfs_mount_t *mp)
+xfs_calc_ichange_reservation(
+ struct xfs_mount *mp)
{
- return XFS_CALC_ICHANGE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp);
+ return XFS_DQUOT_LOGRES(mp) +
+ mp->m_sb.sb_inodesize +
+ mp->m_sb.sb_sectsize +
+ 512;
+
}
+/*
+ * Growing the data section of the filesystem.
+ * superblock
+ * agi and agf
+ * allocation btrees
+ */
STATIC uint
-xfs_calc_growdata_reservation(xfs_mount_t *mp)
+xfs_calc_growdata_reservation(
+ struct xfs_mount *mp)
{
- return XFS_CALC_GROWDATA_LOG_RES(mp);
+ return mp->m_sb.sb_sectsize * 3 +
+ XFS_ALLOCFREE_LOG_RES(mp, 1) +
+ 128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1));
}
+/*
+ * Growing the rt section of the filesystem.
+ * In the first set of transactions (ALLOC) we allocate space to the
+ * bitmap or summary files.
+ * superblock: sector size
+ * agf of the ag from which the extent is allocated: sector size
+ * bmap btree for bitmap/summary inode: max depth * blocksize
+ * bitmap/summary inode: inode size
+ * allocation btrees for 1 block alloc: 2 * (2 * maxdepth - 1) * blocksize
+ */
STATIC uint
-xfs_calc_growrtalloc_reservation(xfs_mount_t *mp)
+xfs_calc_growrtalloc_reservation(
+ struct xfs_mount *mp)
{
- return XFS_CALC_GROWRTALLOC_LOG_RES(mp);
+ return 2 * mp->m_sb.sb_sectsize +
+ XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) +
+ mp->m_sb.sb_inodesize +
+ XFS_ALLOCFREE_LOG_RES(mp, 1) +
+ 128 * (3 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) +
+ XFS_ALLOCFREE_LOG_COUNT(mp, 1));
}
+/*
+ * Growing the rt section of the filesystem.
+ * In the second set of transactions (ZERO) we zero the new metadata blocks.
+ * one bitmap/summary block: blocksize
+ */
STATIC uint
-xfs_calc_growrtzero_reservation(xfs_mount_t *mp)
+xfs_calc_growrtzero_reservation(
+ struct xfs_mount *mp)
{
- return XFS_CALC_GROWRTZERO_LOG_RES(mp);
+ return mp->m_sb.sb_blocksize + 128;
}
+/*
+ * Growing the rt section of the filesystem.
+ * In the third set of transactions (FREE) we update metadata without
+ * allocating any new blocks.
+ * superblock: sector size
+ * bitmap inode: inode size
+ * summary inode: inode size
+ * one bitmap block: blocksize
+ * summary blocks: new summary size
+ */
STATIC uint
-xfs_calc_growrtfree_reservation(xfs_mount_t *mp)
+xfs_calc_growrtfree_reservation(
+ struct xfs_mount *mp)
{
- return XFS_CALC_GROWRTFREE_LOG_RES(mp);
+ return mp->m_sb.sb_sectsize +
+ 2 * mp->m_sb.sb_inodesize +
+ mp->m_sb.sb_blocksize +
+ mp->m_rsumsize +
+ 128 * 5;
}
+/*
+ * Logging the inode modification timestamp on a synchronous write.
+ * inode
+ */
STATIC uint
-xfs_calc_swrite_reservation(xfs_mount_t *mp)
+xfs_calc_swrite_reservation(
+ struct xfs_mount *mp)
{
- return XFS_CALC_SWRITE_LOG_RES(mp);
+ return mp->m_sb.sb_inodesize + 128;
}
+/*
+ * Logging the inode mode bits when writing a setuid/setgid file
+ * inode
+ */
STATIC uint
-xfs_calc_writeid_reservation(xfs_mount_t *mp)
+xfs_calc_writeid_reservation(
+ struct xfs_mount *mp)
{
- return XFS_CALC_WRITEID_LOG_RES(mp);
+ return mp->m_sb.sb_inodesize + 128;
}
+/*
+ * Converting the inode from non-attributed to attributed.
+ * the inode being converted: inode size
+ * agf block and superblock (for block allocation)
+ * the new block (directory sized)
+ * bmap blocks for the new directory block
+ * allocation btrees
+ */
STATIC uint
-xfs_calc_addafork_reservation(xfs_mount_t *mp)
+xfs_calc_addafork_reservation(
+ struct xfs_mount *mp)
{
- return XFS_CALC_ADDAFORK_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp);
+ return XFS_DQUOT_LOGRES(mp) +
+ mp->m_sb.sb_inodesize +
+ mp->m_sb.sb_sectsize * 2 +
+ mp->m_dirblksize +
+ XFS_FSB_TO_B(mp, XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1) +
+ XFS_ALLOCFREE_LOG_RES(mp, 1) +
+ 128 * (4 + XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1 +
+ XFS_ALLOCFREE_LOG_COUNT(mp, 1));
}
+/*
+ * Removing the attribute fork of a file
+ * the inode being truncated: inode size
+ * the inode's bmap btree: max depth * block size
+ * And the bmap_finish transaction can free the blocks and bmap blocks:
+ * the agf for each of the ags: 4 * sector size
+ * the agfl for each of the ags: 4 * sector size
+ * the super block to reflect the freed blocks: sector size
+ * worst case split in allocation btrees per extent assuming 4 extents:
+ * 4 exts * 2 trees * (2 * max depth - 1) * block size
+ */
STATIC uint
-xfs_calc_attrinval_reservation(xfs_mount_t *mp)
+xfs_calc_attrinval_reservation(
+ struct xfs_mount *mp)
{
- return XFS_CALC_ATTRINVAL_LOG_RES(mp);
+ return MAX((mp->m_sb.sb_inodesize +
+ XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) +
+ 128 * (1 + XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))),
+ (4 * mp->m_sb.sb_sectsize +
+ 4 * mp->m_sb.sb_sectsize +
+ mp->m_sb.sb_sectsize +
+ XFS_ALLOCFREE_LOG_RES(mp, 4) +
+ 128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4))));
}
+/*
+ * Setting an attribute.
+ * the inode getting the attribute
+ * the superblock for allocations
+ * the agfs extents are allocated from
+ * the attribute btree * max depth
+ * the inode allocation btree
+ * Since attribute transaction space is dependent on the size of the attribute,
+ * the calculation is done partially at mount time and partially at runtime.
+ */
STATIC uint
-xfs_calc_attrset_reservation(xfs_mount_t *mp)
+xfs_calc_attrset_reservation(
+ struct xfs_mount *mp)
{
- return XFS_CALC_ATTRSET_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp);
+ return XFS_DQUOT_LOGRES(mp) +
+ mp->m_sb.sb_inodesize +
+ mp->m_sb.sb_sectsize +
+ XFS_FSB_TO_B(mp, XFS_DA_NODE_MAXDEPTH) +
+ 128 * (2 + XFS_DA_NODE_MAXDEPTH);
}
+/*
+ * Removing an attribute.
+ * the inode: inode size
+ * the attribute btree could join: max depth * block size
+ * the inode bmap btree could join or split: max depth * block size
+ * And the bmap_finish transaction can free the attr blocks freed giving:
+ * the agf for the ag in which the blocks live: 2 * sector size
+ * the agfl for the ag in which the blocks live: 2 * sector size
+ * the superblock for the free block count: sector size
+ * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
+ */
STATIC uint
-xfs_calc_attrrm_reservation(xfs_mount_t *mp)
+xfs_calc_attrrm_reservation(
+ struct xfs_mount *mp)
{
- return XFS_CALC_ATTRRM_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp);
+ return XFS_DQUOT_LOGRES(mp) +
+ MAX((mp->m_sb.sb_inodesize +
+ XFS_FSB_TO_B(mp, XFS_DA_NODE_MAXDEPTH) +
+ XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) +
+ 128 * (1 + XFS_DA_NODE_MAXDEPTH +
+ XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK))),
+ (2 * mp->m_sb.sb_sectsize +
+ 2 * mp->m_sb.sb_sectsize +
+ mp->m_sb.sb_sectsize +
+ XFS_ALLOCFREE_LOG_RES(mp, 2) +
+ 128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))));
}
+/*
+ * Clearing a bad agino number in an agi hash bucket.
+ */
STATIC uint
-xfs_calc_clear_agi_bucket_reservation(xfs_mount_t *mp)
+xfs_calc_clear_agi_bucket_reservation(
+ struct xfs_mount *mp)
{
- return XFS_CALC_CLEAR_AGI_BUCKET_LOG_RES(mp);
+ return mp->m_sb.sb_sectsize + 128;
}
/*
@@ -184,11 +537,10 @@ xfs_calc_clear_agi_bucket_reservation(xfs_mount_t *mp)
*/
void
xfs_trans_init(
- xfs_mount_t *mp)
+ struct xfs_mount *mp)
{
- xfs_trans_reservations_t *resp;
+ struct xfs_trans_reservations *resp = &mp->m_reservations;
- resp = &(mp->m_reservations);
resp->tr_write = xfs_calc_write_reservation(mp);
resp->tr_itruncate = xfs_calc_itruncate_reservation(mp);
resp->tr_rename = xfs_calc_rename_reservation(mp);
@@ -243,8 +595,7 @@ _xfs_trans_alloc(
tp->t_magic = XFS_TRANS_MAGIC;
tp->t_type = type;
tp->t_mountp = mp;
- tp->t_items_free = XFS_LIC_NUM_SLOTS;
- xfs_lic_init(&(tp->t_items));
+ INIT_LIST_HEAD(&tp->t_items);
INIT_LIST_HEAD(&tp->t_busy);
return tp;
}
@@ -289,8 +640,7 @@ xfs_trans_dup(
ntp->t_magic = XFS_TRANS_MAGIC;
ntp->t_type = tp->t_type;
ntp->t_mountp = tp->t_mountp;
- ntp->t_items_free = XFS_LIC_NUM_SLOTS;
- xfs_lic_init(&(ntp->t_items));
+ INIT_LIST_HEAD(&ntp->t_items);
INIT_LIST_HEAD(&ntp->t_busy);
ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
@@ -770,6 +1120,108 @@ xfs_trans_unreserve_and_mod_sb(
}
/*
+ * Add the given log item to the transaction's list of log items.
+ *
+ * The log item will now point to its new descriptor with its li_desc field.
+ */
+void
+xfs_trans_add_item(
+ struct xfs_trans *tp,
+ struct xfs_log_item *lip)
+{
+ struct xfs_log_item_desc *lidp;
+
+ ASSERT(lip->li_mountp == tp->t_mountp);
+ ASSERT(lip->li_ailp == tp->t_mountp->m_ail);
+
+ lidp = kmem_zone_zalloc(xfs_log_item_desc_zone, KM_SLEEP | KM_NOFS);
+
+ lidp->lid_item = lip;
+ lidp->lid_flags = 0;
+ lidp->lid_size = 0;
+ list_add_tail(&lidp->lid_trans, &tp->t_items);
+
+ lip->li_desc = lidp;
+}
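
[Editor's note: for readers without the kernel tree at hand, here is a self-contained userspace sketch of the new per-item descriptor scheme; the list primitives and names are hypothetical stand-ins for the kernel's list.h, not the patch's code.]

/* Illustrative userspace model of the new descriptor tracking: one
 * heap-allocated descriptor per item on a doubly linked list, instead
 * of bitmap-managed slots in chunks. Names here are hypothetical. */
#include <stdio.h>
#include <stdlib.h>

struct list_head { struct list_head *prev, *next; };

static void list_init(struct list_head *h) { h->prev = h->next = h; }
static void list_add_tail(struct list_head *n, struct list_head *h)
{
	n->prev = h->prev; n->next = h;
	h->prev->next = n; h->prev = n;
}
static void list_del(struct list_head *n)
{
	n->prev->next = n->next; n->next->prev = n->prev;
}

struct log_item { struct item_desc *desc; int type; };
struct item_desc { struct log_item *item; unsigned flags; struct list_head trans; };

static void trans_add_item(struct list_head *t_items, struct log_item *lip)
{
	struct item_desc *lidp = calloc(1, sizeof(*lidp));

	lidp->item = lip;
	list_add_tail(&lidp->trans, t_items);
	lip->desc = lidp;		/* item points back at its descriptor */
}

static void trans_del_item(struct log_item *lip)
{
	list_del(&lip->desc->trans);
	free(lip->desc);
	lip->desc = NULL;
}

int main(void)
{
	struct list_head items; struct log_item a = {0}, b = {0};

	list_init(&items);
	trans_add_item(&items, &a);
	trans_add_item(&items, &b);
	trans_del_item(&a);
	trans_del_item(&b);
	printf("list empty again: %d\n", items.next == &items);
	return 0;
}
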
+
+STATIC void
+xfs_trans_free_item_desc(
+ struct xfs_log_item_desc *lidp)
+{
+ list_del_init(&lidp->lid_trans);
+ kmem_zone_free(xfs_log_item_desc_zone, lidp);
+}
+
+/*
+ * Unlink and free the given descriptor.
+ */
+void
+xfs_trans_del_item(
+ struct xfs_log_item *lip)
+{
+ xfs_trans_free_item_desc(lip->li_desc);
+ lip->li_desc = NULL;
+}
+
+/*
+ * Unlock all of the items of a transaction and free all the descriptors
+ * of that transaction.
+ */
+STATIC void
+xfs_trans_free_items(
+ struct xfs_trans *tp,
+ xfs_lsn_t commit_lsn,
+ int flags)
+{
+ struct xfs_log_item_desc *lidp, *next;
+
+ list_for_each_entry_safe(lidp, next, &tp->t_items, lid_trans) {
+ struct xfs_log_item *lip = lidp->lid_item;
+
+ lip->li_desc = NULL;
+
+ if (commit_lsn != NULLCOMMITLSN)
+ IOP_COMMITTING(lip, commit_lsn);
+ if (flags & XFS_TRANS_ABORT)
+ lip->li_flags |= XFS_LI_ABORTED;
+ IOP_UNLOCK(lip);
+
+ xfs_trans_free_item_desc(lidp);
+ }
+}
+
+/*
+ * Unlock the items associated with a transaction.
+ *
+ * Items which were not logged should be freed. Those which were logged must
+ * still be tracked so they can be unpinned when the transaction commits.
+ */
+STATIC void
+xfs_trans_unlock_items(
+ struct xfs_trans *tp,
+ xfs_lsn_t commit_lsn)
+{
+ struct xfs_log_item_desc *lidp, *next;
+
+ list_for_each_entry_safe(lidp, next, &tp->t_items, lid_trans) {
+ struct xfs_log_item *lip = lidp->lid_item;
+
+ lip->li_desc = NULL;
+
+ if (commit_lsn != NULLCOMMITLSN)
+ IOP_COMMITTING(lip, commit_lsn);
+ IOP_UNLOCK(lip);
+
+ /*
+ * Free the descriptor if the item is not dirty
+ * within this transaction.
+ */
+ if (!(lidp->lid_flags & XFS_LID_DIRTY))
+ xfs_trans_free_item_desc(lidp);
+ }
+}
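
[Editor's note: the two walkers above differ only in when a descriptor dies: xfs_trans_free_items always frees it, while xfs_trans_unlock_items keeps dirty descriptors alive so their items can be unpinned at commit. A condensed sketch of that shared shape, continuing the hypothetical userspace model shown earlier:]

/* Sketch, same hypothetical model as above. The _safe-style manual
 * traversal grabs the next pointer before the body can free the
 * current node. */
#include <stddef.h>

#define DESC_DIRTY	0x1

#define list_entry(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

static void walk_descs(struct list_head *t_items, int keep_dirty)
{
	struct list_head *pos = t_items->next, *next;

	for (; pos != t_items; pos = next) {
		struct item_desc *lidp = list_entry(pos, struct item_desc, trans);

		next = pos->next;
		lidp->item->desc = NULL;
		if (keep_dirty && (lidp->flags & DESC_DIRTY))
			continue;	/* descriptor survives until commit */
		list_del(&lidp->trans);
		free(lidp);
	}
}
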
+
+/*
* Total up the number of log iovecs needed to commit this
* transaction. The transaction itself needs one for the
* transaction header. Ask each dirty item in turn how many
@@ -780,30 +1232,27 @@ xfs_trans_count_vecs(
struct xfs_trans *tp)
{
int nvecs;
- xfs_log_item_desc_t *lidp;
+ struct xfs_log_item_desc *lidp;
nvecs = 1;
- lidp = xfs_trans_first_item(tp);
- ASSERT(lidp != NULL);
/*
 * In the non-debug case we need to bail out if we didn't find a
 * log item here: return zero and let trans_commit deal with it.
 */
- if (lidp == NULL)
+ if (list_empty(&tp->t_items)) {
+ ASSERT(0);
return 0;
+ }
- while (lidp != NULL) {
+ list_for_each_entry(lidp, &tp->t_items, lid_trans) {
/*
* Skip items which aren't dirty in this transaction.
*/
- if (!(lidp->lid_flags & XFS_LID_DIRTY)) {
- lidp = xfs_trans_next_item(tp, lidp);
+ if (!(lidp->lid_flags & XFS_LID_DIRTY))
continue;
- }
lidp->lid_size = IOP_SIZE(lidp->lid_item);
nvecs += lidp->lid_size;
- lidp = xfs_trans_next_item(tp, lidp);
}
return nvecs;
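
[Editor's note: xfs_trans_count_vecs and xfs_trans_fill_vecs form a classic two-pass pattern: the first pass sizes the iovec array, the second populates it, skipping clean items both times. A stand-alone sketch of the counting pass, with stand-in types rather than the kernel's:]

/* Two-pass sketch with stand-in types: count how many vectors the
 * dirty items need, plus one for the transaction header, before
 * allocating and filling the array. */
#include <stddef.h>

struct fake_item { int dirty; size_t nvecs; };

static size_t count_vecs(const struct fake_item *items, size_t n)
{
	size_t i, total = 1;	/* slot 0 is the transaction header */

	for (i = 0; i < n; i++)
		if (items[i].dirty)
			total += items[i].nvecs;
	return total;
}
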
@@ -823,7 +1272,7 @@ xfs_trans_fill_vecs(
struct xfs_trans *tp,
struct xfs_log_iovec *log_vector)
{
- xfs_log_item_desc_t *lidp;
+ struct xfs_log_item_desc *lidp;
struct xfs_log_iovec *vecp;
uint nitems;
@@ -834,14 +1283,11 @@ xfs_trans_fill_vecs(
vecp = log_vector + 1;
nitems = 0;
- lidp = xfs_trans_first_item(tp);
- ASSERT(lidp);
- while (lidp) {
+ ASSERT(!list_empty(&tp->t_items));
+ list_for_each_entry(lidp, &tp->t_items, lid_trans) {
/* Skip items which aren't dirty in this transaction. */
- if (!(lidp->lid_flags & XFS_LID_DIRTY)) {
- lidp = xfs_trans_next_item(tp, lidp);
+ if (!(lidp->lid_flags & XFS_LID_DIRTY))
continue;
- }
/*
* The item may be marked dirty but not log anything. This can
@@ -852,7 +1298,6 @@ xfs_trans_fill_vecs(
IOP_FORMAT(lidp->lid_item, vecp);
vecp += lidp->lid_size;
IOP_PIN(lidp->lid_item);
- lidp = xfs_trans_next_item(tp, lidp);
}
/*
@@ -930,7 +1375,7 @@ xfs_trans_item_committed(
* log item flags, if anyone else stales the buffer we do not want to
* pay any attention to it.
*/
- IOP_UNPIN(lip);
+ IOP_UNPIN(lip, 0);
}
/*
@@ -947,24 +1392,15 @@ xfs_trans_committed(
struct xfs_trans *tp,
int abortflag)
{
- xfs_log_item_desc_t *lidp;
- xfs_log_item_chunk_t *licp;
- xfs_log_item_chunk_t *next_licp;
+ struct xfs_log_item_desc *lidp, *next;
/* Call the transaction's completion callback if there is one. */
if (tp->t_callback != NULL)
tp->t_callback(tp, tp->t_callarg);
- for (lidp = xfs_trans_first_item(tp);
- lidp != NULL;
- lidp = xfs_trans_next_item(tp, lidp)) {
+ list_for_each_entry_safe(lidp, next, &tp->t_items, lid_trans) {
xfs_trans_item_committed(lidp->lid_item, tp->t_lsn, abortflag);
- }
-
- /* free the item chunks, ignoring the embedded chunk */
- for (licp = tp->t_items.lic_next; licp != NULL; licp = next_licp) {
- next_licp = licp->lic_next;
- kmem_free(licp);
+ xfs_trans_free_item_desc(lidp);
}
xfs_trans_free(tp);
@@ -979,16 +1415,14 @@ xfs_trans_uncommit(
struct xfs_trans *tp,
uint flags)
{
- xfs_log_item_desc_t *lidp;
+ struct xfs_log_item_desc *lidp;
- for (lidp = xfs_trans_first_item(tp);
- lidp != NULL;
- lidp = xfs_trans_next_item(tp, lidp)) {
+ list_for_each_entry(lidp, &tp->t_items, lid_trans) {
/*
* Unpin all but those that aren't dirty.
*/
if (lidp->lid_flags & XFS_LID_DIRTY)
- IOP_UNPIN_REMOVE(lidp->lid_item, tp);
+ IOP_UNPIN(lidp->lid_item, 1);
}
xfs_trans_unreserve_and_mod_sb(tp);
@@ -1154,33 +1588,28 @@ STATIC struct xfs_log_vec *
xfs_trans_alloc_log_vecs(
xfs_trans_t *tp)
{
- xfs_log_item_desc_t *lidp;
+ struct xfs_log_item_desc *lidp;
struct xfs_log_vec *lv = NULL;
struct xfs_log_vec *ret_lv = NULL;
- lidp = xfs_trans_first_item(tp);
/* Bail out if we didn't find a log item. */
- if (!lidp) {
+ if (list_empty(&tp->t_items)) {
ASSERT(0);
return NULL;
}
- while (lidp != NULL) {
+ list_for_each_entry(lidp, &tp->t_items, lid_trans) {
struct xfs_log_vec *new_lv;
/* Skip items which aren't dirty in this transaction. */
- if (!(lidp->lid_flags & XFS_LID_DIRTY)) {
- lidp = xfs_trans_next_item(tp, lidp);
+ if (!(lidp->lid_flags & XFS_LID_DIRTY))
continue;
- }
/* Skip items that do not have any vectors for writing */
lidp->lid_size = IOP_SIZE(lidp->lid_item);
- if (!lidp->lid_size) {
- lidp = xfs_trans_next_item(tp, lidp);
+ if (!lidp->lid_size)
continue;
- }
new_lv = kmem_zalloc(sizeof(*new_lv) +
lidp->lid_size * sizeof(struct xfs_log_iovec),
@@ -1195,7 +1624,6 @@ xfs_trans_alloc_log_vecs(
else
lv->lv_next = new_lv;
lv = new_lv;
- lidp = xfs_trans_next_item(tp, lidp);
}
return ret_lv;
@@ -1354,12 +1782,6 @@ xfs_trans_cancel(
int flags)
{
int log_flags;
-#ifdef DEBUG
- xfs_log_item_chunk_t *licp;
- xfs_log_item_desc_t *lidp;
- xfs_log_item_t *lip;
- int i;
-#endif
xfs_mount_t *mp = tp->t_mountp;
/*
@@ -1378,21 +1800,11 @@ xfs_trans_cancel(
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
}
#ifdef DEBUG
- if (!(flags & XFS_TRANS_ABORT)) {
- licp = &(tp->t_items);
- while (licp != NULL) {
- lidp = licp->lic_descs;
- for (i = 0; i < licp->lic_unused; i++, lidp++) {
- if (xfs_lic_isfree(licp, i)) {
- continue;
- }
-
- lip = lidp->lid_item;
- if (!XFS_FORCED_SHUTDOWN(mp))
- ASSERT(!(lip->li_type == XFS_LI_EFD));
- }
- licp = licp->lic_next;
- }
+ if (!(flags & XFS_TRANS_ABORT) && !XFS_FORCED_SHUTDOWN(mp)) {
+ struct xfs_log_item_desc *lidp;
+
+ list_for_each_entry(lidp, &tp->t_items, lid_trans)
+ ASSERT(lidp->lid_item->li_type != XFS_LI_EFD);
}
#endif
xfs_trans_unreserve_and_mod_sb(tp);
@@ -1480,7 +1892,6 @@ xfs_trans_roll(
if (error)
return error;
- xfs_trans_ijoin(trans, dp, XFS_ILOCK_EXCL);
- xfs_trans_ihold(trans, dp);
+ xfs_trans_ijoin(trans, dp);
return 0;
}
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 8c69e7824f68..c13c0f97b494 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -161,105 +161,14 @@ typedef struct xfs_trans_header {
* the amount of space needed to log the item it describes
* once we get to commit processing (see xfs_trans_commit()).
*/
-typedef struct xfs_log_item_desc {
+struct xfs_log_item_desc {
struct xfs_log_item *lid_item;
- ushort lid_size;
- unsigned char lid_flags;
- unsigned char lid_index;
-} xfs_log_item_desc_t;
+ ushort lid_size;
+ unsigned char lid_flags;
+ struct list_head lid_trans;
+};
#define XFS_LID_DIRTY 0x1
-#define XFS_LID_PINNED 0x2
-
-/*
- * This structure is used to maintain a chunk list of log_item_desc
- * structures. The free field is a bitmask indicating which descriptors
- * in this chunk's array are free. The unused field is the first value
- * not used since this chunk was allocated.
- */
-#define XFS_LIC_NUM_SLOTS 15
-typedef struct xfs_log_item_chunk {
- struct xfs_log_item_chunk *lic_next;
- ushort lic_free;
- ushort lic_unused;
- xfs_log_item_desc_t lic_descs[XFS_LIC_NUM_SLOTS];
-} xfs_log_item_chunk_t;
-
-#define XFS_LIC_MAX_SLOT (XFS_LIC_NUM_SLOTS - 1)
-#define XFS_LIC_FREEMASK ((1 << XFS_LIC_NUM_SLOTS) - 1)
-
-
-/*
- * Initialize the given chunk. Set the chunk's free descriptor mask
- * to indicate that all descriptors are free. The caller gets to set
- * lic_unused to the right value (0 matches all free). The
- * lic_descs.lid_index values are set up as each desc is allocated.
- */
-static inline void xfs_lic_init(xfs_log_item_chunk_t *cp)
-{
- cp->lic_free = XFS_LIC_FREEMASK;
-}
-
-static inline void xfs_lic_init_slot(xfs_log_item_chunk_t *cp, int slot)
-{
- cp->lic_descs[slot].lid_index = (unsigned char)(slot);
-}
-
-static inline int xfs_lic_vacancy(xfs_log_item_chunk_t *cp)
-{
- return cp->lic_free & XFS_LIC_FREEMASK;
-}
-
-static inline void xfs_lic_all_free(xfs_log_item_chunk_t *cp)
-{
- cp->lic_free = XFS_LIC_FREEMASK;
-}
-
-static inline int xfs_lic_are_all_free(xfs_log_item_chunk_t *cp)
-{
- return ((cp->lic_free & XFS_LIC_FREEMASK) == XFS_LIC_FREEMASK);
-}
-
-static inline int xfs_lic_isfree(xfs_log_item_chunk_t *cp, int slot)
-{
- return (cp->lic_free & (1 << slot));
-}
-
-static inline void xfs_lic_claim(xfs_log_item_chunk_t *cp, int slot)
-{
- cp->lic_free &= ~(1 << slot);
-}
-
-static inline void xfs_lic_relse(xfs_log_item_chunk_t *cp, int slot)
-{
- cp->lic_free |= 1 << slot;
-}
-
-static inline xfs_log_item_desc_t *
-xfs_lic_slot(xfs_log_item_chunk_t *cp, int slot)
-{
- return &(cp->lic_descs[slot]);
-}
-
-static inline int xfs_lic_desc_to_slot(xfs_log_item_desc_t *dp)
-{
- return (uint)dp->lid_index;
-}
-
-/*
- * Calculate the address of a chunk given a descriptor pointer:
- * dp - dp->lid_index give the address of the start of the lic_descs array.
- * From this we subtract the offset of the lic_descs field in a chunk.
- * All of this yields the address of the chunk, which is
- * cast to a chunk pointer.
- */
-static inline xfs_log_item_chunk_t *
-xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
-{
- return (xfs_log_item_chunk_t*) \
- (((xfs_caddr_t)((dp) - (dp)->lid_index)) - \
- (xfs_caddr_t)(((xfs_log_item_chunk_t*)0)->lic_descs));
-}
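
[Editor's note: the deleted helper above is a hand-rolled container_of(): step back from the descriptor to slot zero of lic_descs, then subtract the field offset. Expressed with offsetof() directly, using hypothetical mirror types for illustration:]

#include <stddef.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct chunk {
	struct chunk *next;
	unsigned short free, unused;
	struct desc { int idx; } descs[15];
};

static struct chunk *desc_to_chunk(struct desc *dp)
{
	/* dp - dp->idx lands on descs[0]; container_of does the rest */
	return container_of(dp - dp->idx, struct chunk, descs);
}
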
#define XFS_TRANS_MAGIC 0x5452414E /* 'TRAN' */
/*
@@ -275,8 +184,6 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
/*
* Values for call flags parameter.
*/
-#define XFS_TRANS_NOSLEEP 0x1
-#define XFS_TRANS_WAIT 0x2
#define XFS_TRANS_RELEASE_LOG_RES 0x4
#define XFS_TRANS_ABORT 0x8
@@ -300,24 +207,6 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
/*
- * Various log reservation values.
- * These are based on the size of the file system block
- * because that is what most transactions manipulate.
- * Each adds in an additional 128 bytes per item logged to
- * try to account for the overhead of the transaction mechanism.
- *
- * Note:
- * Most of the reservations underestimate the number of allocation
- * groups into which they could free extents in the xfs_bmap_finish()
- * call. This is because the number in the worst case is quite high
- * and quite unusual. In order to fix this we need to change
- * xfs_bmap_finish() to free extents in only a single AG at a time.
- * This will require changes to the EFI code as well, however, so that
- * the EFI for the extents not freed is logged again in each transaction.
- * See bug 261917.
- */
-
-/*
* Per-extent log reservation for the allocation btree changes
* involved in freeing or allocating an extent.
* 2 trees * (2 blocks/level * max depth - 1) * block size
@@ -341,429 +230,36 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
(XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK) + \
XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1)
-/*
- * In a write transaction we can allocate a maximum of 2
- * extents. This gives:
- * the inode getting the new extents: inode size
- * the inode's bmap btree: max depth * block size
- * the agfs of the ags from which the extents are allocated: 2 * sector
- * the superblock free block counter: sector size
- * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
- * And the bmap_finish transaction can free bmap blocks in a join:
- * the agfs of the ags containing the blocks: 2 * sector size
- * the agfls of the ags containing the blocks: 2 * sector size
- * the super block free block counter: sector size
- * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
- */
-#define XFS_CALC_WRITE_LOG_RES(mp) \
- (MAX( \
- ((mp)->m_sb.sb_inodesize + \
- XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) + \
- (2 * (mp)->m_sb.sb_sectsize) + \
- (mp)->m_sb.sb_sectsize + \
- XFS_ALLOCFREE_LOG_RES(mp, 2) + \
- (128 * (4 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + XFS_ALLOCFREE_LOG_COUNT(mp, 2)))),\
- ((2 * (mp)->m_sb.sb_sectsize) + \
- (2 * (mp)->m_sb.sb_sectsize) + \
- (mp)->m_sb.sb_sectsize + \
- XFS_ALLOCFREE_LOG_RES(mp, 2) + \
- (128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))))))
#define XFS_WRITE_LOG_RES(mp) ((mp)->m_reservations.tr_write)
-
-/*
- * In truncating a file we free up to two extents at once. We can modify:
- * the inode being truncated: inode size
- * the inode's bmap btree: (max depth + 1) * block size
- * And the bmap_finish transaction can free the blocks and bmap blocks:
- * the agf for each of the ags: 4 * sector size
- * the agfl for each of the ags: 4 * sector size
- * the super block to reflect the freed blocks: sector size
- * worst case split in allocation btrees per extent assuming 4 extents:
- * 4 exts * 2 trees * (2 * max depth - 1) * block size
- * the inode btree: max depth * blocksize
- * the allocation btrees: 2 trees * (max depth - 1) * block size
- */
-#define XFS_CALC_ITRUNCATE_LOG_RES(mp) \
- (MAX( \
- ((mp)->m_sb.sb_inodesize + \
- XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1) + \
- (128 * (2 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)))), \
- ((4 * (mp)->m_sb.sb_sectsize) + \
- (4 * (mp)->m_sb.sb_sectsize) + \
- (mp)->m_sb.sb_sectsize + \
- XFS_ALLOCFREE_LOG_RES(mp, 4) + \
- (128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4))) + \
- (128 * 5) + \
- XFS_ALLOCFREE_LOG_RES(mp, 1) + \
- (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \
- XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
-
#define XFS_ITRUNCATE_LOG_RES(mp) ((mp)->m_reservations.tr_itruncate)
-
-/*
- * In renaming a files we can modify:
- * the four inodes involved: 4 * inode size
- * the two directory btrees: 2 * (max depth + v2) * dir block size
- * the two directory bmap btrees: 2 * max depth * block size
- * And the bmap_finish transaction can free dir and bmap blocks (two sets
- * of bmap blocks) giving:
- * the agf for the ags in which the blocks live: 3 * sector size
- * the agfl for the ags in which the blocks live: 3 * sector size
- * the superblock for the free block count: sector size
- * the allocation btrees: 3 exts * 2 trees * (2 * max depth - 1) * block size
- */
-#define XFS_CALC_RENAME_LOG_RES(mp) \
- (MAX( \
- ((4 * (mp)->m_sb.sb_inodesize) + \
- (2 * XFS_DIROP_LOG_RES(mp)) + \
- (128 * (4 + 2 * XFS_DIROP_LOG_COUNT(mp)))), \
- ((3 * (mp)->m_sb.sb_sectsize) + \
- (3 * (mp)->m_sb.sb_sectsize) + \
- (mp)->m_sb.sb_sectsize + \
- XFS_ALLOCFREE_LOG_RES(mp, 3) + \
- (128 * (7 + XFS_ALLOCFREE_LOG_COUNT(mp, 3))))))
-
#define XFS_RENAME_LOG_RES(mp) ((mp)->m_reservations.tr_rename)
-
-/*
- * For creating a link to an inode:
- * the parent directory inode: inode size
- * the linked inode: inode size
- * the directory btree could split: (max depth + v2) * dir block size
- * the directory bmap btree could join or split: (max depth + v2) * blocksize
- * And the bmap_finish transaction can free some bmap blocks giving:
- * the agf for the ag in which the blocks live: sector size
- * the agfl for the ag in which the blocks live: sector size
- * the superblock for the free block count: sector size
- * the allocation btrees: 2 trees * (2 * max depth - 1) * block size
- */
-#define XFS_CALC_LINK_LOG_RES(mp) \
- (MAX( \
- ((mp)->m_sb.sb_inodesize + \
- (mp)->m_sb.sb_inodesize + \
- XFS_DIROP_LOG_RES(mp) + \
- (128 * (2 + XFS_DIROP_LOG_COUNT(mp)))), \
- ((mp)->m_sb.sb_sectsize + \
- (mp)->m_sb.sb_sectsize + \
- (mp)->m_sb.sb_sectsize + \
- XFS_ALLOCFREE_LOG_RES(mp, 1) + \
- (128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
-
#define XFS_LINK_LOG_RES(mp) ((mp)->m_reservations.tr_link)
-
-/*
- * For removing a directory entry we can modify:
- * the parent directory inode: inode size
- * the removed inode: inode size
- * the directory btree could join: (max depth + v2) * dir block size
- * the directory bmap btree could join or split: (max depth + v2) * blocksize
- * And the bmap_finish transaction can free the dir and bmap blocks giving:
- * the agf for the ag in which the blocks live: 2 * sector size
- * the agfl for the ag in which the blocks live: 2 * sector size
- * the superblock for the free block count: sector size
- * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
- */
-#define XFS_CALC_REMOVE_LOG_RES(mp) \
- (MAX( \
- ((mp)->m_sb.sb_inodesize + \
- (mp)->m_sb.sb_inodesize + \
- XFS_DIROP_LOG_RES(mp) + \
- (128 * (2 + XFS_DIROP_LOG_COUNT(mp)))), \
- ((2 * (mp)->m_sb.sb_sectsize) + \
- (2 * (mp)->m_sb.sb_sectsize) + \
- (mp)->m_sb.sb_sectsize + \
- XFS_ALLOCFREE_LOG_RES(mp, 2) + \
- (128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))))))
-
#define XFS_REMOVE_LOG_RES(mp) ((mp)->m_reservations.tr_remove)
-
-/*
- * For symlink we can modify:
- * the parent directory inode: inode size
- * the new inode: inode size
- * the inode btree entry: 1 block
- * the directory btree: (max depth + v2) * dir block size
- * the directory inode's bmap btree: (max depth + v2) * block size
- * the blocks for the symlink: 1 kB
- * Or in the first xact we allocate some inodes giving:
- * the agi and agf of the ag getting the new inodes: 2 * sectorsize
- * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
- * the inode btree: max depth * blocksize
- * the allocation btrees: 2 trees * (2 * max depth - 1) * block size
- */
-#define XFS_CALC_SYMLINK_LOG_RES(mp) \
- (MAX( \
- ((mp)->m_sb.sb_inodesize + \
- (mp)->m_sb.sb_inodesize + \
- XFS_FSB_TO_B(mp, 1) + \
- XFS_DIROP_LOG_RES(mp) + \
- 1024 + \
- (128 * (4 + XFS_DIROP_LOG_COUNT(mp)))), \
- (2 * (mp)->m_sb.sb_sectsize + \
- XFS_FSB_TO_B((mp), XFS_IALLOC_BLOCKS((mp))) + \
- XFS_FSB_TO_B((mp), (mp)->m_in_maxlevels) + \
- XFS_ALLOCFREE_LOG_RES(mp, 1) + \
- (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \
- XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
-
#define XFS_SYMLINK_LOG_RES(mp) ((mp)->m_reservations.tr_symlink)
-
-/*
- * For create we can modify:
- * the parent directory inode: inode size
- * the new inode: inode size
- * the inode btree entry: block size
- * the superblock for the nlink flag: sector size
- * the directory btree: (max depth + v2) * dir block size
- * the directory inode's bmap btree: (max depth + v2) * block size
- * Or in the first xact we allocate some inodes giving:
- * the agi and agf of the ag getting the new inodes: 2 * sectorsize
- * the superblock for the nlink flag: sector size
- * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
- * the inode btree: max depth * blocksize
- * the allocation btrees: 2 trees * (max depth - 1) * block size
- */
-#define XFS_CALC_CREATE_LOG_RES(mp) \
- (MAX( \
- ((mp)->m_sb.sb_inodesize + \
- (mp)->m_sb.sb_inodesize + \
- (mp)->m_sb.sb_sectsize + \
- XFS_FSB_TO_B(mp, 1) + \
- XFS_DIROP_LOG_RES(mp) + \
- (128 * (3 + XFS_DIROP_LOG_COUNT(mp)))), \
- (3 * (mp)->m_sb.sb_sectsize + \
- XFS_FSB_TO_B((mp), XFS_IALLOC_BLOCKS((mp))) + \
- XFS_FSB_TO_B((mp), (mp)->m_in_maxlevels) + \
- XFS_ALLOCFREE_LOG_RES(mp, 1) + \
- (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \
- XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
-
#define XFS_CREATE_LOG_RES(mp) ((mp)->m_reservations.tr_create)
-
-/*
- * Making a new directory is the same as creating a new file.
- */
-#define XFS_CALC_MKDIR_LOG_RES(mp) XFS_CALC_CREATE_LOG_RES(mp)
-
#define XFS_MKDIR_LOG_RES(mp) ((mp)->m_reservations.tr_mkdir)
-
-/*
- * In freeing an inode we can modify:
- * the inode being freed: inode size
- * the super block free inode counter: sector size
- * the agi hash list and counters: sector size
- * the inode btree entry: block size
- * the on disk inode before ours in the agi hash list: inode cluster size
- * the inode btree: max depth * blocksize
- * the allocation btrees: 2 trees * (max depth - 1) * block size
- */
-#define XFS_CALC_IFREE_LOG_RES(mp) \
- ((mp)->m_sb.sb_inodesize + \
- (mp)->m_sb.sb_sectsize + \
- (mp)->m_sb.sb_sectsize + \
- XFS_FSB_TO_B((mp), 1) + \
- MAX((__uint16_t)XFS_FSB_TO_B((mp), 1), XFS_INODE_CLUSTER_SIZE(mp)) + \
- (128 * 5) + \
- XFS_ALLOCFREE_LOG_RES(mp, 1) + \
- (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \
- XFS_ALLOCFREE_LOG_COUNT(mp, 1))))
-
-
#define XFS_IFREE_LOG_RES(mp) ((mp)->m_reservations.tr_ifree)
-
-/*
- * When only changing the inode we log the inode and possibly the superblock
- * We also add a bit of slop for the transaction stuff.
- */
-#define XFS_CALC_ICHANGE_LOG_RES(mp) ((mp)->m_sb.sb_inodesize + \
- (mp)->m_sb.sb_sectsize + 512)
-
#define XFS_ICHANGE_LOG_RES(mp) ((mp)->m_reservations.tr_ichange)
-
-/*
- * Growing the data section of the filesystem.
- * superblock
- * agi and agf
- * allocation btrees
- */
-#define XFS_CALC_GROWDATA_LOG_RES(mp) \
- ((mp)->m_sb.sb_sectsize * 3 + \
- XFS_ALLOCFREE_LOG_RES(mp, 1) + \
- (128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1))))
-
#define XFS_GROWDATA_LOG_RES(mp) ((mp)->m_reservations.tr_growdata)
-
-/*
- * Growing the rt section of the filesystem.
- * In the first set of transactions (ALLOC) we allocate space to the
- * bitmap or summary files.
- * superblock: sector size
- * agf of the ag from which the extent is allocated: sector size
- * bmap btree for bitmap/summary inode: max depth * blocksize
- * bitmap/summary inode: inode size
- * allocation btrees for 1 block alloc: 2 * (2 * maxdepth - 1) * blocksize
- */
-#define XFS_CALC_GROWRTALLOC_LOG_RES(mp) \
- (2 * (mp)->m_sb.sb_sectsize + \
- XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) + \
- (mp)->m_sb.sb_inodesize + \
- XFS_ALLOCFREE_LOG_RES(mp, 1) + \
- (128 * \
- (3 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + \
- XFS_ALLOCFREE_LOG_COUNT(mp, 1))))
-
#define XFS_GROWRTALLOC_LOG_RES(mp) ((mp)->m_reservations.tr_growrtalloc)
-
-/*
- * Growing the rt section of the filesystem.
- * In the second set of transactions (ZERO) we zero the new metadata blocks.
- * one bitmap/summary block: blocksize
- */
-#define XFS_CALC_GROWRTZERO_LOG_RES(mp) \
- ((mp)->m_sb.sb_blocksize + 128)
-
#define XFS_GROWRTZERO_LOG_RES(mp) ((mp)->m_reservations.tr_growrtzero)
-
-/*
- * Growing the rt section of the filesystem.
- * In the third set of transactions (FREE) we update metadata without
- * allocating any new blocks.
- * superblock: sector size
- * bitmap inode: inode size
- * summary inode: inode size
- * one bitmap block: blocksize
- * summary blocks: new summary size
- */
-#define XFS_CALC_GROWRTFREE_LOG_RES(mp) \
- ((mp)->m_sb.sb_sectsize + \
- 2 * (mp)->m_sb.sb_inodesize + \
- (mp)->m_sb.sb_blocksize + \
- (mp)->m_rsumsize + \
- (128 * 5))
-
#define XFS_GROWRTFREE_LOG_RES(mp) ((mp)->m_reservations.tr_growrtfree)
-
-/*
- * Logging the inode modification timestamp on a synchronous write.
- * inode
- */
-#define XFS_CALC_SWRITE_LOG_RES(mp) \
- ((mp)->m_sb.sb_inodesize + 128)
-
#define XFS_SWRITE_LOG_RES(mp) ((mp)->m_reservations.tr_swrite)
-
/*
* Logging the inode timestamps on an fsync -- same as SWRITE
* as long as SWRITE logs the entire inode core
*/
#define XFS_FSYNC_TS_LOG_RES(mp) ((mp)->m_reservations.tr_swrite)
-
-/*
- * Logging the inode mode bits when writing a setuid/setgid file
- * inode
- */
-#define XFS_CALC_WRITEID_LOG_RES(mp) \
- ((mp)->m_sb.sb_inodesize + 128)
-
#define XFS_WRITEID_LOG_RES(mp) ((mp)->m_reservations.tr_swrite)
-
-/*
- * Converting the inode from non-attributed to attributed.
- * the inode being converted: inode size
- * agf block and superblock (for block allocation)
- * the new block (directory sized)
- * bmap blocks for the new directory block
- * allocation btrees
- */
-#define XFS_CALC_ADDAFORK_LOG_RES(mp) \
- ((mp)->m_sb.sb_inodesize + \
- (mp)->m_sb.sb_sectsize * 2 + \
- (mp)->m_dirblksize + \
- XFS_FSB_TO_B(mp, (XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1)) + \
- XFS_ALLOCFREE_LOG_RES(mp, 1) + \
- (128 * (4 + (XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1) + \
- XFS_ALLOCFREE_LOG_COUNT(mp, 1))))
-
#define XFS_ADDAFORK_LOG_RES(mp) ((mp)->m_reservations.tr_addafork)
-
-/*
- * Removing the attribute fork of a file
- * the inode being truncated: inode size
- * the inode's bmap btree: max depth * block size
- * And the bmap_finish transaction can free the blocks and bmap blocks:
- * the agf for each of the ags: 4 * sector size
- * the agfl for each of the ags: 4 * sector size
- * the super block to reflect the freed blocks: sector size
- * worst case split in allocation btrees per extent assuming 4 extents:
- * 4 exts * 2 trees * (2 * max depth - 1) * block size
- */
-#define XFS_CALC_ATTRINVAL_LOG_RES(mp) \
- (MAX( \
- ((mp)->m_sb.sb_inodesize + \
- XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + \
- (128 * (1 + XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)))), \
- ((4 * (mp)->m_sb.sb_sectsize) + \
- (4 * (mp)->m_sb.sb_sectsize) + \
- (mp)->m_sb.sb_sectsize + \
- XFS_ALLOCFREE_LOG_RES(mp, 4) + \
- (128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4))))))
-
#define XFS_ATTRINVAL_LOG_RES(mp) ((mp)->m_reservations.tr_attrinval)
-
-/*
- * Setting an attribute.
- * the inode getting the attribute
- * the superblock for allocations
- * the agfs extents are allocated from
- * the attribute btree * max depth
- * the inode allocation btree
- * Since attribute transaction space is dependent on the size of the attribute,
- * the calculation is done partially at mount time and partially at runtime.
- */
-#define XFS_CALC_ATTRSET_LOG_RES(mp) \
- ((mp)->m_sb.sb_inodesize + \
- (mp)->m_sb.sb_sectsize + \
- XFS_FSB_TO_B((mp), XFS_DA_NODE_MAXDEPTH) + \
- (128 * (2 + XFS_DA_NODE_MAXDEPTH)))
-
#define XFS_ATTRSET_LOG_RES(mp, ext) \
((mp)->m_reservations.tr_attrset + \
(ext * (mp)->m_sb.sb_sectsize) + \
(ext * XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))) + \
(128 * (ext + (ext * XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)))))
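
[Editor's note: attrset is the one reservation with a per-call runtime component: tr_attrset holds the mount-time base and the macro adds per-extent terms on top. A hedged sketch of that split, with stand-in fields; real geometry would come from the superblock:]

/* Sketch of the mount-time/runtime split for the attrset reservation. */
struct res {
	unsigned tr_attrset;	/* base, computed once at mount */
};

static unsigned attrset_res(const struct res *r, unsigned ext,
			    unsigned sectsize, unsigned bmap_bytes,
			    unsigned bm_maxlevels)
{
	return r->tr_attrset +
	       ext * sectsize +
	       ext * bmap_bytes +			/* bmap btree blocks */
	       128 * (ext + ext * bm_maxlevels);	/* per-item overhead */
}
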
-
-/*
- * Removing an attribute.
- * the inode: inode size
- * the attribute btree could join: max depth * block size
- * the inode bmap btree could join or split: max depth * block size
- * And the bmap_finish transaction can free the attr blocks freed giving:
- * the agf for the ag in which the blocks live: 2 * sector size
- * the agfl for the ag in which the blocks live: 2 * sector size
- * the superblock for the free block count: sector size
- * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
- */
-#define XFS_CALC_ATTRRM_LOG_RES(mp) \
- (MAX( \
- ((mp)->m_sb.sb_inodesize + \
- XFS_FSB_TO_B((mp), XFS_DA_NODE_MAXDEPTH) + \
- XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + \
- (128 * (1 + XFS_DA_NODE_MAXDEPTH + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)))), \
- ((2 * (mp)->m_sb.sb_sectsize) + \
- (2 * (mp)->m_sb.sb_sectsize) + \
- (mp)->m_sb.sb_sectsize + \
- XFS_ALLOCFREE_LOG_RES(mp, 2) + \
- (128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))))))
-
#define XFS_ATTRRM_LOG_RES(mp) ((mp)->m_reservations.tr_attrrm)
-
-/*
- * Clearing a bad agino number in an agi hash bucket.
- */
-#define XFS_CALC_CLEAR_AGI_BUCKET_LOG_RES(mp) \
- ((mp)->m_sb.sb_sectsize + 128)
-
#define XFS_CLEAR_AGI_BUCKET_LOG_RES(mp) ((mp)->m_reservations.tr_clearagi)
@@ -849,8 +345,7 @@ typedef struct xfs_item_ops {
uint (*iop_size)(xfs_log_item_t *);
void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *);
void (*iop_pin)(xfs_log_item_t *);
- void (*iop_unpin)(xfs_log_item_t *);
- void (*iop_unpin_remove)(xfs_log_item_t *, struct xfs_trans *);
+ void (*iop_unpin)(xfs_log_item_t *, int remove);
uint (*iop_trylock)(xfs_log_item_t *);
void (*iop_unlock)(xfs_log_item_t *);
xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t);
@@ -862,8 +357,7 @@ typedef struct xfs_item_ops {
#define IOP_SIZE(ip) (*(ip)->li_ops->iop_size)(ip)
#define IOP_FORMAT(ip,vp) (*(ip)->li_ops->iop_format)(ip, vp)
#define IOP_PIN(ip) (*(ip)->li_ops->iop_pin)(ip)
-#define IOP_UNPIN(ip) (*(ip)->li_ops->iop_unpin)(ip)
-#define IOP_UNPIN_REMOVE(ip,tp) (*(ip)->li_ops->iop_unpin_remove)(ip, tp)
+#define IOP_UNPIN(ip, remove) (*(ip)->li_ops->iop_unpin)(ip, remove)
#define IOP_TRYLOCK(ip) (*(ip)->li_ops->iop_trylock)(ip)
#define IOP_UNLOCK(ip) (*(ip)->li_ops->iop_unlock)(ip)
#define IOP_COMMITTED(ip, lsn) (*(ip)->li_ops->iop_committed)(ip, lsn)
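
[Editor's note: folding iop_unpin_remove() into iop_unpin(lip, remove) means every log item type now takes the flag through one entry point. A sketch of how an implementation might adapt; the item type and body here are hypothetical, not from the patch:]

/* Sketch: one unpin op with a 'remove' flag replacing the two ops. */
struct log_item;

struct item_ops {
	void (*iop_unpin)(struct log_item *lip, int remove);
};

static void example_item_unpin(struct log_item *lip, int remove)
{
	(void)lip;
	/* drop the pin count as before ... */
	if (remove) {
		/* ... and, on transaction abort/uncommit, also tear the
		 * item out of whatever commit tracking it is on. */
	}
}
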
@@ -927,8 +421,7 @@ typedef struct xfs_trans {
int64_t t_rblocks_delta;/* superblock rblocks change */
int64_t t_rextents_delta;/* superblocks rextents chg */
int64_t t_rextslog_delta;/* superblocks rextslog chg */
- unsigned int t_items_free; /* log item descs free */
- xfs_log_item_chunk_t t_items; /* first log item desc chunk */
+ struct list_head t_items; /* log item descriptors */
xfs_trans_header_t t_header; /* header for in-log trans */
struct list_head t_busy; /* list of busy extents */
unsigned long t_pflags; /* saved process flags state */
@@ -980,8 +473,8 @@ void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint);
void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *);
int xfs_trans_iget(struct xfs_mount *, xfs_trans_t *,
xfs_ino_t , uint, uint, struct xfs_inode **);
-void xfs_trans_ijoin(xfs_trans_t *, struct xfs_inode *, uint);
-void xfs_trans_ihold(xfs_trans_t *, struct xfs_inode *);
+void xfs_trans_ijoin_ref(struct xfs_trans *, struct xfs_inode *, uint);
+void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *);
void xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint);
void xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint);
struct xfs_efi_log_item *xfs_trans_get_efi(xfs_trans_t *, uint);
@@ -1006,6 +499,7 @@ int xfs_trans_ail_init(struct xfs_mount *);
void xfs_trans_ail_destroy(struct xfs_mount *);
extern kmem_zone_t *xfs_trans_zone;
+extern kmem_zone_t *xfs_log_item_desc_zone;
#endif /* __KERNEL__ */
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index e799824f7245..dc9069568ff7 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -24,7 +24,6 @@
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_dmapi.h"
#include "xfs_mount.h"
#include "xfs_trans_priv.h"
#include "xfs_error.h"
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 63d81a22f4fd..90af025e6839 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -24,14 +24,10 @@
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
#include "xfs_mount.h"
#include "xfs_bmap_btree.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_buf_item.h"
@@ -51,36 +47,17 @@ xfs_trans_buf_item_match(
xfs_daddr_t blkno,
int len)
{
- xfs_log_item_chunk_t *licp;
- xfs_log_item_desc_t *lidp;
- xfs_buf_log_item_t *blip;
- int i;
+ struct xfs_log_item_desc *lidp;
+ struct xfs_buf_log_item *blip;
len = BBTOB(len);
- for (licp = &tp->t_items; licp != NULL; licp = licp->lic_next) {
- if (xfs_lic_are_all_free(licp)) {
- ASSERT(licp == &tp->t_items);
- ASSERT(licp->lic_next == NULL);
- return NULL;
- }
-
- for (i = 0; i < licp->lic_unused; i++) {
- /*
- * Skip unoccupied slots.
- */
- if (xfs_lic_isfree(licp, i))
- continue;
-
- lidp = xfs_lic_slot(licp, i);
- blip = (xfs_buf_log_item_t *)lidp->lid_item;
- if (blip->bli_item.li_type != XFS_LI_BUF)
- continue;
-
- if (XFS_BUF_TARGET(blip->bli_buf) == target &&
- XFS_BUF_ADDR(blip->bli_buf) == blkno &&
- XFS_BUF_COUNT(blip->bli_buf) == len)
- return blip->bli_buf;
- }
+ list_for_each_entry(lidp, &tp->t_items, lid_trans) {
+ blip = (struct xfs_buf_log_item *)lidp->lid_item;
+ if (blip->bli_item.li_type == XFS_LI_BUF &&
+ XFS_BUF_TARGET(blip->bli_buf) == target &&
+ XFS_BUF_ADDR(blip->bli_buf) == blkno &&
+ XFS_BUF_COUNT(blip->bli_buf) == len)
+ return blip->bli_buf;
}
return NULL;
@@ -127,7 +104,7 @@ _xfs_trans_bjoin(
/*
* Get a log_item_desc to point at the new item.
*/
- (void) xfs_trans_add_item(tp, (xfs_log_item_t *)bip);
+ xfs_trans_add_item(tp, &bip->bli_item);
/*
* Initialize b_fsprivate2 so we can find it with incore_match()
@@ -483,7 +460,6 @@ xfs_trans_brelse(xfs_trans_t *tp,
{
xfs_buf_log_item_t *bip;
xfs_log_item_t *lip;
- xfs_log_item_desc_t *lidp;
/*
* Default to a normal brelse() call if the tp is NULL.
@@ -514,13 +490,6 @@ xfs_trans_brelse(xfs_trans_t *tp,
ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL));
ASSERT(atomic_read(&bip->bli_refcount) > 0);
- /*
- * Find the item descriptor pointing to this buffer's
- * log item. It must be there.
- */
- lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)bip);
- ASSERT(lidp != NULL);
-
trace_xfs_trans_brelse(bip);
/*
@@ -536,7 +505,7 @@ xfs_trans_brelse(xfs_trans_t *tp,
* If the buffer is dirty within this transaction, we can't
* release it until we commit.
*/
- if (lidp->lid_flags & XFS_LID_DIRTY)
+ if (bip->bli_item.li_desc->lid_flags & XFS_LID_DIRTY)
return;
/*
@@ -553,7 +522,7 @@ xfs_trans_brelse(xfs_trans_t *tp,
/*
* Free up the log item descriptor tracking the released item.
*/
- xfs_trans_free_item(tp, lidp);
+ xfs_trans_del_item(&bip->bli_item);
/*
* Clear the hold flag in the buf log item if it is set.
@@ -665,7 +634,6 @@ xfs_trans_log_buf(xfs_trans_t *tp,
uint last)
{
xfs_buf_log_item_t *bip;
- xfs_log_item_desc_t *lidp;
ASSERT(XFS_BUF_ISBUSY(bp));
ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
@@ -690,7 +658,7 @@ xfs_trans_log_buf(xfs_trans_t *tp,
bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
ASSERT(atomic_read(&bip->bli_refcount) > 0);
XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks);
- bip->bli_item.li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*))xfs_buf_iodone;
+ bip->bli_item.li_cb = xfs_buf_iodone;
trace_xfs_trans_log_buf(bip);
@@ -707,11 +675,8 @@ xfs_trans_log_buf(xfs_trans_t *tp,
bip->bli_format.blf_flags &= ~XFS_BLF_CANCEL;
}
- lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)bip);
- ASSERT(lidp != NULL);
-
tp->t_flags |= XFS_TRANS_DIRTY;
- lidp->lid_flags |= XFS_LID_DIRTY;
+ bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY;
bip->bli_flags |= XFS_BLI_LOGGED;
xfs_buf_item_log(bip, first, last);
}
@@ -740,7 +705,6 @@ xfs_trans_binval(
xfs_trans_t *tp,
xfs_buf_t *bp)
{
- xfs_log_item_desc_t *lidp;
xfs_buf_log_item_t *bip;
ASSERT(XFS_BUF_ISBUSY(bp));
@@ -748,8 +712,6 @@ xfs_trans_binval(
ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
- lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)bip);
- ASSERT(lidp != NULL);
ASSERT(atomic_read(&bip->bli_refcount) > 0);
trace_xfs_trans_binval(bip);
@@ -764,7 +726,7 @@ xfs_trans_binval(
ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY)));
ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_INODE_BUF));
ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
- ASSERT(lidp->lid_flags & XFS_LID_DIRTY);
+ ASSERT(bip->bli_item.li_desc->lid_flags & XFS_LID_DIRTY);
ASSERT(tp->t_flags & XFS_TRANS_DIRTY);
return;
}
@@ -797,7 +759,7 @@ xfs_trans_binval(
bip->bli_format.blf_flags |= XFS_BLF_CANCEL;
memset((char *)(bip->bli_format.blf_data_map), 0,
(bip->bli_format.blf_map_size * sizeof(uint)));
- lidp->lid_flags |= XFS_LID_DIRTY;
+ bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY;
tp->t_flags |= XFS_TRANS_DIRTY;
}
@@ -853,12 +815,9 @@ xfs_trans_stale_inode_buf(
ASSERT(atomic_read(&bip->bli_refcount) > 0);
bip->bli_flags |= XFS_BLI_STALE_INODE;
- bip->bli_item.li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*))
- xfs_buf_iodone;
+ bip->bli_item.li_cb = xfs_buf_iodone;
}
-
-
/*
* Mark the buffer as being one which contains newly allocated
* inodes. We need to make sure that even if this buffer is
diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c
index 27cce2a9c7e9..f783d5e9fa70 100644
--- a/fs/xfs/xfs_trans_extfree.c
+++ b/fs/xfs/xfs_trans_extfree.c
@@ -23,7 +23,6 @@
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_dmapi.h"
#include "xfs_mount.h"
#include "xfs_trans_priv.h"
#include "xfs_extfree_item.h"
@@ -49,9 +48,8 @@ xfs_trans_get_efi(xfs_trans_t *tp,
/*
* Get a log_item_desc to point at the new item.
*/
- (void) xfs_trans_add_item(tp, (xfs_log_item_t*)efip);
-
- return (efip);
+ xfs_trans_add_item(tp, &efip->efi_item);
+ return efip;
}
/*
@@ -65,15 +63,11 @@ xfs_trans_log_efi_extent(xfs_trans_t *tp,
xfs_fsblock_t start_block,
xfs_extlen_t ext_len)
{
- xfs_log_item_desc_t *lidp;
uint next_extent;
xfs_extent_t *extp;
- lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)efip);
- ASSERT(lidp != NULL);
-
tp->t_flags |= XFS_TRANS_DIRTY;
- lidp->lid_flags |= XFS_LID_DIRTY;
+ efip->efi_item.li_desc->lid_flags |= XFS_LID_DIRTY;
next_extent = efip->efi_next_extent;
ASSERT(next_extent < efip->efi_format.efi_nextents);
@@ -106,9 +100,8 @@ xfs_trans_get_efd(xfs_trans_t *tp,
/*
* Get a log_item_desc to point at the new item.
*/
- (void) xfs_trans_add_item(tp, (xfs_log_item_t*)efdp);
-
- return (efdp);
+ xfs_trans_add_item(tp, &efdp->efd_item);
+ return efdp;
}
/*
@@ -122,15 +115,11 @@ xfs_trans_log_efd_extent(xfs_trans_t *tp,
xfs_fsblock_t start_block,
xfs_extlen_t ext_len)
{
- xfs_log_item_desc_t *lidp;
uint next_extent;
xfs_extent_t *extp;
- lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)efdp);
- ASSERT(lidp != NULL);
-
tp->t_flags |= XFS_TRANS_DIRTY;
- lidp->lid_flags |= XFS_LID_DIRTY;
+ efdp->efd_item.li_desc->lid_flags |= XFS_LID_DIRTY;
next_extent = efdp->efd_next_extent;
ASSERT(next_extent < efdp->efd_format.efd_nextents);
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index 785ff101da0a..cdc53a1050c5 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -24,20 +24,16 @@
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
#include "xfs_mount.h"
#include "xfs_bmap_btree.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
-#include "xfs_ialloc.h"
#include "xfs_trans_priv.h"
#include "xfs_inode_item.h"
+#include "xfs_trace.h"
#ifdef XFS_TRANS_DEBUG
STATIC void
@@ -47,7 +43,6 @@ xfs_trans_inode_broot_debug(
#define xfs_trans_inode_broot_debug(ip)
#endif
-
/*
* Get an inode and join it to the transaction.
*/
@@ -62,78 +57,66 @@ xfs_trans_iget(
{
int error;
- error = xfs_iget(mp, tp, ino, flags, lock_flags, ipp, 0);
- if (!error && tp)
- xfs_trans_ijoin(tp, *ipp, lock_flags);
+ error = xfs_iget(mp, tp, ino, flags, lock_flags, ipp);
+ if (!error && tp) {
+ xfs_trans_ijoin(tp, *ipp);
+ (*ipp)->i_itemp->ili_lock_flags = lock_flags;
+ }
return error;
}
/*
- * Add the locked inode to the transaction.
- * The inode must be locked, and it cannot be associated with any
- * transaction. The caller must specify the locks already held
- * on the inode.
+ * Add a locked inode to the transaction.
+ *
+ * The inode must be locked, and it cannot be associated with any transaction.
*/
void
xfs_trans_ijoin(
- xfs_trans_t *tp,
- xfs_inode_t *ip,
- uint lock_flags)
+ struct xfs_trans *tp,
+ struct xfs_inode *ip)
{
xfs_inode_log_item_t *iip;
ASSERT(ip->i_transp == NULL);
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- ASSERT(lock_flags & XFS_ILOCK_EXCL);
if (ip->i_itemp == NULL)
xfs_inode_item_init(ip, ip->i_mount);
iip = ip->i_itemp;
- ASSERT(iip->ili_flags == 0);
+ ASSERT(iip->ili_lock_flags == 0);
/*
* Get a log_item_desc to point at the new item.
*/
- (void) xfs_trans_add_item(tp, (xfs_log_item_t*)(iip));
+ xfs_trans_add_item(tp, &iip->ili_item);
xfs_trans_inode_broot_debug(ip);
/*
- * If the IO lock is already held, mark that in the inode log item.
- */
- if (lock_flags & XFS_IOLOCK_EXCL) {
- iip->ili_flags |= XFS_ILI_IOLOCKED_EXCL;
- } else if (lock_flags & XFS_IOLOCK_SHARED) {
- iip->ili_flags |= XFS_ILI_IOLOCKED_SHARED;
- }
-
- /*
* Initialize i_transp so we can find it with xfs_inode_incore()
* in xfs_trans_iget() above.
*/
ip->i_transp = tp;
}
-
-
/*
- * Mark the inode as not needing to be unlocked when the inode item's
- * IOP_UNLOCK() routine is called. The inode must already be locked
- * and associated with the given transaction.
+ * Add a locked inode to the transaction.
+ *
+ * Grabs a reference to the inode which will be dropped when the transaction
+ * is committed. The inode will also be unlocked at that point. The inode
+ * must be locked, and it cannot be associated with any transaction.
*/
-/*ARGSUSED*/
void
-xfs_trans_ihold(
- xfs_trans_t *tp,
- xfs_inode_t *ip)
+xfs_trans_ijoin_ref(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ uint lock_flags)
{
- ASSERT(ip->i_transp == tp);
- ASSERT(ip->i_itemp != NULL);
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-
- ip->i_itemp->ili_flags |= XFS_ILI_HOLD;
+ xfs_trans_ijoin(tp, ip);
+ IHOLD(ip);
+ ip->i_itemp->ili_lock_flags = lock_flags;
}
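
[Editor's note: the split gives callers two behaviours: plain xfs_trans_ijoin leaves the lock and reference with the caller, while xfs_trans_ijoin_ref hands both to the transaction. A hedged usage sketch; the caller functions are hypothetical:]

/* Usage sketch (hypothetical callers): the variants differ in who owns
 * the inode lock and reference once the transaction commits. */

static void borrow_inode(struct xfs_trans *tp, struct xfs_inode *ip)
{
	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, ip);	/* commit leaves ip locked */
	/* ... xfs_trans_commit(tp, 0); then the caller xfs_iunlock()s */
}

static void hand_off_inode(struct xfs_trans *tp, struct xfs_inode *ip)
{
	xfs_ilock(ip, XFS_ILOCK_EXCL);
	/* commit drops the ILOCK recorded here and the IHOLD reference */
	xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
	/* ... xfs_trans_commit(tp, 0); nothing left for the caller to undo */
}
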
-
/*
* This is called to mark the fields indicated in fieldmask as needing
* to be logged when the transaction is committed. The inode must
@@ -149,17 +132,12 @@ xfs_trans_log_inode(
xfs_inode_t *ip,
uint flags)
{
- xfs_log_item_desc_t *lidp;
-
ASSERT(ip->i_transp == tp);
ASSERT(ip->i_itemp != NULL);
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)(ip->i_itemp));
- ASSERT(lidp != NULL);
-
tp->t_flags |= XFS_TRANS_DIRTY;
- lidp->lid_flags |= XFS_LID_DIRTY;
+ ip->i_itemp->ili_item.li_desc->lid_flags |= XFS_LID_DIRTY;
/*
* Always OR in the bits from the ili_last_fields field.
diff --git a/fs/xfs/xfs_trans_item.c b/fs/xfs/xfs_trans_item.c
deleted file mode 100644
index f11d37d06dcc..000000000000
--- a/fs/xfs/xfs_trans_item.c
+++ /dev/null
@@ -1,441 +0,0 @@
-/*
- * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_trans_priv.h"
-/* XXX: from here down needed until struct xfs_trans has its own ailp */
-#include "xfs_bit.h"
-#include "xfs_buf_item.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
-#include "xfs_mount.h"
-
-STATIC int xfs_trans_unlock_chunk(xfs_log_item_chunk_t *,
- int, int, xfs_lsn_t);
-
-/*
- * This is called to add the given log item to the transaction's
- * list of log items. It must find a free log item descriptor
- * or allocate a new one and add the item to that descriptor.
- * The function returns a pointer to item descriptor used to point
- * to the new item. The log item will now point to its new descriptor
- * with its li_desc field.
- */
-xfs_log_item_desc_t *
-xfs_trans_add_item(xfs_trans_t *tp, xfs_log_item_t *lip)
-{
- xfs_log_item_desc_t *lidp;
- xfs_log_item_chunk_t *licp;
- int i=0;
-
- /*
- * If there are no free descriptors, allocate a new chunk
- * of them and put it at the front of the chunk list.
- */
- if (tp->t_items_free == 0) {
- licp = (xfs_log_item_chunk_t*)
- kmem_alloc(sizeof(xfs_log_item_chunk_t), KM_SLEEP);
- ASSERT(licp != NULL);
- /*
- * Initialize the chunk, and then
- * claim the first slot in the newly allocated chunk.
- */
- xfs_lic_init(licp);
- xfs_lic_claim(licp, 0);
- licp->lic_unused = 1;
- xfs_lic_init_slot(licp, 0);
- lidp = xfs_lic_slot(licp, 0);
-
- /*
- * Link in the new chunk and update the free count.
- */
- licp->lic_next = tp->t_items.lic_next;
- tp->t_items.lic_next = licp;
- tp->t_items_free = XFS_LIC_NUM_SLOTS - 1;
-
- /*
- * Initialize the descriptor and the generic portion
- * of the log item.
- *
- * Point the new slot at this item and return it.
- * Also point the log item at its currently active
- * descriptor and set the item's mount pointer.
- */
- lidp->lid_item = lip;
- lidp->lid_flags = 0;
- lidp->lid_size = 0;
- lip->li_desc = lidp;
- lip->li_mountp = tp->t_mountp;
- lip->li_ailp = tp->t_mountp->m_ail;
- return lidp;
- }
-
- /*
- * Find the free descriptor. It is somewhere in the chunklist
- * of descriptors.
- */
- licp = &tp->t_items;
- while (licp != NULL) {
- if (xfs_lic_vacancy(licp)) {
- if (licp->lic_unused <= XFS_LIC_MAX_SLOT) {
- i = licp->lic_unused;
- ASSERT(xfs_lic_isfree(licp, i));
- break;
- }
- for (i = 0; i <= XFS_LIC_MAX_SLOT; i++) {
- if (xfs_lic_isfree(licp, i))
- break;
- }
- ASSERT(i <= XFS_LIC_MAX_SLOT);
- break;
- }
- licp = licp->lic_next;
- }
- ASSERT(licp != NULL);
- /*
- * If we find a free descriptor, claim it,
- * initialize it, and return it.
- */
- xfs_lic_claim(licp, i);
- if (licp->lic_unused <= i) {
- licp->lic_unused = i + 1;
- xfs_lic_init_slot(licp, i);
- }
- lidp = xfs_lic_slot(licp, i);
- tp->t_items_free--;
- lidp->lid_item = lip;
- lidp->lid_flags = 0;
- lidp->lid_size = 0;
- lip->li_desc = lidp;
- lip->li_mountp = tp->t_mountp;
- lip->li_ailp = tp->t_mountp->m_ail;
- return lidp;
-}
-
-/*
- * Free the given descriptor.
- *
- * This requires setting the bit in the chunk's free mask corresponding
- * to the given slot.
- */
-void
-xfs_trans_free_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp)
-{
- uint slot;
- xfs_log_item_chunk_t *licp;
- xfs_log_item_chunk_t **licpp;
-
- slot = xfs_lic_desc_to_slot(lidp);
- licp = xfs_lic_desc_to_chunk(lidp);
- xfs_lic_relse(licp, slot);
- lidp->lid_item->li_desc = NULL;
- tp->t_items_free++;
-
- /*
- * If there are no more used items in the chunk and this is not
- * the chunk embedded in the transaction structure, then free
- * the chunk. First pull it from the chunk list and then
- * free it back to the heap. We didn't bother with a doubly
- * linked list here because the lists should be very short
- * and this is not a performance path. It's better to save
- * the memory of the extra pointer.
- *
- * Also decrement the transaction structure's count of free items
- * by the number in a chunk since we are freeing an empty chunk.
- */
- if (xfs_lic_are_all_free(licp) && (licp != &(tp->t_items))) {
- licpp = &(tp->t_items.lic_next);
- while (*licpp != licp) {
- ASSERT(*licpp != NULL);
- licpp = &((*licpp)->lic_next);
- }
- *licpp = licp->lic_next;
- kmem_free(licp);
- tp->t_items_free -= XFS_LIC_NUM_SLOTS;
- }
-}
-
-/*
- * This is called to find the descriptor corresponding to the given
- * log item. It returns a pointer to the descriptor.
- * The log item MUST have a corresponding descriptor in the given
- * transaction. This routine does not return NULL, it panics.
- *
- * The descriptor pointer is kept in the log item's li_desc field.
- * Just return it.
- */
-/*ARGSUSED*/
-xfs_log_item_desc_t *
-xfs_trans_find_item(xfs_trans_t *tp, xfs_log_item_t *lip)
-{
- ASSERT(lip->li_desc != NULL);
-
- return lip->li_desc;
-}
-
-
-/*
- * Return a pointer to the first descriptor in the chunk list.
- * This does not return NULL if there are none, it panics.
- *
- * The first descriptor must be in either the first or second chunk.
- * This is because the only chunk allowed to be empty is the first.
- * All others are freed when they become empty.
- *
- * At some point this and xfs_trans_next_item() should be optimized
- * to quickly look at the mask to determine if there is anything to
- * look at.
- */
-xfs_log_item_desc_t *
-xfs_trans_first_item(xfs_trans_t *tp)
-{
- xfs_log_item_chunk_t *licp;
- int i;
-
- licp = &tp->t_items;
- /*
- * If it's not in the first chunk, skip to the second.
- */
- if (xfs_lic_are_all_free(licp)) {
- licp = licp->lic_next;
- }
-
- /*
- * Return the first non-free descriptor in the chunk.
- */
- ASSERT(!xfs_lic_are_all_free(licp));
- for (i = 0; i < licp->lic_unused; i++) {
- if (xfs_lic_isfree(licp, i)) {
- continue;
- }
-
- return xfs_lic_slot(licp, i);
- }
- cmn_err(CE_WARN, "xfs_trans_first_item() -- no first item");
- return NULL;
-}
-
-
-/*
- * Given a descriptor, return the next descriptor in the chunk list.
- * This returns NULL if there are no more used descriptors in the list.
- *
- * We do this by first locating the chunk in which the descriptor resides,
- * and then scanning forward in the chunk and the list for the next
- * used descriptor.
- */
-/*ARGSUSED*/
-xfs_log_item_desc_t *
-xfs_trans_next_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp)
-{
- xfs_log_item_chunk_t *licp;
- int i;
-
- licp = xfs_lic_desc_to_chunk(lidp);
-
- /*
- * First search the rest of the chunk. The for loop keeps us
- * from referencing things beyond the end of the chunk.
- */
- for (i = (int)xfs_lic_desc_to_slot(lidp) + 1; i < licp->lic_unused; i++) {
- if (xfs_lic_isfree(licp, i)) {
- continue;
- }
-
- return xfs_lic_slot(licp, i);
- }
-
- /*
- * Now search the next chunk. It must be there, because the
- * next chunk would have been freed if it were empty.
- * If there is no next chunk, return NULL.
- */
- if (licp->lic_next == NULL) {
- return NULL;
- }
-
- licp = licp->lic_next;
- ASSERT(!xfs_lic_are_all_free(licp));
- for (i = 0; i < licp->lic_unused; i++) {
- if (xfs_lic_isfree(licp, i)) {
- continue;
- }
-
- return xfs_lic_slot(licp, i);
- }
- ASSERT(0);
- /* NOTREACHED */
- return NULL; /* keep gcc quiet */
-}
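Together, xfs_trans_first_item() and xfs_trans_next_item() amount to a two-level cursor: chunks along the list, then slots within each chunk. Condensed into one standalone helper with stand-in types, and an is_free() predicate playing the role of xfs_lic_isfree():

struct desc {
	int		d_dummy;	/* stand-in for xfs_log_item_desc_t */
};

struct chunk {
	struct chunk	*next;
	int		unused;		/* slots ever handed out */
	struct desc	descs[15];	/* 15 == XFS_LIC_NUM_SLOTS */
};

static int is_free(struct chunk *c, int slot);	/* like xfs_lic_isfree() */

/* Next used slot at or after @slot, walking to later chunks as needed. */
static struct desc *next_used(struct chunk *c, int slot)
{
	for (; c != NULL; c = c->next, slot = 0)
		for (; slot < c->unused; slot++)
			if (!is_free(c, slot))
				return &c->descs[slot];
	return NULL;
}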
-
-/*
- * This is called to unlock all of the items of a transaction and to free
- * all the descriptors of that transaction.
- *
- * It walks the list of descriptors and unlocks each item. It frees
- * each chunk except that embedded in the transaction as it goes along.
- */
-void
-xfs_trans_free_items(
- xfs_trans_t *tp,
- xfs_lsn_t commit_lsn,
- int flags)
-{
- xfs_log_item_chunk_t *licp;
- xfs_log_item_chunk_t *next_licp;
- int abort;
-
- abort = flags & XFS_TRANS_ABORT;
- licp = &tp->t_items;
- /*
- * Special case the embedded chunk so we don't free it below.
- */
- if (!xfs_lic_are_all_free(licp)) {
- (void) xfs_trans_unlock_chunk(licp, 1, abort, commit_lsn);
- xfs_lic_all_free(licp);
- licp->lic_unused = 0;
- }
- licp = licp->lic_next;
-
- /*
- * Unlock each item in each chunk and free the chunks.
- */
- while (licp != NULL) {
- ASSERT(!xfs_lic_are_all_free(licp));
- (void) xfs_trans_unlock_chunk(licp, 1, abort, commit_lsn);
- next_licp = licp->lic_next;
- kmem_free(licp);
- licp = next_licp;
- }
-
- /*
- * Reset the transaction structure's free item count.
- */
- tp->t_items_free = XFS_LIC_NUM_SLOTS;
- tp->t_items.lic_next = NULL;
-}
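The second loop is the standard iterate-and-free pattern: the next pointer must be read before the node goes back to the allocator. A generic sketch (hypothetical node type, kfree() standing in for kmem_free()):

struct node {
	struct node	*next;
};

static void free_all(struct node *head)
{
	struct node	*next;

	while (head != NULL) {
		next = head->next;	/* fetch before freeing the node */
		kfree(head);
		head = next;
	}
}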
-
-
-
-/*
- * This is called to unlock the items associated with a transaction.
- * Items which were not logged should be freed.
- * Those which were logged must still be tracked so they can be unpinned
- * when the transaction commits.
- */
-void
-xfs_trans_unlock_items(xfs_trans_t *tp, xfs_lsn_t commit_lsn)
-{
- xfs_log_item_chunk_t *licp;
- xfs_log_item_chunk_t *next_licp;
- xfs_log_item_chunk_t **licpp;
- int freed;
-
- freed = 0;
- licp = &tp->t_items;
-
- /*
- * Special case the embedded chunk so we don't free it below.
- */
- if (!xfs_lic_are_all_free(licp)) {
- freed = xfs_trans_unlock_chunk(licp, 0, 0, commit_lsn);
- }
- licpp = &(tp->t_items.lic_next);
- licp = licp->lic_next;
-
- /*
- * Unlock each item in each chunk, free non-dirty descriptors,
- * and free empty chunks.
- */
- while (licp != NULL) {
- ASSERT(!xfs_lic_are_all_free(licp));
- freed += xfs_trans_unlock_chunk(licp, 0, 0, commit_lsn);
- next_licp = licp->lic_next;
- if (xfs_lic_are_all_free(licp)) {
- *licpp = next_licp;
- kmem_free(licp);
- freed -= XFS_LIC_NUM_SLOTS;
- } else {
- licpp = &(licp->lic_next);
- }
- ASSERT(*licpp == next_licp);
- licp = next_licp;
- }
-
- /*
- * Fix the free descriptor count in the transaction.
- */
- tp->t_items_free += freed;
-}
-
-/*
- * Unlock each item pointed to by a descriptor in the given chunk.
- * Stamp the commit lsn into each item if necessary.
- * Free descriptors pointing to items which are not dirty if freeing_chunk
- * is zero. If freeing_chunk is non-zero, then we need to unlock all
- * items in the chunk.
- *
- * Return the number of descriptors freed.
- */
-STATIC int
-xfs_trans_unlock_chunk(
- xfs_log_item_chunk_t *licp,
- int freeing_chunk,
- int abort,
- xfs_lsn_t commit_lsn)
-{
- xfs_log_item_desc_t *lidp;
- xfs_log_item_t *lip;
- int i;
- int freed;
-
- freed = 0;
- lidp = licp->lic_descs;
- for (i = 0; i < licp->lic_unused; i++, lidp++) {
- if (xfs_lic_isfree(licp, i)) {
- continue;
- }
- lip = lidp->lid_item;
- lip->li_desc = NULL;
-
- if (commit_lsn != NULLCOMMITLSN)
- IOP_COMMITTING(lip, commit_lsn);
- if (abort)
- lip->li_flags |= XFS_LI_ABORTED;
- IOP_UNLOCK(lip);
-
- /*
- * Free the descriptor if the item is not dirty
- * within this transaction and the caller is not
- * going to just free the entire thing regardless.
- */
- if (!(freeing_chunk) &&
- (!(lidp->lid_flags & XFS_LID_DIRTY) || abort)) {
- xfs_lic_relse(licp, i);
- freed++;
- }
- }
-
- return freed;
-}
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index c6e4f2c8de6e..e2d93d8ead7b 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -23,22 +23,8 @@ struct xfs_log_item_desc;
struct xfs_mount;
struct xfs_trans;
-/*
- * From xfs_trans_item.c
- */
-struct xfs_log_item_desc *xfs_trans_add_item(struct xfs_trans *,
- struct xfs_log_item *);
-void xfs_trans_free_item(struct xfs_trans *,
- struct xfs_log_item_desc *);
-struct xfs_log_item_desc *xfs_trans_find_item(struct xfs_trans *,
- struct xfs_log_item *);
-struct xfs_log_item_desc *xfs_trans_first_item(struct xfs_trans *);
-struct xfs_log_item_desc *xfs_trans_next_item(struct xfs_trans *,
- struct xfs_log_item_desc *);
-
-void xfs_trans_unlock_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn);
-void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn,
- int flags);
+void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *);
+void xfs_trans_del_item(struct xfs_log_item *);
void xfs_trans_item_committed(struct xfs_log_item *lip,
xfs_lsn_t commit_lsn, int aborted);
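The replacement interface hides the descriptor machinery inside xfs_trans_add_item()/xfs_trans_del_item(). A hedged usage sketch based only on the two prototypes above; the inode-log-item spelling is illustrative, not taken from this patch:

/* Attach the log item when the object joins the transaction ... */
xfs_trans_add_item(tp, &iip->ili_item);

/* ... and detach it when the transaction is done with it.  Note that
 * deletion takes only the item; it no longer needs the transaction. */
xfs_trans_del_item(&iip->ili_item);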
diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c
index 4d88616bde91..b7d5769d2df0 100644
--- a/fs/xfs/xfs_utils.c
+++ b/fs/xfs/xfs_utils.c
@@ -25,18 +25,14 @@
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
#include "xfs_mount.h"
#include "xfs_bmap_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_inode_item.h"
#include "xfs_bmap.h"
#include "xfs_error.h"
#include "xfs_quota.h"
-#include "xfs_rw.h"
#include "xfs_itable.h"
#include "xfs_utils.h"
@@ -324,86 +320,3 @@ xfs_bumplink(
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
return 0;
}
-
-/*
- * Try to truncate the given file to 0 length. Currently called
- * only out of xfs_remove when it has to truncate a file to free
- * up space for the remove to proceed.
- */
-int
-xfs_truncate_file(
- xfs_mount_t *mp,
- xfs_inode_t *ip)
-{
- xfs_trans_t *tp;
- int error;
-
-#ifdef QUOTADEBUG
- /*
- * This is called to truncate the quotainodes too.
- */
- if (XFS_IS_UQUOTA_ON(mp)) {
- if (ip->i_ino != mp->m_sb.sb_uquotino)
- ASSERT(ip->i_udquot);
- }
- if (XFS_IS_OQUOTA_ON(mp)) {
- if (ip->i_ino != mp->m_sb.sb_gquotino)
- ASSERT(ip->i_gdquot);
- }
-#endif
- /*
- * Make the call to xfs_itruncate_start before starting the
- * transaction, because we cannot make the call while we're
- * in a transaction.
- */
- xfs_ilock(ip, XFS_IOLOCK_EXCL);
- error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, (xfs_fsize_t)0);
- if (error) {
- xfs_iunlock(ip, XFS_IOLOCK_EXCL);
- return error;
- }
-
- tp = xfs_trans_alloc(mp, XFS_TRANS_TRUNCATE_FILE);
- if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
- XFS_TRANS_PERM_LOG_RES,
- XFS_ITRUNCATE_LOG_COUNT))) {
- xfs_trans_cancel(tp, 0);
- xfs_iunlock(ip, XFS_IOLOCK_EXCL);
- return error;
- }
-
- /*
- * Follow the normal truncate locking protocol. Since we
- * hold the inode in the transaction, we know that its number
- * of references will stay constant.
- */
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
- xfs_trans_ihold(tp, ip);
- /*
- * Signal a sync transaction. The only case where we don't
- * need one is when truncating an already unlinked file on a
- * wsync fs. In that case, we know the blocks can't reappear
- * in the file because the links to the file are permanently
- * toast. Currently, we always want a sync transaction because
- * this code is called from places where nlink is guaranteed
- * to be 1, but I'm leaving the tests in to protect against
- * future changes -- rcc.
- */
- error = xfs_itruncate_finish(&tp, ip, (xfs_fsize_t)0,
- XFS_DATA_FORK,
- ((ip->i_d.di_nlink != 0 ||
- !(mp->m_flags & XFS_MOUNT_WSYNC))
- ? 1 : 0));
- if (error) {
- xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
- XFS_TRANS_ABORT);
- } else {
- xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
- }
- xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-
- return error;
-}
diff --git a/fs/xfs/xfs_utils.h b/fs/xfs/xfs_utils.h
index ef321225d269..f55b9678264f 100644
--- a/fs/xfs/xfs_utils.h
+++ b/fs/xfs/xfs_utils.h
@@ -18,7 +18,6 @@
#ifndef __XFS_UTILS_H__
#define __XFS_UTILS_H__
-extern int xfs_truncate_file(xfs_mount_t *, xfs_inode_t *);
extern int xfs_dir_ialloc(xfs_trans_t **, xfs_inode_t *, mode_t, xfs_nlink_t,
xfs_dev_t, cred_t *, prid_t, int,
xfs_inode_t **, int *);
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 9d376be0ea38..3ac137dd531b 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -26,19 +26,14 @@
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
#include "xfs_mount.h"
#include "xfs_da_btree.h"
#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
#include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_inode_item.h"
#include "xfs_itable.h"
-#include "xfs_btree.h"
#include "xfs_ialloc.h"
#include "xfs_alloc.h"
#include "xfs_bmap.h"
@@ -73,7 +68,7 @@ xfs_setattr(
struct xfs_dquot *udqp, *gdqp, *olddquot1, *olddquot2;
int need_iolock = 1;
- xfs_itrace_entry(ip);
+ trace_xfs_setattr(ip);
if (mp->m_flags & XFS_MOUNT_RDONLY)
return XFS_ERROR(EROFS);
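Each xfs_itrace_entry() call in this file becomes a named tracepoint. These presumably come from the event classes in the XFS trace header of this era; roughly, and illustratively rather than quoting that header:

/* one definition per trace_xfs_*() call site introduced here, e.g. */
DEFINE_INODE_EVENT(xfs_setattr);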
@@ -143,16 +138,6 @@ xfs_setattr(
goto error_return;
}
} else {
- if (DM_EVENT_ENABLED(ip, DM_EVENT_TRUNCATE) &&
- !(flags & XFS_ATTR_DMI)) {
- int dmflags = AT_DELAY_FLAG(flags) | DM_SEM_FLAG_WR;
- code = XFS_SEND_DATA(mp, DM_EVENT_TRUNCATE, ip,
- iattr->ia_size, 0, dmflags, NULL);
- if (code) {
- lock_flags = 0;
- goto error_return;
- }
- }
if (need_iolock)
lock_flags |= XFS_IOLOCK_EXCL;
}
@@ -267,7 +252,7 @@ xfs_setattr(
if (code) {
ASSERT(tp == NULL);
lock_flags &= ~XFS_ILOCK_EXCL;
- ASSERT(lock_flags == XFS_IOLOCK_EXCL);
+ ASSERT(lock_flags == XFS_IOLOCK_EXCL || !need_iolock);
goto error_return;
}
tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);
@@ -283,8 +268,7 @@ xfs_setattr(
commit_flags = XFS_TRANS_RELEASE_LOG_RES;
xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, lock_flags);
- xfs_trans_ihold(tp, ip);
+ xfs_trans_ijoin(tp, ip);
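This join-and-hold pair collapses into a single call throughout the patch. The mechanical mapping, as the hunks themselves show it:

/* Old: join under the named lock flags, then hold across commit. */
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
xfs_trans_ihold(tp, ip);
/* New: */
xfs_trans_ijoin(tp, ip);

/* Old: call sites that also took their own inode reference ... */
IHOLD(dp);
xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
/* ... become the _ref variant, which keeps the lock flags. */
xfs_trans_ijoin_ref(tp, dp, XFS_ILOCK_EXCL);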
/*
* Only change the c/mtime if we are changing the size
@@ -334,8 +318,7 @@ xfs_setattr(
xfs_iflags_set(ip, XFS_ITRUNCATED);
}
} else if (tp) {
- xfs_trans_ijoin(tp, ip, lock_flags);
- xfs_trans_ihold(tp, ip);
+ xfs_trans_ijoin(tp, ip);
}
/*
@@ -470,17 +453,10 @@ xfs_setattr(
return XFS_ERROR(code);
}
- if (DM_EVENT_ENABLED(ip, DM_EVENT_ATTRIBUTE) &&
- !(flags & XFS_ATTR_DMI)) {
- (void) XFS_SEND_NAMESP(mp, DM_EVENT_ATTRIBUTE, ip, DM_RIGHT_NULL,
- NULL, DM_RIGHT_NULL, NULL, NULL,
- 0, 0, AT_DELAY_FLAG(flags));
- }
return 0;
abort_return:
commit_flags |= XFS_TRANS_ABORT;
- /* FALLTHROUGH */
error_return:
xfs_qm_dqrele(udqp);
xfs_qm_dqrele(gdqp);
@@ -516,7 +492,7 @@ xfs_readlink_bmap(
int error = 0;
error = xfs_bmapi(NULL, ip, 0, XFS_B_TO_FSB(mp, pathlen), 0, NULL, 0,
- mval, &nmaps, NULL, NULL);
+ mval, &nmaps, NULL);
if (error)
goto out;
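Every xfs_bmapi()/xfs_bunmapi() call in this patch drops the same trailing pointer argument, which each of these call sites passed as NULL, so the conversion is mechanical. Before and after, per the hunk above:

error = xfs_bmapi(NULL, ip, 0, XFS_B_TO_FSB(mp, pathlen), 0, NULL, 0,
		  mval, &nmaps, NULL, NULL);	/* before */
error = xfs_bmapi(NULL, ip, 0, XFS_B_TO_FSB(mp, pathlen), 0, NULL, 0,
		  mval, &nmaps, NULL);		/* after */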
@@ -557,7 +533,7 @@ xfs_readlink(
int pathlen;
int error = 0;
- xfs_itrace_entry(ip);
+ trace_xfs_readlink(ip);
if (XFS_FORCED_SHUTDOWN(mp))
return XFS_ERROR(EIO);
@@ -613,14 +589,14 @@ xfs_free_eofblocks(
*/
end_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)ip->i_size));
last_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
- map_len = last_fsb - end_fsb;
- if (map_len <= 0)
+ if (last_fsb <= end_fsb)
return 0;
+ map_len = last_fsb - end_fsb;
nimaps = 1;
xfs_ilock(ip, XFS_ILOCK_SHARED);
error = xfs_bmapi(NULL, ip, end_fsb, map_len, 0,
- NULL, 0, &imap, &nimaps, NULL, NULL);
+ NULL, 0, &imap, &nimaps, NULL);
xfs_iunlock(ip, XFS_ILOCK_SHARED);
if (!error && (nimaps != 0) &&
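The reordered test above sidesteps unsigned wraparound: if the block counts are an unsigned type, as the fix implies, subtracting first can never go negative, so the old "map_len <= 0" check only ever caught the exactly-equal case. An isolated illustration, with u64 standing in for the filesystem-block types:

u64 last_fsb = 10, end_fsb = 12;	/* i_size past the max offset */

u64 map_len = last_fsb - end_fsb;	/* wraps to a huge value ... */
if (map_len <= 0)			/* ... so this never fires */
	return 0;

if (last_fsb <= end_fsb)		/* compare before subtracting */
	return 0;
map_len = last_fsb - end_fsb;		/* now known to be positive */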
@@ -675,10 +651,7 @@ xfs_free_eofblocks(
}
xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip,
- XFS_IOLOCK_EXCL |
- XFS_ILOCK_EXCL);
- xfs_trans_ihold(tp, ip);
+ xfs_trans_ijoin(tp, ip);
error = xfs_itruncate_finish(&tp, ip,
ip->i_size,
@@ -750,8 +723,7 @@ xfs_inactive_symlink_rmt(
xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
size = (int)ip->i_d.di_size;
ip->i_d.di_size = 0;
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
- xfs_trans_ihold(tp, ip);
+ xfs_trans_ijoin(tp, ip);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
/*
* Find the block(s) so we can inval and unmap them.
@@ -761,7 +733,7 @@ xfs_inactive_symlink_rmt(
nmaps = ARRAY_SIZE(mval);
if ((error = xfs_bmapi(tp, ip, 0, XFS_B_TO_FSB(mp, size),
XFS_BMAPI_METADATA, &first_block, 0, mval, &nmaps,
- &free_list, NULL)))
+ &free_list)))
goto error0;
/*
* Invalidate the block(s).
@@ -776,7 +748,7 @@ xfs_inactive_symlink_rmt(
* Unmap the dead block(s) to the free_list.
*/
if ((error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps,
- &first_block, &free_list, NULL, &done)))
+ &first_block, &free_list, &done)))
goto error1;
ASSERT(done);
/*
@@ -795,8 +767,7 @@ xfs_inactive_symlink_rmt(
* Mark it dirty so it will be logged and moved forward in the log as
* part of every commit.
*/
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
- xfs_trans_ihold(tp, ip);
+ xfs_trans_ijoin(tp, ip);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
/*
* Get a new, empty transaction to return to our caller.
@@ -929,8 +900,7 @@ xfs_inactive_attrs(
goto error_cancel;
xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
- xfs_trans_ihold(tp, ip);
+ xfs_trans_ijoin(tp, ip);
xfs_idestroy_fork(ip, XFS_ATTR_FORK);
ASSERT(ip->i_d.di_anextents == 0);
@@ -1035,8 +1005,6 @@ xfs_inactive(
int error;
int truncate;
- xfs_itrace_entry(ip);
-
/*
* If the inode is already free, then there can be nothing
* to clean up here.
@@ -1060,9 +1028,6 @@ xfs_inactive(
mp = ip->i_mount;
- if (ip->i_d.di_nlink == 0 && DM_EVENT_ENABLED(ip, DM_EVENT_DESTROY))
- XFS_SEND_DESTROY(mp, ip, DM_RIGHT_NULL);
-
error = 0;
/* If this is a read-only mount, don't do this (would generate I/O) */
@@ -1120,8 +1085,7 @@ xfs_inactive(
}
xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
- xfs_trans_ihold(tp, ip);
+ xfs_trans_ijoin(tp, ip);
/*
* normally, we have to run xfs_itruncate_finish sync.
@@ -1154,8 +1118,7 @@ xfs_inactive(
return VN_INACTIVE_CACHE;
}
- xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
- xfs_trans_ihold(tp, ip);
+ xfs_trans_ijoin(tp, ip);
} else {
error = xfs_trans_reserve(tp, 0,
XFS_IFREE_LOG_RES(mp),
@@ -1168,8 +1131,7 @@ xfs_inactive(
}
xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
- xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
- xfs_trans_ihold(tp, ip);
+ xfs_trans_ijoin(tp, ip);
}
/*
@@ -1257,7 +1219,7 @@ xfs_lookup(
int error;
uint lock_mode;
- xfs_itrace_entry(dp);
+ trace_xfs_lookup(dp, name);
if (XFS_FORCED_SHUTDOWN(dp->i_mount))
return XFS_ERROR(EIO);
@@ -1269,7 +1231,7 @@ xfs_lookup(
if (error)
goto out;
- error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp, 0);
+ error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp);
if (error)
goto out_free_name;
@@ -1309,21 +1271,11 @@ xfs_create(
uint log_res;
uint log_count;
- xfs_itrace_entry(dp);
+ trace_xfs_create(dp, name);
if (XFS_FORCED_SHUTDOWN(mp))
return XFS_ERROR(EIO);
- if (DM_EVENT_ENABLED(dp, DM_EVENT_CREATE)) {
- error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
- dp, DM_RIGHT_NULL, NULL,
- DM_RIGHT_NULL, name->name, NULL,
- mode, 0, 0);
-
- if (error)
- return error;
- }
-
if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
prid = dp->i_d.di_projid;
else
@@ -1427,8 +1379,7 @@ xfs_create(
* the transaction cancel unlocking dp so don't do it explicitly in the
* error path.
*/
- IHOLD(dp);
- xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin_ref(tp, dp, XFS_ILOCK_EXCL);
unlock_dp_on_error = B_FALSE;
error = xfs_dir_createname(tp, dp, name, ip->i_ino,
@@ -1487,16 +1438,7 @@ xfs_create(
xfs_qm_dqrele(gdqp);
*ipp = ip;
-
- /* Fallthrough to std_return with error = 0 */
- std_return:
- if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTCREATE)) {
- XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE, dp, DM_RIGHT_NULL,
- ip, DM_RIGHT_NULL, name->name, NULL, mode,
- error, 0);
- }
-
- return error;
+ return 0;
out_bmap_cancel:
xfs_bmap_cancel(&free_list);
@@ -1510,8 +1452,8 @@ xfs_create(
if (unlock_dp_on_error)
xfs_iunlock(dp, XFS_ILOCK_EXCL);
-
- goto std_return;
+ std_return:
+ return error;
out_abort_rele:
/*
@@ -1726,20 +1668,11 @@ xfs_remove(
uint resblks;
uint log_count;
- xfs_itrace_entry(dp);
- xfs_itrace_entry(ip);
+ trace_xfs_remove(dp, name);
if (XFS_FORCED_SHUTDOWN(mp))
return XFS_ERROR(EIO);
- if (DM_EVENT_ENABLED(dp, DM_EVENT_REMOVE)) {
- error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE, dp, DM_RIGHT_NULL,
- NULL, DM_RIGHT_NULL, name->name, NULL,
- ip->i_d.di_mode, 0, 0);
- if (error)
- return error;
- }
-
error = xfs_qm_dqattach(dp, 0);
if (error)
goto std_return;
@@ -1782,15 +1715,8 @@ xfs_remove(
xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL);
- /*
- * At this point, we've gotten both the directory and the entry
- * inodes locked.
- */
- IHOLD(ip);
- xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
-
- IHOLD(dp);
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin_ref(tp, dp, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
/*
* If we're removing a directory perform some additional validation.
@@ -1877,21 +1803,15 @@ xfs_remove(
if (!is_dir && link_zero && xfs_inode_is_filestream(ip))
xfs_filestream_deassociate(ip);
- std_return:
- if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTREMOVE)) {
- XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE, dp, DM_RIGHT_NULL,
- NULL, DM_RIGHT_NULL, name->name, NULL,
- ip->i_d.di_mode, error, 0);
- }
-
- return error;
+ return 0;
out_bmap_cancel:
xfs_bmap_cancel(&free_list);
cancel_flags |= XFS_TRANS_ABORT;
out_trans_cancel:
xfs_trans_cancel(tp, cancel_flags);
- goto std_return;
+ std_return:
+ return error;
}
int
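The relabelled exits above (and in xfs_create() earlier) settle into the usual kernel unwind shape: the success path returns directly, and failures fall through cleanup labels ordered inverse to setup. A generic sketch with hypothetical helpers:

static int do_two_step(void)
{
	int error;

	error = step_one();		/* hypothetical setup step */
	if (error)
		goto out;
	error = step_two();
	if (error)
		goto out_undo_one;

	return 0;			/* success never runs the labels */

out_undo_one:
	undo_one();
out:
	return error;
}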
@@ -1909,25 +1829,13 @@ xfs_link(
int committed;
int resblks;
- xfs_itrace_entry(tdp);
- xfs_itrace_entry(sip);
+ trace_xfs_link(tdp, target_name);
ASSERT(!S_ISDIR(sip->i_d.di_mode));
if (XFS_FORCED_SHUTDOWN(mp))
return XFS_ERROR(EIO);
- if (DM_EVENT_ENABLED(tdp, DM_EVENT_LINK)) {
- error = XFS_SEND_NAMESP(mp, DM_EVENT_LINK,
- tdp, DM_RIGHT_NULL,
- sip, DM_RIGHT_NULL,
- target_name->name, NULL, 0, 0, 0);
- if (error)
- return error;
- }
-
- /* Return through std_return after this point. */
-
error = xfs_qm_dqattach(sip, 0);
if (error)
goto std_return;
@@ -1953,15 +1861,8 @@ xfs_link(
xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL);
- /*
- * Increment vnode ref counts since xfs_trans_commit &
- * xfs_trans_cancel will both unlock the inodes and
- * decrement the associated ref counts.
- */
- IHOLD(sip);
- IHOLD(tdp);
- xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin_ref(tp, sip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin_ref(tp, tdp, XFS_ILOCK_EXCL);
/*
* If the source has too many links, we can't make any more to it.
@@ -2014,27 +1915,14 @@ xfs_link(
goto abort_return;
}
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
- if (error)
- goto std_return;
-
- /* Fall through to std_return with error = 0. */
-std_return:
- if (DM_EVENT_ENABLED(sip, DM_EVENT_POSTLINK)) {
- (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTLINK,
- tdp, DM_RIGHT_NULL,
- sip, DM_RIGHT_NULL,
- target_name->name, NULL, 0, error, 0);
- }
- return error;
+ return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
abort_return:
cancel_flags |= XFS_TRANS_ABORT;
- /* FALLTHROUGH */
-
error_return:
xfs_trans_cancel(tp, cancel_flags);
- goto std_return;
+ std_return:
+ return error;
}
int
@@ -2074,7 +1962,7 @@ xfs_symlink(
ip = NULL;
tp = NULL;
- xfs_itrace_entry(dp);
+ trace_xfs_symlink(dp, link_name);
if (XFS_FORCED_SHUTDOWN(mp))
return XFS_ERROR(EIO);
@@ -2086,17 +1974,6 @@ xfs_symlink(
if (pathlen >= MAXPATHLEN) /* total string too long */
return XFS_ERROR(ENAMETOOLONG);
- if (DM_EVENT_ENABLED(dp, DM_EVENT_SYMLINK)) {
- error = XFS_SEND_NAMESP(mp, DM_EVENT_SYMLINK, dp,
- DM_RIGHT_NULL, NULL, DM_RIGHT_NULL,
- link_name->name,
- (unsigned char *)target_path, 0, 0, 0);
- if (error)
- return error;
- }
-
- /* Return through std_return after this point. */
-
udqp = gdqp = NULL;
if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
prid = dp->i_d.di_projid;
@@ -2180,8 +2057,7 @@ xfs_symlink(
* transaction cancel unlocking dp so don't do it explicitly in the
* error path.
*/
- IHOLD(dp);
- xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin_ref(tp, dp, XFS_ILOCK_EXCL);
unlock_dp_on_error = B_FALSE;
/*
@@ -2215,7 +2091,7 @@ xfs_symlink(
error = xfs_bmapi(tp, ip, first_fsb, fs_blocks,
XFS_BMAPI_WRITE | XFS_BMAPI_METADATA,
&first_block, resblks, mval, &nmaps,
- &free_list, NULL);
+ &free_list);
if (error) {
goto error1;
}
@@ -2278,21 +2154,8 @@ xfs_symlink(
xfs_qm_dqrele(udqp);
xfs_qm_dqrele(gdqp);
- /* Fall through to std_return with error = 0 or errno from
- * xfs_trans_commit */
-std_return:
- if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTSYMLINK)) {
- (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTSYMLINK,
- dp, DM_RIGHT_NULL,
- error ? NULL : ip,
- DM_RIGHT_NULL, link_name->name,
- (unsigned char *)target_path,
- 0, error, 0);
- }
-
- if (!error)
- *ipp = ip;
- return error;
+ *ipp = ip;
+ return 0;
error2:
IRELE(ip);
@@ -2306,8 +2169,8 @@ std_return:
if (unlock_dp_on_error)
xfs_iunlock(dp, XFS_ILOCK_EXCL);
-
- goto std_return;
+ std_return:
+ return error;
}
int
@@ -2333,13 +2196,12 @@ xfs_set_dmattrs(
return error;
}
xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
ip->i_d.di_dmevmask = evmask;
ip->i_d.di_dmstate = state;
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- IHOLD(ip);
error = xfs_trans_commit(tp, 0);
return error;
@@ -2390,7 +2252,7 @@ xfs_alloc_file_space(
int committed;
int error;
- xfs_itrace_entry(ip);
+ trace_xfs_alloc_file_space(ip);
if (XFS_FORCED_SHUTDOWN(mp))
return XFS_ERROR(EIO);
@@ -2412,25 +2274,9 @@ xfs_alloc_file_space(
startoffset_fsb = XFS_B_TO_FSBT(mp, offset);
allocatesize_fsb = XFS_B_TO_FSB(mp, count);
- /* Generate a DMAPI event if needed. */
- if (alloc_type != 0 && offset < ip->i_size &&
- (attr_flags & XFS_ATTR_DMI) == 0 &&
- DM_EVENT_ENABLED(ip, DM_EVENT_WRITE)) {
- xfs_off_t end_dmi_offset;
-
- end_dmi_offset = offset+len;
- if (end_dmi_offset > ip->i_size)
- end_dmi_offset = ip->i_size;
- error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip, offset,
- end_dmi_offset - offset, 0, NULL);
- if (error)
- return error;
- }
-
/*
* Allocate file space until done or until there is an error
*/
-retry:
while (allocatesize_fsb && !error) {
xfs_fileoff_t s, e;
@@ -2488,8 +2334,7 @@ retry:
if (error)
goto error1;
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
- xfs_trans_ihold(tp, ip);
+ xfs_trans_ijoin(tp, ip);
/*
* Issue the xfs_bmapi() call to allocate the blocks
@@ -2498,7 +2343,7 @@ retry:
error = xfs_bmapi(tp, ip, startoffset_fsb,
allocatesize_fsb, bmapi_flag,
&firstfsb, 0, imapp, &nimaps,
- &free_list, NULL);
+ &free_list);
if (error) {
goto error0;
}
@@ -2527,17 +2372,6 @@ retry:
startoffset_fsb += allocated_fsb;
allocatesize_fsb -= allocated_fsb;
}
-dmapi_enospc_check:
- if (error == ENOSPC && (attr_flags & XFS_ATTR_DMI) == 0 &&
- DM_EVENT_ENABLED(ip, DM_EVENT_NOSPACE)) {
- error = XFS_SEND_NAMESP(mp, DM_EVENT_NOSPACE,
- ip, DM_RIGHT_NULL,
- ip, DM_RIGHT_NULL,
- NULL, NULL, 0, 0, 0); /* Delay flag intentionally unused */
- if (error == 0)
- goto retry; /* Maybe DMAPI app. has made space */
- /* else fall through with error from XFS_SEND_DATA */
- }
return error;
@@ -2548,7 +2382,7 @@ error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
error1: /* Just cancel transaction */
xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
- goto dmapi_enospc_check;
+ return error;
}
/*
@@ -2598,7 +2432,7 @@ xfs_zero_remaining_bytes(
offset_fsb = XFS_B_TO_FSBT(mp, offset);
nimap = 1;
error = xfs_bmapi(NULL, ip, offset_fsb, 1, 0,
- NULL, 0, &imap, &nimap, NULL, NULL);
+ NULL, 0, &imap, &nimap, NULL);
if (error || nimap < 1)
break;
ASSERT(imap.br_blockcount >= 1);
@@ -2661,7 +2495,6 @@ xfs_free_file_space(
{
int committed;
int done;
- xfs_off_t end_dmi_offset;
xfs_fileoff_t endoffset_fsb;
int error;
xfs_fsblock_t firstfsb;
@@ -2680,7 +2513,7 @@ xfs_free_file_space(
mp = ip->i_mount;
- xfs_itrace_entry(ip);
+ trace_xfs_free_file_space(ip);
error = xfs_qm_dqattach(ip, 0);
if (error)
@@ -2691,19 +2524,7 @@ xfs_free_file_space(
return error;
rt = XFS_IS_REALTIME_INODE(ip);
startoffset_fsb = XFS_B_TO_FSB(mp, offset);
- end_dmi_offset = offset + len;
- endoffset_fsb = XFS_B_TO_FSBT(mp, end_dmi_offset);
-
- if (offset < ip->i_size && (attr_flags & XFS_ATTR_DMI) == 0 &&
- DM_EVENT_ENABLED(ip, DM_EVENT_WRITE)) {
- if (end_dmi_offset > ip->i_size)
- end_dmi_offset = ip->i_size;
- error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip,
- offset, end_dmi_offset - offset,
- AT_DELAY_FLAG(attr_flags), NULL);
- if (error)
- return error;
- }
+ endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);
if (attr_flags & XFS_ATTR_NOLOCK)
need_iolock = 0;
@@ -2731,7 +2552,7 @@ xfs_free_file_space(
if (rt && !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
nimap = 1;
error = xfs_bmapi(NULL, ip, startoffset_fsb,
- 1, 0, NULL, 0, &imap, &nimap, NULL, NULL);
+ 1, 0, NULL, 0, &imap, &nimap, NULL);
if (error)
goto out_unlock_iolock;
ASSERT(nimap == 0 || nimap == 1);
@@ -2746,7 +2567,7 @@ xfs_free_file_space(
}
nimap = 1;
error = xfs_bmapi(NULL, ip, endoffset_fsb - 1,
- 1, 0, NULL, 0, &imap, &nimap, NULL, NULL);
+ 1, 0, NULL, 0, &imap, &nimap, NULL);
if (error)
goto out_unlock_iolock;
ASSERT(nimap == 0 || nimap == 1);
@@ -2814,8 +2635,7 @@ xfs_free_file_space(
if (error)
goto error1;
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
- xfs_trans_ihold(tp, ip);
+ xfs_trans_ijoin(tp, ip);
/*
* issue the bunmapi() call to free the blocks
@@ -2823,7 +2643,7 @@ xfs_free_file_space(
xfs_bmap_init(&free_list, &firstfsb);
error = xfs_bunmapi(tp, ip, startoffset_fsb,
endoffset_fsb - startoffset_fsb,
- 0, 2, &firstfsb, &free_list, NULL, &done);
+ 0, 2, &firstfsb, &free_list, &done);
if (error) {
goto error0;
}
@@ -2883,8 +2703,6 @@ xfs_change_file_space(
xfs_trans_t *tp;
struct iattr iattr;
- xfs_itrace_entry(ip);
-
if (!S_ISREG(ip->i_d.di_mode))
return XFS_ERROR(EINVAL);
@@ -2985,8 +2803,7 @@ xfs_change_file_space(
xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
- xfs_trans_ihold(tp, ip);
+ xfs_trans_ijoin(tp, ip);
if ((attr_flags & XFS_ATTR_DMI) == 0) {
ip->i_d.di_mode &= ~S_ISUID;