Diffstat (limited to 'fs/ext4')
-rw-r--r--  fs/ext4/ext4.h     |  13
-rw-r--r--  fs/ext4/extents.c  | 133
-rw-r--r--  fs/ext4/file.c     |  66
-rw-r--r--  fs/ext4/inode.c    |  77
-rw-r--r--  fs/ext4/super.c    |   1
-rw-r--r--  fs/ext4/truncate.h |   2
6 files changed, 82 insertions, 210 deletions
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index daad932eeb38..786cb51cab56 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -933,15 +933,6 @@ struct ext4_inode_info {
 	 * by other means, so we have i_data_sem.
 	 */
 	struct rw_semaphore i_data_sem;
-	/*
-	 * i_mmap_sem is for serializing page faults with truncate / punch hole
-	 * operations. We have to make sure that new page cannot be faulted in
-	 * a section of the inode that is being punched. We cannot easily use
-	 * i_data_sem for this since we need protection for the whole punch
-	 * operation and i_data_sem ranks below transaction start so we have
-	 * to occasionally drop it.
-	 */
-	struct rw_semaphore i_mmap_sem;
 	struct inode vfs_inode;
 	struct jbd2_inode *jinode;
 
@@ -2517,7 +2508,6 @@ extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
 extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
 			     loff_t lstart, loff_t lend);
 extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
-extern int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf);
 extern qsize_t *ext4_get_reserved_space(struct inode *inode);
 extern void ext4_da_update_reserve_space(struct inode *inode,
 					int used, int quota_claim);
@@ -2882,9 +2872,6 @@ static inline int ext4_update_inode_size(struct inode *inode, loff_t newsize)
 	return changed;
 }
 
-int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
-				      loff_t len);
-
 struct ext4_group_info {
 	unsigned long bb_state;
 	struct rb_root bb_free_root;
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 3578b25fccfd..551353b1b17a 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4685,6 +4685,10 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
 	if (len <= EXT_UNWRITTEN_MAX_LEN)
 		flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
 
+	/* Wait all existing dio workers, newcomers will block on i_mutex */
+	ext4_inode_block_unlocked_dio(inode);
+	inode_dio_wait(inode);
+
 	/*
 	 * credits to insert 1 extent into extent tree
 	 */
@@ -4748,6 +4752,8 @@ retry:
 		goto retry;
 	}
 
+	ext4_inode_resume_unlocked_dio(inode);
+
 	return ret > 0 ? ret2 : ret;
 }
 
@@ -4764,6 +4770,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
 	int partial_begin, partial_end;
 	loff_t start, end;
 	ext4_lblk_t lblk;
+	struct address_space *mapping = inode->i_mapping;
 	unsigned int blkbits = inode->i_blkbits;
 
 	trace_ext4_zero_range(inode, offset, len, mode);
@@ -4779,6 +4786,17 @@ static long ext4_zero_range(struct file *file, loff_t offset,
 	}
 
 	/*
+	 * Write out all dirty pages to avoid race conditions
+	 * Then release them.
+	 */
+	if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
+		ret = filemap_write_and_wait_range(mapping, offset,
+						   offset + len - 1);
+		if (ret)
+			return ret;
+	}
+
+	/*
 	 * Round up offset. This is not fallocate, we neet to zero out
 	 * blocks, so convert interior block aligned part of the range to
 	 * unwritten and possibly manually zero out unaligned parts of the
@@ -4821,10 +4839,6 @@ static long ext4_zero_range(struct file *file, loff_t offset,
 	if (mode & FALLOC_FL_KEEP_SIZE)
 		flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
 
-	/* Wait all existing dio workers, newcomers will block on i_mutex */
-	ext4_inode_block_unlocked_dio(inode);
-	inode_dio_wait(inode);
-
 	/* Preallocate the range including the unaligned edges */
 	if (partial_begin || partial_end) {
 		ret = ext4_alloc_file_blocks(file,
@@ -4833,7 +4847,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
 				round_down(offset, 1 << blkbits)) >> blkbits,
 				new_size, flags, mode);
 		if (ret)
-			goto out_dio;
+			goto out_mutex;
 
 	}
 
@@ -4842,23 +4856,16 @@ static long ext4_zero_range(struct file *file, loff_t offset,
 		flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN |
 			  EXT4_EX_NOCACHE);
 
-		/*
-		 * Prevent page faults from reinstantiating pages we have
-		 * released from page cache.
-		 */
-		down_write(&EXT4_I(inode)->i_mmap_sem);
-		ret = ext4_update_disksize_before_punch(inode, offset, len);
-		if (ret) {
-			up_write(&EXT4_I(inode)->i_mmap_sem);
-			goto out_dio;
-		}
-		/* Now release the pages and zero block aligned part of pages */
+		/* Now release the pages and zero block aligned part of pages*/
 		truncate_pagecache_range(inode, start, end - 1);
 		inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
 
+		/* Wait all existing dio workers, newcomers will block on i_mutex */
+		ext4_inode_block_unlocked_dio(inode);
+		inode_dio_wait(inode);
+
 		ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
 					     flags, mode);
-		up_write(&EXT4_I(inode)->i_mmap_sem);
 		if (ret)
 			goto out_dio;
 	}
@@ -4991,13 +4998,8 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 		goto out;
 	}
 
-	/* Wait all existing dio workers, newcomers will block on i_mutex */
-	ext4_inode_block_unlocked_dio(inode);
-	inode_dio_wait(inode);
-
 	ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
 				     flags, mode);
-	ext4_inode_resume_unlocked_dio(inode);
 	if (ret)
 		goto out;
 
@@ -5492,7 +5494,21 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
 		return ret;
 	}
 
+	/*
+	 * Need to round down offset to be aligned with page size boundary
+	 * for page size > block size.
+	 */
+	ioffset = round_down(offset, PAGE_SIZE);
+
+	/* Write out all dirty pages */
+	ret = filemap_write_and_wait_range(inode->i_mapping, ioffset,
+					   LLONG_MAX);
+	if (ret)
+		return ret;
+
+	/* Take mutex lock */
 	mutex_lock(&inode->i_mutex);
+
 	/*
 	 * There is no need to overlap collapse range with EOF, in which case
 	 * it is effectively a truncate operation
@@ -5508,43 +5524,17 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
 		goto out_mutex;
 	}
 
+	truncate_pagecache(inode, ioffset);
+
 	/* Wait for existing dio to complete */
 	ext4_inode_block_unlocked_dio(inode);
 	inode_dio_wait(inode);
 
-	/*
-	 * Prevent page faults from reinstantiating pages we have released from
-	 * page cache.
-	 */
-	down_write(&EXT4_I(inode)->i_mmap_sem);
-	/*
-	 * Need to round down offset to be aligned with page size boundary
-	 * for page size > block size.
-	 */
-	ioffset = round_down(offset, PAGE_SIZE);
-	/*
-	 * Write tail of the last page before removed range since it will get
-	 * removed from the page cache below.
-	 */
-	ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, offset);
-	if (ret)
-		goto out_mmap;
-	/*
-	 * Write data that will be shifted to preserve them when discarding
-	 * page cache below. We are also protected from pages becoming dirty
-	 * by i_mmap_sem.
-	 */
-	ret = filemap_write_and_wait_range(inode->i_mapping, offset + len,
-					   LLONG_MAX);
-	if (ret)
-		goto out_mmap;
-	truncate_pagecache(inode, ioffset);
-
 	credits = ext4_writepage_trans_blocks(inode);
 	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
 	if (IS_ERR(handle)) {
 		ret = PTR_ERR(handle);
-		goto out_mmap;
+		goto out_dio;
 	}
 
 	down_write(&EXT4_I(inode)->i_data_sem);
@@ -5583,8 +5573,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
 
 out_stop:
 	ext4_journal_stop(handle);
-out_mmap:
-	up_write(&EXT4_I(inode)->i_mmap_sem);
+out_dio:
 	ext4_inode_resume_unlocked_dio(inode);
 out_mutex:
 	mutex_unlock(&inode->i_mutex);
@@ -5638,7 +5627,21 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
 		return ret;
 	}
 
+	/*
+	 * Need to round down to align start offset to page size boundary
+	 * for page size > block size.
+	 */
+	ioffset = round_down(offset, PAGE_SIZE);
+
+	/* Write out all dirty pages */
+	ret = filemap_write_and_wait_range(inode->i_mapping, ioffset,
+					   LLONG_MAX);
+	if (ret)
+		return ret;
+
+	/* Take mutex lock */
 	mutex_lock(&inode->i_mutex);
+
 	/* Currently just for extent based files */
 	if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
 		ret = -EOPNOTSUPP;
@@ -5657,32 +5660,17 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
 		goto out_mutex;
 	}
 
+	truncate_pagecache(inode, ioffset);
+
 	/* Wait for existing dio to complete */
 	ext4_inode_block_unlocked_dio(inode);
 	inode_dio_wait(inode);
 
-	/*
-	 * Prevent page faults from reinstantiating pages we have released from
-	 * page cache.
-	 */
-	down_write(&EXT4_I(inode)->i_mmap_sem);
-	/*
-	 * Need to round down to align start offset to page size boundary
-	 * for page size > block size.
-	 */
-	ioffset = round_down(offset, PAGE_SIZE);
-	/* Write out all dirty pages */
-	ret = filemap_write_and_wait_range(inode->i_mapping, ioffset,
-					   LLONG_MAX);
-	if (ret)
-		goto out_mmap;
-	truncate_pagecache(inode, ioffset);
-
 	credits = ext4_writepage_trans_blocks(inode);
 	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
 	if (IS_ERR(handle)) {
 		ret = PTR_ERR(handle);
-		goto out_mmap;
+		goto out_dio;
 	}
 
 	/* Expand file to avoid data loss if there is error while shifting */
@@ -5753,8 +5741,7 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
 
 out_stop:
 	ext4_journal_stop(handle);
-out_mmap:
-	up_write(&EXT4_I(inode)->i_mmap_sem);
+out_dio:
 	ext4_inode_resume_unlocked_dio(inode);
 out_mutex:
 	mutex_unlock(&inode->i_mutex);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 0d24ebcd7c9e..113837e7ba98 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -209,18 +209,15 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	int result;
 	handle_t *handle = NULL;
-	struct inode *inode = file_inode(vma->vm_file);
-	struct super_block *sb = inode->i_sb;
+	struct super_block *sb = file_inode(vma->vm_file)->i_sb;
 	bool write = vmf->flags & FAULT_FLAG_WRITE;
 
 	if (write) {
 		sb_start_pagefault(sb);
 		file_update_time(vma->vm_file);
-		down_read(&EXT4_I(inode)->i_mmap_sem);
 		handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
 						EXT4_DATA_TRANS_BLOCKS(sb));
-	} else
-		down_read(&EXT4_I(inode)->i_mmap_sem);
+	}
 
 	if (IS_ERR(handle))
 		result = VM_FAULT_SIGBUS;
@@ -231,10 +228,8 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	if (write) {
 		if (!IS_ERR(handle))
 			ext4_journal_stop(handle);
-		up_read(&EXT4_I(inode)->i_mmap_sem);
 		sb_end_pagefault(sb);
-	} else
-		up_read(&EXT4_I(inode)->i_mmap_sem);
+	}
 
 	return result;
 }
@@ -251,12 +246,10 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
 	if (write) {
 		sb_start_pagefault(sb);
 		file_update_time(vma->vm_file);
-		down_read(&EXT4_I(inode)->i_mmap_sem);
 		handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
 				ext4_chunk_trans_blocks(inode,
 							PMD_SIZE / PAGE_SIZE));
-	} else
-		down_read(&EXT4_I(inode)->i_mmap_sem);
+	}
 
 	if (IS_ERR(handle))
 		result = VM_FAULT_SIGBUS;
@@ -267,71 +260,30 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
 	if (write) {
 		if (!IS_ERR(handle))
 			ext4_journal_stop(handle);
-		up_read(&EXT4_I(inode)->i_mmap_sem);
 		sb_end_pagefault(sb);
-	} else
-		up_read(&EXT4_I(inode)->i_mmap_sem);
+	}
 
 	return result;
 }
 
 static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
-	int err;
-	struct inode *inode = file_inode(vma->vm_file);
-
-	sb_start_pagefault(inode->i_sb);
-	file_update_time(vma->vm_file);
-	down_read(&EXT4_I(inode)->i_mmap_sem);
-	err = __dax_mkwrite(vma, vmf, ext4_get_block_dax,
-			    ext4_end_io_unwritten);
-	up_read(&EXT4_I(inode)->i_mmap_sem);
-	sb_end_pagefault(inode->i_sb);
-
-	return err;
-}
-
-/*
- * Handle write fault for VM_MIXEDMAP mappings. Similarly to ext4_dax_mkwrite()
- * handler we check for races agaist truncate. Note that since we cycle through
- * i_mmap_sem, we are sure that also any hole punching that began before we
- * were called is finished by now and so if it included part of the file we
- * are working on, our pte will get unmapped and the check for pte_same() in
- * wp_pfn_shared() fails. Thus fault gets retried and things work out as
- * desired.
- */
-static int ext4_dax_pfn_mkwrite(struct vm_area_struct *vma,
-				struct vm_fault *vmf)
-{
-	struct inode *inode = file_inode(vma->vm_file);
-	struct super_block *sb = inode->i_sb;
-	int ret = VM_FAULT_NOPAGE;
-	loff_t size;
-
-	sb_start_pagefault(sb);
-	file_update_time(vma->vm_file);
-	down_read(&EXT4_I(inode)->i_mmap_sem);
-	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
-	if (vmf->pgoff >= size)
-		ret = VM_FAULT_SIGBUS;
-	up_read(&EXT4_I(inode)->i_mmap_sem);
-	sb_end_pagefault(sb);
-
-	return ret;
+	return dax_mkwrite(vma, vmf, ext4_get_block_dax,
+				ext4_end_io_unwritten);
 }
 
 static const struct vm_operations_struct ext4_dax_vm_ops = {
 	.fault		= ext4_dax_fault,
 	.pmd_fault	= ext4_dax_pmd_fault,
 	.page_mkwrite	= ext4_dax_mkwrite,
-	.pfn_mkwrite	= ext4_dax_pfn_mkwrite,
+	.pfn_mkwrite	= dax_pfn_mkwrite,
 };
 #else
 #define ext4_dax_vm_ops	ext4_file_vm_ops
 #endif
 
 static const struct vm_operations_struct ext4_file_vm_ops = {
-	.fault		= ext4_filemap_fault,
+	.fault		= filemap_fault,
 	.map_pages	= filemap_map_pages,
 	.page_mkwrite	= ext4_page_mkwrite,
 };
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index e31d762eedce..06bda0361e7c 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3587,35 +3587,6 @@ int ext4_can_truncate(struct inode *inode)
 }
 
 /*
- * We have to make sure i_disksize gets properly updated before we truncate
- * page cache due to hole punching or zero range. Otherwise i_disksize update
- * can get lost as it may have been postponed to submission of writeback but
- * that will never happen after we truncate page cache.
- */
-int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
-				      loff_t len)
-{
-	handle_t *handle;
-	loff_t size = i_size_read(inode);
-
-	WARN_ON(!mutex_is_locked(&inode->i_mutex));
-	if (offset > size || offset + len < size)
-		return 0;
-
-	if (EXT4_I(inode)->i_disksize >= size)
-		return 0;
-
-	handle = ext4_journal_start(inode, EXT4_HT_MISC, 1);
-	if (IS_ERR(handle))
-		return PTR_ERR(handle);
-	ext4_update_i_disksize(inode, size);
-	ext4_mark_inode_dirty(handle, inode);
-	ext4_journal_stop(handle);
-
-	return 0;
-}
-
-/*
  * ext4_punch_hole: punches a hole in a file by releaseing the blocks
  * associated with the given offset and length
  *
@@ -3680,26 +3651,17 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
 
 	}
 
-	/* Wait all existing dio workers, newcomers will block on i_mutex */
-	ext4_inode_block_unlocked_dio(inode);
-	inode_dio_wait(inode);
-
-	/*
-	 * Prevent page faults from reinstantiating pages we have released from
-	 * page cache.
-	 */
-	down_write(&EXT4_I(inode)->i_mmap_sem);
 	first_block_offset = round_up(offset, sb->s_blocksize);
 	last_block_offset = round_down((offset + length), sb->s_blocksize) - 1;
 
 	/* Now release the pages and zero block aligned part of pages*/
-	if (last_block_offset > first_block_offset) {
-		ret = ext4_update_disksize_before_punch(inode, offset, length);
-		if (ret)
-			goto out_dio;
+	if (last_block_offset > first_block_offset)
 		truncate_pagecache_range(inode, first_block_offset,
 					 last_block_offset);
-	}
+
+	/* Wait all existing dio workers, newcomers will block on i_mutex */
+	ext4_inode_block_unlocked_dio(inode);
+	inode_dio_wait(inode);
 
 	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
 		credits = ext4_writepage_trans_blocks(inode);
@@ -3746,12 +3708,16 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
 	if (IS_SYNC(inode))
 		ext4_handle_sync(handle);
 
+	/* Now release the pages again to reduce race window */
+	if (last_block_offset > first_block_offset)
+		truncate_pagecache_range(inode, first_block_offset,
+					 last_block_offset);
+
 	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
 	ext4_mark_inode_dirty(handle, inode);
 out_stop:
 	ext4_journal_stop(handle);
 out_dio:
-	up_write(&EXT4_I(inode)->i_mmap_sem);
 	ext4_inode_resume_unlocked_dio(inode);
 out_mutex:
 	mutex_unlock(&inode->i_mutex);
@@ -4885,7 +4851,6 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 			} else
 				ext4_wait_for_tail_page_commit(inode);
 		}
-		down_write(&EXT4_I(inode)->i_mmap_sem);
 		/*
 		 * Truncate pagecache after we've waited for commit
 		 * in data=journal mode to make pages freeable.
@@ -4893,7 +4858,6 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 		truncate_pagecache(inode, inode->i_size);
 		if (shrink)
 			ext4_truncate(inode);
-		up_write(&EXT4_I(inode)->i_mmap_sem);
 	}
 
 	if (!rc) {
@@ -5145,8 +5109,6 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
 	might_sleep();
 	trace_ext4_mark_inode_dirty(inode, _RET_IP_);
 	err = ext4_reserve_inode_write(handle, inode, &iloc);
-	if (err)
-		return err;
 	if (ext4_handle_valid(handle) &&
 	    EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
 	    !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) {
@@ -5177,7 +5139,9 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
 			}
 		}
 	}
-	return ext4_mark_iloc_dirty(handle, inode, &iloc);
+	if (!err)
+		err = ext4_mark_iloc_dirty(handle, inode, &iloc);
+	return err;
 }
 
 /*
@@ -5342,8 +5306,6 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 
 	sb_start_pagefault(inode->i_sb);
 	file_update_time(vma->vm_file);
-
-	down_read(&EXT4_I(inode)->i_mmap_sem);
 	/* Delalloc case is easy... */
 	if (test_opt(inode->i_sb, DELALLOC) &&
 	    !ext4_should_journal_data(inode) &&
@@ -5413,19 +5375,6 @@ retry_alloc:
 out_ret:
 	ret = block_page_mkwrite_return(ret);
 out:
-	up_read(&EXT4_I(inode)->i_mmap_sem);
 	sb_end_pagefault(inode->i_sb);
 	return ret;
 }
-
-int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
-{
-	struct inode *inode = file_inode(vma->vm_file);
-	int err;
-
-	down_read(&EXT4_I(inode)->i_mmap_sem);
-	err = filemap_fault(vma, vmf);
-	up_read(&EXT4_I(inode)->i_mmap_sem);
-
-	return err;
-}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 852c26806af2..ba1cf0bf2f81 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -958,7 +958,6 @@ static void init_once(void *foo)
 	INIT_LIST_HEAD(&ei->i_orphan);
 	init_rwsem(&ei->xattr_sem);
 	init_rwsem(&ei->i_data_sem);
-	init_rwsem(&ei->i_mmap_sem);
 	inode_init_once(&ei->vfs_inode);
 }
 
diff --git a/fs/ext4/truncate.h b/fs/ext4/truncate.h
index c70d06a383e2..011ba6670d99 100644
--- a/fs/ext4/truncate.h
+++ b/fs/ext4/truncate.h
@@ -10,10 +10,8 @@
  */
static inline void ext4_truncate_failed_write(struct inode *inode)
 {
-	down_write(&EXT4_I(inode)->i_mmap_sem);
 	truncate_inode_pages(inode->i_mapping, inode->i_size);
 	ext4_truncate(inode);
-	up_write(&EXT4_I(inode)->i_mmap_sem);
 }
 
 /*
