Re: [RFC PATCH V11 14/21] Btrfs: subpagesize-blocksize: Explicitly Track I/O status of blocks of an ordered extent.

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Mon, Jun 01, 2015 at 08:52:49PM +0530, Chandan Rajendra wrote:
> In the subpagesize-blocksize scenario, a page can have more than one block.
> So, in addition to the PagePrivate2 flag, we have to track the I/O status of
> each block of a page to reliably mark the ordered extent as complete.
> 
> Signed-off-by: Chandan Rajendra <chandan@xxxxxxxxxxxxxxxxxx>
> ---
>  fs/btrfs/extent_io.c    |  19 +--
>  fs/btrfs/extent_io.h    |   5 +-
>  fs/btrfs/inode.c        | 346 +++++++++++++++++++++++++++++++++++-------------
>  fs/btrfs/ordered-data.c |  17 +++
>  fs/btrfs/ordered-data.h |   4 +
>  5 files changed, 287 insertions(+), 104 deletions(-)
> 
> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
> index 0110abc..55f900a 100644
> --- a/fs/btrfs/extent_io.c
> +++ b/fs/btrfs/extent_io.c
> @@ -4545,11 +4545,10 @@ int extent_invalidatepage(struct extent_io_tree *tree,
>   * to drop the page.
>   */
>  static int try_release_extent_state(struct extent_map_tree *map,
> -				    struct extent_io_tree *tree,
> -				    struct page *page, gfp_t mask)
> +				struct extent_io_tree *tree,
> +				struct page *page, u64 start, u64 end,
> +				gfp_t mask)
>  {
> -	u64 start = page_offset(page);
> -	u64 end = start + PAGE_CACHE_SIZE - 1;
>  	int ret = 1;
>  
>  	if (test_range_bit(tree, start, end,
> @@ -4583,12 +4582,12 @@ static int try_release_extent_state(struct extent_map_tree *map,
>   * map records are removed
>   */
>  int try_release_extent_mapping(struct extent_map_tree *map,
> -			       struct extent_io_tree *tree, struct page *page,
> -			       gfp_t mask)
> +			struct extent_io_tree *tree, struct page *page,
> +			u64 start, u64 end, gfp_t mask)
>  {
>  	struct extent_map *em;
> -	u64 start = page_offset(page);
> -	u64 end = start + PAGE_CACHE_SIZE - 1;
> +	u64 orig_start = start;
> +	u64 orig_end = end;
>  
>  	if ((mask & __GFP_WAIT) &&
>  	    page->mapping->host->i_size > 16 * 1024 * 1024) {
> @@ -4622,7 +4621,9 @@ int try_release_extent_mapping(struct extent_map_tree *map,
>  			free_extent_map(em);
>  		}
>  	}
> -	return try_release_extent_state(map, tree, page, mask);
> +	return try_release_extent_state(map, tree, page,
> +					orig_start, orig_end,
> +					mask);
>  }
>  
>  /*
> diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
> index 8fe5ac3..c629e53 100644
> --- a/fs/btrfs/extent_io.h
> +++ b/fs/btrfs/extent_io.h
> @@ -217,8 +217,9 @@ typedef struct extent_map *(get_extent_t)(struct inode *inode,
>  void extent_io_tree_init(struct extent_io_tree *tree,
>  			 struct address_space *mapping);
>  int try_release_extent_mapping(struct extent_map_tree *map,
> -			       struct extent_io_tree *tree, struct page *page,
> -			       gfp_t mask);
> +			struct extent_io_tree *tree, struct page *page,
> +			u64 start, u64 end,
> +			gfp_t mask);
>  int try_release_extent_buffer(struct page *page);
>  int lock_extent(struct extent_io_tree *tree, u64 start, u64 end);
>  int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index bff60c6..bfffc62 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -2990,56 +2990,115 @@ static void finish_ordered_fn(struct btrfs_work *work)
>  	btrfs_finish_ordered_io(ordered_extent);
>  }
>  
> -static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
> -				struct extent_state *state, int uptodate)
> +static void mark_blks_io_complete(struct btrfs_ordered_extent *ordered,
> +				u64 blk, u64 nr_blks, int uptodate)
>  {
> -	struct inode *inode = page->mapping->host;
> +	struct inode *inode = ordered->inode;
>  	struct btrfs_root *root = BTRFS_I(inode)->root;
> -	struct btrfs_ordered_extent *ordered_extent = NULL;
>  	struct btrfs_workqueue *wq;
>  	btrfs_work_func_t func;
> -	u64 ordered_start, ordered_end;
>  	int done;
>  
> -	trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
> +	while (nr_blks--) {
> +		if (test_and_set_bit(blk, ordered->blocks_done)) {
> +			blk++;
> +			continue;
> +		}
>  
> -	ClearPagePrivate2(page);
> -loop:
> -	ordered_extent = btrfs_lookup_ordered_range(inode, start,
> -						end - start + 1);
> -	if (!ordered_extent)
> -		goto out;
> +		done = btrfs_dec_test_ordered_pending(inode, &ordered,
> +						ordered->file_offset
> +						+ (blk << inode->i_sb->s_blocksize_bits),
> +						root->sectorsize,
> +						uptodate);
> +		if (done) {
> +			if (btrfs_is_free_space_inode(inode)) {
> +				wq = root->fs_info->endio_freespace_worker;
> +				func = btrfs_freespace_write_helper;
> +			} else {
> +				wq = root->fs_info->endio_write_workers;
> +				func = btrfs_endio_write_helper;
> +			}
>  
> -	ordered_start = max_t(u64, start, ordered_extent->file_offset);
> -	ordered_end = min_t(u64, end,
> -			ordered_extent->file_offset + ordered_extent->len - 1);
> -
> -	done = btrfs_dec_test_ordered_pending(inode, &ordered_extent,
> -					ordered_start,
> -					ordered_end - ordered_start + 1,
> -					uptodate);
> -	if (done) {
> -		if (btrfs_is_free_space_inode(inode)) {
> -			wq = root->fs_info->endio_freespace_worker;
> -			func = btrfs_freespace_write_helper;
> -		} else {
> -			wq = root->fs_info->endio_write_workers;
> -			func = btrfs_endio_write_helper;
> +			btrfs_init_work(&ordered->work, func,
> +					finish_ordered_fn, NULL, NULL);
> +			btrfs_queue_work(wq, &ordered->work);
>  		}
>  
> -		btrfs_init_work(&ordered_extent->work, func,
> -				finish_ordered_fn, NULL, NULL);
> -		btrfs_queue_work(wq, &ordered_extent->work);
> +		blk++;
>  	}
> +}
>  
> -	btrfs_put_ordered_extent(ordered_extent);
> +int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
> +				struct extent_state *state, int uptodate)
> +{
> +	struct inode *inode = page->mapping->host;
> +	struct btrfs_root *root = BTRFS_I(inode)->root;
> +	struct btrfs_ordered_extent *ordered_extent = NULL;
> +	u64 blk, nr_blks;
> +	int clear;
>  
> -	start = ordered_end + 1;
> +	trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
>  
> -	if (start < end)
> -		goto loop;
> +	while (start < end) {
> +		ordered_extent = btrfs_lookup_ordered_extent(inode, start);
> +		if (!ordered_extent) {
> +			start += root->sectorsize;
> +			continue;
> +		}
> +
> +		blk = (start - ordered_extent->file_offset)
> +			>> inode->i_sb->s_blocksize_bits;
> +
> +		nr_blks = (min(end, ordered_extent->file_offset + ordered_extent->len - 1)
> +			+ 1 - start) >> inode->i_sb->s_blocksize_bits;
> +
> +		BUG_ON(!nr_blks);
> +
> +		mark_blks_io_complete(ordered_extent, blk, nr_blks, uptodate);

Range [start, end] is surely contiguous, so why are we processing blocks
one by one in mark_blks_io_complete()?

Same question for invalidatepage().

Thanks,

-liubo

> +
> +		start = ordered_extent->file_offset + ordered_extent->len;
> +
> +		btrfs_put_ordered_extent(ordered_extent);
> +	}
> +
> +	start = page_offset(page);
> +	end = start + PAGE_CACHE_SIZE - 1;
> +	clear = 1;
> +
> +	while (start < end) {
> +		ordered_extent = btrfs_lookup_ordered_extent(inode, start);
> +		if (!ordered_extent) {
> +			start += root->sectorsize;
> +			continue;
> +		}
> +
> +		blk = (start - ordered_extent->file_offset)
> +			>> inode->i_sb->s_blocksize_bits;
> +		nr_blks = (min(end, ordered_extent->file_offset + ordered_extent->len - 1)
> +			+ 1  - start) >> inode->i_sb->s_blocksize_bits;
> +
> +		BUG_ON(!nr_blks);
> +
> +		while (nr_blks--) {
> +			if (!test_bit(blk++, ordered_extent->blocks_done)) {
> +				clear = 0;
> +				break;
> +			}
> +		}
> +
> +		if (!clear) {
> +			btrfs_put_ordered_extent(ordered_extent);
> +			break;
> +		}
> +
> +		start += ordered_extent->len;
> +
> +		btrfs_put_ordered_extent(ordered_extent);
> +	}
> +
> +	if (clear)
> +		ClearPagePrivate2(page);
>  
> -out:
>  	return 0;
>  }
>  
> @@ -8472,7 +8531,9 @@ btrfs_readpages(struct file *file, struct address_space *mapping,
>  	return extent_readpages(tree, mapping, pages, nr_pages,
>  				btrfs_get_extent);
>  }
> -static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
> +
> +static int __btrfs_releasepage(struct page *page, u64 start, u64 end,
> +			gfp_t gfp_flags)
>  {
>  	struct extent_io_tree *tree;
>  	struct extent_map_tree *map;
> @@ -8480,31 +8541,149 @@ static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
>  
>  	tree = &BTRFS_I(page->mapping->host)->io_tree;
>  	map = &BTRFS_I(page->mapping->host)->extent_tree;
> -	ret = try_release_extent_mapping(map, tree, page, gfp_flags);
> -	if (ret == 1)
> +
> +	ret = try_release_extent_mapping(map, tree, page, start, end,
> +					gfp_flags);
> +	if ((ret == 1) && ((end - start + 1) == PAGE_CACHE_SIZE)) {
>  		clear_page_extent_mapped(page);
> +	} else {
> +		ret = 0;
> +	}
>  
>  	return ret;
>  }
>  
>  static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
>  {
> +	u64 start = page_offset(page);
> +	u64 end = start + PAGE_CACHE_SIZE - 1;
> +
>  	if (PageWriteback(page) || PageDirty(page))
>  		return 0;
> -	return __btrfs_releasepage(page, gfp_flags & GFP_NOFS);
> +
> +	return __btrfs_releasepage(page, start, end, gfp_flags & GFP_NOFS);
> +}
> +
> +static void invalidate_ordered_extent_blocks(struct inode *inode,
> +					struct btrfs_ordered_extent *ordered,
> +					u64 locked_start, u64 locked_end,
> +					u64 cur,
> +					int inode_evicting)
> +{
> +	struct btrfs_root *root = BTRFS_I(inode)->root;
> +	struct btrfs_ordered_inode_tree *ordered_tree;
> +	struct extent_io_tree *tree;
> +	u64 blk, blk_done, nr_blks;
> +	u64 end;
> +	u64 new_len;
> +
> +	tree = &BTRFS_I(inode)->io_tree;
> +
> +	end = min(locked_end, ordered->file_offset + ordered->len - 1);
> +
> +	if (!inode_evicting) {
> +		clear_extent_bit(tree, cur, end,
> +				EXTENT_DIRTY | EXTENT_DELALLOC |
> +				EXTENT_DO_ACCOUNTING |
> +				EXTENT_DEFRAG, 1, 0, NULL,
> +				GFP_NOFS);
> +		unlock_extent(tree, locked_start, locked_end);
> +	}
> +
> +
> +	ordered_tree = &BTRFS_I(inode)->ordered_tree;
> +	spin_lock_irq(&ordered_tree->lock);
> +	set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
> +	new_len = cur - ordered->file_offset;
> +	if (new_len < ordered->truncated_len)
> +		ordered->truncated_len = new_len;
> +
> +	blk = (cur - ordered->file_offset) >> inode->i_sb->s_blocksize_bits;
> +	nr_blks = (end + 1 - cur) >> inode->i_sb->s_blocksize_bits;
> +
> +	while (nr_blks--) {
> +		blk_done = !test_and_set_bit(blk, ordered->blocks_done);
> +		if (blk_done) {
> +			spin_unlock_irq(&ordered_tree->lock);
> +			if (btrfs_dec_test_ordered_pending(inode, &ordered,
> +								ordered->file_offset + (blk << inode->i_sb->s_blocksize_bits),
> +								root->sectorsize,
> +								1))
> +				btrfs_finish_ordered_io(ordered);
> +
> +			spin_lock_irq(&ordered_tree->lock);
> +		}
> +		blk++;
> +	}
> +
> +	spin_unlock_irq(&ordered_tree->lock);
> +
> +	if (!inode_evicting)
> +		lock_extent_bits(tree, locked_start, locked_end, 0, NULL);
> +}
> +
> +static int page_blocks_written(struct page *page)
> +{
> +	struct btrfs_ordered_extent *ordered;
> +	struct btrfs_root *root;
> +	struct inode *inode;
> +	unsigned long outstanding_blk;
> +	u64 page_start, page_end;
> +	u64 blk, last_blk, nr_blks;
> +	u64 cur;
> +	u64 len;
> +
> +	inode = page->mapping->host;
> +	root = BTRFS_I(inode)->root;
> +
> +	page_start = page_offset(page);
> +	page_end = page_start + PAGE_CACHE_SIZE - 1;
> +
> +	cur = page_start;
> +	while (cur < page_end) {
> +		ordered = btrfs_lookup_ordered_extent(inode, cur);
> +		if (!ordered) {
> +			cur += root->sectorsize;
> +			continue;
> +		}
> +
> +		blk = (cur - ordered->file_offset)
> +			>> inode->i_sb->s_blocksize_bits;
> +		len = min(page_end, ordered->file_offset + ordered->len - 1)
> +			- cur + 1;
> +		nr_blks = len >> inode->i_sb->s_blocksize_bits;
> +
> +		last_blk = blk + nr_blks - 1;
> +
> +		outstanding_blk = find_next_zero_bit(ordered->blocks_done,
> +						ordered->len >> inode->i_sb->s_blocksize_bits,
> +						blk);
> +		if (outstanding_blk <= last_blk) {
> +			btrfs_put_ordered_extent(ordered);
> +			return 0;
> +		}
> +
> +		btrfs_put_ordered_extent(ordered);
> +		cur += len;
> +	}
> +
> +	return 1;
>  }
>  
>  static void btrfs_invalidatepage(struct page *page, unsigned int offset,
> -				 unsigned int length)
> +				unsigned int length)
>  {
>  	struct inode *inode = page->mapping->host;
> +	struct btrfs_root *root = BTRFS_I(inode)->root;
>  	struct extent_io_tree *tree;
>  	struct btrfs_ordered_extent *ordered;
> -	struct extent_state *cached_state = NULL;
> -	u64 page_start = page_offset(page);
> -	u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
> +	u64 start, end, cur;
> +	u64 page_start, page_end;
>  	int inode_evicting = inode->i_state & I_FREEING;
>  
> +	page_start = page_offset(page);
> +	page_end = page_start + PAGE_CACHE_SIZE - 1;
> +
>  	/*
>  	 * we have the page locked, so new writeback can't start,
>  	 * and the dirty bit won't be cleared while we are here.
> @@ -8515,73 +8694,54 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
>  	wait_on_page_writeback(page);
>  
>  	tree = &BTRFS_I(inode)->io_tree;
> -	if (offset) {
> +
> +	start = round_up(offset, root->sectorsize);
> +	end = round_down(offset + length, root->sectorsize) - 1;
> +	if (end - start + 1 < root->sectorsize) {
>  		btrfs_releasepage(page, GFP_NOFS);
>  		return;
>  	}
>  
> +	start = round_up(page_start + offset, root->sectorsize);
> +	end = round_down(page_start + offset + length,
> +			root->sectorsize) - 1;
> +
>  	if (!inode_evicting)
> -		lock_extent_bits(tree, page_start, page_end, 0, &cached_state);
> -	ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_CACHE_SIZE);
> -	if (ordered) {
> -		/*
> -		 * IO on this page will never be started, so we need
> -		 * to account for any ordered extents now
> -		 */
> -		if (!inode_evicting)
> -			clear_extent_bit(tree, page_start, page_end,
> -					 EXTENT_DIRTY | EXTENT_DELALLOC |
> -					 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
> -					 EXTENT_DEFRAG, 1, 0, &cached_state,
> -					 GFP_NOFS);
> -		/*
> -		 * whoever cleared the private bit is responsible
> -		 * for the finish_ordered_io
> -		 */
> -		if (TestClearPagePrivate2(page)) {
> -			struct btrfs_ordered_inode_tree *tree;
> -			u64 new_len;
> +		lock_extent_bits(tree, start, end, 0, NULL);
>  
> -			tree = &BTRFS_I(inode)->ordered_tree;
> +	cur = start;
> +	while (cur < end) {
> +		ordered = btrfs_lookup_ordered_extent(inode, cur);
> +		if (!ordered) {
> +			cur += root->sectorsize;
> +			continue;
> +		}
>  
> -			spin_lock_irq(&tree->lock);
> -			set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
> -			new_len = page_start - ordered->file_offset;
> -			if (new_len < ordered->truncated_len)
> -				ordered->truncated_len = new_len;
> -			spin_unlock_irq(&tree->lock);
> +		invalidate_ordered_extent_blocks(inode, ordered,
> +						start, end, cur,
> +						inode_evicting);
>  
> -			if (btrfs_dec_test_ordered_pending(inode, &ordered,
> -							   page_start,
> -							   PAGE_CACHE_SIZE, 1))
> -				btrfs_finish_ordered_io(ordered);
> -		}
> +		cur = min(end + 1, ordered->file_offset + ordered->len);
>  		btrfs_put_ordered_extent(ordered);
> -		if (!inode_evicting) {
> -			cached_state = NULL;
> -			lock_extent_bits(tree, page_start, page_end, 0,
> -					 &cached_state);
> -		}
>  	}
>  
> -	if (!inode_evicting) {
> -		clear_extent_bit(tree, page_start, page_end,
> -				 EXTENT_LOCKED | EXTENT_DIRTY |
> -				 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
> -				 EXTENT_DEFRAG, 1, 1,
> -				 &cached_state, GFP_NOFS);
> +	if (page_blocks_written(page))
> +		ClearPagePrivate2(page);
>  
> -		__btrfs_releasepage(page, GFP_NOFS);
> +	if (!inode_evicting) {
> +		clear_extent_bit(tree, start, end,
> +				EXTENT_LOCKED | EXTENT_DIRTY |
> +				EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
> +				EXTENT_DEFRAG, 1, 1, NULL, GFP_NOFS);
>  	}
>  
> -	ClearPageChecked(page);
> -	if (PagePrivate(page)) {
> -		ClearPagePrivate(page);
> -		set_page_private(page, 0);
> -		page_cache_release(page);
> +	if (!offset && length == PAGE_CACHE_SIZE) {
> +		WARN_ON(!__btrfs_releasepage(page, start, end, GFP_NOFS));
> +		ClearPageChecked(page);
>  	}
>  }
>  
> +
>  /*
>   * btrfs_page_mkwrite() is not allowed to change the file size as it gets
>   * called from a page fault handler when a page is first dirtied. Hence we must
> diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
> index 157cc54..8e614ca 100644
> --- a/fs/btrfs/ordered-data.c
> +++ b/fs/btrfs/ordered-data.c
> @@ -189,12 +189,25 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
>  	struct btrfs_ordered_inode_tree *tree;
>  	struct rb_node *node;
>  	struct btrfs_ordered_extent *entry;
> +	u64 nr_longs;
>  
>  	tree = &BTRFS_I(inode)->ordered_tree;
>  	entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS);
>  	if (!entry)
>  		return -ENOMEM;
>  
> +	nr_longs = BITS_TO_LONGS(len >> inode->i_sb->s_blocksize_bits);
> +	if (nr_longs == 1) {
> +		entry->blocks_done = &entry->blocks_bitmap;
> +	} else {
> +		entry->blocks_done = kzalloc(nr_longs * sizeof(unsigned long),
> +					GFP_NOFS);
> +		if (!entry->blocks_done) {
> +			kmem_cache_free(btrfs_ordered_extent_cache, entry);
> +			return -ENOMEM;
> +		}
> +	}
> +
>  	entry->file_offset = file_offset;
>  	entry->start = start;
>  	entry->len = len;
> @@ -553,6 +566,10 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
>  			list_del(&sum->list);
>  			kfree(sum);
>  		}
> +
> +		if (entry->blocks_done != &entry->blocks_bitmap)
> +			kfree(entry->blocks_done);
> +
>  		kmem_cache_free(btrfs_ordered_extent_cache, entry);
>  	}
>  }
> diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
> index e96cd4c..4b3356a 100644
> --- a/fs/btrfs/ordered-data.h
> +++ b/fs/btrfs/ordered-data.h
> @@ -140,6 +140,10 @@ struct btrfs_ordered_extent {
>  	struct completion completion;
>  	struct btrfs_work flush_work;
>  	struct list_head work_list;
> +
> +	/* bitmap to track the blocks that have been written to disk */
> +	unsigned long *blocks_done;
> +	unsigned long blocks_bitmap;
>  };
>  
>  /*
> -- 
> 2.1.0
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html



[Index of Archives]     [Linux Filesystem Development]     [Linux NFS]     [Linux NILFS]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux