Re: [RFC PATCH V11 02/21] Btrfs: subpagesize-blocksize: Fix whole page write.

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Mon, Jun 01, 2015 at 08:52:37PM +0530, Chandan Rajendra wrote:
> For the subpagesize-blocksize scenario, a page can contain multiple
> blocks. In such cases, this patch handles writing data to files.
> 
> Also, When setting EXTENT_DELALLOC, we no longer set EXTENT_UPTODATE bit on
> the extent_io_tree since uptodate status is being tracked by the bitmap
> pointed to by page->private.

To be honest, I'm not sure why we set the EXTENT_UPTODATE bit for data, as we
don't check for that bit at all for now. Correct me if I'm wrong.

> 
> Signed-off-by: Chandan Rajendra <chandan@xxxxxxxxxxxxxxxxxx>
> ---
>  fs/btrfs/extent_io.c | 141 +++++++++++++++++++++++----------------------------
>  fs/btrfs/file.c      |  16 ++++++
>  fs/btrfs/inode.c     |  58 ++++++++++++++++-----
>  3 files changed, 125 insertions(+), 90 deletions(-)
> 
> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
> index d37badb..3736ab5 100644
> --- a/fs/btrfs/extent_io.c
> +++ b/fs/btrfs/extent_io.c
> @@ -1283,9 +1283,8 @@ int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
>  int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
>  			struct extent_state **cached_state, gfp_t mask)
>  {
> -	return set_extent_bit(tree, start, end,
> -			      EXTENT_DELALLOC | EXTENT_UPTODATE,
> -			      NULL, cached_state, mask);
> +	return set_extent_bit(tree, start, end, EXTENT_DELALLOC,
> +			NULL, cached_state, mask);
>  }
>  
>  int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end,
> @@ -1498,25 +1497,6 @@ int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
>  	return 0;
>  }
>  
> -/*
> - * helper function to set both pages and extents in the tree writeback
> - */
> -static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
> -{
> -	unsigned long index = start >> PAGE_CACHE_SHIFT;
> -	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
> -	struct page *page;
> -
> -	while (index <= end_index) {
> -		page = find_get_page(tree->mapping, index);
> -		BUG_ON(!page); /* Pages should be in the extent_io_tree */
> -		set_page_writeback(page);
> -		page_cache_release(page);
> -		index++;
> -	}
> -	return 0;
> -}
> -
>  /* find the first state struct with 'bits' set after 'start', and
>   * return it.  tree->lock must be held.  NULL will returned if
>   * nothing was found after 'start'
> @@ -2080,6 +2060,14 @@ static int page_read_complete(struct page *page)
>  	return !test_page_blks_state(page, BLK_STATE_IO, start, end, 0);
>  }
>  
> +static int page_write_complete(struct page *page)
> +{
> +	u64 start = page_offset(page);
> +	u64 end = start + PAGE_CACHE_SIZE - 1;
> +
> +	return !test_page_blks_state(page, BLK_STATE_IO, start, end, 0);
> +}
> +
>  int free_io_failure(struct inode *inode, struct io_failure_record *rec)
>  {
>  	int ret;
> @@ -2575,38 +2563,37 @@ int end_extent_writepage(struct page *page, int err, u64 start, u64 end)
>   */
>  static void end_bio_extent_writepage(struct bio *bio, int err)
>  {
> +	struct btrfs_page_private *pg_private;
>  	struct bio_vec *bvec;
> +	unsigned long flags;
>  	u64 start;
>  	u64 end;
> +	int clear_writeback;
>  	int i;
>  
>  	bio_for_each_segment_all(bvec, bio, i) {
>  		struct page *page = bvec->bv_page;
>  
> -		/* We always issue full-page reads, but if some block
> -		 * in a page fails to read, blk_update_request() will
> -		 * advance bv_offset and adjust bv_len to compensate.
> -		 * Print a warning for nonzero offsets, and an error
> -		 * if they don't add up to a full page.  */
> -		if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) {
> -			if (bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE)
> -				btrfs_err(BTRFS_I(page->mapping->host)->root->fs_info,
> -				   "partial page write in btrfs with offset %u and length %u",
> -					bvec->bv_offset, bvec->bv_len);
> -			else
> -				btrfs_info(BTRFS_I(page->mapping->host)->root->fs_info,
> -				   "incomplete page write in btrfs with offset %u and "
> -				   "length %u",
> -					bvec->bv_offset, bvec->bv_len);
> -		}
> +		start = page_offset(page) + bvec->bv_offset;
> +		end = start + bvec->bv_len - 1;
>  
> -		start = page_offset(page);
> -		end = start + bvec->bv_offset + bvec->bv_len - 1;
> +		pg_private = (struct btrfs_page_private *)page->private;
> +
> +		spin_lock_irqsave(&pg_private->io_lock, flags);
>  
> -		if (end_extent_writepage(page, err, start, end))
> +		if (end_extent_writepage(page, err, start, end)) {
> +			spin_unlock_irqrestore(&pg_private->io_lock, flags);
>  			continue;
> +		}
>  
> -		end_page_writeback(page);
> +		clear_page_blks_state(page, 1 << BLK_STATE_IO, start, end);
> +
> +		clear_writeback = page_write_complete(page);
> +
> +		spin_unlock_irqrestore(&pg_private->io_lock, flags);
> +
> +		if (clear_writeback)
> +			end_page_writeback(page);
>  	}
>  
>  	bio_put(bio);
> @@ -3417,10 +3404,9 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
>  	u64 block_start;
>  	u64 iosize;
>  	sector_t sector;
> -	struct extent_state *cached_state = NULL;
>  	struct extent_map *em;
>  	struct block_device *bdev;
> -	size_t pg_offset = 0;
> +	size_t pg_offset;
>  	size_t blocksize;
>  	int ret = 0;
>  	int nr = 0;
> @@ -3467,8 +3453,16 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
>  							 page_end, NULL, 1);
>  			break;
>  		}
> -		em = epd->get_extent(inode, page, pg_offset, cur,
> -				     end - cur + 1, 1);
> +
> +		pg_offset = cur & (PAGE_CACHE_SIZE - 1);
> +
> +		if (!test_page_blks_state(page, BLK_STATE_DIRTY, cur,
> +						cur + blocksize - 1, 1)) {
> +			cur += blocksize;
> +			continue;
> +		}

If we don't check this, the get_extent() call below will return a hole
(block_start == EXTENT_MAP_HOLE) and we can still move on to the next block, so
we don't need to maintain this BLK_STATE_DIRTY bit all the time.

> +
> +		em = epd->get_extent(inode, page, pg_offset, cur, blocksize, 1);
>  		if (IS_ERR_OR_NULL(em)) {
>  			SetPageError(page);
>  			ret = PTR_ERR_OR_ZERO(em);
> @@ -3479,7 +3473,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
>  		em_end = extent_map_end(em);
>  		BUG_ON(em_end <= cur);
>  		BUG_ON(end < cur);
> -		iosize = min(em_end - cur, end - cur + 1);
> +		iosize = min_t(u64, em_end - cur, blocksize);
>  		iosize = ALIGN(iosize, blocksize);

This limits us to writing one block per loop iteration; if two blocks are
contiguous, it should be fine to write them together.

>  		sector = (em->block_start + extent_offset) >> 9;
>  		bdev = em->bdev;
> @@ -3488,32 +3482,20 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
>  		free_extent_map(em);
>  		em = NULL;
>  
> -		/*
> -		 * compressed and inline extents are written through other
> -		 * paths in the FS
> -		 */
> -		if (compressed || block_start == EXTENT_MAP_HOLE ||
> -		    block_start == EXTENT_MAP_INLINE) {
> -			/*
> -			 * end_io notification does not happen here for
> -			 * compressed extents
> -			 */
> -			if (!compressed && tree->ops &&
> -			    tree->ops->writepage_end_io_hook)
> -				tree->ops->writepage_end_io_hook(page, cur,
> -							 cur + iosize - 1,
> -							 NULL, 1);
> -			else if (compressed) {
> -				/* we don't want to end_page_writeback on
> -				 * a compressed extent.  this happens
> -				 * elsewhere
> -				 */
> -				nr++;
> -			}
> +		BUG_ON(compressed);
> +		BUG_ON(block_start == EXTENT_MAP_INLINE);
>  
> -			cur += iosize;
> -			pg_offset += iosize;
> -			continue;
> +		if (block_start == EXTENT_MAP_HOLE) {
> +			if (test_page_blks_state(page, BLK_STATE_UPTODATE, cur,
> +							cur + iosize - 1, 1)) {
> +				clear_page_blks_state(page,
> +						1 << BLK_STATE_DIRTY, cur,
> +						cur + iosize - 1);
> +				cur += iosize;
> +				continue;
> +			} else {
> +				BUG();
> +			}
>  		}
>  
>  		if (tree->ops && tree->ops->writepage_io_hook) {
> @@ -3527,7 +3509,13 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
>  		} else {
>  			unsigned long max_nr = (i_size >> PAGE_CACHE_SHIFT) + 1;
>  
> -			set_range_writeback(tree, cur, cur + iosize - 1);
> +			clear_page_blks_state(page, 1 << BLK_STATE_DIRTY, cur,
> +					cur + iosize - 1);
> +			set_page_writeback(page);
> +
> +			set_page_blks_state(page, 1 << BLK_STATE_IO, cur,
> +					cur + iosize - 1);
> +
>  			if (!PageWriteback(page)) {
>  				btrfs_err(BTRFS_I(inode)->root->fs_info,
>  					   "page %lu not writeback, cur %llu end %llu",
> @@ -3542,17 +3530,14 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
>  			if (ret)
>  				SetPageError(page);
>  		}
> -		cur = cur + iosize;
> -		pg_offset += iosize;
> +
> +		cur += iosize;
>  		nr++;
>  	}
>  done:
>  	*nr_ret = nr;
>  
>  done_unlocked:
> -
> -	/* drop our reference on any cached states */
> -	free_extent_state(cached_state);
>  	return ret;
>  }
>  
> diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
> index 23b6e03..cbe6381 100644
> --- a/fs/btrfs/file.c
> +++ b/fs/btrfs/file.c
> @@ -495,6 +495,9 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
>  	u64 num_bytes;
>  	u64 start_pos;
>  	u64 end_of_last_block;
> +	u64 start;
> +	u64 end;
> +	u64 page_end;
>  	u64 end_pos = pos + write_bytes;
>  	loff_t isize = i_size_read(inode);
>  
> @@ -507,11 +510,24 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
>  	if (err)
>  		return err;
>  
> +	start = start_pos;
> +
>  	for (i = 0; i < num_pages; i++) {
>  		struct page *p = pages[i];
>  		SetPageUptodate(p);
>  		ClearPageChecked(p);
> +
> +		end = page_end = page_offset(p) + PAGE_CACHE_SIZE - 1;
> +
> +		if (i == num_pages - 1)
> +			end = min_t(u64, page_end, end_of_last_block);
> +
> +		set_page_blks_state(p,
> +				1 << BLK_STATE_DIRTY | 1 << BLK_STATE_UPTODATE,
> +				start, end);
>  		set_page_dirty(p);
> +
> +		start = page_end + 1;

This is not the usual way; page_end is unnecessary, and (start += PAGE_CACHE_SIZE) should work.

>  	}
>  
>  	/*
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index 8262f83..ac6a3f3 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -1995,6 +1995,11 @@ again:
>  	 }
>  
>  	btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state);
> +
> +	set_page_blks_state(page,
> +			1 << BLK_STATE_DIRTY | 1 << BLK_STATE_UPTODATE,
> +			page_start, page_end);
> +
>  	ClearPageChecked(page);
>  	set_page_dirty(page);
>  out:
> @@ -2984,26 +2989,48 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
>  	struct btrfs_ordered_extent *ordered_extent = NULL;
>  	struct btrfs_workqueue *wq;
>  	btrfs_work_func_t func;
> +	u64 ordered_start, ordered_end;
> +	int done;
>  
>  	trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
>  
>  	ClearPagePrivate2(page);
> -	if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
> -					    end - start + 1, uptodate))
> -		return 0;
> +loop:
> +	ordered_extent = btrfs_lookup_ordered_range(inode, start,
> +						end - start + 1);
> +	if (!ordered_extent)
> +		goto out;
>  
> -	if (btrfs_is_free_space_inode(inode)) {
> -		wq = root->fs_info->endio_freespace_worker;
> -		func = btrfs_freespace_write_helper;
> -	} else {
> -		wq = root->fs_info->endio_write_workers;
> -		func = btrfs_endio_write_helper;
> +	ordered_start = max_t(u64, start, ordered_extent->file_offset);
> +	ordered_end = min_t(u64, end,
> +			ordered_extent->file_offset + ordered_extent->len - 1);
> +
> +	done = btrfs_dec_test_ordered_pending(inode, &ordered_extent,
> +					ordered_start,
> +					ordered_end - ordered_start + 1,
> +					uptodate);
> +	if (done) {
> +		if (btrfs_is_free_space_inode(inode)) {
> +			wq = root->fs_info->endio_freespace_worker;
> +			func = btrfs_freespace_write_helper;
> +		} else {
> +			wq = root->fs_info->endio_write_workers;
> +			func = btrfs_endio_write_helper;
> +		}
> +
> +		btrfs_init_work(&ordered_extent->work, func,
> +				finish_ordered_fn, NULL, NULL);
> +		btrfs_queue_work(wq, &ordered_extent->work);
>  	}
>  
> -	btrfs_init_work(&ordered_extent->work, func, finish_ordered_fn, NULL,
> -			NULL);
> -	btrfs_queue_work(wq, &ordered_extent->work);
> +	btrfs_put_ordered_extent(ordered_extent);
> +
> +	start = ordered_end + 1;
> +
> +	if (start < end)
> +		goto loop;
>  
> +out:

I see this patch puts a BUG_ON(block_start == EXTENT_MAP_INLINE); in writepage(),
but I didn't see any code disabling inline data in patch 01 or patch 02.
Anyway, I think we can avoid the above search for ordered_extents within a
single page if we enable inline data.

Thanks,

-liubo

>  	return 0;
>  }
>  
> @@ -4601,6 +4628,9 @@ again:
>  		goto out_unlock;
>  	}
>  
> +	set_page_blks_state(page, 1 << BLK_STATE_DIRTY | 1 << BLK_STATE_UPTODATE,
> +			page_start, page_end);
> +
>  	if (offset != PAGE_CACHE_SIZE) {
>  		if (!len)
>  			len = PAGE_CACHE_SIZE - offset;
> @@ -8590,6 +8620,10 @@ again:
>  		ret = VM_FAULT_SIGBUS;
>  		goto out_unlock;
>  	}
> +
> +	set_page_blks_state(page, 1 << BLK_STATE_DIRTY | 1 << BLK_STATE_UPTODATE,
> +			page_start, end);
> +
>  	ret = 0;
>  
>  	/* page is wholly or partially inside EOF */
> -- 
> 2.1.0
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majordomo@xxxxxxxxxxxxxxx
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html



[Index of Archives]     [Linux Filesystem Development]     [Linux NFS]     [Linux NILFS]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux