From: Timofey Titovets <nefelim4ag@xxxxxxxxx>
Currently btrfs raid1/10 balances read requests across mirrors
based on pid % number of mirrors.

Add logic to also consider:
- whether one of the devices is non-rotational
- the queue length of each device

By default the pid % num_mirrors guess is still used, but:
- if one of the mirrors is non-rotational,
  requests are rerouted to the non-rotational device
- if the other mirror has a shorter queue than the current
  optimal one, the request is repicked to that mirror

To avoid degenerating into round-robin request balancing,
use the absolute difference of the queue lengths and only try
to rebalance when the difference exceeds a threshold
(sketched below):
- threshold 8 when a rotational device is involved
- threshold 2 when both devices are non-rotational
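
To make the decision flow concrete, here is a condensed standalone
userspace sketch of that selection logic (illustrative only:
pick_mirror() and its arguments are names made up for this example;
the real implementation is guess_optimal() in the patch below):

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

/* Return the index (0 or 1) of the mirror to read from. */
static int pick_mirror(const unsigned int qlen[2], const bool nonrot[2],
		       int fallback)
{
	/* Rotational media dislike extra seeks, so rebalance less eagerly. */
	int threshold = 8;
	int optimal = fallback;			/* normally pid % num_mirrors */

	if (nonrot[0] != nonrot[1])		/* mixed SSD/HDD pair */
		optimal = nonrot[0] ? 0 : 1;
	else if (nonrot[0])			/* both non-rotational */
		threshold = 2;

	/* Queues are about equally loaded: keep the current choice. */
	if (abs((int)qlen[0] - (int)qlen[1]) <= threshold)
		return optimal;

	/* Otherwise send the read to the mirror with the shorter queue. */
	return qlen[0] > qlen[1] ? 1 : 0;
}

int main(void)
{
	unsigned int qlen[2] = { 1, 14 };	/* in-flight requests per mirror */
	bool nonrot[2] = { false, false };	/* two rotational disks */

	/* 14 - 1 > 8, so mirror 0 (shorter queue) wins over the pid guess. */
	printf("picked mirror %d\n", pick_mirror(qlen, nonrot, 1));
	return 0;
}
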
Some bench results from the mailing list
(Dmitrii Tcvetkov <demfloro@xxxxxxxxxxx>):

Benchmark summary (arithmetic mean of 3 runs):

       |  Mainline  |   Patch
-------+------------+-----------
RAID1  | 18.9 MiB/s | 26.5 MiB/s
RAID10 | 30.7 MiB/s | 30.7 MiB/s
------------------------------------------------------------------------
mainline, fio got lucky and read from the first HDD (quite a slow one):
Jobs: 1 (f=1): [r(1)][100.0%][r=8456KiB/s,w=0KiB/s][r=264,w=0 IOPS]
read: IOPS=265, BW=8508KiB/s (8712kB/s)(499MiB/60070msec)
lat (msec): min=2, max=825, avg=60.17, stdev=65.06
------------------------------------------------------------------------
mainline, fio got lucky and read from the second HDD (a much more modern one):
Jobs: 1 (f=1): [r(1)][8.7%][r=11.9MiB/s,w=0KiB/s][r=380,w=0 IOPS]
read: IOPS=378, BW=11.8MiB/s (12.4MB/s)(710MiB/60051msec)
lat (usec): min=416, max=644286, avg=42312.74, stdev=48518.56
------------------------------------------------------------------------
mainline, fio got lucky and read from the SSD:
Jobs: 1 (f=1): [r(1)][100.0%][r=436MiB/s,w=0KiB/s][r=13.9k,w=0 IOPS]
read: IOPS=13.9k, BW=433MiB/s (454MB/s)(25.4GiB/60002msec)
lat (usec): min=343, max=16319, avg=1152.52, stdev=245.36
------------------------------------------------------------------------
With the patch, 2 HDDs:
Jobs: 1 (f=1): [r(1)][100.0%][r=17.5MiB/s,w=0KiB/s][r=560,w=0 IOPS]
read: IOPS=560, BW=17.5MiB/s (18.4MB/s)(1053MiB/60052msec)
lat (usec): min=435, max=341037, avg=28511.64, stdev=30000.14
------------------------------------------------------------------------
With the patch, HDD (the old one) + SSD:
Jobs: 1 (f=1): [r(1)][100.0%][r=371MiB/s,w=0KiB/s][r=11.9k,w=0 IOPS]
read: IOPS=11.6k, BW=361MiB/s (379MB/s)(21.2GiB/60084msec)
lat (usec): min=363, max=346752, avg=1381.73, stdev=6948.32
Changes:
v1 -> v2:
- Use the part_in_flight() helper from genhd.c
  to get the queue length
- Move the guessing code to guess_optimal()
- Change the balancer logic: use pid % num_mirrors by default,
  and only rebalance on spinning rust if one of the underlying
  devices is overloaded
v2 -> v3:
- Fix the argument for RAID10 - use sub_stripes instead of num_stripes
v3 -> v4:
- Rebase on latest misc-next
v4 -> v5:
- Rebase on latest misc-next
v5 -> v6:
- Fix spelling
- Include bench results
v6 -> v7:
- Fixes based on Nikolay Borisov's review:
* Assume num == 2
* Remove "for" loop based on that assumption, where possible
v7 -> v8:
- Add a comment about the magic number '2' in the guess function
v8 -> v9:
- Rebase on latest misc-next
- Simplify code
- Use abs() instead of round_down() for the approximation,
  as abs() is more fair
Signed-off-by: Timofey Titovets <nefelim4ag@xxxxxxxxx>
Tested-by: Dmitrii Tcvetkov <demfloro@xxxxxxxxxxx>
---
block/genhd.c | 1 +
fs/btrfs/volumes.c | 88 +++++++++++++++++++++++++++++++++++++++++++++-
2 files changed, 88 insertions(+), 1 deletion(-)
diff --git a/block/genhd.c b/block/genhd.c
index 703267865f14..fb35c85a7f42 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -84,6 +84,7 @@ unsigned int part_in_flight(struct request_queue *q, struct hd_struct *part)
return inflight;
}
+EXPORT_SYMBOL_GPL(part_in_flight);
void part_in_flight_rw(struct request_queue *q, struct hd_struct *part,
unsigned int inflight[2])
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 1c2a6e4b39da..8671c2bdced6 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -13,6 +13,7 @@
#include <linux/raid/pq.h>
#include <linux/semaphore.h>
#include <linux/uuid.h>
+#include <linux/genhd.h>
#include <linux/list_sort.h>
#include "ctree.h"
#include "extent_map.h"
@@ -29,6 +30,8 @@
#include "sysfs.h"
#include "tree-checker.h"
+#define BTRFS_RAID_1_10_MAX_MIRRORS 2
+
const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
[BTRFS_RAID_RAID10] = {
.sub_stripes = 2,
@@ -5482,6 +5485,88 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
return ret;
}
+/**
+ * bdev_get_queue_len - return the in-flight request count of bdev
+ *
+ * @bdev: target bdev
+ */
+static uint32_t bdev_get_queue_len(struct block_device *bdev)
+{
+	struct hd_struct *bd_part = bdev->bd_part;
+	struct request_queue *rq = bdev_get_queue(bdev);
+
+	return part_in_flight(rq, bd_part);
+}
+
+/**
+ * guess_optimal - return guessed optimal mirror
+ *
+ * The optimal mirror is expected to be pid % num_stripes. That is
+ * generally fine for spreading the load; this balancer refines the
+ * choice based on the queue length of each device.
+ *
+ * Basic ideas:
+ * - Sequential reads generate a low number of requests, so if the
+ *   drives are equally loaded, keep the pid % num_stripes choice
+ * - For mixed rotational/non-rotational mirrors, pick the
+ *   non-rotational one as optimal, and repick if the other device
+ *   has a significantly shorter queue
+ * - Repick the optimal mirror if the other mirror's queue is
+ *   significantly shorter
+ */
+static int guess_optimal(struct map_lookup *map, int num, int optimal)
+{
+	int i;
+	/*
+	 * Spinning rust does not like random requests,
+	 * so rebalance it less aggressively.
+	 */
+	uint32_t rq_len_overload = 8;
+	uint32_t qlen[2];
+	bool is_nonrot[2];
+	struct block_device *bdev;
+
+	/* This function is supposed to work with up to 2 mirrors */
+	ASSERT(BTRFS_RAID_1_10_MAX_MIRRORS == 2);
+	ASSERT(BTRFS_RAID_1_10_MAX_MIRRORS == num);
+
+	/* Check that both bdevs are accessible */
+	for (i = 0; i < 2; i++) {
+		bdev = map->stripes[i].dev->bdev;
+		/*
+		 * Don't bother with further computation
+		 * if only one of the two bdevs is accessible
+		 */
+		if (!bdev)
+			return (i + 1) % 2;
+
+		qlen[i] = bdev_get_queue_len(bdev);
+		is_nonrot[i] = blk_queue_nonrot(bdev_get_queue(bdev));
+	}
+
+	/* For the mixed case, pick the non-rotational dev as optimal */
+	if (is_nonrot[0] != is_nonrot[1]) {
+		if (is_nonrot[0])
+			optimal = 0;
+		else
+			optimal = 1;
+	} else {
+		/* For non-rotational devices balancing can be aggressive */
+		if (is_nonrot[0])
+			rq_len_overload = 2;
+	}
+
+	/* Devices are loaded at the same level */
+	if (abs(qlen[0] - qlen[1]) <= rq_len_overload)
+		return optimal;
+
+	if (qlen[0] > qlen[1])
+		optimal = 1;
+	else
+		optimal = 0;
+
+	return optimal;
+}
+
static int find_live_mirror(struct btrfs_fs_info *fs_info,
struct map_lookup *map, int first,
int dev_replace_is_ongoing)
@@ -5500,7 +5585,8 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
else
num_stripes = map->num_stripes;
-	preferred_mirror = first + current->pid % num_stripes;
+	preferred_mirror = first + guess_optimal(map, num_stripes,
+						 current->pid % num_stripes);
if (dev_replace_is_ongoing &&
fs_info->dev_replace.cont_reading_from_srcdev_mode ==
--
2.21.0