|
|
|
[PATCH 3/5] mm: compaction: Capture a suitable high-order page immediately when it is made available | |
| [Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
|
|
While compaction is migrating pages to free up large contiguous blocks for
allocation it races with other allocation requests that may steal these
blocks or break them up. This patch alters direct compaction to capture a
suitable free page as soon as it becomes available to reduce this race. It
uses similar logic to split_free_page() to ensure that watermarks are
still obeyed.
Signed-off-by: Mel Gorman <mgorman@xxxxxxx>
Reviewed-by: Rik van Riel <riel@xxxxxxxxxx>
---
include/linux/compaction.h | 4 +--
include/linux/mm.h | 1 +
mm/compaction.c | 71 +++++++++++++++++++++++++++++++++++++-------
mm/internal.h | 1 +
mm/page_alloc.c | 63 +++++++++++++++++++++++++++++----------
5 files changed, 111 insertions(+), 29 deletions(-)
diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index 51a90b7..5673459 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -22,7 +22,7 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write,
extern int fragmentation_index(struct zone *zone, unsigned int order);
extern unsigned long try_to_compact_pages(struct zonelist *zonelist,
int order, gfp_t gfp_mask, nodemask_t *mask,
- bool sync);
+ bool sync, struct page **page);
extern int compact_pgdat(pg_data_t *pgdat, int order);
extern unsigned long compaction_suitable(struct zone *zone, int order);
@@ -64,7 +64,7 @@ static inline bool compaction_deferred(struct zone *zone, int order)
#else
static inline unsigned long try_to_compact_pages(struct zonelist *zonelist,
int order, gfp_t gfp_mask, nodemask_t *nodemask,
- bool sync)
+ bool sync, struct page **page)
{
return COMPACT_CONTINUE;
}
diff --git a/include/linux/mm.h b/include/linux/mm.h
index b36d08c..0812e86 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -454,6 +454,7 @@ void put_pages_list(struct list_head *pages);
void split_page(struct page *page, unsigned int order);
int split_free_page(struct page *page);
+int capture_free_page(struct page *page, int alloc_order, int migratetype);
/*
* Compound pages have a destructor function. Provide a
diff --git a/mm/compaction.c b/mm/compaction.c
index 95ca967..63af8d2 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -50,6 +50,41 @@ static inline bool migrate_async_suitable(int migratetype)
return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE;
}
+static void compact_capture_page(struct compact_control *cc)
+{
+ unsigned long flags;
+ int mtype;
+
+ if (!cc->page || *cc->page)
+ return;
+
+ /* Speculatively examine the free lists without zone lock */
+ for (mtype = 0; mtype < MIGRATE_PCPTYPES; mtype++) {
+ int order;
+ for (order = cc->order; order < MAX_ORDER; order++) {
+ struct page *page;
+ struct free_area *area;
+ area = &(cc->zone->free_area[order]);
+ if (list_empty(&area->free_list[mtype]))
+ continue;
+
+ /* Take the lock and attempt capture of the page */
+ spin_lock_irqsave(&cc->zone->lock, flags);
+ if (!list_empty(&area->free_list[mtype])) {
+ page = list_entry(area->free_list[mtype].next,
+ struct page, lru);
+ if (capture_free_page(page, cc->order, mtype)) {
+ spin_unlock_irqrestore(&cc->zone->lock,
+ flags);
+ *cc->page = page;
+ return;
+ }
+ }
+ spin_unlock_irqrestore(&cc->zone->lock, flags);
+ }
+ }
+}
+
/*
* Isolate free pages onto a private freelist. Caller must hold zone->lock.
* If @strict is true, will abort returning 0 on any invalid PFNs or non-free
@@ -561,7 +596,6 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
static int compact_finished(struct zone *zone,
struct compact_control *cc)
{
- unsigned int order;
unsigned long watermark;
if (fatal_signal_pending(current))
@@ -586,14 +620,22 @@ static int compact_finished(struct zone *zone,
return COMPACT_CONTINUE;
/* Direct compactor: Is a suitable page free? */
- for (order = cc->order; order < MAX_ORDER; order++) {
- /* Job done if page is free of the right migratetype */
- if (!list_empty(&zone->free_area[order].free_list[cc->migratetype]))
- return COMPACT_PARTIAL;
-
- /* Job done if allocation would set block type */
- if (order >= pageblock_order && zone->free_area[order].nr_free)
+ if (cc->page) {
+ /* Was a suitable page captured? */
+ if (*cc->page)
return COMPACT_PARTIAL;
+ } else {
+ unsigned int order;
+ for (order = cc->order; order < MAX_ORDER; order++) {
+ struct free_area *area = &zone->free_area[cc->order];
+ /* Job done if page is free of the right migratetype */
+ if (!list_empty(&area->free_list[cc->migratetype]))
+ return COMPACT_PARTIAL;
+
+ /* Job done if allocation would set block type */
+ if (cc->order >= pageblock_order && area->nr_free)
+ return COMPACT_PARTIAL;
+ }
}
return COMPACT_CONTINUE;
@@ -708,6 +750,10 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
goto out;
}
}
+
+ /* Capture a page now if it is a suitable size */
+ if (cc->migratetype == MIGRATE_MOVABLE)
+ compact_capture_page(cc);
}
out:
@@ -720,7 +766,7 @@ out:
static unsigned long compact_zone_order(struct zone *zone,
int order, gfp_t gfp_mask,
- bool sync)
+ bool sync, struct page **page)
{
struct compact_control cc = {
.nr_freepages = 0,
@@ -729,6 +775,7 @@ static unsigned long compact_zone_order(struct zone *zone,
.migratetype = allocflags_to_migratetype(gfp_mask),
.zone = zone,
.sync = sync,
+ .page = page,
};
INIT_LIST_HEAD(&cc.freepages);
INIT_LIST_HEAD(&cc.migratepages);
@@ -750,7 +797,7 @@ int sysctl_extfrag_threshold = 500;
*/
unsigned long try_to_compact_pages(struct zonelist *zonelist,
int order, gfp_t gfp_mask, nodemask_t *nodemask,
- bool sync)
+ bool sync, struct page **page)
{
enum zone_type high_zoneidx = gfp_zone(gfp_mask);
int may_enter_fs = gfp_mask & __GFP_FS;
@@ -770,7 +817,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
nodemask) {
int status;
- status = compact_zone_order(zone, order, gfp_mask, sync);
+ status = compact_zone_order(zone, order, gfp_mask, sync, page);
rc = max(status, rc);
/* If a normal allocation would succeed, stop compacting */
@@ -825,6 +872,7 @@ int compact_pgdat(pg_data_t *pgdat, int order)
struct compact_control cc = {
.order = order,
.sync = false,
+ .page = NULL,
};
return __compact_pgdat(pgdat, &cc);
@@ -835,6 +883,7 @@ static int compact_node(int nid)
struct compact_control cc = {
.order = -1,
.sync = true,
+ .page = NULL,
};
return __compact_pgdat(NODE_DATA(nid), &cc);
diff --git a/mm/internal.h b/mm/internal.h
index 2ba87fb..9156714 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -124,6 +124,7 @@ struct compact_control {
int order; /* order a direct compactor needs */
int migratetype; /* MOVABLE, RECLAIMABLE etc */
struct zone *zone;
+ struct page **page; /* Page captured of requested size */
};
unsigned long
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4a4f921..adc3aa8 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1374,16 +1374,11 @@ void split_page(struct page *page, unsigned int order)
}
/*
- * Similar to split_page except the page is already free. As this is only
- * being used for migration, the migratetype of the block also changes.
- * As this is called with interrupts disabled, the caller is responsible
- * for calling arch_alloc_page() and kernel_map_page() after interrupts
- * are enabled.
- *
- * Note: this is probably too low level an operation for use in drivers.
- * Please consult with lkml before using this in your driver.
+ * Similar to the split_page family of functions except that the page
+ * required at the given order and being isolated now to prevent races
+ * with parallel allocators
*/
-int split_free_page(struct page *page)
+int capture_free_page(struct page *page, int alloc_order, int migratetype)
{
unsigned int order;
unsigned long watermark;
@@ -1405,10 +1400,11 @@ int split_free_page(struct page *page)
rmv_page_order(page);
__mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order));
- /* Split into individual pages */
- set_page_refcounted(page);
- split_page(page, order);
+ if (alloc_order != order)
+ expand(zone, page, alloc_order, order,
+ &zone->free_area[order], migratetype);
+ /* Set the pageblock if the captured page is at least a pageblock */
if (order >= pageblock_order - 1) {
struct page *endpage = page + (1 << order) - 1;
for (; page < endpage; page += pageblock_nr_pages) {
@@ -1419,7 +1415,35 @@ int split_free_page(struct page *page)
}
}
- return 1 << order;
+ return 1UL << order;
+}
+
+/*
+ * Similar to split_page except the page is already free. As this is only
+ * being used for migration, the migratetype of the block also changes.
+ * As this is called with interrupts disabled, the caller is responsible
+ * for calling arch_alloc_page() and kernel_map_page() after interrupts
+ * are enabled.
+ *
+ * Note: this is probably too low level an operation for use in drivers.
+ * Please consult with lkml before using this in your driver.
+ */
+int split_free_page(struct page *page)
+{
+ unsigned int order;
+ int nr_pages;
+
+ BUG_ON(!PageBuddy(page));
+ order = page_order(page);
+
+ nr_pages = capture_free_page(page, order, 0);
+ if (!nr_pages)
+ return 0;
+
+ /* Split into individual pages */
+ set_page_refcounted(page);
+ split_page(page, order);
+ return nr_pages;
}
/*
@@ -2065,7 +2089,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
bool *deferred_compaction,
unsigned long *did_some_progress)
{
- struct page *page;
+ struct page *page = NULL;
if (!order)
return NULL;
@@ -2077,10 +2101,16 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
current->flags |= PF_MEMALLOC;
*did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
- nodemask, sync_migration);
+ nodemask, sync_migration, &page);
current->flags &= ~PF_MEMALLOC;
- if (*did_some_progress != COMPACT_SKIPPED) {
+ /* If compaction captured a page, prep and use it */
+ if (page) {
+ prep_new_page(page, order, gfp_mask);
+ goto got_page;
+ }
+
+ if (*did_some_progress != COMPACT_SKIPPED) {
/* Page migration frees to the PCP lists but we want merging */
drain_pages(get_cpu());
put_cpu();
@@ -2090,6 +2120,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
alloc_flags, preferred_zone,
migratetype);
if (page) {
+got_page:
preferred_zone->compact_considered = 0;
preferred_zone->compact_defer_shift = 0;
if (order >= preferred_zone->compact_order_failed)
--
1.7.9.2
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
[Other Archives] [Linux Kernel Newbies] [Linux Driver Development] [Linux Kbuild] [Fedora Kernel] [Linux Kernel Testers] [Linux SH] [Linux Omap] [Linux Tape] [Linux Input] [Linux Kernel Janitors] [Linux Kernel Packagers] [Linux Doc] [Linux Man Pages] [Linux API] [Linux Memory Management] [Linux Modules] [Linux Standards] [Kernel Announce] [Netdev] [Git] [Linux PCI] Linux CAN Development [Linux I2C] [Linux RDMA] [Linux NUMA] [Netfilter] [Netfilter Devel] [SELinux] [Bugtraq] [FIO] [Linux Perf Users] [Linux Serial] [Linux PPP] [Linux ISDN] [Linux Next] [Kernel Stable Commits] [Linux Tip Commits] [Kernel MM Commits] [Linux Security Module] [AutoFS] [Filesystem Development] [Ext3 Filesystem] [Linux bcache] [Ext4 Filesystem] [Linux BTRFS] [Linux CEPH Filesystem] [Linux XFS] [XFS] [Linux NFS] [Linux CIFS] [Ecryptfs] [Linux NILFS] [Linux Cachefs] [Reiser FS] [Initramfs] [Linux FB Devel] [Linux OpenGL] [DRI Devel] [Fastboot] [Linux RT Users] [Linux RT Stable] [eCos] [Corosync] [Linux Clusters] [LVS Devel] [Hot Plug] [Linux Virtualization] [KVM] [KVM PPC] [KVM ia64] [Linux Containers] [Linux Hexagon] [Linux Cgroups] [Util Linux] [Wireless] [Linux Bluetooth] [Bluez Devel] [Ethernet Bridging] [Embedded Linux] [Barebox] [Linux MMC] [Linux IIO] [Sparse] [Smatch] [Linux Arch] [x86 Platform Driver] [Linux ACPI] [Linux IBM ACPI] [LM Sensors] [CPU Freq] [Linux Power Management] [Linmodems] [Linux DCCP] [Linux SCTP] [ALSA Devel] [Linux USB] [Linux PA RISC] [Linux Samsung SOC] [MIPS Linux] [IBM S/390 Linux] [ARM Linux] [ARM Kernel] [ARM MSM] [Tegra Devel] [Sparc Linux] [Linux Security] [Linux Sound] [Linux Media] [Video 4 Linux] [Linux IRDA Users] [Linux for the blind] [Linux RAID] [Linux ATA RAID] [Device Mapper] [Linux SCSI] [SCSI Target Devel] [Linux SCSI Target Infrastructure] [Linux IDE] [Linux SMP] [Linux AXP] [Linux Alpha] [Linux M68K] [Linux ia64] [Linux 8086] [Linux x86_64] [Linux Config] [Linux Apps] [Linux MSDOS] [Linux X.25] [Linux Crypto] [DM Crypt] [Linux Trace Users] [Linux Btrace] [Linux Watchdog] [Utrace Devel] [Linux C Programming] [Linux Assembly] [Dash] [DWARVES] [Hail Devel] [Linux Kernel Debugger] [Linux gcc] [Gcc Help] [X.Org] [Wine]
![]() |
![]() |
[Older Kernel Discussion] [Yosemite National Park Forum] [Large Format Photos] [Gimp] [Yosemite Photos] [Stuff]