[PATCH] [63/66] x86_64: Reserve SRAT hotadd memory on x86-64

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



From: Keith Mannthey, Andi Kleen 

Implement memory hotadd without sparsemem. The memory in the SRAT
hotadd area is just preserved instead and can be activated later.

There are a few restrictions:
- Only one continuous hotadd area allowed per node

The main problem is dealing with the many buggy SRAT tables
that are out there. The strategy here is to reject anything
suspicious.

Originally from Keith Mannthey, with several hacks and changes by AK.

[TBD:  Problems pointed out by 
KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx>:
1) Goto's rebuild_zonelist patch will not work if CONFIG_MEMORY_HOTPLUG=n.
   Rebuilding the zonelist is necessary when the system has only memory < 4G at
   boot and then hot-adds memory > 4G, because x86_64 has DMA32: ZONE_NORMAL is
   not included in the zonelist at boot time if the system doesn't have memory
   >4G at boot.
[AK: should just force the higher zones at boot time when SRAT tells us]

2) zone and node's spanned_pages and present_pages are not incremented. You
   should do so.

For example, our server (ia64/Fujitsu PrimeQuest) can equip memory from
4G to 1T (maybe 2T in future), and SRAT will *always* say we have a possible 1T
of memory.
(Microsoft requires "write all possible memory in SRAT".)
When we reserve memmap for a possible 1T of memory, Linux will not work well in
a minimum 4G configuration ;)
[AK: needs limiting to 5-10% of max memory]]

Signed-off-by: Andi Kleen <ak@xxxxxxx>

---
 Documentation/x86_64/boot-options.txt |    3 
 arch/x86_64/mm/init.c                 |    3 
 arch/x86_64/mm/numa.c                 |    3 
 arch/x86_64/mm/srat.c                 |  109 ++++++++++++++++++++++++++++++++--
 include/asm-x86_64/numa.h             |    2 
 5 files changed, 113 insertions(+), 7 deletions(-)

Index: linux/arch/x86_64/mm/numa.c
===================================================================
--- linux.orig/arch/x86_64/mm/numa.c
+++ linux/arch/x86_64/mm/numa.c
@@ -142,6 +142,7 @@ void __init setup_node_bootmem(int nodei
 
 	reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size); 
 	reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT);
+	srat_reserve_add_area(nodeid);
 	node_set_online(nodeid);
 } 
 
@@ -335,6 +336,8 @@ __init int numa_setup(char *opt) 
 #ifdef CONFIG_ACPI_NUMA
  	if (!strncmp(opt,"noacpi",6))
  		acpi_numa = -1;
+	if (!strncmp(opt,"ignorehotadd",13))
+		ignore_hotadd = 1;
 #endif
 	return 1;
 } 
Index: linux/arch/x86_64/mm/srat.c
===================================================================
--- linux.orig/arch/x86_64/mm/srat.c
+++ linux/arch/x86_64/mm/srat.c
@@ -15,15 +15,28 @@
 #include <linux/bitmap.h>
 #include <linux/module.h>
 #include <linux/topology.h>
+#include <linux/bootmem.h>
+#include <linux/mm.h>
 #include <asm/proto.h>
 #include <asm/numa.h>
 #include <asm/e820.h>
 
+#if (defined(CONFIG_ACPI_HOTPLUG_MEMORY) || \
+	defined(CONFIG_ACPI_HOTPLUG_MEMORY_MODULE)) \
+		&& !defined(CONFIG_MEMORY_HOTPLUG)
+#define RESERVE_HOTADD 1
+#endif
+
 static struct acpi_table_slit *acpi_slit;
 
 static nodemask_t nodes_parsed __initdata;
 static nodemask_t nodes_found __initdata;
 static struct bootnode nodes[MAX_NUMNODES] __initdata;
+#ifdef RESERVE_HOTADD 
+static struct bootnode nodes_add[MAX_NUMNODES] __initdata;
+static int found_add_area __initdata;
+#endif
+int ignore_hotadd __initdata;
 static u8 pxm2node[256] = { [0 ... 255] = 0xff };
 
 /* Too small nodes confuse the VM badly. Usually they result
@@ -71,6 +84,12 @@ static __init int conflicting_nodes(unsi
 static __init void cutoff_node(int i, unsigned long start, unsigned long end)
 {
 	struct bootnode *nd = &nodes[i];
+
+#ifdef RESERVE_HOTADD
+	if (found_add_area)
+		return;
+#endif
+
 	if (nd->start < start) {
 		nd->start = start;
 		if (nd->end < nd->start)
@@ -90,6 +109,10 @@ static __init void bad_srat(void)
 	acpi_numa = -1;
 	for (i = 0; i < MAX_LOCAL_APIC; i++)
 		apicid_to_node[i] = NUMA_NO_NODE;
+#ifdef RESERVE_HOTADD
+	for (i = 0; i < MAX_NUMNODES; i++)
+		nodes_add[i].start = nodes[i].end = 0;
+#endif
 }
 
 static __init inline int srat_disabled(void)
@@ -155,6 +178,53 @@ acpi_numa_processor_affinity_init(struct
 	       pxm, pa->apic_id, node);
 }
 
+#ifdef RESERVE_HOTADD
+/*
+ * It is fine to add this area to the nodes data it will be used later
+ * This code supports one contigious hot add area per node.
+ */
+static int reserve_hotadd(int node, unsigned long start, unsigned long end)
+{
+	unsigned long s_pfn = start >> PAGE_SHIFT;
+	unsigned long e_pfn = end >> PAGE_SHIFT;
+
+	/* I had some trouble with strange memory hotadd regions breaking
+	   the boot. Be very strict here and restrict anything unexpected. 
+	   If you want working memory hotadd write correct SRATs.
+	
+	   The node size check is a basic sanity check to guard against 
+	   mistakes */
+	if ((signed long)(end - start) < NODE_MIN_SIZE) { 
+		printk(KERN_ERR "SRAT: Hotplug area too small\n");
+		return -1;
+	}			
+
+	if (e820_hole_size(s_pfn, e_pfn) != e_pfn - s_pfn) { 
+		printk(KERN_ERR "SRAT: Hotplug area has existing memory\n");
+		return -1;
+	}
+
+	/* Looks good */
+
+ 	found_add_area = 1;
+	if (nodes_add[node].start == nodes_add[node].end) {
+ 		nodes_add[node].start = start;
+ 		nodes_add[node].end = end;
+ 	} else {
+ 		if (nodes_add[node].start == end)
+ 			nodes_add[node].start = start;
+ 		if (nodes_add[node].end == start)
+ 			nodes_add[node].end = end;
+ 	}
+ 	if ((nodes_add[node].end >> PAGE_SHIFT) > end_pfn)
+ 		end_pfn = nodes_add[node].end >> PAGE_SHIFT;
+
+ 	printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n",
+ 			nodes_add[node].start, nodes_add[node].end);
+	return 0;
+}
+#endif
+
 /* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
 void __init
 acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
@@ -172,6 +242,8 @@ acpi_numa_memory_affinity_init(struct ac
 	}
 	if (ma->flags.enabled == 0)
 		return;
+ 	if (ma->flags.hot_pluggable && ignore_hotadd)
+		return;
 	start = ma->base_addr_lo | ((u64)ma->base_addr_hi << 32);
 	end = start + (ma->length_lo | ((u64)ma->length_hi << 32));
 	pxm = ma->proximity_domain;
@@ -181,10 +253,6 @@ acpi_numa_memory_affinity_init(struct ac
 		bad_srat();
 		return;
 	}
-	/* It is fine to add this area to the nodes data it will be used later*/
-	if (ma->flags.hot_pluggable == 1)
-		printk(KERN_INFO "SRAT: hot plug zone found %lx - %lx \n",
-				start, end);
 	i = conflicting_nodes(start, end);
 	if (i == node) {
 		printk(KERN_WARNING
@@ -208,8 +276,16 @@ acpi_numa_memory_affinity_init(struct ac
 		if (nd->end < end)
 			nd->end = end;
 	}
+
 	printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm,
 	       nd->start, nd->end);
+
+#ifdef RESERVE_HOTADD
+ 	if (ma->flags.hot_pluggable && reserve_hotadd(node, start, end) < 0) {
+		bad_srat();
+		return;
+	}
+#endif
 }
 
 /* Sanity check to catch more bad SRATs (they are amazingly common).
@@ -225,6 +301,11 @@ static int nodes_cover_memory(void)
 		unsigned long e = nodes[i].end >> PAGE_SHIFT;
 		pxmram += e - s;
 		pxmram -= e820_hole_size(s, e);
+#ifdef RESERVE_HOTADD
+		pxmram -= nodes_add[i].end - nodes_add[i].start;
+#endif
+		if ((long)pxmram < 0)
+			pxmram = 0;
 	}
 
 	e820ram = end_pfn - e820_hole_size(0, end_pfn);
@@ -258,7 +339,7 @@ int __init acpi_scan_nodes(unsigned long
 
 	/* First clean up the node list */
 	for (i = 0; i < MAX_NUMNODES; i++) {
-		cutoff_node(i, start, end);
+ 		cutoff_node(i, start, end);
 		if ((nodes[i].end - nodes[i].start) < NODE_MIN_SIZE)
 			unparse_node(i);
 	}
@@ -303,6 +384,24 @@ static int node_to_pxm(int n)
        return 0;
 }
 
+void __init srat_reserve_add_area(int nodeid)
+{
+#ifdef RESERVE_HOTADD
+	if (found_add_area && nodes_add[nodeid].end) {
+		printk(KERN_INFO 
+	"SRAT: Reserving hot-add memory space for node %d at %Lx-%Lx\n", 
+			nodeid, nodes_add[nodeid].start, nodes_add[nodeid].end);
+		printk(KERN_INFO 
+	"SRAT: This will cost you %Lu MB of pre-allocated memory.\n", 
+		(((nodes_add[nodeid].end - 
+			nodes_add[nodeid].start)/PAGE_SIZE)*sizeof(struct page)) >> 20);
+
+		reserve_bootmem_node(NODE_DATA(nodeid), nodes_add[nodeid].start,
+			       nodes_add[nodeid].end - nodes_add[nodeid].start);
+	}
+#endif
+}
+
 int __node_distance(int a, int b)
 {
 	int index;
Index: linux/include/asm-x86_64/numa.h
===================================================================
--- linux.orig/include/asm-x86_64/numa.h
+++ linux/include/asm-x86_64/numa.h
@@ -18,6 +18,8 @@ extern void numa_init_array(void);
 extern int numa_off;
 
 extern void numa_set_node(int cpu, int node);
+extern void srat_reserve_add_area(int nodeid);
+extern int ignore_hotadd;
 
 extern unsigned char apicid_to_node[256];
 #ifdef CONFIG_NUMA
Index: linux/Documentation/x86_64/boot-options.txt
===================================================================
--- linux.orig/Documentation/x86_64/boot-options.txt
+++ linux/Documentation/x86_64/boot-options.txt
@@ -151,6 +151,9 @@ NUMA
 
   numa=fake=X   Fake X nodes and ignore NUMA setup of the actual machine.
 
+  numa=ignorehotadd Ignore hot add memory in SRAT. This will disable memory
+                hotplug      [SUSE extension for now]
+
 ACPI
 
   acpi=off	Don't enable ACPI
Index: linux/arch/x86_64/mm/init.c
===================================================================
--- linux.orig/arch/x86_64/mm/init.c
+++ linux/arch/x86_64/mm/init.c
@@ -529,8 +529,7 @@ int __add_pages(struct zone *z, unsigned
 	int err = -EIO;
 	unsigned long pfn;
 	for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
-		unsigned long addr = pfn << PAGE_SHIFT;
-		if (pfn_valid(pfn) && e820_mapped(addr, addr+1, E820_RAM)) {
+		if (pfn_valid(pfn)) {
 			online_page(pfn_to_page(pfn));
 			err = 0;
 		}
-
: send the line "unsubscribe linux-x86_64" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[Index of Archives]     [Linux ia64]     [Linux Kernel]     [DCCP]     [Linux ARM]     [Yosemite News]     [Linux SCSI]     [Linux Hams]
  Powered by Linux