Re: [PATCH -V2 6/9] hugetlbfs: Add memory controller support for private mapping

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]


On Thu, Mar 01, 2012 at 02:46:17PM +0530, Aneesh Kumar K.V wrote:
> From: "Aneesh Kumar K.V" <aneesh.kumar@xxxxxxxxxxxxxxxxxx>
> 
> For private mapping we always charge/uncharge from the current task memcg.
> Charging happens during mmap(2) and uncharge happens during the
> vm_operations->close. For a child task, after fork, charging happens
> during fault time in alloc_huge_page. We also need to make sure that for
> private mappings each hugeTLB vma has a struct resv_map allocated so that
> we can store the charge list in resv_map.
> 
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@xxxxxxxxxxxxxxxxxx>
> ---
>  mm/hugetlb.c |  176 +++++++++++++++++++++++++++++++++------------------------
>  1 files changed, 102 insertions(+), 74 deletions(-)
> 
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 664c663..2d99d0a 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -126,6 +126,22 @@ long hugetlb_truncate_cgroup(struct hstate *h,
>  #endif
>  }
> 
> +long hugetlb_truncate_cgroup_range(struct hstate *h,
> +				   struct list_head *head, long from, long end)
> +{
> +#ifdef CONFIG_MEM_RES_CTLR_NORECLAIM
> +	long chg;
> +	from = from << huge_page_order(h);
> +	end  = end << huge_page_order(h);
> +	chg  = mem_cgroup_truncate_chglist_range(head, from, end, h - hstates);
> +	if (chg > 0)
> +		return chg >> huge_page_order(h);
> +	return chg;
> +#else
> +	return region_truncate_range(head, from, end);
> +#endif
> +}
> +
>  /*
>   * Convert the address within this vma to the page offset within
>   * the mapping, in pagecache page units; huge pages here.
> @@ -229,13 +245,19 @@ static struct resv_map *resv_map_alloc(void)
>  	return resv_map;
>  }
> 
> -static void resv_map_release(struct kref *ref)
> +static unsigned long resv_map_release(struct hstate *h,
> +				      struct resv_map *resv_map)
>  {
> -	struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
> -
> -	/* Clear out any active regions before we release the map. */
> -	region_truncate(&resv_map->regions, 0);
> +	unsigned long reserve;
> +	/*
> +	 * We should not have any regions left here if we were able to
> +	 * do memory allocation when in truncate_cgroup_range.
> +	 *
> +	 * Clear out any active regions before we release the map
> +	 */
> +	reserve = hugetlb_truncate_cgroup(h, &resv_map->regions, 0);
>  	kfree(resv_map);
> +	return reserve;
>  }
> 
>  static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
> @@ -447,9 +469,7 @@ static void free_huge_page(struct page *page)
>  	 */
>  	struct hstate *h = page_hstate(page);
>  	int nid = page_to_nid(page);
> -	struct address_space *mapping;
> 
> -	mapping = (struct address_space *) page_private(page);
>  	set_page_private(page, 0);
>  	page->mapping = NULL;
>  	BUG_ON(page_count(page));
> @@ -465,8 +485,6 @@ static void free_huge_page(struct page *page)
>  		enqueue_huge_page(h, page);
>  	}
>  	spin_unlock(&hugetlb_lock);
> -	if (mapping)
> -		hugetlb_put_quota(mapping, 1);
>  }
> 
>  static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
> @@ -887,63 +905,74 @@ static void return_unused_surplus_pages(struct hstate *h,
>   * No action is required on failure.
>   */
>  static long vma_needs_reservation(struct hstate *h,
> -			struct vm_area_struct *vma, unsigned long addr)
> +				  struct vm_area_struct *vma,
> +				  unsigned long addr)
>  {
> +	pgoff_t idx = vma_hugecache_offset(h, vma, addr);
>  	struct address_space *mapping = vma->vm_file->f_mapping;
>  	struct inode *inode = mapping->host;
> 
> +
>  	if (vma->vm_flags & VM_MAYSHARE) {
> -		pgoff_t idx = vma_hugecache_offset(h, vma, addr);
>  		return hugetlb_page_charge(&inode->i_mapping->private_list,
>  					   h, idx, idx + 1);
>  	} else if (!is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
> -		return 1;
> -
> -	} else  {
> -		long err;
> -		pgoff_t idx = vma_hugecache_offset(h, vma, addr);
> -		struct resv_map *reservations = vma_resv_map(vma);
> -
> -		err = region_chg(&reservations->regions, idx, idx + 1, 0);
> -		if (err < 0)
> -			return err;
> -		return 0;
> +		struct resv_map *resv_map = vma_resv_map(vma);
> +		if (!resv_map) {
> +			/*
> +			 * We didn't allocate resv_map for this vma.
> +			 * Allocate it here.
> +			 */
> +			resv_map = resv_map_alloc();
> +			if (!resv_map)
> +				return -ENOMEM;
> +			set_vma_resv_map(vma, resv_map);
> +		}
> +		return hugetlb_page_charge(&resv_map->regions,
> +					   h, idx, idx + 1);
>  	}
> +	/*
> +	 * We did the private page charging in mmap call
> +	 */
> +	return 0;
>  }
> 
>  static void vma_uncharge_reservation(struct hstate *h,
>  				     struct vm_area_struct *vma,
>  				     unsigned long chg)
>  {
> +	int idx = h - hstates;
> +	struct list_head *region_list;
>  	struct address_space *mapping = vma->vm_file->f_mapping;
>  	struct inode *inode = mapping->host;
> 
> 
> -	if (vma->vm_flags & VM_MAYSHARE) {
> -		return hugetlb_page_uncharge(&inode->i_mapping->private_list,
> -					     h - hstates,
> -					     chg << huge_page_order(h));
> +	if (vma->vm_flags & VM_MAYSHARE)
> +		region_list = &inode->i_mapping->private_list;
> +	else {
> +		struct resv_map *resv_map = vma_resv_map(vma);
> +		region_list = &resv_map->regions;
>  	}
> +	return hugetlb_page_uncharge(region_list,
> +				     idx, chg << huge_page_order(h));
>  }
> 
>  static void vma_commit_reservation(struct hstate *h,
>  			struct vm_area_struct *vma, unsigned long addr)
>  {
> -
> +	struct list_head *region_list;
> +	pgoff_t idx = vma_hugecache_offset(h, vma, addr);
>  	struct address_space *mapping = vma->vm_file->f_mapping;
>  	struct inode *inode = mapping->host;
> 
>  	if (vma->vm_flags & VM_MAYSHARE) {
> -		pgoff_t idx = vma_hugecache_offset(h, vma, addr);
> -		hugetlb_commit_page_charge(h, &inode->i_mapping->private_list,
> -					   idx, idx + 1);
> -	} else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
> -		pgoff_t idx = vma_hugecache_offset(h, vma, addr);
> +		region_list = &inode->i_mapping->private_list;
> +	} else  {
>  		struct resv_map *reservations = vma_resv_map(vma);
> -
> -		/* Mark this page used in the map. */
> -		region_add(&reservations->regions, idx, idx + 1, 0);
> +		region_list = &reservations->regions;
>  	}
> +	hugetlb_commit_page_charge(h, region_list, idx, idx + 1);
> +	return;
>  }
> 
>  static struct page *alloc_huge_page(struct vm_area_struct *vma,
> @@ -986,10 +1015,9 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
>  			return ERR_PTR(-VM_FAULT_SIGBUS);
>  		}
>  	}
> -
>  	set_page_private(page, (unsigned long) mapping);
> -
> -	vma_commit_reservation(h, vma, addr);
> +	if (chg)
> +		vma_commit_reservation(h, vma, addr);
>  	return page;
>  }
> 
> @@ -2001,25 +2029,40 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma)
>  	 */
>  	if (reservations)
>  		kref_get(&reservations->refs);
> +	else if (!(vma->vm_flags & VM_MAYSHARE)) {
> +		/*
> +		 * for non shared vma we need resv map to track
> +		 * hugetlb cgroup usage. Allocate it here. Charging
> +		 * the cgroup will take place in fault path.
> +		 */
> +		struct resv_map *resv_map = resv_map_alloc();
> +		/*
> +		 * If we fail to allocate resv_map here. We will allocate
> +		 * one when we do alloc_huge_page. So we don't handle
> +		 * ENOMEM here. The function also return void. So there is
> +		 * nothing much we can do.
> +		 */
> +		if (resv_map)
> +			set_vma_resv_map(vma, resv_map);
> +	}
>  }
> 
>  static void hugetlb_vm_op_close(struct vm_area_struct *vma)
>  {
>  	struct hstate *h = hstate_vma(vma);
> -	struct resv_map *reservations = vma_resv_map(vma);
> -	unsigned long reserve;
> -	unsigned long start;
> -	unsigned long end;
> +	struct resv_map *resv_map = vma_resv_map(vma);
> +	unsigned long reserve, start, end;
> 
> -	if (reservations) {
> +	if (resv_map) {
>  		start = vma_hugecache_offset(h, vma, vma->vm_start);
>  		end = vma_hugecache_offset(h, vma, vma->vm_end);
> 
> -		reserve = (end - start) -
> -			region_count(&reservations->regions, start, end);
> -
> -		kref_put(&reservations->refs, resv_map_release);
> -
> +		reserve = hugetlb_truncate_cgroup_range(h, &resv_map->regions,
> +							start, end);

I wonder: what's the reason for removing the "(end - start) -" part of the
"reserve =" statements?  region_count() and hugetlb_truncate_cgroup_range()
both seem to return the number of bytes (I think?) between start and end that
are mapped in that region list.

This change alters the definition of "reserve" from "bytes not mapped" to
"bytes mapped", with no corresponding change to the subsequent uses of
"reserve", which would likely cause hugetlb_acct_memory() to be invoked at
the wrong times, I think.

<shrug> Does that make any sense?

--D
> +		/* open coded kref_put */
> +		if (atomic_sub_and_test(1, &resv_map->refs.refcount)) {
> +			reserve += resv_map_release(h, resv_map);
> +		}
>  		if (reserve) {
>  			hugetlb_acct_memory(h, -reserve);
>  			hugetlb_put_quota(vma->vm_file->f_mapping, reserve);
> @@ -2803,8 +2846,9 @@ int hugetlb_reserve_pages(struct inode *inode,
>  					vm_flags_t vm_flags)
>  {
>  	long ret, chg;
> +	struct list_head *region_list;
>  	struct hstate *h = hstate_inode(inode);
> -
> +	struct resv_map *resv_map = NULL;
>  	/*
>  	 * Only apply hugepage reservation if asked. At fault time, an
>  	 * attempt will be made for VM_NORESERVE to allocate a page
> @@ -2820,19 +2864,17 @@ int hugetlb_reserve_pages(struct inode *inode,
>  	 * called to make the mapping read-write. Assume !vma is a shm mapping
>  	 */
>  	if (!vma || vma->vm_flags & VM_MAYSHARE) {
> -		chg = hugetlb_page_charge(&inode->i_mapping->private_list,
> -					  h, from, to);
> +		region_list = &inode->i_mapping->private_list;
>  	} else {
> -		struct resv_map *resv_map = resv_map_alloc();
> +		resv_map = resv_map_alloc();
>  		if (!resv_map)
>  			return -ENOMEM;
> 
> -		chg = to - from;
> -
>  		set_vma_resv_map(vma, resv_map);
>  		set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
> +		region_list = &resv_map->regions;
>  	}
> -
> +	chg = hugetlb_page_charge(region_list, h, from, to);
>  	if (chg < 0)
>  		return chg;
> 
> @@ -2848,29 +2890,15 @@ int hugetlb_reserve_pages(struct inode *inode,
>  	ret = hugetlb_acct_memory(h, chg);
>  	if (ret < 0)
>  		goto err_acct_mem;
> -	/*
> -	 * Account for the reservations made. Shared mappings record regions
> -	 * that have reservations as they are shared by multiple VMAs.
> -	 * When the last VMA disappears, the region map says how much
> -	 * the reservation was and the page cache tells how much of
> -	 * the reservation was consumed. Private mappings are per-VMA and
> -	 * only the consumed reservations are tracked. When the VMA
> -	 * disappears, the original reservation is the VMA size and the
> -	 * consumed reservations are stored in the map. Hence, nothing
> -	 * else has to be done for private mappings here
> -	 */
> -	if (!vma || vma->vm_flags & VM_MAYSHARE)
> -		hugetlb_commit_page_charge(h, &inode->i_mapping->private_list,
> -					   from, to);
> +
> +	hugetlb_commit_page_charge(h, region_list, from, to);
>  	return 0;
>  err_acct_mem:
>  	hugetlb_put_quota(inode->i_mapping, chg);
>  err_quota:
> -	if (!vma || vma->vm_flags & VM_MAYSHARE)
> -		hugetlb_page_uncharge(&inode->i_mapping->private_list,
> -				      h - hstates, chg << huge_page_order(h));
> +	hugetlb_page_uncharge(region_list, h - hstates,
> +			      chg << huge_page_order(h));
>  	return ret;
> -
>  }
> 
>  void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
> @@ -2884,7 +2912,7 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
>  	inode->i_blocks -= (blocks_per_huge_page(h) * freed);
>  	spin_unlock(&inode->i_lock);
> 
> -	hugetlb_put_quota(inode->i_mapping, (chg - freed));
> +	hugetlb_put_quota(inode->i_mapping, chg);
>  	hugetlb_acct_memory(h, -(chg - freed));
>  }
> 
> -- 
> 1.7.9
> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@xxxxxxxxx.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
> Don't email: <a href=mailto:"dont@xxxxxxxxx";> email@xxxxxxxxx </a>
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[Other Archives]     [Linux Kernel Newbies]     [Linux Driver Development]     [Fedora Kernel]     [Linux Kernel Testers]     [Linux SH]     [Linux Omap]     [Linux Kbuild]     [Linux Tape]     [Linux Input]     [Linux Kernel Janitors]     [Linux Kernel Packagers]     [Linux Doc]     [Linux Man Pages]     [Linux API]     [Linux Memory Management]     [Linux Modules]     [Linux Standards]     [Kernel Announce]     [Netdev]     [Git]     [Linux PCI]     Linux CAN Development     [Linux I2C]     [Linux RDMA]     [Linux NUMA]     [Netfilter]     [Netfilter Devel]     [SELinux]     [Bugtraq]     [FIO]     [Linux Perf Users]     [Linux Serial]     [Linux PPP]     [Linux ISDN]     [Linux Next]     [Kernel Stable Commits]     [Linux Tip Commits]     [Kernel MM Commits]     [Linux Security Module]     [AutoFS]     [Filesystem Development]     [Ext3 Filesystem]     [Linux bcache]     [Ext4 Filesystem]     [Linux BTRFS]     [Linux CEPH Filesystem]     [Linux XFS]     [XFS]     [Linux NFS]     [Linux CIFS]     [Ecryptfs]     [Linux NILFS]     [Linux Cachefs]     [Reiser FS]     [Initramfs]     [Linux FB Devel]     [Linux OpenGL]     [DRI Devel]     [Fastboot]     [Linux RT Users]     [Linux RT Stable]     [eCos]     [Corosync]     [Linux Clusters]     [LVS Devel]     [Hot Plug]     [Linux Virtualization]     [KVM]     [KVM PPC]     [KVM ia64]     [Linux Containers]     [Linux Hexagon]     [Linux Cgroups]     [Util Linux]     [Wireless]     [Linux Bluetooth]     [Bluez Devel]     [Ethernet Bridging]     [Embedded Linux]     [Barebox]     [Linux MMC]     [Linux IIO]     [Sparse]     [Smatch]     [Linux Arch]     [x86 Platform Driver]     [Linux ACPI]     [Linux IBM ACPI]     [LM Sensors]     [CPU Freq]     [Linux Power Management]     [Linmodems]     [Linux DCCP]     [Linux SCTP]     [ALSA Devel]     [Linux USB]     [Linux PA RISC]     [Linux Samsung SOC]     [MIPS Linux]     [IBM S/390 Linux]     [ARM Linux]     [ARM Kernel]     [ARM MSM]     [Tegra Devel]     [Sparc Linux]     
[Linux Security]     [Linux Sound]     [Linux Media]     [Video 4 Linux]     [Linux IRDA Users]     [Linux for the blind]     [Linux RAID]     [Linux ATA RAID]     [Device Mapper]     [Linux SCSI]     [SCSI Target Devel]     [Linux SCSI Target Infrastructure]     [Linux IDE]     [Linux SMP]     [Linux AXP]     [Linux Alpha]     [Linux M68K]     [Linux ia64]     [Linux 8086]     [Linux x86_64]     [Linux Config]     [Linux Apps]     [Linux MSDOS]     [Linux X.25]     [Linux Crypto]     [DM Crypt]     [Linux Trace Users]     [Linux Btrace]     [Linux Watchdog]     [Utrace Devel]     [Linux C Programming]     [Linux Assembly]     [Dash]     [DWARVES]     [Hail Devel]     [Linux Kernel Debugger]     [Linux gcc]     [Gcc Help]     [X.Org]     [Wine]

Add to Google Powered by Linux

[Older Kernel Discussion]     [Yosemite National Park Forum]     [Large Format Photos]     [Gimp]     [Yosemite Photos]     [Stuff]