From: Naoya Horiguchi
On Thu, May 13, 2010 at 11:18:04AM +0200, Andi Kleen wrote:
> ...
> I think you also had a patch for mce-test, can you send me that
> one too?

Yes.
I've attached the stand-alone test program and the run script below.
They don't depend on any other code in mce-test,
so we can just add them to tsrc/.

> BTW I wonder: did you verify that the 1GB page support works?
> I would expect it does, but it would be good to double check.
> One would need a Westmere server or AMD Family10h+ system to test that.

No, I didn't.

Thanks,
Naoya Horiguchi

---
thugetlb.c
---
/*
* Test program for memory error handling for hugepages
* Author: Naoya Horiguchi <n-horiguchi(a)ah.jp.nec.com>
*/
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <signal.h>
#include <unistd.h>
#include <getopt.h>
#include <sys/mman.h>
#include <sys/ipc.h>
#include <sys/shm.h>
#include <sys/sem.h>
#include <sys/types.h>
#include <sys/prctl.h>
#include <sys/wait.h>

#define FILE_BASE "test"

#define HPAGE_SIZE (2UL*1024*1024)
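/*
* Note: hardcoded for 2MB hugepages. Testing 1GB pages (as discussed
* above) would require changing this and the 512 base-pages-per-hugepage
* factor used below.
*/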
#define BUF_SIZE 256
#define PROTECTION (PROT_READ | PROT_WRITE)

#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif

/* Control early_kill/late_kill */
#define PR_MCE_KILL 33
#define PR_MCE_KILL_CLEAR 0
#define PR_MCE_KILL_SET 1
#define PR_MCE_KILL_LATE 0
#define PR_MCE_KILL_EARLY 1
#define PR_MCE_KILL_DEFAULT 2
#define PR_MCE_KILL_GET 34
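/*
* For reference, the current policy can be queried with
* prctl(PR_MCE_KILL_GET, 0, 0, 0, 0), which returns PR_MCE_KILL_LATE,
* _EARLY, or _DEFAULT; this program only sets the policy (see main()).
*/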

int PS; /* Page size */
int file_size; /* Memory allocation size (hugepage unit) */
/* Error injection position (page offset from the first hugepage head) */
int corrupt_page;
char filename[BUF_SIZE] = "/test";
char filepath[BUF_SIZE];

#define DEB printf("DEBUG [%d:%s:%d]\n", getpid(), __FILE__, __LINE__);

/* Fatal-error helper: report the message (with errno) and exit. */
static void err(const char *msg)
{
perror(msg);
exit(EXIT_FAILURE);
}

static void usage(void)
{
printf(
"./thugetlb [-m memory] [-o offset] [-f file] [-xeSAaFpch] hugetlbfs_directory\n"
" -m|--memory size(hugepage unit) Size of hugetlbfs file\n"
" -o|--offset offset(page unit) Position of error injection\n"
" -x|--inject Error injection switch\n"
" -e|--early-kill Set PR_MCE_KILL_EARLY\n"
" -S|--shm Use shmem with SHM_HUGETLB\n"
" -A|--anonymous Use MAP_ANONYMOUS\n"
" -a|--avoid-touch Avoid touching error page\n"
" -F|--fork\n"
" -p|--private\n"
" -c|--cow\n"
" -f|--filename string\n"
" -h|--help\n"
"\n"
);
}

/*
* semaphore get/put wrapper
*/
int get_semaphore(int sem_id, struct sembuf *sembuffer)
{
sembuffer->sem_num = 0;
sembuffer->sem_op = -1;
sembuffer->sem_flg = SEM_UNDO;
return semop(sem_id, sembuffer, 1);
}

int put_semaphore(int sem_id, struct sembuf *sembuffer)
{
sembuffer->sem_num = 0;
sembuffer->sem_op = 1;
sembuffer->sem_flg = SEM_UNDO;
return semop(sem_id, sembuffer, 1);
}

static int avoid_hpage(void *addr, int flag, char *avoid)
{
return flag == 1 && addr == avoid;
}

static void write_bytes(char *addr, int flag, char *avoid)
{
int i, j;
for (i = 0; i < file_size; i++) {
if (avoid_hpage(addr + i * HPAGE_SIZE, flag, avoid))
continue;
for (j = 0; j < HPAGE_SIZE; j++) {
*(addr + i * HPAGE_SIZE + j) = (char)('a' +
((i + j) % 26));
}
}
}

static void read_bytes(char *addr, int flag, char *avoid)
{
int i, j;

for (i = 0; i < file_size; i++) {
if (avoid_hpage(addr + i * HPAGE_SIZE, flag, avoid))
continue;
for (j = 0; j < HPAGE_SIZE; j++) {
if (*(addr + i * HPAGE_SIZE + j) != (char)('a' +
((i + j) % 26))) {
printf("Mismatch at %lu\n", i + j);
break;
}
}
}
}

static struct option opts[] = {
{ "memory" , 1, NULL, 'm' },
{ "offset" , 1, NULL, 'o' },
{ "inject" , 0, NULL, 'x' },
{ "early_kill" , 0, NULL, 'e' },
{ "shm" , 0, NULL, 'S' },
{ "anonymous" , 0, NULL, 'A' },
{ "avoid-touch" , 0, NULL, 'a' },
{ "fork" , 0, NULL, 'F' },
{ "private" , 0, NULL, 'p' },
{ "cow" , 0, NULL, 'c' },
{ "filename" , 1, NULL, 'f' },
{ "help" , 0, NULL, 'h' },
{ NULL , 0, NULL, 0 }
};

int main(int argc, char *argv[])
{
void *addr;
int i;
int ret;
int fd = -1;
int shmid;
int semid;
int semaphore;
int inject = 0;
int early_kill = 0;
int avoid_touch = 0;
int anonflag = 0;
int shmflag = 0;
int shmkey = 0;
int forkflag = 0;
int privateflag = 0;
int cowflag = 0;
int c; /* getopt_long() returns an int */
pid_t pid = 0;
void *expected_addr = NULL;
struct sembuf sembuffer;

PS = getpagesize();
file_size = 1;
corrupt_page = -1;

if (argc == 1) {
usage();
exit(EXIT_FAILURE);
}

while ((c = getopt_long(argc, argv,
"m:o:xeSAaFpcf:h", opts, NULL)) != -1) {
switch (c) {
case 'm':
file_size = strtol(optarg, NULL, 10);
break;
case 'o':
corrupt_page = strtol(optarg, NULL, 10);
break;
case 'x':
inject = 1;
break;
case 'e':
early_kill = 1;
break;
case 'S':
shmflag = 1;
break;
case 'A':
anonflag = 1;
break;
case 'a':
avoid_touch = 1;
break;
case 'F':
forkflag = 1;
break;
case 'p':
privateflag = 1;
break;
case 'c':
cowflag = 1;
break;
case 'f':
strcat(filename, optarg);
shmkey = strtol(optarg, NULL, 10);
break;
case 'h':
usage();
exit(EXIT_SUCCESS);
default:
usage();
exit(EXIT_FAILURE);
}
}

if (inject && corrupt_page * PS >= file_size * HPAGE_SIZE)
err("Target page is out of range.\n");

if (avoid_touch && corrupt_page == -1)
err("Avoid which page?\n");

/* Construct file name */
if (access(argv[argc - 1], F_OK) == -1) {
usage();
exit(EXIT_FAILURE);
} else {
strcpy(filepath, argv[argc - 1]);
strcat(filepath, filename);
}

if (shmflag) {
if ((shmid = shmget(shmkey, file_size * HPAGE_SIZE,
SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W)) < 0)
err("shmget");
addr = shmat(shmid, (void *)0x0UL, 0);
if (addr == (char *)-1) {
perror("Shared memory attach failure");
shmctl(shmid, IPC_RMID, NULL);
exit(2);
}
} else if (anonflag) {
int mapflag = MAP_ANONYMOUS | 0x40000; /* MAP_HUGETLB */
if (privateflag)
mapflag |= MAP_PRIVATE;
else
mapflag |= MAP_SHARED;
if ((addr = mmap(0, file_size * HPAGE_SIZE,
PROTECTION, mapflag, -1, 0)) == MAP_FAILED)
err("mmap");
} else {
int mapflag = MAP_SHARED;
if (privateflag)
mapflag = MAP_PRIVATE;
if ((fd = open(filepath, O_CREAT | O_RDWR, 0777)) < 0)
err("Open failed");
if ((addr = mmap(0, file_size * HPAGE_SIZE,
PROTECTION, mapflag, fd, 0)) == MAP_FAILED) {
unlink(filepath);
err("mmap");
}
}

/* Head address of the hugepage containing the target page
(512 base pages per 2MB hugepage) */
if (corrupt_page != -1)
expected_addr = (void *)(addr + corrupt_page / 512 * HPAGE_SIZE);

if (forkflag) {
semid = semget(IPC_PRIVATE, 1, 0666|IPC_CREAT);
if (semid == -1) {
perror("semget");
goto cleanout;
}
semaphore = semctl(semid, 0, SETVAL, 1);
if (semaphore == -1) {
perror("semctl");
goto cleanout;
}
if (get_semaphore(semid, &sembuffer)) {
perror("get_semaphore");
goto cleanout;
}
}

write_bytes(addr, 0, 0);
read_bytes(addr, 0, 0);

if (early_kill)
prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY,
NULL, NULL);

/*
* Intended order:
* 1. Child COWs
* 2. Parent madvise()s
* 3. Child exit()s
*/
if (forkflag) {
pid = fork();
if (!pid) {
/* Semaphore is already held */
if (cowflag) {
write_bytes(addr, 0, expected_addr);
read_bytes(addr, 0, expected_addr);
}
if (put_semaphore(semid, &sembuffer))
err("put_semaphore");
usleep(1000);
/* Wait for madvise() to be done */
if (get_semaphore(semid, &sembuffer))
err("put_semaphore");
if (put_semaphore(semid, &sembuffer))
err("put_semaphore");
return 0;
}
}

/* Wait for COW */
if (forkflag && get_semaphore(semid, &sembuffer)) {
perror("get_semaphore");
goto cleanout;
}

if (inject && corrupt_page != -1) {
ret = madvise(addr + corrupt_page * PS, PS, 100); /* 100 = MADV_HWPOISON */
if (ret) {
printf("madivise return %d :", ret);
perror("madvise");
goto cleanout;
}
}

if (forkflag && put_semaphore(semid, &sembuffer)) {
perror("put_semaphore");
goto cleanout;
}

write_bytes(addr, avoid_touch, expected_addr);
read_bytes(addr, avoid_touch, expected_addr);

if (forkflag)
if (wait(&i) == -1)
err("wait");
cleanout:
if (shmflag) {
if (shmdt((const void *)addr) != 0) {
err("Detach failure");
shmctl(shmid, IPC_RMID, NULL);
exit(EXIT_FAILURE);
}
shmctl(shmid, IPC_RMID, NULL);
} else {
if (munmap(addr, file_size * HPAGE_SIZE))
err("munmap");
if (fd >= 0 && close(fd))
err("close");
if (!anonflag && unlink(filepath))
err("unlink");
}

return 0;
}
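
For reference, a single testcase can be run by hand, e.g. (assuming
hugetlbfs is mounted on /hugetlbfs; adjust the path for your setup.
MADV_HWPOISON needs CAP_SYS_ADMIN, so run as root):

# inject an error into the head page of a 2-hugepage file mapping,
# with the early-kill policy set
./thugetlb -x -e -m 2 -o 0 /hugetlbfs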

---
run-huge-test.sh
---
#!/bin/bash
#
# Test program for memory error handling for hugepages
# Usage: ./run-huge-test.sh hugetlbfs_directory
# Author: Naoya Horiguchi

usage()
{
echo "Usage: ./run-hgue-test.sh hugetlbfs_directory" && exit 1
}

htdir=$1
[ $# -ne 1 ] && usage
[ ! -d $htdir ] && usage

rm -rf $htdir/test*
echo 1000 > /proc/sys/vm/nr_hugepages

num=0

exec_testcase() {
error=0
echo "TestCase $@"
hpage_size=$1
hpage_target=$2
num=$7

if [ "$3" = "head" ] ; then
hpage_target_offset=0
elif [ "$3" = "tail" ] ; then
hpage_target_offset=1
else
error=1
fi
# 512 base pages per 2MB hugepage; "tail" targets the second base page
hpage_target=$((hpage_target * 512 + hpage_target_offset))

if [ "$4" = "early" ] ; then
process_type="-e"
elif [ "$4" = "late_touch" ] ; then
process_type=""
elif [ "$4" = "late_avoid" ] ; then
process_type="-a"
else
error=1
fi

if [ "$5" = "anonymous" ] ; then
file_type="-A"
elif [ "$5" = "file" ] ; then
file_type="-f $num"
elif [ "$5" = "shm" ] ; then
file_type="-S"
else
error=1
fi

if [ "$6" = "fork_shared" ] ; then
share_type="-F"
elif [ "$6" = "fork_private_nocow" ] ; then
share_type="-Fp"
elif [ "$6" = "fork_private_cow" ] ; then
share_type="-Fpc"
else
error=1
fi

command="./thugetlb -x -m $hpage_size -o $hpage_target $process_type $file_type $share_type $htdir &"
echo $command
eval $command
wait $!
echo ""

return 0
}

num=$((num+1))
exec_testcase 2 1 "head" "early" "file" "fork_shared" $num
num=$((num+1))
exec_testcase 2 1 "head" "early" "file" "fork_private_nocow" $num
num=$((num+1))
exec_testcase 2 1 "head" "early" "file" "fork_private_cow" $num
num=$((num+1))
exec_testcase 2 1 "head" "early" "shm" "fork_shared" $num
num=$((num+1))
exec_testcase 2 1 "head" "early" "anonymous" "fork_shared" $num
num=$((num+1))
exec_testcase 2 1 "head" "early" "anonymous" "fork_private_nocow" $num
num=$((num+1))
exec_testcase 2 1 "head" "early" "anonymous" "fork_private_cow" $num

num=$((num+1))
exec_testcase 2 1 "head" "late_touch" "file" "fork_shared" $num
num=$((num+1))
exec_testcase 2 1 "head" "late_touch" "file" "fork_private_nocow" $num
num=$((num+1))
exec_testcase 2 1 "head" "late_touch" "file" "fork_private_cow" $num
num=$((num+1))
exec_testcase 2 1 "head" "late_touch" "shm" "fork_shared" $num
num=$((num+1))
exec_testcase 2 1 "head" "late_touch" "anonymous" "fork_shared" $num
num=$((num+1))
exec_testcase 2 1 "head" "late_touch" "anonymous" "fork_private_nocow" $num
num=$((num+1))
exec_testcase 2 1 "head" "late_touch" "anonymous" "fork_private_cow" $num

num=$((num+1))
exec_testcase 2 1 "head" "late_avoid" "file" "fork_shared" $num
num=$((num+1))
exec_testcase 2 1 "head" "late_avoid" "file" "fork_private_nocow" $num
num=$((num+1))
exec_testcase 2 1 "head" "late_avoid" "file" "fork_private_cow" $num
num=$((num+1))
exec_testcase 2 1 "head" "late_avoid" "shm" "fork_shared" $num
num=$((num+1))
exec_testcase 2 1 "head" "late_avoid" "anonymous" "fork_shared" $num
num=$((num+1))
exec_testcase 2 1 "head" "late_avoid" "anonymous" "fork_private_nocow" $num
num=$((num+1))
exec_testcase 2 1 "head" "late_avoid" "anonymous" "fork_private_cow" $num

num=$((num+1))
exec_testcase 2 1 "tail" "early" "file" "fork_shared" $num
num=$((num+1))
exec_testcase 2 1 "tail" "early" "file" "fork_private_nocow" $num
num=$((num+1))
exec_testcase 2 1 "tail" "early" "file" "fork_private_cow" $num
num=$((num+1))
exec_testcase 2 1 "tail" "early" "shm" "fork_shared" $num
num=$((num+1))
exec_testcase 2 1 "tail" "early" "anonymous" "fork_shared" $num
num=$((num+1))
exec_testcase 2 1 "tail" "early" "anonymous" "fork_private_nocow" $num
num=$((num+1))
exec_testcase 2 1 "tail" "early" "anonymous" "fork_private_cow" $num

num=$((num+1))
exec_testcase 2 1 "tail" "late_touch" "file" "fork_shared" $num
num=$((num+1))
exec_testcase 2 1 "tail" "late_touch" "file" "fork_private_nocow" $num
num=$((num+1))
exec_testcase 2 1 "tail" "late_touch" "file" "fork_private_cow" $num
num=$((num+1))
exec_testcase 2 1 "tail" "late_touch" "shm" "fork_shared" $num
num=$((num+1))
exec_testcase 2 1 "tail" "late_touch" "anonymous" "fork_shared" $num
num=$((num+1))
exec_testcase 2 1 "tail" "late_touch" "anonymous" "fork_private_nocow" $num
num=$((num+1))
exec_testcase 2 1 "tail" "late_touch" "anonymous" "fork_private_cow" $num

num=$((num+1))
exec_testcase 2 1 "tail" "late_avoid" "file" "fork_shared" $num
num=$((num+1))
exec_testcase 2 1 "tail" "late_avoid" "file" "fork_private_nocow" $num
num=$((num+1))
exec_testcase 2 1 "tail" "late_avoid" "file" "fork_private_cow" $num
num=$((num+1))
exec_testcase 2 1 "tail" "late_avoid" "shm" "fork_shared" $num
num=$((num+1))
exec_testcase 2 1 "tail" "late_avoid" "anonymous" "fork_shared" $num
num=$((num+1))
exec_testcase 2 1 "tail" "late_avoid" "anonymous" "fork_private_nocow" $num
num=$((num+1))
exec_testcase 2 1 "tail" "late_avoid" "anonymous" "fork_private_cow" $num

# free IPC semaphores used by thugetlb.c
ipcs -s|grep $USER|cut -f2 -d' '|xargs ipcrm sem
From: Naoya Horiguchi
Hi,

On Fri, May 14, 2010 at 10:54:50AM +0100, Mel Gorman wrote:
> ...
> > Hmm. I failed libhugetlbfs test with a oops in "private mapped" test :(
> >
>
> That's not a disaster - it's what the regression test is for. I haven't
> restarted the review in this case. I'll wait for another version that
> passes those regression tests.

I have fixed the bugs in the "rmap for hugepage" patch in the latest
version; the libhugetlbfs 'make func' test now passes with the same
results as on a vanilla kernel.
The other HWPOISON patches are unchanged.

Thanks,
Naoya Horiguchi
---
From: Naoya Horiguchi <n-horiguchi(a)ah.jp.nec.com>
Date: Mon, 24 May 2010 11:26:33 +0900
Subject: [PATCH] hugetlb, rmap: add reverse mapping for hugepage

This patch adds a reverse mapping feature for hugepages by introducing
a mapcount for shared/privately mapped hugepages and an anon_vma for
privately mapped hugepages.

While hugepages are not currently swappable, reverse mapping is still
useful for the memory error handler.

Without this patch, the memory error handler can neither identify the
processes using a bad hugepage nor unmap the page from them. That is:
- for shared hugepages:
we can collect the processes using a hugepage through the pagecache,
but cannot unmap the hugepage because of the lack of a mapcount.
- for privately mapped hugepages:
we can neither collect the processes nor unmap the hugepage.
This patch solves both problems.

This patch includes the bug fix from commit 23be7468e8, so it reverts
that commit.

ChangeLog since May 13:
- rebased to 2.6.34
- fixed a logic error (in the case where private and shared mappings coexist)
- moved is_vm_hugetlb_page() into include/linux/mm.h so it can be used
from linear_page_index()
- defined and used linear_hugepage_index() instead of compound_order()
- used page_move_anon_rmap() in hugetlb_cow()
- copied the exclusive switch of __set_page_anon_rmap() into the hugepage
counterpart
- reverted commit 23be7468e8 completely

Signed-off-by: Naoya Horiguchi <n-horiguchi(a)ah.jp.nec.com>
Cc: Andi Kleen <andi(a)firstfloor.org>
Cc: Andrew Morton <akpm(a)linux-foundation.org>
Cc: Wu Fengguang <fengguang.wu(a)intel.com>
Cc: Mel Gorman <mel(a)csn.ul.ie>
Cc: Andrea Arcangeli <aarcange(a)redhat.com>
Cc: Larry Woodman <lwoodman(a)redhat.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn(a)hp.com>
---
include/linux/hugetlb.h | 11 +-----
include/linux/mm.h | 9 +++++
include/linux/pagemap.h | 8 ++++-
include/linux/poison.h | 9 -----
mm/hugetlb.c | 85 +++++++++++++++++++++++++++++++++++++++++++++--
mm/rmap.c | 16 +++++++++
6 files changed, 115 insertions(+), 23 deletions(-)

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 78b4bc6..a574d09 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -14,11 +14,6 @@ struct user_struct;

int PageHuge(struct page *page);

-static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
-{
- return vma->vm_flags & VM_HUGETLB;
-}
-
void reset_vma_resv_huge_pages(struct vm_area_struct *vma);
int hugetlb_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *);
int hugetlb_overcommit_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *);
@@ -77,11 +72,6 @@ static inline int PageHuge(struct page *page)
return 0;
}

-static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
-{
- return 0;
-}
-
static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
{
}
@@ -108,6 +98,7 @@ static inline void hugetlb_report_meminfo(struct seq_file *m)
#define is_hugepage_only_range(mm, addr, len) 0
#define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; })
#define hugetlb_fault(mm, vma, addr, flags) ({ BUG(); 0; })
+#define huge_pte_offset(mm, address) 0

#define hugetlb_change_protection(vma, address, end, newprot)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 462acaf..ee2c442 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -366,6 +366,15 @@ static inline void set_compound_order(struct page *page, unsigned long order)
page[1].lru.prev = (void *)order;
}

+static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
+{
+#ifdef CONFIG_HUGETLBFS
+ return vma->vm_flags & VM_HUGETLB;
+#else
+ return 0;
+#endif
+}
+
/*
* Multiple processes may "see" the same page. E.g. for untouched
* mappings of /dev/null, all processes see the same page full of
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 3c62ed4..9e0bd64 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -281,10 +281,16 @@ static inline loff_t page_offset(struct page *page)
return ((loff_t)page->index) << PAGE_CACHE_SHIFT;
}

+extern pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
+ unsigned long address);
+
static inline pgoff_t linear_page_index(struct vm_area_struct *vma,
unsigned long address)
{
- pgoff_t pgoff = (address - vma->vm_start) >> PAGE_SHIFT;
+ pgoff_t pgoff;
+ if (unlikely(is_vm_hugetlb_page(vma)))
+ return linear_hugepage_index(vma, address);
+ pgoff = (address - vma->vm_start) >> PAGE_SHIFT;
pgoff += vma->vm_pgoff;
return pgoff >> (PAGE_CACHE_SHIFT - PAGE_SHIFT);
}
diff --git a/include/linux/poison.h b/include/linux/poison.h
index 34066ff..2110a81 100644
--- a/include/linux/poison.h
+++ b/include/linux/poison.h
@@ -48,15 +48,6 @@
#define POISON_FREE 0x6b /* for use-after-free poisoning */
#define POISON_END 0xa5 /* end-byte of poisoning */

-/********** mm/hugetlb.c **********/
-/*
- * Private mappings of hugetlb pages use this poisoned value for
- * page->mapping. The core VM should not be doing anything with this mapping
- * but futex requires the existence of some page->mapping value even though it
- * is unused if PAGE_MAPPING_ANON is set.
- */
-#define HUGETLB_POISON ((void *)(0x00300300 + POISON_POINTER_DELTA + PAGE_MAPPING_ANON))
-
/********** arch/$ARCH/mm/init.c **********/
#define POISON_FREE_INITMEM 0xcc

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 4c9e6bb..b8b8ea4 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -18,6 +18,7 @@
#include <linux/bootmem.h>
#include <linux/sysfs.h>
#include <linux/slab.h>
+#include <linux/rmap.h>

#include <asm/page.h>
#include <asm/pgtable.h>
@@ -220,6 +221,12 @@ static pgoff_t vma_hugecache_offset(struct hstate *h,
(vma->vm_pgoff >> huge_page_order(h));
}

+pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
+ unsigned long address)
+{
+ return vma_hugecache_offset(hstate_vma(vma), vma, address);
+}
+
/*
* Return the size of the pages allocated when backing a VMA. In the majority
* cases this will be same size as used by the page table entries.
@@ -546,8 +553,8 @@ static void free_huge_page(struct page *page)

mapping = (struct address_space *) page_private(page);
set_page_private(page, 0);
- page->mapping = NULL;
BUG_ON(page_count(page));
+ BUG_ON(page_mapcount(page));
INIT_LIST_HEAD(&page->lru);

spin_lock(&hugetlb_lock);
@@ -2125,6 +2132,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
entry = huge_ptep_get(src_pte);
ptepage = pte_page(entry);
get_page(ptepage);
+ page_dup_rmap(ptepage);
set_huge_pte_at(dst, addr, dst_pte, entry);
}
spin_unlock(&src->page_table_lock);
@@ -2203,6 +2211,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
flush_tlb_range(vma, start, end);
mmu_notifier_invalidate_range_end(mm, start, end);
list_for_each_entry_safe(page, tmp, &page_list, lru) {
+ page_remove_rmap(page);
list_del(&page->lru);
put_page(page);
}
@@ -2268,6 +2277,50 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
return 1;
}

+/*
+ * The following three functions are counterparts of ones in mm/rmap.c.
+ * Unlike them, these functions don't have accounting code nor lru code,
+ * because we handle hugepages differently from common anonymous pages.
+ */
+static void __hugepage_set_anon_rmap(struct page *page,
+ struct vm_area_struct *vma, unsigned long address, int exclusive)
+{
+ struct anon_vma *anon_vma = vma->anon_vma;
+ BUG_ON(!anon_vma);
+ if (!exclusive) {
+ struct anon_vma_chain *avc;
+ avc = list_entry(vma->anon_vma_chain.prev,
+ struct anon_vma_chain, same_vma);
+ anon_vma = avc->anon_vma;
+ }
+ anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
+ page->mapping = (struct address_space *) anon_vma;
+ page->index = linear_page_index(vma, address);
+}
+
+static void hugepage_add_anon_rmap(struct page *page,
+ struct vm_area_struct *vma, unsigned long address)
+{
+ struct anon_vma *anon_vma = vma->anon_vma;
+ int first;
+ BUG_ON(!anon_vma);
+ BUG_ON(address < vma->vm_start || address >= vma->vm_end);
+ first = atomic_inc_and_test(&page->_mapcount);
+ if (first)
+ __hugepage_set_anon_rmap(page, vma, address, 0);
+}
+
+void hugepage_add_new_anon_rmap(struct page *page,
+ struct vm_area_struct *vma, unsigned long address)
+{
+ BUG_ON(address < vma->vm_start || address >= vma->vm_end);
+ atomic_set(&page->_mapcount, 0);
+ __hugepage_set_anon_rmap(page, vma, address, 1);
+}
+
+/*
+ * Hugetlb_cow() should be called with page lock of the original hugepage held.
+ */
static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *ptep, pte_t pte,
struct page *pagecache_page)
@@ -2282,8 +2335,10 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
retry_avoidcopy:
/* If no-one else is actually using this page, avoid the copy
* and just make the page writable */
- avoidcopy = (page_count(old_page) == 1);
+ avoidcopy = (page_mapcount(old_page) == 1);
if (avoidcopy) {
+ if (PageAnon(old_page))
+ page_move_anon_rmap(old_page, vma, address);
set_huge_ptep_writable(vma, address, ptep);
return 0;
}
@@ -2334,6 +2389,13 @@ retry_avoidcopy:
return -PTR_ERR(new_page);
}

+ /*
+ * When the original hugepage is shared one, it does not have
+ * anon_vma prepared.
+ */
+ if (unlikely(anon_vma_prepare(vma)))
+ return VM_FAULT_OOM;
+
copy_huge_page(new_page, old_page, address, vma);
__SetPageUptodate(new_page);

@@ -2348,6 +2410,8 @@ retry_avoidcopy:
huge_ptep_clear_flush(vma, address, ptep);
set_huge_pte_at(mm, address, ptep,
make_huge_pte(vma, new_page, 1));
+ page_remove_rmap(old_page);
+ hugepage_add_anon_rmap(new_page, vma, address);
/* Make the old page be freed below */
new_page = old_page;
}
@@ -2448,10 +2512,17 @@ retry:
spin_lock(&inode->i_lock);
inode->i_blocks += blocks_per_huge_page(h);
spin_unlock(&inode->i_lock);
+ page_dup_rmap(page);
} else {
lock_page(page);
- page->mapping = HUGETLB_POISON;
+ if (unlikely(anon_vma_prepare(vma))) {
+ ret = VM_FAULT_OOM;
+ goto backout_unlocked;
+ }
+ hugepage_add_new_anon_rmap(page, vma, address);
}
+ } else {
+ page_dup_rmap(page);
}

/*
@@ -2503,6 +2574,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
pte_t *ptep;
pte_t entry;
int ret;
+ struct page *page = NULL;
struct page *pagecache_page = NULL;
static DEFINE_MUTEX(hugetlb_instantiation_mutex);
struct hstate *h = hstate_vma(vma);
@@ -2544,6 +2616,11 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
vma, address);
}

+ if (!pagecache_page) {
+ page = pte_page(entry);
+ lock_page(page);
+ }
+
spin_lock(&mm->page_table_lock);
/* Check for a racing update before calling hugetlb_cow */
if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
@@ -2569,6 +2646,8 @@ out_page_table_lock:
if (pagecache_page) {
unlock_page(pagecache_page);
put_page(pagecache_page);
+ } else {
+ unlock_page(page);
}

out_mutex:
diff --git a/mm/rmap.c b/mm/rmap.c
index 0feeef8..4ec6b1c 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -56,6 +56,7 @@
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>
#include <linux/migrate.h>
+#include <linux/hugetlb.h>

#include <asm/tlbflush.h>

@@ -326,6 +327,8 @@ vma_address(struct page *page, struct vm_area_struct *vma)
pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
unsigned long address;

+ if (unlikely(is_vm_hugetlb_page(vma)))
+ pgoff = page->index << huge_page_order(page_hstate(page));
address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
/* page should be within @vma mapping range */
@@ -369,6 +372,12 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
pte_t *pte;
spinlock_t *ptl;

+ if (unlikely(PageHuge(page))) {
+ pte = huge_pte_offset(mm, address);
+ ptl = &mm->page_table_lock;
+ goto check;
+ }
+
pgd = pgd_offset(mm, address);
if (!pgd_present(*pgd))
return NULL;
@@ -389,6 +398,7 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
}

ptl = pte_lockptr(mm, pmd);
+check:
spin_lock(ptl);
if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) {
*ptlp = ptl;
@@ -873,6 +883,12 @@ void page_remove_rmap(struct page *page)
page_clear_dirty(page);
set_page_dirty(page);
}
+ /*
+ * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED
+ * and not charged by memcg for now.
+ */
+ if (unlikely(PageHuge(page)))
+ return;
if (PageAnon(page)) {
mem_cgroup_uncharge_page(page);
__dec_zone_page_state(page, NR_ANON_PAGES);
--
1.7.0
From: Mel Gorman
On Mon, May 24, 2010 at 04:15:16PM +0900, Naoya Horiguchi wrote:
> Hi,
>
> On Fri, May 14, 2010 at 10:54:50AM +0100, Mel Gorman wrote:
> > ...
> > > Hmm. I failed libhugetlbfs test with a oops in "private mapped" test :(
> > >
> >
> > That's not a disaster - it's what the regression test is for. I haven't
> > restarted the review in this case. I'll wait for another version that
> > passes those regression tests.
>
> I have fixed bugs on "rmap for hugepage" patch in the latest version,
> where 'make func' test passes with the same result as in vanilla kernel.
> Other HWPOISON patches have no change.
>

I'd have preferred to see the whole series but still...

> <SNIP>
>
> Signed-off-by: Naoya Horiguchi <n-horiguchi(a)ah.jp.nec.com>
> Cc: Andi Kleen <andi(a)firstfloor.org>
> Cc: Andrew Morton <akpm(a)linux-foundation.org>
> Cc: Wu Fengguang <fengguang.wu(a)intel.com>
> Cc: Mel Gorman <mel(a)csn.ul.ie>
> Cc: Andrea Arcangeli <aarcange(a)redhat.com>
> Cc: Larry Woodman <lwoodman(a)redhat.com>
> Cc: Lee Schermerhorn <Lee.Schermerhorn(a)hp.com>
> ---
> include/linux/hugetlb.h | 11 +-----
> include/linux/mm.h | 9 +++++
> include/linux/pagemap.h | 8 ++++-
> include/linux/poison.h | 9 -----
> mm/hugetlb.c | 85 +++++++++++++++++++++++++++++++++++++++++++++--
> mm/rmap.c | 16 +++++++++
> 6 files changed, 115 insertions(+), 23 deletions(-)
>
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index 78b4bc6..a574d09 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -14,11 +14,6 @@ struct user_struct;
>
> int PageHuge(struct page *page);
>
> -static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
> -{
> - return vma->vm_flags & VM_HUGETLB;
> -}
> -
> void reset_vma_resv_huge_pages(struct vm_area_struct *vma);
> int hugetlb_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *);
> int hugetlb_overcommit_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *);
> @@ -77,11 +72,6 @@ static inline int PageHuge(struct page *page)
> return 0;
> }
>
> -static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
> -{
> - return 0;
> -}
> -

You collapse two functions into one here and move them to another
header. Is there a reason why pagemap.h could not include hugetlb.h?
It adds another header dependency, which is bad, but moving hugetlb stuff
into mm.h seems bad too.

> static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
> {
> }
> @@ -108,6 +98,7 @@ static inline void hugetlb_report_meminfo(struct seq_file *m)
> #define is_hugepage_only_range(mm, addr, len) 0
> #define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; })
> #define hugetlb_fault(mm, vma, addr, flags) ({ BUG(); 0; })
> +#define huge_pte_offset(mm, address) 0
>
> #define hugetlb_change_protection(vma, address, end, newprot)
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 462acaf..ee2c442 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -366,6 +366,15 @@ static inline void set_compound_order(struct page *page, unsigned long order)
> page[1].lru.prev = (void *)order;
> }
>
> +static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
> +{
> +#ifdef CONFIG_HUGETLBFS
> + return vma->vm_flags & VM_HUGETLB;
> +#else
> + return 0;
> +#endif
> +}
> +
> /*
> * Multiple processes may "see" the same page. E.g. for untouched
> * mappings of /dev/null, all processes see the same page full of
> diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
> index 3c62ed4..9e0bd64 100644
> --- a/include/linux/pagemap.h
> +++ b/include/linux/pagemap.h
> @@ -281,10 +281,16 @@ static inline loff_t page_offset(struct page *page)
> return ((loff_t)page->index) << PAGE_CACHE_SHIFT;
> }
>
> +extern pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
> + unsigned long address);
> +
> static inline pgoff_t linear_page_index(struct vm_area_struct *vma,
> unsigned long address)
> {
> - pgoff_t pgoff = (address - vma->vm_start) >> PAGE_SHIFT;
> + pgoff_t pgoff;
> + if (unlikely(is_vm_hugetlb_page(vma)))
> + return linear_hugepage_index(vma, address);
> + pgoff = (address - vma->vm_start) >> PAGE_SHIFT;
> pgoff += vma->vm_pgoff;
> return pgoff >> (PAGE_CACHE_SHIFT - PAGE_SHIFT);
> }
> diff --git a/include/linux/poison.h b/include/linux/poison.h
> index 34066ff..2110a81 100644
> --- a/include/linux/poison.h
> +++ b/include/linux/poison.h
> @@ -48,15 +48,6 @@
> #define POISON_FREE 0x6b /* for use-after-free poisoning */
> #define POISON_END 0xa5 /* end-byte of poisoning */
>
> -/********** mm/hugetlb.c **********/
> -/*
> - * Private mappings of hugetlb pages use this poisoned value for
> - * page->mapping. The core VM should not be doing anything with this mapping
> - * but futex requires the existence of some page->mapping value even though it
> - * is unused if PAGE_MAPPING_ANON is set.
> - */
> -#define HUGETLB_POISON ((void *)(0x00300300 + POISON_POINTER_DELTA + PAGE_MAPPING_ANON))
> -
> /********** arch/$ARCH/mm/init.c **********/
> #define POISON_FREE_INITMEM 0xcc
>
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 4c9e6bb..b8b8ea4 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -18,6 +18,7 @@
> #include <linux/bootmem.h>
> #include <linux/sysfs.h>
> #include <linux/slab.h>
> +#include <linux/rmap.h>
>
> #include <asm/page.h>
> #include <asm/pgtable.h>
> @@ -220,6 +221,12 @@ static pgoff_t vma_hugecache_offset(struct hstate *h,
> (vma->vm_pgoff >> huge_page_order(h));
> }
>
> +pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
> + unsigned long address)
> +{
> + return vma_hugecache_offset(hstate_vma(vma), vma, address);
> +}
> +
> /*
> * Return the size of the pages allocated when backing a VMA. In the majority
> * cases this will be same size as used by the page table entries.
> @@ -546,8 +553,8 @@ static void free_huge_page(struct page *page)
>
> mapping = (struct address_space *) page_private(page);
> set_page_private(page, 0);
> - page->mapping = NULL;
> BUG_ON(page_count(page));
> + BUG_ON(page_mapcount(page));
> INIT_LIST_HEAD(&page->lru);
>
> spin_lock(&hugetlb_lock);
> @@ -2125,6 +2132,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
> entry = huge_ptep_get(src_pte);
> ptepage = pte_page(entry);
> get_page(ptepage);
> + page_dup_rmap(ptepage);
> set_huge_pte_at(dst, addr, dst_pte, entry);
> }
> spin_unlock(&src->page_table_lock);
> @@ -2203,6 +2211,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
> flush_tlb_range(vma, start, end);
> mmu_notifier_invalidate_range_end(mm, start, end);
> list_for_each_entry_safe(page, tmp, &page_list, lru) {
> + page_remove_rmap(page);
> list_del(&page->lru);
> put_page(page);
> }
> @@ -2268,6 +2277,50 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
> return 1;
> }
>
> +/*
> + * The following three functions are counterparts of ones in mm/rmap.c.
> + * Unlike them, these functions don't have accounting code nor lru code,
> + * because we handle hugepages differently from common anonymous pages.
> + */
> +static void __hugepage_set_anon_rmap(struct page *page,
> + struct vm_area_struct *vma, unsigned long address, int exclusive)
> +{
> + struct anon_vma *anon_vma = vma->anon_vma;
> + BUG_ON(!anon_vma);
> + if (!exclusive) {
> + struct anon_vma_chain *avc;
> + avc = list_entry(vma->anon_vma_chain.prev,
> + struct anon_vma_chain, same_vma);
> + anon_vma = avc->anon_vma;
> + }
> + anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
> + page->mapping = (struct address_space *) anon_vma;
> + page->index = linear_page_index(vma, address);
> +}
> +
> +static void hugepage_add_anon_rmap(struct page *page,
> + struct vm_area_struct *vma, unsigned long address)
> +{
> + struct anon_vma *anon_vma = vma->anon_vma;
> + int first;
> + BUG_ON(!anon_vma);
> + BUG_ON(address < vma->vm_start || address >= vma->vm_end);
> + first = atomic_inc_and_test(&page->_mapcount);
> + if (first)
> + __hugepage_set_anon_rmap(page, vma, address, 0);
> +}
> +
> +void hugepage_add_new_anon_rmap(struct page *page,
> + struct vm_area_struct *vma, unsigned long address)
> +{
> + BUG_ON(address < vma->vm_start || address >= vma->vm_end);
> + atomic_set(&page->_mapcount, 0);
> + __hugepage_set_anon_rmap(page, vma, address, 1);
> +}
> +

Is it possible to move these to mm/rmap.c so all the anon rmap adding
code is in the same place? In the event that __page_set_anon_rmap() is
updated, there would be a greater chance the hugepage equivalent will be
noticed and updated.
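
For example, just moving the definitions and adding declarations along
these lines to include/linux/rmap.h would do (a sketch, not tested):

void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long);
void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long);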

I didn't spot anything particularly bad after this. If these minor issues
could be addressed and the full series reposted, I'll test the hugetlb side
of things further just to be sure.

> +/*
> + * Hugetlb_cow() should be called with page lock of the original hugepage held.
> + */
> static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
> unsigned long address, pte_t *ptep, pte_t pte,
> struct page *pagecache_page)
> @@ -2282,8 +2335,10 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
> retry_avoidcopy:
> /* If no-one else is actually using this page, avoid the copy
> * and just make the page writable */
> - avoidcopy = (page_count(old_page) == 1);
> + avoidcopy = (page_mapcount(old_page) == 1);
> if (avoidcopy) {
> + if (PageAnon(old_page))
> + page_move_anon_rmap(old_page, vma, address);
> set_huge_ptep_writable(vma, address, ptep);
> return 0;
> }
> @@ -2334,6 +2389,13 @@ retry_avoidcopy:
> return -PTR_ERR(new_page);
> }
>
> + /*
> + * When the original hugepage is shared one, it does not have
> + * anon_vma prepared.
> + */
> + if (unlikely(anon_vma_prepare(vma)))
> + return VM_FAULT_OOM;
> +
> copy_huge_page(new_page, old_page, address, vma);
> __SetPageUptodate(new_page);
>
> @@ -2348,6 +2410,8 @@ retry_avoidcopy:
> huge_ptep_clear_flush(vma, address, ptep);
> set_huge_pte_at(mm, address, ptep,
> make_huge_pte(vma, new_page, 1));
> + page_remove_rmap(old_page);
> + hugepage_add_anon_rmap(new_page, vma, address);
> /* Make the old page be freed below */
> new_page = old_page;
> }
> @@ -2448,10 +2512,17 @@ retry:
> spin_lock(&inode->i_lock);
> inode->i_blocks += blocks_per_huge_page(h);
> spin_unlock(&inode->i_lock);
> + page_dup_rmap(page);
> } else {
> lock_page(page);
> - page->mapping = HUGETLB_POISON;
> + if (unlikely(anon_vma_prepare(vma))) {
> + ret = VM_FAULT_OOM;
> + goto backout_unlocked;
> + }
> + hugepage_add_new_anon_rmap(page, vma, address);
> }
> + } else {
> + page_dup_rmap(page);
> }
>
> /*
> @@ -2503,6 +2574,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
> pte_t *ptep;
> pte_t entry;
> int ret;
> + struct page *page = NULL;
> struct page *pagecache_page = NULL;
> static DEFINE_MUTEX(hugetlb_instantiation_mutex);
> struct hstate *h = hstate_vma(vma);
> @@ -2544,6 +2616,11 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
> vma, address);
> }
>
> + if (!pagecache_page) {
> + page = pte_page(entry);
> + lock_page(page);
> + }
> +
> spin_lock(&mm->page_table_lock);
> /* Check for a racing update before calling hugetlb_cow */
> if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
> @@ -2569,6 +2646,8 @@ out_page_table_lock:
> if (pagecache_page) {
> unlock_page(pagecache_page);
> put_page(pagecache_page);
> + } else {
> + unlock_page(page);
> }
>
> out_mutex:
> diff --git a/mm/rmap.c b/mm/rmap.c
> index 0feeef8..4ec6b1c 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -56,6 +56,7 @@
> #include <linux/memcontrol.h>
> #include <linux/mmu_notifier.h>
> #include <linux/migrate.h>
> +#include <linux/hugetlb.h>
>
> #include <asm/tlbflush.h>
>
> @@ -326,6 +327,8 @@ vma_address(struct page *page, struct vm_area_struct *vma)
> pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
> unsigned long address;
>
> + if (unlikely(is_vm_hugetlb_page(vma)))
> + pgoff = page->index << huge_page_order(page_hstate(page));
> address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
> if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
> /* page should be within @vma mapping range */
> @@ -369,6 +372,12 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
> pte_t *pte;
> spinlock_t *ptl;
>
> + if (unlikely(PageHuge(page))) {
> + pte = huge_pte_offset(mm, address);
> + ptl = &mm->page_table_lock;
> + goto check;
> + }
> +
> pgd = pgd_offset(mm, address);
> if (!pgd_present(*pgd))
> return NULL;
> @@ -389,6 +398,7 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
> }
>
> ptl = pte_lockptr(mm, pmd);
> +check:
> spin_lock(ptl);
> if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) {
> *ptlp = ptl;
> @@ -873,6 +883,12 @@ void page_remove_rmap(struct page *page)
> page_clear_dirty(page);
> set_page_dirty(page);
> }
> + /*
> + * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED
> + * and not charged by memcg for now.
> + */
> + if (unlikely(PageHuge(page)))
> + return;
> if (PageAnon(page)) {
> mem_cgroup_uncharge_page(page);
> __dec_zone_page_state(page, NR_ANON_PAGES);
> --
> 1.7.0
>

--
Mel Gorman
Part-time Phd Student Linux Technology Center
University of Limerick IBM Dublin Software Lab
From: Naoya Horiguchi
Hi, Mel.

Thank you for the review.

On Tue, May 25, 2010 at 11:59:57AM +0100, Mel Gorman wrote:
> ...
> I'd have preferred to see the whole series but still...

OK.

> > diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> > index 78b4bc6..a574d09 100644
> > --- a/include/linux/hugetlb.h
> > +++ b/include/linux/hugetlb.h
> > @@ -14,11 +14,6 @@ struct user_struct;
> >
> > int PageHuge(struct page *page);
> >
> > -static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
> > -{
> > - return vma->vm_flags & VM_HUGETLB;
> > -}
> > -
> > void reset_vma_resv_huge_pages(struct vm_area_struct *vma);
> > int hugetlb_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *);
> > int hugetlb_overcommit_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *);
> > @@ -77,11 +72,6 @@ static inline int PageHuge(struct page *page)
> > return 0;
> > }
> >
> > -static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
> > -{
> > - return 0;
> > -}
> > -
>
> You collapse two functions into one here and move them to another
> header. Is there a reason why pagemap.h could not include hugetlb.h?

Yes, hugetlb.h includes pagemap.h through mempolicy.h.
I didn't make pagemap.h depend on hugetlb.h because that would create a
cyclic dependency: pagemap.h -> hugetlb.h -> mempolicy.h -> pagemap.h.

> It adds another header dependency which is bad but moving hugetlb stuff
> into mm.h seems bad too.

Another option is to move the definition of is_vm_hugetlb_page() into
mm/hugetlb.c and add a declaration of the function to pagemap.h,
but that needed some rather ugly #ifdefs and I didn't like it.
If putting hugetlb code in mm.h is worse, I'll take the second option
in the next post.

> > @@ -2268,6 +2277,50 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
> > return 1;
> > }
> >
> > +/*
> > + * The following three functions are counterparts of ones in mm/rmap.c.
> > + * Unlike them, these functions don't have accounting code nor lru code,
> > + * because we handle hugepages differently from common anonymous pages.
> > + */
> > +static void __hugepage_set_anon_rmap(struct page *page,
> > + struct vm_area_struct *vma, unsigned long address, int exclusive)
> > +{
> > + struct anon_vma *anon_vma = vma->anon_vma;
> > + BUG_ON(!anon_vma);
> > + if (!exclusive) {
> > + struct anon_vma_chain *avc;
> > + avc = list_entry(vma->anon_vma_chain.prev,
> > + struct anon_vma_chain, same_vma);
> > + anon_vma = avc->anon_vma;
> > + }
> > + anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
> > + page->mapping = (struct address_space *) anon_vma;
> > + page->index = linear_page_index(vma, address);
> > +}
> > +
> > +static void hugepage_add_anon_rmap(struct page *page,
> > + struct vm_area_struct *vma, unsigned long address)
> > +{
> > + struct anon_vma *anon_vma = vma->anon_vma;
> > + int first;
> > + BUG_ON(!anon_vma);
> > + BUG_ON(address < vma->vm_start || address >= vma->vm_end);
> > + first = atomic_inc_and_test(&page->_mapcount);
> > + if (first)
> > + __hugepage_set_anon_rmap(page, vma, address, 0);
> > +}
> > +
> > +void hugepage_add_new_anon_rmap(struct page *page,
> > + struct vm_area_struct *vma, unsigned long address)
> > +{
> > + BUG_ON(address < vma->vm_start || address >= vma->vm_end);
> > + atomic_set(&page->_mapcount, 0);
> > + __hugepage_set_anon_rmap(page, vma, address, 1);
> > +}
> > +
>
> Is it possible to move these to mm/rmap.c so all the anon rmap adding
> code is in the same place? In the event that __page_set_anon_rmap() is
> updated, there would be a greater chance the hugepage equivalent will be
> noticed and updated.

Sounds reasonable, I'll do it.

> I didn't spot anything particularly bad after this. If these minor issues
> could be addressed and the full series reposted, I'll test the hugetlb side
> of things further just to be sure.

OK, thank you :)

Thanks,
Naoya Horiguchi
From: Mel Gorman
On Wed, May 26, 2010 at 03:51:56PM +0900, Naoya Horiguchi wrote:
> Hi, Mel.
>
> Thank you for the review.
>
> On Tue, May 25, 2010 at 11:59:57AM +0100, Mel Gorman wrote:
> > ...
> > I'd have preferred to see the whole series but still...
>
> OK.
>
> > > diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> > > index 78b4bc6..a574d09 100644
> > > --- a/include/linux/hugetlb.h
> > > +++ b/include/linux/hugetlb.h
> > > @@ -14,11 +14,6 @@ struct user_struct;
> > >
> > > int PageHuge(struct page *page);
> > >
> > > -static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
> > > -{
> > > - return vma->vm_flags & VM_HUGETLB;
> > > -}
> > > -
> > > void reset_vma_resv_huge_pages(struct vm_area_struct *vma);
> > > int hugetlb_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *);
> > > int hugetlb_overcommit_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *);
> > > @@ -77,11 +72,6 @@ static inline int PageHuge(struct page *page)
> > > return 0;
> > > }
> > >
> > > -static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
> > > -{
> > > - return 0;
> > > -}
> > > -
> >
> > You collapse two functions into one here and move them to another
> > header. Is there a reason why pagemap.h could not include hugetlb.h?
>
> Yes, hugetlb.h includes pagemap.h through mempolicy.h.
> I didn't make pagemap.h depend on hugetlb.h because it makes cyclic dependency
> among pagemap.h, mempolicy.h and hugetlb.h.
>

Ok, that's a good reason.

> > It adds another header dependency which is bad but moving hugetlb stuff
> > into mm.h seems bad too.
>
> I have another choice to move the definition of is_vm_hugetlb_page() into
> mm/hugetlb.c and introduce declaration of this function to pagemap.h,
> but this needed a bit ugly #ifdefs and I didn't like it.
> If putting hugetlb code in mm.h is worse, I'll take the second choice
> in the next post.
>

That would add an additional function-call overhead to page table teardown,
which would be very unfortunate. I am still not very keen on moving hugetlb
code into mm.h, though.

How about moving the definition of shared_policy under a CONFIG_NUMA
block in mm_types.h and removing the dependency between hugetlb.h and
mempolicy.h?
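
Roughly (an untested sketch; the NUMA struct body is just the current
definition from mempolicy.h):

/* include/linux/mm_types.h */
#ifdef CONFIG_NUMA
struct shared_policy {
struct rb_root root;
spinlock_t lock;
};
#else
struct shared_policy {
};
#endif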

Does anyone else see a problem with this from a "clean" perspective?

--
Mel Gorman
Part-time Phd Student Linux Technology Center
University of Limerick IBM Dublin Software Lab