Inotify scalability issue

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]


Hi,

We're running Dovecot mailservers and are experiencing problems similar to what is described here:
http://old.nabble.com/Very-High-Load-on-Dovecot-2-and-Errors-in-mail.err.-tt33856207.html#a33856207

I've written two small programs to expose the problem.

watcher.c:
This program reads a filename from the commandline, creates a new inotify handle and sets it up to watch IN_CLOSE_WRITE and IN_DELETE on the file. It then writes a 'z' to stdout, and does a blocking read from inotify. After receiving an event from inotify the program prints an 'x' to stdout, closes the inotify handle and then prints a '.' to stdout before exiting.

test.c:
This program creates 20 files and spawns 20 watchers to watch each of them. For each watcher it waits between 1 and 2 seconds before touching the file they watch (which should cause it to wake up and exit), and then spawns a new watcher on the file, again waiting between 1 and 2 seconds before touching the file again etc.

On my dualcore workstation running the test program behaves as you'd expect. That is it prints
zzzzzzzzzzzzzzzzzzzzx.zx.zx.zx.zx.zx.zx.zx.zx.zx.zx.zx.zx (etc.)

However on a 16-core server it behaves very differently:
zzzzzzzzzzzzzzzzzzzzxzxzxzxz.xzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxz......................................................................................................xzxzxzxzxz.xxzzxzxzxzxzxzxzxzxzxzxzxz.................xzxz.xz

(sorry about the long line)
That is watchers are spawned to watch their files, they're woken up by inotify as they should be, but then they pile up in D-state waiting for the close call to finish. Only at irregular intervals do they all return.

They seem to be sleeping on the syncronize_srcu() call in fsnotify_destroy_group() of fs/notify/group.c.

We've tested this on various machines running kernels from 3.0 and up, and the trend very clear: The more processors the worse it gets. However, I also tried it on one 48-core server running an old 2.6.32 debian kernels, and here the processes don't pile up.

/Emil
#include <stdlib.h>
#include <errno.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/inotify.h>
#include <limits.h>

#define log(...) fprintf(stderr, __VA_ARGS__)

static void
put(int c)
{
	putchar(c);
	fflush(stdout);
}

int
main(int argc, char *argv[])
{
	int ifd = -1;
	int wd;
	char buf[sizeof(struct inotify_event) + NAME_MAX + 1];

	if (argc != 2) {
		log("I need a file..\n");
		goto error;
	}

	ifd = inotify_init();
	if (ifd < 0) {
		if (errno == EMFILE)
			put('!');
		else
			log("Error initializing inotify: %s\n",
				strerror(errno));
		goto error;
	}

	wd = inotify_add_watch(ifd, argv[1], IN_CLOSE_WRITE | IN_DELETE);
	if (wd < 0) {
		log("Error adding watch on '%s': %s\n",
				argv[1], strerror(errno));
		goto error;
	}

	put('z');
	if (read(ifd, buf, sizeof(buf)) < 0) {
		log("Error reading inotify event: %s\n",
				strerror(errno));
		goto error;
	}

	put('x');
	if (close(ifd)) {
		log("Error closing inotify: %s\n",
				strerror(errno));
		ifd = -1;
		goto error;
	}
	put('.');
	return EXIT_SUCCESS;

error:
	if (ifd > 0)
		close(ifd);
	return EXIT_FAILURE;
}
#include <stdint.h>
#include <stdlib.h>
#include <errno.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <sys/time.h>
#include <signal.h>

#define log(...) fprintf(stderr, __VA_ARGS__)
#define WATCHER "watcher"

static struct proc {
	uint64_t deadline;
	char filename[32 - sizeof(uint64_t)];
} procs[NPROC];

static pid_t
spawn_watcher(char *filename)
{
	char *argv[3] = { WATCHER, filename, NULL };
	pid_t pid;

	pid = fork();
	if (pid == 0) {
		if (execv(WATCHER, argv)) {
			log("Error spawning '%s': %s\n",
					WATCHER, strerror(errno));
			exit(EXIT_FAILURE);
		}
	}

	if (pid < 0)
		log("Error forking: %s\n", strerror(errno));
	
	return pid;
}

static int
touch(char *filename, int flags)
{
	int fd = open(filename, flags, 0644);
	if (fd < 0 || close(fd)) {
		log("Error touching '%s': %s\n",
				filename, strerror(errno));
		return -1;
	}
	return 0;
}

static uint64_t 
now(void)
{
	struct timeval tv;
	(void)gettimeofday(&tv, NULL);
	return ((uint64_t)tv.tv_sec * 1000000) + (uint64_t)tv.tv_usec;
}

static uint64_t
random_delay(void)
{
	return (uint64_t)DELAY_MIN + (random() / (RAND_MAX/(DELAY_MAX - DELAY_MIN)));
}

int
main()
{
	unsigned int i;
	struct proc *next;

	if (signal(SIGCHLD, SIG_IGN)) {
		log("Error setting SIGCHLD handler: %s\n",
				strerror(errno));
		return EXIT_FAILURE;
	}

	for (i = 0; i < NPROC; i++) {
		(void)sprintf(procs[i].filename, "%03u.tmp", i);
		if (touch(procs[i].filename, O_WRONLY | O_CREAT)) {
			log("Error creating file '%s': %s\n",
					procs[i].filename, strerror(errno));
			return EXIT_FAILURE;
		}
	}

	next = &procs[0];
	for (i = 0; i < NPROC; i++) {
		if (spawn_watcher(procs[i].filename) < 0)
			goto error;

		procs[i].deadline = now() + random_delay();
		if (procs[i].deadline < next->deadline)
			next = &procs[i];
	}

	while (1) {
		uint64_t n = now();

		if (next->deadline > n) {
			usleep(next->deadline - n);
			continue;
		}

		if (touch(next->filename, O_WRONLY))
			goto error;

		if (spawn_watcher(next->filename) < 0)
			goto error;

		next->deadline = now() + random_delay();
		for (i = 0; i < NPROC; i++) {
			if (procs[i].deadline < next->deadline)
				next = &procs[i];
		}
	}

error:
	for (i = 0; i < NPROC; i++)
		(void)touch(procs[i].filename, O_WRONLY);

	return EXIT_FAILURE;
}
NPROC     = 20
DELAY_MIN = 1000000
DELAY_MAX = 2000000

CC = gcc
CFLAGS ?= -O2 -pipe -g
CFLAGS += -std=gnu99 -Wall -Wextra -pedantic

all: test watcher

test: test.c
	$(CC) $(CFLAGS) -DNPROC=$(NPROC) -DDELAY_MIN=$(DELAY_MIN) -DDELAY_MAX=$(DELAY_MAX) $(LDFLAGS) $< -o $@

watcher: watcher.c
	$(CC) $(CFLAGS) $(LDFLAGS) $< -o $@

clean:
	rm -rf test watcher *.tmp

[Other Archives]     [Linux Kernel Newbies]     [Linux Driver Development]     [Linux Kbuild]     [Fedora Kernel]     [Linux Kernel Testers]     [Linux SH]     [Linux Omap]     [Linux Tape]     [Linux Input]     [Linux Kernel Janitors]     [Linux Kernel Packagers]     [Linux Doc]     [Linux Man Pages]     [Linux API]     [Linux Memory Management]     [Linux Modules]     [Linux Standards]     [Kernel Announce]     [Netdev]     [Git]     [Linux PCI]     Linux CAN Development     [Linux I2C]     [Linux RDMA]     [Linux NUMA]     [Netfilter]     [Netfilter Devel]     [SELinux]     [Bugtraq]     [FIO]     [Linux Perf Users]     [Linux Serial]     [Linux PPP]     [Linux ISDN]     [Linux Next]     [Kernel Stable Commits]     [Linux Tip Commits]     [Kernel MM Commits]     [Linux Security Module]     [AutoFS]     [Filesystem Development]     [Ext3 Filesystem]     [Linux bcache]     [Ext4 Filesystem]     [Linux BTRFS]     [Linux CEPH Filesystem]     [Linux XFS]     [XFS]     [Linux NFS]     [Linux CIFS]     [Ecryptfs]     [Linux NILFS]     [Linux Cachefs]     [Reiser FS]     [Initramfs]     [Linux FB Devel]     [Linux OpenGL]     [DRI Devel]     [Fastboot]     [Linux RT Users]     [Linux RT Stable]     [eCos]     [Corosync]     [Linux Clusters]     [LVS Devel]     [Hot Plug]     [Linux Virtualization]     [KVM]     [KVM PPC]     [KVM ia64]     [Linux Containers]     [Linux Hexagon]     [Linux Cgroups]     [Util Linux]     [Wireless]     [Linux Bluetooth]     [Bluez Devel]     [Ethernet Bridging]     [Embedded Linux]     [Barebox]     [Linux MMC]     [Linux IIO]     [Sparse]     [Smatch]     [Linux Arch]     [x86 Platform Driver]     [Linux ACPI]     [Linux IBM ACPI]     [LM Sensors]     [CPU Freq]     [Linux Power Management]     [Linmodems]     [Linux DCCP]     [Linux SCTP]     [ALSA Devel]     [Linux USB]     [Linux PA RISC]     [Linux Samsung SOC]     [MIPS Linux]     [IBM S/390 Linux]     [ARM Linux]     [ARM Kernel]     [ARM MSM]     [Tegra Devel]     [Sparc Linux]     [Linux Security]     [Linux Sound]     [Linux Media]     [Video 4 Linux]     [Linux IRDA Users]     [Linux for the blind]     [Linux RAID]     [Linux ATA RAID]     [Device Mapper]     [Linux SCSI]     [SCSI Target Devel]     [Linux SCSI Target Infrastructure]     [Linux IDE]     [Linux SMP]     [Linux AXP]     [Linux Alpha]     [Linux M68K]     [Linux ia64]     [Linux 8086]     [Linux x86_64]     [Linux Config]     [Linux Apps]     [Linux MSDOS]     [Linux X.25]     [Linux Crypto]     [DM Crypt]     [Linux Trace Users]     [Linux Btrace]     [Linux Watchdog]     [Utrace Devel]     [Linux C Programming]     [Linux Assembly]     [Dash]     [DWARVES]     [Hail Devel]     [Linux Kernel Debugger]     [Linux gcc]     [Gcc Help]     [X.Org]     [Wine]

Add to Google Powered by Linux

[Older Kernel Discussion]     [Yosemite National Park Forum]     [Large Format Photos]     [Gimp]     [Yosemite Photos]     [Stuff]