(original) (raw)

diff -Nru linux-2.5.44.vanilla/arch/i386/kernel/entry.S linux-2.5.44.epoll/arch/i386/kernel/entry.S --- linux-2.5.44.vanilla/arch/i386/kernel/entry.S Fri Oct 18 21:01:19 2002 +++ linux-2.5.44.epoll/arch/i386/kernel/entry.S Sat Oct 19 21:16:19 2002 @@ -737,6 +737,10 @@ .long sys_free_hugepages .long sys_exit_group .long sys_lookup_dcookie + .long sys_epoll_create + .long sys_epoll_ctl /* 255 */ + .long sys_epoll_wait + .rept NR_syscalls-(.-sys_call_table)/4 .long sys_ni_syscall diff -Nru linux-2.5.44.vanilla/drivers/char/Makefile linux-2.5.44.epoll/drivers/char/Makefile --- linux-2.5.44.vanilla/drivers/char/Makefile Fri Oct 18 21:02:32 2002 +++ linux-2.5.44.epoll/drivers/char/Makefile Tue Oct 22 10:08:40 2002 @@ -7,14 +7,14 @@ # FONTMAPFILE = cp437.uni -obj-y += mem.o tty_io.o n_tty.o tty_ioctl.o pty.o misc.o random.o +obj-y += mem.o tty_io.o n_tty.o tty_ioctl.o pty.o misc.o random.o eventpoll.o # All of the (potential) objects that export symbols. # This list comes from 'grep -l EXPORT_SYMBOL *.[hc]'. 
export-objs := busmouse.o vt.o generic_serial.o ip2main.o \ ite_gpio.o keyboard.o misc.o nvram.o random.o rtc.o \ - selection.o sonypi.o sysrq.o tty_io.o tty_ioctl.o + selection.o sonypi.o sysrq.o tty_io.o tty_ioctl.o eventpoll.o obj-$(CONFIG_VT) += vt_ioctl.o vc_screen.o consolemap.o consolemap_deftbl.o selection.o keyboard.o obj-$(CONFIG_HW_CONSOLE) += vt.o defkeymap.o diff -Nru linux-2.5.44.vanilla/drivers/char/eventpoll.c linux-2.5.44.epoll/drivers/char/eventpoll.c --- linux-2.5.44.vanilla/drivers/char/eventpoll.c Wed Dec 31 16:00:00 1969 +++ linux-2.5.44.epoll/drivers/char/eventpoll.c Tue Oct 29 16:23:32 2002 @@ -0,0 +1,1429 @@ +/* + * drivers/char/eventpoll.c ( Efficent event polling implementation ) + * Copyright (C) 2001,...,2002 Davide Libenzi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * Davide Libenzi davidel@xmailserver.org + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#define EVENTPOLLFS_MAGIC 0x03111965 /* My birthday should work for this :) */ + +#define DEBUG_EPOLL 0 + +#if DEBUG_EPOLL > 0 +#define DPRINTK(x) printk x +#define DNPRINTK(n, x) do { if ((n) <= DEBUG_EPOLL) printk x; } while (0) +#else /* #if DEBUG_EPOLL > 0 */ +#define DPRINTK(x) (void) 0 +#define DNPRINTK(n, x) (void) 0 +#endif /* #if DEBUG_EPOLL > 0 */ + +#define DEBUG_DPI 0 + +#if DEBUG_DPI != 0 +#define DPI_SLAB_DEBUG (SLAB_DEBUG_FREE | SLAB_RED_ZONE /* | SLAB_POISON */) +#else /* #if DEBUG_DPI != 0 */ +#define DPI_SLAB_DEBUG 0 +#endif /* #if DEBUG_DPI != 0 */ + +#define INITIAL_HASH_BITS 7 +#define MAX_HASH_BITS 18 +#define RESIZE_LENGTH 2 + +#define DPI_MEM_ALLOC() (struct epitem *) kmem_cache_alloc(dpi_cache, SLAB_KERNEL) +#define DPI_MEM_FREE(p) kmem_cache_free(dpi_cache, p) +#define IS_FILE_EPOLL(f) ((f)->f_op == &eventpoll_fops) + + +/* + * Type used for versioning events snapshots inside the double buffer. + */ +typedef unsigned long long event_version_t; + +/* + * This structure is stored inside the "private_data" member of the file + * structure and rapresent the main data sructure for the eventpoll + * interface. + */ +struct eventpoll { + /* + * Protect the evenpoll interface from sys_epoll_ctl(2), ioctl(EP_POLL) + * and ->write() concurrency. It basically serialize the add/remove/edit + * of items in the interest set. + */ + struct rw_semaphore acsem; + + /* + * Protect the this structure access. When the "acsem" is acquired + * togheter with this one, "acsem" should be acquired first. Or, + * "lock" nests inside "acsem". 
+ */ + rwlock_t lock; + + /* Wait queue used by sys_epoll_wait() and ioctl(EP_POLL) */ + wait_queue_head_t wq; + + /* Wait queue used by file->poll() */ + wait_queue_head_t poll_wait; + + /* This is the hash used to store the "struct epitem" elements */ + struct list_head *hash; + + unsigned int hbits; + unsigned int hmask; + atomic_t hents; + atomic_t resize; + + /* Number of pages currently allocated in each side of the double buffer */ + int numpages; + + /* + * Current page set pointer, switched from "pages0" and "pages1" each time + * ep_poll() returns events to the caller. + */ + char **pages; + + /* Each one of these contains the pages allocated for each side of + * the double buffer. + */ + char *pages0[MAX_EVENTPOLL_PAGES]; + char *pages1[MAX_EVENTPOLL_PAGES]; + + /* + * Variable containing the vma base address where the double buffer + * pages are mapped onto. + */ + unsigned long vmabase; + + /* + * Certain functions cannot be called if the double buffer pages are + * not allocated and if the memory mapping is not in place. This tells + * us that everything is setup to fully use the interface. + */ + atomic_t mmapped; + + /* Number of events currently available inside the current snapshot */ + int eventcnt; + + /* + * Variable storing the current "version" of the snapshot. It is used + * to validate the validity of the current slot pointed by the "index" + * member of a "struct epitem". + */ + event_version_t ver; +}; + +/* + * Each file descriptor added to the eventpoll interface will + * have an entry of this type linked to the hash. 
+ */
+struct epitem {
+	/* List header used to link this structure to the eventpoll hash */
+	struct list_head llink;
+
+	/* The "container" of this item */
+	struct eventpoll *ep;
+
+	/* The file this item refers to */
+	struct file *file;
+
+	/* The structure that describes the events of interest and the source fd */
+	struct pollfd pfd;
+
+	/*
+	 * The index inside the current double buffer that stores the active
+	 * event slot for this item ( file ). It is -1 when no slot is assigned.
+	 */
+	int index;
+
+	/*
+	 * The version that is used to validate if the current slot is still
+	 * valid or if it refers to an old snapshot. It is matched together
+	 * with the one inside the eventpoll structure.
+	 */
+	event_version_t ver;
+};
+
+
+
+
+static int ep_getfd(int *efd, struct inode **einode, struct file **efile);
+static int ep_alloc_pages(char **pages, int numpages);
+static int ep_free_pages(char **pages, int numpages);
+static int ep_init(struct eventpoll *ep);
+static void ep_free(struct eventpoll *ep);
+static struct epitem *ep_find_nl(struct eventpoll *ep, int fd);
+static struct epitem *ep_find(struct eventpoll *ep, int fd);
+static int ep_hashresize(struct eventpoll *ep, unsigned long *kflags);
+static int ep_insert(struct eventpoll *ep, struct pollfd *pfd);
+static int ep_remove(struct eventpoll *ep, struct epitem *dpi);
+static void notify_proc(struct file *file, void *data, unsigned long *local,
+			long *event);
+static int open_eventpoll(struct inode *inode, struct file *file);
+static int close_eventpoll(struct inode *inode, struct file *file);
+static unsigned int poll_eventpoll(struct file *file, poll_table *wait);
+static int write_eventpoll(struct file *file, const char *buffer, size_t count,
+			   loff_t *ppos);
+static int ep_poll(struct eventpoll *ep, struct evpoll *dvp);
+static int ep_do_alloc_pages(struct eventpoll *ep, int numpages);
+static int ioctl_eventpoll(struct inode *inode, struct file *file,
+			   unsigned int cmd, unsigned long arg);
+static void eventpoll_mm_open(struct vm_area_struct * vma);
+static void eventpoll_mm_close(struct vm_area_struct * vma);
+static int mmap_eventpoll(struct file *file, struct vm_area_struct *vma);
+static int eventpollfs_delete_dentry(struct dentry *dentry);
+static struct inode *get_eventpoll_inode(void);
+static struct super_block *eventpollfs_get_sb(struct file_system_type *fs_type,
+					      int flags, char *dev_name, void *data);
+
+
+
+/* Slab cache used to allocate "struct epitem" */
+static kmem_cache_t *dpi_cache;
+
+/* Virtual fs used to allocate inodes for eventpoll files */
+static struct vfsmount *eventpoll_mnt;
+
+/* File callbacks that implement the eventpoll file behaviour */
+static struct file_operations eventpoll_fops = {
+	.write = write_eventpoll,
+	.ioctl = ioctl_eventpoll,
+	.mmap = mmap_eventpoll,
+	.open = open_eventpoll,
+	.release = close_eventpoll,
+	.poll = poll_eventpoll
+};
+
+/* Memory mapping callbacks for the eventpoll file */
+static struct vm_operations_struct eventpoll_mmap_ops = {
+	.open = eventpoll_mm_open,
+	.close = eventpoll_mm_close,
+};
+
+/*
+ * The "struct miscdevice" is used to register the eventpoll device
+ * to make it suitable to be opened from a /dev file.
+ */
+static struct miscdevice eventpoll_miscdev = {
+	EVENTPOLL_MINOR, "eventpoll", &eventpoll_fops
+};
+
+/*
+ * This is used to register the virtual file system from which
+ * eventpoll inodes are allocated.
+ */
+static struct file_system_type eventpoll_fs_type = {
+	.name = "eventpollfs",
+	.get_sb = eventpollfs_get_sb,
+	.kill_sb = kill_anon_super,
+};
+
+/* Very basic directory entry operations for the eventpoll virtual file system */
+static struct dentry_operations eventpollfs_dentry_operations = {
+	.d_delete = eventpollfs_delete_dentry,
+};
+
+
+
+/*
+ * It opens an eventpoll file descriptor by allocating space for "maxfds"
+ * file descriptors. It is the kernel part of the userspace epoll_create(2). 
+ */
+asmlinkage int sys_epoll_create(int maxfds)
+{
+	int error = -EINVAL, fd;
+	unsigned long addr;
+	struct inode *inode;
+	struct file *file;
+	struct eventpoll *ep;
+
+	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d)\n",
+		     current, maxfds));
+
+	/*
+	 * It is not possible to store more than MAX_FDS_IN_EVENTPOLL file
+	 * descriptors inside the eventpoll interface. A negative count is
+	 * rejected as well, before it reaches the page/mapping size macros.
+	 */
+	if (maxfds < 0 || maxfds > MAX_FDS_IN_EVENTPOLL)
+		goto eexit_1;
+
+	/*
+	 * Creates all the items needed to set up an eventpoll file. That is,
+	 * a file structure, an inode and a free file descriptor.
+	 */
+	error = ep_getfd(&fd, &inode, &file);
+	if (error)
+		goto eexit_1;
+
+	/*
+	 * Calls the code to initialize the eventpoll file. This code is
+	 * the same as the "open" file operation callback because inside
+	 * ep_getfd() we did what the kernel usually does before invoking
+	 * the corresponding file "open" callback.
+	 */
+	error = open_eventpoll(inode, file);
+	if (error)
+		goto eexit_2;
+
+	/* The "private_data" member is set up by open_eventpoll() */
+	ep = file->private_data;
+
+	/* Alloc pages for the event double buffer */
+	error = ep_do_alloc_pages(ep, EP_FDS_PAGES(maxfds + 1));
+	if (error)
+		goto eexit_2;
+
+	/*
+	 * Create a user space mapping of the event double buffer to
+	 * avoid kernel to user space memory copy when returning events
+	 * to the caller.
+	 */
+	down_write(&current->mm->mmap_sem);
+	addr = do_mmap_pgoff(file, 0, EP_MAP_SIZE(maxfds + 1), PROT_READ,
+			     MAP_PRIVATE, 0);
+	up_write(&current->mm->mmap_sem);
+	error = PTR_ERR((void *) addr);
+	if (IS_ERR((void *) addr))
+		goto eexit_2;
+
+	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
+		     current, maxfds, fd));
+
+	return fd;
+
+eexit_2:
+	/* The fd is already installed by ep_getfd(), so closing it releases the file */
+	sys_close(fd);
+eexit_1:
+	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
+		     current, maxfds, error));
+	return error;
+}
+
+
+/*
+ * The following function implements the controller interface for the eventpoll
+ * file that enables the insertion/removal/change of file descriptors inside
+ * the interest set. It represents the kernel part of the user space epoll_ctl(2).
+ *
+ * Returns 0 on success, -EBADF if "epfd" is not an open descriptor, -EINVAL if
+ * it is not an eventpoll file or "op" is unknown, -EEXIST / -ENOENT when the
+ * target "fd" is already / not yet in the interest set, or the error returned
+ * by ep_insert() / ep_remove().
+ */
+asmlinkage int sys_epoll_ctl(int epfd, int op, int fd, unsigned int events)
+{
+	int error = -EBADF;
+	struct file *file;
+	struct eventpoll *ep;
+	struct epitem *dpi;
+	struct pollfd pfd;
+
+	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %u)\n",
+		     current, epfd, op, fd, events));
+
+	file = fget(epfd);
+	if (!file)
+		goto eexit_1;
+
+	/*
+	 * We have to check that the file structure underneath the file descriptor
+	 * the user passed to us _is_ an eventpoll file.
+	 */
+	error = -EINVAL;
+	if (!IS_FILE_EPOLL(file))
+		goto eexit_2;
+
+	/*
+	 * At this point it is safe to assume that the "private_data" contains
+	 * our own data structure.
+	 */
+	ep = file->private_data;
+
+	down_write(&ep->acsem);
+
+	/* POLLERR and POLLHUP are always added to the requested event mask */
+	pfd.fd = fd;
+	pfd.events = events | POLLERR | POLLHUP;
+	pfd.revents = 0;
+
+	dpi = ep_find(ep, fd);
+
+	error = -EINVAL;
+	switch (op) {
+	case EP_CTL_ADD:
+		if (!dpi)
+			error = ep_insert(ep, &pfd);
+		else
+			error = -EEXIST;
+		break;
+	case EP_CTL_DEL:
+		if (dpi)
+			error = ep_remove(ep, dpi);
+		else
+			error = -ENOENT;
+		break;
+	case EP_CTL_MOD:
+		if (dpi) {
+			/*
+			 * Store the same augmented mask EP_CTL_ADD stores, so that
+			 * a modify does not silently drop POLLERR | POLLHUP.
+			 */
+			dpi->pfd.events = pfd.events;
+			error = 0;
+		} else
+			error = -ENOENT;
+		break;
+	}
+
+	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %u) = %d\n",
+		     current, epfd, op, fd, events, error));
+
+	up_write(&ep->acsem);
+
+eexit_2:
+	fput(file);
+eexit_1:
+	return error;
+}
+
+
+/*
+ * Implement the event wait interface for the eventpoll file. It is the kernel
+ * part of the user space epoll_wait(2).
+ *
+ * On success the number of ready events ( > 0 ) is returned and the user space
+ * address of the snapshot inside the mapped double buffer is stored in
+ * "*events". Zero or a negative error is returned otherwise.
+ */
+asmlinkage int sys_epoll_wait(int epfd, struct pollfd const **events, int timeout)
+{
+	int error = -EBADF;
+	void *eaddr;
+	struct file *file;
+	struct eventpoll *ep;
+	struct evpoll dvp;
+
+	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d)\n",
+		     current, epfd, events, timeout));
+
+	file = fget(epfd);
+	if (!file)
+		goto eexit_1;
+
+	/*
+	 * We have to check that the file structure underneath the file descriptor
+	 * the user passed to us _is_ an eventpoll file.
+	 */
+	error = -EINVAL;
+	if (!IS_FILE_EPOLL(file))
+		goto eexit_2;
+
+	/*
+	 * At this point it is safe to assume that the "private_data" contains
+	 * our own data structure.
+	 */
+	ep = file->private_data;
+
+	/*
+	 * It is possible that the user created an eventpoll file by open()ing
+	 * the corresponding /dev/ file and he did not perform the correct
+	 * initialization required by the old /dev/epoll interface. This test
+	 * protects us from this scenario.
+	 */
+	error = -EINVAL;
+	if (!atomic_read(&ep->mmapped))
+		goto eexit_2;
+
+	dvp.ep_timeout = timeout;
+	error = ep_poll(ep, &dvp);
+	if (error > 0) {
+		/* Translate the buffer offset into an address inside the user mapping */
+		eaddr = (void *) (ep->vmabase + dvp.ep_resoff);
+		if (copy_to_user(events, &eaddr, sizeof(struct pollfd *)))
+			error = -EFAULT;
+	}
+
+	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d) = %d\n",
+		     current, epfd, events, timeout, error));
+
+eexit_2:
+	fput(file);
+eexit_1:
+	return error;
+}
+
+
+/*
+ * Creates the file descriptor to be used by the epoll interface.
+ *
+ * On success returns 0 and stores the new descriptor, inode and file in
+ * "*efd", "*einode" and "*efile". On failure returns a negative error and
+ * releases whatever had been allocated up to that point.
+ */
+static int ep_getfd(int *efd, struct inode **einode, struct file **efile)
+{
+	struct qstr this;
+	char name[32];
+	struct dentry *dentry;
+	struct inode *inode;
+	struct file *file;
+	int error, fd;
+
+	/* Get a ready to use file */
+	error = -ENFILE;
+	file = get_empty_filp();
+	if (!file)
+		goto eexit_1;
+
+	/* Allocates an inode from the eventpoll file system */
+	inode = get_eventpoll_inode();
+	error = PTR_ERR(inode);
+	if (IS_ERR(inode))
+		goto eexit_2;
+
+	/* Allocates a free descriptor to plug the file onto */
+	error = get_unused_fd();
+	if (error < 0)
+		goto eexit_3;
+	fd = error;
+
+	/*
+	 * Link the inode to a directory entry by creating a unique name
+	 * using the inode number.
+	 */
+	error = -ENOMEM;
+	sprintf(name, "[%lu]", inode->i_ino);
+	this.name = name;
+	this.len = strlen(name);
+	this.hash = inode->i_ino;
+	dentry = d_alloc(eventpoll_mnt->mnt_sb->s_root, &this);
+	if (!dentry)
+		goto eexit_4;
+	dentry->d_op = &eventpollfs_dentry_operations;
+	d_add(dentry, inode);
+	file->f_vfsmnt = mntget(eventpoll_mnt);
+	file->f_dentry = dget(dentry);
+
+	/*
+	 * Initialize the file as read/write because it could be used
+	 * with write() to add/remove/change interest sets.
+	 */
+	file->f_pos = 0;
+	file->f_flags = O_RDWR;
+	file->f_op = &eventpoll_fops;
+	file->f_mode = FMODE_READ | FMODE_WRITE;
+	file->f_version = 0;
+	file->private_data = NULL;
+
+	/* Install the new setup file into the allocated fd. */
+	fd_install(fd, file);
+
+	*efd = fd;
+	*einode = inode;
+	*efile = file;
+	return 0;
+
+eexit_4:
+	put_unused_fd(fd);
+eexit_3:
+	iput(inode);
+eexit_2:
+	put_filp(file);
+eexit_1:
+	return error;
+}
+
+
+/*
+ * Allocates "numpages" single pages into "pages[]" and marks each one
+ * reserved. On allocation failure the pages obtained so far are released
+ * and -ENOMEM is returned; 0 is returned on success.
+ */
+static int ep_alloc_pages(char **pages, int numpages)
+{
+	int ii;
+
+	for (ii = 0; ii < numpages; ii++) {
+		pages[ii] = (char *) __get_free_pages(GFP_KERNEL, 0);
+		if (!pages[ii]) {
+			for (--ii; ii >= 0; ii--) {
+				ClearPageReserved(virt_to_page(pages[ii]));
+				free_pages((unsigned long) pages[ii], 0);
+			}
+			return -ENOMEM;
+		}
+		SetPageReserved(virt_to_page(pages[ii]));
+	}
+	return 0;
+}
+
+
+/*
+ * Clears the reserved bit of, and frees, the first "numpages" pages stored
+ * in "pages[]". Always returns 0.
+ */
+static int ep_free_pages(char **pages, int numpages)
+{
+	int ii;
+
+	for (ii = 0; ii < numpages; ii++) {
+		ClearPageReserved(virt_to_page(pages[ii]));
+		free_pages((unsigned long) pages[ii], 0);
+	}
+	return 0;
+}
+
+
+/*
+ * Initializes every member of "ep" and allocates the initial hash table
+ * ( 1 << INITIAL_HASH_BITS buckets ). Returns 0, or -ENOMEM if the hash
+ * cannot be allocated.
+ */
+static int ep_init(struct eventpoll *ep)
+{
+	int ii, hentries;
+
+	init_rwsem(&ep->acsem);
+	rwlock_init(&ep->lock);
+	init_waitqueue_head(&ep->wq);
+	init_waitqueue_head(&ep->poll_wait);
+	ep->hbits = INITIAL_HASH_BITS;
+	ep->hmask = (1 << ep->hbits) - 1;
+	atomic_set(&ep->hents, 0);
+	atomic_set(&ep->resize, 0);
+	atomic_set(&ep->mmapped, 0);
+	ep->numpages = 0;
+	ep->vmabase = 0;
+	ep->pages = ep->pages0;
+	ep->eventcnt = 0;
+	ep->ver = 1;
+
+	hentries = ep->hmask + 1;
+	if (!(ep->hash = (struct list_head *) vmalloc(hentries * sizeof(struct list_head))))
+		return -ENOMEM;
+
+	for (ii = 0; ii < hentries; ii++)
+		INIT_LIST_HEAD(&ep->hash[ii]);
+
+	return 0;
+}
+
+
+/*
+ * Releases everything owned by "ep" except the structure itself: the items
+ * in the hash, the hash table and both sides of the double buffer.
+ */
+static void ep_free(struct eventpoll *ep)
+{
+	int ii;
+	struct list_head *lnk;
+
+	/*
+	 * Walks through the whole hash by unregistering file callbacks and
+	 * freeing each "struct epitem".
+	 */
+	for (ii = 0; ii <= ep->hmask; ii++) {
+		while ((lnk = list_first(&ep->hash[ii]))) {
+			struct epitem *dpi = list_entry(lnk, struct epitem, llink);
+
+			file_notify_delcb(dpi->file, notify_proc);
+			list_del(lnk);
+			DPI_MEM_FREE(dpi);
+		}
+	}
+	/*
+	 * At this point we can free the hash and the pages used for the event
+	 * double buffer. The ep_free() function is called from the "close"
+	 * file operations callback, and this guarantees us that the pages are
+	 * already unmapped.
+	 */
+	vfree(ep->hash);
+	if (ep->numpages > 0) {
+		ep_free_pages(ep->pages0, ep->numpages);
+		ep_free_pages(ep->pages1, ep->numpages);
+	}
+}
+
+
+/*
+ * No lock version of ep_find(), used when the code had to acquire the lock
+ * before calling the function. Returns the item whose pfd.fd equals "fd",
+ * or NULL if the bucket does not contain one.
+ */
+static struct epitem *ep_find_nl(struct eventpoll *ep, int fd)
+{
+	struct epitem *dpi = NULL;
+	struct list_head *lsthead, *lnk;
+
+	lsthead = &ep->hash[fd & ep->hmask];
+	list_for_each(lnk, lsthead) {
+		dpi = list_entry(lnk, struct epitem, llink);
+
+		if (dpi->pfd.fd == fd) break;
+		/* Reset so that a full scan without a match yields NULL */
+		dpi = NULL;
+	}
+
+	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_find(%d) -> %p\n", current, fd, dpi));
+
+	return dpi;
+}
+
+
+/*
+ * Looks up the item for "fd" while holding "ep->lock" for reading.
+ * The lock is dropped before returning.
+ */
+static struct epitem *ep_find(struct eventpoll *ep, int fd)
+{
+	struct epitem *dpi;
+	unsigned long flags;
+
+	read_lock_irqsave(&ep->lock, flags);
+
+	dpi = ep_find_nl(ep, fd);
+
+	read_unlock_irqrestore(&ep->lock, flags);
+
+	return dpi;
+}
+
+
+/*
+ * Doubles the number of hash buckets. It is entered with "ep->lock" held for
+ * writing ( irq flags in "*kflags" ) and returns with the lock held again,
+ * but the lock is dropped around vmalloc() and vfree(). "ep->resize" is
+ * decremented on every exit path. Returns 0 or -ENOMEM.
+ */
+static int ep_hashresize(struct eventpoll *ep, unsigned long *kflags)
+{
+	struct list_head *hash, *oldhash;
+	unsigned int hbits = ep->hbits + 1;
+	unsigned int hmask = (1 << hbits) - 1;
+	int ii, res, hentries = hmask + 1;
+	unsigned long flags = *kflags;
+
+	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_hashresize(%p) bits=%u\n", current, ep, hbits));
+
+	write_unlock_irqrestore(&ep->lock, flags);
+
+	res = -ENOMEM;
+	if (!(hash = (struct list_head *) vmalloc(hentries * sizeof(struct list_head)))) {
+		write_lock_irqsave(&ep->lock, flags);
+		goto eexit_1;
+	}
+
+	for (ii = 0; ii < hentries; ii++)
+		INIT_LIST_HEAD(&hash[ii]);
+
+	write_lock_irqsave(&ep->lock, flags);
+
+	/* Move every item from the old buckets to its bucket in the new table */
+	oldhash = ep->hash;
+	for (ii = 0; ii <= ep->hmask; ii++) {
+		struct list_head *oldhead = &oldhash[ii], *lnk;
+
+		while ((lnk = list_first(oldhead))) {
+			struct epitem *dpi = list_entry(lnk, struct epitem, llink);
+
+			list_del(lnk);
+			list_add(lnk, &hash[dpi->pfd.fd & hmask]);
+		}
+	}
+
+	ep->hash = hash;
+	ep->hbits = hbits;
+	ep->hmask = hmask;
+
+	write_unlock_irqrestore(&ep->lock, flags);
+	vfree(oldhash);
+	write_lock_irqsave(&ep->lock, flags);
+
+	res = 0;
+eexit_1:
+	*kflags = flags;
+	atomic_dec(&ep->resize);
+	return res;
+}
+
+
+/*
+ * Adds a new item for "pfd->fd" to the interest set and registers
+ * notify_proc() on the target file.
+ *
+ * Returns 0 on success, -E2BIG when the item count has reached the capacity
+ * of the double buffer, -EBADF if "pfd->fd" is not an open descriptor, or
+ * -ENOMEM. The reference taken with fget() is dropped on every path,
+ * including success.
+ */
+static int ep_insert(struct eventpoll *ep, struct pollfd *pfd)
+{
+	int error;
+	struct epitem *dpi;
+	struct file *file;
+	unsigned long flags;
+
+	if (atomic_read(&ep->hents) >= (ep->numpages * POLLFD_X_PAGE))
+		return -E2BIG;
+
+	file = fget(pfd->fd);
+	if (!file)
+		return -EBADF;
+
+	error = -ENOMEM;
+	if (!(dpi = DPI_MEM_ALLOC()))
+		goto eexit_1;
+
+	INIT_LIST_HEAD(&dpi->llink);
+	dpi->ep = ep;
+	dpi->file = file;
+	dpi->pfd = *pfd;
+	dpi->index = -1;
+	/* Start one version behind so the item owns no slot in the current snapshot */
+	dpi->ver = ep->ver - 1;
+
+	write_lock_irqsave(&ep->lock, flags);
+
+	list_add(&dpi->llink, &ep->hash[pfd->fd & ep->hmask]);
+	atomic_inc(&ep->hents);
+
+	/* Grow the hash when the average chain length exceeds RESIZE_LENGTH */
+	if (!atomic_read(&ep->resize) &&
+	    (atomic_read(&ep->hents) >> ep->hbits) > RESIZE_LENGTH &&
+	    ep->hbits < MAX_HASH_BITS) {
+		atomic_inc(&ep->resize);
+		ep_hashresize(ep, &flags);
+	}
+
+	write_unlock_irqrestore(&ep->lock, flags);
+
+	file_notify_addcb(file, notify_proc, dpi);
+
+	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_insert(%p, %d)\n",
+		     current, ep, pfd->fd));
+
+	error = 0;
+eexit_1:
+	fput(file);
+
+	return error;
+}
+
+
+/*
+ * Removes a "struct epitem" from the eventpoll hash and deallocates
+ * all the associated resources. 
+ */ +static int ep_remove(struct eventpoll *ep, struct epitem *dpi) +{ + unsigned long flags; + struct pollfd *pfd, *lpfd; + struct epitem *ldpi; + + /* First, removes the callback from the file callback list */ + file_notify_delcb(dpi->file, notify_proc); + + write_lock_irqsave(&ep->lock, flags); + + list_del(&dpi->llink); + atomic_dec(&ep->hents); + + /* + * This is to remove stale events. We don't want that the removed file + * has a pending event that might be associated with a file inserted + * at a later time inside the eventpoll interface. this code checks + * if the currently removed file has a valid pending event and, if it does, + * manages things to remove it and decrement the currently available + * event count. + */ + if (dpi->index >= 0 && dpi->ver == ep->ver && dpi->index < ep->eventcnt) { + pfd = (struct pollfd *) (ep->pages[EVENT_PAGE_INDEX(dpi->index)] + + EVENT_PAGE_OFFSET(dpi->index)); + if (pfd->fd == dpi->pfd.fd && dpi->index < --ep->eventcnt) { + lpfd = (struct pollfd *) (ep->pages[EVENT_PAGE_INDEX(ep->eventcnt)] + + EVENT_PAGE_OFFSET(ep->eventcnt)); + *pfd = *lpfd; + + if ((ldpi = ep_find_nl(ep, pfd->fd))) ldpi->index = dpi->index; + } + } + + write_unlock_irqrestore(&ep->lock, flags); + + DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_remove(%p, %d)\n", + current, ep, dpi->pfd.fd)); + + /* At this point it is safe to free the eventpoll item */ + DPI_MEM_FREE(dpi); + + return 0; +} + + +/* + * This is the event notify callback that is called from fs/fcblist.c because + * of the registration ( file_notify_addcb() ) done in ep_insert(). + */ +static void notify_proc(struct file *file, void *data, unsigned long *local, + long *event) +{ + struct epitem *dpi = data; + struct eventpoll *ep = dpi->ep; + struct pollfd *pfd; + + DNPRINTK(3, (KERN_INFO "[%p] eventpoll: notify(%p, %p, %ld, %ld) ep=%p\n", + current, file, data, event[0], event[1], ep)); + + write_lock(&ep->lock); + + /* We're not expecting any of those events. Jump out soon ... 
*/ + if (!(dpi->pfd.events & event[1])) + goto out; + + /* + * This logic determins if an active even slot is available for the + * currently signaled file, or if we have to make space for a new one + * and increment the number of ready file descriptors ( ep->eventcnt ). + */ + if (dpi->index < 0 || dpi->ver != ep->ver) { + if (ep->eventcnt >= (ep->numpages * POLLFD_X_PAGE)) + goto out; + dpi->index = ep->eventcnt++; + dpi->ver = ep->ver; + pfd = (struct pollfd *) (ep->pages[EVENT_PAGE_INDEX(dpi->index)] + + EVENT_PAGE_OFFSET(dpi->index)); + *pfd = dpi->pfd; + } else { + pfd = (struct pollfd *) (ep->pages[EVENT_PAGE_INDEX(dpi->index)] + + EVENT_PAGE_OFFSET(dpi->index)); + if (pfd->fd != dpi->pfd.fd) { + if (ep->eventcnt >= (ep->numpages * POLLFD_X_PAGE)) + goto out; + dpi->index = ep->eventcnt++; + pfd = (struct pollfd *) (ep->pages[EVENT_PAGE_INDEX(dpi->index)] + + EVENT_PAGE_OFFSET(dpi->index)); + *pfd = dpi->pfd; + } + } + + /* + * Merge event bits into the corresponding event slot inside the + * double buffer. + */ + pfd->revents |= (pfd->events & event[1]); + + /* + * Wake up ( if active ) both the eventpoll wait list and the ->poll() + * wait list. 
+ */ + if (waitqueue_active(&ep->wq)) + wake_up(&ep->wq); + if (waitqueue_active(&ep->poll_wait)) + wake_up(&ep->poll_wait); +out: + write_unlock(&ep->lock); +} + + +static int open_eventpoll(struct inode *inode, struct file *file) +{ + int res; + struct eventpoll *ep; + + if (!(ep = kmalloc(sizeof(struct eventpoll), GFP_KERNEL))) + return -ENOMEM; + + memset(ep, 0, sizeof(*ep)); + if ((res = ep_init(ep))) { + kfree(ep); + return res; + } + + file->private_data = ep; + + DNPRINTK(3, (KERN_INFO "[%p] eventpoll: open() ep=%p\n", current, ep)); + return 0; +} + + +static int close_eventpoll(struct inode *inode, struct file *file) +{ + struct eventpoll *ep = file->private_data; + + if (ep) { + ep_free(ep); + kfree(ep); + } + + DNPRINTK(3, (KERN_INFO "[%p] eventpoll: close() ep=%p\n", current, ep)); + return 0; +} + + +static unsigned int poll_eventpoll(struct file *file, poll_table *wait) +{ + struct eventpoll *ep = file->private_data; + + poll_wait(file, &ep->poll_wait, wait); + if (ep->eventcnt) + return POLLIN | POLLRDNORM; + + return 0; +} + + +static int write_eventpoll(struct file *file, const char *buffer, size_t count, + loff_t *ppos) +{ + int rcount; + struct eventpoll *ep = file->private_data; + struct epitem *dpi; + struct pollfd pfd; + + DNPRINTK(3, (KERN_INFO "[%p] eventpoll: write(%p, %d)\n", current, ep, count)); + + /* The size of the write must be a multiple of sizeof(struct pollfd) */ + rcount = -EINVAL; + if (count % sizeof(struct pollfd)) + goto eexit_1; + + /* + * And we have also to verify that that area is correctly accessible + * for the user. 
+ */ + if ((rcount = verify_area(VERIFY_READ, buffer, count))) + goto eexit_1; + + down_write(&ep->acsem); + + rcount = 0; + + while (count > 0) { + if (__copy_from_user(&pfd, buffer, sizeof(pfd))) { + rcount = -EFAULT; + goto eexit_2; + } + + dpi = ep_find(ep, pfd.fd); + + if (pfd.fd >= current->files->max_fds || !current->files->fd[pfd.fd]) + pfd.events = POLLREMOVE; + if (pfd.events & POLLREMOVE) { + if (dpi) { + ep_remove(ep, dpi); + rcount += sizeof(pfd); + } + } + else if (dpi) { + dpi->pfd.events = pfd.events; + rcount += sizeof(pfd); + } else { + pfd.revents = 0; + if (!ep_insert(ep, &pfd)) + rcount += sizeof(pfd); + } + + buffer += sizeof(pfd); + count -= sizeof(pfd); + } + +eexit_2: + up_write(&ep->acsem); +eexit_1: + DNPRINTK(3, (KERN_INFO "[%p] eventpoll: write(%p, %d) = %d\n", + current, ep, count, rcount)); + + return rcount; +} + + +static int ep_poll(struct eventpoll *ep, struct evpoll *dvp) +{ + int res = 0; + long timeout; + unsigned long flags; + wait_queue_t wait; + + /* + * We don't want ep_poll() to be called if the correct sequence + * of operations are performed to initialize it. This won't happen + * for the system call interface but it could happen using the + * old /dev/epoll interface, that is maintained for compatibility. + */ + if (!atomic_read(&ep->mmapped)) + return -EINVAL; + + write_lock_irqsave(&ep->lock, flags); + + res = 0; + if (!ep->eventcnt) { + /* + * We don't have any available event to return to the caller. + * We need to sleep here, and we will be wake up by + * notify_proc() when events will become available. + */ + init_waitqueue_entry(&wait, current); + add_wait_queue(&ep->wq, &wait); + + /* + * Calculate the timeout by checking for the "infinite" value ( -1 ) + * and the overflow condition ( > MAX_SCHEDULE_TIMEOUT / HZ ). The + * passed timeout is in milliseconds, that why (t * HZ) / 1000. + */ + timeout = dvp->ep_timeout == -1 || dvp->ep_timeout > MAX_SCHEDULE_TIMEOUT / HZ ? 
+ MAX_SCHEDULE_TIMEOUT: (dvp->ep_timeout * HZ) / 1000; + + for (;;) { + /* + * We don't want to sleep if the notify_proc() sends us + * a wakeup in between. That's why we set the task state + * to TASK_INTERRUPTIBLE before doing the checks. + */ + set_current_state(TASK_INTERRUPTIBLE); + if (ep->eventcnt || !timeout) + break; + if (signal_pending(current)) { + res = -EINTR; + break; + } + + write_unlock_irqrestore(&ep->lock, flags); + timeout = schedule_timeout(timeout); + write_lock_irqsave(&ep->lock, flags); + } + remove_wait_queue(&ep->wq, &wait); + + set_current_state(TASK_RUNNING); + } + + /* + * If we've been wake up because of events became available, we need to: + * + * 1) null the number of available ready file descriptors + * 2) increment the version of the current ( next ) snapshot + * 3) swap the double buffer to return the current one to the caller + */ + if (!res && ep->eventcnt) { + res = ep->eventcnt; + ep->eventcnt = 0; + ++ep->ver; + if (ep->pages == ep->pages0) { + ep->pages = ep->pages1; + dvp->ep_resoff = 0; + } else { + ep->pages = ep->pages0; + dvp->ep_resoff = ep->numpages * PAGE_SIZE; + } + } + + write_unlock_irqrestore(&ep->lock, flags); + + return res; +} + + +static int ep_do_alloc_pages(struct eventpoll *ep, int numpages) +{ + int res, pgalloc, pgcpy; + unsigned long flags; + char **pages, **pages0, **pages1; + + if (atomic_read(&ep->mmapped)) + return -EBUSY; + if (numpages > MAX_EVENTPOLL_PAGES) + return -EINVAL; + + pgalloc = numpages - ep->numpages; + if ((pages = (char **) vmalloc(2 * (pgalloc + 1) * sizeof(char *))) == NULL) + return -ENOMEM; + pages0 = &pages[0]; + pages1 = &pages[pgalloc + 1]; + + if ((res = ep_alloc_pages(pages0, pgalloc))) + goto eexit_1; + + if ((res = ep_alloc_pages(pages1, pgalloc))) { + ep_free_pages(pages0, pgalloc); + goto eexit_1; + } + + write_lock_irqsave(&ep->lock, flags); + pgcpy = (ep->numpages + pgalloc) > numpages ? 
numpages - ep->numpages: pgalloc; + if (pgcpy > 0) { + memcpy(&ep->pages0[ep->numpages], pages0, pgcpy * sizeof(char *)); + memcpy(&ep->pages1[ep->numpages], pages1, pgcpy * sizeof(char *)); + ep->numpages += pgcpy; + } + write_unlock_irqrestore(&ep->lock, flags); + + if (pgcpy < pgalloc) { + if (pgcpy < 0) + pgcpy = 0; + ep_free_pages(&pages0[pgcpy], pgalloc - pgcpy); + ep_free_pages(&pages1[pgcpy], pgalloc - pgcpy); + } + +eexit_1: + vfree(pages); + return res; +} + + +static int ioctl_eventpoll(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg) +{ + int res; + struct eventpoll *ep = file->private_data; + struct epitem *dpi; + unsigned long flags; + struct pollfd pfd; + struct evpoll dvp; + + switch (cmd) { + case EP_ALLOC: + res = ep_do_alloc_pages(ep, EP_FDS_PAGES(arg)); + + DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ioctl(%p, EP_ALLOC, %lu) == %d\n", + current, ep, arg, res)); + return res; + + case EP_FREE: + if (atomic_read(&ep->mmapped)) + return -EBUSY; + + res = -EINVAL; + write_lock_irqsave(&ep->lock, flags); + if (ep->numpages > 0) { + ep_free_pages(ep->pages0, ep->numpages); + ep_free_pages(ep->pages1, ep->numpages); + ep->numpages = 0; + ep->pages = ep->pages0; + res = 0; + } + write_unlock_irqrestore(&ep->lock, flags); + + DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ioctl(%p, EP_FREE) == %d\n", + current, ep, res)); + return res; + + case EP_POLL: + if (copy_from_user(&dvp, (void *) arg, sizeof(struct evpoll))) + return -EFAULT; + + DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ioctl(%p, EP_POLL, %d)\n", + current, ep, dvp.ep_timeout)); + + res = ep_poll(ep, &dvp); + + DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ioctl(%p, EP_POLL, %d) == %d\n", + current, ep, dvp.ep_timeout, res)); + + if (res > 0 && copy_to_user((void *) arg, &dvp, sizeof(struct evpoll))) + res = -EFAULT; + + return res; + + case EP_ISPOLLED: + if (copy_from_user(&pfd, (void *) arg, sizeof(struct pollfd))) + return 0; + + read_lock_irqsave(&ep->lock, flags); + + res = 0; 
+ if (!(dpi = ep_find_nl(ep, pfd.fd))) + goto is_not_polled; + + pfd = dpi->pfd; + res = 1; + + is_not_polled: + read_unlock_irqrestore(&ep->lock, flags); + + if (res) + copy_to_user((void *) arg, &pfd, sizeof(struct pollfd)); /* NOTE(review): the copy_to_user() result is ignored, so a failed copy still returns 1 */ + + DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ioctl(%p, EP_ISPOLLED, %d) == %d\n", + current, ep, pfd.fd, res)); + return res; + } + + return -EINVAL; +} + +/* + * Increments "ep->mmapped" for the eventpoll attached to the file behind + * "vma", if the file has one. + */ +static void eventpoll_mm_open(struct vm_area_struct * vma) +{ + struct file *file = vma->vm_file; + struct eventpoll *ep = file->private_data; + + if (ep) atomic_inc(&ep->mmapped); + + DNPRINTK(3, (KERN_INFO "[%p] eventpoll: mm_open(%p)\n", current, ep)); +} + +/* + * Decrements "ep->mmapped"; when the counter reaches zero the recorded + * mapping base "ep->vmabase" is cleared. + */ +static void eventpoll_mm_close(struct vm_area_struct * vma) +{ + struct file *file = vma->vm_file; + struct eventpoll *ep = file->private_data; + + if (ep && atomic_dec_and_test(&ep->mmapped)) + ep->vmabase = 0; + + DNPRINTK(3, (KERN_INFO "[%p] eventpoll: mm_close(%p)\n", current, ep)); +} + +/* + * mmap(2) handler of the eventpoll file. The mapping must not be writable + * (-EACCES), must start at file offset zero and must span exactly + * 2 * ep->numpages pages (-EINVAL otherwise). + */ +static int mmap_eventpoll(struct file *file, struct vm_area_struct *vma) +{ + struct eventpoll *ep = file->private_data; + unsigned long start; + int ii, res, numpages; + size_t mapsize; + + DNPRINTK(3, (KERN_INFO "[%p] eventpoll: mmap(%p, %lx, %lx)\n", + current, ep, vma->vm_start, vma->vm_pgoff << PAGE_SHIFT)); + + /* + * We need the eventpoll file to be RW but we don't want it to be + * mapped RW. This check rejects writable mappings. + */ + if (vma->vm_flags & VM_WRITE) + return -EACCES; + + if ((vma->vm_pgoff << PAGE_SHIFT) != 0) + return -EINVAL; + + /* + * We need to verify that the mapped area covers all the allocated + * double buffer. + */ + mapsize = PAGE_ALIGN(vma->vm_end - vma->vm_start); + numpages = mapsize >> PAGE_SHIFT; + + res = -EINVAL; + if (numpages != (2 * ep->numpages)) + goto eexit_1; + + /* + * Map the double buffer starting from "vma->vm_start" up to + * "vma->vm_start + ep->numpages * PAGE_SIZE".
+ */ + start = vma->vm_start; /* pages0[] is mapped first, pages1[] immediately after it: 2 * ep->numpages pages in total */ + for (ii = 0; ii < ep->numpages; ii++) { + if ((res = remap_page_range(vma, start, __pa(ep->pages0[ii]), + PAGE_SIZE, vma->vm_page_prot))) + goto eexit_1; + start += PAGE_SIZE; + } + for (ii = 0; ii < ep->numpages; ii++) { + if ((res = remap_page_range(vma, start, __pa(ep->pages1[ii]), + PAGE_SIZE, vma->vm_page_prot))) + goto eexit_1; + start += PAGE_SIZE; + } + vma->vm_ops = &eventpoll_mmap_ops; + + /* Saves the base mapping address for later use in sys_epoll_wait(2) */ + ep->vmabase = vma->vm_start; + + /* + * Ok, mapping has been done. We can open the door to functions that + * require the mapping to be in place. + */ + atomic_set(&ep->mmapped, 1); + + res = 0; +eexit_1: + + DNPRINTK(3, (KERN_INFO "[%p] eventpoll: mmap(%p, %lx, %lx) == %d\n", + current, ep, vma->vm_start, vma->vm_pgoff << PAGE_SHIFT, res)); + return res; +} + +/* + * Always returns 1, whatever "dentry" is. + * NOTE(review): presumably installed as the dentry delete hook of the + * eventpoll file system so that unused dentries are discarded - confirm + * where this function is referenced. + */ +static int eventpollfs_delete_dentry(struct dentry *dentry) +{ + + return 1; +} + +/* + * Allocates an inode on the internal eventpoll mount, attaches + * "eventpoll_fops" and fills in state, mode, ownership, timestamps and + * block size. Returns the inode, or ERR_PTR(-ENOMEM) when new_inode() + * fails. + */ +static struct inode *get_eventpoll_inode(void) +{ + int error = -ENOMEM; + struct inode *inode = new_inode(eventpoll_mnt->mnt_sb); + + if (!inode) + goto eexit_1; + + inode->i_fop = &eventpoll_fops; + + /* + * Mark the inode dirty from the very beginning, + * that way it will never be moved to the dirty + * list because "mark_inode_dirty()" will think + * that it already _is_ on the dirty list.
+ */ + inode->i_state = I_DIRTY; + inode->i_mode = S_IRUSR | S_IWUSR; + inode->i_uid = current->fsuid; + inode->i_gid = current->fsgid; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_blksize = PAGE_SIZE; + return inode; + +eexit_1: + return ERR_PTR(error); +} + +/* + * Returns the super block produced by get_sb_pseudo() for the "eventpoll:" + * pseudo file system. "flags", "dev_name" and "data" are not used. + */ +static struct super_block *eventpollfs_get_sb(struct file_system_type *fs_type, + int flags, char *dev_name, void *data) +{ + + return get_sb_pseudo(fs_type, "eventpoll:", NULL, EVENTPOLLFS_MAGIC); +} + +/* + * Initialisation routine (registered through module_init()). Creates the + * "struct epitem" slab cache, registers and mounts the eventpoll file + * system and finally registers the misc device. When a step fails the + * earlier ones are undone and the error code is returned. + */ +static int __init eventpoll_init(void) +{ + int error; + + /* Allocates slab cache used to allocate "struct epitem" items */ + error = -ENOMEM; + dpi_cache = kmem_cache_create("eventpoll", + sizeof(struct epitem), + __alignof__(struct epitem), + DPI_SLAB_DEBUG, NULL, NULL); + if (!dpi_cache) + goto eexit_1; + + /* + * Register the virtual file system that will be the source of inodes + * for the eventpoll files + */ + error = register_filesystem(&eventpoll_fs_type); + if (error) + goto eexit_2; + + /* Mount the virtual file system registered above */ + eventpoll_mnt = kern_mount(&eventpoll_fs_type); + error = PTR_ERR(eventpoll_mnt); + if (IS_ERR(eventpoll_mnt)) + goto eexit_3; + + /* + * This is to maintain compatibility with the old /dev/epoll interface. + * We need to register a misc device so that the caller can open(2) it + * through a file inside /dev.
+ */ + error = misc_register(&eventpoll_miscdev); + if (error) + goto eexit_4; + + printk(KERN_INFO "[%p] eventpoll: driver installed.\n", current); + + return error; + /* error unwinding: each label undoes one earlier set-up step, latest first */ +eexit_4: + mntput(eventpoll_mnt); +eexit_3: + unregister_filesystem(&eventpoll_fs_type); +eexit_2: + kmem_cache_destroy(dpi_cache); +eexit_1: + + return error; +} + +static void __exit eventpoll_exit(void) +{ + /* + * Undo all operations done inside eventpoll_init(). + * NOTE(review): the calls below are not in reverse order of the set-up + * (the misc device is registered last there but removed third here, and + * the file system is unregistered before its mount is released) - + * confirm this ordering is intended. + */ + unregister_filesystem(&eventpoll_fs_type); + mntput(eventpoll_mnt); + misc_deregister(&eventpoll_miscdev); + kmem_cache_destroy(dpi_cache); +} + +module_init(eventpoll_init); +module_exit(eventpoll_exit); + +MODULE_LICENSE("GPL"); + diff -Nru linux-2.5.44.vanilla/fs/Makefile linux-2.5.44.epoll/fs/Makefile --- linux-2.5.44.vanilla/fs/Makefile Fri Oct 18 21:01:57 2002 +++ linux-2.5.44.epoll/fs/Makefile Sat Oct 19 12:05:48 2002 @@ -6,14 +6,14 @@ # export-objs := open.o dcache.o buffer.o bio.o inode.o dquot.o mpage.o aio.o \ - fcntl.o read_write.o dcookies.o + fcntl.o read_write.o dcookies.o fcblist.o obj-y := open.o read_write.o devices.o file_table.o buffer.o \ bio.o super.o block_dev.o char_dev.o stat.o exec.o pipe.o \ namei.o fcntl.o ioctl.o readdir.o select.o fifo.o locks.o \ dcache.o inode.o attr.o bad_inode.o file.o dnotify.o \ filesystems.o namespace.o seq_file.o xattr.o libfs.o \ - fs-writeback.o mpage.o direct-io.o aio.o + fs-writeback.o mpage.o direct-io.o aio.o fcblist.o ifneq ($(CONFIG_NFSD),n) ifneq ($(CONFIG_NFSD),) diff -Nru linux-2.5.44.vanilla/fs/fcblist.c linux-2.5.44.epoll/fs/fcblist.c --- linux-2.5.44.vanilla/fs/fcblist.c Wed Dec 31 16:00:00 1969 +++ linux-2.5.44.epoll/fs/fcblist.c Tue Oct 29 12:28:11 2002 @@ -0,0 +1,146 @@ +/* + * linux/fs/fcblist.c ( File event callbacks handling ) + * Copyright (C) 2001,...,2002 Davide Libenzi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either
version 2 of the License, or + * (at your option) any later version. + * + * Davide Libenzi davidel@xmailserver.org + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Translation tables with NSIGPOLL slots each; the POLL_* code a slot + * stands for is named beside it. ion_band_table holds the ION_* callback + * event (0 for POLL_PRI), poll_band_table the matching poll event mask. + * sk_wake_async() indexes both with (band - POLL_IN). + */ +long ion_band_table[NSIGPOLL] = { + ION_IN, /* POLL_IN */ + ION_OUT, /* POLL_OUT */ + ION_IN, /* POLL_MSG */ + ION_ERR, /* POLL_ERR */ + 0, /* POLL_PRI */ + ION_HUP /* POLL_HUP */ +}; + +long poll_band_table[NSIGPOLL] = { + POLLIN | POLLRDNORM, /* POLL_IN */ + POLLOUT | POLLWRNORM | POLLWRBAND, /* POLL_OUT */ + POLLIN | POLLRDNORM | POLLMSG, /* POLL_MSG */ + POLLERR, /* POLL_ERR */ + POLLPRI | POLLRDBAND, /* POLL_PRI */ + POLLHUP | POLLERR /* POLL_HUP */ +}; + + + +/* + * Walk through the file callback list by calling each registered callback + * with the event that happened on the "filep" file. Callbacks are called + * by holding a read lock on the callback list lock, and also by keeping + * local IRQs disabled. + * + * "event" is handed to every callback unchanged, together with the "data" + * pointer and the "local" array stored in the callback entry. + */ +void file_notify_event(struct file *filep, long *event) +{ + unsigned long flags; + struct list_head *lnk, *lsthead; + + read_lock_irqsave(&filep->f_cblock, flags); + + lsthead = &filep->f_cblist; + list_for_each(lnk, lsthead) { + struct fcb_struct *fcbp = list_entry(lnk, struct fcb_struct, llink); + + fcbp->cbproc(filep, fcbp->data, fcbp->local, event); + } + + read_unlock_irqrestore(&filep->f_cblock, flags); +} + + +/* + * Add a new callback to the list of file callbacks.
+ */ +int file_notify_addcb(struct file *filep, + void (*cbproc)(struct file *, void *, unsigned long *, long *), + void *data) +{ + unsigned long flags; + struct fcb_struct *fcbp; + + if (!(fcbp = (struct fcb_struct *) kmalloc(sizeof(struct fcb_struct), GFP_KERNEL))) + return -ENOMEM; /* nothing was added to the list */ + + memset(fcbp, 0, sizeof(struct fcb_struct)); /* also zeroes the "local" words of the entry */ + fcbp->cbproc = cbproc; + fcbp->data = data; + + write_lock_irqsave(&filep->f_cblock, flags); + list_add_tail(&fcbp->llink, &filep->f_cblist); /* new entries go to the tail of the list */ + write_unlock_irqrestore(&filep->f_cblock, flags); + + return 0; +} + + +/* + * Removes the callback "cbproc" from the file callback list. Only the first + * entry whose function pointer equals "cbproc" is unlinked and freed; the + * "data" value it was registered with is not compared. Returns 0 when an + * entry was removed, -ENOENT when none matched. + */ +int file_notify_delcb(struct file *filep, + void (*cbproc)(struct file *, void *, unsigned long *, long *)) +{ + unsigned long flags; + struct list_head *lnk, *lsthead; + + write_lock_irqsave(&filep->f_cblock, flags); + + lsthead = &filep->f_cblist; + list_for_each(lnk, lsthead) { + struct fcb_struct *fcbp = list_entry(lnk, struct fcb_struct, llink); + + if (fcbp->cbproc == cbproc) { + list_del(lnk); + write_unlock_irqrestore(&filep->f_cblock, flags); + kfree(fcbp); /* freed only after the list lock has been released */ + return 0; + } + } + + write_unlock_irqrestore(&filep->f_cblock, flags); + + return -ENOENT; +} + + +/* + * It is called at file cleanup time and removes all the registered callbacks.
+ */ +void file_notify_cleanup(struct file *filep) +{ + unsigned long flags; + struct list_head *lnk, *lsthead; + + write_lock_irqsave(&filep->f_cblock, flags); + + lsthead = &filep->f_cblist; + while ((lnk = list_first(lsthead))) { /* list_first() yields a NULL pointer once the list is empty */ + struct fcb_struct *fcbp = list_entry(lnk, struct fcb_struct, llink); + + list_del(lnk); + write_unlock_irqrestore(&filep->f_cblock, flags); /* the lock is dropped around kfree() and re-taken before the list is looked at again */ + kfree(fcbp); + write_lock_irqsave(&filep->f_cblock, flags); + } + + write_unlock_irqrestore(&filep->f_cblock, flags); +} + diff -Nru linux-2.5.44.vanilla/fs/file_table.c linux-2.5.44.epoll/fs/file_table.c --- linux-2.5.44.vanilla/fs/file_table.c Fri Oct 18 21:01:08 2002 +++ linux-2.5.44.epoll/fs/file_table.c Sat Oct 19 12:01:33 2002 @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -58,6 +59,7 @@ f->f_gid = current->fsgid; f->f_owner.lock = RW_LOCK_UNLOCKED; list_add(&f->f_list, &anon_list); + file_notify_init(f); file_list_unlock(); return f; } @@ -102,6 +104,7 @@ filp->f_uid = current->fsuid; filp->f_gid = current->fsgid; filp->f_op = dentry->d_inode->i_fop; + file_notify_init(filp); if (filp->f_op->open) return filp->f_op->open(dentry->d_inode, filp); else @@ -123,6 +126,7 @@ struct vfsmount * mnt = file->f_vfsmnt; struct inode * inode = dentry->d_inode; + file_notify_cleanup(file); locks_remove_flock(file); if (file->f_op && file->f_op->release) diff -Nru linux-2.5.44.vanilla/fs/pipe.c linux-2.5.44.epoll/fs/pipe.c --- linux-2.5.44.vanilla/fs/pipe.c Fri Oct 18 21:01:56 2002 +++ linux-2.5.44.epoll/fs/pipe.c Sat Oct 19 12:32:34 2002 @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -47,7 +48,7 @@ pipe_read(struct file *filp, char *buf, size_t count, loff_t *ppos) { struct inode *inode = filp->f_dentry->d_inode; - int do_wakeup; + int do_wakeup, pfull; ssize_t ret; /* pread is not allowed on pipes.
*/ @@ -63,6 +64,7 @@ down(PIPE_SEM(*inode)); for (;;) { int size = PIPE_LEN(*inode); + pfull = PIPE_FULL(*inode); if (size) { char *pipebuf = PIPE_BASE(*inode) + PIPE_START(*inode); ssize_t chars = PIPE_MAX_RCHUNK(*inode); @@ -108,12 +110,18 @@ if (!ret) ret = -ERESTARTSYS; break; } + /* Send notification message */ + if (pfull && !PIPE_FULL(*inode) && PIPE_WRITEFILE(*inode)) + file_send_notify(PIPE_WRITEFILE(*inode), ION_OUT, POLLOUT | POLLWRNORM | POLLWRBAND); if (do_wakeup) { wake_up_interruptible(PIPE_WAIT(*inode)); kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT); } pipe_wait(inode); } + /* Send notification message */ + if (pfull && !PIPE_FULL(*inode) && PIPE_WRITEFILE(*inode)) + file_send_notify(PIPE_WRITEFILE(*inode), ION_OUT, POLLOUT | POLLWRNORM | POLLWRBAND); up(PIPE_SEM(*inode)); /* Signal writers asynchronously that there is more room. */ if (do_wakeup) { @@ -131,7 +139,7 @@ struct inode *inode = filp->f_dentry->d_inode; ssize_t ret; size_t min; - int do_wakeup; + int do_wakeup, pempty; /* pwrite is not allowed on pipes. 
*/ if (unlikely(ppos != &filp->f_pos)) @@ -149,6 +157,7 @@ down(PIPE_SEM(*inode)); for (;;) { int free; + pempty = PIPE_EMPTY(*inode); if (!PIPE_READERS(*inode)) { send_sig(SIGPIPE, current, 0); if (!ret) ret = -EPIPE; @@ -194,6 +203,9 @@ if (!ret) ret = -ERESTARTSYS; break; } + /* Send notification message */ + if (pempty && !PIPE_EMPTY(*inode) && PIPE_READFILE(*inode)) + file_send_notify(PIPE_READFILE(*inode), ION_IN, POLLIN | POLLRDNORM); if (do_wakeup) { wake_up_interruptible_sync(PIPE_WAIT(*inode)); kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN); @@ -203,6 +215,9 @@ pipe_wait(inode); PIPE_WAITING_WRITERS(*inode)--; } + /* Send notification message */ + if (pempty && !PIPE_EMPTY(*inode) && PIPE_READFILE(*inode)) + file_send_notify(PIPE_READFILE(*inode), ION_IN, POLLIN | POLLRDNORM); up(PIPE_SEM(*inode)); if (do_wakeup) { wake_up_interruptible(PIPE_WAIT(*inode)); @@ -266,9 +281,22 @@ static int pipe_release(struct inode *inode, int decr, int decw) { + struct file *rdfile, *wrfile; down(PIPE_SEM(*inode)); PIPE_READERS(*inode) -= decr; PIPE_WRITERS(*inode) -= decw; + rdfile = PIPE_READFILE(*inode); + wrfile = PIPE_WRITEFILE(*inode); + if (decr && !PIPE_READERS(*inode)) { + PIPE_READFILE(*inode) = NULL; + if (wrfile) + file_send_notify(wrfile, ION_HUP, POLLHUP); + } + if (decw && !PIPE_WRITERS(*inode)) { + PIPE_WRITEFILE(*inode) = NULL; + if (rdfile) + file_send_notify(rdfile, ION_HUP, POLLHUP); + } if (!PIPE_READERS(*inode) && !PIPE_WRITERS(*inode)) { struct pipe_inode_info *info = inode->i_pipe; inode->i_pipe = NULL; @@ -488,6 +516,7 @@ PIPE_READERS(*inode) = PIPE_WRITERS(*inode) = 0; PIPE_WAITING_WRITERS(*inode) = 0; PIPE_RCOUNTER(*inode) = PIPE_WCOUNTER(*inode) = 1; + PIPE_READFILE(*inode) = PIPE_WRITEFILE(*inode) = NULL; *PIPE_FASYNC_READERS(*inode) = *PIPE_FASYNC_WRITERS(*inode) = NULL; return inode; @@ -595,6 +624,9 @@ f2->f_op = &write_pipe_fops; f2->f_mode = 2; f2->f_version = 0; + + PIPE_READFILE(*inode) = f1; + PIPE_WRITEFILE(*inode) = f2; 
fd_install(i, f1); fd_install(j, f2); diff -Nru linux-2.5.44.vanilla/include/asm-i386/poll.h linux-2.5.44.epoll/include/asm-i386/poll.h --- linux-2.5.44.vanilla/include/asm-i386/poll.h Fri Oct 18 21:01:52 2002 +++ linux-2.5.44.epoll/include/asm-i386/poll.h Sat Oct 19 12:01:33 2002 @@ -15,6 +15,7 @@ #define POLLWRNORM 0x0100 #define POLLWRBAND 0x0200 #define POLLMSG 0x0400 +#define POLLREMOVE 0x1000 struct pollfd { int fd; diff -Nru linux-2.5.44.vanilla/include/asm-i386/unistd.h linux-2.5.44.epoll/include/asm-i386/unistd.h --- linux-2.5.44.vanilla/include/asm-i386/unistd.h Fri Oct 18 21:02:00 2002 +++ linux-2.5.44.epoll/include/asm-i386/unistd.h Sat Oct 19 20:23:33 2002 @@ -258,6 +258,9 @@ #define __NR_free_hugepages 251 #define __NR_exit_group 252 #define __NR_lookup_dcookie 253 +#define __NR_sys_epoll_create 254 +#define __NR_sys_epoll_ctl 255 +#define __NR_sys_epoll_wait 256 /* user-visible error numbers are in the range -1 - -124: see */ diff -Nru linux-2.5.44.vanilla/include/linux/eventpoll.h linux-2.5.44.epoll/include/linux/eventpoll.h --- linux-2.5.44.vanilla/include/linux/eventpoll.h Wed Dec 31 16:00:00 1969 +++ linux-2.5.44.epoll/include/linux/eventpoll.h Tue Oct 29 10:05:47 2002 @@ -0,0 +1,51 @@ +/* + * include/linux/eventpoll.h ( Efficent event polling implementation ) + * Copyright (C) 2001,...,2002 Davide Libenzi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * Davide Libenzi davidel@xmailserver.org + * + */ + +#ifndef _LINUX_EVENTPOLL_H +#define _LINUX_EVENTPOLL_H + +/* + * Sizing helpers, all expressed in "struct pollfd" slots: + * POLLFD_X_PAGE slots that fit in one page + * EVENT_PAGE_INDEX(n) page that holds slot n + * EVENT_PAGE_REM(n) position of slot n inside its page + * EVENT_PAGE_OFFSET(n) the same position as a byte offset + * EP_FDS_PAGES(n) pages needed for n slots, rounded up + * EP_MAP_SIZE(n) bytes covered by two buffers of EP_FDS_PAGES(n) pages + */ +#define EVENTPOLL_MINOR 124 +#define POLLFD_X_PAGE (PAGE_SIZE / sizeof(struct pollfd)) +#define MAX_FDS_IN_EVENTPOLL (1024 * 128) +#define MAX_EVENTPOLL_PAGES (MAX_FDS_IN_EVENTPOLL / POLLFD_X_PAGE) +#define EVENT_PAGE_INDEX(n) ((n) / POLLFD_X_PAGE) +#define EVENT_PAGE_REM(n) ((n) % POLLFD_X_PAGE) +#define EVENT_PAGE_OFFSET(n) (((n) % POLLFD_X_PAGE) * sizeof(struct pollfd)) +#define EP_FDS_PAGES(n) (((n) + POLLFD_X_PAGE - 1) / POLLFD_X_PAGE) +#define EP_MAP_SIZE(n) (EP_FDS_PAGES(n) * PAGE_SIZE * 2) + +/* + * Argument block of the EP_POLL ioctl, copied in from user space and copied + * back when the poll returns a positive result. + * NOTE(review): "ep_timeout" is presumably the wait timeout and "ep_resoff" + * the offset of the results inside the mapping - confirm in ep_poll(). + */ +struct evpoll { + int ep_timeout; + unsigned long ep_resoff; +}; + +#define EP_ALLOC _IOR('P', 1, int) +#define EP_POLL _IOWR('P', 2, struct evpoll) +#define EP_FREE _IO('P', 3) +#define EP_ISPOLLED _IOWR('P', 4, struct pollfd) + +#define EP_CTL_ADD 1 +#define EP_CTL_DEL 2 +#define EP_CTL_MOD 3 + + +asmlinkage int sys_epoll_create(int maxfds); +asmlinkage int sys_epoll_ctl(int epfd, int op, int fd, unsigned int events); +asmlinkage int sys_epoll_wait(int epfd, struct pollfd const **events, int timeout); + + + +#endif + diff -Nru linux-2.5.44.vanilla/include/linux/fcblist.h linux-2.5.44.epoll/include/linux/fcblist.h --- linux-2.5.44.vanilla/include/linux/fcblist.h Wed Dec 31 16:00:00 1969 +++ linux-2.5.44.epoll/include/linux/fcblist.h Tue Oct 29 12:28:52 2002 @@ -0,0 +1,71 @@ +/* + * include/linux/fcblist.h ( File event callbacks handling ) + * Copyright (C) 2001,...,2002 Davide Libenzi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version.
+ * + * Davide Libenzi davidel@xmailserver.org + * + */ + +#ifndef __LINUX_FCBLIST_H +#define __LINUX_FCBLIST_H + +#include +#include +#include +#include +#include + + + +/* file callback notification events */ +#define ION_IN 1 +#define ION_OUT 2 +#define ION_HUP 3 +#define ION_ERR 4 + +#define FCB_LOCAL_SIZE 4 + +/* + * One registered file callback: + * llink - linkage into the "f_cblist" list of the file + * cbproc - function called by file_notify_event() with the file, "data", + * "local" and the event array + * data - opaque pointer given at registration time + * local - FCB_LOCAL_SIZE words of storage owned by this entry and passed + * to the callback on every call + */ +struct fcb_struct { + struct list_head llink; + void (*cbproc)(struct file *, void *, unsigned long *, long *); + void *data; + unsigned long local[FCB_LOCAL_SIZE]; +}; + + +extern long ion_band_table[]; +extern long poll_band_table[]; + + +void file_notify_event(struct file *filep, long *event); + +int file_notify_addcb(struct file *filep, + void (*cbproc)(struct file *, void *, unsigned long *, long *), + void *data); + +int file_notify_delcb(struct file *filep, + void (*cbproc)(struct file *, void *, unsigned long *, long *)); + +void file_notify_cleanup(struct file *filep); + +/* Initialises the callback lock and the (empty) callback list of "filep". */ +static inline void file_notify_init(struct file *filep) +{ + rwlock_init(&filep->f_cblock); + INIT_LIST_HEAD(&filep->f_cblist); +} + +static inline void file_send_notify(struct file *filep, long ioevt, long plevt) +{ + long event[] = { ioevt, plevt, -1 }; /* ION_* code, poll event mask, then -1 as the last element */ + + file_notify_event(filep, event); /* the array lives on this stack frame, so it is only valid during the call */ +} + +#endif diff -Nru linux-2.5.44.vanilla/include/linux/fs.h linux-2.5.44.epoll/include/linux/fs.h --- linux-2.5.44.vanilla/include/linux/fs.h Fri Oct 18 21:01:18 2002 +++ linux-2.5.44.epoll/include/linux/fs.h Sat Oct 19 12:01:33 2002 @@ -506,6 +506,10 @@ /* needed for tty driver, and maybe others */ void *private_data; + + /* file callback list */ + rwlock_t f_cblock; + struct list_head f_cblist; }; extern spinlock_t files_lock; #define file_list_lock() spin_lock(&files_lock); diff -Nru linux-2.5.44.vanilla/include/linux/list.h linux-2.5.44.epoll/include/linux/list.h --- linux-2.5.44.vanilla/include/linux/list.h Fri Oct 18 21:01:07 2002 +++ linux-2.5.44.epoll/include/linux/list.h Sat Oct 19 12:01:33 2002 @@ -319,6 +319,11 @@ for (pos = (head)->next, n = pos->next; pos != (head); \
pos = n, ({ read_barrier_depends(); 0;}), n = pos->next) +/* + * list_first/list_last give the first/last entry of the list at "head"; + * list_next/list_prev give the entry after/before "pos". Each one yields a + * NULL list_head pointer instead of "head" itself, i.e. when the list is + * empty or the walk has reached its end. Arguments are evaluated more than + * once, so they must be free of side effects. + */ +#define list_first(head) (((head)->next != (head)) ? (head)->next: (struct list_head *) 0) +#define list_last(head) (((head)->prev != (head)) ? (head)->prev: (struct list_head *) 0) +#define list_next(pos, head) (((pos)->next != (head)) ? (pos)->next: (struct list_head *) 0) +#define list_prev(pos, head) (((pos)->prev != (head)) ? (pos)->prev: (struct list_head *) 0) + #endif /* __KERNEL__ || _LVM_H_INCLUDE */ #endif diff -Nru linux-2.5.44.vanilla/include/linux/pipe_fs_i.h linux-2.5.44.epoll/include/linux/pipe_fs_i.h --- linux-2.5.44.vanilla/include/linux/pipe_fs_i.h Fri Oct 18 21:02:24 2002 +++ linux-2.5.44.epoll/include/linux/pipe_fs_i.h Sat Oct 19 12:01:33 2002 @@ -12,6 +12,8 @@ unsigned int waiting_writers; unsigned int r_counter; unsigned int w_counter; + struct file *rdfile; + struct file *wrfile; struct fasync_struct *fasync_readers; struct fasync_struct *fasync_writers; }; @@ -30,6 +32,8 @@ #define PIPE_WAITING_WRITERS(inode) ((inode).i_pipe->waiting_writers) #define PIPE_RCOUNTER(inode) ((inode).i_pipe->r_counter) #define PIPE_WCOUNTER(inode) ((inode).i_pipe->w_counter) +#define PIPE_READFILE(inode) ((inode).i_pipe->rdfile) +#define PIPE_WRITEFILE(inode) ((inode).i_pipe->wrfile) #define PIPE_FASYNC_READERS(inode) (&((inode).i_pipe->fasync_readers)) #define PIPE_FASYNC_WRITERS(inode) (&((inode).i_pipe->fasync_writers)) diff -Nru linux-2.5.44.vanilla/include/linux/sys.h linux-2.5.44.epoll/include/linux/sys.h --- linux-2.5.44.vanilla/include/linux/sys.h Fri Oct 18 21:01:49 2002 +++ linux-2.5.44.epoll/include/linux/sys.h Sun Oct 20 15:13:06 2002 @@ -4,7 +4,7 @@ /* * system call entry points ...
but not all are defined */ -#define NR_syscalls 256 +#define NR_syscalls 260 /* * These are system calls that will be removed at some time diff -Nru linux-2.5.44.vanilla/include/net/sock.h linux-2.5.44.epoll/include/net/sock.h --- linux-2.5.44.vanilla/include/net/sock.h Fri Oct 18 21:02:27 2002 +++ linux-2.5.44.epoll/include/net/sock.h Tue Oct 29 15:42:50 2002 @@ -52,6 +52,9 @@ #include #include #include +#include +#include +#include /* * This structure really needs to be cleaned up. @@ -766,8 +769,13 @@ static inline void sk_wake_async(struct sock *sk, int how, int band) { - if (sk->socket && sk->socket->fasync_list) - sock_wake_async(sk->socket, how, band); + if (sk->socket) { + if (sk->socket->file) + file_send_notify(sk->socket->file, ion_band_table[band - POLL_IN], + poll_band_table[band - POLL_IN]); + if (sk->socket->fasync_list) + sock_wake_async(sk->socket, how, band); + } } #define SOCK_MIN_SNDBUF 2048 diff -Nru linux-2.5.44.vanilla/net/ipv4/tcp.c linux-2.5.44.epoll/net/ipv4/tcp.c --- linux-2.5.44.vanilla/net/ipv4/tcp.c Fri Oct 18 21:01:19 2002 +++ linux-2.5.44.epoll/net/ipv4/tcp.c Sat Oct 19 12:01:33 2002 @@ -476,8 +476,8 @@ if (sk->sleep && waitqueue_active(sk->sleep)) wake_up_interruptible(sk->sleep); - if (sock->fasync_list && !(sk->shutdown & SEND_SHUTDOWN)) - sock_wake_async(sock, 2, POLL_OUT); + if (!(sk->shutdown & SEND_SHUTDOWN)) + sk_wake_async(sk, 2, POLL_OUT); } }/davidel@xmailserver.org/davidel@xmailserver.org/davidel@xmailserver.org/davidel@xmailserver.org