From 569ca5b3f94cd0b3295ec5943aa457cf4a4f6a3a Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 5 Apr 2012 16:10:07 +0100 Subject: xen/gnttab: add deferred freeing logic Rather than just leaking pages that can't be freed at the point where access permission for the backend domain gets revoked, put them on a list and run a timer to (infrequently) retry freeing them. (This can particularly happen when unloading a frontend driver when devices are still present, and the backend still has them in non-closed state or hasn't finished closing them yet.) Signed-off-by: Jan Beulich Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/grant-table.c | 106 +++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 96 insertions(+), 10 deletions(-) (limited to 'drivers/xen') diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index b4d4eac761db..9f514bb561af 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -426,10 +426,8 @@ static int gnttab_end_foreign_access_ref_v1(grant_ref_t ref, int readonly) nflags = *pflags; do { flags = nflags; - if (flags & (GTF_reading|GTF_writing)) { - printk(KERN_ALERT "WARNING: g.e. still in use!\n"); + if (flags & (GTF_reading|GTF_writing)) return 0; - } } while ((nflags = sync_cmpxchg(pflags, flags, 0)) != flags); return 1; @@ -458,12 +456,103 @@ static int gnttab_end_foreign_access_ref_v2(grant_ref_t ref, int readonly) return 1; } -int gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly) +static inline int _gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly) { return gnttab_interface->end_foreign_access_ref(ref, readonly); } + +int gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly) +{ + if (_gnttab_end_foreign_access_ref(ref, readonly)) + return 1; + pr_warn("WARNING: g.e. %#x still in use!\n", ref); + return 0; +} EXPORT_SYMBOL_GPL(gnttab_end_foreign_access_ref); +struct deferred_entry { + struct list_head list; + grant_ref_t ref; + bool ro; + uint16_t warn_delay; + struct page *page; +}; +static LIST_HEAD(deferred_list); +static void gnttab_handle_deferred(unsigned long); +static DEFINE_TIMER(deferred_timer, gnttab_handle_deferred, 0, 0); + +static void gnttab_handle_deferred(unsigned long unused) +{ + unsigned int nr = 10; + struct deferred_entry *first = NULL; + unsigned long flags; + + spin_lock_irqsave(&gnttab_list_lock, flags); + while (nr--) { + struct deferred_entry *entry + = list_first_entry(&deferred_list, + struct deferred_entry, list); + + if (entry == first) + break; + list_del(&entry->list); + spin_unlock_irqrestore(&gnttab_list_lock, flags); + if (_gnttab_end_foreign_access_ref(entry->ref, entry->ro)) { + put_free_entry(entry->ref); + if (entry->page) { + pr_debug("freeing g.e. %#x (pfn %#lx)\n", + entry->ref, page_to_pfn(entry->page)); + __free_page(entry->page); + } else + pr_info("freeing g.e. %#x\n", entry->ref); + kfree(entry); + entry = NULL; + } else { + if (!--entry->warn_delay) + pr_info("g.e. %#x still pending\n", + entry->ref); + if (!first) + first = entry; + } + spin_lock_irqsave(&gnttab_list_lock, flags); + if (entry) + list_add_tail(&entry->list, &deferred_list); + else if (list_empty(&deferred_list)) + break; + } + if (!list_empty(&deferred_list) && !timer_pending(&deferred_timer)) { + deferred_timer.expires = jiffies + HZ; + add_timer(&deferred_timer); + } + spin_unlock_irqrestore(&gnttab_list_lock, flags); +} + +static void gnttab_add_deferred(grant_ref_t ref, bool readonly, + struct page *page) +{ + struct deferred_entry *entry = kmalloc(sizeof(*entry), GFP_ATOMIC); + const char *what = KERN_WARNING "leaking"; + + if (entry) { + unsigned long flags; + + entry->ref = ref; + entry->ro = readonly; + entry->page = page; + entry->warn_delay = 60; + spin_lock_irqsave(&gnttab_list_lock, flags); + list_add_tail(&entry->list, &deferred_list); + if (!timer_pending(&deferred_timer)) { + deferred_timer.expires = jiffies + HZ; + add_timer(&deferred_timer); + } + spin_unlock_irqrestore(&gnttab_list_lock, flags); + what = KERN_DEBUG "deferring"; + } + printk("%s g.e. %#x (pfn %#lx)\n", + what, ref, page ? page_to_pfn(page) : -1); +} + void gnttab_end_foreign_access(grant_ref_t ref, int readonly, unsigned long page) { @@ -471,12 +560,9 @@ void gnttab_end_foreign_access(grant_ref_t ref, int readonly, put_free_entry(ref); if (page != 0) free_page(page); - } else { - /* XXX This needs to be fixed so that the ref and page are - placed on a list to be freed up later. */ - printk(KERN_WARNING - "WARNING: leaking g.e. and page still in use!\n"); - } + } else + gnttab_add_deferred(ref, readonly, + page ? virt_to_page(page) : NULL); } EXPORT_SYMBOL_GPL(gnttab_end_foreign_access); -- cgit v1.2.3 From 211063dc159695bd6072c5393e9bc729481c6ede Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 8 Dec 2011 17:32:23 +0800 Subject: xen/acpi/sleep: Enable ACPI sleep via the __acpi_os_prepare_sleep Provide the registration callback to call in the Xen's ACPI sleep functionality. This means that during S3/S5 we make a hypercall XENPF_enter_acpi_sleep with the proper PM1A/PM1B registers. Based of Ke Yu's initial idea. [ From http://xenbits.xensource.com/linux-2.6.18-xen.hg change c68699484a65 ] [v1: Added Copyright and license] [v2: Added check if PM1A/B the 16-bits MSB contain something. The spec only uses 16-bits but might have more in future] Signed-off-by: Liang Tang Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/Makefile | 2 +- drivers/xen/acpi.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+), 1 deletion(-) create mode 100644 drivers/xen/acpi.c (limited to 'drivers/xen') diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile index 9adc5be57b13..fc3488631136 100644 --- a/drivers/xen/Makefile +++ b/drivers/xen/Makefile @@ -17,7 +17,7 @@ obj-$(CONFIG_XEN_SYS_HYPERVISOR) += sys-hypervisor.o obj-$(CONFIG_XEN_PVHVM) += platform-pci.o obj-$(CONFIG_XEN_TMEM) += tmem.o obj-$(CONFIG_SWIOTLB_XEN) += swiotlb-xen.o -obj-$(CONFIG_XEN_DOM0) += pci.o +obj-$(CONFIG_XEN_DOM0) += pci.o acpi.o obj-$(CONFIG_XEN_PCIDEV_BACKEND) += xen-pciback/ obj-$(CONFIG_XEN_PRIVCMD) += xen-privcmd.o obj-$(CONFIG_XEN_ACPI_PROCESSOR) += xen-acpi-processor.o diff --git a/drivers/xen/acpi.c b/drivers/xen/acpi.c new file mode 100644 index 000000000000..119d42a2bf57 --- /dev/null +++ b/drivers/xen/acpi.c @@ -0,0 +1,62 @@ +/****************************************************************************** + * acpi.c + * acpi file for domain 0 kernel + * + * Copyright (c) 2011 Konrad Rzeszutek Wilk + * Copyright (c) 2011 Yu Ke ke.yu@intel.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include +#include +#include +#include + +int xen_acpi_notify_hypervisor_state(u8 sleep_state, + u32 pm1a_cnt, u32 pm1b_cnt) +{ + struct xen_platform_op op = { + .cmd = XENPF_enter_acpi_sleep, + .interface_version = XENPF_INTERFACE_VERSION, + .u = { + .enter_acpi_sleep = { + .pm1a_cnt_val = (u16)pm1a_cnt, + .pm1b_cnt_val = (u16)pm1b_cnt, + .sleep_state = sleep_state, + }, + }, + }; + + if ((pm1a_cnt & 0xffff0000) || (pm1b_cnt & 0xffff0000)) { + WARN(1, "Using more than 16bits of PM1A/B 0x%x/0x%x!" + "Email xen-devel@lists.xensource.com Thank you.\n", \ + pm1a_cnt, pm1b_cnt); + return -1; + } + + HYPERVISOR_dom0_op(&op); + return 1; +} -- cgit v1.2.3 From f62805f1f30a40e354bd036b4cb799863a39be4b Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Tue, 24 Apr 2012 11:55:43 +0100 Subject: xen: enter/exit lazy_mmu_mode around m2p_override calls This patch is a significant performance improvement for the m2p_override: about 6% using the gntdev device. Each m2p_add/remove_override call issues a MULTI_grant_table_op and a __flush_tlb_single if kmap_op != NULL. Batching all the calls together is a great performance benefit because it means issuing one hypercall total rather than two hypercall per page. If paravirt_lazy_mode is set PARAVIRT_LAZY_MMU, all these calls are going to be batched together, otherwise they are issued one at a time. Adding arch_enter_lazy_mmu_mode/arch_leave_lazy_mmu_mode around the m2p_add/remove_override calls forces paravirt_lazy_mode to PARAVIRT_LAZY_MMU, therefore makes sure that they are always batched. However it is not safe to call arch_enter_lazy_mmu_mode if we are in interrupt context or if we are already in PARAVIRT_LAZY_MMU mode, so check for both conditions before doing so. Changes in v4: - rebased on 3.4-rc4: all the m2p_override users call gnttab_unmap_refs and gnttab_map_refs; - check whether we are in interrupt context and the lazy_mode we are in before calling arch_enter/leave_lazy_mmu_mode. Changes in v3: - do not call arch_enter/leave_lazy_mmu_mode in xen_blkbk_unmap, that can be called in interrupt context. Signed-off-by: Stefano Stabellini [v5: s/int lazy/bool lazy/] Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/grant-table.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'drivers/xen') diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index 9f514bb561af..487e468f9ded 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include @@ -827,6 +828,7 @@ int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops, struct page **pages, unsigned int count) { int i, ret; + bool lazy = false; pte_t *pte; unsigned long mfn; @@ -837,6 +839,11 @@ int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops, if (xen_feature(XENFEAT_auto_translated_physmap)) return ret; + if (!in_interrupt() && paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) { + arch_enter_lazy_mmu_mode(); + lazy = true; + } + for (i = 0; i < count; i++) { /* Do not add to override if the map failed. */ if (map_ops[i].status) @@ -855,6 +862,9 @@ int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops, return ret; } + if (lazy) + arch_leave_lazy_mmu_mode(); + return ret; } EXPORT_SYMBOL_GPL(gnttab_map_refs); @@ -863,6 +873,7 @@ int gnttab_unmap_refs(struct gnttab_unmap_grant_ref *unmap_ops, struct page **pages, unsigned int count, bool clear_pte) { int i, ret; + bool lazy = false; ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, unmap_ops, count); if (ret) @@ -871,12 +882,20 @@ int gnttab_unmap_refs(struct gnttab_unmap_grant_ref *unmap_ops, if (xen_feature(XENFEAT_auto_translated_physmap)) return ret; + if (!in_interrupt() && paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) { + arch_enter_lazy_mmu_mode(); + lazy = true; + } + for (i = 0; i < count; i++) { ret = m2p_remove_override(pages[i], clear_pte); if (ret) return ret; } + if (lazy) + arch_leave_lazy_mmu_mode(); + return ret; } EXPORT_SYMBOL_GPL(gnttab_unmap_refs); -- cgit v1.2.3 From d2fb4c51c7471a23f0a95526b624c14cec62603d Mon Sep 17 00:00:00 2001 From: Daniel De Graaf Date: Tue, 8 May 2012 09:46:57 -0400 Subject: xenbus: Add support for xenbus backend in stub domain Add an ioctl to the /dev/xen/xenbus_backend device allowing the xenbus backend to be started after the kernel has booted. This allows xenstore to run in a different domain from the dom0. Signed-off-by: Daniel De Graaf Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/xenbus/xenbus_comms.c | 6 ++++ drivers/xen/xenbus/xenbus_comms.h | 1 + drivers/xen/xenbus/xenbus_dev_backend.c | 51 +++++++++++++++++++++++++++++++++ 3 files changed, 58 insertions(+) (limited to 'drivers/xen') diff --git a/drivers/xen/xenbus/xenbus_comms.c b/drivers/xen/xenbus/xenbus_comms.c index 2eff7a6aaa20..52fe7ad07666 100644 --- a/drivers/xen/xenbus/xenbus_comms.c +++ b/drivers/xen/xenbus/xenbus_comms.c @@ -234,3 +234,9 @@ int xb_init_comms(void) return 0; } + +void xb_deinit_comms(void) +{ + unbind_from_irqhandler(xenbus_irq, &xb_waitq); + xenbus_irq = 0; +} diff --git a/drivers/xen/xenbus/xenbus_comms.h b/drivers/xen/xenbus/xenbus_comms.h index 6e42800fa499..c8abd3b8a6c4 100644 --- a/drivers/xen/xenbus/xenbus_comms.h +++ b/drivers/xen/xenbus/xenbus_comms.h @@ -35,6 +35,7 @@ int xs_init(void); int xb_init_comms(void); +void xb_deinit_comms(void); /* Low level routines. */ int xb_write(const void *data, unsigned len); diff --git a/drivers/xen/xenbus/xenbus_dev_backend.c b/drivers/xen/xenbus/xenbus_dev_backend.c index 3d3be78c1093..be738c43104b 100644 --- a/drivers/xen/xenbus/xenbus_dev_backend.c +++ b/drivers/xen/xenbus/xenbus_dev_backend.c @@ -8,7 +8,11 @@ #include #include +#include #include +#include +#include +#include #include "xenbus_comms.h" @@ -22,6 +26,50 @@ static int xenbus_backend_open(struct inode *inode, struct file *filp) return nonseekable_open(inode, filp); } +static long xenbus_alloc(domid_t domid) +{ + struct evtchn_alloc_unbound arg; + int err = -EEXIST; + + xs_suspend(); + + /* If xenstored_ready is nonzero, that means we have already talked to + * xenstore and set up watches. These watches will be restored by + * xs_resume, but that requires communication over the port established + * below that is not visible to anyone until the ioctl returns. + * + * This can be resolved by splitting the ioctl into two parts + * (postponing the resume until xenstored is active) but this is + * unnecessarily complex for the intended use where xenstored is only + * started once - so return -EEXIST if it's already running. + */ + if (xenstored_ready) + goto out_err; + + gnttab_grant_foreign_access_ref(GNTTAB_RESERVED_XENSTORE, domid, + virt_to_mfn(xen_store_interface), 0 /* writable */); + + arg.dom = DOMID_SELF; + arg.remote_dom = domid; + + err = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound, &arg); + if (err) + goto out_err; + + if (xen_store_evtchn > 0) + xb_deinit_comms(); + + xen_store_evtchn = arg.port; + + xs_resume(); + + return arg.port; + + out_err: + xs_suspend_cancel(); + return err; +} + static long xenbus_backend_ioctl(struct file *file, unsigned int cmd, unsigned long data) { if (!capable(CAP_SYS_ADMIN)) @@ -33,6 +81,9 @@ static long xenbus_backend_ioctl(struct file *file, unsigned int cmd, unsigned l return xen_store_evtchn; return -ENODEV; + case IOCTL_XENBUS_BACKEND_SETUP: + return xenbus_alloc(data); + default: return -ENOTTY; } -- cgit v1.2.3 From d79d5959a023fd637e90ed1ff6547ff09d19396b Mon Sep 17 00:00:00 2001 From: Jana Saout Date: Tue, 15 May 2012 12:34:46 +0200 Subject: xen: Add selfballoning memory reservation tunable. Currently, the memory target in the Xen selfballooning driver is mainly driven by the value of "Committed_AS". However, there are cases in which it is desirable to assign additional memory to be available for the kernel, e.g. for local caches (which are not covered by cleancache), e.g. dcache and inode caches. This adds an additional tunable in the selfballooning driver (accessible via sysfs) which allows the user to specify an additional constant amount of memory to be reserved by the selfballoning driver for the local domain. Signed-off-by: Jana Saout Acked-by: Dan Magenheimer Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/xen-selfballoon.c | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) (limited to 'drivers/xen') diff --git a/drivers/xen/xen-selfballoon.c b/drivers/xen/xen-selfballoon.c index 146c94897016..7d041cb6da26 100644 --- a/drivers/xen/xen-selfballoon.c +++ b/drivers/xen/xen-selfballoon.c @@ -105,6 +105,12 @@ static unsigned int selfballoon_interval __read_mostly = 5; */ static unsigned int selfballoon_min_usable_mb; +/* + * Amount of RAM in MB to add to the target number of pages. + * Can be used to reserve some more room for caches and the like. + */ +static unsigned int selfballoon_reserved_mb; + static void selfballoon_process(struct work_struct *work); static DECLARE_DELAYED_WORK(selfballoon_worker, selfballoon_process); @@ -217,7 +223,8 @@ static void selfballoon_process(struct work_struct *work) cur_pages = totalram_pages; tgt_pages = cur_pages; /* default is no change */ goal_pages = percpu_counter_read_positive(&vm_committed_as) + - totalreserve_pages; + totalreserve_pages + + MB2PAGES(selfballoon_reserved_mb); #ifdef CONFIG_FRONTSWAP /* allow space for frontswap pages to be repatriated */ if (frontswap_selfshrinking && frontswap_enabled) @@ -397,6 +404,30 @@ static DEVICE_ATTR(selfballoon_min_usable_mb, S_IRUGO | S_IWUSR, show_selfballoon_min_usable_mb, store_selfballoon_min_usable_mb); +SELFBALLOON_SHOW(selfballoon_reserved_mb, "%d\n", + selfballoon_reserved_mb); + +static ssize_t store_selfballoon_reserved_mb(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + unsigned long val; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + err = strict_strtoul(buf, 10, &val); + if (err || val == 0) + return -EINVAL; + selfballoon_reserved_mb = val; + return count; +} + +static DEVICE_ATTR(selfballoon_reserved_mb, S_IRUGO | S_IWUSR, + show_selfballoon_reserved_mb, + store_selfballoon_reserved_mb); + #ifdef CONFIG_FRONTSWAP SELFBALLOON_SHOW(frontswap_selfshrinking, "%d\n", frontswap_selfshrinking); @@ -480,6 +511,7 @@ static struct attribute *selfballoon_attrs[] = { &dev_attr_selfballoon_downhysteresis.attr, &dev_attr_selfballoon_uphysteresis.attr, &dev_attr_selfballoon_min_usable_mb.attr, + &dev_attr_selfballoon_reserved_mb.attr, #ifdef CONFIG_FRONTSWAP &dev_attr_frontswap_selfshrinking.attr, &dev_attr_frontswap_hysteresis.attr, -- cgit v1.2.3 From 68c2c39a76b094e9b2773e5846424ea674bf2c46 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Mon, 21 May 2012 16:54:10 +0100 Subject: xen: do not map the same GSI twice in PVHVM guests. PV on HVM guests map GSIs into event channels. At restore time the event channels are resumed by restore_pirqs. Device drivers might try to register the same GSI again through ACPI at restore time, but the GSI has already been mapped and bound by restore_pirqs. This patch detects these situations and avoids mapping the same GSI multiple times. Without this patch we get: (XEN) irq.c:2235: dom4: pirq 23 or emuirq 28 already mapped and waste a pirq. CC: stable@kernel.org Signed-off-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/events.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'drivers/xen') diff --git a/drivers/xen/events.c b/drivers/xen/events.c index 4b33acd8ed4e..faae2f910ad2 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -611,7 +611,7 @@ static void disable_pirq(struct irq_data *data) disable_dynirq(data); } -static int find_irq_by_gsi(unsigned gsi) +int xen_irq_from_gsi(unsigned gsi) { struct irq_info *info; @@ -625,6 +625,7 @@ static int find_irq_by_gsi(unsigned gsi) return -1; } +EXPORT_SYMBOL_GPL(xen_irq_from_gsi); /* * Do not make any assumptions regarding the relationship between the @@ -644,7 +645,7 @@ int xen_bind_pirq_gsi_to_irq(unsigned gsi, mutex_lock(&irq_mapping_update_lock); - irq = find_irq_by_gsi(gsi); + irq = xen_irq_from_gsi(gsi); if (irq != -1) { printk(KERN_INFO "xen_map_pirq_gsi: returning irq %d for gsi %u\n", irq, gsi); -- cgit v1.2.3