commit 020d3ba8f39ae66ee1ad81881874961d2b8c3a0c
Author: spike
Date:   Tue Feb 28 17:27:39 2017 +0100

    initial commit

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..62c2e4a
--- /dev/null
+++ b/README.md
@@ -0,0 +1,31 @@
+## Patch for VMware 12.1.1 on Kernel 4.9
+
+**Note: Use at your own risk!**
+
+VMware 12.1 module compilation has been broken since kernel 4.9.
+Here’s a quick and dirty patch I came up with after failing to find a solution.
+
+
+### Usage
+
+1. Go to the VMware modules source directory:
+`cd /usr/lib/vmware/modules/source/`
+
+2. Replace `hostif.c` in the vmmon sources with the patched version from this repository and rebuild the modules.
+
+### What changed
+
+Since kernel 4.9, `get_user_pages()`/`get_user_pages_remote()` take a single `gup_flags` argument instead of the separate `write` and `force` arguments:
+https://github.com/torvalds/linux/commit/9beae1ea89305a9667ceaab6d0bf46a045ad71e7
+
+- the two variables (`write`, `force`) were replaced with `gup_flags`
+- the flags are built like this: `unsigned int flags = 0; flags |= FOLL_WRITE;`
+- example conversion from the i915 driver: `1, 0, pvec + pinned` --> `flags, pvec + pinned`
+https://github.com/torvalds/linux/commit/1e9877902dc7e11d2be038371c6fbf2dfcd469d7#diff-e37c5ffd9b4db050c3f7eae7d74e64c3R1230
+
+In this patch the same conversion is applied: `write, force, pages` --> `flags, pages`,
+where `write=1, force=0` becomes write-only flags (`FOLL_WRITE`), never forced access:
+https://github.com/torvalds/linux/blob/6e5c8381d1db4c1cdd4b4e49d5f0d1255c2246fd/include/linux/mm.h#L2278
diff --git a/vmmon-hostif.c/hostif.c b/vmmon-hostif.c/hostif.c
new file mode 100644
index 0000000..327a2e6
--- /dev/null
+++ b/vmmon-hostif.c/hostif.c
@@ -0,0 +1,3592 @@
+/*********************************************************
+ * Copyright (C) 1998-2014 VMware, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation version 2 and no later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
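For reference, here is a minimal before/after sketch of the `get_user_pages_remote()` conversion described in the README above. It is not part of the VMware sources: the helper name `pin_user_buffer` and its parameters are placeholders, the caller is assumed to already hold `mmap_sem`, and it assumes a call site that previously passed `write=1, force=0`.

```c
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/version.h>

/* Hypothetical helper, for illustration only. Caller holds mmap_sem. */
static long pin_user_buffer(void *uvAddr, unsigned long numPages,
                            struct page **ppages)
{
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 9, 0)
   /* 4.9: write=1, force=0 collapses into a single gup_flags value. */
   unsigned int gup_flags = FOLL_WRITE;

   return get_user_pages_remote(current, current->mm,
                                (unsigned long)uvAddr, numPages,
                                gup_flags, ppages, NULL);
#else
   /* 4.6-4.8 signature: separate write/force arguments. */
   return get_user_pages_remote(current, current->mm,
                                (unsigned long)uvAddr, numPages,
                                1, 0, ppages, NULL);
#endif
}
```

Note that kernel 4.10 changed the signature again (adding a `locked` argument), so this sketch targets 4.9 specifically.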
+ * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + *********************************************************/ + +/* + * hostif.c -- + * + * This file implements the platform-specific (here Linux) interface that + * the cross-platform code uses --hpreg + * + */ + + +/* Must come before any kernel header file --hpreg */ +#include "driver-config.h" + +/* Must come before vmware.h --hpreg */ +#include "compat_page.h" +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +#include + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 25) +# include +#endif +#if defined(_ASM_EXTABLE) +# define VMW_ASM_EXTABLE(from, to) _ASM_EXTABLE(from, to) +#else + /* Compat version copied from asm.h of 2.6.25 kernel */ +# define VMW_ASM_FORM(x) " " #x " " +# define VMW_ASM_EX_SEC " .section __ex_table,\"a\"\n" +# ifdef CONFIG_X86_32 +# define VMW_ASM_SEL(a,b) VMW_ASM_FORM(a) +# else +# define VMW_ASM_SEL(a,b) VMW_ASM_FORM(b) +# endif +# define VMW_ASM_PTR VMW_ASM_SEL(.long, .quad) +# define VMW_ASM_ALIGN VMW_ASM_SEL(.balign 4, .balign 8) +# define VMW_ASM_EXTABLE(from,to) \ + VMW_ASM_EX_SEC \ + VMW_ASM_ALIGN "\n" \ + VMW_ASM_PTR #from "," #to "\n" \ + " .previous\n" +#endif + +#include +#include +#include +#include +#include +#include + + +#include "vmware.h" +#include "x86apic.h" +#include "vm_asm.h" +#include "modulecall.h" +#include "memtrack.h" +#include "phystrack.h" +#include "cpuid.h" +#include "cpuid_info.h" +#include "hostif.h" +#include "hostif_priv.h" +#include "driver.h" +#include "vmhost.h" +#include "x86msr.h" +#include "apic.h" +#include "memDefaults.h" +#include "vcpuid.h" + +#include "pgtbl.h" +#include "vmmonInt.h" +#include "versioned_atomic.h" + +/* + * Determine if we can use high resolution timers. + */ + +#ifdef CONFIG_HIGH_RES_TIMERS +# include +# define VMMON_USE_HIGH_RES_TIMERS +# if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 28) +# define VMMON_USE_SCHEDULE_HRTIMEOUT +# else +# define VMMON_USE_COMPAT_SCHEDULE_HRTIMEOUT +static void HostIFWakeupClockThread(unsigned long data); +static DECLARE_TASKLET(timerTasklet, HostIFWakeupClockThread, 0); +# endif +# define close_rtc(filp, files) do {} while(0) +#else +# define close_rtc(filp, files) filp_close(filp, files) +#endif + +#define UPTIME_FREQ CONST64(1000000) + +/* + * When CONFIG_NO_HZ_FULL is set processors can run tickless + * if there is only one runnable process. When set, the rate + * checks in HostIF_SetFastClockRate and HostIFFastClockThread + * need to be relaxed to allow any non-zero rate to run. + * + * This code can potentially be removed if/when we stop using + * HostIFFastClockThread to drive MonTimer. See PR1088247. + */ +#ifdef CONFIG_NO_HZ_FULL +#define MIN_RATE (0) +#else +#define MIN_RATE ((HZ) + (HZ) / 16) +#endif + +/* + * Linux seems to like keeping free memory around 30MB + * even under severe memory pressure. Let's give it a little + * more leeway than that for safety. + */ +#define LOCKED_PAGE_SLACK 10000 + +static struct { + Atomic_uint64 uptimeBase; + VersionedAtomic version; + uint64 monotimeBase; + unsigned long jiffiesBase; + struct timer_list timer; +} uptimeState; + +/* + * First Page Locking strategy + * --------------------------- + * + * An early implementation hacked the lock bit for the purpose of locking + * memory. 
This had a couple of advantages: + * - the vmscan algorithm would never eliminate mappings from the process + * address space + * - easy to assert that things are ok + * - it worked with anonymous memory. Basically, vmscan jumps over these + * pages, their use count stays high, .... + * + * This approach however had a couple of problems: + * + * - it relies on an undocumented interface. (in another words, a total hack) + * - it creates deadlock situations if the application gets a kill -9 or + * otherwise dies ungracefully. linux first tears down the address space, + * then closes file descriptors (including our own device). Unfortunately, + * this leads to a deadlock of the process on pages with the lock bit set. + * + * There is a workaround for that, namely to detect that condition using + * a linux timer. (ugly) + * + * Current Page Locking strategy + * ----------------------------- + * + * The current scheme does not use the lock bit, rather it increments the use + * count on the pages that need to be locked down in memory. + * + * The problem is that experiments on certain linux systems (e.g. 2.2.0-pre9) + * showed that linux somehow swaps out anonymous pages, even with the + * increased ref counter. + * Swapping them out to disk is not that big of a deal, but bringing them back + * to a different location is. In any case, anonymous pages in linux are not + * intended to be write-shared (e.g. try to MAP_SHARED /dev/zero). + * + * As a result, the current locking strategy requires that all locked pages are + * backed by the filesystem, not by swap. For now, we use both mapped files and + * sys V shared memory. The user application is responsible to cover these + * cases. + * + */ + + +#define HOST_UNLOCK_PFN(_vm, _pfn) do { \ + _vm = _vm; \ + put_page(pfn_to_page(_pfn)); \ +} while (0) + +#define HOST_UNLOCK_PFN_BYMPN(_vm, _pfn) do { \ + PhysTrack_Remove((_vm)->vmhost->lockedPages, (_pfn)); \ + put_page(pfn_to_page(_pfn)); \ +} while (0) + +uint8 monitorIPIVector; +uint8 hvIPIVector; + +/* + *----------------------------------------------------------------------------- + * + * MutexInit -- + * + * Initialize a Mutex. --hpreg + * + * Results: + * None + * + * Side effects: + * None + * + *----------------------------------------------------------------------------- + */ + +#ifdef VMX86_DEBUG +static INLINE void +MutexInit(Mutex *mutex, // IN + char const *name) // IN +{ + ASSERT(mutex); + ASSERT(name); + + sema_init(&mutex->sem, 1); + mutex->name = name; + mutex->cur.pid = -1; +} +#else +# define MutexInit(_mutex, _name) sema_init(&(_mutex)->sem, 1) +#endif + + +#ifdef VMX86_DEBUG +/* + *----------------------------------------------------------------------------- + * + * MutexIsLocked -- + * + * Determine if a Mutex is locked by the current thread. --hpreg + * + * Results: + * TRUE if yes + * FALSE if no + * + * Side effects: + * None + * + *----------------------------------------------------------------------------- + */ + +static INLINE Bool +MutexIsLocked(Mutex *mutex) // IN +{ + ASSERT(mutex); + + return mutex->cur.pid == current->pid; +} +#endif + + +/* + *----------------------------------------------------------------------------- + * + * MutexLock -- + * + * Acquire a Mutex. 
--hpreg + * + * Results: + * None + * + * Side effects: + * None + * + *----------------------------------------------------------------------------- + */ + +#ifdef VMX86_DEBUG +static INLINE void +MutexLock(Mutex *mutex, // IN + int callerID) // IN +{ + ASSERT(mutex); + ASSERT(!MutexIsLocked(mutex)); + + down(&mutex->sem); + mutex->cur.pid = current->pid; + mutex->cur.callerID = callerID; +} +#else +# define MutexLock(_mutex, _callerID) down(&(_mutex)->sem) +#endif + + +/* + *----------------------------------------------------------------------------- + * + * MutexUnlock -- + * + * Release a Mutex. --hpreg + * + * Results: + * None + * + * Side effects: + * None + * + *----------------------------------------------------------------------------- + */ + +#ifdef VMX86_DEBUG +static INLINE void +MutexUnlock(Mutex *mutex, // IN + int callerID) // IN +{ + ASSERT(mutex); + + ASSERT(MutexIsLocked(mutex) && mutex->cur.callerID == callerID); + mutex->prev = mutex->cur; + mutex->cur.pid = -1; + up(&mutex->sem); +} +#else +# define MutexUnlock(_mutex, _callerID) up(&(_mutex)->sem) +#endif + + +/* This mutex protects the driver-wide state. --hpreg */ +static Mutex globalMutex; + +/* + * This mutex protects the fast clock rate and is held while + * creating/destroying the fastClockThread. It ranks below + * globalMutex. We can't use globalMutex for this purpose because the + * fastClockThread itself acquires the globalMutex, so trying to hold + * the mutex while destroying the thread can cause a deadlock. + */ +static Mutex fastClockMutex; + +/* This mutex protects linuxState.pollList. */ +static Mutex pollListMutex; + + +/* + *---------------------------------------------------------------------- + * + * HostIF_PrepareWaitForThreads -- + * + * Prepare to wait for another vCPU thread. + * + * Results: + * FALSE: no way on Linux to determine we've already been signalled. + * + * Side effects: + * Current task is interruptible. + * + *---------------------------------------------------------------------- + */ + +Bool +HostIF_PrepareWaitForThreads(VMDriver *vm, // IN: + Vcpuid currVcpu) // IN: +{ + set_current_state(TASK_INTERRUPTIBLE); + vm->vmhost->vcpuSemaTask[currVcpu] = current; + return FALSE; +} + + +/* + *---------------------------------------------------------------------- + * + * HostIF_WaitForThreads -- + * + * Wait for another vCPU thread. + * + * Results: + * None. + * + * Side effects: + * Current task may block. + * + *---------------------------------------------------------------------- + */ + +void +HostIF_WaitForThreads(VMDriver *vm, // UNUSED: + Vcpuid currVcpu) // UNUSED: + +{ +#ifdef VMMON_USE_SCHEDULE_HRTIMEOUT + ktime_t timeout = ktime_set(0, CROSSCALL_SLEEP_US * 1000); + schedule_hrtimeout(&timeout, HRTIMER_MODE_REL); +#else + /* Fallback to ms timer resolution is fine for older kernels. */ + schedule_timeout(msecs_to_jiffies(CROSSCALL_SLEEP_US / 1000) + 1); +#endif +} + + +/* + *---------------------------------------------------------------------- + * + * HostIF_CancelWaitForThreads -- + * + * Cancel waiting for another vCPU thread. + * + * Results: + * None. + * + * Side effects: + * Current task is running and no longer interruptible. 
+ * + *---------------------------------------------------------------------- + */ + +void +HostIF_CancelWaitForThreads(VMDriver *vm, // IN: + Vcpuid currVcpu) // IN: +{ + vm->vmhost->vcpuSemaTask[currVcpu] = NULL; + set_current_state(TASK_RUNNING); +} + + +/* + *---------------------------------------------------------------------- + * + * HostIF_WakeUpYielders -- + * + * Wakeup vCPUs that are waiting for the current vCPU. + * + * Results: + * The requested vCPUs are nudged if they are sleeping due to + * Vmx86_YieldToSet. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +void +HostIF_WakeUpYielders(VMDriver *vm, // IN: + Vcpuid currVcpu) // IN: +{ + VCPUSet req; + Vcpuid vcpuid; + uint64 subset; + + /* + * PR 1142958: if the VCPUs woken in the crosscallWaitSet re-add themselves + * to this set faster than it can be fully drained, this function never + * exits. Instead, we copy and remove a snapshot of the crosscallWaitSet + * and locally wake up just that snapshot. It is ok that we don't get a + * fully coherent snapshot, as long as the subset copy-and-remove is atomic + * so no VCPU added is lost entirely. + */ + + VCPUSet_Empty(&req); + FOR_EACH_SUBSET_IN_SET(subIdx) { + subset = VCPUSet_AtomicReadWriteSubset(&vm->crosscallWaitSet[currVcpu], + 0, subIdx); + VCPUSet_UnionSubset(&req, subset, subIdx); + } ROF_EACH_SUBSET_IN_SET(); + + preempt_disable(); + while ((vcpuid = VCPUSet_FindFirst(&req)) != VCPUID_INVALID) { + struct task_struct *t = vm->vmhost->vcpuSemaTask[vcpuid]; + VCPUSet_Remove(&req, vcpuid); + if (t && (t->state & TASK_INTERRUPTIBLE)) { + wake_up_process(t); + } + } + preempt_enable(); +} + + +/* + *----------------------------------------------------------------------------- + * + * HostIF_InitGlobalLock -- + * + * Initialize the global (across all VMs and vmmon) locks. + * + * Results: + * None + * + * Side effects: + * None + * + *----------------------------------------------------------------------------- + */ + +void +HostIF_InitGlobalLock(void) +{ + MutexInit(&globalMutex, "global"); + MutexInit(&fastClockMutex, "fastClock"); + MutexInit(&pollListMutex, "pollList"); +} + + +/* + *----------------------------------------------------------------------------- + * + * HostIF_GlobalLock -- + * + * Grabs the global data structure lock. + * + * Results: + * None + * + * Side effects: + * Should be a very low contention lock. + * The current thread is rescheduled if the lock is busy. + * + *----------------------------------------------------------------------------- + */ + +void +HostIF_GlobalLock(int callerID) // IN +{ + MutexLock(&globalMutex, callerID); +} + + +/* + *----------------------------------------------------------------------------- + * + * HostIF_GlobalUnlock -- + * + * Releases the global data structure lock. + * + * Results: + * None + * + * Side effects: + * None + * + *----------------------------------------------------------------------------- + */ + +void +HostIF_GlobalUnlock(int callerID) // IN +{ + MutexUnlock(&globalMutex, callerID); +} + + +#ifdef VMX86_DEBUG +/* + *----------------------------------------------------------------------------- + * + * HostIF_GlobalLockIsHeld -- + * + * Determine if the global lock is held by the current thread. 
+ * + * Results: + * TRUE if yes + * FALSE if no + * + * Side effects: + * None + * + *----------------------------------------------------------------------------- + */ + +Bool +HostIF_GlobalLockIsHeld(void) +{ + return MutexIsLocked(&globalMutex); +} +#endif + + +/* + *----------------------------------------------------------------------------- + * + * HostIF_FastClockLock -- + * + * Grabs the fast clock data structure lock. + * + * Results: + * None + * + * Side effects: + * Should be a very low contention lock. + * The current thread is rescheduled if the lock is busy. + * + *----------------------------------------------------------------------------- + */ + +void +HostIF_FastClockLock(int callerID) // IN +{ + MutexLock(&fastClockMutex, callerID); +} + + +/* + *----------------------------------------------------------------------------- + * + * HostIF_FastClockUnlock -- + * + * Releases the fast clock data structure lock. + * + * Results: + * None + * + * Side effects: + * None + * + *----------------------------------------------------------------------------- + */ + +void +HostIF_FastClockUnlock(int callerID) // IN +{ + MutexUnlock(&fastClockMutex, callerID); +} + + +/* + *----------------------------------------------------------------------------- + * + * HostIF_PollListLock -- + * + * Grabs the linuxState.pollList lock. + * + * Results: + * None + * + * Side effects: + * The current thread is rescheduled if the lock is busy. + * + *----------------------------------------------------------------------------- + */ + +void +HostIF_PollListLock(int callerID) // IN +{ + MutexLock(&pollListMutex, callerID); +} + + +/* + *----------------------------------------------------------------------------- + * + * HostIF_PollListUnlock -- + * + * Releases the linuxState.pollList lock. + * + * Results: + * None + * + * Side effects: + * None + * + *----------------------------------------------------------------------------- + */ + +void +HostIF_PollListUnlock(int callerID) // IN +{ + MutexUnlock(&pollListMutex, callerID); +} + + +/* + *---------------------------------------------------------------------- + * + * MapCrossPage & UnmapCrossPage + * + * Both x86-64 and ia32 need to map crosspage to an executable + * virtual address. We use the vmap interface instead of kmap + * due to bug 43907. + * + * Side effects: + * + * UnmapCrossPage assumes that the page has been refcounted up + * so it takes care of the put_page. + * + *---------------------------------------------------------------------- + */ +static void * +MapCrossPage(struct page *p) // IN: +{ + return vmap(&p, 1, VM_MAP, VM_PAGE_KERNEL_EXEC); +} + + +static void +UnmapCrossPage(struct page *p, // IN: + void *va) // IN: +{ + vunmap(va); + put_page(p); +} + + +/* + *---------------------------------------------------------------------- + * + * HostIFHostMemInit -- + * + * Initialize per-VM pages lists. + * + * Results: + * 0 on success, + * non-zero on failure. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +static int +HostIFHostMemInit(VMDriver *vm) // IN: +{ + VMHost *vmh = vm->vmhost; + + vmh->lockedPages = PhysTrack_Alloc(vm); + if (!vmh->lockedPages) { + return -1; + } + vmh->AWEPages = PhysTrack_Alloc(vm); + if (!vmh->AWEPages) { + return -1; + } + + return 0; +} + + +/* + *---------------------------------------------------------------------- + * + * HostIFHostMemCleanup -- + * + * Release per-VM pages lists. + * + * Results: + * None. 
+ * + * Side effects: + * Locked and AWE pages are released. + * + *---------------------------------------------------------------------- + */ + +static void +HostIFHostMemCleanup(VMDriver *vm) // IN: +{ + MPN mpn; + VMHost *vmh = vm->vmhost; + + if (!vmh) { + return; + } + + HostIF_VMLock(vm, 32); // Debug version of PhysTrack wants VM's lock. + if (vmh->lockedPages) { + for (mpn = 0; + INVALID_MPN != (mpn = PhysTrack_GetNext(vmh->lockedPages, mpn));) { + HOST_UNLOCK_PFN_BYMPN(vm, mpn); + } + PhysTrack_Free(vmh->lockedPages); + vmh->lockedPages = NULL; + } + + if (vmh->AWEPages) { + for (mpn = 0; + INVALID_MPN != (mpn = PhysTrack_GetNext(vmh->AWEPages, mpn));) { + PhysTrack_Remove(vmh->AWEPages, mpn); + put_page(pfn_to_page(mpn)); + } + PhysTrack_Free(vmh->AWEPages); + vmh->AWEPages = NULL; + } + HostIF_VMUnlock(vm, 32); +} + + +/* + *---------------------------------------------------------------------- + * + * HostIF_AllocMachinePage -- + * + * Alloc non-swappable memory page. The page is not billed to + * a particular VM. Preferably the page should not be mapped into + * the kernel addresss space. + * + * Results: + * INVALID_MPN or a valid host mpn. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +MPN +HostIF_AllocMachinePage(void) +{ + struct page *pg = alloc_page(GFP_HIGHUSER); + + return (pg) ? ((MPN)page_to_pfn(pg)) : INVALID_MPN; +} + + +/* + *---------------------------------------------------------------------- + * + * HostIF_FreeMachinePage -- + * + * Free an anonymous machine page allocated by + * HostIF_AllocMachinePage(). This page is not tracked in any + * phystracker. + * + * Results: + * Host page is unlocked. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +void +HostIF_FreeMachinePage(MPN mpn) // IN: +{ + struct page *pg = pfn_to_page(mpn); + + __free_page(pg); +} + + +/* + *---------------------------------------------------------------------- + * + * HostIF_AllocLockedPages -- + * + * Alloc non-swappable memory. + * + * Results: + * negative value on complete failure + * non-negative value on partial/full completion, number of MPNs + * allocated & filled in pmpn returned. + * + * Side effects: + * Pages allocated. + * + *---------------------------------------------------------------------- + */ + +int +HostIF_AllocLockedPages(VMDriver *vm, // IN: VM instance pointer + VA64 addr, // OUT: pointer to user or kernel buffer for MPNs + unsigned numPages, // IN: number of pages to allocate + Bool kernelMPNBuffer)// IN: is the MPN buffer in kernel or user address space? +{ + MPN *pmpn = VA64ToPtr(addr); + + VMHost *vmh = vm->vmhost; + unsigned int cnt; + int err = 0; + + if (!vmh || !vmh->AWEPages) { + return -EINVAL; + } + for (cnt = 0; cnt < numPages; cnt++) { + struct page* pg; + MPN mpn; + + pg = alloc_page(GFP_HIGHUSER); + if (!pg) { + err = -ENOMEM; + break; + } + mpn = (MPN)page_to_pfn(pg); + if (kernelMPNBuffer) { + *pmpn = mpn; + } else if (HostIF_CopyToUser(pmpn, &mpn, sizeof *pmpn) != 0) { + __free_page(pg); + err = -EFAULT; + break; + } + pmpn++; + if (PhysTrack_Test(vmh->AWEPages, mpn)) { + Warning("%s: duplicate MPN %016" FMT64 "x\n", __func__, mpn); + } + PhysTrack_Add(vmh->AWEPages, mpn); + } + + return cnt ? cnt : err; +} + + +/* + *---------------------------------------------------------------------- + * + * HostIF_FreeLockedPages -- + * + * Free non-swappable memory. + * + * Results: + * On success: 0. 
All pages were unlocked. + * On failure: Non-zero system error code. No page was unlocked. + * + * Side effects: + * Pages freed. + * + *---------------------------------------------------------------------- + */ + +int +HostIF_FreeLockedPages(VMDriver *vm, // IN: VM instance pointer + VA64 addr, // IN: user or kernel array of MPNs + unsigned numPages, // IN: number of pages to free + Bool kernelMPNBuffer) // IN: is the MPN buffer in kernel or user address space? +{ + const int MPN_BATCH = 64; + MPN const *pmpn = VA64ToPtr(addr); + VMHost *vmh = vm->vmhost; + unsigned int cnt; + struct page *pg; + MPN *mpns; + + mpns = HostIF_AllocKernelMem(sizeof *mpns * MPN_BATCH, TRUE); + + if (mpns == NULL) { + return -ENOMEM; + } + if (!vmh || !vmh->AWEPages) { + HostIF_FreeKernelMem(mpns); + return -EINVAL; + } + + if (!kernelMPNBuffer) { + if (numPages > MPN_BATCH) { + HostIF_FreeKernelMem(mpns); + return -EINVAL; + } + + if (HostIF_CopyFromUser(mpns, pmpn, numPages * sizeof *pmpn)) { + printk(KERN_DEBUG "Cannot read from process address space at %p\n", + pmpn); + HostIF_FreeKernelMem(mpns); + return -EINVAL; + } + + pmpn = mpns; + } + + for (cnt = 0; cnt < numPages; cnt++) { + if (!PhysTrack_Test(vmh->AWEPages, pmpn[cnt])) { + printk(KERN_DEBUG "Attempted to free unallocated MPN %016" FMT64 "X\n", + pmpn[cnt]); + HostIF_FreeKernelMem(mpns); + return -EINVAL; + } + + pg = pfn_to_page(pmpn[cnt]); + if (page_count(pg) != 1) { + // should this case be considered a failure? + printk(KERN_DEBUG "Page %016" FMT64 "X is still used by someone " + "(use count %u, VM %p)\n", pmpn[cnt], + page_count(pg), vm); + } + } + + for (cnt = 0; cnt < numPages; cnt++) { + pg = pfn_to_page(pmpn[cnt]); + PhysTrack_Remove(vmh->AWEPages, pmpn[cnt]); + __free_page(pg); + } + HostIF_FreeKernelMem(mpns); + return 0; +} + + +/* + *---------------------------------------------------------------------- + * + * HostIF_Init -- + * + * Initialize the host-dependent part of the driver. + * + * Results: + * zero on success, non-zero on error. + * + * Side effects: + * None + * + *---------------------------------------------------------------------- + */ + +int +HostIF_Init(VMDriver *vm) // IN: +{ + vm->memtracker = MemTrack_Init(); + if (vm->memtracker == NULL) { + return -1; + } + + vm->vmhost = (VMHost *) HostIF_AllocKernelMem(sizeof *vm->vmhost, TRUE); + if (vm->vmhost == NULL) { + return -1; + } + memset(vm->vmhost, 0, sizeof *vm->vmhost); + + if (HostIFHostMemInit(vm)) { + return -1; + } + MutexInit(&vm->vmhost->vmMutex, "vm"); + + return 0; +} + + +/* + *------------------------------------------------------------------------------ + * + * HostIF_LookupUserMPN -- + * + * Lookup the MPN of a locked user page by user VA. + * + * Results: + * A status code and the MPN on success. + * + * Side effects: + * None + * + *------------------------------------------------------------------------------ + */ + +int +HostIF_LookupUserMPN(VMDriver *vm, // IN: VMDriver + VA64 uAddr, // IN: user VA of the page + MPN *mpn) // OUT +{ + void *uvAddr = VA64ToPtr(uAddr); + int retval = PAGE_LOCK_SUCCESS; + + *mpn = PgtblVa2MPN((VA)uvAddr); + + /* + * On failure, check whether the page is locked. + * + * While we don't require the page to be locked by HostIF_LockPage(), + * it does provide extra information. 
+ * + * -- edward + */ + if (*mpn == INVALID_MPN) { + if (vm == NULL) { + retval += PAGE_LOOKUP_NO_VM; + } else { + MemTrackEntry *entryPtr = + MemTrack_LookupVPN(vm->memtracker, PTR_2_VPN(uvAddr)); + if (entryPtr == NULL) { + retval += PAGE_LOOKUP_NOT_TRACKED; + } else if (entryPtr->mpn == 0) { + retval += PAGE_LOOKUP_NO_MPN; + } else { + /* + * Kernel can remove PTEs/PDEs from our pagetables even if pages + * are locked... + */ + volatile int c; + + get_user(c, (char *)uvAddr); + *mpn = PgtblVa2MPN((VA)uvAddr); + if (*mpn == entryPtr->mpn) { +#ifdef VMX86_DEBUG + printk(KERN_DEBUG "Page %p disappeared from %s(%u)... " + "now back at %016" FMT64 "x\n", + uvAddr, current->comm, current->pid, *mpn); +#endif + } else if (*mpn != INVALID_MPN) { + printk(KERN_DEBUG "Page %p disappeared from %s(%u)... " + "now back at %016" FMT64"x (old=%016" FMT64 "x)\n", + uvAddr, current->comm, current->pid, *mpn, + entryPtr->mpn); + *mpn = INVALID_MPN; + } else { + printk(KERN_DEBUG "Page %p disappeared from %s(%u)... " + "and is lost (old=%016" FMT64 "x)\n", uvAddr, current->comm, + current->pid, entryPtr->mpn); + *mpn = entryPtr->mpn; + } + } + } + } + + return retval; +} + + +/* + *---------------------------------------------------------------------- + * + * HostIF_InitFP -- + * + * masks IRQ13 if not previously the case. + * + * Results: + * prevents INTR #0x2d (IRQ 13) from being generated -- + * assume that Int16 works for interrupt reporting + * + * + * Side effects: + * PIC + * + *---------------------------------------------------------------------- + */ + +void +HostIF_InitFP(VMDriver *vm) // IN: +{ + int mask = (1 << (0xD - 0x8)); + + uint8 val = inb(0xA1); + + if (!(val & mask)) { + val = val | mask; + outb(val, 0xA1); + } +} + + +/* + *----------------------------------------------------------------------------- + * + * HostIFGetUserPages -- + * + * Lock the pages of an user-level address space in memory. + * If ppages is NULL, pages are only marked as dirty. + * + * Results: + * Zero on success, non-zero on failure. + * + * Side effects: + * None + * + *----------------------------------------------------------------------------- + */ + +static int +HostIFGetUserPages(void *uvAddr, // IN + struct page **ppages, // OUT + unsigned int numPages) // IN +{ + int retval; + + down_read(¤t->mm->mmap_sem); + retval = get_user_pages_remote(current, current->mm, (unsigned long)uvAddr, + numPages, 0, 0, ppages, NULL); + up_read(¤t->mm->mmap_sem); + + return retval != numPages; +} + + +/* + *---------------------------------------------------------------------- + * + * HostIF_IsLockedByMPN -- + * + * Checks if mpn was locked using allowMultipleMPNsPerVA. + * + * Results: + * TRUE if mpn is present in the physTracker. + * + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +Bool +HostIF_IsLockedByMPN(VMDriver *vm, // IN: + MPN mpn) // IN: +{ + return PhysTrack_Test(vm->vmhost->lockedPages, mpn); +} + + +/* + *----------------------------------------------------------------------------- + * + * HostIF_LockPage -- + * + * Lockup the MPN of an pinned user-level address space + * + * Results: + * A PAGE_LOCK_* status code and the MPN on success. + * + * Side effects: + * Adds the page to the MemTracker, if allowMultipleMPNsPerVA then the page + * is added to the VM's PhysTracker. 
+ * + *----------------------------------------------------------------------------- + */ + +int +HostIF_LockPage(VMDriver *vm, // IN: VMDriver + VA64 uAddr, // IN: user VA of the page + Bool allowMultipleMPNsPerVA, // IN: allow to lock many pages per VA + MPN *mpn) // OUT: pinned page +{ + void *uvAddr = VA64ToPtr(uAddr); + struct page *page; + VPN vpn; + MemTrackEntry *entryPtr = NULL; + + vpn = PTR_2_VPN(uvAddr); + if (!allowMultipleMPNsPerVA) { + entryPtr = MemTrack_LookupVPN(vm->memtracker, vpn); + + /* + * Already tracked and locked + */ + + if (entryPtr != NULL && entryPtr->mpn != 0) { + return PAGE_LOCK_ALREADY_LOCKED; + } + } + + if (HostIFGetUserPages(uvAddr, &page, 1)) { + return PAGE_LOCK_FAILED; + } + + *mpn = (MPN)page_to_pfn(page); + + if (allowMultipleMPNsPerVA) { + /* + * Add the MPN to the PhysTracker that tracks locked pages. + */ + + struct PhysTracker* const pt = vm->vmhost->lockedPages; + + if (PhysTrack_Test(pt, *mpn)) { + put_page(page); + return PAGE_LOCK_ALREADY_LOCKED; + } + PhysTrack_Add(pt, *mpn); + } else { + /* + * If the entry doesn't exist, add it to the memtracker + * otherwise we just update the mpn. + */ + + if (entryPtr == NULL) { + entryPtr = MemTrack_Add(vm->memtracker, vpn, *mpn); + if (entryPtr == NULL) { + HOST_UNLOCK_PFN(vm, *mpn); + return PAGE_LOCK_MEMTRACKER_ERROR; + } + } else { + entryPtr->mpn = *mpn; + } + } + + return PAGE_LOCK_SUCCESS; +} + + +/* + *---------------------------------------------------------------------- + * + * HostIF_UnlockPage -- + * + * Unlock an pinned user-level page. + * + * Results: + * Status PAGE_UNLOCK_* code. + * + * Side effects: + * None + * + *---------------------------------------------------------------------- + */ + +int +HostIF_UnlockPage(VMDriver *vm, // IN: + VA64 uAddr) // IN: +{ + void *addr = VA64ToPtr(uAddr); + VPN vpn; + MemTrackEntry *e; + + vpn = VA_2_VPN((VA)addr); + e = MemTrack_LookupVPN(vm->memtracker, vpn); + + if (e == NULL) { + return PAGE_UNLOCK_NOT_TRACKED; + } + if (e->mpn == 0) { + return PAGE_UNLOCK_NO_MPN; + } + + HOST_UNLOCK_PFN(vm, e->mpn); + e->mpn = 0; + + return PAGE_UNLOCK_SUCCESS; +} + + +/* + *---------------------------------------------------------------------- + * + * HostIF_UnlockPageByMPN -- + * + * Unlock a locked user mode page. The page doesn't need to be mapped + * anywhere. + * + * Results: + * Status code. Returns a PAGE_LOOKUP_* error if the page can't be found or + * a PAGE_UNLOCK_* error if the page can't be unlocked. + * + * Side effects: + * Removes the MPN from from VM's PhysTracker. + * + *---------------------------------------------------------------------- + */ + +int +HostIF_UnlockPageByMPN(VMDriver *vm, // IN: VMDriver + MPN mpn, // IN: the MPN to unlock + VA64 uAddr) // IN: optional(debugging) VA for the MPN +{ + if (!PhysTrack_Test(vm->vmhost->lockedPages, mpn)) { + return PAGE_UNLOCK_NO_MPN; + } + +#ifdef VMX86_DEBUG + { + void *va = VA64ToPtr(uAddr); + MemTrackEntry *e; + + /* + * Verify for debugging that VA and MPN make sense. + * PgtblVa2MPN() can fail under high memory pressure. + */ + + if (va != NULL) { + MPN lookupMpn = PgtblVa2MPN((VA)va); + + if (lookupMpn != INVALID_MPN && mpn != lookupMpn) { + Warning("Page lookup fail %#"FMT64"x %016" FMT64 "x %p\n", + mpn, lookupMpn, va); + + return PAGE_LOOKUP_INVALID_ADDR; + } + } + + /* + * Verify that this MPN was locked with + * HostIF_LockPage(allowMultipleMPNsPerVA = TRUE). + * That means that this MPN should not be in the MemTracker. 
+ */ + + e = MemTrack_LookupMPN(vm->memtracker, mpn); + if (e) { + Warning("%s(): mpn=%#"FMT64"x va=%p was permanently locked with " + "vpn=0x%"FMT64"x\n", __func__, mpn, va, e->vpn); + + return PAGE_UNLOCK_MISMATCHED_TYPE; + } + } +#endif + + HOST_UNLOCK_PFN_BYMPN(vm, mpn); + + return PAGE_UNLOCK_SUCCESS; +} + + +static void +UnlockEntry(void *clientData, // IN: + MemTrackEntry *entryPtr) // IN: +{ + VMDriver *vm = (VMDriver *)clientData; + + if (entryPtr->mpn) { + HOST_UNLOCK_PFN(vm,entryPtr->mpn); + entryPtr->mpn = 0; + } +} + + +/* + *----------------------------------------------------------------------------- + * + * HostIF_FreeAllResources -- + * + * Free all host-specific VM resources. + * + * Results: + * None + * + * Side effects: + * None + * + *----------------------------------------------------------------------------- + */ + +void +HostIF_FreeAllResources(VMDriver *vm) // IN +{ + unsigned int cnt; + + HostIFHostMemCleanup(vm); + if (vm->memtracker) { + MemTrack_Cleanup(vm->memtracker, UnlockEntry, vm); + vm->memtracker = NULL; + } + if (vm->vmhost) { + for (cnt = vm->vmhost->crosspagePagesCount; cnt > 0; ) { + struct page* p = vm->vmhost->crosspagePages[--cnt]; + UnmapCrossPage(p, vm->crosspage[cnt]); + } + vm->vmhost->crosspagePagesCount = 0; + if (vm->vmhost->hostAPICIsMapped) { + ASSERT(vm->hostAPIC.base != NULL); + iounmap((void*)vm->hostAPIC.base); + vm->hostAPIC.base = NULL; + vm->vmhost->hostAPICIsMapped = FALSE; + } + HostIF_FreeKernelMem(vm->vmhost); + vm->vmhost = NULL; + } +} + + + +/* + *---------------------------------------------------------------------- + * + * HostIF_AllocKernelMem + * + * Allocate some kernel memory for the driver. + * + * Results: + * The address allocated or NULL on error. + * + * + * Side effects: + * memory is malloced + *---------------------------------------------------------------------- + */ + +void * +HostIF_AllocKernelMem(size_t size, // IN: + int wired) // IN: +{ + void * ptr = kmalloc(size, GFP_KERNEL); + + if (ptr == NULL) { + Warning("%s failed (size=%p)\n", __func__, (void*)size); + } + + return ptr; +} + + +/* + *----------------------------------------------------------------------------- + * + * HostIF_AllocPage -- + * + * Allocate a page (whose content is undetermined) + * + * Results: + * The kernel virtual address of the page + * + * Side effects: + * None + * + *----------------------------------------------------------------------------- + */ + +void * +HostIF_AllocPage(void) +{ + VA kvAddr; + + kvAddr = __get_free_page(GFP_KERNEL); + if (kvAddr == 0) { + Warning("%s: __get_free_page() failed\n", __func__); + } + + return (void *)kvAddr; +} + + +/* + *---------------------------------------------------------------------- + * + * HostIF_FreeKernelMem + * + * Free kernel memory allocated for the driver. + * + * Results: + * None. + * + * Side effects: + * memory is freed. + *---------------------------------------------------------------------- + */ + +void +HostIF_FreeKernelMem(void *ptr) // IN: +{ + kfree(ptr); +} + + +void +HostIF_FreePage(void *ptr) // IN: +{ + VA vAddr = (VA)ptr; + + if (vAddr & (PAGE_SIZE-1)) { + Warning("%s %p misaligned\n", __func__, (void*)vAddr); + } else { + free_page(vAddr); + } +} + + +/* + *---------------------------------------------------------------------- + * + * HostIF_EstimateLockedPageLimit -- + * + * Estimates how many memory pages can be locked or allocated + * from the kernel without causing the host to die or to be really upset. 
+ * + * Results: + * The maximum number of pages that can be locked. + * + * Side effects: + * none + * + *---------------------------------------------------------------------- + */ + +unsigned int +HostIF_EstimateLockedPageLimit(const VMDriver* vm, // IN + unsigned int currentlyLockedPages) // IN +{ + /* + * This variable is available and exported to modules, + * since at least 2.6.0. + */ + + extern unsigned long totalram_pages; + + unsigned int totalPhysicalPages = totalram_pages; + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 28) + return MemDefaults_CalcMaxLockedPages(totalPhysicalPages); +#else + /* + * Use the memory information linux exports as of late for a more + * precise estimate of locked memory. All kernel page-related structures + * (slab, pagetable) are as good as locked. Unevictable includes things + * that are explicitly marked as such (like mlock()). Huge pages are + * also as good as locked, since we don't use them. Lastly, without + * available swap, anonymous pages become locked in memory as well. + */ + + unsigned int forHost; + unsigned int reservedPages = MEMDEFAULTS_MIN_HOST_PAGES; + unsigned int hugePages = (vm == NULL) ? 0 : + BYTES_2_PAGES(vm->memInfo.hugePageBytes); + unsigned int lockedPages = global_page_state(NR_PAGETABLE) + + global_page_state(NR_SLAB_UNRECLAIMABLE) + + global_page_state(NR_UNEVICTABLE) + + hugePages + reservedPages; + unsigned int anonPages = global_page_state(NR_ANON_MAPPED); + unsigned int swapPages = BYTES_2_PAGES(linuxState.swapSize); + + if (anonPages > swapPages) { + lockedPages += anonPages - swapPages; + } + forHost = lockedPages + LOCKED_PAGE_SLACK; + if (forHost > totalPhysicalPages) { + forHost = totalPhysicalPages; + } + + return totalPhysicalPages - forHost; +#endif +} + + +/* + *---------------------------------------------------------------------- + * + * HostIF_Wait -- + * + * Waits for specified number of milliseconds. + * + *---------------------------------------------------------------------- + */ + +void +HostIF_Wait(unsigned int timeoutMs) +{ + msleep_interruptible(timeoutMs); +} + + +/* + *---------------------------------------------------------------------- + * + * HostIF_WaitForFreePages -- + * + * Waits for pages to be available for allocation or locking. + * + * Results: + * New pages are likely to be available for allocation or locking. + * + * Side effects: + * none + * + *---------------------------------------------------------------------- + */ + +void +HostIF_WaitForFreePages(unsigned int timeoutMs) // IN: +{ + static unsigned count; + msleep_interruptible(timeoutMs); + count++; +} + + +/* + *---------------------------------------------------------------------- + * + * HostIFReadUptimeWork -- + * + * Reads the current uptime. The uptime is based on getimeofday, + * which provides the needed high resolution. However, we don't + * want uptime to be warped by e.g. calls to settimeofday. So, we + * use a jiffies based monotonic clock to sanity check the uptime. + * If the uptime is more than one second from the monotonic time, + * we assume that the time of day has been set, and recalculate the + * uptime base to get uptime back on track with monotonic time. On + * the other hand, we do expect jiffies based monotonic time and + * timeofday to have small drift (due to NTP rate correction, etc). + * We handle this by rebasing the jiffies based monotonic clock + * every second (see HostIFUptimeResyncMono). + * + * Results: + * The uptime, in units of UPTIME_FREQ. 
Also returns the jiffies + * value that was used in the monotonic time calculation. + * + * Side effects: + * May reset the uptime base in the case gettimeofday warp was + * detected. + * + *---------------------------------------------------------------------- + */ + +static uint64 +HostIFReadUptimeWork(unsigned long *j) // OUT: current jiffies +{ + struct timeval tv; + uint64 monotime, uptime, upBase, monoBase; + int64 diff; + uint32 version; + unsigned long jifs, jifBase; + unsigned int attempts = 0; + + /* Assert that HostIF_InitUptime has been called. */ + ASSERT(uptimeState.timer.function); + + retry: + do { + version = VersionedAtomic_BeginTryRead(&uptimeState.version); + jifs = jiffies; + jifBase = uptimeState.jiffiesBase; + monoBase = uptimeState.monotimeBase; + } while (!VersionedAtomic_EndTryRead(&uptimeState.version, version)); + + do_gettimeofday(&tv); + upBase = Atomic_Read64(&uptimeState.uptimeBase); + + monotime = (uint64)(jifs - jifBase) * (UPTIME_FREQ / HZ); + monotime += monoBase; + + uptime = tv.tv_usec * (UPTIME_FREQ / 1000000) + tv.tv_sec * UPTIME_FREQ; + uptime += upBase; + + /* + * Use the jiffies based monotonic time to sanity check gettimeofday. + * If they differ by more than one second, assume the time of day has + * been warped, and use the jiffies time to undo (most of) the warp. + */ + + diff = uptime - monotime; + if (UNLIKELY(diff < -UPTIME_FREQ || diff > UPTIME_FREQ)) { + /* Compute a new uptimeBase to get uptime back on track. */ + uint64 newUpBase = monotime - (uptime - upBase); + + attempts++; + if (!Atomic_CMPXCHG64(&uptimeState.uptimeBase, &upBase, &newUpBase) && + attempts < 5) { + /* Another thread updated uptimeBase. Recalculate uptime. */ + goto retry; + } + uptime = monotime; + + Log("%s: detected settimeofday: fixed uptimeBase old %"FMT64"u " + "new %"FMT64"u attempts %u\n", __func__, + upBase, newUpBase, attempts); + } + *j = jifs; + + return uptime; +} + + +/* + *---------------------------------------------------------------------- + * + * HostIFUptimeResyncMono -- + * + * Timer that fires ever second to resynchronize the jiffies based + * monotonic time with the uptime. + * + * Results: + * None + * + * Side effects: + * Resets the monotonic time bases so that jiffies based monotonic + * time does not drift from gettimeofday over the long term. + * + *---------------------------------------------------------------------- + */ + +static void +HostIFUptimeResyncMono(unsigned long data) // IN: ignored +{ + unsigned long jifs; + uintptr_t flags; + + /* + * Read the uptime and the corresponding jiffies value. This will + * also correct the uptime (which is based on time of day) if needed + * before we rebase monotonic time (which is based on jiffies). + */ + + uint64 uptime = HostIFReadUptimeWork(&jifs); + + /* + * Every second, recalculate monoBase and jiffiesBase to squash small + * drift between gettimeofday and jiffies. Also, this prevents + * (jiffies - jiffiesBase) wrap on 32-bits. + */ + + SAVE_FLAGS(flags); + CLEAR_INTERRUPTS(); + VersionedAtomic_BeginWrite(&uptimeState.version); + + uptimeState.monotimeBase = uptime; + uptimeState.jiffiesBase = jifs; + + VersionedAtomic_EndWrite(&uptimeState.version); + RESTORE_FLAGS(flags); + + /* Reschedule this timer to expire in one second. */ + mod_timer(&uptimeState.timer, jifs + HZ); +} + + +/* + *---------------------------------------------------------------------- + * + * HostIF_InitUptime -- + * + * Initialize the uptime clock's state. 
+ * + * Results: + * None + * + * Side effects: + * Sets the initial values for the uptime state, and schedules + * the uptime timer. + * + *---------------------------------------------------------------------- + */ + +void +HostIF_InitUptime(void) +{ + struct timeval tv; + + uptimeState.jiffiesBase = jiffies; + do_gettimeofday(&tv); + Atomic_Write64(&uptimeState.uptimeBase, + -(tv.tv_usec * (UPTIME_FREQ / 1000000) + + tv.tv_sec * UPTIME_FREQ)); + + init_timer(&uptimeState.timer); + uptimeState.timer.function = HostIFUptimeResyncMono; + mod_timer(&uptimeState.timer, jiffies + HZ); +} + + +/* + *---------------------------------------------------------------------- + * + * HostIF_CleanupUptime -- + * + * Cleanup uptime state, called at module unloading time. + * + * Results: + * None + * + * Side effects: + * Deschedule the uptime timer. + * + *---------------------------------------------------------------------- + */ + +void +HostIF_CleanupUptime(void) +{ + del_timer_sync(&uptimeState.timer); +} + + +/* + *---------------------------------------------------------------------- + * + * HostIF_ReadUptime -- + * + * Read the system time. Returned value has no particular absolute + * value, only difference since previous call should be used. + * + * Results: + * Units are given by HostIF_UptimeFrequency. + * + * Side effects: + * See HostIFReadUptimeWork + * + *---------------------------------------------------------------------- + */ + +uint64 +HostIF_ReadUptime(void) +{ + unsigned long jifs; + + return HostIFReadUptimeWork(&jifs); +} + + +/* + *---------------------------------------------------------------------- + * + * HostIF_UptimeFrequency + * + * Return the frequency of the counter that HostIF_ReadUptime reads. + * + * Results: + * Frequency in Hz. + * + * Side effects: + * None + * + *---------------------------------------------------------------------- + */ + +uint64 +HostIF_UptimeFrequency(void) +{ + return UPTIME_FREQ; +} + + +/* + *----------------------------------------------------------------------------- + * + * HostIF_CopyFromUser -- + * + * Copy memory from the user application into a kernel buffer. This + * function may block, so don't call it while holding any kind of + * lock. --hpreg + * + * Results: + * 0 on success + * -EFAULT on failure. + * + * Side effects: + * None + * + *----------------------------------------------------------------------------- + */ + +int +HostIF_CopyFromUser(void *dst, // OUT + const void *src, // IN + unsigned int len) // IN +{ + return copy_from_user(dst, src, len) ? -EFAULT : 0; +} + + +/* + *----------------------------------------------------------------------------- + * + * HostIF_CopyToUser -- + * + * Copy memory to the user application from a kernel buffer. This + * function may block, so don't call it while holding any kind of + * lock. --hpreg + * + * Results: + * 0 on success + * -EFAULT on failure. + * + * Side effects: + * None + * + *----------------------------------------------------------------------------- + */ + +int +HostIF_CopyToUser(void *dst, // OUT + const void *src, // IN + unsigned int len) // IN +{ + return copy_to_user(dst, src, len) ? -EFAULT : 0; +} + + +/* + *----------------------------------------------------------------------------- + * + * HostIF_MapCrossPage -- + * + * Obtain kernel pointer to crosspage. + * + * We must return a VA that is obtained through a kernel mapping, so that + * the mapping never goes away (see bug 29753). 
+ * + * However, the LA corresponding to that VA must not overlap with the + * monitor (see bug 32922). The userland code ensures that by only + * allocating cross pages from low memory. For those pages, the kernel + * uses a permanent mapping, instead of a temporary one with a high LA. + * + * Results: + * The kernel virtual address on success + * NULL on failure + * + * Side effects: + * None + * + *----------------------------------------------------------------------------- + */ + +void * +HostIF_MapCrossPage(VMDriver *vm, // IN + VA64 uAddr) // IN +{ + void *p = VA64ToPtr(uAddr); + struct page *page; + VA vPgAddr; + VA ret; + + if (HostIFGetUserPages(p, &page, 1)) { + return NULL; + } + vPgAddr = (VA) MapCrossPage(page); + HostIF_GlobalLock(16); + if (vm->vmhost->crosspagePagesCount >= MAX_INITBLOCK_CPUS) { + HostIF_GlobalUnlock(16); + UnmapCrossPage(page, (void*)vPgAddr); + + return NULL; + } + vm->vmhost->crosspagePages[vm->vmhost->crosspagePagesCount++] = page; + HostIF_GlobalUnlock(16); + + ret = vPgAddr | (((VA)p) & (PAGE_SIZE - 1)); + + return (void*)ret; +} + + +/* + *----------------------------------------------------------------------------- + * + * HostIF_AllocCrossGDT -- + * + * Allocate the per-vmmon cross GDT page set. + * + * See bora/doc/worldswitch-pages.txt for the requirements on the cross + * GDT page set addresses. + * + * Results: + * On success: Host kernel virtual address of the first cross GDT page. + * Use HostIF_FreeCrossGDT() with the same value to free. + * The 'crossGDTMPNs' array is filled with the MPNs of all the + * cross GDT pages. + * On failure: NULL. + * + * Side effects: + * None + * + *----------------------------------------------------------------------------- + */ + +void * +HostIF_AllocCrossGDT(uint32 numPages, // IN: Number of pages + MPN maxValidFirst, // IN: Highest valid MPN of first page + MPN *crossGDTMPNs) // OUT: Array of MPNs +{ + MPN startMPN; + struct page *pages; + uint32 i; + void *crossGDT; + + /* + * In practice, allocating a low page (MPN <= 0x100000 - 1) is equivalent to + * allocating a page with MPN <= 0xFEC00 - 1: + * + * o PC architecture guarantees that there is no RAM in top 16MB of 4GB + * range. + * + * o 0xFEC00000 is IOAPIC base. There could be RAM immediately below, + * but not above. + * + * How do we allocate a low page? We can safely use GFP_DMA32 when + * available. On 64bit kernels before GFP_DMA32 was introduced we + * fall back to DMA zone (which is not quite necessary for boxes + * with less than ~3GB of memory). On 32bit kernels we are using + * normal zone - which is usually 1GB, and at most 4GB (for 4GB/4GB + * kernels). And for 4GB/4GB kernels same restriction as for 64bit + * kernels applies - there is no RAM in top 16MB immediately below + * 4GB so alloc_pages() cannot return such page. + */ + + ASSERT(0xFEC00 - 1 <= maxValidFirst); + for (i = 0; (1 << i) < numPages; i++) { } +#ifdef GFP_DMA32 + pages = alloc_pages(GFP_KERNEL | GFP_DMA32, i); +#else + pages = alloc_pages(GFP_KERNEL | GFP_DMA, i); +#endif + crossGDT = NULL; + if (pages == NULL) { + Warning("%s: unable to alloc crossGDT (%u)\n", __func__, i); + } else { + startMPN = page_to_pfn(pages); + for (i = 0; i < numPages; i++) { + crossGDTMPNs[i] = startMPN + i; + } + crossGDT = (void *)page_address(pages); + } + + return crossGDT; +} + + +/* + *----------------------------------------------------------------------------- + * + * HostIF_FreeCrossGDT -- + * + * Free the per-vmmon cross GDT page set allocated with + * HostIF_AllocCrossGDT(). 
+ * + * Results: + * None + * + * Side effects: + * None + * + *----------------------------------------------------------------------------- + */ + +void +HostIF_FreeCrossGDT(uint32 numPages, // IN: Number of pages + void *crossGDT) // IN: Kernel VA of first cross GDT page +{ + uint32 i; + + for (i = 0; (1 << i) < numPages; i++) { } + free_pages((VA)crossGDT, i); +} + + +/* + *----------------------------------------------------------------------------- + * + * HostIF_VMLock -- + * + * Grabs per-VM data structure lock. The lock is not recursive. + * The global lock has lower rank so the global lock should be grabbed + * first if both locks are acquired. + * + * It should be a medium contention lock. Also it should be fast: + * it is used for protecting of frequent page allocation and locking. + * + * Results: + * None + * + * Side effects: + * The current thread is rescheduled if the lock is busy. + * + *----------------------------------------------------------------------------- + */ + +void +HostIF_VMLock(VMDriver *vm, // IN + int callerID) // IN +{ + ASSERT(vm); + + ASSERT(vm->vmhost); + MutexLock(&vm->vmhost->vmMutex, callerID); +} + + +/* + *----------------------------------------------------------------------------- + * + * HostIF_VMUnlock -- + * + * Releases per-VM data structure lock. + * + * Results: + * None + * + * Side effects: + * Can wake up the thread blocked on this lock. + * + *----------------------------------------------------------------------------- + */ + +void +HostIF_VMUnlock(VMDriver *vm, // IN + int callerID) // IN +{ + ASSERT(vm); + + ASSERT(vm->vmhost); + MutexUnlock(&vm->vmhost->vmMutex, callerID); +} + + +#ifdef VMX86_DEBUG +/* + *----------------------------------------------------------------------------- + * + * HostIF_VMLockIsHeld -- + * + * Determine if the per-VM lock is held by the current thread. + * + * Results: + * TRUE if yes + * FALSE if no + * + * Side effects: + * None + * + *----------------------------------------------------------------------------- + */ + +Bool +HostIF_VMLockIsHeld(VMDriver *vm) // IN +{ + ASSERT(vm); + ASSERT(vm->vmhost); + + return MutexIsLocked(&vm->vmhost->vmMutex); +} +#endif + + +/* + * Utility routines for accessing and enabling the APIC + */ + +/* + * Defines for accessing the APIC. We use readl/writel to access the APIC + * which is how Linux wants you to access I/O memory (though on the x86 + * just dereferencing a pointer works just fine). + */ +#define APICR_TO_ADDR(apic, reg) (apic + (reg << 4)) +#define GET_APIC_REG(apic, reg) (readl(APICR_TO_ADDR(apic, reg))) +#define SET_APIC_REG(apic, reg, val) (writel(val, APICR_TO_ADDR(apic, reg))) + +#define APIC_MAXLVT(apic) ((GET_APIC_REG(apic, APICR_VERSION) >> 16) & 0xff) +#define APIC_VERSIONREG(apic) (GET_APIC_REG(apic, APICR_VERSION) & 0xff) + + +#if defined(CONFIG_SMP) || defined(CONFIG_X86_UP_IOAPIC) || \ + defined(CONFIG_X86_UP_APIC) || defined(CONFIG_X86_LOCAL_APIC) +/* + *---------------------------------------------------------------------- + * + * isVAReadable -- + * + * Verify that passed VA is accessible without crash... + * + * Results: + * TRUE if address is readable, FALSE otherwise. + * + * Side effects: + * None. 
+ * + *---------------------------------------------------------------------- + */ + +static Bool +isVAReadable(VA r) // IN: +{ + mm_segment_t old_fs; + uint32 dummy; + int ret; + + old_fs = get_fs(); + set_fs(get_ds()); + r = APICR_TO_ADDR(r, APICR_VERSION); + ret = HostIF_CopyFromUser(&dummy, (void*)r, sizeof(dummy)); + set_fs(old_fs); + + return ret == 0; +} + + +/* + *---------------------------------------------------------------------- + * + * SetVMAPICAddr -- + * + * Maps the host cpu's APIC. The virtual address is stashed in + * the VMDriver structure. + * + * Results: + * None. + * + * Side effects: + * The VMDriver structure is updated. + * + *---------------------------------------------------------------------- + */ + +static void +SetVMAPICAddr(VMDriver *vm, // IN/OUT: driver state + MA ma) // IN: host APIC's ma +{ + volatile void *hostapic; + + ASSERT_ON_COMPILE(APICR_SIZE <= PAGE_SIZE); + hostapic = (volatile void *) ioremap_nocache(ma, PAGE_SIZE); + if (hostapic) { + if ((APIC_VERSIONREG(hostapic) & 0xF0) == 0x10) { + vm->hostAPIC.base = (volatile uint32 (*)[4]) hostapic; + ASSERT(vm->vmhost != NULL); + vm->vmhost->hostAPICIsMapped = TRUE; + } else { + iounmap((void*)hostapic); + } + } +} + + +/* + *---------------------------------------------------------------------- + * + * ProbeAPIC -- + * + * Attempts to map the host APIC. + * + * Most versions of Linux already provide access to a mapped + * APIC. This function is just a backup. + * + * Caveat: We assume that the APIC physical address is the same + * on all host cpus. + * + * Results: + * TRUE if APIC was found, FALSE if not. + * + * Side effects: + * May map the APIC. + * + *---------------------------------------------------------------------- + */ + +static Bool +ProbeAPIC(VMDriver *vm, // IN/OUT: driver state + Bool setVMPtr) // IN: set a pointer to the APIC's virtual address +{ + MA ma = APIC_GetMA(); + + if (ma == (MA)-1) { + return FALSE; + } + + if (setVMPtr) { + SetVMAPICAddr(vm, ma); + } else { + vm->hostAPIC.base = NULL; + } + + return TRUE; +} +#endif + + +/* + *---------------------------------------------------------------------- + * + * HostIF_APICInit -- + * + * Initialize APIC behavior. + * Attempts to map the host APIC into vm->hostAPIC. + * + * We don't attempt to refresh the mapping after a host cpu + * migration. Fortunately, hosts tend to use the same address + * for all APICs. + * + * Most versions of Linux already provide a mapped APIC. We + * have backup code to read APIC_BASE and map it, if needed. + * + * Results: + * TRUE + * + * Side effects: + * May map the host APIC. 
+ * + *---------------------------------------------------------------------- + */ +Bool +HostIF_APICInit(VMDriver *vm, // IN: + Bool setVMPtr, // IN: + Bool probe) // IN: force probing +{ +#if defined(CONFIG_SMP) || defined(CONFIG_X86_UP_IOAPIC) || \ + defined(CONFIG_X86_UP_APIC) || defined(CONFIG_X86_LOCAL_APIC) + static Bool apicIPILogged = FALSE; + VA kAddr; + + monitorIPIVector = SPURIOUS_APIC_VECTOR; +#if defined(POSTED_INTR_VECTOR) + hvIPIVector = POSTED_INTR_VECTOR; +#else + hvIPIVector = 0; +#endif + + + if (!apicIPILogged) { + Log("Monitor IPI vector: %x\n", monitorIPIVector); + Log("HV IPI vector: %x\n", hvIPIVector); + apicIPILogged = TRUE; + } + + if ((__GET_MSR(MSR_APIC_BASE) & APIC_MSR_X2APIC_ENABLED) != 0) { + if (setVMPtr) { + vm->hostAPIC.base = NULL; + vm->vmhost->hostAPICIsMapped = FALSE; + vm->hostAPIC.isX2 = TRUE; + } + return TRUE; + } + + if (probe && ProbeAPIC(vm, setVMPtr)) { + return TRUE; + } + + /* + * Normal case: use Linux's pre-mapped APIC. + */ + kAddr = __fix_to_virt(FIX_APIC_BASE); + if (!isVAReadable(kAddr)) { + return TRUE; + } + if (setVMPtr) { + vm->hostAPIC.base = (void *)kAddr; + } else { + vm->hostAPIC.base = NULL; + } +#endif + return TRUE; +} + + +/* + *----------------------------------------------------------------------------- + * + * HostIF_SemaphoreWait -- + * + * Perform the semaphore wait (P) operation, possibly blocking. + * + * Result: + * 1 (which equals MX_WAITNORMAL) if success, + * negated error code otherwise. + * + * Side-effects: + * None + * + *----------------------------------------------------------------------------- + */ + +int +HostIF_SemaphoreWait(VMDriver *vm, // IN: + Vcpuid vcpuid, // IN: + uint64 *args) // IN: +{ + struct file *file; + mm_segment_t old_fs; + int res; + int waitFD = args[0]; + int timeoutms = args[2]; + uint64 value; + + file = vmware_fget(waitFD); + if (file == NULL) { + return MX_WAITERROR; + } + + old_fs = get_fs(); + set_fs(get_ds()); + + { + struct poll_wqueues table; + unsigned int mask; + + poll_initwait(&table); + current->state = TASK_INTERRUPTIBLE; + mask = file->f_op->poll(file, &table.pt); + if (!(mask & (POLLIN | POLLERR | POLLHUP))) { + vm->vmhost->vcpuSemaTask[vcpuid] = current; + schedule_timeout(timeoutms * HZ / 1000); // convert to Hz + vm->vmhost->vcpuSemaTask[vcpuid] = NULL; + } + current->state = TASK_RUNNING; + poll_freewait(&table); + } + + /* + * Userland only writes in multiples of sizeof(uint64). This will allow + * the code to happily deal with a pipe or an eventfd. We only care about + * reading no bytes (EAGAIN - non blocking fd) or sizeof(uint64). + */ + + res = file->f_op->read(file, (char *) &value, sizeof value, &file->f_pos); + + if (res == sizeof value) { + res = MX_WAITNORMAL; + } else { + if (res == 0) { + res = -EBADF; + } + } + + set_fs(old_fs); + fput(file); + + /* + * Handle benign errors: + * EAGAIN is MX_WAITTIMEDOUT. + * The signal-related errors are all mapped into MX_WAITINTERRUPTED. + */ + + switch (res) { + case -EAGAIN: + res = MX_WAITTIMEDOUT; + break; + case -EINTR: + case -ERESTART: + case -ERESTARTSYS: + case -ERESTARTNOINTR: + case -ERESTARTNOHAND: + res = MX_WAITINTERRUPTED; + break; + case -EBADF: + res = MX_WAITERROR; + break; + } + return res; +} + + +/* + *----------------------------------------------------------------------------- + * + * HostIF_SemaphoreForceWakeup -- + * + * For each VCPU in the set whose target process is lightly sleeping (i.e. + * TASK_INTERRUPTIBLE), wake it up. 
The target process can be waiting on a + * semaphore or due to a call to Vmx86_YieldToSet. + * + * Result: + * None. + * + * Side-effects: + * None + * + *----------------------------------------------------------------------------- + */ + +void +HostIF_SemaphoreForceWakeup(VMDriver *vm, // IN: + const VCPUSet *vcs) // IN: +{ + FOR_EACH_VCPU_IN_SET(vcs, vcpuid) { + struct task_struct *t = vm->vmhost->vcpuSemaTask[vcpuid]; + vm->vmhost->vcpuSemaTask[vcpuid] = NULL; + if (t && (t->state & TASK_INTERRUPTIBLE)) { + wake_up_process(t); + } + } ROF_EACH_VCPU_IN_SET(); +} + + +/* + *----------------------------------------------------------------------------- + * + * HostIF_SemaphoreSignal -- + * + * Perform the semaphore signal (V) operation. + * + * Result: + * On success: MX_WAITNORMAL (1). + * On error: MX_WAITINTERRUPTED (3) if interrupted by a Unix signal (we + * can block on a preemptive kernel). + * MX_WAITERROR (0) on generic error. + * Negated system error (< 0). + * + * Side-effects: + * None + * + *----------------------------------------------------------------------------- + */ + +int +HostIF_SemaphoreSignal(uint64 *args) // IN: +{ + struct file *file; + mm_segment_t old_fs; + int res; + int signalFD = args[1]; + uint64 value = 1; // make an eventfd happy should it be there + + file = vmware_fget(signalFD); + if (!file) { + return MX_WAITERROR; + } + + old_fs = get_fs(); + set_fs(get_ds()); + + /* + * Always write sizeof(uint64) bytes. This works fine for eventfd and + * pipes. The data written is formatted to make an eventfd happy should + * it be present. + */ + + res = file->f_op->write(file, (char *) &value, sizeof value, &file->f_pos); + + if (res == sizeof value) { + res = MX_WAITNORMAL; + } + + set_fs(old_fs); + fput(file); + + /* + * Handle benign errors: + * EAGAIN is MX_WAITTIMEDOUT. + * The signal-related errors are all mapped into MX_WAITINTERRUPTED. + */ + + switch (res) { + case -EAGAIN: + // The pipe is full, so it is already signalled. Success. + res = MX_WAITNORMAL; + break; + case -EINTR: + case -ERESTART: + case -ERESTARTSYS: + case -ERESTARTNOINTR: + case -ERESTARTNOHAND: + res = MX_WAITINTERRUPTED; + break; + } + return res; +} + +#if ((LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 27)) || !defined(CONFIG_SMP)) +# define VMMON_USE_CALL_FUNC +#endif + +#if defined(VMMON_USE_CALL_FUNC) +/* + *---------------------------------------------------------------------- + * + * LinuxDriverIPIHandler -- + * + * Null IPI handler - for monitor to notice AIO completion + * + *---------------------------------------------------------------------- + */ +void +LinuxDriverIPIHandler(void *info) +{ + return; +} + +#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 17) +#define VMMON_CALL_FUNC_SYNC 0 // async; we've not seen any problems +#else +#define VMMON_CALL_FUNC_SYNC 1 // sync; insure no problems from old releases +#endif + +#endif + + +/* + *---------------------------------------------------------------------- + * + * HostIF_IPI -- + * + * If the passed VCPU threads are on some CPUs in the system, + * attempt to hit them with an IPI. + * + * On older Linux systems we do a broadcast. + * + * Result: + * The mode used to send IPIs. 
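+ *      (Illustrative mapping, mirroring the body below: IPI_NONE when no
+ *      target VCPU is currently resident on a host CPU, IPI_BROADCAST when
+ *      the compat_smp_call_function() fallback is used, IPI_UNICAST when
+ *      arch_send_call_function_single_ipi() is available.)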
+ * + *---------------------------------------------------------------------- + */ + +HostIFIPIMode +HostIF_IPI(VMDriver *vm, // IN: + const VCPUSet *ipiTargets) // IN: +{ + HostIFIPIMode mode = IPI_NONE; + + ASSERT(vm); + + FOR_EACH_VCPU_IN_SET(ipiTargets, v) { + uint32 targetHostCpu = vm->currentHostCpu[v]; + if (targetHostCpu != INVALID_PCPU) { + ASSERT(targetHostCpu < MAX_PCPUS); +#if defined(VMMON_USE_CALL_FUNC) + /* older kernels IPI broadcast; use async when possible */ + (void) compat_smp_call_function(LinuxDriverIPIHandler, + NULL, VMMON_CALL_FUNC_SYNC); + mode = IPI_BROADCAST; + break; +#else + /* Newer kernels have (async) IPI targetting */ + arch_send_call_function_single_ipi(targetHostCpu); + mode = IPI_UNICAST; +#endif + } + } ROF_EACH_VCPU_IN_SET(); + + return mode; +} + + +typedef struct { + Atomic_uint32 index; + CPUIDQuery *query; +} HostIFGetCpuInfoData; + + +/* + *----------------------------------------------------------------------------- + * + * HostIFGetCpuInfo -- + * + * Collect CPUID information on the current logical CPU. + * + * Results: + * None. + * + * Side effects: + * 'data->index' is atomically incremented by one. + * + *----------------------------------------------------------------------------- + */ + +static void +HostIFGetCpuInfo(void *clientData) // IN/OUT: A HostIFGetCpuInfoData * +{ + HostIFGetCpuInfoData *data = (HostIFGetCpuInfoData *)clientData; + CPUIDQuery *query; + uint32 index; + + ASSERT(data); + query = data->query; + ASSERT(query); + + index = Atomic_ReadInc32(&data->index); + if (index >= query->numLogicalCPUs) { + return; + } + + query->logicalCPUs[index].tag = HostIF_GetCurrentPCPU(); + __GET_CPUID2(query->eax, query->ecx, &query->logicalCPUs[index].regs); +} + + +/* + *----------------------------------------------------------------------------- + * + * HostIF_GetAllCpuInfo -- + * + * Collect CPUID information on all logical CPUs. + * + * 'query->numLogicalCPUs' is the size of the 'query->logicalCPUs' output + * array. + * + * Results: + * On success: TRUE. 'query->logicalCPUs' is filled and + * 'query->numLogicalCPUs' is adjusted accordingly. + * On failure: FALSE. Happens if 'query->numLogicalCPUs' was too small. + * + * Side effects: + * None + * + *----------------------------------------------------------------------------- + */ + +Bool +HostIF_GetAllCpuInfo(CPUIDQuery *query) // IN/OUT +{ + HostIFGetCpuInfoData data; + + Atomic_Write32(&data.index, 0); + data.query = query; + + /* + * XXX Linux has userland APIs to bind a thread to a processor, so we could + * probably implement this in userland like we do on Win32. + */ + + HostIF_CallOnEachCPU(HostIFGetCpuInfo, &data); + + /* + * At this point, Atomic_Read32(&data.index) is the number of logical CPUs + * who replied. + */ + + if (Atomic_Read32(&data.index) > query->numLogicalCPUs) { + return FALSE; + } + + ASSERT(Atomic_Read32(&data.index) <= query->numLogicalCPUs); + query->numLogicalCPUs = Atomic_Read32(&data.index); + + return TRUE; +} + + +/* + *---------------------------------------------------------------------- + * + * HostIF_CallOnEachCPU -- + * + * Call specified function once on each CPU. No ordering guarantees. + * + * Results: + * None. + * + * Side effects: + * None. May be slow. 
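+ *
+ *      Usage sketch (this is how HostIF_GetAllCpuInfo() above drives it):
+ *
+ *          HostIFGetCpuInfoData data;
+ *          Atomic_Write32(&data.index, 0);
+ *          data.query = query;
+ *          HostIF_CallOnEachCPU(HostIFGetCpuInfo, &data);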
+ * + *---------------------------------------------------------------------- + */ + +void +HostIF_CallOnEachCPU(void (*func)(void*), // IN: function to call + void *data) // IN/OUT: argument to function +{ + preempt_disable(); + (*func)(data); + (void)compat_smp_call_function(*func, data, 1); + preempt_enable(); +} + + +/* + *---------------------------------------------------------------------- + * + * HostIF_ReadPage -- + * + * puts the content of a machine page into a kernel or user mode + * buffer. + * + * Results: + * 0 on success + * negative error code on error + * + * Side effects: + * none + * + *---------------------------------------------------------------------- + */ + +int +HostIF_ReadPage(MPN mpn, // MPN of the page + VA64 addr, // buffer for data + Bool kernelBuffer) // is the buffer in kernel space? +{ + void *buf = VA64ToPtr(addr); + int ret = 0; + const void* ptr; + struct page* page; + + if (mpn == INVALID_MPN) { + return -EFAULT; + } + + page = pfn_to_page(mpn); + ptr = kmap(page); + if (ptr == NULL) { + return -ENOMEM; + } + + if (kernelBuffer) { + memcpy(buf, ptr, PAGE_SIZE); + } else { + ret = HostIF_CopyToUser(buf, ptr, PAGE_SIZE); + } + kunmap(page); + + return ret; +} + + +/* + *---------------------------------------------------------------------- + * + * HostIF_WritePage -- + * + * Put the content of a kernel or user mode buffer into a machine + * page. + * + * Results: + * 0 on success + * negative error code on error + * + * Side effects: + * none + * + *---------------------------------------------------------------------- + */ + +int +HostIF_WritePage(MPN mpn, // MPN of the page + VA64 addr, // data to write to the page + Bool kernelBuffer) // is the buffer in kernel space? +{ + void const *buf = VA64ToPtr(addr); + int ret = 0; + void* ptr; + struct page* page; + + if (mpn == INVALID_MPN) { + return -EFAULT; + } + + page = pfn_to_page(mpn); + ptr = kmap(page); + if (ptr == NULL) { + return -ENOMEM; + } + + if (kernelBuffer) { + memcpy(ptr, buf, PAGE_SIZE); + } else { + ret = HostIF_CopyFromUser(ptr, buf, PAGE_SIZE); + } + kunmap(page); + + return ret; +} + + +/* + *---------------------------------------------------------------------- + * + * HostIF_GetLockedPageList -- + * + * puts MPNs of pages that were allocated by HostIF_AllocLockedPages() + * into user mode buffer. + * + * Results: + * non-negative number of the MPNs in the buffer on success. + * negative error code on error (-EFAULT) + * + * Side effects: + * none + * + *---------------------------------------------------------------------- + */ + +int +HostIF_GetLockedPageList(VMDriver* vm, // IN: VM instance pointer + VA64 uAddr, // OUT: user mode buffer for MPNs + unsigned int numPages) // IN: size of the buffer in MPNs +{ + MPN *mpns = VA64ToPtr(uAddr); + MPN mpn; + unsigned count; + + struct PhysTracker* AWEPages; + + if (!vm->vmhost || !vm->vmhost->AWEPages) { + return 0; + } + AWEPages = vm->vmhost->AWEPages; + + for (mpn = 0, count = 0; + (count < numPages) && + (INVALID_MPN != (mpn = PhysTrack_GetNext(AWEPages, mpn))); + count++) { + + if (HostIF_CopyToUser(&mpns[count], &mpn, sizeof *mpns) != 0) { + return -EFAULT; + } + } + + return count; +} + + +/* + *----------------------------------------------------------------------------- + * + * HostIF_GetNextAnonPage -- + * + * If "inMPN" is INVALID_MPN gets the first MPN in the anon mpn list else + * gets the anon mpn after "inMPN" in the anon mpn list. + * + * Results: + * Next anon MPN. If the list has been exhausted, returns INVALID_MPN. 
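+ *
+ *      Iteration sketch based on the contract above (illustrative only):
+ *
+ *          MPN mpn;
+ *          for (mpn = HostIF_GetNextAnonPage(vm, INVALID_MPN);
+ *               mpn != INVALID_MPN;
+ *               mpn = HostIF_GetNextAnonPage(vm, mpn)) {
+ *             ... visit each anon MPN ...
+ *          }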
+ * + *----------------------------------------------------------------------------- + */ + +MPN +HostIF_GetNextAnonPage(VMDriver *vm, MPN inMPN) +{ + if (!vm->vmhost || !vm->vmhost->AWEPages) { + return INVALID_MPN; + } + return PhysTrack_GetNext(vm->vmhost->AWEPages, inMPN); +} + + +/* + *---------------------------------------------------------------------- + * + * HostIF_GetCurrentPCPU -- + * + * Get current physical CPU id. Interrupts should be disabled so + * that the thread cannot move to another CPU. + * + * Results: + * Host CPU number. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +uint32 +HostIF_GetCurrentPCPU(void) +{ + return smp_processor_id(); +} + + +#ifdef VMMON_USE_COMPAT_SCHEDULE_HRTIMEOUT +/* + *---------------------------------------------------------------------- + * + * HostIFWakeupClockThread -- + * + * Wake up the fast clock thread. Can't do this from the timer + * callback, because it holds locks that the scheduling code + * might take. + * + * Results: + * None. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +static void +HostIFWakeupClockThread(unsigned long data) //IN: +{ + wake_up_process(linuxState.fastClockThread); +} + + +/* + *---------------------------------------------------------------------- + * + * HostIFTimerCallback -- + * + * Schedule a tasklet to wake up the fast clock thread. + * + * Results: + * Tell the kernel not to restart the timer. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +static enum hrtimer_restart +HostIFTimerCallback(struct hrtimer *timer) //IN: +{ + tasklet_schedule(&timerTasklet); + + return HRTIMER_NORESTART; +} + + +/* + *---------------------------------------------------------------------- + * + * HostIFScheduleHRTimeout -- + * + * Schedule an hrtimer to wake up the fast clock thread. + * + * Results: + * None. + * + * Side effects: + * Sleep. + * + *---------------------------------------------------------------------- + */ + +static void +HostIFScheduleHRTimeout(ktime_t *expires) //IN: +{ + struct hrtimer t; + + if (expires && !expires->tv64) { + __set_current_state(TASK_RUNNING); + + return; + } + + hrtimer_init(&t, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + t.function = HostIFTimerCallback; + hrtimer_start(&t, *expires, HRTIMER_MODE_REL); + + if (hrtimer_active(&t)) { + schedule(); + } + + hrtimer_cancel(&t); + __set_current_state(TASK_RUNNING); +} +#endif //VMMON_USE_COMPAT_SCHEDULE_HRTIMEOUT + + +#ifndef VMMON_USE_HIGH_RES_TIMERS +/* + *---------------------------------------------------------------------- + * + * HostIFDoIoctl -- + * + * Issue ioctl. Assume kernel is not locked. It is not true now, + * but it makes things easier to understand, and won't surprise us + * later when we get rid of kernel lock from our code. + * + * Results: + * Same as ioctl method. + * + * Side effects: + * none. + * + *---------------------------------------------------------------------- + */ + +static long +HostIFDoIoctl(struct file *filp, + u_int iocmd, + unsigned long ioarg) +{ + if (filp->f_op->unlocked_ioctl) { + return filp->f_op->unlocked_ioctl(filp, iocmd, ioarg); + } + return -ENOIOCTLCMD; +} +#endif //VMON_USE_HIGH_RES_TIMERS + + +/* + *---------------------------------------------------------------------- + * + * HostIFStartTimer -- + * + * Starts the timer using either /dev/rtc or high-resolution timers. 
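+ *
+ *      In the /dev/rtc path the requested rate is first reduced by HZ
+ *      (interrupts the host timer already provides) and then rounded up
+ *      to a power of two between 128 and 8192.  Worked example, assuming
+ *      HZ = 250: a requested 1100 Hz becomes 1100 - 250 = 850, so the RTC
+ *      is programmed to 1024 Hz rather than 2048 Hz.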
+ * + * Results: + * Returns 0 on success, -1 on failure. + * + * Side effects: + * Sleep until timer expires. + * + *---------------------------------------------------------------------- + */ + +int +HostIFStartTimer(Bool rateChanged, //IN: Did rate change? + unsigned int rate, //IN: current clock rate + struct file *filp) //IN: /dev/rtc descriptor +{ +#ifdef VMMON_USE_HIGH_RES_TIMERS + static unsigned long slack = 0; + static ktime_t expires; + int timerPeriod; + + if (rateChanged) { + timerPeriod = NSEC_PER_SEC / rate; + expires = ktime_set(0, timerPeriod); + /* + * Allow the kernel to expire the timer at its convenience. + * ppoll() uses 0.1% of the timeout value. I think we can + * tolerate 1%. + */ + + slack = timerPeriod / 100; + } + set_current_state(TASK_INTERRUPTIBLE); +# ifdef VMMON_USE_SCHEDULE_HRTIMEOUT + schedule_hrtimeout_range(&expires, slack, HRTIMER_MODE_REL); +# else + HostIFScheduleHRTimeout(&expires); +# endif +#else + unsigned p2rate; + int res; + unsigned long buf; + loff_t pos = 0; + + if (rateChanged) { + /* + * The host will already have HZ timer interrupts per second. So + * in order to satisfy the requested rate, we need up to (rate - + * HZ) additional interrupts generated by the RTC. That way, if + * the guest ask for a bit more than 1024 virtual interrupts per + * second (which is a common case for Windows with multimedia + * timers), we'll program the RTC to 1024 rather than 2048, which + * saves a considerable amount of CPU. PR 519228. + */ + if (rate > HZ) { + rate -= HZ; + } else { + rate = 0; + } + /* + * Don't set the RTC rate to 64 Hz or lower: some kernels have a + * bug in the HPET emulation of RTC that will cause the RTC + * frequency to get stuck at 64Hz. See PR 519228 comment #23. + */ + p2rate = 128; + // Hardware rate must be a power of 2 + while (p2rate < rate && p2rate < 8192) { + p2rate <<= 1; + } + + res = HostIFDoIoctl(filp, RTC_IRQP_SET, p2rate); + if (res < 0) { + Warning("/dev/rtc set rate %d failed: %d\n", p2rate, res); + + return -1; + } + if (kthread_should_stop()) { + return -1; + } + } + res = filp->f_op->read(filp, (void *) &buf, sizeof(buf), &pos); + if (res <= 0) { + if (res != -ERESTARTSYS) { + Log("/dev/rtc read failed: %d\n", res); + } + + return -1; + } +#endif + + return 0; +} + + +/* + *---------------------------------------------------------------------- + * + * HostIFFastClockThread -- + * + * Kernel thread that provides finer-grained wakeups than the + * main system timers by using /dev/rtc. We can't do this at + * user level because /dev/rtc is not sharable (PR 19266). Also, + * we want to avoid the overhead of a context switch out to user + * level on every RTC interrupt. + * + * Results: + * Returns 0. + * + * Side effects: + * Wakeups and IPIs. + * + *---------------------------------------------------------------------- + */ + +static int +HostIFFastClockThread(void *data) // IN: +{ + struct file *filp = (struct file *) data; + int res; + mm_segment_t oldFS; + unsigned int rate = 0; + unsigned int prevRate = 0; + + oldFS = get_fs(); + set_fs(KERNEL_DS); + allow_signal(SIGKILL); + set_user_nice(current, linuxState.fastClockPriority); + + while ((rate = linuxState.fastClockRate) > MIN_RATE) { + if (kthread_should_stop()) { + goto out; + } + res = HostIFStartTimer(rate != prevRate, rate, filp); + if (res < 0) { + goto out; + } + prevRate = rate; + +#if defined(CONFIG_SMP) + /* + * IPI each VCPU thread that is in the monitor and is due to + * fire a MonTimer callback. 
+ */ + Vmx86_MonTimerIPI(); +#endif + + /* + * Wake threads that are waiting for a fast poll timeout at + * userlevel. This is needed only on Linux. On Windows, + * we get shorter timeouts simply by increasing the host + * clock rate. + */ + + LinuxDriverWakeUp(TRUE); + } + + out: + LinuxDriverWakeUp(TRUE); + set_fs(oldFS); + + /* + * Do not exit thread until we are told to do so. + */ + + do { + set_current_state(TASK_UNINTERRUPTIBLE); + if (kthread_should_stop()) { + break; + } + schedule(); + } while (1); + set_current_state(TASK_RUNNING); + + return 0; +} + + +/* + *----------------------------------------------------------------------------- + * + * HostIF_SetFastClockRate -- + * + * The monitor wants to poll for events at the given rate. + * Ensure that the host OS's timer interrupts come at least at + * this rate. If the requested rate is greater than the rate at + * which timer interrupts will occur on CPUs other than 0, then + * also arrange to call Vmx86_MonitorPollIPI on every timer + * interrupt, in order to relay IPIs to any other CPUs that need + * them. + * + * Locking: + * The caller must hold the fast clock lock. + * + * Results: + * 0 for success; positive error code if /dev/rtc could not be opened. + * + * Side effects: + * As described above. + * + *----------------------------------------------------------------------------- + */ + +int +HostIF_SetFastClockRate(unsigned int rate) // IN: Frequency in Hz. +{ + ASSERT(MutexIsLocked(&fastClockMutex)); + linuxState.fastClockRate = rate; + + /* + * Overview + * -------- + * An SMP Linux kernel programs the 8253 timer (to increment the 'jiffies' + * counter) _and_ all local APICs (to run the scheduler code) to deliver + * interrupts HZ times a second. + * + * Time + * ---- + * The kernel tries very hard to spread all these interrupts evenly over + * time, i.e. on a 1 CPU system, the 1 local APIC phase is shifted by 1/2 + * period compared to the 8253, and on a 2 CPU system, the 2 local APIC + * phases are respectively shifted by 1/3 and 2/3 period compared to the + * 8253. This is done to reduce contention on locks guarding the global task + * queue. + * + * Space + * ----- + * The 8253 interrupts are distributed between physical CPUs, evenly on a P3 + * system, whereas on a P4 system physical CPU 0 gets all of them. + * + * Long story short, unless the monitor requested rate is significantly + * higher than HZ, we don't need to send IPIs or exclusively grab /dev/rtc + * to periodically kick vCPU threads running in the monitor on all physical + * CPUs. + */ + + if (rate > MIN_RATE) { + if (!linuxState.fastClockThread) { + struct task_struct *rtcTask; + struct file *filp = NULL; + +#if !defined(VMMON_USE_HIGH_RES_TIMERS) + int res; + + filp = filp_open("/dev/rtc", O_RDONLY, 0); + if (IS_ERR(filp)) { + Warning("/dev/rtc open failed: %d\n", (int)(VA)filp); + + return -(int)(VA)filp; + } + res = HostIFDoIoctl(filp, RTC_PIE_ON, 0); + if (res < 0) { + Warning("/dev/rtc enable interrupt failed: %d\n", res); + filp_close(filp, current->files); + + return -res; + } +#endif + rtcTask = kthread_run(HostIFFastClockThread, filp, "vmware-rtc"); + if (IS_ERR(rtcTask)) { + long err = PTR_ERR(rtcTask); + + /* + * Ignore ERESTARTNOINTR silently, it occurs when signal is + * pending, and syscall layer automatically reissues operation + * after signal is handled. 
+ */ + + if (err != -ERESTARTNOINTR) { + Warning("/dev/rtc cannot start watch thread: %ld\n", err); + } + close_rtc(filp, current->files); + + return -err; + } + linuxState.fastClockThread = rtcTask; + linuxState.fastClockFile = filp; + } + } else { + if (linuxState.fastClockThread) { + force_sig(SIGKILL, linuxState.fastClockThread); + kthread_stop(linuxState.fastClockThread); + close_rtc(linuxState.fastClockFile, current->files); + + linuxState.fastClockThread = NULL; + linuxState.fastClockFile = NULL; + } + } + + return 0; +} + + +/* + *----------------------------------------------------------------------------- + * + * HostIF_MapUserMem -- + * + * Obtain kernel pointer to user memory. The pages backing the user memory + * address are locked into memory (this allows the pointer to be used in + * contexts where paging is undesirable or impossible). + * + * Results: + * On success, returns the kernel virtual address, along with a handle to + * be used for unmapping. + * On failure, returns NULL. + * + * Side effects: + * Yes. + * + *----------------------------------------------------------------------------- + */ + +void * +HostIF_MapUserMem(VA addr, // IN: User memory virtual address + size_t size, // IN: Size of memory desired + VMMappedUserMem **handle) // OUT: Handle to mapped memory +{ + void *p = (void *) (uintptr_t) addr; + VMMappedUserMem *newHandle; + VA offset = addr & (PAGE_SIZE - 1); + size_t numPagesNeeded = ((offset + size) / PAGE_SIZE) + 1; + size_t handleSize = + sizeof *newHandle + numPagesNeeded * sizeof newHandle->pages[0]; + void *mappedAddr; + + ASSERT(handle); + + if (!access_ok(VERIFY_WRITE, p, size)) { + printk(KERN_ERR "%s: Couldn't verify write to uva 0x%p with size %" + FMTSZ"u\n", __func__, p, size); + + return NULL; + } + + newHandle = kmalloc(handleSize, GFP_KERNEL); + if (newHandle == NULL) { + printk(KERN_ERR "%s: Couldn't allocate %"FMTSZ"u bytes of memory\n", + __func__, handleSize); + + return NULL; + } + + if (HostIFGetUserPages(p, newHandle->pages, numPagesNeeded)) { + kfree(newHandle); + printk(KERN_ERR "%s: Couldn't get %"FMTSZ"u %s for uva 0x%p\n", __func__, + numPagesNeeded, numPagesNeeded > 1 ? "pages" : "page", p); + + return NULL; + } + + if (numPagesNeeded > 1) { + /* + * Unlike kmap(), vmap() can fail. If it does, we need to release the + * pages that we acquired in HostIFGetUserPages(). + */ + + mappedAddr = vmap(newHandle->pages, numPagesNeeded, VM_MAP, PAGE_KERNEL); + if (mappedAddr == NULL) { + unsigned int i; + for (i = 0; i < numPagesNeeded; i++) { + put_page(newHandle->pages[i]); + } + kfree(newHandle); + printk(KERN_ERR "%s: Couldn't vmap %"FMTSZ"u %s for uva 0x%p\n", + __func__, numPagesNeeded, + numPagesNeeded > 1 ? "pages" : "page", p); + + return NULL; + } + } else { + mappedAddr = kmap(newHandle->pages[0]); + } + + printk(KERN_DEBUG "%s: p = 0x%p, offset = 0x%p, numPagesNeeded = %"FMTSZ"u," + " handleSize = %"FMTSZ"u, mappedAddr = 0x%p\n", + __func__, p, (void *)offset, numPagesNeeded, handleSize, mappedAddr); + + newHandle->numPages = numPagesNeeded; + newHandle->addr = mappedAddr; + *handle = newHandle; + + return mappedAddr + offset; +} + + +/* + *----------------------------------------------------------------------------- + * + * HostIF_UnmapUserMem -- + * + * Unmap user memory from HostIF_MapUserMem(). + * + * Results: + * None. + * + * Side effects: + * Yes. 
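+ *
+ *      Pairing sketch (names are illustrative; the handle and kernel
+ *      address come from HostIF_MapUserMem() above):
+ *
+ *          VMMappedUserMem *handle;
+ *          void *kva = HostIF_MapUserMem(uva, len, &handle);
+ *          if (kva != NULL) {
+ *             ... use the mapping ...
+ *             HostIF_UnmapUserMem(handle);
+ *          }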
+ * + *----------------------------------------------------------------------------- + */ + +void +HostIF_UnmapUserMem(VMMappedUserMem *handle) // IN: Handle to mapped memory +{ + unsigned int i; + + if (handle == NULL) { + return; + } + + printk(KERN_DEBUG "%s: numPages = %"FMTSZ"u, addr = 0x%p\n", + __func__, handle->numPages, handle->addr); + + if (handle->numPages > 1) { + vunmap(handle->addr); + } else { + kunmap(handle->pages[0]); + } + + for (i = 0; i < handle->numPages; i++) { + put_page(handle->pages[i]); + } + kfree(handle); +} + +/* + *----------------------------------------------------------------------------- + * + * HostIF_SafeRDMSR -- + * + * Attempt to read a MSR, and handle the exception if the MSR + * is unimplemented. + * + * Results: + * 0 if successful, and MSR value is returned via *val. + * + * If the MSR is unimplemented, *val is set to 0, and a + * non-zero value is returned: -1 for Win32, -EFAULT for Linux, + * and 1 for MacOS. + * + * Side effects: + * None + * + *----------------------------------------------------------------------------- + */ +int +HostIF_SafeRDMSR(unsigned int msr, // IN + uint64 *val) // OUT: MSR value +{ + int ret; + unsigned low, high; + asm volatile("2: rdmsr ; xor %0,%0\n" + "1:\n\t" + ".section .fixup,\"ax\"\n\t" + "3: mov %4,%0 ; jmp 1b\n\t" + ".previous\n\t" + VMW_ASM_EXTABLE(2b, 3b) + : "=r"(ret), "=a"(low), "=d"(high) + : "c"(msr), "i"(-EFAULT), "1"(0), "2"(0)); // init eax/edx to 0 + *val = (low | ((u64)(high) << 32)); + + return ret; +} + diff --git a/vmmon-hostif.c/hostif.c.new b/vmmon-hostif.c/hostif.c.new new file mode 100644 index 0000000..3440e28 --- /dev/null +++ b/vmmon-hostif.c/hostif.c.new @@ -0,0 +1,3611 @@ +/********************************************************* + * Copyright (C) 1998-2014 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation version 2 and no later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. 
+ * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + *********************************************************/ + +/* + * hostif.c -- + * + * This file implements the platform-specific (here Linux) interface that + * the cross-platform code uses --hpreg + * + */ + + +/* Must come before any kernel header file --hpreg */ +#include "driver-config.h" + +/* Must come before vmware.h --hpreg */ +#include "compat_page.h" +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +#include + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 25) +# include +#endif +#if defined(_ASM_EXTABLE) +# define VMW_ASM_EXTABLE(from, to) _ASM_EXTABLE(from, to) +#else + /* Compat version copied from asm.h of 2.6.25 kernel */ +# define VMW_ASM_FORM(x) " " #x " " +# define VMW_ASM_EX_SEC " .section __ex_table,\"a\"\n" +# ifdef CONFIG_X86_32 +# define VMW_ASM_SEL(a,b) VMW_ASM_FORM(a) +# else +# define VMW_ASM_SEL(a,b) VMW_ASM_FORM(b) +# endif +# define VMW_ASM_PTR VMW_ASM_SEL(.long, .quad) +# define VMW_ASM_ALIGN VMW_ASM_SEL(.balign 4, .balign 8) +# define VMW_ASM_EXTABLE(from,to) \ + VMW_ASM_EX_SEC \ + VMW_ASM_ALIGN "\n" \ + VMW_ASM_PTR #from "," #to "\n" \ + " .previous\n" +#endif + +#include +#include +#include +#include +#include +#include + + +#include "vmware.h" +#include "x86apic.h" +#include "vm_asm.h" +#include "modulecall.h" +#include "memtrack.h" +#include "phystrack.h" +#include "cpuid.h" +#include "cpuid_info.h" +#include "hostif.h" +#include "hostif_priv.h" +#include "driver.h" +#include "vmhost.h" +#include "x86msr.h" +#include "apic.h" +#include "memDefaults.h" +#include "vcpuid.h" + +#include "pgtbl.h" +#include "vmmonInt.h" +#include "versioned_atomic.h" + +/* + * Determine if we can use high resolution timers. + */ + +#ifdef CONFIG_HIGH_RES_TIMERS +# include +# define VMMON_USE_HIGH_RES_TIMERS +# if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 28) +# define VMMON_USE_SCHEDULE_HRTIMEOUT +# else +# define VMMON_USE_COMPAT_SCHEDULE_HRTIMEOUT +static void HostIFWakeupClockThread(unsigned long data); +static DECLARE_TASKLET(timerTasklet, HostIFWakeupClockThread, 0); +# endif +# define close_rtc(filp, files) do {} while(0) +#else +# define close_rtc(filp, files) filp_close(filp, files) +#endif + +#define UPTIME_FREQ CONST64(1000000) + +/* + * When CONFIG_NO_HZ_FULL is set processors can run tickless + * if there is only one runnable process. When set, the rate + * checks in HostIF_SetFastClockRate and HostIFFastClockThread + * need to be relaxed to allow any non-zero rate to run. + * + * This code can potentially be removed if/when we stop using + * HostIFFastClockThread to drive MonTimer. See PR1088247. + */ +#ifdef CONFIG_NO_HZ_FULL +#define MIN_RATE (0) +#else +#define MIN_RATE ((HZ) + (HZ) / 16) +#endif + +/* + * Linux seems to like keeping free memory around 30MB + * even under severe memory pressure. Let's give it a little + * more leeway than that for safety. + */ +#define LOCKED_PAGE_SLACK 10000 + +static struct { + Atomic_uint64 uptimeBase; + VersionedAtomic version; + uint64 monotimeBase; + unsigned long jiffiesBase; + struct timer_list timer; +} uptimeState; + +/* + * First Page Locking strategy + * --------------------------- + * + * An early implementation hacked the lock bit for the purpose of locking + * memory. 
This had a couple of advantages: + * - the vmscan algorithm would never eliminate mappings from the process + * address space + * - easy to assert that things are ok + * - it worked with anonymous memory. Basically, vmscan jumps over these + * pages, their use count stays high, .... + * + * This approach however had a couple of problems: + * + * - it relies on an undocumented interface. (in another words, a total hack) + * - it creates deadlock situations if the application gets a kill -9 or + * otherwise dies ungracefully. linux first tears down the address space, + * then closes file descriptors (including our own device). Unfortunately, + * this leads to a deadlock of the process on pages with the lock bit set. + * + * There is a workaround for that, namely to detect that condition using + * a linux timer. (ugly) + * + * Current Page Locking strategy + * ----------------------------- + * + * The current scheme does not use the lock bit, rather it increments the use + * count on the pages that need to be locked down in memory. + * + * The problem is that experiments on certain linux systems (e.g. 2.2.0-pre9) + * showed that linux somehow swaps out anonymous pages, even with the + * increased ref counter. + * Swapping them out to disk is not that big of a deal, but bringing them back + * to a different location is. In any case, anonymous pages in linux are not + * intended to be write-shared (e.g. try to MAP_SHARED /dev/zero). + * + * As a result, the current locking strategy requires that all locked pages are + * backed by the filesystem, not by swap. For now, we use both mapped files and + * sys V shared memory. The user application is responsible to cover these + * cases. + * + */ + + +#define HOST_UNLOCK_PFN(_vm, _pfn) do { \ + _vm = _vm; \ + put_page(pfn_to_page(_pfn)); \ +} while (0) + +#define HOST_UNLOCK_PFN_BYMPN(_vm, _pfn) do { \ + PhysTrack_Remove((_vm)->vmhost->lockedPages, (_pfn)); \ + put_page(pfn_to_page(_pfn)); \ +} while (0) + +uint8 monitorIPIVector; +uint8 hvIPIVector; + +/* + *----------------------------------------------------------------------------- + * + * MutexInit -- + * + * Initialize a Mutex. --hpreg + * + * Results: + * None + * + * Side effects: + * None + * + *----------------------------------------------------------------------------- + */ + +#ifdef VMX86_DEBUG +static INLINE void +MutexInit(Mutex *mutex, // IN + char const *name) // IN +{ + ASSERT(mutex); + ASSERT(name); + + sema_init(&mutex->sem, 1); + mutex->name = name; + mutex->cur.pid = -1; +} +#else +# define MutexInit(_mutex, _name) sema_init(&(_mutex)->sem, 1) +#endif + + +#ifdef VMX86_DEBUG +/* + *----------------------------------------------------------------------------- + * + * MutexIsLocked -- + * + * Determine if a Mutex is locked by the current thread. --hpreg + * + * Results: + * TRUE if yes + * FALSE if no + * + * Side effects: + * None + * + *----------------------------------------------------------------------------- + */ + +static INLINE Bool +MutexIsLocked(Mutex *mutex) // IN +{ + ASSERT(mutex); + + return mutex->cur.pid == current->pid; +} +#endif + + +/* + *----------------------------------------------------------------------------- + * + * MutexLock -- + * + * Acquire a Mutex. 
--hpreg + * + * Results: + * None + * + * Side effects: + * None + * + *----------------------------------------------------------------------------- + */ + +#ifdef VMX86_DEBUG +static INLINE void +MutexLock(Mutex *mutex, // IN + int callerID) // IN +{ + ASSERT(mutex); + ASSERT(!MutexIsLocked(mutex)); + + down(&mutex->sem); + mutex->cur.pid = current->pid; + mutex->cur.callerID = callerID; +} +#else +# define MutexLock(_mutex, _callerID) down(&(_mutex)->sem) +#endif + + +/* + *----------------------------------------------------------------------------- + * + * MutexUnlock -- + * + * Release a Mutex. --hpreg + * + * Results: + * None + * + * Side effects: + * None + * + *----------------------------------------------------------------------------- + */ + +#ifdef VMX86_DEBUG +static INLINE void +MutexUnlock(Mutex *mutex, // IN + int callerID) // IN +{ + ASSERT(mutex); + + ASSERT(MutexIsLocked(mutex) && mutex->cur.callerID == callerID); + mutex->prev = mutex->cur; + mutex->cur.pid = -1; + up(&mutex->sem); +} +#else +# define MutexUnlock(_mutex, _callerID) up(&(_mutex)->sem) +#endif + + +/* This mutex protects the driver-wide state. --hpreg */ +static Mutex globalMutex; + +/* + * This mutex protects the fast clock rate and is held while + * creating/destroying the fastClockThread. It ranks below + * globalMutex. We can't use globalMutex for this purpose because the + * fastClockThread itself acquires the globalMutex, so trying to hold + * the mutex while destroying the thread can cause a deadlock. + */ +static Mutex fastClockMutex; + +/* This mutex protects linuxState.pollList. */ +static Mutex pollListMutex; + + +/* + *---------------------------------------------------------------------- + * + * HostIF_PrepareWaitForThreads -- + * + * Prepare to wait for another vCPU thread. + * + * Results: + * FALSE: no way on Linux to determine we've already been signalled. + * + * Side effects: + * Current task is interruptible. + * + *---------------------------------------------------------------------- + */ + +Bool +HostIF_PrepareWaitForThreads(VMDriver *vm, // IN: + Vcpuid currVcpu) // IN: +{ + set_current_state(TASK_INTERRUPTIBLE); + vm->vmhost->vcpuSemaTask[currVcpu] = current; + return FALSE; +} + + +/* + *---------------------------------------------------------------------- + * + * HostIF_WaitForThreads -- + * + * Wait for another vCPU thread. + * + * Results: + * None. + * + * Side effects: + * Current task may block. + * + *---------------------------------------------------------------------- + */ + +void +HostIF_WaitForThreads(VMDriver *vm, // UNUSED: + Vcpuid currVcpu) // UNUSED: + +{ +#ifdef VMMON_USE_SCHEDULE_HRTIMEOUT + ktime_t timeout = ktime_set(0, CROSSCALL_SLEEP_US * 1000); + schedule_hrtimeout(&timeout, HRTIMER_MODE_REL); +#else + /* Fallback to ms timer resolution is fine for older kernels. */ + schedule_timeout(msecs_to_jiffies(CROSSCALL_SLEEP_US / 1000) + 1); +#endif +} + + +/* + *---------------------------------------------------------------------- + * + * HostIF_CancelWaitForThreads -- + * + * Cancel waiting for another vCPU thread. + * + * Results: + * None. + * + * Side effects: + * Current task is running and no longer interruptible. 
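+ *
+ *      Expected calling pattern (sketch; the actual caller lives in the
+ *      cross-platform code and is not shown in this file):
+ *
+ *          if (!HostIF_PrepareWaitForThreads(vm, vcpuid)) {
+ *             HostIF_WaitForThreads(vm, vcpuid);
+ *          }
+ *          HostIF_CancelWaitForThreads(vm, vcpuid);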
+ * + *---------------------------------------------------------------------- + */ + +void +HostIF_CancelWaitForThreads(VMDriver *vm, // IN: + Vcpuid currVcpu) // IN: +{ + vm->vmhost->vcpuSemaTask[currVcpu] = NULL; + set_current_state(TASK_RUNNING); +} + + +/* + *---------------------------------------------------------------------- + * + * HostIF_WakeUpYielders -- + * + * Wakeup vCPUs that are waiting for the current vCPU. + * + * Results: + * The requested vCPUs are nudged if they are sleeping due to + * Vmx86_YieldToSet. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +void +HostIF_WakeUpYielders(VMDriver *vm, // IN: + Vcpuid currVcpu) // IN: +{ + VCPUSet req; + Vcpuid vcpuid; + uint64 subset; + + /* + * PR 1142958: if the VCPUs woken in the crosscallWaitSet re-add themselves + * to this set faster than it can be fully drained, this function never + * exits. Instead, we copy and remove a snapshot of the crosscallWaitSet + * and locally wake up just that snapshot. It is ok that we don't get a + * fully coherent snapshot, as long as the subset copy-and-remove is atomic + * so no VCPU added is lost entirely. + */ + + VCPUSet_Empty(&req); + FOR_EACH_SUBSET_IN_SET(subIdx) { + subset = VCPUSet_AtomicReadWriteSubset(&vm->crosscallWaitSet[currVcpu], + 0, subIdx); + VCPUSet_UnionSubset(&req, subset, subIdx); + } ROF_EACH_SUBSET_IN_SET(); + + preempt_disable(); + while ((vcpuid = VCPUSet_FindFirst(&req)) != VCPUID_INVALID) { + struct task_struct *t = vm->vmhost->vcpuSemaTask[vcpuid]; + VCPUSet_Remove(&req, vcpuid); + if (t && (t->state & TASK_INTERRUPTIBLE)) { + wake_up_process(t); + } + } + preempt_enable(); +} + + +/* + *----------------------------------------------------------------------------- + * + * HostIF_InitGlobalLock -- + * + * Initialize the global (across all VMs and vmmon) locks. + * + * Results: + * None + * + * Side effects: + * None + * + *----------------------------------------------------------------------------- + */ + +void +HostIF_InitGlobalLock(void) +{ + MutexInit(&globalMutex, "global"); + MutexInit(&fastClockMutex, "fastClock"); + MutexInit(&pollListMutex, "pollList"); +} + + +/* + *----------------------------------------------------------------------------- + * + * HostIF_GlobalLock -- + * + * Grabs the global data structure lock. + * + * Results: + * None + * + * Side effects: + * Should be a very low contention lock. + * The current thread is rescheduled if the lock is busy. + * + *----------------------------------------------------------------------------- + */ + +void +HostIF_GlobalLock(int callerID) // IN +{ + MutexLock(&globalMutex, callerID); +} + + +/* + *----------------------------------------------------------------------------- + * + * HostIF_GlobalUnlock -- + * + * Releases the global data structure lock. + * + * Results: + * None + * + * Side effects: + * None + * + *----------------------------------------------------------------------------- + */ + +void +HostIF_GlobalUnlock(int callerID) // IN +{ + MutexUnlock(&globalMutex, callerID); +} + + +#ifdef VMX86_DEBUG +/* + *----------------------------------------------------------------------------- + * + * HostIF_GlobalLockIsHeld -- + * + * Determine if the global lock is held by the current thread. 
+ * + * Results: + * TRUE if yes + * FALSE if no + * + * Side effects: + * None + * + *----------------------------------------------------------------------------- + */ + +Bool +HostIF_GlobalLockIsHeld(void) +{ + return MutexIsLocked(&globalMutex); +} +#endif + + +/* + *----------------------------------------------------------------------------- + * + * HostIF_FastClockLock -- + * + * Grabs the fast clock data structure lock. + * + * Results: + * None + * + * Side effects: + * Should be a very low contention lock. + * The current thread is rescheduled if the lock is busy. + * + *----------------------------------------------------------------------------- + */ + +void +HostIF_FastClockLock(int callerID) // IN +{ + MutexLock(&fastClockMutex, callerID); +} + + +/* + *----------------------------------------------------------------------------- + * + * HostIF_FastClockUnlock -- + * + * Releases the fast clock data structure lock. + * + * Results: + * None + * + * Side effects: + * None + * + *----------------------------------------------------------------------------- + */ + +void +HostIF_FastClockUnlock(int callerID) // IN +{ + MutexUnlock(&fastClockMutex, callerID); +} + + +/* + *----------------------------------------------------------------------------- + * + * HostIF_PollListLock -- + * + * Grabs the linuxState.pollList lock. + * + * Results: + * None + * + * Side effects: + * The current thread is rescheduled if the lock is busy. + * + *----------------------------------------------------------------------------- + */ + +void +HostIF_PollListLock(int callerID) // IN +{ + MutexLock(&pollListMutex, callerID); +} + + +/* + *----------------------------------------------------------------------------- + * + * HostIF_PollListUnlock -- + * + * Releases the linuxState.pollList lock. + * + * Results: + * None + * + * Side effects: + * None + * + *----------------------------------------------------------------------------- + */ + +void +HostIF_PollListUnlock(int callerID) // IN +{ + MutexUnlock(&pollListMutex, callerID); +} + + +/* + *---------------------------------------------------------------------- + * + * MapCrossPage & UnmapCrossPage + * + * Both x86-64 and ia32 need to map crosspage to an executable + * virtual address. We use the vmap interface instead of kmap + * due to bug 43907. + * + * Side effects: + * + * UnmapCrossPage assumes that the page has been refcounted up + * so it takes care of the put_page. + * + *---------------------------------------------------------------------- + */ +static void * +MapCrossPage(struct page *p) // IN: +{ + return vmap(&p, 1, VM_MAP, VM_PAGE_KERNEL_EXEC); +} + + +static void +UnmapCrossPage(struct page *p, // IN: + void *va) // IN: +{ + vunmap(va); + put_page(p); +} + + +/* + *---------------------------------------------------------------------- + * + * HostIFHostMemInit -- + * + * Initialize per-VM pages lists. + * + * Results: + * 0 on success, + * non-zero on failure. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +static int +HostIFHostMemInit(VMDriver *vm) // IN: +{ + VMHost *vmh = vm->vmhost; + + vmh->lockedPages = PhysTrack_Alloc(vm); + if (!vmh->lockedPages) { + return -1; + } + vmh->AWEPages = PhysTrack_Alloc(vm); + if (!vmh->AWEPages) { + return -1; + } + + return 0; +} + + +/* + *---------------------------------------------------------------------- + * + * HostIFHostMemCleanup -- + * + * Release per-VM pages lists. + * + * Results: + * None. 
+ * + * Side effects: + * Locked and AWE pages are released. + * + *---------------------------------------------------------------------- + */ + +static void +HostIFHostMemCleanup(VMDriver *vm) // IN: +{ + MPN mpn; + VMHost *vmh = vm->vmhost; + + if (!vmh) { + return; + } + + HostIF_VMLock(vm, 32); // Debug version of PhysTrack wants VM's lock. + if (vmh->lockedPages) { + for (mpn = 0; + INVALID_MPN != (mpn = PhysTrack_GetNext(vmh->lockedPages, mpn));) { + HOST_UNLOCK_PFN_BYMPN(vm, mpn); + } + PhysTrack_Free(vmh->lockedPages); + vmh->lockedPages = NULL; + } + + if (vmh->AWEPages) { + for (mpn = 0; + INVALID_MPN != (mpn = PhysTrack_GetNext(vmh->AWEPages, mpn));) { + PhysTrack_Remove(vmh->AWEPages, mpn); + put_page(pfn_to_page(mpn)); + } + PhysTrack_Free(vmh->AWEPages); + vmh->AWEPages = NULL; + } + HostIF_VMUnlock(vm, 32); +} + + +/* + *---------------------------------------------------------------------- + * + * HostIF_AllocMachinePage -- + * + * Alloc non-swappable memory page. The page is not billed to + * a particular VM. Preferably the page should not be mapped into + * the kernel addresss space. + * + * Results: + * INVALID_MPN or a valid host mpn. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +MPN +HostIF_AllocMachinePage(void) +{ + struct page *pg = alloc_page(GFP_HIGHUSER); + + return (pg) ? ((MPN)page_to_pfn(pg)) : INVALID_MPN; +} + + +/* + *---------------------------------------------------------------------- + * + * HostIF_FreeMachinePage -- + * + * Free an anonymous machine page allocated by + * HostIF_AllocMachinePage(). This page is not tracked in any + * phystracker. + * + * Results: + * Host page is unlocked. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +void +HostIF_FreeMachinePage(MPN mpn) // IN: +{ + struct page *pg = pfn_to_page(mpn); + + __free_page(pg); +} + + +/* + *---------------------------------------------------------------------- + * + * HostIF_AllocLockedPages -- + * + * Alloc non-swappable memory. + * + * Results: + * negative value on complete failure + * non-negative value on partial/full completion, number of MPNs + * allocated & filled in pmpn returned. + * + * Side effects: + * Pages allocated. + * + *---------------------------------------------------------------------- + */ + +int +HostIF_AllocLockedPages(VMDriver *vm, // IN: VM instance pointer + VA64 addr, // OUT: pointer to user or kernel buffer for MPNs + unsigned numPages, // IN: number of pages to allocate + Bool kernelMPNBuffer)// IN: is the MPN buffer in kernel or user address space? +{ + MPN *pmpn = VA64ToPtr(addr); + + VMHost *vmh = vm->vmhost; + unsigned int cnt; + int err = 0; + + if (!vmh || !vmh->AWEPages) { + return -EINVAL; + } + for (cnt = 0; cnt < numPages; cnt++) { + struct page* pg; + MPN mpn; + + pg = alloc_page(GFP_HIGHUSER); + if (!pg) { + err = -ENOMEM; + break; + } + mpn = (MPN)page_to_pfn(pg); + if (kernelMPNBuffer) { + *pmpn = mpn; + } else if (HostIF_CopyToUser(pmpn, &mpn, sizeof *pmpn) != 0) { + __free_page(pg); + err = -EFAULT; + break; + } + pmpn++; + if (PhysTrack_Test(vmh->AWEPages, mpn)) { + Warning("%s: duplicate MPN %016" FMT64 "x\n", __func__, mpn); + } + PhysTrack_Add(vmh->AWEPages, mpn); + } + + return cnt ? cnt : err; +} + + +/* + *---------------------------------------------------------------------- + * + * HostIF_FreeLockedPages -- + * + * Free non-swappable memory. + * + * Results: + * On success: 0. 
All pages were unlocked. + * On failure: Non-zero system error code. No page was unlocked. + * + * Side effects: + * Pages freed. + * + *---------------------------------------------------------------------- + */ + +int +HostIF_FreeLockedPages(VMDriver *vm, // IN: VM instance pointer + VA64 addr, // IN: user or kernel array of MPNs + unsigned numPages, // IN: number of pages to free + Bool kernelMPNBuffer) // IN: is the MPN buffer in kernel or user address space? +{ + const int MPN_BATCH = 64; + MPN const *pmpn = VA64ToPtr(addr); + VMHost *vmh = vm->vmhost; + unsigned int cnt; + struct page *pg; + MPN *mpns; + + mpns = HostIF_AllocKernelMem(sizeof *mpns * MPN_BATCH, TRUE); + + if (mpns == NULL) { + return -ENOMEM; + } + if (!vmh || !vmh->AWEPages) { + HostIF_FreeKernelMem(mpns); + return -EINVAL; + } + + if (!kernelMPNBuffer) { + if (numPages > MPN_BATCH) { + HostIF_FreeKernelMem(mpns); + return -EINVAL; + } + + if (HostIF_CopyFromUser(mpns, pmpn, numPages * sizeof *pmpn)) { + printk(KERN_DEBUG "Cannot read from process address space at %p\n", + pmpn); + HostIF_FreeKernelMem(mpns); + return -EINVAL; + } + + pmpn = mpns; + } + + for (cnt = 0; cnt < numPages; cnt++) { + if (!PhysTrack_Test(vmh->AWEPages, pmpn[cnt])) { + printk(KERN_DEBUG "Attempted to free unallocated MPN %016" FMT64 "X\n", + pmpn[cnt]); + HostIF_FreeKernelMem(mpns); + return -EINVAL; + } + + pg = pfn_to_page(pmpn[cnt]); + if (page_count(pg) != 1) { + // should this case be considered a failure? + printk(KERN_DEBUG "Page %016" FMT64 "X is still used by someone " + "(use count %u, VM %p)\n", pmpn[cnt], + page_count(pg), vm); + } + } + + for (cnt = 0; cnt < numPages; cnt++) { + pg = pfn_to_page(pmpn[cnt]); + PhysTrack_Remove(vmh->AWEPages, pmpn[cnt]); + __free_page(pg); + } + HostIF_FreeKernelMem(mpns); + return 0; +} + + +/* + *---------------------------------------------------------------------- + * + * HostIF_Init -- + * + * Initialize the host-dependent part of the driver. + * + * Results: + * zero on success, non-zero on error. + * + * Side effects: + * None + * + *---------------------------------------------------------------------- + */ + +int +HostIF_Init(VMDriver *vm) // IN: +{ + vm->memtracker = MemTrack_Init(); + if (vm->memtracker == NULL) { + return -1; + } + + vm->vmhost = (VMHost *) HostIF_AllocKernelMem(sizeof *vm->vmhost, TRUE); + if (vm->vmhost == NULL) { + return -1; + } + memset(vm->vmhost, 0, sizeof *vm->vmhost); + + if (HostIFHostMemInit(vm)) { + return -1; + } + MutexInit(&vm->vmhost->vmMutex, "vm"); + + return 0; +} + + +/* + *------------------------------------------------------------------------------ + * + * HostIF_LookupUserMPN -- + * + * Lookup the MPN of a locked user page by user VA. + * + * Results: + * A status code and the MPN on success. + * + * Side effects: + * None + * + *------------------------------------------------------------------------------ + */ + +int +HostIF_LookupUserMPN(VMDriver *vm, // IN: VMDriver + VA64 uAddr, // IN: user VA of the page + MPN *mpn) // OUT +{ + void *uvAddr = VA64ToPtr(uAddr); + int retval = PAGE_LOCK_SUCCESS; + + *mpn = PgtblVa2MPN((VA)uvAddr); + + /* + * On failure, check whether the page is locked. + * + * While we don't require the page to be locked by HostIF_LockPage(), + * it does provide extra information. 
+ * + * -- edward + */ + if (*mpn == INVALID_MPN) { + if (vm == NULL) { + retval += PAGE_LOOKUP_NO_VM; + } else { + MemTrackEntry *entryPtr = + MemTrack_LookupVPN(vm->memtracker, PTR_2_VPN(uvAddr)); + if (entryPtr == NULL) { + retval += PAGE_LOOKUP_NOT_TRACKED; + } else if (entryPtr->mpn == 0) { + retval += PAGE_LOOKUP_NO_MPN; + } else { + /* + * Kernel can remove PTEs/PDEs from our pagetables even if pages + * are locked... + */ + volatile int c; + + get_user(c, (char *)uvAddr); + *mpn = PgtblVa2MPN((VA)uvAddr); + if (*mpn == entryPtr->mpn) { +#ifdef VMX86_DEBUG + printk(KERN_DEBUG "Page %p disappeared from %s(%u)... " + "now back at %016" FMT64 "x\n", + uvAddr, current->comm, current->pid, *mpn); +#endif + } else if (*mpn != INVALID_MPN) { + printk(KERN_DEBUG "Page %p disappeared from %s(%u)... " + "now back at %016" FMT64"x (old=%016" FMT64 "x)\n", + uvAddr, current->comm, current->pid, *mpn, + entryPtr->mpn); + *mpn = INVALID_MPN; + } else { + printk(KERN_DEBUG "Page %p disappeared from %s(%u)... " + "and is lost (old=%016" FMT64 "x)\n", uvAddr, current->comm, + current->pid, entryPtr->mpn); + *mpn = entryPtr->mpn; + } + } + } + } + + return retval; +} + + +/* + *---------------------------------------------------------------------- + * + * HostIF_InitFP -- + * + * masks IRQ13 if not previously the case. + * + * Results: + * prevents INTR #0x2d (IRQ 13) from being generated -- + * assume that Int16 works for interrupt reporting + * + * + * Side effects: + * PIC + * + *---------------------------------------------------------------------- + */ + +void +HostIF_InitFP(VMDriver *vm) // IN: +{ + int mask = (1 << (0xD - 0x8)); + + uint8 val = inb(0xA1); + + if (!(val & mask)) { + val = val | mask; + outb(val, 0xA1); + } +} + + +/* + *----------------------------------------------------------------------------- + * + * HostIFGetUserPages -- + * + * Lock the pages of an user-level address space in memory. + * If ppages is NULL, pages are only marked as dirty. + * + * Results: + * Zero on success, non-zero on failure. + * + * Side effects: + * None + * + *----------------------------------------------------------------------------- + */ + +static int +HostIFGetUserPages(void *uvAddr, // IN + struct page **ppages, // OUT + unsigned int numPages) // IN +{ + int retval; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,9,0) + unsigned int flags = 0; // No rights +#endif + + down_read(¤t->mm->mmap_sem); + + + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,9,0) + retval = get_user_pages_remote(current, current->mm, (unsigned long)uvAddr, + numPages, flags, ppages, NULL); +#else +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 6, 0) + retval = get_user_pages_remote(current, current->mm, (unsigned long)uvAddr, + numPages, 0, 0, ppages, NULL); +#else + retval = get_user_pages(current, current->mm, (unsigned long)uvAddr, + numPages, 0, 0, ppages, NULL); +#endif +#endif + + + + up_read(¤t->mm->mmap_sem); + + return retval != numPages; +} + + +/* + *---------------------------------------------------------------------- + * + * HostIF_IsLockedByMPN -- + * + * Checks if mpn was locked using allowMultipleMPNsPerVA. + * + * Results: + * TRUE if mpn is present in the physTracker. + * + * + * Side effects: + * None. 
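+ *
+ *      (Illustrative note: MPNs enter this tracker via
+ *      HostIF_LockPage(..., allowMultipleMPNsPerVA = TRUE, ...), which pins
+ *      the backing page with HostIFGetUserPages() above; on 4.9+ kernels
+ *      that helper passes a single gup_flags bitmask -- 0 here, i.e. no
+ *      write access requested -- to get_user_pages_remote() instead of the
+ *      old separate write/force arguments.)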
+ * + *---------------------------------------------------------------------- + */ + +Bool +HostIF_IsLockedByMPN(VMDriver *vm, // IN: + MPN mpn) // IN: +{ + return PhysTrack_Test(vm->vmhost->lockedPages, mpn); +} + + +/* + *----------------------------------------------------------------------------- + * + * HostIF_LockPage -- + * + * Lockup the MPN of an pinned user-level address space + * + * Results: + * A PAGE_LOCK_* status code and the MPN on success. + * + * Side effects: + * Adds the page to the MemTracker, if allowMultipleMPNsPerVA then the page + * is added to the VM's PhysTracker. + * + *----------------------------------------------------------------------------- + */ + +int +HostIF_LockPage(VMDriver *vm, // IN: VMDriver + VA64 uAddr, // IN: user VA of the page + Bool allowMultipleMPNsPerVA, // IN: allow to lock many pages per VA + MPN *mpn) // OUT: pinned page +{ + void *uvAddr = VA64ToPtr(uAddr); + struct page *page; + VPN vpn; + MemTrackEntry *entryPtr = NULL; + + vpn = PTR_2_VPN(uvAddr); + if (!allowMultipleMPNsPerVA) { + entryPtr = MemTrack_LookupVPN(vm->memtracker, vpn); + + /* + * Already tracked and locked + */ + + if (entryPtr != NULL && entryPtr->mpn != 0) { + return PAGE_LOCK_ALREADY_LOCKED; + } + } + + if (HostIFGetUserPages(uvAddr, &page, 1)) { + return PAGE_LOCK_FAILED; + } + + *mpn = (MPN)page_to_pfn(page); + + if (allowMultipleMPNsPerVA) { + /* + * Add the MPN to the PhysTracker that tracks locked pages. + */ + + struct PhysTracker* const pt = vm->vmhost->lockedPages; + + if (PhysTrack_Test(pt, *mpn)) { + put_page(page); + return PAGE_LOCK_ALREADY_LOCKED; + } + PhysTrack_Add(pt, *mpn); + } else { + /* + * If the entry doesn't exist, add it to the memtracker + * otherwise we just update the mpn. + */ + + if (entryPtr == NULL) { + entryPtr = MemTrack_Add(vm->memtracker, vpn, *mpn); + if (entryPtr == NULL) { + HOST_UNLOCK_PFN(vm, *mpn); + return PAGE_LOCK_MEMTRACKER_ERROR; + } + } else { + entryPtr->mpn = *mpn; + } + } + + return PAGE_LOCK_SUCCESS; +} + + +/* + *---------------------------------------------------------------------- + * + * HostIF_UnlockPage -- + * + * Unlock an pinned user-level page. + * + * Results: + * Status PAGE_UNLOCK_* code. + * + * Side effects: + * None + * + *---------------------------------------------------------------------- + */ + +int +HostIF_UnlockPage(VMDriver *vm, // IN: + VA64 uAddr) // IN: +{ + void *addr = VA64ToPtr(uAddr); + VPN vpn; + MemTrackEntry *e; + + vpn = VA_2_VPN((VA)addr); + e = MemTrack_LookupVPN(vm->memtracker, vpn); + + if (e == NULL) { + return PAGE_UNLOCK_NOT_TRACKED; + } + if (e->mpn == 0) { + return PAGE_UNLOCK_NO_MPN; + } + + HOST_UNLOCK_PFN(vm, e->mpn); + e->mpn = 0; + + return PAGE_UNLOCK_SUCCESS; +} + + +/* + *---------------------------------------------------------------------- + * + * HostIF_UnlockPageByMPN -- + * + * Unlock a locked user mode page. The page doesn't need to be mapped + * anywhere. + * + * Results: + * Status code. Returns a PAGE_LOOKUP_* error if the page can't be found or + * a PAGE_UNLOCK_* error if the page can't be unlocked. + * + * Side effects: + * Removes the MPN from from VM's PhysTracker. 
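+ *
+ *      (Illustrative note: this is the counterpart of
+ *      HostIF_LockPage(allowMultipleMPNsPerVA = TRUE); pages locked the
+ *      one-MPN-per-VA way are released through HostIF_UnlockPage()
+ *      instead.)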
+ * + *---------------------------------------------------------------------- + */ + +int +HostIF_UnlockPageByMPN(VMDriver *vm, // IN: VMDriver + MPN mpn, // IN: the MPN to unlock + VA64 uAddr) // IN: optional(debugging) VA for the MPN +{ + if (!PhysTrack_Test(vm->vmhost->lockedPages, mpn)) { + return PAGE_UNLOCK_NO_MPN; + } + +#ifdef VMX86_DEBUG + { + void *va = VA64ToPtr(uAddr); + MemTrackEntry *e; + + /* + * Verify for debugging that VA and MPN make sense. + * PgtblVa2MPN() can fail under high memory pressure. + */ + + if (va != NULL) { + MPN lookupMpn = PgtblVa2MPN((VA)va); + + if (lookupMpn != INVALID_MPN && mpn != lookupMpn) { + Warning("Page lookup fail %#"FMT64"x %016" FMT64 "x %p\n", + mpn, lookupMpn, va); + + return PAGE_LOOKUP_INVALID_ADDR; + } + } + + /* + * Verify that this MPN was locked with + * HostIF_LockPage(allowMultipleMPNsPerVA = TRUE). + * That means that this MPN should not be in the MemTracker. + */ + + e = MemTrack_LookupMPN(vm->memtracker, mpn); + if (e) { + Warning("%s(): mpn=%#"FMT64"x va=%p was permanently locked with " + "vpn=0x%"FMT64"x\n", __func__, mpn, va, e->vpn); + + return PAGE_UNLOCK_MISMATCHED_TYPE; + } + } +#endif + + HOST_UNLOCK_PFN_BYMPN(vm, mpn); + + return PAGE_UNLOCK_SUCCESS; +} + + +static void +UnlockEntry(void *clientData, // IN: + MemTrackEntry *entryPtr) // IN: +{ + VMDriver *vm = (VMDriver *)clientData; + + if (entryPtr->mpn) { + HOST_UNLOCK_PFN(vm,entryPtr->mpn); + entryPtr->mpn = 0; + } +} + + +/* + *----------------------------------------------------------------------------- + * + * HostIF_FreeAllResources -- + * + * Free all host-specific VM resources. + * + * Results: + * None + * + * Side effects: + * None + * + *----------------------------------------------------------------------------- + */ + +void +HostIF_FreeAllResources(VMDriver *vm) // IN +{ + unsigned int cnt; + + HostIFHostMemCleanup(vm); + if (vm->memtracker) { + MemTrack_Cleanup(vm->memtracker, UnlockEntry, vm); + vm->memtracker = NULL; + } + if (vm->vmhost) { + for (cnt = vm->vmhost->crosspagePagesCount; cnt > 0; ) { + struct page* p = vm->vmhost->crosspagePages[--cnt]; + UnmapCrossPage(p, vm->crosspage[cnt]); + } + vm->vmhost->crosspagePagesCount = 0; + if (vm->vmhost->hostAPICIsMapped) { + ASSERT(vm->hostAPIC.base != NULL); + iounmap((void*)vm->hostAPIC.base); + vm->hostAPIC.base = NULL; + vm->vmhost->hostAPICIsMapped = FALSE; + } + HostIF_FreeKernelMem(vm->vmhost); + vm->vmhost = NULL; + } +} + + + +/* + *---------------------------------------------------------------------- + * + * HostIF_AllocKernelMem + * + * Allocate some kernel memory for the driver. + * + * Results: + * The address allocated or NULL on error. 
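+ *      (Illustrative: callers release these allocations with
+ *      HostIF_FreeKernelMem(), e.g. the MPN scratch array in
+ *      HostIF_FreeLockedPages() above.)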
+ * + * + * Side effects: + * memory is malloced + *---------------------------------------------------------------------- + */ + +void * +HostIF_AllocKernelMem(size_t size, // IN: + int wired) // IN: +{ + void * ptr = kmalloc(size, GFP_KERNEL); + + if (ptr == NULL) { + Warning("%s failed (size=%p)\n", __func__, (void*)size); + } + + return ptr; +} + + +/* + *----------------------------------------------------------------------------- + * + * HostIF_AllocPage -- + * + * Allocate a page (whose content is undetermined) + * + * Results: + * The kernel virtual address of the page + * + * Side effects: + * None + * + *----------------------------------------------------------------------------- + */ + +void * +HostIF_AllocPage(void) +{ + VA kvAddr; + + kvAddr = __get_free_page(GFP_KERNEL); + if (kvAddr == 0) { + Warning("%s: __get_free_page() failed\n", __func__); + } + + return (void *)kvAddr; +} + + +/* + *---------------------------------------------------------------------- + * + * HostIF_FreeKernelMem + * + * Free kernel memory allocated for the driver. + * + * Results: + * None. + * + * Side effects: + * memory is freed. + *---------------------------------------------------------------------- + */ + +void +HostIF_FreeKernelMem(void *ptr) // IN: +{ + kfree(ptr); +} + + +void +HostIF_FreePage(void *ptr) // IN: +{ + VA vAddr = (VA)ptr; + + if (vAddr & (PAGE_SIZE-1)) { + Warning("%s %p misaligned\n", __func__, (void*)vAddr); + } else { + free_page(vAddr); + } +} + + +/* + *---------------------------------------------------------------------- + * + * HostIF_EstimateLockedPageLimit -- + * + * Estimates how many memory pages can be locked or allocated + * from the kernel without causing the host to die or to be really upset. + * + * Results: + * The maximum number of pages that can be locked. + * + * Side effects: + * none + * + *---------------------------------------------------------------------- + */ + +unsigned int +HostIF_EstimateLockedPageLimit(const VMDriver* vm, // IN + unsigned int currentlyLockedPages) // IN +{ + /* + * This variable is available and exported to modules, + * since at least 2.6.0. + */ + + extern unsigned long totalram_pages; + + unsigned int totalPhysicalPages = totalram_pages; + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 28) + return MemDefaults_CalcMaxLockedPages(totalPhysicalPages); +#else + /* + * Use the memory information linux exports as of late for a more + * precise estimate of locked memory. All kernel page-related structures + * (slab, pagetable) are as good as locked. Unevictable includes things + * that are explicitly marked as such (like mlock()). Huge pages are + * also as good as locked, since we don't use them. Lastly, without + * available swap, anonymous pages become locked in memory as well. + */ + + unsigned int forHost; + unsigned int reservedPages = MEMDEFAULTS_MIN_HOST_PAGES; + unsigned int hugePages = (vm == NULL) ? 
0 : + BYTES_2_PAGES(vm->memInfo.hugePageBytes); + unsigned int lockedPages = global_page_state(NR_PAGETABLE) + + global_page_state(NR_SLAB_UNRECLAIMABLE) + + global_page_state(NR_UNEVICTABLE) + + hugePages + reservedPages; + unsigned int anonPages = global_page_state(NR_ANON_MAPPED); + unsigned int swapPages = BYTES_2_PAGES(linuxState.swapSize); + + if (anonPages > swapPages) { + lockedPages += anonPages - swapPages; + } + forHost = lockedPages + LOCKED_PAGE_SLACK; + if (forHost > totalPhysicalPages) { + forHost = totalPhysicalPages; + } + + return totalPhysicalPages - forHost; +#endif +} + + +/* + *---------------------------------------------------------------------- + * + * HostIF_Wait -- + * + * Waits for specified number of milliseconds. + * + *---------------------------------------------------------------------- + */ + +void +HostIF_Wait(unsigned int timeoutMs) +{ + msleep_interruptible(timeoutMs); +} + + +/* + *---------------------------------------------------------------------- + * + * HostIF_WaitForFreePages -- + * + * Waits for pages to be available for allocation or locking. + * + * Results: + * New pages are likely to be available for allocation or locking. + * + * Side effects: + * none + * + *---------------------------------------------------------------------- + */ + +void +HostIF_WaitForFreePages(unsigned int timeoutMs) // IN: +{ + static unsigned count; + msleep_interruptible(timeoutMs); + count++; +} + + +/* + *---------------------------------------------------------------------- + * + * HostIFReadUptimeWork -- + * + * Reads the current uptime. The uptime is based on getimeofday, + * which provides the needed high resolution. However, we don't + * want uptime to be warped by e.g. calls to settimeofday. So, we + * use a jiffies based monotonic clock to sanity check the uptime. + * If the uptime is more than one second from the monotonic time, + * we assume that the time of day has been set, and recalculate the + * uptime base to get uptime back on track with monotonic time. On + * the other hand, we do expect jiffies based monotonic time and + * timeofday to have small drift (due to NTP rate correction, etc). + * We handle this by rebasing the jiffies based monotonic clock + * every second (see HostIFUptimeResyncMono). + * + * Results: + * The uptime, in units of UPTIME_FREQ. Also returns the jiffies + * value that was used in the monotonic time calculation. + * + * Side effects: + * May reset the uptime base in the case gettimeofday warp was + * detected. + * + *---------------------------------------------------------------------- + */ + +static uint64 +HostIFReadUptimeWork(unsigned long *j) // OUT: current jiffies +{ + struct timeval tv; + uint64 monotime, uptime, upBase, monoBase; + int64 diff; + uint32 version; + unsigned long jifs, jifBase; + unsigned int attempts = 0; + + /* Assert that HostIF_InitUptime has been called. */ + ASSERT(uptimeState.timer.function); + + retry: + do { + version = VersionedAtomic_BeginTryRead(&uptimeState.version); + jifs = jiffies; + jifBase = uptimeState.jiffiesBase; + monoBase = uptimeState.monotimeBase; + } while (!VersionedAtomic_EndTryRead(&uptimeState.version, version)); + + do_gettimeofday(&tv); + upBase = Atomic_Read64(&uptimeState.uptimeBase); + + monotime = (uint64)(jifs - jifBase) * (UPTIME_FREQ / HZ); + monotime += monoBase; + + uptime = tv.tv_usec * (UPTIME_FREQ / 1000000) + tv.tv_sec * UPTIME_FREQ; + uptime += upBase; + + /* + * Use the jiffies based monotonic time to sanity check gettimeofday. 
+ * If they differ by more than one second, assume the time of day has + * been warped, and use the jiffies time to undo (most of) the warp. + */ + + diff = uptime - monotime; + if (UNLIKELY(diff < -UPTIME_FREQ || diff > UPTIME_FREQ)) { + /* Compute a new uptimeBase to get uptime back on track. */ + uint64 newUpBase = monotime - (uptime - upBase); + + attempts++; + if (!Atomic_CMPXCHG64(&uptimeState.uptimeBase, &upBase, &newUpBase) && + attempts < 5) { + /* Another thread updated uptimeBase. Recalculate uptime. */ + goto retry; + } + uptime = monotime; + + Log("%s: detected settimeofday: fixed uptimeBase old %"FMT64"u " + "new %"FMT64"u attempts %u\n", __func__, + upBase, newUpBase, attempts); + } + *j = jifs; + + return uptime; +} + + +/* + *---------------------------------------------------------------------- + * + * HostIFUptimeResyncMono -- + * + * Timer that fires ever second to resynchronize the jiffies based + * monotonic time with the uptime. + * + * Results: + * None + * + * Side effects: + * Resets the monotonic time bases so that jiffies based monotonic + * time does not drift from gettimeofday over the long term. + * + *---------------------------------------------------------------------- + */ + +static void +HostIFUptimeResyncMono(unsigned long data) // IN: ignored +{ + unsigned long jifs; + uintptr_t flags; + + /* + * Read the uptime and the corresponding jiffies value. This will + * also correct the uptime (which is based on time of day) if needed + * before we rebase monotonic time (which is based on jiffies). + */ + + uint64 uptime = HostIFReadUptimeWork(&jifs); + + /* + * Every second, recalculate monoBase and jiffiesBase to squash small + * drift between gettimeofday and jiffies. Also, this prevents + * (jiffies - jiffiesBase) wrap on 32-bits. + */ + + SAVE_FLAGS(flags); + CLEAR_INTERRUPTS(); + VersionedAtomic_BeginWrite(&uptimeState.version); + + uptimeState.monotimeBase = uptime; + uptimeState.jiffiesBase = jifs; + + VersionedAtomic_EndWrite(&uptimeState.version); + RESTORE_FLAGS(flags); + + /* Reschedule this timer to expire in one second. */ + mod_timer(&uptimeState.timer, jifs + HZ); +} + + +/* + *---------------------------------------------------------------------- + * + * HostIF_InitUptime -- + * + * Initialize the uptime clock's state. + * + * Results: + * None + * + * Side effects: + * Sets the initial values for the uptime state, and schedules + * the uptime timer. + * + *---------------------------------------------------------------------- + */ + +void +HostIF_InitUptime(void) +{ + struct timeval tv; + + uptimeState.jiffiesBase = jiffies; + do_gettimeofday(&tv); + Atomic_Write64(&uptimeState.uptimeBase, + -(tv.tv_usec * (UPTIME_FREQ / 1000000) + + tv.tv_sec * UPTIME_FREQ)); + + init_timer(&uptimeState.timer); + uptimeState.timer.function = HostIFUptimeResyncMono; + mod_timer(&uptimeState.timer, jiffies + HZ); +} + + +/* + *---------------------------------------------------------------------- + * + * HostIF_CleanupUptime -- + * + * Cleanup uptime state, called at module unloading time. + * + * Results: + * None + * + * Side effects: + * Deschedule the uptime timer. + * + *---------------------------------------------------------------------- + */ + +void +HostIF_CleanupUptime(void) +{ + del_timer_sync(&uptimeState.timer); +} + + +/* + *---------------------------------------------------------------------- + * + * HostIF_ReadUptime -- + * + * Read the system time. 
Returned value has no particular absolute + * value, only difference since previous call should be used. + * + * Results: + * Units are given by HostIF_UptimeFrequency. + * + * Side effects: + * See HostIFReadUptimeWork + * + *---------------------------------------------------------------------- + */ + +uint64 +HostIF_ReadUptime(void) +{ + unsigned long jifs; + + return HostIFReadUptimeWork(&jifs); +} + + +/* + *---------------------------------------------------------------------- + * + * HostIF_UptimeFrequency + * + * Return the frequency of the counter that HostIF_ReadUptime reads. + * + * Results: + * Frequency in Hz. + * + * Side effects: + * None + * + *---------------------------------------------------------------------- + */ + +uint64 +HostIF_UptimeFrequency(void) +{ + return UPTIME_FREQ; +} + + +/* + *----------------------------------------------------------------------------- + * + * HostIF_CopyFromUser -- + * + * Copy memory from the user application into a kernel buffer. This + * function may block, so don't call it while holding any kind of + * lock. --hpreg + * + * Results: + * 0 on success + * -EFAULT on failure. + * + * Side effects: + * None + * + *----------------------------------------------------------------------------- + */ + +int +HostIF_CopyFromUser(void *dst, // OUT + const void *src, // IN + unsigned int len) // IN +{ + return copy_from_user(dst, src, len) ? -EFAULT : 0; +} + + +/* + *----------------------------------------------------------------------------- + * + * HostIF_CopyToUser -- + * + * Copy memory to the user application from a kernel buffer. This + * function may block, so don't call it while holding any kind of + * lock. --hpreg + * + * Results: + * 0 on success + * -EFAULT on failure. + * + * Side effects: + * None + * + *----------------------------------------------------------------------------- + */ + +int +HostIF_CopyToUser(void *dst, // OUT + const void *src, // IN + unsigned int len) // IN +{ + return copy_to_user(dst, src, len) ? -EFAULT : 0; +} + + +/* + *----------------------------------------------------------------------------- + * + * HostIF_MapCrossPage -- + * + * Obtain kernel pointer to crosspage. + * + * We must return a VA that is obtained through a kernel mapping, so that + * the mapping never goes away (see bug 29753). + * + * However, the LA corresponding to that VA must not overlap with the + * monitor (see bug 32922). The userland code ensures that by only + * allocating cross pages from low memory. For those pages, the kernel + * uses a permanent mapping, instead of a temporary one with a high LA. 
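+ *      The returned pointer preserves the page offset of 'uAddr', and the
+ *      locked page is remembered in vmhost->crosspagePages[] (at most
+ *      MAX_INITBLOCK_CPUS entries) so it can be unmapped later in
+ *      HostIF_FreeAllResources().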
+ * + * Results: + * The kernel virtual address on success + * NULL on failure + * + * Side effects: + * None + * + *----------------------------------------------------------------------------- + */ + +void * +HostIF_MapCrossPage(VMDriver *vm, // IN + VA64 uAddr) // IN +{ + void *p = VA64ToPtr(uAddr); + struct page *page; + VA vPgAddr; + VA ret; + + if (HostIFGetUserPages(p, &page, 1)) { + return NULL; + } + vPgAddr = (VA) MapCrossPage(page); + HostIF_GlobalLock(16); + if (vm->vmhost->crosspagePagesCount >= MAX_INITBLOCK_CPUS) { + HostIF_GlobalUnlock(16); + UnmapCrossPage(page, (void*)vPgAddr); + + return NULL; + } + vm->vmhost->crosspagePages[vm->vmhost->crosspagePagesCount++] = page; + HostIF_GlobalUnlock(16); + + ret = vPgAddr | (((VA)p) & (PAGE_SIZE - 1)); + + return (void*)ret; +} + + +/* + *----------------------------------------------------------------------------- + * + * HostIF_AllocCrossGDT -- + * + * Allocate the per-vmmon cross GDT page set. + * + * See bora/doc/worldswitch-pages.txt for the requirements on the cross + * GDT page set addresses. + * + * Results: + * On success: Host kernel virtual address of the first cross GDT page. + * Use HostIF_FreeCrossGDT() with the same value to free. + * The 'crossGDTMPNs' array is filled with the MPNs of all the + * cross GDT pages. + * On failure: NULL. + * + * Side effects: + * None + * + *----------------------------------------------------------------------------- + */ + +void * +HostIF_AllocCrossGDT(uint32 numPages, // IN: Number of pages + MPN maxValidFirst, // IN: Highest valid MPN of first page + MPN *crossGDTMPNs) // OUT: Array of MPNs +{ + MPN startMPN; + struct page *pages; + uint32 i; + void *crossGDT; + + /* + * In practice, allocating a low page (MPN <= 0x100000 - 1) is equivalent to + * allocating a page with MPN <= 0xFEC00 - 1: + * + * o PC architecture guarantees that there is no RAM in top 16MB of 4GB + * range. + * + * o 0xFEC00000 is IOAPIC base. There could be RAM immediately below, + * but not above. + * + * How do we allocate a low page? We can safely use GFP_DMA32 when + * available. On 64bit kernels before GFP_DMA32 was introduced we + * fall back to DMA zone (which is not quite necessary for boxes + * with less than ~3GB of memory). On 32bit kernels we are using + * normal zone - which is usually 1GB, and at most 4GB (for 4GB/4GB + * kernels). And for 4GB/4GB kernels same restriction as for 64bit + * kernels applies - there is no RAM in top 16MB immediately below + * 4GB so alloc_pages() cannot return such page. + */ + + ASSERT(0xFEC00 - 1 <= maxValidFirst); + for (i = 0; (1 << i) < numPages; i++) { } +#ifdef GFP_DMA32 + pages = alloc_pages(GFP_KERNEL | GFP_DMA32, i); +#else + pages = alloc_pages(GFP_KERNEL | GFP_DMA, i); +#endif + crossGDT = NULL; + if (pages == NULL) { + Warning("%s: unable to alloc crossGDT (%u)\n", __func__, i); + } else { + startMPN = page_to_pfn(pages); + for (i = 0; i < numPages; i++) { + crossGDTMPNs[i] = startMPN + i; + } + crossGDT = (void *)page_address(pages); + } + + return crossGDT; +} + + +/* + *----------------------------------------------------------------------------- + * + * HostIF_FreeCrossGDT -- + * + * Free the per-vmmon cross GDT page set allocated with + * HostIF_AllocCrossGDT(). 
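+ *      'numPages' must match the value passed to HostIF_AllocCrossGDT(),
+ *      since the allocation order handed to free_pages() is recomputed
+ *      from it in the same way.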
+ * + * Results: + * None + * + * Side effects: + * None + * + *----------------------------------------------------------------------------- + */ + +void +HostIF_FreeCrossGDT(uint32 numPages, // IN: Number of pages + void *crossGDT) // IN: Kernel VA of first cross GDT page +{ + uint32 i; + + for (i = 0; (1 << i) < numPages; i++) { } + free_pages((VA)crossGDT, i); +} + + +/* + *----------------------------------------------------------------------------- + * + * HostIF_VMLock -- + * + * Grabs per-VM data structure lock. The lock is not recursive. + * The global lock has lower rank so the global lock should be grabbed + * first if both locks are acquired. + * + * It should be a medium contention lock. Also it should be fast: + * it is used for protecting of frequent page allocation and locking. + * + * Results: + * None + * + * Side effects: + * The current thread is rescheduled if the lock is busy. + * + *----------------------------------------------------------------------------- + */ + +void +HostIF_VMLock(VMDriver *vm, // IN + int callerID) // IN +{ + ASSERT(vm); + + ASSERT(vm->vmhost); + MutexLock(&vm->vmhost->vmMutex, callerID); +} + + +/* + *----------------------------------------------------------------------------- + * + * HostIF_VMUnlock -- + * + * Releases per-VM data structure lock. + * + * Results: + * None + * + * Side effects: + * Can wake up the thread blocked on this lock. + * + *----------------------------------------------------------------------------- + */ + +void +HostIF_VMUnlock(VMDriver *vm, // IN + int callerID) // IN +{ + ASSERT(vm); + + ASSERT(vm->vmhost); + MutexUnlock(&vm->vmhost->vmMutex, callerID); +} + + +#ifdef VMX86_DEBUG +/* + *----------------------------------------------------------------------------- + * + * HostIF_VMLockIsHeld -- + * + * Determine if the per-VM lock is held by the current thread. + * + * Results: + * TRUE if yes + * FALSE if no + * + * Side effects: + * None + * + *----------------------------------------------------------------------------- + */ + +Bool +HostIF_VMLockIsHeld(VMDriver *vm) // IN +{ + ASSERT(vm); + ASSERT(vm->vmhost); + + return MutexIsLocked(&vm->vmhost->vmMutex); +} +#endif + + +/* + * Utility routines for accessing and enabling the APIC + */ + +/* + * Defines for accessing the APIC. We use readl/writel to access the APIC + * which is how Linux wants you to access I/O memory (though on the x86 + * just dereferencing a pointer works just fine). + */ +#define APICR_TO_ADDR(apic, reg) (apic + (reg << 4)) +#define GET_APIC_REG(apic, reg) (readl(APICR_TO_ADDR(apic, reg))) +#define SET_APIC_REG(apic, reg, val) (writel(val, APICR_TO_ADDR(apic, reg))) + +#define APIC_MAXLVT(apic) ((GET_APIC_REG(apic, APICR_VERSION) >> 16) & 0xff) +#define APIC_VERSIONREG(apic) (GET_APIC_REG(apic, APICR_VERSION) & 0xff) + + +#if defined(CONFIG_SMP) || defined(CONFIG_X86_UP_IOAPIC) || \ + defined(CONFIG_X86_UP_APIC) || defined(CONFIG_X86_LOCAL_APIC) +/* + *---------------------------------------------------------------------- + * + * isVAReadable -- + * + * Verify that passed VA is accessible without crash... + * + * Results: + * TRUE if address is readable, FALSE otherwise. + * + * Side effects: + * None. 
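+ *      (The address limit is switched with set_fs(get_ds()) for the probe
+ *      and restored before returning, so nothing persists.)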
+ * + *---------------------------------------------------------------------- + */ + +static Bool +isVAReadable(VA r) // IN: +{ + mm_segment_t old_fs; + uint32 dummy; + int ret; + + old_fs = get_fs(); + set_fs(get_ds()); + r = APICR_TO_ADDR(r, APICR_VERSION); + ret = HostIF_CopyFromUser(&dummy, (void*)r, sizeof(dummy)); + set_fs(old_fs); + + return ret == 0; +} + + +/* + *---------------------------------------------------------------------- + * + * SetVMAPICAddr -- + * + * Maps the host cpu's APIC. The virtual address is stashed in + * the VMDriver structure. + * + * Results: + * None. + * + * Side effects: + * The VMDriver structure is updated. + * + *---------------------------------------------------------------------- + */ + +static void +SetVMAPICAddr(VMDriver *vm, // IN/OUT: driver state + MA ma) // IN: host APIC's ma +{ + volatile void *hostapic; + + ASSERT_ON_COMPILE(APICR_SIZE <= PAGE_SIZE); + hostapic = (volatile void *) ioremap_nocache(ma, PAGE_SIZE); + if (hostapic) { + if ((APIC_VERSIONREG(hostapic) & 0xF0) == 0x10) { + vm->hostAPIC.base = (volatile uint32 (*)[4]) hostapic; + ASSERT(vm->vmhost != NULL); + vm->vmhost->hostAPICIsMapped = TRUE; + } else { + iounmap((void*)hostapic); + } + } +} + + +/* + *---------------------------------------------------------------------- + * + * ProbeAPIC -- + * + * Attempts to map the host APIC. + * + * Most versions of Linux already provide access to a mapped + * APIC. This function is just a backup. + * + * Caveat: We assume that the APIC physical address is the same + * on all host cpus. + * + * Results: + * TRUE if APIC was found, FALSE if not. + * + * Side effects: + * May map the APIC. + * + *---------------------------------------------------------------------- + */ + +static Bool +ProbeAPIC(VMDriver *vm, // IN/OUT: driver state + Bool setVMPtr) // IN: set a pointer to the APIC's virtual address +{ + MA ma = APIC_GetMA(); + + if (ma == (MA)-1) { + return FALSE; + } + + if (setVMPtr) { + SetVMAPICAddr(vm, ma); + } else { + vm->hostAPIC.base = NULL; + } + + return TRUE; +} +#endif + + +/* + *---------------------------------------------------------------------- + * + * HostIF_APICInit -- + * + * Initialize APIC behavior. + * Attempts to map the host APIC into vm->hostAPIC. + * + * We don't attempt to refresh the mapping after a host cpu + * migration. Fortunately, hosts tend to use the same address + * for all APICs. + * + * Most versions of Linux already provide a mapped APIC. We + * have backup code to read APIC_BASE and map it, if needed. + * + * Results: + * TRUE + * + * Side effects: + * May map the host APIC. 
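+ *      On x2APIC-enabled hosts no MMIO mapping is made: hostAPIC.base is
+ *      left NULL and hostAPIC.isX2 is set instead (when setVMPtr).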
+ * + *---------------------------------------------------------------------- + */ +Bool +HostIF_APICInit(VMDriver *vm, // IN: + Bool setVMPtr, // IN: + Bool probe) // IN: force probing +{ +#if defined(CONFIG_SMP) || defined(CONFIG_X86_UP_IOAPIC) || \ + defined(CONFIG_X86_UP_APIC) || defined(CONFIG_X86_LOCAL_APIC) + static Bool apicIPILogged = FALSE; + VA kAddr; + + monitorIPIVector = SPURIOUS_APIC_VECTOR; +#if defined(POSTED_INTR_VECTOR) + hvIPIVector = POSTED_INTR_VECTOR; +#else + hvIPIVector = 0; +#endif + + + if (!apicIPILogged) { + Log("Monitor IPI vector: %x\n", monitorIPIVector); + Log("HV IPI vector: %x\n", hvIPIVector); + apicIPILogged = TRUE; + } + + if ((__GET_MSR(MSR_APIC_BASE) & APIC_MSR_X2APIC_ENABLED) != 0) { + if (setVMPtr) { + vm->hostAPIC.base = NULL; + vm->vmhost->hostAPICIsMapped = FALSE; + vm->hostAPIC.isX2 = TRUE; + } + return TRUE; + } + + if (probe && ProbeAPIC(vm, setVMPtr)) { + return TRUE; + } + + /* + * Normal case: use Linux's pre-mapped APIC. + */ + kAddr = __fix_to_virt(FIX_APIC_BASE); + if (!isVAReadable(kAddr)) { + return TRUE; + } + if (setVMPtr) { + vm->hostAPIC.base = (void *)kAddr; + } else { + vm->hostAPIC.base = NULL; + } +#endif + return TRUE; +} + + +/* + *----------------------------------------------------------------------------- + * + * HostIF_SemaphoreWait -- + * + * Perform the semaphore wait (P) operation, possibly blocking. + * + * Result: + * 1 (which equals MX_WAITNORMAL) if success, + * negated error code otherwise. + * + * Side-effects: + * None + * + *----------------------------------------------------------------------------- + */ + +int +HostIF_SemaphoreWait(VMDriver *vm, // IN: + Vcpuid vcpuid, // IN: + uint64 *args) // IN: +{ + struct file *file; + mm_segment_t old_fs; + int res; + int waitFD = args[0]; + int timeoutms = args[2]; + uint64 value; + + file = vmware_fget(waitFD); + if (file == NULL) { + return MX_WAITERROR; + } + + old_fs = get_fs(); + set_fs(get_ds()); + + { + struct poll_wqueues table; + unsigned int mask; + + poll_initwait(&table); + current->state = TASK_INTERRUPTIBLE; + mask = file->f_op->poll(file, &table.pt); + if (!(mask & (POLLIN | POLLERR | POLLHUP))) { + vm->vmhost->vcpuSemaTask[vcpuid] = current; + schedule_timeout(timeoutms * HZ / 1000); // convert to Hz + vm->vmhost->vcpuSemaTask[vcpuid] = NULL; + } + current->state = TASK_RUNNING; + poll_freewait(&table); + } + + /* + * Userland only writes in multiples of sizeof(uint64). This will allow + * the code to happily deal with a pipe or an eventfd. We only care about + * reading no bytes (EAGAIN - non blocking fd) or sizeof(uint64). + */ + + res = file->f_op->read(file, (char *) &value, sizeof value, &file->f_pos); + + if (res == sizeof value) { + res = MX_WAITNORMAL; + } else { + if (res == 0) { + res = -EBADF; + } + } + + set_fs(old_fs); + fput(file); + + /* + * Handle benign errors: + * EAGAIN is MX_WAITTIMEDOUT. + * The signal-related errors are all mapped into MX_WAITINTERRUPTED. + */ + + switch (res) { + case -EAGAIN: + res = MX_WAITTIMEDOUT; + break; + case -EINTR: + case -ERESTART: + case -ERESTARTSYS: + case -ERESTARTNOINTR: + case -ERESTARTNOHAND: + res = MX_WAITINTERRUPTED; + break; + case -EBADF: + res = MX_WAITERROR; + break; + } + return res; +} + + +/* + *----------------------------------------------------------------------------- + * + * HostIF_SemaphoreForceWakeup -- + * + * For each VCPU in the set whose target process is lightly sleeping (i.e. + * TASK_INTERRUPTIBLE), wake it up. 
The target process can be waiting on a + * semaphore or due to a call to Vmx86_YieldToSet. + * + * Result: + * None. + * + * Side-effects: + * None + * + *----------------------------------------------------------------------------- + */ + +void +HostIF_SemaphoreForceWakeup(VMDriver *vm, // IN: + const VCPUSet *vcs) // IN: +{ + FOR_EACH_VCPU_IN_SET(vcs, vcpuid) { + struct task_struct *t = vm->vmhost->vcpuSemaTask[vcpuid]; + vm->vmhost->vcpuSemaTask[vcpuid] = NULL; + if (t && (t->state & TASK_INTERRUPTIBLE)) { + wake_up_process(t); + } + } ROF_EACH_VCPU_IN_SET(); +} + + +/* + *----------------------------------------------------------------------------- + * + * HostIF_SemaphoreSignal -- + * + * Perform the semaphore signal (V) operation. + * + * Result: + * On success: MX_WAITNORMAL (1). + * On error: MX_WAITINTERRUPTED (3) if interrupted by a Unix signal (we + * can block on a preemptive kernel). + * MX_WAITERROR (0) on generic error. + * Negated system error (< 0). + * + * Side-effects: + * None + * + *----------------------------------------------------------------------------- + */ + +int +HostIF_SemaphoreSignal(uint64 *args) // IN: +{ + struct file *file; + mm_segment_t old_fs; + int res; + int signalFD = args[1]; + uint64 value = 1; // make an eventfd happy should it be there + + file = vmware_fget(signalFD); + if (!file) { + return MX_WAITERROR; + } + + old_fs = get_fs(); + set_fs(get_ds()); + + /* + * Always write sizeof(uint64) bytes. This works fine for eventfd and + * pipes. The data written is formatted to make an eventfd happy should + * it be present. + */ + + res = file->f_op->write(file, (char *) &value, sizeof value, &file->f_pos); + + if (res == sizeof value) { + res = MX_WAITNORMAL; + } + + set_fs(old_fs); + fput(file); + + /* + * Handle benign errors: + * EAGAIN is MX_WAITTIMEDOUT. + * The signal-related errors are all mapped into MX_WAITINTERRUPTED. + */ + + switch (res) { + case -EAGAIN: + // The pipe is full, so it is already signalled. Success. + res = MX_WAITNORMAL; + break; + case -EINTR: + case -ERESTART: + case -ERESTARTSYS: + case -ERESTARTNOINTR: + case -ERESTARTNOHAND: + res = MX_WAITINTERRUPTED; + break; + } + return res; +} + +#if ((LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 27)) || !defined(CONFIG_SMP)) +# define VMMON_USE_CALL_FUNC +#endif + +#if defined(VMMON_USE_CALL_FUNC) +/* + *---------------------------------------------------------------------- + * + * LinuxDriverIPIHandler -- + * + * Null IPI handler - for monitor to notice AIO completion + * + *---------------------------------------------------------------------- + */ +void +LinuxDriverIPIHandler(void *info) +{ + return; +} + +#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 17) +#define VMMON_CALL_FUNC_SYNC 0 // async; we've not seen any problems +#else +#define VMMON_CALL_FUNC_SYNC 1 // sync; insure no problems from old releases +#endif + +#endif + + +/* + *---------------------------------------------------------------------- + * + * HostIF_IPI -- + * + * If the passed VCPU threads are on some CPUs in the system, + * attempt to hit them with an IPI. + * + * On older Linux systems we do a broadcast. + * + * Result: + * The mode used to send IPIs. 
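+ *      (IPI_NONE if no targeted VCPU was running on a physical CPU,
+ *      IPI_UNICAST when per-CPU IPI targeting is available,
+ *      IPI_BROADCAST on the older smp_call_function() fallback.)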
+ * + *---------------------------------------------------------------------- + */ + +HostIFIPIMode +HostIF_IPI(VMDriver *vm, // IN: + const VCPUSet *ipiTargets) // IN: +{ + HostIFIPIMode mode = IPI_NONE; + + ASSERT(vm); + + FOR_EACH_VCPU_IN_SET(ipiTargets, v) { + uint32 targetHostCpu = vm->currentHostCpu[v]; + if (targetHostCpu != INVALID_PCPU) { + ASSERT(targetHostCpu < MAX_PCPUS); +#if defined(VMMON_USE_CALL_FUNC) + /* older kernels IPI broadcast; use async when possible */ + (void) compat_smp_call_function(LinuxDriverIPIHandler, + NULL, VMMON_CALL_FUNC_SYNC); + mode = IPI_BROADCAST; + break; +#else + /* Newer kernels have (async) IPI targetting */ + arch_send_call_function_single_ipi(targetHostCpu); + mode = IPI_UNICAST; +#endif + } + } ROF_EACH_VCPU_IN_SET(); + + return mode; +} + + +typedef struct { + Atomic_uint32 index; + CPUIDQuery *query; +} HostIFGetCpuInfoData; + + +/* + *----------------------------------------------------------------------------- + * + * HostIFGetCpuInfo -- + * + * Collect CPUID information on the current logical CPU. + * + * Results: + * None. + * + * Side effects: + * 'data->index' is atomically incremented by one. + * + *----------------------------------------------------------------------------- + */ + +static void +HostIFGetCpuInfo(void *clientData) // IN/OUT: A HostIFGetCpuInfoData * +{ + HostIFGetCpuInfoData *data = (HostIFGetCpuInfoData *)clientData; + CPUIDQuery *query; + uint32 index; + + ASSERT(data); + query = data->query; + ASSERT(query); + + index = Atomic_ReadInc32(&data->index); + if (index >= query->numLogicalCPUs) { + return; + } + + query->logicalCPUs[index].tag = HostIF_GetCurrentPCPU(); + __GET_CPUID2(query->eax, query->ecx, &query->logicalCPUs[index].regs); +} + + +/* + *----------------------------------------------------------------------------- + * + * HostIF_GetAllCpuInfo -- + * + * Collect CPUID information on all logical CPUs. + * + * 'query->numLogicalCPUs' is the size of the 'query->logicalCPUs' output + * array. + * + * Results: + * On success: TRUE. 'query->logicalCPUs' is filled and + * 'query->numLogicalCPUs' is adjusted accordingly. + * On failure: FALSE. Happens if 'query->numLogicalCPUs' was too small. + * + * Side effects: + * None + * + *----------------------------------------------------------------------------- + */ + +Bool +HostIF_GetAllCpuInfo(CPUIDQuery *query) // IN/OUT +{ + HostIFGetCpuInfoData data; + + Atomic_Write32(&data.index, 0); + data.query = query; + + /* + * XXX Linux has userland APIs to bind a thread to a processor, so we could + * probably implement this in userland like we do on Win32. + */ + + HostIF_CallOnEachCPU(HostIFGetCpuInfo, &data); + + /* + * At this point, Atomic_Read32(&data.index) is the number of logical CPUs + * who replied. + */ + + if (Atomic_Read32(&data.index) > query->numLogicalCPUs) { + return FALSE; + } + + ASSERT(Atomic_Read32(&data.index) <= query->numLogicalCPUs); + query->numLogicalCPUs = Atomic_Read32(&data.index); + + return TRUE; +} + + +/* + *---------------------------------------------------------------------- + * + * HostIF_CallOnEachCPU -- + * + * Call specified function once on each CPU. No ordering guarantees. + * + * Results: + * None. + * + * Side effects: + * None. May be slow. 
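+ *      (The function runs directly on the calling CPU with preemption
+ *      disabled and is broadcast to the remaining CPUs via
+ *      compat_smp_call_function().)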
+ * + *---------------------------------------------------------------------- + */ + +void +HostIF_CallOnEachCPU(void (*func)(void*), // IN: function to call + void *data) // IN/OUT: argument to function +{ + preempt_disable(); + (*func)(data); + (void)compat_smp_call_function(*func, data, 1); + preempt_enable(); +} + + +/* + *---------------------------------------------------------------------- + * + * HostIF_ReadPage -- + * + * puts the content of a machine page into a kernel or user mode + * buffer. + * + * Results: + * 0 on success + * negative error code on error + * + * Side effects: + * none + * + *---------------------------------------------------------------------- + */ + +int +HostIF_ReadPage(MPN mpn, // MPN of the page + VA64 addr, // buffer for data + Bool kernelBuffer) // is the buffer in kernel space? +{ + void *buf = VA64ToPtr(addr); + int ret = 0; + const void* ptr; + struct page* page; + + if (mpn == INVALID_MPN) { + return -EFAULT; + } + + page = pfn_to_page(mpn); + ptr = kmap(page); + if (ptr == NULL) { + return -ENOMEM; + } + + if (kernelBuffer) { + memcpy(buf, ptr, PAGE_SIZE); + } else { + ret = HostIF_CopyToUser(buf, ptr, PAGE_SIZE); + } + kunmap(page); + + return ret; +} + + +/* + *---------------------------------------------------------------------- + * + * HostIF_WritePage -- + * + * Put the content of a kernel or user mode buffer into a machine + * page. + * + * Results: + * 0 on success + * negative error code on error + * + * Side effects: + * none + * + *---------------------------------------------------------------------- + */ + +int +HostIF_WritePage(MPN mpn, // MPN of the page + VA64 addr, // data to write to the page + Bool kernelBuffer) // is the buffer in kernel space? +{ + void const *buf = VA64ToPtr(addr); + int ret = 0; + void* ptr; + struct page* page; + + if (mpn == INVALID_MPN) { + return -EFAULT; + } + + page = pfn_to_page(mpn); + ptr = kmap(page); + if (ptr == NULL) { + return -ENOMEM; + } + + if (kernelBuffer) { + memcpy(ptr, buf, PAGE_SIZE); + } else { + ret = HostIF_CopyFromUser(ptr, buf, PAGE_SIZE); + } + kunmap(page); + + return ret; +} + + +/* + *---------------------------------------------------------------------- + * + * HostIF_GetLockedPageList -- + * + * puts MPNs of pages that were allocated by HostIF_AllocLockedPages() + * into user mode buffer. + * + * Results: + * non-negative number of the MPNs in the buffer on success. + * negative error code on error (-EFAULT) + * + * Side effects: + * none + * + *---------------------------------------------------------------------- + */ + +int +HostIF_GetLockedPageList(VMDriver* vm, // IN: VM instance pointer + VA64 uAddr, // OUT: user mode buffer for MPNs + unsigned int numPages) // IN: size of the buffer in MPNs +{ + MPN *mpns = VA64ToPtr(uAddr); + MPN mpn; + unsigned count; + + struct PhysTracker* AWEPages; + + if (!vm->vmhost || !vm->vmhost->AWEPages) { + return 0; + } + AWEPages = vm->vmhost->AWEPages; + + for (mpn = 0, count = 0; + (count < numPages) && + (INVALID_MPN != (mpn = PhysTrack_GetNext(AWEPages, mpn))); + count++) { + + if (HostIF_CopyToUser(&mpns[count], &mpn, sizeof *mpns) != 0) { + return -EFAULT; + } + } + + return count; +} + + +/* + *----------------------------------------------------------------------------- + * + * HostIF_GetNextAnonPage -- + * + * If "inMPN" is INVALID_MPN gets the first MPN in the anon mpn list else + * gets the anon mpn after "inMPN" in the anon mpn list. + * + * Results: + * Next anon MPN. If the list has been exhausted, returns INVALID_MPN. 
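+ *
+ *      A typical (illustrative) way to walk the whole list, based on the
+ *      contract above:
+ *
+ *         MPN mpn;
+ *
+ *         for (mpn = HostIF_GetNextAnonPage(vm, INVALID_MPN);
+ *              mpn != INVALID_MPN;
+ *              mpn = HostIF_GetNextAnonPage(vm, mpn)) {
+ *            // consume mpn
+ *         }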
+ * + *----------------------------------------------------------------------------- + */ + +MPN +HostIF_GetNextAnonPage(VMDriver *vm, MPN inMPN) +{ + if (!vm->vmhost || !vm->vmhost->AWEPages) { + return INVALID_MPN; + } + return PhysTrack_GetNext(vm->vmhost->AWEPages, inMPN); +} + + +/* + *---------------------------------------------------------------------- + * + * HostIF_GetCurrentPCPU -- + * + * Get current physical CPU id. Interrupts should be disabled so + * that the thread cannot move to another CPU. + * + * Results: + * Host CPU number. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +uint32 +HostIF_GetCurrentPCPU(void) +{ + return smp_processor_id(); +} + + +#ifdef VMMON_USE_COMPAT_SCHEDULE_HRTIMEOUT +/* + *---------------------------------------------------------------------- + * + * HostIFWakeupClockThread -- + * + * Wake up the fast clock thread. Can't do this from the timer + * callback, because it holds locks that the scheduling code + * might take. + * + * Results: + * None. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +static void +HostIFWakeupClockThread(unsigned long data) //IN: +{ + wake_up_process(linuxState.fastClockThread); +} + + +/* + *---------------------------------------------------------------------- + * + * HostIFTimerCallback -- + * + * Schedule a tasklet to wake up the fast clock thread. + * + * Results: + * Tell the kernel not to restart the timer. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +static enum hrtimer_restart +HostIFTimerCallback(struct hrtimer *timer) //IN: +{ + tasklet_schedule(&timerTasklet); + + return HRTIMER_NORESTART; +} + + +/* + *---------------------------------------------------------------------- + * + * HostIFScheduleHRTimeout -- + * + * Schedule an hrtimer to wake up the fast clock thread. + * + * Results: + * None. + * + * Side effects: + * Sleep. + * + *---------------------------------------------------------------------- + */ + +static void +HostIFScheduleHRTimeout(ktime_t *expires) //IN: +{ + struct hrtimer t; + + if (expires && !expires->tv64) { + __set_current_state(TASK_RUNNING); + + return; + } + + hrtimer_init(&t, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + t.function = HostIFTimerCallback; + hrtimer_start(&t, *expires, HRTIMER_MODE_REL); + + if (hrtimer_active(&t)) { + schedule(); + } + + hrtimer_cancel(&t); + __set_current_state(TASK_RUNNING); +} +#endif //VMMON_USE_COMPAT_SCHEDULE_HRTIMEOUT + + +#ifndef VMMON_USE_HIGH_RES_TIMERS +/* + *---------------------------------------------------------------------- + * + * HostIFDoIoctl -- + * + * Issue ioctl. Assume kernel is not locked. It is not true now, + * but it makes things easier to understand, and won't surprise us + * later when we get rid of kernel lock from our code. + * + * Results: + * Same as ioctl method. + * + * Side effects: + * none. + * + *---------------------------------------------------------------------- + */ + +static long +HostIFDoIoctl(struct file *filp, + u_int iocmd, + unsigned long ioarg) +{ + if (filp->f_op->unlocked_ioctl) { + return filp->f_op->unlocked_ioctl(filp, iocmd, ioarg); + } + return -ENOIOCTLCMD; +} +#endif //VMON_USE_HIGH_RES_TIMERS + + +/* + *---------------------------------------------------------------------- + * + * HostIFStartTimer -- + * + * Starts the timer using either /dev/rtc or high-resolution timers. 
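+ *      With VMMON_USE_HIGH_RES_TIMERS the wait is a schedule_hrtimeout
+ *      (or the compat fallback above); otherwise the RTC periodic
+ *      interrupt rate is programmed with RTC_IRQP_SET and a blocking
+ *      read of /dev/rtc waits for the next tick.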
+ * + * Results: + * Returns 0 on success, -1 on failure. + * + * Side effects: + * Sleep until timer expires. + * + *---------------------------------------------------------------------- + */ + +int +HostIFStartTimer(Bool rateChanged, //IN: Did rate change? + unsigned int rate, //IN: current clock rate + struct file *filp) //IN: /dev/rtc descriptor +{ +#ifdef VMMON_USE_HIGH_RES_TIMERS + static unsigned long slack = 0; + static ktime_t expires; + int timerPeriod; + + if (rateChanged) { + timerPeriod = NSEC_PER_SEC / rate; + expires = ktime_set(0, timerPeriod); + /* + * Allow the kernel to expire the timer at its convenience. + * ppoll() uses 0.1% of the timeout value. I think we can + * tolerate 1%. + */ + + slack = timerPeriod / 100; + } + set_current_state(TASK_INTERRUPTIBLE); +# ifdef VMMON_USE_SCHEDULE_HRTIMEOUT + schedule_hrtimeout_range(&expires, slack, HRTIMER_MODE_REL); +# else + HostIFScheduleHRTimeout(&expires); +# endif +#else + unsigned p2rate; + int res; + unsigned long buf; + loff_t pos = 0; + + if (rateChanged) { + /* + * The host will already have HZ timer interrupts per second. So + * in order to satisfy the requested rate, we need up to (rate - + * HZ) additional interrupts generated by the RTC. That way, if + * the guest ask for a bit more than 1024 virtual interrupts per + * second (which is a common case for Windows with multimedia + * timers), we'll program the RTC to 1024 rather than 2048, which + * saves a considerable amount of CPU. PR 519228. + */ + if (rate > HZ) { + rate -= HZ; + } else { + rate = 0; + } + /* + * Don't set the RTC rate to 64 Hz or lower: some kernels have a + * bug in the HPET emulation of RTC that will cause the RTC + * frequency to get stuck at 64Hz. See PR 519228 comment #23. + */ + p2rate = 128; + // Hardware rate must be a power of 2 + while (p2rate < rate && p2rate < 8192) { + p2rate <<= 1; + } + + res = HostIFDoIoctl(filp, RTC_IRQP_SET, p2rate); + if (res < 0) { + Warning("/dev/rtc set rate %d failed: %d\n", p2rate, res); + + return -1; + } + if (kthread_should_stop()) { + return -1; + } + } + res = filp->f_op->read(filp, (void *) &buf, sizeof(buf), &pos); + if (res <= 0) { + if (res != -ERESTARTSYS) { + Log("/dev/rtc read failed: %d\n", res); + } + + return -1; + } +#endif + + return 0; +} + + +/* + *---------------------------------------------------------------------- + * + * HostIFFastClockThread -- + * + * Kernel thread that provides finer-grained wakeups than the + * main system timers by using /dev/rtc. We can't do this at + * user level because /dev/rtc is not sharable (PR 19266). Also, + * we want to avoid the overhead of a context switch out to user + * level on every RTC interrupt. + * + * Results: + * Returns 0. + * + * Side effects: + * Wakeups and IPIs. + * + *---------------------------------------------------------------------- + */ + +static int +HostIFFastClockThread(void *data) // IN: +{ + struct file *filp = (struct file *) data; + int res; + mm_segment_t oldFS; + unsigned int rate = 0; + unsigned int prevRate = 0; + + oldFS = get_fs(); + set_fs(KERNEL_DS); + allow_signal(SIGKILL); + set_user_nice(current, linuxState.fastClockPriority); + + while ((rate = linuxState.fastClockRate) > MIN_RATE) { + if (kthread_should_stop()) { + goto out; + } + res = HostIFStartTimer(rate != prevRate, rate, filp); + if (res < 0) { + goto out; + } + prevRate = rate; + +#if defined(CONFIG_SMP) + /* + * IPI each VCPU thread that is in the monitor and is due to + * fire a MonTimer callback. 
+ */ + Vmx86_MonTimerIPI(); +#endif + + /* + * Wake threads that are waiting for a fast poll timeout at + * userlevel. This is needed only on Linux. On Windows, + * we get shorter timeouts simply by increasing the host + * clock rate. + */ + + LinuxDriverWakeUp(TRUE); + } + + out: + LinuxDriverWakeUp(TRUE); + set_fs(oldFS); + + /* + * Do not exit thread until we are told to do so. + */ + + do { + set_current_state(TASK_UNINTERRUPTIBLE); + if (kthread_should_stop()) { + break; + } + schedule(); + } while (1); + set_current_state(TASK_RUNNING); + + return 0; +} + + +/* + *----------------------------------------------------------------------------- + * + * HostIF_SetFastClockRate -- + * + * The monitor wants to poll for events at the given rate. + * Ensure that the host OS's timer interrupts come at least at + * this rate. If the requested rate is greater than the rate at + * which timer interrupts will occur on CPUs other than 0, then + * also arrange to call Vmx86_MonitorPollIPI on every timer + * interrupt, in order to relay IPIs to any other CPUs that need + * them. + * + * Locking: + * The caller must hold the fast clock lock. + * + * Results: + * 0 for success; positive error code if /dev/rtc could not be opened. + * + * Side effects: + * As described above. + * + *----------------------------------------------------------------------------- + */ + +int +HostIF_SetFastClockRate(unsigned int rate) // IN: Frequency in Hz. +{ + ASSERT(MutexIsLocked(&fastClockMutex)); + linuxState.fastClockRate = rate; + + /* + * Overview + * -------- + * An SMP Linux kernel programs the 8253 timer (to increment the 'jiffies' + * counter) _and_ all local APICs (to run the scheduler code) to deliver + * interrupts HZ times a second. + * + * Time + * ---- + * The kernel tries very hard to spread all these interrupts evenly over + * time, i.e. on a 1 CPU system, the 1 local APIC phase is shifted by 1/2 + * period compared to the 8253, and on a 2 CPU system, the 2 local APIC + * phases are respectively shifted by 1/3 and 2/3 period compared to the + * 8253. This is done to reduce contention on locks guarding the global task + * queue. + * + * Space + * ----- + * The 8253 interrupts are distributed between physical CPUs, evenly on a P3 + * system, whereas on a P4 system physical CPU 0 gets all of them. + * + * Long story short, unless the monitor requested rate is significantly + * higher than HZ, we don't need to send IPIs or exclusively grab /dev/rtc + * to periodically kick vCPU threads running in the monitor on all physical + * CPUs. + */ + + if (rate > MIN_RATE) { + if (!linuxState.fastClockThread) { + struct task_struct *rtcTask; + struct file *filp = NULL; + +#if !defined(VMMON_USE_HIGH_RES_TIMERS) + int res; + + filp = filp_open("/dev/rtc", O_RDONLY, 0); + if (IS_ERR(filp)) { + Warning("/dev/rtc open failed: %d\n", (int)(VA)filp); + + return -(int)(VA)filp; + } + res = HostIFDoIoctl(filp, RTC_PIE_ON, 0); + if (res < 0) { + Warning("/dev/rtc enable interrupt failed: %d\n", res); + filp_close(filp, current->files); + + return -res; + } +#endif + rtcTask = kthread_run(HostIFFastClockThread, filp, "vmware-rtc"); + if (IS_ERR(rtcTask)) { + long err = PTR_ERR(rtcTask); + + /* + * Ignore ERESTARTNOINTR silently, it occurs when signal is + * pending, and syscall layer automatically reissues operation + * after signal is handled. 
+ */ + + if (err != -ERESTARTNOINTR) { + Warning("/dev/rtc cannot start watch thread: %ld\n", err); + } + close_rtc(filp, current->files); + + return -err; + } + linuxState.fastClockThread = rtcTask; + linuxState.fastClockFile = filp; + } + } else { + if (linuxState.fastClockThread) { + force_sig(SIGKILL, linuxState.fastClockThread); + kthread_stop(linuxState.fastClockThread); + close_rtc(linuxState.fastClockFile, current->files); + + linuxState.fastClockThread = NULL; + linuxState.fastClockFile = NULL; + } + } + + return 0; +} + + +/* + *----------------------------------------------------------------------------- + * + * HostIF_MapUserMem -- + * + * Obtain kernel pointer to user memory. The pages backing the user memory + * address are locked into memory (this allows the pointer to be used in + * contexts where paging is undesirable or impossible). + * + * Results: + * On success, returns the kernel virtual address, along with a handle to + * be used for unmapping. + * On failure, returns NULL. + * + * Side effects: + * Yes. + * + *----------------------------------------------------------------------------- + */ + +void * +HostIF_MapUserMem(VA addr, // IN: User memory virtual address + size_t size, // IN: Size of memory desired + VMMappedUserMem **handle) // OUT: Handle to mapped memory +{ + void *p = (void *) (uintptr_t) addr; + VMMappedUserMem *newHandle; + VA offset = addr & (PAGE_SIZE - 1); + size_t numPagesNeeded = ((offset + size) / PAGE_SIZE) + 1; + size_t handleSize = + sizeof *newHandle + numPagesNeeded * sizeof newHandle->pages[0]; + void *mappedAddr; + + ASSERT(handle); + + if (!access_ok(VERIFY_WRITE, p, size)) { + printk(KERN_ERR "%s: Couldn't verify write to uva 0x%p with size %" + FMTSZ"u\n", __func__, p, size); + + return NULL; + } + + newHandle = kmalloc(handleSize, GFP_KERNEL); + if (newHandle == NULL) { + printk(KERN_ERR "%s: Couldn't allocate %"FMTSZ"u bytes of memory\n", + __func__, handleSize); + + return NULL; + } + + if (HostIFGetUserPages(p, newHandle->pages, numPagesNeeded)) { + kfree(newHandle); + printk(KERN_ERR "%s: Couldn't get %"FMTSZ"u %s for uva 0x%p\n", __func__, + numPagesNeeded, numPagesNeeded > 1 ? "pages" : "page", p); + + return NULL; + } + + if (numPagesNeeded > 1) { + /* + * Unlike kmap(), vmap() can fail. If it does, we need to release the + * pages that we acquired in HostIFGetUserPages(). + */ + + mappedAddr = vmap(newHandle->pages, numPagesNeeded, VM_MAP, PAGE_KERNEL); + if (mappedAddr == NULL) { + unsigned int i; + for (i = 0; i < numPagesNeeded; i++) { + put_page(newHandle->pages[i]); + } + kfree(newHandle); + printk(KERN_ERR "%s: Couldn't vmap %"FMTSZ"u %s for uva 0x%p\n", + __func__, numPagesNeeded, + numPagesNeeded > 1 ? "pages" : "page", p); + + return NULL; + } + } else { + mappedAddr = kmap(newHandle->pages[0]); + } + + printk(KERN_DEBUG "%s: p = 0x%p, offset = 0x%p, numPagesNeeded = %"FMTSZ"u," + " handleSize = %"FMTSZ"u, mappedAddr = 0x%p\n", + __func__, p, (void *)offset, numPagesNeeded, handleSize, mappedAddr); + + newHandle->numPages = numPagesNeeded; + newHandle->addr = mappedAddr; + *handle = newHandle; + + return mappedAddr + offset; +} + + +/* + *----------------------------------------------------------------------------- + * + * HostIF_UnmapUserMem -- + * + * Unmap user memory from HostIF_MapUserMem(). + * + * Results: + * None. + * + * Side effects: + * Yes. 
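+ *      (Drops the kernel mapping with vunmap()/kunmap() and releases each
+ *      page locked by HostIF_MapUserMem() with put_page().)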
+ * + *----------------------------------------------------------------------------- + */ + +void +HostIF_UnmapUserMem(VMMappedUserMem *handle) // IN: Handle to mapped memory +{ + unsigned int i; + + if (handle == NULL) { + return; + } + + printk(KERN_DEBUG "%s: numPages = %"FMTSZ"u, addr = 0x%p\n", + __func__, handle->numPages, handle->addr); + + if (handle->numPages > 1) { + vunmap(handle->addr); + } else { + kunmap(handle->pages[0]); + } + + for (i = 0; i < handle->numPages; i++) { + put_page(handle->pages[i]); + } + kfree(handle); +} + +/* + *----------------------------------------------------------------------------- + * + * HostIF_SafeRDMSR -- + * + * Attempt to read a MSR, and handle the exception if the MSR + * is unimplemented. + * + * Results: + * 0 if successful, and MSR value is returned via *val. + * + * If the MSR is unimplemented, *val is set to 0, and a + * non-zero value is returned: -1 for Win32, -EFAULT for Linux, + * and 1 for MacOS. + * + * Side effects: + * None + * + *----------------------------------------------------------------------------- + */ +int +HostIF_SafeRDMSR(unsigned int msr, // IN + uint64 *val) // OUT: MSR value +{ + int ret; + unsigned low, high; + asm volatile("2: rdmsr ; xor %0,%0\n" + "1:\n\t" + ".section .fixup,\"ax\"\n\t" + "3: mov %4,%0 ; jmp 1b\n\t" + ".previous\n\t" + VMW_ASM_EXTABLE(2b, 3b) + : "=r"(ret), "=a"(low), "=d"(high) + : "c"(msr), "i"(-EFAULT), "1"(0), "2"(0)); // init eax/edx to 0 + *val = (low | ((u64)(high) << 32)); + + return ret; +} + diff --git a/vmmon-hostif.c/hostif.patch b/vmmon-hostif.c/hostif.patch new file mode 100644 index 0000000..99b7089 --- /dev/null +++ b/vmmon-hostif.c/hostif.patch @@ -0,0 +1,33 @@ +diff -Naur vmmon-only-bak/linux/hostif.c vmmon-only/linux/hostif.c +--- vmmon-only-bak/linux/hostif.c 2017-02-28 17:05:34.764176166 +0100 ++++ vmmon-only/linux/hostif.c 2017-02-28 17:07:07.966050524 +0100 +@@ -1160,10 +1160,29 @@ + unsigned int numPages) // IN + { + int retval; ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,9,0) ++ unsigned int flags = 0; // No rights ++#endif + + down_read(¤t->mm->mmap_sem); ++ ++ ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,9,0) ++ retval = get_user_pages_remote(current, current->mm, (unsigned long)uvAddr, ++ numPages, flags, ppages, NULL); ++#else ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 6, 0) + retval = get_user_pages_remote(current, current->mm, (unsigned long)uvAddr, + numPages, 0, 0, ppages, NULL); ++#else ++ retval = get_user_pages(current, current->mm, (unsigned long)uvAddr, ++ numPages, 0, 0, ppages, NULL); ++#endif ++#endif ++ ++ ++ + up_read(¤t->mm->mmap_sem); + + return retval != numPages; diff --git a/vmnet-userif.c/userif.c b/vmnet-userif.c/userif.c new file mode 100644 index 0000000..d1648a4 --- /dev/null +++ b/vmnet-userif.c/userif.c @@ -0,0 +1,1155 @@ +/********************************************************* + * Copyright (C) 1998-2013 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation version 2 and no later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. 
+ * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + *********************************************************/ + +#include "driver-config.h" + +#define EXPORT_SYMTAB + +#define __KERNEL_SYSCALLS__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include "vnetInt.h" +#include "compat_skbuff.h" +#include "vmnetInt.h" +#include "vm_atomic.h" +#include "vm_assert.h" +#include "monitorAction_exported.h" + +typedef struct VNetUserIFStats { + unsigned read; + unsigned written; + unsigned queued; + unsigned droppedDown; + unsigned droppedMismatch; + unsigned droppedOverflow; + unsigned droppedLargePacket; +} VNetUserIFStats; + +typedef struct VNetUserIF { + VNetPort port; + struct sk_buff_head packetQueue; + Atomic_uint32 *pollPtr; + MonitorActionIntr *actionIntr; + uint32 pollMask; + MonitorIdemAction actionID; + uint32* recvClusterCount; + wait_queue_head_t waitQueue; + struct page* actPage; + struct page* pollPage; + struct page* recvClusterPage; + VNetUserIFStats stats; + VNetEvent_Sender *eventSender; +} VNetUserIF; + +static void VNetUserIfUnsetupNotify(VNetUserIF *userIf); +static int VNetUserIfSetupNotify(VNetUserIF *userIf, VNet_Notify *vn); +static int VNetUserIfSetUplinkState(VNetPort *port, uint8 linkUp); +extern unsigned int vnet_max_qlen; + +#if COMPAT_LINUX_VERSION_CHECK_LT(3, 2, 0) +# define compat_kmap(page) kmap(page) +# define compat_kunmap(page) kunmap(page) +#else +# define compat_kmap(page) kmap((page).p) +# define compat_kunmap(page) kunmap((page).p) +#endif + +/* + *----------------------------------------------------------------------------- + * + * UserifLockPage -- + * + * Lock in core the physical page associated to a valid virtual + * address. + * + * Results: + * The page structure on success + * NULL on failure: memory pressure. Retry later + * + * Side effects: + * Loads page into memory + * + *----------------------------------------------------------------------------- + */ + +static INLINE struct page * +UserifLockPage(VA addr) // IN +{ + struct page *page = NULL; + int retval; + + down_read(¤t->mm->mmap_sem); + retval = get_user_pages_remote(current, current->mm, addr, + 1, 1, 0, &page, NULL); + up_read(¤t->mm->mmap_sem); + + if (retval != 1) { + return NULL; + } + + return page; +} + + +/* + *----------------------------------------------------------------------------- + * + * VNetUserIfMapUint32Ptr -- + * + * Maps a portion of user-space memory into the kernel. + * + * Results: + * 0 on success + * < 0 on failure: the actual value determines the type of failure + * + * Side effects: + * Might sleep. 
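+ *      (The mapped object must be writable by the caller and must not
+ *      cross a page boundary; otherwise -EINVAL is returned before any
+ *      page is locked.)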
+ * + *----------------------------------------------------------------------------- + */ + +static INLINE int +VNetUserIfMapPtr(VA uAddr, // IN: pointer to user memory + size_t size, // IN: size of data + struct page **p, // OUT: locked page + void **ptr) // OUT: kernel mapped pointer +{ + if (!access_ok(VERIFY_WRITE, (void *)uAddr, size) || + (((uAddr + size - 1) & ~(PAGE_SIZE - 1)) != + (uAddr & ~(PAGE_SIZE - 1)))) { + return -EINVAL; + } + + *p = UserifLockPage(uAddr); + if (*p == NULL) { + return -EAGAIN; + } + + *ptr = (uint8 *)kmap(*p) + (uAddr & (PAGE_SIZE - 1)); + return 0; +} + +static INLINE int +VNetUserIfMapUint32Ptr(VA uAddr, // IN: pointer to user memory + struct page **p, // OUT: locked page + uint32 **ptr) // OUT: kernel mapped pointer +{ + return VNetUserIfMapPtr(uAddr, sizeof **ptr, p, (void **)ptr); +} + +/* + *----------------------------------------------------------------------------- + * + * VNetUserIfSetupNotify -- + * + * Sets up notification by filling in pollPtr, actPtr, and recvClusterCount + * fields. + * + * Results: + * 0 on success + * < 0 on failure: the actual value determines the type of failure + * + * Side effects: + * Fields pollPtr, actPtr, recvClusterCount, pollPage, actPage, and + * recvClusterPage are filled in VNetUserIf structure. + * + *----------------------------------------------------------------------------- + */ + +static INLINE int +VNetUserIfSetupNotify(VNetUserIF *userIf, // IN + VNet_Notify *vn) // IN +{ + unsigned long flags; + struct sk_buff_head *q = &userIf->packetQueue; + uint32 *pollPtr; + MonitorActionIntr *actionIntr; + uint32 *recvClusterCount; + struct page *pollPage = NULL; + struct page *actPage = NULL; + struct page *recvClusterPage = NULL; + int retval; + + if (userIf->pollPtr || userIf->actionIntr || userIf->recvClusterCount) { + LOG(0, (KERN_DEBUG "vmnet: Notification mechanism already active\n")); + return -EBUSY; + } + + if ((retval = VNetUserIfMapUint32Ptr((VA)vn->pollPtr, &pollPage, + &pollPtr)) < 0) { + return retval; + } + + /* Atomic operations require proper alignment */ + if ((uintptr_t)pollPtr & (sizeof *pollPtr - 1)) { + LOG(0, (KERN_DEBUG "vmnet: Incorrect notify alignment\n")); + retval = -EFAULT; + goto error_free; + } + + if ((retval = VNetUserIfMapPtr((VA)vn->actPtr, sizeof *actionIntr, + &actPage, + (void **)&actionIntr)) < 0) { + goto error_free; + } + + if ((retval = VNetUserIfMapUint32Ptr((VA)vn->recvClusterPtr, + &recvClusterPage, + &recvClusterCount)) < 0) { + goto error_free; + } + + spin_lock_irqsave(&q->lock, flags); + if (userIf->pollPtr || userIf->actionIntr || userIf->recvClusterCount) { + spin_unlock_irqrestore(&q->lock, flags); + retval = -EBUSY; + LOG(0, (KERN_DEBUG "vmnet: Notification mechanism already active\n")); + goto error_free; + } + + userIf->pollPtr = (Atomic_uint32 *)pollPtr; + userIf->pollPage = pollPage; + userIf->actionIntr = actionIntr; + userIf->actPage = actPage; + userIf->recvClusterCount = recvClusterCount; + userIf->recvClusterPage = recvClusterPage; + userIf->pollMask = vn->pollMask; + userIf->actionID = vn->actionID; + spin_unlock_irqrestore(&q->lock, flags); + return 0; + + error_free: + if (pollPage) { + kunmap(pollPage); + put_page(pollPage); + } + if (actPage) { + kunmap(actPage); + put_page(actPage); + } + if (recvClusterPage) { + kunmap(recvClusterPage); + put_page(recvClusterPage); + } + return retval; +} + +/* + *---------------------------------------------------------------------- + * + * VNetUserIfUnsetupNotify -- + * + * Destroys permanent mapping for 
notify structure provided by user. + * + * Results: + * None. + * + * Side effects: + * Fields pollPtr, actPtr, recvClusterCount, etc. in VNetUserIf + * structure are cleared. + * + *---------------------------------------------------------------------- + */ + +static void +VNetUserIfUnsetupNotify(VNetUserIF *userIf) // IN +{ + unsigned long flags; + struct page *pollPage = userIf->pollPage; + struct page *actPage = userIf->actPage; + struct page *recvClusterPage = userIf->recvClusterPage; + + struct sk_buff_head *q = &userIf->packetQueue; + + spin_lock_irqsave(&q->lock, flags); + userIf->pollPtr = NULL; + userIf->pollPage = NULL; + userIf->actionIntr = NULL; + userIf->actPage = NULL; + userIf->recvClusterCount = NULL; + userIf->recvClusterPage = NULL; + userIf->pollMask = 0; + userIf->actionID = -1; + spin_unlock_irqrestore(&q->lock, flags); + + /* Release */ + if (pollPage) { + kunmap(pollPage); + put_page(pollPage); + } + if (actPage) { + kunmap(actPage); + put_page(actPage); + } + if (recvClusterPage) { + kunmap(recvClusterPage); + put_page(recvClusterPage); + } +} + + +/* + *---------------------------------------------------------------------- + * + * VNetUserIfFree -- + * + * Free the user interface port. + * + * Results: + * None. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +static void +VNetUserIfFree(VNetJack *this) // IN +{ + VNetUserIF *userIf = (VNetUserIF*)this; + struct sk_buff *skb; + + for (;;) { + skb = skb_dequeue(&userIf->packetQueue); + if (skb == NULL) { + break; + } + dev_kfree_skb(skb); + } + + if (userIf->pollPtr) { + VNetUserIfUnsetupNotify(userIf); + } + + if (userIf->eventSender) { + VNetEvent_DestroySender(userIf->eventSender); + } + + if (this->procEntry) { + VNetProc_RemoveEntry(this->procEntry); + } + + kfree(userIf); +} + + +/* + *---------------------------------------------------------------------- + * + * VNetUserIfReceive -- + * + * This jack is receiving a packet. Take appropriate action. + * + * Results: + * None. + * + * Side effects: + * Frees skb. + * + *---------------------------------------------------------------------- + */ + +static void +VNetUserIfReceive(VNetJack *this, // IN + struct sk_buff *skb) // IN +{ + VNetUserIF *userIf = (VNetUserIF*)this->private; + uint8 *dest = SKB_2_DESTMAC(skb); + unsigned long flags; + + if (!UP_AND_RUNNING(userIf->port.flags)) { + userIf->stats.droppedDown++; + goto drop_packet; + } + + if (!VNetPacketMatch(dest, + userIf->port.paddr, + (const uint8 *)userIf->port.exactFilter, + userIf->port.exactFilterLen, + userIf->port.ladrf, + userIf->port.flags)) { + userIf->stats.droppedMismatch++; + goto drop_packet; + } + + if (skb_queue_len(&userIf->packetQueue) >= vnet_max_qlen) { + userIf->stats.droppedOverflow++; + goto drop_packet; + } + + if (skb->len > ETHER_MAX_QUEUED_PACKET) { + userIf->stats.droppedLargePacket++; + goto drop_packet; + } + + userIf->stats.queued++; + + spin_lock_irqsave(&userIf->packetQueue.lock, flags); + /* + * __skb_dequeue_tail does not take any locks so must be used with + * appropriate locks held only. 
+ */ + __skb_queue_tail(&userIf->packetQueue, skb); + if (userIf->pollPtr) { + Atomic_Or(userIf->pollPtr, userIf->pollMask); + if (skb_queue_len(&userIf->packetQueue) >= (*userIf->recvClusterCount)) { + MonitorAction_SetBits(userIf->actionIntr, userIf->actionID); + } + } + spin_unlock_irqrestore(&userIf->packetQueue.lock, flags); + + wake_up(&userIf->waitQueue); + return; + + drop_packet: + dev_kfree_skb(skb); +} + + +/* + *---------------------------------------------------------------------- + * + * VNetUserIfProcRead -- + * + * Callback for read operation on this userif entry in vnets proc fs. + * + * Results: + * Length of read operation. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +static int +VNetUserIfProcRead(char *page, // IN/OUT: buffer to write into + char **start, // OUT: 0 if file < 4k, else offset into + // page + off_t off, // IN: offset of read into the file + int count, // IN: maximum number of bytes to read + int *eof, // OUT: TRUE if there is nothing more to + // read + void *data) // IN: client data - not used +{ + VNetUserIF *userIf = (VNetUserIF*)data; + int len = 0; + + if (!userIf) { + return len; + } + + len += VNetPrintPort(&userIf->port, page+len); + + len += sprintf(page+len, "read %u written %u queued %u ", + userIf->stats.read, + userIf->stats.written, + userIf->stats.queued); + + len += sprintf(page+len, + "dropped.down %u dropped.mismatch %u " + "dropped.overflow %u dropped.largePacket %u", + userIf->stats.droppedDown, + userIf->stats.droppedMismatch, + userIf->stats.droppedOverflow, + userIf->stats.droppedLargePacket); + + len += sprintf(page+len, "\n"); + + *start = 0; + *eof = 1; + return len; +} + + +/* + *---------------------------------------------------------------------- + * + * VNetCopyDatagram -- + * + * Copy part of datagram to userspace. + * + * Results: + * zero on success, + * -EFAULT if buffer is an invalid area + * + * Side effects: + * Data copied to the buffer. + * + *---------------------------------------------------------------------- + */ + +static int +VNetCopyDatagram(const struct sk_buff *skb, // IN: skb to copy + char *buf, // OUT: where to copy data + int len) // IN: length +{ + struct iovec iov = { + .iov_base = buf, + .iov_len = len, + }; +#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 19, 0) + return skb_copy_datagram_iovec(skb, 0, &iov, len); +#else + struct iov_iter ioviter; + + iov_iter_init(&ioviter, READ, &iov, 1, len); + return skb_copy_datagram_iter(skb, 0, &ioviter, len); +#endif +} + + +/* + *---------------------------------------------------------------------- + * + * VNetCsumCopyDatagram -- + * + * Copy part of datagram to userspace doing checksum at same time. + * + * Do not mark this function INLINE, it is recursive! With all gcc's + * released up to now (<= gcc-3.3.1) inlining this function just + * consumes 120 more bytes of code and goes completely mad on + * register allocation, storing almost everything in the memory. + * + * Results: + * folded checksum (non-negative value) on success, + * -EINVAL if offset is too big, + * -EFAULT if buffer is an invalid area + * + * Side effects: + * Data copied to the buffer. 
+ * + *---------------------------------------------------------------------- + */ + +static int +VNetCsumCopyDatagram(const struct sk_buff *skb, // IN: skb to copy + unsigned int offset, // IN: how many bytes skip + char *buf) // OUT: where to copy data +{ + unsigned int csum; + int err = 0; + int len = skb_headlen(skb) - offset; + char *curr = buf; + const skb_frag_t *frag; + + /* + * Something bad happened. We skip only up to skb->nh.raw, and skb->nh.raw + * must be in the header, otherwise we are in the big troubles. + */ + if (len < 0) { + return -EINVAL; + } + + csum = csum_and_copy_to_user(skb->data + offset, curr, len, 0, &err); + if (err) { + return err; + } + curr += len; + + for (frag = skb_shinfo(skb)->frags; + frag != skb_shinfo(skb)->frags + skb_shinfo(skb)->nr_frags; + frag++) { + if (frag->size > 0) { + unsigned int tmpCsum; + const void *vaddr; + + vaddr = compat_kmap(frag->page); + tmpCsum = csum_and_copy_to_user(vaddr + frag->page_offset, + curr, frag->size, 0, &err); + compat_kunmap(frag->page); + + if (err) { + return err; + } + csum = csum_block_add(csum, tmpCsum, curr - buf); + curr += frag->size; + } + } + + for (skb = skb_shinfo(skb)->frag_list; skb != NULL; skb = skb->next) { + int tmpCsum; + + tmpCsum = VNetCsumCopyDatagram(skb, 0, curr); + if (tmpCsum < 0) { + return tmpCsum; + } + /* Folded checksum must be inverted before we can use it */ + csum = csum_block_add(csum, tmpCsum ^ 0xFFFF, curr - buf); + curr += skb->len; + } + return csum_fold(csum); +} + + +/* + *---------------------------------------------------------------------- + * + * VNetCopyDatagramToUser -- + * + * Copy complete datagram to the user space. Fill correct checksum + * into the copied datagram if nobody did it yet. + * + * Results: + * On success byte count, on failure -EFAULT. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +static INLINE_SINGLE_CALLER int +VNetCopyDatagramToUser(const struct sk_buff *skb, // IN + char *buf, // OUT + size_t count) // IN +{ + if (count > skb->len) { + count = skb->len; + } + /* + * If truncation occurs, we do not bother with checksumming - caller cannot + * verify checksum anyway in such case, and copy without checksum is + * faster. + */ + if (skb->pkt_type == PACKET_OUTGOING && /* Packet must be outgoing */ + skb->ip_summed == VM_TX_CHECKSUM_PARTIAL && /* Without checksum */ + compat_skb_network_header_len(skb) && /* We must know where header is */ + skb->len == count) { /* No truncation may occur */ + size_t skl; + int csum; + u_int16_t csum16; + + skl = compat_skb_csum_start(skb); + if (VNetCopyDatagram(skb, buf, skl)) { + return -EFAULT; + } + csum = VNetCsumCopyDatagram(skb, skl, buf + skl); + if (csum < 0) { + return csum; + } + csum16 = csum; + if (copy_to_user(buf + skl + compat_skb_csum_offset(skb), + &csum16, sizeof csum16)) { + return -EFAULT; + } + } else { + if (VNetCopyDatagram(skb, buf, count)) { + return -EFAULT; + } + } + return count; +} + + +/* + *---------------------------------------------------------------------- + * + * VNetUserIfRead -- + * + * The virtual network's read file operation. Reads the next pending + * packet for this network connection. + * + * Results: + * On success the len of the packet received, + * else if no packet waiting and nonblocking 0, + * else -errno. + * + * Side effects: + * None. 
+ * + *---------------------------------------------------------------------- + */ + +static int +VNetUserIfRead(VNetPort *port, // IN + struct file *filp, // IN + char *buf, // OUT + size_t count) // IN +{ + VNetUserIF *userIf = (VNetUserIF*)port->jack.private; + struct sk_buff *skb; + int ret; + unsigned long flags; + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue(&userIf->waitQueue, &wait); + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + skb = skb_peek(&userIf->packetQueue); + if (skb && (skb->len > count)) { + skb = NULL; + ret = -EMSGSIZE; + break; + } + ret = -EAGAIN; + + spin_lock_irqsave(&userIf->packetQueue.lock, flags); + /* + * __skb_dequeue does not take any locks so must be used with + * appropriate locks held only. + */ + skb = __skb_dequeue(&userIf->packetQueue); + if (userIf->pollPtr) { + if (!skb) { + /* List empty */ + Atomic_And(userIf->pollPtr, ~userIf->pollMask); + } + } + spin_unlock_irqrestore(&userIf->packetQueue.lock, flags); + + if (skb != NULL || filp->f_flags & O_NONBLOCK) { + break; + } + ret = -EINTR; + if (signal_pending(current)) { + break; + } + schedule(); + } + __set_current_state(TASK_RUNNING); + remove_wait_queue(&userIf->waitQueue, &wait); + if (! skb) { + return ret; + } + + userIf->stats.read++; + + count = VNetCopyDatagramToUser(skb, buf, count); + dev_kfree_skb(skb); + return count; +} + + +/* + *---------------------------------------------------------------------- + * + * VNetUserIfWrite -- + * + * The virtual network's write file operation. Send the raw packet + * to the network. + * + * Results: + * On success the count of bytes written else errno. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +static int +VNetUserIfWrite(VNetPort *port, // IN + struct file *filp, // IN + const char *buf, // IN + size_t count) // IN +{ + VNetUserIF *userIf = (VNetUserIF*)port->jack.private; + struct sk_buff *skb; + + /* + * Check size + */ + + if (count < sizeof (struct ethhdr) || + count > ETHER_MAX_QUEUED_PACKET) { + return -EINVAL; + } + + /* + * Required to enforce the downWhenAddrMismatch policy in the MAC + * layer. --hpreg + */ + if (!UP_AND_RUNNING(userIf->port.flags)) { + userIf->stats.droppedDown++; + return count; + } + + /* + * Allocate an sk_buff. + */ + + skb = dev_alloc_skb(count + 7); + if (skb == NULL) { + // XXX obey O_NONBLOCK? + return -ENOBUFS; + } + + skb_reserve(skb, 2); + + /* + * Copy the data and send it. + */ + + userIf->stats.written++; + if (copy_from_user(skb_put(skb, count), buf, count)) { + dev_kfree_skb(skb); + return -EFAULT; + } + + VNetSend(&userIf->port.jack, skb); + + return count; +} + + +/* + *----------------------------------------------------------------------------- + * + * VNetUserIfIoctl -- + * + * XXX + * + * Results: + * 0 on success + * -errno on failure + * + * Side effects: + * None + * + *----------------------------------------------------------------------------- + */ + +static int +VNetUserIfIoctl(VNetPort *port, // IN + struct file *filp, // IN + unsigned int iocmd, // IN + unsigned long ioarg) // IN or OUT depending on iocmd +{ + VNetUserIF *userIf = (VNetUserIF*)port->jack.private; + + switch (iocmd) { + case SIOCSETNOTIFY: + return -EINVAL; + case SIOCSETNOTIFY2: +#ifdef VMX86_SERVER + /* + * This ioctl always return failure on ESX since we cannot map pages into + * the console os that are from the VMKernel address space which was the + * only case we used this. 
+ */ + return -EINVAL; +#else // VMX86_SERVER + /* + * ORs pollMask into the integer pointed to by ptr if pending packet. Is + * cleared when all packets are drained. + */ + { + int retval; + VNet_Notify vn; + + if (copy_from_user(&vn, (void *)ioarg, sizeof vn)) { + return -EFAULT; + } + + ASSERT_ON_COMPILE(VNET_NOTIFY_VERSION == 5); + ASSERT_ON_COMPILE(ACTION_EXPORTED_VERSION == 2); + if (vn.version != VNET_NOTIFY_VERSION || + vn.actionVersion != ACTION_EXPORTED_VERSION || + vn.actionID / ACTION_WORD_SIZE >= ACTION_NUM_WORDS) { + return -ENOTTY; + } + + retval = VNetUserIfSetupNotify(userIf, &vn); + if (retval < 0) { + return retval; + } + + break; + } +#endif // VMX86_SERVER + case SIOCUNSETNOTIFY: + if (!userIf->pollPtr) { + /* This should always happen on ESX. */ + return -EINVAL; + } + VNetUserIfUnsetupNotify(userIf); + break; + + case SIOCSIFFLAGS: + /* + * Drain queue when interface is no longer active. We drain the queue to + * avoid having old packets delivered to the guest when reneabled. + */ + + if (!UP_AND_RUNNING(userIf->port.flags)) { + struct sk_buff *skb; + unsigned long flags; + struct sk_buff_head *q = &userIf->packetQueue; + + while ((skb = skb_dequeue(q)) != NULL) { + dev_kfree_skb(skb); + } + + spin_lock_irqsave(&q->lock, flags); + if (userIf->pollPtr) { + if (skb_queue_empty(q)) { + /* + * Clear the pending bit as no packets are pending at this + * point. + */ + Atomic_And(userIf->pollPtr, ~userIf->pollMask); + } + } + spin_unlock_irqrestore(&q->lock, flags); + } + break; + case SIOCINJECTLINKSTATE: + { + uint8 linkUpFromUser; + if (copy_from_user(&linkUpFromUser, (void *)ioarg, + sizeof linkUpFromUser)) { + return -EFAULT; + } + + if (linkUpFromUser != 0 && linkUpFromUser != 1) { + return -EINVAL; + } + + return VNetUserIfSetUplinkState(port, linkUpFromUser); + } + break; + default: + return -ENOIOCTLCMD; + break; + } + + return 0; +} + + +/* + *---------------------------------------------------------------------- + * + * VNetUserIfPoll -- + * + * The virtual network's file poll operation. + * + * Results: + * Return POLLIN if success, else sleep and return 0. + * FIXME: Should not we always return POLLOUT? + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +static int +VNetUserIfPoll(VNetPort *port, // IN + struct file *filp, // IN + poll_table *wait) // IN +{ + VNetUserIF *userIf = (VNetUserIF*)port->jack.private; + + poll_wait(filp, &userIf->waitQueue, wait); + if (!skb_queue_empty(&userIf->packetQueue)) { + return POLLIN; + } + + return 0; +} + +/* + *---------------------------------------------------------------------- + * + * VNetUserIfSetUplinkState -- + * + * Sends link state change event. + * + * Results: + * 0 on success, errno on failure. 
+ * + * Side effects: + * Link state event is sent to all the event listeners + * + *---------------------------------------------------------------------- + */ + +int +VNetUserIfSetUplinkState(VNetPort *port, uint8 linkUp) +{ + VNetUserIF *userIf; + VNetJack *hubJack; + VNet_LinkStateEvent event; + int retval; + + userIf = (VNetUserIF *)port->jack.private; + hubJack = port->jack.peer; + + if (port->jack.state == FALSE || hubJack == NULL) { + return -EINVAL; + } + + if (userIf->eventSender == NULL) { + /* create event sender */ + retval = VNetHub_CreateSender(hubJack, &userIf->eventSender); + if (retval != 0) { + return retval; + } + } + + event.header.size = sizeof event; + retval = VNetEvent_GetSenderId(userIf->eventSender, &event.header.senderId); + if (retval != 0) { + LOG(1, (KERN_NOTICE "userif-%d: can't send link state event, " + "getSenderId failed (%d)\n", userIf->port.id, retval)); + return retval; + } + event.header.eventId = 0; + event.header.classSet = VNET_EVENT_CLASS_UPLINK; + event.header.type = VNET_EVENT_TYPE_LINK_STATE; + /* + * XXX kind of a hack, vmx will coalesce linkup/down if they come from the + * same adapter. + */ + event.adapter = linkUp; + event.up = linkUp; + retval = VNetEvent_Send(userIf->eventSender, &event.header); + if (retval != 0) { + LOG(1, (KERN_NOTICE "userif-%d: can't send link state event, send " + "failed (%d)\n", userIf->port.id, retval)); + } + + LOG(0, (KERN_NOTICE "userif-%d: sent link %s event.\n", + userIf->port.id, linkUp ? "up" : "down")); + + return retval; +} + +/* + *---------------------------------------------------------------------- + * + * VNetUserIf_Create -- + * + * Create a user level port to the wonderful world of virtual + * networking. + * + * Results: + * Errno. Also returns an allocated port to connect to, + * NULL on error. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +int +VNetUserIf_Create(VNetPort **ret) // OUT +{ + VNetUserIF *userIf; + static unsigned id = 0; + int retval; + + userIf = kmalloc(sizeof *userIf, GFP_USER); + if (!userIf) { + return -ENOMEM; + } + + /* + * Initialize fields. + */ + + userIf->port.id = id++; + + userIf->port.jack.peer = NULL; + userIf->port.jack.numPorts = 1; + VNetSnprintf(userIf->port.jack.name, sizeof userIf->port.jack.name, + "userif%u", userIf->port.id); + userIf->port.jack.private = userIf; + userIf->port.jack.index = 0; + userIf->port.jack.procEntry = NULL; + userIf->port.jack.free = VNetUserIfFree; + userIf->port.jack.rcv = VNetUserIfReceive; + userIf->port.jack.cycleDetect = NULL; + userIf->port.jack.portsChanged = NULL; + userIf->port.jack.isBridged = NULL; + userIf->pollPtr = NULL; + userIf->actionIntr = NULL; + userIf->recvClusterCount = NULL; + userIf->pollPage = NULL; + userIf->actPage = NULL; + userIf->recvClusterPage = NULL; + userIf->pollMask = 0; + userIf->actionID = -1; + userIf->port.exactFilterLen = 0; + userIf->eventSender = NULL; + + /* + * Make proc entry for this jack. + */ + + retval = VNetProc_MakeEntry(userIf->port.jack.name, S_IFREG, userIf, + VNetUserIfProcRead, + &userIf->port.jack.procEntry); + if (retval) { + if (retval == -ENXIO) { + userIf->port.jack.procEntry = NULL; + } else { + kfree(userIf); + return retval; + } + } + + /* + * Rest of fields. 
+ */ + + userIf->port.flags = IFF_RUNNING; + + memset(userIf->port.paddr, 0, sizeof userIf->port.paddr); + memset(userIf->port.ladrf, 0, sizeof userIf->port.ladrf); + memset(userIf->port.exactFilter, 0, sizeof userIf->port.exactFilter); + + VNet_MakeMACAddress(&userIf->port); + + userIf->port.fileOpRead = VNetUserIfRead; + userIf->port.fileOpWrite = VNetUserIfWrite; + userIf->port.fileOpIoctl = VNetUserIfIoctl; + userIf->port.fileOpPoll = VNetUserIfPoll; + + skb_queue_head_init(&(userIf->packetQueue)); + init_waitqueue_head(&userIf->waitQueue); + + memset(&userIf->stats, 0, sizeof userIf->stats); + + *ret = &userIf->port; + return 0; +} + diff --git a/vmnet-userif.c/userif.c.new b/vmnet-userif.c/userif.c.new new file mode 100644 index 0000000..77d1331 --- /dev/null +++ b/vmnet-userif.c/userif.c.new @@ -0,0 +1,1169 @@ +/********************************************************* + * Copyright (C) 1998-2013 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation version 2 and no later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + *********************************************************/ + +#include "driver-config.h" + +#define EXPORT_SYMTAB + +#define __KERNEL_SYSCALLS__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include "vnetInt.h" +#include "compat_skbuff.h" +#include "vmnetInt.h" +#include "vm_atomic.h" +#include "vm_assert.h" +#include "monitorAction_exported.h" + +typedef struct VNetUserIFStats { + unsigned read; + unsigned written; + unsigned queued; + unsigned droppedDown; + unsigned droppedMismatch; + unsigned droppedOverflow; + unsigned droppedLargePacket; +} VNetUserIFStats; + +typedef struct VNetUserIF { + VNetPort port; + struct sk_buff_head packetQueue; + Atomic_uint32 *pollPtr; + MonitorActionIntr *actionIntr; + uint32 pollMask; + MonitorIdemAction actionID; + uint32* recvClusterCount; + wait_queue_head_t waitQueue; + struct page* actPage; + struct page* pollPage; + struct page* recvClusterPage; + VNetUserIFStats stats; + VNetEvent_Sender *eventSender; +} VNetUserIF; + +static void VNetUserIfUnsetupNotify(VNetUserIF *userIf); +static int VNetUserIfSetupNotify(VNetUserIF *userIf, VNet_Notify *vn); +static int VNetUserIfSetUplinkState(VNetPort *port, uint8 linkUp); +extern unsigned int vnet_max_qlen; + +#if COMPAT_LINUX_VERSION_CHECK_LT(3, 2, 0) +# define compat_kmap(page) kmap(page) +# define compat_kunmap(page) kunmap(page) +#else +# define compat_kmap(page) kmap((page).p) +# define compat_kunmap(page) kunmap((page).p) +#endif + +/* + *----------------------------------------------------------------------------- + * + * UserifLockPage -- + * + * Lock in core the physical page associated to a valid virtual + * address. + * + * Results: + * The page structure on success + * NULL on failure: memory pressure. 
Retry later
+ *
+ * Side effects:
+ * Loads page into memory
+ *
+ *-----------------------------------------------------------------------------
+ */
+
+static INLINE struct page *
+UserifLockPage(VA addr) // IN
+{
+ struct page *page = NULL;
+ int retval;
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,9,0)
+ unsigned int flags = FOLL_WRITE; // Write only
+#endif
+
+ down_read(&current->mm->mmap_sem);
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,9,0)
+ retval = get_user_pages_remote(current, current->mm, addr,
+ 1, flags, &page, NULL);
+#else
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 6, 0)
+ retval = get_user_pages_remote(current, current->mm, addr,
+ 1, 1, 0, &page, NULL);
+#else
+ retval = get_user_pages(current, current->mm, addr,
+ 1, 1, 0, &page, NULL);
+#endif
+#endif
+ up_read(&current->mm->mmap_sem);
+
+ if (retval != 1) {
+ return NULL;
+ }
+
+ return page;
+}
+
+
+/*
+ *-----------------------------------------------------------------------------
+ *
+ * VNetUserIfMapUint32Ptr --
+ *
+ * Maps a portion of user-space memory into the kernel.
+ *
+ * Results:
+ * 0 on success
+ * < 0 on failure: the actual value determines the type of failure
+ *
+ * Side effects:
+ * Might sleep.
+ *
+ *-----------------------------------------------------------------------------
+ */
+
+static INLINE int
+VNetUserIfMapPtr(VA uAddr, // IN: pointer to user memory
+ size_t size, // IN: size of data
+ struct page **p, // OUT: locked page
+ void **ptr) // OUT: kernel mapped pointer
+{
+ if (!access_ok(VERIFY_WRITE, (void *)uAddr, size) ||
+ (((uAddr + size - 1) & ~(PAGE_SIZE - 1)) !=
+ (uAddr & ~(PAGE_SIZE - 1)))) {
+ return -EINVAL;
+ }
+
+ *p = UserifLockPage(uAddr);
+ if (*p == NULL) {
+ return -EAGAIN;
+ }
+
+ *ptr = (uint8 *)kmap(*p) + (uAddr & (PAGE_SIZE - 1));
+ return 0;
+}
+
+static INLINE int
+VNetUserIfMapUint32Ptr(VA uAddr, // IN: pointer to user memory
+ struct page **p, // OUT: locked page
+ uint32 **ptr) // OUT: kernel mapped pointer
+{
+ return VNetUserIfMapPtr(uAddr, sizeof **ptr, p, (void **)ptr);
+}
+
+/*
+ *-----------------------------------------------------------------------------
+ *
+ * VNetUserIfSetupNotify --
+ *
+ * Sets up notification by filling in pollPtr, actPtr, and recvClusterCount
+ * fields.
+ *
+ * Results:
+ * 0 on success
+ * < 0 on failure: the actual value determines the type of failure
+ *
+ * Side effects:
+ * Fields pollPtr, actPtr, recvClusterCount, pollPage, actPage, and
+ * recvClusterPage are filled in VNetUserIf structure.
+ * + *----------------------------------------------------------------------------- + */ + +static INLINE int +VNetUserIfSetupNotify(VNetUserIF *userIf, // IN + VNet_Notify *vn) // IN +{ + unsigned long flags; + struct sk_buff_head *q = &userIf->packetQueue; + uint32 *pollPtr; + MonitorActionIntr *actionIntr; + uint32 *recvClusterCount; + struct page *pollPage = NULL; + struct page *actPage = NULL; + struct page *recvClusterPage = NULL; + int retval; + + if (userIf->pollPtr || userIf->actionIntr || userIf->recvClusterCount) { + LOG(0, (KERN_DEBUG "vmnet: Notification mechanism already active\n")); + return -EBUSY; + } + + if ((retval = VNetUserIfMapUint32Ptr((VA)vn->pollPtr, &pollPage, + &pollPtr)) < 0) { + return retval; + } + + /* Atomic operations require proper alignment */ + if ((uintptr_t)pollPtr & (sizeof *pollPtr - 1)) { + LOG(0, (KERN_DEBUG "vmnet: Incorrect notify alignment\n")); + retval = -EFAULT; + goto error_free; + } + + if ((retval = VNetUserIfMapPtr((VA)vn->actPtr, sizeof *actionIntr, + &actPage, + (void **)&actionIntr)) < 0) { + goto error_free; + } + + if ((retval = VNetUserIfMapUint32Ptr((VA)vn->recvClusterPtr, + &recvClusterPage, + &recvClusterCount)) < 0) { + goto error_free; + } + + spin_lock_irqsave(&q->lock, flags); + if (userIf->pollPtr || userIf->actionIntr || userIf->recvClusterCount) { + spin_unlock_irqrestore(&q->lock, flags); + retval = -EBUSY; + LOG(0, (KERN_DEBUG "vmnet: Notification mechanism already active\n")); + goto error_free; + } + + userIf->pollPtr = (Atomic_uint32 *)pollPtr; + userIf->pollPage = pollPage; + userIf->actionIntr = actionIntr; + userIf->actPage = actPage; + userIf->recvClusterCount = recvClusterCount; + userIf->recvClusterPage = recvClusterPage; + userIf->pollMask = vn->pollMask; + userIf->actionID = vn->actionID; + spin_unlock_irqrestore(&q->lock, flags); + return 0; + + error_free: + if (pollPage) { + kunmap(pollPage); + put_page(pollPage); + } + if (actPage) { + kunmap(actPage); + put_page(actPage); + } + if (recvClusterPage) { + kunmap(recvClusterPage); + put_page(recvClusterPage); + } + return retval; +} + +/* + *---------------------------------------------------------------------- + * + * VNetUserIfUnsetupNotify -- + * + * Destroys permanent mapping for notify structure provided by user. + * + * Results: + * None. + * + * Side effects: + * Fields pollPtr, actPtr, recvClusterCount, etc. in VNetUserIf + * structure are cleared. + * + *---------------------------------------------------------------------- + */ + +static void +VNetUserIfUnsetupNotify(VNetUserIF *userIf) // IN +{ + unsigned long flags; + struct page *pollPage = userIf->pollPage; + struct page *actPage = userIf->actPage; + struct page *recvClusterPage = userIf->recvClusterPage; + + struct sk_buff_head *q = &userIf->packetQueue; + + spin_lock_irqsave(&q->lock, flags); + userIf->pollPtr = NULL; + userIf->pollPage = NULL; + userIf->actionIntr = NULL; + userIf->actPage = NULL; + userIf->recvClusterCount = NULL; + userIf->recvClusterPage = NULL; + userIf->pollMask = 0; + userIf->actionID = -1; + spin_unlock_irqrestore(&q->lock, flags); + + /* Release */ + if (pollPage) { + kunmap(pollPage); + put_page(pollPage); + } + if (actPage) { + kunmap(actPage); + put_page(actPage); + } + if (recvClusterPage) { + kunmap(recvClusterPage); + put_page(recvClusterPage); + } +} + + +/* + *---------------------------------------------------------------------- + * + * VNetUserIfFree -- + * + * Free the user interface port. + * + * Results: + * None. + * + * Side effects: + * None. 
+ * + *---------------------------------------------------------------------- + */ + +static void +VNetUserIfFree(VNetJack *this) // IN +{ + VNetUserIF *userIf = (VNetUserIF*)this; + struct sk_buff *skb; + + for (;;) { + skb = skb_dequeue(&userIf->packetQueue); + if (skb == NULL) { + break; + } + dev_kfree_skb(skb); + } + + if (userIf->pollPtr) { + VNetUserIfUnsetupNotify(userIf); + } + + if (userIf->eventSender) { + VNetEvent_DestroySender(userIf->eventSender); + } + + if (this->procEntry) { + VNetProc_RemoveEntry(this->procEntry); + } + + kfree(userIf); +} + + +/* + *---------------------------------------------------------------------- + * + * VNetUserIfReceive -- + * + * This jack is receiving a packet. Take appropriate action. + * + * Results: + * None. + * + * Side effects: + * Frees skb. + * + *---------------------------------------------------------------------- + */ + +static void +VNetUserIfReceive(VNetJack *this, // IN + struct sk_buff *skb) // IN +{ + VNetUserIF *userIf = (VNetUserIF*)this->private; + uint8 *dest = SKB_2_DESTMAC(skb); + unsigned long flags; + + if (!UP_AND_RUNNING(userIf->port.flags)) { + userIf->stats.droppedDown++; + goto drop_packet; + } + + if (!VNetPacketMatch(dest, + userIf->port.paddr, + (const uint8 *)userIf->port.exactFilter, + userIf->port.exactFilterLen, + userIf->port.ladrf, + userIf->port.flags)) { + userIf->stats.droppedMismatch++; + goto drop_packet; + } + + if (skb_queue_len(&userIf->packetQueue) >= vnet_max_qlen) { + userIf->stats.droppedOverflow++; + goto drop_packet; + } + + if (skb->len > ETHER_MAX_QUEUED_PACKET) { + userIf->stats.droppedLargePacket++; + goto drop_packet; + } + + userIf->stats.queued++; + + spin_lock_irqsave(&userIf->packetQueue.lock, flags); + /* + * __skb_dequeue_tail does not take any locks so must be used with + * appropriate locks held only. + */ + __skb_queue_tail(&userIf->packetQueue, skb); + if (userIf->pollPtr) { + Atomic_Or(userIf->pollPtr, userIf->pollMask); + if (skb_queue_len(&userIf->packetQueue) >= (*userIf->recvClusterCount)) { + MonitorAction_SetBits(userIf->actionIntr, userIf->actionID); + } + } + spin_unlock_irqrestore(&userIf->packetQueue.lock, flags); + + wake_up(&userIf->waitQueue); + return; + + drop_packet: + dev_kfree_skb(skb); +} + + +/* + *---------------------------------------------------------------------- + * + * VNetUserIfProcRead -- + * + * Callback for read operation on this userif entry in vnets proc fs. + * + * Results: + * Length of read operation. + * + * Side effects: + * None. 
+ * + *---------------------------------------------------------------------- + */ + +static int +VNetUserIfProcRead(char *page, // IN/OUT: buffer to write into + char **start, // OUT: 0 if file < 4k, else offset into + // page + off_t off, // IN: offset of read into the file + int count, // IN: maximum number of bytes to read + int *eof, // OUT: TRUE if there is nothing more to + // read + void *data) // IN: client data - not used +{ + VNetUserIF *userIf = (VNetUserIF*)data; + int len = 0; + + if (!userIf) { + return len; + } + + len += VNetPrintPort(&userIf->port, page+len); + + len += sprintf(page+len, "read %u written %u queued %u ", + userIf->stats.read, + userIf->stats.written, + userIf->stats.queued); + + len += sprintf(page+len, + "dropped.down %u dropped.mismatch %u " + "dropped.overflow %u dropped.largePacket %u", + userIf->stats.droppedDown, + userIf->stats.droppedMismatch, + userIf->stats.droppedOverflow, + userIf->stats.droppedLargePacket); + + len += sprintf(page+len, "\n"); + + *start = 0; + *eof = 1; + return len; +} + + +/* + *---------------------------------------------------------------------- + * + * VNetCopyDatagram -- + * + * Copy part of datagram to userspace. + * + * Results: + * zero on success, + * -EFAULT if buffer is an invalid area + * + * Side effects: + * Data copied to the buffer. + * + *---------------------------------------------------------------------- + */ + +static int +VNetCopyDatagram(const struct sk_buff *skb, // IN: skb to copy + char *buf, // OUT: where to copy data + int len) // IN: length +{ + struct iovec iov = { + .iov_base = buf, + .iov_len = len, + }; +#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 19, 0) + return skb_copy_datagram_iovec(skb, 0, &iov, len); +#else + struct iov_iter ioviter; + + iov_iter_init(&ioviter, READ, &iov, 1, len); + return skb_copy_datagram_iter(skb, 0, &ioviter, len); +#endif +} + + +/* + *---------------------------------------------------------------------- + * + * VNetCsumCopyDatagram -- + * + * Copy part of datagram to userspace doing checksum at same time. + * + * Do not mark this function INLINE, it is recursive! With all gcc's + * released up to now (<= gcc-3.3.1) inlining this function just + * consumes 120 more bytes of code and goes completely mad on + * register allocation, storing almost everything in the memory. + * + * Results: + * folded checksum (non-negative value) on success, + * -EINVAL if offset is too big, + * -EFAULT if buffer is an invalid area + * + * Side effects: + * Data copied to the buffer. + * + *---------------------------------------------------------------------- + */ + +static int +VNetCsumCopyDatagram(const struct sk_buff *skb, // IN: skb to copy + unsigned int offset, // IN: how many bytes skip + char *buf) // OUT: where to copy data +{ + unsigned int csum; + int err = 0; + int len = skb_headlen(skb) - offset; + char *curr = buf; + const skb_frag_t *frag; + + /* + * Something bad happened. We skip only up to skb->nh.raw, and skb->nh.raw + * must be in the header, otherwise we are in the big troubles. 
+ */ + if (len < 0) { + return -EINVAL; + } + + csum = csum_and_copy_to_user(skb->data + offset, curr, len, 0, &err); + if (err) { + return err; + } + curr += len; + + for (frag = skb_shinfo(skb)->frags; + frag != skb_shinfo(skb)->frags + skb_shinfo(skb)->nr_frags; + frag++) { + if (frag->size > 0) { + unsigned int tmpCsum; + const void *vaddr; + + vaddr = compat_kmap(frag->page); + tmpCsum = csum_and_copy_to_user(vaddr + frag->page_offset, + curr, frag->size, 0, &err); + compat_kunmap(frag->page); + + if (err) { + return err; + } + csum = csum_block_add(csum, tmpCsum, curr - buf); + curr += frag->size; + } + } + + for (skb = skb_shinfo(skb)->frag_list; skb != NULL; skb = skb->next) { + int tmpCsum; + + tmpCsum = VNetCsumCopyDatagram(skb, 0, curr); + if (tmpCsum < 0) { + return tmpCsum; + } + /* Folded checksum must be inverted before we can use it */ + csum = csum_block_add(csum, tmpCsum ^ 0xFFFF, curr - buf); + curr += skb->len; + } + return csum_fold(csum); +} + + +/* + *---------------------------------------------------------------------- + * + * VNetCopyDatagramToUser -- + * + * Copy complete datagram to the user space. Fill correct checksum + * into the copied datagram if nobody did it yet. + * + * Results: + * On success byte count, on failure -EFAULT. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +static INLINE_SINGLE_CALLER int +VNetCopyDatagramToUser(const struct sk_buff *skb, // IN + char *buf, // OUT + size_t count) // IN +{ + if (count > skb->len) { + count = skb->len; + } + /* + * If truncation occurs, we do not bother with checksumming - caller cannot + * verify checksum anyway in such case, and copy without checksum is + * faster. + */ + if (skb->pkt_type == PACKET_OUTGOING && /* Packet must be outgoing */ + skb->ip_summed == VM_TX_CHECKSUM_PARTIAL && /* Without checksum */ + compat_skb_network_header_len(skb) && /* We must know where header is */ + skb->len == count) { /* No truncation may occur */ + size_t skl; + int csum; + u_int16_t csum16; + + skl = compat_skb_csum_start(skb); + if (VNetCopyDatagram(skb, buf, skl)) { + return -EFAULT; + } + csum = VNetCsumCopyDatagram(skb, skl, buf + skl); + if (csum < 0) { + return csum; + } + csum16 = csum; + if (copy_to_user(buf + skl + compat_skb_csum_offset(skb), + &csum16, sizeof csum16)) { + return -EFAULT; + } + } else { + if (VNetCopyDatagram(skb, buf, count)) { + return -EFAULT; + } + } + return count; +} + + +/* + *---------------------------------------------------------------------- + * + * VNetUserIfRead -- + * + * The virtual network's read file operation. Reads the next pending + * packet for this network connection. + * + * Results: + * On success the len of the packet received, + * else if no packet waiting and nonblocking 0, + * else -errno. + * + * Side effects: + * None. 
+ * + *---------------------------------------------------------------------- + */ + +static int +VNetUserIfRead(VNetPort *port, // IN + struct file *filp, // IN + char *buf, // OUT + size_t count) // IN +{ + VNetUserIF *userIf = (VNetUserIF*)port->jack.private; + struct sk_buff *skb; + int ret; + unsigned long flags; + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue(&userIf->waitQueue, &wait); + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + skb = skb_peek(&userIf->packetQueue); + if (skb && (skb->len > count)) { + skb = NULL; + ret = -EMSGSIZE; + break; + } + ret = -EAGAIN; + + spin_lock_irqsave(&userIf->packetQueue.lock, flags); + /* + * __skb_dequeue does not take any locks so must be used with + * appropriate locks held only. + */ + skb = __skb_dequeue(&userIf->packetQueue); + if (userIf->pollPtr) { + if (!skb) { + /* List empty */ + Atomic_And(userIf->pollPtr, ~userIf->pollMask); + } + } + spin_unlock_irqrestore(&userIf->packetQueue.lock, flags); + + if (skb != NULL || filp->f_flags & O_NONBLOCK) { + break; + } + ret = -EINTR; + if (signal_pending(current)) { + break; + } + schedule(); + } + __set_current_state(TASK_RUNNING); + remove_wait_queue(&userIf->waitQueue, &wait); + if (! skb) { + return ret; + } + + userIf->stats.read++; + + count = VNetCopyDatagramToUser(skb, buf, count); + dev_kfree_skb(skb); + return count; +} + + +/* + *---------------------------------------------------------------------- + * + * VNetUserIfWrite -- + * + * The virtual network's write file operation. Send the raw packet + * to the network. + * + * Results: + * On success the count of bytes written else errno. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +static int +VNetUserIfWrite(VNetPort *port, // IN + struct file *filp, // IN + const char *buf, // IN + size_t count) // IN +{ + VNetUserIF *userIf = (VNetUserIF*)port->jack.private; + struct sk_buff *skb; + + /* + * Check size + */ + + if (count < sizeof (struct ethhdr) || + count > ETHER_MAX_QUEUED_PACKET) { + return -EINVAL; + } + + /* + * Required to enforce the downWhenAddrMismatch policy in the MAC + * layer. --hpreg + */ + if (!UP_AND_RUNNING(userIf->port.flags)) { + userIf->stats.droppedDown++; + return count; + } + + /* + * Allocate an sk_buff. + */ + + skb = dev_alloc_skb(count + 7); + if (skb == NULL) { + // XXX obey O_NONBLOCK? + return -ENOBUFS; + } + + skb_reserve(skb, 2); + + /* + * Copy the data and send it. + */ + + userIf->stats.written++; + if (copy_from_user(skb_put(skb, count), buf, count)) { + dev_kfree_skb(skb); + return -EFAULT; + } + + VNetSend(&userIf->port.jack, skb); + + return count; +} + + +/* + *----------------------------------------------------------------------------- + * + * VNetUserIfIoctl -- + * + * XXX + * + * Results: + * 0 on success + * -errno on failure + * + * Side effects: + * None + * + *----------------------------------------------------------------------------- + */ + +static int +VNetUserIfIoctl(VNetPort *port, // IN + struct file *filp, // IN + unsigned int iocmd, // IN + unsigned long ioarg) // IN or OUT depending on iocmd +{ + VNetUserIF *userIf = (VNetUserIF*)port->jack.private; + + switch (iocmd) { + case SIOCSETNOTIFY: + return -EINVAL; + case SIOCSETNOTIFY2: +#ifdef VMX86_SERVER + /* + * This ioctl always return failure on ESX since we cannot map pages into + * the console os that are from the VMKernel address space which was the + * only case we used this. 
+ */ + return -EINVAL; +#else // VMX86_SERVER + /* + * ORs pollMask into the integer pointed to by ptr if pending packet. Is + * cleared when all packets are drained. + */ + { + int retval; + VNet_Notify vn; + + if (copy_from_user(&vn, (void *)ioarg, sizeof vn)) { + return -EFAULT; + } + + ASSERT_ON_COMPILE(VNET_NOTIFY_VERSION == 5); + ASSERT_ON_COMPILE(ACTION_EXPORTED_VERSION == 2); + if (vn.version != VNET_NOTIFY_VERSION || + vn.actionVersion != ACTION_EXPORTED_VERSION || + vn.actionID / ACTION_WORD_SIZE >= ACTION_NUM_WORDS) { + return -ENOTTY; + } + + retval = VNetUserIfSetupNotify(userIf, &vn); + if (retval < 0) { + return retval; + } + + break; + } +#endif // VMX86_SERVER + case SIOCUNSETNOTIFY: + if (!userIf->pollPtr) { + /* This should always happen on ESX. */ + return -EINVAL; + } + VNetUserIfUnsetupNotify(userIf); + break; + + case SIOCSIFFLAGS: + /* + * Drain queue when interface is no longer active. We drain the queue to + * avoid having old packets delivered to the guest when reneabled. + */ + + if (!UP_AND_RUNNING(userIf->port.flags)) { + struct sk_buff *skb; + unsigned long flags; + struct sk_buff_head *q = &userIf->packetQueue; + + while ((skb = skb_dequeue(q)) != NULL) { + dev_kfree_skb(skb); + } + + spin_lock_irqsave(&q->lock, flags); + if (userIf->pollPtr) { + if (skb_queue_empty(q)) { + /* + * Clear the pending bit as no packets are pending at this + * point. + */ + Atomic_And(userIf->pollPtr, ~userIf->pollMask); + } + } + spin_unlock_irqrestore(&q->lock, flags); + } + break; + case SIOCINJECTLINKSTATE: + { + uint8 linkUpFromUser; + if (copy_from_user(&linkUpFromUser, (void *)ioarg, + sizeof linkUpFromUser)) { + return -EFAULT; + } + + if (linkUpFromUser != 0 && linkUpFromUser != 1) { + return -EINVAL; + } + + return VNetUserIfSetUplinkState(port, linkUpFromUser); + } + break; + default: + return -ENOIOCTLCMD; + break; + } + + return 0; +} + + +/* + *---------------------------------------------------------------------- + * + * VNetUserIfPoll -- + * + * The virtual network's file poll operation. + * + * Results: + * Return POLLIN if success, else sleep and return 0. + * FIXME: Should not we always return POLLOUT? + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +static int +VNetUserIfPoll(VNetPort *port, // IN + struct file *filp, // IN + poll_table *wait) // IN +{ + VNetUserIF *userIf = (VNetUserIF*)port->jack.private; + + poll_wait(filp, &userIf->waitQueue, wait); + if (!skb_queue_empty(&userIf->packetQueue)) { + return POLLIN; + } + + return 0; +} + +/* + *---------------------------------------------------------------------- + * + * VNetUserIfSetUplinkState -- + * + * Sends link state change event. + * + * Results: + * 0 on success, errno on failure. 
+ * + * Side effects: + * Link state event is sent to all the event listeners + * + *---------------------------------------------------------------------- + */ + +int +VNetUserIfSetUplinkState(VNetPort *port, uint8 linkUp) +{ + VNetUserIF *userIf; + VNetJack *hubJack; + VNet_LinkStateEvent event; + int retval; + + userIf = (VNetUserIF *)port->jack.private; + hubJack = port->jack.peer; + + if (port->jack.state == FALSE || hubJack == NULL) { + return -EINVAL; + } + + if (userIf->eventSender == NULL) { + /* create event sender */ + retval = VNetHub_CreateSender(hubJack, &userIf->eventSender); + if (retval != 0) { + return retval; + } + } + + event.header.size = sizeof event; + retval = VNetEvent_GetSenderId(userIf->eventSender, &event.header.senderId); + if (retval != 0) { + LOG(1, (KERN_NOTICE "userif-%d: can't send link state event, " + "getSenderId failed (%d)\n", userIf->port.id, retval)); + return retval; + } + event.header.eventId = 0; + event.header.classSet = VNET_EVENT_CLASS_UPLINK; + event.header.type = VNET_EVENT_TYPE_LINK_STATE; + /* + * XXX kind of a hack, vmx will coalesce linkup/down if they come from the + * same adapter. + */ + event.adapter = linkUp; + event.up = linkUp; + retval = VNetEvent_Send(userIf->eventSender, &event.header); + if (retval != 0) { + LOG(1, (KERN_NOTICE "userif-%d: can't send link state event, send " + "failed (%d)\n", userIf->port.id, retval)); + } + + LOG(0, (KERN_NOTICE "userif-%d: sent link %s event.\n", + userIf->port.id, linkUp ? "up" : "down")); + + return retval; +} + +/* + *---------------------------------------------------------------------- + * + * VNetUserIf_Create -- + * + * Create a user level port to the wonderful world of virtual + * networking. + * + * Results: + * Errno. Also returns an allocated port to connect to, + * NULL on error. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +int +VNetUserIf_Create(VNetPort **ret) // OUT +{ + VNetUserIF *userIf; + static unsigned id = 0; + int retval; + + userIf = kmalloc(sizeof *userIf, GFP_USER); + if (!userIf) { + return -ENOMEM; + } + + /* + * Initialize fields. + */ + + userIf->port.id = id++; + + userIf->port.jack.peer = NULL; + userIf->port.jack.numPorts = 1; + VNetSnprintf(userIf->port.jack.name, sizeof userIf->port.jack.name, + "userif%u", userIf->port.id); + userIf->port.jack.private = userIf; + userIf->port.jack.index = 0; + userIf->port.jack.procEntry = NULL; + userIf->port.jack.free = VNetUserIfFree; + userIf->port.jack.rcv = VNetUserIfReceive; + userIf->port.jack.cycleDetect = NULL; + userIf->port.jack.portsChanged = NULL; + userIf->port.jack.isBridged = NULL; + userIf->pollPtr = NULL; + userIf->actionIntr = NULL; + userIf->recvClusterCount = NULL; + userIf->pollPage = NULL; + userIf->actPage = NULL; + userIf->recvClusterPage = NULL; + userIf->pollMask = 0; + userIf->actionID = -1; + userIf->port.exactFilterLen = 0; + userIf->eventSender = NULL; + + /* + * Make proc entry for this jack. + */ + + retval = VNetProc_MakeEntry(userIf->port.jack.name, S_IFREG, userIf, + VNetUserIfProcRead, + &userIf->port.jack.procEntry); + if (retval) { + if (retval == -ENXIO) { + userIf->port.jack.procEntry = NULL; + } else { + kfree(userIf); + return retval; + } + } + + /* + * Rest of fields. 
+ */
+
+ userIf->port.flags = IFF_RUNNING;
+
+ memset(userIf->port.paddr, 0, sizeof userIf->port.paddr);
+ memset(userIf->port.ladrf, 0, sizeof userIf->port.ladrf);
+ memset(userIf->port.exactFilter, 0, sizeof userIf->port.exactFilter);
+
+ VNet_MakeMACAddress(&userIf->port);
+
+ userIf->port.fileOpRead = VNetUserIfRead;
+ userIf->port.fileOpWrite = VNetUserIfWrite;
+ userIf->port.fileOpIoctl = VNetUserIfIoctl;
+ userIf->port.fileOpPoll = VNetUserIfPoll;
+
+ skb_queue_head_init(&(userIf->packetQueue));
+ init_waitqueue_head(&userIf->waitQueue);
+
+ memset(&userIf->stats, 0, sizeof userIf->stats);
+
+ *ret = &userIf->port;
+ return 0;
+}
+
diff --git a/vmnet-userif.c/userif.patch b/vmnet-userif.c/userif.patch
new file mode 100644
index 0000000..00ac722
--- /dev/null
+++ b/vmnet-userif.c/userif.patch
@@ -0,0 +1,26 @@
+--- vmnet-only-bak/userif.c 2017-02-28 17:19:28.674984344 +0100
++++ vmnet-only/userif.c 2017-02-28 17:19:21.558424545 +0100
+@@ -112,9 +112,23 @@
+ struct page *page = NULL;
+ int retval;
+ 
++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,9,0)
++ unsigned int flags = FOLL_WRITE; // Write only
++#endif
++
+ down_read(&current->mm->mmap_sem);
++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,9,0)
++ retval = get_user_pages_remote(current, current->mm, addr,
++ 1, flags, &page, NULL);
++#else
++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 6, 0)
+ retval = get_user_pages_remote(current, current->mm, addr,
+ 1, 1, 0, &page, NULL);
++#else
++ retval = get_user_pages(current, current->mm, addr,
++ 1, 1, 0, &page, NULL);
++#endif
++#endif
+ up_read(&current->mm->mmap_sem);
+ 
+ if (retval != 1) {
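
Note on the API change that userif.patch works around (a sketch, not part of the
patch itself): kernel commit 9beae1ea (first released in 4.9) replaces the two
separate `write` / `force` int arguments of `get_user_pages_remote()` with a single
`gup_flags` bitmask. The helper below only illustrates how the old argument pair
maps onto the new flags; `gup_write_force_to_flags` is an illustrative name that
exists neither in the kernel nor in this patch.

    #include <linux/mm.h>   /* FOLL_WRITE, FOLL_FORCE */

    /*
     * Illustrative mapping of the pre-4.9 (write, force) arguments onto the
     * single gup_flags bitmask introduced by kernel commit 9beae1ea.
     * userif.patch hard-codes the vmnet case (write = 1, force = 0),
     * which is plain FOLL_WRITE.
     */
    static inline unsigned int
    gup_write_force_to_flags(int write, int force)
    {
       unsigned int gup_flags = 0;

       if (write) {
          gup_flags |= FOLL_WRITE;
       }
       if (force) {
          gup_flags |= FOLL_FORCE;
       }
       return gup_flags;
    }

With that mapping, the original call

    get_user_pages_remote(current, current->mm, addr, 1, 1, 0, &page, NULL);

becomes, on kernels 4.9 and later,

    get_user_pages_remote(current, current->mm, addr, 1, FOLL_WRITE, &page, NULL);

which is exactly the replacement applied by userif.patch above.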