vmmware-12.1.1-kernel-4.9/vmmon-hostif.c/hostif.c

/*********************************************************
 * Copyright (C) 1998-2014 VMware, Inc. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the
 * Free Software Foundation version 2 and no later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
 *
 *********************************************************/

/*
 * hostif.c --
 *
 *    This file implements the platform-specific (here Linux) interface that
 *    the cross-platform code uses --hpreg
 *
 */


/* Must come before any kernel header file --hpreg */
#include "driver-config.h"

/* Must come before vmware.h --hpreg */
#include "compat_page.h"
#include <linux/binfmts.h>
#include <linux/delay.h>
#include <linux/file.h>
#include <linux/kernel.h>

#include <linux/vmalloc.h>
#include <linux/slab.h>

#include <linux/preempt.h>
#include <linux/poll.h>
#include <linux/mman.h>

#include <linux/smp.h>

/*
 * From 2.6.25 on, pull in <asm/asm.h> so that the kernel's own _ASM_EXTABLE
 * (when it provides one) is visible to the test below.
 */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 25)
#   include <asm/asm.h>
#endif
/*
 * VMW_ASM_EXTABLE(from, to) emits an __ex_table entry that pairs the label
 * 'from' with the label 'to'.  Prefer the kernel's _ASM_EXTABLE; otherwise
 * build the entry by hand, choosing pointer size and alignment by word size.
 */
#if defined(_ASM_EXTABLE)
#   define VMW_ASM_EXTABLE(from, to) _ASM_EXTABLE(from, to)
#else
    /* Compat version copied from asm.h of 2.6.25 kernel */
#   define VMW_ASM_FORM(x)  " " #x " "
#   define VMW_ASM_EX_SEC   " .section __ex_table,\"a\"\n"
    /* VMW_ASM_SEL(a, b) picks 'a' on 32-bit x86 builds and 'b' otherwise. */
#   ifdef CONFIG_X86_32
#      define VMW_ASM_SEL(a,b) VMW_ASM_FORM(a)
#   else
#      define VMW_ASM_SEL(a,b) VMW_ASM_FORM(b)
#   endif
#   define VMW_ASM_PTR        VMW_ASM_SEL(.long, .quad)
#   define VMW_ASM_ALIGN      VMW_ASM_SEL(.balign 4, .balign 8)
#   define VMW_ASM_EXTABLE(from,to) \
           VMW_ASM_EX_SEC    \
           VMW_ASM_ALIGN "\n" \
           VMW_ASM_PTR #from "," #to "\n" \
           " .previous\n"
#endif

#include <asm/io.h>
#include <asm/uaccess.h>
#include <linux/mc146818rtc.h>
#include <linux/capability.h>
#include <linux/kthread.h>
#include <linux/wait.h>


#include "vmware.h"
#include "x86apic.h"
#include "vm_asm.h"
#include "modulecall.h"
#include "memtrack.h"
#include "phystrack.h"
#include "cpuid.h"
#include "cpuid_info.h"
#include "hostif.h"
#include "hostif_priv.h"
#include "driver.h"
#include "vmhost.h"
#include "x86msr.h"
#include "apic.h"
#include "memDefaults.h"
#include "vcpuid.h"

#include "pgtbl.h"
#include "vmmonInt.h"
#include "versioned_atomic.h"

/*
 * Determine if we can use high resolution timers.
 */

#ifdef CONFIG_HIGH_RES_TIMERS
#   include <linux/hrtimer.h>
#   define VMMON_USE_HIGH_RES_TIMERS
    /*
     * 2.6.28 and later use schedule_hrtimeout() directly; older kernels get
     * a compat path that needs a tasklet to wake the clock thread.
     */
#   if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 28)
#      define VMMON_USE_SCHEDULE_HRTIMEOUT
#   else
#      define VMMON_USE_COMPAT_SCHEDULE_HRTIMEOUT
static void HostIFWakeupClockThread(unsigned long data);
static DECLARE_TASKLET(timerTasklet, HostIFWakeupClockThread, 0);
#   endif
    /* With high resolution timers close_rtc() is a no-op. */
#   define close_rtc(filp, files) do {} while(0)
#else
    /* Without high resolution timers close_rtc() really closes the file. */
#   define close_rtc(filp, files) filp_close(filp, files)
#endif

/* NOTE(review): presumably uptime ticks per second (1 MHz) -- confirm at the users. */
#define UPTIME_FREQ CONST64(1000000)

/*
 * When CONFIG_NO_HZ_FULL is set processors can run tickless
 * if there is only one runnable process.  When set, the rate
 * checks in HostIF_SetFastClockRate and HostIFFastClockThread
 * need to be relaxed to allow any non-zero rate to run.
 *
 * This code can potentially be removed if/when we stop using
 * HostIFFastClockThread to drive MonTimer.  See PR1088247.
 */
#ifdef CONFIG_NO_HZ_FULL
/* Tickless-capable kernel: accept any non-zero rate. */
#define MIN_RATE (0)
#else
/* HZ plus one sixteenth of HZ. */
#define MIN_RATE ((HZ) + (HZ) / 16)
#endif

/*
 * Linux seems to like keeping free memory around 30MB
 * even under severe memory pressure.  Let's give it a little
 * more leeway than that for safety.
 */
/* NOTE(review): unit is presumably pages -- confirm at the users. */
#define LOCKED_PAGE_SLACK 10000

/*
 * State of the driver's uptime clock.
 *
 * NOTE(review): the code that reads and writes these fields lies outside
 * this part of the file; the per-field remarks are inferred from names and
 * types only and should be confirmed there.
 */
static struct {
   Atomic_uint64     uptimeBase;   /* presumably the current uptime base value */
   VersionedAtomic   version;      /* presumably guards consistent reads of the bases */
   uint64            monotimeBase; /* presumably a monotonic-time reference point */
   unsigned long     jiffiesBase;  /* presumably the jiffies value at that point */
   struct timer_list timer;        /* kernel timer associated with this state */
} uptimeState;

/*
 * First Page Locking strategy
 * ---------------------------
 *
 * An early implementation hacked the lock bit for the purpose of locking
 * memory. This had a couple of advantages:
 *   - the vmscan algorithm would never eliminate mappings from the process
 *     address space
 *   - easy to assert that things are ok
 *   - it worked with anonymous memory. Basically, vmscan jumps over these
 *     pages, their use count stays high, ....
 *
 * This approach however had a couple of problems:
 *
 *   - it relies on an undocumented interface. (in other words, a total hack)
 *   - it creates deadlock situations if the application gets a kill -9 or
 *     otherwise dies ungracefully. linux first tears down the address space,
 *     then closes file descriptors (including our own device). Unfortunately,
 *     this leads to a deadlock of the process on pages with the lock bit set.
 *
 *     There is a workaround for that, namely to detect that condition using
 *     a linux timer. (ugly)
 *
 * Current Page Locking strategy
 * -----------------------------
 *
 * The current scheme does not use the lock bit, rather it increments the use
 * count on the pages that need to be locked down in memory.
 *
 * The problem is that experiments on certain linux systems (e.g. 2.2.0-pre9)
 * showed that linux somehow swaps out anonymous pages, even with the
 * increased ref counter.
 * Swapping them out to disk is not that big of a deal, but bringing them back
 * to a different location is.  In any case, anonymous pages in linux are not
 * intended to be write-shared (e.g. try to MAP_SHARED /dev/zero).
 *
 * As a result, the current locking strategy requires that all locked pages are
 * backed by the filesystem, not by swap. For now, we use both mapped files and
 * sys V shared memory. The user application is responsible to cover these
 * cases.
 *
 */


/*
 * HOST_UNLOCK_PFN --
 *
 *    Drop one page reference on the page frame _pfn.  _vm is only assigned
 *    to itself (so the argument is referenced) and is otherwise unused; it
 *    must therefore be an lvalue.
 */
#define HOST_UNLOCK_PFN(_vm, _pfn) do {                  \
   _vm = _vm;                                            \
   put_page(pfn_to_page(_pfn));                          \
} while (0)

/*
 * HOST_UNLOCK_PFN_BYMPN --
 *
 *    Remove _pfn from the VM's lockedPages PhysTracker, then drop one page
 *    reference on it.  _pfn is evaluated twice.
 */
#define HOST_UNLOCK_PFN_BYMPN(_vm, _pfn) do {            \
   PhysTrack_Remove((_vm)->vmhost->lockedPages, (_pfn)); \
   put_page(pfn_to_page(_pfn));                          \
} while (0)

/*
 * Driver-wide IPI vector numbers.
 * NOTE(review): they are assigned and consumed outside this part of the
 * file; presumably one vector for the monitor and one for HV -- confirm.
 */
uint8 monitorIPIVector;
uint8 hvIPIVector;

/*
 *-----------------------------------------------------------------------------
 *
 * MutexInit --
 *
 *      Initialize a Mutex. --hpreg
 *
 * Results:
 *      None
 *
 * Side effects:
 *      None
 *
 *-----------------------------------------------------------------------------
 */

#ifdef VMX86_DEBUG
/*
 * Debug flavour: record the name, mark the mutex as owned by nobody
 * (pid -1) and set the semaphore count to 1.
 */
static INLINE void
MutexInit(Mutex *mutex,     // IN
          char const *name) // IN
{
   ASSERT(mutex);
   ASSERT(name);

   mutex->cur.pid = -1;
   mutex->name = name;
   sema_init(&mutex->sem, 1);
}
#else
/* Release flavour: only the semaphore is set up; _name is ignored. */
#   define MutexInit(_mutex, _name) sema_init(&(_mutex)->sem, 1)
#endif


#ifdef VMX86_DEBUG
/*
 *-----------------------------------------------------------------------------
 *
 * MutexIsLocked --
 *
 *      Tell whether the calling thread is the recorded owner of a Mutex.
 *      Only available in debug builds, where the owner pid is tracked.
 *
 * Results:
 *      TRUE if the recorded owner pid equals the current task's pid
 *      FALSE otherwise
 *
 * Side effects:
 *      None
 *
 *-----------------------------------------------------------------------------
 */

static INLINE Bool
MutexIsLocked(Mutex *mutex) // IN
{
   Bool ownedByCaller;

   ASSERT(mutex);

   ownedByCaller = (mutex->cur.pid == current->pid);

   return ownedByCaller;
}
#endif


/*
 *-----------------------------------------------------------------------------
 *
 * MutexLock --
 *
 *      Acquire a Mutex. --hpreg
 *
 * Results:
 *      None
 *
 * Side effects:
 *      None
 *
 *-----------------------------------------------------------------------------
 */

#ifdef VMX86_DEBUG
/*
 * Debug flavour: refuse recursive acquisition, take the semaphore, then
 * record who holds the mutex and from which call site.
 */
static INLINE void
MutexLock(Mutex *mutex, // IN
          int callerID) // IN
{
   ASSERT(mutex);
   ASSERT(!MutexIsLocked(mutex));

   down(&mutex->sem);

   /* Ownership bookkeeping is written only while the semaphore is held. */
   mutex->cur.callerID = callerID;
   mutex->cur.pid = current->pid;
}
#else
/* Release flavour: plain semaphore down(); _callerID is ignored. */
#   define MutexLock(_mutex, _callerID) down(&(_mutex)->sem)
#endif


/*
 *-----------------------------------------------------------------------------
 *
 * MutexUnlock --
 *
 *      Release a Mutex. --hpreg
 *
 * Results:
 *      None
 *
 * Side effects:
 *      None
 *
 *-----------------------------------------------------------------------------
 */

#ifdef VMX86_DEBUG
/*
 * Debug flavour: check that the caller owns the mutex and releases it with
 * the same callerID it was taken with, remember the last owner in 'prev',
 * clear the owner and release the semaphore.
 */
static INLINE void
MutexUnlock(Mutex *mutex, // IN
            int callerID) // IN
{
   ASSERT(mutex);
   ASSERT(MutexIsLocked(mutex));
   ASSERT(mutex->cur.callerID == callerID);

   mutex->prev = mutex->cur;
   mutex->cur.pid = -1;

   up(&mutex->sem);
}
#else
/* Release flavour: plain semaphore up(); _callerID is ignored. */
#   define MutexUnlock(_mutex, _callerID) up(&(_mutex)->sem)
#endif


/*
 * This mutex protects the driver-wide state. --hpreg
 * Initialized by HostIF_InitGlobalLock().
 */
static Mutex globalMutex;

/*
 * This mutex protects the fast clock rate and is held while
 * creating/destroying the fastClockThread.  It ranks below
 * globalMutex.  We can't use globalMutex for this purpose because the
 * fastClockThread itself acquires the globalMutex, so trying to hold
 * the mutex while destroying the thread can cause a deadlock.
 * Initialized by HostIF_InitGlobalLock().
 */
static Mutex fastClockMutex;

/*
 * This mutex protects linuxState.pollList.
 * Initialized by HostIF_InitGlobalLock().
 */
static Mutex pollListMutex;


/*
 *----------------------------------------------------------------------
 *
 * HostIF_PrepareWaitForThreads --
 *
 *      Prepare to wait for another vCPU thread.
 *
 * Results:
 *      FALSE: no way on Linux to determine we've already been signalled.
 *
 * Side effects:
 *      Current task is interruptible.
 *
 *----------------------------------------------------------------------
 */

Bool
HostIF_PrepareWaitForThreads(VMDriver *vm,     // IN:
                             Vcpuid currVcpu)  // IN:
{
   /*
    * Become interruptible first, then publish this task in the per-vCPU
    * slot: HostIF_WakeUpYielders() only wakes tasks it finds there whose
    * state has TASK_INTERRUPTIBLE set.
    */
   set_current_state(TASK_INTERRUPTIBLE);
   vm->vmhost->vcpuSemaTask[currVcpu] = current;
   /* Always FALSE: we cannot tell here whether we were already signalled. */
   return FALSE;
}


/*
 *----------------------------------------------------------------------
 *
 * HostIF_WaitForThreads --
 *
 *      Wait for another vCPU thread.
 *
 * Results:
 *      None.
 *
 * Side effects:
 *      Current task may block.
 *
 *----------------------------------------------------------------------
 */

void
HostIF_WaitForThreads(VMDriver *vm,     // UNUSED:
                      Vcpuid currVcpu)  // UNUSED:

{
#ifdef VMMON_USE_SCHEDULE_HRTIMEOUT
   /* Relative sleep of CROSSCALL_SLEEP_US microseconds, given in ns. */
   ktime_t sleepTime = ktime_set(0, CROSSCALL_SLEEP_US * 1000);

   schedule_hrtimeout(&sleepTime, HRTIMER_MODE_REL);
#else
   /* Fallback to ms timer resolution is fine for older kernels. */
   unsigned long ticks = msecs_to_jiffies(CROSSCALL_SLEEP_US / 1000) + 1;

   schedule_timeout(ticks);
#endif
}


/*
 *----------------------------------------------------------------------
 *
 * HostIF_CancelWaitForThreads --
 *
 *      Cancel waiting for another vCPU thread.
 *
 * Results:
 *      None.
 *
 * Side effects:
 *      Current task is running and no longer interruptible.
 *
 *----------------------------------------------------------------------
 */

void
HostIF_CancelWaitForThreads(VMDriver *vm,     // IN:
                            Vcpuid currVcpu)  // IN:
{
   /*
    * Undo HostIF_PrepareWaitForThreads(): clear the per-vCPU task slot so
    * HostIF_WakeUpYielders() no longer finds this task, then go back to
    * TASK_RUNNING.
    */
   vm->vmhost->vcpuSemaTask[currVcpu] = NULL;
   set_current_state(TASK_RUNNING);
}


/*
 *----------------------------------------------------------------------
 *
 * HostIF_WakeUpYielders --
 *
 *      Wakeup vCPUs that are waiting for the current vCPU.
 *      
 * Results:
 *      The requested vCPUs are nudged if they are sleeping due to
 *      Vmx86_YieldToSet.
 *
 * Side effects:
 *      None.
 *
 *----------------------------------------------------------------------
 */

void
HostIF_WakeUpYielders(VMDriver *vm,     // IN:
                      Vcpuid currVcpu)  // IN:
{
   VCPUSet req;     /* local snapshot of the vCPUs to wake */
   Vcpuid vcpuid;
   uint64 subset;

   /*
    * PR 1142958: if the VCPUs woken in the crosscallWaitSet re-add themselves
    * to this set faster than it can be fully drained, this function never
    * exits.  Instead, we copy and remove a snapshot of the crosscallWaitSet
    * and locally wake up just that snapshot.  It is ok that we don't get a
    * fully coherent snapshot, as long as the subset copy-and-remove is atomic
    * so no VCPU added is lost entirely.
    */

   VCPUSet_Empty(&req);
   /* Drain the shared wait set one subset at a time into 'req'. */
   FOR_EACH_SUBSET_IN_SET(subIdx) {
      subset = VCPUSet_AtomicReadWriteSubset(&vm->crosscallWaitSet[currVcpu],
                                             0, subIdx);
      VCPUSet_UnionSubset(&req, subset, subIdx);
   } ROF_EACH_SUBSET_IN_SET();

   /*
    * Wake every vCPU in the snapshot whose task is published in
    * vcpuSemaTask and is sleeping interruptibly.
    * NOTE(review): the reason for disabling preemption around the loop is
    * not stated here -- confirm before changing.
    */
   preempt_disable();
   while ((vcpuid = VCPUSet_FindFirst(&req)) != VCPUID_INVALID) {
      struct task_struct *t = vm->vmhost->vcpuSemaTask[vcpuid];
      VCPUSet_Remove(&req, vcpuid);
      if (t && (t->state & TASK_INTERRUPTIBLE)) {
         wake_up_process(t);
      }
   }
   preempt_enable();
}


/*
 *-----------------------------------------------------------------------------
 *
 * HostIF_InitGlobalLock --
 *
 *      Initialize the global (across all VMs and vmmon) locks.
 *
 * Results:
 *      None
 *
 * Side effects:
 *      None
 *
 *-----------------------------------------------------------------------------
 */

void
HostIF_InitGlobalLock(void)
{
   /* The three driver-wide mutexes are independent of one another. */
   MutexInit(&pollListMutex, "pollList");
   MutexInit(&fastClockMutex, "fastClock");
   MutexInit(&globalMutex, "global");
}


/*
 *-----------------------------------------------------------------------------
 *
 * HostIF_GlobalLock --
 *
 *      Grabs the global data structure lock.
 *
 * Results:
 *      None
 *
 * Side effects:
 *      Should be a very low contention lock. 
 *      The current thread is rescheduled if the lock is busy.
 *
 *-----------------------------------------------------------------------------
 */

void
HostIF_GlobalLock(int callerID) // IN
{
   MutexLock(&globalMutex, callerID);
}


/*
 *-----------------------------------------------------------------------------
 *
 * HostIF_GlobalUnlock --
 *
 *      Releases the global data structure lock.
 *
 * Results:
 *      None
 *
 * Side effects:
 *      None
 *
 *-----------------------------------------------------------------------------
 */

void
HostIF_GlobalUnlock(int callerID) // IN
{
   MutexUnlock(&globalMutex, callerID);
}


#ifdef VMX86_DEBUG
/*
 *-----------------------------------------------------------------------------
 *
 * HostIF_GlobalLockIsHeld --
 *
 *      Debug-only query: does the calling thread own the global lock?
 *
 * Results:
 *      TRUE if the current thread is the recorded owner of globalMutex
 *      FALSE otherwise
 *
 * Side effects:
 *      None
 *
 *-----------------------------------------------------------------------------
 */

Bool
HostIF_GlobalLockIsHeld(void)
{
   Bool held = MutexIsLocked(&globalMutex);

   return held;
}
#endif


/*
 *-----------------------------------------------------------------------------
 *
 * HostIF_FastClockLock --
 *
 *      Grabs the fast clock data structure lock.
 *
 * Results:
 *      None
 *
 * Side effects:
 *      Should be a very low contention lock. 
 *      The current thread is rescheduled if the lock is busy.
 *
 *-----------------------------------------------------------------------------
 */

void
HostIF_FastClockLock(int callerID) // IN
{
   MutexLock(&fastClockMutex, callerID);
}


/*
 *-----------------------------------------------------------------------------
 *
 * HostIF_FastClockUnlock --
 *
 *      Releases the fast clock data structure lock.
 *
 * Results:
 *      None
 *
 * Side effects:
 *      None
 *
 *-----------------------------------------------------------------------------
 */

void
HostIF_FastClockUnlock(int callerID) // IN
{
   MutexUnlock(&fastClockMutex, callerID);
}


/*
 *-----------------------------------------------------------------------------
 *
 * HostIF_PollListLock --
 *
 *      Grabs the linuxState.pollList lock.
 *
 * Results:
 *      None
 *
 * Side effects:
 *      The current thread is rescheduled if the lock is busy.
 *
 *-----------------------------------------------------------------------------
 */

void
HostIF_PollListLock(int callerID) // IN
{
   MutexLock(&pollListMutex, callerID);
}


/*
 *-----------------------------------------------------------------------------
 *
 * HostIF_PollListUnlock --
 *
 *      Releases the linuxState.pollList lock.
 *
 * Results:
 *      None
 *
 * Side effects:
 *      None
 *
 *-----------------------------------------------------------------------------
 */

void
HostIF_PollListUnlock(int callerID) // IN
{
   MutexUnlock(&pollListMutex, callerID);
}


/*
 *----------------------------------------------------------------------
 *
 * MapCrossPage & UnmapCrossPage
 *
 *    Both x86-64 and ia32 need to map crosspage to an executable
 *    virtual address. We use the vmap interface instead of kmap
 *    due to bug 43907.
 *
 * Side effects:
 *
 *    UnmapCrossPage assumes that the page has been refcounted up
 *    so it takes care of the put_page.
 *
 *----------------------------------------------------------------------
 */
static void *
MapCrossPage(struct page *p)  // IN:
{
   /* vmap() takes an array of pages; ours has exactly one element. */
   struct page *pages[1] = { p };
   void *va = vmap(pages, 1, VM_MAP, VM_PAGE_KERNEL_EXEC);

   /* Whatever vmap() returned is passed straight back to the caller. */
   return va;
}


static void
UnmapCrossPage(struct page *p,  // IN:
               void *va)        // IN:
{
   /* Tear down the kernel mapping first, then drop the page reference. */
   vunmap(va);
   put_page(p);
}


/*
 *----------------------------------------------------------------------
 *
 * HostIFHostMemInit --
 *
 *      Initialize per-VM pages lists.
 *
 * Results:
 *      0 on success,
 *      non-zero on failure.
 *
 * Side effects:
 *      None.
 *
 *----------------------------------------------------------------------
 */

static int
HostIFHostMemInit(VMDriver *vm)  // IN:
{
   VMHost *host = vm->vmhost;

   /* Tracker for pages locked by MPN. */
   host->lockedPages = PhysTrack_Alloc(vm);
   if (host->lockedPages == NULL) {
      return -1;
   }

   /*
    * Tracker for pages handed out by HostIF_AllocLockedPages().  On failure
    * lockedPages stays allocated; HostIFHostMemCleanup() releases it.
    */
   host->AWEPages = PhysTrack_Alloc(vm);

   return host->AWEPages != NULL ? 0 : -1;
}


/*
 *----------------------------------------------------------------------
 *
 * HostIFHostMemCleanup --
 *
 *      Release per-VM pages lists.
 *
 * Results:
 *      None.
 *
 * Side effects:
 *      Locked and AWE pages are released.
 *
 *----------------------------------------------------------------------
 */

static void
HostIFHostMemCleanup(VMDriver *vm)  // IN:
{
   VMHost *host = vm->vmhost;
   MPN mpn;

   /* Nothing to do if the host part was never allocated. */
   if (host == NULL) {
      return;
   }

   HostIF_VMLock(vm, 32); // Debug version of PhysTrack wants VM's lock.

   if (host->lockedPages != NULL) {
      /* Walk the tracker from MPN 0, unlocking every page it returns. */
      mpn = 0;
      while ((mpn = PhysTrack_GetNext(host->lockedPages, mpn)) != INVALID_MPN) {
         HOST_UNLOCK_PFN_BYMPN(vm, mpn);
      }
      PhysTrack_Free(host->lockedPages);
      host->lockedPages = NULL;
   }

   if (host->AWEPages != NULL) {
      /* Same walk for the allocated pages: untrack, then drop the page. */
      mpn = 0;
      while ((mpn = PhysTrack_GetNext(host->AWEPages, mpn)) != INVALID_MPN) {
         PhysTrack_Remove(host->AWEPages, mpn);
         put_page(pfn_to_page(mpn));
      }
      PhysTrack_Free(host->AWEPages);
      host->AWEPages = NULL;
   }

   HostIF_VMUnlock(vm, 32);
}


/*
 *----------------------------------------------------------------------
 *
 * HostIF_AllocMachinePage --
 *
 *      Alloc non-swappable memory page. The page is not billed to
 *      a particular VM. Preferably the page should not be mapped into
 *      the kernel address space.
 *
 * Results:
 *      INVALID_MPN or a valid host mpn.
 *
 * Side effects:
 *      None.
 *
 *----------------------------------------------------------------------
 */

MPN
HostIF_AllocMachinePage(void)
{
  struct page *pg = alloc_page(GFP_HIGHUSER);

  return (pg) ? ((MPN)page_to_pfn(pg)) : INVALID_MPN;
}


/*
 *----------------------------------------------------------------------
 *
 * HostIF_FreeMachinePage --
 *
 *      Free an anonymous machine page allocated by 
 *      HostIF_AllocMachinePage().  This page is not tracked in any 
 *      phystracker.
 *
 * Results:
 *      Host page is unlocked.
 *
 * Side effects:
 *      None.
 *
 *----------------------------------------------------------------------
 */

void
HostIF_FreeMachinePage(MPN mpn)  // IN:
{
  /* Translate the MPN back to its struct page and release it. */
  __free_page(pfn_to_page(mpn));
}


/*
 *----------------------------------------------------------------------
 *
 * HostIF_AllocLockedPages --
 *
 *      Alloc non-swappable memory.
 *
 * Results:
 *      negative value on complete failure
 *      non-negative value on partial/full completion, number of MPNs
 *          allocated & filled in pmpn returned.
 *
 * Side effects:
 *      Pages allocated.
 *
 *----------------------------------------------------------------------
 */

int
HostIF_AllocLockedPages(VMDriver *vm,	     // IN: VM instance pointer
			VA64 addr,	     // OUT: pointer to user or kernel buffer for MPNs
			unsigned numPages,   // IN: number of pages to allocate
			Bool kernelMPNBuffer)// IN: is the MPN buffer in kernel or user address space?
{
   MPN *out = VA64ToPtr(addr);
   VMHost *host = vm->vmhost;
   unsigned int done = 0;
   int status = 0;

   if (host == NULL || host->AWEPages == NULL) {
      return -EINVAL;
   }

   while (done < numPages) {
      struct page *page = alloc_page(GFP_HIGHUSER);
      MPN mpn;

      if (page == NULL) {
         status = -ENOMEM;
         break;
      }

      /* Report the MPN to the caller before tracking the page. */
      mpn = (MPN)page_to_pfn(page);
      if (kernelMPNBuffer) {
         *out = mpn;
      } else if (HostIF_CopyToUser(out, &mpn, sizeof *out) != 0) {
         /* The caller never learned about this page: give it back. */
         __free_page(page);
         status = -EFAULT;
         break;
      }
      out++;

      if (PhysTrack_Test(host->AWEPages, mpn)) {
         Warning("%s: duplicate MPN %016" FMT64 "x\n", __func__, mpn);
      }
      PhysTrack_Add(host->AWEPages, mpn);

      done++;
   }

   /* A partial allocation reports the count; only total failure an error. */
   if (done != 0) {
      return done;
   }

   return status;
}


/*
 *----------------------------------------------------------------------
 *
 * HostIF_FreeLockedPages --
 *
 *      Free non-swappable memory.
 *
 * Results:
 *      On success: 0. All pages were unlocked.
 *      On failure: Non-zero system error code. No page was unlocked.
 *
 * Side effects:
 *      Pages freed.
 *
 *----------------------------------------------------------------------
 */

int
HostIF_FreeLockedPages(VMDriver *vm,	     // IN: VM instance pointer
		       VA64 addr,            // IN: user or kernel array of MPNs
		       unsigned numPages,    // IN: number of pages to free
		       Bool kernelMPNBuffer) // IN: is the MPN buffer in kernel or user address space?
{
   const int MPN_BATCH = 64;
   MPN const *pmpn = VA64ToPtr(addr);
   VMHost *vmh = vm->vmhost;
   MPN *mpns = NULL;     /* bounce buffer, only used for user-space input */
   unsigned int cnt;
   struct page *pg;
   int retval = 0;

   if (!vmh || !vmh->AWEPages) {
      return -EINVAL;
   }

   if (!kernelMPNBuffer) {
      /*
       * User-space input is copied into a fixed-size kernel buffer first,
       * so a single call can free at most MPN_BATCH pages.
       */
      if (numPages > (unsigned)MPN_BATCH) {
         return -EINVAL;
      }

      mpns = HostIF_AllocKernelMem(sizeof *mpns * MPN_BATCH, TRUE);
      if (mpns == NULL) {
         return -ENOMEM;
      }

      if (HostIF_CopyFromUser(mpns, pmpn, numPages * sizeof *pmpn)) {
         printk(KERN_DEBUG "Cannot read from process address space at %p\n",
                pmpn);
         retval = -EINVAL;
         goto out;
      }

      pmpn = mpns;
   }

   /*
    * Pass 1: validate everything before touching anything, so that a bad
    * MPN fails the whole call without freeing a single page.
    */
   for (cnt = 0; cnt < numPages; cnt++) {
      if (!PhysTrack_Test(vmh->AWEPages, pmpn[cnt])) {
         printk(KERN_DEBUG "Attempted to free unallocated MPN %016" FMT64 "X\n",
                pmpn[cnt]);
         retval = -EINVAL;
         goto out;
      }

      pg = pfn_to_page(pmpn[cnt]);
      if (page_count(pg) != 1) {
         // should this case be considered a failure?
         printk(KERN_DEBUG "Page %016" FMT64 "X is still used by someone "
                "(use count %u, VM %p)\n", pmpn[cnt],
                 page_count(pg), vm);
      }
   }

   /*
    * Pass 2: untrack and free.  The same MPN may appear more than once in
    * the array and still pass the validation above; re-check the tracker
    * so that such a page is removed and freed exactly once.
    */
   for (cnt = 0; cnt < numPages; cnt++) {
      if (!PhysTrack_Test(vmh->AWEPages, pmpn[cnt])) {
         printk(KERN_DEBUG "Ignoring duplicate MPN %016" FMT64 "X\n",
                pmpn[cnt]);
         continue;
      }
      pg = pfn_to_page(pmpn[cnt]);
      PhysTrack_Remove(vmh->AWEPages, pmpn[cnt]);
      __free_page(pg);
   }

out:
   if (mpns != NULL) {
      HostIF_FreeKernelMem(mpns);
   }
   return retval;
}


/*
 *----------------------------------------------------------------------
 *
 * HostIF_Init --
 *
 *      Initialize the host-dependent part of the driver.
 *
 * Results:
 *     zero on success, non-zero on error.
 *
 * Side effects:
 *     None
 *
 *----------------------------------------------------------------------
 */

int
HostIF_Init(VMDriver *vm)  // IN:
{
   vm->memtracker = MemTrack_Init();
   if (vm->memtracker == NULL) {
      return -1;
   }

   vm->vmhost = (VMHost *) HostIF_AllocKernelMem(sizeof *vm->vmhost, TRUE);
   if (vm->vmhost == NULL) {
      return -1;
   }
   memset(vm->vmhost, 0, sizeof *vm->vmhost);

   /*
    * Initialize the VM mutex as soon as vmhost exists.  HostIFHostMemCleanup()
    * takes the VM lock whenever vmhost is non-NULL, so the mutex must already
    * be valid if HostIFHostMemInit() fails below and the caller tears the VM
    * down.  (The VM lock is presumably this vmMutex; HostIF_VMLock() is not
    * visible in this part of the file.)
    */
   MutexInit(&vm->vmhost->vmMutex, "vm");

   /*
    * On failure the partially built state is left in place; it is released
    * by the regular teardown path rather than here.
    */
   if (HostIFHostMemInit(vm)) {
      return -1;
   }

   return 0;
}


/*
 *------------------------------------------------------------------------------
 *
 * HostIF_LookupUserMPN --
 *
 *      Lookup the MPN of a locked user page by user VA.
 *
 * Results:
 *      A status code and the MPN on success.
 *
 * Side effects:
 *     None
 *
 *------------------------------------------------------------------------------
 */

int
HostIF_LookupUserMPN(VMDriver *vm, // IN: VMDriver
                     VA64 uAddr,   // IN: user VA of the page
                     MPN *mpn)     // OUT
{
   void *uvAddr = VA64ToPtr(uAddr);
   /*
    * Failure codes below are added onto this initial value.
    * NOTE(review): this only yields the plain PAGE_LOOKUP_* codes if
    * PAGE_LOCK_SUCCESS is 0 -- confirm.
    */
   int retval = PAGE_LOCK_SUCCESS;

   /* Fast path: translate the VA through the page tables. */
   *mpn = PgtblVa2MPN((VA)uvAddr);

   /*
    * On failure, check whether the page is locked.
    *
    * While we don't require the page to be locked by HostIF_LockPage(),
    * it does provide extra information.
    *
    * -- edward
    */
   if (*mpn == INVALID_MPN) {
      if (vm == NULL) {
         retval += PAGE_LOOKUP_NO_VM;
      } else {
         MemTrackEntry *entryPtr =
            MemTrack_LookupVPN(vm->memtracker, PTR_2_VPN(uvAddr));
         if (entryPtr == NULL) {
            retval += PAGE_LOOKUP_NOT_TRACKED;
         } else if (entryPtr->mpn == 0) {
            retval += PAGE_LOOKUP_NO_MPN;
         } else {
            /*
             * Kernel can remove PTEs/PDEs from our pagetables even if pages
             * are locked...
             */
            volatile int c;

            /*
             * Read one byte from the user address to bring the mapping
             * back, then retry the translation.  The result of get_user()
             * is deliberately not checked; the retry below decides.
             */
            get_user(c, (char *)uvAddr);
            *mpn = PgtblVa2MPN((VA)uvAddr);
            if (*mpn == entryPtr->mpn) {
               /* The page came back at the tracked MPN: success. */
#ifdef VMX86_DEBUG
               printk(KERN_DEBUG "Page %p disappeared from %s(%u)... "
                      "now back at %016" FMT64 "x\n",
                      uvAddr, current->comm, current->pid, *mpn);
#endif
            } else if (*mpn != INVALID_MPN) {
               /*
                * The VA now maps a different MPN than the tracked one:
                * report INVALID_MPN.  retval is left unchanged.
                */
               printk(KERN_DEBUG "Page %p disappeared from %s(%u)... "
                      "now back at %016" FMT64"x (old=%016" FMT64 "x)\n",
                      uvAddr, current->comm, current->pid, *mpn,
                      entryPtr->mpn);
               *mpn = INVALID_MPN;
            } else {
               /*
                * Still no translation: hand back the MPN recorded in the
                * memtracker.  retval is left unchanged.
                */
               printk(KERN_DEBUG "Page %p disappeared from %s(%u)... "
                      "and is lost (old=%016" FMT64 "x)\n", uvAddr, current->comm,
                      current->pid, entryPtr->mpn);
               *mpn = entryPtr->mpn;
            }
         }
      }
   }

   return retval;
}


/*
 *----------------------------------------------------------------------
 *
 * HostIF_InitFP --
 *
 *      masks IRQ13 if not previously the case.
 *
 * Results:
 *      prevents INTR #0x2d (IRQ 13) from being generated --
 *      assume that Int16 works for interrupt reporting
 *      
 *
 * Side effects:
 *      PIC
 *
 *----------------------------------------------------------------------
 */

void
HostIF_InitFP(VMDriver *vm)  // IN:
{
   /*
    * Bit (13 - 8) of the byte at I/O port 0xA1.
    * NOTE(review): presumably the IRQ13 bit of the second PIC's interrupt
    * mask register -- confirm.
    */
   const int irq13Bit = 1 << (0xD - 0x8);
   uint8 current_mask = inb(0xA1);

   /* Already masked: leave the port untouched. */
   if ((current_mask & irq13Bit) != 0) {
      return;
   }

   outb(current_mask | irq13Bit, 0xA1);
}


/*
 *-----------------------------------------------------------------------------
 *
 * HostIFGetUserPages --
 *
 *      Lock the pages of a user-level address space in memory.
 *      If ppages is NULL, pages are only marked as dirty.
 *
 * Results:
 *      Zero on success, non-zero on failure. 
 *
 * Side effects:
 *      None
 *
 *-----------------------------------------------------------------------------
 */

static int
HostIFGetUserPages(void *uvAddr,          // IN
                   struct page **ppages,  // OUT
                   unsigned int numPages) // IN
{
   int retval;

   down_read(&current->mm->mmap_sem);
   retval = get_user_pages_remote(current, current->mm, (unsigned long)uvAddr,
                           numPages, 0, 0, ppages, NULL);
   up_read(&current->mm->mmap_sem);

   return retval != numPages;
}


/*
 *----------------------------------------------------------------------
 *
 * HostIF_IsLockedByMPN --
 *
 *      Checks if mpn was locked using allowMultipleMPNsPerVA.  
 *
 * Results:
 *      TRUE if mpn is present in the physTracker.
 *      
 *
 * Side effects:
 *     None.
 *
 *----------------------------------------------------------------------
 */

Bool
HostIF_IsLockedByMPN(VMDriver *vm,  // IN:
                     MPN mpn)       // IN:
{
  /* Membership in the VM's lockedPages tracker is the whole answer. */
  Bool tracked = PhysTrack_Test(vm->vmhost->lockedPages, mpn);

  return tracked;
}


/*
 *-----------------------------------------------------------------------------
 *
 * HostIF_LockPage --
 *
 *     Lock a page of a user-level address space and return its MPN
 *
 * Results:
 *     A PAGE_LOCK_* status code and the MPN on success.
 *
 * Side effects:
 *      Adds the page to the MemTracker, if allowMultipleMPNsPerVA then the page
 *      is added to the VM's PhysTracker.
 *
 *-----------------------------------------------------------------------------
 */

int
HostIF_LockPage(VMDriver *vm,                // IN: VMDriver
                VA64 uAddr,                  // IN: user VA of the page
                Bool allowMultipleMPNsPerVA, // IN: allow to lock many pages per VA
                MPN *mpn)                    // OUT: pinned page
{
   void *userVA = VA64ToPtr(uAddr);
   VPN vpn = PTR_2_VPN(userVA);
   MemTrackEntry *tracked = NULL;
   struct page *pinned;

   /*
    * In one-MPN-per-VA mode the memtracker is keyed by VPN: a VPN that
    * already carries an MPN is locked and must not be locked again.
    */
   if (!allowMultipleMPNsPerVA) {
      tracked = MemTrack_LookupVPN(vm->memtracker, vpn);
      if (tracked != NULL && tracked->mpn != 0) {
         return PAGE_LOCK_ALREADY_LOCKED;
      }
   }

   /* Pin the single page behind the VA; this takes a page reference. */
   if (HostIFGetUserPages(userVA, &pinned, 1)) {
      return PAGE_LOCK_FAILED;
   }
   *mpn = (MPN)page_to_pfn(pinned);

   if (allowMultipleMPNsPerVA) {
      /* Multi-MPN mode: locked pages are tracked by MPN in the PhysTracker. */
      struct PhysTracker* const lockedByMPN = vm->vmhost->lockedPages;

      if (PhysTrack_Test(lockedByMPN, *mpn)) {
         /* Already tracked: give back the reference just taken. */
         put_page(pinned);
         return PAGE_LOCK_ALREADY_LOCKED;
      }
      PhysTrack_Add(lockedByMPN, *mpn);
      return PAGE_LOCK_SUCCESS;
   }

   /* An entry without an MPN exists for this VPN: just fill it in. */
   if (tracked != NULL) {
      tracked->mpn = *mpn;
      return PAGE_LOCK_SUCCESS;
   }

   /* First time this VPN is seen: create the memtracker entry. */
   if (MemTrack_Add(vm->memtracker, vpn, *mpn) == NULL) {
      HOST_UNLOCK_PFN(vm, *mpn);
      return PAGE_LOCK_MEMTRACKER_ERROR;
   }

   return PAGE_LOCK_SUCCESS;
}


/*
 *----------------------------------------------------------------------
 *
 * HostIF_UnlockPage --
 *
 *      Unlock a pinned user-level page.
 *
 * Results:
 *      Status PAGE_UNLOCK_* code.
 *
 * Side effects:
 *     None
 *
 *----------------------------------------------------------------------
 */

int
HostIF_UnlockPage(VMDriver *vm,  // IN:
                  VA64 uAddr)    // IN:
{
   VPN vpn = VA_2_VPN((VA)VA64ToPtr(uAddr));
   MemTrackEntry *tracked = MemTrack_LookupVPN(vm->memtracker, vpn);

   /* The VPN was never handed to the memtracker. */
   if (tracked == NULL) {
      return PAGE_UNLOCK_NOT_TRACKED;
   }

   /* The entry exists but currently holds no page. */
   if (tracked->mpn == 0) {
      return PAGE_UNLOCK_NO_MPN;
   }

   /* Drop the page reference and mark the entry as holding nothing. */
   HOST_UNLOCK_PFN(vm, tracked->mpn);
   tracked->mpn = 0;

   return PAGE_UNLOCK_SUCCESS;
}


/*
 *----------------------------------------------------------------------
 *
 * HostIF_UnlockPageByMPN --
 *
 *      Unlock a locked user mode page. The page doesn't need to be mapped
 *      anywhere.
 *
 * Results:
 *      Status code. Returns a PAGE_LOOKUP_* error if the page can't be found or
 *      a PAGE_UNLOCK_* error if the page can't be unlocked.
 *
 * Side effects:
 *     Removes the MPN from the VM's PhysTracker.
 *
 *----------------------------------------------------------------------
 */

int
HostIF_UnlockPageByMPN(VMDriver *vm, // IN: VMDriver
                       MPN mpn,      // IN: the MPN to unlock
                       VA64 uAddr)   // IN: optional(debugging) VA for the MPN
{
   /* Only pages present in the lockedPages tracker can be unlocked here. */
   if (!PhysTrack_Test(vm->vmhost->lockedPages, mpn)) {
      return PAGE_UNLOCK_NO_MPN;
   }

#ifdef VMX86_DEBUG
   {
      void *dbgVa = VA64ToPtr(uAddr);
      MemTrackEntry *permanent;

      /*
       * Sanity check the optional VA against the MPN.  A failed page table
       * walk (INVALID_MPN) is tolerated because PgtblVa2MPN() can fail
       * under high memory pressure; only a definite mismatch is an error.
       */

      if (dbgVa != NULL) {
         MPN walkedMpn = PgtblVa2MPN((VA)dbgVa);

         if (walkedMpn != INVALID_MPN && mpn != walkedMpn) {
            Warning("Page lookup fail %#"FMT64"x %016" FMT64 "x %p\n",
                    mpn, walkedMpn, dbgVa);

            return PAGE_LOOKUP_INVALID_ADDR;
         }
      }

      /*
       * A page locked with HostIF_LockPage(allowMultipleMPNsPerVA = TRUE)
       * lives in the PhysTracker only; finding it in the MemTracker means
       * it was locked the other way.
       */

      permanent = MemTrack_LookupMPN(vm->memtracker, mpn);
      if (permanent) {
         Warning("%s(): mpn=%#"FMT64"x va=%p was permanently locked with "
                 "vpn=0x%"FMT64"x\n", __func__, mpn, dbgVa, permanent->vpn);

         return PAGE_UNLOCK_MISMATCHED_TYPE;
      }
   }
#endif

   /* Untrack the MPN and drop the page reference. */
   HOST_UNLOCK_PFN_BYMPN(vm, mpn);

   return PAGE_UNLOCK_SUCCESS;
}


/*
 *----------------------------------------------------------------------
 *
 * UnlockEntry --
 *
 *      Per-entry callback passed to MemTrack_Cleanup() by
 *      HostIF_FreeAllResources().  Unlocks the page recorded in the
 *      entry, if any.
 *
 * Results:
 *      None
 *
 * Side effects:
 *      The entry's mpn is reset to 0 once the page has been unlocked.
 *
 *----------------------------------------------------------------------
 */

static void
UnlockEntry(void *clientData,         // IN: the owning VMDriver
            MemTrackEntry *entryPtr)  // IN/OUT: entry to release
{
   VMDriver *vm = (VMDriver *)clientData;

   /* Entries with no MPN have nothing to unlock. */
   if (entryPtr->mpn == 0) {
      return;
   }

   HOST_UNLOCK_PFN(vm, entryPtr->mpn);
   entryPtr->mpn = 0;
}


/*
 *-----------------------------------------------------------------------------
 *
 * HostIF_FreeAllResources --
 *
 *      Free all host-specific VM resources.
 *
 * Results:
 *      None
 *
 * Side effects:
 *      None
 *
 *-----------------------------------------------------------------------------
 */

void
HostIF_FreeAllResources(VMDriver *vm) // IN
{
   unsigned int idx;

   HostIFHostMemCleanup(vm);

   /* Unlock every page still tracked for this VM, then drop the tracker. */
   if (vm->memtracker != NULL) {
      MemTrack_Cleanup(vm->memtracker, UnlockEntry, vm);
      vm->memtracker = NULL;
   }

   if (vm->vmhost == NULL) {
      return;
   }

   /* Undo the crosspage mappings, walking from the last one to the first. */
   idx = vm->vmhost->crosspagePagesCount;
   while (idx > 0) {
      struct page *crossPg;

      idx--;
      crossPg = vm->vmhost->crosspagePages[idx];
      UnmapCrossPage(crossPg, vm->crosspage[idx]);
   }
   vm->vmhost->crosspagePagesCount = 0;

   /* Only unmap the APIC if this driver created the mapping itself. */
   if (vm->vmhost->hostAPICIsMapped) {
      ASSERT(vm->hostAPIC.base != NULL);
      iounmap((void*)vm->hostAPIC.base);
      vm->hostAPIC.base = NULL;
      vm->vmhost->hostAPICIsMapped = FALSE;
   }

   HostIF_FreeKernelMem(vm->vmhost);
   vm->vmhost = NULL;
}


/*
 *----------------------------------------------------------------------
 *
 * HostIF_AllocKernelMem
 *
 *      Allocate some kernel memory for the driver. 
 *
 * Results:
 *      The address allocated or NULL on error. 
 *      
 *
 * Side effects:
 *      memory is malloced
 *----------------------------------------------------------------------
 */

void *
HostIF_AllocKernelMem(size_t size,  // IN: number of bytes requested
                      int wired)    // IN: not used by this implementation
{
   void *mem = kmalloc(size, GFP_KERNEL);

   if (mem != NULL) {
      return mem;
   }

   /* Allocation failed: log the requested size and report NULL. */
   Warning("%s failed (size=%p)\n", __func__, (void*)size);

   return NULL;
}


/*
 *-----------------------------------------------------------------------------
 *
 * HostIF_AllocPage --
 *
 *    Allocate a page (whose content is undetermined)
 *
 * Results:
 *    The kernel virtual address of the page
 *
 * Side effects:
 *    None
 *
 *-----------------------------------------------------------------------------
 */

void *
HostIF_AllocPage(void)
{
   VA kvAddr;
   
   kvAddr = __get_free_page(GFP_KERNEL);
   if (kvAddr == 0) {
      Warning("%s: __get_free_page() failed\n", __func__);
   }

   return (void *)kvAddr;
}


/*
 *----------------------------------------------------------------------
 *
 * HostIF_FreeKernelMem
 *
 *      Free kernel memory allocated for the driver. 
 *
 * Results:
 *      None.
 *
 * Side effects:
 *      memory is freed.
 *----------------------------------------------------------------------
 */

void
HostIF_FreeKernelMem(void *ptr)  // IN:
{
   /* Hands the pointer straight to kfree(); no bookkeeping is done here. */
   kfree(ptr);
}


/*
 *----------------------------------------------------------------------
 *
 * HostIF_FreePage --
 *
 *      Free a page given its kernel virtual address.  An address that
 *      is not page aligned is logged and left alone.
 *
 * Results:
 *      None
 *
 * Side effects:
 *      The page is freed when the address is page aligned.
 *
 *----------------------------------------------------------------------
 */

void
HostIF_FreePage(void *ptr)  // IN: page-aligned kernel VA
{
   VA pageVA = (VA)ptr;

   if ((pageVA & (PAGE_SIZE-1)) == 0) {
      free_page(pageVA);
      return;
   }

   /* Refuse to free anything that does not start on a page boundary. */
   Warning("%s %p misaligned\n", __func__, (void*)pageVA);
}


/*
 *----------------------------------------------------------------------
 *
 * HostIF_EstimateLockedPageLimit --
 *
 *      Estimates how many memory pages can be locked or allocated
 *      from the kernel without causing the host to die or to be really upset.
 *
 * Results:
 *	The maximum number of pages that can be locked. 
 *
 * Side effects:
 *      none
 *
 *----------------------------------------------------------------------
 */

unsigned int
HostIF_EstimateLockedPageLimit(const VMDriver* vm,                // IN
                               unsigned int currentlyLockedPages) // IN
{
   /*
    * This variable is available and exported to modules,
    * since at least 2.6.0.
    */

   extern unsigned long totalram_pages;

   unsigned int hostPages = totalram_pages;

#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 28)
   /* Older kernels: rely on the generic default derived from total RAM. */
   return MemDefaults_CalcMaxLockedPages(hostPages);
#else
   /*
    * Newer kernels export enough counters for a tighter estimate.  Counted
    * as effectively locked: page tables, unreclaimable slab, unevictable
    * pages (e.g. mlock()), huge pages (we don't use them) and the minimum
    * host reservation.  Anonymous pages that exceed the available swap
    * cannot be paged out either, so they are added as well.
    */

   unsigned int hugePages = 0;
   unsigned int reserved = MEMDEFAULTS_MIN_HOST_PAGES;
   unsigned int pinned;
   unsigned int anon;
   unsigned int swap;
   unsigned int forHost;

   if (vm != NULL) {
      hugePages = BYTES_2_PAGES(vm->memInfo.hugePageBytes);
   }

   pinned = global_page_state(NR_PAGETABLE) +
            global_page_state(NR_SLAB_UNRECLAIMABLE) +
            global_page_state(NR_UNEVICTABLE) +
            hugePages + reserved;

   anon = global_page_state(NR_ANON_MAPPED);
   swap = BYTES_2_PAGES(linuxState.swapSize);
   if (anon > swap) {
      pinned += anon - swap;
   }

   /* Never report more host usage than there is physical memory. */
   forHost = pinned + LOCKED_PAGE_SLACK;

   return (forHost > hostPages) ? 0 : hostPages - forHost;
#endif
}


/*
 *----------------------------------------------------------------------
 *
 * HostIF_Wait --
 *
 *      Waits for specified number of milliseconds.
 *
 *----------------------------------------------------------------------
 */

void
HostIF_Wait(unsigned int timeoutMs)  // IN: how long to sleep, in ms
{
   /* The value returned by msleep_interruptible() is deliberately dropped. */
   (void) msleep_interruptible(timeoutMs);
}


/*
 *----------------------------------------------------------------------
 *
 * HostIF_WaitForFreePages --
 *
 *      Waits for pages to be available for allocation or locking.
 *
 * Results:
 *	New pages are likely to be available for allocation or locking.
 *
 * Side effects:
 *      none
 *
 *----------------------------------------------------------------------
 */

void
HostIF_WaitForFreePages(unsigned int timeoutMs)  // IN: how long to sleep, in ms
{
   /*
    * Simply sleep and let the host make progress.  The function-local
    * static call counter that used to be bumped here was never read and
    * was incremented without synchronization, so it has been dropped.
    */
   msleep_interruptible(timeoutMs);
}


/*
 *----------------------------------------------------------------------
 *
 * HostIFReadUptimeWork --
 *
 *      Reads the current uptime.  The uptime is based on gettimeofday,
 *      which provides the needed high resolution.  However, we don't
 *      want uptime to be warped by e.g. calls to settimeofday.  So, we
 *      use a jiffies based monotonic clock to sanity check the uptime.
 *      If the uptime is more than one second from the monotonic time,
 *      we assume that the time of day has been set, and recalculate the
 *      uptime base to get uptime back on track with monotonic time.  On
 *      the other hand, we do expect jiffies based monotonic time and
 *      timeofday to have small drift (due to NTP rate correction, etc).
 *      We handle this by rebasing the jiffies based monotonic clock
 *      every second (see HostIFUptimeResyncMono).
 *      
 * Results:
 *      The uptime, in units of UPTIME_FREQ.  Also returns the jiffies
 *      value that was used in the monotonic time calculation.
 *
 * Side effects:
 *      May reset the uptime base in the case gettimeofday warp was 
 *      detected.
 *
 *----------------------------------------------------------------------
 */

static uint64
HostIFReadUptimeWork(unsigned long *j)  // OUT: current jiffies 
{
   struct timeval tv;
   uint64 monotime, uptime, upBase, monoBase;
   int64 diff;
   uint32 version;
   unsigned long jifs, jifBase;
   unsigned int attempts = 0;  /* Number of uptimeBase corrections tried. */

   /* Assert that HostIF_InitUptime has been called. */
   ASSERT(uptimeState.timer.function);

 retry:
   /*
    * Snapshot jiffies together with both monotonic bases.  The loop repeats
    * until VersionedAtomic_EndTryRead() accepts the version sampled before
    * the reads (presumably meaning no writer such as HostIFUptimeResyncMono
    * changed the bases in between -- confirm against VersionedAtomic).
    */
   do {
      version  = VersionedAtomic_BeginTryRead(&uptimeState.version);
      jifs     = jiffies;
      jifBase  = uptimeState.jiffiesBase;
      monoBase = uptimeState.monotimeBase;
   } while (!VersionedAtomic_EndTryRead(&uptimeState.version, version));

   do_gettimeofday(&tv);
   upBase = Atomic_Read64(&uptimeState.uptimeBase);
   
   /* Monotonic estimate: jiffies elapsed since the base, in UPTIME_FREQ units. */
   monotime = (uint64)(jifs - jifBase) * (UPTIME_FREQ / HZ);
   monotime += monoBase;

   /* Time-of-day estimate: wall clock in UPTIME_FREQ units plus uptimeBase. */
   uptime = tv.tv_usec * (UPTIME_FREQ / 1000000) + tv.tv_sec * UPTIME_FREQ;
   uptime += upBase;
   
   /* 
    * Use the jiffies based monotonic time to sanity check gettimeofday.
    * If they differ by more than one second, assume the time of day has
    * been warped, and use the jiffies time to undo (most of) the warp.
    */

   diff = uptime - monotime;
   if (UNLIKELY(diff < -UPTIME_FREQ || diff > UPTIME_FREQ)) {
      /* Compute a new uptimeBase to get uptime back on track. */
      uint64 newUpBase = monotime - (uptime - upBase);

      attempts++;
      if (!Atomic_CMPXCHG64(&uptimeState.uptimeBase, &upBase, &newUpBase) && 
          attempts < 5) {
         /* Another thread updated uptimeBase.  Recalculate uptime. */
         goto retry;
      }
      /*
       * Reached either after a successful exchange or once five attempts
       * have been made; in both cases the monotonic value is reported.
       * NOTE(review): the log line below is also emitted when the fifth
       * exchange failed, i.e. when uptimeBase was not actually replaced.
       */
      uptime = monotime;

      Log("%s: detected settimeofday: fixed uptimeBase old %"FMT64"u "
          "new %"FMT64"u attempts %u\n", __func__,
          upBase, newUpBase, attempts);
   }
   /* Report the jiffies value the monotonic calculation was based on. */
   *j = jifs;

   return uptime;
}


/*
 *----------------------------------------------------------------------
 *
 * HostIFUptimeResyncMono --
 *
 *      Timer that fires every second to resynchronize the jiffies based
 *      monotonic time with the uptime.
 *
 * Results:
 *      None
 *
 * Side effects:
 *      Resets the monotonic time bases so that jiffies based monotonic
 *      time does not drift from gettimeofday over the long term.
 *
 *----------------------------------------------------------------------
 */

static void
HostIFUptimeResyncMono(unsigned long data)  // IN: ignored
{
   unsigned long jifs;
   uintptr_t flags;

   /* 
    * Read the uptime and the corresponding jiffies value.  This will
    * also correct the uptime (which is based on time of day) if needed
    * before we rebase monotonic time (which is based on jiffies).
    */

   uint64 uptime = HostIFReadUptimeWork(&jifs);

   /* 
    * Every second, recalculate monoBase and jiffiesBase to squash small
    * drift between gettimeofday and jiffies.  Also, this prevents
    * (jiffies - jiffiesBase) wrap on 32-bits.
    */

   /*
    * Both bases are replaced between BeginWrite and EndWrite so that the
    * versioned read loop in HostIFReadUptimeWork sees a matching pair.
    * Interrupts are masked for the duration of the update (presumably so
    * a reader on this CPU cannot interrupt a half-finished write -- TODO
    * confirm).
    */
   SAVE_FLAGS(flags);
   CLEAR_INTERRUPTS();
   VersionedAtomic_BeginWrite(&uptimeState.version);

   uptimeState.monotimeBase = uptime;
   uptimeState.jiffiesBase  = jifs;

   VersionedAtomic_EndWrite(&uptimeState.version);
   RESTORE_FLAGS(flags);

   /* Reschedule this timer to expire in one second. */
   mod_timer(&uptimeState.timer, jifs + HZ);
}


/*
 *----------------------------------------------------------------------
 *
 * HostIF_InitUptime --
 *
 *      Initialize the uptime clock's state.
 *
 * Results:
 *      None
 *
 * Side effects:
 *      Sets the initial values for the uptime state, and schedules
 *      the uptime timer.
 *
 *----------------------------------------------------------------------
 */

void
HostIF_InitUptime(void)
{
   struct timeval tv;

   /* Sample jiffies and the time of day back to back as the starting point. */
   uptimeState.jiffiesBase = jiffies;
   do_gettimeofday(&tv);
   /*
    * uptimeBase is the negated current time of day (in UPTIME_FREQ units),
    * so that time of day plus uptimeBase starts counting from zero.
    */
   Atomic_Write64(&uptimeState.uptimeBase, 
                  -(tv.tv_usec * (UPTIME_FREQ / 1000000) + 
                    tv.tv_sec * UPTIME_FREQ));

   /* The handler must be installed before the timer is armed. */
   init_timer(&uptimeState.timer);
   uptimeState.timer.function = HostIFUptimeResyncMono;
   /* First resync fires one second (HZ jiffies) from now. */
   mod_timer(&uptimeState.timer, jiffies + HZ);
}


/*
 *----------------------------------------------------------------------
 *
 * HostIF_CleanupUptime --
 *
 *      Cleanup uptime state, called at module unloading time.
 *
 * Results:
 *      None
 *
 * Side effects:
 *      Deschedule the uptime timer.
 *
 *----------------------------------------------------------------------
 */

void
HostIF_CleanupUptime(void)
{
   /* The result of del_timer_sync() is not needed here. */
   (void) del_timer_sync(&uptimeState.timer);
}


/*
 *----------------------------------------------------------------------
 *
 * HostIF_ReadUptime --
 *
 *      Read the system time.  Returned value has no particular absolute
 *      value, only difference since previous call should be used.
 *
 * Results:
 *      Units are given by HostIF_UptimeFrequency.
 *
 * Side effects:
 *      See HostIFReadUptimeWork
 *
 *----------------------------------------------------------------------
 */

uint64
HostIF_ReadUptime(void)
{
   unsigned long ignoredJiffies;  /* Required by the worker, unused here. */
   uint64 now = HostIFReadUptimeWork(&ignoredJiffies);

   return now;
}


/*
 *----------------------------------------------------------------------
 *
 * HostIF_UptimeFrequency
 *
 *      Return the frequency of the counter that HostIF_ReadUptime reads.
 *
 * Results:
 *      Frequency in Hz.
 *
 * Side effects:
 *      None
 *
 *----------------------------------------------------------------------
 */

uint64
HostIF_UptimeFrequency(void)
{
   /* Same unit HostIFReadUptimeWork() uses for its result. */
   uint64 ticksPerSecond = UPTIME_FREQ;

   return ticksPerSecond;
}


/*
 *-----------------------------------------------------------------------------
 *
 * HostIF_CopyFromUser --
 *
 *      Copy memory from the user application into a kernel buffer. This
 *      function may block, so don't call it while holding any kind of
 *      lock. --hpreg
 *
 * Results:
 *      0 on success
 *      -EFAULT on failure.
 *
 * Side effects:
 *      None
 *
 *-----------------------------------------------------------------------------
 */

int
HostIF_CopyFromUser(void *dst,        // OUT: kernel buffer to fill
                    const void *src,  // IN: user-space source address
                    unsigned int len) // IN: number of bytes to copy
{
   /* Any non-zero result from copy_from_user() is reported as -EFAULT. */
   if (copy_from_user(dst, src, len) != 0) {
      return -EFAULT;
   }

   return 0;
}


/*
 *-----------------------------------------------------------------------------
 *
 * HostIF_CopyToUser --
 *
 *      Copy memory to the user application from a kernel buffer. This
 *      function may block, so don't call it while holding any kind of
 *      lock. --hpreg
 *
 * Results:
 *      0 on success
 *      -EFAULT on failure.
 *
 * Side effects:
 *      None
 *
 *-----------------------------------------------------------------------------
 */

int
HostIF_CopyToUser(void *dst,        // OUT: user-space destination address
                  const void *src,  // IN: kernel buffer to copy from
                  unsigned int len) // IN: number of bytes to copy
{
   /* Any non-zero result from copy_to_user() is reported as -EFAULT. */
   if (copy_to_user(dst, src, len) != 0) {
      return -EFAULT;
   }

   return 0;
}


/*
 *-----------------------------------------------------------------------------
 *
 * HostIF_MapCrossPage --
 *    
 *    Obtain kernel pointer to crosspage. 
 *
 *    We must return a VA that is obtained through a kernel mapping, so that 
 *    the mapping never goes away (see bug 29753).
 *
 *    However, the LA corresponding to that VA must not overlap with the 
 *    monitor (see bug 32922). The userland code ensures that by only 
 *    allocating cross pages from low memory. For those pages, the kernel 
 *    uses a permanent mapping, instead of a temporary one with a high LA.
 *
 * Results:
 *    The kernel virtual address on success
 *    NULL on failure
 *
 * Side effects:
 *    None
 *
 *-----------------------------------------------------------------------------
 */

void *
HostIF_MapCrossPage(VMDriver *vm, // IN
                    VA64 uAddr)   // IN
{
   void *p = VA64ToPtr(uAddr);
   struct page *page;
   VA           vPgAddr;
   VA           ret;

   /* A non-zero result from HostIFGetUserPages() is treated as failure. */
   if (HostIFGetUserPages(p, &page, 1)) {
      return NULL;
   }
   /*
    * NOTE(review): the result of MapCrossPage() is used without a check
    * before the page is recorded below -- confirm whether it can fail.
    */
   vPgAddr = (VA) MapCrossPage(page);
   /* The crosspage table and its count are updated under the global lock. */
   HostIF_GlobalLock(16);
   if (vm->vmhost->crosspagePagesCount >= MAX_INITBLOCK_CPUS) {
      /* Table is full: release the lock, undo the mapping and fail. */
      HostIF_GlobalUnlock(16);
      UnmapCrossPage(page, (void*)vPgAddr);

      return NULL;
   }
   /* Remember the page so HostIF_FreeAllResources() can unmap it later. */
   vm->vmhost->crosspagePages[vm->vmhost->crosspagePagesCount++] = page;
   HostIF_GlobalUnlock(16);

   /* Carry the user address's offset within the page over to the kernel VA. */
   ret = vPgAddr | (((VA)p) & (PAGE_SIZE - 1));

   return (void*)ret;
}


/*
 *-----------------------------------------------------------------------------
 *
 * HostIF_AllocCrossGDT --
 *
 *      Allocate the per-vmmon cross GDT page set.
 *
 *      See bora/doc/worldswitch-pages.txt for the requirements on the cross
 *      GDT page set addresses.
 *
 * Results:
 *      On success: Host kernel virtual address of the first cross GDT page.
 *                  Use HostIF_FreeCrossGDT() with the same value to free.
 *                  The 'crossGDTMPNs' array is filled with the MPNs of all the
 *                  cross GDT pages.
 *      On failure: NULL.
 *
 * Side effects:
 *      None
 *
 *-----------------------------------------------------------------------------
 */

void *
HostIF_AllocCrossGDT(uint32 numPages,     // IN: Number of pages
                     MPN maxValidFirst,   // IN: Highest valid MPN of first page
                     MPN *crossGDTMPNs)   // OUT: Array of MPNs
{
   struct page *firstPage;
   MPN firstMPN;
   uint32 order;
   uint32 idx;

   /*
    * The first page has to be a "low" page.  Requiring MPN <= 0x100000 - 1
    * is in practice the same as requiring MPN <= 0xFEC00 - 1:
    *
    * o The PC architecture guarantees there is no RAM in the top 16MB of
    *   the 4GB range.
    *
    * o 0xFEC00000 is the IOAPIC base.  RAM may sit immediately below it,
    *   but not above.
    *
    * GFP_DMA32 is used when the kernel provides it.  64bit kernels that
    * predate GFP_DMA32 fall back to the DMA zone (not strictly needed on
    * boxes with less than ~3GB of memory).  32bit kernels allocate from the
    * normal zone, usually 1GB and at most 4GB (4GB/4GB kernels), where the
    * same "no RAM in the 16MB just below 4GB" argument applies, so
    * alloc_pages() cannot hand back such a page.
    */

   ASSERT(0xFEC00 - 1 <= maxValidFirst);

   /* Smallest allocation order whose page count covers numPages. */
   order = 0;
   while ((1 << order) < numPages) {
      order++;
   }

#ifdef GFP_DMA32
   firstPage = alloc_pages(GFP_KERNEL | GFP_DMA32, order);
#else
   firstPage = alloc_pages(GFP_KERNEL | GFP_DMA, order);
#endif
   if (firstPage == NULL) {
      Warning("%s: unable to alloc crossGDT (%u)\n", __func__, order);

      return NULL;
   }

   /* The pages are physically contiguous, so the MPNs are consecutive. */
   firstMPN = page_to_pfn(firstPage);
   for (idx = 0; idx < numPages; idx++) {
      crossGDTMPNs[idx] = firstMPN + idx;
   }

   return (void *)page_address(firstPage);
}


/*
 *-----------------------------------------------------------------------------
 *
 * HostIF_FreeCrossGDT --
 *
 *      Free the per-vmmon cross GDT page set allocated with
 *      HostIF_AllocCrossGDT().
 *
 * Results:
 *      None
 *
 * Side effects:
 *      None
 *
 *-----------------------------------------------------------------------------
 */

void
HostIF_FreeCrossGDT(uint32 numPages, // IN: Number of pages
                    void *crossGDT)  // IN: Kernel VA of first cross GDT page
{
   uint32 order = 0;

   /* Recompute the allocation order the same way HostIF_AllocCrossGDT does. */
   while ((1 << order) < numPages) {
      order++;
   }

   free_pages((VA)crossGDT, order);
}


/*
 *-----------------------------------------------------------------------------
 *
 * HostIF_VMLock --
 *
 *      Grabs per-VM data structure lock. The lock is not recursive.
 *      The global lock has lower rank so the global lock should be grabbed
 *      first if both locks are acquired.
 *
 *      It should be a medium contention lock. Also it should be fast:
 *      it is used for protecting of frequent page allocation and locking.
 *
 * Results:
 *      None
 *
 * Side effects:
 *      The current thread is rescheduled if the lock is busy.
 *
 *-----------------------------------------------------------------------------
 */

void
HostIF_VMLock(VMDriver *vm, // IN
              int callerID) // IN
{
   /* Both the VM and its host-specific state must exist before locking. */
   ASSERT(vm);

   ASSERT(vm->vmhost);
   /* callerID is passed through to MutexLock() unchanged. */
   MutexLock(&vm->vmhost->vmMutex, callerID);
}


/*
 *-----------------------------------------------------------------------------
 *
 * HostIF_VMUnlock --
 *
 *      Releases per-VM data structure lock.
 *
 * Results:
 *      None
 *
 * Side effects:
 *      Can wake up the thread blocked on this lock. 
 *
 *-----------------------------------------------------------------------------
 */

void
HostIF_VMUnlock(VMDriver *vm, // IN
                int callerID) // IN
{
   /* Both the VM and its host-specific state must exist before unlocking. */
   ASSERT(vm);

   ASSERT(vm->vmhost);
   /* callerID is passed through to MutexUnlock() unchanged. */
   MutexUnlock(&vm->vmhost->vmMutex, callerID);
}


#ifdef VMX86_DEBUG
/*
 *-----------------------------------------------------------------------------
 *
 * HostIF_VMLockIsHeld --
 *
 *      Determine if the per-VM lock is held by the current thread.
 * 
 * Results:
 *      TRUE if yes
 *      FALSE if no
 *
 * Side effects:
 *      None
 *
 *-----------------------------------------------------------------------------
 */

Bool
HostIF_VMLockIsHeld(VMDriver *vm) // IN
{
   ASSERT(vm);
   ASSERT(vm->vmhost);

   /* The answer comes directly from MutexIsLocked() on the per-VM mutex. */
   return MutexIsLocked(&vm->vmhost->vmMutex);
}
#endif


/*
 * Utility routines for accessing and enabling the APIC
 */

/*
 * Defines for accessing the APIC.  We use readl/writel to access the APIC
 * which is how Linux wants you to access I/O memory (though on the x86
 * just dereferencing a pointer works just fine).
 */
/*
 * Register 'reg' lives 16 bytes (reg << 4) after register 0.  The macro
 * parameters are parenthesized so that expression arguments such as
 * "base + off" or "reg + 1" are evaluated as a unit.
 */
#define APICR_TO_ADDR(apic, reg)      ((apic) + ((reg) << 4))
#define GET_APIC_REG(apic, reg)       (readl(APICR_TO_ADDR(apic, reg)))
#define SET_APIC_REG(apic, reg, val)  (writel((val), APICR_TO_ADDR(apic, reg)))

/* Bits 23:16 and bits 7:0 of the APIC version register, respectively. */
#define APIC_MAXLVT(apic)             ((GET_APIC_REG(apic, APICR_VERSION) >> 16) & 0xff)
#define APIC_VERSIONREG(apic)         (GET_APIC_REG(apic, APICR_VERSION) & 0xff)


#if defined(CONFIG_SMP) || defined(CONFIG_X86_UP_IOAPIC) || \
    defined(CONFIG_X86_UP_APIC) || defined(CONFIG_X86_LOCAL_APIC)
/*
 *----------------------------------------------------------------------
 *
 * isVAReadable --
 *
 *      Verify that passed VA is accessible without crash...
 *
 * Results:
 *      TRUE if address is readable, FALSE otherwise.
 *
 * Side effects:
 *      None.
 *
 *----------------------------------------------------------------------
 */
 
static Bool
isVAReadable(VA r)  // IN: kernel VA of the (candidate) APIC mapping
{
   /* Address of the APIC version register inside the mapping. */
   VA versionReg = APICR_TO_ADDR(r, APICR_VERSION);
   mm_segment_t savedFs = get_fs();
   uint32 probe;
   int err;

   /*
    * Switch the address limit to get_ds() so that the user-copy helper
    * accepts a kernel address, attempt the read, then restore the limit.
    */
   set_fs(get_ds());
   err = HostIF_CopyFromUser(&probe, (void*)versionReg, sizeof(probe));
   set_fs(savedFs);

   return err == 0;
}


/*
 *----------------------------------------------------------------------
 *
 * SetVMAPICAddr --
 *
 *      Maps the host cpu's APIC.  The virtual address is stashed in
 *      the VMDriver structure.
 *
 * Results:
 *      None.
 *
 * Side effects:
 *      The VMDriver structure is updated.
 *
 *----------------------------------------------------------------------
 */

static void
SetVMAPICAddr(VMDriver *vm, // IN/OUT: driver state
              MA ma)        // IN: host APIC's ma
{
   volatile void *mapping;

   ASSERT_ON_COMPILE(APICR_SIZE <= PAGE_SIZE);
   mapping = (volatile void *) ioremap_nocache(ma, PAGE_SIZE);
   if (mapping == NULL) {
      return;
   }

   /* Keep the mapping only when the version register reads as 0x1X. */
   if ((APIC_VERSIONREG(mapping) & 0xF0) != 0x10) {
      iounmap((void*)mapping);
      return;
   }

   vm->hostAPIC.base = (volatile uint32 (*)[4]) mapping;
   ASSERT(vm->vmhost != NULL);
   /* Remembered so HostIF_FreeAllResources() knows to iounmap() it. */
   vm->vmhost->hostAPICIsMapped = TRUE;
}


/*
 *----------------------------------------------------------------------
 *
 * ProbeAPIC --
 *
 *      Attempts to map the host APIC.
 *
 *      Most versions of Linux already provide access to a mapped
 *      APIC.  This function is just a backup.
 *
 *      Caveat: We assume that the APIC physical address is the same
 *      on all host cpus.
 *
 * Results:
 *      TRUE if APIC was found, FALSE if not.
 *
 * Side effects:
 *      May map the APIC.
 *
 *----------------------------------------------------------------------
 */

static Bool
ProbeAPIC(VMDriver *vm,   // IN/OUT: driver state
          Bool setVMPtr)  // IN: set a pointer to the APIC's virtual address
{
   MA apicMA = APIC_GetMA();

   /* (MA)-1 is APIC_GetMA()'s way of saying that no APIC was found. */
   if (apicMA != (MA)-1) {
      if (!setVMPtr) {
         vm->hostAPIC.base = NULL;
      } else {
         SetVMAPICAddr(vm, apicMA);
      }

      return TRUE;
   }

   return FALSE;
}
#endif


/*
 *----------------------------------------------------------------------
 *
 * HostIF_APICInit --
 *
 *      Initialize APIC behavior.
 *      Attempts to map the host APIC into vm->hostAPIC.
 *
 *      We don't attempt to refresh the mapping after a host cpu
 *      migration.  Fortunately, hosts tend to use the same address
 *      for all APICs.
 *
 *      Most versions of Linux already provide a mapped APIC.  We
 *      have backup code to read APIC_BASE and map it, if needed.
 *
 * Results:
 *      TRUE
 *
 * Side effects:
 *      May map the host APIC.
 *
 *----------------------------------------------------------------------
 */
Bool
HostIF_APICInit(VMDriver *vm,   // IN:
                Bool setVMPtr,  // IN:
                Bool probe)     // IN: force probing
{
#if defined(CONFIG_SMP)         || defined(CONFIG_X86_UP_IOAPIC) || \
    defined(CONFIG_X86_UP_APIC) || defined(CONFIG_X86_LOCAL_APIC)
   static Bool apicIPILogged = FALSE;  /* Log the vectors only once. */
   VA kAddr;

   monitorIPIVector = SPURIOUS_APIC_VECTOR;
#if defined(POSTED_INTR_VECTOR)
   hvIPIVector      = POSTED_INTR_VECTOR;
#else
   /* No posted-interrupt vector defined by this kernel. */
   hvIPIVector      = 0;
#endif


   if (!apicIPILogged) {
      Log("Monitor IPI vector: %x\n", monitorIPIVector);
      Log("HV      IPI vector: %x\n", hvIPIVector);
      apicIPILogged = TRUE;
   }

   /*
    * x2APIC enabled: no memory-mapped APIC is recorded.  The VM state is
    * only touched when the caller asked for it via setVMPtr.
    */
   if ((__GET_MSR(MSR_APIC_BASE) & APIC_MSR_X2APIC_ENABLED) != 0) {
      if (setVMPtr) {
         vm->hostAPIC.base = NULL;
         vm->vmhost->hostAPICIsMapped = FALSE;
         vm->hostAPIC.isX2 = TRUE;
      }
      return TRUE;
   }

   /* Forced probing: done as soon as ProbeAPIC() reports an APIC. */
   if (probe && ProbeAPIC(vm, setVMPtr)) {
      return TRUE;
   }

   /*
    * Normal case: use Linux's pre-mapped APIC.
    */
   kAddr = __fix_to_virt(FIX_APIC_BASE);
   if (!isVAReadable(kAddr)) {
      /* Mapping is not readable: leave hostAPIC.base as it is. */
      return TRUE;
   }
   if (setVMPtr) {
      vm->hostAPIC.base = (void *)kAddr;
   } else {
      vm->hostAPIC.base = NULL;
   }
#endif
   /* Every path reports TRUE, including kernels built without APIC support. */
   return TRUE;
}


/*
 *-----------------------------------------------------------------------------
 *
 * HostIF_SemaphoreWait --
 *
 *    Perform the semaphore wait (P) operation, possibly blocking.
 *
 * Result:
 *    1 (which equals MX_WAITNORMAL) if success, 
 *    negated error code otherwise.
 *
 * Side-effects:
 *    None
 *
 *-----------------------------------------------------------------------------
 */

int
HostIF_SemaphoreWait(VMDriver *vm,   // IN:
                     Vcpuid vcpuid,  // IN:
                     uint64 *args)   // IN: [0] = fd to wait on, [2] = timeout (ms)
{
   struct file *file;
   mm_segment_t old_fs;
   int res;
   int waitFD = args[0];
   int timeoutms = args[2];
   uint64 value;

   file = vmware_fget(waitFD);
   if (file == NULL) {
      return MX_WAITERROR;
   }

   /*
    * The descriptor is chosen by the caller and may refer to any kind of
    * file.  The poll and read methods are optional in file_operations, so
    * make sure they exist before calling through them; a file without them
    * cannot serve as a semaphore here.
    */

   if (file->f_op == NULL || file->f_op->poll == NULL ||
       file->f_op->read == NULL) {
      fput(file);

      return MX_WAITERROR;
   }

   /* Allow the kernel-resident 'value' buffer to be used by read() below. */
   old_fs = get_fs();
   set_fs(get_ds());

   {
      struct poll_wqueues table;
      unsigned int mask;
      
      poll_initwait(&table);
      /*
       * Mark the task as sleeping before polling so that a wakeup arriving
       * between the poll and schedule_timeout() is not lost.
       */
      current->state = TASK_INTERRUPTIBLE;
      mask = file->f_op->poll(file, &table.pt);
      if (!(mask & (POLLIN | POLLERR | POLLHUP))) {
         /*
          * Nothing to read yet.  Publish this task so that
          * HostIF_SemaphoreForceWakeup() can find it while it sleeps.
          * NOTE(review): timeoutms * HZ is computed in int and can
          * overflow for very large timeouts.
          */
         vm->vmhost->vcpuSemaTask[vcpuid] = current;
         schedule_timeout(timeoutms * HZ / 1000);  // convert to Hz
         vm->vmhost->vcpuSemaTask[vcpuid] = NULL;
      }
      current->state = TASK_RUNNING;
      poll_freewait(&table);
   }

   /*
    * Userland only writes in multiples of sizeof(uint64). This will allow
    * the code to happily deal with a pipe or an eventfd. We only care about
    * reading no bytes (EAGAIN - non blocking fd) or sizeof(uint64).
    */

   res = file->f_op->read(file, (char *) &value, sizeof value, &file->f_pos);

   if (res == sizeof value) {
      res = MX_WAITNORMAL;
   } else {
      /* A zero-byte read is handled like a bad descriptor. */
      if (res == 0) {
         res = -EBADF;
      }
   }

   set_fs(old_fs);
   fput(file);

   /*
    * Handle benign errors:
    * EAGAIN is MX_WAITTIMEDOUT.
    * The signal-related errors are all mapped into MX_WAITINTERRUPTED.
    * Any other value is returned to the caller unchanged.
    */

   switch (res) {
   case -EAGAIN:
      res = MX_WAITTIMEDOUT;
      break;
   case -EINTR:
   case -ERESTART:
   case -ERESTARTSYS:
   case -ERESTARTNOINTR:
   case -ERESTARTNOHAND:
      res = MX_WAITINTERRUPTED;
      break;
   case -EBADF:
      res = MX_WAITERROR;
      break;
   }
   return res;
}


/*
 *-----------------------------------------------------------------------------
 *
 * HostIF_SemaphoreForceWakeup --
 *
 *    For each VCPU in the set whose target process is lightly sleeping (i.e.
 *    TASK_INTERRUPTIBLE), wake it up.  The target process can be waiting on a
 *    semaphore or due to a call to Vmx86_YieldToSet.
 *
 * Result:
 *    None.
 *
 * Side-effects:
 *    None
 *
 *-----------------------------------------------------------------------------
 */

void
HostIF_SemaphoreForceWakeup(VMDriver *vm,       // IN:
                            const VCPUSet *vcs) // IN: VCPUs to wake up
{
   FOR_EACH_VCPU_IN_SET(vcs, vcpuid) {
      struct task_struct *sleeper = vm->vmhost->vcpuSemaTask[vcpuid];

      /* The slot is cleared whether or not a task ends up being woken. */
      vm->vmhost->vcpuSemaTask[vcpuid] = NULL;
      if (sleeper != NULL) {
         if (sleeper->state & TASK_INTERRUPTIBLE) {
            wake_up_process(sleeper);
         }
      }
   } ROF_EACH_VCPU_IN_SET();
}


/*
 *-----------------------------------------------------------------------------
 *
 * HostIF_SemaphoreSignal --
 *
 *      Perform the semaphore signal (V) operation.
 *
 * Result:
 *      On success: MX_WAITNORMAL (1).
 *      On error: MX_WAITINTERRUPTED (3) if interrupted by a Unix signal (we
 *                   can block on a preemptive kernel).
 *                MX_WAITERROR (0) on generic error.
 *                Negated system error (< 0).
 *
 * Side-effects:
 *      None
 *
 *-----------------------------------------------------------------------------
 */

int
HostIF_SemaphoreSignal(uint64 *args)  // IN: [1] = fd to signal
{
   struct file *file;
   mm_segment_t old_fs;
   int res;
   int signalFD = args[1];
   uint64 value = 1;  // make an eventfd happy should it be there

   file = vmware_fget(signalFD);
   if (!file) {
      return MX_WAITERROR;
   }

   /*
    * The descriptor is chosen by the caller and may refer to any kind of
    * file.  The write method is optional in file_operations, so make sure
    * it exists before calling through it.
    */

   if (file->f_op == NULL || file->f_op->write == NULL) {
      fput(file);

      return MX_WAITERROR;
   }

   /* Allow the kernel-resident 'value' buffer to be used by write() below. */
   old_fs = get_fs();
   set_fs(get_ds());

   /*
    * Always write sizeof(uint64) bytes. This works fine for eventfd and
    * pipes. The data written is formatted to make an eventfd happy should
    * it be present.
    */

   res = file->f_op->write(file, (char *) &value, sizeof value, &file->f_pos);

   if (res == sizeof value) {
      res = MX_WAITNORMAL;
   }

   set_fs(old_fs);
   fput(file);

   /*
    * Handle benign errors:
    * EAGAIN means the semaphore is already signalled and counts as success.
    * The signal-related errors are all mapped into MX_WAITINTERRUPTED.
    * Any other value is returned to the caller unchanged.
    */

   switch (res) {
   case -EAGAIN:
      // The pipe is full, so it is already signalled. Success.
      res = MX_WAITNORMAL;
      break;
   case -EINTR:
   case -ERESTART:
   case -ERESTARTSYS:
   case -ERESTARTNOINTR:
   case -ERESTARTNOHAND:
      res = MX_WAITINTERRUPTED;
      break;
   }
   return res;
}

/*
 * Kernels older than 2.6.27 and non-SMP builds deliver IPIs through
 * compat_smp_call_function() (see HostIF_IPI) instead of a targeted IPI.
 */
#if ((LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 27)) || !defined(CONFIG_SMP))
#   define VMMON_USE_CALL_FUNC
#endif

#if defined(VMMON_USE_CALL_FUNC)
/*
 *----------------------------------------------------------------------
 *
 * LinuxDriverIPIHandler  --
 *
 *      Null IPI handler - for monitor to notice AIO completion.
 *      It does nothing; it only exists to be passed to
 *      compat_smp_call_function() by HostIF_IPI.
 *
 *----------------------------------------------------------------------
 */
void
LinuxDriverIPIHandler(void *info)
{
   return;
}

/* Value passed as the last argument of compat_smp_call_function(). */
#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 17)
#define VMMON_CALL_FUNC_SYNC 0  // async; we've not seen any problems
#else
#define VMMON_CALL_FUNC_SYNC 1  // sync; ensure no problems from old releases
#endif

#endif


/*
 *----------------------------------------------------------------------
 *
 * HostIF_IPI --
 *
 *    If the passed VCPU threads are on some CPUs in the system,
 *    attempt to hit them with an IPI.
 *
 *    For every VCPU in 'ipiTargets' the host CPU recorded in
 *    vm->currentHostCpu[] is looked up; VCPUs whose entry is
 *    INVALID_PCPU are skipped.
 *
 *    On older Linux systems (VMMON_USE_CALL_FUNC defined) we do a single
 *    broadcast as soon as the first valid target is found and stop
 *    scanning. Otherwise each valid target CPU gets its own IPI.
 *
 * Result:
 *    The mode used to send IPIs: IPI_NONE if no target had a valid host
 *    CPU, IPI_BROADCAST or IPI_UNICAST otherwise.
 *
 *----------------------------------------------------------------------
 */

HostIFIPIMode
HostIF_IPI(VMDriver *vm,                // IN:
           const VCPUSet *ipiTargets)   // IN:
{
   HostIFIPIMode mode = IPI_NONE;

   ASSERT(vm);

   FOR_EACH_VCPU_IN_SET(ipiTargets, v) {
      uint32 targetHostCpu = vm->currentHostCpu[v];
      if (targetHostCpu != INVALID_PCPU) {
         ASSERT(targetHostCpu < MAX_PCPUS);
#if defined(VMMON_USE_CALL_FUNC)
         /*
          * older kernels IPI broadcast; use async when possible.
          * One broadcast reaches every CPU, so leave the loop right after.
          */
         (void) compat_smp_call_function(LinuxDriverIPIHandler,
                                         NULL, VMMON_CALL_FUNC_SYNC);
	 mode = IPI_BROADCAST;
	 break;
#else
         /* Newer kernels have (async) IPI targeting */
         arch_send_call_function_single_ipi(targetHostCpu);
	 mode = IPI_UNICAST;
#endif
      }
   } ROF_EACH_VCPU_IN_SET();

   return mode;
}


/*
 * Argument block handed to HostIFGetCpuInfo() on every CPU by
 * HostIF_GetAllCpuInfo().
 */
typedef struct {
   /*
    * Shared counter: each invocation of HostIFGetCpuInfo() atomically
    * increments it and uses the value it obtained as its slot in
    * query->logicalCPUs[].
    */
   Atomic_uint32 index;
   /* The CPUID query being answered; supplies eax/ecx and the output array. */
   CPUIDQuery *query;
} HostIFGetCpuInfoData;


/*
 *-----------------------------------------------------------------------------
 *
 * HostIFGetCpuInfo --
 *
 *      Collect CPUID information on the current logical CPU.
 *
 *      A slot index is claimed from 'data->index'. If that index fits in
 *      the 'query->logicalCPUs' array, the slot is tagged with the current
 *      physical CPU number and filled with the CPUID result for
 *      'query->eax' / 'query->ecx'. If it does not fit, nothing is stored.
 *
 * Results:
 *      None.
 *
 * Side effects:
 *      'data->index' is atomically incremented by one, whether or not a
 *      slot was filled.
 *
 *-----------------------------------------------------------------------------
 */

static void
HostIFGetCpuInfo(void *clientData) // IN/OUT: A HostIFGetCpuInfoData *
{
   HostIFGetCpuInfoData *info = clientData;
   CPUIDQuery *cpuidQuery;
   uint32 slot;

   ASSERT(info);
   cpuidQuery = info->query;
   ASSERT(cpuidQuery);

   /* Claim a slot unconditionally so the caller can count every reply. */
   slot = Atomic_ReadInc32(&info->index);

   if (slot < cpuidQuery->numLogicalCPUs) {
      cpuidQuery->logicalCPUs[slot].tag = HostIF_GetCurrentPCPU();
      __GET_CPUID2(cpuidQuery->eax, cpuidQuery->ecx,
                   &cpuidQuery->logicalCPUs[slot].regs);
   }
}


/*
 *-----------------------------------------------------------------------------
 *
 * HostIF_GetAllCpuInfo --
 *
 *      Collect CPUID information on all logical CPUs.
 *
 *      On entry 'query->numLogicalCPUs' is the capacity of the
 *      'query->logicalCPUs' output array.
 *
 * Results:
 *      On success: TRUE. 'query->logicalCPUs' is filled and
 *                  'query->numLogicalCPUs' is set to the number of CPUs
 *                  that replied.
 *      On failure: FALSE. Happens if 'query->numLogicalCPUs' was too small;
 *                  'query->numLogicalCPUs' is left unchanged.
 *
 * Side effects:
 *      None
 *
 *-----------------------------------------------------------------------------
 */

Bool
HostIF_GetAllCpuInfo(CPUIDQuery *query) // IN/OUT
{
   HostIFGetCpuInfoData collector;
   uint32 numReplies;

   collector.query = query;
   Atomic_Write32(&collector.index, 0);

   /*
    * XXX Linux has userland APIs to bind a thread to a processor, so we could
    *     probably implement this in userland like we do on Win32.
    */

   HostIF_CallOnEachCPU(HostIFGetCpuInfo, &collector);

   /* Every CPU that ran the callback bumped the index exactly once. */
   numReplies = Atomic_Read32(&collector.index);

   if (numReplies > query->numLogicalCPUs) {
      return FALSE;
   }

   ASSERT(numReplies <= query->numLogicalCPUs);
   query->numLogicalCPUs = numReplies;

   return TRUE;
}


/*
 *----------------------------------------------------------------------
 *
 * HostIF_CallOnEachCPU --
 *
 *      Call specified function once on each CPU.  No ordering guarantees.
 *
 *      The function is first run directly on the calling CPU and then
 *      handed to compat_smp_call_function() with a final argument of 1.
 *      Both steps happen with preemption disabled.
 *
 * Results:
 *      None.
 *
 * Side effects:
 *      None.  May be slow.
 *
 *----------------------------------------------------------------------
 */

void
HostIF_CallOnEachCPU(void (*func)(void*), // IN: function to call
                     void *data)          // IN/OUT: argument to function
{
   preempt_disable();

   /* This CPU first ... */
   func(data);

   /* ... then the cross-CPU call; its return value is deliberately ignored. */
   (void)compat_smp_call_function(func, data, 1);

   preempt_enable();
}


/*
 *----------------------------------------------------------------------
 *
 * HostIF_ReadPage --
 *
 *      Copies the PAGE_SIZE bytes of a machine page into a kernel or
 *      user mode buffer.
 *
 * Results:
 *      0 on success
 *      negative error code on error: -EFAULT for INVALID_MPN, -ENOMEM if
 *      the page could not be mapped, or whatever HostIF_CopyToUser()
 *      returns for a user mode buffer.
 *
 * Side effects:
 *      none
 *
 *----------------------------------------------------------------------
 */

int
HostIF_ReadPage(MPN mpn,             // MPN of the page
                VA64 addr,           // buffer for data
                Bool kernelBuffer)   // is the buffer in kernel space?
{
   void *dst = VA64ToPtr(addr);
   struct page *srcPage;
   const void *src;
   int status;

   if (mpn == INVALID_MPN) {
      return -EFAULT;
   }

   srcPage = pfn_to_page(mpn);
   src = kmap(srcPage);
   if (!src) {
      return -ENOMEM;
   }

   if (!kernelBuffer) {
      status = HostIF_CopyToUser(dst, src, PAGE_SIZE);
   } else {
      memcpy(dst, src, PAGE_SIZE);
      status = 0;
   }

   /* Drop the temporary mapping on both paths. */
   kunmap(srcPage);

   return status;
}


/*
 *----------------------------------------------------------------------
 *
 * HostIF_WritePage --
 *
 *      Copies PAGE_SIZE bytes from a kernel or user mode buffer into a
 *      machine page.
 *
 * Results:
 *      0 on success
 *      negative error code on error: -EFAULT for INVALID_MPN, -ENOMEM if
 *      the page could not be mapped, or whatever HostIF_CopyFromUser()
 *      returns for a user mode buffer.
 *
 * Side effects:
 *      none
 *
 *----------------------------------------------------------------------
 */

int
HostIF_WritePage(MPN mpn,              // MPN of the page
                 VA64 addr,            // data to write to the page
                 Bool kernelBuffer)    // is the buffer in kernel space?
{
   const void *src = VA64ToPtr(addr);
   struct page *dstPage;
   void *dst;
   int status;

   if (mpn == INVALID_MPN) {
      return -EFAULT;
   }

   dstPage = pfn_to_page(mpn);
   dst = kmap(dstPage);
   if (!dst) {
      return -ENOMEM;
   }

   if (!kernelBuffer) {
      status = HostIF_CopyFromUser(dst, src, PAGE_SIZE);
   } else {
      memcpy(dst, src, PAGE_SIZE);
      status = 0;
   }

   /* Drop the temporary mapping on both paths. */
   kunmap(dstPage);

   return status;
}


/*
 *----------------------------------------------------------------------
 *
 * HostIF_GetLockedPageList --
 *
 *      Puts MPNs of pages that were allocated by HostIF_AllocLockedPages()
 *      into a user mode buffer. The MPNs are obtained by walking the VM's
 *      AWEPages tracker with PhysTrack_GetNext(), starting from 0, and at
 *      most 'numPages' of them are copied.
 *
 * Results:
 *      non-negative number of the MPNs in the buffer on success (0 if the
 *      VM has no host state or no AWEPages tracker).
 *      negative error code on error (-EFAULT)
 *
 * Side effects:
 *      none
 *
 *----------------------------------------------------------------------
 */

int
HostIF_GetLockedPageList(VMDriver* vm,          // IN: VM instance pointer
                         VA64 uAddr,            // OUT: user mode buffer for MPNs
                         unsigned int numPages) // IN: size of the buffer in MPNs
{
   MPN *userMPNs = VA64ToPtr(uAddr);
   struct PhysTracker *tracker;
   unsigned copied = 0;
   MPN cursor = 0;

   if (!vm->vmhost || !vm->vmhost->AWEPages) {
      return 0;
   }
   tracker = vm->vmhost->AWEPages;

   while (copied < numPages) {
      cursor = PhysTrack_GetNext(tracker, cursor);
      if (cursor == INVALID_MPN) {
         /* Tracker exhausted. */
         break;
      }

      if (HostIF_CopyToUser(&userMPNs[copied], &cursor,
                            sizeof *userMPNs) != 0) {
         return -EFAULT;
      }
      copied++;
   }

   return copied;
}


/*
 *-----------------------------------------------------------------------------
 *
 * HostIF_GetNextAnonPage --
 *
 *      If "inMPN" is INVALID_MPN gets the first MPN in the anon mpn list else
 *      gets the anon mpn after "inMPN" in the anon mpn list.
 *
 * Results:
 *      Next anon MPN. If the list has been exhausted, or the VM has no
 *      host state or no AWEPages tracker, returns INVALID_MPN.
 *
 *-----------------------------------------------------------------------------
 */

MPN
HostIF_GetNextAnonPage(VMDriver *vm, MPN inMPN)
{
   if (vm->vmhost && vm->vmhost->AWEPages) {
      return PhysTrack_GetNext(vm->vmhost->AWEPages, inMPN);
   }

   return INVALID_MPN;
}


/*
 *----------------------------------------------------------------------
 *
 * HostIF_GetCurrentPCPU --
 *
 *    Get current physical CPU id.  Interrupts should be disabled so
 *    that the thread cannot move to another CPU.
 *
 * Results:
 *    Host CPU number, as reported by smp_processor_id().
 *
 * Side effects:
 *    None.
 *
 *----------------------------------------------------------------------
 */

uint32
HostIF_GetCurrentPCPU(void)
{
   uint32 pcpu = smp_processor_id();

   return pcpu;
}


#ifdef VMMON_USE_COMPAT_SCHEDULE_HRTIMEOUT
/*
 *----------------------------------------------------------------------
 *
 * HostIFWakeupClockThread --
 *
 *      Wake up the fast clock thread.  Can't do this from the timer
 *      callback, because it holds locks that the scheduling code
 *      might take.
 *
 * Results:
 *      None.
 *
 * Side effects:
 *      The task stored in linuxState.fastClockThread is woken.
 *
 *----------------------------------------------------------------------
 */

static void
HostIFWakeupClockThread(unsigned long data)  //IN: unused
{
   struct task_struct *clockThread = linuxState.fastClockThread;

   wake_up_process(clockThread);
}


/*
 *----------------------------------------------------------------------
 *
 * HostIFTimerCallback --
 *
 *      hrtimer expiry handler: schedule a tasklet to wake up the fast
 *      clock thread rather than waking it from here directly.
 *
 * Results:
 *      HRTIMER_NORESTART, i.e. tell the kernel not to restart the timer.
 *
 * Side effects:
 *      'timerTasklet' is scheduled.
 *
 *----------------------------------------------------------------------
 */

static enum hrtimer_restart
HostIFTimerCallback(struct hrtimer *timer)  //IN: unused
{
   /* Hand the wake-up over to the tasklet. */
   tasklet_schedule(&timerTasklet);

   /* One-shot: the timer is not rearmed from here. */
   return HRTIMER_NORESTART;
}


/*
 *----------------------------------------------------------------------
 *
 * HostIFScheduleHRTimeout --
 *
 *      Schedule an hrtimer to wake up the fast clock thread, then sleep
 *      until the timer fires or the task is woken for another reason.
 *
 *      The caller is expected to have set the task state before calling
 *      (HostIFStartTimer() sets TASK_INTERRUPTIBLE).
 *
 *      Special values of 'expires', mirroring the kernel's
 *      schedule_hrtimeout():
 *        NULL          - no timeout; just sleep until woken.
 *        zero timeout  - already expired; return without sleeping.
 *
 * Results:
 *      None.
 *
 * Side effects:
 *      Sleep.
 *
 *----------------------------------------------------------------------
 */

static void
HostIFScheduleHRTimeout(ktime_t *expires)  //IN: relative timeout, may be NULL
{
   struct hrtimer t;

   /*
    * Without this check the unconditional '*expires' below would
    * dereference a NULL pointer.
    */

   if (!expires) {
      schedule();

      return;
   }

   if (!expires->tv64) {
      __set_current_state(TASK_RUNNING);

      return;
   }

   hrtimer_init(&t, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
   t.function = HostIFTimerCallback;
   hrtimer_start(&t, *expires, HRTIMER_MODE_REL);

   /* Only sleep if the timer has not already fired. */
   if (hrtimer_active(&t)) {
      schedule();
   }

   /* 't' lives on this stack frame, so it must be cancelled before return. */
   hrtimer_cancel(&t);
   __set_current_state(TASK_RUNNING);
}
#endif //VMMON_USE_COMPAT_SCHEDULE_HRTIMEOUT


#ifndef VMMON_USE_HIGH_RES_TIMERS
/*
 *----------------------------------------------------------------------
 *
 * HostIFDoIoctl --
 *
 *    Issue ioctl through the file's unlocked_ioctl method.  Assume kernel
 *    is not locked.  It is not true now, but it makes things easier to
 *    understand, and won't surprise us later when we get rid of kernel
 *    lock from our code.
 *
 * Results:
 *    Same as ioctl method, or -ENOIOCTLCMD if the file has no
 *    unlocked_ioctl method.
 *
 * Side effects:
 *    none.
 *
 *----------------------------------------------------------------------
 */

static long
HostIFDoIoctl(struct file *filp,
              u_int iocmd,
              unsigned long ioarg)
{
   if (!filp->f_op->unlocked_ioctl) {
      return -ENOIOCTLCMD;
   }

   return filp->f_op->unlocked_ioctl(filp, iocmd, ioarg);
}
#endif //VMON_USE_HIGH_RES_TIMERS


/*
 *----------------------------------------------------------------------
 *
 * HostIFStartTimer --
 *
 *      Starts the timer using either /dev/rtc or high-resolution timers,
 *      and sleeps for one timer period.
 *
 *      With VMMON_USE_HIGH_RES_TIMERS the period is derived from 'rate'
 *      and 'filp' is not used.  Otherwise 'filp' must be the open
 *      /dev/rtc file: its interrupt rate is reprogrammed when
 *      'rateChanged' is set, and one read is issued on it.
 *
 * Results:
 *      Returns 0 on success, -1 on failure.  The high-resolution timer
 *      variant always returns 0.
 *
 * Side effects:
 *      Sleep until timer expires.
 *
 *----------------------------------------------------------------------
 */

int
HostIFStartTimer(Bool rateChanged,  //IN: Did rate change? 
		 unsigned int rate, //IN: current clock rate
                 struct file *filp) //IN: /dev/rtc descriptor
{
#ifdef VMMON_USE_HIGH_RES_TIMERS
   /*
    * Static so that the values computed for the last rate change are
    * reused on every later call with rateChanged == FALSE.
    */
   static unsigned long slack = 0;
   static ktime_t expires;
   int timerPeriod;

   if (rateChanged) {
      /*
       * NOTE(review): divides by 'rate'; the caller in this file only
       * passes rates above MIN_RATE - confirm no other caller passes 0.
       */
      timerPeriod = NSEC_PER_SEC / rate; 
      expires = ktime_set(0, timerPeriod);
      /*
       * Allow the kernel to expire the timer at its convenience.
       * ppoll() uses 0.1% of the timeout value.  I think we can
       * tolerate 1%.
       */

      slack = timerPeriod / 100;
   }
   /* The task state must be set before either sleep primitive is called. */
   set_current_state(TASK_INTERRUPTIBLE);
#   ifdef VMMON_USE_SCHEDULE_HRTIMEOUT
   schedule_hrtimeout_range(&expires, slack, HRTIMER_MODE_REL);
#   else
   /* Compat path: the local helper takes no slack argument. */
   HostIFScheduleHRTimeout(&expires);
#   endif
#else
   unsigned p2rate;
   int res;
   unsigned long buf;
   loff_t pos = 0;

   if (rateChanged) {
      /*
       * The host will already have HZ timer interrupts per second.  So
       * in order to satisfy the requested rate, we need up to (rate -
       * HZ) additional interrupts generated by the RTC.  That way, if
       * the guest ask for a bit more than 1024 virtual interrupts per
       * second (which is a common case for Windows with multimedia
       * timers), we'll program the RTC to 1024 rather than 2048, which
       * saves a considerable amount of CPU.  PR 519228.
       */
      if (rate > HZ) {
         rate -= HZ;
      } else {
         rate = 0;
      }
      /*
       * Don't set the RTC rate to 64 Hz or lower: some kernels have a
       * bug in the HPET emulation of RTC that will cause the RTC
       * frequency to get stuck at 64Hz.  See PR 519228 comment #23.
       */
      p2rate = 128;
      // Hardware rate must be a power of 2; round up, capped at 8192
      while (p2rate < rate && p2rate < 8192) {
         p2rate <<= 1;
      }

      res = HostIFDoIoctl(filp, RTC_IRQP_SET, p2rate);
      if (res < 0) {
         Warning("/dev/rtc set rate %d failed: %d\n", p2rate, res);

         return -1;
      }
      /* Bail out before the read below if the thread was asked to stop. */
      if (kthread_should_stop()) {
         return -1;
      }
   }
   /*
    * The data read into 'buf' is never looked at; only the return value
    * of the read matters here.
    */
   res = filp->f_op->read(filp, (void *) &buf, sizeof(buf), &pos);
   if (res <= 0) {
      /* -ERESTARTSYS is not logged; every other failure is. */
      if (res != -ERESTARTSYS) {
         Log("/dev/rtc read failed: %d\n", res);
      }

      return -1;
   }
#endif

   return 0;
}


/*
 *----------------------------------------------------------------------
 *
 * HostIFFastClockThread --
 *
 *      Kernel thread that provides finer-grained wakeups than the
 *      main system timers by using /dev/rtc.  We can't do this at
 *      user level because /dev/rtc is not sharable (PR 19266).  Also,
 *      we want to avoid the overhead of a context switch out to user
 *      level on every RTC interrupt.
 *
 *      The thread keeps ticking while linuxState.fastClockRate stays
 *      above MIN_RATE.  It leaves the tick loop when the rate drops,
 *      when it is asked to stop, or when HostIFStartTimer() fails, and
 *      then parks until kthread_stop() is called.
 *
 * Results:
 *      Returns 0.
 *
 * Side effects:
 *      Wakeups and IPIs.
 *
 *----------------------------------------------------------------------
 */

static int
HostIFFastClockThread(void *data)  // IN: struct file * passed to kthread_run
{
   struct file *rtcFile = data;
   mm_segment_t savedFS = get_fs();
   unsigned int lastRate = 0;

   set_fs(KERNEL_DS);
   allow_signal(SIGKILL);
   set_user_nice(current, linuxState.fastClockPriority);

   for (;;) {
      unsigned int curRate = linuxState.fastClockRate;

      if (curRate <= MIN_RATE || kthread_should_stop()) {
         break;
      }

      /* Sleeps for one tick; reprograms the timer only on a rate change. */
      if (HostIFStartTimer(curRate != lastRate, curRate, rtcFile) < 0) {
         break;
      }
      lastRate = curRate;

#if defined(CONFIG_SMP)
      /*
       * IPI each VCPU thread that is in the monitor and is due to
       * fire a MonTimer callback.
       */
      Vmx86_MonTimerIPI();
#endif

      /*
       * Wake threads that are waiting for a fast poll timeout at
       * userlevel.  This is needed only on Linux.  On Windows,
       * we get shorter timeouts simply by increasing the host
       * clock rate.
       */

      LinuxDriverWakeUp(TRUE);
   }

   /* One final wake-up after leaving the tick loop, on every exit path. */
   LinuxDriverWakeUp(TRUE);
   set_fs(savedFS);

   /*
    * Do not exit thread until we are told to do so.
    */

   for (;;) {
      set_current_state(TASK_UNINTERRUPTIBLE);
      if (kthread_should_stop()) {
         break;
      }
      schedule();
   }
   set_current_state(TASK_RUNNING);

   return 0;
}


/*
 *-----------------------------------------------------------------------------
 *
 * HostIF_SetFastClockRate --
 *
 *      The monitor wants to poll for events at the given rate.
 *      Ensure that the host OS's timer interrupts come at least at
 *      this rate.  If the requested rate is greater than the rate at
 *      which timer interrupts will occur on CPUs other than 0, then
 *      also arrange to call Vmx86_MonitorPollIPI on every timer
 *      interrupt, in order to relay IPIs to any other CPUs that need
 *      them.
 *
 * Locking:
 *      The caller must hold the fast clock lock.
 *
 * Results:
 *      0 for success; positive error code if /dev/rtc could not be opened.
 *
 * Side effects:
 *      As described above.
 *
 *-----------------------------------------------------------------------------
 */

int
HostIF_SetFastClockRate(unsigned int rate) // IN: Frequency in Hz.
{
   ASSERT(MutexIsLocked(&fastClockMutex));
   linuxState.fastClockRate = rate;

   /*
    * Overview
    * --------
    * An SMP Linux kernel programs the 8253 timer (to increment the 'jiffies'
    * counter) _and_ all local APICs (to run the scheduler code) to deliver
    * interrupts HZ times a second.
    *
    * Time
    * ----
    * The kernel tries very hard to spread all these interrupts evenly over
    * time, i.e. on a 1 CPU system, the 1 local APIC phase is shifted by 1/2
    * period compared to the 8253, and on a 2 CPU system, the 2 local APIC
    * phases are respectively shifted by 1/3 and 2/3 period compared to the
    * 8253. This is done to reduce contention on locks guarding the global task
    * queue.
    *
    * Space
    * -----
    * The 8253 interrupts are distributed between physical CPUs, evenly on a P3
    * system, whereas on a P4 system physical CPU 0 gets all of them.
    *
    * Long story short, unless the monitor requested rate is significantly
    * higher than HZ, we don't need to send IPIs or exclusively grab /dev/rtc
    * to periodically kick vCPU threads running in the monitor on all physical
    * CPUs.
    */

   if (rate > MIN_RATE) {
      if (!linuxState.fastClockThread) {
         struct task_struct *rtcTask;
         struct file *filp = NULL;

#if !defined(VMMON_USE_HIGH_RES_TIMERS)
         int res;

         filp = filp_open("/dev/rtc", O_RDONLY, 0);
         if (IS_ERR(filp)) {
            Warning("/dev/rtc open failed: %d\n", (int)(VA)filp);

            return -(int)(VA)filp;
         }
         res = HostIFDoIoctl(filp, RTC_PIE_ON, 0);
         if (res < 0) {
            Warning("/dev/rtc enable interrupt failed: %d\n", res);
            filp_close(filp, current->files);

            return -res;
         }
#endif
         rtcTask = kthread_run(HostIFFastClockThread, filp, "vmware-rtc");
         if (IS_ERR(rtcTask)) {
            long err = PTR_ERR(rtcTask);

            /*
             * Ignore ERESTARTNOINTR silently, it occurs when signal is
             * pending, and syscall layer automatically reissues operation
             * after signal is handled.
             */

            if (err != -ERESTARTNOINTR) {
               Warning("/dev/rtc cannot start watch thread: %ld\n", err);
            }
	    close_rtc(filp, current->files);

            return -err;
         }
         linuxState.fastClockThread = rtcTask;
	 linuxState.fastClockFile = filp;
      }
   } else {
      if (linuxState.fastClockThread) {
         force_sig(SIGKILL, linuxState.fastClockThread);
         kthread_stop(linuxState.fastClockThread);
	 close_rtc(linuxState.fastClockFile, current->files);

         linuxState.fastClockThread = NULL;
	 linuxState.fastClockFile = NULL;
      }
   }

   return 0;
}


/*
 *-----------------------------------------------------------------------------
 *
 * HostIF_MapUserMem --
 *
 *      Obtain kernel pointer to user memory. The pages backing the user memory
 *      address are locked into memory (this allows the pointer to be used in
 *      contexts where paging is undesirable or impossible).
 *
 *      Exactly the pages overlapped by [addr, addr + size) are acquired
 *      (at least one, even for size 0). A single page is mapped with
 *      kmap(), several pages with vmap().
 *
 * Results:
 *      On success, returns the kernel virtual address corresponding to
 *      'addr' (i.e. including the offset into the first page), along with a
 *      handle to be used for unmapping.
 *      On failure, returns NULL and leaves '*handle' untouched.
 *
 * Side effects:
 *      Allocates the handle, holds a reference on every mapped page and
 *      creates a kernel mapping; all are released by HostIF_UnmapUserMem().
 *      Logs through printk().
 *
 *-----------------------------------------------------------------------------
 */

void *
HostIF_MapUserMem(VA addr,                  // IN: User memory virtual address
                  size_t size,              // IN: Size of memory desired
                  VMMappedUserMem **handle) // OUT: Handle to mapped memory
{
   void *p = (void *) (uintptr_t) addr;
   VMMappedUserMem *newHandle;
   VA offset = addr & (PAGE_SIZE - 1);
   size_t numPagesNeeded;
   size_t handleSize;
   void *mappedAddr;

   ASSERT(handle);

   if (!access_ok(VERIFY_WRITE, p, size)) {
      printk(KERN_ERR "%s: Couldn't verify write to uva 0x%p with size %"
             FMTSZ"u\n", __func__, p, size);

      return NULL;
   }

   /*
    * Round up to the number of pages the range really touches. The old
    * "(offset + size) / PAGE_SIZE + 1" asked for one page too many whenever
    * the range ended exactly on a page boundary, reaching past the region
    * that access_ok() verified above. Keep a minimum of one page so that a
    * zero-sized request still yields a valid mapping.
    */

   numPagesNeeded = (offset + size + PAGE_SIZE - 1) / PAGE_SIZE;
   if (numPagesNeeded == 0) {
      numPagesNeeded = 1;
   }
   handleSize = sizeof *newHandle +
                numPagesNeeded * sizeof newHandle->pages[0];

   newHandle = kmalloc(handleSize, GFP_KERNEL);
   if (newHandle == NULL) {
      printk(KERN_ERR "%s: Couldn't allocate %"FMTSZ"u bytes of memory\n",
             __func__, handleSize);

      return NULL;
   }

   if (HostIFGetUserPages(p, newHandle->pages, numPagesNeeded)) {
      kfree(newHandle);
      printk(KERN_ERR "%s: Couldn't get %"FMTSZ"u %s for uva 0x%p\n", __func__,
             numPagesNeeded, numPagesNeeded > 1 ? "pages" : "page", p);

      return NULL;
   }

   if (numPagesNeeded > 1) {
      /*
       * Unlike kmap(), vmap() can fail. If it does, we need to release the
       * pages that we acquired in HostIFGetUserPages().
       */

      mappedAddr = vmap(newHandle->pages, numPagesNeeded, VM_MAP, PAGE_KERNEL);
      if (mappedAddr == NULL) {
         size_t i;

         for (i = 0; i < numPagesNeeded; i++) {
            put_page(newHandle->pages[i]);
         }
         kfree(newHandle);
         printk(KERN_ERR "%s: Couldn't vmap %"FMTSZ"u %s for uva 0x%p\n",
                __func__, numPagesNeeded,
                numPagesNeeded > 1 ? "pages" : "page", p);

         return NULL;
      }
   } else {
      mappedAddr = kmap(newHandle->pages[0]);
   }

   printk(KERN_DEBUG "%s: p = 0x%p, offset = 0x%p, numPagesNeeded = %"FMTSZ"u,"
          " handleSize = %"FMTSZ"u, mappedAddr = 0x%p\n",
          __func__, p, (void *)offset, numPagesNeeded, handleSize, mappedAddr);

   /* Record what HostIF_UnmapUserMem() needs to undo the mapping. */
   newHandle->numPages = numPagesNeeded;
   newHandle->addr = mappedAddr;
   *handle = newHandle;

   /* The mapping starts at a page boundary; add the in-page offset back. */
   return mappedAddr + offset;
}


/*
 *-----------------------------------------------------------------------------
 *
 * HostIF_UnmapUserMem --
 *
 *      Unmap user memory from HostIF_MapUserMem(). A NULL handle is
 *      accepted and ignored.
 *
 * Results:
 *      None.
 *
 * Side effects:
 *      Removes the kernel mapping (vunmap() for several pages, kunmap()
 *      for one), drops the reference on every page and frees the handle.
 *      Logs through printk().
 *
 *-----------------------------------------------------------------------------
 */

void
HostIF_UnmapUserMem(VMMappedUserMem *handle) // IN: Handle to mapped memory
{
   unsigned int pageIdx = 0;

   if (!handle) {
      return;
   }

   printk(KERN_DEBUG "%s: numPages = %"FMTSZ"u, addr = 0x%p\n",
          __func__, handle->numPages, handle->addr);

   /* Undo the mapping with the counterpart of whatever created it. */
   if (handle->numPages <= 1) {
      kunmap(handle->pages[0]);
   } else {
      vunmap(handle->addr);
   }

   while (pageIdx < handle->numPages) {
      put_page(handle->pages[pageIdx]);
      pageIdx++;
   }

   kfree(handle);
}

/*
 *-----------------------------------------------------------------------------
 *
 * HostIF_SafeRDMSR --
 *
 *      Attempt to read a MSR, and handle the exception if the MSR
 *      is unimplemented.
 *
 * Results:
 *      0 if successful, and MSR value is returned via *val.
 *
 *      If the MSR is unimplemented, *val is set to 0, and a
 *      non-zero value is returned: -1 for Win32, -EFAULT for Linux,
 *      and 1 for MacOS.
 *
 * Side effects:
 *      None
 *
 *-----------------------------------------------------------------------------
 */
int
HostIF_SafeRDMSR(unsigned int msr,   // IN
                 uint64 *val)        // OUT: MSR value
{
   int ret;
   unsigned low, high;
   /*
    * Layout of the asm below:
    *   2:  the rdmsr itself; if it completes, 'xor %0,%0' sets ret to 0.
    *   1:  common exit point.
    *   3:  fixup code, placed in the .fixup section: loads operand %4
    *       (-EFAULT) into ret and jumps back to 1.
    * VMW_ASM_EXTABLE(2b, 3b) adds the exception table entry that pairs the
    * instruction at label 2 with the fixup at label 3.
    *
    * The "1"(0) and "2"(0) inputs preload eax and edx with 0, so 'low' and
    * 'high' are still 0 when the fixup path is taken and *val ends up 0.
    */
   asm volatile("2: rdmsr ; xor %0,%0\n"
                "1:\n\t"
                ".section .fixup,\"ax\"\n\t"
                "3: mov %4,%0 ; jmp 1b\n\t"
                ".previous\n\t"
                VMW_ASM_EXTABLE(2b, 3b)
                : "=r"(ret), "=a"(low), "=d"(high)
                : "c"(msr), "i"(-EFAULT), "1"(0), "2"(0)); // init eax/edx to 0
   /* rdmsr returns the low half in eax and the high half in edx. */
   *val = (low | ((u64)(high) << 32));

   return ret;
}