Pileus Git - ~andy/linux/blobdiff - arch/x86/kvm/emulate.c
Merge tag 'for-v3.11' of git://git.infradead.org/battery-2.6
[~andy/linux] / arch / x86 / kvm / emulate.c
index 5953dcea752d08e950d62293abbdec94ae95f62b..2bc1e81045b0f20f90ad2500c3acae318de1e8ee 100644 (file)
@@ -61,6 +61,8 @@
 #define OpMem8            26ull  /* 8-bit zero extended memory operand */
 #define OpImm64           27ull  /* Sign extended 16/32/64-bit immediate */
 #define OpXLat            28ull  /* memory at BX/EBX/RBX + zero-extended AL */
+#define OpAccLo           29ull  /* Low half of extended accumulator: AX/AX/EAX/RAX for 8/16/32/64-bit ops */
+#define OpAccHi           30ull  /* High half of extended accumulator: none/DX/EDX/RDX for 8/16/32/64-bit ops */
 
 #define OpBits             5  /* Width of operand field */
 #define OpMask             ((1ull << OpBits) - 1)
@@ -86,6 +88,7 @@
 #define DstMem64    (OpMem64 << DstShift)
 #define DstImmUByte (OpImmUByte << DstShift)
 #define DstDX       (OpDX << DstShift)
+#define DstAccLo    (OpAccLo << DstShift)
 #define DstMask     (OpMask << DstShift)
 /* Source operand type. */
 #define SrcShift    6
 #define SrcImm64    (OpImm64 << SrcShift)
 #define SrcDX       (OpDX << SrcShift)
 #define SrcMem8     (OpMem8 << SrcShift)
+#define SrcAccHi    (OpAccHi << SrcShift)
 #define SrcMask     (OpMask << SrcShift)
 #define BitOp       (1<<11)
 #define MemAbs      (1<<12)      /* Memory operand is absolute displacement */
 /* Source 2 operand type */
 #define Src2Shift   (31)
 #define Src2None    (OpNone << Src2Shift)
+#define Src2Mem     (OpMem << Src2Shift)
 #define Src2CL      (OpCL << Src2Shift)
 #define Src2ImmByte (OpImmByte << Src2Shift)
 #define Src2One     (OpOne << Src2Shift)
 #define Avx         ((u64)1 << 43)  /* Advanced Vector Extensions */
 #define Fastop      ((u64)1 << 44)  /* Use opcode::u.fastop */
 #define NoWrite     ((u64)1 << 45)  /* No writeback */
+#define SrcWrite    ((u64)1 << 46)  /* Write back src operand */
+
+#define DstXacc     (DstAccLo | SrcAccHi | SrcWrite)  /* dst = accumulator low half, src = high half, src written back too */
 
 #define X2(x...) x, x
 #define X3(x...) X2(x), x
 /*
  * fastop functions have a special calling convention:
  *
- * dst:    [rdx]:rax  (in/out)
- * src:    rbx        (in/out)
+ * dst:    rax        (in/out)
+ * src:    rdx        (in/out)
  * src2:   rcx        (in)
  * flags:  rflags     (in/out)
+ * ex:     rsi        (in:fastop pointer, out:zero if exception)
  *
  * Moreover, they are all exactly FASTOP_SIZE bytes long, so functions for
  * different operand sizes can be reached by calculation, rather than a jump
@@ -275,175 +284,18 @@ static void invalidate_registers(struct x86_emulate_ctxt *ctxt)
        ctxt->regs_valid = 0;
 }
 
-/*
- * Instruction emulation:
- * Most instructions are emulated directly via a fragment of inline assembly
- * code. This allows us to save/restore EFLAGS and thus very easily pick up
- * any modified flags.
- */
-
-#if defined(CONFIG_X86_64)
-#define _LO32 "k"              /* force 32-bit operand */
-#define _STK  "%%rsp"          /* stack pointer */
-#elif defined(__i386__)
-#define _LO32 ""               /* force 32-bit operand */
-#define _STK  "%%esp"          /* stack pointer */
-#endif
-
 /*
  * These EFLAGS bits are restored from saved value during emulation, and
  * any changes are written back to the saved value after emulation.
  */
 #define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF)
 
-/* Before executing instruction: restore necessary bits in EFLAGS. */
-#define _PRE_EFLAGS(_sav, _msk, _tmp)                                  \
-       /* EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk); _sav &= ~_msk; */ \
-       "movl %"_sav",%"_LO32 _tmp"; "                                  \
-       "push %"_tmp"; "                                                \
-       "push %"_tmp"; "                                                \
-       "movl %"_msk",%"_LO32 _tmp"; "                                  \
-       "andl %"_LO32 _tmp",("_STK"); "                                 \
-       "pushf; "                                                       \
-       "notl %"_LO32 _tmp"; "                                          \
-       "andl %"_LO32 _tmp",("_STK"); "                                 \
-       "andl %"_LO32 _tmp","__stringify(BITS_PER_LONG/4)"("_STK"); "   \
-       "pop  %"_tmp"; "                                                \
-       "orl  %"_LO32 _tmp",("_STK"); "                                 \
-       "popf; "                                                        \
-       "pop  %"_sav"; "
-
-/* After executing instruction: write-back necessary bits in EFLAGS. */
-#define _POST_EFLAGS(_sav, _msk, _tmp) \
-       /* _sav |= EFLAGS & _msk; */            \
-       "pushf; "                               \
-       "pop  %"_tmp"; "                        \
-       "andl %"_msk",%"_LO32 _tmp"; "          \
-       "orl  %"_LO32 _tmp",%"_sav"; "
-
 #ifdef CONFIG_X86_64
 #define ON64(x) x
 #else
 #define ON64(x)
 #endif
 
-#define ____emulate_2op(ctxt, _op, _x, _y, _suffix, _dsttype)  \
-       do {                                                            \
-               __asm__ __volatile__ (                                  \
-                       _PRE_EFLAGS("0", "4", "2")                      \
-                       _op _suffix " %"_x"3,%1; "                      \
-                       _POST_EFLAGS("0", "4", "2")                     \
-                       : "=m" ((ctxt)->eflags),                        \
-                         "+q" (*(_dsttype*)&(ctxt)->dst.val),          \
-                         "=&r" (_tmp)                                  \
-                       : _y ((ctxt)->src.val), "i" (EFLAGS_MASK));     \
-       } while (0)
-
-
-/* Raw emulation: instruction has two explicit operands. */
-#define __emulate_2op_nobyte(ctxt,_op,_wx,_wy,_lx,_ly,_qx,_qy)         \
-       do {                                                            \
-               unsigned long _tmp;                                     \
-                                                                       \
-               switch ((ctxt)->dst.bytes) {                            \
-               case 2:                                                 \
-                       ____emulate_2op(ctxt,_op,_wx,_wy,"w",u16);      \
-                       break;                                          \
-               case 4:                                                 \
-                       ____emulate_2op(ctxt,_op,_lx,_ly,"l",u32);      \
-                       break;                                          \
-               case 8:                                                 \
-                       ON64(____emulate_2op(ctxt,_op,_qx,_qy,"q",u64)); \
-                       break;                                          \
-               }                                                       \
-       } while (0)
-
-#define __emulate_2op(ctxt,_op,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy)                     \
-       do {                                                                 \
-               unsigned long _tmp;                                          \
-               switch ((ctxt)->dst.bytes) {                                 \
-               case 1:                                                      \
-                       ____emulate_2op(ctxt,_op,_bx,_by,"b",u8);            \
-                       break;                                               \
-               default:                                                     \
-                       __emulate_2op_nobyte(ctxt, _op,                      \
-                                            _wx, _wy, _lx, _ly, _qx, _qy);  \
-                       break;                                               \
-               }                                                            \
-       } while (0)
-
-/* Source operand is byte-sized and may be restricted to just %cl. */
-#define emulate_2op_SrcB(ctxt, _op)                                    \
-       __emulate_2op(ctxt, _op, "b", "c", "b", "c", "b", "c", "b", "c")
-
-/* Source operand is byte, word, long or quad sized. */
-#define emulate_2op_SrcV(ctxt, _op)                                    \
-       __emulate_2op(ctxt, _op, "b", "q", "w", "r", _LO32, "r", "", "r")
-
-/* Source operand is word, long or quad sized. */
-#define emulate_2op_SrcV_nobyte(ctxt, _op)                             \
-       __emulate_2op_nobyte(ctxt, _op, "w", "r", _LO32, "r", "", "r")
-
-/* Instruction has three operands and one operand is stored in ECX register */
-#define __emulate_2op_cl(ctxt, _op, _suffix, _type)            \
-       do {                                                            \
-               unsigned long _tmp;                                     \
-               _type _clv  = (ctxt)->src2.val;                         \
-               _type _srcv = (ctxt)->src.val;                          \
-               _type _dstv = (ctxt)->dst.val;                          \
-                                                                       \
-               __asm__ __volatile__ (                                  \
-                       _PRE_EFLAGS("0", "5", "2")                      \
-                       _op _suffix " %4,%1 \n"                         \
-                       _POST_EFLAGS("0", "5", "2")                     \
-                       : "=m" ((ctxt)->eflags), "+r" (_dstv), "=&r" (_tmp) \
-                       : "c" (_clv) , "r" (_srcv), "i" (EFLAGS_MASK)   \
-                       );                                              \
-                                                                       \
-               (ctxt)->src2.val  = (unsigned long) _clv;               \
-               (ctxt)->src2.val = (unsigned long) _srcv;               \
-               (ctxt)->dst.val = (unsigned long) _dstv;                \
-       } while (0)
-
-#define emulate_2op_cl(ctxt, _op)                                      \
-       do {                                                            \
-               switch ((ctxt)->dst.bytes) {                            \
-               case 2:                                                 \
-                       __emulate_2op_cl(ctxt, _op, "w", u16);          \
-                       break;                                          \
-               case 4:                                                 \
-                       __emulate_2op_cl(ctxt, _op, "l", u32);          \
-                       break;                                          \
-               case 8:                                                 \
-                       ON64(__emulate_2op_cl(ctxt, _op, "q", ulong));  \
-                       break;                                          \
-               }                                                       \
-       } while (0)
-
-#define __emulate_1op(ctxt, _op, _suffix)                              \
-       do {                                                            \
-               unsigned long _tmp;                                     \
-                                                                       \
-               __asm__ __volatile__ (                                  \
-                       _PRE_EFLAGS("0", "3", "2")                      \
-                       _op _suffix " %1; "                             \
-                       _POST_EFLAGS("0", "3", "2")                     \
-                       : "=m" ((ctxt)->eflags), "+m" ((ctxt)->dst.val), \
-                         "=&r" (_tmp)                                  \
-                       : "i" (EFLAGS_MASK));                           \
-       } while (0)
-
-/* Instruction has only one explicit operand (no source operand). */
-#define emulate_1op(ctxt, _op)                                         \
-       do {                                                            \
-               switch ((ctxt)->dst.bytes) {                            \
-               case 1: __emulate_1op(ctxt, _op, "b"); break;           \
-               case 2: __emulate_1op(ctxt, _op, "w"); break;           \
-               case 4: __emulate_1op(ctxt, _op, "l"); break;           \
-               case 8: ON64(__emulate_1op(ctxt, _op, "q")); break;     \
-               }                                                       \
-       } while (0)
-
 static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *));
 
 #define FOP_ALIGN ".align " __stringify(FASTOP_SIZE) " \n\t"
@@ -462,7 +314,10 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *));
 #define FOPNOP() FOP_ALIGN FOP_RET
 
 #define FOP1E(op,  dst) \
-       FOP_ALIGN #op " %" #dst " \n\t" FOP_RET
+       FOP_ALIGN "10: " #op " %" #dst " \n\t" FOP_RET /* local label 10 marks the emulated insn for FOP1EEX */
+
+#define FOP1EEX(op,  dst) /* FOP1E plus an exception-table entry from label 10 to kvm_fastop_exception */ \
+       FOP1E(op, dst) _ASM_EXTABLE(10b, kvm_fastop_exception)
 
 #define FASTOP1(op) \
        FOP_START(op) \
@@ -472,24 +327,42 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *));
        ON64(FOP1E(op##q, rax)) \
        FOP_END
 
+/* 1-operand, operand is src2 in cl/cx/ecx/rcx, one entry per operand size (for MUL/DIV r/m) */
+#define FASTOP1SRC2(op, name) \
+       FOP_START(name) \
+       FOP1E(op, cl) \
+       FOP1E(op, cx) \
+       FOP1E(op, ecx) \
+       ON64(FOP1E(op, rcx)) \
+       FOP_END
+
+/* Same layout as FASTOP1SRC2 (for MUL/DIV r/m), but every entry has an exception-table entry to kvm_fastop_exception */
+#define FASTOP1SRC2EX(op, name) \
+       FOP_START(name) \
+       FOP1EEX(op, cl) \
+       FOP1EEX(op, cx) \
+       FOP1EEX(op, ecx) \
+       ON64(FOP1EEX(op, rcx)) \
+       FOP_END
+
 #define FOP2E(op,  dst, src)      \
        FOP_ALIGN #op " %" #src ", %" #dst " \n\t" FOP_RET
 
 #define FASTOP2(op) \
        FOP_START(op) \
-       FOP2E(op##b, al, bl) \
-       FOP2E(op##w, ax, bx) \
-       FOP2E(op##l, eax, ebx) \
-       ON64(FOP2E(op##q, rax, rbx)) \
+       FOP2E(op##b, al, dl) \
+       FOP2E(op##w, ax, dx) \
+       FOP2E(op##l, eax, edx) \
+       ON64(FOP2E(op##q, rax, rdx)) \
        FOP_END
 
 /* 2 operand, word only */
 #define FASTOP2W(op) \
        FOP_START(op) \
        FOPNOP() \
-       FOP2E(op##w, ax, bx) \
-       FOP2E(op##l, eax, ebx) \
-       ON64(FOP2E(op##q, rax, rbx)) \
+       FOP2E(op##w, ax, dx) \
+       FOP2E(op##l, eax, edx) \
+       ON64(FOP2E(op##q, rax, rdx)) \
        FOP_END
 
 /* 2 operand, src is CL */
@@ -508,14 +381,17 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *));
 #define FASTOP3WCL(op) \
        FOP_START(op) \
        FOPNOP() \
-       FOP3E(op##w, ax, bx, cl) \
-       FOP3E(op##l, eax, ebx, cl) \
-       ON64(FOP3E(op##q, rax, rbx, cl)) \
+       FOP3E(op##w, ax, dx, cl) \
+       FOP3E(op##l, eax, edx, cl) \
+       ON64(FOP3E(op##q, rax, rdx, cl)) \
        FOP_END
 
 /* Special case for SETcc - 1 instruction per cc */
 #define FOP_SETCC(op) ".align 4; " #op " %al; ret \n\t"
 
+asm(".global kvm_fastop_exception \n" /* target named by _ASM_EXTABLE in FOP1EEX */
+    "kvm_fastop_exception: xor %esi, %esi; ret"); /* zero esi so fastop() reads fop back as NULL, then return to its call site */
+
 FOP_START(setcc)
 FOP_SETCC(seto)
 FOP_SETCC(setno)
@@ -538,47 +414,6 @@ FOP_END;
 FOP_START(salc) "pushf; sbb %al, %al; popf \n\t" FOP_RET
 FOP_END;
 
-#define __emulate_1op_rax_rdx(ctxt, _op, _suffix, _ex)                 \
-       do {                                                            \
-               unsigned long _tmp;                                     \
-               ulong *rax = reg_rmw((ctxt), VCPU_REGS_RAX);            \
-               ulong *rdx = reg_rmw((ctxt), VCPU_REGS_RDX);            \
-                                                                       \
-               __asm__ __volatile__ (                                  \
-                       _PRE_EFLAGS("0", "5", "1")                      \
-                       "1: \n\t"                                       \
-                       _op _suffix " %6; "                             \
-                       "2: \n\t"                                       \
-                       _POST_EFLAGS("0", "5", "1")                     \
-                       ".pushsection .fixup,\"ax\" \n\t"               \
-                       "3: movb $1, %4 \n\t"                           \
-                       "jmp 2b \n\t"                                   \
-                       ".popsection \n\t"                              \
-                       _ASM_EXTABLE(1b, 3b)                            \
-                       : "=m" ((ctxt)->eflags), "=&r" (_tmp),          \
-                         "+a" (*rax), "+d" (*rdx), "+qm"(_ex)          \
-                       : "i" (EFLAGS_MASK), "m" ((ctxt)->src.val));    \
-       } while (0)
-
-/* instruction has only one source operand, destination is implicit (e.g. mul, div, imul, idiv) */
-#define emulate_1op_rax_rdx(ctxt, _op, _ex)    \
-       do {                                                            \
-               switch((ctxt)->src.bytes) {                             \
-               case 1:                                                 \
-                       __emulate_1op_rax_rdx(ctxt, _op, "b", _ex);     \
-                       break;                                          \
-               case 2:                                                 \
-                       __emulate_1op_rax_rdx(ctxt, _op, "w", _ex);     \
-                       break;                                          \
-               case 4:                                                 \
-                       __emulate_1op_rax_rdx(ctxt, _op, "l", _ex);     \
-                       break;                                          \
-               case 8: ON64(                                           \
-                       __emulate_1op_rax_rdx(ctxt, _op, "q", _ex));    \
-                       break;                                          \
-               }                                                       \
-       } while (0)
-
 static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt,
                                    enum x86_intercept intercept,
                                    enum x86_intercept_stage stage)
@@ -988,6 +823,11 @@ FASTOP2(xor);
 FASTOP2(cmp);
 FASTOP2(test);
 
+FASTOP1SRC2(mul, mul_ex);
+FASTOP1SRC2(imul, imul_ex);
+FASTOP1SRC2EX(div, div_ex); /* EX variants (div, idiv): a fault is reported by fastop() via emulate_de() */
+FASTOP1SRC2EX(idiv, idiv_ex);
+
 FASTOP3WCL(shld);
 FASTOP3WCL(shrd);
 
@@ -1013,6 +853,8 @@ FASTOP2W(bts);
 FASTOP2W(btr);
 FASTOP2W(btc);
 
+FASTOP2(xadd);
+
 static u8 test_cc(unsigned int condition, unsigned long flags)
 {
        u8 rc;
@@ -1726,45 +1568,42 @@ static void write_register_operand(struct operand *op)
        }
 }
 
-static int writeback(struct x86_emulate_ctxt *ctxt)
+static int writeback(struct x86_emulate_ctxt *ctxt, struct operand *op)
 {
        int rc;
 
-       if (ctxt->d & NoWrite)
-               return X86EMUL_CONTINUE;
-
-       switch (ctxt->dst.type) {
+       switch (op->type) {
        case OP_REG:
-               write_register_operand(&ctxt->dst);
+               write_register_operand(op);
                break;
        case OP_MEM:
                if (ctxt->lock_prefix)
                        rc = segmented_cmpxchg(ctxt,
-                                              ctxt->dst.addr.mem,
-                                              &ctxt->dst.orig_val,
-                                              &ctxt->dst.val,
-                                              ctxt->dst.bytes);
+                                              op->addr.mem,
+                                              &op->orig_val,
+                                              &op->val,
+                                              op->bytes);
                else
                        rc = segmented_write(ctxt,
-                                            ctxt->dst.addr.mem,
-                                            &ctxt->dst.val,
-                                            ctxt->dst.bytes);
+                                            op->addr.mem,
+                                            &op->val,
+                                            op->bytes);
                if (rc != X86EMUL_CONTINUE)
                        return rc;
                break;
        case OP_MEM_STR:
                rc = segmented_write(ctxt,
-                               ctxt->dst.addr.mem,
-                               ctxt->dst.data,
-                               ctxt->dst.bytes * ctxt->dst.count);
+                               op->addr.mem,
+                               op->data,
+                               op->bytes * op->count);
                if (rc != X86EMUL_CONTINUE)
                        return rc;
                break;
        case OP_XMM:
-               write_sse_reg(ctxt, &ctxt->dst.vec_val, ctxt->dst.addr.xmm);
+               write_sse_reg(ctxt, &op->vec_val, op->addr.xmm);
                break;
        case OP_MM:
-               write_mmx_reg(ctxt, &ctxt->dst.mm_val, ctxt->dst.addr.mm);
+               write_mmx_reg(ctxt, &op->mm_val, op->addr.mm);
                break;
        case OP_NONE:
                /* no writeback */
@@ -2117,42 +1956,6 @@ static int em_jmp_far(struct x86_emulate_ctxt *ctxt)
        return X86EMUL_CONTINUE;
 }
 
-static int em_mul_ex(struct x86_emulate_ctxt *ctxt)
-{
-       u8 ex = 0;
-
-       emulate_1op_rax_rdx(ctxt, "mul", ex);
-       return X86EMUL_CONTINUE;
-}
-
-static int em_imul_ex(struct x86_emulate_ctxt *ctxt)
-{
-       u8 ex = 0;
-
-       emulate_1op_rax_rdx(ctxt, "imul", ex);
-       return X86EMUL_CONTINUE;
-}
-
-static int em_div_ex(struct x86_emulate_ctxt *ctxt)
-{
-       u8 de = 0;
-
-       emulate_1op_rax_rdx(ctxt, "div", de);
-       if (de)
-               return emulate_de(ctxt);
-       return X86EMUL_CONTINUE;
-}
-
-static int em_idiv_ex(struct x86_emulate_ctxt *ctxt)
-{
-       u8 de = 0;
-
-       emulate_1op_rax_rdx(ctxt, "idiv", de);
-       if (de)
-               return emulate_de(ctxt);
-       return X86EMUL_CONTINUE;
-}
-
 static int em_grp45(struct x86_emulate_ctxt *ctxt)
 {
        int rc = X86EMUL_CONTINUE;
@@ -3734,10 +3537,10 @@ static const struct opcode group3[] = {
        F(DstMem | SrcImm | NoWrite, em_test),
        F(DstMem | SrcNone | Lock, em_not),
        F(DstMem | SrcNone | Lock, em_neg),
-       I(SrcMem, em_mul_ex),
-       I(SrcMem, em_imul_ex),
-       I(SrcMem, em_div_ex),
-       I(SrcMem, em_idiv_ex),
+       F(DstXacc | Src2Mem, em_mul_ex),
+       F(DstXacc | Src2Mem, em_imul_ex),
+       F(DstXacc | Src2Mem, em_div_ex),
+       F(DstXacc | Src2Mem, em_idiv_ex),
 };
 
 static const struct opcode group4[] = {
@@ -4064,7 +3867,7 @@ static const struct opcode twobyte_table[256] = {
        F(DstReg | SrcMem | ModRM, em_bsf), F(DstReg | SrcMem | ModRM, em_bsr),
        D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
        /* 0xC0 - 0xC7 */
-       D2bv(DstMem | SrcReg | ModRM | Lock),
+       F2bv(DstMem | SrcReg | ModRM | SrcWrite | Lock, em_xadd),
        N, D(DstMem | SrcReg | ModRM | Mov),
        N, N, N, GD(0, &group9),
        /* 0xC8 - 0xCF */
@@ -4172,6 +3975,24 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
                fetch_register_operand(op);
                op->orig_val = op->val;
                break;
+       case OpAccLo:
+               op->type = OP_REG;
+               op->bytes = (ctxt->d & ByteOp) ? 2 : ctxt->op_bytes;
+               op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX);
+               fetch_register_operand(op);
+               op->orig_val = op->val;
+               break;
+       case OpAccHi:
+               if (ctxt->d & ByteOp) {
+                       op->type = OP_NONE;
+                       break;
+               }
+               op->type = OP_REG;
+               op->bytes = ctxt->op_bytes;
+               op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RDX);
+               fetch_register_operand(op);
+               op->orig_val = op->val;
+               break;
        case OpDI:
                op->type = OP_MEM;
                op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
@@ -4553,11 +4374,15 @@ static void fetch_possible_mmx_operand(struct x86_emulate_ctxt *ctxt,
 static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *))
 {
        ulong flags = (ctxt->eflags & EFLAGS_MASK) | X86_EFLAGS_IF;
-       fop += __ffs(ctxt->dst.bytes) * FASTOP_SIZE;
+       if (!(ctxt->d & ByteOp)) /* ByteOp always uses entry 0: OpAccLo reports dst.bytes == 2 for byte ops */
+               fop += __ffs(ctxt->dst.bytes) * FASTOP_SIZE;
        asm("push %[flags]; popf; call *%[fastop]; pushf; pop %[flags]\n"
-           : "+a"(ctxt->dst.val), "+b"(ctxt->src.val), [flags]"+D"(flags)
-       : "c"(ctxt->src2.val), [fastop]"S"(fop));
+           : "+a"(ctxt->dst.val), "+d"(ctxt->src.val), [flags]"+D"(flags), /* rax = dst, rdx = src, both in/out */
+             [fastop]"+S"(fop) /* rsi: entry to call on input, read back after the call */
+           : "c"(ctxt->src2.val)); /* rcx = src2, input only */
        ctxt->eflags = (ctxt->eflags & ~EFLAGS_MASK) | (flags & EFLAGS_MASK);
+       if (!fop) /* exception is returned in fop variable: kvm_fastop_exception zeroes it */
+               return emulate_de(ctxt);
        return X86EMUL_CONTINUE;
 }
 
@@ -4773,9 +4598,17 @@ special_insn:
                goto done;
 
 writeback:
-       rc = writeback(ctxt);
-       if (rc != X86EMUL_CONTINUE)
-               goto done;
+       if (!(ctxt->d & NoWrite)) {
+               rc = writeback(ctxt, &ctxt->dst);
+               if (rc != X86EMUL_CONTINUE)
+                       goto done;
+       }
+       if (ctxt->d & SrcWrite) {
+               BUG_ON(ctxt->src.type == OP_MEM || ctxt->src.type == OP_MEM_STR);
+               rc = writeback(ctxt, &ctxt->src);
+               if (rc != X86EMUL_CONTINUE)
+                       goto done;
+       }
 
        /*
         * restore dst type in case the decoding will be reused
@@ -4872,12 +4705,6 @@ twobyte_insn:
                ctxt->dst.val = (ctxt->src.bytes == 1) ? (s8) ctxt->src.val :
                                                        (s16) ctxt->src.val;
                break;
-       case 0xc0 ... 0xc1:     /* xadd */
-               fastop(ctxt, em_add);
-               /* Write back the register source. */
-               ctxt->src.val = ctxt->dst.orig_val;
-               write_register_operand(&ctxt->src);
-               break;
        case 0xc3:              /* movnti */
                ctxt->dst.bytes = ctxt->op_bytes;
                ctxt->dst.val = (ctxt->op_bytes == 4) ? (u32) ctxt->src.val :