#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <string.h>
#include <math.h>
#include <sys/types.h>
#include <unistd.h>

#include "x64_signals.h"
#include "os.h"
#include "debug.h"
#include "box64context.h"
#include "box64cpu.h"
#include "emu/x64emu_private.h"
#include "tools/bridge_private.h"
#include "x64emu.h"
#include "box64stack.h"
#include "callback.h"
#include "emu/x64run_private.h"
#include "emu/x87emu_private.h"
#include "x64trace.h"
#include "mysignal.h"
#include "emit_signals.h"
#include "dynarec_native.h"
#include "custommem.h"
#include "bridge.h"
#include "dynarec_native_functions.h"

// Store ST0 as a 10-byte (80-bit) extended value at p.
// When ST0.q still equals the uref kept next to the cached 80-bit image
// (native_fld records it on load), the cached bytes are copied verbatim;
// otherwise the current double is converted with D2LD.
void native_fstp(x64emu_t* emu, void* p)
{
    if(ST0.q==STld(0).uref) {
        memcpy(p, &STld(0).ld, 10);
        return;
    }
    D2LD(&ST0.d, p);
}

// Debug helper: log the value of native register number n (hex and decimal).
// emu is unused.
// The values are widened to unsigned long long so the format specifiers match
// on every ABI (uintptr_t is not unsigned long on LLP64 or 32-bit targets).
void native_print_armreg(x64emu_t* emu, uintptr_t reg, uintptr_t n)
{
    (void)emu;
    dynarec_log(LOG_INFO, "Debug Register R%llu=0x%llx (%llu)\n",
        (unsigned long long)n, (unsigned long long)reg, (unsigned long long)reg);
}

// F2XM1: ST0 = 2^ST0 - 1, evaluated as expm1(ln2 * ST0).
void native_f2xm1(x64emu_t* emu)
{
    const double scaled = LN2 * ST0.d;
    ST0.d = expm1(scaled);
}
// FYL2X: ST(1) = log2(ST0) * ST(1). The stack itself is not popped here.
void native_fyl2x(x64emu_t* emu)
{
    const double lg = log2(ST0.d);
    ST(1).d = lg*ST(1).d;
}
// Replace ST0 with tan(ST0) and clear the C2 status flag.
void native_ftan(x64emu_t* emu)
{
#pragma STDC FENV_ACCESS ON
    // note from the original author: the libc tan() does not seem to follow
    // the rounding direction mode
    const double angle = ST0.d;
    ST0.d = tan(angle);
    emu->sw.f.F87_C2 = 0;
}
// FPATAN: ST1 = atan2(ST1, ST0). The stack itself is not popped here.
void native_fpatan(x64emu_t* emu)
{
#pragma STDC FENV_ACCESS ON
    const double y = ST1.d;
    const double x = ST0.d;
    ST1.d = atan2(y, x);
}
// FXTRACT helper: split the value found in ST1 into significand (written to
// ST0) and exponent (written back to ST1).
//  - NaN:      ST0 receives the NaN, ST1 is left untouched
//  - infinity: ST0 receives the infinity, ST1 becomes +inf
//  - zero:     ST0 receives the zero, ST1 becomes -inf
void native_fxtract(x64emu_t* emu)
{
    const double src = ST1.d;
    if(isnan(src)) {
        ST0.d = src;
        return;
    }
    if(isinf(src)) {
        ST0.d = src;
        ST1.d = INFINITY;
        return;
    }
    if(src==0.0) {
        ST0.d = src;
        ST1.d = -INFINITY;
        return;
    }
    // frexp() yields a magnitude in [0.5, 1) while the x87 significand is in
    // [1, 2): double the fraction and lower the exponent by one
    int exp2;
    ST0.d = frexp(src, &exp2)*2;
    ST1.d = exp2-1;
}
// FPREM helper: ST0 = ST0 - ST1 * trunc(ST0/ST1).
// C2 is always cleared; C1, C3 and C0 receive bits 0, 1 and 2 of the
// truncated quotient.
void native_fprem(x64emu_t* emu)
{
    const int64_t quot = (int64_t)trunc(ST0.d / ST1.d);
    ST0.d -= ST1.d * quot;
    emu->sw.f.F87_C2 = 0;
    emu->sw.f.F87_C1 = ((quot & 1) != 0);
    emu->sw.f.F87_C3 = ((quot & 2) != 0);
    emu->sw.f.F87_C0 = ((quot & 4) != 0);
}
// FYL2XP1: ST(1) = ST(1) * log2(ST0 + 1), evaluated as log1p(ST0)*ST(1)/ln2.
void native_fyl2xp1(x64emu_t* emu)
{
    const double scaled = log1p(ST0.d)*ST(1).d;
    ST(1).d = scaled/LN2;
}
// FSINCOS helper: reads the angle from ST1, then stores its sine in ST1 and
// its cosine in ST0. C2 is cleared.
void native_fsincos(x64emu_t* emu)
{
#pragma STDC FENV_ACCESS ON
    // note from the original author: the libc sincos() does not seem to follow
    // the rounding direction mode
    const double angle = ST1.d;
    sincos(angle, &ST1.d, &ST0.d);
    emu->sw.f.F87_C2 = 0;
}
// FRNDINT helper: replace ST0 with fpu_round(emu, ST0).
void native_frndint(x64emu_t* emu)
{
    ST0.d = fpu_round(emu, ST0.d);
}
// FSCALE helper: ST0 = ST0 * 2^trunc(ST1). A zero ST0 is left untouched.
void native_fscale(x64emu_t* emu)
{
#pragma STDC FENV_ACCESS ON
    if(ST0.d==0.0)
        return;
    ST0.d = ldexp(ST0.d, trunc(ST1.d));
}
// Replace ST0 with sin(ST0) and clear the C2 status flag.
void native_fsin(x64emu_t* emu)
{
#pragma STDC FENV_ACCESS ON
    // note from the original author: the libc sin() does not seem to follow
    // the rounding direction mode
    const double angle = ST0.d;
    ST0.d = sin(angle);
    emu->sw.f.F87_C2 = 0;
}
// Replace ST0 with cos(ST0) and clear the C2 status flag.
void native_fcos(x64emu_t* emu)
{
#pragma STDC FENV_ACCESS ON
    // note from the original author: the libc cos() does not seem to follow
    // the rounding direction mode
    const double angle = ST0.d;
    ST0.d = cos(angle);
    emu->sw.f.F87_C2 = 0;
}

// Value-passing variant of F2XM1: returns 2^a - 1 as expm1(ln2 * a).
// emu is not used.
double direct_f2xm1(x64emu_t* emu, double a)
{
    const double scaled = LN2 * a;
    return expm1(scaled);
}
// Value-passing variant of FYL2X: returns log2(a) * b. emu is not used.
double direct_fyl2x(x64emu_t* emu, double a, double b)
{
    const double lg = log2(a);
    return lg*b;
}
// Value-passing variant of FYL2XP1: returns b * log2(a + 1), evaluated as
// log1p(a)*b/ln2. emu is not used.
double direct_fyl2xp1(x64emu_t* emu, double a, double b)
{
    const double scaled = log1p(a)*b;
    return scaled/LN2;
}
// Value-passing variant of FPATAN: returns atan2(b, a), i.e. a is the
// x operand and b the y operand. emu is not used.
double direct_fpatan(x64emu_t* emu, double a, double b)
{
#pragma STDC FENV_ACCESS ON
    const double y = b;
    const double x = a;
    return atan2(y, x);
}
// Value-passing variant of FSIN: clears C2 and returns sin(a).
double direct_fsin(x64emu_t* emu, double a)
{
#pragma STDC FENV_ACCESS ON
    // note from the original author: the libc sin() does not seem to follow
    // the rounding direction mode
    const double res = sin(a);
    emu->sw.f.F87_C2 = 0;
    return res;
}
// Value-passing variant of FCOS: clears C2 and returns cos(a).
double direct_fcos(x64emu_t* emu, double a)
{
#pragma STDC FENV_ACCESS ON
    // note from the original author: the libc cos() does not seem to follow
    // the rounding direction mode
    const double res = cos(a);
    emu->sw.f.F87_C2 = 0;
    return res;
}
// Value-passing variant of FPTAN's tangent: clears C2 and returns tan(a).
double direct_ftan(x64emu_t* emu, double a)
{
#pragma STDC FENV_ACCESS ON
    // note from the original author: the libc tan() does not seem to follow
    // the rounding direction mode
    const double res = tan(a);
    emu->sw.f.F87_C2 = 0;
    return res;
}
// Value-passing variant of FSCALE: returns a * 2^trunc(b).
// A zero a is returned unchanged (sign preserved). emu is not used.
double direct_fscale(x64emu_t* emu, double a, double b)
{
#pragma STDC FENV_ACCESS ON
    if(a==0.0)
        return a;
    return ldexp(a, trunc(b));
}

// FBLD helper: forwards to fpu_fbld() with the operand address ed.
void native_fbld(x64emu_t* emu, uint8_t* ed)
{
    fpu_fbld(emu, ed);
}

// FILD m64 helper: load the 64-bit integer at ed into ST0 as a double.
// The exact integer is also kept in STll(0).sq, together with the bit pattern
// of the converted double (sref), which native_fistp64 compares against ST0
// to decide whether the exact integer can be stored back.
void native_fild64(x64emu_t* emu, int64_t* ed)
{
    // memcpy because ed may not be suitably aligned for a direct load
    int64_t value;
    memcpy(&value, ed, sizeof(value));
    ST0.d = value;
    STll(0).sq = value;
    STll(0).sref = ST0.sq;
}

// FBSTP helper: forwards to fpu_fbst() with the destination address ed.
void native_fbstp(x64emu_t* emu, uint8_t* ed)
{
    fpu_fbst(emu, ed);
}

// FISTP m64 helper: store ST0 as a 64-bit integer at ed.
//  - If ST0 still holds the bit pattern recorded in STll(0).sref (set by
//    native_fild64), the exact integer kept in STll(0).sq is stored.
//  - Otherwise ST0 is rounded with fpu_round(). NaN, infinities and values
//    outside [-2^63, 2^63) store the integer indefinite value INT64_MIN.
// The upper bound is tested with >= 2^63: (double)INT64_MAX rounds up to 2^63,
// so a strict "greater than" test would let exactly 2^63 through to an
// out-of-range (undefined) double -> int64_t conversion.
void native_fistp64(x64emu_t* emu, int64_t* ed)
{
    // memcpy is used for the stores because ed may be unaligned
    if(STll(0).sref==ST(0).sq) {
        memcpy(ed, &STll(0).sq, sizeof(int64_t));
    } else {
        int64_t tmp;
        if(!isfinite(ST0.d) || isgreaterequal(ST0.d, 0x1p63) || isless(ST0.d, -0x1p63))
            tmp = INT64_MIN;
        else
            tmp = fpu_round(emu, ST0.d);
        memcpy(ed, &tmp, sizeof(tmp));
    }
}

// FISTTP m64 helper: store ST0, truncated toward zero, as a 64-bit integer.
// NaN, infinities and values outside [-2^63, 2^63) store the integer
// indefinite value INT64_MIN, like native_fistp64 does; converting such a
// double directly is undefined in C and host-dependent in practice.
void native_fistt64(x64emu_t* emu, int64_t* ed)
{
    int64_t tmp;
    if(!isfinite(ST0.d) || isgreaterequal(ST0.d, 0x1p63) || isless(ST0.d, -0x1p63))
        tmp = INT64_MIN;
    else
        tmp = ST0.d;    // in range: the C conversion truncates toward zero
    // memcpy because ed may be unaligned
    memcpy(ed, &tmp, sizeof(tmp));
}

// FLD m80 helper: load the 10-byte extended value at ed into ST0.
// The raw 80-bit image is kept in STld(0).ld, converted to double into ST(0),
// and the resulting bit pattern is recorded in STld(0).uref so native_fstp can
// tell whether ST0 has changed since this load.
void native_fld(x64emu_t* emu, uint8_t* ed)
{
    memcpy(&STld(0).ld, ed, 10);
    LD2D(&STld(0), &ST(0).d);
    STld(0).uref = ST0.q;
}

// Raise X64_SIGILL at the current R_RIP through EmitSignal().
// emu->test.test is cleared first when the dynarec_test option is enabled.
// (presumably used for undefined opcodes, going by the name — confirm with callers)
void native_ud(x64emu_t* emu)
{
    if(BOX64ENV(dynarec_test))
        emu->test.test = 0;
    EmitSignal(emu, X64_SIGILL, (void*)R_RIP, 0);
}

// Raise X64_SIGSEGV at the current R_RIP through EmitSignal(), passing 0xb09d
// as the last argument (its meaning is defined by EmitSignal).
// emu->test.test is cleared first when the dynarec_test option is enabled.
void native_br(x64emu_t* emu)
{
    if(BOX64ENV(dynarec_test))
        emu->test.test = 0;
    EmitSignal(emu, X64_SIGSEGV, (void*)R_RIP, 0xb09d);
}

// Clear emu->test.test, then raise X64_SIGSEGV at the current R_RIP through
// EmitSignal(), passing 0xbad0 as the last argument.
// (presumably used for privileged opcodes, going by the name — confirm with callers)
void native_priv(x64emu_t* emu)
{
    emu->test.test = 0;
    EmitSignal(emu, X64_SIGSEGV, (void*)R_RIP, 0xbad0);
}

// Clear emu->test.test, then raise X64_SIGSEGV at the current R_RIP through
// EmitSignal(). It emits exactly the same signal and code as native_priv.
void native_gpf(x64emu_t* emu)
{
    emu->test.test = 0;
    EmitSignal(emu, X64_SIGSEGV, (void*)R_RIP, 0xbad0); // same effect as a privileged opcode?
}

// Clear emu->test.test, then report interruption number num at the current
// R_RIP through EmitInterruption().
void native_int(x64emu_t* emu, int num)
{
    emu->test.test = 0;
    EmitInterruption(emu, num, (void*)R_RIP);
}
#ifndef _WIN32
// Clear emu->test.test, then report interruption number num at the current
// R_RIP through EmitWineInt(). Only built when _WIN32 is not defined.
void native_wineint(x64emu_t* emu, int num)
{
    emu->test.test = 0;
    EmitWineInt(emu, num, (void*)R_RIP);
}
#endif
// Raise X64_SIGTRAP through EmitSignal() with a NULL address and code 3.
// Unlike the sibling helpers, emu->test.test is not touched here.
void native_int3(x64emu_t* emu)
{
    EmitSignal(emu, X64_SIGTRAP, NULL, 3);
}

// Clear emu->test.test, then report a division error at the current R_RIP
// through EmitDiv0() (last argument 1; its meaning is defined by EmitDiv0).
void native_div0(x64emu_t* emu)
{
    emu->test.test = 0;
    EmitDiv0(emu, (void*)R_RIP, 1);
}

// FSAVE/FNSAVE helper (28-byte environment layout): write the FPU environment
// with fpu_savenv(), then the eight registers as 10-byte extended values
// starting at ed+28, and finally reset the FPU.
// The registers must be converted double -> extended *into* memory, so D2LD is
// used (same direction as native_fstp). The previous LD2D call went the other
// way and overwrote the registers from the destination buffer.
// NOTE(review): registers are visited as x87[7] down to x87[0]; confirm this is
// the order the memory image expects.
void native_fsave(x64emu_t* emu, uint8_t* ed)
{
    fpu_savenv(emu, (char*)ed, 0);

    uint8_t* p = ed + 28;
    for (int i=0; i<8; ++i) {
        D2LD(&emu->x87[7-i].d, p);
        p+=10;
    }
    reset_fpu(emu);
}
// FSAVE/FNSAVE helper (14-byte environment layout): write the FPU environment
// with fpu_savenv(), then the eight registers as 10-byte extended values
// starting at ed+14, and finally reset the FPU.
// The registers must be converted double -> extended *into* memory, so D2LD is
// used (same direction as native_fstp). The previous LD2D call went the other
// way and overwrote the registers from the destination buffer.
// NOTE(review): registers are visited as x87[7] down to x87[0]; confirm this is
// the order the memory image expects.
void native_fsave16(x64emu_t* emu, uint8_t* ed)
{
    fpu_savenv(emu, (char*)ed, 1);

    uint8_t* p = ed + 14;
    for (int i=0; i<8; ++i) {
        D2LD(&emu->x87[7-i].d, p);
        p+=10;
    }
    reset_fpu(emu);
}
// FRSTOR helper (28-byte environment layout): reload the FPU environment with
// fpu_loadenv(), then the eight registers from the 10-byte extended values
// starting at ed+28.
// The registers must be converted extended -> double *from* memory, so LD2D is
// used (same direction as native_fld). The previous D2LD call went the other
// way and overwrote the source buffer with the current registers.
// NOTE(review): registers are visited as x87[7] down to x87[0]; confirm this is
// the order the memory image uses.
void native_frstor(x64emu_t* emu, uint8_t* ed)
{
    fpu_loadenv(emu, (char*)ed, 0);

    uint8_t* p = ed + 28;
    for (int i=0; i<8; ++i) {
        LD2D(p, &emu->x87[7-i].d);
        p+=10;
    }
}
// FRSTOR helper (14-byte environment layout): reload the FPU environment with
// fpu_loadenv(), then the eight registers from the 10-byte extended values
// starting at ed+14.
// The registers must be converted extended -> double *from* memory, so LD2D is
// used (same direction as native_fld). The previous D2LD call went the other
// way and overwrote the source buffer with the current registers.
// NOTE(review): registers are visited as x87[7] down to x87[0]; confirm this is
// the order the memory image uses.
void native_frstor16(x64emu_t* emu, uint8_t* ed)
{
    fpu_loadenv(emu, (char*)ed, 1);

    uint8_t* p = ed + 14;
    for (int i=0; i<8; ++i) {
        LD2D(p, &emu->x87[7-i].d);
        p+=10;
    }
}

// FPREM1 helper: ST0 = ST0 - ST1 * round(ST0/ST1).
// C2 is always cleared; C1, C3 and C0 receive bits 0, 1 and 2 of the rounded
// quotient.
// NOTE(review): round() resolves exact .5 ties away from zero, while an IEEE
// remainder resolves them to the even quotient — confirm this is acceptable.
void native_fprem1(x64emu_t* emu)
{
    int64_t ll = (int64_t)round(ST0.d / ST1.d);
    ST0.d = ST0.d - (ST1.d * ll);
    emu->sw.f.F87_C2 = 0;
    emu->sw.f.F87_C1 = (ll & 1) ? 1 : 0;
    emu->sw.f.F87_C3 = (ll & 2) ? 1 : 0;
    emu->sw.f.F87_C0 = (ll & 4) ? 1 : 0;
}

// Precomputed GF(2^8) products used by native_aesimc/native_aesimc_y:
// ff_mult2[k][x] is a*x for the coefficient a noted above each 256-entry row
// (row 0: 0x0e, row 1: 0x09, row 2: 0x0d, row 3: 0x0b), i.e. what
// ff_mult(a, x) computes.
const uint8_t ff_mult2[4][256] = {
    // a = 0x0e
    0x00, 0x0e, 0x1c, 0x12, 0x38, 0x36, 0x24, 0x2a, 0x70, 0x7e, 0x6c, 0x62, 0x48, 0x46, 0x54, 0x5a,
    0xe0, 0xee, 0xfc, 0xf2, 0xd8, 0xd6, 0xc4, 0xca, 0x90, 0x9e, 0x8c, 0x82, 0xa8, 0xa6, 0xb4, 0xba,
    0xdb, 0xd5, 0xc7, 0xc9, 0xe3, 0xed, 0xff, 0xf1, 0xab, 0xa5, 0xb7, 0xb9, 0x93, 0x9d, 0x8f, 0x81,
    0x3b, 0x35, 0x27, 0x29, 0x03, 0x0d, 0x1f, 0x11, 0x4b, 0x45, 0x57, 0x59, 0x73, 0x7d, 0x6f, 0x61,
    0xad, 0xa3, 0xb1, 0xbf, 0x95, 0x9b, 0x89, 0x87, 0xdd, 0xd3, 0xc1, 0xcf, 0xe5, 0xeb, 0xf9, 0xf7,
    0x4d, 0x43, 0x51, 0x5f, 0x75, 0x7b, 0x69, 0x67, 0x3d, 0x33, 0x21, 0x2f, 0x05, 0x0b, 0x19, 0x17,
    0x76, 0x78, 0x6a, 0x64, 0x4e, 0x40, 0x52, 0x5c, 0x06, 0x08, 0x1a, 0x14, 0x3e, 0x30, 0x22, 0x2c,
    0x96, 0x98, 0x8a, 0x84, 0xae, 0xa0, 0xb2, 0xbc, 0xe6, 0xe8, 0xfa, 0xf4, 0xde, 0xd0, 0xc2, 0xcc,
    0x41, 0x4f, 0x5d, 0x53, 0x79, 0x77, 0x65, 0x6b, 0x31, 0x3f, 0x2d, 0x23, 0x09, 0x07, 0x15, 0x1b,
    0xa1, 0xaf, 0xbd, 0xb3, 0x99, 0x97, 0x85, 0x8b, 0xd1, 0xdf, 0xcd, 0xc3, 0xe9, 0xe7, 0xf5, 0xfb,
    0x9a, 0x94, 0x86, 0x88, 0xa2, 0xac, 0xbe, 0xb0, 0xea, 0xe4, 0xf6, 0xf8, 0xd2, 0xdc, 0xce, 0xc0,
    0x7a, 0x74, 0x66, 0x68, 0x42, 0x4c, 0x5e, 0x50, 0x0a, 0x04, 0x16, 0x18, 0x32, 0x3c, 0x2e, 0x20,
    0xec, 0xe2, 0xf0, 0xfe, 0xd4, 0xda, 0xc8, 0xc6, 0x9c, 0x92, 0x80, 0x8e, 0xa4, 0xaa, 0xb8, 0xb6,
    0x0c, 0x02, 0x10, 0x1e, 0x34, 0x3a, 0x28, 0x26, 0x7c, 0x72, 0x60, 0x6e, 0x44, 0x4a, 0x58, 0x56,
    0x37, 0x39, 0x2b, 0x25, 0x0f, 0x01, 0x13, 0x1d, 0x47, 0x49, 0x5b, 0x55, 0x7f, 0x71, 0x63, 0x6d,
    0xd7, 0xd9, 0xcb, 0xc5, 0xef, 0xe1, 0xf3, 0xfd, 0xa7, 0xa9, 0xbb, 0xb5, 0x9f, 0x91, 0x83, 0x8d,
    // a = 0x09
    0x00, 0x09, 0x12, 0x1b, 0x24, 0x2d, 0x36, 0x3f, 0x48, 0x41, 0x5a, 0x53, 0x6c, 0x65, 0x7e, 0x77,
    0x90, 0x99, 0x82, 0x8b, 0xb4, 0xbd, 0xa6, 0xaf, 0xd8, 0xd1, 0xca, 0xc3, 0xfc, 0xf5, 0xee, 0xe7,
    0x3b, 0x32, 0x29, 0x20, 0x1f, 0x16, 0x0d, 0x04, 0x73, 0x7a, 0x61, 0x68, 0x57, 0x5e, 0x45, 0x4c,
    0xab, 0xa2, 0xb9, 0xb0, 0x8f, 0x86, 0x9d, 0x94, 0xe3, 0xea, 0xf1, 0xf8, 0xc7, 0xce, 0xd5, 0xdc,
    0x76, 0x7f, 0x64, 0x6d, 0x52, 0x5b, 0x40, 0x49, 0x3e, 0x37, 0x2c, 0x25, 0x1a, 0x13, 0x08, 0x01,
    0xe6, 0xef, 0xf4, 0xfd, 0xc2, 0xcb, 0xd0, 0xd9, 0xae, 0xa7, 0xbc, 0xb5, 0x8a, 0x83, 0x98, 0x91,
    0x4d, 0x44, 0x5f, 0x56, 0x69, 0x60, 0x7b, 0x72, 0x05, 0x0c, 0x17, 0x1e, 0x21, 0x28, 0x33, 0x3a,
    0xdd, 0xd4, 0xcf, 0xc6, 0xf9, 0xf0, 0xeb, 0xe2, 0x95, 0x9c, 0x87, 0x8e, 0xb1, 0xb8, 0xa3, 0xaa,
    0xec, 0xe5, 0xfe, 0xf7, 0xc8, 0xc1, 0xda, 0xd3, 0xa4, 0xad, 0xb6, 0xbf, 0x80, 0x89, 0x92, 0x9b,
    0x7c, 0x75, 0x6e, 0x67, 0x58, 0x51, 0x4a, 0x43, 0x34, 0x3d, 0x26, 0x2f, 0x10, 0x19, 0x02, 0x0b,
    0xd7, 0xde, 0xc5, 0xcc, 0xf3, 0xfa, 0xe1, 0xe8, 0x9f, 0x96, 0x8d, 0x84, 0xbb, 0xb2, 0xa9, 0xa0,
    0x47, 0x4e, 0x55, 0x5c, 0x63, 0x6a, 0x71, 0x78, 0x0f, 0x06, 0x1d, 0x14, 0x2b, 0x22, 0x39, 0x30,
    0x9a, 0x93, 0x88, 0x81, 0xbe, 0xb7, 0xac, 0xa5, 0xd2, 0xdb, 0xc0, 0xc9, 0xf6, 0xff, 0xe4, 0xed,
    0x0a, 0x03, 0x18, 0x11, 0x2e, 0x27, 0x3c, 0x35, 0x42, 0x4b, 0x50, 0x59, 0x66, 0x6f, 0x74, 0x7d,
    0xa1, 0xa8, 0xb3, 0xba, 0x85, 0x8c, 0x97, 0x9e, 0xe9, 0xe0, 0xfb, 0xf2, 0xcd, 0xc4, 0xdf, 0xd6,
    0x31, 0x38, 0x23, 0x2a, 0x15, 0x1c, 0x07, 0x0e, 0x79, 0x70, 0x6b, 0x62, 0x5d, 0x54, 0x4f, 0x46,
    // a = 0x0d
    0x00, 0x0d, 0x1a, 0x17, 0x34, 0x39, 0x2e, 0x23, 0x68, 0x65, 0x72, 0x7f, 0x5c, 0x51, 0x46, 0x4b,
    0xd0, 0xdd, 0xca, 0xc7, 0xe4, 0xe9, 0xfe, 0xf3, 0xb8, 0xb5, 0xa2, 0xaf, 0x8c, 0x81, 0x96, 0x9b,
    0xbb, 0xb6, 0xa1, 0xac, 0x8f, 0x82, 0x95, 0x98, 0xd3, 0xde, 0xc9, 0xc4, 0xe7, 0xea, 0xfd, 0xf0,
    0x6b, 0x66, 0x71, 0x7c, 0x5f, 0x52, 0x45, 0x48, 0x03, 0x0e, 0x19, 0x14, 0x37, 0x3a, 0x2d, 0x20,
    0x6d, 0x60, 0x77, 0x7a, 0x59, 0x54, 0x43, 0x4e, 0x05, 0x08, 0x1f, 0x12, 0x31, 0x3c, 0x2b, 0x26,
    0xbd, 0xb0, 0xa7, 0xaa, 0x89, 0x84, 0x93, 0x9e, 0xd5, 0xd8, 0xcf, 0xc2, 0xe1, 0xec, 0xfb, 0xf6,
    0xd6, 0xdb, 0xcc, 0xc1, 0xe2, 0xef, 0xf8, 0xf5, 0xbe, 0xb3, 0xa4, 0xa9, 0x8a, 0x87, 0x90, 0x9d,
    0x06, 0x0b, 0x1c, 0x11, 0x32, 0x3f, 0x28, 0x25, 0x6e, 0x63, 0x74, 0x79, 0x5a, 0x57, 0x40, 0x4d,
    0xda, 0xd7, 0xc0, 0xcd, 0xee, 0xe3, 0xf4, 0xf9, 0xb2, 0xbf, 0xa8, 0xa5, 0x86, 0x8b, 0x9c, 0x91,
    0x0a, 0x07, 0x10, 0x1d, 0x3e, 0x33, 0x24, 0x29, 0x62, 0x6f, 0x78, 0x75, 0x56, 0x5b, 0x4c, 0x41,
    0x61, 0x6c, 0x7b, 0x76, 0x55, 0x58, 0x4f, 0x42, 0x09, 0x04, 0x13, 0x1e, 0x3d, 0x30, 0x27, 0x2a,
    0xb1, 0xbc, 0xab, 0xa6, 0x85, 0x88, 0x9f, 0x92, 0xd9, 0xd4, 0xc3, 0xce, 0xed, 0xe0, 0xf7, 0xfa,
    0xb7, 0xba, 0xad, 0xa0, 0x83, 0x8e, 0x99, 0x94, 0xdf, 0xd2, 0xc5, 0xc8, 0xeb, 0xe6, 0xf1, 0xfc,
    0x67, 0x6a, 0x7d, 0x70, 0x53, 0x5e, 0x49, 0x44, 0x0f, 0x02, 0x15, 0x18, 0x3b, 0x36, 0x21, 0x2c,
    0x0c, 0x01, 0x16, 0x1b, 0x38, 0x35, 0x22, 0x2f, 0x64, 0x69, 0x7e, 0x73, 0x50, 0x5d, 0x4a, 0x47,
    0xdc, 0xd1, 0xc6, 0xcb, 0xe8, 0xe5, 0xf2, 0xff, 0xb4, 0xb9, 0xae, 0xa3, 0x80, 0x8d, 0x9a, 0x97,
    // a = 0x0b
    0x00, 0x0b, 0x16, 0x1d, 0x2c, 0x27, 0x3a, 0x31, 0x58, 0x53, 0x4e, 0x45, 0x74, 0x7f, 0x62, 0x69,
    0xb0, 0xbb, 0xa6, 0xad, 0x9c, 0x97, 0x8a, 0x81, 0xe8, 0xe3, 0xfe, 0xf5, 0xc4, 0xcf, 0xd2, 0xd9,
    0x7b, 0x70, 0x6d, 0x66, 0x57, 0x5c, 0x41, 0x4a, 0x23, 0x28, 0x35, 0x3e, 0x0f, 0x04, 0x19, 0x12,
    0xcb, 0xc0, 0xdd, 0xd6, 0xe7, 0xec, 0xf1, 0xfa, 0x93, 0x98, 0x85, 0x8e, 0xbf, 0xb4, 0xa9, 0xa2,
    0xf6, 0xfd, 0xe0, 0xeb, 0xda, 0xd1, 0xcc, 0xc7, 0xae, 0xa5, 0xb8, 0xb3, 0x82, 0x89, 0x94, 0x9f,
    0x46, 0x4d, 0x50, 0x5b, 0x6a, 0x61, 0x7c, 0x77, 0x1e, 0x15, 0x08, 0x03, 0x32, 0x39, 0x24, 0x2f,
    0x8d, 0x86, 0x9b, 0x90, 0xa1, 0xaa, 0xb7, 0xbc, 0xd5, 0xde, 0xc3, 0xc8, 0xf9, 0xf2, 0xef, 0xe4,
    0x3d, 0x36, 0x2b, 0x20, 0x11, 0x1a, 0x07, 0x0c, 0x65, 0x6e, 0x73, 0x78, 0x49, 0x42, 0x5f, 0x54,
    0xf7, 0xfc, 0xe1, 0xea, 0xdb, 0xd0, 0xcd, 0xc6, 0xaf, 0xa4, 0xb9, 0xb2, 0x83, 0x88, 0x95, 0x9e,
    0x47, 0x4c, 0x51, 0x5a, 0x6b, 0x60, 0x7d, 0x76, 0x1f, 0x14, 0x09, 0x02, 0x33, 0x38, 0x25, 0x2e,
    0x8c, 0x87, 0x9a, 0x91, 0xa0, 0xab, 0xb6, 0xbd, 0xd4, 0xdf, 0xc2, 0xc9, 0xf8, 0xf3, 0xee, 0xe5,
    0x3c, 0x37, 0x2a, 0x21, 0x10, 0x1b, 0x06, 0x0d, 0x64, 0x6f, 0x72, 0x79, 0x48, 0x43, 0x5e, 0x55,
    0x01, 0x0a, 0x17, 0x1c, 0x2d, 0x26, 0x3b, 0x30, 0x59, 0x52, 0x4f, 0x44, 0x75, 0x7e, 0x63, 0x68,
    0xb1, 0xba, 0xa7, 0xac, 0x9d, 0x96, 0x8b, 0x80, 0xe9, 0xe2, 0xff, 0xf4, 0xc5, 0xce, 0xd3, 0xd8,
    0x7a, 0x71, 0x6c, 0x67, 0x56, 0x5d, 0x40, 0x4b, 0x22, 0x29, 0x34, 0x3f, 0x0e, 0x05, 0x18, 0x13,
    0xca, 0xc1, 0xdc, 0xd7, 0xe6, 0xed, 0xf0, 0xfb, 0x92, 0x99, 0x84, 0x8f, 0xbe, 0xb5, 0xa8, 0xa3,
};
// Precomputed GF(2^8) products used by native_aesmc/native_aesmc_y:
// ff_mult3[k][x] is a*x for the coefficient a noted above each 256-entry row
// (row 0: 0x02, rows 1 and 2: 0x01 i.e. identity, row 3: 0x03).
const uint8_t ff_mult3[4][256] = {
    // a = 0x02
    0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e, 0x10, 0x12, 0x14, 0x16, 0x18, 0x1a, 0x1c, 0x1e,
    0x20, 0x22, 0x24, 0x26, 0x28, 0x2a, 0x2c, 0x2e, 0x30, 0x32, 0x34, 0x36, 0x38, 0x3a, 0x3c, 0x3e,
    0x40, 0x42, 0x44, 0x46, 0x48, 0x4a, 0x4c, 0x4e, 0x50, 0x52, 0x54, 0x56, 0x58, 0x5a, 0x5c, 0x5e,
    0x60, 0x62, 0x64, 0x66, 0x68, 0x6a, 0x6c, 0x6e, 0x70, 0x72, 0x74, 0x76, 0x78, 0x7a, 0x7c, 0x7e,
    0x80, 0x82, 0x84, 0x86, 0x88, 0x8a, 0x8c, 0x8e, 0x90, 0x92, 0x94, 0x96, 0x98, 0x9a, 0x9c, 0x9e,
    0xa0, 0xa2, 0xa4, 0xa6, 0xa8, 0xaa, 0xac, 0xae, 0xb0, 0xb2, 0xb4, 0xb6, 0xb8, 0xba, 0xbc, 0xbe,
    0xc0, 0xc2, 0xc4, 0xc6, 0xc8, 0xca, 0xcc, 0xce, 0xd0, 0xd2, 0xd4, 0xd6, 0xd8, 0xda, 0xdc, 0xde,
    0xe0, 0xe2, 0xe4, 0xe6, 0xe8, 0xea, 0xec, 0xee, 0xf0, 0xf2, 0xf4, 0xf6, 0xf8, 0xfa, 0xfc, 0xfe,
    0x1b, 0x19, 0x1f, 0x1d, 0x13, 0x11, 0x17, 0x15, 0x0b, 0x09, 0x0f, 0x0d, 0x03, 0x01, 0x07, 0x05,
    0x3b, 0x39, 0x3f, 0x3d, 0x33, 0x31, 0x37, 0x35, 0x2b, 0x29, 0x2f, 0x2d, 0x23, 0x21, 0x27, 0x25,
    0x5b, 0x59, 0x5f, 0x5d, 0x53, 0x51, 0x57, 0x55, 0x4b, 0x49, 0x4f, 0x4d, 0x43, 0x41, 0x47, 0x45,
    0x7b, 0x79, 0x7f, 0x7d, 0x73, 0x71, 0x77, 0x75, 0x6b, 0x69, 0x6f, 0x6d, 0x63, 0x61, 0x67, 0x65,
    0x9b, 0x99, 0x9f, 0x9d, 0x93, 0x91, 0x97, 0x95, 0x8b, 0x89, 0x8f, 0x8d, 0x83, 0x81, 0x87, 0x85,
    0xbb, 0xb9, 0xbf, 0xbd, 0xb3, 0xb1, 0xb7, 0xb5, 0xab, 0xa9, 0xaf, 0xad, 0xa3, 0xa1, 0xa7, 0xa5,
    0xdb, 0xd9, 0xdf, 0xdd, 0xd3, 0xd1, 0xd7, 0xd5, 0xcb, 0xc9, 0xcf, 0xcd, 0xc3, 0xc1, 0xc7, 0xc5,
    0xfb, 0xf9, 0xff, 0xfd, 0xf3, 0xf1, 0xf7, 0xf5, 0xeb, 0xe9, 0xef, 0xed, 0xe3, 0xe1, 0xe7, 0xe5,
    // a = 0x01
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
    0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
    0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
    0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
    0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
    0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
    0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
    0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
    0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
    0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
    0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
    0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
    0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
    0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
    // a = 0x01
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
    0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
    0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
    0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
    0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
    0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
    0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
    0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
    0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
    0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
    0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
    0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
    0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
    0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
    // a = 0x03
    0x00, 0x03, 0x06, 0x05, 0x0c, 0x0f, 0x0a, 0x09, 0x18, 0x1b, 0x1e, 0x1d, 0x14, 0x17, 0x12, 0x11,
    0x30, 0x33, 0x36, 0x35, 0x3c, 0x3f, 0x3a, 0x39, 0x28, 0x2b, 0x2e, 0x2d, 0x24, 0x27, 0x22, 0x21,
    0x60, 0x63, 0x66, 0x65, 0x6c, 0x6f, 0x6a, 0x69, 0x78, 0x7b, 0x7e, 0x7d, 0x74, 0x77, 0x72, 0x71,
    0x50, 0x53, 0x56, 0x55, 0x5c, 0x5f, 0x5a, 0x59, 0x48, 0x4b, 0x4e, 0x4d, 0x44, 0x47, 0x42, 0x41,
    0xc0, 0xc3, 0xc6, 0xc5, 0xcc, 0xcf, 0xca, 0xc9, 0xd8, 0xdb, 0xde, 0xdd, 0xd4, 0xd7, 0xd2, 0xd1,
    0xf0, 0xf3, 0xf6, 0xf5, 0xfc, 0xff, 0xfa, 0xf9, 0xe8, 0xeb, 0xee, 0xed, 0xe4, 0xe7, 0xe2, 0xe1,
    0xa0, 0xa3, 0xa6, 0xa5, 0xac, 0xaf, 0xaa, 0xa9, 0xb8, 0xbb, 0xbe, 0xbd, 0xb4, 0xb7, 0xb2, 0xb1,
    0x90, 0x93, 0x96, 0x95, 0x9c, 0x9f, 0x9a, 0x99, 0x88, 0x8b, 0x8e, 0x8d, 0x84, 0x87, 0x82, 0x81,
    0x9b, 0x98, 0x9d, 0x9e, 0x97, 0x94, 0x91, 0x92, 0x83, 0x80, 0x85, 0x86, 0x8f, 0x8c, 0x89, 0x8a,
    0xab, 0xa8, 0xad, 0xae, 0xa7, 0xa4, 0xa1, 0xa2, 0xb3, 0xb0, 0xb5, 0xb6, 0xbf, 0xbc, 0xb9, 0xba,
    0xfb, 0xf8, 0xfd, 0xfe, 0xf7, 0xf4, 0xf1, 0xf2, 0xe3, 0xe0, 0xe5, 0xe6, 0xef, 0xec, 0xe9, 0xea,
    0xcb, 0xc8, 0xcd, 0xce, 0xc7, 0xc4, 0xc1, 0xc2, 0xd3, 0xd0, 0xd5, 0xd6, 0xdf, 0xdc, 0xd9, 0xda,
    0x5b, 0x58, 0x5d, 0x5e, 0x57, 0x54, 0x51, 0x52, 0x43, 0x40, 0x45, 0x46, 0x4f, 0x4c, 0x49, 0x4a,
    0x6b, 0x68, 0x6d, 0x6e, 0x67, 0x64, 0x61, 0x62, 0x73, 0x70, 0x75, 0x76, 0x7f, 0x7c, 0x79, 0x7a,
    0x3b, 0x38, 0x3d, 0x3e, 0x37, 0x34, 0x31, 0x32, 0x23, 0x20, 0x25, 0x26, 0x2f, 0x2c, 0x29, 0x2a,
    0x0b, 0x08, 0x0d, 0x0e, 0x07, 0x04, 0x01, 0x02, 0x13, 0x10, 0x15, 0x16, 0x1f, 0x1c, 0x19, 0x1a,
};

// Multiply a by b in GF(2^8), reducing with the polynomial
// x^8 + x^4 + x^3 + x + 1 (0x11b). Shift-and-add over the 8 bits of b.
static uint8_t ff_mult(uint8_t a, uint8_t b)
{
    uint8_t product = 0;

    for(int bit = 0; bit < 8; ++bit, b >>= 1) {
        if(b & 1)
            product ^= a;

        // a *= x; fold the overflowing x^8 term back in with 0x1b
        const int overflow = a & 0x80;
        a <<= 1;
        if(overflow)
            a ^= 0x1b;
    }

    return product;
}

// AESIMC helper: transform emu->xmm[xmm] in place, one 4-byte column at a time.
// Output byte r of a column is the XOR over c of ff_mult2[(r-c) mod 4][in[c]],
// where table rows 0..3 hold the products by 0x0e, 0x09, 0x0d, 0x0b.
void native_aesimc(x64emu_t* emu, int xmm)
{
    const sse_regs_t src = emu->xmm[xmm];   // snapshot: the register is overwritten below

    for(int col=0; col<4; ++col) {
        const uint8_t* in = &src.ub[col*4];
        for(int r=0; r<4; ++r) {
            uint8_t acc = 0;
            for(int c=0; c<4; ++c)
                acc ^= ff_mult2[(r+4-c)&3][in[c]];
            emu->xmm[xmm].ub[col*4+r] = acc;
        }
    }
}
// Same transform as native_aesimc, applied in place to emu->ymm[ymm].
// Output byte r of a column is the XOR over c of ff_mult2[(r-c) mod 4][in[c]].
void native_aesimc_y(x64emu_t* emu, int ymm)
{
    const sse_regs_t src = emu->ymm[ymm];   // snapshot: the register is overwritten below

    for(int col=0; col<4; ++col) {
        const uint8_t* in = &src.ub[col*4];
        for(int r=0; r<4; ++r) {
            uint8_t acc = 0;
            for(int c=0; c<4; ++c)
                acc ^= ff_mult2[(r+4-c)&3][in[c]];
            emu->ymm[ymm].ub[col*4+r] = acc;
        }
    }
}
// Column-mix step used by native_aese: transform emu->xmm[xmm] in place, one
// 4-byte column at a time.
// Output byte r of a column is the XOR over c of ff_mult3[(r-c) mod 4][in[c]],
// where table rows 0..3 hold the products by 0x02, 0x01, 0x01, 0x03.
void native_aesmc(x64emu_t* emu, int xmm)
{
    const sse_regs_t src = emu->xmm[xmm];   // snapshot: the register is overwritten below

    for(int col=0; col<4; ++col) {
        const uint8_t* in = &src.ub[col*4];
        for(int r=0; r<4; ++r) {
            uint8_t acc = 0;
            for(int c=0; c<4; ++c)
                acc ^= ff_mult3[(r+4-c)&3][in[c]];
            emu->xmm[xmm].ub[col*4+r] = acc;
        }
    }
}
// Same transform as native_aesmc, applied in place to emu->ymm[ymm].
// Output byte r of a column is the XOR over c of ff_mult3[(r-c) mod 4][in[c]].
void native_aesmc_y(x64emu_t* emu, int ymm)
{
    const sse_regs_t src = emu->ymm[ymm];   // snapshot: the register is overwritten below

    for(int col=0; col<4; ++col) {
        const uint8_t* in = &src.ub[col*4];
        for(int r=0; r<4; ++r) {
            uint8_t acc = 0;
            for(int c=0; c<4; ++c)
                acc ^= ff_mult3[(r+4-c)&3][in[c]];
            emu->ymm[ymm].ub[col*4+r] = acc;
        }
    }
}
                                  // Byte permutation used by native_aesdlast(_y): output byte i is taken
                                  // from source byte invshiftrows[i]. First row below: source bytes
                                  // (letter+index); second row: the letter that lands at each position.
                                  //   A0 B1 C2 D3 E4 F5 G6 H7 I8 J9 Ka Lb Mc Nd Oe Pf
                                  //   A  N  K  H  E  B  O  L  I  F  C  P  M  J  G  D
const uint8_t invshiftrows[] = {0,13,10, 7, 4, 1,14,11, 8, 5, 2,15,12, 9, 6, 3};
// 256-entry byte substitution table applied by native_aesdlast(_y) after the
// invshiftrows permutation (the InvSubBytes step named in their comments).
const uint8_t invsubbytes[256] = {
    0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
    0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb,
    0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e,
    0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25,
    0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92,
    0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84,
    0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06,
    0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b,
    0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73,
    0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e,
    0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b,
    0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4,
    0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f,
    0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef,
    0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61,
    0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d,
};
// AESDECLAST core (without the round-key XOR): permute the 16 bytes of
// emu->xmm[xmm] with invshiftrows, then substitute each byte through
// invsubbytes. The register is updated in place.
void native_aesdlast(x64emu_t* emu, int xmm)
{
    sse_regs_t* reg = &emu->xmm[xmm];
    const sse_regs_t before = *reg;     // snapshot so the permutation reads unmodified bytes

    // STATE <- InvSubBytes( InvShiftRows( STATE ) )
    for(int i=0; i<16; ++i)
        reg->ub[i] = invsubbytes[before.ub[invshiftrows[i]]];
}
// Same as native_aesdlast, applied in place to emu->ymm[ymm]: invshiftrows
// permutation followed by the invsubbytes substitution.
void native_aesdlast_y(x64emu_t* emu, int ymm)
{
    sse_regs_t* reg = &emu->ymm[ymm];
    const sse_regs_t before = *reg;     // snapshot so the permutation reads unmodified bytes

    // STATE <- InvSubBytes( InvShiftRows( STATE ) )
    for(int i=0; i<16; ++i)
        reg->ub[i] = invsubbytes[before.ub[invshiftrows[i]]];
}
// Byte permutation used by native_aeselast(_y): output byte i is taken from
// source byte shiftrows[i].
const uint8_t shiftrows[] = {0, 5,10,15, 4, 9,14, 3, 8,13, 2, 7,12, 1, 6,11};
// 256-entry byte substitution table (the SubBytes step) applied by
// native_aeselast(_y) and native_aeskeygenassist.
const uint8_t subbytes[256] = {
    0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
    0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
    0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
    0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
    0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
    0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
    0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
    0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
    0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
    0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
    0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
    0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
    0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
    0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
    0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
    0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16,
};
// AESENCLAST core (without the round-key XOR): permute the 16 bytes of
// emu->xmm[xmm] with shiftrows, then substitute each byte through subbytes.
// The register is updated in place.
//   source bytes:   A0 B1 C2 D3 E4 F5 G6 H7 I8 J9 Ka Lb Mc Nd Oe Pf
//   after shifting: A  F  K  P  E  J  O  D  I  N  C  H  M  B  G  L
void native_aeselast(x64emu_t* emu, int xmm)
{
    sse_regs_t* reg = &emu->xmm[xmm];
    const sse_regs_t before = *reg;     // snapshot so the permutation reads unmodified bytes

    // STATE <- SubBytes( ShiftRows( STATE ) )
    for(int i=0; i<16; ++i)
        reg->ub[i] = subbytes[before.ub[shiftrows[i]]];
}
// Same as native_aeselast, applied in place to emu->ymm[ymm]: shiftrows
// permutation followed by the subbytes substitution.
void native_aeselast_y(x64emu_t* emu, int ymm)
{
    sse_regs_t* reg = &emu->ymm[ymm];
    const sse_regs_t before = *reg;     // snapshot so the permutation reads unmodified bytes

    // STATE <- SubBytes( ShiftRows( STATE ) )
    for(int i=0; i<16; ++i)
        reg->ub[i] = subbytes[before.ub[shiftrows[i]]];
}
// AESDEC core on emu->xmm[xmm]: native_aesdlast (byte permutation + inverse
// substitution) followed by native_aesimc (inverse column mix).
void native_aesd(x64emu_t* emu, int xmm)
{
    native_aesdlast(emu, xmm);
    native_aesimc(emu, xmm);
}
// Same as native_aesd, applied to emu->ymm[ymm]: native_aesdlast_y followed
// by native_aesimc_y.
void native_aesd_y(x64emu_t* emu, int ymm)
{
    native_aesdlast_y(emu, ymm);
    native_aesimc_y(emu, ymm);
}
// AESENC core on emu->xmm[xmm]: native_aeselast (byte permutation +
// substitution) followed by native_aesmc (column mix).
void native_aese(x64emu_t* emu, int xmm)
{
    native_aeselast(emu, xmm);
    native_aesmc(emu, xmm);
}
// Same as native_aese, applied to emu->ymm[ymm]: native_aeselast_y followed
// by native_aesmc_y.
void native_aese_y(x64emu_t* emu, int ymm)
{
    native_aeselast_y(emu, ymm);
    native_aesmc_y(emu, ymm);
}
// AESKEYGENASSIST helper.
// Source is the memory operand p when non-NULL, otherwise emu->xmm[ex];
// destination is emu->xmm[gx]. For source dwords 1 and 3:
//   dest dword (n-1) = SubBytes(src dword n)
//   dest dword  n    = that value with its bytes rotated by one position, XOR u8
// The rotation is done with byte accesses plus a 32-bit shift, exactly as the
// byte/dword views of sse_regs_t lay them out.
void native_aeskeygenassist(x64emu_t* emu, int gx, int ex, void* p, uint32_t u8)
{
    sse_regs_t *GX = &emu->xmm[gx];
    sse_regs_t *EX = p?((sse_regs_t*)p):&emu->xmm[ex];

    // substitute the bytes of dwords 1 and 3 first (source may alias destination)
    for (int i = 0; i < 4; ++i)
        GX->ub[4+i] = subbytes[EX->ub[4+i]];
    for (int i = 0; i < 4; ++i)
        GX->ub[12+i] = subbytes[EX->ub[12+i]];

    // dword 1 feeds dwords 0/1, then dword 3 feeds dwords 2/3
    for (int d = 1; d < 4; d += 2) {
        GX->ud[d-1] = GX->ud[d];
        const uint8_t first = GX->ub[d*4];
        GX->ud[d] >>= 8;
        GX->ub[d*4+3] = first;
        GX->ud[d] ^= u8;
    }
}

// Carry-less (XOR-based) 64x64 -> 128-bit multiplication, result stored in
// emu->xmm[gx].
//   emu : emulator state
//   gx  : index of the destination xmm register; also supplies the first operand
//   ex  : index of the second-operand xmm register, used only when p is NULL
//   p   : optional pointer to a 16-byte second operand (takes precedence over ex)
//   u8  : selector; bit 0 picks the qword of GX, bit 4 picks the qword of EX
void native_pclmul(x64emu_t* emu, int gx, int ex, void* p, uint32_t u8)
{
    sse_regs_t *EX = p?((sse_regs_t*)p):&emu->xmm[ex];
    sse_regs_t *GX = &emu->xmm[gx];
    const int g = (u8&1)?1:0;
    const int e = (u8&0x10)?1:0;
    // Work on unsigned values only: testing a bit with (1LL<<63) is a signed
    // shift into the sign bit, which is undefined behaviour in C.
    const uint64_t op1 = (uint64_t)GX->q[g];
    const unsigned __int128 op2 = (uint64_t)EX->q[e];
    unsigned __int128 result = 0;
    for (int i=0; i<64; ++i)
        if((op1>>i)&1)
            result ^= (op2<<i);
    GX->u128 = result;
}
// Three-operand carry-less 64x64 -> 128-bit multiplication:
// emu->xmm[gx] = clmul(selected qword of xmm[vx], selected qword of EX).
//   emu : emulator state
//   gx  : index of the destination xmm register
//   vx  : index of the first-operand xmm register
//   p   : second operand; a value <= 15 is treated as an xmm register index,
//         anything larger as a pointer to a 16-byte operand
//   u8  : selector; bit 0 picks the qword of VX, bit 4 picks the qword of EX
void native_pclmul_x(x64emu_t* emu, int gx, int vx, void* p, uint32_t u8)
{
    sse_regs_t *EX = ((uintptr_t)p>15)?((sse_regs_t*)p):&emu->xmm[(uintptr_t)p];
    sse_regs_t *GX = &emu->xmm[gx];
    sse_regs_t *VX = &emu->xmm[vx];
    const int g = (u8&1)?1:0;
    const int e = (u8&0x10)?1:0;
    // Work on unsigned values only: testing a bit with (1LL<<63) is a signed
    // shift into the sign bit, which is undefined behaviour in C.
    const uint64_t op1 = (uint64_t)VX->q[g];
    const unsigned __int128 op2 = (uint64_t)EX->q[e];
    unsigned __int128 result = 0;
    for (int i=0; i<64; ++i)
        if((op1>>i)&1)
            result ^= (op2<<i);

    // Written last, so GX may alias VX or EX.
    GX->u128 = result;
}
// Carry-less multiplication on both halves: the xmm part is handled by
// native_pclmul_x(), then the same operation is repeated on the emu->ymm[]
// entries (presumably the upper 128-bit lanes -- confirm against the
// emulator state definition).
//   emu : emulator state
//   gy  : index of the destination register
//   vy  : index of the first-operand register
//   p   : second operand; a value <= 15 is treated as a register index,
//         anything larger as a pointer to a 32-byte operand whose second
//         16 bytes feed this half
//   u8  : selector; bit 0 picks the qword of VY, bit 4 picks the qword of EY
void native_pclmul_y(x64emu_t* emu, int gy, int vy, void* p, uint32_t u8)
{
    //compute both low and high values
    native_pclmul_x(emu, gy, vy, p, u8);
    // Byte-pointer arithmetic instead of the GNU-only arithmetic on void*.
    sse_regs_t *EY = ((uintptr_t)p>15)?((sse_regs_t*)((uint8_t*)p+16)):&emu->ymm[(uintptr_t)p];
    sse_regs_t *GY = &emu->ymm[gy];
    sse_regs_t *VY = &emu->ymm[vy];
    const int g = (u8&1)?1:0;
    const int e = (u8&0x10)?1:0;
    // Work on unsigned values only: testing a bit with (1LL<<63) is a signed
    // shift into the sign bit, which is undefined behaviour in C.
    const uint64_t op1 = (uint64_t)VY->q[g];
    const unsigned __int128 op2 = (uint64_t)EY->q[e];
    unsigned __int128 result = 0;
    for (int i=0; i<64; ++i)
        if((op1>>i)&1)
            result ^= (op2<<i);

    GY->u128 = result;
}

// Tell whether the flags state at the exit of instruction ninst has to be
// transformed before jumping to its target instruction.
// Compares insts[ninst].f_exit with insts[jmp].f_entry, where jmp is
// insts[ninst].x64.jmp_insts. Two implementations exist: one under ARM64
// (f_entry/f_exit are plain status values) and one for the other backends
// (f_entry/f_exit carry .dfnone and .pending fields).
// Returns 1 when a transform is needed, 0 otherwise.
static int flagsCacheNeedsTransform(dynarec_native_t* dyn, int ninst) {
    int jmp = dyn->insts[ninst].x64.jmp_insts;
    // negative index: presumably no jump target inside the block, nothing to match
    if(jmp<0)
        return 0;
    #ifdef ARM64
    // df_none is now a deferred information
    // same state on both sides: nothing to do
    if(dyn->insts[ninst].f_exit==dyn->insts[jmp].f_entry)
        return 0;
    // target is flagged df_notneeded: no transform
    if(dyn->insts[jmp].df_notneeded)
        return 0;
    if((dyn->insts[jmp].f_entry==status_none_pending) && (dyn->insts[ninst].f_exit!=status_none_pending))
        return 1;
    // remaining cases depend on the state expected at the target's entry
    switch (dyn->insts[jmp].f_entry) {
        case status_unk:
            return (dyn->insts[ninst].f_exit==status_none_pending)?1:0;
        case status_none:
            return 1;
        case status_set:
            return (dyn->insts[ninst].f_exit==status_none)?0:1;
        case status_none_pending:
            return 1;
    }
    #else
    if(dyn->insts[ninst].f_exit.dfnone)  // flags are fully known, nothing we can do more
        return 0;
    // target expects dfnone, exit does not provide it, and target is not flagged df_notneeded
    if(dyn->insts[jmp].f_entry.dfnone && !dyn->insts[ninst].f_exit.dfnone && !dyn->insts[jmp].df_notneeded)
        return 1;
    // otherwise compare the pending state expected at entry with the one at exit
    switch (dyn->insts[jmp].f_entry.pending) {
        case SF_UNKNOWN: return 0;
        case SF_SET:
            if(dyn->insts[ninst].f_exit.pending!=SF_SET && dyn->insts[ninst].f_exit.pending!=SF_SET_PENDING)
                return 1;
            else
                return 0;
        case SF_SET_PENDING:
            if(dyn->insts[ninst].f_exit.pending==SF_SET_PENDING)
                return 0;
            return 1;
        case SF_PENDING:
            if(dyn->insts[ninst].f_exit.pending==SF_PENDING || dyn->insts[ninst].f_exit.pending==SF_SET_PENDING)
                return 0;
            return (dyn->insts[jmp].f_entry.dfnone  == dyn->insts[ninst].f_exit.dfnone)?0:1;
    }
    #endif
    // no case matched: no transform needed
    return 0;
}

// Build the "needs transform" result for instruction ninst.
// Bit 0 is set when flagsCacheNeedsTransform() reports that the flags state
// must be transformed. OTHER_CACHE() is a macro defined outside this file
// section; presumably it ORs further bits into ret for the other caches --
// confirm against its definition.
// Returns ret (0 when nothing needs transforming).
int CacheNeedsTransform(dynarec_native_t* dyn, int ninst) {
    int ret = 0;
    if (flagsCacheNeedsTransform(dyn, ninst)) ret|=1;
    OTHER_CACHE()
    return ret;
}

// Look for pred in the predecessor list of instruction ninst.
// Returns pred when it is listed, -1 otherwise.
int isPred(dynarec_native_t* dyn, int ninst, int pred) {
    int idx = 0;
    while (idx < dyn->insts[ninst].pred_sz) {
        if (dyn->insts[ninst].pred[idx] == pred) {
            return pred;
        }
        ++idx;
    }
    return -1;
}
// Pick the preferred predecessor of instruction ninst.
// Returns -1 for ninst <= 0 or when ninst has no predecessor; otherwise
// ninst-1 when it is one of the predecessors, else the first listed one.
int getNominalPred(dynarec_native_t* dyn, int ninst) {
    if (ninst <= 0) {
        return -1;
    }
    if (!dyn->insts[ninst].pred_sz) {
        return -1;
    }
    const int previous = ninst - 1;
    // The instruction just before is preferred whenever it is a predecessor.
    if (isPred(dyn, ninst, previous) == -1) {
        return dyn->insts[ninst].pred[0];
    }
    return previous;
}

#define F8      *(uint8_t*)(addr++)
// Do the GETED, but don't emit anything: step addr over the SIB byte and
// displacement that follow the ModRM byte nextop.
//   dyn, ninst : unused
//   addr       : address of the first byte after ModRM
//   nextop     : the ModRM byte
// Returns the address of the first byte after the addressing bytes.
uintptr_t fakeed(dynarec_native_t* dyn, uintptr_t addr, int ninst, uint8_t nextop)
{
    (void)dyn; (void)ninst;

    const uint8_t mod = nextop & 0xC0;
    const uint8_t rm  = nextop & 7;

    switch (mod) {
        case 0xC0:
            // register operand: no extra bytes
            break;
        case 0x00:
            if (rm == 4) {
                // SIB byte; a base field of 5 is followed by a 32-bit displacement
                uint8_t sib = F8;
                if ((sib & 0x7) == 5)
                    addr += 4;
            } else if (rm == 5) {
                // 32-bit displacement only
                addr += 4;
            }
            break;
        default:
            // mod is 0x40 or 0x80: optional SIB byte, then disp8 or disp32
            if (rm == 4)
                ++addr;
            addr += (mod == 0x80) ? 4 : 1;
            break;
    }
    return addr;
}
// Return Ib on a mod/rm opcode without emitting anything: skip the
// addressing bytes with fakeed() and read the byte found right after them.
uint8_t geted_ib(dynarec_native_t* dyn, uintptr_t addr, int ninst, uint8_t nextop)
{
    const uintptr_t ib_addr = fakeed(dyn, addr, ninst, nextop);
    return *(uint8_t*)ib_addr;
}
#undef F8

// Walk backwards through the predecessors of ninst and set df_notneeded on
// each instruction, stopping a path at an instruction that is already
// flagged or that generates or uses flags. The first predecessor is followed
// iteratively, the others recursively.
void propagate_nodf(dynarec_native_t* dyn, int ninst)
{
    for (int cur = ninst; cur >= 0; cur = dyn->insts[cur].pred[0]) {
        if (dyn->insts[cur].df_notneeded) {
            return; // already flagged
        }
        if (dyn->insts[cur].x64.gen_flags || dyn->insts[cur].x64.use_flags) {
            return; // flags are used, so maybe it's needed
        }
        dyn->insts[cur].df_notneeded = 1;
        if (!dyn->insts[cur].pred_sz) {
            return; // no predecessor: end of the chain
        }
        // Secondary predecessors first; pred[0] is taken by the loop step.
        for (int k = 1; k < dyn->insts[cur].pred_sz; ++k) {
            propagate_nodf(dyn, dyn->insts[cur].pred[k]);
        }
    }
}

// Format a disassembled x64 instruction followed by the native registers
// mapped to the x64 registers it mentions.
//   buf         : output buffer; the caller must size it for the disassembly
//                 padded to 35 columns plus " ;" and up to 31 annotation chars
//   disas       : disassembly text, mnemonic first, operands after a space
//   mappings    : table of { name (x64 register), native (native register) }
//   mappings_sz : number of entries in mappings
// Without operands (no space in disas) buf receives disas unchanged.
// Otherwise buf receives "<disas padded to 35> ; n1, n2, ...". Annotations
// that would not fit in the 32-byte scratch buffer are dropped.
void x64disas_add_register_mapping_annotations(char* buf, const char* disas, const register_mapping_t* mappings, size_t mappings_sz)
{
    // Plain local buffer: no state is shared between calls.
    char tmp[32];
    size_t len = 0;
    tmp[0] = '\0';

    // skip the mnemonic
    const char* p = strchr(disas, ' ');
    if (!p) {
        sprintf(buf, "%s", disas);
        return;
    }
    p++; // skip the space
    while (*p) {
        while (*p && !(*p >= 'a' && *p <= 'e') && *p != 's' && *p != 'r') // skip non-register characters
            p++;
        if (!*p) break;
        for (size_t i = 0; i < mappings_sz; ++i) {
            const size_t name_len = strlen(mappings[i].name);
            // An empty name would match everywhere and stall the scan.
            if (!name_len || strncmp(p, mappings[i].name, name_len))
                continue;
            // " <native>," takes strlen(native)+2 chars plus the terminator;
            // append only when it fits entirely.
            const size_t need = strlen(mappings[i].native) + 2;
            if (need < sizeof(tmp) - len)
                len += (size_t)snprintf(tmp + len, sizeof(tmp) - len, " %s,", mappings[i].native);
            p += name_len - 1;
            break;
        }
        p++;
    }
    if (len) tmp[len - 1] = '\0'; // drop the trailing comma
    sprintf(buf, "%-35s ;%s", disas, tmp);
}
