Merge branch 'master' into jit_exit_addresses

2025-09-28 20:28:46 +00:00 · 2013-11-21 21:17:58 -06:00 · 2013-11-21 21:17:58 -06:00 · ea9ac07ec9
commit ea9ac07ec9
parent 1aa06b8fa4 ff91789773
143 changed files with 1773 additions and 1093 deletions
--- a/Source/Core/AudioCommon/Src/AudioCommon.cpp
+++ b/Source/Core/AudioCommon/Src/AudioCommon.cpp
@ -23,7 +23,7 @@ SoundStream *soundStream = nullptr;

 namespace AudioCommon 
 {
-	SoundStream *InitSoundStream(CMixer *mixer) 
+	SoundStream *InitSoundStream(CMixer *mixer, void *hWnd)
 	{
 		// TODO: possible memleak with mixer

@ -33,7 +33,7 @@ namespace AudioCommon
 		else if (backend == BACKEND_NULLSOUND   && NullSound::isValid())
 			soundStream = new NullSound(mixer);
 		else if (backend == BACKEND_DIRECTSOUND && DSound::isValid())
-			soundStream = new DSound(mixer);
+			soundStream = new DSound(mixer, hWnd);
 		else if (backend == BACKEND_XAUDIO2)
 		{
 			if (XAudio2::isValid())
--- a/Source/Core/AudioCommon/Src/AudioCommon.h
+++ b/Source/Core/AudioCommon/Src/AudioCommon.h
@ -40,7 +40,7 @@ union UDSPControl

 namespace AudioCommon
 {
-	SoundStream *InitSoundStream(CMixer *mixer);
+	SoundStream *InitSoundStream(CMixer *mixer, void *hWnd);
 	void ShutdownSoundStream();
 	std::vector<std::string> GetSoundBackends();
 	bool UseJIT();
--- a/Source/Core/AudioCommon/Src/DSoundStream.h
+++ b/Source/Core/AudioCommon/Src/DSoundStream.h
@ -48,7 +48,7 @@ class DSound : public SoundStream
 	bool WriteDataToBuffer(DWORD dwOffset, char* soundData, DWORD dwSoundBytes);

 public:
-	DSound(CMixer *mixer, void *_hWnd = NULL)
+	DSound(CMixer *mixer, void *_hWnd)
 		: SoundStream(mixer)
 		, bufferSize(0)
 		, currentPos(0)
@ -71,7 +71,7 @@ public:

 #else
 public:
-	DSound(CMixer *mixer)
+	DSound(CMixer *mixer, void *_hWnd)
 		: SoundStream(mixer)
 	{}
 #endif
--- a/Source/Core/Common/Src/ArmEmitter.cpp
+++ b/Source/Core/Common/Src/ArmEmitter.cpp
@ -938,13 +938,13 @@ u32 EncodeVm(ARMReg Vm)

 // Double/single, Neon
 extern const VFPEnc VFPOps[16][2] = {
-	{{0xE0, 0xA0}, {0x20, 0xD1}}, // 0: VMLA
+	{{0xE0, 0xA0}, {  -1,   -1}}, // 0: VMLA
 	{{0xE1, 0xA4}, {  -1,   -1}}, // 1: VNMLA
-	{{0xE0, 0xA4}, {0x22, 0xD1}}, // 2: VMLS
+	{{0xE0, 0xA4}, {  -1,   -1}}, // 2: VMLS
 	{{0xE1, 0xA0}, {  -1,   -1}}, // 3: VNMLS
-	{{0xE3, 0xA0}, {0x20, 0xD0}}, // 4: VADD
-	{{0xE3, 0xA4}, {0x22, 0xD0}}, // 5: VSUB
-	{{0xE2, 0xA0}, {0x30, 0xD1}}, // 6: VMUL
+	{{0xE3, 0xA0}, {  -1,   -1}}, // 4: VADD
+	{{0xE3, 0xA4}, {  -1,   -1}}, // 5: VSUB
+	{{0xE2, 0xA0}, {  -1,   -1}}, // 6: VMUL
 	{{0xE2, 0xA4}, {  -1,   -1}}, // 7: VNMUL
 	{{0xEB, 0xAC}, {  -1 /* 0x3B */,  -1 /* 0x70 */}}, // 8: VABS(Vn(0x0) used for encoding)
 	{{0xE8, 0xA0}, {  -1,   -1}}, // 9: VDIV
@ -1237,7 +1237,7 @@ void ARMXEmitter::VCVT(ARMReg Dest, ARMReg Source, int flags)
 	}
 }

-void NEONXEmitter::VABA(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
+void NEONXEmitter::VABA(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
 {
 	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to " __FUNCTION__);
 	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use " __FUNCTION__ " when CPU doesn't support it");
@ -1248,7 +1248,7 @@ void NEONXEmitter::VABA(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
 		| (encodedSize(Size) << 20) | EncodeVd(Vd) | (0x71 << 4) | (register_quad << 6) | EncodeVm(Vm));
 }

-void NEONXEmitter::VABAL(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
+void NEONXEmitter::VABAL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
 {
 	_dbg_assert_msg_(DYNA_REC, Vd >= Q0, "Pass invalid register to " __FUNCTION__);
 	_dbg_assert_msg_(DYNA_REC, Vn >= D0 && Vn < Q0, "Pass invalid register to " __FUNCTION__);
@ -1260,7 +1260,7 @@ void NEONXEmitter::VABAL(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
 		| (encodedSize(Size) << 20) | EncodeVd(Vd) | (0x50 << 4) | EncodeVm(Vm));
 }

-void NEONXEmitter::VABD(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
+void NEONXEmitter::VABD(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
 {
 	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to " __FUNCTION__);
 	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use " __FUNCTION__ " when CPU doesn't support it");
@ -1273,7 +1273,7 @@ void NEONXEmitter::VABD(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
 			| (encodedSize(Size) << 20) | EncodeVd(Vd) | (0x70 << 4) | (register_quad << 6) | EncodeVm(Vm));
 }

-void NEONXEmitter::VABDL(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
+void NEONXEmitter::VABDL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
 {
 	_dbg_assert_msg_(DYNA_REC, Vd >= Q0, "Pass invalid register to " __FUNCTION__);
 	_dbg_assert_msg_(DYNA_REC, Vn >= D0 && Vn < Q0, "Pass invalid register to " __FUNCTION__);
@ -1285,7 +1285,7 @@ void NEONXEmitter::VABDL(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
 		| (encodedSize(Size) << 20) | EncodeVd(Vd) | (0x70 << 4) | EncodeVm(Vm));
 }

-void NEONXEmitter::VABS(NEONElementType Size, ARMReg Vd, ARMReg Vm)
+void NEONXEmitter::VABS(u32 Size, ARMReg Vd, ARMReg Vm)
 {
 	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to " __FUNCTION__);
 	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use " __FUNCTION__ " when CPU doesn't support it");
@ -1327,7 +1327,7 @@ void NEONXEmitter::VACLT(ARMReg Vd, ARMReg Vn, ARMReg Vm)
 	VACGT(Vd, Vn, Vm);
 }

-void NEONXEmitter::VADD(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
+void NEONXEmitter::VADD(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
 {
 	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to " __FUNCTION__);
 	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use " __FUNCTION__ " when CPU doesn't support it");
@ -1341,7 +1341,7 @@ void NEONXEmitter::VADD(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
 			| (0x8 << 8) | (register_quad << 6) | EncodeVm(Vm));
 }

-void NEONXEmitter::VADDHN(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
+void NEONXEmitter::VADDHN(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
 {
 	_dbg_assert_msg_(DYNA_REC, Vd < Q0, "Pass invalid register to " __FUNCTION__);
 	_dbg_assert_msg_(DYNA_REC, Vn >= Q0, "Pass invalid register to " __FUNCTION__);
@ -1353,7 +1353,7 @@ void NEONXEmitter::VADDHN(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
 		| EncodeVd(Vd) | (0x80 << 4) | EncodeVm(Vm));
 }

-void NEONXEmitter::VADDL(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
+void NEONXEmitter::VADDL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
 {
 	_dbg_assert_msg_(DYNA_REC, Vd >= Q0, "Pass invalid register to " __FUNCTION__);
 	_dbg_assert_msg_(DYNA_REC, Vn >= D0 && Vn < Q0, "Pass invalid register to " __FUNCTION__);
@ -1364,7 +1364,7 @@ void NEONXEmitter::VADDL(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
 	Write32((0xF2 << 24) | ((Size & I_UNSIGNED ? 1 : 0) << 24) | (1 << 23) | (encodedSize(Size) << 20) | EncodeVn(Vn) \
 		| EncodeVd(Vd) | EncodeVm(Vm));
 }
-void NEONXEmitter::VADDW(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
+void NEONXEmitter::VADDW(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
 {
 	_dbg_assert_msg_(DYNA_REC, Vd >= Q0, "Pass invalid register to " __FUNCTION__);
 	_dbg_assert_msg_(DYNA_REC, Vn >= Q0, "Pass invalid register to " __FUNCTION__);
@ -1420,7 +1420,7 @@ void NEONXEmitter::VBSL(ARMReg Vd, ARMReg Vn, ARMReg Vm)

 	Write32((0xF3 << 24) | (1 << 20) | EncodeVn(Vn) | EncodeVd(Vd) | (0x11 << 4) | (register_quad << 6) | EncodeVm(Vm));
 }
-void NEONXEmitter::VCEQ(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
+void NEONXEmitter::VCEQ(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
 {
 	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to " __FUNCTION__);
 	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use " __FUNCTION__ " when CPU doesn't support it");
@ -1433,7 +1433,7 @@ void NEONXEmitter::VCEQ(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
 			| (0x81 << 4) | (register_quad << 6) | EncodeVm(Vm));

 }
-void NEONXEmitter::VCEQ(NEONElementType Size, ARMReg Vd, ARMReg Vm)
+void NEONXEmitter::VCEQ(u32 Size, ARMReg Vd, ARMReg Vm)
 {
 	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to " __FUNCTION__);
 	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use " __FUNCTION__ " when CPU doesn't support it");
@ -1443,7 +1443,7 @@ void NEONXEmitter::VCEQ(NEONElementType Size, ARMReg Vd, ARMReg Vm)
 	Write32((0xF2 << 24) | (0xB << 20) | (encodedSize(Size) << 18) | (1 << 16) \
 		| EncodeVd(Vd) | ((Size & F_32 ? 1 : 0) << 10) | (0x10 << 4) | (register_quad << 6) | EncodeVm(Vm));
 }
-void NEONXEmitter::VCGE(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
+void NEONXEmitter::VCGE(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
 {
 	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to " __FUNCTION__);
 	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use " __FUNCTION__ " when CPU doesn't support it");
@ -1455,7 +1455,7 @@ void NEONXEmitter::VCGE(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
 		Write32((0xF2 << 24) | ((Size & I_UNSIGNED ? 1 : 0) << 24)  | (encodedSize(Size) << 20) | EncodeVn(Vn) | EncodeVd(Vd) \
 			| (0x31 << 4) | (register_quad << 6) | EncodeVm(Vm));
 }
-void NEONXEmitter::VCGE(NEONElementType Size, ARMReg Vd, ARMReg Vm)
+void NEONXEmitter::VCGE(u32 Size, ARMReg Vd, ARMReg Vm)
 {
 	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to " __FUNCTION__);
 	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use " __FUNCTION__ " when CPU doesn't support it");
@ -1464,7 +1464,7 @@ void NEONXEmitter::VCGE(NEONElementType Size, ARMReg Vd, ARMReg Vm)
 	Write32((0xF3 << 24) | (0xB << 20) | (encodedSize(Size) << 18) | (1 << 16) \
 		| EncodeVd(Vd) | ((Size & F_32 ? 1 : 0) << 10) | (0x8 << 4) | (register_quad << 6) | EncodeVm(Vm));
 }
-void NEONXEmitter::VCGT(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
+void NEONXEmitter::VCGT(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
 {
 	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to " __FUNCTION__);
 	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use " __FUNCTION__ " when CPU doesn't support it");
@ -1476,7 +1476,7 @@ void NEONXEmitter::VCGT(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
 		Write32((0xF2 << 24) | ((Size & I_UNSIGNED ? 1 : 0) << 24)  | (encodedSize(Size) << 20) | EncodeVn(Vn) | EncodeVd(Vd) \
 			| (0x30 << 4) | (register_quad << 6) | EncodeVm(Vm));
 }
-void NEONXEmitter::VCGT(NEONElementType Size, ARMReg Vd, ARMReg Vm)
+void NEONXEmitter::VCGT(u32 Size, ARMReg Vd, ARMReg Vm)
 {
 	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to " __FUNCTION__);
 	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use " __FUNCTION__ " when CPU doesn't support it");
@ -1485,11 +1485,11 @@ void NEONXEmitter::VCGT(NEONElementType Size, ARMReg Vd, ARMReg Vm)
 	Write32((0xF3 << 24) | (0xD << 20) | (encodedSize(Size) << 18) | (1 << 16) \
 		| EncodeVd(Vd) | ((Size & F_32 ? 1 : 0) << 10) | (register_quad << 6) | EncodeVm(Vm));
 }
-void NEONXEmitter::VCLE(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
+void NEONXEmitter::VCLE(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
 {
 	VCGE(Size, Vd, Vm, Vn);
 }
-void NEONXEmitter::VCLE(NEONElementType Size, ARMReg Vd, ARMReg Vm)
+void NEONXEmitter::VCLE(u32 Size, ARMReg Vd, ARMReg Vm)
 {
 	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to " __FUNCTION__);
 	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use " __FUNCTION__ " when CPU doesn't support it");
@ -1498,7 +1498,7 @@ void NEONXEmitter::VCLE(NEONElementType Size, ARMReg Vd, ARMReg Vm)
 	Write32((0xF3 << 24) | (0xD << 20) | (encodedSize(Size) << 18) | (1 << 16) \
 		| EncodeVd(Vd) | ((Size & F_32 ? 1 : 0) << 10) | (3 << 7) | (register_quad << 6) | EncodeVm(Vm));
 }
-void NEONXEmitter::VCLS(NEONElementType Size, ARMReg Vd, ARMReg Vm)
+void NEONXEmitter::VCLS(u32 Size, ARMReg Vd, ARMReg Vm)
 {
 	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to " __FUNCTION__);
 	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use " __FUNCTION__ " when CPU doesn't support it");
@ -1508,11 +1508,11 @@ void NEONXEmitter::VCLS(NEONElementType Size, ARMReg Vd, ARMReg Vm)
 	Write32((0xF3 << 24) | (0xD << 20) | (encodedSize(Size) << 18) \
 		| EncodeVd(Vd) | (1 << 10) | (register_quad << 6) | EncodeVm(Vm));
 }
-void NEONXEmitter::VCLT(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
+void NEONXEmitter::VCLT(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
 {
 	VCGT(Size, Vd, Vm, Vn);
 }
-void NEONXEmitter::VCLT(NEONElementType Size, ARMReg Vd, ARMReg Vm)
+void NEONXEmitter::VCLT(u32 Size, ARMReg Vd, ARMReg Vm)
 {
 	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to " __FUNCTION__);
 	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use " __FUNCTION__ " when CPU doesn't support it");
@ -1521,7 +1521,7 @@ void NEONXEmitter::VCLT(NEONElementType Size, ARMReg Vd, ARMReg Vm)
 	Write32((0xF3 << 24) | (0xD << 20) | (encodedSize(Size) << 18) | (1 << 16) \
 		| EncodeVd(Vd) | ((Size & F_32 ? 1 : 0) << 10) | (0x20 << 4) | (register_quad << 6) | EncodeVm(Vm));
 }
-void NEONXEmitter::VCLZ(NEONElementType Size, ARMReg Vd, ARMReg Vm)
+void NEONXEmitter::VCLZ(u32 Size, ARMReg Vd, ARMReg Vm)
 {
 	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to " __FUNCTION__);
 	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use " __FUNCTION__ " when CPU doesn't support it");
@ -1530,7 +1530,7 @@ void NEONXEmitter::VCLZ(NEONElementType Size, ARMReg Vd, ARMReg Vm)
 	Write32((0xF3 << 24) | (0xD << 20) | (encodedSize(Size) << 18) \
 		| EncodeVd(Vd) | (0x48 << 4) | (register_quad << 6) | EncodeVm(Vm));
 }
-void NEONXEmitter::VCNT(NEONElementType Size, ARMReg Vd, ARMReg Vm)
+void NEONXEmitter::VCNT(u32 Size, ARMReg Vd, ARMReg Vm)
 {
 	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to " __FUNCTION__);
 	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use " __FUNCTION__ " when CPU doesn't support it");
@ -1540,7 +1540,7 @@ void NEONXEmitter::VCNT(NEONElementType Size, ARMReg Vd, ARMReg Vm)
 	Write32((0xF3 << 24) | (0xD << 20) | (encodedSize(Size) << 18) \
 		| EncodeVd(Vd) | (0x90 << 4) | (register_quad << 6) | EncodeVm(Vm));
 }
-void NEONXEmitter::VDUP(NEONElementType Size, ARMReg Vd, ARMReg Vm, u8 index)
+void NEONXEmitter::VDUP(u32 Size, ARMReg Vd, ARMReg Vm, u8 index)
 {
 	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to " __FUNCTION__);
 	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use " __FUNCTION__ " when CPU doesn't support it");
@ -1562,7 +1562,7 @@ void NEONXEmitter::VDUP(NEONElementType Size, ARMReg Vd, ARMReg Vm, u8 index)
 	Write32((0xF3 << 24) | (0xD << 20) | (sizeEncoded << 16) | (indexEncoded << 16) \
 		| EncodeVd(Vd) | (0xC0 << 4) | (register_quad << 6) | EncodeVm(Vm));
 }
-void NEONXEmitter::VDUP(NEONElementType Size, ARMReg Vd, ARMReg Rt)
+void NEONXEmitter::VDUP(u32 Size, ARMReg Vd, ARMReg Rt)
 {
 	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to " __FUNCTION__);
 	_dbg_assert_msg_(DYNA_REC, Rt < D0, "Pass invalid register to " __FUNCTION__);
@ -1616,7 +1616,7 @@ void NEONXEmitter::VFMS(ARMReg Vd, ARMReg Vn, ARMReg Vm)

 	Write32((0xF2 << 24) | (1 << 21) | EncodeVn(Vn) | EncodeVd(Vd) | (0xC1 << 4) | (register_quad << 6) | EncodeVm(Vm));
 }
-void NEONXEmitter::VHADD(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
+void NEONXEmitter::VHADD(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
 {
 	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to " __FUNCTION__);
 	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use " __FUNCTION__ " when CPU doesn't support it");
@ -1627,7 +1627,7 @@ void NEONXEmitter::VHADD(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
 	Write32((0xF2 << 24) | ((Size & I_UNSIGNED ? 1 : 0) << 23) | (encodedSize(Size) << 20) \
 		| EncodeVn(Vn) | EncodeVd(Vd) | (register_quad << 6) | EncodeVm(Vm));
 }
-void NEONXEmitter::VHSUB(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
+void NEONXEmitter::VHSUB(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
 {
 	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to " __FUNCTION__);
 	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use " __FUNCTION__ " when CPU doesn't support it");
@ -1638,7 +1638,7 @@ void NEONXEmitter::VHSUB(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
 	Write32((0xF2 << 24) | ((Size & I_UNSIGNED ? 1 : 0) << 23) | (encodedSize(Size) << 20) \
 		| EncodeVn(Vn) | EncodeVd(Vd) | (1 << 9) | (register_quad << 6) | EncodeVm(Vm));
 }
-void NEONXEmitter::VMAX(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
+void NEONXEmitter::VMAX(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
 {
 	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to " __FUNCTION__);
 	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use " __FUNCTION__ " when CPU doesn't support it");
@ -1651,7 +1651,7 @@ void NEONXEmitter::VMAX(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
 		Write32((0xF2 << 24) | ((Size & I_UNSIGNED ? 1 : 0) << 23) | (encodedSize(Size) << 20) \
 			| EncodeVn(Vn) | EncodeVd(Vd) | (0x60 << 4) | (register_quad << 6) | EncodeVm(Vm));
 }
-void NEONXEmitter::VMIN(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
+void NEONXEmitter::VMIN(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
 {
 	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to " __FUNCTION__);
 	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use " __FUNCTION__ " when CPU doesn't support it");
@ -1664,7 +1664,7 @@ void NEONXEmitter::VMIN(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
 		Write32((0xF2 << 24) | ((Size & I_UNSIGNED ? 1 : 0) << 23) | (encodedSize(Size) << 20) \
 			| EncodeVn(Vn) | EncodeVd(Vd) | (0x61 << 4) | (register_quad << 6) | EncodeVm(Vm));
 }
-void NEONXEmitter::VMLA(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
+void NEONXEmitter::VMLA(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
 {
 	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to " __FUNCTION__);
 	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use " __FUNCTION__ " when CPU doesn't support it");
@ -1676,7 +1676,7 @@ void NEONXEmitter::VMLA(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
 	else
 		Write32((0xF2 << 24) | (encodedSize(Size) << 20) | EncodeVn(Vn) | EncodeVd(Vd) | (0x90 << 4) | (register_quad << 6) | EncodeVm(Vm));
 }
-void NEONXEmitter::VMLS(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
+void NEONXEmitter::VMLS(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
 {
 	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to " __FUNCTION__);
 	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use " __FUNCTION__ " when CPU doesn't support it");
@ -1688,7 +1688,7 @@ void NEONXEmitter::VMLS(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
 	else
 		Write32((0xF2 << 24) | (1 << 24) | (encodedSize(Size) << 20) | EncodeVn(Vn) | EncodeVd(Vd) | (0x90 << 4) | (register_quad << 6) | EncodeVm(Vm));
 }
-void NEONXEmitter::VMLAL(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
+void NEONXEmitter::VMLAL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
 {
 	_dbg_assert_msg_(DYNA_REC, Vd >= Q0, "Pass invalid register to " __FUNCTION__);
 	_dbg_assert_msg_(DYNA_REC, Vn >= Q0, "Pass invalid register to " __FUNCTION__);
@ -1699,7 +1699,7 @@ void NEONXEmitter::VMLAL(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
 	Write32((0xF2 << 24) | ((Size & I_UNSIGNED ? 1 : 0) << 24) | (encodedSize(Size) << 20) \
 		| EncodeVn(Vn) | EncodeVd(Vd) | (0x80 << 4) | EncodeVm(Vm));
 }
-void NEONXEmitter::VMLSL(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
+void NEONXEmitter::VMLSL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
 {
 	_dbg_assert_msg_(DYNA_REC, Vd >= Q0, "Pass invalid register to " __FUNCTION__);
 	_dbg_assert_msg_(DYNA_REC, Vn >= Q0, "Pass invalid register to " __FUNCTION__);
@ -1710,23 +1710,404 @@ void NEONXEmitter::VMLSL(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
 	Write32((0xF2 << 24) | ((Size & I_UNSIGNED ? 1 : 0) << 24) | (encodedSize(Size) << 20) \
 		| EncodeVn(Vn) | EncodeVd(Vd) | (0xA0 << 4) | EncodeVm(Vm));
 }
-
-void NEONXEmitter::VSUB(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
+void NEONXEmitter::VMUL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
 {
-	_dbg_assert_msg_(DYNA_REC, Vd >= Q0, "Pass invalid register to VSUB(integer)");
-	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use VSUB(integer) when CPU doesn't support it");
+	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to " __FUNCTION__);
+	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use " __FUNCTION__ " when CPU doesn't support it");

-	// Gets encoded as a double register
+	bool register_quad = Vd >= Q0;
+
+	if (Size & F_32)
+		Write32((0xF3 << 24) | EncodeVn(Vn) | EncodeVd(Vd) | (0xD1 << 4) | (register_quad << 6) | EncodeVm(Vm));
+	else
+		Write32((0xF2 << 24) | ((Size & I_POLYNOMIAL) ? (1 << 24) : 0) | (encodedSize(Size) << 20) | \
+				EncodeVn(Vn) | EncodeVd(Vd) | (0x91 << 4) | (register_quad << 6) | EncodeVm(Vm));
+}
+// Emits the NEON VMULL (vector multiply long) instruction word.
+// Size carries the element-size flag plus the optional I_UNSIGNED or
+// I_POLYNOMIAL modifier; F_32 is rejected by the assert below.
+void NEONXEmitter::VMULL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
+{
+	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to " __FUNCTION__);
+	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use " __FUNCTION__ " when CPU doesn't support it");
+	_dbg_assert_msg_(DYNA_REC, !(Size & F_32), __FUNCTION__ " doesn't support float");
+
+	// U (bit 24) selects the unsigned form. It used to be left clear
+	// unconditionally, so an I_UNSIGNED request produced the signed multiply.
+	// VMLAL/VMLSL in this file place the same flag in the same bit.
+	// The polynomial form (op, bit 9) is only defined with U == 0, so U stays
+	// clear whenever I_POLYNOMIAL is requested.
+	const bool is_polynomial = (Size & I_POLYNOMIAL) != 0;
+	const u32 unsigned_bit = (!is_polynomial && (Size & I_UNSIGNED)) ? (1 << 24) : 0;
+
+	Write32((0xF2 << 24) | unsigned_bit | (1 << 23) | (encodedSize(Size) << 20) | EncodeVn(Vn) | EncodeVd(Vd) | \
+			(0xC0 << 4) | (is_polynomial ? 1 << 9 : 0) | EncodeVm(Vm));
+}
+// Emits the NEON VNEG (vector negate) instruction word.
+// Bit 10 is set for the F_32 element type; bit 6 is set when Vd is a Q register.
+void NEONXEmitter::VNEG(u32 Size, ARMReg Vd, ARMReg Vm)
+{
+	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to " __FUNCTION__);
+	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use " __FUNCTION__ " when CPU doesn't support it");
+
+	// Fixed opcode bits, then the operand-dependent flags.
+	const u32 opcode = (0xF3 << 24) | (0xB << 20) | (1 << 16) | (0xE << 6);
+	const u32 float_bit = (Size & F_32) ? (1 << 10) : 0;
+	const u32 quad_bit = (Vd >= Q0) ? (1 << 6) : 0;
+
+	Write32(opcode | (encodedSize(Size) << 18) | float_bit | quad_bit | EncodeVd(Vd) | EncodeVm(Vm));
+}
+// Emits the NEON VORN (bitwise OR NOT, register form) instruction word.
+// Bit 6 is set when Vd is a Q register.
+void NEONXEmitter::VORN(ARMReg Vd, ARMReg Vn, ARMReg Vm)
+{
+	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to " __FUNCTION__);
+	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use " __FUNCTION__ " when CPU doesn't support it");
+
+	const u32 opcode = (0xF2 << 24) | (3 << 20) | (0x11 << 4);
+	const u32 quad_bit = (Vd >= Q0) ? (1 << 6) : 0;
+
+	Write32(opcode | quad_bit | EncodeVn(Vn) | EncodeVd(Vd) | EncodeVm(Vm));
+}
+// Emits the NEON VORR (bitwise OR, register form) instruction word.
+// Bit 6 is set when Vd is a Q register.
+void NEONXEmitter::VORR(ARMReg Vd, ARMReg Vn, ARMReg Vm)
+{
+	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to " __FUNCTION__);
+	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use " __FUNCTION__ " when CPU doesn't support it");
+
+	const u32 opcode = (0xF2 << 24) | (2 << 20) | (0x11 << 4);
+	const u32 quad_bit = (Vd >= Q0) ? (1 << 6) : 0;
+
+	Write32(opcode | quad_bit | EncodeVn(Vn) | EncodeVd(Vd) | EncodeVm(Vm));
+}
+// Emits the NEON VPADAL (pairwise add and accumulate long) instruction word.
+// Integer only: F_32 trips the assert. Bit 7 is set for I_UNSIGNED and
+// bit 6 when Vd is a Q register.
+void NEONXEmitter::VPADAL(u32 Size, ARMReg Vd, ARMReg Vm)
+{
+	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to " __FUNCTION__);
+	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use " __FUNCTION__ " when CPU doesn't support it");
+	_dbg_assert_msg_(DYNA_REC, !(Size & F_32), __FUNCTION__ " doesn't support float");
+
+	const u32 opcode = (0xF3 << 24) | (0xB << 20) | (0x60 << 4);
+	const u32 unsigned_bit = (Size & I_UNSIGNED) ? (1 << 7) : 0;
+	const u32 quad_bit = (Vd >= Q0) ? (1 << 6) : 0;
+
+	Write32(opcode | (encodedSize(Size) << 18) | unsigned_bit | quad_bit | EncodeVd(Vd) | EncodeVm(Vm));
+}
+// Emits the NEON VPADD (pairwise add) instruction word.
+// F_32 selects the floating-point encoding; anything else selects the
+// integer encoding, which also carries the encoded element size.
+void NEONXEmitter::VPADD(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
+{
+	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to " __FUNCTION__);
+	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use " __FUNCTION__ " when CPU doesn't support it");
+
+	u32 opcode;
+	if (Size & F_32)
+	{
+		opcode = (0xF3 << 24) | (0xD0 << 4);
+	}
+	else
+	{
+		// The element size is only encoded for the integer variant.
+		opcode = (0xF2 << 24) | (encodedSize(Size) << 20) | (0xB1 << 4);
+	}
+
+	Write32(opcode | EncodeVn(Vn) | EncodeVd(Vd) | EncodeVm(Vm));
+}
+// Emits the NEON VPADDL (pairwise add long) instruction word.
+// Integer only: F_32 trips the assert. Bit 7 is set for I_UNSIGNED and
+// bit 6 when Vd is a Q register.
+void NEONXEmitter::VPADDL(u32 Size, ARMReg Vd, ARMReg Vm)
+{
+	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to " __FUNCTION__);
+	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use " __FUNCTION__ " when CPU doesn't support it");
+	_dbg_assert_msg_(DYNA_REC, !(Size & F_32), __FUNCTION__ " doesn't support float");
+
+	const u32 opcode = (0xF3 << 24) | (0xB << 20) | (0x20 << 4);
+	const u32 unsigned_bit = (Size & I_UNSIGNED) ? (1 << 7) : 0;
+	const u32 quad_bit = (Vd >= Q0) ? (1 << 6) : 0;
+
+	Write32(opcode | (encodedSize(Size) << 18) | unsigned_bit | quad_bit | EncodeVd(Vd) | EncodeVm(Vm));
+}
+// Emits the NEON VPMAX (pairwise maximum) instruction word.
+// F_32 selects the floating-point encoding; otherwise the integer encoding
+// is used, with bit 24 set for I_UNSIGNED and the element size in bits 20+.
+void NEONXEmitter::VPMAX(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
+{
+	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to " __FUNCTION__);
+	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use " __FUNCTION__ " when CPU doesn't support it");
+
+	u32 opcode;
+	if (Size & F_32)
+	{
+		opcode = (0xF3 << 24) | (0xF0 << 4);
+	}
+	else
+	{
+		const u32 unsigned_bit = (Size & I_UNSIGNED) ? (1 << 24) : 0;
+		opcode = (0xF2 << 24) | unsigned_bit | (encodedSize(Size) << 20) | (0xA0 << 4);
+	}
+
+	Write32(opcode | EncodeVn(Vn) | EncodeVd(Vd) | EncodeVm(Vm));
+}
+// Emits the NEON VPMIN (pairwise minimum) instruction word.
+// F_32 selects the floating-point encoding (bit 21 set); otherwise the
+// integer encoding is used, with bit 24 set for I_UNSIGNED.
+void NEONXEmitter::VPMIN(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
+{
+	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to " __FUNCTION__);
+	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use " __FUNCTION__ " when CPU doesn't support it");
+
+	u32 opcode;
+	if (Size & F_32)
+	{
+		opcode = (0xF3 << 24) | (1 << 21) | (0xF0 << 4);
+	}
+	else
+	{
+		const u32 unsigned_bit = (Size & I_UNSIGNED) ? (1 << 24) : 0;
+		opcode = (0xF2 << 24) | unsigned_bit | (encodedSize(Size) << 20) | (0xA1 << 4);
+	}
+
+	Write32(opcode | EncodeVn(Vn) | EncodeVd(Vd) | EncodeVm(Vm));
+}
+// Emits the NEON VQABS (saturating absolute value) instruction word.
+// Integer only: F_32 trips the assert. Bit 6 is set when Vd is a Q register.
+void NEONXEmitter::VQABS(u32 Size, ARMReg Vd, ARMReg Vm)
+{
+	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to " __FUNCTION__);
+	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use " __FUNCTION__ " when CPU doesn't support it");
+	_dbg_assert_msg_(DYNA_REC, !(Size & F_32), __FUNCTION__ " doesn't support float");
+
+	const u32 opcode = (0xF3 << 24) | (0xB << 20) | (0x70 << 4);
+	const u32 quad_bit = (Vd >= Q0) ? (1 << 6) : 0;
+
+	Write32(opcode | (encodedSize(Size) << 18) | quad_bit | EncodeVd(Vd) | EncodeVm(Vm));
+}
+// Emits the NEON VQADD (saturating add) instruction word.
+// Integer only: F_32 trips the assert. Size carries the element-size flag
+// plus the optional I_UNSIGNED modifier; bit 6 is set when Vd is a Q register.
+void NEONXEmitter::VQADD(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
+{
+	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to " __FUNCTION__);
+	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use " __FUNCTION__ " when CPU doesn't support it");
+	_dbg_assert_msg_(DYNA_REC, !(Size & F_32), __FUNCTION__ " doesn't support float");
+
+	bool register_quad = Vd >= Q0;
+
+	// U (bit 24) selects unsigned saturation. It was previously never emitted,
+	// so I_UNSIGNED requests were encoded as the signed instruction. VQSUB in
+	// this file places the same flag in the same bit.
+	Write32((0xF2 << 24) | (Size & I_UNSIGNED ? 1 << 24 : 0) | (encodedSize(Size) << 20) | EncodeVn(Vn) | EncodeVd(Vd) | \
+			(0x1 << 4) | (register_quad << 6) | EncodeVm(Vm));
+}
+// Emits the NEON VQDMLAL (saturating doubling multiply accumulate long)
+// instruction word. Integer only: F_32 trips the assert.
+void NEONXEmitter::VQDMLAL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
+{
+	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to " __FUNCTION__);
+	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use " __FUNCTION__ " when CPU doesn't support it");
+	_dbg_assert_msg_(DYNA_REC, !(Size & F_32), __FUNCTION__ " doesn't support float");
+
+	const u32 opcode = (0xF2 << 24) | (1 << 23) | (0x90 << 4);
+
+	Write32(opcode | (encodedSize(Size) << 20) | EncodeVn(Vn) | EncodeVd(Vd) | EncodeVm(Vm));
+}
+// Emits the NEON VQDMLSL (saturating doubling multiply subtract long)
+// instruction word. Integer only: F_32 trips the assert.
+void NEONXEmitter::VQDMLSL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
+{
+	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to " __FUNCTION__);
+	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use " __FUNCTION__ " when CPU doesn't support it");
+	_dbg_assert_msg_(DYNA_REC, !(Size & F_32), __FUNCTION__ " doesn't support float");
+
+	const u32 opcode = (0xF2 << 24) | (1 << 23) | (0xB0 << 4);
+
+	Write32(opcode | (encodedSize(Size) << 20) | EncodeVn(Vn) | EncodeVd(Vd) | EncodeVm(Vm));
+}
+// Emits the NEON VQDMULH (saturating doubling multiply returning high half)
+// instruction word. Integer only: F_32 trips the assert.
+void NEONXEmitter::VQDMULH(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
+{
+	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to " __FUNCTION__);
+	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use " __FUNCTION__ " when CPU doesn't support it");
+	_dbg_assert_msg_(DYNA_REC, !(Size & F_32), __FUNCTION__ " doesn't support float");
+
+	// Q (bit 6) selects the 128-bit form. It was previously never emitted, so
+	// the instruction was always the 64-bit form whatever registers were passed.
+	// The other D/Q-capable emitters in this file derive the bit the same way.
+	bool register_quad = Vd >= Q0;
+
+	Write32((0xF2 << 24) | (encodedSize(Size) << 20) | EncodeVn(Vn) | EncodeVd(Vd) | \
+			(0xB0 << 4) | (register_quad << 6) | EncodeVm(Vm));
+}
+// Emits the NEON VQDMULL (saturating doubling multiply long) instruction word.
+// Integer only: F_32 trips the assert.
+void NEONXEmitter::VQDMULL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
+{
+	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to " __FUNCTION__);
+	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use " __FUNCTION__ " when CPU doesn't support it");
+	_dbg_assert_msg_(DYNA_REC, !(Size & F_32), __FUNCTION__ " doesn't support float");
+
+	const u32 opcode = (0xF2 << 24) | (1 << 23) | (0xD0 << 4);
+
+	Write32(opcode | (encodedSize(Size) << 20) | EncodeVn(Vn) | EncodeVd(Vd) | EncodeVm(Vm));
+}
+// Emits the NEON VQNEG (saturating negate) instruction word.
+// Integer only: F_32 trips the assert. Bit 6 is set when Vd is a Q register.
+void NEONXEmitter::VQNEG(u32 Size, ARMReg Vd, ARMReg Vm)
+{
+	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to " __FUNCTION__);
+	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use " __FUNCTION__ " when CPU doesn't support it");
+	_dbg_assert_msg_(DYNA_REC, !(Size & F_32), __FUNCTION__ " doesn't support float");
+
+	const u32 opcode = (0xF3 << 24) | (0xB << 20) | (0x78 << 4);
+	const u32 quad_bit = (Vd >= Q0) ? (1 << 6) : 0;
+
+	Write32(opcode | (encodedSize(Size) << 18) | quad_bit | EncodeVd(Vd) | EncodeVm(Vm));
+}
+// Emits the NEON VQRDMULH (saturating rounding doubling multiply returning
+// high half) instruction word. Integer only: F_32 trips the assert.
+void NEONXEmitter::VQRDMULH(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
+{
+	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to " __FUNCTION__);
+	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use " __FUNCTION__ " when CPU doesn't support it");
+	_dbg_assert_msg_(DYNA_REC, !(Size & F_32), __FUNCTION__ " doesn't support float");
+
+	// Q (bit 6) selects the 128-bit form. It was previously never emitted, so
+	// the instruction was always the 64-bit form whatever registers were passed.
+	// The other D/Q-capable emitters in this file derive the bit the same way.
+	bool register_quad = Vd >= Q0;
+
+	Write32((0xF3 << 24) | (encodedSize(Size) << 20) | EncodeVn(Vn) | EncodeVd(Vd) | \
+			(0xB0 << 4) | (register_quad << 6) | EncodeVm(Vm));
+}
+// Emits the NEON VQRSHL (saturating rounding shift left) instruction word.
+// Integer only: F_32 trips the assert. Bit 24 is set for I_UNSIGNED and
+// bit 6 when Vd is a Q register.
+void NEONXEmitter::VQRSHL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
+{
+	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to " __FUNCTION__);
+	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use " __FUNCTION__ " when CPU doesn't support it");
+	_dbg_assert_msg_(DYNA_REC, !(Size & F_32), __FUNCTION__ " doesn't support float");
+
+	const u32 opcode = (0xF2 << 24) | (0x51 << 4);
+	const u32 unsigned_bit = (Size & I_UNSIGNED) ? (1 << 24) : 0;
+	const u32 quad_bit = (Vd >= Q0) ? (1 << 6) : 0;
+
+	Write32(opcode | unsigned_bit | (encodedSize(Size) << 20) | quad_bit
+			| EncodeVn(Vn) | EncodeVd(Vd) | EncodeVm(Vm));
+}
+// Emits the NEON VQSHL (saturating shift left, register form) instruction word.
+// Integer only: F_32 trips the assert. Bit 24 is set for I_UNSIGNED and
+// bit 6 when Vd is a Q register.
+void NEONXEmitter::VQSHL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
+{
+	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to " __FUNCTION__);
+	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use " __FUNCTION__ " when CPU doesn't support it");
+	_dbg_assert_msg_(DYNA_REC, !(Size & F_32), __FUNCTION__ " doesn't support float");
+
+	const u32 opcode = (0xF2 << 24) | (0x41 << 4);
+	const u32 unsigned_bit = (Size & I_UNSIGNED) ? (1 << 24) : 0;
+	const u32 quad_bit = (Vd >= Q0) ? (1 << 6) : 0;
+
+	Write32(opcode | unsigned_bit | (encodedSize(Size) << 20) | quad_bit
+			| EncodeVn(Vn) | EncodeVd(Vd) | EncodeVm(Vm));
+}
+// Emits the NEON VQSUB (saturating subtract) instruction word.
+// Integer only: F_32 trips the assert. Bit 24 is set for I_UNSIGNED and
+// bit 6 when Vd is a Q register.
+void NEONXEmitter::VQSUB(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
+{
+	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to " __FUNCTION__);
+	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use " __FUNCTION__ " when CPU doesn't support it");
+	_dbg_assert_msg_(DYNA_REC, !(Size & F_32), __FUNCTION__ " doesn't support float");
+
+	const u32 opcode = (0xF2 << 24) | (0x21 << 4);
+	const u32 unsigned_bit = (Size & I_UNSIGNED) ? (1 << 24) : 0;
+	const u32 quad_bit = (Vd >= Q0) ? (1 << 6) : 0;
+
+	Write32(opcode | unsigned_bit | (encodedSize(Size) << 20) | quad_bit
+			| EncodeVn(Vn) | EncodeVd(Vd) | EncodeVm(Vm));
+}
+// Emits the NEON VRADDHN (rounding add and narrow, returning high half)
+// instruction word. Integer only: F_32 trips the assert.
+void NEONXEmitter::VRADDHN(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
+{
+	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to " __FUNCTION__);
+	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use " __FUNCTION__ " when CPU doesn't support it");
+	_dbg_assert_msg_(DYNA_REC, !(Size & F_32), __FUNCTION__ " doesn't support float");
+
+	const u32 opcode = (0xF3 << 24) | (1 << 23) | (0x40 << 4);
+	// The size field holds one less than the value encodedSize() reports.
+	const u32 size_field = (encodedSize(Size) - 1) << 20;
+
+	Write32(opcode | size_field | EncodeVn(Vn) | EncodeVd(Vd) | EncodeVm(Vm));
+}
+// Emits the NEON VRECPE (reciprocal estimate) instruction word.
+// Bit 8 is set for the F_32 element type; bit 6 is set when Vd is a Q register.
+void NEONXEmitter::VRECPE(u32 Size, ARMReg Vd, ARMReg Vm)
+{
+	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to " __FUNCTION__);
+	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use " __FUNCTION__ " when CPU doesn't support it");
+
+	const u32 opcode = (0xF3 << 24) | (0xB << 20) | (0xB << 16) | (0x40 << 4);
+	const u32 float_bit = (Size & F_32) ? (1 << 8) : 0;
+	const u32 quad_bit = (Vd >= Q0) ? (1 << 6) : 0;
+
+	Write32(opcode | float_bit | quad_bit | EncodeVd(Vd) | EncodeVm(Vm));
+}
+// Emits the NEON VRECPS (reciprocal step) instruction word.
+// Bit 6 is set when Vd is a Q register.
+void NEONXEmitter::VRECPS(ARMReg Vd, ARMReg Vn, ARMReg Vm)
+{
+	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to " __FUNCTION__);
+	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use " __FUNCTION__ " when CPU doesn't support it");
+
+	const u32 opcode = (0xF2 << 24) | (0xF1 << 4);
+	const u32 quad_bit = (Vd >= Q0) ? (1 << 6) : 0;
+
+	Write32(opcode | quad_bit | EncodeVn(Vn) | EncodeVd(Vd) | EncodeVm(Vm));
+}
+// Emits NEON VRHADD Vd, Vn, Vm for the integer element type described by Size.
+void NEONXEmitter::VRHADD(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
+{
+	// __FUNCTION__ is passed through %s: on GCC/Clang it is not a string literal and cannot be concatenated.
+	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to %s", __FUNCTION__);
+	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use %s when CPU doesn't support it", __FUNCTION__);
+	_dbg_assert_msg_(DYNA_REC, !(Size & F_32), "%s doesn't support float", __FUNCTION__);
+
+	// Bit 6 is set when the destination is a quad register.
+	bool register_quad = Vd >= Q0;
+
+	// Bit 24 is set for unsigned element types.
+	Write32((0xF2 << 24) | (Size & I_UNSIGNED ? 1 << 24 : 0) | (encodedSize(Size) << 20) | EncodeVn(Vn) | EncodeVd(Vd) |
+			(0x10 << 4) | (register_quad << 6) | EncodeVm(Vm));
+}
+// Emits NEON VRSHL Vd, Vn, Vm for the integer element type described by Size.
+void NEONXEmitter::VRSHL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
+{
+	// __FUNCTION__ is passed through %s: on GCC/Clang it is not a string literal and cannot be concatenated.
+	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to %s", __FUNCTION__);
+	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use %s when CPU doesn't support it", __FUNCTION__);
+	_dbg_assert_msg_(DYNA_REC, !(Size & F_32), "%s doesn't support float", __FUNCTION__);
+
+	// Bit 6 is set when the destination is a quad register.
+	bool register_quad = Vd >= Q0;
+
+	// Bit 24 is set for unsigned element types.
+	Write32((0xF2 << 24) | (Size & I_UNSIGNED ? 1 << 24 : 0) | (encodedSize(Size) << 20) | EncodeVn(Vn) | EncodeVd(Vd) |
+			(0x50 << 4) | (register_quad << 6) | EncodeVm(Vm));
+}
+// Emits NEON VRSQRTE Vd, Vm; bit 8 is set when Size contains F_32.
+void NEONXEmitter::VRSQRTE(u32 Size, ARMReg Vd, ARMReg Vm)
+{
+	// __FUNCTION__ is passed through %s: on GCC/Clang it is not a string literal and cannot be concatenated.
+	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to %s", __FUNCTION__);
+	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use %s when CPU doesn't support it", __FUNCTION__);
+
+	bool register_quad = Vd >= Q0;
 	Vd = SubBase(Vd);
-	Vn = SubBase(Vn);
 	Vm = SubBase(Vm);

-	Write32((0xF3 << 24) | ((Vd & 0x10) << 18) | (encodedSize(Size) << 20) | ((Vn & 0xF) << 16) \
-		| ((Vd & 0xF) << 12) | (0x8 << 8) | ((Vn & 0x10) << 3) | (1 << 6) \
-		| ((Vm & 0x10) << 1) | (Vm & 0xF));
+	Write32((0xF3 << 24) | (0xB << 20) | ((Vd & 0x10) << 18) | (0xB << 16)
+			| ((Vd & 0xF) << 12) | (9 << 7) | (Size & F_32 ? (1 << 8) : 0) | (register_quad << 6)
+			| ((Vm & 0x10) << 1) | (Vm & 0xF));
 }
+// Emits NEON VRSQRTS Vd, Vn, Vm.
+void NEONXEmitter::VRSQRTS(ARMReg Vd, ARMReg Vn, ARMReg Vm)
+{
+	// __FUNCTION__ is passed through %s: on GCC/Clang it is not a string literal and cannot be concatenated.
+	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to %s", __FUNCTION__);
+	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use %s when CPU doesn't support it", __FUNCTION__);

-void NEONXEmitter::VLD1(NEONElementType Size, ARMReg Vd, ARMReg Rn, NEONAlignment align, ARMReg Rm)
+	bool register_quad = Vd >= Q0;
+
+	Write32((0xF2 << 24) | (1 << 21) | EncodeVn(Vn) | EncodeVd(Vd) |
+			(0xF1 << 4) | (register_quad << 6) | EncodeVm(Vm));
+}
+// Emits NEON VRSUBHN Vd, Vn, Vm; the size field is encodedSize(Size) - 1.
+void NEONXEmitter::VRSUBHN(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
+{
+	// __FUNCTION__ is passed through %s: on GCC/Clang it is not a string literal and cannot be concatenated.
+	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to %s", __FUNCTION__);
+	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use %s when CPU doesn't support it", __FUNCTION__);
+	_dbg_assert_msg_(DYNA_REC, !(Size & F_32), "%s doesn't support float", __FUNCTION__);
+
+	Write32((0xF3 << 24) | (1 << 23) | ((encodedSize(Size) - 1) << 20) | EncodeVn(Vn) | EncodeVd(Vd) |
+			(0x60 << 4) | EncodeVm(Vm));
+}
+// Emits NEON VSHL (register form) Vd, Vn, Vm for the integer element type described by Size.
+void NEONXEmitter::VSHL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
+{
+	// __FUNCTION__ is passed through %s: on GCC/Clang it is not a string literal and cannot be concatenated.
+	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to %s", __FUNCTION__);
+	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use %s when CPU doesn't support it", __FUNCTION__);
+	_dbg_assert_msg_(DYNA_REC, !(Size & F_32), "%s doesn't support float", __FUNCTION__);
+
+	// Bit 6 is set when the destination is a quad register.
+	bool register_quad = Vd >= Q0;
+
+	// Bit 24 is set for unsigned element types.
+	Write32((0xF2 << 24) | (Size & I_UNSIGNED ? 1 << 24 : 0) | (encodedSize(Size) << 20) | EncodeVn(Vn) | EncodeVd(Vd) |
+			(0x40 << 4) | (register_quad << 6) | EncodeVm(Vm));
+}
+// Emits NEON VSUB Vd, Vn, Vm: the float encoding when Size contains F_32,
+// otherwise the integer encoding with the size field from encodedSize(Size).
+void NEONXEmitter::VSUB(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
+{
+	// Double registers are accepted too: the Q bit below is derived from Vd, so the old
+	// "Vd >= Q0" check rejected a case the encoder handles.
+	// __FUNCTION__ is passed through %s: on GCC/Clang it is not a string literal and cannot be concatenated.
+	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to %s", __FUNCTION__);
+	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use %s when CPU doesn't support it", __FUNCTION__);
+
+	// Bit 6 is set when the destination is a quad register.
+	bool register_quad = Vd >= Q0;
+
+	if (Size & F_32)
+		Write32((0xF2 << 24) | (1 << 21) | EncodeVn(Vn) | EncodeVd(Vd) |
+				(0xD0 << 4) | (register_quad << 6) | EncodeVm(Vm));
+	else
+		Write32((0xF3 << 24) | (encodedSize(Size) << 20) | EncodeVn(Vn) | EncodeVd(Vd) |
+				(0x80 << 4) | (register_quad << 6) | EncodeVm(Vm));
+}
+// Emits NEON VSUBHN Vd, Vn, Vm; the size field is encodedSize(Size) - 1.
+void NEONXEmitter::VSUBHN(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
+{
+	// The register check now matches the sibling narrowing emitters VRADDHN/VRSUBHN
+	// (Vd >= D0); "Vd >= Q0" rejected double-register destinations.
+	// __FUNCTION__ is passed through %s: on GCC/Clang it is not a string literal and cannot be concatenated.
+	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to %s", __FUNCTION__);
+	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use %s when CPU doesn't support it", __FUNCTION__);
+
+	Write32((0xF2 << 24) | (1 << 23) | ((encodedSize(Size) - 1) << 20) | EncodeVn(Vn) | EncodeVd(Vd) |
+			(0x60 << 4) | EncodeVm(Vm));
+}
+// Emits NEON VSUBL Vd, Vn, Vm; the destination must be a quad register.
+void NEONXEmitter::VSUBL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
+{
+	// __FUNCTION__ is passed through %s: on GCC/Clang it is not a string literal and cannot be concatenated.
+	_dbg_assert_msg_(DYNA_REC, Vd >= Q0, "Pass invalid register to %s", __FUNCTION__);
+	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use %s when CPU doesn't support it", __FUNCTION__);
+
+	// Bit 24 is set for unsigned element types.
+	Write32((0xF2 << 24) | (Size & I_UNSIGNED ? 1 << 24 : 0) | (1 << 23) | (encodedSize(Size) << 20) | EncodeVn(Vn) | EncodeVd(Vd) |
+			(0x20 << 4) | EncodeVm(Vm));
+}
+// Emits NEON VSUBW Vd, Vn, Vm; the destination must be a quad register.
+void NEONXEmitter::VSUBW(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
+{
+	// __FUNCTION__ is passed through %s: on GCC/Clang it is not a string literal and cannot be concatenated.
+	_dbg_assert_msg_(DYNA_REC, Vd >= Q0, "Pass invalid register to %s", __FUNCTION__);
+	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use %s when CPU doesn't support it", __FUNCTION__);
+
+	// Bit 24 is set for unsigned element types.
+	Write32((0xF2 << 24) | (Size & I_UNSIGNED ? 1 << 24 : 0) | (1 << 23) | (encodedSize(Size) << 20) | EncodeVn(Vn) | EncodeVd(Vd) |
+			(0x30 << 4) | EncodeVm(Vm));
+}
+// Emits NEON VSWP Vd, Vm.
+void NEONXEmitter::VSWP(ARMReg Vd, ARMReg Vm)
+{
+	// Double registers are accepted too: the Q bit below is derived from Vd, so the old
+	// "Vd >= Q0" check rejected a case the encoder handles.
+	// __FUNCTION__ is passed through %s: on GCC/Clang it is not a string literal and cannot be concatenated.
+	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to %s", __FUNCTION__);
+	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use %s when CPU doesn't support it", __FUNCTION__);
+
+	// Bit 6 is set when the destination is a quad register.
+	bool register_quad = Vd >= Q0;
+
+	Write32((0xF3 << 24) | (0xB << 20) | (1 << 17) | EncodeVd(Vd) |
+			(register_quad << 6) | EncodeVm(Vm));
+}
+// Emits NEON VTRN Vd, Vm with the size field from encodedSize(Size).
+void NEONXEmitter::VTRN(u32 Size, ARMReg Vd, ARMReg Vm)
+{
+	// Double registers are accepted too: the Q bit below is derived from Vd, so the old
+	// "Vd >= Q0" check rejected a case the encoder handles.
+	// __FUNCTION__ is passed through %s: on GCC/Clang it is not a string literal and cannot be concatenated.
+	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to %s", __FUNCTION__);
+	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use %s when CPU doesn't support it", __FUNCTION__);
+
+	// Bit 6 is set when the destination is a quad register.
+	bool register_quad = Vd >= Q0;
+
+	Write32((0xF3 << 24) | (0xB << 20) | (encodedSize(Size) << 18) | (1 << 17) | EncodeVd(Vd) |
+			(1 << 7) | (register_quad << 6) | EncodeVm(Vm));
+}
+// Emits NEON VTST Vd, Vn, Vm with the size field from encodedSize(Size).
+void NEONXEmitter::VTST(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
+{
+	// Double registers are accepted too: the Q bit below is derived from Vd, so the old
+	// "Vd >= Q0" check rejected a case the encoder handles.
+	// __FUNCTION__ is passed through %s: on GCC/Clang it is not a string literal and cannot be concatenated.
+	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to %s", __FUNCTION__);
+	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use %s when CPU doesn't support it", __FUNCTION__);
+
+	// Bit 6 is set when the destination is a quad register.
+	bool register_quad = Vd >= Q0;
+
+	Write32((0xF2 << 24) | (encodedSize(Size) << 20) | EncodeVn(Vn) | EncodeVd(Vd) |
+			(0x81 << 4) | (register_quad << 6) | EncodeVm(Vm));
+}
+// Emits NEON VUZP Vd, Vm with the size field from encodedSize(Size).
+void NEONXEmitter::VUZP(u32 Size, ARMReg Vd, ARMReg Vm)
+{
+	// Double registers are accepted too: the Q bit below is derived from Vd, so the old
+	// "Vd >= Q0" check rejected a case the encoder handles.
+	// __FUNCTION__ is passed through %s: on GCC/Clang it is not a string literal and cannot be concatenated.
+	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to %s", __FUNCTION__);
+	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use %s when CPU doesn't support it", __FUNCTION__);
+
+	// Bit 6 is set when the destination is a quad register.
+	bool register_quad = Vd >= Q0;
+
+	Write32((0xF3 << 24) | (0xB << 20) | (encodedSize(Size) << 18) | (1 << 17) | EncodeVd(Vd) |
+			(0x10 << 4) | (register_quad << 6) | EncodeVm(Vm));
+}
+// Emits NEON VZIP Vd, Vm with the size field from encodedSize(Size).
+void NEONXEmitter::VZIP(u32 Size, ARMReg Vd, ARMReg Vm)
+{
+	// Double registers are accepted too: the Q bit below is derived from Vd, so the old
+	// "Vd >= Q0" check rejected a case the encoder handles.
+	// __FUNCTION__ is passed through %s: on GCC/Clang it is not a string literal and cannot be concatenated.
+	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to %s", __FUNCTION__);
+	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use %s when CPU doesn't support it", __FUNCTION__);
+
+	// Bit 6 is set when the destination is a quad register.
+	bool register_quad = Vd >= Q0;
+
+	Write32((0xF3 << 24) | (0xB << 20) | (encodedSize(Size) << 18) | (1 << 17) | EncodeVd(Vd) |
+			(0x18 << 4) | (register_quad << 6) | EncodeVm(Vm));
+}
+void NEONXEmitter::VLD1(u32 Size, ARMReg Vd, ARMReg Rn, NEONAlignment align, ARMReg Rm)
 {
 	u32 spacing = 0x7; // Only support loading to 1 reg
 	// Gets encoded as a double register
@ -1736,8 +2117,7 @@ void NEONXEmitter::VLD1(NEONElementType Size, ARMReg Vd, ARMReg Rn, NEONAlignmen
 			| ((Vd & 0xF) << 12) | (spacing << 8) | (encodedSize(Size) << 6)
 			| (align << 4) | Rm);
 }
-
-void NEONXEmitter::VLD2(NEONElementType Size, ARMReg Vd, ARMReg Rn, NEONAlignment align, ARMReg Rm)
+void NEONXEmitter::VLD2(u32 Size, ARMReg Vd, ARMReg Rn, NEONAlignment align, ARMReg Rm)
 {
 	u32 spacing = 0x8; // Single spaced registers
 	// Gets encoded as a double register
@ -1747,8 +2127,7 @@ void NEONXEmitter::VLD2(NEONElementType Size, ARMReg Vd, ARMReg Rn, NEONAlignmen
 			| ((Vd & 0xF) << 12) | (spacing << 8) | (encodedSize(Size) << 6)
 			| (align << 4) | Rm);
 }
-
-void NEONXEmitter::VST1(NEONElementType Size, ARMReg Vd, ARMReg Rn, NEONAlignment align, ARMReg Rm)
+void NEONXEmitter::VST1(u32 Size, ARMReg Vd, ARMReg Rn, NEONAlignment align, ARMReg Rm)
 {
 	u32 spacing = 0x7; // Single spaced registers
 	// Gets encoded as a double register
@ -1759,8 +2138,7 @@ void NEONXEmitter::VST1(NEONElementType Size, ARMReg Vd, ARMReg Rn, NEONAlignmen
 			| (align << 4) | Rm);
 }

-
-void NEONXEmitter::VREVX(u32 size, NEONElementType Size, ARMReg Vd, ARMReg Vm)
+void NEONXEmitter::VREVX(u32 size, u32 Size, ARMReg Vd, ARMReg Vm)
 {
 	bool register_quad = Vd >= Q0;
 	Vd = SubBase(Vd);
@ -1771,44 +2149,19 @@ void NEONXEmitter::VREVX(u32 size, NEONElementType Size, ARMReg Vd, ARMReg Vm)
 			| (register_quad << 6) | ((Vm & 0x10) << 1) | (Vm & 0xF));
 }

-void NEONXEmitter::VREV64(NEONElementType Size, ARMReg Vd, ARMReg Vm)
+void NEONXEmitter::VREV64(u32 Size, ARMReg Vd, ARMReg Vm)
 {
 	VREVX(0, Size, Vd, Vm);
 }

-void NEONXEmitter::VREV32(NEONElementType Size, ARMReg Vd, ARMReg Vm)
+void NEONXEmitter::VREV32(u32 Size, ARMReg Vd, ARMReg Vm)
 {
 	VREVX(1, Size, Vd, Vm);
 }

-void NEONXEmitter::VREV16(NEONElementType Size, ARMReg Vd, ARMReg Vm)
+void NEONXEmitter::VREV16(u32 Size, ARMReg Vd, ARMReg Vm)
 {
 	VREVX(2, Size, Vd, Vm);
 }
-
-void NEONXEmitter::VRSQRTE(NEONElementType Size, ARMReg Vd, ARMReg Vm)
-{
-	bool register_quad = Vd >= Q0;
-	Vd = SubBase(Vd);
-	Vm = SubBase(Vm);
-
-	Write32((0xF3 << 24) | (0xB << 20) | ((Vd & 0x10) << 18) | (0xB << 16)
-			| ((Vd & 0xF) << 12) | (9 << 7) | (Size & F_32 ? (1 << 8) : 0) | (register_quad << 6)
-			| ((Vm & 0x10) << 1) | (Vm & 0xF));
-}
-
-void NEONXEmitter::VORR(ARMReg Vd, ARMReg Vn, ARMReg Vm)
-{
-	bool register_quad = Vd >= Q0;
-	Vd = SubBase(Vd);
-	Vn = SubBase(Vn);
-	Vm = SubBase(Vm);
-
-	Write32((0xF2 << 24) | (0x1 << 21) | ((Vd & 0x10) << 18) | ((Vn & 0xF) << 16)
-			| ((Vd & 0xF) << 12) | (1 << 8) | ((Vn & 0x10) << 3)
-			| (register_quad << 6) | ((Vm & 0x10) << 1) | (1 << 4) | (Vm & 0xF));
-}
-
-
 }

--- a/Source/Core/Common/Src/ArmEmitter.h
+++ b/Source/Core/Common/Src/ArmEmitter.h
@ -338,6 +338,15 @@ struct LiteralPool
 };

 typedef const u8* JumpTarget;
+// XXX: Stop polluting the global namespace
+// Element-type flags that are OR-ed together and passed as the u32 `Size`
+// argument of the NEONXEmitter methods.
+const u32 I_8 = (1 << 0);
+const u32 I_16 = (1 << 1);
+const u32 I_32 = (1 << 2);
+const u32 I_64 = (1 << 3);
+const u32 I_SIGNED = (1 << 4);
+// Emitters that distinguish signedness set bit 24 of the opcode when this flag is present.
+const u32 I_UNSIGNED = (1 << 5);
+// Float elements; the integer-only emitters assert that this flag is not set.
+const u32 F_32 = (1 << 6);
+const u32 I_POLYNOMIAL = (1 << 7); // Only used in VMUL/VMULL

 u32 EncodeVd(ARMReg Vd);
 u32 EncodeVn(ARMReg Vn);
@ -572,17 +581,6 @@ public:

 };  // class ARMXEmitter

-enum NEONElementType
-{
-	I_8 = (1 << 0),
-	I_16 = (1 << 1),
-	I_32 = (1 << 2),
-	I_64 = (1 << 3),
-	I_SIGNED = (1 << 4),
-	I_UNSIGNED = (1 << 5),
-	F_32 = (1 << 6)
-};
-
 enum NEONAlignment
 {
 	ALIGN_NONE = 0,
@ -613,71 +611,105 @@ private:
 		return 0;
 	}

-	void VREVX(u32 size, NEONElementType Size, ARMReg Vd, ARMReg Vm);
+	void VREVX(u32 size, u32 Size, ARMReg Vd, ARMReg Vm);

-public:
+public:		
 	NEONXEmitter(ARMXEmitter *emit)
 		: _emit(emit)
 	{}

-	void VABA(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
-	void VABAL(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
-	void VABD(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
-	void VABDL(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
-	void VABS(NEONElementType Size, ARMReg Vd, ARMReg Vm);
+	void VABA(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
+	void VABAL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
+	void VABD(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
+	void VABDL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
+	void VABS(u32 Size, ARMReg Vd, ARMReg Vm);
 	void VACGE(ARMReg Vd, ARMReg Vn, ARMReg Vm);
 	void VACGT(ARMReg Vd, ARMReg Vn, ARMReg Vm);
 	void VACLE(ARMReg Vd, ARMReg Vn, ARMReg Vm);
 	void VACLT(ARMReg Vd, ARMReg Vn, ARMReg Vm);
-	void VADD(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
-	void VADDHN(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
-	void VADDL(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
-	void VADDW(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
+	void VADD(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
+	void VADDHN(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
+	void VADDL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
+	void VADDW(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
 	void VAND(ARMReg Vd, ARMReg Vn, ARMReg Vm);
 	void VBIC(ARMReg Vd, ARMReg Vn, ARMReg Vm);
 	void VBIF(ARMReg Vd, ARMReg Vn, ARMReg Vm);
 	void VBIT(ARMReg Vd, ARMReg Vn, ARMReg Vm);
 	void VBSL(ARMReg Vd, ARMReg Vn, ARMReg Vm);
-	void VCEQ(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
-	void VCEQ(NEONElementType Size, ARMReg Vd, ARMReg Vm);
-	void VCGE(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
-	void VCGE(NEONElementType Size, ARMReg Vd, ARMReg Vm);
-	void VCGT(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
-	void VCGT(NEONElementType Size, ARMReg Vd, ARMReg Vm);
-	void VCLE(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
-	void VCLE(NEONElementType Size, ARMReg Vd, ARMReg Vm);
-	void VCLS(NEONElementType Size, ARMReg Vd, ARMReg Vm);
-	void VCLT(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
-	void VCLT(NEONElementType Size, ARMReg Vd, ARMReg Vm);
-	void VCLZ(NEONElementType Size, ARMReg Vd, ARMReg Vm);
-	void VCNT(NEONElementType Size, ARMReg Vd, ARMReg Vm);
-	void VDUP(NEONElementType Size, ARMReg Vd, ARMReg Vm, u8 index);
-	void VDUP(NEONElementType Size, ARMReg Vd, ARMReg Rt);
+	void VCEQ(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
+	void VCEQ(u32 Size, ARMReg Vd, ARMReg Vm);
+	void VCGE(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
+	void VCGE(u32 Size, ARMReg Vd, ARMReg Vm);
+	void VCGT(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
+	void VCGT(u32 Size, ARMReg Vd, ARMReg Vm);
+	void VCLE(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
+	void VCLE(u32 Size, ARMReg Vd, ARMReg Vm);
+	void VCLS(u32 Size, ARMReg Vd, ARMReg Vm);
+	void VCLT(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
+	void VCLT(u32 Size, ARMReg Vd, ARMReg Vm);
+	void VCLZ(u32 Size, ARMReg Vd, ARMReg Vm);
+	void VCNT(u32 Size, ARMReg Vd, ARMReg Vm);
+	void VDUP(u32 Size, ARMReg Vd, ARMReg Vm, u8 index);
+	void VDUP(u32 Size, ARMReg Vd, ARMReg Rt);
 	void VEOR(ARMReg Vd, ARMReg Vn, ARMReg Vm);
 	void VEXT(ARMReg Vd, ARMReg Vn, ARMReg Vm, u8 index);
 	void VFMA(ARMReg Vd, ARMReg Vn, ARMReg Vm);
 	void VFMS(ARMReg Vd, ARMReg Vn, ARMReg Vm);
-	void VHADD(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
-	void VHSUB(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
-	void VMAX(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
-	void VMIN(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
-	void VMLA(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
-	void VMLS(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
-	void VMLAL(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
-	void VMLSL(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
-	void VSUB(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
-	void VREV64(NEONElementType Size, ARMReg Vd, ARMReg Vm);
-	void VREV32(NEONElementType Size, ARMReg Vd, ARMReg Vm);
-	void VREV16(NEONElementType Size, ARMReg Vd, ARMReg Vm);
-
-	void VRSQRTE(NEONElementType Size, ARMReg Vd, ARMReg Vm);
-
+	void VHADD(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
+	void VHSUB(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
+	void VMAX(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
+	void VMIN(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
+	void VMLA(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
+	void VMLS(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
+	void VMLAL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
+	void VMLSL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
+	void VMUL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
+	void VMULL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
+	void VNEG(u32 Size, ARMReg Vd, ARMReg Vm);
+	void VORN(ARMReg Vd, ARMReg Vn, ARMReg Vm);
 	void VORR(ARMReg Vd, ARMReg Vn, ARMReg Vm);
+	void VPADAL(u32 Size, ARMReg Vd, ARMReg Vm);
+	void VPADD(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
+	void VPADDL(u32 Size, ARMReg Vd, ARMReg Vm);
+	void VPMAX(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
+	void VPMIN(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
+	void VQABS(u32 Size, ARMReg Vd, ARMReg Vm);
+	void VQADD(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
+	void VQDMLAL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
+	void VQDMLSL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
+	void VQDMULH(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
+	void VQDMULL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
+	void VQNEG(u32 Size, ARMReg Vd, ARMReg Vm);
+	void VQRDMULH(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
+	void VQRSHL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
+	void VQSHL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
+	void VQSUB(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
+	void VRADDHN(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
+	void VRECPE(u32 Size, ARMReg Vd, ARMReg Vm);
+	void VRECPS(ARMReg Vd, ARMReg Vn, ARMReg Vm);
+	void VRHADD(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
+	void VRSHL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
+	void VRSQRTE(u32 Size, ARMReg Vd, ARMReg Vm);
+	void VRSQRTS(ARMReg Vd, ARMReg Vn, ARMReg Vm);
+	void VRSUBHN(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
+	void VSHL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
+	void VSUB(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
+	void VSUBHN(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
+	void VSUBL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
+	void VSUBW(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
+	void VSWP(ARMReg Vd, ARMReg Vm);
+	void VTRN(u32 Size, ARMReg Vd, ARMReg Vm);
+	void VTST(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
+	void VUZP(u32 Size, ARMReg Vd, ARMReg Vm);
+	void VZIP(u32 Size, ARMReg Vd, ARMReg Vm);
+	void VREV64(u32 Size, ARMReg Vd, ARMReg Vm);
+	void VREV32(u32 Size, ARMReg Vd, ARMReg Vm);
+	void VREV16(u32 Size, ARMReg Vd, ARMReg Vm);

-	void VLD1(NEONElementType Size, ARMReg Vd, ARMReg Rn, NEONAlignment align = ALIGN_NONE, ARMReg Rm = _PC);
-	void VLD2(NEONElementType Size, ARMReg Vd, ARMReg Rn, NEONAlignment align = ALIGN_NONE, ARMReg Rm = _PC);
+	void VLD1(u32 Size, ARMReg Vd, ARMReg Rn, NEONAlignment align = ALIGN_NONE, ARMReg Rm = _PC);
+	void VLD2(u32 Size, ARMReg Vd, ARMReg Rn, NEONAlignment align = ALIGN_NONE, ARMReg Rm = _PC);

-	void VST1(NEONElementType Size, ARMReg Vd, ARMReg Rn, NEONAlignment align = ALIGN_NONE, ARMReg Rm = _PC);
+	void VST1(u32 Size, ARMReg Vd, ARMReg Rn, NEONAlignment align = ALIGN_NONE, ARMReg Rm = _PC);
 };

 // Everything that needs to generate X86 code should inherit from this.
--- a/Source/Core/Common/Src/CPUDetect.h
+++ b/Source/Core/Common/Src/CPUDetect.h
@ -41,7 +41,14 @@ struct CPUInfo
 	bool bLZCNT;
 	bool bSSE4A;
 	bool bAVX;
+	bool bFMA;
 	bool bAES;
+	// FXSAVE/FXRSTOR
+	bool bFXSR;
+	// This flag indicates that the hardware supports some mode
+	// in which denormal inputs _and_ outputs are automatically set to (signed) zero.
+	// TODO: ARM
+	bool bFlushToZero;
 	bool bLAHFSAHF64;
 	bool bLongMode;

--- a/Source/Core/Common/Src/ChunkFile.h
+++ b/Source/Core/Common/Src/ChunkFile.h
@ -193,10 +193,12 @@ public:
 	void DoPointer(T*& x, T* const base)
 	{
 		// pointers can be more than 2^31 apart, but you're using this function wrong if you need that much range
-		s32 offset = x - base;
+		ptrdiff_t offset = x - base;
 		Do(offset);
 		if (mode == MODE_READ)
+		{
 			x = base + offset;
+		}
 	}

 	// Let's pretend std::list doesn't exist!
--- a/Source/Core/Common/Src/CommonFuncs.h
+++ b/Source/Core/Common/Src/CommonFuncs.h
@ -31,7 +31,12 @@ struct ArraySizeImpl : public std::extent<T>
 #define b32(x)  (b16(x) | (b16(x) >>16) )
 #define ROUND_UP_POW2(x)	(b32(x - 1) + 1)

-#if defined __GNUC__ && !defined __SSSE3__ && !defined _M_GENERIC
+#ifndef __GNUC_PREREQ 
+	#define __GNUC_PREREQ(a, b) 0 
+#endif
+
+#if (defined __GNUC__ && !__GNUC_PREREQ(4,9))  \
+	&& !defined __SSSE3__ && !defined _M_GENERIC
 #include <emmintrin.h>
 static __inline __m128i __attribute__((__always_inline__))
 _mm_shuffle_epi8(__m128i a, __m128i mask)
--- a/Source/Core/Common/Src/FPURoundMode.h
+++ b/Source/Core/Common/Src/FPURoundMode.h
@ -36,7 +36,7 @@ namespace FPURoundMode

 	void SetPrecisionMode(u32 mode);

-	void SetSIMDMode(u32 mode);
+	void SetSIMDMode(u32 roundingMode, u32 nonIEEEMode);

 /*
 * There are two different flavors of float to int conversion:
--- a/Source/Core/Common/Src/GenericFPURoundMode.cpp
+++ b/Source/Core/Common/Src/GenericFPURoundMode.cpp
@ -26,7 +26,7 @@ namespace FPURoundMode
 	void SetPrecisionMode(u32 mode)
 	{
 	}
-	void SetSIMDMode(u32 mode)
+	void SetSIMDMode(u32 mode, u32 nonIEEEMode)
 	{
 	}
 	void SaveSIMDState()
--- a/Source/Core/Common/Src/MathUtil.h
+++ b/Source/Core/Common/Src/MathUtil.h
@ -64,10 +64,10 @@ inline float FlushToZero(float f)
 	return x.f;
 }

-inline double FlushToZeroAsFloat(double d)
+inline double FlushToZero(double d)
 {
 	IntDouble x; x.d = d;
-	if ((x.i & DOUBLE_EXP) < 0x3800000000000000ULL)
+	if ((x.i & DOUBLE_EXP) == 0)
 		x.i &= DOUBLE_SIGN;  // turn into signed zero
 	return x.d;
 }
--- a/Source/Core/Common/Src/SDCardUtil.cpp
+++ b/Source/Core/Common/Src/SDCardUtil.cpp
@ -36,6 +36,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include <errno.h>
+#include <cinttypes>

 #ifndef _WIN32
 #include <unistd.h> // for unlink()
@ -196,7 +197,7 @@ bool SDCardCreate(u64 disk_size /*in MB*/, const char* filename)
 	disk_size *= 1024 * 1024;

 	if (disk_size < 0x800000 || disk_size > 0x800000000ULL) {
-		ERROR_LOG(COMMON, "Trying to create SD Card image of size %lliMB is out of range (8MB-32GB)", disk_size/(1024*1024));
+		ERROR_LOG(COMMON, "Trying to create SD Card image of size %" PRIu64 "MB is out of range (8MB-32GB)", disk_size/(1024*1024));
 		return false;
 	}

--- a/Source/Core/Common/Src/StringUtil.cpp
+++ b/Source/Core/Common/Src/StringUtil.cpp
@ -404,26 +404,30 @@ std::string UriEncode(const std::string & sSrc)

 std::string UTF16ToUTF8(const std::wstring& input)
 {
-	auto const size = WideCharToMultiByte(CP_UTF8, 0, input.data(), input.size(), nullptr, 0, nullptr, nullptr);
+	auto const size = WideCharToMultiByte(CP_UTF8, 0, input.data(), (int)input.size(), nullptr, 0, nullptr, nullptr);

 	std::string output;
 	output.resize(size);

-	if (size == 0 || size != WideCharToMultiByte(CP_UTF8, 0, input.data(), input.size(), &output[0], output.size(), nullptr, nullptr))
+	if (size == 0 || size != WideCharToMultiByte(CP_UTF8, 0, input.data(), (int)input.size(), &output[0], (int)output.size(), nullptr, nullptr))
+	{
 		output.clear();
+	}

 	return output;
 }

 std::wstring CPToUTF16(u32 code_page, const std::string& input)
 {
-	auto const size = MultiByteToWideChar(code_page, 0, input.data(), input.size(), nullptr, 0);
+	auto const size = MultiByteToWideChar(code_page, 0, input.data(), (int)input.size(), nullptr, 0);

 	std::wstring output;
 	output.resize(size);

-	if (size == 0 || size != MultiByteToWideChar(code_page, 0, input.data(), input.size(), &output[0], output.size()))
+	if (size == 0 || size != MultiByteToWideChar(code_page, 0, input.data(), (int)input.size(), &output[0], (int)output.size()))
+	{
 		output.clear();
+	}

 	return output;
 }
--- a/Source/Core/Common/Src/SysConf.cpp
+++ b/Source/Core/Common/Src/SysConf.cpp
@ -5,6 +5,8 @@
 #include "FileUtil.h"
 #include "SysConf.h"

+#include <cinttypes>
+
 SysConf::SysConf()
 	: m_IsValid(false)
 {
@ -42,7 +44,7 @@ bool SysConf::LoadFromFile(const char *filename)
 	u64 size = File::GetSize(filename);
 	if (size != SYSCONF_SIZE)
 	{
-		if (AskYesNoT("Your SYSCONF file is the wrong size.\nIt should be 0x%04x (but is 0x%04llx)\nDo you want to generate a new one?",
+		if (AskYesNoT("Your SYSCONF file is the wrong size.\nIt should be 0x%04x (but is 0x%04" PRIx64 ")\nDo you want to generate a new one?",
 					SYSCONF_SIZE, size))
 		{
 			GenerateSysConf();
@ -151,7 +153,7 @@ unsigned int create_item(SSysConfEntry &item, SysconfType type, const std::strin
 {
 	item.offset = offset;
 	item.type = type;
-	item.nameLength = name.length();
+	item.nameLength = (u8)(name.length());
 	strncpy(item.name, name.c_str(), 32);
 	item.dataLength = data_length;
 	item.data = new u8[data_length];
--- a/Source/Core/Common/Src/Thread.h
+++ b/Source/Core/Common/Src/Thread.h
@ -38,7 +38,7 @@ class Event
 public:
 	Event()
 		: is_set(false)
-	{};
+	{}

 	void Set()
 	{
@ -53,34 +53,20 @@ public:
 	void Wait()
 	{
 		std::unique_lock<std::mutex> lk(m_mutex);
-		m_condvar.wait(lk, IsSet(this));
+		m_condvar.wait(lk, [&]{ return is_set; });
 		is_set = false;
 	}

 	void Reset()
 	{
 		std::unique_lock<std::mutex> lk(m_mutex);
-		// no other action required, since wait loops on the predicate and any lingering signal will get cleared on the first iteration
+		// no other action required, since wait loops on 
+		// the predicate and any lingering signal will get 
+		// cleared on the first iteration
 		is_set = false;
 	}

 private:
-	class IsSet
-	{
-	public:
-		IsSet(const Event* ev)
-			: m_event(ev)
-		{}
-
-		bool operator()()
-		{
-			return m_event->is_set;
-		}
-
-	private:
-		const Event* const m_event;
-	};
-
 	volatile bool is_set;
 	std::condition_variable m_condvar;
 	std::mutex m_mutex;
@ -110,28 +96,12 @@ public:
 		}
 		else
 		{
-			m_condvar.wait(lk, IsDoneWating(this));
+			m_condvar.wait(lk, [&]{ return (0 == m_waiting); });
 			return false;
 		}
 	}

 private:
-	class IsDoneWating
-	{
-	public:
-		IsDoneWating(const Barrier* bar)
-			: m_bar(bar)
-		{}
-
-		bool operator()()
-		{
-			return (0 == m_bar->m_waiting);
-		}
-
-	private:
-		const Barrier* const m_bar;
-	};
-
 	std::condition_variable m_condvar;
 	std::mutex m_mutex;
 	const size_t m_count;
--- a/Source/Core/Common/Src/x64CPUDetect.cpp
+++ b/Source/Core/Common/Src/x64CPUDetect.cpp
@ -162,6 +162,34 @@ void CPUInfo::Detect()
 		if ((cpu_id[2] >> 20) & 1) bSSE4_2 = true;
 		if ((cpu_id[2] >> 25) & 1) bAES = true;

+		// To check DAZ support, we first need to check FXSAVE support.
+		if ((cpu_id[3] >> 24) & 1)
+		{
+			// We can use FXSAVE.
+			bFXSR = true;
+
+			GC_ALIGNED16(u8 fx_state[512]);
+			memset(fx_state, 0, sizeof(fx_state));
+#ifdef _WIN32
+#ifdef _M_IX86
+			_fxsave(fx_state);
+#elif defined (_M_X64)
+			_fxsave64(fx_state);
+#endif
+#else
+			__asm__("fxsave %0" : "=m" (fx_state));
+#endif
+
+			// lowest byte of MXCSR_MASK
+			if ((fx_state[0x1C] >> 6) & 1)
+			{
+				// On x86, the FTZ field (supported since SSE1) only flushes denormal _outputs_ to zero,
+				// now that we checked DAZ support (flushing denormal _inputs_ to zero),
+				// we can set our generic flag.
+				bFlushToZero = true;
+			}
+		}
+
 		// AVX support requires 3 separate checks:
 		//  - Is the AVX bit set in CPUID?
 		//  - Is the XSAVE bit set in CPUID?
@ -169,7 +197,11 @@ void CPUInfo::Detect()
 		if (((cpu_id[2] >> 28) & 1) && ((cpu_id[2] >> 27) & 1))
 		{
 			if ((_xgetbv(_XCR_XFEATURE_ENABLED_MASK) & 0x6) == 0x6)
+			{
 				bAVX = true;
+				if ((cpu_id[2] >> 12) & 1)
+					bFMA = true;
+			}
 		}
 	}
 	if (max_ex_fn >= 0x80000004) {
@ -218,13 +250,19 @@ std::string CPUInfo::Summarize()
 {
 	std::string sum(cpu_string);
 	if (bSSE) sum += ", SSE";
-	if (bSSE2) sum += ", SSE2";
+	if (bSSE2)
+	{
+		sum += ", SSE2";
+		if (!bFlushToZero)
+			sum += " (but not DAZ!)";
+	}
 	if (bSSE3) sum += ", SSE3";
 	if (bSSSE3) sum += ", SSSE3";
 	if (bSSE4_1) sum += ", SSE4.1";
 	if (bSSE4_2) sum += ", SSE4.2";
 	if (HTT) sum += ", HTT";
 	if (bAVX) sum += ", AVX";
+	if (bFMA) sum += ", FMA";
 	if (bAES) sum += ", AES";
 	if (bLongMode) sum += ", 64-bit support";
 	return sum;
--- a/Source/Core/Common/Src/x64Emitter.cpp
+++ b/Source/Core/Common/Src/x64Emitter.cpp
@ -7,6 +7,8 @@
 #include "x64ABI.h"
 #include "CPUDetect.h"

+#include <cinttypes>
+
 namespace Gen
 {

@ -154,6 +156,40 @@ void OpArg::WriteRex(XEmitter *emit, int opBits, int bits, int customOp) const
 #endif
 }

+// Emits the VEX prefix for an AVX instruction whose memory/register operand is this OpArg.
+// The two-byte form (0xC5) is used when no field exclusive to the three-byte form is
+// needed; otherwise the three-byte form (0xC4) is written.
+//   size   - operand size in bits: 64 selects the double-precision prefixes, 256 sets VEX.L
+//   packed - non-zero for packed operations, zero for scalar ones
+//   regOp1 - register whose bit 3 supplies VEX.R
+//   regOp2 - register stored (inverted) in VEX.vvvv, or INVALID_REG when there is none
+void OpArg::WriteVex(XEmitter* emit, int size, int packed, Gen::X64Reg regOp1, Gen::X64Reg regOp2) const
+{
+	// R, X and B are stored inverted in the prefix.
+	int R = !(regOp1 & 8);
+	int X = !(indexReg & 8);
+	int B = !(offsetOrBaseReg & 8);
+
+	// VEX.W is always emitted as 0 here (the original author was unsure about it;
+	// revisit when adding opcodes whose meaning depends on W).
+	int W = 0;
+
+	// aka map_select in AMD manuals
+	// only support VEX opcode map 1 for now (analog to secondary opcode map)
+	int mmmmm = 1;
+
+	int vvvv = (regOp2 == X64Reg::INVALID_REG) ? 0xf : (regOp2 ^ 0xf);
+	int L = size == 256;
+
+	// VEX.pp stands in for the legacy SIMD prefix: 0 = none, 1 = 0x66, 2 = 0xF3, 3 = 0xF2.
+	// Packed single/double therefore need 0/1 and scalar single/double need 2/3, so the
+	// high bit is set for the *scalar* forms (it was previously set for the packed ones).
+	int pp = ((packed ? 0 : 1) << 1) | (size == 64);
+
+	// do we need any VEX fields that only appear in the three-byte form?
+	if (X == 1 && B == 1 && W == 0 && mmmmm == 1)
+	{
+		// Layout: R(7) vvvv(6:3) L(2) pp(1:0). L sits at bit 2; shifting it by 1 would clobber pp.
+		u8 RvvvvLpp = (R << 7) | (vvvv << 3) | (L << 2) | pp;
+		emit->Write8(0xC5);
+		emit->Write8(RvvvvLpp);
+	}
+	else
+	{
+		u8 RXBmmmmm = (R << 7) | (X << 6) | (B << 5) | mmmmm;
+		// Layout: W(7) vvvv(6:3) L(2) pp(1:0).
+		u8 WvvvvLpp = (W << 7) | (vvvv << 3) | (L << 2) | pp;
+		emit->Write8(0xC4);
+		emit->Write8(RXBmmmmm);
+		emit->Write8(WvvvvLpp);
+	}
+}
+
 void OpArg::WriteRest(XEmitter *emit, int extraBytes, X64Reg _operandReg,
 	bool warn_64bit_offset) const
 {
@ -176,7 +212,7 @@ void OpArg::WriteRest(XEmitter *emit, int extraBytes, X64Reg _operandReg,
 		_assert_msg_(DYNA_REC, (distance < 0x80000000LL
 					&& distance >=  -0x80000000LL) ||
 			     !warn_64bit_offset,
-			     "WriteRest: op out of range (0x%llx uses 0x%llx)",
+			     "WriteRest: op out of range (0x%" PRIx64 " uses 0x%" PRIx64 ")",
 			     ripAddr, offset);
 		s32 offs = (s32)distance;
 		emit->Write32((u32)offs);
@ -1141,6 +1177,18 @@ void XEmitter::WriteSSEOp(int size, u8 sseOp, bool packed, X64Reg regOp, OpArg a
 	arg.WriteRest(this, extrabytes);
 }

+// Convenience overload for AVX ops without a second register operand: forwards to the
+// three-operand overload with INVALID_REG in that slot.
+void XEmitter::WriteAVXOp(int size, u8 sseOp, bool packed, X64Reg regOp, OpArg arg, int extrabytes)
+{
+	const X64Reg noSecondRegister = X64Reg::INVALID_REG;
+	WriteAVXOp(size, sseOp, packed, regOp, noSecondRegister, arg, extrabytes);
+}
+
+// Emits one VEX-encoded instruction. The three writes below must stay in this order:
+// the VEX prefix built by arg.WriteVex, then the opcode byte sseOp, then whatever
+// arg.WriteRest emits for the operand (called with regOp1 as the operand register).
+// extrabytes is forwarded unchanged to WriteRest.
+void XEmitter::WriteAVXOp(int size, u8 sseOp, bool packed, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes)
+{
+	arg.WriteVex(this, size, packed, regOp1, regOp2);
+	Write8(sseOp);
+	arg.WriteRest(this, extrabytes, regOp1);
+}
+
 void XEmitter::MOVD_xmm(X64Reg dest, const OpArg &arg) {WriteSSEOp(64, 0x6E, true, dest, arg, 0);}
 void XEmitter::MOVD_xmm(const OpArg &arg, X64Reg src) {WriteSSEOp(64, 0x7E, true, src, arg, 0);}

@ -1444,6 +1492,13 @@ void XEmitter::PMOVMSKB(X64Reg dest, OpArg arg)    {WriteSSEOp(64, 0xD7, true, d

 void XEmitter::PSHUFLW(X64Reg regOp, OpArg arg, u8 shuffle)   {WriteSSEOp(64, 0x70, false, regOp, arg, 1); Write8(shuffle);}

+// VEX
+// Each of these forwards to WriteAVXOp with size 64 and packed == false; only the
+// opcode constant differs.
+void XEmitter::VADDSD(X64Reg regOp1, X64Reg regOp2, OpArg arg)
+{
+	WriteAVXOp(64, sseADD, false, regOp1, regOp2, arg);
+}
+void XEmitter::VSUBSD(X64Reg regOp1, X64Reg regOp2, OpArg arg)
+{
+	WriteAVXOp(64, sseSUB, false, regOp1, regOp2, arg);
+}
+void XEmitter::VMULSD(X64Reg regOp1, X64Reg regOp2, OpArg arg)
+{
+	WriteAVXOp(64, sseMUL, false, regOp1, regOp2, arg);
+}
+void XEmitter::VDIVSD(X64Reg regOp1, X64Reg regOp2, OpArg arg)
+{
+	WriteAVXOp(64, sseDIV, false, regOp1, regOp2, arg);
+}
+void XEmitter::VSQRTSD(X64Reg regOp1, X64Reg regOp2, OpArg arg)
+{
+	WriteAVXOp(64, sseSQRT, false, regOp1, regOp2, arg);
+}
+
 // Prefixes

 void XEmitter::LOCK()  { Write8(0xF0); }
--- a/Source/Core/Common/Src/x64Emitter.h
+++ b/Source/Core/Common/Src/x64Emitter.h
@ -33,6 +33,9 @@ enum X64Reg
 	XMM0=0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
 	XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15,

+	YMM0=0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
+	YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15,
+
 	INVALID_REG = 0xFFFFFFFF
 };

@ -111,6 +114,7 @@ struct OpArg
 		offset = _offset;
 	}
 	void WriteRex(XEmitter *emit, int opBits, int bits, int customOp = -1) const;
+	void WriteVex(XEmitter* emit, int size, int packed, Gen::X64Reg regOp1, X64Reg regOp2) const;
 	void WriteRest(XEmitter *emit, int extraBytes=0, X64Reg operandReg=(X64Reg)0xFF, bool warn_64bit_offset = true) const;
 	void WriteSingleByteOp(XEmitter *emit, u8 op, X64Reg operandReg, int bits);
 	// This one is public - must be written to
@ -239,6 +243,8 @@ private:
 	void WriteBitTest(int bits, OpArg &dest, OpArg &index, int ext);
 	void WriteMXCSR(OpArg arg, int ext);
 	void WriteSSEOp(int size, u8 sseOp, bool packed, X64Reg regOp, OpArg arg, int extrabytes = 0);
+	void WriteAVXOp(int size, u8 sseOp, bool packed, X64Reg regOp, OpArg arg, int extrabytes = 0);
+	void WriteAVXOp(int size, u8 sseOp, bool packed, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes = 0);
 	void WriteNormalOp(XEmitter *emit, int bits, NormalOp op, const OpArg &a1, const OpArg &a2);

 protected:
@ -616,6 +622,13 @@ public:
 	void PSRAW(X64Reg reg, int shift);
 	void PSRAD(X64Reg reg, int shift);

+	// AVX
+	void VADDSD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VSUBSD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VMULSD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VDIVSD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VSQRTSD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+
 	void RTDSC();

 	// Utility functions
--- a/Source/Core/Common/Src/x64FPURoundMode.cpp
+++ b/Source/Core/Common/Src/x64FPURoundMode.cpp
@ -4,6 +4,7 @@

 #include "Common.h"
 #include "FPURoundMode.h"
+#include "CPUDetect.h"

 #ifndef _WIN32
 static const unsigned short FPU_ROUND_NEAR = 0 << 10;
@ -14,8 +15,11 @@ static const unsigned short FPU_ROUND_MASK = 3 << 10;
 #include <xmmintrin.h>
 #endif

-const u32 MASKS = 0x1F80;  // mask away the interrupts.
+// OR-mask for disabling FPU exceptions (bits 7-12 in the MXCSR register)
+const u32 EXCEPTION_MASK = 0x1F80;
+// Denormals-Are-Zero (non-IEEE mode: denormal inputs are set to +/- 0)
 const u32 DAZ = 0x40;
+// Flush-To-Zero (non-IEEE mode: denormal outputs are set to +/- 0)
 const u32 FTZ = 0x8000;

 namespace FPURoundMode
@ -79,16 +83,28 @@ namespace FPURoundMode
 			//but still - set any useful sse options here
 		#endif
 	}
-	void SetSIMDMode(u32 mode)
+
+	void SetSIMDMode(u32 roundingMode, u32 nonIEEEMode) // builds an MXCSR value from a rounding-mode index and a non-IEEE flag, then loads it with _mm_setcsr
 	{
-		static const u32 ssetable[4] =
+		// lookup table for FPSCR.RN-to-MXCSR.RC translation
+		static const u32 roundingModeLUT[4] =
 		{
-			(0 << 13) | MASKS,
-			(3 << 13) | MASKS,
-			(2 << 13) | MASKS,
-			(1 << 13) | MASKS,
+			(0 << 13) | EXCEPTION_MASK, // nearest
+			(3 << 13) | EXCEPTION_MASK, // -inf
+			(2 << 13) | EXCEPTION_MASK, // +inf
+			(1 << 13) | EXCEPTION_MASK, // zero
 		};
-		u32 csr = ssetable[mode];
+		u32 csr = roundingModeLUT[roundingMode]; // NOTE(review): unchecked index - roundingMode must be in 0-3
+
+		static const u32 denormalLUT[2] =
+		{
+			FTZ,       // flush-to-zero only
+			FTZ | DAZ, // flush-to-zero and denormals-are-zero (may not be supported)
+		};
+		if (nonIEEEMode) // when zero, neither FTZ nor DAZ is set: the rounding table entries contain only the RC field and EXCEPTION_MASK
+		{
+			csr |= denormalLUT[cpu_info.bFlushToZero]; // entry 1 (FTZ | DAZ) is chosen when cpu_info.bFlushToZero is set - presumably a bool CPU-capability flag; confirm in CPUDetect.h
+		}
 		_mm_setcsr(csr);
 	}

--- a/Source/Core/Core/Core.vcxproj.filters
+++ b/Source/Core/Core/Core.vcxproj.filters
@ -133,6 +133,9 @@
    <Filter Include="IPC HLE %28IOS/Starlet%29\USB/BT/Wiimote">
      <UniqueIdentifier>{8352be4d-d37d-4f55-adec-b940a9712802}</UniqueIdentifier>
    </Filter>
+    <Filter Include="PowerPC\JitILCommon">
+      <UniqueIdentifier>{827afa93-1a80-4835-93ae-b5516d95867f}</UniqueIdentifier>
+    </Filter>
  </ItemGroup>
  <ItemGroup>
    <ClCompile Include="Src\BootManager.cpp" />
@ -636,39 +639,12 @@
    <ClCompile Include="Src\PowerPC\JitCommon\JitCache.cpp">
      <Filter>PowerPC\JitCommon</Filter>
    </ClCompile>
-    <ClCompile Include="Src\PowerPC\Jit64IL\IR.cpp">
-      <Filter>PowerPC\JitIL</Filter>
-    </ClCompile>
    <ClCompile Include="Src\PowerPC\Jit64IL\IR_X86.cpp">
      <Filter>PowerPC\JitIL</Filter>
    </ClCompile>
    <ClCompile Include="Src\PowerPC\Jit64IL\JitIL.cpp">
      <Filter>PowerPC\JitIL</Filter>
    </ClCompile>
-    <ClCompile Include="Src\PowerPC\Jit64IL\JitIL_Branch.cpp">
-      <Filter>PowerPC\JitIL</Filter>
-    </ClCompile>
-    <ClCompile Include="Src\PowerPC\Jit64IL\JitIL_FloatingPoint.cpp">
-      <Filter>PowerPC\JitIL</Filter>
-    </ClCompile>
-    <ClCompile Include="Src\PowerPC\Jit64IL\JitIL_Integer.cpp">
-      <Filter>PowerPC\JitIL</Filter>
-    </ClCompile>
-    <ClCompile Include="Src\PowerPC\Jit64IL\JitIL_LoadStore.cpp">
-      <Filter>PowerPC\JitIL</Filter>
-    </ClCompile>
-    <ClCompile Include="Src\PowerPC\Jit64IL\JitIL_LoadStoreFloating.cpp">
-      <Filter>PowerPC\JitIL</Filter>
-    </ClCompile>
-    <ClCompile Include="Src\PowerPC\Jit64IL\JitIL_LoadStorePaired.cpp">
-      <Filter>PowerPC\JitIL</Filter>
-    </ClCompile>
-    <ClCompile Include="Src\PowerPC\Jit64IL\JitIL_Paired.cpp">
-      <Filter>PowerPC\JitIL</Filter>
-    </ClCompile>
-    <ClCompile Include="Src\PowerPC\Jit64IL\JitIL_SystemRegisters.cpp">
-      <Filter>PowerPC\JitIL</Filter>
-    </ClCompile>
    <ClCompile Include="Src\PowerPC\Jit64IL\JitIL_Tables.cpp">
      <Filter>PowerPC\JitIL</Filter>
    </ClCompile>
@ -706,6 +682,33 @@
      <Filter>PowerPC\Jit64</Filter>
    </ClCompile>
    <ClCompile Include="Src\stdafx.cpp" />
+    <ClCompile Include="Src\PowerPC\JitILCommon\JitILBase_Branch.cpp">
+      <Filter>PowerPC\JitILCommon</Filter>
+    </ClCompile>
+    <ClCompile Include="Src\PowerPC\JitILCommon\JitILBase_FloatingPoint.cpp">
+      <Filter>PowerPC\JitILCommon</Filter>
+    </ClCompile>
+    <ClCompile Include="Src\PowerPC\JitILCommon\JitILBase_Integer.cpp">
+      <Filter>PowerPC\JitILCommon</Filter>
+    </ClCompile>
+    <ClCompile Include="Src\PowerPC\JitILCommon\JitILBase_LoadStore.cpp">
+      <Filter>PowerPC\JitILCommon</Filter>
+    </ClCompile>
+    <ClCompile Include="Src\PowerPC\JitILCommon\JitILBase_LoadStoreFloating.cpp">
+      <Filter>PowerPC\JitILCommon</Filter>
+    </ClCompile>
+    <ClCompile Include="Src\PowerPC\JitILCommon\JitILBase_LoadStorePaired.cpp">
+      <Filter>PowerPC\JitILCommon</Filter>
+    </ClCompile>
+    <ClCompile Include="Src\PowerPC\JitILCommon\JitILBase_Paired.cpp">
+      <Filter>PowerPC\JitILCommon</Filter>
+    </ClCompile>
+    <ClCompile Include="Src\PowerPC\JitILCommon\JitILBase_SystemRegisters.cpp">
+      <Filter>PowerPC\JitILCommon</Filter>
+    </ClCompile>
+    <ClCompile Include="Src\PowerPC\JitILCommon\IR.cpp">
+      <Filter>PowerPC\JitILCommon</Filter>
+    </ClCompile>
  </ItemGroup>
  <ItemGroup>
    <ClInclude Include="Src\BootManager.h" />
@ -1190,9 +1193,6 @@
    <ClInclude Include="Src\PowerPC\JitCommon\JitCache.h">
      <Filter>PowerPC\JitCommon</Filter>
    </ClInclude>
-    <ClInclude Include="Src\PowerPC\Jit64IL\IR.h">
-      <Filter>PowerPC\JitIL</Filter>
-    </ClInclude>
    <ClInclude Include="Src\PowerPC\Jit64IL\JitIL.h">
      <Filter>PowerPC\JitIL</Filter>
    </ClInclude>
@ -1209,6 +1209,12 @@
      <Filter>PowerPC\Jit64</Filter>
    </ClInclude>
    <ClInclude Include="Src\stdafx.h" />
+    <ClInclude Include="Src\PowerPC\JitILCommon\JitILBase.h">
+      <Filter>PowerPC\JitILCommon</Filter>
+    </ClInclude>
+    <ClInclude Include="Src\PowerPC\JitILCommon\IR.h">
+      <Filter>PowerPC\JitILCommon</Filter>
+    </ClInclude>
  </ItemGroup>
  <ItemGroup>
    <Text Include="CMakeLists.txt" />
--- a/Source/Core/Core/Src/Console.cpp
+++ b/Source/Core/Core/Src/Console.cpp
@ -17,8 +17,8 @@
 #include "PowerPCDisasm.h"
 #include "Console.h"

-#define CASE1(x) if (memcmp(cmd, x, 2*sizeof(TCHAR))==0)
-#define CASE(x) else if (memcmp(cmd, x, 4*sizeof(TCHAR))==0)
+#define CASE1(x) if (!strcmp(cmd, (x)))
+#define CASE(x) else if (!strcmp(cmd, (x)))

 void Console_Submit(const char *cmd)
 {
@ -27,7 +27,7 @@ void Console_Submit(const char *cmd)
 		Core::StartTrace(false);
 		INFO_LOG(CONSOLE, "Read tracing started.");
 	}
-	CASE1("w")
+	CASE("w")
 	{
 		Core::StartTrace(true);
 		INFO_LOG(CONSOLE, "Write tracing started.");
@ -141,3 +141,6 @@ void Console_Submit(const char *cmd)
 		ERROR_LOG(CONSOLE, "Invalid command");
 	}
 }
+
+#undef CASE1
+#undef CASE
--- a/Source/Core/Core/Src/Core.cpp
+++ b/Source/Core/Core/Src/Core.cpp
@ -388,8 +388,8 @@ void EmuThread()

 	OSD::AddMessage("Dolphin " + g_video_backend->GetName() + " Video Backend.", 5000);

-	if (!DSP::GetDSPEmulator()->Initialize(_CoreParameter.bWii,
-		_CoreParameter.bDSPThread))
+	if (!DSP::GetDSPEmulator()->Initialize(g_pWindowHandle,
+		_CoreParameter.bWii, _CoreParameter.bDSPThread))
 	{
 		HW::Shutdown();
 		g_video_backend->Shutdown();
--- a/Source/Core/Core/Src/CoreParameter.cpp
+++ b/Source/Core/Core/Src/CoreParameter.cpp
@ -18,6 +18,8 @@
 #include "Core.h" // for bWii
 #include "FifoPlayer/FifoDataFile.h"

+#include <cinttypes>
+
 SCoreStartupParameter::SCoreStartupParameter()
 : hInstance(0),
  bEnableDebugging(false), bAutomaticStart(false), bBootToPause(false),
@ -278,7 +280,7 @@ bool SCoreStartupParameter::AutoSetup(EBootBS2 _BootBS2)
 				// Use the TitleIDhex for name and/or unique ID if launching from nand folder
 				// or if it is not ascii characters (specifically sysmenu could potentially apply to other things)
 				char titleidstr[17];
-				snprintf(titleidstr, 17, "%016llx", ContentLoader.GetTitleID());
+				snprintf(titleidstr, 17, "%016" PRIx64, ContentLoader.GetTitleID());

 				if (!m_strName.length())
 				{
--- a/Source/Core/Core/Src/CoreTiming.cpp
+++ b/Source/Core/Core/Src/CoreTiming.cpp
@ -3,6 +3,7 @@
 // Refer to the license.txt file included.

 #include <vector>
+#include <cinttypes>

 #include "Thread.h"
 #include "PowerPC/PowerPC.h"
@ -429,7 +430,7 @@ void LogPendingEvents()
 	Event *ptr = first;
 	while (ptr)
 	{
-		INFO_LOG(POWERPC, "PENDING: Now: %lld Pending: %lld Type: %d", globalTimer, ptr->time, ptr->type);
+		INFO_LOG(POWERPC, "PENDING: Now: %" PRId64 " Pending: %" PRId64 " Type: %d", globalTimer, ptr->time, ptr->type);
 		ptr = ptr->next;
 	}
 }
--- a/Source/Core/Core/Src/DSPEmulator.h
+++ b/Source/Core/Core/Src/DSPEmulator.h
@ -15,7 +15,7 @@ public:

 	virtual bool IsLLE() = 0;

-	virtual bool Initialize(bool bWii, bool bDSPThread) = 0;
+	virtual bool Initialize(void *hWnd, bool bWii, bool bDSPThread) = 0;
 	virtual void Shutdown() = 0;

 	virtual void DoState(PointerWrap &p) = 0;
@ -35,6 +35,7 @@ public:

 protected:
 	SoundStream *soundStream;
+	void *m_hWnd;
 };

 DSPEmulator *CreateDSPEmulator(bool HLE);
--- a/Source/Core/Core/Src/FifoPlayer/FifoDataFile.cpp
+++ b/Source/Core/Core/Src/FifoPlayer/FifoDataFile.cpp
@ -86,7 +86,7 @@ bool FifoDataFile::Save(const char *filename)
 	header.xfRegsSize = XF_REGS_SIZE;

 	header.frameListOffset = frameListOffset;
-	header.frameCount = m_Frames.size();
+	header.frameCount = (u32)m_Frames.size();

 	header.flags = m_Flags;

@ -111,7 +111,7 @@ bool FifoDataFile::Save(const char *filename)
 		dstFrame.fifoStart = srcFrame.fifoStart;
 		dstFrame.fifoEnd = srcFrame.fifoEnd;
 		dstFrame.memoryUpdatesOffset = memoryUpdatesOffset;
-		dstFrame.numMemoryUpdates = srcFrame.memoryUpdates.size();
+		dstFrame.numMemoryUpdates = (u32)srcFrame.memoryUpdates.size();

 		// Write frame info
 		u64 frameOffset = frameListOffset + (i * sizeof(FileFrameInfo));
--- a/Source/Core/Core/Src/FifoPlayer/FifoPlaybackAnalyzer.cpp
+++ b/Source/Core/Core/Src/FifoPlayer/FifoPlaybackAnalyzer.cpp
@ -234,7 +234,7 @@ u32 FifoPlaybackAnalyzer::DecodeCommand(u8 *data)
 		break;
 	}

-	return data - dataStart;
+	return (u32)(data - dataStart);
 }

 void FifoPlaybackAnalyzer::StoreEfbCopyRegion()
--- a/Source/Core/Core/Src/FifoPlayer/FifoPlayer.cpp
+++ b/Source/Core/Core/Src/FifoPlayer/FifoPlayer.cpp
@ -105,7 +105,9 @@ bool FifoPlayer::Play()
 u32 FifoPlayer::GetFrameObjectCount()
 {
 	if (m_CurrentFrame < m_FrameInfo.size())
-		return m_FrameInfo[m_CurrentFrame].objectStarts.size();
+	{
+		return (u32)(m_FrameInfo[m_CurrentFrame].objectStarts.size());
+	}

 	return 0;
 }
@ -172,7 +174,7 @@ void FifoPlayer::WriteFrame(const FifoFrameInfo &frame, const AnalyzedFrameInfo
 	m_FrameFifoSize = frame.fifoDataSize;

 	// Determine start and end objects
-	u32 numObjects = info.objectStarts.size();
+	u32 numObjects = (u32)(info.objectStarts.size());
 	u32 drawStart = std::min(numObjects, m_ObjectRangeStart);
 	u32 drawEnd = std::min(numObjects - 1, m_ObjectRangeEnd);

@ -181,7 +183,9 @@ void FifoPlayer::WriteFrame(const FifoFrameInfo &frame, const AnalyzedFrameInfo

 	// Skip memory updates during frame if true
 	if (m_EarlyMemoryUpdates)
-		memoryUpdate = frame.memoryUpdates.size();
+	{
+		memoryUpdate = (u32)(frame.memoryUpdates.size());
+	}

 	if (numObjects > 0)
 	{
--- a/Source/Core/Core/Src/FifoPlayer/FifoRecorder.cpp
+++ b/Source/Core/Core/Src/FifoPlayer/FifoRecorder.cpp
@ -83,9 +83,9 @@ void FifoRecorder::WriteGPCommand(u8 *data, u32 size)
 	if (m_FrameEnded && m_FifoData.size() > 0)
 	{
 		size_t dataSize = m_FifoData.size();
-		m_CurrentFrame.fifoDataSize = dataSize;
+		m_CurrentFrame.fifoDataSize = (u32)dataSize;
 		m_CurrentFrame.fifoData = new u8[dataSize];
-		memcpy(m_CurrentFrame.fifoData, &m_FifoData[0], dataSize);
+		memcpy(m_CurrentFrame.fifoData, m_FifoData.data(), dataSize);

 		sMutex.lock();

@ -129,7 +129,7 @@ void FifoRecorder::WriteMemory(u32 address, u32 size, MemoryUpdate::Type type)
 		// Record memory update
 		MemoryUpdate memUpdate;
 		memUpdate.address = address;
-		memUpdate.fifoPosition = m_FifoData.size();
+		memUpdate.fifoPosition = (u32)(m_FifoData.size());
 		memUpdate.size = size;
 		memUpdate.type = type;
 		memUpdate.data = new u8[size];
--- a/Source/Core/Core/Src/HW/DSPHLE/DSPHLE.cpp
+++ b/Source/Core/Core/Src/HW/DSPHLE/DSPHLE.cpp
@ -42,8 +42,9 @@ struct DSPState
 	}
 };

-bool DSPHLE::Initialize(bool bWii, bool bDSPThread)
+bool DSPHLE::Initialize(void *hWnd, bool bWii, bool bDSPThread)
 {
+	m_hWnd = hWnd;
 	m_bWii = bWii;
 	m_pUCode = NULL;
 	m_lastUCode = NULL;
@ -265,7 +266,7 @@ void DSPHLE::InitMixer()
 	unsigned int AISampleRate, DACSampleRate;
 	AudioInterface::Callback_GetSampleRate(AISampleRate, DACSampleRate);
 	delete soundStream;
-	soundStream = AudioCommon::InitSoundStream(new HLEMixer(this, AISampleRate, DACSampleRate, 48000));
+	soundStream = AudioCommon::InitSoundStream(new HLEMixer(this, AISampleRate, DACSampleRate, 48000), m_hWnd);
 	if(!soundStream) PanicAlert("Error starting up sound stream");
 	// Mixer is initialized
 	m_InitMixer = true;
--- a/Source/Core/Core/Src/HW/DSPHLE/DSPHLE.h
+++ b/Source/Core/Core/Src/HW/DSPHLE/DSPHLE.h
@ -16,7 +16,7 @@ class DSPHLE : public DSPEmulator {
 public:
 	DSPHLE();

-	virtual bool Initialize(bool bWii, bool bDSPThread) override;
+	virtual bool Initialize(void *hWnd, bool bWii, bool bDSPThread) override;
 	virtual void Shutdown() override;
 	virtual bool IsLLE() override { return false ; }

--- a/Source/Core/Core/Src/HW/DSPHLE/UCodes/UCode_AX_Voice.h
+++ b/Source/Core/Core/Src/HW/DSPHLE/UCodes/UCode_AX_Voice.h
@ -469,36 +469,42 @@ void ProcessVoice(PB_TYPE& pb, const AXBuffers& buffers, u16 count, AXMixControl
 	// Mix LRS, AUXA and AUXB depending on mixer_control
 	// TODO: Handle DPL2 on AUXB.

-	if (mctrl & MIX_L)
-		MixAdd(buffers.left, samples, count, &pb.mixer.left, &pb.dpop.left, mctrl & MIX_L_RAMP);
-	if (mctrl & MIX_R)
-		MixAdd(buffers.right, samples, count, &pb.mixer.right, &pb.dpop.right, mctrl & MIX_R_RAMP);
-	if (mctrl & MIX_S)
-		MixAdd(buffers.surround, samples, count, &pb.mixer.surround, &pb.dpop.surround, mctrl & MIX_S_RAMP);
+#define MIX_ON(C) (0 != (mctrl & MIX_##C))
+#define RAMP_ON(C) (0 != (mctrl & MIX_##C##_RAMP))

-	if (mctrl & MIX_AUXA_L)
-		MixAdd(buffers.auxA_left, samples, count, &pb.mixer.auxA_left, &pb.dpop.auxA_left, mctrl & MIX_AUXA_L_RAMP);
-	if (mctrl & MIX_AUXA_R)
-		MixAdd(buffers.auxA_right, samples, count, &pb.mixer.auxA_right, &pb.dpop.auxA_right, mctrl & MIX_AUXA_R_RAMP);
-	if (mctrl & MIX_AUXA_S)
-		MixAdd(buffers.auxA_surround, samples, count, &pb.mixer.auxA_surround, &pb.dpop.auxA_surround, mctrl & MIX_AUXA_S_RAMP);
+	if (MIX_ON(L))
+		MixAdd(buffers.left, samples, count, &pb.mixer.left, &pb.dpop.left, RAMP_ON(L));
+	if (MIX_ON(R))
+		MixAdd(buffers.right, samples, count, &pb.mixer.right, &pb.dpop.right, RAMP_ON(R));
+	if (MIX_ON(S))
+		MixAdd(buffers.surround, samples, count, &pb.mixer.surround, &pb.dpop.surround, RAMP_ON(S));

-	if (mctrl & MIX_AUXB_L)
-		MixAdd(buffers.auxB_left, samples, count, &pb.mixer.auxB_left, &pb.dpop.auxB_left, mctrl & MIX_AUXB_L_RAMP);
-	if (mctrl & MIX_AUXB_R)
-		MixAdd(buffers.auxB_right, samples, count, &pb.mixer.auxB_right, &pb.dpop.auxB_right, mctrl & MIX_AUXB_R_RAMP);
-	if (mctrl & MIX_AUXB_S)
-		MixAdd(buffers.auxB_surround, samples, count, &pb.mixer.auxB_surround, &pb.dpop.auxB_surround, mctrl & MIX_AUXB_S_RAMP);
+	if (MIX_ON(AUXA_L))
+		MixAdd(buffers.auxA_left, samples, count, &pb.mixer.auxA_left, &pb.dpop.auxA_left, RAMP_ON(AUXA_L));
+	if (MIX_ON(AUXA_R))
+		MixAdd(buffers.auxA_right, samples, count, &pb.mixer.auxA_right, &pb.dpop.auxA_right, RAMP_ON(AUXA_R));
+	if (MIX_ON(AUXA_S))
+		MixAdd(buffers.auxA_surround, samples, count, &pb.mixer.auxA_surround, &pb.dpop.auxA_surround, RAMP_ON(AUXA_S));
+
+	if (MIX_ON(AUXB_L))
+		MixAdd(buffers.auxB_left, samples, count, &pb.mixer.auxB_left, &pb.dpop.auxB_left, RAMP_ON(AUXB_L));
+	if (MIX_ON(AUXB_R))
+		MixAdd(buffers.auxB_right, samples, count, &pb.mixer.auxB_right, &pb.dpop.auxB_right, RAMP_ON(AUXB_R));
+	if (MIX_ON(AUXB_S))
+		MixAdd(buffers.auxB_surround, samples, count, &pb.mixer.auxB_surround, &pb.dpop.auxB_surround, RAMP_ON(AUXB_S));

 #ifdef AX_WII
-	if (mctrl & MIX_AUXC_L)
-		MixAdd(buffers.auxC_left, samples, count, &pb.mixer.auxC_left, &pb.dpop.auxC_left, mctrl & MIX_AUXC_L_RAMP);
-	if (mctrl & MIX_AUXC_R)
-		MixAdd(buffers.auxC_right, samples, count, &pb.mixer.auxC_right, &pb.dpop.auxC_right, mctrl & MIX_AUXC_R_RAMP);
-	if (mctrl & MIX_AUXC_S)
-		MixAdd(buffers.auxC_surround, samples, count, &pb.mixer.auxC_surround, &pb.dpop.auxC_surround, mctrl & MIX_AUXC_S_RAMP);
+	if (MIX_ON(AUXC_L))
+		MixAdd(buffers.auxC_left, samples, count, &pb.mixer.auxC_left, &pb.dpop.auxC_left, RAMP_ON(AUXC_L));
+	if (MIX_ON(AUXC_R))
+		MixAdd(buffers.auxC_right, samples, count, &pb.mixer.auxC_right, &pb.dpop.auxC_right, RAMP_ON(AUXC_R));
+	if (MIX_ON(AUXC_S))
+		MixAdd(buffers.auxC_surround, samples, count, &pb.mixer.auxC_surround, &pb.dpop.auxC_surround, RAMP_ON(AUXC_S));
 #endif

+#undef MIX_ON
+#undef RAMP_ON
+
 	// Optionally, phase shift left or right channel to simulate 3D sound.
 	if (pb.initial_time_delay.on)
 	{
@ -524,8 +530,8 @@ void ProcessVoice(PB_TYPE& pb, const AXBuffers& buffers, u16 count, AXMixControl
 		pb.remote_src.cur_addr_frac = curr_pos & 0xFFFF;

 		// Mix to main[0-3] and aux[0-3]
-#define WMCHAN_MIX_ON(n) ((pb.remote_mixer_control >> (2 * n)) & 3)
-#define WMCHAN_MIX_RAMP(n) ((pb.remote_mixer_control >> (2 * n)) & 2)
+// Channel n is selected by shifting pb.remote_mixer_control right by 2*n bits: ON tests
+// both bits of that pair, RAMP only the upper one. The result is normalized to a bool with
+// `0 != ...`, and the argument is parenthesized so that an expression argument (e.g. `i + 1`)
+// is multiplied as a whole instead of binding as `2 * i + 1`.
+#define WMCHAN_MIX_ON(n) (0 != ((pb.remote_mixer_control >> (2 * (n))) & 3))
+#define WMCHAN_MIX_RAMP(n) (0 != ((pb.remote_mixer_control >> (2 * (n))) & 2))

 		if (WMCHAN_MIX_ON(0))
 			MixAdd(buffers.wm_main0, wm_samples, wm_count, &pb.remote_mixer.main0, &pb.remote_dpop.main0, WMCHAN_MIX_RAMP(0));
@ -544,6 +550,8 @@ void ProcessVoice(PB_TYPE& pb, const AXBuffers& buffers, u16 count, AXMixControl
 		if (WMCHAN_MIX_ON(7))
 			MixAdd(buffers.wm_aux3, wm_samples, wm_count, &pb.remote_mixer.aux3, &pb.remote_dpop.aux3, WMCHAN_MIX_RAMP(7));
 	}
+#undef WMCHAN_MIX_RAMP
+#undef WMCHAN_MIX_ON
 #endif
 }

--- a/Source/Core/Core/Src/HW/DSPLLE/DSPLLE.cpp
+++ b/Source/Core/Core/Src/HW/DSPLLE/DSPLLE.cpp
@ -130,8 +130,9 @@ void DSPLLE::dsp_thread(DSPLLE *dsp_lle)
 	}
 }

-bool DSPLLE::Initialize(bool bWii, bool bDSPThread)
+bool DSPLLE::Initialize(void *hWnd, bool bWii, bool bDSPThread)
 {
+	m_hWnd = hWnd;
 	m_bWii = bWii;
 	m_bDSPThread = bDSPThread;
 	m_InitMixer = false;
@ -184,7 +185,7 @@ void DSPLLE::InitMixer()
 	unsigned int AISampleRate, DACSampleRate;
 	AudioInterface::Callback_GetSampleRate(AISampleRate, DACSampleRate);
 	delete soundStream;
-	soundStream = AudioCommon::InitSoundStream(new CMixer(AISampleRate, DACSampleRate, 48000));
+	soundStream = AudioCommon::InitSoundStream(new CMixer(AISampleRate, DACSampleRate, 48000), m_hWnd);
 	if(!soundStream) PanicAlert("Error starting up sound stream");
 	// Mixer is initialized
 	m_InitMixer = true;
--- a/Source/Core/Core/Src/HW/DSPLLE/DSPLLE.h
+++ b/Source/Core/Core/Src/HW/DSPLLE/DSPLLE.h
@ -14,7 +14,7 @@ class DSPLLE : public DSPEmulator {
 public:
 	DSPLLE();

-	virtual bool Initialize(bool bWii, bool bDSPThread);
+	virtual bool Initialize(void *hWnd, bool bWii, bool bDSPThread);
 	virtual void Shutdown();
 	virtual bool IsLLE() { return true; }

--- a/Source/Core/Core/Src/HW/DSPLLE/DSPLLEGlobals.cpp
+++ b/Source/Core/Core/Src/HW/DSPLLE/DSPLLEGlobals.cpp
@ -6,6 +6,7 @@
 #include "FileUtil.h"
 #include "DSP/DSPCore.h"
 #include "DSPLLEGlobals.h"
+#include <cinttypes>

 #if PROFILE

@ -37,12 +38,12 @@ void ProfilerDump(u64 count)
 	File::IOFile pFile("DSP_Prof.txt", "wt");
 	if (pFile)
 	{
-		fprintf(pFile.GetHandle(), "Number of DSP steps: %llu\n\n", count);
+		fprintf(pFile.GetHandle(), "Number of DSP steps: %" PRIu64 "\n\n", count);
 		for (int i=0; i<PROFILE_MAP_SIZE;i++)
 		{
 			if (g_profileMap[i] > 0)
 			{
-				fprintf(pFile.GetHandle(), "0x%04X: %llu\n", i, g_profileMap[i]);
+				fprintf(pFile.GetHandle(), "0x%04X: %" PRIu64 "\n", i, g_profileMap[i]);
 			}
 		}
 	}
--- a/Source/Core/Core/Src/HW/DVDInterface.cpp
+++ b/Source/Core/Core/Src/HW/DVDInterface.cpp
@ -325,7 +325,7 @@ void ChangeDisc(const char* _newFileName)
 	{
 		Movie::g_bDiscChange = true;
 		std::string fileName = _newFileName;
-		int sizeofpath = fileName.find_last_of("/\\") + 1;
+		auto sizeofpath = fileName.find_last_of("/\\") + 1;
 		if (fileName.substr(sizeofpath).length() > 40)
 		{
 			PanicAlert("Saving iso filename to .dtm failed; max file name length is 40 characters.");
--- a/Source/Core/Core/Src/HW/EXI_DeviceMemoryCard.cpp
+++ b/Source/Core/Core/Src/HW/EXI_DeviceMemoryCard.cpp
@ -191,7 +191,7 @@ void CEXIMemoryCard::CmdDone()
 void CEXIMemoryCard::CmdDoneLater(u64 cycles)
 {
 	CoreTiming::RemoveEvent(et_cmd_done);
-	CoreTiming::ScheduleEvent(cycles, et_cmd_done, (u64)card_index);
+	CoreTiming::ScheduleEvent((int)cycles, et_cmd_done, (u64)card_index); // NOTE(review): explicit u64 -> int narrowing, presumably to match ScheduleEvent's parameter type; a cycle count above INT_MAX would be truncated - confirm callers stay in range
 }

 void CEXIMemoryCard::SetCS(int cs)
--- a/Source/Core/Core/Src/HW/GCMemcard.cpp
+++ b/Source/Core/Core/Src/HW/GCMemcard.cpp
@ -4,6 +4,9 @@

 #include "GCMemcard.h"
 #include "ColorUtil.h"
+
+#include <cinttypes>
+
 static void ByteSwap(u8 *valueA, u8 *valueB)
 {
 	u8 tmp = *valueA;
@ -37,19 +40,19 @@ GCMemcard::GCMemcard(const char *filename, bool forceCreation, bool sjis)
 			PanicAlertT("File has the extension \"%s\"\nvalid extensions are (.raw/.gcp)", fileType.c_str());
 			return;
 		}
-		u32 size = mcdFile.GetSize();
+		auto size = mcdFile.GetSize();
 		if (size < MC_FST_BLOCKS*BLOCK_SIZE)
 		{
-			PanicAlertT("%s failed to load as a memorycard \nfile is not large enough to be a valid memory card file (0x%x bytes)", filename, size);
+			PanicAlertT("%s failed to load as a memorycard \nfile is not large enough to be a valid memory card file (0x%x bytes)", filename, (unsigned) size);
 			return;
 		}
 		if (size % BLOCK_SIZE)
 		{
-			PanicAlertT("%s failed to load as a memorycard \n Card file size is invalid (0x%x bytes)", filename, size);
+			PanicAlertT("%s failed to load as a memorycard \n Card file size is invalid (0x%x bytes)", filename, (unsigned) size);
 				return;
 		}

-		m_sizeMb = (size/BLOCK_SIZE) / MBIT_TO_BLOCKS;
+		m_sizeMb = (u16)((size/BLOCK_SIZE) / MBIT_TO_BLOCKS);
 		switch (m_sizeMb)
 		{
 			case MemCard59Mb:
@ -60,7 +63,7 @@ GCMemcard::GCMemcard(const char *filename, bool forceCreation, bool sjis)
 			case MemCard2043Mb:
 				break;
 			default:
-				PanicAlertT("%s failed to load as a memorycard \n Card size is invalid (0x%x bytes)", filename, size);
+				PanicAlertT("%s failed to load as a memorycard \n Card size is invalid (0x%x bytes)", filename, (unsigned) size);
 				return;
 		}
 	}
@ -173,7 +176,7 @@ GCMemcard::GCMemcard(const char *filename, bool forceCreation, bool sjis)
 		}
 		else
 		{
-			PanicAlertT("Failed to read block %d of the save data\nMemcard may be truncated\nFilePosition:%llx", i, mcdFile.Tell());
+			PanicAlertT("Failed to read block %d of the save data\nMemcard may be truncated\nFilePosition:%" PRIx64, i, mcdFile.Tell());
 			m_valid = false;
 			break;
 		}
--- a/Source/Core/Core/Src/HW/GCPadEmu.cpp
+++ b/Source/Core/Core/Src/HW/GCPadEmu.cpp
@ -209,5 +209,5 @@ void GCPad::LoadDefaults(const ControllerInterface& ciface)

 bool GCPad::GetMicButton() const
 {
-	return m_buttons->controls.back()->control_ref->State();
+	return (0.0f != m_buttons->controls.back()->control_ref->State());
 }
--- a/Source/Core/Core/Src/HW/WiimoteEmu/EmuSubroutines.cpp
+++ b/Source/Core/Core/Src/HW/WiimoteEmu/EmuSubroutines.cpp
@ -32,7 +32,7 @@
 namespace WiimoteEmu
 {

-void Spy(Wiimote* wm_, const void* data_, int size_)
+void Spy(Wiimote* wm_, const void* data_, size_t size_)
 {
 #if 0
 	// enable log
@ -1275,7 +1275,7 @@ void Wiimote::DoState(PointerWrap& p)
 		else
 		{
 			std::queue<ReadRequest> tmp_queue(m_read_requests);
-			size = m_read_requests.size();
+			size = (u32)(m_read_requests.size());
 			p.Do(size);
 			while (!tmp_queue.empty())
 			{
--- a/Source/Core/Core/Src/HW/WiimoteEmu/WiimoteEmu.cpp
+++ b/Source/Core/Core/Src/HW/WiimoteEmu/WiimoteEmu.cpp
@ -765,7 +765,7 @@ void Wiimote::Update()
 					if (-1 == rptf_size)
 					{
 						std::copy(rpt.begin(), rpt.end(), data);
-						rptf_size = rpt.size();
+						rptf_size = (s8)(rpt.size());
 					}
 				}
 			}
--- a/Source/Core/Core/Src/HW/WiimoteEmu/WiimoteEmu.h
+++ b/Source/Core/Core/Src/HW/WiimoteEmu/WiimoteEmu.h
@ -96,7 +96,7 @@ inline double trim(double a)
 class Wiimote : public ControllerEmu
 {
 friend class WiimoteReal::Wiimote;
-friend void Spy(Wiimote* wm_, const void* data_, int size_);
+friend void Spy(Wiimote* wm_, const void* data_, size_t size_);
 public:

 	enum
@ -245,7 +245,7 @@ private:
 	}	m_reg_speaker;
 };

-void Spy(Wiimote* wm_, const void* data_, int size_);
+void Spy(Wiimote* wm_, const void* data_, size_t size_);

 }

--- a/Source/Core/Core/Src/HW/WiimoteReal/IODummy.cpp
+++ b/Source/Core/Core/Src/HW/WiimoteReal/IODummy.cpp
@ -72,7 +72,7 @@ int Wiimote::IORead(u8* buf)
 	return 0;
 }

-int Wiimote::IOWrite(const u8* buf, int len)
+int Wiimote::IOWrite(const u8* buf, size_t len)
 {
 	return 0;
 }
--- a/Source/Core/Core/Src/HW/WiimoteReal/IONix.cpp
+++ b/Source/Core/Core/Src/HW/WiimoteReal/IONix.cpp
@ -263,9 +263,9 @@ int Wiimote::IORead(u8* buf)
 	return r;
 }

-int Wiimote::IOWrite(u8 const* buf, int len)
+// Writes len bytes from buf to the int_sock descriptor and returns the result of
+// write() (bytes written, or -1 on error) narrowed to int.
+int Wiimote::IOWrite(u8 const* buf, size_t len)
 {
-	return write(int_sock, buf, len);
+	// write() takes a size_t count, so len is passed through unchanged; casting it to int
+	// first would wrap for lengths above INT_MAX. Only the ssize_t result is narrowed,
+	// and that narrowing is made explicit.
+	return (int)write(int_sock, buf, len);
 }

 }; // WiimoteReal
--- a/Source/Core/Core/Src/HW/WiimoteReal/IOWin.cpp
+++ b/Source/Core/Core/Src/HW/WiimoteReal/IOWin.cpp
@ -140,7 +140,7 @@ namespace WiimoteReal
 {


-int _IOWrite(HANDLE &dev_handle, OVERLAPPED &hid_overlap_write, enum win_bt_stack_t &stack, const u8* buf, int len);
+int _IOWrite(HANDLE &dev_handle, OVERLAPPED &hid_overlap_write, enum win_bt_stack_t &stack, const u8* buf, size_t len);
 int _IORead(HANDLE &dev_handle, OVERLAPPED &hid_overlap_read, u8* buf, int index);
 void _IOWakeup(HANDLE &dev_handle, OVERLAPPED &hid_overlap_read);

@ -247,7 +247,7 @@ void WiimoteScanner::FindWiimotes(std::vector<Wiimote*> & found_wiimotes, Wiimot
 	//	SLEEP(2000);

 }
-int CheckDeviceType_Write(HANDLE &dev_handle, const u8* buf, int size, int attempts)
+int CheckDeviceType_Write(HANDLE &dev_handle, const u8* buf, size_t size, int attempts)
 {
 	OVERLAPPED hid_overlap_write = OVERLAPPED();
 	hid_overlap_write.hEvent = CreateEvent(NULL, true, false, NULL);
@ -641,7 +641,7 @@ int Wiimote::IORead(u8* buf)
 }


-int _IOWrite(HANDLE &dev_handle, OVERLAPPED &hid_overlap_write, enum win_bt_stack_t &stack, const u8* buf, int len)
+int _IOWrite(HANDLE &dev_handle, OVERLAPPED &hid_overlap_write, enum win_bt_stack_t &stack, const u8* buf, size_t len)
 {
 	WiimoteEmu::Spy(NULL, buf, len);

@ -663,7 +663,7 @@ int _IOWrite(HANDLE &dev_handle, OVERLAPPED &hid_overlap_write, enum win_bt_stac
 		}
 		case MSBT_STACK_MS:
 		{
-			auto result = HidD_SetOutputReport(dev_handle, const_cast<u8*>(buf) + 1, len - 1);
+			auto result = HidD_SetOutputReport(dev_handle, const_cast<u8*>(buf) + 1, (ULONG)(len - 1));
 			//FlushFileBuffers(dev_handle);

 			if (!result)
@ -715,7 +715,7 @@ int _IOWrite(HANDLE &dev_handle, OVERLAPPED &hid_overlap_write, enum win_bt_stac
 	return 0;
 }

-int Wiimote::IOWrite(const u8* buf, int len)
+int Wiimote::IOWrite(const u8* buf, size_t len)
 {
 	return _IOWrite(dev_handle, hid_overlap_write, stack, buf, len);
 }
--- a/Source/Core/Core/Src/HW/WiimoteReal/IOdarwin.mm
+++ b/Source/Core/Core/Src/HW/WiimoteReal/IOdarwin.mm
@ -310,14 +310,14 @@ int Wiimote::IORead(unsigned char *buf)
 	return inputlen;
 }

-int Wiimote::IOWrite(const unsigned char *buf, int len)
+int Wiimote::IOWrite(const unsigned char *buf, size_t len)
 {
 	IOReturn ret;

 	if (!IsConnected())
 		return 0;

-	ret = [ichan writeAsync: const_cast<void*>((void *)buf) length: len refcon: nil];
+	ret = [ichan writeAsync: const_cast<void*>((void *)buf) length: (int)len refcon: nil];

 	if (ret == kIOReturnSuccess)
 		return len;
--- a/Source/Core/Core/Src/HW/WiimoteReal/WiimoteReal.cpp
+++ b/Source/Core/Core/Src/HW/WiimoteReal/WiimoteReal.cpp
@ -239,7 +239,9 @@ bool Wiimote::Write()
 			IOWrite(rpt.data(), rpt.size());

 			if (is_speaker_data)
+			{
 				m_last_audio_report.Update();
+			}

 			m_write_reports.Pop();
 			return true;
@ -293,8 +295,10 @@ void Wiimote::Update()

 	// Send the report
 	if (!rpt.empty() && m_channel > 0)
-		Core::Callback_WiimoteInterruptChannel(index, m_channel,
-			rpt.data(), rpt.size());
+	{
+		Core::Callback_WiimoteInterruptChannel(index, m_channel, 
+			rpt.data(), (u32)rpt.size());
+	}
 }

 void Wiimote::Prepare(int _index)
--- a/Source/Core/Core/Src/HW/WiimoteReal/WiimoteReal.h
+++ b/Source/Core/Core/Src/HW/WiimoteReal/WiimoteReal.h
@ -106,7 +106,7 @@ private:
 	void WriteReport(Report rpt);

 	int IORead(u8* buf);
-	int IOWrite(u8 const* buf, int len);
+	int IOWrite(u8 const* buf, size_t len);
 	void IOWakeup();

 	void ThreadFunc();
--- a/Source/Core/Core/Src/IPC_HLE/WII_IPC_HLE.cpp
+++ b/Source/Core/Core/Src/IPC_HLE/WII_IPC_HLE.cpp
@ -83,7 +83,7 @@ static u64 last_reply_time;
 void EnqueReplyCallback(u64 userdata, int)
 {
 	std::lock_guard<std::mutex> lk(s_reply_queue);
-	reply_queue.push_back(userdata);
+	reply_queue.push_back((u32)userdata); // NOTE(review): keeps only the low 32 bits of userdata - presumably reply_queue stores u32 values; confirm against its declaration
 }

 void Init()
@ -546,7 +546,9 @@ void ExecuteCommand(u32 _Address)
 		const s64 ticks_til_last_reply = last_reply_time - CoreTiming::GetTicks();

 		if (ticks_til_last_reply > 0)
-			reply_delay = ticks_til_last_reply;
+		{
+			reply_delay = (int)ticks_til_last_reply;
+		}

 		last_reply_time = CoreTiming::GetTicks() + reply_delay;

--- a/Source/Core/Core/Src/IPC_HLE/WII_IPC_HLE_Device_DI.cpp
+++ b/Source/Core/Core/Src/IPC_HLE/WII_IPC_HLE_Device_DI.cpp
@ -19,6 +19,8 @@

 #include "../../DiscIO/Src/FileMonitor.h"

+#include <cinttypes>
+
 using namespace DVDInterface;


@ -108,7 +110,7 @@ bool CWII_IPC_HLE_Device_di::IOCtlV(u32 _CommandAddress)
 			// Get TMD offset for requested partition...
 			u64 const TMDOffset = ((u64)Memory::Read_U32(CommandBuffer.InBuffer[0].m_Address + 4) << 2 ) + 0x2c0;

-			INFO_LOG(WII_IPC_DVD, "DVDLowOpenPartition: TMDOffset 0x%016llx", TMDOffset);
+			INFO_LOG(WII_IPC_DVD, "DVDLowOpenPartition: TMDOffset 0x%016" PRIx64, TMDOffset);

 			static u32 const TMDsz = 0x208; //CommandBuffer.PayloadBuffer[0].m_Size;
 			u8 pTMD[TMDsz];
@ -204,13 +206,13 @@ u32 CWII_IPC_HLE_Device_di::ExecuteCommand(u32 _BufferIn, u32 _BufferInSize, u32
 					pFilename = m_pFileSystem->GetFileName(DVDAddress);
 				if (pFilename != NULL)
 				{
-					INFO_LOG(WII_IPC_DVD, "DVDLowRead: %s (0x%llx) - (DVDAddr: 0x%llx, Size: 0x%x)",
+					INFO_LOG(WII_IPC_DVD, "DVDLowRead: %s (0x%" PRIx64 ") - (DVDAddr: 0x%" PRIx64 ", Size: 0x%x)",
 						pFilename, m_pFileSystem->GetFileSize(pFilename), DVDAddress, Size);
 					FileMon::CheckFile(std::string(pFilename), (int)m_pFileSystem->GetFileSize(pFilename));
 				}
 				else
 				{
-					INFO_LOG(WII_IPC_DVD, "DVDLowRead: file unknown - (DVDAddr: 0x%llx, Size: 0x%x)",
+					INFO_LOG(WII_IPC_DVD, "DVDLowRead: file unknown - (DVDAddr: 0x%" PRIx64 ", Size: 0x%x)",
 						DVDAddress, Size);
 				}
 			}
@ -308,7 +310,7 @@ u32 CWII_IPC_HLE_Device_di::ExecuteCommand(u32 _BufferIn, u32 _BufferInSize, u32

 			u64 DVDAddress = (u64)DVDAddress32 << 2;

-			INFO_LOG(WII_IPC_DVD, "DVDLowUnencryptedRead: DVDAddr: 0x%08llx, Size: 0x%x", DVDAddress, Size);
+			INFO_LOG(WII_IPC_DVD, "DVDLowUnencryptedRead: DVDAddr: 0x%08" PRIx64 ", Size: 0x%x", DVDAddress, Size);

 			if (Size > _BufferOutSize)
 			{
@ -342,12 +344,12 @@ u32 CWII_IPC_HLE_Device_di::ExecuteCommand(u32 _BufferIn, u32 _BufferInSize, u32
 				pFilename = m_pFileSystem->GetFileName(DVDAddress);
 			if (pFilename != NULL)
 			{
-				INFO_LOG(WII_IPC_DVD, "DVDLowSeek: %s (0x%llx) - (DVDAddr: 0x%llx)",
+				INFO_LOG(WII_IPC_DVD, "DVDLowSeek: %s (0x%" PRIx64 ") - (DVDAddr: 0x%" PRIx64 ")",
 					pFilename, m_pFileSystem->GetFileSize(pFilename), DVDAddress);
 			}
 			else
 			{
-				INFO_LOG(WII_IPC_DVD, "DVDLowSeek: file unknown - (DVDAddr: 0x%llx)",
+				INFO_LOG(WII_IPC_DVD, "DVDLowSeek: file unknown - (DVDAddr: 0x%" PRIx64 ")",
 					DVDAddress);
 			}
 		}
--- a/Source/Core/Core/Src/IPC_HLE/WII_IPC_HLE_Device_es.cpp
+++ b/Source/Core/Core/Src/IPC_HLE/WII_IPC_HLE_Device_es.cpp
@ -35,6 +35,10 @@

 #include "WII_IPC_HLE_Device_es.h"

+// need to include this before polarssl/aes.h,
+// otherwise we may not get __STDC_FORMAT_MACROS
+#include <cinttypes>
+
 #include "../PowerPC/PowerPC.h"
 #include "../VolumeHandler.h"
 #include "FileUtil.h"
@ -129,7 +133,7 @@ void CWII_IPC_HLE_Device_es::DoState(PointerWrap& p)
 	p.Do(m_AccessIdentID);
 	p.Do(m_TitleIDs);

-	u32 Count = m_ContentAccessMap.size();
+	u32 Count = (u32)(m_ContentAccessMap.size());
 	p.Do(Count);

 	u32 CFD, Position;
@ -205,7 +209,7 @@ u32 CWII_IPC_HLE_Device_es::OpenTitleContent(u32 CFD, u64 TitleID, u16 Index)

 	if (!Loader.IsValid())
 	{
-		WARN_LOG(WII_IPC_ES, "ES: loader not valid for %llx", TitleID);
+		WARN_LOG(WII_IPC_ES, "ES: loader not valid for %" PRIx64, TitleID);
 		return 0xffffffff;
 	}

@ -940,7 +944,7 @@ bool CWII_IPC_HLE_Device_es::IOCtlV(u32 _CommandAddress)
 			if (!bSuccess)
 			{
 				PanicAlertT("IOCTL_ES_LAUNCH: Game tried to reload a title that is not available in your NAND dump\n"
-					"TitleID %016llx.\n Dolphin will likely hang now.", TitleID);
+					"TitleID %016" PRIx64".\n Dolphin will likely hang now.", TitleID);
 			}
 			else
 			{
@ -983,7 +987,7 @@ bool CWII_IPC_HLE_Device_es::IOCtlV(u32 _CommandAddress)
 			//TODO: provide correct return code when bSuccess= false
 			Memory::Write_U32(0, _CommandAddress + 0x4);

-			ERROR_LOG(WII_IPC_ES, "IOCTL_ES_LAUNCH %016llx %08x %016llx %08x %016llx %04x", TitleID,view,ticketid,devicetype,titleid,access);
+			ERROR_LOG(WII_IPC_ES, "IOCTL_ES_LAUNCH %016" PRIx64 " %08x %016" PRIx64 " %08x %016" PRIx64 " %04x", TitleID,view,ticketid,devicetype,titleid,access);
 			//					   IOCTL_ES_LAUNCH 0001000248414341 00000001 0001c0fef3df2cfa 00000000 0001000248414341 ffff

 			// This is necessary because Reset(true) above deleted this object.  Ew.
--- a/Source/Core/Core/Src/IPC_HLE/WII_IPC_HLE_Device_fs.cpp
+++ b/Source/Core/Core/Src/IPC_HLE/WII_IPC_HLE_Device_fs.cpp
@ -571,7 +571,7 @@ void CWII_IPC_HLE_Device_fs::DoState(PointerWrap& p)
 			}
 			else
 			{
-				u32 size = entry.size;
+				u32 size = (u32)entry.size;
 				p.Do(size);

 				File::IOFile handle(entry.physicalName, "rb");
--- a/Source/Core/Core/Src/IPC_HLE/WII_IPC_HLE_Device_hid.cpp
+++ b/Source/Core/Core/Src/IPC_HLE/WII_IPC_HLE_Device_hid.cpp
@ -385,12 +385,12 @@ void CWII_IPC_HLE_Device_hid::FillOutDevices(u32 BufferOut, u32 BufferOutSize)
 		Memory::WriteBigEData((const u8*)&wii_device, OffsetBuffer, Align(wii_device.bLength, 4));
 		OffsetBuffer += Align(wii_device.bLength, 4);
 		bool deviceValid = true;
+		bool isHID = false;

 		for (c = 0; deviceValid && c < desc.bNumConfigurations; c++)
 		{
 			struct libusb_config_descriptor *config = NULL;
 			int cRet = libusb_get_config_descriptor(device, c, &config);
-
 			// do not try to use usb devices with more than one interface, games can crash
 			if(cRet == 0 && config->bNumInterfaces <= MAX_HID_INTERFACES)
 			{
@ -402,10 +402,14 @@ void CWII_IPC_HLE_Device_hid::FillOutDevices(u32 BufferOut, u32 BufferOutSize)
 				for (ic = 0; ic < config->bNumInterfaces; ic++)
 				{
 					const struct libusb_interface *interfaceContainer = &config->interface[ic];
+					
 					for (i = 0; i < interfaceContainer->num_altsetting; i++)
 					{
 						const struct libusb_interface_descriptor *interface = &interfaceContainer->altsetting[i];

+						if (interface->bInterfaceClass == LIBUSB_CLASS_HID)
+							isHID = true;
+
 						WiiHIDInterfaceDescriptor wii_interface;
 						ConvertInterfaceToWii(&wii_interface, interface);
 						Memory::WriteBigEData((const u8*)&wii_interface, OffsetBuffer, Align(wii_interface.bLength, 4));
@ -435,6 +439,12 @@ void CWII_IPC_HLE_Device_hid::FillOutDevices(u32 BufferOut, u32 BufferOutSize)
 			}
 		} // configs

+		if (!isHID)
+		{
+			deviceValid = false;
+			OffsetBuffer = OffsetStart;
+		}
+
 		if (deviceValid)
 		{
 			Memory::Write_U32(OffsetBuffer-OffsetStart, OffsetStart); // fill in length
--- a/Source/Core/Core/Src/IPC_HLE/WII_IPC_HLE_Device_usb.cpp
+++ b/Source/Core/Core/Src/IPC_HLE/WII_IPC_HLE_Device_usb.cpp
@ -866,7 +866,7 @@ bool CWII_IPC_HLE_Device_usb_oh1_57e_305::SendEventRoleChange(bdaddr_t _bd, bool

 bool CWII_IPC_HLE_Device_usb_oh1_57e_305::SendEventNumberOfCompletedPackets()
 {
-	SQueuedEvent Event(sizeof(hci_event_hdr_t) + sizeof(hci_num_compl_pkts_ep) + sizeof(hci_num_compl_pkts_info) * m_WiiMotes.size(), 0);
+	SQueuedEvent Event((u32)(sizeof(hci_event_hdr_t) + sizeof(hci_num_compl_pkts_ep) + (sizeof(hci_num_compl_pkts_info) * m_WiiMotes.size())), 0);

 	INFO_LOG(WII_IPC_WIIMOTE, "Event: SendEventNumberOfCompletedPackets");

--- a/Source/Core/Core/Src/IPC_HLE/hci.h
+++ b/Source/Core/Core/Src/IPC_HLE/hci.h
@ -84,14 +84,6 @@
 // All structs in this file are packed
 #pragma pack(push, 1)

-/*
- * Bluetooth Address Family Protocol Numbers
- */
-#define BTPROTO_HCI	1
-#define BTPROTO_L2CAP	2
-#define BTPROTO_RFCOMM	3
-#define BTPROTO_SCO	4
-
 /* All sizes are in bytes */
 #define BLUETOOTH_BDADDR_SIZE	6

@ -102,9 +94,8 @@
 typedef struct {
 	uint8_t	b[BLUETOOTH_BDADDR_SIZE];
 } bdaddr_t;
-#endif
-
 #define	BDADDR_ANY	{ { 0, 0, 0, 0, 0, 0 } }
+#endif

 /**************************************************************************
 **************************************************************************
--- a/Source/Core/Core/Src/NetPlayClient.cpp
+++ b/Source/Core/Core/Src/NetPlayClient.cpp
@ -445,10 +445,11 @@ void NetPlayClient::SendWiimoteState(const PadMapping in_game_pad, const NetWiim
 	sf::Packet spac;
 	spac << (MessageId)NP_MSG_WIIMOTE_DATA;
 	spac << in_game_pad;
-	u8 size = nw.size();
-	spac << size;
-	for (unsigned int i = 0; i < size; ++i)
-		spac << nw.data()[i];
+	spac << (u8)nw.size();
+	for (auto it : nw)
+	{
+		spac << it;
+	}

 	std::lock_guard<std::recursive_mutex> lks(m_crit.send);
 	m_socket.Send(spac);
--- a/Source/Core/Core/Src/NetPlayServer.cpp
+++ b/Source/Core/Core/Src/NetPlayServer.cpp
@ -153,7 +153,7 @@ unsigned int NetPlayServer::OnConnect(sf::SocketTCP& socket)
 	rpac >> player.name;

 	// give new client first available id
-	player.pid = m_players.size() + 1;
+	player.pid = (PlayerId)(m_players.size() + 1);

 	// try to automatically assign new user a pad
 	for (unsigned int m = 0; m < 4; ++m)
@ -435,12 +435,14 @@ unsigned int NetPlayServer::OnData(sf::Packet& packet, sf::SocketTCP& socket)

 	case NP_MSG_PONG :
 		{
-			const u32 ping = m_ping_timer.GetTimeElapsed();
+			const u32 ping = (u32)m_ping_timer.GetTimeElapsed();
 			u32 ping_key = 0;
 			packet >> ping_key;

 			if (m_ping_key == ping_key)
+			{
 				player.ping = ping;
+			}

 			sf::Packet spac;
 			spac << (MessageId)NP_MSG_PLAYER_PING_DATA;
--- a/Source/Core/Core/Src/PowerPC/Interpreter/Interpreter.cpp
+++ b/Source/Core/Core/Src/PowerPC/Interpreter/Interpreter.cpp
@ -10,11 +10,11 @@
 #include "../../Host.h"
 #include "../../IPC_HLE/WII_IPC_HLE.h"

-
 #ifdef USE_GDBSTUB
 #include "../GDBStub.h"
 #endif

+#include <cinttypes>

 namespace {
 	u32 last_pc;
@ -79,7 +79,7 @@ void Trace( UGeckoInstruction &instCode )
 	std::string fregs = "";
 	for (int i=0; i<32; i++)
 	{
-		sprintf(freg, "f%02d: %08llx %08llx ", i, PowerPC::ppcState.ps[i][0], PowerPC::ppcState.ps[i][1]);
+		sprintf(freg, "f%02d: %08" PRIx64 " %08" PRIx64 " ", i, PowerPC::ppcState.ps[i][0], PowerPC::ppcState.ps[i][1]);
 		fregs.append(freg);
 	}

--- a/Source/Core/Core/Src/PowerPC/Interpreter/Interpreter_FPUtils.h
+++ b/Source/Core/Core/Src/PowerPC/Interpreter/Interpreter_FPUtils.h
@ -5,6 +5,7 @@
 #ifndef _INTERPRETER_FPUTILS_H
 #define _INTERPRETER_FPUTILS_H

+#include "CPUDetect.h"
 #include "Interpreter.h"
 #include "MathUtil.h"

@ -69,28 +70,22 @@ inline void UpdateFPSCR()

 inline double ForceSingle(double _x)
 {
-	//if (FPSCR.RN != 0)
-	//	PanicAlert("RN = %d at %x", (int)FPSCR.RN, PC);
-	if (FPSCR.NI)
-		_x = FlushToZeroAsFloat(_x);
-
-	double x = static_cast<float>(_x);
-
+	// convert to float (explicit cast: the narrowing is intentional)...
+	float x = static_cast<float>(_x);
+	if (!cpu_info.bFlushToZero && FPSCR.NI)
+	{
+		x = FlushToZero(x);
+	}
+	// ...and back to double (implicit widening on return):
 	return x;
 }

 inline double ForceDouble(double d)
 {
-	//if (FPSCR.RN != 0)
-	//	PanicAlert("RN = %d at %x", (int)FPSCR.RN, PC);
-
-	//if (FPSCR.NI)
-	//{
-	//	IntDouble x; x.d = d;
-		//if ((x.i & DOUBLE_EXP) == 0)
-		//	x.i &= DOUBLE_SIGN;  // turn into signed zero
-	//	return x.d;
-	//}
+	if (!cpu_info.bFlushToZero && FPSCR.NI)
+	{
+		d = FlushToZero(d);
+	}
 	return d;
 }

--- a/Source/Core/Core/Src/PowerPC/Interpreter/Interpreter_SystemRegisters.cpp
+++ b/Source/Core/Core/Src/PowerPC/Interpreter/Interpreter_SystemRegisters.cpp
@ -48,15 +48,8 @@ static void FPSCRtoFPUSettings(UReg_FPSCR fp)
 		// Pokemon Colosseum does this. Gah.
 	}

-	// Also corresponding SSE rounding mode setting
-	if (FPSCR.NI)
-	{
-		// Either one of these two breaks Beyond Good & Evil.
-		// if (cpu_info.bSSSE3)
-		//     csr |= DAZ;
-		// csr |= FTZ;
-	}
-	FPURoundMode::SetSIMDMode(FPSCR.RN);
+	// Set SSE rounding mode and denormal handling
+	FPURoundMode::SetSIMDMode(FPSCR.RN, FPSCR.NI);
 }

 void Interpreter::mtfsb0x(UGeckoInstruction _inst)
--- a/Source/Core/Core/Src/PowerPC/Jit64/Jit.h
+++ b/Source/Core/Core/Src/PowerPC/Jit64/Jit.h
@ -119,7 +119,7 @@ public:
 	void tri_op(int d, int a, int b, bool reversible, void (XEmitter::*op)(Gen::X64Reg, Gen::OpArg));
 	typedef u32 (*Operation)(u32 a, u32 b);
 	void regimmop(int d, int a, bool binary, u32 value, Operation doop, void (XEmitter::*op)(int, const Gen::OpArg&, const Gen::OpArg&), bool Rc = false, bool carry = false);
-	void fp_tri_op(int d, int a, int b, bool reversible, bool dupe, void (XEmitter::*op)(Gen::X64Reg, Gen::OpArg));
+	void fp_tri_op(int d, int a, int b, bool reversible, bool dupe, void (XEmitter::*op_2)(Gen::X64Reg, Gen::OpArg), void (XEmitter::*op_3)(Gen::X64Reg, Gen::X64Reg, Gen::OpArg));

 	// OPCODES
 	void unknown_instruction(UGeckoInstruction _inst);
@ -182,7 +182,7 @@ public:
 	void ps_sum(UGeckoInstruction inst);
 	void ps_muls(UGeckoInstruction inst);

-	void fp_arith_s(UGeckoInstruction inst);
+	void fp_arith(UGeckoInstruction inst);
 	void frsqrtex(UGeckoInstruction inst);

 	void fcmpx(UGeckoInstruction inst);
--- a/Source/Core/Core/Src/PowerPC/Jit64/Jit64_Tables.cpp
+++ b/Source/Core/Core/Src/PowerPC/Jit64/Jit64_Tables.cpp
@ -320,12 +320,12 @@ static GekkoOPTemplate table31_2[] =

 static GekkoOPTemplate table59[] =
 {
-	{18, &Jit64::Default},       //{"fdivsx",   OPTYPE_FPU, FL_RC_BIT_F, 16}},
-	{20, &Jit64::fp_arith_s}, //"fsubsx",   OPTYPE_FPU, FL_RC_BIT_F}},
-	{21, &Jit64::fp_arith_s}, //"faddsx",   OPTYPE_FPU, FL_RC_BIT_F}},
+	{18, &Jit64::fp_arith}, //{"fdivsx",   OPTYPE_FPU, FL_RC_BIT_F, 16}},
+	{20, &Jit64::fp_arith}, //"fsubsx",   OPTYPE_FPU, FL_RC_BIT_F}},
+	{21, &Jit64::fp_arith}, //"faddsx",   OPTYPE_FPU, FL_RC_BIT_F}},
 //	{22, &Jit64::Default}, //"fsqrtsx",  OPTYPE_FPU, FL_RC_BIT_F}}, // Not implemented on gekko
 	{24, &Jit64::Default}, //"fresx",    OPTYPE_FPU, FL_RC_BIT_F}},
-	{25, &Jit64::fp_arith_s}, //"fmulsx",   OPTYPE_FPU, FL_RC_BIT_F}},
+	{25, &Jit64::fp_arith}, //"fmulsx",   OPTYPE_FPU, FL_RC_BIT_F}},
 	{28, &Jit64::fmaddXX}, //"fmsubsx",  OPTYPE_FPU, FL_RC_BIT_F}},
 	{29, &Jit64::fmaddXX}, //"fmaddsx",  OPTYPE_FPU, FL_RC_BIT_F}},
 	{30, &Jit64::fmaddXX}, //"fnmsubsx", OPTYPE_FPU, FL_RC_BIT_F}},
@ -354,12 +354,12 @@ static GekkoOPTemplate table63[] =

 static GekkoOPTemplate table63_2[] =
 {
-	{18, &Jit64::Default}, //"fdivx",    OPTYPE_FPU, FL_RC_BIT_F, 30}},
-	{20, &Jit64::Default}, //"fsubx",    OPTYPE_FPU, FL_RC_BIT_F}},
-	{21, &Jit64::Default}, //"faddx",    OPTYPE_FPU, FL_RC_BIT_F}},
+	{18, &Jit64::fp_arith}, //"fdivx",    OPTYPE_FPU, FL_RC_BIT_F, 30}},
+	{20, &Jit64::fp_arith}, //"fsubx",    OPTYPE_FPU, FL_RC_BIT_F}},
+	{21, &Jit64::fp_arith}, //"faddx",    OPTYPE_FPU, FL_RC_BIT_F}},
 	{22, &Jit64::Default}, //"fsqrtx",   OPTYPE_FPU, FL_RC_BIT_F}},
 	{23, &Jit64::Default}, //"fselx",    OPTYPE_FPU, FL_RC_BIT_F}},
-	{25, &Jit64::fp_arith_s}, //"fmulx",    OPTYPE_FPU, FL_RC_BIT_F}},
+	{25, &Jit64::fp_arith}, //"fmulx",    OPTYPE_FPU, FL_RC_BIT_F}},
 	{26, &Jit64::frsqrtex}, //"frsqrtex", OPTYPE_FPU, FL_RC_BIT_F}},
 	{28, &Jit64::fmaddXX}, //"fmsubx",   OPTYPE_FPU, FL_RC_BIT_F}},
 	{29, &Jit64::fmaddXX}, //"fmaddx",   OPTYPE_FPU, FL_RC_BIT_F}},
--- a/Source/Core/Core/Src/PowerPC/Jit64/JitRegCache.cpp
+++ b/Source/Core/Core/Src/PowerPC/Jit64/JitRegCache.cpp
@ -166,7 +166,7 @@ int RegCache::SanityCheck() const

 void RegCache::DiscardRegContentsIfCached(int preg)
 {
-	if (regs[preg].away && regs[preg].location.IsSimpleReg())
+	if (IsBound(preg))
 	{
 		X64Reg xr = regs[preg].location.GetSimpleReg();
 		xregs[xr].free = true;
@ -351,11 +351,12 @@ void FPURegCache::StoreFromRegister(int i)
 	{
 		X64Reg xr = regs[i].location.GetSimpleReg();
 		_assert_msg_(DYNA_REC, xr < NUMXREGS, "WTF - store - invalid reg");
+		OpArg newLoc = GetDefaultLocation(i);
+		if (xregs[xr].dirty)
+			emit->MOVAPD(newLoc, xr);
 		xregs[xr].free = true;
 		xregs[xr].dirty = false;
 		xregs[xr].ppcReg = -1;
-		OpArg newLoc = GetDefaultLocation(i);
-		emit->MOVAPD(newLoc, xr);
 		regs[i].location = newLoc;
 		regs[i].away = false;
 	}
--- a/Source/Core/Core/Src/PowerPC/Jit64/JitRegCache.h
+++ b/Source/Core/Core/Src/PowerPC/Jit64/JitRegCache.h
@ -93,7 +93,7 @@ public:
 	const OpArg &R(int preg) const {return regs[preg].location;}
 	X64Reg RX(int preg) const
 	{
-		if (regs[preg].away && regs[preg].location.IsSimpleReg())
+		if (IsBound(preg))
 			return regs[preg].location.GetSimpleReg();
 		PanicAlert("Not so simple - %i", preg);
 		return (X64Reg)-1;
@ -111,6 +111,11 @@ public:
 		return xregs[xreg].free && !xlocks[xreg];
 	}

+	bool IsBound(int preg) const
+	{
+		return regs[preg].away && regs[preg].location.IsSimpleReg();
+	}
+

 	X64Reg GetFreeXReg();

--- a/Source/Core/Core/Src/PowerPC/Jit64/Jit_FloatingPoint.cpp
+++ b/Source/Core/Core/Src/PowerPC/Jit64/Jit_FloatingPoint.cpp
@ -13,37 +13,62 @@ static const u64 GC_ALIGNED16(psAbsMask2[2])  = {0x7FFFFFFFFFFFFFFFULL, 0x7FFFFF
 static const double GC_ALIGNED16(psOneOne2[2]) = {1.0, 1.0};
 static const double one_const = 1.0f;

-void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool dupe, void (XEmitter::*op)(Gen::X64Reg, Gen::OpArg))
+void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single,
+		      void (XEmitter::*op_2)(Gen::X64Reg, Gen::OpArg),
+		      void (XEmitter::*op_3)(Gen::X64Reg, Gen::X64Reg, Gen::OpArg))
 {
+	if (!cpu_info.bAVX)
+	{
+		op_3 = nullptr;
+	}
+
 	fpr.Lock(d, a, b);
 	if (d == a)
 	{
-		fpr.BindToRegister(d, true);
-		(this->*op)(fpr.RX(d), fpr.R(b));
+		fpr.BindToRegister(d);
+		(this->*op_2)(fpr.RX(d), fpr.R(b));
 	}
 	else if (d == b)
 	{
 		if (reversible)
 		{
-			fpr.BindToRegister(d, true);
-			(this->*op)(fpr.RX(d), fpr.R(a));
+			fpr.BindToRegister(d);
+			(this->*op_2)(fpr.RX(d), fpr.R(a));
 		}
 		else
 		{
-			MOVSD(XMM0, fpr.R(b));
-			fpr.BindToRegister(d, !dupe);
-			MOVSD(fpr.RX(d), fpr.R(a));
-			(this->*op)(fpr.RX(d), Gen::R(XMM0));
+			if (op_3)
+			{
+				fpr.BindToRegister(d);
+				fpr.BindToRegister(a, true, false);
+				(this->*op_3)(fpr.RX(d), fpr.RX(a), fpr.R(b));
+			}
+			else
+			{
+				MOVSD(XMM0, fpr.R(b));
+				fpr.BindToRegister(d, false);
+				MOVSD(fpr.RX(d), fpr.R(a));
+				(this->*op_2)(fpr.RX(d), Gen::R(XMM0));
+			}
 		}
 	}
 	else
 	{
-		// Sources different from d, can use rather quick solution
-		fpr.BindToRegister(d, !dupe);
-		MOVSD(fpr.RX(d), fpr.R(a));
-		(this->*op)(fpr.RX(d), fpr.R(b));
+		if (op_3)
+		{
+			fpr.BindToRegister(d, false);
+			fpr.BindToRegister(a);
+			(this->*op_3)(fpr.RX(d), fpr.RX(a), fpr.R(b));
+		}
+		else
+		{
+			fpr.BindToRegister(d, false);
+			MOVSD(fpr.RX(d), fpr.R(a));
+			(this->*op_2)(fpr.RX(d), fpr.R(b));
+		}
 	}
-	if (dupe)
+
+	if (single)
 	{
 		ForceSinglePrecisionS(fpr.RX(d));
 		if (cpu_info.bSSE3)
@ -60,7 +85,7 @@ void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool dupe, void (XEm
 	fpr.UnlockAll();
 }

-void Jit64::fp_arith_s(UGeckoInstruction inst)
+void Jit64::fp_arith(UGeckoInstruction inst)
 {
 	INSTRUCTION_START
 	JITDISABLE(bJITFloatingPointOff)
@ -73,31 +98,38 @@ void Jit64::fp_arith_s(UGeckoInstruction inst)
 		Default(inst); return;
 	}

-	bool dupe = inst.OPCD == 59;
+	bool single = inst.OPCD == 59;
 	switch (inst.SUBOP5)
 	{
-	case 18: fp_tri_op(inst.FD, inst.FA, inst.FB, false, dupe, &XEmitter::DIVSD); break; //div
-	case 20: fp_tri_op(inst.FD, inst.FA, inst.FB, false, dupe, &XEmitter::SUBSD); break; //sub
-	case 21: fp_tri_op(inst.FD, inst.FA, inst.FB, true,  dupe, &XEmitter::ADDSD); break; //add
-	case 25: fp_tri_op(inst.FD, inst.FA, inst.FC, true, dupe, &XEmitter::MULSD); break; //mul
+	case 18: fp_tri_op(inst.FD, inst.FA, inst.FB, false, single, &XEmitter::DIVSD, &XEmitter::VDIVSD); break; //div
+	case 20: fp_tri_op(inst.FD, inst.FA, inst.FB, false, single, &XEmitter::SUBSD, &XEmitter::VSUBSD); break; //sub
+	case 21: fp_tri_op(inst.FD, inst.FA, inst.FB, true,  single, &XEmitter::ADDSD, &XEmitter::VADDSD); break; //add
+	case 25: fp_tri_op(inst.FD, inst.FA, inst.FC, true,  single, &XEmitter::MULSD, &XEmitter::VMULSD); break; //mul
 	default:
-		_assert_msg_(DYNA_REC, 0, "fp_arith_s WTF!!!");
+		_assert_msg_(DYNA_REC, 0, "fp_arith WTF!!!");
 	}
 }

 void Jit64::frsqrtex(UGeckoInstruction inst)
 {
-       INSTRUCTION_START
-       JITDISABLE(bJITFloatingPointOff)
-       int d = inst.FD;
-       int b = inst.FB;
-       fpr.Lock(b, d);
-       fpr.BindToRegister(d, true, true);
-       MOVSD(XMM0, M((void *)&one_const));
-       SQRTSD(XMM1, fpr.R(b));
-       DIVSD(XMM0, R(XMM1));
-       MOVSD(fpr.R(d), XMM0);
-       fpr.UnlockAll();
+	// Emits code that computes 1.0 / sqrt(low half of FB) and stores the
+	// result into the low half of FD.
+	INSTRUCTION_START
+	JITDISABLE(bJITFloatingPointOff)
+	int d = inst.FD;
+	int b = inst.FB;
+	fpr.Lock(b, d);
+	// Always load d (as the code being replaced did): the register-to-register
+	// MOVSD below writes only the low half of d's register, so the upper half
+	// must already hold d's current contents when the register is written back.
+	fpr.BindToRegister(d, true, true);
+	MOVSD(XMM0, M((void *)&one_const));
+	SQRTSD(XMM1, fpr.R(b));
+	// No three-operand VDIVSD here: it would copy the upper half of XMM0 (zeroed
+	// by the memory load above) into d instead of leaving d's upper half alone.
+	DIVSD(XMM0, R(XMM1));
+	MOVSD(fpr.R(d), XMM0);
+	fpr.UnlockAll();
 }

 void Jit64::fmaddXX(UGeckoInstruction inst)
@ -192,16 +224,28 @@ void Jit64::fmrx(UGeckoInstruction inst)
 {
 	INSTRUCTION_START
 	JITDISABLE(bJITFloatingPointOff)
-	if (inst.Rc) {
+	if (inst.Rc)
+	{
 		Default(inst); return;
 	}
 	int d = inst.FD;
 	int b = inst.FB;
-	fpr.Lock(b, d);
-	fpr.BindToRegister(d, true, true);
-	MOVSD(XMM0, fpr.R(b));
-	MOVSD(fpr.R(d), XMM0);
-	fpr.UnlockAll();
+	if (d != b)
+	{
+		fpr.Lock(b, d);
+
+		// we don't need to load d, but if it is already bound to a register, it must be marked as dirty
+		if (fpr.IsBound(d))
+		{
+			fpr.BindToRegister(d);
+		}
+		fpr.BindToRegister(b, true, false);
+
+		// caveat: the order of ModRM:r/m, ModRM:reg is deliberate!
+		// "MOVSD reg, mem" zeros out the upper half of the destination register
+		MOVSD(fpr.R(d), fpr.RX(b));
+		fpr.UnlockAll();
+	}
 }

 void Jit64::fcmpx(UGeckoInstruction inst)
--- a/Source/Core/Core/Src/PowerPC/Jit64/Jit_LoadStorePaired.cpp
+++ b/Source/Core/Core/Src/PowerPC/Jit64/Jit_LoadStorePaired.cpp
@ -122,12 +122,6 @@ void Jit64::psq_l(UGeckoInstruction inst)

 	const UGQR gqr(rSPR(SPR_GQR0 + inst.I));

-	if (inst.W) {
-		// PanicAlert("Single ps load: %i %i", gqr.ST_TYPE, gqr.ST_SCALE);
-		Default(inst);
-		return;
-	}
-
 	bool update = inst.OPCD == 57;
 	int offset = inst.SIMM_12;

@ -143,6 +137,8 @@ void Jit64::psq_l(UGeckoInstruction inst)
 		MOV(32, gpr.R(inst.RA), R(ECX));
 	MOVZX(32, 16, EAX, M(((char *)&GQR(inst.I)) + 2));
 	MOVZX(32, 8, EDX, R(AL));
+	if (inst.W)
+		OR(32, R(EDX), Imm8(8));
 #ifdef _M_IX86
 	int addr_scale = SCALE_4;
 #else
--- a/Source/Core/Core/Src/PowerPC/Jit64/Jit_Paired.cpp
+++ b/Source/Core/Core/Src/PowerPC/Jit64/Jit_Paired.cpp
@ -14,10 +14,9 @@
 //   cmppd, andpd, andnpd, or
 //   lfsx, ps_merge01 etc

-const u64 GC_ALIGNED16(psSignBits[2]) = {0x8000000000000000ULL, 0x8000000000000000ULL};
-const u64 GC_ALIGNED16(psAbsMask[2])  = {0x7FFFFFFFFFFFFFFFULL, 0x7FFFFFFFFFFFFFFFULL};
-const double GC_ALIGNED16(psOneOne[2])  = {1.0, 1.0};
-const double GC_ALIGNED16(psZeroZero[2]) = {0.0, 0.0};
+static const u64 GC_ALIGNED16(psSignBits[2]) = {0x8000000000000000ULL, 0x8000000000000000ULL};
+static const u64 GC_ALIGNED16(psAbsMask[2])  = {0x7FFFFFFFFFFFFFFFULL, 0x7FFFFFFFFFFFFFFFULL};
+static const double GC_ALIGNED16(psOneOne[2])  = {1.0, 1.0};

 void Jit64::ps_mr(UGeckoInstruction inst)
 {
@ -52,14 +51,15 @@ void Jit64::ps_sel(UGeckoInstruction inst)

 	fpr.Lock(a, b, c, d);
 	MOVAPD(XMM0, fpr.R(a));
+	XORPD(XMM1, R(XMM1));
 	// XMM0 = XMM0 < 0 ? all 1s : all 0s
-	CMPPD(XMM0, M((void*)psZeroZero), LT);
+	CMPPD(XMM0, R(XMM1), LT);
 	MOVAPD(XMM1, R(XMM0));
 	ANDPD(XMM0, fpr.R(b));
 	ANDNPD(XMM1, fpr.R(c));
+	ORPD(XMM0, R(XMM1));
 	fpr.BindToRegister(d, false);
 	MOVAPD(fpr.RX(d), R(XMM0));
-	ORPD(fpr.RX(d), R(XMM1));
 	fpr.UnlockAll();
 }

--- a/Source/Core/Core/Src/PowerPC/Jit64IL/IR_X86.cpp
+++ b/Source/Core/Core/Src/PowerPC/Jit64IL/IR_X86.cpp
@ -470,7 +470,7 @@ static void regEmitMemLoad(RegInfo& RI, InstLoc I, unsigned Size) {
 	X64Reg reg;
 	auto info = regBuildMemAddress(RI, I, getOp1(I), 1, Size, &reg);

-	RI.Jit->SafeLoadToReg(reg, info.first, Size, info.second, regsInUse(RI), false);
+	RI.Jit->SafeLoadToReg(reg, info.first, Size, info.second, regsInUse(RI), false, EmuCodeBlock::SAFE_LOADSTORE_NO_FASTMEM);
 	if (regReadUse(RI, I))
 		RI.regs[reg] = I;
 }
@ -498,7 +498,7 @@ static void regEmitMemStore(RegInfo& RI, InstLoc I, unsigned Size) {
 	} else {
 		RI.Jit->MOV(32, R(EAX), regLocForInst(RI, getOp1(I)));
 	}
-	RI.Jit->SafeWriteRegToReg(EAX, ECX, Size, 0, regsInUse(RI));
+	RI.Jit->SafeWriteRegToReg(EAX, ECX, Size, 0, regsInUse(RI), EmuCodeBlock::SAFE_LOADSTORE_NO_FASTMEM);
 	if (RI.IInfo[I - RI.FirstI] & 4)
 		regClearInst(RI, getOp1(I));
 }
@ -1188,7 +1188,7 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, u32 exitAddress) {
 				Jit->MOV(32, R(EAX), loc1);
 			}
 			Jit->MOV(32, R(ECX), regLocForInst(RI, getOp2(I)));
-			RI.Jit->SafeWriteRegToReg(EAX, ECX, 32, 0, regsInUse(RI));
+			RI.Jit->SafeWriteRegToReg(EAX, ECX, 32, 0, regsInUse(RI), EmuCodeBlock::SAFE_LOADSTORE_NO_FASTMEM);
 			if (RI.IInfo[I - RI.FirstI] & 4)
 				fregClearInst(RI, getOp1(I));
 			if (RI.IInfo[I - RI.FirstI] & 8)
@ -1251,12 +1251,12 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, u32 exitAddress) {
 				Jit->PSRLQ(XMM0, 32);
 				Jit->MOVD_xmm(R(EAX), XMM0);
 				Jit->MOV(32, R(ECX), address);
-				RI.Jit->SafeWriteRegToReg(EAX, ECX, 32, 0, regsInUse(RI));
+				RI.Jit->SafeWriteRegToReg(EAX, ECX, 32, 0, regsInUse(RI), EmuCodeBlock::SAFE_LOADSTORE_NO_FASTMEM);

 				Jit->MOVAPD(XMM0, value);
 				Jit->MOVD_xmm(R(EAX), XMM0);
 				Jit->MOV(32, R(ECX), address);
-				RI.Jit->SafeWriteRegToReg(EAX, ECX, 32, 4, regsInUse(RI));
+				RI.Jit->SafeWriteRegToReg(EAX, ECX, 32, 4, regsInUse(RI), EmuCodeBlock::SAFE_LOADSTORE_NO_FASTMEM);
 			Jit->SetJumpTarget(exit);

 			if (RI.IInfo[I - RI.FirstI] & 4)
--- a/Source/Core/Core/Src/PowerPC/Jit64IL/JitIL.cpp
+++ b/Source/Core/Core/Src/PowerPC/Jit64IL/JitIL.cpp
@ -3,6 +3,8 @@
 // Refer to the license.txt file included.

 #include <map>
+#include <memory>
+#include <cinttypes>

 #include "Common.h"
 #include "../../HLE/HLE.h"
@ -217,14 +219,14 @@ namespace JitILProfiler
 				const u64 totalElapsed = block.totalElapsed;
 				const u64 numberOfCalls = block.numberOfCalls;
 				const double elapsedPerCall = totalElapsed / (double)numberOfCalls;
-				fprintf(file.GetHandle(), "%016llx,%lld,%lld,%f\n", codeHash, totalElapsed, numberOfCalls, elapsedPerCall);
+				fprintf(file.GetHandle(), "%016" PRIx64 ",%" PRId64 ",%" PRId64 ",%f\n", codeHash, totalElapsed, numberOfCalls, elapsedPerCall);
 			}
 		}
 	};
-	std::auto_ptr<JitILProfilerFinalizer> finalizer;
+	std::unique_ptr<JitILProfilerFinalizer> finalizer;
 	static void Init()
 	{
-		finalizer = std::auto_ptr<JitILProfilerFinalizer>(new JitILProfilerFinalizer);
+		finalizer = std::unique_ptr<JitILProfilerFinalizer>(new JitILProfilerFinalizer);
 	}
 	static void Shutdown()
 	{
--- a/Source/Core/Core/Src/PowerPC/Jit64IL/JitIL.h
+++ b/Source/Core/Core/Src/PowerPC/Jit64IL/JitIL.h
@ -58,14 +58,10 @@ private:
 	JitBlockCache blocks;
 	TrampolineCache trampolines;

-	// The default code buffer. We keep it around to not have to alloc/dealloc a
-	// large chunk of memory for each recompiled block.
-	PPCAnalyst::CodeBuffer code_buffer;
-
 public:
 	JitILAsmRoutineManager asm_routines;

-	JitIL() : code_buffer(32000) {}
+	JitIL() {}
 	~JitIL() {}

 	// Initialization, etc
@ -140,6 +136,4 @@ public:
 	void DynaRunTable63(UGeckoInstruction _inst) override;
 };

-void Jit(u32 em_address);
-
 #endif  // _JITIL_H
--- a/Source/Core/Core/Src/PowerPC/JitCommon/JitAsmCommon.cpp
+++ b/Source/Core/Core/Src/PowerPC/JitCommon/JitAsmCommon.cpp
@ -196,7 +196,7 @@ void CommonAsmRoutines::GenQuantizedStores()
 	PACKSSDW(XMM0, R(XMM0));
 	PACKUSWB(XMM0, R(XMM0));
 	MOVD_xmm(R(EAX), XMM0);
-	SafeWriteRegToReg(AX, ECX, 16, 0, QUANTIZED_REGS_TO_SAVE, SAFE_WRITE_NO_SWAP | SAFE_WRITE_NO_PROLOG | SAFE_WRITE_NO_FASTMEM);
+	SafeWriteRegToReg(AX, ECX, 16, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);

 	RET();

@ -215,7 +215,7 @@ void CommonAsmRoutines::GenQuantizedStores()
 	PACKSSWB(XMM0, R(XMM0));
 	MOVD_xmm(R(EAX), XMM0);

-	SafeWriteRegToReg(AX, ECX, 16, 0, QUANTIZED_REGS_TO_SAVE, SAFE_WRITE_NO_SWAP | SAFE_WRITE_NO_PROLOG | SAFE_WRITE_NO_FASTMEM);
+	SafeWriteRegToReg(AX, ECX, 16, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);

 	RET();

@ -241,7 +241,7 @@ void CommonAsmRoutines::GenQuantizedStores()
 	MOV(16, R(AX), M((char*)psTemp + 4));

 	BSWAP(32, EAX);
-	SafeWriteRegToReg(EAX, ECX, 32, 0, QUANTIZED_REGS_TO_SAVE, SAFE_WRITE_NO_SWAP | SAFE_WRITE_NO_PROLOG | SAFE_WRITE_NO_FASTMEM);
+	SafeWriteRegToReg(EAX, ECX, 32, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);

 	RET();

@ -261,7 +261,7 @@ void CommonAsmRoutines::GenQuantizedStores()
 	MOVD_xmm(R(EAX), XMM0);
 	BSWAP(32, EAX);
 	ROL(32, R(EAX), Imm8(16));
-	SafeWriteRegToReg(EAX, ECX, 32, 0, QUANTIZED_REGS_TO_SAVE, SAFE_WRITE_NO_SWAP | SAFE_WRITE_NO_PROLOG | SAFE_WRITE_NO_FASTMEM);
+	SafeWriteRegToReg(EAX, ECX, 32, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);

 	RET();

@ -286,7 +286,7 @@ void CommonAsmRoutines::GenQuantizedSingleStores()

 	// Easy!
 	const u8* storeSingleFloat = AlignCode4();
-	SafeWriteFloatToReg(XMM0, ECX, QUANTIZED_REGS_TO_SAVE, SAFE_WRITE_NO_PROLOG | SAFE_WRITE_NO_FASTMEM);
+	SafeWriteFloatToReg(XMM0, ECX, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
 	RET();
 	/*
 	if (cpu_info.bSSSE3) {
@ -294,11 +294,11 @@ void CommonAsmRoutines::GenQuantizedSingleStores()
 		// TODO: SafeWriteFloat
 		MOVSS(M(&psTemp[0]), XMM0);
 		MOV(32, R(EAX), M(&psTemp[0]));
-		SafeWriteRegToReg(EAX, ECX, 32, 0, SAFE_WRITE_NO_SWAP | SAFE_WRITE_NO_PROLOG | SAFE_WRITE_NO_FASTMEM);
+		SafeWriteRegToReg(EAX, ECX, 32, 0, SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
 	} else {
 		MOVSS(M(&psTemp[0]), XMM0);
 		MOV(32, R(EAX), M(&psTemp[0]));
-		SafeWriteRegToReg(EAX, ECX, 32, 0, SAFE_WRITE_NO_PROLOG | SAFE_WRITE_NO_FASTMEM);
+		SafeWriteRegToReg(EAX, ECX, 32, 0, SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
 	}*/

 	const u8* storeSingleU8 = AlignCode4();  // Used by MKWii
@ -309,7 +309,7 @@ void CommonAsmRoutines::GenQuantizedSingleStores()
 	MAXSS(XMM0, R(XMM1));
 	MINSS(XMM0, M((void *)&m_255));
 	CVTTSS2SI(EAX, R(XMM0));
-	SafeWriteRegToReg(AL, ECX, 8, 0, QUANTIZED_REGS_TO_SAVE, SAFE_WRITE_NO_PROLOG | SAFE_WRITE_NO_FASTMEM);
+	SafeWriteRegToReg(AL, ECX, 8, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
 	RET();

 	const u8* storeSingleS8 = AlignCode4();
@ -319,7 +319,7 @@ void CommonAsmRoutines::GenQuantizedSingleStores()
 	MAXSS(XMM0, M((void *)&m_m128));
 	MINSS(XMM0, M((void *)&m_127));
 	CVTTSS2SI(EAX, R(XMM0));
-	SafeWriteRegToReg(AL, ECX, 8, 0, QUANTIZED_REGS_TO_SAVE, SAFE_WRITE_NO_PROLOG | SAFE_WRITE_NO_FASTMEM);
+	SafeWriteRegToReg(AL, ECX, 8, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
 	RET();

 	const u8* storeSingleU16 = AlignCode4();  // Used by MKWii
@ -330,7 +330,7 @@ void CommonAsmRoutines::GenQuantizedSingleStores()
 	MAXSS(XMM0, R(XMM1));
 	MINSS(XMM0, M((void *)&m_65535));
 	CVTTSS2SI(EAX, R(XMM0));
-	SafeWriteRegToReg(EAX, ECX, 16, 0, QUANTIZED_REGS_TO_SAVE, SAFE_WRITE_NO_PROLOG | SAFE_WRITE_NO_FASTMEM);
+	SafeWriteRegToReg(EAX, ECX, 16, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
 	RET();

 	const u8* storeSingleS16 = AlignCode4();
@ -340,7 +340,7 @@ void CommonAsmRoutines::GenQuantizedSingleStores()
 	MAXSS(XMM0, M((void *)&m_m32768));
 	MINSS(XMM0, M((void *)&m_32767));
 	CVTTSS2SI(EAX, R(XMM0));
-	SafeWriteRegToReg(EAX, ECX, 16, 0, QUANTIZED_REGS_TO_SAVE, SAFE_WRITE_NO_PROLOG | SAFE_WRITE_NO_FASTMEM);
+	SafeWriteRegToReg(EAX, ECX, 16, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
 	RET();

 	singleStoreQuantized = reinterpret_cast<const u8**>(const_cast<u8*>(AlignCode16()));
--- a/Source/Core/Core/Src/PowerPC/JitCommon/JitBackpatch.cpp
+++ b/Source/Core/Core/Src/PowerPC/JitCommon/JitBackpatch.cpp
@ -3,6 +3,7 @@
 // Refer to the license.txt file included.

 #include <string>
+#include <cinttypes>

 #include "Common.h"
 #include "disasm.h"
@ -32,7 +33,7 @@ static void BackPatchError(const std::string &text, u8 *codePtr, u32 emAddress)
 #endif
 	PanicAlert("%s\n\n"
 		"Error encountered accessing emulated address %08x.\n"
-		"Culprit instruction: \n%s\nat %#llx",
+		"Culprit instruction: \n%s\nat %#" PRIx64,
 		text.c_str(), emAddress, disbuf, code_addr);
 	return;
 }
@ -233,7 +234,7 @@ const u8 *Jitx86Base::BackPatch(u8 *codePtr, u32 emAddress, void *ctx_void)
 		XEmitter emitter(start);
 		const u8 *trampoline = trampolines.GetWriteTrampoline(info, registersInUse);
 		emitter.CALL((void *)trampoline);
-		emitter.NOP(codePtr + info.instructionSize - emitter.GetCodePtr());
+		emitter.NOP((int)(codePtr + info.instructionSize - emitter.GetCodePtr()));
 		return start;
 	}
 #else
--- a/Source/Core/Core/Src/PowerPC/JitCommon/Jit_Util.cpp
+++ b/Source/Core/Core/Src/PowerPC/JitCommon/Jit_Util.cpp
@ -117,18 +117,20 @@ u8 *EmuCodeBlock::UnsafeLoadToReg(X64Reg reg_value, Gen::OpArg opAddress, int ac
 	return result;
 }

-void EmuCodeBlock::SafeLoadToReg(X64Reg reg_value, const Gen::OpArg & opAddress, int accessSize, s32 offset, u32 registersInUse, bool signExtend)
+void EmuCodeBlock::SafeLoadToReg(X64Reg reg_value, const Gen::OpArg & opAddress, int accessSize, s32 offset, u32 registersInUse, bool signExtend, int flags)
 {
 	if (!jit->js.memcheck)
 	{
 		registersInUse &= ~(1 << RAX | 1 << reg_value);
 	}
 #if defined(_M_X64)
+	if (!Core::g_CoreStartupParameter.bMMU &&
+	    Core::g_CoreStartupParameter.bFastmem &&
+	    !(flags & (SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_FASTMEM))
 #ifdef ENABLE_MEM_CHECK
-	if (!Core::g_CoreStartupParameter.bMMU && !Core::g_CoreStartupParameter.bEnableDebugging && Core::g_CoreStartupParameter.bFastmem)
-#else
-	if (!Core::g_CoreStartupParameter.bMMU && Core::g_CoreStartupParameter.bFastmem)
+	    && !Core::g_CoreStartupParameter.bEnableDebugging
 #endif
+	    )
 	{
 		u8 *mov = UnsafeLoadToReg(reg_value, opAddress, accessSize, offset, signExtend);

@ -282,14 +284,14 @@ void EmuCodeBlock::SafeWriteRegToReg(X64Reg reg_value, X64Reg reg_addr, int acce
 #if defined(_M_X64)
 	if (!Core::g_CoreStartupParameter.bMMU &&
 	    Core::g_CoreStartupParameter.bFastmem &&
-	    !(flags & (SAFE_WRITE_NO_SWAP | SAFE_WRITE_NO_FASTMEM))
+	    !(flags & (SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_FASTMEM))
 #ifdef ENABLE_MEM_CHECK
 	    && !Core::g_CoreStartupParameter.bEnableDebugging
 #endif
 	    )
 	{
 		MOV(32, M(&PC), Imm32(jit->js.compilerPC)); // Helps external systems know which instruction triggered the write
-		u8 *mov = UnsafeWriteRegToReg(reg_value, reg_addr, accessSize, offset, !(flags & SAFE_WRITE_NO_SWAP));
+		u8 *mov = UnsafeWriteRegToReg(reg_value, reg_addr, accessSize, offset, !(flags & SAFE_LOADSTORE_NO_SWAP));
 		if (accessSize == 8)
 		{
 			NOP(1);
@ -321,8 +323,8 @@ void EmuCodeBlock::SafeWriteRegToReg(X64Reg reg_value, X64Reg reg_addr, int acce
 	MOV(32, M(&PC), Imm32(jit->js.compilerPC)); // Helps external systems know which instruction triggered the write
 	TEST(32, R(reg_addr), Imm32(mem_mask));
 	FixupBranch fast = J_CC(CC_Z, true);
-	bool noProlog = flags & SAFE_WRITE_NO_PROLOG;
-	bool swap = !(flags & SAFE_WRITE_NO_SWAP);
+	bool noProlog = (0 != (flags & SAFE_LOADSTORE_NO_PROLOG));
+	bool swap = !(flags & SAFE_LOADSTORE_NO_SWAP);
 	ABI_PushRegistersAndAdjustStack(registersInUse, noProlog);
 	switch (accessSize)
 	{
--- a/Source/Core/Core/Src/PowerPC/JitCommon/Jit_Util.h
+++ b/Source/Core/Core/Src/PowerPC/JitCommon/Jit_Util.h
@ -28,13 +28,13 @@ public:
 	// these return the address of the MOV, for backpatching
 	u8 *UnsafeWriteRegToReg(Gen::X64Reg reg_value, Gen::X64Reg reg_addr, int accessSize, s32 offset = 0, bool swap = true);
 	u8 *UnsafeLoadToReg(Gen::X64Reg reg_value, Gen::OpArg opAddress, int accessSize, s32 offset, bool signExtend);
-	void SafeLoadToReg(Gen::X64Reg reg_value, const Gen::OpArg & opAddress, int accessSize, s32 offset, u32 registersInUse, bool signExtend);
-	enum SafeWriteFlags
+	enum SafeLoadStoreFlags
 	{
-		SAFE_WRITE_NO_SWAP = 1,
-		SAFE_WRITE_NO_PROLOG = 2,
-		SAFE_WRITE_NO_FASTMEM = 4
+		SAFE_LOADSTORE_NO_SWAP = 1,
+		SAFE_LOADSTORE_NO_PROLOG = 2,
+		SAFE_LOADSTORE_NO_FASTMEM = 4
 	};
+	void SafeLoadToReg(Gen::X64Reg reg_value, const Gen::OpArg & opAddress, int accessSize, s32 offset, u32 registersInUse, bool signExtend, int flags = 0);
 	void SafeWriteRegToReg(Gen::X64Reg reg_value, Gen::X64Reg reg_addr, int accessSize, s32 offset, u32 registersInUse, int flags = 0);

 	// Trashes both inputs and EAX.
--- a/Source/Core/Core/Src/PowerPC/JitILCommon/IR.cpp
+++ b/Source/Core/Core/Src/PowerPC/JitILCommon/IR.cpp
@ -118,6 +118,7 @@ Fix profiled loads/stores to work safely.  On 32-bit, one solution is to

 #include <algorithm>
 #include <memory>
+#include <cinttypes>
 #include <ctime>
 #include <set>
 #include "IR.h"
@ -1223,7 +1224,7 @@ struct Writer
 	virtual ~Writer() {}
 };

-static std::auto_ptr<Writer> writer;
+static std::unique_ptr<Writer> writer;

 static const std::string opcodeNames[] = {
 	"Nop", "LoadGReg", "LoadLink", "LoadCR", "LoadCarry", "LoadCTR",
@ -1275,11 +1276,11 @@ void IRBuilder::WriteToFile(u64 codeHash) {
 	_assert_(sizeof(opcodeNames) / sizeof(opcodeNames[0]) == Int3 + 1);

 	if (!writer.get()) {
-		writer = std::auto_ptr<Writer>(new Writer);
+		writer = std::unique_ptr<Writer>(new Writer);
 	}

 	FILE* const file = writer->file.GetHandle();
-	fprintf(file, "\ncode hash:%016llx\n", codeHash);
+	fprintf(file, "\ncode hash:%016" PRIx64 "\n", codeHash);

 	const InstLoc lastCurReadPtr = curReadPtr;
 	StartForwardPass();
--- a/Source/Core/Core/Src/PowerPC/JitInterface.cpp
+++ b/Source/Core/Core/Src/PowerPC/JitInterface.cpp
@ -3,6 +3,7 @@
 // Refer to the license.txt file included.

 #include <algorithm>
+#include <cinttypes>

 #ifdef _WIN32
 #include <windows.h>
@ -171,12 +172,12 @@ namespace JitInterface
 				double percent = 100.0 * (double)stat.cost / (double)cost_sum;
 	#ifdef _WIN32
 				double timePercent = 100.0 * (double)block->ticCounter / (double)timecost_sum;
-				fprintf(f.GetHandle(), "%08x\t%s\t%llu\t%llu\t%.2lf\t%llf\t%lf\t%i\n",
+				fprintf(f.GetHandle(), "%08x\t%s\t%" PRIu64 "\t%" PRIu64 "\t%.2lf\t%llf\t%lf\t%i\n",
 						block->originalAddress, name.c_str(), stat.cost,
 						block->ticCounter, percent, timePercent,
 						(double)block->ticCounter*1000.0/(double)countsPerSec, block->codeSize);
 	#else
-				fprintf(f.GetHandle(), "%08x\t%s\t%llu\t???\t%.2lf\t???\t???\t%i\n",
+				fprintf(f.GetHandle(), "%08x\t%s\t%" PRIu64 "\t???\t%.2lf\t???\t???\t%i\n",
 						block->originalAddress, name.c_str(), stat.cost,  percent, block->codeSize);
 	#endif
 			}
--- a/Source/Core/Core/Src/PowerPC/PPCTables.cpp
+++ b/Source/Core/Core/Src/PowerPC/PPCTables.cpp
@ -4,6 +4,7 @@

 #include <algorithm>
 #include <vector>
+#include <cinttypes>

 #include "Common.h"
 #include "PPCTables.h"
@ -13,24 +14,15 @@
 #include "Interpreter/Interpreter_Tables.h"
 #include "JitInterface.h"

-struct op_inf
-{
-	const char *name;
-	int count;
-	bool operator < (const op_inf &o) const
-	{
-		return count > o.count;
-	}
-};
- GekkoOPInfo *m_infoTable[64];
- GekkoOPInfo *m_infoTable4[1024];
- GekkoOPInfo *m_infoTable19[1024];
- GekkoOPInfo *m_infoTable31[1024];
- GekkoOPInfo *m_infoTable59[32];
- GekkoOPInfo *m_infoTable63[1024];
+GekkoOPInfo *m_infoTable[64];
+GekkoOPInfo *m_infoTable4[1024];
+GekkoOPInfo *m_infoTable19[1024];
+GekkoOPInfo *m_infoTable31[1024];
+GekkoOPInfo *m_infoTable59[32];
+GekkoOPInfo *m_infoTable63[1024];

- GekkoOPInfo *m_allInstructions[512];
- int m_numInstructions;
+GekkoOPInfo *m_allInstructions[512];
+int m_numInstructions;

 GekkoOPInfo *GetOpInfo(UGeckoInstruction _inst)
 {
@ -181,26 +173,34 @@ void CountInstruction(UGeckoInstruction _inst)
 {
 	GekkoOPInfo *info = GetOpInfo(_inst);
 	if (info)
+	{
 		info->runCount++;
+	}
 }

 void PrintInstructionRunCounts()
 {
-	std::vector<op_inf> temp;
-	for (int i = 0; i < m_numInstructions; i++)
+	typedef std::pair<const char*, u64> OpInfo;
+	std::vector<OpInfo> temp;
+	temp.reserve(m_numInstructions);
+	for (int i = 0; i < m_numInstructions; ++i)
 	{
-		op_inf x;
-		x.name = m_allInstructions[i]->opname;
-		x.count = m_allInstructions[i]->runCount;
-		temp.push_back(x);
+		GekkoOPInfo *pInst = m_allInstructions[i];
+		temp.emplace_back(pInst->opname, pInst->runCount);
 	}
-	std::sort(temp.begin(), temp.end());
-	for (int i = 0; i < m_numInstructions; i++)
+	std::sort(temp.begin(), temp.end(), 
+		[](const OpInfo &a, const OpInfo &b)
+		{
+			return a.second > b.second;
+		});
+
+	for (auto &inst : temp)
 	{
-		if (temp[i].count == 0)
+		if (inst.second == 0)
 			break;
-		DEBUG_LOG(POWERPC, "%s : %i", temp[i].name,temp[i].count);
-		//PanicAlert("%s : %i", temp[i].name,temp[i].count);
+
+		DEBUG_LOG(POWERPC, "%s : %llu", inst.first, inst.second);
+		//PanicAlert("%s : %llu", inst.first, inst.second);
 	}
 }

@ -211,20 +211,22 @@ void LogCompiledInstructions()
 	File::IOFile f(StringFromFormat("%sinst_log%i.txt", File::GetUserPath(D_LOGS_IDX).c_str(), time), "w");
 	for (int i = 0; i < m_numInstructions; i++)
 	{
-		if (m_allInstructions[i]->compileCount > 0)
+		GekkoOPInfo *pInst = m_allInstructions[i];
+		if (pInst->compileCount > 0)
 		{
-			fprintf(f.GetHandle(), "%s\t%i\t%lld\t%08x\n", m_allInstructions[i]->opname,
-				m_allInstructions[i]->compileCount, m_allInstructions[i]->runCount, m_allInstructions[i]->lastUse);
+			fprintf(f.GetHandle(), "%s\t%i\t%" PRId64 "\t%08x\n", pInst->opname,
+				pInst->compileCount, pInst->runCount, pInst->lastUse);
 		}
 	}

 	f.Open(StringFromFormat("%sinst_not%i.txt", File::GetUserPath(D_LOGS_IDX).c_str(), time), "w");
 	for (int i = 0; i < m_numInstructions; i++)
 	{
-		if (m_allInstructions[i]->compileCount == 0)
+		GekkoOPInfo *pInst = m_allInstructions[i];
+		if (pInst->compileCount == 0)
 		{
-			fprintf(f.GetHandle(), "%s\t%i\t%lld\n", m_allInstructions[i]->opname,
-				m_allInstructions[i]->compileCount, m_allInstructions[i]->runCount);
+			fprintf(f.GetHandle(), "%s\t%i\t%" PRId64 "\n", pInst->opname,
+				pInst->compileCount, pInst->runCount);
 		}
 	}

--- a/Source/Core/Core/Src/State.cpp
+++ b/Source/Core/Core/Src/State.cpp
@ -247,7 +247,7 @@ void CompressAndDumpState(CompressAndDumpState_args save_args)
 	// Setting up the header
 	StateHeader header;
 	memcpy(header.gameID, SConfig::GetInstance().m_LocalCoreStartupParameter.GetUniqueID().c_str(), 6);
-	header.size = g_use_compression ? buffer_size : 0;
+	header.size = g_use_compression ? (u32)buffer_size : 0;
 	header.time = Common::Timer::GetDoubleTime();

 	f.WriteArray(&header, 1);
@ -261,9 +261,13 @@ void CompressAndDumpState(CompressAndDumpState_args save_args)
 			lzo_uint out_len = 0;

 			if ((i + IN_LEN) >= buffer_size)
-				cur_len = buffer_size - i;
+			{
+				cur_len = (lzo_uint32)(buffer_size - i);
+			}
 			else
+			{
 				cur_len = IN_LEN;
+			}

 			if (lzo1x_1_compress(buffer_data + i, cur_len, out, &out_len, wrkmem) != LZO_E_OK)
 				PanicAlertT("Internal LZO Error - compression failed");
--- a/Source/Core/DiscIO/Src/CompressedBlob.cpp
+++ b/Source/Core/DiscIO/Src/CompressedBlob.cpp
@ -9,6 +9,8 @@
 #include <unistd.h>
 #endif

+#include <cinttypes>
+
 #include "CompressedBlob.h"
 #include "DiscScrubber.h"
 #include "FileUtil.h"
@ -99,7 +101,7 @@ void CompressedBlobReader::GetBlock(u64 block_num, u8 *out_ptr)
 	// First, check hash.
 	u32 block_hash = HashAdler32(source, comp_block_size);
 	if (block_hash != hashes[block_num])
-		PanicAlert("Hash of block %lli is %08x instead of %08x.\n"
+		PanicAlert("Hash of block %" PRIu64 " is %08x instead of %08x.\n"
 		           "Your ISO, %s, is corrupt.",
 		           block_num, block_hash, hashes[block_num],
 				   file_name.c_str());
@ -127,7 +129,7 @@ void CompressedBlobReader::GetBlock(u64 block_num, u8 *out_ptr)
 		{
 			// this seem to fire wrongly from time to time
 			// to be sure, don't use compressed isos :P
-			PanicAlert("Failure reading block %lli - out of data and not at end.", block_num);
+			PanicAlert("Failure reading block %" PRIu64 " - out of data and not at end.", block_num);
 		}
 		inflateEnd(&z);
 		if (uncomp_size != header.block_size)
--- a/Source/Core/DiscIO/Src/DiscScrubber.cpp
+++ b/Source/Core/DiscIO/Src/DiscScrubber.cpp
@ -7,6 +7,8 @@
 #include "FileUtil.h"
 #include "DiscScrubber.h"

+#include <cinttypes>
+
 namespace DiscIO
 {

@ -121,13 +123,13 @@ void GetNextBlock(File::IOFile& in, u8* buffer)

 	if (m_isScrubbing && m_FreeTable[i])
 	{
-		DEBUG_LOG(DISCIO, "Freeing 0x%016llx", CurrentOffset);
+		DEBUG_LOG(DISCIO, "Freeing 0x%016" PRIx64, CurrentOffset);
 		std::fill(buffer, buffer + m_BlockSize, 0xFF);
 		in.Seek(m_BlockSize, SEEK_CUR);
 	}
 	else
 	{
-		DEBUG_LOG(DISCIO, "Used    0x%016llx", CurrentOffset);
+		DEBUG_LOG(DISCIO, "Used    0x%016" PRIx64, CurrentOffset);
 		in.ReadBytes(buffer, m_BlockSize);
 	}

@ -150,7 +152,7 @@ void MarkAsUsed(u64 _Offset, u64 _Size)
 	u64 CurrentOffset = _Offset;
 	u64 EndOffset = CurrentOffset + _Size;

-	DEBUG_LOG(DISCIO, "Marking 0x%016llx - 0x%016llx as used", _Offset, EndOffset);
+	DEBUG_LOG(DISCIO, "Marking 0x%016" PRIx64 " - 0x%016" PRIx64 " as used", _Offset, EndOffset);

 	while ((CurrentOffset < EndOffset) && (CurrentOffset < m_FileSize))
 	{
--- a/Source/Core/DiscIO/Src/FileSystemGCWii.cpp
+++ b/Source/Core/DiscIO/Src/FileSystemGCWii.cpp
@ -8,6 +8,7 @@
 #include <string>
 #include <vector>
 #include <algorithm>
+#include <cinttypes>

 #include "FileSystemGCWii.h"
 #include "StringUtil.h"
@ -70,7 +71,7 @@ u64 CFileSystemGCWii::ReadFile(const char* _rFullPath, u8* _pBuffer, size_t _Max
 	if (pFileInfo->m_FileSize > _MaxBufferSize)
 		return 0;

-	DEBUG_LOG(DISCIO, "Filename: %s. Offset: %llx. Size: %llx",_rFullPath,
+	DEBUG_LOG(DISCIO, "Filename: %s. Offset: %" PRIx64 ". Size: %" PRIx64, _rFullPath,
 		pFileInfo->m_Offset, pFileInfo->m_FileSize);

 	m_rVolume->Read(pFileInfo->m_Offset, pFileInfo->m_FileSize, _pBuffer);
--- a/Source/Core/DiscIO/Src/VolumeCreator.cpp
+++ b/Source/Core/DiscIO/Src/VolumeCreator.cpp
@ -183,8 +183,18 @@ static IVolume* CreateVolumeFromCryptedWiiImage(IBlobReader& _rReader, u32 _Part
 			memset(IV, 0, 16);
 			_rReader.Read(rPartition.Offset + 0x44c, 8, IV);

+			bool usingKoreanKey = false;
+			// Issue: 6813
+			// Magic value is at 0x501f1 (1 byte), i.e. the last byte of the 32-bit word read from 0x501ee
+			// If encrypted with the Korean key, the magic value would be 1
+			// Otherwise it is zero
+			if (Korean && Reader.Read32(0x501ee) != 0)
+			{
+				usingKoreanKey = true;
+			}
+
 			aes_context AES_ctx;
-			aes_setkey_dec(&AES_ctx, (Korean ? g_MasterKeyK : g_MasterKey), 128);
+			aes_setkey_dec(&AES_ctx, (usingKoreanKey ? g_MasterKeyK : g_MasterKey), 128);

 			u8 VolumeKey[16];
 			aes_crypt_cbc(&AES_ctx, AES_DECRYPT, 16, IV, SubKey, VolumeKey);
--- a/Source/Core/DolphinWX/Src/GameListCtrl.cpp
+++ b/Source/Core/DolphinWX/Src/GameListCtrl.cpp
@ -9,6 +9,7 @@
 #include <wx/filename.h>

 #include <algorithm>
+#include <cinttypes>
 #include <memory>

 #include "FileSearch.h"
@ -383,7 +384,7 @@ wxString NiceSizeFormat(u64 _size)
 	auto const value = (_size + unit_size / 2) / unit_size;
 	auto const frac = (_size % unit_size * 10 + unit_size / 2) / unit_size % 10;

-	return StrToWxStr(StringFromFormat("%llu.%llu %s", value, frac, unit_symbols[unit]));
+	return StrToWxStr(StringFromFormat("%" PRIu64 ".%" PRIu64 " %s", value, frac, unit_symbols[unit]));
 }

 void CGameListCtrl::InsertItemInReportView(long _Index)
--- a/Source/Core/DolphinWX/Src/ISOFile.cpp
+++ b/Source/Core/DolphinWX/Src/ISOFile.cpp
@ -188,7 +188,7 @@ std::string GameListItem::CreateCacheFilename()

 	// Filename.extension_HashOfFolderPath_Size.cache
 	// Append hash to prevent ISO name-clashing in different folders.
-	Filename.append(StringFromFormat("%s_%x_%llx.cache",
+	Filename.append(StringFromFormat("%s_%x_%zx.cache",
 		extension.c_str(), HashFletcher((const u8 *)LegalPathname.c_str(), LegalPathname.size()),
 		File::GetSize(m_FileName)));

--- a/Source/Core/DolphinWX/Src/ISOProperties.cpp
+++ b/Source/Core/DolphinWX/Src/ISOProperties.cpp
@ -7,6 +7,7 @@
 #endif

 #include <type_traits>
+#include <cinttypes>

 #include "Common.h"
 #include "CommonPaths.h"
@ -118,7 +119,7 @@ CISOProperties::CISOProperties(const std::string fileName, wxWindow* parent, wxW
 		u8 _tTitleID[8];
 		if(OpenISO->GetTitleID(_tTitleID))
 		{
-			snprintf(tmp, 17, "%016llx", Common::swap64(_tTitleID));
+			snprintf(tmp, 17, "%016" PRIx64, Common::swap64(_tTitleID));
 			_iniFilename = tmp;
 		}
 	}
--- a/Source/Core/DolphinWX/Src/MainAndroid.cpp
+++ b/Source/Core/DolphinWX/Src/MainAndroid.cpp
@ -278,10 +278,22 @@ JNIEXPORT jstring JNICALL Java_org_dolphinemu_dolphinemu_NativeLibrary_GetTitle(
 	env->ReleaseStringUTFChars(jFile, File);
 	return env->NewStringUTF(Name.c_str());
 }
+
 JNIEXPORT jstring JNICALL Java_org_dolphinemu_dolphinemu_NativeLibrary_GetVersionString(JNIEnv *env, jobject obj)
 {
 	return env->NewStringUTF(scm_rev_str);
 }
+
+JNIEXPORT jboolean JNICALL Java_org_dolphinemu_dolphinemu_NativeLibrary_SupportsNEON(JNIEnv *env, jobject obj)
+{
+	return cpu_info.bNEON;
+}
+
+JNIEXPORT void JNICALL Java_org_dolphinemu_dolphinemu_NativeLibrary_SaveScreenShot(JNIEnv *env, jobject obj)
+{
+	Core::SaveScreenShot();
+}
+
 JNIEXPORT jstring JNICALL Java_org_dolphinemu_dolphinemu_NativeLibrary_GetConfig(JNIEnv *env, jobject obj, jstring jFile, jstring jKey, jstring jValue, jstring jDefault)
 {
 	IniFile ini;
--- a/Source/Core/DolphinWX/Src/MemoryCards/WiiSaveCrypted.cpp
+++ b/Source/Core/DolphinWX/Src/MemoryCards/WiiSaveCrypted.cpp
@ -7,12 +7,14 @@
 // Licensed under the terms of the GNU GPL, version 2
 // http://www.gnu.org/licenses/old-licenses/gpl-2.0.txt

+#include <algorithm>
+#include <cinttypes>
+
 #include "WiiSaveCrypted.h"
 #include "FileUtil.h"
 #include "MathUtil.h"
 #include "NandPaths.h"
 #include "FileUtil.h"
-#include <algorithm>

 static Common::replace_v replacements;

@ -152,7 +154,7 @@ void CWiiSaveCrypted::ReadHDR()
 	md5((u8*)&_header, HEADER_SZ, md5_calc);
 	if (memcmp(md5_file, md5_calc, 0x10))
 	{
-		PanicAlertT("MD5 mismatch\n %016llx%016llx != %016llx%016llx", Common::swap64(md5_file),Common::swap64(md5_file+8), Common::swap64(md5_calc), Common::swap64(md5_calc+8));
+		PanicAlertT("MD5 mismatch\n %016" PRIx64 "%016" PRIx64 " != %016" PRIx64 "%016" PRIx64, Common::swap64(md5_file),Common::swap64(md5_file+8), Common::swap64(md5_calc), Common::swap64(md5_calc+8));
 		b_valid= false;
 	}

@ -244,7 +246,7 @@ void CWiiSaveCrypted::ReadBKHDR()
 	if (_sizeOfFiles + FULL_CERT_SZ != _totalSize)
 		WARN_LOG(CONSOLE, "Size(%x) + cert(%x) does not equal totalsize(%x)", _sizeOfFiles, FULL_CERT_SZ, _totalSize);
 	if (m_TitleID != Common::swap64(bkhdr.SaveGameTitle))
-		WARN_LOG(CONSOLE, "Encrypted title (%llx) does not match unencrypted title (%llx)", m_TitleID,  Common::swap64(bkhdr.SaveGameTitle));
+		WARN_LOG(CONSOLE, "Encrypted title (%" PRIx64 ") does not match unencrypted title (%" PRIx64 ")", m_TitleID,  Common::swap64(bkhdr.SaveGameTitle));
 }

 void CWiiSaveCrypted::WriteBKHDR()
--- a/Source/Core/DolphinWX/Src/WiimoteConfigDiag.cpp
+++ b/Source/Core/DolphinWX/Src/WiimoteConfigDiag.cpp
@ -3,6 +3,7 @@
 #include "HW/Wiimote.h"
 #include "HW/WiimoteReal/WiimoteReal.h"
 #include "Frame.h"
+#include "NetPlayProto.h"

 WiimoteConfigDiag::WiimoteConfigDiag(wxWindow* const parent, InputPlugin& plugin)
 	: wxDialog(parent, -1, _("Dolphin Wiimote Configuration"), wxDefaultPosition, wxDefaultSize)
@ -134,6 +135,16 @@ WiimoteConfigDiag::WiimoteConfigDiag(wxWindow* const parent, InputPlugin& plugin
 		WiimoteSpkVolumeText->Disable();
 		WiimoteSpkVolumeMinText->Disable();
 		WiimoteSpkVolumeMaxText->Disable();
+		if (NetPlay::IsNetPlayRunning())
+		{
+			bb_source->Disable();
+			for (int i = 0; i < 4; ++i)
+			{
+				wiimote_label[i]->Disable();
+				wiimote_source_ch[i]->Disable();
+			}
+		}
+
 	}


--- a/Source/Core/InputCommon/Src/ControllerInterface/SDL/SDL.cpp
+++ b/Source/Core/InputCommon/Src/ControllerInterface/SDL/SDL.cpp
@ -154,12 +154,13 @@ Joystick::~Joystick()
 	{
 		// stop/destroy all effects
 		SDL_HapticStopAll(m_haptic);
-		std::list<EffectIDState>::iterator
-			i = m_state_out.begin(),
-			e = m_state_out.end();
-		for ( ; i != e; ++i)
-			if (i->id != -1)
-				SDL_HapticDestroyEffect(m_haptic, i->id);
+		for (auto &i : m_state_out)
+		{
+			if (i.id != -1)
+			{
+				SDL_HapticDestroyEffect(m_haptic, i.id);
+			}
+		}
 		// close haptic first
 		SDL_HapticClose(m_haptic);
 	}
@ -210,7 +211,7 @@ void Joystick::ConstantEffect::SetState(ControlState state)
 	}

 	const Sint16 old = m_effect.effect.constant.level;
-	m_effect.effect.constant.level = state * 0x7FFF;
+	m_effect.effect.constant.level = (Sint16)(state * 0x7FFF);
 	if (old != m_effect.effect.constant.level)
 		m_effect.changed = true;
 }
@ -228,7 +229,7 @@ void Joystick::RampEffect::SetState(ControlState state)
 	}

 	const Sint16 old = m_effect.effect.ramp.start;
-	m_effect.effect.ramp.start = state * 0x7FFF;
+	m_effect.effect.ramp.start = (Sint16)(state * 0x7FFF);
 	if (old != m_effect.effect.ramp.start)
 		m_effect.changed = true;
 }
@ -247,7 +248,7 @@ void Joystick::SineEffect::SetState(ControlState state)

 	const Sint16 old = m_effect.effect.periodic.magnitude;
 	m_effect.effect.periodic.period = 5;
-	m_effect.effect.periodic.magnitude = state * 0x5000;
+	m_effect.effect.periodic.magnitude = (Sint16)(state * 0x5000);
 	m_effect.effect.periodic.attack_length = 0;
 	m_effect.effect.periodic.fade_length = 500;

@ -293,7 +294,7 @@ void Joystick::TriangleEffect::SetState(ControlState state)

 	const Sint16 old = m_effect.effect.periodic.magnitude;
 	m_effect.effect.periodic.period = 5;
-	m_effect.effect.periodic.magnitude = state * 0x5000;
+	m_effect.effect.periodic.magnitude = (Sint16)(state * 0x5000);
 	m_effect.effect.periodic.attack_length = 0;
 	m_effect.effect.periodic.fade_length = 100;

@ -313,34 +314,35 @@ bool Joystick::UpdateInput()
 bool Joystick::UpdateOutput()
 {
 #ifdef USE_SDL_HAPTIC
-	std::list<EffectIDState>::iterator
-		i = m_state_out.begin(),
-		e = m_state_out.end();
-	for ( ; i != e; ++i)
+	for (auto &i : m_state_out)
 	{
-		if (i->changed)	// if SetState was called on this output
+		if (i.changed)	// if SetState was called on this output
 		{
-			if (-1 == i->id)	// effect isn't currently uploaded
+			if (-1 == i.id)	// effect isn't currently uploaded
 			{
-				if (i->effect.type)		// if outputstate is >0  this would be true
-					if ((i->id = SDL_HapticNewEffect( m_haptic, &i->effect )) > -1)	// upload the effect
-						SDL_HapticRunEffect(m_haptic, i->id, 1);	// run the effect
+				if (i.effect.type)		// if outputstate is >0  this would be true
+				{
+					if ((i.id = SDL_HapticNewEffect(m_haptic, &i.effect)) > -1)	// upload the effect
+					{
+						SDL_HapticRunEffect(m_haptic, i.id, 1);	// run the effect
+					}
+				}
 			}
 			else	// effect is already uploaded
 			{
-				if (i->effect.type)	// if ouputstate >0
+				if (i.effect.type)	// if outputstate >0
 				{
-					SDL_HapticUpdateEffect(m_haptic, i->id, &i->effect);	// update the effect
+					SDL_HapticUpdateEffect(m_haptic, i.id, &i.effect);	// update the effect
 				}
 				else
 				{
-					SDL_HapticStopEffect(m_haptic, i->id);	// else, stop and remove the effect
-					SDL_HapticDestroyEffect(m_haptic, i->id);
-					i->id = -1;	// mark it as not uploaded
+					SDL_HapticStopEffect(m_haptic, i.id);	// else, stop and remove the effect
+					SDL_HapticDestroyEffect(m_haptic, i.id);
+					i.id = -1;	// mark it as not uploaded
 				}
 			}

-			i->changed = false;
+			i.changed = false;
 		}
 	}
 #endif
--- a/Source/Core/VideoBackends/D3D/Src/PerfQuery.cpp
+++ b/Source/Core/VideoBackends/D3D/Src/PerfQuery.cpp
@ -116,7 +116,7 @@ void PerfQuery::FlushOne()
 	}

 	// NOTE: Reported pixel metrics should be referenced to native resolution
-	m_results[entry.query_type] += (u64)result * EFB_WIDTH / g_renderer->GetTargetWidth() * EFB_HEIGHT / g_renderer->GetTargetHeight();
+	m_results[entry.query_type] += (u32)(result * EFB_WIDTH / g_renderer->GetTargetWidth() * EFB_HEIGHT / g_renderer->GetTargetHeight());

 	m_query_read_pos = (m_query_read_pos + 1) % ArraySize(m_query_buffer);
 	--m_query_count;
@ -147,7 +147,7 @@ void PerfQuery::WeakFlush()
 		if (hr == S_OK)
 		{
 			// NOTE: Reported pixel metrics should be referenced to native resolution
-			m_results[entry.query_type] += (u64)result * EFB_WIDTH / g_renderer->GetTargetWidth() * EFB_HEIGHT / g_renderer->GetTargetHeight();
+			m_results[entry.query_type] += (u32)(result * EFB_WIDTH / g_renderer->GetTargetWidth() * EFB_HEIGHT / g_renderer->GetTargetHeight());

 			m_query_read_pos = (m_query_read_pos + 1) % ArraySize(m_query_buffer);
 			--m_query_count;
--- a/Source/Core/VideoBackends/D3D/Src/Render.cpp
+++ b/Source/Core/VideoBackends/D3D/Src/Render.cpp
@ -2,7 +2,8 @@
 // Licensed under GPLv2
 // Refer to the license.txt file included.

-#include <math.h>
+#include <cinttypes>
+#include <cmath>

 #include "Timer.h"

@ -33,6 +34,7 @@
 #include "FPSCounter.h"
 #include "ConfigManager.h"
 #include <strsafe.h>
+#include "ImageWrite.h"

 namespace DX11
 {
@ -680,7 +682,7 @@ void Renderer::SetBlendMode(bool forceUpdate)
 	}
 }

-void Renderer::TakeScreenshot(const TargetRectangle &rc, std::string filename)
+bool Renderer::SaveScreenshot(const std::string &filename, const TargetRectangle& rc)
 {
 	if (!s_screenshot_texture)
 		CreateScreenshotTexture(rc);
@ -689,26 +691,25 @@ void Renderer::TakeScreenshot(const TargetRectangle &rc, std::string filename)
 	D3D11_BOX box = CD3D11_BOX(rc.left, rc.top, 0, rc.right, rc.bottom, 1);
 	D3D::context->CopySubresourceRegion(s_screenshot_texture, 0, 0, 0, 0, (ID3D11Resource*)D3D::GetBackBuffer()->GetTex(), 0, &box);

-	u8* __restrict dest = (u8*) malloc(rc.GetWidth() * rc.GetHeight() * 3);
-
 	D3D11_MAPPED_SUBRESOURCE map;
 	D3D::context->Map(s_screenshot_texture, 0, D3D11_MAP_READ_WRITE, 0, &map);
-	u8* src = (u8*) map.pData;
-	for (int y = 0; y < rc.GetHeight(); ++y)
-	{
-		u8* __restrict row = src;
-		for (int x = 0; x < rc.GetWidth(); ++x)
-		{
-			*dest++ = *row++;
-			*dest++ = *row++;
-			*dest++ = *row++;
-			row++;
-		}
-		src += map.RowPitch;
-	}
+
+	bool saved_png = TextureToPng((u8*)map.pData, map.RowPitch, filename, rc.GetWidth(), rc.GetHeight(), false);
+
 	D3D::context->Unmap(s_screenshot_texture, 0);

-	SaveScreenshot(dest, rc.GetWidth(), rc.GetHeight(), filename);
+
+	if (saved_png)
+	{
+		OSD::AddMessage(StringFromFormat("Saved %i x %i %s", rc.GetWidth(),
+		                                 rc.GetHeight(), filename.c_str()));
+	}
+	else
+	{
+		OSD::AddMessage(StringFromFormat("Error saving %s", filename.c_str()));
+	}
+
+	return saved_png;
 }

 void formatBufferDump(const u8* in, u8* out, int w, int h, int p)
@ -846,7 +847,7 @@ void Renderer::Swap(u32 xfbAddr, u32 fbWidth, u32 fbHeight,const EFBRectangle& r
 	// done with drawing the game stuff, good moment to save a screenshot
 	if (s_bScreenshot)
 	{
-		TakeScreenshot(GetTargetRectangle(), s_sScreenshotName);
+		SaveScreenshot(s_sScreenshotName, GetTargetRectangle());
 		s_bScreenshot = false;
 	}

@ -921,7 +922,7 @@ void Renderer::Swap(u32 xfbAddr, u32 fbWidth, u32 fbHeight,const EFBRectangle& r
 	if (SConfig::GetInstance().m_ShowLag)
 	{
 		char lag[10];
-		StringCchPrintfA(lag, 10, "Lag: %llu\n", Movie::g_currentLagCount);
+		StringCchPrintfA(lag, 10, "Lag: %" PRIu64 "\n", Movie::g_currentLagCount);
 		D3D::font.DrawTextScaled(0, 18, 20, 0.0f, 0xFF00FFFF, lag);
 	}

--- a/Source/Core/VideoBackends/D3D/Src/Render.h
+++ b/Source/Core/VideoBackends/D3D/Src/Render.h
@ -48,10 +48,9 @@ public:

 	void UpdateViewport();

-	static void TakeScreenshot(const TargetRectangle &rc, std::string filename);
+	bool SaveScreenshot(const std::string &filename, const TargetRectangle &rc);

 	static bool CheckForResize();
-
 };

 }
--- a/Source/Core/VideoBackends/D3D/Src/TextureCache.cpp
+++ b/Source/Core/VideoBackends/D3D/Src/TextureCache.cpp
@ -14,6 +14,7 @@
 #include "PSTextureEncoder.h"
 #include "HW/Memmap.h"
 #include "VideoConfig.h"
+#include "ImageWrite.h"

 namespace DX11
 {
@ -32,7 +33,7 @@ void TextureCache::TCacheEntry::Bind(unsigned int stage)
 	D3D::context->PSSetShaderResources(stage, 1, &texture->GetSRV());
 }

-bool TextureCache::TCacheEntry::Save(const char filename[], unsigned int level)
+bool TextureCache::TCacheEntry::Save(const std::string filename, unsigned int level)
 {
 	// TODO: Somehow implement this (D3DX11 doesn't support dumping individual LODs)
 	static bool warn_once = true;
@ -42,8 +43,35 @@ bool TextureCache::TCacheEntry::Save(const char filename[], unsigned int level)
 		warn_once = false;
 		return false;
 	}
-	//return SUCCEEDED(PD3DX11SaveTextureToFileA(D3D::context, texture->GetTex(), D3DX11_IFF_PNG, filename));
-	return true;
+
+	ID3D11Texture2D* pNewTexture = NULL;
+	ID3D11Texture2D* pSurface = texture->GetTex();
+	D3D11_TEXTURE2D_DESC desc;
+	pSurface->GetDesc(&desc);
+
+	desc.BindFlags = 0;
+	desc.CPUAccessFlags = D3D11_CPU_ACCESS_READ | D3D11_CPU_ACCESS_WRITE;
+	desc.Usage = D3D11_USAGE_STAGING;
+
+	HRESULT hr = D3D::device->CreateTexture2D(&desc, NULL, &pNewTexture);
+
+	bool saved_png = false;
+
+	if (SUCCEEDED(hr) && pNewTexture)
+	{
+		D3D::context->CopyResource(pNewTexture, pSurface);
+
+		D3D11_MAPPED_SUBRESOURCE map;
+		HRESULT hr = D3D::context->Map(pNewTexture, 0, D3D11_MAP_READ_WRITE, 0, &map);
+		if (SUCCEEDED(hr))
+		{
+			saved_png = TextureToPng((u8*)map.pData, map.RowPitch, filename, desc.Width, desc.Height);
+			D3D::context->Unmap(pNewTexture, 0);
+		}
+		SAFE_RELEASE(pNewTexture);
+	}
+
+	return saved_png;
 }

 void TextureCache::TCacheEntry::Load(unsigned int width, unsigned int height,
--- a/Source/Core/VideoBackends/D3D/Src/TextureCache.h
+++ b/Source/Core/VideoBackends/D3D/Src/TextureCache.h
@ -36,7 +36,7 @@ private:
 			const float *colmat);

 		void Bind(unsigned int stage);
-		bool Save(const char filename[], unsigned int level);
+		bool Save(const std::string filename, unsigned int level);
 	};

 	TCacheEntryBase* CreateTexture(unsigned int width, unsigned int height,
--- a/Source/Core/VideoBackends/D3D/Src/VertexManager.cpp
+++ b/Source/Core/VideoBackends/D3D/Src/VertexManager.cpp
@ -222,9 +222,9 @@ void VertexManager::vFlush()
 				tex.texImage0[i&3].width + 1, tex.texImage0[i&3].height + 1,
 				tex.texImage0[i&3].format, tex.texTlut[i&3].tmem_offset<<9,
 				tex.texTlut[i&3].tlut_format,
-				(tex.texMode0[i&3].min_filter & 3),
+				((tex.texMode0[i&3].min_filter & 3) != 0),
 				(tex.texMode1[i&3].max_lod + 0xf) / 0x10,
-				tex.texImage1[i&3].image_type);
+				(tex.texImage1[i&3].image_type != 0));

 			if (tentry)
 			{
--- a/Source/Core/VideoBackends/OGL/Src/RasterFont.cpp
+++ b/Source/Core/VideoBackends/OGL/Src/RasterFont.cpp
@ -144,7 +144,7 @@ RasterFont::RasterFont()
 	for(u32 y=0; y<char_height; y++) {
 		for(u32 c=0; c<char_count; c++) {
 			for(u32 x=0; x<char_width; x++) {
-				bool pixel = rasters[c][y] & (1<<(char_width-x-1));
+				bool pixel = (0 != (rasters[c][y] & (1<<(char_width-x-1))));
 				texture_data[char_width*char_count*y+char_width*c+x] = pixel ? -1 : 0;
 			}
 		}
--- a/Source/Core/VideoBackends/OGL/Src/Render.cpp
+++ b/Source/Core/VideoBackends/OGL/Src/Render.cpp
@ -9,8 +9,12 @@
 #include <vector>
 #include <cmath>
 #include <cstdio>
+#include <cinttypes>

 #include "GLUtil.h"
+#if defined(HAVE_WX) && HAVE_WX
+#include "WxUtils.h"
+#endif

 #include "FileUtil.h"

@ -22,6 +26,7 @@
 #include "DriverDetails.h"
 #include "VideoConfig.h"
 #include "Statistics.h"
+#include "ImageWrite.h"
 #include "PixelEngine.h"
 #include "Render.h"
 #include "BPStructs.h"
@ -69,7 +74,6 @@ void VideoConfig::UpdateProjectionHack()
 	::UpdateProjectionHack(g_Config.iPhackvalue, g_Config.sPhackvalue);
 }

-
 int OSDInternalW, OSDInternalH;

 namespace OGL
@ -108,6 +112,10 @@ static u32 s_blendMode;

 static bool s_vsync;

+#if defined(HAVE_WX) && HAVE_WX
+static std::thread scrshotThread;
+#endif
+
 // EFB cache related
 static const u32 EFB_CACHE_RECT_SIZE = 64; // Cache 64x64 blocks.
 static const u32 EFB_CACHE_WIDTH = (EFB_WIDTH + EFB_CACHE_RECT_SIZE - 1) / EFB_CACHE_RECT_SIZE; // round up
@ -461,19 +469,23 @@ Renderer::Renderer()

 	}

-	g_Config.backend_info.bSupportsDualSourceBlend = GLEW_ARB_blend_func_extended;
-	g_Config.backend_info.bSupportsGLSLUBO = GLEW_ARB_uniform_buffer_object;
-	g_Config.backend_info.bSupportsPrimitiveRestart = GLEW_VERSION_3_1 || GLEW_NV_primitive_restart;
-	g_Config.backend_info.bSupportsEarlyZ = GLEW_ARB_shader_image_load_store;
+#define TO_BOOL(c) (0 != (c))

-	g_ogl_config.bSupportsGLSLCache = GLEW_ARB_get_program_binary;
-	g_ogl_config.bSupportsGLPinnedMemory = GLEW_AMD_pinned_memory;
-	g_ogl_config.bSupportsGLSync = GLEW_ARB_sync;
-	g_ogl_config.bSupportsGLBaseVertex = GLEW_ARB_draw_elements_base_vertex;
-	g_ogl_config.bSupportCoverageMSAA = GLEW_NV_framebuffer_multisample_coverage;
-	g_ogl_config.bSupportSampleShading = GLEW_ARB_sample_shading;
-	g_ogl_config.bSupportOGL31 = GLEW_VERSION_3_1;
-	g_ogl_config.bSupportViewportFloat = GLEW_ARB_viewport_array;
+	g_Config.backend_info.bSupportsDualSourceBlend = TO_BOOL(GLEW_ARB_blend_func_extended);
+	g_Config.backend_info.bSupportsGLSLUBO = TO_BOOL(GLEW_ARB_uniform_buffer_object);
+	g_Config.backend_info.bSupportsPrimitiveRestart = TO_BOOL(GLEW_VERSION_3_1) || TO_BOOL(GLEW_NV_primitive_restart);
+	g_Config.backend_info.bSupportsEarlyZ = TO_BOOL(GLEW_ARB_shader_image_load_store);
+
+	g_ogl_config.bSupportsGLSLCache = TO_BOOL(GLEW_ARB_get_program_binary);
+	g_ogl_config.bSupportsGLPinnedMemory = TO_BOOL(GLEW_AMD_pinned_memory);
+	g_ogl_config.bSupportsGLSync = TO_BOOL(GLEW_ARB_sync);
+	g_ogl_config.bSupportsGLBaseVertex = TO_BOOL(GLEW_ARB_draw_elements_base_vertex);
+	g_ogl_config.bSupportCoverageMSAA = TO_BOOL(GLEW_NV_framebuffer_multisample_coverage);
+	g_ogl_config.bSupportSampleShading = TO_BOOL(GLEW_ARB_sample_shading);
+	g_ogl_config.bSupportOGL31 = TO_BOOL(GLEW_VERSION_3_1);
+	g_ogl_config.bSupportViewportFloat = TO_BOOL(GLEW_ARB_viewport_array);
+
+#undef TO_BOOL

 	if(strstr(g_ogl_config.glsl_version, "1.00") || strstr(g_ogl_config.glsl_version, "1.10") || strstr(g_ogl_config.glsl_version, "1.20"))
 	{
@ -616,6 +628,11 @@ Renderer::Renderer()

 Renderer::~Renderer()
 {
+#if defined(HAVE_WX) && HAVE_WX
+	// If the screenshot thread object is still joinable, block until it has
+	// finished before the renderer is torn down.
+	if (scrshotThread.joinable())
+	{
+		scrshotThread.join();
+	}
+#endif
 }

 void Renderer::Shutdown()
@ -681,7 +698,7 @@ void Renderer::DrawDebugInfo()
 		p+=sprintf(p, "FPS: %d\n", s_fps);

 	if (SConfig::GetInstance().m_ShowLag)
-		p+=sprintf(p, "Lag: %llu\n", Movie::g_currentLagCount);
+		p+=sprintf(p, "Lag: %" PRIu64 "\n", Movie::g_currentLagCount);

 	if (g_ActiveConfig.bShowInputDisplay)
 		p+=sprintf(p, "%s", Movie::GetInputDisplay().c_str());
@ -1391,9 +1408,11 @@ void Renderer::Swap(u32 xfbAddr, u32 fbWidth, u32 fbHeight,const EFBRectangle& r
 	// Save screenshot
 	if (s_bScreenshot)
 	{
-		TakeScreenshot(flipped_trc, s_sScreenshotName);
-		s_bScreenshot = false;
+		std::lock_guard<std::mutex> lk(s_criticalScreenshot);
+		SaveScreenshot(s_sScreenshotName, flipped_trc);
 		// Reset settings
+		s_sScreenshotName.clear();
+		s_bScreenshot = false;
 	}

 	// Frame dumps are handled a little differently in Windows
@ -1608,7 +1627,7 @@ void Renderer::Swap(u32 xfbAddr, u32 fbWidth, u32 fbHeight,const EFBRectangle& r

 	// For testing zbuffer targets.
 	// Renderer::SetZBufferRender();
-	// SaveTexture("tex.tga", GL_TEXTURE_2D, s_FakeZTarget,
+	// SaveTexture("tex.png", GL_TEXTURE_2D, s_FakeZTarget,
 	//	      GetTargetWidth(), GetTargetHeight());
 	Core::Callback_VideoCopiedToXFB(XFBWrited || (g_ActiveConfig.bUseXFB && g_ActiveConfig.bUseRealXFB));
 	XFBWrited = false;
@ -1769,44 +1788,48 @@ void Renderer::SetInterlacingMode()
 	// TODO
 }

-void Renderer::FlipImageData(u8 *data, int w, int h)
+// Mirrors a tightly packed image top-to-bottom, in place.
+//   data        - w * h pixels stored row after row
+//   w, h        - image size in pixels
+//   pixel_width - number of bytes per pixel
+void Renderer::FlipImageData(u8 *data, int w, int h, int pixel_width)
 {
-	// XXX make this faster
-	u8* __restrict top = data;
-	u8* bot = data + w * h * 3;
-	for (int y = 0; y < h / 2; y++)
+	// A row without any bytes leaves nothing to exchange.
+	if (w <= 0 || pixel_width <= 0)
+		return;
+
+	// Exchange row y with its mirror row (h - 1 - y). For an odd height the
+	// middle row is its own mirror and is never touched.
+	const int row_bytes = w * pixel_width;
+	for (int y = 0; y < h / 2; ++y)
 	{
-		size_t stride = w * 3;
-		bot -= stride;
-		u8* __restrict brow = bot;
-		for(size_t x = 0; x < stride; x++)
+		u8* const upper = data + y * row_bytes;
+		u8* const lower = data + (h - 1 - y) * row_bytes;
+		for (int i = 0; i < row_bytes; ++i)
 		{
-			std::swap(*top++, *brow++);
+			std::swap(upper[i], lower[i]);
 		}
 	}
 }

-void Renderer::TakeScreenshot(const TargetRectangle &back_rc, std::string filename)
+}
+
+namespace OGL
+{
+
+// Reads the back_rc region with glReadPixels as RGBA, flips it vertically and
+// hands it to TextureToPng to be written to filename.
+// Returns false when the GL readback reports an error; otherwise returns
+// whatever TextureToPng returns.
+bool Renderer::SaveScreenshot(const std::string &filename, const TargetRectangle &back_rc)
 {
 	u32 W = back_rc.GetWidth();
 	u32 H = back_rc.GetHeight();
-	u8 *data = (u8 *)malloc((sizeof(u8) * 3 * W * H));
+	// Four bytes per pixel (RGBA). The vector owns the buffer, so every return
+	// path below releases it without a hand-written delete[]; the size is
+	// computed in size_t so it cannot wrap in 32-bit arithmetic.
+	std::vector<u8> data(static_cast<size_t>(W) * 4 * H);
 	glPixelStorei(GL_PACK_ALIGNMENT, 1);

-	glReadPixels(back_rc.left, back_rc.bottom, W, H, GL_RGB, GL_UNSIGNED_BYTE, data);
+	glReadPixels(back_rc.left, back_rc.bottom, W, H, GL_RGBA, GL_UNSIGNED_BYTE, data.data());

 	// Show failure message
 	if (GL_REPORT_ERROR() != GL_NO_ERROR)
 	{
-		free(data);
 		OSD::AddMessage("Error capturing or saving screenshot.", 2000);
-		return;
+		return false;
 	}

 	// Turn image upside down
-	FlipImageData(data, W, H);
+	FlipImageData(data.data(), W, H, 4);
+	// The row stride handed to TextureToPng is W * 4 bytes (tightly packed RGBA).
+	return TextureToPng(data.data(), W*4, filename, W, H, false);

-	SaveScreenshot(data, W, H, filename);
 }

 }
--- a/Source/Core/VideoBackends/OGL/Src/Render.h
+++ b/Source/Core/VideoBackends/OGL/Src/Render.h
@ -63,7 +63,7 @@ public:

 	void RenderText(const char* pstr, int left, int top, u32 color) override;
 	void DrawDebugInfo();
-	static void FlipImageData(u8 *data, int w, int h);
+	void FlipImageData(u8 *data, int w, int h, int pixel_width = 3);

 	u32 AccessEFB(EFBAccessType type, u32 x, u32 y, u32 poke_data) override;

@ -80,7 +80,7 @@ public:

 	void UpdateViewport() override;

-	static void TakeScreenshot(const TargetRectangle &rc, std::string filename);
+	bool SaveScreenshot(const std::string &filename, const TargetRectangle &rc);

 private:
 	void UpdateEFBCache(EFBAccessType type, u32 cacheRectIdx, const EFBRectangle& efbPixelRc, const TargetRectangle& targetPixelRc, const u32* data);
--- a/Source/Core/VideoBackends/OGL/Src/StreamBuffer.cpp
+++ b/Source/Core/VideoBackends/OGL/Src/StreamBuffer.cpp
@ -59,7 +59,7 @@ StreamBuffer::~StreamBuffer()
 	glDeleteBuffers(1, &m_buffer);
 }

-#define SLOT(x) (x)*SYNC_POINTS/m_size
+#define SLOT(x) ((x)*SYNC_POINTS/m_size)

 void StreamBuffer::Alloc ( size_t size, u32 stride )
 {
@ -81,14 +81,14 @@ void StreamBuffer::Alloc ( size_t size, u32 stride )
 	case PINNED_MEMORY:

 		// insert waiting slots for used memory
-		for(u32 i=SLOT(m_used_iterator); i<SLOT(m_iterator); i++)
+		for(size_t i=SLOT(m_used_iterator); i<SLOT(m_iterator); i++)
 		{
 			fences[i] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
 		}
 		m_used_iterator = m_iterator;

 		// wait for new slots to end of buffer
-		for(u32 i=SLOT(m_free_iterator)+1; i<=SLOT(iter_end) && i < SYNC_POINTS; i++)
+		for (size_t i = SLOT(m_free_iterator) + 1; i <= SLOT(iter_end) && i < SYNC_POINTS; i++)
 		{
 			glClientWaitSync(fences[i], GL_SYNC_FLUSH_COMMANDS_BIT, GL_TIMEOUT_IGNORED);
 			glDeleteSync(fences[i]);
@ -99,8 +99,10 @@ void StreamBuffer::Alloc ( size_t size, u32 stride )
 		if(iter_end >= m_size) {

 			// insert waiting slots in unused space at the end of the buffer
-			for(u32 i=SLOT(m_used_iterator); i < SYNC_POINTS; i++)
+			for (size_t i = SLOT(m_used_iterator); i < SYNC_POINTS; i++)
+			{
 				fences[i] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
+			}

 			// move to the start
 			m_used_iterator = m_iterator_aligned = m_iterator = 0; // offset 0 is always aligned
@ -244,10 +246,14 @@ void StreamBuffer::Shutdown()

 void StreamBuffer::DeleteFences()
 {
-	for(u32 i=SLOT(m_free_iterator)+1; i < SYNC_POINTS; i++)
+	// Delete the sync objects stored in the slots after the free iterator's
+	// slot, up to the last sync point ...
+	const size_t tail_begin = SLOT(m_free_iterator) + 1;
+	for (size_t i = tail_begin; i < SYNC_POINTS; i++)
+	{
 		glDeleteSync(fences[i]);
-	for(u32 i=0; i<SLOT(m_iterator); i++)
+	}
+	// ... then the ones in the slots below the current iterator's slot.
+	const size_t head_end = SLOT(m_iterator);
+	for (size_t i = 0; i < head_end; i++)
+	{
 		glDeleteSync(fences[i]);
+	}
 	delete [] fences;
 }

--- a/Source/Core/VideoBackends/OGL/Src/TextureCache.cpp
+++ b/Source/Core/VideoBackends/OGL/Src/TextureCache.cpp
@ -59,15 +59,15 @@ struct VBOCache {
 };
 static std::map<u64,VBOCache> s_VBO;

-bool SaveTexture(const char* filename, u32 textarget, u32 tex, int virtual_width, int virtual_height, unsigned int level)
+bool SaveTexture(const std::string filename, u32 textarget, u32 tex, int virtual_width, int virtual_height, unsigned int level)
 {
 #ifndef USE_GLES3
 	int width = std::max(virtual_width >> level, 1);
 	int height = std::max(virtual_height >> level, 1);
-	std::vector<u32> data(width * height);
+	u8* data = new u8[width * height * 4];
 	glActiveTexture(GL_TEXTURE0+9);
 	glBindTexture(textarget, tex);
-	glGetTexImage(textarget, level, GL_BGRA, GL_UNSIGNED_BYTE, &data[0]);
+	glGetTexImage(textarget, level, GL_RGBA, GL_UNSIGNED_BYTE, data);
 	glBindTexture(textarget, 0);
 	TextureCache::SetStage();

@ -75,10 +75,12 @@ bool SaveTexture(const char* filename, u32 textarget, u32 tex, int virtual_width
 	if (GL_NO_ERROR != err)
 	{
 		PanicAlert("Can't save texture, GL Error: %s", gluErrorString(err));
+		delete[] data;
 		return false;
 	}
-
-	return SaveTGA(filename, width, height, &data[0]);
+	bool success = TextureToPng(data, width * 4, filename, width, height, true);
+	delete[] data;
+	return success;
 #else
 	return false;
 #endif
@ -125,13 +127,9 @@ void TextureCache::TCacheEntry::Bind(unsigned int stage)
 	}
 }

-bool TextureCache::TCacheEntry::Save(const char filename[], unsigned int level)
+// Dumps the given level of this entry's GL_TEXTURE_2D texture through
+// SaveTexture. The filename is forwarded unchanged and SaveTexture's result is
+// returned to the caller.
+bool TextureCache::TCacheEntry::Save(const std::string filename, unsigned int level)
 {
-	// TODO: make ogl dump PNGs
-	std::string tga_filename(filename);
-	tga_filename.replace(tga_filename.size() - 3, 3, "tga");
-
-	return SaveTexture(tga_filename.c_str(), GL_TEXTURE_2D, texture, virtual_width, virtual_height, level);
+	const bool saved = SaveTexture(filename, GL_TEXTURE_2D, texture, virtual_width, virtual_height, level);
+	return saved;
 }

 TextureCache::TCacheEntryBase* TextureCache::CreateTexture(unsigned int width,
@ -395,8 +393,8 @@ void TextureCache::TCacheEntry::FromRenderTarget(u32 dstAddr, unsigned int dstFo
 	if (g_ActiveConfig.bDumpEFBTarget)
 	{
 		static int count = 0;
-		SaveTexture(StringFromFormat("%sefb_frame_%i.tga", File::GetUserPath(D_DUMPTEXTURES_IDX).c_str(),
-			count++).c_str(), GL_TEXTURE_2D, texture, virtual_width, virtual_height, 0);
+		SaveTexture(StringFromFormat("%sefb_frame_%i.png", File::GetUserPath(D_DUMPTEXTURES_IDX).c_str(),
+			count++), GL_TEXTURE_2D, texture, virtual_width, virtual_height, 0);
 	}

 	g_renderer->RestoreAPIState();
--- a/Show more
+++ b/Show more