39 files changed, 1656 insertions, 1041 deletions
diff --git a/arch/mips/math-emu/Makefile b/arch/mips/math-emu/Makefile
index a19641d3ac23..e9f10b88b695 100644
--- a/arch/mips/math-emu/Makefile
+++ b/arch/mips/math-emu/Makefile
@@ -4,9 +4,11 @@
 
 obj-y	+= cp1emu.o ieee754dp.o ieee754sp.o ieee754.o \
 	   dp_div.o dp_mul.o dp_sub.o dp_add.o dp_fsp.o dp_cmp.o dp_simple.o \
-	   dp_tint.o dp_fint.o dp_maddf.o dp_msubf.o dp_2008class.o dp_fmin.o dp_fmax.o \
+	   dp_tint.o dp_fint.o dp_rint.o dp_maddf.o dp_2008class.o dp_fmin.o \
+	   dp_fmax.o							     \
 	   sp_div.o sp_mul.o sp_sub.o sp_add.o sp_fdp.o sp_cmp.o sp_simple.o \
-	   sp_tint.o sp_fint.o sp_maddf.o sp_msubf.o sp_2008class.o sp_fmin.o sp_fmax.o \
+	   sp_tint.o sp_fint.o sp_rint.o sp_maddf.o sp_2008class.o sp_fmin.o \
+	   sp_fmax.o							     \
 	   dsemul.o
 
 lib-y	+= ieee754d.o \
diff --git a/arch/mips/math-emu/cp1emu.c b/arch/mips/math-emu/cp1emu.c
index 89d05de8040a..ebb5e3bfe12a 100644
--- a/arch/mips/math-emu/cp1emu.c
+++ b/arch/mips/math-emu/cp1emu.c
@@ -434,12 +434,14 @@ static int microMIPS32_to_MIPS32(union mips_instruction *insn_ptr)
  * a single subroutine should be used across both
  * modules.
  */
-static int isBranchInstr(struct pt_regs *regs, struct mm_decoded_insn dec_insn,
-			 unsigned long *contpc)
+int isBranchInstr(struct pt_regs *regs, struct mm_decoded_insn dec_insn,
+		  unsigned long *contpc)
 {
 	union mips_instruction insn = (union mips_instruction)dec_insn.insn;
 	unsigned int fcr31;
 	unsigned int bit = 0;
+	unsigned int bit0;
+	union fpureg *fpr;
 
 	switch (insn.i_format.opcode) {
 	case spec_op:
@@ -450,7 +452,7 @@ static int isBranchInstr(struct pt_regs *regs, struct mm_decoded_insn dec_insn,
 					regs->cp0_epc + dec_insn.pc_inc +
 					dec_insn.next_pc_inc;
 			}
-			/* Fall through */
+			/* fall through */
 		case jr_op:
 			/* For R6, JR already emulated in jalr_op */
 			if (NO_R6EMU && insn.r_format.func == jr_op)
@@ -470,10 +472,11 @@ static int isBranchInstr(struct pt_regs *regs, struct mm_decoded_insn dec_insn,
 			regs->regs[31] = regs->cp0_epc +
 				dec_insn.pc_inc +
 				dec_insn.next_pc_inc;
-			/* Fall through */
+			/* fall through */
 		case bltzl_op:
 			if (NO_R6EMU)
 				break;
+			/* fall through */
 		case bltz_op:
 			if ((long)regs->regs[insn.i_format.rs] < 0)
 				*contpc = regs->cp0_epc +
@@ -493,10 +496,11 @@ static int isBranchInstr(struct pt_regs *regs, struct mm_decoded_insn dec_insn,
 			regs->regs[31] = regs->cp0_epc +
 				dec_insn.pc_inc +
 				dec_insn.next_pc_inc;
-			/* Fall through */
+			/* fall through */
 		case bgezl_op:
 			if (NO_R6EMU)
 				break;
+			/* fall through */
 		case bgez_op:
 			if ((long)regs->regs[insn.i_format.rs] >= 0)
 				*contpc = regs->cp0_epc +
@@ -511,11 +515,12 @@ static int isBranchInstr(struct pt_regs *regs, struct mm_decoded_insn dec_insn,
 		break;
 	case jalx_op:
 		set_isa16_mode(bit);
+		/* fall through */
 	case jal_op:
 		regs->regs[31] = regs->cp0_epc +
 			dec_insn.pc_inc +
 			dec_insn.next_pc_inc;
-		/* Fall through */
+		/* fall through */
 	case j_op:
 		*contpc = regs->cp0_epc + dec_insn.pc_inc;
 		*contpc >>= 28;
@@ -527,6 +532,7 @@ static int isBranchInstr(struct pt_regs *regs, struct mm_decoded_insn dec_insn,
 	case beql_op:
 		if (NO_R6EMU)
 			break;
+		/* fall through */
 	case beq_op:
 		if (regs->regs[insn.i_format.rs] ==
 		    regs->regs[insn.i_format.rt])
@@ -541,6 +547,7 @@ static int isBranchInstr(struct pt_regs *regs, struct mm_decoded_insn dec_insn,
 	case bnel_op:
 		if (NO_R6EMU)
 			break;
+		/* fall through */
 	case bne_op:
 		if (regs->regs[insn.i_format.rs] !=
 		    regs->regs[insn.i_format.rt])
@@ -555,6 +562,7 @@ static int isBranchInstr(struct pt_regs *regs, struct mm_decoded_insn dec_insn,
 	case blezl_op:
 		if (!insn.i_format.rt && NO_R6EMU)
 			break;
+		/* fall through */
 	case blez_op:
 
 		/*
@@ -592,6 +600,7 @@ static int isBranchInstr(struct pt_regs *regs, struct mm_decoded_insn dec_insn,
 	case bgtzl_op:
 		if (!insn.i_format.rt && NO_R6EMU)
 			break;
+		/* fall through */
 	case bgtz_op:
 		/*
 		 * Compact branches for R6 for the
@@ -627,8 +636,8 @@ static int isBranchInstr(struct pt_regs *regs, struct mm_decoded_insn dec_insn,
 				dec_insn.pc_inc +
 				dec_insn.next_pc_inc;
 		return 1;
-	case cbcond0_op:
-	case cbcond1_op:
+	case pop10_op:
+	case pop30_op:
 		if (!cpu_has_mips_r6)
 			break;
 		if (insn.i_format.rt && !insn.i_format.rs)
@@ -683,14 +692,14 @@ static int isBranchInstr(struct pt_regs *regs, struct mm_decoded_insn dec_insn,
 			dec_insn.next_pc_inc;
 
 		return 1;
-	case beqzcjic_op:
+	case pop66_op:
 		if (!cpu_has_mips_r6)
 			break;
 		*contpc = regs->cp0_epc + dec_insn.pc_inc +
 			dec_insn.next_pc_inc;
 
 		return 1;
-	case bnezcjialc_op:
+	case pop76_op:
 		if (!cpu_has_mips_r6)
 			break;
 		if (!insn.i_format.rs)
@@ -707,14 +716,14 @@ static int isBranchInstr(struct pt_regs *regs, struct mm_decoded_insn dec_insn,
 		    ((insn.i_format.rs == bc1eqz_op) ||
 		     (insn.i_format.rs == bc1nez_op))) {
 			bit = 0;
+			fpr = &current->thread.fpu.fpr[insn.i_format.rt];
+			bit0 = get_fpr32(fpr, 0) & 0x1;
 			switch (insn.i_format.rs) {
 			case bc1eqz_op:
-				if (get_fpr32(&current->thread.fpu.fpr[insn.i_format.rt], 0) & 0x1)
-				    bit = 1;
+				bit = bit0 == 0;
 				break;
 			case bc1nez_op:
-				if (!(get_fpr32(&current->thread.fpu.fpr[insn.i_format.rt], 0) & 0x1))
-				    bit = 1;
+				bit = bit0 != 0;
 				break;
 			}
 			if (bit)
@@ -728,7 +737,8 @@ static int isBranchInstr(struct pt_regs *regs, struct mm_decoded_insn dec_insn,
 
 			return 1;
 		}
-		/* R2/R6 compatible cop1 instruction. Fall through */
+		/* R2/R6 compatible cop1 instruction */
+		/* fall through */
 	case cop2_op:
 	case cop1x_op:
 		if (insn.i_format.rs == bc_op) {
@@ -809,7 +819,7 @@ do {									\
 #define SITOREG(si, x)							\
 do {									\
 	if (cop1_64bit(xcp) && !hybrid_fprs()) {			\
-		unsigned i;						\
+		unsigned int i;						\
 		set_fpr32(&ctx->fpr[x], 0, si);				\
 		for (i = 1; i < ARRAY_SIZE(ctx->fpr[x].val32); i++)	\
 			set_fpr32(&ctx->fpr[x], i, 0);			\
@@ -822,19 +832,19 @@ do {									\
 
 #define SITOHREG(si, x)							\
 do {									\
-	unsigned i;							\
+	unsigned int i;							\
 	set_fpr32(&ctx->fpr[x], 1, si);					\
 	for (i = 2; i < ARRAY_SIZE(ctx->fpr[x].val32); i++)		\
 		set_fpr32(&ctx->fpr[x], i, 0);				\
 } while (0)
 
 #define DIFROMREG(di, x)						\
-	((di) = get_fpr64(&ctx->fpr[(x) & ~(cop1_64bit(xcp) == 0)], 0))
+	((di) = get_fpr64(&ctx->fpr[(x) & ~(cop1_64bit(xcp) ^ 1)], 0))
 
 #define DITOREG(di, x)							\
 do {									\
-	unsigned fpr, i;						\
-	fpr = (x) & ~(cop1_64bit(xcp) == 0);				\
+	unsigned int fpr, i;						\
+	fpr = (x) & ~(cop1_64bit(xcp) ^ 1);				\
 	set_fpr64(&ctx->fpr[fpr], 0, di);				\
 	for (i = 1; i < ARRAY_SIZE(ctx->fpr[x].val64); i++)		\
 		set_fpr64(&ctx->fpr[fpr], i, 0);			\
@@ -975,9 +985,10 @@ static int cop1Emulate(struct pt_regs *xcp, struct mips_fpu_struct *ctx,
 		struct mm_decoded_insn dec_insn, void *__user *fault_addr)
 {
 	unsigned long contpc = xcp->cp0_epc + dec_insn.pc_inc;
-	unsigned int cond, cbit;
+	unsigned int cond, cbit, bit0;
 	mips_instruction ir;
 	int likely, pc_inc;
+	union fpureg *fpr;
 	u32 __user *wva;
 	u64 __user *dva;
 	u32 wval;
@@ -1188,15 +1199,18 @@ emul:
 			if (!cpu_has_mips_r6 || delay_slot(xcp))
 				return SIGILL;
 
-			cond = likely = 0;
+			likely = 0;
+			cond = 0;
+			fpr = &current->thread.fpu.fpr[MIPSInst_RT(ir)];
+			bit0 = get_fpr32(fpr, 0) & 0x1;
 			switch (MIPSInst_RS(ir)) {
 			case bc1eqz_op:
-				if (get_fpr32(&current->thread.fpu.fpr[MIPSInst_RT(ir)], 0) & 0x1)
-				    cond = 1;
+				MIPS_FPU_EMU_INC_STATS(bc1eqz);
+				cond = bit0 == 0;
 				break;
 			case bc1nez_op:
-				if (!(get_fpr32(&current->thread.fpu.fpr[MIPSInst_RT(ir)], 0) & 0x1))
-				    cond = 1;
+				MIPS_FPU_EMU_INC_STATS(bc1nez);
+				cond = bit0 != 0;
 				break;
 			}
 			goto branch_common;
@@ -1216,18 +1230,19 @@ emul:
 			case bcfl_op:
 				if (cpu_has_mips_2_3_4_5_r)
 					likely = 1;
-				/* Fall through */
+				/* fall through */
 			case bcf_op:
 				cond = !cond;
 				break;
 			case bctl_op:
 				if (cpu_has_mips_2_3_4_5_r)
 					likely = 1;
-				/* Fall through */
+				/* fall through */
 			case bct_op:
 				break;
 			}
 branch_common:
+			MIPS_FPU_EMU_INC_STATS(branches);
 			set_delay_slot(xcp);
 			if (cond) {
 				/*
@@ -1267,7 +1282,9 @@ branch_common:
 						 * instruction in the dslot.
 						 */
 						sig = mips_dsemul(xcp, ir,
-								  contpc);
+								  bcpc, contpc);
+						if (sig < 0)
+							break;
 						if (sig)
 							xcp->cp0_epc = bcpc;
 						/*
@@ -1320,7 +1337,9 @@ branch_common:
 				 * Single step the non-cp1
 				 * instruction in the dslot
 				 */
-				sig = mips_dsemul(xcp, ir, contpc);
+				sig = mips_dsemul(xcp, ir, bcpc, contpc);
+				if (sig < 0)
+					break;
 				if (sig)
 					xcp->cp0_epc = bcpc;
 				/* SIGILL forces out of the emulation loop.  */
@@ -1344,7 +1363,8 @@ branch_common:
 				return SIGILL;
 
 			/* a real fpu computation instruction */
-			if ((sig = fpu_emu(xcp, ctx, ir)))
+			sig = fpu_emu(xcp, ctx, ir);
+			if (sig)
 				return sig;
 		}
 		break;
@@ -1457,7 +1477,7 @@ DEF3OP(nmsub, dp, ieee754dp_mul, ieee754dp_sub, ieee754dp_neg);
 static int fpux_emu(struct pt_regs *xcp, struct mips_fpu_struct *ctx,
 	mips_instruction ir, void *__user *fault_addr)
 {
-	unsigned rcsr = 0;	/* resulting csr */
+	unsigned int rcsr = 0;	/* resulting csr */
 
 	MIPS_FPU_EMU_INC_STATS(cp1xops);
 
@@ -1653,10 +1673,10 @@ static int fpu_emu(struct pt_regs *xcp, struct mips_fpu_struct *ctx,
 	mips_instruction ir)
 {
 	int rfmt;		/* resulting format */
-	unsigned rcsr = 0;	/* resulting csr */
+	unsigned int rcsr = 0;	/* resulting csr */
 	unsigned int oldrm;
 	unsigned int cbit;
-	unsigned cond;
+	unsigned int cond;
 	union {
 		union ieee754dp d;
 		union ieee754sp s;
@@ -1672,20 +1692,24 @@ static int fpu_emu(struct pt_regs *xcp, struct mips_fpu_struct *ctx,
 			union ieee754sp(*b) (union ieee754sp, union ieee754sp);
 			union ieee754sp(*u) (union ieee754sp);
 		} handler;
-		union ieee754sp fs, ft;
+		union ieee754sp fd, fs, ft;
 
 		switch (MIPSInst_FUNC(ir)) {
 			/* binary ops */
 		case fadd_op:
+			MIPS_FPU_EMU_INC_STATS(add_s);
 			handler.b = ieee754sp_add;
 			goto scopbop;
 		case fsub_op:
+			MIPS_FPU_EMU_INC_STATS(sub_s);
 			handler.b = ieee754sp_sub;
 			goto scopbop;
 		case fmul_op:
+			MIPS_FPU_EMU_INC_STATS(mul_s);
 			handler.b = ieee754sp_mul;
 			goto scopbop;
 		case fdiv_op:
+			MIPS_FPU_EMU_INC_STATS(div_s);
 			handler.b = ieee754sp_div;
 			goto scopbop;
 
@@ -1694,6 +1718,7 @@ static int fpu_emu(struct pt_regs *xcp, struct mips_fpu_struct *ctx,
 			if (!cpu_has_mips_2_3_4_5_r)
 				return SIGILL;
 
+			MIPS_FPU_EMU_INC_STATS(sqrt_s);
 			handler.u = ieee754sp_sqrt;
 			goto scopuop;
 
@@ -1706,6 +1731,7 @@ static int fpu_emu(struct pt_regs *xcp, struct mips_fpu_struct *ctx,
 			if (!cpu_has_mips_4_5_64_r2_r6)
 				return SIGILL;
 
+			MIPS_FPU_EMU_INC_STATS(rsqrt_s);
 			handler.u = fpemu_sp_rsqrt;
 			goto scopuop;
 
@@ -1713,6 +1739,7 @@ static int fpu_emu(struct pt_regs *xcp, struct mips_fpu_struct *ctx,
 			if (!cpu_has_mips_4_5_64_r2_r6)
 				return SIGILL;
 
+			MIPS_FPU_EMU_INC_STATS(recip_s);
 			handler.u = fpemu_sp_recip;
 			goto scopuop;
 
@@ -1749,6 +1776,7 @@ static int fpu_emu(struct pt_regs *xcp, struct mips_fpu_struct *ctx,
 			if (!cpu_has_mips_r6)
 				return SIGILL;
 
+			MIPS_FPU_EMU_INC_STATS(seleqz_s);
 			SPFROMREG(rv.s, MIPSInst_FT(ir));
 			if (rv.w & 0x1)
 				rv.w = 0;
@@ -1760,6 +1788,7 @@ static int fpu_emu(struct pt_regs *xcp, struct mips_fpu_struct *ctx,
 			if (!cpu_has_mips_r6)
 				return SIGILL;
 
+			MIPS_FPU_EMU_INC_STATS(selnez_s);
 			SPFROMREG(rv.s, MIPSInst_FT(ir));
 			if (rv.w & 0x1)
 				SPFROMREG(rv.s, MIPSInst_FS(ir));
@@ -1773,6 +1802,7 @@ static int fpu_emu(struct pt_regs *xcp, struct mips_fpu_struct *ctx,
 			if (!cpu_has_mips_r6)
 				return SIGILL;
 
+			MIPS_FPU_EMU_INC_STATS(maddf_s);
 			SPFROMREG(ft, MIPSInst_FT(ir));
 			SPFROMREG(fs, MIPSInst_FS(ir));
 			SPFROMREG(fd, MIPSInst_FD(ir));
@@ -1786,6 +1816,7 @@ static int fpu_emu(struct pt_regs *xcp, struct mips_fpu_struct *ctx,
 			if (!cpu_has_mips_r6)
 				return SIGILL;
 
+			MIPS_FPU_EMU_INC_STATS(msubf_s);
 			SPFROMREG(ft, MIPSInst_FT(ir));
 			SPFROMREG(fs, MIPSInst_FS(ir));
 			SPFROMREG(fd, MIPSInst_FD(ir));
@@ -1799,9 +1830,9 @@ static int fpu_emu(struct pt_regs *xcp, struct mips_fpu_struct *ctx,
 			if (!cpu_has_mips_r6)
 				return SIGILL;
 
+			MIPS_FPU_EMU_INC_STATS(rint_s);
 			SPFROMREG(fs, MIPSInst_FS(ir));
-			rv.l = ieee754sp_tlong(fs);
-			rv.s = ieee754sp_flong(rv.l);
+			rv.s = ieee754sp_rint(fs);
 			goto copcsr;
 		}
 
@@ -1811,6 +1842,7 @@ static int fpu_emu(struct pt_regs *xcp, struct mips_fpu_struct *ctx,
 			if (!cpu_has_mips_r6)
 				return SIGILL;
 
+			MIPS_FPU_EMU_INC_STATS(class_s);
 			SPFROMREG(fs, MIPSInst_FS(ir));
 			rv.w = ieee754sp_2008class(fs);
 			rfmt = w_fmt;
@@ -1823,6 +1855,7 @@ static int fpu_emu(struct pt_regs *xcp, struct mips_fpu_struct *ctx,
 			if (!cpu_has_mips_r6)
 				return SIGILL;
 
+			MIPS_FPU_EMU_INC_STATS(min_s);
 			SPFROMREG(ft, MIPSInst_FT(ir));
 			SPFROMREG(fs, MIPSInst_FS(ir));
 			rv.s = ieee754sp_fmin(fs, ft);
@@ -1835,6 +1868,7 @@ static int fpu_emu(struct pt_regs *xcp, struct mips_fpu_struct *ctx,
 			if (!cpu_has_mips_r6)
 				return SIGILL;
 
+			MIPS_FPU_EMU_INC_STATS(mina_s);
 			SPFROMREG(ft, MIPSInst_FT(ir));
 			SPFROMREG(fs, MIPSInst_FS(ir));
 			rv.s = ieee754sp_fmina(fs, ft);
@@ -1847,6 +1881,7 @@ static int fpu_emu(struct pt_regs *xcp, struct mips_fpu_struct *ctx,
 			if (!cpu_has_mips_r6)
 				return SIGILL;
 
+			MIPS_FPU_EMU_INC_STATS(max_s);
 			SPFROMREG(ft, MIPSInst_FT(ir));
 			SPFROMREG(fs, MIPSInst_FS(ir));
 			rv.s = ieee754sp_fmax(fs, ft);
@@ -1859,6 +1894,7 @@ static int fpu_emu(struct pt_regs *xcp, struct mips_fpu_struct *ctx,
 			if (!cpu_has_mips_r6)
 				return SIGILL;
 
+			MIPS_FPU_EMU_INC_STATS(maxa_s);
 			SPFROMREG(ft, MIPSInst_FT(ir));
 			SPFROMREG(fs, MIPSInst_FS(ir));
 			rv.s = ieee754sp_fmaxa(fs, ft);
@@ -1866,15 +1902,18 @@ static int fpu_emu(struct pt_regs *xcp, struct mips_fpu_struct *ctx,
 		}
 
 		case fabs_op:
+			MIPS_FPU_EMU_INC_STATS(abs_s);
 			handler.u = ieee754sp_abs;
 			goto scopuop;
 
 		case fneg_op:
+			MIPS_FPU_EMU_INC_STATS(neg_s);
 			handler.u = ieee754sp_neg;
 			goto scopuop;
 
 		case fmov_op:
 			/* an easy one */
+			MIPS_FPU_EMU_INC_STATS(mov_s);
 			SPFROMREG(rv.s, MIPSInst_FS(ir));
 			goto copcsr;
 
@@ -1917,12 +1956,14 @@ copcsr:
 			return SIGILL;	/* not defined */
 
 		case fcvtd_op:
+			MIPS_FPU_EMU_INC_STATS(cvt_d_s);
 			SPFROMREG(fs, MIPSInst_FS(ir));
 			rv.d = ieee754dp_fsp(fs);
 			rfmt = d_fmt;
 			goto copcsr;
 
 		case fcvtw_op:
+			MIPS_FPU_EMU_INC_STATS(cvt_w_s);
 			SPFROMREG(fs, MIPSInst_FS(ir));
 			rv.w = ieee754sp_tint(fs);
 			rfmt = w_fmt;
@@ -1935,6 +1976,15 @@ copcsr:
 			if (!cpu_has_mips_2_3_4_5_r)
 				return SIGILL;
 
+			if (MIPSInst_FUNC(ir) == fceil_op)
+				MIPS_FPU_EMU_INC_STATS(ceil_w_s);
+			if (MIPSInst_FUNC(ir) == ffloor_op)
+				MIPS_FPU_EMU_INC_STATS(floor_w_s);
+			if (MIPSInst_FUNC(ir) == fround_op)
+				MIPS_FPU_EMU_INC_STATS(round_w_s);
+			if (MIPSInst_FUNC(ir) == ftrunc_op)
+				MIPS_FPU_EMU_INC_STATS(trunc_w_s);
+
 			oldrm = ieee754_csr.rm;
 			SPFROMREG(fs, MIPSInst_FS(ir));
 			ieee754_csr.rm = MIPSInst_FUNC(ir);
@@ -1943,10 +1993,23 @@ copcsr:
 			rfmt = w_fmt;
 			goto copcsr;
 
+		case fsel_op:
+			if (!cpu_has_mips_r6)
+				return SIGILL;
+
+			MIPS_FPU_EMU_INC_STATS(sel_s);
+			SPFROMREG(fd, MIPSInst_FD(ir));
+			if (fd.bits & 0x1)
+				SPFROMREG(rv.s, MIPSInst_FT(ir));
+			else
+				SPFROMREG(rv.s, MIPSInst_FS(ir));
+			break;
+
 		case fcvtl_op:
 			if (!cpu_has_mips_3_4_5_64_r2_r6)
 				return SIGILL;
 
+			MIPS_FPU_EMU_INC_STATS(cvt_l_s);
 			SPFROMREG(fs, MIPSInst_FS(ir));
 			rv.l = ieee754sp_tlong(fs);
 			rfmt = l_fmt;
@@ -1959,6 +2022,15 @@ copcsr:
 			if (!cpu_has_mips_3_4_5_64_r2_r6)
 				return SIGILL;
 
+			if (MIPSInst_FUNC(ir) == fceill_op)
+				MIPS_FPU_EMU_INC_STATS(ceil_l_s);
+			if (MIPSInst_FUNC(ir) == ffloorl_op)
+				MIPS_FPU_EMU_INC_STATS(floor_l_s);
+			if (MIPSInst_FUNC(ir) == froundl_op)
+				MIPS_FPU_EMU_INC_STATS(round_l_s);
+			if (MIPSInst_FUNC(ir) == ftruncl_op)
+				MIPS_FPU_EMU_INC_STATS(trunc_l_s);
+
 			oldrm = ieee754_csr.rm;
 			SPFROMREG(fs, MIPSInst_FS(ir));
 			ieee754_csr.rm = MIPSInst_FUNC(ir);
@@ -1969,9 +2041,10 @@ copcsr:
 
 		default:
 			if (!NO_R6EMU && MIPSInst_FUNC(ir) >= fcmp_op) {
-				unsigned cmpop = MIPSInst_FUNC(ir) - fcmp_op;
+				unsigned int cmpop;
 				union ieee754sp fs, ft;
 
+				cmpop = MIPSInst_FUNC(ir) - fcmp_op;
 				SPFROMREG(fs, MIPSInst_FS(ir));
 				SPFROMREG(ft, MIPSInst_FT(ir));
 				rv.w = ieee754sp_cmp(fs, ft,
@@ -1991,7 +2064,7 @@ copcsr:
 	}
 
 	case d_fmt: {
-		union ieee754dp fs, ft;
+		union ieee754dp fd, fs, ft;
 		union {
 			union ieee754dp(*b) (union ieee754dp, union ieee754dp);
 			union ieee754dp(*u) (union ieee754dp);
@@ -2000,15 +2073,19 @@ copcsr:
 		switch (MIPSInst_FUNC(ir)) {
 			/* binary ops */
 		case fadd_op:
+			MIPS_FPU_EMU_INC_STATS(add_d);
 			handler.b = ieee754dp_add;
 			goto dcopbop;
 		case fsub_op:
+			MIPS_FPU_EMU_INC_STATS(sub_d);
 			handler.b = ieee754dp_sub;
 			goto dcopbop;
 		case fmul_op:
+			MIPS_FPU_EMU_INC_STATS(mul_d);
 			handler.b = ieee754dp_mul;
 			goto dcopbop;
 		case fdiv_op:
+			MIPS_FPU_EMU_INC_STATS(div_d);
 			handler.b = ieee754dp_div;
 			goto dcopbop;
 
@@ -2017,6 +2094,7 @@ copcsr:
 			if (!cpu_has_mips_2_3_4_5_r)
 				return SIGILL;
 
+			MIPS_FPU_EMU_INC_STATS(sqrt_d);
 			handler.u = ieee754dp_sqrt;
 			goto dcopuop;
 		/*
@@ -2028,12 +2106,14 @@ copcsr:
 			if (!cpu_has_mips_4_5_64_r2_r6)
 				return SIGILL;
 
+			MIPS_FPU_EMU_INC_STATS(rsqrt_d);
 			handler.u = fpemu_dp_rsqrt;
 			goto dcopuop;
 		case frecip_op:
 			if (!cpu_has_mips_4_5_64_r2_r6)
 				return SIGILL;
 
+			MIPS_FPU_EMU_INC_STATS(recip_d);
 			handler.u = fpemu_dp_recip;
 			goto dcopuop;
 		case fmovc_op:
@@ -2067,6 +2147,7 @@ copcsr:
 			if (!cpu_has_mips_r6)
 				return SIGILL;
 
+			MIPS_FPU_EMU_INC_STATS(seleqz_d);
 			DPFROMREG(rv.d, MIPSInst_FT(ir));
 			if (rv.l & 0x1)
 				rv.l = 0;
@@ -2078,6 +2159,7 @@ copcsr:
 			if (!cpu_has_mips_r6)
 				return SIGILL;
 
+			MIPS_FPU_EMU_INC_STATS(selnez_d);
 			DPFROMREG(rv.d, MIPSInst_FT(ir));
 			if (rv.l & 0x1)
 				DPFROMREG(rv.d, MIPSInst_FS(ir));
@@ -2091,6 +2173,7 @@ copcsr:
 			if (!cpu_has_mips_r6)
 				return SIGILL;
 
+			MIPS_FPU_EMU_INC_STATS(maddf_d);
 			DPFROMREG(ft, MIPSInst_FT(ir));
 			DPFROMREG(fs, MIPSInst_FS(ir));
 			DPFROMREG(fd, MIPSInst_FD(ir));
@@ -2104,6 +2187,7 @@ copcsr:
 			if (!cpu_has_mips_r6)
 				return SIGILL;
 
+			MIPS_FPU_EMU_INC_STATS(msubf_d);
 			DPFROMREG(ft, MIPSInst_FT(ir));
 			DPFROMREG(fs, MIPSInst_FS(ir));
 			DPFROMREG(fd, MIPSInst_FD(ir));
@@ -2117,9 +2201,9 @@ copcsr:
 			if (!cpu_has_mips_r6)
 				return SIGILL;
 
+			MIPS_FPU_EMU_INC_STATS(rint_d);
 			DPFROMREG(fs, MIPSInst_FS(ir));
-			rv.l = ieee754dp_tlong(fs);
-			rv.d = ieee754dp_flong(rv.l);
+			rv.d = ieee754dp_rint(fs);
 			goto copcsr;
 		}
 
@@ -2129,9 +2213,10 @@ copcsr:
 			if (!cpu_has_mips_r6)
 				return SIGILL;
 
+			MIPS_FPU_EMU_INC_STATS(class_d);
 			DPFROMREG(fs, MIPSInst_FS(ir));
-			rv.w = ieee754dp_2008class(fs);
-			rfmt = w_fmt;
+			rv.l = ieee754dp_2008class(fs);
+			rfmt = l_fmt;
 			goto copcsr;
 		}
 
@@ -2141,6 +2226,7 @@ copcsr:
 			if (!cpu_has_mips_r6)
 				return SIGILL;
 
+			MIPS_FPU_EMU_INC_STATS(min_d);
 			DPFROMREG(ft, MIPSInst_FT(ir));
 			DPFROMREG(fs, MIPSInst_FS(ir));
 			rv.d = ieee754dp_fmin(fs, ft);
@@ -2153,6 +2239,7 @@ copcsr:
 			if (!cpu_has_mips_r6)
 				return SIGILL;
 
+			MIPS_FPU_EMU_INC_STATS(mina_d);
 			DPFROMREG(ft, MIPSInst_FT(ir));
 			DPFROMREG(fs, MIPSInst_FS(ir));
 			rv.d = ieee754dp_fmina(fs, ft);
@@ -2165,6 +2252,7 @@ copcsr:
 			if (!cpu_has_mips_r6)
 				return SIGILL;
 
+			MIPS_FPU_EMU_INC_STATS(max_d);
 			DPFROMREG(ft, MIPSInst_FT(ir));
 			DPFROMREG(fs, MIPSInst_FS(ir));
 			rv.d = ieee754dp_fmax(fs, ft);
@@ -2177,6 +2265,7 @@ copcsr:
 			if (!cpu_has_mips_r6)
 				return SIGILL;
 
+			MIPS_FPU_EMU_INC_STATS(maxa_d);
 			DPFROMREG(ft, MIPSInst_FT(ir));
 			DPFROMREG(fs, MIPSInst_FS(ir));
 			rv.d = ieee754dp_fmaxa(fs, ft);
@@ -2184,15 +2273,18 @@ copcsr:
 		}
 
 		case fabs_op:
+			MIPS_FPU_EMU_INC_STATS(abs_d);
 			handler.u = ieee754dp_abs;
 			goto dcopuop;
 
 		case fneg_op:
+			MIPS_FPU_EMU_INC_STATS(neg_d);
 			handler.u = ieee754dp_neg;
 			goto dcopuop;
 
 		case fmov_op:
 			/* an easy one */
+			MIPS_FPU_EMU_INC_STATS(mov_d);
 			DPFROMREG(rv.d, MIPSInst_FS(ir));
 			goto copcsr;
 
@@ -2212,6 +2304,7 @@ dcopuop:
 		 * unary conv ops
 		 */
 		case fcvts_op:
+			MIPS_FPU_EMU_INC_STATS(cvt_s_d);
 			DPFROMREG(fs, MIPSInst_FS(ir));
 			rv.s = ieee754sp_fdp(fs);
 			rfmt = s_fmt;
@@ -2221,6 +2314,7 @@ dcopuop:
 			return SIGILL;	/* not defined */
 
 		case fcvtw_op:
+			MIPS_FPU_EMU_INC_STATS(cvt_w_d);
 			DPFROMREG(fs, MIPSInst_FS(ir));
 			rv.w = ieee754dp_tint(fs);	/* wrong */
 			rfmt = w_fmt;
@@ -2233,6 +2327,15 @@ dcopuop:
 			if (!cpu_has_mips_2_3_4_5_r)
 				return SIGILL;
 
+			if (MIPSInst_FUNC(ir) == fceil_op)
+				MIPS_FPU_EMU_INC_STATS(ceil_w_d);
+			if (MIPSInst_FUNC(ir) == ffloor_op)
+				MIPS_FPU_EMU_INC_STATS(floor_w_d);
+			if (MIPSInst_FUNC(ir) == fround_op)
+				MIPS_FPU_EMU_INC_STATS(round_w_d);
+			if (MIPSInst_FUNC(ir) == ftrunc_op)
+				MIPS_FPU_EMU_INC_STATS(trunc_w_d);
+
 			oldrm = ieee754_csr.rm;
 			DPFROMREG(fs, MIPSInst_FS(ir));
 			ieee754_csr.rm = MIPSInst_FUNC(ir);
@@ -2241,10 +2344,23 @@ dcopuop:
 			rfmt = w_fmt;
 			goto copcsr;
 
+		case fsel_op:
+			if (!cpu_has_mips_r6)
+				return SIGILL;
+
+			MIPS_FPU_EMU_INC_STATS(sel_d);
+			DPFROMREG(fd, MIPSInst_FD(ir));
+			if (fd.bits & 0x1)
+				DPFROMREG(rv.d, MIPSInst_FT(ir));
+			else
+				DPFROMREG(rv.d, MIPSInst_FS(ir));
+			break;
+
 		case fcvtl_op:
 			if (!cpu_has_mips_3_4_5_64_r2_r6)
 				return SIGILL;
 
+			MIPS_FPU_EMU_INC_STATS(cvt_l_d);
 			DPFROMREG(fs, MIPSInst_FS(ir));
 			rv.l = ieee754dp_tlong(fs);
 			rfmt = l_fmt;
@@ -2257,6 +2373,15 @@ dcopuop:
 			if (!cpu_has_mips_3_4_5_64_r2_r6)
 				return SIGILL;
 
+			if (MIPSInst_FUNC(ir) == fceill_op)
+				MIPS_FPU_EMU_INC_STATS(ceil_l_d);
+			if (MIPSInst_FUNC(ir) == ffloorl_op)
+				MIPS_FPU_EMU_INC_STATS(floor_l_d);
+			if (MIPSInst_FUNC(ir) == froundl_op)
+				MIPS_FPU_EMU_INC_STATS(round_l_d);
+			if (MIPSInst_FUNC(ir) == ftruncl_op)
+				MIPS_FPU_EMU_INC_STATS(trunc_l_d);
+
 			oldrm = ieee754_csr.rm;
 			DPFROMREG(fs, MIPSInst_FS(ir));
 			ieee754_csr.rm = MIPSInst_FUNC(ir);
@@ -2267,9 +2392,10 @@ dcopuop:
 
 		default:
 			if (!NO_R6EMU && MIPSInst_FUNC(ir) >= fcmp_op) {
-				unsigned cmpop = MIPSInst_FUNC(ir) - fcmp_op;
+				unsigned int cmpop;
 				union ieee754dp fs, ft;
 
+				cmpop = MIPSInst_FUNC(ir) - fcmp_op;
 				DPFROMREG(fs, MIPSInst_FS(ir));
 				DPFROMREG(ft, MIPSInst_FT(ir));
 				rv.w = ieee754dp_cmp(fs, ft,
@@ -2298,12 +2424,14 @@ dcopuop:
 		switch (MIPSInst_FUNC(ir)) {
 		case fcvts_op:
 			/* convert word to single precision real */
+			MIPS_FPU_EMU_INC_STATS(cvt_s_w);
 			SPFROMREG(fs, MIPSInst_FS(ir));
 			rv.s = ieee754sp_fint(fs.bits);
 			rfmt = s_fmt;
 			goto copcsr;
 		case fcvtd_op:
 			/* convert word to double precision real */
+			MIPS_FPU_EMU_INC_STATS(cvt_d_w);
 			SPFROMREG(fs, MIPSInst_FS(ir));
 			rv.d = ieee754dp_fint(fs.bits);
 			rfmt = d_fmt;
@@ -2323,6 +2451,90 @@ dcopuop:
 			    (MIPSInst_FUNC(ir) & 0x20))
 				return SIGILL;
 
+			if (!sig) {
+				if (!(MIPSInst_FUNC(ir) & PREDICATE_BIT)) {
+					switch (cmpop) {
+					case 0:
+					MIPS_FPU_EMU_INC_STATS(cmp_af_s);
+					break;
+					case 1:
+					MIPS_FPU_EMU_INC_STATS(cmp_un_s);
+					break;
+					case 2:
+					MIPS_FPU_EMU_INC_STATS(cmp_eq_s);
+					break;
+					case 3:
+					MIPS_FPU_EMU_INC_STATS(cmp_ueq_s);
+					break;
+					case 4:
+					MIPS_FPU_EMU_INC_STATS(cmp_lt_s);
+					break;
+					case 5:
+					MIPS_FPU_EMU_INC_STATS(cmp_ult_s);
+					break;
+					case 6:
+					MIPS_FPU_EMU_INC_STATS(cmp_le_s);
+					break;
+					case 7:
+					MIPS_FPU_EMU_INC_STATS(cmp_ule_s);
+					break;
+					}
+				} else {
+					switch (cmpop) {
+					case 1:
+					MIPS_FPU_EMU_INC_STATS(cmp_or_s);
+					break;
+					case 2:
+					MIPS_FPU_EMU_INC_STATS(cmp_une_s);
+					break;
+					case 3:
+					MIPS_FPU_EMU_INC_STATS(cmp_ne_s);
+					break;
+					}
+				}
+			} else {
+				if (!(MIPSInst_FUNC(ir) & PREDICATE_BIT)) {
+					switch (cmpop) {
+					case 0:
+					MIPS_FPU_EMU_INC_STATS(cmp_saf_s);
+					break;
+					case 1:
+					MIPS_FPU_EMU_INC_STATS(cmp_sun_s);
+					break;
+					case 2:
+					MIPS_FPU_EMU_INC_STATS(cmp_seq_s);
+					break;
+					case 3:
+					MIPS_FPU_EMU_INC_STATS(cmp_sueq_s);
+					break;
+					case 4:
+					MIPS_FPU_EMU_INC_STATS(cmp_slt_s);
+					break;
+					case 5:
+					MIPS_FPU_EMU_INC_STATS(cmp_sult_s);
+					break;
+					case 6:
+					MIPS_FPU_EMU_INC_STATS(cmp_sle_s);
+					break;
+					case 7:
+					MIPS_FPU_EMU_INC_STATS(cmp_sule_s);
+					break;
+					}
+				} else {
+					switch (cmpop) {
+					case 1:
+					MIPS_FPU_EMU_INC_STATS(cmp_sor_s);
+					break;
+					case 2:
+					MIPS_FPU_EMU_INC_STATS(cmp_sune_s);
+					break;
+					case 3:
+					MIPS_FPU_EMU_INC_STATS(cmp_sne_s);
+					break;
+					}
+				}
+			}
+
 			/* fmt is w_fmt for single precision so fix it */
 			rfmt = s_fmt;
 			/* default to false */
@@ -2366,6 +2578,7 @@ dcopuop:
 			break;
 			}
 		}
+		break;
 	}
 
 	case l_fmt:
@@ -2378,11 +2591,13 @@ dcopuop:
 		switch (MIPSInst_FUNC(ir)) {
 		case fcvts_op:
 			/* convert long to single precision real */
+			MIPS_FPU_EMU_INC_STATS(cvt_s_l);
 			rv.s = ieee754sp_flong(bits);
 			rfmt = s_fmt;
 			goto copcsr;
 		case fcvtd_op:
 			/* convert long to double precision real */
+			MIPS_FPU_EMU_INC_STATS(cvt_d_l);
 			rv.d = ieee754dp_flong(bits);
 			rfmt = d_fmt;
 			goto copcsr;
@@ -2396,6 +2611,90 @@ dcopuop:
 			    (MIPSInst_FUNC(ir) & 0x20))
 				return SIGILL;
 
+			if (!sig) {
+				if (!(MIPSInst_FUNC(ir) & PREDICATE_BIT)) {
+					switch (cmpop) {
+					case 0:
+					MIPS_FPU_EMU_INC_STATS(cmp_af_d);
+					break;
+					case 1:
+					MIPS_FPU_EMU_INC_STATS(cmp_un_d);
+					break;
+					case 2:
+					MIPS_FPU_EMU_INC_STATS(cmp_eq_d);
+					break;
+					case 3:
+					MIPS_FPU_EMU_INC_STATS(cmp_ueq_d);
+					break;
+					case 4:
+					MIPS_FPU_EMU_INC_STATS(cmp_lt_d);
+					break;
+					case 5:
+					MIPS_FPU_EMU_INC_STATS(cmp_ult_d);
+					break;
+					case 6:
+					MIPS_FPU_EMU_INC_STATS(cmp_le_d);
+					break;
+					case 7:
+					MIPS_FPU_EMU_INC_STATS(cmp_ule_d);
+					break;
+					}
+				} else {
+					switch (cmpop) {
+					case 1:
+					MIPS_FPU_EMU_INC_STATS(cmp_or_d);
+					break;
+					case 2:
+					MIPS_FPU_EMU_INC_STATS(cmp_une_d);
+					break;
+					case 3:
+					MIPS_FPU_EMU_INC_STATS(cmp_ne_d);
+					break;
+					}
+				}
+			} else {
+				if (!(MIPSInst_FUNC(ir) & PREDICATE_BIT)) {
+					switch (cmpop) {
+					case 0:
+					MIPS_FPU_EMU_INC_STATS(cmp_saf_d);
+					break;
+					case 1:
+					MIPS_FPU_EMU_INC_STATS(cmp_sun_d);
+					break;
+					case 2:
+					MIPS_FPU_EMU_INC_STATS(cmp_seq_d);
+					break;
+					case 3:
+					MIPS_FPU_EMU_INC_STATS(cmp_sueq_d);
+					break;
+					case 4:
+					MIPS_FPU_EMU_INC_STATS(cmp_slt_d);
+					break;
+					case 5:
+					MIPS_FPU_EMU_INC_STATS(cmp_sult_d);
+					break;
+					case 6:
+					MIPS_FPU_EMU_INC_STATS(cmp_sle_d);
+					break;
+					case 7:
+					MIPS_FPU_EMU_INC_STATS(cmp_sule_d);
+					break;
+					}
+				} else {
+					switch (cmpop) {
+					case 1:
+					MIPS_FPU_EMU_INC_STATS(cmp_sor_d);
+					break;
+					case 2:
+					MIPS_FPU_EMU_INC_STATS(cmp_sune_d);
+					break;
+					case 3:
+					MIPS_FPU_EMU_INC_STATS(cmp_sne_d);
+					break;
+					}
+				}
+			}
+
 			/* fmt is l_fmt for double precision so fix it */
 			rfmt = d_fmt;
 			/* default to false */
@@ -2439,6 +2738,8 @@ dcopuop:
 			break;
 			}
 		}
+		break;
+
 	default:
 		return SIGILL;
 	}
diff --git a/arch/mips/math-emu/dp_add.c b/arch/mips/math-emu/dp_add.c
index 8954ef031f84..678de20e4cb1 100644
--- a/arch/mips/math-emu/dp_add.c
+++ b/arch/mips/math-emu/dp_add.c
@@ -104,8 +104,7 @@ union ieee754dp ieee754dp_add(union ieee754dp x, union ieee754dp y)
 
 	case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_DNORM):
 		DPDNORMX;
-
-		/* FALL THROUGH */
+		/* fall through */
 
 	case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_DNORM):
 		DPDNORMY;
diff --git a/arch/mips/math-emu/dp_div.c b/arch/mips/math-emu/dp_div.c
index f4746f7c5f63..3063ae3ab3b9 100644
--- a/arch/mips/math-emu/dp_div.c
+++ b/arch/mips/math-emu/dp_div.c
@@ -103,6 +103,7 @@ union ieee754dp ieee754dp_div(union ieee754dp x, union ieee754dp y)
 
 	case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_DNORM):
 		DPDNORMX;
+		/* fall through */
 
 	case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_DNORM):
 		DPDNORMY;
diff --git a/arch/mips/math-emu/dp_fmax.c b/arch/mips/math-emu/dp_fmax.c
index 5bec64f2884e..d1f984b40344 100644
--- a/arch/mips/math-emu/dp_fmax.c
+++ b/arch/mips/math-emu/dp_fmax.c
@@ -96,6 +96,7 @@ union ieee754dp ieee754dp_fmax(union ieee754dp x, union ieee754dp y)
 
 	case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_DNORM):
 		DPDNORMX;
+		/* fall through */
 
 	case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_DNORM):
 		DPDNORMY;
@@ -224,6 +225,7 @@ union ieee754dp ieee754dp_fmaxa(union ieee754dp x, union ieee754dp y)
 
 	case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_DNORM):
 		DPDNORMX;
+		/* fall through */
 
 	case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_DNORM):
 		DPDNORMY;
diff --git a/arch/mips/math-emu/dp_fmin.c b/arch/mips/math-emu/dp_fmin.c
index a287b23818d8..f98b96135c8d 100644
--- a/arch/mips/math-emu/dp_fmin.c
+++ b/arch/mips/math-emu/dp_fmin.c
@@ -96,6 +96,7 @@ union ieee754dp ieee754dp_fmin(union ieee754dp x, union ieee754dp y)
 
 	case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_DNORM):
 		DPDNORMX;
+		/* fall through */
 
 	case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_DNORM):
 		DPDNORMY;
@@ -224,6 +225,7 @@ union ieee754dp ieee754dp_fmina(union ieee754dp x, union ieee754dp y)
 
 	case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_DNORM):
 		DPDNORMX;
+		/* fall through */
 
 	case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_DNORM):
 		DPDNORMY;
diff --git a/arch/mips/math-emu/dp_maddf.c b/arch/mips/math-emu/dp_maddf.c
index 119eda9fa1ea..7ea2f8222026 100644
--- a/arch/mips/math-emu/dp_maddf.c
+++ b/arch/mips/math-emu/dp_maddf.c
@@ -14,83 +14,91 @@
 
 #include "ieee754dp.h"
 
-union ieee754dp ieee754dp_maddf(union ieee754dp z, union ieee754dp x,
-				union ieee754dp y)
+
+/* 128 bits shift right logical with rounding. */
+static void srl128(u64 *hptr, u64 *lptr, int count)
+{
+	u64 low;
+
+	if (count >= 128) {
+		*lptr = *hptr != 0 || *lptr != 0;
+		*hptr = 0;
+	} else if (count >= 64) {
+		if (count == 64) {
+			*lptr = *hptr | (*lptr != 0);
+		} else {
+			low = *lptr;
+			*lptr = *hptr >> (count - 64);
+			*lptr |= (*hptr << (128 - count)) != 0 || low != 0;
+		}
+		*hptr = 0;
+	} else {
+		low = *lptr;
+		*lptr = low >> count | *hptr << (64 - count);
+		*lptr |= (low << (64 - count)) != 0;
+		*hptr = *hptr >> count;
+	}
+}
+
+static union ieee754dp _dp_maddf(union ieee754dp z, union ieee754dp x,
+				 union ieee754dp y, enum maddf_flags flags)
 {
 	int re;
 	int rs;
-	u64 rm;
-	unsigned lxm;
-	unsigned hxm;
-	unsigned lym;
-	unsigned hym;
+	unsigned int lxm;
+	unsigned int hxm;
+	unsigned int lym;
+	unsigned int hym;
 	u64 lrm;
 	u64 hrm;
+	u64 lzm;
+	u64 hzm;
 	u64 t;
 	u64 at;
 	int s;
 
 	COMPXDP;
 	COMPYDP;
-
-	u64 zm; int ze; int zs __maybe_unused; int zc;
+	COMPZDP;
 
 	EXPLODEXDP;
 	EXPLODEYDP;
-	EXPLODEDP(z, zc, zs, ze, zm)
+	EXPLODEZDP;
 
 	FLUSHXDP;
 	FLUSHYDP;
-	FLUSHDP(z, zc, zs, ze, zm);
+	FLUSHZDP;
 
 	ieee754_clearcx();
 
-	switch (zc) {
-	case IEEE754_CLASS_SNAN:
-		ieee754_setcx(IEEE754_INVALID_OPERATION);
+	/*
+	 * Handle the cases when at least one of x, y or z is a NaN.
+	 * Order of precedence is sNaN, qNaN and z, x, y.
+	 */
+	if (zc == IEEE754_CLASS_SNAN)
 		return ieee754dp_nanxcpt(z);
-	case IEEE754_CLASS_DNORM:
-		DPDNORMx(zm, ze);
-	/* QNAN is handled separately below */
-	}
-
-	switch (CLPAIR(xc, yc)) {
-	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_SNAN):
-	case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_SNAN):
-	case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_SNAN):
-	case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_SNAN):
-	case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_SNAN):
-		return ieee754dp_nanxcpt(y);
-
-	case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_SNAN):
-	case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_QNAN):
-	case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_ZERO):
-	case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_NORM):
-	case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_DNORM):
-	case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_INF):
+	if (xc == IEEE754_CLASS_SNAN)
 		return ieee754dp_nanxcpt(x);
-
-	case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_QNAN):
-	case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_QNAN):
-	case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_QNAN):
-	case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_QNAN):
+	if (yc == IEEE754_CLASS_SNAN)
+		return ieee754dp_nanxcpt(y);
+	if (zc == IEEE754_CLASS_QNAN)
+		return z;
+	if (xc == IEEE754_CLASS_QNAN)
+		return x;
+	if (yc == IEEE754_CLASS_QNAN)
 		return y;
 
-	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_QNAN):
-	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_ZERO):
-	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_NORM):
-	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_DNORM):
-	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_INF):
-		return x;
+	if (zc == IEEE754_CLASS_DNORM)
+		DPDNORMZ;
+	/* ZERO z cases are handled separately below */
 
+	switch (CLPAIR(xc, yc)) {
 
 	/*
 	 * Infinity handling
 	 */
 	case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_ZERO):
 	case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_INF):
-		if (zc == IEEE754_CLASS_QNAN)
-			return z;
 		ieee754_setcx(IEEE754_INVALID_OPERATION);
 		return ieee754dp_indef();
 
@@ -99,9 +107,27 @@ union ieee754dp ieee754dp_maddf(union ieee754dp z, union ieee754dp x,
 	case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_NORM):
 	case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_DNORM):
 	case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_INF):
-		if (zc == IEEE754_CLASS_QNAN)
-			return z;
-		return ieee754dp_inf(xs ^ ys);
+		if ((zc == IEEE754_CLASS_INF) &&
+		    ((!(flags & MADDF_NEGATE_PRODUCT) && (zs != (xs ^ ys))) ||
+		     ((flags & MADDF_NEGATE_PRODUCT) && (zs == (xs ^ ys))))) {
+			/*
+			 * Cases of addition of infinities with opposite signs
+			 * or subtraction of infinities with same signs.
+			 */
+			ieee754_setcx(IEEE754_INVALID_OPERATION);
+			return ieee754dp_indef();
+		}
+		/*
+		 * z is here either not an infinity, or an infinity having the
+		 * same sign as product (x*y) (in case of MADDF.D instruction)
+		 * or product -(x*y) (in MSUBF.D case). The result must be an
+		 * infinity, and its sign is determined only by the value of
+		 * (flags & MADDF_NEGATE_PRODUCT) and the signs of x and y.
+		 */
+		if (flags & MADDF_NEGATE_PRODUCT)
+			return ieee754dp_inf(1 ^ (xs ^ ys));
+		else
+			return ieee754dp_inf(xs ^ ys);
 
 	case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_ZERO):
 	case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_NORM):
@@ -110,34 +136,45 @@ union ieee754dp ieee754dp_maddf(union ieee754dp z, union ieee754dp x,
 	case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_ZERO):
 		if (zc == IEEE754_CLASS_INF)
 			return ieee754dp_inf(zs);
-		/* Multiplication is 0 so just return z */
+		if (zc == IEEE754_CLASS_ZERO) {
+			/* Handle cases +0 + (-0) and similar ones. */
+			if ((!(flags & MADDF_NEGATE_PRODUCT)
+					&& (zs == (xs ^ ys))) ||
+			    ((flags & MADDF_NEGATE_PRODUCT)
+					&& (zs != (xs ^ ys))))
+				/*
+				 * Cases of addition of zeros of equal signs
+				 * or subtraction of zeroes of opposite signs.
+				 * The sign of the resulting zero is in any
+				 * such case determined only by the sign of z.
+				 */
+				return z;
+
+			return ieee754dp_zero(ieee754_csr.rm == FPU_CSR_RD);
+		}
+		/* x*y is here 0, and z is not 0, so just return z */
 		return z;
 
 	case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_DNORM):
 		DPDNORMX;
+		/* fall through */
 
 	case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_DNORM):
-		if (zc == IEEE754_CLASS_QNAN)
-			return z;
-		else if (zc == IEEE754_CLASS_INF)
+		if (zc == IEEE754_CLASS_INF)
 			return ieee754dp_inf(zs);
 		DPDNORMY;
 		break;
 
 	case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_NORM):
-		if (zc == IEEE754_CLASS_QNAN)
-			return z;
-		else if (zc == IEEE754_CLASS_INF)
+		if (zc == IEEE754_CLASS_INF)
 			return ieee754dp_inf(zs);
 		DPDNORMX;
 		break;
 
 	case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_NORM):
-		if (zc == IEEE754_CLASS_QNAN)
-			return z;
-		else if (zc == IEEE754_CLASS_INF)
+		if (zc == IEEE754_CLASS_INF)
 			return ieee754dp_inf(zs);
-		/* fall through to real computations */
+		/* continue to real computations */
 	}
 
 	/* Finally get to do some computation */
@@ -154,18 +191,17 @@ union ieee754dp ieee754dp_maddf(union ieee754dp z, union ieee754dp x,
 
 	re = xe + ye;
 	rs = xs ^ ys;
+	if (flags & MADDF_NEGATE_PRODUCT)
+		rs ^= 1;
 
 	/* shunt to top of word */
 	xm <<= 64 - (DP_FBITS + 1);
 	ym <<= 64 - (DP_FBITS + 1);
 
 	/*
-	 * Multiply 32 bits xm, ym to give high 32 bits rm with stickness.
+	 * Multiply 64 bits xm and ym to give 128 bits result in hrm:lrm.
 	 */
 
-	/* 32 * 32 => 64 */
-#define DPXMULT(x, y)	((u64)(x) * (u64)y)
-
 	lxm = xm;
 	hxm = xm >> 32;
 	lym = ym;
@@ -190,76 +226,120 @@ union ieee754dp ieee754dp_maddf(union ieee754dp z, union ieee754dp x,
 
 	hrm = hrm + (t >> 32);
 
-	rm = hrm | (lrm != 0);
-
-	/*
-	 * Sticky shift down to normal rounding precision.
-	 */
-	if ((s64) rm < 0) {
-		rm = (rm >> (64 - (DP_FBITS + 1 + 3))) |
-		     ((rm << (DP_FBITS + 1 + 3)) != 0);
-			re++;
-	} else {
-		rm = (rm >> (64 - (DP_FBITS + 1 + 3 + 1))) |
-		     ((rm << (DP_FBITS + 1 + 3 + 1)) != 0);
+	/* Put explicit bit at bit 126 if necessary */
+	if ((int64_t)hrm < 0) {
+		lrm = (hrm << 63) | (lrm >> 1);
+		hrm = hrm >> 1;
+		re++;
 	}
-	assert(rm & (DP_HIDDEN_BIT << 3));
 
-	/* And now the addition */
-	assert(zm & DP_HIDDEN_BIT);
+	assert(hrm & (1 << 62));
 
-	/*
-	 * Provide guard,round and stick bit space.
-	 */
-	zm <<= 3;
+	if (zc == IEEE754_CLASS_ZERO) {
+		/*
+		 * Move explicit bit from bit 126 to bit 55 since the
+		 * ieee754dp_format code expects the mantissa to be
+		 * 56 bits wide (53 + 3 rounding bits).
+		 */
+		srl128(&hrm, &lrm, (126 - 55));
+		return ieee754dp_format(rs, re, lrm);
+	}
+
+	/* Move explicit bit from bit 52 to bit 126 */
+	lzm = 0;
+	hzm = zm << 10;
+	assert(hzm & (1 << 62));
 
+	/* Make the exponents the same */
 	if (ze > re) {
 		/*
 		 * Have to shift y fraction right to align.
 		 */
 		s = ze - re;
-		rm = XDPSRS(rm, s);
+		srl128(&hrm, &lrm, s);
 		re += s;
 	} else if (re > ze) {
 		/*
 		 * Have to shift x fraction right to align.
 		 */
 		s = re - ze;
-		zm = XDPSRS(zm, s);
+		srl128(&hzm, &lzm, s);
 		ze += s;
 	}
 	assert(ze == re);
 	assert(ze <= DP_EMAX);
 
+	/* Do the addition */
 	if (zs == rs) {
 		/*
-		 * Generate 28 bit result of adding two 27 bit numbers
-		 * leaving result in xm, xs and xe.
+		 * Generate 128 bit result by adding two 127 bit numbers
+		 * leaving result in hzm:lzm, zs and ze.
 		 */
-		zm = zm + rm;
-
-		if (zm >> (DP_FBITS + 1 + 3)) { /* carry out */
-			zm = XDPSRS1(zm);
+		hzm = hzm + hrm + (lzm > (lzm + lrm));
+		lzm = lzm + lrm;
+		if ((int64_t)hzm < 0) {        /* carry out */
+			srl128(&hzm, &lzm, 1);
 			ze++;
 		}
 	} else {
-		if (zm >= rm) {
-			zm = zm - rm;
+		if (hzm > hrm || (hzm == hrm && lzm >= lrm)) {
+			hzm = hzm - hrm - (lzm < lrm);
+			lzm = lzm - lrm;
 		} else {
-			zm = rm - zm;
+			hzm = hrm - hzm - (lrm < lzm);
+			lzm = lrm - lzm;
 			zs = rs;
 		}
-		if (zm == 0)
+		if (lzm == 0 && hzm == 0)
 			return ieee754dp_zero(ieee754_csr.rm == FPU_CSR_RD);
 
 		/*
-		 * Normalize to rounding precision.
+		 * Put explicit bit at bit 126 if necessary.
 		 */
-		while ((zm >> (DP_FBITS + 3)) == 0) {
-			zm <<= 1;
-			ze--;
+		if (hzm == 0) {
+			/* left shift by 63 or 64 bits */
+			if ((int64_t)lzm < 0) {
+				/* MSB of lzm is the explicit bit */
+				hzm = lzm >> 1;
+				lzm = lzm << 63;
+				ze -= 63;
+			} else {
+				hzm = lzm;
+				lzm = 0;
+				ze -= 64;
+			}
+		}
+
+		t = 0;
+		while ((hzm >> (62 - t)) == 0)
+			t++;
+
+		assert(t <= 62);
+		if (t) {
+			hzm = hzm << t | lzm >> (64 - t);
+			lzm = lzm << t;
+			ze -= t;
 		}
 	}
 
-	return ieee754dp_format(zs, ze, zm);
+	/*
+	 * Move explicit bit from bit 126 to bit 55 since the
+	 * ieee754dp_format code expects the mantissa to be
+	 * 56 bits wide (53 + 3 rounding bits).
+	 */
+	srl128(&hzm, &lzm, (126 - 55));
+
+	return ieee754dp_format(zs, ze, lzm);
+}
+
+union ieee754dp ieee754dp_maddf(union ieee754dp z, union ieee754dp x,
+				union ieee754dp y)
+{
+	return _dp_maddf(z, x, y, 0);
+}
+
+union ieee754dp ieee754dp_msubf(union ieee754dp z, union ieee754dp x,
+				union ieee754dp y)
+{
+	return _dp_maddf(z, x, y, MADDF_NEGATE_PRODUCT);
 }
diff --git a/arch/mips/math-emu/dp_msubf.c b/arch/mips/math-emu/dp_msubf.c
deleted file mode 100644
index 12241262f856..000000000000
--- a/arch/mips/math-emu/dp_msubf.c
+++ /dev/null
@@ -1,269 +0,0 @@
-/*
- * IEEE754 floating point arithmetic
- * double precision: MSUB.f (Fused Multiply Subtract)
- * MSUBF.fmt: FPR[fd] = FPR[fd] - (FPR[fs] x FPR[ft])
- *
- * MIPS floating point support
- * Copyright (C) 2015 Imagination Technologies, Ltd.
- * Author: Markos Chandras <markos.chandras@imgtec.com>
- *
- *  This program is free software; you can distribute it and/or modify it
- *  under the terms of the GNU General Public License as published by the
- *  Free Software Foundation; version 2 of the License.
- */
-
-#include "ieee754dp.h"
-
-union ieee754dp ieee754dp_msubf(union ieee754dp z, union ieee754dp x,
-				union ieee754dp y)
-{
-	int re;
-	int rs;
-	u64 rm;
-	unsigned lxm;
-	unsigned hxm;
-	unsigned lym;
-	unsigned hym;
-	u64 lrm;
-	u64 hrm;
-	u64 t;
-	u64 at;
-	int s;
-
-	COMPXDP;
-	COMPYDP;
-
-	u64 zm; int ze; int zs __maybe_unused; int zc;
-
-	EXPLODEXDP;
-	EXPLODEYDP;
-	EXPLODEDP(z, zc, zs, ze, zm)
-
-	FLUSHXDP;
-	FLUSHYDP;
-	FLUSHDP(z, zc, zs, ze, zm);
-
-	ieee754_clearcx();
-
-	switch (zc) {
-	case IEEE754_CLASS_SNAN:
-		ieee754_setcx(IEEE754_INVALID_OPERATION);
-		return ieee754dp_nanxcpt(z);
-	case IEEE754_CLASS_DNORM:
-		DPDNORMx(zm, ze);
-	/* QNAN is handled separately below */
-	}
-
-	switch (CLPAIR(xc, yc)) {
-	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_SNAN):
-	case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_SNAN):
-	case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_SNAN):
-	case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_SNAN):
-	case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_SNAN):
-		return ieee754dp_nanxcpt(y);
-
-	case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_SNAN):
-	case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_QNAN):
-	case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_ZERO):
-	case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_NORM):
-	case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_DNORM):
-	case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_INF):
-		return ieee754dp_nanxcpt(x);
-
-	case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_QNAN):
-	case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_QNAN):
-	case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_QNAN):
-	case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_QNAN):
-		return y;
-
-	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_QNAN):
-	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_ZERO):
-	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_NORM):
-	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_DNORM):
-	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_INF):
-		return x;
-
-
-	/*
-	 * Infinity handling
-	 */
-	case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_ZERO):
-	case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_INF):
-		if (zc == IEEE754_CLASS_QNAN)
-			return z;
-		ieee754_setcx(IEEE754_INVALID_OPERATION);
-		return ieee754dp_indef();
-
-	case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_INF):
-	case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_INF):
-	case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_NORM):
-	case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_DNORM):
-	case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_INF):
-		if (zc == IEEE754_CLASS_QNAN)
-			return z;
-		return ieee754dp_inf(xs ^ ys);
-
-	case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_ZERO):
-	case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_NORM):
-	case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_DNORM):
-	case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_ZERO):
-	case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_ZERO):
-		if (zc == IEEE754_CLASS_INF)
-			return ieee754dp_inf(zs);
-		/* Multiplication is 0 so just return z */
-		return z;
-
-	case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_DNORM):
-		DPDNORMX;
-
-	case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_DNORM):
-		if (zc == IEEE754_CLASS_QNAN)
-			return z;
-		else if (zc == IEEE754_CLASS_INF)
-			return ieee754dp_inf(zs);
-		DPDNORMY;
-		break;
-
-	case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_NORM):
-		if (zc == IEEE754_CLASS_QNAN)
-			return z;
-		else if (zc == IEEE754_CLASS_INF)
-			return ieee754dp_inf(zs);
-		DPDNORMX;
-		break;
-
-	case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_NORM):
-		if (zc == IEEE754_CLASS_QNAN)
-			return z;
-		else if (zc == IEEE754_CLASS_INF)
-			return ieee754dp_inf(zs);
-		/* fall through to real computations */
-	}
-
-	/* Finally get to do some computation */
-
-	/*
-	 * Do the multiplication bit first
-	 *
-	 * rm = xm * ym, re = xe + ye basically
-	 *
-	 * At this point xm and ym should have been normalized.
-	 */
-	assert(xm & DP_HIDDEN_BIT);
-	assert(ym & DP_HIDDEN_BIT);
-
-	re = xe + ye;
-	rs = xs ^ ys;
-
-	/* shunt to top of word */
-	xm <<= 64 - (DP_FBITS + 1);
-	ym <<= 64 - (DP_FBITS + 1);
-
-	/*
-	 * Multiply 32 bits xm, ym to give high 32 bits rm with stickness.
-	 */
-
-	/* 32 * 32 => 64 */
-#define DPXMULT(x, y)	((u64)(x) * (u64)y)
-
-	lxm = xm;
-	hxm = xm >> 32;
-	lym = ym;
-	hym = ym >> 32;
-
-	lrm = DPXMULT(lxm, lym);
-	hrm = DPXMULT(hxm, hym);
-
-	t = DPXMULT(lxm, hym);
-
-	at = lrm + (t << 32);
-	hrm += at < lrm;
-	lrm = at;
-
-	hrm = hrm + (t >> 32);
-
-	t = DPXMULT(hxm, lym);
-
-	at = lrm + (t << 32);
-	hrm += at < lrm;
-	lrm = at;
-
-	hrm = hrm + (t >> 32);
-
-	rm = hrm | (lrm != 0);
-
-	/*
-	 * Sticky shift down to normal rounding precision.
-	 */
-	if ((s64) rm < 0) {
-		rm = (rm >> (64 - (DP_FBITS + 1 + 3))) |
-		     ((rm << (DP_FBITS + 1 + 3)) != 0);
-			re++;
-	} else {
-		rm = (rm >> (64 - (DP_FBITS + 1 + 3 + 1))) |
-		     ((rm << (DP_FBITS + 1 + 3 + 1)) != 0);
-	}
-	assert(rm & (DP_HIDDEN_BIT << 3));
-
-	/* And now the subtraction */
-
-	/* flip sign of r and handle as add */
-	rs ^= 1;
-
-	assert(zm & DP_HIDDEN_BIT);
-
-	/*
-	 * Provide guard,round and stick bit space.
-	 */
-	zm <<= 3;
-
-	if (ze > re) {
-		/*
-		 * Have to shift y fraction right to align.
-		 */
-		s = ze - re;
-		rm = XDPSRS(rm, s);
-		re += s;
-	} else if (re > ze) {
-		/*
-		 * Have to shift x fraction right to align.
-		 */
-		s = re - ze;
-		zm = XDPSRS(zm, s);
-		ze += s;
-	}
-	assert(ze == re);
-	assert(ze <= DP_EMAX);
-
-	if (zs == rs) {
-		/*
-		 * Generate 28 bit result of adding two 27 bit numbers
-		 * leaving result in xm, xs and xe.
-		 */
-		zm = zm + rm;
-
-		if (zm >> (DP_FBITS + 1 + 3)) { /* carry out */
-			zm = XDPSRS1(zm);
-			ze++;
-		}
-	} else {
-		if (zm >= rm) {
-			zm = zm - rm;
-		} else {
-			zm = rm - zm;
-			zs = rs;
-		}
-		if (zm == 0)
-			return ieee754dp_zero(ieee754_csr.rm == FPU_CSR_RD);
-
-		/*
-		 * Normalize to rounding precision.
-		 */
-		while ((zm >> (DP_FBITS + 3)) == 0) {
-			zm <<= 1;
-			ze--;
-		}
-	}
-
-	return ieee754dp_format(zs, ze, zm);
-}
diff --git a/arch/mips/math-emu/dp_mul.c b/arch/mips/math-emu/dp_mul.c
index d0901f03fa19..c34a6cdf1b25 100644
--- a/arch/mips/math-emu/dp_mul.c
+++ b/arch/mips/math-emu/dp_mul.c
@@ -26,10 +26,10 @@ union ieee754dp ieee754dp_mul(union ieee754dp x, union ieee754dp y)
 	int re;
 	int rs;
 	u64 rm;
-	unsigned lxm;
-	unsigned hxm;
-	unsigned lym;
-	unsigned hym;
+	unsigned int lxm;
+	unsigned int hxm;
+	unsigned int lym;
+	unsigned int hym;
 	u64 lrm;
 	u64 hrm;
 	u64 t;
@@ -101,6 +101,7 @@ union ieee754dp ieee754dp_mul(union ieee754dp x, union ieee754dp y)
 
 	case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_DNORM):
 		DPDNORMX;
+		/* fall through */
 
 	case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_DNORM):
 		DPDNORMY;
@@ -125,12 +126,9 @@ union ieee754dp ieee754dp_mul(union ieee754dp x, union ieee754dp y)
 	ym <<= 64 - (DP_FBITS + 1);
 
 	/*
-	 * Multiply 32 bits xm, ym to give high 32 bits rm with stickness.
+	 * Multiply 64 bits xm, ym to give high 64 bits rm with stickness.
 	 */
 
-	/* 32 * 32 => 64 */
-#define DPXMULT(x, y)	((u64)(x) * (u64)y)
-
 	lxm = xm;
 	hxm = xm >> 32;
 	lym = ym;
@@ -163,7 +161,7 @@ union ieee754dp ieee754dp_mul(union ieee754dp x, union ieee754dp y)
 	if ((s64) rm < 0) {
 		rm = (rm >> (64 - (DP_FBITS + 1 + 3))) |
 		     ((rm << (DP_FBITS + 1 + 3)) != 0);
-			re++;
+		re++;
 	} else {
 		rm = (rm >> (64 - (DP_FBITS + 1 + 3 + 1))) |
 		     ((rm << (DP_FBITS + 1 + 3 + 1)) != 0);
diff --git a/arch/mips/math-emu/dp_rint.c b/arch/mips/math-emu/dp_rint.c
new file mode 100644
index 000000000000..c3b9077ff357
--- /dev/null
+++ b/arch/mips/math-emu/dp_rint.c
@@ -0,0 +1,89 @@
+/* IEEE754 floating point arithmetic
+ * double precision: common utilities
+ */
+/*
+ * MIPS floating point support
+ * Copyright (C) 1994-2000 Algorithmics Ltd.
+ * Copyright (C) 2017 Imagination Technologies, Ltd.
+ * Author: Aleksandar Markovic <aleksandar.markovic@imgtec.com>
+ *
+ *  This program is free software; you can distribute it and/or modify it
+ *  under the terms of the GNU General Public License (Version 2) as
+ *  published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope it will be useful, but WITHOUT
+ *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ *  for more details.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with this program.
+ */
+
+#include "ieee754dp.h"
+
+union ieee754dp ieee754dp_rint(union ieee754dp x)
+{
+	union ieee754dp ret;
+	u64 residue;
+	int sticky;
+	int round;
+	int odd;
+
+	COMPXDP;
+
+	ieee754_clearcx();
+
+	EXPLODEXDP;
+	FLUSHXDP;
+
+	if (xc == IEEE754_CLASS_SNAN)
+		return ieee754dp_nanxcpt(x);
+
+	if ((xc == IEEE754_CLASS_QNAN) ||
+	    (xc == IEEE754_CLASS_INF) ||
+	    (xc == IEEE754_CLASS_ZERO))
+		return x;
+
+	if (xe >= DP_FBITS)
+		return x;
+
+	if (xe < -1) {
+		residue = xm;
+		round = 0;
+		sticky = residue != 0;
+		xm = 0;
+	} else {
+		residue = xm << (64 - DP_FBITS + xe);
+		round = (residue >> 63) != 0;
+		sticky = (residue << 1) != 0;
+		xm >>= DP_FBITS - xe;
+	}
+
+	odd = (xm & 0x1) != 0x0;
+
+	switch (ieee754_csr.rm) {
+	case FPU_CSR_RN:	/* toward nearest */
+		if (round && (sticky || odd))
+			xm++;
+		break;
+	case FPU_CSR_RZ:	/* toward zero */
+		break;
+	case FPU_CSR_RU:	/* toward +infinity */
+		if ((round || sticky) && !xs)
+			xm++;
+		break;
+	case FPU_CSR_RD:	/* toward -infinity */
+		if ((round || sticky) && xs)
+			xm++;
+		break;
+	}
+
+	if (round || sticky)
+		ieee754_setcx(IEEE754_INEXACT);
+
+	ret = ieee754dp_flong(xm);
+	DPSIGN(ret) = xs;
+
+	return ret;
+}
diff --git a/arch/mips/math-emu/dp_simple.c b/arch/mips/math-emu/dp_simple.c
index 926d56bf37f2..eb96485ed939 100644
--- a/arch/mips/math-emu/dp_simple.c
+++ b/arch/mips/math-emu/dp_simple.c
@@ -23,27 +23,39 @@
 
 union ieee754dp ieee754dp_neg(union ieee754dp x)
 {
-	unsigned int oldrm;
 	union ieee754dp y;
 
-	oldrm = ieee754_csr.rm;
-	ieee754_csr.rm = FPU_CSR_RD;
-	y = ieee754dp_sub(ieee754dp_zero(0), x);
-	ieee754_csr.rm = oldrm;
+	if (ieee754_csr.abs2008) {
+		y = x;
+		DPSIGN(y) = !DPSIGN(x);
+	} else {
+		unsigned int oldrm;
+
+		oldrm = ieee754_csr.rm;
+		ieee754_csr.rm = FPU_CSR_RD;
+		y = ieee754dp_sub(ieee754dp_zero(0), x);
+		ieee754_csr.rm = oldrm;
+	}
 	return y;
 }
 
 union ieee754dp ieee754dp_abs(union ieee754dp x)
 {
-	unsigned int oldrm;
 	union ieee754dp y;
 
-	oldrm = ieee754_csr.rm;
-	ieee754_csr.rm = FPU_CSR_RD;
-	if (DPSIGN(x))
-		y = ieee754dp_sub(ieee754dp_zero(0), x);
-	else
-		y = ieee754dp_add(ieee754dp_zero(0), x);
-	ieee754_csr.rm = oldrm;
+	if (ieee754_csr.abs2008) {
+		y = x;
+		DPSIGN(y) = 0;
+	} else {
+		unsigned int oldrm;
+
+		oldrm = ieee754_csr.rm;
+		ieee754_csr.rm = FPU_CSR_RD;
+		if (DPSIGN(x))
+			y = ieee754dp_sub(ieee754dp_zero(0), x);
+		else
+			y = ieee754dp_add(ieee754dp_zero(0), x);
+		ieee754_csr.rm = oldrm;
+	}
 	return y;
 }
diff --git a/arch/mips/math-emu/dp_sqrt.c b/arch/mips/math-emu/dp_sqrt.c
index cd5bc083001e..1d26c92e5295 100644
--- a/arch/mips/math-emu/dp_sqrt.c
+++ b/arch/mips/math-emu/dp_sqrt.c
@@ -21,7 +21,7 @@
 
 #include "ieee754dp.h"
 
-static const unsigned table[] = {
+static const unsigned int table[] = {
 	0, 1204, 3062, 5746, 9193, 13348, 18162, 23592,
 	29598, 36145, 43202, 50740, 58733, 67158, 75992,
 	85215, 83599, 71378, 60428, 50647, 41945, 34246,
@@ -33,7 +33,7 @@ union ieee754dp ieee754dp_sqrt(union ieee754dp x)
 {
 	struct _ieee754_csr oldcsr;
 	union ieee754dp y, z, t;
-	unsigned scalx, yh;
+	unsigned int scalx, yh;
 	COMPXDP;
 
 	EXPLODEXDP;
@@ -91,7 +91,8 @@ union ieee754dp ieee754dp_sqrt(union ieee754dp x)
 		scalx -= 256;
 	}
 
-	y = x = builddp(0, xe + DP_EBIAS, xm & ~DP_HIDDEN_BIT);
+	x = builddp(0, xe + DP_EBIAS, xm & ~DP_HIDDEN_BIT);
+	y = x;
 
 	/* magic initial approximation to almost 8 sig. bits */
 	yh = y.bits >> 32;
@@ -108,7 +109,8 @@ union ieee754dp ieee754dp_sqrt(union ieee754dp x)
 
 	/* triple to almost 56 sig. bits: y ~= sqrt(x) to within 1 ulp */
 	/* t=y*y; z=t;	pt[n0]+=0x00100000; t+=z; z=(x-z)*y; */
-	z = t = ieee754dp_mul(y, y);
+	t = ieee754dp_mul(y, y);
+	z = t;
 	t.bexp += 0x001;
 	t = ieee754dp_add(t, z);
 	z = ieee754dp_mul(ieee754dp_sub(x, z), y);
@@ -140,7 +142,7 @@ union ieee754dp ieee754dp_sqrt(union ieee754dp x)
 		switch (oldcsr.rm) {
 		case FPU_CSR_RU:
 			y.bits += 1;
-			/* drop through */
+			/* fall through */
 		case FPU_CSR_RN:
 			t.bits += 1;
 			break;
diff --git a/arch/mips/math-emu/dp_sub.c b/arch/mips/math-emu/dp_sub.c
index fc17a781b9ae..3cc48b86519b 100644
--- a/arch/mips/math-emu/dp_sub.c
+++ b/arch/mips/math-emu/dp_sub.c
@@ -106,7 +106,7 @@ union ieee754dp ieee754dp_sub(union ieee754dp x, union ieee754dp y)
 
 	case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_DNORM):
 		DPDNORMX;
-		/* FALL THROUGH */
+		/* fall through */
 
 	case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_DNORM):
 		/* normalize ym,ye */
diff --git a/arch/mips/math-emu/dp_tint.c b/arch/mips/math-emu/dp_tint.c
index 6ffc336c530e..f3985617ce31 100644
--- a/arch/mips/math-emu/dp_tint.c
+++ b/arch/mips/math-emu/dp_tint.c
@@ -38,10 +38,13 @@ int ieee754dp_tint(union ieee754dp x)
 	switch (xc) {
 	case IEEE754_CLASS_SNAN:
 	case IEEE754_CLASS_QNAN:
-	case IEEE754_CLASS_INF:
 		ieee754_setcx(IEEE754_INVALID_OPERATION);
 		return ieee754si_indef();
 
+	case IEEE754_CLASS_INF:
+		ieee754_setcx(IEEE754_INVALID_OPERATION);
+		return ieee754si_overflow(xs);
+
 	case IEEE754_CLASS_ZERO:
 		return 0;
 
@@ -53,7 +56,7 @@ int ieee754dp_tint(union ieee754dp x)
 		/* Set invalid. We will only use overflow for floating
 		   point overflow */
 		ieee754_setcx(IEEE754_INVALID_OPERATION);
-		return ieee754si_indef();
+		return ieee754si_overflow(xs);
 	}
 	/* oh gawd */
 	if (xe > DP_FBITS) {
@@ -93,7 +96,7 @@ int ieee754dp_tint(union ieee754dp x)
 		if ((xm >> 31) != 0 && (xs == 0 || xm != 0x80000000)) {
 			/* This can happen after rounding */
 			ieee754_setcx(IEEE754_INVALID_OPERATION);
-			return ieee754si_indef();
+			return ieee754si_overflow(xs);
 		}
 		if (round || sticky)
 			ieee754_setcx(IEEE754_INEXACT);
diff --git a/arch/mips/math-emu/dp_tlong.c b/arch/mips/math-emu/dp_tlong.c
index 9cdc145b75e0..748fa10ed4cf 100644
--- a/arch/mips/math-emu/dp_tlong.c
+++ b/arch/mips/math-emu/dp_tlong.c
@@ -38,10 +38,13 @@ s64 ieee754dp_tlong(union ieee754dp x)
 	switch (xc) {
 	case IEEE754_CLASS_SNAN:
 	case IEEE754_CLASS_QNAN:
-	case IEEE754_CLASS_INF:
 		ieee754_setcx(IEEE754_INVALID_OPERATION);
 		return ieee754di_indef();
 
+	case IEEE754_CLASS_INF:
+		ieee754_setcx(IEEE754_INVALID_OPERATION);
+		return ieee754di_overflow(xs);
+
 	case IEEE754_CLASS_ZERO:
 		return 0;
 
@@ -56,7 +59,7 @@ s64 ieee754dp_tlong(union ieee754dp x)
 		/* Set invalid. We will only use overflow for floating
 		   point overflow */
 		ieee754_setcx(IEEE754_INVALID_OPERATION);
-		return ieee754di_indef();
+		return ieee754di_overflow(xs);
 	}
 	/* oh gawd */
 	if (xe > DP_FBITS) {
@@ -97,7 +100,7 @@ s64 ieee754dp_tlong(union ieee754dp x)
 		if ((xm >> 63) != 0) {
 			/* This can happen after rounding */
 			ieee754_setcx(IEEE754_INVALID_OPERATION);
-			return ieee754di_indef();
+			return ieee754di_overflow(xs);
 		}
 		if (round || sticky)
 			ieee754_setcx(IEEE754_INEXACT);
diff --git a/arch/mips/math-emu/dsemul.c b/arch/mips/math-emu/dsemul.c
index cbb36c14b155..4a094f7acb3d 100644
--- a/arch/mips/math-emu/dsemul.c
+++ b/arch/mips/math-emu/dsemul.c
@@ -1,3 +1,6 @@
+#include <linux/err.h>
+#include <linux/slab.h>
+
 #include <asm/branch.h>
 #include <asm/cacheflush.h>
 #include <asm/fpu_emulator.h>
@@ -5,158 +8,296 @@
 #include <asm/mipsregs.h>
 #include <asm/uaccess.h>
 
-#include "ieee754.h"
-
-/*
- * Emulate the arbritrary instruction ir at xcp->cp0_epc.  Required when
- * we have to emulate the instruction in a COP1 branch delay slot.  Do
- * not change cp0_epc due to the instruction
+/**
+ * struct emuframe - The 'emulation' frame structure
+ * @emul:	The instruction to 'emulate'.
+ * @badinst:	A break instruction to cause a return to the kernel.
  *
- * According to the spec:
- * 1) it shouldn't be a branch :-)
- * 2) it can be a COP instruction :-(
- * 3) if we are tring to run a protected memory space we must take
- *    special care on memory access instructions :-(
- */
-
-/*
- * "Trampoline" return routine to catch exception following
- *  execution of delay-slot instruction execution.
+ * This structure defines the frames placed within the delay slot emulation
+ * page in response to a call to mips_dsemul(). Each thread may be allocated
+ * only one frame at any given time. The kernel stores within it the
+ * instruction to be 'emulated' followed by a break instruction, then
+ * executes the frame in user mode. The break causes a trap to the kernel
+ * which leads to do_dsemulret() being called unless the instruction in
+ * @emul causes a trap itself, is a branch, or a signal is delivered to
+ * the thread. In these cases the allocated frame will either be reused by
+ * a subsequent delay slot 'emulation', or be freed during signal delivery or
+ * upon thread exit.
+ *
+ * This approach is used because:
+ *
+ * - Actually emulating all instructions isn't feasible. We would need to
+ *   be able to handle instructions from all revisions of the MIPS ISA,
+ *   all ASEs & all vendor instruction set extensions. This would be a
+ *   whole lot of work & continual maintenance burden as new instructions
+ *   are introduced, and in the case of some vendor extensions may not
+ *   even be possible. Thus we need to take the approach of actually
+ *   executing the instruction.
+ *
+ * - We must execute the instruction within user context. If we were to
+ *   execute the instruction in kernel mode then it would have access to
+ *   kernel resources without very careful checks, leaving us with a
+ *   high potential for security or stability issues to arise.
+ *
+ * - We used to place the frame on the users stack, but this requires
+ *   that the stack be executable. This is bad for security so the
+ *   per-process page is now used instead.
+ *
+ * - The instruction in @emul may be something entirely invalid for a
+ *   delay slot. The user may (intentionally or otherwise) place a branch
+ *   in a delay slot, or a kernel mode instruction, or something else
+ *   which generates an exception. Thus we can't rely upon the break in
+ *   @badinst always being hit. For this reason we track the index of the
+ *   frame allocated to each thread, allowing us to clean it up at later
+ *   points such as signal delivery or thread exit.
+ *
+ * - The user may generate a fake struct emuframe if they wish, invoking
+ *   the BRK_MEMU break instruction themselves. We must therefore not
+ *   trust that BRK_MEMU means there's actually a valid frame allocated
+ *   to the thread, and must not allow the user to do anything they
+ *   couldn't already.
  */
-
 struct emuframe {
 	mips_instruction	emul;
 	mips_instruction	badinst;
-	mips_instruction	cookie;
-	unsigned long		epc;
 };
 
-int mips_dsemul(struct pt_regs *regs, mips_instruction ir, unsigned long cpc)
-{
-	struct emuframe __user *fr;
-	int err;
-
-	if ((get_isa16_mode(regs->cp0_epc) && ((ir >> 16) == MM_NOP16)) ||
-		(ir == 0)) {
-		/* NOP is easy */
-		regs->cp0_epc = cpc;
-		clear_delay_slot(regs);
-		return 0;
-	}
+static const int emupage_frame_count = PAGE_SIZE / sizeof(struct emuframe);
 
-	pr_debug("dsemul %lx %lx\n", regs->cp0_epc, cpc);
+static inline __user struct emuframe *dsemul_page(void)
+{
+	return (__user struct emuframe *)STACK_TOP;
+}
 
-	/*
-	 * The strategy is to push the instruction onto the user stack
-	 * and put a trap after it which we can catch and jump to
-	 * the required address any alternative apart from full
-	 * instruction emulation!!.
-	 *
-	 * Algorithmics used a system call instruction, and
-	 * borrowed that vector.  MIPS/Linux version is a bit
-	 * more heavyweight in the interests of portability and
-	 * multiprocessor support.  For Linux we generate a
-	 * an unaligned access and force an address error exception.
-	 *
-	 * For embedded systems (stand-alone) we prefer to use a
-	 * non-existing CP1 instruction. This prevents us from emulating
-	 * branches, but gives us a cleaner interface to the exception
-	 * handler (single entry point).
-	 */
+static int alloc_emuframe(void)
+{
+	mm_context_t *mm_ctx = &current->mm->context;
+	int idx;
 
-	/* Ensure that the two instructions are in the same cache line */
-	fr = (struct emuframe __user *)
-		((regs->regs[29] - sizeof(struct emuframe)) & ~0x7);
+retry:
+	spin_lock(&mm_ctx->bd_emupage_lock);
 
-	/* Verify that the stack pointer is not competely insane */
-	if (unlikely(!access_ok(VERIFY_WRITE, fr, sizeof(struct emuframe))))
-		return SIGBUS;
+	/* Ensure we have an allocation bitmap */
+	if (!mm_ctx->bd_emupage_allocmap) {
+		mm_ctx->bd_emupage_allocmap =
+			kcalloc(BITS_TO_LONGS(emupage_frame_count),
+					      sizeof(unsigned long),
+				GFP_ATOMIC);
 
-	if (get_isa16_mode(regs->cp0_epc)) {
-		err = __put_user(ir >> 16, (u16 __user *)(&fr->emul));
-		err |= __put_user(ir & 0xffff, (u16 __user *)((long)(&fr->emul) + 2));
-		err |= __put_user(BREAK_MATH >> 16, (u16 __user *)(&fr->badinst));
-		err |= __put_user(BREAK_MATH & 0xffff, (u16 __user *)((long)(&fr->badinst) + 2));
-	} else {
-		err = __put_user(ir, &fr->emul);
-		err |= __put_user((mips_instruction)BREAK_MATH, &fr->badinst);
+		if (!mm_ctx->bd_emupage_allocmap) {
+			idx = BD_EMUFRAME_NONE;
+			goto out_unlock;
+		}
 	}
 
-	err |= __put_user((mips_instruction)BD_COOKIE, &fr->cookie);
-	err |= __put_user(cpc, &fr->epc);
+	/* Attempt to allocate a single bit/frame */
+	idx = bitmap_find_free_region(mm_ctx->bd_emupage_allocmap,
+				      emupage_frame_count, 0);
+	if (idx < 0) {
+		/*
+		 * Failed to allocate a frame. We'll wait until one becomes
+		 * available. We unlock the page so that other threads actually
+		 * get the opportunity to free their frames, which means
+		 * technically the result of bitmap_full may be incorrect.
+		 * However the worst case is that we repeat all this and end up
+		 * back here again.
+		 */
+		spin_unlock(&mm_ctx->bd_emupage_lock);
+		if (!wait_event_killable(mm_ctx->bd_emupage_queue,
+			!bitmap_full(mm_ctx->bd_emupage_allocmap,
+				     emupage_frame_count)))
+			goto retry;
 
-	if (unlikely(err)) {
-		MIPS_FPU_EMU_INC_STATS(errors);
-		return SIGBUS;
+		/* Received a fatal signal - just give in */
+		return BD_EMUFRAME_NONE;
 	}
 
-	regs->cp0_epc = ((unsigned long) &fr->emul) |
-		get_isa16_mode(regs->cp0_epc);
+	/* Success! */
+	pr_debug("allocate emuframe %d to %d\n", idx, current->pid);
+out_unlock:
+	spin_unlock(&mm_ctx->bd_emupage_lock);
+	return idx;
+}
 
-	flush_cache_sigtramp((unsigned long)&fr->emul);
+static void free_emuframe(int idx, struct mm_struct *mm)
+{
+	mm_context_t *mm_ctx = &mm->context;
 
-	return 0;
+	spin_lock(&mm_ctx->bd_emupage_lock);
+
+	pr_debug("free emuframe %d from %d\n", idx, current->pid);
+	bitmap_clear(mm_ctx->bd_emupage_allocmap, idx, 1);
+
+	/* If some thread is waiting for a frame, now's its chance */
+	wake_up(&mm_ctx->bd_emupage_queue);
+
+	spin_unlock(&mm_ctx->bd_emupage_lock);
+}
+
+static bool within_emuframe(struct pt_regs *regs)
+{
+	unsigned long base = (unsigned long)dsemul_page();
+
+	if (regs->cp0_epc < base)
+		return false;
+	if (regs->cp0_epc >= (base + PAGE_SIZE))
+		return false;
+
+	return true;
+}
+
+bool dsemul_thread_cleanup(struct task_struct *tsk)
+{
+	int fr_idx;
+
+	/* Clear any allocated frame, retrieving its index */
+	fr_idx = atomic_xchg(&tsk->thread.bd_emu_frame, BD_EMUFRAME_NONE);
+
+	/* If no frame was allocated, we're done */
+	if (fr_idx == BD_EMUFRAME_NONE)
+		return false;
+
+	task_lock(tsk);
+
+	/* Free the frame that this thread had allocated */
+	if (tsk->mm)
+		free_emuframe(fr_idx, tsk->mm);
+
+	task_unlock(tsk);
+	return true;
 }
 
-int do_dsemulret(struct pt_regs *xcp)
+bool dsemul_thread_rollback(struct pt_regs *regs)
 {
 	struct emuframe __user *fr;
-	unsigned long epc;
-	u32 insn, cookie;
-	int err = 0;
-	u16 instr[2];
+	int fr_idx;
 
-	fr = (struct emuframe __user *)
-		(msk_isa16_mode(xcp->cp0_epc) - sizeof(mips_instruction));
+	/* Do nothing if we're not executing from a frame */
+	if (!within_emuframe(regs))
+		return false;
 
-	/*
-	 * If we can't even access the area, something is very wrong, but we'll
-	 * leave that to the default handling
-	 */
-	if (!access_ok(VERIFY_READ, fr, sizeof(struct emuframe)))
-		return 0;
+	/* Find the frame being executed */
+	fr_idx = atomic_read(&current->thread.bd_emu_frame);
+	if (fr_idx == BD_EMUFRAME_NONE)
+		return false;
+	fr = &dsemul_page()[fr_idx];
 
 	/*
-	 * Do some sanity checking on the stackframe:
-	 *
-	 *  - Is the instruction pointed to by the EPC an BREAK_MATH?
-	 *  - Is the following memory word the BD_COOKIE?
+	 * If the PC is at the emul instruction, roll back to the branch. If
+	 * PC is at the badinst (break) instruction, we've already emulated the
+	 * instruction so progress to the continue PC. If it's anything else
+	 * then something is amiss & the user has branched into some other area
+	 * of the emupage - we'll free the allocated frame anyway.
 	 */
-	if (get_isa16_mode(xcp->cp0_epc)) {
-		err = __get_user(instr[0], (u16 __user *)(&fr->badinst));
-		err |= __get_user(instr[1], (u16 __user *)((long)(&fr->badinst) + 2));
-		insn = (instr[0] << 16) | instr[1];
+	if (msk_isa16_mode(regs->cp0_epc) == (unsigned long)&fr->emul)
+		regs->cp0_epc = current->thread.bd_emu_branch_pc;
+	else if (msk_isa16_mode(regs->cp0_epc) == (unsigned long)&fr->badinst)
+		regs->cp0_epc = current->thread.bd_emu_cont_pc;
+
+	atomic_set(&current->thread.bd_emu_frame, BD_EMUFRAME_NONE);
+	free_emuframe(fr_idx, current->mm);
+	return true;
+}
+
+void dsemul_mm_cleanup(struct mm_struct *mm)
+{
+	mm_context_t *mm_ctx = &mm->context;
+
+	kfree(mm_ctx->bd_emupage_allocmap);
+}
+
+int mips_dsemul(struct pt_regs *regs, mips_instruction ir,
+		unsigned long branch_pc, unsigned long cont_pc)
+{
+	int isa16 = get_isa16_mode(regs->cp0_epc);
+	mips_instruction break_math;
+	struct emuframe __user *fr;
+	int err, fr_idx;
+
+	/* NOP is easy */
+	if (ir == 0)
+		return -1;
+
+	/* microMIPS instructions */
+	if (isa16) {
+		union mips_instruction insn = { .word = ir };
+
+		/* NOP16 aka MOVE16 $0, $0 */
+		if ((ir >> 16) == MM_NOP16)
+			return -1;
+
+		/* ADDIUPC */
+		if (insn.mm_a_format.opcode == mm_addiupc_op) {
+			unsigned int rs;
+			s32 v;
+
+			rs = (((insn.mm_a_format.rs + 0xe) & 0xf) + 2);
+			v = regs->cp0_epc & ~3;
+			v += insn.mm_a_format.simmediate << 2;
+			regs->regs[rs] = (long)v;
+			return -1;
+		}
+	}
+
+	pr_debug("dsemul 0x%08lx cont at 0x%08lx\n", regs->cp0_epc, cont_pc);
+
+	/* Allocate a frame if we don't already have one */
+	fr_idx = atomic_read(&current->thread.bd_emu_frame);
+	if (fr_idx == BD_EMUFRAME_NONE)
+		fr_idx = alloc_emuframe();
+	if (fr_idx == BD_EMUFRAME_NONE)
+		return SIGBUS;
+	fr = &dsemul_page()[fr_idx];
+
+	/* Retrieve the appropriately encoded break instruction */
+	break_math = BREAK_MATH(isa16);
+
+	/* Write the instructions to the frame */
+	if (isa16) {
+		err = __put_user(ir >> 16,
+				 (u16 __user *)(&fr->emul));
+		err |= __put_user(ir & 0xffff,
+				  (u16 __user *)((long)(&fr->emul) + 2));
+		err |= __put_user(break_math >> 16,
+				  (u16 __user *)(&fr->badinst));
+		err |= __put_user(break_math & 0xffff,
+				  (u16 __user *)((long)(&fr->badinst) + 2));
 	} else {
-		err = __get_user(insn, &fr->badinst);
+		err = __put_user(ir, &fr->emul);
+		err |= __put_user(break_math, &fr->badinst);
 	}
-	err |= __get_user(cookie, &fr->cookie);
 
-	if (unlikely(err || (insn != BREAK_MATH) || (cookie != BD_COOKIE))) {
+	if (unlikely(err)) {
 		MIPS_FPU_EMU_INC_STATS(errors);
-		return 0;
+		free_emuframe(fr_idx, current->mm);
+		return SIGBUS;
 	}
 
-	/*
-	 * At this point, we are satisfied that it's a BD emulation trap.  Yes,
-	 * a user might have deliberately put two malformed and useless
-	 * instructions in a row in his program, in which case he's in for a
-	 * nasty surprise - the next instruction will be treated as a
-	 * continuation address!  Alas, this seems to be the only way that we
-	 * can handle signals, recursion, and longjmps() in the context of
-	 * emulating the branch delay instruction.
-	 */
+	/* Record the PC of the branch, PC to continue from & frame index */
+	current->thread.bd_emu_branch_pc = branch_pc;
+	current->thread.bd_emu_cont_pc = cont_pc;
+	atomic_set(&current->thread.bd_emu_frame, fr_idx);
 
-	pr_debug("dsemulret\n");
+	/* Change user register context to execute the frame */
+	regs->cp0_epc = (unsigned long)&fr->emul | isa16;
 
-	if (__get_user(epc, &fr->epc)) {		/* Saved EPC */
-		/* This is not a good situation to be in */
-		force_sig(SIGBUS, current);
+	/* Ensure the icache observes our newly written frame */
+	flush_cache_sigtramp((unsigned long)&fr->emul);
 
-		return 0;
+	return 0;
+}
+
+bool do_dsemulret(struct pt_regs *xcp)
+{
+	/* Cleanup the allocated frame, returning if there wasn't one */
+	if (!dsemul_thread_cleanup(current)) {
+		MIPS_FPU_EMU_INC_STATS(errors);
+		return false;
 	}
 
 	/* Set EPC to return to post-branch instruction */
-	xcp->cp0_epc = epc;
+	xcp->cp0_epc = current->thread.bd_emu_cont_pc;
+	pr_debug("dsemulret to 0x%08lx\n", xcp->cp0_epc);
 	MIPS_FPU_EMU_INC_STATS(ds_emul);
-	return 1;
+	return true;
 }
diff --git a/arch/mips/math-emu/ieee754.c b/arch/mips/math-emu/ieee754.c
index 8e97acbbe22c..e16ae7b75dbb 100644
--- a/arch/mips/math-emu/ieee754.c
+++ b/arch/mips/math-emu/ieee754.c
@@ -59,7 +59,8 @@ const union ieee754dp __ieee754dp_spcvals[] = {
 	DPCNST(1, 3,           0x4000000000000ULL),	/* - 10.0   */
 	DPCNST(0, DP_EMAX + 1, 0x0000000000000ULL),	/* + infinity */
 	DPCNST(1, DP_EMAX + 1, 0x0000000000000ULL),	/* - infinity */
-	DPCNST(0, DP_EMAX + 1, 0x7FFFFFFFFFFFFULL),	/* + indef quiet Nan */
+	DPCNST(0, DP_EMAX + 1, 0x7FFFFFFFFFFFFULL),	/* + ind legacy qNaN */
+	DPCNST(0, DP_EMAX + 1, 0x8000000000000ULL),	/* + indef 2008 qNaN */
 	DPCNST(0, DP_EMAX,     0xFFFFFFFFFFFFFULL),	/* + max */
 	DPCNST(1, DP_EMAX,     0xFFFFFFFFFFFFFULL),	/* - max */
 	DPCNST(0, DP_EMIN,     0x0000000000000ULL),	/* + min normal */
@@ -82,7 +83,8 @@ const union ieee754sp __ieee754sp_spcvals[] = {
 	SPCNST(1, 3,	       0x200000),	/* - 10.0   */
 	SPCNST(0, SP_EMAX + 1, 0x000000),	/* + infinity */
 	SPCNST(1, SP_EMAX + 1, 0x000000),	/* - infinity */
-	SPCNST(0, SP_EMAX + 1, 0x3FFFFF),	/* + indef quiet Nan  */
+	SPCNST(0, SP_EMAX + 1, 0x3FFFFF),	/* + indef legacy quiet NaN */
+	SPCNST(0, SP_EMAX + 1, 0x400000),	/* + indef 2008 quiet NaN */
 	SPCNST(0, SP_EMAX,     0x7FFFFF),	/* + max normal */
 	SPCNST(1, SP_EMAX,     0x7FFFFF),	/* - max normal */
 	SPCNST(0, SP_EMIN,     0x000000),	/* + min normal */
diff --git a/arch/mips/math-emu/ieee754.h b/arch/mips/math-emu/ieee754.h
index df94720714c7..e0eb7a965fdf 100644
--- a/arch/mips/math-emu/ieee754.h
+++ b/arch/mips/math-emu/ieee754.h
@@ -67,6 +67,7 @@ union ieee754sp ieee754sp_div(union ieee754sp x, union ieee754sp y);
 union ieee754sp ieee754sp_fint(int x);
 union ieee754sp ieee754sp_flong(s64 x);
 union ieee754sp ieee754sp_fdp(union ieee754dp x);
+union ieee754sp ieee754sp_rint(union ieee754sp x);
 
 int ieee754sp_tint(union ieee754sp x);
 s64 ieee754sp_tlong(union ieee754sp x);
@@ -101,6 +102,7 @@ union ieee754dp ieee754dp_neg(union ieee754dp x);
 union ieee754dp ieee754dp_fint(int x);
 union ieee754dp ieee754dp_flong(s64 x);
 union ieee754dp ieee754dp_fsp(union ieee754sp x);
+union ieee754dp ieee754dp_rint(union ieee754dp x);
 
 int ieee754dp_tint(union ieee754dp x);
 s64 ieee754dp_tlong(union ieee754dp x);
@@ -163,11 +165,12 @@ struct _ieee754_csr {
 };
 #define ieee754_csr (*(struct _ieee754_csr *)(&current->thread.fpu.fcr31))
 
-static inline unsigned ieee754_getrm(void)
+static inline unsigned int ieee754_getrm(void)
 {
 	return (ieee754_csr.rm);
 }
-static inline unsigned ieee754_setrm(unsigned rm)
+
+static inline unsigned int ieee754_setrm(unsigned int rm)
 {
 	return (ieee754_csr.rm = rm);
 }
@@ -175,14 +178,14 @@ static inline unsigned ieee754_setrm(unsigned rm)
 /*
  * get current exceptions
  */
-static inline unsigned ieee754_getcx(void)
+static inline unsigned int ieee754_getcx(void)
 {
 	return (ieee754_csr.cx);
 }
 
 /* test for current exception condition
  */
-static inline int ieee754_cxtest(unsigned n)
+static inline int ieee754_cxtest(unsigned int n)
 {
 	return (ieee754_csr.cx & n);
 }
@@ -190,21 +193,21 @@ static inline int ieee754_cxtest(unsigned n)
 /*
  * get sticky exceptions
  */
-static inline unsigned ieee754_getsx(void)
+static inline unsigned int ieee754_getsx(void)
 {
 	return (ieee754_csr.sx);
 }
 
 /* clear sticky conditions
 */
-static inline unsigned ieee754_clrsx(void)
+static inline unsigned int ieee754_clrsx(void)
 {
 	return (ieee754_csr.sx = 0);
 }
 
 /* test for sticky exception condition
  */
-static inline int ieee754_sxtest(unsigned n)
+static inline int ieee754_sxtest(unsigned int n)
 {
 	return (ieee754_csr.sx & n);
 }
@@ -221,15 +224,16 @@ union ieee754dp ieee754dp_dump(char *s, union ieee754dp x);
 #define IEEE754_SPCVAL_NTEN		5	/* -10.0 */
 #define IEEE754_SPCVAL_PINFINITY	6	/* +inf */
 #define IEEE754_SPCVAL_NINFINITY	7	/* -inf */
-#define IEEE754_SPCVAL_INDEF		8	/* quiet NaN */
-#define IEEE754_SPCVAL_PMAX		9	/* +max norm */
-#define IEEE754_SPCVAL_NMAX		10	/* -max norm */
-#define IEEE754_SPCVAL_PMIN		11	/* +min norm */
-#define IEEE754_SPCVAL_NMIN		12	/* -min norm */
-#define IEEE754_SPCVAL_PMIND		13	/* +min denorm */
-#define IEEE754_SPCVAL_NMIND		14	/* -min denorm */
-#define IEEE754_SPCVAL_P1E31		15	/* + 1.0e31 */
-#define IEEE754_SPCVAL_P1E63		16	/* + 1.0e63 */
+#define IEEE754_SPCVAL_INDEF_LEG	8	/* legacy quiet NaN */
+#define IEEE754_SPCVAL_INDEF_2008	9	/* IEEE 754-2008 quiet NaN */
+#define IEEE754_SPCVAL_PMAX		10	/* +max norm */
+#define IEEE754_SPCVAL_NMAX		11	/* -max norm */
+#define IEEE754_SPCVAL_PMIN		12	/* +min norm */
+#define IEEE754_SPCVAL_NMIN		13	/* -min norm */
+#define IEEE754_SPCVAL_PMIND		14	/* +min denorm */
+#define IEEE754_SPCVAL_NMIND		15	/* -min denorm */
+#define IEEE754_SPCVAL_P1E31		16	/* + 1.0e31 */
+#define IEEE754_SPCVAL_P1E63		17	/* + 1.0e63 */
 
 extern const union ieee754dp __ieee754dp_spcvals[];
 extern const union ieee754sp __ieee754sp_spcvals[];
@@ -243,7 +247,8 @@ extern const union ieee754sp __ieee754sp_spcvals[];
 #define ieee754dp_zero(sn)	(ieee754dp_spcvals[IEEE754_SPCVAL_PZERO+(sn)])
 #define ieee754dp_one(sn)	(ieee754dp_spcvals[IEEE754_SPCVAL_PONE+(sn)])
 #define ieee754dp_ten(sn)	(ieee754dp_spcvals[IEEE754_SPCVAL_PTEN+(sn)])
-#define ieee754dp_indef()	(ieee754dp_spcvals[IEEE754_SPCVAL_INDEF])
+#define ieee754dp_indef()	(ieee754dp_spcvals[IEEE754_SPCVAL_INDEF_LEG + \
+						   ieee754_csr.nan2008])
 #define ieee754dp_max(sn)	(ieee754dp_spcvals[IEEE754_SPCVAL_PMAX+(sn)])
 #define ieee754dp_min(sn)	(ieee754dp_spcvals[IEEE754_SPCVAL_PMIN+(sn)])
 #define ieee754dp_mind(sn)	(ieee754dp_spcvals[IEEE754_SPCVAL_PMIND+(sn)])
@@ -254,7 +259,8 @@ extern const union ieee754sp __ieee754sp_spcvals[];
 #define ieee754sp_zero(sn)	(ieee754sp_spcvals[IEEE754_SPCVAL_PZERO+(sn)])
 #define ieee754sp_one(sn)	(ieee754sp_spcvals[IEEE754_SPCVAL_PONE+(sn)])
 #define ieee754sp_ten(sn)	(ieee754sp_spcvals[IEEE754_SPCVAL_PTEN+(sn)])
-#define ieee754sp_indef()	(ieee754sp_spcvals[IEEE754_SPCVAL_INDEF])
+#define ieee754sp_indef()	(ieee754sp_spcvals[IEEE754_SPCVAL_INDEF_LEG + \
+						   ieee754_csr.nan2008])
 #define ieee754sp_max(sn)	(ieee754sp_spcvals[IEEE754_SPCVAL_PMAX+(sn)])
 #define ieee754sp_min(sn)	(ieee754sp_spcvals[IEEE754_SPCVAL_PMIN+(sn)])
 #define ieee754sp_mind(sn)	(ieee754sp_spcvals[IEEE754_SPCVAL_PMIND+(sn)])
@@ -266,12 +272,25 @@ extern const union ieee754sp __ieee754sp_spcvals[];
  */
 static inline int ieee754si_indef(void)
 {
-	return INT_MAX;
+	return ieee754_csr.nan2008 ? 0 : INT_MAX;
 }
 
 static inline s64 ieee754di_indef(void)
 {
-	return S64_MAX;
+	return ieee754_csr.nan2008 ? 0 : S64_MAX;
+}
+
+/*
+ * Overflow integer value
+ */
+static inline int ieee754si_overflow(int xs)
+{
+	return ieee754_csr.nan2008 && xs ? INT_MIN : INT_MAX;
+}
+
+static inline s64 ieee754di_overflow(int xs)
+{
+	return ieee754_csr.nan2008 && xs ? S64_MIN : S64_MAX;
 }
 
 /* result types for xctx.rt */
diff --git a/arch/mips/math-emu/ieee754dp.c b/arch/mips/math-emu/ieee754dp.c
index 522d843f2ffd..465a0342ed4c 100644
--- a/arch/mips/math-emu/ieee754dp.c
+++ b/arch/mips/math-emu/ieee754dp.c
@@ -37,8 +37,11 @@ static inline int ieee754dp_isnan(union ieee754dp x)
 
 static inline int ieee754dp_issnan(union ieee754dp x)
 {
+	int qbit;
+
 	assert(ieee754dp_isnan(x));
-	return (DPMANT(x) & DP_MBIT(DP_FBITS - 1)) == DP_MBIT(DP_FBITS - 1);
+	qbit = (DPMANT(x) & DP_MBIT(DP_FBITS - 1)) == DP_MBIT(DP_FBITS - 1);
+	return ieee754_csr.nan2008 ^ qbit;
 }
 
 
@@ -51,7 +54,15 @@ union ieee754dp __cold ieee754dp_nanxcpt(union ieee754dp r)
 	assert(ieee754dp_issnan(r));
 
 	ieee754_setcx(IEEE754_INVALID_OPERATION);
-	return ieee754dp_indef();
+	if (ieee754_csr.nan2008) {
+		DPMANT(r) |= DP_MBIT(DP_FBITS - 1);
+	} else {
+		DPMANT(r) &= ~DP_MBIT(DP_FBITS - 1);
+		if (!ieee754dp_isnan(r))
+			DPMANT(r) |= DP_MBIT(DP_FBITS - 2);
+	}
+
+	return r;
 }
 
 static u64 ieee754dp_get_rounding(int sn, u64 xm)
@@ -89,7 +100,7 @@ union ieee754dp ieee754dp_format(int sn, int xe, u64 xm)
 {
 	assert(xm);		/* we don't gen exact zeros (probably should) */
 
-	assert((xm >> (DP_FBITS + 1 + 3)) == 0);	/* no execess */
+	assert((xm >> (DP_FBITS + 1 + 3)) == 0);	/* no excess */
 	assert(xm & (DP_HIDDEN_BIT << 3));
 
 	if (xe < DP_EMIN) {
@@ -157,7 +168,7 @@ union ieee754dp ieee754dp_format(int sn, int xe, u64 xm)
 	/* strip grs bits */
 	xm >>= 3;
 
-	assert((xm >> (DP_FBITS + 1)) == 0);	/* no execess */
+	assert((xm >> (DP_FBITS + 1)) == 0);	/* no excess */
 	assert(xe >= DP_EMIN);
 
 	if (xe > DP_EMAX) {
@@ -190,7 +201,7 @@ union ieee754dp ieee754dp_format(int sn, int xe, u64 xm)
 			ieee754_setcx(IEEE754_UNDERFLOW);
 		return builddp(sn, DP_EMIN - 1 + DP_EBIAS, xm);
 	} else {
-		assert((xm >> (DP_FBITS + 1)) == 0);	/* no execess */
+		assert((xm >> (DP_FBITS + 1)) == 0);	/* no excess */
 		assert(xm & DP_HIDDEN_BIT);
 
 		return builddp(sn, xe + DP_EBIAS, xm & ~DP_HIDDEN_BIT);
diff --git a/arch/mips/math-emu/ieee754dp.h b/arch/mips/math-emu/ieee754dp.h
index e2babd98fee3..a56707b75282 100644
--- a/arch/mips/math-emu/ieee754dp.h
+++ b/arch/mips/math-emu/ieee754dp.h
@@ -55,11 +55,15 @@ static inline int ieee754dp_finite(union ieee754dp x)
 #define XDPSRS1(v)	\
 	(((v) >> 1) | ((v) & 1))
 
+/* 32bit * 32bit => 64bit unsigned integer multiplication */
+#define DPXMULT(x, y)	((u64)(x) * (u64)y)
+
 /* convert denormal to normalized with extended exponent */
 #define DPDNORMx(m,e) \
 	while ((m >> DP_FBITS) == 0) { m <<= 1; e--; }
 #define DPDNORMX	DPDNORMx(xm, xe)
 #define DPDNORMY	DPDNORMx(ym, ye)
+#define DPDNORMZ	DPDNORMx(zm, ze)
 
 static inline union ieee754dp builddp(int s, int bx, u64 m)
 {
diff --git a/arch/mips/math-emu/ieee754int.h b/arch/mips/math-emu/ieee754int.h
index 6383e2c5c1ad..06ac0e2ac7ac 100644
--- a/arch/mips/math-emu/ieee754int.h
+++ b/arch/mips/math-emu/ieee754int.h
@@ -26,6 +26,10 @@
 
 #define CLPAIR(x, y)	((x)*6+(y))
 
+enum maddf_flags {
+	MADDF_NEGATE_PRODUCT	= 1 << 0,
+};
+
 static inline void ieee754_clearcx(void)
 {
 	ieee754_csr.cx = 0;
@@ -50,10 +54,13 @@ static inline int ieee754_class_nan(int xc)
 }
 
 #define COMPXSP \
-	unsigned xm; int xe; int xs __maybe_unused; int xc
+	unsigned int xm; int xe; int xs __maybe_unused; int xc
 
 #define COMPYSP \
-	unsigned ym; int ye; int ys; int yc
+	unsigned int ym; int ye; int ys; int yc
+
+#define COMPZSP \
+	unsigned int zm; int ze; int zs; int zc
 
 #define EXPLODESP(v, vc, vs, ve, vm)					\
 {									\
@@ -63,10 +70,10 @@ static inline int ieee754_class_nan(int xc)
 	if (ve == SP_EMAX+1+SP_EBIAS) {					\
 		if (vm == 0)						\
 			vc = IEEE754_CLASS_INF;				\
-		else if (vm & SP_MBIT(SP_FBITS-1))			\
-			vc = IEEE754_CLASS_SNAN;			\
-		else							\
+		else if (ieee754_csr.nan2008 ^ !(vm & SP_MBIT(SP_FBITS - 1))) \
 			vc = IEEE754_CLASS_QNAN;			\
+		else							\
+			vc = IEEE754_CLASS_SNAN;			\
 	} else if (ve == SP_EMIN-1+SP_EBIAS) {				\
 		if (vm) {						\
 			ve = SP_EMIN;					\
@@ -81,6 +88,7 @@ static inline int ieee754_class_nan(int xc)
 }
 #define EXPLODEXSP EXPLODESP(x, xc, xs, xe, xm)
 #define EXPLODEYSP EXPLODESP(y, yc, ys, ye, ym)
+#define EXPLODEZSP EXPLODESP(z, zc, zs, ze, zm)
 
 
 #define COMPXDP \
@@ -89,6 +97,9 @@ static inline int ieee754_class_nan(int xc)
 #define COMPYDP \
 	u64 ym; int ye; int ys; int yc
 
+#define COMPZDP \
+	u64 zm; int ze; int zs; int zc
+
 #define EXPLODEDP(v, vc, vs, ve, vm)					\
 {									\
 	vm = DPMANT(v);							\
@@ -97,10 +108,10 @@ static inline int ieee754_class_nan(int xc)
 	if (ve == DP_EMAX+1+DP_EBIAS) {					\
 		if (vm == 0)						\
 			vc = IEEE754_CLASS_INF;				\
-		else if (vm & DP_MBIT(DP_FBITS-1))			\
-			vc = IEEE754_CLASS_SNAN;			\
-		else							\
+		else if (ieee754_csr.nan2008 ^ !(vm & DP_MBIT(DP_FBITS - 1))) \
 			vc = IEEE754_CLASS_QNAN;			\
+		else							\
+			vc = IEEE754_CLASS_SNAN;			\
 	} else if (ve == DP_EMIN-1+DP_EBIAS) {				\
 		if (vm) {						\
 			ve = DP_EMIN;					\
@@ -115,6 +126,7 @@ static inline int ieee754_class_nan(int xc)
 }
 #define EXPLODEXDP EXPLODEDP(x, xc, xs, xe, xm)
 #define EXPLODEYDP EXPLODEDP(y, yc, ys, ye, ym)
+#define EXPLODEZDP EXPLODEDP(z, zc, zs, ze, zm)
 
 #define FLUSHDP(v, vc, vs, ve, vm)					\
 	if (vc==IEEE754_CLASS_DNORM) {					\
@@ -140,7 +152,9 @@ static inline int ieee754_class_nan(int xc)
 
 #define FLUSHXDP FLUSHDP(x, xc, xs, xe, xm)
 #define FLUSHYDP FLUSHDP(y, yc, ys, ye, ym)
+#define FLUSHZDP FLUSHDP(z, zc, zs, ze, zm)
 #define FLUSHXSP FLUSHSP(x, xc, xs, xe, xm)
 #define FLUSHYSP FLUSHSP(y, yc, ys, ye, ym)
+#define FLUSHZSP FLUSHSP(z, zc, zs, ze, zm)
 
 #endif /* __IEEE754INT_H  */
diff --git a/arch/mips/math-emu/ieee754sp.c b/arch/mips/math-emu/ieee754sp.c
index ca8e35e33bf7..8423e4c5e415 100644
--- a/arch/mips/math-emu/ieee754sp.c
+++ b/arch/mips/math-emu/ieee754sp.c
@@ -37,8 +37,11 @@ static inline int ieee754sp_isnan(union ieee754sp x)
 
 static inline int ieee754sp_issnan(union ieee754sp x)
 {
+	int qbit;
+
 	assert(ieee754sp_isnan(x));
-	return SPMANT(x) & SP_MBIT(SP_FBITS - 1);
+	qbit = (SPMANT(x) & SP_MBIT(SP_FBITS - 1)) == SP_MBIT(SP_FBITS - 1);
+	return ieee754_csr.nan2008 ^ qbit;
 }
 
 
@@ -51,10 +54,18 @@ union ieee754sp __cold ieee754sp_nanxcpt(union ieee754sp r)
 	assert(ieee754sp_issnan(r));
 
 	ieee754_setcx(IEEE754_INVALID_OPERATION);
-	return ieee754sp_indef();
+	if (ieee754_csr.nan2008) {
+		SPMANT(r) |= SP_MBIT(SP_FBITS - 1);
+	} else {
+		SPMANT(r) &= ~SP_MBIT(SP_FBITS - 1);
+		if (!ieee754sp_isnan(r))
+			SPMANT(r) |= SP_MBIT(SP_FBITS - 2);
+	}
+
+	return r;
 }
 
-static unsigned ieee754sp_get_rounding(int sn, unsigned xm)
+static unsigned int ieee754sp_get_rounding(int sn, unsigned int xm)
 {
 	/* inexact must round of 3 bits
 	 */
@@ -85,11 +96,11 @@ static unsigned ieee754sp_get_rounding(int sn, unsigned xm)
  * xe is an unbiased exponent
  * xm is 3bit extended precision value.
  */
-union ieee754sp ieee754sp_format(int sn, int xe, unsigned xm)
+union ieee754sp ieee754sp_format(int sn, int xe, unsigned int xm)
 {
 	assert(xm);		/* we don't gen exact zeros (probably should) */
 
-	assert((xm >> (SP_FBITS + 1 + 3)) == 0);	/* no execess */
+	assert((xm >> (SP_FBITS + 1 + 3)) == 0);	/* no excess */
 	assert(xm & (SP_HIDDEN_BIT << 3));
 
 	if (xe < SP_EMIN) {
@@ -130,7 +141,8 @@ union ieee754sp ieee754sp_format(int sn, int xe, unsigned xm)
 		} else {
 			/* sticky right shift es bits
 			 */
-			SPXSRSXn(es);
+			xm = XSPSRS(xm, es);
+			xe += es;
 			assert((xm & (SP_HIDDEN_BIT << 3)) == 0);
 			assert(xe == SP_EMIN);
 		}
@@ -155,7 +167,7 @@ union ieee754sp ieee754sp_format(int sn, int xe, unsigned xm)
 	/* strip grs bits */
 	xm >>= 3;
 
-	assert((xm >> (SP_FBITS + 1)) == 0);	/* no execess */
+	assert((xm >> (SP_FBITS + 1)) == 0);	/* no excess */
 	assert(xe >= SP_EMIN);
 
 	if (xe > SP_EMAX) {
@@ -188,7 +200,7 @@ union ieee754sp ieee754sp_format(int sn, int xe, unsigned xm)
 			ieee754_setcx(IEEE754_UNDERFLOW);
 		return buildsp(sn, SP_EMIN - 1 + SP_EBIAS, xm);
 	} else {
-		assert((xm >> (SP_FBITS + 1)) == 0);	/* no execess */
+		assert((xm >> (SP_FBITS + 1)) == 0);	/* no excess */
 		assert(xm & SP_HIDDEN_BIT);
 
 		return buildsp(sn, xe + SP_EBIAS, xm & ~SP_HIDDEN_BIT);
diff --git a/arch/mips/math-emu/ieee754sp.h b/arch/mips/math-emu/ieee754sp.h
index 374a3f00a589..8c5a63804873 100644
--- a/arch/mips/math-emu/ieee754sp.h
+++ b/arch/mips/math-emu/ieee754sp.h
@@ -45,28 +45,31 @@ static inline int ieee754sp_finite(union ieee754sp x)
 	return SPBEXP(x) != SP_EMAX + 1 + SP_EBIAS;
 }
 
+/* 64 bit right shift with rounding */
+#define XSPSRS64(v, rs)						\
+	(((rs) >= 64) ? ((v) != 0) : ((v) >> (rs)) | ((v) << (64-(rs)) != 0))
+
 /* 3bit extended single precision sticky right shift */
-#define SPXSRSXn(rs)							\
-	(xe += rs,							\
-	 xm = (rs > (SP_FBITS+3))?1:((xm) >> (rs)) | ((xm) << (32-(rs)) != 0))
+#define XSPSRS(v, rs)						\
+	((rs > (SP_FBITS+3))?1:((v) >> (rs)) | ((v) << (32-(rs)) != 0))
 
-#define SPXSRSX1() \
-	(xe++, (xm = (xm >> 1) | (xm & 1)))
+#define XSPSRS1(m) \
+	((m >> 1) | (m & 1))
 
-#define SPXSRSYn(rs)								\
-	(ye+=rs,								\
-	 ym = (rs > (SP_FBITS+3))?1:((ym) >> (rs)) | ((ym) << (32-(rs)) != 0))
+#define SPXSRSX1() \
+	(xe++, (xm = XSPSRS1(xm)))
 
 #define SPXSRSY1() \
-	(ye++, (ym = (ym >> 1) | (ym & 1)))
+	(ye++, (ym = XSPSRS1(ym)))
 
 /* convert denormal to normalized with extended exponent */
 #define SPDNORMx(m,e) \
 	while ((m >> SP_FBITS) == 0) { m <<= 1; e--; }
 #define SPDNORMX	SPDNORMx(xm, xe)
 #define SPDNORMY	SPDNORMx(ym, ye)
+#define SPDNORMZ	SPDNORMx(zm, ze)
 
-static inline union ieee754sp buildsp(int s, int bx, unsigned m)
+static inline union ieee754sp buildsp(int s, int bx, unsigned int m)
 {
 	union ieee754sp r;
 
diff --git a/arch/mips/math-emu/me-debugfs.c b/arch/mips/math-emu/me-debugfs.c
index be650ed7db59..8c0ec154aecc 100644
--- a/arch/mips/math-emu/me-debugfs.c
+++ b/arch/mips/math-emu/me-debugfs.c
@@ -28,14 +28,190 @@ static int fpuemu_stat_get(void *data, u64 *val)
 }
 DEFINE_SIMPLE_ATTRIBUTE(fops_fpuemu_stat, fpuemu_stat_get, NULL, "%llu\n");
 
+/*
+ * Used to obtain names for a debugfs instruction counter, given field name
+ * in fpuemustats structure. For example, for input "cmp_sueq_d", the output
+ * would be "cmp.sueq.d". This is needed since dots are not allowed to be
+ * used in structure field names, and are, on the other hand, desired to be
+ * used in debugfs item names to be clearly associated to corresponding
+ * MIPS FPU instructions.
+ */
+static void adjust_instruction_counter_name(char *out_name, char *in_name)
+{
+	int i = 0;
+
+	strcpy(out_name, in_name);
+	while (in_name[i] != '\0') {
+		if (out_name[i] == '_')
+			out_name[i] = '.';
+		i++;
+	}
+}
+
+static int fpuemustats_clear_show(struct seq_file *s, void *unused)
+{
+	__this_cpu_write((fpuemustats).emulated, 0);
+	__this_cpu_write((fpuemustats).loads, 0);
+	__this_cpu_write((fpuemustats).stores, 0);
+	__this_cpu_write((fpuemustats).branches, 0);
+	__this_cpu_write((fpuemustats).cp1ops, 0);
+	__this_cpu_write((fpuemustats).cp1xops, 0);
+	__this_cpu_write((fpuemustats).errors, 0);
+	__this_cpu_write((fpuemustats).ieee754_inexact, 0);
+	__this_cpu_write((fpuemustats).ieee754_underflow, 0);
+	__this_cpu_write((fpuemustats).ieee754_overflow, 0);
+	__this_cpu_write((fpuemustats).ieee754_zerodiv, 0);
+	__this_cpu_write((fpuemustats).ieee754_invalidop, 0);
+	__this_cpu_write((fpuemustats).ds_emul, 0);
+
+	__this_cpu_write((fpuemustats).abs_s, 0);
+	__this_cpu_write((fpuemustats).abs_d, 0);
+	__this_cpu_write((fpuemustats).add_s, 0);
+	__this_cpu_write((fpuemustats).add_d, 0);
+	__this_cpu_write((fpuemustats).bc1eqz, 0);
+	__this_cpu_write((fpuemustats).bc1nez, 0);
+	__this_cpu_write((fpuemustats).ceil_w_s, 0);
+	__this_cpu_write((fpuemustats).ceil_w_d, 0);
+	__this_cpu_write((fpuemustats).ceil_l_s, 0);
+	__this_cpu_write((fpuemustats).ceil_l_d, 0);
+	__this_cpu_write((fpuemustats).class_s, 0);
+	__this_cpu_write((fpuemustats).class_d, 0);
+	__this_cpu_write((fpuemustats).cmp_af_s, 0);
+	__this_cpu_write((fpuemustats).cmp_af_d, 0);
+	__this_cpu_write((fpuemustats).cmp_eq_s, 0);
+	__this_cpu_write((fpuemustats).cmp_eq_d, 0);
+	__this_cpu_write((fpuemustats).cmp_le_s, 0);
+	__this_cpu_write((fpuemustats).cmp_le_d, 0);
+	__this_cpu_write((fpuemustats).cmp_lt_s, 0);
+	__this_cpu_write((fpuemustats).cmp_lt_d, 0);
+	__this_cpu_write((fpuemustats).cmp_ne_s, 0);
+	__this_cpu_write((fpuemustats).cmp_ne_d, 0);
+	__this_cpu_write((fpuemustats).cmp_or_s, 0);
+	__this_cpu_write((fpuemustats).cmp_or_d, 0);
+	__this_cpu_write((fpuemustats).cmp_ueq_s, 0);
+	__this_cpu_write((fpuemustats).cmp_ueq_d, 0);
+	__this_cpu_write((fpuemustats).cmp_ule_s, 0);
+	__this_cpu_write((fpuemustats).cmp_ule_d, 0);
+	__this_cpu_write((fpuemustats).cmp_ult_s, 0);
+	__this_cpu_write((fpuemustats).cmp_ult_d, 0);
+	__this_cpu_write((fpuemustats).cmp_un_s, 0);
+	__this_cpu_write((fpuemustats).cmp_un_d, 0);
+	__this_cpu_write((fpuemustats).cmp_une_s, 0);
+	__this_cpu_write((fpuemustats).cmp_une_d, 0);
+	__this_cpu_write((fpuemustats).cmp_saf_s, 0);
+	__this_cpu_write((fpuemustats).cmp_saf_d, 0);
+	__this_cpu_write((fpuemustats).cmp_seq_s, 0);
+	__this_cpu_write((fpuemustats).cmp_seq_d, 0);
+	__this_cpu_write((fpuemustats).cmp_sle_s, 0);
+	__this_cpu_write((fpuemustats).cmp_sle_d, 0);
+	__this_cpu_write((fpuemustats).cmp_slt_s, 0);
+	__this_cpu_write((fpuemustats).cmp_slt_d, 0);
+	__this_cpu_write((fpuemustats).cmp_sne_s, 0);
+	__this_cpu_write((fpuemustats).cmp_sne_d, 0);
+	__this_cpu_write((fpuemustats).cmp_sor_s, 0);
+	__this_cpu_write((fpuemustats).cmp_sor_d, 0);
+	__this_cpu_write((fpuemustats).cmp_sueq_s, 0);
+	__this_cpu_write((fpuemustats).cmp_sueq_d, 0);
+	__this_cpu_write((fpuemustats).cmp_sule_s, 0);
+	__this_cpu_write((fpuemustats).cmp_sule_d, 0);
+	__this_cpu_write((fpuemustats).cmp_sult_s, 0);
+	__this_cpu_write((fpuemustats).cmp_sult_d, 0);
+	__this_cpu_write((fpuemustats).cmp_sun_s, 0);
+	__this_cpu_write((fpuemustats).cmp_sun_d, 0);
+	__this_cpu_write((fpuemustats).cmp_sune_s, 0);
+	__this_cpu_write((fpuemustats).cmp_sune_d, 0);
+	__this_cpu_write((fpuemustats).cvt_d_l, 0);
+	__this_cpu_write((fpuemustats).cvt_d_s, 0);
+	__this_cpu_write((fpuemustats).cvt_d_w, 0);
+	__this_cpu_write((fpuemustats).cvt_l_s, 0);
+	__this_cpu_write((fpuemustats).cvt_l_d, 0);
+	__this_cpu_write((fpuemustats).cvt_s_d, 0);
+	__this_cpu_write((fpuemustats).cvt_s_l, 0);
+	__this_cpu_write((fpuemustats).cvt_s_w, 0);
+	__this_cpu_write((fpuemustats).cvt_w_s, 0);
+	__this_cpu_write((fpuemustats).cvt_w_d, 0);
+	__this_cpu_write((fpuemustats).div_s, 0);
+	__this_cpu_write((fpuemustats).div_d, 0);
+	__this_cpu_write((fpuemustats).floor_w_s, 0);
+	__this_cpu_write((fpuemustats).floor_w_d, 0);
+	__this_cpu_write((fpuemustats).floor_l_s, 0);
+	__this_cpu_write((fpuemustats).floor_l_d, 0);
+	__this_cpu_write((fpuemustats).maddf_s, 0);
+	__this_cpu_write((fpuemustats).maddf_d, 0);
+	__this_cpu_write((fpuemustats).max_s, 0);
+	__this_cpu_write((fpuemustats).max_d, 0);
+	__this_cpu_write((fpuemustats).maxa_s, 0);
+	__this_cpu_write((fpuemustats).maxa_d, 0);
+	__this_cpu_write((fpuemustats).min_s, 0);
+	__this_cpu_write((fpuemustats).min_d, 0);
+	__this_cpu_write((fpuemustats).mina_s, 0);
+	__this_cpu_write((fpuemustats).mina_d, 0);
+	__this_cpu_write((fpuemustats).mov_s, 0);
+	__this_cpu_write((fpuemustats).mov_d, 0);
+	__this_cpu_write((fpuemustats).msubf_s, 0);
+	__this_cpu_write((fpuemustats).msubf_d, 0);
+	__this_cpu_write((fpuemustats).mul_s, 0);
+	__this_cpu_write((fpuemustats).mul_d, 0);
+	__this_cpu_write((fpuemustats).neg_s, 0);
+	__this_cpu_write((fpuemustats).neg_d, 0);
+	__this_cpu_write((fpuemustats).recip_s, 0);
+	__this_cpu_write((fpuemustats).recip_d, 0);
+	__this_cpu_write((fpuemustats).rint_s, 0);
+	__this_cpu_write((fpuemustats).rint_d, 0);
+	__this_cpu_write((fpuemustats).round_w_s, 0);
+	__this_cpu_write((fpuemustats).round_w_d, 0);
+	__this_cpu_write((fpuemustats).round_l_s, 0);
+	__this_cpu_write((fpuemustats).round_l_d, 0);
+	__this_cpu_write((fpuemustats).rsqrt_s, 0);
+	__this_cpu_write((fpuemustats).rsqrt_d, 0);
+	__this_cpu_write((fpuemustats).sel_s, 0);
+	__this_cpu_write((fpuemustats).sel_d, 0);
+	__this_cpu_write((fpuemustats).seleqz_s, 0);
+	__this_cpu_write((fpuemustats).seleqz_d, 0);
+	__this_cpu_write((fpuemustats).selnez_s, 0);
+	__this_cpu_write((fpuemustats).selnez_d, 0);
+	__this_cpu_write((fpuemustats).sqrt_s, 0);
+	__this_cpu_write((fpuemustats).sqrt_d, 0);
+	__this_cpu_write((fpuemustats).sub_s, 0);
+	__this_cpu_write((fpuemustats).sub_d, 0);
+	__this_cpu_write((fpuemustats).trunc_w_s, 0);
+	__this_cpu_write((fpuemustats).trunc_w_d, 0);
+	__this_cpu_write((fpuemustats).trunc_l_s, 0);
+	__this_cpu_write((fpuemustats).trunc_l_d, 0);
+
+	return 0;
+}
+
+static int fpuemustats_clear_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, fpuemustats_clear_show, inode->i_private);
+}
+
+static const struct file_operations fpuemustats_clear_fops = {
+	.open                   = fpuemustats_clear_open,
+	.read			= seq_read,
+	.llseek			= seq_lseek,
+	.release		= single_release,
+};
+
 static int __init debugfs_fpuemu(void)
 {
-	struct dentry *d, *dir;
+	struct dentry *fpuemu_debugfs_base_dir;
+	struct dentry *fpuemu_debugfs_inst_dir;
+	struct dentry *d, *reset_file;
 
 	if (!mips_debugfs_dir)
 		return -ENODEV;
-	dir = debugfs_create_dir("fpuemustats", mips_debugfs_dir);
-	if (!dir)
+
+	fpuemu_debugfs_base_dir = debugfs_create_dir("fpuemustats",
+						     mips_debugfs_dir);
+	if (!fpuemu_debugfs_base_dir)
+		return -ENOMEM;
+
+	reset_file = debugfs_create_file("fpuemustats_clear", 0444,
+					 mips_debugfs_dir, NULL,
+					 &fpuemustats_clear_fops);
+	if (!reset_file)
 		return -ENOMEM;
 
 #define FPU_EMU_STAT_OFFSET(m)						\
@@ -43,7 +219,7 @@ static int __init debugfs_fpuemu(void)
 
 #define FPU_STAT_CREATE(m)						\
 do {									\
-	d = debugfs_create_file(#m , S_IRUGO, dir,			\
+	d = debugfs_create_file(#m, 0444, fpuemu_debugfs_base_dir,	\
 				(void *)FPU_EMU_STAT_OFFSET(m),		\
 				&fops_fpuemu_stat);			\
 	if (!d)								\
@@ -53,6 +229,7 @@ do {									\
 	FPU_STAT_CREATE(emulated);
 	FPU_STAT_CREATE(loads);
 	FPU_STAT_CREATE(stores);
+	FPU_STAT_CREATE(branches);
 	FPU_STAT_CREATE(cp1ops);
 	FPU_STAT_CREATE(cp1xops);
 	FPU_STAT_CREATE(errors);
@@ -63,6 +240,139 @@ do {									\
 	FPU_STAT_CREATE(ieee754_invalidop);
 	FPU_STAT_CREATE(ds_emul);
 
+	fpuemu_debugfs_inst_dir = debugfs_create_dir("instructions",
+						     fpuemu_debugfs_base_dir);
+	if (!fpuemu_debugfs_inst_dir)
+		return -ENOMEM;
+
+#define FPU_STAT_CREATE_EX(m)						\
+do {									\
+	char name[32];							\
+									\
+	adjust_instruction_counter_name(name, #m);			\
+									\
+	d = debugfs_create_file(name, 0444, fpuemu_debugfs_inst_dir,	\
+				(void *)FPU_EMU_STAT_OFFSET(m),		\
+				&fops_fpuemu_stat);			\
+	if (!d)								\
+		return -ENOMEM;						\
+} while (0)
+
+	FPU_STAT_CREATE_EX(abs_s);
+	FPU_STAT_CREATE_EX(abs_d);
+	FPU_STAT_CREATE_EX(add_s);
+	FPU_STAT_CREATE_EX(add_d);
+	FPU_STAT_CREATE_EX(bc1eqz);
+	FPU_STAT_CREATE_EX(bc1nez);
+	FPU_STAT_CREATE_EX(ceil_w_s);
+	FPU_STAT_CREATE_EX(ceil_w_d);
+	FPU_STAT_CREATE_EX(ceil_l_s);
+	FPU_STAT_CREATE_EX(ceil_l_d);
+	FPU_STAT_CREATE_EX(class_s);
+	FPU_STAT_CREATE_EX(class_d);
+	FPU_STAT_CREATE_EX(cmp_af_s);
+	FPU_STAT_CREATE_EX(cmp_af_d);
+	FPU_STAT_CREATE_EX(cmp_eq_s);
+	FPU_STAT_CREATE_EX(cmp_eq_d);
+	FPU_STAT_CREATE_EX(cmp_le_s);
+	FPU_STAT_CREATE_EX(cmp_le_d);
+	FPU_STAT_CREATE_EX(cmp_lt_s);
+	FPU_STAT_CREATE_EX(cmp_lt_d);
+	FPU_STAT_CREATE_EX(cmp_ne_s);
+	FPU_STAT_CREATE_EX(cmp_ne_d);
+	FPU_STAT_CREATE_EX(cmp_or_s);
+	FPU_STAT_CREATE_EX(cmp_or_d);
+	FPU_STAT_CREATE_EX(cmp_ueq_s);
+	FPU_STAT_CREATE_EX(cmp_ueq_d);
+	FPU_STAT_CREATE_EX(cmp_ule_s);
+	FPU_STAT_CREATE_EX(cmp_ule_d);
+	FPU_STAT_CREATE_EX(cmp_ult_s);
+	FPU_STAT_CREATE_EX(cmp_ult_d);
+	FPU_STAT_CREATE_EX(cmp_un_s);
+	FPU_STAT_CREATE_EX(cmp_un_d);
+	FPU_STAT_CREATE_EX(cmp_une_s);
+	FPU_STAT_CREATE_EX(cmp_une_d);
+	FPU_STAT_CREATE_EX(cmp_saf_s);
+	FPU_STAT_CREATE_EX(cmp_saf_d);
+	FPU_STAT_CREATE_EX(cmp_seq_s);
+	FPU_STAT_CREATE_EX(cmp_seq_d);
+	FPU_STAT_CREATE_EX(cmp_sle_s);
+	FPU_STAT_CREATE_EX(cmp_sle_d);
+	FPU_STAT_CREATE_EX(cmp_slt_s);
+	FPU_STAT_CREATE_EX(cmp_slt_d);
+	FPU_STAT_CREATE_EX(cmp_sne_s);
+	FPU_STAT_CREATE_EX(cmp_sne_d);
+	FPU_STAT_CREATE_EX(cmp_sor_s);
+	FPU_STAT_CREATE_EX(cmp_sor_d);
+	FPU_STAT_CREATE_EX(cmp_sueq_s);
+	FPU_STAT_CREATE_EX(cmp_sueq_d);
+	FPU_STAT_CREATE_EX(cmp_sule_s);
+	FPU_STAT_CREATE_EX(cmp_sule_d);
+	FPU_STAT_CREATE_EX(cmp_sult_s);
+	FPU_STAT_CREATE_EX(cmp_sult_d);
+	FPU_STAT_CREATE_EX(cmp_sun_s);
+	FPU_STAT_CREATE_EX(cmp_sun_d);
+	FPU_STAT_CREATE_EX(cmp_sune_s);
+	FPU_STAT_CREATE_EX(cmp_sune_d);
+	FPU_STAT_CREATE_EX(cvt_d_l);
+	FPU_STAT_CREATE_EX(cvt_d_s);
+	FPU_STAT_CREATE_EX(cvt_d_w);
+	FPU_STAT_CREATE_EX(cvt_l_s);
+	FPU_STAT_CREATE_EX(cvt_l_d);
+	FPU_STAT_CREATE_EX(cvt_s_d);
+	FPU_STAT_CREATE_EX(cvt_s_l);
+	FPU_STAT_CREATE_EX(cvt_s_w);
+	FPU_STAT_CREATE_EX(cvt_w_s);
+	FPU_STAT_CREATE_EX(cvt_w_d);
+	FPU_STAT_CREATE_EX(div_s);
+	FPU_STAT_CREATE_EX(div_d);
+	FPU_STAT_CREATE_EX(floor_w_s);
+	FPU_STAT_CREATE_EX(floor_w_d);
+	FPU_STAT_CREATE_EX(floor_l_s);
+	FPU_STAT_CREATE_EX(floor_l_d);
+	FPU_STAT_CREATE_EX(maddf_s);
+	FPU_STAT_CREATE_EX(maddf_d);
+	FPU_STAT_CREATE_EX(max_s);
+	FPU_STAT_CREATE_EX(max_d);
+	FPU_STAT_CREATE_EX(maxa_s);
+	FPU_STAT_CREATE_EX(maxa_d);
+	FPU_STAT_CREATE_EX(min_s);
+	FPU_STAT_CREATE_EX(min_d);
+	FPU_STAT_CREATE_EX(mina_s);
+	FPU_STAT_CREATE_EX(mina_d);
+	FPU_STAT_CREATE_EX(mov_s);
+	FPU_STAT_CREATE_EX(mov_d);
+	FPU_STAT_CREATE_EX(msubf_s);
+	FPU_STAT_CREATE_EX(msubf_d);
+	FPU_STAT_CREATE_EX(mul_s);
+	FPU_STAT_CREATE_EX(mul_d);
+	FPU_STAT_CREATE_EX(neg_s);
+	FPU_STAT_CREATE_EX(neg_d);
+	FPU_STAT_CREATE_EX(recip_s);
+	FPU_STAT_CREATE_EX(recip_d);
+	FPU_STAT_CREATE_EX(rint_s);
+	FPU_STAT_CREATE_EX(rint_d);
+	FPU_STAT_CREATE_EX(round_w_s);
+	FPU_STAT_CREATE_EX(round_w_d);
+	FPU_STAT_CREATE_EX(round_l_s);
+	FPU_STAT_CREATE_EX(round_l_d);
+	FPU_STAT_CREATE_EX(rsqrt_s);
+	FPU_STAT_CREATE_EX(rsqrt_d);
+	FPU_STAT_CREATE_EX(sel_s);
+	FPU_STAT_CREATE_EX(sel_d);
+	FPU_STAT_CREATE_EX(seleqz_s);
+	FPU_STAT_CREATE_EX(seleqz_d);
+	FPU_STAT_CREATE_EX(selnez_s);
+	FPU_STAT_CREATE_EX(selnez_d);
+	FPU_STAT_CREATE_EX(sqrt_s);
+	FPU_STAT_CREATE_EX(sqrt_d);
+	FPU_STAT_CREATE_EX(sub_s);
+	FPU_STAT_CREATE_EX(sub_d);
+	FPU_STAT_CREATE_EX(trunc_w_s);
+	FPU_STAT_CREATE_EX(trunc_w_d);
+	FPU_STAT_CREATE_EX(trunc_l_s);
+	FPU_STAT_CREATE_EX(trunc_l_d);
+
 	return 0;
 }
 arch_initcall(debugfs_fpuemu);
diff --git a/arch/mips/math-emu/sp_add.c b/arch/mips/math-emu/sp_add.c
index f1c87b07d3b4..51dced9fbdaf 100644
--- a/arch/mips/math-emu/sp_add.c
+++ b/arch/mips/math-emu/sp_add.c
@@ -104,8 +104,7 @@ union ieee754sp ieee754sp_add(union ieee754sp x, union ieee754sp y)
 
 	case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_DNORM):
 		SPDNORMX;
-
-		/* FALL THROUGH */
+		/* fall through */
 
 	case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_DNORM):
 		SPDNORMY;
@@ -132,13 +131,15 @@ union ieee754sp ieee754sp_add(union ieee754sp x, union ieee754sp y)
 		 * Have to shift y fraction right to align.
 		 */
 		s = xe - ye;
-		SPXSRSYn(s);
+		ym = XSPSRS(ym, s);
+		ye += s;
 	} else if (ye > xe) {
 		/*
 		 * Have to shift x fraction right to align.
 		 */
 		s = ye - xe;
-		SPXSRSXn(s);
+		xm = XSPSRS(xm, s);
+		xe += s;
 	}
 	assert(xe == ye);
 	assert(xe <= SP_EMAX);
diff --git a/arch/mips/math-emu/sp_div.c b/arch/mips/math-emu/sp_div.c
index 27f6db3a0a4c..5d2904960eb8 100644
--- a/arch/mips/math-emu/sp_div.c
+++ b/arch/mips/math-emu/sp_div.c
@@ -23,9 +23,9 @@
 
 union ieee754sp ieee754sp_div(union ieee754sp x, union ieee754sp y)
 {
-	unsigned rm;
+	unsigned int rm;
 	int re;
-	unsigned bm;
+	unsigned int bm;
 
 	COMPXSP;
 	COMPYSP;
@@ -103,6 +103,7 @@ union ieee754sp ieee754sp_div(union ieee754sp x, union ieee754sp y)
 
 	case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_DNORM):
 		SPDNORMX;
+		/* fall through */
 
 	case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_DNORM):
 		SPDNORMY;
diff --git a/arch/mips/math-emu/sp_fdp.c b/arch/mips/math-emu/sp_fdp.c
index 3797148893ad..36a50f9082d1 100644
--- a/arch/mips/math-emu/sp_fdp.c
+++ b/arch/mips/math-emu/sp_fdp.c
@@ -44,13 +44,17 @@ union ieee754sp ieee754sp_fdp(union ieee754dp x)
 
 	switch (xc) {
 	case IEEE754_CLASS_SNAN:
-		return ieee754sp_nanxcpt(ieee754sp_nan_fdp(xs, xm));
+		x = ieee754dp_nanxcpt(x);
+		EXPLODEXDP;
+		/* fall through */
 
 	case IEEE754_CLASS_QNAN:
 		y = ieee754sp_nan_fdp(xs, xm);
-		EXPLODEYSP;
-		if (!ieee754_class_nan(yc))
-			y = ieee754sp_indef();
+		if (!ieee754_csr.nan2008) {
+			EXPLODEYSP;
+			if (!ieee754_class_nan(yc))
+				y = ieee754sp_indef();
+		}
 		return y;
 
 	case IEEE754_CLASS_INF:
diff --git a/arch/mips/math-emu/sp_fint.c b/arch/mips/math-emu/sp_fint.c
index d5d8495b2cc4..1a35d12b6fc8 100644
--- a/arch/mips/math-emu/sp_fint.c
+++ b/arch/mips/math-emu/sp_fint.c
@@ -23,7 +23,7 @@
 
 union ieee754sp ieee754sp_fint(int x)
 {
-	unsigned xm;
+	unsigned int xm;
 	int xe;
 	int xs;
 
diff --git a/arch/mips/math-emu/sp_fmax.c b/arch/mips/math-emu/sp_fmax.c
index 74a5a00d2f22..22019ed691df 100644
--- a/arch/mips/math-emu/sp_fmax.c
+++ b/arch/mips/math-emu/sp_fmax.c
@@ -96,6 +96,7 @@ union ieee754sp ieee754sp_fmax(union ieee754sp x, union ieee754sp y)
 
 	case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_DNORM):
 		SPDNORMX;
+		/* fall through */
 
 	case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_DNORM):
 		SPDNORMY;
@@ -224,6 +225,7 @@ union ieee754sp ieee754sp_fmaxa(union ieee754sp x, union ieee754sp y)
 
 	case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_DNORM):
 		SPDNORMX;
+		/* fall through */
 
 	case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_DNORM):
 		SPDNORMY;
diff --git a/arch/mips/math-emu/sp_fmin.c b/arch/mips/math-emu/sp_fmin.c
index c51385f46b09..feaec3985cca 100644
--- a/arch/mips/math-emu/sp_fmin.c
+++ b/arch/mips/math-emu/sp_fmin.c
@@ -96,6 +96,7 @@ union ieee754sp ieee754sp_fmin(union ieee754sp x, union ieee754sp y)
 
 	case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_DNORM):
 		SPDNORMX;
+		/* fall through */
 
 	case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_DNORM):
 		SPDNORMY;
@@ -224,6 +225,7 @@ union ieee754sp ieee754sp_fmina(union ieee754sp x, union ieee754sp y)
 
 	case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_DNORM):
 		SPDNORMX;
+		/* fall through */
 
 	case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_DNORM):
 		SPDNORMY;
diff --git a/arch/mips/math-emu/sp_maddf.c b/arch/mips/math-emu/sp_maddf.c
index dd1dd83e34eb..07ba675401e2 100644
--- a/arch/mips/math-emu/sp_maddf.c
+++ b/arch/mips/math-emu/sp_maddf.c
@@ -14,81 +14,60 @@
 
 #include "ieee754sp.h"
 
-union ieee754sp ieee754sp_maddf(union ieee754sp z, union ieee754sp x,
-				union ieee754sp y)
+
+static union ieee754sp _sp_maddf(union ieee754sp z, union ieee754sp x,
+				 union ieee754sp y, enum maddf_flags flags)
 {
 	int re;
 	int rs;
-	unsigned rm;
-	unsigned short lxm;
-	unsigned short hxm;
-	unsigned short lym;
-	unsigned short hym;
-	unsigned lrm;
-	unsigned hrm;
-	unsigned t;
-	unsigned at;
+	unsigned int rm;
+	u64 rm64;
+	u64 zm64;
 	int s;
 
 	COMPXSP;
 	COMPYSP;
-	u32 zm; int ze; int zs __maybe_unused; int zc;
+	COMPZSP;
 
 	EXPLODEXSP;
 	EXPLODEYSP;
-	EXPLODESP(z, zc, zs, ze, zm)
+	EXPLODEZSP;
 
 	FLUSHXSP;
 	FLUSHYSP;
-	FLUSHSP(z, zc, zs, ze, zm);
+	FLUSHZSP;
 
 	ieee754_clearcx();
 
-	switch (zc) {
-	case IEEE754_CLASS_SNAN:
-		ieee754_setcx(IEEE754_INVALID_OPERATION);
+	/*
+	 * Handle the cases when at least one of x, y or z is a NaN.
+	 * Order of precedence is sNaN, qNaN and z, x, y.
+	 */
+	if (zc == IEEE754_CLASS_SNAN)
 		return ieee754sp_nanxcpt(z);
-	case IEEE754_CLASS_DNORM:
-		SPDNORMx(zm, ze);
-	/* QNAN is handled separately below */
-	}
-
-	switch (CLPAIR(xc, yc)) {
-	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_SNAN):
-	case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_SNAN):
-	case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_SNAN):
-	case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_SNAN):
-	case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_SNAN):
+	if (xc == IEEE754_CLASS_SNAN)
+		return ieee754sp_nanxcpt(x);
+	if (yc == IEEE754_CLASS_SNAN)
 		return ieee754sp_nanxcpt(y);
+	if (zc == IEEE754_CLASS_QNAN)
+		return z;
+	if (xc == IEEE754_CLASS_QNAN)
+		return x;
+	if (yc == IEEE754_CLASS_QNAN)
+		return y;
 
-	case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_SNAN):
-	case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_QNAN):
-	case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_ZERO):
-	case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_NORM):
-	case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_DNORM):
-	case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_INF):
-		return ieee754sp_nanxcpt(x);
+	if (zc == IEEE754_CLASS_DNORM)
+		SPDNORMZ;
+	/* ZERO z cases are handled separately below */
 
-	case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_QNAN):
-	case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_QNAN):
-	case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_QNAN):
-	case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_QNAN):
-		return y;
+	switch (CLPAIR(xc, yc)) {
 
-	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_QNAN):
-	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_ZERO):
-	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_NORM):
-	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_DNORM):
-	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_INF):
-		return x;
 
 	/*
 	 * Infinity handling
 	 */
 	case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_ZERO):
 	case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_INF):
-		if (zc == IEEE754_CLASS_QNAN)
-			return z;
 		ieee754_setcx(IEEE754_INVALID_OPERATION);
 		return ieee754sp_indef();
 
@@ -97,9 +76,27 @@ union ieee754sp ieee754sp_maddf(union ieee754sp z, union ieee754sp x,
 	case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_NORM):
 	case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_DNORM):
 	case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_INF):
-		if (zc == IEEE754_CLASS_QNAN)
-			return z;
-		return ieee754sp_inf(xs ^ ys);
+		if ((zc == IEEE754_CLASS_INF) &&
+		    ((!(flags & MADDF_NEGATE_PRODUCT) && (zs != (xs ^ ys))) ||
+		     ((flags & MADDF_NEGATE_PRODUCT) && (zs == (xs ^ ys))))) {
+			/*
+			 * Cases of addition of infinities with opposite signs
+			 * or subtraction of infinities with same signs.
+			 */
+			ieee754_setcx(IEEE754_INVALID_OPERATION);
+			return ieee754sp_indef();
+		}
+		/*
+		 * z is here either not an infinity, or an infinity having the
+		 * same sign as product (x*y) (in case of MADDF.D instruction)
+		 * or product -(x*y) (in MSUBF.D case). The result must be an
+		 * infinity, and its sign is determined only by the value of
+		 * (flags & MADDF_NEGATE_PRODUCT) and the signs of x and y.
+		 */
+		if (flags & MADDF_NEGATE_PRODUCT)
+			return ieee754sp_inf(1 ^ (xs ^ ys));
+		else
+			return ieee754sp_inf(xs ^ ys);
 
 	case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_ZERO):
 	case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_NORM):
@@ -108,34 +105,45 @@ union ieee754sp ieee754sp_maddf(union ieee754sp z, union ieee754sp x,
 	case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_ZERO):
 		if (zc == IEEE754_CLASS_INF)
 			return ieee754sp_inf(zs);
-		/* Multiplication is 0 so just return z */
+		if (zc == IEEE754_CLASS_ZERO) {
+			/* Handle cases +0 + (-0) and similar ones. */
+			if ((!(flags & MADDF_NEGATE_PRODUCT)
+					&& (zs == (xs ^ ys))) ||
+			    ((flags & MADDF_NEGATE_PRODUCT)
+					&& (zs != (xs ^ ys))))
+				/*
+				 * Cases of addition of zeros of equal signs
+				 * or subtraction of zeroes of opposite signs.
+				 * The sign of the resulting zero is in any
+				 * such case determined only by the sign of z.
+				 */
+				return z;
+
+			return ieee754sp_zero(ieee754_csr.rm == FPU_CSR_RD);
+		}
+		/* x*y is here 0, and z is not 0, so just return z */
 		return z;
 
 	case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_DNORM):
 		SPDNORMX;
+		/* fall through */
 
 	case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_DNORM):
-		if (zc == IEEE754_CLASS_QNAN)
-			return z;
-		else if (zc == IEEE754_CLASS_INF)
+		if (zc == IEEE754_CLASS_INF)
 			return ieee754sp_inf(zs);
 		SPDNORMY;
 		break;
 
 	case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_NORM):
-		if (zc == IEEE754_CLASS_QNAN)
-			return z;
-		else if (zc == IEEE754_CLASS_INF)
+		if (zc == IEEE754_CLASS_INF)
 			return ieee754sp_inf(zs);
 		SPDNORMX;
 		break;
 
 	case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_NORM):
-		if (zc == IEEE754_CLASS_QNAN)
-			return z;
-		else if (zc == IEEE754_CLASS_INF)
+		if (zc == IEEE754_CLASS_INF)
 			return ieee754sp_inf(zs);
-		/* fall through to real computations */
+		/* continue to real computations */
 	}
 
 	/* Finally get to do some computation */
@@ -154,102 +162,104 @@ union ieee754sp ieee754sp_maddf(union ieee754sp z, union ieee754sp x,
 
 	re = xe + ye;
 	rs = xs ^ ys;
+	if (flags & MADDF_NEGATE_PRODUCT)
+		rs ^= 1;
 
-	/* shunt to top of word */
-	xm <<= 32 - (SP_FBITS + 1);
-	ym <<= 32 - (SP_FBITS + 1);
+	/* Multiple 24 bit xm and ym to give 48 bit results */
+	rm64 = (uint64_t)xm * ym;
 
-	/*
-	 * Multiply 32 bits xm, ym to give high 32 bits rm with stickness.
-	 */
-	lxm = xm & 0xffff;
-	hxm = xm >> 16;
-	lym = ym & 0xffff;
-	hym = ym >> 16;
+	/* Shunt to top of word */
+	rm64 = rm64 << 16;
 
-	lrm = lxm * lym;	/* 16 * 16 => 32 */
-	hrm = hxm * hym;	/* 16 * 16 => 32 */
-
-	t = lxm * hym; /* 16 * 16 => 32 */
-	at = lrm + (t << 16);
-	hrm += at < lrm;
-	lrm = at;
-	hrm = hrm + (t >> 16);
-
-	t = hxm * lym; /* 16 * 16 => 32 */
-	at = lrm + (t << 16);
-	hrm += at < lrm;
-	lrm = at;
-	hrm = hrm + (t >> 16);
-
-	rm = hrm | (lrm != 0);
-
-	/*
-	 * Sticky shift down to normal rounding precision.
-	 */
-	if ((int) rm < 0) {
-		rm = (rm >> (32 - (SP_FBITS + 1 + 3))) |
-		    ((rm << (SP_FBITS + 1 + 3)) != 0);
+	/* Put explicit bit at bit 62 if necessary */
+	if ((int64_t) rm64 < 0) {
+		rm64 = rm64 >> 1;
 		re++;
-	} else {
-		rm = (rm >> (32 - (SP_FBITS + 1 + 3 + 1))) |
-		     ((rm << (SP_FBITS + 1 + 3 + 1)) != 0);
 	}
-	assert(rm & (SP_HIDDEN_BIT << 3));
 
-	/* And now the addition */
+	assert(rm64 & (1 << 62));
 
-	assert(zm & SP_HIDDEN_BIT);
+	if (zc == IEEE754_CLASS_ZERO) {
+		/*
+		 * Move explicit bit from bit 62 to bit 26 since the
+		 * ieee754sp_format code expects the mantissa to be
+		 * 27 bits wide (24 + 3 rounding bits).
+		 */
+		rm = XSPSRS64(rm64, (62 - 26));
+		return ieee754sp_format(rs, re, rm);
+	}
 
-	/*
-	 * Provide guard,round and stick bit space.
-	 */
-	zm <<= 3;
+	/* Move explicit bit from bit 23 to bit 62 */
+	zm64 = (uint64_t)zm << (62 - 23);
+	assert(zm64 & (1 << 62));
 
+	/* Make the exponents the same */
 	if (ze > re) {
 		/*
-		 * Have to shift y fraction right to align.
+		 * Have to shift r fraction right to align.
 		 */
 		s = ze - re;
-		SPXSRSYn(s);
+		rm64 = XSPSRS64(rm64, s);
+		re += s;
 	} else if (re > ze) {
 		/*
-		 * Have to shift x fraction right to align.
+		 * Have to shift z fraction right to align.
 		 */
 		s = re - ze;
-		SPXSRSYn(s);
+		zm64 = XSPSRS64(zm64, s);
+		ze += s;
 	}
 	assert(ze == re);
 	assert(ze <= SP_EMAX);
 
+	/* Do the addition */
 	if (zs == rs) {
 		/*
-		 * Generate 28 bit result of adding two 27 bit numbers
-		 * leaving result in zm, zs and ze.
+		 * Generate 64 bit result by adding two 63 bit numbers
+		 * leaving result in zm64, zs and ze.
 		 */
-		zm = zm + rm;
-
-		if (zm >> (SP_FBITS + 1 + 3)) { /* carry out */
-			SPXSRSX1();
+		zm64 = zm64 + rm64;
+		if ((int64_t)zm64 < 0) {	/* carry out */
+			zm64 = XSPSRS1(zm64);
+			ze++;
 		}
 	} else {
-		if (zm >= rm) {
-			zm = zm - rm;
+		if (zm64 >= rm64) {
+			zm64 = zm64 - rm64;
 		} else {
-			zm = rm - zm;
+			zm64 = rm64 - zm64;
 			zs = rs;
 		}
-		if (zm == 0)
+		if (zm64 == 0)
 			return ieee754sp_zero(ieee754_csr.rm == FPU_CSR_RD);
 
 		/*
-		 * Normalize in extended single precision
+		 * Put explicit bit at bit 62 if necessary.
 		 */
-		while ((zm >> (SP_MBITS + 3)) == 0) {
-			zm <<= 1;
+		while ((zm64 >> 62) == 0) {
+			zm64 <<= 1;
 			ze--;
 		}
-
 	}
+
+	/*
+	 * Move explicit bit from bit 62 to bit 26 since the
+	 * ieee754sp_format code expects the mantissa to be
+	 * 27 bits wide (24 + 3 rounding bits).
+	 */
+	zm = XSPSRS64(zm64, (62 - 26));
+
 	return ieee754sp_format(zs, ze, zm);
 }
+
+union ieee754sp ieee754sp_maddf(union ieee754sp z, union ieee754sp x,
+				union ieee754sp y)
+{
+	return _sp_maddf(z, x, y, 0);
+}
+
+union ieee754sp ieee754sp_msubf(union ieee754sp z, union ieee754sp x,
+				union ieee754sp y)
+{
+	return _sp_maddf(z, x, y, MADDF_NEGATE_PRODUCT);
+}
diff --git a/arch/mips/math-emu/sp_msubf.c b/arch/mips/math-emu/sp_msubf.c
deleted file mode 100644
index 81c38b980d69..000000000000
--- a/arch/mips/math-emu/sp_msubf.c
+++ /dev/null
@@ -1,258 +0,0 @@
-/*
- * IEEE754 floating point arithmetic
- * single precision: MSUB.f (Fused Multiply Subtract)
- * MSUBF.fmt: FPR[fd] = FPR[fd] - (FPR[fs] x FPR[ft])
- *
- * MIPS floating point support
- * Copyright (C) 2015 Imagination Technologies, Ltd.
- * Author: Markos Chandras <markos.chandras@imgtec.com>
- *
- *  This program is free software; you can distribute it and/or modify it
- *  under the terms of the GNU General Public License as published by the
- *  Free Software Foundation; version 2 of the License.
- */
-
-#include "ieee754sp.h"
-
-union ieee754sp ieee754sp_msubf(union ieee754sp z, union ieee754sp x,
-				union ieee754sp y)
-{
-	int re;
-	int rs;
-	unsigned rm;
-	unsigned short lxm;
-	unsigned short hxm;
-	unsigned short lym;
-	unsigned short hym;
-	unsigned lrm;
-	unsigned hrm;
-	unsigned t;
-	unsigned at;
-	int s;
-
-	COMPXSP;
-	COMPYSP;
-	u32 zm; int ze; int zs __maybe_unused; int zc;
-
-	EXPLODEXSP;
-	EXPLODEYSP;
-	EXPLODESP(z, zc, zs, ze, zm)
-
-	FLUSHXSP;
-	FLUSHYSP;
-	FLUSHSP(z, zc, zs, ze, zm);
-
-	ieee754_clearcx();
-
-	switch (zc) {
-	case IEEE754_CLASS_SNAN:
-		ieee754_setcx(IEEE754_INVALID_OPERATION);
-		return ieee754sp_nanxcpt(z);
-	case IEEE754_CLASS_DNORM:
-		SPDNORMx(zm, ze);
-	/* QNAN is handled separately below */
-	}
-
-	switch (CLPAIR(xc, yc)) {
-	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_SNAN):
-	case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_SNAN):
-	case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_SNAN):
-	case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_SNAN):
-	case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_SNAN):
-		return ieee754sp_nanxcpt(y);
-
-	case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_SNAN):
-	case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_QNAN):
-	case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_ZERO):
-	case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_NORM):
-	case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_DNORM):
-	case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_INF):
-		return ieee754sp_nanxcpt(x);
-
-	case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_QNAN):
-	case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_QNAN):
-	case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_QNAN):
-	case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_QNAN):
-		return y;
-
-	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_QNAN):
-	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_ZERO):
-	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_NORM):
-	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_DNORM):
-	case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_INF):
-		return x;
-
-	/*
-	 * Infinity handling
-	 */
-	case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_ZERO):
-	case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_INF):
-		if (zc == IEEE754_CLASS_QNAN)
-			return z;
-		ieee754_setcx(IEEE754_INVALID_OPERATION);
-		return ieee754sp_indef();
-
-	case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_INF):
-	case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_INF):
-	case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_NORM):
-	case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_DNORM):
-	case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_INF):
-		if (zc == IEEE754_CLASS_QNAN)
-			return z;
-		return ieee754sp_inf(xs ^ ys);
-
-	case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_ZERO):
-	case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_NORM):
-	case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_DNORM):
-	case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_ZERO):
-	case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_ZERO):
-		if (zc == IEEE754_CLASS_INF)
-			return ieee754sp_inf(zs);
-		/* Multiplication is 0 so just return z */
-		return z;
-
-	case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_DNORM):
-		SPDNORMX;
-
-	case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_DNORM):
-		if (zc == IEEE754_CLASS_QNAN)
-			return z;
-		else if (zc == IEEE754_CLASS_INF)
-			return ieee754sp_inf(zs);
-		SPDNORMY;
-		break;
-
-	case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_NORM):
-		if (zc == IEEE754_CLASS_QNAN)
-			return z;
-		else if (zc == IEEE754_CLASS_INF)
-			return ieee754sp_inf(zs);
-		SPDNORMX;
-		break;
-
-	case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_NORM):
-		if (zc == IEEE754_CLASS_QNAN)
-			return z;
-		else if (zc == IEEE754_CLASS_INF)
-			return ieee754sp_inf(zs);
-		/* fall through to real compuation */
-	}
-
-	/* Finally get to do some computation */
-
-	/*
-	 * Do the multiplication bit first
-	 *
-	 * rm = xm * ym, re = xe + ye basically
-	 *
-	 * At this point xm and ym should have been normalized.
-	 */
-
-	/* rm = xm * ym, re = xe+ye basically */
-	assert(xm & SP_HIDDEN_BIT);
-	assert(ym & SP_HIDDEN_BIT);
-
-	re = xe + ye;
-	rs = xs ^ ys;
-
-	/* shunt to top of word */
-	xm <<= 32 - (SP_FBITS + 1);
-	ym <<= 32 - (SP_FBITS + 1);
-
-	/*
-	 * Multiply 32 bits xm, ym to give high 32 bits rm with stickness.
-	 */
-	lxm = xm & 0xffff;
-	hxm = xm >> 16;
-	lym = ym & 0xffff;
-	hym = ym >> 16;
-
-	lrm = lxm * lym;	/* 16 * 16 => 32 */
-	hrm = hxm * hym;	/* 16 * 16 => 32 */
-
-	t = lxm * hym; /* 16 * 16 => 32 */
-	at = lrm + (t << 16);
-	hrm += at < lrm;
-	lrm = at;
-	hrm = hrm + (t >> 16);
-
-	t = hxm * lym; /* 16 * 16 => 32 */
-	at = lrm + (t << 16);
-	hrm += at < lrm;
-	lrm = at;
-	hrm = hrm + (t >> 16);
-
-	rm = hrm | (lrm != 0);
-
-	/*
-	 * Sticky shift down to normal rounding precision.
-	 */
-	if ((int) rm < 0) {
-		rm = (rm >> (32 - (SP_FBITS + 1 + 3))) |
-		    ((rm << (SP_FBITS + 1 + 3)) != 0);
-		re++;
-	} else {
-		rm = (rm >> (32 - (SP_FBITS + 1 + 3 + 1))) |
-		     ((rm << (SP_FBITS + 1 + 3 + 1)) != 0);
-	}
-	assert(rm & (SP_HIDDEN_BIT << 3));
-
-	/* And now the subtraction */
-
-	/* Flip sign of r and handle as add */
-	rs ^= 1;
-
-	assert(zm & SP_HIDDEN_BIT);
-
-	/*
-	 * Provide guard,round and stick bit space.
-	 */
-	zm <<= 3;
-
-	if (ze > re) {
-		/*
-		 * Have to shift y fraction right to align.
-		 */
-		s = ze - re;
-		SPXSRSYn(s);
-	} else if (re > ze) {
-		/*
-		 * Have to shift x fraction right to align.
-		 */
-		s = re - ze;
-		SPXSRSYn(s);
-	}
-	assert(ze == re);
-	assert(ze <= SP_EMAX);
-
-	if (zs == rs) {
-		/*
-		 * Generate 28 bit result of adding two 27 bit numbers
-		 * leaving result in zm, zs and ze.
-		 */
-		zm = zm + rm;
-
-		if (zm >> (SP_FBITS + 1 + 3)) { /* carry out */
-			SPXSRSX1(); /* shift preserving sticky */
-		}
-	} else {
-		if (zm >= rm) {
-			zm = zm - rm;
-		} else {
-			zm = rm - zm;
-			zs = rs;
-		}
-		if (zm == 0)
-			return ieee754sp_zero(ieee754_csr.rm == FPU_CSR_RD);
-
-		/*
-		 * Normalize in extended single precision
-		 */
-		while ((zm >> (SP_MBITS + 3)) == 0) {
-			zm <<= 1;
-			ze--;
-		}
-
-	}
-	return ieee754sp_format(zs, ze, zm);
-}
diff --git a/arch/mips/math-emu/sp_mul.c b/arch/mips/math-emu/sp_mul.c
index d910c43a6f30..fde71e293ec4 100644
--- a/arch/mips/math-emu/sp_mul.c
+++ b/arch/mips/math-emu/sp_mul.c
@@ -25,15 +25,15 @@ union ieee754sp ieee754sp_mul(union ieee754sp x, union ieee754sp y)
 {
 	int re;
 	int rs;
-	unsigned rm;
+	unsigned int rm;
 	unsigned short lxm;
 	unsigned short hxm;
 	unsigned short lym;
 	unsigned short hym;
-	unsigned lrm;
-	unsigned hrm;
-	unsigned t;
-	unsigned at;
+	unsigned int lrm;
+	unsigned int hrm;
+	unsigned int t;
+	unsigned int at;
 
 	COMPXSP;
 	COMPYSP;
@@ -101,6 +101,7 @@ union ieee754sp ieee754sp_mul(union ieee754sp x, union ieee754sp y)
 
 	case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_DNORM):
 		SPDNORMX;
+		/* fall through */
 
 	case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_DNORM):
 		SPDNORMY;
diff --git a/arch/mips/math-emu/sp_rint.c b/arch/mips/math-emu/sp_rint.c
new file mode 100644
index 000000000000..70765b17e196
--- /dev/null
+++ b/arch/mips/math-emu/sp_rint.c
@@ -0,0 +1,90 @@
+/* IEEE754 floating point arithmetic
+ * single precision
+ */
+/*
+ * MIPS floating point support
+ * Copyright (C) 1994-2000 Algorithmics Ltd.
+ * Copyright (C) 2017 Imagination Technologies, Ltd.
+ * Author: Aleksandar Markovic <aleksandar.markovic@imgtec.com>
+ *
+ *  This program is free software; you can distribute it and/or modify it
+ *  under the terms of the GNU General Public License (Version 2) as
+ *  published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope it will be useful, but WITHOUT
+ *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ *  for more details.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with this program.
+ */
+
+#include "ieee754sp.h"
+
+union ieee754sp ieee754sp_rint(union ieee754sp x)
+{
+	union ieee754sp ret;
+	u32 residue;
+	int sticky;
+	int round;
+	int odd;
+
+	COMPXDP;		/* <-- DP needed for 64-bit mantissa tmp */
+
+	ieee754_clearcx();
+
+	EXPLODEXSP;
+	FLUSHXSP;
+
+	if (xc == IEEE754_CLASS_SNAN)
+		return ieee754sp_nanxcpt(x);
+
+	if ((xc == IEEE754_CLASS_QNAN) ||
+	    (xc == IEEE754_CLASS_INF) ||
+	    (xc == IEEE754_CLASS_ZERO))
+		return x;
+
+	if (xe >= SP_FBITS)
+		return x;
+
+	if (xe < -1) {
+		residue = xm;
+		round = 0;
+		sticky = residue != 0;
+		xm = 0;
+	} else {
+		residue = xm << (xe + 1);
+		residue <<= 31 - SP_FBITS;
+		round = (residue >> 31) != 0;
+		sticky = (residue << 1) != 0;
+		xm >>= SP_FBITS - xe;
+	}
+
+	odd = (xm & 0x1) != 0x0;
+
+	switch (ieee754_csr.rm) {
+	case FPU_CSR_RN:	/* toward nearest */
+		if (round && (sticky || odd))
+			xm++;
+		break;
+	case FPU_CSR_RZ:	/* toward zero */
+		break;
+	case FPU_CSR_RU:	/* toward +infinity */
+		if ((round || sticky) && !xs)
+			xm++;
+		break;
+	case FPU_CSR_RD:	/* toward -infinity */
+		if ((round || sticky) && xs)
+			xm++;
+		break;
+	}
+
+	if (round || sticky)
+		ieee754_setcx(IEEE754_INEXACT);
+
+	ret = ieee754sp_flong(xm);
+	SPSIGN(ret) = xs;
+
+	return ret;
+}
diff --git a/arch/mips/math-emu/sp_simple.c b/arch/mips/math-emu/sp_simple.c
index c50e9451f2d2..756c9cf2dfd2 100644
--- a/arch/mips/math-emu/sp_simple.c
+++ b/arch/mips/math-emu/sp_simple.c
@@ -23,27 +23,39 @@
 
 union ieee754sp ieee754sp_neg(union ieee754sp x)
 {
-	unsigned int oldrm;
 	union ieee754sp y;
 
-	oldrm = ieee754_csr.rm;
-	ieee754_csr.rm = FPU_CSR_RD;
-	y = ieee754sp_sub(ieee754sp_zero(0), x);
-	ieee754_csr.rm = oldrm;
+	if (ieee754_csr.abs2008) {
+		y = x;
+		SPSIGN(y) = !SPSIGN(x);
+	} else {
+		unsigned int oldrm;
+
+		oldrm = ieee754_csr.rm;
+		ieee754_csr.rm = FPU_CSR_RD;
+		y = ieee754sp_sub(ieee754sp_zero(0), x);
+		ieee754_csr.rm = oldrm;
+	}
 	return y;
 }
 
 union ieee754sp ieee754sp_abs(union ieee754sp x)
 {
-	unsigned int oldrm;
 	union ieee754sp y;
 
-	oldrm = ieee754_csr.rm;
-	ieee754_csr.rm = FPU_CSR_RD;
-	if (SPSIGN(x))
-		y = ieee754sp_sub(ieee754sp_zero(0), x);
-	else
-		y = ieee754sp_add(ieee754sp_zero(0), x);
-	ieee754_csr.rm = oldrm;
+	if (ieee754_csr.abs2008) {
+		y = x;
+		SPSIGN(y) = 0;
+	} else {
+		unsigned int oldrm;
+
+		oldrm = ieee754_csr.rm;
+		ieee754_csr.rm = FPU_CSR_RD;
+		if (SPSIGN(x))
+			y = ieee754sp_sub(ieee754sp_zero(0), x);
+		else
+			y = ieee754sp_add(ieee754sp_zero(0), x);
+		ieee754_csr.rm = oldrm;
+	}
 	return y;
 }
diff --git a/arch/mips/math-emu/sp_sqrt.c b/arch/mips/math-emu/sp_sqrt.c
index 67059c33a250..9cc83f012342 100644
--- a/arch/mips/math-emu/sp_sqrt.c
+++ b/arch/mips/math-emu/sp_sqrt.c
@@ -82,7 +82,8 @@ union ieee754sp ieee754sp_sqrt(union ieee754sp x)
 
 	/* generate sqrt(x) bit by bit */
 	ix += ix;
-	q = s = 0;		/* q = sqrt(x) */
+	s = 0;
+	q = 0;			/* q = sqrt(x) */
 	r = 0x01000000;		/* r = moving bit from right to left */
 
 	while (r != 0) {
diff --git a/arch/mips/math-emu/sp_sub.c b/arch/mips/math-emu/sp_sub.c
index ec5f937a8b3e..9f2ff72c3d6b 100644
--- a/arch/mips/math-emu/sp_sub.c
+++ b/arch/mips/math-emu/sp_sub.c
@@ -106,6 +106,7 @@ union ieee754sp ieee754sp_sub(union ieee754sp x, union ieee754sp y)
 
 	case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_DNORM):
 		SPDNORMX;
+		/* fall through */
 
 	case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_DNORM):
 		SPDNORMY;
@@ -134,13 +135,15 @@ union ieee754sp ieee754sp_sub(union ieee754sp x, union ieee754sp y)
 		 * have to shift y fraction right to align
 		 */
 		s = xe - ye;
-		SPXSRSYn(s);
+		ym = XSPSRS(ym, s);
+		ye += s;
 	} else if (ye > xe) {
 		/*
 		 * have to shift x fraction right to align
 		 */
 		s = ye - xe;
-		SPXSRSXn(s);
+		xm = XSPSRS(xm, s);
+		xe += s;
 	}
 	assert(xe == ye);
 	assert(xe <= SP_EMAX);
diff --git a/arch/mips/math-emu/sp_tint.c b/arch/mips/math-emu/sp_tint.c
index 091299a31798..f4b4cabfe2e1 100644
--- a/arch/mips/math-emu/sp_tint.c
+++ b/arch/mips/math-emu/sp_tint.c
@@ -38,10 +38,13 @@ int ieee754sp_tint(union ieee754sp x)
 	switch (xc) {
 	case IEEE754_CLASS_SNAN:
 	case IEEE754_CLASS_QNAN:
-	case IEEE754_CLASS_INF:
 		ieee754_setcx(IEEE754_INVALID_OPERATION);
 		return ieee754si_indef();
 
+	case IEEE754_CLASS_INF:
+		ieee754_setcx(IEEE754_INVALID_OPERATION);
+		return ieee754si_overflow(xs);
+
 	case IEEE754_CLASS_ZERO:
 		return 0;
 
@@ -56,7 +59,7 @@ int ieee754sp_tint(union ieee754sp x)
 		/* Set invalid. We will only use overflow for floating
 		   point overflow */
 		ieee754_setcx(IEEE754_INVALID_OPERATION);
-		return ieee754si_indef();
+		return ieee754si_overflow(xs);
 	}
 	/* oh gawd */
 	if (xe > SP_FBITS) {
@@ -97,7 +100,7 @@ int ieee754sp_tint(union ieee754sp x)
 		if ((xm >> 31) != 0) {
 			/* This can happen after rounding */
 			ieee754_setcx(IEEE754_INVALID_OPERATION);
-			return ieee754si_indef();
+			return ieee754si_overflow(xs);
 		}
 		if (round || sticky)
 			ieee754_setcx(IEEE754_INEXACT);
diff --git a/arch/mips/math-emu/sp_tlong.c b/arch/mips/math-emu/sp_tlong.c
index 9f3c742c1cea..bca5ac995801 100644
--- a/arch/mips/math-emu/sp_tlong.c
+++ b/arch/mips/math-emu/sp_tlong.c
@@ -20,7 +20,6 @@
  */
 
 #include "ieee754sp.h"
-#include "ieee754dp.h"
 
 s64 ieee754sp_tlong(union ieee754sp x)
 {
@@ -39,10 +38,13 @@ s64 ieee754sp_tlong(union ieee754sp x)
 	switch (xc) {
 	case IEEE754_CLASS_SNAN:
 	case IEEE754_CLASS_QNAN:
-	case IEEE754_CLASS_INF:
 		ieee754_setcx(IEEE754_INVALID_OPERATION);
 		return ieee754di_indef();
 
+	case IEEE754_CLASS_INF:
+		ieee754_setcx(IEEE754_INVALID_OPERATION);
+		return ieee754di_overflow(xs);
+
 	case IEEE754_CLASS_ZERO:
 		return 0;
 
@@ -57,7 +59,7 @@ s64 ieee754sp_tlong(union ieee754sp x)
 		/* Set invalid. We will only use overflow for floating
 		   point overflow */
 		ieee754_setcx(IEEE754_INVALID_OPERATION);
-		return ieee754di_indef();
+		return ieee754di_overflow(xs);
 	}
 	/* oh gawd */
 	if (xe > SP_FBITS) {
@@ -94,7 +96,7 @@ s64 ieee754sp_tlong(union ieee754sp x)
 		if ((xm >> 63) != 0) {
 			/* This can happen after rounding */
 			ieee754_setcx(IEEE754_INVALID_OPERATION);
-			return ieee754di_indef();
+			return ieee754di_overflow(xs);
 		}
 		if (round || sticky)
 			ieee754_setcx(IEEE754_INEXACT);