#
#  mp_limb_t mulredc1(mp_limb_t * z, const mp_limb_t x, const mp_limb_t y,
#                 const mp_limb_t m, mp_limb_t inv_m)
#
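#  Compute z := x*y mod m, in Montgomery representation, where x, y < m
#  and m is n limbs wide (n = 1 for this routine).  inv_m is the least
#  significant limb of -1/m modulo 2^(n*GMP_LIMB_BITS), i.e. it must
#  satisfy m*inv_m == -1 mod 2^GMP_LIMB_BITS: the code adds u*m to x*y
#  so that the low limb of the sum cancels to zero.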
#
#  The result might be unreduced (larger than m) but becomes reduced
#  after subtracting m. The calling function should take care of that.
#
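#  The general algorithm uses temporary space on the stack for the
#  unreduced product.  Therefore, it cannot be used for large integers
#  (anyway, the algorithm is quadratic).  In this one-limb routine the
#  two-limb product is simply held in two scratch registers.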
#
#  WARNING: z is only n limbs but since it might be unreduced, there
#  could be a carry that does not fit in z. This carry is returned.


include(`config.m4')
	# Section selection and symbol export for mulredc1.
	# NOTE(review): TEXT, GLOBL, TYPE and GSYM_PREFIX are presumably
	# supplied by config.m4 - confirm their expansions there.
	TEXT
	GLOBL GSYM_PREFIX`'mulredc1
	TYPE(GSYM_PREFIX`'mulredc1,`function')

# Map the symbolic operand names used by the routine onto the argument
# locations of the selected calling convention.  TMP1 and TMP2 are scratch
# registers; r8, r9 and r10 are caller-saved in both conventions, so the
# routine does not need to preserve them.
ifdef(`WINDOWS64_ABI',
# Windows x64 convention.
# stack: inv_m, %r9: m, %r8: y, %rdx: x, %rcx: *z
# inv_m is the fifth argument and is read from offset 0x28 from rsp as it
# stands on function entry.  TMP1 shares %r8 with Y, so Y must be consumed
# before TMP1 is written.
`define(`INV_M', `0x28(%rsp)')
define(`M', `%r9')
define(`Y', `%r8')
define(`X', `%rdx')
define(`Z', `%rcx')
define(`TMP2', `%r10')
define(`TMP1', `%r8')',
# Default convention when WINDOWS64_ABI is not defined: all five arguments
# arrive in registers.
# %r8: inv_m, %rcx: m, %rdx: y, %rsi : x, %rdi : *z
`define(`INV_M', `%r8')
define(`M', `%rcx')
define(`Y', `%rdx')
define(`X', `%rsi')
define(`Z', `%rdi')
define(`TMP2', `%r10')
define(`TMP1', `%r9')')

# mulredc1: one-limb Montgomery multiply and reduce.
#   In:   X, Y = operands, M = modulus, INV_M = limb used to form u,
#         Z = pointer to the one-limb result
#   Out:  the result limb is stored at the address in Z; %rax returns the
#         carry out of the final addition (0 or 1)
#   Uses: %rax, %rdx, TMP1, TMP2 and the flags; no stack memory is written
# Every mulq implicitly multiplies by %rax and overwrites %rdx:%rax, and the
# last four instructions form a carry chain, so instruction order matters.
GSYM_PREFIX`'mulredc1:
	movq	Y, %rax
	mulq	X               # rdx:rax = x*y, the full two-limb product
	movq	%rdx, TMP2      # TMP2 = high limb of x*y
	movq	%rax, TMP1      # TMP1 = low limb of x*y (Y is no longer needed)
	mulq	INV_M           # rax = u = low limb of x*y times inv_m; rdx is discarded
	mulq	M               # rdx:rax = u*m
	addq	TMP1, %rax      # low limbs cancel, rax is 0 now (carry is important)
	adcq	TMP2, %rdx      # rdx = high(u*m) + high(x*y) + carry; CF = carry out
	movq	%rdx, (Z)       # store the result limb; mov leaves CF unchanged
	adcq	$0, %rax        # rax = 0 + CF: return the carry that does not fit in z
	ret

# Trailer: nothing is emitted for the Windows ABI.  Otherwise the text below
# is emitted, which declares an empty .note.GNU-stack section guarded by a
# Linux/ELF conditional (conventionally used to mark the object as not
# needing an executable stack).
ifdef(`WINDOWS64_ABI',
,
`
`#'if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
`#'endif
') dnl
