Barretenberg
The ZK-SNARK library at the core of Aztec
Loading...
Searching...
No Matches
field_impl_generic.hpp
Go to the documentation of this file.
1// === AUDIT STATUS ===
2// internal: { status: Completed, auditors: [Raju], commit: }
3// external_1: { status: not started, auditors: [], commit: }
4// external_2: { status: not started, auditors: [], commit: }
5// =====================
6
7#pragma once
8
9#include <array>
10#include <cstdint>
11
12#include "./field_impl.hpp"
14
15namespace bb {
16
17// NOLINTBEGIN(readability-implicit-bool-conversion)
18template <class T>
19constexpr std::pair<uint64_t, uint64_t> field<T>::mul_wide([[maybe_unused]] uint64_t a,
20 [[maybe_unused]] uint64_t b) noexcept
21{
22#if defined(__SIZEOF_INT128__) && !defined(__wasm__)
23 const uint128_t res = (static_cast<uint128_t>(a) * static_cast<uint128_t>(b));
24 return { static_cast<uint64_t>(res), static_cast<uint64_t>(res >> 64) };
25#else
26 static_assert(false, "mul_wide is not implemented for WASM");
27 return { 0, 0 };
28#endif
29}
33template <class T>
34constexpr uint64_t field<T>::mac([[maybe_unused]] const uint64_t a,
35 [[maybe_unused]] const uint64_t b,
36 [[maybe_unused]] const uint64_t c,
37 [[maybe_unused]] const uint64_t carry_in,
38 [[maybe_unused]] uint64_t& carry_out) noexcept
39{
40#if defined(__SIZEOF_INT128__) && !defined(__wasm__)
41 const uint128_t res = static_cast<uint128_t>(a) + (static_cast<uint128_t>(b) * static_cast<uint128_t>(c)) +
42 static_cast<uint128_t>(carry_in);
43 carry_out = static_cast<uint64_t>(res >> 64);
44 return static_cast<uint64_t>(res);
45#else
46 static_assert(false, "mac is not implemented for WASM");
47 return 0;
48#endif
49}
50
55template <class T>
56constexpr void field<T>::mac([[maybe_unused]] const uint64_t a,
57 [[maybe_unused]] const uint64_t b,
58 [[maybe_unused]] const uint64_t c,
59 [[maybe_unused]] const uint64_t carry_in,
60 [[maybe_unused]] uint64_t& out,
61 [[maybe_unused]] uint64_t& carry_out) noexcept
62{
63#if defined(__SIZEOF_INT128__) && !defined(__wasm__)
64 const uint128_t res = static_cast<uint128_t>(a) + (static_cast<uint128_t>(b) * static_cast<uint128_t>(c)) +
65 static_cast<uint128_t>(carry_in);
66 out = static_cast<uint64_t>(res);
67 carry_out = static_cast<uint64_t>(res >> 64);
68#else
69 static_assert(false, "mac is not implemented for WASM");
70#endif
71}
72
73template <class T>
74constexpr uint64_t field<T>::mac_mini([[maybe_unused]] const uint64_t a,
75 [[maybe_unused]] const uint64_t b,
76 [[maybe_unused]] const uint64_t c,
77 [[maybe_unused]] uint64_t& carry_out) noexcept
78{
79#if defined(__SIZEOF_INT128__) && !defined(__wasm__)
80 const uint128_t res = static_cast<uint128_t>(a) + (static_cast<uint128_t>(b) * static_cast<uint128_t>(c));
81 carry_out = static_cast<uint64_t>(res >> 64);
82 return static_cast<uint64_t>(res);
83#else
84 static_assert(false, "mac is not implemented for WASM");
85 return 0;
86#endif
87}
88
89template <class T>
90constexpr void field<T>::mac_mini([[maybe_unused]] const uint64_t a,
91 [[maybe_unused]] const uint64_t b,
92 [[maybe_unused]] const uint64_t c,
93 [[maybe_unused]] uint64_t& out,
94 [[maybe_unused]] uint64_t& carry_out) noexcept
95{
96#if defined(__SIZEOF_INT128__) && !defined(__wasm__)
97 const uint128_t res = static_cast<uint128_t>(a) + (static_cast<uint128_t>(b) * static_cast<uint128_t>(c));
98 out = static_cast<uint64_t>(res);
99 carry_out = static_cast<uint64_t>(res >> 64);
100#else
101 static_assert(false, "mac_mini is not implemented for WASM");
102#endif
103}
104
105template <class T>
106constexpr uint64_t field<T>::mac_discard_lo([[maybe_unused]] const uint64_t a,
107 [[maybe_unused]] const uint64_t b,
108 [[maybe_unused]] const uint64_t c) noexcept
109{
110#if defined(__SIZEOF_INT128__) && !defined(__wasm__)
111 const uint128_t res = static_cast<uint128_t>(a) + (static_cast<uint128_t>(b) * static_cast<uint128_t>(c));
112 return static_cast<uint64_t>(res >> 64);
113#else
114 static_assert(false, "mac_discord_lo is not implemented for WASM");
115 return 0;
116#endif
117}
/**
 * @brief Add two 64-bit limbs with a carry-in, producing the low limb and a carry-out.
 *
 * @param a First addend.
 * @param b Second addend.
 * @param carry_in Incoming carry (expected to be 0 or 1).
 * @param carry_out Set to the high 64 bits of a + b + carry_in (0 or 1 for a 0/1 carry_in).
 * @return The low 64 bits of a + b + carry_in.
 */
template <class T>
constexpr uint64_t field<T>::addc(const uint64_t a,
                                  const uint64_t b,
                                  const uint64_t carry_in,
                                  uint64_t& carry_out) noexcept
{
#if defined(__SIZEOF_INT128__) && !defined(__wasm__)
    uint128_t res = static_cast<uint128_t>(a) + static_cast<uint128_t>(b) + static_cast<uint128_t>(carry_in);
    carry_out = static_cast<uint64_t>(res >> 64);
    return static_cast<uint64_t>(res);
#else
    // Portable fallback: detect overflow of each staged addition via unsigned wrap-around.
    uint64_t r = a + b;
    const uint64_t carry_temp = r < a; // carry_temp == 1 iff a + b overflows (without the carry_in bit)
    r += carry_in;
    carry_out = carry_temp +
                (r < carry_in); // (r < carry_in) iff a + b == 2^64 - 1 and carry_in == 1, which means that (r >= a)
    return r;
#endif
}
/**
 * @brief Subtract with borrow: compute a - (b + (borrow_in >> 63)).
 *
 * Only bit 63 of `borrow_in` is consumed; the convention throughout this file is that a
 * borrow is either 0 or 2^64 - 1 (all ones), so `borrow_in >> 63` yields 0 or 1.
 *
 * @param a Minuend.
 * @param b Subtrahend.
 * @param borrow_in Incoming borrow, expected to be 0 or 2^64 - 1.
 * @param borrow_out Set to 0 when no underflow occurred, or 2^64 - 1 on underflow.
 * @return The low 64 bits of the difference.
 */
template <class T>
constexpr uint64_t field<T>::sbb(const uint64_t a,
                                 const uint64_t b,
                                 const uint64_t borrow_in,
                                 uint64_t& borrow_out) noexcept
{
#if defined(__SIZEOF_INT128__) && !defined(__wasm__)
    uint128_t res = static_cast<uint128_t>(a) - (static_cast<uint128_t>(b) + static_cast<uint128_t>(borrow_in >> 63));
    borrow_out = static_cast<uint64_t>(
        res >> 64); // consider the set of negative outputs of [0, 2^64 - 1] - [0, 2^64]; then the highest-order 64 bits
                    // are either all 0 or all 1. hence `borrow_out` is in {0, 2^64 - 1}.
    return static_cast<uint64_t>(res);
#else
    uint64_t t_1 = a - (borrow_in >> 63ULL);
    uint64_t borrow_temp_1 = t_1 > a; // 1 iff a == 0 and borrow_in is non-zero (i.e., 2^64 - 1).
    uint64_t t_2 = t_1 - b;
    uint64_t borrow_temp_2 = t_2 > t_1; // 1 iff b > t_1
    borrow_out = 0ULL - (borrow_temp_1 | borrow_temp_2); // underflow if either stage underflowed.
    return t_2;
#endif
}
/**
 * @brief Accumulate a doubled cross-product while squaring: compute a + 2*(b*c) + carry_in,
 * spread over a 64-bit output limb and a two-limb carry (carry_lo, carry_hi).
 *
 * Squaring uses the identity that each off-diagonal product b*c appears twice, hence the
 * doubled product and the two separate additions of r1 below.
 *
 * @param a Existing accumulator limb.
 * @param b Multiplicand.
 * @param c Multiplicand.
 * @param carry_in_lo Low limb of the incoming carry.
 * @param carry_in_hi High limb of the incoming carry.
 * @param carry_lo Receives the low limb of the outgoing carry.
 * @param carry_hi Receives the high limb of the outgoing carry.
 * @return The low 64 bits of a + 2*(b*c) + carry_in.
 */
template <class T>
constexpr uint64_t field<T>::square_accumulate([[maybe_unused]] const uint64_t a,
                                               [[maybe_unused]] const uint64_t b,
                                               [[maybe_unused]] const uint64_t c,
                                               [[maybe_unused]] const uint64_t carry_in_lo,
                                               [[maybe_unused]] const uint64_t carry_in_hi,
                                               [[maybe_unused]] uint64_t& carry_lo,
                                               [[maybe_unused]] uint64_t& carry_hi) noexcept
{
#if defined(__SIZEOF_INT128__) && !defined(__wasm__)
    const uint128_t product = static_cast<uint128_t>(b) * static_cast<uint128_t>(c);
    const auto r0 = static_cast<uint64_t>(product); // uint64_t(b * c)
    const auto r1 = static_cast<uint64_t>(product >> 64);
    uint64_t out = r0 + r0;
    carry_lo = (out < r0); // 1 iff r_0 + r_0 overflows. (r_0 = uint_64t(b * c))
    out += a;              // uint_64t(a + (2 * b * c))
    carry_lo += (out < a); // + 1 if a + uint_64t(2 * b * c) overflows
    out += carry_in_lo;    // uint_64t(a + (2 * b * c) + carry_in_lo)
    carry_lo += (out < carry_in_lo);     // + 1 if uint_64t(a + (2 * b * c)) + carry_in_lo overflows.
    carry_lo += r1;                      // + r_1 (r_1 == "high order bits of b * c")
    carry_hi = (carry_lo < r1);          // 1 if adding r_1 to carry_lo causes overflow
    carry_lo += r1;                      // + r_1 (we do this twice because of 2 * (b * c))
    carry_hi += (carry_lo < r1);         // + 1 if adding r_1 causes overflow
    carry_lo += carry_in_hi;             // finally add in the input "upper bits" contribution carry_in_hi
    carry_hi += (carry_lo < carry_in_hi); // + 1 if this caused an overflow
    return out;
#else
    static_assert(false, "square_accumulate is not implemented for WASM");
    return 0;
#endif
}
/**
 * @brief Conditionally subtract the modulus so the element's limbs represent a value in [0, p).
 *
 * For large moduli (top limb >= MODULUS_TOP_LIMB_LARGE_THRESHOLD) a branching compare-and-subtract
 * is used; otherwise a branch-free add of `not_modulus` with mask-based selection keeps the
 * operation constant-time.
 *
 * @return A field element with limbs reduced into [0, modulus).
 */
template <class T> constexpr field<T> field<T>::reduce() const noexcept
{
    if constexpr (modulus.data[3] >= MODULUS_TOP_LIMB_LARGE_THRESHOLD) {
        uint256_t val{ data[0], data[1], data[2], data[3] };
        if (val >= modulus) {
            val -= modulus;
        }
        return { val.data[0], val.data[1], val.data[2], val.data[3] };
    }
    // not_modulus == 2^256 - modulus
    // do limb-based add-and-carry with `not_modulus`. this yields a _constant-time_ algorithm.
    uint64_t t0 = data[0] + not_modulus.data[0];
    uint64_t c = t0 < data[0];
    auto t1 = addc(data[1], not_modulus.data[1], c, c);
    auto t2 = addc(data[2], not_modulus.data[2], c, c);
    auto t3 = addc(data[3], not_modulus.data[3], c, c);
    // c != 0 iff val >= modulus.
    const uint64_t selection_mask = 0ULL - c; // all ones (2^64 - 1) if we have overflowed, 0 otherwise.
    const uint64_t selection_mask_inverse = ~selection_mask;
    // if c == 0, then the original element is already reduced. if we overflow, we want to return the element whose
    // limbs are {t0, t1, t2, t3}.
    return {
        (data[0] & selection_mask_inverse) | (t0 & selection_mask),
        (data[1] & selection_mask_inverse) | (t1 & selection_mask),
        (data[2] & selection_mask_inverse) | (t2 & selection_mask),
        (data[3] & selection_mask_inverse) | (t3 & selection_mask),
    };
}
251
255
256// Both `add` and `sub` use constexpr branching to distinguish the cases: modulus has <= 254 bits (fields associated to
257// BN-254) and modulus has 256 bits. The former has the so-called "coarse" optimization: we allow the inputs to be in
258// the range [0, 2p) and the outputs will similarly only be constrained to [0, 2p)
259
/**
 * @brief Field addition.
 *
 * For 256-bit moduli the sum is computed exactly and up to two conditional subtractions of p
 * bring it back in range. For small (<= 254-bit) moduli the "coarse" representation is used:
 * inputs are in [0, 2p) and the output is only guaranteed to be in [0, 2p); reduction is done
 * branch-free via `twice_not_modulus` and mask selection.
 *
 * @param other The addend.
 * @return this + other (coarse representation for small moduli).
 */
template <class T> constexpr field<T> field<T>::add(const field& other) const noexcept
{
    if constexpr (modulus.data[3] >= MODULUS_TOP_LIMB_LARGE_THRESHOLD) {
        uint64_t r0 = data[0] + other.data[0];
        uint64_t c = r0 < data[0];
        auto r1 = addc(data[1], other.data[1], c, c);
        auto r2 = addc(data[2], other.data[2], c, c);
        auto r3 = addc(data[3], other.data[3], c, c);
        if (c) {
            uint64_t b = 0;
            r0 = sbb(r0, modulus.data[0], b, b);
            r1 = sbb(r1, modulus.data[1], b, b);
            r2 = sbb(r2, modulus.data[2], b, b);
            r3 = sbb(r3, modulus.data[3], b, b);
            // Since both values are in [0, 2^256), the result is in [0, 2^257-2). Subtracting one p might not
            // be enough. We need to ensure that we've underflown the 0 and that might require subtracting an additional
            // p. This can only happen if at least one of the two arguments has uint256_t-element (derived from limbs)
            // LARGER than p (i.e., non-reduced).
            if (!b) {
                b = 0;
                r0 = sbb(r0, modulus.data[0], b, b);
                r1 = sbb(r1, modulus.data[1], b, b);
                r2 = sbb(r2, modulus.data[2], b, b);
                r3 = sbb(r3, modulus.data[3], b, b);
            }
        }
        // if c == 0, i.e., if there was no carry, we do no additional processing. Note that this means that the output
        // might be larger than p, even if the original self and other were in the range [0, p). This is witnessed in
        // the test AddYieldsLimbsBiggerThanModulus.
        return { r0, r1, r2, r3 };
    } else {
        uint64_t r0 = data[0] + other.data[0];
        uint64_t c = r0 < data[0];
        auto r1 = addc(data[1], other.data[1], c, c);
        auto r2 = addc(data[2], other.data[2], c, c);
        uint64_t r3 = data[3] + other.data[3] +
                      c; // in the small modulus branch so this will satisfy the right size bounds: both self
                         // and other are in the range [0, 2p), which means their sum is in [0, 4p-1).

        uint64_t t0 = r0 + twice_not_modulus.data[0];
        c = t0 < twice_not_modulus.data[0];
        uint64_t t1 = addc(r1, twice_not_modulus.data[1], c, c);
        uint64_t t2 = addc(r2, twice_not_modulus.data[2], c, c);
        uint64_t t3 = addc(r3, twice_not_modulus.data[3], c, c);
        // c == 1 iff self + other >= 2 * p.
        // if c == 0, then return the r_i (naive sum still in coarse form), if c == 1, return the t_i.
        const uint64_t selection_mask = 0ULL - c;
        const uint64_t selection_mask_inverse = ~selection_mask;

        return {
            (r0 & selection_mask_inverse) | (t0 & selection_mask),
            (r1 & selection_mask_inverse) | (t1 & selection_mask),
            (r2 & selection_mask_inverse) | (t2 & selection_mask),
            (r3 & selection_mask_inverse) | (t3 & selection_mask),
        };
    }
}
317
/**
 * @brief Field subtraction.
 *
 * Computes this - other limb-wise; a resulting borrow (0 or 2^64 - 1) is used as a mask to
 * conditionally add back p (large moduli, possibly twice) or 2p (small moduli, coarse
 * representation where inputs/outputs live in [0, 2p)).
 *
 * @param other The subtrahend.
 * @return this - other (coarse representation for small moduli).
 */
template <class T> constexpr field<T> field<T>::subtract(const field& other) const noexcept
{
    uint64_t borrow = 0;
    uint64_t r0 = sbb(data[0], other.data[0], borrow, borrow);
    uint64_t r1 = sbb(data[1], other.data[1], borrow, borrow);
    uint64_t r2 = sbb(data[2], other.data[2], borrow, borrow);
    uint64_t r3 = sbb(data[3], other.data[3], borrow, borrow);

    // recall that borrow is in the size-2 set {0, 2^64 - 1}.
    if constexpr (modulus.data[3] >= MODULUS_TOP_LIMB_LARGE_THRESHOLD) {
        // add the modulus if borrow != 0, i.e., if other > self as uint256_t.
        r0 += (modulus.data[0] & borrow);
        uint64_t carry = r0 < (modulus.data[0] & borrow);
        r1 = addc(r1, modulus.data[1] & borrow, carry, carry);
        r2 = addc(r2, modulus.data[2] & borrow, carry, carry);
        r3 = addc(r3, modulus.data[3] & borrow, carry, carry);
        // The value being subtracted is in [0, 2^256); it is possible that adding one copy of
        // p still leaves us with a negative number. To check if we might need to add another copy of p, we check if
        // `carry == 0`; this means that (if we are "in the borrow branch"), the addition did not 2^256-overflow, which
        // means we are still negative. If we are not in the borrow branch (i.e., if `borrow == 0`), `carry == 0` and we
        // add nothing using the
        // `& borrow` trick for the `addc` argument.
        if (!carry) {
            // NOTE: the inner `carry` deliberately shadows the outer one; the final carry-out is
            // discarded because the result is taken mod 2^256.
            r0 += (modulus.data[0] & borrow);
            uint64_t carry = r0 < (modulus.data[0] & borrow);
            r1 = addc(r1, modulus.data[1] & borrow, carry, carry);
            r2 = addc(r2, modulus.data[2] & borrow, carry, carry);
            r3 = addc(r3, (modulus.data[3] & borrow), carry, carry);
        }
        return { r0, r1, r2, r3 };
    }
    // Recall that in this constexpr branch, we use _coarse representation_, meaning the underlying limbs of both self
    // and other yield uint256_t are in [0, 2p) . If there is a borrow, then it is possible that adding one copy of p
    // is insufficient to make the result positive (and adding two copies both preserves the residue mod p and keeps us
    // in the coarse-range).
    r0 += (twice_modulus.data[0] & borrow);
    uint64_t carry = r0 < (twice_modulus.data[0] & borrow);
    r1 = addc(r1, twice_modulus.data[1] & borrow, carry, carry);
    r2 = addc(r2, twice_modulus.data[2] & borrow, carry, carry);
    r3 += (twice_modulus.data[3] & borrow) + carry;

    return { r0, r1, r2, r3 };
}
361
/**
 * @brief Montgomery multiplication for moduli whose top limb is at least
 * MODULUS_TOP_LIMB_LARGE_THRESHOLD: computes (this * other) / R mod p, R = 2^256.
 *
 * @param other Right-hand multiplicand (Montgomery form).
 * @return The Montgomery product, reduced into [0, p).
 */
template <class T> constexpr field<T> field<T>::montgomery_mul_big(const field& other) const noexcept
{
    // only applicable for big moduli
    static_assert(modulus.data[3] >= MODULUS_TOP_LIMB_LARGE_THRESHOLD);

#if defined(__SIZEOF_INT128__) && !defined(__wasm__)
    uint64_t c = 0;
    uint64_t t0 = 0;
    uint64_t t1 = 0;
    uint64_t t2 = 0;
    uint64_t t3 = 0;
    uint64_t t4 = 0;
    uint64_t t5 = 0;
    uint64_t k = 0;

    // Montgomery multiplication main loop: iterates 4 times, once per limb of self.data.
    // We compute self * other in Montgomery form by maintaining a 5-limb running accumulator (t0-t4, with t5 for
    // overflow). In each iteration:
    // 1. Accumulate one limb of self multiplied by all limbs of other into (t0, t1, t2, t3, t4, t5)
    // 2. "Zero out" the lowest limb t0 by computing k = t0 * r_inv (mod 2^64), then adding k * modulus
    // This shifts the accumulator right by one limb position (t1->t0, t2->t1, etc.)
    // The value of k is chosen so that (t0 + k * modulus[0]) ≡ 0 (mod 2^64), meaning the shifting of the accumulator
    // amounts to dividing by 2^64.
    //
    // After 4 iterations, we've accumulated the full product and divided by R = 2^256,
    // leaving the Montgomery form result in (t0, t1, t2, t3, t4).
    for (const auto& element : data) {
        c = 0;
        // element = self.data[j]
        // ti <- ti + self.data[j] * other.data[i] + carry_in, for i = 0..3.
        // c is the carry_in for the computation; the carry-out is written back to c at every step.
        mac(t0, element, other.data[0], c, t0, c);
        mac(t1, element, other.data[1], c, t1, c);
        mac(t2, element, other.data[2], c, t2, c);
        mac(t3, element, other.data[3], c, t3, c);
        // t4 += c, with carry-out written to t5.
        // t5 is in {0, 1}.
        t4 = addc(t4, c, 0, t5);

        // add a multiple of the modulus, so that the result is divisible by 2^64, and then divide. these processes are
        // done "simultaneously".
        k = t0 * T::r_inv;
        // the uint128_t t0 + (t0 * r_inv) * modulus[0] is divisible by 2^64. set c to be the high 64-bits of this
        // number.
        c = mac_discard_lo(t0, k, modulus.data[0]);
        mac(t1, k, modulus.data[1], c, t0, c);
        mac(t2, k, modulus.data[2], c, t1, c);
        mac(t3, k, modulus.data[3], c, t2, c);
        t3 = addc(c, t4, 0, c); // c is now in {0, 1}
        t4 = t5 + c;
    }
    // The result is now contained in the 5*64-bit number with limbs {t0, t1, t2, t3, t4}. In fact, this number has at
    // most 257 bits because t4 is in {0, 1}. Proof: we have just computed (aR * bR + \sum_i k_i p)/(2^256), where each
    // k_i is less than 2^{64i} * (2^64 - 1) for i = 0..3. The numerator is therefore upper-bounded by (2^256 - 1)^2 +
    // (2^256 - 1) * p, hence the whole quantity is bounded by 2^256 + p - 1. Therefore, t4 is in {0, 1}, and we must do
    // at most one subtraction to get in range.

    // constant-time "conditional reduction" that computes the following without branches:
    // `result = (value >= modulus) ? value - modulus : value`
    uint64_t borrow = 0;
    uint64_t r0 = sbb(t0, modulus.data[0], borrow, borrow);
    uint64_t r1 = sbb(t1, modulus.data[1], borrow, borrow);
    uint64_t r2 = sbb(t2, modulus.data[2], borrow, borrow);
    uint64_t r3 = sbb(t3, modulus.data[3], borrow, borrow);
    // if t4 == 1, then from the above upper bound of 2^256 + p - 1, it follows that borrow != 0, i.e., borrow == 2^64
    // - 1. if t4 == 0, both options for borrow are possible.
    borrow = borrow ^ (0ULL - t4); // if t4 == 1 (hence borrow == 2^64 - 1), borrow is cleared to 0 and the subtracted
                                   // value stands. if t4 == 0, borrow is left unchanged: 2^64 - 1 signals that the
                                   // subtraction underflowed and p must be added back below.
    r0 += (modulus.data[0] & borrow);
    uint64_t carry = r0 < (modulus.data[0] & borrow);
    r1 = addc(r1, modulus.data[1] & borrow, carry, carry);
    r2 = addc(r2, modulus.data[2] & borrow, carry, carry);
    r3 += (modulus.data[3] & borrow) + carry;
    return { r0, r1, r2, r3 };
#else

    // Convert 4 64-bit limbs to 9 29-bit limbs
    auto left = wasm_convert(data);
    auto right = wasm_convert(other.data);
    constexpr uint64_t mask = 0x1fffffff;
    uint64_t temp_0 = 0;
    uint64_t temp_1 = 0;
    uint64_t temp_2 = 0;
    uint64_t temp_3 = 0;
    uint64_t temp_4 = 0;
    uint64_t temp_5 = 0;
    uint64_t temp_6 = 0;
    uint64_t temp_7 = 0;
    uint64_t temp_8 = 0;
    uint64_t temp_9 = 0;
    uint64_t temp_10 = 0;
    uint64_t temp_11 = 0;
    uint64_t temp_12 = 0;
    uint64_t temp_13 = 0;
    uint64_t temp_14 = 0;
    uint64_t temp_15 = 0;
    uint64_t temp_16 = 0;
    uint64_t temp_17 = 0;
    // Compute left[0] * right and replace with a representative modulo p that zeros out the lowest
    // 29 bits. In other words, after first reduction: temp_1..temp_8 hold the partial Montgomery product after
    // processing left[0]. temp_0 has been "consumed" (its information propagated via carry to temp_1).
    // Multiply-add 0th limb of the left argument by all 9 limbs of the right argument
    wasm_madd(left[0], right, temp_0, temp_1, temp_2, temp_3, temp_4, temp_5, temp_6, temp_7, temp_8);
    // Instantly Montgomery reduce
    wasm_reduce(temp_0, temp_1, temp_2, temp_3, temp_4, temp_5, temp_6, temp_7, temp_8);
    // Continue for other limbs
    wasm_madd(left[1], right, temp_1, temp_2, temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9);
    wasm_reduce(temp_1, temp_2, temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9);
    wasm_madd(left[2], right, temp_2, temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10);
    wasm_reduce(temp_2, temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10);
    wasm_madd(left[3], right, temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11);
    wasm_reduce(temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11);
    wasm_madd(left[4], right, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12);
    wasm_reduce(temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12);
    wasm_madd(left[5], right, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13);
    wasm_reduce(temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13);
    wasm_madd(left[6], right, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13, temp_14);
    wasm_reduce(temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13, temp_14);
    wasm_madd(left[7], right, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13, temp_14, temp_15);
    wasm_reduce(temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13, temp_14, temp_15);
    wasm_madd(left[8], right, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13, temp_14, temp_15, temp_16);
    wasm_reduce(temp_8, temp_9, temp_10, temp_11, temp_12, temp_13, temp_14, temp_15, temp_16);
    // MontgomeryMul(left, right) := (left * right) / R mod p.
    // Then, after the add/reduce sequence, we have the following: MontgomeryMul(left, right) ≡ \sum_{i=0}^8 temp_{i+9}
    // * 2^{29 * i} mod p. In particular, the information we want is stored in {t_9, ..., t_16}. However, these t_i are
    // not yet 29 bits.
    //
    // Moreover, we claim that the value \sum_{i=0}^8 temp_{i+9} is less than p + 2^{512-261} = p +
    // 2^{251}. The reasoning is again generic: we have computed aR * bR + k_{0, 1, .., 8}p. Each aR and bR are, by
    // assumption, 256 bits, and each k is 29 bits: k_0 is at most 2^29 - 1, k_1 is at most 2^58 - 2^29, etc.
    // Telescoping, this means that the sum is upper-bounded by 2^512 + (2^261 - 1) * p. As we are taking the "high"
    // part, we are simply trying to upper-bound this sum divided by 2^261. In particular, this shows that we have to do
    // at most one subtraction to make the result 256 bits.
    //
    // After all multiplications and additions, convert relaxed form to strict (i.e., force all limbs to be
    // 29 bits)
    temp_10 += temp_9 >> WASM_LIMB_BITS;
    temp_9 &= mask;
    temp_11 += temp_10 >> WASM_LIMB_BITS;
    temp_10 &= mask;
    temp_12 += temp_11 >> WASM_LIMB_BITS;
    temp_11 &= mask;
    temp_13 += temp_12 >> WASM_LIMB_BITS;
    temp_12 &= mask;
    temp_14 += temp_13 >> WASM_LIMB_BITS;
    temp_13 &= mask;
    temp_15 += temp_14 >> WASM_LIMB_BITS;
    temp_14 &= mask;
    temp_16 += temp_15 >> WASM_LIMB_BITS;
    temp_15 &= mask;
    temp_17 += temp_16 >> WASM_LIMB_BITS;
    temp_16 &= mask;

    uint64_t r_temp_0;
    uint64_t r_temp_1;
    uint64_t r_temp_2;
    uint64_t r_temp_3;
    uint64_t r_temp_4;
    uint64_t r_temp_5;
    uint64_t r_temp_6;
    uint64_t r_temp_7;
    uint64_t r_temp_8;

    // Trial-subtract the modulus; each limb borrows via the sign bit of the previous difference.
    r_temp_0 = temp_9 - wasm_modulus[0];
    r_temp_1 = temp_10 - wasm_modulus[1] - ((r_temp_0) >> 63);
    r_temp_2 = temp_11 - wasm_modulus[2] - ((r_temp_1) >> 63);
    r_temp_3 = temp_12 - wasm_modulus[3] - ((r_temp_2) >> 63);
    r_temp_4 = temp_13 - wasm_modulus[4] - ((r_temp_3) >> 63);
    r_temp_5 = temp_14 - wasm_modulus[5] - ((r_temp_4) >> 63);
    r_temp_6 = temp_15 - wasm_modulus[6] - ((r_temp_5) >> 63);
    r_temp_7 = temp_16 - wasm_modulus[7] - ((r_temp_6) >> 63);
    r_temp_8 = temp_17 - wasm_modulus[8] - ((r_temp_7) >> 63);

    // Depending on whether the subtraction underflowed, choose original value or the result of subtraction
    uint64_t new_mask = 0 - (r_temp_8 >> 63);
    uint64_t inverse_mask = (~new_mask) & mask;
    temp_9 = (temp_9 & new_mask) | (r_temp_0 & inverse_mask);
    temp_10 = (temp_10 & new_mask) | (r_temp_1 & inverse_mask);
    temp_11 = (temp_11 & new_mask) | (r_temp_2 & inverse_mask);
    temp_12 = (temp_12 & new_mask) | (r_temp_3 & inverse_mask);
    temp_13 = (temp_13 & new_mask) | (r_temp_4 & inverse_mask);
    temp_14 = (temp_14 & new_mask) | (r_temp_5 & inverse_mask);
    temp_15 = (temp_15 & new_mask) | (r_temp_6 & inverse_mask);
    temp_16 = (temp_16 & new_mask) | (r_temp_7 & inverse_mask);
    temp_17 = (temp_17 & new_mask) | (r_temp_8 & inverse_mask);

    // Convert back to 4 64-bit limbs
    return { (temp_9 << 0) | (temp_10 << 29) | (temp_11 << 58),
             (temp_11 >> 6) | (temp_12 << 23) | (temp_13 << 52),
             (temp_13 >> 12) | (temp_14 << 17) | (temp_15 << 46),
             (temp_15 >> 18) | (temp_16 << 11) | (temp_17 << 40) };

#endif
}
564
565#if defined(__wasm__) || !defined(__SIZEOF_INT128__)
566
/**
 * @brief Multiply one 29-bit limb of the left operand by all 9 limbs of the right operand,
 * accumulating each product into the corresponding 64-bit "relaxed" accumulator limb.
 *
 * No carry propagation happens here: accumulator limbs may temporarily exceed 29 bits and
 * are normalized later by the caller.
 *
 * NOTE(review): `left_limb` is a non-const reference although it is never written — confirm
 * whether this is intentional before tightening the signature.
 *
 * @param left_limb A single 29-bit limb of the left multiplicand.
 * @param right_limbs All 9 limbs of the right multiplicand.
 * @param result_0 Accumulator limb receiving left_limb * right_limbs[0]; likewise result_1..result_8.
 */
template <class T>
constexpr void field<T>::wasm_madd(uint64_t& left_limb,
                                   const std::array<uint64_t, WASM_NUM_LIMBS>& right_limbs,
                                   uint64_t& result_0,
                                   uint64_t& result_1,
                                   uint64_t& result_2,
                                   uint64_t& result_3,
                                   uint64_t& result_4,
                                   uint64_t& result_5,
                                   uint64_t& result_6,
                                   uint64_t& result_7,
                                   uint64_t& result_8)
{
    result_0 += left_limb * right_limbs[0];
    result_1 += left_limb * right_limbs[1];
    result_2 += left_limb * right_limbs[2];
    result_3 += left_limb * right_limbs[3];
    result_4 += left_limb * right_limbs[4];
    result_5 += left_limb * right_limbs[5];
    result_6 += left_limb * right_limbs[6];
    result_7 += left_limb * right_limbs[7];
    result_8 += left_limb * right_limbs[8];
}
595
616template <class T>
617constexpr void field<T>::wasm_reduce(uint64_t& result_0,
618 uint64_t& result_1,
619 uint64_t& result_2,
620 uint64_t& result_3,
621 uint64_t& result_4,
622 uint64_t& result_5,
623 uint64_t& result_6,
624 uint64_t& result_7,
625 uint64_t& result_8)
627 constexpr uint64_t mask = 0x1fffffff;
628 constexpr uint64_t r_inv = T::r_inv & mask; // -(modulus ^ { -1 }) modulo 2 ^ WASM_LIMB_BITS
629 uint64_t k = (result_0 * r_inv) & mask;
630 result_0 += k * wasm_modulus[0];
631 result_1 += k * wasm_modulus[1] + (result_0 >> WASM_LIMB_BITS);
632 result_2 += k * wasm_modulus[2];
633 result_3 += k * wasm_modulus[3];
634 result_4 += k * wasm_modulus[4];
635 result_5 += k * wasm_modulus[5];
636 result_6 += k * wasm_modulus[6];
637 result_7 += k * wasm_modulus[7];
638 result_8 += k * wasm_modulus[8];
639}
/**
 * @brief Montgomery-reduce the lowest limb using the alternative ("Yuval") method: fold the
 * low 29 bits of result_0 into the higher limbs via the precomputed 9-limb table wasm_r_inv,
 * and carry result_0's upper bits into result_1.
 *
 * Per the commentary at the call site in montgomery_mul, wasm_r_inv is presumably the 9-limb
 * representation of 2^{-29} mod p (and empirically < p/2 for Fq and Fr) — confirm against its
 * definition elsewhere in the project. Unlike wasm_reduce, this writes a 10th limb (result_9),
 * which is why the caller uses wasm_reduce for the final reduction step.
 *
 * @param result_0 Lowest accumulator limb, consumed by this call; result_1..result_9 are
 *                 updated in place with the folded contribution.
 */
template <class T>
constexpr void field<T>::wasm_reduce_yuval(uint64_t& result_0,
                                           uint64_t& result_1,
                                           uint64_t& result_2,
                                           uint64_t& result_3,
                                           uint64_t& result_4,
                                           uint64_t& result_5,
                                           uint64_t& result_6,
                                           uint64_t& result_7,
                                           uint64_t& result_8,
                                           uint64_t& result_9)
{
    constexpr uint64_t mask = 0x1fffffff;
    const uint64_t result_0_masked = result_0 & mask;
    result_1 += result_0_masked * wasm_r_inv[0] + (result_0 >> WASM_LIMB_BITS);
    result_2 += result_0_masked * wasm_r_inv[1];
    result_3 += result_0_masked * wasm_r_inv[2];
    result_4 += result_0_masked * wasm_r_inv[3];
    result_5 += result_0_masked * wasm_r_inv[4];
    result_6 += result_0_masked * wasm_r_inv[5];
    result_7 += result_0_masked * wasm_r_inv[6];
    result_8 += result_0_masked * wasm_r_inv[7];
    result_9 += result_0_masked * wasm_r_inv[8];
}
689template <class T> constexpr std::array<uint64_t, WASM_NUM_LIMBS> field<T>::wasm_convert(const uint64_t* data)
690{
691 return { data[0] & 0x1fffffff,
692 (data[0] >> WASM_LIMB_BITS) & 0x1fffffff,
693 ((data[0] >> 58) & 0x3f) | ((data[1] & 0x7fffff) << 6),
694 (data[1] >> 23) & 0x1fffffff,
695 ((data[1] >> 52) & 0xfff) | ((data[2] & 0x1ffff) << 12),
696 (data[2] >> 17) & 0x1fffffff,
697 ((data[2] >> 46) & 0x3ffff) | ((data[3] & 0x7ff) << 18),
698 (data[3] >> 11) & 0x1fffffff,
699 (data[3] >> 40) & 0x1fffffff };
700}
701#endif
702template <class T> constexpr field<T> field<T>::montgomery_mul(const field& other) const noexcept
703{
704 if constexpr (modulus.data[3] >= MODULUS_TOP_LIMB_LARGE_THRESHOLD) {
705 return montgomery_mul_big(other);
706 }
707#if defined(__SIZEOF_INT128__) && !defined(__wasm__)
708 // process first limb of self, data[0]
709 auto [t0, c] = mul_wide(data[0], other.data[0]);
710 uint64_t k = t0 * T::r_inv;
711 uint64_t a = mac_discard_lo(t0, k, modulus.data[0]);
712
713 uint64_t t1 = mac_mini(a, data[0], other.data[1], a);
714 mac(t1, k, modulus.data[1], c, t0, c);
715 uint64_t t2 = mac_mini(a, data[0], other.data[2], a);
716 mac(t2, k, modulus.data[2], c, t1, c);
717 uint64_t t3 = mac_mini(a, data[0], other.data[3], a);
718 mac(t3, k, modulus.data[3], c, t2, c);
719 t3 = c + a;
720 // process second limb of self, data[1]
721 mac_mini(t0, data[1], other.data[0], t0, a);
722 k = t0 * T::r_inv;
723 c = mac_discard_lo(t0, k, modulus.data[0]);
724 mac(t1, data[1], other.data[1], a, t1, a);
725 mac(t1, k, modulus.data[1], c, t0, c);
726 mac(t2, data[1], other.data[2], a, t2, a);
727 mac(t2, k, modulus.data[2], c, t1, c);
728 mac(t3, data[1], other.data[3], a, t3, a);
729 mac(t3, k, modulus.data[3], c, t2, c);
730 t3 = c + a;
731 // process third limb of self, data[2]
732 mac_mini(t0, data[2], other.data[0], t0, a);
733 k = t0 * T::r_inv;
734 c = mac_discard_lo(t0, k, modulus.data[0]);
735 mac(t1, data[2], other.data[1], a, t1, a);
736 mac(t1, k, modulus.data[1], c, t0, c);
737 mac(t2, data[2], other.data[2], a, t2, a);
738 mac(t2, k, modulus.data[2], c, t1, c);
739 mac(t3, data[2], other.data[3], a, t3, a);
740 mac(t3, k, modulus.data[3], c, t2, c);
741 t3 = c + a;
742 // process fourth limb of self, data[3]
743 mac_mini(t0, data[3], other.data[0], t0, a);
744 k = t0 * T::r_inv;
745 c = mac_discard_lo(t0, k, modulus.data[0]);
746 mac(t1, data[3], other.data[1], a, t1, a);
747 mac(t1, k, modulus.data[1], c, t0, c);
748 mac(t2, data[3], other.data[2], a, t2, a);
749 mac(t2, k, modulus.data[2], c, t1, c);
750 mac(t3, data[3], other.data[3], a, t3, a);
751 mac(t3, k, modulus.data[3], c, t2, c);
752 t3 = c + a;
753 return { t0, t1, t2, t3 };
754#else
755
756 // Convert 4 64-bit limbs to 9 29-bit ones
757 auto left = wasm_convert(data);
758 auto right = wasm_convert(other.data);
759 constexpr uint64_t mask = 0x1fffffff;
760 uint64_t temp_0 = 0;
761 uint64_t temp_1 = 0;
762 uint64_t temp_2 = 0;
763 uint64_t temp_3 = 0;
764 uint64_t temp_4 = 0;
765 uint64_t temp_5 = 0;
766 uint64_t temp_6 = 0;
767 uint64_t temp_7 = 0;
768 uint64_t temp_8 = 0;
769 uint64_t temp_9 = 0;
770 uint64_t temp_10 = 0;
771 uint64_t temp_11 = 0;
772 uint64_t temp_12 = 0;
773 uint64_t temp_13 = 0;
774 uint64_t temp_14 = 0;
775 uint64_t temp_15 = 0;
776 uint64_t temp_16 = 0;
777
778 // Perform a series of mul-adds and then reductions
779 wasm_madd(left[0], right, temp_0, temp_1, temp_2, temp_3, temp_4, temp_5, temp_6, temp_7, temp_8);
780 wasm_madd(left[1], right, temp_1, temp_2, temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9);
781 wasm_madd(left[2], right, temp_2, temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10);
782 wasm_madd(left[3], right, temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11);
783 wasm_madd(left[4], right, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12);
784 wasm_madd(left[5], right, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13);
785 wasm_madd(left[6], right, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13, temp_14);
786 wasm_madd(left[7], right, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13, temp_14, temp_15);
787 wasm_madd(left[8], right, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13, temp_14, temp_15, temp_16);
788 // At this point, the value aR * bR is contained in \sum_{i=0}^16 temp_{i}*2^{29*i}. Note that this value is no
789 // greater than 4p^2 as aR and bR are both less than 2p.
790 wasm_reduce_yuval(temp_0, temp_1, temp_2, temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9);
791 wasm_reduce_yuval(temp_1, temp_2, temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10);
792 wasm_reduce_yuval(temp_2, temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11);
793 wasm_reduce_yuval(temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12);
794 wasm_reduce_yuval(temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13);
795 wasm_reduce_yuval(temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13, temp_14);
796 wasm_reduce_yuval(temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13, temp_14, temp_15);
797 wasm_reduce_yuval(temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13, temp_14, temp_15, temp_16);
798
799 // The first 8 limbs are reduced using Yuval's method, the last one is reduced using the regular method
800 // The reason for this is that Yuval's method produces a 10-limb representation of the reduced limb, which is then
801 // added to the higher limbs. If we do this for the last limb we reduce, we'll get a 10-limb representation instead
802 // of a 9-limb one, so we'll have to reduce it again in some other way.
803 wasm_reduce(temp_8, temp_9, temp_10, temp_11, temp_12, temp_13, temp_14, temp_15, temp_16);
804 // We must now reason about the current value of \sum_{i=0}^8 temp_{i+8} from the original assumptions.
805 // Following the algorithm, this is aR * bR + k_{0, 1, ..., 7}*r_inv_wasm + k_8p. Here, k_0 < 2^29-1, k_1 < 2^58 -
806 // 2^29, and so on, until k_8 < 2^261 - 2^232. Moreover, r_inv_wasm < p. (From the definition, it is the value of
807 // 2^{-29} mod p, and our choice of limb-representation is smaller than p. In fact, it is empirically smaller than
808 // p/2 for Fq and Fr.)
809 //
810 // Therefore, this whole sum is bounded by 4p^2 + (2^261 - 1)*p. Dividing by 2^261 and taking
811 // the integral part (corresponding to taking the top half of the limbs), and noting that 4p^2 / 2^261 << 1, we
812 // conclude that the result is in [0, p]. In particular, this implies that we are safely in [0, 2p), as desired.
813 //
814 // Note that the above analysis is soft, and it is overwhelmingly likely that the result is in [0, p). However, the
815 // only guarantee we require is that it is in [0, 2p), as with 254-bit fields we work with the coarse
816 // representation.
817
818 // Convert result to unrelaxed form (all limbs are 29 bits)
819 temp_10 += temp_9 >> WASM_LIMB_BITS;
820 temp_9 &= mask;
821 temp_11 += temp_10 >> WASM_LIMB_BITS;
822 temp_10 &= mask;
823 temp_12 += temp_11 >> WASM_LIMB_BITS;
824 temp_11 &= mask;
825 temp_13 += temp_12 >> WASM_LIMB_BITS;
826 temp_12 &= mask;
827 temp_14 += temp_13 >> WASM_LIMB_BITS;
828 temp_13 &= mask;
829 temp_15 += temp_14 >> WASM_LIMB_BITS;
830 temp_14 &= mask;
831 temp_16 += temp_15 >> WASM_LIMB_BITS;
832 temp_15 &= mask;
833
834 // Convert back to 4 64-bit limbs form
835 return { (temp_9 << 0) | (temp_10 << 29) | (temp_11 << 58),
836 (temp_11 >> 6) | (temp_12 << 23) | (temp_13 << 52),
837 (temp_13 >> 12) | (temp_14 << 17) | (temp_15 << 46),
838 (temp_15 >> 18) | (temp_16 << 11) };
839#endif
840}
/**
 * @brief Squaring via a variant of the Montgomery multiplication algorithm.
 *
 * Exploits the symmetry of squaring: each off-diagonal partial product
 * data[i] * data[j] (i != j) appears twice in the square, so it is computed once
 * and doubled (via square_accumulate on the 128-bit path, or `acc << 1` on the
 * wasm path), while diagonal terms data[i]^2 are added once.
 *
 * @return The Montgomery-reduced square in coarse form, i.e. in [0, 2p).
 */
template <class T> constexpr field<T> field<T>::montgomery_square() const noexcept
{
    // Moduli with a very large top limb need extra carry headroom; defer to the
    // generic big-modulus multiplication routine for those.
    if constexpr (modulus.data[3] >= MODULUS_TOP_LIMB_LARGE_THRESHOLD) {
        return montgomery_mul_big(*this);
    }
#if defined(__SIZEOF_INT128__) && !defined(__wasm__)
    uint64_t carry_hi = 0;

    // Round 0: accumulate data[0] * data[j] terms. square_accumulate doubles the
    // off-diagonal products and threads a two-limb carry (carry_lo, carry_hi).
    auto [t0, carry_lo] = mul_wide(data[0], data[0]);
    uint64_t t1 = square_accumulate(0, data[1], data[0], carry_lo, carry_hi, carry_lo, carry_hi);
    uint64_t t2 = square_accumulate(0, data[2], data[0], carry_lo, carry_hi, carry_lo, carry_hi);
    uint64_t t3 = square_accumulate(0, data[3], data[0], carry_lo, carry_hi, carry_lo, carry_hi);

    uint64_t round_carry = carry_lo;
    // Montgomery reduction step: k is chosen so that t0 + k * modulus.data[0] is
    // 0 mod 2^64, letting the bottom limb be discarded; the surviving limbs are
    // shifted down one position (note mac writes its output into t0..t2).
    uint64_t k = t0 * T::r_inv;
    carry_lo = mac_discard_lo(t0, k, modulus.data[0]);
    mac(t1, k, modulus.data[1], carry_lo, t0, carry_lo);
    mac(t2, k, modulus.data[2], carry_lo, t1, carry_lo);
    mac(t3, k, modulus.data[3], carry_lo, t2, carry_lo);
    t3 = carry_lo + round_carry;

    // Round 1: data[1] diagonal plus doubled data[1] * data[j>1] terms, then a
    // second reduction step.
    t1 = mac_mini(t1, data[1], data[1], carry_lo);
    carry_hi = 0;
    t2 = square_accumulate(t2, data[2], data[1], carry_lo, carry_hi, carry_lo, carry_hi);
    t3 = square_accumulate(t3, data[3], data[1], carry_lo, carry_hi, carry_lo, carry_hi);
    round_carry = carry_lo;
    k = t0 * T::r_inv;
    carry_lo = mac_discard_lo(t0, k, modulus.data[0]);
    mac(t1, k, modulus.data[1], carry_lo, t0, carry_lo);
    mac(t2, k, modulus.data[2], carry_lo, t1, carry_lo);
    mac(t3, k, modulus.data[3], carry_lo, t2, carry_lo);
    t3 = carry_lo + round_carry;

    // Round 2: data[2] diagonal plus doubled data[2] * data[3], then reduce.
    t2 = mac_mini(t2, data[2], data[2], carry_lo);
    carry_hi = 0;
    t3 = square_accumulate(t3, data[3], data[2], carry_lo, carry_hi, carry_lo, carry_hi);
    round_carry = carry_lo;
    k = t0 * T::r_inv;
    carry_lo = mac_discard_lo(t0, k, modulus.data[0]);
    mac(t1, k, modulus.data[1], carry_lo, t0, carry_lo);
    mac(t2, k, modulus.data[2], carry_lo, t1, carry_lo);
    mac(t3, k, modulus.data[3], carry_lo, t2, carry_lo);
    t3 = carry_lo + round_carry;

    // Round 3: final diagonal term data[3]^2 and the fourth (last) reduction step.
    t3 = mac_mini(t3, data[3], data[3], carry_lo);
    k = t0 * T::r_inv;
    round_carry = carry_lo;
    carry_lo = mac_discard_lo(t0, k, modulus.data[0]);
    mac(t1, k, modulus.data[1], carry_lo, t0, carry_lo);
    mac(t2, k, modulus.data[2], carry_lo, t1, carry_lo);
    mac(t3, k, modulus.data[3], carry_lo, t2, carry_lo);
    t3 = carry_lo + round_carry;
    return { t0, t1, t2, t3 };
#else
    // WASM path: no 64x64 -> 128-bit multiply is available, so convert from
    // 4 64-bit limbs to 9 relaxed 29-bit ones whose partial products (and sums
    // thereof) fit in a 64-bit accumulator.
    auto left = wasm_convert(data);
    constexpr uint64_t mask = 0x1fffffff; // 2^29 - 1, used to renormalize limbs
    uint64_t temp_0 = 0;
    uint64_t temp_1 = 0;
    uint64_t temp_2 = 0;
    uint64_t temp_3 = 0;
    uint64_t temp_4 = 0;
    uint64_t temp_5 = 0;
    uint64_t temp_6 = 0;
    uint64_t temp_7 = 0;
    uint64_t temp_8 = 0;
    uint64_t temp_9 = 0;
    uint64_t temp_10 = 0;
    uint64_t temp_11 = 0;
    uint64_t temp_12 = 0;
    uint64_t temp_13 = 0;
    uint64_t temp_14 = 0;
    uint64_t temp_15 = 0;
    uint64_t temp_16 = 0;
    uint64_t acc;
    // Perform multiplications, but accumulate results for limb k = i + j so that
    // the off-diagonal products (which occur twice in a square) can be doubled
    // with a single shift; diagonal products left[i] * left[i] are added once.
    temp_0 += left[0] * left[0];
    acc = 0;
    acc += left[0] * left[1];
    temp_1 += (acc << 1);
    acc = 0;
    acc += left[0] * left[2];
    temp_2 += left[1] * left[1];
    temp_2 += (acc << 1);
    acc = 0;
    acc += left[0] * left[3];
    acc += left[1] * left[2];
    temp_3 += (acc << 1);
    acc = 0;
    acc += left[0] * left[4];
    acc += left[1] * left[3];
    temp_4 += left[2] * left[2];
    temp_4 += (acc << 1);
    acc = 0;
    acc += left[0] * left[5];
    acc += left[1] * left[4];
    acc += left[2] * left[3];
    temp_5 += (acc << 1);
    acc = 0;
    acc += left[0] * left[6];
    acc += left[1] * left[5];
    acc += left[2] * left[4];
    temp_6 += left[3] * left[3];
    temp_6 += (acc << 1);
    acc = 0;
    acc += left[0] * left[7];
    acc += left[1] * left[6];
    acc += left[2] * left[5];
    acc += left[3] * left[4];
    temp_7 += (acc << 1);
    acc = 0;
    acc += left[0] * left[8];
    acc += left[1] * left[7];
    acc += left[2] * left[6];
    acc += left[3] * left[5];
    temp_8 += left[4] * left[4];
    temp_8 += (acc << 1);
    acc = 0;
    acc += left[1] * left[8];
    acc += left[2] * left[7];
    acc += left[3] * left[6];
    acc += left[4] * left[5];
    temp_9 += (acc << 1);
    acc = 0;
    acc += left[2] * left[8];
    acc += left[3] * left[7];
    acc += left[4] * left[6];
    temp_10 += left[5] * left[5];
    temp_10 += (acc << 1);
    acc = 0;
    acc += left[3] * left[8];
    acc += left[4] * left[7];
    acc += left[5] * left[6];
    temp_11 += (acc << 1);
    acc = 0;
    acc += left[4] * left[8];
    acc += left[5] * left[7];
    temp_12 += left[6] * left[6];
    temp_12 += (acc << 1);
    acc = 0;
    acc += left[5] * left[8];
    acc += left[6] * left[7];
    temp_13 += (acc << 1);
    acc = 0;
    acc += left[6] * left[8];
    temp_14 += left[7] * left[7];
    temp_14 += (acc << 1);
    acc = 0;
    acc += left[7] * left[8];
    temp_15 += (acc << 1);
    temp_16 += left[8] * left[8];

    // Perform Montgomery reductions: each call eliminates one low limb of the
    // accumulated product.

    wasm_reduce_yuval(temp_0, temp_1, temp_2, temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9);
    wasm_reduce_yuval(temp_1, temp_2, temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10);
    wasm_reduce_yuval(temp_2, temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11);
    wasm_reduce_yuval(temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12);
    wasm_reduce_yuval(temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13);
    wasm_reduce_yuval(temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13, temp_14);
    wasm_reduce_yuval(temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13, temp_14, temp_15);
    wasm_reduce_yuval(temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13, temp_14, temp_15, temp_16);

    // In case there is some unforeseen edge case encountered in wasm multiplications, we can quickly restore previous
    // functionality. Comment all "wasm_reduce_yuval" and uncomment the following:

    // wasm_reduce(temp_0, temp_1, temp_2, temp_3, temp_4, temp_5, temp_6, temp_7, temp_8);
    // wasm_reduce(temp_1, temp_2, temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9);
    // wasm_reduce(temp_2, temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10);
    // wasm_reduce(temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11);
    // wasm_reduce(temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12);
    // wasm_reduce(temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13);
    // wasm_reduce(temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13, temp_14);
    // wasm_reduce(temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13, temp_14, temp_15);

    // The first 8 limbs are reduced using Yuval's method, the last one is reduced using the regular method
    // The reason for this is that Yuval's method produces a 10-limb representation of the reduced limb, which is then
    // added to the higher limbs. If we do this for the last limb we reduce, we'll get a 10-limb representation instead
    // of a 9-limb one, so we'll have to reduce it again in some other way.
    wasm_reduce(temp_8, temp_9, temp_10, temp_11, temp_12, temp_13, temp_14, temp_15, temp_16);

    // Convert to unrelaxed 29-bit form: propagate carries so every remaining limb
    // holds a canonical 29-bit value before repacking.
    temp_10 += temp_9 >> WASM_LIMB_BITS;
    temp_9 &= mask;
    temp_11 += temp_10 >> WASM_LIMB_BITS;
    temp_10 &= mask;
    temp_12 += temp_11 >> WASM_LIMB_BITS;
    temp_11 &= mask;
    temp_13 += temp_12 >> WASM_LIMB_BITS;
    temp_12 &= mask;
    temp_14 += temp_13 >> WASM_LIMB_BITS;
    temp_13 &= mask;
    temp_15 += temp_14 >> WASM_LIMB_BITS;
    temp_14 &= mask;
    temp_16 += temp_15 >> WASM_LIMB_BITS;
    temp_15 &= mask;
    // Convert the reduced result (limbs 9..16) back to 4 64-bit limbs.
    return { (temp_9 << 0) | (temp_10 << 29) | (temp_11 << 58),
             (temp_11 >> 6) | (temp_12 << 23) | (temp_13 << 52),
             (temp_13 >> 12) | (temp_14 << 17) | (temp_15 << 46),
             (temp_15 >> 18) | (temp_16 << 11) };
#endif
}
1050
/**
 * @brief Compute the full 512-bit product of this field element with `other`.
 *
 * No Montgomery reduction is performed: this is the raw schoolbook product of
 * the two 256-bit operands, returned as a wide_array of 8 64-bit limbs,
 * least-significant limb first.
 *
 * @param other Right-hand multiplicand.
 * @return wide_array holding the 512-bit product.
 */
template <class T> constexpr struct field<T>::wide_array field<T>::mul_512(const field& other) const noexcept
{
#if defined(__SIZEOF_INT128__) && !defined(__wasm__)
    // Schoolbook multiplication: one row of multiply-accumulates per limb of
    // `data`, with two carry chains (`carry` within a row, `carry_2` feeding the
    // top limb of the next row) threaded through.
    uint64_t carry_2 = 0;
    auto [r0, carry] = mul_wide(data[0], other.data[0]);
    uint64_t r1 = mac_mini(carry, data[0], other.data[1], carry);
    uint64_t r2 = mac_mini(carry, data[0], other.data[2], carry);
    uint64_t r3 = mac_mini(carry, data[0], other.data[3], carry_2);

    r1 = mac_mini(r1, data[1], other.data[0], carry);
    r2 = mac(r2, data[1], other.data[1], carry, carry);
    r3 = mac(r3, data[1], other.data[2], carry, carry);
    uint64_t r4 = mac(carry_2, data[1], other.data[3], carry, carry_2);

    r2 = mac_mini(r2, data[2], other.data[0], carry);
    r3 = mac(r3, data[2], other.data[1], carry, carry);
    r4 = mac(r4, data[2], other.data[2], carry, carry);
    uint64_t r5 = mac(carry_2, data[2], other.data[3], carry, carry_2);

    r3 = mac_mini(r3, data[3], other.data[0], carry);
    r4 = mac(r4, data[3], other.data[1], carry, carry);
    r5 = mac(r5, data[3], other.data[2], carry, carry);
    uint64_t r6 = mac(carry_2, data[3], other.data[3], carry, carry_2);

    return { r0, r1, r2, r3, r4, r5, r6, carry_2 };
#else
    // WASM path: no 64x64 -> 128-bit multiply, so convert from 4 64-bit limbs to
    // 9 29-bit limbs whose partial products fit in a 64-bit accumulator.
    auto left = wasm_convert(data);
    auto right = wasm_convert(other.data);
    constexpr uint64_t mask = 0x1fffffff; // 2^29 - 1, used to renormalize limbs
    uint64_t temp_0 = 0;
    uint64_t temp_1 = 0;
    uint64_t temp_2 = 0;
    uint64_t temp_3 = 0;
    uint64_t temp_4 = 0;
    uint64_t temp_5 = 0;
    uint64_t temp_6 = 0;
    uint64_t temp_7 = 0;
    uint64_t temp_8 = 0;
    uint64_t temp_9 = 0;
    uint64_t temp_10 = 0;
    uint64_t temp_11 = 0;
    uint64_t temp_12 = 0;
    uint64_t temp_13 = 0;
    uint64_t temp_14 = 0;
    uint64_t temp_15 = 0;
    uint64_t temp_16 = 0;

    // Multiply-add all limbs: row i accumulates left[i] * right[0..8] into
    // temp_i .. temp_{i+8}.
    wasm_madd(left[0], right, temp_0, temp_1, temp_2, temp_3, temp_4, temp_5, temp_6, temp_7, temp_8);
    wasm_madd(left[1], right, temp_1, temp_2, temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9);
    wasm_madd(left[2], right, temp_2, temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10);
    wasm_madd(left[3], right, temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11);
    wasm_madd(left[4], right, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12);
    wasm_madd(left[5], right, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13);
    wasm_madd(left[6], right, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13, temp_14);
    wasm_madd(left[7], right, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13, temp_14, temp_15);
    wasm_madd(left[8], right, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13, temp_14, temp_15, temp_16);

    // Convert to unrelaxed 29-bit form: propagate carries so every limb holds a
    // canonical 29-bit value before repacking.
    temp_1 += temp_0 >> WASM_LIMB_BITS;
    temp_0 &= mask;
    temp_2 += temp_1 >> WASM_LIMB_BITS;
    temp_1 &= mask;
    temp_3 += temp_2 >> WASM_LIMB_BITS;
    temp_2 &= mask;
    temp_4 += temp_3 >> WASM_LIMB_BITS;
    temp_3 &= mask;
    temp_5 += temp_4 >> WASM_LIMB_BITS;
    temp_4 &= mask;
    temp_6 += temp_5 >> WASM_LIMB_BITS;
    temp_5 &= mask;
    temp_7 += temp_6 >> WASM_LIMB_BITS;
    temp_6 &= mask;
    temp_8 += temp_7 >> WASM_LIMB_BITS;
    temp_7 &= mask;
    temp_9 += temp_8 >> WASM_LIMB_BITS;
    temp_8 &= mask;
    temp_10 += temp_9 >> WASM_LIMB_BITS;
    temp_9 &= mask;
    temp_11 += temp_10 >> WASM_LIMB_BITS;
    temp_10 &= mask;
    temp_12 += temp_11 >> WASM_LIMB_BITS;
    temp_11 &= mask;
    temp_13 += temp_12 >> WASM_LIMB_BITS;
    temp_12 &= mask;
    temp_14 += temp_13 >> WASM_LIMB_BITS;
    temp_13 &= mask;
    temp_15 += temp_14 >> WASM_LIMB_BITS;
    temp_14 &= mask;
    temp_16 += temp_15 >> WASM_LIMB_BITS;
    temp_15 &= mask;

    // Repack the 17 29-bit limbs into 8 64-bit limbs (29 * 17 = 493 < 512 bits).
    return { (temp_0 << 0) | (temp_1 << 29) | (temp_2 << 58),
             (temp_2 >> 6) | (temp_3 << 23) | (temp_4 << 52),
             (temp_4 >> 12) | (temp_5 << 17) | (temp_6 << 46),
             (temp_6 >> 18) | (temp_7 << 11) | (temp_8 << 40),
             (temp_8 >> 24) | (temp_9 << 5) | (temp_10 << 34) | (temp_11 << 63),
             (temp_11 >> 1) | (temp_12 << 28) | (temp_13 << 57),
             (temp_13 >> 7) | (temp_14 << 22) | (temp_15 << 51),
             (temp_15 >> 13) | (temp_16 << 16) };
#endif
}
1155
1156// NOLINTEND(readability-implicit-bool-conversion)
1157} // namespace bb
const std::vector< MemoryValue > data
FF a
FF b
#define WASM_LIMB_BITS
Entry point for Barretenberg command-line interface.
Definition api.hpp:5
constexpr decltype(auto) get(::tuplet::tuple< T... > &&t) noexcept
Definition tuple.hpp:13
unsigned __int128 uint128_t
Definition serialize.hpp:45
General class for prime fields see Prime field documentation["field documentation"] for general imple...
static BB_INLINE constexpr std::array< uint64_t, WASM_NUM_LIMBS > wasm_convert(const uint64_t *data)
Convert 4 64-bit limbs into 9 29-bit limbs.
static BB_INLINE constexpr std::pair< uint64_t, uint64_t > mul_wide(uint64_t a, uint64_t b) noexcept
static BB_INLINE constexpr uint64_t mac_discard_lo(uint64_t a, uint64_t b, uint64_t c) noexcept
static BB_INLINE constexpr uint64_t sbb(uint64_t a, uint64_t b, uint64_t borrow_in, uint64_t &borrow_out) noexcept
unsigned 64-bit subtract-with-borrow that takes in borrow_in value in the size-2 set {0,...
BB_INLINE constexpr field subtract(const field &other) const noexcept
static BB_INLINE constexpr uint64_t mac(uint64_t a, uint64_t b, uint64_t c, uint64_t carry_in, uint64_t &carry_out) noexcept
Compute uint128_t(a * b + c + carry_in), where the inputs are all uint64_t. Return the top 64 bits.
static BB_INLINE constexpr uint64_t addc(uint64_t a, uint64_t b, uint64_t carry_in, uint64_t &carry_out) noexcept
unsigned 64-bit add-with-carry that takes in a carry_in and a carry_out bit and rewrites the latter.
static BB_INLINE constexpr void wasm_reduce(uint64_t &result_0, uint64_t &result_1, uint64_t &result_2, uint64_t &result_3, uint64_t &result_4, uint64_t &result_5, uint64_t &result_6, uint64_t &result_7, uint64_t &result_8)
Perform 29-bit Montgomery reduction on 1 limb (result_0 should be zero modulo 2^29 after calling this...
BB_INLINE constexpr field montgomery_mul_big(const field &other) const noexcept
Mongtomery multiplication for moduli > 2²⁵⁴
static BB_INLINE constexpr void wasm_madd(uint64_t &left_limb, const std::array< uint64_t, WASM_NUM_LIMBS > &right_limbs, uint64_t &result_0, uint64_t &result_1, uint64_t &result_2, uint64_t &result_3, uint64_t &result_4, uint64_t &result_5, uint64_t &result_6, uint64_t &result_7, uint64_t &result_8)
Multiply left limb by a sequence of 9 limbs and accumulate into result variables.
static BB_INLINE constexpr uint64_t square_accumulate(uint64_t a, uint64_t b, uint64_t c, uint64_t carry_in_lo, uint64_t carry_in_hi, uint64_t &carry_lo, uint64_t &carry_hi) noexcept
Computes a + 2 * b * c + carry_in_lo + 2^64 * carry_in_hi, in the form of returning a uint64_t and mo...
static BB_INLINE constexpr void wasm_reduce_yuval(uint64_t &result_0, uint64_t &result_1, uint64_t &result_2, uint64_t &result_3, uint64_t &result_4, uint64_t &result_5, uint64_t &result_6, uint64_t &result_7, uint64_t &result_8, uint64_t &result_9)
Perform 29-bit Montgomery reduction on 1 limb using Yuval's method.
BB_INLINE constexpr field add(const field &other) const noexcept
BB_INLINE constexpr field montgomery_square() const noexcept
Squaring via a variant of the Montgomery algorithm, where we roughly take advantage of the repeated t...
BB_INLINE constexpr field montgomery_mul(const field &other) const noexcept
BB_INLINE constexpr field reduce() const noexcept
reduce once, i.e., if the value is bigger than the modulus, subtract off the modulus once.
static BB_INLINE constexpr uint64_t mac_mini(uint64_t a, uint64_t b, uint64_t c, uint64_t &out) noexcept