Barretenberg
The ZK-SNARK library at the core of Aztec
Loading...
Searching...
No Matches
polynomial_arithmetic.cpp
Go to the documentation of this file.
1// === AUDIT STATUS ===
2// internal: { status: Complete, auditors: [Nishat], commit: 94f596f8b3bbbc216f9ad7dc33253256141156b2 }
3// external_1: { status: not started, auditors: [], commit: }
4// external_2: { status: not started, auditors: [], commit: }
5// =====================
6
12#include <math.h>
13#include <memory.h>
14#include <memory>
15
17
18namespace {
19
20template <typename Fr> std::shared_ptr<Fr[]> get_scratch_space(const size_t num_elements)
21{
22 static std::shared_ptr<Fr[]> working_memory = nullptr;
23 static size_t current_size = 0;
24 if (num_elements > current_size) {
25 working_memory = std::make_shared<Fr[]>(num_elements);
26 current_size = num_elements;
27 }
28 return working_memory;
29}
30
31} // namespace
32
33inline uint32_t reverse_bits(uint32_t x, uint32_t bit_length)
34{
35 x = (((x & 0xaaaaaaaa) >> 1) | ((x & 0x55555555) << 1));
36 x = (((x & 0xcccccccc) >> 2) | ((x & 0x33333333) << 2));
37 x = (((x & 0xf0f0f0f0) >> 4) | ((x & 0x0f0f0f0f) << 4));
38 x = (((x & 0xff00ff00) >> 8) | ((x & 0x00ff00ff) << 8));
39 return (((x >> 16) | (x << 16))) >> (32 - bit_length);
40}
41
42inline bool is_power_of_two(uint64_t x)
43{
44 return x && !(x & (x - 1));
45}
46
47template <typename Fr>
49 Fr* target,
50 const EvaluationDomain<Fr>& domain,
51 const Fr& generator_start,
52 const Fr& generator_shift,
53 const size_t generator_size)
54{
55 parallel_for(domain.num_threads, [&](size_t j) {
56 Fr thread_shift = generator_shift.pow(static_cast<uint64_t>(j * (generator_size / domain.num_threads)));
57 Fr work_generator = generator_start * thread_shift;
58 const size_t offset = j * (generator_size / domain.num_threads);
59 const size_t end = offset + (generator_size / domain.num_threads);
60 for (size_t i = offset; i < end; ++i) {
61 target[i] = coeffs[i] * work_generator;
62 work_generator *= generator_shift;
63 }
64 });
65}
66
67template <typename Fr>
68 requires SupportsFFT<Fr>
70 Fr* coeffs, Fr* target, const EvaluationDomain<Fr>& domain, const Fr&, const std::vector<Fr*>& root_table)
71{
72 parallel_for(domain.num_threads, [&](size_t j) {
73 Fr temp_1;
74 Fr temp_2;
75 for (size_t i = (j * domain.thread_size); i < ((j + 1) * domain.thread_size); i += 2) {
76 uint32_t next_index_1 = (uint32_t)reverse_bits((uint32_t)i + 2, (uint32_t)domain.log2_size);
77 uint32_t next_index_2 = (uint32_t)reverse_bits((uint32_t)i + 3, (uint32_t)domain.log2_size);
78 __builtin_prefetch(&coeffs[next_index_1]);
79 __builtin_prefetch(&coeffs[next_index_2]);
80
81 uint32_t swap_index_1 = (uint32_t)reverse_bits((uint32_t)i, (uint32_t)domain.log2_size);
82 uint32_t swap_index_2 = (uint32_t)reverse_bits((uint32_t)i + 1, (uint32_t)domain.log2_size);
83
84 Fr::__copy(coeffs[swap_index_1], temp_1);
85 Fr::__copy(coeffs[swap_index_2], temp_2);
86 target[i + 1] = temp_1 - temp_2;
87 target[i] = temp_1 + temp_2;
88 }
89 });
90
91 // outer FFT loop
92 for (size_t m = 2; m < (domain.size); m <<= 1) {
93 parallel_for(domain.num_threads, [&](size_t j) {
94 Fr temp;
95
96 // Ok! So, what's going on here? This is the inner loop of the FFT algorithm, and we want to break it
97 // out into multiple independent threads. For `num_threads`, each thread will evaluation `domain.size /
98 // num_threads` of the polynomial. The actual iteration length will be half of this, because we leverage
99 // the fact that \omega^{n/2} = -\omega (where \omega is a root of unity)
100
101 // Here, `start` and `end` are used as our iterator limits, so that we can use our iterator `i` to
102 // directly access the roots of unity lookup table
103 const size_t start = j * (domain.thread_size >> 1);
104 const size_t end = (j + 1) * (domain.thread_size >> 1);
105
106 // For all but the last round of our FFT, the roots of unity that we need, will be a subset of our
107 // lookup table. e.g. for a size 2^n FFT, the 2^n'th roots create a multiplicative subgroup of order 2^n
108 // the 1st round will use the roots from the multiplicative subgroup of order 2 : the 2'th roots of
109 // unity the 2nd round will use the roots from the multiplicative subgroup of order 4 : the 4'th
110 // roots of unity
111 // i.e. each successive FFT round will double the set of roots that we need to index.
112 // We have already laid out the `root_table` container so that each FFT round's roots are linearly
113 // ordered in memory. For all FFT rounds, the number of elements we're iterating over is greater than
114 // the size of our lookup table. We need to access this table in a cyclical fasion - i.e. for a subgroup
115 // of size x, the first x iterations will index the subgroup elements in order, then for the next x
116 // iterations, we loop back to the start.
117
118 // We could implement the algorithm by having 2 nested loops (where the inner loop iterates over the
119 // root table), but we want to flatten this out - as for the first few rounds, the inner loop will be
120 // tiny and we'll have quite a bit of unneccesary branch checks For each iteration of our flattened
121 // loop, indexed by `i`, the element of the root table we need to access will be `i % (current round
122 // subgroup size)` Given that each round subgroup size is `m`, which is a power of 2, we can index the
123 // root table with a very cheap `i & (m - 1)` Which is why we have this odd `block_mask` variable
124 const size_t block_mask = m - 1;
125
126 // The next problem to tackle, is we now need to efficiently index the polynomial element in
127 // `scratch_space` in our flattened loop If we used nested loops, the outer loop (e.g. `y`) iterates
128 // from 0 to 'domain size', in steps of 2 * m, with the inner loop (e.g. `z`) iterating from 0 to m. We
129 // have our inner loop indexer with `i & (m - 1)`. We need to add to this our outer loop indexer, which
130 // is equivalent to taking our indexer `i`, masking out the bits used in the 'inner loop', and doubling
131 // the result. i.e. polynomial indexer = (i & (m - 1)) + ((i & ~(m - 1)) >> 1) To simplify this, we
132 // cache index_mask = ~block_mask, meaning that our indexer is just `((i & index_mask) << 1 + (i &
133 // block_mask)`
134 const size_t index_mask = ~block_mask;
135
136 // `round_roots` fetches the pointer to this round's lookup table. We use `numeric::get_msb(m) - 1` as
137 // our indexer, because we don't store the precomputed root values for the 1st round (because they're
138 // all 1).
139 const Fr* round_roots = root_table[static_cast<size_t>(numeric::get_msb(m)) - 1];
140
141 // Finally, we want to treat the final round differently from the others,
142 // so that we can reduce out of our 'coarse' reduction and store the output in `coeffs` instead of
143 // `scratch_space`
144 for (size_t i = start; i < end; ++i) {
145 size_t k1 = (i & index_mask) << 1;
146 size_t j1 = i & block_mask;
147 temp = round_roots[j1] * target[k1 + j1 + m];
148 target[k1 + j1 + m] = target[k1 + j1] - temp;
149 target[k1 + j1] += temp;
150 }
151 });
152 }
153}
154
155template <typename Fr>
156 requires SupportsFFT<Fr>
157void ifft(Fr* coeffs, Fr* target, const EvaluationDomain<Fr>& domain)
158{
159 fft_inner_parallel(coeffs, target, domain, domain.root_inverse, domain.get_inverse_round_roots());
160
161 parallel_for(domain.num_threads, [&](size_t j) {
162 const size_t start = j * domain.thread_size;
163 const size_t end = (j + 1) * domain.thread_size;
164 for (size_t i = start; i < end; ++i) {
165 target[i] *= domain.domain_inverse;
166 }
167 });
168}
169
170template <typename Fr> Fr evaluate(const Fr* coeffs, const Fr& z, const size_t n)
171{
172 const size_t num_threads = get_num_cpus();
173 std::vector<Fr> evaluations(num_threads, Fr::zero());
174 parallel_for([&](const ThreadChunk& chunk) {
175 // parallel_for with ThreadChunk uses get_num_cpus() threads
176 BB_ASSERT_EQ(chunk.total_threads, evaluations.size());
177 auto range = chunk.range(n);
178 if (range.empty()) {
179 return;
180 }
181 size_t start = *range.begin();
182 Fr z_acc = z.pow(static_cast<uint64_t>(start));
183 for (size_t i : range) {
184 Fr work_var = z_acc * coeffs[i];
185 evaluations[chunk.thread_index] += work_var;
186 z_acc *= z;
187 }
188 });
189
190 Fr r = Fr::zero();
191 for (const auto& eval : evaluations) {
192 r += eval;
193 }
194 return r;
195}
196
197template <typename Fr> Fr evaluate(const std::vector<Fr*> coeffs, const Fr& z, const size_t large_n)
198{
199 const size_t num_polys = coeffs.size();
200 const size_t poly_size = large_n / num_polys;
201 BB_ASSERT(is_power_of_two(poly_size));
202 const size_t log2_poly_size = (size_t)numeric::get_msb(poly_size);
203 const size_t num_threads = get_num_cpus();
204 std::vector<Fr> evaluations(num_threads, Fr::zero());
205 parallel_for([&](const ThreadChunk& chunk) {
206 // parallel_for with ThreadChunk uses get_num_cpus() threads
207 BB_ASSERT_EQ(chunk.total_threads, evaluations.size());
208 auto range = chunk.range(large_n);
209 if (range.empty()) {
210 return;
211 }
212 size_t start = *range.begin();
213 Fr z_acc = z.pow(static_cast<uint64_t>(start));
214 for (size_t i : range) {
215 Fr work_var = z_acc * coeffs[i >> log2_poly_size][i & (poly_size - 1)];
216 evaluations[chunk.thread_index] += work_var;
217 z_acc *= z;
218 }
219 });
220
221 Fr r = Fr::zero();
222 for (const auto& eval : evaluations) {
223 r += eval;
224 }
225 return r;
226}
227
228// This function computes sum of all scalars in a given array.
229template <typename Fr> Fr compute_sum(const Fr* src, const size_t n)
230{
231 Fr result = 0;
232 for (size_t i = 0; i < n; ++i) {
233 result += src[i];
234 }
235 return result;
236}
237
238// This function computes the polynomial (x - a)(x - b)(x - c)... given n distinct roots (a, b, c, ...).
239template <typename Fr> void compute_linear_polynomial_product(const Fr* roots, Fr* dest, const size_t n)
240{
241
242 auto scratch_space_ptr = get_scratch_space<Fr>(n);
243 auto scratch_space = scratch_space_ptr.get();
244 memcpy((void*)scratch_space, (void*)roots, n * sizeof(Fr));
245
246 dest[n] = 1;
247 dest[n - 1] = -compute_sum(scratch_space, n);
248
249 Fr temp;
250 Fr constant = 1;
251 for (size_t i = 0; i < n - 1; ++i) {
252 temp = 0;
253 for (size_t j = 0; j < n - 1 - i; ++j) {
254 scratch_space[j] = roots[j] * compute_sum(&scratch_space[j + 1], n - 1 - i - j);
255 temp += scratch_space[j];
256 }
257 dest[n - 2 - i] = temp * constant;
258 constant *= Fr::neg_one();
259 }
260}
261
262template <typename Fr> Fr compute_linear_polynomial_product_evaluation(const Fr* roots, const Fr z, const size_t n)
263{
264 Fr result = 1;
265 for (size_t i = 0; i < n; ++i) {
266 result *= (z - roots[i]);
267 }
268 return result;
269}
270
271template <typename Fr>
272void compute_efficient_interpolation(const Fr* src, Fr* dest, const Fr* evaluation_points, const size_t n)
273{
274 /*
275 We use Lagrange technique to compute polynomial interpolation.
276 Given: (x_i, y_i) for i ∈ {0, 1, ..., n} =: [n]
277 Compute function f(X) such that f(x_i) = y_i for all i ∈ [n].
278 (X - x1)(X - x2)...(X - xn) (X - x0)(X - x2)...(X - xn)
279 F(X) = y0-------------------------------- + y1---------------------------------- + ...
280 (x0 - x_1)(x0 - x_2)...(x0 - xn) (x1 - x_0)(x1 - x_2)...(x1 - xn)
281 We write this as:
282 [ yi ]
283 F(X) = N(X) * |∑_i --------------- |
284 [ (X - xi) * di ]
285 where:
286 N(X) = ∏_{i \in [n]} (X - xi),
287 di = ∏_{j != i} (xi - xj)
288 For division of N(X) by (X - xi), we use the same trick that was used in compute_opening_polynomial()
289 function in the Kate commitment scheme.
290 We denote
291 q_{x_i} = N(X)/(X-x_i) * y_i * (d_i)^{-1} = q_{x_i,0}*1 + ... + q_{x_i,n-1} * X^{n-1} for i=0,..., n-1.
292
293 The computation of F(X) is split into two cases:
294
295 - if 0 is not in the interpolation domain, then the numerator polynomial N(X) has a non-zero constant term
296 that is used to initialize the division algorithm mentioned above; the monomial coefficients q_{x_i, j} of
297 q_{x_i} are accumulated into dest[j] for j=0,..., n-1
298
299 - if 0 is in the domain at index i_0, the constant term of N(X) is 0 and the division algorithm computing
300 q_{x_i} for i != i_0 is initialized with the constant term of N(X)/X. Note that its coefficients are given
301 by numerator_polynomial[j] for j=1,...,n. The monomial coefficients of q_{x_i} are then accumuluated in
302 dest[j] for j=1,..., n-1. Whereas the coefficients of
303 q_{0} = N(X)/X * f(0) * (d_{i_0})^{-1}
304 are added to dest[j] for j=0,..., n-1. Note that these coefficients do not require performing the division
305 algorithm used in Kate commitment scheme, as the coefficients of N(X)/X are given by numerator_polynomial[j]
306 for j=1,...,n.
307 */
308 std::vector<Fr> numerator_polynomial(n + 1);
309 polynomial_arithmetic::compute_linear_polynomial_product(evaluation_points, numerator_polynomial.data(), n);
310 // First half contains roots, second half contains denominators (to be inverted)
311 std::vector<Fr> roots_and_denominators(2 * n);
312 std::vector<Fr> temp_src(n);
313 for (size_t i = 0; i < n; ++i) {
314 roots_and_denominators[i] = -evaluation_points[i];
315 temp_src[i] = src[i];
316 dest[i] = 0;
317 // compute constant denominators
318 roots_and_denominators[n + i] = 1;
319 for (size_t j = 0; j < n; ++j) {
320 if (j == i) {
321 continue;
322 }
323 roots_and_denominators[n + i] *= (evaluation_points[i] - evaluation_points[j]);
324 }
325 }
326 // at this point roots_and_denominators is populated as follows
327 // (x_0,\ldots, x_{n-1}, d_0, \ldots, d_{n-1})
328 Fr::batch_invert(roots_and_denominators.data(), 2 * n);
329
330 Fr z, multiplier;
331 std::vector<Fr> temp_dest(n);
332 size_t idx_zero = 0;
333 bool interpolation_domain_contains_zero = false;
334 // if the constant term of the numerator polynomial N(X) is 0, then the interpolation domain contains 0
335 // we find the index i_0, such that x_{i_0} = 0
336 if (numerator_polynomial[0] == Fr(0)) {
337 for (size_t i = 0; i < n; ++i) {
338 if (evaluation_points[i] == Fr(0)) {
339 idx_zero = i;
340 interpolation_domain_contains_zero = true;
341 break;
342 }
343 }
344 };
345
346 if (!interpolation_domain_contains_zero) {
347 for (size_t i = 0; i < n; ++i) {
348 // set z = - 1/x_i for x_i <> 0
349 z = roots_and_denominators[i];
350 // temp_src[i] is y_i, it gets multiplied by 1/d_i
351 multiplier = temp_src[i] * roots_and_denominators[n + i];
352 temp_dest[0] = multiplier * numerator_polynomial[0];
353 temp_dest[0] *= z;
354 dest[0] += temp_dest[0];
355 for (size_t j = 1; j < n; ++j) {
356 temp_dest[j] = multiplier * numerator_polynomial[j] - temp_dest[j - 1];
357 temp_dest[j] *= z;
358 dest[j] += temp_dest[j];
359 }
360 }
361 } else {
362 for (size_t i = 0; i < n; ++i) {
363 if (i == idx_zero) {
364 // the contribution from the term corresponding to i_0 is computed separately
365 continue;
366 }
367 // get the next inverted root
368 z = roots_and_denominators[i];
369 // compute f(x_i) * d_{x_i}^{-1}
370 multiplier = temp_src[i] * roots_and_denominators[n + i];
371 // get x_i^{-1} * f(x_i) * d_{x_i}^{-1} into the "free" term
372 temp_dest[1] = multiplier * numerator_polynomial[1];
373 temp_dest[1] *= z;
374 // correct the first coefficient as it is now accumulating free terms from
375 // f(x_i) d_i^{-1} prod_(X-x_i, x_i != 0) (X-x_i) * 1/(X-x_i)
376 dest[1] += temp_dest[1];
377 // compute the quotient N(X)/(X-x_i) f(x_i)/d_{x_i} and its contribution to the target coefficients
378 for (size_t j = 2; j < n; ++j) {
379 temp_dest[j] = multiplier * numerator_polynomial[j] - temp_dest[j - 1];
380 temp_dest[j] *= z;
381 dest[j] += temp_dest[j];
382 };
383 }
384 // correct the target coefficients by the contribution from q_{0} = N(X)/X * d_{i_0}^{-1} * f(0)
385 for (size_t i = 0; i < n; ++i) {
386 dest[i] += temp_src[idx_zero] * roots_and_denominators[n + idx_zero] * numerator_polynomial[i + 1];
387 }
388 }
389}
390
391template fr evaluate<fr>(const fr*, const fr&, const size_t);
392template fr evaluate<fr>(const std::vector<fr*>, const fr&, const size_t);
393template void fft_inner_parallel<fr>(fr*, fr*, const EvaluationDomain<fr>&, const fr&, const std::vector<fr*>&);
394template void ifft<fr>(fr*, fr*, const EvaluationDomain<fr>&);
395template fr compute_sum<fr>(const fr*, const size_t);
396template void compute_linear_polynomial_product<fr>(const fr*, fr*, const size_t);
397template void compute_efficient_interpolation<fr>(const fr*, fr*, const fr*, const size_t);
398
399template grumpkin::fr evaluate<grumpkin::fr>(const grumpkin::fr*, const grumpkin::fr&, const size_t);
400template grumpkin::fr evaluate<grumpkin::fr>(const std::vector<grumpkin::fr*>, const grumpkin::fr&, const size_t);
401template grumpkin::fr compute_sum<grumpkin::fr>(const grumpkin::fr*, const size_t);
402template void compute_linear_polynomial_product<grumpkin::fr>(const grumpkin::fr*, grumpkin::fr*, const size_t);
403template void compute_efficient_interpolation<grumpkin::fr>(const grumpkin::fr*,
405 const grumpkin::fr*,
406 const size_t);
407
408} // namespace bb::polynomial_arithmetic
#define BB_ASSERT(expression,...)
Definition assert.hpp:70
#define BB_ASSERT_EQ(actual, expected,...)
Definition assert.hpp:83
const std::vector< FF * > & get_inverse_round_roots() const
constexpr bool is_power_of_two(uint64_t x)
Definition pow.hpp:35
Fr compute_linear_polynomial_product_evaluation(const Fr *roots, const Fr z, const size_t n)
uint32_t reverse_bits(uint32_t x, uint32_t bit_length)
void ifft(Fr *coeffs, Fr *target, const EvaluationDomain< Fr > &domain)
void compute_linear_polynomial_product(const Fr *roots, Fr *dest, const size_t n)
Fr evaluate(const Fr *coeffs, const Fr &z, const size_t n)
void fft_inner_parallel(Fr *coeffs, Fr *target, const EvaluationDomain< Fr > &domain, const Fr &, const std::vector< Fr * > &root_table)
void compute_efficient_interpolation(const Fr *src, Fr *dest, const Fr *evaluation_points, const size_t n)
void scale_by_generator(Fr *coeffs, Fr *target, const EvaluationDomain< Fr > &domain, const Fr &generator_start, const Fr &generator_shift, const size_t generator_size)
Fr compute_sum(const Fr *src, const size_t n)
size_t get_num_cpus()
Definition thread.cpp:33
void parallel_for(size_t num_iterations, const std::function< void(size_t)> &func)
Definition thread.cpp:111
constexpr decltype(auto) get(::tuplet::tuple< T... > &&t) noexcept
Definition tuple.hpp:13
Curve::ScalarField Fr
size_t total_threads
Definition thread.hpp:151
size_t thread_index
Definition thread.hpp:150
auto range(size_t size, size_t offset=0) const
Definition thread.hpp:152
static constexpr field neg_one()
BB_INLINE constexpr field pow(const uint256_t &exponent) const noexcept
static void batch_invert(C &coeffs) noexcept
Batch invert a collection of field elements using Montgomery's trick.
static constexpr field zero()