Barretenberg
The ZK-SNARK library at the core of Aztec
Loading...
Searching...
No Matches
thread.hpp
Go to the documentation of this file.
#pragma once
#include <atomic>
#include <concepts>
#include <cstddef>
#include <functional>
#include <iostream>
#include <ranges>
#include <vector>
10
11namespace bb {
12#ifdef __wasm__
13// Fixed number of workers in WASM environment
14constexpr size_t PARALLEL_FOR_MAX_NESTING = 1;
15#else
16constexpr size_t PARALLEL_FOR_MAX_NESTING = 2;
17#endif
18
19// Useful for programatically benching different thread counts
20// Note this is threadsafe and affects parallel_for's just in that thread if so.
21void set_parallel_for_concurrency(size_t num_cores);
22size_t get_num_cpus();
23
24// For algorithms that need to be divided amongst power of 2 threads.
25inline size_t get_num_cpus_pow2()
26{
27 return static_cast<size_t>(1ULL << numeric::get_msb(get_num_cpus()));
28}
29
/**
 * @brief Split num_iterations of work across the thread pool; func receives each iteration index.
 */
void parallel_for(size_t num_iterations, const std::function<void(size_t)>& func);

/**
 * @brief Split a loop into parallel sub-ranges; func receives [start, end) index bounds.
 * Runs single-threaded when num_points <= no_multhreading_if_less_or_equal.
 */
void parallel_for_range(size_t num_points,
                        const std::function<void(size_t, size_t)>& func,
                        size_t no_multhreading_if_less_or_equal = 0);

/**
 * @brief Split a loop into parallel chunks based on the estimated cost of one iteration.
 *
 * @param func receives (start_idx, end_idx, chunk_index) for its sub-range
 * @param heuristic_cost estimated cost of one iteration in nanoseconds (see thread_heuristics)
 */
void parallel_for_heuristic(size_t num_points,
                            const std::function<void(size_t, size_t, size_t)>& func,
                            size_t heuristic_cost);
55
56template <typename Func>
58void parallel_for_heuristic(size_t num_points, const Func& func, size_t heuristic_cost)
59{
61 num_points,
62 [&](size_t start_idx, size_t end_idx, BB_UNUSED size_t chunk_index) {
63 for (size_t i = start_idx; i < end_idx; i++) {
64 func(i);
65 }
66 },
67 heuristic_cost);
68}
69
76template <typename Func, typename Accum>
79 const Accum& initial_accum,
80 const Func& func,
81 size_t heuristic_cost)
82{
83 // thread-safe accumulators
84 std::vector<Accum> accumulators(get_num_cpus(), initial_accum);
86 num_points,
87 [&](size_t start_idx, size_t end_idx, size_t chunk_index) {
88 for (size_t i = start_idx; i < end_idx; i++) {
89 func(i, accumulators[chunk_index]);
90 }
91 },
92 heuristic_cost);
93 return accumulators;
94}
95
// Default minimum amount of iterations one thread should receive before we bother multithreading.
constexpr size_t DEFAULT_MIN_ITERS_PER_THREAD = 1 << 4;

/**
 * @brief Per-thread [start, end) index bounds computed by calculate_thread_data.
 * NOTE(review): the struct header line was dropped by extraction and is reconstructed
 * from the member definitions — confirm against the project source.
 */
struct MultithreadData {
    // index bounds for each thread
    std::vector<size_t> start;
    std::vector<size_t> end;
};

/**
 * @brief Calculates number of threads and index bounds for each thread.
 * @param num_iterations total loop iterations to distribute
 * @param min_iterations_per_thread minimum work per thread before adding more threads
 */
MultithreadData calculate_thread_data(size_t num_iterations,
                                      size_t min_iterations_per_thread = DEFAULT_MIN_ITERS_PER_THREAD);

/**
 * @brief Calculates number of threads to create based on minimum iterations per thread.
 */
size_t calculate_num_threads(size_t num_iterations, size_t min_iterations_per_thread = DEFAULT_MIN_ITERS_PER_THREAD);
126
namespace thread_heuristics {
// Maximum observed parallel_for overhead in nanoseconds (rounded up from a 388us measurement)
constexpr size_t PARALLEL_FOR_COST = 400000;
// Rough cost of operations (the operation costs are derived in basics_bench; units are nanoseconds)
// Field element (16 byte) addition cost
constexpr size_t FF_ADDITION_COST = 4;
// Field element (16 byte) multiplication cost
constexpr size_t FF_MULTIPLICATION_COST = 21;
// Field element (16 byte) inversion cost
constexpr size_t FF_INVERSION_COST = 7000;
// Group element projective addition cost
constexpr size_t GE_ADDITION_COST = 350;
// Group element projective doubling cost
constexpr size_t GE_DOUBLING_COST = 194;
// Group element scalar multiplication cost
constexpr size_t SM_COST = 50000;
// Field element (16 byte) sequential copy cost
constexpr size_t FF_COPY_COST = 3;
// Fine default if something looks 'chunky enough that I don't want to calculate'
constexpr size_t ALWAYS_MULTITHREAD = 100000;
} // namespace thread_heuristics
148
152 auto range(size_t size, size_t offset = 0) const
153 {
155 return std::views::iota(size_t{ 0 }, size_t{ 0 });
156 }
157 // Calculate base chunk size and remainder
158 size_t chunk_size = size / total_threads;
159 size_t remainder = size % total_threads;
160
161 if (thread_index < remainder) {
162 // Threads with index < remainder get chunk_size + 1 elements
163 size_t start = thread_index * (chunk_size + 1);
164 size_t end = start + chunk_size + 1;
165 return std::views::iota(start + offset, end + offset);
166 }
167 // Threads with index >= remainder get chunk_size elements
168 size_t start = remainder * (chunk_size + 1) + (thread_index - remainder) * chunk_size;
169 size_t end = start + chunk_size;
170 return std::views::iota(start + offset, end + offset);
171 }
172};
173
174template <typename Func>
176void parallel_for(const Func& func)
177{
178 size_t total_threads = get_num_cpus();
179 parallel_for(total_threads, [&](size_t thread_index) {
180 func(ThreadChunk{ .thread_index = thread_index, .total_threads = total_threads });
181 });
182}
183
184// Overload that allows specifying the number of threads explicitly while still using ThreadChunk
185template <typename Func>
187void parallel_for(size_t num_threads, const Func& func)
188{
189 parallel_for(num_threads, [&](size_t thread_index) {
190 func(ThreadChunk{ .thread_index = thread_index, .total_threads = num_threads });
191 });
192}
193
194// parallel_for_heuristic variant that uses ThreadChunk for work distribution.
195// Parallelizes only when the estimated total work exceeds the parallel_for overhead.
196template <typename Func>
198void parallel_for_heuristic(size_t num_points, const Func& func, size_t heuristic_cost)
199{
200 const size_t num_cpus = get_num_cpus();
201 const size_t chunk_size = (num_points / num_cpus) + (num_points % num_cpus == 0 ? 0 : 1);
202 const size_t offset_cost = (num_points - chunk_size) * heuristic_cost;
203
204 if (offset_cost < thread_heuristics::PARALLEL_FOR_COST) {
205 func(ThreadChunk{ .thread_index = 0, .total_threads = 1 });
206 return;
207 }
208 parallel_for(num_cpus, [&](size_t thread_index) {
209 func(ThreadChunk{ .thread_index = thread_index, .total_threads = num_cpus });
210 });
211}
212
213} // namespace bb
#define BB_UNUSED
ssize_t offset
Definition engine.cpp:52
constexpr T get_msb(const T in)
Definition get_msb.hpp:49
constexpr size_t PARALLEL_FOR_COST
Definition thread.hpp:129
constexpr size_t FF_COPY_COST
Definition thread.hpp:144
constexpr size_t GE_ADDITION_COST
Definition thread.hpp:138
constexpr size_t GE_DOUBLING_COST
Definition thread.hpp:140
constexpr size_t ALWAYS_MULTITHREAD
Definition thread.hpp:146
constexpr size_t FF_ADDITION_COST
Definition thread.hpp:132
constexpr size_t FF_MULTIPLICATION_COST
Definition thread.hpp:134
constexpr size_t FF_INVERSION_COST
Definition thread.hpp:136
constexpr size_t SM_COST
Definition thread.hpp:142
Entry point for Barretenberg command-line interface.
Definition api.hpp:5
MultithreadData calculate_thread_data(size_t num_iterations, size_t min_iterations_per_thread)
Calculates number of threads and index bounds for each thread.
Definition thread.cpp:208
const size_t DEFAULT_MIN_ITERS_PER_THREAD
Definition thread.hpp:96
size_t get_num_cpus_pow2()
Definition thread.hpp:25
size_t get_num_cpus()
Definition thread.cpp:33
constexpr size_t PARALLEL_FOR_MAX_NESTING
Definition thread.hpp:16
size_t calculate_num_threads(size_t num_iterations, size_t min_iterations_per_thread)
calculates number of threads to create based on minimum iterations per thread
Definition thread.cpp:233
void parallel_for_heuristic(size_t num_points, const std::function< void(size_t, size_t, size_t)> &func, size_t heuristic_cost)
Split a loop into several loops running in parallel based on operations in 1 iteration.
Definition thread.cpp:171
void set_parallel_for_concurrency(size_t num_cores)
Definition thread.cpp:23
void parallel_for(size_t num_iterations, const std::function< void(size_t)> &func)
Definition thread.cpp:111
void parallel_for_range(size_t num_points, const std::function< void(size_t, size_t)> &func, size_t no_multhreading_if_less_or_equal)
Split a loop into several loops running in parallel.
Definition thread.cpp:141
constexpr decltype(auto) get(::tuplet::tuple< T... > &&t) noexcept
Definition tuple.hpp:13
std::vector< size_t > end
Definition thread.hpp:102
std::vector< size_t > start
Definition thread.hpp:101
size_t total_threads
Definition thread.hpp:151
size_t thread_index
Definition thread.hpp:150
auto range(size_t size, size_t offset=0) const
Definition thread.hpp:152