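The inner function takes a reference to a `std::atomic_size_t` that synchronizes the write positions across all tasks. Because the algorithm is recursive, the inner function cannot own this counter itself (each recursive call would otherwise create its own separate counter); instead, an outer function must create the counter and invoke the algorithm: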
template <typename SrcIt, typename DstIt, typename Pred>auto _inner_par_copy_if_sync( SrcIt first, SrcIt last, DstIt dst, std::atomic_size_t& dst_idx, Pred pred, size_t chunk_sz) -> void { auto n = std::distance(first, last); if (n <= chunk_sz) { std::for_each(first, last, [&](const auto& v) { if (pred(v)) { auto write_idx = dst_idx.fetch_add(1); *std::next(dst, write_idx) = v; } }); return; } auto middle = std::next(first, n / 2); auto future = std::async( [first, middle, dst, chunk_sz, &pred, &dst_idx] { return