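Implementation-wise, it's quite a small piece of code. The incoming range is recursively split into two halves until a piece is no larger than the chunk size, at which point it is transformed directly: the first half is handed off to a new asynchronous task, and the second half is processed recursively on the current task: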
template <typename SrcIt, typename DstIt, typename Func> auto par_transform(SrcIt first,SrcIt last,DstIt dst,Func f,size_t chunk_sz) { const auto n = static_cast<size_t>(std::distance(first, last)); if (n <= chunk_sz) { std::transform(first, last, dst, f); return; } const auto src_middle = std::next(first, n/2); // Branch of first part to another task auto future = std::async([=, &func]{ par_transform(first, src_middle, dst, f, chunk_sz); }); // Recursively handle the second part const auto dst_middle = std::next(dst, ...