Since this about polynomials in one complex variable, we can compute the eigenvalues of the companion matrix instead. Since Eigenvalues is a bit slow on small matrices, I decided to use a LibraryFunction. For ease of use, I employ the Eigen package for the eigensolve.
coeffs = Compile[{{t, _Real, 1}},
With[{
T1 = Exp[I Compile`GetElement[t, 1]],
T2 = Exp[I Compile`GetElement[t, 2]]
},
{I (-1. + T1 (1. + T1 (-1. + T1 (-1. + T2)))), -I 1., I 1.,
1. I, -1. I, 1. I (-1. + T2 (-1. + T1 (1. + (1. - T2) T2))),
1. I, 1. I, 1.}
],
CompilationTarget -> "C",
RuntimeAttributes -> {Listable}
];
Needs["CCompilerDriver`"];
Quiet[LibraryFunctionUnload[cSolvePolynomial[8]]];
ClearAll[cSolvePolynomial];
cSolvePolynomial[d_] :=
cSolvePolynomial[d] = Module[{lib, code, name}, name = "cf";
code = StringJoin["
#include "WolframLibrary.h"
#include <thread>
#include <future>
#include <Eigen/Eigenvalues>
static constexpr mint d = " <> IntegerString[d] <> ";
using Complex = std::complex<mreal>;
using Matrix_T = Eigen::Matrix<Complex,d,d>;
using Vector_T = Eigen::Matrix<Complex,d,1>;
// Computes k-th job pointer for job_count equally sized jobs distributed on thread_count threads.
template<typename Int, typename Int1, typename Int2>
inline Int JobPointer( const Int job_count, const Int1 thread_count_, const Int2 thread_ )
{
const Int thread_count = static_cast<Int>(thread_count_);
const Int thread = static_cast<Int>(thread_);
const Int chunk_size = job_count / thread_count;
const Int remainder = job_count % thread_count;
return chunk_size * thread + (remainder * thread) / thread_count;
}
// Executes the function `fun` of the form `[]( const Int thread ) -> void {...}` parallelized over `thread_count` threads.
template<typename F, typename Int>
inline void ParallelDo( F && fun, const Int thread_count )
{
if( thread_count <= static_cast<Int>(1) )
{
std::invoke( fun, static_cast<Int>(0) );
}
else
{
std::vector<std::future<void>> futures (thread_count);
for( Int thread = 0; thread < thread_count; ++thread )
{
futures[thread] = std::async( std::forward<F>(fun), thread );
}
for( auto & future : futures )
{
future.get();
}
}
}
EXTERN_C DLLEXPORT int " <> name <>
"(WolframLibraryData libData, mint Argc, MArgument *Args, MArgument Res)
{
MTensor coeffs_ = MArgument_getMTensor(Args[0]);
mint thread_count = MArgument_getInteger(Args[1]);
const mint n = libData->MTensor_getDimensions(coeffs_)[0];
const Complex * const coeffs = reinterpret_cast<Complex*>(libData->MTensor_getComplexData(coeffs_));
MTensor roots_;
// Instead of a complex array of size n x d we allocate a real array of size (n * d) x 2.
// This way, we can get rid of ReIm and Flatten.
const mint dims [2] { n * d, 2 };
(void)libData->MTensor_new( MType_Real, 2, &dims[0], &roots_ );
Complex * roots = reinterpret_cast<Complex*>( libData->MTensor_getRealData(roots_));
// This loops over all coefficient vectors and computes the eigenvalues of the companion matrix.
ParallelDo(
[roots,coeffs,n,thread_count]( const mint thread )
{
Eigen::ComplexEigenSolver<Matrix_T> S;
Matrix_T C;
C.setZero();
for( int i = 0; i < d-1; ++i )
{
C(i+1,i) = Complex { 1., 0. };
}
const mint k_begin = JobPointer( n, thread_count, thread );
const mint k_end = JobPointer( n, thread_count, thread + 1 );
for( mint k = k_begin; k < k_end; ++k )
{
const Complex * const c = &coeffs[(d+1) * k];
Complex * const r = &roots [ d * k];
const Complex factor = - 1. / c[d];
for( int i = 0; i < d; ++i )
{
C(i,d-1) = c[i] * factor;
}
S.compute(C);
Vector_T eigs = S.eigenvalues().col(0);
for( int i = 0; i < d; ++i )
{
r[i] = eigs(i);
}
}
},
thread_count
);
MArgument_setMTensor(Res, roots_);
return LIBRARY_NO_ERROR;
}"];
lib =
CreateLibrary[code, name, "Language" -> "C++",
"CompileOptions" -> {"-O3", "-std=c++20", "-pthread"},
"IncludeDirectories" -> {"/opt/homebrew/include/eigen3"},
"ShellOutputFunction" -> (If[# =!= "", Print[#]] &)];
LibraryFunctionLoad[lib,
name, {{Complex, 2, "Constant"}, Integer}, {Real, 2}]
];
Here an usage example :
degree = 8;
threadCount = 8;
cf = cSolvePolynomial[degree];
data2 = cf[coeffs[tlist], threadCount]; // AbsoluteTiming // First
0.238111
On my machine, OP's code takes 52.2739 seconds instead. So this is a 220-fold speedup.
Correctness can be checked with
Max[Abs[(Sort /@ Partition[data, 8]) - (Sort /@ Partition[data2, 8])]]
1.05471*10^-14
This allows me to run this with ten times the number of points and to get this pictures:

And here with 8000000 random points:

n=10000on my PC – anderstood Nov 27 '23 at 19:13x, so you can useSolverather thanNSolve. Doing so knocks about 15% off of the computer time on my machine; not a huge speed-up but not completely insignificant either. – Michael Seifert Nov 27 '23 at 19:15nsols = ParallelTable[{ToRules[NRoots[f @@ t == 0, x]]}, {t, tlist}];. I get a bit under 3 seconds on my machine. – Daniel Lichtblau Nov 28 '23 at 00:47