8

Can you think of a way of speeding up this computation ? I don't.

n = 100000;
f[t1_, t2_] = 
  x^5*(x*(x*(x + I) + I) + I*(Exp[I*t2]*(Exp[I*t1]*(Exp[I*t2]*(-Exp[I*t2] + 1) + 1) - 1) - 
         1)) - I*x*(x*(x*(x - 1) - 1) + 1) + I*(Exp[I*t1]*(Exp[I*t1]*(Exp[I*t1]*(Exp[I*t2] - 1) - 1) + 1) - 1);
tlist = RandomReal[{0, 2 Pi}, {n, 2}];

data = Flatten[ParallelTable[ReIm[x /. NSolve[f @@ t == 0, {x}]], {t, tlist}], 1];

Why? To produce nice images!

ListPlot[data, PlotStyle -> {Opacity[0.025], PointSize[0.001], Black},
  AspectRatio -> Automatic, Axes -> False]

enter image description here

Reference: https://profconradi.github.io/category/matematica.html

user64494
  • 26,149
  • 4
  • 27
  • 56
anderstood
  • 14,301
  • 2
  • 29
  • 80

3 Answers3

6

Since this about polynomials in one complex variable, we can compute the eigenvalues of the companion matrix instead. Since Eigenvalues is a bit slow on small matrices, I decided to use a LibraryFunction. For ease of use, I employ the Eigen package for the eigensolve.

coeffs = Compile[{{t, _Real, 1}},
   With[{
     T1 = Exp[I Compile`GetElement[t, 1]],
     T2 = Exp[I Compile`GetElement[t, 2]]
     },
    {I (-1. + T1 (1. + T1 (-1. + T1 (-1. + T2)))), -I 1., I 1., 
     1. I, -1. I, 1. I (-1. + T2 (-1. + T1 (1. + (1. - T2) T2))), 
     1. I, 1. I, 1.}
    ],
   CompilationTarget -> "C",
   RuntimeAttributes -> {Listable}
   ];

Needs["CCompilerDriver`"]; Quiet[LibraryFunctionUnload[cSolvePolynomial[8]]]; ClearAll[cSolvePolynomial]; cSolvePolynomial[d_] := cSolvePolynomial[d] = Module[{lib, code, name}, name = "cf"; code = StringJoin["

#include "WolframLibrary.h"

#include <thread> #include <future> #include <Eigen/Eigenvalues>

static constexpr mint d = " <> IntegerString[d] <> ";

using Complex = std::complex<mreal>;

using Matrix_T = Eigen::Matrix<Complex,d,d>; using Vector_T = Eigen::Matrix<Complex,d,1>;

// Computes k-th job pointer for job_count equally sized jobs distributed on thread_count threads.
template&lt;typename Int, typename Int1, typename Int2&gt;
inline Int JobPointer( const Int job_count, const Int1 thread_count_, const Int2 thread_ )
{
    const Int thread_count = static_cast&lt;Int&gt;(thread_count_);
    const Int thread       = static_cast&lt;Int&gt;(thread_);
    const Int chunk_size   = job_count / thread_count;
    const Int remainder    = job_count % thread_count;

    return chunk_size * thread + (remainder * thread) / thread_count;
}


// Executes the function `fun` of the form `[]( const Int thread ) -&gt; void {...}` parallelized over `thread_count` threads.
template&lt;typename F, typename Int&gt;
inline void ParallelDo( F &amp;&amp; fun, const Int thread_count )
{
    if( thread_count &lt;= static_cast&lt;Int&gt;(1) )
    {
        std::invoke( fun, static_cast&lt;Int&gt;(0) );
    }
    else
    {
        std::vector&lt;std::future&lt;void&gt;&gt; futures (thread_count);

        for( Int thread = 0; thread &lt; thread_count; ++thread )
        {
            futures[thread] = std::async( std::forward&lt;F&gt;(fun), thread );
        }

        for( auto &amp; future : futures )
        {
            future.get();
        }
    }
}

EXTERN_C DLLEXPORT int " <> name <> "(WolframLibraryData libData, mint Argc, MArgument *Args, MArgument Res) { MTensor coeffs_ = MArgument_getMTensor(Args[0]); mint thread_count = MArgument_getInteger(Args[1]);

const mint n = libData-&gt;MTensor_getDimensions(coeffs_)[0];

const Complex * const coeffs = reinterpret_cast&lt;Complex*&gt;(libData-&gt;MTensor_getComplexData(coeffs_));

MTensor roots_;

// Instead of a complex array of size n x d we allocate a real array of size (n * d) x 2.
// This way, we can get rid of ReIm and Flatten.

const mint dims [2] { n * d, 2 };

(void)libData-&gt;MTensor_new( MType_Real, 2, &amp;dims[0], &amp;roots_ );

Complex * roots = reinterpret_cast&lt;Complex*&gt;( libData-&gt;MTensor_getRealData(roots_));

// This loops over all coefficient vectors and computes the eigenvalues of the companion matrix.

ParallelDo(
    [roots,coeffs,n,thread_count]( const mint thread )
    {
        Eigen::ComplexEigenSolver&lt;Matrix_T&gt; S;

        Matrix_T C;

        C.setZero();

        for( int i = 0; i &lt; d-1; ++i )
        {
            C(i+1,i) = Complex { 1., 0. };
        }

        const mint k_begin = JobPointer( n, thread_count, thread     );
        const mint k_end   = JobPointer( n, thread_count, thread + 1 );

        for( mint k = k_begin; k &lt; k_end; ++k )
        {
            const Complex * const c = &amp;coeffs[(d+1) * k];
                  Complex * const r = &amp;roots [ d    * k];

            const Complex factor = - 1. / c[d];

            for( int i = 0; i &lt; d; ++i )
            {
                C(i,d-1) = c[i] * factor;
            }

            S.compute(C);

            Vector_T eigs = S.eigenvalues().col(0);

            for( int i = 0; i &lt; d; ++i )
            {
                r[i] = eigs(i);
            }
        }
    },
    thread_count
);


MArgument_setMTensor(Res, roots_);

return LIBRARY_NO_ERROR;

}"]; lib = CreateLibrary[code, name, "Language" -> "C++", "CompileOptions" -> {"-O3", "-std=c++20", "-pthread"}, "IncludeDirectories" -> {"/opt/homebrew/include/eigen3"}, "ShellOutputFunction" -> (If[# =!= "", Print[#]] &)]; LibraryFunctionLoad[lib, name, {{Complex, 2, "Constant"}, Integer}, {Real, 2}] ];

Here an usage example :

degree = 8;
threadCount = 8;
cf = cSolvePolynomial[degree];
data2 = cf[coeffs[tlist], threadCount]; // AbsoluteTiming // First

0.238111

On my machine, OP's code takes 52.2739 seconds instead. So this is a 220-fold speedup.

Correctness can be checked with

Max[Abs[(Sort /@ Partition[data, 8]) - (Sort /@ Partition[data2, 8])]]

1.05471*10^-14

This allows me to run this with ten times the number of points and to get this pictures:

enter image description here

And here with 8000000 random points:

enter image description here

Henrik Schumacher
  • 106,770
  • 7
  • 179
  • 309
5
data1 = Flatten[Table[ReIm[x /. NSolve[f @@ t == 0, {x}]], {t, tlist}], 1]; // AbsoluteTiming // First
(*    51.9531    *)

Optimizing @DanielLichtblau's comment a bit, and using $s_k=e^{i t_k}$:

g[s1_, s2_] = Series[f[t1, t2] /. {E^(I t1) -> s1, E^(I t2) -> s2},
                {x, 0, 8}] // FullSimplify // Normal
(*    I (-1 + s1 + s1^2 (-1 + s1 (-1 + s2))) - I x + I x^2 + I x^3 - I x^4 +
      I (-1 + s2 (-1 + s1 (1 + s2 - s2^2))) x^5 + I x^6 + I x^7 + x^8    *)

data2 = ReIm[List @@ Join @@ Map[NRoots[g @@ #, x] &, Exp[I*tlist]][[All, All, 2]]]; // AbsoluteTiming // First (* 3.71658 *)

is faster by a factor of 14, and numerically indistinguishable:

data2 - data1 // Norm
(*    5.06198*10^-14    *)

Replacing Map by ParallelMap gives a small speedup, but not much.

Roman
  • 47,322
  • 2
  • 55
  • 121
4

Another possibility is the Ehrlich-Aberth algorithm which is easy to implement:

cAberth = Block[{z, t, x},
  With[{
    \[Epsilon] = 16. $MachineEpsilon,
    d = 8,
    rCode = N[
       Divide[
         f[Compile`GetElement[t, 1], Compile`GetElement[t, 2]], 
         D[f[Compile`GetElement[t, 1], Compile`GetElement[t, 2]], x]
         ] /. x -> z]
    },
   Compile[{{t, _Real, 1}},
    Block[{z, w, sum, maxw, r, zlist, wlist},
     zlist = RandomComplex[{-1 - I, 1 + I}, d];
     z = 0. + 0. I;
     r = 0. + 0. I;
     maxw = 1.;
 While[ maxw &gt; \[Epsilon],

  maxw = 0.;

  Do[
   z = Compile`GetElement[zlist, i];
   r = rCode;
   sum = 0. + 0. I;
   Do[sum += 1./(z - Compile`GetElement[zlist, j]), {j, 1, i - 1}];
   Do[sum += 1./(z - Compile`GetElement[zlist, j]), {j, i + 1, d}];
   w = r /(1. - r sum);
   maxw = Max[maxw, Abs[w]];
   zlist[[i]] -= w;

   , {i, 1, d}];
  ];
 zlist
 ],
CompilationTarget -&gt; &quot;C&quot;,
RuntimeAttributes -&gt; {Listable},
Parallelization -&gt; True,
RuntimeOptions -&gt; &quot;Speed&quot;
]

] ];

Use it like this on you problem:

data3 = Flatten[ReIm[cAberth[tlist]], 1];

This is a quick and dirty implementation of the "Gauss-Seidel"-like variant as mentioned in the text. I just generate random starting values without looking at the actual coefficients; so this should definitely be improved.

It takes about 25% longer than SolvePolynomial[degree][coeffs[tlist], threadCount] from my other answer. But I think that is due to the parallelization in CompiledFunctions; that is sometimes a bit poor. I expect that a corresponding LibraryLink implementation with parallelization within the function would perform better.

Henrik Schumacher
  • 106,770
  • 7
  • 179
  • 309
  • Consider posting an answer here?: https://mathematica.stackexchange.com/q/73312/1871 :) – xzczd Dec 04 '23 at 00:51