#include <stdlib.h>
#include <stdbool.h>
#include <math.h>
#include "common.h"
#include "hvapprox.h"
#include "pow_int.h"
#include "rng.h"

#define ALMOST_ZERO_WEIGHT 1e-20

#ifndef M_PIl
# define M_PIl		3.141592653589793238462643383279502884L /* pi */
#endif
#ifndef M_PI_2l
# define M_PI_2l	1.570796326794896619231321691639751442L /* pi/2 */
#endif
#ifndef M_PI_4l
# define M_PI_4l	0.785398163397448309615660845819875721L /* pi/4 */
#endif

// Fractional part of x, i.e. x minus its integer part truncated toward zero.
// Intended as a slightly faster stand-in for modfl(x, &dummy).
_attr_const_func
static inline long double fractl(long double x)
{
    const long double whole = truncl(x);
    return x - whole;
}


/* Shift the input points so that the reference point becomes the origin and
   keep only those that are strictly better than it in every objective.

   data        row-major array of *npoints_p points with dim coordinates each.
   npoints_p   in: number of input points; out: number of points kept.
   ref         reference point (dim values).
   maximise    per-objective flag; when set the difference is data - ref,
               otherwise ref - data.

   Returns a malloc'ed row-major array holding the kept points (the caller
   frees it), or NULL when no point was kept.  The buffer is sized for all
   input points even if fewer are kept.
*/
static double *
transform_and_filter(const double * restrict data, size_t * restrict npoints_p,
                     dimension_t dim, const double * restrict ref,
                     const bool * restrict maximise)
{
    const size_t total = *npoints_p;
    double * points = malloc(dim * total * sizeof(*points));
    size_t kept = 0;
    for (size_t row = 0; row < total; row++) {
        const double * restrict in = data + row * dim;
        // Always write into the next free slot; the slot is only claimed
        // (kept++) once every coordinate turned out to be > 0.
        double * restrict out = points + kept * dim;
        bool all_positive = true;
        for (dimension_t d = 0; d < dim; d++) {
            const double gap = maximise[d] ? (in[d] - ref[d]) : (ref[d] - in[d]);
            out[d] = gap;
            if (gap <= 0) {
                all_positive = false;
                break;
            }
        }
        if (all_positive)
            kept++;
    }
    *npoints_p = kept;
    if (kept > 0)
        return points;
    free(points);
    return NULL;
}

/* Evaluate one sampled direction:

       ( max_i  min_k  points[i][k] * w[k] ) ^ dim

   points   row-major array of npoints rows with dim coordinates each.
   w        dim per-objective multipliers.

   The callers in this file pass coordinates filtered to be > 0 and
   multipliers that are > 0, so every product is non-negative.
*/
_attr_optimize_finite_math // Required so that GCC will vectorize the inner loop.
static double
get_expected_value(const double * restrict points, size_t npoints,
                   dimension_t dim, const double * restrict w)
{
    ASSUME(1 <= dim && dim <= MOOCORE_DIMENSION_MAX);
    ASSUME(npoints > 0);
    // points >= 0 && w >= 0, so the maximum cannot be < 0 and 0 is a valid
    // starting value.  Do not seed it with -INFINITY: this function is tagged
    // for finite-math optimisation (per the attribute name), and under such
    // assumptions the compiler is allowed to presume infinities never occur.
    double max_s_w = 0.0;
    for (size_t i = 0; i < npoints; i++) {
        const double * restrict p = points + i * dim;
        double min_ratio = p[0] * w[0];
        for (dimension_t k = 1; k < dim; k++) {
            double ratio = p[k] * w[k];
            min_ratio = MIN(min_ratio, ratio);
        }
        max_s_w = MAX(max_s_w, min_ratio);
    }
    ASSUME(max_s_w >= 0);
    return pow_uint(max_s_w, dim);
}

#if DEBUG >= 1 // Only used for checking the product of int_all below.
/* Pre-computed value of S_{d-1} / 2^d = (2*pi^(d/2) / Gamma(d/2)) / 2^d,
   indexed by d (one entry for each d = 0..32).

   Only compiled when DEBUG >= 1: compute_int_all() compares entry [dm1 + 1]
   against the product of the first dm1 sine-power integrals.

   Generated by hypersphere_volume.py using long double to preserve as much
   precision as possible.  The decimal number in each trailing comment is an
   approximate rendering of the hexadecimal floating-point literal on that
   line.
*/
static const long double sphere_area_div_2_pow_d[] = {
    0.0L, // d = 0, value = 0.0
    1.0L, // d = 1, value = 1.0
    0x1.921fb54442d18469898cc51701b839a252049c1115p+00L, // d = 2, value = 1.5707963267948966
    0x1.921fb54442d18469898cc51701b839a252049c1115p+00L, // d = 3, value = 1.5707963267948966
    0x1.3bd3cc9be45de5a4adc4d9b30118358e10acd47fc2p+00L, // d = 4, value = 1.2337005501361697
    0x1.a51a6625307d3230e7b1224401759cbd6b911b5502p-01L, // d = 5, value = 0.8224670334241132
    0x1.f019b59389d7c1e019558e5380d6d8733503c4a497p-02L, // d = 6, value = 0.4844730731296847
    0x1.08963eb51650efeefc71e581de507370a4cecf46b7p-02L, // d = 7, value = 0.2583856390024985
    0x1.03c1f081b5ac3b353301e00faeda2538125c8c3b59p-03L, // d = 8, value = 0.12683475395052402
    0x1.dafc3b70d72c4f02316283c4e7f54b5f3784a8a709p-05L, // d = 9, value = 0.05798160180595383
    0x1.9806b8153159a46d9e0255c5898bcf370eee57846p-06L, // d = 10, value = 0.0249039457019272
    0x1.4b9a2f342b5b8b51c57e3d94590728cf4916387f16p-07L, // d = 11, value = 0.010119698570941848
    0x1.005ed5ead8ffb7d3fbc24f22cec8451f00764b8e16p-08L, // d = 12, value = 0.00391190264312868
    0x1.7ad251e2f6063b9710446d8bb8c07dc71d20931b18p-10L, // d = 13, value = 0.0014450895766824562
    0x1.0c787349665d506993054f77d909360acd87ebecd7p-11L, // d = 14, value = 0.0005120668585504815
    0x1.6e2f802d8e6f6be3cc89d7e3d4be387de5f4cf8fefp-13L, // d = 15, value = 0.00017461087684172264
    0x1.e1f506891babaf464e85b51fb497d01b9d08bb326fp-15L, // d = 16, value = 5.745376717746416e-05
    0x1.32c65f1a4911d21b53ad39b93d0bb24113a26b755bp-16L, // d = 17, value = 1.8285208264094266e-05
    0x1.7a873b18ec471f028a1f0e9a834133b44336bcd148p-18L, // d = 18, value = 5.640510402680619e-06
    0x1.c588f17d08676e01725ff1abde4c56ba193c210ee5p-20L, // d = 19, value = 1.6895492927011154e-06
    0x1.084337a542e974a08d2829d6d18c660131bc6ff3bcp-21L, // d = 20, value = 4.922273900988399e-07
    0x1.2bf668645a7468deb47b91b7c44236587c2de78fa3p-23L, // d = 21, value = 1.396809380480962e-07
    0x1.4c1506e8fdce59389198a25c0d328da5770c11d504p-25L, // d = 22, value = 3.8659448815754826e-08
    0x1.66fe8b8d476d4d6c9e4bc48e83fca953436771dee4p-27L, // d = 23, value = 1.0448109733772144e-08
    0x1.7b5ea7a98ad91ea61c1a5c7bcb16d152b3c2d51b7ep-29L, // d = 24, value = 2.7602781907137724e-09
    0x1.88487927445984656c2db6d925930e4cbd43eb71a3p-31L, // d = 25, value = 7.135587996417082e-10
    0x1.8d466ae9142e099129461cd7de1256d3e39df5caccp-33L, // d = 26, value = 1.8065978512105238e-10
    0x1.8a5db1253b2056ea27cd7ce3245e3c4354746c8395p-35L, // d = 27, value = 4.483422165717483e-11
    0x1.8006055c1b03d4804b491953e83b472bac1db1d0fap-37L, // d = 28, value = 1.091460487952709e-11
    0x1.6f17afdb720740237f86dfbc991f0a5afbec5ad741p-39L, // d = 29, value = 2.6083492849554974e-12
    0x1.58b2de52401cf583c86dd159ff28406e44680a2e86p-41L, // d = 30, value = 6.123079018992432e-13
    0x1.3e23ad365ba0e2ca4613f1c499779316a839fafb2fp-43L, // d = 31, value = 1.4128225778642035e-13
    0x1.20c62c2f2d7f4a970cb97b8e179a6943fd21ba7509p-45L, // d = 32, value = 3.206036677236071e-14
};
#endif
/* Pre-computed value of S_{d-1} / (d * 2^d) = (2*pi^(d/2) / Gamma(d/2)) / (d * 2^d),
   indexed by d (one entry for each d = 0..32).

   hv_approx_normal() and hv_approx_hua_wang() read entry [dim] as the final
   scaling constant, so the table must reach the largest supported dimension.
   The other per-dimension tables in this file (the DEBUG-only
   sphere_area_div_2_pow_d and the primes table in construct_polar_a) cover
   d = 32, hence this one does too.

   Generated by hypersphere_volume.py using long double to preserve as much
   precision as possible.  The d = 32 entry is the d = 32 value of
   S_{d-1} / 2^d divided by 32, i.e. the same mantissa with the binary
   exponent lowered by 5 (the same relation holds for d = 2, 4, 8, 16).
*/
static const long double sphere_area_div_2_pow_d_times_d[] = {
    0.0L, // d = 0, value = 0.0
    1.0L, // d = 1, value = 1.0
    0x1.921fb54442d18469898cc51701b839a252049c1115p-01L, // d = 2, value = 0.7853981633974483
    0x1.0c152382d73658465bb32e0f567ad116e158680b63p-01L, // d = 3, value = 0.5235987755982989
    0x1.3bd3cc9be45de5a4adc4d9b30118358e10acd47fc2p-02L, // d = 4, value = 0.30842513753404244
    0x1.50e1eb50f3975b5a52f41b699ac47d64560daf7735p-03L, // d = 5, value = 0.16449340668482265
    0x1.4abbce625be52beabb8e5ee255e4904cce02831865p-04L, // d = 6, value = 0.08074551218828079
    0x1.2e6290cef4eec91120822add905bf1a54ea33607acp-05L, // d = 7, value = 0.036912234143214075
    0x1.03c1f081b5ac3b353301e00faeda2538125c8c3b59p-06L, // d = 8, value = 0.015854344243815502
    0x1.a6358a2b69ee7f1e64c958af0712eda9f875eb3f25p-08L, // d = 9, value = 0.006442400200661537
    0x1.466bc6775aae1d247e68449e07a30c2c0bf1df9d1ap-09L, // d = 10, value = 0.0024903945701927202
    0x1.e254a1c03f10caa57c5a881d98c4987353090c5bc3p-11L, // d = 11, value = 0.0009199725973583498
    0x1.55d3c7e3cbff9fc54fadbed913b5b17eab4864bd73p-12L, // d = 12, value = 0.00032599188692739
    0x1.d23deea12ecc981c62ca5f70e36310f510281783cfp-14L, // d = 13, value = 0.00011116073666788125
    0x1.32d2cce62bd85be65ee17f6465c162557d2d9fea1ap-15L, // d = 14, value = 3.657620418217725e-05
    0x1.86991141a8ff62040d5fd53749536f75398daa3321p-17L, // d = 15, value = 1.1640725122781509e-05
    0x1.e1f506891babaf464e85b51fb497d01b9d08bb326fp-19L, // d = 16, value = 3.59086044859151e-06
    0x1.20bab3dc8101b6b04ec12744ee29203d3fa7ecaabp-20L, // d = 17, value = 1.075600486123192e-06
    0x1.50783487ee781b907ac645dead8f4a675830a7d679p-22L, // d = 18, value = 3.1336168903781217e-07
    0x1.7dec9576c3b56a1c2a6bbe09fe91209cb6ef44427p-24L, // d = 19, value = 8.892364698426923e-08
    0x1.a6d1f2a204a8ba9a7b737624827a3cceb5fa4cb92dp-26L, // d = 20, value = 2.4611369504942e-08
    0x1.c915f468284fdcc113057054fa4c8393065e54aa1dp-28L, // d = 21, value = 6.651473240385533e-09
    0x1.e3074fde8871f623bc80ec28cd60ce07f2fa487ba9p-30L, // d = 22, value = 1.757247673443401e-09
    0x1.f378691d9b0760971f053dfdef4944a05dc7934c5ep-32L, // d = 23, value = 4.5426564059878885e-10
    0x1.f9d38a3763cc28dd7acdd0a50ec91718efae7179fdp-34L, // d = 24, value = 1.1501159127974052e-10
    0x1.f61f536576352458dc637967de2cdf1a8bdc0ea5fp-36L, // d = 25, value = 2.8542351985668327e-11
    0x1.e8f434d018d63328d0564ae24c65572c2bd61ad237p-38L, // d = 26, value = 6.948453273886629e-12
    0x1.d3658f946c0067028dfd05cad5c5050d6d937720b1p-40L, // d = 27, value = 1.6605267280435122e-12
    0x1.b6e24f44b128f2dbc3c14184771f2cc4326b145c8cp-42L, // d = 28, value = 3.8980731712596753e-13
    0x1.95114f4a6c2b4f99edd29e9b248c2ebcac0da20801p-44L, // d = 29, value = 8.994307879156888e-14
    0x1.6fadb9f1557439151a0ebd2ccbe6ab2048f78253c3p-46L, // d = 30, value = 2.0410263396641442e-14
    0x1.4866e45924c71ba782251a99649c76cd10bffad1bdp-48L, // d = 31, value = 4.5574921866587215e-15
    0x1.20c62c2f2d7f4a970cb97b8e179a6943fd21ba7509p-50L, // d = 32, value ~= 1.00188646163627e-15
};

// Euclidean (L2) length of the dim-element vector w.  Requires dim >= 2.
_attr_pure_func static double
euclidean_norm(const double * restrict w, dimension_t dim)
{
    ASSUME(2 <= dim && dim <= MOOCORE_DIMENSION_MAX);
    // The first two squares seed the accumulator; the rest are added in
    // index order.
    double sum_sq = (w[0] * w[0]) + (w[1] * w[1]);
    dimension_t k = 2;
    while (k < dim) {
        const double wk = w[k];
        sum_sq += wk * wk;
        k++;
    }
    return sqrt(sum_sq);
}

/* Hypervolume approximation DZ2019-MC (Monte Carlo sampling of directions).

   Jingda Deng, Qingfu Zhang (2019). “Approximating Hypervolume and Hypervolume
   Contributions Using Polar Coordinate.” IEEE Transactions on Evolutionary
   Computation, 23(5), 913–918. doi:10.1109/tevc.2019.2895108 .

   data         row-major array of npoints points with dim objectives each.
   ref          reference point (dim values).
   maximise     per-objective flag telling whether the objective is maximised.
   nsamples     number of random directions to draw.
   random_seed  seed handed to rng_new().

   Returns 0 when no point is strictly better than ref in every objective;
   otherwise the sampled mean scaled by sphere_area_div_2_pow_d_times_d[dim].
*/
double
hv_approx_normal(const double * restrict data, size_t npoints, dimension_t dim,
                 const double * restrict ref, const bool * restrict maximise,
                 uint_fast32_t nsamples, uint32_t random_seed)
{
    ASSUME(2 <= dim && dim <= MOOCORE_DIMENSION_MAX);
    double * shifted = transform_and_filter(data, &npoints, dim, ref, maximise);
    if (!shifted)
        return 0;

    rng_state * rng = rng_new(random_seed);
    double * inv_w = malloc(dim * sizeof(*inv_w));
    double sum = 0.0;
    // Monte Carlo sampling.
    for (uint_fast32_t s = 0; s < nsamples; s++) {
        // Random direction in the positive orthant: absolute values of
        // independent standard normals, normalised below.
        // Reference: Marsaglia, G. (1972). "Choosing a Point from the Surface
        // of a Sphere". Annals of Mathematical Statistics. 43 (2): 645-646.
        for (dimension_t k = 0; k < dim; k++) {
            const double magnitude = fabs(rng_standard_normal(rng));
            // Clamp away from zero to avoid a division by zero below.
            inv_w[k] = MAX(magnitude, ALMOST_ZERO_WEIGHT);
        }
        const double norm = euclidean_norm(inv_w, dim);
        // Store 1 / (w[k] / norm), so that evaluating a direction only needs
        // multiplications.
        for (dimension_t k = 0; k < dim; k++)
            inv_w[k] = norm / inv_w[k];
        sum += get_expected_value(shifted, npoints, dim, inv_w);
    }
    free(inv_w);
    free(rng);
    free(shifted);

    const long double c_m = sphere_area_div_2_pow_d_times_d[dim];
    const long double mean = sum / STATIC_CAST(long double, nsamples);
    return STATIC_CAST(double, c_m * mean);
}


/* Build the dim integer multipliers consumed by compute_polar_sample().

   a[0] is 1; for k >= 1, a[k] is nsamples times the fractional part of
   2 * |cos(2 * pi * k / p)|, rounded to the nearest integer, where p is a
   prime picked from a table indexed by dim.

   Returns a malloc'ed array of dim elements; the caller frees it.
*/
static uint_fast32_t *
construct_polar_a(dimension_t dim, uint_fast32_t nsamples)
{
    ASSUME(1 <= dim && dim <= MOOCORE_DIMENSION_MAX);
    // Step 1: pick a prime p such that dim <= phi(p) / 2 == (p - 1) / 2,
    // where phi is Euler's totient function.  Entry [0] is never selected
    // because dim >= 1.
    static const dimension_t prime_for_dim [] = {
        1,  3,  5,  7, 11, 11, 13, 17, 17, 19,
        23, 23, 29, 29, 29, 31, 37, 37, 37, 41,
        41, 43, 47, 47, 53, 53, 53, 59, 59, 59,
        61, 67, 67 };

    const dimension_t prime = prime_for_dim[dim];
    DEBUG2_PRINT("construct_polar_a: prime: %u\n", (unsigned int)prime);

    uint_fast32_t * coef = malloc(dim * sizeof(*coef));
    coef[0] = 1;
    DEBUG2_PRINT("construct_polar_a: a[%u] = %lu",
                 (unsigned int) dim, (unsigned long) coef[0]);
    for (dimension_t k = 1; k < dim; k++) {
        const long double angle = 2 * M_PIl * k / prime;
        const long double frac = fractl(2 * fabsl(cosl(angle)));
        coef[k] = STATIC_CAST(uint_fast32_t, llroundl(nsamples * frac));
        DEBUG2_PRINT(", %lu", (unsigned long) coef[k]);
    }
    DEBUG2_PRINT("\n");
    return coef;
}

/* Fill sample[0..dim-1] with the i-th point (0-based) of the nsamples-point
   set defined by the multipliers a[]:

       sample[k] = fractional part of ((i + 1) / nsamples) * a[k]

   except for the last point (i + 1 == nsamples), which is all zeros.
*/
static void
compute_polar_sample(long double * restrict sample, dimension_t dim,
                     uint_fast32_t i, uint_fast32_t nsamples,
                     const uint_fast32_t * restrict a)
{
    ASSUME(i + 1 <= nsamples);
    if (!(i + 1 < nsamples)) {
        // Last point is always 0.
        for (dimension_t k = 0; k < dim; k++)
            sample[k] = 0.0;
        return;
    }

    const long double factor = (i+1) / STATIC_CAST(long double, nsamples);
    for (dimension_t k = 0; k < dim; k++)
        sample[k] = fractl(factor * a[k]);
}

/* Calculate \int_{0}^{b} \sin^m x dx

   Generated using this code:

```python
import sympy as sp
from sympy import sin, cos, Rational,Float
x = sp.symbols('x', real=True)
# Upper limit of integration
b = sp.symbols('b', real=True)

manually_simplified = {
    4: Rational(3,8)*b - cos(b)*sin(b)*(Rational(1,4)*sin(b)**2 + Rational(3,8)),
    5: Rational(8,15) - cos(b) * (cos(b)**4/5 - 2*cos(b)**2/3 + 1),
    6: Rational(5,16)*b - sin(b) * cos(b) * (Rational(1,6)*sin(b)**4 + 5*sin(b)**2/24 + Rational(5,16)),
    7: (cos(b)**6/7 - 3*cos(b)**4/5 + cos(b)**2 - 1)*cos(b) + Rational(16,35),
    8: 35*b/128 - (sin(b)**6/8 + 7*sin(b)**4/48 + 35*sin(b)**2/192 + Rational(35,128))*sin(b)*cos(b),
    9: Rational(128,315) - cos(b) * (cos(b)**8/9 - 4*cos(b)**6/7 + 6*cos(b)**4/5 - 4*cos(b)**2/3 + 1)
}

# Function to calculate the definite integral of sin(x)^n from 0 to b
def sin_power_integral(n, b):
    expr = sin(x)**n
    integral = sp.integrate(expr, (x, 0, b))
    if n in manually_simplified:
        assert sp.simplify(integral - manually_simplified[n], rational=True) == 0
        return manually_simplified[n]
    integral = integral.collect(sin(b)*cos(b), exact=False)
    if n == 2:
        return sp.simplify(integral);
    if n == 3:
        return sp.simplify(sp.simplify(integral));
    return integral

# Calculate and print the integrals for n from 0 to 31
results = {n: sin_power_integral(n, b) for n in range(32)}

# Function to check if a number has an exact floating-point representation
def is_exact_float(e):
    return isinstance(e, sp.Rational) and e.q != 1 and e.q & (e.q - 1) == 0  # Check if denominator is power of 2

# Function to convert rational numbers to float only if exact
def convert_rational(expr):
    return expr.replace(lambda e: is_exact_float(e), lambda e: Float(e,precision=128))

from sympy.printing.c import C99CodePrinter
class MyCCodePrinter(C99CodePrinter):
    def _print_Rational(self, expr):
        p, q = int(expr.p), int(expr.q)
        return '%d/%d.' % (p, q)

def custom_ccode(expr):
    expr = convert_rational(expr)  # Convert exact rationals to floats
    expr = expr.subs({cos(b): sp.Symbol('cos_b'), sin(b): sp.Symbol('sin_b')})  # Replace cos(b) -> cos_b, sin(b) -> sin_b
    return MyCCodePrinter().doprint(expr)

for n, integral in results.items():
    c_code = custom_ccode(integral)
    print(f'case {n}:\n    return {c_code};')
``` */
// Closed-form value of \int_{0}^{b} \sin^m x dx for 0 <= m <= 31.  Any other
// value of m reaches unreachable().
_attr_const_func static double
int_of_power_of_sin_from_0_to_b(dimension_t m, double b)
{
// Shorthand used by every case below.  fast_pow_uint_max31 is not defined in
// this file (presumably an integer-exponent power helper from pow_int.h).
#define POW fast_pow_uint_max31
    // Each case assigns only the one(s) it needs before using them.
    double sin_b, cos_b;

    // Shape of the formulas (from case 3 onwards):
    //  - odd m:  a polynomial in cos(b) plus a rational constant.
    //  - even m: a multiple of b minus sin(b)*cos(b) times a polynomial in
    //    even powers of sin(b).
    // Rationals are spelled 'p/q.' (trailing dot) so that the division is
    // done in floating point; those with an exact binary representation are
    // written as decimals.
    switch (m) {
      case 0:
          return b;
      case 1:
          return 1 - cos(b);
      case 2:
          return 0.5*b - 0.25*sin(2*b);
      case 3:
          cos_b = cos(b);
          return POW(cos_b, 3)/3 - cos_b + 2/3.;
      case 4:
          sin_b = sin(b); cos_b = cos(b);
          return 0.375*b - cos_b*sin_b*(0.25*POW(sin_b, 2) + 0.375);
      case 5:
          cos_b = cos(b);
          return 8/15. - cos_b*(POW(cos_b, 4)/5 - 2/3.*POW(cos_b, 2) + 1);
      case 6:
          sin_b = sin(b); cos_b = cos(b);
          return 0.3125*b - cos_b*sin_b*(POW(sin_b, 4)/6 + (5/24.)*POW(sin_b, 2) + 0.3125);
      case 7:
          cos_b = cos(b);
          return cos_b*(POW(cos_b, 6)/7 - 3/5.*POW(cos_b, 4) + POW(cos_b, 2) - 1) + 16/35.;
      case 8:
          sin_b = sin(b); cos_b = cos(b);
          return 0.2734375*b - cos_b*sin_b*(0.125*POW(sin_b, 6) + (7/48.)*POW(sin_b, 4) + (35/192.)*POW(sin_b, 2) + 0.2734375);
      case 9:
          cos_b = cos(b);
          return 128/315. - cos_b*(POW(cos_b, 8)/9. - 4/7.*POW(cos_b, 6) + (6/5.)*POW(cos_b, 4) - 4/3.*POW(cos_b, 2) + 1);
      case 10:
          sin_b = sin(b); cos_b = cos(b);
          return 0.24609375*b - cos_b*sin_b*(POW(sin_b, 8)/10. + 9/80.*POW(sin_b, 6) + 21/160.*POW(sin_b, 4) + 0.1640625*POW(sin_b, 2) + 0.24609375);
      case 11:
          cos_b = cos(b);
          return POW(cos_b, 11)/11. - 5/9.*POW(cos_b, 9) + (10/7.)*POW(cos_b, 7) - 2*POW(cos_b, 5) + (5/3.)*POW(cos_b, 3) - cos_b + 256/693.;
      case 12:
          sin_b = sin(b); cos_b = cos(b);
          return 0.2255859375*b - cos_b*sin_b*(POW(sin_b, 10)/12 + 11/120.*POW(sin_b, 8) + 33/320.*POW(sin_b, 6) + 77/640.*POW(sin_b, 4) + 0.150390625*POW(sin_b, 2) + 0.2255859375);
      case 13:
          cos_b = cos(b);
          return 1024/3003. -POW(cos_b, 13)/13. + (6/11.)*POW(cos_b, 11) - 5/3.*POW(cos_b, 9) + (20/7.)*POW(cos_b, 7) - 3*POW(cos_b, 5) + 2*POW(cos_b, 3) - cos_b;
      case 14:
          sin_b = sin(b); cos_b = cos(b);
          return 0.20947265625*b - cos_b*sin_b*(POW(sin_b, 12)/14. + 13/168.*POW(sin_b, 10) + 143/1680.*POW(sin_b, 8) + 429/4480.*POW(sin_b, 6) + 143/1280.*POW(sin_b, 4) + 0.1396484375*POW(sin_b, 2) + 0.20947265625);
      case 15:
          cos_b = cos(b);
          return POW(cos_b, 15)/15. - 7/13.*POW(cos_b, 13) + (21/11.)*POW(cos_b, 11) - 35/9.*POW(cos_b, 9) + 5*POW(cos_b, 7) - 21/5.*POW(cos_b, 5) + (7/3.)*POW(cos_b, 3) - cos_b + 2048/6435.;
      case 16:
          sin_b = sin(b); cos_b = cos(b);
          return 0.196380615234375*b - cos_b*sin_b*(0.0625*POW(sin_b, 14) + 15/224.*POW(sin_b, 12) + 65/896.*POW(sin_b, 10) + 143/1792.*POW(sin_b, 8) + 1287/14336.*POW(sin_b, 6) + 0.104736328125*POW(sin_b, 4) + 0.13092041015625*POW(sin_b, 2) + 0.196380615234375);
      case 17:
          cos_b = cos(b);
          return 32768/109395. - POW(cos_b, 17)/17. + (8/15.)*POW(cos_b, 15) - 28/13.*POW(cos_b, 13) + (56/11.)*POW(cos_b, 11) - 70/9.*POW(cos_b, 9) + 8*POW(cos_b, 7) - 28/5.*POW(cos_b, 5) + (8/3.)*POW(cos_b, 3) - cos_b;
      case 18:
          sin_b = sin(b); cos_b = cos(b);
          return 0.1854705810546875*b - cos_b*sin_b*(POW(sin_b, 16)/18. + 17/288.*POW(sin_b, 14) + 85/1344.*POW(sin_b, 12) + 1105/16128.*POW(sin_b, 10) + 2431/32256.*POW(sin_b, 8) + 2431/28672.*POW(sin_b, 6) + 2431/24576.*POW(sin_b, 4) + 12155/98304.*POW(sin_b, 2) + 0.1854705810546875);
      case 19:
          cos_b = cos(b);
          return POW(cos_b, 19)/19. - 9/17.*POW(cos_b, 17) + (12/5.)*POW(cos_b, 15) - 84/13.*POW(cos_b, 13) + (126/11.)*POW(cos_b, 11) - 14*POW(cos_b, 9) + 12*POW(cos_b, 7) - 36/5.*POW(cos_b, 5) + 3*POW(cos_b, 3) - cos_b + 65536/230945.;
      case 20:
          sin_b = sin(b); cos_b = cos(b);
          return 0.17619705200195313*b - cos_b*sin_b*(POW(sin_b, 18)/20. + 19/360.*POW(sin_b, 16) + 323/5760.*POW(sin_b, 14) + 323/5376.*POW(sin_b, 12) + 4199/64512.*POW(sin_b, 10) + 46189/645120.*POW(sin_b, 8) + 46189/573440.*POW(sin_b, 6) + 46189/491520.*POW(sin_b, 4) + 46189/393216.*POW(sin_b, 2) + 0.17619705200195313);
      case 21:
          cos_b = cos(b);
          return 262144/969969. -POW(cos_b, 21)/21 + (10/19.)*POW(cos_b, 19) - 45/17.*POW(cos_b, 17) + 8*POW(cos_b, 15) - 210/13.*POW(cos_b, 13) + (252/11.)*POW(cos_b, 11) - 70/3.*POW(cos_b, 9) + (120/7.)*POW(cos_b, 7) - 9*POW(cos_b, 5) + (10/3.)*POW(cos_b, 3) - cos_b;
      case 22:
          sin_b = sin(b); cos_b = cos(b);
          return 0.16818809509277344*b - cos_b*sin_b*(
              POW(sin_b, 20)/22. + 21/440.*POW(sin_b, 18) + 133/2640.*POW(sin_b, 16) + 2261/42240.*POW(sin_b, 14) + 323/5632.*POW(sin_b, 12) + 4199/67584.*POW(sin_b, 10)
              + 4199/61440.*POW(sin_b, 8) + 12597/163840.*POW(sin_b, 6) + 29393/327680.*POW(sin_b, 4) + 0.11212539672851563*POW(sin_b, 2) + 0.16818809509277344);
      case 23:
          cos_b = cos(b);
          return POW(cos_b, 23)/23 - 11/21.*POW(cos_b, 21) + (55/19.)*POW(cos_b, 19) - 165/17.*POW(cos_b, 17) + 22*POW(cos_b, 15) - 462/13.*POW(cos_b, 13) + 42*POW(cos_b, 11) - 110/3.*POW(cos_b, 9) + (165/7.)*POW(cos_b, 7) - 11*POW(cos_b, 5) + (11/3.)*POW(cos_b, 3) - cos_b + 524288/2028117.;
      case 24:
          sin_b = sin(b); cos_b = cos(b);
          return 0.16118025779724121*b - cos_b*sin_b*(POW(sin_b, 22)/24. + 23/528.*POW(sin_b, 20) + 161/3520.*POW(sin_b, 18) + 3059/63360.*POW(sin_b, 16) + 52003/1013760.*POW(sin_b, 14) + 7429/135168.*POW(sin_b, 12) + 96577/1622016.*POW(sin_b, 10) + 96577/1474560.*POW(sin_b, 8) + 96577/1310720.*POW(sin_b, 6) + 676039/7864320.*POW(sin_b, 4) + 676039/6291456.*POW(sin_b, 2) + 0.16118025779724121);
      case 25:
          cos_b = cos(b);
          return 4194304/16900975. - POW(cos_b, 25)/25. + (12/23.)*POW(cos_b, 23) - 22/7.*POW(cos_b, 21) + (220/19.)*POW(cos_b, 19) - 495/17.*POW(cos_b, 17) + (264/5.)*POW(cos_b, 15) - 924/13.*POW(cos_b, 13) + 72*POW(cos_b, 11) - 55*POW(cos_b, 9) + (220/7.)*POW(cos_b, 7) - 66/5.*POW(cos_b, 5) + 4*POW(cos_b, 3) - cos_b;
      case 26:
          sin_b = sin(b); cos_b = cos(b);
          return 0.15498101711273193*b - cos_b*sin_b*(
              POW(sin_b, 24)/26. + 25/624.*POW(sin_b, 22) + 575/13728.*POW(sin_b, 20) + 805/18304.*POW(sin_b, 18) + 15295/329472.*POW(sin_b, 16) + 260015/5271552.*POW(sin_b, 14)
              + 185725/3514368.*POW(sin_b, 12) + 185725/3244032.*POW(sin_b, 10) + 37145/589824.*POW(sin_b, 8) + 0.070848464965820313*POW(sin_b, 6) + 260015/3145728.*POW(sin_b, 4) + 1300075/12582912.*POW(sin_b, 2) + 0.15498101711273193);
      case 27:
          cos_b = cos(b);
          return POW(cos_b, 27)/27. - 13/25.*POW(cos_b, 25) + (78/23.)*POW(cos_b, 23) - 286/21.*POW(cos_b, 21) + (715/19.)*POW(cos_b, 19) - 1287/17.*POW(cos_b, 17) + (572/5.)*POW(cos_b, 15) - 132*POW(cos_b, 13) + 117*POW(cos_b, 11) - 715/9.*POW(cos_b, 9) + (286/7.)*POW(cos_b, 7) - 78/5.*POW(cos_b, 5) + (13/3.)*POW(cos_b, 3) - cos_b + 8388608/35102025.;
      case 28:
          sin_b = sin(b); cos_b = cos(b);
          return 0.14944598078727722*b - cos_b*sin_b*(
              POW(sin_b, 26)/28. + 27/728.*POW(sin_b, 24) + 225/5824.*POW(sin_b, 22) + 5175/128128.*POW(sin_b, 20) + 3105/73216.*POW(sin_b, 18) + 6555/146432.*POW(sin_b, 16) + 111435/2342912.*POW(sin_b, 14) + 1671525/32800768.*POW(sin_b, 12) + 557175/10092544.*POW(sin_b, 10) + 111435/1835008.*POW(sin_b, 8) + 1002915/14680064.*POW(sin_b, 6) + 0.079704523086547852*POW(sin_b, 4) + 0.099630653858184814*POW(sin_b, 2) + 0.14944598078727722);
      case 29:
          cos_b = cos(b);
          return 33554432/145422675. - POW(cos_b, 29)/29 + (14/27.)*POW(cos_b, 27) - 91/25.*POW(cos_b, 25) + (364/23.)*POW(cos_b, 23) - 143/3.*POW(cos_b, 21) + (2002/19.)*POW(cos_b, 19) - 3003/17.*POW(cos_b, 17) + (1144/5.)*POW(cos_b, 15) - 231*POW(cos_b, 13) + 182*POW(cos_b, 11) - 1001/9.*POW(cos_b, 9) + 52*POW(cos_b, 7) - 91/5.*POW(cos_b, 5) + (14/3.)*POW(cos_b, 3) - cos_b;
      case 30:
          sin_b = sin(b); cos_b = cos(b);
          return 0.14446444809436798*b - cos_b*sin_b*(
              POW(sin_b, 28)/30. + 29/840.*POW(sin_b, 26) + 261/7280.*POW(sin_b, 24) + 435/11648.*POW(sin_b, 22) + 10005/256256.*POW(sin_b, 20) + 6003/146432.*POW(sin_b, 18) + 12673/292864.*POW(sin_b, 16) + 215441/4685824.*POW(sin_b, 14) + 3231615/65601536.*POW(sin_b, 12) + 1077205/20185088.*POW(sin_b, 10) + 215441/3670016.*POW(sin_b, 8) + 1938969/29360128.*POW(sin_b, 6) + 0.07704770565032959*POW(sin_b, 4) + 0.096309632062911987*POW(sin_b, 2) + 0.14446444809436798);
      case 31:
          cos_b = cos(b);
          return POW(cos_b, 31)/31. - 15/29.*POW(cos_b, 29) + (35/9.)*POW(cos_b, 27) - 91/5.*POW(cos_b, 25) + (1365/23.)*POW(cos_b, 23) - 143*POW(cos_b, 21) + (5005/19.)*POW(cos_b, 19) - 6435/17.*POW(cos_b, 17) + 429*POW(cos_b, 15) - 385*POW(cos_b, 13) + 273*POW(cos_b, 11) - 455/3.*POW(cos_b, 9) + 65*POW(cos_b, 7) - 21*POW(cos_b, 5) + 5*POW(cos_b, 3) - cos_b + 67108864/300540195.;
      default:   // LCOV_EXCL_LINE # nocov
          unreachable();
    }
#undef POW
}

// \int_{0}^{pi/2} sin^i(x) dx
//
// Entry [i] holds the integral for exponent i, for i = 0..31; the per-row
// labels "d = i" name that exponent.  Even exponents are rational multiples
// of pi, odd exponents are plain rationals.  The odd entries equal the
// constant terms of the odd cases in int_of_power_of_sin_from_0_to_b().
static const long double int_power_of_sin_from_0_to_half_pi[] = {
    /* d =  0 */ M_PI_2l,
    /* d =  1 */ 1.L,
    /* d =  2 */ M_PI_4l,
/* GCC on powerpc cannot fold some floating-point expressions involving IBM
   long double into constant initializers, unless -ffast-math is enabled.  See
   https://gcc.gnu.org/PR19779 */
#if defined(__GNUC__) && (defined(__PPC__) || defined(__POWERPC__) || defined(__ppc__))
    // Workaround branch: the odd entries divide double literals and the even
    // entries use M_PI instead of M_PIl.
    /* d =  3 */ 2. / 3.,
    /* d =  4 */ 3.L * M_PI / 16.L,
    /* d =  5 */ 8. / 15.,
    /* d =  6 */ 5.L * M_PI / 32.L,
    /* d =  7 */ 16. / 35.,
    /* d =  8 */ 35.L * M_PI / 256.L,
    /* d =  9 */ 128. / 315.,
    /* d = 10 */ 63.L * M_PI / 512.L,
    /* d = 11 */ 256. / 693.,
    /* d = 12 */ 231.L * M_PI / 2048.L,
    /* d = 13 */ 1024. / 3003.,
    /* d = 14 */ 429.L * M_PI / 4096.L,
    /* d = 15 */ 2048. / 6435.,
    /* d = 16 */ 6435.L * M_PI / 65536.L,
    /* d = 17 */ 32768. / 109395.,
    /* d = 18 */ 12155.L * M_PI / 131072.L,
    /* d = 19 */ 65536. / 230945.,
    /* d = 20 */ 46189.L * M_PI / 524288.L,
    /* d = 21 */ 262144. / 969969.,
    /* d = 22 */ 88179.L * M_PI / 1048576.L,
    /* d = 23 */ 524288. / 2028117.,
    /* d = 24 */ 676039.L * M_PI / 8388608.L,
    /* d = 25 */ 4194304. / 16900975.,
    /* d = 26 */ 1300075.L * M_PI / 16777216.L,
    /* d = 27 */ 8388608. / 35102025.,
    /* d = 28 */ 5014575.L * M_PI / 67108864.L,
    /* d = 29 */ 33554432. / 145422675.,
    /* d = 30 */ 9694845.L * M_PI / 134217728.L,
    /* d = 31 */ 67108864. / 300540195.,
#else
    // Default branch: every expression involves a long double operand
    // (an L-suffixed literal or M_PIl), so it is evaluated in long double.
    /* d =  3 */ 2 / 3.L,
    /* d =  4 */ 3 * M_PIl / 16,
    /* d =  5 */ 8 / 15.L,
    /* d =  6 */ 5.L * M_PIl / 32,
    /* d =  7 */ 16 / 35.L,
    /* d =  8 */ 35.L * M_PIl / 256,
    /* d =  9 */ 128 / 315.L,
    /* d = 10 */ 63 * M_PIl / 512,
    /* d = 11 */ 256 / 693.L,
    /* d = 12 */ 231 * M_PIl / 2048,
    /* d = 13 */ 1024 / 3003.L,
    /* d = 14 */ 429 * M_PIl / 4096,
    /* d = 15 */ 2048 / 6435.L,
    /* d = 16 */ 6435 * M_PIl / 65536.L,
    /* d = 17 */ 32768 / 109395.L,
    /* d = 18 */ 12155 * M_PIl / 131072,
    /* d = 19 */ 65536 / 230945.L,
    /* d = 20 */ 46189 * M_PIl / 524288,
    /* d = 21 */ 262144 / 969969.L,
    /* d = 22 */ 88179 * M_PIl / 1048576,
    /* d = 23 */ 524288 / 2028117.L,
    /* d = 24 */ 676039 * M_PIl / 8388608,
    /* d = 25 */ 4194304 / 16900975.L,
    /* d = 26 */ 1300075 * M_PIl / 16777216,
    /* d = 27 */ 8388608 / 35102025.L,
    /* d = 28 */ 5014575 * M_PIl / 67108864,
    /* d = 29 */ 33554432 / 145422675.L,
    /* d = 30 */ 9694845 * M_PIl / 134217728.L,
    /* d = 31 */ 67108864 / 300540195.L,
#endif
};

// Solve the inverse integral of a power of sin: find x such that
// \int_{0}^{x} sin^dim(t) dt == theta, by Newton iteration starting from
// x = pi/2 (the derivative of the integral with respect to x is sin^dim(x)).
_attr_const_func static long double
solve_inverse_int_of_power_sin(long double theta, dimension_t dim)
{
    long double angle = M_PI_2l;
    // At the starting point the integral is available from the table.
    long double residual = int_power_of_sin_from_0_to_half_pi[dim] - theta;
    for (;;) {
        // ??? Does this need to be ALMOST_ZERO_WEIGHT? If it does, it becomes
        // very slow.  Even 1e-16 is much slower.
        if (!(fabsl(residual) > 1e-15))
            return angle;
        const long double slope = powl_uint(sinl(angle), dim);
        angle -= residual / slope;
        // The closed-form integral is evaluated in double precision.
        residual = int_of_power_of_sin_from_0_to_b(dim, (double)angle) - theta;
    }
}

/* Return a malloc'ed copy of the first dm1 entries of
   int_power_of_sin_from_0_to_half_pi (the caller frees it).

   When DEBUG >= 1, additionally check that the product of those entries
   matches sphere_area_div_2_pow_d[dm1 + 1] to within 1e-15.
*/
static long double *
compute_int_all(dimension_t dm1)
{
    long double * copy = malloc(dm1 * sizeof(*copy));
    DEBUG2_PRINT("int_all[%u] =", (unsigned int)dm1);
    for (dimension_t k = 0; k < dm1; k++) {
        copy[k] = int_power_of_sin_from_0_to_half_pi[k];
        DEBUG2_PRINT(" %25.18Lg ", copy[k]);
    }
    DEBUG2_PRINT("\n");

#if DEBUG >= 1
    long double prod_int_all = copy[0];
    for (dimension_t k = 1; k < dm1; k++)
        prod_int_all *= copy[k];
    ASSUME(prod_int_all > 0);
    const long double S_value = sphere_area_div_2_pow_d[dm1 + 1];
    DEBUG2_PRINT("sphere / prod_int_all = %22.15Lg / %22.15Lg = %22.15Lg\n",
                 S_value, prod_int_all, S_value / prod_int_all);
    // The product of the integrals must reproduce the tabulated sphere value.
    assert(fabsl(S_value - prod_int_all) <= 1e-15);
    DEBUG2_PRINT("\n");
#endif
    return copy;
}

/* Convert, in place, the dim - 1 values in theta[] into angles.

   For each j the sine exponent is m = dim - j - 2.  theta[j] is first scaled
   by int_all[m] (as filled by compute_int_all(), the integral of sin^m over
   [0, pi/2]) to obtain a target integral value, and is then replaced by the
   angle x for which \int_{0}^{x} sin^m(t) dt equals that target.
*/
static void
compute_theta(long double * restrict theta, dimension_t dim,
              const long double * restrict int_all)
{
    ASSUME(2 <= dim && dim <= MOOCORE_DIMENSION_MAX);
    const dimension_t nangles = STATIC_CAST(dimension_t, dim - 1);
    for (dimension_t j = 0; j < nangles; j++) {
        const dimension_t power = STATIC_CAST(dimension_t, (dim - j) - 2);
        const long double target = theta[j] * int_all[power];
        theta[j] = solve_inverse_int_of_power_sin(target, power);
    }
}

/*
  Store sin(theta[k]) and cos(theta[k]) for k = 0..dm1-1.  Each angle is
  narrowed to double before the trigonometric functions are applied.

  FIXME: This function is not vectorized well because it uses "long double"
  and because vectorization requires -funsafe-math-optimizations
*/
static void
compute_sin_cos_theta(const long double * restrict theta, dimension_t dm1,
                      double * restrict sin_theta, double * restrict cos_theta)
{
    for (size_t k = 0; k < dm1; k++) {
        const double angle = STATIC_CAST(double, theta[k]);
        sin_theta[k] = sin(angle);
        cos_theta[k] = cos(angle);
    }
}

/**
   Turn the dim - 1 angles (given through their sines and cosines) into a
   direction w on the unit sphere and store the reciprocal of each component:

   w_0 = \prod_{k=0}^{dim - 2} sin(theta_k)
   w_j = cos(theta_{dim - j - 1}) * \prod_{k=0}^{dim - j - 2} sin(theta_k),  1 <= j <= dim - 1

   (for j = dim - 1 the product is empty, so w_{dim-1} = cos(theta_0)).

   On return direction[j] = 1 / w_j, or 1 / ALMOST_ZERO_WEIGHT when
   |w_j| <= ALMOST_ZERO_WEIGHT.
*/
static void
compute_hua_wang_direction(double * restrict direction, dimension_t dim,
                           const double * restrict sin_theta, const double * restrict cos_theta)
{
    ASSUME(2 <= dim && dim <= MOOCORE_DIMENSION_MAX);
    const dimension_t last = STATIC_CAST(dimension_t, dim - 1);

    direction[last] = cos_theta[0];
    // sin_prod holds sin(theta_0) * ... * sin(theta_{i-1}) on entry to
    // iteration i; components are filled from the back towards the front.
    double sin_prod = sin_theta[0];
    for (dimension_t i = 1; i < last; i++) {
        direction[last - i] = sin_prod * cos_theta[i];
        sin_prod = sin_theta[i] * sin_prod;
    }
    direction[0] = sin_prod;

    for (dimension_t j = 0; j < dim; j++) {
        assert(direction[j] >= 0);
        // FIXME: Can direction[j] be negative? If not, then we don't need fabs().
        if (fabs(direction[j]) <= ALMOST_ZERO_WEIGHT)
            direction[j] = 1. / ALMOST_ZERO_WEIGHT;
        else
            direction[j] = 1. / direction[j];
    }
}

/* Hypervolume approximation DZ2019-HW (deterministic set of directions).

   Jingda Deng, Qingfu Zhang (2019). “Approximating Hypervolume and Hypervolume
   Contributions Using Polar Coordinate.” IEEE Transactions on Evolutionary
   Computation, 23(5), 913–918. doi:10.1109/tevc.2019.2895108 .

   data       row-major array of npoints points with dim objectives each.
   ref        reference point (dim values).
   maximise   per-objective flag telling whether the objective is maximised.
   nsamples   number of directions to evaluate.

   Returns 0 when no point is strictly better than ref in every objective;
   otherwise the mean over all directions scaled by
   sphere_area_div_2_pow_d_times_d[dim].
*/
double
hv_approx_hua_wang(const double * restrict data, size_t npoints, dimension_t dim,
                   const double * restrict ref, const bool * restrict maximise,
                   uint_fast32_t nsamples)
{
    ASSUME(2 <= dim && dim <= MOOCORE_DIMENSION_MAX);
    double * shifted = transform_and_filter(data, &npoints, dim, ref, maximise);
    if (!shifted)
        return 0;

    // A direction in dim dimensions is described by dim - 1 angles.
    const dimension_t nangles = STATIC_CAST(dimension_t, dim - 1);
    long double * int_all = compute_int_all(nangles);
    uint_fast32_t * polar_a = construct_polar_a(nangles, nsamples);
    double sum = 0.0;

    // FIXME: OpenMP: #pragma omp parallel
    // Scratch buffers reused for every sample.
    long double * theta = malloc(nangles * sizeof(*theta));
    double * sin_theta = malloc(nangles * sizeof(*sin_theta));
    double * cos_theta = malloc(nangles * sizeof(*cos_theta));
    double * inv_w = malloc(dim * sizeof(*inv_w));
    // FIXME: Add OpenMP: #pragma omp for reduction(+:expected)
    for (uint_fast32_t s = 0; s < nsamples; s++) {
        // Sample point -> angles -> sines/cosines -> inverse direction.
        compute_polar_sample(theta, nangles, s, nsamples, polar_a);
        compute_theta(theta, dim, int_all);
        compute_sin_cos_theta(theta, nangles, sin_theta, cos_theta);
        compute_hua_wang_direction(inv_w, dim, sin_theta, cos_theta);
        sum += get_expected_value(shifted, npoints, dim, inv_w);
    }
    free(theta);
    free(sin_theta);
    free(cos_theta);
    free(inv_w);

    free(int_all);
    free(polar_a);
    free(shifted);

    const long double c_m = sphere_area_div_2_pow_d_times_d[dim];
    const long double mean = sum / STATIC_CAST(long double, nsamples);
    return STATIC_CAST(double, c_m * mean);
}
