• a MSVC and glibc-compatible fmod()

    From Bonita Montero@Bonita.Montero@gmail.com to comp.lang.c++ on Mon Feb 24 11:48:08 2025
    From Newsgroup: comp.lang.c++

    I wanted to optimize fmod to be a bit faster. This is my C++20 solution.

    double myFmod( double x, double y )
    {
    constexpr uint64_t
    SIGN = 1ull << 63,
    IMPLICIT = 1ull << 52,
    MANT = IMPLICIT - 1,
    QBIT = 1ull << 51;
    uint64_t const
    binX = bit_cast<uint64_t>( x ),
    binY = bit_cast<uint64_t>( y );
    static auto abs = []( uint64_t m ) { return m & ~SIGN; };
    auto isNaN = []( uint64_t m ) { return abs( m ) >= 0x7FF0000000000001u; };
    auto isSig = []( uint64_t m ) { return !(m & QBIT); };
    if( isNaN( binX ) ) [[unlikely]] // x == NaN
    #if defined(_MSC_VER)
    return bit_cast<double>( isNaN( binY ) ? binY | binX & binY & QBIT :
    binX );
    #else
    {
    if( isSig( binX ) || isNaN( binY ) && isSig( binY ) ) [[unlikely]]
    feraiseexcept( FE_INVALID );
    return bit_cast<double>( binX | QBIT );
    }
    #endif
    if( isNaN( binY ) ) [[unlikely]] // x != NaN || y == NaN
    #if defined(_MSC_VER)
    return y;
    #else
    {
    if( isSig( binY ) ) [[unlikely]]
    feraiseexcept( FE_INVALID );
    return bit_cast<double>( binY | QBIT );
    }
    #endif
    auto isInf = []( uint64_t m ) { return abs( m ) == 0x7FF0000000000000u; };
    if( isInf( binX ) ) // x == Inf
    {
    feraiseexcept( FE_INVALID );
    #if defined(_MSC_VER)
    return bit_cast<double>( binX & ~MANT | QBIT );
    #else
    return -numeric_limits<double>::quiet_NaN();
    #endif
    }
    if( !abs( binY ) ) [[unlikely]] // y == 0
    {
    feraiseexcept( FE_INVALID );
    #if defined(_MSC_VER)
    return numeric_limits<double>::quiet_NaN();
    #else
    return -numeric_limits<double>::quiet_NaN();
    #endif
    }
    if( !abs( binX ) || isInf( binY ) ) [[unlikely]] // x == 0 || y == Inf
    return x;
    auto exp = []( uint64_t b ) -> int { return b >> 52 & 0x7FF; };
    int
    expX = exp( binX ),
    expY = exp( binY );
    auto mant = []( uint64_t b ) { return b & MANT; };
    uint64_t
    mantX = mant( binX ),
    mantY = mant( binY );
    static auto normalize = []( int &exp, uint64_t &mant )
    {
    unsigned shift = countl_zero( mant ) - 11;
    mant <<= shift;
    exp -= shift;
    };
    auto build = []( int &exp, uint64_t &mant )
    {
    if( exp ) [[likely]]
    mant |= IMPLICIT;
    else
    {
    exp = 1;
    normalize( exp, mant );
    }
    };
    build( expX, mantX );
    build( expY, mantY );
    uint64_t signX = binX & SIGN;
    int expDiff;
    while( (expDiff = expX - expY) > 0 )
    {
    unsigned bits = expDiff <= 11 ? expDiff : 11;
    if( !(mantX = (mantX << bits) % mantY) ) [[unlikely]]
    return bit_cast<double>( signX );
    expX -= bits;
    normalize( expX, mantX );
    }
    if( !expDiff && mantX >= mantY ) [[unlikely]]
    if( (mantX -= mantY) ) [[likely]]
    normalize( expX, mantX );
    else
    return bit_cast<double>( signX );
    if( expX <= 0 ) [[unlikely]]
    {
    assert(expX >= -51);
    mantX = mantX >> (unsigned)(-expX + 1);
    expX = 0;
    }
    return bit_cast<double>( signX | (uint64_t)expX << 52 | mantX & MANT ); }

    It's about six times faster than the glibc 2.31 solution in my
    benchmark. The returned NaNs and the raised exceptions are MSVC-
    and glibc-compatible.

    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Muttley@Muttley@DastardlyHQ.org to comp.lang.c++ on Mon Feb 24 12:00:30 2025
    From Newsgroup: comp.lang.c++

    On Mon, 24 Feb 2025 11:48:08 +0100
    Bonita Montero <Bonita.Montero@gmail.com> wibbled:
    I wanted to optimize fmod to be a bit faster. This is my C++20 solution.

    double myFmod( double x, double y )
    [snip]

    It's about six times faster than the glibc 2.31 solution in my
    benchmark. The returned NaNs and the raised exceptions are MSVC-
    and glibc-compatible.

    double myFmod(double x, double y)
    {
    double div = x / y;
    return y * (div - (long)div);
    }


    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Bonita Montero@Bonita.Montero@gmail.com to comp.lang.c++ on Mon Feb 24 13:48:02 2025
    From Newsgroup: comp.lang.c++

    Am 24.02.2025 um 13:00 schrieb Muttley@DastardlyHQ.org:
    On Mon, 24 Feb 2025 11:48:08 +0100
    Bonita Montero <Bonita.Montero@gmail.com> wibbled:
    I wanted to optimize fmod to be a bit faster. This is my C++20 solution.

    double myFmod( double x, double y )
    [snip]

    It's about six times faster than the glibc 2.31 solution in my
    benchmark. The returned NaNs and the raised exceptions are MSVC-
    and glibc-compatible.

    double myFmod(double x, double y)
    {
    double div = x / y;
    return y * (div - (long)div);
    }

    If the exponent difference between x and y is large enough this
    returns results which are larger than y. glibc does it completely
    with integer-operations also.

    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Muttley@Muttley@DastardlyHQ.org to comp.lang.c++ on Mon Feb 24 13:09:50 2025
    From Newsgroup: comp.lang.c++

    On Mon, 24 Feb 2025 13:48:02 +0100
    Bonita Montero <Bonita.Montero@gmail.com> wibbled:
    Am 24.02.2025 um 13:00 schrieb Muttley@DastardlyHQ.org:
    On Mon, 24 Feb 2025 11:48:08 +0100
    Bonita Montero <Bonita.Montero@gmail.com> wibbled:
    I wanted to optimize fmod to be a bit faster. This is my C++20 solution. >>>
    double myFmod( double x, double y )
    [snip]

    It's about six times faster than the glibc 2.31 solution in my
    benchmark. The returned NaNs and the raised exceptions are MSVC-
    and glibc-compatible.

    double myFmod(double x, double y)
    {
    double div = x / y;
    return y * (div - (long)div);
    }

    If the exponent difference between x and y is large enough this
    returns results which are larger than y. glibc does it completely
    with integer-operations also.

    If the values are so large or small that you start to get floating point
    based errors then you should probably be using integer arithmetic or a large-number library like GMP anyway.

    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Bonita Montero@Bonita.Montero@gmail.com to comp.lang.c++ on Mon Feb 24 14:22:04 2025
    From Newsgroup: comp.lang.c++

    Am 24.02.2025 um 14:09 schrieb Muttley@DastardlyHQ.org:

    If the values are so large or small that you start to get floating point based errors then you should probably be using integer arthmetic or a large number library anyway like GMP anyway.

    There's no need for large integer arithmetics since each calculation
    step results in a mantissa with equal or less bits than the divisor.
    Even if the exponents are close enough to have not missing integer
    -bits the following multiplication is very likely to have a precision
    -loss. All current solutions (MSVC, libstdc++) work with integer-
    operations and are always 100% precise.


    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Michael S@already5chosen@yahoo.com to comp.lang.c++ on Mon Feb 24 16:22:46 2025
    From Newsgroup: comp.lang.c++

    On Mon, 24 Feb 2025 13:09:50 -0000 (UTC)
    Muttley@DastardlyHQ.org wrote:

    On Mon, 24 Feb 2025 13:48:02 +0100
    Bonita Montero <Bonita.Montero@gmail.com> wibbled:
    Am 24.02.2025 um 13:00 schrieb Muttley@DastardlyHQ.org:
    On Mon, 24 Feb 2025 11:48:08 +0100
    Bonita Montero <Bonita.Montero@gmail.com> wibbled:
    I wanted to optimize fmod to be a bit faster. This is my C++20
    solution.

    double myFmod( double x, double y )
    [snip]

    It's about six times faster than the glibc 2.31 solution in my
    benchmark. The returned NaNs and the raised exceptions are MSVC-
    and glibc-compatible.

    double myFmod(double x, double y)
    {
    double div = x / y;
    return y * (div - (long)div);
    }

    If the exponent difference between x and y is large enough this
    returns results which are larger than y. glibc does it completely
    with integer-operations also.

    If the values are so large or small that you start to get floating
    point based errors then you should probably be using integer
    arthmetic or a large number library anyway like GMP anyway.


    Your method will sometimes produce results that are 1 LSB off
    relatively to IEEE-754 prescription when values are neither small nor
    large.
    And sometimes 1 LSB off means that result is 2x off.
    For example, for x=0.9999999999999999, y=0.9999999999999998 your method produces 2.2204460492503126e-16. A correct result is, of course, 1.1102230246251565e-16

    Also, I don't think that your method is any faster than correct methods.


    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Muttley@Muttley@DastardlyHQ.org to comp.lang.c++ on Mon Feb 24 15:10:53 2025
    From Newsgroup: comp.lang.c++

    On Mon, 24 Feb 2025 16:22:46 +0200
    Michael S <already5chosen@yahoo.com> wibbled:
    On Mon, 24 Feb 2025 13:09:50 -0000 (UTC)
    Muttley@DastardlyHQ.org wrote:
    If the values are so large or small that you start to get floating
    point based errors then you should probably be using integer
    arthmetic or a large number library anyway like GMP anyway.


    Your method will sometimes produce results that are 1 LSB off
    relatively to IEEE-754 prescription when values are neither small nor
    large.
    And sometimes 1 LSB off means that result is 2x off.
    For example, for x=0.9999999999999999, y=0.9999999999999998 your method >produces 2.2204460492503126e-16. A correct result is, of course, >1.1102230246251565e-16

    Frankly I doubt anyone would care, its zero in all but name.

    Also, I don't think that your method is any faster than correct methods.

    Don't know, but its only 3 mathematical operations all of which can be done
    by the hardware so its going to be pretty fast.


    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Michael S@already5chosen@yahoo.com to comp.lang.c++ on Mon Feb 24 17:13:43 2025
    From Newsgroup: comp.lang.c++

    On Mon, 24 Feb 2025 14:22:04 +0100
    Bonita Montero <Bonita.Montero@gmail.com> wrote:

    Am 24.02.2025 um 14:09 schrieb Muttley@DastardlyHQ.org:

    If the values are so large or small that you start to get floating
    point based errors then you should probably be using integer
    arthmetic or a large number library anyway like GMP anyway.

    There's no need for large integer arithmetics since each calculation
    step results in a mantissa with equal or less bits than the divisor.
    Even if the exponents are close enough to have not missing integer
    -bits the following multiplication is very likely to have a precision
    -loss. All current solutions (MSVC, libstdc++) work with integer-
    operations and are always 100% precise.



    Do you have real application for fast fmod() or just playing?

    For as long as y is positive and abs(x/y) <= 2**53, a very simple
    formula will produce precise result: fma(trunc(x/fabs(y)), -fabs(y), x).



    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Bonita Montero@Bonita.Montero@gmail.com to comp.lang.c++ on Mon Feb 24 16:19:35 2025
    From Newsgroup: comp.lang.c++

    Am 24.02.2025 um 16:13 schrieb Michael S:

    Do you have real application for fast fmod() or just playing?

    I experimented with x87 FPREM and wanted to know whether it is precise;
    it isn't and the results can be > y. So I developed my own routine which
    is always 100% precise.

    For as long as y is positive and abs(x/y) <= 2**53, a very simple
    formula will produce precise result: fma(trunc(x/fabs(y)), -fabs(y), x).

    The multiplication mostly will drop bits so that the difference might
    become larger than y.

    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Michael S@already5chosen@yahoo.com to comp.lang.c++ on Mon Feb 24 17:33:45 2025
    From Newsgroup: comp.lang.c++

    On Mon, 24 Feb 2025 15:10:53 -0000 (UTC)
    Muttley@DastardlyHQ.org wrote:

    On Mon, 24 Feb 2025 16:22:46 +0200
    Michael S <already5chosen@yahoo.com> wibbled:
    On Mon, 24 Feb 2025 13:09:50 -0000 (UTC)
    Muttley@DastardlyHQ.org wrote:
    If the values are so large or small that you start to get floating
    point based errors then you should probably be using integer
    arthmetic or a large number library anyway like GMP anyway.


    Your method will sometimes produce results that are 1 LSB off
    relatively to IEEE-754 prescription when values are neither small nor >large.
    And sometimes 1 LSB off means that result is 2x off.
    For example, for x=0.9999999999999999, y=0.9999999999999998 your
    method produces 2.2204460492503126e-16. A correct result is, of
    course, 1.1102230246251565e-16

    Frankly I doubt anyone would care, its zero in all but name.

    Also, I don't think that your method is any faster than correct
    methods.

    Don't know, but its only 3 mathematical operations all of which can
    be done by the hardware so its going to be pretty fast.



    Looks like 4 operations to me - division, truncation, subtraction, multiplication. If compiler takes it literally, which he probably
    should if compiled without special non-standard-conforming flags like -fast-math, then there are 5 operations - double->int and
    int->double conversions instead of truncation

    Nevertheless, after a bit of thinking I concur that your formula is
    faster than 100% correct methods. Initially, I didn't take into account
    all difficulties that correct methods have to face in cases of very
    large x to y ratios.

    However your method is approximately the same speed as *mostly correct*
    method shown in my post above. May be, yours is even a little slower,
    at least as long as we use good optimizing compiler and target modern
    CPUs that support trunc() and fma() as fast hardware instructions.



    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Michael S@already5chosen@yahoo.com to comp.lang.c++ on Mon Feb 24 17:52:58 2025
    From Newsgroup: comp.lang.c++

    On Mon, 24 Feb 2025 16:19:35 +0100
    Bonita Montero <Bonita.Montero@gmail.com> wrote:

    Am 24.02.2025 um 16:13 schrieb Michael S:

    Do you have real application for fast fmod() or just playing?

    I experimented with x87 FPREM and wanted to know whether it is
    precise; it isn't

    That is not surprising. It wouldn't be called *partial* remainder
    otherwise.

    and the results can be > y.

    That's a little unexpected, but mentioned in the Intel and AMD manuals.
    It can happen when abs(x/y) > 2**32. If you think about how FPREM
    works, you'd realize that for abs(x/y) > 2**63 it is a necessity.

    Still rem(x,y) == rem(fprem(x,y), y), so this unusual property does not
    prevent FPREM from getting correct answer when we run it in loop.
    But in the worst case loop can take something like 1000 iterations
    :(

    So I developed my own
    routine which is always 100% precise.

    For as long as y is positive

    Actually, formula appears to work for negative y as well.

    and abs(x/y) <= 2**53, a very simple
    formula will produce precise result: fma(trunc(x/fabs(y)),
    -fabs(y), x).

    The multiplication mostly will drop bits so that the difference might
    become larger than y.


    That is why I don't use multiplication. Did you ever asked yourself
    what is the meaning of 'f' in 'fma' ?


    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Muttley@Muttley@DastardlyHQ.org to comp.lang.c++ on Mon Feb 24 16:48:10 2025
    From Newsgroup: comp.lang.c++

    On Mon, 24 Feb 2025 17:33:45 +0200
    Michael S <already5chosen@yahoo.com> wibbled:
    On Mon, 24 Feb 2025 15:10:53 -0000 (UTC)
    Muttley@DastardlyHQ.org wrote:
    Don't know, but its only 3 mathematical operations all of which can
    be done by the hardware so its going to be pretty fast.



    Looks like 4 operations to me - division, truncation, subtraction, >multiplication. If compiler takes it literally, which he probably

    Yes, I should have included the cast. Not sure whether that could be done
    in hardware or not, my assembler knowledge - for x86 - is way too rusty.

    Nevertheless, after a bit of thinking I concur that your formula is
    faster than 100% correct methods. Initially, I didn't took into account
    all difficulties that correct methods have to face in cases of very
    large x to y ratios.

    In those sorts of cases the result of your program will be running into floating point precision errors elsewhere so IMO its somewhat moot.

    However your method is approximately the same speed as *mostly correct* >method shown in my post above. May be, yours is even a little slower,
    at least as long as we use good optimizing compiler and target modern
    CPUs that support trunc() and fma() as fast hardware instructions.

    The only way to really know would be to test it on various OS's and CPUs
    I guess.

    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Mr Flibble@leigh@i42.co.uk to comp.lang.c++ on Mon Feb 24 22:21:03 2025
    From Newsgroup: comp.lang.c++

    On Mon, 24 Feb 2025 11:48:08 +0100, Bonita Montero
    <Bonita.Montero@gmail.com> wrote:

    I wanted to optimize fmod to be a bit faster. This is my C++20 solution.

    double myFmod( double x, double y )
    {
    constexpr uint64_t
    SIGN = 1ull << 63,
    IMPLICIT = 1ull << 52,
    MANT = IMPLICIT - 1,
    QBIT = 1ull << 51;
    uint64_t const
    binX = bit_cast<uint64_t>( x ),
    binY = bit_cast<uint64_t>( y );
    static auto abs = []( uint64_t m ) { return m & ~SIGN; };
    auto isNaN = []( uint64_t m ) { return abs( m ) >= 0x7FF0000000000001u; }; auto isSig = []( uint64_t m ) { return !(m & QBIT); };
    if( isNaN( binX ) ) [[unlikely]] // x == NaN
    #if defined(_MSC_VER)
    return bit_cast<double>( isNaN( binY ) ? binY | binX & binY & QBIT :
    binX );
    #else
    {
    if( isSig( binX ) || isNaN( binY ) && isSig( binY ) ) [[unlikely]]
    feraiseexcept( FE_INVALID );
    return bit_cast<double>( binX | QBIT );
    }
    #endif
    if( isNaN( binY ) ) [[unlikely]] // x != NaN || y == NaN
    #if defined(_MSC_VER)
    return y;
    #else
    {
    if( isSig( binY ) ) [[unlikely]]
    feraiseexcept( FE_INVALID );
    return bit_cast<double>( binY | QBIT );
    }
    #endif
    auto isInf = []( uint64_t m ) { return abs( m ) == 0x7FF0000000000000u; }; if( isInf( binX ) ) // x == Inf
    {
    feraiseexcept( FE_INVALID );
    #if defined(_MSC_VER)
    return bit_cast<double>( binX & ~MANT | QBIT );
    #else
    return -numeric_limits<double>::quiet_NaN();
    #endif
    }
    if( !abs( binY ) ) [[unlikely]] // y == 0
    {
    feraiseexcept( FE_INVALID );
    #if defined(_MSC_VER)
    return numeric_limits<double>::quiet_NaN();
    #else
    return -numeric_limits<double>::quiet_NaN();
    #endif
    }
    if( !abs( binX ) || isInf( binY ) ) [[unlikely]] // x == 0 || y == Inf
    return x;
    auto exp = []( uint64_t b ) -> int { return b >> 52 & 0x7FF; };
    int
    expX = exp( binX ),
    expY = exp( binY );
    auto mant = []( uint64_t b ) { return b & MANT; };
    uint64_t
    mantX = mant( binX ),
    mantY = mant( binY );
    static auto normalize = []( int &exp, uint64_t &mant )
    {
    unsigned shift = countl_zero( mant ) - 11;
    mant <<= shift;
    exp -= shift;
    };
    auto build = []( int &exp, uint64_t &mant )
    {
    if( exp ) [[likely]]
    mant |= IMPLICIT;
    else
    {
    exp = 1;
    normalize( exp, mant );
    }
    };
    build( expX, mantX );
    build( expY, mantY );
    uint64_t signX = binX & SIGN;
    int expDiff;
    while( (expDiff = expX - expY) > 0 )
    {
    unsigned bits = expDiff <= 11 ? expDiff : 11;
    if( !(mantX = (mantX << bits) % mantY) ) [[unlikely]]
    return bit_cast<double>( signX );
    expX -= bits;
    normalize( expX, mantX );
    }
    if( !expDiff && mantX >= mantY ) [[unlikely]]
    if( (mantX -= mantY) ) [[likely]]
    normalize( expX, mantX );
    else
    return bit_cast<double>( signX );
    if( expX <= 0 ) [[unlikely]]
    {
    assert(expX >= -51);
    mantX = mantX >> (unsigned)(-expX + 1);
    expX = 0;
    }
    return bit_cast<double>( signX | (uint64_t)expX << 52 | mantX & MANT );
    }

    It's about six times faster than the glibc 2.31 solution in my
    benchmark. The returned NaNs and the raised exceptions are MSVC-
    and glibc-compatible.

    double myFmod(double x, double y)
    {
    double div = x / y;
    return y * (div - std::round(div));
    }

    /Flibble
    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Bonita Montero@Bonita.Montero@gmail.com to comp.lang.c++ on Tue Feb 25 07:37:23 2025
    From Newsgroup: comp.lang.c++

    Am 24.02.2025 um 23:21 schrieb Mr Flibble:

    double myFmod(double x, double y)
    {
    double div = x / y;
    return y * (div - std::round(div));
    }

    Doesn't work, not only for the reasons already mentioned.

    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Bonita Montero@Bonita.Montero@gmail.com to comp.lang.c++ on Tue Feb 25 09:09:21 2025
    From Newsgroup: comp.lang.c++

    Am 24.02.2025 um 16:52 schrieb Michael S:

    That is why I don't use multiplication. Did you ever asked yourself
    what is the meaning of 'f' in 'fma' ?

    The FMA-instructions produce the same results:

    #include <iostream>
    #include <random>
    #include <bit>
    #include <cmath>
    #include <iomanip>
    #include <intrin.h>

    using namespace std;

    int main()
    {
    auto fma = []( double a, double b, double c )
    {
    __m128d mA, mB, mC;
    mA.m128d_f64[0] = a;
    mB.m128d_f64[0] = b;
    mC.m128d_f64[0] = c;
    return _mm_fmadd_pd( mA, mB, mC ).m128d_f64[0];
    };
    mt19937_64 mt;
    uniform_int_distribution<uint64_t> finites( 1, 0x7FEFFFFFFFFFFFFFu );
    auto rnd = [&]() -> double { return bit_cast<double>( finites( mt ) ); };
    ptrdiff_t nEQs = 0;
    for( ptrdiff_t r = 0; r != 1'000'000; ++r )
    {
    double
    a = rnd(), b = rnd(), c = rnd(),
    rA = fma( a, b, c ),
    rB = a * b + c;
    nEQs = rA != rB;
    }
    cout << hexfloat << nEQs / 1.0e6 << endl;
    }

    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Muttley@Muttley@DastardlyHQ.org to comp.lang.c++ on Tue Feb 25 08:24:56 2025
    From Newsgroup: comp.lang.c++

    On Mon, 24 Feb 2025 22:21:03 +0000
    Mr Flibble <leigh@i42.co.uk> wibbled:
    On Mon, 24 Feb 2025 11:48:08 +0100, Bonita Montero
    <Bonita.Montero@gmail.com> wrote:
    It's about six times faster than the glibc 2.31 solution in my
    benchmark. The returned NaNs and the raised exceptions are MSVC-
    and glibc-compatible.

    double myFmod(double x, double y)
    {
    double div = x / y;
    return y * (div - std::round(div));
    }

    You don't ever want it rounded up, it must always just be the integer component.

    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Bonita Montero@Bonita.Montero@gmail.com to comp.lang.c++ on Tue Feb 25 16:17:22 2025
    From Newsgroup: comp.lang.c++

    Am 25.02.2025 um 09:09 schrieb Bonita Montero:
    Am 24.02.2025 um 16:52 schrieb Michael S:

    That is why I don't use multiplication. Did you ever asked yourself
    what is the meaning of 'f' in 'fma' ?

    The FMA-instructions produce the same results:

    #include <iostream>
    #include <random>
    #include <bit>
    #include <cmath>
    #include <iomanip>
    #include <intrin.h>

    using namespace std;

    int main()
    {
        auto fma = []( double a, double b, double c )
        {
            __m128d mA, mB, mC;
            mA.m128d_f64[0] = a;
            mB.m128d_f64[0] = b;
            mC.m128d_f64[0] = c;
            return _mm_fmadd_pd( mA, mB, mC ).m128d_f64[0];
        };
        mt19937_64 mt;
        uniform_int_distribution<uint64_t> finites( 1, 0x7FEFFFFFFFFFFFFFu );
        auto rnd = [&]() -> double { return
    bit_cast<double>( finites( mt ) ); };
        ptrdiff_t nEQs = 0;
        for( ptrdiff_t r = 0; r != 1'000'000; ++r )
        {
            double
                a = rnd(), b = rnd(), c = rnd(),
                rA = fma( a, b, c ),
                rB = a * b + c;
            nEQs = rA != rB;
        }
        cout << hexfloat << nEQs / 1.0e6 << endl;
    }


    There's a good reason to produce the same result: if a "a * b + c"
    -statement is replaced with a FMA-operation by the compiler you won't
    get different results depending on the compiler-setting. clang++ does
    this replacement if you chose the proper instruction set.
    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Michael S@already5chosen@yahoo.com to comp.lang.c++ on Tue Feb 25 17:26:18 2025
    From Newsgroup: comp.lang.c++

    On Tue, 25 Feb 2025 09:09:21 +0100
    Bonita Montero <Bonita.Montero@gmail.com> wrote:

    Am 24.02.2025 um 16:52 schrieb Michael S:

    That is why I don't use multiplication. Did you ever asked yourself
    what is the meaning of 'f' in 'fma' ?

    The FMA-instructions produce the same results:

    #include <iostream>
    #include <random>
    #include <bit>
    #include <cmath>
    #include <iomanip>
    #include <intrin.h>

    using namespace std;

    int main()
    {
    auto fma = []( double a, double b, double c )
    {
    __m128d mA, mB, mC;
    mA.m128d_f64[0] = a;
    mB.m128d_f64[0] = b;
    mC.m128d_f64[0] = c;
    return _mm_fmadd_pd( mA, mB, mC ).m128d_f64[0];
    };
    mt19937_64 mt;
    uniform_int_distribution<uint64_t> finites( 1,
    0x7FEFFFFFFFFFFFFFu ); auto rnd = [&]() -> double { return
    bit_cast<double>( finites( mt ) ); }; ptrdiff_t nEQs = 0;
    for( ptrdiff_t r = 0; r != 1'000'000; ++r )
    {
    double
    a = rnd(), b = rnd(), c = rnd(),
    rA = fma( a, b, c ),
    rB = a * b + c;
    nEQs = rA != rB;
    }
    cout << hexfloat << nEQs / 1.0e6 << endl;
    }


    GIGO.
    Do a proper test then you'd get a proper answer.

    fma.c:

    #include <stdio.h>
    #include <stdlib.h>
    #include <math.h>

    int main(int argz, char** argv)
    {
    if (argz < 4) {
    fprintf(stderr, "Usage:\nfma x y z\n");
    return 1;
    }

    double values[3];
    for (int i = 0; i < 3; ++i) {
    char* arg = argv[i+1];
    char* endp;
    values[i] = strtod(arg, &endp);
    if (arg == endp) {
    fprintf(stderr, "Bad argument '%s'. Not a number.\n", arg);
    return 1;
    }
    }

    double r1 = values[0]* values[1] + values[2];
    double r2 = fma(values[0], values[1], values[2]);
    printf(" %.17e * %.17e + %.17e = %.17e\n",
    values[0], values[1], values[2], r1);
    printf("fma(%.17e , %.17e , %.17e) = %.17e\n",
    values[0], values[1], values[2], r2);

    return 0;
    }


    $ gcc -O2 -Wall fma.c -o fma

    $ ./fma 1000000001 999999999 -1e18
    1.00000000100000000e+09 * 9.99999999000000000e+08 +
    -1.00000000000000000e+18 = 0.00000000000000000e+00 fma(1.00000000100000000e+09 , 9.99999999000000000e+08 ,
    -1.00000000000000000e+18) = -1.00000000000000000e+00


    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Bonita Montero@Bonita.Montero@gmail.com to comp.lang.c++ on Tue Feb 25 16:45:25 2025
    From Newsgroup: comp.lang.c++

    Am 25.02.2025 um 16:26 schrieb Michael S:
    On Tue, 25 Feb 2025 09:09:21 +0100
    Bonita Montero <Bonita.Montero@gmail.com> wrote:

    Am 24.02.2025 um 16:52 schrieb Michael S:

    That is why I don't use multiplication. Did you ever asked yourself
    what is the meaning of 'f' in 'fma' ?

    The FMA-instructions produce the same results:

    #include <iostream>
    #include <random>
    #include <bit>
    #include <cmath>
    #include <iomanip>
    #include <intrin.h>

    using namespace std;

    int main()
    {
    auto fma = []( double a, double b, double c )
    {
    __m128d mA, mB, mC;
    mA.m128d_f64[0] = a;
    mB.m128d_f64[0] = b;
    mC.m128d_f64[0] = c;
    return _mm_fmadd_pd( mA, mB, mC ).m128d_f64[0];
    };
    mt19937_64 mt;
    uniform_int_distribution<uint64_t> finites( 1,
    0x7FEFFFFFFFFFFFFFu ); auto rnd = [&]() -> double { return
    bit_cast<double>( finites( mt ) ); }; ptrdiff_t nEQs = 0;
    for( ptrdiff_t r = 0; r != 1'000'000; ++r )
    {
    double
    a = rnd(), b = rnd(), c = rnd(),
    rA = fma( a, b, c ),
    rB = a * b + c;
    nEQs = rA != rB;
    }
    cout << hexfloat << nEQs / 1.0e6 << endl;
    }


    GIGO.
    Do a proper test then you'd get a proper answer.

    The test is proper with MSVC since MSVC doesn't replace the
    "a * b + c"-operation with a FMA-operation. With your code
    it isn't guaranteed that the CPU-specific FMA-operations are
    used. I'm using the SSE FMA operation explicitly and I'm using
    it for a million random finite double-values.

    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Bonita Montero@Bonita.Montero@gmail.com to comp.lang.c++ on Tue Feb 25 16:57:29 2025
    From Newsgroup: comp.lang.c++

    This is a somewhat more improved code that generates a larger
    part of zeroes, denormals, infs and NaNs. It tests a billion
    random values for a, b and c. It is specified behaviour
    according to the Intel manuals that the result is the same.

    #include <iostream>
    #include <random>
    #include <bit>
    #include <cmath>
    #include <iomanip>
    #include <cassert>
    #include <intrin.h>

    using namespace std;

    int main()
    {
    auto fma = []( double a, double b, double c )
    {
    __m128d mA, mB, mC;
    mA.m128d_f64[0] = a;
    mB.m128d_f64[0] = b;
    mC.m128d_f64[0] = c;
    return _mm_fmadd_pd( mA, mB, mC ).m128d_f64[0];
    };
    mt19937_64 mt;
    uniform_int_distribution<uint64_t>
    genFinite( 0x0010000000000000u, 0x7FEFFFFFFFFFFFFFu ),
    genDen( 1, 0x000FFFFFFFFFFFFFu ),
    genNaN( 0x7FF0000000000001u, 0x7FFFFFFFFFFFFFFFu );
    auto get = [&]()
    {
    constexpr uint64_t
    FINITE_THRESH = 4, // 75% finites
    ZERO = 3, // 6.25% zeroes
    DENORMALS = 2, // 6.25% denormals
    INF = 1, // 6.25% infs
    NAN_THRESH = 0; // 6.25% NaNs
    uint64_t
    sign = mt() & numeric_limits<int64_t>::min(),
    type = mt() % 16;
    if( type >= FINITE_THRESH )
    return bit_cast<double>( sign | genFinite( mt ) );
    if( type == ZERO )
    return bit_cast<double>( sign );
    if( type == DENORMALS )
    return bit_cast<double>( sign | genDen( mt ) );
    if( type == INF )
    return bit_cast<double>( sign | 0x7FF0000000000000u );
    assert(type == NAN_THRESH);
    return bit_cast<double>( sign | genNaN( mt ) );
    };
    ptrdiff_t nEQs = 0;
    for( ptrdiff_t r = 0; r != 1'000'000'000; ++r )
    {
    double
    a = get(), b = get(), c = get(),
    rA = fma( a, b, c ),
    rB = a * b + c;
    nEQs = rA != rB;
    }
    cout << nEQs << endl;
    }
    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Michael S@already5chosen@yahoo.com to comp.lang.c++ on Tue Feb 25 19:17:15 2025
    From Newsgroup: comp.lang.c++

    On Tue, 25 Feb 2025 16:45:25 +0100
    Bonita Montero <Bonita.Montero@gmail.com> wrote:

    Am 25.02.2025 um 16:26 schrieb Michael S:
    On Tue, 25 Feb 2025 09:09:21 +0100
    Bonita Montero <Bonita.Montero@gmail.com> wrote:

    Am 24.02.2025 um 16:52 schrieb Michael S:

    That is why I don't use multiplication. Did you ever asked
    yourself what is the meaning of 'f' in 'fma' ?

    The FMA-instructions produce the same results:

    #include <iostream>
    #include <random>
    #include <bit>
    #include <cmath>
    #include <iomanip>
    #include <intrin.h>

    using namespace std;

    int main()
    {
    auto fma = []( double a, double b, double c )
    {
    __m128d mA, mB, mC;
    mA.m128d_f64[0] = a;
    mB.m128d_f64[0] = b;
    mC.m128d_f64[0] = c;
    return _mm_fmadd_pd( mA, mB, mC ).m128d_f64[0];
    };
    mt19937_64 mt;
    uniform_int_distribution<uint64_t> finites( 1,
    0x7FEFFFFFFFFFFFFFu ); auto rnd = [&]() -> double { return
    bit_cast<double>( finites( mt ) ); }; ptrdiff_t nEQs = 0;
    for( ptrdiff_t r = 0; r != 1'000'000; ++r )
    {
    double
    a = rnd(), b = rnd(), c = rnd(),
    rA = fma( a, b, c ),
    rB = a * b + c;
    nEQs = rA != rB;
    }
    cout << hexfloat << nEQs / 1.0e6 << endl;
    }


    GIGO.
    Do a proper test then you'd get a proper answer.

    The test is proper with MSVC since MSVC doesn't replace the
    "a * b + c"-operation with a FMA-operation.

    Don't invent your own fma(). Use one provided by library.
    Then MSVC will do what it is prescribed to do by the standard.

    With your code
    it isn't guaranteed that the CPU-specific FMA-operations are
    used.

    Correct. And it does not matter. When run on CPU without HW FMA, it
    would be slower. But would still produce a right result.

    I'm using the SSE FMA operation explicitly and I'm using
    it for a million random finite double-values.


    Originally, I didn't even try to investigate what garbage exactly you
    are feeding to your test. Now I took a look. It seems that you are
    doing something fundamentally stupid, like all fma inputs positive.
    Of course, for all positive inputs the (fma(x,y,z) != x*y+z) is quite
    rare. It still happens sometimes, but it's likely that for your chosen distribution it happens less often than once per million.
    OTOH, when x*y and z have different signs and similar magnitude, it
    happens all the time. Still, for your stupidly chosen distribution the
    similar magnitude is probably quite rare too.






    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Bonita Montero@Bonita.Montero@gmail.com to comp.lang.c++ on Tue Feb 25 18:58:23 2025
    From Newsgroup: comp.lang.c++

    Am 25.02.2025 um 18:17 schrieb Michael S:

    Don't invent your own fma(). Use one provided by library.
    Then MSVC will do what it is prescribed to do by the standard.

    I want to be sure that I'm using the SSE FMA operation and
    not a conventional substitute of two instructions.

    Originally, I didn't even try to investigate what garbage exactly you
    are feeding to your test. Now I took a look. It seems that you are
    doing something fundamentally stupid, like all fma inputs positive.

    I extended the test to incorporate all possible double
    bit representations (taken from mt()) - with no difference.

    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Michael S@already5chosen@yahoo.com to comp.lang.c++ on Wed Feb 26 00:36:19 2025
    From Newsgroup: comp.lang.c++

    On Tue, 25 Feb 2025 18:58:23 +0100
    Bonita Montero <Bonita.Montero@gmail.com> wrote:

    Am 25.02.2025 um 18:17 schrieb Michael S:

    Don't invent your own fma(). Use one provided by library.
    Then MSVC will do what it is prescribed to do by the standard.

    I want to be sure that I'm using the SSE FMA operation

    SSEn has no FMA operations.
    FMA was introduced as part of the AVX series, simultaneously with AVX2.
    In practice all CPUs that support AVX2 also support FMA, but
    from a formal perspective AVX2 and FMA are different extensions.

    and
    not a conventional substitute of two instructions.


    fma() is guaranteed to do a right thing by standard.
    It can not be substituted by two instructions. It's either one
    instruction or many, likely over dozen. Never two.

    Originally, I didn't even try to investigate what garbage exactly
    you are feeding to your test. Now I took a look. It seems that you
    are doing something fundamentally stupid, like all fma inputs
    positive.

    I extended the test to incorporate all possible double
    bit representations (taken from mt()) - with no difference.


    What you wrote makes no sense. For positive double-precision x, y and
    z there are 2**189 possible combinations. You can't check all of them even
    if you were given all the computers of the world for a millennium.

    However now I see that there indeed is a bug in Microsoft's
    implementation of fma() library routine (also used by gcc on msys2).
    When programs linked with their dynamic library run on hardware with FMA instructions then everything works correctly.

    Same programs on hardware without FMA mostly produce correct results
    when x*y and z differ in sign.
    But when x*y and z have the same sign then [on hardware without FMA] Microsoft's routine appears to do non-fused calculations.

    Here is an example of the program that prints 250508 on Intel Haswell
    CPU, but prints 0 on Intel Ivy Bridge.
    Compiled as 'cl -O1 -W4 -MD fma_tst0.c'.

    #include <stdio.h>
    #include <stdlib.h>
    #include <math.h>

    static
    unsigned long long rnd(void)
    {
    unsigned long long x = 0;
    for (int i = 0; i < 5; ++i)
    x = (x << 15) + (rand() & 0x7FFF);
    return x;
    }

    int main(void)
    {
    srand(1);
    int n = 0;
    for (int i = 0; i < 1000000; ++i) {
    double x = rnd() * 0x1p-64;
    double y = rnd() * 0x1p-64;
    double z = rnd() * 0x1p-114;
    double r1 = x*y + z;
    double r2 = fma(x, y, z);
    n += r1 != r2;
    }
    printf("%d\n", n);
    return 0;
    }

    It is certainly worth a bug report, but I am afraid that Microsoft will
    do nothing to fix it, likely claiming that they don't care about old
    hardware.



    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Mr Flibble@leigh@i42.co.uk to comp.lang.c++ on Tue Feb 25 23:03:04 2025
    From Newsgroup: comp.lang.c++

    On Tue, 25 Feb 2025 07:37:23 +0100, Bonita Montero
    <Bonita.Montero@gmail.com> wrote:

    Am 24.02.2025 um 23:21 schrieb Mr Flibble:

    double myFmod(double x, double y)
    {
    double div = x / y;
    return y * (div - std::round(div));
    }

    Doesn't work, not only for the reasons already mentioned.

    double myFmod(double x, double y)
    {
    double div = x / y;
    return y * (div - std::trunc(div));
    }

    /Flibble
    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Michael S@already5chosen@yahoo.com to comp.lang.c++ on Wed Feb 26 02:00:48 2025
    From Newsgroup: comp.lang.c++

    On Tue, 25 Feb 2025 23:03:04 +0000
    Mr Flibble <leigh@i42.co.uk> wrote:

    On Tue, 25 Feb 2025 07:37:23 +0100, Bonita Montero
    <Bonita.Montero@gmail.com> wrote:

    Am 24.02.2025 um 23:21 schrieb Mr Flibble:

    double myFmod(double x, double y)
    {
    double div = x / y;
    return y * (div - std::round(div));
    }

    Doesn't work, not only for the reasons already mentioned.

    double myFmod(double x, double y)
    {
    double div = x / y;
    return y * (div - std::trunc(div));
    }

    /Flibble


    Even ignoring potential overflow during division, this method is
    very imprecise.
    (1e3/9 - trunc(1e3/9))*9 = 1.000000000000028
    (1e6/9 - trunc(1e6/9))*9 = 0.999999999985448
    (1e9/9 - trunc(1e9/9))*9 = 0.999999940395355
    (1e12/9 - trunc(1e12/9))*9 = 1.000030517578125
    (1e15/9 - trunc(1e15/9))*9 = 0.984375

    OTOH
    1e3/9 - trunc(1e3/9)*9 = 1
    1e6/9 - trunc(1e6/9)*9 = 1
    1e9/9 - trunc(1e9/9)*9 = 1
    1e12/9 - trunc(1e12/9)*9 = 1
    1e15/9 - trunc(1e15/9)*9 = 1


    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Wuns Haerst@Wuns.Haerst@wurstfabrik.at to comp.lang.c++ on Wed Feb 26 04:01:06 2025
    From Newsgroup: comp.lang.c++

    Am 26.02.2025 um 00:03 schrieb Mr Flibble:

    double myFmod(double x, double y)
    {
    double div = x / y;
    return y * (div - std::trunc(div));
    }

    Produces a lot of erroneous results.

    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Bonita Montero@Bonita.Montero@gmail.com to comp.lang.c++ on Wed Feb 26 04:15:33 2025
    From Newsgroup: comp.lang.c++

    Am 25.02.2025 um 23:36 schrieb Michael S:

    SSEn has no FMA operations.

    FMA3 is an extension to SSE as well as AVX. As you can see from my
    code I'm using the __m128d data type, which is a SSE and not an AVX
    data type. There are also variants for AVX (ymm) and AVX-512 (zmm)
    data types.

    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Muttley@Muttley@DastardlyHQ.org to comp.lang.c++ on Wed Feb 26 08:16:01 2025
    From Newsgroup: comp.lang.c++

    On Wed, 26 Feb 2025 00:36:19 +0200
    Michael S <already5chosen@yahoo.com> wibbled:
    Here is an example of the program that prints 250508 on Intel Haswell
    CPU, but prints 0 on Intel Ivy Bridge.
    Compiled as 'cl -O1 -W4 -MD fma_tst0.c'.

    #include <stdio.h>
    #include <stdlib.h>
    #include <math.h>

    static
    unsigned long long rnd(void)
    {
    unsigned long long x = 0;
    for (int i = 0; i < 5; ++i)
    x = (x << 15) + (rand() & 0x7FFF);
    return x;
    }

    int main(void)
    {
    srand(1);
    int n = 0;
    for (int i = 0; i < 1000000; ++i) {
    double x = rnd() * 0x1p-64;
    double y = rnd() * 0x1p-64;
    double z = rnd() * 0x1p-114;
    double r1 = x*y + z;
    double r2 = fma(x, y, z);
    n += r1 != r2;
    }
    printf("%d\n", n);
    return 0;
    }

    It is certainly worth a bug report, but I am afraid that Microsoft will
    do nothing to fix it, likely claiming that they don't care about old hardware.

    Just FYI - it also returns 0 when compiled by Clang on an ARM Mac.

    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Michael S@already5chosen@yahoo.com to comp.lang.c++ on Wed Feb 26 14:27:21 2025
    From Newsgroup: comp.lang.c++

    On Wed, 26 Feb 2025 08:16:01 -0000 (UTC)
    Muttley@DastardlyHQ.org wrote:

    On Wed, 26 Feb 2025 00:36:19 +0200
    Michael S <already5chosen@yahoo.com> wibbled:
    Here is an example of the program that prints 250508 on Intel Haswell
    CPU, but prints 0 on Intel Ivy Bridge.
    Compiled as 'cl -O1 -W4 -MD fma_tst0.c'.

    #include <stdio.h>
    #include <stdlib.h>
    #include <math.h>

    static
    unsigned long long rnd(void)
    {
    unsigned long long x = 0;
    for (int i = 0; i < 5; ++i)
    x = (x << 15) + (rand() & 0x7FFF);
    return x;
    }

    int main(void)
    {
    srand(1);
    int n = 0;
    for (int i = 0; i < 1000000; ++i) {
    double x = rnd() * 0x1p-64;
    double y = rnd() * 0x1p-64;
    double z = rnd() * 0x1p-114;
    double r1 = x*y + z;
    double r2 = fma(x, y, z);
    n += r1 != r2;
    }
    printf("%d\n", n);
    return 0;
    }

    It is certainly worth a bug report, but I am afraid that Microsoft
    will do nothing to fix it, likely claiming that they don't care
    about old hardware.

    Just FYI - it also returns 0 when compiled by Clang on an ARM Mac.


    Looks like a bug in clang.
    New versions of clang generate FMA instead of mul+add. I.e. clang bug is opposite of MS bug.
    By Standard, compilers not allowed to do it in "standard" C mode in
    absence of special flags like -ffast-math.

    I played a little on godbolt and it seems that the bug is relatively
    new. clang 13 still generates correct code. clang 14 does not. I.e.
    slightly less than 3 years.

    It happened simultaneously on x86-64 and ARM64

    clang 14
    https://godbolt.org/z/asochKz5P
    https://godbolt.org/z/c7xTaGWzv

    clang 13
    https://godbolt.org/z/6onP3dE3c
    https://godbolt.org/z/W9exqTanf

    clang 13 -ffast-math
    https://godbolt.org/z/8f875qMrf
    https://godbolt.org/z/qPGafr563

    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Muttley@Muttley@DastardlyHQ.org to comp.lang.c++ on Wed Feb 26 14:39:34 2025
    From Newsgroup: comp.lang.c++

    On Wed, 26 Feb 2025 14:27:21 +0200
    Michael S <already5chosen@yahoo.com> wibbled:
    On Wed, 26 Feb 2025 08:16:01 -0000 (UTC)
    Muttley@DastardlyHQ.org wrote:
    I played a little on godbolt and it seems that the bug is relatively
    new. clang 13 still generates correct code. clang 14 does not. I.e.
    slightly less than 3 years.

    I don't think they've noticed:

    R8603$ cc --version
    Apple clang version 16.0.0 (clang-1600.0.26.6)

    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Mr Flibble@leigh@i42.co.uk to comp.lang.c++ on Thu Feb 27 21:16:50 2025
    From Newsgroup: comp.lang.c++

    On Mon, 24 Feb 2025 11:48:08 +0100, Bonita Montero
    <Bonita.Montero@gmail.com> wrote:

    I wanted to optimize fmod to be a bit faster. This is my C++20 solution.

    double myFmod( double x, double y )
    {
    constexpr uint64_t
    SIGN = 1ull << 63,
    IMPLICIT = 1ull << 52,
    MANT = IMPLICIT - 1,
    QBIT = 1ull << 51;
    uint64_t const
    binX = bit_cast<uint64_t>( x ),
    binY = bit_cast<uint64_t>( y );
    static auto abs = []( uint64_t m ) { return m & ~SIGN; };
    auto isNaN = []( uint64_t m ) { return abs( m ) >= 0x7FF0000000000001u; }; auto isSig = []( uint64_t m ) { return !(m & QBIT); };
    if( isNaN( binX ) ) [[unlikely]] // x == NaN
    #if defined(_MSC_VER)
    return bit_cast<double>( isNaN( binY ) ? binY | binX & binY & QBIT :
    binX );
    #else
    {
    if( isSig( binX ) || isNaN( binY ) && isSig( binY ) ) [[unlikely]]
    feraiseexcept( FE_INVALID );
    return bit_cast<double>( binX | QBIT );
    }
    #endif
    if( isNaN( binY ) ) [[unlikely]] // x != NaN || y == NaN
    #if defined(_MSC_VER)
    return y;
    #else
    {
    if( isSig( binY ) ) [[unlikely]]
    feraiseexcept( FE_INVALID );
    return bit_cast<double>( binY | QBIT );
    }
    #endif
    auto isInf = []( uint64_t m ) { return abs( m ) == 0x7FF0000000000000u; }; if( isInf( binX ) ) // x == Inf
    {
    feraiseexcept( FE_INVALID );
    #if defined(_MSC_VER)
    return bit_cast<double>( binX & ~MANT | QBIT );
    #else
    return -numeric_limits<double>::quiet_NaN();
    #endif
    }
    if( !abs( binY ) ) [[unlikely]] // y == 0
    {
    feraiseexcept( FE_INVALID );
    #if defined(_MSC_VER)
    return numeric_limits<double>::quiet_NaN();
    #else
    return -numeric_limits<double>::quiet_NaN();
    #endif
    }
    if( !abs( binX ) || isInf( binY ) ) [[unlikely]] // x == 0 || y == Inf
    return x;
    auto exp = []( uint64_t b ) -> int { return b >> 52 & 0x7FF; };
    int
    expX = exp( binX ),
    expY = exp( binY );
    auto mant = []( uint64_t b ) { return b & MANT; };
    uint64_t
    mantX = mant( binX ),
    mantY = mant( binY );
    static auto normalize = []( int &exp, uint64_t &mant )
    {
    unsigned shift = countl_zero( mant ) - 11;
    mant <<= shift;
    exp -= shift;
    };
    auto build = []( int &exp, uint64_t &mant )
    {
    if( exp ) [[likely]]
    mant |= IMPLICIT;
    else
    {
    exp = 1;
    normalize( exp, mant );
    }
    };
    build( expX, mantX );
    build( expY, mantY );
    uint64_t signX = binX & SIGN;
    int expDiff;
    while( (expDiff = expX - expY) > 0 )
    {
    unsigned bits = expDiff <= 11 ? expDiff : 11;
    if( !(mantX = (mantX << bits) % mantY) ) [[unlikely]]
    return bit_cast<double>( signX );
    expX -= bits;
    normalize( expX, mantX );
    }
    if( !expDiff && mantX >= mantY ) [[unlikely]]
    if( (mantX -= mantY) ) [[likely]]
    normalize( expX, mantX );
    else
    return bit_cast<double>( signX );
    if( expX <= 0 ) [[unlikely]]
    {
    assert(expX >= -51);
    mantX = mantX >> (unsigned)(-expX + 1);
    expX = 0;
    }
    return bit_cast<double>( signX | (uint64_t)expX << 52 | mantX & MANT );
    }

    It's about six times faster than the glibc 2.31 solution in my
    benchmark. The returned NaNs and the raised exceptions are MSVC-
    and glibc-compatible.

    double myFmod(double x, double y)
    {
    return x / y - std::trunc(x / y) * y;
    }

    /Flibble
    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Bonita Montero@Bonita.Montero@gmail.com to comp.lang.c++ on Fri Feb 28 07:59:30 2025
    From Newsgroup: comp.lang.c++

    Am 27.02.2025 um 22:16 schrieb Mr Flibble:

    double myFmod(double x, double y)
    {
    return x / y - std::trunc(x / y) * y;
    }

    In most cases ths has a precision-loss which can lead to a result
    which is larger than y. Current solutions are similar to my code
    and are always 100% exact.
    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Michael S@already5chosen@yahoo.com to comp.lang.c++ on Fri Feb 28 12:35:29 2025
    From Newsgroup: comp.lang.c++

    On Thu, 27 Feb 2025 21:16:50 +0000
    Mr Flibble <leigh@i42.co.uk> wrote:


    double myFmod(double x, double y)
    {
    return x / y - std::trunc(x / y) * y;
    }

    /Flibble

    Nonsense.

    The one below is not nonsense, but still very bad.
    double myFmod(double x, double y)
    {
    return x - trunc(x / y) * y;
    }

    All solutions that work for all combinations of inputs are complicated.
    They can be based either on integer arithmetic or on FMA.

    In the former case in order to get any sort of speed one has to use non-standard extensions to the language, like gcc __int128 or MS/Intel _umul128/_umulh or MS/ARM __umulh.

    In the latter class of solutions one has to be careful about rounding -
    either check on every step that rounding didn't go the wrong way or set the rounding mode to FE_TOWARDZERO at the beginning and restore it to the
    original at the end.

    For both solutions worst case (huge x, tiny y) is pretty slow - 30-40
    steps with several arithmetic operations on every step. Without
    experimentation it is hard to say which of the solutions is faster. It
    depends on your hardware, anyway.



    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Michael S@already5chosen@yahoo.com to comp.lang.c++ on Fri Feb 28 16:28:32 2025
    From Newsgroup: comp.lang.c++

    On Wed, 26 Feb 2025 14:39:34 -0000 (UTC)
    Muttley@DastardlyHQ.org wrote:

    On Wed, 26 Feb 2025 14:27:21 +0200
    Michael S <already5chosen@yahoo.com> wibbled:
    On Wed, 26 Feb 2025 08:16:01 -0000 (UTC)
    Muttley@DastardlyHQ.org wrote:
    I played a little on godbolt and it seems that the bug is relatively
    new. clang 13 still generates correct code. clang 14 does not. I.e. slightly less than 3 years.

    I don't think they've noticed:

    R8603$ cc --version
    Apple clang version 16.0.0 (clang-1600.0.26.6)


    More googling/stack-overflowing.

    clang/LLVM people think that it is a feature rather than a bug. They
    claim that the standard allows fusing. I think that they are wrong, but
    I didn't read the respective part of the standard.

    The behavior can be turned back into clang13 way by -ffp-contract=off.
    Or with pragma
    #pragma STDC FP_CONTRACT OFF


    See answer by rici https://stackoverflow.com/questions/73985098/clang-14-0-0-floating-point-optimizations

    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Mr Flibble@leigh@i42.co.uk to comp.lang.c++ on Fri Feb 28 18:12:06 2025
    From Newsgroup: comp.lang.c++

    On Fri, 28 Feb 2025 12:35:29 +0200, Michael S
    <already5chosen@yahoo.com> wrote:

    On Thu, 27 Feb 2025 21:16:50 +0000
    Mr Flibble <leigh@i42.co.uk> wrote:


    double myFmod(double x, double y)
    {
    return x / y - std::trunc(x / y) * y;
    }

    /Flibble

    Nonsense.

    The one below is not nonsense, but still very bad.
    double myFmod(double x, double y)
    {
    return x - trunc(x / y) * y;
    }

    Yes it is nonsense, YOUR nonsense (I didn't actually think about the
    problem, just reposted yours as I foolishly assumed you were correct):

    On Wed, 26 Feb 2025 02:00:48 +0200, Michael S
    <already5chosen@yahoo.com> wrote:

    Even ignoring potential overflow during division, this method is
    very imprecise.
    (1e3/9 - trunc(1e3/9))*9 = 1.000000000000028
    (1e6/9 - trunc(1e6/9))*9 = 0.999999999985448
    (1e9/9 - trunc(1e9/9))*9 = 0.999999940395355
    (1e12/9 - trunc(1e12/9))*9 = 1.000030517578125
    (1e15/9 - trunc(1e15/9))*9 = 0.984375

    OTOH
    1e3/9 - trunc(1e3/9)*9 = 1
    1e6/9 - trunc(1e6/9)*9 = 1
    1e9/9 - trunc(1e9/9)*9 = 1
    1e12/9 - trunc(1e12/9)*9 = 1
    1e15/9 - trunc(1e15/9)*9 = 1


    /Flibble
    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Bonita Montero@Bonita.Montero@gmail.com to comp.lang.c++ on Fri Feb 28 19:23:49 2025
    From Newsgroup: comp.lang.c++

    Am 27.02.2025 um 22:16 schrieb Mr Flibble:

    double myFmod(double x, double y)
    {
    return x / y - std::trunc(x / y) * y;
    }

    I wrote a little test to find out how often the result is imprecise and
    how often it is out of range:

    #include <iostream>
    #include <cmath>
    #include <random>
    #include <bit>

    using namespace std;

    double trivialFmod( double a, double b );

    int main()
    {
    mt19937_64 mt;
    uniform_int_distribution<uint64_t> gen( 1, 0x7FEFFFFFFFFFFFFFu );
    size_t imprecise = 0, outOfRange = 0;
    for( size_t r = 1'000'000; r; --r )
    {
    double
    a = bit_cast<double>( gen( mt ) ),
    b = bit_cast<double>( gen( mt ) ),
    fm = fmod( a, b ),
    tfm = trivialFmod( a, b );
    imprecise += fm != tfm;
    outOfRange += tfm >= b;
    }
    auto print = []( char const *what, size_t n ) { cout << what << (ptrdiff_t)n / (1.0e6 / 100) << "%" << endl; };
    print( "imprecise: ", imprecise );
    print( "out of range: ", outOfRange );
    }

    double trivialFmod( double a, double b )
    {
    return a - trunc( a / b ) * b;
    }

    The output is:

    imprecise: 49.9096%
    out of range: 2.0039%

    I'd never expected that half of the results are precise and that
    only two percent of the results are out of range. But that's still
    unusable.

    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Mr Flibble@leigh@i42.co.uk to comp.lang.c++ on Fri Feb 28 18:30:47 2025
    From Newsgroup: comp.lang.c++

    On Mon, 24 Feb 2025 11:48:08 +0100, Bonita Montero
    <Bonita.Montero@gmail.com> wrote:

    I wanted to optimize fmod to be a bit faster. This is my C++20 solution.

    double myFmod( double x, double y )
    {
    constexpr uint64_t
    SIGN = 1ull << 63,
    IMPLICIT = 1ull << 52,
    MANT = IMPLICIT - 1,
    QBIT = 1ull << 51;
    uint64_t const
    binX = bit_cast<uint64_t>( x ),
    binY = bit_cast<uint64_t>( y );
    static auto abs = []( uint64_t m ) { return m & ~SIGN; };
    auto isNaN = []( uint64_t m ) { return abs( m ) >= 0x7FF0000000000001u; }; auto isSig = []( uint64_t m ) { return !(m & QBIT); };
    if( isNaN( binX ) ) [[unlikely]] // x == NaN
    #if defined(_MSC_VER)
    return bit_cast<double>( isNaN( binY ) ? binY | binX & binY & QBIT :
    binX );
    #else
    {
    if( isSig( binX ) || isNaN( binY ) && isSig( binY ) ) [[unlikely]]
    feraiseexcept( FE_INVALID );
    return bit_cast<double>( binX | QBIT );
    }
    #endif
    if( isNaN( binY ) ) [[unlikely]] // x != NaN || y == NaN
    #if defined(_MSC_VER)
    return y;
    #else
    {
    if( isSig( binY ) ) [[unlikely]]
    feraiseexcept( FE_INVALID );
    return bit_cast<double>( binY | QBIT );
    }
    #endif
    auto isInf = []( uint64_t m ) { return abs( m ) == 0x7FF0000000000000u; }; if( isInf( binX ) ) // x == Inf
    {
    feraiseexcept( FE_INVALID );
    #if defined(_MSC_VER)
    return bit_cast<double>( binX & ~MANT | QBIT );
    #else
    return -numeric_limits<double>::quiet_NaN();
    #endif
    }
    if( !abs( binY ) ) [[unlikely]] // y == 0
    {
    feraiseexcept( FE_INVALID );
    #if defined(_MSC_VER)
    return numeric_limits<double>::quiet_NaN();
    #else
    return -numeric_limits<double>::quiet_NaN();
    #endif
    }
    if( !abs( binX ) || isInf( binY ) ) [[unlikely]] // x == 0 || y == Inf
    return x;
    auto exp = []( uint64_t b ) -> int { return b >> 52 & 0x7FF; };
    int
    expX = exp( binX ),
    expY = exp( binY );
    auto mant = []( uint64_t b ) { return b & MANT; };
    uint64_t
    mantX = mant( binX ),
    mantY = mant( binY );
    static auto normalize = []( int &exp, uint64_t &mant )
    {
    unsigned shift = countl_zero( mant ) - 11;
    mant <<= shift;
    exp -= shift;
    };
    auto build = []( int &exp, uint64_t &mant )
    {
    if( exp ) [[likely]]
    mant |= IMPLICIT;
    else
    {
    exp = 1;
    normalize( exp, mant );
    }
    };
    build( expX, mantX );
    build( expY, mantY );
    uint64_t signX = binX & SIGN;
    int expDiff;
    while( (expDiff = expX - expY) > 0 )
    {
    unsigned bits = expDiff <= 11 ? expDiff : 11;
    if( !(mantX = (mantX << bits) % mantY) ) [[unlikely]]
    return bit_cast<double>( signX );
    expX -= bits;
    normalize( expX, mantX );
    }
    if( !expDiff && mantX >= mantY ) [[unlikely]]
    if( (mantX -= mantY) ) [[likely]]
    normalize( expX, mantX );
    else
    return bit_cast<double>( signX );
    if( expX <= 0 ) [[unlikely]]
    {
    assert(expX >= -51);
    mantX = mantX >> (unsigned)(-expX + 1);
    expX = 0;
    }
    return bit_cast<double>( signX | (uint64_t)expX << 52 | mantX & MANT );
    }

    It's about six times faster than the glibc 2.31 solution in my
    benchmark. The returned NaNs and the raised exceptions are MSVC-
    and glibc-compatible.

    double my_fmod(double x, double y)
    {
    if (y == 0.0)
    return x / y;
    return x - std::trunc(x / y) * y;
    }

    /Flibble
    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Bonita Montero@Bonita.Montero@gmail.com to comp.lang.c++ on Fri Feb 28 19:31:44 2025
    From Newsgroup: comp.lang.c++

    Am 28.02.2025 um 19:30 schrieb Mr Flibble:

    double my_fmod(double x, double y)
    {
    if (y == 0.0)
    return x / y;
    return x - std::trunc(x / y) * y;
    }

    This still sucks. Try it with this test:

    #include <iostream>
    #include <cmath>
    #include <random>
    #include <bit>

    using namespace std;

    double trivialFmod( double a, double b );

    int main()
    {
    mt19937_64 mt;
    uniform_int_distribution<uint64_t> gen( 1, 0x7FEFFFFFFFFFFFFFu );
    size_t imprecise = 0, outOfRange = 0;
    for( size_t r = 1'000'000; r; --r )
    {
    double
    a = bit_cast<double>( gen( mt ) ),
    b = bit_cast<double>( gen( mt ) ),
    fm = fmod( a, b ),
    tfm = trivialFmod( a, b );
    imprecise += fm != tfm;
    outOfRange += tfm >= b;
    }
    auto print = []( char const *what, size_t n ) { cout << what << (ptrdiff_t)n / (1.0e6 / 100) << "%" << endl; };
    print( "imprecise: ", imprecise );
    print( "out of range: ", outOfRange );
    }

    double trivialFmod( double a, double b )
    {
    return a - trunc( a / b ) * b;
    }


    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Mr Flibble@leigh@i42.co.uk to comp.lang.c++ on Fri Feb 28 19:47:23 2025
    From Newsgroup: comp.lang.c++

    On Fri, 28 Feb 2025 19:31:44 +0100, Bonita Montero
    <Bonita.Montero@gmail.com> wrote:

    Am 28.02.2025 um 19:30 schrieb Mr Flibble:

    double my_fmod(double x, double y)
    {
    if (y == 0.0)
    return x / y;
    return x - std::trunc(x / y) * y;
    }

    This still sucks. Try it with this test:

    #include <iostream>
    #include <cmath>
    #include <random>
    #include <bit>

    using namespace std;

    double trivialFmod( double a, double b );

    int main()
    {
    mt19937_64 mt;
    uniform_int_distribution<uint64_t> gen( 1, 0x7FEFFFFFFFFFFFFFu );
    size_t imprecise = 0, outOfRange = 0;
    for( size_t r = 1'000'000; r; --r )
    {
    double
    a = bit_cast<double>( gen( mt ) ),
    b = bit_cast<double>( gen( mt ) ),
    fm = fmod( a, b ),
    tfm = trivialFmod( a, b );
    imprecise += fm != tfm;
    outOfRange += tfm >= b;
    }
    auto print = []( char const *what, size_t n ) { cout << what <<
    (ptrdiff_t)n / (1.0e6 / 100) << "%" << endl; };
    print( "imprecise: ", imprecise );
    print( "out of range: ", outOfRange );
    }

    double trivialFmod( double a, double b )
    {
    return a - trunc( a / b ) * b;
    }


    IEEE 754 does not define how std::fmod should behave, only
    std::remainder.

    /Flibble
    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Bonita Montero@Bonita.Montero@gmail.com to comp.lang.c++ on Fri Feb 28 20:49:38 2025
    From Newsgroup: comp.lang.c++

    Am 28.02.2025 um 20:47 schrieb Mr Flibble:

    IEEE 754 does not define how std::fmod should behave, only
    std::remainder.

    There's only one way to do it for finite numbers.
    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Mr Flibble@leigh@i42.co.uk to comp.lang.c++ on Sat Mar 1 14:37:28 2025
    From Newsgroup: comp.lang.c++

    On Fri, 28 Feb 2025 20:49:38 +0100, Bonita Montero
    <Bonita.Montero@gmail.com> wrote:

    Am 28.02.2025 um 20:47 schrieb Mr Flibble:

    IEEE 754 does not define how std::fmod should behave, only
    std::remainder.

    There's only one way to do it for finite numbers.

    Not true as there is a fixed mantissa size, so finite precision,
    making your test case useless if x is sufficiently large.

    /Flibble
    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Bonita Montero@Bonita.Montero@gmail.com to comp.lang.c++ on Sat Mar 1 16:58:51 2025
    From Newsgroup: comp.lang.c++

    Am 01.03.2025 um 15:37 schrieb Mr Flibble:
    On Fri, 28 Feb 2025 20:49:38 +0100, Bonita Montero
    <Bonita.Montero@gmail.com> wrote:

    Am 28.02.2025 um 20:47 schrieb Mr Flibble:

    IEEE 754 does not define how std::fmod should behave, only
    std::remainder.

    There's only one way to do it for finite numbers.

    Not true as there is a fixed mantissa size, so finite precision,
    making your test case useless if x is sufficiently large.

    The way to do a modulo calculation for every floating point value
    except inf or nan (finite numbers) is always the same for all
    implementations. And correct implementations are always without precision
    loss, i.e. exact.
    As I've shown solutions like yours are only 50% exact and in 2% they
    generate out of range results.

    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Tim Rentsch@tr.17687@z991.linuxsc.com to comp.lang.c++ on Sat Mar 1 08:35:01 2025
    From Newsgroup: comp.lang.c++

    Michael S <already5chosen@yahoo.com> writes:

    On Wed, 26 Feb 2025 14:39:34 -0000 (UTC)
    Muttley@DastardlyHQ.org wrote:

    On Wed, 26 Feb 2025 14:27:21 +0200
    Michael S <already5chosen@yahoo.com> wibbled:

    On Wed, 26 Feb 2025 08:16:01 -0000 (UTC)
    Muttley@DastardlyHQ.org wrote:
    I played a little on godbolt and it seems that the bug is relatively
    new. clang 13 still generates correct code. clang 14 does not. I.e.
    slightly less than 3 years.

    I don't think they've noticed:

    R8603$ cc --version
    Apple clang version 16.0.0 (clang-1600.0.26.6)

    More googling/stack-overflowing.

    clang/LLVM people think that it is a feature rather than a bug. They
    claim that the standard allows fusing. I think that they are wrong, but
    I didn't read the respective part of the standard.

    My reading of the C standard is that implementations are allowed to
    contract floating-point expressions (aka fusing) as their default
    choice of what is allowed, and because this default choice falls
    into the category of implementation-defined behavior the
    implementation must document what default it has chosen.

    The behavior can be turned back into clang13 way by -ffp-contract=off.
    Or with pragma
    #pragma STDC FP_CONTRACT OFF

    The C standard doesn't say anything about compiler options.

    The C standard does specify what happens for the STDC FP_CONTRACT
    standard #pragma, for both

    #pragma STDC FP_CONTRACT OFF

    and

    #pragma STDC FP_CONTRACT ON

    If you want to look, what these #pragma's do is defined in the
    section of the C standard pertaining to <math.h>, which is 7.12 in
    the N1256 document for C99.
    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Mr Flibble@leigh@i42.co.uk to comp.lang.c++ on Sat Mar 1 17:11:54 2025
    From Newsgroup: comp.lang.c++

    On Sat, 1 Mar 2025 16:58:51 +0100, Bonita Montero
    <Bonita.Montero@gmail.com> wrote:

    Am 01.03.2025 um 15:37 schrieb Mr Flibble:
    On Fri, 28 Feb 2025 20:49:38 +0100, Bonita Montero
    <Bonita.Montero@gmail.com> wrote:

    Am 28.02.2025 um 20:47 schrieb Mr Flibble:

    IEEE 754 does not define how std::fmod should behave, only
    std::remainder.

    There's only one way to do it for finite numbers.

    Not true as there is a fixed mantissa size, so finite precision,
    making your test case useless if x is sufficiently large.

    The way to do a modulo calculation for every floating point value
    except inf or nan (finite numbers) is always the same for all
    implementations. And correct implementations are always without precision
    loss, i.e. exact.
    As I've shown solutions like yours are only 50% exact and in 2% they
    generate out of range results.

    Thus you are asserting that all finite numbers have an exact IEEE 754
    floating point representation which is of course an erroneous
    assertion ergo your solution is bogus.

    /Flibble
    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Bonita Montero@Bonita.Montero@gmail.com to comp.lang.c++ on Sat Mar 1 21:02:32 2025
    From Newsgroup: comp.lang.c++

    Am 01.03.2025 um 18:11 schrieb Mr Flibble:

    Thus you are asserting that all finite numbers have an exact IEEE 754 floating point representation ...

    But the result of floating-point operations usually have precision-loss;
    except fmod(), which is always correct - when implemented properly. You
    didn't implement it correctly.
    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Mr Flibble@leigh@i42.co.uk to comp.lang.c++ on Sat Mar 1 20:32:40 2025
    From Newsgroup: comp.lang.c++

    On Sat, 1 Mar 2025 21:02:32 +0100, Bonita Montero
    <Bonita.Montero@gmail.com> wrote:

    Am 01.03.2025 um 18:11 schrieb Mr Flibble:

    Thus you are asserting that all finite numbers have an exact IEEE 754
    floating point representation ...

    But the result of floating-point operations usually have precision-loss;
    except fmod(), which is always correct - when implemented properly. You
    didn't implement it correctly.

    False, see my other post.

    /Flibble
    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Bonita Montero@Bonita.Montero@gmail.com to comp.lang.c++ on Sat Mar 1 21:37:10 2025
    From Newsgroup: comp.lang.c++

    Am 01.03.2025 um 21:32 schrieb Mr Flibble:
    On Sat, 1 Mar 2025 21:02:32 +0100, Bonita Montero
    <Bonita.Montero@gmail.com> wrote:

    Am 01.03.2025 um 18:11 schrieb Mr Flibble:

    Thus you are asserting that all finite numbers have an exact IEEE 754
    floating point representation ...

    But the result of floating-point operations usually have precision-loss;
    except fmod(), which is always correct - when implemented properly. You
    didn't implement it correctly.

    False, see my other post.

    /Flibble

    This:

    #include <iostream>
    #include <cmath>
    #include <random>
    #include <bit>

    using namespace std;

    double my_fmod( double x, double y );

    int main()
    {
    mt19937_64 mt;
    uniform_int_distribution<uint64_t> gen( 1, 0x7FEFFFFFFFFFFFFFu );
    size_t imprecise = 0, outOfRange = 0;
    for( size_t r = 1'000'000; r; --r )
    {
    double
    a = bit_cast<double>( gen( mt ) ),
    b = bit_cast<double>( gen( mt ) ),
    fm = fmod( a, b ),
    tfm = my_fmod( a, b );
    imprecise += fm != tfm;
    outOfRange += tfm >= b;
    }
    auto print = []( char const *what, size_t n ) { cout << what << (ptrdiff_t)n / (1.0e6 / 100) << "%" << endl; };
    print( "imprecise: ", imprecise );
    print( "out of range: ", outOfRange );
    }

    double my_fmod( double x, double y )
    {
    if( y == 0.0 )
    return x / y;
    return x - std::trunc( x / y ) * y;
    }

    ... prints this ...

    imprecise: 49.9096%
    out of range: 2.0039%

    So your solution is unusable.

    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Mr Flibble@leigh@i42.co.uk to comp.lang.c++ on Sat Mar 1 20:45:26 2025
    From Newsgroup: comp.lang.c++

    On Sat, 1 Mar 2025 21:37:10 +0100, Bonita Montero
    <Bonita.Montero@gmail.com> wrote:

    Am 01.03.2025 um 21:32 schrieb Mr Flibble:
    On Sat, 1 Mar 2025 21:02:32 +0100, Bonita Montero
    <Bonita.Montero@gmail.com> wrote:

    Am 01.03.2025 um 18:11 schrieb Mr Flibble:

    Thus you are asserting that all finite numbers have an exact IEEE 754
    floating point representation ...

    But the result of floating-point operations usually have precision-loss;
    except fmod(), which is always correct - when implemented properly. You
    didn't implement it correctly.

    False, see my other post.

    /Flibble

    This:

    #include <iostream>
    #include <cmath>
    #include <random>
    #include <bit>

    using namespace std;

    double my_fmod( double x, double y );

    int main()
    {
    mt19937_64 mt;
    uniform_int_distribution<uint64_t> gen( 1, 0x7FEFFFFFFFFFFFFFu );
    size_t imprecise = 0, outOfRange = 0;
    for( size_t r = 1'000'000; r; --r )
    {
    double
    a = bit_cast<double>( gen( mt ) ),
    b = bit_cast<double>( gen( mt ) ),
    fm = fmod( a, b ),
    tfm = my_fmod( a, b );
    imprecise += fm != tfm;
    outOfRange += tfm >= b;
    }
    auto print = []( char const *what, size_t n ) { cout << what <<
    (ptrdiff_t)n / (1.0e6 / 100) << "%" << endl; };
    print( "imprecise: ", imprecise );
    print( "out of range: ", outOfRange );
    }

    double my_fmod( double x, double y )
    {
    if( y == 0.0 )
    return x / y;
    return x - std::trunc( x / y ) * y;
    }

    ... prints this ...

    imprecise: 49.9096%
    out of range: 2.0039%

    So your solution is unusable.

    False, see my other post.

    /Flibble
    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Michael S@already5chosen@yahoo.com to comp.lang.c++ on Sat Mar 1 22:55:59 2025
    From Newsgroup: comp.lang.c++

    On Mon, 24 Feb 2025 11:48:08 +0100
    Bonita Montero <Bonita.Montero@gmail.com> wrote:

    I wanted to optimize fmod to be a bit faster. This is my C++20
    solution.

    double myFmod( double x, double y )
    {
    constexpr uint64_t
    SIGN = 1ull << 63,
    IMPLICIT = 1ull << 52,
    MANT = IMPLICIT - 1,
    QBIT = 1ull << 51;
    uint64_t const
    binX = bit_cast<uint64_t>( x ),
    binY = bit_cast<uint64_t>( y );
    static auto abs = []( uint64_t m ) { return m & ~SIGN; };
    auto isNaN = []( uint64_t m ) { return abs( m ) >=
    0x7FF0000000000001u; }; auto isSig = []( uint64_t m ) { return !(m &
    QBIT); }; if( isNaN( binX ) ) [[unlikely]] // x == NaN
    #if defined(_MSC_VER)
    return bit_cast<double>( isNaN( binY ) ? binY | binX
    & binY & QBIT : binX );
    #else
    {
    if( isSig( binX ) || isNaN( binY ) && isSig( binY ) ) [[unlikely]] feraiseexcept( FE_INVALID );
    return bit_cast<double>( binX | QBIT );
    }
    #endif
    if( isNaN( binY ) ) [[unlikely]] // x != NaN || y == NaN
    #if defined(_MSC_VER)
    return y;
    #else
    {
    if( isSig( binY ) ) [[unlikely]]
    feraiseexcept( FE_INVALID );
    return bit_cast<double>( binY | QBIT );
    }
    #endif
    auto isInf = []( uint64_t m ) { return abs( m ) ==
    0x7FF0000000000000u; }; if( isInf( binX ) ) // x == Inf
    {
    feraiseexcept( FE_INVALID );
    #if defined(_MSC_VER)
    return bit_cast<double>( binX & ~MANT | QBIT );
    #else
    return -numeric_limits<double>::quiet_NaN();
    #endif
    }
    if( !abs( binY ) ) [[unlikely]] // y == 0
    {
    feraiseexcept( FE_INVALID );
    #if defined(_MSC_VER)
    return numeric_limits<double>::quiet_NaN();
    #else
    return -numeric_limits<double>::quiet_NaN();
    #endif
    }
    if( !abs( binX ) || isInf( binY ) ) [[unlikely]] // x == 0 ||
    y == Inf return x;
    auto exp = []( uint64_t b ) -> int { return b >> 52 & 0x7FF;
    }; int
    expX = exp( binX ),
    expY = exp( binY );
    auto mant = []( uint64_t b ) { return b & MANT; };
    uint64_t
    mantX = mant( binX ),
    mantY = mant( binY );
    static auto normalize = []( int &exp, uint64_t &mant )
    {
    unsigned shift = countl_zero( mant ) - 11;
    mant <<= shift;
    exp -= shift;
    };
    auto build = []( int &exp, uint64_t &mant )
    {
    if( exp ) [[likely]]
    mant |= IMPLICIT;
    else
    {
    exp = 1;
    normalize( exp, mant );
    }
    };
    build( expX, mantX );
    build( expY, mantY );
    uint64_t signX = binX & SIGN;
    int expDiff;
    while( (expDiff = expX - expY) > 0 )
    {
    unsigned bits = expDiff <= 11 ? expDiff : 11;
    if( !(mantX = (mantX << bits) % mantY) ) [[unlikely]]
    return bit_cast<double>( signX );
    expX -= bits;
    normalize( expX, mantX );
    }
    if( !expDiff && mantX >= mantY ) [[unlikely]]
    if( (mantX -= mantY) ) [[likely]]
    normalize( expX, mantX );
    else
    return bit_cast<double>( signX );
    if( expX <= 0 ) [[unlikely]]
    {
    assert(expX >= -51);
    mantX = mantX >> (unsigned)(-expX + 1);
    expX = 0;
    }
    return bit_cast<double>( signX | (uint64_t)expX << 52 | mantX
    & MANT ); }

    It's about six times faster than the glibc 2.31 solution in my
    benchmark. The returned NaNs and the raised exceptions are MSVC-
    and glibc-compatible.


    How about that?
    Pay attention, it's C rather than C++. So 5 times shorter :-)
    It's not the fastest for big x/y ratios, but rather simple and not
    *too* slow. At least as long as hardware supports FMA.
    For small x/y ratios it should be pretty close to best possible.


    #include <math.h>
    #include <fenv.h>

    double my_fmod(double x, double y)
    {
    if (isnan(x))
    return x;

    // pre-process y
    if (isless(y, 0))
    y = -y;
    else if (isgreater(y, 0))
    ;
    else {
    if (isnan(y))
    return y;
    // y==0
    feraiseexcept(FE_INVALID);
    return nan("y0");
    }

    // y in (0:+inf]

    // Quick path
    double xx = x * 0x1p-53;
    if (xx > -y && xx < y) {
    // among other things, x guaranteed to be finite
    if (x > -y && x < y)
    return x; // case y=+-inf covered here
    double d = trunc(x/y);
    double res = fma(-y, d, x);
    if (signbit(x) != signbit(res)) {
    // overshoot because of unfortunate division rounding
    // it is extremely rare for small x/y,
    // but not rare when x/y is close to 2**53
    res = fma(-y, d+(signbit(x)*2-1), x);
    }
    return res;
    }

    // slow path
    if (isinf(x)) {
    feraiseexcept(FE_INVALID);
    return nan("xinf");
    }

    int oldRnd = fegetround();
    fesetround(FE_TOWARDZERO);

    double ax = fabs(x);
    do {
    double yy = y;
    while (yy < ax * 0x1p-1022)
    yy *= 0x1p1021;

    do
    ax = fma(-yy, trunc(ax/yy), ax);
    while (ax >= yy);

    } while (ax >= y);

    ax = copysign(ax, x);
    fesetround(oldRnd);
    return ax;
    }



    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Bonita Montero@Bonita.Montero@gmail.com to comp.lang.c++ on Sun Mar 2 09:40:42 2025
    From Newsgroup: comp.lang.c++

    Am 01.03.2025 um 18:11 schrieb Mr Flibble:

    Thus you are asserting that all finite numbers have an exact IEEE 754 floating point representation ...

    I never said that.

    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Michael S@already5chosen@yahoo.com to comp.lang.c++ on Sun Mar 2 15:41:55 2025
    From Newsgroup: comp.lang.c++

    On Mon, 24 Feb 2025 11:48:08 +0100
    Bonita Montero <Bonita.Montero@gmail.com> wrote:

    I wanted to optimize fmod to be a bit faster. This is my C++20
    solution.


    It's about six times faster than the glibc 2.31 solution in my
    benchmark. The returned NaNs and the raised exceptions are MSVC-
    and glibc-compatible.


    Here are my benchmark results on Windows/msys2, 4.25 GHz Skylake:
    Compiler x/y range fmod my_fmod (yours)
    clang [0.5:2**53] 5.6 38.0
    gcc [0.5:2**53] 5.8 36.7
    MSVC [0.5:2**53] 11.0 36.8
    clang [2**-2098:2**2098] 102.9 294.3
    gcc [2**-2098:2**2098] 102.9 291.7
    MSVC [2**-2098:2**2098] 109.9 289.6

    Your variant is 2.6x to 6.8x times slower than standard library.

    My variant is also slower than standard library, but the margin of
    defeat is much closer.

    ------- first range
    #include <cstdio>
    #include <cstring>
    #include <cmath>
    #include <cfenv>
    #include <vector>
    #include <random>
    #include <chrono>

    double my_fmod(double x, double y);

    int main(void)
    {
    const int VEC_LEN = 100000;
    const int N_IT = 31;

    std::vector<double> xy(VEC_LEN*2);
    std::mt19937_64 rndGen;
    const uint64_t EXP_MASK = 2047ull << 52;
    for (int i = 0; i < VEC_LEN*2; ++i) {
    uint64_t u = rndGen();
    uint64_t exp = 1023;
    if (i % 2 == 0) { // x
    uint64_t exp0 = (u >> 52) & 2047;
    exp += exp0 % 52;
    }
    u = (u & ~EXP_MASK) | (exp << 52);
    double d;
    memcpy(&d, &u, sizeof(d));
    xy[i] = d;
    }
    std::vector<double> res(VEC_LEN);
    std::vector<double> ref(VEC_LEN);

    auto t00 = std::chrono::steady_clock::now();
    const double* pXY = xy.data();
    double* pRef = ref.data();
    double* pRes = res.data();
    std::vector<int64_t> tref(N_IT);
    std::vector<int64_t> tres(N_IT);
    for (int it = 0; it < N_IT; ++it) {
    auto t0 = std::chrono::steady_clock::now();
    for (int i = 0; i < VEC_LEN; ++i)
    pRef[i] = fmod(pXY[i*2+0], pXY[i*2+1]);
    auto t1 = std::chrono::steady_clock::now();
    for (int i = 0; i < VEC_LEN; ++i)
    pRes[i] = my_fmod(pXY[i*2+0], pXY[i*2+1]);
    auto t2 = std::chrono::steady_clock::now();

    tref[it] = std::chrono::duration_cast<std::chrono::nanoseconds>(t1
    - t0).count();
    tres[it] = std::chrono::duration_cast<std::chrono::nanoseconds>(t2
    - t1).count();

    for (int i = 0; i < VEC_LEN; ++i) {
    if (pRef[i] != pRes[i]) {
    if (!std::isnan(pRef[i]) || !std::isnan(pRes[i])) {
    printf(
    "Mismatch. fmod(%.17e, %.17e).\n"
    "ref %.17e\n"
    "my %.17e\n"
    ,xy[i*2+0]
    ,xy[i*2+1]
    ,ref[i]
    ,res[i]
    );
    return 1;
    }
    }
    }
    }

    auto t11 = std::chrono::steady_clock::now();
    int64_t dt = std::chrono::duration_cast<std::chrono::nanoseconds>(t11
    - t00).count();

    std::nth_element(tref.begin(), tref.begin()+N_IT/2, tref.end());
    std::nth_element(tres.begin(), tres.begin()+N_IT/2, tres.end());
    printf("fmod %.2f nsec. my_fmod %.2f nsec. Test time %.3f msec\n"
    ,double(tref[N_IT/2]) / VEC_LEN
    ,double(tres[N_IT/2]) / VEC_LEN
    ,double(dt)*1e-6
    );

    return 0;
    }


    -- second range
    #include <cstdio>
    #include <cstring>
    #include <cmath>
    #include <cfenv>
    #include <vector>
    #include <random>
    #include <chrono>

    double my_fmod(double x, double y);

    int main(void)
    {
    const int VEC_LEN = 100000;
    const int N_IT = 31;

    std::vector<double> xy(VEC_LEN*2);
    std::mt19937_64 rndGen;
    for (int i = 0; i < VEC_LEN*2; ++i) {
    uint64_t u = rndGen();
    double d;
    memcpy(&d, &u, sizeof(d));
    xy[i] = d;
    }
    std::vector<double> res(VEC_LEN);
    std::vector<double> ref(VEC_LEN);

    auto t00 = std::chrono::steady_clock::now();
    const double* pXY = xy.data();
    double* pRef = ref.data();
    double* pRes = res.data();
    std::vector<int64_t> tref(N_IT);
    std::vector<int64_t> tres(N_IT);
    for (int it = 0; it < N_IT; ++it) {
    auto t0 = std::chrono::steady_clock::now();
    for (int i = 0; i < VEC_LEN; ++i)
    pRef[i] = fmod(pXY[i*2+0], pXY[i*2+1]);
    auto t1 = std::chrono::steady_clock::now();
    for (int i = 0; i < VEC_LEN; ++i)
    pRes[i] = my_fmod(pXY[i*2+0], pXY[i*2+1]);
    auto t2 = std::chrono::steady_clock::now();

    tref[it] = std::chrono::duration_cast<std::chrono::nanoseconds>(t1
    - t0).count();
    tres[it] = std::chrono::duration_cast<std::chrono::nanoseconds>(t2
    - t1).count();

    for (int i = 0; i < VEC_LEN; ++i) {
    if (pRef[i] != pRes[i]) {
    if (!std::isnan(pRef[i]) || !std::isnan(pRes[i])) {
    printf(
    "Mismatch. fmod(%.17e, %.17e).\n"
    "ref %.17e\n"
    "my %.17e\n"
    ,xy[i*2+0]
    ,xy[i*2+1]
    ,ref[i]
    ,res[i]
    );
    return 1;
    }
    }
    }
    }

    auto t11 = std::chrono::steady_clock::now();
    int64_t dt = std::chrono::duration_cast<std::chrono::nanoseconds>(t11
    - t00).count();

    std::nth_element(tref.begin(), tref.begin()+N_IT/2, tref.end());
    std::nth_element(tres.begin(), tres.begin()+N_IT/2, tres.end());
    printf("fmod %.2f nsec. my_fmod %.2f nsec. Test time %.3f msec\n"
    ,double(tref[N_IT/2]) / VEC_LEN
    ,double(tres[N_IT/2]) / VEC_LEN
    ,double(dt)*1e-6
    );

    return 0;
    }





    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Bonita Montero@Bonita.Montero@gmail.com to comp.lang.c++ on Sun Mar 2 17:10:37 2025
    From Newsgroup: comp.lang.c++

    This is my code, improved by the _udiv128-intrinsic of MSVC which
    provides a 128 / 64 division. With that my algorithm becomes nearly
    three times as fast as before. I'll provide a g++ / clang++ compatible
    version with inline-assembly later.

    template<bool _32 = false>
    double xMyFmod( double x, double y )
    {
    constexpr uint64_t
    SIGN = 1ull << 63,
    IMPLICIT = 1ull << 52,
    MANT = IMPLICIT - 1,
    QBIT = 1ull << 51;
    uint64_t const
    binX = bit_cast<uint64_t>( x ),
    binY = bit_cast<uint64_t>( y );
    static auto abs = []( uint64_t m ) { return m & ~SIGN; };
    auto isNaN = []( uint64_t m ) { return abs( m ) >= 0x7FF0000000000001u; };
    auto isSig = []( uint64_t m ) { return !(m & QBIT); };
    if( isNaN( binX ) ) [[unlikely]] // x == NaN
    #if defined(_MSC_VER)
    return bit_cast<double>( isNaN( binY ) ? binY | binX & binY & QBIT :
    binX );
    #else
    {
    if( isSig( binX ) || isNaN( binY ) && isSig( binY ) ) [[unlikely]]
    feraiseexcept( FE_INVALID );
    return bit_cast<double>( binX | QBIT );
    }
    #endif
    auto isInf = []( uint64_t m ) { return abs( m ) == 0x7FF0000000000000u; };
    if( isNaN( binY ) ) [[unlikely]] // x != NaN || y == NaN
    #if defined(_MSC_VER)
    {
    if constexpr( _32 )
    if( isInf( binX ) )
    feraiseexcept( FE_INVALID );
    return y;
    }
    #else
    {
    if( isSig( binY ) ) [[unlikely]]
    feraiseexcept( FE_INVALID );
    return bit_cast<double>( binY | QBIT );
    }
    #endif
    if( isInf( binX ) ) // x == Inf
    {
    feraiseexcept( FE_INVALID );
    #if defined(_MSC_VER)
    return bit_cast<double>( binX & ~MANT | QBIT );
    #else
    return -numeric_limits<double>::quiet_NaN();
    #endif
    }
    if( !abs( binY ) ) [[unlikely]] // y == 0
    {
    feraiseexcept( FE_INVALID );
    #if defined(_MSC_VER)
    return numeric_limits<double>::quiet_NaN();
    #else
    return -numeric_limits<double>::quiet_NaN();
    #endif
    }
    if( !abs( binX ) || isInf( binY ) ) [[unlikely]] // x == 0 || y == Inf
    return x;
    auto exp = []( uint64_t b ) -> int { return b >> 52 & 0x7FF; };
    int
    expX = exp( binX ),
    expY = exp( binY );
    auto mant = []( uint64_t b ) { return b & MANT; };
    uint64_t
    mantX = mant( binX ),
    mantY = mant( binY );
    int headBits = 11;
    static auto normalize = [&]( int &exp, uint64_t &mant )
    {
    unsigned shift = countl_zero( mant ) - headBits;
    mant <<= shift;
    exp -= shift;
    };
    auto build = []( int &exp, uint64_t &mant )
    {
    if( exp ) [[likely]]
    mant |= IMPLICIT;
    else
    {
    exp = 1;
    normalize( exp, mant );
    }
    };
    build( expX, mantX );
    build( expY, mantY );
    int
    tailX = countr_zero( mantX ),
    tailY = countr_zero( mantY ),
    tailBits = tailX <= tailY ? tailX : tailY;
    headBits += tailBits;
    mantX >>= tailBits;
    mantY >>= tailBits;
    uint64_t signX = binX & SIGN;
    int expDiff;
    #if defined(_MSC_VER)
    while( (expDiff = expX - expY) > 63 )
    {
    unsigned long long hi = mantX >> 1, lo = mantX << 63, remainder;
    (void)_udiv128( hi, lo, mantY, &remainder );
    expX -= 63;
    mantX = remainder;
    normalize( expX, mantX );
    }
    #endif
    while( (expDiff = expX - expY) > 0 )
    {
    unsigned bits = expDiff <= headBits ? expDiff : headBits;
    if( !(mantX = (mantX << bits) % mantY) ) [[unlikely]]
    return bit_cast<double>( signX );
    expX -= bits;
    normalize( expX, mantX );
    }
    if( !expDiff && mantX >= mantY ) [[unlikely]]
    if( (mantX -= mantY) ) [[likely]]
    normalize( expX, mantX );
    else
    return bit_cast<double>( signX );
    mantX <<= tailBits;
    mantY <<= tailBits;
    if( expX <= 0 ) [[unlikely]]
    {
    assert(expX >= -51);
    mantX = mantX >> (unsigned)(-expX + 1);
    expX = 0;
    }
    return bit_cast<double>( signX | (uint64_t)expX << 52 | mantX & MANT ); }

    double myFmod( double x, double y )
    {
    return xMyFmod( x, y );
    }

    inline float myFmod( float x, float y )
    {
    return (float)xMyFmod<true>( (double)x, (double)y );
    }
    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Mr Flibble@leigh@i42.co.uk to comp.lang.c++ on Sun Mar 2 16:22:20 2025
    From Newsgroup: comp.lang.c++

    On Sun, 2 Mar 2025 17:10:37 +0100, Bonita Montero
    <Bonita.Montero@gmail.com> wrote:

    This is my code, improved by the _udiv128-intrinsic of MSVC which
    provides a 128 / 64 division. With that my algorithm becomes nearly
    three times as fast as before. I'll provide a g++ / clang++ compatible
    version with inline-assembly later.

    Still slow tho.

    /Flibble
    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Bonita Montero@Bonita.Montero@gmail.com to comp.lang.c++ on Sun Mar 2 17:26:18 2025
    From Newsgroup: comp.lang.c++

    Am 02.03.2025 um 17:22 schrieb Mr Flibble:
    On Sun, 2 Mar 2025 17:10:37 +0100, Bonita Montero
    <Bonita.Montero@gmail.com> wrote:

    This is my code, improved by the _udiv128-intrinsic of MSVC which
    provides a 128 / 64 division. With that my algorithm becomes nearly
    three times as fast as before. I'll provide a g++ / clang++ compatible
    version with inline-assembly later.

    Still slow tho.

    With MSVC and the fairer random numbers I chose I'm 2.5 times
    faster.

    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Michael S@already5chosen@yahoo.com to comp.lang.c++ on Sun Mar 2 18:54:14 2025
    From Newsgroup: comp.lang.c++

    On Sat, 1 Mar 2025 22:55:59 +0200
    Michael S <already5chosen@yahoo.com> wrote:

    On Mon, 24 Feb 2025 11:48:08 +0100
    Bonita Montero <Bonita.Montero@gmail.com> wrote:

    I wanted to optimize fmod to be a bit faster. This is my C++20
    solution.


    How about that?
    Pay attention, it's C rather than C++. So 5 times shorter :-)
    It's not the fastest for big x/y ratios, but rather simple and not
    *too* slow. At least as long as hardware supports FMA.
    For small x/y ratios it should be pretty close to best possible.


    I didn't RTFM about signbit() and didn't check with compilers others
    than clang, so didn't pay attention to the bug until testing with
    MSVC. At the same opportunity I looked at MSVC-generated asm and found
    out that it does not inline trunc() and copysign(). So, I changed the
    code to please MSVC's idiosyncrasies, replacing trunc() with floor() and
    using if () instead of copysign(). Fortunately, the changes didn't make
    clang/gcc compiled code any slower.

    Here is hopefully correct version:

    #include <math.h>
    #include <fenv.h>

    double my_fmod(double x, double y)
    {
    if (isnan(x))
    return x;

    // pre-process y
    if (y < 0)
    y = -y;
    else if (y > 0)
    ;
    else {
    if (isnan(y))
    return y;
    // y==0
    feraiseexcept(FE_INVALID);
    return NAN;
    }

    // y in (0:+inf]
    double ax = fabs(x);

    // Quick path
    double xx = ax * 0x1p-53;
    if (xx < y) {
    // among other things, x guaranteed to be finite
    if (ax < y)
    return x; // case y=+-inf covered here
    double d = floor(ax/y);
    double res = fma(-y, d, ax);
    if (res < 0) {
    // overshoot because of unfortunate division rounding
    // it is extremely rare for small x/y,
    // but not rare when x/y is close to 2**53
    res = fma(-y, d-1, ax);
    }
    if (x < 0)
    res = -res;
    return res;
    }

    // slow path
    if (isinf(x)) {
    feraiseexcept(FE_INVALID);
    return NAN;
    }

    int oldRnd = fegetround();
    fesetround(FE_TOWARDZERO);

    do {
    double yy = y;
    while (yy < ax * 0x1p-1022)
    yy *= 0x1p1021;

    do
    ax = fma(-yy, floor(ax/yy), ax);
    while (ax >= yy);

    } while (ax >= y);

    if (x < 0)
    ax = -ax;
    fesetround(oldRnd);
    return ax;
    }


    The behaviour w.r.t. raising FE_INVALID when either x or y is signaling
    NaN could differ from library implementation, but as far as I understand
    both raising and not raising exception in this case is legal.

    This code is between 15-20% slower than standard library for clang and
    gcc, and 0 to 15% faster than standard library for MSVC, so of no
    practical interest.

    Still, it is better than Bonita's in all possible circumstances.

    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Bonita Montero@Bonita.Montero@gmail.com to comp.lang.c++ on Sun Mar 2 17:59:14 2025
    From Newsgroup: comp.lang.c++

    Am 02.03.2025 um 17:54 schrieb Michael S:

    Still, it is better than Bonita's in all possible circumstances.

    Not better than my latest code with MSVC.

    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Michael S@already5chosen@yahoo.com to comp.lang.c++ on Sun Mar 2 19:07:59 2025
    From Newsgroup: comp.lang.c++

    On Sun, 2 Mar 2025 17:26:18 +0100
    Bonita Montero <Bonita.Montero@gmail.com> wrote:


    With MSVC and the fairer random numbers I chose I'm 2.5 times
    faster.


    I don't trust your benchmarking skills.


    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Bonita Montero@Bonita.Montero@gmail.com to comp.lang.c++ on Sun Mar 2 18:09:32 2025
    From Newsgroup: comp.lang.c++

    Am 02.03.2025 um 18:07 schrieb Michael S:
    On Sun, 2 Mar 2025 17:26:18 +0100
    Bonita Montero <Bonita.Montero@gmail.com> wrote:


    With MSVC and the fairer random numbers I chose I'm 2.5 times
    faster.


    I don't trust your benchmarking skills.

    I don't prefer your fast-path value combinations but I chose 75% random
    finite combinations.

    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Bonita Montero@Bonita.Montero@gmail.com to comp.lang.c++ on Sun Mar 2 18:11:14 2025
    From Newsgroup: comp.lang.c++

    Am 02.03.2025 um 18:09 schrieb Bonita Montero:
    Am 02.03.2025 um 18:07 schrieb Michael S:
    On Sun, 2 Mar 2025 17:26:18 +0100
    Bonita Montero <Bonita.Montero@gmail.com> wrote:


    With MSVC and the fairer random numbers I chose I'm 2.5 times
    faster.


    I don't trust your benchmarking skills.

    I don't prefer your fast-path value combinations but I chose 75% random finite combinations.


    This generates the random values for me:

    mt19937_64 mt;
    uniform_int_distribution<uint64_t>
    genType( 0, 15 ),
    genFinite( 0x0010000000000000u, 0x7FEFFFFFFFFFFFFFu ),
    genDen( 1, 0x000FFFFFFFFFFFFFu ),
    genNaN( 0x7FF0000000000001u, 0x7FFFFFFFFFFFFFFFu );
    auto get = [&]()
    {
    constexpr uint64_t
    FINITE_THRESH = 4, // 75% finites
    ZERO = 3, // 6.25% zeroes
    DENORMALS = 2, // 6.25% denormals
    INF = 1, // 6.25% Infs
    NAN_ = 0; // 6.25% NaNs
    uint64_t
    sign = mt() & - numeric_limits<int64_t>::min(),
    type = genType( mt );
    if( type >= FINITE_THRESH )
    return bit_cast<double>( sign | genFinite( mt ) );
    if( type == ZERO )
    return bit_cast<double>( sign );
    if( type == DENORMALS )
    return bit_cast<double>( sign | genDen( mt ) );
    if( type == INF )
    return bit_cast<double>( sign | 0x7FF0000000000000u );
    assert(type == NAN_);
    return bit_cast<double>( sign | genNaN( mt ) );
    };

    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Michael S@already5chosen@yahoo.com to comp.lang.c++ on Sun Mar 2 19:37:16 2025
    From Newsgroup: comp.lang.c++

    On Sun, 02 Mar 2025 16:22:20 +0000
    Mr Flibble <leigh@i42.co.uk> wrote:
    On Sun, 2 Mar 2025 17:10:37 +0100, Bonita Montero
    <Bonita.Montero@gmail.com> wrote:

    This is my code, improved by the _udiv128-intrinsic of MSVC which
    provides a 128 / 64 division. With that my algorithm becomes nearly
    three times as fast as before. I'll provide a g++ / clang++ compatible
    version with inline-assembly later.

    Still slow tho.

    /Flibble
    The truth is that relative speed of FP vs Integer algorithms depends on specific CPU that one is using for measurements.
    I measured on relatively old CPU - Intel Skylake. On this CPU integer
    division is very significantly slower than floating-point division.
    On newer CPUs, like Intel IceLake/Tiger Lake and Alder Lake or AMD Zen
    3/4/5 and even more so on Apple M-series the difference in speed
    between floating-point and integer division is less significant, and
    in few cases integer division is even faster, so in theory Bonita's code
    could be more competitive.
    From Agner Fog's tables:
    Arch DIVSD DIV r64
    Skylake 13-14 35-88
    IceLake 13-14 15
    Alder Lake 14 10
    Zen3 13.5 9-17
    My problem is that because of Bonita's horrible coding style I am not
    even trying to understand what's is going on within his/her code.
    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Bonita Montero@Bonita.Montero@gmail.com to comp.lang.c++ on Sun Mar 2 18:41:58 2025
    From Newsgroup: comp.lang.c++

    Am 02.03.2025 um 18:37 schrieb Michael S:

    The truth is that relative speed of FP vs Integer algorithms depends on specific CPU that one is using for measurements.
    I measured on relatively old CPU - Intel Skylake. On this CPU integer division is very significantly slower than floating-point division.
    On newer CPUs, like Intel IceLake/Tiger Lake and Alder Lake or AMD Zen
    3/4/5 and even more so on Apple M-series the difference in speed
    between floating-point and integer division is less significant, and
    in few cases integer division is even faster, so in theory Bonita's code could be more competitive.

    From Agner Fog's tables:
    Arch DIVSD DIV r64
    Skylake 13-14 35-88
    IceLake 13-14 15
    Alder Lake 14 10
    Zen3 13.5 9-17

    On my Zen4-CPU your code is slightly faster than my initial code with
    the shown selection of random values (which don't prefer any fast path).
    My current code is on my CPU with MSVC 2.4 times faster if I chose the
    values somewhat different with only finites and no denormals, infs, nans
    or zeroes.

    My problem is that because of Bonita's horrible coding style I am not
    even trying to understand what's is going on within his/her code.

    Your coding-style is horrible. Mine is "too beautiful" (my employer).

    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Michael S@already5chosen@yahoo.com to comp.lang.c++ on Sun Mar 2 19:52:21 2025
    From Newsgroup: comp.lang.c++

    On Sun, 2 Mar 2025 18:11:14 +0100
    Bonita Montero <Bonita.Montero@gmail.com> wrote:

    Am 02.03.2025 um 18:09 schrieb Bonita Montero:
    Am 02.03.2025 um 18:07 schrieb Michael S:
    On Sun, 2 Mar 2025 17:26:18 +0100
    Bonita Montero <Bonita.Montero@gmail.com> wrote:


    With MSVC and the fairer random numbers I chose I'm 2.5 times
    faster.


    I don't trust your benchmarking skills.

    I don't prefer your fast-path value combinations but I chose 75%
    random finite combinations.


    This generates the random values for me:

    mt19937_64 mt;
    uniform_int_distribution<uint64_t>
    genType( 0, 15 ),
    genFinite( 0x0010000000000000u, 0x7FEFFFFFFFFFFFFFu ),
    genDen( 1, 0x000FFFFFFFFFFFFFu ),
    genNaN( 0x7FF0000000000001u, 0x7FFFFFFFFFFFFFFFu );
    auto get = [&]()
    {
    constexpr uint64_t
    FINITE_THRESH = 4, // 75% finites
    ZERO = 3, // 6.25% zeroes
    DENORMALS = 2, // 6.25% denormals
    INF = 1, // 6.25% Infs
    NAN_ = 0; // 6.25% NaNs
    uint64_t
    sign = mt() & -
    numeric_limits<int64_t>::min(), type = genType( mt );
    if( type >= FINITE_THRESH )
    return bit_cast<double>( sign | genFinite( mt
    ) ); if( type == ZERO )
    return bit_cast<double>( sign );
    if( type == DENORMALS )
    return bit_cast<double>( sign | genDen( mt )
    ); if( type == INF )
    return bit_cast<double>( sign |
    0x7FF0000000000000u ); assert(type == NAN_);
    return bit_cast<double>( sign | genNaN( mt ) );
    };


    Your distribution is very different from what one would expect in
    real-world usage.
    In real-world usage apart from debugging stage there are no inf, nan or
    y=zero. x=zero happens, but with lower probability than 6%. Denormals
    also happen, but with even lower probability than x=zero. Also in
    majority of real-world scenarios huge x/y ratios either do not happen at
    all or are extremely rare.






    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Bonita Montero@Bonita.Montero@gmail.com to comp.lang.c++ on Sun Mar 2 18:55:22 2025
    From Newsgroup: comp.lang.c++

    Am 02.03.2025 um 18:52 schrieb Michael S:

    Your distribution is very different from what one would expect in
    real-world usage.

    There's no real-world distribution, so I chose all finites to be
    equally likely.

    In real-world usage apart from debugging stage there are no inf, nan or y=zero. x=zero happens, but with lower probability that 6%. Denormals
    also happen, but with even lower probability than x=zero.

    As I said if I chose 100% finites from the 1 to 0x7FEFFFFFFFFFFFFFu
    range I'm still 2.4 times faster.

    Also in majority of real-world scenarios huge x/y ratios either not happen at all or are extremely rare.







    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Michael S@already5chosen@yahoo.com to comp.lang.c++ on Sun Mar 2 20:00:05 2025
    From Newsgroup: comp.lang.c++

    On Sun, 2 Mar 2025 18:41:58 +0100
    Bonita Montero <Bonita.Montero@gmail.com> wrote:


    Your coding-style is horrible.

    Even with minimal comments it's not too hard to understand what's going
    on within my code. With few more comments it could become completely
    clear.
    Of course, floating-point algorithm is inherently simpler. That helps.

    Mine is "too beautiful" (my employer).


    It sounds like your employer agrees with me, but he expresses his
    thought in humoristic style.

    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Bonita Montero@Bonita.Montero@gmail.com to comp.lang.c++ on Sun Mar 2 19:01:03 2025
    From Newsgroup: comp.lang.c++

    Am 02.03.2025 um 19:00 schrieb Michael S:

    Mine is "too beautiful" (my employer).

    It sounds like your employer agrees with me, but he expresses his
    thought in humoristic style.

    No, he likes my style.

    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Michael S@already5chosen@yahoo.com to comp.lang.c++ on Sun Mar 2 20:09:36 2025
    From Newsgroup: comp.lang.c++

    On Sun, 2 Mar 2025 18:55:22 +0100
    Bonita Montero <Bonita.Montero@gmail.com> wrote:

    Am 02.03.2025 um 18:52 schrieb Michael S:

    Your distribution is very different from what one would expect in real-world usage.

    There's no real world distibution, so I chose all finites to be
    equally likely.

    In real-world usage apart from debugging stage there are no inf,
    nan or y=zero. x=zero happens, but with lower probability that 6%. Denormals also happen, but with even lower probability than x=zero.


    As I said if I chose 100% finites from the 1 to 0x7FEFFFFFFFFFFFFFu
    range I'm still 2.4 times faster.

    Also in majority of real-world scenarios huge x/y ratios either not
    happen at all or are extremely rare.








    You didn't answer the second point, which is critical.
    In your fully random scenario 48.7% of cases are huge x/y. That is
    completely unrealistic.

    I can easily improve speed of huge x/y at cost of less simple code and
    of small slowdown of more typical case, but I consider it
    counterproductive. It seems, authors of standard libraries agree with
    my judgment.

    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Bonita Montero@Bonita.Montero@gmail.com to comp.lang.c++ on Sun Mar 2 19:14:15 2025
    From Newsgroup: comp.lang.c++

    Am 02.03.2025 um 19:09 schrieb Michael S:
    On Sun, 2 Mar 2025 18:55:22 +0100
    Bonita Montero <Bonita.Montero@gmail.com> wrote:

    Am 02.03.2025 um 18:52 schrieb Michael S:

    Your distribution is very different from what one would expect in
    real-world usage.

    There's no real world distibution, so I chose all finites to be
    equally likely.

    In real-world usage apart from debugging stage there are no inf,
    nan or y=zero. x=zero happens, but with lower probability that 6%.
    Denormals also happen, but with even lower probability than x=zero.


    As I said if I chose 100% finites from the 1 to 0x7FEFFFFFFFFFFFFFu
    range I'm still 2.4 times faster.

    Also in majority of real-world scenarios huge x/y ratios either not
    happen at all or are extremely rare.








    You didn't answer the second point, which is critical.
    In your fully random scenario 48.7% of cases are huge x/y. That is
    completely unrealistic.

    If I modify my genFinite that way:

    genFinite( 1, 0x433FFFFFFFFFFFFFu )

    So that there are no dropped digits before the decimal point,
    I'm still nearly twice as fast with my latest code.

    I can easily improve speed of huge x/y at cost of less simple code and
    of small slowdown of more typical case, but I consider it
    counterproductive. It seems, authors of standard libraries agree with
    my judgment.


    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Bonita Montero@Bonita.Montero@gmail.com to comp.lang.c++ on Sun Mar 2 19:57:30 2025
    From Newsgroup: comp.lang.c++

    Am 02.03.2025 um 19:09 schrieb Michael S:

    You didn't answer the second point, which is critical.
    In your fully random scenario 48.7% of cases are huge x/y. That is
    completely unrealistic.
    I can easily improve speed of huge x/y at cost of less simple code and
    of small slowdown of more typical case, but I consider it
    counterproductive. It seems, authors of standard libraries agree with
    my judgment.

    And you use fesetround, which takes about 40 clock cycles on my
    CPU under Linux (WSL2). Better choose _mm_getcsr() and _mm_setcsr()
    for that, which directly sets the FPU control word for SSE / AVX*
    / AVX-512. This is multiple times faster. For the x87-FPU you'd
    have to choose different code, but the x87-FPU is totally broken
    anyway.

    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Michael S@already5chosen@yahoo.com to comp.lang.c++ on Sun Mar 2 22:58:45 2025
    From Newsgroup: comp.lang.c++

    On Sun, 2 Mar 2025 19:57:30 +0100
    Bonita Montero <Bonita.Montero@gmail.com> wrote:

    Am 02.03.2025 um 19:09 schrieb Michael S:

    You didn't answer the second point, which is critical.
    In your fully random scenario 48.7% of cases are huge x/y. That is completely unrealistic.
    I can easily improve speed of huge x/y at cost of less simple code
    and of small slowdown of more typical case, but I consider it counterproductive. It seems, authors of standard libraries agree
    with my judgment.

    And you use fesetround, which takes about 40 clock cycles on my
    CPU under Linux (WSL2). Better chose _mm_getcsr() and _mm_setcsr()
    for that, which directly sets the FPU control word for SSE / AVX*
    / AVX-512. This is multiple times faster. For the x87-FPU you'd
    have to chose different code, but the x87-FPU is totally broken
    anywax.


    If it was on the fast path, I'd consider it.
    But improving speed of unimportant slow path at cost of portability?
    Nah.


    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Michael S@already5chosen@yahoo.com to comp.lang.c++ on Mon Mar 3 02:01:25 2025
    From Newsgroup: comp.lang.c++

    On Sun, 2 Mar 2025 18:55:22 +0100
    Bonita Montero <Bonita.Montero@gmail.com> wrote:

    Am 02.03.2025 um 18:52 schrieb Michael S:

    Your distribution is very different from what one would expect in real-world usage.

    There's no real world distibution, so I chose all finites to be
    equally likely.

    In real-world usage apart from debugging stage there are no inf,
    nan or y=zero. x=zero happens, but with lower probability that 6%. Denormals also happen, but with even lower probability than x=zero.


    As I said if I chose 100% finites from the 1 to 0x7FEFFFFFFFFFFFFFu
    range I'm still 2.4 times faster.


    On my CPU [for huge ratios] it is indeed faster than your previous
    attempt, but still 1.25x slower than standard library. And for
    non-huge ratios there is no improvement - still 3.3 times slower than
    standard library.


    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Bonita Montero@Bonita.Montero@gmail.com to comp.lang.c++ on Mon Mar 3 12:52:51 2025
    From Newsgroup: comp.lang.c++

    Am 02.03.2025 um 21:58 schrieb Michael S:

    If it was on the fast path, I'd consider it.
    But improving speed of unimportant slow path at cost of portability?
    Nah.

    For the 75% random finites case I've shown your code becomes about
    28% faster with _mm_getcsr() and _mm_setcsr().
    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Michael S@already5chosen@yahoo.com to comp.lang.c++ on Mon Mar 3 18:10:08 2025
    From Newsgroup: comp.lang.c++

    On Mon, 3 Mar 2025 12:52:51 +0100
    Bonita Montero <Bonita.Montero@gmail.com> wrote:

    Am 02.03.2025 um 21:58 schrieb Michael S:

    If it was on the fast path, I'd consider it.
    But improving speed of unimportant slow path at cost of portability?
    Nah.

    For the 75% random finites case I've shown your code becomes about
    28% faster with _mm_getcsr() and _mm_setcsr().

    It seems that major slowdown is specific to combination of msys2
    libraries with Zen3/4 CPU.
    I see even worse slowness of get/set rounding mode on msys2/Zen3.
    The same msys-compiled binary on Intel CPUs is o.k., at least
    relatively to other heavy things going on on the slow path.
    On Zen3 with Microsoft's compiler/library it is also o.k.

    As long as it only affects the slow path there is nothing to get
    agitated about.




    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Michael S@already5chosen@yahoo.com to comp.lang.c++ on Mon Mar 3 18:20:09 2025
    From Newsgroup: comp.lang.c++

    On Sun, 2 Mar 2025 19:57:30 +0100
    Bonita Montero <Bonita.Montero@gmail.com> wrote:

    For the x87-FPU you'd
    have to chose different code, but the x87-FPU is totally broken
    anywax.


    x87 is not broken relatively to its own specifications. It just happens
    to be slightly different from IEEE-754 specifications. Which is not
    surprising considering that it predates IEEE-754 Standard by several
    years.
    Today there are very few reasons to still use x87 in new software.
    However back in its time x87 was an astonishingly good piece of work,
    less so in performance (it was not fast, even by standards of its time)
    more so for features, precision and especially for consistency of its arithmetic.


    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Bonita Montero@Bonita.Montero@gmail.com to comp.lang.c++ on Mon Mar 3 17:30:39 2025
    From Newsgroup: comp.lang.c++

    Am 03.03.2025 um 17:20 schrieb Michael S:

    x87 is not broken relatively to its own specifications. It just happens
    to be slightly different from IEEE-754 specifications. Which is not surprising considering that it predates IEEE-754 Standard by several
    years.

    You can reduce the width of the mantissa to 53 or 24 bit, but the
    exponent is always 15 bit; that's not up to any specification.

    Today there are very few reasons to still use x87 in new software.
    However back in it's time x87 was an astonishingly good piece of work,
    less so in performance (it was not fast, even by standards of its time)
    more so for features, precision and especially for consistency of its arithmetic.

    There are compiler-settings which enforce consistency by storing values
    with reduced precision and re-loading them to give expectable results
    when you use values < long double. That's a mess.

    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Michael S@already5chosen@yahoo.com to comp.lang.c++ on Mon Mar 3 19:27:32 2025
    From Newsgroup: comp.lang.c++

    On Mon, 3 Mar 2025 17:30:39 +0100
    Bonita Montero <Bonita.Montero@gmail.com> wrote:

    Am 03.03.2025 um 17:20 schrieb Michael S:

    x87 is not broken relatively to its own specifications. It just
    happens to be slightly different from IEEE-754 specifications.
    Which is not surprising considering that it predates IEEE-754
    Standard by several years.

    You can reduce the width of the mantissa to 53 or 24 bit, but the
    exponent is always 15 bit; that's not up to any specification.


    That's up to x87 specification. Which predates IEEE-754.

    Today there are very few reasons to still use x87 in new software.
    However back in it's time x87 was an astonishingly good piece of
    work, less so in performance (it was not fast, even by standards of
    its time) more so for features, precision and especially for
    consistency of its arithmetic.

    There are compiler-settings which enforce consistency by storing
    values with reduced precision and re-loading them to give expectable
    results when you use values < long double. That's a mess.


    It's a mess only if you try to be very compatible with IEEE-754 specs.
    If you don't try to be compatible, you just enjoy higher precision
    and higher dynamic range for as long as you can. If you want it all the
    time, nothing prevents you from storing full 80-bit numbers in memory.
    Back in the era of 16-bit buses it was only 10-25% slower than storing
    64-bit results. For a full application difference in speed between
    80-bit and 64-bit precision was typically just few per cents.


    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Bonita Montero@Bonita.Montero@gmail.com to comp.lang.c++ on Mon Mar 3 18:39:51 2025
    From Newsgroup: comp.lang.c++

    Am 03.03.2025 um 18:27 schrieb Michael S:

    That's up to x87 specification. Which predates IEEE-754.

    It's dangerous. From the "Handbook of Floating Point Arithmetic".

    The dynamic rounding precision can introduce bugs in modern soft-
    ware, which is almost always made up of several components (dy-
    namic libraries, plug-ins). For instance, the following bug in
    Mozilla’s Javascript engine was discovered in 2006: if the rounding
    precision was reduced to single precision by a plug-in, then the
    js_dtoa function (double-to-string conversion) could overwrite
    memory, making the application behave erratically, e.g., crash.
    The cause was the loop exit condition being always false due to
    an unexpected floating-point error.

    So it can make it harder to write portable software.


    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Michael S@already5chosen@yahoo.com to comp.lang.c++ on Mon Mar 3 23:55:08 2025
    From Newsgroup: comp.lang.c++

    On Mon, 3 Mar 2025 18:10:08 +0200
    Michael S <already5chosen@yahoo.com> wrote:

    On Mon, 3 Mar 2025 12:52:51 +0100
    Bonita Montero <Bonita.Montero@gmail.com> wrote:

    Am 02.03.2025 um 21:58 schrieb Michael S:

    If it was on the fast path, I'd consider it.
    But improving speed of unimportant slow path at cost of
    portability? Nah.

    For the 75% random finites case I've shown your code becomes about
    28% faster with _mm_getcsr() and _mm_setcsr().

    It seems that major slowdown is specific to combination of msys2
    libraries with Zen3/4 CPU.
    I see even worse slowness of get/set rounding mode on msys2/Zen3.
    The same msys-compiled binary on Intel CPUs is o.k., at least
    relatively to other heavy things going on on the slow path.
    On Zen3 with Microsoft's compiler/library it is also o.k.

    As long as it only affects slow path there is nothing to agitated
    about.





    I can think about half a dozen of different ways of avoiding the need to
    change rounding. However most of them are boring. Only one is fun.

    #include <math.h>
    #include <fenv.h>

    double my_fmod(double x, double y)
    {
    if (isnan(x))
    return x;

    // pre-process y
    if (y < 0)
    y = -y;
    else if (y > 0)
    ;
    else {
    if (isnan(y))
    return y;
    // y==0
    feraiseexcept(FE_INVALID);
    return NAN;
    }

    // y in (0:+inf]
    double ax = fabs(x);

    // Quick path
    if (ax * 0x1p-53 < y) {
    // among other things, x guaranteed to be finite
    if (ax < y)
    return x; // case y=+-inf covered here
    double d = floor(ax/y);
    double res = fma(-y, d, ax);
    if (res < 0) {
    // overshoot because of unfortunate division rounding
    // it is extremely rare for small x/y,
    // but not rare when x/y is close to 2**53
    res += y;
    }
    if (x < 0)
    res = -res;
    return res;
    }

    // slow path
    if (isinf(x)) {
    feraiseexcept(FE_INVALID);
    return NAN;
    }

    int flipflop = 0;
    do {
    double yy = y;
    while (yy < ax * 0x1p-1022)
    yy *= 0x1p1021;

    do {
    ax = fma(-yy, floor(ax/yy), ax);
    flipflop ^= (ax < 0);
    ax = fabs(ax);
    } while (ax >= yy);
    } while (ax >= y);
    if (flipflop)
    ax = y - ax;
    if (x < 0)
    ax = -ax;
    return ax;
    }


    To my surprise, in case of insane x/y ratios it was faster than original
    not only on Zen3/Msys, but on Intel CPUs and MSVC platform as well.




    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Bonita Montero@Bonita.Montero@gmail.com to comp.lang.c++ on Fri Mar 7 09:51:24 2025
    From Newsgroup: comp.lang.c++

    Am 03.03.2025 um 17:10 schrieb Michael S:

    It seems that major slowdown is specific to combination of msys2
    libraries with Zen3/4 CPU.
    I see even worse slowness of get/set rounding mode on msys2/Zen3.
    The same msys-compiled binary on Intel CPUs is o.k., at least
    relatively to other heavy things going on on the slow path.
    On Zen3 with Microsoft's compiler/library it is also o.k.

    If I use _mm_setcsr() and _mm_getcsr() I can disable your fast path
    and I get the same performance. The Linux code with clang++-18 has
    about the same performance like my Windows-code with MSVC (... and
    this 128/64 -> 64:64 division).

    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Michael S@already5chosen@yahoo.com to comp.lang.c++ on Sun Mar 9 01:31:27 2025
    From Newsgroup: comp.lang.c++

    On Sun, 2 Mar 2025 17:10:37 +0100
    Bonita Montero <Bonita.Montero@gmail.com> wrote:

    This is my code, improved by the _udiv128-intrinsic of MSVC which
    provides a 128 / 64 division. With that my algorithm becomes nearly
    three times as fast as before. I'll provide a g++ / clang++ compatible
    version with inline-assembly later.

    template<bool _32 = false>
    double xMyFmod( double x, double y )
    {
    constexpr uint64_t
    SIGN = 1ull << 63,
    IMPLICIT = 1ull << 52,
    MANT = IMPLICIT - 1,
    QBIT = 1ull << 51;
    uint64_t const
    binX = bit_cast<uint64_t>( x ),
    binY = bit_cast<uint64_t>( y );
    static auto abs = []( uint64_t m ) { return m & ~SIGN; };
    auto isNaN = []( uint64_t m ) { return abs( m ) >=
    0x7FF0000000000001u; }; auto isSig = []( uint64_t m ) { return !(m &
    QBIT); }; if( isNaN( binX ) ) [[unlikely]] // x == NaN
    #if defined(_MSC_VER)
    return bit_cast<double>( isNaN( binY ) ? binY | binX
    & binY & QBIT : binX );
    #else
    {
    if( isSig( binX ) || isNaN( binY ) && isSig( binY ) ) [[unlikely]] feraiseexcept( FE_INVALID );
    return bit_cast<double>( binX | QBIT );
    }
    #endif
    auto isInf = []( uint64_t m ) { return abs( m ) ==
    0x7FF0000000000000u; }; if( isNaN( binY ) ) [[unlikely]] // x != NaN
    || y == NaN #if defined(_MSC_VER)
    {
    if constexpr( _32 )
    if( isInf( binX ) )
    feraiseexcept( FE_INVALID );
    return y;
    }
    #else
    {
    if( isSig( binY ) ) [[unlikely]]
    feraiseexcept( FE_INVALID );
    return bit_cast<double>( binY | QBIT );
    }
    #endif
    if( isInf( binX ) ) // x == Inf
    {
    feraiseexcept( FE_INVALID );
    #if defined(_MSC_VER)
    return bit_cast<double>( binX & ~MANT | QBIT );
    #else
    return -numeric_limits<double>::quiet_NaN();
    #endif
    }
    if( !abs( binY ) ) [[unlikely]] // y == 0
    {
    feraiseexcept( FE_INVALID );
    #if defined(_MSC_VER)
    return numeric_limits<double>::quiet_NaN();
    #else
    return -numeric_limits<double>::quiet_NaN();
    #endif
    }
    if( !abs( binX ) || isInf( binY ) ) [[unlikely]] // x == 0 ||
    y == Inf return x;
    auto exp = []( uint64_t b ) -> int { return b >> 52 & 0x7FF;
    }; int
    expX = exp( binX ),
    expY = exp( binY );
    auto mant = []( uint64_t b ) { return b & MANT; };
    uint64_t
    mantX = mant( binX ),
    mantY = mant( binY );
    int headBits = 11;
    static auto normalize = [&]( int &exp, uint64_t &mant )
    {
    unsigned shift = countl_zero( mant ) - headBits;
    mant <<= shift;
    exp -= shift;
    };
    auto build = []( int &exp, uint64_t &mant )
    {
    if( exp ) [[likely]]
    mant |= IMPLICIT;
    else
    {
    exp = 1;
    normalize( exp, mant );
    }
    };
    build( expX, mantX );
    build( expY, mantY );
    int
    tailX = countr_zero( mantX ),
    tailY = countr_zero( mantY ),
    tailBits = tailX <= tailY ? tailX : tailY;
    headBits += tailBits;
    mantX >>= tailBits;
    mantY >>= tailBits;
    uint64_t signX = binX & SIGN;
    int expDiff;
    #if defined(_MSC_VER)
    while( (expDiff = expX - expY) > 63 )
    {
    unsigned long long hi = mantX >> 1, lo = mantX << 63, remainder; (void)_udiv128( hi, lo, mantY, &remainder );
    expX -= 63;
    mantX = remainder;
    normalize( expX, mantX );
    }
    #endif
    while( (expDiff = expX - expY) > 0 )
    {
    unsigned bits = expDiff <= headBits ? expDiff :
    headBits; if( !(mantX = (mantX << bits) % mantY) ) [[unlikely]]
    return bit_cast<double>( signX );
    expX -= bits;
    normalize( expX, mantX );
    }
    if( !expDiff && mantX >= mantY ) [[unlikely]]
    if( (mantX -= mantY) ) [[likely]]
    normalize( expX, mantX );
    else
    return bit_cast<double>( signX );
    mantX <<= tailBits;
    mantY <<= tailBits;
    if( expX <= 0 ) [[unlikely]]
    {
    assert(expX >= -51);
    mantX = mantX >> (unsigned)(-expX + 1);
    expX = 0;
    }
    return bit_cast<double>( signX | (uint64_t)expX << 52 | mantX
    & MANT ); }

    double myFmod( double x, double y )
    {
    return xMyFmod( x, y );
    }

    inline float myFmod( float x, float y )
    {
    return (float)xMyFmod<true>( (double)x, (double)y );
    }


    This code does not work in plenty of cases. It seems, your test vectors
    have poor coverage.
    Try, for example, x=1.8037919852882307, y=2.22605637008665934e-194







    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Bonita Montero@Bonita.Montero@gmail.com to comp.lang.c++ on Sun Mar 9 07:41:26 2025
    From Newsgroup: comp.lang.c++

    Am 09.03.2025 um 00:31 schrieb Michael S:
    This code does not work in plenty of cases. It seems, your test vectors
    have poor coverage.
    Try, for example, x=1.8037919852882307, y=2.22605637008665934e-194

    cout << hexfloat << myFmod( 1.8037919852882307, 2.22605637008665934e-194 ) << endl;
    cout << hexfloat << fmod( 1.8037919852882307, 2.22605637008665934e-194 ) << endl;


    Prints the same result under Linux and Windows.

    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Michael S@already5chosen@yahoo.com to comp.lang.c++ on Sun Mar 9 11:08:01 2025
    From Newsgroup: comp.lang.c++

    On Sun, 9 Mar 2025 07:41:26 +0100
    Bonita Montero <Bonita.Montero@gmail.com> wrote:

    Am 09.03.2025 um 00:31 schrieb Michael S:
    This code does not work in plenty of cases. It seems, your test
    vectors have poor coverage.
    Try, for example, x=1.8037919852882307, y=2.22605637008665934e-194

    cout << hexfloat << myFmod( 1.8037919852882307, 2.22605637008665934e-194 ) << endl;
    cout << hexfloat << fmod( 1.8037919852882307,
    2.22605637008665934e-194 ) << endl;


    Prints the same result under Linux and Windows.


    What it prints under Linux is irrelevant. Under Linux it compiles your
    original version or its close equivalent that is slow, ugly, but not
    buggy.
    What it prints under Windows when compiled with clang or gcc is also
    irrelevant for the same reason.
    The bug is in the new code that is exposed only when compiled with MSVC compiler.

    -- foo.cpp
    #include <cstdint>
    #include <cassert>
    #include <cfenv>
    #include <limits>
    #include <bit>
    using namespace std;

    template<bool _32 = false>
    double xMyFmod( double x, double y )
    {
    constexpr uint64_t
    SIGN = 1ull << 63,
    IMPLICIT = 1ull << 52,
    MANT = IMPLICIT - 1,
    QBIT = 1ull << 51;
    uint64_t const
    binX = bit_cast<uint64_t>( x ),
    binY = bit_cast<uint64_t>( y );
    static auto abs = []( uint64_t m ) { return m & ~SIGN; };
    auto isNaN = []( uint64_t m ) { return abs( m ) >=
    0x7FF0000000000001u; }; auto isSig = []( uint64_t m ) { return !(m &
    QBIT); }; if( isNaN( binX ) ) [[unlikely]] // x == NaN
    #if defined(_MSC_VER)
    return bit_cast<double>( isNaN( binY ) ? binY | binX &
    binY & QBIT : binX );
    #else
    {
    if( isSig( binX ) || isNaN( binY ) && isSig( binY ) ) [[unlikely]] feraiseexcept( FE_INVALID );
    return bit_cast<double>( binX | QBIT );
    }
    #endif
    auto isInf = []( uint64_t m ) { return abs( m ) ==
    0x7FF0000000000000u; }; if( isNaN( binY ) ) [[unlikely]] // x != NaN ||
    y == NaN #if defined(_MSC_VER)
    {
    if constexpr( _32 )
    if( isInf( binX ) )
    feraiseexcept( FE_INVALID );
    return y;
    }
    #else
    {
    if( isSig( binY ) ) [[unlikely]]
    feraiseexcept( FE_INVALID );
    return bit_cast<double>( binY | QBIT );
    }
    #endif
    if( isInf( binX ) ) // x == Inf
    {
    feraiseexcept( FE_INVALID );
    #if defined(_MSC_VER)
    return bit_cast<double>( binX & ~MANT | QBIT );
    #else
    return -numeric_limits<double>::quiet_NaN();
    #endif
    }
    if( !abs( binY ) ) [[unlikely]] // y == 0
    {
    feraiseexcept( FE_INVALID );
    #if defined(_MSC_VER)
    return numeric_limits<double>::quiet_NaN();
    #else
    return -numeric_limits<double>::quiet_NaN();
    #endif
    }
    if( !abs( binX ) || isInf( binY ) ) [[unlikely]] // x == 0 || y
    == Inf return x;
    auto exp = []( uint64_t b ) -> int { return b >> 52 & 0x7FF; };
    int
    expX = exp( binX ),
    expY = exp( binY );
    auto mant = []( uint64_t b ) { return b & MANT; };
    uint64_t
    mantX = mant( binX ),
    mantY = mant( binY );
    int headBits = 11;
    static auto normalize = [&]( int &exp, uint64_t &mant )
    {
    unsigned shift = countl_zero( mant ) - headBits;
    mant <<= shift;
    exp -= shift;
    };
    auto build = []( int &exp, uint64_t &mant )
    {
    if( exp ) [[likely]]
    mant |= IMPLICIT;
    else
    {
    exp = 1;
    normalize( exp, mant );
    }
    };
    build( expX, mantX );
    build( expY, mantY );
    int
    tailX = countr_zero( mantX ),
    tailY = countr_zero( mantY ),
    tailBits = tailX <= tailY ? tailX : tailY;
    headBits += tailBits;
    mantX >>= tailBits;
    mantY >>= tailBits;
    uint64_t signX = binX & SIGN;
    int expDiff;
    #if defined(_MSC_VER)
    while( (expDiff = expX - expY) > 63 )
    {
    unsigned long long hi = mantX >> 1, lo = mantX << 63, remainder; (void)_udiv128( hi, lo, mantY, &remainder );
    expX -= 63;
    mantX = remainder;
    normalize( expX, mantX );
    }
    #endif
    while( (expDiff = expX - expY) > 0 )
    {
    unsigned bits = expDiff <= headBits ? expDiff :
    headBits; if( !(mantX = (mantX << bits) % mantY) ) [[unlikely]]
    return bit_cast<double>( signX );
    expX -= bits;
    normalize( expX, mantX );
    }
    if( !expDiff && mantX >= mantY ) [[unlikely]]
    if( (mantX -= mantY) ) [[likely]]
    normalize( expX, mantX );
    else
    return bit_cast<double>( signX );
    mantX <<= tailBits;
    mantY <<= tailBits;
    if( expX <= 0 ) [[unlikely]]
    {
    assert(expX >= -51);
    mantX = mantX >> (unsigned)(-expX + 1);
    expX = 0;
    }
    return bit_cast<double>( signX | (uint64_t)expX << 52 | mantX &
    MANT ); }

    double myFmod( double x, double y )
    {
    return xMyFmod( x, y );
    }

    inline float myFmod( float x, float y )
    {
    return (float)xMyFmod<true>( (double)x, (double)y );
    }

    -- end of foo.cpp


    -- bar.cpp
    #include <iostream>
    using namespace std;

    double myFmod( double x, double y );

    int main()
    {
    cout << hexfloat << myFmod(
    1.8037919852882307,
    2.22605637008665934e-194 ) << endl;
    cout << hexfloat << fmod( 1.8037919852882307,
    2.22605637008665934e-194 ) << endl;
    }

    -- end of bar.cpp


    W:\foobar>cl
    Microsoft (R) C/C++ Optimizing Compiler Version 19.41.34120 for x64
    Copyright (C) Microsoft Corporation. All rights reserved.

    usage: cl [ option... ] filename... [ /link linkoption... ]


    W:\foobar>cl -nologo -O2 -EHsc -std:c++20 foo.cpp bar.cpp
    foo.cpp
    bar.cpp
    Generating Code...

    W:\foobar>foo
    0x1.0000000000000p-696
    0x0.0000000000000p+0


    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Bonita Montero@Bonita.Montero@gmail.com to comp.lang.c++ on Sun Mar 9 10:31:02 2025
    From Newsgroup: comp.lang.c++

    This prints the same result (0.0) under Windows and Linux:

    double myFmod( double x, double y )
    {
    constexpr uint64_t
    SIGN = 1ull << 63,
    IMPLICIT = 1ull << 52,
    MANT = IMPLICIT - 1,
    QBIT = 1ull << 51;
    uint64_t const
    binX = bit_cast<uint64_t>( x ),
    binY = bit_cast<uint64_t>( y );
    static auto abs = []( uint64_t m ) { return m & ~SIGN; };
    auto isNaN = []( uint64_t m ) { return abs( m ) >= 0x7FF0000000000001u; };
    auto isSig = []( uint64_t m ) { return !(m & QBIT); };
    if( isNaN( binX ) ) [[unlikely]] // x == NaN
    #if defined(_MSC_VER)
    return bit_cast<double>( isNaN( binY ) ? binY | binX & binY & QBIT :
    binX );
    #else
    {
    if( isSig( binX ) || isNaN( binY ) && isSig( binY ) ) [[unlikely]]
    feraiseexcept( FE_INVALID );
    return bit_cast<double>( binX | QBIT );
    }
    #endif
    if( isNaN( binY ) ) [[unlikely]] // x != NaN || y == NaN
    #if defined(_MSC_VER)
    return y;
    #else
    {
    if( isSig( binY ) ) [[unlikely]]
    feraiseexcept( FE_INVALID );
    return bit_cast<double>( binY | QBIT );
    }
    #endif
    auto isInf = []( uint64_t m ) { return abs( m ) == 0x7FF0000000000000u; };
    if( isInf( binX ) ) // x == Inf
    {
    feraiseexcept( FE_INVALID );
    #if defined(_MSC_VER)
    return bit_cast<double>( binX & ~MANT | QBIT );
    #else
    return -numeric_limits<double>::quiet_NaN();
    #endif
    }
    if( !abs( binY ) ) [[unlikely]] // y == 0
    {
    feraiseexcept( FE_INVALID );
    #if defined(_MSC_VER)
    return numeric_limits<double>::quiet_NaN();
    #else
    return -numeric_limits<double>::quiet_NaN();
    #endif
    }
    if( !abs( binX ) || isInf( binY ) ) [[unlikely]] // x == 0 || y == Inf
    return x;
    auto exp = []( uint64_t b ) -> int { return b >> 52 & 0x7FF; };
    int
    expX = exp( binX ),
    expY = exp( binY );
    auto mant = []( uint64_t b ) { return b & MANT; };
    uint64_t
    mantX = mant( binX ),
    mantY = mant( binY );
    int headBits = 11;
    static auto normalize = [&]( int &exp, uint64_t &mant )
    {
    unsigned shift = countl_zero( mant ) - headBits;
    mant <<= shift;
    exp -= shift;
    };
    auto build = []( int &exp, uint64_t &mant )
    {
    if( exp ) [[likely]]
    mant |= IMPLICIT;
    else
    {
    exp = 1;
    normalize( exp, mant );
    }
    };
    build( expX, mantX );
    build( expY, mantY );
    int
    tailX = countr_zero( mantX ),
    tailY = countr_zero( mantY ),
    tailBits = tailX <= tailY ? tailX : tailY;
    mantX >>= tailBits;
    mantY >>= tailBits;
    headBits += tailBits;
    uint64_t signX = binX & SIGN;
    int expDiff;
    #if defined(_MSC_VER) && !defined(__llvm__) && defined(_M_X64)
    while( (expDiff = expX - expY) > 0 )
    {
    unsigned bits = expDiff <= 63 ? expDiff : 63;
    unsigned long long hi = mantX >> 64 - bits, lo = mantX << bits, remainder;
    (void)_udiv128( hi, lo, mantY, &remainder );
    if( !remainder ) [[unlikely]]
    return bit_cast<double>( signX );
    mantX = remainder;
    expX -= bits;
    normalize( expX, mantX );
    }
    #else
    while( (expDiff = expX - expY) > 0 )
    {
    unsigned bits = expDiff <= headBits ? expDiff : headBits;
    if( !(mantX = (mantX << bits) % mantY) ) [[unlikely]]
    return bit_cast<double>( signX );
    expX -= bits;
    normalize( expX, mantX );
    }
    #endif
    if( !expDiff && mantX >= mantY ) [[unlikely]]
    if( (mantX -= mantY) ) [[likely]]
    normalize( expX, mantX );
    else
    return bit_cast<double>( signX );
    mantX <<= tailBits;
    mantY <<= tailBits;
    if( expX <= 0 ) [[unlikely]]
    {
    assert(expX >= -51);
    mantX = mantX >> (unsigned)(-expX + 1);
    expX = 0;
    }
    return bit_cast<double>( signX | (uint64_t)expX << 52 | mantX & MANT ); }

    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Michael S@already5chosen@yahoo.com to comp.lang.c++ on Sun Mar 9 11:46:24 2025
    From Newsgroup: comp.lang.c++

    On Sun, 9 Mar 2025 10:31:02 +0100
    Bonita Montero <Bonita.Montero@gmail.com> wrote:

    This prints the same result (0.0) under Windows and Linux:


    I am no longer going to look at your code until you start posting full
    files, with all includes and using directives.

    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Bonita Montero@Bonita.Montero@gmail.com to comp.lang.c++ on Sun Mar 9 10:54:40 2025
    From Newsgroup: comp.lang.c++

    Am 09.03.2025 um 10:46 schrieb Michael S:

    This prints the same result (0.0) under Windows and Linux:

    I am no longer going to look at your code until you start posting full
    files, with all includes and using directives.

    You could simply replace the single function I've shown.

    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Michael S@already5chosen@yahoo.com to comp.lang.c++ on Sun Mar 9 12:09:31 2025
    From Newsgroup: comp.lang.c++

    On Sun, 9 Mar 2025 10:54:40 +0100
    Bonita Montero <Bonita.Montero@gmail.com> wrote:

    Am 09.03.2025 um 10:46 schrieb Michael S:

    This prints the same result (0.0) under Windows and Linux:

    I am no longer going to look at your code until you start posting
    full files, with all includes and using directives.

    You could simply replace the single function I've shown.


    I can. I don't want to do it.
    You want me to look at/test your code? You post full code.
    Simple, isn't it?




    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Bonita Montero@Bonita.Montero@gmail.com to comp.lang.c++ on Sun Mar 9 11:21:55 2025
    From Newsgroup: comp.lang.c++

    Am 09.03.2025 um 11:09 schrieb Michael S:
    On Sun, 9 Mar 2025 10:54:40 +0100
    Bonita Montero <Bonita.Montero@gmail.com> wrote:

    Am 09.03.2025 um 10:46 schrieb Michael S:

    This prints the same result (0.0) under Windows and Linux:

    I am no longer going to look at your code until you start posting
    full files, with all includes and using directives.

    You could simply replace the single function I've shown.


    I can. I don't want to do it.
    You want me to look at/test your code? You post full code.
    Simple, isn't it?

    I've read you don't trust my tests, so use your own with myFmod.

    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From wij@wyniijj5@gmail.com to comp.lang.c++ on Sun Mar 9 18:51:45 2025
    From Newsgroup: comp.lang.c++

    On Sun, 2025-03-09 at 10:31 +0100, Bonita Montero wrote:
    This prints the same result (0.0) under Windows and Linux:

    double myFmod( double x, double y )
    {
    constexpr uint64_t
    SIGN = 1ull << 63,
    IMPLICIT = 1ull << 52,
    MANT = IMPLICIT - 1,
    QBIT = 1ull << 51;
    uint64_t const
    binX = bit_cast<uint64_t>( x ),
    binY = bit_cast<uint64_t>( y );
    static auto abs = []( uint64_t m ) { return m & ~SIGN; };
    auto isNaN = []( uint64_t m ) { return abs( m ) >= 0x7FF0000000000001u; };
    auto isSig = []( uint64_t m ) { return !(m & QBIT); };
    if( isNaN( binX ) ) [[unlikely]] // x == NaN
    #if defined(_MSC_VER)
    return bit_cast<double>( isNaN( binY ) ? binY | binX & binY & QBIT :
    binX );
    #else
    {
    if( isSig( binX ) || isNaN( binY ) && isSig( binY ) ) [[unlikely]]
    feraiseexcept( FE_INVALID );
    return bit_cast<double>( binX | QBIT );
    }
    #endif
    if( isNaN( binY ) ) [[unlikely]] // x != NaN || y == NaN
    #if defined(_MSC_VER)
    return y;
    #else
    {
    if( isSig( binY ) ) [[unlikely]]
    feraiseexcept( FE_INVALID );
    return bit_cast<double>( binY | QBIT );
    }
    #endif
    auto isInf = []( uint64_t m ) { return abs( m ) == 0x7FF0000000000000u; };
    if( isInf( binX ) ) // x == Inf
    {
    feraiseexcept( FE_INVALID );
    #if defined(_MSC_VER)
    return bit_cast<double>( binX & ~MANT | QBIT );
    #else
    return -numeric_limits<double>::quiet_NaN();
    #endif
    }
    if( !abs( binY ) ) [[unlikely]] // y == 0
    {
    feraiseexcept( FE_INVALID );
    #if defined(_MSC_VER)
    return numeric_limits<double>::quiet_NaN();
    #else
    return -numeric_limits<double>::quiet_NaN();
    #endif
    }
    if( !abs( binX ) || isInf( binY ) ) [[unlikely]] // x == 0 || y == Inf
    return x;
    auto exp = []( uint64_t b ) -> int { return b >> 52 & 0x7FF; };
    int
    expX = exp( binX ),
    expY = exp( binY );
    auto mant = []( uint64_t b ) { return b & MANT; };
    uint64_t
    mantX = mant( binX ),
    mantY = mant( binY );
    int headBits = 11;
    static auto normalize = [&]( int &exp, uint64_t &mant )
    {
    unsigned shift = countl_zero( mant ) - headBits;
    mant <<= shift;
    exp -= shift;
    };
    auto build = []( int &exp, uint64_t &mant )
    {
    if( exp ) [[likely]]
    mant |= IMPLICIT;
    else
    {
    exp = 1;
    normalize( exp, mant );
    }
    };
    build( expX, mantX );
    build( expY, mantY );
    int
    tailX = countr_zero( mantX ),
    tailY = countr_zero( mantY ),
    tailBits = tailX <= tailY ? tailX : tailY;
    mantX >>= tailBits;
    mantY >>= tailBits;
    headBits += tailBits;
    uint64_t signX = binX & SIGN;
    int expDiff;
    #if defined(_MSC_VER) && !defined(__llvm__) && defined(_M_X64)
    while( (expDiff = expX - expY) > 0 )
    {
    unsigned bits = expDiff <= 63 ? expDiff : 63;
    unsigned long long hi = mantX >> 64 - bits, lo = mantX << bits, remainder;
    (void)_udiv128( hi, lo, mantY, &remainder );
    if( !remainder ) [[unlikely]]
    return bit_cast<double>( signX );
    mantX = remainder;
    expX -= bits;
    normalize( expX, mantX );
    }
    #else
    while( (expDiff = expX - expY) > 0 )
    {
    unsigned bits = expDiff <= headBits ? expDiff : headBits;
    if( !(mantX = (mantX << bits) % mantY) ) [[unlikely]]
    return bit_cast<double>( signX );
    expX -= bits;
    normalize( expX, mantX );
    }
    #endif
    if( !expDiff && mantX >= mantY ) [[unlikely]]
    if( (mantX -= mantY) ) [[likely]]
    normalize( expX, mantX );
    else
    return bit_cast<double>( signX );
    mantX <<= tailBits;
    mantY <<= tailBits;
    if( expX <= 0 ) [[unlikely]]
    {
    assert(expX >= -51);
    mantX = mantX >> (unsigned)(-expX + 1);
    expX = 0;
    }
    return bit_cast<double>( signX | (uint64_t)expX << 52 | mantX & MANT ); }

    The basic problem is what the purpose of myFmod(double,double)? (maybe I missed something)
    From the view of implementing myFmod, I think using C-like coding style would be better,
    but all depending on what you want to achieve.
    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Bonita Montero@Bonita.Montero@gmail.com to comp.lang.c++ on Sun Mar 9 12:56:55 2025
    From Newsgroup: comp.lang.c++

    Am 09.03.2025 um 11:51 schrieb wij:

    From the view of implementing myFmod, I think using C-like coding style would be better,
    but all depending on what you want to achieve.

    A C coding style would result in about two times the code.
    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Michael S@already5chosen@yahoo.com to comp.lang.c++ on Sun Mar 9 14:09:32 2025
    From Newsgroup: comp.lang.c++

    On Sun, 9 Mar 2025 11:21:55 +0100
    Bonita Montero <Bonita.Montero@gmail.com> wrote:

    Am 09.03.2025 um 11:09 schrieb Michael S:
    On Sun, 9 Mar 2025 10:54:40 +0100
    Bonita Montero <Bonita.Montero@gmail.com> wrote:

    Am 09.03.2025 um 10:46 schrieb Michael S:

    This prints the same result (0.0) under Windows and Linux:

    I am no longer going to look at your code until you start posting
    full files, with all includes and using directives.

    You could simply replace the single function I've shown.


    I can. I don't want to do it.
    You want me to look at/test your code? You post full code.
    Simple, isn't it?

    I've read you don't trust my tests, so use your own with myFmod.


    ok. I was too curious :(
    This version produces correct results both when compiled under MSVC and
    when compiled with other compilers. It is a little faster too.
    With MSVC on old Intel CPU it is only 2.5 times slower than standard
    library in relevant range of x/y. Previous version was 3.4 times
    slower.
    With gcc and clang it is still more than 6 times slower than standard
    library.
    The coding style is now less insane.

    Measurements in nsec.
    First result - Intel Skylake at 4.25 GHz
    Second result - AMD Zen3 at 3.7 GHz

    abs(x/y) in range that matters [0.5:2**53]:
    Standard MSVC Library - 11.1 10.4
    Standard gnu Library - 5.4 10.7
    Yours (MSVc) - 27.6 11.5
    Yours (gcc) - 36.4 23.7
    Yours (clang) - 37.4 24.3

    abs(x/y) in full range [2**-2090:2**2090]:
    Standard MSVC Library - 109.4 153.5
    Standard glib Library - 102.3 155.5
    Yours (MSVc) - 134.9 52.6
    Yours (gcc) - 284.7 151.8
    Yours (clang) - 285.2 156.5





    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Michael S@already5chosen@yahoo.com to comp.lang.c++ on Sun Mar 9 14:23:03 2025
    From Newsgroup: comp.lang.c++

    On Sun, 9 Mar 2025 12:56:55 +0100
    Bonita Montero <Bonita.Montero@gmail.com> wrote:

    Am 09.03.2025 um 11:51 schrieb wij:

    From the view of implementing myFmod, I think using C-like coding
    style would be better, but all depending on what you want to
    achieve.

    A C coding style would result in about two times the code.

    So far all we had see from you is 2-3 times longer than C code (real C,
    not C-style C++) that I posted here few days ago. And my code had more
    comments than yours, so difference in code itself is even bigger.

    Yes, part of it is because my algorithm is simpler. But that is only
    part.

    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Bonita Montero@Bonita.Montero@gmail.com to comp.lang.c++ on Sun Mar 9 13:39:40 2025
    From Newsgroup: comp.lang.c++

    Am 09.03.2025 um 13:23 schrieb Michael S:

    So far all we had see from you is 2-3 times longer than C code (real C,
    not C-style C++) ...

    Not true since I save a lot of redundant-code with [&]-lambdas.
    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Michael S@already5chosen@yahoo.com to comp.lang.c++ on Sun Mar 9 14:56:25 2025
    From Newsgroup: comp.lang.c++

    On Sun, 9 Mar 2025 13:39:40 +0100
    Bonita Montero <Bonita.Montero@gmail.com> wrote:

    Am 09.03.2025 um 13:23 schrieb Michael S:

    So far all we had see from you is 2-3 times longer than C code
    (real C, not C-style C++) ...

    Not true since I save a lot of redundant-code with [&]-lambdas.

    Every heard of wc? It does not lie.

    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Mr Flibble@leigh@i42.co.uk to comp.lang.c++ on Sun Mar 9 15:26:29 2025
    From Newsgroup: comp.lang.c++

    On Sun, 9 Mar 2025 14:09:32 +0200, Michael S
    <already5chosen@yahoo.com> wrote:

    On Sun, 9 Mar 2025 11:21:55 +0100
    Bonita Montero <Bonita.Montero@gmail.com> wrote:

    Am 09.03.2025 um 11:09 schrieb Michael S:
    On Sun, 9 Mar 2025 10:54:40 +0100
    Bonita Montero <Bonita.Montero@gmail.com> wrote:

    Am 09.03.2025 um 10:46 schrieb Michael S:

    This prints the same result (0.0) under Windows and Linux:

    I am no longer going to look at your code until you start posting
    full files, with all includes and using directives.

    You could simply replace the single function I've shown.


    I can. I don't want to do it.
    You want me to look at/test your code? You post full code.
    Simple, isn't it?

    I've read you don't trust my tests, so use your own with myFmod.


    ok. I was too curios :(
    This version produces correct results both when compiled under MSVC and
    when compiled with other compilers. It is a little faster too.
    With MSVC on old Intel CPU it is only 2.5 times slower than standard
    library in relevant range of x/y. Previous version was 3.4 times
    slower.
    With gcc and clang it is still more than 6 times slower than standard >library.
    The coding style is now less insane.

    Measurements in nsec.
    First result - Intel Skylake at 4.25 GHz
    Second result - AMD Zen3 at 3.7 GHz

    abs(x/y) in range that matters [0.5:2**53]:
    Standard MSVC Library - 11.1 10.4
    Standard gnu Library - 5.4 10.7
    Yours (MSVc) - 27.6 11.5
    Yours (gcc) - 36.4 23.7
    Yours (clang) - 37.4 24.3

    abs(x/y) in full range [2**-2090:2**2090]:
    Standard MSVC Library - 109.4 153.5
    Standard glib Library - 102.3 155.5
    Yours (MSVc) - 134.9 52.6
    Yours (gcc) - 284.7 151.8
    Yours (clang) - 285.2 156.5

    So it is slow ergo a pointless alternative to what we already have.

    /Flibble
    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Bonita Montero@Bonita.Montero@gmail.com to comp.lang.c++ on Sun Mar 9 17:02:32 2025
    From Newsgroup: comp.lang.c++

    Am 09.03.2025 um 16:26 schrieb Mr Flibble:

    So it is slow ergo a pointless alternative to what we already have.

    glibc does it nearly in the same way I do it because the FMA-solution
    isn't portable. If fma( a, b, c ) is substituted with a * b + c because
    there's no proper CPU-instruction the whole issue doesn't work.
    And with support for _udiv128 my solution has about the same performance
    like the Michael's solution with clang++ 18.1.7.

    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Bonita Montero@Bonita.Montero@gmail.com to comp.lang.c++ on Sun Mar 9 17:23:26 2025
    From Newsgroup: comp.lang.c++

    Am 09.03.2025 um 13:09 schrieb Michael S:

    Measurements in nsec.
    First result - Intel Skylake at 4.25 GHz
    Second result - AMD Zen3 at 3.7 GHz

    abs(x/y) in range that matters [0.5:2**53]:
    Standard MSVC Library - 11.1 10.4
    Standard gnu Library - 5.4 10.7
    Yours (MSVc) - 27.6 11.5
    Yours (gcc) - 36.4 23.7
    Yours (clang) - 37.4 24.3

    abs(x/y) in full range [2**-2090:2**2090]:
    Standard MSVC Library - 109.4 153.5
    Standard glib Library - 102.3 155.5
    Yours (MSVc) - 134.9 52.6
    Yours (gcc) - 284.7 151.8
    Yours (clang) - 285.2 156.5

    With MSVC and an arbitrary combination of finite x and y on my
    Zen4-machine:

    your fmod: 77.1214
    my: 38.4486

    With MSVC and an arbitrary combination of finite x with exponents
    ranging from 0x3FF to 0x433 (close exponents) on my Zen4-machine:

    your fmod: 23.6423
    my: 9.79146

    This is a nearly proper implementation of your idea with FMA-intrinsics
    and SSE/AVX control register access:

    double fmody( double x, double y )
    {
    if( isnan( x ) ) [[unlikely]]
    return x;
    if( isnan( y ) ) [[unlikely]]
    return y;
    if( isinf( x ) || !y ) [[unlikely]]
    {
    feraiseexcept( FE_INVALID );
    return numeric_limits<double>::quiet_NaN();
    }
    if( !x || isinf( y ) ) [[unlikely]]
    return x;
    uint64_t sign = bit_cast<uint64_t>( x ) & numeric_limits<int64_t>::min();
    x = abs( x );
    y = abs( y );
    int oldCsr = _mm_getcsr();
    constexpr int CHOP = 0x6000;
    _mm_setcsr( oldCsr | CHOP );
    constexpr uint64_t
    EXP = -(1ll << 52),
    MANT = ~EXP;
    uint64_t binY = bit_cast<uint64_t>( y );
    int64_t expY = binY & EXP;
    if( !expY ) [[unlikely]]
    expY = (uint64_t)(0 - (countl_zero( binY & MANT ) - 12)) << 52;
    while( x >= y )
    {
    uint64_t yExpAdd = 0;
    double div = x / y;
    if( div < 0x1.FFFFFFFFFFFFFp+1023 ) [[likely]]
    div = xtrunc( div );
    else
    {
    uint64_t
    binX = bit_cast<uint64_t>( x ),
    newExp = expY + (54ull << 52);
    yExpAdd = (binX & EXP) - newExp;
    div = xtrunc( bit_cast<double>( newExp | binX & MANT ) / y );
    }
    __m128d mult1, mult2, add;
    #if defined(_MSC_VER)
    mult1.m128d_f64[0] = div;
    mult2.m128d_f64[0] = -bit_cast<double>( binY + yExpAdd );
    add.m128d_f64[0] = x;
    x = _mm_fmadd_sd( mult1, mult2, add ).m128d_f64[0];
    #else
    mult1[0] = div;
    mult2[0] = -bit_cast<double>( binY + yExpAdd );
    add[0] = x;
    x = _mm_fmadd_sd( mult1, mult2, add )[0];
    #endif
    if( !x ) [[unlikely]]
    return bit_cast<double>( sign );
    }
    _mm_setcsr( oldCsr );
    return bit_cast<double>( sign | bit_cast<uint64_t>( x ) );
    }

    The only thing that doesn't work currently is the support for denormal
    values.

    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Michael S@already5chosen@yahoo.com to comp.lang.c++ on Sun Mar 9 19:55:21 2025
    From Newsgroup: comp.lang.c++

    On Sun, 9 Mar 2025 17:02:32 +0100
    Bonita Montero <Bonita.Montero@gmail.com> wrote:

    Am 09.03.2025 um 16:26 schrieb Mr Flibble:

    So it is slow ergo a pointless alternative to what we already have.


    glibc does it nearly in the same way I do it because the FMA-solution
    isn't portable.
    If fma( a, b, c ) is substituted with a * b + c
    because there's no proper CPU-instruction the whole issue doesn't
    work.

    FMA solution is portable to any standard-complaint C environment.
    By standard, it has to work regardless of presence of absence of
    FMA hardware.
    In absence of FMA hardware it is expected to be rather slow, but still
    has to produce correct results.

    Unfortunately, Microsoft's library implementation of fma() does not work correctly on non-FMA hardware. In case when x*y and z have different
    signs it fairs somewhat better than when they have the same sign, but
    still not good enough.
    So it goes ):
    Since under msys2 both gcc and clang rely on Microsoft's library, they
    also do not work correctly on non-FMA CPUs.

    It would be interesting to test if glibc is better in that regard. Unfortunately, right now I have no access to any glibc-based system
    that runs on sufficiently old hardware.

    Anyway, despite all their problems, fma() based solutions are written
    in standard C language. The same can't be said about solutions that use
    any variant of 128-bit integer arithmetic, either in form of Gnu
    __int128 or in form of Microsoft's intrinsic functions.
    I think that an absence of standardized 128-bit integer math in both C
    and C++ sucks, but I can not change the fact.

    And with support for _udiv128 my solution has about the same
    performance like the Michael's solution with clang++ 18.1.7.


    I readily admitted that from practical perspective all 3 solutions
    that I posted here (one of each is incorrect) are pointless.

    I do have variants that are usefully faster than Standard library for
    relevant x/y ratios, but I didn't post them here.
    They achieve the speed boost via direct manipulation of binary64 bit
    patterns. I am not sure that such solutions are on topic in c.l.c++.

    Still, what you say about relative speed is true only on newer CPUs and
    only when compiled with MSVC. On Older CPUs, like Intel Skylake, which
    is pretty fast CPU on absolute scale and which still constitutes big
    portion of installed base, your code is significantly slower than mine.
    Same for newer CPUs with compilers that do not support native long
    division.

    Here is comparison vs code that I posted here at 2025-03-03 18:10:08
    +0200:


    Measurements in nsec.
    First result - Intel Skylake at 4.25 GHz
    Second result - AMD Zen3 at 3.7 GHz

    abs(x/y) in range that matters [0.5:2**53]:
    Standard MSVC Library - 11.1 10.4
    Standard gnu Library - 5.4 10.7
    Yours (MSVc) - 27.6 11.5
    Yours (gcc) - 36.4 23.7
    Yours (clang) - 37.4 24.3
    my (MSVc) - 10.7 11.3
    my (gcc) - 7.7 7.6
    my (clang) - 6.3 7.5

    abs(x/y) in full range [2**-2090:2**2090]:
    Standard MSVC Library - 109.4 153.5
    Standard glib Library - 102.3 155.5
    Yours (MSVc) - 134.9 52.6
    Yours (gcc) - 284.7 151.8
    Yours (clang) - 285.2 156.5
    my (MSVc) - 62.1 61.1
    my (gcc) - 60.8 59.1
    my (clang) - 59.9 59.3





    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Michael S@already5chosen@yahoo.com to comp.lang.c++ on Sun Mar 9 20:04:07 2025
    From Newsgroup: comp.lang.c++

    On Sun, 9 Mar 2025 17:23:26 +0100
    Bonita Montero <Bonita.Montero@gmail.com> wrote:

    Am 09.03.2025 um 13:09 schrieb Michael S:

    Measurements in nsec.
    First result - Intel Skylake at 4.25 GHz
    Second result - AMD Zen3 at 3.7 GHz

    abs(x/y) in range that matters [0.5:2**53]:
    Standard MSVC Library - 11.1 10.4
    Standard gnu Library - 5.4 10.7
    Yours (MSVc) - 27.6 11.5
    Yours (gcc) - 36.4 23.7
    Yours (clang) - 37.4 24.3

    abs(x/y) in full range [2**-2090:2**2090]:
    Standard MSVC Library - 109.4 153.5
    Standard glib Library - 102.3 155.5
    Yours (MSVc) - 134.9 52.6
    Yours (gcc) - 284.7 151.8
    Yours (clang) - 285.2 156.5

    With MSVC and an arbitrary combination of finite x and y on my
    Zen4-machine:

    your fmod: 77.1214
    my: 38.4486

    With MSVC and an arbitrary combination of finite x with exponents
    ranging from 0x3FF to 0x433 (close exponents) on my Zen4-machine:

    your fmod: 23.6423
    my: 9.79146



    It looks like you didn't pay attention to a "flipflop" version that I
    posted at 2025-03-03 18:10:08

    This is a nearly proper implementation of your idea with
    FMA-intrinsics and SSE/AVX control register access:

    double fmody( double x, double y )
    {
    if( isnan( x ) ) [[unlikely]]
    return x;
    if( isnan( y ) ) [[unlikely]]
    return y;
    if( isinf( x ) || !y ) [[unlikely]]
    {
    feraiseexcept( FE_INVALID );
    return numeric_limits<double>::quiet_NaN();
    }
    if( !x || isinf( y ) ) [[unlikely]]
    return x;
    uint64_t sign = bit_cast<uint64_t>( x ) & numeric_limits<int64_t>::min();
    x = abs( x );
    y = abs( y );
    int oldCsr = _mm_getcsr();
    constexpr int CHOP = 0x6000;
    _mm_setcsr( oldCsr | CHOP );
    constexpr uint64_t
    EXP = -(1ll << 52),
    MANT = ~EXP;
    uint64_t binY = bit_cast<uint64_t>( y );
    int64_t expY = binY & EXP;
    if( !expY ) [[unlikely]]
    expY = (uint64_t)(0 - (countl_zero( binY & MANT ) - 12)) << 52;
    while( x >= y )
    {
    uint64_t yExpAdd = 0;
    double div = x / y;
    if( div < 0x1.FFFFFFFFFFFFFp+1023 ) [[likely]]
    div = xtrunc( div );
    else
    {
    uint64_t
    binX = bit_cast<uint64_t>( x ),
    newExp = expY + (54ull << 52);
    yExpAdd = (binX & EXP) - newExp;
    div = xtrunc( bit_cast<double>( newExp | binX & MANT ) / y );
    }
    __m128d mult1, mult2, add;
    #if defined(_MSC_VER)
    mult1.m128d_f64[0] = div;
    mult2.m128d_f64[0] = -bit_cast<double>( binY + yExpAdd );
    add.m128d_f64[0] = x;
    x = _mm_fmadd_sd( mult1, mult2, add ).m128d_f64[0];
    #else
    mult1[0] = div;
    mult2[0] = -bit_cast<double>( binY + yExpAdd );
    add[0] = x;
    x = _mm_fmadd_sd( mult1, mult2, add )[0];
    #endif
    if( !x ) [[unlikely]]
    return bit_cast<double>( sign );
    }
    _mm_setcsr( oldCsr );
    return bit_cast<double>( sign | bit_cast<uint64_t>( x ) );
    }

    The only thing that doesn't work currently is the support for denormal values.


    I already said that I don't approve non-portable constructs like _mm_getcsr()/_mm_setcsr() except when they help important cases and
    help ALOT. Neither applies here.


    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Bonita Montero@Bonita.Montero@gmail.com to comp.lang.c++ on Sun Mar 9 19:10:23 2025
    From Newsgroup: comp.lang.c++

    Am 09.03.2025 um 18:55 schrieb Michael S:

    In absence of FMA hardware it is expected to be rather slow, but still
    has to produce correct results.

    Yes, extremely slow.

    It would be interesting to test if glibc is better in that regard.

    glibc uses integer-operations.

    I readily admitted that from practical perspective all 3 solutions
    that I posted here (one of each is incorrect) are pointless.

    I've recently posted a comparison of your solution against mine on
    a Zen4-CPU with MSVC. Your solution is equally performant with close
    exponents (0x3FF to 0x433) if I compile it unter WSL2 with g++-12.
    The problem with MSVC is that the trunc() function is extremely slow.
    In my implementation of your idea (x86 FMA) I use my own function
    xtrunc which makes the code twice as fast.

    Still, what you say about relative speed is true only on newer CPUs and
    only when compiled with MSVC. ...

    I haven't managed to write a 128 / 64 -> 64:64 division with Linux
    -compilers. __int128 doesn't work since the compiler doesn't see that
    the result fits in 64 bit and calls a library function for the division
    which does the subtract and shift steps manually.
    But your solution has about the same performance on Linux with g++12
    like my code with MSVC on a Zen4-CPU.
    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Bonita Montero@Bonita.Montero@gmail.com to comp.lang.c++ on Sun Mar 9 19:11:45 2025
    From Newsgroup: comp.lang.c++

    Am 09.03.2025 um 19:04 schrieb Michael S:

    I already said that I don't approve non-portable constructs like _mm_getcsr()/_mm_setcsr() except when they help important cases and
    help ALOT. Neither applies here.

    The solution is much simpler than your solution since there are no
    separate fast and slow paths. The performance is about the same for
    close exponents like your solution.

    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Bonita Montero@Bonita.Montero@gmail.com to comp.lang.c++ on Sun Mar 9 19:27:26 2025
    From Newsgroup: comp.lang.c++

    Am 09.03.2025 um 19:10 schrieb Bonita Montero:

    I've recently posted a comparison of your solution against mine on
    a Zen4-CPU with MSVC. Your solution is equally performant with close exponents (0x3FF to 0x433) if I compile it under WSL2 with g++-12.
    The problem with MSVC is that the trunc() function is extremely slow.
    In my implementation of your idea (x86 FMA) I use my own function
    xtrunc which makes the code twice as fast.

    This is even somewhat faster since I'm using the ROUNDSD instruction
    of SSE 4.1; but the difference to my xtrunc-solution is only 13%:

    double fmody( double x, double y )
    {
    if( isnan( x ) ) [[unlikely]]
    return x;
    if( isnan( y ) ) [[unlikely]]
    return y;
    if( isinf( x ) || !y ) [[unlikely]]
    {
    feraiseexcept( FE_INVALID );
    return numeric_limits<double>::quiet_NaN();
    }
    if( !x || isinf( y ) ) [[unlikely]]
    return x;
    uint64_t sign = bit_cast<uint64_t>( x ) & numeric_limits<int64_t>::min();
    x = abs( x );
    y = abs( y );
    int oldCsr = _mm_getcsr();
    constexpr int CHOP = 0x6000;
    _mm_setcsr( oldCsr | CHOP );
    constexpr uint64_t
    EXP = -(1ll << 52),
    MANT = ~EXP;
    uint64_t binY = bit_cast<uint64_t>( y );
    int64_t expY = binY & EXP;
    if( !expY ) [[unlikely]]
    expY = (uint64_t)(0 - (countl_zero( binY & MANT ) - 12)) << 52;
    while( x >= y )
    {
    auto floor = []( double value )
    {
    __m128d m;
    #if defined(_MSC_VER)
    m.m128d_f64[0] = value;
    _mm_floor_sd(m, m);
    return m.m128d_f64[0];
    #else
    m[0] = value;
    _mm_floor_sd(m, m);
    return m[0];
    #endif
    };
    uint64_t yExpAdd = 0;
    double div = x / y;
    if( div < 0x1.FFFFFFFFFFFFFp+1023 ) [[likely]]
    div = xtrunc( div );
    else
    {
    uint64_t
    binX = bit_cast<uint64_t>( x ),
    newExp = expY + (54ull << 52);
    yExpAdd = (binX & EXP) - newExp;
    div = xtrunc( bit_cast<double>( newExp | binX & MANT ) / y );
    }
    auto xfma = []( double a, double b, double c )
    {
    __m128d mult1, mult2, add;
    #if defined(_MSC_VER)
    mult1.m128d_f64[0] = a;
    mult2.m128d_f64[0] = b;
    add.m128d_f64[0] = c;
    return _mm_fmadd_sd( mult1, mult2, add ).m128d_f64[0]; #else
    mult1[0] = a;
    mult2[0] = b;
    add[0] = c;
    return _mm_fmadd_sd( mult1, mult2, add )[0];
    #endif
    };
    x = xfma( div, -bit_cast<double>( binY + yExpAdd ), x );
    if( !x ) [[unlikely]]
    return bit_cast<double>( sign );
    }
    _mm_setcsr( oldCsr );
    return bit_cast<double>( sign | bit_cast<uint64_t>( x ) );
    }
    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Bonita Montero@Bonita.Montero@gmail.com to comp.lang.c++ on Sun Mar 9 20:11:37 2025
    From Newsgroup: comp.lang.c++

    Am 09.03.2025 um 19:11 schrieb Bonita Montero:

    The solution is much simpler than your solution since there are no
    separate fast and slow paths. The performance is about the same for
    close exponents like your solution.

    And it's somewhat faster if I use the ROUNDSD-intrinsic than my
    solution before with the xtrunc-function. With MSVC's own trunc()
    code the performance is not competitive.


    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Bonita Montero@Bonita.Montero@gmail.com to comp.lang.c++ on Mon Mar 10 07:24:18 2025
    From Newsgroup: comp.lang.c++

    Am 09.03.2025 um 19:04 schrieb Michael S:

    I already said that I don't approve non-portable constructs like _mm_getcsr()/_mm_setcsr() except when they help important cases and
    help ALOT. Neither applies here.

    I dropped getting and setting the MXCSR register to set the rounding
    mode. Now I set the rounding mode directly with the intrinsics
    _mm_div_round_sd and _mm_fmadd_round_sd. Now this more manual
    code does help "ALOT", i.e. the solution is 2/3 faster than your
    initial solution with clang++-18 under Linux.
    But there's still a problem with the denormals.

    // fmodMplus(): fmod() via repeated FMA reduction x -= trunc(x/y) * y.
    // Unlike the earlier fmody() there is no global MXCSR manipulation: the
    // round-toward-zero mode is requested per instruction through
    // _mm_div_round_sd / _mm_fmadd_round_sd.  NOTE(review): the *_round_sd
    // intrinsics with an explicit rounding argument require AVX-512F —
    // confirm the intended target hardware.
    // Known limitation (author's note): denormal y is not handled correctly;
    // the compensation code below is commented out.
    // Depends on the file-local xtrunc() helper (a fast trunc()).
    double fmodMplus( double x, double y )
    {
    // NaNs propagate unchanged (no quieting, unlike myFmod()).
    if( isnan( x ) ) [[unlikely]]
    return x;
    if( isnan( y ) ) [[unlikely]]
    return y;
    // fmod(Inf, y) and fmod(x, 0) are domain errors.
    if( isinf( x ) || !y ) [[unlikely]]
    {
    feraiseexcept( FE_INVALID );
    return numeric_limits<double>::quiet_NaN();
    }
    // fmod(0, y) == 0 and fmod(x, Inf) == x, both exact.
    if( !x || isinf( y ) ) [[unlikely]]
    return x;
    // The result carries x's sign; reduce |x| by |y| from here on.
    uint64_t sign = bit_cast<uint64_t>( x ) & numeric_limits<int64_t>::min();
    x = abs( x );
    y = abs( y );
    constexpr uint64_t
    EXP = -(1ll << 52),
    MANT = ~EXP;
    uint64_t binY = bit_cast<uint64_t>( y );
    int64_t expY = binY & EXP;
    // Denormal-y compensation, currently disabled (see limitation above);
    // with it off, expY stays 0 for denormal y.
    //if( !expY ) [[unlikely]]
    //expY = (uint64_t)(0 - (countl_zero( binY & MANT ) - 12)) << 52;
    while( x >= y )
    {
    uint64_t yExpAdd = 0;
    __m128d a128, b128;
    // x / y rounded toward zero, so the truncated quotient never
    // overshoots and div * y stays <= x.
    #if defined(_MSC_VER)
    a128.m128d_f64[0] = x;
    b128.m128d_f64[0] = y;
    double div = _mm_div_round_sd( a128, b128, _MM_FROUND_TO_ZERO |
    _MM_FROUND_NO_EXC ).m128d_f64[0];
    #else
    a128[0] = x;
    b128[0] = y;
    double div = _mm_div_round_sd( a128, b128, _MM_FROUND_TO_ZERO |
    _MM_FROUND_NO_EXC )[0];
    #endif
    if( div < 0x1.FFFFFFFFFFFFFp+1023 ) [[likely]]
    div = xtrunc( div );
    else
    {
    // Quotient would overflow a double: replace x's exponent by
    // expY + 54 so the partial quotient fits; yExpAdd records the
    // removed scale and is added back onto y below.
    uint64_t
    binX = bit_cast<uint64_t>( x ),
    newExp = expY + (54ull << 52);
    yExpAdd = (binX & EXP) - newExp;
    div = xtrunc( bit_cast<double>( newExp | binX & MANT ) / y );
    }
    // x = fma(div, -y_scaled, x): the fused multiply-add performs a
    // single rounding, again toward zero.
    x = []( double a, double b, double c )
    {
    __m128d mult1, mult2, add;
    #if defined(_MSC_VER)
    mult1.m128d_f64[0] = a;
    mult2.m128d_f64[0] = b;
    add.m128d_f64[0] = c;
    return _mm_fmadd_round_sd( mult1, mult2, add, _MM_FROUND_TO_ZERO |
    _MM_FROUND_NO_EXC ).m128d_f64[0];
    #else
    mult1[0] = a;
    mult2[0] = b;
    add[0] = c;
    return _mm_fmadd_round_sd( mult1, mult2, add, _MM_FROUND_TO_ZERO |
    _MM_FROUND_NO_EXC )[0];
    #endif
    }( div, -bit_cast<double>( binY + yExpAdd ), x );
    // y divided x exactly: the result is a signed zero.
    if( !x ) [[unlikely]]
    return bit_cast<double>( sign );
    }
    return bit_cast<double>( sign | bit_cast<uint64_t>( x ) );
    }
    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Bonita Montero@Bonita.Montero@gmail.com to comp.lang.c++ on Mon Mar 10 10:46:59 2025
    From Newsgroup: comp.lang.c++

    I tried to use a unsigned __int128 / uint64_t division and I expected
    that the compiler calls a library function which does the subtract and
    shift manually. But g++ as well as clang++ handle this as a 128 : 64
    64#64 division somehow.
    And now my original solution is about 23% faster than your solution
    for close exponents (exponent difference <= 53) and for arbitrary
    exponent differences your solution is about 3% faster.

    // fmodO: bit-level fmod() for IEEE-754 binary64 values.
    // Special cases (NaN, Inf, zero) are resolved directly on the raw bit
    // patterns; the #if defined(_MSC_VER) branches reproduce MSVC's NaN
    // results, the #else branches glibc's (including FE_INVALID for
    // signaling NaNs).  The remainder itself is computed on the integer
    // mantissas, consuming up to 63 bits of exponent difference per
    // iteration via a 128:64 modulo (or up to headBits per step in the
    // portable fallback path).
    double fmodO( double x, double y )
    {
    constexpr uint64_t
    SIGN = 1ull << 63,     // sign bit
    IMPLICIT = 1ull << 52, // implicit leading mantissa bit
    MANT = IMPLICIT - 1,   // the 52 stored mantissa bits
    QBIT = 1ull << 51;     // top mantissa bit: the quiet-NaN flag
    uint64_t const
    binX = bit_cast<uint64_t>( x ),
    binY = bit_cast<uint64_t>( y );
    // 'abs' is static so the non-capturing lambdas below (isNaN, isInf)
    // may name it without capturing it.
    static auto abs = []( uint64_t m ) { return m & ~SIGN; };
    auto isNaN = []( uint64_t m ) { return abs( m ) >= 0x7FF0000000000001u; };
    // A signaling NaN has the quiet bit (QBIT) clear.
    auto isSig = []( uint64_t m ) { return !(m & QBIT); };
    if( isNaN( binX ) ) [[unlikely]] // x == NaN
    #if defined(_MSC_VER)
    return bit_cast<double>( isNaN( binY ) ? binY | binX & binY & QBIT :
    binX );
    #else
    {
    // glibc behavior: signaling NaN input raises FE_INVALID; the
    // returned NaN is quieted.
    if( isSig( binX ) || isNaN( binY ) && isSig( binY ) ) [[unlikely]]
    feraiseexcept( FE_INVALID );
    return bit_cast<double>( binX | QBIT );
    }
    #endif
    if( isNaN( binY ) ) [[unlikely]] // x != NaN, y == NaN
    #if defined(_MSC_VER)
    return y;
    #else
    {
    if( isSig( binY ) ) [[unlikely]]
    feraiseexcept( FE_INVALID );
    return bit_cast<double>( binY | QBIT );
    }
    #endif
    auto isInf = []( uint64_t m ) { return abs( m ) == 0x7FF0000000000000u; };
    // fmod( +-Inf, y ) is a domain error.
    if( isInf( binX ) ) // x == Inf
    {
    feraiseexcept( FE_INVALID );
    #if defined(_MSC_VER)
    return bit_cast<double>( binX & ~MANT | QBIT );
    #else
    return -numeric_limits<double>::quiet_NaN();
    #endif
    }
    // fmod( x, 0 ) is a domain error as well.
    if( !abs( binY ) ) [[unlikely]] // y == 0
    {
    feraiseexcept( FE_INVALID );
    #if defined(_MSC_VER)
    return numeric_limits<double>::quiet_NaN();
    #else
    return -numeric_limits<double>::quiet_NaN();
    #endif
    }
    if( !abs( binX ) || isInf( binY ) ) [[unlikely]] // x == 0 || y == Inf
    return x;
    // Split both operands into biased exponent and stored mantissa.
    auto exp = []( uint64_t b ) -> int { return b >> 52 & 0x7FF; };
    int
    expX = exp( binX ),
    expY = exp( binY );
    auto mant = []( uint64_t b ) { return b & MANT; };
    uint64_t
    mantX = mant( binX ),
    mantY = mant( binY );
    // A mantissa with the implicit bit at position 52 has 11 leading
    // zero bits; headBits grows by tailBits once trailing zeros are
    // stripped below.
    int headBits = 11;
    // Shift 'mant' left until it has exactly headBits leading zeros,
    // compensating in 'exp'.
    // NOTE(review): a *static* lambda capturing headBits by reference
    // binds the stack slot of the first call only; on subsequent calls
    // the captured reference is formally dangling.  It should not be
    // static (it is static so the non-capturing 'build' below can name it).
    static auto normalize = [&]( int &exp, uint64_t &mant )
    {
    unsigned shift = countl_zero( mant ) - headBits;
    mant <<= shift;
    exp -= shift;
    };
    // Produce (exponent, full mantissa): set the implicit bit for normal
    // numbers; normalize subnormal inputs.
    auto build = []( int &exp, uint64_t &mant )
    {
    if( exp ) [[likely]]
    mant |= IMPLICIT;
    else
    {
    exp = 1;
    normalize( exp, mant );
    }
    };
    build( expX, mantX );
    build( expY, mantY );
    // Strip trailing zero bits common to both mantissas; they carry no
    // information for the remainder and widen the per-step shift budget
    // of the portable fallback loop.  Restored before assembling the result.
    int
    tailX = countr_zero( mantX ),
    tailY = countr_zero( mantY ),
    tailBits = tailX <= tailY ? tailX : tailY;
    mantX >>= tailBits;
    mantY >>= tailBits;
    headBits += tailBits;
    uint64_t signX = binX & SIGN;
    int expDiff;
    #if defined(_MSC_VER) && !defined(__llvm__) && defined(_M_X64)
    // MSVC/x64: 128:64 remainder via the _udiv128 intrinsic.
    while( (expDiff = expX - expY) > 0 )
    {
    unsigned bits = expDiff <= 63 ? expDiff : 63;
    unsigned long long hi = mantX >> 64 - bits, lo = mantX << bits, remainder;
    (void)_udiv128( hi, lo, mantY, &remainder );
    if( !remainder ) [[unlikely]] // exact multiple: result is +-0
    return bit_cast<double>( signX );
    mantX = remainder;
    expX -= bits;
    normalize( expX, mantX );
    }
    #elif defined(__GNUC__) || defined(__clang__)
    // gcc/clang: 128:64 remainder through unsigned __int128.
    while( (expDiff = expX - expY) > 0 )
    {
    unsigned bits = expDiff <= 63 ? expDiff : 63;
    unsigned __int128 dividend = (unsigned __int128)mantX << bits;
    mantX = (uint64_t)(dividend % mantY);
    if( !mantX ) [[unlikely]] // exact multiple: result is +-0
    return bit_cast<double>( signX );
    expX -= bits;
    normalize( expX, mantX );
    }
    #else
    // Portable fallback: 64-bit modulo, at most headBits bits per step.
    while( (expDiff = expX - expY) > 0 )
    {
    unsigned bits = expDiff <= headBits ? expDiff : headBits;
    if( !(mantX = (mantX << bits) % mantY) ) [[unlikely]]
    return bit_cast<double>( signX );
    expX -= bits;
    normalize( expX, mantX );
    }
    #endif
    // Equal exponents: at most one final subtraction may still be needed.
    if( !expDiff && mantX >= mantY ) [[unlikely]]
    if( (mantX -= mantY) ) [[likely]]
    normalize( expX, mantX );
    else
    return bit_cast<double>( signX );
    // Re-attach the trailing zero bits stripped above.
    mantX <<= tailBits;
    mantY <<= tailBits;
    // Subnormal result: shift the mantissa into stored (denormal) position.
    if( expX <= 0 ) [[unlikely]]
    {
    assert(expX >= -51);
    mantX = mantX >> (unsigned)(-expX + 1);
    expX = 0;
    }
    return bit_cast<double>( signX | (uint64_t)expX << 52 | mantX & MANT ); }
    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Michael S@already5chosen@yahoo.com to comp.lang.c++ on Mon Mar 10 13:17:27 2025
    From Newsgroup: comp.lang.c++

    On Mon, 10 Mar 2025 07:24:18 +0100
    Bonita Montero <Bonita.Montero@gmail.com> wrote:

    Am 09.03.2025 um 19:04 schrieb Michael S:

    I already said that I don't approve non-portable constructs like _mm_getcsr()/_mm_setcsr() except when they help important cases and
    help ALOT. Neither applies here.

    I dropped getting and setting the MSCSR-register to set the roun-
    ding mode. Now I set the rounding mode directly with the intrinsics _mm_div_round_sd and and _mm_fmadd_round_sd. Now this more manual
    code does help "ALOT", i.e. the solution is 2/3 faster than your
    initial solution with clang++-18 under Linux.

    I can't check your claim about speed, because the code does not compile. Compiler has no idea WTF is xtrunc. But considering that so far all
    your claims about speed were false, I can safely assume that this one
    is false as well.

    But there's still a problem with the denormals.


    On y or on x?
    Subnormal values of x can be nicely handled by quick path with 0
    additional characters of code.

    BTW, for last 20-25 years IEEE-754 prefers to call binary floating
    point numbers in range (0:DBL_MIN) 'subnormal' rather than 'denormal'.
    I'd guess that it is because the term 'denormal' has wider meaning.



    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Bonita Montero@Bonita.Montero@gmail.com to comp.lang.c++ on Mon Mar 10 12:53:25 2025
    From Newsgroup: comp.lang.c++

    Am 10.03.2025 um 12:17 schrieb Michael S:

    I can't check your claim about speed, because the code does not compile. Compiler has no idea WTF is xtrunc. But considering that so far all
    your claims about speed were false, I can safely assume that this one
    is false as well.

    Well, if that makes my statements easier for you.

    On y or on x?

    I haven't checked that. But I've dropped the SSE-FMA-solution because
    with unsigned __int128 / uint64_t divisions my initial solution is even
    faster, nearly one quarter faster than yours with clang++-18 under WSL2
    for close exponents. With nearly arbitrary exponent combinations your
    solution is slightly faster (3%).

    BTW, for last 20-25 years IEEE-754 prefers to call binary floating
    point numbers in range (0:DBL_MIN) 'subnormal' rather than 'denormal'.
    I'd guess that it is because the term 'denormal' has wider meaning.

    I don't insist so compulsively on standardized terms.
    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Bonita Montero@Bonita.Montero@gmail.com to comp.lang.c++ on Mon Mar 10 12:54:52 2025
    From Newsgroup: comp.lang.c++

    Am 10.03.2025 um 12:17 schrieb Michael S:

    I can't check your claim about speed, because the code does not compile.

    Test my latest code of my initial idea parallel to this posting in this
    thread.
    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Michael S@already5chosen@yahoo.com to comp.lang.c++ on Mon Mar 10 14:26:48 2025
    From Newsgroup: comp.lang.c++

    On Mon, 10 Mar 2025 10:46:59 +0100
    Bonita Montero <Bonita.Montero@gmail.com> wrote:

    I tried to use a unsigned __int128 / uint64_t division and I expected
    that the compiler calls a library function which does the subtract and
    shift manually. But g++ as well as clang++ handle this as a 128 : 64
    64#64 division somehow.


    Somehow?
    It should be an obvious thing to anybody who cared to think for 15
    seconds.
    A library-or-compiler starts with hi1=rem(0:x_hi, y). Now hi1 is
    guaranteed to be smaller than y, so it's safe to do rem(hi1:x_lo, y).
    It is not *very* slow, but still there are 2 dependent division
    operations.
    So, on machines with slow integer division, like Skylake, it is 1.5-1.7
    times slower than a single long division.
    On machines with fast long division, like Zen3/4/5 or "performance"
    cores on newer Intel CPUs, it is approximately twice slower than a
    single long division. Plus call overhead, so 2.5x or so slower overall.

    And now my original solution is about 23% faster than your solution
    for close exponents (exponent difference <= 53) and for arbitrary
    exponent differences your solution is about 3% faster.


    That's absolutely not what I see.
    Here are my measurements:

    Measurements in nsec.
    First result - Intel Skylake at 4.25 GHz
    Second result - AMD Zen3 at 3.7 GHz

    abs(x/y) in range that matters [0.5:2**53]:
    Standard MSVC Library - 11.1 10.4
    Standard gnu Library - 5.4 10.7
    Yours (MSVc) - 27.6 11.5
    Yours (gcc) - 36.4 23.7
    Yours (clang) - 37.4 24.3
    my (MSVc) - 10.7 11.3
    my (gcc) - 7.7 7.6
    my (clang) - 6.3 7.5
    Your last (MSVc) - 27.6 11.6
    Your last (gcc) - 33.8 28.6
    Your last (clang) - 32.6 26.9

    abs(x/y) in full range [2**-2090:2**2090]:
    Standard MSVC Library - 109.4 153.5
    Standard glib Library - 102.3 155.5
    Yours (MSVc) - 134.9 52.6
    Yours (gcc) - 284.7 151.8
    Yours (clang) - 285.2 156.5
    my (MSVc) - 62.1 61.1
    my (gcc) - 60.8 59.1
    my (clang) - 59.9 59.3
    Your last (MSVc) - 135.0 52.5
    Your last (gcc) - 172.1 137.3
    Your last (clang) - 167.7 126.3


    It looks like you are not using version that I posted at 2025-03-03
    as a reference.
    The only case where you code is running at approximately the same speed
    as my code is MSVC-generated code on CPUs with fast integer division.
    And it was like that since your previous version. I see no changes in
    that regard.

    // fmodO: bit-level fmod() for IEEE-754 binary64 values.
    // (This quotation was corrupted by 72-column line wrapping -- code was
    // fused onto '//' comment lines and declarations were merged; restored
    // here from the uncorrupted posting of the same function.)
    // Special cases (NaN, Inf, zero) are resolved directly on the raw bit
    // patterns; the #if defined(_MSC_VER) branches reproduce MSVC's NaN
    // results, the #else branches glibc's (including FE_INVALID for
    // signaling NaNs).  The remainder itself is computed on the integer
    // mantissas, consuming up to 63 bits of exponent difference per
    // iteration via a 128:64 modulo (or up to headBits per step in the
    // portable fallback path).
    double fmodO( double x, double y )
    {
    constexpr uint64_t
    SIGN = 1ull << 63,     // sign bit
    IMPLICIT = 1ull << 52, // implicit leading mantissa bit
    MANT = IMPLICIT - 1,   // the 52 stored mantissa bits
    QBIT = 1ull << 51;     // top mantissa bit: the quiet-NaN flag
    uint64_t const
    binX = bit_cast<uint64_t>( x ),
    binY = bit_cast<uint64_t>( y );
    // 'abs' is static so the non-capturing lambdas below (isNaN, isInf)
    // may name it without capturing it.
    static auto abs = []( uint64_t m ) { return m & ~SIGN; };
    auto isNaN = []( uint64_t m ) { return abs( m ) >= 0x7FF0000000000001u; };
    // A signaling NaN has the quiet bit (QBIT) clear.
    auto isSig = []( uint64_t m ) { return !(m & QBIT); };
    if( isNaN( binX ) ) [[unlikely]] // x == NaN
    #if defined(_MSC_VER)
    return bit_cast<double>( isNaN( binY ) ? binY | binX & binY & QBIT :
    binX );
    #else
    {
    // glibc behavior: signaling NaN input raises FE_INVALID; the
    // returned NaN is quieted.
    if( isSig( binX ) || isNaN( binY ) && isSig( binY ) ) [[unlikely]]
    feraiseexcept( FE_INVALID );
    return bit_cast<double>( binX | QBIT );
    }
    #endif
    if( isNaN( binY ) ) [[unlikely]] // x != NaN, y == NaN
    #if defined(_MSC_VER)
    return y;
    #else
    {
    if( isSig( binY ) ) [[unlikely]]
    feraiseexcept( FE_INVALID );
    return bit_cast<double>( binY | QBIT );
    }
    #endif
    auto isInf = []( uint64_t m ) { return abs( m ) == 0x7FF0000000000000u; };
    // fmod( +-Inf, y ) is a domain error.
    if( isInf( binX ) ) // x == Inf
    {
    feraiseexcept( FE_INVALID );
    #if defined(_MSC_VER)
    return bit_cast<double>( binX & ~MANT | QBIT );
    #else
    return -numeric_limits<double>::quiet_NaN();
    #endif
    }
    // fmod( x, 0 ) is a domain error as well.
    if( !abs( binY ) ) [[unlikely]] // y == 0
    {
    feraiseexcept( FE_INVALID );
    #if defined(_MSC_VER)
    return numeric_limits<double>::quiet_NaN();
    #else
    return -numeric_limits<double>::quiet_NaN();
    #endif
    }
    if( !abs( binX ) || isInf( binY ) ) [[unlikely]] // x == 0 || y == Inf
    return x;
    // Split both operands into biased exponent and stored mantissa.
    auto exp = []( uint64_t b ) -> int { return b >> 52 & 0x7FF; };
    int
    expX = exp( binX ),
    expY = exp( binY );
    auto mant = []( uint64_t b ) { return b & MANT; };
    uint64_t
    mantX = mant( binX ),
    mantY = mant( binY );
    // A mantissa with the implicit bit at position 52 has 11 leading
    // zero bits; headBits grows by tailBits once trailing zeros are
    // stripped below.
    int headBits = 11;
    // Shift 'mant' left until it has exactly headBits leading zeros,
    // compensating in 'exp'.
    // NOTE(review): a *static* lambda capturing headBits by reference
    // binds the stack slot of the first call only; on subsequent calls
    // the captured reference is formally dangling.  It should not be
    // static (it is static so the non-capturing 'build' below can name it).
    static auto normalize = [&]( int &exp, uint64_t &mant )
    {
    unsigned shift = countl_zero( mant ) - headBits;
    mant <<= shift;
    exp -= shift;
    };
    // Produce (exponent, full mantissa): set the implicit bit for normal
    // numbers; normalize subnormal inputs.
    auto build = []( int &exp, uint64_t &mant )
    {
    if( exp ) [[likely]]
    mant |= IMPLICIT;
    else
    {
    exp = 1;
    normalize( exp, mant );
    }
    };
    build( expX, mantX );
    build( expY, mantY );
    // Strip trailing zero bits common to both mantissas; they carry no
    // information for the remainder and widen the per-step shift budget
    // of the portable fallback loop.  Restored before assembling the result.
    int
    tailX = countr_zero( mantX ),
    tailY = countr_zero( mantY ),
    tailBits = tailX <= tailY ? tailX : tailY;
    mantX >>= tailBits;
    mantY >>= tailBits;
    headBits += tailBits;
    uint64_t signX = binX & SIGN;
    int expDiff;
    #if defined(_MSC_VER) && !defined(__llvm__) && defined(_M_X64)
    // MSVC/x64: 128:64 remainder via the _udiv128 intrinsic.
    while( (expDiff = expX - expY) > 0 )
    {
    unsigned bits = expDiff <= 63 ? expDiff : 63;
    unsigned long long hi = mantX >> 64 - bits, lo = mantX << bits, remainder;
    (void)_udiv128( hi, lo, mantY, &remainder );
    if( !remainder ) [[unlikely]] // exact multiple: result is +-0
    return bit_cast<double>( signX );
    mantX = remainder;
    expX -= bits;
    normalize( expX, mantX );
    }
    #elif defined(__GNUC__) || defined(__clang__)
    // gcc/clang: 128:64 remainder through unsigned __int128.
    while( (expDiff = expX - expY) > 0 )
    {
    unsigned bits = expDiff <= 63 ? expDiff : 63;
    unsigned __int128 dividend = (unsigned __int128)mantX << bits;
    mantX = (uint64_t)(dividend % mantY);
    if( !mantX ) [[unlikely]] // exact multiple: result is +-0
    return bit_cast<double>( signX );
    expX -= bits;
    normalize( expX, mantX );
    }
    #else
    // Portable fallback: 64-bit modulo, at most headBits bits per step.
    while( (expDiff = expX - expY) > 0 )
    {
    unsigned bits = expDiff <= headBits ? expDiff : headBits;
    if( !(mantX = (mantX << bits) % mantY) ) [[unlikely]]
    return bit_cast<double>( signX );
    expX -= bits;
    normalize( expX, mantX );
    }
    #endif
    // Equal exponents: at most one final subtraction may still be needed.
    if( !expDiff && mantX >= mantY ) [[unlikely]]
    if( (mantX -= mantY) ) [[likely]]
    normalize( expX, mantX );
    else
    return bit_cast<double>( signX );
    // Re-attach the trailing zero bits stripped above.
    mantX <<= tailBits;
    mantY <<= tailBits;
    // Subnormal result: shift the mantissa into stored (denormal) position.
    if( expX <= 0 ) [[unlikely]]
    {
    assert(expX >= -51);
    mantX = mantX >> (unsigned)(-expX + 1);
    expX = 0;
    }
    return bit_cast<double>( signX | (uint64_t)expX << 52 | mantX & MANT );
    }


    You code would be much, much cleaner and more readable if you replace
    lambdas with proper functions. Or just write simple expressions like 'x
    & MANT' in place.




    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Bonita Montero@Bonita.Montero@gmail.com to comp.lang.c++ on Mon Mar 10 13:45:26 2025
    From Newsgroup: comp.lang.c++

    Am 10.03.2025 um 13:26 schrieb Michael S:

    It should be an obvious thing to anybody who cared to think for 15
    seconds.

    I was wrong and it absolutely isn't obvious. The compiler calls the
    glibc function __umodti3 of glibc which has a shortcut for results
    which fit into 64 bit. Although there's an additional call on Linux
    the code with clang++-18 is still a bit faster than my Windows solution
    with the _udiv128-intrinsic; that's really surprising.

    A library-or-compiler starts with hi1=rem(0:x_hi, y). Now hi1 is
    guaranteed to be smaller than y, so it's safe to do rem(hi1:x_lo, y).
    It is not *very* slow, but still there are 2 dependent division
    operations.

    Both parameters are variable so there could be no static evaluation
    at compile time.

    Measurements in nsec.
    First result - Intel Skylake at 4.25 GHz
    Second result - AMD Zen3 at 3.7 GHz

    abs(x/y) in range that matters [0.5:2**53]:
    Standard MSVC Library - 11.1 10.4
    Standard gnu Library - 5.4 10.7
    Yours (MSVc) - 27.6 11.5
    Yours (gcc) - 36.4 23.7
    Yours (clang) - 37.4 24.3
    my (MSVc) - 10.7 11.3
    my (gcc) - 7.7 7.6
    my (clang) - 6.3 7.5
    Your last (MSVc) - 27.6 11.6
    Your last (gcc) - 33.8 28.6
    Your last (clang) - 32.6 26.9

    abs(x/y) in full range [2**-2090:2**2090]:
    Standard MSVC Library - 109.4 153.5
    Standard glib Library - 102.3 155.5
    Yours (MSVc) - 134.9 52.6
    Yours (gcc) - 284.7 151.8
    Yours (clang) - 285.2 156.5
    my (MSVc) - 62.1 61.1
    my (gcc) - 60.8 59.1
    my (clang) - 59.9 59.3
    Your last (MSVc) - 135.0 52.5
    Your last (gcc) - 172.1 137.3
    Your last (clang) - 167.7 126.3

    This are the clang++-18 results on my Zen4-computer under WSL2 with
    close exponents (0x3ff to 0x433):

    fmodO: 9.42929
    fmodM: 11.7907

    So my code is about 23% faster on my computer.

    This are the results for arbitrary exponents:

    fmodO: 41.9115
    fmodM: 41.2062

    Exactly what I already mentioned.

    Maybe that depends on the glibc-version because a different glibc
    version might have different efficient __umodti3 functions.

    You code would be much, much cleaner and more readable if you replace
    lambdas with proper functions. ..

    For me that doesn't make a difference.
    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Michael S@already5chosen@yahoo.com to comp.lang.c++ on Mon Mar 10 16:33:04 2025
    From Newsgroup: comp.lang.c++

    On Mon, 10 Mar 2025 13:45:26 +0100
    Bonita Montero <Bonita.Montero@gmail.com> wrote:

    Am 10.03.2025 um 13:26 schrieb Michael S:

    It should be an obvious thing to anybody who cared to think for 15
    seconds.

    I was wrong and it absolutely isn't obvious. The compiler calls the
    glibc function __umodti3 of glibc which has a shortcut for results
    which fit into 64 bit. Although there's an additional call on Linux
    the code with clang++-18 is still a bit faster than my Windows
    solution with the _udiv128-intrinsic; that's really surprising.

    A library-or-compiler starts with hi1=rem(0:x_hi, y). Now hi1 is
    guaranteed to be smaller than y, so it's safe to do rem(hi1:x_lo,
    y). It is not *very* slow, but still there are 2 dependent division operations.

    Both parameters are variable so there could be no static evaluation
    at compile tiime.


    https://github.com/llvm/llvm-project/blob/main/compiler-rt/lib/builtins/udivmodti4.c
    On line 114 they specialize the case of 64-bit divisor.
    On line 116 the further specialize our specific case of x.hi < y.
    So, at the end they use the same single division instruction as MSVC.
    The only difference is an overhead of a call and of two very predictable
    branches.

    Measurements in nsec.
    First result - Intel Skylake at 4.25 GHz
    Second result - AMD Zen3 at 3.7 GHz

    abs(x/y) in range that matters [0.5:2**53]:
    Standard MSVC Library - 11.1 10.4
    Standard gnu Library - 5.4 10.7
    Yours (MSVc) - 27.6 11.5
    Yours (gcc) - 36.4 23.7
    Yours (clang) - 37.4 24.3
    my (MSVc) - 10.7 11.3
    my (gcc) - 7.7 7.6
    my (clang) - 6.3 7.5
    Your last (MSVc) - 27.6 11.6
    Your last (gcc) - 33.8 28.6
    Your last (clang) - 32.6 26.9

    abs(x/y) in full range [2**-2090:2**2090]:
    Standard MSVC Library - 109.4 153.5
    Standard glib Library - 102.3 155.5
    Yours (MSVc) - 134.9 52.6
    Yours (gcc) - 284.7 151.8
    Yours (clang) - 285.2 156.5
    my (MSVc) - 62.1 61.1
    my (gcc) - 60.8 59.1
    my (clang) - 59.9 59.3
    Your last (MSVc) - 135.0 52.5
    Your last (gcc) - 172.1 137.3
    Your last (clang) - 167.7 126.3

    This are the clang++-18 results on my Zen4-computer under WSL2 with
    close exponents (0x3ff to 0x433):

    fmodO: 9.42929
    fmodM: 11.7907

    For my code it's strangely slow. On 4+ GHz Zen4 I would expect ~5 nsec.
    Are you sure that you took my code from 2025-03-03 as is, compiled it
    as separate file C file (not C++), without touching it?


    So my code is about 23% faster on my computer.

    This are the results for arbitrary exponents:

    fmodO: 41.9115
    fmodM: 41.2062

    Good job by LLVM. Unfortunately, on msys2 clang appears to use Gnu implementation of compiler support library instead of their own. Right
    now gnu is not as smart. Hopefully, they will catch up soon.


    Exactly what I already mentioned.

    Maybe that depends on the glibc-version because a different glibc
    version might have different efficient __umodti3 functions.


    Compiler supports functions like udivmodti4 are not part of glibc.
    They reside in separate library. In case of gcc it is called libgcc or libgcc_s.
    My educated guess is that on Linux clang does not use libgcc.

    You code would be much, much cleaner and more readable if you
    replace lambdas with proper functions. ..

    For me that doesn't make a difference.

    But it makes difference for your potential readers.


    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Bonita Montero@Bonita.Montero@gmail.com to comp.lang.c++ on Mon Mar 10 15:46:32 2025
    From Newsgroup: comp.lang.c++

    Am 10.03.2025 um 15:33 schrieb Michael S:

    https://github.com/llvm/llvm-project/blob/main/compiler-rt/lib/builtins/udivmodti4.c
    On line 114 they specialize the case of 64-bit divisor.
    On line 116 the further specialize our specific case of x.hi < y.
    So, at the end they use the same single division instruction as MSVC.
    The only difference is an overhead of a call and of two very predictable branches.

    That's what I also guessed, but maybe we've not the same glibc-version.
    Or the code runs just more efficiently on my Zen4-CPU.

    For my code it's strangely slow. On 4+ GHz Zen4 I would expect ~5 nsec.

    I want your crystal ball.

    But it makes difference for your potential readers.

    If I have an exteral functions whose innards have to be known by the
    one who uses that function you could simply place it near the place
    where it is called.

    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Michael S@already5chosen@yahoo.com to comp.lang.c++ on Mon Mar 10 16:59:25 2025
    From Newsgroup: comp.lang.c++

    On Mon, 10 Mar 2025 15:46:32 +0100
    Bonita Montero <Bonita.Montero@gmail.com> wrote:

    Am 10.03.2025 um 15:33 schrieb Michael S:


    That's what I also guessed, but maybe we've not the same
    glibc-version. Or the code runs just more efficiently on my Zen4-CPU-

    For my code it's strangely slow. On 4+ GHz Zen4 I would expect ~5
    nsec.

    I want your crystal ball.


    One does not need crystal ball to extrapolate speed of simple integer
    code from 3.7 GHz Zen3 to 4+ GHz Zen4 (probably 4.7 or 4.8 GHz).
    But your repetitive avoidance of answering my direct questions about
    what exactly you are using as "my code" makes me even more suspicious.


    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Bonita Montero@Bonita.Montero@gmail.com to comp.lang.c++ on Mon Mar 10 16:11:39 2025
    From Newsgroup: comp.lang.c++

    Am 10.03.2025 um 15:59 schrieb Michael S:

    One does not need crystal ball to extrapolate speed of simple integer
    code from 3.7 GHz Zen3 to 4+ GHz Zen4 (probably 4.7 or 4.8 GHz).

    If one core only computes the clock is even 5.7GHz.
    But the results aren't better than shown.

    But your repetitive avoidance of answering my direct questions about
    what exactly you are using as "my code" makes me even more suspicious.

    I've shown the latest code of fmodO; you can easily integrate it into
    your own benchmark. I don't use uniform_real_distribution for the
    random numbers but uniform_int_distribution with bounds of 0x3FFull
    << 52 and 0x433ull << 52 for the close exponent case. The whole test
    code nearly hasn't changed over my initial post.

    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Michael S@already5chosen@yahoo.com to comp.lang.c++ on Mon Mar 10 17:43:05 2025
    From Newsgroup: comp.lang.c++

    On Mon, 10 Mar 2025 16:11:39 +0100
    Bonita Montero <Bonita.Montero@gmail.com> wrote:

    Am 10.03.2025 um 15:59 schrieb Michael S:

    One does not need crystal ball to extrapolate speed of simple
    integer code from 3.7 GHz Zen3 to 4+ GHz Zen4 (probably 4.7 or 4.8
    GHz).

    If one core only computes the clock is even 5.7GHz.
    But the results aren't better than shown.

    But your repetitive avoidance of answering my direct questions about
    what exactly you are using as "my code" makes me even more
    suspicious.

    I've shown the latest code of fmodO;

    That's not the question I am asking for 4th or 5th time.
    My question is what *exactly* is fmodM.

    you can easily inegrate it into
    your own benchmark.

    I did. And presented the results. I am fully willing to believe that
    the difference between our clang results explained by difference in
    compiler support library.
    But so far I find no explanation for why results for what you claim to
    be *my* code are so much slower in your measurements, despite your
    faster CPU.
    BTW, the number you did not publish at all was the speed of fmod()
    routine from standard library. My estimation is that on CPU like
    yours for close exponent range it should be around 7-8 nsec, both for
    msys2 and for MSVC. I have no idea about glibc on Linux.

    I don't use unfiform_real_distrubution for the
    random numbers but uniform_int_distribution with bounds of 0x3FFull
    << 52 and 0x433ull << 52 for the close exponent case. The whole test
    code nearly hasn't changed over my initial post.


    Assuming that the exponent of y is fixed at 1023 that is approximately
    the same as my own test.



    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Bonita Montero@Bonita.Montero@gmail.com to comp.lang.c++ on Mon Mar 10 17:13:31 2025
    From Newsgroup: comp.lang.c++

    Am 10.03.2025 um 16:43 schrieb Michael S:

    That's not the question I am asking for 4th or 5th time.
    My question is what *exactly* is fmodM.

    fmodM is your code, M like Michael.

    I did. And presented the results. I am fully willing to believe that
    the difference between our clang results explained by difference in
    compiler support library.

    Or maybe the different CPU.

    But so far I find no explanation for why results for what you claim to
    be *my* code are so much slower in your measurements, despite your
    faster CPU.

    I compiled with -O2 and march=native, that should be sufficient.

    BTW, the number you did not publish at all was the speed of fmod()
    routine from standard library. ...

    I've the fmod code of glibc 2.31, which is rather slow since it
    does the subtract and shifts manually - code from Sun of the 90s.

    Assuming that the exponent of y is fixed at 1023 that is approximately
    the same as my own test.

    Yes, but as you said earlier close exponents are more relevant.
    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Michael S@already5chosen@yahoo.com to comp.lang.c++ on Mon Mar 10 19:51:28 2025
    From Newsgroup: comp.lang.c++

    On Mon, 10 Mar 2025 17:13:31 +0100
    Bonita Montero <Bonita.Montero@gmail.com> wrote:

    Am 10.03.2025 um 16:43 schrieb Michael S:

    That's not the question I am asking for 4th or 5th time.
    My question is what *exactly* is fmodM.

    fmodM is your code, M like Michael.


    What my code?
    Post it.


    I did. And presented the results. I am fully willing to believe that
    the difference between our clang results explained by difference in compiler support library.

    Or maybe the different CPU.

    But so far I find no explanation for why results for what you claim
    to be *my* code are so much slower in your measurements, despite
    your faster CPU.

    I compiled with -O2 and march=native, that should be sufficient.

    BTW, the number you did not publish at all was the speed of fmod()
    routine from standard library. ...

    I've the fmod code of glibc 2.31, which is rather slow since it
    does the subtract and shifts manually - code from Sun of the 90s.

    Assuming that the exponent of y is fixed at 1023 that is
    approximately the same as my own test.

    Yes, but as you said earlier close exponents are more relevant.


    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Bonita Montero@Bonita.Montero@gmail.com to comp.lang.c++ on Mon Mar 10 19:00:06 2025
    From Newsgroup: comp.lang.c++

    Am 10.03.2025 um 18:51 schrieb Michael S:

    What my code?
    Post it.

    I just changed the function name and that the code uses xtrunc() instead
    of trunc() since trunc() is slow with MSVC. I removed my improvement _mm_getcsr() / _mm_setcsr() since the speedup was noticeable but not significant, unlike the xtrunc() optimization, which made a speedup of
    about +100% with MSVC.

    // fmodM: Michael S's FMA-based fmod(), renamed and with trunc() replaced
    // by xtrunc() (per the poster, trunc() is slow with MSVC).
    // NOTE(review): xtrunc() is not defined in this chunk -- presumably a
    // fast drop-in for trunc(); confirm its definition before compiling.
    // Quick path: when |x| * 2^-53 < y the quotient fits in 53 bits, so one
    // truncated division plus one fma() yields the remainder exactly.
    // Slow path: reduce |x| under FE_TOWARDZERO against a scaled-up copy
    // of y until the remainder drops below y.
    double fmodM( double x, double y )
    {
    // NaN x propagates unchanged.
    if( isnan( x ) )
    return x;

    // pre-process y: fold to |y|; reject NaN and 0.
    if( isless( y, 0 ) )
    y = -y;
    else if( isgreater( y, 0 ) )
    ;
    else {
    if( isnan( y ) )
    return y;
    // y==0: domain error; NaN payload tagged via nan("y0").
    feraiseexcept( FE_INVALID );
    return nan( "y0" );
    }

    // y in (0:+inf]

    // Quick path: xx = x / 2^53; xx in (-y:y) means |x/y| < 2^53.
    double xx = x * 0x1p-53;
    if( xx > -y && xx < y ) {
    // among other things, x guaranteed to be finite
    if( x > -y && x < y )
    return x; // case y=+-inf covered here
    // One truncated quotient, then remainder = x - y*d in a single FMA.
    double d = xtrunc( x / y );
    double res = fma( -y, d, x );
    if( signbit( x ) != signbit( res ) ) {
    // overshoot because of unfortunate division rounding
    // it is extremely rare for small x/y,
    // but not rare when x/y is close to 2**53
    // Step d back by one in the direction of x's sign and redo the FMA.
    res = fma( -y, d + (signbit( x ) * 2 - 1), x );
    }
    return res;
    }

    // slow path: only reached with |x/y| >= 2^53.
    if( isinf( x ) ) {
    feraiseexcept( FE_INVALID );
    return nan( "xinf" );
    }

    // Force truncating FP arithmetic for the reduction loop; the previous
    // rounding mode is restored before returning.
    int oldRnd = fegetround();
    fesetround( FE_TOWARDZERO );
    double ax = fabs( x );
    do {
    // Scale a copy of y up by 2^1021 until it is within 2^1022 of ax,
    // so each inner reduction removes a large chunk of the quotient.
    double yy = y;
    while( yy < ax * 0x1p-1022 )
    yy *= 0x1p1021;

    // Reduce ax modulo the scaled yy until ax < yy.
    do
    ax = fma( -yy, xtrunc( ax / yy ), ax );
    while( ax >= yy );

    } while( ax >= y );

    // Remainder carries the sign of x.
    ax = copysign( ax, x );
    fesetround( oldRnd );
    return ax;
    }

    Your idea is really elegant and as I've shown it could be significantly improved with SSE 4.1 along with FMA3. But at the point where I noticed
    how performant a 128 : 64 modulo division through the glibc is and as
    this is superior over the FMA-solution I dropped the whole idea and
    removed the SSE-FMA-code from my test program.

    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Michael S@already5chosen@yahoo.com to comp.lang.c++ on Mon Mar 10 20:38:18 2025
    From Newsgroup: comp.lang.c++

    On Mon, 10 Mar 2025 19:00:06 +0100
    Bonita Montero <Bonita.Montero@gmail.com> wrote:

    Am 10.03.2025 um 18:51 schrieb Michael S:

    What my code?
    Post it.

    I just changed the function name and that the code uses xtrunc()
    instead of trunc() since trunc() is slow with MSVC. I removed my
    improvement _mm_getcsr() / _mm_setcsr() since the speedup was
    noticeable but not significant, unlike the xtrunc() optimization,
    which made a speedup of about +100% with MSVC.

    // Intended drop-in replacement for fmod(x, y): remainder of x / y with
    // the sign of x. Uses xtrunc() (a faster trunc(), defined elsewhere in
    // this program) on both paths; the slow path additionally forces
    // FE_TOWARDZERO rounding so the truncated quotient never exceeds the
    // true one.
    double fmodM( double x, double y )
    {
        if( isnan( x ) )
            return x;

        // Pre-process y: fold it into (0:+inf]. isless/isgreater are used
        // instead of </> so quiet NaNs do not raise FE_INVALID; a NaN y
        // falls through both tests and is returned, y == 0 is a domain
        // error.
        if( isless( y, 0 ) )
            y = -y;
        else if( isgreater( y, 0 ) )
            ;
        else {
            if( isnan( y ) )
                return y;
            // y==0
            feraiseexcept( FE_INVALID );
            return nan( "y0" );
        }

        // y in (0:+inf]

        // Quick path: taken when |x / y| fits in 53 bits, so one rounded
        // division plus an exact fma yields the remainder directly.
        double xx = x * 0x1p-53;
        if( xx > -y && xx < y ) {
            // among other things, x guaranteed to be finite
            if( x > -y && x < y )
                return x; // case y=+-inf covered here
            double d = xtrunc( x / y );
            double res = fma( -y, d, x );
            if( signbit( x ) != signbit( res ) ) {
                // overshoot because of unfortunate division rounding
                // it is extremely rare for small x/y,
                // but not rare when x/y is close to 2**53:
                // step the quotient back by one toward zero
                res = fma( -y, d + (signbit( x ) * 2 - 1), x );
            }
            return res;
        }

        // slow path: |x / y| does not fit in 53 bits; infinity is a
        // domain error
        if( isinf( x ) ) {
            feraiseexcept( FE_INVALID );
            return nan( "xinf" );
        }

        // Reduce |x| by multiples of a scaled-up y until it drops below y.
        // With round-toward-zero the quotient ax / yy never rounds up, so
        // each exact fma step leaves a non-negative partial remainder.
        int oldRnd = fegetround();
        fesetround( FE_TOWARDZERO );
        double ax = fabs( x );
        do {
            double yy = y;
            while( yy < ax * 0x1p-1022 )
                yy *= 0x1p1021;

            do
                ax = fma( -yy, xtrunc( ax / yy ), ax );
            while( ax >= yy );

        } while( ax >= y );

        ax = copysign( ax, x );
        fesetround( oldRnd );
        return ax;
    }



    That is *not* the "flipflop" code that I have considered relevant for
    approximately a week.
    The relevant code is the one posted a week ago. I am posting it for
    the second time:

    #include <math.h>
    #include <fenv.h>

    double my_fmod(double x, double y)
    {
    if (isnan(x))
    return x;

    // pre-process y
    if (y < 0)
    y = -y;
    else if (y > 0)
    ;
    else {
    if (isnan(y))
    return y;
    // y==0
    feraiseexcept(FE_INVALID);
    return NAN;
    }

    // y in (0:+inf]
    double ax = fabs(x);

    // Quick path
    if (ax * 0x1p-53 < y) {
    // among other things, x guaranteed to be finite
    if (ax < y)
    return x; // case y=+-inf covered here
    double d = floor(ax/y);
    double res = fma(-y, d, ax);
    if (res < 0) {
    // overshoot because of unfortunate division rounding
    // it is extremely rare for small x/y,
    // but not rare when x/y is close to 2**53
    res += y;
    }
    if (x < 0)
    res = -res;
    return res;
    }

    // slow path
    if (isinf(x)) {
    feraiseexcept(FE_INVALID);
    return NAN;
    }

    int flipflop = 0;
    do {
    double yy = y;
    while (yy < ax * 0x1p-1022)
    yy *= 0x1p1021;

    do {
    ax = fma(-yy, floor(ax/yy), ax);
    flipflop ^= (ax < 0);
    ax = fabs(ax);
    } while (ax >= yy);
    } while (ax >= y);
    if (flipflop)
    ax = y - ax;
    if (x < 0)
    ax = -ax;
    return ax;
    }

    Your idea is really elegant

    I'd rather call it "simple" or "straightforward". "Elegant" in my book
    is something else. For example, the code above is closer to what I
    consider elegant.
    May be, later today or tomorrow, I'll show you solution that I consider
    bright. Bright, but impractical.

    and as I've shown it could be
    significantly improved with SSE 4.1 along with FMA3. But at the point
    where I noticed how performant a 128 : 64 modulo division through the
    glibc is and as this is superior over the FMA-solution I dropped the
    whole idea and removed the SSE-FMA-code from my test program.


    Even if compiler generates good code for long division, there are still multiple problems with this approach:

    1. Long division is very slow on majority of older CPUs. That includes
    CPUs that are quite fast in the absolute sense, like Intel Skylake,
    with all its subvariants (Kaby Lake, Coffee Lake, Whiskey Lake, etc...)
    and AMD Zen2.

    2. The source language is not standard C (or C++ for that matter). One
    has to use either gnu extensions or Microsoft's extensions. In the latter
    case, it becomes non-portable to ARM64.

    3. It is slow under msys2/mingw64 and probably slow under Linux with
    gnu compiler. You can easily test if the latter is true and to tell me.

    4. Even on CPUs with fast long division and with
    compilers/libraries that are able to generate long division it is
    measurably slower than fdiv/floor/fma in the case that corresponds to
    my quick path. And slower than standard library in this case. That is
    less visible with MSVC, but quite obvious with other compilers. I don't
    know an exact reason for that, but would guess that this new CPUs, esp.
    new CPUs from AMD, do not handle dual transition of data between
    domains FF->Integer->FP particularly well. So, when the work is
    short, it ends up better doing everything on the floating-point side,
    even if calculation is a little longer.

















    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Bonita Montero@Bonita.Montero@gmail.com to comp.lang.c++ on Mon Mar 10 22:34:13 2025
    From Newsgroup: comp.lang.c++

    Am 10.03.2025 um 19:38 schrieb Michael S:

    The relevant code is the one posted a weak ago. I am posting it for
    the second time:

    #include <math.h>
    #include <fenv.h>

    // Computes the remainder of x / y with the sign of x (C fmod()
    // semantics) using only double arithmetic: floor + fused multiply-add
    // on the quick path, iterative exact reduction on the slow path.
    double my_fmod(double x, double y)
    {
    if (isnan(x))
    return x;

    // pre-process y
    // Fold y into (0:+inf]: the result does not depend on the sign of y.
    // A NaN y is propagated unchanged; y == 0 is a domain error.
    if (y < 0)
    y = -y;
    else if (y > 0)
    ;
    else {
    if (isnan(y))
    return y;
    // y==0
    feraiseexcept(FE_INVALID);
    return NAN;
    }

    // y in (0:+inf]
    double ax = fabs(x);

    // Quick path
    // Taken when the quotient ax / y fits into 53 bits, so one rounded
    // division plus an exact fma gives the remainder directly.
    if (ax * 0x1p-53 < y) {
    // among other things, x guaranteed to be finite
    if (ax < y)
    return x; // case y=+-inf covered here
    double d = floor(ax/y);
    double res = fma(-y, d, ax);
    if (res < 0) {
    // overshoot because of unfortunate division rounding
    // it is extremely rare for small x/y,
    // but not rare when x/y is close to 2**53
    res += y;
    }
    if (x < 0)
    res = -res;
    return res;
    }

    // slow path
    // Here ax / y exceeds 53 bits; an infinite x is a domain error.
    if (isinf(x)) {
    feraiseexcept(FE_INVALID);
    return NAN;
    }

    // Peel multiples of a scaled-up y off ax until ax < y. Each fma step
    // is exact; steps that overshoot below zero are tracked by parity in
    // 'flipflop' and folded back once at the end.
    int flipflop = 0;
    do {
    double yy = y;
    while (yy < ax * 0x1p-1022)
    yy *= 0x1p1021;

    do {
    ax = fma(-yy, floor(ax/yy), ax);
    flipflop ^= (ax < 0);
    ax = fabs(ax);
    } while (ax >= yy);
    } while (ax >= y);
    if (flipflop)
    ax = y - ax;
    if (x < 0)
    ax = -ax;
    return ax;
    }

    With that code and xtrunc instead of floor the results look like this
    for close exponents (0x3FF to 0x433) with clang++-18 under WSL2:

    fmodO: 9.29622
    fmodM: 11.4518

    And for arbitrary exponents (0x1 to 0x7FE):

    fmodO: 9.29622
    fmodM: 11.4518

    I'd rather call it "simple" or "straightforward". "Elegant" in my book
    is something else. For example, the code above is closer to what I
    consider elegant.

    I think if it were straightforward, some runtime library would have implemented it that way.

    1. Long division is very slow on majority of older CPUs. That includes
    CPUs that are quite fast in the absolute sense, like Intel Skylake,
    with all its subvariants (Kaby Lake, Coffee Lake, Whiskey Lake, etc...)
    and AMD Zen2.

    2. The source language is not standard C (or C++ for that matter). One
    has to use either gnu extensions or Microsoft's extensions. In the latter case, it becomes non-portable to ARM64.

    FMA is available on more advanced CPUs. glibc has no problem being x86-centric, so my code is also x86-centric. 128 / 64 divisions are
    available on the oldest AMD64 CPUs, while FMA came along with the first
    version of AVX, although there are SSE variants of FMA.

    4. Even on CPUs with fast long division and with
    compilers/libraries that are able to generate long division it is
    measurably slower than fdiv/floor/fma in the case that corresponds to
    my quick path.

    The above results are from clang++-18 with -march=native on my Zen4
    -CPU. With MSVC the difference is in the same direction and much worse.


    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Bonita Montero@Bonita.Montero@gmail.com to comp.lang.c++ on Mon Mar 10 22:36:31 2025
    From Newsgroup: comp.lang.c++

    Am 10.03.2025 um 22:34 schrieb Bonita Montero:

    And for arbitrary exponents (0x1 to 0x7FE):

        fmodO: 9.29622
        fmodM: 11.4518

    Sorry, the copy-buffer wasn't refreshed with the new results:

    fmodO: 40.4702
    fmodM: 40.1652
    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Bonita Montero@Bonita.Montero@gmail.com to comp.lang.c++ on Mon Mar 10 23:52:44 2025
    From Newsgroup: comp.lang.c++

    I compared your new version against fmod() of MSVC in terms of accuracy
    and your solution isn't absolutely accurate:

    fmod: 80.0059
    fmodM: 44.21
    50.8631 bits shared accuracy
    equal results: 95.917%
    equal exceptions: 91.017%
    equal NaN signs: 96.466%
    equal NaN-types: 85.78%
    equal NaNs: 66.164%

    All my solutions so far have 100%-values against glibc and MSVC-runtime.

    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Bonita Montero@Bonita.Montero@gmail.com to comp.lang.c++ on Mon Mar 10 23:57:48 2025
    From Newsgroup: comp.lang.c++

    Am 10.03.2025 um 23:52 schrieb Bonita Montero:
    I compared your new vesion against fmod() of MSVC in terms of accuracy
    and your solution isn't absolute accurate:

        fmod: 80.0059
        fmodM: 44.21
        50.8631 bits shared accuracy
        equal results: 95.917%
        equal exceptions: 91.017%
        equal NaN signs: 96.466%
        equal NaN-types: 85.78%
        equal NaNs: 66.164%

    All my solutions so far have 100%-values against glibc and MSVC-runtime.

    These is a small list of values where MSVC is different than your code.
    The first number after the = is MSVC's fmod, the second after the comma
    is your code. A leading "S:" means that a value is signalling NaN, a
    leading "Q:" means that a value is a quiet NaN. ":E" means that the
    result triggered a FE_INVALID exception.

    0x1.f3906bc1b8a5bp-39 % 0x1.ce82ac28f6d31p-480 =
    0x1.cb9017b2eb6d0p-481:*, 0x1.ce82ac28f6d23p-480:E
    -0x1.db9e55a99ab7ep+288 % 0x1.0ea94e32a4911p-837 =
    -0x1.1a0f723fb6168p-839:*, -0x1.0ea94e301527ep-837:*
    0x1.6cd295bc49bc9p+40 % -0x1.181d078b8be21p-593 =
    0x1.e1b68391d8d86p-594:*, 0x1.1f97fa7e3cf68p-624:*
    0x1.693b8d5c89f20p+450 % -0x1.835aa5a150414p-641 =
    0x1.acab4ce1b1fc0p-642:*, 0x1.835aa29b6ea2fp-641:*
    0x1.bdde634d7b156p+227 % 0x1.bfd2a77df6d50p-966 =
    0x1.4e1d4d2aa24d0p-966:*, 0x1.bfd1c9b19df91p-966:*
    0x1.05f1b1985d89bp+805 % -0x1.b30b0f4f9a276p+358 =
    0x1.b24ef343c18f0p+357:*, 0x1.b30b0f4f9a26ep+358:*
    0x1.c51a2760f5593p-60 % -:Q:08cc56216b9b = -:Q:08cc56216b9b:*, -:Q:08cc56216b9b:*
    0x1.920c6dab8413ep+848 % -0x1.83fe96148a1f5p-608 =
    0x1.89ffc2c4fe6b0p-612:*, 0x1.83fe95a92fb60p-608:*
    -0x1.1f85e2baf9b81p+442 % -0x1.b6c265ae6c7f2p-735 =
    -0x1.87cd395179cc8p-737:*, -0x1.aab5559a169c0p-770:*
    0x1.caca80f1fbd5cp+92 % -0x1.5af03c5e3af31p-29 =
    0x1.0e459ee5e237ap-29:*, 0x1.a0663e416d964p-76:*
    0x1.e3b768fb131a4p-221 % +:S:02b894aade34 = +:S:02b894aade34:*, +:S:02b894aade34:*
    0x1.cc8085c385f91p+728 % 0x1.a87b45bbf44d0p+4 = 0x1.9171dd366fb80p+3:*, 0x1.e357b06433180p-39:*
    -inf % -0x1.5679339e426cap+833 = -:Q:000000000000:*, +:Q:000000000000:* 0x1.4c70b5c9428a0p+686 % -0x1.9171d61daeba1p-487 =
    0x1.57ea17d959160p-488:*, 0x1.9171d61dae69fp-487:*
    -0x1.52a6303a14500p+656 % -0x1.ad04391aaea96p-949 =
    -0x1.0f02ac17840aap-949:*, -0x1.7004336b70620p-956:*
    -0x1.0bdb317d8fad0p-767 % -:S:46e8429d5b1f = -:S:46e8429d5b1f:*, -:S:46e8429d5b1f:*
    -0x1.1ad7e2eb90171p+1017 % 0x1.a15c8f17c6c46p+206 =
    -0x1.a8f525c667820p+204:*, -0x1.c9531a58ca788p+191:*
    0x1.f48a4fc42f92bp-674 % 0x1.cfc25d39d546ep-794 =
    0x1.d93f965fe3148p-796:*, 0x1.0a6a691537120p-840:E
    -0x1.3b74c883ad486p-37 % 0x1.3b6bc4c9630e4p-682 =
    -0x1.138406da8d324p-682:*, -0x1.3b67c7dd85193p-682:E
    -0x1.a9338d85aec1fp-59 % 0x1.b1bb2fe2dac6ap-189 =
    -0x1.6605992b756c4p-189:*, -0x1.7c807b3947f50p-222:E
    0x1.3cb0d3f492cbep+742 % 0x1.b9ee7c1d9dc7fp-377 =
    0x1.e7207e18e1f64p-379:*, 0x1.954bb610d104ep-414:*
    -:S:75cf442862ac % -0x1.18d9e4c580dc3p+888 = -:S:75cf442862ac:*, -:S:75cf442862ac:*
    0x1.a422ba47d8b4cp+931 % -0x1.f14d3e01678afp+650 =
    0x1.e9285e9f57eb0p+649:*, 0x1.f14d3e016789cp+650:*
    0x1.9305131399232p-92 % -0x1.151226c38af1ap-793 =
    0x1.a63acaf84e6e8p-794:*, 0x1.84df91a241950p-826:E
    -0x1.a3902bd9caf38p-358 % -:Q:7b18016a9dad = -:Q:7b18016a9dad:*, -:Q:7b18016a9dad:*
    -0x1.feffb10a8faf9p-821 % -:Q:3dc9368f0f2b = -:Q:3dc9368f0f2b:*, -:Q:3dc9368f0f2b:*
    -0x1.303257ecb7accp+311 % 0x1.1079a8749c0cfp-460 =
    -0x1.dab61acde63b0p-463:*, -0x1.1079a8749c0cfp-460:*
    0x1.ec6774d00c8dap+292 % -0x1.a7ee93cb98208p+39 =
    0x1.2aa60d920f4d0p+39:*, 0x1.a7ee9265d284cp+39:*
    -0x1.c86dc88ae039dp-11 % -0x1.9f1b5aba7a1a3p-833 =
    -0x1.83a8fadfbd164p-834:*, -0x1.674f068517da0p-885:E
    -0x1.c572653fae811p+830 % 0x1.0a773c9bac2bap-360 =
    -0x1.461532c5050c4p-361:*, -0x1.25a6aabe2ba58p-371:*
    -:S:09d771d520c5 % 0x0.0000000000000p+0 = -:S:09d771d520c5:*, -:S:09d771d520c5:*
    -0x1.fd78d355657fap+378 % 0x1.bac5a74a96292p-808 =
    -0x1.cf57ab01b8168p-810:*, -0x1.bac5a736234a0p-808:*
    -0x1.60886219128b2p-181 % -0x1.e0da6be14b319p-801 =
    -0x1.7d892c58d801dp-801:*, -0x1.56f7456dcc78ap-840:E
    -0x1.673fc3d551aabp+788 % 0x1.33629a122cb93p+657 =
    -0x1.91d1f5c6c07e6p+656:*, -0x1.b76b6d2ee64b4p+624:*
    0x1.56e9cc0e38d51p+150 % -0x1.bf241f2243df8p-837 =
    0x1.84e51c39c9980p-839:*, 0x1.b0945e68254dap-837:*
    -inf % -0x1.a91df61cdff55p-190 = -:Q:000000000000:*, +:Q:000000000000:* 0x1.3637b51a4cef5p+704 % -0x1.7321fee9405dap+174 =
    0x1.35091dab4b7fcp+173:*, 0x1.e1fecd5a95078p+156:*
    0x1.87b6bf31e8f4dp+417 % 0x1.0d7e010bd24d1p+107 =
    0x1.807cf1f9a930cp+105:*, 0x1.20f38f9396204p+89:*
    0x1.2be81d593fb2bp+654 % 0x1.be9286b355f8cp+615 =
    0x1.cd0041c7ddda0p+612:*, 0x1.3047a80b7f7a0p+599:*
    0x1.dea5144722dbfp+921 % 0x0.b513d2204ebb4p-1022 =
    0x0.46213d625ee74p-1022:*, 0x0.b513d21d82ae5p-1022:E
    -0x1.fb8987bc24854p-338 % -0x1.547c2fcd66d1ap-430 =
    -0x1.26d9137c7a176p-430:*, -0x1.547bf8a4b1ef6p-430:E
    -:S:1b29754c5a61 % -0x0.0000000000000p+0 = -:S:1b29754c5a61:*, -:S:1b29754c5a61:*
    -0x1.e6b1f8fdeffe5p-815 % +:Q:6028ccd5ed9b = +:Q:6028ccd5ed9b:*, +:Q:6028ccd5ed9b:*
    0x1.02506fa14b30dp+757 % -0x1.293042280db98p-281 =
    0x1.77c049e18c880p-284:*, 0x1.206f3efcbcb7cp-281:*
    0x1.dfe723c99470cp+761 % -0x1.f4e4e5c401e3ap+602 =
    0x1.3451fa7ce6290p+600:*, 0x1.c4c02af23af50p+602:*
    -0x1.40f2c7b476892p+177 % 0x1.598f0e6f54478p-69 =
    -0x1.b6c1215f45a00p-73:*, -0x1.598f0e6f09656p-69:*
    0x1.ae1571bcc13d3p+800 % -0x1.27f708bc9e5eap+761 =
    0x1.46a1fe4d37f74p+760:*, 0x1.f8ca44ccf51c0p+746:*
    -inf % -0x0.6eb15b84ec762p-1022 = -:Q:000000000000:*, +:Q:000000000000:* -0x1.6eef2ed2bd997p-367 % 0x1.24f0dfb1d013cp-725 =
    -0x1.0bd5725b7a8e8p-725:*, -0x1.3d07e25757980p-751:E
    -0x1.80825cff43fedp+297 % 0x1.d2fa1162d808ap-239 =
    -0x1.9d715085e17e0p-243:*, -0x1.d2d2baf3759a0p-239:*
    -0x1.f3ef264938cdbp+578 % -0x1.7a7361f2c374dp-594 =
    -0x1.3456c0b093d82p-595:*, -0x1.18c8cde8a4fcep-636:*
    0x1.2fef3ebb2a83cp+271 % 0x1.305b597908258p-256 =
    0x1.6bbc385a6bf40p-258:*, 0x1.002affd4abc60p-281:*
    -0x1.17be45284e395p+801 % 0x1.5a522f5c2de83p-200 =
    -0x1.9677eb7bf9d06p-201:*, -0x1.5a522f5c2db80p-200:*
    -0x1.57d0dd98ac18ap+137 % 0x1.c32b88a59fe56p-327 =
    -0x1.68a7520c8a1e0p-329:*, -0x1.c32b88a10b018p-327:*
    0x1.265ff059ca95bp-464 % 0x1.27e29972a0af2p-878 =
    0x1.9fbc54a730db4p-879:*, 0x1.27e2995cd1fbap-878:E
    -0x1.080283533b4fbp-251 % 0x0.6437eff3e0c4dp-1022 = -0x0.12f12bf43d424p-1022:*, -0x0.0000000000005p-1022:E
    -0x1.1cee4185128fep+712 % -0x1.3809a31ed19ddp+150 =
    -0x1.7299814d7e1d0p+146:*, -0x1.3809a31ecdae5p+150:*
    -0x1.30540d1fe448ap+914 % 0x1.0604359f2bdc4p-943 =
    -0x1.9833d33fe54f0p-945:*, -0x1.d21e1f3f1e83cp-944:*
    0x1.c8243149f8cebp-469 % -0x1.f5cfcf76e9a96p-949 =
    0x1.0b057c96e4dd6p-949:*, 0x1.f5cfa80564c88p-949:E
    -0x1.ab3666e78c1e4p+501 % 0x1.62569bac2d7a3p+485 =
    -0x1.bdcdcf0704138p+483:*, -0x1.7380d4821c8d8p+446:*
    0x1.5d5a05222a4fdp+131 % -:S:10b49421137b = -:S:10b49421137b:*, -:S:10b49421137b:*
    0x1.c1c5c8c8c576ap+471 % 0x1.c4ef1b4bf646ap-857 =
    0x1.19014231bdc32p-857:*, 0x1.c4ef1b4bf6384p-857:*
    -0x1.b3036e1018f3ap+954 % 0x1.12b54b9bc60cbp-106 =
    -0x1.9227762d84bf2p-107:*, -0x1.7efb1c6d82c10p-150:*
    -0x1.dadbc417031c0p-394 % 0x0.b4cbf249fc9aep-1022 = -0x0.2f327af302c84p-1022:*, -0x0.000000644dd0ap-1022:E
    0x1.e106de3e95ad8p+775 % 0x1.34cf8bf2b1e25p+627 =
    0x1.fef3325406150p+624:*, 0x1.5da5008cf1ca8p+612:*
    0x1.a021410d37bafp+943 % 0x1.0447b1e4a4245p-68 =
    0x1.fd040c22ea6ccp-70:*, 0x1.0447b1dd3a815p-68:*
    0x1.9003f1808bc04p+84 % 0x1.85f71d08b694bp-665 =
    0x1.19ea7f28dbf38p-668:*, 0x1.45dddfbb51c28p-683:*
    -0x1.6866b2963447fp+946 % 0x1.6aef20853c933p+479 =
    -0x1.b85d0be5b8e90p+476:*, -0x1.6aef2030c473ap+479:*
    0x1.076c10825d6adp-100 % -0x0.c634f8a4283dep-1022 =
    0x0.06ab8ad7d38a0p-1022:*, 0x0.c632fa9b73ecdp-1022:E
    -0x1.526afec81d0adp+912 % 0x1.e3952b1d3e152p-582 =
    -0x1.3f44586f37d62p-582:*, -0x1.c221c35d0b620p-627:*
    0x1.a7366af0b36e4p-573 % 0x1.a5c39226cd23ap-782 =
    0x1.664043f332d0cp-782:*, 0x1.a4db4ba4f5441p-782:E
    0x1.72d14378039cep+59 % -0x0.a147a5df15a01p-1022 =
    0x0.9ca3811e921b8p-1022:*, 0x0.000009177dfccp-1022:E
    -0x1.fd24a08a6f52bp+893 % -0x1.ee9da37e89d02p-847 =
    -0x1.350ee33be82c0p-852:*, -0x1.ee9d820e59401p-847:*
    -0x1.515e88f40aeaep+120 % 0x0.f3c4bf501bbb9p-1022 = -0x0.90ca9ea63183fp-1022:*, -0x0.f3c35012c7c66p-1022:E
    -0x1.ed0a86327586bp-317 % -0x1.79df69443e400p-579 =
    -0x1.b994a2bbc1000p-581:*, -0x1.79dcdf7d921fep-579:E
    -:S:7672e928bf68 % -0x1.cdbb6ab8d7b03p-117 = -:S:7672e928bf68:*, -:S:7672e928bf68:*
    -:S:081e683dda7e % -0x1.dceb39934861fp+259 = -:S:081e683dda7e:*, -:S:081e683dda7e:*
    -0x0.0000000000000p+0 % -:Q:75ed446ec7a4 = -:Q:75ed446ec7a4:*, -:Q:75ed446ec7a4:*
    0x1.adc02d9073a3bp+452 % 0x1.a9ab278d13f51p-104 =
    0x1.e88332c5799e0p-109:*, 0x1.a9ab278d13dd6p-104:*
    0x1.2530e56b212d8p-412 % -0x1.37acf81b8d90fp-810 =
    0x1.bc65bc384b140p-816:*, 0x1.37acf81b8d5cdp-810:E
    -:S:6c70c7d49bb7 % 0x1.92a7f6f419b76p-158 = -:S:6c70c7d49bb7:*, -:S:6c70c7d49bb7:*
    0x1.3ef5eebd7e5f7p-181 % -0x1.8c7560f6a725ap-810 =
    0x1.7285be6c6b8f8p-810:*, 0x1.1c4146384d114p-844:E
    0x1.ed15321b6998ep+563 % -:Q:5be28789cb65 = -:Q:5be28789cb65:*, -:Q:5be28789cb65:*
    -0x1.32a3d7e78eb79p-537 % -0x0.62b8f212ad045p-1022 = -0x0.2c3c62fd03a36p-1022:*, -0x0.010b7914d7f18p-1022:E
    0x1.74e4f832866b1p+531 % 0x1.8869e30a82af1p-789 =
    0x1.b0ed52d3be3f0p-793:*, 0x1.87e146410162bp-789:*
    -0x0.0000000000000p+0 % +:Q:38557bd7bfef = +:Q:38557bd7bfef:*, +:Q:38557bd7bfef:*
    0x1.84b2827aab2bdp+29 % 0x1.7f3d131d22ad7p-32 = 0x1.536c8a4118206p-32:*, 0x1.7f3d131d22acfp-32:*
    -0x1.c4c6606ad1e87p+881 % 0x1.70e0997a9689fp+867 =
    -0x1.1984713928b9cp+866:*, -0x1.9707b388df498p+826:*
    -0x1.5c1a267c247d7p+734 % -0x1.3ab95fceade6dp-967 =
    -0x1.9ac9b0dd27a6cp-968:*, -0x1.1c1330623f4d1p-967:*
    -0x1.dea2384d5c6d3p+24 % 0x1.4529498eae0ffp-935 =
    -0x1.3466e259d4db5p-935:*, -0x1.4529498cc443ep-935:*
    -inf % -0x1.97e808a7f8df4p+522 = -:Q:000000000000:*, +:Q:000000000000:* -0x1.b979582d97743p+770 % -0x1.1e72af7936eaep-899 =
    -0x1.8154231576fd0p-902:*, -0x1.8bd3a71a70ec0p-936:*
    0x1.d8e702352e66cp+409 % -0x1.dfee77717fca4p-368 =
    0x1.3e66ebd8b1b6cp-368:*, 0x1.dfee77717fbe3p-368:*
    +:S:46837b2202cc % 0x1.7873abc208651p+94 = +:S:46837b2202cc:*, +:S:46837b2202cc:*
    -0x1.8f7b2379bc0c0p+773 % -0x0.02035b0f42517p-1022 = -0x0.01196830ca964p-1022:*, -0x0.02035b0f42517p-1022:E
    -0x1.afddedad3bca3p+411 % 0x1.793cb5b3c3499p+102 =
    -0x1.6f0b726965a15p+102:*, -0x1.5637782cd14ccp+82:*
    -0x1.fdefee3844196p+476 % 0x1.e90419c0cddefp-780 =
    -0x1.9dad78387d944p-782:*, -0x1.12027dcfe0258p-790:*
    -0x0.0000000000000p+0 % +:S:2e30258a38b6 = +:S:2e30258a38b6:*, +:S:2e30258a38b6:*
    0x1.0f15abc17f9c7p-746 % -0x0.ddd8b5cb2a749p-1022 =
    0x0.37560602be5e3p-1022:*, 0x0.ddd8b5cb2a746p-1022:E
    0x1.378476f9b99b3p-87 % 0x0.266b7f5dc0f55p-1022 =
    0x0.1c9ba4415b7fbp-1022:*, 0x0.0000000000010p-1022:E
    -0x1.799d5f8283c25p+681 % -0x0.5bdcedf4d1686p-1022 = -0x0.547ed045a2138p-1022:*, -0x0.5bdcedf4d1686p-1022:E
    -0x1.3c85d07425b0fp-245 % -:S:211cab11d7dd = -:S:211cab11d7dd:*, -:S:211cab11d7dd:*
    0x0.0000000000000p+0 % -:Q:6c1a20f86641 = -:Q:6c1a20f86641:*, -:Q:6c1a20f86641:*
    0x1.c74d47d529ef1p+508 % -0x1.e34758c8bf2c5p-70 =
    0x1.7e35028e5a0d4p-71:*, 0x1.15c59fb3659a0p-104:*
    -0x1.94d3c15a0cf90p+579 % -0x1.57a0c606e8fa8p-294 =
    -0x1.f552da22f7440p-295:*, -0x1.d85db6e88fb00p-350:*
    -0x1.a61d0a40f417ep-196 % +:Q:5acb57395130 = +:Q:5acb57395130:*, +:Q:5acb57395130:*
    0x1.b0207d3756795p+639 % 0x1.7642d9771edb3p-674 =
    0x1.1624dc4805040p-679:*, 0x1.7642d9771edb2p-674:*
    +:S:7592d85cc736 % 0x1.b812369167f25p-387 = +:S:7592d85cc736:*, +:S:7592d85cc736:*
    0x1.a02e1e147da82p+450 % 0x1.688bb222d612bp-750 =
    0x1.007566df14c0bp-750:*, 0x1.30be3736a6dd6p-752:*
    0x1.28805687d2460p+405 % 0x1.3f4f7338ec139p-144 =
    0x1.819f7d9a2e2eep-145:*, 0x1.1a23d51a542e0p-193:*
    0x1.081e52924fe9dp-179 % -0x0.3283d218ef012p-1022 =
    0x0.31b06032d76a0p-1022:*, 0x0.0000000004e6dp-1022:E
    -inf % 0x0.9519cce00aaa6p-1022 = -:Q:000000000000:*, +:Q:000000000000:* -0x1.d16423d05da60p+524 % 0x1.a2ad95033b063p-995 =
    -0x1.397576708f8c3p-995:*, -0x0.0aa20d2d123f7p-1022:E
    -0x1.8ef55d814e6abp+883 % -0x0.3378c574766e4p-1022 = -0x0.1671a798673ccp-1022:*, -0x0.337274db46f86p-1022:E
    0x1.fbbc1b7920635p-649 % -:Q:4a93d54803f5 = -:Q:4a93d54803f5:*, -:Q:4a93d54803f5:*
    -0x1.d09edd208393ep+709 % -:Q:66de064b1685 = -:Q:66de064b1685:*, -:Q:66de064b1685:*
    -0x1.8ea9a217a6effp+850 % -0x1.1815431add1a4p+374 =
    -0x1.bf507edcde350p+372:*, -0x1.181542fe17c45p+374:*
    -0x1.2daf1bf0e5d13p+199 % 0x1.9cef03f3cc2edp-403 =
    -0x1.a77c721faffaap-404:*, -0x1.51adc57c797a0p-409:*
    -0x1.a1e88b029465bp+832 % -0x1.fc682ce37787bp-63 =
    -0x1.dba2ab5e14e76p-63:*, -0x1.fc682ce3747bep-63:*
    -0x1.5f270d128c748p-281 % +:S:6c34c9b5e4c0 = +:S:6c34c9b5e4c0:*, +:S:6c34c9b5e4c0:*
    -inf % -0x1.c668413334d5bp-963 = -:Q:000000000000:*, +:Q:000000000000:* -0x1.b16799cacfbe7p+68 % 0x1.61dd8c0cd5129p-855 =
    -0x1.05a1c4459cca0p-855:*, -0x1.c3116c30505e4p-863:*
    -0x1.d1c58f9899e87p+318 % -0x1.cc84945d5b796p-379 =
    -0x1.2632a8f8a4800p-388:*, -0x1.97f30f92d5090p-394:*
    -0x1.b013c6b151f76p+719 % +:S:3bef3dd00cc8 = +:S:3bef3dd00cc8:*, +:S:3bef3dd00cc8:*
    -0x0.0000000000000p+0 % +:Q:000608ecfae8 = +:Q:000608ecfae8:*, +:Q:000608ecfae8:*
    -inf % -0x1.c4091395ab537p+98 = -:Q:000000000000:*, +:Q:000000000000:* 0x1.d1cc289247e7bp-365 % -0x1.398e53924c941p-490 =
    0x1.adabfa5e699e6p-491:*, 0x1.e427fa46eaf58p-529:E
    -inf % 0x0.0000000000000p+0 = -:Q:000000000000:*, +:Q:000000000000:* 0x1.a0890ccaba309p+146 % -0x1.3ea42dd561679p+111 =
    0x1.17fe6b64b14f4p+110:*, 0x1.35c9ea24b2c94p+92:*
    0x1.070f9dab6357cp-540 % -:Q:390657e54615 = -:Q:390657e54615:*, -:Q:390657e54615:*
    -0x1.374a2c816f75dp+105 % -0x1.29b0dc0098d70p-755 =
    -0x1.4a4fc81ed1b40p-756:*, -0x1.29b0cc9ddfe5bp-755:*
    -inf % 0x1.eb85833cdc7dep-278 = -:Q:000000000000:*, +:Q:000000000000:*
    -inf % -0x1.da2b0a902da25p+138 = -:Q:000000000000:*, +:Q:000000000000:* -0x1.de186e195b3b4p-363 % 0x1.52450120e6441p-711 =
    -0x1.acd1fc8441c26p-712:*, -0x1.52450120e3b1cp-711:E
    -0x1.e75ffd14e916bp+817 % +:Q:3add6df48ede = +:Q:3add6df48ede:*, +:Q:3add6df48ede:*
    -0x1.6f4c4b4343943p+676 % -0x1.a1ebb919d9d86p+238 =
    -0x1.9db899c7f5fc0p+234:*, -0x1.f783d25d4cd00p+183:*
    -0x1.d6bbe9c32fd6ep+9 % -0x1.af1f4b558df6ap-476 =
    -0x1.33eec5760264cp-476:*, -0x1.9fb6984b65e26p-476:*
    0x1.55283a1151a10p-137 % -0x1.a887efec42e0bp-827 =
    0x1.bf00241ad7366p-828:*, 0x1.22a85fbbb42fcp-844:E
    -:S:037f8c9a52d4 % -0x1.5997d57d2df65p+712 = -:S:037f8c9a52d4:*, -:S:037f8c9a52d4:*
    0x1.f7067b2c1d7ccp+190 % 0x1.83d4e306f6917p-476 =
    0x1.04c8890eac772p-476:*, 0x1.83d4e306f68a2p-476:*
    -0x1.a709b33b14c2bp+577 % -0x1.54af390e0c3f0p-345 =
    -0x1.4654f08f1a1c0p-347:*, -0x1.bc2ac47ba83c0p-353:*
    -0x1.cda203086f3f5p+181 % -0x1.ed29ddc5c80b4p-951 =
    -0x1.3d085eb001238p-951:*, -0x1.ed29dd6139725p-951:*
    0x1.f11e3c321678bp+736 % -0x1.5995beaa2b6bfp+619 =
    0x1.8169668561980p+618:*, 0x1.5995beaa2b697p+619:*
    -:S:57a3f664f4b1 % 0x1.2d899a1838accp-454 = -:S:57a3f664f4b1:*, -:S:57a3f664f4b1:*
    -:S:0792c2c0e65d % -0x1.e594b75fca2ebp-622 = -:S:0792c2c0e65d:*, -:S:0792c2c0e65d:*
    0x1.d315c18b1be5ep+612 % 0x1.c1cf287812e41p-16 =
    0x1.7e92f80055d0dp-16:*, 0x1.c1cf24fa6f3ffp-16:*
    -0x1.ce2fd68fe3f88p+109 % -0x1.548a894c57f79p-783 =
    -0x1.2297bcb6dcd78p-784:*, -0x1.548a894c57ee6p-783:*
    -0x1.8c7d4e71b040ap-837 % 0x1.c56c03b917b94p-1012 = -0x1.609cf2eb2b6b8p-1013:*, -0x0.00000000003c5p-1022:E
    -0x1.f7008c6efc4f1p+833 % -0x1.2799ce2fe8d15p+368 =
    -0x1.293976731bbeep+367:*, -0x1.360dffa8ee0a8p+343:*
    +:Q:62ac0b7aa18b % +:Q:6687c1114d2e = +:Q:6687c1114d2e:*, +:Q:62ac0b7aa18b:* -0x1.05296066d952bp+798 % -0x1.44b5dc7a87fabp-763 =
    -0x1.379123c62e116p-764:*, -0x1.44b5dc7a8556ep-763:*
    0x1.3a13b93f6260ep+821 % 0x1.b760ec18452e3p-819 =
    0x1.ccfd310c46f2cp-820:*, 0x1.b7071893a84edp-819:*
    0x1.f778ebd7612aap-524 % 0x1.b0b4efa6ff719p-892 =
    0x1.60304d6e02ec3p-892:*, 0x1.b079dfe57f51cp-892:E
    0x1.26d1cf7523321p+917 % 0x0.5b9c65858b32ep-1022 =
    0x0.45c9258ced302p-1022:*, 0x0.0000000000521p-1022:E
    -0x1.27a7da8e7d5bep-477 % 0x1.0191e9715e8acp-857 =
    -0x1.8407692ec72c8p-858:*, -0x1.ac8b04b95c600p-866:E
    -:S:796644ff35b9 % 0x1.b6ec67a0a4a1ep-229 = -:S:796644ff35b9:*, -:S:796644ff35b9:*
    0x1.1ceba2f4b4d67p+587 % 0x1.b6833a133bcb4p-482 =
    0x1.045303a582d30p-484:*, 0x1.2baa633bcbee0p-506:*
    -0x1.1a4f36ab2299fp-791 % -0x1.475ccb3010580p-929 =
    -0x1.41e40130b1300p-930:*, -0x1.2735791edba00p-955:E
    0x1.3ad2dd96f3170p-278 % 0x0.7a28f935814f7p-1022 =
    0x0.408e9585953b6p-1022:*, 0x0.0000002723cccp-1022:E
    -inf % 0x1.6c6a5497d9974p+524 = -:Q:000000000000:*, +:Q:000000000000:* 0x1.b21b8180e1052p+466 % 0x0.2f689c757d593p-1022 =
    0x0.1b98199deb26ep-1022:*, 0x0.2f689c757a61cp-1022:E
    -0x1.f72de68d827bap+915 % -0x1.7d57b389b509dp+451 =
    -0x1.efdf5fac2c7e0p+449:*, -0x1.7d57b37441313p+451:*
    -inf % 0x0.acd891dd609f9p-1022 = -:Q:000000000000:*, +:Q:000000000000:*
    -inf % -0x1.2a88972abe444p+731 = -:Q:000000000000:*, +:Q:000000000000:* 0x1.23466efdf0dcfp-263 % 0x1.afdd61e8b63a7p-506 =
    0x1.ffa93d4ddc3acp-507:*, 0x1.172dd60c8224ep-535:E
    -:S:75346ab05ace % 0x1.f94231fab62a1p-47 = -:S:75346ab05ace:*, -:S:75346ab05ace:*
    0x1.158e33bd86dacp+569 % 0x1.9dd74b66314eap-446 =
    0x1.7642561075f2ep-446:*, 0x1.9dd74b5b20cffp-446:*
    0x1.ede372b8d12b3p+48 % 0x1.64a7ae7d16cb9p-791 =
    0x1.cf4f14c98e588p-793:*, 0x1.09868ef132d90p-829:*
    -0x1.fdc55a7098d16p+819 % 0x1.c86ab67c2768fp-368 =
    -0x1.5f3ff8a351e4cp-368:*, -0x1.db5ecfafe75e8p-389:*
    -0x1.c1351c205a114p+985 % 0x1.100efe35d256cp+418 =
    -0x1.dc41d9edf7fd0p+417:*, -0x1.100efe35bb157p+418:*
    -0x1.ca82d91974e11p+414 % -0x1.f1b9bc8b4b540p-317 =
    -0x1.bf91b0b3cc8c0p-317:*, -0x1.c59b6ef4b7b00p-367:*
    +:S:7398cb29ad3e % -0x1.76cfed81d0fbap-936 = +:S:7398cb29ad3e:*, +:S:7398cb29ad3e:*
    -0x1.36ec4d878b279p+958 % 0x1.ba12e6c9636d9p-204 =
    -0x1.e3de9522be2e8p-207:*, -0x1.ba12e6c963682p-204:*
    -0x1.555e0d8d9dafcp-286 % -0x1.710574dc17bc5p-607 =
    -0x1.86d5d19a78edcp-608:*, -0x1.88572bde6b3a8p-619:E
    -0x1.cace77640c345p+408 % 0x1.aeafe208ccb38p-876 =
    -0x1.0b80363d350e0p-876:*, -0x1.1beea77edab50p-906:*
    -:S:192ca14ace2f % 0x1.6dca69c14dd41p+473 = -:S:192ca14ace2f:*, -:S:192ca14ace2f:*
    0x1.b46ff52c54fa4p+912 % -0x1.e76eb5c52911cp+880 =
    0x1.37c6afc2f4bd8p+880:*, 0x1.21ebbb6318c00p+854:*
    -0x1.b30033fea3b41p+645 % -0x1.61c15a38a9ee6p-865 =
    -0x1.78fc6641deb38p-867:*, -0x1.61c15a366ffe6p-865:*
    -0x1.6dbc2e9ee5669p-909 % -0x0.1fb91677e4a25p-1022 = -0x0.004268165e4c8p-1022:*, -0x0.0000000000007p-1022:E
    -0x1.9dfb222ffb5b2p+299 % 0x1.5a99ed2a84c20p+129 =
    -0x1.0a5da9bc5fce0p+129:*, -0x1.5a99ed2a84bffp+129:*
    0x1.f4f940ee0a476p+580 % 0x0.622dd1898514cp-1022 =
    0x0.495c50e13d574p-1022:*, 0x0.0000000000006p-1022:E
    -0x1.831c08e80339ep+506 % -0x1.0dcc58f713107p-392 =
    -0x1.6d008d08cfbd2p-393:*, -0x1.b5503708eb588p-432:*
    -0x1.6dcb17e8003e7p+545 % 0x1.e9597c2faf7c7p-848 =
    -0x1.a7b5d9cb9ec0ap-848:*, -0x1.1d64176d5f05ap-899:*
    0x1.af09cacc69639p+686 % -0x1.52945ba2161aap+678 =
    0x1.337277a353c2ep+678:*, 0x1.52945ba216171p+678:*
    -0x0.0000000000000p+0 % +:Q:6d2241cbeb07 = +:Q:6d2241cbeb07:*, +:Q:6d2241cbeb07:*
    -inf % 0x1.fc583271fef46p-40 = -:Q:000000000000:*, +:Q:000000000000:* 0x1.ab1d619a2c6ebp+268 % -0x1.b41ae6dfffa02p-531 =
    0x1.1301a8bf57d40p-531:*, 0x1.5f75e6aaf2350p-558:*
    0x1.abbc82fea32ecp-91 % -0x0.40173b2155994p-1022 =
    0x0.0dc5a56d613c4p-1022:*, 0x0.40173b2155993p-1022:E
    +:S:575f4a6796bd % -0x1.58c9b31c0681dp-429 = +:S:575f4a6796bd:*, +:S:575f4a6796bd:*
    0x1.d0aa56c5914f8p+25 % 0x1.b1ee6d238b758p-205 =
    0x1.e1a0d5f41cd20p-206:*, 0x1.b1ee6d238b73fp-205:*
    -0x1.3258901c35b37p-226 % 0x1.d605b9bb4104ap-248 =
    -0x1.7ab582a37f938p-248:*, -0x1.b0ef2151ed9a0p-280:*
    0x1.09ab50ac1234fp-203 % +:S:4cb04526ab2a = +:S:4cb04526ab2a:*, +:S:4cb04526ab2a:*
    0x1.793bc5ff58821p-705 % -:S:72923dfb17d2 = -:S:72923dfb17d2:*, -:S:72923dfb17d2:*
    +:S:4ff2e99a9d2a % -0x1.d19e1ddb4b200p-356 = +:S:4ff2e99a9d2a:*, +:S:4ff2e99a9d2a:*
    0x1.eefc1967ea01ap+227 % -0x1.9a60be3993cb9p+202 =
    0x1.3345297f47cd9p+202:*, 0x1.9a60be3718c92p+202:*
    0x1.2f1bcd2f750e6p+702 % -0x1.ba4bcdfaa6f19p-973 =
    0x1.3db4e955df1f1p-973:*, 0x1.ba4bcdfa87d9dp-973:*
    +:S:590f33ef7e4b % -0x1.84c3b96598caap+901 = +:S:590f33ef7e4b:*, +:S:590f33ef7e4b:*
    -:S:26a00caf8d6e % 0x1.f7e13f4d6c16ap+388 = -:S:26a00caf8d6e:*, -:S:26a00caf8d6e:*
    +:S:2f0574fe9d6d % 0x1.15095c5d1ad54p+581 = +:S:2f0574fe9d6d:*, +:S:2f0574fe9d6d:*
    0x1.18b7c640f0c87p+400 % 0x1.9d7507770aae6p-774 =
    0x1.1d5fb6cf486f0p-775:*, 0x1.3d2b87b9e46c0p-812:*
    0x1.ddb1f56316646p+648 % 0x1.9a86e340f9bfbp+208 =
    0x1.ab312a20b2ac8p+205:*, 0x1.9a86e340f9bf8p+208:*
    0x1.87cec2f44a5c2p+794 % 0x1.82a7563438c17p+509 =
    0x1.5aba928b2acd4p+509:*, 0x1.82a7563438c0cp+509:*
    -0x1.4ffae216aeac4p+642 % 0x1.279eeb8e39303p-424 =
    -0x1.8031feb60d0c4p-425:*, -0x1.f81c28d0a5f40p-452:*
    0x1.e504832c6ca66p+796 % -0x1.191f71664bff7p-526 =
    0x1.21d237bd72aa0p-531:*, 0x1.4ae1f1cfed900p-579:*
    +:S:5fd7e66ba539 % 0x1.966df34b06b39p-720 = +:S:5fd7e66ba539:*, +:S:5fd7e66ba539:*
    -0x1.994cff326fee1p+272 % -0x1.8dbe934960029p+81 =
    -0x1.b2b97288bed80p+75:*, -0x1.f33eab9a98158p+56:*
    -0x1.5b4f03732d4e2p+779 % -0x1.ec866aa711b0cp-757 =
    -0x1.63fdbdeed1a78p-758:*, -0x1.f661828e15680p-812:*
    0x1.0f79971211108p+763 % -0x1.616c26e5f83d0p+721 =
    0x1.484224c7a7cd0p+721:*, 0x1.fce9e52096500p+707:*
    0x1.6434ce40c38b8p+683 % -0x1.a553687c67e1ep+59 =
    0x1.823315c286120p+57:*, 0x1.0da4e82c68480p+21:*
    0x1.13a2606d28377p-780 % -0x0.fcf48a24d586fp-1022 =
    0x0.483844c525cdap-1022:*, 0x0.fcf48a23599abp-1022:E
    -0x1.10bf2cf884cb2p+816 % -0x1.65be20d380c1cp+336 =
    -0x1.fdc6c0d6857e0p+335:*, -0x1.65bdcc2751b1bp+336:*
    -0x1.9c14da6e0ba59p-107 % 0x1.6e50fff133fe5p-751 =
    -0x1.86a5efe7aaed0p-755:*, -0x1.8ef7727accb00p-771:E
    -0x1.2c7d44b103b09p+410 % 0x1.475f4dc5a7aacp+140 =
    -0x1.80de48f606170p+138:*, -0x1.03e73920555b2p+140:*
    0x1.0674fbac0ce22p+925 % 0x1.b4edec2d37390p+903 =
    0x1.7747609c8a990p+903:*, 0x1.b4edec2ba389dp+903:*
    0x1.f268d2339668dp-412 % 0x1.9af436111d1f4p-889 =
    0x1.afb1d5bd8bb00p-890:*, 0x1.9af0aa2995b34p-889:E
    -0x1.9df964b74891fp+553 % -0x1.fd8771c162c36p-11 =
    -0x1.ed214f320bff0p-11:*, -0x1.1f766edd6c930p-52:*
    0x1.73453865d5ec6p+448 % -0x1.dacfe0a02aa30p+328 =
    0x1.539a8b614ce80p+328:*, 0x1.dacfe0a02a858p+328:*
    -0x1.665197a65b5ffp+3 % -0x1.b363654e0df02p-280 =
    -0x1.314c3fb43a868p-282:*, -0x1.00f7e6d5530c0p-328:*
    -0x1.fccb514963614p+787 % 0x1.c679cf174436dp+542 =
    -0x1.74e7cb1ea03e7p+542:*, -0x1.5a34fd7d72d20p+510:*
    -0x1.34518008c2dfcp-820 % 0x0.4fab23608ff1bp-1022 = -0x0.3d69dfaa7d118p-1022:*, -0x0.000247f6201f9p-1022:E
    0x1.0ed7bfb52c919p+782 % -0x1.474a0f0fc4a6dp-230 =
    0x1.a57c5359c23b8p-233:*, 0x1.474a0f0e9be35p-230:*
    -:S:0820be10ce32 % -inf = -:S:0820be10ce32:*, -:S:0820be10ce32:* 0x1.40957f6579bebp-581 % -0x1.88ffef6bb8906p-1012 =
    0x1.1e8ede71b4690p-1014:*, 0x1.86f3aa23f30f9p-1012:E
    -0x1.a276dbcd05ce2p+486 % -0x1.884b5722e4d36p-942 =
    -0x1.e294fae6f79d0p-945:*, -0x1.31994c245ab68p-996:*
    -0x1.507f4291410f1p+623 % -0x1.9734831b71b5dp-287 =
    -0x1.4ecaeb4ed0c96p-288:*, -0x1.9734831af41e5p-287:*
    -0x0.e30ce38c5f3cap-1022 % 0x0.cc77cc8d08a67p-1022 = -0x0.169516ff56963p-1022:*, -0x0.0000000000000p+0:E
    -0x1.fe2210d42e88bp+541 % -0x1.c6cc8a1b166f3p+422 =
    -0x1.70acf6e25e444p+421:*, -0x1.c6cc8a1b16549p+422:*
    0x1.9768755ec94f0p-490 % 0x0.54fd41fa8e5b9p-1022 =
    0x0.3ef10b1ed9885p-1022:*, 0x0.54da5621723bfp-1022:E
    -0x1.4482250ac2cc4p-692 % -0x1.b56d29a02e388p-926 =
    -0x1.3616a233d8ad0p-926:*, -0x1.dc0e6bf1f7500p-965:E
    -0x1.77dd2a83f8548p-154 % 0x0.7b3de566ca8e0p-1022 = -0x0.6b8f117ef60e0p-1022:*, -0x0.7b346d1e82c50p-1022:E
    -0x1.a63e15e83078dp+200 % -0x0.d695683a6c35dp-1022 = -0x0.aca74bf011ea1p-1022:*, -0x0.0000000004f0fp-1022:E
    0x1.095e55052867ap+870 % 0x1.84bf556755569p-746 =
    0x1.113a89470aea6p-747:*, 0x1.84bf556732469p-746:*
    -0x1.0206653a6121cp-470 % -:S:1289b1e0b1eb = -:S:1289b1e0b1eb:*, -:S:1289b1e0b1eb:*
    -0x1.9e407485e3f38p+298 % -0x1.571972d6f7235p-947 =
    -0x1.49228fe699d40p-949:*, -0x1.571972cbc8ddbp-947:*
    0x1.80465931b2e3ap-134 % -0x0.c615934a0e3cap-1022 =
    0x0.3c24761f52046p-1022:*, 0x0.000000000000ap-1022:E
    -0x1.687bcb4d3a568p+590 % 0x1.ba0d04e072ce9p-939 =
    -0x1.71f27de91d919p-939:*, -0x1.3345d2063bb52p-942:*
    -0x1.2434c48f4c020p-618 % -:S:3954133c649d = -:S:3954133c649d:*, -:S:3954133c649d:*
    +:Q:76047fbb33a6 % -:Q:346df81637c4 = -:Q:346df81637c4:*, +:Q:76047fbb33a6:* 0x1.3d3438dcbfd24p+429 % 0x1.e7cdb00d7546cp-759 =
    0x1.6a69b573a7780p-760:*, 0x1.e7cd935c192d1p-759:*
    0x1.3ef19e7eb1c49p-382 % -0x0.06a35176833dap-1022 =
    0x0.0224eb53cf854p-1022:*, 0x0.000000029fb7fp-1022:E
    +:S:29887b86b495 % 0x1.89f4467e9976bp-358 = +:S:29887b86b495:*, +:S:29887b86b495:*
    0x1.d2abedbf78f32p+820 % -0x1.35c9e84c6e43ap-121 =
    0x1.76b9c8ae35b94p-122:*, 0x1.9f25f85b18140p-173:*
    0x1.a3fa2aba002bdp+960 % -:S:6ced5e0b115b = -:S:6ced5e0b115b:*, -:S:6ced5e0b115b:*
    -inf % -0x1.1498f38858e6fp+680 = -:Q:000000000000:*, +:Q:000000000000:*
    -inf % 0x1.e707eb3365530p+358 = -:Q:000000000000:*, +:Q:000000000000:* -0x1.dac63c7c2d337p+249 % 0x1.1e3b03d0e5a55p-546 =
    -0x1.21bb66a14b49cp-547:*, -0x1.1e3b0397884b2p-546:*
    0x1.e1f383151da0bp+362 % 0x1.363c48151a8c4p-755 =
    0x1.651fbb60e0a48p-756:*, 0x1.d24a369a001e0p-799:*
    0x1.058430db9348fp+735 % 0x1.e55ab256660a6p-632 =
    0x1.462c5435174bap-632:*, 0x1.364630abca780p-644:*
    0x1.5577cd9ccb71fp+859 % -0x1.06ba82d1ae72bp-796 =
    0x1.f0ed0452a28dcp-797:*, 0x1.06ba82d1ae5b7p-796:*
    -inf % 0x1.5cf1665456db5p+724 = -:Q:000000000000:*, +:Q:000000000000:* -:S:42c4fc523d7b % -0x1.8b6ff9ef2b001p-655 = -:S:42c4fc523d7b:*, -:S:42c4fc523d7b:*
    -0x1.a129d23867040p+716 % 0x1.941fe5005e1b2p-295 =
    -0x1.40645cdc2a1b8p-295:*, -0x1.941fe5004d848p-295:*
    0x1.20cbcfac0bb7ep+885 % 0x1.e279eab0b0ff0p+757 =
    0x1.2a3a0a6eddf80p+755:*, 0x1.53f0bbe3c0000p+713:*
    0x1.e780f54829124p+52 % -0x1.d42eb4caf1af3p-1000 =
    0x1.4746a17870f2dp-1000:*, 0x0.000015ec71a55p-1022:E
    -inf % 0x1.1526f2cf23c26p-102 = -:Q:000000000000:*, +:Q:000000000000:* -0x1.590cc7ca068a2p+363 % -0x1.6b657269531a6p-506 =
    -0x1.4926f7ecd66f8p-507:*, -0x1.5a46352b14c4fp-506:*
    -0x1.1920c5def8f9ap+271 % -0x1.6cdee32f5cdb2p-541 =
    -0x1.1b4893ef49080p-545:*, -0x1.6cc28a568162fp-541:*
    -0x1.4ec76b75b99f3p+468 % -:Q:609c0abc7a20 = -:Q:609c0abc7a20:*, -:Q:609c0abc7a20:*
    0x1.e612f205b191bp-349 % 0x1.696681d377410p-587 =
    0x1.3f24fd20e9d00p-589:*, 0x1.696681d33d6c3p-587:E
    -0x1.593a578348901p+653 % 0x1.2cd960be3c228p-801 =
    -0x1.9a105cac827c0p-802:*, -0x1.2cd960be3938fp-801:*
    0x1.1df42e63a48a8p-40 % -0x0.d5676cfc32ae8p-1022 =
    0x0.cf3b01fe7a5e0p-1022:*, 0x0.d560dcc23e53cp-1022:E
    -0x1.501919cbfd4c9p+290 % -0x1.c72c031fadf2cp-929 =
    -0x1.8dd20f30247d0p-929:*, -0x1.7694f6c81ec70p-978:*
    0x1.61d529699c607p+591 % -0x1.bc42cfdf25604p+82 =
    0x1.1b152ea1d8b20p+81:*, 0x1.43a4174e194f0p+43:*
    0x1.85fc6d403f6e2p-1 % -0x1.1dcf9bf1b571dp-544 =
    0x1.b496060c3a0d8p-546:*, 0x1.f7a7086a78b20p-552:*
    0x1.d3bed90eb7d77p+29 % 0x1.8f442685305dbp-257 =
    0x1.409651747985bp-257:*, 0x1.8f442685305dap-257:*
    0x1.4820968c00a25p-207 % 0x1.5ea986ee0fcf6p-657 =
    0x1.1f2df1683fc80p-661:*, 0x1.ce83137d1fdf0p-701:E
    -0x1.627ce9c4c4afap-90 % -0x0.3f77e0dfd2763p-1022 = -0x0.0db2a60384655p-1022:*, -0x0.3f77e0dfd2762p-1022:E
    -inf % -0x1.c7a5394ec89e0p-523 = -:Q:000000000000:*, +:Q:000000000000:*
    -inf % -0x1.7e16797b2f0a9p+859 = -:Q:000000000000:*, +:Q:000000000000:* -0x1.d09bbc949014cp+861 % -0x1.1e360f2192fe7p+766 =
    -0x1.e1afd4aeb6354p+765:*, -0x1.0f6b00c0ce24ap+752:*
    0x1.9b3371d8b1c55p-436 % -0x1.1720e7edd3278p-885 =
    0x1.05e04527ffec0p-885:*, 0x1.1720e7edd3198p-885:E
    -:S:66f8b19c9822 % -0x1.d650a3c1be52cp-148 = -:S:66f8b19c9822:*, -:S:66f8b19c9822:*
    0x1.1a78436e85119p-215 % 0x1.2219db1d0eb30p-410 =
    0x1.08cb55a9540e0p-410:*, 0x1.2219d802c51e6p-410:E
    +:S:0da83e445834 % 0x1.633090c6677dcp+442 = +:S:0da83e445834:*, +:S:0da83e445834:*
    -0x1.b2d56657198bcp-471 % 0x1.580cc7af69db0p-497 =
    -0x1.173a9ea4cb260p-498:*, -0x1.33231482bff80p-524:*
    0x1.0c92524dfb66cp-381 % 0x1.8a7f5f3dc8130p-1018 =
    0x1.359a54e07bf80p-1021:*, 0x1.8a7f5df9169dcp-1018:E
    -0x1.5a791faa8b47ep+553 % -0x1.201aaa7df2633p-147 =
    -0x1.163b4a3ab8dd4p-149:*, -0x1.063eeccffe32cp-159:*
    -inf % -0x1.28b9ae59df005p-524 = -:Q:000000000000:*, +:Q:000000000000:* -0x1.65a7a7e1d5fd9p+617 % 0x1.082baaa15be31p-583 =
    -0x1.0e5928320fbd2p-584:*, -0x1.429c0e77f3ab8p-604:*
    0x1.189eef67fd659p+539 % -:Q:4dc426463cf5 = -:Q:4dc426463cf5:*, -:Q:4dc426463cf5:*
    +:S:76d8539d9fdf % 0x1.8e9e851efea68p+572 = +:S:76d8539d9fdf:*, +:S:76d8539d9fdf:*
    -inf % 0x1.8b6b27e02396fp+357 = -:Q:000000000000:*, +:Q:000000000000:* 0x1.bdf47e2ed4dd7p-36 % +:S:5dec089346bf = +:S:5dec089346bf:*, +:S:5dec089346bf:*
    0x1.090032c22525dp+322 % 0x0.a6e1c81ea125fp-1022 =
    0x0.7612603bb6732p-1022:*, 0x0.000000cf6034ep-1022:E
    0x1.1a74dec343122p-147 % 0x1.95287502465c9p-939 =
    0x1.112e5fe5d1ac0p-944:*, 0x1.52ce25034dca0p-969:E
    -0x1.53730773c9c25p+888 % -0x1.7bc77b3ba7674p-237 =
    -0x1.6cacf30fe44bcp-237:*, -0x1.7bc6fbb85d8dep-237:*
    -0x1.abc8198c985ecp-514 % -0x0.5644431fc94bfp-1022 = -0x0.2c0b93c461360p-1022:*, -0x0.00000000001bbp-1022:E
    0x1.c3e04a479db00p-631 % -0x1.c8579514811f9p-651 =
    0x1.d01e67b839b30p-654:*, 0x1.c857951461b90p-651:*
    -0x1.09379a0b4cd12p-633 % -0x0.347b1a6a03576p-1022 = -0x0.0712d86b57780p-1022:*, -0x0.0000000000002p-1022:E
    -0x1.cf15f1db70929p+150 % -0x1.8ccd79520450cp-78 =
    -0x1.3ac054c687600p-84:*, -0x1.8ccd79520450cp-78:*
    0x1.ee805c37ffdafp+264 % -0x0.77f072f70104ep-1022 =
    0x0.2f60fb5bedfcap-1022:*, 0x0.77f072f700d30p-1022:E
    0x1.c2b08f6a72b9ep-58 % -0x1.0782586a7b996p-984 =
    0x1.a4d5b372f2958p-986:*, 0x1.d9ed3223f4e42p-985:E
    0x1.8007bd94c874ap+467 % 0x1.f7e2e78c6c424p+166 =
    0x1.acf8df5101680p+164:*, 0x1.f7e2e5d537630p+166:*
    0x1.62dee22f1e415p+802 % 0x1.a53970bd6bb25p-250 =
    0x1.34f6616177380p-254:*, 0x1.a53970bd6baf6p-250:*
    -0x1.9c706d6b77f12p+639 % -0x1.65ee550dad389p-218 =
    -0x1.533233b8df2c0p-224:*, -0x1.65ee536578556p-218:*
    -0x1.4b3fa5267178ep+819 % 0x1.0e2a822988440p+298 =
    -0x1.50834ff070400p+294:*, -0x1.cb1fc7bae8400p+275:*
    0x1.199303c075608p+42 % -0x1.64d299e988256p-511 =
    0x1.82fcc40ce3524p-512:*, 0x1.8fbb9d3ea3560p-564:*
    -0x1.3203bdf387cdap-862 % -0x0.b089f89f1baacp-1022 = -0x0.a231e2df870fcp-1022:*, -0x0.07b929544eda6p-1022:E
    -0x1.8e3437ff09c61p+627 % 0x1.8f439f049ad08p-453 =
    -0x1.4b0c7c1ae5600p-455:*, -0x1.8f439e72b88e3p-453:*
    -0x1.9133201dd47ffp+400 % 0x1.8398a8d2c5cdfp+164 =
    -0x1.809a03e114ce8p+161:*, -0x1.ea5f265267160p+129:*
    -0x1.bf98c7da17084p+571 % 0x1.b7c82c6a1725cp-66 =
    -0x1.2cc1ae4f627f0p-66:*, -0x1.1eb2cf1a66390p-89:*
    0x1.d3f4f9d9ff04fp-198 % -0x1.154fb60f49542p-648 =
    0x1.5d208c0b2caa8p-649:*, 0x1.2e85837ffa550p-686:E
    0x1.8080d7a03b27fp+863 % 0x1.872d450b3fa5ep-456 =
    0x1.70e4e932ab7dep-456:*, 0x1.3a2545edea4b4p-496:*
    +:S:6eee3c65938b % -:Q:1c1cb40ad58b = -:Q:1c1cb40ad58b:*, +:S:6eee3c65938b:* 0x1.262ea83a6d9d4p+260 % 0x1.0d5787dba5106p-201 =
    0x1.e3e1561ed281cp-202:*, 0x1.0d5787db03fa3p-201:*
    0x1.8880f54a4e8c0p+660 % -0x1.2fbc35131efafp+210 =
    0x1.5dc21ec7b01d8p+208:*, 0x1.2fbc35131ef95p+210:*
    -0x1.5bf42310b0f32p+700 % 0x1.f5295fc4c433bp-277 =
    -0x1.5d20cd8ae3af8p-278:*, -0x1.5411e826871f4p-287:*
    0x1.0afb32562ce7bp+741 % -0x0.77288a354a1d4p-1022 =
    0x0.11133907f43e0p-1022:*, 0x0.77288a354a1b4p-1022:E
    -0x1.1381db13f973ap+645 % -0x1.1120f19f24687p-669 =
    -0x1.6bd67e3456eb8p-670:*, -0x1.6716d6db18b40p-684:*
    -0x1.6adeff1912c8ep+856 % 0x1.340dcaf6b0671p+832 =
    -0x1.139230157d945p+832:*, -0x1.cf26f744ca740p+801:*
    0x1.972d60871661ep+160 % -0x1.d55d8b6e2f399p-491 =
    0x1.72948f106dc33p-491:*, 0x1.c5f85be3fe230p-502:*
    0x1.c407aca0f1077p-324 % 0x1.8717479cadb98p-915 =
    0x1.854aee9ce6190p-916:*, 0x1.870d1835120b0p-915:E
    +:S:4c9c99efe3fd % 0x1.fc301cc69b114p+999 = +:S:4c9c99efe3fd:*, +:S:4c9c99efe3fd:*
    -0x1.f1f2e8fdc92d9p+76 % +:S:78f1e50f6c72 = +:S:78f1e50f6c72:*, +:S:78f1e50f6c72:*
    0x1.775caa63cf403p-175 % 0x0.3da05b0ce1239p-1022 =
    0x0.397062b8c2ba8p-1022:*, 0x0.3da05b0ca8bd4p-1022:E
    0x1.e35abaf2e5c36p+997 % -0x1.5d002d30e68a0p-599 =
    0x1.4815c5a0a8980p-599:*, 0x1.e2f5eb5802700p-645:*
    0x1.b50c78d7f1291p+437 % -0x1.8bcb83efdc6cfp-372 =
    0x1.f934613ebb33ep-373:*, 0x1.7b5226e317c20p-392:*
    0x0.c02b45917f785p-1022 % +:S:53c115574d25 = +:S:53c115574d25:*, +:S:53c115574d25:*
    -0x1.5a7e3135042d9p+430 % -0x1.a5f1f7a966beep-906 =
    -0x1.358f959b0d52ep-906:*, -0x1.a5a3db640c9d9p-906:*
    0x1.2cdab04360b84p+454 % -0x0.b241b8ad803f5p-1022 =
    0x0.5d3ae84d02fcap-1022:*, 0x0.041a0bf642dd0p-1022:E
    +:S:528b4a5d4299 % -0x1.f9d68a5685362p+141 = +:S:528b4a5d4299:*, +:S:528b4a5d4299:*
    -0x1.d5da6e97b77f2p+443 % 0x1.9e13eaebfbf96p-704 =
    -0x1.0750efe5b0544p-704:*, -0x1.1c64c29e37638p-716:*
    -inf % -:Q:1f400346a573 = -:Q:1f400346a573:*, -:Q:1f400346a573:* 0x1.f501e427149e5p-65 % +:Q:52618610950c = +:Q:52618610950c:*, +:Q:52618610950c:*
    0x1.385a7fca6c009p+351 % -0x1.44cddc10040ebp+72 =
    0x1.0ef222f712122p+72:*, 0x1.fae44f12dc640p+20:*
    -0x1.a6e815d856d8ep+132 % -0x0.e25268faa2cddp-1022 = -0x0.b637348b300d8p-1022:*, -0x0.0018e411810bep-1022:E
    +:S:3ab9f10930d2 % -0x1.d7ceaacbb637ap-822 = +:S:3ab9f10930d2:*, +:S:3ab9f10930d2:*
    -inf % -0x1.2b751d38488d0p+941 = -:Q:000000000000:*, +:Q:000000000000:* -0x1.35df27600fedbp+69 % 0x1.b90d646195dc9p-469 =
    -0x1.5d7dcdbf752acp-471:*, -0x1.7393f95dd287ap-482:*
    0x1.c57854970a884p+743 % -0x1.c5ea6c4bea2bfp+741 =
    0x1.c4220d786b9d3p+741:*, 0x1.3299466e60cc6p+689:*
    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Bonita Montero@Bonita.Montero@gmail.com to comp.lang.c++ on Tue Mar 11 09:34:06 2025
    From Newsgroup: comp.lang.c++

    Partially wrong alarm: I forgot to convert my xtrunc() function into
    an xfloor() function. These are the accuracy results now compared to
    fmod() of MSVC:

    53 bits shared accuracy
    equal results: 100%
    equal exceptions: 91.017%
    equal NaN signs: 96.475%
    equal NaN-types: 99.78%
    equal NaNs: 96.253%

    These are the accuracy results compared to glibc:

    53 bits shared accuracy
    equal results: 100%
    equal exceptions: 99.901%
    equal NaN signs: 87.224%
    equal NaN-types: 93.181%
    equal NaNs: 80.405%


    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Michael S@already5chosen@yahoo.com to comp.lang.c++ on Tue Mar 11 12:29:02 2025
    From Newsgroup: comp.lang.c++

    On Tue, 11 Mar 2025 09:34:06 +0100
    Bonita Montero <Bonita.Montero@gmail.com> wrote:

    Partitially wrong alarm: I forget to convert my xtrunc() function into
    an xfloor() function. These are the accuracy results now compared to
    fmod() of MSVC:

    53 bits shared accuracy
    equal results: 100%
    equal exceptions: 91.017%
    equal NaN signs: 96.475%
    equal NaN-types: 99.78%
    equal NaNs: 96.253%

    These are the accuracy results compared to glibc:

    53 bits shared accuracy
    equal results: 100%
    equal exceptions: 99.901%
    equal NaN signs: 87.224%
    equal NaN-types: 93.181%
    equal NaNs: 80.405%



    Pay attention that fmod() has no requirements w.r.t. to such exceptions
    as FE_INEXACT, FE_UNDERFLOW and non-standard FE_DENORMAL.
    Strictly speaking, even raising FE_OVERFLOW is not illegal, but doing
    so would be bad quality of implementation.
    Also spec does not say what happens to FE_INVALID when one of the
    inputs is signalling NAN.



    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Michael S@already5chosen@yahoo.com to comp.lang.c++ on Tue Mar 11 12:47:06 2025
    From Newsgroup: comp.lang.c++

    On Mon, 10 Mar 2025 22:36:31 +0100
    Bonita Montero <Bonita.Montero@gmail.com> wrote:
    Am 10.03.2025 um 22:34 schrieb Bonita Montero:

    And for arbitrary exponents (0x1 to 0x7FE):

        fmodO: 9.29622
        fmodM: 11.4518

    Sorry, the copy-buffer wasn't refreshed with the new results:

    fmodO: 40.4702
    fmodM: 40.1652
    Let's establish common measurement methodology.
    Here is my throughput test bench (I have different test bench for
    correctness tests and plan to build yet different one for latency
    tests).
    #include <algorithm> // std::nth_element
    #include <cfenv>
    #include <chrono>
    #include <cmath>
    #include <cstdint>   // uint64_t / int64_t
    #include <cstdio>
    #include <cstring>
    #include <random>
    #include <vector>
    extern "C" double my_fmod(double x, double y);
    // Throughput test bench: times libm fmod() against my_fmod() over a
    // vector of random operand pairs and cross-checks that both produce
    // the same results (any NaN vs. any NaN counts as equal).
    // Returns 1 on the first mismatch, 0 on success.
    int main(void)
    {
    const int VEC_LEN = 100000; // operand pairs per timing pass
    const int N_IT = 31; // timing repetitions; the median is reported
    std::vector<double> xy(VEC_LEN*2); // interleaved x,y operand pairs
    std::vector<double> res(VEC_LEN); // my_fmod results
    std::vector<double> ref(VEC_LEN); // libm fmod reference results
    std::mt19937_64 rndGen; // default seed => reproducible runs
    // rep 0: exponents constrained near the bias; rep 1: fully random bits
    for (int rep = 0; rep < 2; ++rep) {
    if (rep == 0) {
    const uint64_t EXP_MASK = 2047ull << 52;
    for (int i = 0; i < VEC_LEN*2; ++i) {
    uint64_t u = rndGen();
    uint64_t exp = 1023; // y keeps biased exponent 1023, i.e. |y| in [1,2)
    if (i % 2 == 0) { // x: biased exponent 1023..1074, small x/y ratio
    uint64_t exp0 = (u >> 52) & 2047;
    exp += exp0 % 52;
    }
    u = (u & ~EXP_MASK) | (exp << 52); // splice chosen exponent into u
    double d;
    memcpy(&d, &u, sizeof(d)); // bit pattern -> double
    xy[i] = d;
    }
    } else {
    // raw random bit patterns: may contain NaNs, infinities, subnormals
    for (int i = 0; i < VEC_LEN*2; ++i) {
    uint64_t u = rndGen();
    double d;
    memcpy(&d, &u, sizeof(d));
    xy[i] = d;
    }
    }
    auto t00 = std::chrono::steady_clock::now(); // whole-test timer
    const double* pXY = xy.data();
    double* pRef = ref.data();
    double* pRes = res.data();
    std::vector<int64_t> tref(N_IT); // per-iteration libm timings [ns]
    std::vector<int64_t> tres(N_IT); // per-iteration my_fmod timings [ns]
    for (int it = 0; it < N_IT; ++it) {
    auto t0 = std::chrono::steady_clock::now();
    for (int i = 0; i < VEC_LEN; ++i)
    pRef[i] = fmod(pXY[i*2+0], pXY[i*2+1]);
    auto t1 = std::chrono::steady_clock::now();
    for (int i = 0; i < VEC_LEN; ++i)
    pRes[i] = my_fmod(pXY[i*2+0], pXY[i*2+1]);
    auto t2 = std::chrono::steady_clock::now();
    tref[it] =
    std::chrono::duration_cast<std::chrono::nanoseconds>(t1 -
    t0).count(); tres[it] =
    std::chrono::duration_cast<std::chrono::nanoseconds>(t2 -
    t1).count();
    // correctness check: values must compare equal, except that a NaN on
    // either side is accepted when the other side is NaN too (NaN != NaN)
    for (int i = 0; i < VEC_LEN; ++i) {
    if (pRef[i] != pRes[i]) {
    if (!std::isnan(pRef[i]) || !std::isnan(pRes[i])) {
    printf(
    "Mismatch. fmod(%.17e, %.17e).\n"
    "ref %.17e\n"
    "my %.17e\n"
    ,xy[i*2+0]
    ,xy[i*2+1]
    ,ref[i]
    ,res[i]
    );
    return 1;
    }
    }
    }
    }
    auto t11 = std::chrono::steady_clock::now();
    int64_t dt =
    std::chrono::duration_cast<std::chrono::nanoseconds>(t11 -
    t00).count();
    // median of the N_IT timings (nth_element does a partial sort)
    std::nth_element(tref.begin(), tref.begin()+N_IT/2, tref.end());
    std::nth_element(tres.begin(), tres.begin()+N_IT/2, tres.end());
    printf("fmod %6.2f nsec. my_fmod %6.2f nsec. Test time %7.3f msec\n"
    ,double(tref[N_IT/2]) / VEC_LEN
    ,double(tres[N_IT/2]) / VEC_LEN
    ,double(dt)*1e-6
    );
    }
    return 0;
    }
    What happens when you take this code 'as is' compile it and run
    it in each of 3 environments with following options:
    MSVC:
    cl -nologo -O2 -W4 -arch:AVX2 -std:c++20 -MD -EHsc
    gcc/clang under msys2:
    C:
    gcc -O2 -Wall -std=c17 -march=haswell
    or
    clang -O2 -Wall -std=c17 -march=haswell
    C++:
    g++ -O2 -Wall -std=c++20 -march=haswell
    or
    clang++ -O2 -Wall -std=c17 -march=haswell
    gcc/clang under msys2:
    The same as above with addition of -lm

    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Bonita Montero@Bonita.Montero@gmail.com to comp.lang.c++ on Tue Mar 11 12:12:31 2025
    From Newsgroup: comp.lang.c++

    Am 11.03.2025 um 11:29 schrieb Michael S:

    Pay attention that fmod() has no requirements w.r.t. to such
    exceptions as FE_INEXACT, FE_UNDERFLOW and non-standard FE_DENORMAL.

    Yes, that's why I evaluate FE_INVALID only. But your code also can
    set FE_INEXACT due to your "rounding" with sign change. MSVC also
    seems to try to do the math with the FPU with an integer fallback, because
    with exponent differences <= 53 MSVC's fmod() often sets FE_INEXACT;
    but I ignore that because that shouldn't be part of fmod().

    Strictly speaking, even raising FE_OVERFLOW is not illegal,
    but doing so would be bad quality of implementation.

    Couldn't FE_OVERFLOW happen with your implementation when the
    exponents are too far away that you get inf from the division ?

    Also spec does not say what happens to FE_INVALID when one of the
    inputs is signalling NAN.

    See my code; I return MSVC and glibc compatible NaNs and I return
    the same exceptions. MSVC sets FE_INVALID only when x is inf or y
    is zero, glibc in addition raises FE_INVALID when either operand
    is a signalling NaN.

    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Michael S@already5chosen@yahoo.com to comp.lang.c++ on Tue Mar 11 13:34:58 2025
    From Newsgroup: comp.lang.c++

    On Tue, 11 Mar 2025 12:12:31 +0100
    Bonita Montero <Bonita.Montero@gmail.com> wrote:

    Am 11.03.2025 um 11:29 schrieb Michael S:

    Pay attention that fmod() has no requirements w.r.t. to such
    exceptions as FE_INEXACT, FE_UNDERFLOW and non-standard
    FE_DENORMAL.

    Yes, that's while I evaluate FE_INVALID only. But your code also can
    set FE_INEXACT due to your "rounding" with sign change. MSVC seems
    also try to do the math with the FPU with a integer-fallback, because
    with exponent differences <= 53 MSVC's fmod() often sets FE_INECACT;
    but I ignore that because that shouldn't be part of fmod();

    Strictly speaking, even raising FE_OVERFLOW is not illegal,
    but doing so would be bad quality of implementation.

    Couldn't FE_OVERFLOW happen with your implementation when the
    exponents are too far away that you get inf from the division ?


    It couldn't happen. Loop within loop exists for this reason exactly: to
    prevent overflow.

    Also spec does not say what happens to FE_INVALID when one of the
    inputs is signalling NAN.

    See my code; I return MSVC and glibc compatible NaNs and I return
    the same exceptions. MSVC sets FE_INVALID only when x is inf or y
    is zero, glibc in addition raises FE_INVALID when either operand
    is a signalling NaN.


    Exactly. Both options are legal. MS's decision to not set FE_INVALID is
    as good as glibc decision to set it.
    So, test bench should accept both variants as correct.
    BTW, what is the output of MS library in that case? SNAN or QNAN?
    I would think that it should be SNAN even when the other argument is
    QNAN. But even that is probably not required by the Standard.





    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Michael S@already5chosen@yahoo.com to comp.lang.c++ on Tue Mar 11 13:51:27 2025
    From Newsgroup: comp.lang.c++

    On Tue, 11 Mar 2025 12:12:31 +0100
    Bonita Montero <Bonita.Montero@gmail.com> wrote:

    Am 11.03.2025 um 11:29 schrieb Michael S:

    Pay attention that fmod() has no requirements w.r.t. to such
    exceptions as FE_INEXACT, FE_UNDERFLOW and non-standard
    FE_DENORMAL.

    Yes, that's while I evaluate FE_INVALID only.

    I think that testing that FE_DIVBYZERO is not set is also a good
    idea.

    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Bonita Montero@Bonita.Montero@gmail.com to comp.lang.c++ on Tue Mar 11 13:10:31 2025
    From Newsgroup: comp.lang.c++

    Am 11.03.2025 um 12:34 schrieb Michael S:

    Exactly. Both options are legal. MS's decision to not set FE_INVALID is
    as good as glibc decision to set it.

    If I do a SSE-/AVX-operation where either operand is a signalling NaN
    I get a FE_INVALID; since the FPU behaves this way the MSVC runtime
    should do that also.

    BTW, what is the output of MS library in that case? SNAN or QNAN?

    Results with SNaN parameters are always QNaN; that should be common
    with any FPU.

    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Michael S@already5chosen@yahoo.com to comp.lang.c++ on Tue Mar 11 14:55:43 2025
    From Newsgroup: comp.lang.c++

    On Tue, 11 Mar 2025 13:10:31 +0100
    Bonita Montero <Bonita.Montero@gmail.com> wrote:

    Am 11.03.2025 um 12:34 schrieb Michael S:

    Exactly. Both options are legal. MS's decision to not set
    FE_INVALID is as good as glibc decision to set it.

    If I do a SSE-/AVX-operation where either operand is a signalling NaN
    I get a FE_INVALID; since the FPU behaves this way the MSVC runtime
    should do that also.

    BTW, what is the output of MS library in that case? SNAN or QNAN?

    Results with SNaN parameters are always QNaN, that shoud be common
    with any FPU.


    But not when library routine does not use FPU. Or uses FPU only for
    comparison ops.
    The point is, it does not sound right if SNAN is *silently* converted
    to QNAN. That type of conversion has to be loud i.e. accompanied by
    setting of FE_INVALID.


    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Bonita Montero@Bonita.Montero@gmail.com to comp.lang.c++ on Tue Mar 11 14:08:04 2025
    From Newsgroup: comp.lang.c++

    Am 11.03.2025 um 13:55 schrieb Michael S:

    But not when library routine does not use FPU. Or uses FPU only for comparison ops.

    Then the library routine should use fesetexcept() as you do yourself.

    The point is, it does not sound right if SNAN is *silently* converted
    to QNAN. That type of conversion has to be loud i.e. accompanied by
    setting of FE_INVALID.

    Interestingly, even conversion operations from double to float do that.
    That's not what I expected.
    And there a some operations that should do that but actually keep the signalling bit zeroed like a sign-change since this is usually done with
    a XOR-operation.
    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Michael S@already5chosen@yahoo.com to comp.lang.c++ on Tue Mar 11 15:46:47 2025
    From Newsgroup: comp.lang.c++

    On Tue, 11 Mar 2025 14:55:43 +0200
    Michael S <already5chosen@yahoo.com> wrote:

    On Tue, 11 Mar 2025 13:10:31 +0100
    Bonita Montero <Bonita.Montero@gmail.com> wrote:

    Am 11.03.2025 um 12:34 schrieb Michael S:

    Exactly. Both options are legal. MS's decision to not set
    FE_INVALID is as good as glibc decision to set it.

    If I do a SSE-/AVX-operation where either operand is a signalling
    NaN I get a FE_INVALID; since the FPU behaves this way the MSVC
    runtime should do that also.

    BTW, what is the output of MS library in that case? SNAN or QNAN?


    Results with SNaN parameters are always QNaN, that shoud be common
    with any FPU.


    But not when library routine does not use FPU. Or uses FPU only for comparison ops.
    The point is, it does not sound right if SNAN is *silently* converted
    to QNAN. That type of conversion has to be loud i.e. accompanied by
    setting of FE_INVALID.



    I tested. It appears that MSVC implementation made a mistake in
    cases of fmod(snan, qnan):

    MSVC gcc
    x y result FE_INVALID) result FE_INVALID
    snan 1 snan 0 qnan 1
    snan 0 snan 0 qnan 1
    snan inf snan 0 qnan 1
    snan qnan qnan 0 !!! qnan 1
    1 snan snan 0 qnan 1
    0 snan snan 0 qnan 1
    inf snan snan 0 qnan 1
    qnan snan snan 0 qnan 1




    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From James Kuyper@jameskuyper@alumni.caltech.edu to comp.lang.c++ on Tue Mar 11 14:21:02 2025
    From Newsgroup: comp.lang.c++

    On 3/11/25 06:29, Michael S wrote:
    ...
    Pay attention that fmod() has no requirements w.r.t. to such exceptions
    as FE_INEXACT, FE_UNDERFLOW and non-standard FE_DENORMAL.
    Strictly speaking, even raising FE_OVERFLOW is not illegal, but doing
    so would be bad quality of implementation.
    Also spec does not say what happens to FE_INVALID when one of the
    inputs is signalling NAN.

    I've got Bonita killfiled, so the oldest message I can see on this
    thread is one posted by you that indicated you were interested in IEEE
    754 (== ISO/IEC 60559) conformance.

    The C++ standard cross-references the C standard for such issues. The C standard specifies that, for an implementation which predefines __STDC_IEC_60559_BFP__, floating-point exception handling is very
    tightly specified for conformance with ISO/IEC 60559:

    "The double version of fmod behaves as though implemented by
    #include <math.h>
    #include <fenv.h>
    #pragma STDC FENV_ACCESS ON
    double fmod(double x, double y)
    {
    double result;
    result = remainder(fabs(x), (y = fabs(y)));
    if (signbit(result)) result += y;
    return copysign(result, x);
    }
    " (F10.7.1)

    "Operations defined in 6.5 and functions and macros defined for the
    standard libraries change floating-point status flags and control modes
    just as indicated by their specifications (including conformance to IEC
    60559). They do not change flags or modes (so as to be detectable by the
    user) in any other cases." (F8.6)

    "... signbit ... raise[s] no floating-point exceptions, even if an
    argument is a signaling NaN." (F3p6)

    "fabs(x) raises no floating-point exceptions, even if x is a signaling
    NaN." (F10.4.3)

    "— remainder(x, y) returns a NaN and raises the "invalid" floating-point exception for x infinite or y zero (and neither is a NaN)." (F.10.7.2)

    "copysign(x, y) raises no floating-point exceptions, even if x or y is a signaling NaN." (F10.8.1)
    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Michael S@already5chosen@yahoo.com to comp.lang.c++ on Tue Mar 11 20:28:12 2025
    From Newsgroup: comp.lang.c++

    On Mon, 10 Mar 2025 20:38:18 +0200
    Michael S <already5chosen@yahoo.com> wrote:

    On Mon, 10 Mar 2025 19:00:06 +0100
    Bonita Montero <Bonita.Montero@gmail.com> wrote:


    Your idea is really elegant

    I'd rather call it "simple" or "straightforward". "Elegant" in my book
    is something else. For example, the code above is closer to what I
    consider elegant.
    May be, later today or tomorrow, I'll show you solution that I
    consider bright. Bright, but impractical.


    Hear, hear!
    A bright part is in lines 18 to 29. The rest are hopefully competent technicalities.

    #include <string.h>
    #include <stdint.h>
    #include <math.h>
    #include <fenv.h>

    // (x * y) % den computed over the full 128-bit product, so the multiply
    // itself cannot overflow. den must be non-zero; on the MSVC path the
    // quotient must additionally fit in 64 bits for _udiv128.
    static uint64_t umulrem(uint64_t x, uint64_t y, uint64_t den) {
    #ifdef _MSC_VER
    uint64_t hiPart, loPart = _umul128(x, y, &hiPart);
    uint64_t remainder;
    _udiv128(hiPart, loPart, den, &remainder);
    return remainder;
    #else
    unsigned __int128 product = (unsigned __int128)x * y;
    return (uint64_t)(product % den);
    #endif
    }

    // Compute 2**e (mod y) for y < 2**53 by halving the exponent:
    // 2**e = (2**(e/2))**2 * 2**(e&1), reduced with umulrem at every
    // level so no intermediate ever exceeds 128 bits.
    static uint64_t pow2_mod(uint64_t y, unsigned e) {
    if (e < 64) { // small exponent: direct shift, at most one division
    const uint64_t p = (uint64_t)1 << e;
    return p < y ? p : p % y;
    }
    const uint64_t half = pow2_mod(y, e / 2); // 2**(e/2) mod y
    const uint64_t doubled = half << (e & 1); // extra factor 2 if e is odd
    return umulrem(half, doubled, y);
    }

    // Reinterpret a 64-bit pattern as a double; memcpy is the portable type pun.
    static double u2d(uint64_t u) {
    double value;
    memcpy(&value, &u, sizeof value);
    return value;
    }

    // Reinterpret a double as its 64-bit pattern; memcpy is the portable type pun.
    static uint64_t d2u(double d) {
    uint64_t bits;
    memcpy(&bits, &d, sizeof bits);
    return bits;
    }

    // Raise FE_INVALID and return a quiet NaN: adding a signalling NaN
    // (all exponent bits set, quiet bit 51 clear, payload non-zero) makes
    // the FPU set the flag and deliver a quieted NaN.
    static double raise_fe_invalid_ret_nan(double x)
    {
    const uint64_t SNAN_BITS = ~(1ull << 51);
    const double signalling = u2d(SNAN_BITS);
    #ifndef __clang__
    return signalling + x;
    #else
    // volatile stops clang from folding the addition at compile time,
    // which would lose the FE_INVALID side effect
    volatile double forced = signalling;
    return forced + x;
    #endif
    }

    // fmod for IEEE-754 binary64 computed with integer modular arithmetic:
    // instead of a shift-and-subtract loop, the remainder is obtained as
    // mantX * 2**dExp (mod mantY) via a 128-bit multiply-and-remainder.
    double my_fmod(double x, double y)
    {
    const uint64_t INF_EXP = 2047; // biased exponent of inf/NaN
    const uint64_t INF2 = INF_EXP << 53; // inf pattern, pre-shifted left by 1
    const uint64_t HIDDEN_BIT = (uint64_t)1 << 52; // implicit mantissa bit
    const uint64_t MANT_MASK = HIDDEN_BIT - 1; // 52 explicit mantissa bits
    const uint64_t SIGN_BIT = (uint64_t)1 << 63;

    uint64_t ux = d2u(x);
    uint64_t uy = d2u(y);
    // *2 shifts the sign bit out, so ux2/uy2 compare by magnitude only
    uint64_t ux2 = ux*2;
    uint64_t uy2 = uy*2;
    uint64_t sx = ux & SIGN_BIT; // the result inherits the sign of x

    // process non-finite x
    if (ux2 >= INF2) { // x is inf or nan
    if (ux2 > INF2) // x is nan
    return x + y; // raises FE_INVALID when either x or y is sNAN
    // x is inf
    if (uy2 > INF2) // y is nan
    return x + y; // propagate the NaN (signalling on sNaN)
    // y is finite or inf: fmod(inf, y) is a domain error
    return raise_fe_invalid_ret_nan(x);
    }
    // x is finite

    // process non-finite and zero y
    // (uy2 == 0 wraps to UINT64_MAX here, so one compare catches 0 as well)
    if (uy2-1 >= INF2-1) { // y is inf or nan or 0
    if (uy2 > INF2) // y is nan
    return y;
    // y is inf or zero here
    if (uy2 == INF2) // y is inf
    return x; // fmod(finite, inf) == x
    // y is 0: domain error
    return raise_fe_invalid_ret_nan(x);
    }

    // y is finite non-zero
    if (ux2 < uy2)
    return x; // abs(x) < abs(y): x already is the remainder

    // extract mantissa and exponent
    uint64_t mantX = (ux2 >> 1) & MANT_MASK;
    uint64_t mantY = (uy2 >> 1) & MANT_MASK;
    int expX = ux2 >> 53;
    if (expX == 0) { // X subnormal
    // Y is also subnormal (|y| <= |x|), so we can use simple integer reduction
    return u2d((mantX % mantY) | sx);
    }

    int expY = uy2 >> 53;
    if (expY == 0) { // Y subnormal
    mantY |= HIDDEN_BIT; // removed below by the XOR
    expY = 1; // subnormals share the scale of biased exponent 1
    }

    // toggle the implicit bit: sets it for normals, clears the marker above
    mantY ^= HIDDEN_BIT;
    mantX ^= HIDDEN_BIT;
    // pre-reduce mantX below mantY before the multiply-and-remainder
    if (mantX >= mantY) {
    mantX -= mantY;
    if (mantX >= mantY) // can happen when y is subnormal
    mantX %= mantY;
    }

    // remainder of mantX * 2**dExp modulo mantY
    int dExp = expX - expY;
    uint64_t f = (dExp <= 63) ?
    (uint64_t)1 << dExp : // quick path
    pow2_mod(mantY, dExp); // slow path: 2**dExp (mod mantY)
    mantX = umulrem(mantX, f, mantY);

    // apply exponent of Y to mantX
    // ures encodes mantX's low 52 bits with biased exponent expY; ures0 is
    // the value of the implicit leading bit alone. If mantX has bit 52 set
    // the implicit bit is genuine (subtract 0); otherwise subtracting ures0
    // cancels the spurious implicit bit — an exact subtraction that also
    // renormalizes subnormal results.
    uint64_t ures0 = ((uint64_t)expY << 52) | sx;
    uint64_t ures = ures0 | (mantX & MANT_MASK);
    if (mantX & HIDDEN_BIT)
    ures0 = 0;
    return u2d(ures) - u2d(ures0);
    }

    --- Synchronet 3.20c-Linux NewsLink 1.2
  • From Michael S@already5chosen@yahoo.com to comp.lang.c++ on Sun Mar 16 13:48:03 2025
    From Newsgroup: comp.lang.c++

    On Tue, 11 Mar 2025 20:28:12 +0200
    Michael S <already5chosen@yahoo.com> wrote:

    On Mon, 10 Mar 2025 20:38:18 +0200
    Michael S <already5chosen@yahoo.com> wrote:

    On Mon, 10 Mar 2025 19:00:06 +0100
    Bonita Montero <Bonita.Montero@gmail.com> wrote:


    Your idea is really elegant

    I'd rather call it "simple" or "straightforward". "Elegant" in my
    book is something else. For example, the code above is closer to
    what I consider elegant.
    May be, later today or tomorrow, I'll show you solution that I
    consider bright. Bright, but impractical.


    Here, here!
    A bright part is in lines 18 to 29. The rest are hopefully competent technicalities.


    And here is non-recursive implementation of the same algorithm that has following potentially useful properties:
    1. It does not use compiler-specific extensions, only standard C.
    2. It does not use FMA, so gives correct results on implementations
    with broken fma(), like MSVC on pre-AVX computers.


    #include <string.h>
    #include <stdint.h>
    #include <math.h>

    // Reinterpret a 64-bit unsigned integer as an IEEE-754 double.
    // memcpy is the portable way to type-pun (no strict-aliasing issues).
    static double u2d(uint64_t bits) {
        double value;
        memcpy(&value, &bits, sizeof(value));
        return value;
    }

    // Reinterpret an IEEE-754 double as its raw 64-bit pattern.
    // Inverse of u2d(); uses memcpy for portable type punning.
    static uint64_t d2u(double value) {
        uint64_t bits;
        memcpy(&bits, &value, sizeof(bits));
        return bits;
    }

    // Raise FE_INVALID and return a quiet NaN.
    // SNAN_BITS is every bit set except bit 51 (the quiet bit): exponent
    // field all ones with a non-zero mantissa and quiet bit clear, i.e. a
    // (negative) signaling-NaN pattern.  Adding it to x performs an FP
    // operation on an sNaN, which raises FE_INVALID and yields a qNaN.
    static double raise_fe_invalid_ret_nan(double x)
    {
    const uint64_t SNAN_BITS = ~(1ull << 51);
    double snan = u2d(SNAN_BITS);
    #ifndef __clang__
    return snan + x;
    #else
    // NOTE(review): the volatile copy presumably prevents clang from
    // constant-folding the sNaN addition (which would lose the FE_INVALID
    // side effect) -- confirm against the target compiler version.
    volatile double v_snan = snan;
    return v_snan + x;
    #endif
    }

    double my_fmod(double x, double y)
    {
    // Portable, non-recursive fmod() in plain standard C: no FMA, no
    // compiler extensions.  After the special cases the problem reduces to
    // mantX * 2**dExp mod mantY on integer mantissas.  A single double
    // division handles quotients that fit in 53 bits (quick path); binary
    // exponentiation of rem(2**dExp, mantY) -- with each quotient
    // estimated through a double reciprocal of mantY -- handles the rest.
    const uint64_t INF_EXP = 2047; // biased exponent field of inf/NaN
    const uint64_t INF2 = INF_EXP << 53; // infinity bit pattern shifted left by 1
    const uint64_t HIDDEN_BIT = (uint64_t)1 << 52; // implicit leading mantissa bit
    const uint64_t MANT_MASK = HIDDEN_BIT - 1; // the 52 stored mantissa bits
    const uint64_t SIGN_BIT = (uint64_t)1 << 63;

    uint64_t ux = d2u(x);
    uint64_t uy = d2u(y);
    uint64_t ux2 = ux*2; // |x| pattern shifted left by 1 (sign bit dropped)
    uint64_t uy2 = uy*2;
    uint64_t sx = ux & SIGN_BIT; // fmod result carries the sign of x

    // process non-finite x
    if (ux2 >= INF2) { // x is inf or nan
    if (ux2 > INF2) // x is nan
    return x + y; // raises FE_INVALID when either x or y is sNAN
    // x is inf
    if (uy2 > INF2) // y is nan
    return x + y; // propagate the NaN
    // y is finite or inf
    return raise_fe_invalid_ret_nan(x); // fmod(inf, y) is a domain error
    }
    // x is finite

    // process non-finite and zero y
    if (uy2-1 >= INF2-1) { // y is inf, nan or 0 (uy2==0 wraps to UINT64_MAX)
    if (uy2 > INF2) // y is nan
    return y;
    // y is inf or 0 here
    if (uy2 == INF2) // y is inf
    return x; // fmod(finite, inf) == x exactly
    // y is 0
    return raise_fe_invalid_ret_nan(x); // fmod(x, 0) is a domain error
    }

    // y is finite non-zero
    if (ux2 < uy2)
    return x; // abs(x) < abs(y)

    // extract mantissa (hidden bit restored) and biased exponent;
    // subnormals are adjusted further below
    int64_t mantX = ((ux2 >> 1) & MANT_MASK)+HIDDEN_BIT;
    int64_t mantY = ((uy2 >> 1) & MANT_MASK)+HIDDEN_BIT;
    int expX = ux2 >> 53;
    int expY = uy2 >> 53;
    unsigned dExp = expX - expY;
    double ax = fabs(x);
    double ay = fabs(y);
    if (ax*0x1p-53 <= ay) {
    // Quick path: |x|/|y| <= 2**53, so the integer quotient is exactly
    // representable and one double division suffices.
    int64_t d = (int64_t)(ax/ay);
    if (expY == 0) { // Y subnormal
    // don't normalize: subnormals store no hidden bit, and exponent
    // field 0 denotes the same scale as field 1
    mantY -= HIDDEN_BIT;
    expY = 1;
    if (expX == 0) { // X subnormal
    mantX -= HIDDEN_BIT;
    expX = 1;
    }
    dExp = expX - expY;
    }
    // remainder, possibly off by a small multiple of mantY; fixed below
    mantX = (mantX << dExp) - mantY*(uint64_t)d;
    } else {
    // Slow path: dExp too large for a direct division.
    if (expY == 0) { // Y subnormal
    // Normalize by scaling with 2**52 and compensating in the exponent
    uy = d2u(y*0x1p52);
    mantY = (uy & MANT_MASK) | HIDDEN_BIT;
    expY = ((int)(uy >> 52) & 2047) - 52; // may become <= 0
    dExp = expX - expY;
    }
    if (mantY == (int64_t)HIDDEN_BIT)
    return u2d(sx); // Y is power of 2; on this path x is an exact multiple of y -> signed zero

    // Calculate rem(2**dExp, mantY)
    // Halve the exponent until e0 <= 105 so rem(2**e0, mantY) can be
    // estimated from the reciprocal below; the n_steps halvings are
    // undone afterwards by repeated squaring (binary exponentiation)
    unsigned e0, n_steps;
    for (n_steps = 0, e0 = dExp; e0 > 105; ++n_steps)
    e0 /= 2;

    double ry = 1.0/mantY; // double reciprocal used to estimate each quotient
    uint64_t mantRy = (d2u(ry) & MANT_MASK) | HIDDEN_BIT; // mantissa of 1/mantY
    uint64_t d = mantRy >> (105-e0); // ~ floor(2**e0 / mantY)
    // 2**e0 and the product wrap modulo 2**64; the wrap cancels in the
    // subtraction because the true difference is small
    int64_t f = (((uint64_t)1 << 52) << (e0-52)) - mantY*d;
    // f = rem(2**e0, mantY) + a*mantY where -1 <= a <= 1
    if (n_steps > 0) {
    int next_bit = n_steps-1;
    const uint64_t F_MAX = (uint64_t)3 << 54;
    do {
    // square f (doubling the exponent it represents) and reduce,
    // estimating the quotient with the double reciprocal
    double df = (double)f;
    d = (int64_t)(df*df*ry);
    f = (uint64_t)f*(uint64_t)f - mantY*d;
    f <<= (dExp >> next_bit) & 1; // shift in the next bit of dExp
    if (f+F_MAX > F_MAX*2) // |f| drifted out of range; reduce once more
    f -= (int64_t)(f*ry)*mantY;
    --next_bit;
    } while (next_bit >= 0);
    }
    if (mantX >= mantY)
    mantX -= mantY;
    // fold the power-of-two remainder into the mantissa of x:
    // mantX = f * mantX mod mantY (approximately; corrected below)
    d = (int64_t)((double)f*(int64_t)mantX*ry);
    mantX = (uint64_t)f*(uint64_t)mantX - mantY*(uint64_t)d;
    }
    // Final correction: the quotient estimates above can leave mantX a few
    // multiples of mantY away, possibly negative; bring it into [0, mantY)
    while ((uint64_t)mantX >= (uint64_t)mantY) {
    if (mantX < 0)
    mantX += mantY;
    else
    mantX -= mantY;
    }

    if (expY <= 1) { // Y subnormal
    // result scale is at or below the smallest normal exponent, so its
    // bit pattern can be assembled directly
    mantX >>= 1 - expY;
    return u2d(mantX | sx);
    }

    // Apply exponent of Y to mantX
    // Encode 1.frac(mantX) * 2**(expY-bias) and subtract the implicit
    // leading 1.0 * 2**(expY-bias) unless the hidden bit is genuinely set;
    // the FP subtraction renormalizes and handles gradual underflow
    uint64_t ures0 = ((uint64_t)expY << 52) | sx;
    uint64_t ures = ures0 | (mantX & MANT_MASK);
    if (mantX & HIDDEN_BIT)
    ures0 = 0;
    return u2d(ures) - u2d(ures0);
    }

    --- Synchronet 3.20c-Linux NewsLink 1.2