I wanted to optimize fmod to be a bit faster. This is my C++20 solution.[snip]
double myFmod( double x, double y )
It's about six times faster than the glibc 2.31 solution in my
benchmark. The returned NaNs and the raised exceptions are MSVC-
and glibc-compatible.
On Mon, 24 Feb 2025 11:48:08 +0100
Bonita Montero <Bonita.Montero@gmail.com> wibbled:
I wanted to optimize fmod to be a bit faster. This is my C++20 solution.[snip]
double myFmod( double x, double y )
It's about six times faster than the glibc 2.31 solution in my
benchmark. The returned NaNs and the raised exceptions are MSVC-
and glibc-compatible.
double myFmod(double x, double y)
{
double div = x / y;
return y * (div - (long)div);
}
On 24.02.2025 at 13:00, Muttley@DastardlyHQ.org wrote:
On Mon, 24 Feb 2025 11:48:08 +0100
Bonita Montero <Bonita.Montero@gmail.com> wibbled:
I wanted to optimize fmod to be a bit faster. This is my C++20 solution.[snip]
double myFmod( double x, double y )
It's about six times faster than the glibc 2.31 solution in my
benchmark. The returned NaNs and the raised exceptions are MSVC-
and glibc-compatible.
double myFmod(double x, double y)
{
double div = x / y;
return y * (div - (long)div);
}
If the exponent difference between x and y is large enough this
returns results which are larger than y. glibc also does it entirely
with integer operations.
If the values are so large or small that you start to get floating-point errors then you should probably be using integer arithmetic or a large-number library like GMP anyway.
On Mon, 24 Feb 2025 13:48:02 +0100
Bonita Montero <Bonita.Montero@gmail.com> wibbled:
On 24.02.2025 at 13:00, Muttley@DastardlyHQ.org wrote:
On Mon, 24 Feb 2025 11:48:08 +0100
Bonita Montero <Bonita.Montero@gmail.com> wibbled:
I wanted to optimize fmod to be a bit faster. This is my C++20 solution.[snip]
double myFmod( double x, double y )
It's about six times faster than the glibc 2.31 solution in my
benchmark. The returned NaNs and the raised exceptions are MSVC-
and glibc-compatible.
double myFmod(double x, double y)
{
double div = x / y;
return y * (div - (long)div);
}
If the exponent difference between x and y is large enough this
returns results which are larger than y. glibc also does it entirely
with integer operations.
If the values are so large or small that you start to get floating-
point errors then you should probably be using integer
arithmetic or a large-number library like GMP anyway.
On Mon, 24 Feb 2025 13:09:50 -0000 (UTC)
Muttley@DastardlyHQ.org wrote:
If the values are so large or small that you start to get floating-
point errors then you should probably be using integer
arithmetic or a large-number library like GMP anyway.
Your method will sometimes produce results that are 1 LSB off
relative to the IEEE-754 prescription when values are neither small nor
large.
And sometimes 1 LSB off means that the result is 2x off.
For example, for x=0.9999999999999999, y=0.9999999999999998 your method
produces 2.2204460492503126e-16. The correct result is, of course,
1.1102230246251565e-16.
Also, I don't think that your method is any faster than correct methods.
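For reference, a small standalone check (my own program, not part of the post) that reproduces the two values quoted above:
#include <cmath>
#include <cstdio>
int main()
{
    double x = 0.9999999999999999, y = 0.9999999999999998;
    double div = x / y;
    std::printf( "%.17g\n", y * (div - (long)div) ); // 2.2204460492503126e-16
    std::printf( "%.17g\n", std::fmod( x, y ) );     // 1.1102230246251565e-16
}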
On 24.02.2025 at 14:09, Muttley@DastardlyHQ.org wrote:
If the values are so large or small that you start to get floating-
point errors then you should probably be using integer
arithmetic or a large-number library like GMP anyway.
There's no need for large-integer arithmetic since each calculation
step results in a mantissa with equal or fewer bits than the divisor.
Even if the exponents are close enough that no integer bits are
missing, the following multiplication is very likely to lose
precision. All current solutions (MSVC, libstdc++) work with integer
operations and are always 100% precise.
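To make the described reduction concrete, here is a stripped-down sketch (my own illustration, not the posted code; the function name is mine, and it handles only positive, normal, nonzero inputs with no NaN/Inf/denormal-input cases):
#include <bit>
#include <cstdint>
double fmodPositiveNormals( double x, double y )
{
    constexpr uint64_t IMPLICIT = 1ull << 52, MANT = IMPLICIT - 1;
    uint64_t binX = std::bit_cast<uint64_t>( x ), binY = std::bit_cast<uint64_t>( y );
    int expX = int(binX >> 52), expY = int(binY >> 52);
    uint64_t mantX = binX & MANT | IMPLICIT, mantY = binY & MANT | IMPLICIT;
    auto normalize = []( int &e, uint64_t &m )
    {
        int shift = std::countl_zero( m ) - 11; // put the top bit back at position 52
        m <<= shift;
        e -= shift;
    };
    while( expX > expY )
    {
        int bits = expX - expY < 11 ? expX - expY : 11; // shift stays within 64 bits
        mantX = (mantX << bits) % mantY;                // fold back below the divisor
        if( !mantX )
            return 0.0;
        expX -= bits;
        normalize( expX, mantX );
    }
    if( expX == expY && mantX >= mantY )                // at most one subtraction left
    {
        if( !(mantX -= mantY) )
            return 0.0;
        normalize( expX, mantX );
    }
    if( expX <= 0 )                                     // result underflows to a denormal
    {
        mantX >>= unsigned(1 - expX);                   // exact: only zero bits are dropped
        expX = 0;
    }
    return std::bit_cast<double>( uint64_t(expX) << 52 | mantX & MANT );
}
The running remainder never exceeds the divisor's 53-bit mantissa, so a plain 64-bit modulo per step is enough; the full posted code adds the sign, NaN/Inf/zero and denormal-input handling around this core.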
Do you have a real application for fast fmod() or are you just playing?
For as long as y is positive and abs(x/y) <= 2**53, a very simple
formula will produce a precise result: fma(trunc(x/fabs(y)), -fabs(y), x).
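Wrapped up as a function (my own wrapper name, under exactly the stated assumptions y > 0 and abs(x/y) <= 2**53), the formula reads:
#include <cmath>
double fmodViaFma( double x, double y )
{
    double q = std::trunc( x / std::fabs( y ) );  // integral quotient, exact in this range
    return std::fma( q, -std::fabs( y ), x );     // x - q*|y| with a single rounding
}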
On Mon, 24 Feb 2025 16:22:46 +0200
Michael S <already5chosen@yahoo.com> wibbled:
On Mon, 24 Feb 2025 13:09:50 -0000 (UTC)
Muttley@DastardlyHQ.org wrote:
If the values are so large or small that you start to get floating-
point errors then you should probably be using integer
arithmetic or a large-number library like GMP anyway.
Your method will sometimes produce results that are 1 LSB off
relative to the IEEE-754 prescription when values are neither small nor large.
And sometimes 1 LSB off means that result is 2x off.
For example, for x=0.9999999999999999, y=0.9999999999999998 your
method produces 2.2204460492503126e-16. A correct result is, of
course, 1.1102230246251565e-16
Frankly I doubt anyone would care, it's zero in all but name.
Also, I don't think that your method is any faster than correct
methods.
Don't know, but it's only 3 mathematical operations, all of which can
be done by the hardware, so it's going to be pretty fast.
On 24.02.2025 at 16:13, Michael S wrote:
Do you have real application for fast fmod() or just playing?
I experimented with x87 FPREM and wanted to know whether it is
precise; it isn't, and the results can be > y.
So I developed my own routine which is always 100% precise.
For as long as y is positive and abs(x/y) <= 2**53, a very simple
formula will produce a precise result: fma(trunc(x/fabs(y)), -fabs(y), x).
The multiplication will mostly drop bits, so that the difference might
become larger than y.
On Mon, 24 Feb 2025 15:10:53 -0000 (UTC)
Muttley@DastardlyHQ.org wrote:
Don't know, but its only 3 mathematical operations all of which can
be done by the hardware so its going to be pretty fast.
Looks like 4 operations to me - division, truncation, subtraction,
multiplication. If the compiler takes it literally, which it probably will.
Nevertheless, after a bit of thinking I concur that your formula is
faster than 100% correct methods. Initially, I didn't take into account
all the difficulties that correct methods have to face in cases of very
large x to y ratios.
However, your method is approximately the same speed as the *mostly correct*
method shown in my post above. Maybe yours is even a little slower,
at least as long as we use a good optimizing compiler and target modern
CPUs that support trunc() and fma() as fast hardware instructions.
I wanted to optimize fmod to be a bit faster. This is my C++20 solution.
double myFmod( double x, double y )
{
constexpr uint64_t
SIGN = 1ull << 63,
IMPLICIT = 1ull << 52,
MANT = IMPLICIT - 1,
QBIT = 1ull << 51;
uint64_t const
binX = bit_cast<uint64_t>( x ),
binY = bit_cast<uint64_t>( y );
static auto abs = []( uint64_t m ) { return m & ~SIGN; };
auto isNaN = []( uint64_t m ) { return abs( m ) >= 0x7FF0000000000001u; };
auto isSig = []( uint64_t m ) { return !(m & QBIT); };
if( isNaN( binX ) ) [[unlikely]] // x == NaN
#if defined(_MSC_VER)
return bit_cast<double>( isNaN( binY ) ? binY | binX & binY & QBIT :
binX );
#else
{
if( isSig( binX ) || isNaN( binY ) && isSig( binY ) ) [[unlikely]]
feraiseexcept( FE_INVALID );
return bit_cast<double>( binX | QBIT );
}
#endif
if( isNaN( binY ) ) [[unlikely]] // x != NaN || y == NaN
#if defined(_MSC_VER)
return y;
#else
{
if( isSig( binY ) ) [[unlikely]]
feraiseexcept( FE_INVALID );
return bit_cast<double>( binY | QBIT );
}
#endif
auto isInf = []( uint64_t m ) { return abs( m ) == 0x7FF0000000000000u; };
if( isInf( binX ) ) // x == Inf
{
feraiseexcept( FE_INVALID );
#if defined(_MSC_VER)
return bit_cast<double>( binX & ~MANT | QBIT );
#else
return -numeric_limits<double>::quiet_NaN();
#endif
}
if( !abs( binY ) ) [[unlikely]] // y == 0
{
feraiseexcept( FE_INVALID );
#if defined(_MSC_VER)
return numeric_limits<double>::quiet_NaN();
#else
return -numeric_limits<double>::quiet_NaN();
#endif
}
if( !abs( binX ) || isInf( binY ) ) [[unlikely]] // x == 0 || y == Inf
return x;
auto exp = []( uint64_t b ) -> int { return b >> 52 & 0x7FF; };
int
expX = exp( binX ),
expY = exp( binY );
auto mant = []( uint64_t b ) { return b & MANT; };
uint64_t
mantX = mant( binX ),
mantY = mant( binY );
static auto normalize = []( int &exp, uint64_t &mant )
{
unsigned shift = countl_zero( mant ) - 11;
mant <<= shift;
exp -= shift;
};
auto build = []( int &exp, uint64_t &mant )
{
if( exp ) [[likely]]
mant |= IMPLICIT;
else
{
exp = 1;
normalize( exp, mant );
}
};
build( expX, mantX );
build( expY, mantY );
uint64_t signX = binX & SIGN;
int expDiff;
while( (expDiff = expX - expY) > 0 )
{
unsigned bits = expDiff <= 11 ? expDiff : 11;
if( !(mantX = (mantX << bits) % mantY) ) [[unlikely]]
return bit_cast<double>( signX );
expX -= bits;
normalize( expX, mantX );
}
if( !expDiff && mantX >= mantY ) [[unlikely]]
if( (mantX -= mantY) ) [[likely]]
normalize( expX, mantX );
else
return bit_cast<double>( signX );
if( expX <= 0 ) [[unlikely]]
{
assert(expX >= -51);
mantX = mantX >> (unsigned)(-expX + 1);
expX = 0;
}
return bit_cast<double>( signX | (uint64_t)expX << 52 | mantX & MANT );
}
It's about six times faster than the glibc 2.31 solution in my
benchmark. The returned NaNs and the raised exceptions are MSVC-
and glibc-compatible.
double myFmod(double x, double y)
{
double div = x / y;
return y * (div - std::round(div));
}
That is why I don't use multiplication. Did you ever ask yourself
what the meaning of 'f' in 'fma' is?
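For anyone wondering: the 'f' stands for "fused" - the product feeds the addition unrounded. A tiny illustration (my own example values; it assumes FP contraction is off, which, as noted later in the thread, is MSVC's default behaviour):
#include <cmath>
#include <cstdio>
int main()
{
    double a = 1.0 + 0x1p-52, b = 1.0 - 0x1p-52;
    std::printf( "%a\n", a * b - 1.0 );            // 0x0p+0: a*b was already rounded to 1
    std::printf( "%a\n", std::fma( a, b, -1.0 ) ); // -0x1p-104: exact product, one rounding
}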
On Mon, 24 Feb 2025 11:48:08 +0100, Bonita Montero
<Bonita.Montero@gmail.com> wrote:
It's about six times faster than the glibc 2.31 solution in my
benchmark. The returned NaNs and the raised exceptions are MSVC-
and glibc-compatible.
double myFmod(double x, double y)
{
double div = x / y;
return y * (div - std::round(div));
}
On 24.02.2025 at 16:52, Michael S wrote:
That is why I don't use multiplication. Did you ever ask yourself
what the meaning of 'f' in 'fma' is?
The FMA-instructions produce the same results:
#include <iostream>
#include <random>
#include <bit>
#include <cmath>
#include <iomanip>
#include <intrin.h>
using namespace std;
int main()
{
auto fma = []( double a, double b, double c )
{
__m128d mA, mB, mC;
mA.m128d_f64[0] = a;
mB.m128d_f64[0] = b;
mC.m128d_f64[0] = c;
return _mm_fmadd_pd( mA, mB, mC ).m128d_f64[0];
};
mt19937_64 mt;
uniform_int_distribution<uint64_t> finites( 1, 0x7FEFFFFFFFFFFFFFu );
auto rnd = [&]() -> double { return
bit_cast<double>( finites( mt ) ); };
ptrdiff_t nEQs = 0;
for( ptrdiff_t r = 0; r != 1'000'000; ++r )
{
double
a = rnd(), b = rnd(), c = rnd(),
rA = fma( a, b, c ),
rB = a * b + c;
nEQs = rA != rB;
}
cout << hexfloat << nEQs / 1.0e6 << endl;
}
On Tue, 25 Feb 2025 09:09:21 +0100
Bonita Montero <Bonita.Montero@gmail.com> wrote:
On 24.02.2025 at 16:52, Michael S wrote:
That is why I don't use multiplication. Did you ever ask yourself
what the meaning of 'f' in 'fma' is?
The FMA-instructions produce the same results:[snip]
GIGO.
Do a proper test then you'd get a proper answer.
On 25.02.2025 at 16:26, Michael S wrote:
On Tue, 25 Feb 2025 09:09:21 +0100
Bonita Montero <Bonita.Montero@gmail.com> wrote:
On 24.02.2025 at 16:52, Michael S wrote:
That is why I don't use multiplication. Did you ever ask yourself
what the meaning of 'f' in 'fma' is?
The FMA-instructions produce the same results:[snip]
GIGO.
Do a proper test then you'd get a proper answer.
The test is proper with MSVC since MSVC doesn't replace the
"a * b + c" operation with an FMA operation.
With your code it isn't guaranteed that the CPU-specific FMA
operations are used.
I'm using the SSE FMA operation explicitly and I'm using it for a
million random finite double values.
Don't invent your own fma(). Use the one provided by the library.
Then MSVC will do what it is prescribed to do by the standard.
Originally, I didn't even try to investigate what garbage exactly you
are feeding to your test. Now I took a look. It seems that you are
doing something fundamentally stupid, like making all fma inputs positive.
On 25.02.2025 at 18:17, Michael S wrote:
Don't invent your own fma(). Use the one provided by the library.
Then MSVC will do what it is prescribed to do by the standard.
I want to be sure that I'm using the SSE FMA operation and not a
conventional substitute of two instructions.
Originally, I didn't even try to investigate what garbage exactly
you are feeding to your test. Now I took a look. It seems that you
are doing something fundamentally stupid, like making all fma inputs
positive.
I extended the test to incorporate all possible double
bit representations (taken from mt()) - with no difference.
On 24.02.2025 at 23:21, Mr Flibble wrote:
double myFmod(double x, double y)
{
double div = x / y;
return y * (div - std::round(div));
}
Doesn't work, not only for the reasons already mentioned.
On Tue, 25 Feb 2025 07:37:23 +0100, Bonita Montero
<Bonita.Montero@gmail.com> wrote:
On 24.02.2025 at 23:21, Mr Flibble wrote:
double myFmod(double x, double y)
{
double div = x / y;
return y * (div - std::round(div));
}
Doesn't work, not only for the reasons already mentioned.
double myFmod(double x, double y)
{
double div = x / y;
return y * (div - std::trunc(div));
}
/Flibble
double myFmod(double x, double y)
{
double div = x / y;
return y * (div - std::trunc(div));
}
SSEn has no FMA operations.
Here is an example of the program that prints 250508 on Intel Haswell
CPU, but prints 0 on Intel Ivy Bridge.
Compiled as 'cl -O1 -W4 -MD fma_tst0.c'.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
static
unsigned long long rnd(void)
{
unsigned long long x = 0;
for (int i = 0; i < 5; ++i)
x = (x << 15) + (rand() & 0x7FFF);
return x;
}
int main(void)
{
srand(1);
int n = 0;
for (int i = 0; i < 1000000; ++i) {
double x = rnd() * 0x1p-64;
double y = rnd() * 0x1p-64;
double z = rnd() * 0x1p-114;
double r1 = x*y + z;
double r2 = fma(x, y, z);
n += r1 != r2;
}
printf("%d\n", n);
return 0;
}
It is certainly worth a bug report, but I am afraid that Microsoft will
do nothing to fix it, likely claiming that they don't care about old
hardware.
On Wed, 26 Feb 2025 00:36:19 +0200
Michael S <already5chosen@yahoo.com> wibbled:
Here is an example of the program that prints 250508 on Intel Haswell
CPU, but prints 0 on Intel Ivy Bridge.
Compiled as 'cl -O1 -W4 -MD fma_tst0.c'.[snip]
It is certainly worth a bug report, but I am afraid that Microsoft
will do nothing to fix it, likely claiming that they don't care
about old hardware.
Just FYI - it also returns 0 when compiled by Clang on an ARM Mac.
On Wed, 26 Feb 2025 08:16:01 -0000 (UTC)
Muttley@DastardlyHQ.org wrote:
I played a little on godbolt and it seems that the bug is relatively
new. clang 13 still generates correct code. clang 14 does not. I.e.
slightly less than 3 years.
I wanted to optimize fmod to be a bit faster. This is my C++20 solution.[snip]
It's about six times faster than the glibc 2.31 solution in my
benchmark. The returned NaNs and the raised exceptions are MSVC-
and glibc-compatible.
double myFmod(double x, double y)
{
return x / y - std::trunc(x / y) * y;
}
double myFmod(double x, double y)
{
return x / y - std::trunc(x / y) * y;
}
/Flibble
On Wed, 26 Feb 2025 14:27:21 +0200
Michael S <already5chosen@yahoo.com> wibbled:
On Wed, 26 Feb 2025 08:16:01 -0000 (UTC)
Muttley@DastardlyHQ.org wrote:
I played a little on godbolt and it seems that the bug is relatively
new. clang 13 still generates correct code. clang 14 does not, i.e.
slightly less than 3 years.
I don't think they've noticed:
R8603$ cc --version
Apple clang version 16.0.0 (clang-1600.0.26.6)
On Thu, 27 Feb 2025 21:16:50 +0000
Mr Flibble <leigh@i42.co.uk> wrote:
double myFmod(double x, double y)
{
return x / y - std::trunc(x / y) * y;
}
/Flibble
Nonsense.
The one below is not nonsense, but still very bad.
double myFmod(double x, double y)
{
return x - trunc(x / y) * y;
}
Even ignoring potential overflow during division, this method is
very imprecise.
(1e3/9 - trunc(1e3/9))*9 = 1.000000000000028
(1e6/9 - trunc(1e6/9))*9 = 0.999999999985448
(1e9/9 - trunc(1e9/9))*9 = 0.999999940395355
(1e12/9 - trunc(1e12/9))*9 = 1.000030517578125
(1e15/9 - trunc(1e15/9))*9 = 0.984375
OTOH
1e3/9 - trunc(1e3/9)*9 = 1
1e6/9 - trunc(1e6/9)*9 = 1
1e9/9 - trunc(1e9/9)*9 = 1
1e12/9 - trunc(1e12/9)*9 = 1
1e15/9 - trunc(1e15/9)*9 = 1
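A short program (my own, not from the post) that reproduces the numbers above:
#include <cmath>
#include <cstdio>
int main()
{
    const double xs[] = { 1e3, 1e6, 1e9, 1e12, 1e15 };
    for( double x : xs )
    {
        double bad  = (x / 9 - std::trunc( x / 9 )) * 9; // multiply last: rounding error shows
        double good = x - std::trunc( x / 9 ) * 9;       // subtract the multiple from x: exact here
        std::printf( "%g: %.16g vs %g\n", x, bad, good );
    }
}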
double myFmod(double x, double y)
{
return x / y - std::trunc(x / y) * y;
}
I wanted to optimize fmod to be a bit faster. This is my C++20 solution.[snip]
It's about six times faster than the glibc 2.31 solution in my
benchmark. The returned NaNs and the raised exceptions are MSVC-
and glibc-compatible.
double my_fmod(double x, double y)
{
if (y == 0.0)
return x / y;
return x - std::trunc(x / y) * y;
}
On 28.02.2025 at 19:30, Mr Flibble wrote:
double my_fmod(double x, double y)
{
if (y == 0.0)
return x / y;
return x - std::trunc(x / y) * y;
}
This still sucks. Try it with this test:
#include <iostream>
#include <cmath>
#include <random>
#include <bit>
using namespace std;
double trivialFmod( double a, double b );
int main()
{
mt19937_64 mt;
uniform_int_distribution<uint64_t> gen( 1, 0x7FEFFFFFFFFFFFFFu );
size_t imprecise = 0, outOfRange = 0;
for( size_t r = 1'000'000; r; --r )
{
double
a = bit_cast<double>( gen( mt ) ),
b = bit_cast<double>( gen( mt ) ),
fm = fmod( a, b ),
tfm = trivialFmod( a, b );
imprecise += fm != tfm;
outOfRange += tfm >= b;
}
auto print = []( char const *what, size_t n ) { cout << what <<
(ptrdiff_t)n / (1.0e6 / 100) << "%" << endl; };
print( "imprecise: ", imprecise );
print( "out of range: ", outOfRange );
}
double trivialFmod( double a, double b )
{
return a - trunc( a / b ) * b;
}
IEEE 754 does not define how std::fmod should behave, only
std::remainder.
On 28.02.2025 at 20:47, Mr Flibble wrote:
IEEE 754 does not define how std::fmod should behave, only
std::remainder.
There's only one way to do it for finite numbers.
On Fri, 28 Feb 2025 20:49:38 +0100, Bonita Montero
<Bonita.Montero@gmail.com> wrote:
On 28.02.2025 at 20:47, Mr Flibble wrote:
IEEE 754 does not define how std::fmod should behave, only
std::remainder.
There's only one way to do it for finite numbers.
Not true as there is a fixed mantissa size, so finite precision,
making your test case useless if x is sufficiently large.
On Wed, 26 Feb 2025 14:39:34 -0000 (UTC)
Muttley@DastardlyHQ.org wrote:
On Wed, 26 Feb 2025 14:27:21 +0200
Michael S <already5chosen@yahoo.com> wibbled:
On Wed, 26 Feb 2025 08:16:01 -0000 (UTC)
Muttley@DastardlyHQ.org wrote:
I played a little on godbolt and it seems that the bug is relatively
new. clang 13 still generates correct code. clang 14 does not. I.e.
slightly less than 3 years.
I don't think they've noticed:
R8603$ cc --version
Apple clang version 16.0.0 (clang-1600.0.26.6)
More googling/stack-overflowing.
clang/LLVM people think that it is a feature rather than a bug. They
claim that the standard allows fusing. I think that they are wrong, but
I didn't read the respective part of the standard.
The behavior can be turned back to the clang 13 way with -ffp-contract=off.
Or with the pragma
#pragma STDC FP_CONTRACT OFF
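A minimal example of applying that pragma (my own; whether it is honoured in C++ mode is compiler-specific, the -ffp-contract=off switch is the reliable command-line equivalent):
#include <cmath>
#include <cstdio>
#pragma STDC FP_CONTRACT OFF
int main()
{
    double a = 1.0 / 3.0, b = 3.0, c = -1.0;
    std::printf( "%a\n", a * b + c );           // two roundings when not contracted: 0x0p+0
    std::printf( "%a\n", std::fma( a, b, c ) ); // always a single rounding: -0x1p-54
}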
On 01.03.2025 at 15:37, Mr Flibble wrote:
On Fri, 28 Feb 2025 20:49:38 +0100, Bonita Montero
<Bonita.Montero@gmail.com> wrote:
On 28.02.2025 at 20:47, Mr Flibble wrote:
IEEE 754 does not define how std::fmod should behave, only
std::remainder.
There's only one way to do it for finite numbers.
Not true as there is a fixed mantissa size, so finite precision,
making your test case useless if x is sufficiently large.
The way to do a modulo calculation for every floating-point value
except inf or nan (finite numbers) is always the same for all
implementations. And correct implementations are always without
precision loss, i.e. exact: if |x| >= |y|, both x and y are integer
multiples of y's ulp, so the exact remainder is too, and since its
magnitude is below |y| it fits in a double without rounding.
As I've shown, solutions like yours are only 50% exact and in 2% of
cases they generate out-of-range results.
Thus you are asserting that all finite numbers have an exact IEEE 754 floating point representation ...
On 01.03.2025 at 18:11, Mr Flibble wrote:
Thus you are asserting that all finite numbers have an exact IEEE 754
floating point representation ...
But the results of floating-point operations usually have precision loss;
except fmod(), which is always correct - when implemented properly. You
didn't implement it correctly.
On Sat, 1 Mar 2025 21:02:32 +0100, Bonita Montero
<Bonita.Montero@gmail.com> wrote:
On 01.03.2025 at 18:11, Mr Flibble wrote:
Thus you are asserting that all finite numbers have an exact IEEE 754
floating point representation ...
But the result of floating-point operations usually have precision-loss;
except fmod(), which is always correct - when implemented properly. You
didn't implement it correctly.
False, see my other post.
/Flibble
On 01.03.2025 at 21:32, Mr Flibble wrote:
On Sat, 1 Mar 2025 21:02:32 +0100, Bonita Montero
<Bonita.Montero@gmail.com> wrote:
On 01.03.2025 at 18:11, Mr Flibble wrote:
Thus you are asserting that all finite numbers have an exact IEEE 754
floating point representation ...
But the results of floating-point operations usually have precision loss;
except fmod(), which is always correct - when implemented properly. You
didn't implement it correctly.
False, see my other post.
/Flibble
This:
#include <iostream>
#include <cmath>
#include <random>
#include <bit>
using namespace std;
double my_fmod( double x, double y );
int main()
{
mt19937_64 mt;
uniform_int_distribution<uint64_t> gen( 1, 0x7FEFFFFFFFFFFFFFu );
size_t imprecise = 0, outOfRange = 0;
for( size_t r = 1'000'000; r; --r )
{
double
a = bit_cast<double>( gen( mt ) ),
b = bit_cast<double>( gen( mt ) ),
fm = fmod( a, b ),
tfm = my_fmod( a, b );
imprecise += fm != tfm;
outOfRange += tfm >= b;
}
auto print = []( char const *what, size_t n ) { cout << what <<
(ptrdiff_t)n / (1.0e6 / 100) << "%" << endl; };
print( "imprecise: ", imprecise );
print( "out of range: ", outOfRange );
}
double my_fmod( double x, double y )
{
if( y == 0.0 )
return x / y;
return x - std::trunc( x / y ) * y;
}
... prints this ...
imprecise: 49.9096%
out of range: 2.0039%
So your solution is unusable.
I wanted to optimize fmod to be a bit faster. This is my C++20 solution.[snip]
It's about six times faster than the glibc 2.31 solution in my
benchmark. The returned NaNs and the raised exceptions are MSVC-
and glibc-compatible.
Thus you are asserting that all finite numbers have an exact IEEE 754 floating point representation ...
I wanted to optimize fmod to be a bit faster. This is my C++20
solution.
It's about six times faster than the glibc 2.31 solution in my
benchmark. The returned NaNs and the raised exceptions are MSVC-
and glibc-compatible.
This is my code, improved by the _udiv128 intrinsic of MSVC which
provides a 128 / 64 division. With that my algorithm becomes nearly
three times as fast as before. I'll provide a g++ / clang++ compatible
version with inline assembly later.
On Sun, 2 Mar 2025 17:10:37 +0100, Bonita Montero
<Bonita.Montero@gmail.com> wrote:
This is my code, improved by the _udiv128 intrinsic of MSVC which
provides a 128 / 64 division. With that my algorithm becomes nearly
three times as fast as before. I'll provide a g++ / clang++ compatible
version with inline assembly later.
Still slow tho.
On Mon, 24 Feb 2025 11:48:08 +0100
Bonita Montero <Bonita.Montero@gmail.com> wrote:
I wanted to optimize fmod to be a bit faster. This is my C++20
solution.
How about that?
Pay attention, it's C rather than C++. So 5 times shorter :-)
It's not the fastest for big x/y ratios, but rather simple and not
*too* slow. At least as long as hardware supports FMA.
For small x/y ratios it should be pretty close to best possible.
Still, it is better than Bonita's in all possible circumstances.
With MSVC and the fairer random numbers I chose I'm 2.5 times
faster.
On Sun, 2 Mar 2025 17:26:18 +0100
Bonita Montero <Bonita.Montero@gmail.com> wrote:
With MSVC and the fairer random numbers I chose I'm 2.5 times
faster.
I don't trust your benchmarking skills.
On 02.03.2025 at 18:07, Michael S wrote:
On Sun, 2 Mar 2025 17:26:18 +0100
Bonita Montero <Bonita.Montero@gmail.com> wrote:
With MSVC and the fairer random numbers I chose I'm 2.5 times
faster.
I don't trust your benchmarking skills.
I don't prefer your fast-path value combinations but I chose 75% random finite combinations.
On Sun, 2 Mar 2025 17:10:37 +0100, Bonita Montero
<Bonita.Montero@gmail.com> wrote:
This is my code, improved by the _udiv128 intrinsic of MSVC which
provides a 128 / 64 division. With that my algorithm becomes nearly
three times as fast as before. I'll provide a g++ / clang++ compatible
version with inline assembly later.
Still slow tho.
/Flibble
The truth is that the relative speed of FP vs integer algorithms depends
on the specific CPU that one is using for measurements.
I measured on a relatively old CPU - Intel Skylake. On this CPU integer
division is very significantly slower than floating-point division.
On newer CPUs, like Intel IceLake/Tiger Lake and Alder Lake or AMD Zen
3/4/5, and even more so on the Apple M-series, the difference in speed
between floating-point and integer division is less significant, and
in a few cases integer division is even faster, so in theory Bonita's
code could be more competitive.
From Agner Fog's tables:
Arch DIVSD DIV r64
Skylake 13-14 35-88
IceLake 13-14 15
Alder Lake 14 10
Zen3 13.5 9-17
My problem is that because of Bonita's horrible coding style I am not
even trying to understand what is going on within his/her code.
On 02.03.2025 at 18:09, Bonita Montero wrote:
On 02.03.2025 at 18:07, Michael S wrote:
On Sun, 2 Mar 2025 17:26:18 +0100
Bonita Montero <Bonita.Montero@gmail.com> wrote:
With MSVC and the fairer random numbers I chose I'm 2.5 times
faster.
I don't trust your benchmarking skills.
I don't prefer your fast-path value combinations but I chose 75%
random finite combinations.
This generates the random values for me:
mt19937_64 mt;
uniform_int_distribution<uint64_t>
    genType( 0, 15 ),
    genFinite( 0x0010000000000000u, 0x7FEFFFFFFFFFFFFFu ),
    genDen( 1, 0x000FFFFFFFFFFFFFu ),
    genNaN( 0x7FF0000000000001u, 0x7FFFFFFFFFFFFFFFu );
auto get = [&]()
{
    constexpr uint64_t
        FINITE_THRESH = 4, // 75% finites
        ZERO = 3,          // 6.25% zeroes
        DENORMALS = 2,     // 6.25% denormals
        INF = 1,           // 6.25% Infs
        NAN_ = 0;          // 6.25% NaNs
    uint64_t
        sign = mt() & -numeric_limits<int64_t>::min(),
        type = genType( mt );
    if( type >= FINITE_THRESH )
        return bit_cast<double>( sign | genFinite( mt ) );
    if( type == ZERO )
        return bit_cast<double>( sign );
    if( type == DENORMALS )
        return bit_cast<double>( sign | genDen( mt ) );
    if( type == INF )
        return bit_cast<double>( sign | 0x7FF0000000000000u );
    assert(type == NAN_);
    return bit_cast<double>( sign | genNaN( mt ) );
};
Your distribution is very different from what one would expect in
real-world usage.
In real-world usage, apart from the debugging stage, there are no inf, nan
or y=zero cases. x=zero happens, but with lower probability than 6%. Denormals
also happen, but with even lower probability than x=zero.
Also, in the majority of real-world scenarios huge x/y ratios either do not
happen at all or are extremely rare.
Your coding-style is horrible.
Mine is "too beautiful" (my employer).
Mine is "too beautiful" (my employer).
It sounds like your employer agrees with me, but he expresses his
thought in a humorous style.
On 02.03.2025 at 18:52, Michael S wrote:
Your distribution is very different from what one would expect in real-world usage.
There's no real-world distribution, so I chose all finites to be
equally likely.
In real-world usage, apart from the debugging stage, there are no inf,
nan or y=zero cases. x=zero happens, but with lower probability than 6%.
Denormals also happen, but with even lower probability than x=zero.
As I said, if I chose 100% finites from the 1 to 0x7FEFFFFFFFFFFFFFu
range I'm still 2.4 times faster.
Also, in the majority of real-world scenarios huge x/y ratios either do
not happen at all or are extremely rare.
On Sun, 2 Mar 2025 18:55:22 +0100
Bonita Montero <Bonita.Montero@gmail.com> wrote:
On 02.03.2025 at 18:52, Michael S wrote:
Your distribution is very different from what one would expect in
real-world usage.
There's no real-world distribution, so I chose all finites to be
equally likely.
In real-world usage, apart from the debugging stage, there are no inf,
nan or y=zero cases. x=zero happens, but with lower probability than 6%.
Denormals also happen, but with even lower probability than x=zero.
As I said, if I chose 100% finites from the 1 to 0x7FEFFFFFFFFFFFFFu
range I'm still 2.4 times faster.
Also, in the majority of real-world scenarios huge x/y ratios either do
not happen at all or are extremely rare.
You didn't answer the second point, which is critical.
In your fully random scenario 48.7% of cases are huge x/y. That is
completely unrealistic.
I can easily improve the speed of huge x/y at the cost of less simple
code and a small slowdown of the more typical case, but I consider it
counterproductive. It seems the authors of standard libraries agree with
my judgment.
On 02.03.2025 at 19:09, Michael S wrote:
You didn't answer the second point, which is critical.
In your fully random scenario 48.7% of cases are huge x/y. That is completely unrealistic.
I can easily improve speed of huge x/y at cost of less simple code
and of small slowdown of more typical case, but I consider it counterproductive. It seems, authors of standard libraries agree
with my judgment.
And you use fesetround, which takes about 40 clock cycles on my
CPU under Linux (WSL2). Better choose _mm_getcsr() and _mm_setcsr()
for that, which directly set the FPU control word for SSE / AVX*
/ AVX-512. This is multiple times faster. For the x87-FPU you'd
have to choose different code, but the x87-FPU is totally broken
anyway.
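A bare sketch of that suggestion (my own helper name; 0x6000 sets both MXCSR rounding-control bits, i.e. round toward zero, matching the CHOP constant used in the code further below; a real build must also keep the compiler from reordering the operation across the CSR writes):
#include <xmmintrin.h>
double divTowardZero( double x, double y )
{
    unsigned int oldCsr = _mm_getcsr();
    _mm_setcsr( oldCsr | 0x6000 ); // RC = 11b: SSE/AVX operations now truncate
    double d = x / y;              // division performed with round-toward-zero
    _mm_setcsr( oldCsr );          // restore the previous rounding mode
    return d;
}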
On 02.03.2025 at 18:52, Michael S wrote:
Your distribution is very different from what one would expect in real-world usage.
There's no real-world distribution, so I chose all finites to be
equally likely.
In real-world usage, apart from the debugging stage, there are no inf,
nan or y=zero cases. x=zero happens, but with lower probability than 6%.
Denormals also happen, but with even lower probability than x=zero.
As I said if I chose 100% finites from the 1 to 0x7FEFFFFFFFFFFFFFu
range I'm still 2.4 times faster.
If it was on the fast path, I'd consider it.
But improving speed of unimportant slow path at cost of portability?
Nah.
On 02.03.2025 at 21:58, Michael S wrote:
If it was on the fast path, I'd consider it.
But improving speed of unimportant slow path at cost of portability?
Nah.
For the 75% random finites case I've shown your code becomes about
28% faster with _mm_getcsr() and _mm_setcsr().
For the x87-FPU you'd
have to choose different code, but the x87-FPU is totally broken
anyway.
x87 is not broken relative to its own specifications. It just happens
to be slightly different from the IEEE-754 specifications. Which is not
surprising considering that it predates the IEEE-754 standard by several
years.
Today there are very few reasons to still use x87 in new software.
However, back in its time x87 was an astonishingly good piece of work,
less so in performance (it was not fast, even by the standards of its
time), more so for features, precision and especially for the
consistency of its arithmetic.
On 03.03.2025 at 17:20, Michael S wrote:
x87 is not broken relative to its own specifications. It just
happens to be slightly different from the IEEE-754 specifications.
Which is not surprising considering that it predates the IEEE-754
standard by several years.
You can reduce the width of the mantissa to 53 or 24 bits, but the
exponent is always 15 bits; that's not up to any specification.
Today there are very few reasons to still use x87 in new software.
However back in it's time x87 was an astonishingly good piece of
work, less so in performance (it was not fast, even by standards of
its time) more so for features, precision and especially for
consistency of its arithmetic.
There are compiler settings which enforce consistency by storing
values with reduced precision and re-loading them to give predictable
results when you use values < long double. That's a mess.
That's up to the x87 specification, which predates IEEE-754.
On Mon, 3 Mar 2025 12:52:51 +0100
Bonita Montero <Bonita.Montero@gmail.com> wrote:
On 02.03.2025 at 21:58, Michael S wrote:
If it was on the fast path, I'd consider it.
But improving speed of unimportant slow path at cost of
portability? Nah.
For the 75% random finites case I've shown your code becomes about
28% faster with _mm_getcsr() and _mm_setcsr().
It seems that the major slowdown is specific to the combination of msys2
libraries with Zen3/4 CPUs.
I see even worse slowness of get/set rounding mode on msys2/Zen3.
The same msys-compiled binary on Intel CPUs is o.k., at least
relative to the other heavy things going on on the slow path.
On Zen3 with Microsoft's compiler/library it is also o.k.
As long as it only affects the slow path there is nothing to get
agitated about.
It seems that major slowdown is specific to combination of msys2
libraries with Zen3/4 CPU.
I see even worse slowness of get/set rounding mode on msys2/Zen3.
The same msys-compiled binary on Intel CPUs is o.k., at least
relatively to other heavy things going on on the slow path.
On Zen3 with Microsoft's compiler/library it is also o.k.
This is my code, improved by the _udiv128-intrinsic of MSVC which
provides a 128 / 64 division. With that my algorithm becomes nearly
three times as fast as before. I'll provide a g++ / clang++ compatible
version with inline-assembly later.
template<bool _32 = false>
double xMyFmod( double x, double y )
{
constexpr uint64_t
SIGN = 1ull << 63,
IMPLICIT = 1ull << 52,
MANT = IMPLICIT - 1,
QBIT = 1ull << 51;
uint64_t const
binX = bit_cast<uint64_t>( x ),
binY = bit_cast<uint64_t>( y );
static auto abs = []( uint64_t m ) { return m & ~SIGN; };
auto isNaN = []( uint64_t m ) { return abs( m ) >= 0x7FF0000000000001u; };
auto isSig = []( uint64_t m ) { return !(m & QBIT); };
if( isNaN( binX ) ) [[unlikely]] // x == NaN
#if defined(_MSC_VER)
return bit_cast<double>( isNaN( binY ) ? binY | binX & binY & QBIT : binX );
#else
{
if( isSig( binX ) || isNaN( binY ) && isSig( binY ) ) [[unlikely]]
feraiseexcept( FE_INVALID );
return bit_cast<double>( binX | QBIT );
}
#endif
auto isInf = []( uint64_t m ) { return abs( m ) == 0x7FF0000000000000u; };
if( isNaN( binY ) ) [[unlikely]] // x != NaN || y == NaN
#if defined(_MSC_VER)
{
if constexpr( _32 )
if( isInf( binX ) )
feraiseexcept( FE_INVALID );
return y;
}
#else
{
if( isSig( binY ) ) [[unlikely]]
feraiseexcept( FE_INVALID );
return bit_cast<double>( binY | QBIT );
}
#endif
if( isInf( binX ) ) // x == Inf
{
feraiseexcept( FE_INVALID );
#if defined(_MSC_VER)
return bit_cast<double>( binX & ~MANT | QBIT );
#else
return -numeric_limits<double>::quiet_NaN();
#endif
}
if( !abs( binY ) ) [[unlikely]] // y == 0
{
feraiseexcept( FE_INVALID );
#if defined(_MSC_VER)
return numeric_limits<double>::quiet_NaN();
#else
return -numeric_limits<double>::quiet_NaN();
#endif
}
if( !abs( binX ) || isInf( binY ) ) [[unlikely]] // x == 0 || y == Inf
return x;
auto exp = []( uint64_t b ) -> int { return b >> 52 & 0x7FF; };
int
expX = exp( binX ),
expY = exp( binY );
auto mant = []( uint64_t b ) { return b & MANT; };
uint64_t
mantX = mant( binX ),
mantY = mant( binY );
int headBits = 11;
static auto normalize = [&]( int &exp, uint64_t &mant )
{
unsigned shift = countl_zero( mant ) - headBits;
mant <<= shift;
exp -= shift;
};
auto build = []( int &exp, uint64_t &mant )
{
if( exp ) [[likely]]
mant |= IMPLICIT;
else
{
exp = 1;
normalize( exp, mant );
}
};
build( expX, mantX );
build( expY, mantY );
int
tailX = countr_zero( mantX ),
tailY = countr_zero( mantY ),
tailBits = tailX <= tailY ? tailX : tailY;
headBits += tailBits;
mantX >>= tailBits;
mantY >>= tailBits;
uint64_t signX = binX & SIGN;
int expDiff;
#if defined(_MSC_VER)
while( (expDiff = expX - expY) > 63 )
{
unsigned long long hi = mantX >> 1, lo = mantX << 63, remainder;
(void)_udiv128( hi, lo, mantY, &remainder );
expX -= 63;
mantX = remainder;
normalize( expX, mantX );
}
#endif
while( (expDiff = expX - expY) > 0 )
{
unsigned bits = expDiff <= headBits ? expDiff : headBits;
if( !(mantX = (mantX << bits) % mantY) ) [[unlikely]]
return bit_cast<double>( signX );
expX -= bits;
normalize( expX, mantX );
}
if( !expDiff && mantX >= mantY ) [[unlikely]]
if( (mantX -= mantY) ) [[likely]]
normalize( expX, mantX );
else
return bit_cast<double>( signX );
mantX <<= tailBits;
mantY <<= tailBits;
if( expX <= 0 ) [[unlikely]]
{
assert(expX >= -51);
mantX = mantX >> (unsigned)(-expX + 1);
expX = 0;
}
return bit_cast<double>( signX | (uint64_t)expX << 52 | mantX & MANT );
}
double myFmod( double x, double y )
{
return xMyFmod( x, y );
}
inline float myFmod( float x, float y )
{
return (float)xMyFmod<true>( (double)x, (double)y );
}
This code does not work in plenty of cases. It seems, your test vectors
have poor coverage.
Try, for example, x=1.8037919852882307, y=2.22605637008665934e-194
On 09.03.2025 at 00:31, Michael S wrote:
This code does not work in plenty of cases. It seems, your test
vectors have poor coverage.
Try, for example, x=1.8037919852882307, y=2.22605637008665934e-194
cout << hexfloat << myFmod( 1.8037919852882307, 2.22605637008665934e-194 ) << endl;
cout << hexfloat << fmod( 1.8037919852882307,
2.22605637008665934e-194 ) << endl;
Prints the same result under Linux and Windows.
This prints the same result (0.0) under Windows and Linux:
This prints the same result (0.0) under Windows and Linux:
I am no longer going to look at your code until you start posting full
files, with all includes and using directives.
On 09.03.2025 at 10:46, Michael S wrote:
This prints the same result (0.0) under Windows and Linux:
I am no longer going to look at your code until you start posting
full files, with all includes and using directives.
You could simply replace the single function I've shown.
On Sun, 9 Mar 2025 10:54:40 +0100
Bonita Montero <Bonita.Montero@gmail.com> wrote:
On 09.03.2025 at 10:46, Michael S wrote:
This prints the same result (0.0) under Windows and Linux:
I am no longer going to look at your code until you start posting
full files, with all includes and using directives.
You could simply replace the single function I've shown.
I can. I don't want to do it.
You want me to look at/test your code? You post full code.
Simple, isn't it?
This prints the same result (0.0) under Windows and Linux:
double myFmod( double x, double y )
{
constexpr uint64_t
SIGN = 1ull << 63,
IMPLICIT = 1ull << 52,
MANT = IMPLICIT - 1,
QBIT = 1ull << 51;
uint64_t const
binX = bit_cast<uint64_t>( x ),
binY = bit_cast<uint64_t>( y );
static auto abs = []( uint64_t m ) { return m & ~SIGN; };
auto isNaN = []( uint64_t m ) { return abs( m ) >= 0x7FF0000000000001u; };
auto isSig = []( uint64_t m ) { return !(m & QBIT); };
if( isNaN( binX ) ) [[unlikely]] // x == NaN
#if defined(_MSC_VER)
return bit_cast<double>( isNaN( binY ) ? binY | binX & binY & QBIT :
binX );
#else
{
if( isSig( binX ) || isNaN( binY ) && isSig( binY ) ) [[unlikely]]
feraiseexcept( FE_INVALID );
return bit_cast<double>( binX | QBIT );
}
#endif
if( isNaN( binY ) ) [[unlikely]] // x != NaN || y == NaN
#if defined(_MSC_VER)
return y;
#else
{
if( isSig( binY ) ) [[unlikely]]
feraiseexcept( FE_INVALID );
return bit_cast<double>( binY | QBIT );
}
#endif
auto isInf = []( uint64_t m ) { return abs( m ) == 0x7FF0000000000000u; };
if( isInf( binX ) ) // x == Inf
{
feraiseexcept( FE_INVALID );
#if defined(_MSC_VER)
return bit_cast<double>( binX & ~MANT | QBIT );
#else
return -numeric_limits<double>::quiet_NaN();
#endif
}
if( !abs( binY ) ) [[unlikely]] // y == 0
{
feraiseexcept( FE_INVALID );
#if defined(_MSC_VER)
return numeric_limits<double>::quiet_NaN();
#else
return -numeric_limits<double>::quiet_NaN();
#endif
}
if( !abs( binX ) || isInf( binY ) ) [[unlikely]] // x == 0 || y == Inf
return x;
auto exp = []( uint64_t b ) -> int { return b >> 52 & 0x7FF; };
int
expX = exp( binX ),
expY = exp( binY );
auto mant = []( uint64_t b ) { return b & MANT; };
uint64_t
mantX = mant( binX ),
mantY = mant( binY );
int headBits = 11;
static auto normalize = [&]( int &exp, uint64_t &mant )
{
unsigned shift = countl_zero( mant ) - headBits;
mant <<= shift;
exp -= shift;
};
auto build = []( int &exp, uint64_t &mant )
{
if( exp ) [[likely]]
mant |= IMPLICIT;
else
{
exp = 1;
normalize( exp, mant );
}
};
build( expX, mantX );
build( expY, mantY );
int
tailX = countr_zero( mantX ),
tailY = countr_zero( mantY ),
tailBits = tailX <= tailY ? tailX : tailY;
mantX >>= tailBits;
mantY >>= tailBits;
headBits += tailBits;
uint64_t signX = binX & SIGN;
int expDiff;
#if defined(_MSC_VER) && !defined(__llvm__) && defined(_M_X64)
while( (expDiff = expX - expY) > 0 )
{
unsigned bits = expDiff <= 63 ? expDiff : 63;
unsigned long long hi = mantX >> 64 - bits, lo = mantX << bits, remainder;
(void)_udiv128( hi, lo, mantY, &remainder );
if( !remainder ) [[unlikely]]
return bit_cast<double>( signX );
mantX = remainder;
expX -= bits;
normalize( expX, mantX );
}
#else
while( (expDiff = expX - expY) > 0 )
{
unsigned bits = expDiff <= headBits ? expDiff : headBits;
if( !(mantX = (mantX << bits) % mantY) ) [[unlikely]]
return bit_cast<double>( signX );
expX -= bits;
normalize( expX, mantX );
}
#endif
if( !expDiff && mantX >= mantY ) [[unlikely]]
if( (mantX -= mantY) ) [[likely]]
normalize( expX, mantX );
else
return bit_cast<double>( signX );
mantX <<= tailBits;
mantY <<= tailBits;
if( expX <= 0 ) [[unlikely]]
{
assert(expX >= -51);
mantX = mantX >> (unsigned)(-expX + 1);
expX = 0;
}
return bit_cast<double>( signX | (uint64_t)expX << 52 | mantX & MANT );
}
From the viewpoint of implementing myFmod, I think using a C-like coding
style would be better, but it all depends on what you want to achieve.
On 09.03.2025 at 11:09, Michael S wrote:
On Sun, 9 Mar 2025 10:54:40 +0100
Bonita Montero <Bonita.Montero@gmail.com> wrote:
On 09.03.2025 at 10:46, Michael S wrote:
This prints the same result (0.0) under Windows and Linux:
I am no longer going to look at your code until you start posting
full files, with all includes and using directives.
You could simply replace the single function I've shown.
I can. I don't want to do it.
You want me to look at/test your code? You post full code.
Simple, isn't it?
I've read you don't trust my tests, so use your own with myFmod.
On 09.03.2025 at 11:51, wij wrote:
From the view of implementing myFmod, I think using C-like coding
style would be better, but all depending on what you want to
achieve.
A C coding style would result in about two times the code.
So far all we have seen from you is 2-3 times longer than C code (real C,
not C-style C++) ...
On 09.03.2025 at 13:23, Michael S wrote:
So far all we have seen from you is 2-3 times longer than C code
(real C, not C-style C++) ...
Not true, since I save a lot of redundant code with [&]-lambdas.
On Sun, 9 Mar 2025 11:21:55 +0100
Bonita Montero <Bonita.Montero@gmail.com> wrote:
On 09.03.2025 at 11:09, Michael S wrote:
On Sun, 9 Mar 2025 10:54:40 +0100
Bonita Montero <Bonita.Montero@gmail.com> wrote:
On 09.03.2025 at 10:46, Michael S wrote:
This prints the same result (0.0) under Windows and Linux:
I am no longer going to look at your code until you start posting
full files, with all includes and using directives.
You could simply replace the single function I've shown.
I can. I don't want to do it.
You want me to look at/test your code? You post full code.
Simple, isn't it?
I've read you don't trust my tests, so use your own with myFmod.
ok. I was too curios :(
This version produces correct results both when compiled under MSVC and
when compiled with other compilers. It is a little faster too.
With MSVC on an old Intel CPU it is only 2.5 times slower than the
standard library in the relevant range of x/y. The previous version was
3.4 times slower.
With gcc and clang it is still more than 6 times slower than the standard
library.
The coding style is now less insane.
Measurements in nsec.
First result - Intel Skylake at 4.25 GHz
Second result - AMD Zen3 at 3.7 GHz
abs(x/y) in range that matters [0.5:2**53]:
Standard MSVC Library - 11.1 10.4
Standard gnu Library - 5.4 10.7
Yours (MSVc) - 27.6 11.5
Yours (gcc) - 36.4 23.7
Yours (clang) - 37.4 24.3
abs(x/y) in full range [2**-2090:2**2090]:
Standard MSVC Library - 109.4 153.5
Standard glib Library - 102.3 155.5
Yours (MSVc) - 134.9 52.6
Yours (gcc) - 284.7 151.8
Yours (clang) - 285.2 156.5
So it is slow ergo a pointless alternative to what we already have.
On 09.03.2025 at 16:26, Mr Flibble wrote:
So it is slow ergo a pointless alternative to what we already have.
glibc does it in nearly the same way I do it, because the FMA solution
isn't portable.
If fma( a, b, c ) is substituted with a * b + c
because there's no proper CPU instruction, the whole approach doesn't
work.
And with support for _udiv128 my solution has about the same
performance as Michael's solution with clang++ 18.1.7.
On 09.03.2025 at 13:09, Michael S wrote:
Measurements in nsec.
First result - Intel Skylake at 4.25 GHz
Second result - AMD Zen3 at 3.7 GHz
abs(x/y) in range that matters [0.5:2**53]:
Standard MSVC Library - 11.1 10.4
Standard gnu Library - 5.4 10.7
Yours (MSVc) - 27.6 11.5
Yours (gcc) - 36.4 23.7
Yours (clang) - 37.4 24.3
abs(x/y) in full range [2**-2090:2**2090]:
Standard MSVC Library - 109.4 153.5
Standard glib Library - 102.3 155.5
Yours (MSVc) - 134.9 52.6
Yours (gcc) - 284.7 151.8
Yours (clang) - 285.2 156.5
With MSVC and an arbitrary combination of finite x and y on my
Zen4-machine:
your fmod: 77.1214
my: 38.4486
With MSVC and an arbitrary combination of finite x with exponents
ranging from 0x3FF to 0x433 (close exponents) on my Zen4-machine:
your fmod: 23.6423
my: 9.79146
This is a nearly proper implementation of your idea with
FMA-intrinsics and SSE/AVX control register access:
double fmody( double x, double y )
{
    if( isnan( x ) ) [[unlikely]]
        return x;
    if( isnan( y ) ) [[unlikely]]
        return y;
    if( isinf( x ) || !y ) [[unlikely]]
    {
        feraiseexcept( FE_INVALID );
        return numeric_limits<double>::quiet_NaN();
    }
    if( !x || isinf( y ) ) [[unlikely]]
        return x;
    uint64_t sign = bit_cast<uint64_t>( x ) & numeric_limits<int64_t>::min();
    x = abs( x );
    y = abs( y );
    int oldCsr = _mm_getcsr();
    constexpr int CHOP = 0x6000;
    _mm_setcsr( oldCsr | CHOP );
    constexpr uint64_t
        EXP = -(1ll << 52),
        MANT = ~EXP;
    uint64_t binY = bit_cast<uint64_t>( y );
    int64_t expY = binY & EXP;
    if( !expY ) [[unlikely]]
        expY = (uint64_t)(0 - (countl_zero( binY & MANT ) - 12)) << 52;
    while( x >= y )
    {
        uint64_t yExpAdd = 0;
        double div = x / y;
        if( div < 0x1.FFFFFFFFFFFFFp+1023 ) [[likely]]
            div = xtrunc( div );
        else
        {
            uint64_t
                binX = bit_cast<uint64_t>( x ),
                newExp = expY + (54ull << 52);
            yExpAdd = (binX & EXP) - newExp;
            div = xtrunc( bit_cast<double>( newExp | binX & MANT ) / y );
        }
        __m128d mult1, mult2, add;
#if defined(_MSC_VER)
        mult1.m128d_f64[0] = div;
        mult2.m128d_f64[0] = -bit_cast<double>( binY + yExpAdd );
        add.m128d_f64[0] = x;
        x = _mm_fmadd_sd( mult1, mult2, add ).m128d_f64[0];
#else
        mult1[0] = div;
        mult2[0] = -bit_cast<double>( binY + yExpAdd );
        add[0] = x;
        x = _mm_fmadd_sd( mult1, mult2, add )[0];
#endif
        if( !x ) [[unlikely]]
            return bit_cast<double>( sign );
    }
    _mm_setcsr( oldCsr );
    return bit_cast<double>( sign | bit_cast<uint64_t>( x ) );
}
The only thing that doesn't work currently is the support for denormal values.
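For reference, a minimal check harness one could drop next to fmody
(my own sketch, not part of the posted code; it assumes fmody is defined
in the same translation unit):

#include <cmath>
#include <cstdio>
#include <random>

double fmody( double x, double y );   // the function posted above

int main()
{
    std::mt19937_64 gen( 42 );
    std::uniform_real_distribution<double> dist( -1e300, 1e300 );
    long mismatches = 0;
    for( long i = 0; i != 1000000; ++i )
    {
        double x = dist( gen ), y = dist( gen );
        double a = fmody( x, y ), b = std::fmod( x, y );
        // NaN == NaN is false, so treat two NaNs as a match
        if( a != b && !(std::isnan( a ) && std::isnan( b )) )
            ++mismatches;
    }
    std::printf( "mismatches: %ld\n", mismatches );
}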
In absence of FMA hardware it is expected to be rather slow, but still
has to produce correct results.
It would be interesting to test if glibc is better in that regard.
I readily admitted that from a practical perspective all 3 solutions
that I posted here (one of which is incorrect) are pointless.
Still, what you say about relative speed is true only on newer CPUs and
only when compiled with MSVC. ...
I already said that I don't approve of non-portable constructs like
_mm_getcsr()/_mm_setcsr() except when they help important cases and
help A LOT. Neither applies here.
I've recently posted a comparison of your solution against mine on
a Zen4 CPU with MSVC. Your solution is equally performant with close
exponents (0x3FF to 0x433) if I compile it under WSL2 with g++-12.
The problem with MSVC is that the trunc() function is extremely slow.
In my implementation of your idea (x86 FMA) I use my own function
xtrunc, which makes the code twice as fast.
My solution is much simpler than yours since there are no separate
fast and slow paths. For close exponents the performance is about the
same as your solution.
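xtrunc itself was never posted in the thread, so the following is only a
guess at what such a helper could look like - a branch-free SSE4.1
truncation that avoids MSVC's library call for trunc() (the name
xtrunc_guess and the whole body are my assumption, not Bonita's code):

#include <smmintrin.h>   // SSE4.1

static inline double xtrunc_guess( double v )
{
    __m128d x = _mm_set_sd( v );
    // round toward zero, suppress the inexact exception
    x = _mm_round_sd( x, x, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC );
    return _mm_cvtsd_f64( x );
}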
I already said that I don't approve of non-portable constructs like
_mm_getcsr()/_mm_setcsr() except when they help important cases and
help A LOT. Neither applies here.
Am 09.03.2025 um 19:04 schrieb Michael S:
I already said that I don't approve of non-portable constructs like
_mm_getcsr()/_mm_setcsr() except when they help important cases and
help A LOT. Neither applies here.
I dropped getting and setting the MXCSR register to set the rounding
mode. Now I set the rounding mode directly with the intrinsics
_mm_div_round_sd and _mm_fmadd_round_sd. This more manual code does
help "A LOT", i.e. the solution is about two-thirds faster than your
initial solution with clang++-18 under Linux.
But there's still a problem with the denormals.
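The exact code wasn't posted, but a minimal sketch of what "set the
rounding mode directly in the intrinsic" can look like is below; it
assumes AVX-512F (the scalar _*_round_* intrinsics are only encodable
with EVEX), and reduce_step is a name I made up:

#include <immintrin.h>

// one reduction step x -> x - trunc(x/y) * y, everything rounded toward
// zero, without touching MXCSR
static inline double reduce_step( double x, double y )
{
    constexpr int RZ = _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
    __m128d vx = _mm_set_sd( x ), vy = _mm_set_sd( y );
    __m128d q  = _mm_div_round_sd( vx, vy, RZ );  // x / y, rounded toward zero
    q = _mm_round_sd( q, q, RZ );                 // truncate to an integer value
    return _mm_cvtsd_f64(                         // x + (-y) * q, one rounding
        _mm_fmadd_round_sd( q, _mm_set_sd( -y ), vx, RZ ) );
}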
I can't check your claim about speed, because the code does not compile.
The compiler has no idea WTF xtrunc is. But considering that so far all
your claims about speed were false, I can safely assume that this one
is false as well.
On y or on x?
BTW, for the last 20-25 years IEEE-754 has preferred to call binary
floating-point numbers in the range (0:DBL_MIN) 'subnormal' rather than
'denormal'. I'd guess that it is because the term 'denormal' has a wider
meaning.
I can't check your claim about speed, because the code does not compile.
I tried to use an unsigned __int128 / uint64_t division and I expected
that the compiler calls a library function which does the subtract and
shift manually. But g++ as well as clang++ handle this as a 128 : 64
division somehow.
And now my original solution is about 23% faster than your solution
for close exponents (exponent difference <= 53), and for arbitrary
exponent differences your solution is about 3% faster.
double fmodO( double x, double y )
{
    constexpr uint64_t
        SIGN = 1ull << 63,
        IMPLICIT = 1ull << 52,
        MANT = IMPLICIT - 1,
        QBIT = 1ull << 51;
    uint64_t const
        binX = bit_cast<uint64_t>( x ),
        binY = bit_cast<uint64_t>( y );
    static auto abs = []( uint64_t m ) { return m & ~SIGN; };
    auto isNaN = []( uint64_t m ) { return abs( m ) >= 0x7FF0000000000001u; };
    auto isSig = []( uint64_t m ) { return !(m & QBIT); };
    if( isNaN( binX ) ) [[unlikely]] // x == NaN
#if defined(_MSC_VER)
        return bit_cast<double>( isNaN( binY ) ? binY | binX & binY & QBIT : binX );
#else
    {
        if( isSig( binX ) || isNaN( binY ) && isSig( binY ) ) [[unlikely]]
            feraiseexcept( FE_INVALID );
        return bit_cast<double>( binX | QBIT );
    }
#endif
    if( isNaN( binY ) ) [[unlikely]] // x != NaN || y == NaN
#if defined(_MSC_VER)
        return y;
#else
    {
        if( isSig( binY ) ) [[unlikely]]
            feraiseexcept( FE_INVALID );
        return bit_cast<double>( binY | QBIT );
    }
#endif
    auto isInf = []( uint64_t m ) { return abs( m ) == 0x7FF0000000000000u; };
    if( isInf( binX ) ) // x == Inf
    {
        feraiseexcept( FE_INVALID );
#if defined(_MSC_VER)
        return bit_cast<double>( binX & ~MANT | QBIT );
#else
        return -numeric_limits<double>::quiet_NaN();
#endif
    }
    if( !abs( binY ) ) [[unlikely]] // y == 0
    {
        feraiseexcept( FE_INVALID );
#if defined(_MSC_VER)
        return numeric_limits<double>::quiet_NaN();
#else
        return -numeric_limits<double>::quiet_NaN();
#endif
    }
    if( !abs( binX ) || isInf( binY ) ) [[unlikely]] // x == 0 || y == Inf
        return x;
    auto exp = []( uint64_t b ) -> int { return b >> 52 & 0x7FF; };
    int
        expX = exp( binX ),
        expY = exp( binY );
    auto mant = []( uint64_t b ) { return b & MANT; };
    uint64_t
        mantX = mant( binX ),
        mantY = mant( binY );
    int headBits = 11;
    static auto normalize = [&]( int &exp, uint64_t &mant )
    {
        unsigned shift = countl_zero( mant ) - headBits;
        mant <<= shift;
        exp -= shift;
    };
    auto build = []( int &exp, uint64_t &mant )
    {
        if( exp ) [[likely]]
            mant |= IMPLICIT;
        else
        {
            exp = 1;
            normalize( exp, mant );
        }
    };
    build( expX, mantX );
    build( expY, mantY );
    int
        tailX = countr_zero( mantX ),
        tailY = countr_zero( mantY ),
        tailBits = tailX <= tailY ? tailX : tailY;
    mantX >>= tailBits;
    mantY >>= tailBits;
    headBits += tailBits;
    uint64_t signX = binX & SIGN;
    int expDiff;
#if defined(_MSC_VER) && !defined(__llvm__) && defined(_M_X64)
    while( (expDiff = expX - expY) > 0 )
    {
        unsigned bits = expDiff <= 63 ? expDiff : 63;
        unsigned long long hi = mantX >> 64 - bits, lo = mantX << bits, remainder;
        (void)_udiv128( hi, lo, mantY, &remainder );
        if( !remainder ) [[unlikely]]
            return bit_cast<double>( signX );
        mantX = remainder;
        expX -= bits;
        normalize( expX, mantX );
    }
#elif defined(__GNUC__) || defined(__clang__)
    while( (expDiff = expX - expY) > 0 )
    {
        unsigned bits = expDiff <= 63 ? expDiff : 63;
        unsigned __int128 dividend = (unsigned __int128)mantX << bits;
        mantX = (uint64_t)(dividend % mantY);
        if( !mantX ) [[unlikely]]
            return bit_cast<double>( signX );
        expX -= bits;
        normalize( expX, mantX );
    }
#else
    while( (expDiff = expX - expY) > 0 )
    {
        unsigned bits = expDiff <= headBits ? expDiff : headBits;
        if( !(mantX = (mantX << bits) % mantY) ) [[unlikely]]
            return bit_cast<double>( signX );
        expX -= bits;
        normalize( expX, mantX );
    }
#endif
    if( !expDiff && mantX >= mantY ) [[unlikely]]
        if( (mantX -= mantY) ) [[likely]]
            normalize( expX, mantX );
        else
            return bit_cast<double>( signX );
    mantX <<= tailBits;
    mantY <<= tailBits;
    if( expX <= 0 ) [[unlikely]]
    {
        assert(expX >= -51);
        mantX = mantX >> (unsigned)(-expX + 1);
        expX = 0;
    }
    return bit_cast<double>( signX | (uint64_t)expX << 52 | mantX & MANT );
}
It should be an obvious thing to anybody who cared to think for 15
seconds.
A library or compiler starts with hi1 = rem(0:x_hi, y). Now hi1 is
guaranteed to be smaller than y, so it's safe to do rem(hi1:x_lo, y).
It is not *very* slow, but still there are 2 dependent division
operations.
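In portable-ish C++ those two steps look roughly like the sketch below
(my own illustration, not code from the thread; it assumes the divisor
fits in 64 bits and uses the gcc/clang __int128 extension - the second
remainder typically lands in __umodti3, which special-cases the situation
where the high half is already smaller than the divisor):

#include <cstdint>

uint64_t rem128by64( uint64_t x_hi, uint64_t x_lo, uint64_t y )
{
    uint64_t hi1 = x_hi % y;                        // rem(0:x_hi, y), so hi1 < y
    unsigned __int128 rest =
        ((unsigned __int128)hi1 << 64) | x_lo;      // the 128-bit value hi1:x_lo
    return (uint64_t)(rest % y);                    // rem(hi1:x_lo, y)
}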
Measurements in nsec.
First result - Intel Skylake at 4.25 GHz
Second result - AMD Zen3 at 3.7 GHz
abs(x/y) in range that matters [0.5:2**53]:
Standard MSVC Library - 11.1 10.4
Standard gnu Library - 5.4 10.7
Yours (MSVC) - 27.6 11.5
Yours (gcc) - 36.4 23.7
Yours (clang) - 37.4 24.3
my (MSVC) - 10.7 11.3
my (gcc) - 7.7 7.6
my (clang) - 6.3 7.5
Your last (MSVC) - 27.6 11.6
Your last (gcc) - 33.8 28.6
Your last (clang) - 32.6 26.9
abs(x/y) in full range [2**-2090:2**2090]:
Standard MSVC Library - 109.4 153.5
Standard glibc Library - 102.3 155.5
Yours (MSVC) - 134.9 52.6
Yours (gcc) - 284.7 151.8
Yours (clang) - 285.2 156.5
my (MSVC) - 62.1 61.1
my (gcc) - 60.8 59.1
my (clang) - 59.9 59.3
Your last (MSVC) - 135.0 52.5
Your last (gcc) - 172.1 137.3
Your last (clang) - 167.7 126.3
Your code would be much, much cleaner and more readable if you replaced
the lambdas with proper functions. ...
Am 10.03.2025 um 13:26 schrieb Michael S:
It should be an obvious thing to anybody who cared to think for 15
seconds.
I was wrong and it absolutely isn't obvious. The compiler calls the
glibc function __umodti3, which has a shortcut for results that fit
into 64 bits. Although there's an additional call on Linux, the code
with clang++-18 is still a bit faster than my Windows solution with
the _udiv128 intrinsic; that's really surprising.
A library-or-compiler starts with hi1=rem(0:x_hi, y). Now hi1 is
guaranteed to be smaller than y, so it's safe to do rem(hi1:x_lo,
y). It is not *very* slow, but still there are 2 dependent division operations.
Both parameters are variable, so there can be no static evaluation
at compile time.
Measurements in nsec.
First result - Intel Skylake at 4.25 GHz
Second result - AMD Zen3 at 3.7 GHz
abs(x/y) in range that matters [0.5:2**53]:
Standard MSVC Library - 11.1 10.4
Standard gnu Library - 5.4 10.7
Yours (MSVC) - 27.6 11.5
Yours (gcc) - 36.4 23.7
Yours (clang) - 37.4 24.3
my (MSVC) - 10.7 11.3
my (gcc) - 7.7 7.6
my (clang) - 6.3 7.5
Your last (MSVC) - 27.6 11.6
Your last (gcc) - 33.8 28.6
Your last (clang) - 32.6 26.9
abs(x/y) in full range [2**-2090:2**2090]:
Standard MSVC Library - 109.4 153.5
Standard glibc Library - 102.3 155.5
Yours (MSVC) - 134.9 52.6
Yours (gcc) - 284.7 151.8
Yours (clang) - 285.2 156.5
my (MSVC) - 62.1 61.1
my (gcc) - 60.8 59.1
my (clang) - 59.9 59.3
Your last (MSVC) - 135.0 52.5
Your last (gcc) - 172.1 137.3
Your last (clang) - 167.7 126.3
These are the clang++-18 results on my Zen4 computer under WSL2 with
close exponents (0x3ff to 0x433):
fmodO: 9.42929
fmodM: 11.7907
So my code is about 23% faster on my computer.
These are the results for arbitrary exponents:
fmodO: 41.9115
fmodM: 41.2062
Exactly what I already mentioned.
Maybe that depends on the glibc version, because a different glibc
version might have a differently efficient __umodti3 function.
Your code would be much, much cleaner and more readable if you
replaced the lambdas with proper functions. ...
For me that doesn't make a difference.
https://github.com/llvm/llvm-project/blob/main/compiler-rt/lib/builtins/udivmodti4.c
On line 114 they specialize the case of a 64-bit divisor.
On line 116 they further specialize our specific case of x.hi < y.
So, at the end they use the same single division instruction as MSVC.
The only difference is the overhead of a call and of two very predictable branches.
For my code it's strangely slow. On a 4+ GHz Zen4 I would expect ~5 nsec.
But it makes a difference for your potential readers.
Am 10.03.2025 um 15:33 schrieb Michael S:
That's what I also guessed, but maybe we don't have the same
glibc version. Or the code just runs more efficiently on my Zen4 CPU.
For my code it's strangely slow. On 4+ GHz Zen4 I would expect ~5
nsec.
I want your crystal ball.
One does not need a crystal ball to extrapolate the speed of simple integer
code from a 3.7 GHz Zen3 to a 4+ GHz Zen4 (probably 4.7 or 4.8 GHz).
But your repetitive avoidance of answering my direct questions about
what exactly you are using as "my code" makes me even more suspicious.
Am 10.03.2025 um 15:59 schrieb Michael S:
One does not need crystal ball to extrapolate speed of simple
integer code from 3.7 GHz Zen3 to 4+ GHz Zen4 (probably 4.7 or 4.8
GHz).
If only one core is computing, the clock is even 5.7 GHz.
But the results aren't better than shown.
But your repetitive avoidance of answering my direct questions about
what exactly you are using as "my code" makes me even more
suspicious.
I've shown the latest code of fmodO; you can easily integrate it into
your own benchmark.
I don't use uniform_real_distribution for the random numbers but
uniform_int_distribution with bounds of 0x3FFull << 52 and
0x433ull << 52 for the close-exponent case. The whole test code has
hardly changed since my initial post.
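My reading of that description, as a small sketch (assumptions: both
operands are drawn the same way and the sign bit stays clear; this is
not Bonita's actual test code):

#include <bit>
#include <cstdint>
#include <random>

int main()
{
    std::mt19937_64 gen( 42 );
    // raw 64-bit patterns whose exponent field lies between 0x3FF and 0x433
    std::uniform_int_distribution<uint64_t> dist( 0x3FFull << 52, 0x433ull << 52 );
    for( int i = 0; i != 10; ++i )
    {
        double x = std::bit_cast<double>( dist( gen ) );
        double y = std::bit_cast<double>( dist( gen ) );
        (void)x; (void)y;   // feed these into the fmod benchmark
    }
}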
That's not the question I am asking for the 4th or 5th time.
My question is: what *exactly* is fmodM?
I did, and presented the results. I am fully willing to believe that
the difference between our clang results is explained by a difference in
the compiler support library.
But so far I find no explanation for why results for what you claim to
be *my* code are so much slower in your measurements, despite your
faster CPU.
BTW, the number you did not publish at all was the speed of the fmod()
routine from the standard library. ...
Assuming that the exponent of y is fixed at 1023, that is approximately
the same as my own test.
Am 10.03.2025 um 16:43 schrieb Michael S:
That's not the question I am asking for 4th or 5th time.
My question is what *exactly* is fmodM.
fmodM is your code, M like Michael.
I did, and presented the results. I am fully willing to believe that
the difference between our clang results is explained by a difference in the compiler support library.
Or maybe the different CPU.
But so far I find no explanation for why results for what you claim
to be *my* code are so much slower in your measurements, despite
your faster CPU.
I compiled with -O2 and -march=native; that should be sufficient.
BTW, the number you did not publish at all was the speed of fmod()
routine from standard library. ...
I have the fmod code of glibc 2.31, which is rather slow since it
does the subtract and shift manually - code from Sun from the 90s.
Assuming that the exponent of y is fixed at 1023 that is
approximately the same as my own test.
Yes, but as you said earlier close exponents are more relevant.
What "my code"?
Post it.
Am 10.03.2025 um 18:51 schrieb Michael S:
What my code?
Post it.
I just changed the function name, and the code uses xtrunc()
instead of trunc() since trunc() is slow with MSVC. I removed my
_mm_getcsr() / _mm_setcsr() improvement since its speedup was
noticeable but not significant, unlike the xtrunc() optimization,
which gave a speedup of about +100% with MSVC.
double fmodM( double x, double y )
{
    if( isnan( x ) )
        return x;
    // pre-process y
    if( isless( y, 0 ) )
        y = -y;
    else if( isgreater( y, 0 ) )
        ;
    else {
        if( isnan( y ) )
            return y;
        // y==0
        feraiseexcept( FE_INVALID );
        return nan( "y0" );
    }
    // y in (0:+inf]
    // Quick path
    double xx = x * 0x1p-53;
    if( xx > -y && xx < y ) {
        // among other things, x guaranteed to be finite
        if( x > -y && x < y )
            return x; // case y=+-inf covered here
        double d = xtrunc( x / y );
        double res = fma( -y, d, x );
        if( signbit( x ) != signbit( res ) ) {
            // overshoot because of unfortunate division rounding
            // it is extremely rare for small x/y,
            // but not rare when x/y is close to 2**53
            res = fma( -y, d + (signbit( x ) * 2 - 1), x );
        }
        return res;
    }
    // slow path
    if( isinf( x ) ) {
        feraiseexcept( FE_INVALID );
        return nan( "xinf" );
    }
    int oldRnd = fegetround();
    fesetround( FE_TOWARDZERO );
    double ax = fabs( x );
    do {
        double yy = y;
        while( yy < ax * 0x1p-1022 )
            yy *= 0x1p1021;
        do
            ax = fma( -yy, xtrunc( ax / yy ), ax );
        while( ax >= yy );
    } while( ax >= y );
    ax = copysign( ax, x );
    fesetround( oldRnd );
    return ax;
}
Your idea is really elegant, and as I've shown it could be
significantly improved with SSE 4.1 along with FMA3. But at the point
where I noticed how performant a 128 : 64 modulo division through
glibc is, and that this is superior to the FMA solution, I dropped the
whole idea and removed the SSE-FMA code from my test program.
The relevant code is the one posted a week ago. I am posting it for
the second time:
#include <math.h>
#include <fenv.h>
double my_fmod(double x, double y)
{
if (isnan(x))
return x;
// pre-process y
if (y < 0)
y = -y;
else if (y > 0)
;
else {
if (isnan(y))
return y;
// y==0
feraiseexcept(FE_INVALID);
return NAN;
}
// y in (0:+inf]
double ax = fabs(x);
// Quick path
if (ax * 0x1p-53 < y) {
// among other things, x guaranteed to be finite
if (ax < y)
return x; // case y=+-inf covered here
double d = floor(ax/y);
double res = fma(-y, d, ax);
if (res < 0) {
// overshoot because of unfortunate division rounding
// it is extremely rare for small x/y,
// but not rare when x/y is close to 2**53
res += y;
}
if (x < 0)
res = -res;
return res;
}
// slow path
if (isinf(x)) {
feraiseexcept(FE_INVALID);
return NAN;
}
int flipflop = 0;
do {
double yy = y;
while (yy < ax * 0x1p-1022)
yy *= 0x1p1021;
do {
ax = fma(-yy, floor(ax/yy), ax);
flipflop ^= (ax < 0);
ax = fabs(ax);
} while (ax >= yy);
} while (ax >= y);
if (flipflop)
ax = y - ax;
if (x < 0)
ax = -ax;
return ax;
}
I'd rather call it "simple" or "straightforward". "Elegant" in my book
is something else. For example, the code above is closer to what I
consider elegant.
1. Long division is very slow on the majority of older CPUs. That includes
CPUs that are quite fast in the absolute sense, like Intel Skylake
with all its subvariants (Kaby Lake, Coffee Lake, Whiskey Lake, etc...)
and AMD Zen2.
2. The source language is not standard C (or C++ for that matter). One
has to use either gnu extensions or Microsoft's extensions. In the latter
case it becomes non-portable to ARM64.
4. Even on CPUs with fast long division and with compilers/libraries
that are able to generate long division, it is measurably slower than
fdiv/floor/fma in the case that corresponds to my quick path.
And for arbitrary exponents (0x1 to 0x7FE):
fmodO: 9.29622
fmodM: 11.4518
I compared your new version against fmod() of MSVC in terms of accuracy,
and your solution isn't absolutely accurate:
fmod: 80.0059
fmodM: 44.21
50.8631 bits shared accuracy
equal results: 95.917%
equal exceptions: 91.017%
equal NaN signs: 96.466%
equal NaN-types: 85.78%
equal NaNs: 66.164%
All my solutions so far have 100% values against glibc and the MSVC runtime.
Partially false alarm: I forgot to convert my xtrunc() function into
an xfloor() function. These are the accuracy results now, compared to
fmod() of MSVC:
53 bits shared accuracy
equal results: 100%
equal exceptions: 91.017%
equal NaN signs: 96.475%
equal NaN-types: 99.78%
equal NaNs: 96.253%
These are the accuracy results compared to glibc:
53 bits shared accuracy
equal results: 100%
equal exceptions: 99.901%
equal NaN signs: 87.224%
equal NaN-types: 93.181%
equal NaNs: 80.405%
Am 10.03.2025 um 22:34 schrieb Bonita Montero:
Let's establish common measurement methodology.
And for arbitrary exponents (0x1 to 0x7FE):
fmodO: 9.29622
fmodM: 11.4518
Sorry, the copy-buffer wasn't refreshed with the new results:
fmodO: 40.4702
fmodM: 40.1652
Pay attention that fmod() has no requirements w.r.t. such
exceptions as FE_INEXACT, FE_UNDERFLOW and the non-standard FE_DENORMAL.
Strictly speaking, even raising FE_OVERFLOW is not illegal,
but doing so would be bad quality of implementation.
Also the spec does not say what happens to FE_INVALID when one of the
inputs is a signalling NaN.
Am 11.03.2025 um 11:29 schrieb Michael S:
Pay attention that fmod() has no requirements w.r.t. to such
exceptions as FE_INEXACT, FE_UNDERFLOW and non-standard
FE_DENORMAL.
Yes, that's why I evaluate FE_INVALID only. But your code can also
set FE_INEXACT due to your "rounding" with sign change. MSVC also
seems to try to do the math with the FPU with an integer fallback,
because with exponent differences <= 53 MSVC's fmod() often sets
FE_INEXACT; but I ignore that because that shouldn't be part of fmod().
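A minimal way to check which flags a given fmod variant actually raises
(my own sketch; fmodX, x and y are placeholder names; strictly speaking
this needs FENV_ACCESS enabled):

#include <cfenv>
#include <cmath>
#include <cstdio>

void reportFlags( double (*fmodX)(double, double), double x, double y )
{
    std::feclearexcept( FE_ALL_EXCEPT );        // start with a clean FP status
    double r = fmodX( x, y );
    int raised = std::fetestexcept( FE_INVALID | FE_INEXACT );
    std::printf( "result=%a invalid=%d inexact=%d\n",
                 r, !!(raised & FE_INVALID), !!(raised & FE_INEXACT) );
}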
Strictly speaking, even raising FE_OVERFLOW is not illegal,
but doing so would be bad quality of implementation.
Couldn't FE_OVERFLOW happen with your implementation when the
exponents are so far apart that you get inf from the division?
Also spec does not say what happens to FE_INVALID when one of the
inputs is signalling NAN.
See my code; I return MSVC- and glibc-compatible NaNs and I raise
the same exceptions. MSVC sets FE_INVALID only when x is inf or y
is zero; glibc in addition raises FE_INVALID when either operand
is a signalling NaN.
Am 11.03.2025 um 11:29 schrieb Michael S:
Pay attention that fmod() has no requirements w.r.t. to such
exceptions as FE_INEXACT, FE_UNDERFLOW and non-standard
FE_DENORMAL.
Yes, that's why I evaluate FE_INVALID only.
Exactly. Both options are legal. MS's decision to not set FE_INVALID is
as good as glibc's decision to set it.
BTW, what is the output of MS library in that case? SNAN or QNAN?
Am 11.03.2025 um 12:34 schrieb Michael S:
Exactly. Both options are legal. MS's decision to not set
FE_INVALID is as good as glibc decision to set it.
If I do an SSE/AVX operation where either operand is a signalling NaN
I get FE_INVALID; since the FPU behaves this way, the MSVC runtime
should do that also.
BTW, what is the output of MS library in that case? SNAN or QNAN?
Results with SNaN parameters are always QNaN; that should be common
with any FPU.
But not when the library routine does not use the FPU, or uses it only for comparison ops.
The point is, it does not sound right if an SNaN is *silently* converted
to a QNaN. That type of conversion has to be loud, i.e. accompanied by
setting FE_INVALID.
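A small demonstration of that behaviour (my own example; whether the
compiler keeps the operation at run time is an assumption, hence the
volatile):

#include <cfenv>
#include <cmath>
#include <cstdio>
#include <limits>

int main()
{
    std::feclearexcept( FE_ALL_EXCEPT );
    volatile double s = std::numeric_limits<double>::signaling_NaN();
    volatile double q = s + 0.0;   // arithmetic on an sNaN quiets it ...
    std::printf( "FE_INVALID raised: %d, result is NaN: %d\n",
                 !!std::fetestexcept( FE_INVALID ),  // ... and raises FE_INVALID
                 (int)std::isnan( q ) );
}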
On Tue, 11 Mar 2025 13:10:31 +0100
Bonita Montero <Bonita.Montero@gmail.com> wrote:
Am 11.03.2025 um 12:34 schrieb Michael S:
Exactly. Both options are legal. MS's decision to not set
FE_INVALID is as good as glibc decision to set it.
If I do a SSE-/AVX-operation where either operand is a signalling
NaN I get a FE_INVALID; since the FPU behaves this way the MSVC
runtime should do that also.
BTW, what is the output of MS library in that case? SNAN or QNAN?
Results with SNaN parameters are always QNaN, that shoud be common
with any FPU.
But not when library routine does not use FPU. Or uses FPU only for comparison ops.
The point is, it does not sound right if SNAN is *silently* converted
to QNAN. That type of conversion has to be loud i.e. accompanied by
setting of FE_INVALID.
Pay attention that fmod() has no requirements w.r.t. to such exceptions
as FE_INEXACT, FE_UNDERFLOW and non-standard FE_DENORMAL.
Strictly speaking, even raising FE_OVERFLOW is not illegal, but doing
so would be bad quality of implementation.
Also spec does not say what happens to FE_INVALID when one of the
inputs is signalling NAN.
On Mon, 10 Mar 2025 19:00:06 +0100
Bonita Montero <Bonita.Montero@gmail.com> wrote:
Your idea is really elegant
I'd rather call it "simple" or "straightforward". "Elegant" in my book
is something else. For example, the code above is closer to what I
consider elegant.
Maybe, later today or tomorrow, I'll show you a solution that I
consider bright. Bright, but impractical.
On Mon, 10 Mar 2025 20:38:18 +0200
Michael S <already5chosen@yahoo.com> wrote:
On Mon, 10 Mar 2025 19:00:06 +0100
Bonita Montero <Bonita.Montero@gmail.com> wrote:
Your idea is really elegant
I'd rather call it "simple" or "straightforward". "Elegant" in my
book is something else. For example, the code above is closer to
what I consider elegant.
Maybe, later today or tomorrow, I'll show you a solution that I
consider bright. Bright, but impractical.
Here, here!
A bright part is in lines 18 to 29. The rest are hopefully competent technicalities.