Is it possible to implement nextafter w/o obtaining a binary representation?-CodePudding

Usually nextafter is implemented in the following way:

double nextafter(double x, double y)
{
    // handle corner cases

    int delta = ((x > 0) == (x < y)) ? 1 : -1;
    unsigned long long mant = __mant(x);  // get mantissa as int
    mant  = delta;
    ...
}

Here, a binary representation is obtained using __mant(x).

Out of curiosity: is it possible to implement nextafter without obtaining a binary representation? For example, using a sequence of arithmetic floating point operations.

CodePudding user response：

Consider FP may/may not support sub-normals, infinity, have unique values for most values (e.g. using 2 float for double), support /= 0, without an intimae knowledge of the FP encoding, using assumptions like mant = delta; is the next value leads to portability failures - even when using binary ops. Using only FP ops would need many assumptions on the FP encoding.

I think a more useful approach would be to post candidate code that uses "a sequence of arithmetic floating point operations" and then ask for 1) what conditions it fails 2) how to improve?

CodePudding user response：

The following implements nextafter for finite values for IEEE-754 arithmetic with round-to-nearest-ties-to-even. Without these assumptions, I expect an implementation is still possible by using the ilogb or logb functions to determine the exponent and scalb to increment the LSB, along with various tests and adjustments for negative numbers, overflow, NaNs, and such.

#include <float.h>
#include <math.h>


/*  Return the next floating-point value after the finite value q.

    This was inspired by Algorithm 3.5 in Siegfried M. Rump, Takeshi Ogita, and
    Shin'ichi Oishi, "Accurate Floating-Point Summation", _Technical Report
    05.12_, Faculty for Information and Communication Sciences, Hamburg
    University of Technology, November 13, 2005.
    
    IEEE-754 and the default rounding mode,
    round-to-nearest-ties-to-even, may be required.
*/
double NextAfter(double q)
{
    /*  Scale is .625 ULP, so multiplying it by any significand in [1, 2)
        yields something in [.625 ULP, 1.25 ULP].
    */
    static const double Scale = 0.625 * DBL_EPSILON;

    /*  Either of the following may be used, according to preference and
        performance characteristics.  In either case, use a fused multiply-add
        (fma) to add to q a number that is in [.625 ULP, 1.25 ULP].  When this
        is rounded to the floating-point format, it must produce the next
        number after q.
    */
#if 0
    // SmallestPositive is the smallest positive floating-point number.
    static const double SmallestPositive = DBL_EPSILON * DBL_MIN;

    if (fabs(q) < 2*DBL_MIN)
        return q   SmallestPositive;

    return fma(fabs(q), Scale, q);
#else
    return fma(fmax(fabs(q), DBL_MIN), Scale, q);
#endif
}


#if defined CompileMain


#include <stdio.h>
#include <stdlib.h>


#define NumberOf(a) (sizeof (a) / sizeof *(a))


int main(void)
{
    int status = EXIT_SUCCESS;

    static const struct { double in, out; } cases[] =
    {
        {  INFINITY,                 INFINITY                },
        {  0x1.fffffffffffffp1023,   INFINITY                },
        {  0x1.ffffffffffffep1023,   0x1.fffffffffffffp1023  },
        {  0x1.ffffffffffffdp1023,   0x1.ffffffffffffep1023  },
        {  0x1.ffffffffffffcp1023,   0x1.ffffffffffffdp1023  },
        {  0x1.0000000000003p1023,   0x1.0000000000004p1023  },
        {  0x1.0000000000002p1023,   0x1.0000000000003p1023  },
        {  0x1.0000000000001p1023,   0x1.0000000000002p1023  },
        {  0x1.0000000000000p1023,   0x1.0000000000001p1023  },

        {  0x1.fffffffffffffp1022,   0x1.0000000000000p1023  },

        {  0x1.fffffffffffffp1,      0x1.0000000000000p2     },
        {  0x1.ffffffffffffep1,      0x1.fffffffffffffp1     },
        {  0x1.ffffffffffffdp1,      0x1.ffffffffffffep1     },
        {  0x1.ffffffffffffcp1,      0x1.ffffffffffffdp1     },
        {  0x1.0000000000003p1,      0x1.0000000000004p1     },
        {  0x1.0000000000002p1,      0x1.0000000000003p1     },
        {  0x1.0000000000001p1,      0x1.0000000000002p1     },
        {  0x1.0000000000000p1,      0x1.0000000000001p1     },

        {  0x1.fffffffffffffp-1022,  0x1.0000000000000p-1021 },
        {  0x1.ffffffffffffep-1022,  0x1.fffffffffffffp-1022 },
        {  0x1.ffffffffffffdp-1022,  0x1.ffffffffffffep-1022 },
        {  0x1.ffffffffffffcp-1022,  0x1.ffffffffffffdp-1022 },
        {  0x1.0000000000003p-1022,  0x1.0000000000004p-1022 },
        {  0x1.0000000000002p-1022,  0x1.0000000000003p-1022 },
        {  0x1.0000000000001p-1022,  0x1.0000000000002p-1022 },
        {  0x1.0000000000000p-1022,  0x1.0000000000001p-1022 },

        {  0x0.fffffffffffffp-1022,  0x1.0000000000000p-1022 },
        {  0x0.ffffffffffffep-1022,  0x0.fffffffffffffp-1022 },
        {  0x0.ffffffffffffdp-1022,  0x0.ffffffffffffep-1022 },
        {  0x0.ffffffffffffcp-1022,  0x0.ffffffffffffdp-1022 },
        {  0x0.0000000000003p-1022,  0x0.0000000000004p-1022 },
        {  0x0.0000000000002p-1022,  0x0.0000000000003p-1022 },
        {  0x0.0000000000001p-1022,  0x0.0000000000002p-1022 },
        {  0x0.0000000000000p-1022,  0x0.0000000000001p-1022 },

        { -0x1.fffffffffffffp1023,  -0x1.ffffffffffffep1023  },
        { -0x1.ffffffffffffep1023,  -0x1.ffffffffffffdp1023  },
        { -0x1.ffffffffffffdp1023,  -0x1.ffffffffffffcp1023  },
        { -0x1.0000000000004p1023,  -0x1.0000000000003p1023  },
        { -0x1.0000000000003p1023,  -0x1.0000000000002p1023  },
        { -0x1.0000000000002p1023,  -0x1.0000000000001p1023  },
        { -0x1.0000000000001p1023,  -0x1.0000000000000p1023  },

        { -0x1.0000000000000p1023,  -0x1.fffffffffffffp1022  },

        { -0x1.0000000000000p2,     -0x1.fffffffffffffp1     },
        { -0x1.fffffffffffffp1,     -0x1.ffffffffffffep1     },
        { -0x1.ffffffffffffep1,     -0x1.ffffffffffffdp1     },
        { -0x1.ffffffffffffdp1,     -0x1.ffffffffffffcp1     },
        { -0x1.0000000000004p1,     -0x1.0000000000003p1     },
        { -0x1.0000000000003p1,     -0x1.0000000000002p1     },
        { -0x1.0000000000002p1,     -0x1.0000000000001p1     },
        { -0x1.0000000000001p1,     -0x1.0000000000000p1     },

        { -0x1.0000000000000p-1021, -0x1.fffffffffffffp-1022 },
        { -0x1.fffffffffffffp-1022, -0x1.ffffffffffffep-1022 },
        { -0x1.ffffffffffffep-1022, -0x1.ffffffffffffdp-1022 },
        { -0x1.ffffffffffffdp-1022, -0x1.ffffffffffffcp-1022 },
        { -0x1.0000000000004p-1022, -0x1.0000000000003p-1022 },
        { -0x1.0000000000003p-1022, -0x1.0000000000002p-1022 },
        { -0x1.0000000000002p-1022, -0x1.0000000000001p-1022 },
        { -0x1.0000000000001p-1022, -0x1.0000000000000p-1022 },

        { -0x1.0000000000000p-1022, -0x0.fffffffffffffp-1022 },
        { -0x0.fffffffffffffp-1022, -0x0.ffffffffffffep-1022 },
        { -0x0.ffffffffffffep-1022, -0x0.ffffffffffffdp-1022 },
        { -0x0.ffffffffffffdp-1022, -0x0.ffffffffffffcp-1022 },
        { -0x0.0000000000004p-1022, -0x0.0000000000003p-1022 },
        { -0x0.0000000000003p-1022, -0x0.0000000000002p-1022 },
        { -0x0.0000000000002p-1022, -0x0.0000000000001p-1022 },
        { -0x0.0000000000001p-1022, -0x0.0000000000000p-1022 },
    };

    for (int i = 0; i < NumberOf(cases);   i)
    {
        double in = cases[i].in, expected = cases[i].out;
        double observed = NextAfter(in);
        printf("NextAfter(%a) = %a.\n", in, observed);
        if (! (observed == expected))
        {
            printf("\tError, expected %a.\n", expected);
            status = EXIT_FAILURE;
        }
    }

    return status;
}


#endif  // defined CompileMain

CodePudding user response：

Best I can tell, this is possible only if fused multiply-add (FMA) is available. For my sample ISO-C99 implementation below I used float mapped to IEEE-754 binary32 because I can get better test coverage that way. The assumptions are that IEEE-754 bindings for C are in effect (so C floating-point types are bound to IEEE-754 binary types, subnormals are supported etc), the rounding mode in effect is round-to-nearest-or-even, and exception reporting requirements specified by the ISO-C standard (FE_INEXACT, FE_OVERFLOW, FE_UNDERFLOW) are waived.

The code may be more complex than it needs to be; I simply separated out the various operand classes and handled them one by one. I used the Intel compiler with the strictest floating-point settings to compile. The implementation of nextafterf from the Intel math library is used as a golden reference. I consider it highly unlikely, but obviously not impossible, that my implementation has a bug that matches a bug in Intel's library implementation.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <float.h>
#include <string.h>
#include <math.h>

float my_nextafterf (float a, float b)
{
    const float FP32_MIN_NORMAL = 0x1.000000p-126f;
    const float FP32_MAX_NORMAL = 0x1.fffffep 127f;
    const float FP32_EPSILON = 0x1.0p-23f;
    const float FP32_ONE = 1.0f;
    const float FP32_HALF = 0.5f;
    const float FP32_SUBNORM_SCALE = FP32_ONE / FP32_MIN_NORMAL;
    const float FP32_INC = (FP32_ONE   FP32_EPSILON) * FP32_EPSILON * FP32_HALF;
    const float FP32_INT_SCALE = FP32_ONE / FP32_EPSILON;

    float r;
    if ((!(fabsf(a) <= INFINITY)) || (!(fabsf(b) <= INFINITY))) { // unordered
        r = a   b;
    }
    else if (a == b) { // equal
        r = ((FP32_ONE / a) != (FP32_ONE / b)) ? b : a;
    }
    else if (fabsf (a) == INFINITY) { // infinity
        r = (a >= 0) ? FP32_MAX_NORMAL : (-FP32_MAX_NORMAL);
    }
    else if (fabsf (a) >= FP32_MIN_NORMAL) { // normal
        float factor = ((a >= 0) ^ (a > b)) ? FP32_INC : (-FP32_INC);
        r = fmaf (factor, a, a);
    } else { // subnormal
        float scal = (a >= 0) ? FP32_INT_SCALE : (-FP32_INT_SCALE);
        float incr = ((a >= 0) ^ (a > b)) ? FP32_ONE : (-FP32_ONE);
        r = (a * scal * FP32_SUBNORM_SCALE   incr) / scal / FP32_SUBNORM_SCALE;
    }
    return r;
}

float uint32_as_float (uint32_t a)
{
    float r;
    memcpy (&r, &a, sizeof r);
    return r;
}

uint32_t float_as_uint32 (float a)
{
    uint32_t r;
    memcpy (&r, &a, sizeof r);
    return r;
}

// Fixes via: Greg Rose, KISS: A Bit Too Simple. http://eprint.iacr.org/2011/007
static uint32_t kiss_z = 362436069;
static uint32_t kiss_w = 521288629;
static uint32_t kiss_jsr = 123456789;
static uint32_t kiss_jcong = 380116160;
#define znew (kiss_z=36969*(kiss_z&0xffff) (kiss_z>>16))
#define wnew (kiss_w=18000*(kiss_w&0xffff) (kiss_w>>16))
#define MWC  ((znew<<16) wnew)
#define SHR3 (kiss_jsr^=(kiss_jsr<<13),kiss_jsr^=(kiss_jsr>>17),kiss_jsr^=(kiss_jsr<<5))
#define CONG (kiss_jcong=69069*kiss_jcong 13579)
#define KISS ((MWC^CONG) SHR3)

int main (void)
{
    float a, b, res, ref;
    uint32_t ia, ib, ires, iref;
    unsigned long long int count = 0;
    const uint32_t FP32_QNAN_BIT = 0x00400000;

    while (1) {
        ia = KISS;
        ib = KISS;
        /* increase likelihood of zeros, infinities, equality */
        if (!(KISS & 0xfff)) ia = ia & 0x80000000;
        if (!(KISS & 0xfff)) ib = ib & 0x80000000;
        if (!(KISS & 0xfff)) ia = ia | 0x7f800000;
        if (!(KISS & 0xfff)) ib = ib | 0x7f800000;
        if (!(KISS & 0xffffff)) ib = ia;

        a = uint32_as_float (ia);
        b = uint32_as_float (ib);

        res = my_nextafterf (a, b);
        ref = nextafterf (a, b);
        ires = float_as_uint32 (res);
        iref = float_as_uint32 (ref);

        if (ires != iref) {
            /* if both 'from' and 'to' are NaN, result may be either NaN, quietened */
            if (!((isnan (a) && isnan (b)) && 
                  ((ires == (ia | FP32_QNAN_BIT)) || (ires == (ib | FP32_QNAN_BIT))))) {
                printf ("error: a=x b=x  res=x  ref=x\n", ia, ib, ires, iref);
            }
        }
        count  ;
        if (!(count & 0xffffff)) printf ("\rcount=%llx", count);
    }
    return EXIT_SUCCESS;
}