float.dd

Ddoc

$(SPEC_S Floating Point,

$(H3 Floating Point Intermediate Values)

	$(P On many computers, greater
	precision operations do not take any longer than lesser
	precision operations, so it makes numerical sense to use
	the greatest precision available for internal temporaries.
	The philosophy is not to dumb down the language to the lowest
	common hardware denominator, but to enable the exploitation
	of the best capabilities of target hardware.
	)

	$(P For floating point operations and expression intermediate values,
	a greater precision can be used than the type of the
	expression.
	Only the minimum precision is set by the types of the
	operands, not the maximum. $(B Implementation Note:) On Intel
	x86 machines, for example,
	it is expected (but not required) that the intermediate
	calculations be done to the full 80 bits of precision
	implemented by the hardware.
	)

	$(P It's possible that, due to greater use of temporaries and
	common subexpressions, optimized code may produce a more
	accurate answer than unoptimized code.
	)

	$(P Algorithms should be written to work based on the minimum
	precision of the calculation. They should not degrade or
	fail if the actual precision is greater. Float or double types,
	as opposed to the real (extended) type, should only be used for:
	)

	$(UL
	    $(LI reducing memory consumption for large arrays)
	    $(LI gaining speed when it is more important than accuracy)
	    $(LI data and function argument compatibility with C)
	)

$(H3 Floating Point Constant Folding)

	$(P Regardless of the type of the operands, floating point
	constant folding is done in $(B real) or greater precision.
	It is always done following IEEE 754 rules and round-to-nearest
	is used.)

	$(P Floating point constants are internally represented in
	the implementation in at least $(B real) precision, regardless
	of the constant's type. The extra precision is available for
	constant folding. Committing to the precision of the result is
	done as late as possible in the compilation process. For example:)

---
const float f = 0.2f;
writefln(f - 0.2);
---
	$(P will print 0. A non-const static variable's value cannot be
	propagated at compile time, so:)

---
static float f = 0.2f;
writefln(f - 0.2);
---
	$(P will print 2.98023e-09. Hex floating point constants can also
	be used when specific floating point bit patterns are needed that
	are unaffected by rounding. To find the hex value of 0.2f:)

---
import std.stdio;

void main() {
  writefln("%a", 0.2f);
}
---
	$(P which is 0x1.99999ap-3. Using the hex constant:)

---
const float f = 0x1.99999ap-3f;
writefln(f - 0.2);
---

	$(P prints 2.98023e-09.)

	$(P Different compiler settings, optimization settings,
	and inlining settings can affect opportunities for constant
	folding; therefore, the results of floating point calculations may differ
	depending on those settings.)

$(H3 Rounding Control)

	$(P IEEE 754 floating point arithmetic includes the ability to select
	among 4 different rounding modes.
	These are accessible via the functions in std.c.fenv.
	)

	$(P If the floating-point rounding mode is changed within a function,
	it must be restored before the function exits. If this rule is violated
	(for example, by the use of inline asm), the rounding mode used for
	subsequent calculations is undefined.
	)


$(H3 Exception Flags)

	$(P IEEE 754 floating point arithmetic can set several flags based on what
	happened with a
	computation:)

	$(TABLE
	$(TR $(TD $(D FE_INVALID)))
	$(TR $(TD $(D FE_DENORMAL)))
	$(TR $(TD $(D FE_DIVBYZERO)))
	$(TR $(TD $(D FE_OVERFLOW)))
	$(TR $(TD $(D FE_UNDERFLOW)))
	$(TR $(TD $(D FE_INEXACT)))
	)

	$(P These flags can be set/reset via the functions in
	$(DPLLINK phobos/std_c_fenv.html, std.c.fenv).)

$(H3 Floating Point Comparisons)

	$(P In addition to the usual $(D <) $(D <)$(D =)
	$(D >) $(D >)$(D =) $(D ==) $(D !=) comparison
	operators, D adds more that are
	specific to floating point. These are
	$(D !<>=)
	$(D <>)
	$(D <>=)
	$(D !<=)
	$(D !<)
	$(D !>=)
	$(D !>)
	$(D !<>)
	and match the semantics for the
	NCEG extensions to C.
	See $(DDSUBLINK expression, floating_point_comparisons, Floating point comparisons).
	)

$(H3 $(LNAME2 floating-point-transformations, Floating Point Transformations))

	$(P An implementation may perform transformations on
	floating point computations in order to reduce their strength,
	i.e. their runtime computation time.
	Because floating point math does not precisely follow mathematical
	rules, some transformations are not valid, even though some
	other programming languages still allow them.
	)

	$(P The following transformations of floating point expressions
	are not allowed because under IEEE rules they could produce
	different results.
	)

	$(TABLE2 Disallowed Floating Point Transformations,
	$(THEAD transformation, comments)
	$(TROW
	$(ARGS $(I x) + 0 $(RARR) $(I x)) , $(ARGS not valid if $(I x) is -0)
	)
	$(TROW
	$(ARGS $(I x) - 0 $(RARR) $(I x)) , $(ARGS not valid if $(I x) is $(PLUSMN)0 and rounding is towards -$(INFIN))
	)
	$(TROW
	$(ARGS -$(I x) $(HARR) 0 - $(I x)) , $(ARGS not valid if $(I x) is +0)
	)
	$(TROW
	$(ARGS $(I x) - $(I x) $(RARR) 0) , $(ARGS not valid if $(I x) is NaN or $(PLUSMN)$(INFIN))
	)
	$(TROW
	$(ARGS $(I x) - $(I y) $(HARR) -($(I y) - $(I x))) , $(ARGS not valid because (1-1) = +0 whereas -(1-1) = -0)
	)
	$(TROW
	$(ARGS $(I x) * 0 $(RARR) 0) , $(ARGS not valid if $(I x) is NaN or $(PLUSMN)$(INFIN))
	)
$(COMMENT
	$(TROW
	$(ARGS $(I x) * 1 $(RARR) $(I x)) , $(ARGS not valid if $(I x) is a signaling NaN)
	)
)
	$(TROW
	$(ARGS $(I x) / $(I c) $(HARR) $(I x) * (1/$(I c))) , $(ARGS valid if (1/$(I c)) yields an exact result)
	)
	$(TROW
	$(ARGS $(I x) != $(I x) $(RARR) false) , $(ARGS not valid if $(I x) is a NaN)
	)
	$(TROW
	$(ARGS $(I x) == $(I x) $(RARR) true) , $(ARGS not valid if $(I x) is a NaN)
	)
	$(TROW
	$(ARGS $(I x) !$(I op) $(I y) $(HARR) !($(I x) $(I op) $(I y))) , $(ARGS not valid if $(I x) or $(I y) is a NaN)
	)
	)

	$(P Of course, transformations that would alter side effects are also
	invalid.)

)

Macros:
	TITLE=Floating Point
	WIKI=Float
	CATEGORY_SPEC=$0