blob: 585ce89c0f336d0d5ab6895be6ccf9fe1b0a1c02 [file] [log] [blame]
 /* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_RECIPROCAL_DIV_H
#define _LINUX_RECIPROCAL_DIV_H

#include <linux/types.h>

/*
 * This algorithm is based on the paper "Division by Invariant
 * Integers Using Multiplication" by Torbjörn Granlund and Peter
 * L. Montgomery.
 *
 * The assembler implementation from Agner Fog, which this code is
 * based on, can be found here:
 * http://www.agner.org/optimize/asmlib.zip
 *
 * This optimization for A/B is helpful if the divisor B is mostly
 * runtime invariant. The reciprocal of B is calculated in the
 * slow-path with reciprocal_value(). The fast-path can then just use
 * a much faster multiplication operation with a variable dividend A
 * to calculate the division A/B.
 */

/**
 * struct reciprocal_value - precomputed reciprocal of an invariant divisor
 * @m:   32-bit multiplier; reciprocal_divide() keeps the upper 32 bits of
 *       the 64-bit product "dividend * m".
 * @sh1: first right-shift count applied by reciprocal_divide().
 * @sh2: final right-shift count applied by reciprocal_divide().
 *
 * Produced by reciprocal_value() and consumed by reciprocal_divide().
 */
struct reciprocal_value {
	u32 m;
	u8 sh1, sh2;
};

/* "reciprocal_value" and "reciprocal_divide" together implement the basic
 * version of the algorithm described in Figure 4.1 of the paper.
 */
struct reciprocal_value reciprocal_value(u32 d);

/**
 * reciprocal_divide - fast-path division by a precomputed reciprocal
 * @a: dividend.
 * @R: reciprocal of the divisor, as returned by reciprocal_value().
 *
 * Return: the quotient of @a and the divisor that @R was computed for,
 * obtained with one widening multiplication, one subtraction, one addition
 * and two shifts instead of a hardware divide.
 */
static inline u32 reciprocal_divide(u32 a, struct reciprocal_value R)
{
	/* Upper half of the 64-bit product; the cast to u64 must happen
	 * before the multiplication so that the product is not truncated.
	 */
	u32 t = (u32)(((u64)a * R.m) >> 32);

	/* R.m < 2^32 implies t <= a, so "a - t" cannot wrap around. */
	return (t + ((a - t) >> R.sh1)) >> R.sh2;
}

/**
 * struct reciprocal_value_adv - result of reciprocal_value_adv()
 * @m:         32-bit multiplier.
 * @sh:        post-multiplication right-shift count.
 * @exp:       exponent used by the caller to detect a power-of-two divisor
 *             ("d == 1U << exp" in the usage example below).
 * @is_wide_m: when true the caller has to emit the longer
 *             subtract/shift/add sequence shown in the usage example below.
 */
struct reciprocal_value_adv {
	u32 m;
	u8 sh, exp;
	bool is_wide_m;
};

/* "reciprocal_value_adv" implements the advanced version of the algorithm
 * described in Figure 4.2 of the paper except when "divisor > (1U << 31)" whose
 * ceil(log2(d)) result will be 32 which then requires u128 divide on host. The
 * exception case could be easily handled before calling "reciprocal_value_adv".
 *
 * The advanced version requires more complex calculation to get the reciprocal
 * multiplier and other control variables, but then could reduce the required
 * emulation operations.
 *
 * It makes no sense to use this advanced version for host divide emulation:
 * the extra complexity of calculating the multiplier etc. could completely
 * cancel out our saving on emulation operations.
 *
 * However, it makes sense to use it for JIT divide code generation, for which
 * we are willing to spend host time in exchange for faster JITed code. As shown
 * by the following pseudo code, the required emulation operations could go down
 * from 6 (the basic version) to 3 or 4.
 *
 * To use the result of "reciprocal_value_adv", suppose we want to calculate
 * n/d, the pseudo C code will be:
 *
 *	struct reciprocal_value_adv rvalue;
 *	u8 pre_shift, exp;
 *
 *	// handle exception case.
 *	if (d >= (1U << 31)) {
 *		result = n >= d;
 *		return;
 *	}
 *
 *	rvalue = reciprocal_value_adv(d, 32);
 *	exp = rvalue.exp;
 *	if (rvalue.is_wide_m && !(d & 1)) {
 *		// floor(log2(d & (2^32 - d)))
 *		pre_shift = fls(d & -d) - 1;
 *		rvalue = reciprocal_value_adv(d >> pre_shift, 32 - pre_shift);
 *	} else {
 *		pre_shift = 0;
 *	}
 *
 *	// code generation starts.
 *	if (d == 1U << exp) {
 *		result = n >> exp;
 *	} else if (rvalue.is_wide_m) {
 *		// pre_shift must be zero when reached here.
 *		t = ((u64)n * rvalue.m) >> 32;
 *		result = n - t;
 *		result >>= 1;
 *		result += t;
 *		result >>= rvalue.sh - 1;
 *	} else {
 *		result = n;
 *		if (pre_shift)
 *			result >>= pre_shift;
 *		result = ((u64)result * rvalue.m) >> 32;
 *		result >>= rvalue.sh;
 *	}
 */
struct reciprocal_value_adv reciprocal_value_adv(u32 d, u8 prec);

#endif /* _LINUX_RECIPROCAL_DIV_H */