The Goldschmidt division algorithm is a fast division using the Newton-Raphson approximation.

Goldschmidt division on wikipedia

It is an algorithm computing the quotient c of n (numerator) divided by d (divider).

c = n / d

The Newton-Raphson approximation estimates 1/d and the quotient is then:

c = n * (1/d)

1/d is estimated by iterating this:

x = x * (2 - x * d)

The quotient c is estimated with:

c ~= x * n

Goldschmidt division for 16bit signed integers

We replace d with a value a so that a is in the range [0.5, 1[ and 1/a is in the range ]1, 2]. We choose the range [0.5, 1[ for a because the iteration x * (2 - a * x) converges fast for this range.

a is d divided by a power of 2 number so that a is in the expected range:

a   = d / 2<<(msb(d)+1)
1/a = 2<<(msb(d)+1) / d

x is an approximation of 1/a, we get the code:

x = initial value in range [1,2]
loop:
x = x * (2 - x * a)
goto loop (4 times)
c ~= x * n / 2<<(msb(d)+1)

The computations are done using fixed point integers with 15 bit for the fractional number to fit x * (2 - x * a) into a 32 bit integer.

  • Range 0.5 to 1 maps to 16384 to 32768 (bit 15 is 0, bit 14 is 1)
  • Range 1 to 2 maps to 32768 to 65536 (bit 15 is 1)
  • 2 is 0x10000 in fixed point integer with 15bit fractional number (bit 16 is 1)

d is a 16bit signed integer in the range 0 to 32767, we compute a like this:

a = d << (clz(d)-1)

clz is a function (or the lzcnt instruction) counting the leading 0s in d (from 1 to 15), bit 15 is 0 and bit 14 is 1 so a in range 16384 to 32768.

The initial value for x is 1.5 which in fixed point is: 0xc000. We get the code:

u16 div(u16 n, u16 d) {
  // clz input is 32 bit integer so d needs to be shifted
  u16 shift = clz(((u32)d)<<16)-1;
  u32     a = d << shift;
  u32     x = 0xc000;

  // 4 iterations
  x = (x*(0x10000 - ((x*a)>>15)))>>15;
  x = (x*(0x10000 - ((x*a)>>15)))>>15;
  x = (x*(0x10000 - ((x*a)>>15)))>>15;
  x = (x*(0x10000 - ((x*a)>>15)))>>15;

  r = ((((u32)n * x) >> (15-shift)) >> 15);
  return r;
}

n * 1/a needs to be adjusted to become n * 1/d with shift right by 15-shift positions and the 15bit fractional number is removed with shift right by 15 positions. The fractional number in the result is an estimation of the remainder and is often not accurate.

Signed division is implemented by computing the sign of the result and dividing abs(n) with abs(d).

We can choose an initial value for x to make x converge faster.

range(i, 32) {
    (u32)((1/((f32)(0x4000+(i<<9))/((f32)32768)))*32768)
}
// initial value bit 9 to 13 in a are index in initialValuesForx
u32 x = initialValuesForx[(a & 0x3e00) >> 9];

The clz function is from the book Henry S. Warren - Hacker's Delight (2nd Edition)-Addison-Wesley Professional (2012):

/* u is for unused elements */
#define u 99
u32 clz(u32 v) {
  static char table[64] =
  {32,20,19, u, u,18, u, 7,  10,17, u, u,14, u, 6, u,
    u, 9, u,16, u, u, 1,26,   u,13, u, u,24, 5, u, u,
    u,21, u, 8,11, u,15, u,   u, u, u, 2,27, 0,25, u,
    22, u,12, u, u, 3,28, u,  23, u, 4,29, u, u,30,31};

  v = v | (v >> 1);    // Propagate leftmost
  v = v | (v >> 2);    // 1-bit to the right.
  v = v | (v >> 4);
  v = v | (v >> 8);
  v = v & ~(v >> 16);
  v = v*0xFD7049FF;
  return table[v >> 26];
}

Implementation in C16 assembly

C16 CPU

The clz function is implemented like this:

clz:
  ; count leading zeros
  ; input r1 output r1
  ; r0, r1, r2 are used

  ; v = v | (v >> 1);
  mv    r2, r1
  shri  r2, 1
  or    r1, r2
  ; v = v | (v >> 2);
  mv    r2, r1
  shri  r2, 2
  or    r1, r2
  ; v = v | (v >> 4);
  mv    r2, r1
  shri  r2, 4
  or    r1, r2
  ; v = v | (v >> 8);
  mv    r2, r1
  shri  r2, 8
  or    r1, r2
  ; v = v & ~(v >> 16);
  mv    r2, r1
  xor   r0, r0
  loadi 0, 0x10
  shr   r2 ,r0
  not   r2
  and   r1, r2
  ; load 26 in r2
  loadi 0, 0x1a
  mv    r2, r0
  ; v = v*0xFD7049FF;
  ; load 0xFD7049FF in r0
  loadi 3, 0xFD
  loadi 2, 0x70
  loadi 1, 0x49
  loadi 0, 0xFF
  mul   r1, r0
  shr   r1, r2
  ; return table[v >> 26];
  ; table address is 0
  xor   r2, r2
  mula  r1, 2
  load  r1, r1
  ret

The nrdiv function is the implementation of the newton-raphson method for the goldschmidt division approximation:

nrdiv:
  ; divide function the Goldschmidt algorithm
  ; input i16 r4 n , r5 d output r4 quotient
  ; r0-r8 are modified

  ; handle division by 0
  xor    r0, r0
  cmp    r5, r0
  jnz    nrdivByR5
  ; division by 0
  ; skip
  inc    r0
  xor    r4, r4
  ret
nrdivByR5:
  ; store sign of the result
  ; r8 sign = 1
  xor    r8, r8
  inc    r8
  ; convert both dividend and divisor to positive
  ; if (n < 0 and d >= 0) {
  ; n >= 0
  cmp    r4, r0
  jge    endNNegDPos
  ; d < 0
  cmp    r5, r0
  jl     endNNegDPos
  ; r8 sign = -1
  dec    r8
  dec    r8
  ; n = -n
  neg    r4
  jmp    unsignedNrdiv
endNNegDPos:
  ; else if (n >= 0 and d < 0) {
  ; n < 0
  cmp    r4, r0
  jl     endNPosDNeg
  ; n >= 0
  cmp    r5, r0
  jge    endNPosDNeg
  ; r8 sign = -1
  dec    r8
  dec    r8
  ; d = -d
  neg    r5
  jmp    unsignedNrdiv
endNPosDNeg:
  ; else if (n < 0 and d < 0) {
  ; n >= 0
  cmp    r4, r0
  jge    endNNegDNeg
  ; d >= 0
  cmp    r5, r0
  jge    endNNegDNeg
  dec    r0
  ; n = -n
  neg    r4
  ; d = -d
  neg    r5
endNNegDNeg:
unsignedNrdiv:

  ; r3 is shift
  ; u16 shift = __builtin_clz(((u32)d)<<16)-1;
  ; count leading zeros
  mv    r1, r5
  xor   r0, r0
  loadi 0, 0x10
  shl   r1, r0
  lzcnt r1, r1
  dec   r1
  mv    r3, r1

  ; r5 is d and becomes a after shl
  ; u32     a = d << shift;
  shl   r5, r3

  ; r2 is x
  ; u32 x = initialValuesForx[(a & 0x3e00) >> 9];
  ; initialValuesForx array has address 0x100
  xor   r0, r0
  loadi 1, 0x1
  mv    r2, r0
  mv    r1, r5
  xor   r0, r0
  loadi 1, 0x3e
  and   r1, r0
  xor   r0, r0
  loadi 0, 0x9
  shr   r1, r0
  mula  r2, 2
  load  r2, r2

 ; r6 is 15
  xor   r0, r0
  loadi 0, 0xf
  mv    r6, r0
  ; r7 is 0x10000
  xor   r0, r0
  loadi 2, 0x1
  mv    r7, r0

  ; x = (x*(0x10000 - ((x*a)>>15)))>>15;
  mv    r1, r2 ; r1 = x
  mul   r1, r5 ; r1 = x * a
  shr   r1, r6
  mv    r0, r7
  sub   r0, r1 ; r0 = 0x10000 - ((x*a)>>15)
  mul   r0, r2 ; r0 = r0 * x
  shr   r0, r6
  mv    r2, r0 ; r2 = (x*(0x10000 - ((x*a)>>15)))>>15
  ; iteration 2
  mv    r1, r2 ; r1 = x
  mul   r1, r5 ; r1 = x * a
  shr   r1, r6
  mv    r0, r7
  sub   r0, r1 ; r0 = 0x10000 - ((x*a)>>15)
  mul   r0, r2 ; r0 = r0 * x
  shr   r0, r6
  mv    r2, r0 ; r2 = (x*(0x10000 - ((x*a)>>15)))>>15
  ; iteration 3
  mv    r1, r2 ; r1 = x
  mul   r1, r5 ; r1 = x * a
  shr   r1, r6
  mv    r0, r7
  sub   r0, r1 ; r0 = 0x10000 - ((x*a)>>15)
  mul   r0, r2 ; r0 = r0 * x
  shr   r0, r6
  mv    r2, r0 ; r2 = (x*(0x10000 - ((x*a)>>15)))>>15
  ; iteration 4
  mv    r1, r2 ; r1 = x
  mul   r1, r5 ; r1 = x * a
  shr   r1, r6
  mv    r0, r7
  sub   r0, r1 ; r0 = 0x10000 - ((x*a)>>15)
  mul   r0, r2 ; r0 = r0 * x
  shr   r0, r6
  ; no more iterations - mv    r2, r0 ; r2 = (x*(0x10000 - ((x*a)>>15)))>>15

  ; (((u32)n * x) >> (15-shift)) >> 15
  mul   r4, r0 ; r4 = n * x
  mv    r0, r6
  sub   r0, r3 ; r0 = 15-shift
  shr   r4, r0 ; r4 = ((u32)n * x) >> (15-shift)
  shr   r4, r6 ; r4 = (((u32)n * x) >> (15-shift)) >> 15

  ; sign
  ; r4 = ((((u32)n * x) >> (15-shift)) >> 15) * sign
  mul   r4, r8
  ret

It is not very efficient to use the clz function because it takes 50 clock cycles to run and the lzcnt instruction gives the result in 1 clock cycle.

Long division

To compare the performance of the goldschmidt division, I implemented the traditional long division in assembly, it is the slowest:

longdiv:
  ; divide function using the long division algorithm
  ; input u32 r4, r5 output r2 quotient r3 remainder
  ; r4 N r5 D
  ; r2 Q r3 R
  ; r2 Q = 0
  xor   r2, r2
  ; 34 R = 0
  xor   r3, r3
  xor   r0, r0
  ; r1 i 31
  loadi 0, 0x1f
  mv    r1, r0
  ; r6 31 for shift 31
  mv    r6, r0
  ; r8 1 for Q = Q | 1
  xor   r8, r8
  inc   r8
  ; for (int i = 31; i >= 0; --i) {
longdivLoop:
  ; Q = Q << 1
  shli  r2, 1
  ; R = R << 1;
  shli  r3, 1
  ; R |= (N >> 31);
  mv    r7, r4
  shr   r7, r6
  or    r3, r7
  shli  r4, 1
  ; if (D <= R) {
  cmp   r5, r3
  ja    endQuotient
  ; R = R - D
  sub   r3, r5
  ; Q = Q | 1
  or    r2, r8
endQuotient:
  dec   r1
  jns   longdivLoop
  ret

Division by 10

There is an efficient divide by 10 function in the book Henry S. Warren - Hacker's Delight (2nd Edition)-Addison-Wesley Professional (2012):

unsigned divu10(unsigned n) {
    unsigned q, r;
    q = (n >> 1) + (n >> 2);
    q = q + (q >> 4);
    q = q + (q >> 8);
    q = q + (q >> 16);
    q = q >> 3;
    r = n - (((q << 2) + q) << 1);
    return q + (r > 9);
}

This divide by 10 function is useful for displaying a binary number is base 10.

I implemented it in C16 assembly:

div10:
  ; div10 function
  ; input u32 r0 n output r1
  ; q = (n >> 1) + (n >> 2);
  mv    r1,r0
  shri  r1, 1
  mv    r2,r0
  shri  r2, 2
  add   r1, r2
  ; q = q + (q >> 4);
  mv    r2, r1
  shri  r2, 4
  add   r1, r2
  ; q = q + (q >> 8);
  mv    r2, r1
  shri  r2, 8
  add   r1, r2
  ; q = q + (q >> 16);
  mv    r2, r1
  shri  r2, 8
  shri  r2, 8
  add   r1, r2
  ; q = q >> 3;
  shri  r1, 3
  ; r = n - (((q << 2) + q) << 1);
  mv    r2, r1
  mv    r3, r1
  shli  r2, 2
  add   r3, r2
  shli  r3, 1
  sub   r0, r3
  ; return q + (r > 9);
  ; r3 r
  mv    r3, r0
  xor   r0, r0
  loadi 0, 0x9
  cmp   r3, r0
  jbe   endDiv10
  inc   r1
endDiv10:
  ; result in r1
  ret

Hardware implementation in verilog

The divide by 10 function implemented in verilog is:

        div10q = {1'b0, regfile_rdata1[31:1]} + {2'b0, regfile_rdata1[31:2]};
        div10q = div10q + {4'b0, div10q[31:4]};
        div10q = div10q + {8'b0, div10q[31:8]};
        div10q = div10q + {16'b0, div10q[31:16]};
        div10q = {3'b0, div10q[31:3]};
        div10r = regfile_rdata1 - {{{div10q[28:0],2'b0}+div10q[30:0]}, 1'b0};
        regfile_wdata1 <= div10q + (div10r > 9 ? 1 : 0);

I added a div10 instruction to my C16 cpu and it takes 1 clock cycle.

I implemented a 32 bit goldschmidt division in verilog, the iterations need to be computed with 66 bit values.

For the goldschmidt division, we need to count the number of leading zero bits in the dividend to shift the dividend and compute an approximation of 1/dividend.

In verilog, leading zero are counted like this:

            divshift[4] = (sdivd[31:16] == 16'b0);
            divshift16  = divshift[4] ? sdivd[15:0] : sdivd[31:16];
            divshift[3] = (divshift16[15:8] == 8'b0);
            divshift8   = divshift[3] ? divshift16[7:0] : divshift16[15:8];
            divshift[2] = (divshift8[7:4] == 4'b0);
            divshift4   = divshift[2] ? divshift8[3:0] : divshift8[7:4];
            divshift[1] = (divshift4[3:2] == 2'b0);
            divshift[0] = divshift[1] ? ~divshift4[1] : ~divshift4[3];

sdivd is the dividend, it is never equal to 0 because divide by 0 is invalid. It sets bit 4 in the result, then bit 3, 2, 1 and 0. For bit 0, it doesn't need to test if the bit is 1 because we know the dividend is not 0.

diva is the shifted dividend in the range 2^31 to 2^32-1. divx initial value is in the range 2^32 to 2^33, it selected by using 4 bits (bit 27 to 30, bit 31 is always 1) in diva.

            diva = {34'b0, sdivd} << divshift;

                       // initial value
            if (diva[30:27] == 0) begin
              divx = 66'h200000000;
            end
            else if (diva[30:27] == 1) begin
              divx = 66'h1e1e1e1e1;
            end
            else if (diva[30:27] == 2) begin
              divx = 66'h1c71c71c7;
            end
            else if (diva[30:27] == 3) begin
              divx = 66'h1af286bca;
            end
            else if (diva[30:27] == 4) begin
              divx = 66'h199999999;
            end
            else if (diva[30:27] == 5) begin
              divx = 66'h186186186;
            end
            else if (diva[30:27] == 6) begin
              divx = 66'h1745d1745;
            end
            else if (diva[30:27] == 7) begin
              divx = 66'h1642c8590;
            end
            else if (diva[30:27] == 8) begin
              divx = 66'h155555555;
            end
            else if (diva[30:27] == 9) begin
              divx = 66'h147ae147a;
            end
            else if (diva[30:27] == 10) begin
              divx = 66'h13b13b13b;
            end
            else if (diva[30:27] == 11) begin
              divx = 66'h12f684bda;
            end
            else if (diva[30:27] == 12) begin
              divx = 66'h124924924;
            end
            else if (diva[30:27] == 13) begin
              divx = 66'h11a7b9611;
            end
            else if (diva[30:27] == 14) begin
              divx = 66'h111111111;
            end
            else if (diva[30:27] == 15) begin
              divx = 66'h108421084;
            end

            // Newton-Raphson method
            divx = (divx * (66'h200000000 - ((divx * diva)>>32))) >> 32;
            divx = (divx * (66'h200000000 - ((divx * diva)>>32))) >> 32;
            divx = (divx * (66'h200000000 - ((divx * diva)>>32))) >> 32;
            divx = (divx * (66'h200000000 - ((divx * diva)>>32))) >> 32;

            // Shift divx * numerator to get the division result
            diva = (((sdivn * divx) >> (32-{1'b0,divshift})) >> 32);

The execution stage of the division takes 1 clock cycle at 100Mhz, I don't see any timing violation in the logs from vivado (vivado is the tool from amd/xilinx for compiling designs for FPGAs) so it should be ok.

I have been running 50 million divisions with random numbers in a testbench and the division approximation is correct.

I first tried with 32 initial values and 3 iterations, it didn't work. So I increased to 64 and 128 initial values and the approximation was still incorrect sometimes. Maybe it works with 256 initial values and 3 iterations but it takes less ressources to have 16 initial values and 4 iterations.

Performance

Values are clock cycles.

Function/Implemation│ SW   │ HW
────────────────────┼──────┼─────
division by 10      │  91  │ 3
────────────────────┼──────┼─────
goldschmidt division│ 232  │ 3
────────────────────┼──────┼─────
long division       │ 943  │ 32

The software implementation of the goldschmidt division is 4 times faster than the long division and the divide by 10 function is ten times faster than the long division. In the hardware column, I wrote 3 clock cycle for the hw division and the hw divide by 10 because the C16 CPU is currently not pipelined and the execution stage takes 1 clock cycle.