www.pudn.com > pi.zip > pi.c
/* +++Date last modified: 05-Jul-1997 */ /* ** This method implements the Salamin / Brent / Gauss Arithmetic- ** Geometric Mean pi formula. ** ** Let A[0] = 1, B[0] = 1/Sqrt(2) ** ** Then iterate from 1 to 'n'. ** A[n] = (A[n-1] + B[n-1])/2 ** B[n] = Sqrt(A[n-1]*B[n-1]) ** C[n]^2 = A[n]^2 - B[n]^2 (or) C[n] = (A[n-1]-B[n-1])/2 ** n ** PI[n] = 4A[n+1]^2 / (1-(Sum (2^(j+1))*C[j]^2)) ** j = 1 ** ** There is an actual error calculation, but it comes out to slightly ** more than double on each iteration. I think it results in about 17 ** million correct digits, instead of 16 million if it actually ** doubled. PI16 generates 178,000 digits. PI19 to over a million. ** PI22 is 10 million, and PI26 to 200 million. ** ** For what little it's worth, this program is placed into the public ** domain by its author, Carey Bloodworth, on September 21, 1996. ** ** The math routines in this program are not general purpose routines. ** They have been optimized and written for this specific use. The ** concepts are of course portable, but not the implementation. ** ** The program run time is about O(n^1.7). That's fairly good growth, ** compared to things such as the classic arctangents. But not as good ** as it should be, if I used a FFT based multiplication. Also, the 'O' ** is fairly large. In fact, I'd guess that this program could compute ** one million digits of pi in about the same time as my previously ** posted arctangent method, in spite of this one having less than n^2 ** growth. ** ** The program has not been cleaned up. It's written rather crudely ** and dirty. The RSqrt() in particular is rather dirty, having gone ** through numerous changes and attempts at speeding it up. ** But I'm not planning on doing any more with it. The growth isn't as ** low as I'd hoped, and until I find a faster multiplication, the ** method isn't any better than simpler arctangents. ** ** I currently use a base of 10,000 simply because it made debugging ** easier. A base of 65,536 and modifying the FastMul() to handle sizes ** that aren't powers of two would allow a bit better efficiency. */ #include#include #include #include #ifdef __TURBOC__ typedef short int Indexer; #else typedef long int Indexer; #endif typedef short int Short; typedef long int Long; Short *a, *b, *c, *Work1, *MulWork, *Work2, *Work3; int SIZE; int ispow2(int x){ return! ((~(~0U>>1)|x)&x -1) ;} /* ** Work1 is explicitly used in Reciprocal() and RSqrt() ** Work1 is implicitly used in Divide() and Sqrt() ** Work2 is explicitly used in Divide() and Sqrt() ** Work3 is used only in the AGM and holds the previous reciprocal ** square root, allowing us to save some time in RSqrt() */ void Copy(Short *a, Short *b, Indexer Len) { while (Len--) a[Len] = b[Len]; } /* ** This rounds and copies a 2x mul result into a normal result ** Our number format will never have more than one unit of integer, ** and after a mul, we have two, so we need to fix that. */ void Round2x(Short *a, Short *b, Indexer Len) { Indexer x; Short carry; carry = 0; if (b[Len+1] >= 5000) carry = 1; for (x = Len; x > 0; x--) { carry += b[x]; a[x-1] = carry % 10000; carry /= 10000; } } void DivBy2(Short *n, Indexer Len) { Indexer x; Long temp; temp = 0; for (x = 0; x < Len; x++) { temp = (Long)n[x]+temp*10000; n[x] = (Short)(temp/2); temp = temp%2; } } void DoCarries(Short *limit, Short *cur, Short carry) { Long temp; while ((cur >= limit) && (carry != 0)) { temp = *cur+carry; carry = 0; if (temp >= 10000) { carry = 1; temp -= 10000; } *cur = temp; --cur; } } void DoBorrows(Short *limit, Short *cur, Short borrow) { Long temp; while ((cur >= limit) && (borrow != 0)) { temp = *cur-borrow; borrow = 0; if (temp < 0) { borrow = 1; temp += 10000; } *cur = temp; --cur; }; } void PrintShort2(char *str, Short *num, Indexer Len) { Indexer x; printf("%s ", str); printf("%u.", num[0]); for (x = 1; x < Len; x++) printf("%04u", num[x]); printf("\n"); } void PrintShort(char *str, Short *num, Indexer Len) { Indexer x; int printed = 0; printf("%s ", str); printf("%u.\n", num[0]); for (x = 1; x < Len; x++) { printf("%02d", num[x]/100); printed += 2; if ((printed % 1000) == 0) { printf("\n\n\n\n"); printed = 0; } else if ((printed % 50) == 0) { printf("\n"); } else if ((printed % 10) == 0) { printf(" "); } printf("%02d", num[x] % 100); printed += 2; if ((printed % 1000) == 0) { printf("\n\n\n\n"); printed = 0; } else if ((printed % 50) == 0) { printf("\n"); } else if ((printed % 10) == 0) { printf(" "); } } printf("\n"); } /* sum = a + b */ Short Add(Short *sum, Short *a, Short *b, Indexer Len) { Long s, carry; Indexer x; carry = 0; for (x = Len - 1; x >= 0; x--) { s = (Long)a[x] + (Long)b[x] + carry; sum[x] = (Short)(s%10000); carry = s/10000; } return carry; } /* dif = a-b */ Short Sub(Short *dif, Short *a, Short *b, Indexer Len) { Long d, borrow; Indexer x; borrow = 0; for (x = Len - 1; x >= 0; x--) { d = (Long)a[x] - (Long)b[x] - borrow; borrow = 0; if (d < 0) { borrow = 1; d += 10000; } dif[x] = (Short)d; } return borrow; } void Negate(Short *num, Indexer Len) { Indexer x;Long d, borrow; borrow = 0; for (x = Len - 1; x >= 0; x--) { d = 0 - num[x] - borrow; borrow = 0; if (d < 0) { borrow = 1; d += 10000; } num[x] = (Short)d; } } /* prod = a*b. prod should be twice normal length */ void Mul(Short *prod, Short *a, Short *b, Indexer Len) { Long p; Indexer ia, ib, ip; if ((prod == a) || (b == prod)) { printf("MUL product can't be one of the other arguments\n"); exit(1); } for (ip = 0; ip < Len * 2; ip++) prod[ip] = 0; for (ib = Len - 1; ib >= 0; ib--) { if (b[ib] == 0) continue; ip = ib + Len; p = 0; for (ia = Len - 1; ia >= 0; ia--) { p = (Long)a[ia]*(Long)b[ib] + p + prod[ip]; prod[ip] = p%10000; p = p/10000; ip--; } while ((p) && (ip >= 0)) { p += prod[ip]; prod[ip] = p%10000; p = p/10000; ip--; } } } /* ** This is based on the simple O(n^1.585) method, although my ** growth seems to be closer to O(n^1.7) ** ** It's fairly simple. a*b is: a2b2(B^2+B)+(a2-a1)(b1-b2)B+a1b1(B+1) ** ** For a = 4711 and b = 6397, a2 = 47 a1 = 11 b2 = 63 b1 = 97 Base = 100 ** ** If we did that the normal way, we'd do ** a2b2 = 47*63 = 2961 ** a2b1 = 47*97 = 4559 ** a1b2 = 11*63 = 693 ** a1b1 = 11*97 = 1067 ** ** 29 61 ** 45 59 ** 6 93 ** 10 67 ** ----------- ** 30 13 62 67 ** ** Or, we'd need N*N multiplications. ** ** With the 'fractal' method, we compute: ** a2b2 = 47*63 = 2961 ** (a2-b1)(b1-b2) = (47-11)(97-63) = 36*34 = 1224 ** a1b1 = 11*97 = 1067 ** ** 29 61 ** 29 61 ** 12 24 ** 10 67 ** 10 67 ** ----------- ** 30 13 62 67 ** ** We need only 3 multiplications, plus a few additions. And of course, ** additions are a lot simpler and faster than multiplications, so we ** end up ahead. ** ** The way it is written requires that the size to be a power of two. ** That's why the program itself requires the size to be a power of two. ** There is no actual requirement in the method, it's just how I did it ** because I would be working close to powers of two anyway, and it was ** easier. */ void FastMul(Short *prod, Short *a, Short *b, Indexer Len) { Indexer x, HLen; int SignA, SignB; Short *offset; Short *NextProd; /* ** This is the threshold where it's faster to do it the ordinary way ** On my computer, it's 16. Yours may be different. */ if (Len <= 16 ) { Mul(prod, a, b, Len); return; } HLen = Len/2; NextProd = prod + 2*Len; SignA = Sub(prod, a, a + HLen, HLen); if (SignA) { SignA = 1; Negate(prod, HLen); } SignB = Sub(prod + HLen, b + HLen, b, HLen); if (SignB) { SignB = 1; Negate(prod + HLen, HLen); } FastMul(NextProd, prod, prod + HLen, HLen); for (x = 0; x < 2 * Len; x++) prod[x] = 0; offset = prod + HLen; if (SignA == SignB) DoCarries(prod, offset - 1, Add(offset, offset, NextProd, Len)); else DoBorrows(prod, offset - 1, Sub(offset, offset, NextProd, Len)); FastMul(NextProd, a, b, HLen); offset = prod + HLen; DoCarries(prod, offset - 1, Add(offset, offset, NextProd, Len)); Add(prod, prod, NextProd, Len); FastMul(NextProd, a + HLen, b + HLen, HLen); offset = prod + HLen; DoCarries(prod, offset - 1, Add(offset, offset, NextProd, Len)); offset = prod + Len; DoCarries(prod, offset - 1, Add(offset, offset, NextProd, Len)); } /* ** Compute the reciprocal of 'a'. ** x[k+1] = x[k]*(2-a*x[k]) */ void Reciprocal(Short *r, Short *n, Indexer Len) { Indexer x, SubLen; int iter; double t; /* Estimate our first reciprocal */ for (x = 0; x < Len; x++) r[x] = 0; t = n[0] + n[1]*1.0e-4 + n[2]*1.0e-8; if (t == 0.0) t = 1; t = 1.0/t; r[0] = t; t = (t - floor(t))*10000.0; r[1] = t; t = (t - floor(t))*10000.0; r[2] = t; iter = log(SIZE)/log(2.0) + 1; SubLen = 1; while (iter--) { SubLen *= 2; if (SubLen > SIZE) SubLen = SIZE; FastMul(MulWork, n, r, SubLen); Round2x(Work1, MulWork, SubLen); MulWork[0] = 2; for (x = 1; x < Len; x++) MulWork[x] = 0; Sub(Work1, MulWork, Work1, SubLen); FastMul(MulWork, r, Work1, SubLen); Round2x(r, MulWork, SubLen); } } /* ** Computes the reciprocal of the square root of 'a' ** y[k+1] = y[k]*(3-a*y[k]^2)/2 ** ** ** We can save quite a bit of time by using part of our previous ** results! Since the number is converging onto a specific value, ** at least part of our previous result will be correct, so we ** can skip a bit of work. */ void RSqrt(Short *r, Short *n, Indexer Len, Indexer SubLen) { Indexer x; int iter; double t; /* Estimate our first reciprocal square root */ if (SubLen < 2 ) { for (x = 0; x < Len; x++) r[x] = 0; t = n[0] + n[1]*1.0e-4 + n[2]*1.0e-8; if (t == 0.0) t = 1; t = 1.0/sqrt(t); r[0] = t; t = (t - floor(t))*10000.0; r[1] = t; t = (t - floor(t))*10000.0; r[2] = t; } /* ** PrintShort2("\n ~R: ", r, SIZE); */ if (SubLen > SIZE) SubLen = SIZE; iter = SubLen; while (iter <= SIZE*2) { FastMul(MulWork, r, r, SubLen); Round2x(Work1, MulWork, SubLen); FastMul(MulWork, n, Work1, SubLen); Round2x(Work1, MulWork, SubLen); MulWork[0] = 3; for (x = 1; x < SubLen; x++) MulWork[x] = 0; Sub(Work1, MulWork, Work1, SubLen); FastMul(MulWork, r, Work1, SubLen); Round2x(r, MulWork, SubLen); DivBy2(r, SubLen); /* printf("%3d", SubLen); PrintShort2("R: ", r, SubLen); printf("%3d", SubLen); PrintShort2("R: ", r, Len); */ SubLen *= 2; if (SubLen > SIZE) SubLen = SIZE; iter *= 2; } } /* ** d = a/b by computing the reciprocal of b and then multiplying ** that by a. It's faster this way because the normal digit by ** digit method has N^2 growth, and this method will have the same ** growth as our multiplication method. */ void Divide(Short *d, Short *a, Short *b, Indexer Len) { Reciprocal(Work2, b, Len); FastMul(MulWork, a, Work2, Len); Round2x(d, MulWork, Len); } /* ** r = sqrt(n) by computing the reciprocal of the square root of ** n and then multiplying that by n */ void Sqrt(Short *r, Short *n, Indexer Len, Indexer SubLen) { RSqrt(Work2, n, Len, SubLen); FastMul(MulWork, n, Work2, Len); Round2x(r, MulWork, Len); } void usage(void) { puts("This program requires the number of digits/4 to calculate"); puts("This number must be a power of two."); exit(-1); } int main(int argc,char *argv[]) { Indexer x; Indexer SubLen; double Pow2, tempd, carryd; int AGMIter; int NeededIter; clock_t T0, T1, T2, T3; if (argc < 2) usage(); SIZE = atoi(argv[1]); if (!ispow2(SIZE)) usage(); a = (Short *)malloc(sizeof(Short)*SIZE); b = (Short *)malloc(sizeof(Short)*SIZE); c = (Short *)malloc(sizeof(Short)*SIZE); Work1 = (Short *)malloc(sizeof(Short)*SIZE); Work2 = (Short *)malloc(sizeof(Short)*SIZE); Work3 = (Short *)malloc(sizeof(Short)*SIZE); MulWork = (Short *)malloc(sizeof(Short)*SIZE*4); if ((a ==NULL) || (b == NULL) || (c == NULL) || (Work1 == NULL) || (Work2 == NULL) || (MulWork == NULL)) { printf("Unable to allocate memory\n");exit(1); } NeededIter = log(SIZE)/log(2.0) + 1; Pow2 = 4.0; AGMIter = NeededIter + 1; SubLen = 1; T0 = clock(); for (x = 0; x < SIZE; x++) a[x] = b[x] = c[x] = Work3[x] = 0; a[0] = 1; Work3[0] = 2; RSqrt(b, Work3, SIZE, 1); c[0] = 1; T1 = clock(); /* printf("AGMIter %d\n", AGMIter); PrintShort2("a AGM: ", a, SIZE); PrintShort2("b AGM: ", b, SIZE); PrintShort2("C sum: ", c, SIZE); */ while (AGMIter--) { Sub(Work1, a, b, SIZE); /* w = (a-b)/2 */ DivBy2(Work1, SIZE); FastMul(MulWork, Work1, Work1, SIZE); /* m = w*w */ Round2x(Work1, MulWork, SIZE); carryd = 0.0; /* m = m* w^(J+1) */ for (x = SIZE - 1; x >= 0; x--) { tempd = Pow2*Work1[x] + carryd; carryd = floor(tempd / 10000.0); Work1[x] = tempd - carryd*10000.0; } Pow2 *= 2.0; Sub(c, c, Work1, SIZE); /* c = c - m */ /* Save some time on last iter */ if (AGMIter != 0) FastMul(MulWork, a, b, SIZE); /* m = a*b */ Add(a, a, b, SIZE); /* a = (a+b)/2 */ DivBy2(a, SIZE); if (AGMIter != 0) { Round2x(Work3, MulWork, SIZE); Sqrt(b, Work3, SIZE, SubLen); /* b = sqrt(a*b) */ } /* printf("AGMIter %d\n", AGMIter); PrintShort2("a AGM: ", a, SIZE); PrintShort2("b AGM: ", b, SIZE); PrintShort2("C sum: ", c, SIZE); */ SubLen *= 2;if (SubLen > SIZE) SubLen = SIZE; } T2 = clock(); FastMul(MulWork, a, a, SIZE); Round2x(a, MulWork, SIZE); carryd = 0.0; for (x = SIZE - 1; x >= 0; x--) { tempd = 4.0*a[x] + carryd; carryd = floor(tempd / 10000.0); a[x] = tempd - carryd*10000.0; } Divide(b, a, c, SIZE); T3 = clock(); PrintShort("\nPI = ", b, SIZE); printf("\nTotal Execution time: %12.4lf\n", (double)(T3 - T0) / CLOCKS_PER_SEC); printf("Setup time : %12.4lf\n", (double)(T1 - T0) / CLOCKS_PER_SEC); printf("AGM Calculation time: %12.4lf\n", (double)(T2 - T1) / CLOCKS_PER_SEC); printf("Post calc time : %12.4lf\n", (double)(T3 - T2) / CLOCKS_PER_SEC); return 0; }