If you haven't seen Ramanujan's formula before, it looks like this:
$$ \Large \frac{1}{\pi} = \frac{2\sqrt{2}}{9801} \sum_{k=0}^{\infty} \frac{(4k)!}{k!^4} \frac{1103 + 26390k}{396^{4k}} $$
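Even the very first term is remarkably accurate. Keeping only the \(k = 0\) term (a quick sanity check of my own):
$$ \pi \approx \frac{9801}{2\sqrt{2} \cdot 1103} \approx 3.14159273 $$
which already matches \(\pi\) to six decimal places; every further term contributes roughly eight more correct digits.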
My first attempt at calculating Pi used Ramanujan's formula in basically pure C:
#include <stdio.h>
#include <math.h>

/* x! computed through the gamma function: x! = gamma(x + 1) */
long double
factorial (int x)
{
  return tgammal (x + 1);
}

int
main ()
{
  long double pi;
  int k;
  long double summation;

  summation = 0;
  for (k = 0; k < 15; k++)
    {
      printf ("%d\n", k);
      printf ("%Lf\n", summation);
      /* add the k-th term: (4k)!/(k!)^4 * (1103 + 26390k) / 396^(4k) */
      summation = summation + (factorial (4 * k) / (powl (factorial (k), 4))
                               * (1103 + 26390 * k) / (powl (396, 4 * k)));
    }
  /* the series converges to 1/pi, so invert the result */
  pi = 1.0L / (((2 * sqrtl (2)) / 9801) * summation);
  printf ("%Lf\n", pi);
  return 0;
}
You will quickly notice an issue with this program if you compile and run it on your system:
$ gcc unoptimized.c -o unoptimized -lm
$ ./unoptimized
....
3.141593
Even though we run quite a few iterations of Ramanujan's formula (which produces ~8 digits per iteration), we only get 6 decimal places of accuracy. Part of that is cosmetic (%Lf prints 6 decimal places by default), but the real ceiling is the long double type itself, which holds at best around 18-19 significant digits on a typical x86 machine. In order to improve this, we are going to need GMP (the GNU Multiple Precision Arithmetic Library).
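You can see that ceiling directly with a tiny test (a sketch of my own, assuming a typical x86 system where long double is 80-bit extended precision):
#include <stdio.h>

int main ()
{
  /* the first 33 decimals of pi as a long double literal */
  long double pi = 3.141592653589793238462643383279502L;
  /* ask for 30 decimal places: only the first ~18-19 significant
     digits survive; the rest is rounding noise */
  printf ("%.30Lf\n", pi);
  return 0;
}
No amount of extra iterations will push past that limit, which is exactly the problem GMP and MPFR solve.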
The first thing I needed to get right was correct rounding for floating-point numbers; for this I will be using the GNU MPFR library (for multiple-precision floating-point computation).
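If you have not used MPFR before, the basic pattern is always the same: initialise each variable with a precision in bits, perform every operation with an explicit rounding mode, then print and free. Here is a minimal warm-up of my own (not part of the final program) that computes the square root of 2 to 256 bits:
#include <stdio.h>
#include <mpfr.h>

int main()
{
    mpfr_t x;
    mpfr_init2(x, 256);           // 256 bits of precision
    mpfr_set_ui(x, 2, MPFR_RNDD); // x = 2
    mpfr_sqrt(x, x, MPFR_RNDD);   // x = sqrt(2), rounding down
    mpfr_out_str(stdout, 10, 0, x, MPFR_RNDD);
    putchar('\n');
    mpfr_clear(x);
    mpfr_free_cache();
    return 0;
}
With that pattern in mind, here is the full program: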
// ramanujan-gmp.c
#include <stdlib.h>
#include <stdio.h>
#include <gmp.h>
#include <mpfr.h>

const int acc = 5000; // precision of every variable, in bits
const int rev = 5000; // number of terms of the series to sum

int main()
{
    printf("LOADED\n");
    unsigned int i;
    mpfr_t s, t, u, d, g;

    // t = (2 * sqrt(2)) / 9801, the constant in front of the sum
    mpfr_init2(t, acc);
    mpfr_set_d(t, 2.0, MPFR_RNDD);
    mpfr_sqrt(t, t, MPFR_RNDD);
    mpfr_mul_ui(t, t, 2, MPFR_RNDD);
    mpfr_div_ui(t, t, 9801, MPFR_RNDD);

    mpfr_init2(u, acc);
    mpfr_set_d(u, 0, MPFR_RNDD);
    mpfr_init2(d, acc);
    mpfr_set_d(d, 0, MPFR_RNDD);
    mpfr_init2(g, acc);
    mpfr_set_d(g, 0, MPFR_RNDD);
    mpfr_init2(s, acc);
    mpfr_set_d(s, 0, MPFR_RNDD);

    for (i = 0; i < rev + 1; i++)
    {
        // u = (4k)!
        mpfr_fac_ui(u, 4 * i, MPFR_RNDD);
        // d = (k!)^4
        mpfr_fac_ui(d, i, MPFR_RNDD);
        mpfr_pow_ui(d, d, 4, MPFR_RNDD);
        // u = (4k)!/((k!)^4)
        mpfr_div(u, u, d, MPFR_RNDD);
        // d = 26390k + 1103
        mpfr_set_d(d, 26390, MPFR_RNDD);
        mpfr_mul_ui(d, d, i, MPFR_RNDD);
        mpfr_add_ui(d, d, 1103, MPFR_RNDD);
        // g = 396^(4k)
        mpfr_set_d(g, 4, MPFR_RNDD);
        mpfr_mul_ui(g, g, i, MPFR_RNDD);
        mpfr_ui_pow(g, 396, g, MPFR_RNDD);
        // d = (26390k + 1103) / (396^(4k))
        mpfr_div(d, d, g, MPFR_RNDD);
        // u = (4k)!/((k!)^4) * (26390k + 1103) / (396^(4k))
        mpfr_mul(u, u, d, MPFR_RNDD);
        // s = s + u
        mpfr_add(s, s, u, MPFR_RNDD);
    }

    // pi = 1 / (t * s)
    mpfr_mul(t, t, s, MPFR_RNDD);
    mpfr_ui_div(t, 1, t, MPFR_RNDD);
    printf("PI is ");
    mpfr_out_str(stdout, 10, 0, t, MPFR_RNDD);
    putchar('\n');

    // free everything we initialised
    mpfr_clear(s);
    mpfr_clear(t);
    mpfr_clear(u);
    mpfr_clear(d);
    mpfr_clear(g);
    mpfr_free_cache();
    return 0;
}
A key disadvantage is instantly visible: readability has plummeted massively, to such an extent that I have added step-by-step comments for most of the maths.
Running this program results in:
$ gcc ramanujan-gmp.c -o ramanujan-gmp -lm -lgmp -lmpfr
$ ./ramanujan-gmp
LOADED
PI is 3.141592653589793238462643383279502884197169399375105820974944592307816406286208998628034825342117067982148086513282306647093844609550582231725359408128481117450284102701938521105559644622948954930381964428810975665933446128475648233786783165271201909145648566923460348610454326648213393607260249141273724587006606315588174881520920962829254091715364367892590360011330530548820466521384146951941511609433057270365759591953092186117381932611793105118548074462379962749567351885752724891227938183011949129833673362440656643086021394946395224737190702179860943702770539217176293176752384674818467669405132000568127145263560827785771342757789609173637178721468440901224953430146549585371050792279689258923542019956112129021960864034418159813629774771309960518707211349999998372978049951059731732816096318595024459455346908302642522308253344685035261931188171010003137838752886587533208381420617177669147303598253490428755468731159562863882353787593751957781857780532171226806613001927876611195909216420198938095257201065485863278865936153381827968230301952035301852968995773622599413891249721775283479131515574857242454150695950829533116861727855889075098381754637464939319255060400927701671139009848824012858361603563707660104710181942955596198946767837449448255379774726847104047534646208046684259069491293313677028989152104752162056966024058038150193511253382430035587640247496473263914199272604269922796782354781636009341721641219924586315030286182974555706749838505494588586926995690927210797509302955325168
This is clearly much better, as we are getting a proper result. The issue is that it takes a long time, and because Ramanujan's formula runs in \(O(n \log(n)^3)\) time, it starts to slow down extremely quickly. So, how do we improve this?
Meet multithreading!
I chose to use OpenMP because it is well designed and for loops can be parallelised extremely easily!
// ramanujan-omp.c
#include <stdlib.h>
#include <stdio.h>
#include <gmp.h>
#include <mpfr.h>
#include <omp.h>

const int acc = 500;   // precision of every variable, in bits
const int rev = 50000; // number of terms of the series to sum

int main()
{
    printf("LOADED\n");
    unsigned int i;
    unsigned int progress = 0;
    mpfr_t s2, t;

    // t = (2 * sqrt(2)) / 9801, the constant in front of the sum
    mpfr_init2(t, acc);
    mpfr_set_d(t, 2.0, MPFR_RNDD);
    mpfr_sqrt(t, t, MPFR_RNDD);
    mpfr_mul_ui(t, t, 2, MPFR_RNDD);
    mpfr_div_ui(t, t, 9801, MPFR_RNDD);

    // s2 collects the total sum, so it must be initialised exactly
    // once, before the parallel region, not by every thread at once
    mpfr_init2(s2, acc);
    mpfr_set_d(s2, 0, MPFR_RNDD);

    #pragma omp parallel shared(s2, progress)
    {
        // every thread gets its own working variables and partial sum
        mpfr_t s, u, d, g;
        mpfr_init2(u, acc);
        mpfr_set_d(u, 0, MPFR_RNDD);
        mpfr_init2(d, acc);
        mpfr_set_d(d, 0, MPFR_RNDD);
        mpfr_init2(g, acc);
        mpfr_set_d(g, 0, MPFR_RNDD);
        mpfr_init2(s, acc);
        mpfr_set_d(s, 0, MPFR_RNDD);

        #pragma omp for
        for (i = 0; i < rev + 1; i++) {
            // u = (4k)!
            mpfr_fac_ui(u, 4 * i, MPFR_RNDD);
            // d = (k!)^4
            mpfr_fac_ui(d, i, MPFR_RNDD);
            mpfr_pow_ui(d, d, 4, MPFR_RNDD);
            // u = (4k)!/((k!)^4)
            mpfr_div(u, u, d, MPFR_RNDD);
            // d = 26390k + 1103
            mpfr_set_d(d, 26390, MPFR_RNDD);
            mpfr_mul_ui(d, d, i, MPFR_RNDD);
            mpfr_add_ui(d, d, 1103, MPFR_RNDD);
            // g = 396^(4k)
            mpfr_set_d(g, 4, MPFR_RNDD);
            mpfr_mul_ui(g, g, i, MPFR_RNDD);
            mpfr_ui_pow(g, 396, g, MPFR_RNDD);
            // d = (26390k + 1103) / (396^(4k))
            mpfr_div(d, d, g, MPFR_RNDD);
            // u = (4k)!/((k!)^4) * (26390k + 1103) / (396^(4k))
            mpfr_mul(u, u, d, MPFR_RNDD);
            // s = s + u (this thread's partial sum)
            mpfr_add(s, s, u, MPFR_RNDD);
            // the progress counter is shared, so update and read it atomically
            unsigned int p;
            #pragma omp atomic capture
            p = ++progress;
            printf("%u\n", p);
        }
        #pragma omp critical
        {
            // add each thread's partial sum to the total sum
            mpfr_add(s2, s2, s, MPFR_RNDD);
        }
        mpfr_clear(s);
        mpfr_clear(u);
        mpfr_clear(d);
        mpfr_clear(g);
        mpfr_free_cache(); // each thread frees its own MPFR caches
    }

    mpfr_mul(t, t, s2, MPFR_RNDD);
    mpfr_ui_div(t, 1, t, MPFR_RNDD);
    printf("PI is ");
    mpfr_out_str(stdout, 10, 0, t, MPFR_RNDD);
    putchar('\n');
    mpfr_clear(t);
    mpfr_clear(s2);
    return 0;
}
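This version needs GCC's -fopenmp flag on top of the libraries from before (assuming you saved it as ramanujan-omp.c, the name in the comment above):
$ gcc ramanujan-omp.c -o ramanujan-omp -fopenmp -lm -lgmp -lmpfr
$ ./ramanujan-omp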
This basically works by taking the whole summation and splitting it across as many threads as your CPU can run. Why is this useful? If you are on a system with lots of resources (like me, I have a Ryzen 7 2700X), you can use every single core of your processor to do this maths!
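The same private-partial-sum pattern works for any reduction, not just MPFR sums. Here is a minimal sketch with plain doubles (a toy example of my own, summing the Basel series \(\sum 1/k^2 = \pi^2/6\) instead of Ramanujan's):
#include <stdio.h>
#include <omp.h>

int main()
{
    double total = 0.0;
    #pragma omp parallel
    {
        double partial = 0.0; // each thread's private partial sum
        #pragma omp for
        for (int k = 1; k <= 1000000; k++)
            partial += 1.0 / ((double)k * k);
        #pragma omp critical
        total += partial;     // merge partial sums one thread at a time
    }
    printf("%f\n", total);    // prints ~1.644933, close to pi^2/6
    return 0;
}
Compile it with gcc -fopenmp, and the loop is automatically split across every core, exactly as in the big program above.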
Thanks for reading! And happy coding!