Skip to content

Fast chained content #1947

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 6 commits into
base: main
Choose a base branch
from
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
487 changes: 463 additions & 24 deletions src/fmpz_vec/content_chained.c
Original file line number Diff line number Diff line change
@@ -1,59 +1,498 @@
/*
Copyright 1994, 1996, 2000, 2001, 2009, 2012, 2019 Free Software Foundation, Inc.
Copyright (C) 2020 Daniel Schultz
Copyright (C) 2021 Fredrik Johansson
Copyright (C) 2024 Albin Ahlbäck
This file is part of FLINT.
Contains code from mpn_gcd_1 in GMP 6.3.0.
FLINT is free software: you can redistribute it and/or modify it under
the terms of the GNU Lesser General Public License (LGPL) as published
by the Free Software Foundation; either version 3 of the License, or
(at your option) any later version. See <https://www.gnu.org/licenses/>.
*/

#include "mpn_extras.h"
#include "fmpz.h"
#include "fmpz_vec.h"

/* Calculating the GCD is a very costly operation.
Hence, our attempt is to find a single-limbed integer of which we can start
the process with, because calculating the GCD of {u, n} and {v, 1} is much
more effective than calculating if of two n-limbed integers.
Initial strategy to find a single-limbed integer:
* `ip` will often be small -- start by checking its total size.
* `ip` may contain factors of two -- check if reducing it leads to a single
limb.
* `vp` may contain single-limbed integers at specific places -- imagine it
representing a polynomial; many polynomials are small at the ends.
*/

#define SIZ(x) ((x)->_mp_size)
#define PTR(x) ((x)->_mp_d)

typedef struct
{
mp_srcptr d;
ulong sz;
}
mpn_struct;

typedef mpn_struct * mpn_ptr;

FLINT_STATIC_NOINLINE void heavy_machinery(fmpz_t, const fmpz *, slong, const fmpz_t);

/* In this function we only take care of small inputs. If none is found, we go
* to heavy_machinery instead. */
void
_fmpz_vec_content_chained(fmpz_t res, const fmpz * vec, slong len, const fmpz_t in)
_fmpz_vec_content_chained(fmpz_t rp, const fmpz * vp, slong vn, const fmpz_t ip)
{
while (len > 0 && fmpz_is_zero(vec + 0))
slong jx = -1, kx;
ulong gd; /* single limb gcd */
int exp;
mpz_srcptr mp;
slong msz;
mp_srcptr md;

#if FLINT_WANT_ASSERT
gd = 0;
#endif

/* We will assume that vn >= 2 for everything that will come */
if (vn < WORD(2))
{
len--;
vec++;
if (ip == NULL)
fmpz_abs(rp, vp + 0);
else
fmpz_gcd(rp, vp + 0, ip);

return;
}

while (len > 1 && fmpz_is_zero(vec + len - 1))
len--;
/* Check if `ip` is a single limb */
if (ip != NULL)
{
fmpz tip = *ip; /* Read-only */

if (len == 0)
if (!COEFF_IS_MPZ(tip))
{
if (tip == WORD(0))
{
ip = NULL;
goto check_vp_may_be_zero;
}

gd = FLINT_ABS(tip);
goto ip1;
}
else
{
mp = COEFF_TO_PTR(tip);
msz = FLINT_ABS(SIZ(mp));

if (msz == 1)
{
gd = PTR(mp)[0];
goto ip1;
}
}
}

check_vp_may_be_zero: /* Fast checks only. Result can be zero. */
for (jx = 0; jx < vn; jx++)
{
fmpz_abs(res, in);
return;
if (!fmpz_is_zero(vp + jx))
goto check_vp_not_zero;
else if (!COEFF_IS_MPZ(vp[jx]))
goto found_small;
}

if (len == 1)
if (ip == NULL)
fmpz_zero(rp);
else
fmpz_abs(rp, ip);

return;

check_vp_not_zero: /* Fast checks only. Result cannot be zero from this point onwards. */
for (jx++; jx < vn; jx++)
if (!fmpz_is_zero(vp + jx) && !COEFF_IS_MPZ(vp[jx]))
goto found_small;

/* Alright, bring it on */
heavy_machinery(rp, vp, vn, ip);
return;

found_small: /* found small inside vp */
gd = FLINT_ABS(vp[jx]);

reduce_gd: /* gd is set, but we need to reduce it */
exp = flint_ctz(gd);

/* Take care of ip before next step */
if (ip != NULL)
{
fmpz_gcd(res, vec + 0, in);
return;
slong tsz;
mp = COEFF_TO_PTR(*ip); /* ip cannot be small */
tsz = msz = FLINT_ABS(SIZ(mp));
md = PTR(mp);

/* Remove trailing zero limbs */
while (*md == UWORD(0))
{
md++;
msz--;
}

if (tsz != msz)
/* keep exp */;
else
exp = FLINT_MIN(exp, flint_ctz(md[0]));

gd = mpn_gcd_1(md, msz, gd);

if (gd == UWORD(1))
goto gd_eq_1;
}

if (fmpz_is_pm1(in) || fmpz_is_pm1(vec + 0) || fmpz_is_pm1(vec + len - 1))
goto compute;

ip1: /* We found ip to be small */
/* gd is set but unreduced and exp is not set */
FLINT_ASSERT(gd != UWORD(0));
exp = flint_ctz(gd);
gd >>= exp;

if (gd == UWORD(1))
goto gd_eq_1;

compute: /* gd is set and ip has been taken care of */
FLINT_ASSERT(gd != UWORD(0));
FLINT_ASSERT(gd != UWORD(1));
for (kx = 0; kx < vn; kx++)
{
fmpz_one(res);
return;
FLINT_ASSERT(gd & 1 == 1);

/* If kx == jx, we have already taken care of this entry */
if (fmpz_is_zero(vp + kx) || kx == jx)
continue;

if (gd == UWORD(1))
goto gd_eq_1;

if (!COEFF_IS_MPZ(vp[kx]))
{
ulong td = FLINT_ABS(vp[kx]);
int c;

c = flint_ctz(td);
td >>= c;

exp = FLINT_MIN(exp, c);

/* Make td bigger */
if (td < gd)
FLINT_SWAP(ulong, td, gd);

/* Now td >= gd */
/* If much bigger, reduce via remainder first */
if ((td >> 16) > gd)
{
td %= gd;
if (td == UWORD(0)) /* td was a multiple of gd */
continue;

c = flint_ctz(td);
td >>= c;
}

gd = mpn_gcd_11(td, gd);
}
else
{
mp = COEFF_TO_PTR(vp[kx]);
msz = FLINT_ABS(SIZ(mp));
md = PTR(mp);

gd = mpn_gcd_1(md, msz, gd);
}
}

goto fin;

gd_eq_1: /* gd == 1, but we may need to check for common factors of 2 */
FLINT_ASSERT(gd == UWORD(1));

if (exp == 0)
goto fin;

for (; kx < vn; kx++)
{
if (fmpz_is_zero(vp + kx))
continue;

if (!COEFF_IS_MPZ(vp[kx]))
{
exp = FLINT_MIN(flint_ctz(FLINT_ABS(vp[kx])), exp);

if (exp == 0)
goto fin;
}
else
{
mp = COEFF_TO_PTR(vp[kx]);
md = PTR(mp);

if (*md == UWORD(0))
continue;

exp = FLINT_MIN(flint_ctz(*md), exp);

if (exp == 0)
goto fin;
}
}

fin:
FLINT_ASSERT(gd != UWORD(0));
fmpz_set_ui(rp, gd << exp);
return;
}

/* Everything is an mpz */
FLINT_STATIC_NOINLINE
void heavy_machinery(fmpz_t rp, const fmpz * vp, slong vn, const fmpz_t ip)
{
mpn_ptr mvp;
ulong limbs = UWORD_MAX, exp;
slong idx0, idx1;
ulong sz0 = UWORD_MAX, sz1 = UWORD_MAX;

ulong a0;

slong ix;

if (vn < WORD(2))
FLINT_UNREACHABLE;

mvp = flint_malloc(sizeof(mpn_struct) * (vn + (ip != NULL)));

/* Start by adding entries to mvp */
if (ip != NULL)
{
mpz_srcptr mp;
mp_srcptr md;
ulong sz, tz;

mp = COEFF_TO_PTR(*ip);
tz = sz = FLINT_ABS(SIZ(mp));
md = PTR(mp);

while (*md == UWORD(0))
{
md++;
sz--;
}

limbs = tz - sz;
mvp[0].d = md;
mvp[0].sz = sz;
mvp++;

sz0 = sz;
idx0 = -1;
}

fmpz_gcd3(res, vec + 0, vec + len - 1, in);
vec++;
len -= 2;
for (ix = 0, vp += vn - 1; ix < vn; vp--)
{
mpz_srcptr mp;
mp_srcptr md;
ulong sz, tz;

while (len >= 2 && !fmpz_is_one(res))
if (fmpz_is_zero(vp))
{
vn--;
continue;
}

mp = COEFF_TO_PTR(*vp);
tz = sz = FLINT_ABS(SIZ(mp));
md = PTR(mp);

while (*md == UWORD(0))
{
md++;
sz--;
}

limbs = FLINT_MIN(limbs, tz - sz);
mvp[ix].d = md;
mvp[ix].sz = sz;
ix++;

if (sz < sz0)
{
sz1 = sz0;
sz0 = sz;
idx1 = idx0;
idx0 = ix;
}
else if (sz < sz1)
{
sz1 = sz;
idx1 = ix;
}
}

/* Check if we can fast-track the initial GCD */
if (sz1 == UWORD(1))
{
fmpz_gcd3(res, vec + 0, vec + len - 1, res);
vec++;
len -= 2;
/* 1 = sz0 = sz1 */
ulong b0;
ulong bexp;

a0 = mvp[idx0].d[0];
b0 = mvp[idx1].d[0];

exp = flint_ctz(a0);
bexp = flint_ctz(b0);

a0 >>= exp;
b0 >>= bexp;
exp = FLINT_MIN(exp, bexp);

if (b0 < a0)
FLINT_SWAP(ulong, a0, b0);

/* Now b0 >= a0 */
/* If much bigger, reduce via remainder first */
if ((b0 >> 16) > a0)
{
b0 %= a0;
if (b0 == UWORD(0)) /* b0 was a multiple of a0 */
goto G1;

b0 >>= flint_ctz(b0);
}

a0 = mpn_gcd_11(b0, a0);

if (a0 == UWORD(1))
goto a0_eq_1;

goto G1;
}
else if (sz0 == UWORD(1))
{
/* 1 = sz0 < sz1 */
ulong bexp;
mp_srcptr bd;
ulong bn;

a0 = mvp[idx0].d[0];
bd = mvp[idx1].d;
bn = mvp[idx1].sz;

exp = flint_ctz(a0);
bexp = flint_ctz(bd[0]);

a0 >>= exp;
exp = FLINT_MIN(exp, bexp);

a0 = mpn_gcd_1(bd, bn, a0);

if (a0 == UWORD(1))
goto a0_eq_1;

goto G1;
}
else if (sz1 == UWORD(2))
{
/* 2 = sz0 = sz1 */
ulong a1, b0, b1;
ulong bexp;
mp_limb_pair_t a;

a0 = mvp[idx0].d[0];
a1 = mvp[idx0].d[1];
b0 = mvp[idx1].d[0];
b1 = mvp[idx1].d[1];

exp = flint_ctz(a0);
bexp = flint_ctz(b0);

a0 = (a0 >> exp) | (a1 << (FLINT_BITS - exp));
a1 = (a1 >> exp);
b0 = (b0 >> bexp) | (b1 << (FLINT_BITS - bexp));
b1 = (b1 >> bexp);

a = mpn_gcd_22(b1, b0, a1, a0);
a1 = a.m2;
a0 = a.m1;

if (a1 == UWORD(0))
{
if (a0 == UWORD(1))
goto a0_eq_1;
else
goto G1;
}

goto G2;
}
else if (sz0 == UWORD(2))
{
/* 2 = sz0 < sz1 */
/* Try to reduce `a` to a single limb */
ulong a1;
a0 = mvp[idx0].d[0];
a1 = mvp[idx0].d[1];

exp = flint_ctz(a0);

a0 = (a0 >> exp) | (a1 << (FLINT_BITS - exp));
a1 = (a1 >> exp);

if (a1 == UWORD(0))
{
exp = FLINT_MIN(flint_ctz(mvp[idx1].d[0]), exp);
a0 = mpn_gcd_1(mvp[idx1].d, mvp[idx1].sz, a0);

if (a0 == UWORD(1))
goto a0_eq_1;
else
goto G1;
}
else
goto G2;
}

if (len != 0 && !fmpz_is_one(res))
fmpz_gcd(res, res, vec + 0);
Gn: /* TODO:
0. Allocate temporary space for two integers -- one for the output (say
OP) and one for the input (say IP),
1. Take the smallest sample we have (given at idx0), and right shift it
into OP,
2. Take the next smallest sample and right shift it into IP,
3. Compute OP <- GCD(IP, OP),
4. If we are at the end, exit.
5. If OP can fit in a limb, go to G1.
6. If OP can fit in two limbs, go to G2.
7. Take the next sample (no specific order), right shift it into IP and
go to step 3. */

G2: /* TODO: Do mpn_divrem_2 and continue with mpn_gcd_22 */

G1: for (ix = -1; ix < vn; ix++)
;

a0_eq_1: /* a0 == 1, but we may need to check for common factors of 2 */
FLINT_ASSERT(a0 == UWORD(1));

end:

flint_free(mvp - (ip != NULL));
}
4 changes: 4 additions & 0 deletions src/mpn_extras.h
Original file line number Diff line number Diff line change
@@ -785,9 +785,13 @@ int flint_mpn_factor_trial_tree(slong * factors, mp_srcptr x, mp_size_t xsize, s

/* greatest common divisor ***************************************************/

#define mpn_gcd_22 __gmpn_gcd_22

mp_size_t flint_mpn_gcd_full2(mp_ptr gp, mp_srcptr ap, mp_size_t an, mp_srcptr bp, mp_size_t bn, mp_ptr scr);
mp_size_t flint_mpn_gcd_full(mp_ptr gp, mp_srcptr ap, mp_size_t an, mp_srcptr bp, mp_size_t bn);

mp_limb_pair_t mpn_gcd_22(mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t);

/* modular arithmetic ********************************************************/

void flint_mpn_mulmod_preinv1(mp_ptr r, mp_srcptr a, mp_srcptr b, mp_size_t n, mp_srcptr d, mp_limb_t dinv, ulong norm);