/*=============================================================================

    This file is part of FLINT.

    FLINT is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    FLINT is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with FLINT; if not, write to the Free Software
    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA

=============================================================================*/
/******************************************************************************

    Copyright (C) 2010 William Hart
    Copyright (C) 2011 Fredrik Johansson
    Copyright (C) 2011 Sebastian Pancratz

******************************************************************************/

*******************************************************************************

    Helper functions

*******************************************************************************

int signed_mpn_sub_n(mp_ptr res, mp_srcptr op1, mp_srcptr op2, slong n)

    If \code{op1 >= op2} return 0 and set \code{res} to \code{op1 - op2}
    else return 1 and set \code{res} to \code{op2 - op1}.

*******************************************************************************

    Memory management

*******************************************************************************

void nmod_poly_init(nmod_poly_t poly, mp_limb_t n)

    Initialises \code{poly}. It will have coefficients modulo~$n$.

void nmod_poly_init_preinv(nmod_poly_t poly, mp_limb_t n, mp_limb_t ninv)

    Initialises \code{poly}. It will have coefficients modulo~$n$.
    The caller supplies a precomputed inverse limb generated by
    \code{n_preinvert_limb()}.

void nmod_poly_init2(nmod_poly_t poly, mp_limb_t n, slong alloc)

    Initialises \code{poly}. It will have coefficients modulo~$n$.
    Up to \code{alloc} coefficients may be stored in \code{poly}.

void nmod_poly_init2_preinv(nmod_poly_t poly,
                       mp_limb_t n, mp_limb_t ninv, slong alloc)

    Initialises \code{poly}. It will have coefficients modulo~$n$.
    The caller supplies a precomputed inverse limb generated by
    \code{n_preinvert_limb()}. Up to \code{alloc} coefficients may
    be stored in \code{poly}.

void nmod_poly_realloc(nmod_poly_t poly, slong alloc)

    Reallocates \code{poly} to the given length. If the current
    length is greater than \code{alloc}, the polynomial is truncated
    and normalised.  If \code{alloc} is zero, the polynomial is
    cleared.

void nmod_poly_clear(nmod_poly_t poly)

    Clears the polynomial and releases any memory it used. The polynomial
    cannot be used again until it is initialised.

void nmod_poly_fit_length(nmod_poly_t poly, slong alloc)

    Ensures \code{poly} has space for at least \code{alloc} coefficients.
    This function only ever grows the allocated space, so no data loss can
    occur.

void _nmod_poly_normalise(nmod_poly_t poly)

    Internal function for normalising a polynomial so that the top
    coefficient, if there is one at all, is not zero.

*******************************************************************************

    Polynomial properties

*******************************************************************************

slong nmod_poly_length(const nmod_poly_t poly)

    Returns the length of the polynomial \code{poly}. The zero polynomial
    has length zero.

slong nmod_poly_degree(const nmod_poly_t poly)

    Returns the degree of the polynomial \code{poly}. The zero polynomial
    is deemed to have degree~$-1$.

mp_limb_t nmod_poly_modulus(const nmod_poly_t poly)

    Returns the modulus of the polynomial \code{poly}. This will be a
    positive integer.

mp_bitcnt_t nmod_poly_max_bits(const nmod_poly_t poly)

    Returns the maximum number of bits of any coefficient of \code{poly}.

*******************************************************************************

    Assignment and basic manipulation

*******************************************************************************

void nmod_poly_set(nmod_poly_t a, const nmod_poly_t b)

    Sets \code{a} to a copy of \code{b}.

void nmod_poly_swap(nmod_poly_t poly1, nmod_poly_t poly2)

    Efficiently swaps \code{poly1} and \code{poly2} by swapping pointers
    internally.

void nmod_poly_zero(nmod_poly_t res)

    Sets \code{res} to the zero polynomial.

void nmod_poly_truncate(nmod_poly_t poly, slong len)

    Truncates \code{poly} to the given length and normalises it.
    If \code{len} is greater than the current length of \code{poly},
    then nothing happens.

void _nmod_poly_reverse(mp_ptr output, mp_srcptr input, slong len, slong m)

    Sets \code{output} to the reverse of \code{input}, which is of length
    \code{len}, but thinking of it as a polynomial of length~\code{m},
    notionally zero-padded if necessary. The length~\code{m} must be
    non-negative, but there are no other restrictions. The polynomial
    \code{output} must have space for \code{m} coefficients.

void nmod_poly_reverse(nmod_poly_t output, const nmod_poly_t input, slong m)

    Sets \code{output} to the reverse of \code{input}, thinking of it as
    a polynomial of length~\code{m}, notionally zero-padded if necessary.
    The length~\code{m} must be non-negative, but there are no other
    restrictions. The output polynomial will be set to length~\code{m}
    and then normalised.

*******************************************************************************

    Randomization

*******************************************************************************

void nmod_poly_randtest(nmod_poly_t poly, flint_rand_t state, slong len)

    Generates a random polynomial with length up to \code{len}.

void
nmod_poly_randtest_irreducible(nmod_poly_t poly, flint_rand_t state, slong len)

    Generates a random irreducible polynomial with length up to \code{len}.

void nmod_poly_randtest_monic(nmod_poly_t poly, flint_rand_t state, slong len)

    Generates a random monic polynomial with length \code{len}.

void
nmod_poly_randtest_monic_irreducible(nmod_poly_t poly, flint_rand_t state,
                                     slong len)

    Generates a random monic irreducible polynomial with length \code{len}.


void
nmod_poly_randtest_trinomial(nmod_poly_t poly, flint_rand_t state, slong len)

    Generates a random monic trinomial of length \code{len}.

int
nmod_poly_randtest_trinomial_irreducible(nmod_poly_t poly, flint_rand_t state,
                                         slong len, slong max_attempts)

    Attempts to set \code{poly} to a monic irreducible trinomial of
    length \code{len}.  It will generate up to \code{max_attempts}
    trinomials in an attempt to find an irreducible one.  If
    \code{max_attempts} is \code{0}, then it will keep generating
    trinomials until an irreducible one is found.  Returns $1$ if one
    is found and $0$ otherwise.

void
nmod_poly_randtest_pentomial(nmod_poly_t poly, flint_rand_t state, slong len)

    Generates a random monic pentomial of length \code{len}.

int
nmod_poly_randtest_pentomial_irreducible(nmod_poly_t poly, flint_rand_t state,
                                         slong len, slong max_attempts)

    Attempts to set \code{poly} to a monic irreducible pentomial of
    length \code{len}.  It will generate up to \code{max_attempts}
    pentomials in an attempt to find an irreducible one.  If
    \code{max_attempts} is \code{0}, then it will keep generating
    pentomials until an irreducible one is found.  Returns $1$ if one
    is found and $0$ otherwise.

void
nmod_poly_randtest_sparse_irreducible(nmod_poly_t poly, flint_rand_t state,
                                      slong len)

    Attempts to set \code{poly} to a sparse, monic irreducible polynomial
    with length \code{len}.  It attempts to find an irreducible
    trinomial.  If that does not succeed, it attempts to find an
    irreducible pentomial.  If that fails, then \code{poly} is just
    set to a random monic irreducible polynomial.

*******************************************************************************

    Getting and setting coefficients

*******************************************************************************

ulong nmod_poly_get_coeff_ui(const nmod_poly_t poly, slong j)

    Returns the coefficient of \code{poly} at index~\code{j}, where
    coefficients are numbered with zero being the constant coefficient,
    and returns it as an \code{ulong}. If \code{j} refers to a
    coefficient beyond the end of \code{poly}, zero is returned.

void nmod_poly_set_coeff_ui(nmod_poly_t poly, slong j, ulong c)

    Sets the coefficient of \code{poly} at index \code{j}, where
    coefficients are numbered with zero being the constant coefficient,
    to the value \code{c} reduced modulo the modulus of \code{poly}.
    If \code{j} refers to a coefficient beyond the current end of \code{poly},
    the polynomial is first resized, with intervening coefficients being
    set to zero.

*******************************************************************************

    Input and output

*******************************************************************************

char * nmod_poly_get_str(const nmod_poly_t poly)

    Writes \code{poly} to a string representation. The format is as
    described for \code{nmod_poly_print()}. The string must be freed by the
    user when finished. For this it is sufficient to call \code{flint_free()}.

int nmod_poly_set_str(nmod_poly_t poly, const char * s)

    Reads \code{poly} from a string \code{s}. The format is as described
    for \code{nmod_poly_print()}. If a polynomial in the correct format
    is read, a positive value is returned, otherwise a non-positive value
    is returned.

int nmod_poly_print(const nmod_poly_t a)

    Prints the polynomial to \code{stdout}. The length is printed,
    followed by a space, then the modulus. If the length is zero this is
    all that is printed, otherwise two spaces followed by a space
    separated list of coefficients is printed, beginning with the constant
    coefficient.

    In case of success, returns a positive value.  In case of failure,
    returns a non-positive value.

int nmod_poly_fread(FILE * f, nmod_poly_t poly)

    Reads \code{poly} from the file stream \code{f}. If this is a file
    that has just been written, the file should be closed then opened
    again. The format is as described for \code{nmod_poly_print()}. If a
    polynomial in the correct format is read, a positive value is returned,
    otherwise a non-positive value is returned.

int nmod_poly_fprint(FILE * f, const nmod_poly_t poly)

    Writes a polynomial to the file stream \code{f}. If this is a file
    then the file should be closed and reopened before being read.
    The format is as described for \code{nmod_poly_print()}. If an error
    occurs whilst writing to the file, an error message is printed.

    In case of success, returns a positive value.  In case of failure,
    returns a non-positive value.

int nmod_poly_read(nmod_poly_t poly)

    Reads \code{poly} from \code{stdin}. The format is as described for
    \code{nmod_poly_print()}. If a polynomial in the correct format is read, a
    positive value is returned, otherwise a non-positive value is returned.

*******************************************************************************

    Comparison

*******************************************************************************

int nmod_poly_equal(const nmod_poly_t a, const nmod_poly_t b)

    Returns~$1$ if the polynomials are equal, otherwise~$0$.

int nmod_poly_is_zero(const nmod_poly_t poly)

    Returns~$1$ if the polynomial \code{poly} is the zero polynomial,
    otherwise returns~$0$.

int nmod_poly_is_one(const nmod_poly_t poly)

    Returns~$1$ if the polynomial \code{poly} is the constant polynomial 1,
    otherwise returns~$0$.

*******************************************************************************

    Shifting

*******************************************************************************

void _nmod_poly_shift_left(mp_ptr res, mp_srcptr poly, slong len, slong k)

    Sets \code{(res, len + k)} to \code{(poly, len)} shifted left by
    \code{k} coefficients. Assumes that \code{res} has space for
    \code{len + k} coefficients.

void nmod_poly_shift_left(nmod_poly_t res, const nmod_poly_t poly, slong k)

    Sets \code{res} to \code{poly} shifted left by \code{k} coefficients,
    i.e.\ multiplied by $x^k$.

void _nmod_poly_shift_right(mp_ptr res, mp_srcptr poly, slong len, slong k)

    Sets \code{(res, len - k)} to \code{(poly, len)} shifted right by
    \code{k} coefficients. It is assumed that \code{k <= len} and that
    \code{res} has space for at least \code{len - k} coefficients.

void nmod_poly_shift_right(nmod_poly_t res, const nmod_poly_t poly, slong k)

    Sets \code{res} to \code{poly} shifted right by \code{k} coefficients,
    i.e.\ divides by $x^k$ and throws away the remainder. If \code{k} is
    greater than or equal to the length of \code{poly}, the result is the
    zero polynomial.

*******************************************************************************

    Addition and subtraction

*******************************************************************************

void _nmod_poly_add(mp_ptr res, mp_srcptr poly1, slong len1,
                         mp_srcptr poly2, slong len2, nmod_t mod)

    Sets \code{res} to the sum of \code{(poly1, len1)} and
    \code{(poly2, len2)}. There are no restrictions on the lengths.

void nmod_poly_add(nmod_poly_t res, const nmod_poly_t poly1,
                                            const nmod_poly_t poly2)

    Sets \code{res} to the sum of \code{poly1} and \code{poly2}.

void _nmod_poly_sub(mp_ptr res, mp_srcptr poly1, slong len1,
                         mp_srcptr poly2, slong len2, nmod_t mod)

    Sets \code{res} to the difference of \code{(poly1, len1)} and
    \code{(poly2, len2)}. There are no restrictions on the lengths.

void nmod_poly_sub(nmod_poly_t res, const nmod_poly_t poly1,
                                    const nmod_poly_t poly2)

    Sets \code{res} to the difference of \code{poly1} and \code{poly2}.

void nmod_poly_neg(nmod_poly_t res, const nmod_poly_t poly)

    Sets \code{res} to the negation of \code{poly}.

*******************************************************************************

    Scalar multiplication and division

*******************************************************************************

void nmod_poly_scalar_mul_nmod(nmod_poly_t res,
                          const nmod_poly_t poly, ulong c)

    Sets \code{res} to \code{poly} multiplied by~$c$,
    where~$c$ is reduced modulo the modulus of \code{poly}.

void _nmod_poly_make_monic(mp_ptr output,
                                      mp_srcptr input, slong len, nmod_t mod)

    Sets \code{output} to be the scalar multiple of \code{input} of
    length \code{len > 0} that has leading coefficient one, if such a
    polynomial exists. If the leading coefficient of \code{input} is not
    invertible, \code{output} is set to the multiple of \code{input} whose
    leading coefficient is the greatest common divisor of the leading
    coefficient and the modulus of \code{input}.

void nmod_poly_make_monic(nmod_poly_t output, const nmod_poly_t input)

    Sets \code{output} to be the scalar multiple of \code{input} with leading
    coefficient one, if such a polynomial exists. If \code{input} is zero
    an exception is raised. If the leading coefficient of \code{input} is not
    invertible, \code{output} is set to the multiple of \code{input} whose
    leading coefficient is the greatest common divisor of the leading
    coefficient and the modulus of \code{input}.

*******************************************************************************

    Bit packing and unpacking

*******************************************************************************

void _nmod_poly_bit_pack(mp_ptr res, mp_srcptr poly, slong len,
                                                     mp_bitcnt_t bits)

    Packs \code{len} coefficients of \code{poly} into fields of the given
    number of bits in the large integer \code{res}, i.e.\ evaluates
    \code{poly} at \code{2^bits} and stores the result in \code{res}.
    Assumes \code{len > 0} and \code{bits > 0}. Also assumes that no
    coefficient of \code{poly} is bigger than \code{bits/2} bits. We
    also assume \code{bits < 3 * FLINT_BITS}.

void _nmod_poly_bit_unpack(mp_ptr res, slong len,
                                mp_srcptr mpn, ulong bits, nmod_t mod)

    Unpacks \code{len} coefficients stored in the big integer \code{mpn}
    in bit fields of the given number of bits, reduces them modulo the
    given modulus, then stores them in the polynomial \code{res}.
    We assume \code{len > 0} and \code{3 * FLINT_BITS > bits > 0}.
    There are no restrictions on the size of the actual coefficients as
    stored within the bitfields.

void nmod_poly_bit_pack(fmpz_t f, const nmod_poly_t poly, mp_bitcnt_t bit_size)

    Packs \code{poly} into bitfields of size \code{bit_size}, writing the
    result to \code{f}.

void nmod_poly_bit_unpack(nmod_poly_t poly, const fmpz_t f,
        mp_bitcnt_t bit_size)

    Unpacks the polynomial from fields of size \code{bit_size} as
    represented by the integer \code{f}.


void _nmod_poly_KS2_pack1(mp_ptr res, mp_srcptr op, slong n, slong s,
                ulong b, ulong k, slong r)

    Same as \code{_nmod_poly_KS2_pack}, but requires \code{b <= FLINT_BITS}.

void _nmod_poly_KS2_pack(mp_ptr res, mp_srcptr op, slong n, slong s,
               ulong b, ulong k, slong r)

    Bit packing routine used by KS2 and KS4 multiplication.

void _nmod_poly_KS2_unpack1(mp_ptr res, mp_srcptr op, slong n, ulong b,
                  ulong k)

    Same as \code{_nmod_poly_KS2_unpack}, but requires \code{b <= FLINT_BITS}
    (i.e. writes one word per coefficient).

void _nmod_poly_KS2_unpack2(mp_ptr res, mp_srcptr op, slong n, ulong b,
                  ulong k)

    Same as \code{_nmod_poly_KS2_unpack}, but requires
    \code{FLINT_BITS < b <= 2 * FLINT_BITS} (i.e. writes two words per
    coefficient).

void _nmod_poly_KS2_unpack3(mp_ptr res, mp_srcptr op, slong n, ulong b,
                  ulong k)

    Same as \code{_nmod_poly_KS2_unpack}, but requires
    \code{2 * FLINT_BITS < b < 3 * FLINT_BITS} (i.e. writes three words per
    coefficient).

void _nmod_poly_KS2_unpack(mp_ptr res, mp_srcptr op, slong n, ulong b,
                 ulong k)

    Bit unpacking code used by KS2 and KS4 multiplication.


*******************************************************************************

    KS2/KS4 Reduction

*******************************************************************************

void _nmod_poly_KS2_reduce(mp_ptr res, slong s, mp_srcptr op, slong n, ulong w,
                     nmod_t mod)

    Reduction code used by KS2 and KS4 multiplication.

void _nmod_poly_KS2_recover_reduce1(mp_ptr res, slong s, mp_srcptr op1,
                          mp_srcptr op2, slong n, ulong b, nmod_t mod)

    Same as \code{_nmod_poly_KS2_recover_reduce}, but requires
    \code{0 < 2 * b <= FLINT_BITS}.

void _nmod_poly_KS2_recover_reduce2(mp_ptr res, slong s, mp_srcptr op1,
                          mp_srcptr op2, slong n, ulong b, nmod_t mod)

    Same as \code{_nmod_poly_KS2_recover_reduce}, but requires
    \code{FLINT_BITS < 2 * b < 2*FLINT_BITS}.

void _nmod_poly_KS2_recover_reduce2b(mp_ptr res, slong s, mp_srcptr op1,
                          mp_srcptr op2, slong n, ulong b, nmod_t mod)

    Same as \code{_nmod_poly_KS2_recover_reduce}, but requires
    \code{b == FLINT_BITS}.

void _nmod_poly_KS2_recover_reduce3(mp_ptr res, slong s, mp_srcptr op1,
                          mp_srcptr op2, slong n, ulong b, nmod_t mod)

    Same as \code{_nmod_poly_KS2_recover_reduce}, but requires
    \code{2 * FLINT_BITS < 2 * b <= 3 * FLINT_BITS}.

void _nmod_poly_KS2_recover_reduce(mp_ptr res, slong s, mp_srcptr op1,
                          mp_srcptr op2, slong n, ulong b, nmod_t mod)

    Reduction code used by KS4 multiplication.


*******************************************************************************

    Multiplication

*******************************************************************************

void _nmod_poly_mul_classical(mp_ptr res, mp_srcptr poly1,
                    slong len1, mp_srcptr poly2, slong len2, nmod_t mod)

    Sets \code{(res, len1 + len2 - 1)} to the product of \code{(poly1, len1)}
    and \code{(poly2, len2)}. Assumes \code{len1 >= len2 > 0}. Aliasing of
    inputs and output is not permitted.

void nmod_poly_mul_classical(nmod_poly_t res,
                             const nmod_poly_t poly1, const nmod_poly_t poly2)

    Sets \code{res} to the product of \code{poly1} and \code{poly2}.

void _nmod_poly_mullow_classical(mp_ptr res, mp_srcptr poly1, slong len1,
                          mp_srcptr poly2, slong len2, slong trunc, nmod_t mod)

    Sets \code{res} to the lower \code{trunc} coefficients of the product of
    \code{(poly1, len1)} and \code{(poly2, len2)}. Assumes that
    \code{len1 >= len2 > 0} and \code{trunc > 0}. Aliasing of inputs and
    output is not permitted.

void nmod_poly_mullow_classical(nmod_poly_t res,
                 const nmod_poly_t poly1, const nmod_poly_t poly2, slong trunc)

    Sets \code{res} to the lower \code{trunc} coefficients of the product
    of \code{poly1} and \code{poly2}.

void _nmod_poly_mulhigh_classical(mp_ptr res, mp_srcptr poly1,
            slong len1, mp_srcptr poly2, slong len2, slong start, nmod_t mod)

    Computes the product of \code{(poly1, len1)} and \code{(poly2, len2)}
    and writes the coefficients from \code{start} onwards into the high
    coefficients of \code{res}, the remaining coefficients being arbitrary
    but reduced.  Assumes that \code{len1 >= len2 > 0}. Aliasing of inputs
    and output is not permitted.

void nmod_poly_mulhigh_classical(nmod_poly_t res,
                 const nmod_poly_t poly1, const nmod_poly_t poly2, slong start)

    Computes the product of \code{poly1} and \code{poly2} and writes the
    coefficients from \code{start} onwards into the high coefficients of
    \code{res}, the remaining coefficients being arbitrary but reduced.

void _nmod_poly_mul_KS(mp_ptr out, mp_srcptr in1, slong len1,
                     mp_srcptr in2, slong len2, mp_bitcnt_t bits, nmod_t mod)

    Sets \code{out} to the product of \code{in1} and \code{in2}
    assuming the output coefficients are at most the given number of
    bits wide. If \code{bits} is set to $0$ an appropriate value is
    computed automatically.  Assumes that \code{len1 >= len2 > 0}.

void nmod_poly_mul_KS(nmod_poly_t res,
            const nmod_poly_t poly1, const nmod_poly_t poly2, mp_bitcnt_t bits)

    Sets \code{res} to the product of \code{poly1} and \code{poly2}
    assuming the output coefficients are at most the given number of
    bits wide. If \code{bits} is set to $0$ an appropriate value
    is computed automatically.

void _nmod_poly_mul_KS2(mp_ptr res, mp_srcptr op1, slong n1,
                  mp_srcptr op2, slong n2, nmod_t mod)

    Sets \code{res} to the product of \code{op1} and \code{op2}.
    Assumes that \code{n1 >= n2 > 0}.

void nmod_poly_mul_KS2(nmod_poly_t res,
                 const nmod_poly_t poly1, const nmod_poly_t poly2)

    Sets \code{res} to the product of \code{poly1} and \code{poly2}.

void _nmod_poly_mul_KS4(mp_ptr res, mp_srcptr op1, slong n1,
                  mp_srcptr op2, slong n2, nmod_t mod)

    Sets \code{res} to the product of \code{op1} and \code{op2}.
    Assumes that \code{n1 >= n2 > 0}.

void nmod_poly_mul_KS4(nmod_poly_t res,
                 const nmod_poly_t poly1, const nmod_poly_t poly2)

    Sets \code{res} to the product of \code{poly1} and \code{poly2}.

void _nmod_poly_mullow_KS(mp_ptr out, mp_srcptr in1, slong len1,
              mp_srcptr in2, slong len2, mp_bitcnt_t bits, slong n, nmod_t mod)

    Sets \code{out} to the low $n$ coefficients of \code{in1} of length
    \code{len1} times \code{in2} of length \code{len2}. The output must have
    space for \code{n} coefficients. We assume that \code{len1 >= len2 > 0}
    and that \code{0 < n <= len1 + len2 - 1}.

void nmod_poly_mullow_KS(nmod_poly_t res, const nmod_poly_t poly1,
                            const nmod_poly_t poly2, mp_bitcnt_t bits, slong n)

    Sets \code{res} to the low $n$ coefficients of the product of
    \code{poly1} and \code{poly2}.

void _nmod_poly_mul(mp_ptr res, mp_srcptr poly1, slong len1,
                                       mp_srcptr poly2, slong len2, nmod_t mod)

    Sets \code{res} to the product of \code{poly1} of length \code{len1}
    and \code{poly2} of length \code{len2}. Assumes \code{len1 >= len2 > 0}.
    No aliasing is permitted between the inputs and the output.

void nmod_poly_mul(nmod_poly_t res,
                              const nmod_poly_t poly1, const nmod_poly_t poly2)

    Sets \code{res} to the product of \code{poly1} and \code{poly2}.

void _nmod_poly_mullow(mp_ptr res, mp_srcptr poly1, slong len1,
                              mp_srcptr poly2, slong len2, slong n, nmod_t mod)

    Sets \code{res} to the first \code{n} coefficients of the
    product of \code{poly1} of length \code{len1} and \code{poly2} of
    length \code{len2}. It is assumed that \code{0 < n <= len1 + len2 - 1}
    and that \code{len1 >= len2 > 0}. No aliasing of inputs and output
    is permitted.

void nmod_poly_mullow(nmod_poly_t res, const nmod_poly_t poly1,
                                          const nmod_poly_t poly2, slong trunc)

    Sets \code{res} to the first \code{trunc} coefficients of the
    product of \code{poly1} and \code{poly2}.

void _nmod_poly_mulhigh(mp_ptr res, mp_srcptr poly1, slong len1,
                              mp_srcptr poly2, slong len2, slong n, nmod_t mod)

    Sets all but the low $n$ coefficients of \code{res} to the
    corresponding coefficients of the product of \code{poly1} of length
    \code{len1} and \code{poly2} of length \code{len2}, the other
    coefficients being arbitrary. It is assumed that
    \code{len1 >= len2 > 0} and that \code{0 < n <= len1 + len2 - 1}.
    Aliasing of inputs and output is not permitted.

void nmod_poly_mulhigh(nmod_poly_t res, const nmod_poly_t poly1,
                                          const nmod_poly_t poly2, slong n)

    Sets all but the low $n$ coefficients of \code{res} to the
    corresponding coefficients of the product of \code{poly1} and
    \code{poly2}, the remaining coefficients being arbitrary.

void _nmod_poly_mulmod(mp_ptr res, mp_srcptr poly1, slong len1,
                             mp_srcptr poly2, slong len2, mp_srcptr f,
                            slong lenf, nmod_t mod)

    Sets \code{res} to the remainder of the product of \code{poly1} and
    \code{poly2} upon polynomial division by \code{f}.

    It is required that \code{len1 + len2 - lenf > 0}, which is equivalent
    to requiring that the result will actually be reduced. Otherwise, simply
    use \code{_nmod_poly_mul} instead.

    Aliasing of \code{f} and \code{res} is not permitted.

void nmod_poly_mulmod(nmod_poly_t res,
    const nmod_poly_t poly1, const nmod_poly_t poly2, const nmod_poly_t f)

    Sets \code{res} to the remainder of the product of \code{poly1} and
    \code{poly2} upon polynomial division by \code{f}.

void _nmod_poly_mulmod_preinv(mp_ptr res, mp_srcptr poly1, slong len1,
                          mp_srcptr poly2, slong len2, mp_srcptr f,
                         slong lenf, mp_srcptr finv, slong lenfinv, nmod_t mod)

    Sets \code{res} to the remainder of the product of \code{poly1} and
    \code{poly2} upon polynomial division by \code{f}.

    It is required that \code{finv} is the inverse of the reverse of \code{f}
    mod \code{x^lenf}. It is required that \code{len1 + len2 - lenf > 0},
    which is equivalent to requiring that the result will actually be reduced.
    It is required that \code{len1 < lenf} and \code{len2 < lenf}.
    Otherwise, simply use \code{_nmod_poly_mul} instead.

    Aliasing of \code{f} or \code{finv} and \code{res} is not permitted.

void nmod_poly_mulmod_preinv(nmod_poly_t res,
    const nmod_poly_t poly1, const nmod_poly_t poly2, const nmod_poly_t f,
    const nmod_poly_t finv)

    Sets \code{res} to the remainder of the product of \code{poly1} and
    \code{poly2} upon polynomial division by \code{f}. \code{finv} is the
    inverse of the reverse of \code{f}. It is required that \code{poly1} and
    \code{poly2} are reduced modulo \code{f}.

*******************************************************************************

    Powering

*******************************************************************************

void _nmod_poly_pow_binexp(mp_ptr res,
                             mp_srcptr poly, slong len, ulong e, nmod_t mod)

    Raises \code{poly} of length \code{len} to the power \code{e} and sets
    \code{res} to the result. We require that \code{res} has enough space
    for \code{(len - 1)*e + 1} coefficients. Assumes that \code{len > 0},
    \code{e > 1}. Aliasing is not permitted. Uses the binary exponentiation
    method.

void nmod_poly_pow_binexp(nmod_poly_t res, const nmod_poly_t poly, ulong e)

    Raises \code{poly} to the power \code{e} and sets \code{res} to the
    result. Uses the binary exponentiation method.

void _nmod_poly_pow(mp_ptr res,
                             mp_srcptr poly, slong len, ulong e, nmod_t mod)

    Raises \code{poly} of length \code{len} to the power \code{e} and sets
    \code{res} to the result. We require that \code{res} has enough space
    for \code{(len - 1)*e + 1} coefficients. Assumes that \code{len > 0},
    \code{e > 1}. Aliasing is not permitted.

void nmod_poly_pow(nmod_poly_t res, const nmod_poly_t poly, ulong e)

    Raises \code{poly} to the power \code{e} and sets \code{res} to the
    result.

void _nmod_poly_pow_trunc_binexp(mp_ptr res, mp_srcptr poly,
                                           ulong e, slong trunc, nmod_t mod)

    Sets \code{res} to the low \code{trunc} coefficients of \code{poly}
    (assumed to be zero padded if necessary to length \code{trunc}) to
    the power \code{e}. This is equivalent to doing a powering followed
    by a truncation. We require that \code{res} has enough space for
    \code{trunc} coefficients, that \code{trunc > 0} and that
    \code{e > 1}. Aliasing is not permitted. Uses the binary
    exponentiation method.

void nmod_poly_pow_trunc_binexp(nmod_poly_t res,
                               const nmod_poly_t poly, ulong e, slong trunc)

    Sets \code{res} to the low \code{trunc} coefficients of \code{poly}
    to the power \code{e}. This is equivalent to doing a powering
    followed by a truncation. Uses the binary exponentiation method.

void _nmod_poly_pow_trunc(mp_ptr res, mp_srcptr poly,
                                           ulong e, slong trunc, nmod_t mod)

    Sets \code{res} to the low \code{trunc} coefficients of \code{poly}
    (assumed to be zero padded if necessary to length \code{trunc}) to
    the power \code{e}. This is equivalent to doing a powering followed
    by a truncation. We require that \code{res} has enough space for
    \code{trunc} coefficients, that \code{trunc > 0} and that
    \code{e > 1}. Aliasing is not permitted.

void nmod_poly_pow_trunc(nmod_poly_t res,
                               const nmod_poly_t poly, ulong e, slong trunc)

    Sets \code{res} to the low \code{trunc} coefficients of \code{poly}
    to the power \code{e}. This is equivalent to doing a powering
    followed by a truncation.

void _nmod_poly_powmod_ui_binexp(mp_ptr res, mp_srcptr poly,
                                ulong e, mp_srcptr f,
                                slong lenf, nmod_t mod)

    Sets \code{res} to \code{poly} raised to the power \code{e}
    modulo \code{f}, using binary exponentiation. We require \code{e > 0}.

    We require \code{lenf > 1}. It is assumed that \code{poly} is already
    reduced modulo \code{f} and zero-padded as necessary to have length
    exactly \code{lenf - 1}. The output \code{res} must have room for
    \code{lenf - 1} coefficients.

void nmod_poly_powmod_ui_binexp(nmod_poly_t res,
                           const nmod_poly_t poly, ulong e,
                           const nmod_poly_t f)

    Sets \code{res} to \code{poly} raised to the power \code{e}
    modulo \code{f}, using binary exponentiation. We require \code{e >= 0}.

void
_nmod_poly_powmod_ui_binexp_preinv (mp_ptr res, mp_srcptr poly,
                                    ulong e, mp_srcptr f, slong lenf,
                                    mp_srcptr finv, slong lenfinv, nmod_t mod)

    Sets \code{res} to \code{poly} raised to the power \code{e}
    modulo \code{f}, using binary exponentiation. We require \code{e > 0}.
    We require \code{finv} to be the inverse of the reverse of \code{f}.

    We require \code{lenf > 1}. It is assumed that \code{poly} is already
    reduced modulo \code{f} and zero-padded as necessary to have length
    exactly \code{lenf - 1}. The output \code{res} must have room for
    \code{lenf - 1} coefficients.

void
nmod_poly_powmod_ui_binexp_preinv(nmod_poly_t res,
                           const nmod_poly_t poly, ulong e,
                           const nmod_poly_t f, const nmod_poly_t finv)

    Sets \code{res} to \code{poly} raised to the power \code{e}
    modulo \code{f}, using binary exponentiation. We require \code{e >= 0}.
    We require \code{finv} to be the inverse of the reverse of \code{f}.

void
_nmod_poly_powmod_x_ui_preinv (mp_ptr res, ulong e, mp_srcptr f, slong lenf,
                               mp_srcptr finv, slong lenfinv, nmod_t mod)

    Sets \code{res} to \code{x} raised to the power \code{e} modulo \code{f},
    using sliding window exponentiation. We require \code{e > 0}.
    We require \code{finv} to be the inverse of the reverse of \code{f}.

    We require \code{lenf > 2}. The output \code{res} must have room for
    \code{lenf - 1} coefficients.

void
nmod_poly_powmod_x_ui_preinv(nmod_poly_t res, ulong e, const nmod_poly_t f,
                             const nmod_poly_t finv)

    Sets \code{res} to \code{x} raised to the power \code{e}
    modulo \code{f}, using sliding window exponentiation. We require
    \code{e >= 0}. We require \code{finv} to be the inverse of the reverse of
    \code{f}.

void _nmod_poly_powmod_mpz_binexp(mp_ptr res, mp_srcptr poly,
                                mpz_srcptr e, mp_srcptr f,
                                slong lenf, nmod_t mod)

    Sets \code{res} to \code{poly} raised to the power \code{e}
    modulo \code{f}, using binary exponentiation. We require \code{e > 0}.

    We require \code{lenf > 1}. It is assumed that \code{poly} is already
    reduced modulo \code{f} and zero-padded as necessary to have length
    exactly \code{lenf - 1}. The output \code{res} must have room for
    \code{lenf - 1} coefficients.

void nmod_poly_powmod_mpz_binexp(nmod_poly_t res,
                           const nmod_poly_t poly, mpz_srcptr e,
                           const nmod_poly_t f)

    Sets \code{res} to \code{poly} raised to the power \code{e}
    modulo \code{f}, using binary exponentiation. We require \code{e >= 0}.

void
_nmod_poly_powmod_mpz_binexp_preinv (mp_ptr res, mp_srcptr poly,
                                    mpz_srcptr e, mp_srcptr f, slong lenf,
                                    mp_srcptr finv, slong lenfinv, nmod_t mod)

    Sets \code{res} to \code{poly} raised to the power \code{e}
    modulo \code{f}, using binary exponentiation. We require \code{e > 0}.
    We require \code{finv} to be the inverse of the reverse of \code{f}.

    We require \code{lenf > 1}. It is assumed that \code{poly} is already
    reduced modulo \code{f} and zero-padded as necessary to have length
    exactly \code{lenf - 1}. The output \code{res} must have room for
    \code{lenf - 1} coefficients.

void
nmod_poly_powmod_mpz_binexp_preinv(nmod_poly_t res,
                           const nmod_poly_t poly, mpz_srcptr e,
                           const nmod_poly_t f, const nmod_poly_t finv)

    Sets \code{res} to \code{poly} raised to the power \code{e}
    modulo \code{f}, using binary exponentiation. We require \code{e >= 0}.
    We require \code{finv} to be the inverse of the reverse of \code{f}.

*******************************************************************************

    Division

*******************************************************************************

void _nmod_poly_divrem_basecase(mp_ptr Q, mp_ptr R, mp_ptr W,
           mp_srcptr A, slong A_len, mp_srcptr B, slong B_len, nmod_t mod)

    Finds $Q$ and $R$ such that $A = B Q + R$ with $\len(R) < \len(B)$.
    If $\len(B) = 0$ an exception is raised. We require that \code{W}
    is temporary space of \code{NMOD_DIVREM_BC_ITCH(A_len, B_len, mod)}
    coefficients.

void nmod_poly_divrem_basecase(nmod_poly_t Q,
                       nmod_poly_t R, const nmod_poly_t A, const nmod_poly_t B)

    Finds $Q$ and $R$ such that $A = B Q + R$ with $\len(R) < \len(B)$.
    If $\len(B) = 0$ an exception is raised.

void _nmod_poly_div_basecase(mp_ptr Q, mp_ptr W, mp_srcptr A, slong A_len,
                                         mp_srcptr B, slong B_len, nmod_t mod)

    Notionally finds polynomials $Q$ and $R$ such that $A = B Q + R$ with
    $\len(R) < \len(B)$, but returns only \code{Q}. If $\len(B) = 0$ an
    exception is raised. We require that \code{W} is temporary space of
    \code{NMOD_DIV_BC_ITCH(A_len, B_len, mod)} coefficients.

void nmod_poly_div_basecase(nmod_poly_t Q, const nmod_poly_t A,
                                                          const nmod_poly_t B)

    Notionally finds polynomials $Q$ and $R$ such that $A = B Q + R$ with
    $\len(R) < \len(B)$, but returns only \code{Q}. If $\len(B) = 0$ an
    exception is raised.

void _nmod_poly_divrem_divconquer_recursive(mp_ptr Q, mp_ptr BQ, mp_ptr W,
                    mp_ptr V, mp_srcptr A, mp_srcptr B, slong lenB, nmod_t mod)

    Computes $Q$ and $R$ such that $A = BQ + R$ with $\len(R)$ less than
    \code{lenB}, where \code{A} is of length \code{2 * lenB - 1} and \code{B}
    is of length \code{lenB}. Sets \code{BQ} to the low \code{lenB - 1}
    coefficients of \code{B * Q}. We require that \code{Q} have space for
    \code{lenB} coefficients, that \code{W} be temporary space of size
    \code{lenB - 1} and \code{V} be temporary space for a number of
    coefficients computed by \code{NMOD_DIVREM_DC_ITCH(lenB, mod)}.

void _nmod_poly_divrem_divconquer(mp_ptr Q, mp_ptr R,
                  mp_srcptr A, slong lenA, mp_srcptr B, slong lenB, nmod_t mod)

    Computes $Q$ and $R$ such that $A = BQ + R$ with $\len(R)$ less than
    \code{lenB}, where \code{A} is of length \code{lenA} and \code{B} is of
    length \code{lenB}. We require that \code{Q} have space for
    \code{lenA - lenB + 1} coefficients.

void nmod_poly_divrem_divconquer(nmod_poly_t Q, nmod_poly_t R,
                                      const nmod_poly_t A, const nmod_poly_t B)

    Computes $Q$ and $R$ such that $A = BQ + R$ with $\len(R) < \len(B)$.

void _nmod_poly_divrem_q0(mp_ptr Q, mp_ptr R,
                          mp_srcptr A, mp_srcptr B, slong lenA, nmod_t mod)

    Computes $Q$ and $R$ such that $A = BQ + R$ with $\len(R) < \len(B)$,
    where $\len(A) = \len(B) > 0$.

    Requires that $Q$ and $R$ have space for $1$ and $\len(B) - 1$
    coefficients, respectively.

    Does not support aliasing or zero-padding.

void _nmod_poly_divrem_q1(mp_ptr Q, mp_ptr R,
                          mp_srcptr A, slong lenA, mp_srcptr B, slong lenB,
                          nmod_t mod)

    Computes $Q$ and $R$ such that $A = BQ + R$ with $\len(R) < \len(B)$,
    where $\len(A) = \len(B) + 1 \geq \len(B) > 0$.

    Requires that $Q$ and $R$ have space for $\len(A) - \len(B) + 1$ and
    $\len(B) - 1$ coefficients, respectively.

    Does not support aliasing or zero-padding.

void _nmod_poly_divrem(mp_ptr Q, mp_ptr R,
                  mp_srcptr A, slong lenA, mp_srcptr B, slong lenB, nmod_t mod)

    Computes $Q$ and $R$ such that $A = BQ + R$ with $\len(R)$ less than
    \code{lenB}, where \code{A} is of length \code{lenA} and \code{B} is of
    length \code{lenB}. We require that \code{Q} have space for
    \code{lenA - lenB + 1} coefficients.

void nmod_poly_divrem(nmod_poly_t Q, nmod_poly_t R,
                                      const nmod_poly_t A, const nmod_poly_t B)

    Computes $Q$ and $R$ such that $A = BQ + R$ with $\len(R) < \len(B)$.

void _nmod_poly_div_divconquer_recursive(mp_ptr Q, mp_ptr W, mp_ptr V,
                              mp_srcptr A, mp_srcptr B, slong lenB, nmod_t mod)

    Computes $Q$ and $R$ such that $A = BQ + R$ with $\len(R)$ less than
    \code{lenB}, where \code{A} is of length \code{2 * lenB - 1} and \code{B}
    is of length \code{lenB}. We require that \code{Q} have space for
    \code{lenB} coefficients and that \code{W} be temporary space of size
    \code{lenB - 1} and \code{V} be temporary space for a number of
    coefficients computed by \code{NMOD_DIV_DC_ITCH(lenB, mod)}.

void _nmod_poly_div_divconquer(mp_ptr Q, mp_srcptr A, slong lenA,
                                           mp_srcptr B, slong lenB, nmod_t mod)

    Notionally computes polynomials $Q$ and $R$ such that $A = BQ + R$ with
    $\len(R)$ less than \code{lenB}, where \code{A} is of length \code{lenA}
    and \code{B} is of length \code{lenB}, but returns only \code{Q}. We
    require that \code{Q} have space for \code{lenA - lenB + 1} coefficients.

void nmod_poly_div_divconquer(nmod_poly_t Q,
                                      const nmod_poly_t A, const nmod_poly_t B)

    Notionally computes $Q$ and $R$ such that $A = BQ + R$ with
    $\len(R) < \len(B)$, but returns only $Q$.

void _nmod_poly_div(mp_ptr Q, mp_srcptr A, slong lenA,
                                           mp_srcptr B, slong lenB, nmod_t mod)

    Notionally computes polynomials $Q$ and $R$ such that $A = BQ + R$ with
    $\len(R)$ less than \code{lenB}, where \code{A} is of length \code{lenA}
    and \code{B} is of length \code{lenB}, but returns only \code{Q}. We
    require that \code{Q} have space for \code{lenA - lenB + 1} coefficients.


void nmod_poly_div(nmod_poly_t Q, const nmod_poly_t A, const nmod_poly_t B)

    Computes the quotient $Q$ on polynomial division of $A$ by $B$.

void _nmod_poly_rem_basecase(mp_ptr R, mp_ptr W, mp_srcptr A, slong lenA,
                                       mp_srcptr B, slong lenB, nmod_t mod)

void nmod_poly_rem_basecase(nmod_poly_t R,
                            const nmod_poly_t A, const nmod_poly_t B)

void _nmod_poly_rem_q1(mp_ptr R,
                       mp_srcptr A, slong lenA, mp_srcptr B, slong lenB,
                       nmod_t mod)

    Notionally computes $Q$ and $R$ such that $A = BQ + R$ with
    $\len(R) < \len(B)$, where $\len(A) = \len(B) + 1 \geq \len(B) > 0$,
    but returns only the remainder.

    Requires that $R$ has space for $\len(B) - 1$ coefficients.

    Does not support aliasing or zero-padding.

void _nmod_poly_rem(mp_ptr R, mp_srcptr A, slong lenA,
                              mp_srcptr B, slong lenB, nmod_t mod)

    Computes the remainder $R$ on polynomial division of $A$ by $B$.

void nmod_poly_rem(nmod_poly_t R, const nmod_poly_t A, const nmod_poly_t B)

    Computes the remainder $R$ on polynomial division of $A$ by $B$.

void _nmod_poly_inv_series_basecase(mp_ptr Qinv,
                                    mp_srcptr Q, slong n, nmod_t mod)

    Given \code{Q} of length \code{n} whose leading coefficient is invertible
    modulo the given modulus, finds a polynomial \code{Qinv} of length \code{n}
    such that the top \code{n} coefficients of the product \code{Q * Qinv} is
    $x^{n - 1}$. Requires that \code{n > 0}. This function can be viewed as
    inverting a power series.

void nmod_poly_inv_series_basecase(nmod_poly_t Qinv,
                                   const nmod_poly_t Q, slong n)

    Given \code{Q} of length at least \code{n} find \code{Qinv} of length
    \code{n} such that the top \code{n} coefficients of the product
    \code{Q * Qinv} is $x^{n - 1}$. An exception is raised if \code{n = 0}
    or if the length of \code{Q} is less than \code{n}. The leading
    coefficient of \code{Q} must be invertible modulo the modulus of
    \code{Q}. This function can be viewed as inverting a power series.

void
_nmod_poly_inv_series_newton(mp_ptr Qinv, mp_srcptr Q, slong n, nmod_t mod)

    Given \code{Q} of length \code{n} whose constant coefficient is invertible
    modulo the given modulus, find a polynomial \code{Qinv} of length \code{n}
    such that \code{Q * Qinv} is \code{1} modulo $x^n$. Requires \code{n > 0}.
    This function can be viewed as inverting a power series via Newton
    iteration.

void
nmod_poly_inv_series_newton(nmod_poly_t Qinv, const nmod_poly_t Q, slong n)

    Given \code{Q} find \code{Qinv} such that \code{Q * Qinv} is \code{1}
    modulo $x^n$. The constant coefficient of \code{Q} must be invertible
    modulo the modulus of \code{Q}. An exception is raised if this is not
    the case or if \code{n = 0}. This function can be viewed as inverting
    a power series via Newton iteration.

void _nmod_poly_inv_series(mp_ptr Qinv, mp_srcptr Q, slong n, nmod_t mod)

    Given \code{Q} of length \code{n} whose constant coefficient is invertible
    modulo the given modulus, find a polynomial \code{Qinv} of length \code{n}
    such that \code{Q * Qinv} is \code{1} modulo $x^n$. Requires \code{n > 0}.
    This function can be viewed as inverting a power series.

void nmod_poly_inv_series(nmod_poly_t Qinv, const nmod_poly_t Q, slong n)

    Given \code{Q} find \code{Qinv} such that \code{Q * Qinv} is \code{1}
    modulo $x^n$. The constant coefficient of \code{Q} must be invertible
    modulo the modulus of \code{Q}. An exception is raised if this is not
    the case or if \code{n = 0}. This function can be viewed as inverting
    a power series.

void _nmod_poly_div_series(mp_ptr Q, mp_srcptr A, mp_srcptr B,
                                                  slong n, nmod_t mod)

    Given polynomials \code{A} and \code{B} of length \code{n}, finds the
    polynomial \code{Q} of length \code{n} such that \code{Q * B = A}
    modulo $x^n$. We assume \code{n > 0} and that the constant coefficient
    of \code{B} is invertible modulo the given modulus. The polynomial
    \code{Q} must have space for \code{n} coefficients.

void nmod_poly_div_series(nmod_poly_t Q, const nmod_poly_t A,
                                         const nmod_poly_t B, slong n)

    Given polynomials \code{A} and \code{B} considered modulo $x^n$,
    finds the polynomial \code{Q} of length at most \code{n} such that
    \code{Q * B = A} modulo $x^n$. We assume \code{n > 0} and that the
    constant coefficient of \code{B} is invertible modulo the modulus.
    An exception is raised if \code{n == 0} or the constant coefficient
    of \code{B} is zero.

void _nmod_poly_div_newton(mp_ptr Q, mp_srcptr A, slong Alen,
                                     mp_srcptr B, slong Blen, nmod_t mod)

    Notionally computes polynomials $Q$ and $R$ such that $A = BQ + R$ with
    $\len(R)$ less than \code{Blen}, where \code{A} is of length \code{Alen}
    and \code{B} is of length \code{Blen}, but returns only $Q$.

    We require that $Q$ have space for \code{Alen - Blen + 1} coefficients
    and assume that the leading coefficient of $B$ is a unit.

    The algorithm used is to reverse the polynomials and divide the
    resulting power series, then reverse the result.

void nmod_poly_div_newton(nmod_poly_t Q, const nmod_poly_t A,
                                         const nmod_poly_t B)

    Notionally computes $Q$ and $R$ such that $A = BQ + R$ with
    $\len(R) < \len(B)$, but returns only $Q$.

    We assume that the leading coefficient of $B$ is a unit.

    The algorithm used is to reverse the polynomials and divide the
    resulting power series, then reverse the result.

void _nmod_poly_div_newton_n_preinv (mp_ptr Q, mp_srcptr A, slong lenA,
            mp_srcptr B, slong lenB, mp_srcptr Binv, slong lenBinv, nmod_t mod)

    Notionally computes polynomials $Q$ and $R$ such that $A = BQ + R$ with
    $\len(R)$ less than \code{lenB}, where \code{A} is of length \code{lenA}
    and \code{B} is of length \code{lenB}, but returns only $Q$.

    We require that $Q$ have space for \code{lenA - lenB + 1} coefficients
    and assume that the leading coefficient of $B$ is a unit. Furthermore, we
    assume that $Binv$ is the inverse of the reverse of $B$ mod $x^{\len(B)}$.

    The algorithm used is to reverse the polynomials and divide the
    resulting power series, then reverse the result.

void nmod_poly_div_newton_n_preinv (nmod_poly_t Q, const nmod_poly_t A,
                                 const nmod_poly_t B, const nmod_poly_t Binv)

    Notionally computes $Q$ and $R$ such that $A = BQ + R$ with
    $\len(R) < \len(B)$, but returns only $Q$.

    We assume that the leading coefficient of $B$ is a unit and that $Binv$ is
    the inverse of the reverse of $B$ mod $x^{\len(B)}$.

    It is required that the length of $A$ is less than or equal to
    $2 \len(B) - 2$.

    The algorithm used is to reverse the polynomials and divide the
    resulting power series, then reverse the result.

void _nmod_poly_divrem_newton(mp_ptr Q, mp_ptr R, mp_srcptr A, slong Alen,
                                        mp_srcptr B, slong Blen, nmod_t mod)

    Computes $Q$ and $R$ such that $A = BQ + R$ with $\len(R)$ less than
    \code{Blen}, where $A$ is of length \code{Alen} and $B$ is of length
    \code{Blen}. We require that $Q$ have space for \code{Alen - Blen + 1}
    coefficients. The algorithm used is to call \code{div_newton()} and then
    multiply out and compute the remainder.

void nmod_poly_divrem_newton(nmod_poly_t Q, nmod_poly_t R,
                             const nmod_poly_t A, const nmod_poly_t B)

    Computes $Q$ and $R$ such that $A = BQ + R$ with $\len(R) < \len(B)$.
    The algorithm used is to call \code{div_newton()} and then multiply out
    and compute the remainder.

void _nmod_poly_divrem_newton_n_preinv (mp_ptr Q, mp_ptr R, mp_srcptr A,
slong lenA, mp_srcptr B, slong lenB, mp_srcptr Binv, slong lenBinv, nmod_t mod)

    Computes $Q$ and $R$ such that $A = BQ + R$ with $\len(R)$ less than
    \code{lenB}, where $A$ is of length \code{lenA} and $B$ is of length
    \code{lenB}. We require that $Q$ have space for \code{lenA - lenB + 1}
    coefficients. Furthermore, we assume that $Binv$ is the inverse of the
    reverse of $B$ mod $x^{\len(B)}$. The algorithm used is to call
    \code{div_newton_n_preinv()} and then multiply out and compute
    the remainder.

void nmod_poly_divrem_newton_n_preinv(nmod_poly_t Q, nmod_poly_t R,
            const nmod_poly_t A, const nmod_poly_t B, const nmod_poly_t Binv)

    Computes $Q$ and $R$ such that $A = BQ + R$ with $\len(R) < \len(B)$.
    We assume $Binv$ is the inverse of the reverse of $B$ mod $x^{\len(B)}$.

    It is required that the length of $A$ is less than or equal to
    $2 \len(B) - 2$.

    The algorithm used is to call \code{div_newton_n_preinv()} and then
    multiply out and compute the remainder.

mp_limb_t _nmod_poly_div_root(mp_ptr Q, mp_srcptr A, slong len,
                                mp_limb_t c, nmod_t mod)

    Sets \code{(Q, len-1)} to the quotient of \code{(A, len)} on division
    by $(x - c)$, and returns the remainder, equal to the value of $A$
    evaluated at $c$. $A$ and $Q$ are allowed to be the same, but may
    not overlap partially in any other way.

mp_limb_t nmod_poly_div_root(nmod_poly_t Q, const nmod_poly_t A, mp_limb_t c)

    Sets $Q$ to the quotient of $A$ on division by $(x - c)$, and returns
    the remainder, equal to the value of $A$ evaluated at $c$.

*******************************************************************************

    Derivative and integral

*******************************************************************************

void _nmod_poly_derivative(mp_ptr x_prime, mp_srcptr x, slong len, nmod_t mod)

    Sets the first \code{len - 1} coefficients of \code{x_prime} to the
    derivative of \code{x} which is assumed to be of length \code{len}.
    It is assumed that \code{len > 0}.

void nmod_poly_derivative(nmod_poly_t x_prime, const nmod_poly_t x)

    Sets \code{x_prime} to the derivative of \code{x}.

void _nmod_poly_integral(mp_ptr x_int, mp_srcptr x, slong len, nmod_t mod)

    Set the first \code{len} coefficients of \code{x_int} to the
    integral of \code{x} which is assumed to be of length \code{len - 1}.
    The constant term of \code{x_int} is set to zero.
    It is assumed that \code{len > 0}. The result is only well-defined
    if the modulus is a prime number strictly larger than the length of
    \code{x}, so that each of the denominators $1, \dotsc, \code{len} - 1$
    is invertible.

void nmod_poly_integral(nmod_poly_t x_int, const nmod_poly_t x)

    Set \code{x_int} to the indefinite integral of \code{x} with constant
    term zero. The result is only well-defined if the modulus
    is a prime number strictly larger than the length of \code{x}.


*******************************************************************************

    Evaluation

*******************************************************************************

mp_limb_t _nmod_poly_evaluate_nmod(mp_srcptr poly, slong len, mp_limb_t c,
                                   nmod_t mod)

    Evaluates \code{poly} at the value~\code{c} and reduces modulo the
    given modulus of \code{poly}. The value~\code{c} should be reduced
    modulo the modulus. The algorithm used is Horner's method.

mp_limb_t nmod_poly_evaluate_nmod(nmod_poly_t poly, mp_limb_t c)

    Evaluates \code{poly} at the value~\code{c} and reduces modulo the
    modulus of \code{poly}. The value~\code{c} should be reduced modulo
    the modulus. The algorithm used is Horner's method.

*******************************************************************************

    Multipoint evaluation

*******************************************************************************

void _nmod_poly_evaluate_nmod_vec_iter(mp_ptr ys, mp_srcptr poly, slong len,
                                    mp_srcptr xs, slong n, nmod_t mod)

    Evaluates (\code{poly}, \code{len}) at the \code{n} values
    given in the vector \code{xs}, writing the output values
    to \code{ys}. The values in \code{xs} should be reduced
    modulo the modulus.

    Uses Horner's method iteratively.

void nmod_poly_evaluate_nmod_vec_iter(mp_ptr ys, const nmod_poly_t poly,
                                    mp_srcptr xs, slong n)

    Evaluates \code{poly} at the \code{n} values given in the vector
    \code{xs}, writing the output values to \code{ys}. The values in
    \code{xs} should be reduced modulo the modulus.

    Uses Horner's method iteratively.

void _nmod_poly_evaluate_nmod_vec_fast_precomp(mp_ptr vs, mp_srcptr poly,
    slong plen, const mp_ptr * tree, slong len, nmod_t mod)

    Evaluates (\code{poly}, \code{plen}) at the \code{len} values given
    by the precomputed subproduct tree \code{tree}.

void _nmod_poly_evaluate_nmod_vec_fast(mp_ptr ys, mp_srcptr poly,
        slong len, mp_srcptr xs, slong n, nmod_t mod)

    Evaluates (\code{poly}, \code{len}) at the \code{n} values
    given in the vector \code{xs}, writing the output values
    to \code{ys}. The values in \code{xs} should be reduced
    modulo the modulus.

    Uses fast multipoint evaluation, building a temporary subproduct tree.

void nmod_poly_evaluate_nmod_vec_fast(mp_ptr ys, const nmod_poly_t poly,
                                    mp_srcptr xs, slong n)

    Evaluates \code{poly} at the \code{n} values given in the vector
    \code{xs}, writing the output values to \code{ys}. The values in
    \code{xs} should be reduced modulo the modulus.

    Uses fast multipoint evaluation, building a temporary subproduct tree.


void _nmod_poly_evaluate_nmod_vec(mp_ptr ys, mp_srcptr poly, slong len,
                                    mp_srcptr xs, slong n, nmod_t mod)

    Evaluates (\code{poly}, \code{len}) at the \code{n} values
    given in the vector \code{xs}, writing the output values
    to \code{ys}. The values in \code{xs} should be reduced
    modulo the modulus.

void nmod_poly_evaluate_nmod_vec(mp_ptr ys, const nmod_poly_t poly,
                                    mp_srcptr xs, slong n)

    Evaluates \code{poly} at the \code{n} values given in the vector
    \code{xs}, writing the output values to \code{ys}. The values in
    \code{xs} should be reduced modulo the modulus.

*******************************************************************************

    Interpolation

*******************************************************************************

void _nmod_poly_interpolate_nmod_vec(mp_ptr poly,
                            mp_srcptr xs, mp_srcptr ys, slong n, nmod_t mod)

    Sets \code{poly} to the unique polynomial of length at most \code{n}
    that interpolates the \code{n} given evaluation points \code{xs} and
    values \code{ys}. If the interpolating polynomial is shorter than
    length \code{n}, the leading coefficients are set to zero.

    The values in \code{xs} and \code{ys} should be reduced modulo the
    modulus, and all \code{xs} must be distinct. Aliasing between
    \code{poly} and \code{xs} or \code{ys} is not allowed.

void nmod_poly_interpolate_nmod_vec(nmod_poly_t poly,
                                    mp_srcptr xs, mp_srcptr ys, slong n)

    Sets \code{poly} to the unique polynomial of length at most \code{n}
    that interpolates the \code{n} given evaluation points \code{xs} and
    values \code{ys}. The values in \code{xs} and \code{ys} should be
    reduced modulo the modulus, and all \code{xs} must be distinct.

void _nmod_poly_interpolation_weights(mp_ptr w, const mp_ptr * tree,
        slong len, nmod_t mod)

    Sets \code{w} to the barycentric interpolation weights for fast
    Lagrange interpolation with respect to a given subproduct tree.

void _nmod_poly_interpolate_nmod_vec_fast_precomp(mp_ptr poly, mp_srcptr ys,
    const mp_ptr * tree, mp_srcptr weights, slong len, nmod_t mod)

    Performs interpolation using the fast Lagrange interpolation
    algorithm, using a precomputed subproduct tree.

    The function values are given as \code{ys}. The function takes
    a precomputed subproduct tree \code{tree} and barycentric
    interpolation weights \code{weights} corresponding to the
    roots.

void _nmod_poly_interpolate_nmod_vec_fast(mp_ptr poly,
                            mp_srcptr xs, mp_srcptr ys, slong n, nmod_t mod)

    Performs interpolation using the fast Lagrange interpolation
    algorithm, generating a temporary subproduct tree.

void nmod_poly_interpolate_nmod_vec_fast(nmod_poly_t poly,
                                    mp_srcptr xs, mp_srcptr ys, slong n)

    Performs interpolation using the fast Lagrange interpolation algorithm,
    generating a temporary subproduct tree.

void _nmod_poly_interpolate_nmod_vec_newton(mp_ptr poly,
                            mp_srcptr xs, mp_srcptr ys, slong n, nmod_t mod)

    Forms the interpolating polynomial in the Newton basis using
    the method of divided differences and then converts it to
    monomial form.

void nmod_poly_interpolate_nmod_vec_newton(nmod_poly_t poly,
                                    mp_srcptr xs, mp_srcptr ys, slong n)

    Forms the interpolating polynomial in the Newton basis using
    the method of divided differences and then converts it to
    monomial form.

void _nmod_poly_interpolate_nmod_vec_barycentric(mp_ptr poly,
                            mp_srcptr xs, mp_srcptr ys, slong n, nmod_t mod)

    Forms the interpolating polynomial using a naive implementation
    of the barycentric form of Lagrange interpolation.

void nmod_poly_interpolate_nmod_vec_barycentric(nmod_poly_t poly,
                                    mp_srcptr xs, mp_srcptr ys, slong n)

    Forms the interpolating polynomial using a naive implementation
    of the barycentric form of Lagrange interpolation.


*******************************************************************************

    Composition

*******************************************************************************

void _nmod_poly_compose_horner(mp_ptr res, mp_srcptr poly1, slong len1,
                               mp_srcptr poly2, slong len2, nmod_t mod)

    Composes \code{poly1} of length \code{len1} with \code{poly2} of length
    \code{len2} and sets \code{res} to the result, i.e.\ evaluates
    \code{poly1} at \code{poly2}. The algorithm used is Horner's algorithm.
    We require that \code{res} have space for \code{(len1 - 1)*(len2 - 1) + 1}
    coefficients. It is assumed that \code{len1 > 0} and \code{len2 > 0}.

void nmod_poly_compose_horner(nmod_poly_t res,
                              const nmod_poly_t poly1, const nmod_poly_t poly2)

    Composes \code{poly1} with \code{poly2} and sets \code{res} to the result,
    i.e.\ evaluates \code{poly1} at \code{poly2}. The algorithm used is
    Horner's algorithm.

void _nmod_poly_compose_divconquer(mp_ptr res, mp_srcptr poly1, slong len1,
                                   mp_srcptr poly2, slong len2, nmod_t mod)

    Composes \code{poly1} of length \code{len1} with \code{poly2} of length
    \code{len2} and sets \code{res} to the result, i.e.\ evaluates
    \code{poly1} at \code{poly2}. The algorithm used is the divide and
    conquer algorithm. We require that \code{res} have space for
    \code{(len1 - 1)*(len2 - 1) + 1} coefficients. It is assumed that
    \code{len1 > 0} and \code{len2 > 0}.

void nmod_poly_compose_divconquer(nmod_poly_t res,
                              const nmod_poly_t poly1, const nmod_poly_t poly2)

    Composes \code{poly1} with \code{poly2} and sets \code{res} to the result,
    i.e.\ evaluates \code{poly1} at \code{poly2}. The algorithm used is
    the divide and conquer algorithm.

void _nmod_poly_compose(mp_ptr res, mp_srcptr poly1, slong len1,
                                       mp_srcptr poly2, slong len2, nmod_t mod)

    Composes \code{poly1} of length \code{len1} with \code{poly2} of length
    \code{len2} and sets \code{res} to the result, i.e.\ evaluates \code{poly1}
    at \code{poly2}. We require that \code{res} have space for
    \code{(len1 - 1)*(len2 - 1) + 1} coefficients. It is assumed that
    \code{len1 > 0} and \code{len2 > 0}.

void nmod_poly_compose(nmod_poly_t res,
                              const nmod_poly_t poly1, const nmod_poly_t poly2)

    Composes \code{poly1} with \code{poly2} and sets \code{res} to the result,
    that is, evaluates \code{poly1} at \code{poly2}.

*******************************************************************************

    Taylor shift

*******************************************************************************

void _nmod_poly_taylor_shift_horner(mp_ptr poly, mp_limb_t c,
    slong len, nmod_t mod)

    Performs the Taylor shift composing \code{poly} by $x+c$ in-place.
    Uses an efficient version of Horner's rule.

void nmod_poly_taylor_shift_horner(nmod_poly_t g,
    const nmod_poly_t f, mp_limb_t c)

    Performs the Taylor shift composing \code{f} by $x+c$.

void _nmod_poly_taylor_shift_convolution(mp_ptr poly, mp_limb_t c,
    slong len, nmod_t mod)

    Performs the Taylor shift composing \code{poly} by $x+c$ in-place.
    Writes the composition as a single convolution with cost $O(M(n))$.
    We require that the modulus is a prime at least as large as the length.

void nmod_poly_taylor_shift_convolution(nmod_poly_t g,
    const nmod_poly_t f, mp_limb_t c)

    Performs the Taylor shift composing \code{f} by $x+c$.
    Writes the composition as a single convolution with cost $O(M(n))$.
    We require that the modulus is a prime at least as large as the length.

void _nmod_poly_taylor_shift(mp_ptr poly, mp_limb_t c, slong len, nmod_t mod)

    Performs the Taylor shift composing \code{poly} by $x+c$ in-place.
    We require that the modulus is a prime.

void nmod_poly_taylor_shift(nmod_poly_t g, const nmod_poly_t f, mp_limb_t c)

    Performs the Taylor shift composing \code{f} by $x+c$.
    We require that the modulus is a prime.

*******************************************************************************

    Modular composition

*******************************************************************************

void _nmod_poly_compose_mod_horner(mp_ptr res,
    mp_srcptr f, slong lenf, mp_srcptr g, mp_srcptr h, slong lenh, nmod_t mod)

    Sets \code{res} to the composition $f(g)$ modulo $h$. We require that
    $h$ is nonzero and that the length of $g$ is one less than the
    length of $h$ (possibly with zero padding). The output is not allowed
    to be aliased with any of the inputs.

    The algorithm used is Horner's rule.

void nmod_poly_compose_mod_horner(nmod_poly_t res,
                    const nmod_poly_t f, const nmod_poly_t g,
                    const nmod_poly_t h)

    Sets \code{res} to the composition $f(g)$ modulo $h$. We require that
    $h$ is nonzero. The algorithm used is Horner's rule.


void _nmod_poly_compose_mod_brent_kung(mp_ptr res,
    mp_srcptr f, slong lenf, mp_srcptr g, mp_srcptr h, slong lenh, nmod_t mod)

    Sets \code{res} to the composition $f(g)$ modulo $h$. We require that
    $h$ is nonzero and that the length of $g$ is one less than the
    length of $h$ (possibly with zero padding). We also require that
    the length of $f$ is less than the length of $h$. The output is not allowed
    to be aliased with any of the inputs.

    The algorithm used is the Brent-Kung matrix algorithm.

void nmod_poly_compose_mod_brent_kung(nmod_poly_t res,
                    const nmod_poly_t f, const nmod_poly_t g,
                    const nmod_poly_t h)

    Sets \code{res} to the composition $f(g)$ modulo $h$. We require that
    $h$ is nonzero and that $f$ has smaller degree than $h$.
    The algorithm used is the Brent-Kung matrix algorithm.

void _nmod_poly_compose_mod_brent_kung_preinv(mp_ptr res, mp_srcptr f,
                            slong lenf,
                            mp_srcptr g, mp_srcptr h, slong lenh,
                            mp_srcptr hinv, slong lenhinv, nmod_t mod)

    Sets \code{res} to the composition $f(g)$ modulo $h$. We require that
    $h$ is nonzero and that the length of $g$ is one less than the
    length of $h$ (possibly with zero padding). We also require that
    the length of $f$ is less than the length of $h$. Furthermore, we require
    \code{hinv} to be the inverse of the reverse of \code{h}.
    The output is not allowed to be aliased with any of the inputs.

    The algorithm used is the Brent-Kung matrix algorithm.

void nmod_poly_compose_mod_brent_kung_preinv(nmod_poly_t res,
                    const nmod_poly_t f, const nmod_poly_t g,
                    const nmod_poly_t h, const nmod_poly_t hinv)

    Sets \code{res} to the composition $f(g)$ modulo $h$. We require that
    $h$ is nonzero and that $f$ has smaller degree than $h$. Furthermore,
    we require \code{hinv} to be the inverse of the reverse of \code{h}.
    The algorithm used is the Brent-Kung matrix algorithm.

void
_nmod_poly_reduce_matrix_mod_poly (nmod_mat_t A, const nmod_mat_t B,
                          const nmod_poly_t f)

    Sets the ith row of \code{A} to the reduction of the ith row of $B$ modulo
    $f$ for $i=1,\ldots,\sqrt{\deg(f)}$. We require $B$ to be at least
    a $\sqrt{\deg(f)}\times \deg(f)$ matrix and $f$ to be nonzero.

void
_nmod_poly_precompute_matrix (nmod_mat_t A, mp_srcptr f, mp_srcptr g,
               slong leng, mp_srcptr ginv, slong lenginv, nmod_t mod)

    Sets the ith row of \code{A} to $f^i$ modulo $g$ for
    $i=1,\ldots,\sqrt{\deg(g)}$. We require $A$ to be
    a $\sqrt{\deg(g)}\times \deg(g)$ matrix. We require
    \code{ginv} to be the inverse of the reverse of \code{g} and $g$ to be
    nonzero.

void
nmod_poly_precompute_matrix (nmod_mat_t A, const nmod_poly_t f,
                          const nmod_poly_t g, const nmod_poly_t ginv)

    Sets the ith row of \code{A} to $f^i$ modulo $g$ for
    $i=1,\ldots,\sqrt{\deg(g)}$. We require $A$ to be
    a $\sqrt{\deg(g)}\times \deg(g)$ matrix. We require
    \code{ginv} to be the inverse of the reverse of \code{g}.


void
_nmod_poly_compose_mod_brent_kung_precomp_preinv(mp_ptr res, mp_srcptr f,
                            slong lenf, const nmod_mat_t A, mp_srcptr h,
                            slong lenh, mp_srcptr hinv, slong lenhinv,
                            nmod_t mod)

    Sets \code{res} to the composition $f(g)$ modulo $h$. We require that
    $h$ is nonzero. We require that the ith row of $A$ contains $g^i$ for
    $i=1,\ldots,\sqrt{\deg(h)}$, i.e. $A$ is a
    $\sqrt{\deg(h)}\times \deg(h)$ matrix. We also require that
    the length of $f$ is less than the length of $h$. Furthermore, we require
    \code{hinv} to be the inverse of the reverse of \code{h}.
    The output is not allowed to be aliased with any of the inputs.

    The algorithm used is the Brent-Kung matrix algorithm.

void
nmod_poly_compose_mod_brent_kung_precomp_preinv(nmod_poly_t res,
                    const nmod_poly_t f, const nmod_mat_t A,
                    const nmod_poly_t h, const nmod_poly_t hinv)

    Sets \code{res} to the composition $f(g)$ modulo $h$. We require that the
    ith row of $A$ contains $g^i$ for $i=1,\ldots,\sqrt{\deg(h)}$, i.e. $A$ is a
    $\sqrt{\deg(h)}\times \deg(h)$ matrix. We require that $h$ is nonzero and
    that $f$ has smaller degree than $h$. Furthermore, we require \code{hinv} to
    be the inverse of the reverse of \code{h}. This version of Brent-Kung
    modular composition is particularly useful if one has to perform several
    modular compositions of the form $f(g)$ modulo $h$ for fixed $g$ and $h$.

void _nmod_poly_compose_mod(mp_ptr res,
    mp_srcptr f, slong lenf, mp_srcptr g, mp_srcptr h, slong lenh, nmod_t mod)

    Sets \code{res} to the composition $f(g)$ modulo $h$. We require that
    $h$ is nonzero and that the length of $g$ is one less than the
    length of $h$ (possibly with zero padding). The output is not allowed
    to be aliased with any of the inputs.

void nmod_poly_compose_mod(nmod_poly_t res,
                    const nmod_poly_t f, const nmod_poly_t g,
                    const nmod_poly_t h)

    Sets \code{res} to the composition $f(g)$ modulo $h$. We require that
    $h$ is nonzero.


*******************************************************************************

    Greatest common divisor

*******************************************************************************

slong _nmod_poly_gcd_euclidean(mp_ptr G,
                  mp_srcptr A, slong lenA, mp_srcptr B, slong lenB, nmod_t mod)

    Computes the GCD of $A$ of length \code{lenA} and $B$ of length
    \code{lenB}, where \code{lenA >= lenB > 0}. The length of the GCD $G$
    is returned by the function. No attempt is made to make the GCD monic. It
    is required that $G$ have space for \code{lenB} coefficients.

void nmod_poly_gcd_euclidean(nmod_poly_t G,
                             const nmod_poly_t A, const nmod_poly_t B)

    Computes the GCD of $A$ and $B$. The GCD of zero polynomials is
    defined to be zero, whereas the GCD of the zero polynomial and some other
    polynomial $P$ is defined to be $P$. Except in the case where
    the GCD is zero, the GCD $G$ is made monic.

slong _nmod_poly_hgcd(mp_ptr *M, slong *lenM,
                     mp_ptr A, slong *lenA, mp_ptr B, slong *lenB,
                     mp_srcptr a, slong lena, mp_srcptr b, slong lenb,
                     nmod_t mod)

    Computes the HGCD of $a$ and $b$, that is, a matrix~$M$, a sign~$\sigma$
    and two polynomials $A$ and $B$ such that
    \begin{equation*}
    (A,B)^t = \sigma M^{-1} (a,b)^t.
    \end{equation*}

    Assumes that $\len(a) > \len(b) > 0$.

    Assumes that $A$ and $B$ have space of size at least $\len(a)$
    and $\len(b)$, respectively.  On exit, \code{*lenA} and \code{*lenB}
    will contain the correct lengths of $A$ and $B$.

    Assumes that \code{M[0]}, \code{M[1]}, \code{M[2]}, and \code{M[3]}
    each point to a vector of size at least $\len(a)$.

slong _nmod_poly_gcd_hgcd(mp_ptr G, mp_srcptr A, slong lenA,
                                   mp_srcptr B, slong lenB, nmod_t mod)

    Computes the monic GCD of $A$ and $B$, assuming that
    $\len(A) \geq \len(B) > 0$.

    Assumes that $G$ has space for $\len(B)$ coefficients and
    returns the length of $G$ on output.

void nmod_poly_gcd_hgcd(nmod_poly_t G,
                        const nmod_poly_t A, const nmod_poly_t B)

    Computes the monic GCD of $A$ and $B$ using the HGCD algorithm.

    As a special case, the GCD of two zero polynomials is defined to be
    the zero polynomial.

    The time complexity of the algorithm is $\mathcal{O}(n \log^2 n)$.
    For further details, see~\citep{ThullYap1990}.

slong _nmod_poly_gcd(mp_ptr G,
                  mp_srcptr A, slong lenA, mp_srcptr B, slong lenB, nmod_t mod)

    Computes the GCD of $A$ of length \code{lenA} and $B$ of length
    \code{lenB}, where \code{lenA >= lenB > 0}. The length of the GCD $G$
    is returned by the function. No attempt is made to make the GCD monic. It
    is required that $G$ have space for \code{lenB} coefficients.

void nmod_poly_gcd(nmod_poly_t G,
                             const nmod_poly_t A, const nmod_poly_t B)

    Computes the GCD of $A$ and $B$. The GCD of zero polynomials is
    defined to be zero, whereas the GCD of the zero polynomial and some other
    polynomial $P$ is defined to be $P$. Except in the case where
    the GCD is zero, the GCD $G$ is made monic.

slong _nmod_poly_xgcd_euclidean(mp_ptr G, mp_ptr S, mp_ptr T,
             mp_srcptr A, slong A_len, mp_srcptr B, slong B_len, nmod_t mod)

    Computes the GCD of $A$ and $B$ together with cofactors $S$ and $T$
    such that $S A + T B = G$.  Returns the length of $G$.

    Assumes that $\len(A) \geq \len(B) \geq 1$ and
    $(\len(A),\len(B)) \neq (1,1)$.

    No attempt is made to make the GCD monic.

    Requires that $G$ have space for $\len(B)$ coefficients.  Writes
    $\len(B)-1$ and $\len(A)-1$ coefficients to $S$ and $T$, respectively.
    Note that, in fact, $\len(S) \leq \max(\len(B) - \len(G), 1)$ and
    $\len(T) \leq \max(\len(A) - \len(G), 1)$.

    No aliasing of input and output operands is permitted.

void nmod_poly_xgcd_euclidean(nmod_poly_t G, nmod_poly_t S, nmod_poly_t T,
                                    const nmod_poly_t A, const nmod_poly_t B)

    Computes the GCD of $A$ and $B$. The GCD of zero polynomials is
    defined to be zero, whereas the GCD of the zero polynomial and some other
    polynomial $P$ is defined to be $P$. Except in the case where
    the GCD is zero, the GCD $G$ is made monic.

    Polynomials \code{S} and \code{T} are computed such that
    \code{S*A + T*B = G}. The length of \code{S} will be at most
    \code{lenB} and the length of \code{T} will be at most \code{lenA}.

slong _nmod_poly_xgcd_hgcd(mp_ptr G, mp_ptr S, mp_ptr T,
             mp_srcptr A, slong A_len, mp_srcptr B, slong B_len, nmod_t mod)

    Computes the GCD of $A$ and $B$, where $\len(A) \geq \len(B) > 0$,
    together with cofactors $S$ and $T$ such that $S A + T B = G$. Returns
    the length of $G$.

    No attempt is made to make the GCD monic.

    Requires that $G$ have space for $\len(B)$ coefficients.  Writes
    $\len(B) - 1$ and $\len(A) - 1$ coefficients to $S$ and $T$,
    respectively.  Note that, in fact, $\len(S) \leq \len(B) - \len(G)$
    and $\len(T) \leq \len(A) - \len(G)$.

    Both $S$ and $T$ must have space for at least $2$ coefficients.

    No aliasing of input and output operands is permitted.

void nmod_poly_xgcd_hgcd(nmod_poly_t G, nmod_poly_t S, nmod_poly_t T,
                         const nmod_poly_t A, const nmod_poly_t B)

    Computes the GCD of $A$ and $B$. The GCD of zero polynomials is
    defined to be zero, whereas the GCD of the zero polynomial and some other
    polynomial $P$ is defined to be $P$. Except in the case where
    the GCD is zero, the GCD $G$ is made monic.

    Polynomials \code{S} and \code{T} are computed such that
    \code{S*A + T*B = G}. The length of \code{S} will be at most
    \code{lenB} and the length of \code{T} will be at most \code{lenA}.

slong _nmod_poly_xgcd(mp_ptr G, mp_ptr S, mp_ptr T,
                     mp_srcptr A, slong lenA, mp_srcptr B, slong lenB,
                     nmod_t mod)

    Computes the GCD of $A$ and $B$, where $\len(A) \geq \len(B) > 0$,
    together with cofactors $S$ and $T$ such that $S A + T B = G$. Returns
    the length of $G$.

    No attempt is made to make the GCD monic.

    Requires that $G$ have space for $\len(B)$ coefficients.  Writes
    $\len(B) - 1$ and $\len(A) - 1$ coefficients to $S$ and $T$,
    respectively.  Note that, in fact, $\len(S) \leq \len(B) - \len(G)$
    and $\len(T) \leq \len(A) - \len(G)$.

    No aliasing of input and output operands is permitted.

void nmod_poly_xgcd(nmod_poly_t G, nmod_poly_t S, nmod_poly_t T,
                                    const nmod_poly_t A, const nmod_poly_t B)

    Computes the GCD of $A$ and $B$. The GCD of zero polynomials is
    defined to be zero, whereas the GCD of the zero polynomial and some other
    polynomial $P$ is defined to be $P$. Except in the case where
    the GCD is zero, the GCD $G$ is made monic.

    The polynomials \code{S} and \code{T} are set such that
    \code{S*A + T*B = G}. The length of \code{S} will be at most
    \code{lenB} and the length of \code{T} will be at most \code{lenA}.

mp_limb_t
_nmod_poly_resultant_euclidean(mp_srcptr poly1, slong len1,
                               mp_srcptr poly2, slong len2, nmod_t mod)

    Returns the resultant of \code{(poly1, len1)} and
    \code{(poly2, len2)} using the Euclidean algorithm.

    Assumes that \code{len1 >= len2 > 0}.

    Assumes that the modulus is prime.

mp_limb_t
nmod_poly_resultant_euclidean(const nmod_poly_t f, const nmod_poly_t g)

    Computes the resultant of $f$ and $g$ using the Euclidean algorithm.

    For two non-zero polynomials $f(x) = a_m x^m + \dotsb + a_0$ and
    $g(x) = b_n x^n + \dotsb + b_0$ of degrees $m$ and $n$, the resultant
    is defined to be
    \begin{equation*}
        a_m^n b_n^m \prod_{(x, y) : f(x) = g(y) = 0} (x - y).
    \end{equation*}
    For convenience, we define the resultant to be equal to zero if either
    of the two polynomials is zero.

mp_limb_t
_nmod_poly_resultant(mp_srcptr poly1, slong len1,
                     mp_srcptr poly2, slong len2, nmod_t mod)

    Returns the resultant of \code{(poly1, len1)} and
    \code{(poly2, len2)}.

    Assumes that \code{len1 >= len2 > 0}.

    Assumes that the modulus is prime.

mp_limb_t
nmod_poly_resultant(const nmod_poly_t f, const nmod_poly_t g)

    Computes the resultant of $f$ and $g$.

    For two non-zero polynomials $f(x) = a_m x^m + \dotsb + a_0$ and
    $g(x) = b_n x^n + \dotsb + b_0$ of degrees $m$ and $n$, the resultant
    is defined to be
    \begin{equation*}
        a_m^n b_n^m \prod_{(x, y) : f(x) = g(y) = 0} (x - y).
    \end{equation*}
    For convenience, we define the resultant to be equal to zero if either
    of the two polynomials is zero.

slong _nmod_poly_gcdinv(mp_ptr G, mp_ptr S,
                        mp_srcptr A, slong lenA, mp_srcptr B, slong lenB,
                        const nmod_t mod)

    Computes \code{(G, lenA)}, \code{(S, lenB-1)} such that
    $G \cong S A \pmod{B}$, returning the actual length of $G$.

    Assumes that $0 < \len(A) < \len(B)$.

void nmod_poly_gcdinv(nmod_poly_t G, nmod_poly_t S,
                      const nmod_poly_t A, const nmod_poly_t B)

    Computes polynomials $G$ and $S$, both reduced modulo~$B$,
    such that $G \cong S A \pmod{B}$, where $B$ is assumed to
    have $\len(B) \geq 2$.

    In the case that $A = 0 \pmod{B}$, returns $G = S = 0$.

int _nmod_poly_invmod(mp_ptr A, mp_srcptr B, slong lenB,
                      mp_srcptr P, slong lenP, const nmod_t mod)

    Attempts to set \code{(A, lenP-1)} to the inverse of \code{(B, lenB)}
    modulo the polynomial \code{(P, lenP)}.  Returns $1$ if \code{(B, lenB)}
    is invertible and $0$ otherwise.

    Assumes that $0 < \len(B) < \len(P)$, and hence also $\len(P) \geq 2$,
    but supports zero-padding in \code{(B, lenB)}.

    Does not support aliasing.

    Assumes that the modulus \code{mod} is a prime number.

int nmod_poly_invmod(nmod_poly_t A, const nmod_poly_t B, const nmod_poly_t P)

    Attempts to set $A$ to the inverse of $B$ modulo $P$ in the polynomial
    ring $(\mathbf{Z}/p\mathbf{Z})[X]$, where we assume that $p$ is a prime
    number.

    If $\len(P) < 2$, raises an exception.

    If the greatest common divisor of $B$ and $P$ is~$1$, returns~$1$ and
    sets $A$ to the inverse of $B$.  Otherwise, returns~$0$ and the value
    of $A$ on exit is undefined.


*******************************************************************************

    Power series composition

*******************************************************************************

void _nmod_poly_compose_series_horner(mp_ptr res,
        mp_srcptr poly1, slong len1, mp_srcptr poly2, slong len2, slong n)

    Sets \code{res} to the composition of \code{poly1} and \code{poly2}
    modulo $x^n$, where the constant term of \code{poly2} is required
    to be zero.

    Assumes that \code{len1, len2, n > 0}, that \code{len1, len2 <= n},
    and that \code{(len1-1) * (len2-1) + 1 <= n}, and that \code{res} has
    space for \code{n} coefficients. Does not support aliasing between any
    of the inputs and the output.

    This implementation uses the Horner scheme.

void nmod_poly_compose_series_horner(nmod_poly_t res,
                    const nmod_poly_t poly1, const nmod_poly_t poly2, slong n)

    Sets \code{res} to the composition of \code{poly1} and \code{poly2}
    modulo $x^n$, where the constant term of \code{poly2} is required
    to be zero.

    This implementation uses the Horner scheme.

void _nmod_poly_compose_series_brent_kung(mp_ptr res, mp_srcptr poly1,
        slong len1, mp_srcptr poly2, slong len2, slong n)

    Sets \code{res} to the composition of \code{poly1} and \code{poly2}
    modulo $x^n$, where the constant term of \code{poly2} is required
    to be zero.

    Assumes that \code{len1, len2, n > 0}, that \code{len1, len2 <= n},
    and that\\ \code{(len1-1) * (len2-1) + 1 <= n}, and that \code{res} has
    space for \code{n} coefficients. Does not support aliasing between any
    of the inputs and the output.

    This implementation uses Brent-Kung algorithm 2.1 \cite{BrentKung1978}.

void nmod_poly_compose_series_brent_kung(nmod_poly_t res,
                const nmod_poly_t poly1, const nmod_poly_t poly2, slong n)

    Sets \code{res} to the composition of \code{poly1} and \code{poly2}
    modulo $x^n$, where the constant term of \code{poly2} is required
    to be zero.

    This implementation uses Brent-Kung algorithm 2.1 \cite{BrentKung1978}.

void _nmod_poly_compose_series_divconquer(mp_ptr res,
           mp_srcptr poly1, slong len1,
	   mp_srcptr poly2, slong len2, slong N, nmod_t mod)

    Composes \code{poly1} of length $\ell_1$ with \code{poly2} of
    length $\ell_2$ modulo $x^N$ and sets \code{res} to the result,
    i.e.\ evaluates \code{poly1} at \code{poly2}.

    Writes $\min\{(\ell_1 - 1)(\ell_2 - 1) + 1, N\}$ coefficients
    to the vector \code{res}.

    The algorithm used is the divide and conquer algorithm.
    It is assumed that $0 < \ell_1$ and $0 < \ell_2 \leq N$.

    Does not support aliasing between the inputs and the output.

void nmod_poly_compose_series_divconquer(nmod_poly_t res,
    const nmod_poly_t poly1, const nmod_poly_t poly2, slong N)

    Composes \code{poly1} with \code{poly2} modulo $x^N$ and sets \code{res}
    to the result, i.e.\ evaluates \code{poly1} at \code{poly2}.

    The algorithm used is the divide and conquer algorithm.

void _nmod_poly_compose_series(mp_ptr res, mp_srcptr poly1, slong len1,
                                      mp_srcptr poly2, slong len2, slong n)

    Sets \code{res} to the composition of \code{poly1} and \code{poly2}
    modulo $x^n$, where the constant term of \code{poly2} is required
    to be zero.

    Assumes that \code{len1, len2, n > 0}, that \code{len1, len2 <= n},
    and that\\ \code{(len1-1) * (len2-1) + 1 <= n}, and that \code{res} has
    space for \code{n} coefficients. Does not support aliasing between any
    of the inputs and the output.

    This implementation automatically switches between the Horner scheme
    and Brent-Kung algorithm 2.1 depending on the size of the inputs.

void nmod_poly_compose_series(nmod_poly_t res,
                    const nmod_poly_t poly1, const nmod_poly_t poly2, slong n)

    Sets \code{res} to the composition of \code{poly1} and \code{poly2}
    modulo $x^n$, where the constant term of \code{poly2} is required
    to be zero.

    This implementation automatically switches between the Horner scheme
    and Brent-Kung algorithm 2.1 depending on the size of the inputs.

*******************************************************************************

    Power series reversion

*******************************************************************************

void _nmod_poly_revert_series_lagrange(mp_ptr Qinv, mp_srcptr Q,
        slong n, nmod_t mod)

    Sets \code{Qinv} to the compositional inverse or reversion of \code{Q}
    as a power series, i.e. computes $Q^{-1}$ such that
    $Q(Q^{-1}(x)) = Q^{-1}(Q(x)) = x \bmod x^n$. The arguments must
    both have length \code{n} and may not be aliased.

    It is required that $Q_0 = 0$ and that $Q_1$ as well as the integers
    $1, 2, \ldots, n-1$ are invertible modulo the modulus.

    This implementation uses the Lagrange inversion formula.

void nmod_poly_revert_series_lagrange(nmod_poly_t Qinv,
            const nmod_poly_t Q, slong n)

    Sets \code{Qinv} to the compositional inverse or reversion of \code{Q}
    as a power series, i.e. computes $Q^{-1}$ such that
    $Q(Q^{-1}(x)) = Q^{-1}(Q(x)) = x \bmod x^n$.

    It is required that $Q_0 = 0$ and that $Q_1$ as well as the integers
    $1, 2, \ldots, n-1$ are invertible modulo the modulus.

    This implementation uses the Lagrange inversion formula.

void _nmod_poly_revert_series_lagrange_fast(mp_ptr Qinv, mp_srcptr Q,
        slong n, nmod_t mod)

    Sets \code{Qinv} to the compositional inverse or reversion of \code{Q}
    as a power series, i.e. computes $Q^{-1}$ such that
    $Q(Q^{-1}(x)) = Q^{-1}(Q(x)) = x \bmod x^n$. The arguments must
    both have length \code{n} and may not be aliased.

    It is required that $Q_0 = 0$ and that $Q_1$ as well as the integers
    $1, 2, \ldots, n-1$ are invertible modulo the modulus.

    This implementation uses a reduced-complexity implementation
    of the Lagrange inversion formula.

void nmod_poly_revert_series_lagrange_fast(nmod_poly_t Qinv,
            const nmod_poly_t Q, slong n)

    Sets \code{Qinv} to the compositional inverse or reversion of \code{Q}
    as a power series, i.e. computes $Q^{-1}$ such that
    $Q(Q^{-1}(x)) = Q^{-1}(Q(x)) = x \bmod x^n$.

    It is required that $Q_0 = 0$ and that $Q_1$ as well as the integers
    $1, 2, \ldots, n-1$ are invertible modulo the modulus.

    This implementation uses a reduced-complexity implementation
    of the Lagrange inversion formula.

void _nmod_poly_revert_series_newton(mp_ptr Qinv, mp_srcptr Q,
    slong n, nmod_t mod)

    Sets \code{Qinv} to the compositional inverse or reversion of \code{Q}
    as a power series, i.e. computes $Q^{-1}$ such that
    $Q(Q^{-1}(x)) = Q^{-1}(Q(x)) = x \bmod x^n$. The arguments must
    both have length \code{n} and may not be aliased.

    It is required that $Q_0 = 0$ and that $Q_1$ as well as the integers
    $1, 2, \ldots, n-1$ are invertible modulo the modulus.

    This implementation uses Newton iteration \cite{BrentKung1978}.

void nmod_poly_revert_series_newton(nmod_poly_t Qinv,
        const nmod_poly_t Q, slong n)

    Sets \code{Qinv} to the compositional inverse or reversion of \code{Q}
    as a power series, i.e. computes $Q^{-1}$ such that
    $Q(Q^{-1}(x)) = Q^{-1}(Q(x)) = x \bmod x^n$.

    It is required that $Q_0 = 0$ and that $Q_1$ as well as the integers
    $1, 2, \ldots, n-1$ are invertible modulo the modulus.

    This implementation uses Newton iteration \cite{BrentKung1978}.

void _nmod_poly_revert_series(mp_ptr Qinv, mp_srcptr Q, slong n, nmod_t mod)

    Sets \code{Qinv} to the compositional inverse or reversion of \code{Q}
    as a power series, i.e. computes $Q^{-1}$ such that
    $Q(Q^{-1}(x)) = Q^{-1}(Q(x)) = x \bmod x^n$. The arguments must
    both have length \code{n} and may not be aliased.

    It is required that $Q_0 = 0$ and that $Q_1$ as well as the integers
    $1, 2, \ldots, n-1$ are invertible modulo the modulus.

    This implementation automatically chooses between the Lagrange
    inversion formula and Newton iteration based on the size of the
    input.

void nmod_poly_revert_series(nmod_poly_t Qinv, const nmod_poly_t Q, slong n)

    Sets \code{Qinv} to the compositional inverse or reversion of \code{Q}
    as a power series, i.e. computes $Q^{-1}$ such that
    $Q(Q^{-1}(x)) = Q^{-1}(Q(x)) = x \bmod x^n$.

    It is required that $Q_0 = 0$ and that $Q_1$ as well as the integers
    $1, 2, \ldots, n-1$ are invertible modulo the modulus.

    This implementation automatically chooses between the Lagrange
    inversion formula and Newton iteration based on the size of the
    input.

*******************************************************************************

    Square roots

    The series expansions for $\sqrt{h}$ and $1/\sqrt{h}$ are defined
    by means of the generalised binomial theorem
    $$h^r = (1+y)^r =
        \sum_{k=0}^{\infty} {r \choose k} y^k.$$
    It is assumed that $h$ has constant term $1$ and that the coefficients
    $2^{-k}$ exist in the coefficient ring (i.e. $2$ must be invertible).


*******************************************************************************

void _nmod_poly_invsqrt_series(mp_ptr g, mp_srcptr h, slong n, nmod_t mod)

    Set the first $n$ terms of $g$ to the series expansion of $1/\sqrt{h}$.
    It is assumed that $n > 0$, that $h$ has constant term 1 and that $h$
    is zero-padded as necessary to length $n$. Aliasing is not permitted.

void nmod_poly_invsqrt_series(nmod_poly_t g, const nmod_poly_t h, slong n)

    Set $g$ to the series expansion of $1/\sqrt{h}$ to order $O(x^n)$.
    It is assumed that $h$ has constant term 1.

void _nmod_poly_sqrt_series(mp_ptr g, mp_srcptr h, slong n, nmod_t mod)

    Set the first $n$ terms of $g$ to the series expansion of $\sqrt{h}$.
    It is assumed that $n > 0$, that $h$ has constant term 1 and that $h$
    is zero-padded as necessary to length $n$. Aliasing is not permitted.

void nmod_poly_sqrt_series(nmod_poly_t g, const nmod_poly_t h, slong n)

    Set $g$ to the series expansion of $\sqrt{h}$ to order $O(x^n)$.
    It is assumed that $h$ has constant term 1.

int _nmod_poly_sqrt(mp_ptr s, mp_srcptr p, slong n, nmod_t mod)

    If \code{(p, n)} is a perfect square, sets \code{(s, n / 2 + 1)}
    to a square root of $p$ and returns 1. Otherwise returns 0.

int nmod_poly_sqrt(nmod_poly_t s, const nmod_poly_t p)

    If $p$ is a perfect square, sets $s$ to a square root of $p$
    and returns 1. Otherwise returns 0.

*******************************************************************************

    Transcendental functions

    The elementary transcendental functions of a formal power series $h$
    are defined as

    $$\exp(h(x)) = \sum_{k=0}^{\infty} \frac{(h(x))^k}{k!}$$

    $$\log(h(x)) = \int_0^x \frac{h'(t)}{h(t)} dt$$

    $$\operatorname{atan}(h(x)) = \int_0^x\frac{h'(t)}{1+(h(t))^2} dt$$

    $$\operatorname{atanh}(h(x)) = \int_0^x\frac{h'(t)}{1-(h(t))^2} dt$$

    $$\operatorname{asin}(h(x)) = \int_0^x\frac{h'(t)}{\sqrt{1-(h(t))^2}} dt$$

    $$\operatorname{asinh}(h(x)) = \int_0^x\frac{h'(t)}{\sqrt{1+(h(t))^2}} dt$$

    The functions sin, cos, tan, etc. are defined using standard inverse
    or functional relations.

    The logarithm function assumes that $h$ has constant term $1$. All
    other functions assume that $h$ has constant term $0$.

    All functions assume that the coefficient $1/k$ or $1/k!$ exists
    for all indices $k$. When computing to order $O(x^n)$, the modulus $p$
    must therefore be a prime satisfying $p \ge n$. Further, we always
    require that $p > 2$ in order to be able to multiply by $1/2$ for
    internal purposes.

    If the input does not satisfy all these conditions, results are undefined.

    Except where otherwise noted, functions are implemented with optimal
    (up to constants) complexity $O(M(n))$, where $M(n)$ is the cost
    of polynomial multiplication.

*******************************************************************************

void _nmod_poly_log_series_monomial_ui(mp_ptr g,
            mp_limb_t c, ulong r, slong n, nmod_t mod)

    Set $g = \log(1+cx^r) + O(x^n)$. Assumes $n > 0$, $r > 0$, and that
    the coefficient is reduced by the modulus. Works efficiently in linear
    time.

void nmod_poly_log_series_monomial_ui(nmod_poly_t g,
            mp_limb_t c, ulong r, slong n)

    Set $g = \log(1+cx^r) + O(x^n)$. Works efficiently in linear time.

void _nmod_poly_log_series(mp_ptr g, mp_srcptr h, slong n, nmod_t mod)

    Set $g = \log(h) + O(x^n)$. Assumes $n > 0$ and that $h$ is zero-padded
    as necessary to length $n$. Aliasing of $g$ and $h$ is allowed.

void nmod_poly_log_series(nmod_poly_t g, const nmod_poly_t h, slong n)

    Set $g = \log(h) + O(x^n)$. The case $h = 1+cx^r$ is automatically
    detected and handled efficiently.

void _nmod_poly_exp_series_monomial_ui(mp_ptr g,
            mp_limb_t c, ulong r, slong n, nmod_t mod)

    Set $g = \exp(cx^r) + O(x^n)$. Assumes $n > 0$, $r > 0$, and that
    the coefficient is reduced by the modulus. Works efficiently
    in linear time.

void nmod_poly_exp_series_monomial_ui(nmod_poly_t g,
            mp_limb_t c, ulong r, slong n)

    Set $g = \exp(cx^r) + O(x^n)$. Works efficiently in linear time.

void _nmod_poly_exp_series_basecase(mp_ptr g, mp_srcptr h, slong hlen,
                                        slong n, nmod_t mod)

    Set $g = \exp(h) + O(x^n)$ using a simple $O(n^2)$ algorithm.
    Assumes $n > 0$ and $\operatorname{hlen} > 0$. Only the first
    $\operatorname{hlen}$ coefficients of $h$ will be read.
    Aliasing of $g$ and $h$ is allowed.

void nmod_poly_exp_series_basecase(nmod_poly_t g, const nmod_poly_t h, slong n)

    Set $g = \exp(h) + O(x^n)$ using a simple $O(n^2)$ algorithm.

void _nmod_poly_exp_series(mp_ptr g, mp_srcptr h, slong n, nmod_t mod)

    Set $g = \exp(h) + O(x^n)$. Assumes $n > 0$ and that $h$ is zero-padded
    as necessary to length $n$. Aliasing of $g$ and $h$ is not allowed.

    Uses Newton iteration (the version given in \cite{HanZim2004}).
    For small $n$, falls back to the basecase algorithm.

void  _nmod_poly_exp_expinv_series(mp_ptr f, mp_ptr g, mp_srcptr h,
        slong n, nmod_t mod)

    Set $f = \exp(h) + O(x^n)$ and $g = \exp(-h) + O(x^n)$, more efficiently
    for large $n$ than performing a separate inversion to obtain $g$.
    Assumes $n > 0$ and that $h$ is zero-padded
    as necessary to length $n$. Aliasing is not allowed.

    Uses Newton iteration (the version given in \cite{HanZim2004}).
    For small $n$, falls back to the basecase algorithm.

void nmod_poly_exp_series(nmod_poly_t g, const nmod_poly_t h, slong n)

    Set $g = \exp(h) + O(x^n)$. The case $h = cx^r$ is automatically
    detected and handled efficiently. Otherwise this function automatically
    uses the basecase algorithm for small $n$ and Newton iteration otherwise.

void _nmod_poly_atan_series(mp_ptr g, mp_srcptr h, slong n, nmod_t mod)

    Set $g = \operatorname{atan}(h) + O(x^n)$. Assumes $n > 0$ and that $h$
    is zero-padded as necessary to length $n$. Aliasing of $g$ and $h$ is
    allowed.

void nmod_poly_atan_series(nmod_poly_t g, const nmod_poly_t h, slong n)

    Set $g = \operatorname{atan}(h) + O(x^n)$.

void _nmod_poly_atanh_series(mp_ptr g, mp_srcptr h, slong n, nmod_t mod)

    Set $g = \operatorname{atanh}(h) + O(x^n)$. Assumes $n > 0$ and that $h$
    is zero-padded as necessary to length $n$. Aliasing of $g$ and $h$ is
    allowed.

void nmod_poly_atanh_series(nmod_poly_t g, const nmod_poly_t h, slong n)

    Set $g = \operatorname{atanh}(h) + O(x^n)$.

void _nmod_poly_asin_series(mp_ptr g, mp_srcptr h, slong n, nmod_t mod)

    Set $g = \operatorname{asin}(h) + O(x^n)$. Assumes $n > 0$ and that $h$
    is zero-padded as necessary to length $n$. Aliasing of $g$ and $h$ is
    allowed.

void nmod_poly_asin_series(nmod_poly_t g, const nmod_poly_t h, slong n)

    Set $g = \operatorname{asin}(h) + O(x^n)$.

void _nmod_poly_asinh_series(mp_ptr g, mp_srcptr h, slong n, nmod_t mod)

    Set $g = \operatorname{asinh}(h) + O(x^n)$. Assumes $n > 0$ and that $h$
    is zero-padded as necessary to length $n$. Aliasing of $g$ and $h$ is
    allowed.

void nmod_poly_asinh_series(nmod_poly_t g, const nmod_poly_t h, slong n)

    Set $g = \operatorname{asinh}(h) + O(x^n)$.

void _nmod_poly_sin_series(mp_ptr g, mp_srcptr h, slong n, nmod_t mod)

    Set $g = \operatorname{sin}(h) + O(x^n)$. Assumes $n > 0$ and that $h$
    is zero-padded as necessary to length $n$. Aliasing of $g$ and $h$ is
    allowed. The value is computed using the identity
    $\sin(x) = 2 \tan(x/2) / (1 + \tan^2(x/2)).$

void nmod_poly_sin_series(nmod_poly_t g, const nmod_poly_t h, slong n)

    Set $g = \operatorname{sin}(h) + O(x^n)$.

void _nmod_poly_cos_series(mp_ptr g, mp_srcptr h, slong n, nmod_t mod)

    Set $g = \operatorname{cos}(h) + O(x^n)$. Assumes $n > 0$ and that $h$
    is zero-padded as necessary to length $n$. Aliasing of $g$ and $h$ is
    allowed. The value is computed using the identity
    $\cos(x) = (1-\tan^2(x/2)) / (1 + \tan^2(x/2)).$

void nmod_poly_cos_series(nmod_poly_t g, const nmod_poly_t h, slong n)

    Set $g = \operatorname{cos}(h) + O(x^n)$.

void _nmod_poly_tan_series(mp_ptr g, mp_srcptr h, slong n, nmod_t mod)

    Set $g = \operatorname{tan}(h) + O(x^n)$. Assumes $n > 0$ and that $h$
    is zero-padded as necessary to length $n$. Aliasing of $g$ and $h$ is
    not allowed. Uses Newton iteration to invert the atan function.

void nmod_poly_tan_series(nmod_poly_t g, const nmod_poly_t h, slong n)

    Set $g = \operatorname{tan}(h) + O(x^n)$.

void _nmod_poly_sinh_series(mp_ptr g, mp_srcptr h, slong n, nmod_t mod)

    Set $g = \operatorname{sinh}(h) + O(x^n)$. Assumes $n > 0$ and that $h$
    is zero-padded as necessary to length $n$. Aliasing of $g$ and $h$ is
    not allowed. Uses the identity $\sinh(x) = (e^x - e^{-x})/2$.

void nmod_poly_sinh_series(nmod_poly_t g, const nmod_poly_t h, slong n)

    Set $g = \operatorname{sinh}(h) + O(x^n)$.

void _nmod_poly_cosh_series(mp_ptr g, mp_srcptr h, slong n, nmod_t mod)

    Set $g = \operatorname{cosh}(h) + O(x^n)$. Assumes $n > 0$ and that $h$
    is zero-padded as necessary to length $n$. Aliasing of $g$ and $h$ is
    not allowed. Uses the identity $\cosh(x) = (e^x + e^{-x})/2$.

void nmod_poly_cosh_series(nmod_poly_t g, const nmod_poly_t h, slong n)

    Set $g = \operatorname{cosh}(h) + O(x^n)$.

void _nmod_poly_tanh_series(mp_ptr g, mp_srcptr h, slong n, nmod_t mod)

    Set $g = \operatorname{tanh}(h) + O(x^n)$. Assumes $n > 0$ and that $h$
    is zero-padded as necessary to length $n$. Uses the identity
    $\tanh(x) = (e^{2x}-1)/(e^{2x}+1)$.

void nmod_poly_tanh_series(nmod_poly_t g, const nmod_poly_t h, slong n)

    Set $g = \operatorname{tanh}(h) + O(x^n)$.

*******************************************************************************

    Products

*******************************************************************************

void _nmod_poly_product_roots_nmod_vec(mp_ptr poly, mp_srcptr xs,
    slong n, nmod_t mod)

    Sets \code{(poly, n + 1)} to the monic polynomial which is the product
    of $(x - x_0)(x - x_1) \cdots (x - x_{n-1})$, the roots $x_i$ being
    given by \code{xs}.

    Aliasing of the input and output is not allowed.

void nmod_poly_product_roots_nmod_vec(nmod_poly_t poly, mp_srcptr xs, slong n)

    Sets \code{poly} to the monic polynomial which is the product
    of $(x - x_0)(x - x_1) \cdots (x - x_{n-1})$, the roots $x_i$ being
    given by \code{xs}.

*******************************************************************************

    Subproduct trees

*******************************************************************************

mp_ptr * _nmod_poly_tree_alloc(slong len)

    Allocates space for a subproduct tree of the given length, having
    linear factors at the lowest level.

    Writing $n$ for \code{len}, entry $i$ in the tree is a pointer to a
    single array of limbs, capable of storing $\lfloor n / 2^i \rfloor$
    subproducts of degree $2^i$ adjacently, plus a trailing entry if
    $n / 2^i$ is not an integer.

    For example, a tree of length 7 built from monic linear factors has
    the following structure, where spaces have been inserted
    for illustrative purposes:

    \begin{verbatim}
       X1 X1 X1 X1 X1 X1 X1
       XX1   XX1   XX1   X1
       XXXX1       XX1   X1
       XXXXXXX1
    \end{verbatim}

void _nmod_poly_tree_free(mp_ptr * tree, slong len)

    Frees the space allocated for the subproduct tree.

void _nmod_poly_tree_build(mp_ptr * tree, mp_srcptr roots, slong len,
    nmod_t mod)

    Builds a subproduct tree in the preallocated space from
    the \code{len} monic linear factors $(x-r_i)$. The top level
    product is not computed.


*******************************************************************************

    Inflation and deflation

*******************************************************************************

void nmod_poly_inflate(nmod_poly_t result, const nmod_poly_t input,
    ulong inflation)

    Sets \code{result} to the inflated polynomial $p(x^n)$ where
    $p$ is given by \code{input} and $n$ is given by \code{inflation}.

void nmod_poly_deflate(nmod_poly_t result, const nmod_poly_t input,
    ulong deflation)

    Sets \code{result} to the deflated polynomial $p(x^{1/n})$ where
    $p$ is given by \code{input} and $n$ is given by \code{deflation}.
    Requires $n > 0$.

ulong nmod_poly_deflation(const nmod_poly_t input)

    Returns the largest integer by which \code{input} can be deflated.
    As special cases, returns 0 if \code{input} is the zero polynomial
    and 1 if \code{input} is a constant polynomial.
