// Copyright (c) the JPEG XL Project Authors. All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// This file contains a generic implementation of large (>32x32) 2d IDCTs.
// They are not implemented in the same way as smaller 2d IDCTs to reduce code size.

#![allow(clippy::excessive_precision)]

use std::f32::consts::SQRT_2;

use jxl_simd::F32SimdVec;
use jxl_simd::SimdDescriptor;

use crate::idct_32;

/// Merge weights used by `idct_impl_inner` when combining two 32-point results
/// into a 64-point IDCT: entry `i` scales the odd-coefficient half before it is
/// added to / subtracted from the even-coefficient half.
///
/// NOTE(review): entry `i` appears to equal `1 / (2 * cos((i + 0.5) * PI / 64))`
/// (spot-checked on the first and last entries) — confirm against the generator.
const WC_WEIGHTS_64: [f32; 32] = [
    0.500150636020651,
    0.5013584524464084,
    0.5037887256810443,
    0.5074711720725553,
    0.5124514794082247,
    0.5187927131053328,
    0.52657731515427,
    0.535909816907992,
    0.5469204379855088,
    0.5597698129470802,
    0.57465518403266,
    0.5918185358574165,
    0.6115573478825099,
    0.6342389366884031,
    0.6603198078137061,
    0.6903721282002123,
    0.7251205223771985,
    0.7654941649730891,
    0.8127020908144905,
    0.8683447152233481,
    0.9345835970364075,
    1.0144082649970547,
    1.1120716205797176,
    1.233832737976571,
    1.3892939586328277,
    1.5939722833856311,
    1.8746759800084078,
    2.282050068005162,
    2.924628428158216,
    4.084611078129248,
    6.796750711673633,
    20.373878167231453,
];

/// Merge weights used by `idct_impl_inner` when combining two 64-point results
/// into a 128-point IDCT: entry `i` scales the odd-coefficient half before it is
/// added to / subtracted from the even-coefficient half.
///
/// NOTE(review): entry `i` appears to equal `1 / (2 * cos((i + 0.5) * PI / 128))`
/// (spot-checked on the first entry) — confirm against the generator.
const WC_WEIGHTS_128: [f32; 64] = [
    0.5000376519155477,
    0.5003390374428216,
    0.5009427176380873,
    0.5018505174842379,
    0.5030651913013697,
    0.5045904432216454,
    0.5064309549285542,
    0.5085924210498143,
    0.5110815927066812,
    0.5139063298475396,
    0.5170756631334912,
    0.5205998663018917,
    0.524490540114724,
    0.5287607092074876,
    0.5334249333971333,
    0.538499435291984,
    0.5440022463817783,
    0.549953374183236,
    0.5563749934898856,
    0.5632916653417023,
    0.5707305880121454,
    0.5787218851348208,
    0.5872989370937893,
    0.5964987630244563,
    0.606362462272146,
    0.6169357260050706,
    0.6282694319707711,
    0.6404203382416639,
    0.6534518953751283,
    0.6674352009263413,
    0.6824501259764195,
    0.6985866506472291,
    0.7159464549705746,
    0.7346448236478627,
    0.7548129391165311,
    0.776600658233963,
    0.8001798956216941,
    0.8257487738627852,
    0.8535367510066064,
    0.8838110045596234,
    0.9168844461846523,
    0.9531258743921193,
    0.9929729612675466,
    1.036949040910389,
    1.0856850642580145,
    1.1399486751015042,
    1.2006832557294167,
    1.2690611716991191,
    1.346557628206286,
    1.4350550884414341,
    1.5369941008524954,
    1.6555965242641195,
    1.7952052190778898,
    1.961817848571166,
    2.163957818751979,
    2.4141600002500763,
    2.7316450287739396,
    3.147462191781909,
    3.7152427383269746,
    4.5362909369693565,
    5.827688377844654,
    8.153848602466814,
    13.58429025728446,
    40.744688103351834,
];

/// Merge weights used by `idct_impl_inner` when combining two 128-point results
/// into a 256-point IDCT: entry `i` scales the odd-coefficient half before it is
/// added to / subtracted from the even-coefficient half.
///
/// NOTE(review): entry `i` appears to equal `1 / (2 * cos((i + 0.5) * PI / 256))`
/// (spot-checked on the first and last entries) — confirm against the generator.
const WC_WEIGHTS_256: [f32; 128] = [
    0.5000094125358878,
    0.500084723455784,
    0.5002354020255269,
    0.5004615618093246,
    0.5007633734146156,
    0.5011410648064231,
    0.5015949217281668,
    0.502125288230386,
    0.5027325673091954,
    0.5034172216566842,
    0.5041797745258774,
    0.5050208107132756,
    0.5059409776624396,
    0.5069409866925212,
    0.5080216143561264,
    0.509183703931388,
    0.5104281670536573,
    0.5117559854927805,
    0.5131682130825206,
    0.5146659778093218,
    0.516250484068288,
    0.5179230150949777,
    0.5196849355823947,
    0.5215376944933958,
    0.5234828280796439,
    0.52552196311921,
    0.5276568203859896,
    0.5298892183652453,
    0.5322210772308335,
    0.5346544231010253,
    0.537191392591309,
    0.5398342376841637,
    0.5425853309375497,
    0.545447171055775,
    0.5484223888484947,
    0.551513753605893,
    0.554724179920619,
    0.5580567349898085,
    0.5615146464335654,
    0.5651013106696203,
    0.5688203018875696,
    0.5726753816701664,
    0.5766705093136241,
    0.5808098529038624,
    0.5850978012111273,
    0.58953897647151,
    0.5941382481306648,
    0.5989007476325463,
    0.6038318843443582,
    0.6089373627182432,
    0.614223200800649,
    0.6196957502119484,
    0.6253617177319102,
    0.6312281886412079,
    0.6373026519855411,
    0.6435930279473415,
    0.6501076975307724,
    0.6568555347890955,
    0.6638459418498757,
    0.6710888870233562,
    0.6785949463131795,
    0.6863753486870501,
    0.6944420255086364,
    0.7028076645818034,
    0.7114857693151208,
    0.7204907235796304,
    0.7298378629074134,
    0.7395435527641373,
    0.749625274727372,
    0.7601017215162176,
    0.7709929019493761,
    0.7823202570613161,
    0.7941067887834509,
    0.8063772028037925,
    0.8191580674598145,
    0.83247799080191,
    0.8463678182968619,
    0.860860854031955,
    0.8759931087426972,
    0.8918035785352535,
    0.9083345588266809,
    0.9256319988042384,
    0.9437459026371479,
    0.962730784794803,
    0.9826461881778968,
    1.0035572754078206,
    1.0255355056139732,
    1.048659411496106,
    1.0730154944316674,
    1.0986992590905857,
    1.1258164135986009,
    1.1544842669978943,
    1.184833362908442,
    1.217009397314603,
    1.2511754798461228,
    1.287514812536712,
    1.326233878832723,
    1.3675662599582539,
    1.411777227500661,
    1.459169302866857,
    1.5100890297227016,
    1.5649352798258847,
    1.6241695131835794,
    1.6883285509131505,
    1.7580406092704062,
    1.8340456094306077,
    1.9172211551275689,
    2.0086161135167564,
    2.1094945286246385,
    2.22139377701127,
    2.346202662531156,
    2.486267909203593,
    2.644541877144861,
    2.824791402350551,
    3.0318994541759925,
    3.2723115884254845,
    3.5547153325075804,
    3.891107790700307,
    4.298537526449054,
    4.802076008665048,
    5.440166215091329,
    6.274908408039339,
    7.413566756422303,
    9.058751453879703,
    11.644627325175037,
    16.300023088031555,
    27.163977662448232,
    81.48784219222516,
];

/// Recursive, in-place 1D IDCT over `data.len()` SIMD vectors.
///
/// A length of 32 is forwarded directly to `idct_32`. Lengths of 64, 128 and 256
/// are handled by splitting the input into its even- and odd-indexed
/// coefficients, running a half-size IDCT on each, and merging the two results
/// with the matching `WC_WEIGHTS_*` table.
///
/// `scratch` must hold at least `data.len()` vectors and is overwritten. While
/// the half-size transforms run, `data` itself is reused as their scratch
/// buffer, so no storage beyond the two slices is required.
///
/// # Panics
///
/// Panics if `scratch` is shorter than `data`, or if `data.len()` is not one of
/// 32, 64, 128 or 256.
#[inline(always)]
fn idct_impl_inner<D: SimdDescriptor>(d: D, data: &mut [D::F32Vec], scratch: &mut [D::F32Vec]) {
    let n = data.len();
    assert!(scratch.len() >= n);

    // Base case: a 32-point transform is delegated to the dedicated `idct_32`,
    // whose 32 results are written straight back into `data[0..32]`.
    if n == 32 {
        d.call(
            #[inline(always)]
            |_| {
                (
                    data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7],
                    data[8], data[9], data[10], data[11], data[12], data[13], data[14], data[15],
                    data[16], data[17], data[18], data[19], data[20], data[21], data[22], data[23],
                    data[24], data[25], data[26], data[27], data[28], data[29], data[30], data[31],
                ) = idct_32(
                    d, data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7],
                    data[8], data[9], data[10], data[11], data[12], data[13], data[14], data[15],
                    data[16], data[17], data[18], data[19], data[20], data[21], data[22], data[23],
                    data[24], data[25], data[26], data[27], data[28], data[29], data[30], data[31],
                )
            },
        );
        return;
    }

    // Select the merge weights for this transform size.
    let wc_weights = match n {
        64 => &WC_WEIGHTS_64[..],
        128 => &WC_WEIGHTS_128[..],
        256 => &WC_WEIGHTS_256[..],
        _ => unreachable!("invalid large-dct size: {n}"),
    };

    assert_eq!(wc_weights.len(), n / 2);

    // De-interleave into scratch: even-indexed coefficients go to the first
    // half, odd-indexed coefficients to the second half.
    let (first_half, second_half) = scratch[..n].split_at_mut(n / 2);
    for i in 0..n / 2 {
        first_half[i] = data[i * 2];
        second_half[i] = data[2 * i + 1];
    }

    // Half-size IDCT of the even part. Everything in `data` has been copied out
    // above, so it can serve as the scratch buffer for the recursive call.
    d.call(
        #[inline(always)]
        |_| idct_impl_inner(d, first_half, data),
    );

    // Pre-process the odd part: every entry becomes the sum of itself and its
    // predecessor. Iterating from the top down ensures the predecessor still
    // holds its original value. The first entry is scaled by sqrt(2) instead.
    for i in (1..n / 2).rev() {
        second_half[i] += second_half[i - 1];
    }
    second_half[0] *= D::F32Vec::splat(d, SQRT_2);

    // Half-size IDCT of the pre-processed odd part, again using `data` as scratch.
    d.call(
        #[inline(always)]
        |_| idct_impl_inner(d, second_half, data),
    );

    // Merge: output `i` combines the even result with the weighted odd result,
    // and the mirrored output `n - 1 - i` uses the negated weighted odd result
    // (presumably `first_half[i] - second_half[i] * mul`; see `neg_mul_add`).
    for i in 0..n / 2 {
        let mul = D::F32Vec::splat(d, wc_weights[i]);
        data[i] = second_half[i].mul_add(mul, first_half[i]);
        data[n - i - 1] = second_half[i].neg_mul_add(mul, first_half[i]);
    }
}

/// Runs one strided 1D IDCT of length `storage.len()` in place on `data`.
///
/// Input `k` of the transform is read from `data[k * stride]`; after
/// `idct_impl_inner` has run on `storage`, output `k` is written back to the
/// same position. `scratch` is working memory for `idct_impl_inner`.
///
/// # Panics
///
/// Panics if the last strided position, `(storage.len() - 1) * stride`, is not
/// a valid index into `data`.
#[inline(always)]
fn do_idct<D: SimdDescriptor>(
    d: D,
    data: &mut [<D::F32Vec as F32SimdVec>::UnderlyingArray],
    stride: usize,
    storage: &mut [D::F32Vec],
    scratch: &mut [D::F32Vec],
) {
    let n = storage.len();
    assert!((n - 1) * stride < data.len());

    // Gather the strided inputs into contiguous SIMD vectors.
    for (k, lane) in storage.iter_mut().enumerate() {
        *lane = D::F32Vec::load_array(d, &data[k * stride]);
    }

    d.call(
        #[inline(always)]
        |d| idct_impl_inner(d, storage, scratch),
    );

    // Scatter the results back to where the inputs came from.
    for (k, lane) in storage.iter().enumerate() {
        lane.store_array(&mut data[k * stride]);
    }
}

/// Runs one 1D IDCT of length `storage.len()` in place over the first
/// `storage.len()` arrays of `data`, using a permuted access order.
///
/// With `row_stride = storage.len() / LEN`, transform input `k` lives at
/// `data[row_stride * (k % LEN) + k / LEN]`. The result is stored back through
/// the same mapping. `scratch` is working memory for `idct_impl_inner`.
///
/// # Panics
///
/// Panics if `storage.len()` is not a multiple of the vector length or if
/// `data` holds fewer than `storage.len()` arrays.
#[inline(always)]
fn do_idct_rowblock<D: SimdDescriptor>(
    d: D,
    data: &mut [<D::F32Vec as F32SimdVec>::UnderlyingArray],
    storage: &mut [D::F32Vec],
    scratch: &mut [D::F32Vec],
) {
    let lanes = D::F32Vec::LEN;
    let n = storage.len();
    assert!(n.is_multiple_of(lanes));
    assert!(data.len() >= n);

    let row_stride = n / lanes;
    // Position in `data` of the k-th transform input/output.
    let slot = |k: usize| row_stride * (k % lanes) + (k / lanes);

    for (k, lane) in storage.iter_mut().enumerate() {
        *lane = D::F32Vec::load_array(d, &data[slot(k)]);
    }

    d.call(
        #[inline(always)]
        |d| idct_impl_inner(d, storage, scratch),
    );

    for (k, lane) in storage.iter().enumerate() {
        lane.store_array(&mut data[slot(k)]);
    }
}

/// Runs one 1D IDCT of length `storage.len()` whose inputs are interleaved in
/// `data`, writing the outputs back in natural order.
///
/// With `row_stride = storage.len() / (2 * LEN)`, the even multiples of
/// `row_stride` feed the first half of the transform input and the odd
/// multiples feed the second half. After the transform, output `k` is stored at
/// `data[row_stride * k]`. `scratch` is working memory for `idct_impl_inner`.
///
/// # Panics
///
/// Panics if `storage.len()` is not a multiple of the vector length or if
/// `data` holds fewer than `storage.len()` arrays. Accesses beyond that sanity
/// check are still bounds-checked by slice indexing.
#[inline(always)]
fn do_idct_trh<D: SimdDescriptor>(
    d: D,
    data: &mut [<D::F32Vec as F32SimdVec>::UnderlyingArray],
    storage: &mut [D::F32Vec],
    scratch: &mut [D::F32Vec],
) {
    let n = storage.len();
    assert!(n.is_multiple_of(D::F32Vec::LEN));
    assert!(data.len() >= n);

    let half = n / 2;
    let row_stride = n / (2 * D::F32Vec::LEN);

    // Fill both halves of `storage` in lock-step from neighbouring strides.
    let (lower, upper) = storage.split_at_mut(half);
    for (k, (lo, hi)) in lower.iter_mut().zip(upper.iter_mut()).enumerate() {
        *lo = D::F32Vec::load_array(d, &data[row_stride * 2 * k]);
        *hi = D::F32Vec::load_array(d, &data[row_stride * (2 * k + 1)]);
    }

    d.call(
        #[inline(always)]
        |d| idct_impl_inner(d, storage, scratch),
    );

    for (k, lane) in storage.iter().enumerate() {
        lane.store_array(&mut data[row_stride * k]);
    }
}

/// In-place 2D IDCT of an `n` x `n` block stored row-major in `data`.
///
/// `data` is viewed as arrays of `LEN` floats, so each row consists of
/// `n / LEN` arrays. `storage` must hold at least `n` vectors; `scratch` is
/// working memory handed down to `do_idct`.
#[inline(always)]
fn idct2d_square<D: SimdDescriptor>(
    d: D,
    data: &mut [f32],
    n: usize,
    storage: &mut [D::F32Vec],
    scratch: &mut [D::F32Vec],
) {
    let blocks = D::F32Vec::make_array_slice_mut(data);
    let lanes = D::F32Vec::LEN;
    let chunks = n / lanes;

    // Pass 1: column IDCTs, one group of LEN columns at a time.
    for col in 0..chunks {
        d.call(
            #[inline(always)]
            |_| do_idct(d, &mut blocks[col..], chunks, &mut storage[..n], scratch),
        );
    }

    // Pass 2: transpose the matrix one LEN x LEN tile at a time. As soon as
    // every tile of a column group is in its final place, run the column IDCT
    // on that group.
    for diag in 0..chunks {
        // The tile on the diagonal is transposed where it sits.
        D::F32Vec::transpose_square(d, &mut blocks[diag * n + diag..], chunks);

        // Off-diagonal tiles are transposed and then exchanged with their mirror.
        for other in diag + 1..chunks {
            let below = other * n + diag;
            let right = diag * n + other;
            D::F32Vec::transpose_square(d, &mut blocks[below..], chunks);
            D::F32Vec::transpose_square(d, &mut blocks[right..], chunks);
            for k in 0..lanes {
                blocks.swap(right + k * chunks, below + k * chunks);
            }
        }

        d.call(
            #[inline(always)]
            |_| do_idct(d, &mut blocks[diag..], chunks, &mut storage[..n], scratch),
        );
    }
}

/// In-place 2D IDCT of a block with `r` rows and `c` columns where `r < c`.
///
/// `data` is viewed as arrays of `LEN` floats, so each row consists of
/// `c / LEN` arrays. `storage` must hold at least `c` vectors; `scratch` is
/// working memory handed down to the 1D transforms.
///
/// # Panics
///
/// Panics if `r >= c`.
#[inline(always)]
fn idct2d_wide<D: SimdDescriptor>(
    d: D,
    data: &mut [f32],
    c: usize,
    r: usize,
    storage: &mut [D::F32Vec],
    scratch: &mut [D::F32Vec],
) {
    assert!(r < c);
    let blocks = D::F32Vec::make_array_slice_mut(data);
    let lanes = D::F32Vec::LEN;
    let column_chunks = c / lanes;
    let row_chunks = r / lanes;

    // Pass 1: for every band of LEN rows, transpose each LEN x LEN tile of the
    // band and then run the length-`c` IDCT across the band.
    for band in 0..row_chunks {
        let band_start = band * c;
        for tile in 0..column_chunks {
            D::F32Vec::transpose_square(d, &mut blocks[band_start + tile..], column_chunks);
        }
        d.call(
            #[inline(always)]
            |_| do_idct_rowblock(d, &mut blocks[band_start..], &mut storage[..c], scratch),
        );
    }

    // Pass 2: for every group of LEN columns, transpose its tiles back and run
    // the length-`r` column IDCT.
    for col in 0..column_chunks {
        for band in 0..row_chunks {
            D::F32Vec::transpose_square(d, &mut blocks[band * c + col..], column_chunks);
        }
        d.call(
            #[inline(always)]
            |_| do_idct(d, &mut blocks[col..], column_chunks, &mut storage[..r], scratch),
        );
    }
}

/// In-place 2D IDCT of a block with `r` rows and `c` columns where `r > c`.
///
/// The input is expected in transposed layout (see the note in the body).
/// `data` is viewed as arrays of `LEN` floats. `storage` must hold at least `r`
/// vectors; `scratch` is working memory handed down to the 1D transforms.
///
/// NOTE(review): step 2 handles exactly two side-by-side sub-blocks (`0..2`),
/// which looks like it assumes `r == 2 * c`; only `r > c` is asserted — confirm
/// against the callers.
///
/// # Panics
///
/// Panics if `r <= c`.
#[inline(always)]
fn idct2d_thin<D: SimdDescriptor>(
    d: D,
    data: &mut [f32],
    c: usize,
    r: usize,
    storage: &mut [D::F32Vec],
    scratch: &mut [D::F32Vec],
) {
    assert!(r > c);
    let data = D::F32Vec::make_array_slice_mut(data);
    let column_chunks = c / D::F32Vec::LEN;
    let row_chunks = r / D::F32Vec::LEN;
    // Note: input is transposed, so in the beginning it has ROWS *columns* and COLS *rows*.
    // Step 1: do length-`c` column-IDCTs, one group of LEN columns at a time
    // (each stored row spans `row_chunks` arrays).
    for i in 0..row_chunks {
        d.call(
            #[inline(always)]
            |_| do_idct(d, &mut data[i..], row_chunks, &mut storage[..c], scratch),
        );
    }
    // Step 2: Incrementally transpose each square sub-block of the matrix, then do a
    // column-IDCT which also completes the transpose.
    for i in 0..column_chunks {
        // Transposes the LEN x LEN tile at tile-row `i`, tile-column `j` of
        // sub-block `l`; sub-block `l` starts `l * column_chunks` arrays into each row.
        let tr_block = |data: &mut [<D::F32Vec as F32SimdVec>::UnderlyingArray], i, j, l| {
            D::F32Vec::transpose_square(d, &mut data[i * r + j + l * column_chunks..], row_chunks)
        };

        // Diagonal tiles are transposed in place.
        (0..2).for_each(|l| tr_block(data, i, i, l));
        // Off-diagonal tiles are transposed, then exchanged with their mirror
        // tile within the same sub-block.
        for j in i + 1..column_chunks {
            (0..2).for_each(|l| tr_block(data, i, j, l));
            (0..2).for_each(|l| tr_block(data, j, i, l));
            for l in 0..2 {
                for k in 0..D::F32Vec::LEN {
                    data.swap(
                        i * r + j + k * row_chunks + l * column_chunks,
                        j * r + i + k * row_chunks + l * column_chunks,
                    );
                }
            }
        }
        // Length-`r` IDCT over this column group; `do_idct_trh` reads the
        // interleaved inputs and writes the outputs back in natural order.
        d.call(
            #[inline(always)]
            |_| do_idct_trh(d, &mut data[i..], &mut storage[..r], scratch),
        );
    }
}

/// Generates a public `fn $name(d, data)` that performs an in-place 2D IDCT on
/// a block described by the literals `$h` and `$w`.
///
/// The generated function allocates two stack buffers of `max($h, $w)` SIMD
/// vectors and dispatches to `idct2d_square`, `idct2d_wide` or `idct2d_thin`
/// depending on how `$w` compares with `$h`.
macro_rules! make_idct2d {
    ($name: ident, $h: literal, $w: literal) => {
        pub fn $name<D: SimdDescriptor>(d: D, data: &mut [f32]) {
            // Both buffers are sized for the longer of the two dimensions.
            const L: usize = if $w < $h { $h } else { $w };
            let mut storage = [D::F32Vec::zero(d); L];
            let mut scratch = [D::F32Vec::zero(d); L];
            if $w == $h {
                d.call(
                    #[inline(always)]
                    |_| idct2d_square(d, data, $w, &mut storage, &mut scratch),
                )
            } else if $w > $h {
                d.call(
                    #[inline(always)]
                    |_| idct2d_wide(d, data, $w, $h, &mut storage, &mut scratch),
                )
            } else {
                d.call(
                    #[inline(always)]
                    |_| idct2d_thin(d, data, $w, $h, &mut storage, &mut scratch),
                )
            }
        }
    };
}

// Public entry points for every supported large block size. The macro
// arguments are (name, $h, $w), and the function names follow `idct2d_<h>_<w>`.
make_idct2d!(idct2d_32_64, 32, 64);
make_idct2d!(idct2d_64_32, 64, 32);
make_idct2d!(idct2d_64_64, 64, 64);
make_idct2d!(idct2d_64_128, 64, 128);
make_idct2d!(idct2d_128_64, 128, 64);
make_idct2d!(idct2d_128_128, 128, 128);
make_idct2d!(idct2d_128_256, 128, 256);
make_idct2d!(idct2d_256_128, 256, 128);
make_idct2d!(idct2d_256_256, 256, 256);

/// Test-only entry point: runs a single strided 64-point 1D IDCT in place on
/// `data` via `do_idct`, using freshly zeroed stack buffers.
#[cfg(test)]
#[inline(always)]
pub fn do_idct_64<D: SimdDescriptor>(
    d: D,
    data: &mut [<D::F32Vec as F32SimdVec>::UnderlyingArray],
    stride: usize,
) {
    const N: usize = 64;
    let zero = D::F32Vec::zero(d);
    let (mut storage, mut scratch) = ([zero; N], [zero; N]);
    d.call(
        #[inline(always)]
        |_| do_idct(d, data, stride, &mut storage, &mut scratch),
    );
}

/// Test-only entry point: runs a single strided 128-point 1D IDCT in place on
/// `data` via `do_idct`, using freshly zeroed stack buffers.
#[cfg(test)]
#[inline(always)]
pub fn do_idct_128<D: SimdDescriptor>(
    d: D,
    data: &mut [<D::F32Vec as F32SimdVec>::UnderlyingArray],
    stride: usize,
) {
    const N: usize = 128;
    let zero = D::F32Vec::zero(d);
    let (mut storage, mut scratch) = ([zero; N], [zero; N]);
    d.call(
        #[inline(always)]
        |_| do_idct(d, data, stride, &mut storage, &mut scratch),
    );
}

/// Test-only entry point: runs a single strided 256-point 1D IDCT in place on
/// `data` via `do_idct`, using freshly zeroed stack buffers.
#[cfg(test)]
#[inline(always)]
pub fn do_idct_256<D: SimdDescriptor>(
    d: D,
    data: &mut [<D::F32Vec as F32SimdVec>::UnderlyingArray],
    stride: usize,
) {
    const N: usize = 256;
    let zero = D::F32Vec::zero(d);
    let (mut storage, mut scratch) = ([zero; N], [zero; N]);
    d.call(
        #[inline(always)]
        |_| do_idct(d, data, stride, &mut storage, &mut scratch),
    );
}