/*
 * Copyright (c) 2026, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <emmintrin.h>

#include "config/av1_rtcd.h"

// Evaluates the cubic interpolation polynomial
//   p[1] + 0.5 * x * (p[2] - p[0] +
//                     x * (2.0 * p[0] - 5.0 * p[1] + 4.0 * p[2] - p[3] +
//                          x * (3.0 * (p[1] - p[2]) + p[3] - p[0])))
// for two sets of four samples at the same position x, one set per SSE2 lane.
// p1 and p2 are each read at indices 0..3. The result for p1 is written to
// *rate_f and the result for p2 to *distbysse_f.
//
// The arithmetic below must follow exactly the same sequence of operations as
// the C code so that the outputs stay bit-identical to it; do not reassociate
// or reorder the floating-point expressions.
void av1_interp_cubic_rate_dist_sse2(const double *p1, const double *p2,
                                     double x, double *const rate_f,
                                     double *const distbysse_f) {
  // Low lane holds the p1 samples, high lane holds the p2 samples.
  const __m128d s0 = _mm_set_pd(p2[0], p1[0]);
  const __m128d s1 = _mm_set_pd(p2[1], p1[1]);
  const __m128d s2 = _mm_set_pd(p2[2], p1[2]);
  const __m128d s3 = _mm_set_pd(p2[3], p1[3]);
  const __m128d pos = _mm_set1_pd(x);

  // cubic = x * (3.0 * (p[1] - p[2]) + p[3] - p[0])
  const __m128d diff_x3 = _mm_mul_pd(_mm_set1_pd(3.0), _mm_sub_pd(s1, s2));
  const __m128d cubic_sum = _mm_sub_pd(_mm_add_pd(diff_x3, s3), s0);
  const __m128d cubic = _mm_mul_pd(pos, cubic_sum);

  // quad = 2.0 * p[0] - 5.0 * p[1] + 4.0 * p[2] - p[3]
  const __m128d s0_x2 = _mm_mul_pd(_mm_set1_pd(2.0), s0);
  const __m128d s1_x5 = _mm_mul_pd(_mm_set1_pd(5.0), s1);
  const __m128d s2_x4 = _mm_mul_pd(_mm_set1_pd(4.0), s2);
  const __m128d quad =
      _mm_sub_pd(_mm_add_pd(_mm_sub_pd(s0_x2, s1_x5), s2_x4), s3);

  // inner = x * (quad + cubic)
  const __m128d inner = _mm_mul_pd(pos, _mm_add_pd(quad, cubic));

  // slope = p[2] - p[0] + inner
  const __m128d slope = _mm_add_pd(_mm_sub_pd(s2, s0), inner);

  // result = p[1] + (0.5 * x) * slope; 0.5 * x is formed first, as in the
  // left-to-right evaluation of "0.5 * x * (...)".
  const __m128d half_pos = _mm_mul_pd(_mm_set1_pd(0.5), pos);
  const __m128d result = _mm_add_pd(s1, _mm_mul_pd(half_pos, slope));

  // Low lane -> rate_f, high lane -> distbysse_f.
  _mm_storel_pd(rate_f, result);
  _mm_storeh_pd(distbysse_f, result);
}