From 14a8a0f784475571c3532816ab77afea589d90b5 Mon Sep 17 00:00:00 2001
From: Peter Jung <admin@ptr1337.dev>
Date: Wed, 13 Sep 2023 14:31:11 +0200
Subject: [PATCH 1/7] amd-hdr

Signed-off-by: Peter Jung <admin@ptr1337.dev>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_mode.h      |  71 ++
 .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c |  34 +-
 .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h | 100 +++
 .../amd/display/amdgpu_dm/amdgpu_dm_color.c   | 805 ++++++++++++++++--
 .../amd/display/amdgpu_dm/amdgpu_dm_crtc.c    |  72 ++
 .../amd/display/amdgpu_dm/amdgpu_dm_plane.c   | 224 ++++-
 .../amd/display/dc/dcn10/dcn10_cm_common.c    |  95 ++-
 .../drm/amd/display/dc/dcn30/dcn30_hwseq.c    |  37 +
 .../drm/amd/display/dc/dcn30/dcn30_hwseq.h    |   3 +
 .../drm/amd/display/dc/dcn301/dcn301_init.c   |   2 +-
 .../gpu/drm/amd/display/include/fixed31_32.h  |  12 +
 drivers/gpu/drm/arm/malidp_crtc.c             |   2 +-
 drivers/gpu/drm/drm_atomic.c                  |   1 +
 drivers/gpu/drm/drm_atomic_state_helper.c     |   1 +
 drivers/gpu/drm/drm_atomic_uapi.c             |  43 +-
 drivers/gpu/drm/drm_property.c                |  49 ++
 include/drm/drm_mode_object.h                 |   2 +-
 include/drm/drm_plane.h                       |   7 +
 include/drm/drm_property.h                    |   6 +
 include/uapi/drm/drm_mode.h                   |   8 +
 20 files changed, 1446 insertions(+), 128 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mode.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mode.h
index 32fe05c810c6..84bf501b02f4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mode.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mode.h
@@ -343,6 +343,77 @@ struct amdgpu_mode_info {
 	int			disp_priority;
 	const struct amdgpu_display_funcs *funcs;
 	const enum drm_plane_type *plane_type;
+
+	/* Driver-private color mgmt props */
+
+	/* @plane_degamma_lut_property: Plane property to set a degamma LUT to
+	 * convert input space before blending.
+	 */
+	struct drm_property *plane_degamma_lut_property;
+	/* @plane_degamma_lut_size_property: Plane property to define the max
+	 * size of degamma LUT as supported by the driver (read-only).
+	 */
+	struct drm_property *plane_degamma_lut_size_property;
+	/**
+	 * @plane_degamma_tf_property: Plane pre-defined transfer function to
+	 * to go from scanout/encoded values to linear values.
+	 */
+	struct drm_property *plane_degamma_tf_property;
+	/**
+	 * @plane_hdr_mult_property:
+	 */
+	struct drm_property *plane_hdr_mult_property;
+
+	struct drm_property *plane_ctm_property;
+	/**
+	 * @shaper_lut_property: Plane property to set pre-blending shaper LUT
+	 * that converts color content before 3D LUT.
+	 */
+	struct drm_property *plane_shaper_lut_property;
+	/**
+	 * @shaper_lut_size_property: Plane property for the size of
+	 * pre-blending shaper LUT as supported by the driver (read-only).
+	 */
+	struct drm_property *plane_shaper_lut_size_property;
+	/**
+	 * @plane_shaper_tf_property: Plane property to set a predefined
+	 * transfer function for pre-blending shaper (before applying 3D LUT)
+	 * with or without LUT.
+	 */
+	struct drm_property *plane_shaper_tf_property;
+	/**
+	 * @plane_lut3d_property: Plane property for gamma correction using a
+	 * 3D LUT (pre-blending).
+	 */
+	struct drm_property *plane_lut3d_property;
+	/**
+	 * @plane_degamma_lut_size_property: Plane property to define the max
+	 * size of 3D LUT as supported by the driver (read-only).
+	 */
+	struct drm_property *plane_lut3d_size_property;
+	/**
+	 * @plane_blend_lut_property: Plane property for output gamma before
+	 * blending. Userspace set a blend LUT to convert colors after 3D LUT
+	 * conversion. It works as a post-3D LUT 1D LUT, with shaper LUT, they
+	 * are sandwiching 3D LUT with two 1D LUT.
+	 */
+	struct drm_property *plane_blend_lut_property;
+	/**
+	 * @plane_blend_lut_size_property: Plane property to define the max
+	 * size of blend LUT as supported by the driver (read-only).
+	 */
+	struct drm_property *plane_blend_lut_size_property;
+	/**
+	 * @plane_blend_tf_property: Plane property to set a predefined
+	 * transfer function for pre-blending blend (before applying 3D LUT)
+	 * with or without LUT.
+	 */
+	struct drm_property *plane_blend_tf_property;
+	/* @regamma_tf_property: Transfer function for CRTC regamma
+	 * (post-blending). Possible values are defined by `enum
+	 * amdgpu_transfer_function`.
+	 */
+	struct drm_property *regamma_tf_property;
 };
 
 #define AMDGPU_MAX_BL_LEVEL 0xFF
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index c9959bd8147d..4a2300db8c99 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -4015,6 +4015,11 @@ static int amdgpu_dm_mode_config_init(struct amdgpu_device *adev)
 		return r;
 	}
 
+#ifdef AMD_PRIVATE_COLOR
+	if (amdgpu_dm_create_color_properties(adev))
+		return -ENOMEM;
+#endif
+
 	r = amdgpu_dm_audio_init(adev);
 	if (r) {
 		dc_release_state(state->context);
@@ -5064,7 +5069,9 @@ static int fill_dc_plane_attributes(struct amdgpu_device *adev,
 	 * Always set input transfer function, since plane state is refreshed
 	 * every time.
 	 */
-	ret = amdgpu_dm_update_plane_color_mgmt(dm_crtc_state, dc_plane_state);
+	ret = amdgpu_dm_update_plane_color_mgmt(dm_crtc_state,
+						plane_state,
+						dc_plane_state);
 	if (ret)
 		return ret;
 
@@ -8079,6 +8086,10 @@ static void amdgpu_dm_commit_planes(struct drm_atomic_state *state,
 			bundle->surface_updates[planes_count].gamma = dc_plane->gamma_correction;
 			bundle->surface_updates[planes_count].in_transfer_func = dc_plane->in_transfer_func;
 			bundle->surface_updates[planes_count].gamut_remap_matrix = &dc_plane->gamut_remap_matrix;
+			bundle->surface_updates[planes_count].hdr_mult = dc_plane->hdr_mult;
+			bundle->surface_updates[planes_count].func_shaper = dc_plane->in_shaper_func;
+			bundle->surface_updates[planes_count].lut3d_func = dc_plane->lut3d_func;
+			bundle->surface_updates[planes_count].blend_tf = dc_plane->blend_tf;
 		}
 
 		amdgpu_dm_plane_fill_dc_scaling_info(dm->adev, new_plane_state,
@@ -8289,6 +8300,10 @@ static void amdgpu_dm_commit_planes(struct drm_atomic_state *state,
 				&acrtc_state->stream->csc_color_matrix;
 			bundle->stream_update.out_transfer_func =
 				acrtc_state->stream->out_transfer_func;
+			bundle->stream_update.lut3d_func =
+				(struct dc_3dlut *) acrtc_state->stream->lut3d_func;
+			bundle->stream_update.func_shaper =
+				(struct dc_transfer_func *) acrtc_state->stream->func_shaper;
 		}
 
 		acrtc_state->stream->abm_level = acrtc_state->abm_level;
@@ -9479,6 +9494,7 @@ static int dm_update_crtc_state(struct amdgpu_display_manager *dm,
 	 * when a modeset is needed, to ensure it gets reprogrammed.
 	 */
 	if (dm_new_crtc_state->base.color_mgmt_changed ||
+	    dm_old_crtc_state->regamma_tf != dm_new_crtc_state->regamma_tf ||
 	    drm_atomic_crtc_needs_modeset(new_crtc_state)) {
 		ret = amdgpu_dm_update_crtc_color_mgmt(dm_new_crtc_state);
 		if (ret)
@@ -9546,6 +9562,10 @@ static bool should_reset_plane(struct drm_atomic_state *state,
 	 */
 	for_each_oldnew_plane_in_state(state, other, old_other_state, new_other_state, i) {
 		struct amdgpu_framebuffer *old_afb, *new_afb;
+		struct dm_plane_state *dm_new_other_state, *dm_old_other_state;
+
+		dm_new_other_state = to_dm_plane_state(new_other_state);
+		dm_old_other_state = to_dm_plane_state(old_other_state);
 
 		if (other->type == DRM_PLANE_TYPE_CURSOR)
 			continue;
@@ -9582,6 +9602,18 @@ static bool should_reset_plane(struct drm_atomic_state *state,
 		    old_other_state->color_encoding != new_other_state->color_encoding)
 			return true;
 
+		/* HDR/Transfer Function changes. */
+		if (dm_old_other_state->degamma_tf != dm_new_other_state->degamma_tf ||
+		    dm_old_other_state->degamma_lut != dm_new_other_state->degamma_lut ||
+		    dm_old_other_state->hdr_mult != dm_new_other_state->hdr_mult ||
+		    dm_old_other_state->ctm != dm_new_other_state->ctm ||
+		    dm_old_other_state->shaper_lut != dm_new_other_state->shaper_lut ||
+		    dm_old_other_state->shaper_tf != dm_new_other_state->shaper_tf ||
+		    dm_old_other_state->lut3d != dm_new_other_state->lut3d ||
+		    dm_old_other_state->blend_lut != dm_new_other_state->blend_lut ||
+		    dm_old_other_state->blend_tf != dm_new_other_state->blend_tf)
+			return true;
+
 		/* Framebuffer checks fall at the end. */
 		if (!old_other_state->fb || !new_other_state->fb)
 			continue;
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
index 9fb5bb3a75a7..f92bbd7ed867 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
@@ -51,6 +51,8 @@
 
 #define AMDGPU_DMUB_NOTIFICATION_MAX 5
 
+#define AMDGPU_HDR_MULT_DEFAULT (0x100000000LL)
+
 /*
 #include "include/amdgpu_dal_power_if.h"
 #include "amdgpu_dm_irq.h"
@@ -702,9 +704,91 @@ static inline void amdgpu_dm_set_mst_status(uint8_t *status,
 
 extern const struct amdgpu_ip_block_version dm_ip_block;
 
+enum amdgpu_transfer_function {
+	AMDGPU_TRANSFER_FUNCTION_DEFAULT,
+	AMDGPU_TRANSFER_FUNCTION_SRGB_EOTF,
+	AMDGPU_TRANSFER_FUNCTION_BT709_EOTF,
+	AMDGPU_TRANSFER_FUNCTION_PQ_EOTF,
+	AMDGPU_TRANSFER_FUNCTION_LINEAR,
+	AMDGPU_TRANSFER_FUNCTION_UNITY,
+	AMDGPU_TRANSFER_FUNCTION_GAMMA22_EOTF,
+	AMDGPU_TRANSFER_FUNCTION_GAMMA24_EOTF,
+	AMDGPU_TRANSFER_FUNCTION_GAMMA26_EOTF,
+	AMDGPU_TRANSFER_FUNCTION_SRGB_INV_EOTF,
+	AMDGPU_TRANSFER_FUNCTION_BT709_INV_EOTF,
+	AMDGPU_TRANSFER_FUNCTION_PQ_INV_EOTF,
+	AMDGPU_TRANSFER_FUNCTION_GAMMA22_INV_EOTF,
+	AMDGPU_TRANSFER_FUNCTION_GAMMA24_INV_EOTF,
+	AMDGPU_TRANSFER_FUNCTION_GAMMA26_INV_EOTF,
+        AMDGPU_TRANSFER_FUNCTION_COUNT
+};
+
 struct dm_plane_state {
 	struct drm_plane_state base;
 	struct dc_plane_state *dc_state;
+
+	/* Plane color mgmt */
+	/**
+	 * @degamma_lut:
+	 *
+	 * 1D LUT for mapping framebuffer/plane pixel data before sampling or
+	 * blending operations. It's usually applied to linearize input space.
+	 * The blob (if not NULL) is an array of &struct drm_color_lut.
+	 */
+	struct drm_property_blob *degamma_lut;
+	/**
+	 * @degamma_tf:
+	 *
+	 * Predefined transfer function to tell DC driver the input space to
+	 * linearize.
+	 */
+	enum amdgpu_transfer_function degamma_tf;
+	/**
+	 * @hdr_mult:
+	 *
+	 * Multiplier to 'gain' the plane.  When PQ is decoded using the fixed
+	 * func transfer function to the internal FP16 fb, 1.0 -> 80 nits (on
+	 * AMD at least). When sRGB is decoded, 1.0 -> 1.0, obviously.
+	 * Therefore, 1.0 multiplier = 80 nits for SDR content.  So if you
+	 * want, 203 nits for SDR content, pass in (203.0 / 80.0).  Format is
+	 * S31.32 sign-magnitude.
+	 */
+	__u64 hdr_mult;
+	/**
+	 * @ctm:
+	 *
+	 * Color transformation matrix. See drm_crtc_enable_color_mgmt(). The
+	 * blob (if not NULL) is a &struct drm_color_ctm.
+	 */
+	struct drm_property_blob *ctm;
+	/**
+	 * @shaper_lut: shaper lookup table blob. The blob (if not NULL) is an
+	 * array of &struct drm_color_lut.
+	 */
+	struct drm_property_blob *shaper_lut;
+	/**
+	 * @shaper_tf:
+	 *
+	 * Predefined transfer function to delinearize color space.
+	 */
+	enum amdgpu_transfer_function shaper_tf;
+	/**
+	 * @lut3d: 3D lookup table blob. The blob (if not NULL) is an array of
+	 * &struct drm_color_lut.
+	 */
+	struct drm_property_blob *lut3d;
+	/**
+	 * @blend_lut: blend lut lookup table blob. The blob (if not NULL) is an
+	 * array of &struct drm_color_lut.
+	 */
+	struct drm_property_blob *blend_lut;
+	/**
+	 * @blend_tf:
+	 *
+	 * Pre-defined transfer function for converting plane pixel data before
+	 * applying blend LUT.
+	 */
+	enum amdgpu_transfer_function blend_tf;
 };
 
 struct dm_crtc_state {
@@ -729,6 +813,14 @@ struct dm_crtc_state {
 	struct dc_info_packet vrr_infopacket;
 
 	int abm_level;
+
+        /**
+	 * @regamma_tf:
+	 *
+	 * Pre-defined transfer function for converting internal FB -> wire
+	 * encoding.
+	 */
+	enum amdgpu_transfer_function regamma_tf;
 };
 
 #define to_dm_crtc_state(x) container_of(x, struct dm_crtc_state, base)
@@ -790,14 +882,22 @@ void amdgpu_dm_update_freesync_caps(struct drm_connector *connector,
 
 void amdgpu_dm_trigger_timing_sync(struct drm_device *dev);
 
+/* 3D LUT max size is 17x17x17 */
+#define MAX_COLOR_3DLUT_ENTRIES 4913
+#define MAX_COLOR_3DLUT_BITDEPTH 12
+int amdgpu_dm_verify_lut3d_size(struct amdgpu_device *adev,
+				struct drm_plane_state *plane_state);
+/* 1D LUT size */
 #define MAX_COLOR_LUT_ENTRIES 4096
 /* Legacy gamm LUT users such as X doesn't like large LUT sizes */
 #define MAX_COLOR_LEGACY_LUT_ENTRIES 256
 
 void amdgpu_dm_init_color_mod(void);
+int amdgpu_dm_create_color_properties(struct amdgpu_device *adev);
 int amdgpu_dm_verify_lut_sizes(const struct drm_crtc_state *crtc_state);
 int amdgpu_dm_update_crtc_color_mgmt(struct dm_crtc_state *crtc);
 int amdgpu_dm_update_plane_color_mgmt(struct dm_crtc_state *crtc,
+				      struct drm_plane_state *plane_state,
 				      struct dc_plane_state *dc_plane_state);
 
 void amdgpu_dm_update_connector_after_detect(
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_color.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_color.c
index a4cb23d059bd..0a51af44efd5 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_color.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_color.c
@@ -72,6 +72,7 @@
  */
 
 #define MAX_DRM_LUT_VALUE 0xFFFF
+#define SDR_WHITE_LEVEL_INIT_VALUE 80
 
 /**
  * amdgpu_dm_init_color_mod - Initialize the color module.
@@ -84,6 +85,213 @@ void amdgpu_dm_init_color_mod(void)
 	setup_x_points_distribution();
 }
 
+#ifdef AMD_PRIVATE_COLOR
+/* Pre-defined Transfer Functions (TF)
+ *
+ * AMD driver supports pre-defined mathematical functions for transferring
+ * between encoded values and optical/linear space. Depending on HW color caps,
+ * ROMs and curves built by the AMD color module support these transforms.
+ *
+ * The driver-specific color implementation exposes properties for pre-blending
+ * degamma TF, shaper TF (before 3D LUT), and blend(dpp.ogam) TF and
+ * post-blending regamma (mpc.ogam) TF. However, only pre-blending degamma
+ * supports ROM curves. AMD color module uses pre-defined coefficients to build
+ * curves for the other blocks. What can be done by each color block is
+ * described by struct dpp_color_capsand struct mpc_color_caps.
+ *
+ * AMD driver-specific color API exposes the following pre-defined transfer
+ * functions:
+ *
+ * - Linear/Unity: linear/identity relationship between pixel value and
+ *   luminance value;
+ * - Gamma 2.2, Gamma 2.4, Gamma 2.6: pure gamma functions;
+ * - sRGB: 2.4 gamma with small initial linear section as standardized by IEC
+ *   61966-2-1:1999;
+ * - BT.709 (BT.1886): 2.4 gamma with differences in the dark end of the scale.
+ *   Used in HD-TV and standardized by ITU-R BT.1886;
+ * - PQ (Perceptual Quantizer): used for HDR display, allows luminance range
+ *   capability of 0 to 10,000 nits; standardized by SMPTE ST 2084.
+ *
+ * In the driver-specific API, color block names attached to TF properties
+ * suggest the intention regarding non-linear encoding pixel's luminance
+ * values. As some newer encodings don't use gamma curve, we make encoding and
+ * decoding explicit by defining an enum list of transfer functions supported
+ * in terms of EOTF and inverse EOTF, where:
+ *
+ * - EOTF (electro-optical transfer function): is the transfer function to go
+ *   from the encoded value to an optical (linear) value. De-gamma functions
+ *   traditionally do this.
+ * - Inverse EOTF (simply the inverse of the EOTF): is usually intended to go
+ *   from an optical/linear space (which might have been used for blending)
+ *   back to the encoded values. Gamma functions traditionally do this. 
+ */
+static const char * const
+amdgpu_transfer_function_names[] = {
+	[AMDGPU_TRANSFER_FUNCTION_DEFAULT]		= "Default",
+	[AMDGPU_TRANSFER_FUNCTION_LINEAR]		= "Linear",
+	[AMDGPU_TRANSFER_FUNCTION_UNITY]		= "Unity",
+	[AMDGPU_TRANSFER_FUNCTION_SRGB_EOTF]		= "sRGB EOTF",
+	[AMDGPU_TRANSFER_FUNCTION_BT709_EOTF]		= "BT.709 EOTF",
+	[AMDGPU_TRANSFER_FUNCTION_PQ_EOTF]		= "PQ EOTF",
+	[AMDGPU_TRANSFER_FUNCTION_GAMMA22_EOTF]		= "Gamma 2.2 EOTF",
+	[AMDGPU_TRANSFER_FUNCTION_GAMMA24_EOTF]		= "Gamma 2.4 EOTF",
+	[AMDGPU_TRANSFER_FUNCTION_GAMMA26_EOTF]		= "Gamma 2.6 EOTF",
+	[AMDGPU_TRANSFER_FUNCTION_SRGB_INV_EOTF]	= "sRGB inv_EOTF",
+	[AMDGPU_TRANSFER_FUNCTION_BT709_INV_EOTF]	= "BT.709 inv_EOTF",
+	[AMDGPU_TRANSFER_FUNCTION_PQ_INV_EOTF]		= "PQ inv_EOTF",
+	[AMDGPU_TRANSFER_FUNCTION_GAMMA22_INV_EOTF]	= "Gamma 2.2 inv_EOTF",
+	[AMDGPU_TRANSFER_FUNCTION_GAMMA24_INV_EOTF]	= "Gamma 2.4 inv_EOTF",
+	[AMDGPU_TRANSFER_FUNCTION_GAMMA26_INV_EOTF]	= "Gamma 2.6 inv_EOTF",
+};
+
+static const u32 amdgpu_eotf =
+	BIT(AMDGPU_TRANSFER_FUNCTION_SRGB_EOTF) |
+	BIT(AMDGPU_TRANSFER_FUNCTION_BT709_EOTF) |
+	BIT(AMDGPU_TRANSFER_FUNCTION_PQ_EOTF) |
+	BIT(AMDGPU_TRANSFER_FUNCTION_GAMMA22_EOTF) |
+	BIT(AMDGPU_TRANSFER_FUNCTION_GAMMA24_EOTF) |
+	BIT(AMDGPU_TRANSFER_FUNCTION_GAMMA26_EOTF);
+
+static const u32 amdgpu_inv_eotf =
+	BIT(AMDGPU_TRANSFER_FUNCTION_SRGB_INV_EOTF) |
+	BIT(AMDGPU_TRANSFER_FUNCTION_BT709_INV_EOTF) |
+	BIT(AMDGPU_TRANSFER_FUNCTION_PQ_INV_EOTF) |
+	BIT(AMDGPU_TRANSFER_FUNCTION_GAMMA22_INV_EOTF) |
+	BIT(AMDGPU_TRANSFER_FUNCTION_GAMMA24_INV_EOTF) |
+	BIT(AMDGPU_TRANSFER_FUNCTION_GAMMA26_INV_EOTF);
+
+static struct drm_property *
+amdgpu_create_tf_property(struct drm_device *dev,
+			  const char *name,
+			  u32 supported_tf)
+{
+	u32 transfer_functions = supported_tf |
+				 BIT(AMDGPU_TRANSFER_FUNCTION_DEFAULT) |
+				 BIT(AMDGPU_TRANSFER_FUNCTION_LINEAR) |
+				 BIT(AMDGPU_TRANSFER_FUNCTION_UNITY);
+	struct drm_prop_enum_list enum_list[AMDGPU_TRANSFER_FUNCTION_COUNT];
+	int i, len;
+
+	len = 0;
+	for (i = 0; i < AMDGPU_TRANSFER_FUNCTION_COUNT; i++) {
+		if ((transfer_functions & BIT(i)) == 0)
+			continue;
+
+		enum_list[len].type = i;
+		enum_list[len].name = amdgpu_transfer_function_names[i];
+		len++;
+	}
+
+	return drm_property_create_enum(dev, DRM_MODE_PROP_ENUM,
+					name, enum_list, len);
+}
+
+int
+amdgpu_dm_create_color_properties(struct amdgpu_device *adev)
+{
+	struct drm_property *prop;
+
+	prop = drm_property_create(adev_to_drm(adev),
+				   DRM_MODE_PROP_BLOB,
+				   "AMD_PLANE_DEGAMMA_LUT", 0);
+	if (!prop)
+		return -ENOMEM;
+	adev->mode_info.plane_degamma_lut_property = prop;
+
+	prop = drm_property_create_range(adev_to_drm(adev),
+					 DRM_MODE_PROP_IMMUTABLE,
+					 "AMD_PLANE_DEGAMMA_LUT_SIZE", 0, UINT_MAX);
+	if (!prop)
+		return -ENOMEM;
+	adev->mode_info.plane_degamma_lut_size_property = prop;
+
+	prop = amdgpu_create_tf_property(adev_to_drm(adev),
+					 "AMD_PLANE_DEGAMMA_TF",
+					 amdgpu_eotf);
+	if (!prop)
+		return -ENOMEM;
+	adev->mode_info.plane_degamma_tf_property = prop;
+
+	prop = drm_property_create_range(adev_to_drm(adev),
+					 0, "AMD_PLANE_HDR_MULT", 0, U64_MAX);
+	if (!prop)
+		return -ENOMEM;
+	adev->mode_info.plane_hdr_mult_property = prop;
+
+	prop = drm_property_create(adev_to_drm(adev),
+				   DRM_MODE_PROP_BLOB,
+				   "AMD_PLANE_CTM", 0);
+	if (!prop)
+		return -ENOMEM;
+	adev->mode_info.plane_ctm_property = prop;
+
+	prop = drm_property_create(adev_to_drm(adev),
+				   DRM_MODE_PROP_BLOB,
+				   "AMD_PLANE_SHAPER_LUT", 0);
+	if (!prop)
+		return -ENOMEM;
+	adev->mode_info.plane_shaper_lut_property = prop;
+
+	prop = drm_property_create_range(adev_to_drm(adev),
+					 DRM_MODE_PROP_IMMUTABLE,
+					 "AMD_PLANE_SHAPER_LUT_SIZE", 0, UINT_MAX);
+	if (!prop)
+		return -ENOMEM;
+	adev->mode_info.plane_shaper_lut_size_property = prop;
+
+	prop = amdgpu_create_tf_property(adev_to_drm(adev),
+					 "AMD_PLANE_SHAPER_TF",
+					 amdgpu_inv_eotf);
+	if (!prop)
+		return -ENOMEM;
+	adev->mode_info.plane_shaper_tf_property = prop;
+
+	prop = drm_property_create(adev_to_drm(adev),
+				   DRM_MODE_PROP_BLOB,
+				   "AMD_PLANE_LUT3D", 0);
+	if (!prop)
+		return -ENOMEM;
+	adev->mode_info.plane_lut3d_property = prop;
+
+	prop = drm_property_create_range(adev_to_drm(adev),
+					 DRM_MODE_PROP_IMMUTABLE,
+					 "AMD_PLANE_LUT3D_SIZE", 0, UINT_MAX);
+	if (!prop)
+		return -ENOMEM;
+	adev->mode_info.plane_lut3d_size_property = prop;
+
+	prop = drm_property_create(adev_to_drm(adev),
+				   DRM_MODE_PROP_BLOB,
+				   "AMD_PLANE_BLEND_LUT", 0);
+	if (!prop)
+		return -ENOMEM;
+	adev->mode_info.plane_blend_lut_property = prop;
+
+	prop = drm_property_create_range(adev_to_drm(adev),
+					 DRM_MODE_PROP_IMMUTABLE,
+					 "AMD_PLANE_BLEND_LUT_SIZE", 0, UINT_MAX);
+	if (!prop)
+		return -ENOMEM;
+	adev->mode_info.plane_blend_lut_size_property = prop;
+
+	prop = amdgpu_create_tf_property(adev_to_drm(adev),
+					 "AMD_PLANE_BLEND_TF",
+					 amdgpu_eotf);
+	if (!prop)
+		return -ENOMEM;
+	adev->mode_info.plane_blend_tf_property = prop;
+
+	prop = amdgpu_create_tf_property(adev_to_drm(adev),
+					 "AMD_CRTC_REGAMMA_TF",
+					 amdgpu_inv_eotf);
+	if (!prop)
+		return -ENOMEM;
+	adev->mode_info.regamma_tf_property = prop;
+
+	return 0;
+}
+#endif
+
 /**
  * __extract_blob_lut - Extracts the DRM lut and lut size from a blob.
  * @blob: DRM color mgmt property blob
@@ -182,7 +390,6 @@ static void __drm_lut_to_dc_gamma(const struct drm_color_lut *lut,
 static void __drm_ctm_to_dc_matrix(const struct drm_color_ctm *ctm,
 				   struct fixed31_32 *matrix)
 {
-	int64_t val;
 	int i;
 
 	/*
@@ -201,12 +408,33 @@ static void __drm_ctm_to_dc_matrix(const struct drm_color_ctm *ctm,
 		}
 
 		/* gamut_remap_matrix[i] = ctm[i - floor(i/4)] */
-		val = ctm->matrix[i - (i / 4)];
-		/* If negative, convert to 2's complement. */
-		if (val & (1ULL << 63))
-			val = -(val & ~(1ULL << 63));
+		matrix[i] = dc_fixpt_from_s3132(ctm->matrix[i - (i / 4)]);
+	}
+}
 
-		matrix[i].value = val;
+/**
+ * __drm_ctm2_to_dc_matrix - converts a DRM CTM2 to a DC CSC float matrix
+ * @ctm: DRM color transformation matrix
+ * @matrix: DC CSC float matrix
+ *
+ * The matrix needs to be a 3x4 (12 entry) matrix.
+ */
+static void __drm_ctm2_to_dc_matrix(const struct drm_color_ctm2 *ctm,
+				   struct fixed31_32 *matrix)
+{
+	int i;
+
+	/*
+	 * DRM gives a 3x3 matrix, but DC wants 3x4. Assuming we're operating
+	 * with homogeneous coordinates, augment the matrix with 0's.
+	 *
+	 * The format provided is S31.32, using signed-magnitude representation.
+	 * Our fixed31_32 is also S31.32, but is using 2's complement. We have
+	 * to convert from signed-magnitude to 2's complement.
+	 */
+	for (i = 0; i < 12; i++) {
+		/* gamut_remap_matrix[i] = ctm[i - floor(i/4)] */
+		matrix[i] = dc_fixpt_from_s3132(ctm->matrix[i]);
 	}
 }
 
@@ -268,16 +496,18 @@ static int __set_output_tf(struct dc_transfer_func *func,
 	struct calculate_buffer cal_buffer = {0};
 	bool res;
 
-	ASSERT(lut && lut_size == MAX_COLOR_LUT_ENTRIES);
-
 	cal_buffer.buffer_index = -1;
 
-	gamma = dc_create_gamma();
-	if (!gamma)
-		return -ENOMEM;
+	if (lut_size) {
+		ASSERT(lut && lut_size == MAX_COLOR_LUT_ENTRIES);
 
-	gamma->num_entries = lut_size;
-	__drm_lut_to_dc_gamma(lut, gamma, false);
+		gamma = dc_create_gamma();
+		if (!gamma)
+			return -ENOMEM;
+
+		gamma->num_entries = lut_size;
+		__drm_lut_to_dc_gamma(lut, gamma, false);
+	}
 
 	if (func->tf == TRANSFER_FUNCTION_LINEAR) {
 		/*
@@ -285,27 +515,63 @@ static int __set_output_tf(struct dc_transfer_func *func,
 		 * on top of a linear input. But degamma params can be used
 		 * instead to simulate this.
 		 */
-		gamma->type = GAMMA_CUSTOM;
+		if (gamma)
+			gamma->type = GAMMA_CUSTOM;
 		res = mod_color_calculate_degamma_params(NULL, func,
-							gamma, true);
+							 gamma, gamma != NULL);
 	} else {
 		/*
 		 * Assume sRGB. The actual mapping will depend on whether the
 		 * input was legacy or not.
 		 */
-		gamma->type = GAMMA_CS_TFM_1D;
-		res = mod_color_calculate_regamma_params(func, gamma, false,
+		if (gamma)
+			gamma->type = GAMMA_CS_TFM_1D;
+		res = mod_color_calculate_regamma_params(func, gamma, gamma != NULL,
 							 has_rom, NULL, &cal_buffer);
 	}
 
-	dc_gamma_release(&gamma);
+	if (gamma)
+		dc_gamma_release(&gamma);
 
 	return res ? 0 : -ENOMEM;
 }
 
+static int amdgpu_dm_set_atomic_regamma(struct dc_stream_state *stream,
+					const struct drm_color_lut *regamma_lut,
+					uint32_t regamma_size, bool has_rom,
+					enum dc_transfer_func_predefined tf)
+{
+	struct dc_transfer_func *out_tf = stream->out_transfer_func;
+	int ret = 0;
+
+	if (regamma_size || tf != TRANSFER_FUNCTION_LINEAR) {
+		/* CRTC RGM goes into RGM LUT.
+		 *
+		 * Note: there is no implicit sRGB regamma here. We are using
+		 * degamma calculation from color module to calculate the curve
+		 * from a linear base.
+		 */
+		out_tf->type = TF_TYPE_DISTRIBUTED_POINTS;
+		out_tf->tf = tf;
+		out_tf->sdr_ref_white_level = SDR_WHITE_LEVEL_INIT_VALUE;
+
+		ret = __set_output_tf(out_tf, regamma_lut, regamma_size, has_rom);
+	} else {
+		/*
+		 * No CRTC RGM means we can just put the block into bypass
+		 * since we don't have any plane level adjustments using it.
+		 */
+		out_tf->type = TF_TYPE_BYPASS;
+		out_tf->tf = TRANSFER_FUNCTION_LINEAR;
+	}
+
+	return ret;
+}
+
 /**
  * __set_input_tf - calculates the input transfer function based on expected
  * input space.
+ * @caps: dc color capabilities
  * @func: transfer function
  * @lut: lookup table that defines the color space
  * @lut_size: size of respective lut.
@@ -313,27 +579,249 @@ static int __set_output_tf(struct dc_transfer_func *func,
  * Returns:
  * 0 in case of success. -ENOMEM if fails.
  */
-static int __set_input_tf(struct dc_transfer_func *func,
+static int __set_input_tf(struct dc_color_caps *caps, struct dc_transfer_func *func,
 			  const struct drm_color_lut *lut, uint32_t lut_size)
 {
 	struct dc_gamma *gamma = NULL;
 	bool res;
 
-	gamma = dc_create_gamma();
-	if (!gamma)
-		return -ENOMEM;
+	if (lut_size) {
+		gamma = dc_create_gamma();
+		if (!gamma)
+			return -ENOMEM;
 
-	gamma->type = GAMMA_CUSTOM;
-	gamma->num_entries = lut_size;
+		gamma->type = GAMMA_CUSTOM;
+		gamma->num_entries = lut_size;
 
-	__drm_lut_to_dc_gamma(lut, gamma, false);
+		__drm_lut_to_dc_gamma(lut, gamma, false);
+	}
 
-	res = mod_color_calculate_degamma_params(NULL, func, gamma, true);
-	dc_gamma_release(&gamma);
+	res = mod_color_calculate_degamma_params(caps, func, gamma, gamma != NULL);
+
+	if (gamma)
+		dc_gamma_release(&gamma);
 
 	return res ? 0 : -ENOMEM;
 }
 
+static enum dc_transfer_func_predefined
+amdgpu_tf_to_dc_tf(enum amdgpu_transfer_function tf)
+{
+	switch (tf)
+	{
+	default:
+	case AMDGPU_TRANSFER_FUNCTION_DEFAULT:
+	case AMDGPU_TRANSFER_FUNCTION_LINEAR:
+		return TRANSFER_FUNCTION_LINEAR;
+	case AMDGPU_TRANSFER_FUNCTION_SRGB_EOTF:
+	case AMDGPU_TRANSFER_FUNCTION_SRGB_INV_EOTF:
+		return TRANSFER_FUNCTION_SRGB;
+	case AMDGPU_TRANSFER_FUNCTION_BT709_EOTF:
+	case AMDGPU_TRANSFER_FUNCTION_BT709_INV_EOTF:
+		return TRANSFER_FUNCTION_BT709;
+	case AMDGPU_TRANSFER_FUNCTION_PQ_EOTF:
+	case AMDGPU_TRANSFER_FUNCTION_PQ_INV_EOTF:
+		return TRANSFER_FUNCTION_PQ;
+	case AMDGPU_TRANSFER_FUNCTION_UNITY:
+		return TRANSFER_FUNCTION_UNITY;
+	case AMDGPU_TRANSFER_FUNCTION_GAMMA22_EOTF:
+	case AMDGPU_TRANSFER_FUNCTION_GAMMA22_INV_EOTF:
+		return TRANSFER_FUNCTION_GAMMA22;
+	case AMDGPU_TRANSFER_FUNCTION_GAMMA24_EOTF:
+	case AMDGPU_TRANSFER_FUNCTION_GAMMA24_INV_EOTF:
+		return TRANSFER_FUNCTION_GAMMA24;
+	case AMDGPU_TRANSFER_FUNCTION_GAMMA26_EOTF:
+	case AMDGPU_TRANSFER_FUNCTION_GAMMA26_INV_EOTF:
+		return TRANSFER_FUNCTION_GAMMA26;
+	}
+}
+
+static void __to_dc_lut3d_color(struct dc_rgb *rgb,
+				const struct drm_color_lut lut,
+				int bit_precision)
+{
+	rgb->red = drm_color_lut_extract(lut.red, bit_precision);
+	rgb->green = drm_color_lut_extract(lut.green, bit_precision);
+	rgb->blue  = drm_color_lut_extract(lut.blue, bit_precision);
+}
+
+static void __drm_3dlut_to_dc_3dlut(const struct drm_color_lut *lut,
+				    uint32_t lut3d_size,
+				    struct tetrahedral_params *params,
+				    bool use_tetrahedral_9,
+				    int bit_depth)
+{
+	struct dc_rgb *lut0;
+	struct dc_rgb *lut1;
+	struct dc_rgb *lut2;
+	struct dc_rgb *lut3;
+	int lut_i, i;
+
+
+	if (use_tetrahedral_9) {
+		lut0 = params->tetrahedral_9.lut0;
+		lut1 = params->tetrahedral_9.lut1;
+		lut2 = params->tetrahedral_9.lut2;
+		lut3 = params->tetrahedral_9.lut3;
+	} else {
+		lut0 = params->tetrahedral_17.lut0;
+		lut1 = params->tetrahedral_17.lut1;
+		lut2 = params->tetrahedral_17.lut2;
+		lut3 = params->tetrahedral_17.lut3;
+	}
+
+	for (lut_i = 0, i = 0; i < lut3d_size - 4; lut_i++, i += 4) {
+		/* We should consider the 3dlut RGB values are distributed
+		 * along four arrays lut0-3 where the first sizes 1229 and the
+		 * other 1228. The bit depth supported for 3dlut channel is
+		 * 12-bit, but DC also supports 10-bit.
+		 *
+		 * TODO: improve color pipeline API to enable the userspace set
+		 * bit depth and 3D LUT size/stride, as specified by VA-API.
+		 */
+		__to_dc_lut3d_color(&lut0[lut_i], lut[i], bit_depth);
+		__to_dc_lut3d_color(&lut1[lut_i], lut[i + 1], bit_depth);
+		__to_dc_lut3d_color(&lut2[lut_i], lut[i + 2], bit_depth);
+		__to_dc_lut3d_color(&lut3[lut_i], lut[i + 3], bit_depth);
+	}
+	/* lut0 has 1229 points (lut_size/4 + 1) */
+	__to_dc_lut3d_color(&lut0[lut_i], lut[i], bit_depth);
+}
+
+/* amdgpu_dm_atomic_lut3d - set DRM 3D LUT to DC stream
+ * @drm_lut3d: DRM CRTC (user) 3D LUT
+ * @drm_lut3d_size: size of 3D LUT
+ * @lut3d: DC 3D LUT
+ *
+ * Map DRM CRTC 3D LUT to DC 3D LUT and all necessary bits to program it
+ * on DCN MPC accordingly.
+ */
+static void amdgpu_dm_atomic_lut3d(const struct drm_color_lut *drm_lut,
+				   uint32_t drm_lut3d_size,
+				   struct dc_3dlut *lut)
+{
+	if (!drm_lut3d_size) {
+		lut->state.bits.initialized = 0;
+	} else {
+		/* Stride and bit depth are not programmable by API yet.
+		 * Therefore, only supports 17x17x17 3D LUT (12-bit).
+		 */
+		lut->lut_3d.use_tetrahedral_9 = false;
+		lut->lut_3d.use_12bits = true;
+		lut->state.bits.initialized = 1;
+		__drm_3dlut_to_dc_3dlut(drm_lut, drm_lut3d_size, &lut->lut_3d,
+					lut->lut_3d.use_tetrahedral_9,
+					MAX_COLOR_3DLUT_BITDEPTH);
+	}
+}
+
+static int amdgpu_dm_atomic_shaper_lut(const struct drm_color_lut *shaper_lut,
+				       bool has_rom,
+				       enum dc_transfer_func_predefined tf,
+				       uint32_t shaper_size,
+				       struct dc_transfer_func *func_shaper)
+{
+	int ret = 0;
+
+	if (shaper_size || tf != TRANSFER_FUNCTION_LINEAR) {
+		/* If DRM shaper LUT is set, we assume a linear color space
+		 * (linearized by DRM degamma 1D LUT or not)
+		 */
+		func_shaper->type = TF_TYPE_DISTRIBUTED_POINTS;
+		func_shaper->tf = tf;
+		func_shaper->sdr_ref_white_level = SDR_WHITE_LEVEL_INIT_VALUE;
+
+		ret = __set_output_tf(func_shaper, shaper_lut, shaper_size, has_rom);
+	} else {
+		func_shaper->type = TF_TYPE_BYPASS;
+		func_shaper->tf = TRANSFER_FUNCTION_LINEAR;
+	}
+
+	return ret;
+}
+
+static int amdgpu_dm_atomic_blend_lut(const struct drm_color_lut *blend_lut,
+				       bool has_rom,
+				       enum dc_transfer_func_predefined tf,
+				       uint32_t blend_size,
+				       struct dc_transfer_func *func_blend)
+{
+	int ret = 0;
+
+	if (blend_size || tf != TRANSFER_FUNCTION_LINEAR) {
+		/* DRM plane gamma LUT or TF means we are linearizing color
+		 * space before blending (similar to degamma programming). As
+		 * we don't have hardcoded curve support, or we use AMD color
+		 * module to fill the parameters that will be translated to HW
+		 * points.
+		 */
+		func_blend->type = TF_TYPE_DISTRIBUTED_POINTS;
+		func_blend->tf = tf;
+		func_blend->sdr_ref_white_level = SDR_WHITE_LEVEL_INIT_VALUE;
+
+		ret = __set_input_tf(NULL, func_blend, blend_lut, blend_size);
+	} else {
+		func_blend->type = TF_TYPE_BYPASS;
+		func_blend->tf = TRANSFER_FUNCTION_LINEAR;
+	}
+
+	return ret;
+}
+
+/* amdgpu_dm_lut3d_size - get expected size according to hw color caps
+ * @adev: amdgpu device
+ * @lut_size: default size
+ *
+ * Return:
+ * lut_size if DC 3D LUT is supported, zero otherwise.
+ */
+static uint32_t amdgpu_dm_get_lut3d_size(struct amdgpu_device *adev,
+					 uint32_t lut_size)
+{
+	return adev->dm.dc->caps.color.dpp.hw_3d_lut ? lut_size : 0;
+}
+
+/**
+ * amdgpu_dm_verify_lut3d_size - verifies if 3D LUT is supported and if DRM 3D
+ * LUT matches the hw supported size
+ * @adev: amdgpu device
+ * @crtc_state: the DRM CRTC state
+ *
+ * Verifies if post-blending (MPC) 3D LUT is supported by the HW (DCN 3.0 or
+ * newer) and if the DRM 3D LUT matches the supported size.
+ *
+ * Returns:
+ * 0 on success. -EINVAL if lut size are invalid.
+ */
+int amdgpu_dm_verify_lut3d_size(struct amdgpu_device *adev,
+				struct drm_plane_state *plane_state)
+{
+	struct dm_plane_state *dm_plane_state = to_dm_plane_state(plane_state);
+	const struct drm_color_lut *shaper = NULL, *lut3d = NULL;
+	uint32_t exp_size, size;
+
+	/* shaper LUT is only available if 3D LUT color caps*/
+	exp_size = amdgpu_dm_get_lut3d_size(adev, MAX_COLOR_LUT_ENTRIES);
+	shaper = __extract_blob_lut(dm_plane_state->shaper_lut, &size);
+
+	if (shaper && size != exp_size) {
+		drm_dbg(&adev->ddev,
+			"Invalid Shaper LUT size. Should be %u but got %u.\n",
+			exp_size, size);
+	}
+
+	exp_size = amdgpu_dm_get_lut3d_size(adev, MAX_COLOR_3DLUT_ENTRIES);
+	lut3d = __extract_blob_lut(dm_plane_state->lut3d, &size);
+
+	if (lut3d && size != exp_size) {
+		drm_dbg(&adev->ddev, "Invalid 3D LUT size. Should be %u but got %u.\n",
+			exp_size, size);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 /**
  * amdgpu_dm_verify_lut_sizes - verifies if DRM luts match the hw supported sizes
  * @crtc_state: the DRM CRTC state
@@ -401,9 +889,12 @@ int amdgpu_dm_update_crtc_color_mgmt(struct dm_crtc_state *crtc)
 	const struct drm_color_lut *degamma_lut, *regamma_lut;
 	uint32_t degamma_size, regamma_size;
 	bool has_regamma, has_degamma;
+	enum dc_transfer_func_predefined tf = TRANSFER_FUNCTION_LINEAR;
 	bool is_legacy;
 	int r;
 
+	tf = amdgpu_tf_to_dc_tf(crtc->regamma_tf);
+
 	r = amdgpu_dm_verify_lut_sizes(&crtc->base);
 	if (r)
 		return r;
@@ -440,26 +931,22 @@ int amdgpu_dm_update_crtc_color_mgmt(struct dm_crtc_state *crtc)
 		stream->out_transfer_func->type = TF_TYPE_DISTRIBUTED_POINTS;
 		stream->out_transfer_func->tf = TRANSFER_FUNCTION_SRGB;
 
+		/* Note: although we pass has_rom as parameter here, we never
+		 * actually use ROM because the color module only takes the ROM
+		 * path if transfer_func->type == PREDEFINED.
+		 *
+		 * See more in mod_color_calculate_regamma_params()
+		 */
 		r = __set_legacy_tf(stream->out_transfer_func, regamma_lut,
 				    regamma_size, has_rom);
 		if (r)
 			return r;
-	} else if (has_regamma) {
-		/* If atomic regamma, CRTC RGM goes into RGM LUT. */
-		stream->out_transfer_func->type = TF_TYPE_DISTRIBUTED_POINTS;
-		stream->out_transfer_func->tf = TRANSFER_FUNCTION_LINEAR;
-
-		r = __set_output_tf(stream->out_transfer_func, regamma_lut,
-				    regamma_size, has_rom);
+	} else {
+		regamma_size = has_regamma ? regamma_size : 0;
+		r = amdgpu_dm_set_atomic_regamma(stream, regamma_lut,
+						 regamma_size, has_rom, tf);
 		if (r)
 			return r;
-	} else {
-		/*
-		 * No CRTC RGM means we can just put the block into bypass
-		 * since we don't have any plane level adjustments using it.
-		 */
-		stream->out_transfer_func->type = TF_TYPE_BYPASS;
-		stream->out_transfer_func->tf = TRANSFER_FUNCTION_LINEAR;
 	}
 
 	/*
@@ -495,20 +982,10 @@ int amdgpu_dm_update_crtc_color_mgmt(struct dm_crtc_state *crtc)
 	return 0;
 }
 
-/**
- * amdgpu_dm_update_plane_color_mgmt: Maps DRM color management to DC plane.
- * @crtc: amdgpu_dm crtc state
- * @dc_plane_state: target DC surface
- *
- * Update the underlying dc_stream_state's input transfer function (ITF) in
- * preparation for hardware commit. The transfer function used depends on
- * the preparation done on the stream for color management.
- *
- * Returns:
- * 0 on success. -ENOMEM if mem allocation fails.
- */
-int amdgpu_dm_update_plane_color_mgmt(struct dm_crtc_state *crtc,
-				      struct dc_plane_state *dc_plane_state)
+static int
+map_crtc_degamma_to_dc_plane(struct dm_crtc_state *crtc,
+			     struct dc_plane_state *dc_plane_state,
+			     struct dc_color_caps *caps)
 {
 	const struct drm_color_lut *degamma_lut;
 	enum dc_transfer_func_predefined tf = TRANSFER_FUNCTION_SRGB;
@@ -531,8 +1008,7 @@ int amdgpu_dm_update_plane_color_mgmt(struct dm_crtc_state *crtc,
 						 &degamma_size);
 		ASSERT(degamma_size == MAX_COLOR_LUT_ENTRIES);
 
-		dc_plane_state->in_transfer_func->type =
-			TF_TYPE_DISTRIBUTED_POINTS;
+		dc_plane_state->in_transfer_func->type = TF_TYPE_DISTRIBUTED_POINTS;
 
 		/*
 		 * This case isn't fully correct, but also fairly
@@ -564,11 +1040,11 @@ int amdgpu_dm_update_plane_color_mgmt(struct dm_crtc_state *crtc,
 			dc_plane_state->in_transfer_func->tf =
 				TRANSFER_FUNCTION_LINEAR;
 
-		r = __set_input_tf(dc_plane_state->in_transfer_func,
+		r = __set_input_tf(caps, dc_plane_state->in_transfer_func,
 				   degamma_lut, degamma_size);
 		if (r)
 			return r;
-	} else if (crtc->cm_is_degamma_srgb) {
+	} else {
 		/*
 		 * For legacy gamma support we need the regamma input
 		 * in linear space. Assume that the input is sRGB.
@@ -577,14 +1053,213 @@ int amdgpu_dm_update_plane_color_mgmt(struct dm_crtc_state *crtc,
 		dc_plane_state->in_transfer_func->tf = tf;
 
 		if (tf != TRANSFER_FUNCTION_SRGB &&
-		    !mod_color_calculate_degamma_params(NULL,
-			    dc_plane_state->in_transfer_func, NULL, false))
+		    !mod_color_calculate_degamma_params(caps,
+							dc_plane_state->in_transfer_func,
+							NULL, false))
+			return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static int
+__set_dm_plane_degamma(struct drm_plane_state *plane_state,
+		       struct dc_plane_state *dc_plane_state,
+		       struct dc_color_caps *color_caps)
+{
+	struct dm_plane_state *dm_plane_state = to_dm_plane_state(plane_state);
+	const struct drm_color_lut *degamma_lut;
+	enum amdgpu_transfer_function tf = AMDGPU_TRANSFER_FUNCTION_DEFAULT;
+	uint32_t degamma_size;
+	bool has_degamma_lut;
+	int ret;
+
+	degamma_lut = __extract_blob_lut(dm_plane_state->degamma_lut,
+					 &degamma_size);
+
+	has_degamma_lut = degamma_lut &&
+			  !__is_lut_linear(degamma_lut, degamma_size);
+
+	tf = dm_plane_state->degamma_tf;
+
+	/* If we don't have plane degamma LUT nor TF to set on DC, we have
+	 * nothing to do here, return.
+	 */
+	if (!has_degamma_lut && tf == AMDGPU_TRANSFER_FUNCTION_DEFAULT)
+		return -EINVAL;
+
+	dc_plane_state->in_transfer_func->tf = amdgpu_tf_to_dc_tf(tf);
+
+	if (has_degamma_lut) {
+		ASSERT(degamma_size == MAX_COLOR_LUT_ENTRIES);
+
+		dc_plane_state->in_transfer_func->type =
+			TF_TYPE_DISTRIBUTED_POINTS;
+
+		ret = __set_input_tf(color_caps, dc_plane_state->in_transfer_func,
+				     degamma_lut, degamma_size);
+		if (ret)
+			return ret;
+       } else {
+		dc_plane_state->in_transfer_func->type =
+			TF_TYPE_PREDEFINED;
+
+		if (!mod_color_calculate_degamma_params(color_caps,
+		    dc_plane_state->in_transfer_func, NULL, false))
 			return -ENOMEM;
-	} else {
-		/* ...Otherwise we can just bypass the DGM block. */
-		dc_plane_state->in_transfer_func->type = TF_TYPE_BYPASS;
-		dc_plane_state->in_transfer_func->tf = TRANSFER_FUNCTION_LINEAR;
+	}
+	return 0;
+}
+
+static int
+amdgpu_dm_plane_set_color_properties(struct drm_plane_state *plane_state,
+				     struct dc_plane_state *dc_plane_state,
+				     struct dc_color_caps *color_caps)
+{
+	struct dm_plane_state *dm_plane_state = to_dm_plane_state(plane_state);
+	enum amdgpu_transfer_function shaper_tf = AMDGPU_TRANSFER_FUNCTION_DEFAULT;
+	enum amdgpu_transfer_function blend_tf = AMDGPU_TRANSFER_FUNCTION_DEFAULT;
+	const struct drm_color_lut *shaper_lut, *lut3d, *blend_lut;
+	uint32_t shaper_size, lut3d_size, blend_size;
+	int ret;
+
+	/* We have nothing to do here, return */
+	if (!plane_state->color_mgmt_changed)
+		return 0;
+
+	dc_plane_state->hdr_mult = dc_fixpt_from_s3132(dm_plane_state->hdr_mult);
+
+	shaper_lut = __extract_blob_lut(dm_plane_state->shaper_lut, &shaper_size);
+	shaper_size = shaper_lut != NULL ? shaper_size : 0;
+	shaper_tf = dm_plane_state->shaper_tf;
+	lut3d = __extract_blob_lut(dm_plane_state->lut3d, &lut3d_size);
+	lut3d_size = lut3d != NULL ? lut3d_size : 0;
+
+	amdgpu_dm_atomic_lut3d(lut3d, lut3d_size, dc_plane_state->lut3d_func);
+	ret = amdgpu_dm_atomic_shaper_lut(shaper_lut, false,
+					  amdgpu_tf_to_dc_tf(shaper_tf),
+					  shaper_size,
+					  dc_plane_state->in_shaper_func);
+	if (ret) {
+		drm_dbg_kms(plane_state->plane->dev,
+			    "setting plane %d shaper LUT failed.\n",
+			    plane_state->plane->index);
+
+		return ret;
+	}
+
+	blend_tf = dm_plane_state->blend_tf;
+	blend_lut = __extract_blob_lut(dm_plane_state->blend_lut, &blend_size);
+	blend_size = blend_lut != NULL ? blend_size : 0;
+
+	ret = amdgpu_dm_atomic_blend_lut(blend_lut, false,
+					 amdgpu_tf_to_dc_tf(blend_tf),
+					 blend_size, dc_plane_state->blend_tf);
+	if (ret) {
+		drm_dbg_kms(plane_state->plane->dev,
+			    "setting plane %d gamma lut failed.\n",
+			    plane_state->plane->index);
+
+		return ret;
 	}
 
 	return 0;
 }
+
+/**
+ * amdgpu_dm_update_plane_color_mgmt: Maps DRM color management to DC plane.
+ * @crtc: amdgpu_dm crtc state
+ * @plane_state: DRM plane state
+ * @dc_plane_state: target DC surface
+ *
+ * Update the underlying dc_stream_state's input transfer function (ITF) in
+ * preparation for hardware commit. The transfer function used depends on
+ * the preparation done on the stream for color management.
+ *
+ * Returns:
+ * 0 on success. -ENOMEM if mem allocation fails.
+ */
+int amdgpu_dm_update_plane_color_mgmt(struct dm_crtc_state *crtc,
+				      struct drm_plane_state *plane_state,
+				      struct dc_plane_state *dc_plane_state)
+{
+	struct amdgpu_device *adev = drm_to_adev(crtc->base.state->dev);
+	struct dm_plane_state *dm_plane_state = to_dm_plane_state(plane_state);
+	struct drm_color_ctm2 *ctm = NULL;
+	struct dc_color_caps *color_caps = NULL;
+	bool has_crtc_cm_degamma;
+	int ret;
+
+	ret = amdgpu_dm_verify_lut3d_size(adev, plane_state);
+	if (ret) {
+		drm_dbg_driver(&adev->ddev, "amdgpu_dm_verify_lut3d_size() failed\n");
+		return ret;
+	}
+
+	if (dc_plane_state->ctx && dc_plane_state->ctx->dc)
+		color_caps = &dc_plane_state->ctx->dc->caps.color;
+
+	/* Initially, we can just bypass the DGM block. */
+	dc_plane_state->in_transfer_func->type = TF_TYPE_BYPASS;
+	dc_plane_state->in_transfer_func->tf = TRANSFER_FUNCTION_LINEAR;
+
+	/* After, we start to update values according to color props */
+	has_crtc_cm_degamma = (crtc->cm_has_degamma || crtc->cm_is_degamma_srgb);
+
+	ret = __set_dm_plane_degamma(plane_state, dc_plane_state, color_caps);
+	if (ret == -ENOMEM)
+		return ret;
+
+	/* We only have one degamma block available (pre-blending) for the
+	 * whole color correction pipeline, so that we can't actually perform
+	 * plane and CRTC degamma at the same time. Explicitly reject atomic
+	 * updates when userspace sets both plane and CRTC degamma properties.
+	 */
+	if (has_crtc_cm_degamma && ret != -EINVAL){
+		drm_dbg_kms(crtc->base.crtc->dev,
+			    "doesn't support plane and CRTC degamma at the same time\n");
+			return -EINVAL;
+	}
+
+	/* If we are here, it means we don't have plane degamma settings, check
+	 * if we have CRTC degamma waiting for mapping to pre-blending degamma
+	 * block
+	 */
+	if (has_crtc_cm_degamma) {
+		/* AMD HW doesn't have post-blending degamma caps. When DRM
+		 * CRTC atomic degamma is set, we maps it to DPP degamma block
+		 * (pre-blending) or, on legacy gamma, we use DPP degamma to
+		 * linearize (implicit degamma) from sRGB/BT709 according to
+		 * the input space.
+		 */
+		ret = map_crtc_degamma_to_dc_plane(crtc, dc_plane_state, color_caps);
+		if (ret)
+			return ret;
+	}
+
+	/* Setup CRTC CTM. */
+	if (dm_plane_state->ctm) {
+		ctm = (struct drm_color_ctm2 *)dm_plane_state->ctm->data;
+
+		/*
+		 * So far, if we have both plane and CRTC CTM, plane CTM takes
+		 * the priority and we discard data for CRTC CTM, as
+		 * implemented in dcn10_program_gamut_remap().  However, we
+		 * have MPC gamut_remap_matrix from DCN3 family, therefore we
+		 * can remap MPC programing of the matrix to MPC block and
+		 * provide support for both DPP and MPC matrix at the same
+		 * time.
+		 */
+		__drm_ctm2_to_dc_matrix(ctm, dc_plane_state->gamut_remap_matrix.matrix);
+
+		dc_plane_state->gamut_remap_matrix.enable_remap = true;
+		dc_plane_state->input_csc_color_matrix.enable_adjustment = false;
+	} else {
+		/* Bypass CTM. */
+		dc_plane_state->gamut_remap_matrix.enable_remap = false;
+		dc_plane_state->input_csc_color_matrix.enable_adjustment = false;
+	}
+
+	return amdgpu_dm_plane_set_color_properties(plane_state,
+						    dc_plane_state, color_caps);
+}
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c
index 440fc0869a34..d746f0aa0f11 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c
@@ -253,6 +253,7 @@ static struct drm_crtc_state *dm_crtc_duplicate_state(struct drm_crtc *crtc)
 	state->freesync_config = cur->freesync_config;
 	state->cm_has_degamma = cur->cm_has_degamma;
 	state->cm_is_degamma_srgb = cur->cm_is_degamma_srgb;
+	state->regamma_tf = cur->regamma_tf;
 	state->crc_skip_count = cur->crc_skip_count;
 	state->mpo_requested = cur->mpo_requested;
 	/* TODO Duplicate dc_stream after objects are stream object is flattened */
@@ -289,6 +290,70 @@ static int amdgpu_dm_crtc_late_register(struct drm_crtc *crtc)
 }
 #endif
 
+#ifdef AMD_PRIVATE_COLOR
+/**
+ * drm_crtc_additional_color_mgmt - enable additional color properties
+ * @crtc: DRM CRTC
+ *
+ * This function lets the driver enable post-blending CRTC regamma transfer
+ * function property in addition to DRM CRTC gamma LUT. Default value means
+ * linear transfer function, which is the default CRTC gamma LUT behaviour
+ * without this property.
+ */
+static void
+dm_crtc_additional_color_mgmt(struct drm_crtc *crtc)
+{
+	struct amdgpu_device *adev = drm_to_adev(crtc->dev);
+
+	if(adev->dm.dc->caps.color.mpc.ogam_ram)
+		drm_object_attach_property(&crtc->base,
+					   adev->mode_info.regamma_tf_property,
+					   AMDGPU_TRANSFER_FUNCTION_DEFAULT);
+}
+
+static int
+amdgpu_dm_atomic_crtc_set_property(struct drm_crtc *crtc,
+				   struct drm_crtc_state *state,
+				   struct drm_property *property,
+				   uint64_t val)
+{
+	struct amdgpu_device *adev = drm_to_adev(crtc->dev);
+	struct dm_crtc_state *acrtc_state = to_dm_crtc_state(state);
+
+	if (property == adev->mode_info.regamma_tf_property) {
+		if (acrtc_state->regamma_tf != val) {
+			acrtc_state->regamma_tf = val;
+			acrtc_state->base.color_mgmt_changed |= 1;
+		}
+	} else {
+		drm_dbg_atomic(crtc->dev,
+			       "[CRTC:%d:%s] unknown property [PROP:%d:%s]]\n",
+			       crtc->base.id, crtc->name,
+			       property->base.id, property->name);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int
+amdgpu_dm_atomic_crtc_get_property(struct drm_crtc *crtc,
+				   const struct drm_crtc_state *state,
+				   struct drm_property *property,
+				   uint64_t *val)
+{
+	struct amdgpu_device *adev = drm_to_adev(crtc->dev);
+	struct dm_crtc_state *acrtc_state = to_dm_crtc_state(state);
+
+	if (property == adev->mode_info.regamma_tf_property)
+		*val = acrtc_state->regamma_tf;
+	else
+		return -EINVAL;
+
+	return 0;
+}
+#endif
+
 /* Implemented only the options currently available for the driver */
 static const struct drm_crtc_funcs amdgpu_dm_crtc_funcs = {
 	.reset = dm_crtc_reset_state,
@@ -307,6 +372,10 @@ static const struct drm_crtc_funcs amdgpu_dm_crtc_funcs = {
 #if defined(CONFIG_DEBUG_FS)
 	.late_register = amdgpu_dm_crtc_late_register,
 #endif
+#ifdef AMD_PRIVATE_COLOR
+	.atomic_set_property = amdgpu_dm_atomic_crtc_set_property,
+	.atomic_get_property = amdgpu_dm_atomic_crtc_get_property,
+#endif
 };
 
 static void dm_crtc_helper_disable(struct drm_crtc *crtc)
@@ -470,6 +539,9 @@ int amdgpu_dm_crtc_init(struct amdgpu_display_manager *dm,
 
 	drm_mode_crtc_set_gamma_size(&acrtc->base, MAX_COLOR_LEGACY_LUT_ENTRIES);
 
+#ifdef AMD_PRIVATE_COLOR
+	dm_crtc_additional_color_mgmt(&acrtc->base);
+#endif
 	return 0;
 
 fail:
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c
index 6c84ca2ae373..ea03c2243729 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c
@@ -1324,8 +1324,14 @@ static void dm_drm_plane_reset(struct drm_plane *plane)
 	amdgpu_state = kzalloc(sizeof(*amdgpu_state), GFP_KERNEL);
 	WARN_ON(amdgpu_state == NULL);
 
-	if (amdgpu_state)
-		__drm_atomic_helper_plane_reset(plane, &amdgpu_state->base);
+	if (!amdgpu_state)
+		return;
+
+	__drm_atomic_helper_plane_reset(plane, &amdgpu_state->base);
+	amdgpu_state->degamma_tf = AMDGPU_TRANSFER_FUNCTION_DEFAULT;
+	amdgpu_state->hdr_mult = AMDGPU_HDR_MULT_DEFAULT;
+	amdgpu_state->shaper_tf = AMDGPU_TRANSFER_FUNCTION_DEFAULT;
+	amdgpu_state->blend_tf = AMDGPU_TRANSFER_FUNCTION_DEFAULT;
 }
 
 static struct drm_plane_state *
@@ -1345,6 +1351,22 @@ dm_drm_plane_duplicate_state(struct drm_plane *plane)
 		dc_plane_state_retain(dm_plane_state->dc_state);
 	}
 
+	if (dm_plane_state->degamma_lut)
+		drm_property_blob_get(dm_plane_state->degamma_lut);
+	if (dm_plane_state->ctm)
+		drm_property_blob_get(dm_plane_state->ctm);
+	if (dm_plane_state->shaper_lut)
+		drm_property_blob_get(dm_plane_state->shaper_lut);
+	if (dm_plane_state->lut3d)
+		drm_property_blob_get(dm_plane_state->lut3d);
+	if (dm_plane_state->blend_lut)
+		drm_property_blob_get(dm_plane_state->blend_lut);
+
+	dm_plane_state->degamma_tf = old_dm_plane_state->degamma_tf;
+	dm_plane_state->hdr_mult = old_dm_plane_state->hdr_mult;
+	dm_plane_state->shaper_tf = old_dm_plane_state->shaper_tf;
+	dm_plane_state->blend_tf = old_dm_plane_state->blend_tf;
+
 	return &dm_plane_state->base;
 }
 
@@ -1412,12 +1434,203 @@ static void dm_drm_plane_destroy_state(struct drm_plane *plane,
 {
 	struct dm_plane_state *dm_plane_state = to_dm_plane_state(state);
 
+	if (dm_plane_state->degamma_lut)
+		drm_property_blob_put(dm_plane_state->degamma_lut);
+	if (dm_plane_state->ctm)
+		drm_property_blob_put(dm_plane_state->ctm);
+	if (dm_plane_state->lut3d)
+		drm_property_blob_put(dm_plane_state->lut3d);
+	if (dm_plane_state->shaper_lut)
+		drm_property_blob_put(dm_plane_state->shaper_lut);
+	if (dm_plane_state->blend_lut)
+		drm_property_blob_put(dm_plane_state->blend_lut);
+
 	if (dm_plane_state->dc_state)
 		dc_plane_state_release(dm_plane_state->dc_state);
 
 	drm_atomic_helper_plane_destroy_state(plane, state);
 }
 
+#ifdef AMD_PRIVATE_COLOR
+static void
+dm_atomic_plane_attach_color_mgmt_properties(struct amdgpu_display_manager *dm,
+					     struct drm_plane *plane)
+{
+	struct amdgpu_mode_info mode_info = dm->adev->mode_info;
+	struct dpp_color_caps dpp_color_caps = dm->dc->caps.color.dpp;
+
+	/* Check HW color pipeline capabilities for DPP (pre-blending) before expose*/
+	if (dpp_color_caps.dgam_ram || dpp_color_caps.gamma_corr) {
+		drm_object_attach_property(&plane->base,
+					   mode_info.plane_degamma_lut_property, 0);
+		drm_object_attach_property(&plane->base,
+					   mode_info.plane_degamma_lut_size_property,
+					   MAX_COLOR_LUT_ENTRIES);
+		drm_object_attach_property(&plane->base,
+					   dm->adev->mode_info.plane_degamma_tf_property,
+					   AMDGPU_TRANSFER_FUNCTION_DEFAULT);
+	}
+	/* HDR MULT is always available */
+	drm_object_attach_property(&plane->base,
+				   dm->adev->mode_info.plane_hdr_mult_property,
+				   AMDGPU_HDR_MULT_DEFAULT);
+
+	/* Only enable plane CTM if both DPP and MPC gamut remap is available. */
+	if (dm->dc->caps.color.mpc.gamut_remap)
+		drm_object_attach_property(&plane->base,
+					   dm->adev->mode_info.plane_ctm_property, 0);
+
+	if (dpp_color_caps.hw_3d_lut) {
+		drm_object_attach_property(&plane->base,
+					   mode_info.plane_shaper_lut_property, 0);
+		drm_object_attach_property(&plane->base,
+					   mode_info.plane_shaper_lut_size_property,
+					   MAX_COLOR_LUT_ENTRIES);
+		drm_object_attach_property(&plane->base,
+					   mode_info.plane_shaper_tf_property,
+					   AMDGPU_TRANSFER_FUNCTION_DEFAULT);
+		drm_object_attach_property(&plane->base,
+					   mode_info.plane_lut3d_property, 0);
+		drm_object_attach_property(&plane->base,
+					   mode_info.plane_lut3d_size_property,
+					   MAX_COLOR_3DLUT_ENTRIES);
+	}
+
+	if (dpp_color_caps.ogam_ram) {
+		drm_object_attach_property(&plane->base,
+					   mode_info.plane_blend_lut_property, 0);
+		drm_object_attach_property(&plane->base,
+					   mode_info.plane_blend_lut_size_property,
+					   MAX_COLOR_LUT_ENTRIES);
+		drm_object_attach_property(&plane->base,
+					   mode_info.plane_blend_tf_property,
+					   AMDGPU_TRANSFER_FUNCTION_DEFAULT);
+	}
+}
+
+static int
+dm_atomic_plane_set_property(struct drm_plane *plane,
+			     struct drm_plane_state *state,
+			     struct drm_property *property,
+			     uint64_t val)
+{
+	struct dm_plane_state *dm_plane_state = to_dm_plane_state(state);
+	struct amdgpu_device *adev = drm_to_adev(plane->dev);
+	bool replaced = false;
+	int ret;
+
+	if (property == adev->mode_info.plane_degamma_lut_property) {
+		ret = drm_property_replace_blob_from_id(plane->dev,
+							&dm_plane_state->degamma_lut,
+							val,
+							-1, sizeof(struct drm_color_lut),
+							&replaced);
+		dm_plane_state->base.color_mgmt_changed |= replaced;
+		return ret;
+	} else if (property == adev->mode_info.plane_degamma_tf_property) {
+		if (dm_plane_state->degamma_tf != val) {
+			dm_plane_state->degamma_tf = val;
+			dm_plane_state->base.color_mgmt_changed = 1;
+		}
+	} else if (property == adev->mode_info.plane_hdr_mult_property) {
+		if (dm_plane_state->hdr_mult != val) {
+			dm_plane_state->hdr_mult = val;
+			dm_plane_state->base.color_mgmt_changed = 1;
+		}
+	} else if (property == adev->mode_info.plane_ctm_property) {
+		ret = drm_property_replace_blob_from_id(plane->dev,
+							&dm_plane_state->ctm,
+							val,
+							sizeof(struct drm_color_ctm2), -1,
+							&replaced);
+		dm_plane_state->base.color_mgmt_changed |= replaced;
+		return ret;
+	} else if (property == adev->mode_info.plane_shaper_lut_property) {
+		ret = drm_property_replace_blob_from_id(plane->dev,
+							&dm_plane_state->shaper_lut,
+							val, -1,
+							sizeof(struct drm_color_lut),
+							&replaced);
+		dm_plane_state->base.color_mgmt_changed |= replaced;
+		return ret;
+	} else if (property == adev->mode_info.plane_shaper_tf_property) {
+		if (dm_plane_state->shaper_tf != val) {
+			dm_plane_state->shaper_tf = val;
+			dm_plane_state->base.color_mgmt_changed = 1;
+		}
+	} else if (property == adev->mode_info.plane_lut3d_property) {
+		ret = drm_property_replace_blob_from_id(plane->dev,
+							&dm_plane_state->lut3d,
+							val, -1,
+							sizeof(struct drm_color_lut),
+							&replaced);
+		dm_plane_state->base.color_mgmt_changed |= replaced;
+		return ret;
+	} else if (property == adev->mode_info.plane_blend_lut_property) {
+		ret = drm_property_replace_blob_from_id(plane->dev,
+							&dm_plane_state->blend_lut,
+							val, -1,
+							sizeof(struct drm_color_lut),
+							&replaced);
+		dm_plane_state->base.color_mgmt_changed |= replaced;
+		return ret;
+	} else if (property == adev->mode_info.plane_blend_tf_property) {
+		if (dm_plane_state->blend_tf != val) {
+			dm_plane_state->blend_tf = val;
+			dm_plane_state->base.color_mgmt_changed = 1;
+		}
+	} else {
+		drm_dbg_atomic(plane->dev,
+			       "[PLANE:%d:%s] unknown property [PROP:%d:%s]]\n",
+			       plane->base.id, plane->name,
+			       property->base.id, property->name);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int
+dm_atomic_plane_get_property(struct drm_plane *plane,
+			     const struct drm_plane_state *state,
+			     struct drm_property *property,
+			     uint64_t *val)
+{
+	struct dm_plane_state *dm_plane_state = to_dm_plane_state(state);
+	struct amdgpu_device *adev = drm_to_adev(plane->dev);
+
+	if (property == adev->mode_info.plane_degamma_lut_property) {
+		*val = (dm_plane_state->degamma_lut) ?
+			dm_plane_state->degamma_lut->base.id : 0;
+	} else if (property == adev->mode_info.plane_degamma_tf_property) {
+		*val = dm_plane_state->degamma_tf;
+	} else if (property == adev->mode_info.plane_hdr_mult_property) {
+		*val = dm_plane_state->hdr_mult;
+	} else if (property == adev->mode_info.plane_ctm_property) {
+		*val = (dm_plane_state->ctm) ?
+			dm_plane_state->ctm->base.id : 0;
+	} else 	if (property == adev->mode_info.plane_shaper_lut_property) {
+		*val = (dm_plane_state->shaper_lut) ?
+			dm_plane_state->shaper_lut->base.id : 0;
+	} else if (property == adev->mode_info.plane_shaper_tf_property) {
+		*val = dm_plane_state->shaper_tf;
+	} else 	if (property == adev->mode_info.plane_lut3d_property) {
+		*val = (dm_plane_state->lut3d) ?
+			dm_plane_state->lut3d->base.id : 0;
+	} else 	if (property == adev->mode_info.plane_blend_lut_property) {
+		*val = (dm_plane_state->blend_lut) ?
+			dm_plane_state->blend_lut->base.id : 0;
+	} else if (property == adev->mode_info.plane_blend_tf_property) {
+		*val = dm_plane_state->blend_tf;
+
+	} else {
+		return -EINVAL;
+	}
+
+	return 0;
+}
+#endif
+
 static const struct drm_plane_funcs dm_plane_funcs = {
 	.update_plane	= drm_atomic_helper_update_plane,
 	.disable_plane	= drm_atomic_helper_disable_plane,
@@ -1426,6 +1639,10 @@ static const struct drm_plane_funcs dm_plane_funcs = {
 	.atomic_duplicate_state = dm_drm_plane_duplicate_state,
 	.atomic_destroy_state = dm_drm_plane_destroy_state,
 	.format_mod_supported = dm_plane_format_mod_supported,
+#ifdef AMD_PRIVATE_COLOR
+	.atomic_set_property = dm_atomic_plane_set_property,
+	.atomic_get_property = dm_atomic_plane_get_property,
+#endif
 };
 
 int amdgpu_dm_plane_init(struct amdgpu_display_manager *dm,
@@ -1496,6 +1713,9 @@ int amdgpu_dm_plane_init(struct amdgpu_display_manager *dm,
 
 	drm_plane_helper_add(plane, &dm_plane_helper_funcs);
 
+#ifdef AMD_PRIVATE_COLOR
+	dm_atomic_plane_attach_color_mgmt_properties(dm, plane);
+#endif
 	/* Create (reset) the plane state */
 	if (plane->funcs->reset)
 		plane->funcs->reset(plane);
diff --git a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_cm_common.c b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_cm_common.c
index 3538973bd0c6..04b2e04b68f3 100644
--- a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_cm_common.c
+++ b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_cm_common.c
@@ -349,20 +349,37 @@ bool cm_helper_translate_curve_to_hw_format(struct dc_context *ctx,
 		 * segment is from 2^-10 to 2^1
 		 * There are less than 256 points, for optimization
 		 */
-		seg_distr[0] = 3;
-		seg_distr[1] = 4;
-		seg_distr[2] = 4;
-		seg_distr[3] = 4;
-		seg_distr[4] = 4;
-		seg_distr[5] = 4;
-		seg_distr[6] = 4;
-		seg_distr[7] = 4;
-		seg_distr[8] = 4;
-		seg_distr[9] = 4;
-		seg_distr[10] = 1;
-
-		region_start = -10;
-		region_end = 1;
+		if (output_tf->tf == TRANSFER_FUNCTION_LINEAR) {
+			seg_distr[0] = 0; /* 2 */
+			seg_distr[1] = 1; /* 4 */
+			seg_distr[2] = 2; /* 4 */
+			seg_distr[3] = 3; /* 8 */
+			seg_distr[4] = 4; /* 16 */
+			seg_distr[5] = 5; /* 32 */
+			seg_distr[6] = 6; /* 64 */
+			seg_distr[7] = 7; /* 128 */
+
+			region_start = -8;
+			region_end = 1;
+		} else {
+			seg_distr[0] = 3; /* 8 */
+			seg_distr[1] = 4; /* 16 */
+			seg_distr[2] = 4;
+			seg_distr[3] = 4;
+			seg_distr[4] = 4;
+			seg_distr[5] = 4;
+			seg_distr[6] = 4;
+			seg_distr[7] = 4;
+			seg_distr[8] = 4;
+			seg_distr[9] = 4;
+			seg_distr[10] = 1; /* 2 */
+			/* total = 8*16 + 8 + 64 + 2 = */
+
+			region_start = -10;
+			region_end = 1;
+		}
+
+
 	}
 
 	for (i = region_end - region_start; i < MAX_REGIONS_NUMBER ; i++)
@@ -375,16 +392,56 @@ bool cm_helper_translate_curve_to_hw_format(struct dc_context *ctx,
 
 	j = 0;
 	for (k = 0; k < (region_end - region_start); k++) {
-		increment = NUMBER_SW_SEGMENTS / (1 << seg_distr[k]);
+		/*
+		 * We're using an ugly-ish hack here. Our HW allows for
+		 * 256 segments per region but SW_SEGMENTS is 16.
+		 * SW_SEGMENTS has some undocumented relationship to
+		 * the number of points in the tf_pts struct, which
+		 * is 512, unlike what's suggested TRANSFER_FUNC_POINTS.
+		 *
+		 * In order to work past this dilemma we'll scale our
+		 * increment by (1 << 4) and then do the inverse (1 >> 4)
+		 * when accessing the elements in tf_pts.
+		 *
+		 * TODO: find a better way using SW_SEGMENTS and
+		 *       TRANSFER_FUNC_POINTS definitions
+		 */
+		increment = (NUMBER_SW_SEGMENTS << 4) / (1 << seg_distr[k]);
 		start_index = (region_start + k + MAX_LOW_POINT) *
 				NUMBER_SW_SEGMENTS;
-		for (i = start_index; i < start_index + NUMBER_SW_SEGMENTS;
+		for (i = (start_index << 4); i < (start_index << 4) + (NUMBER_SW_SEGMENTS << 4);
 				i += increment) {
+			struct fixed31_32 in_plus_one, in;
+			struct fixed31_32 value, red_value, green_value, blue_value;
+			uint32_t t = i & 0xf;
+
 			if (j == hw_points - 1)
 				break;
-			rgb_resulted[j].red = output_tf->tf_pts.red[i];
-			rgb_resulted[j].green = output_tf->tf_pts.green[i];
-			rgb_resulted[j].blue = output_tf->tf_pts.blue[i];
+
+			in_plus_one = output_tf->tf_pts.red[(i >> 4) + 1];
+			in = output_tf->tf_pts.red[i >> 4];
+			value = dc_fixpt_sub(in_plus_one, in);
+			value = dc_fixpt_shr(dc_fixpt_mul_int(value, t),  4);
+			value = dc_fixpt_add(in, value);
+			red_value = value;
+
+			in_plus_one = output_tf->tf_pts.green[(i >> 4) + 1];
+			in = output_tf->tf_pts.green[i >> 4];
+			value = dc_fixpt_sub(in_plus_one, in);
+			value = dc_fixpt_shr(dc_fixpt_mul_int(value, t),  4);
+			value = dc_fixpt_add(in, value);
+			green_value = value;
+
+			in_plus_one = output_tf->tf_pts.blue[(i >> 4) + 1];
+			in = output_tf->tf_pts.blue[i >> 4];
+			value = dc_fixpt_sub(in_plus_one, in);
+			value = dc_fixpt_shr(dc_fixpt_mul_int(value, t),  4);
+			value = dc_fixpt_add(in, value);
+			blue_value = value;
+
+			rgb_resulted[j].red = red_value;
+			rgb_resulted[j].green = green_value;
+			rgb_resulted[j].blue = blue_value;
 			j++;
 		}
 	}
diff --git a/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_hwseq.c b/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_hwseq.c
index 4cd4ae07d73d..4fb4e9ec03f1 100644
--- a/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_hwseq.c
+++ b/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_hwseq.c
@@ -186,6 +186,43 @@ bool dcn30_set_input_transfer_func(struct dc *dc,
 	return result;
 }
 
+void dcn30_program_gamut_remap(struct pipe_ctx *pipe_ctx)
+{
+	int i = 0;
+	struct dpp_grph_csc_adjustment dpp_adjust;
+	struct mpc_grph_gamut_adjustment mpc_adjust;
+	int mpcc_id = pipe_ctx->plane_res.hubp->inst;
+	struct mpc *mpc = pipe_ctx->stream_res.opp->ctx->dc->res_pool->mpc;
+
+	memset(&dpp_adjust, 0, sizeof(dpp_adjust));
+	dpp_adjust.gamut_adjust_type = GRAPHICS_GAMUT_ADJUST_TYPE_BYPASS;
+
+	if (pipe_ctx->plane_state &&
+	    pipe_ctx->plane_state->gamut_remap_matrix.enable_remap == true) {
+		dpp_adjust.gamut_adjust_type = GRAPHICS_GAMUT_ADJUST_TYPE_SW;
+		for (i = 0; i < CSC_TEMPERATURE_MATRIX_SIZE; i++)
+			dpp_adjust.temperature_matrix[i] =
+				pipe_ctx->plane_state->gamut_remap_matrix.matrix[i];
+	}
+
+	pipe_ctx->plane_res.dpp->funcs->dpp_set_gamut_remap(pipe_ctx->plane_res.dpp,
+							    &dpp_adjust);
+
+	memset(&mpc_adjust, 0, sizeof(mpc_adjust));
+	mpc_adjust.gamut_adjust_type = GRAPHICS_GAMUT_ADJUST_TYPE_BYPASS;
+
+	if (pipe_ctx->top_pipe == NULL) {
+		if (pipe_ctx->stream->gamut_remap_matrix.enable_remap == true) {
+			mpc_adjust.gamut_adjust_type = GRAPHICS_GAMUT_ADJUST_TYPE_SW;
+			for (i = 0; i < CSC_TEMPERATURE_MATRIX_SIZE; i++)
+				mpc_adjust.temperature_matrix[i] =
+					pipe_ctx->stream->gamut_remap_matrix.matrix[i];
+		}
+	}
+
+	mpc->funcs->set_gamut_remap(mpc, mpcc_id, &mpc_adjust);
+}
+
 bool dcn30_set_output_transfer_func(struct dc *dc,
 				struct pipe_ctx *pipe_ctx,
 				const struct dc_stream_state *stream)
diff --git a/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_hwseq.h b/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_hwseq.h
index a24a8e33a3d2..cb34ca932a5f 100644
--- a/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_hwseq.h
+++ b/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_hwseq.h
@@ -58,6 +58,9 @@ bool dcn30_set_blend_lut(struct pipe_ctx *pipe_ctx,
 bool dcn30_set_input_transfer_func(struct dc *dc,
 				struct pipe_ctx *pipe_ctx,
 				const struct dc_plane_state *plane_state);
+
+void dcn30_program_gamut_remap(struct pipe_ctx *pipe_ctx);
+
 bool dcn30_set_output_transfer_func(struct dc *dc,
 				struct pipe_ctx *pipe_ctx,
 				const struct dc_stream_state *stream);
diff --git a/drivers/gpu/drm/amd/display/dc/dcn301/dcn301_init.c b/drivers/gpu/drm/amd/display/dc/dcn301/dcn301_init.c
index 61205cdbe2d5..fdbe3d42cd7b 100644
--- a/drivers/gpu/drm/amd/display/dc/dcn301/dcn301_init.c
+++ b/drivers/gpu/drm/amd/display/dc/dcn301/dcn301_init.c
@@ -33,7 +33,7 @@
 #include "dcn301_init.h"
 
 static const struct hw_sequencer_funcs dcn301_funcs = {
-	.program_gamut_remap = dcn10_program_gamut_remap,
+	.program_gamut_remap = dcn30_program_gamut_remap,
 	.init_hw = dcn10_init_hw,
 	.power_down_on_boot = dcn10_power_down_on_boot,
 	.apply_ctx_to_hw = dce110_apply_ctx_to_hw,
diff --git a/drivers/gpu/drm/amd/display/include/fixed31_32.h b/drivers/gpu/drm/amd/display/include/fixed31_32.h
index d4cf7ead1d87..84da1dd34efd 100644
--- a/drivers/gpu/drm/amd/display/include/fixed31_32.h
+++ b/drivers/gpu/drm/amd/display/include/fixed31_32.h
@@ -69,6 +69,18 @@ static const struct fixed31_32 dc_fixpt_epsilon = { 1LL };
 static const struct fixed31_32 dc_fixpt_half = { 0x80000000LL };
 static const struct fixed31_32 dc_fixpt_one = { 0x100000000LL };
 
+static inline struct fixed31_32 dc_fixpt_from_s3132(__u64 x)
+{
+	struct fixed31_32 val;
+
+	/* If negative, convert to 2's complement. */
+	if (x & (1ULL << 63))
+		x = -(x & ~(1ULL << 63));
+
+	val.value = x;
+	return val;
+}
+
 /*
  * @brief
  * Initialization routines
diff --git a/drivers/gpu/drm/arm/malidp_crtc.c b/drivers/gpu/drm/arm/malidp_crtc.c
index dc01c43f6193..d72c22dcf685 100644
--- a/drivers/gpu/drm/arm/malidp_crtc.c
+++ b/drivers/gpu/drm/arm/malidp_crtc.c
@@ -221,7 +221,7 @@ static int malidp_crtc_atomic_check_ctm(struct drm_crtc *crtc,
 
 	/*
 	 * The size of the ctm is checked in
-	 * drm_atomic_replace_property_blob_from_id.
+	 * drm_property_replace_blob_from_id.
 	 */
 	ctm = (struct drm_color_ctm *)state->ctm->data;
 	for (i = 0; i < ARRAY_SIZE(ctm->matrix); ++i) {
diff --git a/drivers/gpu/drm/drm_atomic.c b/drivers/gpu/drm/drm_atomic.c
index c277b198fa3f..c3df45f90145 100644
--- a/drivers/gpu/drm/drm_atomic.c
+++ b/drivers/gpu/drm/drm_atomic.c
@@ -733,6 +733,7 @@ static void drm_atomic_plane_print_state(struct drm_printer *p,
 		   drm_get_color_encoding_name(state->color_encoding));
 	drm_printf(p, "\tcolor-range=%s\n",
 		   drm_get_color_range_name(state->color_range));
+	drm_printf(p, "\tcolor_mgmt_changed=%d\n", state->color_mgmt_changed);
 
 	if (plane->funcs->atomic_print_state)
 		plane->funcs->atomic_print_state(p, state);
diff --git a/drivers/gpu/drm/drm_atomic_state_helper.c b/drivers/gpu/drm/drm_atomic_state_helper.c
index 784e63d70a42..25bb0859fda7 100644
--- a/drivers/gpu/drm/drm_atomic_state_helper.c
+++ b/drivers/gpu/drm/drm_atomic_state_helper.c
@@ -338,6 +338,7 @@ void __drm_atomic_helper_plane_duplicate_state(struct drm_plane *plane,
 	state->fence = NULL;
 	state->commit = NULL;
 	state->fb_damage_clips = NULL;
+	state->color_mgmt_changed = false;
 }
 EXPORT_SYMBOL(__drm_atomic_helper_plane_duplicate_state);
 
diff --git a/drivers/gpu/drm/drm_atomic_uapi.c b/drivers/gpu/drm/drm_atomic_uapi.c
index d867e7f9f2cd..a6a9ee5086dd 100644
--- a/drivers/gpu/drm/drm_atomic_uapi.c
+++ b/drivers/gpu/drm/drm_atomic_uapi.c
@@ -362,39 +362,6 @@ static s32 __user *get_out_fence_for_connector(struct drm_atomic_state *state,
 	return fence_ptr;
 }
 
-static int
-drm_atomic_replace_property_blob_from_id(struct drm_device *dev,
-					 struct drm_property_blob **blob,
-					 uint64_t blob_id,
-					 ssize_t expected_size,
-					 ssize_t expected_elem_size,
-					 bool *replaced)
-{
-	struct drm_property_blob *new_blob = NULL;
-
-	if (blob_id != 0) {
-		new_blob = drm_property_lookup_blob(dev, blob_id);
-		if (new_blob == NULL)
-			return -EINVAL;
-
-		if (expected_size > 0 &&
-		    new_blob->length != expected_size) {
-			drm_property_blob_put(new_blob);
-			return -EINVAL;
-		}
-		if (expected_elem_size > 0 &&
-		    new_blob->length % expected_elem_size != 0) {
-			drm_property_blob_put(new_blob);
-			return -EINVAL;
-		}
-	}
-
-	*replaced |= drm_property_replace_blob(blob, new_blob);
-	drm_property_blob_put(new_blob);
-
-	return 0;
-}
-
 static int drm_atomic_crtc_set_property(struct drm_crtc *crtc,
 		struct drm_crtc_state *state, struct drm_property *property,
 		uint64_t val)
@@ -415,7 +382,7 @@ static int drm_atomic_crtc_set_property(struct drm_crtc *crtc,
 	} else if (property == config->prop_vrr_enabled) {
 		state->vrr_enabled = val;
 	} else if (property == config->degamma_lut_property) {
-		ret = drm_atomic_replace_property_blob_from_id(dev,
+		ret = drm_property_replace_blob_from_id(dev,
 					&state->degamma_lut,
 					val,
 					-1, sizeof(struct drm_color_lut),
@@ -423,7 +390,7 @@ static int drm_atomic_crtc_set_property(struct drm_crtc *crtc,
 		state->color_mgmt_changed |= replaced;
 		return ret;
 	} else if (property == config->ctm_property) {
-		ret = drm_atomic_replace_property_blob_from_id(dev,
+		ret = drm_property_replace_blob_from_id(dev,
 					&state->ctm,
 					val,
 					sizeof(struct drm_color_ctm), -1,
@@ -431,7 +398,7 @@ static int drm_atomic_crtc_set_property(struct drm_crtc *crtc,
 		state->color_mgmt_changed |= replaced;
 		return ret;
 	} else if (property == config->gamma_lut_property) {
-		ret = drm_atomic_replace_property_blob_from_id(dev,
+		ret = drm_property_replace_blob_from_id(dev,
 					&state->gamma_lut,
 					val,
 					-1, sizeof(struct drm_color_lut),
@@ -563,7 +530,7 @@ static int drm_atomic_plane_set_property(struct drm_plane *plane,
 	} else if (property == plane->color_range_property) {
 		state->color_range = val;
 	} else if (property == config->prop_fb_damage_clips) {
-		ret = drm_atomic_replace_property_blob_from_id(dev,
+		ret = drm_property_replace_blob_from_id(dev,
 					&state->fb_damage_clips,
 					val,
 					-1,
@@ -729,7 +696,7 @@ static int drm_atomic_connector_set_property(struct drm_connector *connector,
 		if (state->link_status != DRM_LINK_STATUS_GOOD)
 			state->link_status = val;
 	} else if (property == config->hdr_output_metadata_property) {
-		ret = drm_atomic_replace_property_blob_from_id(dev,
+		ret = drm_property_replace_blob_from_id(dev,
 				&state->hdr_output_metadata,
 				val,
 				sizeof(struct hdr_output_metadata), -1,
diff --git a/drivers/gpu/drm/drm_property.c b/drivers/gpu/drm/drm_property.c
index dfec479830e4..f72ef6493340 100644
--- a/drivers/gpu/drm/drm_property.c
+++ b/drivers/gpu/drm/drm_property.c
@@ -751,6 +751,55 @@ bool drm_property_replace_blob(struct drm_property_blob **blob,
 }
 EXPORT_SYMBOL(drm_property_replace_blob);
 
+/**
+ * drm_property_replace_blob_from_id - replace a blob property taking a reference
+ * @dev: DRM device
+ * @blob: a pointer to the member blob to be replaced
+ * @blob_id: the id of the new blob to replace with
+ * @expected_size: expected size of the blob property
+ * @expected_elem_size: expected size of an element in the blob property
+ * @replaced: if the blob was in fact replaced
+ *
+ * Look up the new blob from id, take its reference, check expected sizes of
+ * the blob and its element and replace the old blob by the new one. Advertise
+ * if the replacement operation was successful.
+ *
+ * Return: true if the blob was in fact replaced. -EINVAL if the new blob was
+ * not found or sizes don't match.
+ */
+int drm_property_replace_blob_from_id(struct drm_device *dev,
+					 struct drm_property_blob **blob,
+					 uint64_t blob_id,
+					 ssize_t expected_size,
+					 ssize_t expected_elem_size,
+					 bool *replaced)
+{
+	struct drm_property_blob *new_blob = NULL;
+
+	if (blob_id != 0) {
+		new_blob = drm_property_lookup_blob(dev, blob_id);
+		if (new_blob == NULL)
+			return -EINVAL;
+
+		if (expected_size > 0 &&
+		    new_blob->length != expected_size) {
+			drm_property_blob_put(new_blob);
+			return -EINVAL;
+		}
+		if (expected_elem_size > 0 &&
+		    new_blob->length % expected_elem_size != 0) {
+			drm_property_blob_put(new_blob);
+			return -EINVAL;
+		}
+	}
+
+	*replaced |= drm_property_replace_blob(blob, new_blob);
+	drm_property_blob_put(new_blob);
+
+	return 0;
+}
+EXPORT_SYMBOL(drm_property_replace_blob_from_id);
+
 int drm_mode_getblob_ioctl(struct drm_device *dev,
 			   void *data, struct drm_file *file_priv)
 {
diff --git a/include/drm/drm_mode_object.h b/include/drm/drm_mode_object.h
index 912f1e415685..08d7a7f0188f 100644
--- a/include/drm/drm_mode_object.h
+++ b/include/drm/drm_mode_object.h
@@ -60,7 +60,7 @@ struct drm_mode_object {
 	void (*free_cb)(struct kref *kref);
 };
 
-#define DRM_OBJECT_MAX_PROPERTY 24
+#define DRM_OBJECT_MAX_PROPERTY 64
 /**
  * struct drm_object_properties - property tracking for &drm_mode_object
  */
diff --git a/include/drm/drm_plane.h b/include/drm/drm_plane.h
index 51291983ea44..52c3287da0da 100644
--- a/include/drm/drm_plane.h
+++ b/include/drm/drm_plane.h
@@ -237,6 +237,13 @@ struct drm_plane_state {
 
 	/** @state: backpointer to global drm_atomic_state */
 	struct drm_atomic_state *state;
+
+	/**
+	 * @color_mgmt_changed: Color management properties have changed. Used
+	 * by the atomic helpers and drivers to steer the atomic commit control
+	 * flow.
+	 */
+	bool color_mgmt_changed : 1;
 };
 
 static inline struct drm_rect
diff --git a/include/drm/drm_property.h b/include/drm/drm_property.h
index 65bc9710a470..082f29156b3e 100644
--- a/include/drm/drm_property.h
+++ b/include/drm/drm_property.h
@@ -279,6 +279,12 @@ struct drm_property_blob *drm_property_create_blob(struct drm_device *dev,
 						   const void *data);
 struct drm_property_blob *drm_property_lookup_blob(struct drm_device *dev,
 						   uint32_t id);
+int drm_property_replace_blob_from_id(struct drm_device *dev,
+				      struct drm_property_blob **blob,
+				      uint64_t blob_id,
+				      ssize_t expected_size,
+				      ssize_t expected_elem_size,
+				      bool *replaced);
 int drm_property_replace_global_blob(struct drm_device *dev,
 				     struct drm_property_blob **replace,
 				     size_t length,
diff --git a/include/uapi/drm/drm_mode.h b/include/uapi/drm/drm_mode.h
index 43691058d28f..23fc19400998 100644
--- a/include/uapi/drm/drm_mode.h
+++ b/include/uapi/drm/drm_mode.h
@@ -843,6 +843,14 @@ struct drm_color_ctm {
 	__u64 matrix[9];
 };
 
+struct drm_color_ctm2 {
+	/*
+	 * Conversion matrix in S31.32 sign-magnitude
+	 * (not two's complement!) format.
+	 */
+	__u64 matrix[12];
+};
+
 struct drm_color_lut {
 	/*
 	 * Values are mapped linearly to 0.0 - 1.0 range, with 0x0 == 0.0 and
-- 
2.42.0

From e91c7c0b6a3581ad777540fdfb5cd35c7f0390e6 Mon Sep 17 00:00:00 2001
From: Peter Jung <admin@ptr1337.dev>
Date: Sun, 22 Oct 2023 09:05:08 +0200
Subject: [PATCH 2/7] amd-pref-core

Signed-off-by: Peter Jung <admin@ptr1337.dev>
---
 .../admin-guide/kernel-parameters.txt         |   5 +
 Documentation/admin-guide/pm/amd-pstate.rst   |  59 ++++-
 arch/x86/Kconfig                              |   5 +-
 drivers/acpi/cppc_acpi.c                      |  13 ++
 drivers/acpi/processor_driver.c               |   6 +
 drivers/cpufreq/amd-pstate-ut.c               |   4 +-
 drivers/cpufreq/amd-pstate.c                  | 206 ++++++++++++++++--
 drivers/cpufreq/cpufreq.c                     |  13 ++
 include/acpi/cppc_acpi.h                      |   5 +
 include/linux/amd-pstate.h                    |  10 +
 include/linux/cpufreq.h                       |   5 +
 11 files changed, 308 insertions(+), 23 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 23ebe34ff901..f23ec4dc6c4b 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -363,6 +363,11 @@
 			  selects a performance level in this range and appropriate
 			  to the current workload.
 
+	amd_prefcore=
+			[X86]
+			disable
+			  Disable amd-pstate preferred core.
+
 	amijoy.map=	[HW,JOY] Amiga joystick support
 			Map of devices attached to JOY0DAT and JOY1DAT
 			Format: <a>,<b>
diff --git a/Documentation/admin-guide/pm/amd-pstate.rst b/Documentation/admin-guide/pm/amd-pstate.rst
index 1cf40f69278c..0b832ff529db 100644
--- a/Documentation/admin-guide/pm/amd-pstate.rst
+++ b/Documentation/admin-guide/pm/amd-pstate.rst
@@ -300,8 +300,8 @@ platforms. The AMD P-States mechanism is the more performance and energy
 efficiency frequency management method on AMD processors.
 
 
-AMD Pstate Driver Operation Modes
-=================================
+``amd-pstate`` Driver Operation Modes
+======================================
 
 ``amd_pstate`` CPPC has 3 operation modes: autonomous (active) mode,
 non-autonomous (passive) mode and guided autonomous (guided) mode.
@@ -353,6 +353,48 @@ is activated.  In this mode, driver requests minimum and maximum performance
 level and the platform autonomously selects a performance level in this range
 and appropriate to the current workload.
 
+``amd-pstate`` Preferred Core
+=================================
+
+The core frequency is subjected to the process variation in semiconductors.
+Not all cores are able to reach the maximum frequency respecting the
+infrastructure limits. Consequently, AMD has redefined the concept of
+maximum frequency of a part. This means that a fraction of cores can reach
+maximum frequency. To find the best process scheduling policy for a given
+scenario, OS needs to know the core ordering informed by the platform through
+highest performance capability register of the CPPC interface.
+
+``amd-pstate`` preferred core enables the scheduler to prefer scheduling on
+cores that can achieve a higher frequency with lower voltage. The preferred
+core rankings can dynamically change based on the workload, platform conditions,
+thermals and ageing.
+
+The priority metric will be initialized by the ``amd-pstate`` driver. The ``amd-pstate``
+driver will also determine whether or not ``amd-pstate`` preferred core is
+supported by the platform.
+
+``amd-pstate`` driver will provide an initial core ordering when the system boots.
+The platform uses the CPPC interfaces to communicate the core ranking to the
+operating system and scheduler to make sure that OS is choosing the cores
+with highest performance firstly for scheduling the process. When ``amd-pstate``
+driver receives a message with the highest performance change, it will
+update the core ranking and set the cpu's priority.
+
+``amd-pstate`` Preferred Core Switch
+=================================
+Kernel Parameters
+-----------------
+
+``amd-pstate`` peferred core`` has two states: enable and disable.
+Enable/disable states can be chosen by different kernel parameters.
+Default enable ``amd-pstate`` preferred core.
+
+``amd_prefcore=disable``
+
+For systems that support ``amd-pstate`` preferred core, the core rankings will
+always be advertised by the platform. But OS can choose to ignore that via the
+kernel parameter ``amd_prefcore=disable``.
+
 User Space Interface in ``sysfs`` - General
 ===========================================
 
@@ -385,6 +427,19 @@ control its functionality at the system level.  They are located in the
         to the operation mode represented by that string - or to be
         unregistered in the "disable" case.
 
+``prefcore``
+	Preferred core state of the driver: "enabled" or "disabled".
+
+	"enabled"
+		Enable the ``amd-pstate`` preferred core.
+
+	"disabled"
+		Disable the ``amd-pstate`` preferred core
+
+
+        This attribute is read-only to check the state of preferred core set
+        by the kernel parameter.
+
 ``cpupower`` tool support for ``amd-pstate``
 ===============================================
 
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 68ce4f786dcd..b6ac7d85487a 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1052,8 +1052,9 @@ config SCHED_MC
 
 config SCHED_MC_PRIO
 	bool "CPU core priorities scheduler support"
-	depends on SCHED_MC && CPU_SUP_INTEL
-	select X86_INTEL_PSTATE
+	depends on SCHED_MC
+	select X86_INTEL_PSTATE if CPU_SUP_INTEL
+	select X86_AMD_PSTATE if CPU_SUP_AMD && ACPI
 	select CPU_FREQ
 	default y
 	help
diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c
index 7ff269a78c20..ad388a0e8484 100644
--- a/drivers/acpi/cppc_acpi.c
+++ b/drivers/acpi/cppc_acpi.c
@@ -1154,6 +1154,19 @@ int cppc_get_nominal_perf(int cpunum, u64 *nominal_perf)
 	return cppc_get_perf(cpunum, NOMINAL_PERF, nominal_perf);
 }
 
+/**
+ * cppc_get_highest_perf - Get the highest performance register value.
+ * @cpunum: CPU from which to get highest performance.
+ * @highest_perf: Return address.
+ *
+ * Return: 0 for success, -EIO otherwise.
+ */
+int cppc_get_highest_perf(int cpunum, u64 *highest_perf)
+{
+	return cppc_get_perf(cpunum, HIGHEST_PERF, highest_perf);
+}
+EXPORT_SYMBOL_GPL(cppc_get_highest_perf);
+
 /**
  * cppc_get_epp_perf - Get the epp register value.
  * @cpunum: CPU from which to get epp preference value.
diff --git a/drivers/acpi/processor_driver.c b/drivers/acpi/processor_driver.c
index 4bd16b3f0781..29b2fb68a35d 100644
--- a/drivers/acpi/processor_driver.c
+++ b/drivers/acpi/processor_driver.c
@@ -27,6 +27,7 @@
 #define ACPI_PROCESSOR_NOTIFY_PERFORMANCE 0x80
 #define ACPI_PROCESSOR_NOTIFY_POWER	0x81
 #define ACPI_PROCESSOR_NOTIFY_THROTTLING	0x82
+#define ACPI_PROCESSOR_NOTIFY_HIGEST_PERF_CHANGED	0x85
 
 MODULE_AUTHOR("Paul Diefenbaugh");
 MODULE_DESCRIPTION("ACPI Processor Driver");
@@ -83,6 +84,11 @@ static void acpi_processor_notify(acpi_handle handle, u32 event, void *data)
 		acpi_bus_generate_netlink_event(device->pnp.device_class,
 						  dev_name(&device->dev), event, 0);
 		break;
+	case ACPI_PROCESSOR_NOTIFY_HIGEST_PERF_CHANGED:
+		cpufreq_update_highest_perf(pr->id);
+		acpi_bus_generate_netlink_event(device->pnp.device_class,
+						  dev_name(&device->dev), event, 0);
+		break;
 	default:
 		acpi_handle_debug(handle, "Unsupported event [0x%x]\n", event);
 		break;
diff --git a/drivers/cpufreq/amd-pstate-ut.c b/drivers/cpufreq/amd-pstate-ut.c
index 502d494499ae..f04ae67dda37 100644
--- a/drivers/cpufreq/amd-pstate-ut.c
+++ b/drivers/cpufreq/amd-pstate-ut.c
@@ -127,8 +127,6 @@ static void amd_pstate_ut_check_perf(u32 index)
 	struct cpufreq_policy *policy = NULL;
 	struct amd_cpudata *cpudata = NULL;
 
-	highest_perf = amd_get_highest_perf();
-
 	for_each_possible_cpu(cpu) {
 		policy = cpufreq_cpu_get(cpu);
 		if (!policy)
@@ -143,6 +141,7 @@ static void amd_pstate_ut_check_perf(u32 index)
 				goto skip_test;
 			}
 
+			highest_perf = cppc_perf.highest_perf;
 			nominal_perf = cppc_perf.nominal_perf;
 			lowest_nonlinear_perf = cppc_perf.lowest_nonlinear_perf;
 			lowest_perf = cppc_perf.lowest_perf;
@@ -154,6 +153,7 @@ static void amd_pstate_ut_check_perf(u32 index)
 				goto skip_test;
 			}
 
+			highest_perf = AMD_CPPC_HIGHEST_PERF(cap1);
 			nominal_perf = AMD_CPPC_NOMINAL_PERF(cap1);
 			lowest_nonlinear_perf = AMD_CPPC_LOWNONLIN_PERF(cap1);
 			lowest_perf = AMD_CPPC_LOWEST_PERF(cap1);
diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c
index 9a1e194d5cf8..1c1f04eab389 100644
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -37,6 +37,7 @@
 #include <linux/uaccess.h>
 #include <linux/static_call.h>
 #include <linux/amd-pstate.h>
+#include <linux/topology.h>
 
 #include <acpi/processor.h>
 #include <acpi/cppc_acpi.h>
@@ -49,6 +50,8 @@
 
 #define AMD_PSTATE_TRANSITION_LATENCY	20000
 #define AMD_PSTATE_TRANSITION_DELAY	1000
+#define AMD_PSTATE_PREFCORE_THRESHOLD	166
+#define AMD_PSTATE_MAX_CPPC_PERF	255
 
 /*
  * TODO: We need more time to fine tune processors with shared memory solution
@@ -64,6 +67,7 @@ static struct cpufreq_driver amd_pstate_driver;
 static struct cpufreq_driver amd_pstate_epp_driver;
 static int cppc_state = AMD_PSTATE_UNDEFINED;
 static bool cppc_enabled;
+static bool amd_pstate_prefcore = true;
 
 /*
  * AMD Energy Preference Performance (EPP)
@@ -290,27 +294,26 @@ static inline int amd_pstate_enable(bool enable)
 static int pstate_init_perf(struct amd_cpudata *cpudata)
 {
 	u64 cap1;
-	u32 highest_perf;
 
 	int ret = rdmsrl_safe_on_cpu(cpudata->cpu, MSR_AMD_CPPC_CAP1,
 				     &cap1);
 	if (ret)
 		return ret;
 
-	/*
-	 * TODO: Introduce AMD specific power feature.
-	 *
-	 * CPPC entry doesn't indicate the highest performance in some ASICs.
+	/* For platforms that do not support the preferred core feature, the
+	 * highest_pef may be configured with 166 or 255, to avoid max frequency
+	 * calculated wrongly. we take the AMD_CPPC_HIGHEST_PERF(cap1) value as
+	 * the default max perf.
 	 */
-	highest_perf = amd_get_highest_perf();
-	if (highest_perf > AMD_CPPC_HIGHEST_PERF(cap1))
-		highest_perf = AMD_CPPC_HIGHEST_PERF(cap1);
-
-	WRITE_ONCE(cpudata->highest_perf, highest_perf);
+	if (cpudata->hw_prefcore)
+		WRITE_ONCE(cpudata->highest_perf, AMD_PSTATE_PREFCORE_THRESHOLD);
+	else
+		WRITE_ONCE(cpudata->highest_perf, AMD_CPPC_HIGHEST_PERF(cap1));
 
 	WRITE_ONCE(cpudata->nominal_perf, AMD_CPPC_NOMINAL_PERF(cap1));
 	WRITE_ONCE(cpudata->lowest_nonlinear_perf, AMD_CPPC_LOWNONLIN_PERF(cap1));
 	WRITE_ONCE(cpudata->lowest_perf, AMD_CPPC_LOWEST_PERF(cap1));
+	WRITE_ONCE(cpudata->prefcore_ranking, AMD_CPPC_HIGHEST_PERF(cap1));
 
 	return 0;
 }
@@ -318,22 +321,21 @@ static int pstate_init_perf(struct amd_cpudata *cpudata)
 static int cppc_init_perf(struct amd_cpudata *cpudata)
 {
 	struct cppc_perf_caps cppc_perf;
-	u32 highest_perf;
 
 	int ret = cppc_get_perf_caps(cpudata->cpu, &cppc_perf);
 	if (ret)
 		return ret;
 
-	highest_perf = amd_get_highest_perf();
-	if (highest_perf > cppc_perf.highest_perf)
-		highest_perf = cppc_perf.highest_perf;
-
-	WRITE_ONCE(cpudata->highest_perf, highest_perf);
+	if (cpudata->hw_prefcore)
+		WRITE_ONCE(cpudata->highest_perf, AMD_PSTATE_PREFCORE_THRESHOLD);
+	else
+		WRITE_ONCE(cpudata->highest_perf, cppc_perf.highest_perf);
 
 	WRITE_ONCE(cpudata->nominal_perf, cppc_perf.nominal_perf);
 	WRITE_ONCE(cpudata->lowest_nonlinear_perf,
 		   cppc_perf.lowest_nonlinear_perf);
 	WRITE_ONCE(cpudata->lowest_perf, cppc_perf.lowest_perf);
+	WRITE_ONCE(cpudata->prefcore_ranking, cppc_perf.highest_perf);
 
 	if (cppc_state == AMD_PSTATE_ACTIVE)
 		return 0;
@@ -540,7 +542,7 @@ static void amd_pstate_adjust_perf(unsigned int cpu,
 	if (target_perf < capacity)
 		des_perf = DIV_ROUND_UP(cap_perf * target_perf, capacity);
 
-	min_perf = READ_ONCE(cpudata->highest_perf);
+	min_perf = READ_ONCE(cpudata->lowest_perf);
 	if (_min_perf < capacity)
 		min_perf = DIV_ROUND_UP(cap_perf * _min_perf, capacity);
 
@@ -676,6 +678,124 @@ static void amd_perf_ctl_reset(unsigned int cpu)
 	wrmsrl_on_cpu(cpu, MSR_AMD_PERF_CTL, 0);
 }
 
+/*
+ * Set amd-pstate preferred core enable can't be done directly from cpufreq callbacks
+ * due to locking, so queue the work for later.
+ */
+static void amd_pstste_sched_prefcore_workfn(struct work_struct *work)
+{
+	sched_set_itmt_support();
+}
+static DECLARE_WORK(sched_prefcore_work, amd_pstste_sched_prefcore_workfn);
+
+/*
+ * Get the highest performance register value.
+ * @cpu: CPU from which to get highest performance.
+ * @highest_perf: Return address.
+ *
+ * Return: 0 for success, -EIO otherwise.
+ */
+static int amd_pstate_get_highest_perf(int cpu, u32 *highest_perf)
+{
+	int ret;
+
+	if (boot_cpu_has(X86_FEATURE_CPPC)) {
+		u64 cap1;
+
+		ret = rdmsrl_safe_on_cpu(cpu, MSR_AMD_CPPC_CAP1, &cap1);
+		if (ret)
+			return ret;
+		WRITE_ONCE(*highest_perf, AMD_CPPC_HIGHEST_PERF(cap1));
+	} else {
+		u64 cppc_highest_perf;
+
+		ret = cppc_get_highest_perf(cpu, &cppc_highest_perf);
+		WRITE_ONCE(*highest_perf, cppc_highest_perf);
+	}
+
+	return (ret);
+}
+
+static void amd_pstate_init_prefcore(struct amd_cpudata *cpudata)
+{
+	int ret, prio;
+	u32 highest_perf;
+	static u32 max_highest_perf = 0, min_highest_perf = U32_MAX;
+
+	ret = amd_pstate_get_highest_perf(cpudata->cpu, &highest_perf);
+	if (ret)
+		return;
+
+	cpudata->hw_prefcore = true;
+	/* check if CPPC preferred core feature is enabled*/
+	if (highest_perf == AMD_PSTATE_MAX_CPPC_PERF) {
+		pr_debug("AMD CPPC preferred core is unsupported!\n");
+		cpudata->hw_prefcore = false;
+		return;
+	}
+
+	if (!amd_pstate_prefcore)
+		return;
+
+	/* The maximum value of highest perf is 255 */
+	prio = (int)(highest_perf & 0xff);
+	/*
+	 * The priorities can be set regardless of whether or not
+	 * sched_set_itmt_support(true) has been called and it is valid to
+	 * update them at any time after it has been called.
+	 */
+	sched_set_itmt_core_prio(prio, cpudata->cpu);
+
+	if (max_highest_perf <= min_highest_perf) {
+		if (highest_perf > max_highest_perf)
+			max_highest_perf = highest_perf;
+
+		if (highest_perf < min_highest_perf)
+			min_highest_perf = highest_perf;
+
+		if (max_highest_perf > min_highest_perf) {
+			/*
+			 * This code can be run during CPU online under the
+			 * CPU hotplug locks, so sched_set_itmt_support()
+			 * cannot be called from here.  Queue up a work item
+			 * to invoke it.
+			 */
+			schedule_work(&sched_prefcore_work);
+		}
+	}
+}
+
+static void amd_pstate_update_highest_perf(unsigned int cpu)
+{
+	struct cpufreq_policy *policy;
+	struct amd_cpudata *cpudata;
+	u32 prev_high = 0, cur_high = 0;
+	int ret;
+
+	if ((!amd_pstate_prefcore) || (!cpudata->hw_prefcore))
+		return;
+
+	ret = amd_pstate_get_highest_perf(cpu, &cur_high);
+	if (ret)
+		return;
+
+	policy = cpufreq_cpu_get(cpu);
+	cpudata = policy->driver_data;
+	prev_high = READ_ONCE(cpudata->prefcore_ranking);
+
+	if (prev_high != cur_high) {
+		int prio;
+
+		WRITE_ONCE(cpudata->prefcore_ranking, cur_high);
+
+		/* The maximum value of highest perf is 255 */
+		prio = (int)(cur_high & 0xff);
+		sched_set_itmt_core_prio(prio, cpu);
+	}
+
+	cpufreq_cpu_put(policy);
+}
+
 static int amd_pstate_cpu_init(struct cpufreq_policy *policy)
 {
 	int min_freq, max_freq, nominal_freq, lowest_nonlinear_freq, ret;
@@ -697,6 +817,8 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy)
 
 	cpudata->cpu = policy->cpu;
 
+	amd_pstate_init_prefcore(cpudata);
+
 	ret = amd_pstate_init_perf(cpudata);
 	if (ret)
 		goto free_cpudata1;
@@ -845,6 +967,28 @@ static ssize_t show_amd_pstate_highest_perf(struct cpufreq_policy *policy,
 	return sysfs_emit(buf, "%u\n", perf);
 }
 
+static ssize_t show_amd_pstate_prefcore_ranking(struct cpufreq_policy *policy,
+						char *buf)
+{
+	u32 perf;
+	struct amd_cpudata *cpudata = policy->driver_data;
+
+	perf = READ_ONCE(cpudata->prefcore_ranking);
+
+	return sysfs_emit(buf, "%u\n", perf);
+}
+
+static ssize_t show_amd_pstate_hw_prefcore(struct cpufreq_policy *policy,
+					   char *buf)
+{
+	bool hw_prefcore;
+	struct amd_cpudata *cpudata = policy->driver_data;
+
+	hw_prefcore = READ_ONCE(cpudata->hw_prefcore);
+
+	return sysfs_emit(buf, "%s\n", hw_prefcore ? "supported" : "unsupported");
+}
+
 static ssize_t show_energy_performance_available_preferences(
 				struct cpufreq_policy *policy, char *buf)
 {
@@ -1037,18 +1181,29 @@ static ssize_t status_store(struct device *a, struct device_attribute *b,
 	return ret < 0 ? ret : count;
 }
 
+static ssize_t prefcore_show(struct device *dev,
+			     struct device_attribute *attr, char *buf)
+{
+	return sysfs_emit(buf, "%s\n", amd_pstate_prefcore ? "enabled" : "disabled");
+}
+
 cpufreq_freq_attr_ro(amd_pstate_max_freq);
 cpufreq_freq_attr_ro(amd_pstate_lowest_nonlinear_freq);
 
 cpufreq_freq_attr_ro(amd_pstate_highest_perf);
+cpufreq_freq_attr_ro(amd_pstate_prefcore_ranking);
+cpufreq_freq_attr_ro(amd_pstate_hw_prefcore);
 cpufreq_freq_attr_rw(energy_performance_preference);
 cpufreq_freq_attr_ro(energy_performance_available_preferences);
 static DEVICE_ATTR_RW(status);
+static DEVICE_ATTR_RO(prefcore);
 
 static struct freq_attr *amd_pstate_attr[] = {
 	&amd_pstate_max_freq,
 	&amd_pstate_lowest_nonlinear_freq,
 	&amd_pstate_highest_perf,
+	&amd_pstate_prefcore_ranking,
+	&amd_pstate_hw_prefcore,
 	NULL,
 };
 
@@ -1056,6 +1211,8 @@ static struct freq_attr *amd_pstate_epp_attr[] = {
 	&amd_pstate_max_freq,
 	&amd_pstate_lowest_nonlinear_freq,
 	&amd_pstate_highest_perf,
+	&amd_pstate_prefcore_ranking,
+	&amd_pstate_hw_prefcore,
 	&energy_performance_preference,
 	&energy_performance_available_preferences,
 	NULL,
@@ -1063,6 +1220,7 @@ static struct freq_attr *amd_pstate_epp_attr[] = {
 
 static struct attribute *pstate_global_attributes[] = {
 	&dev_attr_status.attr,
+	&dev_attr_prefcore.attr,
 	NULL
 };
 
@@ -1114,6 +1272,8 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy)
 	cpudata->cpu = policy->cpu;
 	cpudata->epp_policy = 0;
 
+	amd_pstate_init_prefcore(cpudata);
+
 	ret = amd_pstate_init_perf(cpudata);
 	if (ret)
 		goto free_cpudata1;
@@ -1392,6 +1552,7 @@ static struct cpufreq_driver amd_pstate_driver = {
 	.suspend	= amd_pstate_cpu_suspend,
 	.resume		= amd_pstate_cpu_resume,
 	.set_boost	= amd_pstate_set_boost,
+	.update_highest_perf	= amd_pstate_update_highest_perf,
 	.name		= "amd-pstate",
 	.attr		= amd_pstate_attr,
 };
@@ -1406,6 +1567,7 @@ static struct cpufreq_driver amd_pstate_epp_driver = {
 	.online		= amd_pstate_epp_cpu_online,
 	.suspend	= amd_pstate_epp_suspend,
 	.resume		= amd_pstate_epp_resume,
+	.update_highest_perf	= amd_pstate_update_highest_perf,
 	.name		= "amd-pstate-epp",
 	.attr		= amd_pstate_epp_attr,
 };
@@ -1527,7 +1689,17 @@ static int __init amd_pstate_param(char *str)
 
 	return amd_pstate_set_driver(mode_idx);
 }
+
+static int __init amd_prefcore_param(char *str)
+{
+	if (!strcmp(str, "disable"))
+		amd_pstate_prefcore = false;
+
+	return 0;
+}
+
 early_param("amd_pstate", amd_pstate_param);
+early_param("amd_prefcore", amd_prefcore_param);
 
 MODULE_AUTHOR("Huang Rui <ray.huang@amd.com>");
 MODULE_DESCRIPTION("AMD Processor P-state Frequency Driver");
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 5c655d7b96d4..abefdb30fcaa 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -2677,6 +2677,19 @@ void cpufreq_update_limits(unsigned int cpu)
 }
 EXPORT_SYMBOL_GPL(cpufreq_update_limits);
 
+/**
+ * cpufreq_update_highest_perf - Update highest performance for a given CPU.
+ * @cpu: CPU to update the highest performance for.
+ *
+ * Invoke the driver's ->update_highest_perf callback if present
+ */
+void cpufreq_update_highest_perf(unsigned int cpu)
+{
+	if (cpufreq_driver->update_highest_perf)
+		cpufreq_driver->update_highest_perf(cpu);
+}
+EXPORT_SYMBOL_GPL(cpufreq_update_highest_perf);
+
 /*********************************************************************
  *               BOOST						     *
  *********************************************************************/
diff --git a/include/acpi/cppc_acpi.h b/include/acpi/cppc_acpi.h
index 6126c977ece0..c0b69ffe7bdb 100644
--- a/include/acpi/cppc_acpi.h
+++ b/include/acpi/cppc_acpi.h
@@ -139,6 +139,7 @@ struct cppc_cpudata {
 #ifdef CONFIG_ACPI_CPPC_LIB
 extern int cppc_get_desired_perf(int cpunum, u64 *desired_perf);
 extern int cppc_get_nominal_perf(int cpunum, u64 *nominal_perf);
+extern int cppc_get_highest_perf(int cpunum, u64 *highest_perf);
 extern int cppc_get_perf_ctrs(int cpu, struct cppc_perf_fb_ctrs *perf_fb_ctrs);
 extern int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls);
 extern int cppc_set_enable(int cpu, bool enable);
@@ -165,6 +166,10 @@ static inline int cppc_get_nominal_perf(int cpunum, u64 *nominal_perf)
 {
 	return -ENOTSUPP;
 }
+static inline int cppc_get_highest_perf(int cpunum, u64 *highest_perf)
+{
+	return -ENOTSUPP;
+}
 static inline int cppc_get_perf_ctrs(int cpu, struct cppc_perf_fb_ctrs *perf_fb_ctrs)
 {
 	return -ENOTSUPP;
diff --git a/include/linux/amd-pstate.h b/include/linux/amd-pstate.h
index 446394f84606..426822612373 100644
--- a/include/linux/amd-pstate.h
+++ b/include/linux/amd-pstate.h
@@ -39,11 +39,16 @@ struct amd_aperf_mperf {
  * @cppc_req_cached: cached performance request hints
  * @highest_perf: the maximum performance an individual processor may reach,
  *		  assuming ideal conditions
+ *		  For platforms that do not support the preferred core feature, the
+ *		  highest_pef may be configured with 166 or 255, to avoid max frequency
+ *		  calculated wrongly. we take the fixed value as the highest_perf.
  * @nominal_perf: the maximum sustained performance level of the processor,
  *		  assuming ideal operating conditions
  * @lowest_nonlinear_perf: the lowest performance level at which nonlinear power
  *			   savings are achieved
  * @lowest_perf: the absolute lowest performance level of the processor
+ * @prefcore_ranking: the preferred core ranking, the higher value indicates a higher
+ * 		  priority.
  * @max_freq: the frequency that mapped to highest_perf
  * @min_freq: the frequency that mapped to lowest_perf
  * @nominal_freq: the frequency that mapped to nominal_perf
@@ -52,6 +57,9 @@ struct amd_aperf_mperf {
  * @prev: Last Aperf/Mperf/tsc count value read from register
  * @freq: current cpu frequency value
  * @boost_supported: check whether the Processor or SBIOS supports boost mode
+ * @hw_prefcore: check whether HW supports preferred core featue.
+ * 		  Only when hw_prefcore and early prefcore param are true,
+ * 		  AMD P-State driver supports preferred core featue.
  * @epp_policy: Last saved policy used to set energy-performance preference
  * @epp_cached: Cached CPPC energy-performance preference value
  * @policy: Cpufreq policy value
@@ -70,6 +78,7 @@ struct amd_cpudata {
 	u32	nominal_perf;
 	u32	lowest_nonlinear_perf;
 	u32	lowest_perf;
+	u32     prefcore_ranking;
 
 	u32	max_freq;
 	u32	min_freq;
@@ -81,6 +90,7 @@ struct amd_cpudata {
 
 	u64	freq;
 	bool	boost_supported;
+	bool	hw_prefcore;
 
 	/* EPP feature related attributes*/
 	s16	epp_policy;
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 172ff51c1b2a..9ca50c4e19d3 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -231,6 +231,7 @@ int cpufreq_get_policy(struct cpufreq_policy *policy, unsigned int cpu);
 void refresh_frequency_limits(struct cpufreq_policy *policy);
 void cpufreq_update_policy(unsigned int cpu);
 void cpufreq_update_limits(unsigned int cpu);
+void cpufreq_update_highest_perf(unsigned int cpu);
 bool have_governor_per_policy(void);
 bool cpufreq_supports_freq_invariance(void);
 struct kobject *get_governor_parent_kobj(struct cpufreq_policy *policy);
@@ -259,6 +260,7 @@ static inline bool cpufreq_supports_freq_invariance(void)
 	return false;
 }
 static inline void disable_cpufreq(void) { }
+static inline void cpufreq_update_highest_perf(unsigned int cpu) { }
 #endif
 
 #ifdef CONFIG_CPU_FREQ_STAT
@@ -376,6 +378,9 @@ struct cpufreq_driver {
 	/* Called to update policy limits on firmware notifications. */
 	void		(*update_limits)(unsigned int cpu);
 
+	/* Called to update highest performance on firmware notifications. */
+	void		(*update_highest_perf)(unsigned int cpu);
+
 	/* optional */
 	int		(*bios_limit)(int cpu, unsigned int *limit);
 
-- 
2.42.0

From adf8583387461676e6b72ecfedcd2112f48e9c44 Mon Sep 17 00:00:00 2001
From: Peter Jung <admin@ptr1337.dev>
Date: Wed, 13 Sep 2023 14:31:43 +0200
Subject: [PATCH 3/7] bbr3

Signed-off-by: Peter Jung <admin@ptr1337.dev>
---
 include/linux/tcp.h                |    4 +-
 include/net/inet_connection_sock.h |    4 +-
 include/net/tcp.h                  |   72 +-
 include/uapi/linux/inet_diag.h     |   23 +
 include/uapi/linux/rtnetlink.h     |    4 +-
 include/uapi/linux/tcp.h           |    1 +
 net/ipv4/Kconfig                   |   21 +-
 net/ipv4/tcp.c                     |    3 +
 net/ipv4/tcp_bbr.c                 | 2231 +++++++++++++++++++++-------
 net/ipv4/tcp_cong.c                |    1 +
 net/ipv4/tcp_input.c               |   40 +-
 net/ipv4/tcp_minisocks.c           |    2 +
 net/ipv4/tcp_output.c              |   48 +-
 net/ipv4/tcp_rate.c                |   30 +-
 net/ipv4/tcp_timer.c               |    1 +
 15 files changed, 1934 insertions(+), 551 deletions(-)

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 91a37c99ba66..ae0ee688c3f7 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -255,7 +255,9 @@ struct tcp_sock {
 	u8	compressed_ack;
 	u8	dup_ack_counter:2,
 		tlp_retrans:1,	/* TLP is a retransmission */
-		unused:5;
+		fast_ack_mode:2, /* which fast ack mode ? */
+		tlp_orig_data_app_limited:1, /* app-limited before TLP rtx? */
+		unused:2;
 	u32	chrono_start;	/* Start time in jiffies of a TCP chrono */
 	u32	chrono_stat[3];	/* Time in jiffies for chrono_stat stats */
 	u8	chrono_type:2,	/* current chronograph type */
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index c2b15f7e5516..a400a84088d3 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -135,8 +135,8 @@ struct inet_connection_sock {
 	u32			  icsk_probes_tstamp;
 	u32			  icsk_user_timeout;
 
-	u64			  icsk_ca_priv[104 / sizeof(u64)];
-#define ICSK_CA_PRIV_SIZE	  sizeof_field(struct inet_connection_sock, icsk_ca_priv)
+#define ICSK_CA_PRIV_SIZE      (144)
+	u64			  icsk_ca_priv[ICSK_CA_PRIV_SIZE / sizeof(u64)];
 };
 
 #define ICSK_TIME_RETRANS	1	/* Retransmit timer */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 7f684806c291..5238ef036e55 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -374,6 +374,8 @@ static inline void tcp_dec_quickack_mode(struct sock *sk)
 #define	TCP_ECN_QUEUE_CWR	2
 #define	TCP_ECN_DEMAND_CWR	4
 #define	TCP_ECN_SEEN		8
+#define	TCP_ECN_LOW		16
+#define	TCP_ECN_ECT_PERMANENT	32
 
 enum tcp_tw_status {
 	TCP_TW_SUCCESS = 0,
@@ -727,6 +729,15 @@ static inline void tcp_fast_path_check(struct sock *sk)
 		tcp_fast_path_on(tp);
 }
 
+static inline void tcp_set_ecn_low_from_dst(struct sock *sk,
+					    const struct dst_entry *dst)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (dst_feature(dst, RTAX_FEATURE_ECN_LOW))
+		tp->ecn_flags |= TCP_ECN_LOW;
+}
+
 /* Compute the actual rto_min value */
 static inline u32 tcp_rto_min(struct sock *sk)
 {
@@ -823,6 +834,11 @@ static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0)
 	return max_t(s64, t1 - t0, 0);
 }
 
+static inline u32 tcp_stamp32_us_delta(u32 t1, u32 t0)
+{
+	return max_t(s32, t1 - t0, 0);
+}
+
 static inline u32 tcp_skb_timestamp(const struct sk_buff *skb)
 {
 	return tcp_ns_to_ts(skb->skb_mstamp_ns);
@@ -898,9 +914,14 @@ struct tcp_skb_cb {
 			/* pkts S/ACKed so far upon tx of skb, incl retrans: */
 			__u32 delivered;
 			/* start of send pipeline phase */
-			u64 first_tx_mstamp;
+			u32 first_tx_mstamp;
 			/* when we reached the "delivered" count */
-			u64 delivered_mstamp;
+			u32 delivered_mstamp;
+#define TCPCB_IN_FLIGHT_BITS 20
+#define TCPCB_IN_FLIGHT_MAX ((1U << TCPCB_IN_FLIGHT_BITS) - 1)
+			u32 in_flight:20,   /* packets in flight at transmit */
+			    unused2:12;
+			u32 lost;	/* packets lost so far upon tx of skb */
 		} tx;   /* only used for outgoing skbs */
 		union {
 			struct inet_skb_parm	h4;
@@ -1004,6 +1025,7 @@ enum tcp_ca_event {
 	CA_EVENT_LOSS,		/* loss timeout */
 	CA_EVENT_ECN_NO_CE,	/* ECT set, but not CE marked */
 	CA_EVENT_ECN_IS_CE,	/* received CE marked IP packet */
+	CA_EVENT_TLP_RECOVERY,	/* a lost segment was repaired by TLP probe */
 };
 
 /* Information about inbound ACK, passed to cong_ops->in_ack_event() */
@@ -1026,7 +1048,11 @@ enum tcp_ca_ack_event_flags {
 #define TCP_CONG_NON_RESTRICTED 0x1
 /* Requires ECN/ECT set on all packets */
 #define TCP_CONG_NEEDS_ECN	0x2
-#define TCP_CONG_MASK	(TCP_CONG_NON_RESTRICTED | TCP_CONG_NEEDS_ECN)
+/* Wants notification of CE events (CA_EVENT_ECN_IS_CE, CA_EVENT_ECN_NO_CE). */
+#define TCP_CONG_WANTS_CE_EVENTS	0x4
+#define TCP_CONG_MASK	(TCP_CONG_NON_RESTRICTED | \
+			 TCP_CONG_NEEDS_ECN | \
+			 TCP_CONG_WANTS_CE_EVENTS)
 
 union tcp_cc_info;
 
@@ -1046,10 +1072,13 @@ struct ack_sample {
  */
 struct rate_sample {
 	u64  prior_mstamp; /* starting timestamp for interval */
+	u32  prior_lost;	/* tp->lost at "prior_mstamp" */
 	u32  prior_delivered;	/* tp->delivered at "prior_mstamp" */
 	u32  prior_delivered_ce;/* tp->delivered_ce at "prior_mstamp" */
+	u32 tx_in_flight;	/* packets in flight at starting timestamp */
+	s32  lost;		/* number of packets lost over interval */
 	s32  delivered;		/* number of packets delivered over interval */
-	s32  delivered_ce;	/* number of packets delivered w/ CE marks*/
+	s32  delivered_ce;	/* packets delivered w/ CE mark over interval */
 	long interval_us;	/* time for tp->delivered to incr "delivered" */
 	u32 snd_interval_us;	/* snd interval for delivered packets */
 	u32 rcv_interval_us;	/* rcv interval for delivered packets */
@@ -1060,7 +1089,9 @@ struct rate_sample {
 	u32  last_end_seq;	/* end_seq of most recently ACKed packet */
 	bool is_app_limited;	/* is sample from packet with bubble in pipe? */
 	bool is_retrans;	/* is sample from retransmission? */
+	bool is_acking_tlp_retrans_seq;  /* ACKed a TLP retransmit sequence? */
 	bool is_ack_delayed;	/* is this (likely) a delayed ACK? */
+	bool is_ece;		/* did this ACK have ECN marked? */
 };
 
 struct tcp_congestion_ops {
@@ -1084,8 +1115,11 @@ struct tcp_congestion_ops {
 	/* hook for packet ack accounting (optional) */
 	void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample);
 
-	/* override sysctl_tcp_min_tso_segs */
-	u32 (*min_tso_segs)(struct sock *sk);
+	/* pick target number of segments per TSO/GSO skb (optional): */
+	u32 (*tso_segs)(struct sock *sk, unsigned int mss_now);
+
+	/* react to a specific lost skb (optional) */
+	void (*skb_marked_lost)(struct sock *sk, const struct sk_buff *skb);
 
 	/* call when packets are delivered to update cwnd and pacing rate,
 	 * after all the ca_state processing. (optional)
@@ -1151,6 +1185,14 @@ static inline char *tcp_ca_get_name_by_key(u32 key, char *buffer)
 }
 #endif
 
+static inline bool tcp_ca_wants_ce_events(const struct sock *sk)
+{
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+
+	return icsk->icsk_ca_ops->flags & (TCP_CONG_NEEDS_ECN |
+					   TCP_CONG_WANTS_CE_EVENTS);
+}
+
 static inline bool tcp_ca_needs_ecn(const struct sock *sk)
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -1170,6 +1212,7 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event)
 void tcp_set_ca_state(struct sock *sk, const u8 ca_state);
 
 /* From tcp_rate.c */
+void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb);
 void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb);
 void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb,
 			    struct rate_sample *rs);
@@ -1182,6 +1225,21 @@ static inline bool tcp_skb_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2)
 	return t1 > t2 || (t1 == t2 && after(seq1, seq2));
 }
 
+/* If a retransmit failed due to local qdisc congestion or other local issues,
+ * then we may have called tcp_set_skb_tso_segs() to increase the number of
+ * segments in the skb without increasing the tx.in_flight. In all other cases,
+ * the tx.in_flight should be at least as big as the pcount of the sk_buff.  We
+ * do not have the state to know whether a retransmit failed due to local qdisc
+ * congestion or other local issues, so to avoid spurious warnings we consider
+ * that any skb marked lost may have suffered that fate.
+ */
+static inline bool tcp_skb_tx_in_flight_is_suspicious(u32 skb_pcount,
+						      u32 skb_sacked_flags,
+						      u32 tx_in_flight)
+{
+	return (skb_pcount > tx_in_flight) && !(skb_sacked_flags & TCPCB_LOST);
+}
+
 /* These functions determine how the current flow behaves in respect of SACK
  * handling. SACK is negotiated with the peer, and therefore it can vary
  * between different flows.
@@ -2181,7 +2239,7 @@ struct tcp_plb_state {
 	u8	consec_cong_rounds:5, /* consecutive congested rounds */
 		unused:3;
 	u32	pause_until; /* jiffies32 when PLB can resume rerouting */
-};
+} __attribute__ ((__packed__));
 
 static inline void tcp_plb_init(const struct sock *sk,
 				struct tcp_plb_state *plb)
diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h
index 50655de04c9b..82f8bd8f0d16 100644
--- a/include/uapi/linux/inet_diag.h
+++ b/include/uapi/linux/inet_diag.h
@@ -229,6 +229,29 @@ struct tcp_bbr_info {
 	__u32	bbr_min_rtt;		/* min-filtered RTT in uSec */
 	__u32	bbr_pacing_gain;	/* pacing gain shifted left 8 bits */
 	__u32	bbr_cwnd_gain;		/* cwnd gain shifted left 8 bits */
+	__u32	bbr_bw_hi_lsb;		/* lower 32 bits of bw_hi */
+	__u32	bbr_bw_hi_msb;		/* upper 32 bits of bw_hi */
+	__u32	bbr_bw_lo_lsb;		/* lower 32 bits of bw_lo */
+	__u32	bbr_bw_lo_msb;		/* upper 32 bits of bw_lo */
+	__u8	bbr_mode;		/* current bbr_mode in state machine */
+	__u8	bbr_phase;		/* current state machine phase */
+	__u8	unused1;		/* alignment padding; not used yet */
+	__u8	bbr_version;		/* BBR algorithm version */
+	__u32	bbr_inflight_lo;	/* lower short-term data volume bound */
+	__u32	bbr_inflight_hi;	/* higher long-term data volume bound */
+	__u32	bbr_extra_acked;	/* max excess packets ACKed in epoch */
+};
+
+/* TCP BBR congestion control bbr_phase as reported in netlink/ss stats. */
+enum tcp_bbr_phase {
+	BBR_PHASE_INVALID		= 0,
+	BBR_PHASE_STARTUP		= 1,
+	BBR_PHASE_DRAIN			= 2,
+	BBR_PHASE_PROBE_RTT		= 3,
+	BBR_PHASE_PROBE_BW_UP		= 4,
+	BBR_PHASE_PROBE_BW_DOWN		= 5,
+	BBR_PHASE_PROBE_BW_CRUISE	= 6,
+	BBR_PHASE_PROBE_BW_REFILL	= 7,
 };
 
 union tcp_cc_info {
diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index 51c13cf9c5ae..de8dcba26bec 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -506,9 +506,11 @@ enum {
 #define RTAX_FEATURE_SACK	(1 << 1)
 #define RTAX_FEATURE_TIMESTAMP	(1 << 2)
 #define RTAX_FEATURE_ALLFRAG	(1 << 3)
+#define RTAX_FEATURE_ECN_LOW	(1 << 4)
 
 #define RTAX_FEATURE_MASK	(RTAX_FEATURE_ECN | RTAX_FEATURE_SACK | \
-				 RTAX_FEATURE_TIMESTAMP | RTAX_FEATURE_ALLFRAG)
+				 RTAX_FEATURE_TIMESTAMP | RTAX_FEATURE_ALLFRAG \
+				 | RTAX_FEATURE_ECN_LOW)
 
 struct rta_session {
 	__u8	proto;
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 879eeb0a084b..77270053a5e3 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -170,6 +170,7 @@ enum tcp_fastopen_client_fail {
 #define TCPI_OPT_ECN		8 /* ECN was negociated at TCP session init */
 #define TCPI_OPT_ECN_SEEN	16 /* we received at least one packet with ECT */
 #define TCPI_OPT_SYN_DATA	32 /* SYN-ACK acked data in SYN sent or rcvd */
+#define TCPI_OPT_ECN_LOW	64 /* Low-latency ECN configured at init */
 
 /*
  * Sender's congestion state indicating normal or abnormal situations
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 2dfb12230f08..2e14db3bee70 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -668,15 +668,18 @@ config TCP_CONG_BBR
 	default n
 	help
 
-	  BBR (Bottleneck Bandwidth and RTT) TCP congestion control aims to
-	  maximize network utilization and minimize queues. It builds an explicit
-	  model of the bottleneck delivery rate and path round-trip propagation
-	  delay. It tolerates packet loss and delay unrelated to congestion. It
-	  can operate over LAN, WAN, cellular, wifi, or cable modem links. It can
-	  coexist with flows that use loss-based congestion control, and can
-	  operate with shallow buffers, deep buffers, bufferbloat, policers, or
-	  AQM schemes that do not provide a delay signal. It requires the fq
-	  ("Fair Queue") pacing packet scheduler.
+	  BBR (Bottleneck Bandwidth and RTT) TCP congestion control is a
+	  model-based congestion control algorithm that aims to maximize
+	  network utilization, keep queues and retransmit rates low, and to be
+	  able to coexist with Reno/CUBIC in common scenarios. It builds an
+	  explicit model of the network path.  It tolerates a targeted degree
+	  of random packet loss and delay. It can operate over LAN, WAN,
+	  cellular, wifi, or cable modem links, and can use shallow-threshold
+	  ECN signals. It can coexist to some degree with flows that use
+	  loss-based congestion control, and can operate with shallow buffers,
+	  deep buffers, bufferbloat, policers, or AQM schemes that do not
+	  provide a delay signal. It requires pacing, using either TCP internal
+	  pacing or the fq ("Fair Queue") pacing packet scheduler.
 
 choice
 	prompt "Default TCP congestion control"
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 9bdc1b2eaf73..fdc880d6fe50 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3077,6 +3077,7 @@ int tcp_disconnect(struct sock *sk, int flags)
 	tp->rx_opt.dsack = 0;
 	tp->rx_opt.num_sacks = 0;
 	tp->rcv_ooopack = 0;
+	tp->fast_ack_mode = 0;
 
 
 	/* Clean up fastopen related fields */
@@ -3772,6 +3773,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 		info->tcpi_options |= TCPI_OPT_ECN;
 	if (tp->ecn_flags & TCP_ECN_SEEN)
 		info->tcpi_options |= TCPI_OPT_ECN_SEEN;
+	if (tp->ecn_flags & TCP_ECN_LOW)
+		info->tcpi_options |= TCPI_OPT_ECN_LOW;
 	if (tp->syn_data_acked)
 		info->tcpi_options |= TCPI_OPT_SYN_DATA;
 
diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
index 146792cd26fe..f4f477a69917 100644
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -1,18 +1,19 @@
-/* Bottleneck Bandwidth and RTT (BBR) congestion control
+/* BBR (Bottleneck Bandwidth and RTT) congestion control
  *
- * BBR congestion control computes the sending rate based on the delivery
- * rate (throughput) estimated from ACKs. In a nutshell:
+ * BBR is a model-based congestion control algorithm that aims for low queues,
+ * low loss, and (bounded) Reno/CUBIC coexistence. To maintain a model of the
+ * network path, it uses measurements of bandwidth and RTT, as well as (if they
+ * occur) packet loss and/or shallow-threshold ECN signals. Note that although
+ * it can use ECN or loss signals explicitly, it does not require either; it
+ * can bound its in-flight data based on its estimate of the BDP.
  *
- *   On each ACK, update our model of the network path:
- *      bottleneck_bandwidth = windowed_max(delivered / elapsed, 10 round trips)
- *      min_rtt = windowed_min(rtt, 10 seconds)
- *   pacing_rate = pacing_gain * bottleneck_bandwidth
- *   cwnd = max(cwnd_gain * bottleneck_bandwidth * min_rtt, 4)
- *
- * The core algorithm does not react directly to packet losses or delays,
- * although BBR may adjust the size of next send per ACK when loss is
- * observed, or adjust the sending rate if it estimates there is a
- * traffic policer, in order to keep the drop rate reasonable.
+ * The model has both higher and lower bounds for the operating range:
+ *   lo: bw_lo, inflight_lo: conservative short-term lower bound
+ *   hi: bw_hi, inflight_hi: robust long-term upper bound
+ * The bandwidth-probing time scale is (a) extended dynamically based on
+ * estimated BDP to improve coexistence with Reno/CUBIC; (b) bounded by
+ * an interactive wall-clock time-scale to be more scalable and responsive
+ * than Reno and CUBIC.
  *
  * Here is a state transition diagram for BBR:
  *
@@ -65,6 +66,13 @@
 #include <linux/random.h>
 #include <linux/win_minmax.h>
 
+#include <trace/events/tcp.h>
+#include "tcp_dctcp.h"
+
+#define BBR_VERSION		3
+
+#define bbr_param(sk,name)	(bbr_ ## name)
+
 /* Scale factor for rate in pkt/uSec unit to avoid truncation in bandwidth
  * estimation. The rate unit ~= (1500 bytes / 1 usec / 2^24) ~= 715 bps.
  * This handles bandwidths from 0.06pps (715bps) to 256Mpps (3Tbps) in a u32.
@@ -85,36 +93,41 @@ enum bbr_mode {
 	BBR_PROBE_RTT,	/* cut inflight to min to probe min_rtt */
 };
 
+/* How does the incoming ACK stream relate to our bandwidth probing? */
+enum bbr_ack_phase {
+	BBR_ACKS_INIT,		  /* not probing; not getting probe feedback */
+	BBR_ACKS_REFILLING,	  /* sending at est. bw to fill pipe */
+	BBR_ACKS_PROBE_STARTING,  /* inflight rising to probe bw */
+	BBR_ACKS_PROBE_FEEDBACK,  /* getting feedback from bw probing */
+	BBR_ACKS_PROBE_STOPPING,  /* stopped probing; still getting feedback */
+};
+
 /* BBR congestion control block */
 struct bbr {
 	u32	min_rtt_us;	        /* min RTT in min_rtt_win_sec window */
 	u32	min_rtt_stamp;	        /* timestamp of min_rtt_us */
 	u32	probe_rtt_done_stamp;   /* end time for BBR_PROBE_RTT mode */
-	struct minmax bw;	/* Max recent delivery rate in pkts/uS << 24 */
-	u32	rtt_cnt;	    /* count of packet-timed rounds elapsed */
+	u32	probe_rtt_min_us;	/* min RTT in probe_rtt_win_ms win */
+	u32	probe_rtt_min_stamp;	/* timestamp of probe_rtt_min_us*/
 	u32     next_rtt_delivered; /* scb->tx.delivered at end of round */
 	u64	cycle_mstamp;	     /* time of this cycle phase start */
-	u32     mode:3,		     /* current bbr_mode in state machine */
+	u32     mode:2,		     /* current bbr_mode in state machine */
 		prev_ca_state:3,     /* CA state on previous ACK */
-		packet_conservation:1,  /* use packet conservation? */
 		round_start:1,	     /* start of packet-timed tx->ack round? */
+		ce_state:1,          /* If most recent data has CE bit set */
+		bw_probe_up_rounds:5,   /* cwnd-limited rounds in PROBE_UP */
+		try_fast_path:1,	/* can we take fast path? */
 		idle_restart:1,	     /* restarting after idle? */
 		probe_rtt_round_done:1,  /* a BBR_PROBE_RTT round at 4 pkts? */
-		unused:13,
-		lt_is_sampling:1,    /* taking long-term ("LT") samples now? */
-		lt_rtt_cnt:7,	     /* round trips in long-term interval */
-		lt_use_bw:1;	     /* use lt_bw as our bw estimate? */
-	u32	lt_bw;		     /* LT est delivery rate in pkts/uS << 24 */
-	u32	lt_last_delivered;   /* LT intvl start: tp->delivered */
-	u32	lt_last_stamp;	     /* LT intvl start: tp->delivered_mstamp */
-	u32	lt_last_lost;	     /* LT intvl start: tp->lost */
+		init_cwnd:7,         /* initial cwnd */
+		unused_1:10;
 	u32	pacing_gain:10,	/* current gain for setting pacing rate */
 		cwnd_gain:10,	/* current gain for setting cwnd */
 		full_bw_reached:1,   /* reached full bw in Startup? */
 		full_bw_cnt:2,	/* number of rounds without large bw gains */
-		cycle_idx:3,	/* current index in pacing_gain cycle array */
+		cycle_idx:2,	/* current index in pacing_gain cycle array */
 		has_seen_rtt:1, /* have we seen an RTT sample yet? */
-		unused_b:5;
+		unused_2:6;
 	u32	prior_cwnd;	/* prior cwnd upon entering loss recovery */
 	u32	full_bw;	/* recent bw, to estimate if pipe is full */
 
@@ -124,19 +137,67 @@ struct bbr {
 	u32	ack_epoch_acked:20,	/* packets (S)ACKed in sampling epoch */
 		extra_acked_win_rtts:5,	/* age of extra_acked, in round trips */
 		extra_acked_win_idx:1,	/* current index in extra_acked array */
-		unused_c:6;
+	/* BBR v3 state: */
+		full_bw_now:1,		/* recently reached full bw plateau? */
+		startup_ecn_rounds:2,	/* consecutive hi ECN STARTUP rounds */
+		loss_in_cycle:1,	/* packet loss in this cycle? */
+		ecn_in_cycle:1,		/* ECN in this cycle? */
+		unused_3:1;
+	u32	loss_round_delivered; /* scb->tx.delivered ending loss round */
+	u32	undo_bw_lo;	     /* bw_lo before latest losses */
+	u32	undo_inflight_lo;    /* inflight_lo before latest losses */
+	u32	undo_inflight_hi;    /* inflight_hi before latest losses */
+	u32	bw_latest;	 /* max delivered bw in last round trip */
+	u32	bw_lo;		 /* lower bound on sending bandwidth */
+	u32	bw_hi[2];	 /* max recent measured bw sample */
+	u32	inflight_latest; /* max delivered data in last round trip */
+	u32	inflight_lo;	 /* lower bound of inflight data range */
+	u32	inflight_hi;	 /* upper bound of inflight data range */
+	u32	bw_probe_up_cnt; /* packets delivered per inflight_hi incr */
+	u32	bw_probe_up_acks;  /* packets (S)ACKed since inflight_hi incr */
+	u32	probe_wait_us;	 /* PROBE_DOWN until next clock-driven probe */
+	u32	prior_rcv_nxt;	/* tp->rcv_nxt when CE state last changed */
+	u32	ecn_eligible:1,	/* sender can use ECN (RTT, handshake)? */
+		ecn_alpha:9,	/* EWMA delivered_ce/delivered; 0..256 */
+		bw_probe_samples:1,    /* rate samples reflect bw probing? */
+		prev_probe_too_high:1, /* did last PROBE_UP go too high? */
+		stopped_risky_probe:1, /* last PROBE_UP stopped due to risk? */
+		rounds_since_probe:8,  /* packet-timed rounds since probed bw */
+		loss_round_start:1,    /* loss_round_delivered round trip? */
+		loss_in_round:1,       /* loss marked in this round trip? */
+		ecn_in_round:1,	       /* ECN marked in this round trip? */
+		ack_phase:3,	       /* bbr_ack_phase: meaning of ACKs */
+		loss_events_in_round:4,/* losses in STARTUP round */
+		initialized:1;	       /* has bbr_init() been called? */
+	u32	alpha_last_delivered;	 /* tp->delivered    at alpha update */
+	u32	alpha_last_delivered_ce; /* tp->delivered_ce at alpha update */
+
+	u8	unused_4;		/* to preserve alignment */
+	struct tcp_plb_state plb;
 };
 
-#define CYCLE_LEN	8	/* number of phases in a pacing gain cycle */
+struct bbr_context {
+	u32 sample_bw;
+};
 
-/* Window length of bw filter (in rounds): */
-static const int bbr_bw_rtts = CYCLE_LEN + 2;
 /* Window length of min_rtt filter (in sec): */
 static const u32 bbr_min_rtt_win_sec = 10;
 /* Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode: */
 static const u32 bbr_probe_rtt_mode_ms = 200;
-/* Skip TSO below the following bandwidth (bits/sec): */
-static const int bbr_min_tso_rate = 1200000;
+/* Window length of probe_rtt_min_us filter (in ms), and consequently the
+ * typical interval between PROBE_RTT mode entries. The default is 5000ms.
+ * Note that bbr_probe_rtt_win_ms must be <= bbr_min_rtt_win_sec * MSEC_PER_SEC
+ */
+static const u32 bbr_probe_rtt_win_ms = 5000;
+/* Proportion of cwnd to estimated BDP in PROBE_RTT, in units of BBR_UNIT: */
+static const u32 bbr_probe_rtt_cwnd_gain = BBR_UNIT * 1 / 2;
+
+/* Use min_rtt to help adapt TSO burst size, with smaller min_rtt resulting
+ * in bigger TSO bursts. We cut the RTT-based allowance in half
+ * for every 2^9 usec (aka 512 us) of RTT, so that the RTT-based allowance
+ * is below 1500 bytes after 6 * ~500 usec = 3ms.
+ */
+static const u32 bbr_tso_rtt_shift = 9;
 
 /* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck.
  * In order to help drive the network toward lower queues and low latency while
@@ -146,13 +207,15 @@ static const int bbr_min_tso_rate = 1200000;
  */
 static const int bbr_pacing_margin_percent = 1;
 
-/* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain
+/* We use a startup_pacing_gain of 4*ln(2) because it's the smallest value
  * that will allow a smoothly increasing pacing rate that will double each RTT
  * and send the same number of packets per RTT that an un-paced, slow-starting
  * Reno or CUBIC flow would:
  */
-static const int bbr_high_gain  = BBR_UNIT * 2885 / 1000 + 1;
-/* The pacing gain of 1/high_gain in BBR_DRAIN is calculated to typically drain
+static const int bbr_startup_pacing_gain = BBR_UNIT * 277 / 100 + 1;
+/* The gain for deriving startup cwnd: */
+static const int bbr_startup_cwnd_gain = BBR_UNIT * 2;
+/* The pacing gain in BBR_DRAIN is calculated to typically drain
  * the queue created in BBR_STARTUP in a single round:
  */
 static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885;
@@ -160,13 +223,17 @@ static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885;
 static const int bbr_cwnd_gain  = BBR_UNIT * 2;
 /* The pacing_gain values for the PROBE_BW gain cycle, to discover/share bw: */
 static const int bbr_pacing_gain[] = {
-	BBR_UNIT * 5 / 4,	/* probe for more available bw */
-	BBR_UNIT * 3 / 4,	/* drain queue and/or yield bw to other flows */
-	BBR_UNIT, BBR_UNIT, BBR_UNIT,	/* cruise at 1.0*bw to utilize pipe, */
-	BBR_UNIT, BBR_UNIT, BBR_UNIT	/* without creating excess queue... */
+	BBR_UNIT * 5 / 4,	/* UP: probe for more available bw */
+	BBR_UNIT * 91 / 100,	/* DOWN: drain queue and/or yield bw */
+	BBR_UNIT,		/* CRUISE: try to use pipe w/ some headroom */
+	BBR_UNIT,		/* REFILL: refill pipe to estimated 100% */
+};
+enum bbr_pacing_gain_phase {
+	BBR_BW_PROBE_UP		= 0,  /* push up inflight to probe for bw/vol */
+	BBR_BW_PROBE_DOWN	= 1,  /* drain excess inflight from the queue */
+	BBR_BW_PROBE_CRUISE	= 2,  /* use pipe, w/ headroom in queue/pipe */
+	BBR_BW_PROBE_REFILL	= 3,  /* v2: refill the pipe again to 100% */
 };
-/* Randomize the starting gain cycling phase over N phases: */
-static const u32 bbr_cycle_rand = 7;
 
 /* Try to keep at least this many packets in flight, if things go smoothly. For
  * smooth functioning, a sliding window protocol ACKing every other packet
@@ -174,24 +241,12 @@ static const u32 bbr_cycle_rand = 7;
  */
 static const u32 bbr_cwnd_min_target = 4;
 
-/* To estimate if BBR_STARTUP mode (i.e. high_gain) has filled pipe... */
+/* To estimate if BBR_STARTUP or BBR_BW_PROBE_UP has filled pipe... */
 /* If bw has increased significantly (1.25x), there may be more bw available: */
 static const u32 bbr_full_bw_thresh = BBR_UNIT * 5 / 4;
 /* But after 3 rounds w/o significant bw growth, estimate pipe is full: */
 static const u32 bbr_full_bw_cnt = 3;
 
-/* "long-term" ("LT") bandwidth estimator parameters... */
-/* The minimum number of rounds in an LT bw sampling interval: */
-static const u32 bbr_lt_intvl_min_rtts = 4;
-/* If lost/delivered ratio > 20%, interval is "lossy" and we may be policed: */
-static const u32 bbr_lt_loss_thresh = 50;
-/* If 2 intervals have a bw ratio <= 1/8, their bw is "consistent": */
-static const u32 bbr_lt_bw_ratio = BBR_UNIT / 8;
-/* If 2 intervals have a bw diff <= 4 Kbit/sec their bw is "consistent": */
-static const u32 bbr_lt_bw_diff = 4000 / 8;
-/* If we estimate we're policed, use lt_bw for this many round trips: */
-static const u32 bbr_lt_bw_max_rtts = 48;
-
 /* Gain factor for adding extra_acked to target cwnd: */
 static const int bbr_extra_acked_gain = BBR_UNIT;
 /* Window length of extra_acked window. */
@@ -201,8 +256,121 @@ static const u32 bbr_ack_epoch_acked_reset_thresh = 1U << 20;
 /* Time period for clamping cwnd increment due to ack aggregation */
 static const u32 bbr_extra_acked_max_us = 100 * 1000;
 
+/* Flags to control BBR ECN-related behavior... */
+
+/* Ensure ACKs only ACK packets with consistent ECN CE status? */
+static const bool bbr_precise_ece_ack = true;
+
+/* Max RTT (in usec) at which to use sender-side ECN logic.
+ * Disabled when 0 (ECN allowed at any RTT).
+ */
+static const u32 bbr_ecn_max_rtt_us = 5000;
+
+/* On losses, scale down inflight and pacing rate by beta scaled by BBR_SCALE.
+ * No loss response when 0.
+ */
+static const u32 bbr_beta = BBR_UNIT * 30 / 100;
+
+/* Gain factor for ECN mark ratio samples, scaled by BBR_SCALE (1/16 = 6.25%) */
+static const u32 bbr_ecn_alpha_gain = BBR_UNIT * 1 / 16;
+
+/* The initial value for ecn_alpha; 1.0 allows a flow to respond quickly
+ * to congestion if the bottleneck is congested when the flow starts up.
+ */
+static const u32 bbr_ecn_alpha_init = BBR_UNIT;
+
+/* On ECN, cut inflight_lo to (1 - ecn_factor * ecn_alpha) scaled by BBR_SCALE.
+ * No ECN based bounding when 0.
+ */
+static const u32 bbr_ecn_factor = BBR_UNIT * 1 / 3;	 /* 1/3 = 33% */
+
+/* Estimate bw probing has gone too far if CE ratio exceeds this threshold.
+ * Scaled by BBR_SCALE. Disabled when 0.
+ */
+static const u32 bbr_ecn_thresh = BBR_UNIT * 1 / 2;  /* 1/2 = 50% */
+
+/* If non-zero, if in a cycle with no losses but some ECN marks, after ECN
+ * clears then make the first round's increment to inflight_hi the following
+ * fraction of inflight_hi.
+ */
+static const u32 bbr_ecn_reprobe_gain = BBR_UNIT * 1 / 2;
+
+/* Estimate bw probing has gone too far if loss rate exceeds this level. */
+static const u32 bbr_loss_thresh = BBR_UNIT * 2 / 100;  /* 2% loss */
+
+/* Slow down for a packet loss recovered by TLP? */
+static const bool bbr_loss_probe_recovery = true;
+
+/* Exit STARTUP if number of loss marking events in a Recovery round is >= N,
+ * and loss rate is higher than bbr_loss_thresh.
+ * Disabled if 0.
+ */
+static const u32 bbr_full_loss_cnt = 6;
+
+/* Exit STARTUP if number of round trips with ECN mark rate above ecn_thresh
+ * meets this count.
+ */
+static const u32 bbr_full_ecn_cnt = 2;
+
+/* Fraction of unutilized headroom to try to leave in path upon high loss. */
+static const u32 bbr_inflight_headroom = BBR_UNIT * 15 / 100;
+
+/* How much do we increase cwnd_gain when probing for bandwidth in
+ * BBR_BW_PROBE_UP? This specifies the increment in units of
+ * BBR_UNIT/4. The default is 1, meaning 0.25.
+ * The min value is 0 (meaning 0.0); max is 3 (meaning 0.75).
+ */
+static const u32 bbr_bw_probe_cwnd_gain = 1;
+
+/* Max number of packet-timed rounds to wait before probing for bandwidth.  If
+ * we want to tolerate 1% random loss per round, and not have this cut our
+ * inflight too much, we must probe for bw periodically on roughly this scale.
+ * If low, limits Reno/CUBIC coexistence; if high, limits loss tolerance.
+ * We aim to be fair with Reno/CUBIC up to a BDP of at least:
+ *  BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets
+ */
+static const u32 bbr_bw_probe_max_rounds = 63;
+
+/* Max amount of randomness to inject in round counting for Reno-coexistence.
+ */
+static const u32 bbr_bw_probe_rand_rounds = 2;
+
+/* Use BBR-native probe time scale starting at this many usec.
+ * We aim to be fair with Reno/CUBIC up to an inter-loss time epoch of at least:
+ *  BDP*RTT = 25Mbps * .030sec /(1514bytes) * 0.030sec = 1.9 secs
+ */
+static const u32 bbr_bw_probe_base_us = 2 * USEC_PER_SEC;  /* 2 secs */
+
+/* Use BBR-native probes spread over this many usec: */
+static const u32 bbr_bw_probe_rand_us = 1 * USEC_PER_SEC;  /* 1 secs */
+
+/* Use fast path if app-limited, no loss/ECN, and target cwnd was reached? */
+static const bool bbr_fast_path = true;
+
+/* Use fast ack mode? */
+static const bool bbr_fast_ack_mode = true;
+
+static u32 bbr_max_bw(const struct sock *sk);
+static u32 bbr_bw(const struct sock *sk);
+static void bbr_exit_probe_rtt(struct sock *sk);
+static void bbr_reset_congestion_signals(struct sock *sk);
+static void bbr_run_loss_probe_recovery(struct sock *sk);
+
 static void bbr_check_probe_rtt_done(struct sock *sk);
 
+/* This connection can use ECN if both endpoints have signaled ECN support in
+ * the handshake and the per-route settings indicated this is a
+ * shallow-threshold ECN environment, meaning both:
+ *  (a) ECN CE marks indicate low-latency/shallow-threshold congestion, and
+ *  (b) TCP endpoints provide precise ACKs that only ACK data segments
+ *      with consistent ECN CE status
+ */
+static bool bbr_can_use_ecn(const struct sock *sk)
+{
+	return (tcp_sk(sk)->ecn_flags & TCP_ECN_OK) &&
+	       (tcp_sk(sk)->ecn_flags & TCP_ECN_LOW);
+}
+
 /* Do we estimate that STARTUP filled the pipe? */
 static bool bbr_full_bw_reached(const struct sock *sk)
 {
@@ -214,17 +382,17 @@ static bool bbr_full_bw_reached(const struct sock *sk)
 /* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */
 static u32 bbr_max_bw(const struct sock *sk)
 {
-	struct bbr *bbr = inet_csk_ca(sk);
+	const struct bbr *bbr = inet_csk_ca(sk);
 
-	return minmax_get(&bbr->bw);
+	return max(bbr->bw_hi[0], bbr->bw_hi[1]);
 }
 
 /* Return the estimated bandwidth of the path, in pkts/uS << BW_SCALE. */
 static u32 bbr_bw(const struct sock *sk)
 {
-	struct bbr *bbr = inet_csk_ca(sk);
+	const struct bbr *bbr = inet_csk_ca(sk);
 
-	return bbr->lt_use_bw ? bbr->lt_bw : bbr_max_bw(sk);
+	return min(bbr_max_bw(sk), bbr->bw_lo);
 }
 
 /* Return maximum extra acked in past k-2k round trips,
@@ -241,15 +409,23 @@ static u16 bbr_extra_acked(const struct sock *sk)
  * The order here is chosen carefully to avoid overflow of u64. This should
  * work for input rates of up to 2.9Tbit/sec and gain of 2.89x.
  */
-static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain)
+static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain,
+				  int margin)
 {
 	unsigned int mss = tcp_sk(sk)->mss_cache;
 
 	rate *= mss;
 	rate *= gain;
 	rate >>= BBR_SCALE;
-	rate *= USEC_PER_SEC / 100 * (100 - bbr_pacing_margin_percent);
-	return rate >> BW_SCALE;
+	rate *= USEC_PER_SEC / 100 * (100 - margin);
+	rate >>= BW_SCALE;
+	rate = max(rate, 1ULL);
+	return rate;
+}
+
+static u64 bbr_bw_bytes_per_sec(struct sock *sk, u64 rate)
+{
+	return bbr_rate_bytes_per_sec(sk, rate, BBR_UNIT, 0);
 }
 
 /* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */
@@ -257,12 +433,13 @@ static unsigned long bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain)
 {
 	u64 rate = bw;
 
-	rate = bbr_rate_bytes_per_sec(sk, rate, gain);
+	rate = bbr_rate_bytes_per_sec(sk, rate, gain,
+				      bbr_pacing_margin_percent);
 	rate = min_t(u64, rate, sk->sk_max_pacing_rate);
 	return rate;
 }
 
-/* Initialize pacing rate to: high_gain * init_cwnd / RTT. */
+/* Initialize pacing rate to: startup_pacing_gain * init_cwnd / RTT. */
 static void bbr_init_pacing_rate_from_rtt(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
@@ -278,7 +455,8 @@ static void bbr_init_pacing_rate_from_rtt(struct sock *sk)
 	}
 	bw = (u64)tcp_snd_cwnd(tp) * BW_UNIT;
 	do_div(bw, rtt_us);
-	sk->sk_pacing_rate = bbr_bw_to_pacing_rate(sk, bw, bbr_high_gain);
+	sk->sk_pacing_rate =
+	  bbr_bw_to_pacing_rate(sk, bw, bbr_param(sk, startup_pacing_gain));
 }
 
 /* Pace using current bw estimate and a gain factor. */
@@ -294,26 +472,48 @@ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain)
 		sk->sk_pacing_rate = rate;
 }
 
-/* override sysctl_tcp_min_tso_segs */
-__bpf_kfunc static u32 bbr_min_tso_segs(struct sock *sk)
+/* Return the number of segments BBR would like in a TSO/GSO skb, given a
+ * particular max gso size as a constraint. TODO: make this simpler and more
+ * consistent by switching bbr to just call tcp_tso_autosize().
+ */
+static u32 bbr_tso_segs_generic(struct sock *sk, unsigned int mss_now,
+				u32 gso_max_size)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+	u32 segs, r;
+	u64 bytes;
+
+	/* Budget a TSO/GSO burst size allowance based on bw (pacing_rate). */
+	bytes = sk->sk_pacing_rate >> sk->sk_pacing_shift;
+
+	/* Budget a TSO/GSO burst size allowance based on min_rtt. For every
+	 * K = 2^tso_rtt_shift microseconds of min_rtt, halve the burst.
+	 * The min_rtt-based burst allowance is: 64 KBytes / 2^(min_rtt/K)
+	 */
+	if (bbr_param(sk, tso_rtt_shift)) {
+		r = bbr->min_rtt_us >> bbr_param(sk, tso_rtt_shift);
+		if (r < BITS_PER_TYPE(u32))   /* prevent undefined behavior */
+			bytes += GSO_LEGACY_MAX_SIZE >> r;
+	}
+
+	bytes = min_t(u32, bytes, gso_max_size - 1 - MAX_TCP_HEADER);
+	segs = max_t(u32, bytes / mss_now,
+		     sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs);
+	return segs;
+}
+
+/* Custom tcp_tso_autosize() for BBR, used at transmit time to cap skb size. */
+__bpf_kfunc static u32 bbr_tso_segs(struct sock *sk, unsigned int mss_now)
 {
-	return sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2;
+	return bbr_tso_segs_generic(sk, mss_now, sk->sk_gso_max_size);
 }
 
+/* Like bbr_tso_segs(), using mss_cache, ignoring driver's sk_gso_max_size. */
 static u32 bbr_tso_segs_goal(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	u32 segs, bytes;
-
-	/* Sort of tcp_tso_autosize() but ignoring
-	 * driver provided sk_gso_max_size.
-	 */
-	bytes = min_t(unsigned long,
-		      sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift),
-		      GSO_LEGACY_MAX_SIZE - 1 - MAX_TCP_HEADER);
-	segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk));
 
-	return min(segs, 0x7FU);
+	return  bbr_tso_segs_generic(sk, tp->mss_cache, GSO_LEGACY_MAX_SIZE);
 }
 
 /* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */
@@ -333,7 +533,9 @@ __bpf_kfunc static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event)
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct bbr *bbr = inet_csk_ca(sk);
 
-	if (event == CA_EVENT_TX_START && tp->app_limited) {
+	if (event == CA_EVENT_TX_START) {
+		if (!tp->app_limited)
+			return;
 		bbr->idle_restart = 1;
 		bbr->ack_epoch_mstamp = tp->tcp_mstamp;
 		bbr->ack_epoch_acked = 0;
@@ -344,6 +546,16 @@ __bpf_kfunc static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event)
 			bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT);
 		else if (bbr->mode == BBR_PROBE_RTT)
 			bbr_check_probe_rtt_done(sk);
+	} else if ((event == CA_EVENT_ECN_IS_CE ||
+		    event == CA_EVENT_ECN_NO_CE) &&
+		   bbr_can_use_ecn(sk) &&
+		   bbr_param(sk, precise_ece_ack)) {
+		u32 state = bbr->ce_state;
+		dctcp_ece_ack_update(sk, event, &bbr->prior_rcv_nxt, &state);
+		bbr->ce_state = state;
+	} else if (event == CA_EVENT_TLP_RECOVERY &&
+		   bbr_param(sk, loss_probe_recovery)) {
+		bbr_run_loss_probe_recovery(sk);
 	}
 }
 
@@ -366,10 +578,10 @@ static u32 bbr_bdp(struct sock *sk, u32 bw, int gain)
 	 * default. This should only happen when the connection is not using TCP
 	 * timestamps and has retransmitted all of the SYN/SYNACK/data packets
 	 * ACKed so far. In this case, an RTO can cut cwnd to 1, in which
-	 * case we need to slow-start up toward something safe: TCP_INIT_CWND.
+	 * case we need to slow-start up toward something safe: initial cwnd.
 	 */
 	if (unlikely(bbr->min_rtt_us == ~0U))	 /* no valid RTT samples yet? */
-		return TCP_INIT_CWND;  /* be safe: cap at default initial cwnd*/
+		return bbr->init_cwnd;  /* be safe: cap at initial cwnd */
 
 	w = (u64)bw * bbr->min_rtt_us;
 
@@ -386,23 +598,23 @@ static u32 bbr_bdp(struct sock *sk, u32 bw, int gain)
  *   - one skb in sending host Qdisc,
  *   - one skb in sending host TSO/GSO engine
  *   - one skb being received by receiver host LRO/GRO/delayed-ACK engine
- * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because
- * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets,
+ * Don't worry, at low rates this won't bloat cwnd because
+ * in such cases tso_segs_goal is small. The minimum cwnd is 4 packets,
  * which allows 2 outstanding 2-packet sequences, to try to keep pipe
  * full even with ACK-every-other-packet delayed ACKs.
  */
 static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd)
 {
 	struct bbr *bbr = inet_csk_ca(sk);
+	u32 tso_segs_goal;
 
-	/* Allow enough full-sized skbs in flight to utilize end systems. */
-	cwnd += 3 * bbr_tso_segs_goal(sk);
-
-	/* Reduce delayed ACKs by rounding up cwnd to the next even number. */
-	cwnd = (cwnd + 1) & ~1U;
+	tso_segs_goal = 3 * bbr_tso_segs_goal(sk);
 
+	/* Allow enough full-sized skbs in flight to utilize end systems. */
+	cwnd = max_t(u32, cwnd, tso_segs_goal);
+	cwnd = max_t(u32, cwnd, bbr_param(sk, cwnd_min_target));
 	/* Ensure gain cycling gets inflight above BDP even for small BDPs. */
-	if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == 0)
+	if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP)
 		cwnd += 2;
 
 	return cwnd;
@@ -457,10 +669,10 @@ static u32 bbr_ack_aggregation_cwnd(struct sock *sk)
 {
 	u32 max_aggr_cwnd, aggr_cwnd = 0;
 
-	if (bbr_extra_acked_gain && bbr_full_bw_reached(sk)) {
+	if (bbr_param(sk, extra_acked_gain)) {
 		max_aggr_cwnd = ((u64)bbr_bw(sk) * bbr_extra_acked_max_us)
 				/ BW_UNIT;
-		aggr_cwnd = (bbr_extra_acked_gain * bbr_extra_acked(sk))
+		aggr_cwnd = (bbr_param(sk, extra_acked_gain) * bbr_extra_acked(sk))
 			     >> BBR_SCALE;
 		aggr_cwnd = min(aggr_cwnd, max_aggr_cwnd);
 	}
@@ -468,66 +680,27 @@ static u32 bbr_ack_aggregation_cwnd(struct sock *sk)
 	return aggr_cwnd;
 }
 
-/* An optimization in BBR to reduce losses: On the first round of recovery, we
- * follow the packet conservation principle: send P packets per P packets acked.
- * After that, we slow-start and send at most 2*P packets per P packets acked.
- * After recovery finishes, or upon undo, we restore the cwnd we had when
- * recovery started (capped by the target cwnd based on estimated BDP).
- *
- * TODO(ycheng/ncardwell): implement a rate-based approach.
- */
-static bool bbr_set_cwnd_to_recover_or_restore(
-	struct sock *sk, const struct rate_sample *rs, u32 acked, u32 *new_cwnd)
+/* Returns the cwnd for PROBE_RTT mode. */
+static u32 bbr_probe_rtt_cwnd(struct sock *sk)
 {
-	struct tcp_sock *tp = tcp_sk(sk);
-	struct bbr *bbr = inet_csk_ca(sk);
-	u8 prev_state = bbr->prev_ca_state, state = inet_csk(sk)->icsk_ca_state;
-	u32 cwnd = tcp_snd_cwnd(tp);
-
-	/* An ACK for P pkts should release at most 2*P packets. We do this
-	 * in two steps. First, here we deduct the number of lost packets.
-	 * Then, in bbr_set_cwnd() we slow start up toward the target cwnd.
-	 */
-	if (rs->losses > 0)
-		cwnd = max_t(s32, cwnd - rs->losses, 1);
-
-	if (state == TCP_CA_Recovery && prev_state != TCP_CA_Recovery) {
-		/* Starting 1st round of Recovery, so do packet conservation. */
-		bbr->packet_conservation = 1;
-		bbr->next_rtt_delivered = tp->delivered;  /* start round now */
-		/* Cut unused cwnd from app behavior, TSQ, or TSO deferral: */
-		cwnd = tcp_packets_in_flight(tp) + acked;
-	} else if (prev_state >= TCP_CA_Recovery && state < TCP_CA_Recovery) {
-		/* Exiting loss recovery; restore cwnd saved before recovery. */
-		cwnd = max(cwnd, bbr->prior_cwnd);
-		bbr->packet_conservation = 0;
-	}
-	bbr->prev_ca_state = state;
-
-	if (bbr->packet_conservation) {
-		*new_cwnd = max(cwnd, tcp_packets_in_flight(tp) + acked);
-		return true;	/* yes, using packet conservation */
-	}
-	*new_cwnd = cwnd;
-	return false;
+	return max_t(u32, bbr_param(sk, cwnd_min_target),
+		     bbr_bdp(sk, bbr_bw(sk), bbr_param(sk, probe_rtt_cwnd_gain)));
 }
 
 /* Slow-start up toward target cwnd (if bw estimate is growing, or packet loss
  * has drawn us down below target), or snap down to target if we're above it.
  */
 static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs,
-			 u32 acked, u32 bw, int gain)
+			 u32 acked, u32 bw, int gain, u32 cwnd,
+			 struct bbr_context *ctx)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct bbr *bbr = inet_csk_ca(sk);
-	u32 cwnd = tcp_snd_cwnd(tp), target_cwnd = 0;
+	u32 target_cwnd = 0;
 
 	if (!acked)
 		goto done;  /* no packet fully ACKed; just apply caps */
 
-	if (bbr_set_cwnd_to_recover_or_restore(sk, rs, acked, &cwnd))
-		goto done;
-
 	target_cwnd = bbr_bdp(sk, bw, gain);
 
 	/* Increment the cwnd to account for excess ACKed data that seems
@@ -536,74 +709,26 @@ static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs,
 	target_cwnd += bbr_ack_aggregation_cwnd(sk);
 	target_cwnd = bbr_quantization_budget(sk, target_cwnd);
 
-	/* If we're below target cwnd, slow start cwnd toward target cwnd. */
-	if (bbr_full_bw_reached(sk))  /* only cut cwnd if we filled the pipe */
-		cwnd = min(cwnd + acked, target_cwnd);
-	else if (cwnd < target_cwnd || tp->delivered < TCP_INIT_CWND)
-		cwnd = cwnd + acked;
-	cwnd = max(cwnd, bbr_cwnd_min_target);
+	/* Update cwnd and enable fast path if cwnd reaches target_cwnd. */
+	bbr->try_fast_path = 0;
+	if (bbr_full_bw_reached(sk)) { /* only cut cwnd if we filled the pipe */
+		cwnd += acked;
+		if (cwnd >= target_cwnd) {
+			cwnd = target_cwnd;
+			bbr->try_fast_path = 1;
+		}
+	} else if (cwnd < target_cwnd || cwnd  < 2 * bbr->init_cwnd) {
+		cwnd += acked;
+	} else {
+		bbr->try_fast_path = 1;
+	}
 
+	cwnd = max_t(u32, cwnd, bbr_param(sk, cwnd_min_target));
 done:
-	tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp));	/* apply global cap */
+	tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp));  /* global cap */
 	if (bbr->mode == BBR_PROBE_RTT)  /* drain queue, refresh min_rtt */
-		tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp), bbr_cwnd_min_target));
-}
-
-/* End cycle phase if it's time and/or we hit the phase's in-flight target. */
-static bool bbr_is_next_cycle_phase(struct sock *sk,
-				    const struct rate_sample *rs)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-	struct bbr *bbr = inet_csk_ca(sk);
-	bool is_full_length =
-		tcp_stamp_us_delta(tp->delivered_mstamp, bbr->cycle_mstamp) >
-		bbr->min_rtt_us;
-	u32 inflight, bw;
-
-	/* The pacing_gain of 1.0 paces at the estimated bw to try to fully
-	 * use the pipe without increasing the queue.
-	 */
-	if (bbr->pacing_gain == BBR_UNIT)
-		return is_full_length;		/* just use wall clock time */
-
-	inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight);
-	bw = bbr_max_bw(sk);
-
-	/* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at
-	 * least pacing_gain*BDP; this may take more than min_rtt if min_rtt is
-	 * small (e.g. on a LAN). We do not persist if packets are lost, since
-	 * a path with small buffers may not hold that much.
-	 */
-	if (bbr->pacing_gain > BBR_UNIT)
-		return is_full_length &&
-			(rs->losses ||  /* perhaps pacing_gain*BDP won't fit */
-			 inflight >= bbr_inflight(sk, bw, bbr->pacing_gain));
-
-	/* A pacing_gain < 1.0 tries to drain extra queue we added if bw
-	 * probing didn't find more bw. If inflight falls to match BDP then we
-	 * estimate queue is drained; persisting would underutilize the pipe.
-	 */
-	return is_full_length ||
-		inflight <= bbr_inflight(sk, bw, BBR_UNIT);
-}
-
-static void bbr_advance_cycle_phase(struct sock *sk)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-	struct bbr *bbr = inet_csk_ca(sk);
-
-	bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1);
-	bbr->cycle_mstamp = tp->delivered_mstamp;
-}
-
-/* Gain cycling: cycle pacing gain to converge to fair share of available bw. */
-static void bbr_update_cycle_phase(struct sock *sk,
-				   const struct rate_sample *rs)
-{
-	struct bbr *bbr = inet_csk_ca(sk);
-
-	if (bbr->mode == BBR_PROBE_BW && bbr_is_next_cycle_phase(sk, rs))
-		bbr_advance_cycle_phase(sk);
+		tcp_snd_cwnd_set(tp, min_t(u32, tcp_snd_cwnd(tp),
+					   bbr_probe_rtt_cwnd(sk)));
 }
 
 static void bbr_reset_startup_mode(struct sock *sk)
@@ -613,191 +738,49 @@ static void bbr_reset_startup_mode(struct sock *sk)
 	bbr->mode = BBR_STARTUP;
 }
 
-static void bbr_reset_probe_bw_mode(struct sock *sk)
-{
-	struct bbr *bbr = inet_csk_ca(sk);
-
-	bbr->mode = BBR_PROBE_BW;
-	bbr->cycle_idx = CYCLE_LEN - 1 - get_random_u32_below(bbr_cycle_rand);
-	bbr_advance_cycle_phase(sk);	/* flip to next phase of gain cycle */
-}
-
-static void bbr_reset_mode(struct sock *sk)
-{
-	if (!bbr_full_bw_reached(sk))
-		bbr_reset_startup_mode(sk);
-	else
-		bbr_reset_probe_bw_mode(sk);
-}
-
-/* Start a new long-term sampling interval. */
-static void bbr_reset_lt_bw_sampling_interval(struct sock *sk)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-	struct bbr *bbr = inet_csk_ca(sk);
-
-	bbr->lt_last_stamp = div_u64(tp->delivered_mstamp, USEC_PER_MSEC);
-	bbr->lt_last_delivered = tp->delivered;
-	bbr->lt_last_lost = tp->lost;
-	bbr->lt_rtt_cnt = 0;
-}
-
-/* Completely reset long-term bandwidth sampling. */
-static void bbr_reset_lt_bw_sampling(struct sock *sk)
-{
-	struct bbr *bbr = inet_csk_ca(sk);
-
-	bbr->lt_bw = 0;
-	bbr->lt_use_bw = 0;
-	bbr->lt_is_sampling = false;
-	bbr_reset_lt_bw_sampling_interval(sk);
-}
-
-/* Long-term bw sampling interval is done. Estimate whether we're policed. */
-static void bbr_lt_bw_interval_done(struct sock *sk, u32 bw)
-{
-	struct bbr *bbr = inet_csk_ca(sk);
-	u32 diff;
-
-	if (bbr->lt_bw) {  /* do we have bw from a previous interval? */
-		/* Is new bw close to the lt_bw from the previous interval? */
-		diff = abs(bw - bbr->lt_bw);
-		if ((diff * BBR_UNIT <= bbr_lt_bw_ratio * bbr->lt_bw) ||
-		    (bbr_rate_bytes_per_sec(sk, diff, BBR_UNIT) <=
-		     bbr_lt_bw_diff)) {
-			/* All criteria are met; estimate we're policed. */
-			bbr->lt_bw = (bw + bbr->lt_bw) >> 1;  /* avg 2 intvls */
-			bbr->lt_use_bw = 1;
-			bbr->pacing_gain = BBR_UNIT;  /* try to avoid drops */
-			bbr->lt_rtt_cnt = 0;
-			return;
-		}
-	}
-	bbr->lt_bw = bw;
-	bbr_reset_lt_bw_sampling_interval(sk);
-}
-
-/* Token-bucket traffic policers are common (see "An Internet-Wide Analysis of
- * Traffic Policing", SIGCOMM 2016). BBR detects token-bucket policers and
- * explicitly models their policed rate, to reduce unnecessary losses. We
- * estimate that we're policed if we see 2 consecutive sampling intervals with
- * consistent throughput and high packet loss. If we think we're being policed,
- * set lt_bw to the "long-term" average delivery rate from those 2 intervals.
+/* See if we have reached next round trip. Upon start of the new round,
+ * returns packets delivered since previous round start plus this ACK.
  */
-static void bbr_lt_bw_sampling(struct sock *sk, const struct rate_sample *rs)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-	struct bbr *bbr = inet_csk_ca(sk);
-	u32 lost, delivered;
-	u64 bw;
-	u32 t;
-
-	if (bbr->lt_use_bw) {	/* already using long-term rate, lt_bw? */
-		if (bbr->mode == BBR_PROBE_BW && bbr->round_start &&
-		    ++bbr->lt_rtt_cnt >= bbr_lt_bw_max_rtts) {
-			bbr_reset_lt_bw_sampling(sk);    /* stop using lt_bw */
-			bbr_reset_probe_bw_mode(sk);  /* restart gain cycling */
-		}
-		return;
-	}
-
-	/* Wait for the first loss before sampling, to let the policer exhaust
-	 * its tokens and estimate the steady-state rate allowed by the policer.
-	 * Starting samples earlier includes bursts that over-estimate the bw.
-	 */
-	if (!bbr->lt_is_sampling) {
-		if (!rs->losses)
-			return;
-		bbr_reset_lt_bw_sampling_interval(sk);
-		bbr->lt_is_sampling = true;
-	}
-
-	/* To avoid underestimates, reset sampling if we run out of data. */
-	if (rs->is_app_limited) {
-		bbr_reset_lt_bw_sampling(sk);
-		return;
-	}
-
-	if (bbr->round_start)
-		bbr->lt_rtt_cnt++;	/* count round trips in this interval */
-	if (bbr->lt_rtt_cnt < bbr_lt_intvl_min_rtts)
-		return;		/* sampling interval needs to be longer */
-	if (bbr->lt_rtt_cnt > 4 * bbr_lt_intvl_min_rtts) {
-		bbr_reset_lt_bw_sampling(sk);  /* interval is too long */
-		return;
-	}
-
-	/* End sampling interval when a packet is lost, so we estimate the
-	 * policer tokens were exhausted. Stopping the sampling before the
-	 * tokens are exhausted under-estimates the policed rate.
-	 */
-	if (!rs->losses)
-		return;
-
-	/* Calculate packets lost and delivered in sampling interval. */
-	lost = tp->lost - bbr->lt_last_lost;
-	delivered = tp->delivered - bbr->lt_last_delivered;
-	/* Is loss rate (lost/delivered) >= lt_loss_thresh? If not, wait. */
-	if (!delivered || (lost << BBR_SCALE) < bbr_lt_loss_thresh * delivered)
-		return;
-
-	/* Find average delivery rate in this sampling interval. */
-	t = div_u64(tp->delivered_mstamp, USEC_PER_MSEC) - bbr->lt_last_stamp;
-	if ((s32)t < 1)
-		return;		/* interval is less than one ms, so wait */
-	/* Check if can multiply without overflow */
-	if (t >= ~0U / USEC_PER_MSEC) {
-		bbr_reset_lt_bw_sampling(sk);  /* interval too long; reset */
-		return;
-	}
-	t *= USEC_PER_MSEC;
-	bw = (u64)delivered * BW_UNIT;
-	do_div(bw, t);
-	bbr_lt_bw_interval_done(sk, bw);
-}
-
-/* Estimate the bandwidth based on how fast packets are delivered */
-static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs)
+static u32 bbr_update_round_start(struct sock *sk,
+		const struct rate_sample *rs, struct bbr_context *ctx)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct bbr *bbr = inet_csk_ca(sk);
-	u64 bw;
+	u32 round_delivered = 0;
 
 	bbr->round_start = 0;
-	if (rs->delivered < 0 || rs->interval_us <= 0)
-		return; /* Not a valid observation */
 
 	/* See if we've reached the next RTT */
-	if (!before(rs->prior_delivered, bbr->next_rtt_delivered)) {
+	if (rs->interval_us > 0 &&
+	    !before(rs->prior_delivered, bbr->next_rtt_delivered)) {
+		round_delivered = tp->delivered - bbr->next_rtt_delivered;
 		bbr->next_rtt_delivered = tp->delivered;
-		bbr->rtt_cnt++;
 		bbr->round_start = 1;
-		bbr->packet_conservation = 0;
 	}
+	return round_delivered;
+}
 
-	bbr_lt_bw_sampling(sk, rs);
+/* Calculate the bandwidth based on how fast packets are delivered */
+static void bbr_calculate_bw_sample(struct sock *sk,
+			const struct rate_sample *rs, struct bbr_context *ctx)
+{
+	u64 bw = 0;
 
 	/* Divide delivered by the interval to find a (lower bound) bottleneck
 	 * bandwidth sample. Delivered is in packets and interval_us in uS and
 	 * ratio will be <<1 for most connections. So delivered is first scaled.
+	 * Round up to allow growth at low rates, even with integer division.
 	 */
-	bw = div64_long((u64)rs->delivered * BW_UNIT, rs->interval_us);
-
-	/* If this sample is application-limited, it is likely to have a very
-	 * low delivered count that represents application behavior rather than
-	 * the available network rate. Such a sample could drag down estimated
-	 * bw, causing needless slow-down. Thus, to continue to send at the
-	 * last measured network rate, we filter out app-limited samples unless
-	 * they describe the path bw at least as well as our bw model.
-	 *
-	 * So the goal during app-limited phase is to proceed with the best
-	 * network rate no matter how long. We automatically leave this
-	 * phase when app writes faster than the network can deliver :)
-	 */
-	if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) {
-		/* Incorporate new sample into our max bw filter. */
-		minmax_running_max(&bbr->bw, bbr_bw_rtts, bbr->rtt_cnt, bw);
+	if (rs->interval_us > 0) {
+		if (WARN_ONCE(rs->delivered < 0,
+			      "negative delivered: %d interval_us: %ld\n",
+			      rs->delivered, rs->interval_us))
+			return;
+
+		bw = DIV_ROUND_UP_ULL((u64)rs->delivered * BW_UNIT, rs->interval_us);
 	}
+
+	ctx->sample_bw = bw;
 }
 
 /* Estimates the windowed max degree of ack aggregation.
@@ -811,7 +794,7 @@ static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs)
  *
  * Max extra_acked is clamped by cwnd and bw * bbr_extra_acked_max_us (100 ms).
  * Max filter is an approximate sliding window of 5-10 (packet timed) round
- * trips.
+ * trips for non-startup phase, and 1-2 round trips for startup.
  */
 static void bbr_update_ack_aggregation(struct sock *sk,
 				       const struct rate_sample *rs)
@@ -819,15 +802,19 @@ static void bbr_update_ack_aggregation(struct sock *sk,
 	u32 epoch_us, expected_acked, extra_acked;
 	struct bbr *bbr = inet_csk_ca(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
+	u32 extra_acked_win_rtts_thresh = bbr_param(sk, extra_acked_win_rtts);
 
-	if (!bbr_extra_acked_gain || rs->acked_sacked <= 0 ||
+	if (!bbr_param(sk, extra_acked_gain) || rs->acked_sacked <= 0 ||
 	    rs->delivered < 0 || rs->interval_us <= 0)
 		return;
 
 	if (bbr->round_start) {
 		bbr->extra_acked_win_rtts = min(0x1F,
 						bbr->extra_acked_win_rtts + 1);
-		if (bbr->extra_acked_win_rtts >= bbr_extra_acked_win_rtts) {
+		if (!bbr_full_bw_reached(sk))
+			extra_acked_win_rtts_thresh = 1;
+		if (bbr->extra_acked_win_rtts >=
+		    extra_acked_win_rtts_thresh) {
 			bbr->extra_acked_win_rtts = 0;
 			bbr->extra_acked_win_idx = bbr->extra_acked_win_idx ?
 						   0 : 1;
@@ -861,49 +848,6 @@ static void bbr_update_ack_aggregation(struct sock *sk,
 		bbr->extra_acked[bbr->extra_acked_win_idx] = extra_acked;
 }
 
-/* Estimate when the pipe is full, using the change in delivery rate: BBR
- * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by
- * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited
- * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the
- * higher rwin, 3: we get higher delivery rate samples. Or transient
- * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar
- * design goal, but uses delay and inter-ACK spacing instead of bandwidth.
- */
-static void bbr_check_full_bw_reached(struct sock *sk,
-				      const struct rate_sample *rs)
-{
-	struct bbr *bbr = inet_csk_ca(sk);
-	u32 bw_thresh;
-
-	if (bbr_full_bw_reached(sk) || !bbr->round_start || rs->is_app_limited)
-		return;
-
-	bw_thresh = (u64)bbr->full_bw * bbr_full_bw_thresh >> BBR_SCALE;
-	if (bbr_max_bw(sk) >= bw_thresh) {
-		bbr->full_bw = bbr_max_bw(sk);
-		bbr->full_bw_cnt = 0;
-		return;
-	}
-	++bbr->full_bw_cnt;
-	bbr->full_bw_reached = bbr->full_bw_cnt >= bbr_full_bw_cnt;
-}
-
-/* If pipe is probably full, drain the queue and then enter steady-state. */
-static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs)
-{
-	struct bbr *bbr = inet_csk_ca(sk);
-
-	if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) {
-		bbr->mode = BBR_DRAIN;	/* drain queue we created */
-		tcp_sk(sk)->snd_ssthresh =
-				bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT);
-	}	/* fall through to check if in-flight is already small: */
-	if (bbr->mode == BBR_DRAIN &&
-	    bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <=
-	    bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT))
-		bbr_reset_probe_bw_mode(sk);  /* we estimate queue is drained */
-}
-
 static void bbr_check_probe_rtt_done(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
@@ -913,9 +857,9 @@ static void bbr_check_probe_rtt_done(struct sock *sk)
 	      after(tcp_jiffies32, bbr->probe_rtt_done_stamp)))
 		return;
 
-	bbr->min_rtt_stamp = tcp_jiffies32;  /* wait a while until PROBE_RTT */
+	bbr->probe_rtt_min_stamp = tcp_jiffies32; /* schedule next PROBE_RTT */
 	tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp), bbr->prior_cwnd));
-	bbr_reset_mode(sk);
+	bbr_exit_probe_rtt(sk);
 }
 
 /* The goal of PROBE_RTT mode is to have BBR flows cooperatively and
@@ -941,23 +885,35 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct bbr *bbr = inet_csk_ca(sk);
-	bool filter_expired;
+	bool probe_rtt_expired, min_rtt_expired;
+	u32 expire;
 
-	/* Track min RTT seen in the min_rtt_win_sec filter window: */
-	filter_expired = after(tcp_jiffies32,
-			       bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ);
+	/* Track min RTT in probe_rtt_win_ms to time next PROBE_RTT state. */
+	expire = bbr->probe_rtt_min_stamp +
+		 msecs_to_jiffies(bbr_param(sk, probe_rtt_win_ms));
+	probe_rtt_expired = after(tcp_jiffies32, expire);
 	if (rs->rtt_us >= 0 &&
-	    (rs->rtt_us < bbr->min_rtt_us ||
-	     (filter_expired && !rs->is_ack_delayed))) {
-		bbr->min_rtt_us = rs->rtt_us;
-		bbr->min_rtt_stamp = tcp_jiffies32;
+	    (rs->rtt_us < bbr->probe_rtt_min_us ||
+	     (probe_rtt_expired && !rs->is_ack_delayed))) {
+		bbr->probe_rtt_min_us = rs->rtt_us;
+		bbr->probe_rtt_min_stamp = tcp_jiffies32;
+	}
+	/* Track min RTT seen in the min_rtt_win_sec filter window: */
+	expire = bbr->min_rtt_stamp + bbr_param(sk, min_rtt_win_sec) * HZ;
+	min_rtt_expired = after(tcp_jiffies32, expire);
+	if (bbr->probe_rtt_min_us <= bbr->min_rtt_us ||
+	    min_rtt_expired) {
+		bbr->min_rtt_us = bbr->probe_rtt_min_us;
+		bbr->min_rtt_stamp = bbr->probe_rtt_min_stamp;
 	}
 
-	if (bbr_probe_rtt_mode_ms > 0 && filter_expired &&
+	if (bbr_param(sk, probe_rtt_mode_ms) > 0 && probe_rtt_expired &&
 	    !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) {
 		bbr->mode = BBR_PROBE_RTT;  /* dip, drain queue */
 		bbr_save_cwnd(sk);  /* note cwnd so we can restore it */
 		bbr->probe_rtt_done_stamp = 0;
+		bbr->ack_phase = BBR_ACKS_PROBE_STOPPING;
+		bbr->next_rtt_delivered = tp->delivered;
 	}
 
 	if (bbr->mode == BBR_PROBE_RTT) {
@@ -966,9 +922,9 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs)
 			(tp->delivered + tcp_packets_in_flight(tp)) ? : 1;
 		/* Maintain min packets in flight for max(200 ms, 1 round). */
 		if (!bbr->probe_rtt_done_stamp &&
-		    tcp_packets_in_flight(tp) <= bbr_cwnd_min_target) {
+		    tcp_packets_in_flight(tp) <= bbr_probe_rtt_cwnd(sk)) {
 			bbr->probe_rtt_done_stamp = tcp_jiffies32 +
-				msecs_to_jiffies(bbr_probe_rtt_mode_ms);
+				msecs_to_jiffies(bbr_param(sk, probe_rtt_mode_ms));
 			bbr->probe_rtt_round_done = 0;
 			bbr->next_rtt_delivered = tp->delivered;
 		} else if (bbr->probe_rtt_done_stamp) {
@@ -989,18 +945,20 @@ static void bbr_update_gains(struct sock *sk)
 
 	switch (bbr->mode) {
 	case BBR_STARTUP:
-		bbr->pacing_gain = bbr_high_gain;
-		bbr->cwnd_gain	 = bbr_high_gain;
+		bbr->pacing_gain = bbr_param(sk, startup_pacing_gain);
+		bbr->cwnd_gain	 = bbr_param(sk, startup_cwnd_gain);
 		break;
 	case BBR_DRAIN:
-		bbr->pacing_gain = bbr_drain_gain;	/* slow, to drain */
-		bbr->cwnd_gain	 = bbr_high_gain;	/* keep cwnd */
+		bbr->pacing_gain = bbr_param(sk, drain_gain);  /* slow, to drain */
+		bbr->cwnd_gain	 = bbr_param(sk, startup_cwnd_gain);  /* keep cwnd */
 		break;
 	case BBR_PROBE_BW:
-		bbr->pacing_gain = (bbr->lt_use_bw ?
-				    BBR_UNIT :
-				    bbr_pacing_gain[bbr->cycle_idx]);
-		bbr->cwnd_gain	 = bbr_cwnd_gain;
+		bbr->pacing_gain = bbr_pacing_gain[bbr->cycle_idx];
+		bbr->cwnd_gain	 = bbr_param(sk, cwnd_gain);
+		if (bbr_param(sk, bw_probe_cwnd_gain) &&
+		    bbr->cycle_idx == BBR_BW_PROBE_UP)
+			bbr->cwnd_gain +=
+				BBR_UNIT * bbr_param(sk, bw_probe_cwnd_gain) / 4;
 		break;
 	case BBR_PROBE_RTT:
 		bbr->pacing_gain = BBR_UNIT;
@@ -1012,144 +970,1387 @@ static void bbr_update_gains(struct sock *sk)
 	}
 }
 
-static void bbr_update_model(struct sock *sk, const struct rate_sample *rs)
+__bpf_kfunc static u32 bbr_sndbuf_expand(struct sock *sk)
 {
-	bbr_update_bw(sk, rs);
-	bbr_update_ack_aggregation(sk, rs);
-	bbr_update_cycle_phase(sk, rs);
-	bbr_check_full_bw_reached(sk, rs);
-	bbr_check_drain(sk, rs);
-	bbr_update_min_rtt(sk, rs);
-	bbr_update_gains(sk);
+	/* Provision 3 * cwnd since BBR may slow-start even during recovery. */
+	return 3;
 }
 
-__bpf_kfunc static void bbr_main(struct sock *sk, const struct rate_sample *rs)
+/* Incorporate a new bw sample into the current window of our max filter. */
+static void bbr_take_max_bw_sample(struct sock *sk, u32 bw)
 {
 	struct bbr *bbr = inet_csk_ca(sk);
-	u32 bw;
 
-	bbr_update_model(sk, rs);
-
-	bw = bbr_bw(sk);
-	bbr_set_pacing_rate(sk, bw, bbr->pacing_gain);
-	bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain);
+	bbr->bw_hi[1] = max(bw, bbr->bw_hi[1]);
 }
 
-__bpf_kfunc static void bbr_init(struct sock *sk)
+/* Keep max of last 1-2 cycles. Each PROBE_BW cycle, flip filter window. */
+static void bbr_advance_max_bw_filter(struct sock *sk)
 {
-	struct tcp_sock *tp = tcp_sk(sk);
 	struct bbr *bbr = inet_csk_ca(sk);
 
-	bbr->prior_cwnd = 0;
-	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
-	bbr->rtt_cnt = 0;
-	bbr->next_rtt_delivered = tp->delivered;
-	bbr->prev_ca_state = TCP_CA_Open;
-	bbr->packet_conservation = 0;
-
-	bbr->probe_rtt_done_stamp = 0;
-	bbr->probe_rtt_round_done = 0;
-	bbr->min_rtt_us = tcp_min_rtt(tp);
-	bbr->min_rtt_stamp = tcp_jiffies32;
-
-	minmax_reset(&bbr->bw, bbr->rtt_cnt, 0);  /* init max bw to 0 */
+	if (!bbr->bw_hi[1])
+		return;  /* no samples in this window; remember old window */
+	bbr->bw_hi[0] = bbr->bw_hi[1];
+	bbr->bw_hi[1] = 0;
+}
 
-	bbr->has_seen_rtt = 0;
-	bbr_init_pacing_rate_from_rtt(sk);
+/* Reset the estimator for reaching full bandwidth based on bw plateau. */
+static void bbr_reset_full_bw(struct sock *sk)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
 
-	bbr->round_start = 0;
-	bbr->idle_restart = 0;
-	bbr->full_bw_reached = 0;
 	bbr->full_bw = 0;
 	bbr->full_bw_cnt = 0;
-	bbr->cycle_mstamp = 0;
-	bbr->cycle_idx = 0;
-	bbr_reset_lt_bw_sampling(sk);
-	bbr_reset_startup_mode(sk);
+	bbr->full_bw_now = 0;
+}
 
-	bbr->ack_epoch_mstamp = tp->tcp_mstamp;
-	bbr->ack_epoch_acked = 0;
-	bbr->extra_acked_win_rtts = 0;
-	bbr->extra_acked_win_idx = 0;
-	bbr->extra_acked[0] = 0;
-	bbr->extra_acked[1] = 0;
+/* How much do we want in flight? Our BDP, unless congestion cut cwnd. */
+static u32 bbr_target_inflight(struct sock *sk)
+{
+	u32 bdp = bbr_inflight(sk, bbr_bw(sk), BBR_UNIT);
 
-	cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED);
+	return min(bdp, tcp_sk(sk)->snd_cwnd);
 }
 
-__bpf_kfunc static u32 bbr_sndbuf_expand(struct sock *sk)
+static bool bbr_is_probing_bandwidth(struct sock *sk)
 {
-	/* Provision 3 * cwnd since BBR may slow-start even during recovery. */
-	return 3;
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	return (bbr->mode == BBR_STARTUP) ||
+		(bbr->mode == BBR_PROBE_BW &&
+		 (bbr->cycle_idx == BBR_BW_PROBE_REFILL ||
+		  bbr->cycle_idx == BBR_BW_PROBE_UP));
+}
+
+/* Has the given amount of time elapsed since we marked the phase start? */
+static bool bbr_has_elapsed_in_phase(const struct sock *sk, u32 interval_us)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	const struct bbr *bbr = inet_csk_ca(sk);
+
+	return tcp_stamp_us_delta(tp->tcp_mstamp,
+				  bbr->cycle_mstamp + interval_us) > 0;
+}
+
+static void bbr_handle_queue_too_high_in_startup(struct sock *sk)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+	u32 bdp;  /* estimated BDP in packets, with quantization budget */
+
+	bbr->full_bw_reached = 1;
+
+	bdp = bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT);
+	bbr->inflight_hi = max(bdp, bbr->inflight_latest);
+}
+
+/* Exit STARTUP upon N consecutive rounds with ECN mark rate > ecn_thresh. */
+static void bbr_check_ecn_too_high_in_startup(struct sock *sk, u32 ce_ratio)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	if (bbr_full_bw_reached(sk) || !bbr->ecn_eligible ||
+	    !bbr_param(sk, full_ecn_cnt) || !bbr_param(sk, ecn_thresh))
+		return;
+
+	if (ce_ratio >= bbr_param(sk, ecn_thresh))
+		bbr->startup_ecn_rounds++;
+	else
+		bbr->startup_ecn_rounds = 0;
+
+	if (bbr->startup_ecn_rounds >= bbr_param(sk, full_ecn_cnt)) {
+		bbr_handle_queue_too_high_in_startup(sk);
+		return;
+	}
+}
+
+/* Updates ecn_alpha and returns ce_ratio. -1 if not available. */
+static int bbr_update_ecn_alpha(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct net *net = sock_net(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+	s32 delivered, delivered_ce;
+	u64 alpha, ce_ratio;
+	u32 gain;
+	bool want_ecn_alpha;
+
+	/* See if we should use ECN sender logic for this connection. */
+	if (!bbr->ecn_eligible && bbr_can_use_ecn(sk) &&
+	    bbr_param(sk, ecn_factor) &&
+	    (bbr->min_rtt_us <= bbr_ecn_max_rtt_us ||
+	     !bbr_ecn_max_rtt_us))
+		bbr->ecn_eligible = 1;
+
+	/* Skip updating alpha only if not ECN-eligible and PLB is disabled. */
+	want_ecn_alpha = (bbr->ecn_eligible ||
+			  (bbr_can_use_ecn(sk) &&
+			   READ_ONCE(net->ipv4.sysctl_tcp_plb_enabled)));
+	if (!want_ecn_alpha)
+		return -1;
+
+	delivered = tp->delivered - bbr->alpha_last_delivered;
+	delivered_ce = tp->delivered_ce - bbr->alpha_last_delivered_ce;
+
+	if (delivered == 0 ||		/* avoid divide by zero */
+	    WARN_ON_ONCE(delivered < 0 || delivered_ce < 0))  /* backwards? */
+		return -1;
+
+	BUILD_BUG_ON(BBR_SCALE != TCP_PLB_SCALE);
+	ce_ratio = (u64)delivered_ce << BBR_SCALE;
+	do_div(ce_ratio, delivered);
+
+	gain = bbr_param(sk, ecn_alpha_gain);
+	alpha = ((BBR_UNIT - gain) * bbr->ecn_alpha) >> BBR_SCALE;
+	alpha += (gain * ce_ratio) >> BBR_SCALE;
+	bbr->ecn_alpha = min_t(u32, alpha, BBR_UNIT);
+
+	bbr->alpha_last_delivered = tp->delivered;
+	bbr->alpha_last_delivered_ce = tp->delivered_ce;
+
+	bbr_check_ecn_too_high_in_startup(sk, ce_ratio);
+	return (int)ce_ratio;
 }
 
-/* In theory BBR does not need to undo the cwnd since it does not
- * always reduce cwnd on losses (see bbr_main()). Keep it for now.
+/* Protective Load Balancing (PLB). PLB rehashes outgoing data (to a new IPv6
+ * flow label) if it encounters sustained congestion in the form of ECN marks.
  */
-__bpf_kfunc static u32 bbr_undo_cwnd(struct sock *sk)
+static void bbr_plb(struct sock *sk, const struct rate_sample *rs, int ce_ratio)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	if (bbr->round_start && ce_ratio >= 0)
+		tcp_plb_update_state(sk, &bbr->plb, ce_ratio);
+
+	tcp_plb_check_rehash(sk, &bbr->plb);
+}
+
+/* Each round trip of BBR_BW_PROBE_UP, double volume of probing data. */
+static void bbr_raise_inflight_hi_slope(struct sock *sk)
 {
+	struct tcp_sock *tp = tcp_sk(sk);
 	struct bbr *bbr = inet_csk_ca(sk);
+	u32 growth_this_round, cnt;
+
+	/* Calculate "slope": packets S/Acked per inflight_hi increment. */
+	growth_this_round = 1 << bbr->bw_probe_up_rounds;
+	bbr->bw_probe_up_rounds = min(bbr->bw_probe_up_rounds + 1, 30);
+	cnt = tcp_snd_cwnd(tp) / growth_this_round;
+	cnt = max(cnt, 1U);
+	bbr->bw_probe_up_cnt = cnt;
+}
+
+/* In BBR_BW_PROBE_UP, not seeing high loss/ECN/queue, so raise inflight_hi. */
+static void bbr_probe_inflight_hi_upward(struct sock *sk,
+					  const struct rate_sample *rs)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+	u32 delta;
+
+	if (!tp->is_cwnd_limited || tcp_snd_cwnd(tp) < bbr->inflight_hi)
+		return;  /* not fully using inflight_hi, so don't grow it */
+
+	/* For each bw_probe_up_cnt packets ACKed, increase inflight_hi by 1. */
+	bbr->bw_probe_up_acks += rs->acked_sacked;
+	if (bbr->bw_probe_up_acks >=  bbr->bw_probe_up_cnt) {
+		delta = bbr->bw_probe_up_acks / bbr->bw_probe_up_cnt;
+		bbr->bw_probe_up_acks -= delta * bbr->bw_probe_up_cnt;
+		bbr->inflight_hi += delta;
+		bbr->try_fast_path = 0;  /* Need to update cwnd */
+	}
+
+	if (bbr->round_start)
+		bbr_raise_inflight_hi_slope(sk);
+}
+
+/* Does loss/ECN rate for this sample say inflight is "too high"?
+ * This is used by both the bbr_check_loss_too_high_in_startup() function,
+ * which can be used in either v1 or v2, and the PROBE_UP phase of v2, which
+ * uses it to notice when loss/ECN rates suggest inflight is too high.
+ */
+static bool bbr_is_inflight_too_high(const struct sock *sk,
+				      const struct rate_sample *rs)
+{
+	const struct bbr *bbr = inet_csk_ca(sk);
+	u32 loss_thresh, ecn_thresh;
 
-	bbr->full_bw = 0;   /* spurious slow-down; reset full pipe detection */
+	if (rs->lost > 0 && rs->tx_in_flight) {
+		loss_thresh = (u64)rs->tx_in_flight * bbr_param(sk, loss_thresh) >>
+				BBR_SCALE;
+		if (rs->lost > loss_thresh) {
+			return true;
+		}
+	}
+
+	if (rs->delivered_ce > 0 && rs->delivered > 0 &&
+	    bbr->ecn_eligible && bbr_param(sk, ecn_thresh)) {
+		ecn_thresh = (u64)rs->delivered * bbr_param(sk, ecn_thresh) >>
+				BBR_SCALE;
+		if (rs->delivered_ce > ecn_thresh) {
+			return true;
+		}
+	}
+
+	return false;
+}
+
+/* Calculate the tx_in_flight level that corresponded to excessive loss.
+ * We find "lost_prefix" segs of the skb where loss rate went too high,
+ * by solving for "lost_prefix" in the following equation:
+ *   lost                     /  inflight                     >= loss_thresh
+ *  (lost_prev + lost_prefix) / (inflight_prev + lost_prefix) >= loss_thresh
+ * Then we take that equation, convert it to fixed point, and
+ * round up to the nearest packet.
+ */
+static u32 bbr_inflight_hi_from_lost_skb(const struct sock *sk,
+					  const struct rate_sample *rs,
+					  const struct sk_buff *skb)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	u32 loss_thresh  = bbr_param(sk, loss_thresh);
+	u32 pcount, divisor, inflight_hi;
+	s32 inflight_prev, lost_prev;
+	u64 loss_budget, lost_prefix;
+
+	pcount = tcp_skb_pcount(skb);
+
+	/* How much data was in flight before this skb? */
+	inflight_prev = rs->tx_in_flight - pcount;
+	if (inflight_prev < 0) {
+		WARN_ONCE(tcp_skb_tx_in_flight_is_suspicious(
+				  pcount,
+				  TCP_SKB_CB(skb)->sacked,
+				  rs->tx_in_flight),
+			  "tx_in_flight: %u pcount: %u reneg: %u",
+			  rs->tx_in_flight, pcount, tcp_sk(sk)->is_sack_reneg);
+		return ~0U;
+	}
+
+	/* How much inflight data was marked lost before this skb? */
+	lost_prev = rs->lost - pcount;
+	if (WARN_ONCE(lost_prev < 0,
+		      "cwnd: %u ca: %d out: %u lost: %u pif: %u "
+		      "tx_in_flight: %u tx.lost: %u tp->lost: %u rs->lost: %d "
+		      "lost_prev: %d pcount: %d seq: %u end_seq: %u reneg: %u",
+		      tcp_snd_cwnd(tp), inet_csk(sk)->icsk_ca_state,
+		      tp->packets_out, tp->lost_out, tcp_packets_in_flight(tp),
+		      rs->tx_in_flight, TCP_SKB_CB(skb)->tx.lost, tp->lost,
+		      rs->lost, lost_prev, pcount,
+		      TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
+		      tp->is_sack_reneg))
+		return ~0U;
+
+	/* At what prefix of this lost skb did losss rate exceed loss_thresh? */
+	loss_budget = (u64)inflight_prev * loss_thresh + BBR_UNIT - 1;
+	loss_budget >>= BBR_SCALE;
+	if (lost_prev >= loss_budget) {
+		lost_prefix = 0;   /* previous losses crossed loss_thresh */
+	} else {
+		lost_prefix = loss_budget - lost_prev;
+		lost_prefix <<= BBR_SCALE;
+		divisor = BBR_UNIT - loss_thresh;
+		if (WARN_ON_ONCE(!divisor))  /* loss_thresh is 8 bits */
+			return ~0U;
+		do_div(lost_prefix, divisor);
+	}
+
+	inflight_hi = inflight_prev + lost_prefix;
+	return inflight_hi;
+}
+
+/* If loss/ECN rates during probing indicated we may have overfilled a
+ * buffer, return an operating point that tries to leave unutilized headroom in
+ * the path for other flows, for fairness convergence and lower RTTs and loss.
+ */
+static u32 bbr_inflight_with_headroom(const struct sock *sk)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+	u32 headroom, headroom_fraction;
+
+	if (bbr->inflight_hi == ~0U)
+		return ~0U;
+
+	headroom_fraction = bbr_param(sk, inflight_headroom);
+	headroom = ((u64)bbr->inflight_hi * headroom_fraction) >> BBR_SCALE;
+	headroom = max(headroom, 1U);
+	return max_t(s32, bbr->inflight_hi - headroom,
+		     bbr_param(sk, cwnd_min_target));
+}
+
+/* Bound cwnd to a sensible level, based on our current probing state
+ * machine phase and model of a good inflight level (inflight_lo, inflight_hi).
+ */
+static void bbr_bound_cwnd_for_inflight_model(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+	u32 cap;
+
+	/* tcp_rcv_synsent_state_process() currently calls tcp_ack()
+	 * and thus cong_control() without first initializing us(!).
+	 */
+	if (!bbr->initialized)
+		return;
+
+	cap = ~0U;
+	if (bbr->mode == BBR_PROBE_BW &&
+	    bbr->cycle_idx != BBR_BW_PROBE_CRUISE) {
+		/* Probe to see if more packets fit in the path. */
+		cap = bbr->inflight_hi;
+	} else {
+		if (bbr->mode == BBR_PROBE_RTT ||
+		    (bbr->mode == BBR_PROBE_BW &&
+		     bbr->cycle_idx == BBR_BW_PROBE_CRUISE))
+			cap = bbr_inflight_with_headroom(sk);
+	}
+	/* Adapt to any loss/ECN since our last bw probe. */
+	cap = min(cap, bbr->inflight_lo);
+
+	cap = max_t(u32, cap, bbr_param(sk, cwnd_min_target));
+	tcp_snd_cwnd_set(tp, min(cap, tcp_snd_cwnd(tp)));
+}
+
+/* How should we multiplicatively cut bw or inflight limits based on ECN? */
+u32 bbr_ecn_cut(struct sock *sk)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	return BBR_UNIT -
+		((bbr->ecn_alpha * bbr_param(sk, ecn_factor)) >> BBR_SCALE);
+}
+
+/* Init lower bounds if have not inited yet. */
+static void bbr_init_lower_bounds(struct sock *sk, bool init_bw)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	if (init_bw && bbr->bw_lo == ~0U)
+		bbr->bw_lo = bbr_max_bw(sk);
+	if (bbr->inflight_lo == ~0U)
+		bbr->inflight_lo = tcp_snd_cwnd(tp);
+}
+
+/* Reduce bw and inflight to (1 - beta). */
+static void bbr_loss_lower_bounds(struct sock *sk, u32 *bw, u32 *inflight)
+{
+	struct bbr* bbr = inet_csk_ca(sk);
+	u32 loss_cut = BBR_UNIT - bbr_param(sk, beta);
+
+	*bw = max_t(u32, bbr->bw_latest,
+		    (u64)bbr->bw_lo * loss_cut >> BBR_SCALE);
+	*inflight = max_t(u32, bbr->inflight_latest,
+			  (u64)bbr->inflight_lo * loss_cut >> BBR_SCALE);
+}
+
+/* Reduce inflight to (1 - alpha*ecn_factor). */
+static void bbr_ecn_lower_bounds(struct sock *sk, u32 *inflight)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+	u32 ecn_cut = bbr_ecn_cut(sk);
+
+	*inflight = (u64)bbr->inflight_lo * ecn_cut >> BBR_SCALE;
+}
+
+/* Estimate a short-term lower bound on the capacity available now, based
+ * on measurements of the current delivery process and recent history. When we
+ * are seeing loss/ECN at times when we are not probing bw, then conservatively
+ * move toward flow balance by multiplicatively cutting our short-term
+ * estimated safe rate and volume of data (bw_lo and inflight_lo). We use a
+ * multiplicative decrease in order to converge to a lower capacity in time
+ * logarithmic in the magnitude of the decrease.
+ *
+ * However, we do not cut our short-term estimates lower than the current rate
+ * and volume of delivered data from this round trip, since from the current
+ * delivery process we can estimate the measured capacity available now.
+ *
+ * Anything faster than that approach would knowingly risk high loss, which can
+ * cause low bw for Reno/CUBIC and high loss recovery latency for
+ * request/response flows using any congestion control.
+ */
+static void bbr_adapt_lower_bounds(struct sock *sk,
+				    const struct rate_sample *rs)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+	u32 ecn_inflight_lo = ~0U;
+
+	/* We only use lower-bound estimates when not probing bw.
+	 * When probing we need to push inflight higher to probe bw.
+	 */
+	if (bbr_is_probing_bandwidth(sk))
+		return;
+
+	/* ECN response. */
+	if (bbr->ecn_in_round && bbr_param(sk, ecn_factor)) {
+		bbr_init_lower_bounds(sk, false);
+		bbr_ecn_lower_bounds(sk, &ecn_inflight_lo);
+	}
+
+	/* Loss response. */
+	if (bbr->loss_in_round) {
+		bbr_init_lower_bounds(sk, true);
+		bbr_loss_lower_bounds(sk, &bbr->bw_lo, &bbr->inflight_lo);
+	}
+
+	/* Adjust to the lower of the levels implied by loss/ECN. */
+	bbr->inflight_lo = min(bbr->inflight_lo, ecn_inflight_lo);
+	bbr->bw_lo = max(1U, bbr->bw_lo);
+}
+
+/* Reset any short-term lower-bound adaptation to congestion, so that we can
+ * push our inflight up.
+ */
+static void bbr_reset_lower_bounds(struct sock *sk)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	bbr->bw_lo = ~0U;
+	bbr->inflight_lo = ~0U;
+}
+
+/* After bw probing (STARTUP/PROBE_UP), reset signals before entering a state
+ * machine phase where we adapt our lower bound based on congestion signals.
+ */
+static void bbr_reset_congestion_signals(struct sock *sk)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	bbr->loss_in_round = 0;
+	bbr->ecn_in_round = 0;
+	bbr->loss_in_cycle = 0;
+	bbr->ecn_in_cycle = 0;
+	bbr->bw_latest = 0;
+	bbr->inflight_latest = 0;
+}
+
+static void bbr_exit_loss_recovery(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp), bbr->prior_cwnd));
+	bbr->try_fast_path = 0; /* bound cwnd using latest model */
+}
+
+/* Update rate and volume of delivered data from latest round trip. */
+static void bbr_update_latest_delivery_signals(
+	struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	bbr->loss_round_start = 0;
+	if (rs->interval_us <= 0 || !rs->acked_sacked)
+		return; /* Not a valid observation */
+
+	bbr->bw_latest       = max_t(u32, bbr->bw_latest,       ctx->sample_bw);
+	bbr->inflight_latest = max_t(u32, bbr->inflight_latest, rs->delivered);
+
+	if (!before(rs->prior_delivered, bbr->loss_round_delivered)) {
+		bbr->loss_round_delivered = tp->delivered;
+		bbr->loss_round_start = 1;  /* mark start of new round trip */
+	}
+}
+
+/* Once per round, reset filter for latest rate and volume of delivered data. */
+static void bbr_advance_latest_delivery_signals(
+	struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	/* If ACK matches a TLP retransmit, persist the filter. If we detect
+	 * that a TLP retransmit plugged a tail loss, we'll want to remember
+	 * how much data the path delivered before the tail loss.
+	 */
+	if (bbr->loss_round_start && !rs->is_acking_tlp_retrans_seq) {
+		bbr->bw_latest = ctx->sample_bw;
+		bbr->inflight_latest = rs->delivered;
+	}
+}
+
+/* Update (most of) our congestion signals: track the recent rate and volume of
+ * delivered data, presence of loss, and EWMA degree of ECN marking.
+ */
+static void bbr_update_congestion_signals(
+	struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+	u64 bw;
+
+	if (rs->interval_us <= 0 || !rs->acked_sacked)
+		return; /* Not a valid observation */
+	bw = ctx->sample_bw;
+
+	if (!rs->is_app_limited || bw >= bbr_max_bw(sk))
+		bbr_take_max_bw_sample(sk, bw);
+
+	bbr->loss_in_round |= (rs->losses > 0);
+
+	if (!bbr->loss_round_start)
+		return;		/* skip the per-round-trip updates */
+	/* Now do per-round-trip updates. */
+	bbr_adapt_lower_bounds(sk, rs);
+
+	bbr->loss_in_round = 0;
+	bbr->ecn_in_round  = 0;
+}
+
+/* Bandwidth probing can cause loss. To help coexistence with loss-based
+ * congestion control we spread out our probing in a Reno-conscious way. Due to
+ * the shape of the Reno sawtooth, the time required between loss epochs for an
+ * idealized Reno flow is a number of round trips that is the BDP of that
+ * flow. We count packet-timed round trips directly, since measured RTT can
+ * vary widely, and Reno is driven by packet-timed round trips.
+ */
+static bool bbr_is_reno_coexistence_probe_time(struct sock *sk)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+	u32 rounds;
+
+	/* Random loss can shave some small percentage off of our inflight
+	 * in each round. To survive this, flows need robust periodic probes.
+	 */
+	rounds = min_t(u32, bbr_param(sk, bw_probe_max_rounds), bbr_target_inflight(sk));
+	return bbr->rounds_since_probe >= rounds;
+}
+
+/* How long do we want to wait before probing for bandwidth (and risking
+ * loss)? We randomize the wait, for better mixing and fairness convergence.
+ *
+ * We bound the Reno-coexistence inter-bw-probe time to be 62-63 round trips.
+ * This is calculated to allow fairness with a 25Mbps, 30ms Reno flow,
+ * (eg 4K video to a broadband user):
+ *   BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets
+ *
+ * We bound the BBR-native inter-bw-probe wall clock time to be:
+ *  (a) higher than 2 sec: to try to avoid causing loss for a long enough time
+ *      to allow Reno at 30ms to get 4K video bw, the inter-bw-probe time must
+ *      be at least: 25Mbps * .030sec / (1514bytes) * 0.030sec = 1.9secs
+ *  (b) lower than 3 sec: to ensure flows can start probing in a reasonable
+ *      amount of time to discover unutilized bw on human-scale interactive
+ *      time-scales (e.g. perhaps traffic from a web page download that we
+ *      were competing with is now complete).
+ */
+static void bbr_pick_probe_wait(struct sock *sk)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	/* Decide the random round-trip bound for wait until probe: */
+	bbr->rounds_since_probe =
+		get_random_u32_below(bbr_param(sk, bw_probe_rand_rounds));
+	/* Decide the random wall clock bound for wait until probe: */
+	bbr->probe_wait_us = bbr_param(sk, bw_probe_base_us) +
+			     get_random_u32_below(bbr_param(sk, bw_probe_rand_us));
+}
+
+static void bbr_set_cycle_idx(struct sock *sk, int cycle_idx)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	bbr->cycle_idx = cycle_idx;
+	/* New phase, so need to update cwnd and pacing rate. */
+	bbr->try_fast_path = 0;
+}
+
+/* Send at estimated bw to fill the pipe, but not queue. We need this phase
+ * before PROBE_UP, because as soon as we send faster than the available bw
+ * we will start building a queue, and if the buffer is shallow we can cause
+ * loss. If we do not fill the pipe before we cause this loss, our bw_hi and
+ * inflight_hi estimates will underestimate.
+ */
+static void bbr_start_bw_probe_refill(struct sock *sk, u32 bw_probe_up_rounds)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	bbr_reset_lower_bounds(sk);
+	bbr->bw_probe_up_rounds = bw_probe_up_rounds;
+	bbr->bw_probe_up_acks = 0;
+	bbr->stopped_risky_probe = 0;
+	bbr->ack_phase = BBR_ACKS_REFILLING;
+	bbr->next_rtt_delivered = tp->delivered;
+	bbr_set_cycle_idx(sk, BBR_BW_PROBE_REFILL);
+}
+
+/* Now probe max deliverable data rate and volume. */
+static void bbr_start_bw_probe_up(struct sock *sk, struct bbr_context *ctx)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	bbr->ack_phase = BBR_ACKS_PROBE_STARTING;
+	bbr->next_rtt_delivered = tp->delivered;
+	bbr->cycle_mstamp = tp->tcp_mstamp;
+	bbr_reset_full_bw(sk);
+	bbr->full_bw = ctx->sample_bw;
+	bbr_set_cycle_idx(sk, BBR_BW_PROBE_UP);
+	bbr_raise_inflight_hi_slope(sk);
+}
+
+/* Start a new PROBE_BW probing cycle of some wall clock length. Pick a wall
+ * clock time at which to probe beyond an inflight that we think to be
+ * safe. This will knowingly risk packet loss, so we want to do this rarely, to
+ * keep packet loss rates low. Also start a round-trip counter, to probe faster
+ * if we estimate a Reno flow at our BDP would probe faster.
+ */
+static void bbr_start_bw_probe_down(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	bbr_reset_congestion_signals(sk);
+	bbr->bw_probe_up_cnt = ~0U;     /* not growing inflight_hi any more */
+	bbr_pick_probe_wait(sk);
+	bbr->cycle_mstamp = tp->tcp_mstamp;		/* start wall clock */
+	bbr->ack_phase = BBR_ACKS_PROBE_STOPPING;
+	bbr->next_rtt_delivered = tp->delivered;
+	bbr_set_cycle_idx(sk, BBR_BW_PROBE_DOWN);
+}
+
+/* Cruise: maintain what we estimate to be a neutral, conservative
+ * operating point, without attempting to probe up for bandwidth or down for
+ * RTT, and only reducing inflight in response to loss/ECN signals.
+ */
+static void bbr_start_bw_probe_cruise(struct sock *sk)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	if (bbr->inflight_lo != ~0U)
+		bbr->inflight_lo = min(bbr->inflight_lo, bbr->inflight_hi);
+
+	bbr_set_cycle_idx(sk, BBR_BW_PROBE_CRUISE);
+}
+
+/* Loss and/or ECN rate is too high while probing.
+ * Adapt (once per bw probe) by cutting inflight_hi and then restarting cycle.
+ */
+static void bbr_handle_inflight_too_high(struct sock *sk,
+					  const struct rate_sample *rs)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+	const u32 beta = bbr_param(sk, beta);
+
+	bbr->prev_probe_too_high = 1;
+	bbr->bw_probe_samples = 0;  /* only react once per probe */
+	/* If we are app-limited then we are not robustly
+	 * probing the max volume of inflight data we think
+	 * might be safe (analogous to how app-limited bw
+	 * samples are not known to be robustly probing bw).
+	 */
+	if (!rs->is_app_limited) {
+		bbr->inflight_hi = max_t(u32, rs->tx_in_flight,
+					 (u64)bbr_target_inflight(sk) *
+					 (BBR_UNIT - beta) >> BBR_SCALE);
+	}
+	if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP)
+		bbr_start_bw_probe_down(sk);
+}
+
+/* If we're seeing bw and loss samples reflecting our bw probing, adapt
+ * using the signals we see. If loss or ECN mark rate gets too high, then adapt
+ * inflight_hi downward. If we're able to push inflight higher without such
+ * signals, push higher: adapt inflight_hi upward.
+ */
+static bool bbr_adapt_upper_bounds(struct sock *sk,
+				    const struct rate_sample *rs,
+				    struct bbr_context *ctx)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	/* Track when we'll see bw/loss samples resulting from our bw probes. */
+	if (bbr->ack_phase == BBR_ACKS_PROBE_STARTING && bbr->round_start)
+		bbr->ack_phase = BBR_ACKS_PROBE_FEEDBACK;
+	if (bbr->ack_phase == BBR_ACKS_PROBE_STOPPING && bbr->round_start) {
+		/* End of samples from bw probing phase. */
+		bbr->bw_probe_samples = 0;
+		bbr->ack_phase = BBR_ACKS_INIT;
+		/* At this point in the cycle, our current bw sample is also
+		 * our best recent chance at finding the highest available bw
+		 * for this flow. So now is the best time to forget the bw
+		 * samples from the previous cycle, by advancing the window.
+		 */
+		if (bbr->mode == BBR_PROBE_BW && !rs->is_app_limited)
+			bbr_advance_max_bw_filter(sk);
+		/* If we had an inflight_hi, then probed and pushed inflight all
+		 * the way up to hit that inflight_hi without seeing any
+		 * high loss/ECN in all the resulting ACKs from that probing,
+		 * then probe up again, this time letting inflight persist at
+		 * inflight_hi for a round trip, then accelerating beyond.
+		 */
+		if (bbr->mode == BBR_PROBE_BW &&
+		    bbr->stopped_risky_probe && !bbr->prev_probe_too_high) {
+			bbr_start_bw_probe_refill(sk, 0);
+			return true;  /* yes, decided state transition */
+		}
+	}
+	if (bbr_is_inflight_too_high(sk, rs)) {
+		if (bbr->bw_probe_samples)  /*  sample is from bw probing? */
+			bbr_handle_inflight_too_high(sk, rs);
+	} else {
+		/* Loss/ECN rate is declared safe. Adjust upper bound upward. */
+
+		if (bbr->inflight_hi == ~0U)
+			return false;   /* no excess queue signals yet */
+
+		/* To be resilient to random loss, we must raise bw/inflight_hi
+		 * if we observe in any phase that a higher level is safe.
+		 */
+		if (rs->tx_in_flight > bbr->inflight_hi) {
+			bbr->inflight_hi = rs->tx_in_flight;
+		}
+
+		if (bbr->mode == BBR_PROBE_BW &&
+		    bbr->cycle_idx == BBR_BW_PROBE_UP)
+			bbr_probe_inflight_hi_upward(sk, rs);
+	}
+
+	return false;
+}
+
+/* Check if it's time to probe for bandwidth now, and if so, kick it off. */
+static bool bbr_check_time_to_probe_bw(struct sock *sk,
+					const struct rate_sample *rs)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+	u32 n;
+
+	/* If we seem to be at an operating point where we are not seeing loss
+	 * but we are seeing ECN marks, then when the ECN marks cease we reprobe
+	 * quickly (in case cross-traffic has ceased and freed up bw).
+	 */
+	if (bbr_param(sk, ecn_reprobe_gain) && bbr->ecn_eligible &&
+	    bbr->ecn_in_cycle && !bbr->loss_in_cycle &&
+	    inet_csk(sk)->icsk_ca_state == TCP_CA_Open) {
+		/* Calculate n so that when bbr_raise_inflight_hi_slope()
+		 * computes growth_this_round as 2^n it will be roughly the
+		 * desired volume of data (inflight_hi*ecn_reprobe_gain).
+		 */
+		n = ilog2((((u64)bbr->inflight_hi *
+			    bbr_param(sk, ecn_reprobe_gain)) >> BBR_SCALE));
+		bbr_start_bw_probe_refill(sk, n);
+		return true;
+	}
+
+	if (bbr_has_elapsed_in_phase(sk, bbr->probe_wait_us) ||
+	    bbr_is_reno_coexistence_probe_time(sk)) {
+		bbr_start_bw_probe_refill(sk, 0);
+		return true;
+	}
+	return false;
+}
+
+/* Is it time to transition from PROBE_DOWN to PROBE_CRUISE? */
+static bool bbr_check_time_to_cruise(struct sock *sk, u32 inflight, u32 bw)
+{
+	/* Always need to pull inflight down to leave headroom in queue. */
+	if (inflight > bbr_inflight_with_headroom(sk))
+		return false;
+
+	return inflight <= bbr_inflight(sk, bw, BBR_UNIT);
+}
+
+/* PROBE_BW state machine: cruise, refill, probe for bw, or drain? */
+static void bbr_update_cycle_phase(struct sock *sk,
+				    const struct rate_sample *rs,
+				    struct bbr_context *ctx)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+	bool is_bw_probe_done = false;
+	u32 inflight, bw;
+
+	if (!bbr_full_bw_reached(sk))
+		return;
+
+	/* In DRAIN, PROBE_BW, or PROBE_RTT, adjust upper bounds. */
+	if (bbr_adapt_upper_bounds(sk, rs, ctx))
+		return;		/* already decided state transition */
+
+	if (bbr->mode != BBR_PROBE_BW)
+		return;
+
+	inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight);
+	bw = bbr_max_bw(sk);
+
+	switch (bbr->cycle_idx) {
+	/* First we spend most of our time cruising with a pacing_gain of 1.0,
+	 * which paces at the estimated bw, to try to fully use the pipe
+	 * without building queue. If we encounter loss/ECN marks, we adapt
+	 * by slowing down.
+	 */
+	case BBR_BW_PROBE_CRUISE:
+		if (bbr_check_time_to_probe_bw(sk, rs))
+			return;		/* already decided state transition */
+		break;
+
+	/* After cruising, when it's time to probe, we first "refill": we send
+	 * at the estimated bw to fill the pipe, before probing higher and
+	 * knowingly risking overflowing the bottleneck buffer (causing loss).
+	 */
+	case BBR_BW_PROBE_REFILL:
+		if (bbr->round_start) {
+			/* After one full round trip of sending in REFILL, we
+			 * start to see bw samples reflecting our REFILL, which
+			 * may be putting too much data in flight.
+			 */
+			bbr->bw_probe_samples = 1;
+			bbr_start_bw_probe_up(sk, ctx);
+		}
+		break;
+
+	/* After we refill the pipe, we probe by using a pacing_gain > 1.0, to
+	 * probe for bw. If we have not seen loss/ECN, we try to raise inflight
+	 * to at least pacing_gain*BDP; note that this may take more than
+	 * min_rtt if min_rtt is small (e.g. on a LAN).
+	 *
+	 * We terminate PROBE_UP bandwidth probing upon any of the following:
+	 *
+	 * (1) We've pushed inflight up to hit the inflight_hi target set in the
+	 *     most recent previous bw probe phase. Thus we want to start
+	 *     draining the queue immediately because it's very likely the most
+	 *     recently sent packets will fill the queue and cause drops.
+	 * (2) If inflight_hi has not limited bandwidth growth recently, and
+	 *     yet delivered bandwidth has not increased much recently
+	 *     (bbr->full_bw_now).
+	 * (3) Loss filter says loss rate is "too high".
+	 * (4) ECN filter says ECN mark rate is "too high".
+	 *
+	 * (1) (2) checked here, (3) (4) checked in bbr_is_inflight_too_high()
+	 */
+	case BBR_BW_PROBE_UP:
+		if (bbr->prev_probe_too_high &&
+		    inflight >= bbr->inflight_hi) {
+			bbr->stopped_risky_probe = 1;
+			is_bw_probe_done = true;
+		} else {
+			if (tp->is_cwnd_limited &&
+			    tcp_snd_cwnd(tp) >= bbr->inflight_hi) {
+				/* inflight_hi is limiting bw growth */
+				bbr_reset_full_bw(sk);
+				bbr->full_bw = ctx->sample_bw;
+			} else if (bbr->full_bw_now) {
+				/* Plateau in estimated bw. Pipe looks full. */
+				is_bw_probe_done = true;
+			}
+		}
+		if (is_bw_probe_done) {
+			bbr->prev_probe_too_high = 0;  /* no loss/ECN (yet) */
+			bbr_start_bw_probe_down(sk);  /* restart w/ down */
+		}
+		break;
+
+	/* After probing in PROBE_UP, we have usually accumulated some data in
+	 * the bottleneck buffer (if bw probing didn't find more bw). We next
+	 * enter PROBE_DOWN to try to drain any excess data from the queue. To
+	 * do this, we use a pacing_gain < 1.0. We hold this pacing gain until
+	 * our inflight is less then that target cruising point, which is the
+	 * minimum of (a) the amount needed to leave headroom, and (b) the
+	 * estimated BDP. Once inflight falls to match the target, we estimate
+	 * the queue is drained; persisting would underutilize the pipe.
+	 */
+	case BBR_BW_PROBE_DOWN:
+		if (bbr_check_time_to_probe_bw(sk, rs))
+			return;		/* already decided state transition */
+		if (bbr_check_time_to_cruise(sk, inflight, bw))
+			bbr_start_bw_probe_cruise(sk);
+		break;
+
+	default:
+		WARN_ONCE(1, "BBR invalid cycle index %u\n", bbr->cycle_idx);
+	}
+}
+
+/* Exiting PROBE_RTT, so return to bandwidth probing in STARTUP or PROBE_BW. */
+static void bbr_exit_probe_rtt(struct sock *sk)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	bbr_reset_lower_bounds(sk);
+	if (bbr_full_bw_reached(sk)) {
+		bbr->mode = BBR_PROBE_BW;
+		/* Raising inflight after PROBE_RTT may cause loss, so reset
+		 * the PROBE_BW clock and schedule the next bandwidth probe for
+		 * a friendly and randomized future point in time.
+		 */
+		bbr_start_bw_probe_down(sk);
+		/* Since we are exiting PROBE_RTT, we know inflight is
+		 * below our estimated BDP, so it is reasonable to cruise.
+		 */
+		bbr_start_bw_probe_cruise(sk);
+	} else {
+		bbr->mode = BBR_STARTUP;
+	}
+}
+
+/* Exit STARTUP based on loss rate > 1% and loss gaps in round >= N. Wait until
+ * the end of the round in recovery to get a good estimate of how many packets
+ * have been lost, and how many we need to drain with a low pacing rate.
+ */
+static void bbr_check_loss_too_high_in_startup(struct sock *sk,
+						const struct rate_sample *rs)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	if (bbr_full_bw_reached(sk))
+		return;
+
+	/* For STARTUP exit, check the loss rate at the end of each round trip
+	 * of Recovery episodes in STARTUP. We check the loss rate at the end
+	 * of the round trip to filter out noisy/low loss and have a better
+	 * sense of inflight (extent of loss), so we can drain more accurately.
+	 */
+	if (rs->losses && bbr->loss_events_in_round < 0xf)
+		bbr->loss_events_in_round++;  /* update saturating counter */
+	if (bbr_param(sk, full_loss_cnt) && bbr->loss_round_start &&
+	    inet_csk(sk)->icsk_ca_state == TCP_CA_Recovery &&
+	    bbr->loss_events_in_round >= bbr_param(sk, full_loss_cnt) &&
+	    bbr_is_inflight_too_high(sk, rs)) {
+		bbr_handle_queue_too_high_in_startup(sk);
+		return;
+	}
+	if (bbr->loss_round_start)
+		bbr->loss_events_in_round = 0;
+}
+
+/* Estimate when the pipe is full, using the change in delivery rate: BBR
+ * estimates bw probing filled the pipe if the estimated bw hasn't changed by
+ * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited
+ * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the
+ * higher rwin, 3: we get higher delivery rate samples. Or transient
+ * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar
+ * design goal, but uses delay and inter-ACK spacing instead of bandwidth.
+ */
+static void bbr_check_full_bw_reached(struct sock *sk,
+				       const struct rate_sample *rs,
+				       struct bbr_context *ctx)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+	u32 bw_thresh, full_cnt, thresh;
+
+	if (bbr->full_bw_now || rs->is_app_limited)
+		return;
+
+	thresh = bbr_param(sk, full_bw_thresh);
+	full_cnt = bbr_param(sk, full_bw_cnt);
+	bw_thresh = (u64)bbr->full_bw * thresh >> BBR_SCALE;
+	if (ctx->sample_bw >= bw_thresh) {
+		bbr_reset_full_bw(sk);
+		bbr->full_bw = ctx->sample_bw;
+		return;
+	}
+	if (!bbr->round_start)
+		return;
+	++bbr->full_bw_cnt;
+	bbr->full_bw_now = bbr->full_bw_cnt >= full_cnt;
+	bbr->full_bw_reached |= bbr->full_bw_now;
+}
+
+/* If pipe is probably full, drain the queue and then enter steady-state. */
+static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs,
+			    struct bbr_context *ctx)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) {
+		bbr->mode = BBR_DRAIN;	/* drain queue we created */
+		/* Set ssthresh to export purely for monitoring, to signal
+		 * completion of initial STARTUP by setting to a non-
+		 * TCP_INFINITE_SSTHRESH value (ssthresh is not used by BBR).
+		 */
+		tcp_sk(sk)->snd_ssthresh =
+				bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT);
+		bbr_reset_congestion_signals(sk);
+	}	/* fall through to check if in-flight is already small: */
+	if (bbr->mode == BBR_DRAIN &&
+	    bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <=
+	    bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT)) {
+		bbr->mode = BBR_PROBE_BW;
+		bbr_start_bw_probe_down(sk);
+	}
+}
+
+static void bbr_update_model(struct sock *sk, const struct rate_sample *rs,
+			      struct bbr_context *ctx)
+{
+	bbr_update_congestion_signals(sk, rs, ctx);
+	bbr_update_ack_aggregation(sk, rs);
+	bbr_check_loss_too_high_in_startup(sk, rs);
+	bbr_check_full_bw_reached(sk, rs, ctx);
+	bbr_check_drain(sk, rs, ctx);
+	bbr_update_cycle_phase(sk, rs, ctx);
+	bbr_update_min_rtt(sk, rs);
+}
+
+/* Fast path for app-limited case.
+ *
+ * On each ack, we execute bbr state machine, which primarily consists of:
+ * 1) update model based on new rate sample, and
+ * 2) update control based on updated model or state change.
+ *
+ * There are certain workload/scenarios, e.g. app-limited case, where
+ * either we can skip updating model or we can skip update of both model
+ * as well as control. This provides signifcant softirq cpu savings for
+ * processing incoming acks.
+ *
+ * In case of app-limited, if there is no congestion (loss/ecn) and
+ * if observed bw sample is less than current estimated bw, then we can
+ * skip some of the computation in bbr state processing:
+ *
+ * - if there is no rtt/mode/phase change: In this case, since all the
+ *   parameters of the network model are constant, we can skip model
+ *   as well control update.
+ *
+ * - else we can skip rest of the model update. But we still need to
+ *   update the control to account for the new rtt/mode/phase.
+ *
+ * Returns whether we can take fast path or not.
+ */
+static bool bbr_run_fast_path(struct sock *sk, bool *update_model,
+		const struct rate_sample *rs, struct bbr_context *ctx)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+	u32 prev_min_rtt_us, prev_mode;
+
+	if (bbr_param(sk, fast_path) && bbr->try_fast_path &&
+	    rs->is_app_limited && ctx->sample_bw < bbr_max_bw(sk) &&
+	    !bbr->loss_in_round && !bbr->ecn_in_round ) {
+		prev_mode = bbr->mode;
+		prev_min_rtt_us = bbr->min_rtt_us;
+		bbr_check_drain(sk, rs, ctx);
+		bbr_update_cycle_phase(sk, rs, ctx);
+		bbr_update_min_rtt(sk, rs);
+
+		if (bbr->mode == prev_mode &&
+		    bbr->min_rtt_us == prev_min_rtt_us &&
+		    bbr->try_fast_path) {
+			return true;
+		}
+
+		/* Skip model update, but control still needs to be updated */
+		*update_model = false;
+	}
+	return false;
+}
+
+__bpf_kfunc void bbr_main(struct sock *sk, const struct rate_sample *rs)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+	struct bbr_context ctx = { 0 };
+	bool update_model = true;
+	u32 bw, round_delivered;
+	int ce_ratio = -1;
+
+	round_delivered = bbr_update_round_start(sk, rs, &ctx);
+	if (bbr->round_start) {
+		bbr->rounds_since_probe =
+			min_t(s32, bbr->rounds_since_probe + 1, 0xFF);
+		ce_ratio = bbr_update_ecn_alpha(sk);
+	}
+	bbr_plb(sk, rs, ce_ratio);
+
+	bbr->ecn_in_round  |= (bbr->ecn_eligible && rs->is_ece);
+	bbr_calculate_bw_sample(sk, rs, &ctx);
+	bbr_update_latest_delivery_signals(sk, rs, &ctx);
+
+	if (bbr_run_fast_path(sk, &update_model, rs, &ctx))
+		goto out;
+
+	if (update_model)
+		bbr_update_model(sk, rs, &ctx);
+
+	bbr_update_gains(sk);
+	bw = bbr_bw(sk);
+	bbr_set_pacing_rate(sk, bw, bbr->pacing_gain);
+	bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain,
+		     tcp_snd_cwnd(tp), &ctx);
+	bbr_bound_cwnd_for_inflight_model(sk);
+
+out:
+	bbr_advance_latest_delivery_signals(sk, rs, &ctx);
+	bbr->prev_ca_state = inet_csk(sk)->icsk_ca_state;
+	bbr->loss_in_cycle |= rs->lost > 0;
+	bbr->ecn_in_cycle  |= rs->delivered_ce > 0;
+}
+
+__bpf_kfunc static void bbr_init(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	bbr->initialized = 1;
+
+	bbr->init_cwnd = min(0x7FU, tcp_snd_cwnd(tp));
+	bbr->prior_cwnd = tp->prior_cwnd;
+	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
+	bbr->next_rtt_delivered = tp->delivered;
+	bbr->prev_ca_state = TCP_CA_Open;
+
+	bbr->probe_rtt_done_stamp = 0;
+	bbr->probe_rtt_round_done = 0;
+	bbr->probe_rtt_min_us = tcp_min_rtt(tp);
+	bbr->probe_rtt_min_stamp = tcp_jiffies32;
+	bbr->min_rtt_us = tcp_min_rtt(tp);
+	bbr->min_rtt_stamp = tcp_jiffies32;
+
+	bbr->has_seen_rtt = 0;
+	bbr_init_pacing_rate_from_rtt(sk);
+
+	bbr->round_start = 0;
+	bbr->idle_restart = 0;
+	bbr->full_bw_reached = 0;
+	bbr->full_bw = 0;
 	bbr->full_bw_cnt = 0;
-	bbr_reset_lt_bw_sampling(sk);
-	return tcp_snd_cwnd(tcp_sk(sk));
+	bbr->cycle_mstamp = 0;
+	bbr->cycle_idx = 0;
+
+	bbr_reset_startup_mode(sk);
+
+	bbr->ack_epoch_mstamp = tp->tcp_mstamp;
+	bbr->ack_epoch_acked = 0;
+	bbr->extra_acked_win_rtts = 0;
+	bbr->extra_acked_win_idx = 0;
+	bbr->extra_acked[0] = 0;
+	bbr->extra_acked[1] = 0;
+
+	bbr->ce_state = 0;
+	bbr->prior_rcv_nxt = tp->rcv_nxt;
+	bbr->try_fast_path = 0;
+
+	cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED);
+
+	/* Start sampling ECN mark rate after first full flight is ACKed: */
+	bbr->loss_round_delivered = tp->delivered + 1;
+	bbr->loss_round_start = 0;
+	bbr->undo_bw_lo = 0;
+	bbr->undo_inflight_lo = 0;
+	bbr->undo_inflight_hi = 0;
+	bbr->loss_events_in_round = 0;
+	bbr->startup_ecn_rounds = 0;
+	bbr_reset_congestion_signals(sk);
+	bbr->bw_lo = ~0U;
+	bbr->bw_hi[0] = 0;
+	bbr->bw_hi[1] = 0;
+	bbr->inflight_lo = ~0U;
+	bbr->inflight_hi = ~0U;
+	bbr_reset_full_bw(sk);
+	bbr->bw_probe_up_cnt = ~0U;
+	bbr->bw_probe_up_acks = 0;
+	bbr->bw_probe_up_rounds = 0;
+	bbr->probe_wait_us = 0;
+	bbr->stopped_risky_probe = 0;
+	bbr->ack_phase = BBR_ACKS_INIT;
+	bbr->rounds_since_probe = 0;
+	bbr->bw_probe_samples = 0;
+	bbr->prev_probe_too_high = 0;
+	bbr->ecn_eligible = 0;
+	bbr->ecn_alpha = bbr_param(sk, ecn_alpha_init);
+	bbr->alpha_last_delivered = 0;
+	bbr->alpha_last_delivered_ce = 0;
+	bbr->plb.pause_until = 0;
+
+	tp->fast_ack_mode = bbr_fast_ack_mode ? 1 : 0;
+
+	if (bbr_can_use_ecn(sk))
+		tp->ecn_flags |= TCP_ECN_ECT_PERMANENT;
+}
+
+/* BBR marks the current round trip as a loss round. */
+static void bbr_note_loss(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	/* Capture "current" data over the full round trip of loss, to
+	 * have a better chance of observing the full capacity of the path.
+	 */
+	if (!bbr->loss_in_round)  /* first loss in this round trip? */
+		bbr->loss_round_delivered = tp->delivered;  /* set round trip */
+	bbr->loss_in_round = 1;
+	bbr->loss_in_cycle = 1;
 }
 
-/* Entering loss recovery, so save cwnd for when we exit or undo recovery. */
+/* Core TCP stack informs us that the given skb was just marked lost. */
+__bpf_kfunc static void bbr_skb_marked_lost(struct sock *sk,
+					    const struct sk_buff *skb)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+	struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
+	struct rate_sample rs = {};
+
+	bbr_note_loss(sk);
+
+	if (!bbr->bw_probe_samples)
+		return;  /* not an skb sent while probing for bandwidth */
+	if (unlikely(!scb->tx.delivered_mstamp))
+		return;  /* skb was SACKed, reneged, marked lost; ignore it */
+	/* We are probing for bandwidth. Construct a rate sample that
+	 * estimates what happened in the flight leading up to this lost skb,
+	 * then see if the loss rate went too high, and if so at which packet.
+	 */
+	rs.tx_in_flight = scb->tx.in_flight;
+	rs.lost = tp->lost - scb->tx.lost;
+	rs.is_app_limited = scb->tx.is_app_limited;
+	if (bbr_is_inflight_too_high(sk, &rs)) {
+		rs.tx_in_flight = bbr_inflight_hi_from_lost_skb(sk, &rs, skb);
+		bbr_handle_inflight_too_high(sk, &rs);
+	}
+}
+
+static void bbr_run_loss_probe_recovery(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+	struct rate_sample rs = {0};
+
+	bbr_note_loss(sk);
+
+	if (!bbr->bw_probe_samples)
+		return;  /* not sent while probing for bandwidth */
+	/* We are probing for bandwidth. Construct a rate sample that
+	 * estimates what happened in the flight leading up to this
+	 * loss, then see if the loss rate went too high.
+	 */
+	rs.lost = 1;	/* TLP probe repaired loss of a single segment */
+	rs.tx_in_flight = bbr->inflight_latest + rs.lost;
+	rs.is_app_limited = tp->tlp_orig_data_app_limited;
+	if (bbr_is_inflight_too_high(sk, &rs))
+		bbr_handle_inflight_too_high(sk, &rs);
+}
+
+/* Revert short-term model if current loss recovery event was spurious. */
+__bpf_kfunc static u32 bbr_undo_cwnd(struct sock *sk)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	bbr_reset_full_bw(sk); /* spurious slow-down; reset full bw detector */
+	bbr->loss_in_round = 0;
+
+	/* Revert to cwnd and other state saved before loss episode. */
+	bbr->bw_lo = max(bbr->bw_lo, bbr->undo_bw_lo);
+	bbr->inflight_lo = max(bbr->inflight_lo, bbr->undo_inflight_lo);
+	bbr->inflight_hi = max(bbr->inflight_hi, bbr->undo_inflight_hi);
+	bbr->try_fast_path = 0;  /* take slow path to set proper cwnd, pacing */
+	return bbr->prior_cwnd;
+}
+
+/* Entering loss recovery, so save state for when we undo recovery. */
 __bpf_kfunc static u32 bbr_ssthresh(struct sock *sk)
 {
+	struct bbr *bbr = inet_csk_ca(sk);
+
 	bbr_save_cwnd(sk);
+	/* For undo, save state that adapts based on loss signal. */
+	bbr->undo_bw_lo		= bbr->bw_lo;
+	bbr->undo_inflight_lo	= bbr->inflight_lo;
+	bbr->undo_inflight_hi	= bbr->inflight_hi;
 	return tcp_sk(sk)->snd_ssthresh;
 }
 
+static enum tcp_bbr_phase bbr_get_phase(struct bbr *bbr)
+{
+	switch (bbr->mode) {
+	case BBR_STARTUP:
+		return BBR_PHASE_STARTUP;
+	case BBR_DRAIN:
+		return BBR_PHASE_DRAIN;
+	case BBR_PROBE_BW:
+		break;
+	case BBR_PROBE_RTT:
+		return BBR_PHASE_PROBE_RTT;
+	default:
+		return BBR_PHASE_INVALID;
+	}
+	switch (bbr->cycle_idx) {
+	case BBR_BW_PROBE_UP:
+		return BBR_PHASE_PROBE_BW_UP;
+	case BBR_BW_PROBE_DOWN:
+		return BBR_PHASE_PROBE_BW_DOWN;
+	case BBR_BW_PROBE_CRUISE:
+		return BBR_PHASE_PROBE_BW_CRUISE;
+	case BBR_BW_PROBE_REFILL:
+		return BBR_PHASE_PROBE_BW_REFILL;
+	default:
+		return BBR_PHASE_INVALID;
+	}
+}
+
 static size_t bbr_get_info(struct sock *sk, u32 ext, int *attr,
-			   union tcp_cc_info *info)
+			    union tcp_cc_info *info)
 {
 	if (ext & (1 << (INET_DIAG_BBRINFO - 1)) ||
 	    ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
-		struct tcp_sock *tp = tcp_sk(sk);
 		struct bbr *bbr = inet_csk_ca(sk);
-		u64 bw = bbr_bw(sk);
-
-		bw = bw * tp->mss_cache * USEC_PER_SEC >> BW_SCALE;
-		memset(&info->bbr, 0, sizeof(info->bbr));
-		info->bbr.bbr_bw_lo		= (u32)bw;
-		info->bbr.bbr_bw_hi		= (u32)(bw >> 32);
-		info->bbr.bbr_min_rtt		= bbr->min_rtt_us;
-		info->bbr.bbr_pacing_gain	= bbr->pacing_gain;
-		info->bbr.bbr_cwnd_gain		= bbr->cwnd_gain;
+		u64 bw = bbr_bw_bytes_per_sec(sk, bbr_bw(sk));
+		u64 bw_hi = bbr_bw_bytes_per_sec(sk, bbr_max_bw(sk));
+		u64 bw_lo = bbr->bw_lo == ~0U ?
+			~0ULL : bbr_bw_bytes_per_sec(sk, bbr->bw_lo);
+		struct tcp_bbr_info *bbr_info = &info->bbr;
+
+		memset(bbr_info, 0, sizeof(*bbr_info));
+		bbr_info->bbr_bw_lo		= (u32)bw;
+		bbr_info->bbr_bw_hi		= (u32)(bw >> 32);
+		bbr_info->bbr_min_rtt		= bbr->min_rtt_us;
+		bbr_info->bbr_pacing_gain	= bbr->pacing_gain;
+		bbr_info->bbr_cwnd_gain		= bbr->cwnd_gain;
+		bbr_info->bbr_bw_hi_lsb		= (u32)bw_hi;
+		bbr_info->bbr_bw_hi_msb		= (u32)(bw_hi >> 32);
+		bbr_info->bbr_bw_lo_lsb		= (u32)bw_lo;
+		bbr_info->bbr_bw_lo_msb		= (u32)(bw_lo >> 32);
+		bbr_info->bbr_mode		= bbr->mode;
+		bbr_info->bbr_phase		= (__u8)bbr_get_phase(bbr);
+		bbr_info->bbr_version		= (__u8)BBR_VERSION;
+		bbr_info->bbr_inflight_lo	= bbr->inflight_lo;
+		bbr_info->bbr_inflight_hi	= bbr->inflight_hi;
+		bbr_info->bbr_extra_acked	= bbr_extra_acked(sk);
 		*attr = INET_DIAG_BBRINFO;
-		return sizeof(info->bbr);
+		return sizeof(*bbr_info);
 	}
 	return 0;
 }
 
 __bpf_kfunc static void bbr_set_state(struct sock *sk, u8 new_state)
 {
+	struct tcp_sock *tp = tcp_sk(sk);
 	struct bbr *bbr = inet_csk_ca(sk);
 
 	if (new_state == TCP_CA_Loss) {
-		struct rate_sample rs = { .losses = 1 };
 
 		bbr->prev_ca_state = TCP_CA_Loss;
-		bbr->full_bw = 0;
-		bbr->round_start = 1;	/* treat RTO like end of a round */
-		bbr_lt_bw_sampling(sk, &rs);
+		tcp_plb_update_state_upon_rto(sk, &bbr->plb);
+		/* The tcp_write_timeout() call to sk_rethink_txhash() likely
+		 * repathed this flow, so re-learn the min network RTT on the
+		 * new path:
+		 */
+		bbr_reset_full_bw(sk);
+		if (!bbr_is_probing_bandwidth(sk) && bbr->inflight_lo == ~0U) {
+			/* bbr_adapt_lower_bounds() needs cwnd before
+			 * we suffered an RTO, to update inflight_lo:
+			 */
+			bbr->inflight_lo =
+				max(tcp_snd_cwnd(tp), bbr->prior_cwnd);
+		}
+	} else if (bbr->prev_ca_state == TCP_CA_Loss &&
+		   new_state != TCP_CA_Loss) {
+		bbr_exit_loss_recovery(sk);
 	}
 }
 
+
 static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = {
-	.flags		= TCP_CONG_NON_RESTRICTED,
+	.flags		= TCP_CONG_NON_RESTRICTED | TCP_CONG_WANTS_CE_EVENTS,
 	.name		= "bbr",
 	.owner		= THIS_MODULE,
 	.init		= bbr_init,
 	.cong_control	= bbr_main,
 	.sndbuf_expand	= bbr_sndbuf_expand,
+	.skb_marked_lost = bbr_skb_marked_lost,
 	.undo_cwnd	= bbr_undo_cwnd,
 	.cwnd_event	= bbr_cwnd_event,
 	.ssthresh	= bbr_ssthresh,
-	.min_tso_segs	= bbr_min_tso_segs,
+	.tso_segs	= bbr_tso_segs,
 	.get_info	= bbr_get_info,
 	.set_state	= bbr_set_state,
 };
@@ -1160,10 +2361,11 @@ BTF_SET8_START(tcp_bbr_check_kfunc_ids)
 BTF_ID_FLAGS(func, bbr_init)
 BTF_ID_FLAGS(func, bbr_main)
 BTF_ID_FLAGS(func, bbr_sndbuf_expand)
+BTF_ID_FLAGS(func, bbr_skb_marked_lost)
 BTF_ID_FLAGS(func, bbr_undo_cwnd)
 BTF_ID_FLAGS(func, bbr_cwnd_event)
 BTF_ID_FLAGS(func, bbr_ssthresh)
-BTF_ID_FLAGS(func, bbr_min_tso_segs)
+BTF_ID_FLAGS(func, bbr_tso_segs)
 BTF_ID_FLAGS(func, bbr_set_state)
 #endif
 #endif
@@ -1198,5 +2400,12 @@ MODULE_AUTHOR("Van Jacobson <vanj@google.com>");
 MODULE_AUTHOR("Neal Cardwell <ncardwell@google.com>");
 MODULE_AUTHOR("Yuchung Cheng <ycheng@google.com>");
 MODULE_AUTHOR("Soheil Hassas Yeganeh <soheil@google.com>");
+MODULE_AUTHOR("Priyaranjan Jha <priyarjha@google.com>");
+MODULE_AUTHOR("Yousuk Seung <ysseung@google.com>");
+MODULE_AUTHOR("Kevin Yang <yyd@google.com>");
+MODULE_AUTHOR("Arjun Roy <arjunroy@google.com>");
+MODULE_AUTHOR("David Morley <morleyd@google.com>");
+
 MODULE_LICENSE("Dual BSD/GPL");
 MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)");
+MODULE_VERSION(__stringify(BBR_VERSION));
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 1b34050a7538..66d40449b3f4 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -241,6 +241,7 @@ void tcp_init_congestion_control(struct sock *sk)
 	struct inet_connection_sock *icsk = inet_csk(sk);
 
 	tcp_sk(sk)->prior_ssthresh = 0;
+	tcp_sk(sk)->fast_ack_mode = 0;
 	if (icsk->icsk_ca_ops->init)
 		icsk->icsk_ca_ops->init(sk);
 	if (tcp_ca_needs_ecn(sk))
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index a5781f86ac37..dbadaf0d2300 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -361,7 +361,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb)
 			tcp_enter_quickack_mode(sk, 2);
 		break;
 	case INET_ECN_CE:
-		if (tcp_ca_needs_ecn(sk))
+		if (tcp_ca_wants_ce_events(sk))
 			tcp_ca_event(sk, CA_EVENT_ECN_IS_CE);
 
 		if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
@@ -372,7 +372,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb)
 		tp->ecn_flags |= TCP_ECN_SEEN;
 		break;
 	default:
-		if (tcp_ca_needs_ecn(sk))
+		if (tcp_ca_wants_ce_events(sk))
 			tcp_ca_event(sk, CA_EVENT_ECN_NO_CE);
 		tp->ecn_flags |= TCP_ECN_SEEN;
 		break;
@@ -1091,7 +1091,12 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb)
  */
 static void tcp_notify_skb_loss_event(struct tcp_sock *tp, const struct sk_buff *skb)
 {
+	struct sock *sk = (struct sock *)tp;
+	const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
+
 	tp->lost += tcp_skb_pcount(skb);
+	if (ca_ops->skb_marked_lost)
+		ca_ops->skb_marked_lost(sk, skb);
 }
 
 void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb)
@@ -1472,6 +1477,17 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev,
 	WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount);
 	tcp_skb_pcount_add(skb, -pcount);
 
+	/* Adjust tx.in_flight as pcount is shifted from skb to prev. */
+	if (WARN_ONCE(TCP_SKB_CB(skb)->tx.in_flight < pcount,
+		      "prev in_flight: %u skb in_flight: %u pcount: %u",
+		      TCP_SKB_CB(prev)->tx.in_flight,
+		      TCP_SKB_CB(skb)->tx.in_flight,
+		      pcount))
+		TCP_SKB_CB(skb)->tx.in_flight = 0;
+	else
+		TCP_SKB_CB(skb)->tx.in_flight -= pcount;
+	TCP_SKB_CB(prev)->tx.in_flight += pcount;
+
 	/* When we're adding to gso_segs == 1, gso_size will be zero,
 	 * in theory this shouldn't be necessary but as long as DSACK
 	 * code can come after this skb later on it's better to keep
@@ -3700,7 +3716,8 @@ static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
 /* This routine deals with acks during a TLP episode and ends an episode by
  * resetting tlp_high_seq. Ref: TLP algorithm in draft-ietf-tcpm-rack
  */
-static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
+static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag,
+				struct rate_sample *rs)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
@@ -3717,6 +3734,7 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
 		/* ACK advances: there was a loss, so reduce cwnd. Reset
 		 * tlp_high_seq in tcp_init_cwnd_reduction()
 		 */
+		tcp_ca_event(sk, CA_EVENT_TLP_RECOVERY);
 		tcp_init_cwnd_reduction(sk);
 		tcp_set_ca_state(sk, TCP_CA_CWR);
 		tcp_end_cwnd_reduction(sk);
@@ -3727,6 +3745,11 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
 			     FLAG_NOT_DUP | FLAG_DATA_SACKED))) {
 		/* Pure dupack: original and TLP probe arrived; no loss */
 		tp->tlp_high_seq = 0;
+	} else {
+		/* This ACK matches a TLP retransmit. We cannot yet tell if
+		 * this ACK is for the original or the TLP retransmit.
+		 */
+		rs->is_acking_tlp_retrans_seq = 1;
 	}
 }
 
@@ -3831,6 +3854,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 
 	prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una;
 	rs.prior_in_flight = tcp_packets_in_flight(tp);
+	tcp_rate_check_app_limited(sk);
 
 	/* ts_recent update must be made after we are sure that the packet
 	 * is in window.
@@ -3905,7 +3929,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	tcp_rack_update_reo_wnd(sk, &rs);
 
 	if (tp->tlp_high_seq)
-		tcp_process_tlp_ack(sk, ack, flag);
+		tcp_process_tlp_ack(sk, ack, flag, &rs);
 
 	if (tcp_ack_is_dubious(sk, flag)) {
 		if (!(flag & (FLAG_SND_UNA_ADVANCED |
@@ -3929,6 +3953,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	delivered = tcp_newly_delivered(sk, delivered, flag);
 	lost = tp->lost - lost;			/* freshly marked lost */
 	rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED);
+	rs.is_ece = !!(flag & FLAG_ECE);
 	tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate);
 	tcp_cong_control(sk, ack, delivered, flag, sack_state.rate);
 	tcp_xmit_recovery(sk, rexmit);
@@ -3948,7 +3973,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	tcp_ack_probe(sk);
 
 	if (tp->tlp_high_seq)
-		tcp_process_tlp_ack(sk, ack, flag);
+		tcp_process_tlp_ack(sk, ack, flag, &rs);
 	return 1;
 
 old_ack:
@@ -5539,13 +5564,14 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
 
 	    /* More than one full frame received... */
 	if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss &&
+	     (tp->fast_ack_mode == 1 ||
 	     /* ... and right edge of window advances far enough.
 	      * (tcp_recvmsg() will send ACK otherwise).
 	      * If application uses SO_RCVLOWAT, we want send ack now if
 	      * we have not received enough bytes to satisfy the condition.
 	      */
-	    (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat ||
-	     __tcp_select_window(sk) >= tp->rcv_wnd)) ||
+	      (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat ||
+	       __tcp_select_window(sk) >= tp->rcv_wnd))) ||
 	    /* We ACK each frame or... */
 	    tcp_in_quickack_mode(sk) ||
 	    /* Protocol state mandates a one-time immediate ACK */
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index c8f2aa003387..fdf51e436899 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -440,6 +440,8 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst)
 	u32 ca_key = dst_metric(dst, RTAX_CC_ALGO);
 	bool ca_got_dst = false;
 
+	tcp_set_ecn_low_from_dst(sk, dst);
+
 	if (ca_key != TCP_CA_UNSPEC) {
 		const struct tcp_congestion_ops *ca;
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index afa819eede6a..cabfeefa368b 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -324,10 +324,9 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb)
 	bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk);
 	bool use_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn) == 1 ||
 		tcp_ca_needs_ecn(sk) || bpf_needs_ecn;
+	const struct dst_entry *dst = __sk_dst_get(sk);
 
 	if (!use_ecn) {
-		const struct dst_entry *dst = __sk_dst_get(sk);
-
 		if (dst && dst_feature(dst, RTAX_FEATURE_ECN))
 			use_ecn = true;
 	}
@@ -339,6 +338,9 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb)
 		tp->ecn_flags = TCP_ECN_OK;
 		if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn)
 			INET_ECN_xmit(sk);
+
+		if (dst)
+			tcp_set_ecn_low_from_dst(sk, dst);
 	}
 }
 
@@ -376,7 +378,8 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb,
 				th->cwr = 1;
 				skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
 			}
-		} else if (!tcp_ca_needs_ecn(sk)) {
+		} else if (!(tp->ecn_flags & TCP_ECN_ECT_PERMANENT) &&
+			!tcp_ca_needs_ecn(sk)) {
 			/* ACK or retransmitted segment: clear ECT|CE */
 			INET_ECN_dontxmit(sk);
 		}
@@ -1531,7 +1534,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *buff;
-	int old_factor;
+	int old_factor, inflight_prev;
 	long limit;
 	int nlen;
 	u8 flags;
@@ -1606,6 +1609,30 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
 
 		if (diff)
 			tcp_adjust_pcount(sk, skb, diff);
+
+		inflight_prev = TCP_SKB_CB(skb)->tx.in_flight - old_factor;
+		if (inflight_prev < 0) {
+			WARN_ONCE(tcp_skb_tx_in_flight_is_suspicious(
+					  old_factor,
+					  TCP_SKB_CB(skb)->sacked,
+					  TCP_SKB_CB(skb)->tx.in_flight),
+				  "inconsistent: tx.in_flight: %u "
+				  "old_factor: %d mss: %u sacked: %u "
+				  "1st pcount: %d 2nd pcount: %d "
+				  "1st len: %u 2nd len: %u ",
+				  TCP_SKB_CB(skb)->tx.in_flight, old_factor,
+				  mss_now, TCP_SKB_CB(skb)->sacked,
+				  tcp_skb_pcount(skb), tcp_skb_pcount(buff),
+				  skb->len, buff->len);
+			inflight_prev = 0;
+		}
+		/* Set 1st tx.in_flight as if 1st were sent by itself: */
+		TCP_SKB_CB(skb)->tx.in_flight = inflight_prev +
+						 tcp_skb_pcount(skb);
+		/* Set 2nd tx.in_flight with new 1st and 2nd pcounts: */
+		TCP_SKB_CB(buff)->tx.in_flight = inflight_prev +
+						 tcp_skb_pcount(skb) +
+						 tcp_skb_pcount(buff);
 	}
 
 	/* Link BUFF into the send queue. */
@@ -1981,13 +2008,12 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
 static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now)
 {
 	const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
-	u32 min_tso, tso_segs;
-
-	min_tso = ca_ops->min_tso_segs ?
-			ca_ops->min_tso_segs(sk) :
-			READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs);
+	u32 tso_segs;
 
-	tso_segs = tcp_tso_autosize(sk, mss_now, min_tso);
+	tso_segs = ca_ops->tso_segs ?
+		ca_ops->tso_segs(sk, mss_now) :
+		tcp_tso_autosize(sk, mss_now,
+				 sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs);
 	return min_t(u32, tso_segs, sk->sk_gso_max_segs);
 }
 
@@ -2686,6 +2712,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 			skb_set_delivery_time(skb, tp->tcp_wstamp_ns, true);
 			list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
 			tcp_init_tso_segs(skb, mss_now);
+			tcp_set_tx_in_flight(sk, skb);
 			goto repair; /* Skip network transmission */
 		}
 
@@ -2899,6 +2926,7 @@ void tcp_send_loss_probe(struct sock *sk)
 	if (WARN_ON(!skb || !tcp_skb_pcount(skb)))
 		goto rearm_timer;
 
+	tp->tlp_orig_data_app_limited = TCP_SKB_CB(skb)->tx.is_app_limited;
 	if (__tcp_retransmit_skb(sk, skb, 1))
 		goto rearm_timer;
 
diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c
index a8f6d9d06f2e..8737f2134648 100644
--- a/net/ipv4/tcp_rate.c
+++ b/net/ipv4/tcp_rate.c
@@ -34,6 +34,24 @@
  * ready to send in the write queue.
  */
 
+void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	u32 in_flight;
+
+	/* Check, sanitize, and record packets in flight after skb was sent. */
+	in_flight = tcp_packets_in_flight(tp) + tcp_skb_pcount(skb);
+	if (WARN_ONCE(in_flight > TCPCB_IN_FLIGHT_MAX,
+		      "insane in_flight %u cc %s mss %u "
+		      "cwnd %u pif %u %u %u %u\n",
+		      in_flight, inet_csk(sk)->icsk_ca_ops->name,
+		      tp->mss_cache, tp->snd_cwnd,
+		      tp->packets_out, tp->retrans_out,
+		      tp->sacked_out, tp->lost_out))
+		in_flight = TCPCB_IN_FLIGHT_MAX;
+	TCP_SKB_CB(skb)->tx.in_flight = in_flight;
+}
+
 /* Snapshot the current delivery information in the skb, to generate
  * a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered().
  */
@@ -66,7 +84,9 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb)
 	TCP_SKB_CB(skb)->tx.delivered_mstamp	= tp->delivered_mstamp;
 	TCP_SKB_CB(skb)->tx.delivered		= tp->delivered;
 	TCP_SKB_CB(skb)->tx.delivered_ce	= tp->delivered_ce;
+	TCP_SKB_CB(skb)->tx.lost		= tp->lost;
 	TCP_SKB_CB(skb)->tx.is_app_limited	= tp->app_limited ? 1 : 0;
+	tcp_set_tx_in_flight(sk, skb);
 }
 
 /* When an skb is sacked or acked, we fill in the rate sample with the (prior)
@@ -91,18 +111,21 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb,
 	if (!rs->prior_delivered ||
 	    tcp_skb_sent_after(tx_tstamp, tp->first_tx_mstamp,
 			       scb->end_seq, rs->last_end_seq)) {
+		rs->prior_lost	     = scb->tx.lost;
 		rs->prior_delivered_ce  = scb->tx.delivered_ce;
 		rs->prior_delivered  = scb->tx.delivered;
 		rs->prior_mstamp     = scb->tx.delivered_mstamp;
 		rs->is_app_limited   = scb->tx.is_app_limited;
 		rs->is_retrans	     = scb->sacked & TCPCB_RETRANS;
+		rs->tx_in_flight     = scb->tx.in_flight;
 		rs->last_end_seq     = scb->end_seq;
 
 		/* Record send time of most recently ACKed packet: */
 		tp->first_tx_mstamp  = tx_tstamp;
 		/* Find the duration of the "send phase" of this window: */
-		rs->interval_us = tcp_stamp_us_delta(tp->first_tx_mstamp,
-						     scb->tx.first_tx_mstamp);
+		rs->interval_us      = tcp_stamp32_us_delta(
+						tp->first_tx_mstamp,
+						scb->tx.first_tx_mstamp);
 
 	}
 	/* Mark off the skb delivered once it's sacked to avoid being
@@ -144,6 +167,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
 		return;
 	}
 	rs->delivered   = tp->delivered - rs->prior_delivered;
+	rs->lost        = tp->lost - rs->prior_lost;
 
 	rs->delivered_ce = tp->delivered_ce - rs->prior_delivered_ce;
 	/* delivered_ce occupies less than 32 bits in the skb control block */
@@ -155,7 +179,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
 	 * longer phase.
 	 */
 	snd_us = rs->interval_us;				/* send phase */
-	ack_us = tcp_stamp_us_delta(tp->tcp_mstamp,
+	ack_us = tcp_stamp32_us_delta(tp->tcp_mstamp,
 				    rs->prior_mstamp); /* ack phase */
 	rs->interval_us = max(snd_us, ack_us);
 
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index a9f6200f12f1..445c4df7406f 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -642,6 +642,7 @@ void tcp_write_timer_handler(struct sock *sk)
 		return;
 	}
 
+	tcp_rate_check_app_limited(sk);
 	tcp_mstamp_refresh(tcp_sk(sk));
 	event = icsk->icsk_pending;
 
-- 
2.42.0

From 8cef81ab87bb29cd5c0557971d92de42bee60887 Mon Sep 17 00:00:00 2001
From: Peter Jung <admin@ptr1337.dev>
Date: Mon, 25 Sep 2023 18:09:53 +0200
Subject: [PATCH 4/7] cachy

Signed-off-by: Peter Jung <admin@ptr1337.dev>
---
 .../admin-guide/kernel-parameters.txt         |    9 +
 Makefile                                      |    8 +-
 arch/arc/configs/axs101_defconfig             |    1 +
 arch/arc/configs/axs103_defconfig             |    1 +
 arch/arc/configs/axs103_smp_defconfig         |    1 +
 arch/arc/configs/haps_hs_defconfig            |    1 +
 arch/arc/configs/haps_hs_smp_defconfig        |    1 +
 arch/arc/configs/hsdk_defconfig               |    1 +
 arch/arc/configs/nsim_700_defconfig           |    1 +
 arch/arc/configs/nsimosci_defconfig           |    1 +
 arch/arc/configs/nsimosci_hs_defconfig        |    1 +
 arch/arc/configs/nsimosci_hs_smp_defconfig    |    1 +
 arch/arc/configs/tb10x_defconfig              |    1 +
 arch/arc/configs/vdk_hs38_defconfig           |    1 +
 arch/arc/configs/vdk_hs38_smp_defconfig       |    1 +
 arch/x86/Kconfig.cpu                          |  427 +-
 arch/x86/Makefile                             |   46 +-
 arch/x86/include/asm/pci.h                    |    6 +
 arch/x86/include/asm/vermagic.h               |   74 +
 arch/x86/pci/common.c                         |    7 +-
 block/bfq-iosched.c                           |    6 +
 drivers/Makefile                              |   15 +-
 drivers/ata/ahci.c                            |   23 +-
 drivers/cpufreq/Kconfig.x86                   |    2 -
 drivers/i2c/busses/Kconfig                    |    9 +
 drivers/i2c/busses/Makefile                   |    1 +
 drivers/i2c/busses/i2c-nct6775.c              |  648 ++
 drivers/i2c/busses/i2c-piix4.c                |    4 +-
 drivers/md/dm-crypt.c                         |    5 +
 drivers/pci/controller/Makefile               |    6 +
 drivers/pci/controller/intel-nvme-remap.c     |  462 ++
 drivers/pci/quirks.c                          |  101 +
 drivers/platform/x86/Kconfig                  |   24 +
 drivers/platform/x86/Makefile                 |    4 +
 drivers/platform/x86/legion-laptop.c          | 5858 +++++++++++++++++
 drivers/platform/x86/steamdeck.c              |  523 ++
 include/linux/mm.h                            |    2 +-
 include/linux/pagemap.h                       |    2 +-
 include/linux/user_namespace.h                |    4 +
 init/Kconfig                                  |   26 +
 kernel/Kconfig.hz                             |   24 +
 kernel/fork.c                                 |   14 +
 kernel/sched/fair.c                           |   20 +-
 kernel/sysctl.c                               |   12 +
 kernel/user_namespace.c                       |    7 +
 lib/scatterlist.c                             |   23 +-
 mm/Kconfig                                    |    2 +-
 mm/internal.h                                 |    1 +
 mm/list_lru.c                                 |    4 +
 mm/page-writeback.c                           |    8 +
 mm/page_alloc.c                               |   42 +-
 mm/swap.c                                     |    5 +
 mm/vmpressure.c                               |    4 +
 mm/vmscan.c                                   |   28 +-
 54 files changed, 8419 insertions(+), 90 deletions(-)
 create mode 100644 drivers/i2c/busses/i2c-nct6775.c
 create mode 100644 drivers/pci/controller/intel-nvme-remap.c
 create mode 100644 drivers/platform/x86/legion-laptop.c
 create mode 100644 drivers/platform/x86/steamdeck.c

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index f23ec4dc6c4b..851e413f8967 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4276,6 +4276,15 @@
 		nomsi		[MSI] If the PCI_MSI kernel config parameter is
 				enabled, this kernel boot option can be used to
 				disable the use of MSI interrupts system-wide.
+		pcie_acs_override =
+					[PCIE] Override missing PCIe ACS support for:
+				downstream
+					All downstream ports - full ACS capabilities
+				multfunction
+					All multifunction devices - multifunction ACS subset
+				id:nnnn:nnnn
+					Specfic device - full ACS capabilities
+					Specified as vid:did (vendor/device ID) in hex
 		noioapicquirk	[APIC] Disable all boot interrupt quirks.
 				Safety option to keep boot IRQs enabled. This
 				should never be necessary.
diff --git a/Makefile b/Makefile
index fc83c079c471..9ad36395d50d 100644
--- a/Makefile
+++ b/Makefile
@@ -831,6 +831,9 @@ KBUILD_CFLAGS	+= $(call cc-disable-warning, address-of-packed-member)
 ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE
 KBUILD_CFLAGS += -O2
 KBUILD_RUSTFLAGS += -Copt-level=2
+else ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3
+KBUILD_CFLAGS += -O3
+KBUILD_RUSTFLAGS += -Copt-level=3
 else ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE
 KBUILD_CFLAGS += -Os
 KBUILD_RUSTFLAGS += -Copt-level=s
@@ -1076,11 +1079,6 @@ KBUILD_CFLAGS	+= -fno-strict-overflow
 # Make sure -fstack-check isn't enabled (like gentoo apparently did)
 KBUILD_CFLAGS  += -fno-stack-check
 
-# conserve stack if available
-ifdef CONFIG_CC_IS_GCC
-KBUILD_CFLAGS   += -fconserve-stack
-endif
-
 # Prohibit date/time macros, which would make the build non-deterministic
 KBUILD_CFLAGS   += -Werror=date-time
 
diff --git a/arch/arc/configs/axs101_defconfig b/arch/arc/configs/axs101_defconfig
index 81764160451f..2c15d3bf747a 100644
--- a/arch/arc/configs/axs101_defconfig
+++ b/arch/arc/configs/axs101_defconfig
@@ -9,6 +9,7 @@ CONFIG_NAMESPACES=y
 # CONFIG_UTS_NS is not set
 # CONFIG_PID_NS is not set
 CONFIG_BLK_DEV_INITRD=y
+CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y
 CONFIG_EMBEDDED=y
 CONFIG_PERF_EVENTS=y
 # CONFIG_VM_EVENT_COUNTERS is not set
diff --git a/arch/arc/configs/axs103_defconfig b/arch/arc/configs/axs103_defconfig
index d5181275490e..7d868e148d9a 100644
--- a/arch/arc/configs/axs103_defconfig
+++ b/arch/arc/configs/axs103_defconfig
@@ -9,6 +9,7 @@ CONFIG_NAMESPACES=y
 # CONFIG_UTS_NS is not set
 # CONFIG_PID_NS is not set
 CONFIG_BLK_DEV_INITRD=y
+CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y
 CONFIG_EMBEDDED=y
 CONFIG_PERF_EVENTS=y
 # CONFIG_VM_EVENT_COUNTERS is not set
diff --git a/arch/arc/configs/axs103_smp_defconfig b/arch/arc/configs/axs103_smp_defconfig
index 07c89281c2e3..1513324ddb00 100644
--- a/arch/arc/configs/axs103_smp_defconfig
+++ b/arch/arc/configs/axs103_smp_defconfig
@@ -9,6 +9,7 @@ CONFIG_NAMESPACES=y
 # CONFIG_UTS_NS is not set
 # CONFIG_PID_NS is not set
 CONFIG_BLK_DEV_INITRD=y
+CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y
 CONFIG_EMBEDDED=y
 CONFIG_PERF_EVENTS=y
 # CONFIG_VM_EVENT_COUNTERS is not set
diff --git a/arch/arc/configs/haps_hs_defconfig b/arch/arc/configs/haps_hs_defconfig
index 8c3ed5d6e6c3..2db643853e8f 100644
--- a/arch/arc/configs/haps_hs_defconfig
+++ b/arch/arc/configs/haps_hs_defconfig
@@ -11,6 +11,7 @@ CONFIG_NAMESPACES=y
 # CONFIG_UTS_NS is not set
 # CONFIG_PID_NS is not set
 CONFIG_BLK_DEV_INITRD=y
+CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y
 CONFIG_EXPERT=y
 CONFIG_PERF_EVENTS=y
 # CONFIG_COMPAT_BRK is not set
diff --git a/arch/arc/configs/haps_hs_smp_defconfig b/arch/arc/configs/haps_hs_smp_defconfig
index 61107e8bac33..d764007e5ada 100644
--- a/arch/arc/configs/haps_hs_smp_defconfig
+++ b/arch/arc/configs/haps_hs_smp_defconfig
@@ -11,6 +11,7 @@ CONFIG_NAMESPACES=y
 # CONFIG_UTS_NS is not set
 # CONFIG_PID_NS is not set
 CONFIG_BLK_DEV_INITRD=y
+CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y
 CONFIG_EMBEDDED=y
 CONFIG_PERF_EVENTS=y
 # CONFIG_VM_EVENT_COUNTERS is not set
diff --git a/arch/arc/configs/hsdk_defconfig b/arch/arc/configs/hsdk_defconfig
index 4ee2a1507b57..ce6a4431a76d 100644
--- a/arch/arc/configs/hsdk_defconfig
+++ b/arch/arc/configs/hsdk_defconfig
@@ -9,6 +9,7 @@ CONFIG_NAMESPACES=y
 # CONFIG_PID_NS is not set
 CONFIG_BLK_DEV_INITRD=y
 CONFIG_BLK_DEV_RAM=y
+CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y
 CONFIG_EMBEDDED=y
 CONFIG_PERF_EVENTS=y
 # CONFIG_VM_EVENT_COUNTERS is not set
diff --git a/arch/arc/configs/nsim_700_defconfig b/arch/arc/configs/nsim_700_defconfig
index 3e9829775992..5044609540cc 100644
--- a/arch/arc/configs/nsim_700_defconfig
+++ b/arch/arc/configs/nsim_700_defconfig
@@ -11,6 +11,7 @@ CONFIG_NAMESPACES=y
 # CONFIG_UTS_NS is not set
 # CONFIG_PID_NS is not set
 CONFIG_BLK_DEV_INITRD=y
+CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y
 CONFIG_KALLSYMS_ALL=y
 CONFIG_EMBEDDED=y
 CONFIG_PERF_EVENTS=y
diff --git a/arch/arc/configs/nsimosci_defconfig b/arch/arc/configs/nsimosci_defconfig
index 502c87f351c8..748c809d1c4c 100644
--- a/arch/arc/configs/nsimosci_defconfig
+++ b/arch/arc/configs/nsimosci_defconfig
@@ -10,6 +10,7 @@ CONFIG_NAMESPACES=y
 # CONFIG_UTS_NS is not set
 # CONFIG_PID_NS is not set
 CONFIG_BLK_DEV_INITRD=y
+CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y
 CONFIG_KALLSYMS_ALL=y
 CONFIG_EMBEDDED=y
 CONFIG_PERF_EVENTS=y
diff --git a/arch/arc/configs/nsimosci_hs_defconfig b/arch/arc/configs/nsimosci_hs_defconfig
index f721cc3997d0..205c32b0074c 100644
--- a/arch/arc/configs/nsimosci_hs_defconfig
+++ b/arch/arc/configs/nsimosci_hs_defconfig
@@ -10,6 +10,7 @@ CONFIG_NAMESPACES=y
 # CONFIG_UTS_NS is not set
 # CONFIG_PID_NS is not set
 CONFIG_BLK_DEV_INITRD=y
+CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y
 CONFIG_KALLSYMS_ALL=y
 CONFIG_EMBEDDED=y
 CONFIG_PERF_EVENTS=y
diff --git a/arch/arc/configs/nsimosci_hs_smp_defconfig b/arch/arc/configs/nsimosci_hs_smp_defconfig
index 1419fc946a08..2477b7c80977 100644
--- a/arch/arc/configs/nsimosci_hs_smp_defconfig
+++ b/arch/arc/configs/nsimosci_hs_smp_defconfig
@@ -8,6 +8,7 @@ CONFIG_IKCONFIG_PROC=y
 # CONFIG_UTS_NS is not set
 # CONFIG_PID_NS is not set
 CONFIG_BLK_DEV_INITRD=y
+CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y
 CONFIG_PERF_EVENTS=y
 # CONFIG_COMPAT_BRK is not set
 CONFIG_KPROBES=y
diff --git a/arch/arc/configs/tb10x_defconfig b/arch/arc/configs/tb10x_defconfig
index 941bbadd6bf2..e61132ba4f89 100644
--- a/arch/arc/configs/tb10x_defconfig
+++ b/arch/arc/configs/tb10x_defconfig
@@ -14,6 +14,7 @@ CONFIG_INITRAMFS_SOURCE="../tb10x-rootfs.cpio"
 CONFIG_INITRAMFS_ROOT_UID=2100
 CONFIG_INITRAMFS_ROOT_GID=501
 # CONFIG_RD_GZIP is not set
+CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y
 CONFIG_KALLSYMS_ALL=y
 # CONFIG_AIO is not set
 CONFIG_EMBEDDED=y
diff --git a/arch/arc/configs/vdk_hs38_defconfig b/arch/arc/configs/vdk_hs38_defconfig
index d3ef189c75f8..922b1b24f518 100644
--- a/arch/arc/configs/vdk_hs38_defconfig
+++ b/arch/arc/configs/vdk_hs38_defconfig
@@ -4,6 +4,7 @@ CONFIG_HIGH_RES_TIMERS=y
 CONFIG_IKCONFIG=y
 CONFIG_IKCONFIG_PROC=y
 CONFIG_BLK_DEV_INITRD=y
+CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y
 CONFIG_EMBEDDED=y
 CONFIG_PERF_EVENTS=y
 # CONFIG_VM_EVENT_COUNTERS is not set
diff --git a/arch/arc/configs/vdk_hs38_smp_defconfig b/arch/arc/configs/vdk_hs38_smp_defconfig
index 944b347025fd..ed64319f7eb2 100644
--- a/arch/arc/configs/vdk_hs38_smp_defconfig
+++ b/arch/arc/configs/vdk_hs38_smp_defconfig
@@ -4,6 +4,7 @@ CONFIG_HIGH_RES_TIMERS=y
 CONFIG_IKCONFIG=y
 CONFIG_IKCONFIG_PROC=y
 CONFIG_BLK_DEV_INITRD=y
+CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y
 CONFIG_EMBEDDED=y
 CONFIG_PERF_EVENTS=y
 # CONFIG_VM_EVENT_COUNTERS is not set
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 00468adf180f..46cc91cb622f 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -157,7 +157,7 @@ config MPENTIUM4
 
 
 config MK6
-	bool "K6/K6-II/K6-III"
+	bool "AMD K6/K6-II/K6-III"
 	depends on X86_32
 	help
 	  Select this for an AMD K6-family processor.  Enables use of
@@ -165,7 +165,7 @@ config MK6
 	  flags to GCC.
 
 config MK7
-	bool "Athlon/Duron/K7"
+	bool "AMD Athlon/Duron/K7"
 	depends on X86_32
 	help
 	  Select this for an AMD Athlon K7-family processor.  Enables use of
@@ -173,12 +173,106 @@ config MK7
 	  flags to GCC.
 
 config MK8
-	bool "Opteron/Athlon64/Hammer/K8"
+	bool "AMD Opteron/Athlon64/Hammer/K8"
 	help
 	  Select this for an AMD Opteron or Athlon64 Hammer-family processor.
 	  Enables use of some extended instructions, and passes appropriate
 	  optimization flags to GCC.
 
+config MK8SSE3
+	bool "AMD Opteron/Athlon64/Hammer/K8 with SSE3"
+	help
+	  Select this for improved AMD Opteron or Athlon64 Hammer-family processors.
+	  Enables use of some extended instructions, and passes appropriate
+	  optimization flags to GCC.
+
+config MK10
+	bool "AMD 61xx/7x50/PhenomX3/X4/II/K10"
+	help
+	  Select this for an AMD 61xx Eight-Core Magny-Cours, Athlon X2 7x50,
+	  Phenom X3/X4/II, Athlon II X2/X3/X4, or Turion II-family processor.
+	  Enables use of some extended instructions, and passes appropriate
+	  optimization flags to GCC.
+
+config MBARCELONA
+	bool "AMD Barcelona"
+	help
+	  Select this for AMD Family 10h Barcelona processors.
+
+	  Enables -march=barcelona
+
+config MBOBCAT
+	bool "AMD Bobcat"
+	help
+	  Select this for AMD Family 14h Bobcat processors.
+
+	  Enables -march=btver1
+
+config MJAGUAR
+	bool "AMD Jaguar"
+	help
+	  Select this for AMD Family 16h Jaguar processors.
+
+	  Enables -march=btver2
+
+config MBULLDOZER
+	bool "AMD Bulldozer"
+	help
+	  Select this for AMD Family 15h Bulldozer processors.
+
+	  Enables -march=bdver1
+
+config MPILEDRIVER
+	bool "AMD Piledriver"
+	help
+	  Select this for AMD Family 15h Piledriver processors.
+
+	  Enables -march=bdver2
+
+config MSTEAMROLLER
+	bool "AMD Steamroller"
+	help
+	  Select this for AMD Family 15h Steamroller processors.
+
+	  Enables -march=bdver3
+
+config MEXCAVATOR
+	bool "AMD Excavator"
+	help
+	  Select this for AMD Family 15h Excavator processors.
+
+	  Enables -march=bdver4
+
+config MZEN
+	bool "AMD Zen"
+	help
+	  Select this for AMD Family 17h Zen processors.
+
+	  Enables -march=znver1
+
+config MZEN2
+	bool "AMD Zen 2"
+	help
+	  Select this for AMD Family 17h Zen 2 processors.
+
+	  Enables -march=znver2
+
+config MZEN3
+	bool "AMD Zen 3"
+	depends on (CC_IS_GCC && GCC_VERSION >= 100300) || (CC_IS_CLANG && CLANG_VERSION >= 120000)
+	help
+	  Select this for AMD Family 19h Zen 3 processors.
+
+	  Enables -march=znver3
+
+config MZEN4
+	bool "AMD Zen 4"
+	depends on (CC_IS_GCC && GCC_VERSION >= 130000) || (CC_IS_CLANG && CLANG_VERSION >= 160000)
+	help
+	  Select this for AMD Family 19h Zen 4 processors.
+
+	  Enables -march=znver4
+
 config MCRUSOE
 	bool "Crusoe"
 	depends on X86_32
@@ -270,7 +364,7 @@ config MPSC
 	  in /proc/cpuinfo. Family 15 is an older Xeon, Family 6 a newer one.
 
 config MCORE2
-	bool "Core 2/newer Xeon"
+	bool "Intel Core 2"
 	help
 
 	  Select this for Intel Core 2 and newer Core 2 Xeons (Xeon 51xx and
@@ -278,6 +372,8 @@ config MCORE2
 	  family in /proc/cpuinfo. Newer ones have 6 and older ones 15
 	  (not a typo)
 
+	  Enables -march=core2
+
 config MATOM
 	bool "Intel Atom"
 	help
@@ -287,6 +383,212 @@ config MATOM
 	  accordingly optimized code. Use a recent GCC with specific Atom
 	  support in order to fully benefit from selecting this option.
 
+config MNEHALEM
+	bool "Intel Nehalem"
+	select X86_P6_NOP
+	help
+
+	  Select this for 1st Gen Core processors in the Nehalem family.
+
+	  Enables -march=nehalem
+
+config MWESTMERE
+	bool "Intel Westmere"
+	select X86_P6_NOP
+	help
+
+	  Select this for the Intel Westmere formerly Nehalem-C family.
+
+	  Enables -march=westmere
+
+config MSILVERMONT
+	bool "Intel Silvermont"
+	select X86_P6_NOP
+	help
+
+	  Select this for the Intel Silvermont platform.
+
+	  Enables -march=silvermont
+
+config MGOLDMONT
+	bool "Intel Goldmont"
+	select X86_P6_NOP
+	help
+
+	  Select this for the Intel Goldmont platform including Apollo Lake and Denverton.
+
+	  Enables -march=goldmont
+
+config MGOLDMONTPLUS
+	bool "Intel Goldmont Plus"
+	select X86_P6_NOP
+	help
+
+	  Select this for the Intel Goldmont Plus platform including Gemini Lake.
+
+	  Enables -march=goldmont-plus
+
+config MSANDYBRIDGE
+	bool "Intel Sandy Bridge"
+	select X86_P6_NOP
+	help
+
+	  Select this for 2nd Gen Core processors in the Sandy Bridge family.
+
+	  Enables -march=sandybridge
+
+config MIVYBRIDGE
+	bool "Intel Ivy Bridge"
+	select X86_P6_NOP
+	help
+
+	  Select this for 3rd Gen Core processors in the Ivy Bridge family.
+
+	  Enables -march=ivybridge
+
+config MHASWELL
+	bool "Intel Haswell"
+	select X86_P6_NOP
+	help
+
+	  Select this for 4th Gen Core processors in the Haswell family.
+
+	  Enables -march=haswell
+
+config MBROADWELL
+	bool "Intel Broadwell"
+	select X86_P6_NOP
+	help
+
+	  Select this for 5th Gen Core processors in the Broadwell family.
+
+	  Enables -march=broadwell
+
+config MSKYLAKE
+	bool "Intel Skylake"
+	select X86_P6_NOP
+	help
+
+	  Select this for 6th Gen Core processors in the Skylake family.
+
+	  Enables -march=skylake
+
+config MSKYLAKEX
+	bool "Intel Skylake X"
+	select X86_P6_NOP
+	help
+
+	  Select this for 6th Gen Core processors in the Skylake X family.
+
+	  Enables -march=skylake-avx512
+
+config MCANNONLAKE
+	bool "Intel Cannon Lake"
+	select X86_P6_NOP
+	help
+
+	  Select this for 8th Gen Core processors
+
+	  Enables -march=cannonlake
+
+config MICELAKE
+	bool "Intel Ice Lake"
+	select X86_P6_NOP
+	help
+
+	  Select this for 10th Gen Core processors in the Ice Lake family.
+
+	  Enables -march=icelake-client
+
+config MCASCADELAKE
+	bool "Intel Cascade Lake"
+	select X86_P6_NOP
+	help
+
+	  Select this for Xeon processors in the Cascade Lake family.
+
+	  Enables -march=cascadelake
+
+config MCOOPERLAKE
+	bool "Intel Cooper Lake"
+	depends on (CC_IS_GCC && GCC_VERSION > 100100) || (CC_IS_CLANG && CLANG_VERSION >= 100000)
+	select X86_P6_NOP
+	help
+
+	  Select this for Xeon processors in the Cooper Lake family.
+
+	  Enables -march=cooperlake
+
+config MTIGERLAKE
+	bool "Intel Tiger Lake"
+	depends on  (CC_IS_GCC && GCC_VERSION > 100100) || (CC_IS_CLANG && CLANG_VERSION >= 100000)
+	select X86_P6_NOP
+	help
+
+	  Select this for third-generation 10 nm process processors in the Tiger Lake family.
+
+	  Enables -march=tigerlake
+
+config MSAPPHIRERAPIDS
+	bool "Intel Sapphire Rapids"
+	depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000)
+	select X86_P6_NOP
+	help
+
+	  Select this for fourth-generation 10 nm process processors in the Sapphire Rapids family.
+
+	  Enables -march=sapphirerapids
+
+config MROCKETLAKE
+	bool "Intel Rocket Lake"
+	depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000)
+	select X86_P6_NOP
+	help
+
+	  Select this for eleventh-generation processors in the Rocket Lake family.
+
+	  Enables -march=rocketlake
+
+config MALDERLAKE
+	bool "Intel Alder Lake"
+	depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000)
+	select X86_P6_NOP
+	help
+
+	  Select this for twelfth-generation processors in the Alder Lake family.
+
+	  Enables -march=alderlake
+
+config MRAPTORLAKE
+	bool "Intel Raptor Lake"
+	depends on (CC_IS_GCC && GCC_VERSION >= 130000) || (CC_IS_CLANG && CLANG_VERSION >= 150500)
+	select X86_P6_NOP
+	help
+
+	  Select this for thirteenth-generation processors in the Raptor Lake family.
+
+	  Enables -march=raptorlake
+
+config MMETEORLAKE
+	bool "Intel Meteor Lake"
+	depends on (CC_IS_GCC && GCC_VERSION >= 130000) || (CC_IS_CLANG && CLANG_VERSION >= 150500)
+	select X86_P6_NOP
+	help
+
+	  Select this for fourteenth-generation processors in the Meteor Lake family.
+
+	  Enables -march=meteorlake
+
+config MEMERALDRAPIDS
+	bool "Intel Emerald Rapids"
+	depends on (CC_IS_GCC && GCC_VERSION > 130000) || (CC_IS_CLANG && CLANG_VERSION >= 150500)
+	select X86_P6_NOP
+	help
+
+	  Select this for fifth-generation 10 nm process processors in the Emerald Rapids family.
+
+	  Enables -march=emeraldrapids
+
 config GENERIC_CPU
 	bool "Generic-x86-64"
 	depends on X86_64
@@ -294,6 +596,50 @@ config GENERIC_CPU
 	  Generic x86-64 CPU.
 	  Run equally well on all x86-64 CPUs.
 
+config GENERIC_CPU2
+	bool "Generic-x86-64-v2"
+	depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000)
+	depends on X86_64
+	help
+	  Generic x86-64 CPU.
+	  Run equally well on all x86-64 CPUs with min support of x86-64-v2.
+
+config GENERIC_CPU3
+	bool "Generic-x86-64-v3"
+	depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000)
+	depends on X86_64
+	help
+	  Generic x86-64-v3 CPU with v3 instructions.
+	  Run equally well on all x86-64 CPUs with min support of x86-64-v3.
+
+config GENERIC_CPU4
+	bool "Generic-x86-64-v4"
+	depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000)
+	depends on X86_64
+	help
+	  Generic x86-64 CPU with v4 instructions.
+	  Run equally well on all x86-64 CPUs with min support of x86-64-v4.
+
+config MNATIVE_INTEL
+	bool "Intel-Native optimizations autodetected by the compiler"
+	help
+
+	  Clang 3.8, GCC 4.2 and above support -march=native, which automatically detects
+	  the optimum settings to use based on your processor. Do NOT use this
+	  for AMD CPUs.  Intel Only!
+
+	  Enables -march=native
+
+config MNATIVE_AMD
+	bool "AMD-Native optimizations autodetected by the compiler"
+	help
+
+	  Clang 3.8, GCC 4.2 and above support -march=native, which automatically detects
+	  the optimum settings to use based on your processor. Do NOT use this
+	  for Intel CPUs.  AMD Only!
+
+	  Enables -march=native
+
 endchoice
 
 config X86_GENERIC
@@ -318,9 +664,17 @@ config X86_INTERNODE_CACHE_SHIFT
 config X86_L1_CACHE_SHIFT
 	int
 	default "7" if MPENTIUM4 || MPSC
-	default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU
+	default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || MK8SSE3 || MK10 \
+	|| MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER \
+	|| MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MZEN4 || MNEHALEM || MWESTMERE || MSILVERMONT \
+	|| MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL \
+	|| MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE \
+	|| MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE \
+	|| MEMERALDRAPIDS || MNATIVE_INTEL || MNATIVE_AMD || X86_GENERIC || GENERIC_CPU || GENERIC_CPU2 \
+	|| GENERIC_CPU3 || GENERIC_CPU4
 	default "4" if MELAN || M486SX || M486 || MGEODEGX1
-	default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX
+	default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII \
+	|| MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX
 
 config X86_F00F_BUG
 	def_bool y
@@ -332,15 +686,27 @@ config X86_INVD_BUG
 
 config X86_ALIGNMENT_16
 	def_bool y
-	depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MELAN || MK6 || M586MMX || M586TSC || M586 || M486SX || M486 || MVIAC3_2 || MGEODEGX1
+	depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MELAN || MK6 || M586MMX || M586TSC \
+	|| M586 || M486SX || M486 || MVIAC3_2 || MGEODEGX1
 
 config X86_INTEL_USERCOPY
 	def_bool y
-	depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2
+	depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC \
+	|| MK8 || MK7 || MEFFICEON || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT \
+	|| MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX \
+	|| MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS \
+	|| MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MEMERALDRAPIDS || MNATIVE_INTEL
 
 config X86_USE_PPRO_CHECKSUM
 	def_bool y
-	depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MATOM
+	depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM \
+	|| MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX \
+	|| MCORE2 || MATOM || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER \
+	|| MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MZEN4 || MNEHALEM \
+	|| MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE \
+	|| MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE \
+	|| MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE \
+	|| MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MEMERALDRAPIDS || MNATIVE_INTEL || MNATIVE_AMD
 
 #
 # P6_NOPs are a relatively minor optimization that require a family >=
@@ -356,32 +722,63 @@ config X86_USE_PPRO_CHECKSUM
 config X86_P6_NOP
 	def_bool y
 	depends on X86_64
-	depends on (MCORE2 || MPENTIUM4 || MPSC)
+	depends on (MCORE2 || MPENTIUM4 || MPSC || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT \
+	|| MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE \
+	|| MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE \
+	|| MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MEMERALDRAPIDS \
+	|| MNATIVE_INTEL)
 
 config X86_TSC
 	def_bool y
-	depends on (MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2 || MATOM) || X86_64
+	depends on (MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM \
+	|| MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 \
+	|| MGEODE_LX || MCORE2 || MATOM || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER \
+	|| MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MZEN4 || MNEHALEM \
+	|| MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL \
+	|| MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE \
+	|| MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MEMERALDRAPIDS \
+	|| MNATIVE_INTEL || MNATIVE_AMD) || X86_64
 
 config X86_CMPXCHG64
 	def_bool y
-	depends on X86_PAE || X86_64 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586TSC || M586MMX || MATOM || MGEODE_LX || MGEODEGX1 || MK6 || MK7 || MK8
+	depends on X86_PAE || X86_64 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 \
+	|| M586TSC || M586MMX || MATOM || MGEODE_LX || MGEODEGX1 || MK6 || MK7 || MK8 || MK8SSE3 || MK10 \
+	|| MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN \
+	|| MZEN2 || MZEN3 || MZEN4 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS \
+	|| MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE \
+	|| MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE \
+	|| MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MEMERALDRAPIDS || MNATIVE_INTEL || MNATIVE_AMD
 
 # this should be set for all -march=.. options where the compiler
 # generates cmov.
 config X86_CMOV
 	def_bool y
-	depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MATOM || MGEODE_LX)
+	depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 \
+	|| MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MATOM || MGEODE_LX || MK8SSE3 || MK10 \
+	|| MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR \
+	|| MZEN || MZEN2 || MZEN3 || MZEN4 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT \
+	|| MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX \
+	|| MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS \
+	|| MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MEMERALDRAPIDS || MNATIVE_INTEL || MNATIVE_AMD)
 
 config X86_MINIMUM_CPU_FAMILY
 	int
 	default "64" if X86_64
-	default "6" if X86_32 && (MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MEFFICEON || MATOM || MCRUSOE || MCORE2 || MK7 || MK8)
+	default "6" if X86_32 && (MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 \
+	|| MVIAC3_2 || MVIAC7 || MEFFICEON || MATOM || MCRUSOE || MCORE2 || MK7 || MK8 ||  MK8SSE3 \
+	|| MK10 || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER \
+	|| MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MZEN4 || MNEHALEM || MWESTMERE || MSILVERMONT \
+	|| MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL \
+	|| MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE \
+	|| MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MRAPTORLAKE \
+	|| MNATIVE_INTEL || MNATIVE_AMD)
 	default "5" if X86_32 && X86_CMPXCHG64
 	default "4"
 
 config X86_DEBUGCTLMSR
 	def_bool y
-	depends on !(MK6 || MWINCHIPC6 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486SX || M486) && !UML
+	depends on !(MK6 || MWINCHIPC6 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 \
+	|| M486SX || M486) && !UML
 
 config IA32_FEAT_CTL
 	def_bool y
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index fdc2e3abd615..63845db8bf8a 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -67,7 +67,7 @@ export BITS
 #
 #    https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53383
 #
-KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx
+KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx -O3
 KBUILD_RUSTFLAGS += -Ctarget-feature=-sse,-sse2,-sse3,-ssse3,-sse4.1,-sse4.2,-avx,-avx2
 
 ifeq ($(CONFIG_X86_KERNEL_IBT),y)
@@ -151,8 +151,48 @@ else
         # FIXME - should be integrated in Makefile.cpu (Makefile_32.cpu)
         cflags-$(CONFIG_MK8)		+= -march=k8
         cflags-$(CONFIG_MPSC)		+= -march=nocona
-        cflags-$(CONFIG_MCORE2)		+= -march=core2
-        cflags-$(CONFIG_MATOM)		+= -march=atom
+        cflags-$(CONFIG_MK8SSE3)	+= -march=k8-sse3
+        cflags-$(CONFIG_MK10) 		+= -march=amdfam10
+        cflags-$(CONFIG_MBARCELONA) 	+= -march=barcelona
+        cflags-$(CONFIG_MBOBCAT) 	+= -march=btver1
+        cflags-$(CONFIG_MJAGUAR) 	+= -march=btver2
+        cflags-$(CONFIG_MBULLDOZER) 	+= -march=bdver1
+        cflags-$(CONFIG_MPILEDRIVER)	+= -march=bdver2 -mno-tbm
+        cflags-$(CONFIG_MSTEAMROLLER) 	+= -march=bdver3 -mno-tbm
+        cflags-$(CONFIG_MEXCAVATOR) 	+= -march=bdver4 -mno-tbm
+        cflags-$(CONFIG_MZEN) 		+= -march=znver1
+        cflags-$(CONFIG_MZEN2) 	+= -march=znver2
+        cflags-$(CONFIG_MZEN3) 	+= -march=znver3
+        cflags-$(CONFIG_MZEN4) 	+= -march=znver4
+        cflags-$(CONFIG_MNATIVE_INTEL) += -march=native
+        cflags-$(CONFIG_MNATIVE_AMD) 	+= -march=native
+        cflags-$(CONFIG_MATOM) 	+= -march=bonnell
+        cflags-$(CONFIG_MCORE2) 	+= -march=core2
+        cflags-$(CONFIG_MNEHALEM) 	+= -march=nehalem
+        cflags-$(CONFIG_MWESTMERE) 	+= -march=westmere
+        cflags-$(CONFIG_MSILVERMONT) 	+= -march=silvermont
+        cflags-$(CONFIG_MGOLDMONT) 	+= -march=goldmont
+        cflags-$(CONFIG_MGOLDMONTPLUS) += -march=goldmont-plus
+        cflags-$(CONFIG_MSANDYBRIDGE) 	+= -march=sandybridge
+        cflags-$(CONFIG_MIVYBRIDGE) 	+= -march=ivybridge
+        cflags-$(CONFIG_MHASWELL) 	+= -march=haswell
+        cflags-$(CONFIG_MBROADWELL) 	+= -march=broadwell
+        cflags-$(CONFIG_MSKYLAKE) 	+= -march=skylake
+        cflags-$(CONFIG_MSKYLAKEX) 	+= -march=skylake-avx512
+        cflags-$(CONFIG_MCANNONLAKE) 	+= -march=cannonlake
+        cflags-$(CONFIG_MICELAKE) 	+= -march=icelake-client
+        cflags-$(CONFIG_MCASCADELAKE) 	+= -march=cascadelake
+        cflags-$(CONFIG_MCOOPERLAKE) 	+= -march=cooperlake
+        cflags-$(CONFIG_MTIGERLAKE) 	+= -march=tigerlake
+        cflags-$(CONFIG_MSAPPHIRERAPIDS) += -march=sapphirerapids
+        cflags-$(CONFIG_MROCKETLAKE) 	+= -march=rocketlake
+        cflags-$(CONFIG_MALDERLAKE) 	+= -march=alderlake
+        cflags-$(CONFIG_MRAPTORLAKE) 	+= -march=raptorlake
+        cflags-$(CONFIG_MMETEORLAKE) 	+= -march=meteorlake
+        cflags-$(CONFIG_MEMERALDRAPIDS)	+= -march=emeraldrapids
+        cflags-$(CONFIG_GENERIC_CPU2) 	+= -march=x86-64-v2
+        cflags-$(CONFIG_GENERIC_CPU3) 	+= -march=x86-64-v3
+        cflags-$(CONFIG_GENERIC_CPU4) 	+= -march=x86-64-v4
         cflags-$(CONFIG_GENERIC_CPU)	+= -mtune=generic
         KBUILD_CFLAGS += $(cflags-y)
 
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index b40c462b4af3..c4e66e60d559 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -27,6 +27,7 @@ struct pci_sysdata {
 #if IS_ENABLED(CONFIG_VMD)
 	struct pci_dev	*vmd_dev;	/* VMD Device if in Intel VMD domain */
 #endif
+	struct pci_dev	*nvme_remap_dev;	/* AHCI Device if NVME remapped bus */
 };
 
 extern int pci_routeirq;
@@ -70,6 +71,11 @@ static inline bool is_vmd(struct pci_bus *bus)
 #define is_vmd(bus)		false
 #endif /* CONFIG_VMD */
 
+static inline bool is_nvme_remap(struct pci_bus *bus)
+{
+	return to_pci_sysdata(bus)->nvme_remap_dev != NULL;
+}
+
 /* Can be used to override the logic in pci_scan_bus for skipping
    already-configured bus numbers - to be used for buggy BIOSes
    or architectures with incomplete PCI setup by the loader */
diff --git a/arch/x86/include/asm/vermagic.h b/arch/x86/include/asm/vermagic.h
index 75884d2cdec3..02c1386eb653 100644
--- a/arch/x86/include/asm/vermagic.h
+++ b/arch/x86/include/asm/vermagic.h
@@ -17,6 +17,54 @@
 #define MODULE_PROC_FAMILY "586MMX "
 #elif defined CONFIG_MCORE2
 #define MODULE_PROC_FAMILY "CORE2 "
+#elif defined CONFIG_MNATIVE_INTEL
+#define MODULE_PROC_FAMILY "NATIVE_INTEL "
+#elif defined CONFIG_MNATIVE_AMD
+#define MODULE_PROC_FAMILY "NATIVE_AMD "
+#elif defined CONFIG_MNEHALEM
+#define MODULE_PROC_FAMILY "NEHALEM "
+#elif defined CONFIG_MWESTMERE
+#define MODULE_PROC_FAMILY "WESTMERE "
+#elif defined CONFIG_MSILVERMONT
+#define MODULE_PROC_FAMILY "SILVERMONT "
+#elif defined CONFIG_MGOLDMONT
+#define MODULE_PROC_FAMILY "GOLDMONT "
+#elif defined CONFIG_MGOLDMONTPLUS
+#define MODULE_PROC_FAMILY "GOLDMONTPLUS "
+#elif defined CONFIG_MSANDYBRIDGE
+#define MODULE_PROC_FAMILY "SANDYBRIDGE "
+#elif defined CONFIG_MIVYBRIDGE
+#define MODULE_PROC_FAMILY "IVYBRIDGE "
+#elif defined CONFIG_MHASWELL
+#define MODULE_PROC_FAMILY "HASWELL "
+#elif defined CONFIG_MBROADWELL
+#define MODULE_PROC_FAMILY "BROADWELL "
+#elif defined CONFIG_MSKYLAKE
+#define MODULE_PROC_FAMILY "SKYLAKE "
+#elif defined CONFIG_MSKYLAKEX
+#define MODULE_PROC_FAMILY "SKYLAKEX "
+#elif defined CONFIG_MCANNONLAKE
+#define MODULE_PROC_FAMILY "CANNONLAKE "
+#elif defined CONFIG_MICELAKE
+#define MODULE_PROC_FAMILY "ICELAKE "
+#elif defined CONFIG_MCASCADELAKE
+#define MODULE_PROC_FAMILY "CASCADELAKE "
+#elif defined CONFIG_MCOOPERLAKE
+#define MODULE_PROC_FAMILY "COOPERLAKE "
+#elif defined CONFIG_MTIGERLAKE
+#define MODULE_PROC_FAMILY "TIGERLAKE "
+#elif defined CONFIG_MSAPPHIRERAPIDS
+#define MODULE_PROC_FAMILY "SAPPHIRERAPIDS "
+#elif defined CONFIG_ROCKETLAKE
+#define MODULE_PROC_FAMILY "ROCKETLAKE "
+#elif defined CONFIG_MALDERLAKE
+#define MODULE_PROC_FAMILY "ALDERLAKE "
+#elif defined CONFIG_MRAPTORLAKE
+#define MODULE_PROC_FAMILY "RAPTORLAKE "
+#elif defined CONFIG_MMETEORLAKE
+#define MODULE_PROC_FAMILY "METEORLAKE "
+#elif defined CONFIG_MEMERALDRAPIDS
+#define MODULE_PROC_FAMILY "EMERALDRAPIDS "
 #elif defined CONFIG_MATOM
 #define MODULE_PROC_FAMILY "ATOM "
 #elif defined CONFIG_M686
@@ -35,6 +83,32 @@
 #define MODULE_PROC_FAMILY "K7 "
 #elif defined CONFIG_MK8
 #define MODULE_PROC_FAMILY "K8 "
+#elif defined CONFIG_MK8SSE3
+#define MODULE_PROC_FAMILY "K8SSE3 "
+#elif defined CONFIG_MK10
+#define MODULE_PROC_FAMILY "K10 "
+#elif defined CONFIG_MBARCELONA
+#define MODULE_PROC_FAMILY "BARCELONA "
+#elif defined CONFIG_MBOBCAT
+#define MODULE_PROC_FAMILY "BOBCAT "
+#elif defined CONFIG_MBULLDOZER
+#define MODULE_PROC_FAMILY "BULLDOZER "
+#elif defined CONFIG_MPILEDRIVER
+#define MODULE_PROC_FAMILY "PILEDRIVER "
+#elif defined CONFIG_MSTEAMROLLER
+#define MODULE_PROC_FAMILY "STEAMROLLER "
+#elif defined CONFIG_MJAGUAR
+#define MODULE_PROC_FAMILY "JAGUAR "
+#elif defined CONFIG_MEXCAVATOR
+#define MODULE_PROC_FAMILY "EXCAVATOR "
+#elif defined CONFIG_MZEN
+#define MODULE_PROC_FAMILY "ZEN "
+#elif defined CONFIG_MZEN2
+#define MODULE_PROC_FAMILY "ZEN2 "
+#elif defined CONFIG_MZEN3
+#define MODULE_PROC_FAMILY "ZEN3 "
+#elif defined CONFIG_MZEN4
+#define MODULE_PROC_FAMILY "ZEN4 "
 #elif defined CONFIG_MELAN
 #define MODULE_PROC_FAMILY "ELAN "
 #elif defined CONFIG_MCRUSOE
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index ddb798603201..7c20387d8202 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -723,12 +723,15 @@ int pci_ext_cfg_avail(void)
 		return 0;
 }
 
-#if IS_ENABLED(CONFIG_VMD)
 struct pci_dev *pci_real_dma_dev(struct pci_dev *dev)
 {
+#if IS_ENABLED(CONFIG_VMD)
 	if (is_vmd(dev->bus))
 		return to_pci_sysdata(dev->bus)->vmd_dev;
+#endif
+
+	if (is_nvme_remap(dev->bus))
+		return to_pci_sysdata(dev->bus)->nvme_remap_dev;
 
 	return dev;
 }
-#endif
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 3cce6de464a7..9176bc4f07da 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -7627,6 +7627,7 @@ MODULE_ALIAS("bfq-iosched");
 static int __init bfq_init(void)
 {
 	int ret;
+	char msg[60] = "BFQ I/O-scheduler: BFQ-CachyOS v6.5";
 
 #ifdef CONFIG_BFQ_GROUP_IOSCHED
 	ret = blkcg_policy_register(&blkcg_policy_bfq);
@@ -7658,6 +7659,11 @@ static int __init bfq_init(void)
 	if (ret)
 		goto slab_kill;
 
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	strcat(msg, " (with cgroups support)");
+#endif
+	pr_info("%s", msg);
+
 	return 0;
 
 slab_kill:
diff --git a/drivers/Makefile b/drivers/Makefile
index 7241d80a7b29..ac0ca3498f43 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -64,15 +64,8 @@ obj-y				+= char/
 # iommu/ comes before gpu as gpu are using iommu controllers
 obj-y				+= iommu/
 
-# gpu/ comes after char for AGP vs DRM startup and after iommu
-obj-y				+= gpu/
-
 obj-$(CONFIG_CONNECTOR)		+= connector/
 
-# i810fb and intelfb depend on char/agp/
-obj-$(CONFIG_FB_I810)           += video/fbdev/i810/
-obj-$(CONFIG_FB_INTEL)          += video/fbdev/intelfb/
-
 obj-$(CONFIG_PARPORT)		+= parport/
 obj-y				+= base/ block/ misc/ mfd/ nfc/
 obj-$(CONFIG_LIBNVDIMM)		+= nvdimm/
@@ -84,6 +77,14 @@ obj-y				+= macintosh/
 obj-y				+= scsi/
 obj-y				+= nvme/
 obj-$(CONFIG_ATA)		+= ata/
+
+# gpu/ comes after char for AGP vs DRM startup and after iommu
+obj-y				+= gpu/
+
+# i810fb and intelfb depend on char/agp/
+obj-$(CONFIG_FB_I810)           += video/fbdev/i810/
+obj-$(CONFIG_FB_INTEL)          += video/fbdev/intelfb/
+
 obj-$(CONFIG_TARGET_CORE)	+= target/
 obj-$(CONFIG_MTD)		+= mtd/
 obj-$(CONFIG_SPI)		+= spi/
diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c
index 7907b09fc27e..a31562d18506 100644
--- a/drivers/ata/ahci.c
+++ b/drivers/ata/ahci.c
@@ -1524,7 +1524,7 @@ static irqreturn_t ahci_thunderx_irq_handler(int irq, void *dev_instance)
 }
 #endif
 
-static void ahci_remap_check(struct pci_dev *pdev, int bar,
+static int ahci_remap_check(struct pci_dev *pdev, int bar,
 		struct ahci_host_priv *hpriv)
 {
 	int i;
@@ -1537,7 +1537,7 @@ static void ahci_remap_check(struct pci_dev *pdev, int bar,
 	    pci_resource_len(pdev, bar) < SZ_512K ||
 	    bar != AHCI_PCI_BAR_STANDARD ||
 	    !(readl(hpriv->mmio + AHCI_VSCAP) & 1))
-		return;
+		return 0;
 
 	cap = readq(hpriv->mmio + AHCI_REMAP_CAP);
 	for (i = 0; i < AHCI_MAX_REMAP; i++) {
@@ -1552,18 +1552,11 @@ static void ahci_remap_check(struct pci_dev *pdev, int bar,
 	}
 
 	if (!hpriv->remapped_nvme)
-		return;
-
-	dev_warn(&pdev->dev, "Found %u remapped NVMe devices.\n",
-		 hpriv->remapped_nvme);
-	dev_warn(&pdev->dev,
-		 "Switch your BIOS from RAID to AHCI mode to use them.\n");
+		return 0;
 
-	/*
-	 * Don't rely on the msi-x capability in the remap case,
-	 * share the legacy interrupt across ahci and remapped devices.
-	 */
-	hpriv->flags |= AHCI_HFLAG_NO_MSI;
+	/* Abort probe, allowing intel-nvme-remap to step in when available */
+	dev_info(&pdev->dev, "Device will be handled by intel-nvme-remap.\n");
+	return -ENODEV;
 }
 
 static int ahci_get_irq_vector(struct ata_host *host, int port)
@@ -1783,7 +1776,9 @@ static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 	hpriv->mmio = pcim_iomap_table(pdev)[ahci_pci_bar];
 
 	/* detect remapped nvme devices */
-	ahci_remap_check(pdev, ahci_pci_bar, hpriv);
+	rc = ahci_remap_check(pdev, ahci_pci_bar, hpriv);
+	if (rc)
+		return rc;
 
 	sysfs_add_file_to_group(&pdev->dev.kobj,
 				&dev_attr_remapped_nvme.attr,
diff --git a/drivers/cpufreq/Kconfig.x86 b/drivers/cpufreq/Kconfig.x86
index 438c9e75a04d..1bbfeca5f01e 100644
--- a/drivers/cpufreq/Kconfig.x86
+++ b/drivers/cpufreq/Kconfig.x86
@@ -9,7 +9,6 @@ config X86_INTEL_PSTATE
 	select ACPI_PROCESSOR if ACPI
 	select ACPI_CPPC_LIB if X86_64 && ACPI && SCHED_MC_PRIO
 	select CPU_FREQ_GOV_PERFORMANCE
-	select CPU_FREQ_GOV_SCHEDUTIL if SMP
 	help
 	  This driver provides a P state for Intel core processors.
 	  The driver implements an internal governor and will become
@@ -39,7 +38,6 @@ config X86_AMD_PSTATE
 	depends on X86 && ACPI
 	select ACPI_PROCESSOR
 	select ACPI_CPPC_LIB if X86_64
-	select CPU_FREQ_GOV_SCHEDUTIL if SMP
 	help
 	  This driver adds a CPUFreq driver which utilizes a fine grain
 	  processor performance frequency control range instead of legacy
diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig
index 9cfe8fc509d7..efc3b0c0b4ad 100644
--- a/drivers/i2c/busses/Kconfig
+++ b/drivers/i2c/busses/Kconfig
@@ -229,6 +229,15 @@ config I2C_CHT_WC
 	  combined with a FUSB302 Type-C port-controller as such it is advised
 	  to also select CONFIG_TYPEC_FUSB302=m.
 
+config I2C_NCT6775
+	tristate "Nuvoton NCT6775 and compatible SMBus controller"
+	help
+		If you say yes to this option, support will be included for the
+		Nuvoton NCT6775 and compatible SMBus controllers.
+
+		This driver can also be built as a module.  If so, the module
+		will be called i2c-nct6775.
+
 config I2C_NFORCE2
 	tristate "Nvidia nForce2, nForce3 and nForce4"
 	depends on PCI
diff --git a/drivers/i2c/busses/Makefile b/drivers/i2c/busses/Makefile
index af56fe2c75c0..76be74584719 100644
--- a/drivers/i2c/busses/Makefile
+++ b/drivers/i2c/busses/Makefile
@@ -20,6 +20,7 @@ obj-$(CONFIG_I2C_CHT_WC)	+= i2c-cht-wc.o
 obj-$(CONFIG_I2C_I801)		+= i2c-i801.o
 obj-$(CONFIG_I2C_ISCH)		+= i2c-isch.o
 obj-$(CONFIG_I2C_ISMT)		+= i2c-ismt.o
+obj-$(CONFIG_I2C_NCT6775)   += i2c-nct6775.o
 obj-$(CONFIG_I2C_NFORCE2)	+= i2c-nforce2.o
 obj-$(CONFIG_I2C_NFORCE2_S4985)	+= i2c-nforce2-s4985.o
 obj-$(CONFIG_I2C_NVIDIA_GPU)	+= i2c-nvidia-gpu.o
diff --git a/drivers/i2c/busses/i2c-nct6775.c b/drivers/i2c/busses/i2c-nct6775.c
new file mode 100644
index 000000000000..e919d1e10c51
--- /dev/null
+++ b/drivers/i2c/busses/i2c-nct6775.c
@@ -0,0 +1,648 @@
+/*
+ * i2c-nct6775 - Driver for the SMBus master functionality of
+ *	       Nuvoton NCT677x Super-I/O chips
+ *
+ * Copyright (C) 2019  Adam Honse <calcprogrammer1@gmail.com>
+ *
+ * Derived from nct6775 hwmon driver
+ * Copyright (C) 2012  Guenter Roeck <linux@roeck-us.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/jiffies.h>
+#include <linux/platform_device.h>
+#include <linux/hwmon.h>
+#include <linux/hwmon-sysfs.h>
+#include <linux/hwmon-vid.h>
+#include <linux/err.h>
+#include <linux/mutex.h>
+#include <linux/delay.h>
+#include <linux/ioport.h>
+#include <linux/i2c.h>
+#include <linux/acpi.h>
+#include <linux/bitops.h>
+#include <linux/dmi.h>
+#include <linux/io.h>
+#include <linux/nospec.h>
+
+#define DRVNAME "i2c-nct6775"
+
+/* Nuvoton SMBus address offsets */
+#define SMBHSTDAT       (0 + nuvoton_nct6793d_smba)
+#define SMBBLKSZ        (1 + nuvoton_nct6793d_smba)
+#define SMBHSTCMD       (2 + nuvoton_nct6793d_smba)
+#define SMBHSTIDX       (3 + nuvoton_nct6793d_smba)  //Index field is the Command field on other controllers
+#define SMBHSTCTL       (4 + nuvoton_nct6793d_smba)
+#define SMBHSTADD       (5 + nuvoton_nct6793d_smba)
+#define SMBHSTERR       (9 + nuvoton_nct6793d_smba)
+#define SMBHSTSTS       (0xE + nuvoton_nct6793d_smba)
+
+/* Command register */
+#define NCT6793D_READ_BYTE      0
+#define NCT6793D_READ_WORD      1
+#define NCT6793D_READ_BLOCK     2
+#define NCT6793D_BLOCK_WRITE_READ_PROC_CALL 3
+#define NCT6793D_PROC_CALL      4
+#define NCT6793D_WRITE_BYTE     8
+#define NCT6793D_WRITE_WORD     9
+#define NCT6793D_WRITE_BLOCK    10
+
+/* Control register */
+#define NCT6793D_MANUAL_START   128
+#define NCT6793D_SOFT_RESET     64
+
+/* Error register */
+#define NCT6793D_NO_ACK         32
+
+/* Status register */
+#define NCT6793D_FIFO_EMPTY     1
+#define NCT6793D_FIFO_FULL      2
+#define NCT6793D_MANUAL_ACTIVE  4
+
+#define NCT6775_LD_SMBUS		0x0B
+
+/* Other settings */
+#define MAX_RETRIES		400
+
+enum kinds { nct6106, nct6775, nct6776, nct6779, nct6791, nct6792, nct6793,
+	     nct6795, nct6796, nct6798 };
+
+struct nct6775_sio_data {
+	int sioreg;
+	enum kinds kind;
+};
+
+/* used to set data->name = nct6775_device_names[data->sio_kind] */
+static const char * const nct6775_device_names[] = {
+	"nct6106",
+	"nct6775",
+	"nct6776",
+	"nct6779",
+	"nct6791",
+	"nct6792",
+	"nct6793",
+	"nct6795",
+	"nct6796",
+	"nct6798",
+};
+
+static const char * const nct6775_sio_names[] __initconst = {
+	"NCT6106D",
+	"NCT6775F",
+	"NCT6776D/F",
+	"NCT6779D",
+	"NCT6791D",
+	"NCT6792D",
+	"NCT6793D",
+	"NCT6795D",
+	"NCT6796D",
+	"NCT6798D",
+};
+
+#define SIO_REG_LDSEL		0x07	/* Logical device select */
+#define SIO_REG_DEVID		0x20	/* Device ID (2 bytes) */
+#define SIO_REG_SMBA		0x62	/* SMBus base address register */
+
+#define SIO_NCT6106_ID		0xc450
+#define SIO_NCT6775_ID		0xb470
+#define SIO_NCT6776_ID		0xc330
+#define SIO_NCT6779_ID		0xc560
+#define SIO_NCT6791_ID		0xc800
+#define SIO_NCT6792_ID		0xc910
+#define SIO_NCT6793_ID		0xd120
+#define SIO_NCT6795_ID		0xd350
+#define SIO_NCT6796_ID		0xd420
+#define SIO_NCT6798_ID		0xd428
+#define SIO_ID_MASK			0xFFF0
+
+static inline void
+superio_outb(int ioreg, int reg, int val)
+{
+	outb(reg, ioreg);
+	outb(val, ioreg + 1);
+}
+
+static inline int
+superio_inb(int ioreg, int reg)
+{
+	outb(reg, ioreg);
+	return inb(ioreg + 1);
+}
+
+static inline void
+superio_select(int ioreg, int ld)
+{
+	outb(SIO_REG_LDSEL, ioreg);
+	outb(ld, ioreg + 1);
+}
+
+static inline int
+superio_enter(int ioreg)
+{
+	/*
+	 * Try to reserve <ioreg> and <ioreg + 1> for exclusive access.
+	 */
+	if (!request_muxed_region(ioreg, 2, DRVNAME))
+		return -EBUSY;
+
+	outb(0x87, ioreg);
+	outb(0x87, ioreg);
+
+	return 0;
+}
+
+static inline void
+superio_exit(int ioreg)
+{
+	outb(0xaa, ioreg);
+	outb(0x02, ioreg);
+	outb(0x02, ioreg + 1);
+	release_region(ioreg, 2);
+}
+
+/*
+ * ISA constants
+ */
+
+#define IOREGION_ALIGNMENT	(~7)
+#define IOREGION_LENGTH		2
+#define ADDR_REG_OFFSET		0
+#define DATA_REG_OFFSET		1
+
+#define NCT6775_REG_BANK	0x4E
+#define NCT6775_REG_CONFIG	0x40
+
+static struct i2c_adapter *nct6775_adapter;
+
+struct i2c_nct6775_adapdata {
+	unsigned short smba;
+};
+
+/* Return negative errno on error. */
+static s32 nct6775_access(struct i2c_adapter * adap, u16 addr,
+		 unsigned short flags, char read_write,
+		 u8 command, int size, union i2c_smbus_data * data)
+{
+	struct i2c_nct6775_adapdata *adapdata = i2c_get_adapdata(adap);
+	unsigned short nuvoton_nct6793d_smba = adapdata->smba;
+	int i, len, cnt;
+	union i2c_smbus_data tmp_data;
+	int timeout = 0;
+
+	tmp_data.word = 0;
+	cnt = 0;
+	len = 0;
+
+	outb_p(NCT6793D_SOFT_RESET, SMBHSTCTL);
+
+	switch (size) {
+		case I2C_SMBUS_QUICK:
+			outb_p((addr << 1) | read_write,
+			       SMBHSTADD);
+			break;
+		case I2C_SMBUS_BYTE_DATA:
+			tmp_data.byte = data->byte;
+			fallthrough;
+		case I2C_SMBUS_BYTE:
+			outb_p((addr << 1) | read_write,
+			       SMBHSTADD);
+			outb_p(command, SMBHSTIDX);
+			if (read_write == I2C_SMBUS_WRITE) {
+				outb_p(tmp_data.byte, SMBHSTDAT);
+				outb_p(NCT6793D_WRITE_BYTE, SMBHSTCMD);
+			}
+			else {
+				outb_p(NCT6793D_READ_BYTE, SMBHSTCMD);
+			}
+			break;
+		case I2C_SMBUS_WORD_DATA:
+			outb_p((addr << 1) | read_write,
+			       SMBHSTADD);
+			outb_p(command, SMBHSTIDX);
+			if (read_write == I2C_SMBUS_WRITE) {
+				outb_p(data->word & 0xff, SMBHSTDAT);
+				outb_p((data->word & 0xff00) >> 8, SMBHSTDAT);
+				outb_p(NCT6793D_WRITE_WORD, SMBHSTCMD);
+			}
+			else {
+				outb_p(NCT6793D_READ_WORD, SMBHSTCMD);
+			}
+			break;
+		case I2C_SMBUS_BLOCK_DATA:
+			outb_p((addr << 1) | read_write,
+			       SMBHSTADD);
+			outb_p(command, SMBHSTIDX);
+			if (read_write == I2C_SMBUS_WRITE) {
+				len = data->block[0];
+				if (len == 0 || len > I2C_SMBUS_BLOCK_MAX)
+					return -EINVAL;
+				outb_p(len, SMBBLKSZ);
+
+				cnt = 1;
+				if (len >= 4) {
+					for (i = cnt; i <= 4; i++) {
+						outb_p(data->block[i], SMBHSTDAT);
+					}
+
+					len -= 4;
+					cnt += 4;
+				}
+				else {
+					for (i = cnt; i <= len; i++ ) {
+						outb_p(data->block[i], SMBHSTDAT);
+					}
+
+					len = 0;
+				}
+
+				outb_p(NCT6793D_WRITE_BLOCK, SMBHSTCMD);
+			}
+			else {
+				return -ENOTSUPP;
+			}
+			break;
+		default:
+			dev_warn(&adap->dev, "Unsupported transaction %d\n", size);
+			return -EOPNOTSUPP;
+	}
+
+	outb_p(NCT6793D_MANUAL_START, SMBHSTCTL);
+
+	while ((size == I2C_SMBUS_BLOCK_DATA) && (len > 0)) {
+		if (read_write == I2C_SMBUS_WRITE) {
+			timeout = 0;
+			while ((inb_p(SMBHSTSTS) & NCT6793D_FIFO_EMPTY) == 0)
+			{
+				if(timeout > MAX_RETRIES)
+				{
+					return -ETIMEDOUT;
+				}
+				usleep_range(250, 500);
+				timeout++;
+			}
+
+			//Load more bytes into FIFO
+			if (len >= 4) {
+				for (i = cnt; i <= (cnt + 4); i++) {
+					outb_p(data->block[i], SMBHSTDAT);
+				}
+
+				len -= 4;
+				cnt += 4;
+			}
+			else {
+				for (i = cnt; i <= (cnt + len); i++) {
+					outb_p(data->block[i], SMBHSTDAT);
+				}
+
+				len = 0;
+			}
+		}
+		else {
+			return -ENOTSUPP;
+		}
+		
+	}
+
+	//wait for manual mode to complete
+	timeout = 0;
+	while ((inb_p(SMBHSTSTS) & NCT6793D_MANUAL_ACTIVE) != 0)
+	{
+		if(timeout > MAX_RETRIES)
+		{
+			return -ETIMEDOUT;
+		}
+		usleep_range(250, 500);
+		timeout++;
+	}
+
+	if ((inb_p(SMBHSTERR) & NCT6793D_NO_ACK) != 0) {    	
+		return -ENXIO;
+	}
+	else if ((read_write == I2C_SMBUS_WRITE) || (size == I2C_SMBUS_QUICK)) {
+		return 0;
+	}
+
+	switch (size) {
+		case I2C_SMBUS_QUICK:
+		case I2C_SMBUS_BYTE_DATA:
+			data->byte = inb_p(SMBHSTDAT);
+			break;
+		case I2C_SMBUS_WORD_DATA:
+			data->word = inb_p(SMBHSTDAT) + (inb_p(SMBHSTDAT) << 8);
+			break;
+	}
+	return 0;
+}
+
+static u32 nct6775_func(struct i2c_adapter *adapter)
+{
+	return I2C_FUNC_SMBUS_QUICK | I2C_FUNC_SMBUS_BYTE |
+	    I2C_FUNC_SMBUS_BYTE_DATA | I2C_FUNC_SMBUS_WORD_DATA |
+	    I2C_FUNC_SMBUS_BLOCK_DATA;
+}
+
+static const struct i2c_algorithm smbus_algorithm = {
+	.smbus_xfer	= nct6775_access,
+	.functionality	= nct6775_func,
+};
+
+static int nct6775_add_adapter(unsigned short smba, const char *name, struct i2c_adapter **padap)
+{
+	struct i2c_adapter *adap;
+	struct i2c_nct6775_adapdata *adapdata;
+	int retval;
+
+	adap = kzalloc(sizeof(*adap), GFP_KERNEL);
+	if (adap == NULL) {
+		return -ENOMEM;
+	}
+
+	adap->owner = THIS_MODULE;
+	adap->class = I2C_CLASS_HWMON | I2C_CLASS_SPD;
+	adap->algo = &smbus_algorithm;
+
+	adapdata = kzalloc(sizeof(*adapdata), GFP_KERNEL);
+	if (adapdata == NULL) {
+		kfree(adap);
+		return -ENOMEM;
+	}
+
+	adapdata->smba = smba;
+
+	snprintf(adap->name, sizeof(adap->name),
+		"SMBus NCT67xx adapter%s at %04x", name, smba);
+
+	i2c_set_adapdata(adap, adapdata);
+
+	retval = i2c_add_adapter(adap);
+	if (retval) {
+		kfree(adapdata);
+		kfree(adap);
+		return retval;
+	}
+
+	*padap = adap;
+	return 0;
+}
+
+static void nct6775_remove_adapter(struct i2c_adapter *adap)
+{
+	struct i2c_nct6775_adapdata *adapdata = i2c_get_adapdata(adap);
+
+	if (adapdata->smba) {
+		i2c_del_adapter(adap);
+		kfree(adapdata);
+		kfree(adap);
+	}
+}
+
+//static SIMPLE_DEV_PM_OPS(nct6775_dev_pm_ops, nct6775_suspend, nct6775_resume);
+
+/*
+ * when Super-I/O functions move to a separate file, the Super-I/O
+ * bus will manage the lifetime of the device and this module will only keep
+ * track of the nct6775 driver. But since we use platform_device_alloc(), we
+ * must keep track of the device
+ */
+static struct platform_device *pdev[2];
+
+static int nct6775_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct nct6775_sio_data *sio_data = dev_get_platdata(dev);
+	struct resource *res;
+
+	res = platform_get_resource(pdev, IORESOURCE_IO, 0);
+	if (!devm_request_region(&pdev->dev, res->start, IOREGION_LENGTH,
+				 DRVNAME))
+		return -EBUSY;
+
+	switch (sio_data->kind) {
+	case nct6791:
+	case nct6792:
+	case nct6793:
+	case nct6795:
+	case nct6796:
+	case nct6798:
+		nct6775_add_adapter(res->start, "", &nct6775_adapter);
+		break;
+	default:
+		return -ENODEV;
+	}
+
+	return 0;
+}
+/*
+static void nct6791_enable_io_mapping(int sioaddr)
+{
+	int val;
+
+	val = superio_inb(sioaddr, NCT6791_REG_HM_IO_SPACE_LOCK_ENABLE);
+	if (val & 0x10) {
+		pr_info("Enabling hardware monitor logical device mappings.\n");
+		superio_outb(sioaddr, NCT6791_REG_HM_IO_SPACE_LOCK_ENABLE,
+			     val & ~0x10);
+	}
+}*/
+
+static struct platform_driver i2c_nct6775_driver = {
+	.driver = {
+		.name	= DRVNAME,
+//		.pm	= &nct6775_dev_pm_ops,
+	},
+	.probe		= nct6775_probe,
+};
+
+static void __exit i2c_nct6775_exit(void)
+{
+	int i;
+
+	if(nct6775_adapter)
+		nct6775_remove_adapter(nct6775_adapter);
+
+	for (i = 0; i < ARRAY_SIZE(pdev); i++) {
+		if (pdev[i])
+			platform_device_unregister(pdev[i]);
+	}
+	platform_driver_unregister(&i2c_nct6775_driver);
+}
+
+/* nct6775_find() looks for a '627 in the Super-I/O config space */
+static int __init nct6775_find(int sioaddr, struct nct6775_sio_data *sio_data)
+{
+	u16 val;
+	int err;
+	int addr;
+
+	err = superio_enter(sioaddr);
+	if (err)
+		return err;
+
+	val = (superio_inb(sioaddr, SIO_REG_DEVID) << 8) |
+		superio_inb(sioaddr, SIO_REG_DEVID + 1);
+
+	switch (val & SIO_ID_MASK) {
+	case SIO_NCT6106_ID:
+		sio_data->kind = nct6106;
+		break;
+	case SIO_NCT6775_ID:
+		sio_data->kind = nct6775;
+		break;
+	case SIO_NCT6776_ID:
+		sio_data->kind = nct6776;
+		break;
+	case SIO_NCT6779_ID:
+		sio_data->kind = nct6779;
+		break;
+	case SIO_NCT6791_ID:
+		sio_data->kind = nct6791;
+		break;
+	case SIO_NCT6792_ID:
+		sio_data->kind = nct6792;
+		break;
+	case SIO_NCT6793_ID:
+		sio_data->kind = nct6793;
+		break;
+	case SIO_NCT6795_ID:
+		sio_data->kind = nct6795;
+		break;
+	case SIO_NCT6796_ID:
+		sio_data->kind = nct6796;
+		break;
+	case SIO_NCT6798_ID:
+		sio_data->kind = nct6798;
+		break;
+	default:
+		if (val != 0xffff)
+			pr_debug("unsupported chip ID: 0x%04x\n", val);
+		superio_exit(sioaddr);
+		return -ENODEV;
+	}
+
+	/* We have a known chip, find the SMBus I/O address */
+	superio_select(sioaddr, NCT6775_LD_SMBUS);
+	val = (superio_inb(sioaddr, SIO_REG_SMBA) << 8)
+	    | superio_inb(sioaddr, SIO_REG_SMBA + 1);
+	addr = val & IOREGION_ALIGNMENT;
+	if (addr == 0) {
+		pr_err("Refusing to enable a Super-I/O device with a base I/O port 0\n");
+		superio_exit(sioaddr);
+		return -ENODEV;
+	}
+
+	//if (sio_data->kind == nct6791 || sio_data->kind == nct6792 ||
+	//    sio_data->kind == nct6793 || sio_data->kind == nct6795 ||
+	//    sio_data->kind == nct6796)
+	//	nct6791_enable_io_mapping(sioaddr);
+
+	superio_exit(sioaddr);
+	pr_info("Found %s or compatible chip at %#x:%#x\n",
+		nct6775_sio_names[sio_data->kind], sioaddr, addr);
+	sio_data->sioreg = sioaddr;
+
+	return addr;
+}
+
+static int __init i2c_nct6775_init(void)
+{
+	int i, err;
+	bool found = false;
+	int address;
+	struct resource res;
+	struct nct6775_sio_data sio_data;
+	int sioaddr[2] = { 0x2e, 0x4e };
+
+	err = platform_driver_register(&i2c_nct6775_driver);
+	if (err)
+		return err;
+
+	/*
+	 * initialize sio_data->kind and sio_data->sioreg.
+	 *
+	 * when Super-I/O functions move to a separate file, the Super-I/O
+	 * driver will probe 0x2e and 0x4e and auto-detect the presence of a
+	 * nct6775 hardware monitor, and call probe()
+	 */
+	for (i = 0; i < ARRAY_SIZE(pdev); i++) {
+		address = nct6775_find(sioaddr[i], &sio_data);
+		if (address <= 0)
+			continue;
+
+		found = true;
+
+		pdev[i] = platform_device_alloc(DRVNAME, address);
+		if (!pdev[i]) {
+			err = -ENOMEM;
+			goto exit_device_unregister;
+		}
+
+		err = platform_device_add_data(pdev[i], &sio_data,
+					       sizeof(struct nct6775_sio_data));
+		if (err)
+			goto exit_device_put;
+
+		memset(&res, 0, sizeof(res));
+		res.name = DRVNAME;
+		res.start = address;
+		res.end = address + IOREGION_LENGTH - 1;
+		res.flags = IORESOURCE_IO;
+
+		err = acpi_check_resource_conflict(&res);
+		if (err) {
+			platform_device_put(pdev[i]);
+			pdev[i] = NULL;
+			continue;
+		}
+
+		err = platform_device_add_resources(pdev[i], &res, 1);
+		if (err)
+			goto exit_device_put;
+
+		/* platform_device_add calls probe() */
+		err = platform_device_add(pdev[i]);
+		if (err)
+			goto exit_device_put;
+	}
+	if (!found) {
+		err = -ENODEV;
+		goto exit_unregister;
+	}
+
+	return 0;
+
+exit_device_put:
+	platform_device_put(pdev[i]);
+exit_device_unregister:
+	while (--i >= 0) {
+		if (pdev[i])
+			platform_device_unregister(pdev[i]);
+	}
+exit_unregister:
+	platform_driver_unregister(&i2c_nct6775_driver);
+	return err;
+}
+
+MODULE_AUTHOR("Adam Honse <calcprogrammer1@gmail.com>");
+MODULE_DESCRIPTION("SMBus driver for NCT6775F and compatible chips");
+MODULE_LICENSE("GPL");
+
+module_init(i2c_nct6775_init);
+module_exit(i2c_nct6775_exit);
diff --git a/drivers/i2c/busses/i2c-piix4.c b/drivers/i2c/busses/i2c-piix4.c
index 809fbd014cd6..d54b35b147ee 100644
--- a/drivers/i2c/busses/i2c-piix4.c
+++ b/drivers/i2c/busses/i2c-piix4.c
@@ -568,11 +568,11 @@ static int piix4_transaction(struct i2c_adapter *piix4_adapter)
 	if (srvrworks_csb5_delay) /* Extra delay for SERVERWORKS_CSB5 */
 		usleep_range(2000, 2100);
 	else
-		usleep_range(250, 500);
+		usleep_range(25, 50);
 
 	while ((++timeout < MAX_TIMEOUT) &&
 	       ((temp = inb_p(SMBHSTSTS)) & 0x01))
-		usleep_range(250, 500);
+		usleep_range(25, 50);
 
 	/* If the SMBus is still busy, we give up */
 	if (timeout == MAX_TIMEOUT) {
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index dc0463bf3c2c..e6121b9a105e 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -3241,6 +3241,11 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 			goto bad;
 	}
 
+#ifdef CONFIG_CACHY
+	set_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags);
+	set_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags);
+#endif
+
 	ret = crypt_ctr_cipher(ti, argv[0], argv[1]);
 	if (ret < 0)
 		goto bad;
diff --git a/drivers/pci/controller/Makefile b/drivers/pci/controller/Makefile
index 37c8663de7fe..897d19f92ede 100644
--- a/drivers/pci/controller/Makefile
+++ b/drivers/pci/controller/Makefile
@@ -1,4 +1,10 @@
 # SPDX-License-Identifier: GPL-2.0
+ifdef CONFIG_X86_64
+ifdef CONFIG_SATA_AHCI
+obj-y += intel-nvme-remap.o
+endif
+endif
+
 obj-$(CONFIG_PCIE_CADENCE) += cadence/
 obj-$(CONFIG_PCI_FTPCI100) += pci-ftpci100.o
 obj-$(CONFIG_PCI_IXP4XX) += pci-ixp4xx.o
diff --git a/drivers/pci/controller/intel-nvme-remap.c b/drivers/pci/controller/intel-nvme-remap.c
new file mode 100644
index 000000000000..e105e6f5cc91
--- /dev/null
+++ b/drivers/pci/controller/intel-nvme-remap.c
@@ -0,0 +1,462 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Intel remapped NVMe device support.
+ *
+ * Copyright (c) 2019 Endless Mobile, Inc.
+ * Author: Daniel Drake <drake@endlessm.com>
+ *
+ * Some products ship by default with the SATA controller in "RAID" or
+ * "Intel RST Premium With Intel Optane System Acceleration" mode. Under this
+ * mode, which we refer to as "remapped NVMe" mode, any installed NVMe
+ * devices disappear from the PCI bus, and instead their I/O memory becomes
+ * available within the AHCI device BARs.
+ *
+ * This scheme is understood to be a way of avoiding usage of the standard
+ * Windows NVMe driver under that OS, instead mandating usage of Intel's
+ * driver instead, which has better power management, and presumably offers
+ * some RAID/disk-caching solutions too.
+ *
+ * Here in this driver, we support the remapped NVMe mode by claiming the
+ * AHCI device and creating a fake PCIe root port. On the new bus, the
+ * original AHCI device is exposed with only minor tweaks. Then, fake PCI
+ * devices corresponding to the remapped NVMe devices are created. The usual
+ * ahci and nvme drivers are then expected to bind to these devices and
+ * operate as normal.
+ *
+ * The PCI configuration space for the NVMe devices is completely
+ * unavailable, so we fake a minimal one and hope for the best.
+ *
+ * Interrupts are shared between the AHCI and NVMe devices. For simplicity,
+ * we only support the legacy interrupt here, although MSI support
+ * could potentially be added later.
+ */
+
+#define MODULE_NAME "intel-nvme-remap"
+
+#include <linux/ahci-remap.h>
+#include <linux/irq.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+
+#define AHCI_PCI_BAR_STANDARD 5
+
+struct nvme_remap_dev {
+	struct pci_dev		*dev;		/* AHCI device */
+	struct pci_bus		*bus;		/* our fake PCI bus */
+	struct pci_sysdata	sysdata;
+	int			irq_base;	/* our fake interrupts */
+
+	/*
+	 * When we detect an all-ones write to a BAR register, this flag
+	 * is set, so that we return the BAR size on the next read (a
+	 * standard PCI behaviour).
+	 * This includes the assumption that an all-ones BAR write is
+	 * immediately followed by a read of the same register.
+	 */
+	bool			bar_sizing;
+
+	/*
+	 * Resources copied from the AHCI device, to be regarded as
+	 * resources on our fake bus.
+	 */
+	struct resource		ahci_resources[PCI_NUM_RESOURCES];
+
+	/* Resources corresponding to the NVMe devices. */
+	struct resource		remapped_dev_mem[AHCI_MAX_REMAP];
+
+	/* Number of remapped NVMe devices found. */
+	int			num_remapped_devices;
+};
+
+static inline struct nvme_remap_dev *nrdev_from_bus(struct pci_bus *bus)
+{
+	return container_of(bus->sysdata, struct nvme_remap_dev, sysdata);
+}
+
+
+/******** PCI configuration space **********/
+
+/*
+ * Helper macros for tweaking returned contents of PCI configuration space.
+ *
+ * value contains len bytes of data read from reg.
+ * If fixup_reg is included in that range, fix up the contents of that
+ * register to fixed_value.
+ */
+#define NR_FIX8(fixup_reg, fixed_value) do { \
+		if (reg <= fixup_reg && fixup_reg < reg + len) \
+			((u8 *) value)[fixup_reg - reg] = (u8) (fixed_value); \
+	} while (0)
+
+#define NR_FIX16(fixup_reg, fixed_value) do { \
+		NR_FIX8(fixup_reg, fixed_value); \
+		NR_FIX8(fixup_reg + 1, fixed_value >> 8); \
+	} while (0)
+
+#define NR_FIX24(fixup_reg, fixed_value) do { \
+		NR_FIX8(fixup_reg, fixed_value); \
+		NR_FIX8(fixup_reg + 1, fixed_value >> 8); \
+		NR_FIX8(fixup_reg + 2, fixed_value >> 16); \
+	} while (0)
+
+#define NR_FIX32(fixup_reg, fixed_value) do { \
+		NR_FIX16(fixup_reg, (u16) fixed_value); \
+		NR_FIX16(fixup_reg + 2, fixed_value >> 16); \
+	} while (0)
+
+/*
+ * Read PCI config space of the slot 0 (AHCI) device.
+ * We pass through the read request to the underlying device, but
+ * tweak the results in some cases.
+ */
+static int nvme_remap_pci_read_slot0(struct pci_bus *bus, int reg,
+				     int len, u32 *value)
+{
+	struct nvme_remap_dev *nrdev = nrdev_from_bus(bus);
+	struct pci_bus *ahci_dev_bus = nrdev->dev->bus;
+	int ret;
+
+	ret = ahci_dev_bus->ops->read(ahci_dev_bus, nrdev->dev->devfn,
+				      reg, len, value);
+	if (ret)
+		return ret;
+
+	/*
+	 * Adjust the device class, to prevent this driver from attempting to
+	 * additionally probe the device we're simulating here.
+	 */
+	NR_FIX24(PCI_CLASS_PROG, PCI_CLASS_STORAGE_SATA_AHCI);
+
+	/*
+	 * Unset interrupt pin, otherwise ACPI tries to find routing
+	 * info for our virtual IRQ, fails, and complains.
+	 */
+	NR_FIX8(PCI_INTERRUPT_PIN, 0);
+
+	/*
+	 * Truncate the AHCI BAR to not include the region that covers the
+	 * hidden devices. This will cause the ahci driver to successfully
+	 * probe th new device (instead of handing it over to this driver).
+	 */
+	if (nrdev->bar_sizing) {
+		NR_FIX32(PCI_BASE_ADDRESS_5, ~(SZ_16K - 1));
+		nrdev->bar_sizing = false;
+	}
+
+	return PCIBIOS_SUCCESSFUL;
+}
+
+/*
+ * Read PCI config space of a remapped device.
+ * Since the original PCI config space is inaccessible, we provide a minimal,
+ * fake config space instead.
+ */
+static int nvme_remap_pci_read_remapped(struct pci_bus *bus, unsigned int port,
+					int reg, int len, u32 *value)
+{
+	struct nvme_remap_dev *nrdev = nrdev_from_bus(bus);
+	struct resource *remapped_mem;
+
+	if (port > nrdev->num_remapped_devices)
+		return PCIBIOS_DEVICE_NOT_FOUND;
+
+	*value = 0;
+	remapped_mem = &nrdev->remapped_dev_mem[port - 1];
+
+	/* Set a Vendor ID, otherwise Linux assumes no device is present */
+	NR_FIX16(PCI_VENDOR_ID, PCI_VENDOR_ID_INTEL);
+
+	/* Always appear on & bus mastering */
+	NR_FIX16(PCI_COMMAND, PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER);
+
+	/* Set class so that nvme driver probes us */
+	NR_FIX24(PCI_CLASS_PROG, PCI_CLASS_STORAGE_EXPRESS);
+
+	if (nrdev->bar_sizing) {
+		NR_FIX32(PCI_BASE_ADDRESS_0,
+			 ~(resource_size(remapped_mem) - 1));
+		nrdev->bar_sizing = false;
+	} else {
+		resource_size_t mem_start = remapped_mem->start;
+
+		mem_start |= PCI_BASE_ADDRESS_MEM_TYPE_64;
+		NR_FIX32(PCI_BASE_ADDRESS_0, mem_start);
+		mem_start >>= 32;
+		NR_FIX32(PCI_BASE_ADDRESS_1, mem_start);
+	}
+
+	return PCIBIOS_SUCCESSFUL;
+}
+
+/* Read PCI configuration space. */
+static int nvme_remap_pci_read(struct pci_bus *bus, unsigned int devfn,
+			       int reg, int len, u32 *value)
+{
+	if (PCI_SLOT(devfn) == 0)
+		return nvme_remap_pci_read_slot0(bus, reg, len, value);
+	else
+		return nvme_remap_pci_read_remapped(bus, PCI_SLOT(devfn),
+						    reg, len, value);
+}
+
+/*
+ * Write PCI config space of the slot 0 (AHCI) device.
+ * Apart from the special case of BAR sizing, we disable all writes.
+ * Otherwise, the ahci driver could make changes (e.g. unset PCI bus master)
+ * that would affect the operation of the NVMe devices.
+ */
+static int nvme_remap_pci_write_slot0(struct pci_bus *bus, int reg,
+				      int len, u32 value)
+{
+	struct nvme_remap_dev *nrdev = nrdev_from_bus(bus);
+	struct pci_bus *ahci_dev_bus = nrdev->dev->bus;
+
+	if (reg >= PCI_BASE_ADDRESS_0 && reg <= PCI_BASE_ADDRESS_5) {
+		/*
+		 * Writing all-ones to a BAR means that the size of the
+		 * memory region is being checked. Flag this so that we can
+		 * reply with an appropriate size on the next read.
+		 */
+		if (value == ~0)
+			nrdev->bar_sizing = true;
+
+		return ahci_dev_bus->ops->write(ahci_dev_bus,
+						nrdev->dev->devfn,
+						reg, len, value);
+	}
+
+	return PCIBIOS_SET_FAILED;
+}
+
+/*
+ * Write PCI config space of a remapped device.
+ * Since the original PCI config space is inaccessible, we reject all
+ * writes, except for the special case of BAR probing.
+ */
+static int nvme_remap_pci_write_remapped(struct pci_bus *bus,
+					 unsigned int port,
+					 int reg, int len, u32 value)
+{
+	struct nvme_remap_dev *nrdev = nrdev_from_bus(bus);
+
+	if (port > nrdev->num_remapped_devices)
+		return PCIBIOS_DEVICE_NOT_FOUND;
+
+	/*
+	 * Writing all-ones to a BAR means that the size of the memory
+	 * region is being checked. Flag this so that we can reply with
+	 * an appropriate size on the next read.
+	 */
+	if (value == ~0 && reg >= PCI_BASE_ADDRESS_0
+			&& reg <= PCI_BASE_ADDRESS_5) {
+		nrdev->bar_sizing = true;
+		return PCIBIOS_SUCCESSFUL;
+	}
+
+	return PCIBIOS_SET_FAILED;
+}
+
+/* Write PCI configuration space. */
+static int nvme_remap_pci_write(struct pci_bus *bus, unsigned int devfn,
+				int reg, int len, u32 value)
+{
+	if (PCI_SLOT(devfn) == 0)
+		return nvme_remap_pci_write_slot0(bus, reg, len, value);
+	else
+		return nvme_remap_pci_write_remapped(bus, PCI_SLOT(devfn),
+						     reg, len, value);
+}
+
+static struct pci_ops nvme_remap_pci_ops = {
+	.read	= nvme_remap_pci_read,
+	.write	= nvme_remap_pci_write,
+};
+
+
+/******** Initialization & exit **********/
+
+/*
+ * Find a PCI domain ID to use for our fake bus.
+ * Start at 0x10000 to not clash with ACPI _SEG domains (16 bits).
+ */
+static int find_free_domain(void)
+{
+	int domain = 0xffff;
+	struct pci_bus *bus = NULL;
+
+	while ((bus = pci_find_next_bus(bus)) != NULL)
+		domain = max_t(int, domain, pci_domain_nr(bus));
+
+	return domain + 1;
+}
+
+static int find_remapped_devices(struct nvme_remap_dev *nrdev,
+				 struct list_head *resources)
+{
+	void __iomem *mmio;
+	int i, count = 0;
+	u32 cap;
+
+	mmio = pcim_iomap(nrdev->dev, AHCI_PCI_BAR_STANDARD,
+			  pci_resource_len(nrdev->dev,
+					   AHCI_PCI_BAR_STANDARD));
+	if (!mmio)
+		return -ENODEV;
+
+	/* Check if this device might have remapped nvme devices. */
+	if (pci_resource_len(nrdev->dev, AHCI_PCI_BAR_STANDARD) < SZ_512K ||
+	    !(readl(mmio + AHCI_VSCAP) & 1))
+		return -ENODEV;
+
+	cap = readq(mmio + AHCI_REMAP_CAP);
+	for (i = AHCI_MAX_REMAP-1; i >= 0; i--) {
+		struct resource *remapped_mem;
+
+		if ((cap & (1 << i)) == 0)
+			continue;
+		if (readl(mmio + ahci_remap_dcc(i))
+				!= PCI_CLASS_STORAGE_EXPRESS)
+			continue;
+
+		/* We've found a remapped device */
+		remapped_mem = &nrdev->remapped_dev_mem[count++];
+		remapped_mem->start =
+			pci_resource_start(nrdev->dev, AHCI_PCI_BAR_STANDARD)
+			+ ahci_remap_base(i);
+		remapped_mem->end = remapped_mem->start
+			+ AHCI_REMAP_N_SIZE - 1;
+		remapped_mem->flags = IORESOURCE_MEM | IORESOURCE_PCI_FIXED;
+		pci_add_resource(resources, remapped_mem);
+	}
+
+	pcim_iounmap(nrdev->dev, mmio);
+
+	if (count == 0)
+		return -ENODEV;
+
+	nrdev->num_remapped_devices = count;
+	dev_info(&nrdev->dev->dev, "Found %d remapped NVMe devices\n",
+		 nrdev->num_remapped_devices);
+	return 0;
+}
+
+static void nvme_remap_remove_root_bus(void *data)
+{
+	struct pci_bus *bus = data;
+
+	pci_stop_root_bus(bus);
+	pci_remove_root_bus(bus);
+}
+
+static int nvme_remap_probe(struct pci_dev *dev,
+			    const struct pci_device_id *id)
+{
+	struct nvme_remap_dev *nrdev;
+	LIST_HEAD(resources);
+	int i;
+	int ret;
+	struct pci_dev *child;
+
+	nrdev = devm_kzalloc(&dev->dev, sizeof(*nrdev), GFP_KERNEL);
+	nrdev->sysdata.domain = find_free_domain();
+	nrdev->sysdata.nvme_remap_dev = dev;
+	nrdev->dev = dev;
+	pci_set_drvdata(dev, nrdev);
+
+	ret = pcim_enable_device(dev);
+	if (ret < 0)
+		return ret;
+
+	pci_set_master(dev);
+
+	ret = find_remapped_devices(nrdev, &resources);
+	if (ret)
+		return ret;
+
+	/* Add resources from the original AHCI device */
+	for (i = 0; i < PCI_NUM_RESOURCES; i++) {
+		struct resource *res = &dev->resource[i];
+
+		if (res->start) {
+			struct resource *nr_res = &nrdev->ahci_resources[i];
+
+			nr_res->start = res->start;
+			nr_res->end = res->end;
+			nr_res->flags = res->flags;
+			pci_add_resource(&resources, nr_res);
+		}
+	}
+
+	/* Create virtual interrupts */
+	nrdev->irq_base = devm_irq_alloc_descs(&dev->dev, -1, 0,
+					       nrdev->num_remapped_devices + 1,
+					       0);
+	if (nrdev->irq_base < 0)
+		return nrdev->irq_base;
+
+	/* Create and populate PCI bus */
+	nrdev->bus = pci_create_root_bus(&dev->dev, 0, &nvme_remap_pci_ops,
+					 &nrdev->sysdata, &resources);
+	if (!nrdev->bus)
+		return -ENODEV;
+
+	if (devm_add_action_or_reset(&dev->dev, nvme_remap_remove_root_bus,
+				     nrdev->bus))
+		return -ENOMEM;
+
+	/* We don't support sharing MSI interrupts between these devices */
+	nrdev->bus->bus_flags |= PCI_BUS_FLAGS_NO_MSI;
+
+	pci_scan_child_bus(nrdev->bus);
+
+	list_for_each_entry(child, &nrdev->bus->devices, bus_list) {
+		/*
+		 * Prevent PCI core from trying to move memory BARs around.
+		 * The hidden NVMe devices are at fixed locations.
+		 */
+		for (i = 0; i < PCI_NUM_RESOURCES; i++) {
+			struct resource *res = &child->resource[i];
+
+			if (res->flags & IORESOURCE_MEM)
+				res->flags |= IORESOURCE_PCI_FIXED;
+		}
+
+		/* Share the legacy IRQ between all devices */
+		child->irq = dev->irq;
+	}
+
+	pci_assign_unassigned_bus_resources(nrdev->bus);
+	pci_bus_add_devices(nrdev->bus);
+
+	return 0;
+}
+
+static const struct pci_device_id nvme_remap_ids[] = {
+	/*
+	 * Match all Intel RAID controllers.
+	 *
+	 * There's overlap here with the set of devices detected by the ahci
+	 * driver, but ahci will only successfully probe when there
+	 * *aren't* any remapped NVMe devices, and this driver will only
+	 * successfully probe when there *are* remapped NVMe devices that
+	 * need handling.
+	 */
+	{
+		PCI_VDEVICE(INTEL, PCI_ANY_ID),
+		.class = PCI_CLASS_STORAGE_RAID << 8,
+		.class_mask = 0xffffff00,
+	},
+	{0,}
+};
+MODULE_DEVICE_TABLE(pci, nvme_remap_ids);
+
+static struct pci_driver nvme_remap_drv = {
+	.name		= MODULE_NAME,
+	.id_table	= nvme_remap_ids,
+	.probe		= nvme_remap_probe,
+};
+module_pci_driver(nvme_remap_drv);
+
+MODULE_AUTHOR("Daniel Drake <drake@endlessm.com>");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index 321156ca273d..5dda26c737e2 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -3718,6 +3718,106 @@ static void quirk_no_bus_reset(struct pci_dev *dev)
 	dev->dev_flags |= PCI_DEV_FLAGS_NO_BUS_RESET;
 }
 
+static bool acs_on_downstream;
+static bool acs_on_multifunction;
+
+#define NUM_ACS_IDS 16
+struct acs_on_id {
+	unsigned short vendor;
+	unsigned short device;
+};
+static struct acs_on_id acs_on_ids[NUM_ACS_IDS];
+static u8 max_acs_id;
+
+static __init int pcie_acs_override_setup(char *p)
+{
+	if (!p)
+		return -EINVAL;
+
+	while (*p) {
+		if (!strncmp(p, "downstream", 10))
+			acs_on_downstream = true;
+		if (!strncmp(p, "multifunction", 13))
+			acs_on_multifunction = true;
+		if (!strncmp(p, "id:", 3)) {
+			char opt[5];
+			int ret;
+			long val;
+
+			if (max_acs_id >= NUM_ACS_IDS - 1) {
+				pr_warn("Out of PCIe ACS override slots (%d)\n",
+						NUM_ACS_IDS);
+				goto next;
+			}
+
+			p += 3;
+			snprintf(opt, 5, "%s", p);
+			ret = kstrtol(opt, 16, &val);
+			if (ret) {
+				pr_warn("PCIe ACS ID parse error %d\n", ret);
+				goto next;
+			}
+			acs_on_ids[max_acs_id].vendor = val;
+
+			p += strcspn(p, ":");
+			if (*p != ':') {
+				pr_warn("PCIe ACS invalid ID\n");
+				goto next;
+			}
+
+			p++;
+			snprintf(opt, 5, "%s", p);
+			ret = kstrtol(opt, 16, &val);
+			if (ret) {
+				pr_warn("PCIe ACS ID parse error %d\n", ret);
+				goto next;
+			}
+			acs_on_ids[max_acs_id].device = val;
+			max_acs_id++;
+		}
+next:
+		p += strcspn(p, ",");
+		if (*p == ',')
+			p++;
+	}
+
+	if (acs_on_downstream || acs_on_multifunction || max_acs_id)
+		pr_warn("Warning: PCIe ACS overrides enabled; This may allow non-IOMMU protected peer-to-peer DMA\n");
+
+	return 0;
+}
+early_param("pcie_acs_override", pcie_acs_override_setup);
+
+static int pcie_acs_overrides(struct pci_dev *dev, u16 acs_flags)
+{
+	int i;
+
+	/* Never override ACS for legacy devices or devices with ACS caps */
+	if (!pci_is_pcie(dev) ||
+		pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ACS))
+			return -ENOTTY;
+
+	for (i = 0; i < max_acs_id; i++)
+		if (acs_on_ids[i].vendor == dev->vendor &&
+			acs_on_ids[i].device == dev->device)
+				return 1;
+
+	switch (pci_pcie_type(dev)) {
+	case PCI_EXP_TYPE_DOWNSTREAM:
+	case PCI_EXP_TYPE_ROOT_PORT:
+		if (acs_on_downstream)
+			return 1;
+		break;
+	case PCI_EXP_TYPE_ENDPOINT:
+	case PCI_EXP_TYPE_UPSTREAM:
+	case PCI_EXP_TYPE_LEG_END:
+	case PCI_EXP_TYPE_RC_END:
+		if (acs_on_multifunction && dev->multifunction)
+			return 1;
+	}
+
+	return -ENOTTY;
+}
 /*
  * Some NVIDIA GPU devices do not work with bus reset, SBR needs to be
  * prevented for those affected devices.
@@ -5112,6 +5212,7 @@ static const struct pci_dev_acs_enabled {
 	{ PCI_VENDOR_ID_ZHAOXIN, PCI_ANY_ID, pci_quirk_zhaoxin_pcie_ports_acs },
 	/* Wangxun nics */
 	{ PCI_VENDOR_ID_WANGXUN, PCI_ANY_ID, pci_quirk_wangxun_nic_acs },
+	{ PCI_ANY_ID, PCI_ANY_ID, pcie_acs_overrides },
 	{ 0 }
 };
 
diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig
index 49c2c4cd8d00..956f4eff85b5 100644
--- a/drivers/platform/x86/Kconfig
+++ b/drivers/platform/x86/Kconfig
@@ -643,6 +643,16 @@ config THINKPAD_LMI
 	  To compile this driver as a module, choose M here: the module will
 	  be called think-lmi.
 
+config LEGION_LAPTOP
+	tristate "Lenovo Legion Laptop Extras"
+	depends on ACPI
+	depends on ACPI_WMI || ACPI_WMI = n
+	depends on HWMON || HWMON = n
+	select ACPI_PLATFORM_PROFILE
+	help
+	  This is a driver for Lenovo Legion laptops and contains drivers for
+	  hotkey, fan control, and power mode.
+
 source "drivers/platform/x86/intel/Kconfig"
 
 config MSI_EC
@@ -1094,6 +1104,20 @@ config WINMATE_FM07_KEYS
 	  buttons below the display. This module adds an input device
 	  that delivers key events when these buttons are pressed.
 
+config STEAMDECK
+	tristate "Valve Steam Deck platform driver"
+	depends on X86_64
+	help
+	  Driver exposing various bits and pieces of functionality
+	  provided by Steam Deck specific VLV0100 device presented by
+	  EC firmware. This includes but not limited to:
+	     - CPU/device's fan control
+	     - Read-only access to DDIC registers
+	     - Battery tempreature measurements
+	     - Various display related control knobs
+	     - USB Type-C connector event notification
+	  Say N unless you are running on a Steam Deck.
+
 endif # X86_PLATFORM_DEVICES
 
 config P2SB
diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile
index 52dfdf574ac2..71f65ef04f9e 100644
--- a/drivers/platform/x86/Makefile
+++ b/drivers/platform/x86/Makefile
@@ -65,6 +65,7 @@ obj-$(CONFIG_LENOVO_YMC)	+= lenovo-ymc.o
 obj-$(CONFIG_SENSORS_HDAPS)	+= hdaps.o
 obj-$(CONFIG_THINKPAD_ACPI)	+= thinkpad_acpi.o
 obj-$(CONFIG_THINKPAD_LMI)	+= think-lmi.o
+obj-$(CONFIG_LEGION_LAPTOP)	+= legion-laptop.o
 obj-$(CONFIG_YOGABOOK)		+= lenovo-yogabook.o
 
 # Intel
@@ -135,3 +136,6 @@ obj-$(CONFIG_SIEMENS_SIMATIC_IPC)	+= simatic-ipc.o
 
 # Winmate
 obj-$(CONFIG_WINMATE_FM07_KEYS)		+= winmate-fm07-keys.o
+
+# Steam Deck
+obj-$(CONFIG_STEAMDECK)			+= steamdeck.o
diff --git a/drivers/platform/x86/legion-laptop.c b/drivers/platform/x86/legion-laptop.c
new file mode 100644
index 000000000000..7275105071d2
--- /dev/null
+++ b/drivers/platform/x86/legion-laptop.c
@@ -0,0 +1,5858 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ *  legion-laptop.c - Extra Lenovo Legion laptop support, in
+ *   particular for fan curve control and power mode.
+ *
+ *  Copyright (C) 2022 johnfan <johnfan (at) example (dot) com>
+ *
+ *
+ *  This driver might work on other Lenovo Legion models. If you
+ *  want to try it you can pass force=1 as argument
+ *  to the module which will force it to load even when the DMI
+ *  data doesn't match the model AND FIRMWARE.
+ *
+ *  Support for other hardware of this model is already partially
+ *  provided by the module ideapd-laptop.
+ *
+ *  The development page for this driver is located at
+ *  https://github.com/johnfanv2/LenovoLegionLinux
+ *
+ *  This driver exports the files:
+ *    - /sys/kernel/debug/legion/fancurve (ro)
+ *        The fan curve in the form stored in the firmware in an
+ *        human readable table.
+ *
+ *    - /sys/module/legion_laptop/drivers/platform\:legion/PNP0C09\:00/powermode (rw)
+ *       0: balanced mode (white)
+ *       1: performance mode (red)
+ *       2: quiet mode (blue)
+ *       ?: custom mode (pink)
+ *
+ *  NOTE: Writing to this will load the default fan curve from
+ *        the firmware for this mode, so the fan curve might
+ *        have to be reconfigured if needed.
+ *
+ *  It implements the usual hwmon interface to monitor fan speed and temmperature
+ *  and allows to set the fan curve inside the firware.
+ *
+ *    - /sys/class/hwmon/X/fan1_input or /sys/class/hwmon/X/fan2_input  (ro)
+ *        Current fan speed of fan1/fan2.
+ *    - /sys/class/hwmon/X/temp1_input (ro)
+ *    - /sys/class/hwmon/X/temp2_input (ro)
+ *    - /sys/class/hwmon/X/temp3_input (ro)
+ *        Temperature (Celsius) of CPU, GPU, and IC used for fan control.
+ *    - /sys/class/hwmon/X/pwmY_auto_pointZ_pwm (rw)
+ *          PWM (0-255) of the fan at the Y-level in the fan curve
+ *    - /sys/class/hwmon/X/pwmY_auto_pointZ_temp (rw)
+ *          upper temperature of tempZ (CPU, GPU, or IC) at the Y-level in the fan curve
+ *    - /sys/class/hwmon/X/pwmY_auto_pointZ_temp_hyst (rw)
+ *          hysteris (CPU, GPU, or IC) at the Y-level in the fan curve. The lower
+ *          temperatue of the level is the upper temperature minus the hysteris
+ *
+ *
+ *  Credits for reverse engineering the firmware to:
+ *      - David Woodhouse: heavily inspired by lenovo_laptop.c
+ *      - Luke Cama: Windows version "LegionFanControl"
+ *      - SmokelessCPU: reverse engineering of custom registers in EC
+ *                      and commincation method with EC via ports
+ *      - 0x1F9F1: additional reverse engineering for complete fan curve
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/acpi.h>
+#include <asm/io.h>
+#include <linux/debugfs.h>
+#include <linux/delay.h>
+#include <linux/dmi.h>
+#include <linux/leds.h>
+#include <linux/hwmon.h>
+#include <linux/hwmon-sysfs.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/platform_device.h>
+#include <linux/platform_profile.h>
+#include <linux/types.h>
+#include <linux/wmi.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("johnfan");
+MODULE_DESCRIPTION("Lenovo Legion laptop extras");
+
+static bool force;
+module_param(force, bool, 0440);
+MODULE_PARM_DESC(
+	force,
+	"Force loading this module even if model or BIOS does not match.");
+
+static bool ec_readonly;
+module_param(ec_readonly, bool, 0440);
+MODULE_PARM_DESC(
+	ec_readonly,
+	"Only read from embedded controller but do not write or change settings.");
+
+static bool enable_platformprofile = true;
+module_param(enable_platformprofile, bool, 0440);
+MODULE_PARM_DESC(
+	enable_platformprofile,
+	"Enable the platform profile sysfs API to read and write the power mode.");
+
+#define LEGIONFEATURES \
+	"fancurve powermode platformprofile platformprofilenotify minifancurve"
+
+//Size of fancurve stored in embedded controller
+#define MAXFANCURVESIZE 10
+
+#define LEGION_DRVR_SHORTNAME "legion"
+#define LEGION_HWMON_NAME LEGION_DRVR_SHORTNAME "_hwmon"
+
+struct legion_private;
+
+/* =============================== */
+/* Embedded Controller Description */
+/* =============================== */
+
+/* The configuration and registers to access the embedded controller
+ * depending on different the version of the software on the
+ * embedded controller or and the BIOS/UEFI firmware.
+ *
+ * To control fan curve in the embedded controller (EC) one has to
+ * write to its "RAM". There are different possibilities:
+ *  - EC RAM is memory mapped (write to it with ioremap)
+ *  - access EC RAM via ported mapped IO (outb/inb)
+ *  - access EC RAM via ACPI methods. It is only possible to write
+ *    to part of it (first 0xFF bytes?)
+ *
+ * In later models the firmware directly exposes ACPI methods to
+ * set the fan curve direclty, without writing to EC RAM. This
+ * is done inside the ACPI method.
+ */
+
+/**
+ * Offsets for interseting values inside the EC RAM  (0 = start of
+ * EC RAM. These might change depending on the software inside of
+ * the EC, which can be updated by a BIOS update from Lenovo.
+ */
+// TODO: same order as in initialization
+struct ec_register_offsets {
+	// Super I/O Configuration Registers
+	// 7.15 General Control (GCTRL)
+	// General Control (GCTRL)
+	// (see EC Interface Registers  and 6.2 Plug and Play Configuration (PNPCFG)) in datasheet
+	// note: these are in two places saved
+	// in EC Interface Registers  and in super io configuraion registers
+	// Chip ID
+	u16 ECHIPID1;
+	u16 ECHIPID2;
+	// Chip Version
+	u16 ECHIPVER;
+	u16 ECDEBUG;
+
+	// Lenovo Custom OEM extension
+	// Firmware of ITE can be extended by
+	// custom program using its own "variables"
+	// These are the offsets to these "variables"
+	u16 EXT_FAN_CUR_POINT;
+	u16 EXT_FAN_POINTS_SIZE;
+	u16 EXT_FAN1_BASE;
+	u16 EXT_FAN2_BASE;
+	u16 EXT_FAN_ACC_BASE;
+	u16 EXT_FAN_DEC_BASE;
+	u16 EXT_CPU_TEMP;
+	u16 EXT_CPU_TEMP_HYST;
+	u16 EXT_GPU_TEMP;
+	u16 EXT_GPU_TEMP_HYST;
+	u16 EXT_VRM_TEMP;
+	u16 EXT_VRM_TEMP_HYST;
+	u16 EXT_FAN1_RPM_LSB;
+	u16 EXT_FAN1_RPM_MSB;
+	u16 EXT_FAN2_RPM_LSB;
+	u16 EXT_FAN2_RPM_MSB;
+	u16 EXT_FAN1_TARGET_RPM;
+	u16 EXT_FAN2_TARGET_RPM;
+	u16 EXT_POWERMODE;
+	u16 EXT_MINIFANCURVE_ON_COOL;
+	// values
+	// 0x04: enable mini fan curve if very long on cool level
+	//      - this might be due to potential temp failure
+	//      - or just because really so cool
+	// 0xA0: disable it
+	u16 EXT_LOCKFANCONTROLLER;
+	u16 EXT_MAXIMUMFANSPEED;
+	u16 EXT_WHITE_KEYBOARD_BACKLIGHT;
+	u16 EXT_IC_TEMP_INPUT;
+	u16 EXT_CPU_TEMP_INPUT;
+	u16 EXT_GPU_TEMP_INPUT;
+};
+
+enum access_method {
+	ACCESS_METHOD_NO_ACCESS = 0,
+	ACCESS_METHOD_EC = 1,
+	ACCESS_METHOD_ACPI = 2,
+	ACCESS_METHOD_WMI = 3,
+	ACCESS_METHOD_WMI2 = 4,
+	ACCESS_METHOD_WMI3 = 5,
+	ACCESS_METHOD_EC2 = 10, // ideapad fancurve method
+};
+
+struct model_config {
+	const struct ec_register_offsets *registers;
+	bool check_embedded_controller_id;
+	u16 embedded_controller_id;
+
+	// first addr in EC we access/scan
+	phys_addr_t memoryio_physical_ec_start;
+	size_t memoryio_size;
+
+	// TODO: maybe use bitfield
+	bool has_minifancurve;
+	bool has_custom_powermode;
+	enum access_method access_method_powermode;
+
+	enum access_method access_method_keyboard;
+	enum access_method access_method_temperature;
+	enum access_method access_method_fanspeed;
+	enum access_method access_method_fancurve;
+	enum access_method access_method_fanfullspeed;
+	bool three_state_keyboard;
+
+	bool acpi_check_dev;
+
+	phys_addr_t ramio_physical_start;
+	size_t ramio_size;
+};
+
+/* =================================== */
+/* Configuration for different models */
+/* =================================== */
+
+// Idea by SmokelesssCPU (modified)
+// - all default names and register addresses are supported by datasheet
+// - register addresses for custom firmware by SmokelesssCPU
+static const struct ec_register_offsets ec_register_offsets_v0 = {
+	.ECHIPID1 = 0x2000,
+	.ECHIPID2 = 0x2001,
+	.ECHIPVER = 0x2002,
+	.ECDEBUG = 0x2003,
+	.EXT_FAN_CUR_POINT = 0xC534,
+	.EXT_FAN_POINTS_SIZE = 0xC535,
+	.EXT_FAN1_BASE = 0xC540,
+	.EXT_FAN2_BASE = 0xC550,
+	.EXT_FAN_ACC_BASE = 0xC560,
+	.EXT_FAN_DEC_BASE = 0xC570,
+	.EXT_CPU_TEMP = 0xC580,
+	.EXT_CPU_TEMP_HYST = 0xC590,
+	.EXT_GPU_TEMP = 0xC5A0,
+	.EXT_GPU_TEMP_HYST = 0xC5B0,
+	.EXT_VRM_TEMP = 0xC5C0,
+	.EXT_VRM_TEMP_HYST = 0xC5D0,
+	.EXT_FAN1_RPM_LSB = 0xC5E0,
+	.EXT_FAN1_RPM_MSB = 0xC5E1,
+	.EXT_FAN2_RPM_LSB = 0xC5E2,
+	.EXT_FAN2_RPM_MSB = 0xC5E3,
+	.EXT_MINIFANCURVE_ON_COOL = 0xC536,
+	.EXT_LOCKFANCONTROLLER = 0xc4AB,
+	.EXT_CPU_TEMP_INPUT = 0xc538,
+	.EXT_GPU_TEMP_INPUT = 0xc539,
+	.EXT_IC_TEMP_INPUT = 0xC5E8,
+	.EXT_POWERMODE = 0xc420,
+	.EXT_FAN1_TARGET_RPM = 0xc600,
+	.EXT_FAN2_TARGET_RPM = 0xc601,
+	.EXT_MAXIMUMFANSPEED = 0xBD,
+	.EXT_WHITE_KEYBOARD_BACKLIGHT = (0x3B + 0xC400)
+};
+
+static const struct ec_register_offsets ec_register_offsets_v1 = {
+	.ECHIPID1 = 0x2000,
+	.ECHIPID2 = 0x2001,
+	.ECHIPVER = 0x2002,
+	.ECDEBUG = 0x2003,
+	.EXT_FAN_CUR_POINT = 0xC534,
+	.EXT_FAN_POINTS_SIZE = 0xC535,
+	.EXT_FAN1_BASE = 0xC540,
+	.EXT_FAN2_BASE = 0xC550,
+	.EXT_FAN_ACC_BASE = 0xC560,
+	.EXT_FAN_DEC_BASE = 0xC570,
+	.EXT_CPU_TEMP = 0xC580,
+	.EXT_CPU_TEMP_HYST = 0xC590,
+	.EXT_GPU_TEMP = 0xC5A0,
+	.EXT_GPU_TEMP_HYST = 0xC5B0,
+	.EXT_VRM_TEMP = 0xC5C0,
+	.EXT_VRM_TEMP_HYST = 0xC5D0,
+	.EXT_FAN1_RPM_LSB = 0xC5E0,
+	.EXT_FAN1_RPM_MSB = 0xC5E1,
+	.EXT_FAN2_RPM_LSB = 0xC5E2,
+	.EXT_FAN2_RPM_MSB = 0xC5E3,
+	.EXT_MINIFANCURVE_ON_COOL = 0xC536,
+	.EXT_LOCKFANCONTROLLER = 0xc4AB,
+	.EXT_CPU_TEMP_INPUT = 0xc538,
+	.EXT_GPU_TEMP_INPUT = 0xc539,
+	.EXT_IC_TEMP_INPUT = 0xC5E8,
+	.EXT_POWERMODE = 0xc41D,
+	.EXT_FAN1_TARGET_RPM = 0xc600,
+	.EXT_FAN2_TARGET_RPM = 0xc601,
+	.EXT_MAXIMUMFANSPEED = 0xBD,
+	.EXT_WHITE_KEYBOARD_BACKLIGHT = (0x3B + 0xC400)
+};
+
+static const struct ec_register_offsets ec_register_offsets_ideapad_v0 = {
+	.ECHIPID1 = 0x2000,
+	.ECHIPID2 = 0x2001,
+	.ECHIPVER = 0x2002,
+	.ECDEBUG = 0x2003,
+	.EXT_FAN_CUR_POINT = 0xC5a0, // not found yet
+	.EXT_FAN_POINTS_SIZE = 0xC5a0, // constant 0
+	.EXT_FAN1_BASE = 0xC5a0,
+	.EXT_FAN2_BASE = 0xC5a8,
+	.EXT_FAN_ACC_BASE = 0xC5a0, // not found yet
+	.EXT_FAN_DEC_BASE = 0xC5a0, // not found yet
+	.EXT_CPU_TEMP = 0xC550, // and repeated after 8 bytes
+	.EXT_CPU_TEMP_HYST = 0xC590, // and repeated after 8 bytes
+	.EXT_GPU_TEMP = 0xC5C0, // and repeated after 8 bytes
+	.EXT_GPU_TEMP_HYST = 0xC5D0, // and repeated after 8 bytes
+	.EXT_VRM_TEMP = 0xC5a0, // does not exists or not found
+	.EXT_VRM_TEMP_HYST = 0xC5a0, // does not exists ot not found yet
+	.EXT_FAN1_RPM_LSB = 0xC5a0, // not found yet
+	.EXT_FAN1_RPM_MSB = 0xC5a0, // not found yet
+	.EXT_FAN2_RPM_LSB = 0xC5a0, // not found yet
+	.EXT_FAN2_RPM_MSB = 0xC5a0, // not found yet
+	.EXT_MINIFANCURVE_ON_COOL = 0xC5a0, // does not exists or not found
+	.EXT_LOCKFANCONTROLLER = 0xC5a0, // does not exists or not found
+	.EXT_CPU_TEMP_INPUT = 0xC5a0, // not found yet
+	.EXT_GPU_TEMP_INPUT = 0xC5a0, // not found yet
+	.EXT_IC_TEMP_INPUT = 0xC5a0, // not found yet
+	.EXT_POWERMODE = 0xC5a0, // not found yet
+	.EXT_FAN1_TARGET_RPM = 0xC5a0, // not found yet
+	.EXT_FAN2_TARGET_RPM = 0xC5a0, // not found yet
+	.EXT_MAXIMUMFANSPEED = 0xC5a0, // not found yet
+	.EXT_WHITE_KEYBOARD_BACKLIGHT = 0xC5a0 // not found yet
+};
+
+static const struct ec_register_offsets ec_register_offsets_ideapad_v1 = {
+	.ECHIPID1 = 0x2000,
+	.ECHIPID2 = 0x2001,
+	.ECHIPVER = 0x2002,
+	.ECDEBUG = 0x2003,
+	.EXT_FAN_CUR_POINT = 0xC5a0, // not found yet
+	.EXT_FAN_POINTS_SIZE = 0xC5a0, // constant 0
+	.EXT_FAN1_BASE = 0xC5a0,
+	.EXT_FAN2_BASE = 0xC5a8,
+	.EXT_FAN_ACC_BASE = 0xC5a0, // not found yet
+	.EXT_FAN_DEC_BASE = 0xC5a0, // not found yet
+	.EXT_CPU_TEMP = 0xC550, // and repeated after 8 bytes
+	.EXT_CPU_TEMP_HYST = 0xC590, // and repeated after 8 bytes
+	.EXT_GPU_TEMP = 0xC5C0, // and repeated after 8 bytes
+	.EXT_GPU_TEMP_HYST = 0xC5D0, // and repeated after 8 bytes
+	.EXT_VRM_TEMP = 0xC5a0, // does not exists or not found
+	.EXT_VRM_TEMP_HYST = 0xC5a0, // does not exists ot not found yet
+	.EXT_FAN1_RPM_LSB = 0xC5a0, // not found yet
+	.EXT_FAN1_RPM_MSB = 0xC5a0, // not found yet
+	.EXT_FAN2_RPM_LSB = 0xC5a0, // not found yet
+	.EXT_FAN2_RPM_MSB = 0xC5a0, // not found yet
+	.EXT_MINIFANCURVE_ON_COOL = 0xC5a0, // does not exists or not found
+	.EXT_LOCKFANCONTROLLER = 0xC5a0, // does not exists or not found
+	.EXT_CPU_TEMP_INPUT = 0xC5a0, // not found yet
+	.EXT_GPU_TEMP_INPUT = 0xC5a0, // not found yet
+	.EXT_IC_TEMP_INPUT = 0xC5a0, // not found yet
+	.EXT_POWERMODE = 0xC5a0, // not found yet
+	.EXT_FAN1_TARGET_RPM = 0xC5a0, // not found yet
+	.EXT_FAN2_TARGET_RPM = 0xC5a0, // not found yet
+	.EXT_MAXIMUMFANSPEED = 0xC5a0, // not found yet
+	.EXT_WHITE_KEYBOARD_BACKLIGHT = 0xC5a0 // not found yet
+};
+
+static const struct model_config model_v0 = {
+	.registers = &ec_register_offsets_v0,
+	.check_embedded_controller_id = true,
+	.embedded_controller_id = 0x8227,
+	.memoryio_physical_ec_start = 0xC400,
+	.memoryio_size = 0x300,
+	.has_minifancurve = true,
+	.has_custom_powermode = true,
+	.access_method_powermode = ACCESS_METHOD_WMI,
+	.access_method_keyboard = ACCESS_METHOD_WMI,
+	.access_method_fanspeed = ACCESS_METHOD_EC,
+	.access_method_temperature = ACCESS_METHOD_EC,
+	.access_method_fancurve = ACCESS_METHOD_EC,
+	.access_method_fanfullspeed = ACCESS_METHOD_WMI,
+	.acpi_check_dev = true,
+	.ramio_physical_start = 0xFE00D400,
+	.ramio_size = 0x600
+};
+
+static const struct model_config model_j2cn = {
+	.registers = &ec_register_offsets_v0,
+	.check_embedded_controller_id = true,
+	.embedded_controller_id = 0x8227,
+	.memoryio_physical_ec_start = 0xC400,
+	.memoryio_size = 0x300,
+	.has_minifancurve = true,
+	.has_custom_powermode = true,
+	.access_method_powermode = ACCESS_METHOD_WMI,
+	.access_method_keyboard = ACCESS_METHOD_WMI,
+	.access_method_fanspeed = ACCESS_METHOD_EC,
+	.access_method_temperature = ACCESS_METHOD_EC,
+	.access_method_fancurve = ACCESS_METHOD_EC,
+	.access_method_fanfullspeed = ACCESS_METHOD_WMI,
+	.acpi_check_dev = true,
+	.ramio_physical_start = 0xFE00D400,
+	.ramio_size = 0x600
+};
+
+static const struct model_config model_9vcn = {
+	.registers = &ec_register_offsets_ideapad_v1,
+	.check_embedded_controller_id = true,
+	.embedded_controller_id = 0x8226,
+	.memoryio_physical_ec_start = 0xC400,
+	.memoryio_size = 0x300,
+	.has_minifancurve = true,
+	.has_custom_powermode = true,
+	.access_method_powermode = ACCESS_METHOD_WMI,
+	.access_method_keyboard = ACCESS_METHOD_WMI,
+	.access_method_fanspeed = ACCESS_METHOD_WMI,
+	.access_method_temperature = ACCESS_METHOD_WMI,
+	.access_method_fancurve = ACCESS_METHOD_EC2,
+	.access_method_fanfullspeed = ACCESS_METHOD_WMI,
+	.acpi_check_dev = false,
+	.ramio_physical_start = 0xFE00D400,
+	.ramio_size = 0x600
+};
+
+static const struct model_config model_v2022 = {
+	.registers = &ec_register_offsets_v0,
+	.check_embedded_controller_id = true,
+	.embedded_controller_id = 0x8227,
+	.memoryio_physical_ec_start = 0xC400,
+	.memoryio_size = 0x300,
+	.has_minifancurve = true,
+	.has_custom_powermode = true,
+	.access_method_powermode = ACCESS_METHOD_WMI,
+	.access_method_keyboard = ACCESS_METHOD_WMI,
+	.access_method_fanspeed = ACCESS_METHOD_EC,
+	.access_method_temperature = ACCESS_METHOD_EC,
+	.access_method_fancurve = ACCESS_METHOD_EC,
+	.access_method_fanfullspeed = ACCESS_METHOD_WMI,
+	.acpi_check_dev = true,
+	.ramio_physical_start = 0xFE00D400,
+	.ramio_size = 0x600
+};
+
+static const struct model_config model_4gcn = {
+	.registers = &ec_register_offsets_v0,
+	.check_embedded_controller_id = true,
+	.embedded_controller_id = 0x8226,
+	.memoryio_physical_ec_start = 0xC400,
+	.memoryio_size = 0x300,
+	.has_minifancurve = true,
+	.has_custom_powermode = true,
+	.access_method_powermode = ACCESS_METHOD_WMI,
+	.access_method_keyboard = ACCESS_METHOD_WMI,
+	.access_method_fanspeed = ACCESS_METHOD_EC,
+	.access_method_temperature = ACCESS_METHOD_EC,
+	.access_method_fancurve = ACCESS_METHOD_EC,
+	.access_method_fanfullspeed = ACCESS_METHOD_WMI,
+	.acpi_check_dev = true,
+	.ramio_physical_start = 0xFE00D400,
+	.ramio_size = 0x600
+};
+
+static const struct model_config model_bvcn = {
+	.registers = &ec_register_offsets_v0,
+	.check_embedded_controller_id = false,
+	.embedded_controller_id = 0x8226,
+	.memoryio_physical_ec_start = 0xC400,
+	.memoryio_size = 0x300,
+	.has_minifancurve = true,
+	.has_custom_powermode = true,
+	.access_method_powermode = ACCESS_METHOD_WMI,
+	.access_method_keyboard = ACCESS_METHOD_WMI,
+	.access_method_fanspeed = ACCESS_METHOD_WMI,
+	.access_method_temperature = ACCESS_METHOD_WMI,
+	.access_method_fancurve = ACCESS_METHOD_NO_ACCESS,
+	.access_method_fanfullspeed = ACCESS_METHOD_WMI,
+	.acpi_check_dev = false,
+	.ramio_physical_start = 0xFC7E0800,
+	.ramio_size = 0x600
+};
+
+static const struct model_config model_bhcn = {
+	.registers = &ec_register_offsets_v0,
+	.check_embedded_controller_id = true,
+	.embedded_controller_id = 0x8226,
+	.memoryio_physical_ec_start = 0xC400,
+	.memoryio_size = 0x300,
+	.has_minifancurve = true,
+	.has_custom_powermode = false,
+	.access_method_powermode = ACCESS_METHOD_WMI,
+	.access_method_keyboard = ACCESS_METHOD_ACPI,
+	.access_method_fanspeed = ACCESS_METHOD_WMI,
+	.access_method_temperature = ACCESS_METHOD_ACPI,
+	.access_method_fancurve = ACCESS_METHOD_EC,
+	.access_method_fanfullspeed = ACCESS_METHOD_WMI,
+	.acpi_check_dev = true,
+	.ramio_physical_start = 0xFF00D400,
+	.ramio_size = 0x600
+};
+
+static const struct model_config model_kwcn = {
+	.registers = &ec_register_offsets_v0,
+	.check_embedded_controller_id = true,
+	.embedded_controller_id = 0x5507,
+	.memoryio_physical_ec_start = 0xC400,
+	.memoryio_size = 0x300,
+	.has_minifancurve = false,
+	.has_custom_powermode = true,
+	.access_method_powermode = ACCESS_METHOD_WMI,
+	.access_method_keyboard = ACCESS_METHOD_WMI,
+	.access_method_fanspeed = ACCESS_METHOD_WMI3,
+	.access_method_temperature = ACCESS_METHOD_WMI3,
+	.access_method_fancurve = ACCESS_METHOD_WMI3,
+	.access_method_fanfullspeed = ACCESS_METHOD_WMI,
+	.acpi_check_dev = true,
+	.ramio_physical_start = 0xFE0B0400,
+	.ramio_size = 0x600
+};
+
+static const struct model_config model_m2cn = {
+	.registers = &ec_register_offsets_v0,
+	.check_embedded_controller_id = true,
+	.embedded_controller_id = 0x8227,
+	.memoryio_physical_ec_start = 0xC400,
+	.memoryio_size = 0x300,
+	.has_minifancurve = false,
+	.has_custom_powermode = true,
+	.access_method_powermode = ACCESS_METHOD_WMI,
+	.access_method_keyboard = ACCESS_METHOD_WMI,
+	.access_method_fanspeed = ACCESS_METHOD_WMI3,
+	.access_method_temperature = ACCESS_METHOD_WMI3,
+	.access_method_fancurve = ACCESS_METHOD_WMI3,
+	.access_method_fanfullspeed = ACCESS_METHOD_WMI,
+	.acpi_check_dev = false,
+	.ramio_physical_start = 0xFE0B0400,
+	.ramio_size = 0x600
+};
+
+static const struct model_config model_k1cn = {
+	.registers = &ec_register_offsets_v0,
+	.check_embedded_controller_id = true,
+	.embedded_controller_id = 0x5263,
+	.memoryio_physical_ec_start = 0xC400,
+	.memoryio_size = 0x300,
+	.has_minifancurve = false,
+	.has_custom_powermode = true,
+	.access_method_powermode = ACCESS_METHOD_WMI,
+	.access_method_keyboard = ACCESS_METHOD_WMI,
+	.access_method_fanspeed = ACCESS_METHOD_WMI3,
+	.access_method_temperature = ACCESS_METHOD_WMI3,
+	.access_method_fancurve = ACCESS_METHOD_WMI3,
+	.access_method_fanfullspeed = ACCESS_METHOD_WMI,
+	.acpi_check_dev = true,
+	.ramio_physical_start = 0xFE0B0400,
+	.ramio_size = 0x600
+};
+
+static const struct model_config model_lpcn = {
+	.registers = &ec_register_offsets_v0,
+	.check_embedded_controller_id = true,
+	.embedded_controller_id = 0x5507,
+	.memoryio_physical_ec_start = 0xC400,
+	.memoryio_size = 0x300,
+	.has_minifancurve = false,
+	.has_custom_powermode = true,
+	.access_method_powermode = ACCESS_METHOD_WMI,
+	.access_method_keyboard = ACCESS_METHOD_WMI,
+	.access_method_fanspeed = ACCESS_METHOD_WMI3,
+	.access_method_temperature = ACCESS_METHOD_WMI3,
+	.access_method_fancurve = ACCESS_METHOD_WMI3,
+	.access_method_fanfullspeed = ACCESS_METHOD_WMI,
+	.acpi_check_dev = true,
+	.ramio_physical_start = 0xFE0B0400,
+	.ramio_size = 0x600
+};
+
+static const struct model_config model_kfcn = {
+	.registers = &ec_register_offsets_v0,
+	.check_embedded_controller_id = true,
+	.embedded_controller_id = 0x8227,
+	.memoryio_physical_ec_start = 0xC400,
+	.memoryio_size = 0x300,
+	.has_minifancurve = false,
+	.has_custom_powermode = true,
+	.access_method_powermode = ACCESS_METHOD_WMI,
+	.access_method_keyboard = ACCESS_METHOD_WMI,
+	.access_method_fanspeed = ACCESS_METHOD_EC,
+	.access_method_temperature = ACCESS_METHOD_EC,
+	.access_method_fancurve = ACCESS_METHOD_EC,
+	.access_method_fanfullspeed = ACCESS_METHOD_WMI,
+	.acpi_check_dev = true,
+	.ramio_physical_start = 0xFE00D400,
+	.ramio_size = 0x600
+};
+
+static const struct model_config model_hacn = {
+	.registers = &ec_register_offsets_v0,
+	.check_embedded_controller_id = false,
+	.embedded_controller_id = 0x8227,
+	.memoryio_physical_ec_start = 0xC400,
+	.memoryio_size = 0x300,
+	.has_minifancurve = true,
+	.has_custom_powermode = true,
+	.access_method_powermode = ACCESS_METHOD_WMI,
+	.access_method_keyboard = ACCESS_METHOD_WMI,
+	.access_method_fanspeed = ACCESS_METHOD_EC,
+	.access_method_temperature = ACCESS_METHOD_EC,
+	.access_method_fancurve = ACCESS_METHOD_EC,
+	.access_method_fanfullspeed = ACCESS_METHOD_WMI,
+	.acpi_check_dev = true,
+	.ramio_physical_start = 0xFE00D400,
+	.ramio_size = 0x600
+};
+
+static const struct model_config model_k9cn = {
+	.registers = &ec_register_offsets_v0,
+	.check_embedded_controller_id = false,
+	.embedded_controller_id = 0x8227,
+	.memoryio_physical_ec_start = 0xC400, // or replace 0xC400 by 0x0400  ?
+	.memoryio_size = 0x300,
+	.has_minifancurve = true,
+	.has_custom_powermode = true,
+	.access_method_powermode = ACCESS_METHOD_WMI,
+	.access_method_keyboard = ACCESS_METHOD_WMI,
+	.access_method_fanspeed = ACCESS_METHOD_EC,
+	.access_method_temperature = ACCESS_METHOD_EC,
+	.access_method_fancurve = ACCESS_METHOD_EC,
+	.access_method_fanfullspeed = ACCESS_METHOD_WMI,
+	.acpi_check_dev = true,
+	.ramio_physical_start = 0xFE00D400,
+	.ramio_size = 0x600
+};
+
+static const struct model_config model_eucn = {
+	.registers = &ec_register_offsets_v1,
+	.check_embedded_controller_id = true,
+	.embedded_controller_id = 0x8227,
+	.memoryio_physical_ec_start = 0xC400,
+	.memoryio_size = 0x300,
+	.has_minifancurve = true,
+	.has_custom_powermode = true,
+	.access_method_powermode = ACCESS_METHOD_WMI,
+	.access_method_keyboard = ACCESS_METHOD_WMI,
+	.access_method_fanspeed = ACCESS_METHOD_EC,
+	.access_method_temperature = ACCESS_METHOD_EC,
+	.access_method_fancurve = ACCESS_METHOD_EC,
+	.access_method_fanfullspeed = ACCESS_METHOD_WMI,
+	.acpi_check_dev = true,
+	.ramio_physical_start = 0xFE00D400,
+	.ramio_size = 0x600
+};
+
+static const struct model_config model_fccn = {
+	.registers = &ec_register_offsets_ideapad_v0,
+	.check_embedded_controller_id = true,
+	.embedded_controller_id = 0x8227,
+	.memoryio_physical_ec_start = 0xC400,
+	.memoryio_size = 0x300,
+	.has_minifancurve = false,
+	.has_custom_powermode = true,
+	.access_method_powermode = ACCESS_METHOD_WMI,
+	.access_method_keyboard = ACCESS_METHOD_WMI,
+	.access_method_fanspeed = ACCESS_METHOD_WMI,
+	.access_method_temperature = ACCESS_METHOD_ACPI,
+	.access_method_fancurve = ACCESS_METHOD_EC2,
+	.access_method_fanfullspeed = ACCESS_METHOD_WMI,
+	.acpi_check_dev = true,
+	.ramio_physical_start = 0xFE00D400,
+	.ramio_size = 0x600
+};
+
+static const struct model_config model_h3cn = {
+	//0xFE0B0800
+	.registers = &ec_register_offsets_v1,
+	.check_embedded_controller_id = false,
+	.embedded_controller_id = 0x8227,
+	.memoryio_physical_ec_start = 0xC400,
+	.memoryio_size = 0x300,
+	.has_minifancurve = false,
+	.has_custom_powermode = false,
+	.access_method_powermode = ACCESS_METHOD_WMI,
+	// not implemented (properly) in WMI, RGB conrolled by USB
+	.access_method_keyboard = ACCESS_METHOD_NO_ACCESS,
+	// accessing fan speed is not implemented in ACPI
+	// a variable in the operation region (or not found)
+	// and not per WMI (methods returns constant 0)
+	.access_method_fanspeed = ACCESS_METHOD_NO_ACCESS,
+	.access_method_temperature = ACCESS_METHOD_WMI,
+	.access_method_fancurve = ACCESS_METHOD_NO_ACCESS,
+	.access_method_fanfullspeed = ACCESS_METHOD_WMI,
+	.acpi_check_dev = false,
+	.ramio_physical_start = 0xFE0B0800,
+	.ramio_size = 0x600
+};
+
+static const struct model_config model_e9cn = {
+	//0xFE0B0800
+	.registers = &ec_register_offsets_v1,
+	.check_embedded_controller_id = false,
+	.embedded_controller_id = 0x8227,
+	.memoryio_physical_ec_start = 0xC400, //0xFC7E0800
+	.memoryio_size = 0x300,
+	.has_minifancurve = false,
+	.has_custom_powermode = false,
+	.access_method_powermode = ACCESS_METHOD_WMI,
+	// not implemented (properly) in WMI, RGB conrolled by USB
+	.access_method_keyboard = ACCESS_METHOD_NO_ACCESS,
+	// accessing fan speed is not implemented in ACPI
+	// a variable in the operation region (or not found)
+	// and not per WMI (methods returns constant 0)
+	.access_method_fanspeed = ACCESS_METHOD_WMI,
+	.access_method_temperature = ACCESS_METHOD_WMI,
+	.access_method_fancurve = ACCESS_METHOD_NO_ACCESS,
+	.access_method_fanfullspeed = ACCESS_METHOD_WMI,
+	.acpi_check_dev = false,
+	.ramio_physical_start = 0xFC7E0800,
+	.ramio_size = 0x600
+};
+
+static const struct model_config model_8jcn = {
+	.registers = &ec_register_offsets_v0,
+	.check_embedded_controller_id = true,
+	.embedded_controller_id = 0x8226,
+	.memoryio_physical_ec_start = 0xC400,
+	.memoryio_size = 0x300,
+	.has_minifancurve = true,
+	.has_custom_powermode = true,
+	.access_method_powermode = ACCESS_METHOD_WMI,
+	.access_method_keyboard = ACCESS_METHOD_WMI,
+	.access_method_fanspeed = ACCESS_METHOD_WMI,
+	.access_method_temperature = ACCESS_METHOD_WMI,
+	.access_method_fancurve = ACCESS_METHOD_EC,
+	.access_method_fanfullspeed = ACCESS_METHOD_WMI,
+	.acpi_check_dev = false,
+	.ramio_physical_start = 0xFE00D400,
+	.ramio_size = 0x600
+};
+
+static const struct model_config model_jncn = {
+	.registers = &ec_register_offsets_v1,
+	.check_embedded_controller_id = false,
+	.embedded_controller_id = 0x8227,
+	.memoryio_physical_ec_start = 0xC400,
+	.memoryio_size = 0x300,
+	.has_minifancurve = false,
+	.has_custom_powermode = false,
+	.access_method_powermode = ACCESS_METHOD_WMI,
+	.access_method_keyboard = ACCESS_METHOD_NO_ACCESS,
+	.access_method_fanspeed = ACCESS_METHOD_WMI,
+	.access_method_temperature = ACCESS_METHOD_WMI,
+	.access_method_fancurve = ACCESS_METHOD_NO_ACCESS,
+	.access_method_fanfullspeed = ACCESS_METHOD_WMI,
+	.acpi_check_dev = false,
+	.ramio_physical_start = 0xFC7E0800,
+	.ramio_size = 0x600
+};
+
+// Yoga Model!
+static const struct model_config model_j1cn = {
+	.registers = &ec_register_offsets_v0,
+	.check_embedded_controller_id = true,
+	.embedded_controller_id = 0x8227,
+	.memoryio_physical_ec_start = 0xC400,
+	.memoryio_size = 0x300,
+	.has_minifancurve = true,
+	.has_custom_powermode = true,
+	.access_method_powermode = ACCESS_METHOD_WMI,
+	.access_method_keyboard = ACCESS_METHOD_WMI,
+	.access_method_fanspeed = ACCESS_METHOD_EC,
+	.access_method_temperature = ACCESS_METHOD_EC,
+	.access_method_fancurve = ACCESS_METHOD_EC,
+	.access_method_fanfullspeed = ACCESS_METHOD_WMI,
+	.acpi_check_dev = true,
+	.ramio_physical_start = 0xFE0B0400,
+	.ramio_size = 0x600
+};
+
+// Yoga Model!
+static const struct model_config model_dmcn = {
+	.registers = &ec_register_offsets_v0,
+	.check_embedded_controller_id = true,
+	.embedded_controller_id = 0x8227,
+	.memoryio_physical_ec_start = 0xC400,
+	.memoryio_size = 0x300,
+	.has_minifancurve = true,
+	.has_custom_powermode = true,
+	.access_method_powermode = ACCESS_METHOD_WMI,
+	.access_method_keyboard = ACCESS_METHOD_WMI,
+	.access_method_fanspeed = ACCESS_METHOD_EC,
+	.access_method_temperature = ACCESS_METHOD_EC,
+	.access_method_fancurve = ACCESS_METHOD_EC,
+	.access_method_fanfullspeed = ACCESS_METHOD_WMI,
+	.acpi_check_dev = true,
+	.ramio_physical_start = 0xFE700D00,
+	.ramio_size = 0x600
+};
+
+// Yoga Model!
+static const struct model_config model_khcn = {
+	.registers = &ec_register_offsets_v0,
+	.check_embedded_controller_id = false,
+	.embedded_controller_id = 0x8227,
+	.memoryio_physical_ec_start = 0xC400,
+	.memoryio_size = 0x300,
+	.has_minifancurve = true,
+	.has_custom_powermode = true,
+	.access_method_powermode = ACCESS_METHOD_EC,
+	.access_method_keyboard = ACCESS_METHOD_WMI,
+	.access_method_fanspeed = ACCESS_METHOD_EC,
+	.access_method_temperature = ACCESS_METHOD_EC,
+	.access_method_fancurve = ACCESS_METHOD_EC,
+	.access_method_fanfullspeed = ACCESS_METHOD_WMI,
+	.acpi_check_dev = false,
+	.ramio_physical_start = 0xFE0B0400,
+	.ramio_size = 0x600
+};
+
+
+static const struct dmi_system_id denylist[] = { {} };
+
+static const struct dmi_system_id optimistic_allowlist[] = {
+	{
+		// modelyear: 2021
+		// generation: 6
+		// name: Legion 5, Legion 5 pro, Legion 7
+		// Family: Legion 5 15ACH6H, ...
+		.ident = "GKCN",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+			DMI_MATCH(DMI_BIOS_VERSION, "GKCN"),
+		},
+		.driver_data = (void *)&model_v0
+	},
+	{
+		// modelyear: 2020
+		.ident = "EUCN",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+			DMI_MATCH(DMI_BIOS_VERSION, "EUCN"),
+		},
+		.driver_data = (void *)&model_eucn
+	},
+	{
+		// modelyear: 2020
+		.ident = "EFCN",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+			DMI_MATCH(DMI_BIOS_VERSION, "EFCN"),
+		},
+		.driver_data = (void *)&model_v0
+	},
+	{
+		// modelyear: 2020
+		.ident = "FSCN",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+			DMI_MATCH(DMI_BIOS_VERSION, "FSCN"),
+		},
+		.driver_data = (void *)&model_v0
+	},
+	{
+		// modelyear: 2021
+		.ident = "HHCN",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+			DMI_MATCH(DMI_BIOS_VERSION, "HHCN"),
+		},
+		.driver_data = (void *)&model_v0
+	},
+	{
+		// modelyear: 2022
+		.ident = "H1CN",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+			DMI_MATCH(DMI_BIOS_VERSION, "H1CN"),
+		},
+		.driver_data = (void *)&model_v0
+	},
+	{
+		// modelyear: 2022
+		.ident = "J2CN",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+			DMI_MATCH(DMI_BIOS_VERSION, "J2CN"),
+		},
+		.driver_data = (void *)&model_v0
+	},
+	{
+		// modelyear: 2022
+		.ident = "JUCN",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+			DMI_MATCH(DMI_BIOS_VERSION, "JUCN"),
+		},
+		.driver_data = (void *)&model_v0
+	},
+	{
+		// modelyear: 2022
+		.ident = "KFCN",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+			DMI_MATCH(DMI_BIOS_VERSION, "KFCN"),
+		},
+		.driver_data = (void *)&model_kfcn
+	},
+	{
+		// modelyear: 2021
+		.ident = "HACN",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+			DMI_MATCH(DMI_BIOS_VERSION, "HACN"),
+		},
+		.driver_data = (void *)&model_hacn
+	},
+	{
+		// modelyear: 2021
+		.ident = "G9CN",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+			DMI_MATCH(DMI_BIOS_VERSION, "G9CN"),
+		},
+		.driver_data = (void *)&model_v0
+	},
+	{
+		// modelyear: 2022
+		.ident = "K9CN",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+			DMI_MATCH(DMI_BIOS_VERSION, "K9CN"),
+		},
+		.driver_data = (void *)&model_k9cn
+	},
+	{
+		// e.g. IdeaPad Gaming 3 15ARH05
+		.ident = "FCCN",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+			DMI_MATCH(DMI_BIOS_VERSION, "FCCN"),
+		},
+		.driver_data = (void *)&model_fccn
+	},
+	{
+		// e.g. Ideapad Gaming 3 15ACH6
+		.ident = "H3CN",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+			DMI_MATCH(DMI_BIOS_VERSION, "H3CN"),
+		},
+		.driver_data = (void *)&model_h3cn
+	},
+	{
+		// e.g. IdeaPad Gaming 3 15ARH7 (2022)
+		.ident = "JNCN",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+			DMI_MATCH(DMI_BIOS_VERSION, "JNCN"),
+		},
+		.driver_data = (void *)&model_jncn
+	},
+	{
+		// 2020, seems very different in ACPI dissassembly
+		.ident = "E9CN",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+			DMI_MATCH(DMI_BIOS_VERSION, "E9CN"),
+		},
+		.driver_data = (void *)&model_e9cn
+	},
+	{
+		// e.g. Legion Y7000 (older version)
+		.ident = "8JCN",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+			DMI_MATCH(DMI_BIOS_VERSION, "8JCN"),
+		},
+		.driver_data = (void *)&model_8jcn
+	},
+	{
+		// e.g. Legion 7i Pro 2023
+		.ident = "KWCN",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+			DMI_MATCH(DMI_BIOS_VERSION, "KWCN"),
+		},
+		.driver_data = (void *)&model_kwcn
+	},
+	{
+		// e.g. Legion Pro 5 2023 or R9000P
+		.ident = "LPCN",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+			DMI_MATCH(DMI_BIOS_VERSION, "LPCN"),
+		},
+		.driver_data = (void *)&model_lpcn
+	},
+	{
+		// e.g. Lenovo Legion 5i/Y7000 2019 PG0
+		.ident = "BHCN",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+			DMI_MATCH(DMI_BIOS_VERSION, "BHCN"),
+		},
+		.driver_data = (void *)&model_bhcn
+	},
+	{
+		// e.g. Lenovo 7 16IAX7
+		.ident = "K1CN",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+			DMI_MATCH(DMI_BIOS_VERSION, "K1CN"),
+		},
+		.driver_data = (void *)&model_k1cn
+	},
+	{
+		// e.g. Legion Y720
+		.ident = "4GCN",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+			DMI_MATCH(DMI_BIOS_VERSION, "4GCN"),
+		},
+		.driver_data = (void *)&model_4gcn
+	},
+	{
+		// e.g. Legion Slim 5 16APH8 2023
+		.ident = "M3CN",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+			DMI_MATCH(DMI_BIOS_VERSION, "M3CN"),
+		},
+		.driver_data = (void *)&model_lpcn
+	},
+	{
+		// e.g. Legion Y7000p-1060
+		.ident = "9VCN",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+			DMI_MATCH(DMI_BIOS_VERSION, "9VCN"),
+		},
+		.driver_data = (void *)&model_9vcn
+	},
+	{
+		// e.g. Legion Y9000X
+		.ident = "JYCN",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+			DMI_MATCH(DMI_BIOS_VERSION, "JYCN"),
+		},
+		.driver_data = (void *)&model_v2022
+	},
+	{
+		// e.g. Legion Y740-15IRH, older model e.g. with GTX 1660
+		.ident = "BVCN",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+			DMI_MATCH(DMI_BIOS_VERSION, "BVCN"),
+		},
+		.driver_data = (void *)&model_bvcn
+	},
+	{
+		// e.g. Legion 5 Pro 16IAH7H with a RTX 3070 Ti
+		.ident = "J2CN",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+			DMI_MATCH(DMI_BIOS_VERSION, "J2CN"),
+		},
+		.driver_data = (void *)&model_j2cn
+	},
+	{
+		// e.g. Lenovo Yoga 7 16IAH7 with GPU Intel DG2 Arc A370M
+		.ident = "J1CN",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+			DMI_MATCH(DMI_BIOS_VERSION, "J1CN"),
+		},
+		.driver_data = (void *)&model_j1cn
+	},
+	{
+		// e.g. Legion Slim 5 16IRH8 (2023) with RTX 4070
+		.ident = "M2CN",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+			DMI_MATCH(DMI_BIOS_VERSION, "M2CN"),
+		},
+		.driver_data = (void *)&model_m2cn
+	},
+	{
+		// e.g. Yoga Slim 7-14ARE05
+		.ident = "DMCN",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+			DMI_MATCH(DMI_BIOS_VERSION, "DMCN"),
+		},
+		.driver_data = (void *)&model_dmcn
+	},
+	{
+		// e.g. Yoga Slim 7 Pro 14ARH7
+		.ident = "KHCN",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+			DMI_MATCH(DMI_BIOS_VERSION, "KHCN"),
+		},
+		.driver_data = (void *)&model_khcn
+	},
+	{}
+};
+
+/* ================================= */
+/* ACPI and WMI access               */
+/* ================================= */
+
+// function from ideapad-laptop.c
+static int eval_int(acpi_handle handle, const char *name, unsigned long *res)
+{
+	unsigned long long result;
+	acpi_status status;
+
+	status = acpi_evaluate_integer(handle, (char *)name, NULL, &result);
+	if (ACPI_FAILURE(status))
+		return -EIO;
+
+	*res = result;
+
+	return 0;
+}
+
+// function from ideapad-laptop.c
+static int exec_simple_method(acpi_handle handle, const char *name,
+			      unsigned long arg)
+{
+	acpi_status status =
+		acpi_execute_simple_method(handle, (char *)name, arg);
+
+	return ACPI_FAILURE(status) ? -EIO : 0;
+}
+
+// function from ideapad-laptop.c
+static int exec_sbmc(acpi_handle handle, unsigned long arg)
+{
+	// \_SB.PCI0.LPC0.EC0.VPC0.SBMC
+	return exec_simple_method(handle, "VPC0.SBMC", arg);
+}
+
+//static int eval_qcho(acpi_handle handle, unsigned long *res)
+//{
+//	// \_SB.PCI0.LPC0.EC0.QCHO
+//	return eval_int(handle, "QCHO", res);
+//}
+
+static int eval_gbmd(acpi_handle handle, unsigned long *res)
+{
+	return eval_int(handle, "VPC0.GBMD", res);
+}
+
+static int eval_spmo(acpi_handle handle, unsigned long *res)
+{
+	// \_SB.PCI0.LPC0.EC0.QCHO
+	return eval_int(handle, "VPC0.BTSM", res);
+}
+
+static int acpi_process_buffer_to_ints(const char *id_name, int id_nr,
+				       acpi_status status,
+				       struct acpi_buffer *out_buffer, u8 *res,
+				       size_t ressize)
+{
+	// seto to NULL call kfree on NULL if next function call fails
+	union acpi_object *out = NULL;
+	size_t i;
+	int error = 0;
+
+	if (ACPI_FAILURE(status)) {
+		pr_info("ACPI evaluation error for: %s:%d\n", id_name, id_nr);
+		error = -EFAULT;
+		goto err;
+	}
+
+	out = out_buffer->pointer;
+	if (!out) {
+		pr_info("Unexpected ACPI result for %s:%d\n", id_name, id_nr);
+		error = -AE_ERROR;
+		goto err;
+	}
+
+	if (out->type != ACPI_TYPE_BUFFER || out->buffer.length != ressize) {
+		pr_info("Unexpected ACPI result for %s:%d: expected type %d but got %d; expected length %lu but got %u;\n",
+			id_name, id_nr, ACPI_TYPE_BUFFER, out->type, ressize,
+			out->buffer.length);
+		error = -AE_ERROR;
+		goto err;
+	}
+	pr_info("ACPI result for %s:%d: ACPI buffer length: %u\n", id_name,
+		id_nr, out->buffer.length);
+
+	for (i = 0; i < ressize; ++i)
+		res[i] = out->buffer.pointer[i];
+	error = 0;
+
+err:
+	kfree(out);
+	return error;
+}
+
+//static int exec_ints(acpi_handle handle, const char *method_name,
+//		     struct acpi_object_list *params, u8 *res, size_t ressize)
+//{
+//	acpi_status status;
+//	struct acpi_buffer out_buffer = { ACPI_ALLOCATE_BUFFER, NULL };
+
+//	status = acpi_evaluate_object(handle, (acpi_string)method_name, params,
+//				      &out_buffer);
+
+//	return acpi_process_buffer_to_ints(method_name, 0, status, &out_buffer,
+//					   res, ressize);
+//}
+
+static int wmi_exec_ints(const char *guid, u8 instance, u32 method_id,
+			 const struct acpi_buffer *params, u8 *res,
+			 size_t ressize)
+{
+	acpi_status status;
+	struct acpi_buffer out_buffer = { ACPI_ALLOCATE_BUFFER, NULL };
+
+	status = wmi_evaluate_method(guid, instance, method_id, params,
+				     &out_buffer);
+	return acpi_process_buffer_to_ints(guid, method_id, status, &out_buffer,
+					   res, ressize);
+}
+
+static int wmi_exec_int(const char *guid, u8 instance, u32 method_id,
+			const struct acpi_buffer *params, unsigned long *res)
+{
+	acpi_status status;
+	struct acpi_buffer out_buffer = { ACPI_ALLOCATE_BUFFER, NULL };
+	// seto to NULL call kfree on NULL if next function call fails
+	union acpi_object *out = NULL;
+	int error = 0;
+
+	status = wmi_evaluate_method(guid, instance, method_id, params,
+				     &out_buffer);
+
+	if (ACPI_FAILURE(status)) {
+		pr_info("WMI evaluation error for: %s:%d\n", guid, method_id);
+		error = -EFAULT;
+		goto err;
+	}
+
+	out = out_buffer.pointer;
+	if (!out) {
+		pr_info("Unexpected ACPI result for %s:%d", guid, method_id);
+		error = -AE_ERROR;
+		goto err;
+	}
+
+	if (out->type != ACPI_TYPE_INTEGER) {
+		pr_info("Unexpected ACPI result for %s:%d: expected type %d but got %d\n",
+			guid, method_id, ACPI_TYPE_INTEGER, out->type);
+		error = -AE_ERROR;
+		goto err;
+	}
+
+	*res = out->integer.value;
+	error = 0;
+
+err:
+	kfree(out);
+	return error;
+}
+
+static int wmi_exec_noarg_int(const char *guid, u8 instance, u32 method_id,
+			      unsigned long *res)
+{
+	struct acpi_buffer params;
+
+	params.length = 0;
+	params.pointer = NULL;
+	return wmi_exec_int(guid, instance, method_id, &params, res);
+}
+
+static int wmi_exec_noarg_ints(const char *guid, u8 instance, u32 method_id,
+			       u8 *res, size_t ressize)
+{
+	struct acpi_buffer params;
+
+	params.length = 0;
+	params.pointer = NULL;
+	return wmi_exec_ints(guid, instance, method_id, &params, res, ressize);
+}
+
+static int wmi_exec_arg(const char *guid, u8 instance, u32 method_id, void *arg,
+			size_t arg_size)
+{
+	struct acpi_buffer params;
+	acpi_status status;
+
+	params.length = arg_size;
+	params.pointer = arg;
+	status = wmi_evaluate_method(guid, instance, method_id, &params, NULL);
+
+	if (ACPI_FAILURE(status))
+		return -EIO;
+	return 0;
+}
+
+/* ================================= */
+/* Lenovo WMI config                 */
+/* ================================= */
+#define LEGION_WMI_GAMEZONE_GUID "887B54E3-DDDC-4B2C-8B88-68A26A8835D0"
+// GPU over clock
+#define WMI_METHOD_ID_ISSUPPORTGPUOC 4
+
+//Fan speed
+// only completely implemented only for some models here
+// often implemted also in other class and other method
+// below
+#define WMI_METHOD_ID_GETFAN1SPEED 8
+#define WMI_METHOD_ID_GETFAN2SPEED 9
+
+// Version of ACPI
+#define WMI_METHOD_ID_GETVERSION 11
+// Does it support CPU overclock?
+#define WMI_METHOD_ID_ISSUPPORTCPUOC 14
+// Temperatures
+// only completely implemented  only for some models here
+// often implemted also in other class and other method
+// below
+#define WMI_METHOD_ID_GETCPUTEMP 18
+#define WMI_METHOD_ID_GETGPUTEMP 19
+
+// two state keyboard light
+#define WMI_METHOD_ID_GETKEYBOARDLIGHT 37
+#define WMI_METHOD_ID_SETKEYBOARDLIGHT 36
+// disable win key
+// 0 = win key enabled; 1 = win key disabled
+#define WMI_METHOD_ID_ISSUPPORTDISABLEWINKEY 21
+#define WMI_METHOD_ID_GETWINKEYSTATUS 23
+#define WMI_METHOD_ID_SETWINKEYSTATUS 22
+// disable touchpad
+//0 = touchpad enabled; 1 = touchpad disabled
+#define WMI_METHOD_ID_ISSUPPORTDISABLETP 24
+#define WMI_METHOD_ID_GETTPSTATUS 26
+#define WMI_METHOD_ID_SETTPSTATUS 25
+// gSync
+#define WMI_METHOD_ID_ISSUPPORTGSYNC 40
+#define WMI_METHOD_ID_GETGSYNCSTATUS 41
+#define WMI_METHOD_ID_SETGSYNCSTATUS 42
+//smartFanMode = powermode
+#define WMI_METHOD_ID_ISSUPPORTSMARTFAN 49
+#define WMI_METHOD_ID_GETSMARTFANMODE 45
+#define WMI_METHOD_ID_SETSMARTFANMODE 44
+// power charge mode
+#define WMI_METHOD_ID_GETPOWERCHARGEMODE 47
+// overdrive of display to reduce latency
+// 0=off, 1=on
+#define WMI_METHOD_ID_ISSUPPORTOD 49
+#define WMI_METHOD_ID_GETODSTATUS 50
+#define WMI_METHOD_ID_SETODSTATUS 51
+// thermal mode = power mode used for cooling
+#define WMI_METHOD_ID_GETTHERMALMODE 55
+// get max frequency of core 0
+#define WMI_METHOD_ID_GETCPUMAXFREQUENCY 60
+// check if AC adapter has enough power to overclock
+#define WMI_METHOD_ID_ISACFITFOROC 62
+// set iGPU (GPU packaged with CPU) state
+#define WMI_METHOD_ID_ISSUPPORTIGPUMODE 63
+#define WMI_METHOD_ID_GETIGPUMODESTATUS 64
+#define WMI_METHOD_ID_SETIGPUMODESTATUS 65
+#define WMI_METHOD_ID_NOTIFYDGPUSTATUS 66
+enum IGPUState {
+	IGPUState_default = 0,
+	IGPUState_iGPUOnly = 1,
+	IGPUState_auto = 2
+};
+
+#define WMI_GUID_LENOVO_CPU_METHOD "14afd777-106f-4c9b-b334-d388dc7809be"
+#define WMI_METHOD_ID_CPU_GET_SUPPORT_OC_STATUS 15
+#define WMI_METHOD_ID_CPU_GET_OC_STATUS 1
+#define WMI_METHOD_ID_CPU_SET_OC_STATUS 2
+
+// ppt limit slow
+#define WMI_METHOD_ID_CPU_GET_SHORTTERM_POWERLIMIT 3
+#define WMI_METHOD_ID_CPU_SET_SHORTTERM_POWERLIMIT 4
+// ppt stapm
+#define WMI_METHOD_ID_CPU_GET_LONGTERM_POWERLIMIT 5
+#define WMI_METHOD_ID_CPU_SET_LONGTERM_POWERLIMIT 6
+// default power limit
+#define WMI_METHOD_ID_CPU_GET_DEFAULT_POWERLIMIT 7
+// peak power limit
+#define WMI_METHOD_ID_CPU_GET_PEAK_POWERLIMIT 8
+#define WMI_METHOD_ID_CPU_SET_PEAK_POWERLIMIT 9
+// apu sppt powerlimit
+#define WMI_METHOD_ID_CPU_GET_APU_SPPT_POWERLIMIT 12
+#define WMI_METHOD_ID_CPU_SET_APU_SPPT_POWERLIMIT 13
+// cross loading powerlimit
+#define WMI_METHOD_ID_CPU_GET_CROSS_LOADING_POWERLIMIT 16
+#define WMI_METHOD_ID_CPU_SET_CROSS_LOADING_POWERLIMIT 17
+
+#define WMI_GUID_LENOVO_GPU_METHOD "da7547f1-824d-405f-be79-d9903e29ced7"
+// overclock GPU possible
+#define WMI_METHOD_ID_GPU_GET_OC_STATUS 1
+#define WMI_METHOD_ID_GPU_SET_OC_STATUS 2
+// dynamic boost power
+#define WMI_METHOD_ID_GPU_GET_PPAB_POWERLIMIT 3
+#define WMI_METHOD_ID_GPU_SET_PPAB_POWERLIMIT 4
+// configurable TGP (power)
+#define WMI_METHOD_ID_GPU_GET_CTGP_POWERLIMIT 5
+#define WMI_METHOD_ID_GPU_SET_CTGP_POWERLIMIT 6
+// ppab/ctgp powerlimit
+#define WMI_METHOD_ID_GPU_GET_DEFAULT_PPAB_CTGP_POWERLIMIT 7
+// temperature limit
+#define WMI_METHOD_ID_GPU_GET_TEMPERATURE_LIMIT 8
+#define WMI_METHOD_ID_GPU_SET_TEMPERATURE_LIMIT 9
+// boost clock
+#define WMI_METHOD_ID_GPU_GET_BOOST_CLOCK 10
+
+#define WMI_GUID_LENOVO_FAN_METHOD "92549549-4bde-4f06-ac04-ce8bf898dbaa"
+// set fan to maximal speed; dust cleaning mode
+// only works in custom power mode
+#define WMI_METHOD_ID_FAN_GET_FULLSPEED 1
+#define WMI_METHOD_ID_FAN_SET_FULLSPEED 2
+// max speed of fan
+#define WMI_METHOD_ID_FAN_GET_MAXSPEED 3
+#define WMI_METHOD_ID_FAN_SET_MAXSPEED 4
+// fan table in custom mode
+#define WMI_METHOD_ID_FAN_GET_TABLE 5
+#define WMI_METHOD_ID_FAN_SET_TABLE 6
+// get speed of fans
+#define WMI_METHOD_ID_FAN_GETCURRENTFANSPEED 7
+// get temperatures of CPU and GPU used for controlling cooling
+#define WMI_METHOD_ID_FAN_GETCURRENTSENSORTEMPERATURE 8
+
+// do not implement following
+// #define WMI_METHOD_ID_Fan_SetCurrentFanSpeed 9
+
+#define LEGION_WMI_KBBACKLIGHT_GUID "8C5B9127-ECD4-4657-980F-851019F99CA5"
+// access the keyboard backlight with 3 states
+#define WMI_METHOD_ID_KBBACKLIGHTGET 0x1
+#define WMI_METHOD_ID_KBBACKLIGHTSET 0x2
+
+// new method in newer methods to get or set most of the values
+// with the two methods GetFeatureValue or SetFeatureValue.
+// They are called like GetFeatureValue(feature_id) where
+// feature_id is a id for the feature
+#define LEGION_WMI_LENOVO_OTHER_METHOD_GUID \
+	"dc2a8805-3a8c-41ba-a6f7-092e0089cd3b"
+#define WMI_METHOD_ID_GET_FEATURE_VALUE 17
+#define WMI_METHOD_ID_SET_FEATURE_VALUE 18
+
+enum OtherMethodFeature {
+	OtherMethodFeature_U1 = 0x010000, //->PC00.LPCB.EC0.REJF
+	OtherMethodFeature_U2 = 0x0F0000, //->C00.PEG1.PXP._STA?
+	OtherMethodFeature_U3 = 0x030000, //->PC00.LPCB.EC0.FLBT?
+	OtherMethodFeature_CPU_SHORT_TERM_POWER_LIMIT = 0x01010000,
+	OtherMethodFeature_CPU_LONG_TERM_POWER_LIMIT = 0x01020000,
+	OtherMethodFeature_CPU_PEAK_POWER_LIMIT = 0x01030000,
+	OtherMethodFeature_CPU_TEMPERATURE_LIMIT = 0x01040000,
+
+	OtherMethodFeature_APU_PPT_POWER_LIMIT = 0x01050000,
+
+	OtherMethodFeature_CPU_CROSS_LOAD_POWER_LIMIT = 0x01060000,
+	OtherMethodFeature_CPU_L1_TAU = 0x01070000,
+
+	OtherMethodFeature_GPU_POWER_BOOST = 0x02010000,
+	OtherMethodFeature_GPU_cTGP = 0x02020000,
+	OtherMethodFeature_GPU_TEMPERATURE_LIMIT = 0x02030000,
+	OtherMethodFeature_GPU_POWER_TARGET_ON_AC_OFFSET_FROM_BASELINE =
+		0x02040000,
+
+	OtherMethodFeature_FAN_SPEED_1 = 0x04030001,
+	OtherMethodFeature_FAN_SPEED_2 = 0x04030002,
+
+	OtherMethodFeature_C_U1 = 0x05010000,
+	OtherMethodFeature_TEMP_CPU = 0x05040000,
+	OtherMethodFeature_TEMP_GPU = 0x05050000,
+};
+
+static ssize_t wmi_other_method_get_value(enum OtherMethodFeature feature_id,
+					  int *value)
+{
+	struct acpi_buffer params;
+	int error;
+	unsigned long res;
+	u32 param1 = feature_id;
+
+	params.length = sizeof(param1);
+	params.pointer = &param1;
+	error = wmi_exec_int(LEGION_WMI_LENOVO_OTHER_METHOD_GUID, 0,
+			     WMI_METHOD_ID_GET_FEATURE_VALUE, &params, &res);
+	if (!error)
+		*value = res;
+	return error;
+}
+
+/* =================================== */
+/* EC RAM Access with memory mapped IO */
+/* =================================== */
+
+struct ecram_memoryio {
+	// TODO: start of remapped memory in EC RAM is assumed to be 0
+	// u16 ecram_start;
+
+	// physical address of remapped IO, depends on model and firmware
+	phys_addr_t physical_start;
+	// start adress of region in ec memory
+	phys_addr_t physical_ec_start;
+	// virtual address of remapped IO
+	u8 *virtual_start;
+	// size of remapped access
+	size_t size;
+};
+
+/**
+ * physical_start : corresponds to EC RAM 0 inside EC
+ * size: size of remapped region
+ *
+ * strong exception safety
+ */
+static ssize_t ecram_memoryio_init(struct ecram_memoryio *ec_memoryio,
+				   phys_addr_t physical_start,
+				   phys_addr_t physical_ec_start, size_t size)
+{
+	void *virtual_start = ioremap(physical_start, size);
+
+	if (!IS_ERR_OR_NULL(virtual_start)) {
+		ec_memoryio->virtual_start = virtual_start;
+		ec_memoryio->physical_start = physical_start;
+		ec_memoryio->physical_ec_start = physical_ec_start;
+		ec_memoryio->size = size;
+		pr_info("Succeffuly mapped embedded controller: 0x%llx (in RAM)/0x%llx (in EC) to virtual 0x%p\n",
+			ec_memoryio->physical_start,
+			ec_memoryio->physical_ec_start,
+			ec_memoryio->virtual_start);
+	} else {
+		pr_info("Error mapping embedded controller memory at 0x%llx\n",
+			physical_start);
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+static void ecram_memoryio_exit(struct ecram_memoryio *ec_memoryio)
+{
+	if (ec_memoryio->virtual_start != NULL) {
+		pr_info("Unmapping embedded controller memory at 0x%llx (in RAM)/0x%llx (in EC) at virtual 0x%p\n",
+			ec_memoryio->physical_start,
+			ec_memoryio->physical_ec_start,
+			ec_memoryio->virtual_start);
+		iounmap(ec_memoryio->virtual_start);
+		ec_memoryio->virtual_start = NULL;
+	}
+}
+
+/* Read a byte from the EC RAM.
+ *
+ * Return status because of commong signature for alle
+ * methods to access EC RAM.
+ */
+static ssize_t ecram_memoryio_read(const struct ecram_memoryio *ec_memoryio,
+				   u16 ec_offset, u8 *value)
+{
+	if (ec_offset < ec_memoryio->physical_ec_start) {
+		pr_info("Unexpected read at offset %d into EC RAM\n",
+			ec_offset);
+		return -1;
+	}
+	*value = *(ec_memoryio->virtual_start +
+		   (ec_offset - ec_memoryio->physical_ec_start));
+	return 0;
+}
+
+/* Write a byte to the EC RAM.
+ *
+ * Return status because of commong signature for alle
+ * methods to access EC RAM.
+ */
+ssize_t ecram_memoryio_write(const struct ecram_memoryio *ec_memoryio,
+			     u16 ec_offset, u8 value)
+{
+	if (ec_offset < ec_memoryio->physical_ec_start) {
+		pr_info("Unexpected write at offset %d into EC RAM\n",
+			ec_offset);
+		return -1;
+	}
+	*(ec_memoryio->virtual_start +
+	  (ec_offset - ec_memoryio->physical_ec_start)) = value;
+	return 0;
+}
+
+/* ================================= */
+/* EC RAM Access with port-mapped IO */
+/* ================================= */
+
+/*
+ * See datasheet of e.g. IT8502E/F/G, e.g.
+ * 6.2 Plug and Play Configuration (PNPCFG)
+ *
+ * Depending on configured BARDSEL register
+ * the ports
+ *   ECRAM_PORTIO_ADDR_PORT and
+ *   ECRAM_PORTIO_DATA_PORT
+ * are configured.
+ *
+ * By performing IO on these ports one can
+ * read/write to registers in the EC.
+ *
+ * "To access a register of PNPCFG, write target index to
+ *  address port and access this PNPCFG register via
+ *  data port" [datasheet, 6.2 Plug and Play Configuration]
+ */
+
+// IO ports used to write to communicate with embedded controller
+// Start of used ports
+#define ECRAM_PORTIO_START_PORT 0x4E
+// Number of used ports
+#define ECRAM_PORTIO_PORTS_SIZE 2
+// Port used to specify address in EC RAM to read/write
+// 0x4E/0x4F is the usual port for IO super controler
+// 0x2E/0x2F also common (ITE can also be configure to use these)
+#define ECRAM_PORTIO_ADDR_PORT 0x4E
+// Port to send/receive the value  to write/read
+#define ECRAM_PORTIO_DATA_PORT 0x4F
+// Name used to request ports
+#define ECRAM_PORTIO_NAME "legion"
+
+struct ecram_portio {
+	/* protects read/write to EC RAM performed
+	 * as a certain sequence of outb, inb
+	 * commands on the IO ports. There can
+	 * be at most one.
+	 */
+	struct mutex io_port_mutex;
+};
+
+static ssize_t ecram_portio_init(struct ecram_portio *ec_portio)
+{
+	if (!request_region(ECRAM_PORTIO_START_PORT, ECRAM_PORTIO_PORTS_SIZE,
+			    ECRAM_PORTIO_NAME)) {
+		pr_info("Cannot init ecram_portio the %x ports starting at %x\n",
+			ECRAM_PORTIO_PORTS_SIZE, ECRAM_PORTIO_START_PORT);
+		return -ENODEV;
+	}
+	//pr_info("Reserved %x ports starting at %x\n", ECRAM_PORTIO_PORTS_SIZE, ECRAM_PORTIO_START_PORT);
+	mutex_init(&ec_portio->io_port_mutex);
+	return 0;
+}
+
+static void ecram_portio_exit(struct ecram_portio *ec_portio)
+{
+	release_region(ECRAM_PORTIO_START_PORT, ECRAM_PORTIO_PORTS_SIZE);
+}
+
+/* Read a byte from the EC RAM.
+ *
+ * Return status because of commong signature for alle
+ * methods to access EC RAM.
+ */
+static ssize_t ecram_portio_read(struct ecram_portio *ec_portio, u16 offset,
+				 u8 *value)
+{
+	mutex_lock(&ec_portio->io_port_mutex);
+
+	outb(0x2E, ECRAM_PORTIO_ADDR_PORT);
+	outb(0x11, ECRAM_PORTIO_DATA_PORT);
+	outb(0x2F, ECRAM_PORTIO_ADDR_PORT);
+	// TODO: no explicit cast between types seems to be sometimes
+	// done and sometimes not
+	outb((u8)((offset >> 8) & 0xFF), ECRAM_PORTIO_DATA_PORT);
+
+	outb(0x2E, ECRAM_PORTIO_ADDR_PORT);
+	outb(0x10, ECRAM_PORTIO_DATA_PORT);
+	outb(0x2F, ECRAM_PORTIO_ADDR_PORT);
+	outb((u8)(offset & 0xFF), ECRAM_PORTIO_DATA_PORT);
+
+	outb(0x2E, ECRAM_PORTIO_ADDR_PORT);
+	outb(0x12, ECRAM_PORTIO_DATA_PORT);
+	outb(0x2F, ECRAM_PORTIO_ADDR_PORT);
+	*value = inb(ECRAM_PORTIO_DATA_PORT);
+
+	mutex_unlock(&ec_portio->io_port_mutex);
+	return 0;
+}
+
+/* Write a byte to the EC RAM.
+ *
+ * Return status because of commong signature for alle
+ * methods to access EC RAM.
+ */
+static ssize_t ecram_portio_write(struct ecram_portio *ec_portio, u16 offset,
+				  u8 value)
+{
+	mutex_lock(&ec_portio->io_port_mutex);
+
+	outb(0x2E, ECRAM_PORTIO_ADDR_PORT);
+	outb(0x11, ECRAM_PORTIO_DATA_PORT);
+	outb(0x2F, ECRAM_PORTIO_ADDR_PORT);
+	// TODO: no explicit cast between types seems to be sometimes
+	// done and sometimes not
+	outb((u8)((offset >> 8) & 0xFF), ECRAM_PORTIO_DATA_PORT);
+
+	outb(0x2E, ECRAM_PORTIO_ADDR_PORT);
+	outb(0x10, ECRAM_PORTIO_DATA_PORT);
+	outb(0x2F, ECRAM_PORTIO_ADDR_PORT);
+	outb((u8)(offset & 0xFF), ECRAM_PORTIO_DATA_PORT);
+
+	outb(0x2E, ECRAM_PORTIO_ADDR_PORT);
+	outb(0x12, ECRAM_PORTIO_DATA_PORT);
+	outb(0x2F, ECRAM_PORTIO_ADDR_PORT);
+	outb(value, ECRAM_PORTIO_DATA_PORT);
+
+	mutex_unlock(&ec_portio->io_port_mutex);
+	// TODO: remove this
+	//pr_info("Writing %d to addr %x\n", value, offset);
+	return 0;
+}
+
+/* =================================== */
+/* EC RAM Access                       */
+/* =================================== */
+
+struct ecram {
+	struct ecram_portio portio;
+};
+
+static ssize_t ecram_init(struct ecram *ecram,
+			  phys_addr_t memoryio_ec_physical_start,
+			  size_t region_size)
+{
+	ssize_t err;
+
+	err = ecram_portio_init(&ecram->portio);
+	if (err) {
+		pr_info("Failed ecram_portio_init\n");
+		goto err_ecram_portio_init;
+	}
+
+	return 0;
+
+err_ecram_portio_init:
+	return err;
+}
+
+static void ecram_exit(struct ecram *ecram)
+{
+	pr_info("Unloading legion ecram\n");
+	ecram_portio_exit(&ecram->portio);
+	pr_info("Unloading legion ecram done\n");
+}
+
+/** Read from EC RAM
+ * ecram_offset address on the EC
+ */
+static u8 ecram_read(struct ecram *ecram, u16 ecram_offset)
+{
+	u8 value;
+	int err;
+
+	err = ecram_portio_read(&ecram->portio, ecram_offset, &value);
+	if (err)
+		pr_info("Error reading EC RAM at 0x%x\n", ecram_offset);
+	return value;
+}
+
+static void ecram_write(struct ecram *ecram, u16 ecram_offset, u8 value)
+{
+	int err;
+
+	if (ec_readonly) {
+		pr_info("Skipping writing EC RAM at 0x%x because readonly.\n",
+			ecram_offset);
+		return;
+	}
+	err = ecram_portio_write(&ecram->portio, ecram_offset, value);
+	if (err)
+		pr_info("Error writing EC RAM at 0x%x\n", ecram_offset);
+}
+
+/* =============================== */
+/* Reads from EC  */
+/* ===============================  */
+
+static u16 read_ec_id(struct ecram *ecram, const struct model_config *model)
+{
+	u8 id1 = ecram_read(ecram, model->registers->ECHIPID1);
+	u8 id2 = ecram_read(ecram, model->registers->ECHIPID2);
+
+	return (id1 << 8) + id2;
+}
+
+static u16 read_ec_version(struct ecram *ecram,
+			   const struct model_config *model)
+{
+	u8 vers = ecram_read(ecram, model->registers->ECHIPVER);
+	u8 debug = ecram_read(ecram, model->registers->ECDEBUG);
+
+	return (vers << 8) + debug;
+}
+
+/* ============================= */
+/* Data model for sensor values  */
+/* ============================= */
+
+struct sensor_values {
+	u16 fan1_rpm; // current speed in rpm of fan 1
+	u16 fan2_rpm; // current speed in rpm of fan2
+	u16 fan1_target_rpm; // target speed in rpm of fan 1
+	u16 fan2_target_rpm; // target speed in rpm of fan 2
+	u8 cpu_temp_celsius; // cpu temperature in celcius
+	u8 gpu_temp_celsius; // gpu temperature in celcius
+	u8 ic_temp_celsius; // ic temperature in celcius
+};
+
+enum SENSOR_ATTR {
+	SENSOR_CPU_TEMP_ID = 1,
+	SENSOR_GPU_TEMP_ID = 2,
+	SENSOR_IC_TEMP_ID = 3,
+	SENSOR_FAN1_RPM_ID = 4,
+	SENSOR_FAN2_RPM_ID = 5,
+	SENSOR_FAN1_TARGET_RPM_ID = 6,
+	SENSOR_FAN2_TARGET_RPM_ID = 7
+};
+
+/* ============================= */
+/* Data model for fan curve      */
+/* ============================= */
+
+struct fancurve_point {
+	// rpm1 devided by 100
+	u8 rpm1_raw;
+	// rpm2 devided by 100
+	u8 rpm2_raw;
+	// >=2 , <=5 (lower is faster); must be increasing by level
+	u8 accel;
+	// >=2 , <=5 (lower is faster); must be increasing by level
+	u8 decel;
+
+	// min must be lower or equal than max
+	// last level max must be 127
+	// <=127 cpu max temp for this level; must be increasing by level
+	u8 cpu_max_temp_celsius;
+	// <=127 cpu min temp for this level; must be increasing by level
+	u8 cpu_min_temp_celsius;
+	// <=127 gpu min temp for this level; must be increasing by level
+	u8 gpu_max_temp_celsius;
+	// <=127 gpu max temp for this level; must be increasing by level
+	u8 gpu_min_temp_celsius;
+	// <=127 ic max temp for this level; must be increasing by level
+	u8 ic_max_temp_celsius;
+	// <=127 ic max temp for this level; must be increasing by level
+	u8 ic_min_temp_celsius;
+};
+
+enum FANCURVE_ATTR {
+	FANCURVE_ATTR_PWM1 = 1,
+	FANCURVE_ATTR_PWM2 = 2,
+	FANCURVE_ATTR_CPU_TEMP = 3,
+	FANCURVE_ATTR_CPU_HYST = 4,
+	FANCURVE_ATTR_GPU_TEMP = 5,
+	FANCURVE_ATTR_GPU_HYST = 6,
+	FANCURVE_ATTR_IC_TEMP = 7,
+	FANCURVE_ATTR_IC_HYST = 8,
+	FANCURVE_ATTR_ACCEL = 9,
+	FANCURVE_ATTR_DECEL = 10,
+	FANCURVE_SIZE = 11,
+	FANCURVE_MINIFANCURVE_ON_COOL = 12
+};
+
+// used for clearing table entries
+static const struct fancurve_point fancurve_point_zero = { 0, 0, 0, 0, 0,
+							   0, 0, 0, 0, 0 };
+
+struct fancurve {
+	struct fancurve_point points[MAXFANCURVESIZE];
+	// number of points used; must be <= MAXFANCURVESIZE
+	size_t size;
+	// the point that at which fans are run currently
+	size_t current_point_i;
+};
+
+// validation functions
+
+static bool fancurve_is_valid_min_temp(int min_temp)
+{
+	return min_temp >= 0 && min_temp <= 127;
+}
+
+static bool fancurve_is_valid_max_temp(int max_temp)
+{
+	return max_temp >= 0 && max_temp <= 127;
+}
+
+// setters with validation
+// - make hwmon implementation easier
+// - keep fancurve valid, otherwise EC will not properly control fan
+
+static bool fancurve_set_rpm1(struct fancurve *fancurve, int point_id, int rpm)
+{
+	bool valid = point_id == 0 ? rpm == 0 : (rpm >= 0 && rpm <= 4500);
+
+	if (valid)
+		fancurve->points[point_id].rpm1_raw = rpm / 100;
+	return valid;
+}
+
+static bool fancurve_set_rpm2(struct fancurve *fancurve, int point_id, int rpm)
+{
+	bool valid = point_id == 0 ? rpm == 0 : (rpm >= 0 && rpm <= 4500);
+
+	if (valid)
+		fancurve->points[point_id].rpm2_raw = rpm / 100;
+	return valid;
+}
+
+// TODO: remove { ... } from single line if body
+
+static bool fancurve_set_accel(struct fancurve *fancurve, int point_id,
+			       int accel)
+{
+	bool valid = accel >= 2 && accel <= 5;
+
+	if (valid)
+		fancurve->points[point_id].accel = accel;
+	return valid;
+}
+
+static bool fancurve_set_decel(struct fancurve *fancurve, int point_id,
+			       int decel)
+{
+	bool valid = decel >= 2 && decel <= 5;
+
+	if (valid)
+		fancurve->points[point_id].decel = decel;
+	return valid;
+}
+
+static bool fancurve_set_cpu_temp_max(struct fancurve *fancurve, int point_id,
+				      int value)
+{
+	bool valid = fancurve_is_valid_max_temp(value);
+
+	if (valid)
+		fancurve->points[point_id].cpu_max_temp_celsius = value;
+
+	return valid;
+}
+
+static bool fancurve_set_gpu_temp_max(struct fancurve *fancurve, int point_id,
+				      int value)
+{
+	bool valid = fancurve_is_valid_max_temp(value);
+
+	if (valid)
+		fancurve->points[point_id].gpu_max_temp_celsius = value;
+	return valid;
+}
+
+static bool fancurve_set_ic_temp_max(struct fancurve *fancurve, int point_id,
+				     int value)
+{
+	bool valid = fancurve_is_valid_max_temp(value);
+
+	if (valid)
+		fancurve->points[point_id].ic_max_temp_celsius = value;
+	return valid;
+}
+
+static bool fancurve_set_cpu_temp_min(struct fancurve *fancurve, int point_id,
+				      int value)
+{
+	bool valid = fancurve_is_valid_max_temp(value);
+
+	if (valid)
+		fancurve->points[point_id].cpu_min_temp_celsius = value;
+	return valid;
+}
+
+static bool fancurve_set_gpu_temp_min(struct fancurve *fancurve, int point_id,
+				      int value)
+{
+	bool valid = fancurve_is_valid_min_temp(value);
+
+	if (valid)
+		fancurve->points[point_id].gpu_min_temp_celsius = value;
+	return valid;
+}
+
+static bool fancurve_set_ic_temp_min(struct fancurve *fancurve, int point_id,
+				     int value)
+{
+	bool valid = fancurve_is_valid_min_temp(value);
+
+	if (valid)
+		fancurve->points[point_id].ic_min_temp_celsius = value;
+	return valid;
+}
+
+static bool fancurve_set_size(struct fancurve *fancurve, int size,
+			      bool init_values)
+{
+	bool valid = size >= 1 && size <= MAXFANCURVESIZE;
+
+	if (!valid)
+		return false;
+	if (init_values && size < fancurve->size) {
+		// fancurve size is decreased, but last etnry alwasy needs 127 temperatures
+		// Note: size >=1
+		fancurve->points[size - 1].cpu_max_temp_celsius = 127;
+		fancurve->points[size - 1].ic_max_temp_celsius = 127;
+		fancurve->points[size - 1].gpu_max_temp_celsius = 127;
+	}
+	if (init_values && size > fancurve->size) {
+		// fancurve increased, so new entries need valid values
+		int i;
+		int last = fancurve->size > 0 ? fancurve->size - 1 : 0;
+
+		for (i = fancurve->size; i < size; ++i)
+			fancurve->points[i] = fancurve->points[last];
+	}
+	return true;
+}
+
+static ssize_t fancurve_print_seqfile(const struct fancurve *fancurve,
+				      struct seq_file *s)
+{
+	int i;
+
+	seq_printf(
+		s,
+		"rpm1|rpm2|acceleration|deceleration|cpu_min_temp|cpu_max_temp|gpu_min_temp|gpu_max_temp|ic_min_temp|ic_max_temp\n");
+	for (i = 0; i < fancurve->size; ++i) {
+		const struct fancurve_point *point = &fancurve->points[i];
+
+		seq_printf(
+			s, "%d\t %d\t %d\t %d\t %d\t %d\t %d\t %d\t %d\t %d\n",
+			point->rpm1_raw * 100, point->rpm2_raw * 100,
+			point->accel, point->decel, point->cpu_min_temp_celsius,
+			point->cpu_max_temp_celsius,
+			point->gpu_min_temp_celsius,
+			point->gpu_max_temp_celsius, point->ic_min_temp_celsius,
+			point->ic_max_temp_celsius);
+	}
+	return 0;
+}
+
+struct light {
+	bool initialized;
+	struct led_classdev led;
+	unsigned int last_brightness;
+	u8 light_id;
+	unsigned int lower_limit;
+	unsigned int upper_limit;
+};
+
+/* =============================  */
+/* Global and shared data between */
+/* all calls to this module       */
+/* =============================  */
+// Implemented like ideapad-laptop.c but currenlty still
+// wihtout dynamic memory allocation (instead global _priv)
+struct legion_private {
+	struct platform_device *platform_device;
+	// TODO: remove or keep? init?
+	struct acpi_device *adev;
+
+	// Method to access ECRAM
+	struct ecram ecram;
+	// Configuration with registers an ECRAM access method
+	const struct model_config *conf;
+
+	// TODO: maybe refactor an keep only local to each function
+	// last known fan curve
+	struct fancurve fancurve;
+	// configured fan curve from user space
+	struct fancurve fancurve_configured;
+
+	// update lock, when partial values of fancurve are changed
+	struct mutex fancurve_mutex;
+
+	//interfaces
+	struct dentry *debugfs_dir;
+	struct device *hwmon_dev;
+	struct platform_profile_handler platform_profile_handler;
+
+	struct light kbd_bl;
+	struct light ylogo_light;
+	struct light iport_light;
+
+	// TODO: remove?
+	bool loaded;
+
+	// TODO: remove, only for reverse enginnering
+	struct ecram_memoryio ec_memoryio;
+};
+
+// shared between different drivers: WMI, platform and proteced by mutex
+static struct legion_private *legion_shared;
+static struct legion_private _priv;
+static DEFINE_MUTEX(legion_shared_mutex);
+
+static int legion_shared_init(struct legion_private *priv)
+{
+	int ret;
+
+	mutex_lock(&legion_shared_mutex);
+
+	if (!legion_shared) {
+		legion_shared = priv;
+		mutex_init(&legion_shared->fancurve_mutex);
+		ret = 0;
+	} else {
+		pr_warn("Found multiple platform devices\n");
+		ret = -EINVAL;
+	}
+
+	priv->loaded = true;
+	mutex_unlock(&legion_shared_mutex);
+
+	return ret;
+}
+
+static void legion_shared_exit(struct legion_private *priv)
+{
+	pr_info("Unloading legion shared\n");
+	mutex_lock(&legion_shared_mutex);
+
+	if (legion_shared == priv)
+		legion_shared = NULL;
+
+	mutex_unlock(&legion_shared_mutex);
+	pr_info("Unloading legion shared done\n");
+}
+
+static int get_simple_wmi_attribute(struct legion_private *priv,
+				    const char *guid, u8 instance,
+				    u32 method_id, bool invert,
+				    unsigned long scale, unsigned long *value)
+{
+	unsigned long state = 0;
+	int err;
+
+	if (scale == 0) {
+		pr_info("Scale cannot be 0\n");
+		return -EINVAL;
+	}
+	err = wmi_exec_noarg_int(guid, instance, method_id, &state);
+	if (err)
+		return -EINVAL;
+
+	// TODO: remove later
+	pr_info("%swith raw value: %ld\n", __func__, state);
+
+	state = state * scale;
+
+	if (invert)
+		state = !state;
+	*value = state;
+	return 0;
+}
+
+static int get_simple_wmi_attribute_bool(struct legion_private *priv,
+					 const char *guid, u8 instance,
+					 u32 method_id, bool invert,
+					 unsigned long scale, bool *value)
+{
+	unsigned long int_val = *value;
+	int err = get_simple_wmi_attribute(priv, guid, instance, method_id,
+					   invert, scale, &int_val);
+	*value = int_val;
+	return err;
+}
+
+static int set_simple_wmi_attribute(struct legion_private *priv,
+				    const char *guid, u8 instance,
+				    u32 method_id, bool invert, int scale,
+				    int state)
+{
+	int err;
+	u8 in_param;
+
+	if (scale == 0) {
+		pr_info("Scale cannot be 0\n");
+		return -EINVAL;
+	}
+
+	if (invert)
+		state = !state;
+
+	in_param = state / scale;
+
+	err = wmi_exec_arg(guid, instance, method_id, &in_param,
+			   sizeof(in_param));
+	return err;
+}
+
+/* ============================= */
+/* Sensor values reading/writing */
+/* ============================= */
+
+static int ec_read_sensor_values(struct ecram *ecram,
+				 const struct model_config *model,
+				 struct sensor_values *values)
+{
+	values->fan1_target_rpm =
+		100 * ecram_read(ecram, model->registers->EXT_FAN1_TARGET_RPM);
+	values->fan2_target_rpm =
+		100 * ecram_read(ecram, model->registers->EXT_FAN2_TARGET_RPM);
+
+	values->fan1_rpm =
+		ecram_read(ecram, model->registers->EXT_FAN1_RPM_LSB) +
+		(((int)ecram_read(ecram, model->registers->EXT_FAN1_RPM_MSB))
+		 << 8);
+	values->fan2_rpm =
+		ecram_read(ecram, model->registers->EXT_FAN2_RPM_LSB) +
+		(((int)ecram_read(ecram, model->registers->EXT_FAN2_RPM_MSB))
+		 << 8);
+
+	values->cpu_temp_celsius =
+		ecram_read(ecram, model->registers->EXT_CPU_TEMP_INPUT);
+	values->gpu_temp_celsius =
+		ecram_read(ecram, model->registers->EXT_GPU_TEMP_INPUT);
+	values->ic_temp_celsius =
+		ecram_read(ecram, model->registers->EXT_IC_TEMP_INPUT);
+
+	values->cpu_temp_celsius = ecram_read(ecram, 0xC5E6);
+	values->gpu_temp_celsius = ecram_read(ecram, 0xC5E7);
+	values->ic_temp_celsius = ecram_read(ecram, 0xC5E8);
+
+	return 0;
+}
+
+static ssize_t ec_read_temperature(struct ecram *ecram,
+				   const struct model_config *model,
+				   int sensor_id, int *temperature)
+{
+	int err = 0;
+	unsigned long res;
+
+	if (sensor_id == 0) {
+		res = ecram_read(ecram, 0xC5E6);
+	} else if (sensor_id == 1) {
+		res = ecram_read(ecram, 0xC5E7);
+	} else {
+		// TODO: use all correct error codes
+		return -EEXIST;
+	}
+	if (!err)
+		*temperature = res;
+	return err;
+}
+
+static ssize_t ec_read_fanspeed(struct ecram *ecram,
+				const struct model_config *model, int fan_id,
+				int *fanspeed_rpm)
+{
+	int err = 0;
+	unsigned long res;
+
+	if (fan_id == 0) {
+		res = ecram_read(ecram, model->registers->EXT_FAN1_RPM_LSB) +
+		      (((int)ecram_read(ecram,
+					model->registers->EXT_FAN1_RPM_MSB))
+		       << 8);
+	} else if (fan_id == 1) {
+		res = ecram_read(ecram, model->registers->EXT_FAN2_RPM_LSB) +
+		      (((int)ecram_read(ecram,
+					model->registers->EXT_FAN2_RPM_MSB))
+		       << 8);
+	} else {
+		// TODO: use all correct error codes
+		return -EEXIST;
+	}
+	if (!err)
+		*fanspeed_rpm = res;
+	return err;
+}
+
+// '\_SB.PCI0.LPC0.EC0.FANS
+#define ACPI_PATH_FAN_SPEED1 "FANS"
+// '\_SB.PCI0.LPC0.EC0.FA2S
+#define ACPI_PATH_FAN_SPEED2 "FA2S"
+
+static ssize_t acpi_read_fanspeed(struct legion_private *priv, int fan_id,
+				  int *value)
+{
+	int err;
+	unsigned long acpi_value;
+	const char *acpi_path;
+
+	if (fan_id == 0) {
+		acpi_path = ACPI_PATH_FAN_SPEED1;
+	} else if (fan_id == 1) {
+		acpi_path = ACPI_PATH_FAN_SPEED2;
+	} else {
+		// TODO: use all correct error codes
+		return -EEXIST;
+	}
+	err = eval_int(priv->adev->handle, acpi_path, &acpi_value);
+	if (!err)
+		*value = (int)acpi_value * 100;
+	return err;
+}
+
+// '\_SB.PCI0.LPC0.EC0.CPUT
+#define ACPI_PATH_CPU_TEMP "CPUT"
+// '\_SB.PCI0.LPC0.EC0.GPUT
+#define ACPI_PATH_GPU_TEMP "GPUT"
+
+static ssize_t acpi_read_temperature(struct legion_private *priv, int fan_id,
+				     int *value)
+{
+	int err;
+	unsigned long acpi_value;
+	const char *acpi_path;
+
+	if (fan_id == 0) {
+		acpi_path = ACPI_PATH_CPU_TEMP;
+	} else if (fan_id == 1) {
+		acpi_path = ACPI_PATH_GPU_TEMP;
+	} else {
+		// TODO: use all correct error codes
+		return -EEXIST;
+	}
+	err = eval_int(priv->adev->handle, acpi_path, &acpi_value);
+	if (!err)
+		*value = (int)acpi_value;
+	return err;
+}
+
+// fan_id: 0 or 1
+static ssize_t wmi_read_fanspeed(int fan_id, int *fanspeed_rpm)
+{
+	int err;
+	unsigned long res;
+	struct acpi_buffer params;
+
+	params.length = 1;
+	params.pointer = &fan_id;
+
+	err = wmi_exec_int(WMI_GUID_LENOVO_FAN_METHOD, 0,
+			   WMI_METHOD_ID_FAN_GETCURRENTFANSPEED, &params, &res);
+
+	if (!err)
+		*fanspeed_rpm = res;
+	return err;
+}
+
+//sensor_id: cpu = 0, gpu = 1
+static ssize_t wmi_read_temperature(int sensor_id, int *temperature)
+{
+	int err;
+	unsigned long res;
+	struct acpi_buffer params;
+
+	if (sensor_id == 0)
+		sensor_id = 0x03;
+	else if (sensor_id == 1)
+		sensor_id = 0x04;
+	else {
+		// TODO: use all correct error codes
+		return -EEXIST;
+	}
+
+	params.length = 1;
+	params.pointer = &sensor_id;
+
+	err = wmi_exec_int(WMI_GUID_LENOVO_FAN_METHOD, 0,
+			   WMI_METHOD_ID_FAN_GETCURRENTSENSORTEMPERATURE,
+			   &params, &res);
+
+	if (!err)
+		*temperature = res;
+	return err;
+}
+
+// fan_id: 0 or 1
+static ssize_t wmi_read_fanspeed_gz(int fan_id, int *fanspeed_rpm)
+{
+	int err;
+	u32 method_id;
+	unsigned long res;
+
+	if (fan_id == 0)
+		method_id = WMI_METHOD_ID_GETFAN1SPEED;
+	else if (fan_id == 1)
+		method_id = WMI_METHOD_ID_GETFAN2SPEED;
+	else {
+		// TODO: use all correct error codes
+		return -EEXIST;
+	}
+	err = wmi_exec_noarg_int(LEGION_WMI_GAMEZONE_GUID, 0, method_id, &res);
+
+	if (!err)
+		*fanspeed_rpm = res;
+	return err;
+}
+
+//sensor_id: cpu = 0, gpu = 1
+static ssize_t wmi_read_temperature_gz(int sensor_id, int *temperature)
+{
+	int err;
+	u32 method_id;
+	unsigned long res;
+
+	if (sensor_id == 0)
+		method_id = WMI_METHOD_ID_GETCPUTEMP;
+	else if (sensor_id == 1)
+		method_id = WMI_METHOD_ID_GETGPUTEMP;
+	else {
+		// TODO: use all correct error codes
+		return -EEXIST;
+	}
+
+	err = wmi_exec_noarg_int(LEGION_WMI_GAMEZONE_GUID, 0, method_id, &res);
+
+	if (!err)
+		*temperature = res;
+	return err;
+}
+
+// fan_id: 0 or 1
+static ssize_t wmi_read_fanspeed_other(int fan_id, int *fanspeed_rpm)
+{
+	int err;
+	enum OtherMethodFeature featured_id;
+	int res;
+
+	if (fan_id == 0)
+		featured_id = OtherMethodFeature_FAN_SPEED_1;
+	else if (fan_id == 1)
+		featured_id = OtherMethodFeature_FAN_SPEED_2;
+	else {
+		// TODO: use all correct error codes
+		return -EEXIST;
+	}
+
+	err = wmi_other_method_get_value(featured_id, &res);
+
+	if (!err)
+		*fanspeed_rpm = res;
+	return err;
+}
+
+//sensor_id: cpu = 0, gpu = 1
+static ssize_t wmi_read_temperature_other(int sensor_id, int *temperature)
+{
+	int err;
+	enum OtherMethodFeature featured_id;
+	int res;
+
+	if (sensor_id == 0)
+		featured_id = OtherMethodFeature_TEMP_CPU;
+	else if (sensor_id == 1)
+		featured_id = OtherMethodFeature_TEMP_GPU;
+	else {
+		// TODO: use all correct error codes
+		return -EEXIST;
+	}
+
+	err = wmi_other_method_get_value(featured_id, &res);
+	if (!err)
+		*temperature = res;
+	return err;
+}
+
+static ssize_t read_fanspeed(struct legion_private *priv, int fan_id,
+			     int *speed_rpm)
+{
+	// TODO: use enums or function pointers?
+	switch (priv->conf->access_method_fanspeed) {
+	case ACCESS_METHOD_EC:
+		return ec_read_fanspeed(&priv->ecram, priv->conf, fan_id,
+					speed_rpm);
+	case ACCESS_METHOD_ACPI:
+		return acpi_read_fanspeed(priv, fan_id, speed_rpm);
+	case ACCESS_METHOD_WMI:
+		return wmi_read_fanspeed_gz(fan_id, speed_rpm);
+	case ACCESS_METHOD_WMI2:
+		return wmi_read_fanspeed(fan_id, speed_rpm);
+	case ACCESS_METHOD_WMI3:
+		return wmi_read_fanspeed_other(fan_id, speed_rpm);
+	default:
+		pr_info("No access method for fanspeed: %d\n",
+			priv->conf->access_method_fanspeed);
+		return -EINVAL;
+	}
+}
+
+static ssize_t read_temperature(struct legion_private *priv, int sensor_id,
+				int *temperature)
+{
+	// TODO: use enums or function pointers?
+	switch (priv->conf->access_method_temperature) {
+	case ACCESS_METHOD_EC:
+		return ec_read_temperature(&priv->ecram, priv->conf, sensor_id,
+					   temperature);
+	case ACCESS_METHOD_ACPI:
+		return acpi_read_temperature(priv, sensor_id, temperature);
+	case ACCESS_METHOD_WMI:
+		return wmi_read_temperature_gz(sensor_id, temperature);
+	case ACCESS_METHOD_WMI2:
+		return wmi_read_temperature(sensor_id, temperature);
+	case ACCESS_METHOD_WMI3:
+		return wmi_read_temperature_other(sensor_id, temperature);
+	default:
+		pr_info("No access method for temperature: %d\n",
+			priv->conf->access_method_temperature);
+		return -EINVAL;
+	}
+}
+
+/* ============================= */
+/* Fancurve reading/writing      */
+/* ============================= */
+
+/* Fancurve from WMI
+ * This allows changing fewer parameters.
+ * It is only available on newer models.
+ */
+
+struct WMIFanTable {
+	u8 FSTM; //FSMD
+	u8 FSID;
+	u32 FSTL; //FSST
+	u16 FSS0;
+	u16 FSS1;
+	u16 FSS2;
+	u16 FSS3;
+	u16 FSS4;
+	u16 FSS5;
+	u16 FSS6;
+	u16 FSS7;
+	u16 FSS8;
+	u16 FSS9;
+} __packed;
+
+struct WMIFanTableRead {
+	u32 FSFL;
+	u32 FSS0;
+	u32 FSS1;
+	u32 FSS2;
+	u32 FSS3;
+	u32 FSS4;
+	u32 FSS5;
+	u32 FSS6;
+	u32 FSS7;
+	u32 FSS8;
+	u32 FSS9;
+	u32 FSSA;
+} __packed;
+
+static ssize_t wmi_read_fancurve_custom(const struct model_config *model,
+					struct fancurve *fancurve)
+{
+	u8 buffer[88];
+	int err;
+
+	// The output buffer from the ACPI call is 88 bytes and larger
+	// than the returned object
+	pr_info("Size of object: %lu\n", sizeof(struct WMIFanTableRead));
+	err = wmi_exec_noarg_ints(WMI_GUID_LENOVO_FAN_METHOD, 0,
+				  WMI_METHOD_ID_FAN_GET_TABLE, buffer,
+				  sizeof(buffer));
+	print_hex_dump(KERN_INFO, "legion_laptop fan table wmi buffer",
+		       DUMP_PREFIX_ADDRESS, 16, 1, buffer, sizeof(buffer),
+		       true);
+	if (!err) {
+		struct WMIFanTableRead *fantable =
+			(struct WMIFanTableRead *)&buffer[0];
+		fancurve->current_point_i = 0;
+		fancurve->size = 10;
+		fancurve->points[0].rpm1_raw = fantable->FSS0;
+		fancurve->points[1].rpm1_raw = fantable->FSS1;
+		fancurve->points[2].rpm1_raw = fantable->FSS2;
+		fancurve->points[3].rpm1_raw = fantable->FSS3;
+		fancurve->points[4].rpm1_raw = fantable->FSS4;
+		fancurve->points[5].rpm1_raw = fantable->FSS5;
+		fancurve->points[6].rpm1_raw = fantable->FSS6;
+		fancurve->points[7].rpm1_raw = fantable->FSS7;
+		fancurve->points[8].rpm1_raw = fantable->FSS8;
+		fancurve->points[9].rpm1_raw = fantable->FSS9;
+		//fancurve->points[10].rpm1_raw = fantable->FSSA;
+	}
+	return err;
+}
+
+static ssize_t wmi_write_fancurve_custom(const struct model_config *model,
+					 const struct fancurve *fancurve)
+{
+	u8 buffer[0x20];
+	int err;
+
+	// The buffer is read like this in ACPI firmware
+	//
+	// CreateByteField (Arg2, Zero, FSTM)
+	// CreateByteField (Arg2, One, FSID)
+	// CreateDWordField (Arg2, 0x02, FSTL)
+	// CreateByteField (Arg2, 0x06, FSS0)
+	// CreateByteField (Arg2, 0x08, FSS1)
+	// CreateByteField (Arg2, 0x0A, FSS2)
+	// CreateByteField (Arg2, 0x0C, FSS3)
+	// CreateByteField (Arg2, 0x0E, FSS4)
+	// CreateByteField (Arg2, 0x10, FSS5)
+	// CreateByteField (Arg2, 0x12, FSS6)
+	// CreateByteField (Arg2, 0x14, FSS7)
+	// CreateByteField (Arg2, 0x16, FSS8)
+	// CreateByteField (Arg2, 0x18, FSS9)
+
+	memset(buffer, 0, sizeof(buffer));
+	buffer[0x06] = fancurve->points[0].rpm1_raw;
+	buffer[0x08] = fancurve->points[1].rpm1_raw;
+	buffer[0x0A] = fancurve->points[2].rpm1_raw;
+	buffer[0x0C] = fancurve->points[3].rpm1_raw;
+	buffer[0x0E] = fancurve->points[4].rpm1_raw;
+	buffer[0x10] = fancurve->points[5].rpm1_raw;
+	buffer[0x12] = fancurve->points[6].rpm1_raw;
+	buffer[0x14] = fancurve->points[7].rpm1_raw;
+	buffer[0x16] = fancurve->points[8].rpm1_raw;
+	buffer[0x18] = fancurve->points[9].rpm1_raw;
+
+	print_hex_dump(KERN_INFO, "legion_laptop fan table wmi write buffer",
+		       DUMP_PREFIX_ADDRESS, 16, 1, buffer, sizeof(buffer),
+		       true);
+	err = wmi_exec_arg(WMI_GUID_LENOVO_FAN_METHOD, 0,
+			   WMI_METHOD_ID_FAN_SET_TABLE, buffer, sizeof(buffer));
+	return err;
+}
+
+/* Read the fan curve from the EC.
+ *
+ * In newer models (>=2022) there is an ACPI/WMI to read fan curve as
+ * a whole. So read/write fan table as a whole to use
+ * same interface for both cases.
+ *
+ * It reads all points from EC memory, even if stored fancurve is smaller, so
+ * it can contain 0 entries.
+ */
+static int ec_read_fancurve_legion(struct ecram *ecram,
+				   const struct model_config *model,
+				   struct fancurve *fancurve)
+{
+	size_t i = 0;
+
+	for (i = 0; i < MAXFANCURVESIZE; ++i) {
+		struct fancurve_point *point = &fancurve->points[i];
+
+		point->rpm1_raw =
+			ecram_read(ecram, model->registers->EXT_FAN1_BASE + i);
+		point->rpm2_raw =
+			ecram_read(ecram, model->registers->EXT_FAN2_BASE + i);
+
+		point->accel = ecram_read(
+			ecram, model->registers->EXT_FAN_ACC_BASE + i);
+		point->decel = ecram_read(
+			ecram, model->registers->EXT_FAN_DEC_BASE + i);
+		point->cpu_max_temp_celsius =
+			ecram_read(ecram, model->registers->EXT_CPU_TEMP + i);
+		point->cpu_min_temp_celsius = ecram_read(
+			ecram, model->registers->EXT_CPU_TEMP_HYST + i);
+		point->gpu_max_temp_celsius =
+			ecram_read(ecram, model->registers->EXT_GPU_TEMP + i);
+		point->gpu_min_temp_celsius = ecram_read(
+			ecram, model->registers->EXT_GPU_TEMP_HYST + i);
+		point->ic_max_temp_celsius =
+			ecram_read(ecram, model->registers->EXT_VRM_TEMP + i);
+		point->ic_min_temp_celsius = ecram_read(
+			ecram, model->registers->EXT_VRM_TEMP_HYST + i);
+	}
+
+	// Do not trust that hardware; It might suddendly report
+	// a larger size, so clamp it.
+	fancurve->size =
+		ecram_read(ecram, model->registers->EXT_FAN_POINTS_SIZE);
+	fancurve->size =
+		min(fancurve->size, (typeof(fancurve->size))(MAXFANCURVESIZE));
+	fancurve->current_point_i =
+		ecram_read(ecram, model->registers->EXT_FAN_CUR_POINT);
+	fancurve->current_point_i =
+		min(fancurve->current_point_i, fancurve->size);
+	return 0;
+}
+
+static int ec_write_fancurve_legion(struct ecram *ecram,
+				    const struct model_config *model,
+				    const struct fancurve *fancurve,
+				    bool write_size)
+{
+	size_t i;
+
+	//TODO: remove again
+	pr_info("Set fancurve\n");
+
+	// Reset fan update counters (try to avoid any race conditions)
+	ecram_write(ecram, 0xC5FE, 0);
+	ecram_write(ecram, 0xC5FF, 0);
+	for (i = 0; i < MAXFANCURVESIZE; ++i) {
+		// Entries for points larger than fancurve size should be cleared
+		// to 0
+		const struct fancurve_point *point =
+			i < fancurve->size ? &fancurve->points[i] :
+					     &fancurve_point_zero;
+
+		ecram_write(ecram, model->registers->EXT_FAN1_BASE + i,
+			    point->rpm1_raw);
+		ecram_write(ecram, model->registers->EXT_FAN2_BASE + i,
+			    point->rpm2_raw);
+
+		ecram_write(ecram, model->registers->EXT_FAN_ACC_BASE + i,
+			    point->accel);
+		ecram_write(ecram, model->registers->EXT_FAN_DEC_BASE + i,
+			    point->decel);
+
+		ecram_write(ecram, model->registers->EXT_CPU_TEMP + i,
+			    point->cpu_max_temp_celsius);
+		ecram_write(ecram, model->registers->EXT_CPU_TEMP_HYST + i,
+			    point->cpu_min_temp_celsius);
+		ecram_write(ecram, model->registers->EXT_GPU_TEMP + i,
+			    point->gpu_max_temp_celsius);
+		ecram_write(ecram, model->registers->EXT_GPU_TEMP_HYST + i,
+			    point->gpu_min_temp_celsius);
+		ecram_write(ecram, model->registers->EXT_VRM_TEMP + i,
+			    point->ic_max_temp_celsius);
+		ecram_write(ecram, model->registers->EXT_VRM_TEMP_HYST + i,
+			    point->ic_min_temp_celsius);
+	}
+
+	if (write_size) {
+		ecram_write(ecram, model->registers->EXT_FAN_POINTS_SIZE,
+			    fancurve->size);
+	}
+
+	// Reset current fan level to 0, so algorithm in EC
+	// selects fan curve point again and resetting hysterisis
+	// effects
+	ecram_write(ecram, model->registers->EXT_FAN_CUR_POINT, 0);
+
+	// Reset internal fan levels
+	ecram_write(ecram, 0xC634, 0); // CPU
+	ecram_write(ecram, 0xC635, 0); // GPU
+	ecram_write(ecram, 0xC636, 0); // SENSOR
+
+	return 0;
+}
+
+#define FANCURVESIZE_IDEAPDAD 8
+
+static int ec_read_fancurve_ideapad(struct ecram *ecram,
+				    const struct model_config *model,
+				    struct fancurve *fancurve)
+{
+	size_t i = 0;
+
+	for (i = 0; i < FANCURVESIZE_IDEAPDAD; ++i) {
+		struct fancurve_point *point = &fancurve->points[i];
+
+		point->rpm1_raw =
+			ecram_read(ecram, model->registers->EXT_FAN1_BASE + i);
+		point->rpm2_raw =
+			ecram_read(ecram, model->registers->EXT_FAN2_BASE + i);
+
+		point->accel = 0;
+		point->decel = 0;
+		point->cpu_max_temp_celsius =
+			ecram_read(ecram, model->registers->EXT_CPU_TEMP + i);
+		point->cpu_min_temp_celsius = ecram_read(
+			ecram, model->registers->EXT_CPU_TEMP_HYST + i);
+		point->gpu_max_temp_celsius =
+			ecram_read(ecram, model->registers->EXT_GPU_TEMP + i);
+		point->gpu_min_temp_celsius = ecram_read(
+			ecram, model->registers->EXT_GPU_TEMP_HYST + i);
+		point->ic_max_temp_celsius = 0;
+		point->ic_min_temp_celsius = 0;
+	}
+
+	// Do not trust that hardware; It might suddendly report
+	// a larger size, so clamp it.
+	fancurve->size = FANCURVESIZE_IDEAPDAD;
+	fancurve->current_point_i =
+		ecram_read(ecram, model->registers->EXT_FAN_CUR_POINT);
+	fancurve->current_point_i =
+		min(fancurve->current_point_i, fancurve->size);
+	return 0;
+}
+
+static int ec_write_fancurve_ideapad(struct ecram *ecram,
+				     const struct model_config *model,
+				     const struct fancurve *fancurve)
+{
+	size_t i;
+	int valr1;
+	int valr2;
+
+	// add this later: maybe other addresses needed
+	// therefore, fan curve might not be effective immediatley but
+	// only after temp change
+	// Reset fan update counters (try to avoid any race conditions)
+	ecram_write(ecram, 0xC5FE, 0);
+	ecram_write(ecram, 0xC5FF, 0);
+	for (i = 0; i < FANCURVESIZE_IDEAPDAD; ++i) {
+		const struct fancurve_point *point = &fancurve->points[i];
+
+		ecram_write(ecram, model->registers->EXT_FAN1_BASE + i,
+			    point->rpm1_raw);
+		valr1 = ecram_read(ecram, model->registers->EXT_FAN1_BASE + i);
+		ecram_write(ecram, model->registers->EXT_FAN2_BASE + i,
+			    point->rpm2_raw);
+		valr2 = ecram_read(ecram, model->registers->EXT_FAN2_BASE + i);
+		pr_info("Writing fan1: %d; reading fan1: %d\n", point->rpm1_raw,
+			valr1);
+		pr_info("Writing fan2: %d; reading fan2: %d\n", point->rpm2_raw,
+			valr2);
+
+		// write to memory and repeat 8 bytes later again
+		ecram_write(ecram, model->registers->EXT_CPU_TEMP + i,
+			    point->cpu_max_temp_celsius);
+		ecram_write(ecram, model->registers->EXT_CPU_TEMP + 8 + i,
+			    point->cpu_max_temp_celsius);
+		// write to memory and repeat 8 bytes later again
+		ecram_write(ecram, model->registers->EXT_CPU_TEMP_HYST + i,
+			    point->cpu_min_temp_celsius);
+		ecram_write(ecram, model->registers->EXT_CPU_TEMP_HYST + 8 + i,
+			    point->cpu_min_temp_celsius);
+		// write to memory and repeat 8 bytes later again
+		ecram_write(ecram, model->registers->EXT_GPU_TEMP + i,
+			    point->gpu_max_temp_celsius);
+		ecram_write(ecram, model->registers->EXT_GPU_TEMP + 8 + i,
+			    point->gpu_max_temp_celsius);
+		// write to memory and repeat 8 bytes later again
+		ecram_write(ecram, model->registers->EXT_GPU_TEMP_HYST + i,
+			    point->gpu_min_temp_celsius);
+		ecram_write(ecram, model->registers->EXT_GPU_TEMP_HYST + 8 + i,
+			    point->gpu_min_temp_celsius);
+	}
+
+	// add this later: maybe other addresses needed
+	// therefore, fan curve might not be effective immediatley but
+	// only after temp change
+	// // Reset current fan level to 0, so algorithm in EC
+	// // selects fan curve point again and resetting hysterisis
+	// // effects
+	// ecram_write(ecram, model->registers->EXT_FAN_CUR_POINT, 0);
+
+	// // Reset internal fan levels
+	// ecram_write(ecram, 0xC634, 0); // CPU
+	// ecram_write(ecram, 0xC635, 0); // GPU
+	// ecram_write(ecram, 0xC636, 0); // SENSOR
+
+	return 0;
+}
+
+static int read_fancurve(struct legion_private *priv, struct fancurve *fancurve)
+{
+	// TODO: use enums or function pointers?
+	switch (priv->conf->access_method_fancurve) {
+	case ACCESS_METHOD_EC:
+		return ec_read_fancurve_legion(&priv->ecram, priv->conf,
+					       fancurve);
+	case ACCESS_METHOD_EC2:
+		return ec_read_fancurve_ideapad(&priv->ecram, priv->conf,
+						fancurve);
+	case ACCESS_METHOD_WMI3:
+		return wmi_read_fancurve_custom(priv->conf, fancurve);
+	default:
+		pr_info("No access method for fancurve:%d\n",
+			priv->conf->access_method_fancurve);
+		return -EINVAL;
+	}
+}
+
+static int write_fancurve(struct legion_private *priv,
+			  const struct fancurve *fancurve, bool write_size)
+{
+	// TODO: use enums or function pointers?
+	switch (priv->conf->access_method_fancurve) {
+	case ACCESS_METHOD_EC:
+		return ec_write_fancurve_legion(&priv->ecram, priv->conf,
+						fancurve, write_size);
+	case ACCESS_METHOD_EC2:
+		return ec_write_fancurve_ideapad(&priv->ecram, priv->conf,
+						 fancurve);
+	case ACCESS_METHOD_WMI3:
+		return wmi_write_fancurve_custom(priv->conf, fancurve);
+	default:
+		pr_info("No access method for fancurve:%d\n",
+			priv->conf->access_method_fancurve);
+		return -EINVAL;
+	}
+}
+
+#define MINIFANCUVE_ON_COOL_ON 0x04
+#define MINIFANCUVE_ON_COOL_OFF 0xA0
+
+static int ec_read_minifancurve(struct ecram *ecram,
+				const struct model_config *model, bool *state)
+{
+	int value =
+		ecram_read(ecram, model->registers->EXT_MINIFANCURVE_ON_COOL);
+
+	switch (value) {
+	case MINIFANCUVE_ON_COOL_ON:
+		*state = true;
+		break;
+	case MINIFANCUVE_ON_COOL_OFF:
+		*state = false;
+		break;
+	default:
+		pr_info("Unexpected value in MINIFANCURVE register:%d\n",
+			value);
+		return -1;
+	}
+	return 0;
+}
+
+static ssize_t ec_write_minifancurve(struct ecram *ecram,
+				     const struct model_config *model,
+				     bool state)
+{
+	u8 val = state ? MINIFANCUVE_ON_COOL_ON : MINIFANCUVE_ON_COOL_OFF;
+
+	ecram_write(ecram, model->registers->EXT_MINIFANCURVE_ON_COOL, val);
+	return 0;
+}
+
+#define EC_LOCKFANCONTROLLER_ON 8
+#define EC_LOCKFANCONTROLLER_OFF 0
+
+static ssize_t ec_write_lockfancontroller(struct ecram *ecram,
+					  const struct model_config *model,
+					  bool state)
+{
+	u8 val = state ? EC_LOCKFANCONTROLLER_ON : EC_LOCKFANCONTROLLER_OFF;
+
+	ecram_write(ecram, model->registers->EXT_LOCKFANCONTROLLER, val);
+	return 0;
+}
+
+static int ec_read_lockfancontroller(struct ecram *ecram,
+				     const struct model_config *model,
+				     bool *state)
+{
+	int value = ecram_read(ecram, model->registers->EXT_LOCKFANCONTROLLER);
+
+	switch (value) {
+	case EC_LOCKFANCONTROLLER_ON:
+		*state = true;
+		break;
+	case EC_LOCKFANCONTROLLER_OFF:
+		*state = false;
+		break;
+	default:
+		pr_info("Unexpected value in lockfanspeed register:%d\n",
+			value);
+		return -1;
+	}
+	return 0;
+}
+
+#define EC_FANFULLSPEED_ON 0x40
+#define EC_FANFULLSPEED_OFF 0x00
+
+static int ec_read_fanfullspeed(struct ecram *ecram,
+				const struct model_config *model, bool *state)
+{
+	int value = ecram_read(ecram, model->registers->EXT_MAXIMUMFANSPEED);
+
+	switch (value) {
+	case EC_FANFULLSPEED_ON:
+		*state = true;
+		break;
+	case EC_FANFULLSPEED_OFF:
+		*state = false;
+		break;
+	default:
+		pr_info("Unexpected value in maximumfanspeed register:%d\n",
+			value);
+		return -1;
+	}
+	return 0;
+}
+
+static ssize_t ec_write_fanfullspeed(struct ecram *ecram,
+				     const struct model_config *model,
+				     bool state)
+{
+	u8 val = state ? EC_FANFULLSPEED_ON : EC_FANFULLSPEED_OFF;
+
+	ecram_write(ecram, model->registers->EXT_MAXIMUMFANSPEED, val);
+	return 0;
+}
+
+static ssize_t wmi_read_fanfullspeed(struct legion_private *priv, bool *state)
+{
+	return get_simple_wmi_attribute_bool(priv, WMI_GUID_LENOVO_FAN_METHOD,
+					     0, WMI_METHOD_ID_FAN_GET_FULLSPEED,
+					     false, 1, state);
+}
+
+static ssize_t wmi_write_fanfullspeed(struct legion_private *priv, bool state)
+{
+	return set_simple_wmi_attribute(priv, WMI_GUID_LENOVO_FAN_METHOD, 0,
+					WMI_METHOD_ID_FAN_SET_FULLSPEED, false,
+					1, state);
+}
+
+static ssize_t read_fanfullspeed(struct legion_private *priv, bool *state)
+{
+	// TODO: use enums or function pointers?
+	switch (priv->conf->access_method_fanfullspeed) {
+	case ACCESS_METHOD_EC:
+		return ec_read_fanfullspeed(&priv->ecram, priv->conf, state);
+	case ACCESS_METHOD_WMI:
+		return wmi_read_fanfullspeed(priv, state);
+	default:
+		pr_info("No access method for fan full speed: %d\n",
+			priv->conf->access_method_fanfullspeed);
+		return -EINVAL;
+	}
+}
+
+static ssize_t write_fanfullspeed(struct legion_private *priv, bool state)
+{
+	ssize_t res;
+
+	switch (priv->conf->access_method_fanfullspeed) {
+	case ACCESS_METHOD_EC:
+		res = ec_write_fanfullspeed(&priv->ecram, priv->conf, state);
+		return res;
+	case ACCESS_METHOD_WMI:
+		return wmi_write_fanfullspeed(priv, state);
+	default:
+		pr_info("No access method for fan full speed:%d\n",
+			priv->conf->access_method_fanfullspeed);
+		return -EINVAL;
+	}
+}
+
+/* ============================= */
+/* Power mode reading/writing    */
+/* ============================= */
+
+enum legion_ec_powermode {
+	LEGION_EC_POWERMODE_QUIET = 2,
+	LEGION_EC_POWERMODE_BALANCED = 0,
+	LEGION_EC_POWERMODE_PERFORMANCE = 1,
+	LEGION_EC_POWERMODE_CUSTOM = 3
+};
+
+enum legion_wmi_powermode {
+	LEGION_WMI_POWERMODE_QUIET = 1,
+	LEGION_WMI_POWERMODE_BALANCED = 2,
+	LEGION_WMI_POWERMODE_PERFORMANCE = 3,
+	LEGION_WMI_POWERMODE_CUSTOM = 255
+};
+
+enum legion_wmi_powermode ec_to_wmi_powermode(int ec_mode)
+{
+	switch (ec_mode) {
+	case LEGION_EC_POWERMODE_QUIET:
+		return LEGION_WMI_POWERMODE_QUIET;
+	case LEGION_EC_POWERMODE_BALANCED:
+		return LEGION_WMI_POWERMODE_BALANCED;
+	case LEGION_EC_POWERMODE_PERFORMANCE:
+		return LEGION_WMI_POWERMODE_PERFORMANCE;
+	case LEGION_EC_POWERMODE_CUSTOM:
+		return LEGION_WMI_POWERMODE_CUSTOM;
+	default:
+		return LEGION_WMI_POWERMODE_BALANCED;
+	}
+}
+
+enum legion_ec_powermode wmi_to_ec_powermode(enum legion_wmi_powermode wmi_mode)
+{
+	switch (wmi_mode) {
+	case LEGION_WMI_POWERMODE_QUIET:
+		return LEGION_EC_POWERMODE_QUIET;
+	case LEGION_WMI_POWERMODE_BALANCED:
+		return LEGION_EC_POWERMODE_BALANCED;
+	case LEGION_WMI_POWERMODE_PERFORMANCE:
+		return LEGION_EC_POWERMODE_PERFORMANCE;
+	case LEGION_WMI_POWERMODE_CUSTOM:
+		return LEGION_EC_POWERMODE_CUSTOM;
+	default:
+		return LEGION_EC_POWERMODE_BALANCED;
+	}
+}
+
+static ssize_t ec_read_powermode(struct legion_private *priv, int *powermode)
+{
+	*powermode =
+		ecram_read(&priv->ecram, priv->conf->registers->EXT_POWERMODE);
+	return 0;
+}
+
+static ssize_t ec_write_powermode(struct legion_private *priv, u8 value)
+{
+	if (!((value >= 0 && value <= 2) || value == 255)) {
+		pr_info("Unexpected power mode value ignored: %d\n", value);
+		return -ENOMEM;
+	}
+	ecram_write(&priv->ecram, priv->conf->registers->EXT_POWERMODE, value);
+	return 0;
+}
+
+static ssize_t acpi_read_powermode(struct legion_private *priv, int *powermode)
+{
+	unsigned long acpi_powermode;
+	int err;
+
+	// spmo method not alwasy available
+	// \_SB.PCI0.LPC0.EC0.SPMO
+	err = eval_spmo(priv->adev->handle, &acpi_powermode);
+	*powermode = (int)acpi_powermode;
+	return err;
+}
+
+static ssize_t wmi_read_powermode(int *powermode)
+{
+	int err;
+	unsigned long res;
+
+	err = wmi_exec_noarg_int(LEGION_WMI_GAMEZONE_GUID, 0,
+				 WMI_METHOD_ID_GETSMARTFANMODE, &res);
+
+	if (!err)
+		*powermode = res;
+	return err;
+}
+
+static ssize_t wmi_write_powermode(u8 value)
+{
+	if (!((value >= LEGION_WMI_POWERMODE_QUIET &&
+	       value <= LEGION_WMI_POWERMODE_PERFORMANCE) ||
+	      value == LEGION_WMI_POWERMODE_CUSTOM)) {
+		pr_info("Unexpected power mode value ignored: %d\n", value);
+		return -ENOMEM;
+	}
+	return wmi_exec_arg(LEGION_WMI_GAMEZONE_GUID, 0,
+			    WMI_METHOD_ID_SETSMARTFANMODE, &value,
+			    sizeof(value));
+}
+
+static ssize_t read_powermode(struct legion_private *priv, int *powermode)
+{
+	ssize_t res;
+
+	switch (priv->conf->access_method_powermode) {
+	case ACCESS_METHOD_EC:
+		res = ec_read_powermode(priv, powermode);
+		*powermode = ec_to_wmi_powermode(*powermode);
+		return res;
+	case ACCESS_METHOD_ACPI:
+		return acpi_read_powermode(priv, powermode);
+	case ACCESS_METHOD_WMI:
+		return wmi_read_powermode(powermode);
+	default:
+		pr_info("No access method for powermode:%d\n",
+			priv->conf->access_method_powermode);
+		return -EINVAL;
+	}
+}
+
+static ssize_t write_powermode(struct legion_private *priv,
+			       enum legion_wmi_powermode value)
+{
+	ssize_t res;
+
+	//TODO: remove again
+	pr_info("Set powermode\n");
+
+	switch (priv->conf->access_method_powermode) {
+	case ACCESS_METHOD_EC:
+		res = ec_write_powermode(priv, wmi_to_ec_powermode(value));
+		return res;
+	case ACCESS_METHOD_WMI:
+		return wmi_write_powermode(value);
+	default:
+		pr_info("No access method for powermode:%d\n",
+			priv->conf->access_method_powermode);
+		return -EINVAL;
+	}
+}
+
+/**
+ * Shortly toggle powermode to a different mode
+ * and switch back, e.g. to reset fan curve.
+ */
+static void toggle_powermode(struct legion_private *priv)
+{
+	int old_powermode;
+	int next_powermode;
+
+	read_powermode(priv, &old_powermode);
+	next_powermode = old_powermode == 0 ? 1 : 0;
+
+	write_powermode(priv, next_powermode);
+	mdelay(1500);
+	write_powermode(priv, old_powermode);
+}
+
+/* ============================= */
+/* Charging mode reading/writing */
+/* ============================- */
+
+#define FCT_RAPID_CHARGE_ON 0x07
+#define FCT_RAPID_CHARGE_OFF 0x08
+#define RAPID_CHARGE_ON 0x0
+#define RAPID_CHARGE_OFF 0x1
+
+static int acpi_read_rapidcharge(struct acpi_device *adev, bool *state)
+{
+	unsigned long result;
+	int err;
+
+	//also works? what is better?
+	/*
+	 * err = eval_qcho(adev->handle, &result);
+	 * if (err)
+	 *  return err;
+	 * state = result;
+	 * return 0;
+	 */
+
+	err = eval_gbmd(adev->handle, &result);
+	if (err)
+		return err;
+
+	*state = result & 0x04;
+	return 0;
+}
+
+static int acpi_write_rapidcharge(struct acpi_device *adev, bool state)
+{
+	int err;
+	unsigned long fct_nr = state > 0 ? FCT_RAPID_CHARGE_ON :
+					   FCT_RAPID_CHARGE_OFF;
+
+	err = exec_sbmc(adev->handle, fct_nr);
+	pr_info("Set rapidcharge to %d by calling %lu: result: %d\n", state,
+		fct_nr, err);
+	return err;
+}
+
+/* ============================= */
+/* Keyboard backlight read/write */
+/* ============================= */
+
+static ssize_t legion_kbd_bl2_brightness_get(struct legion_private *priv)
+{
+	unsigned long state = 0;
+	int err;
+
+	err = wmi_exec_noarg_int(LEGION_WMI_GAMEZONE_GUID, 0,
+				 WMI_METHOD_ID_GETKEYBOARDLIGHT, &state);
+	if (err)
+		return -EINVAL;
+
+	return state;
+}
+
+//static int legion_kbd_bl2_brightness_set(struct legion_private *priv,
+//					 unsigned int brightness)
+//{
+//	u8 in_param = brightness;
+
+//	return wmi_exec_arg(LEGION_WMI_GAMEZONE_GUID, 0,
+//			    WMI_METHOD_ID_SETKEYBOARDLIGHT, &in_param,
+//			    sizeof(in_param));
+//}
+
+//min: 1, max: 3
+#define LIGHT_ID_KEYBOARD 0x00
+//min: 0, max: 1
+#define LIGHT_ID_YLOGO 0x03
+//min: 1, max: 2
+#define LIGHT_ID_IOPORT 0x05
+
+static int legion_wmi_light_get(struct legion_private *priv, u8 light_id,
+				unsigned int min_value, unsigned int max_value)
+{
+	struct acpi_buffer params;
+	u8 in;
+	u8 result[2];
+	u8 value;
+	int err;
+
+	params.length = 1;
+	params.pointer = &in;
+	in = light_id;
+	err = wmi_exec_ints(LEGION_WMI_KBBACKLIGHT_GUID, 0,
+			    WMI_METHOD_ID_KBBACKLIGHTGET, &params, result,
+			    ARRAY_SIZE(result));
+	if (err) {
+		pr_info("Error for WMI method call to get brightness\n");
+		return -EIO;
+	}
+
+	value = result[1];
+	if (!(value >= min_value && value <= max_value)) {
+		pr_info("Error WMI call for reading brightness: expected a value between %u and %u, but got %d\n",
+			min_value, max_value, value);
+		return -EFAULT;
+	}
+
+	return value - min_value;
+}
+
+static int legion_wmi_light_set(struct legion_private *priv, u8 light_id,
+				unsigned int min_value, unsigned int max_value,
+				unsigned int brightness)
+{
+	struct acpi_buffer buffer;
+	u8 in_buffer_param[8];
+	unsigned long result;
+	int err;
+
+	buffer.length = 3;
+	buffer.pointer = &in_buffer_param[0];
+	in_buffer_param[0] = light_id;
+	in_buffer_param[1] = 0x01;
+	in_buffer_param[2] =
+		clamp(brightness + min_value, min_value, max_value);
+
+	err = wmi_exec_int(LEGION_WMI_KBBACKLIGHT_GUID, 0,
+			   WMI_METHOD_ID_KBBACKLIGHTSET, &buffer, &result);
+	if (err) {
+		pr_info("Error for WMI method call to set brightness on light: %d\n",
+			light_id);
+		return -EIO;
+	}
+
+	return 0;
+}
+
+static int legion_kbd_bl_brightness_get(struct legion_private *priv)
+{
+	return legion_wmi_light_get(priv, LIGHT_ID_KEYBOARD, 1, 3);
+}
+
+static int legion_kbd_bl_brightness_set(struct legion_private *priv,
+					unsigned int brightness)
+{
+	return legion_wmi_light_set(priv, LIGHT_ID_KEYBOARD, 1, 3, brightness);
+}
+
+/* =============================  */
+/* debugfs interface              */
+/* ============================   */
+
+static int debugfs_ecmemory_show(struct seq_file *s, void *unused)
+{
+	struct legion_private *priv = s->private;
+	size_t offset;
+
+	for (offset = 0; offset < priv->conf->memoryio_size; ++offset) {
+		char value = ecram_read(&priv->ecram,
+					priv->conf->memoryio_physical_ec_start +
+						offset);
+
+		seq_write(s, &value, 1);
+	}
+	return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(debugfs_ecmemory);
+
+static int debugfs_ecmemoryram_show(struct seq_file *s, void *unused)
+{
+	struct legion_private *priv = s->private;
+	size_t offset;
+	ssize_t err;
+	u8 value;
+
+	for (offset = 0; offset < priv->conf->ramio_size; ++offset) {
+		err = ecram_memoryio_read(&priv->ec_memoryio, offset, &value);
+		if (!err)
+			seq_write(s, &value, 1);
+		else
+			return -EACCES;
+	}
+	return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(debugfs_ecmemoryram);
+
+//TODO: make (almost) all methods static
+
+static void seq_file_print_with_error(struct seq_file *s, const char *name,
+				      ssize_t err, int value)
+{
+	seq_printf(s, "%s error: %ld\n", name, err);
+	seq_printf(s, "%s: %d\n", name, value);
+}
+
+static int debugfs_fancurve_show(struct seq_file *s, void *unused)
+{
+	struct legion_private *priv = s->private;
+	bool is_minifancurve;
+	bool is_lockfancontroller;
+	bool is_maximumfanspeed;
+	bool is_rapidcharge = false;
+	int powermode;
+	int temperature;
+	int fanspeed;
+	int err;
+	unsigned long cfg;
+	struct fancurve wmi_fancurve;
+	//int kb_backlight;
+
+	mutex_lock(&priv->fancurve_mutex);
+
+	seq_printf(s, "EC Chip ID: %x\n", read_ec_id(&priv->ecram, priv->conf));
+	seq_printf(s, "EC Chip Version: %x\n",
+		   read_ec_version(&priv->ecram, priv->conf));
+	seq_printf(s, "legion_laptop features: %s\n", LEGIONFEATURES);
+	seq_printf(s, "legion_laptop ec_readonly: %d\n", ec_readonly);
+
+	err = eval_int(priv->adev->handle, "VPC0._CFG", &cfg);
+	seq_printf(s, "ACPI CFG error: %d\n", err);
+	seq_printf(s, "ACPI CFG: %lu\n", cfg);
+
+	seq_printf(s, "temperature access method: %d\n",
+		   priv->conf->access_method_temperature);
+	err = read_temperature(priv, 0, &temperature);
+	seq_file_print_with_error(s, "CPU temperature", err, temperature);
+	err = ec_read_temperature(&priv->ecram, priv->conf, 0, &temperature);
+	seq_file_print_with_error(s, "CPU temperature EC", err, temperature);
+	err = acpi_read_temperature(priv, 0, &temperature);
+	seq_file_print_with_error(s, "CPU temperature ACPI", err, temperature);
+	err = wmi_read_temperature_gz(0, &temperature);
+	seq_file_print_with_error(s, "CPU temperature WMI", err, temperature);
+	err = wmi_read_temperature(0, &temperature);
+	seq_file_print_with_error(s, "CPU temperature WMI2", err, temperature);
+	err = wmi_read_temperature_other(0, &temperature);
+	seq_file_print_with_error(s, "CPU temperature WMI3", err, temperature);
+
+	err = read_temperature(priv, 1, &temperature);
+	seq_file_print_with_error(s, "GPU temperature", err, temperature);
+	err = ec_read_temperature(&priv->ecram, priv->conf, 1, &temperature);
+	seq_file_print_with_error(s, "GPU temperature EC", err, temperature);
+	err = acpi_read_temperature(priv, 1, &temperature);
+	seq_file_print_with_error(s, "GPU temperature ACPI", err, temperature);
+	err = wmi_read_temperature_gz(1, &temperature);
+	seq_file_print_with_error(s, "GPU temperature WMI", err, temperature);
+	err = wmi_read_temperature(1, &temperature);
+	seq_file_print_with_error(s, "GPU temperature WMI2", err, temperature);
+	err = wmi_read_temperature_other(1, &temperature);
+	seq_file_print_with_error(s, "GPU temperature WMI3", err, temperature);
+
+	seq_printf(s, "fan speed access method: %d\n",
+		   priv->conf->access_method_fanspeed);
+	err = read_fanspeed(priv, 0, &fanspeed);
+	seq_file_print_with_error(s, "1 fanspeed", err, fanspeed);
+	err = ec_read_fanspeed(&priv->ecram, priv->conf, 0, &fanspeed);
+	seq_file_print_with_error(s, "1 fanspeed EC", err, fanspeed);
+	err = acpi_read_fanspeed(priv, 0, &fanspeed);
+	seq_file_print_with_error(s, "1 fanspeed ACPI", err, fanspeed);
+	err = wmi_read_fanspeed_gz(0, &fanspeed);
+	seq_file_print_with_error(s, "1 fanspeed WMI", err, fanspeed);
+	err = wmi_read_fanspeed(0, &fanspeed);
+	seq_file_print_with_error(s, "1 fanspeed WMI2", err, fanspeed);
+	err = wmi_read_fanspeed_other(0, &fanspeed);
+	seq_file_print_with_error(s, "1 fanspeed WMI3", err, fanspeed);
+
+	err = read_fanspeed(priv, 1, &fanspeed);
+	seq_file_print_with_error(s, "2 fanspeed", err, fanspeed);
+	err = ec_read_fanspeed(&priv->ecram, priv->conf, 1, &fanspeed);
+	seq_file_print_with_error(s, "2 fanspeed EC", err, fanspeed);
+	err = acpi_read_fanspeed(priv, 1, &fanspeed);
+	seq_file_print_with_error(s, "2 fanspeed ACPI", err, fanspeed);
+	err = wmi_read_fanspeed_gz(1, &fanspeed);
+	seq_file_print_with_error(s, "2 fanspeed WMI", err, fanspeed);
+	err = wmi_read_fanspeed(1, &fanspeed);
+	seq_file_print_with_error(s, "2 fanspeed WMI2", err, fanspeed);
+	err = wmi_read_fanspeed_other(1, &fanspeed);
+	seq_file_print_with_error(s, "2 fanspeed WMI3", err, fanspeed);
+
+	seq_printf(s, "powermode access method: %d\n",
+		   priv->conf->access_method_powermode);
+	err = read_powermode(priv, &powermode);
+	seq_file_print_with_error(s, "powermode", err, powermode);
+	err = ec_read_powermode(priv, &powermode);
+	seq_file_print_with_error(s, "powermode EC", err, powermode);
+	err = acpi_read_powermode(priv, &powermode);
+	seq_file_print_with_error(s, "powermode ACPI", err, powermode);
+	err = wmi_read_powermode(&powermode);
+	seq_file_print_with_error(s, "powermode WMI", err, powermode);
+	seq_printf(s, "has custom powermode: %d\n",
+		   priv->conf->has_custom_powermode);
+
+	err = acpi_read_rapidcharge(priv->adev, &is_rapidcharge);
+	seq_printf(s, "ACPI rapidcharge error: %d\n", err);
+	seq_printf(s, "ACPI rapidcharge: %d\n", is_rapidcharge);
+
+	seq_printf(s, "WMI backlight 2 state: %ld\n",
+		   legion_kbd_bl2_brightness_get(priv));
+	seq_printf(s, "WMI backlight 3 state: %d\n",
+		   legion_kbd_bl_brightness_get(priv));
+
+	seq_printf(s, "WMI light IO port: %d\n",
+		   legion_wmi_light_get(priv, LIGHT_ID_IOPORT, 0, 4));
+
+	seq_printf(s, "WMI light y logo/lid: %d\n",
+		   legion_wmi_light_get(priv, LIGHT_ID_YLOGO, 0, 4));
+
+	seq_printf(s, "EC minifancurve feature enabled: %d\n",
+		   priv->conf->has_minifancurve);
+	err = ec_read_minifancurve(&priv->ecram, priv->conf, &is_minifancurve);
+	seq_printf(s, "EC minifancurve on cool: %s\n",
+		   err ? "error" : (is_minifancurve ? "true" : "false"));
+
+	err = ec_read_lockfancontroller(&priv->ecram, priv->conf,
+					&is_lockfancontroller);
+	seq_printf(s, "EC lockfancontroller error: %d\n", err);
+	seq_printf(s, "EC lockfancontroller: %s\n",
+		   err ? "error" : (is_lockfancontroller ? "true" : "false"));
+
+	err = read_fanfullspeed(priv, &is_maximumfanspeed);
+	seq_file_print_with_error(s, "fanfullspeed", err, is_maximumfanspeed);
+
+	err = ec_read_fanfullspeed(&priv->ecram, priv->conf,
+				   &is_maximumfanspeed);
+	seq_file_print_with_error(s, "fanfullspeed EC", err,
+				  is_maximumfanspeed);
+
+	read_fancurve(priv, &priv->fancurve);
+	seq_printf(s, "EC fan curve current point id: %ld\n",
+		   priv->fancurve.current_point_i);
+	seq_printf(s, "EC fan curve points size: %ld\n", priv->fancurve.size);
+
+	seq_puts(s, "Current fan curve in hardware:\n");
+	fancurve_print_seqfile(&priv->fancurve, s);
+	seq_puts(s, "=====================\n");
+	mutex_unlock(&priv->fancurve_mutex);
+
+	seq_puts(s, "Current fan curve in hardware (WMI; might be empty)\n");
+	wmi_fancurve.size = 0;
+	err = wmi_read_fancurve_custom(priv->conf, &wmi_fancurve);
+	fancurve_print_seqfile(&wmi_fancurve, s);
+	seq_puts(s, "=====================\n");
+	return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(debugfs_fancurve);
+
+static void legion_debugfs_init(struct legion_private *priv)
+{
+	struct dentry *dir;
+
+	// TODO: remove this note
+	// Note: as other kernel modules, do not catch errors here
+	// because if kernel is build without debugfs this
+	// will return an error but module still has to
+	// work, just without debugfs
+	// TODO: what permissions; some modules do 400
+	// other do 444
+	dir = debugfs_create_dir(LEGION_DRVR_SHORTNAME, NULL);
+	debugfs_create_file("fancurve", 0444, dir, priv,
+			    &debugfs_fancurve_fops);
+	debugfs_create_file("ecmemory", 0444, dir, priv,
+			    &debugfs_ecmemory_fops);
+	debugfs_create_file("ecmemoryram", 0444, dir, priv,
+			    &debugfs_ecmemoryram_fops);
+
+	priv->debugfs_dir = dir;
+}
+
+static void legion_debugfs_exit(struct legion_private *priv)
+{
+	pr_info("Unloading legion dubugfs\n");
+	// The following is does nothing if pointer is NULL
+	debugfs_remove_recursive(priv->debugfs_dir);
+	priv->debugfs_dir = NULL;
+	pr_info("Unloading legion dubugfs done\n");
+}
+
+/* =============================  */
+/* sysfs interface                */
+/* ============================   */
+
+static int show_simple_wmi_attribute(struct device *dev,
+				     struct device_attribute *attr, char *buf,
+				     const char *guid, u8 instance,
+				     u32 method_id, bool invert,
+				     unsigned long scale)
+{
+	unsigned long state = 0;
+	int err;
+	struct legion_private *priv = dev_get_drvdata(dev);
+
+	mutex_lock(&priv->fancurve_mutex);
+	err = get_simple_wmi_attribute(priv, guid, instance, method_id, invert,
+				       scale, &state);
+	mutex_unlock(&priv->fancurve_mutex);
+
+	if (err)
+		return -EINVAL;
+
+	return sysfs_emit(buf, "%lu\n", state);
+}
+
+static int show_simple_wmi_attribute_from_buffer(struct device *dev,
+						 struct device_attribute *attr,
+						 char *buf, const char *guid,
+						 u8 instance, u32 method_id,
+						 size_t ressize, size_t i,
+						 int scale)
+{
+	u8 res[16];
+	int err;
+	int out;
+	struct legion_private *priv = dev_get_drvdata(dev);
+
+	if (ressize > ARRAY_SIZE(res)) {
+		pr_info("Buffer to small for WMI result\n");
+		return -EINVAL;
+	}
+	if (i >= ressize) {
+		pr_info("Index not within buffer size\n");
+		return -EINVAL;
+	}
+
+	mutex_lock(&priv->fancurve_mutex);
+	err = wmi_exec_noarg_ints(guid, instance, method_id, res, ressize);
+	mutex_unlock(&priv->fancurve_mutex);
+	if (err)
+		return -EINVAL;
+
+	out = scale * res[i];
+	return sysfs_emit(buf, "%d\n", out);
+}
+
+static int store_simple_wmi_attribute(struct device *dev,
+				      struct device_attribute *attr,
+				      const char *buf, size_t count,
+				      const char *guid, u8 instance,
+				      u32 method_id, bool invert, int scale)
+{
+	int state;
+	int err;
+	struct legion_private *priv = dev_get_drvdata(dev);
+
+	err = kstrtouint(buf, 0, &state);
+	if (err)
+		return err;
+	err = set_simple_wmi_attribute(priv, guid, instance, method_id, invert,
+				       scale, state);
+	if (err)
+		return err;
+	return count;
+}
+
+static ssize_t lockfancontroller_show(struct device *dev,
+				      struct device_attribute *attr, char *buf)
+{
+	struct legion_private *priv = dev_get_drvdata(dev);
+	bool is_lockfancontroller;
+	int err;
+
+	mutex_lock(&priv->fancurve_mutex);
+	err = ec_read_lockfancontroller(&priv->ecram, priv->conf,
+					&is_lockfancontroller);
+	mutex_unlock(&priv->fancurve_mutex);
+	if (err)
+		return -EINVAL;
+
+	return sysfs_emit(buf, "%d\n", is_lockfancontroller);
+}
+
+static ssize_t lockfancontroller_store(struct device *dev,
+				       struct device_attribute *attr,
+				       const char *buf, size_t count)
+{
+	struct legion_private *priv = dev_get_drvdata(dev);
+	bool is_lockfancontroller;
+	int err;
+
+	err = kstrtobool(buf, &is_lockfancontroller);
+	if (err)
+		return err;
+
+	mutex_lock(&priv->fancurve_mutex);
+	err = ec_write_lockfancontroller(&priv->ecram, priv->conf,
+					 is_lockfancontroller);
+	mutex_unlock(&priv->fancurve_mutex);
+	if (err)
+		return -EINVAL;
+
+	return count;
+}
+
+static DEVICE_ATTR_RW(lockfancontroller);
+
+static ssize_t rapidcharge_show(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	bool state = false;
+	int err;
+	struct legion_private *priv = dev_get_drvdata(dev);
+
+	mutex_lock(&priv->fancurve_mutex);
+	err = acpi_read_rapidcharge(priv->adev, &state);
+	mutex_unlock(&priv->fancurve_mutex);
+	if (err)
+		return -EINVAL;
+
+	return sysfs_emit(buf, "%d\n", state);
+}
+
+static ssize_t rapidcharge_store(struct device *dev,
+				 struct device_attribute *attr, const char *buf,
+				 size_t count)
+{
+	struct legion_private *priv = dev_get_drvdata(dev);
+	int state;
+	int err;
+
+	err = kstrtouint(buf, 0, &state);
+	if (err)
+		return err;
+
+	mutex_lock(&priv->fancurve_mutex);
+	err = acpi_write_rapidcharge(priv->adev, state);
+	mutex_unlock(&priv->fancurve_mutex);
+	if (err)
+		return -EINVAL;
+
+	return count;
+}
+
+static DEVICE_ATTR_RW(rapidcharge);
+
+static ssize_t issupportgpuoc_show(struct device *dev,
+				   struct device_attribute *attr, char *buf)
+{
+	return show_simple_wmi_attribute(dev, attr, buf,
+					 LEGION_WMI_GAMEZONE_GUID, 0,
+					 WMI_METHOD_ID_ISSUPPORTGPUOC, false,
+					 1);
+}
+
+static DEVICE_ATTR_RO(issupportgpuoc);
+
+static ssize_t aslcodeversion_show(struct device *dev,
+				   struct device_attribute *attr, char *buf)
+{
+	return show_simple_wmi_attribute(dev, attr, buf,
+					 LEGION_WMI_GAMEZONE_GUID, 0,
+					 WMI_METHOD_ID_GETVERSION, false, 1);
+}
+
+static DEVICE_ATTR_RO(aslcodeversion);
+
+static ssize_t issupportcpuoc_show(struct device *dev,
+				   struct device_attribute *attr, char *buf)
+{
+	return show_simple_wmi_attribute(dev, attr, buf,
+					 LEGION_WMI_GAMEZONE_GUID, 0,
+					 WMI_METHOD_ID_ISSUPPORTCPUOC, false,
+					 1);
+}
+
+static DEVICE_ATTR_RO(issupportcpuoc);
+
+static ssize_t winkey_show(struct device *dev, struct device_attribute *attr,
+			   char *buf)
+{
+	return show_simple_wmi_attribute(dev, attr, buf,
+					 LEGION_WMI_GAMEZONE_GUID, 0,
+					 WMI_METHOD_ID_GETWINKEYSTATUS, true,
+					 1);
+}
+
+static ssize_t winkey_store(struct device *dev, struct device_attribute *attr,
+			    const char *buf, size_t count)
+{
+	return store_simple_wmi_attribute(dev, attr, buf, count,
+					  LEGION_WMI_GAMEZONE_GUID, 0,
+					  WMI_METHOD_ID_SETWINKEYSTATUS, true,
+					  1);
+}
+
+static DEVICE_ATTR_RW(winkey);
+
+// on newer models the touchpad feature in ideapad does not work anymore, so
+// we need this
+static ssize_t touchpad_show(struct device *dev, struct device_attribute *attr,
+			     char *buf)
+{
+	return show_simple_wmi_attribute(dev, attr, buf,
+					 LEGION_WMI_GAMEZONE_GUID, 0,
+					 WMI_METHOD_ID_GETTPSTATUS, true, 1);
+}
+
+static ssize_t touchpad_store(struct device *dev, struct device_attribute *attr,
+			      const char *buf, size_t count)
+{
+	return store_simple_wmi_attribute(dev, attr, buf, count,
+					  LEGION_WMI_GAMEZONE_GUID, 0,
+					  WMI_METHOD_ID_SETTPSTATUS, true, 1);
+}
+
+static DEVICE_ATTR_RW(touchpad);
+
+static ssize_t gsync_show(struct device *dev, struct device_attribute *attr,
+			  char *buf)
+{
+	return show_simple_wmi_attribute(dev, attr, buf,
+					 LEGION_WMI_GAMEZONE_GUID, 0,
+					 WMI_METHOD_ID_GETGSYNCSTATUS, true, 1);
+}
+
+static ssize_t gsync_store(struct device *dev, struct device_attribute *attr,
+			   const char *buf, size_t count)
+{
+	return store_simple_wmi_attribute(dev, attr, buf, count,
+					  LEGION_WMI_GAMEZONE_GUID, 0,
+					  WMI_METHOD_ID_SETGSYNCSTATUS, true,
+					  1);
+}
+
+static DEVICE_ATTR_RW(gsync);
+
+static ssize_t powerchargemode_show(struct device *dev,
+				    struct device_attribute *attr, char *buf)
+{
+	return show_simple_wmi_attribute(dev, attr, buf,
+					 LEGION_WMI_GAMEZONE_GUID, 0,
+					 WMI_METHOD_ID_GETPOWERCHARGEMODE,
+					 false, 1);
+}
+static DEVICE_ATTR_RO(powerchargemode);
+
+static ssize_t overdrive_show(struct device *dev, struct device_attribute *attr,
+			      char *buf)
+{
+	return show_simple_wmi_attribute(dev, attr, buf,
+					 LEGION_WMI_GAMEZONE_GUID, 0,
+					 WMI_METHOD_ID_GETODSTATUS, false, 1);
+}
+
+static ssize_t overdrive_store(struct device *dev,
+			       struct device_attribute *attr, const char *buf,
+			       size_t count)
+{
+	return store_simple_wmi_attribute(dev, attr, buf, count,
+					  LEGION_WMI_GAMEZONE_GUID, 0,
+					  WMI_METHOD_ID_SETODSTATUS, false, 1);
+}
+
+static DEVICE_ATTR_RW(overdrive);
+
+static ssize_t thermalmode_show(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	return show_simple_wmi_attribute(dev, attr, buf,
+					 LEGION_WMI_GAMEZONE_GUID, 0,
+					 WMI_METHOD_ID_GETTHERMALMODE, false,
+					 1);
+}
+static DEVICE_ATTR_RO(thermalmode);
+
+// TOOD: probably remove again because provided by other means; only useful for overclocking
+static ssize_t cpumaxfrequency_show(struct device *dev,
+				    struct device_attribute *attr, char *buf)
+{
+	return show_simple_wmi_attribute(dev, attr, buf,
+					 LEGION_WMI_GAMEZONE_GUID, 0,
+					 WMI_METHOD_ID_GETCPUMAXFREQUENCY,
+					 false, 1);
+}
+static DEVICE_ATTR_RO(cpumaxfrequency);
+
+static ssize_t isacfitforoc_show(struct device *dev,
+				 struct device_attribute *attr, char *buf)
+{
+	return show_simple_wmi_attribute(dev, attr, buf,
+					 LEGION_WMI_GAMEZONE_GUID, 0,
+					 WMI_METHOD_ID_ISACFITFOROC, false, 1);
+}
+static DEVICE_ATTR_RO(isacfitforoc);
+
+static ssize_t igpumode_show(struct device *dev, struct device_attribute *attr,
+			     char *buf)
+{
+	return show_simple_wmi_attribute(dev, attr, buf,
+					 LEGION_WMI_GAMEZONE_GUID, 0,
+					 WMI_METHOD_ID_GETIGPUMODESTATUS, false,
+					 1);
+}
+
+static ssize_t igpumode_store(struct device *dev, struct device_attribute *attr,
+			      const char *buf, size_t count)
+{
+	return store_simple_wmi_attribute(dev, attr, buf, count,
+					  LEGION_WMI_GAMEZONE_GUID, 0,
+					  WMI_METHOD_ID_SETIGPUMODESTATUS,
+					  false, 1);
+}
+
+static DEVICE_ATTR_RW(igpumode);
+
+static ssize_t cpu_oc_show(struct device *dev, struct device_attribute *attr,
+			   char *buf)
+{
+	return show_simple_wmi_attribute_from_buffer(
+		dev, attr, buf, WMI_GUID_LENOVO_CPU_METHOD, 0,
+		WMI_METHOD_ID_CPU_GET_OC_STATUS, 16, 0, 1);
+}
+
+static ssize_t cpu_oc_store(struct device *dev, struct device_attribute *attr,
+			    const char *buf, size_t count)
+{
+	return store_simple_wmi_attribute(dev, attr, buf, count,
+					  WMI_GUID_LENOVO_CPU_METHOD, 0,
+					  WMI_METHOD_ID_CPU_SET_OC_STATUS,
+					  false, 1);
+}
+
+static DEVICE_ATTR_RW(cpu_oc);
+
+static ssize_t cpu_shortterm_powerlimit_show(struct device *dev,
+					     struct device_attribute *attr,
+					     char *buf)
+{
+	return show_simple_wmi_attribute_from_buffer(
+		dev, attr, buf, WMI_GUID_LENOVO_CPU_METHOD, 0,
+		WMI_METHOD_ID_CPU_GET_SHORTTERM_POWERLIMIT, 16, 0, 1);
+}
+
+static ssize_t cpu_shortterm_powerlimit_store(struct device *dev,
+					      struct device_attribute *attr,
+					      const char *buf, size_t count)
+{
+	return store_simple_wmi_attribute(
+		dev, attr, buf, count, WMI_GUID_LENOVO_CPU_METHOD, 0,
+		WMI_METHOD_ID_CPU_SET_SHORTTERM_POWERLIMIT, false, 1);
+}
+
+static DEVICE_ATTR_RW(cpu_shortterm_powerlimit);
+
+static ssize_t cpu_longterm_powerlimit_show(struct device *dev,
+					    struct device_attribute *attr,
+					    char *buf)
+{
+	return show_simple_wmi_attribute_from_buffer(
+		dev, attr, buf, WMI_GUID_LENOVO_CPU_METHOD, 0,
+		WMI_METHOD_ID_CPU_GET_LONGTERM_POWERLIMIT, 16, 0, 1);
+}
+
+static ssize_t cpu_longterm_powerlimit_store(struct device *dev,
+					     struct device_attribute *attr,
+					     const char *buf, size_t count)
+{
+	return store_simple_wmi_attribute(
+		dev, attr, buf, count, WMI_GUID_LENOVO_CPU_METHOD, 0,
+		WMI_METHOD_ID_CPU_SET_LONGTERM_POWERLIMIT, false, 1);
+}
+
+static DEVICE_ATTR_RW(cpu_longterm_powerlimit);
+
+static ssize_t cpu_default_powerlimit_show(struct device *dev,
+					   struct device_attribute *attr,
+					   char *buf)
+{
+	return show_simple_wmi_attribute(
+		dev, attr, buf, WMI_GUID_LENOVO_CPU_METHOD, 0,
+		WMI_METHOD_ID_CPU_GET_DEFAULT_POWERLIMIT, false, 1);
+}
+
+static DEVICE_ATTR_RO(cpu_default_powerlimit);
+
+static ssize_t cpu_peak_powerlimit_show(struct device *dev,
+					struct device_attribute *attr,
+					char *buf)
+{
+	return show_simple_wmi_attribute(dev, attr, buf,
+					 WMI_GUID_LENOVO_GPU_METHOD, 0,
+					 WMI_METHOD_ID_CPU_GET_PEAK_POWERLIMIT,
+					 false, 1);
+}
+
+static ssize_t cpu_peak_powerlimit_store(struct device *dev,
+					 struct device_attribute *attr,
+					 const char *buf, size_t count)
+{
+	return store_simple_wmi_attribute(dev, attr, buf, count,
+					  WMI_GUID_LENOVO_GPU_METHOD, 0,
+					  WMI_METHOD_ID_CPU_SET_PEAK_POWERLIMIT,
+					  false, 1);
+}
+
+static DEVICE_ATTR_RW(cpu_peak_powerlimit);
+
+static ssize_t cpu_apu_sppt_powerlimit_show(struct device *dev,
+					    struct device_attribute *attr,
+					    char *buf)
+{
+	return show_simple_wmi_attribute(
+		dev, attr, buf, WMI_GUID_LENOVO_GPU_METHOD, 0,
+		WMI_METHOD_ID_CPU_GET_APU_SPPT_POWERLIMIT, false, 1);
+}
+
+static ssize_t cpu_apu_sppt_powerlimit_store(struct device *dev,
+					     struct device_attribute *attr,
+					     const char *buf, size_t count)
+{
+	return store_simple_wmi_attribute(
+		dev, attr, buf, count, WMI_GUID_LENOVO_GPU_METHOD, 0,
+		WMI_METHOD_ID_CPU_SET_APU_SPPT_POWERLIMIT, false, 1);
+}
+
+static DEVICE_ATTR_RW(cpu_apu_sppt_powerlimit);
+
+static ssize_t cpu_cross_loading_powerlimit_show(struct device *dev,
+						 struct device_attribute *attr,
+						 char *buf)
+{
+	return show_simple_wmi_attribute(
+		dev, attr, buf, WMI_GUID_LENOVO_GPU_METHOD, 0,
+		WMI_METHOD_ID_CPU_GET_CROSS_LOADING_POWERLIMIT, false, 1);
+}
+
+static ssize_t cpu_cross_loading_powerlimit_store(struct device *dev,
+						  struct device_attribute *attr,
+						  const char *buf, size_t count)
+{
+	return store_simple_wmi_attribute(
+		dev, attr, buf, count, WMI_GUID_LENOVO_GPU_METHOD, 0,
+		WMI_METHOD_ID_CPU_SET_CROSS_LOADING_POWERLIMIT, false, 1);
+}
+
+static DEVICE_ATTR_RW(cpu_cross_loading_powerlimit);
+
+static ssize_t gpu_oc_show(struct device *dev, struct device_attribute *attr,
+			   char *buf)
+{
+	return show_simple_wmi_attribute(dev, attr, buf,
+					 WMI_GUID_LENOVO_GPU_METHOD, 0,
+					 WMI_METHOD_ID_GPU_GET_OC_STATUS, false,
+					 1);
+}
+
+static ssize_t gpu_oc_store(struct device *dev, struct device_attribute *attr,
+			    const char *buf, size_t count)
+{
+	return store_simple_wmi_attribute(dev, attr, buf, count,
+					  WMI_GUID_LENOVO_GPU_METHOD, 0,
+					  WMI_METHOD_ID_GPU_SET_OC_STATUS,
+					  false, 1);
+}
+
+static DEVICE_ATTR_RW(gpu_oc);
+
+static ssize_t gpu_ppab_powerlimit_show(struct device *dev,
+					struct device_attribute *attr,
+					char *buf)
+{
+	return show_simple_wmi_attribute_from_buffer(
+		dev, attr, buf, WMI_GUID_LENOVO_GPU_METHOD, 0,
+		WMI_METHOD_ID_GPU_GET_PPAB_POWERLIMIT, 16, 0, 1);
+}
+
+static ssize_t gpu_ppab_powerlimit_store(struct device *dev,
+					 struct device_attribute *attr,
+					 const char *buf, size_t count)
+{
+	return store_simple_wmi_attribute(dev, attr, buf, count,
+					  WMI_GUID_LENOVO_GPU_METHOD, 0,
+					  WMI_METHOD_ID_GPU_SET_PPAB_POWERLIMIT,
+					  false, 1);
+}
+
+static DEVICE_ATTR_RW(gpu_ppab_powerlimit);
+
+static ssize_t gpu_ctgp_powerlimit_show(struct device *dev,
+					struct device_attribute *attr,
+					char *buf)
+{
+	return show_simple_wmi_attribute_from_buffer(
+		dev, attr, buf, WMI_GUID_LENOVO_GPU_METHOD, 0,
+		WMI_METHOD_ID_GPU_GET_CTGP_POWERLIMIT, 16, 0, 1);
+}
+
+static ssize_t gpu_ctgp_powerlimit_store(struct device *dev,
+					 struct device_attribute *attr,
+					 const char *buf, size_t count)
+{
+	return store_simple_wmi_attribute(dev, attr, buf, count,
+					  WMI_GUID_LENOVO_GPU_METHOD, 0,
+					  WMI_METHOD_ID_GPU_SET_CTGP_POWERLIMIT,
+					  false, 1);
+}
+
+static DEVICE_ATTR_RW(gpu_ctgp_powerlimit);
+
+static ssize_t gpu_ctgp2_powerlimit_show(struct device *dev,
+					 struct device_attribute *attr,
+					 char *buf)
+{
+	return show_simple_wmi_attribute_from_buffer(
+		dev, attr, buf, WMI_GUID_LENOVO_GPU_METHOD, 0,
+		WMI_METHOD_ID_GPU_GET_CTGP_POWERLIMIT, 16, 0x0C, 1);
+}
+
+static DEVICE_ATTR_RO(gpu_ctgp2_powerlimit);
+
+// TOOD: probably remove again because provided by other means; only useful for overclocking
+static ssize_t
+gpu_default_ppab_ctrgp_powerlimit_show(struct device *dev,
+				       struct device_attribute *attr, char *buf)
+{
+	return show_simple_wmi_attribute(
+		dev, attr, buf, WMI_GUID_LENOVO_GPU_METHOD, 0,
+		WMI_METHOD_ID_GPU_GET_DEFAULT_PPAB_CTGP_POWERLIMIT, false, 1);
+}
+static DEVICE_ATTR_RO(gpu_default_ppab_ctrgp_powerlimit);
+
+static ssize_t gpu_temperature_limit_show(struct device *dev,
+					  struct device_attribute *attr,
+					  char *buf)
+{
+	return show_simple_wmi_attribute(
+		dev, attr, buf, WMI_GUID_LENOVO_GPU_METHOD, 0,
+		WMI_METHOD_ID_GPU_GET_TEMPERATURE_LIMIT, false, 1);
+}
+
+static ssize_t gpu_temperature_limit_store(struct device *dev,
+					   struct device_attribute *attr,
+					   const char *buf, size_t count)
+{
+	return store_simple_wmi_attribute(
+		dev, attr, buf, count, WMI_GUID_LENOVO_GPU_METHOD, 0,
+		WMI_METHOD_ID_GPU_SET_TEMPERATURE_LIMIT, false, 1);
+}
+
+static DEVICE_ATTR_RW(gpu_temperature_limit);
+
+// TOOD: probably remove again because provided by other means; only useful for overclocking
+static ssize_t gpu_boost_clock_show(struct device *dev,
+				    struct device_attribute *attr, char *buf)
+{
+	return show_simple_wmi_attribute(dev, attr, buf,
+					 WMI_GUID_LENOVO_GPU_METHOD, 0,
+					 WMI_METHOD_ID_GPU_GET_BOOST_CLOCK,
+					 false, 1);
+}
+static DEVICE_ATTR_RO(gpu_boost_clock);
+
+static ssize_t fan_fullspeed_show(struct device *dev,
+				  struct device_attribute *attr, char *buf)
+{
+	bool state = false;
+	int err;
+	struct legion_private *priv = dev_get_drvdata(dev);
+
+	mutex_lock(&priv->fancurve_mutex);
+	err = read_fanfullspeed(priv, &state);
+	mutex_unlock(&priv->fancurve_mutex);
+	if (err)
+		return -EINVAL;
+
+	return sysfs_emit(buf, "%d\n", state);
+}
+
+static ssize_t fan_fullspeed_store(struct device *dev,
+				   struct device_attribute *attr,
+				   const char *buf, size_t count)
+{
+	struct legion_private *priv = dev_get_drvdata(dev);
+	int state;
+	int err;
+
+	err = kstrtouint(buf, 0, &state);
+	if (err)
+		return err;
+
+	mutex_lock(&priv->fancurve_mutex);
+	err = write_fanfullspeed(priv, state);
+	mutex_unlock(&priv->fancurve_mutex);
+	if (err)
+		return -EINVAL;
+
+	return count;
+}
+
+static DEVICE_ATTR_RW(fan_fullspeed);
+
+static ssize_t fan_maxspeed_show(struct device *dev,
+				 struct device_attribute *attr, char *buf)
+{
+	return show_simple_wmi_attribute(dev, attr, buf,
+					 WMI_GUID_LENOVO_FAN_METHOD, 0,
+					 WMI_METHOD_ID_FAN_GET_MAXSPEED, false,
+					 1);
+}
+
+static ssize_t fan_maxspeed_store(struct device *dev,
+				  struct device_attribute *attr,
+				  const char *buf, size_t count)
+{
+	return store_simple_wmi_attribute(dev, attr, buf, count,
+					  WMI_GUID_LENOVO_FAN_METHOD, 0,
+					  WMI_METHOD_ID_FAN_SET_MAXSPEED, false,
+					  1);
+}
+
+static DEVICE_ATTR_RW(fan_maxspeed);
+
+static ssize_t powermode_show(struct device *dev, struct device_attribute *attr,
+			      char *buf)
+{
+	struct legion_private *priv = dev_get_drvdata(dev);
+	int power_mode;
+
+	mutex_lock(&priv->fancurve_mutex);
+	read_powermode(priv, &power_mode);
+	mutex_unlock(&priv->fancurve_mutex);
+	return sysfs_emit(buf, "%d\n", power_mode);
+}
+
+static void legion_platform_profile_notify(void);
+
+static ssize_t powermode_store(struct device *dev,
+			       struct device_attribute *attr, const char *buf,
+			       size_t count)
+{
+	struct legion_private *priv = dev_get_drvdata(dev);
+	int powermode;
+	int err;
+
+	err = kstrtouint(buf, 0, &powermode);
+	if (err)
+		return err;
+
+	mutex_lock(&priv->fancurve_mutex);
+	err = write_powermode(priv, powermode);
+	mutex_unlock(&priv->fancurve_mutex);
+	if (err)
+		return -EINVAL;
+
+	// TODO: better?
+	// we have to wait a bit before change is done in hardware and
+	// readback done after notifying returns correct value, otherwise
+	// the notified reader will read old value
+	msleep(500);
+	legion_platform_profile_notify();
+
+	return count;
+}
+
+static DEVICE_ATTR_RW(powermode);
+
+static struct attribute *legion_sysfs_attributes[] = {
+	&dev_attr_powermode.attr,
+	&dev_attr_lockfancontroller.attr,
+	&dev_attr_rapidcharge.attr,
+	&dev_attr_winkey.attr,
+	&dev_attr_touchpad.attr,
+	&dev_attr_gsync.attr,
+	&dev_attr_powerchargemode.attr,
+	&dev_attr_overdrive.attr,
+	&dev_attr_cpumaxfrequency.attr,
+	&dev_attr_isacfitforoc.attr,
+	&dev_attr_cpu_oc.attr,
+	&dev_attr_cpu_shortterm_powerlimit.attr,
+	&dev_attr_cpu_longterm_powerlimit.attr,
+	&dev_attr_cpu_apu_sppt_powerlimit.attr,
+	&dev_attr_cpu_default_powerlimit.attr,
+	&dev_attr_cpu_peak_powerlimit.attr,
+	&dev_attr_cpu_cross_loading_powerlimit.attr,
+	&dev_attr_gpu_oc.attr,
+	&dev_attr_gpu_ppab_powerlimit.attr,
+	&dev_attr_gpu_ctgp_powerlimit.attr,
+	&dev_attr_gpu_ctgp2_powerlimit.attr,
+	&dev_attr_gpu_default_ppab_ctrgp_powerlimit.attr,
+	&dev_attr_gpu_temperature_limit.attr,
+	&dev_attr_gpu_boost_clock.attr,
+	&dev_attr_fan_fullspeed.attr,
+	&dev_attr_fan_maxspeed.attr,
+	&dev_attr_thermalmode.attr,
+	&dev_attr_issupportcpuoc.attr,
+	&dev_attr_issupportgpuoc.attr,
+	&dev_attr_aslcodeversion.attr,
+	&dev_attr_igpumode.attr,
+	NULL
+};
+
+static const struct attribute_group legion_attribute_group = {
+	.attrs = legion_sysfs_attributes
+};
+
+static int legion_sysfs_init(struct legion_private *priv)
+{
+	return device_add_group(&priv->platform_device->dev,
+				&legion_attribute_group);
+}
+
+static void legion_sysfs_exit(struct legion_private *priv)
+{
+	pr_info("Unloading legion sysfs\n");
+	device_remove_group(&priv->platform_device->dev,
+			    &legion_attribute_group);
+	pr_info("Unloading legion sysfs done\n");
+}
+
+/* =============================  */
+/* WMI + ACPI                     */
+/* ============================   */
+// heavily based on ideapad_laptop.c
+
+// TODO: proper names if meaning of all events is clear
+enum LEGION_WMI_EVENT {
+	LEGION_WMI_EVENT_GAMEZONE = 1,
+	LEGION_EVENT_A,
+	LEGION_EVENT_B,
+	LEGION_EVENT_C,
+	LEGION_EVENT_D,
+	LEGION_EVENT_E,
+	LEGION_EVENT_F,
+	LEGION_EVENT_G
+};
+
+struct legion_wmi_private {
+	enum LEGION_WMI_EVENT event;
+};
+
+//static void legion_wmi_notify2(u32 value, void *context)
+//    {
+//	pr_info("WMI notify\n" );
+//    }
+
+static void legion_wmi_notify(struct wmi_device *wdev, union acpi_object *data)
+{
+	struct legion_wmi_private *wpriv;
+	struct legion_private *priv;
+
+	mutex_lock(&legion_shared_mutex);
+	priv = legion_shared;
+	if ((!priv) && (priv->loaded)) {
+		pr_info("Received WMI event while not initialized!\n");
+		goto unlock;
+	}
+
+	wpriv = dev_get_drvdata(&wdev->dev);
+	switch (wpriv->event) {
+	case LEGION_EVENT_A:
+		pr_info("Fan event: legion type: %d;  acpi type: %d (%d=integer)",
+			wpriv->event, data->type, ACPI_TYPE_INTEGER);
+		// TODO: here it is too early (first unlock mutext, then wait a bit)
+		//legion_platform_profile_notify();
+		break;
+	default:
+		pr_info("Event: legion type: %d;  acpi type: %d (%d=integer)",
+			wpriv->event, data->type, ACPI_TYPE_INTEGER);
+		break;
+	}
+
+unlock:
+	mutex_unlock(&legion_shared_mutex);
+	// todo; fix that!
+	// problem: we get a event just before the powermode change (from the key?),
+	// so if we notify to early, it will read the old power mode/platform profile
+	msleep(500);
+	legion_platform_profile_notify();
+}
+
+static int legion_wmi_probe(struct wmi_device *wdev, const void *context)
+{
+	struct legion_wmi_private *wpriv;
+
+	wpriv = devm_kzalloc(&wdev->dev, sizeof(*wpriv), GFP_KERNEL);
+	if (!wpriv)
+		return -ENOMEM;
+
+	*wpriv = *(const struct legion_wmi_private *)context;
+
+	dev_set_drvdata(&wdev->dev, wpriv);
+	dev_info(&wdev->dev, "Register after probing for WMI.\n");
+	return 0;
+}
+
+static const struct legion_wmi_private legion_wmi_context_gamezone = {
+	.event = LEGION_WMI_EVENT_GAMEZONE
+};
+static const struct legion_wmi_private legion_wmi_context_a = {
+	.event = LEGION_EVENT_A
+};
+static const struct legion_wmi_private legion_wmi_context_b = {
+	.event = LEGION_EVENT_B
+};
+static const struct legion_wmi_private legion_wmi_context_c = {
+	.event = LEGION_EVENT_C
+};
+static const struct legion_wmi_private legion_wmi_context_d = {
+	.event = LEGION_EVENT_D
+};
+static const struct legion_wmi_private legion_wmi_context_e = {
+	.event = LEGION_EVENT_E
+};
+static const struct legion_wmi_private legion_wmi_context_f = {
+	.event = LEGION_EVENT_F
+};
+
+#define LEGION_WMI_GUID_FAN_EVENT "D320289E-8FEA-41E0-86F9-611D83151B5F"
+#define LEGION_WMI_GUID_FAN2_EVENT "bc72a435-e8c1-4275-b3e2-d8b8074aba59"
+#define LEGION_WMI_GUID_GAMEZONE_KEY_EVENT \
+	"10afc6d9-ea8b-4590-a2e7-1cd3c84bb4b1"
+#define LEGION_WMI_GUID_GAMEZONE_GPU_EVENT \
+	"bfd42481-aee3-4502-a107-afb68425c5f8"
+#define LEGION_WMI_GUID_GAMEZONE_OC_EVENT "d062906b-12d4-4510-999d-4831ee80e985"
+#define LEGION_WMI_GUID_GAMEZONE_TEMP_EVENT \
+	"bfd42481-aee3-4501-a107-afb68425c5f8"
+//#define LEGION_WMI_GUID_GAMEZONE_DATA_EVENT  "887b54e3-dddc-4b2c-8b88-68a26a8835d0"
+
+static const struct wmi_device_id legion_wmi_ids[] = {
+	{ LEGION_WMI_GAMEZONE_GUID, &legion_wmi_context_gamezone },
+	{ LEGION_WMI_GUID_FAN_EVENT, &legion_wmi_context_a },
+	{ LEGION_WMI_GUID_FAN2_EVENT, &legion_wmi_context_b },
+	{ LEGION_WMI_GUID_GAMEZONE_KEY_EVENT, &legion_wmi_context_c },
+	{ LEGION_WMI_GUID_GAMEZONE_GPU_EVENT, &legion_wmi_context_d },
+	{ LEGION_WMI_GUID_GAMEZONE_OC_EVENT, &legion_wmi_context_e },
+	{ LEGION_WMI_GUID_GAMEZONE_TEMP_EVENT, &legion_wmi_context_f },
+	{ "8FC0DE0C-B4E4-43FD-B0F3-8871711C1294",
+	  &legion_wmi_context_gamezone }, /* Legion 5 */
+	{},
+};
+MODULE_DEVICE_TABLE(wmi, legion_wmi_ids);
+
+static struct wmi_driver legion_wmi_driver = {
+	.driver = {
+		.name = "legion_wmi",
+	},
+	.id_table = legion_wmi_ids,
+	.probe = legion_wmi_probe,
+	.notify = legion_wmi_notify,
+};
+
+//acpi_status status = wmi_install_notify_handler(LEGION_WMI_GAMEZONE_GUID,
+//				legion_wmi_notify2, NULL);
+//if (ACPI_FAILURE(status)) {
+//    return -ENODEV;
+//}
+//return 0;
+
+static int legion_wmi_init(void)
+{
+	return wmi_driver_register(&legion_wmi_driver);
+}
+
+static void legion_wmi_exit(void)
+{
+	// TODO: remove this
+	pr_info("Unloading legion WMI\n");
+
+	//wmi_remove_notify_handler(LEGION_WMI_GAMEZONE_GUID);
+	wmi_driver_unregister(&legion_wmi_driver);
+	pr_info("Unloading legion WMI done\n");
+}
+
+/* =============================  */
+/* Platform profile               */
+/* ============================   */
+
+static void legion_platform_profile_notify(void)
+{
+	if (!enable_platformprofile)
+		pr_info("Skipping platform_profile_notify because enable_platformprofile is false\n");
+
+	platform_profile_notify();
+}
+
+static int legion_platform_profile_get(struct platform_profile_handler *pprof,
+				       enum platform_profile_option *profile)
+{
+	int powermode;
+	struct legion_private *priv;
+
+	priv = container_of(pprof, struct legion_private,
+			    platform_profile_handler);
+	read_powermode(priv, &powermode);
+
+	switch (powermode) {
+	case LEGION_WMI_POWERMODE_BALANCED:
+		*profile = PLATFORM_PROFILE_BALANCED;
+		break;
+	case LEGION_WMI_POWERMODE_PERFORMANCE:
+		*profile = PLATFORM_PROFILE_PERFORMANCE;
+		break;
+	case LEGION_WMI_POWERMODE_QUIET:
+		*profile = PLATFORM_PROFILE_QUIET;
+		break;
+	case LEGION_WMI_POWERMODE_CUSTOM:
+		*profile = PLATFORM_PROFILE_BALANCED_PERFORMANCE;
+		break;
+	default:
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static int legion_platform_profile_set(struct platform_profile_handler *pprof,
+				       enum platform_profile_option profile)
+{
+	int powermode;
+	struct legion_private *priv;
+
+	priv = container_of(pprof, struct legion_private,
+			    platform_profile_handler);
+
+	switch (profile) {
+	case PLATFORM_PROFILE_BALANCED:
+		powermode = LEGION_WMI_POWERMODE_BALANCED;
+		break;
+	case PLATFORM_PROFILE_PERFORMANCE:
+		powermode = LEGION_WMI_POWERMODE_PERFORMANCE;
+		break;
+	case PLATFORM_PROFILE_QUIET:
+		powermode = LEGION_WMI_POWERMODE_QUIET;
+		break;
+	case PLATFORM_PROFILE_BALANCED_PERFORMANCE:
+		powermode = LEGION_WMI_POWERMODE_CUSTOM;
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	return write_powermode(priv, powermode);
+}
+
+static int legion_platform_profile_init(struct legion_private *priv)
+{
+	int err;
+
+	if (!enable_platformprofile) {
+		pr_info("Skipping creating platform profile support because enable_platformprofile is false\n");
+		return 0;
+	}
+
+	priv->platform_profile_handler.profile_get =
+		legion_platform_profile_get;
+	priv->platform_profile_handler.profile_set =
+		legion_platform_profile_set;
+
+	set_bit(PLATFORM_PROFILE_QUIET, priv->platform_profile_handler.choices);
+	set_bit(PLATFORM_PROFILE_BALANCED,
+		priv->platform_profile_handler.choices);
+	set_bit(PLATFORM_PROFILE_PERFORMANCE,
+		priv->platform_profile_handler.choices);
+	if (priv->conf->has_custom_powermode &&
+	    priv->conf->access_method_powermode == ACCESS_METHOD_WMI) {
+		set_bit(PLATFORM_PROFILE_BALANCED_PERFORMANCE,
+			priv->platform_profile_handler.choices);
+	}
+
+	err = platform_profile_register(&priv->platform_profile_handler);
+	if (err)
+		return err;
+
+	return 0;
+}
+
+static void legion_platform_profile_exit(struct legion_private *priv)
+{
+	if (!enable_platformprofile) {
+		pr_info("Skipping unloading platform profile support because enable_platformprofile is false\n");
+		return;
+	}
+	pr_info("Unloading legion platform profile\n");
+	platform_profile_remove();
+	pr_info("Unloading legion platform profile done\n");
+}
+
+/* =============================  */
+/* hwom interface              */
+/* ============================   */
+
+// hw-mon interface
+
+// todo: register_group or register_info?
+
+// TODO: use one common function (like here) or one function per attribute?
+static ssize_t sensor_label_show(struct device *dev,
+				 struct device_attribute *attr, char *buf)
+{
+	int sensor_id = (to_sensor_dev_attr(attr))->index;
+	const char *label;
+
+	switch (sensor_id) {
+	case SENSOR_CPU_TEMP_ID:
+		label = "CPU Temperature\n";
+		break;
+	case SENSOR_GPU_TEMP_ID:
+		label = "GPU Temperature\n";
+		break;
+	case SENSOR_IC_TEMP_ID:
+		label = "IC Temperature\n";
+		break;
+	case SENSOR_FAN1_RPM_ID:
+		label = "Fan 1\n";
+		break;
+	case SENSOR_FAN2_RPM_ID:
+		label = "Fan 2\n";
+		break;
+	case SENSOR_FAN1_TARGET_RPM_ID:
+		label = "Fan 1 Target\n";
+		break;
+	case SENSOR_FAN2_TARGET_RPM_ID:
+		label = "Fan 2 Target\n";
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	return sprintf(buf, label);
+}
+
+// TODO: use one common function (like here) or one function per attribute?
+static ssize_t sensor_show(struct device *dev, struct device_attribute *devattr,
+			   char *buf)
+{
+	struct legion_private *priv = dev_get_drvdata(dev);
+	int sensor_id = (to_sensor_dev_attr(devattr))->index;
+	struct sensor_values values;
+	int outval;
+	int err = -EIO;
+
+	switch (sensor_id) {
+	case SENSOR_CPU_TEMP_ID:
+		err = read_temperature(priv, 0, &outval);
+		outval *= 1000;
+		break;
+	case SENSOR_GPU_TEMP_ID:
+		err = read_temperature(priv, 1, &outval);
+		outval *= 1000;
+		break;
+	case SENSOR_IC_TEMP_ID:
+		ec_read_sensor_values(&priv->ecram, priv->conf, &values);
+		outval = 1000 * values.ic_temp_celsius;
+		err = 0;
+		break;
+	case SENSOR_FAN1_RPM_ID:
+		err = read_fanspeed(priv, 0, &outval);
+		break;
+	case SENSOR_FAN2_RPM_ID:
+		err = read_fanspeed(priv, 1, &outval);
+		break;
+	case SENSOR_FAN1_TARGET_RPM_ID:
+		ec_read_sensor_values(&priv->ecram, priv->conf, &values);
+		outval = values.fan1_target_rpm;
+		err = 0;
+		break;
+	case SENSOR_FAN2_TARGET_RPM_ID:
+		ec_read_sensor_values(&priv->ecram, priv->conf, &values);
+		outval = values.fan2_target_rpm;
+		err = 0;
+		break;
+	default:
+		pr_info("Error reading sensor value with id %d\n", sensor_id);
+		return -EOPNOTSUPP;
+	}
+	if (err)
+		return err;
+
+	return sprintf(buf, "%d\n", outval);
+}
+
+static SENSOR_DEVICE_ATTR_RO(temp1_input, sensor, SENSOR_CPU_TEMP_ID);
+static SENSOR_DEVICE_ATTR_RO(temp1_label, sensor_label, SENSOR_CPU_TEMP_ID);
+static SENSOR_DEVICE_ATTR_RO(temp2_input, sensor, SENSOR_GPU_TEMP_ID);
+static SENSOR_DEVICE_ATTR_RO(temp2_label, sensor_label, SENSOR_GPU_TEMP_ID);
+static SENSOR_DEVICE_ATTR_RO(temp3_input, sensor, SENSOR_IC_TEMP_ID);
+static SENSOR_DEVICE_ATTR_RO(temp3_label, sensor_label, SENSOR_IC_TEMP_ID);
+static SENSOR_DEVICE_ATTR_RO(fan1_input, sensor, SENSOR_FAN1_RPM_ID);
+static SENSOR_DEVICE_ATTR_RO(fan1_label, sensor_label, SENSOR_FAN1_RPM_ID);
+static SENSOR_DEVICE_ATTR_RO(fan2_input, sensor, SENSOR_FAN2_RPM_ID);
+static SENSOR_DEVICE_ATTR_RO(fan2_label, sensor_label, SENSOR_FAN2_RPM_ID);
+static SENSOR_DEVICE_ATTR_RO(fan1_target, sensor, SENSOR_FAN1_TARGET_RPM_ID);
+static SENSOR_DEVICE_ATTR_RO(fan2_target, sensor, SENSOR_FAN2_TARGET_RPM_ID);
+
+static struct attribute *sensor_hwmon_attributes[] = {
+	&sensor_dev_attr_temp1_input.dev_attr.attr,
+	&sensor_dev_attr_temp1_label.dev_attr.attr,
+	&sensor_dev_attr_temp2_input.dev_attr.attr,
+	&sensor_dev_attr_temp2_label.dev_attr.attr,
+	&sensor_dev_attr_temp3_input.dev_attr.attr,
+	&sensor_dev_attr_temp3_label.dev_attr.attr,
+	&sensor_dev_attr_fan1_input.dev_attr.attr,
+	&sensor_dev_attr_fan1_label.dev_attr.attr,
+	&sensor_dev_attr_fan2_input.dev_attr.attr,
+	&sensor_dev_attr_fan2_label.dev_attr.attr,
+	&sensor_dev_attr_fan1_target.dev_attr.attr,
+	&sensor_dev_attr_fan2_target.dev_attr.attr,
+	NULL
+};
+
+static ssize_t autopoint_show(struct device *dev,
+			      struct device_attribute *devattr, char *buf)
+{
+	struct fancurve fancurve;
+	int err;
+	int value;
+	struct legion_private *priv = dev_get_drvdata(dev);
+	int fancurve_attr_id = to_sensor_dev_attr_2(devattr)->nr;
+	int point_id = to_sensor_dev_attr_2(devattr)->index;
+
+	mutex_lock(&priv->fancurve_mutex);
+	err = read_fancurve(priv, &fancurve);
+	mutex_unlock(&priv->fancurve_mutex);
+
+	if (err) {
+		pr_info("Reading fancurve failed\n");
+		return -EOPNOTSUPP;
+	}
+	if (!(point_id >= 0 && point_id < MAXFANCURVESIZE)) {
+		pr_info("Reading fancurve failed due to wrong point id: %d\n",
+			point_id);
+		return -EOPNOTSUPP;
+	}
+
+	switch (fancurve_attr_id) {
+	case FANCURVE_ATTR_PWM1:
+		value = fancurve.points[point_id].rpm1_raw * 100;
+		break;
+	case FANCURVE_ATTR_PWM2:
+		value = fancurve.points[point_id].rpm2_raw * 100;
+		break;
+	case FANCURVE_ATTR_CPU_TEMP:
+		value = fancurve.points[point_id].cpu_max_temp_celsius;
+		break;
+	case FANCURVE_ATTR_CPU_HYST:
+		value = fancurve.points[point_id].cpu_min_temp_celsius;
+		break;
+	case FANCURVE_ATTR_GPU_TEMP:
+		value = fancurve.points[point_id].gpu_max_temp_celsius;
+		break;
+	case FANCURVE_ATTR_GPU_HYST:
+		value = fancurve.points[point_id].gpu_min_temp_celsius;
+		break;
+	case FANCURVE_ATTR_IC_TEMP:
+		value = fancurve.points[point_id].ic_max_temp_celsius;
+		break;
+	case FANCURVE_ATTR_IC_HYST:
+		value = fancurve.points[point_id].ic_min_temp_celsius;
+		break;
+	case FANCURVE_ATTR_ACCEL:
+		value = fancurve.points[point_id].accel;
+		break;
+	case FANCURVE_ATTR_DECEL:
+		value = fancurve.points[point_id].decel;
+		break;
+	case FANCURVE_SIZE:
+		value = fancurve.size;
+		break;
+	default:
+		pr_info("Reading fancurve failed due to wrong attribute id: %d\n",
+			fancurve_attr_id);
+		return -EOPNOTSUPP;
+	}
+
+	return sprintf(buf, "%d\n", value);
+}
+
+static ssize_t autopoint_store(struct device *dev,
+			       struct device_attribute *devattr,
+			       const char *buf, size_t count)
+{
+	struct fancurve fancurve;
+	int err;
+	int value;
+	bool valid;
+	struct legion_private *priv = dev_get_drvdata(dev);
+	int fancurve_attr_id = to_sensor_dev_attr_2(devattr)->nr;
+	int point_id = to_sensor_dev_attr_2(devattr)->index;
+	bool write_fancurve_size = false;
+
+	if (!(point_id >= 0 && point_id < MAXFANCURVESIZE)) {
+		pr_info("Reading fancurve failed due to wrong point id: %d\n",
+			point_id);
+		err = -EOPNOTSUPP;
+		goto error;
+	}
+
+	err = kstrtoint(buf, 0, &value);
+	if (err) {
+		pr_info("Parse for hwmon store is not succesful: error:%d; point_id: %d; fancurve_attr_id: %d\\n",
+			err, point_id, fancurve_attr_id);
+		goto error;
+	}
+
+	mutex_lock(&priv->fancurve_mutex);
+	err = read_fancurve(priv, &fancurve);
+
+	if (err) {
+		pr_info("Reading fancurve failed\n");
+		err = -EOPNOTSUPP;
+		goto error_mutex;
+	}
+
+	switch (fancurve_attr_id) {
+	case FANCURVE_ATTR_PWM1:
+		valid = fancurve_set_rpm1(&fancurve, point_id, value);
+		break;
+	case FANCURVE_ATTR_PWM2:
+		valid = fancurve_set_rpm2(&fancurve, point_id, value);
+		break;
+	case FANCURVE_ATTR_CPU_TEMP:
+		valid = fancurve_set_cpu_temp_max(&fancurve, point_id, value);
+		break;
+	case FANCURVE_ATTR_CPU_HYST:
+		valid = fancurve_set_cpu_temp_min(&fancurve, point_id, value);
+		break;
+	case FANCURVE_ATTR_GPU_TEMP:
+		valid = fancurve_set_gpu_temp_max(&fancurve, point_id, value);
+		break;
+	case FANCURVE_ATTR_GPU_HYST:
+		valid = fancurve_set_gpu_temp_min(&fancurve, point_id, value);
+		break;
+	case FANCURVE_ATTR_IC_TEMP:
+		valid = fancurve_set_ic_temp_max(&fancurve, point_id, value);
+		break;
+	case FANCURVE_ATTR_IC_HYST:
+		valid = fancurve_set_ic_temp_min(&fancurve, point_id, value);
+		break;
+	case FANCURVE_ATTR_ACCEL:
+		valid = fancurve_set_accel(&fancurve, point_id, value);
+		break;
+	case FANCURVE_ATTR_DECEL:
+		valid = fancurve_set_decel(&fancurve, point_id, value);
+		break;
+	case FANCURVE_SIZE:
+		valid = fancurve_set_size(&fancurve, value, true);
+		write_fancurve_size = true;
+		break;
+	default:
+		pr_info("Writing fancurve failed due to wrong attribute id: %d\n",
+			fancurve_attr_id);
+		err = -EOPNOTSUPP;
+		goto error_mutex;
+	}
+
+	if (!valid) {
+		pr_info("Ignoring invalid fancurve value %d for attribute %d at point %d\n",
+			value, fancurve_attr_id, point_id);
+		err = -EOPNOTSUPP;
+		goto error_mutex;
+	}
+
+	err = write_fancurve(priv, &fancurve, write_fancurve_size);
+	if (err) {
+		pr_info("Writing fancurve failed for accessing hwmon at point_id: %d\n",
+			point_id);
+		err = -EOPNOTSUPP;
+		goto error_mutex;
+	}
+
+	mutex_unlock(&priv->fancurve_mutex);
+	return count;
+
+error_mutex:
+	mutex_unlock(&priv->fancurve_mutex);
+error:
+	return count;
+}
+
+// rpm1
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point1_pwm, autopoint,
+			       FANCURVE_ATTR_PWM1, 0);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point2_pwm, autopoint,
+			       FANCURVE_ATTR_PWM1, 1);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point3_pwm, autopoint,
+			       FANCURVE_ATTR_PWM1, 2);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point4_pwm, autopoint,
+			       FANCURVE_ATTR_PWM1, 3);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point5_pwm, autopoint,
+			       FANCURVE_ATTR_PWM1, 4);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point6_pwm, autopoint,
+			       FANCURVE_ATTR_PWM1, 5);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point7_pwm, autopoint,
+			       FANCURVE_ATTR_PWM1, 6);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point8_pwm, autopoint,
+			       FANCURVE_ATTR_PWM1, 7);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point9_pwm, autopoint,
+			       FANCURVE_ATTR_PWM1, 8);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point10_pwm, autopoint,
+			       FANCURVE_ATTR_PWM1, 9);
+// rpm2
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point1_pwm, autopoint,
+			       FANCURVE_ATTR_PWM2, 0);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point2_pwm, autopoint,
+			       FANCURVE_ATTR_PWM2, 1);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point3_pwm, autopoint,
+			       FANCURVE_ATTR_PWM2, 2);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point4_pwm, autopoint,
+			       FANCURVE_ATTR_PWM2, 3);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point5_pwm, autopoint,
+			       FANCURVE_ATTR_PWM2, 4);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point6_pwm, autopoint,
+			       FANCURVE_ATTR_PWM2, 5);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point7_pwm, autopoint,
+			       FANCURVE_ATTR_PWM2, 6);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point8_pwm, autopoint,
+			       FANCURVE_ATTR_PWM2, 7);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point9_pwm, autopoint,
+			       FANCURVE_ATTR_PWM2, 8);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point10_pwm, autopoint,
+			       FANCURVE_ATTR_PWM2, 9);
+// CPU temp
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point1_temp, autopoint,
+			       FANCURVE_ATTR_CPU_TEMP, 0);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point2_temp, autopoint,
+			       FANCURVE_ATTR_CPU_TEMP, 1);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point3_temp, autopoint,
+			       FANCURVE_ATTR_CPU_TEMP, 2);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point4_temp, autopoint,
+			       FANCURVE_ATTR_CPU_TEMP, 3);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point5_temp, autopoint,
+			       FANCURVE_ATTR_CPU_TEMP, 4);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point6_temp, autopoint,
+			       FANCURVE_ATTR_CPU_TEMP, 5);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point7_temp, autopoint,
+			       FANCURVE_ATTR_CPU_TEMP, 6);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point8_temp, autopoint,
+			       FANCURVE_ATTR_CPU_TEMP, 7);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point9_temp, autopoint,
+			       FANCURVE_ATTR_CPU_TEMP, 8);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point10_temp, autopoint,
+			       FANCURVE_ATTR_CPU_TEMP, 9);
+// CPU temp hyst
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point1_temp_hyst, autopoint,
+			       FANCURVE_ATTR_CPU_HYST, 0);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point2_temp_hyst, autopoint,
+			       FANCURVE_ATTR_CPU_HYST, 1);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point3_temp_hyst, autopoint,
+			       FANCURVE_ATTR_CPU_HYST, 2);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point4_temp_hyst, autopoint,
+			       FANCURVE_ATTR_CPU_HYST, 3);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point5_temp_hyst, autopoint,
+			       FANCURVE_ATTR_CPU_HYST, 4);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point6_temp_hyst, autopoint,
+			       FANCURVE_ATTR_CPU_HYST, 5);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point7_temp_hyst, autopoint,
+			       FANCURVE_ATTR_CPU_HYST, 6);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point8_temp_hyst, autopoint,
+			       FANCURVE_ATTR_CPU_HYST, 7);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point9_temp_hyst, autopoint,
+			       FANCURVE_ATTR_CPU_HYST, 8);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point10_temp_hyst, autopoint,
+			       FANCURVE_ATTR_CPU_HYST, 9);
+// GPU temp
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point1_temp, autopoint,
+			       FANCURVE_ATTR_GPU_TEMP, 0);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point2_temp, autopoint,
+			       FANCURVE_ATTR_GPU_TEMP, 1);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point3_temp, autopoint,
+			       FANCURVE_ATTR_GPU_TEMP, 2);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point4_temp, autopoint,
+			       FANCURVE_ATTR_GPU_TEMP, 3);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point5_temp, autopoint,
+			       FANCURVE_ATTR_GPU_TEMP, 4);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point6_temp, autopoint,
+			       FANCURVE_ATTR_GPU_TEMP, 5);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point7_temp, autopoint,
+			       FANCURVE_ATTR_GPU_TEMP, 6);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point8_temp, autopoint,
+			       FANCURVE_ATTR_GPU_TEMP, 7);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point9_temp, autopoint,
+			       FANCURVE_ATTR_GPU_TEMP, 8);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point10_temp, autopoint,
+			       FANCURVE_ATTR_GPU_TEMP, 9);
+// GPU temp hyst
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point1_temp_hyst, autopoint,
+			       FANCURVE_ATTR_GPU_HYST, 0);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point2_temp_hyst, autopoint,
+			       FANCURVE_ATTR_GPU_HYST, 1);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point3_temp_hyst, autopoint,
+			       FANCURVE_ATTR_GPU_HYST, 2);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point4_temp_hyst, autopoint,
+			       FANCURVE_ATTR_GPU_HYST, 3);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point5_temp_hyst, autopoint,
+			       FANCURVE_ATTR_GPU_HYST, 4);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point6_temp_hyst, autopoint,
+			       FANCURVE_ATTR_GPU_HYST, 5);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point7_temp_hyst, autopoint,
+			       FANCURVE_ATTR_GPU_HYST, 6);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point8_temp_hyst, autopoint,
+			       FANCURVE_ATTR_GPU_HYST, 7);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point9_temp_hyst, autopoint,
+			       FANCURVE_ATTR_GPU_HYST, 8);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point10_temp_hyst, autopoint,
+			       FANCURVE_ATTR_GPU_HYST, 9);
+// IC temp
+static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point1_temp, autopoint,
+			       FANCURVE_ATTR_IC_TEMP, 0);
+static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point2_temp, autopoint,
+			       FANCURVE_ATTR_IC_TEMP, 1);
+static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point3_temp, autopoint,
+			       FANCURVE_ATTR_IC_TEMP, 2);
+static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point4_temp, autopoint,
+			       FANCURVE_ATTR_IC_TEMP, 3);
+static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point5_temp, autopoint,
+			       FANCURVE_ATTR_IC_TEMP, 4);
+static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point6_temp, autopoint,
+			       FANCURVE_ATTR_IC_TEMP, 5);
+static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point7_temp, autopoint,
+			       FANCURVE_ATTR_IC_TEMP, 6);
+static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point8_temp, autopoint,
+			       FANCURVE_ATTR_IC_TEMP, 7);
+static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point9_temp, autopoint,
+			       FANCURVE_ATTR_IC_TEMP, 8);
+static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point10_temp, autopoint,
+			       FANCURVE_ATTR_IC_TEMP, 9);
+// IC temp hyst
+static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point1_temp_hyst, autopoint,
+			       FANCURVE_ATTR_IC_HYST, 0);
+static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point2_temp_hyst, autopoint,
+			       FANCURVE_ATTR_IC_HYST, 1);
+static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point3_temp_hyst, autopoint,
+			       FANCURVE_ATTR_IC_HYST, 2);
+static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point4_temp_hyst, autopoint,
+			       FANCURVE_ATTR_IC_HYST, 3);
+static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point5_temp_hyst, autopoint,
+			       FANCURVE_ATTR_IC_HYST, 4);
+static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point6_temp_hyst, autopoint,
+			       FANCURVE_ATTR_IC_HYST, 5);
+static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point7_temp_hyst, autopoint,
+			       FANCURVE_ATTR_IC_HYST, 6);
+static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point8_temp_hyst, autopoint,
+			       FANCURVE_ATTR_IC_HYST, 7);
+static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point9_temp_hyst, autopoint,
+			       FANCURVE_ATTR_IC_HYST, 8);
+static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point10_temp_hyst, autopoint,
+			       FANCURVE_ATTR_IC_HYST, 9);
+// accel
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point1_accel, autopoint,
+			       FANCURVE_ATTR_ACCEL, 0);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point2_accel, autopoint,
+			       FANCURVE_ATTR_ACCEL, 1);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point3_accel, autopoint,
+			       FANCURVE_ATTR_ACCEL, 2);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point4_accel, autopoint,
+			       FANCURVE_ATTR_ACCEL, 3);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point5_accel, autopoint,
+			       FANCURVE_ATTR_ACCEL, 4);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point6_accel, autopoint,
+			       FANCURVE_ATTR_ACCEL, 5);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point7_accel, autopoint,
+			       FANCURVE_ATTR_ACCEL, 6);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point8_accel, autopoint,
+			       FANCURVE_ATTR_ACCEL, 7);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point9_accel, autopoint,
+			       FANCURVE_ATTR_ACCEL, 8);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point10_accel, autopoint,
+			       FANCURVE_ATTR_ACCEL, 9);
+// decel
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point1_decel, autopoint,
+			       FANCURVE_ATTR_DECEL, 0);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point2_decel, autopoint,
+			       FANCURVE_ATTR_DECEL, 1);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point3_decel, autopoint,
+			       FANCURVE_ATTR_DECEL, 2);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point4_decel, autopoint,
+			       FANCURVE_ATTR_DECEL, 3);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point5_decel, autopoint,
+			       FANCURVE_ATTR_DECEL, 4);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point6_decel, autopoint,
+			       FANCURVE_ATTR_DECEL, 5);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point7_decel, autopoint,
+			       FANCURVE_ATTR_DECEL, 6);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point8_decel, autopoint,
+			       FANCURVE_ATTR_DECEL, 7);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point9_decel, autopoint,
+			       FANCURVE_ATTR_DECEL, 8);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point10_decel, autopoint,
+			       FANCURVE_ATTR_DECEL, 9);
+//size
+static SENSOR_DEVICE_ATTR_2_RW(auto_points_size, autopoint, FANCURVE_SIZE, 0);
+
+static ssize_t minifancurve_show(struct device *dev,
+				 struct device_attribute *devattr, char *buf)
+{
+	bool value;
+	int err;
+	struct legion_private *priv = dev_get_drvdata(dev);
+
+	mutex_lock(&priv->fancurve_mutex);
+	err = ec_read_minifancurve(&priv->ecram, priv->conf, &value);
+	if (err) {
+		err = -1;
+		pr_info("Reading minifancurve not succesful\n");
+		goto error_unlock;
+	}
+	mutex_unlock(&priv->fancurve_mutex);
+	return sprintf(buf, "%d\n", value);
+
+error_unlock:
+	mutex_unlock(&priv->fancurve_mutex);
+	return -1;
+}
+
+static ssize_t minifancurve_store(struct device *dev,
+				  struct device_attribute *devattr,
+				  const char *buf, size_t count)
+{
+	int value;
+	int err;
+	struct legion_private *priv = dev_get_drvdata(dev);
+
+	err = kstrtoint(buf, 0, &value);
+	if (err) {
+		err = -1;
+		pr_info("Parse for hwmon store is not succesful: error:%d\n",
+			err);
+		goto error;
+	}
+
+	mutex_lock(&priv->fancurve_mutex);
+	err = ec_write_minifancurve(&priv->ecram, priv->conf, value);
+	if (err) {
+		err = -1;
+		pr_info("Writing minifancurve not succesful\n");
+		goto error_unlock;
+	}
+	mutex_unlock(&priv->fancurve_mutex);
+	return count;
+
+error_unlock:
+	mutex_unlock(&priv->fancurve_mutex);
+error:
+	return err;
+}
+
+static SENSOR_DEVICE_ATTR_RW(minifancurve, minifancurve, 0);
+
+static ssize_t pwm1_mode_show(struct device *dev,
+			      struct device_attribute *devattr, char *buf)
+{
+	bool value;
+	int err;
+	struct legion_private *priv = dev_get_drvdata(dev);
+
+	mutex_lock(&priv->fancurve_mutex);
+	err = ec_read_fanfullspeed(&priv->ecram, priv->conf, &value);
+	if (err) {
+		err = -1;
+		pr_info("Reading pwm1_mode/maximumfanspeed not succesful\n");
+		goto error_unlock;
+	}
+	mutex_unlock(&priv->fancurve_mutex);
+	return sprintf(buf, "%d\n", value ? 0 : 2);
+
+error_unlock:
+	mutex_unlock(&priv->fancurve_mutex);
+	return -1;
+}
+
+// TODO: remove? or use WMI method?
+static ssize_t pwm1_mode_store(struct device *dev,
+			       struct device_attribute *devattr,
+			       const char *buf, size_t count)
+{
+	int value;
+	int is_maximumfanspeed;
+	int err;
+	struct legion_private *priv = dev_get_drvdata(dev);
+
+	err = kstrtoint(buf, 0, &value);
+	if (err) {
+		err = -1;
+		pr_info("Parse for hwmon store is not succesful: error:%d\n",
+			err);
+		goto error;
+	}
+	is_maximumfanspeed = value == 0;
+
+	mutex_lock(&priv->fancurve_mutex);
+	err = ec_write_fanfullspeed(&priv->ecram, priv->conf,
+				    is_maximumfanspeed);
+	if (err) {
+		err = -1;
+		pr_info("Writing pwm1_mode/maximumfanspeed not succesful\n");
+		goto error_unlock;
+	}
+	mutex_unlock(&priv->fancurve_mutex);
+	return count;
+
+error_unlock:
+	mutex_unlock(&priv->fancurve_mutex);
+error:
+	return err;
+}
+
+static SENSOR_DEVICE_ATTR_RW(pwm1_mode, pwm1_mode, 0);
+
+static struct attribute *fancurve_hwmon_attributes[] = {
+	&sensor_dev_attr_pwm1_auto_point1_pwm.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point2_pwm.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point3_pwm.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point4_pwm.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point5_pwm.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point6_pwm.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point7_pwm.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point8_pwm.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point9_pwm.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point10_pwm.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point1_pwm.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point2_pwm.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point3_pwm.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point4_pwm.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point5_pwm.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point6_pwm.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point7_pwm.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point8_pwm.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point9_pwm.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point10_pwm.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point1_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point2_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point3_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point4_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point5_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point6_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point7_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point8_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point9_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point10_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point1_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point2_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point3_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point4_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point5_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point6_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point7_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point8_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point9_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point10_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point1_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point2_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point3_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point4_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point5_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point6_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point7_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point8_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point9_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point10_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point1_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point2_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point3_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point4_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point5_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point6_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point7_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point8_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point9_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point10_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm3_auto_point1_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm3_auto_point2_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm3_auto_point3_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm3_auto_point4_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm3_auto_point5_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm3_auto_point6_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm3_auto_point7_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm3_auto_point8_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm3_auto_point9_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm3_auto_point10_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm3_auto_point1_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm3_auto_point2_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm3_auto_point3_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm3_auto_point4_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm3_auto_point5_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm3_auto_point6_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm3_auto_point7_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm3_auto_point8_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm3_auto_point9_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm3_auto_point10_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point1_accel.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point2_accel.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point3_accel.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point4_accel.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point5_accel.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point6_accel.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point7_accel.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point8_accel.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point9_accel.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point10_accel.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point1_decel.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point2_decel.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point3_decel.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point4_decel.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point5_decel.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point6_decel.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point7_decel.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point8_decel.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point9_decel.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point10_decel.dev_attr.attr,
+	//
+	&sensor_dev_attr_auto_points_size.dev_attr.attr,
+	&sensor_dev_attr_minifancurve.dev_attr.attr,
+	&sensor_dev_attr_pwm1_mode.dev_attr.attr, NULL
+};
+
+static umode_t legion_hwmon_is_visible(struct kobject *kobj,
+				       struct attribute *attr, int idx)
+{
+	bool supported = true;
+	struct device *dev = kobj_to_dev(kobj);
+	struct legion_private *priv = dev_get_drvdata(dev);
+
+	if (attr == &sensor_dev_attr_minifancurve.dev_attr.attr)
+		supported = priv->conf->has_minifancurve;
+
+	supported = supported && (priv->conf->access_method_fancurve !=
+				  ACCESS_METHOD_NO_ACCESS);
+
+	return supported ? attr->mode : 0;
+}
+
+static const struct attribute_group legion_hwmon_sensor_group = {
+	.attrs = sensor_hwmon_attributes,
+	.is_visible = NULL
+};
+
+static const struct attribute_group legion_hwmon_fancurve_group = {
+	.attrs = fancurve_hwmon_attributes,
+	.is_visible = legion_hwmon_is_visible,
+};
+
+static const struct attribute_group *legion_hwmon_groups[] = {
+	&legion_hwmon_sensor_group, &legion_hwmon_fancurve_group, NULL
+};
+
+static ssize_t legion_hwmon_init(struct legion_private *priv)
+{
+	//TODO: use hwmon_device_register_with_groups or
+	// hwmon_device_register_with_info (latter means all hwmon functions have to be
+	// changed)
+	// some laptop driver do it in one way, some in the other
+	// TODO: Use devm_hwmon_device_register_with_groups ?
+	// some laptop drivers use this, some
+	struct device *hwmon_dev = hwmon_device_register_with_groups(
+		&priv->platform_device->dev, "legion_hwmon", priv,
+		legion_hwmon_groups);
+	if (IS_ERR_OR_NULL(hwmon_dev)) {
+		pr_err("hwmon_device_register failed!\n");
+		return PTR_ERR(hwmon_dev);
+	}
+	dev_set_drvdata(hwmon_dev, priv);
+	priv->hwmon_dev = hwmon_dev;
+	return 0;
+}
+
+static void legion_hwmon_exit(struct legion_private *priv)
+{
+	pr_info("Unloading legion hwon\n");
+	if (priv->hwmon_dev) {
+		hwmon_device_unregister(priv->hwmon_dev);
+		priv->hwmon_dev = NULL;
+	}
+	pr_info("Unloading legion hwon done\n");
+}
+
+/* ACPI*/
+
+static int acpi_init(struct legion_private *priv, struct acpi_device *adev)
+{
+	int err;
+	unsigned long cfg;
+	bool skip_acpi_sta_check;
+	struct device *dev = &priv->platform_device->dev;
+
+	priv->adev = adev;
+	if (!priv->adev) {
+		dev_info(dev, "Could not get ACPI handle\n");
+		goto err_acpi_init;
+	}
+
+	skip_acpi_sta_check = force || (!priv->conf->acpi_check_dev);
+	if (!skip_acpi_sta_check) {
+		err = eval_int(priv->adev->handle, "_STA", &cfg);
+		if (err) {
+			dev_info(dev, "Could not evaluate ACPI _STA\n");
+			goto err_acpi_init;
+		}
+
+		err = eval_int(priv->adev->handle, "VPC0._CFG", &cfg);
+		if (err) {
+			dev_info(dev, "Could not evaluate ACPI _CFG\n");
+			goto err_acpi_init;
+		}
+		dev_info(dev, "ACPI CFG: %lu\n", cfg);
+	} else {
+		dev_info(dev, "Skipping ACPI _STA check");
+	}
+
+	return 0;
+
+err_acpi_init:
+	return err;
+}
+
+/* =============================  */
+/* White Keyboard Backlight       */
+/* ============================   */
+// In style of ideapad-driver and with code modified from ideapad-driver.
+
+static enum led_brightness
+legion_kbd_bl_led_cdev_brightness_get(struct led_classdev *led_cdev)
+{
+	struct legion_private *priv =
+		container_of(led_cdev, struct legion_private, kbd_bl.led);
+
+	return legion_kbd_bl_brightness_get(priv);
+}
+
+static int legion_kbd_bl_led_cdev_brightness_set(struct led_classdev *led_cdev,
+						 enum led_brightness brightness)
+{
+	struct legion_private *priv =
+		container_of(led_cdev, struct legion_private, kbd_bl.led);
+
+	return legion_kbd_bl_brightness_set(priv, brightness);
+}
+
+static int legion_kbd_bl_init(struct legion_private *priv)
+{
+	int brightness, err;
+
+	if (WARN_ON(priv->kbd_bl.initialized)) {
+		pr_info("Keyboard backlight already initialized\n");
+		return -EEXIST;
+	}
+
+	if (priv->conf->access_method_keyboard == ACCESS_METHOD_NO_ACCESS) {
+		pr_info("Keyboard backlight handling disabled by this driver\n");
+		return -ENODEV;
+	}
+
+	brightness = legion_kbd_bl_brightness_get(priv);
+	if (brightness < 0) {
+		pr_info("Error reading keyboard brighntess\n");
+		return brightness;
+	}
+
+	priv->kbd_bl.last_brightness = brightness;
+
+	// will be renamed to "platform::kbd_backlight_1" if it exists already
+	priv->kbd_bl.led.name = "platform::" LED_FUNCTION_KBD_BACKLIGHT;
+	priv->kbd_bl.led.max_brightness = 2;
+	priv->kbd_bl.led.brightness_get = legion_kbd_bl_led_cdev_brightness_get;
+	priv->kbd_bl.led.brightness_set_blocking =
+		legion_kbd_bl_led_cdev_brightness_set;
+	priv->kbd_bl.led.flags = LED_BRIGHT_HW_CHANGED;
+
+	err = led_classdev_register(&priv->platform_device->dev,
+				    &priv->kbd_bl.led);
+	if (err)
+		return err;
+
+	priv->kbd_bl.initialized = true;
+
+	return 0;
+}
+
+/**
+ * Deinit keyboard backlight.
+ *
+ * Can also be called if init was not successful.
+ *
+ */
+static void legion_kbd_bl_exit(struct legion_private *priv)
+{
+	if (!priv->kbd_bl.initialized)
+		return;
+
+	priv->kbd_bl.initialized = false;
+
+	led_classdev_unregister(&priv->kbd_bl.led);
+}
+
+/* =============================  */
+/* Additional light driver        */
+/* ============================   */
+
+static enum led_brightness
+legion_wmi_cdev_brightness_get(struct led_classdev *led_cdev)
+{
+	struct legion_private *priv =
+		container_of(led_cdev, struct legion_private, kbd_bl.led);
+	struct light *light_ins = container_of(led_cdev, struct light, led);
+
+	return legion_wmi_light_get(priv, light_ins->light_id,
+				    light_ins->lower_limit,
+				    light_ins->upper_limit);
+}
+
+static int legion_wmi_cdev_brightness_set(struct led_classdev *led_cdev,
+					  enum led_brightness brightness)
+{
+	struct legion_private *priv =
+		container_of(led_cdev, struct legion_private, kbd_bl.led);
+	struct light *light_ins = container_of(led_cdev, struct light, led);
+
+	return legion_wmi_light_set(priv, light_ins->light_id,
+				    light_ins->lower_limit,
+				    light_ins->upper_limit, brightness);
+}
+
+static int legion_light_init(struct legion_private *priv,
+			     struct light *light_ins, u8 light_id,
+			     u8 lower_limit, u8 upper_limit, const char *name)
+{
+	int brightness, err;
+
+	if (WARN_ON(light_ins->initialized)) {
+		pr_info("Light already initialized for light: %u\n",
+			light_ins->light_id);
+		return -EEXIST;
+	}
+
+	light_ins->light_id = light_id;
+	light_ins->lower_limit = lower_limit;
+	light_ins->upper_limit = upper_limit;
+
+	brightness = legion_wmi_light_get(priv, light_ins->light_id,
+					  light_ins->lower_limit,
+					  light_ins->upper_limit);
+	if (brightness < 0) {
+		pr_info("Error reading brighntess for light: %u\n",
+			light_ins->light_id);
+		return brightness;
+	}
+
+	light_ins->led.name = name;
+	light_ins->led.max_brightness =
+		light_ins->upper_limit - light_ins->lower_limit;
+	light_ins->led.brightness_get = legion_wmi_cdev_brightness_get;
+	light_ins->led.brightness_set_blocking = legion_wmi_cdev_brightness_set;
+	light_ins->led.flags = LED_BRIGHT_HW_CHANGED;
+
+	err = led_classdev_register(&priv->platform_device->dev,
+				    &light_ins->led);
+	if (err)
+		return err;
+
+	light_ins->initialized = true;
+
+	return 0;
+}
+
+/**
+ * Deinit light.
+ *
+ * Can also be called if init was not successful.
+ *
+ */
+static void legion_light_exit(struct legion_private *priv,
+			      struct light *light_ins)
+{
+	if (!light_ins->initialized)
+		return;
+
+	light_ins->initialized = false;
+
+	led_classdev_unregister(&light_ins->led);
+}
+
+/* =============================  */
+/* Platform driver                */
+/* ============================   */
+
+static int legion_add(struct platform_device *pdev)
+{
+	struct legion_private *priv;
+	const struct dmi_system_id *dmi_sys;
+	int err;
+	u16 ec_read_id;
+	bool skip_ec_id_check;
+	bool is_ec_id_valid;
+	bool is_denied = true;
+	bool is_allowed = false;
+	bool do_load_by_list = false;
+	bool do_load = false;
+	//struct legion_private *priv = dev_get_drvdata(&pdev->dev);
+	dev_info(&pdev->dev, "legion_laptop platform driver probing\n");
+
+	dev_info(
+		&pdev->dev,
+		"Read identifying information: DMI_SYS_VENDOR: %s; DMI_PRODUCT_NAME: %s; DMI_BIOS_VERSION:%s\n",
+		dmi_get_system_info(DMI_SYS_VENDOR),
+		dmi_get_system_info(DMI_PRODUCT_NAME),
+		dmi_get_system_info(DMI_BIOS_VERSION));
+
+	// TODO: allocate?
+	priv = &_priv;
+	priv->platform_device = pdev;
+	err = legion_shared_init(priv);
+	if (err) {
+		dev_info(&pdev->dev, "legion_laptop is forced to load.\n");
+		goto err_legion_shared_init;
+	}
+	dev_set_drvdata(&pdev->dev, priv);
+
+	// TODO: remove
+	pr_info("Read identifying information: DMI_SYS_VENDOR: %s; DMI_PRODUCT_NAME: %s; DMI_BIOS_VERSION:%s\n",
+		dmi_get_system_info(DMI_SYS_VENDOR),
+		dmi_get_system_info(DMI_PRODUCT_NAME),
+		dmi_get_system_info(DMI_BIOS_VERSION));
+
+	dmi_sys = dmi_first_match(optimistic_allowlist);
+	is_allowed = dmi_sys != NULL;
+	is_denied = dmi_check_system(denylist);
+	do_load_by_list = is_allowed && !is_denied;
+	do_load = do_load_by_list || force;
+
+	dev_info(
+		&pdev->dev,
+		"is_denied: %d; is_allowed: %d; do_load_by_list: %d; do_load: %d\n",
+		is_denied, is_allowed, do_load_by_list, do_load);
+
+	if (!(do_load)) {
+		dev_info(
+			&pdev->dev,
+			"Module not useable for this laptop because it is not in allowlist. Notify maintainer if you want to add your device or force load with param force.\n");
+		err = -ENOMEM;
+		goto err_model_mismtach;
+	}
+
+	if (force)
+		dev_info(&pdev->dev, "legion_laptop is forced to load.\n");
+
+	if (!do_load_by_list && do_load) {
+		dev_info(
+			&pdev->dev,
+			"legion_laptop is forced to load and would otherwise be not loaded\n");
+	}
+
+	// if forced and no module found, use config for first model
+	if (dmi_sys == NULL)
+		dmi_sys = &optimistic_allowlist[0];
+	dev_info(&pdev->dev, "Using configuration for system: %s\n",
+		 dmi_sys->ident);
+
+	priv->conf = dmi_sys->driver_data;
+
+	err = acpi_init(priv, ACPI_COMPANION(&pdev->dev));
+	if (err) {
+		dev_info(&pdev->dev, "Could not init ACPI access: %d\n", err);
+		goto err_acpi_init;
+	}
+
+	// TODO: remove; only used for reverse engineering
+	pr_info("Creating RAM access to embedded controller\n");
+	err = ecram_memoryio_init(&priv->ec_memoryio,
+				  priv->conf->ramio_physical_start, 0,
+				  priv->conf->ramio_size);
+	if (err) {
+		dev_info(
+			&pdev->dev,
+			"Could not init RAM access to embedded controller: %d\n",
+			err);
+		goto err_ecram_memoryio_init;
+	}
+
+	err = ecram_init(&priv->ecram, priv->conf->memoryio_physical_ec_start,
+			 priv->conf->memoryio_size);
+	if (err) {
+		dev_info(&pdev->dev,
+			 "Could not init access to embedded controller: %d\n",
+			 err);
+		goto err_ecram_init;
+	}
+
+	ec_read_id = read_ec_id(&priv->ecram, priv->conf);
+	dev_info(&pdev->dev, "Read embedded controller ID 0x%x\n", ec_read_id);
+	skip_ec_id_check = force || (!priv->conf->check_embedded_controller_id);
+	is_ec_id_valid = skip_ec_id_check ||
+			 (ec_read_id == priv->conf->embedded_controller_id);
+	if (!is_ec_id_valid) {
+		err = -ENOMEM;
+		dev_info(&pdev->dev, "Expected EC chip id 0x%x but read 0x%x\n",
+			 priv->conf->embedded_controller_id, ec_read_id);
+		goto err_ecram_id;
+	}
+	if (skip_ec_id_check) {
+		dev_info(&pdev->dev,
+			 "Skipped checking embedded controller id\n");
+	}
+
+	dev_info(&pdev->dev, "Creating debugfs inteface\n");
+	legion_debugfs_init(priv);
+
+	pr_info("Creating sysfs inteface\n");
+	err = legion_sysfs_init(priv);
+	if (err) {
+		dev_info(&pdev->dev, "Creating sysfs interface failed: %d\n",
+			 err);
+		goto err_sysfs_init;
+	}
+
+	pr_info("Creating hwmon interface");
+	err = legion_hwmon_init(priv);
+	if (err) {
+		dev_info(&pdev->dev, "Creating hwmon interface failed: %d\n",
+			 err);
+		goto err_hwmon_init;
+	}
+
+	pr_info("Creating platform profile support\n");
+	err = legion_platform_profile_init(priv);
+	if (err) {
+		dev_info(&pdev->dev, "Creating platform profile failed: %d\n",
+			 err);
+		goto err_platform_profile;
+	}
+
+	pr_info("Init WMI driver support\n");
+	err = legion_wmi_init();
+	if (err) {
+		dev_info(&pdev->dev, "Init WMI driver failed: %d\n", err);
+		goto err_wmi;
+	}
+
+	pr_info("Init keyboard backlight LED driver\n");
+	err = legion_kbd_bl_init(priv);
+	if (err) {
+		dev_info(
+			&pdev->dev,
+			"Init keyboard backlight LED driver failed. Skipping ...\n");
+	}
+
+	pr_info("Init Y-Logo LED driver\n");
+	err = legion_light_init(priv, &priv->ylogo_light, LIGHT_ID_YLOGO, 0, 1,
+				"platform::ylogo");
+	if (err) {
+		dev_info(&pdev->dev,
+			 "Init Y-Logo LED driver failed. Skipping ...\n");
+	}
+
+	pr_info("Init IO-Port LED driver\n");
+	err = legion_light_init(priv, &priv->iport_light, LIGHT_ID_IOPORT, 1, 2,
+				"platform::ioport");
+	if (err) {
+		dev_info(&pdev->dev,
+			 "Init IO-Port LED driver failed. Skipping ...\n");
+	}
+
+	dev_info(&pdev->dev, "legion_laptop loaded for this device\n");
+	return 0;
+
+	// TODO: remove eventually
+	legion_light_exit(priv, &priv->iport_light);
+	legion_light_exit(priv, &priv->ylogo_light);
+	legion_kbd_bl_exit(priv);
+	legion_wmi_exit();
+err_wmi:
+	legion_platform_profile_exit(priv);
+err_platform_profile:
+	legion_hwmon_exit(priv);
+err_hwmon_init:
+	legion_sysfs_exit(priv);
+err_sysfs_init:
+	legion_debugfs_exit(priv);
+err_ecram_id:
+	ecram_exit(&priv->ecram);
+err_ecram_init:
+	ecram_memoryio_exit(&priv->ec_memoryio);
+err_ecram_memoryio_init:
+err_acpi_init:
+	legion_shared_exit(priv);
+err_legion_shared_init:
+err_model_mismtach:
+	dev_info(&pdev->dev, "legion_laptop not loaded for this device\n");
+	return err;
+}
+
+static int legion_remove(struct platform_device *pdev)
+{
+	struct legion_private *priv = dev_get_drvdata(&pdev->dev);
+
+	mutex_lock(&legion_shared_mutex);
+	priv->loaded = false;
+	mutex_unlock(&legion_shared_mutex);
+
+	legion_light_exit(priv, &priv->iport_light);
+	legion_light_exit(priv, &priv->ylogo_light);
+	legion_kbd_bl_exit(priv);
+	// first unregister wmi, so toggling powermode does not
+	// generate events anymore that even might be delayed
+	legion_wmi_exit();
+	legion_platform_profile_exit(priv);
+
+	// toggle power mode to load default setting from embedded controller
+	// again
+	toggle_powermode(priv);
+
+	legion_hwmon_exit(priv);
+	legion_sysfs_exit(priv);
+	legion_debugfs_exit(priv);
+	ecram_exit(&priv->ecram);
+	ecram_memoryio_exit(&priv->ec_memoryio);
+	legion_shared_exit(priv);
+
+	pr_info("Legion platform unloaded\n");
+	return 0;
+}
+
+static int legion_resume(struct platform_device *pdev)
+{
+	//struct legion_private *priv = dev_get_drvdata(&pdev->dev);
+	dev_info(&pdev->dev, "Resumed in legion-laptop\n");
+
+	return 0;
+}
+
+#ifdef CONFIG_PM_SLEEP
+static int legion_pm_resume(struct device *dev)
+{
+	//struct legion_private *priv = dev_get_drvdata(dev);
+	dev_info(dev, "Resumed PM in legion-laptop\n");
+
+	return 0;
+}
+#endif
+static SIMPLE_DEV_PM_OPS(legion_pm, NULL, legion_pm_resume);
+
+// same as ideapad
+static const struct acpi_device_id legion_device_ids[] = {
+	// todo: change to "VPC2004", and also ACPI paths
+	{ "PNP0C09", 0 },
+	{ "", 0 },
+};
+MODULE_DEVICE_TABLE(acpi, legion_device_ids);
+
+static struct platform_driver legion_driver = {
+	.probe = legion_add,
+	.remove = legion_remove,
+	.resume = legion_resume,
+	.driver = {
+		.name   = "legion",
+		.pm     = &legion_pm,
+		.acpi_match_table = ACPI_PTR(legion_device_ids),
+	},
+};
+
+static int __init legion_init(void)
+{
+	int err;
+
+	pr_info("legion_laptop starts loading\n");
+	err = platform_driver_register(&legion_driver);
+	if (err) {
+		pr_info("legion_laptop: platform_driver_register failed\n");
+		return err;
+	}
+
+	return 0;
+}
+
+module_init(legion_init);
+
+static void __exit legion_exit(void)
+{
+	platform_driver_unregister(&legion_driver);
+	pr_info("legion_laptop exit\n");
+}
+
+module_exit(legion_exit);
diff --git a/drivers/platform/x86/steamdeck.c b/drivers/platform/x86/steamdeck.c
new file mode 100644
index 000000000000..77a6677ec19e
--- /dev/null
+++ b/drivers/platform/x86/steamdeck.c
@@ -0,0 +1,523 @@
+// SPDX-License-Identifier: GPL-2.0+
+
+/*
+ * Steam Deck ACPI platform driver
+ *
+ * Copyright (C) 2021-2022 Valve Corporation
+ *
+ */
+#include <linux/acpi.h>
+#include <linux/hwmon.h>
+#include <linux/platform_device.h>
+#include <linux/regmap.h>
+#include <linux/extcon-provider.h>
+
+#define ACPI_STEAMDECK_NOTIFY_STATUS	0x80
+
+/* 0 - port connected, 1 -port disconnected */
+#define ACPI_STEAMDECK_PORT_CONNECT	BIT(0)
+/* 0 - Upstream Facing Port, 1 - Downdstream Facing Port */
+#define ACPI_STEAMDECK_CUR_DATA_ROLE	BIT(3)
+/*
+ * Debouncing delay to allow negotiation process to settle. 2s value
+ * was arrived at via trial and error.
+ */
+#define STEAMDECK_ROLE_SWITCH_DELAY	(msecs_to_jiffies(2000))
+
+struct steamdeck {
+	struct acpi_device *adev;
+	struct device *hwmon;
+	void *regmap;
+	long fan_target;
+	struct delayed_work role_work;
+	struct extcon_dev *edev;
+	struct device *dev;
+};
+
+static ssize_t
+steamdeck_simple_store(struct device *dev, const char *buf, size_t count,
+		       const char *method,
+		       unsigned long upper_limit)
+{
+	struct steamdeck *fan = dev_get_drvdata(dev);
+	unsigned long value;
+
+	if (kstrtoul(buf, 10, &value) || value >= upper_limit)
+		return -EINVAL;
+
+	if (ACPI_FAILURE(acpi_execute_simple_method(fan->adev->handle,
+						    (char *)method, value)))
+		return -EIO;
+
+	return count;
+}
+
+#define STEAMDECK_ATTR_WO(_name, _method, _upper_limit)			\
+	static ssize_t _name##_store(struct device *dev,		\
+				     struct device_attribute *attr,	\
+				     const char *buf, size_t count)	\
+	{								\
+		return steamdeck_simple_store(dev, buf, count,		\
+					    _method,			\
+					    _upper_limit);		\
+	}								\
+	static DEVICE_ATTR_WO(_name)
+
+STEAMDECK_ATTR_WO(target_cpu_temp, "STCT", U8_MAX / 2);
+STEAMDECK_ATTR_WO(gain, "SGAN", U16_MAX);
+STEAMDECK_ATTR_WO(ramp_rate, "SFRR", U8_MAX);
+STEAMDECK_ATTR_WO(hysteresis, "SHTS",  U16_MAX);
+STEAMDECK_ATTR_WO(maximum_battery_charge_rate, "CHGR", U16_MAX);
+STEAMDECK_ATTR_WO(recalculate, "SCHG", U16_MAX);
+
+STEAMDECK_ATTR_WO(led_brightness, "CHBV", U8_MAX);
+STEAMDECK_ATTR_WO(content_adaptive_brightness, "CABC", U8_MAX);
+STEAMDECK_ATTR_WO(gamma_set, "GAMA", U8_MAX);
+STEAMDECK_ATTR_WO(display_brightness, "WDBV", U8_MAX);
+STEAMDECK_ATTR_WO(ctrl_display, "WCDV", U8_MAX);
+STEAMDECK_ATTR_WO(cabc_minimum_brightness, "WCMB", U8_MAX);
+STEAMDECK_ATTR_WO(memory_data_access_control, "MDAC", U8_MAX);
+
+#define STEAMDECK_ATTR_WO_NOARG(_name, _method)				\
+	static ssize_t _name##_store(struct device *dev,		\
+				     struct device_attribute *attr,	\
+				     const char *buf, size_t count)	\
+	{								\
+		struct steamdeck *fan = dev_get_drvdata(dev);		\
+									\
+		if (ACPI_FAILURE(acpi_evaluate_object(fan->adev->handle, \
+						      _method, NULL, NULL))) \
+			return -EIO;					\
+									\
+		return count;						\
+	}								\
+	static DEVICE_ATTR_WO(_name)
+
+STEAMDECK_ATTR_WO_NOARG(power_cycle_display, "DPCY");
+STEAMDECK_ATTR_WO_NOARG(display_normal_mode_on, "NORO");
+STEAMDECK_ATTR_WO_NOARG(display_inversion_off, "INOF");
+STEAMDECK_ATTR_WO_NOARG(display_inversion_on, "INON");
+STEAMDECK_ATTR_WO_NOARG(idle_mode_on, "WRNE");
+
+#define STEAMDECK_ATTR_RO(_name, _method)				\
+	static ssize_t _name##_show(struct device *dev,			\
+				    struct device_attribute *attr,	\
+				    char *buf)				\
+	{								\
+		struct steamdeck *jup = dev_get_drvdata(dev);		\
+		unsigned long long val;					\
+									\
+		if (ACPI_FAILURE(acpi_evaluate_integer(			\
+					 jup->adev->handle,		\
+					 _method, NULL, &val)))		\
+			return -EIO;					\
+									\
+		return sprintf(buf, "%llu\n", val);			\
+	}								\
+	static DEVICE_ATTR_RO(_name)
+
+STEAMDECK_ATTR_RO(firmware_version, "PDFW");
+STEAMDECK_ATTR_RO(board_id, "BOID");
+STEAMDECK_ATTR_RO(pdcs, "PDCS");
+
+static umode_t
+steamdeck_is_visible(struct kobject *kobj, struct attribute *attr, int index)
+{
+	return attr->mode;
+}
+
+static struct attribute *steamdeck_attributes[] = {
+	&dev_attr_target_cpu_temp.attr,
+	&dev_attr_gain.attr,
+	&dev_attr_ramp_rate.attr,
+	&dev_attr_hysteresis.attr,
+	&dev_attr_maximum_battery_charge_rate.attr,
+	&dev_attr_recalculate.attr,
+	&dev_attr_power_cycle_display.attr,
+
+	&dev_attr_led_brightness.attr,
+	&dev_attr_content_adaptive_brightness.attr,
+	&dev_attr_gamma_set.attr,
+	&dev_attr_display_brightness.attr,
+	&dev_attr_ctrl_display.attr,
+	&dev_attr_cabc_minimum_brightness.attr,
+	&dev_attr_memory_data_access_control.attr,
+
+	&dev_attr_display_normal_mode_on.attr,
+	&dev_attr_display_inversion_off.attr,
+	&dev_attr_display_inversion_on.attr,
+	&dev_attr_idle_mode_on.attr,
+
+	&dev_attr_firmware_version.attr,
+	&dev_attr_board_id.attr,
+	&dev_attr_pdcs.attr,
+
+	NULL
+};
+
+static const struct attribute_group steamdeck_group = {
+	.attrs = steamdeck_attributes,
+	.is_visible = steamdeck_is_visible,
+};
+
+static const struct attribute_group *steamdeck_groups[] = {
+	&steamdeck_group,
+	NULL
+};
+
+static int steamdeck_read_fan_speed(struct steamdeck *jup, long *speed)
+{
+	unsigned long long val;
+
+	if (ACPI_FAILURE(acpi_evaluate_integer(jup->adev->handle,
+					       "FANR", NULL, &val)))
+		return -EIO;
+
+	*speed = val;
+	return 0;
+}
+
+static int
+steamdeck_hwmon_read(struct device *dev, enum hwmon_sensor_types type,
+		     u32 attr, int channel, long *out)
+{
+	struct steamdeck *sd = dev_get_drvdata(dev);
+	unsigned long long val;
+
+	switch (type) {
+	case hwmon_temp:
+		if (attr != hwmon_temp_input)
+			return -EOPNOTSUPP;
+
+		if (ACPI_FAILURE(acpi_evaluate_integer(sd->adev->handle,
+						       "BATT", NULL, &val)))
+			return -EIO;
+		/*
+		 * Assuming BATT returns deg C we need to mutiply it
+		 * by 1000 to convert to mC
+		 */
+		*out = val * 1000;
+		break;
+	case hwmon_fan:
+		switch (attr) {
+		case hwmon_fan_input:
+			return steamdeck_read_fan_speed(sd, out);
+		case hwmon_fan_target:
+			*out = sd->fan_target;
+			break;
+		case hwmon_fan_fault:
+			if (ACPI_FAILURE(acpi_evaluate_integer(
+						 sd->adev->handle,
+						 "FANC", NULL, &val)))
+				return -EIO;
+			/*
+			 * FANC (Fan check):
+			 * 0: Abnormal
+			 * 1: Normal
+			 */
+			*out = !val;
+			break;
+		default:
+			return -EOPNOTSUPP;
+		}
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	return 0;
+}
+
+static int
+steamdeck_hwmon_read_string(struct device *dev, enum hwmon_sensor_types type,
+			    u32 attr, int channel, const char **str)
+{
+	switch (type) {
+	case hwmon_temp:
+		*str = "Battery Temp";
+		break;
+	case hwmon_fan:
+		*str = "System Fan";
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	return 0;
+}
+
+static int
+steamdeck_hwmon_write(struct device *dev, enum hwmon_sensor_types type,
+		      u32 attr, int channel, long val)
+{
+	struct steamdeck *sd = dev_get_drvdata(dev);
+
+	if (type != hwmon_fan ||
+	    attr != hwmon_fan_target)
+		return -EOPNOTSUPP;
+
+	if (val > U16_MAX)
+		return -EINVAL;
+
+	sd->fan_target = val;
+
+	if (ACPI_FAILURE(acpi_execute_simple_method(sd->adev->handle,
+						    "FANS", val)))
+		return -EIO;
+
+	return 0;
+}
+
+static umode_t
+steamdeck_hwmon_is_visible(const void *data, enum hwmon_sensor_types type,
+			   u32 attr, int channel)
+{
+	if (type == hwmon_fan &&
+	    attr == hwmon_fan_target)
+		return 0644;
+
+	return 0444;
+}
+
+static const struct hwmon_channel_info *steamdeck_info[] = {
+	HWMON_CHANNEL_INFO(temp,
+			   HWMON_T_INPUT | HWMON_T_LABEL),
+	HWMON_CHANNEL_INFO(fan,
+			   HWMON_F_INPUT | HWMON_F_LABEL |
+			   HWMON_F_TARGET | HWMON_F_FAULT),
+	NULL
+};
+
+static const struct hwmon_ops steamdeck_hwmon_ops = {
+	.is_visible = steamdeck_hwmon_is_visible,
+	.read = steamdeck_hwmon_read,
+	.read_string = steamdeck_hwmon_read_string,
+	.write = steamdeck_hwmon_write,
+};
+
+static const struct hwmon_chip_info steamdeck_chip_info = {
+	.ops = &steamdeck_hwmon_ops,
+	.info = steamdeck_info,
+};
+
+#define STEAMDECK_STA_OK			\
+	(ACPI_STA_DEVICE_ENABLED |		\
+	 ACPI_STA_DEVICE_PRESENT |		\
+	 ACPI_STA_DEVICE_FUNCTIONING)
+
+static int
+steamdeck_ddic_reg_read(void *context, unsigned int reg, unsigned int *val)
+{
+	union acpi_object obj = { .type = ACPI_TYPE_INTEGER };
+	struct acpi_object_list arg_list = { .count = 1, .pointer = &obj, };
+	struct steamdeck *sd = context;
+	unsigned long long _val;
+
+	obj.integer.value = reg;
+
+	if (ACPI_FAILURE(acpi_evaluate_integer(sd->adev->handle,
+					       "RDDI", &arg_list, &_val)))
+		return -EIO;
+
+	*val = _val;
+	return 0;
+}
+
+static int steamdeck_read_pdcs(struct steamdeck *sd, unsigned long long *pdcs)
+{
+	acpi_status status;
+
+	status = acpi_evaluate_integer(sd->adev->handle, "PDCS", NULL, pdcs);
+	if (ACPI_FAILURE(status)) {
+		dev_err(sd->dev, "PDCS evaluation failed: %s\n",
+			acpi_format_exception(status));
+		return -EIO;
+	}
+
+	return 0;
+}
+
+static void steamdeck_usb_role_work(struct work_struct *work)
+{
+	struct steamdeck *sd =
+		container_of(work, struct steamdeck, role_work.work);
+	unsigned long long pdcs;
+	bool usb_host;
+
+	if (steamdeck_read_pdcs(sd, &pdcs))
+		return;
+
+	/*
+	 * We only care about these two
+	 */
+	pdcs &= ACPI_STEAMDECK_PORT_CONNECT | ACPI_STEAMDECK_CUR_DATA_ROLE;
+
+	/*
+	 * For "connect" events our role is determined by a bit in
+	 * PDCS, for "disconnect" we switch to being a gadget
+	 * unconditionally. The thinking for the latter is we don't
+	 * want to start acting as a USB host until we get
+	 * confirmation from the firmware that we are a USB host
+	 */
+	usb_host = (pdcs & ACPI_STEAMDECK_PORT_CONNECT) ?
+		pdcs & ACPI_STEAMDECK_CUR_DATA_ROLE : false;
+
+	WARN_ON(extcon_set_state_sync(sd->edev, EXTCON_USB_HOST,
+				      usb_host));
+	dev_dbg(sd->dev, "USB role is %s\n", usb_host ? "host" : "device");
+}
+
+static void steamdeck_notify(acpi_handle handle, u32 event, void *context)
+{
+	struct device *dev = context;
+	struct steamdeck *sd = dev_get_drvdata(dev);
+	unsigned long long pdcs;
+	unsigned long delay;
+
+	switch (event) {
+	case ACPI_STEAMDECK_NOTIFY_STATUS:
+		if (steamdeck_read_pdcs(sd, &pdcs))
+			return;
+		/*
+		 * We process "disconnect" events immediately and
+		 * "connect" events with a delay to give the HW time
+		 * to settle. For example attaching USB hub (at least
+		 * for HW used for testing) will generate intermediary
+		 * event with "host" bit not set, followed by the one
+		 * that does have it set.
+		 */
+		delay = (pdcs & ACPI_STEAMDECK_PORT_CONNECT) ?
+			STEAMDECK_ROLE_SWITCH_DELAY : 0;
+
+		queue_delayed_work(system_long_wq, &sd->role_work, delay);
+		break;
+	default:
+		dev_err(dev, "Unsupported event [0x%x]\n", event);
+	}
+}
+
+static void steamdeck_remove_notify_handler(void *data)
+{
+	struct steamdeck *sd = data;
+
+	acpi_remove_notify_handler(sd->adev->handle, ACPI_DEVICE_NOTIFY,
+				   steamdeck_notify);
+	cancel_delayed_work_sync(&sd->role_work);
+}
+
+static const unsigned int steamdeck_extcon_cable[] = {
+	EXTCON_USB,
+	EXTCON_USB_HOST,
+	EXTCON_CHG_USB_SDP,
+	EXTCON_CHG_USB_CDP,
+	EXTCON_CHG_USB_DCP,
+	EXTCON_CHG_USB_ACA,
+	EXTCON_NONE,
+};
+
+static int steamdeck_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct steamdeck *sd;
+	acpi_status status;
+	unsigned long long sta;
+	int ret;
+
+	static const struct regmap_config regmap_config = {
+		.reg_bits = 8,
+		.val_bits = 8,
+		.max_register = 255,
+		.cache_type = REGCACHE_NONE,
+		.reg_read = steamdeck_ddic_reg_read,
+	};
+
+	sd = devm_kzalloc(dev, sizeof(*sd), GFP_KERNEL);
+	if (!sd)
+		return -ENOMEM;
+	sd->adev = ACPI_COMPANION(&pdev->dev);
+	sd->dev = dev;
+	platform_set_drvdata(pdev, sd);
+	INIT_DELAYED_WORK(&sd->role_work, steamdeck_usb_role_work);
+
+	status = acpi_evaluate_integer(sd->adev->handle, "_STA",
+				       NULL, &sta);
+	if (ACPI_FAILURE(status)) {
+		dev_err(dev, "Status check failed (0x%x)\n", status);
+		return -EINVAL;
+	}
+
+	if ((sta & STEAMDECK_STA_OK) != STEAMDECK_STA_OK) {
+		dev_err(dev, "Device is not ready\n");
+		return -EINVAL;
+	}
+
+	/*
+	 * Our ACPI interface doesn't expose a method to read current
+	 * fan target, so we use current fan speed as an
+	 * approximation.
+	 */
+	if (steamdeck_read_fan_speed(sd, &sd->fan_target))
+		dev_warn(dev, "Failed to read fan speed");
+
+	sd->hwmon = devm_hwmon_device_register_with_info(dev,
+							 "steamdeck",
+							 sd,
+							 &steamdeck_chip_info,
+							 steamdeck_groups);
+	if (IS_ERR(sd->hwmon)) {
+		dev_err(dev, "Failed to register HWMON device");
+		return PTR_ERR(sd->hwmon);
+	}
+
+	sd->regmap = devm_regmap_init(dev, NULL, sd, &regmap_config);
+	if (IS_ERR(sd->regmap))
+		dev_err(dev, "Failed to register REGMAP");
+
+	sd->edev = devm_extcon_dev_allocate(dev, steamdeck_extcon_cable);
+	if (IS_ERR(sd->edev))
+		return -ENOMEM;
+
+	ret = devm_extcon_dev_register(dev, sd->edev);
+	if (ret < 0) {
+		dev_err(dev, "Failed to register extcon device: %d\n", ret);
+		return ret;
+	}
+
+	/*
+	 * Set initial role value
+	 */
+	queue_delayed_work(system_long_wq, &sd->role_work, 0);
+	flush_delayed_work(&sd->role_work);
+
+	status = acpi_install_notify_handler(sd->adev->handle,
+					     ACPI_DEVICE_NOTIFY,
+					     steamdeck_notify,
+					     dev);
+	if (ACPI_FAILURE(status)) {
+		dev_err(dev, "Error installing ACPI notify handler\n");
+		return -EIO;
+	}
+
+	ret = devm_add_action_or_reset(dev, steamdeck_remove_notify_handler,
+				       sd);
+	return ret;
+}
+
+static const struct acpi_device_id steamdeck_device_ids[] = {
+	{ "VLV0100", 0 },
+	{ "", 0 },
+};
+MODULE_DEVICE_TABLE(acpi, steamdeck_device_ids);
+
+static struct platform_driver steamdeck_driver = {
+	.probe = steamdeck_probe,
+	.driver = {
+		.name = "steamdeck",
+		.acpi_match_table = steamdeck_device_ids,
+	},
+};
+module_platform_driver(steamdeck_driver);
+
+MODULE_AUTHOR("Andrey Smirnov <andrew.smirnov@gmail.com>");
+MODULE_DESCRIPTION("Steam Deck ACPI platform driver");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 34f9dba17c1a..4527f319019a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -191,7 +191,7 @@ static inline void __mm_zero_struct_page(struct page *page)
  * that.
  */
 #define MAPCOUNT_ELF_CORE_MARGIN	(5)
-#define DEFAULT_MAX_MAP_COUNT	(USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN)
+#define DEFAULT_MAX_MAP_COUNT	(INT_MAX - MAPCOUNT_ELF_CORE_MARGIN)
 
 extern int sysctl_max_map_count;
 
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 716953ee1ebd..dace360dc38d 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -1181,7 +1181,7 @@ struct readahead_control {
 		._index = i,						\
 	}
 
-#define VM_READAHEAD_PAGES	(SZ_128K / PAGE_SIZE)
+#define VM_READAHEAD_PAGES	(SZ_8M / PAGE_SIZE)
 
 void page_cache_ra_unbounded(struct readahead_control *,
 		unsigned long nr_to_read, unsigned long lookahead_count);
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 45f09bec02c4..87b20e2ee274 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -148,6 +148,8 @@ static inline void set_userns_rlimit_max(struct user_namespace *ns,
 
 #ifdef CONFIG_USER_NS
 
+extern int unprivileged_userns_clone;
+
 static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
 {
 	if (ns)
@@ -181,6 +183,8 @@ extern bool current_in_userns(const struct user_namespace *target_ns);
 struct ns_common *ns_get_owner(struct ns_common *ns);
 #else
 
+#define unprivileged_userns_clone 0
+
 static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
 {
 	return &init_user_ns;
diff --git a/init/Kconfig b/init/Kconfig
index 5e7d4885d1bf..25193a9d5c61 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -123,6 +123,10 @@ config THREAD_INFO_IN_TASK
 
 menu "General setup"
 
+config CACHY
+    bool "Some kernel tweaks by CachyOS"
+    default y
+
 config BROKEN
 	bool
 
@@ -1226,6 +1230,22 @@ config USER_NS
 
 	  If unsure, say N.
 
+config USER_NS_UNPRIVILEGED
+	bool "Allow unprivileged users to create namespaces"
+	default y
+	depends on USER_NS
+	help
+	  When disabled, unprivileged users will not be able to create
+	  new namespaces. Allowing users to create their own namespaces
+	  has been part of several recent local privilege escalation
+	  exploits, so if you need user namespaces but are
+	  paranoid^Wsecurity-conscious you want to disable this.
+
+	  This setting can be overridden at runtime via the
+	  kernel.unprivileged_userns_clone sysctl.
+
+	  If unsure, say Y.
+
 config PID_NS
 	bool "PID Namespaces"
 	default y
@@ -1368,6 +1388,12 @@ config CC_OPTIMIZE_FOR_PERFORMANCE
 	  with the "-O2" compiler flag for best performance and most
 	  helpful compile-time warnings.
 
+config CC_OPTIMIZE_FOR_PERFORMANCE_O3
+	bool "Optimize more for performance (-O3)"
+	help
+	  Choosing this option will pass "-O3" to your compiler to optimize
+	  the kernel yet more for performance.
+
 config CC_OPTIMIZE_FOR_SIZE
 	bool "Optimize for size (-Os)"
 	help
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz
index 38ef6d06888e..0f78364efd4f 100644
--- a/kernel/Kconfig.hz
+++ b/kernel/Kconfig.hz
@@ -40,6 +40,27 @@ choice
 	 on SMP and NUMA systems and exactly dividing by both PAL and
 	 NTSC frame rates for video and multimedia work.
 
+	config HZ_500
+		bool "500 HZ"
+	help
+	 500 Hz is a balanced timer frequency. Provides fast interactivity
+	 on desktops with good smoothness without increasing CPU power
+	 consumption and sacrificing the battery life on laptops.
+
+	config HZ_600
+		bool "600 HZ"
+	help
+	 600 Hz is a balanced timer frequency. Provides fast interactivity
+	 on desktops with good smoothness without increasing CPU power
+	 consumption and sacrificing the battery life on laptops.
+
+	config HZ_750
+		bool "750 HZ"
+	help
+	 750 Hz is a balanced timer frequency. Provides fast interactivity
+	 on desktops with good smoothness without increasing CPU power
+	 consumption and sacrificing the battery life on laptops.
+
 	config HZ_1000
 		bool "1000 HZ"
 	help
@@ -53,6 +74,9 @@ config HZ
 	default 100 if HZ_100
 	default 250 if HZ_250
 	default 300 if HZ_300
+	default 500 if HZ_500
+	default 600 if HZ_600
+	default 750 if HZ_750
 	default 1000 if HZ_1000
 
 config SCHED_HRTICK
diff --git a/kernel/fork.c b/kernel/fork.c
index f81149739eb9..36fb0b711541 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -100,6 +100,10 @@
 #include <linux/user_events.h>
 #include <linux/iommu.h>
 
+#ifdef CONFIG_USER_NS
+#include <linux/user_namespace.h>
+#endif
+
 #include <asm/pgalloc.h>
 #include <linux/uaccess.h>
 #include <asm/mmu_context.h>
@@ -2271,6 +2275,10 @@ __latent_entropy struct task_struct *copy_process(
 	if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS))
 		return ERR_PTR(-EINVAL);
 
+	if ((clone_flags & CLONE_NEWUSER) && !unprivileged_userns_clone)
+		if (!capable(CAP_SYS_ADMIN))
+			return ERR_PTR(-EPERM);
+
 	/*
 	 * Thread groups must share signals as well, and detached threads
 	 * can only be started up within the thread group.
@@ -3424,6 +3432,12 @@ int ksys_unshare(unsigned long unshare_flags)
 	if (unshare_flags & CLONE_NEWNS)
 		unshare_flags |= CLONE_FS;
 
+	if ((unshare_flags & CLONE_NEWUSER) && !unprivileged_userns_clone) {
+		err = -EPERM;
+		if (!capable(CAP_SYS_ADMIN))
+			goto bad_unshare_out;
+	}
+
 	err = check_unshare_flags(unshare_flags);
 	if (err)
 		goto bad_unshare_out;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1d9c2482c5a3..ff3386691626 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -69,9 +69,13 @@
  *
  * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
  */
+#ifdef CONFIG_CACHY
+unsigned int sysctl_sched_latency			= 3000000ULL;
+static unsigned int normalized_sysctl_sched_latency	= 3000000ULL;
+#else
 unsigned int sysctl_sched_latency			= 6000000ULL;
 static unsigned int normalized_sysctl_sched_latency	= 6000000ULL;
-
+#endif
 /*
  * The initial- and re-scaling of tunables is configurable
  *
@@ -90,8 +94,13 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
  *
  * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
  */
+#ifdef CONFIG_CACHY
+unsigned int sysctl_sched_min_granularity			= 400000ULL;
+static unsigned int normalized_sysctl_sched_min_granularity	= 400000ULL;
+#else
 unsigned int sysctl_sched_min_granularity			= 750000ULL;
 static unsigned int normalized_sysctl_sched_min_granularity	= 750000ULL;
+#endif
 
 /*
  * Minimal preemption granularity for CPU-bound SCHED_IDLE tasks.
@@ -121,8 +130,13 @@ unsigned int sysctl_sched_child_runs_first __read_mostly;
  *
  * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
  */
+#ifdef CONFIG_CACHY
+unsigned int sysctl_sched_wakeup_granularity			= 500000UL;
+static unsigned int normalized_sysctl_sched_wakeup_granularity	= 500000UL;
+#else
 unsigned int sysctl_sched_wakeup_granularity			= 1000000UL;
 static unsigned int normalized_sysctl_sched_wakeup_granularity	= 1000000UL;
+#endif
 
 const_debug unsigned int sysctl_sched_migration_cost	= 500000UL;
 
@@ -175,8 +189,12 @@ int __weak arch_asym_cpu_priority(int cpu)
  *
  * (default: 5 msec, units: microseconds)
  */
+#ifdef CONFIG_CACHY
+static unsigned int sysctl_sched_cfs_bandwidth_slice		= 3000UL;
+#else
 static unsigned int sysctl_sched_cfs_bandwidth_slice		= 5000UL;
 #endif
+#endif
 
 #ifdef CONFIG_NUMA_BALANCING
 /* Restrict the NUMA promotion throughput (MB/s) for each target node. */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 354a2d294f52..4dc780aa3bcc 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -95,6 +95,9 @@ EXPORT_SYMBOL_GPL(sysctl_long_vals);
 #ifdef CONFIG_PERF_EVENTS
 static const int six_hundred_forty_kb = 640 * 1024;
 #endif
+#ifdef CONFIG_USER_NS
+#include <linux/user_namespace.h>
+#endif
 
 
 static const int ngroups_max = NGROUPS_MAX;
@@ -1623,6 +1626,15 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
+#ifdef CONFIG_USER_NS
+	{
+		.procname	= "unprivileged_userns_clone",
+		.data		= &unprivileged_userns_clone,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+#endif
 #ifdef CONFIG_PROC_SYSCTL
 	{
 		.procname	= "tainted",
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 1d8e47bed3f1..fec01d016a35 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -22,6 +22,13 @@
 #include <linux/bsearch.h>
 #include <linux/sort.h>
 
+/* sysctl */
+#ifdef CONFIG_USER_NS_UNPRIVILEGED
+int unprivileged_userns_clone = 1;
+#else
+int unprivileged_userns_clone;
+#endif
+
 static struct kmem_cache *user_ns_cachep __read_mostly;
 static DEFINE_MUTEX(userns_state_mutex);
 
diff --git a/lib/scatterlist.c b/lib/scatterlist.c
index c65566b4dc66..d3c8aaa68c5d 100644
--- a/lib/scatterlist.c
+++ b/lib/scatterlist.c
@@ -150,31 +150,12 @@ EXPORT_SYMBOL(sg_init_one);
  */
 static struct scatterlist *sg_kmalloc(unsigned int nents, gfp_t gfp_mask)
 {
-	if (nents == SG_MAX_SINGLE_ALLOC) {
-		/*
-		 * Kmemleak doesn't track page allocations as they are not
-		 * commonly used (in a raw form) for kernel data structures.
-		 * As we chain together a list of pages and then a normal
-		 * kmalloc (tracked by kmemleak), in order to for that last
-		 * allocation not to become decoupled (and thus a
-		 * false-positive) we need to inform kmemleak of all the
-		 * intermediate allocations.
-		 */
-		void *ptr = (void *) __get_free_page(gfp_mask);
-		kmemleak_alloc(ptr, PAGE_SIZE, 1, gfp_mask);
-		return ptr;
-	} else
-		return kmalloc_array(nents, sizeof(struct scatterlist),
-				     gfp_mask);
+	return kmalloc_array(nents, sizeof(struct scatterlist), gfp_mask);
 }
 
 static void sg_kfree(struct scatterlist *sg, unsigned int nents)
 {
-	if (nents == SG_MAX_SINGLE_ALLOC) {
-		kmemleak_free(sg);
-		free_page((unsigned long) sg);
-	} else
-		kfree(sg);
+	kfree(sg);
 }
 
 /**
diff --git a/mm/Kconfig b/mm/Kconfig
index 09130434e30d..f772ba88df87 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -631,7 +631,7 @@ config COMPACTION
 config COMPACT_UNEVICTABLE_DEFAULT
 	int
 	depends on COMPACTION
-	default 0 if PREEMPT_RT
+	default 0 if PREEMPT_RT || CACHY
 	default 1
 
 #
diff --git a/mm/internal.h b/mm/internal.h
index 8ed127c1c808..2f3040ec707d 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -421,6 +421,7 @@ extern void prep_compound_page(struct page *page, unsigned int order);
 extern void post_alloc_hook(struct page *page, unsigned int order,
 					gfp_t gfp_flags);
 extern int user_min_free_kbytes;
+extern atomic_long_t kswapd_waiters;
 
 extern void free_unref_page(struct page *page, unsigned int order);
 extern void free_unref_page_list(struct list_head *list);
diff --git a/mm/list_lru.c b/mm/list_lru.c
index a05e5bef3b40..0ead8e6651df 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -178,6 +178,7 @@ EXPORT_SYMBOL_GPL(list_lru_isolate_move);
 unsigned long list_lru_count_one(struct list_lru *lru,
 				 int nid, struct mem_cgroup *memcg)
 {
+#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
 	struct list_lru_one *l;
 	long count;
 
@@ -190,6 +191,9 @@ unsigned long list_lru_count_one(struct list_lru *lru,
 		count = 0;
 
 	return count;
+#else
+	return READ_ONCE(lru->node[nid].lru.nr_items);
+#endif
 }
 EXPORT_SYMBOL_GPL(list_lru_count_one);
 
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index d3f42009bb70..39b9fd060630 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -71,7 +71,11 @@ static long ratelimit_pages = 32;
 /*
  * Start background writeback (via writeback threads) at this percentage
  */
+#ifdef CONFIG_CACHY
+static int dirty_background_ratio = 5;
+#else
 static int dirty_background_ratio = 10;
+#endif
 
 /*
  * dirty_background_bytes starts at 0 (disabled) so that it is a function of
@@ -99,7 +103,11 @@ static unsigned long vm_dirty_bytes;
 /*
  * The interval between `kupdate'-style writebacks
  */
+#ifdef CONFIG_CACHY
+unsigned int dirty_writeback_interval = 10 * 100; /* centiseconds */
+#else
 unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */
+#endif
 
 EXPORT_SYMBOL_GPL(dirty_writeback_interval);
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d322bfae8f69..c67954222af5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -204,6 +204,8 @@ EXPORT_SYMBOL(node_states);
 
 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
 
+atomic_long_t kswapd_waiters = ATOMIC_LONG_INIT(0);
+
 /*
  * A cached value of the page's pageblock's migratetype, used when the page is
  * put on a pcplist. Used to avoid the pageblock migratetype lookup when
@@ -297,7 +299,7 @@ static compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = {
 
 int min_free_kbytes = 1024;
 int user_min_free_kbytes = -1;
-static int watermark_boost_factor __read_mostly = 15000;
+static int watermark_boost_factor __read_mostly;
 static int watermark_scale_factor = 10;
 
 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
@@ -2152,16 +2154,17 @@ __rmqueue(struct zone *zone, unsigned int order, int migratetype,
 }
 
 /*
- * Obtain a specified number of elements from the buddy allocator, all under
- * a single hold of the lock, for efficiency.  Add them to the supplied list.
- * Returns the number of new pages which were placed at *list.
+ * Obtain a specified number of elements from the buddy allocator, and relax the
+ * zone lock when needed. Add them to the supplied list. Returns the number of
+ * new pages which were placed at *list.
  */
 static int rmqueue_bulk(struct zone *zone, unsigned int order,
 			unsigned long count, struct list_head *list,
 			int migratetype, unsigned int alloc_flags)
 {
+	const bool can_resched = !preempt_count() && !irqs_disabled();
 	unsigned long flags;
-	int i;
+	int i, last_mod = 0;
 
 	spin_lock_irqsave(&zone->lock, flags);
 	for (i = 0; i < count; ++i) {
@@ -2170,6 +2173,18 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 		if (unlikely(page == NULL))
 			break;
 
+		/* Reschedule and ease the contention on the lock if needed */
+		if (i + 1 < count && ((can_resched && need_resched()) ||
+				      spin_needbreak(&zone->lock))) {
+			__mod_zone_page_state(zone, NR_FREE_PAGES,
+					      -((i + 1 - last_mod) << order));
+			last_mod = i + 1;
+			spin_unlock_irqrestore(&zone->lock, flags);
+			if (can_resched)
+				cond_resched();
+			spin_lock_irqsave(&zone->lock, flags);
+		}
+
 		/*
 		 * Split buddy pages returned by expand() are received here in
 		 * physical page order. The page is added to the tail of
@@ -2186,7 +2201,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 					      -(1 << order));
 	}
 
-	__mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
+	__mod_zone_page_state(zone, NR_FREE_PAGES, -((i - last_mod) << order));
 	spin_unlock_irqrestore(&zone->lock, flags);
 
 	return i;
@@ -3962,6 +3977,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	unsigned int cpuset_mems_cookie;
 	unsigned int zonelist_iter_cookie;
 	int reserve_flags;
+	bool woke_kswapd = false;
 
 restart:
 	compaction_retries = 0;
@@ -4001,8 +4017,13 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 			goto nopage;
 	}
 
-	if (alloc_flags & ALLOC_KSWAPD)
+	if (alloc_flags & ALLOC_KSWAPD) {
+		if (!woke_kswapd) {
+			atomic_long_inc(&kswapd_waiters);
+			woke_kswapd = true;
+		}
 		wake_all_kswapds(order, gfp_mask, ac);
+	}
 
 	/*
 	 * The adjusted alloc_flags might result in immediate success, so try
@@ -4217,9 +4238,12 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 		goto retry;
 	}
 fail:
-	warn_alloc(gfp_mask, ac->nodemask,
-			"page allocation failure: order:%u", order);
 got_pg:
+	if (woke_kswapd)
+		atomic_long_dec(&kswapd_waiters);
+	if (!page)
+		warn_alloc(gfp_mask, ac->nodemask,
+				"page allocation failure: order:%u", order);
 	return page;
 }
 
diff --git a/mm/swap.c b/mm/swap.c
index cd8f0150ba3a..42c405a4f114 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -1090,6 +1090,10 @@ void folio_batch_remove_exceptionals(struct folio_batch *fbatch)
  */
 void __init swap_setup(void)
 {
+#ifdef CONFIG_CACHY
+	/* Only swap-in pages requested, avoid readahead */
+	page_cluster = 0;
+#else
 	unsigned long megs = totalram_pages() >> (20 - PAGE_SHIFT);
 
 	/* Use a smaller cluster for small-memory machines */
@@ -1101,4 +1105,5 @@ void __init swap_setup(void)
 	 * Right now other parts of the system means that we
 	 * _really_ don't want to cluster much more
 	 */
+#endif
 }
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
index 22c6689d9302..bf65bd9abdf3 100644
--- a/mm/vmpressure.c
+++ b/mm/vmpressure.c
@@ -43,7 +43,11 @@ static const unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16;
  * essence, they are percents: the higher the value, the more number
  * unsuccessful reclaims there were.
  */
+#ifdef CONFIG_CACHY
+static const unsigned int vmpressure_level_med = 65;
+#else
 static const unsigned int vmpressure_level_med = 60;
+#endif
 static const unsigned int vmpressure_level_critical = 95;
 
 /*
diff --git a/mm/vmscan.c b/mm/vmscan.c
index da152407bc2b..a958dc4e0024 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -186,7 +186,11 @@ struct scan_control {
 /*
  * From 0 .. 200.  Higher means more swappy.
  */
+#ifdef CONFIG_CACHY
+int vm_swappiness = 20;
+#else
 int vm_swappiness = 60;
+#endif
 
 LIST_HEAD(shrinker_list);
 DECLARE_RWSEM(shrinker_rwsem);
@@ -4595,7 +4599,11 @@ static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc
 }
 
 /* to protect the working set of the last N jiffies */
+#ifdef CONFIG_CACHY
+static unsigned long lru_gen_min_ttl __read_mostly = HZ;
+#else
 static unsigned long lru_gen_min_ttl __read_mostly;
+#endif
 
 static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
 {
@@ -6908,7 +6916,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 	return 0;
 }
 
-static bool allow_direct_reclaim(pg_data_t *pgdat)
+static bool allow_direct_reclaim(pg_data_t *pgdat, bool using_kswapd)
 {
 	struct zone *zone;
 	unsigned long pfmemalloc_reserve = 0;
@@ -6937,6 +6945,10 @@ static bool allow_direct_reclaim(pg_data_t *pgdat)
 
 	wmark_ok = free_pages > pfmemalloc_reserve / 2;
 
+	/* The throttled direct reclaimer is now a kswapd waiter */
+	if (unlikely(!using_kswapd && !wmark_ok))
+		atomic_long_inc(&kswapd_waiters);
+
 	/* kswapd must be awake if processes are being throttled */
 	if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
 		if (READ_ONCE(pgdat->kswapd_highest_zoneidx) > ZONE_NORMAL)
@@ -7002,7 +7014,7 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
 
 		/* Throttle based on the first usable node */
 		pgdat = zone->zone_pgdat;
-		if (allow_direct_reclaim(pgdat))
+		if (allow_direct_reclaim(pgdat, gfp_mask & __GFP_KSWAPD_RECLAIM))
 			goto out;
 		break;
 	}
@@ -7024,11 +7036,14 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
 	 */
 	if (!(gfp_mask & __GFP_FS))
 		wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
-			allow_direct_reclaim(pgdat), HZ);
+			allow_direct_reclaim(pgdat, true), HZ);
 	else
 		/* Throttle until kswapd wakes the process */
 		wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
-			allow_direct_reclaim(pgdat));
+			allow_direct_reclaim(pgdat, true));
+
+	if (unlikely(!(gfp_mask & __GFP_KSWAPD_RECLAIM)))
+		atomic_long_dec(&kswapd_waiters);
 
 	if (fatal_signal_pending(current))
 		return true;
@@ -7526,14 +7541,15 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
 		 * able to safely make forward progress. Wake them
 		 */
 		if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
-				allow_direct_reclaim(pgdat))
+				allow_direct_reclaim(pgdat, true))
 			wake_up_all(&pgdat->pfmemalloc_wait);
 
 		/* Check if kswapd should be suspending */
 		__fs_reclaim_release(_THIS_IP_);
 		ret = try_to_freeze();
 		__fs_reclaim_acquire(_THIS_IP_);
-		if (ret || kthread_should_stop())
+		if (ret || kthread_should_stop() ||
+		    !atomic_long_read(&kswapd_waiters))
 			break;
 
 		/*
-- 
2.42.0

From 215c1496035a68adae4f992d995d0ccd58412e91 Mon Sep 17 00:00:00 2001
From: Peter Jung <admin@ptr1337.dev>
Date: Sat, 23 Sep 2023 13:08:08 +0200
Subject: [PATCH 5/7] fixes

Signed-off-by: Peter Jung <admin@ptr1337.dev>
---
 Documentation/ABI/stable/sysfs-block          |   10 +
 .../testing/sysfs-class-led-trigger-blkdev    |   78 ++
 Documentation/leds/index.rst                  |    1 +
 Documentation/leds/ledtrig-blkdev.rst         |  158 +++
 drivers/bluetooth/btusb.c                     |    2 +-
 drivers/leds/trigger/Kconfig                  |    9 +
 drivers/leds/trigger/Makefile                 |    1 +
 drivers/leds/trigger/ledtrig-blkdev.c         | 1218 +++++++++++++++++
 .../net/wireless/mediatek/mt76/mt7921/init.c  |    9 +-
 fs/btrfs/extent-tree.c                        |   61 +-
 fs/btrfs/extent-tree.h                        |   13 +-
 include/linux/pageblock-flags.h               |    2 +-
 kernel/padata.c                               |    4 +-
 scripts/Makefile.vmlinux_o                    |    2 +-
 sound/pci/hda/cs35l41_hda.c                   |    2 +-
 sound/pci/hda/patch_realtek.c                 |    1 +
 16 files changed, 1506 insertions(+), 65 deletions(-)
 create mode 100644 Documentation/ABI/testing/sysfs-class-led-trigger-blkdev
 create mode 100644 Documentation/leds/ledtrig-blkdev.rst
 create mode 100644 drivers/leds/trigger/ledtrig-blkdev.c

diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block
index c57e5b7cb532..2d1df6c9b463 100644
--- a/Documentation/ABI/stable/sysfs-block
+++ b/Documentation/ABI/stable/sysfs-block
@@ -101,6 +101,16 @@ Description:
 		devices that support receiving integrity metadata.
 
 
+What:		/sys/block/<disk>/linked_leds
+Date:		January 2023
+Contact:	Ian Pilcher <arequipeno@gmail.com>
+Description:
+		Directory that contains symbolic links to all LEDs that
+		are associated with (linked to) this block device by the
+		blkdev LED trigger.  Only present when at least one LED
+		is linked.  (See Documentation/leds/ledtrig-blkdev.rst.)
+
+
 What:		/sys/block/<disk>/<partition>/alignment_offset
 Date:		April 2009
 Contact:	Martin K. Petersen <martin.petersen@oracle.com>
diff --git a/Documentation/ABI/testing/sysfs-class-led-trigger-blkdev b/Documentation/ABI/testing/sysfs-class-led-trigger-blkdev
new file mode 100644
index 000000000000..28ce8c814fb7
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-class-led-trigger-blkdev
@@ -0,0 +1,78 @@
+What:		/sys/class/leds/<led>/blink_time
+Date:		January 2023
+Contact:	Ian Pilcher <arequipeno@gmail.com>
+Description:
+		Time (in milliseconds) that the LED will be on during a single
+		"blink".
+
+What:		/sys/class/leds/<led>/check_interval
+Date:		January 2023
+Contact:	Ian Pilcher <arequipeno@gmail.com>
+Description:
+		Interval (in milliseconds) between checks of the block devices
+		linked to this LED.  The LED will be blinked if the correct type
+		of activity (see blink_on_{read,write,discard,flush} attributes)
+		has occurred on any of the linked devices since the previous
+		check.
+
+What:		/sys/class/leds/<led>/blink_on_read
+Date:		January 2023
+Contact:	Ian Pilcher <arequipeno@gmail.com>
+Description:
+		Boolean that determines whether the LED will blink in response
+		to read activity on any of its linked block devices.
+
+What:		/sys/class/leds/<led>/blink_on_write
+Date:		January 2023
+Contact:	Ian Pilcher <arequipeno@gmail.com>
+Description:
+		Boolean that determines whether the LED will blink in response
+		to write activity on any of its linked block devices.
+
+What:		/sys/class/leds/<led>/blink_on_discard
+Date:		January 2023
+Contact:	Ian Pilcher <arequipeno@gmail.com>
+Description:
+		Boolean that determines whether the LED will blink in response
+		to discard activity on any of its linked block devices.
+
+What:		/sys/class/leds/<led>/blink_on_flush
+Date:		January 2023
+Contact:	Ian Pilcher <arequipeno@gamil.com>
+Description:
+		Boolean that determines whether the LED will blink in response
+		to cache flush activity on any of its linked block devices.
+
+What:		/sys/class/leds/<led>/link_dev_by_path
+Date:		January 2023
+Contact:	Ian Pilcher <arequipeno@gmail.com>
+Description:
+		Associate a block device with this LED by writing the path to
+		the device special file (e.g. /dev/sda) to this attribute.
+		Symbolic links are followed.
+
+What:		/sys/class/leds/<led>/unlink_dev_by_path
+Date:		January 2023
+Contact:	Ian Pilcher <arequipeno@gmail.com>
+Description:
+		Remove the association between this LED and a block device by
+		writing the path to the device special file (e.g. /dev/sda) to
+		this attribute.  Symbolic links are followed.
+
+What:		/sys/class/leds/<led>/unlink_dev_by_name
+Date:		January 2023
+Contact:	Ian Pilcher <arequipeno@gmail.com>
+Description:
+		Remove the association between this LED and a block device by
+		writing the kernel name of the device (e.g. sda) to this
+		attribute.
+
+What:		/sys/class/leds/<led>/linked_devices
+Date:		January 2023
+Contact:	Ian Pilcher <arequipeno@gmail.com>
+Description:
+		Directory containing links to all block devices that are
+		associated with this LED.  (Note that the names of the
+		symbolic links in this directory are *kernel* names, which
+		may not match the device special file paths written to
+		link_device and unlink_device.)
diff --git a/Documentation/leds/index.rst b/Documentation/leds/index.rst
index 3ade16c18328..3fd55a2cbfb5 100644
--- a/Documentation/leds/index.rst
+++ b/Documentation/leds/index.rst
@@ -10,6 +10,7 @@ LEDs
    leds-class
    leds-class-flash
    leds-class-multicolor
+   ledtrig-blkdev
    ledtrig-oneshot
    ledtrig-transient
    ledtrig-usbport
diff --git a/Documentation/leds/ledtrig-blkdev.rst b/Documentation/leds/ledtrig-blkdev.rst
new file mode 100644
index 000000000000..9ff5b99de451
--- /dev/null
+++ b/Documentation/leds/ledtrig-blkdev.rst
@@ -0,0 +1,158 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=================================
+Block Device (blkdev) LED Trigger
+=================================
+
+Available when ``CONFIG_LEDS_TRIGGER_BLKDEV=y`` or
+``CONFIG_LEDS_TRIGGER_BLKDEV=m``.
+
+See also:
+
+* ``Documentation/ABI/testing/sysfs-class-led-trigger-blkdev``
+* ``Documentation/ABI/stable/sysfs-block`` (``/sys/block/<disk>/linked_leds``)
+
+Overview
+========
+
+.. note::
+	The examples below use ``<LED>`` to refer to the name of a
+	system-specific LED.  If no suitable LED is available on a test
+	system (in a virtual machine, for example), it is possible to
+	use a userspace LED.  (See ``Documentation/leds/uleds.rst``.)
+
+Verify that the ``blkdev`` LED trigger is available::
+
+	# grep blkdev /sys/class/leds/<LED>/trigger
+	... rfkill-none blkdev
+
+(If the previous command produces no output, you may need to load the trigger
+module - ``modprobe ledtrig_blkdev``.  If the module is not available, check
+the value of ``CONFIG_LEDS_TRIGGER_BLKDEV`` in your kernel configuration.)
+
+Associate the LED with the ``blkdev`` LED trigger::
+
+	# echo blkdev > /sys/class/leds/<LED>/trigger
+
+	# cat /sys/class/leds/<LED>/trigger
+	... rfkill-none [blkdev]
+
+Note that several new device attributes are available in the
+``/sys/class/leds/<LED>`` directory.
+
+* ``link_dev_by_path``, ``unlink_dev_by_path``, and ``unlink_dev_by_name`` are
+  used to manage the set of block devices associated with this LED.  The LED
+  will blink when activity occurs on any of its linked devices.
+
+* ``blink_on_read``, ``blink_on_write``, ``blink_on_discard``, and
+  ``blink_on_flush`` are boolean values that determine whether the LED will
+  blink when a particular type of activity is detected on one of its linked
+  block devices.
+
+* ``blink_time`` is the duration (in milliseconds) of each blink of this LED.
+  (The minimum value is 10 milliseconds.)
+
+* ``check_interval`` is the frequency (in milliseconds) with which block devices
+  linked to this LED will be checked for activity and the LED blinked (if the
+  correct type of activity has occurred).
+
+* The ``linked_devices`` directory will contain a symbolic link to every device
+  that is associated with this LED.
+
+Link a block device to the LED::
+
+	# echo /dev/sda > /sys/class/leds/<LED>/link_dev_by_path
+
+	# ls /sys/class/leds/<LED>/linked_devices
+	sda
+
+(The value written to ``link_dev_by_path`` must be the path of the device
+special file, such as ``/dev/sda``, that represents the block device - or the
+path of a symbolic link to such a device special file.)
+
+Activity on the device will now cause the LED to blink.  The duration of each
+blink (in milliseconds) can be adjusted by setting
+``/sys/class/leds/<LED>/blink_time``.  (But see **check_interval and
+blink_time** below.)
+
+Associate a second device with the LED::
+
+	# echo /dev/sdb > /sys/class/leds/<LED>/link_dev_by_path
+
+	# ls /sys/class/leds/<LED>/linked_devices
+	sda  sdb
+
+When a block device is linked to one or more LEDs, the LEDs are linked from
+the device's ``linked_leds`` directory::
+
+	# ls /sys/class/block/sd{a,b}/linked_leds
+	/sys/class/block/sda/linked_leds:
+	<LED>
+
+	/sys/class/block/sdb/linked_leds:
+	<LED>
+
+(The ``linked_leds`` directory only exists when the block device is linked to
+at least one LED.)
+
+``check_interval`` and ``blink_time``
+=====================================
+
+* By default, linked block devices are checked for activity every 100
+  milliseconds.  This frequency can be changed for an LED via the
+  ``/sys/class/leds/<led>/check_interval`` attribute.  (The minimum value is 25
+  milliseconds.)
+
+* All block devices associated with an LED are checked for activity every
+  ``check_interval`` milliseconds, and a blink is triggered if the correct type
+  of activity (as determined by the LED's ``blink_on_*`` attributes) is
+  detected.  The duration of an LED's blink is determined by its ``blink_time``
+  attribute.  Thus (when the correct type of activity is detected), the LED will
+  be on for ``blink_time`` milliseconds and off for
+  ``check_interval - blink_time`` milliseconds.
+
+* The LED subsystem ignores new blink requests for an LED that is already in
+  in the process of blinking, so setting a ``blink_time`` greater than or equal
+  to ``check_interval`` will cause some blinks to be missed.
+
+* Because of processing times, scheduling latencies, etc., avoiding missed
+  blinks actually requires a difference of at least a few milliseconds between
+  the ``blink_time`` and ``check_interval``.  The required difference is likely
+  to vary from system to system.  As a  reference, a Thecus N5550 NAS requires a
+  difference of 7 milliseconds (e.g. ``check_interval == 100``,
+  ``blink_time == 93``).
+
+* The default values (``check_interval == 100``, ``blink_time == 75``) cause the
+  LED associated with a continuously active device to blink rapidly.  For a more
+  "always on" effect, increase the ``blink_time`` (but not too much; see the
+  previous bullet).
+
+Other Notes
+===========
+
+* Many (possibly all) types of block devices work with this trigger, including:
+
+  * SCSI (including SATA and USB) hard disk drives and SSDs
+  * SCSI (including SATA and USB) optical drives
+  * NVMe SSDs
+  * SD cards
+  * loopback block devices (``/dev/loop*``)
+  * device mapper devices, such as LVM logical volumes
+  * MD RAID devices
+  * zRAM compressed RAM-disks
+  * partitions on block devices that support them
+
+* The names of the symbolic links in ``/sys/class/leds/<LED>/linked_devices``
+  are **kernel** names, which may not match the paths used for
+  ``link_dev_by_path`` and ``unlink_dev_by_path``.  This is most likely when a
+  symbolic link is used to refer to the device (as is common with logical
+  volumes), but it can be true for any device, because nothing prevents the
+  creation of device special files with arbitrary names (e.g.
+  ``sudo mknod /foo b 8 0``).
+
+  Kernel names can be used to unlink block devices from LEDs by writing them to
+  the LED's ``unlink_dev_by_name`` attribute.
+
+* The ``blkdev`` LED trigger supports many-to-many device/LED associations.
+  A device can be associated with multiple LEDs, and an LED can be associated
+  with multiple devices.
diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c
index ca9e2a210fff..1fc271adce26 100644
--- a/drivers/bluetooth/btusb.c
+++ b/drivers/bluetooth/btusb.c
@@ -960,7 +960,7 @@ static void btusb_qca_cmd_timeout(struct hci_dev *hdev)
 		}
 
 		gpiod_set_value_cansleep(reset_gpio, 0);
-		msleep(200);
+		usleep_range(USEC_PER_SEC / 2, USEC_PER_SEC);
 		gpiod_set_value_cansleep(reset_gpio, 1);
 
 		return;
diff --git a/drivers/leds/trigger/Kconfig b/drivers/leds/trigger/Kconfig
index 2a57328eca20..05e80cfd0ed8 100644
--- a/drivers/leds/trigger/Kconfig
+++ b/drivers/leds/trigger/Kconfig
@@ -155,4 +155,13 @@ config LEDS_TRIGGER_TTY
 
 	  When build as a module this driver will be called ledtrig-tty.
 
+config LEDS_TRIGGER_BLKDEV
+	tristate "LED Trigger for block devices"
+	depends on BLOCK
+	help
+	  The blkdev LED trigger allows LEDs to be controlled by block device
+	  activity (reads and writes).
+
+	  See Documentation/leds/ledtrig-blkdev.rst.
+
 endif # LEDS_TRIGGERS
diff --git a/drivers/leds/trigger/Makefile b/drivers/leds/trigger/Makefile
index 25c4db97cdd4..d53bab5d93f1 100644
--- a/drivers/leds/trigger/Makefile
+++ b/drivers/leds/trigger/Makefile
@@ -16,3 +16,4 @@ obj-$(CONFIG_LEDS_TRIGGER_NETDEV)	+= ledtrig-netdev.o
 obj-$(CONFIG_LEDS_TRIGGER_PATTERN)	+= ledtrig-pattern.o
 obj-$(CONFIG_LEDS_TRIGGER_AUDIO)	+= ledtrig-audio.o
 obj-$(CONFIG_LEDS_TRIGGER_TTY)		+= ledtrig-tty.o
+obj-$(CONFIG_LEDS_TRIGGER_BLKDEV)	+= ledtrig-blkdev.o
diff --git a/drivers/leds/trigger/ledtrig-blkdev.c b/drivers/leds/trigger/ledtrig-blkdev.c
new file mode 100644
index 000000000000..9e0c4b66ea27
--- /dev/null
+++ b/drivers/leds/trigger/ledtrig-blkdev.c
@@ -0,0 +1,1218 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+/*
+ *	Block device LED trigger
+ *
+ *	Copyright 2021-2023 Ian Pilcher <arequipeno@gmail.com>
+ */
+
+#include <linux/blkdev.h>
+#include <linux/leds.h>
+#include <linux/module.h>
+#include <linux/part_stat.h>
+#include <linux/xarray.h>
+
+/**
+ * DOC: Overview
+ *
+ * The ``blkdev`` LED trigger works by periodically checking the activity
+ * counters of block devices that have been linked to one or more LEDs and
+ * blinking those LED(s) if the correct type of activity has occurred.  The
+ * periodic check is scheduled with the Linux kernel's deferred work facility.
+ *
+ * Trigger-specific data about block devices and LEDs is stored in two data
+ * structures --- &struct blkdev_trig_bdev (a "BTB") and &struct blkdev_trig_led
+ * (a "BTL").  Each structure contains a &struct xarray that holds links to any
+ * linked devices of the other type.  I.e. &blkdev_trig_bdev.linked_btls
+ * contains links to all BTLs whose LEDs have been linked to the BTB's block
+ * device, and &blkdev_trig_led.linked_btbs contains links to all BTBs whose
+ * block devices have been linked to the BTL's LED.  Thus, a block device can
+ * be linked to more than one LED, and an LED can be linked to more than one
+ * block device.
+ */
+
+/* Default, minimum & maximum blink duration (milliseconds) */
+#define BLKDEV_TRIG_BLINK_DEF	75
+#define BLKDEV_TRIG_BLINK_MIN	10
+#define BLKDEV_TRIG_BLINK_MAX	86400000  /* 24 hours */
+
+/* Default, minimum & maximum activity check interval (milliseconds) */
+#define BLKDEV_TRIG_CHECK_DEF	100
+#define BLKDEV_TRIG_CHECK_MIN	25
+#define BLKDEV_TRIG_CHECK_MAX	86400000  /* 24 hours */
+
+/*
+ * If blkdev_trig_check() can't lock the mutex, how long to wait before trying
+ * again (milliseconds)
+ */
+#define BLKDEV_TRIG_CHECK_RETRY	5
+
+/**
+ * struct blkdev_trig_bdev - Trigger-specific data about a block device.
+ * @last_checked:	Time (in jiffies) at which the trigger last checked this
+ *			block device for activity.
+ * @last_activity:	Time (in jiffies) at which the trigger last detected
+ *			activity of each type.
+ * @ios:		Activity counter values for each type, corresponding to
+ *			the timestamps in &last_activity.
+ * @index:		&xarray index, so the BTB can be included in one or more
+ *			&blkdev_trig_led.linked_btbs.
+ * @bdev:		The block device.
+ * @linked_btls:	The BTLs that represent the LEDs linked to the BTB's
+ *			block device.
+ *
+ * Every block device linked to at least one LED gets a "BTB."  A BTB is created
+ * when a block device that is not currently linked to any LEDs is linked to an
+ * LED.
+ *
+ * A BTB is freed when one of the following occurs:
+ *
+ * * The number of LEDs linked to the block device becomes zero, because it has
+ *   been unlinked from its last LED using the trigger's &sysfs interface.
+ *
+ * * The number of LEDs linked to the block device becomes zero, because the
+ *   last LED to which it was linked has been disassociated from the trigger
+ *   (which happens automatically if the LED device is removed from the system).
+ *
+ * * The BTB's block device is removed from the system.  To accomodate this
+ *   scenario, BTB's are created as device resources, so that the release
+ *   function will be called by the driver core when the device is removed.
+ */
+struct blkdev_trig_bdev {
+	unsigned long		last_checked;
+	unsigned long		last_activity[NR_STAT_GROUPS];
+	unsigned long		ios[NR_STAT_GROUPS];
+	unsigned long		index;
+	struct block_device	*bdev;
+	struct xarray		linked_btls;
+};
+
+/**
+ * struct blkdev_trig_led - Trigger-specific data about an LED.
+ * @last_checked:	Time (in jiffies) at which the trigger last checked the
+ *			the block devices linked to this LED for activity.
+ * @index:		&xarray index, so the BTL can be included in one or more
+ *			&blkdev_trig_bdev.linked_btls.
+ * @mode:		Bitmask for types of block device activity that will
+ *			cause this LED to blink --- reads, writes, discards,
+ *			etc.
+ * @led:		The LED device.
+ * @blink_msec:		Duration of a blink (milliseconds).
+ * @check_jiffies:	Frequency with which block devices linked to this LED
+ *			should be checked for activity (jiffies).
+ * @linked_btbs:	The BTBs that represent the block devices linked to the
+ *			BTL's LED.
+ * @all_btls_node:	The BTL's node in the module's list of all BTLs.
+ *
+ * Every LED associated with the block device trigger gets a "BTL."  A BTL is
+ * created when the trigger is "activated" on an LED (usually by writing
+ * ``blkdev`` to the LED's &sysfs &trigger attribute).  A BTL is freed wnen its
+ * LED is disassociated from the trigger, either through the trigger's &sysfs
+ * interface or because the LED device is removed from the system.
+ */
+struct blkdev_trig_led {
+	unsigned long		last_checked;
+	unsigned long		index;
+	unsigned long		mode;  /* must be ulong for atomic bit ops */
+	struct led_classdev	*led;
+	unsigned int		blink_msec;
+	unsigned int		check_jiffies;
+	struct xarray		linked_btbs;
+	struct hlist_node	all_btls_node;
+};
+
+/* Protects everything except atomic LED attributes */
+static DEFINE_MUTEX(blkdev_trig_mutex);
+
+/* BTB device resource release function */
+static void blkdev_trig_btb_release(struct device *dev, void *res);
+
+/* Index for next BTB or BTL */
+static unsigned long blkdev_trig_next_index;
+
+/* All LEDs associated with the trigger */
+static HLIST_HEAD(blkdev_trig_all_btls);
+
+/* Delayed work to periodically check for activity & blink LEDs */
+static void blkdev_trig_check(struct work_struct *work);
+static DECLARE_DELAYED_WORK(blkdev_trig_work, blkdev_trig_check);
+
+/* When is the delayed work scheduled to run next (jiffies) */
+static unsigned long blkdev_trig_next_check;
+
+/* Total number of BTB-to-BTL links */
+static unsigned int blkdev_trig_link_count;
+
+/* Empty sysfs attribute list for next 2 declarations */
+static struct attribute *blkdev_trig_attrs_empty[] = { NULL };
+
+/* linked_leds sysfs directory for block devs linked to 1 or more LEDs */
+static const struct attribute_group blkdev_trig_linked_leds = {
+	.name	= "linked_leds",
+	.attrs	= blkdev_trig_attrs_empty,
+};
+
+/* linked_devices sysfs directory for each LED associated with the trigger */
+static const struct attribute_group blkdev_trig_linked_devs = {
+	.name	= "linked_devices",
+	.attrs	= blkdev_trig_attrs_empty,
+};
+
+
+/*
+ *
+ *	Delayed work to check for activity & blink LEDs
+ *
+ */
+
+/**
+ * blkdev_trig_blink() - Blink an LED, if the correct type of activity has
+ *	occurred on the block device.
+ * @btl:	The BTL that represents the LED
+ * @btb:	The BTB that represents the block device
+ *
+ * Context:	Process context.  Caller must hold &blkdev_trig_mutex.
+ * Return:	&true if the LED is blinked, &false if not.
+ */
+static bool blkdev_trig_blink(const struct blkdev_trig_led *btl,
+			      const struct blkdev_trig_bdev *btb)
+{
+	unsigned long mode, mask, delay_on, delay_off;
+	enum stat_group i;
+
+	mode = READ_ONCE(btl->mode);
+
+	for (i = STAT_READ, mask = 1; i <= STAT_FLUSH; ++i, mask <<= 1) {
+
+		if (!(mode & mask))
+			continue;
+
+		if (time_before_eq(btb->last_activity[i], btl->last_checked))
+			continue;
+
+		delay_on = READ_ONCE(btl->blink_msec);
+		delay_off = 1;	/* 0 leaves LED turned on */
+
+		led_blink_set_oneshot(btl->led, &delay_on, &delay_off, 0);
+		return true;
+	}
+
+	return false;
+}
+
+/**
+ * blkdev_trig_update_btb() - Update a BTB's activity counters and timestamps.
+ * @btb:	The BTB
+ * @now:	Timestamp (in jiffies)
+ *
+ * Context:	Process context.  Caller must hold &blkdev_trig_mutex.
+ */
+static void blkdev_trig_update_btb(struct blkdev_trig_bdev *btb,
+				   unsigned long now)
+{
+	unsigned long new_ios;
+	enum stat_group i;
+
+	for (i = STAT_READ; i <= STAT_FLUSH; ++i) {
+
+		new_ios = part_stat_read(btb->bdev, ios[i]);
+
+		if (new_ios != btb->ios[i]) {
+			btb->ios[i] = new_ios;
+			btb->last_activity[i] = now;
+		}
+	}
+
+	btb->last_checked = now;
+}
+
+/**
+ * blkdev_trig_check() - Check linked devices for activity and blink LEDs.
+ * @work:	Delayed work (&blkdev_trig_work)
+ *
+ * Context:	Process context.  Takes and releases &blkdev_trig_mutex.
+ */
+static void blkdev_trig_check(struct work_struct *work)
+{
+	struct blkdev_trig_led *btl;
+	struct blkdev_trig_bdev *btb;
+	unsigned long index, delay, now, led_check, led_delay;
+	bool blinked;
+
+	if (!mutex_trylock(&blkdev_trig_mutex)) {
+		delay = msecs_to_jiffies(BLKDEV_TRIG_CHECK_RETRY);
+		goto exit_reschedule;
+	}
+
+	now = jiffies;
+	delay = ULONG_MAX;
+
+	hlist_for_each_entry (btl, &blkdev_trig_all_btls, all_btls_node) {
+
+		led_check = btl->last_checked + btl->check_jiffies;
+
+		if (time_before_eq(led_check, now)) {
+
+			blinked = false;
+
+			xa_for_each (&btl->linked_btbs, index, btb) {
+
+				if (btb->last_checked != now)
+					blkdev_trig_update_btb(btb, now);
+				if (!blinked)
+					blinked = blkdev_trig_blink(btl, btb);
+			}
+
+			btl->last_checked = now;
+			led_delay = btl->check_jiffies;
+
+		} else {
+			led_delay = led_check - now;
+		}
+
+		if (led_delay < delay)
+			delay = led_delay;
+	}
+
+	mutex_unlock(&blkdev_trig_mutex);
+
+exit_reschedule:
+	WARN_ON_ONCE(delay == ULONG_MAX);
+	WARN_ON_ONCE(!schedule_delayed_work(&blkdev_trig_work, delay));
+}
+
+/**
+ * blkdev_trig_sched_led() - Set the schedule of the delayed work when a new
+ *	LED is added to the schedule.
+ * @btl:	The BTL that represents the LED
+ *
+ * Called when the number of block devices to which an LED is linked becomes
+ * non-zero.
+ *
+ * Context:	Process context.  Caller must hold &blkdev_trig_mutex.
+ */
+static void blkdev_trig_sched_led(const struct blkdev_trig_led *btl)
+{
+	unsigned long delay = READ_ONCE(btl->check_jiffies);
+	unsigned long check_by = jiffies + delay;
+
+	/*
+	 * If no other LED-to-block device links exist, simply schedule the
+	 * delayed work according to this LED's check_interval attribute
+	 * (check_jiffies).
+	 */
+	if (blkdev_trig_link_count == 0) {
+		WARN_ON(!schedule_delayed_work(&blkdev_trig_work, delay));
+		blkdev_trig_next_check = check_by;
+		return;
+	}
+
+	/*
+	 * If the next check is already scheduled to occur soon enough to
+	 * accomodate this LED's check_interval, the schedule doesn't need
+	 * to be changed.
+	 */
+	if (time_after_eq(check_by, blkdev_trig_next_check))
+		return;
+
+	/*
+	 * Modify the schedule, so that the delayed work runs soon enough for
+	 * this LED.
+	 */
+	WARN_ON(!mod_delayed_work(system_wq, &blkdev_trig_work, delay));
+	blkdev_trig_next_check = check_by;
+}
+
+
+/*
+ *
+ *	Linking and unlinking LEDs and block devices
+ *
+ */
+
+/**
+ * blkdev_trig_link() - Link a block device to an LED.
+ * @btl:	The BTL that represents the LED
+ * @btb:	The BTB that represents the block device
+ *
+ * Context:	Process context.  Caller must hold &blkdev_trig_mutex.
+ * Return:	&0 on success, negative &errno on error.
+ */
+static int blkdev_trig_link(struct blkdev_trig_led *btl,
+			    struct blkdev_trig_bdev *btb)
+{
+	bool led_first_link;
+	int err;
+
+	led_first_link = xa_empty(&btl->linked_btbs);
+
+	err = xa_insert(&btb->linked_btls, btl->index, btl, GFP_KERNEL);
+	if (err)
+		return err;
+
+	err = xa_insert(&btl->linked_btbs, btb->index, btb, GFP_KERNEL);
+	if (err)
+		goto error_erase_btl;
+
+	/* Create /sys/class/block/<bdev>/linked_leds/<led> symlink */
+	err = sysfs_add_link_to_group(bdev_kobj(btb->bdev),
+				      blkdev_trig_linked_leds.name,
+				      &btl->led->dev->kobj, btl->led->name);
+	if (err)
+		goto error_erase_btb;
+
+	/* Create /sys/class/leds/<led>/linked_devices/<bdev> symlink */
+	err = sysfs_add_link_to_group(&btl->led->dev->kobj,
+				      blkdev_trig_linked_devs.name,
+				      bdev_kobj(btb->bdev),
+				      dev_name(&btb->bdev->bd_device));
+	if (err)
+		goto error_remove_symlink;
+
+	/*
+	 * If this is the first block device linked to this LED, the delayed
+	 * work schedule may need to be changed.
+	 */
+	if (led_first_link)
+		blkdev_trig_sched_led(btl);
+
+	++blkdev_trig_link_count;
+
+	return 0;
+
+error_remove_symlink:
+	sysfs_remove_link_from_group(bdev_kobj(btb->bdev),
+				     blkdev_trig_linked_leds.name,
+				     btl->led->name);
+error_erase_btb:
+	xa_erase(&btl->linked_btbs, btb->index);
+error_erase_btl:
+	xa_erase(&btb->linked_btls, btl->index);
+	return err;
+}
+
+/**
+ * blkdev_trig_put_btb() - Remove and free a BTB, if it is no longer needed.
+ * @btb:	The BTB
+ *
+ * Does nothing if the BTB (block device) is still linked to at least one LED.
+ *
+ * Context:	Process context.  Caller must hold &blkdev_trig_mutex.
+ */
+static void blkdev_trig_put_btb(struct blkdev_trig_bdev *btb)
+{
+	struct block_device *bdev = btb->bdev;
+	int err;
+
+	if (xa_empty(&btb->linked_btls)) {
+
+		sysfs_remove_group(bdev_kobj(bdev), &blkdev_trig_linked_leds);
+		err = devres_destroy(&bdev->bd_device, blkdev_trig_btb_release,
+				     NULL, NULL);
+		WARN_ON(err);
+	}
+}
+
+/**
+ * _blkdev_trig_unlink_always() - Perform the unconditionally required steps of
+ *	unlinking a block device from an LED.
+ * @btl:	The BTL that represents the LED
+ * @btb:	The BTB that represents the block device
+ *
+ * When a block device is unlinked from an LED, certain steps must be performed
+ * only if the block device is **not** being released.  This function performs
+ * those steps that are **always** required, whether or not the block device is
+ * being released.
+ *
+ * Context:	Process context.  Caller must hold &blkdev_trig_mutex.
+ */
+static void _blkdev_trig_unlink_always(struct blkdev_trig_led *btl,
+				       struct blkdev_trig_bdev *btb)
+{
+	--blkdev_trig_link_count;
+
+	if (blkdev_trig_link_count == 0)
+		WARN_ON(!cancel_delayed_work_sync(&blkdev_trig_work));
+
+	xa_erase(&btb->linked_btls, btl->index);
+	xa_erase(&btl->linked_btbs, btb->index);
+
+	/* Remove /sys/class/leds/<led>/linked_devices/<bdev> symlink */
+	sysfs_remove_link_from_group(&btl->led->dev->kobj,
+				     blkdev_trig_linked_devs.name,
+				     dev_name(&btb->bdev->bd_device));
+}
+
+/**
+ * blkdev_trig_unlink_norelease() - Unlink an LED from a block device that is
+ *	**not** being released.
+ * @btl:	The BTL that represents the LED.
+ * @btb:	The BTB that represents the block device.
+ *
+ * Context:	Process context.  Caller must hold &blkdev_trig_mutex.
+ */
+static void blkdev_trig_unlink_norelease(struct blkdev_trig_led *btl,
+					 struct blkdev_trig_bdev *btb)
+{
+	_blkdev_trig_unlink_always(btl, btb);
+
+	/* Remove /sys/class/block/<bdev>/linked_leds/<led> symlink */
+	sysfs_remove_link_from_group(bdev_kobj(btb->bdev),
+				     blkdev_trig_linked_leds.name,
+				     btl->led->name);
+
+	blkdev_trig_put_btb(btb);
+}
+
+/**
+ * blkdev_trig_unlink_release() - Unlink an LED from a block device that is
+ *	being released.
+ * @btl:	The BTL that represents the LED
+ * @btb:	The BTB that represents the block device
+ *
+ * Context:	Process context.  Caller must hold &blkdev_trig_mutex.
+ */
+static void blkdev_trig_unlink_release(struct blkdev_trig_led *btl,
+				       struct blkdev_trig_bdev *btb)
+{
+	_blkdev_trig_unlink_always(btl, btb);
+
+	/*
+	 * If the BTB is being released, the driver core has already removed the
+	 * device's attribute groups, and the BTB will be freed automatically,
+	 * so there's nothing else to do.
+	 */
+}
+
+
+/*
+ *
+ *	BTB creation
+ *
+ */
+
+/**
+ * blkdev_trig_btb_release() - BTB device resource release function.
+ * @dev:	The block device
+ * @res:	The BTB
+ *
+ * Called by the driver core when a block device with a BTB is removed.
+ *
+ * Context:	Process context.  Takes and releases &blkdev_trig_mutex.
+ */
+static void blkdev_trig_btb_release(struct device *dev, void *res)
+{
+	struct blkdev_trig_bdev *btb = res;
+	struct blkdev_trig_led *btl;
+	unsigned long index;
+
+	mutex_lock(&blkdev_trig_mutex);
+
+	xa_for_each (&btb->linked_btls, index, btl)
+		blkdev_trig_unlink_release(btl, btb);
+
+	mutex_unlock(&blkdev_trig_mutex);
+}
+
+/**
+ * blkdev_trig_get_bdev() - Get a block device by path.
+ * @path:	The value written to an LED's &link_dev_by_path or
+ *		&unlink_dev_by_path attribute, which should be the path to a
+ *		special file that represents a block device
+ * @len:	The number of characters in &path (not including its
+ *		terminating null)
+ *
+ * The caller must call blkdev_put() when finished with the device.
+ *
+ * Context:	Process context.
+ * Return:	The block device, or an error pointer.
+ */
+static struct block_device *blkdev_trig_get_bdev(const char *path, size_t len)
+{
+	struct block_device *bdev;
+	char *buf;
+
+	buf = kmemdup(path, len + 1, GFP_KERNEL);  /* +1 to include null */
+	if (buf == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	bdev = blkdev_get_by_path(strim(buf), 0, NULL, NULL);
+	kfree(buf);
+	return bdev;
+}
+
+/**
+ * blkdev_trig_get_btb() - Find or create the BTB for a block device.
+ * @path:	The value written to an LED's &link_dev_by_path attribute,
+ *		which should be the path to a special file that represents a
+ *		block device
+ * @len:	The number of characters in &path
+ *
+ * If a new BTB is created, because the block device was not previously linked
+ * to any LEDs, the block device's &linked_leds &sysfs directory is created.
+ *
+ * Context:	Process context.  Caller must hold &blkdev_trig_mutex.
+ * Return:	Pointer to the BTB, error pointer on error.
+ */
+static struct blkdev_trig_bdev *blkdev_trig_get_btb(const char *path,
+						    size_t len)
+{
+	struct block_device *bdev;
+	struct blkdev_trig_bdev *btb;
+	int err;
+
+	bdev = blkdev_trig_get_bdev(path, len);
+	if (IS_ERR(bdev))
+		return ERR_CAST(bdev);
+
+	btb = devres_find(&bdev->bd_device, blkdev_trig_btb_release,
+			  NULL, NULL);
+	if (btb != NULL) {
+		err = 0;
+		goto exit_put_bdev;
+	}
+
+	if (blkdev_trig_next_index == ULONG_MAX) {
+		err = -EOVERFLOW;
+		goto exit_put_bdev;
+	}
+
+	btb = devres_alloc(blkdev_trig_btb_release, sizeof(*btb), GFP_KERNEL);
+	if (btb == NULL) {
+		err = -ENOMEM;
+		goto exit_put_bdev;
+	}
+
+	err = sysfs_create_group(bdev_kobj(bdev), &blkdev_trig_linked_leds);
+	if (err)
+		goto exit_free_btb;
+
+	btb->index = blkdev_trig_next_index++;
+	btb->bdev = bdev;
+	xa_init(&btb->linked_btls);
+
+	/* Populate BTB activity counters */
+	blkdev_trig_update_btb(btb, jiffies);
+
+	devres_add(&bdev->bd_device, btb);
+
+exit_free_btb:
+	if (err)
+		devres_free(btb);
+exit_put_bdev:
+	blkdev_put(bdev, NULL);
+	return err ? ERR_PTR(err) : btb;
+}
+
+
+/*
+ *
+ *	Activating and deactivating the trigger on an LED
+ *
+ */
+
+/**
+ * blkdev_trig_activate() - Called by the LEDs subsystem when an LED is
+ *	associated with the trigger.
+ * @led:	The LED
+ *
+ * Context:	Process context.  Takes and releases &blkdev_trig_mutex.
+ * Return:	&0 on success, negative &errno on error.
+ */
+static int blkdev_trig_activate(struct led_classdev *led)
+{
+	struct blkdev_trig_led *btl;
+	int err;
+
+	btl = kzalloc(sizeof(*btl), GFP_KERNEL);
+	if (btl == NULL)
+		return -ENOMEM;
+
+	err = mutex_lock_interruptible(&blkdev_trig_mutex);
+	if (err)
+		goto exit_free;
+
+	if (blkdev_trig_next_index == ULONG_MAX) {
+		err = -EOVERFLOW;
+		goto exit_unlock;
+	}
+
+	btl->index = blkdev_trig_next_index++;
+	btl->last_checked = jiffies;
+	btl->mode = -1;  /* set all bits */
+	btl->led = led;
+	btl->blink_msec = BLKDEV_TRIG_BLINK_DEF;
+	btl->check_jiffies = msecs_to_jiffies(BLKDEV_TRIG_CHECK_DEF);
+	xa_init(&btl->linked_btbs);
+
+	hlist_add_head(&btl->all_btls_node, &blkdev_trig_all_btls);
+	led_set_trigger_data(led, btl);
+
+exit_unlock:
+	mutex_unlock(&blkdev_trig_mutex);
+exit_free:
+	if (err)
+		kfree(btl);
+	return err;
+}
+
+/**
+ * blkdev_trig_deactivate() - Called by the the LEDs subsystem when an LED is
+ *	disassociated from the trigger.
+ * @led:	The LED
+ *
+ * The LEDs subsystem also calls this function when an LED associated with the
+ * trigger is removed or when the trigger is unregistered (if the module is
+ * unloaded).
+ *
+ * Context:	Process context.  Takes and releases &blkdev_trig_mutex.
+ */
+static void blkdev_trig_deactivate(struct led_classdev *led)
+{
+	struct blkdev_trig_led *btl = led_get_trigger_data(led);
+	struct blkdev_trig_bdev *btb;
+	unsigned long index;
+
+	mutex_lock(&blkdev_trig_mutex);
+
+	xa_for_each (&btl->linked_btbs, index, btb)
+		blkdev_trig_unlink_norelease(btl, btb);
+
+	hlist_del(&btl->all_btls_node);
+	kfree(btl);
+
+	mutex_unlock(&blkdev_trig_mutex);
+}
+
+
+/*
+ *
+ *	Link-related attribute store functions
+ *
+ */
+
+/**
+ * link_dev_by_path_store() - &link_dev_by_path device attribute store function.
+ * @dev:	The LED device
+ * @attr:	The &link_dev_by_path attribute (&dev_attr_link_dev_by_path)
+ * @buf:	The value written to the attribute, which should be the path to
+ *		a special file that represents a block device to be linked to
+ *		the LED (e.g. ``/dev/sda``)
+ * @count:	The number of characters in &buf
+ *
+ * Context:	Process context.  Takes and releases &blkdev_trig_mutex.
+ * Return:	&count on success, negative &errno on error.
+ */
+static ssize_t link_dev_by_path_store(struct device *dev,
+				      struct device_attribute *attr,
+				      const char *buf, size_t count)
+{
+	struct blkdev_trig_led *btl = led_trigger_get_drvdata(dev);
+	struct blkdev_trig_bdev *btb;
+	int err;
+
+	err = mutex_lock_interruptible(&blkdev_trig_mutex);
+	if (err)
+		return err;
+
+	btb = blkdev_trig_get_btb(buf, count);
+	if (IS_ERR(btb)) {
+		err = PTR_ERR(btb);
+		goto exit_unlock;
+	}
+
+	if (xa_load(&btb->linked_btls, btl->index) != NULL) {
+		err = -EEXIST;
+		goto exit_put_btb;
+	}
+
+	err = blkdev_trig_link(btl, btb);
+
+exit_put_btb:
+	if (err)
+		blkdev_trig_put_btb(btb);
+exit_unlock:
+	mutex_unlock(&blkdev_trig_mutex);
+	return err ? : count;
+}
+
+/**
+ * unlink_dev_by_path_store() - &unlink_dev_by_path device attribute store
+ *	function.
+ * @dev:	The LED device
+ * @attr:	The &unlink_dev_by_path attribute (&dev_attr_unlink_dev_by_path)
+ * @buf:	The value written to the attribute, which should be the path to
+ *		a special file that represents a block device to be unlinked
+ *		from the LED (e.g. ``/dev/sda``)
+ * @count:	The number of characters in &buf
+ *
+ * Context:	Process context.  Takes and releases &blkdev_trig_mutex.
+ * Return:	&count on success, negative &errno on error.
+ */
+static ssize_t unlink_dev_by_path_store(struct device *dev,
+					struct device_attribute *attr,
+					const char *buf, size_t count)
+{
+	struct blkdev_trig_led *btl = led_trigger_get_drvdata(dev);
+	struct block_device *bdev;
+	struct blkdev_trig_bdev *btb;
+	int err;
+
+	bdev = blkdev_trig_get_bdev(buf, count);
+	if (IS_ERR(bdev))
+		return PTR_ERR(bdev);
+
+	err = mutex_lock_interruptible(&blkdev_trig_mutex);
+	if (err)
+		goto exit_put_bdev;
+
+	btb = devres_find(&bdev->bd_device, blkdev_trig_btb_release,
+			  NULL, NULL);
+	if (btb == NULL) {
+		err = -EUNATCH;  /* bdev isn't linked to any LED */
+		goto exit_unlock;
+	}
+
+	if (xa_load(&btb->linked_btls, btl->index) == NULL) {
+		err = -EUNATCH;  /* bdev isn't linked to this LED */
+		goto exit_unlock;
+	}
+
+	blkdev_trig_unlink_norelease(btl, btb);
+
+exit_unlock:
+	mutex_unlock(&blkdev_trig_mutex);
+exit_put_bdev:
+	blkdev_put(bdev, NULL);
+	return err ? : count;
+}
+
+/**
+ * unlink_dev_by_name_store() - &unlink_dev_by_name device attribute store
+ *	function.
+ * @dev:	The LED device
+ * @attr:	The &unlink_dev_by_name attribute (&dev_attr_unlink_dev_by_name)
+ * @buf:	The value written to the attribute, which should be the kernel
+ *		name of a block device to be unlinked from the LED (e.g.
+ *		``sda``)
+ * @count:	The number of characters in &buf
+ *
+ * Context:	Process context.  Takes and releases &blkdev_trig_mutex.
+ * Return:	&count on success, negative &errno on error.
+ */
+static ssize_t unlink_dev_by_name_store(struct device *dev,
+					struct device_attribute *attr,
+					const char *buf, size_t count)
+{
+	struct blkdev_trig_led *btl = led_trigger_get_drvdata(dev);
+	struct blkdev_trig_bdev *btb;
+	unsigned long index;
+	int err;
+
+	err = mutex_lock_interruptible(&blkdev_trig_mutex);
+	if (err)
+		return err;
+
+	err = -EUNATCH;
+
+	xa_for_each (&btl->linked_btbs, index, btb) {
+
+		if (sysfs_streq(dev_name(&btb->bdev->bd_device), buf)) {
+			blkdev_trig_unlink_norelease(btl, btb);
+			err = 0;
+			break;
+		}
+	}
+
+	mutex_unlock(&blkdev_trig_mutex);
+	return err ? : count;
+}
+
+
+/*
+ *
+ *	Atomic attribute show & store functions
+ *
+ */
+
+/**
+ * blink_time_show() - &blink_time device attribute show function.
+ * @dev:	The LED device
+ * @attr:	The &blink_time attribute (&dev_attr_blink_time)
+ * @buf:	Output buffer
+ *
+ * Writes the value of &blkdev_trig_led.blink_msec to &buf.
+ *
+ * Context:	Process context.
+ * Return:	The number of characters written to &buf.
+ */
+static ssize_t blink_time_show(struct device *dev,
+			       struct device_attribute *attr, char *buf)
+{
+	const struct blkdev_trig_led *btl = led_trigger_get_drvdata(dev);
+
+	return sysfs_emit(buf, "%u\n", READ_ONCE(btl->blink_msec));
+}
+
+/**
+ * blink_time_store() - &blink_time device attribute store function.
+ * @dev:	The LED device
+ * @attr:	The &blink_time attribute (&dev_attr_blink_time)
+ * @buf:	The new value (as written to the &sysfs attribute)
+ * @count:	The number of characters in &buf
+ *
+ * Sets &blkdev_trig_led.blink_msec to the value in &buf.
+ *
+ * Context:	Process context.
+ * Return:	&count on success, negative &errno on error.
+ */
+static ssize_t blink_time_store(struct device *dev,
+				struct device_attribute *attr,
+				const char *buf, size_t count)
+{
+	struct blkdev_trig_led *btl = led_trigger_get_drvdata(dev);
+	unsigned int value;
+	int err;
+
+	err = kstrtouint(buf, 0, &value);
+	if (err)
+		return err;
+
+	if (value < BLKDEV_TRIG_BLINK_MIN || value > BLKDEV_TRIG_BLINK_MAX)
+		return -ERANGE;
+
+	WRITE_ONCE(btl->blink_msec, value);
+	return count;
+}
+
+/**
+ * check_interval_show() - &check_interval device attribute show function.
+ * @dev:	The LED device
+ * @attr:	The &check_interval attribute (&dev_attr_check_interval)
+ * @buf:	Output buffer
+ *
+ * Writes the value of &blkdev_trig_led.check_jiffies (converted to
+ * milliseconds) to &buf.
+ *
+ * Context:	Process context.
+ * Return:	The number of characters written to &buf.
+ */
+static ssize_t check_interval_show(struct device *dev,
+				   struct device_attribute *attr, char *buf)
+{
+	struct blkdev_trig_led *btl = led_trigger_get_drvdata(dev);
+
+	return sysfs_emit(buf, "%u\n",
+			  jiffies_to_msecs(READ_ONCE(btl->check_jiffies)));
+}
+
+/**
+ * check_interval_store() - &check_interval device attribute store function
+ * @dev:	The LED device
+ * @attr:	The &check_interval attribute (&dev_attr_check_interval)
+ * @buf:	The new value (as written to the &sysfs attribute)
+ * @count:	The number of characters in &buf
+ *
+ * Sets &blkdev_trig_led.check_jiffies to the value in &buf (after converting
+ * from milliseconds).
+ *
+ * Context:	Process context.
+ * Return:	&count on success, negative &errno on error.
+ */
+static ssize_t check_interval_store(struct device *dev,
+				    struct device_attribute *attr,
+				    const char *buf, size_t count)
+{
+	struct blkdev_trig_led *led = led_trigger_get_drvdata(dev);
+	unsigned int value;
+	int err;
+
+	err = kstrtouint(buf, 0, &value);
+	if (err)
+		return err;
+
+	if (value < BLKDEV_TRIG_CHECK_MIN || value > BLKDEV_TRIG_CHECK_MAX)
+		return -ERANGE;
+
+	WRITE_ONCE(led->check_jiffies, msecs_to_jiffies(value));
+
+	return count;
+}
+
+/**
+ * blkdev_trig_mode_show() - Helper for boolean attribute show functions.
+ * @led:	The LED
+ * @buf:	Output buffer
+ * @bit:	Which bit to show
+ *
+ * Context:	Process context.
+ * Return:	The number of characters written to &buf.
+ */
+static int blkdev_trig_mode_show(const struct blkdev_trig_led *led, char *buf,
+				 enum stat_group bit)
+{
+	return sysfs_emit(buf,
+			  READ_ONCE(led->mode) & (1 << bit) ? "Y\n" : "N\n");
+}
+
+/**
+ * blkdev_trig_mode_store() - Helper for boolean attribute store functions.
+ * @led:	The LED
+ * @buf:	The new value (as written to the &sysfs attribute)
+ * @count:	The number of characters in &buf
+ * @bit:	Which bit to set
+ *
+ * Context:	Process context.
+ * Return:	&count on success, negative &errno on error.
+ */
+static int blkdev_trig_mode_store(struct blkdev_trig_led *led,
+				  const char *buf, size_t count,
+				  enum stat_group bit)
+{
+	bool set;
+	int err;
+
+	err = kstrtobool(buf, &set);
+	if (err)
+		return err;
+
+	if (set)
+		set_bit(bit, &led->mode);
+	else
+		clear_bit(bit, &led->mode);
+
+	return count;
+}
+
+/**
+ * blink_on_read_show() - &blink_on_read device attribute show function.
+ * @dev:	The LED device
+ * @attr:	The &blink_on_read attribute (&dev_attr_blink_on_read)
+ * @buf:	Output buffer
+ *
+ * Writes ``Y`` or ``N`` to &buf, depending on whether the &STAT_READ bit in
+ * &blkdev_trig_led.mode is set or cleared.
+ *
+ * Context:	Process context.
+ * Return:	The number of characters written to &buf.
+ */
+static ssize_t blink_on_read_show(struct device *dev,
+				  struct device_attribute *attr, char *buf)
+{
+	return blkdev_trig_mode_show(led_trigger_get_drvdata(dev),
+				     buf, STAT_READ);
+}
+
+/**
+ * blink_on_read_store() - &blink_on_read device attribute store function.
+ * @dev:	The LED device
+ * @attr:	The &blink_on_read attribute (&dev_attr_blink_on_read)
+ * @buf:	The new value (as written to the &sysfs attribute)
+ * @count:	The number of characters in &buf
+ *
+ * Sets the &STAT_READ bit in &blkdev_trig_led.mode to the value in &buf
+ * (interpretted as a boolean).
+ *
+ * Context:	Process context.
+ * Return:	&count on success, negative &errno on error.
+ */
+static ssize_t blink_on_read_store(struct device *dev,
+				   struct device_attribute *attr,
+				   const char *buf, size_t count)
+{
+	return blkdev_trig_mode_store(led_trigger_get_drvdata(dev),
+				      buf, count, STAT_READ);
+}
+
+/**
+ * blink_on_write_show() - &blink_on_write device attribute show function.
+ * @dev:	The LED device
+ * @attr:	The &blink_on_write attribute (&dev_attr_blink_on_write)
+ * @buf:	Output buffer
+ *
+ * Writes ``Y`` or ``N`` to &buf, depending on whether the &STAT_WRITE bit in
+ * in &blkdev_trig_led.mode is set or cleared.
+ *
+ * Context:	Process context.
+ * Return:	The number of characters written to &buf.
+ */
+static ssize_t blink_on_write_show(struct device *dev,
+				   struct device_attribute *attr, char *buf)
+{
+	return blkdev_trig_mode_show(led_trigger_get_drvdata(dev),
+				     buf, STAT_WRITE);
+}
+
+/**
+ * blink_on_write_store() - &blink_on_write device attribute store function.
+ * @dev:	The LED device
+ * @attr:	The &blink_on_write attribute (&dev_attr_blink_on_write)
+ * @buf:	The new value (as written to the &sysfs attribute)
+ * @count:	The number of characters in &buf
+ *
+ * Sets the &STAT_WRITE bit in &blkdev_trig_led.mode to the value in &buf
+ * (interpretted as a boolean).
+ *
+ * Context:	Process context.
+ * Return:	&count on success, negative &errno on error.
+ */
+static ssize_t blink_on_write_store(struct device *dev,
+				    struct device_attribute *attr,
+				    const char *buf, size_t count)
+{
+	return blkdev_trig_mode_store(led_trigger_get_drvdata(dev),
+				      buf, count, STAT_WRITE);
+}
+
+/**
+ * blink_on_flush_show() - &blink_on_flush device attribute show function.
+ * @dev:	The LED device
+ * @attr:	The &blink_on_flush attribute (&dev_attr_blink_on_flush)
+ * @buf:	Output buffer
+ *
+ * Writes ``Y`` or ``N`` to &buf, depending whether the &STAT_FLUSH bit in
+ * &blkdev_trig_led.mode is set or cleared.
+ *
+ * Context:	Process context.
+ * Return:	The number of characters written to &buf.
+ */
+static ssize_t blink_on_flush_show(struct device *dev,
+				   struct device_attribute *attr, char *buf)
+{
+	return blkdev_trig_mode_show(led_trigger_get_drvdata(dev),
+				     buf, STAT_FLUSH);
+}
+
+/**
+ * blink_on_flush_store() - &blink_on_flush device attribute store function.
+ * @dev:	The LED device
+ * @attr:	The &blink_on_flush attribute (&dev_attr_blink_on_flush)
+ * @buf:	The new value (as written to the &sysfs attribute)
+ * @count:	The number of characters in &buf
+ *
+ * Sets the &STAT_FLUSH bit in &blkdev_trig_led.mode to the value in &buf
+ * (interpretted as a boolean).
+ *
+ * Context:	Process context.
+ * Return:	&count on success, negative &errno on error.
+ */
+static ssize_t blink_on_flush_store(struct device *dev,
+				    struct device_attribute *attr,
+				    const char *buf, size_t count)
+{
+	return blkdev_trig_mode_store(led_trigger_get_drvdata(dev),
+				      buf, count, STAT_FLUSH);
+}
+
+/**
+ * blink_on_discard_show() - &blink_on_discard device attribute show function.
+ * @dev:	The LED device
+ * @attr:	The &blink_on_discard attribute (&dev_attr_blink_on_discard)
+ * @buf:	Output buffer
+ *
+ * Writes ``Y`` or ``N`` to &buf, depending on whether the &STAT_DISCARD bit in
+ * &blkdev_trig_led.mode is set or cleared.
+ *
+ * Context:	Process context.
+ * Return:	The number of characters written to &buf.
+ */
+static ssize_t blink_on_discard_show(struct device *dev,
+				     struct device_attribute *attr, char *buf)
+{
+	return blkdev_trig_mode_show(led_trigger_get_drvdata(dev),
+				     buf, STAT_DISCARD);
+}
+
+/**
+ * blink_on_discard_store() - &blink_on_discard device attribute store function.
+ * @dev:	The LED device
+ * @attr:	The &blink_on_discard attribute (&dev_attr_blink_on_discard)
+ * @buf:	The new value (as written to the &sysfs attribute)
+ * @count:	The number of characters in &buf
+ *
+ * Sets the &STAT_DISCARD bit in &blkdev_trig_led.mode to the value in &buf
+ * (interpretted as a boolean).
+ *
+ * Context:	Process context.
+ * Return:	&count on success, negative &errno on error.
+ */
+static ssize_t blink_on_discard_store(struct device *dev,
+				      struct device_attribute *attr,
+				      const char *buf, size_t count)
+{
+	return blkdev_trig_mode_store(led_trigger_get_drvdata(dev),
+				      buf, count, STAT_DISCARD);
+}
+
+/* Device attributes */
+static DEVICE_ATTR_WO(link_dev_by_path);
+static DEVICE_ATTR_WO(unlink_dev_by_path);
+static DEVICE_ATTR_WO(unlink_dev_by_name);
+static DEVICE_ATTR_RW(blink_time);
+static DEVICE_ATTR_RW(check_interval);
+static DEVICE_ATTR_RW(blink_on_read);
+static DEVICE_ATTR_RW(blink_on_write);
+static DEVICE_ATTR_RW(blink_on_flush);
+static DEVICE_ATTR_RW(blink_on_discard);
+
+/* Device attributes in LED directory (/sys/class/leds/<led>/...) */
+static struct attribute *blkdev_trig_attrs[] = {
+	&dev_attr_link_dev_by_path.attr,
+	&dev_attr_unlink_dev_by_path.attr,
+	&dev_attr_unlink_dev_by_name.attr,
+	&dev_attr_blink_time.attr,
+	&dev_attr_check_interval.attr,
+	&dev_attr_blink_on_read.attr,
+	&dev_attr_blink_on_write.attr,
+	&dev_attr_blink_on_flush.attr,
+	&dev_attr_blink_on_discard.attr,
+	NULL
+};
+
+/* Unnamed attribute group == no subdirectory */
+static const struct attribute_group blkdev_trig_attr_group = {
+	.attrs	= blkdev_trig_attrs,
+};
+
+/* Attribute groups for the trigger */
+static const struct attribute_group *blkdev_trig_attr_groups[] = {
+	&blkdev_trig_attr_group,   /* /sys/class/leds/<led>/... */
+	&blkdev_trig_linked_devs,  /* /sys/class/leds/<led>/linked_devices/ */
+	NULL
+};
+
+/* Trigger registration data */
+static struct led_trigger blkdev_trig_trigger = {
+	.name		= "blkdev",
+	.activate	= blkdev_trig_activate,
+	.deactivate	= blkdev_trig_deactivate,
+	.groups		= blkdev_trig_attr_groups,
+};
+
+/**
+ * blkdev_trig_init() - Block device LED trigger initialization.
+ *
+ * Registers the ``blkdev`` LED trigger.
+ *
+ * Return:	&0 on success, negative &errno on failure.
+ */
+static int __init blkdev_trig_init(void)
+{
+	return led_trigger_register(&blkdev_trig_trigger);
+}
+module_init(blkdev_trig_init);
+
+/**
+ * blkdev_trig_exit() - Block device LED trigger module exit.
+ *
+ * Unregisters the ``blkdev`` LED trigger.
+ */
+static void __exit blkdev_trig_exit(void)
+{
+	led_trigger_unregister(&blkdev_trig_trigger);
+}
+module_exit(blkdev_trig_exit);
+
+MODULE_DESCRIPTION("Block device LED trigger");
+MODULE_AUTHOR("Ian Pilcher <arequipeno@gmail.com>");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/init.c b/drivers/net/wireless/mediatek/mt76/mt7921/init.c
index f41975e37d06..8d8f3dea3450 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7921/init.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7921/init.c
@@ -99,7 +99,8 @@ mt7921_init_wiphy(struct ieee80211_hw *hw)
 		wiphy->n_iface_combinations = ARRAY_SIZE(if_comb);
 	}
 	wiphy->flags &= ~(WIPHY_FLAG_IBSS_RSN | WIPHY_FLAG_4ADDR_AP |
-			  WIPHY_FLAG_4ADDR_STATION);
+			  WIPHY_FLAG_4ADDR_STATION |
+			  WIPHY_FLAG_PS_ON_BY_DEFAULT);
 	wiphy->interface_modes = BIT(NL80211_IFTYPE_STATION) |
 				 BIT(NL80211_IFTYPE_AP) |
 				 BIT(NL80211_IFTYPE_P2P_CLIENT) |
@@ -409,12 +410,6 @@ int mt7921_register_device(struct mt7921_dev *dev)
 	dev->pm.idle_timeout = MT7921_PM_TIMEOUT;
 	dev->pm.stats.last_wake_event = jiffies;
 	dev->pm.stats.last_doze_event = jiffies;
-	if (!mt76_is_usb(&dev->mt76)) {
-		dev->pm.enable_user = true;
-		dev->pm.enable = true;
-		dev->pm.ds_enable_user = true;
-		dev->pm.ds_enable = true;
-	}
 
 	if (!mt76_is_mmio(&dev->mt76))
 		hw->extra_tx_headroom += MT_SDIO_TXD_SIZE + MT_SDIO_HDR_SIZE;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 2cf8d646085c..6e45a471ed37 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3481,7 +3481,6 @@ btrfs_release_block_group(struct btrfs_block_group *cache,
  * Helper function for find_free_extent().
  *
  * Return -ENOENT to inform caller that we need fallback to unclustered mode.
- * Return -EAGAIN to inform caller that we need to re-search this block group
  * Return >0 to inform caller that we find nothing
  * Return 0 means we have found a location and set ffe_ctl->found_offset.
  */
@@ -3562,14 +3561,6 @@ static int find_free_extent_clustered(struct btrfs_block_group *bg,
 			trace_btrfs_reserve_extent_cluster(bg, ffe_ctl);
 			return 0;
 		}
-	} else if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT &&
-		   !ffe_ctl->retry_clustered) {
-		spin_unlock(&last_ptr->refill_lock);
-
-		ffe_ctl->retry_clustered = true;
-		btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
-				ffe_ctl->empty_cluster + ffe_ctl->empty_size);
-		return -EAGAIN;
 	}
 	/*
 	 * At this point we either didn't find a cluster or we weren't able to
@@ -3584,7 +3575,6 @@ static int find_free_extent_clustered(struct btrfs_block_group *bg,
 /*
  * Return >0 to inform caller that we find nothing
  * Return 0 when we found an free extent and set ffe_ctrl->found_offset
- * Return -EAGAIN to inform caller that we need to re-search this block group
  */
 static int find_free_extent_unclustered(struct btrfs_block_group *bg,
 					struct find_free_extent_ctl *ffe_ctl)
@@ -3622,25 +3612,8 @@ static int find_free_extent_unclustered(struct btrfs_block_group *bg,
 	offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start,
 			ffe_ctl->num_bytes, ffe_ctl->empty_size,
 			&ffe_ctl->max_extent_size);
-
-	/*
-	 * If we didn't find a chunk, and we haven't failed on this block group
-	 * before, and this block group is in the middle of caching and we are
-	 * ok with waiting, then go ahead and wait for progress to be made, and
-	 * set @retry_unclustered to true.
-	 *
-	 * If @retry_unclustered is true then we've already waited on this
-	 * block group once and should move on to the next block group.
-	 */
-	if (!offset && !ffe_ctl->retry_unclustered && !ffe_ctl->cached &&
-	    ffe_ctl->loop > LOOP_CACHING_NOWAIT) {
-		btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
-						      ffe_ctl->empty_size);
-		ffe_ctl->retry_unclustered = true;
-		return -EAGAIN;
-	} else if (!offset) {
+	if (!offset)
 		return 1;
-	}
 	ffe_ctl->found_offset = offset;
 	return 0;
 }
@@ -3654,7 +3627,7 @@ static int do_allocation_clustered(struct btrfs_block_group *block_group,
 	/* We want to try and use the cluster allocator, so lets look there */
 	if (ffe_ctl->last_ptr && ffe_ctl->use_cluster) {
 		ret = find_free_extent_clustered(block_group, ffe_ctl, bg_ret);
-		if (ret >= 0 || ret == -EAGAIN)
+		if (ret >= 0)
 			return ret;
 		/* ret == -ENOENT case falls through */
 	}
@@ -3873,8 +3846,7 @@ static void release_block_group(struct btrfs_block_group *block_group,
 {
 	switch (ffe_ctl->policy) {
 	case BTRFS_EXTENT_ALLOC_CLUSTERED:
-		ffe_ctl->retry_clustered = false;
-		ffe_ctl->retry_unclustered = false;
+		ffe_ctl->retry_uncached = false;
 		break;
 	case BTRFS_EXTENT_ALLOC_ZONED:
 		/* Nothing to do */
@@ -4225,9 +4197,7 @@ static noinline int find_free_extent(struct btrfs_root *root,
 	ffe_ctl->orig_have_caching_bg = false;
 	ffe_ctl->index = btrfs_bg_flags_to_raid_index(ffe_ctl->flags);
 	ffe_ctl->loop = 0;
-	/* For clustered allocation */
-	ffe_ctl->retry_clustered = false;
-	ffe_ctl->retry_unclustered = false;
+	ffe_ctl->retry_uncached = false;
 	ffe_ctl->cached = 0;
 	ffe_ctl->max_extent_size = 0;
 	ffe_ctl->total_free_space = 0;
@@ -4378,16 +4348,12 @@ static noinline int find_free_extent(struct btrfs_root *root,
 
 		bg_ret = NULL;
 		ret = do_allocation(block_group, ffe_ctl, &bg_ret);
-		if (ret == 0) {
-			if (bg_ret && bg_ret != block_group) {
-				btrfs_release_block_group(block_group,
-							  ffe_ctl->delalloc);
-				block_group = bg_ret;
-			}
-		} else if (ret == -EAGAIN) {
-			goto have_block_group;
-		} else if (ret > 0) {
+		if (ret > 0)
 			goto loop;
+
+		if (bg_ret && bg_ret != block_group) {
+			btrfs_release_block_group(block_group, ffe_ctl->delalloc);
+			block_group = bg_ret;
 		}
 
 		/* Checks */
@@ -4428,6 +4394,15 @@ static noinline int find_free_extent(struct btrfs_root *root,
 		btrfs_release_block_group(block_group, ffe_ctl->delalloc);
 		break;
 loop:
+		if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT &&
+		    !ffe_ctl->retry_uncached) {
+			ffe_ctl->retry_uncached = true;
+			btrfs_wait_block_group_cache_progress(block_group,
+						ffe_ctl->num_bytes +
+						ffe_ctl->empty_cluster +
+						ffe_ctl->empty_size);
+			goto have_block_group;
+		}
 		release_block_group(block_group, ffe_ctl, ffe_ctl->delalloc);
 		cond_resched();
 	}
diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h
index 429d5c570061..6bfba2f22fdd 100644
--- a/fs/btrfs/extent-tree.h
+++ b/fs/btrfs/extent-tree.h
@@ -48,16 +48,11 @@ struct find_free_extent_ctl {
 	int loop;
 
 	/*
-	 * Whether we're refilling a cluster, if true we need to re-search
-	 * current block group but don't try to refill the cluster again.
+	 * Set to true if we're retrying the allocation on this block group
+	 * after waiting for caching progress, this is so that we retry only
+	 * once before moving on to another block group.
 	 */
-	bool retry_clustered;
-
-	/*
-	 * Whether we're updating free space cache, if true we need to re-search
-	 * current block group but don't try updating free space cache again.
-	 */
-	bool retry_unclustered;
+	bool retry_uncached;
 
 	/* If current block group is cached */
 	int cached;
diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h
index e83c4c095041..21b8dfa5d828 100644
--- a/include/linux/pageblock-flags.h
+++ b/include/linux/pageblock-flags.h
@@ -48,7 +48,7 @@ extern unsigned int pageblock_order;
 #else /* CONFIG_HUGETLB_PAGE */
 
 /* If huge pages are not used, group by MAX_ORDER_NR_PAGES */
-#define pageblock_order		MAX_ORDER
+#define pageblock_order		PAGE_ALLOC_COSTLY_ORDER
 
 #endif /* CONFIG_HUGETLB_PAGE */
 
diff --git a/kernel/padata.c b/kernel/padata.c
index 222d60195de6..b8e6b7c48746 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -45,7 +45,7 @@ struct padata_mt_job_state {
 };
 
 static void padata_free_pd(struct parallel_data *pd);
-static void __init padata_mt_helper(struct work_struct *work);
+static void padata_mt_helper(struct work_struct *work);
 
 static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
 {
@@ -438,7 +438,7 @@ static int padata_setup_cpumasks(struct padata_instance *pinst)
 	return err;
 }
 
-static void __init padata_mt_helper(struct work_struct *w)
+static void padata_mt_helper(struct work_struct *w)
 {
 	struct padata_work *pw = container_of(w, struct padata_work, pw_work);
 	struct padata_mt_job_state *ps = pw->pw_data;
diff --git a/scripts/Makefile.vmlinux_o b/scripts/Makefile.vmlinux_o
index 0edfdb40364b..ae52d3b3f063 100644
--- a/scripts/Makefile.vmlinux_o
+++ b/scripts/Makefile.vmlinux_o
@@ -19,7 +19,7 @@ quiet_cmd_gen_initcalls_lds = GEN     $@
 
 .tmp_initcalls.lds: $(srctree)/scripts/generate_initcall_order.pl \
 		vmlinux.a $(KBUILD_VMLINUX_LIBS) FORCE
-	$(call if_changed,gen_initcalls_lds)
+	+$(call if_changed,gen_initcalls_lds)
 
 targets := .tmp_initcalls.lds
 
diff --git a/sound/pci/hda/cs35l41_hda.c b/sound/pci/hda/cs35l41_hda.c
index 9ba77e685126..398fa8b31536 100644
--- a/sound/pci/hda/cs35l41_hda.c
+++ b/sound/pci/hda/cs35l41_hda.c
@@ -1278,7 +1278,7 @@ static int cs35l41_no_acpi_dsd(struct cs35l41_hda *cs35l41, struct device *physd
 
 	if (strncmp(hid, "CLSA0100", 8) == 0) {
 		hw_cfg->bst_type = CS35L41_EXT_BOOST_NO_VSPK_SWITCH;
-	} else if (strncmp(hid, "CLSA0101", 8) == 0) {
+	} else if (strncmp(hid, "CLSA0101", 8) == 0 || strncmp(hid, "CSC3551", 7) == 0) {
 		hw_cfg->bst_type = CS35L41_EXT_BOOST;
 		hw_cfg->gpio1.func = CS35l41_VSPK_SWITCH;
 		hw_cfg->gpio1.valid = true;
diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c
index c2fbf484b110..b79c16bc2ed7 100644
--- a/sound/pci/hda/patch_realtek.c
+++ b/sound/pci/hda/patch_realtek.c
@@ -9748,6 +9748,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = {
 	SND_PCI_QUIRK(0x1043, 0x17f3, "ROG Ally RC71L_RC71L", ALC294_FIXUP_ASUS_ALLY),
 	SND_PCI_QUIRK(0x1043, 0x1881, "ASUS Zephyrus S/M", ALC294_FIXUP_ASUS_GX502_PINS),
 	SND_PCI_QUIRK(0x1043, 0x18b1, "Asus MJ401TA", ALC256_FIXUP_ASUS_HEADSET_MIC),
+	SND_PCI_QUIRK(0x1043, 0x18d3, "Asus Zenbook", ALC287_FIXUP_CS35L41_I2C_2),
 	SND_PCI_QUIRK(0x1043, 0x18f1, "Asus FX505DT", ALC256_FIXUP_ASUS_HEADSET_MIC),
 	SND_PCI_QUIRK(0x1043, 0x194e, "ASUS UX563FD", ALC294_FIXUP_ASUS_HPE),
 	SND_PCI_QUIRK(0x1043, 0x1970, "ASUS UX550VE", ALC289_FIXUP_ASUS_GA401),
-- 
2.42.0

From cbedf8cd5469fafe8876450ab9d41ce50e830ea9 Mon Sep 17 00:00:00 2001
From: Peter Jung <admin@ptr1337.dev>
Date: Wed, 13 Sep 2023 14:33:16 +0200
Subject: [PATCH 6/7] ksm

Signed-off-by: Peter Jung <admin@ptr1337.dev>
---
 Documentation/admin-guide/mm/ksm.rst          |  25 ++-
 arch/alpha/kernel/syscalls/syscall.tbl        |   3 +
 arch/arm/tools/syscall.tbl                    |   3 +
 arch/arm64/include/asm/unistd.h               |   2 +-
 arch/arm64/include/asm/unistd32.h             |   6 +
 arch/ia64/kernel/syscalls/syscall.tbl         |   3 +
 arch/m68k/kernel/syscalls/syscall.tbl         |   3 +
 arch/microblaze/kernel/syscalls/syscall.tbl   |   3 +
 arch/mips/kernel/syscalls/syscall_n32.tbl     |   3 +
 arch/mips/kernel/syscalls/syscall_n64.tbl     |   3 +
 arch/mips/kernel/syscalls/syscall_o32.tbl     |   3 +
 arch/parisc/kernel/syscalls/syscall.tbl       |   3 +
 arch/powerpc/kernel/syscalls/syscall.tbl      |   3 +
 arch/s390/kernel/syscalls/syscall.tbl         |   3 +
 arch/sh/kernel/syscalls/syscall.tbl           |   3 +
 arch/sparc/kernel/syscalls/syscall.tbl        |   3 +
 arch/x86/entry/syscalls/syscall_32.tbl        |   3 +
 arch/x86/entry/syscalls/syscall_64.tbl        |   3 +
 arch/xtensa/kernel/syscalls/syscall.tbl       |   3 +
 fs/proc/base.c                                |   1 +
 include/linux/ksm.h                           |  20 +++
 include/linux/mm_types.h                      |   9 +-
 include/linux/syscalls.h                      |   3 +
 include/uapi/asm-generic/unistd.h             |  11 +-
 kernel/sys.c                                  | 147 ++++++++++++++++++
 kernel/sys_ni.c                               |   3 +
 mm/khugepaged.c                               |   2 +
 mm/ksm.c                                      |  28 +++-
 mm/memory.c                                   |   5 +-
 .../selftests/mm/ksm_functional_tests.c       |  98 +++++++++++-
 30 files changed, 390 insertions(+), 18 deletions(-)

diff --git a/Documentation/admin-guide/mm/ksm.rst b/Documentation/admin-guide/mm/ksm.rst
index 7626392fe82c..5c5be7bd84b8 100644
--- a/Documentation/admin-guide/mm/ksm.rst
+++ b/Documentation/admin-guide/mm/ksm.rst
@@ -173,6 +173,13 @@ stable_node_chains
         the number of KSM pages that hit the ``max_page_sharing`` limit
 stable_node_dups
         number of duplicated KSM pages
+ksm_zero_pages
+        how many zero pages that are still mapped into processes were mapped by
+        KSM when deduplicating.
+
+When ``use_zero_pages`` is/was enabled, the sum of ``pages_sharing`` +
+``ksm_zero_pages`` represents the actual number of pages saved by KSM.
+if ``use_zero_pages`` has never been enabled, ``ksm_zero_pages`` is 0.
 
 A high ratio of ``pages_sharing`` to ``pages_shared`` indicates good
 sharing, but a high ratio of ``pages_unshared`` to ``pages_sharing``
@@ -196,21 +203,25 @@ several times, which are unprofitable memory consumed.
 1) How to determine whether KSM save memory or consume memory in system-wide
    range? Here is a simple approximate calculation for reference::
 
-	general_profit =~ pages_sharing * sizeof(page) - (all_rmap_items) *
+	general_profit =~ ksm_saved_pages * sizeof(page) - (all_rmap_items) *
 			  sizeof(rmap_item);
 
-   where all_rmap_items can be easily obtained by summing ``pages_sharing``,
-   ``pages_shared``, ``pages_unshared`` and ``pages_volatile``.
+   where ksm_saved_pages equals to the sum of ``pages_sharing`` +
+   ``ksm_zero_pages`` of the system, and all_rmap_items can be easily
+   obtained by summing ``pages_sharing``, ``pages_shared``, ``pages_unshared``
+   and ``pages_volatile``.
 
 2) The KSM profit inner a single process can be similarly obtained by the
    following approximate calculation::
 
-	process_profit =~ ksm_merging_pages * sizeof(page) -
+	process_profit =~ ksm_saved_pages * sizeof(page) -
 			  ksm_rmap_items * sizeof(rmap_item).
 
-   where ksm_merging_pages is shown under the directory ``/proc/<pid>/``,
-   and ksm_rmap_items is shown in ``/proc/<pid>/ksm_stat``. The process profit
-   is also shown in ``/proc/<pid>/ksm_stat`` as ksm_process_profit.
+   where ksm_saved_pages equals to the sum of ``ksm_merging_pages`` and
+   ``ksm_zero_pages``, both of which are shown under the directory
+   ``/proc/<pid>/ksm_stat``, and ksm_rmap_items is also shown in
+   ``/proc/<pid>/ksm_stat``. The process profit is also shown in
+   ``/proc/<pid>/ksm_stat`` as ksm_process_profit.
 
 From the perspective of application, a high ratio of ``ksm_rmap_items`` to
 ``ksm_merging_pages`` means a bad madvise-applied policy, so developers or
diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl
index 1f13995d00d7..4a5bc2a91fa7 100644
--- a/arch/alpha/kernel/syscalls/syscall.tbl
+++ b/arch/alpha/kernel/syscalls/syscall.tbl
@@ -491,3 +491,6 @@
 559	common  futex_waitv                     sys_futex_waitv
 560	common	set_mempolicy_home_node		sys_ni_syscall
 561	common	cachestat			sys_cachestat
+562	common	process_ksm_enable		sys_process_ksm_enable
+563	common	process_ksm_disable		sys_process_ksm_disable
+564	common	process_ksm_status		sys_process_ksm_status
diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
index 8ebed8a13874..d616dcc060df 100644
--- a/arch/arm/tools/syscall.tbl
+++ b/arch/arm/tools/syscall.tbl
@@ -465,3 +465,6 @@
 449	common	futex_waitv			sys_futex_waitv
 450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node
 451	common	cachestat			sys_cachestat
+452	common	process_ksm_enable		sys_process_ksm_enable
+453	common	process_ksm_disable		sys_process_ksm_disable
+454	common	process_ksm_status		sys_process_ksm_status
diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h
index 64a514f90131..63a8a9c4abc1 100644
--- a/arch/arm64/include/asm/unistd.h
+++ b/arch/arm64/include/asm/unistd.h
@@ -39,7 +39,7 @@
 #define __ARM_NR_compat_set_tls		(__ARM_NR_COMPAT_BASE + 5)
 #define __ARM_NR_COMPAT_END		(__ARM_NR_COMPAT_BASE + 0x800)
 
-#define __NR_compat_syscalls		452
+#define __NR_compat_syscalls		455
 #endif
 
 #define __ARCH_WANT_SYS_CLONE
diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h
index d952a28463e0..c99c8260489b 100644
--- a/arch/arm64/include/asm/unistd32.h
+++ b/arch/arm64/include/asm/unistd32.h
@@ -909,6 +909,12 @@ __SYSCALL(__NR_futex_waitv, sys_futex_waitv)
 __SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node)
 #define __NR_cachestat 451
 __SYSCALL(__NR_cachestat, sys_cachestat)
+#define __NR_process_ksm_enable 452
+__SYSCALL(__NR_process_ksm_enable, sys_process_ksm_enable)
+#define __NR_process_ksm_disable 453
+__SYSCALL(__NR_process_ksm_disable, sys_process_ksm_disable)
+#define __NR_process_ksm_status 454
+__SYSCALL(__NR_process_ksm_status, sys_process_ksm_status)
 
 /*
  * Please add new compat syscalls above this comment and update
diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl
index f8c74ffeeefb..735157909c6f 100644
--- a/arch/ia64/kernel/syscalls/syscall.tbl
+++ b/arch/ia64/kernel/syscalls/syscall.tbl
@@ -372,3 +372,6 @@
 449	common  futex_waitv                     sys_futex_waitv
 450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node
 451	common	cachestat			sys_cachestat
+452	common	process_ksm_enable		sys_process_ksm_enable
+453	common	process_ksm_disable		sys_process_ksm_disable
+454	common	process_ksm_status		sys_process_ksm_status
diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl
index 4f504783371f..25b22d311f10 100644
--- a/arch/m68k/kernel/syscalls/syscall.tbl
+++ b/arch/m68k/kernel/syscalls/syscall.tbl
@@ -451,3 +451,6 @@
 449	common  futex_waitv                     sys_futex_waitv
 450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node
 451	common	cachestat			sys_cachestat
+452	common	process_ksm_enable		sys_process_ksm_enable
+453	common	process_ksm_disable		sys_process_ksm_disable
+454	common	process_ksm_status		sys_process_ksm_status
diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl
index 858d22bf275c..e548c182a33e 100644
--- a/arch/microblaze/kernel/syscalls/syscall.tbl
+++ b/arch/microblaze/kernel/syscalls/syscall.tbl
@@ -457,3 +457,6 @@
 449	common  futex_waitv                     sys_futex_waitv
 450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node
 451	common	cachestat			sys_cachestat
+452	common	process_ksm_enable		sys_process_ksm_enable
+453	common	process_ksm_disable		sys_process_ksm_disable
+454	common	process_ksm_status		sys_process_ksm_status
diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl
index 1976317d4e8b..fed21167be44 100644
--- a/arch/mips/kernel/syscalls/syscall_n32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n32.tbl
@@ -390,3 +390,6 @@
 449	n32	futex_waitv			sys_futex_waitv
 450	n32	set_mempolicy_home_node		sys_set_mempolicy_home_node
 451	n32	cachestat			sys_cachestat
+452	n32	process_ksm_enable		sys_process_ksm_enable
+453	n32	process_ksm_disable		sys_process_ksm_disable
+454	n32	process_ksm_status		sys_process_ksm_status
diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl
index cfda2511badf..b27ae871f676 100644
--- a/arch/mips/kernel/syscalls/syscall_n64.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n64.tbl
@@ -366,3 +366,6 @@
 449	n64	futex_waitv			sys_futex_waitv
 450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node
 451	n64	cachestat			sys_cachestat
+452	n64	process_ksm_enable		sys_process_ksm_enable
+453	n64	process_ksm_disable		sys_process_ksm_disable
+454	n64	process_ksm_status		sys_process_ksm_status
diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl
index 7692234c3768..59f298413c29 100644
--- a/arch/mips/kernel/syscalls/syscall_o32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_o32.tbl
@@ -439,3 +439,6 @@
 449	o32	futex_waitv			sys_futex_waitv
 450	o32	set_mempolicy_home_node		sys_set_mempolicy_home_node
 451	o32	cachestat			sys_cachestat
+452	o32	process_ksm_enable		sys_process_ksm_enable
+453	o32	process_ksm_disable		sys_process_ksm_disable
+454	o32	process_ksm_status		sys_process_ksm_status
diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl
index a0a9145b6dd4..494b59d1185f 100644
--- a/arch/parisc/kernel/syscalls/syscall.tbl
+++ b/arch/parisc/kernel/syscalls/syscall.tbl
@@ -450,3 +450,6 @@
 449	common	futex_waitv			sys_futex_waitv
 450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node
 451	common	cachestat			sys_cachestat
+452	common	process_ksm_enable		sys_process_ksm_enable
+453	common	process_ksm_disable		sys_process_ksm_disable
+454	common	process_ksm_status		sys_process_ksm_status
diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl
index 8c0b08b7a80e..499d7b233a43 100644
--- a/arch/powerpc/kernel/syscalls/syscall.tbl
+++ b/arch/powerpc/kernel/syscalls/syscall.tbl
@@ -538,3 +538,6 @@
 449	common  futex_waitv                     sys_futex_waitv
 450 	nospu	set_mempolicy_home_node		sys_set_mempolicy_home_node
 451	common	cachestat			sys_cachestat
+452	common	process_ksm_enable		sys_process_ksm_enable
+453	common	process_ksm_disable		sys_process_ksm_disable
+454	common	process_ksm_status		sys_process_ksm_status
diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl
index a6935af2235c..97b36ce15155 100644
--- a/arch/s390/kernel/syscalls/syscall.tbl
+++ b/arch/s390/kernel/syscalls/syscall.tbl
@@ -454,3 +454,6 @@
 449  common	futex_waitv		sys_futex_waitv			sys_futex_waitv
 450  common	set_mempolicy_home_node	sys_set_mempolicy_home_node	sys_set_mempolicy_home_node
 451  common	cachestat		sys_cachestat			sys_cachestat
+452  common	process_ksm_enable	sys_process_ksm_enable		sys_process_ksm_enable
+453  common	process_ksm_disable	sys_process_ksm_disable		sys_process_ksm_disable
+454  common	process_ksm_status	sys_process_ksm_status		sys_process_ksm_status
diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl
index 97377e8c5025..bd3827e1fc8d 100644
--- a/arch/sh/kernel/syscalls/syscall.tbl
+++ b/arch/sh/kernel/syscalls/syscall.tbl
@@ -454,3 +454,6 @@
 449	common  futex_waitv                     sys_futex_waitv
 450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node
 451	common	cachestat			sys_cachestat
+452	common	process_ksm_enable		sys_process_ksm_enable
+453	common	process_ksm_disable		sys_process_ksm_disable
+454	common	process_ksm_status		sys_process_ksm_status
diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl
index faa835f3c54a..c05e62a0ca02 100644
--- a/arch/sparc/kernel/syscalls/syscall.tbl
+++ b/arch/sparc/kernel/syscalls/syscall.tbl
@@ -497,3 +497,6 @@
 449	common  futex_waitv                     sys_futex_waitv
 450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node
 451	common	cachestat			sys_cachestat
+452	common	process_ksm_enable		sys_process_ksm_enable
+453	common	process_ksm_disable		sys_process_ksm_disable
+454	common	process_ksm_status		sys_process_ksm_status
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index bc0a3c941b35..c79bd2dd758d 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -456,3 +456,6 @@
 449	i386	futex_waitv		sys_futex_waitv
 450	i386	set_mempolicy_home_node		sys_set_mempolicy_home_node
 451	i386	cachestat		sys_cachestat
+452	i386	process_ksm_enable		sys_process_ksm_enable
+453	i386	process_ksm_disable		sys_process_ksm_disable
+454	i386	process_ksm_status		sys_process_ksm_status
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 227538b0ce80..e146a70cc299 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -373,6 +373,9 @@
 449	common	futex_waitv		sys_futex_waitv
 450	common	set_mempolicy_home_node	sys_set_mempolicy_home_node
 451	common	cachestat		sys_cachestat
+452	common	process_ksm_enable	sys_process_ksm_enable
+453	common	process_ksm_disable	sys_process_ksm_disable
+454	common	process_ksm_status	sys_process_ksm_status
 
 #
 # Due to a historical design error, certain syscalls are numbered differently
diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl
index 2b69c3c035b6..b7bf81a3ba13 100644
--- a/arch/xtensa/kernel/syscalls/syscall.tbl
+++ b/arch/xtensa/kernel/syscalls/syscall.tbl
@@ -422,3 +422,6 @@
 449	common  futex_waitv                     sys_futex_waitv
 450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node
 451	common	cachestat			sys_cachestat
+452	common	process_ksm_enable		sys_process_ksm_enable
+453	common	process_ksm_disable		sys_process_ksm_disable
+454	common	process_ksm_status		sys_process_ksm_status
diff --git a/fs/proc/base.c b/fs/proc/base.c
index ee4b824658a0..9b0beb26cbd4 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -3207,6 +3207,7 @@ static int proc_pid_ksm_stat(struct seq_file *m, struct pid_namespace *ns,
 	mm = get_task_mm(task);
 	if (mm) {
 		seq_printf(m, "ksm_rmap_items %lu\n", mm->ksm_rmap_items);
+		seq_printf(m, "ksm_zero_pages %lu\n", mm->ksm_zero_pages);
 		seq_printf(m, "ksm_merging_pages %lu\n", mm->ksm_merging_pages);
 		seq_printf(m, "ksm_process_profit %ld\n", ksm_process_profit(mm));
 		mmput(mm);
diff --git a/include/linux/ksm.h b/include/linux/ksm.h
index 899a314bc487..c2dd786a30e1 100644
--- a/include/linux/ksm.h
+++ b/include/linux/ksm.h
@@ -26,6 +26,22 @@ int ksm_disable(struct mm_struct *mm);
 
 int __ksm_enter(struct mm_struct *mm);
 void __ksm_exit(struct mm_struct *mm);
+/*
+ * To identify zeropages that were mapped by KSM, we reuse the dirty bit
+ * in the PTE. If the PTE is dirty, the zeropage was mapped by KSM when
+ * deduplicating memory.
+ */
+#define is_ksm_zero_pte(pte)	(is_zero_pfn(pte_pfn(pte)) && pte_dirty(pte))
+
+extern unsigned long ksm_zero_pages;
+
+static inline void ksm_might_unmap_zero_page(struct mm_struct *mm, pte_t pte)
+{
+	if (is_ksm_zero_pte(pte)) {
+		ksm_zero_pages--;
+		mm->ksm_zero_pages--;
+	}
+}
 
 static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
 {
@@ -95,6 +111,10 @@ static inline void ksm_exit(struct mm_struct *mm)
 {
 }
 
+static inline void ksm_might_unmap_zero_page(struct mm_struct *mm, pte_t pte)
+{
+}
+
 #ifdef CONFIG_MEMORY_FAILURE
 static inline void collect_procs_ksm(struct page *page,
 				     struct list_head *to_kill, int force_early)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 7d30dc4ff0ff..d8d8cc1348d6 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -812,7 +812,7 @@ struct mm_struct {
 #ifdef CONFIG_KSM
 		/*
 		 * Represent how many pages of this process are involved in KSM
-		 * merging.
+		 * merging (not including ksm_zero_pages).
 		 */
 		unsigned long ksm_merging_pages;
 		/*
@@ -820,7 +820,12 @@ struct mm_struct {
 		 * including merged and not merged.
 		 */
 		unsigned long ksm_rmap_items;
-#endif
+		/*
+		 * Represent how many empty pages are merged with kernel zero
+		 * pages when enabling KSM use_zero_pages.
+		 */
+		unsigned long ksm_zero_pages;
+#endif /* CONFIG_KSM */
 #ifdef CONFIG_LRU_GEN
 		struct {
 			/* this mm_struct is on lru_gen_mm_list */
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 03e3d0121d5e..16597dea90f4 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -813,6 +813,9 @@ asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior);
 asmlinkage long sys_process_madvise(int pidfd, const struct iovec __user *vec,
 			size_t vlen, int behavior, unsigned int flags);
 asmlinkage long sys_process_mrelease(int pidfd, unsigned int flags);
+asmlinkage long sys_process_ksm_enable(int pidfd, unsigned int flags);
+asmlinkage long sys_process_ksm_disable(int pidfd, unsigned int flags);
+asmlinkage long sys_process_ksm_status(int pidfd, unsigned int flags);
 asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
 			unsigned long prot, unsigned long pgoff,
 			unsigned long flags);
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index fd6c1cb585db..11d0fc82c437 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -820,8 +820,17 @@ __SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node)
 #define __NR_cachestat 451
 __SYSCALL(__NR_cachestat, sys_cachestat)
 
+#define __NR_process_ksm_enable 452
+__SYSCALL(__NR_process_ksm_enable, sys_process_ksm_enable)
+
+#define __NR_process_ksm_disable 453
+__SYSCALL(__NR_process_ksm_disable, sys_process_ksm_disable)
+
+#define __NR_process_ksm_status 454
+__SYSCALL(__NR_process_ksm_status, sys_process_ksm_status)
+
 #undef __NR_syscalls
-#define __NR_syscalls 452
+#define __NR_syscalls 455
 
 /*
  * 32 bit systems traditionally used different
diff --git a/kernel/sys.c b/kernel/sys.c
index 2410e3999ebe..b0841a2dd2b7 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2727,6 +2727,153 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 	return error;
 }
 
+#ifdef CONFIG_KSM
+enum pkc_action {
+	PKSM_ENABLE = 0,
+	PKSM_DISABLE,
+	PKSM_STATUS,
+};
+
+static long do_process_ksm_control(int pidfd, enum pkc_action action)
+{
+	long ret;
+	struct pid *pid;
+	struct task_struct *task;
+	struct mm_struct *mm;
+	unsigned int f_flags;
+
+	pid = pidfd_get_pid(pidfd, &f_flags);
+	if (IS_ERR(pid)) {
+		ret = PTR_ERR(pid);
+		goto out;
+	}
+
+	task = get_pid_task(pid, PIDTYPE_PID);
+	if (!task) {
+		ret = -ESRCH;
+		goto put_pid;
+	}
+
+	/* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */
+	mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
+	if (IS_ERR_OR_NULL(mm)) {
+		ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
+		goto release_task;
+	}
+
+	/* Require CAP_SYS_NICE for influencing process performance. */
+	if (!capable(CAP_SYS_NICE)) {
+		ret = -EPERM;
+		goto release_mm;
+	}
+
+	if (mmap_write_lock_killable(mm)) {
+		ret = -EINTR;
+		goto release_mm;
+	}
+
+	switch (action) {
+		case PKSM_ENABLE:
+			ret = ksm_enable_merge_any(mm);
+			break;
+		case PKSM_DISABLE:
+			ret = ksm_disable_merge_any(mm);
+			break;
+		case PKSM_STATUS:
+			ret = !!test_bit(MMF_VM_MERGE_ANY, &mm->flags);
+			break;
+	}
+
+	mmap_write_unlock(mm);
+
+release_mm:
+	mmput(mm);
+release_task:
+	put_task_struct(task);
+put_pid:
+	put_pid(pid);
+out:
+	return ret;
+}
+#endif /* CONFIG_KSM */
+
+SYSCALL_DEFINE2(process_ksm_enable, int, pidfd, unsigned int, flags)
+{
+#ifdef CONFIG_KSM
+	if (flags != 0)
+		return -EINVAL;
+
+	return do_process_ksm_control(pidfd, PKSM_ENABLE);
+#else /* CONFIG_KSM */
+	return -ENOSYS;
+#endif /* CONFIG_KSM */
+}
+
+SYSCALL_DEFINE2(process_ksm_disable, int, pidfd, unsigned int, flags)
+{
+#ifdef CONFIG_KSM
+	if (flags != 0)
+		return -EINVAL;
+
+	return do_process_ksm_control(pidfd, PKSM_DISABLE);
+#else /* CONFIG_KSM */
+	return -ENOSYS;
+#endif /* CONFIG_KSM */
+}
+
+SYSCALL_DEFINE2(process_ksm_status, int, pidfd, unsigned int, flags)
+{
+#ifdef CONFIG_KSM
+	if (flags != 0)
+		return -EINVAL;
+
+	return do_process_ksm_control(pidfd, PKSM_STATUS);
+#else /* CONFIG_KSM */
+	return -ENOSYS;
+#endif /* CONFIG_KSM */
+}
+
+#ifdef CONFIG_KSM
+static ssize_t process_ksm_enable_show(struct kobject *kobj,
+		struct kobj_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%u\n", __NR_process_ksm_enable);
+}
+static struct kobj_attribute process_ksm_enable_attr = __ATTR_RO(process_ksm_enable);
+
+static ssize_t process_ksm_disable_show(struct kobject *kobj,
+		struct kobj_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%u\n", __NR_process_ksm_disable);
+}
+static struct kobj_attribute process_ksm_disable_attr = __ATTR_RO(process_ksm_disable);
+
+static ssize_t process_ksm_status_show(struct kobject *kobj,
+		struct kobj_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%u\n", __NR_process_ksm_status);
+}
+static struct kobj_attribute process_ksm_status_attr = __ATTR_RO(process_ksm_status);
+
+static struct attribute *process_ksm_sysfs_attrs[] = {
+	&process_ksm_enable_attr.attr,
+	&process_ksm_disable_attr.attr,
+	&process_ksm_status_attr.attr,
+	NULL,
+};
+
+static const struct attribute_group process_ksm_sysfs_attr_group = {
+	.attrs = process_ksm_sysfs_attrs,
+	.name = "process_ksm",
+};
+
+static int __init process_ksm_sysfs_init(void)
+{
+	return sysfs_create_group(kernel_kobj, &process_ksm_sysfs_attr_group);
+}
+subsys_initcall(process_ksm_sysfs_init);
+#endif /* CONFIG_KSM */
+
 SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep,
 		struct getcpu_cache __user *, unused)
 {
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 781de7cc6a4e..49a35d35d0f9 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -184,6 +184,9 @@ COND_SYSCALL(mincore);
 COND_SYSCALL(madvise);
 COND_SYSCALL(process_madvise);
 COND_SYSCALL(process_mrelease);
+COND_SYSCALL(process_ksm_enable);
+COND_SYSCALL(process_ksm_disable);
+COND_SYSCALL(process_ksm_status);
 COND_SYSCALL(remap_file_pages);
 COND_SYSCALL(mbind);
 COND_SYSCALL(get_mempolicy);
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 78c8d5d8b628..4b8b8673d5d9 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -19,6 +19,7 @@
 #include <linux/page_table_check.h>
 #include <linux/swapops.h>
 #include <linux/shmem_fs.h>
+#include <linux/ksm.h>
 
 #include <asm/tlb.h>
 #include <asm/pgalloc.h>
@@ -709,6 +710,7 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte,
 				spin_lock(ptl);
 				ptep_clear(vma->vm_mm, address, _pte);
 				spin_unlock(ptl);
+				ksm_might_unmap_zero_page(vma->vm_mm, pteval);
 			}
 		} else {
 			src_page = pte_page(pteval);
diff --git a/mm/ksm.c b/mm/ksm.c
index d7b5b95e936e..6b7b8928fb96 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -278,6 +278,9 @@ static unsigned int zero_checksum __read_mostly;
 /* Whether to merge empty (zeroed) pages with actual zero pages */
 static bool ksm_use_zero_pages __read_mostly;
 
+/* The number of zero pages which is placed by KSM */
+unsigned long ksm_zero_pages;
+
 #ifdef CONFIG_NUMA
 /* Zeroed when merging across nodes is not allowed */
 static unsigned int ksm_merge_across_nodes = 1;
@@ -448,7 +451,8 @@ static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long nex
 		if (is_migration_entry(entry))
 			page = pfn_swap_entry_to_page(entry);
 	}
-	ret = page && PageKsm(page);
+	/* return 1 if the page is an normal ksm page or KSM-placed zero page */
+	ret = (page && PageKsm(page)) || is_ksm_zero_pte(*pte);
 	pte_unmap_unlock(pte, ptl);
 	return ret;
 }
@@ -1229,8 +1233,14 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
 		page_add_anon_rmap(kpage, vma, addr, RMAP_NONE);
 		newpte = mk_pte(kpage, vma->vm_page_prot);
 	} else {
-		newpte = pte_mkspecial(pfn_pte(page_to_pfn(kpage),
-					       vma->vm_page_prot));
+		/*
+		 * Use pte_mkdirty to mark the zero page mapped by KSM, and then
+		 * we can easily track all KSM-placed zero pages by checking if
+		 * the dirty bit in zero page's PTE is set.
+		 */
+		newpte = pte_mkdirty(pte_mkspecial(pfn_pte(page_to_pfn(kpage), vma->vm_page_prot)));
+		ksm_zero_pages++;
+		mm->ksm_zero_pages++;
 		/*
 		 * We're replacing an anonymous page with a zero page, which is
 		 * not anonymous. We need to do proper accounting otherwise we
@@ -3091,7 +3101,7 @@ static void wait_while_offlining(void)
 #ifdef CONFIG_PROC_FS
 long ksm_process_profit(struct mm_struct *mm)
 {
-	return mm->ksm_merging_pages * PAGE_SIZE -
+	return (long)(mm->ksm_merging_pages + mm->ksm_zero_pages) * PAGE_SIZE -
 		mm->ksm_rmap_items * sizeof(struct ksm_rmap_item);
 }
 #endif /* CONFIG_PROC_FS */
@@ -3360,12 +3370,19 @@ static ssize_t pages_volatile_show(struct kobject *kobj,
 }
 KSM_ATTR_RO(pages_volatile);
 
+static ssize_t ksm_zero_pages_show(struct kobject *kobj,
+				struct kobj_attribute *attr, char *buf)
+{
+	return sysfs_emit(buf, "%ld\n", ksm_zero_pages);
+}
+KSM_ATTR_RO(ksm_zero_pages);
+
 static ssize_t general_profit_show(struct kobject *kobj,
 				   struct kobj_attribute *attr, char *buf)
 {
 	long general_profit;
 
-	general_profit = ksm_pages_sharing * PAGE_SIZE -
+	general_profit = (ksm_pages_sharing + ksm_zero_pages) * PAGE_SIZE -
 				ksm_rmap_items * sizeof(struct ksm_rmap_item);
 
 	return sysfs_emit(buf, "%ld\n", general_profit);
@@ -3427,6 +3444,7 @@ static struct attribute *ksm_attrs[] = {
 	&pages_sharing_attr.attr,
 	&pages_unshared_attr.attr,
 	&pages_volatile_attr.attr,
+	&ksm_zero_pages_attr.attr,
 	&full_scans_attr.attr,
 #ifdef CONFIG_NUMA
 	&merge_across_nodes_attr.attr,
diff --git a/mm/memory.c b/mm/memory.c
index cdc4d4c1c858..428943ecda25 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1433,8 +1433,10 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 			tlb_remove_tlb_entry(tlb, pte, addr);
 			zap_install_uffd_wp_if_needed(vma, addr, pte, details,
 						      ptent);
-			if (unlikely(!page))
+			if (unlikely(!page)) {
+				ksm_might_unmap_zero_page(mm, ptent);
 				continue;
+			}
 
 			delay_rmap = 0;
 			if (!PageAnon(page)) {
@@ -3128,6 +3130,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
 				inc_mm_counter(mm, MM_ANONPAGES);
 			}
 		} else {
+			ksm_might_unmap_zero_page(mm, vmf->orig_pte);
 			inc_mm_counter(mm, MM_ANONPAGES);
 		}
 		flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
diff --git a/tools/testing/selftests/mm/ksm_functional_tests.c b/tools/testing/selftests/mm/ksm_functional_tests.c
index 26853badae70..0de9d33cd565 100644
--- a/tools/testing/selftests/mm/ksm_functional_tests.c
+++ b/tools/testing/selftests/mm/ksm_functional_tests.c
@@ -29,6 +29,8 @@
 
 static int ksm_fd;
 static int ksm_full_scans_fd;
+static int proc_self_ksm_stat_fd;
+static int ksm_use_zero_pages_fd;
 static int pagemap_fd;
 static size_t pagesize;
 
@@ -59,6 +61,33 @@ static bool range_maps_duplicates(char *addr, unsigned long size)
 	return false;
 }
 
+static long get_my_ksm_zero_pages(void)
+{
+	char buf[200];
+	char *substr_ksm_zero;
+	size_t value_pos;
+	ssize_t read_size;
+	unsigned long my_ksm_zero_pages;
+
+	if (!proc_self_ksm_stat_fd)
+		return 0;
+
+	read_size = pread(proc_self_ksm_stat_fd, buf, sizeof(buf) - 1, 0);
+	if (read_size < 0)
+		return -errno;
+
+	buf[read_size] = 0;
+
+	substr_ksm_zero = strstr(buf, "ksm_zero_pages");
+	if (!substr_ksm_zero)
+		return 0;
+
+	value_pos = strcspn(substr_ksm_zero, "0123456789");
+	my_ksm_zero_pages = strtol(substr_ksm_zero + value_pos, NULL, 10);
+
+	return my_ksm_zero_pages;
+}
+
 static long ksm_get_full_scans(void)
 {
 	char buf[10];
@@ -159,6 +188,70 @@ static void test_unmerge(void)
 	munmap(map, size);
 }
 
+static void test_unmerge_zero_pages(void)
+{
+	const unsigned int size = 2 * MiB;
+	char *map;
+	unsigned int offs;
+	unsigned long pages_expected;
+
+	ksft_print_msg("[RUN] %s\n", __func__);
+
+	if (proc_self_ksm_stat_fd < 0) {
+		ksft_test_result_skip("open(\"/proc/self/ksm_stat\") failed\n");
+		return;
+	}
+	if (ksm_use_zero_pages_fd < 0) {
+		ksft_test_result_skip("open \"/sys/kernel/mm/ksm/use_zero_pages\" failed\n");
+		return;
+	}
+	if (write(ksm_use_zero_pages_fd, "1", 1) != 1) {
+		ksft_test_result_skip("write \"/sys/kernel/mm/ksm/use_zero_pages\" failed\n");
+		return;
+	}
+
+	/* Let KSM deduplicate zero pages. */
+	map = mmap_and_merge_range(0x00, size, false);
+	if (map == MAP_FAILED)
+		return;
+
+	/* Check if ksm_zero_pages is updated correctly after KSM merging */
+	pages_expected = size / pagesize;
+	if (pages_expected != get_my_ksm_zero_pages()) {
+		ksft_test_result_fail("'ksm_zero_pages' updated after merging\n");
+		goto unmap;
+	}
+
+	/* Try to unmerge half of the region */
+	if (madvise(map, size / 2, MADV_UNMERGEABLE)) {
+		ksft_test_result_fail("MADV_UNMERGEABLE failed\n");
+		goto unmap;
+	}
+
+	/* Check if ksm_zero_pages is updated correctly after unmerging */
+	pages_expected /= 2;
+	if (pages_expected != get_my_ksm_zero_pages()) {
+		ksft_test_result_fail("'ksm_zero_pages' updated after unmerging\n");
+		goto unmap;
+	}
+
+	/* Trigger unmerging of the other half by writing to the pages. */
+	for (offs = size / 2; offs < size; offs += pagesize)
+		*((unsigned int *)&map[offs]) = offs;
+
+	/* Now we should have no zeropages remaining. */
+	if (get_my_ksm_zero_pages()) {
+		ksft_test_result_fail("'ksm_zero_pages' updated after write fault\n");
+		goto unmap;
+	}
+
+	/* Check if ksm zero pages are really unmerged */
+	ksft_test_result(!range_maps_duplicates(map, size),
+			"KSM zero pages were unmerged\n");
+unmap:
+	munmap(map, size);
+}
+
 static void test_unmerge_discarded(void)
 {
 	const unsigned int size = 2 * MiB;
@@ -358,7 +451,7 @@ static void test_prctl_unmerge(void)
 
 int main(int argc, char **argv)
 {
-	unsigned int tests = 5;
+	unsigned int tests = 6;
 	int err;
 
 #ifdef __NR_userfaultfd
@@ -379,8 +472,11 @@ int main(int argc, char **argv)
 	pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
 	if (pagemap_fd < 0)
 		ksft_exit_skip("open(\"/proc/self/pagemap\") failed\n");
+	proc_self_ksm_stat_fd = open("/proc/self/ksm_stat", O_RDONLY);
+	ksm_use_zero_pages_fd = open("/sys/kernel/mm/ksm/use_zero_pages", O_RDWR);
 
 	test_unmerge();
+	test_unmerge_zero_pages();
 	test_unmerge_discarded();
 #ifdef __NR_userfaultfd
 	test_unmerge_uffd_wp();
-- 
2.42.0

From fb826c80eee7ebf2b9dfeca032b64ac680558a6f Mon Sep 17 00:00:00 2001
From: Peter Jung <admin@ptr1337.dev>
Date: Wed, 13 Sep 2023 14:33:28 +0200
Subject: [PATCH 7/7] zstd

Signed-off-by: Peter Jung <admin@ptr1337.dev>
---
 include/linux/zstd.h                          |    2 +-
 include/linux/zstd_errors.h                   |   23 +-
 include/linux/zstd_lib.h                      |  697 +++++--
 lib/zstd/Makefile                             |    2 +-
 lib/zstd/common/allocations.h                 |   56 +
 lib/zstd/common/bits.h                        |  149 ++
 lib/zstd/common/bitstream.h                   |   53 +-
 lib/zstd/common/compiler.h                    |   14 +-
 lib/zstd/common/cpu.h                         |    3 +-
 lib/zstd/common/debug.c                       |    3 +-
 lib/zstd/common/debug.h                       |    3 +-
 lib/zstd/common/entropy_common.c              |   42 +-
 lib/zstd/common/error_private.c               |   12 +-
 lib/zstd/common/error_private.h               |    3 +-
 lib/zstd/common/fse.h                         |   89 +-
 lib/zstd/common/fse_decompress.c              |   94 +-
 lib/zstd/common/huf.h                         |  222 +--
 lib/zstd/common/mem.h                         |    2 +-
 lib/zstd/common/portability_macros.h          |   26 +-
 lib/zstd/common/zstd_common.c                 |   38 +-
 lib/zstd/common/zstd_deps.h                   |   20 +-
 lib/zstd/common/zstd_internal.h               |   99 +-
 lib/zstd/compress/clevels.h                   |    3 +-
 lib/zstd/compress/fse_compress.c              |   59 +-
 lib/zstd/compress/hist.c                      |    3 +-
 lib/zstd/compress/hist.h                      |    3 +-
 lib/zstd/compress/huf_compress.c              |  372 ++--
 lib/zstd/compress/zstd_compress.c             | 1762 ++++++++++++-----
 lib/zstd/compress/zstd_compress_internal.h    |  333 +++-
 lib/zstd/compress/zstd_compress_literals.c    |  155 +-
 lib/zstd/compress/zstd_compress_literals.h    |   25 +-
 lib/zstd/compress/zstd_compress_sequences.c   |    7 +-
 lib/zstd/compress/zstd_compress_sequences.h   |    3 +-
 lib/zstd/compress/zstd_compress_superblock.c  |   47 +-
 lib/zstd/compress/zstd_compress_superblock.h  |    3 +-
 lib/zstd/compress/zstd_cwksp.h                |  149 +-
 lib/zstd/compress/zstd_double_fast.c          |  129 +-
 lib/zstd/compress/zstd_double_fast.h          |    6 +-
 lib/zstd/compress/zstd_fast.c                 |  582 ++++--
 lib/zstd/compress/zstd_fast.h                 |    6 +-
 lib/zstd/compress/zstd_lazy.c                 |  518 ++---
 lib/zstd/compress/zstd_lazy.h                 |    7 +-
 lib/zstd/compress/zstd_ldm.c                  |   11 +-
 lib/zstd/compress/zstd_ldm.h                  |    3 +-
 lib/zstd/compress/zstd_ldm_geartab.h          |    3 +-
 lib/zstd/compress/zstd_opt.c                  |  187 +-
 lib/zstd/compress/zstd_opt.h                  |    3 +-
 lib/zstd/decompress/huf_decompress.c          |  731 ++++---
 lib/zstd/decompress/zstd_ddict.c              |    9 +-
 lib/zstd/decompress/zstd_ddict.h              |    3 +-
 lib/zstd/decompress/zstd_decompress.c         |  261 ++-
 lib/zstd/decompress/zstd_decompress_block.c   |  283 ++-
 lib/zstd/decompress/zstd_decompress_block.h   |    8 +-
 .../decompress/zstd_decompress_internal.h     |    7 +-
 lib/zstd/decompress_sources.h                 |    2 +-
 lib/zstd/zstd_common_module.c                 |    5 +-
 lib/zstd/zstd_compress_module.c               |    2 +-
 lib/zstd/zstd_decompress_module.c             |    2 +-
 58 files changed, 4751 insertions(+), 2593 deletions(-)
 create mode 100644 lib/zstd/common/allocations.h
 create mode 100644 lib/zstd/common/bits.h

diff --git a/include/linux/zstd.h b/include/linux/zstd.h
index 113408eef6ec..f109d49f43f8 100644
--- a/include/linux/zstd.h
+++ b/include/linux/zstd.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
diff --git a/include/linux/zstd_errors.h b/include/linux/zstd_errors.h
index 58b6dd45a969..6d5cf55f0bf3 100644
--- a/include/linux/zstd_errors.h
+++ b/include/linux/zstd_errors.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -17,8 +18,17 @@
 
 
 /* =====   ZSTDERRORLIB_API : control library symbols visibility   ===== */
-#define ZSTDERRORLIB_VISIBILITY 
-#define ZSTDERRORLIB_API ZSTDERRORLIB_VISIBILITY
+#define ZSTDERRORLIB_VISIBLE 
+
+#ifndef ZSTDERRORLIB_HIDDEN
+#  if (__GNUC__ >= 4) && !defined(__MINGW32__)
+#    define ZSTDERRORLIB_HIDDEN __attribute__ ((visibility ("hidden")))
+#  else
+#    define ZSTDERRORLIB_HIDDEN
+#  endif
+#endif
+
+#define ZSTDERRORLIB_API ZSTDERRORLIB_VISIBLE
 
 /*-*********************************************
  *  Error codes list
@@ -43,14 +53,17 @@ typedef enum {
   ZSTD_error_frameParameter_windowTooLarge = 16,
   ZSTD_error_corruption_detected = 20,
   ZSTD_error_checksum_wrong      = 22,
+  ZSTD_error_literals_headerWrong = 24,
   ZSTD_error_dictionary_corrupted      = 30,
   ZSTD_error_dictionary_wrong          = 32,
   ZSTD_error_dictionaryCreation_failed = 34,
   ZSTD_error_parameter_unsupported   = 40,
+  ZSTD_error_parameter_combination_unsupported = 41,
   ZSTD_error_parameter_outOfBound    = 42,
   ZSTD_error_tableLog_tooLarge       = 44,
   ZSTD_error_maxSymbolValue_tooLarge = 46,
   ZSTD_error_maxSymbolValue_tooSmall = 48,
+  ZSTD_error_stabilityCondition_notRespected = 50,
   ZSTD_error_stage_wrong       = 60,
   ZSTD_error_init_missing      = 62,
   ZSTD_error_memory_allocation = 64,
@@ -58,11 +71,15 @@ typedef enum {
   ZSTD_error_dstSize_tooSmall = 70,
   ZSTD_error_srcSize_wrong    = 72,
   ZSTD_error_dstBuffer_null   = 74,
+  ZSTD_error_noForwardProgress_destFull = 80,
+  ZSTD_error_noForwardProgress_inputEmpty = 82,
   /* following error codes are __NOT STABLE__, they can be removed or changed in future versions */
   ZSTD_error_frameIndex_tooLarge = 100,
   ZSTD_error_seekableIO          = 102,
   ZSTD_error_dstBuffer_wrong     = 104,
   ZSTD_error_srcBuffer_wrong     = 105,
+  ZSTD_error_sequenceProducer_failed = 106,
+  ZSTD_error_externalSequences_invalid = 107,
   ZSTD_error_maxCode = 120  /* never EVER use this value directly, it can change in future versions! Use ZSTD_isError() instead */
 } ZSTD_ErrorCode;
 
diff --git a/include/linux/zstd_lib.h b/include/linux/zstd_lib.h
index 79d55465d5c1..8b4ffe649df5 100644
--- a/include/linux/zstd_lib.h
+++ b/include/linux/zstd_lib.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -11,23 +12,42 @@
 #ifndef ZSTD_H_235446
 #define ZSTD_H_235446
 
-/* ======   Dependency   ======*/
+/* ======   Dependencies   ======*/
 #include <linux/limits.h>   /* INT_MAX */
 #include <linux/types.h>   /* size_t */
 
 
 /* =====   ZSTDLIB_API : control library symbols visibility   ===== */
-#ifndef ZSTDLIB_VISIBLE
+#define ZSTDLIB_VISIBLE 
+
+#ifndef ZSTDLIB_HIDDEN
 #  if (__GNUC__ >= 4) && !defined(__MINGW32__)
-#    define ZSTDLIB_VISIBLE __attribute__ ((visibility ("default")))
 #    define ZSTDLIB_HIDDEN __attribute__ ((visibility ("hidden")))
 #  else
-#    define ZSTDLIB_VISIBLE
 #    define ZSTDLIB_HIDDEN
 #  endif
 #endif
+
 #define ZSTDLIB_API ZSTDLIB_VISIBLE
 
+/* Deprecation warnings :
+ * Should these warnings be a problem, it is generally possible to disable them,
+ * typically with -Wno-deprecated-declarations for gcc or _CRT_SECURE_NO_WARNINGS in Visual.
+ * Otherwise, it's also possible to define ZSTD_DISABLE_DEPRECATE_WARNINGS.
+ */
+#ifdef ZSTD_DISABLE_DEPRECATE_WARNINGS
+#  define ZSTD_DEPRECATED(message) /* disable deprecation warnings */
+#else
+#  if (defined(GNUC) && (GNUC > 4 || (GNUC == 4 && GNUC_MINOR >= 5))) || defined(__clang__)
+#    define ZSTD_DEPRECATED(message) __attribute__((deprecated(message)))
+#  elif (__GNUC__ >= 3)
+#    define ZSTD_DEPRECATED(message) __attribute__((deprecated))
+#  else
+#    pragma message("WARNING: You need to implement ZSTD_DEPRECATED for this compiler")
+#    define ZSTD_DEPRECATED(message)
+#  endif
+#endif /* ZSTD_DISABLE_DEPRECATE_WARNINGS */
+
 
 /* *****************************************************************************
   Introduction
@@ -65,7 +85,7 @@
 /*------   Version   ------*/
 #define ZSTD_VERSION_MAJOR    1
 #define ZSTD_VERSION_MINOR    5
-#define ZSTD_VERSION_RELEASE  2
+#define ZSTD_VERSION_RELEASE  5
 #define ZSTD_VERSION_NUMBER  (ZSTD_VERSION_MAJOR *100*100 + ZSTD_VERSION_MINOR *100 + ZSTD_VERSION_RELEASE)
 
 /*! ZSTD_versionNumber() :
@@ -107,7 +127,8 @@ ZSTDLIB_API const char* ZSTD_versionString(void);
 ***************************************/
 /*! ZSTD_compress() :
  *  Compresses `src` content as a single zstd compressed frame into already allocated `dst`.
- *  Hint : compression runs faster if `dstCapacity` >=  `ZSTD_compressBound(srcSize)`.
+ *  NOTE: Providing `dstCapacity >= ZSTD_compressBound(srcSize)` guarantees that zstd will have
+ *        enough space to successfully compress the data.
  *  @return : compressed size written into `dst` (<= `dstCapacity),
  *            or an error code if it fails (which can be tested using ZSTD_isError()). */
 ZSTDLIB_API size_t ZSTD_compress( void* dst, size_t dstCapacity,
@@ -156,7 +177,9 @@ ZSTDLIB_API unsigned long long ZSTD_getFrameContentSize(const void *src, size_t
  *  "empty", "unknown" and "error" results to the same return value (0),
  *  while ZSTD_getFrameContentSize() gives them separate return values.
  * @return : decompressed size of `src` frame content _if known and not empty_, 0 otherwise. */
-ZSTDLIB_API unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize);
+ZSTD_DEPRECATED("Replaced by ZSTD_getFrameContentSize")
+ZSTDLIB_API
+unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize);
 
 /*! ZSTD_findFrameCompressedSize() : Requires v1.4.0+
  * `src` should point to the start of a ZSTD frame or skippable frame.
@@ -168,8 +191,30 @@ ZSTDLIB_API size_t ZSTD_findFrameCompressedSize(const void* src, size_t srcSize)
 
 
 /*======  Helper functions  ======*/
-#define ZSTD_COMPRESSBOUND(srcSize)   ((srcSize) + ((srcSize)>>8) + (((srcSize) < (128<<10)) ? (((128<<10) - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0))  /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */
-ZSTDLIB_API size_t      ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case single-pass scenario */
+/* ZSTD_compressBound() :
+ * maximum compressed size in worst case single-pass scenario.
+ * When invoking `ZSTD_compress()` or any other one-pass compression function,
+ * it's recommended to provide @dstCapacity >= ZSTD_compressBound(srcSize)
+ * as it eliminates one potential failure scenario,
+ * aka not enough room in dst buffer to write the compressed frame.
+ * Note : ZSTD_compressBound() itself can fail, if @srcSize > ZSTD_MAX_INPUT_SIZE .
+ *        In which case, ZSTD_compressBound() will return an error code
+ *        which can be tested using ZSTD_isError().
+ *
+ * ZSTD_COMPRESSBOUND() :
+ * same as ZSTD_compressBound(), but as a macro.
+ * It can be used to produce constants, which can be useful for static allocation,
+ * for example to size a static array on stack.
+ * Will produce constant value 0 if srcSize too large.
+ */
+#define ZSTD_MAX_INPUT_SIZE ((sizeof(size_t)==8) ? 0xFF00FF00FF00FF00LLU : 0xFF00FF00U)
+#define ZSTD_COMPRESSBOUND(srcSize)   (((size_t)(srcSize) >= ZSTD_MAX_INPUT_SIZE) ? 0 : (srcSize) + ((srcSize)>>8) + (((srcSize) < (128<<10)) ? (((128<<10) - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0))  /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */
+ZSTDLIB_API size_t ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case single-pass scenario */
+/* ZSTD_isError() :
+ * Most ZSTD_* functions returning a size_t value can be tested for error,
+ * using ZSTD_isError().
+ * @return 1 if error, 0 otherwise
+ */
 ZSTDLIB_API unsigned    ZSTD_isError(size_t code);          /*!< tells if a `size_t` function result is an error code */
 ZSTDLIB_API const char* ZSTD_getErrorName(size_t code);     /*!< provides readable string from an error code */
 ZSTDLIB_API int         ZSTD_minCLevel(void);               /*!< minimum negative compression level allowed, requires v1.4.0+ */
@@ -412,6 +457,9 @@ typedef enum {
      * ZSTD_c_validateSequences
      * ZSTD_c_useBlockSplitter
      * ZSTD_c_useRowMatchFinder
+     * ZSTD_c_prefetchCDictTables
+     * ZSTD_c_enableSeqProducerFallback
+     * ZSTD_c_maxBlockSize
      * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them.
      * note : never ever use experimentalParam? names directly;
      *        also, the enums values themselves are unstable and can still change.
@@ -430,7 +478,11 @@ typedef enum {
      ZSTD_c_experimentalParam12=1009,
      ZSTD_c_experimentalParam13=1010,
      ZSTD_c_experimentalParam14=1011,
-     ZSTD_c_experimentalParam15=1012
+     ZSTD_c_experimentalParam15=1012,
+     ZSTD_c_experimentalParam16=1013,
+     ZSTD_c_experimentalParam17=1014,
+     ZSTD_c_experimentalParam18=1015,
+     ZSTD_c_experimentalParam19=1016
 } ZSTD_cParameter;
 
 typedef struct {
@@ -493,7 +545,7 @@ typedef enum {
  *                  They will be used to compress next frame.
  *                  Resetting session never fails.
  *  - The parameters : changes all parameters back to "default".
- *                  This removes any reference to any dictionary too.
+ *                  This also removes any reference to any dictionary or external sequence producer.
  *                  Parameters can only be changed between 2 sessions (i.e. no compression is currently ongoing)
  *                  otherwise the reset fails, and function returns an error value (which can be tested using ZSTD_isError())
  *  - Both : similar to resetting the session, followed by resetting parameters.
@@ -506,7 +558,8 @@ ZSTDLIB_API size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset);
  *  Should cctx hold data from a previously unfinished frame, everything about it is forgotten.
  *  - Compression parameters are pushed into CCtx before starting compression, using ZSTD_CCtx_set*()
  *  - The function is always blocking, returns when compression is completed.
- *  Hint : compression runs faster if `dstCapacity` >=  `ZSTD_compressBound(srcSize)`.
+ *  NOTE: Providing `dstCapacity >= ZSTD_compressBound(srcSize)` guarantees that zstd will have
+ *        enough space to successfully compress the data, though it is possible it fails for other reasons.
  * @return : compressed size written into `dst` (<= `dstCapacity),
  *           or an error code if it fails (which can be tested using ZSTD_isError()).
  */
@@ -543,13 +596,15 @@ typedef enum {
      * ZSTD_d_stableOutBuffer
      * ZSTD_d_forceIgnoreChecksum
      * ZSTD_d_refMultipleDDicts
+     * ZSTD_d_disableHuffmanAssembly
      * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them.
      * note : never ever use experimentalParam? names directly
      */
      ZSTD_d_experimentalParam1=1000,
      ZSTD_d_experimentalParam2=1001,
      ZSTD_d_experimentalParam3=1002,
-     ZSTD_d_experimentalParam4=1003
+     ZSTD_d_experimentalParam4=1003,
+     ZSTD_d_experimentalParam5=1004
 
 } ZSTD_dParameter;
 
@@ -728,8 +783,6 @@ ZSTDLIB_API size_t ZSTD_CStreamOutSize(void);   /*< recommended size for output
  * This following is a legacy streaming API, available since v1.0+ .
  * It can be replaced by ZSTD_CCtx_reset() and ZSTD_compressStream2().
  * It is redundant, but remains fully supported.
- * Streaming in combination with advanced parameters and dictionary compression
- * can only be used through the new API.
  ******************************************************************************/
 
 /*!
@@ -738,6 +791,9 @@ ZSTDLIB_API size_t ZSTD_CStreamOutSize(void);   /*< recommended size for output
  *     ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
  *     ZSTD_CCtx_refCDict(zcs, NULL); // clear the dictionary (if any)
  *     ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel);
+ *
+ * Note that ZSTD_initCStream() clears any previously set dictionary. Use the new API
+ * to compress with a dictionary.
  */
 ZSTDLIB_API size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel);
 /*!
@@ -788,13 +844,31 @@ ZSTDLIB_API size_t ZSTD_freeDStream(ZSTD_DStream* zds);  /* accept NULL pointer
 
 /*===== Streaming decompression functions =====*/
 
-/* This function is redundant with the advanced API and equivalent to:
+/*! ZSTD_initDStream() :
+ * Initialize/reset DStream state for new decompression operation.
+ * Call before new decompression operation using same DStream.
  *
+ * Note : This function is redundant with the advanced API and equivalent to:
  *     ZSTD_DCtx_reset(zds, ZSTD_reset_session_only);
  *     ZSTD_DCtx_refDDict(zds, NULL);
  */
 ZSTDLIB_API size_t ZSTD_initDStream(ZSTD_DStream* zds);
 
+/*! ZSTD_decompressStream() :
+ * Streaming decompression function.
+ * Call repetitively to consume full input updating it as necessary.
+ * Function will update both input and output `pos` fields exposing current state via these fields:
+ * - `input.pos < input.size`, some input remaining and caller should provide remaining input
+ *   on the next call.
+ * - `output.pos < output.size`, decoder finished and flushed all remaining buffers.
+ * - `output.pos == output.size`, potentially uncflushed data present in the internal buffers,
+ *   call ZSTD_decompressStream() again to flush remaining data to output.
+ * Note : with no additional input, amount of data flushed <= ZSTD_BLOCKSIZE_MAX.
+ *
+ * @return : 0 when a frame is completely decoded and fully flushed,
+ *           or an error code, which can be tested using ZSTD_isError(),
+ *           or any other value > 0, which means there is some decoding or flushing to do to complete current frame.
+ */
 ZSTDLIB_API size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inBuffer* input);
 
 ZSTDLIB_API size_t ZSTD_DStreamInSize(void);    /*!< recommended size for input buffer */
@@ -913,7 +987,7 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict);
  *  If @return == 0, the dictID could not be decoded.
  *  This could for one of the following reasons :
  *  - The frame does not require a dictionary to be decoded (most common case).
- *  - The frame was built with dictID intentionally removed. Whatever dictionary is necessary is a hidden information.
+ *  - The frame was built with dictID intentionally removed. Whatever dictionary is necessary is a hidden piece of information.
  *    Note : this use case also happens when using a non-conformant dictionary.
  *  - `srcSize` is too small, and as a result, the frame header could not be decoded (only possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`).
  *  - This is not a Zstandard frame.
@@ -925,9 +999,11 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize);
  * Advanced dictionary and prefix API (Requires v1.4.0+)
  *
  * This API allows dictionaries to be used with ZSTD_compress2(),
- * ZSTD_compressStream2(), and ZSTD_decompressDCtx(). Dictionaries are sticky, and
- * only reset with the context is reset with ZSTD_reset_parameters or
- * ZSTD_reset_session_and_parameters. Prefixes are single-use.
+ * ZSTD_compressStream2(), and ZSTD_decompressDCtx().
+ * Dictionaries are sticky, they remain valid when same context is re-used,
+ * they only reset when the context is reset
+ * with ZSTD_reset_parameters or ZSTD_reset_session_and_parameters.
+ * In contrast, Prefixes are single-use.
  ******************************************************************************/
 
 
@@ -937,8 +1013,9 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize);
  * @result : 0, or an error code (which can be tested with ZSTD_isError()).
  *  Special: Loading a NULL (or 0-size) dictionary invalidates previous dictionary,
  *           meaning "return to no-dictionary mode".
- *  Note 1 : Dictionary is sticky, it will be used for all future compressed frames.
- *           To return to "no-dictionary" situation, load a NULL dictionary (or reset parameters).
+ *  Note 1 : Dictionary is sticky, it will be used for all future compressed frames,
+ *           until parameters are reset, a new dictionary is loaded, or the dictionary
+ *           is explicitly invalidated by loading a NULL dictionary.
  *  Note 2 : Loading a dictionary involves building tables.
  *           It's also a CPU consuming operation, with non-negligible impact on latency.
  *           Tables are dependent on compression parameters, and for this reason,
@@ -947,11 +1024,15 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize);
  *           Use experimental ZSTD_CCtx_loadDictionary_byReference() to reference content instead.
  *           In such a case, dictionary buffer must outlive its users.
  *  Note 4 : Use ZSTD_CCtx_loadDictionary_advanced()
- *           to precisely select how dictionary content must be interpreted. */
+ *           to precisely select how dictionary content must be interpreted.
+ *  Note 5 : This method does not benefit from LDM (long distance mode).
+ *           If you want to employ LDM on some large dictionary content,
+ *           prefer employing ZSTD_CCtx_refPrefix() described below.
+ */
 ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, size_t dictSize);
 
 /*! ZSTD_CCtx_refCDict() : Requires v1.4.0+
- *  Reference a prepared dictionary, to be used for all next compressed frames.
+ *  Reference a prepared dictionary, to be used for all future compressed frames.
  *  Note that compression parameters are enforced from within CDict,
  *  and supersede any compression parameter previously set within CCtx.
  *  The parameters ignored are labelled as "superseded-by-cdict" in the ZSTD_cParameter enum docs.
@@ -970,6 +1051,7 @@ ZSTDLIB_API size_t ZSTD_CCtx_refCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict);
  *  Decompression will need same prefix to properly regenerate data.
  *  Compressing with a prefix is similar in outcome as performing a diff and compressing it,
  *  but performs much faster, especially during decompression (compression speed is tunable with compression level).
+ *  This method is compatible with LDM (long distance mode).
  * @result : 0, or an error code (which can be tested with ZSTD_isError()).
  *  Special: Adding any prefix (including NULL) invalidates any previous prefix or dictionary
  *  Note 1 : Prefix buffer is referenced. It **must** outlive compression.
@@ -986,9 +1068,9 @@ ZSTDLIB_API size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx,
                                  const void* prefix, size_t prefixSize);
 
 /*! ZSTD_DCtx_loadDictionary() : Requires v1.4.0+
- *  Create an internal DDict from dict buffer,
- *  to be used to decompress next frames.
- *  The dictionary remains valid for all future frames, until explicitly invalidated.
+ *  Create an internal DDict from dict buffer, to be used to decompress all future frames.
+ *  The dictionary remains valid for all future frames, until explicitly invalidated, or
+ *  a new dictionary is loaded.
  * @result : 0, or an error code (which can be tested with ZSTD_isError()).
  *  Special : Adding a NULL (or 0-size) dictionary invalidates any previous dictionary,
  *            meaning "return to no-dictionary mode".
@@ -1012,9 +1094,10 @@ ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary(ZSTD_DCtx* dctx, const void* dict, s
  *  The memory for the table is allocated on the first call to refDDict, and can be
  *  freed with ZSTD_freeDCtx().
  *
+ *  If called with ZSTD_d_refMultipleDDicts disabled (the default), only one dictionary
+ *  will be managed, and referencing a dictionary effectively "discards" any previous one.
+ *
  * @result : 0, or an error code (which can be tested with ZSTD_isError()).
- *  Note 1 : Currently, only one dictionary can be managed.
- *           Referencing a new dictionary effectively "discards" any previous one.
  *  Special: referencing a NULL DDict means "return to no-dictionary mode".
  *  Note 2 : DDict is just referenced, its lifetime must outlive its usage from DCtx.
  */
@@ -1071,24 +1154,6 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict);
 #define ZSTDLIB_STATIC_API ZSTDLIB_VISIBLE
 #endif
 
-/* Deprecation warnings :
- * Should these warnings be a problem, it is generally possible to disable them,
- * typically with -Wno-deprecated-declarations for gcc or _CRT_SECURE_NO_WARNINGS in Visual.
- * Otherwise, it's also possible to define ZSTD_DISABLE_DEPRECATE_WARNINGS.
- */
-#ifdef ZSTD_DISABLE_DEPRECATE_WARNINGS
-#  define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API  /* disable deprecation warnings */
-#else
-#  if (defined(GNUC) && (GNUC > 4 || (GNUC == 4 && GNUC_MINOR >= 5))) || defined(__clang__)
-#    define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API __attribute__((deprecated(message)))
-#  elif (__GNUC__ >= 3)
-#    define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API __attribute__((deprecated))
-#  else
-#    pragma message("WARNING: You need to implement ZSTD_DEPRECATED for this compiler")
-#    define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API
-#  endif
-#endif /* ZSTD_DISABLE_DEPRECATE_WARNINGS */
-
 /* **************************************************************************************
  *   experimental API (static linking only)
  ****************************************************************************************
@@ -1123,6 +1188,7 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict);
 #define ZSTD_TARGETLENGTH_MIN     0   /* note : comparing this constant to an unsigned results in a tautological test */
 #define ZSTD_STRATEGY_MIN        ZSTD_fast
 #define ZSTD_STRATEGY_MAX        ZSTD_btultra2
+#define ZSTD_BLOCKSIZE_MAX_MIN (1 << 10) /* The minimum valid max blocksize. Maximum blocksizes smaller than this make compressBound() inaccurate. */
 
 
 #define ZSTD_OVERLAPLOG_MIN       0
@@ -1303,7 +1369,7 @@ typedef enum {
 } ZSTD_paramSwitch_e;
 
 /* *************************************
-*  Frame size functions
+*  Frame header and size functions
 ***************************************/
 
 /*! ZSTD_findDecompressedSize() :
@@ -1350,29 +1416,109 @@ ZSTDLIB_STATIC_API unsigned long long ZSTD_decompressBound(const void* src, size
  *           or an error code (if srcSize is too small) */
 ZSTDLIB_STATIC_API size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize);
 
+typedef enum { ZSTD_frame, ZSTD_skippableFrame } ZSTD_frameType_e;
+typedef struct {
+    unsigned long long frameContentSize; /* if == ZSTD_CONTENTSIZE_UNKNOWN, it means this field is not available. 0 means "empty" */
+    unsigned long long windowSize;       /* can be very large, up to <= frameContentSize */
+    unsigned blockSizeMax;
+    ZSTD_frameType_e frameType;          /* if == ZSTD_skippableFrame, frameContentSize is the size of skippable content */
+    unsigned headerSize;
+    unsigned dictID;
+    unsigned checksumFlag;
+    unsigned _reserved1;
+    unsigned _reserved2;
+} ZSTD_frameHeader;
+
+/*! ZSTD_getFrameHeader() :
+ *  decode Frame Header, or requires larger `srcSize`.
+ * @return : 0, `zfhPtr` is correctly filled,
+ *          >0, `srcSize` is too small, value is wanted `srcSize` amount,
+ *           or an error code, which can be tested using ZSTD_isError() */
+ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize);   /*< doesn't consume input */
+/*! ZSTD_getFrameHeader_advanced() :
+ *  same as ZSTD_getFrameHeader(),
+ *  with added capability to select a format (like ZSTD_f_zstd1_magicless) */
+ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format);
+
+/*! ZSTD_decompressionMargin() :
+ * Zstd supports in-place decompression, where the input and output buffers overlap.
+ * In this case, the output buffer must be at least (Margin + Output_Size) bytes large,
+ * and the input buffer must be at the end of the output buffer.
+ *
+ *  _______________________ Output Buffer ________________________
+ * |                                                              |
+ * |                                        ____ Input Buffer ____|
+ * |                                       |                      |
+ * v                                       v                      v
+ * |---------------------------------------|-----------|----------|
+ * ^                                                   ^          ^
+ * |___________________ Output_Size ___________________|_ Margin _|
+ *
+ * NOTE: See also ZSTD_DECOMPRESSION_MARGIN().
+ * NOTE: This applies only to single-pass decompression through ZSTD_decompress() or
+ * ZSTD_decompressDCtx().
+ * NOTE: This function supports multi-frame input.
+ *
+ * @param src The compressed frame(s)
+ * @param srcSize The size of the compressed frame(s)
+ * @returns The decompression margin or an error that can be checked with ZSTD_isError().
+ */
+ZSTDLIB_STATIC_API size_t ZSTD_decompressionMargin(const void* src, size_t srcSize);
+
+/*! ZSTD_DECOMPRESS_MARGIN() :
+ * Similar to ZSTD_decompressionMargin(), but instead of computing the margin from
+ * the compressed frame, compute it from the original size and the blockSizeLog.
+ * See ZSTD_decompressionMargin() for details.
+ *
+ * WARNING: This macro does not support multi-frame input, the input must be a single
+ * zstd frame. If you need that support use the function, or implement it yourself.
+ *
+ * @param originalSize The original uncompressed size of the data.
+ * @param blockSize    The block size == MIN(windowSize, ZSTD_BLOCKSIZE_MAX).
+ *                     Unless you explicitly set the windowLog smaller than
+ *                     ZSTD_BLOCKSIZELOG_MAX you can just use ZSTD_BLOCKSIZE_MAX.
+ */
+#define ZSTD_DECOMPRESSION_MARGIN(originalSize, blockSize) ((size_t)(                                              \
+        ZSTD_FRAMEHEADERSIZE_MAX                                                              /* Frame header */ + \
+        4                                                                                         /* checksum */ + \
+        ((originalSize) == 0 ? 0 : 3 * (((originalSize) + (blockSize) - 1) / blockSize)) /* 3 bytes per block */ + \
+        (blockSize)                                                                    /* One block of margin */   \
+    ))
+
 typedef enum {
   ZSTD_sf_noBlockDelimiters = 0,         /* Representation of ZSTD_Sequence has no block delimiters, sequences only */
   ZSTD_sf_explicitBlockDelimiters = 1    /* Representation of ZSTD_Sequence contains explicit block delimiters */
 } ZSTD_sequenceFormat_e;
 
+/*! ZSTD_sequenceBound() :
+ * `srcSize` : size of the input buffer
+ *  @return : upper-bound for the number of sequences that can be generated
+ *            from a buffer of srcSize bytes
+ *
+ *  note : returns number of sequences - to get bytes, multiply by sizeof(ZSTD_Sequence).
+ */
+ZSTDLIB_STATIC_API size_t ZSTD_sequenceBound(size_t srcSize);
+
 /*! ZSTD_generateSequences() :
- * Generate sequences using ZSTD_compress2, given a source buffer.
+ * Generate sequences using ZSTD_compress2(), given a source buffer.
  *
  * Each block will end with a dummy sequence
  * with offset == 0, matchLength == 0, and litLength == length of last literals.
  * litLength may be == 0, and if so, then the sequence of (of: 0 ml: 0 ll: 0)
  * simply acts as a block delimiter.
  *
- * zc can be used to insert custom compression params.
- * This function invokes ZSTD_compress2
+ * @zc can be used to insert custom compression params.
+ * This function invokes ZSTD_compress2().
  *
  * The output of this function can be fed into ZSTD_compressSequences() with CCtx
  * setting of ZSTD_c_blockDelimiters as ZSTD_sf_explicitBlockDelimiters
  * @return : number of sequences generated
  */
 
-ZSTDLIB_STATIC_API size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs,
-                                          size_t outSeqsSize, const void* src, size_t srcSize);
+ZSTDLIB_STATIC_API size_t
+ZSTD_generateSequences( ZSTD_CCtx* zc,
+                        ZSTD_Sequence* outSeqs, size_t outSeqsSize,
+                        const void* src, size_t srcSize);
 
 /*! ZSTD_mergeBlockDelimiters() :
  * Given an array of ZSTD_Sequence, remove all sequences that represent block delimiters/last literals
@@ -1388,7 +1534,9 @@ ZSTDLIB_STATIC_API size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* o
 ZSTDLIB_STATIC_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, size_t seqsSize);
 
 /*! ZSTD_compressSequences() :
- * Compress an array of ZSTD_Sequence, generated from the original source buffer, into dst.
+ * Compress an array of ZSTD_Sequence, associated with @src buffer, into dst.
+ * @src contains the entire input (not just the literals).
+ * If @srcSize > sum(sequence.length), the remaining bytes are considered all literals
  * If a dictionary is included, then the cctx should reference the dict. (see: ZSTD_CCtx_refCDict(), ZSTD_CCtx_loadDictionary(), etc.)
  * The entire source is compressed into a single frame.
  *
@@ -1413,11 +1561,12 @@ ZSTDLIB_STATIC_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, si
  * Note: Repcodes are, as of now, always re-calculated within this function, so ZSTD_Sequence::rep is unused.
  * Note 2: Once we integrate ability to ingest repcodes, the explicit block delims mode must respect those repcodes exactly,
  *         and cannot emit an RLE block that disagrees with the repcode history
- * @return : final compressed size or a ZSTD error.
+ * @return : final compressed size, or a ZSTD error code.
  */
-ZSTDLIB_STATIC_API size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstSize,
-                                  const ZSTD_Sequence* inSeqs, size_t inSeqsSize,
-                                  const void* src, size_t srcSize);
+ZSTDLIB_STATIC_API size_t
+ZSTD_compressSequences( ZSTD_CCtx* cctx, void* dst, size_t dstSize,
+                        const ZSTD_Sequence* inSeqs, size_t inSeqsSize,
+                        const void* src, size_t srcSize);
 
 
 /*! ZSTD_writeSkippableFrame() :
@@ -1481,8 +1630,11 @@ ZSTDLIB_API unsigned ZSTD_isSkippableFrame(const void* buffer, size_t size);
  *  and ZSTD_estimateCCtxSize_usingCCtxParams(), which can be used in tandem with ZSTD_CCtxParams_setParameter().
  *  Both can be used to estimate memory using custom compression parameters and arbitrary srcSize limits.
  *
- *  Note 2 : only single-threaded compression is supported.
+ *  Note : only single-threaded compression is supported.
  *  ZSTD_estimateCCtxSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1.
+ *
+ *  Note 2 : ZSTD_estimateCCtxSize* functions are not compatible with the Block-Level Sequence Producer API at this time.
+ *  Size estimates assume that no external sequence producer is registered.
  */
 ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize(int compressionLevel);
 ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams);
@@ -1501,7 +1653,12 @@ ZSTDLIB_STATIC_API size_t ZSTD_estimateDCtxSize(void);
  *  or deducted from a valid frame Header, using ZSTD_estimateDStreamSize_fromFrame();
  *  Note : if streaming is init with function ZSTD_init?Stream_usingDict(),
  *         an internal ?Dict will be created, which additional size is not estimated here.
- *         In this case, get total size by adding ZSTD_estimate?DictSize */
+ *         In this case, get total size by adding ZSTD_estimate?DictSize
+ *  Note 2 : only single-threaded compression is supported.
+ *  ZSTD_estimateCStreamSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1.
+ *  Note 3 : ZSTD_estimateCStreamSize* functions are not compatible with the Block-Level Sequence Producer API at this time.
+ *  Size estimates assume that no external sequence producer is registered.
+ */
 ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize(int compressionLevel);
 ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionParameters cParams);
 ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params);
@@ -1649,22 +1806,45 @@ ZSTDLIB_STATIC_API size_t ZSTD_checkCParams(ZSTD_compressionParameters params);
  *  This function never fails (wide contract) */
 ZSTDLIB_STATIC_API ZSTD_compressionParameters ZSTD_adjustCParams(ZSTD_compressionParameters cPar, unsigned long long srcSize, size_t dictSize);
 
+/*! ZSTD_CCtx_setCParams() :
+ *  Set all parameters provided within @p cparams into the working @p cctx.
+ *  Note : if modifying parameters during compression (MT mode only),
+ *         note that changes to the .windowLog parameter will be ignored.
+ * @return 0 on success, or an error code (can be checked with ZSTD_isError()).
+ *         On failure, no parameters are updated.
+ */
+ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setCParams(ZSTD_CCtx* cctx, ZSTD_compressionParameters cparams);
+
+/*! ZSTD_CCtx_setFParams() :
+ *  Set all parameters provided within @p fparams into the working @p cctx.
+ * @return 0 on success, or an error code (can be checked with ZSTD_isError()).
+ */
+ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setFParams(ZSTD_CCtx* cctx, ZSTD_frameParameters fparams);
+
+/*! ZSTD_CCtx_setParams() :
+ *  Set all parameters provided within @p params into the working @p cctx.
+ * @return 0 on success, or an error code (can be checked with ZSTD_isError()).
+ */
+ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setParams(ZSTD_CCtx* cctx, ZSTD_parameters params);
+
 /*! ZSTD_compress_advanced() :
  *  Note : this function is now DEPRECATED.
  *         It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_setParameter() and other parameter setters.
  *  This prototype will generate compilation warnings. */
 ZSTD_DEPRECATED("use ZSTD_compress2")
+ZSTDLIB_STATIC_API
 size_t ZSTD_compress_advanced(ZSTD_CCtx* cctx,
-                                          void* dst, size_t dstCapacity,
-                                    const void* src, size_t srcSize,
-                                    const void* dict,size_t dictSize,
-                                          ZSTD_parameters params);
+                              void* dst, size_t dstCapacity,
+                        const void* src, size_t srcSize,
+                        const void* dict,size_t dictSize,
+                              ZSTD_parameters params);
 
 /*! ZSTD_compress_usingCDict_advanced() :
  *  Note : this function is now DEPRECATED.
  *         It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_loadDictionary() and other parameter setters.
  *  This prototype will generate compilation warnings. */
 ZSTD_DEPRECATED("use ZSTD_compress2 with ZSTD_CCtx_loadDictionary")
+ZSTDLIB_STATIC_API
 size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx,
                                               void* dst, size_t dstCapacity,
                                         const void* src, size_t srcSize,
@@ -1808,13 +1988,16 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo
  * Experimental parameter.
  * Default is 0 == disabled. Set to 1 to enable.
  *
- * Tells the compressor that the ZSTD_inBuffer will ALWAYS be the same
- * between calls, except for the modifications that zstd makes to pos (the
- * caller must not modify pos). This is checked by the compressor, and
- * compression will fail if it ever changes. This means the only flush
- * mode that makes sense is ZSTD_e_end, so zstd will error if ZSTD_e_end
- * is not used. The data in the ZSTD_inBuffer in the range [src, src + pos)
- * MUST not be modified during compression or you will get data corruption.
+ * Tells the compressor that input data presented with ZSTD_inBuffer
+ * will ALWAYS be the same between calls.
+ * Technically, the @src pointer must never be changed,
+ * and the @pos field can only be updated by zstd.
+ * However, it's possible to increase the @size field,
+ * allowing scenarios where more data can be appended after compressions starts.
+ * These conditions are checked by the compressor,
+ * and compression will fail if they are not respected.
+ * Also, data in the ZSTD_inBuffer within the range [src, src + pos)
+ * MUST not be modified during compression or it will result in data corruption.
  *
  * When this flag is enabled zstd won't allocate an input window buffer,
  * because the user guarantees it can reference the ZSTD_inBuffer until
@@ -1822,18 +2005,15 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo
  * large enough to fit a block (see ZSTD_c_stableOutBuffer). This will also
  * avoid the memcpy() from the input buffer to the input window buffer.
  *
- * NOTE: ZSTD_compressStream2() will error if ZSTD_e_end is not used.
- * That means this flag cannot be used with ZSTD_compressStream().
- *
  * NOTE: So long as the ZSTD_inBuffer always points to valid memory, using
  * this flag is ALWAYS memory safe, and will never access out-of-bounds
- * memory. However, compression WILL fail if you violate the preconditions.
+ * memory. However, compression WILL fail if conditions are not respected.
  *
- * WARNING: The data in the ZSTD_inBuffer in the range [dst, dst + pos) MUST
- * not be modified during compression or you will get data corruption. This
- * is because zstd needs to reference data in the ZSTD_inBuffer to find
+ * WARNING: The data in the ZSTD_inBuffer in the range [src, src + pos) MUST
+ * not be modified during compression or it will result in data corruption.
+ * This is because zstd needs to reference data in the ZSTD_inBuffer to find
  * matches. Normally zstd maintains its own window buffer for this purpose,
- * but passing this flag tells zstd to use the user provided buffer.
+ * but passing this flag tells zstd to rely on user provided buffer instead.
  */
 #define ZSTD_c_stableInBuffer ZSTD_c_experimentalParam9
 
@@ -1878,7 +2058,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo
  * Without validation, providing a sequence that does not conform to the zstd spec will cause
  * undefined behavior, and may produce a corrupted block.
  *
- * With validation enabled, a if sequence is invalid (see doc/zstd_compression_format.md for
+ * With validation enabled, if sequence is invalid (see doc/zstd_compression_format.md for
  * specifics regarding offset/matchlength requirements) then the function will bail out and
  * return an error.
  *
@@ -1928,6 +2108,79 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo
  */
 #define ZSTD_c_deterministicRefPrefix ZSTD_c_experimentalParam15
 
+/* ZSTD_c_prefetchCDictTables
+ * Controlled with ZSTD_paramSwitch_e enum. Default is ZSTD_ps_auto.
+ *
+ * In some situations, zstd uses CDict tables in-place rather than copying them
+ * into the working context. (See docs on ZSTD_dictAttachPref_e above for details).
+ * In such situations, compression speed is seriously impacted when CDict tables are
+ * "cold" (outside CPU cache). This parameter instructs zstd to prefetch CDict tables
+ * when they are used in-place.
+ *
+ * For sufficiently small inputs, the cost of the prefetch will outweigh the benefit.
+ * For sufficiently large inputs, zstd will by default memcpy() CDict tables
+ * into the working context, so there is no need to prefetch. This parameter is
+ * targeted at a middle range of input sizes, where a prefetch is cheap enough to be
+ * useful but memcpy() is too expensive. The exact range of input sizes where this
+ * makes sense is best determined by careful experimentation.
+ *
+ * Note: for this parameter, ZSTD_ps_auto is currently equivalent to ZSTD_ps_disable,
+ * but in the future zstd may conditionally enable this feature via an auto-detection
+ * heuristic for cold CDicts.
+ * Use ZSTD_ps_disable to opt out of prefetching under any circumstances.
+ */
+#define ZSTD_c_prefetchCDictTables ZSTD_c_experimentalParam16
+
+/* ZSTD_c_enableSeqProducerFallback
+ * Allowed values are 0 (disable) and 1 (enable). The default setting is 0.
+ *
+ * Controls whether zstd will fall back to an internal sequence producer if an
+ * external sequence producer is registered and returns an error code. This fallback
+ * is block-by-block: the internal sequence producer will only be called for blocks
+ * where the external sequence producer returns an error code. Fallback parsing will
+ * follow any other cParam settings, such as compression level, the same as in a
+ * normal (fully-internal) compression operation.
+ *
+ * The user is strongly encouraged to read the full Block-Level Sequence Producer API
+ * documentation (below) before setting this parameter. */
+#define ZSTD_c_enableSeqProducerFallback ZSTD_c_experimentalParam17
+
+/* ZSTD_c_maxBlockSize
+ * Allowed values are between 1KB and ZSTD_BLOCKSIZE_MAX (128KB).
+ * The default is ZSTD_BLOCKSIZE_MAX, and setting to 0 will set to the default.
+ *
+ * This parameter can be used to set an upper bound on the blocksize
+ * that overrides the default ZSTD_BLOCKSIZE_MAX. It cannot be used to set upper
+ * bounds greater than ZSTD_BLOCKSIZE_MAX or bounds lower than 1KB (will make
+ * compressBound() inaccurate). Only currently meant to be used for testing.
+ *
+ */
+#define ZSTD_c_maxBlockSize ZSTD_c_experimentalParam18
+
+/* ZSTD_c_searchForExternalRepcodes
+ * This parameter affects how zstd parses external sequences, such as sequences
+ * provided through the compressSequences() API or from an external block-level
+ * sequence producer.
+ *
+ * If set to ZSTD_ps_enable, the library will check for repeated offsets in
+ * external sequences, even if those repcodes are not explicitly indicated in
+ * the "rep" field. Note that this is the only way to exploit repcode matches
+ * while using compressSequences() or an external sequence producer, since zstd
+ * currently ignores the "rep" field of external sequences.
+ *
+ * If set to ZSTD_ps_disable, the library will not exploit repeated offsets in
+ * external sequences, regardless of whether the "rep" field has been set. This
+ * reduces sequence compression overhead by about 25% while sacrificing some
+ * compression ratio.
+ *
+ * The default value is ZSTD_ps_auto, for which the library will enable/disable
+ * based on compression level.
+ *
+ * Note: for now, this param only has an effect if ZSTD_c_blockDelimiters is
+ * set to ZSTD_sf_explicitBlockDelimiters. That may change in the future.
+ */
+#define ZSTD_c_searchForExternalRepcodes ZSTD_c_experimentalParam19
+
 /*! ZSTD_CCtx_getParameter() :
  *  Get the requested compression parameter value, selected by enum ZSTD_cParameter,
  *  and store it into int* value.
@@ -2084,7 +2337,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete
  * in the range [dst, dst + pos) MUST not be modified during decompression
  * or you will get data corruption.
  *
- * When this flags is enabled zstd won't allocate an output buffer, because
+ * When this flag is enabled zstd won't allocate an output buffer, because
  * it can write directly to the ZSTD_outBuffer, but it will still allocate
  * an input buffer large enough to fit any compressed block. This will also
  * avoid the memcpy() from the internal output buffer to the ZSTD_outBuffer.
@@ -2137,6 +2390,17 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete
  */
 #define ZSTD_d_refMultipleDDicts ZSTD_d_experimentalParam4
 
+/* ZSTD_d_disableHuffmanAssembly
+ * Set to 1 to disable the Huffman assembly implementation.
+ * The default value is 0, which allows zstd to use the Huffman assembly
+ * implementation if available.
+ *
+ * This parameter can be used to disable Huffman assembly at runtime.
+ * If you want to disable it at compile time you can define the macro
+ * ZSTD_DISABLE_ASM.
+ */
+#define ZSTD_d_disableHuffmanAssembly ZSTD_d_experimentalParam5
+
 
 /*! ZSTD_DCtx_setFormat() :
  *  This function is REDUNDANT. Prefer ZSTD_DCtx_setParameter().
@@ -2145,6 +2409,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete
  *  such ZSTD_f_zstd1_magicless for example.
  * @return : 0, or an error code (which can be tested using ZSTD_isError()). */
 ZSTD_DEPRECATED("use ZSTD_DCtx_setParameter() instead")
+ZSTDLIB_STATIC_API
 size_t ZSTD_DCtx_setFormat(ZSTD_DCtx* dctx, ZSTD_format_e format);
 
 /*! ZSTD_decompressStream_simpleArgs() :
@@ -2181,6 +2446,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_decompressStream_simpleArgs (
  * This prototype will generate compilation warnings.
  */
 ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions")
+ZSTDLIB_STATIC_API
 size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs,
                          int compressionLevel,
                          unsigned long long pledgedSrcSize);
@@ -2198,17 +2464,15 @@ size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs,
  * This prototype will generate compilation warnings.
  */
 ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions")
+ZSTDLIB_STATIC_API
 size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs,
                      const void* dict, size_t dictSize,
                            int compressionLevel);
 
 /*! ZSTD_initCStream_advanced() :
- * This function is DEPRECATED, and is approximately equivalent to:
+ * This function is DEPRECATED, and is equivalent to:
  *     ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
- *     // Pseudocode: Set each zstd parameter and leave the rest as-is.
- *     for ((param, value) : params) {
- *         ZSTD_CCtx_setParameter(zcs, param, value);
- *     }
+ *     ZSTD_CCtx_setParams(zcs, params);
  *     ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize);
  *     ZSTD_CCtx_loadDictionary(zcs, dict, dictSize);
  *
@@ -2218,6 +2482,7 @@ size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs,
  * This prototype will generate compilation warnings.
  */
 ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions")
+ZSTDLIB_STATIC_API
 size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs,
                     const void* dict, size_t dictSize,
                           ZSTD_parameters params,
@@ -2232,15 +2497,13 @@ size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs,
  * This prototype will generate compilation warnings.
  */
 ZSTD_DEPRECATED("use ZSTD_CCtx_reset and ZSTD_CCtx_refCDict, see zstd.h for detailed instructions")
+ZSTDLIB_STATIC_API
 size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict);
 
 /*! ZSTD_initCStream_usingCDict_advanced() :
- *   This function is DEPRECATED, and is approximately equivalent to:
+ *   This function is DEPRECATED, and is equivalent to:
  *     ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
- *     // Pseudocode: Set each zstd frame parameter and leave the rest as-is.
- *     for ((fParam, value) : fParams) {
- *         ZSTD_CCtx_setParameter(zcs, fParam, value);
- *     }
+ *     ZSTD_CCtx_setFParams(zcs, fParams);
  *     ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize);
  *     ZSTD_CCtx_refCDict(zcs, cdict);
  *
@@ -2250,6 +2513,7 @@ size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict);
  * This prototype will generate compilation warnings.
  */
 ZSTD_DEPRECATED("use ZSTD_CCtx_reset and ZSTD_CCtx_refCDict, see zstd.h for detailed instructions")
+ZSTDLIB_STATIC_API
 size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs,
                                const ZSTD_CDict* cdict,
                                      ZSTD_frameParameters fParams,
@@ -2274,6 +2538,7 @@ size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs,
  *  This prototype will generate compilation warnings.
  */
 ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions")
+ZSTDLIB_STATIC_API
 size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pledgedSrcSize);
 
 
@@ -2319,8 +2584,8 @@ ZSTDLIB_STATIC_API size_t ZSTD_toFlushNow(ZSTD_CCtx* cctx);
  *     ZSTD_DCtx_loadDictionary(zds, dict, dictSize);
  *
  * note: no dictionary will be used if dict == NULL or dictSize < 8
- * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
  */
+ZSTD_DEPRECATED("use ZSTD_DCtx_reset + ZSTD_DCtx_loadDictionary, see zstd.h for detailed instructions")
 ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t dictSize);
 
 /*!
@@ -2330,8 +2595,8 @@ ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const vo
  *     ZSTD_DCtx_refDDict(zds, ddict);
  *
  * note : ddict is referenced, it must outlive decompression session
- * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
  */
+ZSTD_DEPRECATED("use ZSTD_DCtx_reset + ZSTD_DCtx_refDDict, see zstd.h for detailed instructions")
 ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const ZSTD_DDict* ddict);
 
 /*!
@@ -2340,17 +2605,185 @@ ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const Z
  *     ZSTD_DCtx_reset(zds, ZSTD_reset_session_only);
  *
  * re-use decompression parameters from previous init; saves dictionary loading
- * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
  */
+ZSTD_DEPRECATED("use ZSTD_DCtx_reset, see zstd.h for detailed instructions")
 ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds);
 
 
+/* ********************* BLOCK-LEVEL SEQUENCE PRODUCER API *********************
+ *
+ * *** OVERVIEW ***
+ * The Block-Level Sequence Producer API allows users to provide their own custom
+ * sequence producer which libzstd invokes to process each block. The produced list
+ * of sequences (literals and matches) is then post-processed by libzstd to produce
+ * valid compressed blocks.
+ *
+ * This block-level offload API is a more granular complement of the existing
+ * frame-level offload API compressSequences() (introduced in v1.5.1). It offers
+ * an easier migration story for applications already integrated with libzstd: the
+ * user application continues to invoke the same compression functions
+ * ZSTD_compress2() or ZSTD_compressStream2() as usual, and transparently benefits
+ * from the specific advantages of the external sequence producer. For example,
+ * the sequence producer could be tuned to take advantage of known characteristics
+ * of the input, to offer better speed / ratio, or could leverage hardware
+ * acceleration not available within libzstd itself.
+ *
+ * See contrib/externalSequenceProducer for an example program employing the
+ * Block-Level Sequence Producer API.
+ *
+ * *** USAGE ***
+ * The user is responsible for implementing a function of type
+ * ZSTD_sequenceProducer_F. For each block, zstd will pass the following
+ * arguments to the user-provided function:
+ *
+ *   - sequenceProducerState: a pointer to a user-managed state for the sequence
+ *     producer.
+ *
+ *   - outSeqs, outSeqsCapacity: an output buffer for the sequence producer.
+ *     outSeqsCapacity is guaranteed >= ZSTD_sequenceBound(srcSize). The memory
+ *     backing outSeqs is managed by the CCtx.
+ *
+ *   - src, srcSize: an input buffer for the sequence producer to parse.
+ *     srcSize is guaranteed to be <= ZSTD_BLOCKSIZE_MAX.
+ *
+ *   - dict, dictSize: a history buffer, which may be empty, which the sequence
+ *     producer may reference as it parses the src buffer. Currently, zstd will
+ *     always pass dictSize == 0 into external sequence producers, but this will
+ *     change in the future.
+ *
+ *   - compressionLevel: a signed integer representing the zstd compression level
+ *     set by the user for the current operation. The sequence producer may choose
+ *     to use this information to change its compression strategy and speed/ratio
+ *     tradeoff. Note: the compression level does not reflect zstd parameters set
+ *     through the advanced API.
+ *
+ *   - windowSize: a size_t representing the maximum allowed offset for external
+ *     sequences. Note that sequence offsets are sometimes allowed to exceed the
+ *     windowSize if a dictionary is present, see doc/zstd_compression_format.md
+ *     for details.
+ *
+ * The user-provided function shall return a size_t representing the number of
+ * sequences written to outSeqs. This return value will be treated as an error
+ * code if it is greater than outSeqsCapacity. The return value must be non-zero
+ * if srcSize is non-zero. The ZSTD_SEQUENCE_PRODUCER_ERROR macro is provided
+ * for convenience, but any value greater than outSeqsCapacity will be treated as
+ * an error code.
+ *
+ * If the user-provided function does not return an error code, the sequences
+ * written to outSeqs must be a valid parse of the src buffer. Data corruption may
+ * occur if the parse is not valid. A parse is defined to be valid if the
+ * following conditions hold:
+ *   - The sum of matchLengths and literalLengths must equal srcSize.
+ *   - All sequences in the parse, except for the final sequence, must have
+ *     matchLength >= ZSTD_MINMATCH_MIN. The final sequence must have
+ *     matchLength >= ZSTD_MINMATCH_MIN or matchLength == 0.
+ *   - All offsets must respect the windowSize parameter as specified in
+ *     doc/zstd_compression_format.md.
+ *   - If the final sequence has matchLength == 0, it must also have offset == 0.
+ *
+ * zstd will only validate these conditions (and fail compression if they do not
+ * hold) if the ZSTD_c_validateSequences cParam is enabled. Note that sequence
+ * validation has a performance cost.
+ *
+ * If the user-provided function returns an error, zstd will either fall back
+ * to an internal sequence producer or fail the compression operation. The user can
+ * choose between the two behaviors by setting the ZSTD_c_enableSeqProducerFallback
+ * cParam. Fallback compression will follow any other cParam settings, such as
+ * compression level, the same as in a normal compression operation.
+ *
+ * The user shall instruct zstd to use a particular ZSTD_sequenceProducer_F
+ * function by calling
+ *         ZSTD_registerSequenceProducer(cctx,
+ *                                       sequenceProducerState,
+ *                                       sequenceProducer)
+ * This setting will persist until the next parameter reset of the CCtx.
+ *
+ * The sequenceProducerState must be initialized by the user before calling
+ * ZSTD_registerSequenceProducer(). The user is responsible for destroying the
+ * sequenceProducerState.
+ *
+ * *** LIMITATIONS ***
+ * This API is compatible with all zstd compression APIs which respect advanced parameters.
+ * However, there are three limitations:
+ *
+ * First, the ZSTD_c_enableLongDistanceMatching cParam is not currently supported.
+ * COMPRESSION WILL FAIL if it is enabled and the user tries to compress with a block-level
+ * external sequence producer.
+ *   - Note that ZSTD_c_enableLongDistanceMatching is auto-enabled by default in some
+ *     cases (see its documentation for details). Users must explicitly set
+ *     ZSTD_c_enableLongDistanceMatching to ZSTD_ps_disable in such cases if an external
+ *     sequence producer is registered.
+ *   - As of this writing, ZSTD_c_enableLongDistanceMatching is disabled by default
+ *     whenever ZSTD_c_windowLog < 128MB, but that's subject to change. Users should
+ *     check the docs on ZSTD_c_enableLongDistanceMatching whenever the Block-Level Sequence
+ *     Producer API is used in conjunction with advanced settings (like ZSTD_c_windowLog).
+ *
+ * Second, history buffers are not currently supported. Concretely, zstd will always pass
+ * dictSize == 0 to the external sequence producer (for now). This has two implications:
+ *   - Dictionaries are not currently supported. Compression will *not* fail if the user
+ *     references a dictionary, but the dictionary won't have any effect.
+ *   - Stream history is not currently supported. All advanced compression APIs, including
+ *     streaming APIs, work with external sequence producers, but each block is treated as
+ *     an independent chunk without history from previous blocks.
+ *
+ * Third, multi-threading within a single compression is not currently supported. In other words,
+ * COMPRESSION WILL FAIL if ZSTD_c_nbWorkers > 0 and an external sequence producer is registered.
+ * Multi-threading across compressions is fine: simply create one CCtx per thread.
+ *
+ * Long-term, we plan to overcome all three limitations. There is no technical blocker to
+ * overcoming them. It is purely a question of engineering effort.
+ */
+
+#define ZSTD_SEQUENCE_PRODUCER_ERROR ((size_t)(-1))
+
+typedef size_t ZSTD_sequenceProducer_F (
+  void* sequenceProducerState,
+  ZSTD_Sequence* outSeqs, size_t outSeqsCapacity,
+  const void* src, size_t srcSize,
+  const void* dict, size_t dictSize,
+  int compressionLevel,
+  size_t windowSize
+);
+
+/*! ZSTD_registerSequenceProducer() :
+ * Instruct zstd to use a block-level external sequence producer function.
+ *
+ * The sequenceProducerState must be initialized by the caller, and the caller is
+ * responsible for managing its lifetime. This parameter is sticky across
+ * compressions. It will remain set until the user explicitly resets compression
+ * parameters.
+ *
+ * Sequence producer registration is considered to be an "advanced parameter",
+ * part of the "advanced API". This means it will only have an effect on compression
+ * APIs which respect advanced parameters, such as compress2() and compressStream2().
+ * Older compression APIs such as compressCCtx(), which predate the introduction of
+ * "advanced parameters", will ignore any external sequence producer setting.
+ *
+ * The sequence producer can be "cleared" by registering a NULL function pointer. This
+ * removes all limitations described above in the "LIMITATIONS" section of the API docs.
+ *
+ * The user is strongly encouraged to read the full API documentation (above) before
+ * calling this function. */
+ZSTDLIB_STATIC_API void
+ZSTD_registerSequenceProducer(
+  ZSTD_CCtx* cctx,
+  void* sequenceProducerState,
+  ZSTD_sequenceProducer_F* sequenceProducer
+);
+
+
 /* *******************************************************************
-*  Buffer-less and synchronous inner streaming functions
+*  Buffer-less and synchronous inner streaming functions (DEPRECATED)
+*
+*  This API is deprecated, and will be removed in a future version.
+*  It allows streaming (de)compression with user allocated buffers.
+*  However, it is hard to use, and not as well tested as the rest of
+*  our API.
 *
-*  This is an advanced API, giving full control over buffer management, for users which need direct control over memory.
-*  But it's also a complex one, with several restrictions, documented below.
-*  Prefer normal streaming API for an easier experience.
+*  Please use the normal streaming API instead: ZSTD_compressStream2,
+*  and ZSTD_decompressStream.
+*  If there is functionality that you need, but it doesn't provide,
+*  please open an issue on our GitHub.
 ********************************************************************* */
 
 /*
@@ -2362,7 +2795,6 @@ ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds);
 
   Start by initializing a context.
   Use ZSTD_compressBegin(), or ZSTD_compressBegin_usingDict() for dictionary compression.
-  It's also possible to duplicate a reference context which has already been initialized, using ZSTD_copyCCtx()
 
   Then, consume your input using ZSTD_compressContinue().
   There are some important considerations to keep in mind when using this advanced function :
@@ -2384,18 +2816,28 @@ ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds);
 */
 
 /*=====   Buffer-less streaming compression functions  =====*/
+ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.")
 ZSTDLIB_STATIC_API size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel);
+ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.")
 ZSTDLIB_STATIC_API size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel);
+ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.")
 ZSTDLIB_STATIC_API size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); /*< note: fails if cdict==NULL */
-ZSTDLIB_STATIC_API size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /*<  note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */
 
+ZSTD_DEPRECATED("This function will likely be removed in a future release. It is misleading and has very limited utility.")
+ZSTDLIB_STATIC_API
+size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /*<  note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */
+
+ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.")
 ZSTDLIB_STATIC_API size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.")
 ZSTDLIB_STATIC_API size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
 
 /* The ZSTD_compressBegin_advanced() and ZSTD_compressBegin_usingCDict_advanced() are now DEPRECATED and will generate a compiler warning */
 ZSTD_DEPRECATED("use advanced API to access custom parameters")
+ZSTDLIB_STATIC_API
 size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_parameters params, unsigned long long pledgedSrcSize); /*< pledgedSrcSize : If srcSize is not known at init time, use ZSTD_CONTENTSIZE_UNKNOWN */
 ZSTD_DEPRECATED("use advanced API to access custom parameters")
+ZSTDLIB_STATIC_API
 size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize);   /* compression parameters are already set within cdict. pledgedSrcSize must be correct. If srcSize is not known, use macro ZSTD_CONTENTSIZE_UNKNOWN */
 /*
   Buffer-less streaming decompression (synchronous mode)
@@ -2408,8 +2850,8 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_
   Frame header is extracted from the beginning of compressed frame, so providing only the frame's beginning is enough.
   Data fragment must be large enough to ensure successful decoding.
  `ZSTD_frameHeaderSize_max` bytes is guaranteed to always be large enough.
-  @result : 0 : successful decoding, the `ZSTD_frameHeader` structure is correctly filled.
-           >0 : `srcSize` is too small, please provide at least @result bytes on next attempt.
+  result  : 0 : successful decoding, the `ZSTD_frameHeader` structure is correctly filled.
+           >0 : `srcSize` is too small, please provide at least result bytes on next attempt.
            errorCode, which can be tested using ZSTD_isError().
 
   It fills a ZSTD_frameHeader structure with important information to correctly decode the frame,
@@ -2428,7 +2870,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_
 
   The most memory efficient way is to use a round buffer of sufficient size.
   Sufficient size is determined by invoking ZSTD_decodingBufferSize_min(),
-  which can @return an error code if required value is too large for current system (in 32-bits mode).
+  which can return an error code if required value is too large for current system (in 32-bits mode).
   In a round buffer methodology, ZSTD_decompressContinue() decompresses each block next to previous one,
   up to the moment there is not enough room left in the buffer to guarantee decoding another full block,
   which maximum size is provided in `ZSTD_frameHeader` structure, field `blockSizeMax`.
@@ -2448,7 +2890,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_
   ZSTD_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' to ZSTD_decompressContinue().
   ZSTD_decompressContinue() requires this _exact_ amount of bytes, or it will fail.
 
- @result of ZSTD_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity).
+  result of ZSTD_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity).
   It can be zero : it just means ZSTD_decompressContinue() has decoded some metadata item.
   It can also be an error code, which can be tested with ZSTD_isError().
 
@@ -2471,27 +2913,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_
 */
 
 /*=====   Buffer-less streaming decompression functions  =====*/
-typedef enum { ZSTD_frame, ZSTD_skippableFrame } ZSTD_frameType_e;
-typedef struct {
-    unsigned long long frameContentSize; /* if == ZSTD_CONTENTSIZE_UNKNOWN, it means this field is not available. 0 means "empty" */
-    unsigned long long windowSize;       /* can be very large, up to <= frameContentSize */
-    unsigned blockSizeMax;
-    ZSTD_frameType_e frameType;          /* if == ZSTD_skippableFrame, frameContentSize is the size of skippable content */
-    unsigned headerSize;
-    unsigned dictID;
-    unsigned checksumFlag;
-} ZSTD_frameHeader;
 
-/*! ZSTD_getFrameHeader() :
- *  decode Frame Header, or requires larger `srcSize`.
- * @return : 0, `zfhPtr` is correctly filled,
- *          >0, `srcSize` is too small, value is wanted `srcSize` amount,
- *           or an error code, which can be tested using ZSTD_isError() */
-ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize);   /*< doesn't consume input */
-/*! ZSTD_getFrameHeader_advanced() :
- *  same as ZSTD_getFrameHeader(),
- *  with added capability to select a format (like ZSTD_f_zstd1_magicless) */
-ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format);
 ZSTDLIB_STATIC_API size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize);  /*< when frame content size is not known, pass in frameContentSize == ZSTD_CONTENTSIZE_UNKNOWN */
 
 ZSTDLIB_STATIC_API size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx);
@@ -2502,6 +2924,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx);
 ZSTDLIB_STATIC_API size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
 
 /* misc */
+ZSTD_DEPRECATED("This function will likely be removed in the next minor release. It is misleading and has very limited utility.")
 ZSTDLIB_STATIC_API void   ZSTD_copyDCtx(ZSTD_DCtx* dctx, const ZSTD_DCtx* preparedDCtx);
 typedef enum { ZSTDnit_frameHeader, ZSTDnit_blockHeader, ZSTDnit_block, ZSTDnit_lastBlock, ZSTDnit_checksum, ZSTDnit_skippableFrame } ZSTD_nextInputType_e;
 ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx);
@@ -2509,11 +2932,23 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx);
 
 
 
-/* ============================ */
-/*       Block level API       */
-/* ============================ */
+/* ========================================= */
+/*       Block level API (DEPRECATED)       */
+/* ========================================= */
 
 /*!
+
+    This API is deprecated in favor of the regular compression API.
+    You can get the frame header down to 2 bytes by setting:
+      - ZSTD_c_format = ZSTD_f_zstd1_magicless
+      - ZSTD_c_contentSizeFlag = 0
+      - ZSTD_c_checksumFlag = 0
+      - ZSTD_c_dictIDFlag = 0
+
+    This API is not as well tested as our normal API, so we recommend not using it.
+    We will be removing it in a future version. If the normal API doesn't provide
+    the functionality you need, please open a GitHub issue.
+
     Block functions produce and decode raw zstd blocks, without frame metadata.
     Frame metadata cost is typically ~12 bytes, which can be non-negligible for very small blocks (< 100 bytes).
     But users will have to take in charge needed metadata to regenerate data, such as compressed and content sizes.
@@ -2524,7 +2959,6 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx);
     - It is necessary to init context before starting
       + compression : any ZSTD_compressBegin*() variant, including with dictionary
       + decompression : any ZSTD_decompressBegin*() variant, including with dictionary
-      + copyCCtx() and copyDCtx() can be used too
     - Block size is limited, it must be <= ZSTD_getBlockSize() <= ZSTD_BLOCKSIZE_MAX == 128 KB
       + If input is larger than a block size, it's necessary to split input data into multiple blocks
       + For inputs larger than a single block, consider using regular ZSTD_compress() instead.
@@ -2541,11 +2975,14 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx);
 */
 
 /*=====   Raw zstd block functions  =====*/
+ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.")
 ZSTDLIB_STATIC_API size_t ZSTD_getBlockSize   (const ZSTD_CCtx* cctx);
+ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.")
 ZSTDLIB_STATIC_API size_t ZSTD_compressBlock  (ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.")
 ZSTDLIB_STATIC_API size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.")
 ZSTDLIB_STATIC_API size_t ZSTD_insertBlock    (ZSTD_DCtx* dctx, const void* blockStart, size_t blockSize);  /*< insert uncompressed block into `dctx` history. Useful for multi-blocks decompression. */
 
-
 #endif   /* ZSTD_H_ZSTD_STATIC_LINKING_ONLY */
 
diff --git a/lib/zstd/Makefile b/lib/zstd/Makefile
index 20f08c644b71..464c410b2768 100644
--- a/lib/zstd/Makefile
+++ b/lib/zstd/Makefile
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 # ################################################################
-# Copyright (c) Facebook, Inc.
+# Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
 #
 # This source code is licensed under both the BSD-style license (found in the
diff --git a/lib/zstd/common/allocations.h b/lib/zstd/common/allocations.h
new file mode 100644
index 000000000000..05adbbeccaa9
--- /dev/null
+++ b/lib/zstd/common/allocations.h
@@ -0,0 +1,56 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+/* This file provides custom allocation primitives
+ */
+
+#define ZSTD_DEPS_NEED_MALLOC
+#include "zstd_deps.h"   /* ZSTD_malloc, ZSTD_calloc, ZSTD_free, ZSTD_memset */
+
+#include "mem.h" /* MEM_STATIC */
+#define ZSTD_STATIC_LINKING_ONLY
+#include <linux/zstd.h> /* ZSTD_customMem */
+
+#ifndef ZSTD_ALLOCATIONS_H
+#define ZSTD_ALLOCATIONS_H
+
+/* custom memory allocation functions */
+
+MEM_STATIC void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem)
+{
+    if (customMem.customAlloc)
+        return customMem.customAlloc(customMem.opaque, size);
+    return ZSTD_malloc(size);
+}
+
+MEM_STATIC void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem)
+{
+    if (customMem.customAlloc) {
+        /* calloc implemented as malloc+memset;
+         * not as efficient as calloc, but next best guess for custom malloc */
+        void* const ptr = customMem.customAlloc(customMem.opaque, size);
+        ZSTD_memset(ptr, 0, size);
+        return ptr;
+    }
+    return ZSTD_calloc(1, size);
+}
+
+MEM_STATIC void ZSTD_customFree(void* ptr, ZSTD_customMem customMem)
+{
+    if (ptr!=NULL) {
+        if (customMem.customFree)
+            customMem.customFree(customMem.opaque, ptr);
+        else
+            ZSTD_free(ptr);
+    }
+}
+
+#endif /* ZSTD_ALLOCATIONS_H */
diff --git a/lib/zstd/common/bits.h b/lib/zstd/common/bits.h
new file mode 100644
index 000000000000..aa3487ec4b6a
--- /dev/null
+++ b/lib/zstd/common/bits.h
@@ -0,0 +1,149 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_BITS_H
+#define ZSTD_BITS_H
+
+#include "mem.h"
+
+MEM_STATIC unsigned ZSTD_countTrailingZeros32_fallback(U32 val)
+{
+    assert(val != 0);
+    {
+        static const U32 DeBruijnBytePos[32] = {0, 1, 28, 2, 29, 14, 24, 3,
+                                                30, 22, 20, 15, 25, 17, 4, 8,
+                                                31, 27, 13, 23, 21, 19, 16, 7,
+                                                26, 12, 18, 6, 11, 5, 10, 9};
+        return DeBruijnBytePos[((U32) ((val & -(S32) val) * 0x077CB531U)) >> 27];
+    }
+}
+
+MEM_STATIC unsigned ZSTD_countTrailingZeros32(U32 val)
+{
+    assert(val != 0);
+#   if (__GNUC__ >= 4)
+        return (unsigned)__builtin_ctz(val);
+#   else
+        return ZSTD_countTrailingZeros32_fallback(val);
+#   endif
+}
+
+MEM_STATIC unsigned ZSTD_countLeadingZeros32_fallback(U32 val) {
+    assert(val != 0);
+    {
+        static const U32 DeBruijnClz[32] = {0, 9, 1, 10, 13, 21, 2, 29,
+                                            11, 14, 16, 18, 22, 25, 3, 30,
+                                            8, 12, 20, 28, 15, 17, 24, 7,
+                                            19, 27, 23, 6, 26, 5, 4, 31};
+        val |= val >> 1;
+        val |= val >> 2;
+        val |= val >> 4;
+        val |= val >> 8;
+        val |= val >> 16;
+        return 31 - DeBruijnClz[(val * 0x07C4ACDDU) >> 27];
+    }
+}
+
+MEM_STATIC unsigned ZSTD_countLeadingZeros32(U32 val)
+{
+    assert(val != 0);
+#   if (__GNUC__ >= 4)
+        return (unsigned)__builtin_clz(val);
+#   else
+        return ZSTD_countLeadingZeros32_fallback(val);
+#   endif
+}
+
+MEM_STATIC unsigned ZSTD_countTrailingZeros64(U64 val)
+{
+    assert(val != 0);
+#   if (__GNUC__ >= 4) && defined(__LP64__)
+        return (unsigned)__builtin_ctzll(val);
+#   else
+        {
+            U32 mostSignificantWord = (U32)(val >> 32);
+            U32 leastSignificantWord = (U32)val;
+            if (leastSignificantWord == 0) {
+                return 32 + ZSTD_countTrailingZeros32(mostSignificantWord);
+            } else {
+                return ZSTD_countTrailingZeros32(leastSignificantWord);
+            }
+        }
+#   endif
+}
+
+MEM_STATIC unsigned ZSTD_countLeadingZeros64(U64 val)
+{
+    assert(val != 0);
+#   if (__GNUC__ >= 4)
+        return (unsigned)(__builtin_clzll(val));
+#   else
+        {
+            U32 mostSignificantWord = (U32)(val >> 32);
+            U32 leastSignificantWord = (U32)val;
+            if (mostSignificantWord == 0) {
+                return 32 + ZSTD_countLeadingZeros32(leastSignificantWord);
+            } else {
+                return ZSTD_countLeadingZeros32(mostSignificantWord);
+            }
+        }
+#   endif
+}
+
+MEM_STATIC unsigned ZSTD_NbCommonBytes(size_t val)
+{
+    if (MEM_isLittleEndian()) {
+        if (MEM_64bits()) {
+            return ZSTD_countTrailingZeros64((U64)val) >> 3;
+        } else {
+            return ZSTD_countTrailingZeros32((U32)val) >> 3;
+        }
+    } else {  /* Big Endian CPU */
+        if (MEM_64bits()) {
+            return ZSTD_countLeadingZeros64((U64)val) >> 3;
+        } else {
+            return ZSTD_countLeadingZeros32((U32)val) >> 3;
+        }
+    }
+}
+
+MEM_STATIC unsigned ZSTD_highbit32(U32 val)   /* compress, dictBuilder, decodeCorpus */
+{
+    assert(val != 0);
+    return 31 - ZSTD_countLeadingZeros32(val);
+}
+
+/* ZSTD_rotateRight_*():
+ * Rotates a bitfield to the right by "count" bits.
+ * https://en.wikipedia.org/w/index.php?title=Circular_shift&oldid=991635599#Implementing_circular_shifts
+ */
+MEM_STATIC
+U64 ZSTD_rotateRight_U64(U64 const value, U32 count) {
+    assert(count < 64);
+    count &= 0x3F; /* for fickle pattern recognition */
+    return (value >> count) | (U64)(value << ((0U - count) & 0x3F));
+}
+
+MEM_STATIC
+U32 ZSTD_rotateRight_U32(U32 const value, U32 count) {
+    assert(count < 32);
+    count &= 0x1F; /* for fickle pattern recognition */
+    return (value >> count) | (U32)(value << ((0U - count) & 0x1F));
+}
+
+MEM_STATIC
+U16 ZSTD_rotateRight_U16(U16 const value, U32 count) {
+    assert(count < 16);
+    count &= 0x0F; /* for fickle pattern recognition */
+    return (value >> count) | (U16)(value << ((0U - count) & 0x0F));
+}
+
+#endif /* ZSTD_BITS_H */
diff --git a/lib/zstd/common/bitstream.h b/lib/zstd/common/bitstream.h
index feef3a1b1d60..444dc4f85c64 100644
--- a/lib/zstd/common/bitstream.h
+++ b/lib/zstd/common/bitstream.h
@@ -1,7 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /* ******************************************************************
  * bitstream
  * Part of FSE library
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  *
  * You can contact the author at :
  * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
@@ -27,6 +28,7 @@
 #include "compiler.h"       /* UNLIKELY() */
 #include "debug.h"          /* assert(), DEBUGLOG(), RAWLOG() */
 #include "error_private.h"  /* error codes and messages */
+#include "bits.h"           /* ZSTD_highbit32 */
 
 
 /*=========================================
@@ -122,33 +124,6 @@ MEM_STATIC void BIT_flushBitsFast(BIT_CStream_t* bitC);
 MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits);
 /* faster, but works only if nbBits >= 1 */
 
-
-
-/*-**************************************************************
-*  Internal functions
-****************************************************************/
-MEM_STATIC unsigned BIT_highbit32 (U32 val)
-{
-    assert(val != 0);
-    {
-#   if (__GNUC__ >= 3)   /* Use GCC Intrinsic */
-        return __builtin_clz (val) ^ 31;
-#   else   /* Software version */
-        static const unsigned DeBruijnClz[32] = { 0,  9,  1, 10, 13, 21,  2, 29,
-                                                 11, 14, 16, 18, 22, 25,  3, 30,
-                                                  8, 12, 20, 28, 15, 17, 24,  7,
-                                                 19, 27, 23,  6, 26,  5,  4, 31 };
-        U32 v = val;
-        v |= v >> 1;
-        v |= v >> 2;
-        v |= v >> 4;
-        v |= v >> 8;
-        v |= v >> 16;
-        return DeBruijnClz[ (U32) (v * 0x07C4ACDDU) >> 27];
-#   endif
-    }
-}
-
 /*=====    Local Constants   =====*/
 static const unsigned BIT_mask[] = {
     0,          1,         3,         7,         0xF,       0x1F,
@@ -178,6 +153,12 @@ MEM_STATIC size_t BIT_initCStream(BIT_CStream_t* bitC,
     return 0;
 }
 
+MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits)
+{
+    assert(nbBits < BIT_MASK_SIZE);
+    return bitContainer & BIT_mask[nbBits];
+}
+
 /*! BIT_addBits() :
  *  can add up to 31 bits into `bitC`.
  *  Note : does not check for register overflow ! */
@@ -187,7 +168,7 @@ MEM_STATIC void BIT_addBits(BIT_CStream_t* bitC,
     DEBUG_STATIC_ASSERT(BIT_MASK_SIZE == 32);
     assert(nbBits < BIT_MASK_SIZE);
     assert(nbBits + bitC->bitPos < sizeof(bitC->bitContainer) * 8);
-    bitC->bitContainer |= (value & BIT_mask[nbBits]) << bitC->bitPos;
+    bitC->bitContainer |= BIT_getLowerBits(value, nbBits) << bitC->bitPos;
     bitC->bitPos += nbBits;
 }
 
@@ -266,7 +247,7 @@ MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, si
         bitD->ptr   = (const char*)srcBuffer + srcSize - sizeof(bitD->bitContainer);
         bitD->bitContainer = MEM_readLEST(bitD->ptr);
         { BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1];
-          bitD->bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0;  /* ensures bitsConsumed is always set */
+          bitD->bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0;  /* ensures bitsConsumed is always set */
           if (lastByte == 0) return ERROR(GENERIC); /* endMark not present */ }
     } else {
         bitD->ptr   = bitD->start;
@@ -294,7 +275,7 @@ MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, si
         default: break;
         }
         {   BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1];
-            bitD->bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0;
+            bitD->bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0;
             if (lastByte == 0) return ERROR(corruption_detected);  /* endMark not present */
         }
         bitD->bitsConsumed += (U32)(sizeof(bitD->bitContainer) - srcSize)*8;
@@ -325,12 +306,6 @@ MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getMiddleBits(size_t bitContainer, U32 c
 #endif
 }
 
-MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits)
-{
-    assert(nbBits < BIT_MASK_SIZE);
-    return bitContainer & BIT_mask[nbBits];
-}
-
 /*! BIT_lookBits() :
  *  Provides next n bits from local register.
  *  local register is not modified.
@@ -377,7 +352,7 @@ MEM_STATIC FORCE_INLINE_ATTR size_t BIT_readBits(BIT_DStream_t* bitD, unsigned n
 }
 
 /*! BIT_readBitsFast() :
- *  unsafe version; only works only if nbBits >= 1 */
+ *  unsafe version; only works if nbBits >= 1 */
 MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits)
 {
     size_t const value = BIT_lookBitsFast(bitD, nbBits);
@@ -408,7 +383,7 @@ MEM_STATIC BIT_DStream_status BIT_reloadDStreamFast(BIT_DStream_t* bitD)
  *  This function is safe, it guarantees it will not read beyond src buffer.
  * @return : status of `BIT_DStream_t` internal register.
  *           when status == BIT_DStream_unfinished, internal register is filled with at least 25 or 57 bits */
-MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD)
+MEM_STATIC FORCE_INLINE_ATTR BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD)
 {
     if (bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8))  /* overflow detected, like end of stream */
         return BIT_DStream_overflow;
diff --git a/lib/zstd/common/compiler.h b/lib/zstd/common/compiler.h
index c42d39faf9bd..c437e0975575 100644
--- a/lib/zstd/common/compiler.h
+++ b/lib/zstd/common/compiler.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -179,6 +180,17 @@
 *  Sanitizer
 *****************************************************************/
 
+/* Issue #3240 reports an ASAN failure on an llvm-mingw build. Out of an
+ * abundance of caution, disable our custom poisoning on mingw. */
+#ifdef __MINGW32__
+#ifndef ZSTD_ASAN_DONT_POISON_WORKSPACE
+#define ZSTD_ASAN_DONT_POISON_WORKSPACE 1
+#endif
+#ifndef ZSTD_MSAN_DONT_POISON_WORKSPACE
+#define ZSTD_MSAN_DONT_POISON_WORKSPACE 1
+#endif
+#endif
+
 
 
 #endif /* ZSTD_COMPILER_H */
diff --git a/lib/zstd/common/cpu.h b/lib/zstd/common/cpu.h
index 0db7b42407ee..d8319a2bef4c 100644
--- a/lib/zstd/common/cpu.h
+++ b/lib/zstd/common/cpu.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
diff --git a/lib/zstd/common/debug.c b/lib/zstd/common/debug.c
index bb863c9ea616..e56ff6464e91 100644
--- a/lib/zstd/common/debug.c
+++ b/lib/zstd/common/debug.c
@@ -1,7 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /* ******************************************************************
  * debug
  * Part of FSE library
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  *
  * You can contact the author at :
  * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
diff --git a/lib/zstd/common/debug.h b/lib/zstd/common/debug.h
index 6dd88d1fbd02..da0dbfc614b8 100644
--- a/lib/zstd/common/debug.h
+++ b/lib/zstd/common/debug.h
@@ -1,7 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /* ******************************************************************
  * debug
  * Part of FSE library
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  *
  * You can contact the author at :
  * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
diff --git a/lib/zstd/common/entropy_common.c b/lib/zstd/common/entropy_common.c
index fef67056f052..6cdd82233fb5 100644
--- a/lib/zstd/common/entropy_common.c
+++ b/lib/zstd/common/entropy_common.c
@@ -1,6 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /* ******************************************************************
  * Common functions of New Generation Entropy library
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  *
  *  You can contact the author at :
  *  - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy
@@ -19,8 +20,8 @@
 #include "error_private.h"       /* ERR_*, ERROR */
 #define FSE_STATIC_LINKING_ONLY  /* FSE_MIN_TABLELOG */
 #include "fse.h"
-#define HUF_STATIC_LINKING_ONLY  /* HUF_TABLELOG_ABSOLUTEMAX */
 #include "huf.h"
+#include "bits.h"                /* ZSDT_highbit32, ZSTD_countTrailingZeros32 */
 
 
 /*===   Version   ===*/
@@ -38,23 +39,6 @@ const char* HUF_getErrorName(size_t code) { return ERR_getErrorName(code); }
 /*-**************************************************************
 *  FSE NCount encoding-decoding
 ****************************************************************/
-static U32 FSE_ctz(U32 val)
-{
-    assert(val != 0);
-    {
-#   if (__GNUC__ >= 3)   /* GCC Intrinsic */
-        return __builtin_ctz(val);
-#   else   /* Software version */
-        U32 count = 0;
-        while ((val & 1) == 0) {
-            val >>= 1;
-            ++count;
-        }
-        return count;
-#   endif
-    }
-}
-
 FORCE_INLINE_TEMPLATE
 size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr,
                            const void* headerBuffer, size_t hbSize)
@@ -102,7 +86,7 @@ size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigne
              * repeat.
              * Avoid UB by setting the high bit to 1.
              */
-            int repeats = FSE_ctz(~bitStream | 0x80000000) >> 1;
+            int repeats = ZSTD_countTrailingZeros32(~bitStream | 0x80000000) >> 1;
             while (repeats >= 12) {
                 charnum += 3 * 12;
                 if (LIKELY(ip <= iend-7)) {
@@ -113,7 +97,7 @@ size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigne
                     ip = iend - 4;
                 }
                 bitStream = MEM_readLE32(ip) >> bitCount;
-                repeats = FSE_ctz(~bitStream | 0x80000000) >> 1;
+                repeats = ZSTD_countTrailingZeros32(~bitStream | 0x80000000) >> 1;
             }
             charnum += 3 * repeats;
             bitStream >>= 2 * repeats;
@@ -178,7 +162,7 @@ size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigne
                  * know that threshold > 1.
                  */
                 if (remaining <= 1) break;
-                nbBits = BIT_highbit32(remaining) + 1;
+                nbBits = ZSTD_highbit32(remaining) + 1;
                 threshold = 1 << (nbBits - 1);
             }
             if (charnum >= maxSV1) break;
@@ -253,7 +237,7 @@ size_t HUF_readStats(BYTE* huffWeight, size_t hwSize, U32* rankStats,
                      const void* src, size_t srcSize)
 {
     U32 wksp[HUF_READ_STATS_WORKSPACE_SIZE_U32];
-    return HUF_readStats_wksp(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, wksp, sizeof(wksp), /* bmi2 */ 0);
+    return HUF_readStats_wksp(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, wksp, sizeof(wksp), /* flags */ 0);
 }
 
 FORCE_INLINE_TEMPLATE size_t
@@ -301,14 +285,14 @@ HUF_readStats_body(BYTE* huffWeight, size_t hwSize, U32* rankStats,
     if (weightTotal == 0) return ERROR(corruption_detected);
 
     /* get last non-null symbol weight (implied, total must be 2^n) */
-    {   U32 const tableLog = BIT_highbit32(weightTotal) + 1;
+    {   U32 const tableLog = ZSTD_highbit32(weightTotal) + 1;
         if (tableLog > HUF_TABLELOG_MAX) return ERROR(corruption_detected);
         *tableLogPtr = tableLog;
         /* determine last weight */
         {   U32 const total = 1 << tableLog;
             U32 const rest = total - weightTotal;
-            U32 const verif = 1 << BIT_highbit32(rest);
-            U32 const lastWeight = BIT_highbit32(rest) + 1;
+            U32 const verif = 1 << ZSTD_highbit32(rest);
+            U32 const lastWeight = ZSTD_highbit32(rest) + 1;
             if (verif != rest) return ERROR(corruption_detected);    /* last value must be a clean power of 2 */
             huffWeight[oSize] = (BYTE)lastWeight;
             rankStats[lastWeight]++;
@@ -345,13 +329,13 @@ size_t HUF_readStats_wksp(BYTE* huffWeight, size_t hwSize, U32* rankStats,
                      U32* nbSymbolsPtr, U32* tableLogPtr,
                      const void* src, size_t srcSize,
                      void* workSpace, size_t wkspSize,
-                     int bmi2)
+                     int flags)
 {
 #if DYNAMIC_BMI2
-    if (bmi2) {
+    if (flags & HUF_flags_bmi2) {
         return HUF_readStats_body_bmi2(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize);
     }
 #endif
-    (void)bmi2;
+    (void)flags;
     return HUF_readStats_body_default(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize);
 }
diff --git a/lib/zstd/common/error_private.c b/lib/zstd/common/error_private.c
index 6d1135f8c373..a4062d30d170 100644
--- a/lib/zstd/common/error_private.c
+++ b/lib/zstd/common/error_private.c
@@ -1,5 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -27,9 +28,11 @@ const char* ERR_getErrorString(ERR_enum code)
     case PREFIX(version_unsupported): return "Version not supported";
     case PREFIX(frameParameter_unsupported): return "Unsupported frame parameter";
     case PREFIX(frameParameter_windowTooLarge): return "Frame requires too much memory for decoding";
-    case PREFIX(corruption_detected): return "Corrupted block detected";
+    case PREFIX(corruption_detected): return "Data corruption detected";
     case PREFIX(checksum_wrong): return "Restored data doesn't match checksum";
+    case PREFIX(literals_headerWrong): return "Header of Literals' block doesn't respect format specification";
     case PREFIX(parameter_unsupported): return "Unsupported parameter";
+    case PREFIX(parameter_combination_unsupported): return "Unsupported combination of parameters";
     case PREFIX(parameter_outOfBound): return "Parameter is out of bound";
     case PREFIX(init_missing): return "Context should be init first";
     case PREFIX(memory_allocation): return "Allocation error : not enough memory";
@@ -38,17 +41,22 @@ const char* ERR_getErrorString(ERR_enum code)
     case PREFIX(tableLog_tooLarge): return "tableLog requires too much memory : unsupported";
     case PREFIX(maxSymbolValue_tooLarge): return "Unsupported max Symbol Value : too large";
     case PREFIX(maxSymbolValue_tooSmall): return "Specified maxSymbolValue is too small";
+    case PREFIX(stabilityCondition_notRespected): return "pledged buffer stability condition is not respected";
     case PREFIX(dictionary_corrupted): return "Dictionary is corrupted";
     case PREFIX(dictionary_wrong): return "Dictionary mismatch";
     case PREFIX(dictionaryCreation_failed): return "Cannot create Dictionary from provided samples";
     case PREFIX(dstSize_tooSmall): return "Destination buffer is too small";
     case PREFIX(srcSize_wrong): return "Src size is incorrect";
     case PREFIX(dstBuffer_null): return "Operation on NULL destination buffer";
+    case PREFIX(noForwardProgress_destFull): return "Operation made no progress over multiple calls, due to output buffer being full";
+    case PREFIX(noForwardProgress_inputEmpty): return "Operation made no progress over multiple calls, due to input being empty";
         /* following error codes are not stable and may be removed or changed in a future version */
     case PREFIX(frameIndex_tooLarge): return "Frame index is too large";
     case PREFIX(seekableIO): return "An I/O error occurred when reading/seeking";
     case PREFIX(dstBuffer_wrong): return "Destination buffer is wrong";
     case PREFIX(srcBuffer_wrong): return "Source buffer is wrong";
+    case PREFIX(sequenceProducer_failed): return "Block-level external sequence producer returned an error code";
+    case PREFIX(externalSequences_invalid): return "External sequences are not valid";
     case PREFIX(maxCode):
     default: return notErrorCode;
     }
diff --git a/lib/zstd/common/error_private.h b/lib/zstd/common/error_private.h
index ca5101e542fa..9a4699a38a88 100644
--- a/lib/zstd/common/error_private.h
+++ b/lib/zstd/common/error_private.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
diff --git a/lib/zstd/common/fse.h b/lib/zstd/common/fse.h
index 4507043b2287..c4e25a219142 100644
--- a/lib/zstd/common/fse.h
+++ b/lib/zstd/common/fse.h
@@ -1,7 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /* ******************************************************************
  * FSE : Finite State Entropy codec
  * Public Prototypes declaration
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  *
  * You can contact the author at :
  * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
@@ -50,34 +51,6 @@
 FSE_PUBLIC_API unsigned FSE_versionNumber(void);   /*< library version number; to be used when checking dll version */
 
 
-/*-****************************************
-*  FSE simple functions
-******************************************/
-/*! FSE_compress() :
-    Compress content of buffer 'src', of size 'srcSize', into destination buffer 'dst'.
-    'dst' buffer must be already allocated. Compression runs faster is dstCapacity >= FSE_compressBound(srcSize).
-    @return : size of compressed data (<= dstCapacity).
-    Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!!
-                     if return == 1, srcData is a single byte symbol * srcSize times. Use RLE compression instead.
-                     if FSE_isError(return), compression failed (more details using FSE_getErrorName())
-*/
-FSE_PUBLIC_API size_t FSE_compress(void* dst, size_t dstCapacity,
-                             const void* src, size_t srcSize);
-
-/*! FSE_decompress():
-    Decompress FSE data from buffer 'cSrc', of size 'cSrcSize',
-    into already allocated destination buffer 'dst', of size 'dstCapacity'.
-    @return : size of regenerated data (<= maxDstSize),
-              or an error code, which can be tested using FSE_isError() .
-
-    ** Important ** : FSE_decompress() does not decompress non-compressible nor RLE data !!!
-    Why ? : making this distinction requires a header.
-    Header management is intentionally delegated to the user layer, which can better manage special cases.
-*/
-FSE_PUBLIC_API size_t FSE_decompress(void* dst,  size_t dstCapacity,
-                               const void* cSrc, size_t cSrcSize);
-
-
 /*-*****************************************
 *  Tool functions
 ******************************************/
@@ -88,20 +61,6 @@ FSE_PUBLIC_API unsigned    FSE_isError(size_t code);        /* tells if a return
 FSE_PUBLIC_API const char* FSE_getErrorName(size_t code);   /* provides error code string (useful for debugging) */
 
 
-/*-*****************************************
-*  FSE advanced functions
-******************************************/
-/*! FSE_compress2() :
-    Same as FSE_compress(), but allows the selection of 'maxSymbolValue' and 'tableLog'
-    Both parameters can be defined as '0' to mean : use default value
-    @return : size of compressed data
-    Special values : if return == 0, srcData is not compressible => Nothing is stored within cSrc !!!
-                     if return == 1, srcData is a single byte symbol * srcSize times. Use RLE compression.
-                     if FSE_isError(return), it's an error code.
-*/
-FSE_PUBLIC_API size_t FSE_compress2 (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog);
-
-
 /*-*****************************************
 *  FSE detailed API
 ******************************************/
@@ -161,8 +120,6 @@ FSE_PUBLIC_API size_t FSE_writeNCount (void* buffer, size_t bufferSize,
 /*! Constructor and Destructor of FSE_CTable.
     Note that FSE_CTable size depends on 'tableLog' and 'maxSymbolValue' */
 typedef unsigned FSE_CTable;   /* don't allocate that. It's only meant to be more restrictive than void* */
-FSE_PUBLIC_API FSE_CTable* FSE_createCTable (unsigned maxSymbolValue, unsigned tableLog);
-FSE_PUBLIC_API void        FSE_freeCTable (FSE_CTable* ct);
 
 /*! FSE_buildCTable():
     Builds `ct`, which must be already allocated, using FSE_createCTable().
@@ -238,23 +195,7 @@ FSE_PUBLIC_API size_t FSE_readNCount_bmi2(short* normalizedCounter,
                            unsigned* maxSymbolValuePtr, unsigned* tableLogPtr,
                            const void* rBuffer, size_t rBuffSize, int bmi2);
 
-/*! Constructor and Destructor of FSE_DTable.
-    Note that its size depends on 'tableLog' */
 typedef unsigned FSE_DTable;   /* don't allocate that. It's just a way to be more restrictive than void* */
-FSE_PUBLIC_API FSE_DTable* FSE_createDTable(unsigned tableLog);
-FSE_PUBLIC_API void        FSE_freeDTable(FSE_DTable* dt);
-
-/*! FSE_buildDTable():
-    Builds 'dt', which must be already allocated, using FSE_createDTable().
-    return : 0, or an errorCode, which can be tested using FSE_isError() */
-FSE_PUBLIC_API size_t FSE_buildDTable (FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog);
-
-/*! FSE_decompress_usingDTable():
-    Decompress compressed source `cSrc` of size `cSrcSize` using `dt`
-    into `dst` which must be already allocated.
-    @return : size of regenerated data (necessarily <= `dstCapacity`),
-              or an errorCode, which can be tested using FSE_isError() */
-FSE_PUBLIC_API size_t FSE_decompress_usingDTable(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, const FSE_DTable* dt);
 
 /*!
 Tutorial :
@@ -317,16 +258,6 @@ If there is an error, the function will return an error code, which can be teste
 unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus);
 /*< same as FSE_optimalTableLog(), which used `minus==2` */
 
-/* FSE_compress_wksp() :
- * Same as FSE_compress2(), but using an externally allocated scratch buffer (`workSpace`).
- * FSE_COMPRESS_WKSP_SIZE_U32() provides the minimum size required for `workSpace` as a table of FSE_CTable.
- */
-#define FSE_COMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue)   ( FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue) + ((maxTableLog > 12) ? (1 << (maxTableLog - 2)) : 1024) )
-size_t FSE_compress_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize);
-
-size_t FSE_buildCTable_raw (FSE_CTable* ct, unsigned nbBits);
-/*< build a fake FSE_CTable, designed for a flat distribution, where each symbol uses nbBits */
-
 size_t FSE_buildCTable_rle (FSE_CTable* ct, unsigned char symbolValue);
 /*< build a fake FSE_CTable, designed to compress always the same symbolValue */
 
@@ -344,19 +275,11 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, const short* normalizedCounter, unsi
 FSE_PUBLIC_API size_t FSE_buildDTable_wksp(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize);
 /*< Same as FSE_buildDTable(), using an externally allocated `workspace` produced with `FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxSymbolValue)` */
 
-size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits);
-/*< build a fake FSE_DTable, designed to read a flat distribution where each symbol uses nbBits */
-
-size_t FSE_buildDTable_rle (FSE_DTable* dt, unsigned char symbolValue);
-/*< build a fake FSE_DTable, designed to always generate the same symbolValue */
-
-#define FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) (FSE_DTABLE_SIZE_U32(maxTableLog) + FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) + (FSE_MAX_SYMBOL_VALUE + 1) / 2 + 1)
+#define FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) (FSE_DTABLE_SIZE_U32(maxTableLog) + 1 + FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) + (FSE_MAX_SYMBOL_VALUE + 1) / 2 + 1)
 #define FSE_DECOMPRESS_WKSP_SIZE(maxTableLog, maxSymbolValue) (FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) * sizeof(unsigned))
-size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize);
-/*< same as FSE_decompress(), using an externally allocated `workSpace` produced with `FSE_DECOMPRESS_WKSP_SIZE_U32(maxLog, maxSymbolValue)` */
-
 size_t FSE_decompress_wksp_bmi2(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize, int bmi2);
-/*< Same as FSE_decompress_wksp() but with dynamic BMI2 support. Pass 1 if your CPU supports BMI2 or 0 if it doesn't. */
+/*< same as FSE_decompress(), using an externally allocated `workSpace` produced with `FSE_DECOMPRESS_WKSP_SIZE_U32(maxLog, maxSymbolValue)`.
+ * Set bmi2 to 1 if your CPU supports BMI2 or 0 if it doesn't */
 
 typedef enum {
    FSE_repeat_none,  /*< Cannot use the previous table */
@@ -552,7 +475,7 @@ MEM_STATIC void FSE_flushCState(BIT_CStream_t* bitC, const FSE_CState_t* statePt
 
 /* FSE_getMaxNbBits() :
  * Approximate maximum cost of a symbol, in bits.
- * Fractional get rounded up (i.e : a symbol with a normalized frequency of 3 gives the same result as a frequency of 2)
+ * Fractional get rounded up (i.e. a symbol with a normalized frequency of 3 gives the same result as a frequency of 2)
  * note 1 : assume symbolValue is valid (<= maxSymbolValue)
  * note 2 : if freq[symbolValue]==0, @return a fake cost of tableLog+1 bits */
 MEM_STATIC U32 FSE_getMaxNbBits(const void* symbolTTPtr, U32 symbolValue)
diff --git a/lib/zstd/common/fse_decompress.c b/lib/zstd/common/fse_decompress.c
index a0d06095be83..45cf457f31ef 100644
--- a/lib/zstd/common/fse_decompress.c
+++ b/lib/zstd/common/fse_decompress.c
@@ -1,6 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /* ******************************************************************
  * FSE : Finite State Entropy decoder
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  *
  *  You can contact the author at :
  *  - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
@@ -24,6 +25,7 @@
 #include "error_private.h"
 #define ZSTD_DEPS_NEED_MALLOC
 #include "zstd_deps.h"
+#include "bits.h"       /* ZSTD_highbit32 */
 
 
 /* **************************************************************
@@ -55,19 +57,6 @@
 #define FSE_FUNCTION_NAME(X,Y) FSE_CAT(X,Y)
 #define FSE_TYPE_NAME(X,Y) FSE_CAT(X,Y)
 
-
-/* Function templates */
-FSE_DTable* FSE_createDTable (unsigned tableLog)
-{
-    if (tableLog > FSE_TABLELOG_ABSOLUTE_MAX) tableLog = FSE_TABLELOG_ABSOLUTE_MAX;
-    return (FSE_DTable*)ZSTD_malloc( FSE_DTABLE_SIZE_U32(tableLog) * sizeof (U32) );
-}
-
-void FSE_freeDTable (FSE_DTable* dt)
-{
-    ZSTD_free(dt);
-}
-
 static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize)
 {
     void* const tdPtr = dt+1;   /* because *dt is unsigned, 32-bits aligned on 32-bits */
@@ -127,10 +116,10 @@ static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCo
             }
         }
         /* Now we spread those positions across the table.
-         * The benefit of doing it in two stages is that we avoid the the
+         * The benefit of doing it in two stages is that we avoid the
          * variable size inner loop, which caused lots of branch misses.
          * Now we can run through all the positions without any branch misses.
-         * We unroll the loop twice, since that is what emperically worked best.
+         * We unroll the loop twice, since that is what empirically worked best.
          */
         {
             size_t position = 0;
@@ -166,7 +155,7 @@ static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCo
         for (u=0; u<tableSize; u++) {
             FSE_FUNCTION_TYPE const symbol = (FSE_FUNCTION_TYPE)(tableDecode[u].symbol);
             U32 const nextState = symbolNext[symbol]++;
-            tableDecode[u].nbBits = (BYTE) (tableLog - BIT_highbit32(nextState) );
+            tableDecode[u].nbBits = (BYTE) (tableLog - ZSTD_highbit32(nextState) );
             tableDecode[u].newState = (U16) ( (nextState << tableDecode[u].nbBits) - tableSize);
     }   }
 
@@ -184,49 +173,6 @@ size_t FSE_buildDTable_wksp(FSE_DTable* dt, const short* normalizedCounter, unsi
 /*-*******************************************************
 *  Decompression (Byte symbols)
 *********************************************************/
-size_t FSE_buildDTable_rle (FSE_DTable* dt, BYTE symbolValue)
-{
-    void* ptr = dt;
-    FSE_DTableHeader* const DTableH = (FSE_DTableHeader*)ptr;
-    void* dPtr = dt + 1;
-    FSE_decode_t* const cell = (FSE_decode_t*)dPtr;
-
-    DTableH->tableLog = 0;
-    DTableH->fastMode = 0;
-
-    cell->newState = 0;
-    cell->symbol = symbolValue;
-    cell->nbBits = 0;
-
-    return 0;
-}
-
-
-size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits)
-{
-    void* ptr = dt;
-    FSE_DTableHeader* const DTableH = (FSE_DTableHeader*)ptr;
-    void* dPtr = dt + 1;
-    FSE_decode_t* const dinfo = (FSE_decode_t*)dPtr;
-    const unsigned tableSize = 1 << nbBits;
-    const unsigned tableMask = tableSize - 1;
-    const unsigned maxSV1 = tableMask+1;
-    unsigned s;
-
-    /* Sanity checks */
-    if (nbBits < 1) return ERROR(GENERIC);         /* min size */
-
-    /* Build Decoding Table */
-    DTableH->tableLog = (U16)nbBits;
-    DTableH->fastMode = 1;
-    for (s=0; s<maxSV1; s++) {
-        dinfo[s].newState = 0;
-        dinfo[s].symbol = (BYTE)s;
-        dinfo[s].nbBits = (BYTE)nbBits;
-    }
-
-    return 0;
-}
 
 FORCE_INLINE_TEMPLATE size_t FSE_decompress_usingDTable_generic(
           void* dst, size_t maxDstSize,
@@ -290,26 +236,6 @@ FORCE_INLINE_TEMPLATE size_t FSE_decompress_usingDTable_generic(
     return op-ostart;
 }
 
-
-size_t FSE_decompress_usingDTable(void* dst, size_t originalSize,
-                            const void* cSrc, size_t cSrcSize,
-                            const FSE_DTable* dt)
-{
-    const void* ptr = dt;
-    const FSE_DTableHeader* DTableH = (const FSE_DTableHeader*)ptr;
-    const U32 fastMode = DTableH->fastMode;
-
-    /* select fast mode (static) */
-    if (fastMode) return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 1);
-    return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 0);
-}
-
-
-size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize)
-{
-    return FSE_decompress_wksp_bmi2(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize, /* bmi2 */ 0);
-}
-
 typedef struct {
     short ncount[FSE_MAX_SYMBOL_VALUE + 1];
     FSE_DTable dtable[1]; /* Dynamically sized */
@@ -342,7 +268,8 @@ FORCE_INLINE_TEMPLATE size_t FSE_decompress_wksp_body(
     }
 
     if (FSE_DECOMPRESS_WKSP_SIZE(tableLog, maxSymbolValue) > wkspSize) return ERROR(tableLog_tooLarge);
-    workSpace = wksp->dtable + FSE_DTABLE_SIZE_U32(tableLog);
+    assert(sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog) <= wkspSize);
+    workSpace = (BYTE*)workSpace + sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog);
     wkspSize -= sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog);
 
     CHECK_F( FSE_buildDTable_internal(wksp->dtable, wksp->ncount, maxSymbolValue, tableLog, workSpace, wkspSize) );
@@ -382,9 +309,4 @@ size_t FSE_decompress_wksp_bmi2(void* dst, size_t dstCapacity, const void* cSrc,
     return FSE_decompress_wksp_body_default(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize);
 }
 
-
-typedef FSE_DTable DTable_max_t[FSE_DTABLE_SIZE_U32(FSE_MAX_TABLELOG)];
-
-
-
 #endif   /* FSE_COMMONDEFS_ONLY */
diff --git a/lib/zstd/common/huf.h b/lib/zstd/common/huf.h
index 5042ff870308..8e7943092ed1 100644
--- a/lib/zstd/common/huf.h
+++ b/lib/zstd/common/huf.h
@@ -1,7 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /* ******************************************************************
  * huff0 huffman codec,
  * part of Finite State Entropy library
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  *
  * You can contact the author at :
  * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
@@ -18,99 +19,22 @@
 
 /* *** Dependencies *** */
 #include "zstd_deps.h"    /* size_t */
-
-
-/* *** library symbols visibility *** */
-/* Note : when linking with -fvisibility=hidden on gcc, or by default on Visual,
- *        HUF symbols remain "private" (internal symbols for library only).
- *        Set macro FSE_DLL_EXPORT to 1 if you want HUF symbols visible on DLL interface */
-#if defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) && defined(__GNUC__) && (__GNUC__ >= 4)
-#  define HUF_PUBLIC_API __attribute__ ((visibility ("default")))
-#elif defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1)   /* Visual expected */
-#  define HUF_PUBLIC_API __declspec(dllexport)
-#elif defined(FSE_DLL_IMPORT) && (FSE_DLL_IMPORT==1)
-#  define HUF_PUBLIC_API __declspec(dllimport)  /* not required, just to generate faster code (saves a function pointer load from IAT and an indirect jump) */
-#else
-#  define HUF_PUBLIC_API
-#endif
-
-
-/* ========================== */
-/* ***  simple functions  *** */
-/* ========================== */
-
-/* HUF_compress() :
- *  Compress content from buffer 'src', of size 'srcSize', into buffer 'dst'.
- * 'dst' buffer must be already allocated.
- *  Compression runs faster if `dstCapacity` >= HUF_compressBound(srcSize).
- * `srcSize` must be <= `HUF_BLOCKSIZE_MAX` == 128 KB.
- * @return : size of compressed data (<= `dstCapacity`).
- *  Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!!
- *                   if HUF_isError(return), compression failed (more details using HUF_getErrorName())
- */
-HUF_PUBLIC_API size_t HUF_compress(void* dst, size_t dstCapacity,
-                             const void* src, size_t srcSize);
-
-/* HUF_decompress() :
- *  Decompress HUF data from buffer 'cSrc', of size 'cSrcSize',
- *  into already allocated buffer 'dst', of minimum size 'dstSize'.
- * `originalSize` : **must** be the ***exact*** size of original (uncompressed) data.
- *  Note : in contrast with FSE, HUF_decompress can regenerate
- *         RLE (cSrcSize==1) and uncompressed (cSrcSize==dstSize) data,
- *         because it knows size to regenerate (originalSize).
- * @return : size of regenerated data (== originalSize),
- *           or an error code, which can be tested using HUF_isError()
- */
-HUF_PUBLIC_API size_t HUF_decompress(void* dst,  size_t originalSize,
-                               const void* cSrc, size_t cSrcSize);
+#include "mem.h"          /* U32 */
+#define FSE_STATIC_LINKING_ONLY
+#include "fse.h"
 
 
 /* ***   Tool functions *** */
-#define HUF_BLOCKSIZE_MAX (128 * 1024)                  /*< maximum input size for a single block compressed with HUF_compress */
-HUF_PUBLIC_API size_t HUF_compressBound(size_t size);   /*< maximum compressed size (worst case) */
+#define HUF_BLOCKSIZE_MAX (128 * 1024)   /*< maximum input size for a single block compressed with HUF_compress */
+size_t HUF_compressBound(size_t size);   /*< maximum compressed size (worst case) */
 
 /* Error Management */
-HUF_PUBLIC_API unsigned    HUF_isError(size_t code);       /*< tells if a return value is an error code */
-HUF_PUBLIC_API const char* HUF_getErrorName(size_t code);  /*< provides error code string (useful for debugging) */
+unsigned    HUF_isError(size_t code);       /*< tells if a return value is an error code */
+const char* HUF_getErrorName(size_t code);  /*< provides error code string (useful for debugging) */
 
 
-/* ***   Advanced function   *** */
-
-/* HUF_compress2() :
- *  Same as HUF_compress(), but offers control over `maxSymbolValue` and `tableLog`.
- * `maxSymbolValue` must be <= HUF_SYMBOLVALUE_MAX .
- * `tableLog` must be `<= HUF_TABLELOG_MAX` . */
-HUF_PUBLIC_API size_t HUF_compress2 (void* dst, size_t dstCapacity,
-                               const void* src, size_t srcSize,
-                               unsigned maxSymbolValue, unsigned tableLog);
-
-/* HUF_compress4X_wksp() :
- *  Same as HUF_compress2(), but uses externally allocated `workSpace`.
- * `workspace` must be at least as large as HUF_WORKSPACE_SIZE */
 #define HUF_WORKSPACE_SIZE ((8 << 10) + 512 /* sorting scratch space */)
 #define HUF_WORKSPACE_SIZE_U64 (HUF_WORKSPACE_SIZE / sizeof(U64))
-HUF_PUBLIC_API size_t HUF_compress4X_wksp (void* dst, size_t dstCapacity,
-                                     const void* src, size_t srcSize,
-                                     unsigned maxSymbolValue, unsigned tableLog,
-                                     void* workSpace, size_t wkspSize);
-
-#endif   /* HUF_H_298734234 */
-
-/* ******************************************************************
- *  WARNING !!
- *  The following section contains advanced and experimental definitions
- *  which shall never be used in the context of a dynamic library,
- *  because they are not guaranteed to remain stable in the future.
- *  Only consider them in association with static linking.
- * *****************************************************************/
-#if !defined(HUF_H_HUF_STATIC_LINKING_ONLY)
-#define HUF_H_HUF_STATIC_LINKING_ONLY
-
-/* *** Dependencies *** */
-#include "mem.h"   /* U32 */
-#define FSE_STATIC_LINKING_ONLY
-#include "fse.h"
-
 
 /* *** Constants *** */
 #define HUF_TABLELOG_MAX      12      /* max runtime value of tableLog (due to static allocation); can be modified up to HUF_TABLELOG_ABSOLUTEMAX */
@@ -151,25 +75,49 @@ typedef U32 HUF_DTable;
 /* ****************************************
 *  Advanced decompression functions
 ******************************************/
-size_t HUF_decompress4X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /*< single-symbol decoder */
-#ifndef HUF_FORCE_DECOMPRESS_X1
-size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /*< double-symbols decoder */
-#endif
 
-size_t HUF_decompress4X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /*< decodes RLE and uncompressed */
-size_t HUF_decompress4X_hufOnly(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< considers RLE and uncompressed as errors */
-size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< considers RLE and uncompressed as errors */
-size_t HUF_decompress4X1_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /*< single-symbol decoder */
-size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize);   /*< single-symbol decoder */
-#ifndef HUF_FORCE_DECOMPRESS_X1
-size_t HUF_decompress4X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /*< double-symbols decoder */
-size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize);   /*< double-symbols decoder */
-#endif
+/*
+ * Huffman flags bitset.
+ * For all flags, 0 is the default value.
+ */
+typedef enum {
+    /*
+     * If compiled with DYNAMIC_BMI2: Set flag only if the CPU supports BMI2 at runtime.
+     * Otherwise: Ignored.
+     */
+    HUF_flags_bmi2 = (1 << 0),
+    /*
+     * If set: Test possible table depths to find the one that produces the smallest header + encoded size.
+     * If unset: Use heuristic to find the table depth.
+     */
+    HUF_flags_optimalDepth = (1 << 1),
+    /*
+     * If set: If the previous table can encode the input, always reuse the previous table.
+     * If unset: If the previous table can encode the input, reuse the previous table if it results in a smaller output.
+     */
+    HUF_flags_preferRepeat = (1 << 2),
+    /*
+     * If set: Sample the input and check if the sample is uncompressible, if it is then don't attempt to compress.
+     * If unset: Always histogram the entire input.
+     */
+    HUF_flags_suspectUncompressible = (1 << 3),
+    /*
+     * If set: Don't use assembly implementations
+     * If unset: Allow using assembly implementations
+     */
+    HUF_flags_disableAsm = (1 << 4),
+    /*
+     * If set: Don't use the fast decoding loop, always use the fallback decoding loop.
+     * If unset: Use the fast decoding loop when possible.
+     */
+    HUF_flags_disableFast = (1 << 5)
+} HUF_flags_e;
 
 
 /* ****************************************
  *  HUF detailed API
  * ****************************************/
+#define HUF_OPTIMAL_DEPTH_THRESHOLD ZSTD_btultra
 
 /*! HUF_compress() does the following:
  *  1. count symbol occurrence from source[] into table count[] using FSE_count() (exposed within "fse.h")
@@ -182,12 +130,12 @@ size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
  *  For example, it's possible to compress several blocks using the same 'CTable',
  *  or to save and regenerate 'CTable' using external methods.
  */
-unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue);
-size_t HUF_buildCTable (HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue, unsigned maxNbBits);   /* @return : maxNbBits; CTable and count can overlap. In which case, CTable will overwrite count content */
-size_t HUF_writeCTable (void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog);
+unsigned HUF_minTableLog(unsigned symbolCardinality);
+unsigned HUF_cardinality(const unsigned* count, unsigned maxSymbolValue);
+unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, void* workSpace,
+ size_t wkspSize, HUF_CElt* table, const unsigned* count, int flags); /* table is used as scratch space for building and testing tables, not a return value */
 size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog, void* workspace, size_t workspaceSize);
-size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable);
-size_t HUF_compress4X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2);
+size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags);
 size_t HUF_estimateCompressedSize(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue);
 int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue);
 
@@ -196,6 +144,7 @@ typedef enum {
    HUF_repeat_check, /*< Can use the previous table but it must be checked. Note : The previous table must have been constructed by HUF_compress{1, 4}X_repeat */
    HUF_repeat_valid  /*< Can use the previous table and it is assumed to be valid */
  } HUF_repeat;
+
 /* HUF_compress4X_repeat() :
  *  Same as HUF_compress4X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none.
  *  If it uses hufTable it does not modify hufTable or repeat.
@@ -206,13 +155,13 @@ size_t HUF_compress4X_repeat(void* dst, size_t dstSize,
                        const void* src, size_t srcSize,
                        unsigned maxSymbolValue, unsigned tableLog,
                        void* workSpace, size_t wkspSize,    /*< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */
-                       HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible);
+                       HUF_CElt* hufTable, HUF_repeat* repeat, int flags);
 
 /* HUF_buildCTable_wksp() :
  *  Same as HUF_buildCTable(), but using externally allocated scratch buffer.
  * `workSpace` must be aligned on 4-bytes boundaries, and its size must be >= HUF_CTABLE_WORKSPACE_SIZE.
  */
-#define HUF_CTABLE_WORKSPACE_SIZE_U32 (2*HUF_SYMBOLVALUE_MAX +1 +1)
+#define HUF_CTABLE_WORKSPACE_SIZE_U32 ((4 * (HUF_SYMBOLVALUE_MAX + 1)) + 192)
 #define HUF_CTABLE_WORKSPACE_SIZE (HUF_CTABLE_WORKSPACE_SIZE_U32 * sizeof(unsigned))
 size_t HUF_buildCTable_wksp (HUF_CElt* tree,
                        const unsigned* count, U32 maxSymbolValue, U32 maxNbBits,
@@ -238,7 +187,7 @@ size_t HUF_readStats_wksp(BYTE* huffWeight, size_t hwSize,
                           U32* rankStats, U32* nbSymbolsPtr, U32* tableLogPtr,
                           const void* src, size_t srcSize,
                           void* workspace, size_t wkspSize,
-                          int bmi2);
+                          int flags);
 
 /* HUF_readCTable() :
  *  Loading a CTable saved with HUF_writeCTable() */
@@ -276,32 +225,12 @@ U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize);
 #define HUF_DECOMPRESS_WORKSPACE_SIZE ((2 << 10) + (1 << 9))
 #define HUF_DECOMPRESS_WORKSPACE_SIZE_U32 (HUF_DECOMPRESS_WORKSPACE_SIZE / sizeof(U32))
 
-#ifndef HUF_FORCE_DECOMPRESS_X2
-size_t HUF_readDTableX1 (HUF_DTable* DTable, const void* src, size_t srcSize);
-size_t HUF_readDTableX1_wksp (HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize);
-#endif
-#ifndef HUF_FORCE_DECOMPRESS_X1
-size_t HUF_readDTableX2 (HUF_DTable* DTable, const void* src, size_t srcSize);
-size_t HUF_readDTableX2_wksp (HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize);
-#endif
-
-size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);
-#ifndef HUF_FORCE_DECOMPRESS_X2
-size_t HUF_decompress4X1_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);
-#endif
-#ifndef HUF_FORCE_DECOMPRESS_X1
-size_t HUF_decompress4X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);
-#endif
-
 
 /* ====================== */
 /* single stream variants */
 /* ====================== */
 
-size_t HUF_compress1X (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog);
-size_t HUF_compress1X_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize);  /*< `workSpace` must be a table of at least HUF_WORKSPACE_SIZE_U64 U64 */
-size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable);
-size_t HUF_compress1X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2);
+size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags);
 /* HUF_compress1X_repeat() :
  *  Same as HUF_compress1X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none.
  *  If it uses hufTable it does not modify hufTable or repeat.
@@ -312,47 +241,28 @@ size_t HUF_compress1X_repeat(void* dst, size_t dstSize,
                        const void* src, size_t srcSize,
                        unsigned maxSymbolValue, unsigned tableLog,
                        void* workSpace, size_t wkspSize,   /*< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */
-                       HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible);
-
-size_t HUF_decompress1X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /* single-symbol decoder */
-#ifndef HUF_FORCE_DECOMPRESS_X1
-size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /* double-symbol decoder */
-#endif
-
-size_t HUF_decompress1X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);
-size_t HUF_decompress1X_DCtx_wksp (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize);
-#ifndef HUF_FORCE_DECOMPRESS_X2
-size_t HUF_decompress1X1_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /*< single-symbol decoder */
-size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize);   /*< single-symbol decoder */
-#endif
-#ifndef HUF_FORCE_DECOMPRESS_X1
-size_t HUF_decompress1X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /*< double-symbols decoder */
-size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize);   /*< double-symbols decoder */
-#endif
+                       HUF_CElt* hufTable, HUF_repeat* repeat, int flags);
 
-size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);   /*< automatic selection of sing or double symbol decoder, based on DTable */
-#ifndef HUF_FORCE_DECOMPRESS_X2
-size_t HUF_decompress1X1_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);
-#endif
+size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags);
 #ifndef HUF_FORCE_DECOMPRESS_X1
-size_t HUF_decompress1X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);
+size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags);   /*< double-symbols decoder */
 #endif
 
 /* BMI2 variants.
  * If the CPU has BMI2 support, pass bmi2=1, otherwise pass bmi2=0.
  */
-size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2);
+size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags);
 #ifndef HUF_FORCE_DECOMPRESS_X2
-size_t HUF_decompress1X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2);
+size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags);
 #endif
-size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2);
-size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2);
+size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags);
+size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags);
 #ifndef HUF_FORCE_DECOMPRESS_X2
-size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2);
+size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags);
 #endif
 #ifndef HUF_FORCE_DECOMPRESS_X1
-size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2);
+size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags);
 #endif
 
-#endif /* HUF_STATIC_LINKING_ONLY */
+#endif   /* HUF_H_298734234 */
 
diff --git a/lib/zstd/common/mem.h b/lib/zstd/common/mem.h
index 1d9cc03924ca..a7231822b6e3 100644
--- a/lib/zstd/common/mem.h
+++ b/lib/zstd/common/mem.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
diff --git a/lib/zstd/common/portability_macros.h b/lib/zstd/common/portability_macros.h
index 0e3b2c0a527d..7ede8cf1ffe5 100644
--- a/lib/zstd/common/portability_macros.h
+++ b/lib/zstd/common/portability_macros.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -12,7 +13,7 @@
 #define ZSTD_PORTABILITY_MACROS_H
 
 /*
- * This header file contains macro defintions to support portability.
+ * This header file contains macro definitions to support portability.
  * This header is shared between C and ASM code, so it MUST only
  * contain macro definitions. It MUST not contain any C code.
  *
@@ -65,7 +66,7 @@
 #endif
 
 /*
- * Only enable assembly for GNUC comptabile compilers,
+ * Only enable assembly for GNUC compatible compilers,
  * because other platforms may not support GAS assembly syntax.
  *
  * Only enable assembly for Linux / MacOS, other platforms may
@@ -90,4 +91,23 @@
  */
 #define ZSTD_ENABLE_ASM_X86_64_BMI2 0
 
+/*
+ * For x86 ELF targets, add .note.gnu.property section for Intel CET in
+ * assembly sources when CET is enabled.
+ *
+ * Additionally, any function that may be called indirectly must begin
+ * with ZSTD_CET_ENDBRANCH.
+ */
+#if defined(__ELF__) && (defined(__x86_64__) || defined(__i386__)) \
+    && defined(__has_include)
+# if __has_include(<cet.h>)
+#  include <cet.h>
+#  define ZSTD_CET_ENDBRANCH _CET_ENDBR
+# endif
+#endif
+
+#ifndef ZSTD_CET_ENDBRANCH
+# define ZSTD_CET_ENDBRANCH
+#endif
+
 #endif /* ZSTD_PORTABILITY_MACROS_H */
diff --git a/lib/zstd/common/zstd_common.c b/lib/zstd/common/zstd_common.c
index 3d7e35b309b5..44b95b25344a 100644
--- a/lib/zstd/common/zstd_common.c
+++ b/lib/zstd/common/zstd_common.c
@@ -1,5 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -14,7 +15,6 @@
 *  Dependencies
 ***************************************/
 #define ZSTD_DEPS_NEED_MALLOC
-#include "zstd_deps.h"   /* ZSTD_malloc, ZSTD_calloc, ZSTD_free, ZSTD_memset */
 #include "error_private.h"
 #include "zstd_internal.h"
 
@@ -47,37 +47,3 @@ ZSTD_ErrorCode ZSTD_getErrorCode(size_t code) { return ERR_getErrorCode(code); }
 /*! ZSTD_getErrorString() :
  *  provides error code string from enum */
 const char* ZSTD_getErrorString(ZSTD_ErrorCode code) { return ERR_getErrorString(code); }
-
-
-
-/*=**************************************************************
-*  Custom allocator
-****************************************************************/
-void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem)
-{
-    if (customMem.customAlloc)
-        return customMem.customAlloc(customMem.opaque, size);
-    return ZSTD_malloc(size);
-}
-
-void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem)
-{
-    if (customMem.customAlloc) {
-        /* calloc implemented as malloc+memset;
-         * not as efficient as calloc, but next best guess for custom malloc */
-        void* const ptr = customMem.customAlloc(customMem.opaque, size);
-        ZSTD_memset(ptr, 0, size);
-        return ptr;
-    }
-    return ZSTD_calloc(1, size);
-}
-
-void ZSTD_customFree(void* ptr, ZSTD_customMem customMem)
-{
-    if (ptr!=NULL) {
-        if (customMem.customFree)
-            customMem.customFree(customMem.opaque, ptr);
-        else
-            ZSTD_free(ptr);
-    }
-}
diff --git a/lib/zstd/common/zstd_deps.h b/lib/zstd/common/zstd_deps.h
index 2c34e8a33a1c..670c5fa2a952 100644
--- a/lib/zstd/common/zstd_deps.h
+++ b/lib/zstd/common/zstd_deps.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -105,3 +105,21 @@ static uint64_t ZSTD_div64(uint64_t dividend, uint32_t divisor) {
 
 #endif /* ZSTD_DEPS_IO */
 #endif /* ZSTD_DEPS_NEED_IO */
+
+/*
+ * Only requested when MSAN is enabled.
+ * Need:
+ * intptr_t
+ */
+#ifdef ZSTD_DEPS_NEED_STDINT
+#ifndef ZSTD_DEPS_STDINT
+#define ZSTD_DEPS_STDINT
+
+/*
+ * The Linux Kernel doesn't provide intptr_t, only uintptr_t, which
+ * is an unsigned long.
+ */
+typedef long intptr_t;
+
+#endif /* ZSTD_DEPS_STDINT */
+#endif /* ZSTD_DEPS_NEED_STDINT */
diff --git a/lib/zstd/common/zstd_internal.h b/lib/zstd/common/zstd_internal.h
index 93305d9b41bb..7f023e4d4774 100644
--- a/lib/zstd/common/zstd_internal.h
+++ b/lib/zstd/common/zstd_internal.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -28,7 +29,6 @@
 #include <linux/zstd.h>
 #define FSE_STATIC_LINKING_ONLY
 #include "fse.h"
-#define HUF_STATIC_LINKING_ONLY
 #include "huf.h"
 #include <linux/xxhash.h>                /* XXH_reset, update, digest */
 #define ZSTD_TRACE 0
@@ -83,9 +83,9 @@ typedef enum { bt_raw, bt_rle, bt_compressed, bt_reserved } blockType_e;
 #define ZSTD_FRAMECHECKSUMSIZE 4
 
 #define MIN_SEQUENCES_SIZE 1 /* nbSeq==0 */
-#define MIN_CBLOCK_SIZE (1 /*litCSize*/ + 1 /* RLE or RAW */ + MIN_SEQUENCES_SIZE /* nbSeq==0 */)   /* for a non-null block */
+#define MIN_CBLOCK_SIZE (1 /*litCSize*/ + 1 /* RLE or RAW */)   /* for a non-null block */
+#define MIN_LITERALS_FOR_4_STREAMS 6
 
-#define HufLog 12
 typedef enum { set_basic, set_rle, set_compressed, set_repeat } symbolEncodingType_e;
 
 #define LONGNBSEQ 0x7F00
@@ -93,6 +93,7 @@ typedef enum { set_basic, set_rle, set_compressed, set_repeat } symbolEncodingTy
 #define MINMATCH 3
 
 #define Litbits  8
+#define LitHufLog 11
 #define MaxLit ((1<<Litbits) - 1)
 #define MaxML   52
 #define MaxLL   35
@@ -103,6 +104,8 @@ typedef enum { set_basic, set_rle, set_compressed, set_repeat } symbolEncodingTy
 #define LLFSELog    9
 #define OffFSELog   8
 #define MaxFSELog  MAX(MAX(MLFSELog, LLFSELog), OffFSELog)
+#define MaxMLBits 16
+#define MaxLLBits 16
 
 #define ZSTD_MAX_HUF_HEADER_SIZE 128 /* header + <= 127 byte tree description */
 /* Each table cannot take more than #symbols * FSELog bits */
@@ -225,12 +228,6 @@ void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length, ZSTD_overlap_e
          * one COPY16() in the first call. Then, do two calls per loop since
          * at that point it is more likely to have a high trip count.
          */
-#ifdef __aarch64__
-        do {
-            COPY16(op, ip);
-        }
-        while (op < oend);
-#else
         ZSTD_copy16(op, ip);
         if (16 >= length) return;
         op += 16;
@@ -240,7 +237,6 @@ void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length, ZSTD_overlap_e
             COPY16(op, ip);
         }
         while (op < oend);
-#endif
     }
 }
 
@@ -289,11 +285,11 @@ typedef enum {
 typedef struct {
     seqDef* sequencesStart;
     seqDef* sequences;      /* ptr to end of sequences */
-    BYTE* litStart;
-    BYTE* lit;              /* ptr to end of literals */
-    BYTE* llCode;
-    BYTE* mlCode;
-    BYTE* ofCode;
+    BYTE*  litStart;
+    BYTE*  lit;             /* ptr to end of literals */
+    BYTE*  llCode;
+    BYTE*  mlCode;
+    BYTE*  ofCode;
     size_t maxNbSeq;
     size_t maxNbLit;
 
@@ -301,8 +297,8 @@ typedef struct {
      * in the seqStore that has a value larger than U16 (if it exists). To do so, we increment
      * the existing value of the litLength or matchLength by 0x10000.
      */
-    ZSTD_longLengthType_e   longLengthType;
-    U32                     longLengthPos;  /* Index of the sequence to apply long length modification to */
+    ZSTD_longLengthType_e longLengthType;
+    U32                   longLengthPos;  /* Index of the sequence to apply long length modification to */
 } seqStore_t;
 
 typedef struct {
@@ -321,10 +317,10 @@ MEM_STATIC ZSTD_sequenceLength ZSTD_getSequenceLength(seqStore_t const* seqStore
     seqLen.matchLength = seq->mlBase + MINMATCH;
     if (seqStore->longLengthPos == (U32)(seq - seqStore->sequencesStart)) {
         if (seqStore->longLengthType == ZSTD_llt_literalLength) {
-            seqLen.litLength += 0xFFFF;
+            seqLen.litLength += 0x10000;
         }
         if (seqStore->longLengthType == ZSTD_llt_matchLength) {
-            seqLen.matchLength += 0xFFFF;
+            seqLen.matchLength += 0x10000;
         }
     }
     return seqLen;
@@ -337,72 +333,13 @@ MEM_STATIC ZSTD_sequenceLength ZSTD_getSequenceLength(seqStore_t const* seqStore
  *          `decompressedBound != ZSTD_CONTENTSIZE_ERROR`
  */
 typedef struct {
+    size_t nbBlocks;
     size_t compressedSize;
     unsigned long long decompressedBound;
 } ZSTD_frameSizeInfo;   /* decompress & legacy */
 
 const seqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx);   /* compress & dictBuilder */
-void ZSTD_seqToCodes(const seqStore_t* seqStorePtr);   /* compress, dictBuilder, decodeCorpus (shouldn't get its definition from here) */
-
-/* custom memory allocation functions */
-void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem);
-void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem);
-void ZSTD_customFree(void* ptr, ZSTD_customMem customMem);
-
-
-MEM_STATIC U32 ZSTD_highbit32(U32 val)   /* compress, dictBuilder, decodeCorpus */
-{
-    assert(val != 0);
-    {
-#   if (__GNUC__ >= 3)   /* GCC Intrinsic */
-        return __builtin_clz (val) ^ 31;
-#   else   /* Software version */
-        static const U32 DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31 };
-        U32 v = val;
-        v |= v >> 1;
-        v |= v >> 2;
-        v |= v >> 4;
-        v |= v >> 8;
-        v |= v >> 16;
-        return DeBruijnClz[(v * 0x07C4ACDDU) >> 27];
-#   endif
-    }
-}
-
-/*
- * Counts the number of trailing zeros of a `size_t`.
- * Most compilers should support CTZ as a builtin. A backup
- * implementation is provided if the builtin isn't supported, but
- * it may not be terribly efficient.
- */
-MEM_STATIC unsigned ZSTD_countTrailingZeros(size_t val)
-{
-    if (MEM_64bits()) {
-#       if (__GNUC__ >= 4)
-            return __builtin_ctzll((U64)val);
-#       else
-            static const int DeBruijnBytePos[64] = {  0,  1,  2,  7,  3, 13,  8, 19,
-                                                      4, 25, 14, 28,  9, 34, 20, 56,
-                                                      5, 17, 26, 54, 15, 41, 29, 43,
-                                                      10, 31, 38, 35, 21, 45, 49, 57,
-                                                      63,  6, 12, 18, 24, 27, 33, 55,
-                                                      16, 53, 40, 42, 30, 37, 44, 48,
-                                                      62, 11, 23, 32, 52, 39, 36, 47,
-                                                      61, 22, 51, 46, 60, 50, 59, 58 };
-            return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58];
-#       endif
-    } else { /* 32 bits */
-#       if (__GNUC__ >= 3)
-            return __builtin_ctz((U32)val);
-#       else
-            static const int DeBruijnBytePos[32] = {  0,  1, 28,  2, 29, 14, 24,  3,
-                                                     30, 22, 20, 15, 25, 17,  4,  8,
-                                                     31, 27, 13, 23, 21, 19, 16,  7,
-                                                     26, 12, 18,  6, 11,  5, 10,  9 };
-            return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27];
-#       endif
-    }
-}
+int ZSTD_seqToCodes(const seqStore_t* seqStorePtr);   /* compress, dictBuilder, decodeCorpus (shouldn't get its definition from here) */
 
 
 /* ZSTD_invalidateRepCodes() :
diff --git a/lib/zstd/compress/clevels.h b/lib/zstd/compress/clevels.h
index d9a76112ec3a..6ab8be6532ef 100644
--- a/lib/zstd/compress/clevels.h
+++ b/lib/zstd/compress/clevels.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
diff --git a/lib/zstd/compress/fse_compress.c b/lib/zstd/compress/fse_compress.c
index ec5b1ca6d71a..e46ca6621b48 100644
--- a/lib/zstd/compress/fse_compress.c
+++ b/lib/zstd/compress/fse_compress.c
@@ -1,6 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /* ******************************************************************
  * FSE : Finite State Entropy encoder
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  *
  *  You can contact the author at :
  *  - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
@@ -26,6 +27,7 @@
 #define ZSTD_DEPS_NEED_MALLOC
 #define ZSTD_DEPS_NEED_MATH64
 #include "../common/zstd_deps.h"  /* ZSTD_malloc, ZSTD_free, ZSTD_memcpy, ZSTD_memset */
+#include "../common/bits.h" /* ZSTD_highbit32 */
 
 
 /* **************************************************************
@@ -90,7 +92,7 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct,
     assert(tableLog < 16);   /* required for threshold strategy to work */
 
     /* For explanations on how to distribute symbol values over the table :
-     * http://fastcompression.blogspot.fr/2014/02/fse-distributing-symbol-values.html */
+     * https://fastcompression.blogspot.fr/2014/02/fse-distributing-symbol-values.html */
 
      #ifdef __clang_analyzer__
      ZSTD_memset(tableSymbol, 0, sizeof(*tableSymbol) * tableSize);   /* useless initialization, just to keep scan-build happy */
@@ -191,7 +193,7 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct,
                 break;
             default :
                 assert(normalizedCounter[s] > 1);
-                {   U32 const maxBitsOut = tableLog - BIT_highbit32 ((U32)normalizedCounter[s]-1);
+                {   U32 const maxBitsOut = tableLog - ZSTD_highbit32 ((U32)normalizedCounter[s]-1);
                     U32 const minStatePlus = (U32)normalizedCounter[s] << maxBitsOut;
                     symbolTT[s].deltaNbBits = (maxBitsOut << 16) - minStatePlus;
                     symbolTT[s].deltaFindState = (int)(total - (unsigned)normalizedCounter[s]);
@@ -342,21 +344,11 @@ size_t FSE_writeNCount (void* buffer, size_t bufferSize,
 *  FSE Compression Code
 ****************************************************************/
 
-FSE_CTable* FSE_createCTable (unsigned maxSymbolValue, unsigned tableLog)
-{
-    size_t size;
-    if (tableLog > FSE_TABLELOG_ABSOLUTE_MAX) tableLog = FSE_TABLELOG_ABSOLUTE_MAX;
-    size = FSE_CTABLE_SIZE_U32 (tableLog, maxSymbolValue) * sizeof(U32);
-    return (FSE_CTable*)ZSTD_malloc(size);
-}
-
-void FSE_freeCTable (FSE_CTable* ct) { ZSTD_free(ct); }
-
 /* provides the minimum logSize to safely represent a distribution */
 static unsigned FSE_minTableLog(size_t srcSize, unsigned maxSymbolValue)
 {
-    U32 minBitsSrc = BIT_highbit32((U32)(srcSize)) + 1;
-    U32 minBitsSymbols = BIT_highbit32(maxSymbolValue) + 2;
+    U32 minBitsSrc = ZSTD_highbit32((U32)(srcSize)) + 1;
+    U32 minBitsSymbols = ZSTD_highbit32(maxSymbolValue) + 2;
     U32 minBits = minBitsSrc < minBitsSymbols ? minBitsSrc : minBitsSymbols;
     assert(srcSize > 1); /* Not supported, RLE should be used instead */
     return minBits;
@@ -364,7 +356,7 @@ static unsigned FSE_minTableLog(size_t srcSize, unsigned maxSymbolValue)
 
 unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus)
 {
-    U32 maxBitsSrc = BIT_highbit32((U32)(srcSize - 1)) - minus;
+    U32 maxBitsSrc = ZSTD_highbit32((U32)(srcSize - 1)) - minus;
     U32 tableLog = maxTableLog;
     U32 minBits = FSE_minTableLog(srcSize, maxSymbolValue);
     assert(srcSize > 1); /* Not supported, RLE should be used instead */
@@ -532,40 +524,6 @@ size_t FSE_normalizeCount (short* normalizedCounter, unsigned tableLog,
     return tableLog;
 }
 
-
-/* fake FSE_CTable, for raw (uncompressed) input */
-size_t FSE_buildCTable_raw (FSE_CTable* ct, unsigned nbBits)
-{
-    const unsigned tableSize = 1 << nbBits;
-    const unsigned tableMask = tableSize - 1;
-    const unsigned maxSymbolValue = tableMask;
-    void* const ptr = ct;
-    U16* const tableU16 = ( (U16*) ptr) + 2;
-    void* const FSCT = ((U32*)ptr) + 1 /* header */ + (tableSize>>1);   /* assumption : tableLog >= 1 */
-    FSE_symbolCompressionTransform* const symbolTT = (FSE_symbolCompressionTransform*) (FSCT);
-    unsigned s;
-
-    /* Sanity checks */
-    if (nbBits < 1) return ERROR(GENERIC);             /* min size */
-
-    /* header */
-    tableU16[-2] = (U16) nbBits;
-    tableU16[-1] = (U16) maxSymbolValue;
-
-    /* Build table */
-    for (s=0; s<tableSize; s++)
-        tableU16[s] = (U16)(tableSize + s);
-
-    /* Build Symbol Transformation Table */
-    {   const U32 deltaNbBits = (nbBits << 16) - (1 << nbBits);
-        for (s=0; s<=maxSymbolValue; s++) {
-            symbolTT[s].deltaNbBits = deltaNbBits;
-            symbolTT[s].deltaFindState = s-1;
-    }   }
-
-    return 0;
-}
-
 /* fake FSE_CTable, for rle input (always same symbol) */
 size_t FSE_buildCTable_rle (FSE_CTable* ct, BYTE symbolValue)
 {
@@ -664,5 +622,4 @@ size_t FSE_compress_usingCTable (void* dst, size_t dstSize,
 
 size_t FSE_compressBound(size_t size) { return FSE_COMPRESSBOUND(size); }
 
-
 #endif   /* FSE_COMMONDEFS_ONLY */
diff --git a/lib/zstd/compress/hist.c b/lib/zstd/compress/hist.c
index 3ddc6dfb6894..0b12587cc14b 100644
--- a/lib/zstd/compress/hist.c
+++ b/lib/zstd/compress/hist.c
@@ -1,7 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /* ******************************************************************
  * hist : Histogram functions
  * part of Finite State Entropy project
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  *
  *  You can contact the author at :
  *  - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
diff --git a/lib/zstd/compress/hist.h b/lib/zstd/compress/hist.h
index fc1830abc9c6..f7687b0fc20a 100644
--- a/lib/zstd/compress/hist.h
+++ b/lib/zstd/compress/hist.h
@@ -1,7 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /* ******************************************************************
  * hist : Histogram functions
  * part of Finite State Entropy project
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  *
  *  You can contact the author at :
  *  - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
diff --git a/lib/zstd/compress/huf_compress.c b/lib/zstd/compress/huf_compress.c
index 74ef0db47621..83241abafe35 100644
--- a/lib/zstd/compress/huf_compress.c
+++ b/lib/zstd/compress/huf_compress.c
@@ -1,6 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /* ******************************************************************
  * Huffman encoder, part of New Generation Entropy library
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  *
  *  You can contact the author at :
  *  - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy
@@ -26,9 +27,9 @@
 #include "hist.h"
 #define FSE_STATIC_LINKING_ONLY   /* FSE_optimalTableLog_internal */
 #include "../common/fse.h"        /* header compression */
-#define HUF_STATIC_LINKING_ONLY
 #include "../common/huf.h"
 #include "../common/error_private.h"
+#include "../common/bits.h"       /* ZSTD_highbit32 */
 
 
 /* **************************************************************
@@ -39,13 +40,67 @@
 
 
 /* **************************************************************
-*  Utils
+*  Required declarations
 ****************************************************************/
-unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue)
+typedef struct nodeElt_s {
+    U32 count;
+    U16 parent;
+    BYTE byte;
+    BYTE nbBits;
+} nodeElt;
+
+
+/* **************************************************************
+*  Debug Traces
+****************************************************************/
+
+#if DEBUGLEVEL >= 2
+
+static size_t showU32(const U32* arr, size_t size)
+{
+    size_t u;
+    for (u=0; u<size; u++) {
+        RAWLOG(6, " %u", arr[u]); (void)arr;
+    }
+    RAWLOG(6, " \n");
+    return size;
+}
+
+static size_t HUF_getNbBits(HUF_CElt elt);
+
+static size_t showCTableBits(const HUF_CElt* ctable, size_t size)
 {
-    return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 1);
+    size_t u;
+    for (u=0; u<size; u++) {
+        RAWLOG(6, " %zu", HUF_getNbBits(ctable[u])); (void)ctable;
+    }
+    RAWLOG(6, " \n");
+    return size;
+
 }
 
+static size_t showHNodeSymbols(const nodeElt* hnode, size_t size)
+{
+    size_t u;
+    for (u=0; u<size; u++) {
+        RAWLOG(6, " %u", hnode[u].byte); (void)hnode;
+    }
+    RAWLOG(6, " \n");
+    return size;
+}
+
+static size_t showHNodeBits(const nodeElt* hnode, size_t size)
+{
+    size_t u;
+    for (u=0; u<size; u++) {
+        RAWLOG(6, " %u", hnode[u].nbBits); (void)hnode;
+    }
+    RAWLOG(6, " \n");
+    return size;
+}
+
+#endif
+
 
 /* *******************************************************
 *  HUF : Huffman block compression
@@ -86,7 +141,10 @@ typedef struct {
     S16 norm[HUF_TABLELOG_MAX+1];
 } HUF_CompressWeightsWksp;
 
-static size_t HUF_compressWeights(void* dst, size_t dstSize, const void* weightTable, size_t wtSize, void* workspace, size_t workspaceSize)
+static size_t
+HUF_compressWeights(void* dst, size_t dstSize,
+              const void* weightTable, size_t wtSize,
+                    void* workspace, size_t workspaceSize)
 {
     BYTE* const ostart = (BYTE*) dst;
     BYTE* op = ostart;
@@ -137,7 +195,7 @@ static size_t HUF_getNbBitsFast(HUF_CElt elt)
 
 static size_t HUF_getValue(HUF_CElt elt)
 {
-    return elt & ~0xFF;
+    return elt & ~(size_t)0xFF;
 }
 
 static size_t HUF_getValueFast(HUF_CElt elt)
@@ -175,6 +233,8 @@ size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize,
     U32 n;
     HUF_WriteCTableWksp* wksp = (HUF_WriteCTableWksp*)HUF_alignUpWorkspace(workspace, &workspaceSize, ZSTD_ALIGNOF(U32));
 
+    HUF_STATIC_ASSERT(HUF_CTABLE_WORKSPACE_SIZE >= sizeof(HUF_WriteCTableWksp));
+
     /* check conditions */
     if (workspaceSize < sizeof(HUF_WriteCTableWksp)) return ERROR(GENERIC);
     if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) return ERROR(maxSymbolValue_tooLarge);
@@ -204,16 +264,6 @@ size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize,
     return ((maxSymbolValue+1)/2) + 1;
 }
 
-/*! HUF_writeCTable() :
-    `CTable` : Huffman tree to save, using huf representation.
-    @return : size of saved CTable */
-size_t HUF_writeCTable (void* dst, size_t maxDstSize,
-                        const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog)
-{
-    HUF_WriteCTableWksp wksp;
-    return HUF_writeCTable_wksp(dst, maxDstSize, CTable, maxSymbolValue, huffLog, &wksp, sizeof(wksp));
-}
-
 
 size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize, unsigned* hasZeroWeights)
 {
@@ -269,68 +319,64 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void
 
 U32 HUF_getNbBitsFromCTable(HUF_CElt const* CTable, U32 symbolValue)
 {
-    const HUF_CElt* ct = CTable + 1;
+    const HUF_CElt* const ct = CTable + 1;
     assert(symbolValue <= HUF_SYMBOLVALUE_MAX);
     return (U32)HUF_getNbBits(ct[symbolValue]);
 }
 
 
-typedef struct nodeElt_s {
-    U32 count;
-    U16 parent;
-    BYTE byte;
-    BYTE nbBits;
-} nodeElt;
-
 /*
  * HUF_setMaxHeight():
- * Enforces maxNbBits on the Huffman tree described in huffNode.
+ * Try to enforce @targetNbBits on the Huffman tree described in @huffNode.
  *
- * It sets all nodes with nbBits > maxNbBits to be maxNbBits. Then it adjusts
- * the tree to so that it is a valid canonical Huffman tree.
+ * It attempts to convert all nodes with nbBits > @targetNbBits
+ * to employ @targetNbBits instead. Then it adjusts the tree
+ * so that it remains a valid canonical Huffman tree.
  *
  * @pre               The sum of the ranks of each symbol == 2^largestBits,
  *                    where largestBits == huffNode[lastNonNull].nbBits.
  * @post              The sum of the ranks of each symbol == 2^largestBits,
- *                    where largestBits is the return value <= maxNbBits.
+ *                    where largestBits is the return value (expected <= targetNbBits).
  *
- * @param huffNode    The Huffman tree modified in place to enforce maxNbBits.
+ * @param huffNode    The Huffman tree modified in place to enforce targetNbBits.
+ *                    It's presumed sorted, from most frequent to rarest symbol.
  * @param lastNonNull The symbol with the lowest count in the Huffman tree.
- * @param maxNbBits   The maximum allowed number of bits, which the Huffman tree
+ * @param targetNbBits  The allowed number of bits, which the Huffman tree
  *                    may not respect. After this function the Huffman tree will
- *                    respect maxNbBits.
- * @return            The maximum number of bits of the Huffman tree after adjustment,
- *                    necessarily no more than maxNbBits.
+ *                    respect targetNbBits.
+ * @return            The maximum number of bits of the Huffman tree after adjustment.
  */
-static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits)
+static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 targetNbBits)
 {
     const U32 largestBits = huffNode[lastNonNull].nbBits;
-    /* early exit : no elt > maxNbBits, so the tree is already valid. */
-    if (largestBits <= maxNbBits) return largestBits;
+    /* early exit : no elt > targetNbBits, so the tree is already valid. */
+    if (largestBits <= targetNbBits) return largestBits;
+
+    DEBUGLOG(5, "HUF_setMaxHeight (targetNbBits = %u)", targetNbBits);
 
     /* there are several too large elements (at least >= 2) */
     {   int totalCost = 0;
-        const U32 baseCost = 1 << (largestBits - maxNbBits);
+        const U32 baseCost = 1 << (largestBits - targetNbBits);
         int n = (int)lastNonNull;
 
-        /* Adjust any ranks > maxNbBits to maxNbBits.
+        /* Adjust any ranks > targetNbBits to targetNbBits.
          * Compute totalCost, which is how far the sum of the ranks is
          * we are over 2^largestBits after adjust the offending ranks.
          */
-        while (huffNode[n].nbBits > maxNbBits) {
+        while (huffNode[n].nbBits > targetNbBits) {
             totalCost += baseCost - (1 << (largestBits - huffNode[n].nbBits));
-            huffNode[n].nbBits = (BYTE)maxNbBits;
+            huffNode[n].nbBits = (BYTE)targetNbBits;
             n--;
         }
-        /* n stops at huffNode[n].nbBits <= maxNbBits */
-        assert(huffNode[n].nbBits <= maxNbBits);
-        /* n end at index of smallest symbol using < maxNbBits */
-        while (huffNode[n].nbBits == maxNbBits) --n;
+        /* n stops at huffNode[n].nbBits <= targetNbBits */
+        assert(huffNode[n].nbBits <= targetNbBits);
+        /* n end at index of smallest symbol using < targetNbBits */
+        while (huffNode[n].nbBits == targetNbBits) --n;
 
-        /* renorm totalCost from 2^largestBits to 2^maxNbBits
+        /* renorm totalCost from 2^largestBits to 2^targetNbBits
          * note : totalCost is necessarily a multiple of baseCost */
-        assert((totalCost & (baseCost - 1)) == 0);
-        totalCost >>= (largestBits - maxNbBits);
+        assert(((U32)totalCost & (baseCost - 1)) == 0);
+        totalCost >>= (largestBits - targetNbBits);
         assert(totalCost > 0);
 
         /* repay normalized cost */
@@ -339,19 +385,19 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits)
 
             /* Get pos of last (smallest = lowest cum. count) symbol per rank */
             ZSTD_memset(rankLast, 0xF0, sizeof(rankLast));
-            {   U32 currentNbBits = maxNbBits;
+            {   U32 currentNbBits = targetNbBits;
                 int pos;
                 for (pos=n ; pos >= 0; pos--) {
                     if (huffNode[pos].nbBits >= currentNbBits) continue;
-                    currentNbBits = huffNode[pos].nbBits;   /* < maxNbBits */
-                    rankLast[maxNbBits-currentNbBits] = (U32)pos;
+                    currentNbBits = huffNode[pos].nbBits;   /* < targetNbBits */
+                    rankLast[targetNbBits-currentNbBits] = (U32)pos;
             }   }
 
             while (totalCost > 0) {
                 /* Try to reduce the next power of 2 above totalCost because we
                  * gain back half the rank.
                  */
-                U32 nBitsToDecrease = BIT_highbit32((U32)totalCost) + 1;
+                U32 nBitsToDecrease = ZSTD_highbit32((U32)totalCost) + 1;
                 for ( ; nBitsToDecrease > 1; nBitsToDecrease--) {
                     U32 const highPos = rankLast[nBitsToDecrease];
                     U32 const lowPos = rankLast[nBitsToDecrease-1];
@@ -391,7 +437,7 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits)
                     rankLast[nBitsToDecrease] = noSymbol;
                 else {
                     rankLast[nBitsToDecrease]--;
-                    if (huffNode[rankLast[nBitsToDecrease]].nbBits != maxNbBits-nBitsToDecrease)
+                    if (huffNode[rankLast[nBitsToDecrease]].nbBits != targetNbBits-nBitsToDecrease)
                         rankLast[nBitsToDecrease] = noSymbol;   /* this rank is now empty */
                 }
             }   /* while (totalCost > 0) */
@@ -403,11 +449,11 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits)
              * TODO.
              */
             while (totalCost < 0) {  /* Sometimes, cost correction overshoot */
-                /* special case : no rank 1 symbol (using maxNbBits-1);
-                 * let's create one from largest rank 0 (using maxNbBits).
+                /* special case : no rank 1 symbol (using targetNbBits-1);
+                 * let's create one from largest rank 0 (using targetNbBits).
                  */
                 if (rankLast[1] == noSymbol) {
-                    while (huffNode[n].nbBits == maxNbBits) n--;
+                    while (huffNode[n].nbBits == targetNbBits) n--;
                     huffNode[n+1].nbBits--;
                     assert(n >= 0);
                     rankLast[1] = (U32)(n+1);
@@ -421,7 +467,7 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits)
         }   /* repay normalized cost */
     }   /* there are several too large elements (at least >= 2) */
 
-    return maxNbBits;
+    return targetNbBits;
 }
 
 typedef struct {
@@ -429,7 +475,7 @@ typedef struct {
     U16 curr;
 } rankPos;
 
-typedef nodeElt huffNodeTable[HUF_CTABLE_WORKSPACE_SIZE_U32];
+typedef nodeElt huffNodeTable[2 * (HUF_SYMBOLVALUE_MAX + 1)];
 
 /* Number of buckets available for HUF_sort() */
 #define RANK_POSITION_TABLE_SIZE 192
@@ -448,8 +494,8 @@ typedef struct {
  * Let buckets 166 to 192 represent all remaining counts up to RANK_POSITION_MAX_COUNT_LOG using log2 bucketing.
  */
 #define RANK_POSITION_MAX_COUNT_LOG 32
-#define RANK_POSITION_LOG_BUCKETS_BEGIN (RANK_POSITION_TABLE_SIZE - 1) - RANK_POSITION_MAX_COUNT_LOG - 1 /* == 158 */
-#define RANK_POSITION_DISTINCT_COUNT_CUTOFF RANK_POSITION_LOG_BUCKETS_BEGIN + BIT_highbit32(RANK_POSITION_LOG_BUCKETS_BEGIN) /* == 166 */
+#define RANK_POSITION_LOG_BUCKETS_BEGIN ((RANK_POSITION_TABLE_SIZE - 1) - RANK_POSITION_MAX_COUNT_LOG - 1 /* == 158 */)
+#define RANK_POSITION_DISTINCT_COUNT_CUTOFF (RANK_POSITION_LOG_BUCKETS_BEGIN + ZSTD_highbit32(RANK_POSITION_LOG_BUCKETS_BEGIN) /* == 166 */)
 
 /* Return the appropriate bucket index for a given count. See definition of
  * RANK_POSITION_DISTINCT_COUNT_CUTOFF for explanation of bucketing strategy.
@@ -457,7 +503,7 @@ typedef struct {
 static U32 HUF_getIndex(U32 const count) {
     return (count < RANK_POSITION_DISTINCT_COUNT_CUTOFF)
         ? count
-        : BIT_highbit32(count) + RANK_POSITION_LOG_BUCKETS_BEGIN;
+        : ZSTD_highbit32(count) + RANK_POSITION_LOG_BUCKETS_BEGIN;
 }
 
 /* Helper swap function for HUF_quickSortPartition() */
@@ -580,7 +626,7 @@ static void HUF_sort(nodeElt huffNode[], const unsigned count[], U32 const maxSy
 
     /* Sort each bucket. */
     for (n = RANK_POSITION_DISTINCT_COUNT_CUTOFF; n < RANK_POSITION_TABLE_SIZE - 1; ++n) {
-        U32 const bucketSize = rankPosition[n].curr-rankPosition[n].base;
+        int const bucketSize = rankPosition[n].curr - rankPosition[n].base;
         U32 const bucketStartIdx = rankPosition[n].base;
         if (bucketSize > 1) {
             assert(bucketStartIdx < maxSymbolValue1);
@@ -591,6 +637,7 @@ static void HUF_sort(nodeElt huffNode[], const unsigned count[], U32 const maxSy
     assert(HUF_isSorted(huffNode, maxSymbolValue1));
 }
 
+
 /* HUF_buildCTable_wksp() :
  *  Same as HUF_buildCTable(), but using externally allocated scratch buffer.
  *  `workSpace` must be aligned on 4-bytes boundaries, and be at least as large as sizeof(HUF_buildCTable_wksp_tables).
@@ -611,6 +658,7 @@ static int HUF_buildTree(nodeElt* huffNode, U32 maxSymbolValue)
     int lowS, lowN;
     int nodeNb = STARTNODE;
     int n, nodeRoot;
+    DEBUGLOG(5, "HUF_buildTree (alphabet size = %u)", maxSymbolValue + 1);
     /* init for parents */
     nonNullRank = (int)maxSymbolValue;
     while(huffNode[nonNullRank].count == 0) nonNullRank--;
@@ -637,6 +685,8 @@ static int HUF_buildTree(nodeElt* huffNode, U32 maxSymbolValue)
     for (n=0; n<=nonNullRank; n++)
         huffNode[n].nbBits = huffNode[ huffNode[n].parent ].nbBits + 1;
 
+    DEBUGLOG(6, "Initial distribution of bits completed (%zu sorted symbols)", showHNodeBits(huffNode, maxSymbolValue+1));
+
     return nonNullRank;
 }
 
@@ -674,28 +724,36 @@ static void HUF_buildCTableFromTree(HUF_CElt* CTable, nodeElt const* huffNode, i
     CTable[0] = maxNbBits;
 }
 
-size_t HUF_buildCTable_wksp (HUF_CElt* CTable, const unsigned* count, U32 maxSymbolValue, U32 maxNbBits, void* workSpace, size_t wkspSize)
+size_t
+HUF_buildCTable_wksp(HUF_CElt* CTable, const unsigned* count, U32 maxSymbolValue, U32 maxNbBits,
+                     void* workSpace, size_t wkspSize)
 {
-    HUF_buildCTable_wksp_tables* const wksp_tables = (HUF_buildCTable_wksp_tables*)HUF_alignUpWorkspace(workSpace, &wkspSize, ZSTD_ALIGNOF(U32));
+    HUF_buildCTable_wksp_tables* const wksp_tables =
+        (HUF_buildCTable_wksp_tables*)HUF_alignUpWorkspace(workSpace, &wkspSize, ZSTD_ALIGNOF(U32));
     nodeElt* const huffNode0 = wksp_tables->huffNodeTbl;
     nodeElt* const huffNode = huffNode0+1;
     int nonNullRank;
 
+    HUF_STATIC_ASSERT(HUF_CTABLE_WORKSPACE_SIZE == sizeof(HUF_buildCTable_wksp_tables));
+
+    DEBUGLOG(5, "HUF_buildCTable_wksp (alphabet size = %u)", maxSymbolValue+1);
+
     /* safety checks */
     if (wkspSize < sizeof(HUF_buildCTable_wksp_tables))
-      return ERROR(workSpace_tooSmall);
+        return ERROR(workSpace_tooSmall);
     if (maxNbBits == 0) maxNbBits = HUF_TABLELOG_DEFAULT;
     if (maxSymbolValue > HUF_SYMBOLVALUE_MAX)
-      return ERROR(maxSymbolValue_tooLarge);
+        return ERROR(maxSymbolValue_tooLarge);
     ZSTD_memset(huffNode0, 0, sizeof(huffNodeTable));
 
     /* sort, decreasing order */
     HUF_sort(huffNode, count, maxSymbolValue, wksp_tables->rankPosition);
+    DEBUGLOG(6, "sorted symbols completed (%zu symbols)", showHNodeSymbols(huffNode, maxSymbolValue+1));
 
     /* build tree */
     nonNullRank = HUF_buildTree(huffNode, maxSymbolValue);
 
-    /* enforce maxTableLog */
+    /* determine and enforce maxTableLog */
     maxNbBits = HUF_setMaxHeight(huffNode, (U32)nonNullRank, maxNbBits);
     if (maxNbBits > HUF_TABLELOG_MAX) return ERROR(GENERIC);   /* check fit into table */
 
@@ -804,7 +862,7 @@ FORCE_INLINE_TEMPLATE void HUF_addBits(HUF_CStream_t* bitC, HUF_CElt elt, int id
 #if DEBUGLEVEL >= 1
     {
         size_t const nbBits = HUF_getNbBits(elt);
-        size_t const dirtyBits = nbBits == 0 ? 0 : BIT_highbit32((U32)nbBits) + 1;
+        size_t const dirtyBits = nbBits == 0 ? 0 : ZSTD_highbit32((U32)nbBits) + 1;
         (void)dirtyBits;
         /* Middle bits are 0. */
         assert(((elt >> dirtyBits) << (dirtyBits + nbBits)) == 0);
@@ -884,7 +942,7 @@ static size_t HUF_closeCStream(HUF_CStream_t* bitC)
     {
         size_t const nbBits = bitC->bitPos[0] & 0xFF;
         if (bitC->ptr >= bitC->endPtr) return 0; /* overflow detected */
-        return (bitC->ptr - bitC->startPtr) + (nbBits > 0);
+        return (size_t)(bitC->ptr - bitC->startPtr) + (nbBits > 0);
     }
 }
 
@@ -1045,9 +1103,9 @@ HUF_compress1X_usingCTable_internal_default(void* dst, size_t dstSize,
 static size_t
 HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize,
                               const void* src, size_t srcSize,
-                              const HUF_CElt* CTable, const int bmi2)
+                              const HUF_CElt* CTable, const int flags)
 {
-    if (bmi2) {
+    if (flags & HUF_flags_bmi2) {
         return HUF_compress1X_usingCTable_internal_bmi2(dst, dstSize, src, srcSize, CTable);
     }
     return HUF_compress1X_usingCTable_internal_default(dst, dstSize, src, srcSize, CTable);
@@ -1058,28 +1116,23 @@ HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize,
 static size_t
 HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize,
                               const void* src, size_t srcSize,
-                              const HUF_CElt* CTable, const int bmi2)
+                              const HUF_CElt* CTable, const int flags)
 {
-    (void)bmi2;
+    (void)flags;
     return HUF_compress1X_usingCTable_internal_body(dst, dstSize, src, srcSize, CTable);
 }
 
 #endif
 
-size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable)
-{
-    return HUF_compress1X_usingCTable_bmi2(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0);
-}
-
-size_t HUF_compress1X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2)
+size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags)
 {
-    return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, bmi2);
+    return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, flags);
 }
 
 static size_t
 HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize,
                               const void* src, size_t srcSize,
-                              const HUF_CElt* CTable, int bmi2)
+                              const HUF_CElt* CTable, int flags)
 {
     size_t const segmentSize = (srcSize+3)/4;   /* first 3 segments */
     const BYTE* ip = (const BYTE*) src;
@@ -1093,7 +1146,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize,
     op += 6;   /* jumpTable */
 
     assert(op <= oend);
-    {   CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) );
+    {   CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, flags) );
         if (cSize == 0 || cSize > 65535) return 0;
         MEM_writeLE16(ostart, (U16)cSize);
         op += cSize;
@@ -1101,7 +1154,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize,
 
     ip += segmentSize;
     assert(op <= oend);
-    {   CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) );
+    {   CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, flags) );
         if (cSize == 0 || cSize > 65535) return 0;
         MEM_writeLE16(ostart+2, (U16)cSize);
         op += cSize;
@@ -1109,7 +1162,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize,
 
     ip += segmentSize;
     assert(op <= oend);
-    {   CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) );
+    {   CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, flags) );
         if (cSize == 0 || cSize > 65535) return 0;
         MEM_writeLE16(ostart+4, (U16)cSize);
         op += cSize;
@@ -1118,7 +1171,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize,
     ip += segmentSize;
     assert(op <= oend);
     assert(ip <= iend);
-    {   CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, (size_t)(iend-ip), CTable, bmi2) );
+    {   CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, (size_t)(iend-ip), CTable, flags) );
         if (cSize == 0 || cSize > 65535) return 0;
         op += cSize;
     }
@@ -1126,14 +1179,9 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize,
     return (size_t)(op-ostart);
 }
 
-size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable)
+size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags)
 {
-    return HUF_compress4X_usingCTable_bmi2(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0);
-}
-
-size_t HUF_compress4X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2)
-{
-    return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, bmi2);
+    return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, flags);
 }
 
 typedef enum { HUF_singleStream, HUF_fourStreams } HUF_nbStreams_e;
@@ -1141,11 +1189,11 @@ typedef enum { HUF_singleStream, HUF_fourStreams } HUF_nbStreams_e;
 static size_t HUF_compressCTable_internal(
                 BYTE* const ostart, BYTE* op, BYTE* const oend,
                 const void* src, size_t srcSize,
-                HUF_nbStreams_e nbStreams, const HUF_CElt* CTable, const int bmi2)
+                HUF_nbStreams_e nbStreams, const HUF_CElt* CTable, const int flags)
 {
     size_t const cSize = (nbStreams==HUF_singleStream) ?
-                         HUF_compress1X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, bmi2) :
-                         HUF_compress4X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, bmi2);
+                         HUF_compress1X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, flags) :
+                         HUF_compress4X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, flags);
     if (HUF_isError(cSize)) { return cSize; }
     if (cSize==0) { return 0; }   /* uncompressible */
     op += cSize;
@@ -1168,6 +1216,79 @@ typedef struct {
 #define SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE 4096
 #define SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO 10  /* Must be >= 2 */
 
+unsigned HUF_cardinality(const unsigned* count, unsigned maxSymbolValue)
+{
+    unsigned cardinality = 0;
+    unsigned i;
+
+    for (i = 0; i < maxSymbolValue + 1; i++) {
+        if (count[i] != 0) cardinality += 1;
+    }
+
+    return cardinality;
+}
+
+unsigned HUF_minTableLog(unsigned symbolCardinality)
+{
+    U32 minBitsSymbols = ZSTD_highbit32(symbolCardinality) + 1;
+    return minBitsSymbols;
+}
+
+unsigned HUF_optimalTableLog(
+            unsigned maxTableLog,
+            size_t srcSize,
+            unsigned maxSymbolValue,
+            void* workSpace, size_t wkspSize,
+            HUF_CElt* table,
+      const unsigned* count,
+            int flags)
+{
+    assert(srcSize > 1); /* Not supported, RLE should be used instead */
+    assert(wkspSize >= sizeof(HUF_buildCTable_wksp_tables));
+
+    if (!(flags & HUF_flags_optimalDepth)) {
+        /* cheap evaluation, based on FSE */
+        return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 1);
+    }
+
+    {   BYTE* dst = (BYTE*)workSpace + sizeof(HUF_WriteCTableWksp);
+        size_t dstSize = wkspSize - sizeof(HUF_WriteCTableWksp);
+        size_t maxBits, hSize, newSize;
+        const unsigned symbolCardinality = HUF_cardinality(count, maxSymbolValue);
+        const unsigned minTableLog = HUF_minTableLog(symbolCardinality);
+        size_t optSize = ((size_t) ~0) - 1;
+        unsigned optLog = maxTableLog, optLogGuess;
+
+        DEBUGLOG(6, "HUF_optimalTableLog: probing huf depth (srcSize=%zu)", srcSize);
+
+        /* Search until size increases */
+        for (optLogGuess = minTableLog; optLogGuess <= maxTableLog; optLogGuess++) {
+            DEBUGLOG(7, "checking for huffLog=%u", optLogGuess);
+            maxBits = HUF_buildCTable_wksp(table, count, maxSymbolValue, optLogGuess, workSpace, wkspSize);
+            if (ERR_isError(maxBits)) continue;
+
+            if (maxBits < optLogGuess && optLogGuess > minTableLog) break;
+
+            hSize = HUF_writeCTable_wksp(dst, dstSize, table, maxSymbolValue, (U32)maxBits, workSpace, wkspSize);
+
+            if (ERR_isError(hSize)) continue;
+
+            newSize = HUF_estimateCompressedSize(table, count, maxSymbolValue) + hSize;
+
+            if (newSize > optSize + 1) {
+                break;
+            }
+
+            if (newSize < optSize) {
+                optSize = newSize;
+                optLog = optLogGuess;
+            }
+        }
+        assert(optLog <= HUF_TABLELOG_MAX);
+        return optLog;
+    }
+}
+
 /* HUF_compress_internal() :
  * `workSpace_align4` must be aligned on 4-bytes boundaries,
  * and occupies the same space as a table of HUF_WORKSPACE_SIZE_U64 unsigned */
@@ -1177,14 +1298,14 @@ HUF_compress_internal (void* dst, size_t dstSize,
                        unsigned maxSymbolValue, unsigned huffLog,
                        HUF_nbStreams_e nbStreams,
                        void* workSpace, size_t wkspSize,
-                       HUF_CElt* oldHufTable, HUF_repeat* repeat, int preferRepeat,
-                 const int bmi2, unsigned suspectUncompressible)
+                       HUF_CElt* oldHufTable, HUF_repeat* repeat, int flags)
 {
     HUF_compress_tables_t* const table = (HUF_compress_tables_t*)HUF_alignUpWorkspace(workSpace, &wkspSize, ZSTD_ALIGNOF(size_t));
     BYTE* const ostart = (BYTE*)dst;
     BYTE* const oend = ostart + dstSize;
     BYTE* op = ostart;
 
+    DEBUGLOG(5, "HUF_compress_internal (srcSize=%zu)", srcSize);
     HUF_STATIC_ASSERT(sizeof(*table) + HUF_WORKSPACE_MAX_ALIGNMENT <= HUF_WORKSPACE_SIZE);
 
     /* checks & inits */
@@ -1198,16 +1319,17 @@ HUF_compress_internal (void* dst, size_t dstSize,
     if (!huffLog) huffLog = HUF_TABLELOG_DEFAULT;
 
     /* Heuristic : If old table is valid, use it for small inputs */
-    if (preferRepeat && repeat && *repeat == HUF_repeat_valid) {
+    if ((flags & HUF_flags_preferRepeat) && repeat && *repeat == HUF_repeat_valid) {
         return HUF_compressCTable_internal(ostart, op, oend,
                                            src, srcSize,
-                                           nbStreams, oldHufTable, bmi2);
+                                           nbStreams, oldHufTable, flags);
     }
 
     /* If uncompressible data is suspected, do a smaller sampling first */
     DEBUG_STATIC_ASSERT(SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO >= 2);
-    if (suspectUncompressible && srcSize >= (SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE * SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO)) {
+    if ((flags & HUF_flags_suspectUncompressible) && srcSize >= (SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE * SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO)) {
         size_t largestTotal = 0;
+        DEBUGLOG(5, "input suspected incompressible : sampling to check");
         {   unsigned maxSymbolValueBegin = maxSymbolValue;
             CHECK_V_F(largestBegin, HIST_count_simple (table->count, &maxSymbolValueBegin, (const BYTE*)src, SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE) );
             largestTotal += largestBegin;
@@ -1224,6 +1346,7 @@ HUF_compress_internal (void* dst, size_t dstSize,
         if (largest == srcSize) { *ostart = ((const BYTE*)src)[0]; return 1; }   /* single symbol, rle */
         if (largest <= (srcSize >> 7)+4) return 0;   /* heuristic : probably not compressible enough */
     }
+    DEBUGLOG(6, "histogram detail completed (%zu symbols)", showU32(table->count, maxSymbolValue+1));
 
     /* Check validity of previous table */
     if ( repeat
@@ -1232,19 +1355,20 @@ HUF_compress_internal (void* dst, size_t dstSize,
         *repeat = HUF_repeat_none;
     }
     /* Heuristic : use existing table for small inputs */
-    if (preferRepeat && repeat && *repeat != HUF_repeat_none) {
+    if ((flags & HUF_flags_preferRepeat) && repeat && *repeat != HUF_repeat_none) {
         return HUF_compressCTable_internal(ostart, op, oend,
                                            src, srcSize,
-                                           nbStreams, oldHufTable, bmi2);
+                                           nbStreams, oldHufTable, flags);
     }
 
     /* Build Huffman Tree */
-    huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue);
+    huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue, &table->wksps, sizeof(table->wksps), table->CTable, table->count, flags);
     {   size_t const maxBits = HUF_buildCTable_wksp(table->CTable, table->count,
                                             maxSymbolValue, huffLog,
                                             &table->wksps.buildCTable_wksp, sizeof(table->wksps.buildCTable_wksp));
         CHECK_F(maxBits);
         huffLog = (U32)maxBits;
+        DEBUGLOG(6, "bit distribution completed (%zu symbols)", showCTableBits(table->CTable + 1, maxSymbolValue+1));
     }
     /* Zero unused symbols in CTable, so we can check it for validity */
     {
@@ -1263,7 +1387,7 @@ HUF_compress_internal (void* dst, size_t dstSize,
             if (oldSize <= hSize + newSize || hSize + 12 >= srcSize) {
                 return HUF_compressCTable_internal(ostart, op, oend,
                                                    src, srcSize,
-                                                   nbStreams, oldHufTable, bmi2);
+                                                   nbStreams, oldHufTable, flags);
         }   }
 
         /* Use the new huffman table */
@@ -1275,46 +1399,20 @@ HUF_compress_internal (void* dst, size_t dstSize,
     }
     return HUF_compressCTable_internal(ostart, op, oend,
                                        src, srcSize,
-                                       nbStreams, table->CTable, bmi2);
-}
-
-
-size_t HUF_compress1X_wksp (void* dst, size_t dstSize,
-                      const void* src, size_t srcSize,
-                      unsigned maxSymbolValue, unsigned huffLog,
-                      void* workSpace, size_t wkspSize)
-{
-    return HUF_compress_internal(dst, dstSize, src, srcSize,
-                                 maxSymbolValue, huffLog, HUF_singleStream,
-                                 workSpace, wkspSize,
-                                 NULL, NULL, 0, 0 /*bmi2*/, 0);
+                                       nbStreams, table->CTable, flags);
 }
 
 size_t HUF_compress1X_repeat (void* dst, size_t dstSize,
                       const void* src, size_t srcSize,
                       unsigned maxSymbolValue, unsigned huffLog,
                       void* workSpace, size_t wkspSize,
-                      HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat,
-                      int bmi2, unsigned suspectUncompressible)
+                      HUF_CElt* hufTable, HUF_repeat* repeat, int flags)
 {
+    DEBUGLOG(5, "HUF_compress1X_repeat (srcSize = %zu)", srcSize);
     return HUF_compress_internal(dst, dstSize, src, srcSize,
                                  maxSymbolValue, huffLog, HUF_singleStream,
                                  workSpace, wkspSize, hufTable,
-                                 repeat, preferRepeat, bmi2, suspectUncompressible);
-}
-
-/* HUF_compress4X_repeat():
- * compress input using 4 streams.
- * provide workspace to generate compression tables */
-size_t HUF_compress4X_wksp (void* dst, size_t dstSize,
-                      const void* src, size_t srcSize,
-                      unsigned maxSymbolValue, unsigned huffLog,
-                      void* workSpace, size_t wkspSize)
-{
-    return HUF_compress_internal(dst, dstSize, src, srcSize,
-                                 maxSymbolValue, huffLog, HUF_fourStreams,
-                                 workSpace, wkspSize,
-                                 NULL, NULL, 0, 0 /*bmi2*/, 0);
+                                 repeat, flags);
 }
 
 /* HUF_compress4X_repeat():
@@ -1325,11 +1423,11 @@ size_t HUF_compress4X_repeat (void* dst, size_t dstSize,
                       const void* src, size_t srcSize,
                       unsigned maxSymbolValue, unsigned huffLog,
                       void* workSpace, size_t wkspSize,
-                      HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible)
+                      HUF_CElt* hufTable, HUF_repeat* repeat, int flags)
 {
+    DEBUGLOG(5, "HUF_compress4X_repeat (srcSize = %zu)", srcSize);
     return HUF_compress_internal(dst, dstSize, src, srcSize,
                                  maxSymbolValue, huffLog, HUF_fourStreams,
                                  workSpace, wkspSize,
-                                 hufTable, repeat, preferRepeat, bmi2, suspectUncompressible);
+                                 hufTable, repeat, flags);
 }
-
diff --git a/lib/zstd/compress/zstd_compress.c b/lib/zstd/compress/zstd_compress.c
index f620cafca633..c1c316e9e289 100644
--- a/lib/zstd/compress/zstd_compress.c
+++ b/lib/zstd/compress/zstd_compress.c
@@ -1,5 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -11,12 +12,12 @@
 /*-*************************************
 *  Dependencies
 ***************************************/
+#include "../common/allocations.h"  /* ZSTD_customMalloc, ZSTD_customCalloc, ZSTD_customFree */
 #include "../common/zstd_deps.h"  /* INT_MAX, ZSTD_memset, ZSTD_memcpy */
 #include "../common/mem.h"
 #include "hist.h"           /* HIST_countFast_wksp */
 #define FSE_STATIC_LINKING_ONLY   /* FSE_encodeSymbol */
 #include "../common/fse.h"
-#define HUF_STATIC_LINKING_ONLY
 #include "../common/huf.h"
 #include "zstd_compress_internal.h"
 #include "zstd_compress_sequences.h"
@@ -27,6 +28,7 @@
 #include "zstd_opt.h"
 #include "zstd_ldm.h"
 #include "zstd_compress_superblock.h"
+#include  "../common/bits.h"      /* ZSTD_highbit32, ZSTD_rotateRight_U64 */
 
 /* ***************************************************************
 *  Tuning parameters
@@ -55,14 +57,17 @@
 *  Helper functions
 ***************************************/
 /* ZSTD_compressBound()
- * Note that the result from this function is only compatible with the "normal"
- * full-block strategy.
- * When there are a lot of small blocks due to frequent flush in streaming mode
- * the overhead of headers can make the compressed data to be larger than the
- * return value of ZSTD_compressBound().
+ * Note that the result from this function is only valid for
+ * the one-pass compression functions.
+ * When employing the streaming mode,
+ * if flushes are frequently altering the size of blocks,
+ * the overhead from block headers can make the compressed data larger
+ * than the return value of ZSTD_compressBound().
  */
 size_t ZSTD_compressBound(size_t srcSize) {
-    return ZSTD_COMPRESSBOUND(srcSize);
+    size_t const r = ZSTD_COMPRESSBOUND(srcSize);
+    if (r==0) return ERROR(srcSize_wrong);
+    return r;
 }
 
 
@@ -171,12 +176,9 @@ size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx)
     if (cctx==NULL) return 0;   /* support free on NULL */
     RETURN_ERROR_IF(cctx->staticSize, memory_allocation,
                     "not compatible with static CCtx");
-    {
-        int cctxInWorkspace = ZSTD_cwksp_owns_buffer(&cctx->workspace, cctx);
+    {   int cctxInWorkspace = ZSTD_cwksp_owns_buffer(&cctx->workspace, cctx);
         ZSTD_freeCCtxContent(cctx);
-        if (!cctxInWorkspace) {
-            ZSTD_customFree(cctx, cctx->customMem);
-        }
+        if (!cctxInWorkspace) ZSTD_customFree(cctx, cctx->customMem);
     }
     return 0;
 }
@@ -257,9 +259,9 @@ static int ZSTD_allocateChainTable(const ZSTD_strategy strategy,
     return forDDSDict || ((strategy != ZSTD_fast) && !ZSTD_rowMatchFinderUsed(strategy, useRowMatchFinder));
 }
 
-/* Returns 1 if compression parameters are such that we should
+/* Returns ZSTD_ps_enable if compression parameters are such that we should
  * enable long distance matching (wlog >= 27, strategy >= btopt).
- * Returns 0 otherwise.
+ * Returns ZSTD_ps_disable otherwise.
  */
 static ZSTD_paramSwitch_e ZSTD_resolveEnableLdm(ZSTD_paramSwitch_e mode,
                                  const ZSTD_compressionParameters* const cParams) {
@@ -267,6 +269,34 @@ static ZSTD_paramSwitch_e ZSTD_resolveEnableLdm(ZSTD_paramSwitch_e mode,
     return (cParams->strategy >= ZSTD_btopt && cParams->windowLog >= 27) ? ZSTD_ps_enable : ZSTD_ps_disable;
 }
 
+static int ZSTD_resolveExternalSequenceValidation(int mode) {
+    return mode;
+}
+
+/* Resolves maxBlockSize to the default if no value is present. */
+static size_t ZSTD_resolveMaxBlockSize(size_t maxBlockSize) {
+    if (maxBlockSize == 0) {
+        return ZSTD_BLOCKSIZE_MAX;
+    } else {
+        return maxBlockSize;
+    }
+}
+
+static ZSTD_paramSwitch_e ZSTD_resolveExternalRepcodeSearch(ZSTD_paramSwitch_e value, int cLevel) {
+    if (value != ZSTD_ps_auto) return value;
+    if (cLevel < 10) {
+        return ZSTD_ps_disable;
+    } else {
+        return ZSTD_ps_enable;
+    }
+}
+
+/* Returns 1 if compression parameters are such that CDict hashtable and chaintable indices are tagged.
+ * If so, the tags need to be removed in ZSTD_resetCCtx_byCopyingCDict. */
+static int ZSTD_CDictIndicesAreTagged(const ZSTD_compressionParameters* const cParams) {
+    return cParams->strategy == ZSTD_fast || cParams->strategy == ZSTD_dfast;
+}
+
 static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams(
         ZSTD_compressionParameters cParams)
 {
@@ -284,6 +314,10 @@ static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams(
     }
     cctxParams.useBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams.useBlockSplitter, &cParams);
     cctxParams.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams.useRowMatchFinder, &cParams);
+    cctxParams.validateSequences = ZSTD_resolveExternalSequenceValidation(cctxParams.validateSequences);
+    cctxParams.maxBlockSize = ZSTD_resolveMaxBlockSize(cctxParams.maxBlockSize);
+    cctxParams.searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(cctxParams.searchForExternalRepcodes,
+                                                                             cctxParams.compressionLevel);
     assert(!ZSTD_checkCParams(cParams));
     return cctxParams;
 }
@@ -329,10 +363,13 @@ size_t ZSTD_CCtxParams_init(ZSTD_CCtx_params* cctxParams, int compressionLevel)
 #define ZSTD_NO_CLEVEL 0
 
 /*
- * Initializes the cctxParams from params and compressionLevel.
+ * Initializes `cctxParams` from `params` and `compressionLevel`.
  * @param compressionLevel If params are derived from a compression level then that compression level, otherwise ZSTD_NO_CLEVEL.
  */
-static void ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams, ZSTD_parameters const* params, int compressionLevel)
+static void
+ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams,
+                        const ZSTD_parameters* params,
+                              int compressionLevel)
 {
     assert(!ZSTD_checkCParams(params->cParams));
     ZSTD_memset(cctxParams, 0, sizeof(*cctxParams));
@@ -345,6 +382,9 @@ static void ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams, ZSTD_par
     cctxParams->useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams->useRowMatchFinder, &params->cParams);
     cctxParams->useBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams->useBlockSplitter, &params->cParams);
     cctxParams->ldmParams.enableLdm = ZSTD_resolveEnableLdm(cctxParams->ldmParams.enableLdm, &params->cParams);
+    cctxParams->validateSequences = ZSTD_resolveExternalSequenceValidation(cctxParams->validateSequences);
+    cctxParams->maxBlockSize = ZSTD_resolveMaxBlockSize(cctxParams->maxBlockSize);
+    cctxParams->searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(cctxParams->searchForExternalRepcodes, compressionLevel);
     DEBUGLOG(4, "ZSTD_CCtxParams_init_internal: useRowMatchFinder=%d, useBlockSplitter=%d ldm=%d",
                 cctxParams->useRowMatchFinder, cctxParams->useBlockSplitter, cctxParams->ldmParams.enableLdm);
 }
@@ -359,7 +399,7 @@ size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_paramete
 
 /*
  * Sets cctxParams' cParams and fParams from params, but otherwise leaves them alone.
- * @param param Validated zstd parameters.
+ * @param params Validated zstd parameters.
  */
 static void ZSTD_CCtxParams_setZstdParams(
         ZSTD_CCtx_params* cctxParams, const ZSTD_parameters* params)
@@ -455,8 +495,8 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param)
         return bounds;
 
     case ZSTD_c_enableLongDistanceMatching:
-        bounds.lowerBound = 0;
-        bounds.upperBound = 1;
+        bounds.lowerBound = (int)ZSTD_ps_auto;
+        bounds.upperBound = (int)ZSTD_ps_disable;
         return bounds;
 
     case ZSTD_c_ldmHashLog:
@@ -549,6 +589,26 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param)
         bounds.upperBound = 1;
         return bounds;
 
+    case ZSTD_c_prefetchCDictTables:
+        bounds.lowerBound = (int)ZSTD_ps_auto;
+        bounds.upperBound = (int)ZSTD_ps_disable;
+        return bounds;
+
+    case ZSTD_c_enableSeqProducerFallback:
+        bounds.lowerBound = 0;
+        bounds.upperBound = 1;
+        return bounds;
+
+    case ZSTD_c_maxBlockSize:
+        bounds.lowerBound = ZSTD_BLOCKSIZE_MAX_MIN;
+        bounds.upperBound = ZSTD_BLOCKSIZE_MAX;
+        return bounds;
+
+    case ZSTD_c_searchForExternalRepcodes:
+        bounds.lowerBound = (int)ZSTD_ps_auto;
+        bounds.upperBound = (int)ZSTD_ps_disable;
+        return bounds;
+
     default:
         bounds.error = ERROR(parameter_unsupported);
         return bounds;
@@ -613,6 +673,10 @@ static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param)
     case ZSTD_c_useBlockSplitter:
     case ZSTD_c_useRowMatchFinder:
     case ZSTD_c_deterministicRefPrefix:
+    case ZSTD_c_prefetchCDictTables:
+    case ZSTD_c_enableSeqProducerFallback:
+    case ZSTD_c_maxBlockSize:
+    case ZSTD_c_searchForExternalRepcodes:
     default:
         return 0;
     }
@@ -625,7 +689,7 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value)
         if (ZSTD_isUpdateAuthorized(param)) {
             cctx->cParamsChanged = 1;
         } else {
-            RETURN_ERROR(stage_wrong, "can only set params in ctx init stage");
+            RETURN_ERROR(stage_wrong, "can only set params in cctx init stage");
     }   }
 
     switch(param)
@@ -668,6 +732,10 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value)
     case ZSTD_c_useBlockSplitter:
     case ZSTD_c_useRowMatchFinder:
     case ZSTD_c_deterministicRefPrefix:
+    case ZSTD_c_prefetchCDictTables:
+    case ZSTD_c_enableSeqProducerFallback:
+    case ZSTD_c_maxBlockSize:
+    case ZSTD_c_searchForExternalRepcodes:
         break;
 
     default: RETURN_ERROR(parameter_unsupported, "unknown parameter");
@@ -723,12 +791,12 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams,
     case ZSTD_c_minMatch :
         if (value!=0)   /* 0 => use default */
             BOUNDCHECK(ZSTD_c_minMatch, value);
-        CCtxParams->cParams.minMatch = value;
+        CCtxParams->cParams.minMatch = (U32)value;
         return CCtxParams->cParams.minMatch;
 
     case ZSTD_c_targetLength :
         BOUNDCHECK(ZSTD_c_targetLength, value);
-        CCtxParams->cParams.targetLength = value;
+        CCtxParams->cParams.targetLength = (U32)value;
         return CCtxParams->cParams.targetLength;
 
     case ZSTD_c_strategy :
@@ -741,12 +809,12 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams,
         /* Content size written in frame header _when known_ (default:1) */
         DEBUGLOG(4, "set content size flag = %u", (value!=0));
         CCtxParams->fParams.contentSizeFlag = value != 0;
-        return CCtxParams->fParams.contentSizeFlag;
+        return (size_t)CCtxParams->fParams.contentSizeFlag;
 
     case ZSTD_c_checksumFlag :
         /* A 32-bits content checksum will be calculated and written at end of frame (default:0) */
         CCtxParams->fParams.checksumFlag = value != 0;
-        return CCtxParams->fParams.checksumFlag;
+        return (size_t)CCtxParams->fParams.checksumFlag;
 
     case ZSTD_c_dictIDFlag : /* When applicable, dictionary's dictID is provided in frame header (default:1) */
         DEBUGLOG(4, "set dictIDFlag = %u", (value!=0));
@@ -755,18 +823,18 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams,
 
     case ZSTD_c_forceMaxWindow :
         CCtxParams->forceWindow = (value != 0);
-        return CCtxParams->forceWindow;
+        return (size_t)CCtxParams->forceWindow;
 
     case ZSTD_c_forceAttachDict : {
         const ZSTD_dictAttachPref_e pref = (ZSTD_dictAttachPref_e)value;
-        BOUNDCHECK(ZSTD_c_forceAttachDict, pref);
+        BOUNDCHECK(ZSTD_c_forceAttachDict, (int)pref);
         CCtxParams->attachDictPref = pref;
         return CCtxParams->attachDictPref;
     }
 
     case ZSTD_c_literalCompressionMode : {
         const ZSTD_paramSwitch_e lcm = (ZSTD_paramSwitch_e)value;
-        BOUNDCHECK(ZSTD_c_literalCompressionMode, lcm);
+        BOUNDCHECK(ZSTD_c_literalCompressionMode, (int)lcm);
         CCtxParams->literalCompressionMode = lcm;
         return CCtxParams->literalCompressionMode;
     }
@@ -789,47 +857,48 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams,
 
     case ZSTD_c_enableDedicatedDictSearch :
         CCtxParams->enableDedicatedDictSearch = (value!=0);
-        return CCtxParams->enableDedicatedDictSearch;
+        return (size_t)CCtxParams->enableDedicatedDictSearch;
 
     case ZSTD_c_enableLongDistanceMatching :
+        BOUNDCHECK(ZSTD_c_enableLongDistanceMatching, value);
         CCtxParams->ldmParams.enableLdm = (ZSTD_paramSwitch_e)value;
         return CCtxParams->ldmParams.enableLdm;
 
     case ZSTD_c_ldmHashLog :
         if (value!=0)   /* 0 ==> auto */
             BOUNDCHECK(ZSTD_c_ldmHashLog, value);
-        CCtxParams->ldmParams.hashLog = value;
+        CCtxParams->ldmParams.hashLog = (U32)value;
         return CCtxParams->ldmParams.hashLog;
 
     case ZSTD_c_ldmMinMatch :
         if (value!=0)   /* 0 ==> default */
             BOUNDCHECK(ZSTD_c_ldmMinMatch, value);
-        CCtxParams->ldmParams.minMatchLength = value;
+        CCtxParams->ldmParams.minMatchLength = (U32)value;
         return CCtxParams->ldmParams.minMatchLength;
 
     case ZSTD_c_ldmBucketSizeLog :
         if (value!=0)   /* 0 ==> default */
             BOUNDCHECK(ZSTD_c_ldmBucketSizeLog, value);
-        CCtxParams->ldmParams.bucketSizeLog = value;
+        CCtxParams->ldmParams.bucketSizeLog = (U32)value;
         return CCtxParams->ldmParams.bucketSizeLog;
 
     case ZSTD_c_ldmHashRateLog :
         if (value!=0)   /* 0 ==> default */
             BOUNDCHECK(ZSTD_c_ldmHashRateLog, value);
-        CCtxParams->ldmParams.hashRateLog = value;
+        CCtxParams->ldmParams.hashRateLog = (U32)value;
         return CCtxParams->ldmParams.hashRateLog;
 
     case ZSTD_c_targetCBlockSize :
         if (value!=0)   /* 0 ==> default */
             BOUNDCHECK(ZSTD_c_targetCBlockSize, value);
-        CCtxParams->targetCBlockSize = value;
+        CCtxParams->targetCBlockSize = (U32)value;
         return CCtxParams->targetCBlockSize;
 
     case ZSTD_c_srcSizeHint :
         if (value!=0)    /* 0 ==> default */
             BOUNDCHECK(ZSTD_c_srcSizeHint, value);
         CCtxParams->srcSizeHint = value;
-        return CCtxParams->srcSizeHint;
+        return (size_t)CCtxParams->srcSizeHint;
 
     case ZSTD_c_stableInBuffer:
         BOUNDCHECK(ZSTD_c_stableInBuffer, value);
@@ -866,6 +935,27 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams,
         CCtxParams->deterministicRefPrefix = !!value;
         return CCtxParams->deterministicRefPrefix;
 
+    case ZSTD_c_prefetchCDictTables:
+        BOUNDCHECK(ZSTD_c_prefetchCDictTables, value);
+        CCtxParams->prefetchCDictTables = (ZSTD_paramSwitch_e)value;
+        return CCtxParams->prefetchCDictTables;
+
+    case ZSTD_c_enableSeqProducerFallback:
+        BOUNDCHECK(ZSTD_c_enableSeqProducerFallback, value);
+        CCtxParams->enableMatchFinderFallback = value;
+        return CCtxParams->enableMatchFinderFallback;
+
+    case ZSTD_c_maxBlockSize:
+        if (value!=0)    /* 0 ==> default */
+            BOUNDCHECK(ZSTD_c_maxBlockSize, value);
+        CCtxParams->maxBlockSize = value;
+        return CCtxParams->maxBlockSize;
+
+    case ZSTD_c_searchForExternalRepcodes:
+        BOUNDCHECK(ZSTD_c_searchForExternalRepcodes, value);
+        CCtxParams->searchForExternalRepcodes = (ZSTD_paramSwitch_e)value;
+        return CCtxParams->searchForExternalRepcodes;
+
     default: RETURN_ERROR(parameter_unsupported, "unknown parameter");
     }
 }
@@ -980,6 +1070,18 @@ size_t ZSTD_CCtxParams_getParameter(
     case ZSTD_c_deterministicRefPrefix:
         *value = (int)CCtxParams->deterministicRefPrefix;
         break;
+    case ZSTD_c_prefetchCDictTables:
+        *value = (int)CCtxParams->prefetchCDictTables;
+        break;
+    case ZSTD_c_enableSeqProducerFallback:
+        *value = CCtxParams->enableMatchFinderFallback;
+        break;
+    case ZSTD_c_maxBlockSize:
+        *value = (int)CCtxParams->maxBlockSize;
+        break;
+    case ZSTD_c_searchForExternalRepcodes:
+        *value = (int)CCtxParams->searchForExternalRepcodes;
+        break;
     default: RETURN_ERROR(parameter_unsupported, "unknown parameter");
     }
     return 0;
@@ -1006,9 +1108,47 @@ size_t ZSTD_CCtx_setParametersUsingCCtxParams(
     return 0;
 }
 
+size_t ZSTD_CCtx_setCParams(ZSTD_CCtx* cctx, ZSTD_compressionParameters cparams)
+{
+    ZSTD_STATIC_ASSERT(sizeof(cparams) == 7 * 4 /* all params are listed below */);
+    DEBUGLOG(4, "ZSTD_CCtx_setCParams");
+    /* only update if all parameters are valid */
+    FORWARD_IF_ERROR(ZSTD_checkCParams(cparams), "");
+    FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_windowLog, cparams.windowLog), "");
+    FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_chainLog, cparams.chainLog), "");
+    FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_hashLog, cparams.hashLog), "");
+    FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_searchLog, cparams.searchLog), "");
+    FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_minMatch, cparams.minMatch), "");
+    FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_targetLength, cparams.targetLength), "");
+    FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_strategy, cparams.strategy), "");
+    return 0;
+}
+
+size_t ZSTD_CCtx_setFParams(ZSTD_CCtx* cctx, ZSTD_frameParameters fparams)
+{
+    ZSTD_STATIC_ASSERT(sizeof(fparams) == 3 * 4 /* all params are listed below */);
+    DEBUGLOG(4, "ZSTD_CCtx_setFParams");
+    FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_contentSizeFlag, fparams.contentSizeFlag != 0), "");
+    FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, fparams.checksumFlag != 0), "");
+    FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_dictIDFlag, fparams.noDictIDFlag == 0), "");
+    return 0;
+}
+
+size_t ZSTD_CCtx_setParams(ZSTD_CCtx* cctx, ZSTD_parameters params)
+{
+    DEBUGLOG(4, "ZSTD_CCtx_setParams");
+    /* First check cParams, because we want to update all or none. */
+    FORWARD_IF_ERROR(ZSTD_checkCParams(params.cParams), "");
+    /* Next set fParams, because this could fail if the cctx isn't in init stage. */
+    FORWARD_IF_ERROR(ZSTD_CCtx_setFParams(cctx, params.fParams), "");
+    /* Finally set cParams, which should succeed. */
+    FORWARD_IF_ERROR(ZSTD_CCtx_setCParams(cctx, params.cParams), "");
+    return 0;
+}
+
 size_t ZSTD_CCtx_setPledgedSrcSize(ZSTD_CCtx* cctx, unsigned long long pledgedSrcSize)
 {
-    DEBUGLOG(4, "ZSTD_CCtx_setPledgedSrcSize to %u bytes", (U32)pledgedSrcSize);
+    DEBUGLOG(4, "ZSTD_CCtx_setPledgedSrcSize to %llu bytes", pledgedSrcSize);
     RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong,
                     "Can't set pledgedSrcSize when not in init stage.");
     cctx->pledgedSrcSizePlusOne = pledgedSrcSize+1;
@@ -1024,9 +1164,9 @@ static void ZSTD_dedicatedDictSearch_revertCParams(
         ZSTD_compressionParameters* cParams);
 
 /*
- * Initializes the local dict using the requested parameters.
- * NOTE: This does not use the pledged src size, because it may be used for more
- * than one compression.
+ * Initializes the local dictionary using requested parameters.
+ * NOTE: Initialization does not employ the pledged src size,
+ * because the dictionary may be used for multiple compressions.
  */
 static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx)
 {
@@ -1039,8 +1179,8 @@ static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx)
         return 0;
     }
     if (dl->cdict != NULL) {
-        assert(cctx->cdict == dl->cdict);
         /* Local dictionary already initialized. */
+        assert(cctx->cdict == dl->cdict);
         return 0;
     }
     assert(dl->dictSize > 0);
@@ -1060,26 +1200,30 @@ static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx)
 }
 
 size_t ZSTD_CCtx_loadDictionary_advanced(
-        ZSTD_CCtx* cctx, const void* dict, size_t dictSize,
-        ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType)
+        ZSTD_CCtx* cctx,
+        const void* dict, size_t dictSize,
+        ZSTD_dictLoadMethod_e dictLoadMethod,
+        ZSTD_dictContentType_e dictContentType)
 {
-    RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong,
-                    "Can't load a dictionary when ctx is not in init stage.");
     DEBUGLOG(4, "ZSTD_CCtx_loadDictionary_advanced (size: %u)", (U32)dictSize);
-    ZSTD_clearAllDicts(cctx);  /* in case one already exists */
-    if (dict == NULL || dictSize == 0)  /* no dictionary mode */
+    RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong,
+                    "Can't load a dictionary when cctx is not in init stage.");
+    ZSTD_clearAllDicts(cctx);  /* erase any previously set dictionary */
+    if (dict == NULL || dictSize == 0)  /* no dictionary */
         return 0;
     if (dictLoadMethod == ZSTD_dlm_byRef) {
         cctx->localDict.dict = dict;
     } else {
+        /* copy dictionary content inside CCtx to own its lifetime */
         void* dictBuffer;
         RETURN_ERROR_IF(cctx->staticSize, memory_allocation,
-                        "no malloc for static CCtx");
+                        "static CCtx can't allocate for an internal copy of dictionary");
         dictBuffer = ZSTD_customMalloc(dictSize, cctx->customMem);
-        RETURN_ERROR_IF(!dictBuffer, memory_allocation, "NULL pointer!");
+        RETURN_ERROR_IF(dictBuffer==NULL, memory_allocation,
+                        "allocation failed for dictionary content");
         ZSTD_memcpy(dictBuffer, dict, dictSize);
-        cctx->localDict.dictBuffer = dictBuffer;
-        cctx->localDict.dict = dictBuffer;
+        cctx->localDict.dictBuffer = dictBuffer;  /* owned ptr to free */
+        cctx->localDict.dict = dictBuffer;        /* read-only reference */
     }
     cctx->localDict.dictSize = dictSize;
     cctx->localDict.dictContentType = dictContentType;
@@ -1149,8 +1293,9 @@ size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset)
     if ( (reset == ZSTD_reset_parameters)
       || (reset == ZSTD_reset_session_and_parameters) ) {
         RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong,
-                        "Can't reset parameters only when not in init stage.");
+                        "Reset parameters is only possible during init stage.");
         ZSTD_clearAllDicts(cctx);
+        ZSTD_memset(&cctx->externalMatchCtx, 0, sizeof(cctx->externalMatchCtx));
         return ZSTD_CCtxParams_reset(&cctx->requestedParams);
     }
     return 0;
@@ -1247,7 +1392,8 @@ static ZSTD_compressionParameters
 ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar,
                             unsigned long long srcSize,
                             size_t dictSize,
-                            ZSTD_cParamMode_e mode)
+                            ZSTD_cParamMode_e mode,
+                            ZSTD_paramSwitch_e useRowMatchFinder)
 {
     const U64 minSrcSize = 513; /* (1<<9) + 1 */
     const U64 maxWindowResize = 1ULL << (ZSTD_WINDOWLOG_MAX-1);
@@ -1281,8 +1427,8 @@ ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar,
     }
 
     /* resize windowLog if input is small enough, to use less memory */
-    if ( (srcSize < maxWindowResize)
-      && (dictSize < maxWindowResize) )  {
+    if ( (srcSize <= maxWindowResize)
+      && (dictSize <= maxWindowResize) )  {
         U32 const tSize = (U32)(srcSize + dictSize);
         static U32 const hashSizeMin = 1 << ZSTD_HASHLOG_MIN;
         U32 const srcLog = (tSize < hashSizeMin) ? ZSTD_HASHLOG_MIN :
@@ -1300,6 +1446,42 @@ ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar,
     if (cPar.windowLog < ZSTD_WINDOWLOG_ABSOLUTEMIN)
         cPar.windowLog = ZSTD_WINDOWLOG_ABSOLUTEMIN;  /* minimum wlog required for valid frame header */
 
+    /* We can't use more than 32 bits of hash in total, so that means that we require:
+     * (hashLog + 8) <= 32 && (chainLog + 8) <= 32
+     */
+    if (mode == ZSTD_cpm_createCDict && ZSTD_CDictIndicesAreTagged(&cPar)) {
+        U32 const maxShortCacheHashLog = 32 - ZSTD_SHORT_CACHE_TAG_BITS;
+        if (cPar.hashLog > maxShortCacheHashLog) {
+            cPar.hashLog = maxShortCacheHashLog;
+        }
+        if (cPar.chainLog > maxShortCacheHashLog) {
+            cPar.chainLog = maxShortCacheHashLog;
+        }
+    }
+
+
+    /* At this point, we aren't 100% sure if we are using the row match finder.
+     * Unless it is explicitly disabled, conservatively assume that it is enabled.
+     * In this case it will only be disabled for small sources, so shrinking the
+     * hash log a little bit shouldn't result in any ratio loss.
+     */
+    if (useRowMatchFinder == ZSTD_ps_auto)
+        useRowMatchFinder = ZSTD_ps_enable;
+
+    /* We can't hash more than 32-bits in total. So that means that we require:
+     * (hashLog - rowLog + 8) <= 32
+     */
+    if (ZSTD_rowMatchFinderUsed(cPar.strategy, useRowMatchFinder)) {
+        /* Switch to 32-entry rows if searchLog is 5 (or more) */
+        U32 const rowLog = BOUNDED(4, cPar.searchLog, 6);
+        U32 const maxRowHashLog = 32 - ZSTD_ROW_HASH_TAG_BITS;
+        U32 const maxHashLog = maxRowHashLog + rowLog;
+        assert(cPar.hashLog >= rowLog);
+        if (cPar.hashLog > maxHashLog) {
+            cPar.hashLog = maxHashLog;
+        }
+    }
+
     return cPar;
 }
 
@@ -1310,7 +1492,7 @@ ZSTD_adjustCParams(ZSTD_compressionParameters cPar,
 {
     cPar = ZSTD_clampCParams(cPar);   /* resulting cPar is necessarily valid (all parameters within range) */
     if (srcSize == 0) srcSize = ZSTD_CONTENTSIZE_UNKNOWN;
-    return ZSTD_adjustCParams_internal(cPar, srcSize, dictSize, ZSTD_cpm_unknown);
+    return ZSTD_adjustCParams_internal(cPar, srcSize, dictSize, ZSTD_cpm_unknown, ZSTD_ps_auto);
 }
 
 static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode);
@@ -1341,7 +1523,7 @@ ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams(
     ZSTD_overrideCParams(&cParams, &CCtxParams->cParams);
     assert(!ZSTD_checkCParams(cParams));
     /* srcSizeHint == 0 means 0 */
-    return ZSTD_adjustCParams_internal(cParams, srcSizeHint, dictSize, mode);
+    return ZSTD_adjustCParams_internal(cParams, srcSizeHint, dictSize, mode, CCtxParams->useRowMatchFinder);
 }
 
 static size_t
@@ -1370,7 +1552,7 @@ ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams,
       + ZSTD_cwksp_aligned_alloc_size((ZSTD_OPT_NUM+1) * sizeof(ZSTD_match_t))
       + ZSTD_cwksp_aligned_alloc_size((ZSTD_OPT_NUM+1) * sizeof(ZSTD_optimal_t));
     size_t const lazyAdditionalSpace = ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder)
-                                            ? ZSTD_cwksp_aligned_alloc_size(hSize*sizeof(U16))
+                                            ? ZSTD_cwksp_aligned_alloc_size(hSize)
                                             : 0;
     size_t const optSpace = (forCCtx && (cParams->strategy >= ZSTD_btopt))
                                 ? optPotentialSpace
@@ -1386,6 +1568,13 @@ ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams,
     return tableSpace + optSpace + slackSpace + lazyAdditionalSpace;
 }
 
+/* Helper function for calculating memory requirements.
+ * Gives a tighter bound than ZSTD_sequenceBound() by taking minMatch into account. */
+static size_t ZSTD_maxNbSeq(size_t blockSize, unsigned minMatch, int useSequenceProducer) {
+    U32 const divider = (minMatch==3 || useSequenceProducer) ? 3 : 4;
+    return blockSize / divider;
+}
+
 static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal(
         const ZSTD_compressionParameters* cParams,
         const ldmParams_t* ldmParams,
@@ -1393,12 +1582,13 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal(
         const ZSTD_paramSwitch_e useRowMatchFinder,
         const size_t buffInSize,
         const size_t buffOutSize,
-        const U64 pledgedSrcSize)
+        const U64 pledgedSrcSize,
+        int useSequenceProducer,
+        size_t maxBlockSize)
 {
     size_t const windowSize = (size_t) BOUNDED(1ULL, 1ULL << cParams->windowLog, pledgedSrcSize);
-    size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, windowSize);
-    U32    const divider = (cParams->minMatch==3) ? 3 : 4;
-    size_t const maxNbSeq = blockSize / divider;
+    size_t const blockSize = MIN(ZSTD_resolveMaxBlockSize(maxBlockSize), windowSize);
+    size_t const maxNbSeq = ZSTD_maxNbSeq(blockSize, cParams->minMatch, useSequenceProducer);
     size_t const tokenSpace = ZSTD_cwksp_alloc_size(WILDCOPY_OVERLENGTH + blockSize)
                             + ZSTD_cwksp_aligned_alloc_size(maxNbSeq * sizeof(seqDef))
                             + 3 * ZSTD_cwksp_alloc_size(maxNbSeq * sizeof(BYTE));
@@ -1417,6 +1607,11 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal(
 
     size_t const cctxSpace = isStatic ? ZSTD_cwksp_alloc_size(sizeof(ZSTD_CCtx)) : 0;
 
+    size_t const maxNbExternalSeq = ZSTD_sequenceBound(blockSize);
+    size_t const externalSeqSpace = useSequenceProducer
+        ? ZSTD_cwksp_aligned_alloc_size(maxNbExternalSeq * sizeof(ZSTD_Sequence))
+        : 0;
+
     size_t const neededSpace =
         cctxSpace +
         entropySpace +
@@ -1425,7 +1620,8 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal(
         ldmSeqSpace +
         matchStateSize +
         tokenSpace +
-        bufferSpace;
+        bufferSpace +
+        externalSeqSpace;
 
     DEBUGLOG(5, "estimate workspace : %u", (U32)neededSpace);
     return neededSpace;
@@ -1443,7 +1639,7 @@ size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params)
      * be needed. However, we still allocate two 0-sized buffers, which can
      * take space under ASAN. */
     return ZSTD_estimateCCtxSize_usingCCtxParams_internal(
-        &cParams, &params->ldmParams, 1, useRowMatchFinder, 0, 0, ZSTD_CONTENTSIZE_UNKNOWN);
+        &cParams, &params->ldmParams, 1, useRowMatchFinder, 0, 0, ZSTD_CONTENTSIZE_UNKNOWN, params->useSequenceProducer, params->maxBlockSize);
 }
 
 size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams)
@@ -1493,7 +1689,7 @@ size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params)
     RETURN_ERROR_IF(params->nbWorkers > 0, GENERIC, "Estimate CCtx size is supported for single-threaded compression only.");
     {   ZSTD_compressionParameters const cParams =
                 ZSTD_getCParamsFromCCtxParams(params, ZSTD_CONTENTSIZE_UNKNOWN, 0, ZSTD_cpm_noAttachDict);
-        size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, (size_t)1 << cParams.windowLog);
+        size_t const blockSize = MIN(ZSTD_resolveMaxBlockSize(params->maxBlockSize), (size_t)1 << cParams.windowLog);
         size_t const inBuffSize = (params->inBufferMode == ZSTD_bm_buffered)
                 ? ((size_t)1 << cParams.windowLog) + blockSize
                 : 0;
@@ -1504,7 +1700,7 @@ size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params)
 
         return ZSTD_estimateCCtxSize_usingCCtxParams_internal(
             &cParams, &params->ldmParams, 1, useRowMatchFinder, inBuffSize, outBuffSize,
-            ZSTD_CONTENTSIZE_UNKNOWN);
+            ZSTD_CONTENTSIZE_UNKNOWN, params->useSequenceProducer, params->maxBlockSize);
     }
 }
 
@@ -1637,6 +1833,19 @@ typedef enum {
     ZSTD_resetTarget_CCtx
 } ZSTD_resetTarget_e;
 
+/* Mixes bits in a 64 bits in a value, based on XXH3_rrmxmx */
+static U64 ZSTD_bitmix(U64 val, U64 len) {
+    val ^= ZSTD_rotateRight_U64(val, 49) ^ ZSTD_rotateRight_U64(val, 24);
+    val *= 0x9FB21C651E98DF25ULL;
+    val ^= (val >> 35) + len ;
+    val *= 0x9FB21C651E98DF25ULL;
+    return val ^ (val >> 28);
+}
+
+/* Mixes in the hashSalt and hashSaltEntropy to create a new hashSalt */
+static void ZSTD_advanceHashSalt(ZSTD_matchState_t* ms) {
+    ms->hashSalt = ZSTD_bitmix(ms->hashSalt, 8) ^ ZSTD_bitmix((U64) ms->hashSaltEntropy, 4);
+}
 
 static size_t
 ZSTD_reset_matchState(ZSTD_matchState_t* ms,
@@ -1664,6 +1873,7 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms,
     }
 
     ms->hashLog3 = hashLog3;
+    ms->lazySkipping = 0;
 
     ZSTD_invalidateMatchState(ms);
 
@@ -1685,6 +1895,27 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms,
         ZSTD_cwksp_clean_tables(ws);
     }
 
+    if (ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder)) {
+        /* Row match finder needs an additional table of hashes ("tags") */
+        size_t const tagTableSize = hSize;
+        /* We want to generate a new salt in case we reset a Cctx, but we always want to use
+         * 0 when we reset a Cdict */
+        if(forWho == ZSTD_resetTarget_CCtx) {
+            ms->tagTable = (BYTE*) ZSTD_cwksp_reserve_aligned_init_once(ws, tagTableSize);
+            ZSTD_advanceHashSalt(ms);
+        } else {
+            /* When we are not salting we want to always memset the memory */
+            ms->tagTable = (BYTE*) ZSTD_cwksp_reserve_aligned(ws, tagTableSize);
+            ZSTD_memset(ms->tagTable, 0, tagTableSize);
+            ms->hashSalt = 0;
+        }
+        {   /* Switch to 32-entry rows if searchLog is 5 (or more) */
+            U32 const rowLog = BOUNDED(4, cParams->searchLog, 6);
+            assert(cParams->hashLog >= rowLog);
+            ms->rowHashLog = cParams->hashLog - rowLog;
+        }
+    }
+
     /* opt parser space */
     if ((forWho == ZSTD_resetTarget_CCtx) && (cParams->strategy >= ZSTD_btopt)) {
         DEBUGLOG(4, "reserving optimal parser space");
@@ -1696,19 +1927,6 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms,
         ms->opt.priceTable = (ZSTD_optimal_t*)ZSTD_cwksp_reserve_aligned(ws, (ZSTD_OPT_NUM+1) * sizeof(ZSTD_optimal_t));
     }
 
-    if (ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder)) {
-        {   /* Row match finder needs an additional table of hashes ("tags") */
-            size_t const tagTableSize = hSize*sizeof(U16);
-            ms->tagTable = (U16*)ZSTD_cwksp_reserve_aligned(ws, tagTableSize);
-            if (ms->tagTable) ZSTD_memset(ms->tagTable, 0, tagTableSize);
-        }
-        {   /* Switch to 32-entry rows if searchLog is 5 (or more) */
-            U32 const rowLog = BOUNDED(4, cParams->searchLog, 6);
-            assert(cParams->hashLog >= rowLog);
-            ms->rowHashLog = cParams->hashLog - rowLog;
-        }
-    }
-
     ms->cParams = *cParams;
 
     RETURN_ERROR_IF(ZSTD_cwksp_reserve_failed(ws), memory_allocation,
@@ -1768,6 +1986,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
     assert(params->useRowMatchFinder != ZSTD_ps_auto);
     assert(params->useBlockSplitter != ZSTD_ps_auto);
     assert(params->ldmParams.enableLdm != ZSTD_ps_auto);
+    assert(params->maxBlockSize != 0);
     if (params->ldmParams.enableLdm == ZSTD_ps_enable) {
         /* Adjust long distance matching parameters */
         ZSTD_ldm_adjustParameters(&zc->appliedParams.ldmParams, &params->cParams);
@@ -1776,9 +1995,8 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
     }
 
     {   size_t const windowSize = MAX(1, (size_t)MIN(((U64)1 << params->cParams.windowLog), pledgedSrcSize));
-        size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, windowSize);
-        U32    const divider = (params->cParams.minMatch==3) ? 3 : 4;
-        size_t const maxNbSeq = blockSize / divider;
+        size_t const blockSize = MIN(params->maxBlockSize, windowSize);
+        size_t const maxNbSeq = ZSTD_maxNbSeq(blockSize, params->cParams.minMatch, params->useSequenceProducer);
         size_t const buffOutSize = (zbuff == ZSTDb_buffered && params->outBufferMode == ZSTD_bm_buffered)
                 ? ZSTD_compressBound(blockSize) + 1
                 : 0;
@@ -1795,7 +2013,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
         size_t const neededSpace =
             ZSTD_estimateCCtxSize_usingCCtxParams_internal(
                 &params->cParams, &params->ldmParams, zc->staticSize != 0, params->useRowMatchFinder,
-                buffInSize, buffOutSize, pledgedSrcSize);
+                buffInSize, buffOutSize, pledgedSrcSize, params->useSequenceProducer, params->maxBlockSize);
         int resizeWorkspace;
 
         FORWARD_IF_ERROR(neededSpace, "cctx size estimate failed!");
@@ -1838,6 +2056,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
 
         /* init params */
         zc->blockState.matchState.cParams = params->cParams;
+        zc->blockState.matchState.prefetchCDictTables = params->prefetchCDictTables == ZSTD_ps_enable;
         zc->pledgedSrcSizePlusOne = pledgedSrcSize+1;
         zc->consumedSrcSize = 0;
         zc->producedCSize = 0;
@@ -1854,13 +2073,46 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
 
         ZSTD_reset_compressedBlockState(zc->blockState.prevCBlock);
 
+        FORWARD_IF_ERROR(ZSTD_reset_matchState(
+                &zc->blockState.matchState,
+                ws,
+                &params->cParams,
+                params->useRowMatchFinder,
+                crp,
+                needsIndexReset,
+                ZSTD_resetTarget_CCtx), "");
+
+        zc->seqStore.sequencesStart = (seqDef*)ZSTD_cwksp_reserve_aligned(ws, maxNbSeq * sizeof(seqDef));
+
+        /* ldm hash table */
+        if (params->ldmParams.enableLdm == ZSTD_ps_enable) {
+            /* TODO: avoid memset? */
+            size_t const ldmHSize = ((size_t)1) << params->ldmParams.hashLog;
+            zc->ldmState.hashTable = (ldmEntry_t*)ZSTD_cwksp_reserve_aligned(ws, ldmHSize * sizeof(ldmEntry_t));
+            ZSTD_memset(zc->ldmState.hashTable, 0, ldmHSize * sizeof(ldmEntry_t));
+            zc->ldmSequences = (rawSeq*)ZSTD_cwksp_reserve_aligned(ws, maxNbLdmSeq * sizeof(rawSeq));
+            zc->maxNbLdmSequences = maxNbLdmSeq;
+
+            ZSTD_window_init(&zc->ldmState.window);
+            zc->ldmState.loadedDictEnd = 0;
+        }
+
+        /* reserve space for block-level external sequences */
+        if (params->useSequenceProducer) {
+            size_t const maxNbExternalSeq = ZSTD_sequenceBound(blockSize);
+            zc->externalMatchCtx.seqBufferCapacity = maxNbExternalSeq;
+            zc->externalMatchCtx.seqBuffer =
+                (ZSTD_Sequence*)ZSTD_cwksp_reserve_aligned(ws, maxNbExternalSeq * sizeof(ZSTD_Sequence));
+        }
+
+        /* buffers */
+
         /* ZSTD_wildcopy() is used to copy into the literals buffer,
          * so we have to oversize the buffer by WILDCOPY_OVERLENGTH bytes.
          */
         zc->seqStore.litStart = ZSTD_cwksp_reserve_buffer(ws, blockSize + WILDCOPY_OVERLENGTH);
         zc->seqStore.maxNbLit = blockSize;
 
-        /* buffers */
         zc->bufferedPolicy = zbuff;
         zc->inBuffSize = buffInSize;
         zc->inBuff = (char*)ZSTD_cwksp_reserve_buffer(ws, buffInSize);
@@ -1883,32 +2135,9 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
         zc->seqStore.llCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE));
         zc->seqStore.mlCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE));
         zc->seqStore.ofCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE));
-        zc->seqStore.sequencesStart = (seqDef*)ZSTD_cwksp_reserve_aligned(ws, maxNbSeq * sizeof(seqDef));
-
-        FORWARD_IF_ERROR(ZSTD_reset_matchState(
-            &zc->blockState.matchState,
-            ws,
-            &params->cParams,
-            params->useRowMatchFinder,
-            crp,
-            needsIndexReset,
-            ZSTD_resetTarget_CCtx), "");
-
-        /* ldm hash table */
-        if (params->ldmParams.enableLdm == ZSTD_ps_enable) {
-            /* TODO: avoid memset? */
-            size_t const ldmHSize = ((size_t)1) << params->ldmParams.hashLog;
-            zc->ldmState.hashTable = (ldmEntry_t*)ZSTD_cwksp_reserve_aligned(ws, ldmHSize * sizeof(ldmEntry_t));
-            ZSTD_memset(zc->ldmState.hashTable, 0, ldmHSize * sizeof(ldmEntry_t));
-            zc->ldmSequences = (rawSeq*)ZSTD_cwksp_reserve_aligned(ws, maxNbLdmSeq * sizeof(rawSeq));
-            zc->maxNbLdmSequences = maxNbLdmSeq;
-
-            ZSTD_window_init(&zc->ldmState.window);
-            zc->ldmState.loadedDictEnd = 0;
-        }
 
         DEBUGLOG(3, "wksp: finished allocating, %zd bytes remain available", ZSTD_cwksp_available_space(ws));
-        assert(ZSTD_cwksp_estimated_space_within_bounds(ws, neededSpace, resizeWorkspace));
+        assert(ZSTD_cwksp_estimated_space_within_bounds(ws, neededSpace));
 
         zc->initialized = 1;
 
@@ -1980,7 +2209,8 @@ ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx,
         }
 
         params.cParams = ZSTD_adjustCParams_internal(adjusted_cdict_cParams, pledgedSrcSize,
-                                                     cdict->dictContentSize, ZSTD_cpm_attachDict);
+                                                     cdict->dictContentSize, ZSTD_cpm_attachDict,
+                                                     params.useRowMatchFinder);
         params.cParams.windowLog = windowLog;
         params.useRowMatchFinder = cdict->useRowMatchFinder;    /* cdict overrides */
         FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, &params, pledgedSrcSize,
@@ -2019,6 +2249,22 @@ ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx,
     return 0;
 }
 
+static void ZSTD_copyCDictTableIntoCCtx(U32* dst, U32 const* src, size_t tableSize,
+                                        ZSTD_compressionParameters const* cParams) {
+    if (ZSTD_CDictIndicesAreTagged(cParams)){
+        /* Remove tags from the CDict table if they are present.
+         * See docs on "short cache" in zstd_compress_internal.h for context. */
+        size_t i;
+        for (i = 0; i < tableSize; i++) {
+            U32 const taggedIndex = src[i];
+            U32 const index = taggedIndex >> ZSTD_SHORT_CACHE_TAG_BITS;
+            dst[i] = index;
+        }
+    } else {
+        ZSTD_memcpy(dst, src, tableSize * sizeof(U32));
+    }
+}
+
 static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx,
                             const ZSTD_CDict* cdict,
                             ZSTD_CCtx_params params,
@@ -2054,21 +2300,23 @@ static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx,
                                                             : 0;
         size_t const hSize =  (size_t)1 << cdict_cParams->hashLog;
 
-        ZSTD_memcpy(cctx->blockState.matchState.hashTable,
-               cdict->matchState.hashTable,
-               hSize * sizeof(U32));
+        ZSTD_copyCDictTableIntoCCtx(cctx->blockState.matchState.hashTable,
+                                cdict->matchState.hashTable,
+                                hSize, cdict_cParams);
+
         /* Do not copy cdict's chainTable if cctx has parameters such that it would not use chainTable */
         if (ZSTD_allocateChainTable(cctx->appliedParams.cParams.strategy, cctx->appliedParams.useRowMatchFinder, 0 /* forDDSDict */)) {
-            ZSTD_memcpy(cctx->blockState.matchState.chainTable,
-               cdict->matchState.chainTable,
-               chainSize * sizeof(U32));
+            ZSTD_copyCDictTableIntoCCtx(cctx->blockState.matchState.chainTable,
+                                    cdict->matchState.chainTable,
+                                    chainSize, cdict_cParams);
         }
         /* copy tag table */
         if (ZSTD_rowMatchFinderUsed(cdict_cParams->strategy, cdict->useRowMatchFinder)) {
-            size_t const tagTableSize = hSize*sizeof(U16);
+            size_t const tagTableSize = hSize;
             ZSTD_memcpy(cctx->blockState.matchState.tagTable,
-                cdict->matchState.tagTable,
-                tagTableSize);
+                        cdict->matchState.tagTable,
+                        tagTableSize);
+            cctx->blockState.matchState.hashSalt = cdict->matchState.hashSalt;
         }
     }
 
@@ -2147,6 +2395,7 @@ static size_t ZSTD_copyCCtx_internal(ZSTD_CCtx* dstCCtx,
         params.useBlockSplitter = srcCCtx->appliedParams.useBlockSplitter;
         params.ldmParams = srcCCtx->appliedParams.ldmParams;
         params.fParams = fParams;
+        params.maxBlockSize = srcCCtx->appliedParams.maxBlockSize;
         ZSTD_resetCCtx_internal(dstCCtx, &params, pledgedSrcSize,
                                 /* loadedDictSize */ 0,
                                 ZSTDcrp_leaveDirty, zbuff);
@@ -2294,7 +2543,7 @@ static void ZSTD_reduceIndex (ZSTD_matchState_t* ms, ZSTD_CCtx_params const* par
 
 /* See doc/zstd_compression_format.md for detailed format description */
 
-void ZSTD_seqToCodes(const seqStore_t* seqStorePtr)
+int ZSTD_seqToCodes(const seqStore_t* seqStorePtr)
 {
     const seqDef* const sequences = seqStorePtr->sequencesStart;
     BYTE* const llCodeTable = seqStorePtr->llCode;
@@ -2302,18 +2551,24 @@ void ZSTD_seqToCodes(const seqStore_t* seqStorePtr)
     BYTE* const mlCodeTable = seqStorePtr->mlCode;
     U32 const nbSeq = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
     U32 u;
+    int longOffsets = 0;
     assert(nbSeq <= seqStorePtr->maxNbSeq);
     for (u=0; u<nbSeq; u++) {
         U32 const llv = sequences[u].litLength;
+        U32 const ofCode = ZSTD_highbit32(sequences[u].offBase);
         U32 const mlv = sequences[u].mlBase;
         llCodeTable[u] = (BYTE)ZSTD_LLcode(llv);
-        ofCodeTable[u] = (BYTE)ZSTD_highbit32(sequences[u].offBase);
+        ofCodeTable[u] = (BYTE)ofCode;
         mlCodeTable[u] = (BYTE)ZSTD_MLcode(mlv);
+        assert(!(MEM_64bits() && ofCode >= STREAM_ACCUMULATOR_MIN));
+        if (MEM_32bits() && ofCode >= STREAM_ACCUMULATOR_MIN)
+            longOffsets = 1;
     }
     if (seqStorePtr->longLengthType==ZSTD_llt_literalLength)
         llCodeTable[seqStorePtr->longLengthPos] = MaxLL;
     if (seqStorePtr->longLengthType==ZSTD_llt_matchLength)
         mlCodeTable[seqStorePtr->longLengthPos] = MaxML;
+    return longOffsets;
 }
 
 /* ZSTD_useTargetCBlockSize():
@@ -2347,6 +2602,7 @@ typedef struct {
     U32 MLtype;
     size_t size;
     size_t lastCountSize; /* Accounts for bug in 1.3.4. More detail in ZSTD_entropyCompressSeqStore_internal() */
+    int longOffsets;
 } ZSTD_symbolEncodingTypeStats_t;
 
 /* ZSTD_buildSequencesStatistics():
@@ -2357,11 +2613,13 @@ typedef struct {
  * entropyWkspSize must be of size at least ENTROPY_WORKSPACE_SIZE - (MaxSeq + 1)*sizeof(U32)
  */
 static ZSTD_symbolEncodingTypeStats_t
-ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq,
-                        const ZSTD_fseCTables_t* prevEntropy, ZSTD_fseCTables_t* nextEntropy,
-                              BYTE* dst, const BYTE* const dstEnd,
-                              ZSTD_strategy strategy, unsigned* countWorkspace,
-                              void* entropyWorkspace, size_t entropyWkspSize) {
+ZSTD_buildSequencesStatistics(
+                const seqStore_t* seqStorePtr, size_t nbSeq,
+                const ZSTD_fseCTables_t* prevEntropy, ZSTD_fseCTables_t* nextEntropy,
+                      BYTE* dst, const BYTE* const dstEnd,
+                      ZSTD_strategy strategy, unsigned* countWorkspace,
+                      void* entropyWorkspace, size_t entropyWkspSize)
+{
     BYTE* const ostart = dst;
     const BYTE* const oend = dstEnd;
     BYTE* op = ostart;
@@ -2375,7 +2633,7 @@ ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq,
 
     stats.lastCountSize = 0;
     /* convert length/distances into codes */
-    ZSTD_seqToCodes(seqStorePtr);
+    stats.longOffsets = ZSTD_seqToCodes(seqStorePtr);
     assert(op <= oend);
     assert(nbSeq != 0); /* ZSTD_selectEncodingType() divides by nbSeq */
     /* build CTable for Literal Lengths */
@@ -2480,22 +2738,22 @@ ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq,
  */
 #define SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO 20
 MEM_STATIC size_t
-ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr,
-                          const ZSTD_entropyCTables_t* prevEntropy,
-                                ZSTD_entropyCTables_t* nextEntropy,
-                          const ZSTD_CCtx_params* cctxParams,
-                                void* dst, size_t dstCapacity,
-                                void* entropyWorkspace, size_t entropyWkspSize,
-                          const int bmi2)
+ZSTD_entropyCompressSeqStore_internal(
+                        const seqStore_t* seqStorePtr,
+                        const ZSTD_entropyCTables_t* prevEntropy,
+                              ZSTD_entropyCTables_t* nextEntropy,
+                        const ZSTD_CCtx_params* cctxParams,
+                              void* dst, size_t dstCapacity,
+                              void* entropyWorkspace, size_t entropyWkspSize,
+                        const int bmi2)
 {
-    const int longOffsets = cctxParams->cParams.windowLog > STREAM_ACCUMULATOR_MIN;
     ZSTD_strategy const strategy = cctxParams->cParams.strategy;
     unsigned* count = (unsigned*)entropyWorkspace;
     FSE_CTable* CTable_LitLength = nextEntropy->fse.litlengthCTable;
     FSE_CTable* CTable_OffsetBits = nextEntropy->fse.offcodeCTable;
     FSE_CTable* CTable_MatchLength = nextEntropy->fse.matchlengthCTable;
     const seqDef* const sequences = seqStorePtr->sequencesStart;
-    const size_t nbSeq = seqStorePtr->sequences - seqStorePtr->sequencesStart;
+    const size_t nbSeq = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
     const BYTE* const ofCodeTable = seqStorePtr->ofCode;
     const BYTE* const llCodeTable = seqStorePtr->llCode;
     const BYTE* const mlCodeTable = seqStorePtr->mlCode;
@@ -2503,29 +2761,31 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr,
     BYTE* const oend = ostart + dstCapacity;
     BYTE* op = ostart;
     size_t lastCountSize;
+    int longOffsets = 0;
 
     entropyWorkspace = count + (MaxSeq + 1);
     entropyWkspSize -= (MaxSeq + 1) * sizeof(*count);
 
-    DEBUGLOG(4, "ZSTD_entropyCompressSeqStore_internal (nbSeq=%zu)", nbSeq);
+    DEBUGLOG(5, "ZSTD_entropyCompressSeqStore_internal (nbSeq=%zu, dstCapacity=%zu)", nbSeq, dstCapacity);
     ZSTD_STATIC_ASSERT(HUF_WORKSPACE_SIZE >= (1<<MAX(MLFSELog,LLFSELog)));
     assert(entropyWkspSize >= HUF_WORKSPACE_SIZE);
 
     /* Compress literals */
     {   const BYTE* const literals = seqStorePtr->litStart;
-        size_t const numSequences = seqStorePtr->sequences - seqStorePtr->sequencesStart;
-        size_t const numLiterals = seqStorePtr->lit - seqStorePtr->litStart;
+        size_t const numSequences = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
+        size_t const numLiterals = (size_t)(seqStorePtr->lit - seqStorePtr->litStart);
         /* Base suspicion of uncompressibility on ratio of literals to sequences */
         unsigned const suspectUncompressible = (numSequences == 0) || (numLiterals / numSequences >= SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO);
         size_t const litSize = (size_t)(seqStorePtr->lit - literals);
+
         size_t const cSize = ZSTD_compressLiterals(
-                                    &prevEntropy->huf, &nextEntropy->huf,
-                                    cctxParams->cParams.strategy,
-                                    ZSTD_literalsCompressionIsDisabled(cctxParams),
                                     op, dstCapacity,
                                     literals, litSize,
                                     entropyWorkspace, entropyWkspSize,
-                                    bmi2, suspectUncompressible);
+                                    &prevEntropy->huf, &nextEntropy->huf,
+                                    cctxParams->cParams.strategy,
+                                    ZSTD_literalsCompressionIsDisabled(cctxParams),
+                                    suspectUncompressible, bmi2);
         FORWARD_IF_ERROR(cSize, "ZSTD_compressLiterals failed");
         assert(cSize <= dstCapacity);
         op += cSize;
@@ -2551,11 +2811,10 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr,
         ZSTD_memcpy(&nextEntropy->fse, &prevEntropy->fse, sizeof(prevEntropy->fse));
         return (size_t)(op - ostart);
     }
-    {
-        ZSTD_symbolEncodingTypeStats_t stats;
-        BYTE* seqHead = op++;
+    {   BYTE* const seqHead = op++;
         /* build stats for sequences */
-        stats = ZSTD_buildSequencesStatistics(seqStorePtr, nbSeq,
+        const ZSTD_symbolEncodingTypeStats_t stats =
+                ZSTD_buildSequencesStatistics(seqStorePtr, nbSeq,
                                              &prevEntropy->fse, &nextEntropy->fse,
                                               op, oend,
                                               strategy, count,
@@ -2564,6 +2823,7 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr,
         *seqHead = (BYTE)((stats.LLtype<<6) + (stats.Offtype<<4) + (stats.MLtype<<2));
         lastCountSize = stats.lastCountSize;
         op += stats.size;
+        longOffsets = stats.longOffsets;
     }
 
     {   size_t const bitstreamSize = ZSTD_encodeSequences(
@@ -2598,14 +2858,15 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr,
 }
 
 MEM_STATIC size_t
-ZSTD_entropyCompressSeqStore(seqStore_t* seqStorePtr,
-                       const ZSTD_entropyCTables_t* prevEntropy,
-                             ZSTD_entropyCTables_t* nextEntropy,
-                       const ZSTD_CCtx_params* cctxParams,
-                             void* dst, size_t dstCapacity,
-                             size_t srcSize,
-                             void* entropyWorkspace, size_t entropyWkspSize,
-                             int bmi2)
+ZSTD_entropyCompressSeqStore(
+                    const seqStore_t* seqStorePtr,
+                    const ZSTD_entropyCTables_t* prevEntropy,
+                          ZSTD_entropyCTables_t* nextEntropy,
+                    const ZSTD_CCtx_params* cctxParams,
+                          void* dst, size_t dstCapacity,
+                          size_t srcSize,
+                          void* entropyWorkspace, size_t entropyWkspSize,
+                          int bmi2)
 {
     size_t const cSize = ZSTD_entropyCompressSeqStore_internal(
                             seqStorePtr, prevEntropy, nextEntropy, cctxParams,
@@ -2615,15 +2876,21 @@ ZSTD_entropyCompressSeqStore(seqStore_t* seqStorePtr,
     /* When srcSize <= dstCapacity, there is enough space to write a raw uncompressed block.
      * Since we ran out of space, block must be not compressible, so fall back to raw uncompressed block.
      */
-    if ((cSize == ERROR(dstSize_tooSmall)) & (srcSize <= dstCapacity))
+    if ((cSize == ERROR(dstSize_tooSmall)) & (srcSize <= dstCapacity)) {
+        DEBUGLOG(4, "not enough dstCapacity (%zu) for ZSTD_entropyCompressSeqStore_internal()=> do not compress block", dstCapacity);
         return 0;  /* block not compressed */
+    }
     FORWARD_IF_ERROR(cSize, "ZSTD_entropyCompressSeqStore_internal failed");
 
     /* Check compressibility */
     {   size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, cctxParams->cParams.strategy);
         if (cSize >= maxCSize) return 0;  /* block not compressed */
     }
-    DEBUGLOG(4, "ZSTD_entropyCompressSeqStore() cSize: %zu", cSize);
+    DEBUGLOG(5, "ZSTD_entropyCompressSeqStore() cSize: %zu", cSize);
+    /* libzstd decoder before  > v1.5.4 is not compatible with compressed blocks of size ZSTD_BLOCKSIZE_MAX exactly.
+     * This restriction is indirectly already fulfilled by respecting ZSTD_minGain() condition above.
+     */
+    assert(cSize < ZSTD_BLOCKSIZE_MAX);
     return cSize;
 }
 
@@ -2718,6 +2985,72 @@ void ZSTD_resetSeqStore(seqStore_t* ssPtr)
     ssPtr->longLengthType = ZSTD_llt_none;
 }
 
+/* ZSTD_postProcessSequenceProducerResult() :
+ * Validates and post-processes sequences obtained through the external matchfinder API:
+ *   - Checks whether nbExternalSeqs represents an error condition.
+ *   - Appends a block delimiter to outSeqs if one is not already present.
+ *     See zstd.h for context regarding block delimiters.
+ * Returns the number of sequences after post-processing, or an error code. */
+static size_t ZSTD_postProcessSequenceProducerResult(
+    ZSTD_Sequence* outSeqs, size_t nbExternalSeqs, size_t outSeqsCapacity, size_t srcSize
+) {
+    RETURN_ERROR_IF(
+        nbExternalSeqs > outSeqsCapacity,
+        sequenceProducer_failed,
+        "External sequence producer returned error code %lu",
+        (unsigned long)nbExternalSeqs
+    );
+
+    RETURN_ERROR_IF(
+        nbExternalSeqs == 0 && srcSize > 0,
+        sequenceProducer_failed,
+        "Got zero sequences from external sequence producer for a non-empty src buffer!"
+    );
+
+    if (srcSize == 0) {
+        ZSTD_memset(&outSeqs[0], 0, sizeof(ZSTD_Sequence));
+        return 1;
+    }
+
+    {
+        ZSTD_Sequence const lastSeq = outSeqs[nbExternalSeqs - 1];
+
+        /* We can return early if lastSeq is already a block delimiter. */
+        if (lastSeq.offset == 0 && lastSeq.matchLength == 0) {
+            return nbExternalSeqs;
+        }
+
+        /* This error condition is only possible if the external matchfinder
+         * produced an invalid parse, by definition of ZSTD_sequenceBound(). */
+        RETURN_ERROR_IF(
+            nbExternalSeqs == outSeqsCapacity,
+            sequenceProducer_failed,
+            "nbExternalSeqs == outSeqsCapacity but lastSeq is not a block delimiter!"
+        );
+
+        /* lastSeq is not a block delimiter, so we need to append one. */
+        ZSTD_memset(&outSeqs[nbExternalSeqs], 0, sizeof(ZSTD_Sequence));
+        return nbExternalSeqs + 1;
+    }
+}
+
+/* ZSTD_fastSequenceLengthSum() :
+ * Returns sum(litLen) + sum(matchLen) + lastLits for *seqBuf*.
+ * Similar to another function in zstd_compress.c (determine_blockSize),
+ * except it doesn't check for a block delimiter to end summation.
+ * Removing the early exit allows the compiler to auto-vectorize (https://godbolt.org/z/cY1cajz9P).
+ * This function can be deleted and replaced by determine_blockSize after we resolve issue #3456. */
+static size_t ZSTD_fastSequenceLengthSum(ZSTD_Sequence const* seqBuf, size_t seqBufSize) {
+    size_t matchLenSum, litLenSum, i;
+    matchLenSum = 0;
+    litLenSum = 0;
+    for (i = 0; i < seqBufSize; i++) {
+        litLenSum += seqBuf[i].litLength;
+        matchLenSum += seqBuf[i].matchLength;
+    }
+    return litLenSum + matchLenSum;
+}
+
 typedef enum { ZSTDbss_compress, ZSTDbss_noCompress } ZSTD_buildSeqStore_e;
 
 static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize)
@@ -2727,7 +3060,9 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize)
     assert(srcSize <= ZSTD_BLOCKSIZE_MAX);
     /* Assert that we have correctly flushed the ctx params into the ms's copy */
     ZSTD_assertEqualCParams(zc->appliedParams.cParams, ms->cParams);
-    if (srcSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1) {
+    /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding
+     * additional 1. We need to revisit and change this logic to be more consistent */
+    if (srcSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1+1) {
         if (zc->appliedParams.cParams.strategy >= ZSTD_btopt) {
             ZSTD_ldm_skipRawSeqStoreBytes(&zc->externSeqStore, srcSize);
         } else {
@@ -2763,6 +3098,15 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize)
         }
         if (zc->externSeqStore.pos < zc->externSeqStore.size) {
             assert(zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_disable);
+
+            /* External matchfinder + LDM is technically possible, just not implemented yet.
+             * We need to revisit soon and implement it. */
+            RETURN_ERROR_IF(
+                zc->appliedParams.useSequenceProducer,
+                parameter_combination_unsupported,
+                "Long-distance matching with external sequence producer enabled is not currently supported."
+            );
+
             /* Updates ldmSeqStore.pos */
             lastLLSize =
                 ZSTD_ldm_blockCompress(&zc->externSeqStore,
@@ -2774,6 +3118,14 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize)
         } else if (zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_enable) {
             rawSeqStore_t ldmSeqStore = kNullRawSeqStore;
 
+            /* External matchfinder + LDM is technically possible, just not implemented yet.
+             * We need to revisit soon and implement it. */
+            RETURN_ERROR_IF(
+                zc->appliedParams.useSequenceProducer,
+                parameter_combination_unsupported,
+                "Long-distance matching with external sequence producer enabled is not currently supported."
+            );
+
             ldmSeqStore.seq = zc->ldmSequences;
             ldmSeqStore.capacity = zc->maxNbLdmSequences;
             /* Updates ldmSeqStore.size */
@@ -2788,7 +3140,68 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize)
                                        zc->appliedParams.useRowMatchFinder,
                                        src, srcSize);
             assert(ldmSeqStore.pos == ldmSeqStore.size);
-        } else {   /* not long range mode */
+        } else if (zc->appliedParams.useSequenceProducer) {
+            assert(
+                zc->externalMatchCtx.seqBufferCapacity >= ZSTD_sequenceBound(srcSize)
+            );
+            assert(zc->externalMatchCtx.mFinder != NULL);
+
+            {   U32 const windowSize = (U32)1 << zc->appliedParams.cParams.windowLog;
+
+                size_t const nbExternalSeqs = (zc->externalMatchCtx.mFinder)(
+                    zc->externalMatchCtx.mState,
+                    zc->externalMatchCtx.seqBuffer,
+                    zc->externalMatchCtx.seqBufferCapacity,
+                    src, srcSize,
+                    NULL, 0,  /* dict and dictSize, currently not supported */
+                    zc->appliedParams.compressionLevel,
+                    windowSize
+                );
+
+                size_t const nbPostProcessedSeqs = ZSTD_postProcessSequenceProducerResult(
+                    zc->externalMatchCtx.seqBuffer,
+                    nbExternalSeqs,
+                    zc->externalMatchCtx.seqBufferCapacity,
+                    srcSize
+                );
+
+                /* Return early if there is no error, since we don't need to worry about last literals */
+                if (!ZSTD_isError(nbPostProcessedSeqs)) {
+                    ZSTD_sequencePosition seqPos = {0,0,0};
+                    size_t const seqLenSum = ZSTD_fastSequenceLengthSum(zc->externalMatchCtx.seqBuffer, nbPostProcessedSeqs);
+                    RETURN_ERROR_IF(seqLenSum > srcSize, externalSequences_invalid, "External sequences imply too large a block!");
+                    FORWARD_IF_ERROR(
+                        ZSTD_copySequencesToSeqStoreExplicitBlockDelim(
+                            zc, &seqPos,
+                            zc->externalMatchCtx.seqBuffer, nbPostProcessedSeqs,
+                            src, srcSize,
+                            zc->appliedParams.searchForExternalRepcodes
+                        ),
+                        "Failed to copy external sequences to seqStore!"
+                    );
+                    ms->ldmSeqStore = NULL;
+                    DEBUGLOG(5, "Copied %lu sequences from external sequence producer to internal seqStore.", (unsigned long)nbExternalSeqs);
+                    return ZSTDbss_compress;
+                }
+
+                /* Propagate the error if fallback is disabled */
+                if (!zc->appliedParams.enableMatchFinderFallback) {
+                    return nbPostProcessedSeqs;
+                }
+
+                /* Fallback to software matchfinder */
+                {   ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(zc->appliedParams.cParams.strategy,
+                                                                                            zc->appliedParams.useRowMatchFinder,
+                                                                                            dictMode);
+                    ms->ldmSeqStore = NULL;
+                    DEBUGLOG(
+                        5,
+                        "External sequence producer returned error code %lu. Falling back to internal parser.",
+                        (unsigned long)nbExternalSeqs
+                    );
+                    lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, src, srcSize);
+            }   }
+        } else {   /* not long range mode and no external matchfinder */
             ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(zc->appliedParams.cParams.strategy,
                                                                                     zc->appliedParams.useRowMatchFinder,
                                                                                     dictMode);
@@ -2849,7 +3262,7 @@ static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc)
         /* seqStoreSeqs[i].offset == offCode+1, and ZSTD_updateRep() expects offCode
            so we provide seqStoreSeqs[i].offset - 1 */
         ZSTD_updateRep(updatedRepcodes.rep,
-                       seqStoreSeqs[i].offBase - 1,
+                       seqStoreSeqs[i].offBase,
                        seqStoreSeqs[i].litLength == 0);
         literalsRead += outSeqs[i].litLength;
     }
@@ -2865,6 +3278,10 @@ static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc)
     zc->seqCollector.seqIndex += seqStoreSeqSize;
 }
 
+size_t ZSTD_sequenceBound(size_t srcSize) {
+    return (srcSize / ZSTD_MINMATCH_MIN) + 1;
+}
+
 size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs,
                               size_t outSeqsSize, const void* src, size_t srcSize)
 {
@@ -2910,19 +3327,17 @@ static int ZSTD_isRLE(const BYTE* src, size_t length) {
     const size_t unrollMask = unrollSize - 1;
     const size_t prefixLength = length & unrollMask;
     size_t i;
-    size_t u;
     if (length == 1) return 1;
     /* Check if prefix is RLE first before using unrolled loop */
     if (prefixLength && ZSTD_count(ip+1, ip, ip+prefixLength) != prefixLength-1) {
         return 0;
     }
     for (i = prefixLength; i != length; i += unrollSize) {
+        size_t u;
         for (u = 0; u < unrollSize; u += sizeof(size_t)) {
             if (MEM_readST(ip + i + u) != valueST) {
                 return 0;
-            }
-        }
-    }
+    }   }   }
     return 1;
 }
 
@@ -2938,7 +3353,8 @@ static int ZSTD_maybeRLE(seqStore_t const* seqStore)
     return nbSeqs < 4 && nbLits < 10;
 }
 
-static void ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* const bs)
+static void
+ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* const bs)
 {
     ZSTD_compressedBlockState_t* const tmp = bs->prevCBlock;
     bs->prevCBlock = bs->nextCBlock;
@@ -2946,7 +3362,9 @@ static void ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* c
 }
 
 /* Writes the block header */
-static void writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastBlock) {
+static void
+writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastBlock)
+{
     U32 const cBlockHeader = cSize == 1 ?
                         lastBlock + (((U32)bt_rle)<<1) + (U32)(blockSize << 3) :
                         lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3);
@@ -2959,13 +3377,16 @@ static void writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastB
  *  Stores literals block type (raw, rle, compressed, repeat) and
  *  huffman description table to hufMetadata.
  *  Requires ENTROPY_WORKSPACE_SIZE workspace
- *  @return : size of huffman description table or error code */
-static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSize,
-                                            const ZSTD_hufCTables_t* prevHuf,
-                                                  ZSTD_hufCTables_t* nextHuf,
-                                                  ZSTD_hufCTablesMetadata_t* hufMetadata,
-                                                  const int literalsCompressionIsDisabled,
-                                                  void* workspace, size_t wkspSize)
+ * @return : size of huffman description table, or an error code
+ */
+static size_t
+ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSize,
+                               const ZSTD_hufCTables_t* prevHuf,
+                                     ZSTD_hufCTables_t* nextHuf,
+                                     ZSTD_hufCTablesMetadata_t* hufMetadata,
+                               const int literalsCompressionIsDisabled,
+                                     void* workspace, size_t wkspSize,
+                                     int hufFlags)
 {
     BYTE* const wkspStart = (BYTE*)workspace;
     BYTE* const wkspEnd = wkspStart + wkspSize;
@@ -2973,9 +3394,9 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi
     unsigned* const countWksp = (unsigned*)workspace;
     const size_t countWkspSize = (HUF_SYMBOLVALUE_MAX + 1) * sizeof(unsigned);
     BYTE* const nodeWksp = countWkspStart + countWkspSize;
-    const size_t nodeWkspSize = wkspEnd-nodeWksp;
+    const size_t nodeWkspSize = (size_t)(wkspEnd - nodeWksp);
     unsigned maxSymbolValue = HUF_SYMBOLVALUE_MAX;
-    unsigned huffLog = HUF_TABLELOG_DEFAULT;
+    unsigned huffLog = LitHufLog;
     HUF_repeat repeat = prevHuf->repeatMode;
     DEBUGLOG(5, "ZSTD_buildBlockEntropyStats_literals (srcSize=%zu)", srcSize);
 
@@ -2990,73 +3411,77 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi
 
     /* small ? don't even attempt compression (speed opt) */
 #ifndef COMPRESS_LITERALS_SIZE_MIN
-#define COMPRESS_LITERALS_SIZE_MIN 63
+# define COMPRESS_LITERALS_SIZE_MIN 63  /* heuristic */
 #endif
     {   size_t const minLitSize = (prevHuf->repeatMode == HUF_repeat_valid) ? 6 : COMPRESS_LITERALS_SIZE_MIN;
         if (srcSize <= minLitSize) {
             DEBUGLOG(5, "set_basic - too small");
             hufMetadata->hType = set_basic;
             return 0;
-        }
-    }
+    }   }
 
     /* Scan input and build symbol stats */
-    {   size_t const largest = HIST_count_wksp (countWksp, &maxSymbolValue, (const BYTE*)src, srcSize, workspace, wkspSize);
+    {   size_t const largest =
+            HIST_count_wksp (countWksp, &maxSymbolValue,
+                            (const BYTE*)src, srcSize,
+                            workspace, wkspSize);
         FORWARD_IF_ERROR(largest, "HIST_count_wksp failed");
         if (largest == srcSize) {
+            /* only one literal symbol */
             DEBUGLOG(5, "set_rle");
             hufMetadata->hType = set_rle;
             return 0;
         }
         if (largest <= (srcSize >> 7)+4) {
+            /* heuristic: likely not compressible */
             DEBUGLOG(5, "set_basic - no gain");
             hufMetadata->hType = set_basic;
             return 0;
-        }
-    }
+    }   }
 
     /* Validate the previous Huffman table */
-    if (repeat == HUF_repeat_check && !HUF_validateCTable((HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue)) {
+    if (repeat == HUF_repeat_check
+      && !HUF_validateCTable((HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue)) {
         repeat = HUF_repeat_none;
     }
 
     /* Build Huffman Tree */
     ZSTD_memset(nextHuf->CTable, 0, sizeof(nextHuf->CTable));
-    huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue);
+    huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue, nodeWksp, nodeWkspSize, nextHuf->CTable, countWksp, hufFlags);
+    assert(huffLog <= LitHufLog);
     {   size_t const maxBits = HUF_buildCTable_wksp((HUF_CElt*)nextHuf->CTable, countWksp,
                                                     maxSymbolValue, huffLog,
                                                     nodeWksp, nodeWkspSize);
         FORWARD_IF_ERROR(maxBits, "HUF_buildCTable_wksp");
         huffLog = (U32)maxBits;
-        {   /* Build and write the CTable */
-            size_t const newCSize = HUF_estimateCompressedSize(
-                    (HUF_CElt*)nextHuf->CTable, countWksp, maxSymbolValue);
-            size_t const hSize = HUF_writeCTable_wksp(
-                    hufMetadata->hufDesBuffer, sizeof(hufMetadata->hufDesBuffer),
-                    (HUF_CElt*)nextHuf->CTable, maxSymbolValue, huffLog,
-                    nodeWksp, nodeWkspSize);
-            /* Check against repeating the previous CTable */
-            if (repeat != HUF_repeat_none) {
-                size_t const oldCSize = HUF_estimateCompressedSize(
-                        (HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue);
-                if (oldCSize < srcSize && (oldCSize <= hSize + newCSize || hSize + 12 >= srcSize)) {
-                    DEBUGLOG(5, "set_repeat - smaller");
-                    ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
-                    hufMetadata->hType = set_repeat;
-                    return 0;
-                }
-            }
-            if (newCSize + hSize >= srcSize) {
-                DEBUGLOG(5, "set_basic - no gains");
+    }
+    {   /* Build and write the CTable */
+        size_t const newCSize = HUF_estimateCompressedSize(
+                (HUF_CElt*)nextHuf->CTable, countWksp, maxSymbolValue);
+        size_t const hSize = HUF_writeCTable_wksp(
+                hufMetadata->hufDesBuffer, sizeof(hufMetadata->hufDesBuffer),
+                (HUF_CElt*)nextHuf->CTable, maxSymbolValue, huffLog,
+                nodeWksp, nodeWkspSize);
+        /* Check against repeating the previous CTable */
+        if (repeat != HUF_repeat_none) {
+            size_t const oldCSize = HUF_estimateCompressedSize(
+                    (HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue);
+            if (oldCSize < srcSize && (oldCSize <= hSize + newCSize || hSize + 12 >= srcSize)) {
+                DEBUGLOG(5, "set_repeat - smaller");
                 ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
-                hufMetadata->hType = set_basic;
+                hufMetadata->hType = set_repeat;
                 return 0;
-            }
-            DEBUGLOG(5, "set_compressed (hSize=%u)", (U32)hSize);
-            hufMetadata->hType = set_compressed;
-            nextHuf->repeatMode = HUF_repeat_check;
-            return hSize;
+        }   }
+        if (newCSize + hSize >= srcSize) {
+            DEBUGLOG(5, "set_basic - no gains");
+            ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
+            hufMetadata->hType = set_basic;
+            return 0;
         }
+        DEBUGLOG(5, "set_compressed (hSize=%u)", (U32)hSize);
+        hufMetadata->hType = set_compressed;
+        nextHuf->repeatMode = HUF_repeat_check;
+        return hSize;
     }
 }
 
@@ -3066,8 +3491,9 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi
  * and updates nextEntropy to the appropriate repeatMode.
  */
 static ZSTD_symbolEncodingTypeStats_t
-ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy) {
-    ZSTD_symbolEncodingTypeStats_t stats = {set_basic, set_basic, set_basic, 0, 0};
+ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy)
+{
+    ZSTD_symbolEncodingTypeStats_t stats = {set_basic, set_basic, set_basic, 0, 0, 0};
     nextEntropy->litlength_repeatMode = FSE_repeat_none;
     nextEntropy->offcode_repeatMode = FSE_repeat_none;
     nextEntropy->matchlength_repeatMode = FSE_repeat_none;
@@ -3078,16 +3504,18 @@ ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy) {
  *  Builds entropy for the sequences.
  *  Stores symbol compression modes and fse table to fseMetadata.
  *  Requires ENTROPY_WORKSPACE_SIZE wksp.
- *  @return : size of fse tables or error code */
-static size_t ZSTD_buildBlockEntropyStats_sequences(seqStore_t* seqStorePtr,
-                                              const ZSTD_fseCTables_t* prevEntropy,
-                                                    ZSTD_fseCTables_t* nextEntropy,
-                                              const ZSTD_CCtx_params* cctxParams,
-                                                    ZSTD_fseCTablesMetadata_t* fseMetadata,
-                                                    void* workspace, size_t wkspSize)
+ * @return : size of fse tables or error code */
+static size_t
+ZSTD_buildBlockEntropyStats_sequences(
+                const seqStore_t* seqStorePtr,
+                const ZSTD_fseCTables_t* prevEntropy,
+                      ZSTD_fseCTables_t* nextEntropy,
+                const ZSTD_CCtx_params* cctxParams,
+                      ZSTD_fseCTablesMetadata_t* fseMetadata,
+                      void* workspace, size_t wkspSize)
 {
     ZSTD_strategy const strategy = cctxParams->cParams.strategy;
-    size_t const nbSeq = seqStorePtr->sequences - seqStorePtr->sequencesStart;
+    size_t const nbSeq = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
     BYTE* const ostart = fseMetadata->fseTablesBuffer;
     BYTE* const oend = ostart + sizeof(fseMetadata->fseTablesBuffer);
     BYTE* op = ostart;
@@ -3114,23 +3542,28 @@ static size_t ZSTD_buildBlockEntropyStats_sequences(seqStore_t* seqStorePtr,
 /* ZSTD_buildBlockEntropyStats() :
  *  Builds entropy for the block.
  *  Requires workspace size ENTROPY_WORKSPACE_SIZE
- *
- *  @return : 0 on success or error code
+ * @return : 0 on success, or an error code
+ *  Note : also employed in superblock
  */
-size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr,
-                             const ZSTD_entropyCTables_t* prevEntropy,
-                                   ZSTD_entropyCTables_t* nextEntropy,
-                             const ZSTD_CCtx_params* cctxParams,
-                                   ZSTD_entropyCTablesMetadata_t* entropyMetadata,
-                                   void* workspace, size_t wkspSize)
-{
-    size_t const litSize = seqStorePtr->lit - seqStorePtr->litStart;
+size_t ZSTD_buildBlockEntropyStats(
+            const seqStore_t* seqStorePtr,
+            const ZSTD_entropyCTables_t* prevEntropy,
+                  ZSTD_entropyCTables_t* nextEntropy,
+            const ZSTD_CCtx_params* cctxParams,
+                  ZSTD_entropyCTablesMetadata_t* entropyMetadata,
+                  void* workspace, size_t wkspSize)
+{
+    size_t const litSize = (size_t)(seqStorePtr->lit - seqStorePtr->litStart);
+    int const huf_useOptDepth = (cctxParams->cParams.strategy >= HUF_OPTIMAL_DEPTH_THRESHOLD);
+    int const hufFlags = huf_useOptDepth ? HUF_flags_optimalDepth : 0;
+
     entropyMetadata->hufMetadata.hufDesSize =
         ZSTD_buildBlockEntropyStats_literals(seqStorePtr->litStart, litSize,
                                             &prevEntropy->huf, &nextEntropy->huf,
                                             &entropyMetadata->hufMetadata,
                                             ZSTD_literalsCompressionIsDisabled(cctxParams),
-                                            workspace, wkspSize);
+                                            workspace, wkspSize, hufFlags);
+
     FORWARD_IF_ERROR(entropyMetadata->hufMetadata.hufDesSize, "ZSTD_buildBlockEntropyStats_literals failed");
     entropyMetadata->fseMetadata.fseTablesSize =
         ZSTD_buildBlockEntropyStats_sequences(seqStorePtr,
@@ -3143,11 +3576,12 @@ size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr,
 }
 
 /* Returns the size estimate for the literals section (header + content) of a block */
-static size_t ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSize,
-                                                const ZSTD_hufCTables_t* huf,
-                                                const ZSTD_hufCTablesMetadata_t* hufMetadata,
-                                                void* workspace, size_t wkspSize,
-                                                int writeEntropy)
+static size_t
+ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSize,
+                               const ZSTD_hufCTables_t* huf,
+                               const ZSTD_hufCTablesMetadata_t* hufMetadata,
+                               void* workspace, size_t wkspSize,
+                               int writeEntropy)
 {
     unsigned* const countWksp = (unsigned*)workspace;
     unsigned maxSymbolValue = HUF_SYMBOLVALUE_MAX;
@@ -3169,12 +3603,13 @@ static size_t ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSiz
 }
 
 /* Returns the size estimate for the FSE-compressed symbols (of, ml, ll) of a block */
-static size_t ZSTD_estimateBlockSize_symbolType(symbolEncodingType_e type,
-                        const BYTE* codeTable, size_t nbSeq, unsigned maxCode,
-                        const FSE_CTable* fseCTable,
-                        const U8* additionalBits,
-                        short const* defaultNorm, U32 defaultNormLog, U32 defaultMax,
-                        void* workspace, size_t wkspSize)
+static size_t
+ZSTD_estimateBlockSize_symbolType(symbolEncodingType_e type,
+                    const BYTE* codeTable, size_t nbSeq, unsigned maxCode,
+                    const FSE_CTable* fseCTable,
+                    const U8* additionalBits,
+                    short const* defaultNorm, U32 defaultNormLog, U32 defaultMax,
+                    void* workspace, size_t wkspSize)
 {
     unsigned* const countWksp = (unsigned*)workspace;
     const BYTE* ctp = codeTable;
@@ -3206,99 +3641,107 @@ static size_t ZSTD_estimateBlockSize_symbolType(symbolEncodingType_e type,
 }
 
 /* Returns the size estimate for the sequences section (header + content) of a block */
-static size_t ZSTD_estimateBlockSize_sequences(const BYTE* ofCodeTable,
-                                                  const BYTE* llCodeTable,
-                                                  const BYTE* mlCodeTable,
-                                                  size_t nbSeq,
-                                                  const ZSTD_fseCTables_t* fseTables,
-                                                  const ZSTD_fseCTablesMetadata_t* fseMetadata,
-                                                  void* workspace, size_t wkspSize,
-                                                  int writeEntropy)
+static size_t
+ZSTD_estimateBlockSize_sequences(const BYTE* ofCodeTable,
+                                 const BYTE* llCodeTable,
+                                 const BYTE* mlCodeTable,
+                                 size_t nbSeq,
+                                 const ZSTD_fseCTables_t* fseTables,
+                                 const ZSTD_fseCTablesMetadata_t* fseMetadata,
+                                 void* workspace, size_t wkspSize,
+                                 int writeEntropy)
 {
     size_t sequencesSectionHeaderSize = 1 /* seqHead */ + 1 /* min seqSize size */ + (nbSeq >= 128) + (nbSeq >= LONGNBSEQ);
     size_t cSeqSizeEstimate = 0;
     cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->ofType, ofCodeTable, nbSeq, MaxOff,
-                                         fseTables->offcodeCTable, NULL,
-                                         OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff,
-                                         workspace, wkspSize);
+                                    fseTables->offcodeCTable, NULL,
+                                    OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff,
+                                    workspace, wkspSize);
     cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->llType, llCodeTable, nbSeq, MaxLL,
-                                         fseTables->litlengthCTable, LL_bits,
-                                         LL_defaultNorm, LL_defaultNormLog, MaxLL,
-                                         workspace, wkspSize);
+                                    fseTables->litlengthCTable, LL_bits,
+                                    LL_defaultNorm, LL_defaultNormLog, MaxLL,
+                                    workspace, wkspSize);
     cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->mlType, mlCodeTable, nbSeq, MaxML,
-                                         fseTables->matchlengthCTable, ML_bits,
-                                         ML_defaultNorm, ML_defaultNormLog, MaxML,
-                                         workspace, wkspSize);
+                                    fseTables->matchlengthCTable, ML_bits,
+                                    ML_defaultNorm, ML_defaultNormLog, MaxML,
+                                    workspace, wkspSize);
     if (writeEntropy) cSeqSizeEstimate += fseMetadata->fseTablesSize;
     return cSeqSizeEstimate + sequencesSectionHeaderSize;
 }
 
 /* Returns the size estimate for a given stream of literals, of, ll, ml */
-static size_t ZSTD_estimateBlockSize(const BYTE* literals, size_t litSize,
-                                     const BYTE* ofCodeTable,
-                                     const BYTE* llCodeTable,
-                                     const BYTE* mlCodeTable,
-                                     size_t nbSeq,
-                                     const ZSTD_entropyCTables_t* entropy,
-                                     const ZSTD_entropyCTablesMetadata_t* entropyMetadata,
-                                     void* workspace, size_t wkspSize,
-                                     int writeLitEntropy, int writeSeqEntropy) {
+static size_t
+ZSTD_estimateBlockSize(const BYTE* literals, size_t litSize,
+                       const BYTE* ofCodeTable,
+                       const BYTE* llCodeTable,
+                       const BYTE* mlCodeTable,
+                       size_t nbSeq,
+                       const ZSTD_entropyCTables_t* entropy,
+                       const ZSTD_entropyCTablesMetadata_t* entropyMetadata,
+                       void* workspace, size_t wkspSize,
+                       int writeLitEntropy, int writeSeqEntropy)
+{
     size_t const literalsSize = ZSTD_estimateBlockSize_literal(literals, litSize,
-                                                         &entropy->huf, &entropyMetadata->hufMetadata,
-                                                         workspace, wkspSize, writeLitEntropy);
+                                    &entropy->huf, &entropyMetadata->hufMetadata,
+                                    workspace, wkspSize, writeLitEntropy);
     size_t const seqSize = ZSTD_estimateBlockSize_sequences(ofCodeTable, llCodeTable, mlCodeTable,
-                                                         nbSeq, &entropy->fse, &entropyMetadata->fseMetadata,
-                                                         workspace, wkspSize, writeSeqEntropy);
+                                    nbSeq, &entropy->fse, &entropyMetadata->fseMetadata,
+                                    workspace, wkspSize, writeSeqEntropy);
     return seqSize + literalsSize + ZSTD_blockHeaderSize;
 }
 
 /* Builds entropy statistics and uses them for blocksize estimation.
  *
- * Returns the estimated compressed size of the seqStore, or a zstd error.
+ * @return: estimated compressed size of the seqStore, or a zstd error.
  */
-static size_t ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(seqStore_t* seqStore, ZSTD_CCtx* zc) {
-    ZSTD_entropyCTablesMetadata_t* entropyMetadata = &zc->blockSplitCtx.entropyMetadata;
+static size_t
+ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(seqStore_t* seqStore, ZSTD_CCtx* zc)
+{
+    ZSTD_entropyCTablesMetadata_t* const entropyMetadata = &zc->blockSplitCtx.entropyMetadata;
     DEBUGLOG(6, "ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize()");
     FORWARD_IF_ERROR(ZSTD_buildBlockEntropyStats(seqStore,
                     &zc->blockState.prevCBlock->entropy,
                     &zc->blockState.nextCBlock->entropy,
                     &zc->appliedParams,
                     entropyMetadata,
-                    zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */), "");
-    return ZSTD_estimateBlockSize(seqStore->litStart, (size_t)(seqStore->lit - seqStore->litStart),
+                    zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE), "");
+    return ZSTD_estimateBlockSize(
+                    seqStore->litStart, (size_t)(seqStore->lit - seqStore->litStart),
                     seqStore->ofCode, seqStore->llCode, seqStore->mlCode,
                     (size_t)(seqStore->sequences - seqStore->sequencesStart),
-                    &zc->blockState.nextCBlock->entropy, entropyMetadata, zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE,
+                    &zc->blockState.nextCBlock->entropy,
+                    entropyMetadata,
+                    zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE,
                     (int)(entropyMetadata->hufMetadata.hType == set_compressed), 1);
 }
 
 /* Returns literals bytes represented in a seqStore */
-static size_t ZSTD_countSeqStoreLiteralsBytes(const seqStore_t* const seqStore) {
+static size_t ZSTD_countSeqStoreLiteralsBytes(const seqStore_t* const seqStore)
+{
     size_t literalsBytes = 0;
-    size_t const nbSeqs = seqStore->sequences - seqStore->sequencesStart;
+    size_t const nbSeqs = (size_t)(seqStore->sequences - seqStore->sequencesStart);
     size_t i;
     for (i = 0; i < nbSeqs; ++i) {
-        seqDef seq = seqStore->sequencesStart[i];
+        seqDef const seq = seqStore->sequencesStart[i];
         literalsBytes += seq.litLength;
         if (i == seqStore->longLengthPos && seqStore->longLengthType == ZSTD_llt_literalLength) {
             literalsBytes += 0x10000;
-        }
-    }
+    }   }
     return literalsBytes;
 }
 
 /* Returns match bytes represented in a seqStore */
-static size_t ZSTD_countSeqStoreMatchBytes(const seqStore_t* const seqStore) {
+static size_t ZSTD_countSeqStoreMatchBytes(const seqStore_t* const seqStore)
+{
     size_t matchBytes = 0;
-    size_t const nbSeqs = seqStore->sequences - seqStore->sequencesStart;
+    size_t const nbSeqs = (size_t)(seqStore->sequences - seqStore->sequencesStart);
     size_t i;
     for (i = 0; i < nbSeqs; ++i) {
         seqDef seq = seqStore->sequencesStart[i];
         matchBytes += seq.mlBase + MINMATCH;
         if (i == seqStore->longLengthPos && seqStore->longLengthType == ZSTD_llt_matchLength) {
             matchBytes += 0x10000;
-        }
-    }
+    }   }
     return matchBytes;
 }
 
@@ -3307,15 +3750,12 @@ static size_t ZSTD_countSeqStoreMatchBytes(const seqStore_t* const seqStore) {
  */
 static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore,
                                const seqStore_t* originalSeqStore,
-                                     size_t startIdx, size_t endIdx) {
-    BYTE* const litEnd = originalSeqStore->lit;
-    size_t literalsBytes;
-    size_t literalsBytesPreceding = 0;
-
+                                     size_t startIdx, size_t endIdx)
+{
     *resultSeqStore = *originalSeqStore;
     if (startIdx > 0) {
         resultSeqStore->sequences = originalSeqStore->sequencesStart + startIdx;
-        literalsBytesPreceding = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore);
+        resultSeqStore->litStart += ZSTD_countSeqStoreLiteralsBytes(resultSeqStore);
     }
 
     /* Move longLengthPos into the correct position if necessary */
@@ -3328,13 +3768,12 @@ static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore,
     }
     resultSeqStore->sequencesStart = originalSeqStore->sequencesStart + startIdx;
     resultSeqStore->sequences = originalSeqStore->sequencesStart + endIdx;
-    literalsBytes = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore);
-    resultSeqStore->litStart += literalsBytesPreceding;
     if (endIdx == (size_t)(originalSeqStore->sequences - originalSeqStore->sequencesStart)) {
         /* This accounts for possible last literals if the derived chunk reaches the end of the block */
-        resultSeqStore->lit = litEnd;
+        assert(resultSeqStore->lit == originalSeqStore->lit);
     } else {
-        resultSeqStore->lit = resultSeqStore->litStart+literalsBytes;
+        size_t const literalsBytes = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore);
+        resultSeqStore->lit = resultSeqStore->litStart + literalsBytes;
     }
     resultSeqStore->llCode += startIdx;
     resultSeqStore->mlCode += startIdx;
@@ -3342,20 +3781,26 @@ static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore,
 }
 
 /*
- * Returns the raw offset represented by the combination of offCode, ll0, and repcode history.
- * offCode must represent a repcode in the numeric representation of ZSTD_storeSeq().
+ * Returns the raw offset represented by the combination of offBase, ll0, and repcode history.
+ * offBase must represent a repcode in the numeric representation of ZSTD_storeSeq().
  */
 static U32
-ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offCode, const U32 ll0)
-{
-    U32 const adjustedOffCode = STORED_REPCODE(offCode) - 1 + ll0;  /* [ 0 - 3 ] */
-    assert(STORED_IS_REPCODE(offCode));
-    if (adjustedOffCode == ZSTD_REP_NUM) {
-        /* litlength == 0 and offCode == 2 implies selection of first repcode - 1 */
-        assert(rep[0] > 0);
+ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offBase, const U32 ll0)
+{
+    U32 const adjustedRepCode = OFFBASE_TO_REPCODE(offBase) - 1 + ll0;  /* [ 0 - 3 ] */
+    assert(OFFBASE_IS_REPCODE(offBase));
+    if (adjustedRepCode == ZSTD_REP_NUM) {
+        assert(ll0);
+        /* litlength == 0 and offCode == 2 implies selection of first repcode - 1
+         * This is only valid if it results in a valid offset value, aka > 0.
+         * Note : it may happen that `rep[0]==1` in exceptional circumstances.
+         * In which case this function will return 0, which is an invalid offset.
+         * It's not an issue though, since this value will be
+         * compared and discarded within ZSTD_seqStore_resolveOffCodes().
+         */
         return rep[0] - 1;
     }
-    return rep[adjustedOffCode];
+    return rep[adjustedRepCode];
 }
 
 /*
@@ -3371,30 +3816,33 @@ ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offCode, c
  *        1-3 : repcode 1-3
  *        4+ : real_offset+3
  */
-static void ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_t* const cRepcodes,
-                                          seqStore_t* const seqStore, U32 const nbSeq) {
+static void
+ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_t* const cRepcodes,
+                        const seqStore_t* const seqStore, U32 const nbSeq)
+{
     U32 idx = 0;
+    U32 const longLitLenIdx = seqStore->longLengthType == ZSTD_llt_literalLength ? seqStore->longLengthPos : nbSeq;
     for (; idx < nbSeq; ++idx) {
         seqDef* const seq = seqStore->sequencesStart + idx;
-        U32 const ll0 = (seq->litLength == 0);
-        U32 const offCode = OFFBASE_TO_STORED(seq->offBase);
-        assert(seq->offBase > 0);
-        if (STORED_IS_REPCODE(offCode)) {
-            U32 const dRawOffset = ZSTD_resolveRepcodeToRawOffset(dRepcodes->rep, offCode, ll0);
-            U32 const cRawOffset = ZSTD_resolveRepcodeToRawOffset(cRepcodes->rep, offCode, ll0);
+        U32 const ll0 = (seq->litLength == 0) && (idx != longLitLenIdx);
+        U32 const offBase = seq->offBase;
+        assert(offBase > 0);
+        if (OFFBASE_IS_REPCODE(offBase)) {
+            U32 const dRawOffset = ZSTD_resolveRepcodeToRawOffset(dRepcodes->rep, offBase, ll0);
+            U32 const cRawOffset = ZSTD_resolveRepcodeToRawOffset(cRepcodes->rep, offBase, ll0);
             /* Adjust simulated decompression repcode history if we come across a mismatch. Replace
              * the repcode with the offset it actually references, determined by the compression
              * repcode history.
              */
             if (dRawOffset != cRawOffset) {
-                seq->offBase = cRawOffset + ZSTD_REP_NUM;
+                seq->offBase = OFFSET_TO_OFFBASE(cRawOffset);
             }
         }
         /* Compression repcode history is always updated with values directly from the unmodified seqStore.
          * Decompression repcode history may use modified seq->offset value taken from compression repcode history.
          */
-        ZSTD_updateRep(dRepcodes->rep, OFFBASE_TO_STORED(seq->offBase), ll0);
-        ZSTD_updateRep(cRepcodes->rep, offCode, ll0);
+        ZSTD_updateRep(dRepcodes->rep, seq->offBase, ll0);
+        ZSTD_updateRep(cRepcodes->rep, offBase, ll0);
     }
 }
 
@@ -3404,10 +3852,11 @@ static void ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_
  * Returns the total size of that block (including header) or a ZSTD error code.
  */
 static size_t
-ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, seqStore_t* const seqStore,
+ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc,
+                            const seqStore_t* const seqStore,
                                   repcodes_t* const dRep, repcodes_t* const cRep,
                                   void* dst, size_t dstCapacity,
-                                  const void* src, size_t srcSize,
+                            const void* src, size_t srcSize,
                                   U32 lastBlock, U32 isPartition)
 {
     const U32 rleMaxLength = 25;
@@ -3481,45 +3930,49 @@ typedef struct {
 
 /* Helper function to perform the recursive search for block splits.
  * Estimates the cost of seqStore prior to split, and estimates the cost of splitting the sequences in half.
- * If advantageous to split, then we recurse down the two sub-blocks. If not, or if an error occurred in estimation, then
- * we do not recurse.
+ * If advantageous to split, then we recurse down the two sub-blocks.
+ * If not, or if an error occurred in estimation, then we do not recurse.
  *
- * Note: The recursion depth is capped by a heuristic minimum number of sequences, defined by MIN_SEQUENCES_BLOCK_SPLITTING.
+ * Note: The recursion depth is capped by a heuristic minimum number of sequences,
+ * defined by MIN_SEQUENCES_BLOCK_SPLITTING.
  * In theory, this means the absolute largest recursion depth is 10 == log2(maxNbSeqInBlock/MIN_SEQUENCES_BLOCK_SPLITTING).
  * In practice, recursion depth usually doesn't go beyond 4.
  *
- * Furthermore, the number of splits is capped by ZSTD_MAX_NB_BLOCK_SPLITS. At ZSTD_MAX_NB_BLOCK_SPLITS == 196 with the current existing blockSize
+ * Furthermore, the number of splits is capped by ZSTD_MAX_NB_BLOCK_SPLITS.
+ * At ZSTD_MAX_NB_BLOCK_SPLITS == 196 with the current existing blockSize
  * maximum of 128 KB, this value is actually impossible to reach.
  */
 static void
 ZSTD_deriveBlockSplitsHelper(seqStoreSplits* splits, size_t startIdx, size_t endIdx,
                              ZSTD_CCtx* zc, const seqStore_t* origSeqStore)
 {
-    seqStore_t* fullSeqStoreChunk = &zc->blockSplitCtx.fullSeqStoreChunk;
-    seqStore_t* firstHalfSeqStore = &zc->blockSplitCtx.firstHalfSeqStore;
-    seqStore_t* secondHalfSeqStore = &zc->blockSplitCtx.secondHalfSeqStore;
+    seqStore_t* const fullSeqStoreChunk = &zc->blockSplitCtx.fullSeqStoreChunk;
+    seqStore_t* const firstHalfSeqStore = &zc->blockSplitCtx.firstHalfSeqStore;
+    seqStore_t* const secondHalfSeqStore = &zc->blockSplitCtx.secondHalfSeqStore;
     size_t estimatedOriginalSize;
     size_t estimatedFirstHalfSize;
     size_t estimatedSecondHalfSize;
     size_t midIdx = (startIdx + endIdx)/2;
 
+    DEBUGLOG(5, "ZSTD_deriveBlockSplitsHelper: startIdx=%zu endIdx=%zu", startIdx, endIdx);
+    assert(endIdx >= startIdx);
     if (endIdx - startIdx < MIN_SEQUENCES_BLOCK_SPLITTING || splits->idx >= ZSTD_MAX_NB_BLOCK_SPLITS) {
-        DEBUGLOG(6, "ZSTD_deriveBlockSplitsHelper: Too few sequences");
+        DEBUGLOG(6, "ZSTD_deriveBlockSplitsHelper: Too few sequences (%zu)", endIdx - startIdx);
         return;
     }
-    DEBUGLOG(4, "ZSTD_deriveBlockSplitsHelper: startIdx=%zu endIdx=%zu", startIdx, endIdx);
     ZSTD_deriveSeqStoreChunk(fullSeqStoreChunk, origSeqStore, startIdx, endIdx);
     ZSTD_deriveSeqStoreChunk(firstHalfSeqStore, origSeqStore, startIdx, midIdx);
     ZSTD_deriveSeqStoreChunk(secondHalfSeqStore, origSeqStore, midIdx, endIdx);
     estimatedOriginalSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(fullSeqStoreChunk, zc);
     estimatedFirstHalfSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(firstHalfSeqStore, zc);
     estimatedSecondHalfSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(secondHalfSeqStore, zc);
-    DEBUGLOG(4, "Estimated original block size: %zu -- First half split: %zu -- Second half split: %zu",
+    DEBUGLOG(5, "Estimated original block size: %zu -- First half split: %zu -- Second half split: %zu",
              estimatedOriginalSize, estimatedFirstHalfSize, estimatedSecondHalfSize);
     if (ZSTD_isError(estimatedOriginalSize) || ZSTD_isError(estimatedFirstHalfSize) || ZSTD_isError(estimatedSecondHalfSize)) {
         return;
     }
     if (estimatedFirstHalfSize + estimatedSecondHalfSize < estimatedOriginalSize) {
+        DEBUGLOG(5, "split decided at seqNb:%zu", midIdx);
         ZSTD_deriveBlockSplitsHelper(splits, startIdx, midIdx, zc, origSeqStore);
         splits->splitLocations[splits->idx] = (U32)midIdx;
         splits->idx++;
@@ -3527,14 +3980,18 @@ ZSTD_deriveBlockSplitsHelper(seqStoreSplits* splits, size_t startIdx, size_t end
     }
 }
 
-/* Base recursive function. Populates a table with intra-block partition indices that can improve compression ratio.
+/* Base recursive function.
+ * Populates a table with intra-block partition indices that can improve compression ratio.
  *
- * Returns the number of splits made (which equals the size of the partition table - 1).
+ * @return: number of splits made (which equals the size of the partition table - 1).
  */
-static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq) {
-    seqStoreSplits splits = {partitions, 0};
+static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq)
+{
+    seqStoreSplits splits;
+    splits.splitLocations = partitions;
+    splits.idx = 0;
     if (nbSeq <= 4) {
-        DEBUGLOG(4, "ZSTD_deriveBlockSplits: Too few sequences to split");
+        DEBUGLOG(5, "ZSTD_deriveBlockSplits: Too few sequences to split (%u <= 4)", nbSeq);
         /* Refuse to try and split anything with less than 4 sequences */
         return 0;
     }
@@ -3550,18 +4007,20 @@ static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq)
  * Returns combined size of all blocks (which includes headers), or a ZSTD error code.
  */
 static size_t
-ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapacity,
-                                       const void* src, size_t blockSize, U32 lastBlock, U32 nbSeq)
+ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc,
+                                    void* dst, size_t dstCapacity,
+                              const void* src, size_t blockSize,
+                                    U32 lastBlock, U32 nbSeq)
 {
     size_t cSize = 0;
     const BYTE* ip = (const BYTE*)src;
     BYTE* op = (BYTE*)dst;
     size_t i = 0;
     size_t srcBytesTotal = 0;
-    U32* partitions = zc->blockSplitCtx.partitions; /* size == ZSTD_MAX_NB_BLOCK_SPLITS */
-    seqStore_t* nextSeqStore = &zc->blockSplitCtx.nextSeqStore;
-    seqStore_t* currSeqStore = &zc->blockSplitCtx.currSeqStore;
-    size_t numSplits = ZSTD_deriveBlockSplits(zc, partitions, nbSeq);
+    U32* const partitions = zc->blockSplitCtx.partitions; /* size == ZSTD_MAX_NB_BLOCK_SPLITS */
+    seqStore_t* const nextSeqStore = &zc->blockSplitCtx.nextSeqStore;
+    seqStore_t* const currSeqStore = &zc->blockSplitCtx.currSeqStore;
+    size_t const numSplits = ZSTD_deriveBlockSplits(zc, partitions, nbSeq);
 
     /* If a block is split and some partitions are emitted as RLE/uncompressed, then repcode history
      * may become invalid. In order to reconcile potentially invalid repcodes, we keep track of two
@@ -3583,30 +4042,31 @@ ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapac
     ZSTD_memcpy(cRep.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_t));
     ZSTD_memset(nextSeqStore, 0, sizeof(seqStore_t));
 
-    DEBUGLOG(4, "ZSTD_compressBlock_splitBlock_internal (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u)",
+    DEBUGLOG(5, "ZSTD_compressBlock_splitBlock_internal (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u)",
                 (unsigned)dstCapacity, (unsigned)zc->blockState.matchState.window.dictLimit,
                 (unsigned)zc->blockState.matchState.nextToUpdate);
 
     if (numSplits == 0) {
-        size_t cSizeSingleBlock = ZSTD_compressSeqStore_singleBlock(zc, &zc->seqStore,
-                                                                   &dRep, &cRep,
-                                                                    op, dstCapacity,
-                                                                    ip, blockSize,
-                                                                    lastBlock, 0 /* isPartition */);
+        size_t cSizeSingleBlock =
+            ZSTD_compressSeqStore_singleBlock(zc, &zc->seqStore,
+                                            &dRep, &cRep,
+                                            op, dstCapacity,
+                                            ip, blockSize,
+                                            lastBlock, 0 /* isPartition */);
         FORWARD_IF_ERROR(cSizeSingleBlock, "Compressing single block from splitBlock_internal() failed!");
         DEBUGLOG(5, "ZSTD_compressBlock_splitBlock_internal: No splits");
-        assert(cSizeSingleBlock <= ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize);
+        assert(zc->blockSize <= ZSTD_BLOCKSIZE_MAX);
+        assert(cSizeSingleBlock <= zc->blockSize + ZSTD_blockHeaderSize);
         return cSizeSingleBlock;
     }
 
     ZSTD_deriveSeqStoreChunk(currSeqStore, &zc->seqStore, 0, partitions[0]);
     for (i = 0; i <= numSplits; ++i) {
-        size_t srcBytes;
         size_t cSizeChunk;
         U32 const lastPartition = (i == numSplits);
         U32 lastBlockEntireSrc = 0;
 
-        srcBytes = ZSTD_countSeqStoreLiteralsBytes(currSeqStore) + ZSTD_countSeqStoreMatchBytes(currSeqStore);
+        size_t srcBytes = ZSTD_countSeqStoreLiteralsBytes(currSeqStore) + ZSTD_countSeqStoreMatchBytes(currSeqStore);
         srcBytesTotal += srcBytes;
         if (lastPartition) {
             /* This is the final partition, need to account for possible last literals */
@@ -3621,7 +4081,8 @@ ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapac
                                                        op, dstCapacity,
                                                        ip, srcBytes,
                                                        lastBlockEntireSrc, 1 /* isPartition */);
-        DEBUGLOG(5, "Estimated size: %zu actual size: %zu", ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(currSeqStore, zc), cSizeChunk);
+        DEBUGLOG(5, "Estimated size: %zu vs %zu : actual size",
+                    ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(currSeqStore, zc), cSizeChunk);
         FORWARD_IF_ERROR(cSizeChunk, "Compressing chunk failed!");
 
         ip += srcBytes;
@@ -3629,10 +4090,10 @@ ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapac
         dstCapacity -= cSizeChunk;
         cSize += cSizeChunk;
         *currSeqStore = *nextSeqStore;
-        assert(cSizeChunk <= ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize);
+        assert(cSizeChunk <= zc->blockSize + ZSTD_blockHeaderSize);
     }
-    /* cRep and dRep may have diverged during the compression. If so, we use the dRep repcodes
-     * for the next block.
+    /* cRep and dRep may have diverged during the compression.
+     * If so, we use the dRep repcodes for the next block.
      */
     ZSTD_memcpy(zc->blockState.prevCBlock->rep, dRep.rep, sizeof(repcodes_t));
     return cSize;
@@ -3643,8 +4104,6 @@ ZSTD_compressBlock_splitBlock(ZSTD_CCtx* zc,
                               void* dst, size_t dstCapacity,
                               const void* src, size_t srcSize, U32 lastBlock)
 {
-    const BYTE* ip = (const BYTE*)src;
-    BYTE* op = (BYTE*)dst;
     U32 nbSeq;
     size_t cSize;
     DEBUGLOG(4, "ZSTD_compressBlock_splitBlock");
@@ -3655,7 +4114,7 @@ ZSTD_compressBlock_splitBlock(ZSTD_CCtx* zc,
         if (bss == ZSTDbss_noCompress) {
             if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid)
                 zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check;
-            cSize = ZSTD_noCompressBlock(op, dstCapacity, ip, srcSize, lastBlock);
+            cSize = ZSTD_noCompressBlock(dst, dstCapacity, src, srcSize, lastBlock);
             FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed");
             DEBUGLOG(4, "ZSTD_compressBlock_splitBlock: Nocompress block");
             return cSize;
@@ -3673,9 +4132,9 @@ ZSTD_compressBlock_internal(ZSTD_CCtx* zc,
                             void* dst, size_t dstCapacity,
                             const void* src, size_t srcSize, U32 frame)
 {
-    /* This the upper bound for the length of an rle block.
-     * This isn't the actual upper bound. Finding the real threshold
-     * needs further investigation.
+    /* This is an estimated upper bound for the length of an rle block.
+     * This isn't the actual upper bound.
+     * Finding the real threshold needs further investigation.
      */
     const U32 rleMaxLength = 25;
     size_t cSize;
@@ -3767,10 +4226,11 @@ static size_t ZSTD_compressBlock_targetCBlockSize_body(ZSTD_CCtx* zc,
          *   * cSize >= blockBound(srcSize): We have expanded the block too much so
          *     emit an uncompressed block.
          */
-        {
-            size_t const cSize = ZSTD_compressSuperBlock(zc, dst, dstCapacity, src, srcSize, lastBlock);
+        {   size_t const cSize =
+                ZSTD_compressSuperBlock(zc, dst, dstCapacity, src, srcSize, lastBlock);
             if (cSize != ERROR(dstSize_tooSmall)) {
-                size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, zc->appliedParams.cParams.strategy);
+                size_t const maxCSize =
+                    srcSize - ZSTD_minGain(srcSize, zc->appliedParams.cParams.strategy);
                 FORWARD_IF_ERROR(cSize, "ZSTD_compressSuperBlock failed");
                 if (cSize != 0 && cSize < maxCSize + ZSTD_blockHeaderSize) {
                     ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState);
@@ -3778,7 +4238,7 @@ static size_t ZSTD_compressBlock_targetCBlockSize_body(ZSTD_CCtx* zc,
                 }
             }
         }
-    }
+    } /* if (bss == ZSTDbss_compress)*/
 
     DEBUGLOG(6, "Resorting to ZSTD_noCompressBlock()");
     /* Superblock compression failed, attempt to emit a single no compress block.
@@ -3836,7 +4296,7 @@ static void ZSTD_overflowCorrectIfNeeded(ZSTD_matchState_t* ms,
 *   All blocks will be terminated, all input will be consumed.
 *   Function will issue an error if there is not enough `dstCapacity` to hold the compressed content.
 *   Frame is supposed already started (header already produced)
-*   @return : compressed size, or an error code
+*  @return : compressed size, or an error code
 */
 static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx,
                                      void* dst, size_t dstCapacity,
@@ -3860,7 +4320,9 @@ static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx,
         ZSTD_matchState_t* const ms = &cctx->blockState.matchState;
         U32 const lastBlock = lastFrameChunk & (blockSize >= remaining);
 
-        RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize + MIN_CBLOCK_SIZE,
+        /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding
+         * additional 1. We need to revisit and change this logic to be more consistent */
+        RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize + MIN_CBLOCK_SIZE + 1,
                         dstSize_tooSmall,
                         "not enough space to store compressed block");
         if (remaining < blockSize) blockSize = remaining;
@@ -3899,7 +4361,7 @@ static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx,
                     MEM_writeLE24(op, cBlockHeader);
                     cSize += ZSTD_blockHeaderSize;
                 }
-            }
+            }  /* if (ZSTD_useTargetCBlockSize(&cctx->appliedParams))*/
 
 
             ip += blockSize;
@@ -4078,31 +4540,51 @@ static size_t ZSTD_compressContinue_internal (ZSTD_CCtx* cctx,
     }
 }
 
-size_t ZSTD_compressContinue (ZSTD_CCtx* cctx,
-                              void* dst, size_t dstCapacity,
-                        const void* src, size_t srcSize)
+size_t ZSTD_compressContinue_public(ZSTD_CCtx* cctx,
+                                        void* dst, size_t dstCapacity,
+                                  const void* src, size_t srcSize)
 {
     DEBUGLOG(5, "ZSTD_compressContinue (srcSize=%u)", (unsigned)srcSize);
     return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 1 /* frame mode */, 0 /* last chunk */);
 }
 
+/* NOTE: Must just wrap ZSTD_compressContinue_public() */
+size_t ZSTD_compressContinue(ZSTD_CCtx* cctx,
+                             void* dst, size_t dstCapacity,
+                       const void* src, size_t srcSize)
+{
+    return ZSTD_compressContinue_public(cctx, dst, dstCapacity, src, srcSize);
+}
 
-size_t ZSTD_getBlockSize(const ZSTD_CCtx* cctx)
+static size_t ZSTD_getBlockSize_deprecated(const ZSTD_CCtx* cctx)
 {
     ZSTD_compressionParameters const cParams = cctx->appliedParams.cParams;
     assert(!ZSTD_checkCParams(cParams));
-    return MIN (ZSTD_BLOCKSIZE_MAX, (U32)1 << cParams.windowLog);
+    return MIN(cctx->appliedParams.maxBlockSize, (size_t)1 << cParams.windowLog);
 }
 
-size_t ZSTD_compressBlock(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+/* NOTE: Must just wrap ZSTD_getBlockSize_deprecated() */
+size_t ZSTD_getBlockSize(const ZSTD_CCtx* cctx)
+{
+    return ZSTD_getBlockSize_deprecated(cctx);
+}
+
+/* NOTE: Must just wrap ZSTD_compressBlock_deprecated() */
+size_t ZSTD_compressBlock_deprecated(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize)
 {
     DEBUGLOG(5, "ZSTD_compressBlock: srcSize = %u", (unsigned)srcSize);
-    { size_t const blockSizeMax = ZSTD_getBlockSize(cctx);
+    { size_t const blockSizeMax = ZSTD_getBlockSize_deprecated(cctx);
       RETURN_ERROR_IF(srcSize > blockSizeMax, srcSize_wrong, "input is larger than a block"); }
 
     return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 0 /* frame mode */, 0 /* last chunk */);
 }
 
+/* NOTE: Must just wrap ZSTD_compressBlock_deprecated() */
+size_t ZSTD_compressBlock(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_deprecated(cctx, dst, dstCapacity, src, srcSize);
+}
+
 /*! ZSTD_loadDictionaryContent() :
  *  @return : 0, or an error code
  */
@@ -4111,25 +4593,36 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms,
                                          ZSTD_cwksp* ws,
                                          ZSTD_CCtx_params const* params,
                                          const void* src, size_t srcSize,
-                                         ZSTD_dictTableLoadMethod_e dtlm)
+                                         ZSTD_dictTableLoadMethod_e dtlm,
+                                         ZSTD_tableFillPurpose_e tfp)
 {
     const BYTE* ip = (const BYTE*) src;
     const BYTE* const iend = ip + srcSize;
     int const loadLdmDict = params->ldmParams.enableLdm == ZSTD_ps_enable && ls != NULL;
 
-    /* Assert that we the ms params match the params we're being given */
+    /* Assert that the ms params match the params we're being given */
     ZSTD_assertEqualCParams(params->cParams, ms->cParams);
 
-    if (srcSize > ZSTD_CHUNKSIZE_MAX) {
+    {   /* Ensure large dictionaries can't cause index overflow */
+
         /* Allow the dictionary to set indices up to exactly ZSTD_CURRENT_MAX.
          * Dictionaries right at the edge will immediately trigger overflow
          * correction, but I don't want to insert extra constraints here.
          */
-        U32 const maxDictSize = ZSTD_CURRENT_MAX - 1;
-        /* We must have cleared our windows when our source is this large. */
-        assert(ZSTD_window_isEmpty(ms->window));
-        if (loadLdmDict)
-            assert(ZSTD_window_isEmpty(ls->window));
+        U32 maxDictSize = ZSTD_CURRENT_MAX - ZSTD_WINDOW_START_INDEX;
+
+        int const CDictTaggedIndices = ZSTD_CDictIndicesAreTagged(&params->cParams);
+        if (CDictTaggedIndices && tfp == ZSTD_tfp_forCDict) {
+            /* Some dictionary matchfinders in zstd use "short cache",
+             * which treats the lower ZSTD_SHORT_CACHE_TAG_BITS of each
+             * CDict hashtable entry as a tag rather than as part of an index.
+             * When short cache is used, we need to truncate the dictionary
+             * so that its indices don't overlap with the tag. */
+            U32 const shortCacheMaxDictSize = (1u << (32 - ZSTD_SHORT_CACHE_TAG_BITS)) - ZSTD_WINDOW_START_INDEX;
+            maxDictSize = MIN(maxDictSize, shortCacheMaxDictSize);
+            assert(!loadLdmDict);
+        }
+
         /* If the dictionary is too large, only load the suffix of the dictionary. */
         if (srcSize > maxDictSize) {
             ip = iend - maxDictSize;
@@ -4138,30 +4631,46 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms,
         }
     }
 
-    DEBUGLOG(4, "ZSTD_loadDictionaryContent(): useRowMatchFinder=%d", (int)params->useRowMatchFinder);
+    if (srcSize > ZSTD_CHUNKSIZE_MAX) {
+        /* We must have cleared our windows when our source is this large. */
+        assert(ZSTD_window_isEmpty(ms->window));
+        if (loadLdmDict) assert(ZSTD_window_isEmpty(ls->window));
+    }
     ZSTD_window_update(&ms->window, src, srcSize, /* forceNonContiguous */ 0);
-    ms->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ms->window.base);
-    ms->forceNonContiguous = params->deterministicRefPrefix;
 
-    if (loadLdmDict) {
+    DEBUGLOG(4, "ZSTD_loadDictionaryContent(): useRowMatchFinder=%d", (int)params->useRowMatchFinder);
+
+    if (loadLdmDict) { /* Load the entire dict into LDM matchfinders. */
         ZSTD_window_update(&ls->window, src, srcSize, /* forceNonContiguous */ 0);
         ls->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ls->window.base);
+        ZSTD_ldm_fillHashTable(ls, ip, iend, &params->ldmParams);
+    }
+
+    /* If the dict is larger than we can reasonably index in our tables, only load the suffix. */
+    if (params->cParams.strategy < ZSTD_btultra) {
+        U32 maxDictSize = 8U << MIN(MAX(params->cParams.hashLog, params->cParams.chainLog), 28);
+        if (srcSize > maxDictSize) {
+            ip = iend - maxDictSize;
+            src = ip;
+            srcSize = maxDictSize;
+        }
     }
 
+    ms->nextToUpdate = (U32)(ip - ms->window.base);
+    ms->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ms->window.base);
+    ms->forceNonContiguous = params->deterministicRefPrefix;
+
     if (srcSize <= HASH_READ_SIZE) return 0;
 
     ZSTD_overflowCorrectIfNeeded(ms, ws, params, ip, iend);
 
-    if (loadLdmDict)
-        ZSTD_ldm_fillHashTable(ls, ip, iend, &params->ldmParams);
-
     switch(params->cParams.strategy)
     {
     case ZSTD_fast:
-        ZSTD_fillHashTable(ms, iend, dtlm);
+        ZSTD_fillHashTable(ms, iend, dtlm, tfp);
         break;
     case ZSTD_dfast:
-        ZSTD_fillDoubleHashTable(ms, iend, dtlm);
+        ZSTD_fillDoubleHashTable(ms, iend, dtlm, tfp);
         break;
 
     case ZSTD_greedy:
@@ -4174,7 +4683,7 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms,
         } else {
             assert(params->useRowMatchFinder != ZSTD_ps_auto);
             if (params->useRowMatchFinder == ZSTD_ps_enable) {
-                size_t const tagTableSize = ((size_t)1 << params->cParams.hashLog) * sizeof(U16);
+                size_t const tagTableSize = ((size_t)1 << params->cParams.hashLog);
                 ZSTD_memset(ms->tagTable, 0, tagTableSize);
                 ZSTD_row_update(ms, iend-HASH_READ_SIZE);
                 DEBUGLOG(4, "Using row-based hash table for lazy dict");
@@ -4327,6 +4836,7 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs,
                                       ZSTD_CCtx_params const* params,
                                       const void* dict, size_t dictSize,
                                       ZSTD_dictTableLoadMethod_e dtlm,
+                                      ZSTD_tableFillPurpose_e tfp,
                                       void* workspace)
 {
     const BYTE* dictPtr = (const BYTE*)dict;
@@ -4345,7 +4855,7 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs,
     {
         size_t const dictContentSize = (size_t)(dictEnd - dictPtr);
         FORWARD_IF_ERROR(ZSTD_loadDictionaryContent(
-            ms, NULL, ws, params, dictPtr, dictContentSize, dtlm), "");
+            ms, NULL, ws, params, dictPtr, dictContentSize, dtlm, tfp), "");
     }
     return dictID;
 }
@@ -4361,6 +4871,7 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs,
                          const void* dict, size_t dictSize,
                                ZSTD_dictContentType_e dictContentType,
                                ZSTD_dictTableLoadMethod_e dtlm,
+                               ZSTD_tableFillPurpose_e tfp,
                                void* workspace)
 {
     DEBUGLOG(4, "ZSTD_compress_insertDictionary (dictSize=%u)", (U32)dictSize);
@@ -4373,13 +4884,13 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs,
 
     /* dict restricted modes */
     if (dictContentType == ZSTD_dct_rawContent)
-        return ZSTD_loadDictionaryContent(ms, ls, ws, params, dict, dictSize, dtlm);
+        return ZSTD_loadDictionaryContent(ms, ls, ws, params, dict, dictSize, dtlm, tfp);
 
     if (MEM_readLE32(dict) != ZSTD_MAGIC_DICTIONARY) {
         if (dictContentType == ZSTD_dct_auto) {
             DEBUGLOG(4, "raw content dictionary detected");
             return ZSTD_loadDictionaryContent(
-                ms, ls, ws, params, dict, dictSize, dtlm);
+                ms, ls, ws, params, dict, dictSize, dtlm, tfp);
         }
         RETURN_ERROR_IF(dictContentType == ZSTD_dct_fullDict, dictionary_wrong, "");
         assert(0);   /* impossible */
@@ -4387,13 +4898,14 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs,
 
     /* dict as full zstd dictionary */
     return ZSTD_loadZstdDictionary(
-        bs, ms, ws, params, dict, dictSize, dtlm, workspace);
+        bs, ms, ws, params, dict, dictSize, dtlm, tfp, workspace);
 }
 
 #define ZSTD_USE_CDICT_PARAMS_SRCSIZE_CUTOFF (128 KB)
 #define ZSTD_USE_CDICT_PARAMS_DICTSIZE_MULTIPLIER (6ULL)
 
 /*! ZSTD_compressBegin_internal() :
+ * Assumption : either @dict OR @cdict (or none) is non-NULL, never both
  * @return : 0, or an error code */
 static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx,
                                     const void* dict, size_t dictSize,
@@ -4426,11 +4938,11 @@ static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx,
                         cctx->blockState.prevCBlock, &cctx->blockState.matchState,
                         &cctx->ldmState, &cctx->workspace, &cctx->appliedParams, cdict->dictContent,
                         cdict->dictContentSize, cdict->dictContentType, dtlm,
-                        cctx->entropyWorkspace)
+                        ZSTD_tfp_forCCtx, cctx->entropyWorkspace)
               : ZSTD_compress_insertDictionary(
                         cctx->blockState.prevCBlock, &cctx->blockState.matchState,
                         &cctx->ldmState, &cctx->workspace, &cctx->appliedParams, dict, dictSize,
-                        dictContentType, dtlm, cctx->entropyWorkspace);
+                        dictContentType, dtlm, ZSTD_tfp_forCCtx, cctx->entropyWorkspace);
         FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed");
         assert(dictID <= UINT_MAX);
         cctx->dictID = (U32)dictID;
@@ -4471,11 +4983,11 @@ size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx,
                                             &cctxParams, pledgedSrcSize);
 }
 
-size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel)
+static size_t
+ZSTD_compressBegin_usingDict_deprecated(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel)
 {
     ZSTD_CCtx_params cctxParams;
-    {
-        ZSTD_parameters const params = ZSTD_getParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_noAttachDict);
+    {   ZSTD_parameters const params = ZSTD_getParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_noAttachDict);
         ZSTD_CCtxParams_init_internal(&cctxParams, &params, (compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT : compressionLevel);
     }
     DEBUGLOG(4, "ZSTD_compressBegin_usingDict (dictSize=%u)", (unsigned)dictSize);
@@ -4483,9 +4995,15 @@ size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t di
                                        &cctxParams, ZSTD_CONTENTSIZE_UNKNOWN, ZSTDb_not_buffered);
 }
 
+size_t
+ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel)
+{
+    return ZSTD_compressBegin_usingDict_deprecated(cctx, dict, dictSize, compressionLevel);
+}
+
 size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel)
 {
-    return ZSTD_compressBegin_usingDict(cctx, NULL, 0, compressionLevel);
+    return ZSTD_compressBegin_usingDict_deprecated(cctx, NULL, 0, compressionLevel);
 }
 
 
@@ -4537,9 +5055,9 @@ void ZSTD_CCtx_trace(ZSTD_CCtx* cctx, size_t extraCSize)
     (void)extraCSize;
 }
 
-size_t ZSTD_compressEnd (ZSTD_CCtx* cctx,
-                         void* dst, size_t dstCapacity,
-                   const void* src, size_t srcSize)
+size_t ZSTD_compressEnd_public(ZSTD_CCtx* cctx,
+                               void* dst, size_t dstCapacity,
+                         const void* src, size_t srcSize)
 {
     size_t endResult;
     size_t const cSize = ZSTD_compressContinue_internal(cctx,
@@ -4563,6 +5081,14 @@ size_t ZSTD_compressEnd (ZSTD_CCtx* cctx,
     return cSize + endResult;
 }
 
+/* NOTE: Must just wrap ZSTD_compressEnd_public() */
+size_t ZSTD_compressEnd(ZSTD_CCtx* cctx,
+                        void* dst, size_t dstCapacity,
+                  const void* src, size_t srcSize)
+{
+    return ZSTD_compressEnd_public(cctx, dst, dstCapacity, src, srcSize);
+}
+
 size_t ZSTD_compress_advanced (ZSTD_CCtx* cctx,
                                void* dst, size_t dstCapacity,
                          const void* src, size_t srcSize,
@@ -4591,7 +5117,7 @@ size_t ZSTD_compress_advanced_internal(
     FORWARD_IF_ERROR( ZSTD_compressBegin_internal(cctx,
                          dict, dictSize, ZSTD_dct_auto, ZSTD_dtlm_fast, NULL,
                          params, srcSize, ZSTDb_not_buffered) , "");
-    return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize);
+    return ZSTD_compressEnd_public(cctx, dst, dstCapacity, src, srcSize);
 }
 
 size_t ZSTD_compress_usingDict(ZSTD_CCtx* cctx,
@@ -4709,7 +5235,7 @@ static size_t ZSTD_initCDict_internal(
         {   size_t const dictID = ZSTD_compress_insertDictionary(
                     &cdict->cBlockState, &cdict->matchState, NULL, &cdict->workspace,
                     &params, cdict->dictContent, cdict->dictContentSize,
-                    dictContentType, ZSTD_dtlm_full, cdict->entropyWorkspace);
+                    dictContentType, ZSTD_dtlm_full, ZSTD_tfp_forCDict, cdict->entropyWorkspace);
             FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed");
             assert(dictID <= (size_t)(U32)-1);
             cdict->dictID = (U32)dictID;
@@ -4906,6 +5432,7 @@ const ZSTD_CDict* ZSTD_initStaticCDict(
     params.cParams = cParams;
     params.useRowMatchFinder = useRowMatchFinder;
     cdict->useRowMatchFinder = useRowMatchFinder;
+    cdict->compressionLevel = ZSTD_NO_CLEVEL;
 
     if (ZSTD_isError( ZSTD_initCDict_internal(cdict,
                                               dict, dictSize,
@@ -4985,12 +5512,17 @@ size_t ZSTD_compressBegin_usingCDict_advanced(
 
 /* ZSTD_compressBegin_usingCDict() :
  * cdict must be != NULL */
-size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict)
+size_t ZSTD_compressBegin_usingCDict_deprecated(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict)
 {
     ZSTD_frameParameters const fParams = { 0 /*content*/, 0 /*checksum*/, 0 /*noDictID*/ };
     return ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, ZSTD_CONTENTSIZE_UNKNOWN);
 }
 
+size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict)
+{
+    return ZSTD_compressBegin_usingCDict_deprecated(cctx, cdict);
+}
+
 /*! ZSTD_compress_usingCDict_internal():
  * Implementation of various ZSTD_compress_usingCDict* functions.
  */
@@ -5000,7 +5532,7 @@ static size_t ZSTD_compress_usingCDict_internal(ZSTD_CCtx* cctx,
                                 const ZSTD_CDict* cdict, ZSTD_frameParameters fParams)
 {
     FORWARD_IF_ERROR(ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, srcSize), ""); /* will check if cdict != NULL */
-    return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize);
+    return ZSTD_compressEnd_public(cctx, dst, dstCapacity, src, srcSize);
 }
 
 /*! ZSTD_compress_usingCDict_advanced():
@@ -5197,30 +5729,41 @@ size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel)
 
 static size_t ZSTD_nextInputSizeHint(const ZSTD_CCtx* cctx)
 {
-    size_t hintInSize = cctx->inBuffTarget - cctx->inBuffPos;
-    if (hintInSize==0) hintInSize = cctx->blockSize;
-    return hintInSize;
+    if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) {
+        return cctx->blockSize - cctx->stableIn_notConsumed;
+    }
+    assert(cctx->appliedParams.inBufferMode == ZSTD_bm_buffered);
+    {   size_t hintInSize = cctx->inBuffTarget - cctx->inBuffPos;
+        if (hintInSize==0) hintInSize = cctx->blockSize;
+        return hintInSize;
+    }
 }
 
 /* ZSTD_compressStream_generic():
  *  internal function for all *compressStream*() variants
- *  non-static, because can be called from zstdmt_compress.c
- * @return : hint size for next input */
+ * @return : hint size for next input to complete ongoing block */
 static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs,
                                           ZSTD_outBuffer* output,
                                           ZSTD_inBuffer* input,
                                           ZSTD_EndDirective const flushMode)
 {
-    const char* const istart = (const char*)input->src;
-    const char* const iend = input->size != 0 ? istart + input->size : istart;
-    const char* ip = input->pos != 0 ? istart + input->pos : istart;
-    char* const ostart = (char*)output->dst;
-    char* const oend = output->size != 0 ? ostart + output->size : ostart;
-    char* op = output->pos != 0 ? ostart + output->pos : ostart;
+    const char* const istart = (assert(input != NULL), (const char*)input->src);
+    const char* const iend = (istart != NULL) ? istart + input->size : istart;
+    const char* ip = (istart != NULL) ? istart + input->pos : istart;
+    char* const ostart = (assert(output != NULL), (char*)output->dst);
+    char* const oend = (ostart != NULL) ? ostart + output->size : ostart;
+    char* op = (ostart != NULL) ? ostart + output->pos : ostart;
     U32 someMoreWork = 1;
 
     /* check expectations */
-    DEBUGLOG(5, "ZSTD_compressStream_generic, flush=%u", (unsigned)flushMode);
+    DEBUGLOG(5, "ZSTD_compressStream_generic, flush=%i, srcSize = %zu", (int)flushMode, input->size - input->pos);
+    assert(zcs != NULL);
+    if (zcs->appliedParams.inBufferMode == ZSTD_bm_stable) {
+        assert(input->pos >= zcs->stableIn_notConsumed);
+        input->pos -= zcs->stableIn_notConsumed;
+        ip -= zcs->stableIn_notConsumed;
+        zcs->stableIn_notConsumed = 0;
+    }
     if (zcs->appliedParams.inBufferMode == ZSTD_bm_buffered) {
         assert(zcs->inBuff != NULL);
         assert(zcs->inBuffSize > 0);
@@ -5229,8 +5772,10 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs,
         assert(zcs->outBuff !=  NULL);
         assert(zcs->outBuffSize > 0);
     }
-    assert(output->pos <= output->size);
+    if (input->src == NULL) assert(input->size == 0);
     assert(input->pos <= input->size);
+    if (output->dst == NULL) assert(output->size == 0);
+    assert(output->pos <= output->size);
     assert((U32)flushMode <= (U32)ZSTD_e_end);
 
     while (someMoreWork) {
@@ -5245,7 +5790,7 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs,
                 || zcs->appliedParams.outBufferMode == ZSTD_bm_stable)  /* OR we are allowed to return dstSizeTooSmall */
               && (zcs->inBuffPos == 0) ) {
                 /* shortcut to compression pass directly into output buffer */
-                size_t const cSize = ZSTD_compressEnd(zcs,
+                size_t const cSize = ZSTD_compressEnd_public(zcs,
                                                 op, oend-op, ip, iend-ip);
                 DEBUGLOG(4, "ZSTD_compressEnd : cSize=%u", (unsigned)cSize);
                 FORWARD_IF_ERROR(cSize, "ZSTD_compressEnd failed");
@@ -5262,8 +5807,7 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs,
                                         zcs->inBuff + zcs->inBuffPos, toLoad,
                                         ip, iend-ip);
                 zcs->inBuffPos += loaded;
-                if (loaded != 0)
-                    ip += loaded;
+                if (ip) ip += loaded;
                 if ( (flushMode == ZSTD_e_continue)
                   && (zcs->inBuffPos < zcs->inBuffTarget) ) {
                     /* not enough input to fill full block : stop here */
@@ -5274,6 +5818,20 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs,
                     /* empty */
                     someMoreWork = 0; break;
                 }
+            } else {
+                assert(zcs->appliedParams.inBufferMode == ZSTD_bm_stable);
+                if ( (flushMode == ZSTD_e_continue)
+                  && ( (size_t)(iend - ip) < zcs->blockSize) ) {
+                    /* can't compress a full block : stop here */
+                    zcs->stableIn_notConsumed = (size_t)(iend - ip);
+                    ip = iend;  /* pretend to have consumed input */
+                    someMoreWork = 0; break;
+                }
+                if ( (flushMode == ZSTD_e_flush)
+                  && (ip == iend) ) {
+                    /* empty */
+                    someMoreWork = 0; break;
+                }
             }
             /* compress current block (note : this stage cannot be stopped in the middle) */
             DEBUGLOG(5, "stream compression stage (flushMode==%u)", flushMode);
@@ -5281,9 +5839,8 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs,
                 void* cDst;
                 size_t cSize;
                 size_t oSize = oend-op;
-                size_t const iSize = inputBuffered
-                    ? zcs->inBuffPos - zcs->inToCompress
-                    : MIN((size_t)(iend - ip), zcs->blockSize);
+                size_t const iSize = inputBuffered ? zcs->inBuffPos - zcs->inToCompress
+                                                   : MIN((size_t)(iend - ip), zcs->blockSize);
                 if (oSize >= ZSTD_compressBound(iSize) || zcs->appliedParams.outBufferMode == ZSTD_bm_stable)
                     cDst = op;   /* compress into output buffer, to skip flush stage */
                 else
@@ -5291,9 +5848,9 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs,
                 if (inputBuffered) {
                     unsigned const lastBlock = (flushMode == ZSTD_e_end) && (ip==iend);
                     cSize = lastBlock ?
-                            ZSTD_compressEnd(zcs, cDst, oSize,
+                            ZSTD_compressEnd_public(zcs, cDst, oSize,
                                         zcs->inBuff + zcs->inToCompress, iSize) :
-                            ZSTD_compressContinue(zcs, cDst, oSize,
+                            ZSTD_compressContinue_public(zcs, cDst, oSize,
                                         zcs->inBuff + zcs->inToCompress, iSize);
                     FORWARD_IF_ERROR(cSize, "%s", lastBlock ? "ZSTD_compressEnd failed" : "ZSTD_compressContinue failed");
                     zcs->frameEnded = lastBlock;
@@ -5306,19 +5863,16 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs,
                     if (!lastBlock)
                         assert(zcs->inBuffTarget <= zcs->inBuffSize);
                     zcs->inToCompress = zcs->inBuffPos;
-                } else {
-                    unsigned const lastBlock = (ip + iSize == iend);
-                    assert(flushMode == ZSTD_e_end /* Already validated */);
+                } else { /* !inputBuffered, hence ZSTD_bm_stable */
+                    unsigned const lastBlock = (flushMode == ZSTD_e_end) && (ip + iSize == iend);
                     cSize = lastBlock ?
-                            ZSTD_compressEnd(zcs, cDst, oSize, ip, iSize) :
-                            ZSTD_compressContinue(zcs, cDst, oSize, ip, iSize);
+                            ZSTD_compressEnd_public(zcs, cDst, oSize, ip, iSize) :
+                            ZSTD_compressContinue_public(zcs, cDst, oSize, ip, iSize);
                     /* Consume the input prior to error checking to mirror buffered mode. */
-                    if (iSize > 0)
-                        ip += iSize;
+                    if (ip) ip += iSize;
                     FORWARD_IF_ERROR(cSize, "%s", lastBlock ? "ZSTD_compressEnd failed" : "ZSTD_compressContinue failed");
                     zcs->frameEnded = lastBlock;
-                    if (lastBlock)
-                        assert(ip == iend);
+                    if (lastBlock) assert(ip == iend);
                 }
                 if (cDst == op) {  /* no need to flush */
                     op += cSize;
@@ -5388,8 +5942,10 @@ size_t ZSTD_compressStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output, ZSTD_inBuf
 /* After a compression call set the expected input/output buffer.
  * This is validated at the start of the next compression call.
  */
-static void ZSTD_setBufferExpectations(ZSTD_CCtx* cctx, ZSTD_outBuffer const* output, ZSTD_inBuffer const* input)
+static void
+ZSTD_setBufferExpectations(ZSTD_CCtx* cctx, const ZSTD_outBuffer* output, const ZSTD_inBuffer* input)
 {
+    DEBUGLOG(5, "ZSTD_setBufferExpectations (for advanced stable in/out modes)");
     if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) {
         cctx->expectedInBuffer = *input;
     }
@@ -5408,22 +5964,22 @@ static size_t ZSTD_checkBufferStability(ZSTD_CCtx const* cctx,
 {
     if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) {
         ZSTD_inBuffer const expect = cctx->expectedInBuffer;
-        if (expect.src != input->src || expect.pos != input->pos || expect.size != input->size)
-            RETURN_ERROR(srcBuffer_wrong, "ZSTD_c_stableInBuffer enabled but input differs!");
-        if (endOp != ZSTD_e_end)
-            RETURN_ERROR(srcBuffer_wrong, "ZSTD_c_stableInBuffer can only be used with ZSTD_e_end!");
+        if (expect.src != input->src || expect.pos != input->pos)
+            RETURN_ERROR(stabilityCondition_notRespected, "ZSTD_c_stableInBuffer enabled but input differs!");
     }
+    (void)endOp;
     if (cctx->appliedParams.outBufferMode == ZSTD_bm_stable) {
         size_t const outBufferSize = output->size - output->pos;
         if (cctx->expectedOutBufferSize != outBufferSize)
-            RETURN_ERROR(dstBuffer_wrong, "ZSTD_c_stableOutBuffer enabled but output size differs!");
+            RETURN_ERROR(stabilityCondition_notRespected, "ZSTD_c_stableOutBuffer enabled but output size differs!");
     }
     return 0;
 }
 
 static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx,
                                              ZSTD_EndDirective endOp,
-                                             size_t inSize) {
+                                             size_t inSize)
+{
     ZSTD_CCtx_params params = cctx->requestedParams;
     ZSTD_prefixDict const prefixDict = cctx->prefixDict;
     FORWARD_IF_ERROR( ZSTD_initLocalDict(cctx) , ""); /* Init the local dict if present. */
@@ -5437,9 +5993,9 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx,
         params.compressionLevel = cctx->cdict->compressionLevel;
     }
     DEBUGLOG(4, "ZSTD_compressStream2 : transparent init stage");
-    if (endOp == ZSTD_e_end) cctx->pledgedSrcSizePlusOne = inSize + 1;  /* auto-fix pledgedSrcSize */
-    {
-        size_t const dictSize = prefixDict.dict
+    if (endOp == ZSTD_e_end) cctx->pledgedSrcSizePlusOne = inSize + 1;  /* auto-determine pledgedSrcSize */
+
+    {   size_t const dictSize = prefixDict.dict
                 ? prefixDict.dictSize
                 : (cctx->cdict ? cctx->cdict->dictContentSize : 0);
         ZSTD_cParamMode_e const mode = ZSTD_getCParamMode(cctx->cdict, &params, cctx->pledgedSrcSizePlusOne - 1);
@@ -5451,6 +6007,9 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx,
     params.useBlockSplitter = ZSTD_resolveBlockSplitterMode(params.useBlockSplitter, &params.cParams);
     params.ldmParams.enableLdm = ZSTD_resolveEnableLdm(params.ldmParams.enableLdm, &params.cParams);
     params.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params.useRowMatchFinder, &params.cParams);
+    params.validateSequences = ZSTD_resolveExternalSequenceValidation(params.validateSequences);
+    params.maxBlockSize = ZSTD_resolveMaxBlockSize(params.maxBlockSize);
+    params.searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(params.searchForExternalRepcodes, params.compressionLevel);
 
     {   U64 const pledgedSrcSize = cctx->pledgedSrcSizePlusOne - 1;
         assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams)));
@@ -5477,6 +6036,8 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx,
     return 0;
 }
 
+/* @return provides a minimum amount of data remaining to be flushed from internal buffers
+ */
 size_t ZSTD_compressStream2( ZSTD_CCtx* cctx,
                              ZSTD_outBuffer* output,
                              ZSTD_inBuffer* input,
@@ -5491,8 +6052,27 @@ size_t ZSTD_compressStream2( ZSTD_CCtx* cctx,
 
     /* transparent initialization stage */
     if (cctx->streamStage == zcss_init) {
-        FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, endOp, input->size), "CompressStream2 initialization failed");
-        ZSTD_setBufferExpectations(cctx, output, input);    /* Set initial buffer expectations now that we've initialized */
+        size_t const inputSize = input->size - input->pos;  /* no obligation to start from pos==0 */
+        size_t const totalInputSize = inputSize + cctx->stableIn_notConsumed;
+        if ( (cctx->requestedParams.inBufferMode == ZSTD_bm_stable) /* input is presumed stable, across invocations */
+          && (endOp == ZSTD_e_continue)                             /* no flush requested, more input to come */
+          && (totalInputSize < ZSTD_BLOCKSIZE_MAX) ) {              /* not even reached one block yet */
+            if (cctx->stableIn_notConsumed) {  /* not the first time */
+                /* check stable source guarantees */
+                RETURN_ERROR_IF(input->src != cctx->expectedInBuffer.src, stabilityCondition_notRespected, "stableInBuffer condition not respected: wrong src pointer");
+                RETURN_ERROR_IF(input->pos != cctx->expectedInBuffer.size, stabilityCondition_notRespected, "stableInBuffer condition not respected: externally modified pos");
+            }
+            /* pretend input was consumed, to give a sense forward progress */
+            input->pos = input->size;
+            /* save stable inBuffer, for later control, and flush/end */
+            cctx->expectedInBuffer = *input;
+            /* but actually input wasn't consumed, so keep track of position from where compression shall resume */
+            cctx->stableIn_notConsumed += inputSize;
+            /* don't initialize yet, wait for the first block of flush() order, for better parameters adaptation */
+            return ZSTD_FRAMEHEADERSIZE_MIN(cctx->requestedParams.format);  /* at least some header to produce */
+        }
+        FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, endOp, totalInputSize), "compressStream2 initialization failed");
+        ZSTD_setBufferExpectations(cctx, output, input);   /* Set initial buffer expectations now that we've initialized */
     }
     /* end of transparent initialization stage */
 
@@ -5510,13 +6090,20 @@ size_t ZSTD_compressStream2_simpleArgs (
                       const void* src, size_t srcSize, size_t* srcPos,
                             ZSTD_EndDirective endOp)
 {
-    ZSTD_outBuffer output = { dst, dstCapacity, *dstPos };
-    ZSTD_inBuffer  input  = { src, srcSize, *srcPos };
+    ZSTD_outBuffer output;
+    ZSTD_inBuffer  input;
+    output.dst = dst;
+    output.size = dstCapacity;
+    output.pos = *dstPos;
+    input.src = src;
+    input.size = srcSize;
+    input.pos = *srcPos;
     /* ZSTD_compressStream2() will check validity of dstPos and srcPos */
-    size_t const cErr = ZSTD_compressStream2(cctx, &output, &input, endOp);
-    *dstPos = output.pos;
-    *srcPos = input.pos;
-    return cErr;
+    {   size_t const cErr = ZSTD_compressStream2(cctx, &output, &input, endOp);
+        *dstPos = output.pos;
+        *srcPos = input.pos;
+        return cErr;
+    }
 }
 
 size_t ZSTD_compress2(ZSTD_CCtx* cctx,
@@ -5539,6 +6126,7 @@ size_t ZSTD_compress2(ZSTD_CCtx* cctx,
         /* Reset to the original values. */
         cctx->requestedParams.inBufferMode = originalInBufferMode;
         cctx->requestedParams.outBufferMode = originalOutBufferMode;
+
         FORWARD_IF_ERROR(result, "ZSTD_compressStream2_simpleArgs failed");
         if (result != 0) {  /* compression not completed, due to lack of output space */
             assert(oPos == dstCapacity);
@@ -5549,64 +6137,61 @@ size_t ZSTD_compress2(ZSTD_CCtx* cctx,
     }
 }
 
-typedef struct {
-    U32 idx;             /* Index in array of ZSTD_Sequence */
-    U32 posInSequence;   /* Position within sequence at idx */
-    size_t posInSrc;        /* Number of bytes given by sequences provided so far */
-} ZSTD_sequencePosition;
-
 /* ZSTD_validateSequence() :
  * @offCode : is presumed to follow format required by ZSTD_storeSeq()
  * @returns a ZSTD error code if sequence is not valid
  */
 static size_t
-ZSTD_validateSequence(U32 offCode, U32 matchLength,
-                      size_t posInSrc, U32 windowLog, size_t dictSize)
+ZSTD_validateSequence(U32 offCode, U32 matchLength, U32 minMatch,
+                      size_t posInSrc, U32 windowLog, size_t dictSize, int useSequenceProducer)
 {
-    U32 const windowSize = 1 << windowLog;
+    U32 const windowSize = 1u << windowLog;
     /* posInSrc represents the amount of data the decoder would decode up to this point.
      * As long as the amount of data decoded is less than or equal to window size, offsets may be
      * larger than the total length of output decoded in order to reference the dict, even larger than
      * window size. After output surpasses windowSize, we're limited to windowSize offsets again.
      */
     size_t const offsetBound = posInSrc > windowSize ? (size_t)windowSize : posInSrc + (size_t)dictSize;
-    RETURN_ERROR_IF(offCode > STORE_OFFSET(offsetBound), corruption_detected, "Offset too large!");
-    RETURN_ERROR_IF(matchLength < MINMATCH, corruption_detected, "Matchlength too small");
+    size_t const matchLenLowerBound = (minMatch == 3 || useSequenceProducer) ? 3 : 4;
+    RETURN_ERROR_IF(offCode > OFFSET_TO_OFFBASE(offsetBound), externalSequences_invalid, "Offset too large!");
+    /* Validate maxNbSeq is large enough for the given matchLength and minMatch */
+    RETURN_ERROR_IF(matchLength < matchLenLowerBound, externalSequences_invalid, "Matchlength too small for the minMatch");
     return 0;
 }
 
 /* Returns an offset code, given a sequence's raw offset, the ongoing repcode array, and whether litLength == 0 */
-static U32 ZSTD_finalizeOffCode(U32 rawOffset, const U32 rep[ZSTD_REP_NUM], U32 ll0)
+static U32 ZSTD_finalizeOffBase(U32 rawOffset, const U32 rep[ZSTD_REP_NUM], U32 ll0)
 {
-    U32 offCode = STORE_OFFSET(rawOffset);
+    U32 offBase = OFFSET_TO_OFFBASE(rawOffset);
 
     if (!ll0 && rawOffset == rep[0]) {
-        offCode = STORE_REPCODE_1;
+        offBase = REPCODE1_TO_OFFBASE;
     } else if (rawOffset == rep[1]) {
-        offCode = STORE_REPCODE(2 - ll0);
+        offBase = REPCODE_TO_OFFBASE(2 - ll0);
     } else if (rawOffset == rep[2]) {
-        offCode = STORE_REPCODE(3 - ll0);
+        offBase = REPCODE_TO_OFFBASE(3 - ll0);
     } else if (ll0 && rawOffset == rep[0] - 1) {
-        offCode = STORE_REPCODE_3;
+        offBase = REPCODE3_TO_OFFBASE;
     }
-    return offCode;
+    return offBase;
 }
 
-/* Returns 0 on success, and a ZSTD_error otherwise. This function scans through an array of
- * ZSTD_Sequence, storing the sequences it finds, until it reaches a block delimiter.
- */
-static size_t
+size_t
 ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx,
                                               ZSTD_sequencePosition* seqPos,
                                         const ZSTD_Sequence* const inSeqs, size_t inSeqsSize,
-                                        const void* src, size_t blockSize)
+                                        const void* src, size_t blockSize,
+                                        ZSTD_paramSwitch_e externalRepSearch)
 {
     U32 idx = seqPos->idx;
+    U32 const startIdx = idx;
     BYTE const* ip = (BYTE const*)(src);
     const BYTE* const iend = ip + blockSize;
     repcodes_t updatedRepcodes;
     U32 dictSize;
 
+    DEBUGLOG(5, "ZSTD_copySequencesToSeqStoreExplicitBlockDelim (blockSize = %zu)", blockSize);
+
     if (cctx->cdict) {
         dictSize = (U32)cctx->cdict->dictContentSize;
     } else if (cctx->prefixDict.dict) {
@@ -5615,25 +6200,55 @@ ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx,
         dictSize = 0;
     }
     ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(repcodes_t));
-    for (; (inSeqs[idx].matchLength != 0 || inSeqs[idx].offset != 0) && idx < inSeqsSize; ++idx) {
+    for (; idx < inSeqsSize && (inSeqs[idx].matchLength != 0 || inSeqs[idx].offset != 0); ++idx) {
         U32 const litLength = inSeqs[idx].litLength;
-        U32 const ll0 = (litLength == 0);
         U32 const matchLength = inSeqs[idx].matchLength;
-        U32 const offCode = ZSTD_finalizeOffCode(inSeqs[idx].offset, updatedRepcodes.rep, ll0);
-        ZSTD_updateRep(updatedRepcodes.rep, offCode, ll0);
+        U32 offBase;
 
-        DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offCode, matchLength, litLength);
+        if (externalRepSearch == ZSTD_ps_disable) {
+            offBase = OFFSET_TO_OFFBASE(inSeqs[idx].offset);
+        } else {
+            U32 const ll0 = (litLength == 0);
+            offBase = ZSTD_finalizeOffBase(inSeqs[idx].offset, updatedRepcodes.rep, ll0);
+            ZSTD_updateRep(updatedRepcodes.rep, offBase, ll0);
+        }
+
+        DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offBase, matchLength, litLength);
         if (cctx->appliedParams.validateSequences) {
             seqPos->posInSrc += litLength + matchLength;
-            FORWARD_IF_ERROR(ZSTD_validateSequence(offCode, matchLength, seqPos->posInSrc,
-                                                cctx->appliedParams.cParams.windowLog, dictSize),
+            FORWARD_IF_ERROR(ZSTD_validateSequence(offBase, matchLength, cctx->appliedParams.cParams.minMatch, seqPos->posInSrc,
+                                                cctx->appliedParams.cParams.windowLog, dictSize, cctx->appliedParams.useSequenceProducer),
                                                 "Sequence validation failed");
         }
-        RETURN_ERROR_IF(idx - seqPos->idx > cctx->seqStore.maxNbSeq, memory_allocation,
+        RETURN_ERROR_IF(idx - seqPos->idx >= cctx->seqStore.maxNbSeq, externalSequences_invalid,
                         "Not enough memory allocated. Try adjusting ZSTD_c_minMatch.");
-        ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offCode, matchLength);
+        ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offBase, matchLength);
         ip += matchLength + litLength;
     }
+
+    /* If we skipped repcode search while parsing, we need to update repcodes now */
+    assert(externalRepSearch != ZSTD_ps_auto);
+    assert(idx >= startIdx);
+    if (externalRepSearch == ZSTD_ps_disable && idx != startIdx) {
+        U32* const rep = updatedRepcodes.rep;
+        U32 lastSeqIdx = idx - 1; /* index of last non-block-delimiter sequence */
+
+        if (lastSeqIdx >= startIdx + 2) {
+            rep[2] = inSeqs[lastSeqIdx - 2].offset;
+            rep[1] = inSeqs[lastSeqIdx - 1].offset;
+            rep[0] = inSeqs[lastSeqIdx].offset;
+        } else if (lastSeqIdx == startIdx + 1) {
+            rep[2] = rep[0];
+            rep[1] = inSeqs[lastSeqIdx - 1].offset;
+            rep[0] = inSeqs[lastSeqIdx].offset;
+        } else {
+            assert(lastSeqIdx == startIdx);
+            rep[2] = rep[1];
+            rep[1] = rep[0];
+            rep[0] = inSeqs[lastSeqIdx].offset;
+        }
+    }
+
     ZSTD_memcpy(cctx->blockState.nextCBlock->rep, updatedRepcodes.rep, sizeof(repcodes_t));
 
     if (inSeqs[idx].litLength) {
@@ -5642,26 +6257,15 @@ ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx,
         ip += inSeqs[idx].litLength;
         seqPos->posInSrc += inSeqs[idx].litLength;
     }
-    RETURN_ERROR_IF(ip != iend, corruption_detected, "Blocksize doesn't agree with block delimiter!");
+    RETURN_ERROR_IF(ip != iend, externalSequences_invalid, "Blocksize doesn't agree with block delimiter!");
     seqPos->idx = idx+1;
     return 0;
 }
 
-/* Returns the number of bytes to move the current read position back by. Only non-zero
- * if we ended up splitting a sequence. Otherwise, it may return a ZSTD error if something
- * went wrong.
- *
- * This function will attempt to scan through blockSize bytes represented by the sequences
- * in inSeqs, storing any (partial) sequences.
- *
- * Occasionally, we may want to change the actual number of bytes we consumed from inSeqs to
- * avoid splitting a match, or to avoid splitting a match such that it would produce a match
- * smaller than MINMATCH. In this case, we return the number of bytes that we didn't read from this block.
- */
-static size_t
+size_t
 ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos,
                                    const ZSTD_Sequence* const inSeqs, size_t inSeqsSize,
-                                   const void* src, size_t blockSize)
+                                   const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch)
 {
     U32 idx = seqPos->idx;
     U32 startPosInSequence = seqPos->posInSequence;
@@ -5673,6 +6277,9 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition*
     U32 bytesAdjustment = 0;
     U32 finalMatchSplit = 0;
 
+    /* TODO(embg) support fast parsing mode in noBlockDelim mode */
+    (void)externalRepSearch;
+
     if (cctx->cdict) {
         dictSize = cctx->cdict->dictContentSize;
     } else if (cctx->prefixDict.dict) {
@@ -5680,7 +6287,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition*
     } else {
         dictSize = 0;
     }
-    DEBUGLOG(5, "ZSTD_copySequencesToSeqStore: idx: %u PIS: %u blockSize: %zu", idx, startPosInSequence, blockSize);
+    DEBUGLOG(5, "ZSTD_copySequencesToSeqStoreNoBlockDelim: idx: %u PIS: %u blockSize: %zu", idx, startPosInSequence, blockSize);
     DEBUGLOG(5, "Start seq: idx: %u (of: %u ml: %u ll: %u)", idx, inSeqs[idx].offset, inSeqs[idx].matchLength, inSeqs[idx].litLength);
     ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(repcodes_t));
     while (endPosInSequence && idx < inSeqsSize && !finalMatchSplit) {
@@ -5688,7 +6295,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition*
         U32 litLength = currSeq.litLength;
         U32 matchLength = currSeq.matchLength;
         U32 const rawOffset = currSeq.offset;
-        U32 offCode;
+        U32 offBase;
 
         /* Modify the sequence depending on where endPosInSequence lies */
         if (endPosInSequence >= currSeq.litLength + currSeq.matchLength) {
@@ -5702,7 +6309,6 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition*
             /* Move to the next sequence */
             endPosInSequence -= currSeq.litLength + currSeq.matchLength;
             startPosInSequence = 0;
-            idx++;
         } else {
             /* This is the final (partial) sequence we're adding from inSeqs, and endPosInSequence
                does not reach the end of the match. So, we have to split the sequence */
@@ -5742,21 +6348,23 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition*
         }
         /* Check if this offset can be represented with a repcode */
         {   U32 const ll0 = (litLength == 0);
-            offCode = ZSTD_finalizeOffCode(rawOffset, updatedRepcodes.rep, ll0);
-            ZSTD_updateRep(updatedRepcodes.rep, offCode, ll0);
+            offBase = ZSTD_finalizeOffBase(rawOffset, updatedRepcodes.rep, ll0);
+            ZSTD_updateRep(updatedRepcodes.rep, offBase, ll0);
         }
 
         if (cctx->appliedParams.validateSequences) {
             seqPos->posInSrc += litLength + matchLength;
-            FORWARD_IF_ERROR(ZSTD_validateSequence(offCode, matchLength, seqPos->posInSrc,
-                                                   cctx->appliedParams.cParams.windowLog, dictSize),
+            FORWARD_IF_ERROR(ZSTD_validateSequence(offBase, matchLength, cctx->appliedParams.cParams.minMatch, seqPos->posInSrc,
+                                                   cctx->appliedParams.cParams.windowLog, dictSize, cctx->appliedParams.useSequenceProducer),
                                                    "Sequence validation failed");
         }
-        DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offCode, matchLength, litLength);
-        RETURN_ERROR_IF(idx - seqPos->idx > cctx->seqStore.maxNbSeq, memory_allocation,
+        DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offBase, matchLength, litLength);
+        RETURN_ERROR_IF(idx - seqPos->idx >= cctx->seqStore.maxNbSeq, externalSequences_invalid,
                         "Not enough memory allocated. Try adjusting ZSTD_c_minMatch.");
-        ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offCode, matchLength);
+        ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offBase, matchLength);
         ip += matchLength + litLength;
+        if (!finalMatchSplit)
+            idx++; /* Next Sequence */
     }
     DEBUGLOG(5, "Ending seq: idx: %u (of: %u ml: %u ll: %u)", idx, inSeqs[idx].offset, inSeqs[idx].matchLength, inSeqs[idx].litLength);
     assert(idx == inSeqsSize || endPosInSequence <= inSeqs[idx].litLength + inSeqs[idx].matchLength);
@@ -5779,7 +6387,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition*
 
 typedef size_t (*ZSTD_sequenceCopier) (ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos,
                                        const ZSTD_Sequence* const inSeqs, size_t inSeqsSize,
-                                       const void* src, size_t blockSize);
+                                       const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch);
 static ZSTD_sequenceCopier ZSTD_selectSequenceCopier(ZSTD_sequenceFormat_e mode)
 {
     ZSTD_sequenceCopier sequenceCopier = NULL;
@@ -5793,6 +6401,57 @@ static ZSTD_sequenceCopier ZSTD_selectSequenceCopier(ZSTD_sequenceFormat_e mode)
     return sequenceCopier;
 }
 
+/* Discover the size of next block by searching for the delimiter.
+ * Note that a block delimiter **must** exist in this mode,
+ * otherwise it's an input error.
+ * The block size retrieved will be later compared to ensure it remains within bounds */
+static size_t
+blockSize_explicitDelimiter(const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ZSTD_sequencePosition seqPos)
+{
+    int end = 0;
+    size_t blockSize = 0;
+    size_t spos = seqPos.idx;
+    DEBUGLOG(6, "blockSize_explicitDelimiter : seq %zu / %zu", spos, inSeqsSize);
+    assert(spos <= inSeqsSize);
+    while (spos < inSeqsSize) {
+        end = (inSeqs[spos].offset == 0);
+        blockSize += inSeqs[spos].litLength + inSeqs[spos].matchLength;
+        if (end) {
+            if (inSeqs[spos].matchLength != 0)
+                RETURN_ERROR(externalSequences_invalid, "delimiter format error : both matchlength and offset must be == 0");
+            break;
+        }
+        spos++;
+    }
+    if (!end)
+        RETURN_ERROR(externalSequences_invalid, "Reached end of sequences without finding a block delimiter");
+    return blockSize;
+}
+
+/* More a "target" block size */
+static size_t blockSize_noDelimiter(size_t blockSize, size_t remaining)
+{
+    int const lastBlock = (remaining <= blockSize);
+    return lastBlock ? remaining : blockSize;
+}
+
+static size_t determine_blockSize(ZSTD_sequenceFormat_e mode,
+                           size_t blockSize, size_t remaining,
+                     const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ZSTD_sequencePosition seqPos)
+{
+    DEBUGLOG(6, "determine_blockSize : remainingSize = %zu", remaining);
+    if (mode == ZSTD_sf_noBlockDelimiters)
+        return blockSize_noDelimiter(blockSize, remaining);
+    {   size_t const explicitBlockSize = blockSize_explicitDelimiter(inSeqs, inSeqsSize, seqPos);
+        FORWARD_IF_ERROR(explicitBlockSize, "Error while determining block size with explicit delimiters");
+        if (explicitBlockSize > blockSize)
+            RETURN_ERROR(externalSequences_invalid, "sequences incorrectly define a too large block");
+        if (explicitBlockSize > remaining)
+            RETURN_ERROR(externalSequences_invalid, "sequences define a frame longer than source");
+        return explicitBlockSize;
+    }
+}
+
 /* Compress, block-by-block, all of the sequences given.
  *
  * Returns the cumulative size of all compressed blocks (including their headers),
@@ -5805,9 +6464,6 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx,
                           const void* src, size_t srcSize)
 {
     size_t cSize = 0;
-    U32 lastBlock;
-    size_t blockSize;
-    size_t compressedSeqsSize;
     size_t remaining = srcSize;
     ZSTD_sequencePosition seqPos = {0, 0, 0};
 
@@ -5827,22 +6483,29 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx,
     }
 
     while (remaining) {
+        size_t compressedSeqsSize;
         size_t cBlockSize;
         size_t additionalByteAdjustment;
-        lastBlock = remaining <= cctx->blockSize;
-        blockSize = lastBlock ? (U32)remaining : (U32)cctx->blockSize;
+        size_t blockSize = determine_blockSize(cctx->appliedParams.blockDelimiters,
+                                        cctx->blockSize, remaining,
+                                        inSeqs, inSeqsSize, seqPos);
+        U32 const lastBlock = (blockSize == remaining);
+        FORWARD_IF_ERROR(blockSize, "Error while trying to determine block size");
+        assert(blockSize <= remaining);
         ZSTD_resetSeqStore(&cctx->seqStore);
-        DEBUGLOG(4, "Working on new block. Blocksize: %zu", blockSize);
+        DEBUGLOG(5, "Working on new block. Blocksize: %zu (total:%zu)", blockSize, (ip - (const BYTE*)src) + blockSize);
 
-        additionalByteAdjustment = sequenceCopier(cctx, &seqPos, inSeqs, inSeqsSize, ip, blockSize);
+        additionalByteAdjustment = sequenceCopier(cctx, &seqPos, inSeqs, inSeqsSize, ip, blockSize, cctx->appliedParams.searchForExternalRepcodes);
         FORWARD_IF_ERROR(additionalByteAdjustment, "Bad sequence copy");
         blockSize -= additionalByteAdjustment;
 
         /* If blocks are too small, emit as a nocompress block */
-        if (blockSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1) {
+        /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding
+         * additional 1. We need to revisit and change this logic to be more consistent */
+        if (blockSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1+1) {
             cBlockSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock);
             FORWARD_IF_ERROR(cBlockSize, "Nocompress block failed");
-            DEBUGLOG(4, "Block too small, writing out nocompress block: cSize: %zu", cBlockSize);
+            DEBUGLOG(5, "Block too small, writing out nocompress block: cSize: %zu", cBlockSize);
             cSize += cBlockSize;
             ip += blockSize;
             op += cBlockSize;
@@ -5851,6 +6514,7 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx,
             continue;
         }
 
+        RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize, dstSize_tooSmall, "not enough dstCapacity to write a new compressed block");
         compressedSeqsSize = ZSTD_entropyCompressSeqStore(&cctx->seqStore,
                                 &cctx->blockState.prevCBlock->entropy, &cctx->blockState.nextCBlock->entropy,
                                 &cctx->appliedParams,
@@ -5859,11 +6523,11 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx,
                                 cctx->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */,
                                 cctx->bmi2);
         FORWARD_IF_ERROR(compressedSeqsSize, "Compressing sequences of block failed");
-        DEBUGLOG(4, "Compressed sequences size: %zu", compressedSeqsSize);
+        DEBUGLOG(5, "Compressed sequences size: %zu", compressedSeqsSize);
 
         if (!cctx->isFirstBlock &&
             ZSTD_maybeRLE(&cctx->seqStore) &&
-            ZSTD_isRLE((BYTE const*)src, srcSize)) {
+            ZSTD_isRLE(ip, blockSize)) {
             /* We don't want to emit our first block as a RLE even if it qualifies because
             * doing so will cause the decoder (cli only) to throw a "should consume all input error."
             * This is only an issue for zstd <= v1.4.3
@@ -5874,12 +6538,12 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx,
         if (compressedSeqsSize == 0) {
             /* ZSTD_noCompressBlock writes the block header as well */
             cBlockSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock);
-            FORWARD_IF_ERROR(cBlockSize, "Nocompress block failed");
-            DEBUGLOG(4, "Writing out nocompress block, size: %zu", cBlockSize);
+            FORWARD_IF_ERROR(cBlockSize, "ZSTD_noCompressBlock failed");
+            DEBUGLOG(5, "Writing out nocompress block, size: %zu", cBlockSize);
         } else if (compressedSeqsSize == 1) {
             cBlockSize = ZSTD_rleCompressBlock(op, dstCapacity, *ip, blockSize, lastBlock);
-            FORWARD_IF_ERROR(cBlockSize, "RLE compress block failed");
-            DEBUGLOG(4, "Writing out RLE block, size: %zu", cBlockSize);
+            FORWARD_IF_ERROR(cBlockSize, "ZSTD_rleCompressBlock failed");
+            DEBUGLOG(5, "Writing out RLE block, size: %zu", cBlockSize);
         } else {
             U32 cBlockHeader;
             /* Error checking and repcodes update */
@@ -5891,11 +6555,10 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx,
             cBlockHeader = lastBlock + (((U32)bt_compressed)<<1) + (U32)(compressedSeqsSize << 3);
             MEM_writeLE24(op, cBlockHeader);
             cBlockSize = ZSTD_blockHeaderSize + compressedSeqsSize;
-            DEBUGLOG(4, "Writing out compressed block, size: %zu", cBlockSize);
+            DEBUGLOG(5, "Writing out compressed block, size: %zu", cBlockSize);
         }
 
         cSize += cBlockSize;
-        DEBUGLOG(4, "cSize running total: %zu", cSize);
 
         if (lastBlock) {
             break;
@@ -5906,12 +6569,15 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx,
             dstCapacity -= cBlockSize;
             cctx->isFirstBlock = 0;
         }
+        DEBUGLOG(5, "cSize running total: %zu (remaining dstCapacity=%zu)", cSize, dstCapacity);
     }
 
+    DEBUGLOG(4, "cSize final total: %zu", cSize);
     return cSize;
 }
 
-size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstCapacity,
+size_t ZSTD_compressSequences(ZSTD_CCtx* cctx,
+                              void* dst, size_t dstCapacity,
                               const ZSTD_Sequence* inSeqs, size_t inSeqsSize,
                               const void* src, size_t srcSize)
 {
@@ -5921,7 +6587,7 @@ size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstCapaci
     size_t frameHeaderSize = 0;
 
     /* Transparent initialization stage, same as compressStream2() */
-    DEBUGLOG(3, "ZSTD_compressSequences()");
+    DEBUGLOG(4, "ZSTD_compressSequences (dstCapacity=%zu)", dstCapacity);
     assert(cctx != NULL);
     FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, ZSTD_e_end, srcSize), "CCtx initialization failed");
     /* Begin writing output, starting with frame header */
@@ -5949,26 +6615,34 @@ size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstCapaci
         cSize += 4;
     }
 
-    DEBUGLOG(3, "Final compressed size: %zu", cSize);
+    DEBUGLOG(4, "Final compressed size: %zu", cSize);
     return cSize;
 }
 
 /*======   Finalize   ======*/
 
+static ZSTD_inBuffer inBuffer_forEndFlush(const ZSTD_CStream* zcs)
+{
+    const ZSTD_inBuffer nullInput = { NULL, 0, 0 };
+    const int stableInput = (zcs->appliedParams.inBufferMode == ZSTD_bm_stable);
+    return stableInput ? zcs->expectedInBuffer : nullInput;
+}
+
 /*! ZSTD_flushStream() :
  * @return : amount of data remaining to flush */
 size_t ZSTD_flushStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output)
 {
-    ZSTD_inBuffer input = { NULL, 0, 0 };
+    ZSTD_inBuffer input = inBuffer_forEndFlush(zcs);
+    input.size = input.pos; /* do not ingest more input during flush */
     return ZSTD_compressStream2(zcs, output, &input, ZSTD_e_flush);
 }
 
 
 size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output)
 {
-    ZSTD_inBuffer input = { NULL, 0, 0 };
+    ZSTD_inBuffer input = inBuffer_forEndFlush(zcs);
     size_t const remainingToFlush = ZSTD_compressStream2(zcs, output, &input, ZSTD_e_end);
-    FORWARD_IF_ERROR( remainingToFlush , "ZSTD_compressStream2 failed");
+    FORWARD_IF_ERROR(remainingToFlush , "ZSTD_compressStream2(,,ZSTD_e_end) failed");
     if (zcs->appliedParams.nbWorkers > 0) return remainingToFlush;   /* minimal estimation */
     /* single thread mode : attempt to calculate remaining to flush more precisely */
     {   size_t const lastBlockSize = zcs->frameEnded ? 0 : ZSTD_BLOCKHEADERSIZE;
@@ -6090,7 +6764,7 @@ static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel,
             cp.targetLength = (unsigned)(-clampedCompressionLevel);
         }
         /* refine parameters based on srcSize & dictSize */
-        return ZSTD_adjustCParams_internal(cp, srcSizeHint, dictSize, mode);
+        return ZSTD_adjustCParams_internal(cp, srcSizeHint, dictSize, mode, ZSTD_ps_auto);
     }
 }
 
@@ -6125,3 +6799,21 @@ ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long srcSizeH
     if (srcSizeHint == 0) srcSizeHint = ZSTD_CONTENTSIZE_UNKNOWN;
     return ZSTD_getParams_internal(compressionLevel, srcSizeHint, dictSize, ZSTD_cpm_unknown);
 }
+
+void ZSTD_registerSequenceProducer(
+    ZSTD_CCtx* zc, void* mState,
+    ZSTD_sequenceProducer_F* mFinder
+) {
+    if (mFinder != NULL) {
+        ZSTD_externalMatchCtx emctx;
+        emctx.mState = mState;
+        emctx.mFinder = mFinder;
+        emctx.seqBuffer = NULL;
+        emctx.seqBufferCapacity = 0;
+        zc->externalMatchCtx = emctx;
+        zc->requestedParams.useSequenceProducer = 1;
+    } else {
+        ZSTD_memset(&zc->externalMatchCtx, 0, sizeof(zc->externalMatchCtx));
+        zc->requestedParams.useSequenceProducer = 0;
+    }
+}
diff --git a/lib/zstd/compress/zstd_compress_internal.h b/lib/zstd/compress/zstd_compress_internal.h
index 71697a11ae30..899f5e2de8e9 100644
--- a/lib/zstd/compress/zstd_compress_internal.h
+++ b/lib/zstd/compress/zstd_compress_internal.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -20,6 +21,7 @@
 ***************************************/
 #include "../common/zstd_internal.h"
 #include "zstd_cwksp.h"
+#include "../common/bits.h" /* ZSTD_highbit32, ZSTD_NbCommonBytes */
 
 
 /*-*************************************
@@ -111,12 +113,13 @@ typedef struct {
 /* ZSTD_buildBlockEntropyStats() :
  *  Builds entropy for the block.
  *  @return : 0 on success or error code */
-size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr,
-                             const ZSTD_entropyCTables_t* prevEntropy,
-                                   ZSTD_entropyCTables_t* nextEntropy,
-                             const ZSTD_CCtx_params* cctxParams,
-                                   ZSTD_entropyCTablesMetadata_t* entropyMetadata,
-                                   void* workspace, size_t wkspSize);
+size_t ZSTD_buildBlockEntropyStats(
+                    const seqStore_t* seqStorePtr,
+                    const ZSTD_entropyCTables_t* prevEntropy,
+                          ZSTD_entropyCTables_t* nextEntropy,
+                    const ZSTD_CCtx_params* cctxParams,
+                          ZSTD_entropyCTablesMetadata_t* entropyMetadata,
+                          void* workspace, size_t wkspSize);
 
 /* *******************************
 *  Compression internals structs *
@@ -142,6 +145,12 @@ typedef struct {
   size_t capacity;      /* The capacity starting from `seq` pointer */
 } rawSeqStore_t;
 
+typedef struct {
+    U32 idx;            /* Index in array of ZSTD_Sequence */
+    U32 posInSequence;  /* Position within sequence at idx */
+    size_t posInSrc;    /* Number of bytes given by sequences provided so far */
+} ZSTD_sequencePosition;
+
 UNUSED_ATTR static const rawSeqStore_t kNullRawSeqStore = {NULL, 0, 0, 0, 0};
 
 typedef struct {
@@ -212,8 +221,10 @@ struct ZSTD_matchState_t {
     U32 hashLog3;           /* dispatch table for matches of len==3 : larger == faster, more memory */
 
     U32 rowHashLog;                          /* For row-based matchfinder: Hashlog based on nb of rows in the hashTable.*/
-    U16* tagTable;                           /* For row-based matchFinder: A row-based table containing the hashes and head index. */
+    BYTE* tagTable;                          /* For row-based matchFinder: A row-based table containing the hashes and head index. */
     U32 hashCache[ZSTD_ROW_HASH_CACHE_SIZE]; /* For row-based matchFinder: a cache of hashes to improve speed */
+    U64 hashSalt;                            /* For row-based matchFinder: salts the hash for re-use of tag table */
+    U32 hashSaltEntropy;                     /* For row-based matchFinder: collects entropy for salt generation */
 
     U32* hashTable;
     U32* hashTable3;
@@ -228,6 +239,18 @@ struct ZSTD_matchState_t {
     const ZSTD_matchState_t* dictMatchState;
     ZSTD_compressionParameters cParams;
     const rawSeqStore_t* ldmSeqStore;
+
+    /* Controls prefetching in some dictMatchState matchfinders.
+     * This behavior is controlled from the cctx ms.
+     * This parameter has no effect in the cdict ms. */
+    int prefetchCDictTables;
+
+    /* When == 0, lazy match finders insert every position.
+     * When != 0, lazy match finders only insert positions they search.
+     * This allows them to skip much faster over incompressible data,
+     * at a small cost to compression ratio.
+     */
+    int lazySkipping;
 };
 
 typedef struct {
@@ -324,6 +347,24 @@ struct ZSTD_CCtx_params_s {
 
     /* Internal use, for createCCtxParams() and freeCCtxParams() only */
     ZSTD_customMem customMem;
+
+    /* Controls prefetching in some dictMatchState matchfinders */
+    ZSTD_paramSwitch_e prefetchCDictTables;
+
+    /* Controls whether zstd will fall back to an internal matchfinder
+     * if the external matchfinder returns an error code. */
+    int enableMatchFinderFallback;
+
+    /* Indicates whether an external matchfinder has been referenced.
+     * Users can't set this externally.
+     * It is set internally in ZSTD_registerSequenceProducer(). */
+    int useSequenceProducer;
+
+    /* Adjust the max block size*/
+    size_t maxBlockSize;
+
+    /* Controls repcode search in external sequence parsing */
+    ZSTD_paramSwitch_e searchForExternalRepcodes;
 };  /* typedef'd to ZSTD_CCtx_params within "zstd.h" */
 
 #define COMPRESS_SEQUENCES_WORKSPACE_SIZE (sizeof(unsigned) * (MaxSeq + 2))
@@ -355,6 +396,14 @@ typedef struct {
     ZSTD_entropyCTablesMetadata_t entropyMetadata;
 } ZSTD_blockSplitCtx;
 
+/* Context for block-level external matchfinder API */
+typedef struct {
+  void* mState;
+  ZSTD_sequenceProducer_F* mFinder;
+  ZSTD_Sequence* seqBuffer;
+  size_t seqBufferCapacity;
+} ZSTD_externalMatchCtx;
+
 struct ZSTD_CCtx_s {
     ZSTD_compressionStage_e stage;
     int cParamsChanged;                  /* == 1 if cParams(except wlog) or compression level are changed in requestedParams. Triggers transmission of new params to ZSTDMT (if available) then reset to 0. */
@@ -404,6 +453,7 @@ struct ZSTD_CCtx_s {
 
     /* Stable in/out buffer verification */
     ZSTD_inBuffer expectedInBuffer;
+    size_t stableIn_notConsumed; /* nb bytes within stable input buffer that are said to be consumed but are not */
     size_t expectedOutBufferSize;
 
     /* Dictionary */
@@ -417,9 +467,13 @@ struct ZSTD_CCtx_s {
 
     /* Workspace for block splitter */
     ZSTD_blockSplitCtx blockSplitCtx;
+
+    /* Workspace for external matchfinder */
+    ZSTD_externalMatchCtx externalMatchCtx;
 };
 
 typedef enum { ZSTD_dtlm_fast, ZSTD_dtlm_full } ZSTD_dictTableLoadMethod_e;
+typedef enum { ZSTD_tfp_forCCtx, ZSTD_tfp_forCDict } ZSTD_tableFillPurpose_e;
 
 typedef enum {
     ZSTD_noDict = 0,
@@ -441,7 +495,7 @@ typedef enum {
                                  * In this mode we take both the source size and the dictionary size
                                  * into account when selecting and adjusting the parameters.
                                  */
-    ZSTD_cpm_unknown = 3,       /* ZSTD_getCParams, ZSTD_getParams, ZSTD_adjustParams.
+    ZSTD_cpm_unknown = 3        /* ZSTD_getCParams, ZSTD_getParams, ZSTD_adjustParams.
                                  * We don't know what these parameters are for. We default to the legacy
                                  * behavior of taking both the source size and the dict size into account
                                  * when selecting and adjusting parameters.
@@ -500,9 +554,11 @@ MEM_STATIC int ZSTD_cParam_withinBounds(ZSTD_cParameter cParam, int value)
 /* ZSTD_noCompressBlock() :
  * Writes uncompressed block to dst buffer from given src.
  * Returns the size of the block */
-MEM_STATIC size_t ZSTD_noCompressBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 lastBlock)
+MEM_STATIC size_t
+ZSTD_noCompressBlock(void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 lastBlock)
 {
     U32 const cBlockHeader24 = lastBlock + (((U32)bt_raw)<<1) + (U32)(srcSize << 3);
+    DEBUGLOG(5, "ZSTD_noCompressBlock (srcSize=%zu, dstCapacity=%zu)", srcSize, dstCapacity);
     RETURN_ERROR_IF(srcSize + ZSTD_blockHeaderSize > dstCapacity,
                     dstSize_tooSmall, "dst buf too small for uncompressed block");
     MEM_writeLE24(dst, cBlockHeader24);
@@ -510,7 +566,8 @@ MEM_STATIC size_t ZSTD_noCompressBlock (void* dst, size_t dstCapacity, const voi
     return ZSTD_blockHeaderSize + srcSize;
 }
 
-MEM_STATIC size_t ZSTD_rleCompressBlock (void* dst, size_t dstCapacity, BYTE src, size_t srcSize, U32 lastBlock)
+MEM_STATIC size_t
+ZSTD_rleCompressBlock(void* dst, size_t dstCapacity, BYTE src, size_t srcSize, U32 lastBlock)
 {
     BYTE* const op = (BYTE*)dst;
     U32 const cBlockHeader = lastBlock + (((U32)bt_rle)<<1) + (U32)(srcSize << 3);
@@ -529,7 +586,7 @@ MEM_STATIC size_t ZSTD_minGain(size_t srcSize, ZSTD_strategy strat)
 {
     U32 const minlog = (strat>=ZSTD_btultra) ? (U32)(strat) - 1 : 6;
     ZSTD_STATIC_ASSERT(ZSTD_btultra == 8);
-    assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, strat));
+    assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, (int)strat));
     return (srcSize >> minlog) + 2;
 }
 
@@ -565,29 +622,27 @@ ZSTD_safecopyLiterals(BYTE* op, BYTE const* ip, BYTE const* const iend, BYTE con
     while (ip < iend) *op++ = *ip++;
 }
 
-#define ZSTD_REP_MOVE     (ZSTD_REP_NUM-1)
-#define STORE_REPCODE_1 STORE_REPCODE(1)
-#define STORE_REPCODE_2 STORE_REPCODE(2)
-#define STORE_REPCODE_3 STORE_REPCODE(3)
-#define STORE_REPCODE(r) (assert((r)>=1), assert((r)<=3), (r)-1)
-#define STORE_OFFSET(o)  (assert((o)>0), o + ZSTD_REP_MOVE)
-#define STORED_IS_OFFSET(o)  ((o) > ZSTD_REP_MOVE)
-#define STORED_IS_REPCODE(o) ((o) <= ZSTD_REP_MOVE)
-#define STORED_OFFSET(o)  (assert(STORED_IS_OFFSET(o)), (o)-ZSTD_REP_MOVE)
-#define STORED_REPCODE(o) (assert(STORED_IS_REPCODE(o)), (o)+1)  /* returns ID 1,2,3 */
-#define STORED_TO_OFFBASE(o) ((o)+1)
-#define OFFBASE_TO_STORED(o) ((o)-1)
+
+#define REPCODE1_TO_OFFBASE REPCODE_TO_OFFBASE(1)
+#define REPCODE2_TO_OFFBASE REPCODE_TO_OFFBASE(2)
+#define REPCODE3_TO_OFFBASE REPCODE_TO_OFFBASE(3)
+#define REPCODE_TO_OFFBASE(r) (assert((r)>=1), assert((r)<=ZSTD_REP_NUM), (r)) /* accepts IDs 1,2,3 */
+#define OFFSET_TO_OFFBASE(o)  (assert((o)>0), o + ZSTD_REP_NUM)
+#define OFFBASE_IS_OFFSET(o)  ((o) > ZSTD_REP_NUM)
+#define OFFBASE_IS_REPCODE(o) ( 1 <= (o) && (o) <= ZSTD_REP_NUM)
+#define OFFBASE_TO_OFFSET(o)  (assert(OFFBASE_IS_OFFSET(o)), (o) - ZSTD_REP_NUM)
+#define OFFBASE_TO_REPCODE(o) (assert(OFFBASE_IS_REPCODE(o)), (o))  /* returns ID 1,2,3 */
 
 /*! ZSTD_storeSeq() :
- *  Store a sequence (litlen, litPtr, offCode and matchLength) into seqStore_t.
- *  @offBase_minus1 : Users should use employ macros STORE_REPCODE_X and STORE_OFFSET().
+ *  Store a sequence (litlen, litPtr, offBase and matchLength) into seqStore_t.
+ *  @offBase : Users should employ macros REPCODE_TO_OFFBASE() and OFFSET_TO_OFFBASE().
  *  @matchLength : must be >= MINMATCH
- *  Allowed to overread literals up to litLimit.
+ *  Allowed to over-read literals up to litLimit.
 */
 HINT_INLINE UNUSED_ATTR void
 ZSTD_storeSeq(seqStore_t* seqStorePtr,
               size_t litLength, const BYTE* literals, const BYTE* litLimit,
-              U32 offBase_minus1,
+              U32 offBase,
               size_t matchLength)
 {
     BYTE const* const litLimit_w = litLimit - WILDCOPY_OVERLENGTH;
@@ -596,8 +651,8 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr,
     static const BYTE* g_start = NULL;
     if (g_start==NULL) g_start = (const BYTE*)literals;  /* note : index only works for compression within a single segment */
     {   U32 const pos = (U32)((const BYTE*)literals - g_start);
-        DEBUGLOG(6, "Cpos%7u :%3u literals, match%4u bytes at offCode%7u",
-               pos, (U32)litLength, (U32)matchLength, (U32)offBase_minus1);
+        DEBUGLOG(6, "Cpos%7u :%3u literals, match%4u bytes at offBase%7u",
+               pos, (U32)litLength, (U32)matchLength, (U32)offBase);
     }
 #endif
     assert((size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart) < seqStorePtr->maxNbSeq);
@@ -607,9 +662,9 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr,
     assert(literals + litLength <= litLimit);
     if (litEnd <= litLimit_w) {
         /* Common case we can use wildcopy.
-	 * First copy 16 bytes, because literals are likely short.
-	 */
-        assert(WILDCOPY_OVERLENGTH >= 16);
+         * First copy 16 bytes, because literals are likely short.
+         */
+        ZSTD_STATIC_ASSERT(WILDCOPY_OVERLENGTH >= 16);
         ZSTD_copy16(seqStorePtr->lit, literals);
         if (litLength > 16) {
             ZSTD_wildcopy(seqStorePtr->lit+16, literals+16, (ptrdiff_t)litLength-16, ZSTD_no_overlap);
@@ -628,7 +683,7 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr,
     seqStorePtr->sequences[0].litLength = (U16)litLength;
 
     /* match offset */
-    seqStorePtr->sequences[0].offBase = STORED_TO_OFFBASE(offBase_minus1);
+    seqStorePtr->sequences[0].offBase = offBase;
 
     /* match Length */
     assert(matchLength >= MINMATCH);
@@ -646,17 +701,17 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr,
 
 /* ZSTD_updateRep() :
  * updates in-place @rep (array of repeat offsets)
- * @offBase_minus1 : sum-type, with same numeric representation as ZSTD_storeSeq()
+ * @offBase : sum-type, using numeric representation of ZSTD_storeSeq()
  */
 MEM_STATIC void
-ZSTD_updateRep(U32 rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0)
+ZSTD_updateRep(U32 rep[ZSTD_REP_NUM], U32 const offBase, U32 const ll0)
 {
-    if (STORED_IS_OFFSET(offBase_minus1)) {  /* full offset */
+    if (OFFBASE_IS_OFFSET(offBase)) {  /* full offset */
         rep[2] = rep[1];
         rep[1] = rep[0];
-        rep[0] = STORED_OFFSET(offBase_minus1);
+        rep[0] = OFFBASE_TO_OFFSET(offBase);
     } else {   /* repcode */
-        U32 const repCode = STORED_REPCODE(offBase_minus1) - 1 + ll0;
+        U32 const repCode = OFFBASE_TO_REPCODE(offBase) - 1 + ll0;
         if (repCode > 0) {  /* note : if repCode==0, no change */
             U32 const currentOffset = (repCode==ZSTD_REP_NUM) ? (rep[0] - 1) : rep[repCode];
             rep[2] = (repCode >= 2) ? rep[1] : rep[2];
@@ -673,11 +728,11 @@ typedef struct repcodes_s {
 } repcodes_t;
 
 MEM_STATIC repcodes_t
-ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0)
+ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase, U32 const ll0)
 {
     repcodes_t newReps;
     ZSTD_memcpy(&newReps, rep, sizeof(newReps));
-    ZSTD_updateRep(newReps.rep, offBase_minus1, ll0);
+    ZSTD_updateRep(newReps.rep, offBase, ll0);
     return newReps;
 }
 
@@ -685,59 +740,6 @@ ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0
 /*-*************************************
 *  Match length counter
 ***************************************/
-static unsigned ZSTD_NbCommonBytes (size_t val)
-{
-    if (MEM_isLittleEndian()) {
-        if (MEM_64bits()) {
-#       if (__GNUC__ >= 4)
-            return (__builtin_ctzll((U64)val) >> 3);
-#       else
-            static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2,
-                                                     0, 3, 1, 3, 1, 4, 2, 7,
-                                                     0, 2, 3, 6, 1, 5, 3, 5,
-                                                     1, 3, 4, 4, 2, 5, 6, 7,
-                                                     7, 0, 1, 2, 3, 3, 4, 6,
-                                                     2, 6, 5, 5, 3, 4, 5, 6,
-                                                     7, 1, 2, 4, 6, 4, 4, 5,
-                                                     7, 2, 6, 5, 7, 6, 7, 7 };
-            return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58];
-#       endif
-        } else { /* 32 bits */
-#       if (__GNUC__ >= 3)
-            return (__builtin_ctz((U32)val) >> 3);
-#       else
-            static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0,
-                                                     3, 2, 2, 1, 3, 2, 0, 1,
-                                                     3, 3, 1, 2, 2, 2, 2, 0,
-                                                     3, 1, 2, 0, 1, 0, 1, 1 };
-            return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27];
-#       endif
-        }
-    } else {  /* Big Endian CPU */
-        if (MEM_64bits()) {
-#       if (__GNUC__ >= 4)
-            return (__builtin_clzll(val) >> 3);
-#       else
-            unsigned r;
-            const unsigned n32 = sizeof(size_t)*4;   /* calculate this way due to compiler complaining in 32-bits mode */
-            if (!(val>>n32)) { r=4; } else { r=0; val>>=n32; }
-            if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; }
-            r += (!val);
-            return r;
-#       endif
-        } else { /* 32 bits */
-#       if (__GNUC__ >= 3)
-            return (__builtin_clz((U32)val) >> 3);
-#       else
-            unsigned r;
-            if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; }
-            r += (!val);
-            return r;
-#       endif
-    }   }
-}
-
-
 MEM_STATIC size_t ZSTD_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* const pInLimit)
 {
     const BYTE* const pStart = pIn;
@@ -783,32 +785,43 @@ ZSTD_count_2segments(const BYTE* ip, const BYTE* match,
  *  Hashes
  ***************************************/
 static const U32 prime3bytes = 506832829U;
-static U32    ZSTD_hash3(U32 u, U32 h) { return ((u << (32-24)) * prime3bytes)  >> (32-h) ; }
-MEM_STATIC size_t ZSTD_hash3Ptr(const void* ptr, U32 h) { return ZSTD_hash3(MEM_readLE32(ptr), h); } /* only in zstd_opt.h */
+static U32    ZSTD_hash3(U32 u, U32 h, U32 s) { assert(h <= 32); return (((u << (32-24)) * prime3bytes) ^ s)  >> (32-h) ; }
+MEM_STATIC size_t ZSTD_hash3Ptr(const void* ptr, U32 h) { return ZSTD_hash3(MEM_readLE32(ptr), h, 0); } /* only in zstd_opt.h */
+MEM_STATIC size_t ZSTD_hash3PtrS(const void* ptr, U32 h, U32 s) { return ZSTD_hash3(MEM_readLE32(ptr), h, s); }
 
 static const U32 prime4bytes = 2654435761U;
-static U32    ZSTD_hash4(U32 u, U32 h) { return (u * prime4bytes) >> (32-h) ; }
-static size_t ZSTD_hash4Ptr(const void* ptr, U32 h) { return ZSTD_hash4(MEM_read32(ptr), h); }
+static U32    ZSTD_hash4(U32 u, U32 h, U32 s) { assert(h <= 32); return ((u * prime4bytes) ^ s) >> (32-h) ; }
+static size_t ZSTD_hash4Ptr(const void* ptr, U32 h) { return ZSTD_hash4(MEM_readLE32(ptr), h, 0); }
+static size_t ZSTD_hash4PtrS(const void* ptr, U32 h, U32 s) { return ZSTD_hash4(MEM_readLE32(ptr), h, s); }
 
 static const U64 prime5bytes = 889523592379ULL;
-static size_t ZSTD_hash5(U64 u, U32 h) { return (size_t)(((u  << (64-40)) * prime5bytes) >> (64-h)) ; }
-static size_t ZSTD_hash5Ptr(const void* p, U32 h) { return ZSTD_hash5(MEM_readLE64(p), h); }
+static size_t ZSTD_hash5(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u  << (64-40)) * prime5bytes) ^ s) >> (64-h)) ; }
+static size_t ZSTD_hash5Ptr(const void* p, U32 h) { return ZSTD_hash5(MEM_readLE64(p), h, 0); }
+static size_t ZSTD_hash5PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash5(MEM_readLE64(p), h, s); }
 
 static const U64 prime6bytes = 227718039650203ULL;
-static size_t ZSTD_hash6(U64 u, U32 h) { return (size_t)(((u  << (64-48)) * prime6bytes) >> (64-h)) ; }
-static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h); }
+static size_t ZSTD_hash6(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u  << (64-48)) * prime6bytes) ^ s) >> (64-h)) ; }
+static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h, 0); }
+static size_t ZSTD_hash6PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash6(MEM_readLE64(p), h, s); }
 
 static const U64 prime7bytes = 58295818150454627ULL;
-static size_t ZSTD_hash7(U64 u, U32 h) { return (size_t)(((u  << (64-56)) * prime7bytes) >> (64-h)) ; }
-static size_t ZSTD_hash7Ptr(const void* p, U32 h) { return ZSTD_hash7(MEM_readLE64(p), h); }
+static size_t ZSTD_hash7(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u  << (64-56)) * prime7bytes) ^ s) >> (64-h)) ; }
+static size_t ZSTD_hash7Ptr(const void* p, U32 h) { return ZSTD_hash7(MEM_readLE64(p), h, 0); }
+static size_t ZSTD_hash7PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash7(MEM_readLE64(p), h, s); }
 
 static const U64 prime8bytes = 0xCF1BBCDCB7A56463ULL;
-static size_t ZSTD_hash8(U64 u, U32 h) { return (size_t)(((u) * prime8bytes) >> (64-h)) ; }
-static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h); }
+static size_t ZSTD_hash8(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u) * prime8bytes)  ^ s) >> (64-h)) ; }
+static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h, 0); }
+static size_t ZSTD_hash8PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash8(MEM_readLE64(p), h, s); }
+
 
 MEM_STATIC FORCE_INLINE_ATTR
 size_t ZSTD_hashPtr(const void* p, U32 hBits, U32 mls)
 {
+    /* Although some of these hashes do support hBits up to 64, some do not.
+     * To be on the safe side, always avoid hBits > 32. */
+    assert(hBits <= 32);
+
     switch(mls)
     {
     default:
@@ -820,6 +833,24 @@ size_t ZSTD_hashPtr(const void* p, U32 hBits, U32 mls)
     }
 }
 
+MEM_STATIC FORCE_INLINE_ATTR
+size_t ZSTD_hashPtrSalted(const void* p, U32 hBits, U32 mls, const U64 hashSalt) {
+    /* Although some of these hashes do support hBits up to 64, some do not.
+     * To be on the safe side, always avoid hBits > 32. */
+    assert(hBits <= 32);
+
+    switch(mls)
+    {
+        default:
+        case 4: return ZSTD_hash4PtrS(p, hBits, (U32)hashSalt);
+        case 5: return ZSTD_hash5PtrS(p, hBits, hashSalt);
+        case 6: return ZSTD_hash6PtrS(p, hBits, hashSalt);
+        case 7: return ZSTD_hash7PtrS(p, hBits, hashSalt);
+        case 8: return ZSTD_hash8PtrS(p, hBits, hashSalt);
+    }
+}
+
+
 /* ZSTD_ipow() :
  * Return base^exponent.
  */
@@ -1167,10 +1198,15 @@ ZSTD_checkDictValidity(const ZSTD_window_t* window,
                     (unsigned)blockEndIdx, (unsigned)maxDist, (unsigned)loadedDictEnd);
         assert(blockEndIdx >= loadedDictEnd);
 
-        if (blockEndIdx > loadedDictEnd + maxDist) {
+        if (blockEndIdx > loadedDictEnd + maxDist || loadedDictEnd != window->dictLimit) {
             /* On reaching window size, dictionaries are invalidated.
              * For simplification, if window size is reached anywhere within next block,
              * the dictionary is invalidated for the full block.
+             *
+             * We also have to invalidate the dictionary if ZSTD_window_update() has detected
+             * non-contiguous segments, which means that loadedDictEnd != window->dictLimit.
+             * loadedDictEnd may be 0, if forceWindow is true, but in that case we never use
+             * dictMatchState, so setting it to NULL is not a problem.
              */
             DEBUGLOG(6, "invalidating dictionary for current block (distance > windowSize)");
             *loadedDictEndPtr = 0;
@@ -1302,6 +1338,42 @@ MEM_STATIC void ZSTD_debugTable(const U32* table, U32 max)
 
 #endif
 
+/* Short Cache */
+
+/* Normally, zstd matchfinders follow this flow:
+ *     1. Compute hash at ip
+ *     2. Load index from hashTable[hash]
+ *     3. Check if *ip == *(base + index)
+ * In dictionary compression, loading *(base + index) is often an L2 or even L3 miss.
+ *
+ * Short cache is an optimization which allows us to avoid step 3 most of the time
+ * when the data doesn't actually match. With short cache, the flow becomes:
+ *     1. Compute (hash, currentTag) at ip. currentTag is an 8-bit independent hash at ip.
+ *     2. Load (index, matchTag) from hashTable[hash]. See ZSTD_writeTaggedIndex to understand how this works.
+ *     3. Only if currentTag == matchTag, check *ip == *(base + index). Otherwise, continue.
+ *
+ * Currently, short cache is only implemented in CDict hashtables. Thus, its use is limited to
+ * dictMatchState matchfinders.
+ */
+#define ZSTD_SHORT_CACHE_TAG_BITS 8
+#define ZSTD_SHORT_CACHE_TAG_MASK ((1u << ZSTD_SHORT_CACHE_TAG_BITS) - 1)
+
+/* Helper function for ZSTD_fillHashTable and ZSTD_fillDoubleHashTable.
+ * Unpacks hashAndTag into (hash, tag), then packs (index, tag) into hashTable[hash]. */
+MEM_STATIC void ZSTD_writeTaggedIndex(U32* const hashTable, size_t hashAndTag, U32 index) {
+    size_t const hash = hashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS;
+    U32 const tag = (U32)(hashAndTag & ZSTD_SHORT_CACHE_TAG_MASK);
+    assert(index >> (32 - ZSTD_SHORT_CACHE_TAG_BITS) == 0);
+    hashTable[hash] = (index << ZSTD_SHORT_CACHE_TAG_BITS) | tag;
+}
+
+/* Helper function for short cache matchfinders.
+ * Unpacks tag1 and tag2 from lower bits of packedTag1 and packedTag2, then checks if the tags match. */
+MEM_STATIC int ZSTD_comparePackedTags(size_t packedTag1, size_t packedTag2) {
+    U32 const tag1 = packedTag1 & ZSTD_SHORT_CACHE_TAG_MASK;
+    U32 const tag2 = packedTag2 & ZSTD_SHORT_CACHE_TAG_MASK;
+    return tag1 == tag2;
+}
 
 
 /* ===============================================================
@@ -1396,4 +1468,51 @@ U32 ZSTD_cycleLog(U32 hashLog, ZSTD_strategy strat);
  */
 void ZSTD_CCtx_trace(ZSTD_CCtx* cctx, size_t extraCSize);
 
+/* Returns 0 on success, and a ZSTD_error otherwise. This function scans through an array of
+ * ZSTD_Sequence, storing the sequences it finds, until it reaches a block delimiter.
+ * Note that the block delimiter must include the last literals of the block.
+ */
+size_t
+ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx,
+                                              ZSTD_sequencePosition* seqPos,
+                                        const ZSTD_Sequence* const inSeqs, size_t inSeqsSize,
+                                        const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch);
+
+/* Returns the number of bytes to move the current read position back by.
+ * Only non-zero if we ended up splitting a sequence.
+ * Otherwise, it may return a ZSTD error if something went wrong.
+ *
+ * This function will attempt to scan through blockSize bytes
+ * represented by the sequences in @inSeqs,
+ * storing any (partial) sequences.
+ *
+ * Occasionally, we may want to change the actual number of bytes we consumed from inSeqs to
+ * avoid splitting a match, or to avoid splitting a match such that it would produce a match
+ * smaller than MINMATCH. In this case, we return the number of bytes that we didn't read from this block.
+ */
+size_t
+ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos,
+                                   const ZSTD_Sequence* const inSeqs, size_t inSeqsSize,
+                                   const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch);
+
+
+/* ===============================================================
+ * Deprecated definitions that are still used internally to avoid
+ * deprecation warnings. These functions are exactly equivalent to
+ * their public variants, but avoid the deprecation warnings.
+ * =============================================================== */
+
+size_t ZSTD_compressBegin_usingCDict_deprecated(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict);
+
+size_t ZSTD_compressContinue_public(ZSTD_CCtx* cctx,
+                                    void* dst, size_t dstCapacity,
+                              const void* src, size_t srcSize);
+
+size_t ZSTD_compressEnd_public(ZSTD_CCtx* cctx,
+                               void* dst, size_t dstCapacity,
+                         const void* src, size_t srcSize);
+
+size_t ZSTD_compressBlock_deprecated(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+
+
 #endif /* ZSTD_COMPRESS_H */
diff --git a/lib/zstd/compress/zstd_compress_literals.c b/lib/zstd/compress/zstd_compress_literals.c
index 52b0a8059aba..3e9ea46a670a 100644
--- a/lib/zstd/compress/zstd_compress_literals.c
+++ b/lib/zstd/compress/zstd_compress_literals.c
@@ -1,5 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -13,11 +14,36 @@
  ***************************************/
 #include "zstd_compress_literals.h"
 
+
+/* **************************************************************
+*  Debug Traces
+****************************************************************/
+#if DEBUGLEVEL >= 2
+
+static size_t showHexa(const void* src, size_t srcSize)
+{
+    const BYTE* const ip = (const BYTE*)src;
+    size_t u;
+    for (u=0; u<srcSize; u++) {
+        RAWLOG(5, " %02X", ip[u]); (void)ip;
+    }
+    RAWLOG(5, " \n");
+    return srcSize;
+}
+
+#endif
+
+
+/* **************************************************************
+*  Literals compression - special cases
+****************************************************************/
 size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src, size_t srcSize)
 {
     BYTE* const ostart = (BYTE*)dst;
     U32   const flSize = 1 + (srcSize>31) + (srcSize>4095);
 
+    DEBUGLOG(5, "ZSTD_noCompressLiterals: srcSize=%zu, dstCapacity=%zu", srcSize, dstCapacity);
+
     RETURN_ERROR_IF(srcSize + flSize > dstCapacity, dstSize_tooSmall, "");
 
     switch(flSize)
@@ -36,16 +62,30 @@ size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src,
     }
 
     ZSTD_memcpy(ostart + flSize, src, srcSize);
-    DEBUGLOG(5, "Raw literals: %u -> %u", (U32)srcSize, (U32)(srcSize + flSize));
+    DEBUGLOG(5, "Raw (uncompressed) literals: %u -> %u", (U32)srcSize, (U32)(srcSize + flSize));
     return srcSize + flSize;
 }
 
+static int allBytesIdentical(const void* src, size_t srcSize)
+{
+    assert(srcSize >= 1);
+    assert(src != NULL);
+    {   const BYTE b = ((const BYTE*)src)[0];
+        size_t p;
+        for (p=1; p<srcSize; p++) {
+            if (((const BYTE*)src)[p] != b) return 0;
+        }
+        return 1;
+    }
+}
+
 size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize)
 {
     BYTE* const ostart = (BYTE*)dst;
     U32   const flSize = 1 + (srcSize>31) + (srcSize>4095);
 
-    (void)dstCapacity;  /* dstCapacity already guaranteed to be >=4, hence large enough */
+    assert(dstCapacity >= 4); (void)dstCapacity;
+    assert(allBytesIdentical(src, srcSize));
 
     switch(flSize)
     {
@@ -63,28 +103,51 @@ size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void*
     }
 
     ostart[flSize] = *(const BYTE*)src;
-    DEBUGLOG(5, "RLE literals: %u -> %u", (U32)srcSize, (U32)flSize + 1);
+    DEBUGLOG(5, "RLE : Repeated Literal (%02X: %u times) -> %u bytes encoded", ((const BYTE*)src)[0], (U32)srcSize, (U32)flSize + 1);
     return flSize+1;
 }
 
-size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf,
-                              ZSTD_hufCTables_t* nextHuf,
-                              ZSTD_strategy strategy, int disableLiteralCompression,
-                              void* dst, size_t dstCapacity,
-                        const void* src, size_t srcSize,
-                              void* entropyWorkspace, size_t entropyWorkspaceSize,
-                        const int bmi2,
-                        unsigned suspectUncompressible)
+/* ZSTD_minLiteralsToCompress() :
+ * returns minimal amount of literals
+ * for literal compression to even be attempted.
+ * Minimum is made tighter as compression strategy increases.
+ */
+static size_t
+ZSTD_minLiteralsToCompress(ZSTD_strategy strategy, HUF_repeat huf_repeat)
+{
+    assert((int)strategy >= 0);
+    assert((int)strategy <= 9);
+    /* btultra2 : min 8 bytes;
+     * then 2x larger for each successive compression strategy
+     * max threshold 64 bytes */
+    {   int const shift = MIN(9-(int)strategy, 3);
+        size_t const mintc = (huf_repeat == HUF_repeat_valid) ? 6 : (size_t)8 << shift;
+        DEBUGLOG(7, "minLiteralsToCompress = %zu", mintc);
+        return mintc;
+    }
+}
+
+size_t ZSTD_compressLiterals (
+                  void* dst, size_t dstCapacity,
+            const void* src, size_t srcSize,
+                  void* entropyWorkspace, size_t entropyWorkspaceSize,
+            const ZSTD_hufCTables_t* prevHuf,
+                  ZSTD_hufCTables_t* nextHuf,
+                  ZSTD_strategy strategy,
+                  int disableLiteralCompression,
+                  int suspectUncompressible,
+                  int bmi2)
 {
-    size_t const minGain = ZSTD_minGain(srcSize, strategy);
     size_t const lhSize = 3 + (srcSize >= 1 KB) + (srcSize >= 16 KB);
     BYTE*  const ostart = (BYTE*)dst;
     U32 singleStream = srcSize < 256;
     symbolEncodingType_e hType = set_compressed;
     size_t cLitSize;
 
-    DEBUGLOG(5,"ZSTD_compressLiterals (disableLiteralCompression=%i srcSize=%u)",
-                disableLiteralCompression, (U32)srcSize);
+    DEBUGLOG(5,"ZSTD_compressLiterals (disableLiteralCompression=%i, srcSize=%u, dstCapacity=%zu)",
+                disableLiteralCompression, (U32)srcSize, dstCapacity);
+
+    DEBUGLOG(6, "Completed literals listing (%zu bytes)", showHexa(src, srcSize));
 
     /* Prepare nextEntropy assuming reusing the existing table */
     ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
@@ -92,40 +155,51 @@ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf,
     if (disableLiteralCompression)
         return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize);
 
-    /* small ? don't even attempt compression (speed opt) */
-#   define COMPRESS_LITERALS_SIZE_MIN 63
-    {   size_t const minLitSize = (prevHuf->repeatMode == HUF_repeat_valid) ? 6 : COMPRESS_LITERALS_SIZE_MIN;
-        if (srcSize <= minLitSize) return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize);
-    }
+    /* if too small, don't even attempt compression (speed opt) */
+    if (srcSize < ZSTD_minLiteralsToCompress(strategy, prevHuf->repeatMode))
+        return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize);
 
     RETURN_ERROR_IF(dstCapacity < lhSize+1, dstSize_tooSmall, "not enough space for compression");
     {   HUF_repeat repeat = prevHuf->repeatMode;
-        int const preferRepeat = strategy < ZSTD_lazy ? srcSize <= 1024 : 0;
+        int const flags = 0
+            | (bmi2 ? HUF_flags_bmi2 : 0)
+            | (strategy < ZSTD_lazy && srcSize <= 1024 ? HUF_flags_preferRepeat : 0)
+            | (strategy >= HUF_OPTIMAL_DEPTH_THRESHOLD ? HUF_flags_optimalDepth : 0)
+            | (suspectUncompressible ? HUF_flags_suspectUncompressible : 0);
+
+        typedef size_t (*huf_compress_f)(void*, size_t, const void*, size_t, unsigned, unsigned, void*, size_t, HUF_CElt*, HUF_repeat*, int);
+        huf_compress_f huf_compress;
         if (repeat == HUF_repeat_valid && lhSize == 3) singleStream = 1;
-        cLitSize = singleStream ?
-            HUF_compress1X_repeat(
-                ostart+lhSize, dstCapacity-lhSize, src, srcSize,
-                HUF_SYMBOLVALUE_MAX, HUF_TABLELOG_DEFAULT, entropyWorkspace, entropyWorkspaceSize,
-                (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2, suspectUncompressible) :
-            HUF_compress4X_repeat(
-                ostart+lhSize, dstCapacity-lhSize, src, srcSize,
-                HUF_SYMBOLVALUE_MAX, HUF_TABLELOG_DEFAULT, entropyWorkspace, entropyWorkspaceSize,
-                (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2, suspectUncompressible);
+        huf_compress = singleStream ? HUF_compress1X_repeat : HUF_compress4X_repeat;
+        cLitSize = huf_compress(ostart+lhSize, dstCapacity-lhSize,
+                                src, srcSize,
+                                HUF_SYMBOLVALUE_MAX, LitHufLog,
+                                entropyWorkspace, entropyWorkspaceSize,
+                                (HUF_CElt*)nextHuf->CTable,
+                                &repeat, flags);
+        DEBUGLOG(5, "%zu literals compressed into %zu bytes (before header)", srcSize, cLitSize);
         if (repeat != HUF_repeat_none) {
             /* reused the existing table */
-            DEBUGLOG(5, "Reusing previous huffman table");
+            DEBUGLOG(5, "reusing statistics from previous huffman block");
             hType = set_repeat;
         }
     }
 
-    if ((cLitSize==0) || (cLitSize >= srcSize - minGain) || ERR_isError(cLitSize)) {
-        ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
-        return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize);
-    }
+    {   size_t const minGain = ZSTD_minGain(srcSize, strategy);
+        if ((cLitSize==0) || (cLitSize >= srcSize - minGain) || ERR_isError(cLitSize)) {
+            ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
+            return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize);
+    }   }
     if (cLitSize==1) {
-        ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
-        return ZSTD_compressRleLiteralsBlock(dst, dstCapacity, src, srcSize);
-    }
+        /* A return value of 1 signals that the alphabet consists of a single symbol.
+         * However, in some rare circumstances, it could be the compressed size (a single byte).
+         * For that outcome to have a chance to happen, it's necessary that `srcSize < 8`.
+         * (it's also necessary to not generate statistics).
+         * Therefore, in such a case, actively check that all bytes are identical. */
+        if ((srcSize >= 8) || allBytesIdentical(src, srcSize)) {
+            ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
+            return ZSTD_compressRleLiteralsBlock(dst, dstCapacity, src, srcSize);
+    }   }
 
     if (hType == set_compressed) {
         /* using a newly constructed table */
@@ -136,16 +210,19 @@ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf,
     switch(lhSize)
     {
     case 3: /* 2 - 2 - 10 - 10 */
-        {   U32 const lhc = hType + ((!singleStream) << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<14);
+        if (!singleStream) assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS);
+        {   U32 const lhc = hType + ((U32)(!singleStream) << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<14);
             MEM_writeLE24(ostart, lhc);
             break;
         }
     case 4: /* 2 - 2 - 14 - 14 */
+        assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS);
         {   U32 const lhc = hType + (2 << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<18);
             MEM_writeLE32(ostart, lhc);
             break;
         }
     case 5: /* 2 - 2 - 18 - 18 */
+        assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS);
         {   U32 const lhc = hType + (3 << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<22);
             MEM_writeLE32(ostart, lhc);
             ostart[4] = (BYTE)(cLitSize >> 10);
diff --git a/lib/zstd/compress/zstd_compress_literals.h b/lib/zstd/compress/zstd_compress_literals.h
index 9775fb97cb70..a2a85d6b69e5 100644
--- a/lib/zstd/compress/zstd_compress_literals.h
+++ b/lib/zstd/compress/zstd_compress_literals.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -16,16 +17,24 @@
 
 size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src, size_t srcSize);
 
+/* ZSTD_compressRleLiteralsBlock() :
+ * Conditions :
+ * - All bytes in @src are identical
+ * - dstCapacity >= 4 */
 size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize);
 
-/* If suspectUncompressible then some sampling checks will be run to potentially skip huffman coding */
-size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf,
-                              ZSTD_hufCTables_t* nextHuf,
-                              ZSTD_strategy strategy, int disableLiteralCompression,
-                              void* dst, size_t dstCapacity,
+/* ZSTD_compressLiterals():
+ * @entropyWorkspace: must be aligned on 4-bytes boundaries
+ * @entropyWorkspaceSize : must be >= HUF_WORKSPACE_SIZE
+ * @suspectUncompressible: sampling checks, to potentially skip huffman coding
+ */
+size_t ZSTD_compressLiterals (void* dst, size_t dstCapacity,
                         const void* src, size_t srcSize,
                               void* entropyWorkspace, size_t entropyWorkspaceSize,
-                        const int bmi2,
-                        unsigned suspectUncompressible);
+                        const ZSTD_hufCTables_t* prevHuf,
+                              ZSTD_hufCTables_t* nextHuf,
+                              ZSTD_strategy strategy, int disableLiteralCompression,
+                              int suspectUncompressible,
+                              int bmi2);
 
 #endif /* ZSTD_COMPRESS_LITERALS_H */
diff --git a/lib/zstd/compress/zstd_compress_sequences.c b/lib/zstd/compress/zstd_compress_sequences.c
index 21ddc1b37acf..5c028c78d889 100644
--- a/lib/zstd/compress/zstd_compress_sequences.c
+++ b/lib/zstd/compress/zstd_compress_sequences.c
@@ -1,5 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -58,7 +59,7 @@ static unsigned ZSTD_useLowProbCount(size_t const nbSeq)
 {
     /* Heuristic: This should cover most blocks <= 16K and
      * start to fade out after 16K to about 32K depending on
-     * comprssibility.
+     * compressibility.
      */
     return nbSeq >= 2048;
 }
@@ -166,7 +167,7 @@ ZSTD_selectEncodingType(
     if (mostFrequent == nbSeq) {
         *repeatMode = FSE_repeat_none;
         if (isDefaultAllowed && nbSeq <= 2) {
-            /* Prefer set_basic over set_rle when there are 2 or less symbols,
+            /* Prefer set_basic over set_rle when there are 2 or fewer symbols,
              * since RLE uses 1 byte, but set_basic uses 5-6 bits per symbol.
              * If basic encoding isn't possible, always choose RLE.
              */
diff --git a/lib/zstd/compress/zstd_compress_sequences.h b/lib/zstd/compress/zstd_compress_sequences.h
index 7991364c2f71..7fe6f4ff5cf2 100644
--- a/lib/zstd/compress/zstd_compress_sequences.h
+++ b/lib/zstd/compress/zstd_compress_sequences.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
diff --git a/lib/zstd/compress/zstd_compress_superblock.c b/lib/zstd/compress/zstd_compress_superblock.c
index 17d836cc84e8..dbacbaf72733 100644
--- a/lib/zstd/compress/zstd_compress_superblock.c
+++ b/lib/zstd/compress/zstd_compress_superblock.c
@@ -1,5 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -36,13 +37,14 @@
  *      If it is set_compressed, first sub-block's literals section will be Treeless_Literals_Block
  *      and the following sub-blocks' literals sections will be Treeless_Literals_Block.
  *  @return : compressed size of literals section of a sub-block
- *            Or 0 if it unable to compress.
+ *            Or 0 if unable to compress.
  *            Or error code */
-static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable,
-                                    const ZSTD_hufCTablesMetadata_t* hufMetadata,
-                                    const BYTE* literals, size_t litSize,
-                                    void* dst, size_t dstSize,
-                                    const int bmi2, int writeEntropy, int* entropyWritten)
+static size_t
+ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable,
+                              const ZSTD_hufCTablesMetadata_t* hufMetadata,
+                              const BYTE* literals, size_t litSize,
+                              void* dst, size_t dstSize,
+                              const int bmi2, int writeEntropy, int* entropyWritten)
 {
     size_t const header = writeEntropy ? 200 : 0;
     size_t const lhSize = 3 + (litSize >= (1 KB - header)) + (litSize >= (16 KB - header));
@@ -53,8 +55,6 @@ static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable,
     symbolEncodingType_e hType = writeEntropy ? hufMetadata->hType : set_repeat;
     size_t cLitSize = 0;
 
-    (void)bmi2; /* TODO bmi2... */
-
     DEBUGLOG(5, "ZSTD_compressSubBlock_literal (litSize=%zu, lhSize=%zu, writeEntropy=%d)", litSize, lhSize, writeEntropy);
 
     *entropyWritten = 0;
@@ -76,9 +76,9 @@ static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable,
         DEBUGLOG(5, "ZSTD_compressSubBlock_literal (hSize=%zu)", hufMetadata->hufDesSize);
     }
 
-    /* TODO bmi2 */
-    {   const size_t cSize = singleStream ? HUF_compress1X_usingCTable(op, oend-op, literals, litSize, hufTable)
-                                          : HUF_compress4X_usingCTable(op, oend-op, literals, litSize, hufTable);
+    {   int const flags = bmi2 ? HUF_flags_bmi2 : 0;
+        const size_t cSize = singleStream ? HUF_compress1X_usingCTable(op, oend-op, literals, litSize, hufTable, flags)
+                                          : HUF_compress4X_usingCTable(op, oend-op, literals, litSize, hufTable, flags);
         op += cSize;
         cLitSize += cSize;
         if (cSize == 0 || ERR_isError(cSize)) {
@@ -126,7 +126,11 @@ static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable,
     return op-ostart;
 }
 
-static size_t ZSTD_seqDecompressedSize(seqStore_t const* seqStore, const seqDef* sequences, size_t nbSeq, size_t litSize, int lastSequence) {
+static size_t
+ZSTD_seqDecompressedSize(seqStore_t const* seqStore,
+                   const seqDef* sequences, size_t nbSeq,
+                         size_t litSize, int lastSequence)
+{
     const seqDef* const sstart = sequences;
     const seqDef* const send = sequences + nbSeq;
     const seqDef* sp = sstart;
@@ -156,13 +160,14 @@ static size_t ZSTD_seqDecompressedSize(seqStore_t const* seqStore, const seqDef*
  *  @return : compressed size of sequences section of a sub-block
  *            Or 0 if it is unable to compress
  *            Or error code. */
-static size_t ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables,
-                                              const ZSTD_fseCTablesMetadata_t* fseMetadata,
-                                              const seqDef* sequences, size_t nbSeq,
-                                              const BYTE* llCode, const BYTE* mlCode, const BYTE* ofCode,
-                                              const ZSTD_CCtx_params* cctxParams,
-                                              void* dst, size_t dstCapacity,
-                                              const int bmi2, int writeEntropy, int* entropyWritten)
+static size_t
+ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables,
+                                const ZSTD_fseCTablesMetadata_t* fseMetadata,
+                                const seqDef* sequences, size_t nbSeq,
+                                const BYTE* llCode, const BYTE* mlCode, const BYTE* ofCode,
+                                const ZSTD_CCtx_params* cctxParams,
+                                void* dst, size_t dstCapacity,
+                                const int bmi2, int writeEntropy, int* entropyWritten)
 {
     const int longOffsets = cctxParams->cParams.windowLog > STREAM_ACCUMULATOR_MIN;
     BYTE* const ostart = (BYTE*)dst;
@@ -539,7 +544,7 @@ static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr,
             repcodes_t rep;
             ZSTD_memcpy(&rep, prevCBlock->rep, sizeof(rep));
             for (seq = sstart; seq < sp; ++seq) {
-                ZSTD_updateRep(rep.rep, seq->offBase - 1, ZSTD_getSequenceLength(seqStorePtr, seq).litLength == 0);
+                ZSTD_updateRep(rep.rep, seq->offBase, ZSTD_getSequenceLength(seqStorePtr, seq).litLength == 0);
             }
             ZSTD_memcpy(nextCBlock->rep, &rep, sizeof(rep));
         }
diff --git a/lib/zstd/compress/zstd_compress_superblock.h b/lib/zstd/compress/zstd_compress_superblock.h
index 224ece79546e..826bbc9e029b 100644
--- a/lib/zstd/compress/zstd_compress_superblock.h
+++ b/lib/zstd/compress/zstd_compress_superblock.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
diff --git a/lib/zstd/compress/zstd_cwksp.h b/lib/zstd/compress/zstd_cwksp.h
index 349fc923c355..65ea53b62844 100644
--- a/lib/zstd/compress/zstd_cwksp.h
+++ b/lib/zstd/compress/zstd_cwksp.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -14,7 +15,9 @@
 /*-*************************************
 *  Dependencies
 ***************************************/
+#include "../common/allocations.h"  /* ZSTD_customMalloc, ZSTD_customFree */
 #include "../common/zstd_internal.h"
+#include "../common/portability_macros.h"
 
 
 /*-*************************************
@@ -41,8 +44,9 @@
 ***************************************/
 typedef enum {
     ZSTD_cwksp_alloc_objects,
-    ZSTD_cwksp_alloc_buffers,
-    ZSTD_cwksp_alloc_aligned
+    ZSTD_cwksp_alloc_aligned_init_once,
+    ZSTD_cwksp_alloc_aligned,
+    ZSTD_cwksp_alloc_buffers
 } ZSTD_cwksp_alloc_phase_e;
 
 /*
@@ -95,8 +99,8 @@ typedef enum {
  *
  * Workspace Layout:
  *
- * [                        ... workspace ...                         ]
- * [objects][tables ... ->] free space [<- ... aligned][<- ... buffers]
+ * [                        ... workspace ...                           ]
+ * [objects][tables ->] free space [<- buffers][<- aligned][<- init once]
  *
  * The various objects that live in the workspace are divided into the
  * following categories, and are allocated separately:
@@ -120,9 +124,18 @@ typedef enum {
  *   uint32_t arrays, all of whose values are between 0 and (nextSrc - base).
  *   Their sizes depend on the cparams. These tables are 64-byte aligned.
  *
- * - Aligned: these buffers are used for various purposes that require 4 byte
- *   alignment, but don't require any initialization before they're used. These
- *   buffers are each aligned to 64 bytes.
+ * - Init once: these buffers require to be initialized at least once before
+ *   use. They should be used when we want to skip memory initialization
+ *   while not triggering memory checkers (like Valgrind) when reading from
+ *   from this memory without writing to it first.
+ *   These buffers should be used carefully as they might contain data
+ *   from previous compressions.
+ *   Buffers are aligned to 64 bytes.
+ *
+ * - Aligned: these buffers don't require any initialization before they're
+ *   used. The user of the buffer should make sure they write into a buffer
+ *   location before reading from it.
+ *   Buffers are aligned to 64 bytes.
  *
  * - Buffers: these buffers are used for various purposes that don't require
  *   any alignment or initialization before they're used. This means they can
@@ -134,8 +147,9 @@ typedef enum {
  * correctly packed into the workspace buffer. That order is:
  *
  * 1. Objects
- * 2. Buffers
- * 3. Aligned/Tables
+ * 2. Init once / Tables
+ * 3. Aligned / Tables
+ * 4. Buffers / Tables
  *
  * Attempts to reserve objects of different types out of order will fail.
  */
@@ -147,6 +161,7 @@ typedef struct {
     void* tableEnd;
     void* tableValidEnd;
     void* allocStart;
+    void* initOnceStart;
 
     BYTE allocFailed;
     int workspaceOversizedDuration;
@@ -159,6 +174,7 @@ typedef struct {
 ***************************************/
 
 MEM_STATIC size_t ZSTD_cwksp_available_space(ZSTD_cwksp* ws);
+MEM_STATIC void*  ZSTD_cwksp_initialAllocStart(ZSTD_cwksp* ws);
 
 MEM_STATIC void ZSTD_cwksp_assert_internal_consistency(ZSTD_cwksp* ws) {
     (void)ws;
@@ -168,6 +184,8 @@ MEM_STATIC void ZSTD_cwksp_assert_internal_consistency(ZSTD_cwksp* ws) {
     assert(ws->tableEnd <= ws->allocStart);
     assert(ws->tableValidEnd <= ws->allocStart);
     assert(ws->allocStart <= ws->workspaceEnd);
+    assert(ws->initOnceStart <= ZSTD_cwksp_initialAllocStart(ws));
+    assert(ws->workspace <= ws->initOnceStart);
 }
 
 /*
@@ -210,14 +228,10 @@ MEM_STATIC size_t ZSTD_cwksp_aligned_alloc_size(size_t size) {
  * for internal purposes (currently only alignment).
  */
 MEM_STATIC size_t ZSTD_cwksp_slack_space_required(void) {
-    /* For alignment, the wksp will always allocate an additional n_1=[1, 64] bytes
-     * to align the beginning of tables section, as well as another n_2=[0, 63] bytes
-     * to align the beginning of the aligned section.
-     *
-     * n_1 + n_2 == 64 bytes if the cwksp is freshly allocated, due to tables and
-     * aligneds being sized in multiples of 64 bytes.
+    /* For alignment, the wksp will always allocate an additional 2*ZSTD_CWKSP_ALIGNMENT_BYTES
+     * bytes to align the beginning of tables section and end of buffers;
      */
-    size_t const slackSpace = ZSTD_CWKSP_ALIGNMENT_BYTES;
+    size_t const slackSpace = ZSTD_CWKSP_ALIGNMENT_BYTES * 2;
     return slackSpace;
 }
 
@@ -230,10 +244,18 @@ MEM_STATIC size_t ZSTD_cwksp_bytes_to_align_ptr(void* ptr, const size_t alignByt
     size_t const alignBytesMask = alignBytes - 1;
     size_t const bytes = (alignBytes - ((size_t)ptr & (alignBytesMask))) & alignBytesMask;
     assert((alignBytes & alignBytesMask) == 0);
-    assert(bytes != ZSTD_CWKSP_ALIGNMENT_BYTES);
+    assert(bytes < alignBytes);
     return bytes;
 }
 
+/*
+ * Returns the initial value for allocStart which is used to determine the position from
+ * which we can allocate from the end of the workspace.
+ */
+MEM_STATIC void*  ZSTD_cwksp_initialAllocStart(ZSTD_cwksp* ws) {
+    return (void*)((size_t)ws->workspaceEnd & ~(ZSTD_CWKSP_ALIGNMENT_BYTES-1));
+}
+
 /*
  * Internal function. Do not use directly.
  * Reserves the given number of bytes within the aligned/buffer segment of the wksp,
@@ -274,27 +296,16 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase
 {
     assert(phase >= ws->phase);
     if (phase > ws->phase) {
-        /* Going from allocating objects to allocating buffers */
-        if (ws->phase < ZSTD_cwksp_alloc_buffers &&
-                phase >= ZSTD_cwksp_alloc_buffers) {
+        /* Going from allocating objects to allocating initOnce / tables */
+        if (ws->phase < ZSTD_cwksp_alloc_aligned_init_once &&
+            phase >= ZSTD_cwksp_alloc_aligned_init_once) {
             ws->tableValidEnd = ws->objectEnd;
-        }
+            ws->initOnceStart = ZSTD_cwksp_initialAllocStart(ws);
 
-        /* Going from allocating buffers to allocating aligneds/tables */
-        if (ws->phase < ZSTD_cwksp_alloc_aligned &&
-                phase >= ZSTD_cwksp_alloc_aligned) {
-            {   /* Align the start of the "aligned" to 64 bytes. Use [1, 64] bytes. */
-                size_t const bytesToAlign =
-                    ZSTD_CWKSP_ALIGNMENT_BYTES - ZSTD_cwksp_bytes_to_align_ptr(ws->allocStart, ZSTD_CWKSP_ALIGNMENT_BYTES);
-                DEBUGLOG(5, "reserving aligned alignment addtl space: %zu", bytesToAlign);
-                ZSTD_STATIC_ASSERT((ZSTD_CWKSP_ALIGNMENT_BYTES & (ZSTD_CWKSP_ALIGNMENT_BYTES - 1)) == 0); /* power of 2 */
-                RETURN_ERROR_IF(!ZSTD_cwksp_reserve_internal_buffer_space(ws, bytesToAlign),
-                                memory_allocation, "aligned phase - alignment initial allocation failed!");
-            }
             {   /* Align the start of the tables to 64 bytes. Use [0, 63] bytes */
-                void* const alloc = ws->objectEnd;
+                void *const alloc = ws->objectEnd;
                 size_t const bytesToAlign = ZSTD_cwksp_bytes_to_align_ptr(alloc, ZSTD_CWKSP_ALIGNMENT_BYTES);
-                void* const objectEnd = (BYTE*)alloc + bytesToAlign;
+                void *const objectEnd = (BYTE *) alloc + bytesToAlign;
                 DEBUGLOG(5, "reserving table alignment addtl space: %zu", bytesToAlign);
                 RETURN_ERROR_IF(objectEnd > ws->workspaceEnd, memory_allocation,
                                 "table phase - alignment initial allocation failed!");
@@ -302,7 +313,9 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase
                 ws->tableEnd = objectEnd;  /* table area starts being empty */
                 if (ws->tableValidEnd < ws->tableEnd) {
                     ws->tableValidEnd = ws->tableEnd;
-        }   }   }
+                }
+            }
+        }
         ws->phase = phase;
         ZSTD_cwksp_assert_internal_consistency(ws);
     }
@@ -314,7 +327,7 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase
  */
 MEM_STATIC int ZSTD_cwksp_owns_buffer(const ZSTD_cwksp* ws, const void* ptr)
 {
-    return (ptr != NULL) && (ws->workspace <= ptr) && (ptr <= ws->workspaceEnd);
+    return (ptr != NULL) && (ws->workspace <= ptr) && (ptr < ws->workspaceEnd);
 }
 
 /*
@@ -343,6 +356,33 @@ MEM_STATIC BYTE* ZSTD_cwksp_reserve_buffer(ZSTD_cwksp* ws, size_t bytes)
     return (BYTE*)ZSTD_cwksp_reserve_internal(ws, bytes, ZSTD_cwksp_alloc_buffers);
 }
 
+/*
+ * Reserves and returns memory sized on and aligned on ZSTD_CWKSP_ALIGNMENT_BYTES (64 bytes).
+ * This memory has been initialized at least once in the past.
+ * This doesn't mean it has been initialized this time, and it might contain data from previous
+ * operations.
+ * The main usage is for algorithms that might need read access into uninitialized memory.
+ * The algorithm must maintain safety under these conditions and must make sure it doesn't
+ * leak any of the past data (directly or in side channels).
+ */
+MEM_STATIC void* ZSTD_cwksp_reserve_aligned_init_once(ZSTD_cwksp* ws, size_t bytes)
+{
+    size_t const alignedBytes = ZSTD_cwksp_align(bytes, ZSTD_CWKSP_ALIGNMENT_BYTES);
+    void* ptr = ZSTD_cwksp_reserve_internal(ws, alignedBytes, ZSTD_cwksp_alloc_aligned_init_once);
+    assert(((size_t)ptr & (ZSTD_CWKSP_ALIGNMENT_BYTES-1))== 0);
+    if(ptr && ptr < ws->initOnceStart) {
+        /* We assume the memory following the current allocation is either:
+         * 1. Not usable as initOnce memory (end of workspace)
+         * 2. Another initOnce buffer that has been allocated before (and so was previously memset)
+         * 3. An ASAN redzone, in which case we don't want to write on it
+         * For these reasons it should be fine to not explicitly zero every byte up to ws->initOnceStart.
+         * Note that we assume here that MSAN and ASAN cannot run in the same time. */
+        ZSTD_memset(ptr, 0, MIN((size_t)((U8*)ws->initOnceStart - (U8*)ptr), alignedBytes));
+        ws->initOnceStart = ptr;
+    }
+    return ptr;
+}
+
 /*
  * Reserves and returns memory sized on and aligned on ZSTD_CWKSP_ALIGNMENT_BYTES (64 bytes).
  */
@@ -361,13 +401,17 @@ MEM_STATIC void* ZSTD_cwksp_reserve_aligned(ZSTD_cwksp* ws, size_t bytes)
  */
 MEM_STATIC void* ZSTD_cwksp_reserve_table(ZSTD_cwksp* ws, size_t bytes)
 {
-    const ZSTD_cwksp_alloc_phase_e phase = ZSTD_cwksp_alloc_aligned;
+    const ZSTD_cwksp_alloc_phase_e phase = ZSTD_cwksp_alloc_aligned_init_once;
     void* alloc;
     void* end;
     void* top;
 
-    if (ZSTD_isError(ZSTD_cwksp_internal_advance_phase(ws, phase))) {
-        return NULL;
+    /* We can only start allocating tables after we are done reserving space for objects at the
+     * start of the workspace */
+    if(ws->phase < phase) {
+        if (ZSTD_isError(ZSTD_cwksp_internal_advance_phase(ws, phase))) {
+            return NULL;
+        }
     }
     alloc = ws->tableEnd;
     end = (BYTE *)alloc + bytes;
@@ -451,7 +495,7 @@ MEM_STATIC void ZSTD_cwksp_clean_tables(ZSTD_cwksp* ws) {
     assert(ws->tableValidEnd >= ws->objectEnd);
     assert(ws->tableValidEnd <= ws->allocStart);
     if (ws->tableValidEnd < ws->tableEnd) {
-        ZSTD_memset(ws->tableValidEnd, 0, (BYTE*)ws->tableEnd - (BYTE*)ws->tableValidEnd);
+        ZSTD_memset(ws->tableValidEnd, 0, (size_t)((BYTE*)ws->tableEnd - (BYTE*)ws->tableValidEnd));
     }
     ZSTD_cwksp_mark_tables_clean(ws);
 }
@@ -478,10 +522,10 @@ MEM_STATIC void ZSTD_cwksp_clear(ZSTD_cwksp* ws) {
 
 
     ws->tableEnd = ws->objectEnd;
-    ws->allocStart = ws->workspaceEnd;
+    ws->allocStart = ZSTD_cwksp_initialAllocStart(ws);
     ws->allocFailed = 0;
-    if (ws->phase > ZSTD_cwksp_alloc_buffers) {
-        ws->phase = ZSTD_cwksp_alloc_buffers;
+    if (ws->phase > ZSTD_cwksp_alloc_aligned_init_once) {
+        ws->phase = ZSTD_cwksp_alloc_aligned_init_once;
     }
     ZSTD_cwksp_assert_internal_consistency(ws);
 }
@@ -498,6 +542,7 @@ MEM_STATIC void ZSTD_cwksp_init(ZSTD_cwksp* ws, void* start, size_t size, ZSTD_c
     ws->workspaceEnd = (BYTE*)start + size;
     ws->objectEnd = ws->workspace;
     ws->tableValidEnd = ws->objectEnd;
+    ws->initOnceStart = ZSTD_cwksp_initialAllocStart(ws);
     ws->phase = ZSTD_cwksp_alloc_objects;
     ws->isStatic = isStatic;
     ZSTD_cwksp_clear(ws);
@@ -550,17 +595,11 @@ MEM_STATIC int ZSTD_cwksp_reserve_failed(const ZSTD_cwksp* ws) {
  * Returns if the estimated space needed for a wksp is within an acceptable limit of the
  * actual amount of space used.
  */
-MEM_STATIC int ZSTD_cwksp_estimated_space_within_bounds(const ZSTD_cwksp* const ws,
-                                                        size_t const estimatedSpace, int resizedWorkspace) {
-    if (resizedWorkspace) {
-        /* Resized/newly allocated wksp should have exact bounds */
-        return ZSTD_cwksp_used(ws) == estimatedSpace;
-    } else {
-        /* Due to alignment, when reusing a workspace, we can actually consume 63 fewer or more bytes
-         * than estimatedSpace. See the comments in zstd_cwksp.h for details.
-         */
-        return (ZSTD_cwksp_used(ws) >= estimatedSpace - 63) && (ZSTD_cwksp_used(ws) <= estimatedSpace + 63);
-    }
+MEM_STATIC int ZSTD_cwksp_estimated_space_within_bounds(const ZSTD_cwksp *const ws, size_t const estimatedSpace) {
+    /* We have an alignment space between objects and tables between tables and buffers, so we can have up to twice
+     * the alignment bytes difference between estimation and actual usage */
+    return (estimatedSpace - ZSTD_cwksp_slack_space_required()) <= ZSTD_cwksp_used(ws) &&
+           ZSTD_cwksp_used(ws) <= estimatedSpace;
 }
 
 
diff --git a/lib/zstd/compress/zstd_double_fast.c b/lib/zstd/compress/zstd_double_fast.c
index 76933dea2624..ab9440a99603 100644
--- a/lib/zstd/compress/zstd_double_fast.c
+++ b/lib/zstd/compress/zstd_double_fast.c
@@ -1,5 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -11,8 +12,43 @@
 #include "zstd_compress_internal.h"
 #include "zstd_double_fast.h"
 
+static void ZSTD_fillDoubleHashTableForCDict(ZSTD_matchState_t* ms,
+                              void const* end, ZSTD_dictTableLoadMethod_e dtlm)
+{
+    const ZSTD_compressionParameters* const cParams = &ms->cParams;
+    U32* const hashLarge = ms->hashTable;
+    U32  const hBitsL = cParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS;
+    U32  const mls = cParams->minMatch;
+    U32* const hashSmall = ms->chainTable;
+    U32  const hBitsS = cParams->chainLog + ZSTD_SHORT_CACHE_TAG_BITS;
+    const BYTE* const base = ms->window.base;
+    const BYTE* ip = base + ms->nextToUpdate;
+    const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE;
+    const U32 fastHashFillStep = 3;
 
-void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms,
+    /* Always insert every fastHashFillStep position into the hash tables.
+     * Insert the other positions into the large hash table if their entry
+     * is empty.
+     */
+    for (; ip + fastHashFillStep - 1 <= iend; ip += fastHashFillStep) {
+        U32 const curr = (U32)(ip - base);
+        U32 i;
+        for (i = 0; i < fastHashFillStep; ++i) {
+            size_t const smHashAndTag = ZSTD_hashPtr(ip + i, hBitsS, mls);
+            size_t const lgHashAndTag = ZSTD_hashPtr(ip + i, hBitsL, 8);
+            if (i == 0) {
+                ZSTD_writeTaggedIndex(hashSmall, smHashAndTag, curr + i);
+            }
+            if (i == 0 || hashLarge[lgHashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS] == 0) {
+                ZSTD_writeTaggedIndex(hashLarge, lgHashAndTag, curr + i);
+            }
+            /* Only load extra positions for ZSTD_dtlm_full */
+            if (dtlm == ZSTD_dtlm_fast)
+                break;
+    }   }
+}
+
+static void ZSTD_fillDoubleHashTableForCCtx(ZSTD_matchState_t* ms,
                               void const* end, ZSTD_dictTableLoadMethod_e dtlm)
 {
     const ZSTD_compressionParameters* const cParams = &ms->cParams;
@@ -43,7 +79,19 @@ void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms,
             /* Only load extra positions for ZSTD_dtlm_full */
             if (dtlm == ZSTD_dtlm_fast)
                 break;
-    }   }
+        }   }
+}
+
+void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms,
+                        const void* const end,
+                        ZSTD_dictTableLoadMethod_e dtlm,
+                        ZSTD_tableFillPurpose_e tfp)
+{
+    if (tfp == ZSTD_tfp_forCDict) {
+        ZSTD_fillDoubleHashTableForCDict(ms, end, dtlm);
+    } else {
+        ZSTD_fillDoubleHashTableForCCtx(ms, end, dtlm);
+    }
 }
 
 
@@ -67,7 +115,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic(
     const BYTE* const iend = istart + srcSize;
     const BYTE* const ilimit = iend - HASH_READ_SIZE;
     U32 offset_1=rep[0], offset_2=rep[1];
-    U32 offsetSaved = 0;
+    U32 offsetSaved1 = 0, offsetSaved2 = 0;
 
     size_t mLength;
     U32 offset;
@@ -100,8 +148,8 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic(
         U32 const current = (U32)(ip - base);
         U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, current, cParams->windowLog);
         U32 const maxRep = current - windowLow;
-        if (offset_2 > maxRep) offsetSaved = offset_2, offset_2 = 0;
-        if (offset_1 > maxRep) offsetSaved = offset_1, offset_1 = 0;
+        if (offset_2 > maxRep) offsetSaved2 = offset_2, offset_2 = 0;
+        if (offset_1 > maxRep) offsetSaved1 = offset_1, offset_1 = 0;
     }
 
     /* Outer Loop: one iteration per match found and stored */
@@ -131,7 +179,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic(
             if ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1))) {
                 mLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4;
                 ip++;
-                ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength);
+                ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength);
                 goto _match_stored;
             }
 
@@ -175,9 +223,13 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic(
         } while (ip1 <= ilimit);
 
 _cleanup:
+        /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0),
+         * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */
+        offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2;
+
         /* save reps for next block */
-        rep[0] = offset_1 ? offset_1 : offsetSaved;
-        rep[1] = offset_2 ? offset_2 : offsetSaved;
+        rep[0] = offset_1 ? offset_1 : offsetSaved1;
+        rep[1] = offset_2 ? offset_2 : offsetSaved2;
 
         /* Return the last literals size */
         return (size_t)(iend - anchor);
@@ -217,7 +269,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic(
             hashLong[hl1] = (U32)(ip1 - base);
         }
 
-        ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength);
+        ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength);
 
 _match_stored:
         /* match found */
@@ -243,7 +295,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic(
                 U32 const tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff;  /* swap offset_2 <=> offset_1 */
                 hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = (U32)(ip-base);
                 hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = (U32)(ip-base);
-                ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, rLength);
+                ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, rLength);
                 ip += rLength;
                 anchor = ip;
                 continue;   /* faster when present ... (?) */
@@ -275,7 +327,6 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
     const BYTE* const iend = istart + srcSize;
     const BYTE* const ilimit = iend - HASH_READ_SIZE;
     U32 offset_1=rep[0], offset_2=rep[1];
-    U32 offsetSaved = 0;
 
     const ZSTD_matchState_t* const dms = ms->dictMatchState;
     const ZSTD_compressionParameters* const dictCParams = &dms->cParams;
@@ -286,8 +337,8 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
     const BYTE* const dictStart    = dictBase + dictStartIndex;
     const BYTE* const dictEnd      = dms->window.nextSrc;
     const U32 dictIndexDelta       = prefixLowestIndex - (U32)(dictEnd - dictBase);
-    const U32 dictHBitsL           = dictCParams->hashLog;
-    const U32 dictHBitsS           = dictCParams->chainLog;
+    const U32 dictHBitsL           = dictCParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS;
+    const U32 dictHBitsS           = dictCParams->chainLog + ZSTD_SHORT_CACHE_TAG_BITS;
     const U32 dictAndPrefixLength  = (U32)((ip - prefixLowest) + (dictEnd - dictStart));
 
     DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_dictMatchState_generic");
@@ -295,6 +346,13 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
     /* if a dictionary is attached, it must be within window range */
     assert(ms->window.dictLimit + (1U << cParams->windowLog) >= endIndex);
 
+    if (ms->prefetchCDictTables) {
+        size_t const hashTableBytes = (((size_t)1) << dictCParams->hashLog) * sizeof(U32);
+        size_t const chainTableBytes = (((size_t)1) << dictCParams->chainLog) * sizeof(U32);
+        PREFETCH_AREA(dictHashLong, hashTableBytes)
+        PREFETCH_AREA(dictHashSmall, chainTableBytes)
+    }
+
     /* init */
     ip += (dictAndPrefixLength == 0);
 
@@ -309,8 +367,12 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
         U32 offset;
         size_t const h2 = ZSTD_hashPtr(ip, hBitsL, 8);
         size_t const h = ZSTD_hashPtr(ip, hBitsS, mls);
-        size_t const dictHL = ZSTD_hashPtr(ip, dictHBitsL, 8);
-        size_t const dictHS = ZSTD_hashPtr(ip, dictHBitsS, mls);
+        size_t const dictHashAndTagL = ZSTD_hashPtr(ip, dictHBitsL, 8);
+        size_t const dictHashAndTagS = ZSTD_hashPtr(ip, dictHBitsS, mls);
+        U32 const dictMatchIndexAndTagL = dictHashLong[dictHashAndTagL >> ZSTD_SHORT_CACHE_TAG_BITS];
+        U32 const dictMatchIndexAndTagS = dictHashSmall[dictHashAndTagS >> ZSTD_SHORT_CACHE_TAG_BITS];
+        int const dictTagsMatchL = ZSTD_comparePackedTags(dictMatchIndexAndTagL, dictHashAndTagL);
+        int const dictTagsMatchS = ZSTD_comparePackedTags(dictMatchIndexAndTagS, dictHashAndTagS);
         U32 const curr = (U32)(ip-base);
         U32 const matchIndexL = hashLong[h2];
         U32 matchIndexS = hashSmall[h];
@@ -328,7 +390,7 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
             const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
             mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
             ip++;
-            ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength);
+            ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength);
             goto _match_stored;
         }
 
@@ -340,9 +402,9 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
                 while (((ip>anchor) & (matchLong>prefixLowest)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */
                 goto _match_found;
             }
-        } else {
+        } else if (dictTagsMatchL) {
             /* check dictMatchState long match */
-            U32 const dictMatchIndexL = dictHashLong[dictHL];
+            U32 const dictMatchIndexL = dictMatchIndexAndTagL >> ZSTD_SHORT_CACHE_TAG_BITS;
             const BYTE* dictMatchL = dictBase + dictMatchIndexL;
             assert(dictMatchL < dictEnd);
 
@@ -358,9 +420,9 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
             if (MEM_read32(match) == MEM_read32(ip)) {
                 goto _search_next_long;
             }
-        } else {
+        } else if (dictTagsMatchS) {
             /* check dictMatchState short match */
-            U32 const dictMatchIndexS = dictHashSmall[dictHS];
+            U32 const dictMatchIndexS = dictMatchIndexAndTagS >> ZSTD_SHORT_CACHE_TAG_BITS;
             match = dictBase + dictMatchIndexS;
             matchIndexS = dictMatchIndexS + dictIndexDelta;
 
@@ -375,10 +437,11 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
         continue;
 
 _search_next_long:
-
         {   size_t const hl3 = ZSTD_hashPtr(ip+1, hBitsL, 8);
-            size_t const dictHLNext = ZSTD_hashPtr(ip+1, dictHBitsL, 8);
+            size_t const dictHashAndTagL3 = ZSTD_hashPtr(ip+1, dictHBitsL, 8);
             U32 const matchIndexL3 = hashLong[hl3];
+            U32 const dictMatchIndexAndTagL3 = dictHashLong[dictHashAndTagL3 >> ZSTD_SHORT_CACHE_TAG_BITS];
+            int const dictTagsMatchL3 = ZSTD_comparePackedTags(dictMatchIndexAndTagL3, dictHashAndTagL3);
             const BYTE* matchL3 = base + matchIndexL3;
             hashLong[hl3] = curr + 1;
 
@@ -391,9 +454,9 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
                     while (((ip>anchor) & (matchL3>prefixLowest)) && (ip[-1] == matchL3[-1])) { ip--; matchL3--; mLength++; } /* catch up */
                     goto _match_found;
                 }
-            } else {
+            } else if (dictTagsMatchL3) {
                 /* check dict long +1 match */
-                U32 const dictMatchIndexL3 = dictHashLong[dictHLNext];
+                U32 const dictMatchIndexL3 = dictMatchIndexAndTagL3 >> ZSTD_SHORT_CACHE_TAG_BITS;
                 const BYTE* dictMatchL3 = dictBase + dictMatchIndexL3;
                 assert(dictMatchL3 < dictEnd);
                 if (dictMatchL3 > dictStart && MEM_read64(dictMatchL3) == MEM_read64(ip+1)) {
@@ -419,7 +482,7 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
         offset_2 = offset_1;
         offset_1 = offset;
 
-        ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength);
+        ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength);
 
 _match_stored:
         /* match found */
@@ -448,7 +511,7 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
                     const BYTE* const repEnd2 = repIndex2 < prefixLowestIndex ? dictEnd : iend;
                     size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixLowest) + 4;
                     U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset;   /* swap offset_2 <=> offset_1 */
-                    ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, repLength2);
+                    ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2);
                     hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2;
                     hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2;
                     ip += repLength2;
@@ -461,8 +524,8 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
     }   /* while (ip < ilimit) */
 
     /* save reps for next block */
-    rep[0] = offset_1 ? offset_1 : offsetSaved;
-    rep[1] = offset_2 ? offset_2 : offsetSaved;
+    rep[0] = offset_1;
+    rep[1] = offset_2;
 
     /* Return the last literals size */
     return (size_t)(iend - anchor);
@@ -585,7 +648,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic(
             const BYTE* repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend;
             mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4;
             ip++;
-            ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength);
+            ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength);
         } else {
             if ((matchLongIndex > dictStartIndex) && (MEM_read64(matchLong) == MEM_read64(ip))) {
                 const BYTE* const matchEnd = matchLongIndex < prefixStartIndex ? dictEnd : iend;
@@ -596,7 +659,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic(
                 while (((ip>anchor) & (matchLong>lowMatchPtr)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; }   /* catch up */
                 offset_2 = offset_1;
                 offset_1 = offset;
-                ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength);
+                ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength);
 
             } else if ((matchIndex > dictStartIndex) && (MEM_read32(match) == MEM_read32(ip))) {
                 size_t const h3 = ZSTD_hashPtr(ip+1, hBitsL, 8);
@@ -621,7 +684,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic(
                 }
                 offset_2 = offset_1;
                 offset_1 = offset;
-                ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength);
+                ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength);
 
             } else {
                 ip += ((ip-anchor) >> kSearchStrength) + 1;
@@ -653,7 +716,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic(
                     const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend;
                     size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4;
                     U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset;   /* swap offset_2 <=> offset_1 */
-                    ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, repLength2);
+                    ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2);
                     hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2;
                     hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2;
                     ip += repLength2;
diff --git a/lib/zstd/compress/zstd_double_fast.h b/lib/zstd/compress/zstd_double_fast.h
index 6822bde65a1d..0204f12e4cf7 100644
--- a/lib/zstd/compress/zstd_double_fast.h
+++ b/lib/zstd/compress/zstd_double_fast.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -16,7 +17,8 @@
 #include "zstd_compress_internal.h"     /* ZSTD_CCtx, size_t */
 
 void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms,
-                              void const* end, ZSTD_dictTableLoadMethod_e dtlm);
+                              void const* end, ZSTD_dictTableLoadMethod_e dtlm,
+                              ZSTD_tableFillPurpose_e tfp);
 size_t ZSTD_compressBlock_doubleFast(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize);
diff --git a/lib/zstd/compress/zstd_fast.c b/lib/zstd/compress/zstd_fast.c
index a752e6beab52..3399b39c5dbc 100644
--- a/lib/zstd/compress/zstd_fast.c
+++ b/lib/zstd/compress/zstd_fast.c
@@ -1,5 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -11,8 +12,42 @@
 #include "zstd_compress_internal.h"  /* ZSTD_hashPtr, ZSTD_count, ZSTD_storeSeq */
 #include "zstd_fast.h"
 
+static void ZSTD_fillHashTableForCDict(ZSTD_matchState_t* ms,
+                        const void* const end,
+                        ZSTD_dictTableLoadMethod_e dtlm)
+{
+    const ZSTD_compressionParameters* const cParams = &ms->cParams;
+    U32* const hashTable = ms->hashTable;
+    U32  const hBits = cParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS;
+    U32  const mls = cParams->minMatch;
+    const BYTE* const base = ms->window.base;
+    const BYTE* ip = base + ms->nextToUpdate;
+    const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE;
+    const U32 fastHashFillStep = 3;
 
-void ZSTD_fillHashTable(ZSTD_matchState_t* ms,
+    /* Currently, we always use ZSTD_dtlm_full for filling CDict tables.
+     * Feel free to remove this assert if there's a good reason! */
+    assert(dtlm == ZSTD_dtlm_full);
+
+    /* Always insert every fastHashFillStep position into the hash table.
+     * Insert the other positions if their hash entry is empty.
+     */
+    for ( ; ip + fastHashFillStep < iend + 2; ip += fastHashFillStep) {
+        U32 const curr = (U32)(ip - base);
+        {   size_t const hashAndTag = ZSTD_hashPtr(ip, hBits, mls);
+            ZSTD_writeTaggedIndex(hashTable, hashAndTag, curr);   }
+
+        if (dtlm == ZSTD_dtlm_fast) continue;
+        /* Only load extra positions for ZSTD_dtlm_full */
+        {   U32 p;
+            for (p = 1; p < fastHashFillStep; ++p) {
+                size_t const hashAndTag = ZSTD_hashPtr(ip + p, hBits, mls);
+                if (hashTable[hashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS] == 0) {  /* not yet filled */
+                    ZSTD_writeTaggedIndex(hashTable, hashAndTag, curr + p);
+                }   }   }   }
+}
+
+static void ZSTD_fillHashTableForCCtx(ZSTD_matchState_t* ms,
                         const void* const end,
                         ZSTD_dictTableLoadMethod_e dtlm)
 {
@@ -25,6 +60,10 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms,
     const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE;
     const U32 fastHashFillStep = 3;
 
+    /* Currently, we always use ZSTD_dtlm_fast for filling CCtx tables.
+     * Feel free to remove this assert if there's a good reason! */
+    assert(dtlm == ZSTD_dtlm_fast);
+
     /* Always insert every fastHashFillStep position into the hash table.
      * Insert the other positions if their hash entry is empty.
      */
@@ -42,6 +81,18 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms,
     }   }   }   }
 }
 
+void ZSTD_fillHashTable(ZSTD_matchState_t* ms,
+                        const void* const end,
+                        ZSTD_dictTableLoadMethod_e dtlm,
+                        ZSTD_tableFillPurpose_e tfp)
+{
+    if (tfp == ZSTD_tfp_forCDict) {
+        ZSTD_fillHashTableForCDict(ms, end, dtlm);
+    } else {
+        ZSTD_fillHashTableForCCtx(ms, end, dtlm);
+    }
+}
+
 
 /*
  * If you squint hard enough (and ignore repcodes), the search operation at any
@@ -117,7 +168,7 @@ ZSTD_compressBlock_fast_noDict_generic(
 
     U32 rep_offset1 = rep[0];
     U32 rep_offset2 = rep[1];
-    U32 offsetSaved = 0;
+    U32 offsetSaved1 = 0, offsetSaved2 = 0;
 
     size_t hash0; /* hash for ip0 */
     size_t hash1; /* hash for ip1 */
@@ -141,8 +192,8 @@ ZSTD_compressBlock_fast_noDict_generic(
     {   U32 const curr = (U32)(ip0 - base);
         U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, cParams->windowLog);
         U32 const maxRep = curr - windowLow;
-        if (rep_offset2 > maxRep) offsetSaved = rep_offset2, rep_offset2 = 0;
-        if (rep_offset1 > maxRep) offsetSaved = rep_offset1, rep_offset1 = 0;
+        if (rep_offset2 > maxRep) offsetSaved2 = rep_offset2, rep_offset2 = 0;
+        if (rep_offset1 > maxRep) offsetSaved1 = rep_offset1, rep_offset1 = 0;
     }
 
     /* start each op */
@@ -180,8 +231,14 @@ ZSTD_compressBlock_fast_noDict_generic(
             mLength = ip0[-1] == match0[-1];
             ip0 -= mLength;
             match0 -= mLength;
-            offcode = STORE_REPCODE_1;
+            offcode = REPCODE1_TO_OFFBASE;
             mLength += 4;
+
+            /* First write next hash table entry; we've already calculated it.
+             * This write is known to be safe because the ip1 is before the
+             * repcode (ip2). */
+            hashTable[hash1] = (U32)(ip1 - base);
+
             goto _match;
         }
 
@@ -195,6 +252,12 @@ ZSTD_compressBlock_fast_noDict_generic(
         /* check match at ip[0] */
         if (MEM_read32(ip0) == mval) {
             /* found a match! */
+
+            /* First write next hash table entry; we've already calculated it.
+             * This write is known to be safe because the ip1 == ip0 + 1, so
+             * we know we will resume searching after ip1 */
+            hashTable[hash1] = (U32)(ip1 - base);
+
             goto _offset;
         }
 
@@ -224,6 +287,21 @@ ZSTD_compressBlock_fast_noDict_generic(
         /* check match at ip[0] */
         if (MEM_read32(ip0) == mval) {
             /* found a match! */
+
+            /* first write next hash table entry; we've already calculated it */
+            if (step <= 4) {
+                /* We need to avoid writing an index into the hash table >= the
+                 * position at which we will pick up our searching after we've
+                 * taken this match.
+                 *
+                 * The minimum possible match has length 4, so the earliest ip0
+                 * can be after we take this match will be the current ip0 + 4.
+                 * ip1 is ip0 + step - 1. If ip1 is >= ip0 + 4, we can't safely
+                 * write this position.
+                 */
+                hashTable[hash1] = (U32)(ip1 - base);
+            }
+
             goto _offset;
         }
 
@@ -254,9 +332,24 @@ ZSTD_compressBlock_fast_noDict_generic(
      * However, it seems to be a meaningful performance hit to try to search
      * them. So let's not. */
 
+    /* When the repcodes are outside of the prefix, we set them to zero before the loop.
+     * When the offsets are still zero, we need to restore them after the block to have a correct
+     * repcode history. If only one offset was invalid, it is easy. The tricky case is when both
+     * offsets were invalid. We need to figure out which offset to refill with.
+     *     - If both offsets are zero they are in the same order.
+     *     - If both offsets are non-zero, we won't restore the offsets from `offsetSaved[12]`.
+     *     - If only one is zero, we need to decide which offset to restore.
+     *         - If rep_offset1 is non-zero, then rep_offset2 must be offsetSaved1.
+     *         - It is impossible for rep_offset2 to be non-zero.
+     *
+     * So if rep_offset1 started invalid (offsetSaved1 != 0) and became valid (rep_offset1 != 0), then
+     * set rep[0] = rep_offset1 and rep[1] = offsetSaved1.
+     */
+    offsetSaved2 = ((offsetSaved1 != 0) && (rep_offset1 != 0)) ? offsetSaved1 : offsetSaved2;
+
     /* save reps for next block */
-    rep[0] = rep_offset1 ? rep_offset1 : offsetSaved;
-    rep[1] = rep_offset2 ? rep_offset2 : offsetSaved;
+    rep[0] = rep_offset1 ? rep_offset1 : offsetSaved1;
+    rep[1] = rep_offset2 ? rep_offset2 : offsetSaved2;
 
     /* Return the last literals size */
     return (size_t)(iend - anchor);
@@ -267,7 +360,7 @@ ZSTD_compressBlock_fast_noDict_generic(
     match0 = base + idx;
     rep_offset2 = rep_offset1;
     rep_offset1 = (U32)(ip0-match0);
-    offcode = STORE_OFFSET(rep_offset1);
+    offcode = OFFSET_TO_OFFBASE(rep_offset1);
     mLength = 4;
 
     /* Count the backwards match length. */
@@ -287,11 +380,6 @@ ZSTD_compressBlock_fast_noDict_generic(
     ip0 += mLength;
     anchor = ip0;
 
-    /* write next hash table entry */
-    if (ip1 < ip0) {
-        hashTable[hash1] = (U32)(ip1 - base);
-    }
-
     /* Fill table and check for immediate repcode. */
     if (ip0 <= ilimit) {
         /* Fill Table */
@@ -306,7 +394,7 @@ ZSTD_compressBlock_fast_noDict_generic(
                 { U32 const tmpOff = rep_offset2; rep_offset2 = rep_offset1; rep_offset1 = tmpOff; } /* swap rep_offset2 <=> rep_offset1 */
                 hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base);
                 ip0 += rLength;
-                ZSTD_storeSeq(seqStore, 0 /*litLen*/, anchor, iend, STORE_REPCODE_1, rLength);
+                ZSTD_storeSeq(seqStore, 0 /*litLen*/, anchor, iend, REPCODE1_TO_OFFBASE, rLength);
                 anchor = ip0;
                 continue;   /* faster when present (confirmed on gcc-8) ... (?) */
     }   }   }
@@ -380,14 +468,14 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic(
     U32 const stepSize = cParams->targetLength + !(cParams->targetLength);
     const BYTE* const base = ms->window.base;
     const BYTE* const istart = (const BYTE*)src;
-    const BYTE* ip = istart;
+    const BYTE* ip0 = istart;
+    const BYTE* ip1 = ip0 + stepSize; /* we assert below that stepSize >= 1 */
     const BYTE* anchor = istart;
     const U32   prefixStartIndex = ms->window.dictLimit;
     const BYTE* const prefixStart = base + prefixStartIndex;
     const BYTE* const iend = istart + srcSize;
     const BYTE* const ilimit = iend - HASH_READ_SIZE;
     U32 offset_1=rep[0], offset_2=rep[1];
-    U32 offsetSaved = 0;
 
     const ZSTD_matchState_t* const dms = ms->dictMatchState;
     const ZSTD_compressionParameters* const dictCParams = &dms->cParams ;
@@ -397,13 +485,13 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic(
     const BYTE* const dictStart    = dictBase + dictStartIndex;
     const BYTE* const dictEnd      = dms->window.nextSrc;
     const U32 dictIndexDelta       = prefixStartIndex - (U32)(dictEnd - dictBase);
-    const U32 dictAndPrefixLength  = (U32)(ip - prefixStart + dictEnd - dictStart);
-    const U32 dictHLog             = dictCParams->hashLog;
+    const U32 dictAndPrefixLength  = (U32)(istart - prefixStart + dictEnd - dictStart);
+    const U32 dictHBits            = dictCParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS;
 
     /* if a dictionary is still attached, it necessarily means that
      * it is within window size. So we just check it. */
     const U32 maxDistance = 1U << cParams->windowLog;
-    const U32 endIndex = (U32)((size_t)(ip - base) + srcSize);
+    const U32 endIndex = (U32)((size_t)(istart - base) + srcSize);
     assert(endIndex - prefixStartIndex <= maxDistance);
     (void)maxDistance; (void)endIndex;   /* these variables are not used when assert() is disabled */
 
@@ -413,106 +501,155 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic(
      * when translating a dict index into a local index */
     assert(prefixStartIndex >= (U32)(dictEnd - dictBase));
 
+    if (ms->prefetchCDictTables) {
+        size_t const hashTableBytes = (((size_t)1) << dictCParams->hashLog) * sizeof(U32);
+        PREFETCH_AREA(dictHashTable, hashTableBytes)
+    }
+
     /* init */
     DEBUGLOG(5, "ZSTD_compressBlock_fast_dictMatchState_generic");
-    ip += (dictAndPrefixLength == 0);
+    ip0 += (dictAndPrefixLength == 0);
     /* dictMatchState repCode checks don't currently handle repCode == 0
      * disabling. */
     assert(offset_1 <= dictAndPrefixLength);
     assert(offset_2 <= dictAndPrefixLength);
 
-    /* Main Search Loop */
-    while (ip < ilimit) {   /* < instead of <=, because repcode check at (ip+1) */
+    /* Outer search loop */
+    assert(stepSize >= 1);
+    while (ip1 <= ilimit) {   /* repcode check at (ip0 + 1) is safe because ip0 < ip1 */
         size_t mLength;
-        size_t const h = ZSTD_hashPtr(ip, hlog, mls);
-        U32 const curr = (U32)(ip-base);
-        U32 const matchIndex = hashTable[h];
-        const BYTE* match = base + matchIndex;
-        const U32 repIndex = curr + 1 - offset_1;
-        const BYTE* repMatch = (repIndex < prefixStartIndex) ?
-                               dictBase + (repIndex - dictIndexDelta) :
-                               base + repIndex;
-        hashTable[h] = curr;   /* update hash table */
-
-        if ( ((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow : ensure repIndex isn't overlapping dict + prefix */
-          && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) {
-            const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend;
-            mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4;
-            ip++;
-            ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength);
-        } else if ( (matchIndex <= prefixStartIndex) ) {
-            size_t const dictHash = ZSTD_hashPtr(ip, dictHLog, mls);
-            U32 const dictMatchIndex = dictHashTable[dictHash];
-            const BYTE* dictMatch = dictBase + dictMatchIndex;
-            if (dictMatchIndex <= dictStartIndex ||
-                MEM_read32(dictMatch) != MEM_read32(ip)) {
-                assert(stepSize >= 1);
-                ip += ((ip-anchor) >> kSearchStrength) + stepSize;
-                continue;
-            } else {
-                /* found a dict match */
-                U32 const offset = (U32)(curr-dictMatchIndex-dictIndexDelta);
-                mLength = ZSTD_count_2segments(ip+4, dictMatch+4, iend, dictEnd, prefixStart) + 4;
-                while (((ip>anchor) & (dictMatch>dictStart))
-                     && (ip[-1] == dictMatch[-1])) {
-                    ip--; dictMatch--; mLength++;
+        size_t hash0 = ZSTD_hashPtr(ip0, hlog, mls);
+
+        size_t const dictHashAndTag0 = ZSTD_hashPtr(ip0, dictHBits, mls);
+        U32 dictMatchIndexAndTag = dictHashTable[dictHashAndTag0 >> ZSTD_SHORT_CACHE_TAG_BITS];
+        int dictTagsMatch = ZSTD_comparePackedTags(dictMatchIndexAndTag, dictHashAndTag0);
+
+        U32 matchIndex = hashTable[hash0];
+        U32 curr = (U32)(ip0 - base);
+        size_t step = stepSize;
+        const size_t kStepIncr = 1 << kSearchStrength;
+        const BYTE* nextStep = ip0 + kStepIncr;
+
+        /* Inner search loop */
+        while (1) {
+            const BYTE* match = base + matchIndex;
+            const U32 repIndex = curr + 1 - offset_1;
+            const BYTE* repMatch = (repIndex < prefixStartIndex) ?
+                                   dictBase + (repIndex - dictIndexDelta) :
+                                   base + repIndex;
+            const size_t hash1 = ZSTD_hashPtr(ip1, hlog, mls);
+            size_t const dictHashAndTag1 = ZSTD_hashPtr(ip1, dictHBits, mls);
+            hashTable[hash0] = curr;   /* update hash table */
+
+            if (((U32) ((prefixStartIndex - 1) - repIndex) >=
+                 3) /* intentional underflow : ensure repIndex isn't overlapping dict + prefix */
+                && (MEM_read32(repMatch) == MEM_read32(ip0 + 1))) {
+                const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend;
+                mLength = ZSTD_count_2segments(ip0 + 1 + 4, repMatch + 4, iend, repMatchEnd, prefixStart) + 4;
+                ip0++;
+                ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength);
+                break;
+            }
+
+            if (dictTagsMatch) {
+                /* Found a possible dict match */
+                const U32 dictMatchIndex = dictMatchIndexAndTag >> ZSTD_SHORT_CACHE_TAG_BITS;
+                const BYTE* dictMatch = dictBase + dictMatchIndex;
+                if (dictMatchIndex > dictStartIndex &&
+                    MEM_read32(dictMatch) == MEM_read32(ip0)) {
+                    /* To replicate extDict parse behavior, we only use dict matches when the normal matchIndex is invalid */
+                    if (matchIndex <= prefixStartIndex) {
+                        U32 const offset = (U32) (curr - dictMatchIndex - dictIndexDelta);
+                        mLength = ZSTD_count_2segments(ip0 + 4, dictMatch + 4, iend, dictEnd, prefixStart) + 4;
+                        while (((ip0 > anchor) & (dictMatch > dictStart))
+                            && (ip0[-1] == dictMatch[-1])) {
+                            ip0--;
+                            dictMatch--;
+                            mLength++;
+                        } /* catch up */
+                        offset_2 = offset_1;
+                        offset_1 = offset;
+                        ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength);
+                        break;
+                    }
+                }
+            }
+
+            if (matchIndex > prefixStartIndex && MEM_read32(match) == MEM_read32(ip0)) {
+                /* found a regular match */
+                U32 const offset = (U32) (ip0 - match);
+                mLength = ZSTD_count(ip0 + 4, match + 4, iend) + 4;
+                while (((ip0 > anchor) & (match > prefixStart))
+                       && (ip0[-1] == match[-1])) {
+                    ip0--;
+                    match--;
+                    mLength++;
                 } /* catch up */
                 offset_2 = offset_1;
                 offset_1 = offset;
-                ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength);
+                ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength);
+                break;
             }
-        } else if (MEM_read32(match) != MEM_read32(ip)) {
-            /* it's not a match, and we're not going to check the dictionary */
-            assert(stepSize >= 1);
-            ip += ((ip-anchor) >> kSearchStrength) + stepSize;
-            continue;
-        } else {
-            /* found a regular match */
-            U32 const offset = (U32)(ip-match);
-            mLength = ZSTD_count(ip+4, match+4, iend) + 4;
-            while (((ip>anchor) & (match>prefixStart))
-                 && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */
-            offset_2 = offset_1;
-            offset_1 = offset;
-            ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength);
-        }
+
+            /* Prepare for next iteration */
+            dictMatchIndexAndTag = dictHashTable[dictHashAndTag1 >> ZSTD_SHORT_CACHE_TAG_BITS];
+            dictTagsMatch = ZSTD_comparePackedTags(dictMatchIndexAndTag, dictHashAndTag1);
+            matchIndex = hashTable[hash1];
+
+            if (ip1 >= nextStep) {
+                step++;
+                nextStep += kStepIncr;
+            }
+            ip0 = ip1;
+            ip1 = ip1 + step;
+            if (ip1 > ilimit) goto _cleanup;
+
+            curr = (U32)(ip0 - base);
+            hash0 = hash1;
+        }   /* end inner search loop */
 
         /* match found */
-        ip += mLength;
-        anchor = ip;
+        assert(mLength);
+        ip0 += mLength;
+        anchor = ip0;
 
-        if (ip <= ilimit) {
+        if (ip0 <= ilimit) {
             /* Fill Table */
             assert(base+curr+2 > istart);  /* check base overflow */
             hashTable[ZSTD_hashPtr(base+curr+2, hlog, mls)] = curr+2;  /* here because curr+2 could be > iend-8 */
-            hashTable[ZSTD_hashPtr(ip-2, hlog, mls)] = (U32)(ip-2-base);
+            hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base);
 
             /* check immediate repcode */
-            while (ip <= ilimit) {
-                U32 const current2 = (U32)(ip-base);
+            while (ip0 <= ilimit) {
+                U32 const current2 = (U32)(ip0-base);
                 U32 const repIndex2 = current2 - offset_2;
                 const BYTE* repMatch2 = repIndex2 < prefixStartIndex ?
                         dictBase - dictIndexDelta + repIndex2 :
                         base + repIndex2;
                 if ( ((U32)((prefixStartIndex-1) - (U32)repIndex2) >= 3 /* intentional overflow */)
-                   && (MEM_read32(repMatch2) == MEM_read32(ip)) ) {
+                   && (MEM_read32(repMatch2) == MEM_read32(ip0))) {
                     const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend;
-                    size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4;
+                    size_t const repLength2 = ZSTD_count_2segments(ip0+4, repMatch2+4, iend, repEnd2, prefixStart) + 4;
                     U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset;   /* swap offset_2 <=> offset_1 */
-                    ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, repLength2);
-                    hashTable[ZSTD_hashPtr(ip, hlog, mls)] = current2;
-                    ip += repLength2;
-                    anchor = ip;
+                    ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2);
+                    hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = current2;
+                    ip0 += repLength2;
+                    anchor = ip0;
                     continue;
                 }
                 break;
             }
         }
+
+        /* Prepare for next iteration */
+        assert(ip0 == anchor);
+        ip1 = ip0 + stepSize;
     }
 
+_cleanup:
     /* save reps for next block */
-    rep[0] = offset_1 ? offset_1 : offsetSaved;
-    rep[1] = offset_2 ? offset_2 : offsetSaved;
+    rep[0] = offset_1;
+    rep[1] = offset_2;
 
     /* Return the last literals size */
     return (size_t)(iend - anchor);
@@ -553,11 +690,10 @@ static size_t ZSTD_compressBlock_fast_extDict_generic(
     U32* const hashTable = ms->hashTable;
     U32 const hlog = cParams->hashLog;
     /* support stepSize of 0 */
-    U32 const stepSize = cParams->targetLength + !(cParams->targetLength);
+    size_t const stepSize = cParams->targetLength + !(cParams->targetLength) + 1;
     const BYTE* const base = ms->window.base;
     const BYTE* const dictBase = ms->window.dictBase;
     const BYTE* const istart = (const BYTE*)src;
-    const BYTE* ip = istart;
     const BYTE* anchor = istart;
     const U32   endIndex = (U32)((size_t)(istart - base) + srcSize);
     const U32   lowLimit = ZSTD_getLowestMatchIndex(ms, endIndex, cParams->windowLog);
@@ -570,6 +706,28 @@ static size_t ZSTD_compressBlock_fast_extDict_generic(
     const BYTE* const iend = istart + srcSize;
     const BYTE* const ilimit = iend - 8;
     U32 offset_1=rep[0], offset_2=rep[1];
+    U32 offsetSaved1 = 0, offsetSaved2 = 0;
+
+    const BYTE* ip0 = istart;
+    const BYTE* ip1;
+    const BYTE* ip2;
+    const BYTE* ip3;
+    U32 current0;
+
+
+    size_t hash0; /* hash for ip0 */
+    size_t hash1; /* hash for ip1 */
+    U32 idx; /* match idx for ip0 */
+    const BYTE* idxBase; /* base pointer for idx */
+
+    U32 offcode;
+    const BYTE* match0;
+    size_t mLength;
+    const BYTE* matchEnd = 0; /* initialize to avoid warning, assert != 0 later */
+
+    size_t step;
+    const BYTE* nextStep;
+    const size_t kStepIncr = (1 << (kSearchStrength - 1));
 
     (void)hasStep; /* not currently specialized on whether it's accelerated */
 
@@ -579,75 +737,202 @@ static size_t ZSTD_compressBlock_fast_extDict_generic(
     if (prefixStartIndex == dictStartIndex)
         return ZSTD_compressBlock_fast(ms, seqStore, rep, src, srcSize);
 
-    /* Search Loop */
-    while (ip < ilimit) {  /* < instead of <=, because (ip+1) */
-        const size_t h = ZSTD_hashPtr(ip, hlog, mls);
-        const U32    matchIndex = hashTable[h];
-        const BYTE* const matchBase = matchIndex < prefixStartIndex ? dictBase : base;
-        const BYTE*  match = matchBase + matchIndex;
-        const U32    curr = (U32)(ip-base);
-        const U32    repIndex = curr + 1 - offset_1;
-        const BYTE* const repBase = repIndex < prefixStartIndex ? dictBase : base;
-        const BYTE* const repMatch = repBase + repIndex;
-        hashTable[h] = curr;   /* update hash table */
-        DEBUGLOG(7, "offset_1 = %u , curr = %u", offset_1, curr);
-
-        if ( ( ((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow */
-             & (offset_1 <= curr+1 - dictStartIndex) ) /* note: we are searching at curr+1 */
-           && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) {
-            const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend;
-            size_t const rLength = ZSTD_count_2segments(ip+1 +4, repMatch +4, iend, repMatchEnd, prefixStart) + 4;
-            ip++;
-            ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, rLength);
-            ip += rLength;
-            anchor = ip;
-        } else {
-            if ( (matchIndex < dictStartIndex) ||
-                 (MEM_read32(match) != MEM_read32(ip)) ) {
-                assert(stepSize >= 1);
-                ip += ((ip-anchor) >> kSearchStrength) + stepSize;
-                continue;
+    {   U32 const curr = (U32)(ip0 - base);
+        U32 const maxRep = curr - dictStartIndex;
+        if (offset_2 >= maxRep) offsetSaved2 = offset_2, offset_2 = 0;
+        if (offset_1 >= maxRep) offsetSaved1 = offset_1, offset_1 = 0;
+    }
+
+    /* start each op */
+_start: /* Requires: ip0 */
+
+    step = stepSize;
+    nextStep = ip0 + kStepIncr;
+
+    /* calculate positions, ip0 - anchor == 0, so we skip step calc */
+    ip1 = ip0 + 1;
+    ip2 = ip0 + step;
+    ip3 = ip2 + 1;
+
+    if (ip3 >= ilimit) {
+        goto _cleanup;
+    }
+
+    hash0 = ZSTD_hashPtr(ip0, hlog, mls);
+    hash1 = ZSTD_hashPtr(ip1, hlog, mls);
+
+    idx = hashTable[hash0];
+    idxBase = idx < prefixStartIndex ? dictBase : base;
+
+    do {
+        {   /* load repcode match for ip[2] */
+            U32 const current2 = (U32)(ip2 - base);
+            U32 const repIndex = current2 - offset_1;
+            const BYTE* const repBase = repIndex < prefixStartIndex ? dictBase : base;
+            U32 rval;
+            if ( ((U32)(prefixStartIndex - repIndex) >= 4) /* intentional underflow */
+                 & (offset_1 > 0) ) {
+                rval = MEM_read32(repBase + repIndex);
+            } else {
+                rval = MEM_read32(ip2) ^ 1; /* guaranteed to not match. */
             }
-            {   const BYTE* const matchEnd = matchIndex < prefixStartIndex ? dictEnd : iend;
-                const BYTE* const lowMatchPtr = matchIndex < prefixStartIndex ? dictStart : prefixStart;
-                U32 const offset = curr - matchIndex;
-                size_t mLength = ZSTD_count_2segments(ip+4, match+4, iend, matchEnd, prefixStart) + 4;
-                while (((ip>anchor) & (match>lowMatchPtr)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; }   /* catch up */
-                offset_2 = offset_1; offset_1 = offset;  /* update offset history */
-                ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength);
-                ip += mLength;
-                anchor = ip;
+
+            /* write back hash table entry */
+            current0 = (U32)(ip0 - base);
+            hashTable[hash0] = current0;
+
+            /* check repcode at ip[2] */
+            if (MEM_read32(ip2) == rval) {
+                ip0 = ip2;
+                match0 = repBase + repIndex;
+                matchEnd = repIndex < prefixStartIndex ? dictEnd : iend;
+                assert((match0 != prefixStart) & (match0 != dictStart));
+                mLength = ip0[-1] == match0[-1];
+                ip0 -= mLength;
+                match0 -= mLength;
+                offcode = REPCODE1_TO_OFFBASE;
+                mLength += 4;
+                goto _match;
         }   }
 
-        if (ip <= ilimit) {
-            /* Fill Table */
-            hashTable[ZSTD_hashPtr(base+curr+2, hlog, mls)] = curr+2;
-            hashTable[ZSTD_hashPtr(ip-2, hlog, mls)] = (U32)(ip-2-base);
-            /* check immediate repcode */
-            while (ip <= ilimit) {
-                U32 const current2 = (U32)(ip-base);
-                U32 const repIndex2 = current2 - offset_2;
-                const BYTE* const repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2;
-                if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) & (offset_2 <= curr - dictStartIndex))  /* intentional overflow */
-                   && (MEM_read32(repMatch2) == MEM_read32(ip)) ) {
-                    const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend;
-                    size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4;
-                    { U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; }  /* swap offset_2 <=> offset_1 */
-                    ZSTD_storeSeq(seqStore, 0 /*litlen*/, anchor, iend, STORE_REPCODE_1, repLength2);
-                    hashTable[ZSTD_hashPtr(ip, hlog, mls)] = current2;
-                    ip += repLength2;
-                    anchor = ip;
-                    continue;
-                }
-                break;
-    }   }   }
+        {   /* load match for ip[0] */
+            U32 const mval = idx >= dictStartIndex ?
+                    MEM_read32(idxBase + idx) :
+                    MEM_read32(ip0) ^ 1; /* guaranteed not to match */
+
+            /* check match at ip[0] */
+            if (MEM_read32(ip0) == mval) {
+                /* found a match! */
+                goto _offset;
+        }   }
+
+        /* lookup ip[1] */
+        idx = hashTable[hash1];
+        idxBase = idx < prefixStartIndex ? dictBase : base;
+
+        /* hash ip[2] */
+        hash0 = hash1;
+        hash1 = ZSTD_hashPtr(ip2, hlog, mls);
+
+        /* advance to next positions */
+        ip0 = ip1;
+        ip1 = ip2;
+        ip2 = ip3;
+
+        /* write back hash table entry */
+        current0 = (U32)(ip0 - base);
+        hashTable[hash0] = current0;
+
+        {   /* load match for ip[0] */
+            U32 const mval = idx >= dictStartIndex ?
+                    MEM_read32(idxBase + idx) :
+                    MEM_read32(ip0) ^ 1; /* guaranteed not to match */
+
+            /* check match at ip[0] */
+            if (MEM_read32(ip0) == mval) {
+                /* found a match! */
+                goto _offset;
+        }   }
+
+        /* lookup ip[1] */
+        idx = hashTable[hash1];
+        idxBase = idx < prefixStartIndex ? dictBase : base;
+
+        /* hash ip[2] */
+        hash0 = hash1;
+        hash1 = ZSTD_hashPtr(ip2, hlog, mls);
+
+        /* advance to next positions */
+        ip0 = ip1;
+        ip1 = ip2;
+        ip2 = ip0 + step;
+        ip3 = ip1 + step;
+
+        /* calculate step */
+        if (ip2 >= nextStep) {
+            step++;
+            PREFETCH_L1(ip1 + 64);
+            PREFETCH_L1(ip1 + 128);
+            nextStep += kStepIncr;
+        }
+    } while (ip3 < ilimit);
+
+_cleanup:
+    /* Note that there are probably still a couple positions we could search.
+     * However, it seems to be a meaningful performance hit to try to search
+     * them. So let's not. */
+
+    /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0),
+     * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */
+    offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2;
 
     /* save reps for next block */
-    rep[0] = offset_1;
-    rep[1] = offset_2;
+    rep[0] = offset_1 ? offset_1 : offsetSaved1;
+    rep[1] = offset_2 ? offset_2 : offsetSaved2;
 
     /* Return the last literals size */
     return (size_t)(iend - anchor);
+
+_offset: /* Requires: ip0, idx, idxBase */
+
+    /* Compute the offset code. */
+    {   U32 const offset = current0 - idx;
+        const BYTE* const lowMatchPtr = idx < prefixStartIndex ? dictStart : prefixStart;
+        matchEnd = idx < prefixStartIndex ? dictEnd : iend;
+        match0 = idxBase + idx;
+        offset_2 = offset_1;
+        offset_1 = offset;
+        offcode = OFFSET_TO_OFFBASE(offset);
+        mLength = 4;
+
+        /* Count the backwards match length. */
+        while (((ip0>anchor) & (match0>lowMatchPtr)) && (ip0[-1] == match0[-1])) {
+            ip0--;
+            match0--;
+            mLength++;
+    }   }
+
+_match: /* Requires: ip0, match0, offcode, matchEnd */
+
+    /* Count the forward length. */
+    assert(matchEnd != 0);
+    mLength += ZSTD_count_2segments(ip0 + mLength, match0 + mLength, iend, matchEnd, prefixStart);
+
+    ZSTD_storeSeq(seqStore, (size_t)(ip0 - anchor), anchor, iend, offcode, mLength);
+
+    ip0 += mLength;
+    anchor = ip0;
+
+    /* write next hash table entry */
+    if (ip1 < ip0) {
+        hashTable[hash1] = (U32)(ip1 - base);
+    }
+
+    /* Fill table and check for immediate repcode. */
+    if (ip0 <= ilimit) {
+        /* Fill Table */
+        assert(base+current0+2 > istart);  /* check base overflow */
+        hashTable[ZSTD_hashPtr(base+current0+2, hlog, mls)] = current0+2;  /* here because current+2 could be > iend-8 */
+        hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base);
+
+        while (ip0 <= ilimit) {
+            U32 const repIndex2 = (U32)(ip0-base) - offset_2;
+            const BYTE* const repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2;
+            if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) & (offset_2 > 0))  /* intentional underflow */
+                 && (MEM_read32(repMatch2) == MEM_read32(ip0)) ) {
+                const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend;
+                size_t const repLength2 = ZSTD_count_2segments(ip0+4, repMatch2+4, iend, repEnd2, prefixStart) + 4;
+                { U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; }  /* swap offset_2 <=> offset_1 */
+                ZSTD_storeSeq(seqStore, 0 /*litlen*/, anchor, iend, REPCODE1_TO_OFFBASE, repLength2);
+                hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base);
+                ip0 += repLength2;
+                anchor = ip0;
+                continue;
+            }
+            break;
+    }   }
+
+    goto _start;
 }
 
 ZSTD_GEN_FAST_FN(extDict, 4, 0)
@@ -660,6 +945,7 @@ size_t ZSTD_compressBlock_fast_extDict(
         void const* src, size_t srcSize)
 {
     U32 const mls = ms->cParams.minMatch;
+    assert(ms->dictMatchState == NULL);
     switch(mls)
     {
     default: /* includes case 3 */
diff --git a/lib/zstd/compress/zstd_fast.h b/lib/zstd/compress/zstd_fast.h
index fddc2f532d21..e64d9e1b2d39 100644
--- a/lib/zstd/compress/zstd_fast.h
+++ b/lib/zstd/compress/zstd_fast.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -16,7 +17,8 @@
 #include "zstd_compress_internal.h"
 
 void ZSTD_fillHashTable(ZSTD_matchState_t* ms,
-                        void const* end, ZSTD_dictTableLoadMethod_e dtlm);
+                        void const* end, ZSTD_dictTableLoadMethod_e dtlm,
+                        ZSTD_tableFillPurpose_e tfp);
 size_t ZSTD_compressBlock_fast(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize);
diff --git a/lib/zstd/compress/zstd_lazy.c b/lib/zstd/compress/zstd_lazy.c
index 0298a01a7504..f6b4978ceba7 100644
--- a/lib/zstd/compress/zstd_lazy.c
+++ b/lib/zstd/compress/zstd_lazy.c
@@ -1,5 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -10,6 +11,9 @@
 
 #include "zstd_compress_internal.h"
 #include "zstd_lazy.h"
+#include "../common/bits.h" /* ZSTD_countTrailingZeros64 */
+
+#define kLazySkippingStep 8
 
 
 /*-*************************************
@@ -197,8 +201,8 @@ ZSTD_DUBT_findBetterDictMatch (
             U32 matchIndex = dictMatchIndex + dictIndexDelta;
             if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) {
                 DEBUGLOG(9, "ZSTD_DUBT_findBetterDictMatch(%u) : found better match length %u -> %u and offsetCode %u -> %u (dictMatchIndex %u, matchIndex %u)",
-                    curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, STORE_OFFSET(curr - matchIndex), dictMatchIndex, matchIndex);
-                bestLength = matchLength, *offsetPtr = STORE_OFFSET(curr - matchIndex);
+                    curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, OFFSET_TO_OFFBASE(curr - matchIndex), dictMatchIndex, matchIndex);
+                bestLength = matchLength, *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex);
             }
             if (ip+matchLength == iend) {   /* reached end of input : ip[matchLength] is not valid, no way to know if it's larger or smaller than match */
                 break;   /* drop, to guarantee consistency (miss a little bit of compression) */
@@ -218,7 +222,7 @@ ZSTD_DUBT_findBetterDictMatch (
     }
 
     if (bestLength >= MINMATCH) {
-        U32 const mIndex = curr - (U32)STORED_OFFSET(*offsetPtr); (void)mIndex;
+        U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offsetPtr); (void)mIndex;
         DEBUGLOG(8, "ZSTD_DUBT_findBetterDictMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
                     curr, (U32)bestLength, (U32)*offsetPtr, mIndex);
     }
@@ -230,7 +234,7 @@ ZSTD_DUBT_findBetterDictMatch (
 static size_t
 ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
                         const BYTE* const ip, const BYTE* const iend,
-                        size_t* offsetPtr,
+                        size_t* offBasePtr,
                         U32 const mls,
                         const ZSTD_dictMode_e dictMode)
 {
@@ -327,8 +331,8 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
             if (matchLength > bestLength) {
                 if (matchLength > matchEndIdx - matchIndex)
                     matchEndIdx = matchIndex + (U32)matchLength;
-                if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) )
-                    bestLength = matchLength, *offsetPtr = STORE_OFFSET(curr - matchIndex);
+                if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr - matchIndex + 1) - ZSTD_highbit32((U32)*offBasePtr)) )
+                    bestLength = matchLength, *offBasePtr = OFFSET_TO_OFFBASE(curr - matchIndex);
                 if (ip+matchLength == iend) {   /* equal : no way to know if inf or sup */
                     if (dictMode == ZSTD_dictMatchState) {
                         nbCompares = 0; /* in addition to avoiding checking any
@@ -361,16 +365,16 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
         if (dictMode == ZSTD_dictMatchState && nbCompares) {
             bestLength = ZSTD_DUBT_findBetterDictMatch(
                     ms, ip, iend,
-                    offsetPtr, bestLength, nbCompares,
+                    offBasePtr, bestLength, nbCompares,
                     mls, dictMode);
         }
 
         assert(matchEndIdx > curr+8); /* ensure nextToUpdate is increased */
         ms->nextToUpdate = matchEndIdx - 8;   /* skip repetitive patterns */
         if (bestLength >= MINMATCH) {
-            U32 const mIndex = curr - (U32)STORED_OFFSET(*offsetPtr); (void)mIndex;
+            U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offBasePtr); (void)mIndex;
             DEBUGLOG(8, "ZSTD_DUBT_findBestMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
-                        curr, (U32)bestLength, (U32)*offsetPtr, mIndex);
+                        curr, (U32)bestLength, (U32)*offBasePtr, mIndex);
         }
         return bestLength;
     }
@@ -381,14 +385,14 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
 FORCE_INLINE_TEMPLATE size_t
 ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms,
                 const BYTE* const ip, const BYTE* const iLimit,
-                      size_t* offsetPtr,
+                      size_t* offBasePtr,
                 const U32 mls /* template */,
                 const ZSTD_dictMode_e dictMode)
 {
     DEBUGLOG(7, "ZSTD_BtFindBestMatch");
     if (ip < ms->window.base + ms->nextToUpdate) return 0;   /* skipped area */
     ZSTD_updateDUBT(ms, ip, iLimit, mls);
-    return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offsetPtr, mls, dictMode);
+    return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offBasePtr, mls, dictMode);
 }
 
 /* *********************************
@@ -561,7 +565,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb
         /* save best solution */
         if (currentMl > ml) {
             ml = currentMl;
-            *offsetPtr = STORE_OFFSET(curr - (matchIndex + ddsIndexDelta));
+            *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta));
             if (ip+currentMl == iLimit) {
                 /* best possible, avoids read overflow on next attempt */
                 return ml;
@@ -598,7 +602,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb
             /* save best solution */
             if (currentMl > ml) {
                 ml = currentMl;
-                *offsetPtr = STORE_OFFSET(curr - (matchIndex + ddsIndexDelta));
+                *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta));
                 if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
             }
         }
@@ -617,7 +621,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb
 FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal(
                         ZSTD_matchState_t* ms,
                         const ZSTD_compressionParameters* const cParams,
-                        const BYTE* ip, U32 const mls)
+                        const BYTE* ip, U32 const mls, U32 const lazySkipping)
 {
     U32* const hashTable  = ms->hashTable;
     const U32 hashLog = cParams->hashLog;
@@ -632,6 +636,9 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal(
         NEXT_IN_CHAIN(idx, chainMask) = hashTable[h];
         hashTable[h] = idx;
         idx++;
+        /* Stop inserting every position when in the lazy skipping mode. */
+        if (lazySkipping)
+            break;
     }
 
     ms->nextToUpdate = target;
@@ -640,7 +647,7 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal(
 
 U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) {
     const ZSTD_compressionParameters* const cParams = &ms->cParams;
-    return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch);
+    return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch, /* lazySkipping*/ 0);
 }
 
 /* inlining is important to hardwire a hot branch (template emulation) */
@@ -684,14 +691,15 @@ size_t ZSTD_HcFindBestMatch(
     }
 
     /* HC4 match finder */
-    matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls);
+    matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls, ms->lazySkipping);
 
     for ( ; (matchIndex>=lowLimit) & (nbAttempts>0) ; nbAttempts--) {
         size_t currentMl=0;
         if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
             const BYTE* const match = base + matchIndex;
             assert(matchIndex >= dictLimit);   /* ensures this is true if dictMode != ZSTD_extDict */
-            if (match[ml] == ip[ml])   /* potentially better */
+            /* read 4B starting from (match + ml + 1 - sizeof(U32)) */
+            if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3))   /* potentially better */
                 currentMl = ZSTD_count(ip, match, iLimit);
         } else {
             const BYTE* const match = dictBase + matchIndex;
@@ -703,7 +711,7 @@ size_t ZSTD_HcFindBestMatch(
         /* save best solution */
         if (currentMl > ml) {
             ml = currentMl;
-            *offsetPtr = STORE_OFFSET(curr - matchIndex);
+            *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex);
             if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
         }
 
@@ -739,7 +747,7 @@ size_t ZSTD_HcFindBestMatch(
             if (currentMl > ml) {
                 ml = currentMl;
                 assert(curr > matchIndex + dmsIndexDelta);
-                *offsetPtr = STORE_OFFSET(curr - (matchIndex + dmsIndexDelta));
+                *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta));
                 if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
             }
 
@@ -756,8 +764,6 @@ size_t ZSTD_HcFindBestMatch(
 * (SIMD) Row-based matchfinder
 ***********************************/
 /* Constants for row-based hash */
-#define ZSTD_ROW_HASH_TAG_OFFSET 16     /* byte offset of hashes in the match state's tagTable from the beginning of a row */
-#define ZSTD_ROW_HASH_TAG_BITS 8        /* nb bits to use for the tag */
 #define ZSTD_ROW_HASH_TAG_MASK ((1u << ZSTD_ROW_HASH_TAG_BITS) - 1)
 #define ZSTD_ROW_HASH_MAX_ENTRIES 64    /* absolute maximum number of entries per row, for all configurations */
 
@@ -769,64 +775,19 @@ typedef U64 ZSTD_VecMask;   /* Clarifies when we are interacting with a U64 repr
  * Starting from the LSB, returns the idx of the next non-zero bit.
  * Basically counting the nb of trailing zeroes.
  */
-static U32 ZSTD_VecMask_next(ZSTD_VecMask val) {
-    assert(val != 0);
-#   if (defined(__GNUC__) && ((__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))))
-    if (sizeof(size_t) == 4) {
-        U32 mostSignificantWord = (U32)(val >> 32);
-        U32 leastSignificantWord = (U32)val;
-        if (leastSignificantWord == 0) {
-            return 32 + (U32)__builtin_ctz(mostSignificantWord);
-        } else {
-            return (U32)__builtin_ctz(leastSignificantWord);
-        }
-    } else {
-        return (U32)__builtin_ctzll(val);
-    }
-#   else
-    /* Software ctz version: http://aggregate.org/MAGIC/#Trailing%20Zero%20Count
-     * and: https://stackoverflow.com/questions/2709430/count-number-of-bits-in-a-64-bit-long-big-integer
-     */
-    val = ~val & (val - 1ULL); /* Lowest set bit mask */
-    val = val - ((val >> 1) & 0x5555555555555555);
-    val = (val & 0x3333333333333333ULL) + ((val >> 2) & 0x3333333333333333ULL);
-    return (U32)((((val + (val >> 4)) & 0xF0F0F0F0F0F0F0FULL) * 0x101010101010101ULL) >> 56);
-#   endif
-}
-
-/* ZSTD_rotateRight_*():
- * Rotates a bitfield to the right by "count" bits.
- * https://en.wikipedia.org/w/index.php?title=Circular_shift&oldid=991635599#Implementing_circular_shifts
- */
-FORCE_INLINE_TEMPLATE
-U64 ZSTD_rotateRight_U64(U64 const value, U32 count) {
-    assert(count < 64);
-    count &= 0x3F; /* for fickle pattern recognition */
-    return (value >> count) | (U64)(value << ((0U - count) & 0x3F));
-}
-
-FORCE_INLINE_TEMPLATE
-U32 ZSTD_rotateRight_U32(U32 const value, U32 count) {
-    assert(count < 32);
-    count &= 0x1F; /* for fickle pattern recognition */
-    return (value >> count) | (U32)(value << ((0U - count) & 0x1F));
-}
-
-FORCE_INLINE_TEMPLATE
-U16 ZSTD_rotateRight_U16(U16 const value, U32 count) {
-    assert(count < 16);
-    count &= 0x0F; /* for fickle pattern recognition */
-    return (value >> count) | (U16)(value << ((0U - count) & 0x0F));
+MEM_STATIC U32 ZSTD_VecMask_next(ZSTD_VecMask val) {
+    return ZSTD_countTrailingZeros64(val);
 }
 
 /* ZSTD_row_nextIndex():
  * Returns the next index to insert at within a tagTable row, and updates the "head"
- * value to reflect the update. Essentially cycles backwards from [0, {entries per row})
+ * value to reflect the update. Essentially cycles backwards from [1, {entries per row})
  */
 FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextIndex(BYTE* const tagRow, U32 const rowMask) {
-  U32 const next = (*tagRow - 1) & rowMask;
-  *tagRow = (BYTE)next;
-  return next;
+    U32 next = (*tagRow-1) & rowMask;
+    next += (next == 0) ? rowMask : 0; /* skip first position */
+    *tagRow = (BYTE)next;
+    return next;
 }
 
 /* ZSTD_isAligned():
@@ -840,7 +801,7 @@ MEM_STATIC int ZSTD_isAligned(void const* ptr, size_t align) {
 /* ZSTD_row_prefetch():
  * Performs prefetching for the hashTable and tagTable at a given row.
  */
-FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, U16 const* tagTable, U32 const relRow, U32 const rowLog) {
+FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, BYTE const* tagTable, U32 const relRow, U32 const rowLog) {
     PREFETCH_L1(hashTable + relRow);
     if (rowLog >= 5) {
         PREFETCH_L1(hashTable + relRow + 16);
@@ -864,13 +825,13 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const B
                                    U32 idx, const BYTE* const iLimit)
 {
     U32 const* const hashTable = ms->hashTable;
-    U16 const* const tagTable = ms->tagTable;
+    BYTE const* const tagTable = ms->tagTable;
     U32 const hashLog = ms->rowHashLog;
     U32 const maxElemsToPrefetch = (base + idx) > iLimit ? 0 : (U32)(iLimit - (base + idx) + 1);
     U32 const lim = idx + MIN(ZSTD_ROW_HASH_CACHE_SIZE, maxElemsToPrefetch);
 
     for (; idx < lim; ++idx) {
-        U32 const hash = (U32)ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
+        U32 const hash = (U32)ZSTD_hashPtrSalted(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt);
         U32 const row = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
         ZSTD_row_prefetch(hashTable, tagTable, row, rowLog);
         ms->hashCache[idx & ZSTD_ROW_HASH_CACHE_MASK] = hash;
@@ -886,11 +847,12 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const B
  * base + idx + ZSTD_ROW_HASH_CACHE_SIZE. Also prefetches the appropriate rows from hashTable and tagTable.
  */
 FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable,
-                                                  U16 const* tagTable, BYTE const* base,
+                                                  BYTE const* tagTable, BYTE const* base,
                                                   U32 idx, U32 const hashLog,
-                                                  U32 const rowLog, U32 const mls)
+                                                  U32 const rowLog, U32 const mls,
+                                                  U64 const hashSalt)
 {
-    U32 const newHash = (U32)ZSTD_hashPtr(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
+    U32 const newHash = (U32)ZSTD_hashPtrSalted(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt);
     U32 const row = (newHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
     ZSTD_row_prefetch(hashTable, tagTable, row, rowLog);
     {   U32 const hash = cache[idx & ZSTD_ROW_HASH_CACHE_MASK];
@@ -908,22 +870,21 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms,
                                                         U32 const rowMask, U32 const useCache)
 {
     U32* const hashTable = ms->hashTable;
-    U16* const tagTable = ms->tagTable;
+    BYTE* const tagTable = ms->tagTable;
     U32 const hashLog = ms->rowHashLog;
     const BYTE* const base = ms->window.base;
 
     DEBUGLOG(6, "ZSTD_row_update_internalImpl(): updateStartIdx=%u, updateEndIdx=%u", updateStartIdx, updateEndIdx);
     for (; updateStartIdx < updateEndIdx; ++updateStartIdx) {
-        U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls)
-                                  : (U32)ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
+        U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls, ms->hashSalt)
+                                  : (U32)ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt);
         U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
         U32* const row = hashTable + relRow;
-        BYTE* tagRow = (BYTE*)(tagTable + relRow);  /* Though tagTable is laid out as a table of U16, each tag is only 1 byte.
-                                                       Explicit cast allows us to get exact desired position within each row */
+        BYTE* tagRow = tagTable + relRow;
         U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask);
 
-        assert(hash == ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls));
-        ((BYTE*)tagRow)[pos + ZSTD_ROW_HASH_TAG_OFFSET] = hash & ZSTD_ROW_HASH_TAG_MASK;
+        assert(hash == ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt));
+        tagRow[pos] = hash & ZSTD_ROW_HASH_TAG_MASK;
         row[pos] = updateStartIdx;
     }
 }
@@ -971,7 +932,35 @@ void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip) {
     const U32 mls = MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */);
 
     DEBUGLOG(5, "ZSTD_row_update(), rowLog=%u", rowLog);
-    ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* dont use cache */);
+    ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* don't use cache */);
+}
+
+/* Returns the mask width of bits group of which will be set to 1. Given not all
+ * architectures have easy movemask instruction, this helps to iterate over
+ * groups of bits easier and faster.
+ */
+FORCE_INLINE_TEMPLATE U32
+ZSTD_row_matchMaskGroupWidth(const U32 rowEntries)
+{
+    assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64);
+    assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES);
+    (void)rowEntries;
+#if defined(ZSTD_ARCH_ARM_NEON)
+    /* NEON path only works for little endian */
+    if (!MEM_isLittleEndian()) {
+        return 1;
+    }
+    if (rowEntries == 16) {
+        return 4;
+    }
+    if (rowEntries == 32) {
+        return 2;
+    }
+    if (rowEntries == 64) {
+        return 1;
+    }
+#endif
+    return 1;
 }
 
 #if defined(ZSTD_ARCH_X86_SSE2)
@@ -994,71 +983,82 @@ ZSTD_row_getSSEMask(int nbChunks, const BYTE* const src, const BYTE tag, const U
 }
 #endif
 
-/* Returns a ZSTD_VecMask (U32) that has the nth bit set to 1 if the newly-computed "tag" matches
- * the hash at the nth position in a row of the tagTable.
- * Each row is a circular buffer beginning at the value of "head". So we must rotate the "matches" bitfield
- * to match up with the actual layout of the entries within the hashTable */
+#if defined(ZSTD_ARCH_ARM_NEON)
+FORCE_INLINE_TEMPLATE ZSTD_VecMask
+ZSTD_row_getNEONMask(const U32 rowEntries, const BYTE* const src, const BYTE tag, const U32 headGrouped)
+{
+    assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64);
+    if (rowEntries == 16) {
+        /* vshrn_n_u16 shifts by 4 every u16 and narrows to 8 lower bits.
+         * After that groups of 4 bits represent the equalMask. We lower
+         * all bits except the highest in these groups by doing AND with
+         * 0x88 = 0b10001000.
+         */
+        const uint8x16_t chunk = vld1q_u8(src);
+        const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag)));
+        const uint8x8_t res = vshrn_n_u16(equalMask, 4);
+        const U64 matches = vget_lane_u64(vreinterpret_u64_u8(res), 0);
+        return ZSTD_rotateRight_U64(matches, headGrouped) & 0x8888888888888888ull;
+    } else if (rowEntries == 32) {
+        /* Same idea as with rowEntries == 16 but doing AND with
+         * 0x55 = 0b01010101.
+         */
+        const uint16x8x2_t chunk = vld2q_u16((const uint16_t*)(const void*)src);
+        const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]);
+        const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]);
+        const uint8x16_t dup = vdupq_n_u8(tag);
+        const uint8x8_t t0 = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(chunk0, dup)), 6);
+        const uint8x8_t t1 = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(chunk1, dup)), 6);
+        const uint8x8_t res = vsli_n_u8(t0, t1, 4);
+        const U64 matches = vget_lane_u64(vreinterpret_u64_u8(res), 0) ;
+        return ZSTD_rotateRight_U64(matches, headGrouped) & 0x5555555555555555ull;
+    } else { /* rowEntries == 64 */
+        const uint8x16x4_t chunk = vld4q_u8(src);
+        const uint8x16_t dup = vdupq_n_u8(tag);
+        const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup);
+        const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup);
+        const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup);
+        const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup);
+
+        const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1);
+        const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1);
+        const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2);
+        const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4);
+        const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4);
+        const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0);
+        return ZSTD_rotateRight_U64(matches, headGrouped);
+    }
+}
+#endif
+
+/* Returns a ZSTD_VecMask (U64) that has the nth group (determined by
+ * ZSTD_row_matchMaskGroupWidth) of bits set to 1 if the newly-computed "tag"
+ * matches the hash at the nth position in a row of the tagTable.
+ * Each row is a circular buffer beginning at the value of "headGrouped". So we
+ * must rotate the "matches" bitfield to match up with the actual layout of the
+ * entries within the hashTable */
 FORCE_INLINE_TEMPLATE ZSTD_VecMask
-ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, const U32 rowEntries)
+ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 headGrouped, const U32 rowEntries)
 {
-    const BYTE* const src = tagRow + ZSTD_ROW_HASH_TAG_OFFSET;
+    const BYTE* const src = tagRow;
     assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64);
     assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES);
+    assert(ZSTD_row_matchMaskGroupWidth(rowEntries) * rowEntries <= sizeof(ZSTD_VecMask) * 8);
 
 #if defined(ZSTD_ARCH_X86_SSE2)
 
-    return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, head);
+    return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, headGrouped);
 
 #else /* SW or NEON-LE */
 
 # if defined(ZSTD_ARCH_ARM_NEON)
   /* This NEON path only works for little endian - otherwise use SWAR below */
     if (MEM_isLittleEndian()) {
-        if (rowEntries == 16) {
-            const uint8x16_t chunk = vld1q_u8(src);
-            const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag)));
-            const uint16x8_t t0 = vshlq_n_u16(equalMask, 7);
-            const uint32x4_t t1 = vreinterpretq_u32_u16(vsriq_n_u16(t0, t0, 14));
-            const uint64x2_t t2 = vreinterpretq_u64_u32(vshrq_n_u32(t1, 14));
-            const uint8x16_t t3 = vreinterpretq_u8_u64(vsraq_n_u64(t2, t2, 28));
-            const U16 hi = (U16)vgetq_lane_u8(t3, 8);
-            const U16 lo = (U16)vgetq_lane_u8(t3, 0);
-            return ZSTD_rotateRight_U16((hi << 8) | lo, head);
-        } else if (rowEntries == 32) {
-            const uint16x8x2_t chunk = vld2q_u16((const U16*)(const void*)src);
-            const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]);
-            const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]);
-            const uint8x16_t equalMask0 = vceqq_u8(chunk0, vdupq_n_u8(tag));
-            const uint8x16_t equalMask1 = vceqq_u8(chunk1, vdupq_n_u8(tag));
-            const int8x8_t pack0 = vqmovn_s16(vreinterpretq_s16_u8(equalMask0));
-            const int8x8_t pack1 = vqmovn_s16(vreinterpretq_s16_u8(equalMask1));
-            const uint8x8_t t0 = vreinterpret_u8_s8(pack0);
-            const uint8x8_t t1 = vreinterpret_u8_s8(pack1);
-            const uint8x8_t t2 = vsri_n_u8(t1, t0, 2);
-            const uint8x8x2_t t3 = vuzp_u8(t2, t0);
-            const uint8x8_t t4 = vsri_n_u8(t3.val[1], t3.val[0], 4);
-            const U32 matches = vget_lane_u32(vreinterpret_u32_u8(t4), 0);
-            return ZSTD_rotateRight_U32(matches, head);
-        } else { /* rowEntries == 64 */
-            const uint8x16x4_t chunk = vld4q_u8(src);
-            const uint8x16_t dup = vdupq_n_u8(tag);
-            const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup);
-            const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup);
-            const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup);
-            const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup);
-
-            const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1);
-            const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1);
-            const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2);
-            const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4);
-            const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4);
-            const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0);
-            return ZSTD_rotateRight_U64(matches, head);
-        }
+        return ZSTD_row_getNEONMask(rowEntries, src, tag, headGrouped);
     }
 # endif /* ZSTD_ARCH_ARM_NEON */
     /* SWAR */
-    {   const size_t chunkSize = sizeof(size_t);
+    {   const int chunkSize = sizeof(size_t);
         const size_t shiftAmount = ((chunkSize * 8) - chunkSize);
         const size_t xFF = ~((size_t)0);
         const size_t x01 = xFF / 0xFF;
@@ -1091,11 +1091,11 @@ ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head,
         }
         matches = ~matches;
         if (rowEntries == 16) {
-            return ZSTD_rotateRight_U16((U16)matches, head);
+            return ZSTD_rotateRight_U16((U16)matches, headGrouped);
         } else if (rowEntries == 32) {
-            return ZSTD_rotateRight_U32((U32)matches, head);
+            return ZSTD_rotateRight_U32((U32)matches, headGrouped);
         } else {
-            return ZSTD_rotateRight_U64((U64)matches, head);
+            return ZSTD_rotateRight_U64((U64)matches, headGrouped);
         }
     }
 #endif
@@ -1125,7 +1125,7 @@ size_t ZSTD_RowFindBestMatch(
                         const U32 rowLog)
 {
     U32* const hashTable = ms->hashTable;
-    U16* const tagTable = ms->tagTable;
+    BYTE* const tagTable = ms->tagTable;
     U32* const hashCache = ms->hashCache;
     const U32 hashLog = ms->rowHashLog;
     const ZSTD_compressionParameters* const cParams = &ms->cParams;
@@ -1143,8 +1143,11 @@ size_t ZSTD_RowFindBestMatch(
     const U32 rowEntries = (1U << rowLog);
     const U32 rowMask = rowEntries - 1;
     const U32 cappedSearchLog = MIN(cParams->searchLog, rowLog); /* nb of searches is capped at nb entries per row */
+    const U32 groupWidth = ZSTD_row_matchMaskGroupWidth(rowEntries);
+    const U64 hashSalt = ms->hashSalt;
     U32 nbAttempts = 1U << cappedSearchLog;
     size_t ml=4-1;
+    U32 hash;
 
     /* DMS/DDS variables that may be referenced laster */
     const ZSTD_matchState_t* const dms = ms->dictMatchState;
@@ -1168,7 +1171,7 @@ size_t ZSTD_RowFindBestMatch(
     if (dictMode == ZSTD_dictMatchState) {
         /* Prefetch DMS rows */
         U32* const dmsHashTable = dms->hashTable;
-        U16* const dmsTagTable = dms->tagTable;
+        BYTE* const dmsTagTable = dms->tagTable;
         U32 const dmsHash = (U32)ZSTD_hashPtr(ip, dms->rowHashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
         U32 const dmsRelRow = (dmsHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
         dmsTag = dmsHash & ZSTD_ROW_HASH_TAG_MASK;
@@ -1178,23 +1181,34 @@ size_t ZSTD_RowFindBestMatch(
     }
 
     /* Update the hashTable and tagTable up to (but not including) ip */
-    ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */);
+    if (!ms->lazySkipping) {
+        ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */);
+        hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls, hashSalt);
+    } else {
+        /* Stop inserting every position when in the lazy skipping mode.
+         * The hash cache is also not kept up to date in this mode.
+         */
+        hash = (U32)ZSTD_hashPtrSalted(ip, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt);
+        ms->nextToUpdate = curr;
+    }
+    ms->hashSaltEntropy += hash; /* collect salt entropy */
+
     {   /* Get the hash for ip, compute the appropriate row */
-        U32 const hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls);
         U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
         U32 const tag = hash & ZSTD_ROW_HASH_TAG_MASK;
         U32* const row = hashTable + relRow;
         BYTE* tagRow = (BYTE*)(tagTable + relRow);
-        U32 const head = *tagRow & rowMask;
+        U32 const headGrouped = (*tagRow & rowMask) * groupWidth;
         U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES];
         size_t numMatches = 0;
         size_t currMatch = 0;
-        ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, head, rowEntries);
+        ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, headGrouped, rowEntries);
 
         /* Cycle through the matches and prefetch */
-        for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) {
-            U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask;
+        for (; (matches > 0) && (nbAttempts > 0); matches &= (matches - 1)) {
+            U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask;
             U32 const matchIndex = row[matchPos];
+            if(matchPos == 0) continue;
             assert(numMatches < rowEntries);
             if (matchIndex < lowLimit)
                 break;
@@ -1204,13 +1218,14 @@ size_t ZSTD_RowFindBestMatch(
                 PREFETCH_L1(dictBase + matchIndex);
             }
             matchBuffer[numMatches++] = matchIndex;
+            --nbAttempts;
         }
 
         /* Speed opt: insert current byte into hashtable too. This allows us to avoid one iteration of the loop
            in ZSTD_row_update_internal() at the next search. */
         {
             U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask);
-            tagRow[pos + ZSTD_ROW_HASH_TAG_OFFSET] = (BYTE)tag;
+            tagRow[pos] = (BYTE)tag;
             row[pos] = ms->nextToUpdate++;
         }
 
@@ -1224,7 +1239,8 @@ size_t ZSTD_RowFindBestMatch(
             if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
                 const BYTE* const match = base + matchIndex;
                 assert(matchIndex >= dictLimit);   /* ensures this is true if dictMode != ZSTD_extDict */
-                if (match[ml] == ip[ml])   /* potentially better */
+                /* read 4B starting from (match + ml + 1 - sizeof(U32)) */
+                if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3))   /* potentially better */
                     currentMl = ZSTD_count(ip, match, iLimit);
             } else {
                 const BYTE* const match = dictBase + matchIndex;
@@ -1236,7 +1252,7 @@ size_t ZSTD_RowFindBestMatch(
             /* Save best solution */
             if (currentMl > ml) {
                 ml = currentMl;
-                *offsetPtr = STORE_OFFSET(curr - matchIndex);
+                *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex);
                 if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
             }
         }
@@ -1254,19 +1270,21 @@ size_t ZSTD_RowFindBestMatch(
         const U32 dmsSize              = (U32)(dmsEnd - dmsBase);
         const U32 dmsIndexDelta        = dictLimit - dmsSize;
 
-        {   U32 const head = *dmsTagRow & rowMask;
+        {   U32 const headGrouped = (*dmsTagRow & rowMask) * groupWidth;
             U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES];
             size_t numMatches = 0;
             size_t currMatch = 0;
-            ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, head, rowEntries);
+            ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, headGrouped, rowEntries);
 
-            for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) {
-                U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask;
+            for (; (matches > 0) && (nbAttempts > 0); matches &= (matches - 1)) {
+                U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask;
                 U32 const matchIndex = dmsRow[matchPos];
+                if(matchPos == 0) continue;
                 if (matchIndex < dmsLowestIndex)
                     break;
                 PREFETCH_L1(dmsBase + matchIndex);
                 matchBuffer[numMatches++] = matchIndex;
+                --nbAttempts;
             }
 
             /* Return the longest match */
@@ -1285,7 +1303,7 @@ size_t ZSTD_RowFindBestMatch(
                 if (currentMl > ml) {
                     ml = currentMl;
                     assert(curr > matchIndex + dmsIndexDelta);
-                    *offsetPtr = STORE_OFFSET(curr - (matchIndex + dmsIndexDelta));
+                    *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta));
                     if (ip+currentMl == iLimit) break;
                 }
             }
@@ -1491,7 +1509,8 @@ ZSTD_compressBlock_lazy_generic(
     const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6);
     const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6);
 
-    U32 offset_1 = rep[0], offset_2 = rep[1], savedOffset=0;
+    U32 offset_1 = rep[0], offset_2 = rep[1];
+    U32 offsetSaved1 = 0, offsetSaved2 = 0;
 
     const int isDMS = dictMode == ZSTD_dictMatchState;
     const int isDDS = dictMode == ZSTD_dedicatedDictSearch;
@@ -1512,8 +1531,8 @@ ZSTD_compressBlock_lazy_generic(
         U32 const curr = (U32)(ip - base);
         U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, ms->cParams.windowLog);
         U32 const maxRep = curr - windowLow;
-        if (offset_2 > maxRep) savedOffset = offset_2, offset_2 = 0;
-        if (offset_1 > maxRep) savedOffset = offset_1, offset_1 = 0;
+        if (offset_2 > maxRep) offsetSaved2 = offset_2, offset_2 = 0;
+        if (offset_1 > maxRep) offsetSaved1 = offset_1, offset_1 = 0;
     }
     if (isDxS) {
         /* dictMatchState repCode checks don't currently handle repCode == 0
@@ -1522,10 +1541,11 @@ ZSTD_compressBlock_lazy_generic(
         assert(offset_2 <= dictAndPrefixLength);
     }
 
+    /* Reset the lazy skipping state */
+    ms->lazySkipping = 0;
+
     if (searchMethod == search_rowHash) {
-        ZSTD_row_fillHashCache(ms, base, rowLog,
-                            MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */),
-                            ms->nextToUpdate, ilimit);
+        ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit);
     }
 
     /* Match Loop */
@@ -1537,7 +1557,7 @@ ZSTD_compressBlock_lazy_generic(
 #endif
     while (ip < ilimit) {
         size_t matchLength=0;
-        size_t offcode=STORE_REPCODE_1;
+        size_t offBase = REPCODE1_TO_OFFBASE;
         const BYTE* start=ip+1;
         DEBUGLOG(7, "search baseline (depth 0)");
 
@@ -1562,14 +1582,23 @@ ZSTD_compressBlock_lazy_generic(
         }
 
         /* first search (depth 0) */
-        {   size_t offsetFound = 999999999;
-            size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offsetFound, mls, rowLog, searchMethod, dictMode);
+        {   size_t offbaseFound = 999999999;
+            size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offbaseFound, mls, rowLog, searchMethod, dictMode);
             if (ml2 > matchLength)
-                matchLength = ml2, start = ip, offcode=offsetFound;
+                matchLength = ml2, start = ip, offBase = offbaseFound;
         }
 
         if (matchLength < 4) {
-            ip += ((ip-anchor) >> kSearchStrength) + 1;   /* jump faster over incompressible sections */
+            size_t const step = ((size_t)(ip-anchor) >> kSearchStrength) + 1;   /* jump faster over incompressible sections */;
+            ip += step;
+            /* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time.
+             * In this mode we stop inserting every position into our tables, and only insert
+             * positions that we search, which is one in step positions.
+             * The exact cutoff is flexible, I've just chosen a number that is reasonably high,
+             * so we minimize the compression ratio loss in "normal" scenarios. This mode gets
+             * triggered once we've gone 2KB without finding any matches.
+             */
+            ms->lazySkipping = step > kLazySkippingStep;
             continue;
         }
 
@@ -1579,12 +1608,12 @@ ZSTD_compressBlock_lazy_generic(
             DEBUGLOG(7, "search depth 1");
             ip ++;
             if ( (dictMode == ZSTD_noDict)
-              && (offcode) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
+              && (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
                 size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4;
                 int const gain2 = (int)(mlRep * 3);
-                int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
+                int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1);
                 if ((mlRep >= 4) && (gain2 > gain1))
-                    matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip;
+                    matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip;
             }
             if (isDxS) {
                 const U32 repIndex = (U32)(ip - base) - offset_1;
@@ -1596,17 +1625,17 @@ ZSTD_compressBlock_lazy_generic(
                     const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
                     size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
                     int const gain2 = (int)(mlRep * 3);
-                    int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
+                    int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1);
                     if ((mlRep >= 4) && (gain2 > gain1))
-                        matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip;
+                        matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip;
                 }
             }
-            {   size_t offset2=999999999;
-                size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, dictMode);
-                int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2)));   /* raw approx */
-                int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 4);
+            {   size_t ofbCandidate=999999999;
+                size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode);
+                int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate));   /* raw approx */
+                int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4);
                 if ((ml2 >= 4) && (gain2 > gain1)) {
-                    matchLength = ml2, offcode = offset2, start = ip;
+                    matchLength = ml2, offBase = ofbCandidate, start = ip;
                     continue;   /* search a better one */
             }   }
 
@@ -1615,12 +1644,12 @@ ZSTD_compressBlock_lazy_generic(
                 DEBUGLOG(7, "search depth 2");
                 ip ++;
                 if ( (dictMode == ZSTD_noDict)
-                  && (offcode) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
+                  && (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
                     size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4;
                     int const gain2 = (int)(mlRep * 4);
-                    int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
+                    int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1);
                     if ((mlRep >= 4) && (gain2 > gain1))
-                        matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip;
+                        matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip;
                 }
                 if (isDxS) {
                     const U32 repIndex = (U32)(ip - base) - offset_1;
@@ -1632,17 +1661,17 @@ ZSTD_compressBlock_lazy_generic(
                         const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
                         size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
                         int const gain2 = (int)(mlRep * 4);
-                        int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
+                        int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1);
                         if ((mlRep >= 4) && (gain2 > gain1))
-                            matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip;
+                            matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip;
                     }
                 }
-                {   size_t offset2=999999999;
-                    size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, dictMode);
-                    int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2)));   /* raw approx */
-                    int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 7);
+                {   size_t ofbCandidate=999999999;
+                    size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode);
+                    int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate));   /* raw approx */
+                    int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7);
                     if ((ml2 >= 4) && (gain2 > gain1)) {
-                        matchLength = ml2, offcode = offset2, start = ip;
+                        matchLength = ml2, offBase = ofbCandidate, start = ip;
                         continue;
             }   }   }
             break;  /* nothing found : store previous solution */
@@ -1653,26 +1682,33 @@ ZSTD_compressBlock_lazy_generic(
          * notably if `value` is unsigned, resulting in a large positive `-value`.
          */
         /* catch up */
-        if (STORED_IS_OFFSET(offcode)) {
+        if (OFFBASE_IS_OFFSET(offBase)) {
             if (dictMode == ZSTD_noDict) {
-                while ( ((start > anchor) & (start - STORED_OFFSET(offcode) > prefixLowest))
-                     && (start[-1] == (start-STORED_OFFSET(offcode))[-1]) )  /* only search for offset within prefix */
+                while ( ((start > anchor) & (start - OFFBASE_TO_OFFSET(offBase) > prefixLowest))
+                     && (start[-1] == (start-OFFBASE_TO_OFFSET(offBase))[-1]) )  /* only search for offset within prefix */
                     { start--; matchLength++; }
             }
             if (isDxS) {
-                U32 const matchIndex = (U32)((size_t)(start-base) - STORED_OFFSET(offcode));
+                U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase));
                 const BYTE* match = (matchIndex < prefixLowestIndex) ? dictBase + matchIndex - dictIndexDelta : base + matchIndex;
                 const BYTE* const mStart = (matchIndex < prefixLowestIndex) ? dictLowest : prefixLowest;
                 while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; }  /* catch up */
             }
-            offset_2 = offset_1; offset_1 = (U32)STORED_OFFSET(offcode);
+            offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase);
         }
         /* store sequence */
 _storeSequence:
         {   size_t const litLength = (size_t)(start - anchor);
-            ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offcode, matchLength);
+            ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength);
             anchor = ip = start + matchLength;
         }
+        if (ms->lazySkipping) {
+            /* We've found a match, disable lazy skipping mode, and refill the hash cache. */
+            if (searchMethod == search_rowHash) {
+                ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit);
+            }
+            ms->lazySkipping = 0;
+        }
 
         /* check immediate repcode */
         if (isDxS) {
@@ -1686,8 +1722,8 @@ ZSTD_compressBlock_lazy_generic(
                    && (MEM_read32(repMatch) == MEM_read32(ip)) ) {
                     const BYTE* const repEnd2 = repIndex < prefixLowestIndex ? dictEnd : iend;
                     matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd2, prefixLowest) + 4;
-                    offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode;   /* swap offset_2 <=> offset_1 */
-                    ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength);
+                    offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase;   /* swap offset_2 <=> offset_1 */
+                    ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength);
                     ip += matchLength;
                     anchor = ip;
                     continue;
@@ -1701,16 +1737,20 @@ ZSTD_compressBlock_lazy_generic(
                  && (MEM_read32(ip) == MEM_read32(ip - offset_2)) ) {
                 /* store sequence */
                 matchLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4;
-                offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap repcodes */
-                ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength);
+                offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap repcodes */
+                ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength);
                 ip += matchLength;
                 anchor = ip;
                 continue;   /* faster when present ... (?) */
     }   }   }
 
-    /* Save reps for next block */
-    rep[0] = offset_1 ? offset_1 : savedOffset;
-    rep[1] = offset_2 ? offset_2 : savedOffset;
+    /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0),
+     * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */
+    offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2;
+
+    /* save reps for next block */
+    rep[0] = offset_1 ? offset_1 : offsetSaved1;
+    rep[1] = offset_2 ? offset_2 : offsetSaved2;
 
     /* Return the last literals size */
     return (size_t)(iend - anchor);
@@ -1886,12 +1926,13 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
 
     DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic (searchFunc=%u)", (U32)searchMethod);
 
+    /* Reset the lazy skipping state */
+    ms->lazySkipping = 0;
+
     /* init */
     ip += (ip == prefixStart);
     if (searchMethod == search_rowHash) {
-        ZSTD_row_fillHashCache(ms, base, rowLog,
-                               MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */),
-                               ms->nextToUpdate, ilimit);
+        ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit);
     }
 
     /* Match Loop */
@@ -1903,7 +1944,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
 #endif
     while (ip < ilimit) {
         size_t matchLength=0;
-        size_t offcode=STORE_REPCODE_1;
+        size_t offBase = REPCODE1_TO_OFFBASE;
         const BYTE* start=ip+1;
         U32 curr = (U32)(ip-base);
 
@@ -1922,14 +1963,23 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
         }   }
 
         /* first search (depth 0) */
-        {   size_t offsetFound = 999999999;
-            size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offsetFound, mls, rowLog, searchMethod, ZSTD_extDict);
+        {   size_t ofbCandidate = 999999999;
+            size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict);
             if (ml2 > matchLength)
-                matchLength = ml2, start = ip, offcode=offsetFound;
+                matchLength = ml2, start = ip, offBase = ofbCandidate;
         }
 
         if (matchLength < 4) {
-            ip += ((ip-anchor) >> kSearchStrength) + 1;   /* jump faster over incompressible sections */
+            size_t const step = ((size_t)(ip-anchor) >> kSearchStrength);
+            ip += step + 1;   /* jump faster over incompressible sections */
+            /* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time.
+             * In this mode we stop inserting every position into our tables, and only insert
+             * positions that we search, which is one in step positions.
+             * The exact cutoff is flexible, I've just chosen a number that is reasonably high,
+             * so we minimize the compression ratio loss in "normal" scenarios. This mode gets
+             * triggered once we've gone 2KB without finding any matches.
+             */
+            ms->lazySkipping = step > kLazySkippingStep;
             continue;
         }
 
@@ -1939,7 +1989,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
             ip ++;
             curr++;
             /* check repCode */
-            if (offcode) {
+            if (offBase) {
                 const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog);
                 const U32 repIndex = (U32)(curr - offset_1);
                 const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
@@ -1951,18 +2001,18 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
                     const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
                     size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
                     int const gain2 = (int)(repLength * 3);
-                    int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
+                    int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1);
                     if ((repLength >= 4) && (gain2 > gain1))
-                        matchLength = repLength, offcode = STORE_REPCODE_1, start = ip;
+                        matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip;
             }   }
 
             /* search match, depth 1 */
-            {   size_t offset2=999999999;
-                size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, ZSTD_extDict);
-                int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2)));   /* raw approx */
-                int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 4);
+            {   size_t ofbCandidate = 999999999;
+                size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict);
+                int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate));   /* raw approx */
+                int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4);
                 if ((ml2 >= 4) && (gain2 > gain1)) {
-                    matchLength = ml2, offcode = offset2, start = ip;
+                    matchLength = ml2, offBase = ofbCandidate, start = ip;
                     continue;   /* search a better one */
             }   }
 
@@ -1971,7 +2021,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
                 ip ++;
                 curr++;
                 /* check repCode */
-                if (offcode) {
+                if (offBase) {
                     const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog);
                     const U32 repIndex = (U32)(curr - offset_1);
                     const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
@@ -1983,38 +2033,45 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
                         const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
                         size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
                         int const gain2 = (int)(repLength * 4);
-                        int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
+                        int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1);
                         if ((repLength >= 4) && (gain2 > gain1))
-                            matchLength = repLength, offcode = STORE_REPCODE_1, start = ip;
+                            matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip;
                 }   }
 
                 /* search match, depth 2 */
-                {   size_t offset2=999999999;
-                    size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, ZSTD_extDict);
-                    int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2)));   /* raw approx */
-                    int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 7);
+                {   size_t ofbCandidate = 999999999;
+                    size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict);
+                    int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate));   /* raw approx */
+                    int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7);
                     if ((ml2 >= 4) && (gain2 > gain1)) {
-                        matchLength = ml2, offcode = offset2, start = ip;
+                        matchLength = ml2, offBase = ofbCandidate, start = ip;
                         continue;
             }   }   }
             break;  /* nothing found : store previous solution */
         }
 
         /* catch up */
-        if (STORED_IS_OFFSET(offcode)) {
-            U32 const matchIndex = (U32)((size_t)(start-base) - STORED_OFFSET(offcode));
+        if (OFFBASE_IS_OFFSET(offBase)) {
+            U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase));
             const BYTE* match = (matchIndex < dictLimit) ? dictBase + matchIndex : base + matchIndex;
             const BYTE* const mStart = (matchIndex < dictLimit) ? dictStart : prefixStart;
             while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; }  /* catch up */
-            offset_2 = offset_1; offset_1 = (U32)STORED_OFFSET(offcode);
+            offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase);
         }
 
         /* store sequence */
 _storeSequence:
         {   size_t const litLength = (size_t)(start - anchor);
-            ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offcode, matchLength);
+            ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength);
             anchor = ip = start + matchLength;
         }
+        if (ms->lazySkipping) {
+            /* We've found a match, disable lazy skipping mode, and refill the hash cache. */
+            if (searchMethod == search_rowHash) {
+                ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit);
+            }
+            ms->lazySkipping = 0;
+        }
 
         /* check immediate repcode */
         while (ip <= ilimit) {
@@ -2029,8 +2086,8 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
                 /* repcode detected we should take it */
                 const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
                 matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
-                offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode;   /* swap offset history */
-                ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength);
+                offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase;   /* swap offset history */
+                ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength);
                 ip += matchLength;
                 anchor = ip;
                 continue;   /* faster when present ... (?) */
@@ -2096,7 +2153,6 @@ size_t ZSTD_compressBlock_lazy_extDict_row(
 size_t ZSTD_compressBlock_lazy2_extDict_row(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize)
-
 {
     return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2);
 }
diff --git a/lib/zstd/compress/zstd_lazy.h b/lib/zstd/compress/zstd_lazy.h
index e5bdf4df8dde..9505bed93c03 100644
--- a/lib/zstd/compress/zstd_lazy.h
+++ b/lib/zstd/compress/zstd_lazy.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -22,6 +23,8 @@
  */
 #define ZSTD_LAZY_DDSS_BUCKET_LOG 2
 
+#define ZSTD_ROW_HASH_TAG_BITS 8        /* nb bits to use for the tag */
+
 U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip);
 void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip);
 
@@ -113,7 +116,7 @@ size_t ZSTD_compressBlock_lazy2_extDict_row(
 size_t ZSTD_compressBlock_btlazy2_extDict(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize);
-        
+
 
 
 #endif /* ZSTD_LAZY_H */
diff --git a/lib/zstd/compress/zstd_ldm.c b/lib/zstd/compress/zstd_ldm.c
index dd86fc83e7dd..b7da76b0db7c 100644
--- a/lib/zstd/compress/zstd_ldm.c
+++ b/lib/zstd/compress/zstd_ldm.c
@@ -1,5 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -242,11 +243,11 @@ static size_t ZSTD_ldm_fillFastTables(ZSTD_matchState_t* ms,
     switch(ms->cParams.strategy)
     {
     case ZSTD_fast:
-        ZSTD_fillHashTable(ms, iend, ZSTD_dtlm_fast);
+        ZSTD_fillHashTable(ms, iend, ZSTD_dtlm_fast, ZSTD_tfp_forCCtx);
         break;
 
     case ZSTD_dfast:
-        ZSTD_fillDoubleHashTable(ms, iend, ZSTD_dtlm_fast);
+        ZSTD_fillDoubleHashTable(ms, iend, ZSTD_dtlm_fast, ZSTD_tfp_forCCtx);
         break;
 
     case ZSTD_greedy:
@@ -549,7 +550,7 @@ size_t ZSTD_ldm_generateSequences(
          * the window through early invalidation.
          * TODO: * Test the chunk size.
          *       * Try invalidation after the sequence generation and test the
-         *         the offset against maxDist directly.
+         *         offset against maxDist directly.
          *
          * NOTE: Because of dictionaries + sequence splitting we MUST make sure
          * that any offset used is valid at the END of the sequence, since it may
@@ -711,7 +712,7 @@ size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore,
             rep[0] = sequence.offset;
             /* Store the sequence */
             ZSTD_storeSeq(seqStore, newLitLength, ip - newLitLength, iend,
-                          STORE_OFFSET(sequence.offset),
+                          OFFSET_TO_OFFBASE(sequence.offset),
                           sequence.matchLength);
             ip += sequence.matchLength;
         }
diff --git a/lib/zstd/compress/zstd_ldm.h b/lib/zstd/compress/zstd_ldm.h
index fbc6a5e88fd7..c540731abde7 100644
--- a/lib/zstd/compress/zstd_ldm.h
+++ b/lib/zstd/compress/zstd_ldm.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
diff --git a/lib/zstd/compress/zstd_ldm_geartab.h b/lib/zstd/compress/zstd_ldm_geartab.h
index 647f865be290..cfccfc46f6f7 100644
--- a/lib/zstd/compress/zstd_ldm_geartab.h
+++ b/lib/zstd/compress/zstd_ldm_geartab.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
diff --git a/lib/zstd/compress/zstd_opt.c b/lib/zstd/compress/zstd_opt.c
index fd82acfda62f..1e41cb04f482 100644
--- a/lib/zstd/compress/zstd_opt.c
+++ b/lib/zstd/compress/zstd_opt.c
@@ -1,5 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /*
- * Copyright (c) Przemyslaw Skibinski, Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -16,7 +17,7 @@
 #define ZSTD_LITFREQ_ADD    2   /* scaling factor for litFreq, so that frequencies adapt faster to new stats */
 #define ZSTD_MAX_PRICE     (1<<30)
 
-#define ZSTD_PREDEF_THRESHOLD 1024   /* if srcSize < ZSTD_PREDEF_THRESHOLD, symbols' cost is assumed static, directly determined by pre-defined distributions */
+#define ZSTD_PREDEF_THRESHOLD 8   /* if srcSize < ZSTD_PREDEF_THRESHOLD, symbols' cost is assumed static, directly determined by pre-defined distributions */
 
 
 /*-*************************************
@@ -26,27 +27,35 @@
 #if 0    /* approximation at bit level (for tests) */
 #  define BITCOST_ACCURACY 0
 #  define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY)
-#  define WEIGHT(stat, opt) ((void)opt, ZSTD_bitWeight(stat))
+#  define WEIGHT(stat, opt) ((void)(opt), ZSTD_bitWeight(stat))
 #elif 0  /* fractional bit accuracy (for tests) */
 #  define BITCOST_ACCURACY 8
 #  define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY)
-#  define WEIGHT(stat,opt) ((void)opt, ZSTD_fracWeight(stat))
+#  define WEIGHT(stat,opt) ((void)(opt), ZSTD_fracWeight(stat))
 #else    /* opt==approx, ultra==accurate */
 #  define BITCOST_ACCURACY 8
 #  define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY)
-#  define WEIGHT(stat,opt) (opt ? ZSTD_fracWeight(stat) : ZSTD_bitWeight(stat))
+#  define WEIGHT(stat,opt) ((opt) ? ZSTD_fracWeight(stat) : ZSTD_bitWeight(stat))
 #endif
 
+/* ZSTD_bitWeight() :
+ * provide estimated "cost" of a stat in full bits only */
 MEM_STATIC U32 ZSTD_bitWeight(U32 stat)
 {
     return (ZSTD_highbit32(stat+1) * BITCOST_MULTIPLIER);
 }
 
+/* ZSTD_fracWeight() :
+ * provide fractional-bit "cost" of a stat,
+ * using linear interpolation approximation */
 MEM_STATIC U32 ZSTD_fracWeight(U32 rawStat)
 {
     U32 const stat = rawStat + 1;
     U32 const hb = ZSTD_highbit32(stat);
     U32 const BWeight = hb * BITCOST_MULTIPLIER;
+    /* Fweight was meant for "Fractional weight"
+     * but it's effectively a value between 1 and 2
+     * using fixed point arithmetic */
     U32 const FWeight = (stat << BITCOST_ACCURACY) >> hb;
     U32 const weight = BWeight + FWeight;
     assert(hb + BITCOST_ACCURACY < 31);
@@ -57,7 +66,7 @@ MEM_STATIC U32 ZSTD_fracWeight(U32 rawStat)
 /* debugging function,
  * @return price in bytes as fractional value
  * for debug messages only */
-MEM_STATIC double ZSTD_fCost(U32 price)
+MEM_STATIC double ZSTD_fCost(int price)
 {
     return (double)price / (BITCOST_MULTIPLIER*8);
 }
@@ -88,20 +97,26 @@ static U32 sum_u32(const unsigned table[], size_t nbElts)
     return total;
 }
 
-static U32 ZSTD_downscaleStats(unsigned* table, U32 lastEltIndex, U32 shift)
+typedef enum { base_0possible=0, base_1guaranteed=1 } base_directive_e;
+
+static U32
+ZSTD_downscaleStats(unsigned* table, U32 lastEltIndex, U32 shift, base_directive_e base1)
 {
     U32 s, sum=0;
-    DEBUGLOG(5, "ZSTD_downscaleStats (nbElts=%u, shift=%u)", (unsigned)lastEltIndex+1, (unsigned)shift);
+    DEBUGLOG(5, "ZSTD_downscaleStats (nbElts=%u, shift=%u)",
+            (unsigned)lastEltIndex+1, (unsigned)shift );
     assert(shift < 30);
     for (s=0; s<lastEltIndex+1; s++) {
-        table[s] = 1 + (table[s] >> shift);
-        sum += table[s];
+        unsigned const base = base1 ? 1 : (table[s]>0);
+        unsigned const newStat = base + (table[s] >> shift);
+        sum += newStat;
+        table[s] = newStat;
     }
     return sum;
 }
 
 /* ZSTD_scaleStats() :
- * reduce all elements in table is sum too large
+ * reduce all elt frequencies in table if sum too large
  * return the resulting sum of elements */
 static U32 ZSTD_scaleStats(unsigned* table, U32 lastEltIndex, U32 logTarget)
 {
@@ -110,7 +125,7 @@ static U32 ZSTD_scaleStats(unsigned* table, U32 lastEltIndex, U32 logTarget)
     DEBUGLOG(5, "ZSTD_scaleStats (nbElts=%u, target=%u)", (unsigned)lastEltIndex+1, (unsigned)logTarget);
     assert(logTarget < 30);
     if (factor <= 1) return prevsum;
-    return ZSTD_downscaleStats(table, lastEltIndex, ZSTD_highbit32(factor));
+    return ZSTD_downscaleStats(table, lastEltIndex, ZSTD_highbit32(factor), base_1guaranteed);
 }
 
 /* ZSTD_rescaleFreqs() :
@@ -129,18 +144,22 @@ ZSTD_rescaleFreqs(optState_t* const optPtr,
     DEBUGLOG(5, "ZSTD_rescaleFreqs (srcSize=%u)", (unsigned)srcSize);
     optPtr->priceType = zop_dynamic;
 
-    if (optPtr->litLengthSum == 0) {  /* first block : init */
-        if (srcSize <= ZSTD_PREDEF_THRESHOLD) {  /* heuristic */
-            DEBUGLOG(5, "(srcSize <= ZSTD_PREDEF_THRESHOLD) => zop_predef");
+    if (optPtr->litLengthSum == 0) {  /* no literals stats collected -> first block assumed -> init */
+
+        /* heuristic: use pre-defined stats for too small inputs */
+        if (srcSize <= ZSTD_PREDEF_THRESHOLD) {
+            DEBUGLOG(5, "srcSize <= %i : use predefined stats", ZSTD_PREDEF_THRESHOLD);
             optPtr->priceType = zop_predef;
         }
 
         assert(optPtr->symbolCosts != NULL);
         if (optPtr->symbolCosts->huf.repeatMode == HUF_repeat_valid) {
-            /* huffman table presumed generated by dictionary */
+
+            /* huffman stats covering the full value set : table presumed generated by dictionary */
             optPtr->priceType = zop_dynamic;
 
             if (compressedLiterals) {
+                /* generate literals statistics from huffman table */
                 unsigned lit;
                 assert(optPtr->litFreq != NULL);
                 optPtr->litSum = 0;
@@ -188,13 +207,14 @@ ZSTD_rescaleFreqs(optState_t* const optPtr,
                     optPtr->offCodeSum += optPtr->offCodeFreq[of];
             }   }
 
-        } else {  /* not a dictionary */
+        } else {  /* first block, no dictionary */
 
             assert(optPtr->litFreq != NULL);
             if (compressedLiterals) {
+                /* base initial cost of literals on direct frequency within src */
                 unsigned lit = MaxLit;
                 HIST_count_simple(optPtr->litFreq, &lit, src, srcSize);   /* use raw first block to init statistics */
-                optPtr->litSum = ZSTD_downscaleStats(optPtr->litFreq, MaxLit, 8);
+                optPtr->litSum = ZSTD_downscaleStats(optPtr->litFreq, MaxLit, 8, base_0possible);
             }
 
             {   unsigned const baseLLfreqs[MaxLL+1] = {
@@ -224,10 +244,9 @@ ZSTD_rescaleFreqs(optState_t* const optPtr,
                 optPtr->offCodeSum = sum_u32(baseOFCfreqs, MaxOff+1);
             }
 
-
         }
 
-    } else {   /* new block : re-use previous statistics, scaled down */
+    } else {   /* new block : scale down accumulated statistics */
 
         if (compressedLiterals)
             optPtr->litSum = ZSTD_scaleStats(optPtr->litFreq, MaxLit, 12);
@@ -255,11 +274,14 @@ static U32 ZSTD_rawLiteralsCost(const BYTE* const literals, U32 const litLength,
         return (litLength*6) * BITCOST_MULTIPLIER;  /* 6 bit per literal - no statistic used */
 
     /* dynamic statistics */
-    {   U32 price = litLength * optPtr->litSumBasePrice;
+    {   U32 price = optPtr->litSumBasePrice * litLength;
+        U32 const litPriceMax = optPtr->litSumBasePrice - BITCOST_MULTIPLIER;
         U32 u;
+        assert(optPtr->litSumBasePrice >= BITCOST_MULTIPLIER);
         for (u=0; u < litLength; u++) {
-            assert(WEIGHT(optPtr->litFreq[literals[u]], optLevel) <= optPtr->litSumBasePrice);   /* literal cost should never be negative */
-            price -= WEIGHT(optPtr->litFreq[literals[u]], optLevel);
+            U32 litPrice = WEIGHT(optPtr->litFreq[literals[u]], optLevel);
+            if (UNLIKELY(litPrice > litPriceMax)) litPrice = litPriceMax;
+            price -= litPrice;
         }
         return price;
     }
@@ -272,10 +294,11 @@ static U32 ZSTD_litLengthPrice(U32 const litLength, const optState_t* const optP
     assert(litLength <= ZSTD_BLOCKSIZE_MAX);
     if (optPtr->priceType == zop_predef)
         return WEIGHT(litLength, optLevel);
-    /* We can't compute the litLength price for sizes >= ZSTD_BLOCKSIZE_MAX
-     * because it isn't representable in the zstd format. So instead just
-     * call it 1 bit more than ZSTD_BLOCKSIZE_MAX - 1. In this case the block
-     * would be all literals.
+
+    /* ZSTD_LLcode() can't compute litLength price for sizes >= ZSTD_BLOCKSIZE_MAX
+     * because it isn't representable in the zstd format.
+     * So instead just pretend it would cost 1 bit more than ZSTD_BLOCKSIZE_MAX - 1.
+     * In such a case, the block would be all literals.
      */
     if (litLength == ZSTD_BLOCKSIZE_MAX)
         return BITCOST_MULTIPLIER + ZSTD_litLengthPrice(ZSTD_BLOCKSIZE_MAX - 1, optPtr, optLevel);
@@ -289,24 +312,25 @@ static U32 ZSTD_litLengthPrice(U32 const litLength, const optState_t* const optP
 }
 
 /* ZSTD_getMatchPrice() :
- * Provides the cost of the match part (offset + matchLength) of a sequence
+ * Provides the cost of the match part (offset + matchLength) of a sequence.
  * Must be combined with ZSTD_fullLiteralsCost() to get the full cost of a sequence.
- * @offcode : expects a scale where 0,1,2 are repcodes 1-3, and 3+ are real_offsets+2
+ * @offBase : sumtype, representing an offset or a repcode, and using numeric representation of ZSTD_storeSeq()
  * @optLevel: when <2, favors small offset for decompression speed (improved cache efficiency)
  */
 FORCE_INLINE_TEMPLATE U32
-ZSTD_getMatchPrice(U32 const offcode,
+ZSTD_getMatchPrice(U32 const offBase,
                    U32 const matchLength,
              const optState_t* const optPtr,
                    int const optLevel)
 {
     U32 price;
-    U32 const offCode = ZSTD_highbit32(STORED_TO_OFFBASE(offcode));
+    U32 const offCode = ZSTD_highbit32(offBase);
     U32 const mlBase = matchLength - MINMATCH;
     assert(matchLength >= MINMATCH);
 
-    if (optPtr->priceType == zop_predef)  /* fixed scheme, do not use statistics */
-        return WEIGHT(mlBase, optLevel) + ((16 + offCode) * BITCOST_MULTIPLIER);
+    if (optPtr->priceType == zop_predef)  /* fixed scheme, does not use statistics */
+        return WEIGHT(mlBase, optLevel)
+             + ((16 + offCode) * BITCOST_MULTIPLIER); /* emulated offset cost */
 
     /* dynamic statistics */
     price = (offCode * BITCOST_MULTIPLIER) + (optPtr->offCodeSumBasePrice - WEIGHT(optPtr->offCodeFreq[offCode], optLevel));
@@ -325,10 +349,10 @@ ZSTD_getMatchPrice(U32 const offcode,
 }
 
 /* ZSTD_updateStats() :
- * assumption : literals + litLengtn <= iend */
+ * assumption : literals + litLength <= iend */
 static void ZSTD_updateStats(optState_t* const optPtr,
                              U32 litLength, const BYTE* literals,
-                             U32 offsetCode, U32 matchLength)
+                             U32 offBase, U32 matchLength)
 {
     /* literals */
     if (ZSTD_compressedLiterals(optPtr)) {
@@ -344,8 +368,8 @@ static void ZSTD_updateStats(optState_t* const optPtr,
         optPtr->litLengthSum++;
     }
 
-    /* offset code : expected to follow storeSeq() numeric representation */
-    {   U32 const offCode = ZSTD_highbit32(STORED_TO_OFFBASE(offsetCode));
+    /* offset code : follows storeSeq() numeric representation */
+    {   U32 const offCode = ZSTD_highbit32(offBase);
         assert(offCode <= MaxOff);
         optPtr->offCodeFreq[offCode]++;
         optPtr->offCodeSum++;
@@ -552,16 +576,17 @@ void ZSTD_updateTree(ZSTD_matchState_t* ms, const BYTE* ip, const BYTE* iend) {
     ZSTD_updateTree_internal(ms, ip, iend, ms->cParams.minMatch, ZSTD_noDict);
 }
 
-FORCE_INLINE_TEMPLATE
-U32 ZSTD_insertBtAndGetAllMatches (
-                    ZSTD_match_t* matches,   /* store result (found matches) in this table (presumed large enough) */
-                    ZSTD_matchState_t* ms,
-                    U32* nextToUpdate3,
-                    const BYTE* const ip, const BYTE* const iLimit, const ZSTD_dictMode_e dictMode,
-                    const U32 rep[ZSTD_REP_NUM],
-                    U32 const ll0,   /* tells if associated literal length is 0 or not. This value must be 0 or 1 */
-                    const U32 lengthToBeat,
-                    U32 const mls /* template */)
+FORCE_INLINE_TEMPLATE U32
+ZSTD_insertBtAndGetAllMatches (
+                ZSTD_match_t* matches,  /* store result (found matches) in this table (presumed large enough) */
+                ZSTD_matchState_t* ms,
+                U32* nextToUpdate3,
+                const BYTE* const ip, const BYTE* const iLimit,
+                const ZSTD_dictMode_e dictMode,
+                const U32 rep[ZSTD_REP_NUM],
+                const U32 ll0,  /* tells if associated literal length is 0 or not. This value must be 0 or 1 */
+                const U32 lengthToBeat,
+                const U32 mls /* template */)
 {
     const ZSTD_compressionParameters* const cParams = &ms->cParams;
     U32 const sufficient_len = MIN(cParams->targetLength, ZSTD_OPT_NUM -1);
@@ -644,7 +669,7 @@ U32 ZSTD_insertBtAndGetAllMatches (
                 DEBUGLOG(8, "found repCode %u (ll0:%u, offset:%u) of length %u",
                             repCode, ll0, repOffset, repLen);
                 bestLength = repLen;
-                matches[mnum].off = STORE_REPCODE(repCode - ll0 + 1);  /* expect value between 1 and 3 */
+                matches[mnum].off = REPCODE_TO_OFFBASE(repCode - ll0 + 1);  /* expect value between 1 and 3 */
                 matches[mnum].len = (U32)repLen;
                 mnum++;
                 if ( (repLen > sufficient_len)
@@ -673,7 +698,7 @@ U32 ZSTD_insertBtAndGetAllMatches (
                 bestLength = mlen;
                 assert(curr > matchIndex3);
                 assert(mnum==0);  /* no prior solution */
-                matches[0].off = STORE_OFFSET(curr - matchIndex3);
+                matches[0].off = OFFSET_TO_OFFBASE(curr - matchIndex3);
                 matches[0].len = (U32)mlen;
                 mnum = 1;
                 if ( (mlen > sufficient_len) |
@@ -706,13 +731,13 @@ U32 ZSTD_insertBtAndGetAllMatches (
         }
 
         if (matchLength > bestLength) {
-            DEBUGLOG(8, "found match of length %u at distance %u (offCode=%u)",
-                    (U32)matchLength, curr - matchIndex, STORE_OFFSET(curr - matchIndex));
+            DEBUGLOG(8, "found match of length %u at distance %u (offBase=%u)",
+                    (U32)matchLength, curr - matchIndex, OFFSET_TO_OFFBASE(curr - matchIndex));
             assert(matchEndIdx > matchIndex);
             if (matchLength > matchEndIdx - matchIndex)
                 matchEndIdx = matchIndex + (U32)matchLength;
             bestLength = matchLength;
-            matches[mnum].off = STORE_OFFSET(curr - matchIndex);
+            matches[mnum].off = OFFSET_TO_OFFBASE(curr - matchIndex);
             matches[mnum].len = (U32)matchLength;
             mnum++;
             if ( (matchLength > ZSTD_OPT_NUM)
@@ -754,12 +779,12 @@ U32 ZSTD_insertBtAndGetAllMatches (
 
             if (matchLength > bestLength) {
                 matchIndex = dictMatchIndex + dmsIndexDelta;
-                DEBUGLOG(8, "found dms match of length %u at distance %u (offCode=%u)",
-                        (U32)matchLength, curr - matchIndex, STORE_OFFSET(curr - matchIndex));
+                DEBUGLOG(8, "found dms match of length %u at distance %u (offBase=%u)",
+                        (U32)matchLength, curr - matchIndex, OFFSET_TO_OFFBASE(curr - matchIndex));
                 if (matchLength > matchEndIdx - matchIndex)
                     matchEndIdx = matchIndex + (U32)matchLength;
                 bestLength = matchLength;
-                matches[mnum].off = STORE_OFFSET(curr - matchIndex);
+                matches[mnum].off = OFFSET_TO_OFFBASE(curr - matchIndex);
                 matches[mnum].len = (U32)matchLength;
                 mnum++;
                 if ( (matchLength > ZSTD_OPT_NUM)
@@ -960,7 +985,7 @@ static void ZSTD_optLdm_maybeAddMatch(ZSTD_match_t* matches, U32* nbMatches,
                                       const ZSTD_optLdm_t* optLdm, U32 currPosInBlock)
 {
     U32 const posDiff = currPosInBlock - optLdm->startPosInBlock;
-    /* Note: ZSTD_match_t actually contains offCode and matchLength (before subtracting MINMATCH) */
+    /* Note: ZSTD_match_t actually contains offBase and matchLength (before subtracting MINMATCH) */
     U32 const candidateMatchLength = optLdm->endPosInBlock - optLdm->startPosInBlock - posDiff;
 
     /* Ensure that current block position is not outside of the match */
@@ -971,11 +996,11 @@ static void ZSTD_optLdm_maybeAddMatch(ZSTD_match_t* matches, U32* nbMatches,
     }
 
     if (*nbMatches == 0 || ((candidateMatchLength > matches[*nbMatches-1].len) && *nbMatches < ZSTD_OPT_NUM)) {
-        U32 const candidateOffCode = STORE_OFFSET(optLdm->offset);
-        DEBUGLOG(6, "ZSTD_optLdm_maybeAddMatch(): Adding ldm candidate match (offCode: %u matchLength %u) at block position=%u",
-                 candidateOffCode, candidateMatchLength, currPosInBlock);
+        U32 const candidateOffBase = OFFSET_TO_OFFBASE(optLdm->offset);
+        DEBUGLOG(6, "ZSTD_optLdm_maybeAddMatch(): Adding ldm candidate match (offBase: %u matchLength %u) at block position=%u",
+                 candidateOffBase, candidateMatchLength, currPosInBlock);
         matches[*nbMatches].len = candidateMatchLength;
-        matches[*nbMatches].off = candidateOffCode;
+        matches[*nbMatches].off = candidateOffBase;
         (*nbMatches)++;
     }
 }
@@ -1062,6 +1087,8 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
     ZSTD_optimal_t lastSequence;
     ZSTD_optLdm_t optLdm;
 
+    ZSTD_memset(&lastSequence, 0, sizeof(ZSTD_optimal_t));
+
     optLdm.seqStore = ms->ldmSeqStore ? *ms->ldmSeqStore : kNullRawSeqStore;
     optLdm.endPosInBlock = optLdm.startPosInBlock = optLdm.offset = 0;
     ZSTD_opt_getNextMatchAndUpdateSeqStore(&optLdm, (U32)(ip-istart), (U32)(iend-ip));
@@ -1098,14 +1125,14 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
 
             /* large match -> immediate encoding */
             {   U32 const maxML = matches[nbMatches-1].len;
-                U32 const maxOffcode = matches[nbMatches-1].off;
-                DEBUGLOG(6, "found %u matches of maxLength=%u and maxOffCode=%u at cPos=%u => start new series",
-                            nbMatches, maxML, maxOffcode, (U32)(ip-prefixStart));
+                U32 const maxOffBase = matches[nbMatches-1].off;
+                DEBUGLOG(6, "found %u matches of maxLength=%u and maxOffBase=%u at cPos=%u => start new series",
+                            nbMatches, maxML, maxOffBase, (U32)(ip-prefixStart));
 
                 if (maxML > sufficient_len) {
                     lastSequence.litlen = litlen;
                     lastSequence.mlen = maxML;
-                    lastSequence.off = maxOffcode;
+                    lastSequence.off = maxOffBase;
                     DEBUGLOG(6, "large match (%u>%u), immediate encoding",
                                 maxML, sufficient_len);
                     cur = 0;
@@ -1122,15 +1149,15 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
                     opt[pos].price = ZSTD_MAX_PRICE;   /* mlen, litlen and price will be fixed during forward scanning */
                 }
                 for (matchNb = 0; matchNb < nbMatches; matchNb++) {
-                    U32 const offcode = matches[matchNb].off;
+                    U32 const offBase = matches[matchNb].off;
                     U32 const end = matches[matchNb].len;
                     for ( ; pos <= end ; pos++ ) {
-                        U32 const matchPrice = ZSTD_getMatchPrice(offcode, pos, optStatePtr, optLevel);
+                        U32 const matchPrice = ZSTD_getMatchPrice(offBase, pos, optStatePtr, optLevel);
                         U32 const sequencePrice = literalsPrice + matchPrice;
                         DEBUGLOG(7, "rPos:%u => set initial price : %.2f",
-                                    pos, ZSTD_fCost(sequencePrice));
+                                    pos, ZSTD_fCost((int)sequencePrice));
                         opt[pos].mlen = pos;
-                        opt[pos].off = offcode;
+                        opt[pos].off = offBase;
                         opt[pos].litlen = litlen;
                         opt[pos].price = (int)sequencePrice;
                 }   }
@@ -1230,7 +1257,7 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
                     U32 const startML = (matchNb>0) ? matches[matchNb-1].len+1 : minMatch;
                     U32 mlen;
 
-                    DEBUGLOG(7, "testing match %u => offCode=%4u, mlen=%2u, llen=%2u",
+                    DEBUGLOG(7, "testing match %u => offBase=%4u, mlen=%2u, llen=%2u",
                                 matchNb, matches[matchNb].off, lastML, litlen);
 
                     for (mlen = lastML; mlen >= startML; mlen--) {  /* scan downward */
@@ -1296,7 +1323,7 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
                 for (storePos=storeStart; storePos <= storeEnd; storePos++) {
                     U32 const llen = opt[storePos].litlen;
                     U32 const mlen = opt[storePos].mlen;
-                    U32 const offCode = opt[storePos].off;
+                    U32 const offBase = opt[storePos].off;
                     U32 const advance = llen + mlen;
                     DEBUGLOG(6, "considering seq starting at %zi, llen=%u, mlen=%u",
                                 anchor - istart, (unsigned)llen, (unsigned)mlen);
@@ -1308,8 +1335,8 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
                     }
 
                     assert(anchor + llen <= iend);
-                    ZSTD_updateStats(optStatePtr, llen, anchor, offCode, mlen);
-                    ZSTD_storeSeq(seqStore, llen, anchor, iend, offCode, mlen);
+                    ZSTD_updateStats(optStatePtr, llen, anchor, offBase, mlen);
+                    ZSTD_storeSeq(seqStore, llen, anchor, iend, offBase, mlen);
                     anchor += advance;
                     ip = anchor;
             }   }
@@ -1349,7 +1376,7 @@ size_t ZSTD_compressBlock_btopt(
 /* ZSTD_initStats_ultra():
  * make a first compression pass, just to seed stats with more accurate starting values.
  * only works on first block, with no dictionary and no ldm.
- * this function cannot error, hence its contract must be respected.
+ * this function cannot error out, its narrow contract must be respected.
  */
 static void
 ZSTD_initStats_ultra(ZSTD_matchState_t* ms,
@@ -1368,7 +1395,7 @@ ZSTD_initStats_ultra(ZSTD_matchState_t* ms,
 
     ZSTD_compressBlock_opt2(ms, seqStore, tmpRep, src, srcSize, ZSTD_noDict);   /* generate stats into ms->opt*/
 
-    /* invalidate first scan from history */
+    /* invalidate first scan from history, only keep entropy stats */
     ZSTD_resetSeqStore(seqStore);
     ms->window.base -= srcSize;
     ms->window.dictLimit += (U32)srcSize;
@@ -1392,20 +1419,20 @@ size_t ZSTD_compressBlock_btultra2(
     U32 const curr = (U32)((const BYTE*)src - ms->window.base);
     DEBUGLOG(5, "ZSTD_compressBlock_btultra2 (srcSize=%zu)", srcSize);
 
-    /* 2-pass strategy:
+    /* 2-passes strategy:
      * this strategy makes a first pass over first block to collect statistics
-     * and seed next round's statistics with it.
-     * After 1st pass, function forgets everything, and starts a new block.
+     * in order to seed next round's statistics with it.
+     * After 1st pass, function forgets history, and starts a new block.
      * Consequently, this can only work if no data has been previously loaded in tables,
      * aka, no dictionary, no prefix, no ldm preprocessing.
      * The compression ratio gain is generally small (~0.5% on first block),
-     * the cost is 2x cpu time on first block. */
+    ** the cost is 2x cpu time on first block. */
     assert(srcSize <= ZSTD_BLOCKSIZE_MAX);
     if ( (ms->opt.litLengthSum==0)   /* first block */
       && (seqStore->sequences == seqStore->sequencesStart)  /* no ldm */
       && (ms->window.dictLimit == ms->window.lowLimit)   /* no dictionary */
-      && (curr == ms->window.dictLimit)   /* start of frame, nothing already loaded nor skipped */
-      && (srcSize > ZSTD_PREDEF_THRESHOLD)
+      && (curr == ms->window.dictLimit)    /* start of frame, nothing already loaded nor skipped */
+      && (srcSize > ZSTD_PREDEF_THRESHOLD) /* input large enough to not employ default stats */
       ) {
         ZSTD_initStats_ultra(ms, seqStore, rep, src, srcSize);
     }
diff --git a/lib/zstd/compress/zstd_opt.h b/lib/zstd/compress/zstd_opt.h
index 22b862858ba7..faa73ff4b03d 100644
--- a/lib/zstd/compress/zstd_opt.h
+++ b/lib/zstd/compress/zstd_opt.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
diff --git a/lib/zstd/decompress/huf_decompress.c b/lib/zstd/decompress/huf_decompress.c
index 60958afebc41..d172e35fbd9a 100644
--- a/lib/zstd/decompress/huf_decompress.c
+++ b/lib/zstd/decompress/huf_decompress.c
@@ -1,7 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /* ******************************************************************
  * huff0 huffman decoder,
  * part of Finite State Entropy library
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  *
  *  You can contact the author at :
  *  - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy
@@ -19,10 +20,10 @@
 #include "../common/compiler.h"
 #include "../common/bitstream.h"  /* BIT_* */
 #include "../common/fse.h"        /* to compress headers */
-#define HUF_STATIC_LINKING_ONLY
 #include "../common/huf.h"
 #include "../common/error_private.h"
 #include "../common/zstd_internal.h"
+#include "../common/bits.h"       /* ZSTD_highbit32, ZSTD_countTrailingZeros64 */
 
 /* **************************************************************
 *  Constants
@@ -43,27 +44,25 @@
 #error "Cannot force the use of the X1 and X2 decoders at the same time!"
 #endif
 
-#if ZSTD_ENABLE_ASM_X86_64_BMI2 && DYNAMIC_BMI2
-# define HUF_ASM_X86_64_BMI2_ATTRS BMI2_TARGET_ATTRIBUTE
+/* When DYNAMIC_BMI2 is enabled, fast decoders are only called when bmi2 is
+ * supported at runtime, so we can add the BMI2 target attribute.
+ * When it is disabled, we will still get BMI2 if it is enabled statically.
+ */
+#if DYNAMIC_BMI2
+# define HUF_FAST_BMI2_ATTRS BMI2_TARGET_ATTRIBUTE
 #else
-# define HUF_ASM_X86_64_BMI2_ATTRS
+# define HUF_FAST_BMI2_ATTRS
 #endif
 
 #define HUF_EXTERN_C
 #define HUF_ASM_DECL HUF_EXTERN_C
 
-#if DYNAMIC_BMI2 || (ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__))
+#if DYNAMIC_BMI2
 # define HUF_NEED_BMI2_FUNCTION 1
 #else
 # define HUF_NEED_BMI2_FUNCTION 0
 #endif
 
-#if !(ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__))
-# define HUF_NEED_DEFAULT_FUNCTION 1
-#else
-# define HUF_NEED_DEFAULT_FUNCTION 0
-#endif
-
 /* **************************************************************
 *  Error Management
 ****************************************************************/
@@ -80,6 +79,11 @@
 /* **************************************************************
 *  BMI2 Variant Wrappers
 ****************************************************************/
+typedef size_t (*HUF_DecompressUsingDTableFn)(void *dst, size_t dstSize,
+                                              const void *cSrc,
+                                              size_t cSrcSize,
+                                              const HUF_DTable *DTable);
+
 #if DYNAMIC_BMI2
 
 #define HUF_DGEN(fn)                                                        \
@@ -101,9 +105,9 @@
     }                                                                       \
                                                                             \
     static size_t fn(void* dst, size_t dstSize, void const* cSrc,           \
-                     size_t cSrcSize, HUF_DTable const* DTable, int bmi2)   \
+                     size_t cSrcSize, HUF_DTable const* DTable, int flags)  \
     {                                                                       \
-        if (bmi2) {                                                         \
+        if (flags & HUF_flags_bmi2) {                                       \
             return fn##_bmi2(dst, dstSize, cSrc, cSrcSize, DTable);         \
         }                                                                   \
         return fn##_default(dst, dstSize, cSrc, cSrcSize, DTable);          \
@@ -113,9 +117,9 @@
 
 #define HUF_DGEN(fn)                                                        \
     static size_t fn(void* dst, size_t dstSize, void const* cSrc,           \
-                     size_t cSrcSize, HUF_DTable const* DTable, int bmi2)   \
+                     size_t cSrcSize, HUF_DTable const* DTable, int flags)  \
     {                                                                       \
-        (void)bmi2;                                                         \
+        (void)flags;                                                        \
         return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable);             \
     }
 
@@ -134,15 +138,28 @@ static DTableDesc HUF_getDTableDesc(const HUF_DTable* table)
     return dtd;
 }
 
-#if ZSTD_ENABLE_ASM_X86_64_BMI2
-
-static size_t HUF_initDStream(BYTE const* ip) {
+static size_t HUF_initFastDStream(BYTE const* ip) {
     BYTE const lastByte = ip[7];
-    size_t const bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0;
+    size_t const bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0;
     size_t const value = MEM_readLEST(ip) | 1;
     assert(bitsConsumed <= 8);
+    assert(sizeof(size_t) == 8);
     return value << bitsConsumed;
 }
+
+
+/*
+ * The input/output arguments to the Huffman fast decoding loop:
+ *
+ * ip [in/out] - The input pointers, must be updated to reflect what is consumed.
+ * op [in/out] - The output pointers, must be updated to reflect what is written.
+ * bits [in/out] - The bitstream containers, must be updated to reflect the current state.
+ * dt [in] - The decoding table.
+ * ilimit [in] - The input limit, stop when any input pointer is below ilimit.
+ * oend [in] - The end of the output stream. op[3] must not cross oend.
+ * iend [in] - The end of each input stream. ip[i] may cross iend[i],
+ *             as long as it is above ilimit, but that indicates corruption.
+ */
 typedef struct {
     BYTE const* ip[4];
     BYTE* op[4];
@@ -151,15 +168,17 @@ typedef struct {
     BYTE const* ilimit;
     BYTE* oend;
     BYTE const* iend[4];
-} HUF_DecompressAsmArgs;
+} HUF_DecompressFastArgs;
+
+typedef void (*HUF_DecompressFastLoopFn)(HUF_DecompressFastArgs*);
 
 /*
- * Initializes args for the asm decoding loop.
- * @returns 0 on success
- *          1 if the fallback implementation should be used.
+ * Initializes args for the fast decoding loop.
+ * @returns 1 on success
+ *          0 if the fallback implementation should be used.
  *          Or an error code on failure.
  */
-static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* DTable)
+static size_t HUF_DecompressFastArgs_init(HUF_DecompressFastArgs* args, void* dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* DTable)
 {
     void const* dt = DTable + 1;
     U32 const dtLog = HUF_getDTableDesc(DTable).tableLog;
@@ -168,9 +187,11 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst,
 
     BYTE* const oend = (BYTE*)dst + dstSize;
 
-    /* The following condition is false on x32 platform,
-     * but HUF_asm is not compatible with this ABI */
-    if (!(MEM_isLittleEndian() && !MEM_32bits())) return 1;
+    /* The fast decoding loop assumes 64-bit little-endian.
+     * This condition is false on x32.
+     */
+    if (!MEM_isLittleEndian() || MEM_32bits())
+        return 0;
 
     /* strict minimum : jump table + 1 byte per stream */
     if (srcSize < 10)
@@ -181,7 +202,7 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst,
      * On small inputs we don't have enough data to trigger the fast loop, so use the old decoder.
      */
     if (dtLog != HUF_DECODER_FAST_TABLELOG)
-        return 1;
+        return 0;
 
     /* Read the jump table. */
     {
@@ -195,13 +216,13 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst,
         args->iend[2] = args->iend[1] + length2;
         args->iend[3] = args->iend[2] + length3;
 
-        /* HUF_initDStream() requires this, and this small of an input
+        /* HUF_initFastDStream() requires this, and this small of an input
          * won't benefit from the ASM loop anyways.
          * length1 must be >= 16 so that ip[0] >= ilimit before the loop
          * starts.
          */
         if (length1 < 16 || length2 < 8 || length3 < 8 || length4 < 8)
-            return 1;
+            return 0;
         if (length4 > srcSize) return ERROR(corruption_detected);   /* overflow */
     }
     /* ip[] contains the position that is currently loaded into bits[]. */
@@ -218,7 +239,7 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst,
 
     /* No point to call the ASM loop for tiny outputs. */
     if (args->op[3] >= oend)
-        return 1;
+        return 0;
 
     /* bits[] is the bit container.
         * It is read from the MSB down to the LSB.
@@ -227,10 +248,10 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst,
         * set, so that CountTrailingZeros(bits[]) can be used
         * to count how many bits we've consumed.
         */
-    args->bits[0] = HUF_initDStream(args->ip[0]);
-    args->bits[1] = HUF_initDStream(args->ip[1]);
-    args->bits[2] = HUF_initDStream(args->ip[2]);
-    args->bits[3] = HUF_initDStream(args->ip[3]);
+    args->bits[0] = HUF_initFastDStream(args->ip[0]);
+    args->bits[1] = HUF_initFastDStream(args->ip[1]);
+    args->bits[2] = HUF_initFastDStream(args->ip[2]);
+    args->bits[3] = HUF_initFastDStream(args->ip[3]);
 
     /* If ip[] >= ilimit, it is guaranteed to be safe to
         * reload bits[]. It may be beyond its section, but is
@@ -241,10 +262,10 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst,
     args->oend = oend;
     args->dt = dt;
 
-    return 0;
+    return 1;
 }
 
-static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressAsmArgs const* args, int stream, BYTE* segmentEnd)
+static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressFastArgs const* args, int stream, BYTE* segmentEnd)
 {
     /* Validate that we haven't overwritten. */
     if (args->op[stream] > segmentEnd)
@@ -258,15 +279,15 @@ static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressAsmArgs
         return ERROR(corruption_detected);
 
     /* Construct the BIT_DStream_t. */
-    bit->bitContainer = MEM_readLE64(args->ip[stream]);
-    bit->bitsConsumed = ZSTD_countTrailingZeros((size_t)args->bits[stream]);
+    assert(sizeof(size_t) == 8);
+    bit->bitContainer = MEM_readLEST(args->ip[stream]);
+    bit->bitsConsumed = ZSTD_countTrailingZeros64(args->bits[stream]);
     bit->start = (const char*)args->iend[0];
     bit->limitPtr = bit->start + sizeof(size_t);
     bit->ptr = (const char*)args->ip[stream];
 
     return 0;
 }
-#endif
 
 
 #ifndef HUF_FORCE_DECOMPRESS_X2
@@ -283,10 +304,11 @@ typedef struct { BYTE nbBits; BYTE byte; } HUF_DEltX1;   /* single-symbol decodi
 static U64 HUF_DEltX1_set4(BYTE symbol, BYTE nbBits) {
     U64 D4;
     if (MEM_isLittleEndian()) {
-        D4 = (symbol << 8) + nbBits;
+        D4 = (U64)((symbol << 8) + nbBits);
     } else {
-        D4 = symbol + (nbBits << 8);
+        D4 = (U64)(symbol + (nbBits << 8));
     }
+    assert(D4 < (1U << 16));
     D4 *= 0x0001000100010001ULL;
     return D4;
 }
@@ -329,13 +351,7 @@ typedef struct {
         BYTE huffWeight[HUF_SYMBOLVALUE_MAX + 1];
 } HUF_ReadDTableX1_Workspace;
 
-
-size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize)
-{
-    return HUF_readDTableX1_wksp_bmi2(DTable, src, srcSize, workSpace, wkspSize, /* bmi2 */ 0);
-}
-
-size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2)
+size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags)
 {
     U32 tableLog = 0;
     U32 nbSymbols = 0;
@@ -350,7 +366,7 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr
     DEBUG_STATIC_ASSERT(sizeof(DTableDesc) == sizeof(HUF_DTable));
     /* ZSTD_memset(huffWeight, 0, sizeof(huffWeight)); */   /* is not necessary, even though some analyzer complain ... */
 
-    iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp), bmi2);
+    iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp), flags);
     if (HUF_isError(iSize)) return iSize;
 
 
@@ -377,9 +393,8 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr
      * rankStart[0] is not filled because there are no entries in the table for
      * weight 0.
      */
-    {
-        int n;
-        int nextRankStart = 0;
+    {   int n;
+        U32 nextRankStart = 0;
         int const unroll = 4;
         int const nLimit = (int)nbSymbols - unroll + 1;
         for (n=0; n<(int)tableLog+1; n++) {
@@ -406,10 +421,9 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr
      * We can switch based on the length to a different inner loop which is
      * optimized for that particular case.
      */
-    {
-        U32 w;
-        int symbol=wksp->rankVal[0];
-        int rankStart=0;
+    {   U32 w;
+        int symbol = wksp->rankVal[0];
+        int rankStart = 0;
         for (w=1; w<tableLog+1; ++w) {
             int const symbolCount = wksp->rankVal[w];
             int const length = (1 << w) >> 1;
@@ -519,7 +533,7 @@ HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, cons
     while (p < pEnd)
         HUF_DECODE_SYMBOLX1_0(p, bitDPtr);
 
-    return pEnd-pStart;
+    return (size_t)(pEnd-pStart);
 }
 
 FORCE_INLINE_TEMPLATE size_t
@@ -545,6 +559,10 @@ HUF_decompress1X1_usingDTable_internal_body(
     return dstSize;
 }
 
+/* HUF_decompress4X1_usingDTable_internal_body():
+ * Conditions :
+ * @dstSize >= 6
+ */
 FORCE_INLINE_TEMPLATE size_t
 HUF_decompress4X1_usingDTable_internal_body(
           void* dst,  size_t dstSize,
@@ -588,6 +606,7 @@ HUF_decompress4X1_usingDTable_internal_body(
 
         if (length4 > cSrcSize) return ERROR(corruption_detected);   /* overflow */
         if (opStart4 > oend) return ERROR(corruption_detected);      /* overflow */
+        if (dstSize < 6) return ERROR(corruption_detected);         /* stream 4-split doesn't work */
         CHECK_F( BIT_initDStream(&bitD1, istart1, length1) );
         CHECK_F( BIT_initDStream(&bitD2, istart2, length2) );
         CHECK_F( BIT_initDStream(&bitD3, istart3, length3) );
@@ -650,38 +669,142 @@ size_t HUF_decompress4X1_usingDTable_internal_bmi2(void* dst, size_t dstSize, vo
 }
 #endif
 
-#if HUF_NEED_DEFAULT_FUNCTION
 static
 size_t HUF_decompress4X1_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc,
                     size_t cSrcSize, HUF_DTable const* DTable) {
     return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable);
 }
-#endif
 
 #if ZSTD_ENABLE_ASM_X86_64_BMI2
 
-HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(HUF_DecompressAsmArgs* args) ZSTDLIB_HIDDEN;
+HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN;
+
+#endif
+
+static HUF_FAST_BMI2_ATTRS
+void HUF_decompress4X1_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args)
+{
+    U64 bits[4];
+    BYTE const* ip[4];
+    BYTE* op[4];
+    U16 const* const dtable = (U16 const*)args->dt;
+    BYTE* const oend = args->oend;
+    BYTE const* const ilimit = args->ilimit;
+
+    /* Copy the arguments to local variables */
+    ZSTD_memcpy(&bits, &args->bits, sizeof(bits));
+    ZSTD_memcpy((void*)(&ip), &args->ip, sizeof(ip));
+    ZSTD_memcpy(&op, &args->op, sizeof(op));
+
+    assert(MEM_isLittleEndian());
+    assert(!MEM_32bits());
+
+    for (;;) {
+        BYTE* olimit;
+        int stream;
+        int symbol;
+
+        /* Assert loop preconditions */
+#ifndef NDEBUG
+        for (stream = 0; stream < 4; ++stream) {
+            assert(op[stream] <= (stream == 3 ? oend : op[stream + 1]));
+            assert(ip[stream] >= ilimit);
+        }
+#endif
+        /* Compute olimit */
+        {
+            /* Each iteration produces 5 output symbols per stream */
+            size_t const oiters = (size_t)(oend - op[3]) / 5;
+            /* Each iteration consumes up to 11 bits * 5 = 55 bits < 7 bytes
+             * per stream.
+             */
+            size_t const iiters = (size_t)(ip[0] - ilimit) / 7;
+            /* We can safely run iters iterations before running bounds checks */
+            size_t const iters = MIN(oiters, iiters);
+            size_t const symbols = iters * 5;
+
+            /* We can simply check that op[3] < olimit, instead of checking all
+             * of our bounds, since we can't hit the other bounds until we've run
+             * iters iterations, which only happens when op[3] == olimit.
+             */
+            olimit = op[3] + symbols;
+
+            /* Exit fast decoding loop once we get close to the end. */
+            if (op[3] + 20 > olimit)
+                break;
+
+            /* Exit the decoding loop if any input pointer has crossed the
+             * previous one. This indicates corruption, and a precondition
+             * to our loop is that ip[i] >= ip[0].
+             */
+            for (stream = 1; stream < 4; ++stream) {
+                if (ip[stream] < ip[stream - 1])
+                    goto _out;
+            }
+        }
+
+#ifndef NDEBUG
+        for (stream = 1; stream < 4; ++stream) {
+            assert(ip[stream] >= ip[stream - 1]);
+        }
+#endif
+
+        do {
+            /* Decode 5 symbols in each of the 4 streams */
+            for (symbol = 0; symbol < 5; ++symbol) {
+                for (stream = 0; stream < 4; ++stream) {
+                    int const index = (int)(bits[stream] >> 53);
+                    int const entry = (int)dtable[index];
+                    bits[stream] <<= (entry & 63);
+                    op[stream][symbol] = (BYTE)((entry >> 8) & 0xFF);
+                }
+            }
+            /* Reload the bitstreams */
+            for (stream = 0; stream < 4; ++stream) {
+                int const ctz = ZSTD_countTrailingZeros64(bits[stream]);
+                int const nbBits = ctz & 7;
+                int const nbBytes = ctz >> 3;
+                op[stream] += 5;
+                ip[stream] -= nbBytes;
+                bits[stream] = MEM_read64(ip[stream]) | 1;
+                bits[stream] <<= nbBits;
+            }
+        } while (op[3] < olimit);
+    }
+
+_out:
 
-static HUF_ASM_X86_64_BMI2_ATTRS
+    /* Save the final values of each of the state variables back to args. */
+    ZSTD_memcpy(&args->bits, &bits, sizeof(bits));
+    ZSTD_memcpy((void*)(&args->ip), &ip, sizeof(ip));
+    ZSTD_memcpy(&args->op, &op, sizeof(op));
+}
+
+/*
+ * @returns @p dstSize on success (>= 6)
+ *          0 if the fallback implementation should be used
+ *          An error if an error occurred
+ */
+static HUF_FAST_BMI2_ATTRS
 size_t
-HUF_decompress4X1_usingDTable_internal_bmi2_asm(
+HUF_decompress4X1_usingDTable_internal_fast(
           void* dst,  size_t dstSize,
     const void* cSrc, size_t cSrcSize,
-    const HUF_DTable* DTable)
+    const HUF_DTable* DTable,
+    HUF_DecompressFastLoopFn loopFn)
 {
     void const* dt = DTable + 1;
     const BYTE* const iend = (const BYTE*)cSrc + 6;
     BYTE* const oend = (BYTE*)dst + dstSize;
-    HUF_DecompressAsmArgs args;
-    {
-        size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable);
-        FORWARD_IF_ERROR(ret, "Failed to init asm args");
-        if (ret != 0)
-            return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable);
+    HUF_DecompressFastArgs args;
+    {   size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable);
+        FORWARD_IF_ERROR(ret, "Failed to init fast loop args");
+        if (ret == 0)
+            return 0;
     }
 
     assert(args.ip[0] >= args.ilimit);
-    HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(&args);
+    loopFn(&args);
 
     /* Our loop guarantees that ip[] >= ilimit and that we haven't
     * overwritten any op[].
@@ -694,8 +817,7 @@ HUF_decompress4X1_usingDTable_internal_bmi2_asm(
     (void)iend;
 
     /* finish bit streams one by one. */
-    {
-        size_t const segmentSize = (dstSize+3) / 4;
+    {   size_t const segmentSize = (dstSize+3) / 4;
         BYTE* segmentEnd = (BYTE*)dst;
         int i;
         for (i = 0; i < 4; ++i) {
@@ -712,97 +834,59 @@ HUF_decompress4X1_usingDTable_internal_bmi2_asm(
     }
 
     /* decoded size */
+    assert(dstSize != 0);
     return dstSize;
 }
-#endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */
-
-typedef size_t (*HUF_decompress_usingDTable_t)(void *dst, size_t dstSize,
-                                               const void *cSrc,
-                                               size_t cSrcSize,
-                                               const HUF_DTable *DTable);
 
 HUF_DGEN(HUF_decompress1X1_usingDTable_internal)
 
 static size_t HUF_decompress4X1_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc,
-                    size_t cSrcSize, HUF_DTable const* DTable, int bmi2)
+                    size_t cSrcSize, HUF_DTable const* DTable, int flags)
 {
+    HUF_DecompressUsingDTableFn fallbackFn = HUF_decompress4X1_usingDTable_internal_default;
+    HUF_DecompressFastLoopFn loopFn = HUF_decompress4X1_usingDTable_internal_fast_c_loop;
+
 #if DYNAMIC_BMI2
-    if (bmi2) {
+    if (flags & HUF_flags_bmi2) {
+        fallbackFn = HUF_decompress4X1_usingDTable_internal_bmi2;
 # if ZSTD_ENABLE_ASM_X86_64_BMI2
-        return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable);
-# else
-        return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable);
+        if (!(flags & HUF_flags_disableAsm)) {
+            loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop;
+        }
 # endif
+    } else {
+        return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable);
     }
-#else
-    (void)bmi2;
 #endif
 
 #if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)
-    return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable);
-#else
-    return HUF_decompress4X1_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable);
+    if (!(flags & HUF_flags_disableAsm)) {
+        loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop;
+    }
 #endif
-}
-
-
-size_t HUF_decompress1X1_usingDTable(
-          void* dst,  size_t dstSize,
-    const void* cSrc, size_t cSrcSize,
-    const HUF_DTable* DTable)
-{
-    DTableDesc dtd = HUF_getDTableDesc(DTable);
-    if (dtd.tableType != 0) return ERROR(GENERIC);
-    return HUF_decompress1X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
-}
 
-size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize,
-                                   const void* cSrc, size_t cSrcSize,
-                                   void* workSpace, size_t wkspSize)
-{
-    const BYTE* ip = (const BYTE*) cSrc;
-
-    size_t const hSize = HUF_readDTableX1_wksp(DCtx, cSrc, cSrcSize, workSpace, wkspSize);
-    if (HUF_isError(hSize)) return hSize;
-    if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
-    ip += hSize; cSrcSize -= hSize;
-
-    return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0);
-}
-
-
-size_t HUF_decompress4X1_usingDTable(
-          void* dst,  size_t dstSize,
-    const void* cSrc, size_t cSrcSize,
-    const HUF_DTable* DTable)
-{
-    DTableDesc dtd = HUF_getDTableDesc(DTable);
-    if (dtd.tableType != 0) return ERROR(GENERIC);
-    return HUF_decompress4X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
+    if (!(flags & HUF_flags_disableFast)) {
+        size_t const ret = HUF_decompress4X1_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn);
+        if (ret != 0)
+            return ret;
+    }
+    return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable);
 }
 
-static size_t HUF_decompress4X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize,
+static size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
                                    const void* cSrc, size_t cSrcSize,
-                                   void* workSpace, size_t wkspSize, int bmi2)
+                                   void* workSpace, size_t wkspSize, int flags)
 {
     const BYTE* ip = (const BYTE*) cSrc;
 
-    size_t const hSize = HUF_readDTableX1_wksp_bmi2(dctx, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
+    size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize, flags);
     if (HUF_isError(hSize)) return hSize;
     if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
     ip += hSize; cSrcSize -= hSize;
 
-    return HUF_decompress4X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2);
+    return HUF_decompress4X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags);
 }
 
-size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
-                                   const void* cSrc, size_t cSrcSize,
-                                   void* workSpace, size_t wkspSize)
-{
-    return HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, 0);
-}
-
-
 #endif /* HUF_FORCE_DECOMPRESS_X2 */
 
 
@@ -985,7 +1069,7 @@ static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 targetLog, const U32
 
 static void HUF_fillDTableX2(HUF_DEltX2* DTable, const U32 targetLog,
                            const sortedSymbol_t* sortedList,
-                           const U32* rankStart, rankValCol_t *rankValOrigin, const U32 maxWeight,
+                           const U32* rankStart, rankValCol_t* rankValOrigin, const U32 maxWeight,
                            const U32 nbBitsBaseline)
 {
     U32* const rankVal = rankValOrigin[0];
@@ -1040,14 +1124,7 @@ typedef struct {
 
 size_t HUF_readDTableX2_wksp(HUF_DTable* DTable,
                        const void* src, size_t srcSize,
-                             void* workSpace, size_t wkspSize)
-{
-    return HUF_readDTableX2_wksp_bmi2(DTable, src, srcSize, workSpace, wkspSize, /* bmi2 */ 0);
-}
-
-size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable,
-                       const void* src, size_t srcSize,
-                             void* workSpace, size_t wkspSize, int bmi2)
+                             void* workSpace, size_t wkspSize, int flags)
 {
     U32 tableLog, maxW, nbSymbols;
     DTableDesc dtd = HUF_getDTableDesc(DTable);
@@ -1069,7 +1146,7 @@ size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable,
     if (maxTableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge);
     /* ZSTD_memset(weightList, 0, sizeof(weightList)); */  /* is not necessary, even though some analyzer complain ... */
 
-    iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), bmi2);
+    iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), flags);
     if (HUF_isError(iSize)) return iSize;
 
     /* check result */
@@ -1240,6 +1317,11 @@ HUF_decompress1X2_usingDTable_internal_body(
     /* decoded size */
     return dstSize;
 }
+
+/* HUF_decompress4X2_usingDTable_internal_body():
+ * Conditions:
+ * @dstSize >= 6
+ */
 FORCE_INLINE_TEMPLATE size_t
 HUF_decompress4X2_usingDTable_internal_body(
           void* dst,  size_t dstSize,
@@ -1280,8 +1362,9 @@ HUF_decompress4X2_usingDTable_internal_body(
         DTableDesc const dtd = HUF_getDTableDesc(DTable);
         U32 const dtLog = dtd.tableLog;
 
-        if (length4 > cSrcSize) return ERROR(corruption_detected);   /* overflow */
-        if (opStart4 > oend) return ERROR(corruption_detected);      /* overflow */
+        if (length4 > cSrcSize) return ERROR(corruption_detected);  /* overflow */
+        if (opStart4 > oend) return ERROR(corruption_detected);     /* overflow */
+        if (dstSize < 6) return ERROR(corruption_detected);         /* stream 4-split doesn't work */
         CHECK_F( BIT_initDStream(&bitD1, istart1, length1) );
         CHECK_F( BIT_initDStream(&bitD2, istart2, length2) );
         CHECK_F( BIT_initDStream(&bitD3, istart3, length3) );
@@ -1366,36 +1449,177 @@ size_t HUF_decompress4X2_usingDTable_internal_bmi2(void* dst, size_t dstSize, vo
 }
 #endif
 
-#if HUF_NEED_DEFAULT_FUNCTION
 static
 size_t HUF_decompress4X2_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc,
                     size_t cSrcSize, HUF_DTable const* DTable) {
     return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable);
 }
-#endif
 
 #if ZSTD_ENABLE_ASM_X86_64_BMI2
 
-HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(HUF_DecompressAsmArgs* args) ZSTDLIB_HIDDEN;
+HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN;
 
-static HUF_ASM_X86_64_BMI2_ATTRS size_t
-HUF_decompress4X2_usingDTable_internal_bmi2_asm(
+#endif
+
+static HUF_FAST_BMI2_ATTRS
+void HUF_decompress4X2_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args)
+{
+    U64 bits[4];
+    BYTE const* ip[4];
+    BYTE* op[4];
+    BYTE* oend[4];
+    HUF_DEltX2 const* const dtable = (HUF_DEltX2 const*)args->dt;
+    BYTE const* const ilimit = args->ilimit;
+
+    /* Copy the arguments to local registers. */
+    ZSTD_memcpy(&bits, &args->bits, sizeof(bits));
+    ZSTD_memcpy((void*)(&ip), &args->ip, sizeof(ip));
+    ZSTD_memcpy(&op, &args->op, sizeof(op));
+
+    oend[0] = op[1];
+    oend[1] = op[2];
+    oend[2] = op[3];
+    oend[3] = args->oend;
+
+    assert(MEM_isLittleEndian());
+    assert(!MEM_32bits());
+
+    for (;;) {
+        BYTE* olimit;
+        int stream;
+        int symbol;
+
+        /* Assert loop preconditions */
+#ifndef NDEBUG
+        for (stream = 0; stream < 4; ++stream) {
+            assert(op[stream] <= oend[stream]);
+            assert(ip[stream] >= ilimit);
+        }
+#endif
+        /* Compute olimit */
+        {
+            /* Each loop does 5 table lookups for each of the 4 streams.
+             * Each table lookup consumes up to 11 bits of input, and produces
+             * up to 2 bytes of output.
+             */
+            /* We can consume up to 7 bytes of input per iteration per stream.
+             * We also know that each input pointer is >= ip[0]. So we can run
+             * iters loops before running out of input.
+             */
+            size_t iters = (size_t)(ip[0] - ilimit) / 7;
+            /* Each iteration can produce up to 10 bytes of output per stream.
+             * Each output stream my advance at different rates. So take the
+             * minimum number of safe iterations among all the output streams.
+             */
+            for (stream = 0; stream < 4; ++stream) {
+                size_t const oiters = (size_t)(oend[stream] - op[stream]) / 10;
+                iters = MIN(iters, oiters);
+            }
+
+            /* Each iteration produces at least 5 output symbols. So until
+             * op[3] crosses olimit, we know we haven't executed iters
+             * iterations yet. This saves us maintaining an iters counter,
+             * at the expense of computing the remaining # of iterations
+             * more frequently.
+             */
+            olimit = op[3] + (iters * 5);
+
+            /* Exit the fast decoding loop if we are too close to the end. */
+            if (op[3] + 10 > olimit)
+                break;
+
+            /* Exit the decoding loop if any input pointer has crossed the
+             * previous one. This indicates corruption, and a precondition
+             * to our loop is that ip[i] >= ip[0].
+             */
+            for (stream = 1; stream < 4; ++stream) {
+                if (ip[stream] < ip[stream - 1])
+                    goto _out;
+            }
+        }
+
+#ifndef NDEBUG
+        for (stream = 1; stream < 4; ++stream) {
+            assert(ip[stream] >= ip[stream - 1]);
+        }
+#endif
+
+        do {
+            /* Do 5 table lookups for each of the first 3 streams */
+            for (symbol = 0; symbol < 5; ++symbol) {
+                for (stream = 0; stream < 3; ++stream) {
+                    int const index = (int)(bits[stream] >> 53);
+                    HUF_DEltX2 const entry = dtable[index];
+                    MEM_write16(op[stream], entry.sequence);
+                    bits[stream] <<= (entry.nbBits);
+                    op[stream] += (entry.length);
+                }
+            }
+            /* Do 1 table lookup from the final stream */
+            {
+                int const index = (int)(bits[3] >> 53);
+                HUF_DEltX2 const entry = dtable[index];
+                MEM_write16(op[3], entry.sequence);
+                bits[3] <<= (entry.nbBits);
+                op[3] += (entry.length);
+            }
+            /* Do 4 table lookups from the final stream & reload bitstreams */
+            for (stream = 0; stream < 4; ++stream) {
+                /* Do a table lookup from the final stream.
+                 * This is interleaved with the reloading to reduce register
+                 * pressure. This shouldn't be necessary, but compilers can
+                 * struggle with codegen with high register pressure.
+                 */
+                {
+                    int const index = (int)(bits[3] >> 53);
+                    HUF_DEltX2 const entry = dtable[index];
+                    MEM_write16(op[3], entry.sequence);
+                    bits[3] <<= (entry.nbBits);
+                    op[3] += (entry.length);
+                }
+                /* Reload the bistreams. The final bitstream must be reloaded
+                 * after the 5th symbol was decoded.
+                 */
+                {
+                    int const ctz = ZSTD_countTrailingZeros64(bits[stream]);
+                    int const nbBits = ctz & 7;
+                    int const nbBytes = ctz >> 3;
+                    ip[stream] -= nbBytes;
+                    bits[stream] = MEM_read64(ip[stream]) | 1;
+                    bits[stream] <<= nbBits;
+                }
+            }
+        } while (op[3] < olimit);
+    }
+
+_out:
+
+    /* Save the final values of each of the state variables back to args. */
+    ZSTD_memcpy(&args->bits, &bits, sizeof(bits));
+    ZSTD_memcpy((void*)(&args->ip), &ip, sizeof(ip));
+    ZSTD_memcpy(&args->op, &op, sizeof(op));
+}
+
+
+static HUF_FAST_BMI2_ATTRS size_t
+HUF_decompress4X2_usingDTable_internal_fast(
           void* dst,  size_t dstSize,
     const void* cSrc, size_t cSrcSize,
-    const HUF_DTable* DTable) {
+    const HUF_DTable* DTable,
+    HUF_DecompressFastLoopFn loopFn) {
     void const* dt = DTable + 1;
     const BYTE* const iend = (const BYTE*)cSrc + 6;
     BYTE* const oend = (BYTE*)dst + dstSize;
-    HUF_DecompressAsmArgs args;
+    HUF_DecompressFastArgs args;
     {
-        size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable);
+        size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable);
         FORWARD_IF_ERROR(ret, "Failed to init asm args");
-        if (ret != 0)
-            return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable);
+        if (ret == 0)
+            return 0;
     }
 
     assert(args.ip[0] >= args.ilimit);
-    HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(&args);
+    loopFn(&args);
 
     /* note : op4 already verified within main loop */
     assert(args.ip[0] >= iend);
@@ -1426,91 +1650,72 @@ HUF_decompress4X2_usingDTable_internal_bmi2_asm(
     /* decoded size */
     return dstSize;
 }
-#endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */
 
 static size_t HUF_decompress4X2_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc,
-                    size_t cSrcSize, HUF_DTable const* DTable, int bmi2)
+                    size_t cSrcSize, HUF_DTable const* DTable, int flags)
 {
+    HUF_DecompressUsingDTableFn fallbackFn = HUF_decompress4X2_usingDTable_internal_default;
+    HUF_DecompressFastLoopFn loopFn = HUF_decompress4X2_usingDTable_internal_fast_c_loop;
+
 #if DYNAMIC_BMI2
-    if (bmi2) {
+    if (flags & HUF_flags_bmi2) {
+        fallbackFn = HUF_decompress4X2_usingDTable_internal_bmi2;
 # if ZSTD_ENABLE_ASM_X86_64_BMI2
-        return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable);
-# else
-        return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable);
+        if (!(flags & HUF_flags_disableAsm)) {
+            loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop;
+        }
 # endif
+    } else {
+        return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable);
     }
-#else
-    (void)bmi2;
 #endif
 
 #if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)
-    return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable);
-#else
-    return HUF_decompress4X2_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable);
+    if (!(flags & HUF_flags_disableAsm)) {
+        loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop;
+    }
 #endif
+
+    if (!(flags & HUF_flags_disableFast)) {
+        size_t const ret = HUF_decompress4X2_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn);
+        if (ret != 0)
+            return ret;
+    }
+    return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable);
 }
 
 HUF_DGEN(HUF_decompress1X2_usingDTable_internal)
 
-size_t HUF_decompress1X2_usingDTable(
-          void* dst,  size_t dstSize,
-    const void* cSrc, size_t cSrcSize,
-    const HUF_DTable* DTable)
-{
-    DTableDesc dtd = HUF_getDTableDesc(DTable);
-    if (dtd.tableType != 1) return ERROR(GENERIC);
-    return HUF_decompress1X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
-}
-
 size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize,
                                    const void* cSrc, size_t cSrcSize,
-                                   void* workSpace, size_t wkspSize)
+                                   void* workSpace, size_t wkspSize, int flags)
 {
     const BYTE* ip = (const BYTE*) cSrc;
 
     size_t const hSize = HUF_readDTableX2_wksp(DCtx, cSrc, cSrcSize,
-                                               workSpace, wkspSize);
+                                               workSpace, wkspSize, flags);
     if (HUF_isError(hSize)) return hSize;
     if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
     ip += hSize; cSrcSize -= hSize;
 
-    return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0);
-}
-
-
-size_t HUF_decompress4X2_usingDTable(
-          void* dst,  size_t dstSize,
-    const void* cSrc, size_t cSrcSize,
-    const HUF_DTable* DTable)
-{
-    DTableDesc dtd = HUF_getDTableDesc(DTable);
-    if (dtd.tableType != 1) return ERROR(GENERIC);
-    return HUF_decompress4X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
+    return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, flags);
 }
 
-static size_t HUF_decompress4X2_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize,
+static size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
                                    const void* cSrc, size_t cSrcSize,
-                                   void* workSpace, size_t wkspSize, int bmi2)
+                                   void* workSpace, size_t wkspSize, int flags)
 {
     const BYTE* ip = (const BYTE*) cSrc;
 
     size_t hSize = HUF_readDTableX2_wksp(dctx, cSrc, cSrcSize,
-                                         workSpace, wkspSize);
+                                         workSpace, wkspSize, flags);
     if (HUF_isError(hSize)) return hSize;
     if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
     ip += hSize; cSrcSize -= hSize;
 
-    return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2);
+    return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags);
 }
 
-size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
-                                   const void* cSrc, size_t cSrcSize,
-                                   void* workSpace, size_t wkspSize)
-{
-    return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, /* bmi2 */ 0);
-}
-
-
 #endif /* HUF_FORCE_DECOMPRESS_X1 */
 
 
@@ -1518,44 +1723,6 @@ size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
 /* Universal decompression selectors */
 /* ***********************************/
 
-size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize,
-                                    const void* cSrc, size_t cSrcSize,
-                                    const HUF_DTable* DTable)
-{
-    DTableDesc const dtd = HUF_getDTableDesc(DTable);
-#if defined(HUF_FORCE_DECOMPRESS_X1)
-    (void)dtd;
-    assert(dtd.tableType == 0);
-    return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
-#elif defined(HUF_FORCE_DECOMPRESS_X2)
-    (void)dtd;
-    assert(dtd.tableType == 1);
-    return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
-#else
-    return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) :
-                           HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
-#endif
-}
-
-size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize,
-                                    const void* cSrc, size_t cSrcSize,
-                                    const HUF_DTable* DTable)
-{
-    DTableDesc const dtd = HUF_getDTableDesc(DTable);
-#if defined(HUF_FORCE_DECOMPRESS_X1)
-    (void)dtd;
-    assert(dtd.tableType == 0);
-    return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
-#elif defined(HUF_FORCE_DECOMPRESS_X2)
-    (void)dtd;
-    assert(dtd.tableType == 1);
-    return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
-#else
-    return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) :
-                           HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
-#endif
-}
-
 
 #if !defined(HUF_FORCE_DECOMPRESS_X1) && !defined(HUF_FORCE_DECOMPRESS_X2)
 typedef struct { U32 tableTime; U32 decode256Time; } algo_time_t;
@@ -1610,36 +1777,9 @@ U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize)
 #endif
 }
 
-
-size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst,
-                                     size_t dstSize, const void* cSrc,
-                                     size_t cSrcSize, void* workSpace,
-                                     size_t wkspSize)
-{
-    /* validation checks */
-    if (dstSize == 0) return ERROR(dstSize_tooSmall);
-    if (cSrcSize == 0) return ERROR(corruption_detected);
-
-    {   U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
-#if defined(HUF_FORCE_DECOMPRESS_X1)
-        (void)algoNb;
-        assert(algoNb == 0);
-        return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize);
-#elif defined(HUF_FORCE_DECOMPRESS_X2)
-        (void)algoNb;
-        assert(algoNb == 1);
-        return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize);
-#else
-        return algoNb ? HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc,
-                            cSrcSize, workSpace, wkspSize):
-                        HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize);
-#endif
-    }
-}
-
 size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
                                   const void* cSrc, size_t cSrcSize,
-                                  void* workSpace, size_t wkspSize)
+                                  void* workSpace, size_t wkspSize, int flags)
 {
     /* validation checks */
     if (dstSize == 0) return ERROR(dstSize_tooSmall);
@@ -1652,71 +1792,71 @@ size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
         (void)algoNb;
         assert(algoNb == 0);
         return HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc,
-                                cSrcSize, workSpace, wkspSize);
+                                cSrcSize, workSpace, wkspSize, flags);
 #elif defined(HUF_FORCE_DECOMPRESS_X2)
         (void)algoNb;
         assert(algoNb == 1);
         return HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc,
-                                cSrcSize, workSpace, wkspSize);
+                                cSrcSize, workSpace, wkspSize, flags);
 #else
         return algoNb ? HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc,
-                                cSrcSize, workSpace, wkspSize):
+                                cSrcSize, workSpace, wkspSize, flags):
                         HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc,
-                                cSrcSize, workSpace, wkspSize);
+                                cSrcSize, workSpace, wkspSize, flags);
 #endif
     }
 }
 
 
-size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2)
+size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags)
 {
     DTableDesc const dtd = HUF_getDTableDesc(DTable);
 #if defined(HUF_FORCE_DECOMPRESS_X1)
     (void)dtd;
     assert(dtd.tableType == 0);
-    return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
+    return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
 #elif defined(HUF_FORCE_DECOMPRESS_X2)
     (void)dtd;
     assert(dtd.tableType == 1);
-    return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
+    return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
 #else
-    return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) :
-                           HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
+    return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags) :
+                           HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
 #endif
 }
 
 #ifndef HUF_FORCE_DECOMPRESS_X2
-size_t HUF_decompress1X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2)
+size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags)
 {
     const BYTE* ip = (const BYTE*) cSrc;
 
-    size_t const hSize = HUF_readDTableX1_wksp_bmi2(dctx, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
+    size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize, flags);
     if (HUF_isError(hSize)) return hSize;
     if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
     ip += hSize; cSrcSize -= hSize;
 
-    return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2);
+    return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags);
 }
 #endif
 
-size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2)
+size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags)
 {
     DTableDesc const dtd = HUF_getDTableDesc(DTable);
 #if defined(HUF_FORCE_DECOMPRESS_X1)
     (void)dtd;
     assert(dtd.tableType == 0);
-    return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
+    return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
 #elif defined(HUF_FORCE_DECOMPRESS_X2)
     (void)dtd;
     assert(dtd.tableType == 1);
-    return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
+    return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
 #else
-    return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) :
-                           HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
+    return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags) :
+                           HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
 #endif
 }
 
-size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2)
+size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags)
 {
     /* validation checks */
     if (dstSize == 0) return ERROR(dstSize_tooSmall);
@@ -1726,15 +1866,14 @@ size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t ds
 #if defined(HUF_FORCE_DECOMPRESS_X1)
         (void)algoNb;
         assert(algoNb == 0);
-        return HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
+        return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags);
 #elif defined(HUF_FORCE_DECOMPRESS_X2)
         (void)algoNb;
         assert(algoNb == 1);
-        return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
+        return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags);
 #else
-        return algoNb ? HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2) :
-                        HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
+        return algoNb ? HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags) :
+                        HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags);
 #endif
     }
 }
-
diff --git a/lib/zstd/decompress/zstd_ddict.c b/lib/zstd/decompress/zstd_ddict.c
index dbbc7919de53..30ef65e1ab5c 100644
--- a/lib/zstd/decompress/zstd_ddict.c
+++ b/lib/zstd/decompress/zstd_ddict.c
@@ -1,5 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -14,12 +15,12 @@
 /*-*******************************************************
 *  Dependencies
 *********************************************************/
+#include "../common/allocations.h"  /* ZSTD_customMalloc, ZSTD_customFree */
 #include "../common/zstd_deps.h"   /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */
 #include "../common/cpu.h"         /* bmi2 */
 #include "../common/mem.h"         /* low level memory routines */
 #define FSE_STATIC_LINKING_ONLY
 #include "../common/fse.h"
-#define HUF_STATIC_LINKING_ONLY
 #include "../common/huf.h"
 #include "zstd_decompress_internal.h"
 #include "zstd_ddict.h"
@@ -131,7 +132,7 @@ static size_t ZSTD_initDDict_internal(ZSTD_DDict* ddict,
         ZSTD_memcpy(internalBuffer, dict, dictSize);
     }
     ddict->dictSize = dictSize;
-    ddict->entropy.hufTable[0] = (HUF_DTable)((HufLog)*0x1000001);  /* cover both little and big endian */
+    ddict->entropy.hufTable[0] = (HUF_DTable)((ZSTD_HUFFDTABLE_CAPACITY_LOG)*0x1000001);  /* cover both little and big endian */
 
     /* parse dictionary content */
     FORWARD_IF_ERROR( ZSTD_loadEntropy_intoDDict(ddict, dictContentType) , "");
@@ -237,5 +238,5 @@ size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict)
 unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict)
 {
     if (ddict==NULL) return 0;
-    return ZSTD_getDictID_fromDict(ddict->dictContent, ddict->dictSize);
+    return ddict->dictID;
 }
diff --git a/lib/zstd/decompress/zstd_ddict.h b/lib/zstd/decompress/zstd_ddict.h
index 8c1a79d666f8..de459a0dacd1 100644
--- a/lib/zstd/decompress/zstd_ddict.h
+++ b/lib/zstd/decompress/zstd_ddict.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
diff --git a/lib/zstd/decompress/zstd_decompress.c b/lib/zstd/decompress/zstd_decompress.c
index 6b3177c94711..03dbdf39109f 100644
--- a/lib/zstd/decompress/zstd_decompress.c
+++ b/lib/zstd/decompress/zstd_decompress.c
@@ -1,5 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -52,17 +53,18 @@
 /*-*******************************************************
 *  Dependencies
 *********************************************************/
+#include "../common/allocations.h"  /* ZSTD_customMalloc, ZSTD_customCalloc, ZSTD_customFree */
 #include "../common/zstd_deps.h"   /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */
 #include "../common/mem.h"         /* low level memory routines */
 #define FSE_STATIC_LINKING_ONLY
 #include "../common/fse.h"
-#define HUF_STATIC_LINKING_ONLY
 #include "../common/huf.h"
 #include <linux/xxhash.h> /* xxh64_reset, xxh64_update, xxh64_digest, XXH64 */
 #include "../common/zstd_internal.h"  /* blockProperties_t */
 #include "zstd_decompress_internal.h"   /* ZSTD_DCtx */
 #include "zstd_ddict.h"  /* ZSTD_DDictDictContent */
 #include "zstd_decompress_block.h"   /* ZSTD_decompressBlock_internal */
+#include "../common/bits.h"  /* ZSTD_highbit32 */
 
 
 
@@ -72,11 +74,11 @@
  *************************************/
 
 #define DDICT_HASHSET_MAX_LOAD_FACTOR_COUNT_MULT 4
-#define DDICT_HASHSET_MAX_LOAD_FACTOR_SIZE_MULT 3   /* These two constants represent SIZE_MULT/COUNT_MULT load factor without using a float.
-                                                     * Currently, that means a 0.75 load factor.
-                                                     * So, if count * COUNT_MULT / size * SIZE_MULT != 0, then we've exceeded
-                                                     * the load factor of the ddict hash set.
-                                                     */
+#define DDICT_HASHSET_MAX_LOAD_FACTOR_SIZE_MULT 3  /* These two constants represent SIZE_MULT/COUNT_MULT load factor without using a float.
+                                                    * Currently, that means a 0.75 load factor.
+                                                    * So, if count * COUNT_MULT / size * SIZE_MULT != 0, then we've exceeded
+                                                    * the load factor of the ddict hash set.
+                                                    */
 
 #define DDICT_HASHSET_TABLE_BASE_SIZE 64
 #define DDICT_HASHSET_RESIZE_FACTOR 2
@@ -237,6 +239,7 @@ static void ZSTD_DCtx_resetParameters(ZSTD_DCtx* dctx)
     dctx->outBufferMode = ZSTD_bm_buffered;
     dctx->forceIgnoreChecksum = ZSTD_d_validateChecksum;
     dctx->refMultipleDDicts = ZSTD_rmd_refSingleDDict;
+    dctx->disableHufAsm = 0;
 }
 
 static void ZSTD_initDCtx_internal(ZSTD_DCtx* dctx)
@@ -421,16 +424,40 @@ size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize)
  *  note : only works for formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless
  * @return : 0, `zfhPtr` is correctly filled,
  *          >0, `srcSize` is too small, value is wanted `srcSize` amount,
- *           or an error code, which can be tested using ZSTD_isError() */
+**           or an error code, which can be tested using ZSTD_isError() */
 size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format)
 {
     const BYTE* ip = (const BYTE*)src;
     size_t const minInputSize = ZSTD_startingInputLength(format);
 
-    ZSTD_memset(zfhPtr, 0, sizeof(*zfhPtr));   /* not strictly necessary, but static analyzer do not understand that zfhPtr is only going to be read only if return value is zero, since they are 2 different signals */
-    if (srcSize < minInputSize) return minInputSize;
-    RETURN_ERROR_IF(src==NULL, GENERIC, "invalid parameter");
+    DEBUGLOG(5, "ZSTD_getFrameHeader_advanced: minInputSize = %zu, srcSize = %zu", minInputSize, srcSize);
+
+    if (srcSize > 0) {
+        /* note : technically could be considered an assert(), since it's an invalid entry */
+        RETURN_ERROR_IF(src==NULL, GENERIC, "invalid parameter : src==NULL, but srcSize>0");
+    }
+    if (srcSize < minInputSize) {
+        if (srcSize > 0 && format != ZSTD_f_zstd1_magicless) {
+            /* when receiving less than @minInputSize bytes,
+             * control these bytes at least correspond to a supported magic number
+             * in order to error out early if they don't.
+            **/
+            size_t const toCopy = MIN(4, srcSize);
+            unsigned char hbuf[4]; MEM_writeLE32(hbuf, ZSTD_MAGICNUMBER);
+            assert(src != NULL);
+            ZSTD_memcpy(hbuf, src, toCopy);
+            if ( MEM_readLE32(hbuf) != ZSTD_MAGICNUMBER ) {
+                /* not a zstd frame : let's check if it's a skippable frame */
+                MEM_writeLE32(hbuf, ZSTD_MAGIC_SKIPPABLE_START);
+                ZSTD_memcpy(hbuf, src, toCopy);
+                if ((MEM_readLE32(hbuf) & ZSTD_MAGIC_SKIPPABLE_MASK) != ZSTD_MAGIC_SKIPPABLE_START) {
+                    RETURN_ERROR(prefix_unknown,
+                                "first bytes don't correspond to any supported magic number");
+        }   }   }
+        return minInputSize;
+    }
 
+    ZSTD_memset(zfhPtr, 0, sizeof(*zfhPtr));   /* not strictly necessary, but static analyzers may not understand that zfhPtr will be read only if return value is zero, since they are 2 different signals */
     if ( (format != ZSTD_f_zstd1_magicless)
       && (MEM_readLE32(src) != ZSTD_MAGICNUMBER) ) {
         if ((MEM_readLE32(src) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) {
@@ -540,49 +567,52 @@ static size_t readSkippableFrameSize(void const* src, size_t srcSize)
     sizeU32 = MEM_readLE32((BYTE const*)src + ZSTD_FRAMEIDSIZE);
     RETURN_ERROR_IF((U32)(sizeU32 + ZSTD_SKIPPABLEHEADERSIZE) < sizeU32,
                     frameParameter_unsupported, "");
-    {
-        size_t const skippableSize = skippableHeaderSize + sizeU32;
+    {   size_t const skippableSize = skippableHeaderSize + sizeU32;
         RETURN_ERROR_IF(skippableSize > srcSize, srcSize_wrong, "");
         return skippableSize;
     }
 }
 
 /*! ZSTD_readSkippableFrame() :
- * Retrieves a zstd skippable frame containing data given by src, and writes it to dst buffer.
+ * Retrieves content of a skippable frame, and writes it to dst buffer.
  *
  * The parameter magicVariant will receive the magicVariant that was supplied when the frame was written,
  * i.e. magicNumber - ZSTD_MAGIC_SKIPPABLE_START.  This can be NULL if the caller is not interested
  * in the magicVariant.
  *
- * Returns an error if destination buffer is not large enough, or if the frame is not skippable.
+ * Returns an error if destination buffer is not large enough, or if this is not a valid skippable frame.
  *
  * @return : number of bytes written or a ZSTD error.
  */
-ZSTDLIB_API size_t ZSTD_readSkippableFrame(void* dst, size_t dstCapacity, unsigned* magicVariant,
-                                            const void* src, size_t srcSize)
+size_t ZSTD_readSkippableFrame(void* dst, size_t dstCapacity,
+                               unsigned* magicVariant,  /* optional, can be NULL */
+                         const void* src, size_t srcSize)
 {
-    U32 const magicNumber = MEM_readLE32(src);
-    size_t skippableFrameSize = readSkippableFrameSize(src, srcSize);
-    size_t skippableContentSize = skippableFrameSize - ZSTD_SKIPPABLEHEADERSIZE;
-
-    /* check input validity */
-    RETURN_ERROR_IF(!ZSTD_isSkippableFrame(src, srcSize), frameParameter_unsupported, "");
-    RETURN_ERROR_IF(skippableFrameSize < ZSTD_SKIPPABLEHEADERSIZE || skippableFrameSize > srcSize, srcSize_wrong, "");
-    RETURN_ERROR_IF(skippableContentSize > dstCapacity, dstSize_tooSmall, "");
+    RETURN_ERROR_IF(srcSize < ZSTD_SKIPPABLEHEADERSIZE, srcSize_wrong, "");
 
-    /* deliver payload */
-    if (skippableContentSize > 0  && dst != NULL)
-        ZSTD_memcpy(dst, (const BYTE *)src + ZSTD_SKIPPABLEHEADERSIZE, skippableContentSize);
-    if (magicVariant != NULL)
-        *magicVariant = magicNumber - ZSTD_MAGIC_SKIPPABLE_START;
-    return skippableContentSize;
+    {   U32 const magicNumber = MEM_readLE32(src);
+        size_t skippableFrameSize = readSkippableFrameSize(src, srcSize);
+        size_t skippableContentSize = skippableFrameSize - ZSTD_SKIPPABLEHEADERSIZE;
+
+        /* check input validity */
+        RETURN_ERROR_IF(!ZSTD_isSkippableFrame(src, srcSize), frameParameter_unsupported, "");
+        RETURN_ERROR_IF(skippableFrameSize < ZSTD_SKIPPABLEHEADERSIZE || skippableFrameSize > srcSize, srcSize_wrong, "");
+        RETURN_ERROR_IF(skippableContentSize > dstCapacity, dstSize_tooSmall, "");
+
+        /* deliver payload */
+        if (skippableContentSize > 0  && dst != NULL)
+            ZSTD_memcpy(dst, (const BYTE *)src + ZSTD_SKIPPABLEHEADERSIZE, skippableContentSize);
+        if (magicVariant != NULL)
+            *magicVariant = magicNumber - ZSTD_MAGIC_SKIPPABLE_START;
+        return skippableContentSize;
+    }
 }
 
 /* ZSTD_findDecompressedSize() :
- *  compatible with legacy mode
  *  `srcSize` must be the exact length of some number of ZSTD compressed and/or
  *      skippable frames
- *  @return : decompressed size of the frames contained */
+ *  note: compatible with legacy mode
+ * @return : decompressed size of the frames contained */
 unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize)
 {
     unsigned long long totalDstSize = 0;
@@ -592,9 +622,7 @@ unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize)
 
         if ((magicNumber & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) {
             size_t const skippableSize = readSkippableFrameSize(src, srcSize);
-            if (ZSTD_isError(skippableSize)) {
-                return ZSTD_CONTENTSIZE_ERROR;
-            }
+            if (ZSTD_isError(skippableSize)) return ZSTD_CONTENTSIZE_ERROR;
             assert(skippableSize <= srcSize);
 
             src = (const BYTE *)src + skippableSize;
@@ -602,17 +630,17 @@ unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize)
             continue;
         }
 
-        {   unsigned long long const ret = ZSTD_getFrameContentSize(src, srcSize);
-            if (ret >= ZSTD_CONTENTSIZE_ERROR) return ret;
+        {   unsigned long long const fcs = ZSTD_getFrameContentSize(src, srcSize);
+            if (fcs >= ZSTD_CONTENTSIZE_ERROR) return fcs;
 
-            /* check for overflow */
-            if (totalDstSize + ret < totalDstSize) return ZSTD_CONTENTSIZE_ERROR;
-            totalDstSize += ret;
+            if (totalDstSize + fcs < totalDstSize)
+                return ZSTD_CONTENTSIZE_ERROR; /* check for overflow */
+            totalDstSize += fcs;
         }
+        /* skip to next frame */
         {   size_t const frameSrcSize = ZSTD_findFrameCompressedSize(src, srcSize);
-            if (ZSTD_isError(frameSrcSize)) {
-                return ZSTD_CONTENTSIZE_ERROR;
-            }
+            if (ZSTD_isError(frameSrcSize)) return ZSTD_CONTENTSIZE_ERROR;
+            assert(frameSrcSize <= srcSize);
 
             src = (const BYTE *)src + frameSrcSize;
             srcSize -= frameSrcSize;
@@ -730,10 +758,11 @@ static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize
             ip += 4;
         }
 
+        frameSizeInfo.nbBlocks = nbBlocks;
         frameSizeInfo.compressedSize = (size_t)(ip - ipstart);
         frameSizeInfo.decompressedBound = (zfh.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN)
                                         ? zfh.frameContentSize
-                                        : nbBlocks * zfh.blockSizeMax;
+                                        : (unsigned long long)nbBlocks * zfh.blockSizeMax;
         return frameSizeInfo;
     }
 }
@@ -773,6 +802,48 @@ unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize)
     return bound;
 }
 
+size_t ZSTD_decompressionMargin(void const* src, size_t srcSize)
+{
+    size_t margin = 0;
+    unsigned maxBlockSize = 0;
+
+    /* Iterate over each frame */
+    while (srcSize > 0) {
+        ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize);
+        size_t const compressedSize = frameSizeInfo.compressedSize;
+        unsigned long long const decompressedBound = frameSizeInfo.decompressedBound;
+        ZSTD_frameHeader zfh;
+
+        FORWARD_IF_ERROR(ZSTD_getFrameHeader(&zfh, src, srcSize), "");
+        if (ZSTD_isError(compressedSize) || decompressedBound == ZSTD_CONTENTSIZE_ERROR)
+            return ERROR(corruption_detected);
+
+        if (zfh.frameType == ZSTD_frame) {
+            /* Add the frame header to our margin */
+            margin += zfh.headerSize;
+            /* Add the checksum to our margin */
+            margin += zfh.checksumFlag ? 4 : 0;
+            /* Add 3 bytes per block */
+            margin += 3 * frameSizeInfo.nbBlocks;
+
+            /* Compute the max block size */
+            maxBlockSize = MAX(maxBlockSize, zfh.blockSizeMax);
+        } else {
+            assert(zfh.frameType == ZSTD_skippableFrame);
+            /* Add the entire skippable frame size to our margin. */
+            margin += compressedSize;
+        }
+
+        assert(srcSize >= compressedSize);
+        src = (const BYTE*)src + compressedSize;
+        srcSize -= compressedSize;
+    }
+
+    /* Add the max block size back to the margin. */
+    margin += maxBlockSize;
+
+    return margin;
+}
 
 /*-*************************************************************
  *   Frame decoding
@@ -930,6 +1001,7 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx,
     }
     ZSTD_DCtx_trace_end(dctx, (U64)(op-ostart), (U64)(ip-istart), /* streaming */ 0);
     /* Allow caller to get size read */
+    DEBUGLOG(4, "ZSTD_decompressFrame: decompressed frame of size %zi, consuming %zi bytes of input", op-ostart, ip - (const BYTE*)*srcPtr);
     *srcPtr = ip;
     *srcSizePtr = remainingSrcSize;
     return (size_t)(op-ostart);
@@ -955,17 +1027,18 @@ static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx,
     while (srcSize >= ZSTD_startingInputLength(dctx->format)) {
 
 
-        {   U32 const magicNumber = MEM_readLE32(src);
-            DEBUGLOG(4, "reading magic number %08X (expecting %08X)",
-                        (unsigned)magicNumber, ZSTD_MAGICNUMBER);
+        if (srcSize >= 4) {
+            U32 const magicNumber = MEM_readLE32(src);
+            DEBUGLOG(5, "reading magic number %08X", (unsigned)magicNumber);
             if ((magicNumber & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) {
+                /* skippable frame detected : skip it */
                 size_t const skippableSize = readSkippableFrameSize(src, srcSize);
-                FORWARD_IF_ERROR(skippableSize, "readSkippableFrameSize failed");
+                FORWARD_IF_ERROR(skippableSize, "invalid skippable frame");
                 assert(skippableSize <= srcSize);
 
                 src = (const BYTE *)src + skippableSize;
                 srcSize -= skippableSize;
-                continue;
+                continue; /* check next frame */
         }   }
 
         if (ddict) {
@@ -1061,8 +1134,8 @@ size_t ZSTD_decompress(void* dst, size_t dstCapacity, const void* src, size_t sr
 size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx) { return dctx->expected; }
 
 /*
- * Similar to ZSTD_nextSrcSizeToDecompress(), but when a block input can be streamed,
- * we allow taking a partial block as the input. Currently only raw uncompressed blocks can
+ * Similar to ZSTD_nextSrcSizeToDecompress(), but when a block input can be streamed, we
+ * allow taking a partial block as the input. Currently only raw uncompressed blocks can
  * be streamed.
  *
  * For blocks that can be streamed, this allows us to reduce the latency until we produce
@@ -1262,7 +1335,7 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, c
 
     default:
         assert(0);   /* impossible */
-        RETURN_ERROR(GENERIC, "impossible to reach");   /* some compiler require default to do something */
+        RETURN_ERROR(GENERIC, "impossible to reach");   /* some compilers require default to do something */
     }
 }
 
@@ -1303,11 +1376,11 @@ ZSTD_loadDEntropy(ZSTD_entropyDTables_t* entropy,
         /* in minimal huffman, we always use X1 variants */
         size_t const hSize = HUF_readDTableX1_wksp(entropy->hufTable,
                                                 dictPtr, dictEnd - dictPtr,
-                                                workspace, workspaceSize);
+                                                workspace, workspaceSize, /* flags */ 0);
 #else
         size_t const hSize = HUF_readDTableX2_wksp(entropy->hufTable,
                                                 dictPtr, (size_t)(dictEnd - dictPtr),
-                                                workspace, workspaceSize);
+                                                workspace, workspaceSize, /* flags */ 0);
 #endif
         RETURN_ERROR_IF(HUF_isError(hSize), dictionary_corrupted, "");
         dictPtr += hSize;
@@ -1403,7 +1476,7 @@ size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx)
     dctx->prefixStart = NULL;
     dctx->virtualStart = NULL;
     dctx->dictEnd = NULL;
-    dctx->entropy.hufTable[0] = (HUF_DTable)((HufLog)*0x1000001);  /* cover both little and big endian */
+    dctx->entropy.hufTable[0] = (HUF_DTable)((ZSTD_HUFFDTABLE_CAPACITY_LOG)*0x1000001);  /* cover both little and big endian */
     dctx->litEntropy = dctx->fseEntropy = 0;
     dctx->dictID = 0;
     dctx->bType = bt_reserved;
@@ -1465,7 +1538,7 @@ unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize)
  *  This could for one of the following reasons :
  *  - The frame does not require a dictionary (most common case).
  *  - The frame was built with dictID intentionally removed.
- *    Needed dictionary is a hidden information.
+ *    Needed dictionary is a hidden piece of information.
  *    Note : this use case also happens when using a non-conformant dictionary.
  *  - `srcSize` is too small, and as a result, frame header could not be decoded.
  *    Note : possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`.
@@ -1474,7 +1547,7 @@ unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize)
  *  ZSTD_getFrameHeader(), which will provide a more precise error code. */
 unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize)
 {
-    ZSTD_frameHeader zfp = { 0, 0, 0, ZSTD_frame, 0, 0, 0 };
+    ZSTD_frameHeader zfp = { 0, 0, 0, ZSTD_frame, 0, 0, 0, 0, 0 };
     size_t const hError = ZSTD_getFrameHeader(&zfp, src, srcSize);
     if (ZSTD_isError(hError)) return 0;
     return zfp.dictID;
@@ -1581,7 +1654,9 @@ size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t di
 size_t ZSTD_initDStream(ZSTD_DStream* zds)
 {
     DEBUGLOG(4, "ZSTD_initDStream");
-    return ZSTD_initDStream_usingDDict(zds, NULL);
+    FORWARD_IF_ERROR(ZSTD_DCtx_reset(zds, ZSTD_reset_session_only), "");
+    FORWARD_IF_ERROR(ZSTD_DCtx_refDDict(zds, NULL), "");
+    return ZSTD_startingInputLength(zds->format);
 }
 
 /* ZSTD_initDStream_usingDDict() :
@@ -1589,6 +1664,7 @@ size_t ZSTD_initDStream(ZSTD_DStream* zds)
  * this function cannot fail */
 size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* dctx, const ZSTD_DDict* ddict)
 {
+    DEBUGLOG(4, "ZSTD_initDStream_usingDDict");
     FORWARD_IF_ERROR( ZSTD_DCtx_reset(dctx, ZSTD_reset_session_only) , "");
     FORWARD_IF_ERROR( ZSTD_DCtx_refDDict(dctx, ddict) , "");
     return ZSTD_startingInputLength(dctx->format);
@@ -1599,6 +1675,7 @@ size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* dctx, const ZSTD_DDict* ddict)
  * this function cannot fail */
 size_t ZSTD_resetDStream(ZSTD_DStream* dctx)
 {
+    DEBUGLOG(4, "ZSTD_resetDStream");
     FORWARD_IF_ERROR(ZSTD_DCtx_reset(dctx, ZSTD_reset_session_only), "");
     return ZSTD_startingInputLength(dctx->format);
 }
@@ -1670,6 +1747,11 @@ ZSTD_bounds ZSTD_dParam_getBounds(ZSTD_dParameter dParam)
             bounds.lowerBound = (int)ZSTD_rmd_refSingleDDict;
             bounds.upperBound = (int)ZSTD_rmd_refMultipleDDicts;
             return bounds;
+        case ZSTD_d_disableHuffmanAssembly:
+            bounds.lowerBound = 0;
+            bounds.upperBound = 1;
+            return bounds;
+
         default:;
     }
     bounds.error = ERROR(parameter_unsupported);
@@ -1710,6 +1792,9 @@ size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParameter param, int* value
         case ZSTD_d_refMultipleDDicts:
             *value = (int)dctx->refMultipleDDicts;
             return 0;
+        case ZSTD_d_disableHuffmanAssembly:
+            *value = (int)dctx->disableHufAsm;
+            return 0;
         default:;
     }
     RETURN_ERROR(parameter_unsupported, "");
@@ -1743,6 +1828,10 @@ size_t ZSTD_DCtx_setParameter(ZSTD_DCtx* dctx, ZSTD_dParameter dParam, int value
             }
             dctx->refMultipleDDicts = (ZSTD_refMultipleDDicts_e)value;
             return 0;
+        case ZSTD_d_disableHuffmanAssembly:
+            CHECK_DBOUNDS(ZSTD_d_disableHuffmanAssembly, value);
+            dctx->disableHufAsm = value != 0;
+            return 0;
         default:;
     }
     RETURN_ERROR(parameter_unsupported, "");
@@ -1918,7 +2007,6 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
                 if (zds->refMultipleDDicts && zds->ddictSet) {
                     ZSTD_DCtx_selectFrameDDict(zds);
                 }
-                DEBUGLOG(5, "header size : %u", (U32)hSize);
                 if (ZSTD_isError(hSize)) {
                     return hSize;   /* error */
                 }
@@ -1932,6 +2020,11 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
                             zds->lhSize += remainingInput;
                         }
                         input->pos = input->size;
+                        /* check first few bytes */
+                        FORWARD_IF_ERROR(
+                            ZSTD_getFrameHeader_advanced(&zds->fParams, zds->headerBuffer, zds->lhSize, zds->format),
+                            "First few bytes detected incorrect" );
+                        /* return hint input size */
                         return (MAX((size_t)ZSTD_FRAMEHEADERSIZE_MIN(zds->format), hSize) - zds->lhSize) + ZSTD_blockHeaderSize;   /* remaining header bytes + next block header */
                     }
                     assert(ip != NULL);
@@ -1949,8 +2042,9 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
                     size_t const decompressedSize = ZSTD_decompress_usingDDict(zds, op, (size_t)(oend-op), istart, cSize, ZSTD_getDDict(zds));
                     if (ZSTD_isError(decompressedSize)) return decompressedSize;
                     DEBUGLOG(4, "shortcut to single-pass ZSTD_decompress_usingDDict()")
+                    assert(istart != NULL);
                     ip = istart + cSize;
-                    op += decompressedSize;
+                    op = op ? op + decompressedSize : op; /* can occur if frameContentSize = 0 (empty frame) */
                     zds->expected = 0;
                     zds->streamStage = zdss_init;
                     someMoreWork = 0;
@@ -2034,6 +2128,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
                 }
                 if ((size_t)(iend-ip) >= neededInSize) {  /* decode directly from src */
                     FORWARD_IF_ERROR(ZSTD_decompressContinueStream(zds, &op, oend, ip, neededInSize), "");
+                    assert(ip != NULL);
                     ip += neededInSize;
                     /* Function modifies the stage so we must break */
                     break;
@@ -2048,7 +2143,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
                 int const isSkipFrame = ZSTD_isSkipFrame(zds);
                 size_t loadedSize;
                 /* At this point we shouldn't be decompressing a block that we can stream. */
-                assert(neededInSize == ZSTD_nextSrcSizeToDecompressWithInputSize(zds, iend - ip));
+                assert(neededInSize == ZSTD_nextSrcSizeToDecompressWithInputSize(zds, (size_t)(iend - ip)));
                 if (isSkipFrame) {
                     loadedSize = MIN(toLoad, (size_t)(iend-ip));
                 } else {
@@ -2057,8 +2152,11 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
                                     "should never happen");
                     loadedSize = ZSTD_limitCopy(zds->inBuff + zds->inPos, toLoad, ip, (size_t)(iend-ip));
                 }
-                ip += loadedSize;
-                zds->inPos += loadedSize;
+                if (loadedSize != 0) {
+                    /* ip may be NULL */
+                    ip += loadedSize;
+                    zds->inPos += loadedSize;
+                }
                 if (loadedSize < toLoad) { someMoreWork = 0; break; }   /* not enough input, wait for more */
 
                 /* decode loaded input */
@@ -2068,14 +2166,17 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
                 break;
             }
         case zdss_flush:
-            {   size_t const toFlushSize = zds->outEnd - zds->outStart;
+            {
+                size_t const toFlushSize = zds->outEnd - zds->outStart;
                 size_t const flushedSize = ZSTD_limitCopy(op, (size_t)(oend-op), zds->outBuff + zds->outStart, toFlushSize);
-                op += flushedSize;
+
+                op = op ? op + flushedSize : op;
+
                 zds->outStart += flushedSize;
                 if (flushedSize == toFlushSize) {  /* flush completed */
                     zds->streamStage = zdss_read;
                     if ( (zds->outBuffSize < zds->fParams.frameContentSize)
-                      && (zds->outStart + zds->fParams.blockSizeMax > zds->outBuffSize) ) {
+                        && (zds->outStart + zds->fParams.blockSizeMax > zds->outBuffSize) ) {
                         DEBUGLOG(5, "restart filling outBuff from beginning (left:%i, needed:%u)",
                                 (int)(zds->outBuffSize - zds->outStart),
                                 (U32)zds->fParams.blockSizeMax);
@@ -2089,7 +2190,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
 
         default:
             assert(0);    /* impossible */
-            RETURN_ERROR(GENERIC, "impossible to reach");   /* some compiler require default to do something */
+            RETURN_ERROR(GENERIC, "impossible to reach");   /* some compilers require default to do something */
     }   }
 
     /* result */
@@ -2102,8 +2203,8 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
     if ((ip==istart) && (op==ostart)) {  /* no forward progress */
         zds->noForwardProgress ++;
         if (zds->noForwardProgress >= ZSTD_NO_FORWARD_PROGRESS_MAX) {
-            RETURN_ERROR_IF(op==oend, dstSize_tooSmall, "");
-            RETURN_ERROR_IF(ip==iend, srcSize_wrong, "");
+            RETURN_ERROR_IF(op==oend, noForwardProgress_destFull, "");
+            RETURN_ERROR_IF(ip==iend, noForwardProgress_inputEmpty, "");
             assert(0);
         }
     } else {
@@ -2140,11 +2241,17 @@ size_t ZSTD_decompressStream_simpleArgs (
                             void* dst, size_t dstCapacity, size_t* dstPos,
                       const void* src, size_t srcSize, size_t* srcPos)
 {
-    ZSTD_outBuffer output = { dst, dstCapacity, *dstPos };
-    ZSTD_inBuffer  input  = { src, srcSize, *srcPos };
-    /* ZSTD_compress_generic() will check validity of dstPos and srcPos */
-    size_t const cErr = ZSTD_decompressStream(dctx, &output, &input);
-    *dstPos = output.pos;
-    *srcPos = input.pos;
-    return cErr;
+    ZSTD_outBuffer output;
+    ZSTD_inBuffer  input;
+    output.dst = dst;
+    output.size = dstCapacity;
+    output.pos = *dstPos;
+    input.src = src;
+    input.size = srcSize;
+    input.pos = *srcPos;
+    {   size_t const cErr = ZSTD_decompressStream(dctx, &output, &input);
+        *dstPos = output.pos;
+        *srcPos = input.pos;
+        return cErr;
+    }
 }
diff --git a/lib/zstd/decompress/zstd_decompress_block.c b/lib/zstd/decompress/zstd_decompress_block.c
index c1913b8e7c89..9f5577e5bc19 100644
--- a/lib/zstd/decompress/zstd_decompress_block.c
+++ b/lib/zstd/decompress/zstd_decompress_block.c
@@ -1,5 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -20,12 +21,12 @@
 #include "../common/mem.h"         /* low level memory routines */
 #define FSE_STATIC_LINKING_ONLY
 #include "../common/fse.h"
-#define HUF_STATIC_LINKING_ONLY
 #include "../common/huf.h"
 #include "../common/zstd_internal.h"
 #include "zstd_decompress_internal.h"   /* ZSTD_DCtx */
 #include "zstd_ddict.h"  /* ZSTD_DDictDictContent */
 #include "zstd_decompress_block.h"
+#include "../common/bits.h"  /* ZSTD_highbit32 */
 
 /*_*******************************************************
 *  Macros
@@ -89,7 +90,7 @@ static void ZSTD_allocateLiteralsBuffer(ZSTD_DCtx* dctx, void* const dst, const
             dctx->litBufferEnd = dctx->litBuffer + litSize - ZSTD_LITBUFFEREXTRASIZE;
         }
         else {
-            /* initially this will be stored entirely in dst during huffman decoding, it will partially shifted to litExtraBuffer after */
+            /* initially this will be stored entirely in dst during huffman decoding, it will partially be shifted to litExtraBuffer after */
             dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize;
             dctx->litBufferEnd = (BYTE*)dst + expectedWriteSize;
         }
@@ -134,13 +135,16 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
             ZSTD_FALLTHROUGH;
 
         case set_compressed:
-            RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need up to 5 for case 3");
+            RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need up to 5 for case 3");
             {   size_t lhSize, litSize, litCSize;
                 U32 singleStream=0;
                 U32 const lhlCode = (istart[0] >> 2) & 3;
                 U32 const lhc = MEM_readLE32(istart);
                 size_t hufSuccess;
                 size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity);
+                int const flags = 0
+                    | (ZSTD_DCtx_get_bmi2(dctx) ? HUF_flags_bmi2 : 0)
+                    | (dctx->disableHufAsm ? HUF_flags_disableAsm : 0);
                 switch(lhlCode)
                 {
                 case 0: case 1: default:   /* note : default is impossible, since lhlCode into [0..3] */
@@ -165,6 +169,10 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
                 }
                 RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled");
                 RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, "");
+                if (!singleStream)
+                    RETURN_ERROR_IF(litSize < MIN_LITERALS_FOR_4_STREAMS, literals_headerWrong,
+                        "Not enough literals (%zu) for the 4-streams mode (min %u)",
+                        litSize, MIN_LITERALS_FOR_4_STREAMS);
                 RETURN_ERROR_IF(litCSize + lhSize > srcSize, corruption_detected, "");
                 RETURN_ERROR_IF(expectedWriteSize < litSize , dstSize_tooSmall, "");
                 ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 0);
@@ -176,13 +184,14 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
 
                 if (litEncType==set_repeat) {
                     if (singleStream) {
-                        hufSuccess = HUF_decompress1X_usingDTable_bmi2(
+                        hufSuccess = HUF_decompress1X_usingDTable(
                             dctx->litBuffer, litSize, istart+lhSize, litCSize,
-                            dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx));
+                            dctx->HUFptr, flags);
                     } else {
-                        hufSuccess = HUF_decompress4X_usingDTable_bmi2(
+                        assert(litSize >= MIN_LITERALS_FOR_4_STREAMS);
+                        hufSuccess = HUF_decompress4X_usingDTable(
                             dctx->litBuffer, litSize, istart+lhSize, litCSize,
-                            dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx));
+                            dctx->HUFptr, flags);
                     }
                 } else {
                     if (singleStream) {
@@ -190,18 +199,18 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
                         hufSuccess = HUF_decompress1X_DCtx_wksp(
                             dctx->entropy.hufTable, dctx->litBuffer, litSize,
                             istart+lhSize, litCSize, dctx->workspace,
-                            sizeof(dctx->workspace));
+                            sizeof(dctx->workspace), flags);
 #else
-                        hufSuccess = HUF_decompress1X1_DCtx_wksp_bmi2(
+                        hufSuccess = HUF_decompress1X1_DCtx_wksp(
                             dctx->entropy.hufTable, dctx->litBuffer, litSize,
                             istart+lhSize, litCSize, dctx->workspace,
-                            sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dctx));
+                            sizeof(dctx->workspace), flags);
 #endif
                     } else {
-                        hufSuccess = HUF_decompress4X_hufOnly_wksp_bmi2(
+                        hufSuccess = HUF_decompress4X_hufOnly_wksp(
                             dctx->entropy.hufTable, dctx->litBuffer, litSize,
                             istart+lhSize, litCSize, dctx->workspace,
-                            sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dctx));
+                            sizeof(dctx->workspace), flags);
                     }
                 }
                 if (dctx->litBufferLocation == ZSTD_split)
@@ -237,6 +246,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
                     break;
                 case 3:
                     lhSize = 3;
+                    RETURN_ERROR_IF(srcSize<3, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize = 3");
                     litSize = MEM_readLE24(istart) >> 4;
                     break;
                 }
@@ -279,12 +289,13 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
                     break;
                 case 1:
                     lhSize = 2;
+                    RETURN_ERROR_IF(srcSize<3, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize+1 = 3");
                     litSize = MEM_readLE16(istart) >> 4;
                     break;
                 case 3:
                     lhSize = 3;
+                    RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize+1 = 4");
                     litSize = MEM_readLE24(istart) >> 4;
-                    RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need lhSize+1 = 4");
                     break;
                 }
                 RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled");
@@ -506,14 +517,15 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt,
                 for (i = 8; i < n; i += 8) {
                     MEM_write64(spread + pos + i, sv);
                 }
-                pos += n;
+                assert(n>=0);
+                pos += (size_t)n;
             }
         }
         /* Now we spread those positions across the table.
-         * The benefit of doing it in two stages is that we avoid the the
+         * The benefit of doing it in two stages is that we avoid the
          * variable size inner loop, which caused lots of branch misses.
          * Now we can run through all the positions without any branch misses.
-         * We unroll the loop twice, since that is what emperically worked best.
+         * We unroll the loop twice, since that is what empirically worked best.
          */
         {
             size_t position = 0;
@@ -540,7 +552,7 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt,
             for (i=0; i<n; i++) {
                 tableDecode[position].baseValue = s;
                 position = (position + step) & tableMask;
-                while (position > highThreshold) position = (position + step) & tableMask;   /* lowprob area */
+                while (UNLIKELY(position > highThreshold)) position = (position + step) & tableMask;   /* lowprob area */
         }   }
         assert(position == 0); /* position must reach all cells once, otherwise normalizedCounter is incorrect */
     }
@@ -551,7 +563,7 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt,
         for (u=0; u<tableSize; u++) {
             U32 const symbol = tableDecode[u].baseValue;
             U32 const nextState = symbolNext[symbol]++;
-            tableDecode[u].nbBits = (BYTE) (tableLog - BIT_highbit32(nextState) );
+            tableDecode[u].nbBits = (BYTE) (tableLog - ZSTD_highbit32(nextState) );
             tableDecode[u].nextState = (U16) ( (nextState << tableDecode[u].nbBits) - tableSize);
             assert(nbAdditionalBits[symbol] < 255);
             tableDecode[u].nbAdditionalBits = nbAdditionalBits[symbol];
@@ -964,6 +976,11 @@ size_t ZSTD_execSequence(BYTE* op,
 
     assert(op != NULL /* Precondition */);
     assert(oend_w < oend /* No underflow */);
+
+#if defined(__aarch64__)
+    /* prefetch sequence starting from match that will be used for copy later */
+    PREFETCH_L1(match);
+#endif
     /* Handle edge cases in a slow path:
      *   - Read beyond end of literals
      *   - Match end is within WILDCOPY_OVERLIMIT of oend
@@ -1154,7 +1171,7 @@ ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, U16
 }
 
 /* We need to add at most (ZSTD_WINDOWLOG_MAX_32 - 1) bits to read the maximum
- * offset bits. But we can only read at most (STREAM_ACCUMULATOR_MIN_32 - 1)
+ * offset bits. But we can only read at most STREAM_ACCUMULATOR_MIN_32
  * bits before reloading. This value is the maximum number of bytes we read
  * after reloading when we are decoding long offsets.
  */
@@ -1169,9 +1186,27 @@ FORCE_INLINE_TEMPLATE seq_t
 ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets)
 {
     seq_t seq;
+    /*
+     * ZSTD_seqSymbol is a structure with a total of 64 bits wide. So it can be
+     * loaded in one operation and extracted its fields by simply shifting or
+     * bit-extracting on aarch64.
+     * GCC doesn't recognize this and generates more unnecessary ldr/ldrb/ldrh
+     * operations that cause performance drop. This can be avoided by using this
+     * ZSTD_memcpy hack.
+     */
+#if defined(__aarch64__) && (defined(__GNUC__) && !defined(__clang__))
+    ZSTD_seqSymbol llDInfoS, mlDInfoS, ofDInfoS;
+    ZSTD_seqSymbol* const llDInfo = &llDInfoS;
+    ZSTD_seqSymbol* const mlDInfo = &mlDInfoS;
+    ZSTD_seqSymbol* const ofDInfo = &ofDInfoS;
+    ZSTD_memcpy(llDInfo, seqState->stateLL.table + seqState->stateLL.state, sizeof(ZSTD_seqSymbol));
+    ZSTD_memcpy(mlDInfo, seqState->stateML.table + seqState->stateML.state, sizeof(ZSTD_seqSymbol));
+    ZSTD_memcpy(ofDInfo, seqState->stateOffb.table + seqState->stateOffb.state, sizeof(ZSTD_seqSymbol));
+#else
     const ZSTD_seqSymbol* const llDInfo = seqState->stateLL.table + seqState->stateLL.state;
     const ZSTD_seqSymbol* const mlDInfo = seqState->stateML.table + seqState->stateML.state;
     const ZSTD_seqSymbol* const ofDInfo = seqState->stateOffb.table + seqState->stateOffb.state;
+#endif
     seq.matchLength = mlDInfo->baseValue;
     seq.litLength = llDInfo->baseValue;
     {   U32 const ofBase = ofDInfo->baseValue;
@@ -1186,28 +1221,31 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets)
         U32 const llnbBits = llDInfo->nbBits;
         U32 const mlnbBits = mlDInfo->nbBits;
         U32 const ofnbBits = ofDInfo->nbBits;
+
+        assert(llBits <= MaxLLBits);
+        assert(mlBits <= MaxMLBits);
+        assert(ofBits <= MaxOff);
         /*
          * As gcc has better branch and block analyzers, sometimes it is only
-         * valuable to mark likelyness for clang, it gives around 3-4% of
+         * valuable to mark likeliness for clang, it gives around 3-4% of
          * performance.
          */
 
         /* sequence */
         {   size_t offset;
-    #if defined(__clang__)
-            if (LIKELY(ofBits > 1)) {
-    #else
             if (ofBits > 1) {
-    #endif
                 ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1);
                 ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5);
-                assert(ofBits <= MaxOff);
+                ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 > LONG_OFFSETS_MAX_EXTRA_BITS_32);
+                ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 - LONG_OFFSETS_MAX_EXTRA_BITS_32 >= MaxMLBits);
                 if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) {
-                    U32 const extraBits = ofBits - MIN(ofBits, 32 - seqState->DStream.bitsConsumed);
+                    /* Always read extra bits, this keeps the logic simple,
+                     * avoids branches, and avoids accidentally reading 0 bits.
+                     */
+                    U32 const extraBits = LONG_OFFSETS_MAX_EXTRA_BITS_32;
                     offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits);
                     BIT_reloadDStream(&seqState->DStream);
-                    if (extraBits) offset += BIT_readBitsFast(&seqState->DStream, extraBits);
-                    assert(extraBits <= LONG_OFFSETS_MAX_EXTRA_BITS_32);   /* to avoid another reload */
+                    offset += BIT_readBitsFast(&seqState->DStream, extraBits);
                 } else {
                     offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/);   /* <=  (ZSTD_WINDOWLOG_MAX-1) bits */
                     if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);
@@ -1232,11 +1270,7 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets)
             seq.offset = offset;
         }
 
-    #if defined(__clang__)
-        if (UNLIKELY(mlBits > 0))
-    #else
         if (mlBits > 0)
-    #endif
             seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/);
 
         if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32))
@@ -1246,11 +1280,7 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets)
         /* Ensure there are enough bits to read the rest of data in 64-bit mode. */
         ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64);
 
-    #if defined(__clang__)
-        if (UNLIKELY(llBits > 0))
-    #else
         if (llBits > 0)
-    #endif
             seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/);
 
         if (MEM_32bits())
@@ -1552,7 +1582,7 @@ ZSTD_decompressSequences_body(ZSTD_DCtx* dctx,
     const BYTE* const prefixStart = (const BYTE*)(dctx->prefixStart);
     const BYTE* const vBase = (const BYTE*)(dctx->virtualStart);
     const BYTE* const dictEnd = (const BYTE*)(dctx->dictEnd);
-    DEBUGLOG(5, "ZSTD_decompressSequences_body");
+    DEBUGLOG(5, "ZSTD_decompressSequences_body: nbSeq = %d", nbSeq);
     (void)frame;
 
     /* Regen sequences */
@@ -1945,34 +1975,79 @@ ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx,
 #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */
 
 
+/*
+ * @returns The total size of the history referenceable by zstd, including
+ * both the prefix and the extDict. At @p op any offset larger than this
+ * is invalid.
+ */
+static size_t ZSTD_totalHistorySize(BYTE* op, BYTE const* virtualStart)
+{
+    return (size_t)(op - virtualStart);
+}
+
+typedef struct {
+    unsigned longOffsetShare;
+    unsigned maxNbAdditionalBits;
+} ZSTD_OffsetInfo;
 
-#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
-    !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
-/* ZSTD_getLongOffsetsShare() :
+/* ZSTD_getOffsetInfo() :
  * condition : offTable must be valid
  * @return : "share" of long offsets (arbitrarily defined as > (1<<23))
- *           compared to maximum possible of (1<<OffFSELog) */
-static unsigned
-ZSTD_getLongOffsetsShare(const ZSTD_seqSymbol* offTable)
+ *           compared to maximum possible of (1<<OffFSELog),
+ *           as well as the maximum number additional bits required.
+ */
+static ZSTD_OffsetInfo
+ZSTD_getOffsetInfo(const ZSTD_seqSymbol* offTable, int nbSeq)
 {
-    const void* ptr = offTable;
-    U32 const tableLog = ((const ZSTD_seqSymbol_header*)ptr)[0].tableLog;
-    const ZSTD_seqSymbol* table = offTable + 1;
-    U32 const max = 1 << tableLog;
-    U32 u, total = 0;
-    DEBUGLOG(5, "ZSTD_getLongOffsetsShare: (tableLog=%u)", tableLog);
-
-    assert(max <= (1 << OffFSELog));  /* max not too large */
-    for (u=0; u<max; u++) {
-        if (table[u].nbAdditionalBits > 22) total += 1;
+    ZSTD_OffsetInfo info = {0, 0};
+    /* If nbSeq == 0, then the offTable is uninitialized, but we have
+     * no sequences, so both values should be 0.
+     */
+    if (nbSeq != 0) {
+        const void* ptr = offTable;
+        U32 const tableLog = ((const ZSTD_seqSymbol_header*)ptr)[0].tableLog;
+        const ZSTD_seqSymbol* table = offTable + 1;
+        U32 const max = 1 << tableLog;
+        U32 u;
+        DEBUGLOG(5, "ZSTD_getLongOffsetsShare: (tableLog=%u)", tableLog);
+
+        assert(max <= (1 << OffFSELog));  /* max not too large */
+        for (u=0; u<max; u++) {
+            info.maxNbAdditionalBits = MAX(info.maxNbAdditionalBits, table[u].nbAdditionalBits);
+            if (table[u].nbAdditionalBits > 22) info.longOffsetShare += 1;
+        }
+
+        assert(tableLog <= OffFSELog);
+        info.longOffsetShare <<= (OffFSELog - tableLog);  /* scale to OffFSELog */
     }
 
-    assert(tableLog <= OffFSELog);
-    total <<= (OffFSELog - tableLog);  /* scale to OffFSELog */
+    return info;
+}
 
-    return total;
+/*
+ * @returns The maximum offset we can decode in one read of our bitstream, without
+ * reloading more bits in the middle of the offset bits read. Any offsets larger
+ * than this must use the long offset decoder.
+ */
+static size_t ZSTD_maxShortOffset(void)
+{
+    if (MEM_64bits()) {
+        /* We can decode any offset without reloading bits.
+         * This might change if the max window size grows.
+         */
+        ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX <= 31);
+        return (size_t)-1;
+    } else {
+        /* The maximum offBase is (1 << (STREAM_ACCUMULATOR_MIN + 1)) - 1.
+         * This offBase would require STREAM_ACCUMULATOR_MIN extra bits.
+         * Then we have to subtract ZSTD_REP_NUM to get the maximum possible offset.
+         */
+        size_t const maxOffbase = ((size_t)1 << (STREAM_ACCUMULATOR_MIN + 1)) - 1;
+        size_t const maxOffset = maxOffbase - ZSTD_REP_NUM;
+        assert(ZSTD_highbit32((U32)maxOffbase) == STREAM_ACCUMULATOR_MIN);
+        return maxOffset;
+    }
 }
-#endif
 
 size_t
 ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
@@ -1980,20 +2055,21 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
                         const void* src, size_t srcSize, const int frame, const streaming_operation streaming)
 {   /* blockType == blockCompressed */
     const BYTE* ip = (const BYTE*)src;
-    /* isLongOffset must be true if there are long offsets.
-     * Offsets are long if they are larger than 2^STREAM_ACCUMULATOR_MIN.
-     * We don't expect that to be the case in 64-bit mode.
-     * In block mode, window size is not known, so we have to be conservative.
-     * (note: but it could be evaluated from current-lowLimit)
-     */
-    ZSTD_longOffset_e const isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (!frame || (dctx->fParams.windowSize > (1ULL << STREAM_ACCUMULATOR_MIN))));
     DEBUGLOG(5, "ZSTD_decompressBlock_internal (size : %u)", (U32)srcSize);
 
-    RETURN_ERROR_IF(srcSize >= ZSTD_BLOCKSIZE_MAX, srcSize_wrong, "");
+    /* Note : the wording of the specification
+     * allows compressed block to be sized exactly ZSTD_BLOCKSIZE_MAX.
+     * This generally does not happen, as it makes little sense,
+     * since an uncompressed block would feature same size and have no decompression cost.
+     * Also, note that decoder from reference libzstd before < v1.5.4
+     * would consider this edge case as an error.
+     * As a consequence, avoid generating compressed blocks of size ZSTD_BLOCKSIZE_MAX
+     * for broader compatibility with the deployed ecosystem of zstd decoders */
+    RETURN_ERROR_IF(srcSize > ZSTD_BLOCKSIZE_MAX, srcSize_wrong, "");
 
     /* Decode literals section */
     {   size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize, dst, dstCapacity, streaming);
-        DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : %u", (U32)litCSize);
+        DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : cSize=%u, nbLiterals=%zu", (U32)litCSize, dctx->litSize);
         if (ZSTD_isError(litCSize)) return litCSize;
         ip += litCSize;
         srcSize -= litCSize;
@@ -2001,6 +2077,23 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
 
     /* Build Decoding Tables */
     {
+        /* Compute the maximum block size, which must also work when !frame and fParams are unset.
+         * Additionally, take the min with dstCapacity to ensure that the totalHistorySize fits in a size_t.
+         */
+        size_t const blockSizeMax = MIN(dstCapacity, (frame ? dctx->fParams.blockSizeMax : ZSTD_BLOCKSIZE_MAX));
+        size_t const totalHistorySize = ZSTD_totalHistorySize((BYTE*)dst + blockSizeMax, (BYTE const*)dctx->virtualStart);
+        /* isLongOffset must be true if there are long offsets.
+         * Offsets are long if they are larger than ZSTD_maxShortOffset().
+         * We don't expect that to be the case in 64-bit mode.
+         *
+         * We check here to see if our history is large enough to allow long offsets.
+         * If it isn't, then we can't possible have (valid) long offsets. If the offset
+         * is invalid, then it is okay to read it incorrectly.
+         *
+         * If isLongOffsets is true, then we will later check our decoding table to see
+         * if it is even possible to generate long offsets.
+         */
+        ZSTD_longOffset_e isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (totalHistorySize > ZSTD_maxShortOffset()));
         /* These macros control at build-time which decompressor implementation
          * we use. If neither is defined, we do some inspection and dispatch at
          * runtime.
@@ -2008,6 +2101,11 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
 #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
     !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
         int usePrefetchDecoder = dctx->ddictIsCold;
+#else
+        /* Set to 1 to avoid computing offset info if we don't need to.
+         * Otherwise this value is ignored.
+         */
+        int usePrefetchDecoder = 1;
 #endif
         int nbSeq;
         size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, srcSize);
@@ -2015,28 +2113,42 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
         ip += seqHSize;
         srcSize -= seqHSize;
 
-        RETURN_ERROR_IF(dst == NULL && nbSeq > 0, dstSize_tooSmall, "NULL not handled");
+        RETURN_ERROR_IF((dst == NULL || dstCapacity == 0) && nbSeq > 0, dstSize_tooSmall, "NULL not handled");
+        RETURN_ERROR_IF(MEM_64bits() && sizeof(size_t) == sizeof(void*) && (size_t)(-1) - (size_t)dst < (size_t)(1 << 20), dstSize_tooSmall,
+                "invalid dst");
 
-#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
-    !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
-        if ( !usePrefetchDecoder
-          && (!frame || (dctx->fParams.windowSize > (1<<24)))
-          && (nbSeq>ADVANCED_SEQS) ) {  /* could probably use a larger nbSeq limit */
-            U32 const shareLongOffsets = ZSTD_getLongOffsetsShare(dctx->OFTptr);
-            U32 const minShare = MEM_64bits() ? 7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */
-            usePrefetchDecoder = (shareLongOffsets >= minShare);
+        /* If we could potentially have long offsets, or we might want to use the prefetch decoder,
+         * compute information about the share of long offsets, and the maximum nbAdditionalBits.
+         * NOTE: could probably use a larger nbSeq limit
+         */
+        if (isLongOffset || (!usePrefetchDecoder && (totalHistorySize > (1u << 24)) && (nbSeq > 8))) {
+            ZSTD_OffsetInfo const info = ZSTD_getOffsetInfo(dctx->OFTptr, nbSeq);
+            if (isLongOffset && info.maxNbAdditionalBits <= STREAM_ACCUMULATOR_MIN) {
+                /* If isLongOffset, but the maximum number of additional bits that we see in our table is small
+                 * enough, then we know it is impossible to have too long an offset in this block, so we can
+                 * use the regular offset decoder.
+                 */
+                isLongOffset = ZSTD_lo_isRegularOffset;
+            }
+            if (!usePrefetchDecoder) {
+                U32 const minShare = MEM_64bits() ? 7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */
+                usePrefetchDecoder = (info.longOffsetShare >= minShare);
+            }
         }
-#endif
 
         dctx->ddictIsCold = 0;
 
 #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
     !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
-        if (usePrefetchDecoder)
+        if (usePrefetchDecoder) {
+#else
+        (void)usePrefetchDecoder;
+        {
 #endif
 #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
             return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame);
 #endif
+        }
 
 #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
         /* else */
@@ -2060,9 +2172,9 @@ void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst, size_t dstSize)
 }
 
 
-size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx,
-                            void* dst, size_t dstCapacity,
-                      const void* src, size_t srcSize)
+size_t ZSTD_decompressBlock_deprecated(ZSTD_DCtx* dctx,
+                                       void* dst, size_t dstCapacity,
+                                 const void* src, size_t srcSize)
 {
     size_t dSize;
     ZSTD_checkContinuity(dctx, dst, dstCapacity);
@@ -2070,3 +2182,12 @@ size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx,
     dctx->previousDstEnd = (char*)dst + dSize;
     return dSize;
 }
+
+
+/* NOTE: Must just wrap ZSTD_decompressBlock_deprecated() */
+size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx,
+                            void* dst, size_t dstCapacity,
+                      const void* src, size_t srcSize)
+{
+    return ZSTD_decompressBlock_deprecated(dctx, dst, dstCapacity, src, srcSize);
+}
diff --git a/lib/zstd/decompress/zstd_decompress_block.h b/lib/zstd/decompress/zstd_decompress_block.h
index 3d2d57a5d25a..5888e6cc788b 100644
--- a/lib/zstd/decompress/zstd_decompress_block.h
+++ b/lib/zstd/decompress/zstd_decompress_block.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -64,5 +65,10 @@ void ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
                    unsigned tableLog, void* wksp, size_t wkspSize,
                    int bmi2);
 
+/* Internal definition of ZSTD_decompressBlock() to avoid deprecation warnings. */
+size_t ZSTD_decompressBlock_deprecated(ZSTD_DCtx* dctx,
+                            void* dst, size_t dstCapacity,
+                      const void* src, size_t srcSize);
+
 
 #endif /* ZSTD_DEC_BLOCK_H */
diff --git a/lib/zstd/decompress/zstd_decompress_internal.h b/lib/zstd/decompress/zstd_decompress_internal.h
index 98102edb6a83..32f79fb2873d 100644
--- a/lib/zstd/decompress/zstd_decompress_internal.h
+++ b/lib/zstd/decompress/zstd_decompress_internal.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -75,12 +76,13 @@ static UNUSED_ATTR const U32 ML_base[MaxML+1] = {
 
 #define ZSTD_BUILD_FSE_TABLE_WKSP_SIZE (sizeof(S16) * (MaxSeq + 1) + (1u << MaxFSELog) + sizeof(U64))
 #define ZSTD_BUILD_FSE_TABLE_WKSP_SIZE_U32 ((ZSTD_BUILD_FSE_TABLE_WKSP_SIZE + sizeof(U32) - 1) / sizeof(U32))
+#define ZSTD_HUFFDTABLE_CAPACITY_LOG 12
 
 typedef struct {
     ZSTD_seqSymbol LLTable[SEQSYMBOL_TABLE_SIZE(LLFSELog)];    /* Note : Space reserved for FSE Tables */
     ZSTD_seqSymbol OFTable[SEQSYMBOL_TABLE_SIZE(OffFSELog)];   /* is also used as temporary workspace while building hufTable during DDict creation */
     ZSTD_seqSymbol MLTable[SEQSYMBOL_TABLE_SIZE(MLFSELog)];    /* and therefore must be at least HUF_DECOMPRESS_WORKSPACE_SIZE large */
-    HUF_DTable hufTable[HUF_DTABLE_SIZE(HufLog)];  /* can accommodate HUF_decompress4X */
+    HUF_DTable hufTable[HUF_DTABLE_SIZE(ZSTD_HUFFDTABLE_CAPACITY_LOG)];  /* can accommodate HUF_decompress4X */
     U32 rep[ZSTD_REP_NUM];
     U32 workspace[ZSTD_BUILD_FSE_TABLE_WKSP_SIZE_U32];
 } ZSTD_entropyDTables_t;
@@ -164,6 +166,7 @@ struct ZSTD_DCtx_s
     ZSTD_dictUses_e dictUses;
     ZSTD_DDictHashSet* ddictSet;                    /* Hash set for multiple ddicts */
     ZSTD_refMultipleDDicts_e refMultipleDDicts;     /* User specified: if == 1, will allow references to multiple DDicts. Default == 0 (disabled) */
+    int disableHufAsm;
 
     /* streaming */
     ZSTD_dStreamStage streamStage;
diff --git a/lib/zstd/decompress_sources.h b/lib/zstd/decompress_sources.h
index a06ca187aab5..8a47eb2a4514 100644
--- a/lib/zstd/decompress_sources.h
+++ b/lib/zstd/decompress_sources.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
diff --git a/lib/zstd/zstd_common_module.c b/lib/zstd/zstd_common_module.c
index 22686e367e6f..466828e35752 100644
--- a/lib/zstd/zstd_common_module.c
+++ b/lib/zstd/zstd_common_module.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /*
- * Copyright (c) Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -24,9 +24,6 @@ EXPORT_SYMBOL_GPL(HUF_readStats_wksp);
 EXPORT_SYMBOL_GPL(ZSTD_isError);
 EXPORT_SYMBOL_GPL(ZSTD_getErrorName);
 EXPORT_SYMBOL_GPL(ZSTD_getErrorCode);
-EXPORT_SYMBOL_GPL(ZSTD_customMalloc);
-EXPORT_SYMBOL_GPL(ZSTD_customCalloc);
-EXPORT_SYMBOL_GPL(ZSTD_customFree);
 
 MODULE_LICENSE("Dual BSD/GPL");
 MODULE_DESCRIPTION("Zstd Common");
diff --git a/lib/zstd/zstd_compress_module.c b/lib/zstd/zstd_compress_module.c
index 04e1b5c01d9b..8ecf43226af2 100644
--- a/lib/zstd/zstd_compress_module.c
+++ b/lib/zstd/zstd_compress_module.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /*
- * Copyright (c) Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
diff --git a/lib/zstd/zstd_decompress_module.c b/lib/zstd/zstd_decompress_module.c
index f4ed952ed485..eb1c49e69722 100644
--- a/lib/zstd/zstd_decompress_module.c
+++ b/lib/zstd/zstd_decompress_module.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /*
- * Copyright (c) Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
-- 
2.42.0